Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              9
-rw-r--r--  mm/Makefile             5
-rw-r--r--  mm/allocpercpu.c      129
-rw-r--r--  mm/bootmem.c          206
-rw-r--r--  mm/fadvise.c           15
-rw-r--r--  mm/filemap.c          105
-rw-r--r--  mm/filemap.h           30
-rw-r--r--  mm/filemap_xip.c        2
-rw-r--r--  mm/fremap.c             4
-rw-r--r--  mm/highmem.c           19
-rw-r--r--  mm/hugetlb.c           10
-rw-r--r--  mm/internal.h           4
-rw-r--r--  mm/memory.c           221
-rw-r--r--  mm/memory_hotplug.c   164
-rw-r--r--  mm/mempolicy.c         51
-rw-r--r--  mm/mempool.c            9
-rw-r--r--  mm/migrate.c           32
-rw-r--r--  mm/mmap.c              38
-rw-r--r--  mm/mmzone.c             7
-rw-r--r--  mm/mprotect.c          51
-rw-r--r--  mm/mremap.c             2
-rw-r--r--  mm/msync.c            196
-rw-r--r--  mm/nommu.c            251
-rw-r--r--  mm/oom_kill.c         134
-rw-r--r--  mm/page-writeback.c   146
-rw-r--r--  mm/page_alloc.c      1418
-rw-r--r--  mm/page_io.c           52
-rw-r--r--  mm/pdflush.c           15
-rw-r--r--  mm/readahead.c         20
-rw-r--r--  mm/rmap.c              81
-rw-r--r--  mm/shmem.c            120
-rw-r--r--  mm/shmem_acl.c        197
-rw-r--r--  mm/slab.c             620
-rw-r--r--  mm/slob.c              53
-rw-r--r--  mm/sparse.c             3
-rw-r--r--  mm/swap.c              76
-rw-r--r--  mm/swap_state.c         8
-rw-r--r--  mm/swapfile.c          11
-rw-r--r--  mm/tiny-shmem.c         4
-rw-r--r--  mm/truncate.c          22
-rw-r--r--  mm/vmalloc.c           47
-rw-r--r--  mm/vmscan.c           250
-rw-r--r--  mm/vmstat.c           706
43 files changed, 3826 insertions(+), 1717 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 66e65ab39426..8f5b45615f7b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -115,7 +115,8 @@ config SPARSEMEM_EXTREME | |||
| 115 | # eventually, we can have this option just 'select SPARSEMEM' | 115 | # eventually, we can have this option just 'select SPARSEMEM' | 
| 116 | config MEMORY_HOTPLUG | 116 | config MEMORY_HOTPLUG | 
| 117 | bool "Allow for memory hot-add" | 117 | bool "Allow for memory hot-add" | 
| 118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND | 118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG | 
| 119 | depends on (IA64 || X86 || PPC64) | ||
| 119 | 120 | ||
| 120 | comment "Memory hotplug is currently incompatible with Software Suspend" | 121 | comment "Memory hotplug is currently incompatible with Software Suspend" | 
| 121 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND | 122 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND | 
| @@ -145,3 +146,9 @@ config MIGRATION | |||
| 145 | while the virtual addresses are not changed. This is useful for | 146 | while the virtual addresses are not changed. This is useful for | 
| 146 | example on NUMA systems to put pages nearer to the processors accessing | 147 | example on NUMA systems to put pages nearer to the processors accessing | 
| 147 | the page. | 148 | the page. | 
| 149 | |||
| 150 | config RESOURCES_64BIT | ||
| 151 | bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) | ||
| 152 | default 64BIT | ||
| 153 | help | ||
| 154 | This option allows memory and IO resources to be 64 bit. | ||
diff --git a/mm/Makefile b/mm/Makefile
index 0b8f73f2ed16..6200c6d6afd2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -10,17 +10,18 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
| 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 
| 11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ | 
| 12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ | 
| 13 | prio_tree.o util.o mmzone.o $(mmu-y) | 13 | prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) | 
| 14 | 14 | ||
| 15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 
| 16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 
| 17 | obj-$(CONFIG_NUMA) += mempolicy.o | 17 | obj-$(CONFIG_NUMA) += mempolicy.o | 
| 18 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 18 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 
| 19 | obj-$(CONFIG_SHMEM) += shmem.o | 19 | obj-$(CONFIG_SHMEM) += shmem.o | 
| 20 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | ||
| 20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 21 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 
| 21 | obj-$(CONFIG_SLOB) += slob.o | 22 | obj-$(CONFIG_SLOB) += slob.o | 
| 22 | obj-$(CONFIG_SLAB) += slab.o | 23 | obj-$(CONFIG_SLAB) += slab.o | 
| 23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 24 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 
| 24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 25 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 
| 25 | obj-$(CONFIG_MIGRATION) += migrate.o | 26 | obj-$(CONFIG_MIGRATION) += migrate.o | 
| 26 | 27 | obj-$(CONFIG_SMP) += allocpercpu.o | |
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
new file mode 100644
index 000000000000..eaa9abeea536
--- /dev/null
+++ b/mm/allocpercpu.c
| @@ -0,0 +1,129 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/allocpercpu.c | ||
| 3 | * | ||
| 4 | * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com> | ||
| 5 | */ | ||
| 6 | #include <linux/mm.h> | ||
| 7 | #include <linux/module.h> | ||
| 8 | |||
| 9 | /** | ||
| 10 | * percpu_depopulate - depopulate per-cpu data for given cpu | ||
| 11 | * @__pdata: per-cpu data to depopulate | ||
| 12 | * @cpu: depopulate per-cpu data for this cpu | ||
| 13 | * | ||
| 14 | * Depopulating per-cpu data for a cpu going offline would be a typical | ||
| 15 | * use case. You need to register a cpu hotplug handler for that purpose. | ||
| 16 | */ | ||
| 17 | void percpu_depopulate(void *__pdata, int cpu) | ||
| 18 | { | ||
| 19 | struct percpu_data *pdata = __percpu_disguise(__pdata); | ||
| 20 | if (pdata->ptrs[cpu]) { | ||
| 21 | kfree(pdata->ptrs[cpu]); | ||
| 22 | pdata->ptrs[cpu] = NULL; | ||
| 23 | } | ||
| 24 | } | ||
| 25 | EXPORT_SYMBOL_GPL(percpu_depopulate); | ||
| 26 | |||
| 27 | /** | ||
| 28 | * percpu_depopulate_mask - depopulate per-cpu data for some cpu's | ||
| 29 | * @__pdata: per-cpu data to depopulate | ||
| 30 | * @mask: depopulate per-cpu data for cpu's selected through mask bits | ||
| 31 | */ | ||
| 32 | void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) | ||
| 33 | { | ||
| 34 | int cpu; | ||
| 35 | for_each_cpu_mask(cpu, *mask) | ||
| 36 | percpu_depopulate(__pdata, cpu); | ||
| 37 | } | ||
| 38 | EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); | ||
| 39 | |||
| 40 | /** | ||
| 41 | * percpu_populate - populate per-cpu data for given cpu | ||
| 42 | * @__pdata: per-cpu data to populate further | ||
| 43 | * @size: size of per-cpu object | ||
| 44 | * @gfp: may sleep or not etc. | ||
| 45 | * @cpu: populate per-data for this cpu | ||
| 46 | * | ||
| 47 | * Populating per-cpu data for a cpu coming online would be a typical | ||
| 48 | * use case. You need to register a cpu hotplug handler for that purpose. | ||
| 49 | * Per-cpu object is populated with zeroed buffer. | ||
| 50 | */ | ||
| 51 | void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) | ||
| 52 | { | ||
| 53 | struct percpu_data *pdata = __percpu_disguise(__pdata); | ||
| 54 | int node = cpu_to_node(cpu); | ||
| 55 | |||
| 56 | BUG_ON(pdata->ptrs[cpu]); | ||
| 57 | if (node_online(node)) { | ||
| 58 | /* FIXME: kzalloc_node(size, gfp, node) */ | ||
| 59 | pdata->ptrs[cpu] = kmalloc_node(size, gfp, node); | ||
| 60 | if (pdata->ptrs[cpu]) | ||
| 61 | memset(pdata->ptrs[cpu], 0, size); | ||
| 62 | } else | ||
| 63 | pdata->ptrs[cpu] = kzalloc(size, gfp); | ||
| 64 | return pdata->ptrs[cpu]; | ||
| 65 | } | ||
| 66 | EXPORT_SYMBOL_GPL(percpu_populate); | ||
| 67 | |||
| 68 | /** | ||
| 69 | * percpu_populate_mask - populate per-cpu data for more cpu's | ||
| 70 | * @__pdata: per-cpu data to populate further | ||
| 71 | * @size: size of per-cpu object | ||
| 72 | * @gfp: may sleep or not etc. | ||
| 73 | * @mask: populate per-cpu data for cpu's selected through mask bits | ||
| 74 | * | ||
| 75 | * Per-cpu objects are populated with zeroed buffers. | ||
| 76 | */ | ||
| 77 | int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | ||
| 78 | cpumask_t *mask) | ||
| 79 | { | ||
| 80 | cpumask_t populated = CPU_MASK_NONE; | ||
| 81 | int cpu; | ||
| 82 | |||
| 83 | for_each_cpu_mask(cpu, *mask) | ||
| 84 | if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { | ||
| 85 | __percpu_depopulate_mask(__pdata, &populated); | ||
| 86 | return -ENOMEM; | ||
| 87 | } else | ||
| 88 | cpu_set(cpu, populated); | ||
| 89 | return 0; | ||
| 90 | } | ||
| 91 | EXPORT_SYMBOL_GPL(__percpu_populate_mask); | ||
| 92 | |||
| 93 | /** | ||
| 94 | * percpu_alloc_mask - initial setup of per-cpu data | ||
| 95 | * @size: size of per-cpu object | ||
| 96 | * @gfp: may sleep or not etc. | ||
| 97 | * @mask: populate per-data for cpu's selected through mask bits | ||
| 98 | * | ||
| 99 | * Populating per-cpu data for all online cpu's would be a typical use case, | ||
| 100 | * which is simplified by the percpu_alloc() wrapper. | ||
| 101 | * Per-cpu objects are populated with zeroed buffers. | ||
| 102 | */ | ||
| 103 | void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) | ||
| 104 | { | ||
| 105 | void *pdata = kzalloc(sizeof(struct percpu_data), gfp); | ||
| 106 | void *__pdata = __percpu_disguise(pdata); | ||
| 107 | |||
| 108 | if (unlikely(!pdata)) | ||
| 109 | return NULL; | ||
| 110 | if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) | ||
| 111 | return __pdata; | ||
| 112 | kfree(pdata); | ||
| 113 | return NULL; | ||
| 114 | } | ||
| 115 | EXPORT_SYMBOL_GPL(__percpu_alloc_mask); | ||
| 116 | |||
| 117 | /** | ||
| 118 | * percpu_free - final cleanup of per-cpu data | ||
| 119 | * @__pdata: object to clean up | ||
| 120 | * | ||
| 121 | * We simply clean up any per-cpu object left. No need for the client to | ||
| 122 | * track and specify through a bis mask which per-cpu objects are to free. | ||
| 123 | */ | ||
| 124 | void percpu_free(void *__pdata) | ||
| 125 | { | ||
| 126 | __percpu_depopulate_mask(__pdata, &cpu_possible_map); | ||
| 127 | kfree(__percpu_disguise(__pdata)); | ||
| 128 | } | ||
| 129 | EXPORT_SYMBOL_GPL(percpu_free); | ||
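Note on usage (not part of the commit): the new file above exports the low-level __percpu_alloc_mask()/percpu_free() primitives; callers normally reach them through the percpu_alloc() wrapper that the comment on __percpu_alloc_mask() refers to, together with per_cpu_ptr() from <linux/percpu.h>. The following is a minimal, hypothetical sketch of that usage, assuming those wrapper names; the my_stats structure and function names are invented for illustration.

    #include <linux/percpu.h>
    #include <linux/slab.h>
    #include <linux/smp.h>
    #include <linux/errno.h>

    struct my_stats {                       /* hypothetical per-cpu payload */
            unsigned long events;
    };

    static struct my_stats *stats;

    static int my_stats_init(void)
    {
            /* allocate and populate a zeroed copy per cpu via the wrapper */
            stats = percpu_alloc(sizeof(struct my_stats), GFP_KERNEL);
            if (!stats)
                    return -ENOMEM;
            return 0;
    }

    static void my_stats_count(void)
    {
            int cpu = get_cpu();            /* pin to this cpu while touching its copy */

            per_cpu_ptr(stats, cpu)->events++;
            put_cpu();
    }

    static void my_stats_exit(void)
    {
            percpu_free(stats);             /* tears down every populated copy */
    }

Because the objects are populated with zeroed buffers, the counter starts at 0 without any further initialisation.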
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d213feded10d..d53112fcb404 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
| @@ -8,17 +8,15 @@ | |||
| 8 | * free memory collector. It's used to deal with reserved | 8 | * free memory collector. It's used to deal with reserved | 
| 9 | * system memory and memory holes as well. | 9 | * system memory and memory holes as well. | 
| 10 | */ | 10 | */ | 
| 11 | |||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/kernel_stat.h> | ||
| 14 | #include <linux/swap.h> | ||
| 15 | #include <linux/interrupt.h> | ||
| 16 | #include <linux/init.h> | 11 | #include <linux/init.h> | 
| 12 | #include <linux/pfn.h> | ||
| 17 | #include <linux/bootmem.h> | 13 | #include <linux/bootmem.h> | 
| 18 | #include <linux/mmzone.h> | ||
| 19 | #include <linux/module.h> | 14 | #include <linux/module.h> | 
| 20 | #include <asm/dma.h> | 15 | |
| 16 | #include <asm/bug.h> | ||
| 21 | #include <asm/io.h> | 17 | #include <asm/io.h> | 
| 18 | #include <asm/processor.h> | ||
| 19 | |||
| 22 | #include "internal.h" | 20 | #include "internal.h" | 
| 23 | 21 | ||
| 24 | /* | 22 | /* | 
| @@ -29,9 +27,7 @@ unsigned long max_low_pfn; | |||
| 29 | unsigned long min_low_pfn; | 27 | unsigned long min_low_pfn; | 
| 30 | unsigned long max_pfn; | 28 | unsigned long max_pfn; | 
| 31 | 29 | ||
| 32 | EXPORT_SYMBOL(max_pfn); /* This is exported so | 30 | EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */ | 
| 33 | * dma_get_required_mask(), which uses | ||
| 34 | * it, can be an inline function */ | ||
| 35 | 31 | ||
| 36 | static LIST_HEAD(bdata_list); | 32 | static LIST_HEAD(bdata_list); | 
| 37 | #ifdef CONFIG_CRASH_DUMP | 33 | #ifdef CONFIG_CRASH_DUMP | 
| @@ -43,7 +39,7 @@ unsigned long saved_max_pfn; | |||
| 43 | #endif | 39 | #endif | 
| 44 | 40 | ||
| 45 | /* return the number of _pages_ that will be allocated for the boot bitmap */ | 41 | /* return the number of _pages_ that will be allocated for the boot bitmap */ | 
| 46 | unsigned long __init bootmem_bootmap_pages (unsigned long pages) | 42 | unsigned long __init bootmem_bootmap_pages(unsigned long pages) | 
| 47 | { | 43 | { | 
| 48 | unsigned long mapsize; | 44 | unsigned long mapsize; | 
| 49 | 45 | ||
| @@ -53,12 +49,14 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) | |||
| 53 | 49 | ||
| 54 | return mapsize; | 50 | return mapsize; | 
| 55 | } | 51 | } | 
| 52 | |||
| 56 | /* | 53 | /* | 
| 57 | * link bdata in order | 54 | * link bdata in order | 
| 58 | */ | 55 | */ | 
| 59 | static void link_bootmem(bootmem_data_t *bdata) | 56 | static void __init link_bootmem(bootmem_data_t *bdata) | 
| 60 | { | 57 | { | 
| 61 | bootmem_data_t *ent; | 58 | bootmem_data_t *ent; | 
| 59 | |||
| 62 | if (list_empty(&bdata_list)) { | 60 | if (list_empty(&bdata_list)) { | 
| 63 | list_add(&bdata->list, &bdata_list); | 61 | list_add(&bdata->list, &bdata_list); | 
| 64 | return; | 62 | return; | 
| @@ -71,22 +69,32 @@ static void link_bootmem(bootmem_data_t *bdata) | |||
| 71 | } | 69 | } | 
| 72 | } | 70 | } | 
| 73 | list_add_tail(&bdata->list, &bdata_list); | 71 | list_add_tail(&bdata->list, &bdata_list); | 
| 74 | return; | ||
| 75 | } | 72 | } | 
| 76 | 73 | ||
| 74 | /* | ||
| 75 | * Given an initialised bdata, it returns the size of the boot bitmap | ||
| 76 | */ | ||
| 77 | static unsigned long __init get_mapsize(bootmem_data_t *bdata) | ||
| 78 | { | ||
| 79 | unsigned long mapsize; | ||
| 80 | unsigned long start = PFN_DOWN(bdata->node_boot_start); | ||
| 81 | unsigned long end = bdata->node_low_pfn; | ||
| 82 | |||
| 83 | mapsize = ((end - start) + 7) / 8; | ||
| 84 | return ALIGN(mapsize, sizeof(long)); | ||
| 85 | } | ||
| 77 | 86 | ||
| 78 | /* | 87 | /* | 
| 79 | * Called once to set up the allocator itself. | 88 | * Called once to set up the allocator itself. | 
| 80 | */ | 89 | */ | 
| 81 | static unsigned long __init init_bootmem_core (pg_data_t *pgdat, | 90 | static unsigned long __init init_bootmem_core(pg_data_t *pgdat, | 
| 82 | unsigned long mapstart, unsigned long start, unsigned long end) | 91 | unsigned long mapstart, unsigned long start, unsigned long end) | 
| 83 | { | 92 | { | 
| 84 | bootmem_data_t *bdata = pgdat->bdata; | 93 | bootmem_data_t *bdata = pgdat->bdata; | 
| 85 | unsigned long mapsize = ((end - start)+7)/8; | 94 | unsigned long mapsize; | 
| 86 | 95 | ||
| 87 | mapsize = ALIGN(mapsize, sizeof(long)); | 96 | bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); | 
| 88 | bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); | 97 | bdata->node_boot_start = PFN_PHYS(start); | 
| 89 | bdata->node_boot_start = (start << PAGE_SHIFT); | ||
| 90 | bdata->node_low_pfn = end; | 98 | bdata->node_low_pfn = end; | 
| 91 | link_bootmem(bdata); | 99 | link_bootmem(bdata); | 
| 92 | 100 | ||
| @@ -94,6 +102,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, | |||
| 94 | * Initially all pages are reserved - setup_arch() has to | 102 | * Initially all pages are reserved - setup_arch() has to | 
| 95 | * register free RAM areas explicitly. | 103 | * register free RAM areas explicitly. | 
| 96 | */ | 104 | */ | 
| 105 | mapsize = get_mapsize(bdata); | ||
| 97 | memset(bdata->node_bootmem_map, 0xff, mapsize); | 106 | memset(bdata->node_bootmem_map, 0xff, mapsize); | 
| 98 | 107 | ||
| 99 | return mapsize; | 108 | return mapsize; | 
| @@ -104,22 +113,22 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, | |||
| 104 | * might be used for boot-time allocations - or it might get added | 113 | * might be used for boot-time allocations - or it might get added | 
| 105 | * to the free page pool later on. | 114 | * to the free page pool later on. | 
| 106 | */ | 115 | */ | 
| 107 | static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) | 116 | static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | 
| 117 | unsigned long size) | ||
| 108 | { | 118 | { | 
| 119 | unsigned long sidx, eidx; | ||
| 109 | unsigned long i; | 120 | unsigned long i; | 
| 121 | |||
| 110 | /* | 122 | /* | 
| 111 | * round up, partially reserved pages are considered | 123 | * round up, partially reserved pages are considered | 
| 112 | * fully reserved. | 124 | * fully reserved. | 
| 113 | */ | 125 | */ | 
| 114 | unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; | ||
| 115 | unsigned long eidx = (addr + size - bdata->node_boot_start + | ||
| 116 | PAGE_SIZE-1)/PAGE_SIZE; | ||
| 117 | unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; | ||
| 118 | |||
| 119 | BUG_ON(!size); | 126 | BUG_ON(!size); | 
| 120 | BUG_ON(sidx >= eidx); | 127 | BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); | 
| 121 | BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn); | 128 | BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); | 
| 122 | BUG_ON(end > bdata->node_low_pfn); | 129 | |
| 130 | sidx = PFN_DOWN(addr - bdata->node_boot_start); | ||
| 131 | eidx = PFN_UP(addr + size - bdata->node_boot_start); | ||
| 123 | 132 | ||
| 124 | for (i = sidx; i < eidx; i++) | 133 | for (i = sidx; i < eidx; i++) | 
| 125 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { | 134 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { | 
| @@ -129,20 +138,18 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add | |||
| 129 | } | 138 | } | 
| 130 | } | 139 | } | 
| 131 | 140 | ||
| 132 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) | 141 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | 
| 142 | unsigned long size) | ||
| 133 | { | 143 | { | 
| 144 | unsigned long sidx, eidx; | ||
| 134 | unsigned long i; | 145 | unsigned long i; | 
| 135 | unsigned long start; | 146 | |
| 136 | /* | 147 | /* | 
| 137 | * round down end of usable mem, partially free pages are | 148 | * round down end of usable mem, partially free pages are | 
| 138 | * considered reserved. | 149 | * considered reserved. | 
| 139 | */ | 150 | */ | 
| 140 | unsigned long sidx; | ||
| 141 | unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; | ||
| 142 | unsigned long end = (addr + size)/PAGE_SIZE; | ||
| 143 | |||
| 144 | BUG_ON(!size); | 151 | BUG_ON(!size); | 
| 145 | BUG_ON(end > bdata->node_low_pfn); | 152 | BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn); | 
| 146 | 153 | ||
| 147 | if (addr < bdata->last_success) | 154 | if (addr < bdata->last_success) | 
| 148 | bdata->last_success = addr; | 155 | bdata->last_success = addr; | 
| @@ -150,8 +157,8 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | |||
| 150 | /* | 157 | /* | 
| 151 | * Round up the beginning of the address. | 158 | * Round up the beginning of the address. | 
| 152 | */ | 159 | */ | 
| 153 | start = (addr + PAGE_SIZE-1) / PAGE_SIZE; | 160 | sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); | 
| 154 | sidx = start - (bdata->node_boot_start/PAGE_SIZE); | 161 | eidx = PFN_DOWN(addr + size - bdata->node_boot_start); | 
| 155 | 162 | ||
| 156 | for (i = sidx; i < eidx; i++) { | 163 | for (i = sidx; i < eidx; i++) { | 
| 157 | if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) | 164 | if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) | 
| @@ -177,10 +184,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
| 177 | unsigned long align, unsigned long goal, unsigned long limit) | 184 | unsigned long align, unsigned long goal, unsigned long limit) | 
| 178 | { | 185 | { | 
| 179 | unsigned long offset, remaining_size, areasize, preferred; | 186 | unsigned long offset, remaining_size, areasize, preferred; | 
| 180 | unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn; | 187 | unsigned long i, start = 0, incr, eidx, end_pfn; | 
| 181 | void *ret; | 188 | void *ret; | 
| 182 | 189 | ||
| 183 | if(!size) { | 190 | if (!size) { | 
| 184 | printk("__alloc_bootmem_core(): zero-sized request\n"); | 191 | printk("__alloc_bootmem_core(): zero-sized request\n"); | 
| 185 | BUG(); | 192 | BUG(); | 
| 186 | } | 193 | } | 
| @@ -189,23 +196,22 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
| 189 | if (limit && bdata->node_boot_start >= limit) | 196 | if (limit && bdata->node_boot_start >= limit) | 
| 190 | return NULL; | 197 | return NULL; | 
| 191 | 198 | ||
| 192 | limit >>=PAGE_SHIFT; | 199 | end_pfn = bdata->node_low_pfn; | 
| 200 | limit = PFN_DOWN(limit); | ||
| 193 | if (limit && end_pfn > limit) | 201 | if (limit && end_pfn > limit) | 
| 194 | end_pfn = limit; | 202 | end_pfn = limit; | 
| 195 | 203 | ||
| 196 | eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT); | 204 | eidx = end_pfn - PFN_DOWN(bdata->node_boot_start); | 
| 197 | offset = 0; | 205 | offset = 0; | 
| 198 | if (align && | 206 | if (align && (bdata->node_boot_start & (align - 1UL)) != 0) | 
| 199 | (bdata->node_boot_start & (align - 1UL)) != 0) | 207 | offset = align - (bdata->node_boot_start & (align - 1UL)); | 
| 200 | offset = (align - (bdata->node_boot_start & (align - 1UL))); | 208 | offset = PFN_DOWN(offset); | 
| 201 | offset >>= PAGE_SHIFT; | ||
| 202 | 209 | ||
| 203 | /* | 210 | /* | 
| 204 | * We try to allocate bootmem pages above 'goal' | 211 | * We try to allocate bootmem pages above 'goal' | 
| 205 | * first, then we try to allocate lower pages. | 212 | * first, then we try to allocate lower pages. | 
| 206 | */ | 213 | */ | 
| 207 | if (goal && (goal >= bdata->node_boot_start) && | 214 | if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { | 
| 208 | ((goal >> PAGE_SHIFT) < end_pfn)) { | ||
| 209 | preferred = goal - bdata->node_boot_start; | 215 | preferred = goal - bdata->node_boot_start; | 
| 210 | 216 | ||
| 211 | if (bdata->last_success >= preferred) | 217 | if (bdata->last_success >= preferred) | 
| @@ -214,9 +220,8 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
| 214 | } else | 220 | } else | 
| 215 | preferred = 0; | 221 | preferred = 0; | 
| 216 | 222 | ||
| 217 | preferred = ALIGN(preferred, align) >> PAGE_SHIFT; | 223 | preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; | 
| 218 | preferred += offset; | 224 | areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; | 
| 219 | areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; | ||
| 220 | incr = align >> PAGE_SHIFT ? : 1; | 225 | incr = align >> PAGE_SHIFT ? : 1; | 
| 221 | 226 | ||
| 222 | restart_scan: | 227 | restart_scan: | 
| @@ -231,7 +236,7 @@ restart_scan: | |||
| 231 | for (j = i + 1; j < i + areasize; ++j) { | 236 | for (j = i + 1; j < i + areasize; ++j) { | 
| 232 | if (j >= eidx) | 237 | if (j >= eidx) | 
| 233 | goto fail_block; | 238 | goto fail_block; | 
| 234 | if (test_bit (j, bdata->node_bootmem_map)) | 239 | if (test_bit(j, bdata->node_bootmem_map)) | 
| 235 | goto fail_block; | 240 | goto fail_block; | 
| 236 | } | 241 | } | 
| 237 | start = i; | 242 | start = i; | 
| @@ -247,7 +252,7 @@ restart_scan: | |||
| 247 | return NULL; | 252 | return NULL; | 
| 248 | 253 | ||
| 249 | found: | 254 | found: | 
| 250 | bdata->last_success = start << PAGE_SHIFT; | 255 | bdata->last_success = PFN_PHYS(start); | 
| 251 | BUG_ON(start >= eidx); | 256 | BUG_ON(start >= eidx); | 
| 252 | 257 | ||
| 253 | /* | 258 | /* | 
| @@ -259,19 +264,21 @@ found: | |||
| 259 | bdata->last_offset && bdata->last_pos+1 == start) { | 264 | bdata->last_offset && bdata->last_pos+1 == start) { | 
| 260 | offset = ALIGN(bdata->last_offset, align); | 265 | offset = ALIGN(bdata->last_offset, align); | 
| 261 | BUG_ON(offset > PAGE_SIZE); | 266 | BUG_ON(offset > PAGE_SIZE); | 
| 262 | remaining_size = PAGE_SIZE-offset; | 267 | remaining_size = PAGE_SIZE - offset; | 
| 263 | if (size < remaining_size) { | 268 | if (size < remaining_size) { | 
| 264 | areasize = 0; | 269 | areasize = 0; | 
| 265 | /* last_pos unchanged */ | 270 | /* last_pos unchanged */ | 
| 266 | bdata->last_offset = offset+size; | 271 | bdata->last_offset = offset + size; | 
| 267 | ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + | 272 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + | 
| 268 | bdata->node_boot_start); | 273 | offset + | 
| 274 | bdata->node_boot_start); | ||
| 269 | } else { | 275 | } else { | 
| 270 | remaining_size = size - remaining_size; | 276 | remaining_size = size - remaining_size; | 
| 271 | areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; | 277 | areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; | 
| 272 | ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + | 278 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + | 
| 273 | bdata->node_boot_start); | 279 | offset + | 
| 274 | bdata->last_pos = start+areasize-1; | 280 | bdata->node_boot_start); | 
| 281 | bdata->last_pos = start + areasize - 1; | ||
| 275 | bdata->last_offset = remaining_size; | 282 | bdata->last_offset = remaining_size; | 
| 276 | } | 283 | } | 
| 277 | bdata->last_offset &= ~PAGE_MASK; | 284 | bdata->last_offset &= ~PAGE_MASK; | 
| @@ -284,7 +291,7 @@ found: | |||
| 284 | /* | 291 | /* | 
| 285 | * Reserve the area now: | 292 | * Reserve the area now: | 
| 286 | */ | 293 | */ | 
| 287 | for (i = start; i < start+areasize; i++) | 294 | for (i = start; i < start + areasize; i++) | 
| 288 | if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) | 295 | if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) | 
| 289 | BUG(); | 296 | BUG(); | 
| 290 | memset(ret, 0, size); | 297 | memset(ret, 0, size); | 
| @@ -305,8 +312,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
| 305 | 312 | ||
| 306 | count = 0; | 313 | count = 0; | 
| 307 | /* first extant page of the node */ | 314 | /* first extant page of the node */ | 
| 308 | pfn = bdata->node_boot_start >> PAGE_SHIFT; | 315 | pfn = PFN_DOWN(bdata->node_boot_start); | 
| 309 | idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); | 316 | idx = bdata->node_low_pfn - pfn; | 
| 310 | map = bdata->node_bootmem_map; | 317 | map = bdata->node_bootmem_map; | 
| 311 | /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ | 318 | /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ | 
| 312 | if (bdata->node_boot_start == 0 || | 319 | if (bdata->node_boot_start == 0 || | 
| @@ -335,7 +342,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
| 335 | } | 342 | } | 
| 336 | } | 343 | } | 
| 337 | } else { | 344 | } else { | 
| 338 | i+=BITS_PER_LONG; | 345 | i += BITS_PER_LONG; | 
| 339 | } | 346 | } | 
| 340 | pfn += BITS_PER_LONG; | 347 | pfn += BITS_PER_LONG; | 
| 341 | } | 348 | } | 
| @@ -347,9 +354,10 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
| 347 | */ | 354 | */ | 
| 348 | page = virt_to_page(bdata->node_bootmem_map); | 355 | page = virt_to_page(bdata->node_bootmem_map); | 
| 349 | count = 0; | 356 | count = 0; | 
| 350 | for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { | 357 | idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; | 
| 351 | count++; | 358 | for (i = 0; i < idx; i++, page++) { | 
| 352 | __free_pages_bootmem(page, 0); | 359 | __free_pages_bootmem(page, 0); | 
| 360 | count++; | ||
| 353 | } | 361 | } | 
| 354 | total += count; | 362 | total += count; | 
| 355 | bdata->node_bootmem_map = NULL; | 363 | bdata->node_bootmem_map = NULL; | 
| @@ -357,64 +365,72 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
| 357 | return total; | 365 | return total; | 
| 358 | } | 366 | } | 
| 359 | 367 | ||
| 360 | unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) | 368 | unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, | 
| 369 | unsigned long startpfn, unsigned long endpfn) | ||
| 361 | { | 370 | { | 
| 362 | return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); | 371 | return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); | 
| 363 | } | 372 | } | 
| 364 | 373 | ||
| 365 | void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) | 374 | void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 
| 375 | unsigned long size) | ||
| 366 | { | 376 | { | 
| 367 | reserve_bootmem_core(pgdat->bdata, physaddr, size); | 377 | reserve_bootmem_core(pgdat->bdata, physaddr, size); | 
| 368 | } | 378 | } | 
| 369 | 379 | ||
| 370 | void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) | 380 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 
| 381 | unsigned long size) | ||
| 371 | { | 382 | { | 
| 372 | free_bootmem_core(pgdat->bdata, physaddr, size); | 383 | free_bootmem_core(pgdat->bdata, physaddr, size); | 
| 373 | } | 384 | } | 
| 374 | 385 | ||
| 375 | unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) | 386 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 
| 376 | { | 387 | { | 
| 377 | return(free_all_bootmem_core(pgdat)); | 388 | return free_all_bootmem_core(pgdat); | 
| 378 | } | 389 | } | 
| 379 | 390 | ||
| 380 | unsigned long __init init_bootmem (unsigned long start, unsigned long pages) | 391 | unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | 
| 381 | { | 392 | { | 
| 382 | max_low_pfn = pages; | 393 | max_low_pfn = pages; | 
| 383 | min_low_pfn = start; | 394 | min_low_pfn = start; | 
| 384 | return(init_bootmem_core(NODE_DATA(0), start, 0, pages)); | 395 | return init_bootmem_core(NODE_DATA(0), start, 0, pages); | 
| 385 | } | 396 | } | 
| 386 | 397 | ||
| 387 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE | 398 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE | 
| 388 | void __init reserve_bootmem (unsigned long addr, unsigned long size) | 399 | void __init reserve_bootmem(unsigned long addr, unsigned long size) | 
| 389 | { | 400 | { | 
| 390 | reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); | 401 | reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); | 
| 391 | } | 402 | } | 
| 392 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | 403 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | 
| 393 | 404 | ||
| 394 | void __init free_bootmem (unsigned long addr, unsigned long size) | 405 | void __init free_bootmem(unsigned long addr, unsigned long size) | 
| 395 | { | 406 | { | 
| 396 | free_bootmem_core(NODE_DATA(0)->bdata, addr, size); | 407 | free_bootmem_core(NODE_DATA(0)->bdata, addr, size); | 
| 397 | } | 408 | } | 
| 398 | 409 | ||
| 399 | unsigned long __init free_all_bootmem (void) | 410 | unsigned long __init free_all_bootmem(void) | 
| 400 | { | 411 | { | 
| 401 | return(free_all_bootmem_core(NODE_DATA(0))); | 412 | return free_all_bootmem_core(NODE_DATA(0)); | 
| 402 | } | 413 | } | 
| 403 | 414 | ||
| 404 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) | 415 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | 
| 416 | unsigned long goal) | ||
| 405 | { | 417 | { | 
| 406 | bootmem_data_t *bdata; | 418 | bootmem_data_t *bdata; | 
| 407 | void *ptr; | 419 | void *ptr; | 
| 408 | 420 | ||
| 409 | list_for_each_entry(bdata, &bdata_list, list) | 421 | list_for_each_entry(bdata, &bdata_list, list) { | 
| 410 | if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) | 422 | ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); | 
| 411 | return(ptr); | 423 | if (ptr) | 
| 424 | return ptr; | ||
| 425 | } | ||
| 412 | return NULL; | 426 | return NULL; | 
| 413 | } | 427 | } | 
| 414 | 428 | ||
| 415 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) | 429 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | 
| 430 | unsigned long goal) | ||
| 416 | { | 431 | { | 
| 417 | void *mem = __alloc_bootmem_nopanic(size,align,goal); | 432 | void *mem = __alloc_bootmem_nopanic(size,align,goal); | 
| 433 | |||
| 418 | if (mem) | 434 | if (mem) | 
| 419 | return mem; | 435 | return mem; | 
| 420 | /* | 436 | /* | 
| @@ -426,29 +442,34 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned | |||
| 426 | } | 442 | } | 
| 427 | 443 | ||
| 428 | 444 | ||
| 429 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, | 445 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | 
| 430 | unsigned long goal) | 446 | unsigned long align, unsigned long goal) | 
| 431 | { | 447 | { | 
| 432 | void *ptr; | 448 | void *ptr; | 
| 433 | 449 | ||
| 434 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 450 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 
| 435 | if (ptr) | 451 | if (ptr) | 
| 436 | return (ptr); | 452 | return ptr; | 
| 437 | 453 | ||
| 438 | return __alloc_bootmem(size, align, goal); | 454 | return __alloc_bootmem(size, align, goal); | 
| 439 | } | 455 | } | 
| 440 | 456 | ||
| 441 | #define LOW32LIMIT 0xffffffff | 457 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 
| 458 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | ||
| 459 | #endif | ||
| 442 | 460 | ||
| 443 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) | 461 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | 
| 462 | unsigned long goal) | ||
| 444 | { | 463 | { | 
| 445 | bootmem_data_t *bdata; | 464 | bootmem_data_t *bdata; | 
| 446 | void *ptr; | 465 | void *ptr; | 
| 447 | 466 | ||
| 448 | list_for_each_entry(bdata, &bdata_list, list) | 467 | list_for_each_entry(bdata, &bdata_list, list) { | 
| 449 | if ((ptr = __alloc_bootmem_core(bdata, size, | 468 | ptr = __alloc_bootmem_core(bdata, size, align, goal, | 
| 450 | align, goal, LOW32LIMIT))) | 469 | ARCH_LOW_ADDRESS_LIMIT); | 
| 451 | return(ptr); | 470 | if (ptr) | 
| 471 | return ptr; | ||
| 472 | } | ||
| 452 | 473 | ||
| 453 | /* | 474 | /* | 
| 454 | * Whoops, we cannot satisfy the allocation request. | 475 | * Whoops, we cannot satisfy the allocation request. | 
| @@ -461,5 +482,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsig | |||
| 461 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 482 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 
| 462 | unsigned long align, unsigned long goal) | 483 | unsigned long align, unsigned long goal) | 
| 463 | { | 484 | { | 
| 464 | return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); | 485 | return __alloc_bootmem_core(pgdat->bdata, size, align, goal, | 
| 486 | ARCH_LOW_ADDRESS_LIMIT); | ||
| 465 | } | 487 | } | 
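Reader aid (not part of the patch): most of the bootmem.c churn above replaces open-coded page-shift arithmetic with the PFN_UP()/PFN_DOWN()/PFN_PHYS() helpers, which is why <linux/pfn.h> is now included at the top of the file. These helpers are conventionally defined along the following lines:

    /* reference sketch of the <linux/pfn.h> helpers used above */
    #define PFN_UP(x)     (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)  /* round up to the next page frame number */
    #define PFN_DOWN(x)   ((x) >> PAGE_SHIFT)                    /* round down to a page frame number */
    #define PFN_PHYS(x)   ((x) << PAGE_SHIFT)                    /* page frame number back to a physical address */

So, for example, the old "(addr + size - bdata->node_boot_start + PAGE_SIZE-1)/PAGE_SIZE" in reserve_bootmem_core() becomes simply PFN_UP(addr + size - bdata->node_boot_start).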
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0a03357a1f8e..168c78a121bb 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
| @@ -23,18 +23,6 @@ | |||
| 23 | /* | 23 | /* | 
| 24 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could | 24 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could | 
| 25 | * deactivate the pages and clear PG_Referenced. | 25 | * deactivate the pages and clear PG_Referenced. | 
| 26 | * | ||
| 27 | * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file | ||
| 28 | * offsets `offset' and `offset+len' inclusive. Any pages which are currently | ||
| 29 | * under writeout are skipped, whether or not they are dirty. | ||
| 30 | * | ||
| 31 | * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file | ||
| 32 | * offsets `offset' and `offset+len'. | ||
| 33 | * | ||
| 34 | * By combining these two operations the application may do several things: | ||
| 35 | * | ||
| 36 | * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. | ||
| 37 | * | ||
| 38 | */ | 26 | */ | 
| 39 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | 27 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | 
| 40 | { | 28 | { | 
| @@ -85,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
| 85 | file->f_ra.ra_pages = bdi->ra_pages * 2; | 73 | file->f_ra.ra_pages = bdi->ra_pages * 2; | 
| 86 | break; | 74 | break; | 
| 87 | case POSIX_FADV_WILLNEED: | 75 | case POSIX_FADV_WILLNEED: | 
| 88 | case POSIX_FADV_NOREUSE: | ||
| 89 | if (!mapping->a_ops->readpage) { | 76 | if (!mapping->a_ops->readpage) { | 
| 90 | ret = -EINVAL; | 77 | ret = -EINVAL; | 
| 91 | break; | 78 | break; | 
| @@ -106,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
| 106 | if (ret > 0) | 93 | if (ret > 0) | 
| 107 | ret = 0; | 94 | ret = 0; | 
| 108 | break; | 95 | break; | 
| 96 | case POSIX_FADV_NOREUSE: | ||
| 97 | break; | ||
| 109 | case POSIX_FADV_DONTNEED: | 98 | case POSIX_FADV_DONTNEED: | 
| 110 | if (!bdi_write_congested(mapping->backing_dev_info)) | 99 | if (!bdi_write_congested(mapping->backing_dev_info)) | 
| 111 | filemap_flush(mapping); | 100 | filemap_flush(mapping); | 
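From userspace, the visible effect of the fadvise.c change above is that POSIX_FADV_NOREUSE becomes a no-op instead of being treated like POSIX_FADV_WILLNEED, while the other hints keep their behaviour. A small, hypothetical test program (not part of this commit; the file path is invented) could exercise it like this:

    #define _XOPEN_SOURCE 600
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/var/tmp/example.dat", O_RDONLY);  /* hypothetical file */
            int err;

            if (fd < 0)
                    return 1;

            /* still starts readahead on the first 1 MiB of the file */
            err = posix_fadvise(fd, 0, 1 << 20, POSIX_FADV_WILLNEED);
            if (err)
                    fprintf(stderr, "WILLNEED: %s\n", strerror(err));

            /* accepted, but after this patch the kernel does nothing with it */
            err = posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
            if (err)
                    fprintf(stderr, "NOREUSE: %s\n", strerror(err));

            close(fd);
            return 0;
    }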
diff --git a/mm/filemap.c b/mm/filemap.c
index 807a463fd5ed..3277f3b23524 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -9,7 +9,6 @@ | |||
| 9 | * most "normal" filesystems (but you don't /have/ to use this: | 9 | * most "normal" filesystems (but you don't /have/ to use this: | 
| 10 | * the NFS filesystem used to do this differently, for example) | 10 | * the NFS filesystem used to do this differently, for example) | 
| 11 | */ | 11 | */ | 
| 12 | #include <linux/config.h> | ||
| 13 | #include <linux/module.h> | 12 | #include <linux/module.h> | 
| 14 | #include <linux/slab.h> | 13 | #include <linux/slab.h> | 
| 15 | #include <linux/compiler.h> | 14 | #include <linux/compiler.h> | 
| @@ -120,7 +119,7 @@ void __remove_from_page_cache(struct page *page) | |||
| 120 | radix_tree_delete(&mapping->page_tree, page->index); | 119 | radix_tree_delete(&mapping->page_tree, page->index); | 
| 121 | page->mapping = NULL; | 120 | page->mapping = NULL; | 
| 122 | mapping->nrpages--; | 121 | mapping->nrpages--; | 
| 123 | pagecache_acct(-1); | 122 | __dec_zone_page_state(page, NR_FILE_PAGES); | 
| 124 | } | 123 | } | 
| 125 | 124 | ||
| 126 | void remove_from_page_cache(struct page *page) | 125 | void remove_from_page_cache(struct page *page) | 
| @@ -449,7 +448,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, | |||
| 449 | page->mapping = mapping; | 448 | page->mapping = mapping; | 
| 450 | page->index = offset; | 449 | page->index = offset; | 
| 451 | mapping->nrpages++; | 450 | mapping->nrpages++; | 
| 452 | pagecache_acct(1); | 451 | __inc_zone_page_state(page, NR_FILE_PAGES); | 
| 453 | } | 452 | } | 
| 454 | write_unlock_irq(&mapping->tree_lock); | 453 | write_unlock_irq(&mapping->tree_lock); | 
| 455 | radix_tree_preload_end(); | 454 | radix_tree_preload_end(); | 
| @@ -489,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x) | |||
| 489 | EXPORT_SYMBOL(page_cache_alloc_cold); | 488 | EXPORT_SYMBOL(page_cache_alloc_cold); | 
| 490 | #endif | 489 | #endif | 
| 491 | 490 | ||
| 491 | static int __sleep_on_page_lock(void *word) | ||
| 492 | { | ||
| 493 | io_schedule(); | ||
| 494 | return 0; | ||
| 495 | } | ||
| 496 | |||
| 492 | /* | 497 | /* | 
| 493 | * In order to wait for pages to become available there must be | 498 | * In order to wait for pages to become available there must be | 
| 494 | * waitqueues associated with pages. By using a hash table of | 499 | * waitqueues associated with pages. By using a hash table of | 
| @@ -578,13 +583,24 @@ void fastcall __lock_page(struct page *page) | |||
| 578 | } | 583 | } | 
| 579 | EXPORT_SYMBOL(__lock_page); | 584 | EXPORT_SYMBOL(__lock_page); | 
| 580 | 585 | ||
| 586 | /* | ||
| 587 | * Variant of lock_page that does not require the caller to hold a reference | ||
| 588 | * on the page's mapping. | ||
| 589 | */ | ||
| 590 | void fastcall __lock_page_nosync(struct page *page) | ||
| 591 | { | ||
| 592 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | ||
| 593 | __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, | ||
| 594 | TASK_UNINTERRUPTIBLE); | ||
| 595 | } | ||
| 596 | |||
| 581 | /** | 597 | /** | 
| 582 | * find_get_page - find and get a page reference | 598 | * find_get_page - find and get a page reference | 
| 583 | * @mapping: the address_space to search | 599 | * @mapping: the address_space to search | 
| 584 | * @offset: the page index | 600 | * @offset: the page index | 
| 585 | * | 601 | * | 
| 586 | * A rather lightweight function, finding and getting a reference to a | 602 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 
| 587 | * hashed page atomically. | 603 | * If yes, increment its refcount and return it; if no, return NULL. | 
| 588 | */ | 604 | */ | 
| 589 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 605 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 
| 590 | { | 606 | { | 
| @@ -828,6 +844,30 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | |||
| 828 | } | 844 | } | 
| 829 | EXPORT_SYMBOL(grab_cache_page_nowait); | 845 | EXPORT_SYMBOL(grab_cache_page_nowait); | 
| 830 | 846 | ||
| 847 | /* | ||
| 848 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail | ||
| 849 | * a _large_ part of the i/o request. Imagine the worst scenario: | ||
| 850 | * | ||
| 851 | * ---R__________________________________________B__________ | ||
| 852 | * ^ reading here ^ bad block(assume 4k) | ||
| 853 | * | ||
| 854 | * read(R) => miss => readahead(R...B) => media error => frustrating retries | ||
| 855 | * => failing the whole request => read(R) => read(R+1) => | ||
| 856 | * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => | ||
| 857 | * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => | ||
| 858 | * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... | ||
| 859 | * | ||
| 860 | * It is going insane. Fix it by quickly scaling down the readahead size. | ||
| 861 | */ | ||
| 862 | static void shrink_readahead_size_eio(struct file *filp, | ||
| 863 | struct file_ra_state *ra) | ||
| 864 | { | ||
| 865 | if (!ra->ra_pages) | ||
| 866 | return; | ||
| 867 | |||
| 868 | ra->ra_pages /= 4; | ||
| 869 | } | ||
| 870 | |||
| 831 | /** | 871 | /** | 
| 832 | * do_generic_mapping_read - generic file read routine | 872 | * do_generic_mapping_read - generic file read routine | 
| 833 | * @mapping: address_space to be read | 873 | * @mapping: address_space to be read | 
| @@ -947,7 +987,7 @@ page_not_up_to_date: | |||
| 947 | /* Get exclusive access to the page ... */ | 987 | /* Get exclusive access to the page ... */ | 
| 948 | lock_page(page); | 988 | lock_page(page); | 
| 949 | 989 | ||
| 950 | /* Did it get unhashed before we got the lock? */ | 990 | /* Did it get truncated before we got the lock? */ | 
| 951 | if (!page->mapping) { | 991 | if (!page->mapping) { | 
| 952 | unlock_page(page); | 992 | unlock_page(page); | 
| 953 | page_cache_release(page); | 993 | page_cache_release(page); | 
| @@ -985,6 +1025,7 @@ readpage: | |||
| 985 | } | 1025 | } | 
| 986 | unlock_page(page); | 1026 | unlock_page(page); | 
| 987 | error = -EIO; | 1027 | error = -EIO; | 
| 1028 | shrink_readahead_size_eio(filp, &ra); | ||
| 988 | goto readpage_error; | 1029 | goto readpage_error; | 
| 989 | } | 1030 | } | 
| 990 | unlock_page(page); | 1031 | unlock_page(page); | 
| @@ -1389,7 +1430,7 @@ retry_find: | |||
| 1389 | */ | 1430 | */ | 
| 1390 | if (!did_readaround) { | 1431 | if (!did_readaround) { | 
| 1391 | majmin = VM_FAULT_MAJOR; | 1432 | majmin = VM_FAULT_MAJOR; | 
| 1392 | inc_page_state(pgmajfault); | 1433 | count_vm_event(PGMAJFAULT); | 
| 1393 | } | 1434 | } | 
| 1394 | did_readaround = 1; | 1435 | did_readaround = 1; | 
| 1395 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); | 1436 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); | 
| @@ -1430,7 +1471,7 @@ outside_data_content: | |||
| 1430 | * accessible.. | 1471 | * accessible.. | 
| 1431 | */ | 1472 | */ | 
| 1432 | if (area->vm_mm == current->mm) | 1473 | if (area->vm_mm == current->mm) | 
| 1433 | return NULL; | 1474 | return NOPAGE_SIGBUS; | 
| 1434 | /* Fall through to the non-read-ahead case */ | 1475 | /* Fall through to the non-read-ahead case */ | 
| 1435 | no_cached_page: | 1476 | no_cached_page: | 
| 1436 | /* | 1477 | /* | 
| @@ -1455,12 +1496,12 @@ no_cached_page: | |||
| 1455 | */ | 1496 | */ | 
| 1456 | if (error == -ENOMEM) | 1497 | if (error == -ENOMEM) | 
| 1457 | return NOPAGE_OOM; | 1498 | return NOPAGE_OOM; | 
| 1458 | return NULL; | 1499 | return NOPAGE_SIGBUS; | 
| 1459 | 1500 | ||
| 1460 | page_not_uptodate: | 1501 | page_not_uptodate: | 
| 1461 | if (!did_readaround) { | 1502 | if (!did_readaround) { | 
| 1462 | majmin = VM_FAULT_MAJOR; | 1503 | majmin = VM_FAULT_MAJOR; | 
| 1463 | inc_page_state(pgmajfault); | 1504 | count_vm_event(PGMAJFAULT); | 
| 1464 | } | 1505 | } | 
| 1465 | lock_page(page); | 1506 | lock_page(page); | 
| 1466 | 1507 | ||
| @@ -1522,8 +1563,9 @@ page_not_uptodate: | |||
| 1522 | * Things didn't work out. Return zero to tell the | 1563 | * Things didn't work out. Return zero to tell the | 
| 1523 | * mm layer so, possibly freeing the page cache page first. | 1564 | * mm layer so, possibly freeing the page cache page first. | 
| 1524 | */ | 1565 | */ | 
| 1566 | shrink_readahead_size_eio(file, ra); | ||
| 1525 | page_cache_release(page); | 1567 | page_cache_release(page); | 
| 1526 | return NULL; | 1568 | return NOPAGE_SIGBUS; | 
| 1527 | } | 1569 | } | 
| 1528 | EXPORT_SYMBOL(filemap_nopage); | 1570 | EXPORT_SYMBOL(filemap_nopage); | 
| 1529 | 1571 | ||
| @@ -1585,7 +1627,7 @@ no_cached_page: | |||
| 1585 | page_not_uptodate: | 1627 | page_not_uptodate: | 
| 1586 | lock_page(page); | 1628 | lock_page(page); | 
| 1587 | 1629 | ||
| 1588 | /* Did it get unhashed while we waited for it? */ | 1630 | /* Did it get truncated while we waited for it? */ | 
| 1589 | if (!page->mapping) { | 1631 | if (!page->mapping) { | 
| 1590 | unlock_page(page); | 1632 | unlock_page(page); | 
| 1591 | goto err; | 1633 | goto err; | 
| @@ -1892,7 +1934,7 @@ int remove_suid(struct dentry *dentry) | |||
| 1892 | EXPORT_SYMBOL(remove_suid); | 1934 | EXPORT_SYMBOL(remove_suid); | 
| 1893 | 1935 | ||
| 1894 | size_t | 1936 | size_t | 
| 1895 | __filemap_copy_from_user_iovec(char *vaddr, | 1937 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | 
| 1896 | const struct iovec *iov, size_t base, size_t bytes) | 1938 | const struct iovec *iov, size_t base, size_t bytes) | 
| 1897 | { | 1939 | { | 
| 1898 | size_t copied = 0, left = 0; | 1940 | size_t copied = 0, left = 0; | 
| @@ -1908,12 +1950,8 @@ __filemap_copy_from_user_iovec(char *vaddr, | |||
| 1908 | vaddr += copy; | 1950 | vaddr += copy; | 
| 1909 | iov++; | 1951 | iov++; | 
| 1910 | 1952 | ||
| 1911 | if (unlikely(left)) { | 1953 | if (unlikely(left)) | 
| 1912 | /* zero the rest of the target like __copy_from_user */ | ||
| 1913 | if (bytes) | ||
| 1914 | memset(vaddr, 0, bytes); | ||
| 1915 | break; | 1954 | break; | 
| 1916 | } | ||
| 1917 | } | 1955 | } | 
| 1918 | return copied - left; | 1956 | return copied - left; | 
| 1919 | } | 1957 | } | 
| @@ -2045,7 +2083,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2045 | { | 2083 | { | 
| 2046 | struct file *file = iocb->ki_filp; | 2084 | struct file *file = iocb->ki_filp; | 
| 2047 | struct address_space * mapping = file->f_mapping; | 2085 | struct address_space * mapping = file->f_mapping; | 
| 2048 | struct address_space_operations *a_ops = mapping->a_ops; | 2086 | const struct address_space_operations *a_ops = mapping->a_ops; | 
| 2049 | struct inode *inode = mapping->host; | 2087 | struct inode *inode = mapping->host; | 
| 2050 | long status = 0; | 2088 | long status = 0; | 
| 2051 | struct page *page; | 2089 | struct page *page; | 
| @@ -2071,14 +2109,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2071 | do { | 2109 | do { | 
| 2072 | unsigned long index; | 2110 | unsigned long index; | 
| 2073 | unsigned long offset; | 2111 | unsigned long offset; | 
| 2074 | unsigned long maxlen; | ||
| 2075 | size_t copied; | 2112 | size_t copied; | 
| 2076 | 2113 | ||
| 2077 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 2114 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 
| 2078 | index = pos >> PAGE_CACHE_SHIFT; | 2115 | index = pos >> PAGE_CACHE_SHIFT; | 
| 2079 | bytes = PAGE_CACHE_SIZE - offset; | 2116 | bytes = PAGE_CACHE_SIZE - offset; | 
| 2080 | if (bytes > count) | 2117 | |
| 2081 | bytes = count; | 2118 | /* Limit the size of the copy to the caller's write size */ | 
| 2119 | bytes = min(bytes, count); | ||
| 2120 | |||
| 2121 | /* | ||
| 2122 | * Limit the size of the copy to that of the current segment, | ||
| 2123 | * because fault_in_pages_readable() doesn't know how to walk | ||
| 2124 | * segments. | ||
| 2125 | */ | ||
| 2126 | bytes = min(bytes, cur_iov->iov_len - iov_base); | ||
| 2082 | 2127 | ||
| 2083 | /* | 2128 | /* | 
| 2084 | * Bring in the user page that we will copy from _first_. | 2129 | * Bring in the user page that we will copy from _first_. | 
| @@ -2086,10 +2131,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2086 | * same page as we're writing to, without it being marked | 2131 | * same page as we're writing to, without it being marked | 
| 2087 | * up-to-date. | 2132 | * up-to-date. | 
| 2088 | */ | 2133 | */ | 
| 2089 | maxlen = cur_iov->iov_len - iov_base; | 2134 | fault_in_pages_readable(buf, bytes); | 
| 2090 | if (maxlen > bytes) | ||
| 2091 | maxlen = bytes; | ||
| 2092 | fault_in_pages_readable(buf, maxlen); | ||
| 2093 | 2135 | ||
| 2094 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | 2136 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | 
| 2095 | if (!page) { | 2137 | if (!page) { | 
| @@ -2097,6 +2139,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2097 | break; | 2139 | break; | 
| 2098 | } | 2140 | } | 
| 2099 | 2141 | ||
| 2142 | if (unlikely(bytes == 0)) { | ||
| 2143 | status = 0; | ||
| 2144 | copied = 0; | ||
| 2145 | goto zero_length_segment; | ||
| 2146 | } | ||
| 2147 | |||
| 2100 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2148 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 
| 2101 | if (unlikely(status)) { | 2149 | if (unlikely(status)) { | 
| 2102 | loff_t isize = i_size_read(inode); | 2150 | loff_t isize = i_size_read(inode); | 
| @@ -2126,7 +2174,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2126 | page_cache_release(page); | 2174 | page_cache_release(page); | 
| 2127 | continue; | 2175 | continue; | 
| 2128 | } | 2176 | } | 
| 2129 | if (likely(copied > 0)) { | 2177 | zero_length_segment: | 
| 2178 | if (likely(copied >= 0)) { | ||
| 2130 | if (!status) | 2179 | if (!status) | 
| 2131 | status = copied; | 2180 | status = copied; | 
| 2132 | 2181 | ||
| @@ -2191,7 +2240,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
| 2191 | unsigned long nr_segs, loff_t *ppos) | 2240 | unsigned long nr_segs, loff_t *ppos) | 
| 2192 | { | 2241 | { | 
| 2193 | struct file *file = iocb->ki_filp; | 2242 | struct file *file = iocb->ki_filp; | 
| 2194 | struct address_space * mapping = file->f_mapping; | 2243 | const struct address_space * mapping = file->f_mapping; | 
| 2195 | size_t ocount; /* original count */ | 2244 | size_t ocount; /* original count */ | 
| 2196 | size_t count; /* after file limit checks */ | 2245 | size_t count; /* after file limit checks */ | 
| 2197 | struct inode *inode = mapping->host; | 2246 | struct inode *inode = mapping->host; | 
diff --git a/mm/filemap.h b/mm/filemap.h
index 5683cde22055..3f2a343c6015 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
| @@ -16,15 +16,23 @@ | |||
| 16 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> | 
| 17 | 17 | ||
| 18 | size_t | 18 | size_t | 
| 19 | __filemap_copy_from_user_iovec(char *vaddr, | 19 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | 
| 20 | const struct iovec *iov, | 20 | const struct iovec *iov, | 
| 21 | size_t base, | 21 | size_t base, | 
| 22 | size_t bytes); | 22 | size_t bytes); | 
| 23 | 23 | ||
| 24 | /* | 24 | /* | 
| 25 | * Copy as much as we can into the page and return the number of bytes which | 25 | * Copy as much as we can into the page and return the number of bytes which | 
| 26 | * were sucessfully copied. If a fault is encountered then clear the page | 26 | * were sucessfully copied. If a fault is encountered then clear the page | 
| 27 | * out to (offset+bytes) and return the number of bytes which were copied. | 27 | * out to (offset+bytes) and return the number of bytes which were copied. | 
| 28 | * | ||
| 29 | * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache | ||
| 30 | * to *NOT* zero any tail of the buffer that it failed to copy. If it does, | ||
| 31 | * and if the following non-atomic copy succeeds, then there is a small window | ||
| 32 | * where the target page contains neither the data before the write, nor the | ||
| 33 | * data after the write (it contains zero). A read at this time will see | ||
| 34 | * data that is inconsistent with any ordering of the read and the write. | ||
| 35 | * (This has been detected in practice). | ||
| 28 | */ | 36 | */ | 
| 29 | static inline size_t | 37 | static inline size_t | 
| 30 | filemap_copy_from_user(struct page *page, unsigned long offset, | 38 | filemap_copy_from_user(struct page *page, unsigned long offset, | 
| @@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset, | |||
| 60 | size_t copied; | 68 | size_t copied; | 
| 61 | 69 | ||
| 62 | kaddr = kmap_atomic(page, KM_USER0); | 70 | kaddr = kmap_atomic(page, KM_USER0); | 
| 63 | copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, | 71 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | 
| 64 | base, bytes); | 72 | base, bytes); | 
| 65 | kunmap_atomic(kaddr, KM_USER0); | 73 | kunmap_atomic(kaddr, KM_USER0); | 
| 66 | if (copied != bytes) { | 74 | if (copied != bytes) { | 
| 67 | kaddr = kmap(page); | 75 | kaddr = kmap(page); | 
| 68 | copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, | 76 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | 
| 69 | base, bytes); | 77 | base, bytes); | 
| 78 | if (bytes - copied) | ||
| 79 | memset(kaddr + offset + copied, 0, bytes - copied); | ||
| 70 | kunmap(page); | 80 | kunmap(page); | 
| 71 | } | 81 | } | 
| 72 | return copied; | 82 | return copied; | 
| @@ -78,7 +88,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | |||
| 78 | const struct iovec *iov = *iovp; | 88 | const struct iovec *iov = *iovp; | 
| 79 | size_t base = *basep; | 89 | size_t base = *basep; | 
| 80 | 90 | ||
| 81 | while (bytes) { | 91 | do { | 
| 82 | int copy = min(bytes, iov->iov_len - base); | 92 | int copy = min(bytes, iov->iov_len - base); | 
| 83 | 93 | ||
| 84 | bytes -= copy; | 94 | bytes -= copy; | 
| @@ -87,7 +97,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | |||
| 87 | iov++; | 97 | iov++; | 
| 88 | base = 0; | 98 | base = 0; | 
| 89 | } | 99 | } | 
| 90 | } | 100 | } while (bytes); | 
| 91 | *iovp = iov; | 101 | *iovp = iov; | 
| 92 | *basep = base; | 102 | *basep = base; | 
| 93 | } | 103 | } | 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b960ac8e5918..b4fd0d7c9bfb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
| @@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 273 | size_t count, loff_t pos, loff_t *ppos) | 273 | size_t count, loff_t pos, loff_t *ppos) | 
| 274 | { | 274 | { | 
| 275 | struct address_space * mapping = filp->f_mapping; | 275 | struct address_space * mapping = filp->f_mapping; | 
| 276 | struct address_space_operations *a_ops = mapping->a_ops; | 276 | const struct address_space_operations *a_ops = mapping->a_ops; | 
| 277 | struct inode *inode = mapping->host; | 277 | struct inode *inode = mapping->host; | 
| 278 | long status = 0; | 278 | long status = 0; | 
| 279 | struct page *page; | 279 | struct page *page; | 
| diff --git a/mm/fremap.c b/mm/fremap.c index 21b7d0cbc98c..aa30618ec6b2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
| @@ -79,9 +79,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 79 | inc_mm_counter(mm, file_rss); | 79 | inc_mm_counter(mm, file_rss); | 
| 80 | 80 | ||
| 81 | flush_icache_page(vma, page); | 81 | flush_icache_page(vma, page); | 
| 82 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 82 | pte_val = mk_pte(page, prot); | 
| 83 | set_pte_at(mm, addr, pte, pte_val); | ||
| 83 | page_add_file_rmap(page); | 84 | page_add_file_rmap(page); | 
| 84 | pte_val = *pte; | ||
| 85 | update_mmu_cache(vma, addr, pte_val); | 85 | update_mmu_cache(vma, addr, pte_val); | 
| 86 | lazy_mmu_prot_update(pte_val); | 86 | lazy_mmu_prot_update(pte_val); | 
| 87 | err = 0; | 87 | err = 0; | 
| diff --git a/mm/highmem.c b/mm/highmem.c index 9b274fdf9d08..ee5519b176ee 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
| @@ -46,6 +46,19 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) | |||
| 46 | */ | 46 | */ | 
| 47 | #ifdef CONFIG_HIGHMEM | 47 | #ifdef CONFIG_HIGHMEM | 
| 48 | 48 | ||
| 49 | unsigned long totalhigh_pages __read_mostly; | ||
| 50 | |||
| 51 | unsigned int nr_free_highpages (void) | ||
| 52 | { | ||
| 53 | pg_data_t *pgdat; | ||
| 54 | unsigned int pages = 0; | ||
| 55 | |||
| 56 | for_each_online_pgdat(pgdat) | ||
| 57 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; | ||
| 58 | |||
| 59 | return pages; | ||
| 60 | } | ||
| 61 | |||
| 49 | static int pkmap_count[LAST_PKMAP]; | 62 | static int pkmap_count[LAST_PKMAP]; | 
| 50 | static unsigned int last_pkmap_nr; | 63 | static unsigned int last_pkmap_nr; | 
| 51 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); | 64 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); | 
| @@ -315,8 +328,8 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | |||
| 315 | if (bvec->bv_page == org_vec->bv_page) | 328 | if (bvec->bv_page == org_vec->bv_page) | 
| 316 | continue; | 329 | continue; | 
| 317 | 330 | ||
| 318 | mempool_free(bvec->bv_page, pool); | 331 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); | 
| 319 | dec_page_state(nr_bounce); | 332 | mempool_free(bvec->bv_page, pool); | 
| 320 | } | 333 | } | 
| 321 | 334 | ||
| 322 | bio_endio(bio_orig, bio_orig->bi_size, err); | 335 | bio_endio(bio_orig, bio_orig->bi_size, err); | 
| @@ -397,7 +410,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | |||
| 397 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | 410 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | 
| 398 | to->bv_len = from->bv_len; | 411 | to->bv_len = from->bv_len; | 
| 399 | to->bv_offset = from->bv_offset; | 412 | to->bv_offset = from->bv_offset; | 
| 400 | inc_page_state(nr_bounce); | 413 | inc_zone_page_state(to->bv_page, NR_BOUNCE); | 
| 401 | 414 | ||
| 402 | if (rw == WRITE) { | 415 | if (rw == WRITE) { | 
| 403 | char *vto, *vfrom; | 416 | char *vto, *vfrom; | 
| diff --git a/mm/hugetlb.c b/mm/hugetlb.c index df499973255f..7c7d03dbf73d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
| 72 | struct zone **z; | 72 | struct zone **z; | 
| 73 | 73 | ||
| 74 | for (z = zonelist->zones; *z; z++) { | 74 | for (z = zonelist->zones; *z; z++) { | 
| 75 | nid = (*z)->zone_pgdat->node_id; | 75 | nid = zone_to_nid(*z); | 
| 76 | if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && | 76 | if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && | 
| 77 | !list_empty(&hugepage_freelists[nid])) | 77 | !list_empty(&hugepage_freelists[nid])) | 
| 78 | break; | 78 | break; | 
| @@ -177,7 +177,7 @@ static void update_and_free_page(struct page *page) | |||
| 177 | { | 177 | { | 
| 178 | int i; | 178 | int i; | 
| 179 | nr_huge_pages--; | 179 | nr_huge_pages--; | 
| 180 | nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; | 180 | nr_huge_pages_node[page_to_nid(page)]--; | 
| 181 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | 181 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | 
| 182 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 182 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 
| 183 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 183 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 
| @@ -191,7 +191,8 @@ static void update_and_free_page(struct page *page) | |||
| 191 | #ifdef CONFIG_HIGHMEM | 191 | #ifdef CONFIG_HIGHMEM | 
| 192 | static void try_to_free_low(unsigned long count) | 192 | static void try_to_free_low(unsigned long count) | 
| 193 | { | 193 | { | 
| 194 | int i, nid; | 194 | int i; | 
| 195 | |||
| 195 | for (i = 0; i < MAX_NUMNODES; ++i) { | 196 | for (i = 0; i < MAX_NUMNODES; ++i) { | 
| 196 | struct page *page, *next; | 197 | struct page *page, *next; | 
| 197 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 198 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 
| @@ -199,9 +200,8 @@ static void try_to_free_low(unsigned long count) | |||
| 199 | continue; | 200 | continue; | 
| 200 | list_del(&page->lru); | 201 | list_del(&page->lru); | 
| 201 | update_and_free_page(page); | 202 | update_and_free_page(page); | 
| 202 | nid = page_zone(page)->zone_pgdat->node_id; | ||
| 203 | free_huge_pages--; | 203 | free_huge_pages--; | 
| 204 | free_huge_pages_node[nid]--; | 204 | free_huge_pages_node[page_to_nid(page)]--; | 
| 205 | if (count >= nr_huge_pages) | 205 | if (count >= nr_huge_pages) | 
| 206 | return; | 206 | return; | 
| 207 | } | 207 | } | 
| diff --git a/mm/internal.h b/mm/internal.h index d20e3cc4aef0..d527b80b292f 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -24,8 +24,8 @@ static inline void set_page_count(struct page *page, int v) | |||
| 24 | */ | 24 | */ | 
| 25 | static inline void set_page_refcounted(struct page *page) | 25 | static inline void set_page_refcounted(struct page *page) | 
| 26 | { | 26 | { | 
| 27 | BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); | 27 | VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); | 
| 28 | BUG_ON(atomic_read(&page->_count)); | 28 | VM_BUG_ON(atomic_read(&page->_count)); | 
| 29 | set_page_count(page, 1); | 29 | set_page_count(page, 1); | 
| 30 | } | 30 | } | 
| 31 | 31 | ||
| diff --git a/mm/memory.c b/mm/memory.c index 247b5c312b9b..160f5b503ead 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -47,7 +47,9 @@ | |||
| 47 | #include <linux/pagemap.h> | 47 | #include <linux/pagemap.h> | 
| 48 | #include <linux/rmap.h> | 48 | #include <linux/rmap.h> | 
| 49 | #include <linux/module.h> | 49 | #include <linux/module.h> | 
| 50 | #include <linux/delayacct.h> | ||
| 50 | #include <linux/init.h> | 51 | #include <linux/init.h> | 
| 52 | #include <linux/writeback.h> | ||
| 51 | 53 | ||
| 52 | #include <asm/pgalloc.h> | 54 | #include <asm/pgalloc.h> | 
| 53 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> | 
| @@ -126,7 +128,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | |||
| 126 | pmd_clear(pmd); | 128 | pmd_clear(pmd); | 
| 127 | pte_lock_deinit(page); | 129 | pte_lock_deinit(page); | 
| 128 | pte_free_tlb(tlb, page); | 130 | pte_free_tlb(tlb, page); | 
| 129 | dec_page_state(nr_page_table_pages); | 131 | dec_zone_page_state(page, NR_PAGETABLE); | 
| 130 | tlb->mm->nr_ptes--; | 132 | tlb->mm->nr_ptes--; | 
| 131 | } | 133 | } | 
| 132 | 134 | ||
| @@ -311,7 +313,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | |||
| 311 | pte_free(new); | 313 | pte_free(new); | 
| 312 | } else { | 314 | } else { | 
| 313 | mm->nr_ptes++; | 315 | mm->nr_ptes++; | 
| 314 | inc_page_state(nr_page_table_pages); | 316 | inc_zone_page_state(new, NR_PAGETABLE); | 
| 315 | pmd_populate(mm, pmd, new); | 317 | pmd_populate(mm, pmd, new); | 
| 316 | } | 318 | } | 
| 317 | spin_unlock(&mm->page_table_lock); | 319 | spin_unlock(&mm->page_table_lock); | 
| @@ -503,7 +505,7 @@ again: | |||
| 503 | return -ENOMEM; | 505 | return -ENOMEM; | 
| 504 | src_pte = pte_offset_map_nested(src_pmd, addr); | 506 | src_pte = pte_offset_map_nested(src_pmd, addr); | 
| 505 | src_ptl = pte_lockptr(src_mm, src_pmd); | 507 | src_ptl = pte_lockptr(src_mm, src_pmd); | 
| 506 | spin_lock(src_ptl); | 508 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | 
| 507 | 509 | ||
| 508 | do { | 510 | do { | 
| 509 | /* | 511 | /* | 
| @@ -1225,7 +1227,12 @@ out: | |||
| 1225 | return retval; | 1227 | return retval; | 
| 1226 | } | 1228 | } | 
| 1227 | 1229 | ||
| 1228 | /* | 1230 | /** | 
| 1231 | * vm_insert_page - insert single page into user vma | ||
| 1232 | * @vma: user vma to map to | ||
| 1233 | * @addr: target user address of this page | ||
| 1234 | * @page: source kernel page | ||
| 1235 | * | ||
| 1229 | * This allows drivers to insert individual pages they've allocated | 1236 | * This allows drivers to insert individual pages they've allocated | 
| 1230 | * into a user vma. | 1237 | * into a user vma. | 
| 1231 | * | 1238 | * | 
| @@ -1317,7 +1324,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
| 1317 | return 0; | 1324 | return 0; | 
| 1318 | } | 1325 | } | 
| 1319 | 1326 | ||
| 1320 | /* Note: this is only safe if the mm semaphore is held when called. */ | 1327 | /** | 
| 1328 | * remap_pfn_range - remap kernel memory to userspace | ||
| 1329 | * @vma: user vma to map to | ||
| 1330 | * @addr: target user address to start at | ||
| 1331 | * @pfn: physical address of kernel memory | ||
| 1332 | * @size: size of map area | ||
| 1333 | * @prot: page protection flags for this mapping | ||
| 1334 | * | ||
| 1335 | * Note: this is only safe if the mm semaphore is held when called. | ||
| 1336 | */ | ||
| 1321 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | 1337 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | 
| 1322 | unsigned long pfn, unsigned long size, pgprot_t prot) | 1338 | unsigned long pfn, unsigned long size, pgprot_t prot) | 
| 1323 | { | 1339 | { | 
| @@ -1457,14 +1473,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1457 | { | 1473 | { | 
| 1458 | struct page *old_page, *new_page; | 1474 | struct page *old_page, *new_page; | 
| 1459 | pte_t entry; | 1475 | pte_t entry; | 
| 1460 | int reuse, ret = VM_FAULT_MINOR; | 1476 | int reuse = 0, ret = VM_FAULT_MINOR; | 
| 1477 | struct page *dirty_page = NULL; | ||
| 1461 | 1478 | ||
| 1462 | old_page = vm_normal_page(vma, address, orig_pte); | 1479 | old_page = vm_normal_page(vma, address, orig_pte); | 
| 1463 | if (!old_page) | 1480 | if (!old_page) | 
| 1464 | goto gotten; | 1481 | goto gotten; | 
| 1465 | 1482 | ||
| 1466 | if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == | 1483 | /* | 
| 1467 | (VM_SHARED|VM_WRITE))) { | 1484 | * Take out anonymous pages first, anonymous shared vmas are | 
| 1485 | * not dirty accountable. | ||
| 1486 | */ | ||
| 1487 | if (PageAnon(old_page)) { | ||
| 1488 | if (!TestSetPageLocked(old_page)) { | ||
| 1489 | reuse = can_share_swap_page(old_page); | ||
| 1490 | unlock_page(old_page); | ||
| 1491 | } | ||
| 1492 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 1493 | (VM_WRITE|VM_SHARED))) { | ||
| 1494 | /* | ||
| 1495 | * Only catch write-faults on shared writable pages, | ||
| 1496 | * read-only shared pages can get COWed by | ||
| 1497 | * get_user_pages(.write=1, .force=1). | ||
| 1498 | */ | ||
| 1468 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 1499 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 
| 1469 | /* | 1500 | /* | 
| 1470 | * Notify the address space that the page is about to | 1501 | * Notify the address space that the page is about to | 
| @@ -1493,13 +1524,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1493 | if (!pte_same(*page_table, orig_pte)) | 1524 | if (!pte_same(*page_table, orig_pte)) | 
| 1494 | goto unlock; | 1525 | goto unlock; | 
| 1495 | } | 1526 | } | 
| 1496 | 1527 | dirty_page = old_page; | |
| 1528 | get_page(dirty_page); | ||
| 1497 | reuse = 1; | 1529 | reuse = 1; | 
| 1498 | } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | ||
| 1499 | reuse = can_share_swap_page(old_page); | ||
| 1500 | unlock_page(old_page); | ||
| 1501 | } else { | ||
| 1502 | reuse = 0; | ||
| 1503 | } | 1530 | } | 
| 1504 | 1531 | ||
| 1505 | if (reuse) { | 1532 | if (reuse) { | 
| @@ -1549,9 +1576,16 @@ gotten: | |||
| 1549 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1576 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 
| 1550 | entry = mk_pte(new_page, vma->vm_page_prot); | 1577 | entry = mk_pte(new_page, vma->vm_page_prot); | 
| 1551 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1578 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 
| 1552 | ptep_establish(vma, address, page_table, entry); | ||
| 1553 | update_mmu_cache(vma, address, entry); | ||
| 1554 | lazy_mmu_prot_update(entry); | 1579 | lazy_mmu_prot_update(entry); | 
| 1580 | /* | ||
| 1581 | * Clear the pte entry and flush it first, before updating the | ||
| 1582 | * pte with the new entry. This will avoid a race condition | ||
| 1583 | * seen in the presence of one thread doing SMC and another | ||
| 1584 | * thread doing COW. | ||
| 1585 | */ | ||
| 1586 | ptep_clear_flush(vma, address, page_table); | ||
| 1587 | set_pte_at(mm, address, page_table, entry); | ||
| 1588 | update_mmu_cache(vma, address, entry); | ||
| 1555 | lru_cache_add_active(new_page); | 1589 | lru_cache_add_active(new_page); | 
| 1556 | page_add_new_anon_rmap(new_page, vma, address); | 1590 | page_add_new_anon_rmap(new_page, vma, address); | 
| 1557 | 1591 | ||
| @@ -1565,6 +1599,10 @@ gotten: | |||
| 1565 | page_cache_release(old_page); | 1599 | page_cache_release(old_page); | 
| 1566 | unlock: | 1600 | unlock: | 
| 1567 | pte_unmap_unlock(page_table, ptl); | 1601 | pte_unmap_unlock(page_table, ptl); | 
| 1602 | if (dirty_page) { | ||
| 1603 | set_page_dirty_balance(dirty_page); | ||
| 1604 | put_page(dirty_page); | ||
| 1605 | } | ||
| 1568 | return ret; | 1606 | return ret; | 
| 1569 | oom: | 1607 | oom: | 
| 1570 | if (old_page) | 1608 | if (old_page) | 
| @@ -1784,9 +1822,10 @@ void unmap_mapping_range(struct address_space *mapping, | |||
| 1784 | } | 1822 | } | 
| 1785 | EXPORT_SYMBOL(unmap_mapping_range); | 1823 | EXPORT_SYMBOL(unmap_mapping_range); | 
| 1786 | 1824 | ||
| 1787 | /* | 1825 | /** | 
| 1788 | * Handle all mappings that got truncated by a "truncate()" | 1826 | * vmtruncate - unmap mappings "freed" by truncate() syscall | 
| 1789 | * system call. | 1827 | * @inode: inode of the file used | 
| 1828 | * @offset: file offset to start truncating | ||
| 1790 | * | 1829 | * | 
| 1791 | * NOTE! We have to be ready to update the memory sharing | 1830 | * NOTE! We have to be ready to update the memory sharing | 
| 1792 | * between the file and the memory map for a potential last | 1831 | * between the file and the memory map for a potential last | 
| @@ -1853,13 +1892,18 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
| 1853 | 1892 | ||
| 1854 | return 0; | 1893 | return 0; | 
| 1855 | } | 1894 | } | 
| 1856 | EXPORT_SYMBOL(vmtruncate_range); | 1895 | EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ | 
| 1857 | 1896 | ||
| 1858 | /* | 1897 | /** | 
| 1898 | * swapin_readahead - swap in pages in hope we need them soon | ||
| 1899 | * @entry: swap entry of this memory | ||
| 1900 | * @addr: address to start | ||
| 1901 | * @vma: user vma this address belongs to | ||
| 1902 | * | ||
| 1859 | * Primitive swap readahead code. We simply read an aligned block of | 1903 | * Primitive swap readahead code. We simply read an aligned block of | 
| 1860 | * (1 << page_cluster) entries in the swap area. This method is chosen | 1904 | * (1 << page_cluster) entries in the swap area. This method is chosen | 
| 1861 | * because it doesn't cost us any seek time. We also make sure to queue | 1905 | * because it doesn't cost us any seek time. We also make sure to queue | 
| 1862 | * the 'original' request together with the readahead ones... | 1906 | * the 'original' request together with the readahead ones... | 
| 1863 | * | 1907 | * | 
| 1864 | * This has been extended to use the NUMA policies from the mm triggering | 1908 | * This has been extended to use the NUMA policies from the mm triggering | 
| 1865 | * the readahead. | 1909 | * the readahead. | 
| @@ -1934,6 +1978,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1934 | migration_entry_wait(mm, pmd, address); | 1978 | migration_entry_wait(mm, pmd, address); | 
| 1935 | goto out; | 1979 | goto out; | 
| 1936 | } | 1980 | } | 
| 1981 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | ||
| 1937 | page = lookup_swap_cache(entry); | 1982 | page = lookup_swap_cache(entry); | 
| 1938 | if (!page) { | 1983 | if (!page) { | 
| 1939 | swapin_readahead(entry, address, vma); | 1984 | swapin_readahead(entry, address, vma); | 
| @@ -1946,15 +1991,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1946 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 1991 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 
| 1947 | if (likely(pte_same(*page_table, orig_pte))) | 1992 | if (likely(pte_same(*page_table, orig_pte))) | 
| 1948 | ret = VM_FAULT_OOM; | 1993 | ret = VM_FAULT_OOM; | 
| 1994 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
| 1949 | goto unlock; | 1995 | goto unlock; | 
| 1950 | } | 1996 | } | 
| 1951 | 1997 | ||
| 1952 | /* Had to read the page from swap area: Major fault */ | 1998 | /* Had to read the page from swap area: Major fault */ | 
| 1953 | ret = VM_FAULT_MAJOR; | 1999 | ret = VM_FAULT_MAJOR; | 
| 1954 | inc_page_state(pgmajfault); | 2000 | count_vm_event(PGMAJFAULT); | 
| 1955 | grab_swap_token(); | 2001 | grab_swap_token(); | 
| 1956 | } | 2002 | } | 
| 1957 | 2003 | ||
| 2004 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
| 1958 | mark_page_accessed(page); | 2005 | mark_page_accessed(page); | 
| 1959 | lock_page(page); | 2006 | lock_page(page); | 
| 1960 | 2007 | ||
| @@ -2094,6 +2141,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2094 | unsigned int sequence = 0; | 2141 | unsigned int sequence = 0; | 
| 2095 | int ret = VM_FAULT_MINOR; | 2142 | int ret = VM_FAULT_MINOR; | 
| 2096 | int anon = 0; | 2143 | int anon = 0; | 
| 2144 | struct page *dirty_page = NULL; | ||
| 2097 | 2145 | ||
| 2098 | pte_unmap(page_table); | 2146 | pte_unmap(page_table); | 
| 2099 | BUG_ON(vma->vm_flags & VM_PFNMAP); | 2147 | BUG_ON(vma->vm_flags & VM_PFNMAP); | 
| @@ -2188,6 +2236,10 @@ retry: | |||
| 2188 | } else { | 2236 | } else { | 
| 2189 | inc_mm_counter(mm, file_rss); | 2237 | inc_mm_counter(mm, file_rss); | 
| 2190 | page_add_file_rmap(new_page); | 2238 | page_add_file_rmap(new_page); | 
| 2239 | if (write_access) { | ||
| 2240 | dirty_page = new_page; | ||
| 2241 | get_page(dirty_page); | ||
| 2242 | } | ||
| 2191 | } | 2243 | } | 
| 2192 | } else { | 2244 | } else { | 
| 2193 | /* One of our sibling threads was faster, back out. */ | 2245 | /* One of our sibling threads was faster, back out. */ | 
| @@ -2200,6 +2252,10 @@ retry: | |||
| 2200 | lazy_mmu_prot_update(entry); | 2252 | lazy_mmu_prot_update(entry); | 
| 2201 | unlock: | 2253 | unlock: | 
| 2202 | pte_unmap_unlock(page_table, ptl); | 2254 | pte_unmap_unlock(page_table, ptl); | 
| 2255 | if (dirty_page) { | ||
| 2256 | set_page_dirty_balance(dirty_page); | ||
| 2257 | put_page(dirty_page); | ||
| 2258 | } | ||
| 2203 | return ret; | 2259 | return ret; | 
| 2204 | oom: | 2260 | oom: | 
| 2205 | page_cache_release(new_page); | 2261 | page_cache_release(new_page); | 
| @@ -2207,6 +2263,54 @@ oom: | |||
| 2207 | } | 2263 | } | 
| 2208 | 2264 | ||
| 2209 | /* | 2265 | /* | 
| 2266 | * do_no_pfn() tries to create a new page mapping for a page without | ||
| 2267 | * a struct_page backing it | ||
| 2268 | * | ||
| 2269 | * As this is called only for pages that do not currently exist, we | ||
| 2270 | * do not need to flush old virtual caches or the TLB. | ||
| 2271 | * | ||
| 2272 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 2273 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
| 2274 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 2275 | * | ||
| 2276 | * It is expected that the ->nopfn handler always returns the same pfn | ||
| 2277 | * for a given virtual mapping. | ||
| 2278 | * | ||
| 2279 | * Mark this `noinline' to prevent it from bloating the main pagefault code. | ||
| 2280 | */ | ||
| 2281 | static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2282 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 2283 | int write_access) | ||
| 2284 | { | ||
| 2285 | spinlock_t *ptl; | ||
| 2286 | pte_t entry; | ||
| 2287 | unsigned long pfn; | ||
| 2288 | int ret = VM_FAULT_MINOR; | ||
| 2289 | |||
| 2290 | pte_unmap(page_table); | ||
| 2291 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); | ||
| 2292 | BUG_ON(is_cow_mapping(vma->vm_flags)); | ||
| 2293 | |||
| 2294 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); | ||
| 2295 | if (pfn == NOPFN_OOM) | ||
| 2296 | return VM_FAULT_OOM; | ||
| 2297 | if (pfn == NOPFN_SIGBUS) | ||
| 2298 | return VM_FAULT_SIGBUS; | ||
| 2299 | |||
| 2300 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 2301 | |||
| 2302 | /* Only go through if we didn't race with anybody else... */ | ||
| 2303 | if (pte_none(*page_table)) { | ||
| 2304 | entry = pfn_pte(pfn, vma->vm_page_prot); | ||
| 2305 | if (write_access) | ||
| 2306 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 2307 | set_pte_at(mm, address, page_table, entry); | ||
| 2308 | } | ||
| 2309 | pte_unmap_unlock(page_table, ptl); | ||
| 2310 | return ret; | ||
| 2311 | } | ||
| 2312 | |||
| 2313 | /* | ||
| 2210 | * Fault of a previously existing named mapping. Repopulate the pte | 2314 | * Fault of a previously existing named mapping. Repopulate the pte | 
| 2211 | * from the encoded file_pte if possible. This enables swappable | 2315 | * from the encoded file_pte if possible. This enables swappable | 
| 2212 | * nonlinear vmas. | 2316 | * nonlinear vmas. | 
| @@ -2268,11 +2372,17 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2268 | old_entry = entry = *pte; | 2372 | old_entry = entry = *pte; | 
| 2269 | if (!pte_present(entry)) { | 2373 | if (!pte_present(entry)) { | 
| 2270 | if (pte_none(entry)) { | 2374 | if (pte_none(entry)) { | 
| 2271 | if (!vma->vm_ops || !vma->vm_ops->nopage) | 2375 | if (vma->vm_ops) { | 
| 2272 | return do_anonymous_page(mm, vma, address, | 2376 | if (vma->vm_ops->nopage) | 
| 2273 | pte, pmd, write_access); | 2377 | return do_no_page(mm, vma, address, | 
| 2274 | return do_no_page(mm, vma, address, | 2378 | pte, pmd, | 
| 2275 | pte, pmd, write_access); | 2379 | write_access); | 
| 2380 | if (unlikely(vma->vm_ops->nopfn)) | ||
| 2381 | return do_no_pfn(mm, vma, address, pte, | ||
| 2382 | pmd, write_access); | ||
| 2383 | } | ||
| 2384 | return do_anonymous_page(mm, vma, address, | ||
| 2385 | pte, pmd, write_access); | ||
| 2276 | } | 2386 | } | 
| 2277 | if (pte_file(entry)) | 2387 | if (pte_file(entry)) | 
| 2278 | return do_file_page(mm, vma, address, | 2388 | return do_file_page(mm, vma, address, | 
| @@ -2324,7 +2434,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2324 | 2434 | ||
| 2325 | __set_current_state(TASK_RUNNING); | 2435 | __set_current_state(TASK_RUNNING); | 
| 2326 | 2436 | ||
| 2327 | inc_page_state(pgfault); | 2437 | count_vm_event(PGFAULT); | 
| 2328 | 2438 | ||
| 2329 | if (unlikely(is_vm_hugetlb_page(vma))) | 2439 | if (unlikely(is_vm_hugetlb_page(vma))) | 
| 2330 | return hugetlb_fault(mm, vma, address, write_access); | 2440 | return hugetlb_fault(mm, vma, address, write_access); | 
| @@ -2501,3 +2611,56 @@ int in_gate_area_no_task(unsigned long addr) | |||
| 2501 | } | 2611 | } | 
| 2502 | 2612 | ||
| 2503 | #endif /* __HAVE_ARCH_GATE_AREA */ | 2613 | #endif /* __HAVE_ARCH_GATE_AREA */ | 
| 2614 | |||
| 2615 | /* | ||
| 2616 | * Access another process' address space. | ||
| 2617 | * Source/target buffer must be kernel space, | ||
| 2618 | * Do not walk the page table directly, use get_user_pages | ||
| 2619 | */ | ||
| 2620 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
| 2621 | { | ||
| 2622 | struct mm_struct *mm; | ||
| 2623 | struct vm_area_struct *vma; | ||
| 2624 | struct page *page; | ||
| 2625 | void *old_buf = buf; | ||
| 2626 | |||
| 2627 | mm = get_task_mm(tsk); | ||
| 2628 | if (!mm) | ||
| 2629 | return 0; | ||
| 2630 | |||
| 2631 | down_read(&mm->mmap_sem); | ||
| 2632 | /* ignore errors, just check how much was successfully transferred */ | ||
| 2633 | while (len) { | ||
| 2634 | int bytes, ret, offset; | ||
| 2635 | void *maddr; | ||
| 2636 | |||
| 2637 | ret = get_user_pages(tsk, mm, addr, 1, | ||
| 2638 | write, 1, &page, &vma); | ||
| 2639 | if (ret <= 0) | ||
| 2640 | break; | ||
| 2641 | |||
| 2642 | bytes = len; | ||
| 2643 | offset = addr & (PAGE_SIZE-1); | ||
| 2644 | if (bytes > PAGE_SIZE-offset) | ||
| 2645 | bytes = PAGE_SIZE-offset; | ||
| 2646 | |||
| 2647 | maddr = kmap(page); | ||
| 2648 | if (write) { | ||
| 2649 | copy_to_user_page(vma, page, addr, | ||
| 2650 | maddr + offset, buf, bytes); | ||
| 2651 | set_page_dirty_lock(page); | ||
| 2652 | } else { | ||
| 2653 | copy_from_user_page(vma, page, addr, | ||
| 2654 | buf, maddr + offset, bytes); | ||
| 2655 | } | ||
| 2656 | kunmap(page); | ||
| 2657 | page_cache_release(page); | ||
| 2658 | len -= bytes; | ||
| 2659 | buf += bytes; | ||
| 2660 | addr += bytes; | ||
| 2661 | } | ||
| 2662 | up_read(&mm->mmap_sem); | ||
| 2663 | mmput(mm); | ||
| 2664 | |||
| 2665 | return buf - old_buf; | ||
| 2666 | } | ||
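The chunking arithmetic in the loop above can be exercised in plain userspace; this sketch only mimics the offset/length split and prints the chunks rather than copying anything.

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Walk [addr, addr+len) in page-bounded chunks, as the loop above does. */
static void walk_by_page(unsigned long addr, unsigned long len)
{
        while (len) {
                unsigned long offset = addr & (PAGE_SIZE - 1);
                unsigned long bytes = len;

                if (bytes > PAGE_SIZE - offset)
                        bytes = PAGE_SIZE - offset;

                printf("page %lu: offset %lu, %lu bytes\n",
                       addr / PAGE_SIZE, offset, bytes);

                len -= bytes;
                addr += bytes;
        }
}

int main(void)
{
        walk_by_page(4000, 5000);       /* spans three pages: 96 + 4096 + 808 bytes */
        return 0;
}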
| diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 841a077d5aeb..2053bb165a21 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -4,7 +4,6 @@ | |||
| 4 | * Copyright (C) | 4 | * Copyright (C) | 
| 5 | */ | 5 | */ | 
| 6 | 6 | ||
| 7 | #include <linux/config.h> | ||
| 8 | #include <linux/stddef.h> | 7 | #include <linux/stddef.h> | 
| 9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> | 
| 10 | #include <linux/swap.h> | 9 | #include <linux/swap.h> | 
| @@ -14,6 +13,7 @@ | |||
| 14 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> | 
| 15 | #include <linux/module.h> | 14 | #include <linux/module.h> | 
| 16 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> | 
| 16 | #include <linux/writeback.h> | ||
| 17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> | 
| 18 | #include <linux/sysctl.h> | 18 | #include <linux/sysctl.h> | 
| 19 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> | 
| @@ -21,6 +21,8 @@ | |||
| 21 | #include <linux/memory_hotplug.h> | 21 | #include <linux/memory_hotplug.h> | 
| 22 | #include <linux/highmem.h> | 22 | #include <linux/highmem.h> | 
| 23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> | 
| 24 | #include <linux/ioport.h> | ||
| 25 | #include <linux/cpuset.h> | ||
| 24 | 26 | ||
| 25 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> | 
| 26 | 28 | ||
| @@ -52,6 +54,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | |||
| 52 | int nr_pages = PAGES_PER_SECTION; | 54 | int nr_pages = PAGES_PER_SECTION; | 
| 53 | int ret; | 55 | int ret; | 
| 54 | 56 | ||
| 57 | if (pfn_valid(phys_start_pfn)) | ||
| 58 | return -EEXIST; | ||
| 59 | |||
| 55 | ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); | 60 | ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); | 
| 56 | 61 | ||
| 57 | if (ret < 0) | 62 | if (ret < 0) | 
| @@ -76,15 +81,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
| 76 | { | 81 | { | 
| 77 | unsigned long i; | 82 | unsigned long i; | 
| 78 | int err = 0; | 83 | int err = 0; | 
| 84 | int start_sec, end_sec; | ||
| 85 | /* during mem_map initialization, align the hot-added range to sections */ | ||
| 86 | start_sec = pfn_to_section_nr(phys_start_pfn); | ||
| 87 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | ||
| 79 | 88 | ||
| 80 | for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { | 89 | for (i = start_sec; i <= end_sec; i++) { | 
| 81 | err = __add_section(zone, phys_start_pfn + i); | 90 | err = __add_section(zone, i << PFN_SECTION_SHIFT); | 
| 82 | 91 | ||
| 83 | /* We want to keep adding the rest of the | 92 | /* | 
| 84 | * sections if the first ones already exist | 93 | * EEXIST is finally dealt with by ioresource collision | 
| 94 | * check. see add_memory() => register_memory_resource() | ||
| 95 | * Warning will be printed if there is collision. | ||
| 85 | */ | 96 | */ | 
| 86 | if (err && (err != -EEXIST)) | 97 | if (err && (err != -EEXIST)) | 
| 87 | break; | 98 | break; | 
| 99 | err = 0; | ||
| 88 | } | 100 | } | 
| 89 | 101 | ||
| 90 | return err; | 102 | return err; | 
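A small standalone sketch of the section-alignment arithmetic used above: convert the pfn range to section numbers and walk whole sections. PFN_SECTION_SHIFT here is an arbitrary example value, not the kernel's.

#include <stdio.h>

#define PFN_SECTION_SHIFT 15UL          /* hypothetical: 32768 pages per section */

static unsigned long pfn_to_section(unsigned long pfn)
{
        return pfn >> PFN_SECTION_SHIFT;
}

/* Visit every section touched by [start_pfn, start_pfn + nr_pages). */
static void walk_sections(unsigned long start_pfn, unsigned long nr_pages)
{
        unsigned long start_sec = pfn_to_section(start_pfn);
        unsigned long end_sec = pfn_to_section(start_pfn + nr_pages - 1);
        unsigned long i;

        for (i = start_sec; i <= end_sec; i++)
                printf("add section %lu (first pfn %lu)\n",
                       i, i << PFN_SECTION_SHIFT);
}

int main(void)
{
        walk_sections(1000, 70000);     /* crosses two section boundaries */
        return 0;
}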
| @@ -126,6 +138,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 126 | unsigned long i; | 138 | unsigned long i; | 
| 127 | unsigned long flags; | 139 | unsigned long flags; | 
| 128 | unsigned long onlined_pages = 0; | 140 | unsigned long onlined_pages = 0; | 
| 141 | struct resource res; | ||
| 142 | u64 section_end; | ||
| 143 | unsigned long start_pfn; | ||
| 129 | struct zone *zone; | 144 | struct zone *zone; | 
| 130 | int need_zonelists_rebuild = 0; | 145 | int need_zonelists_rebuild = 0; | 
| 131 | 146 | ||
| @@ -148,10 +163,27 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 148 | if (!populated_zone(zone)) | 163 | if (!populated_zone(zone)) | 
| 149 | need_zonelists_rebuild = 1; | 164 | need_zonelists_rebuild = 1; | 
| 150 | 165 | ||
| 151 | for (i = 0; i < nr_pages; i++) { | 166 | res.start = (u64)pfn << PAGE_SHIFT; | 
| 152 | struct page *page = pfn_to_page(pfn + i); | 167 | res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; | 
| 153 | online_page(page); | 168 | res.flags = IORESOURCE_MEM; /* we just need system ram */ | 
| 154 | onlined_pages++; | 169 | section_end = res.end; | 
| 170 | |||
| 171 | while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { | ||
| 172 | start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); | ||
| 173 | nr_pages = (unsigned long) | ||
| 174 | ((res.end + 1 - res.start) >> PAGE_SHIFT); | ||
| 175 | |||
| 176 | if (PageReserved(pfn_to_page(start_pfn))) { | ||
| 177 | /* this region's page is not onlined now */ | ||
| 178 | for (i = 0; i < nr_pages; i++) { | ||
| 179 | struct page *page = pfn_to_page(start_pfn + i); | ||
| 180 | online_page(page); | ||
| 181 | onlined_pages++; | ||
| 182 | } | ||
| 183 | } | ||
| 184 | |||
| 185 | res.start = res.end + 1; | ||
| 186 | res.end = section_end; | ||
| 155 | } | 187 | } | 
| 156 | zone->present_pages += onlined_pages; | 188 | zone->present_pages += onlined_pages; | 
| 157 | zone->zone_pgdat->node_present_pages += onlined_pages; | 189 | zone->zone_pgdat->node_present_pages += onlined_pages; | 
| @@ -161,5 +193,119 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 161 | if (need_zonelists_rebuild) | 193 | if (need_zonelists_rebuild) | 
| 162 | build_all_zonelists(); | 194 | build_all_zonelists(); | 
| 163 | vm_total_pages = nr_free_pagecache_pages(); | 195 | vm_total_pages = nr_free_pagecache_pages(); | 
| 196 | writeback_set_ratelimit(); | ||
| 164 | return 0; | 197 | return 0; | 
| 165 | } | 198 | } | 
| 199 | |||
| 200 | static pg_data_t *hotadd_new_pgdat(int nid, u64 start) | ||
| 201 | { | ||
| 202 | struct pglist_data *pgdat; | ||
| 203 | unsigned long zones_size[MAX_NR_ZONES] = {0}; | ||
| 204 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | ||
| 205 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
| 206 | |||
| 207 | pgdat = arch_alloc_nodedata(nid); | ||
| 208 | if (!pgdat) | ||
| 209 | return NULL; | ||
| 210 | |||
| 211 | arch_refresh_nodedata(nid, pgdat); | ||
| 212 | |||
| 213 | /* we can use NODE_DATA(nid) from here */ | ||
| 214 | |||
| 215 | /* init node's zones as empty zones, we don't have any present pages.*/ | ||
| 216 | free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); | ||
| 217 | |||
| 218 | return pgdat; | ||
| 219 | } | ||
| 220 | |||
| 221 | static void rollback_node_hotadd(int nid, pg_data_t *pgdat) | ||
| 222 | { | ||
| 223 | arch_refresh_nodedata(nid, NULL); | ||
| 224 | arch_free_nodedata(pgdat); | ||
| 225 | return; | ||
| 226 | } | ||
| 227 | |||
| 228 | /* add this memory to iomem resource */ | ||
| 229 | static struct resource *register_memory_resource(u64 start, u64 size) | ||
| 230 | { | ||
| 231 | struct resource *res; | ||
| 232 | res = kzalloc(sizeof(struct resource), GFP_KERNEL); | ||
| 233 | BUG_ON(!res); | ||
| 234 | |||
| 235 | res->name = "System RAM"; | ||
| 236 | res->start = start; | ||
| 237 | res->end = start + size - 1; | ||
| 238 | res->flags = IORESOURCE_MEM; | ||
| 239 | if (request_resource(&iomem_resource, res) < 0) { | ||
| 240 | printk("System RAM resource %llx - %llx cannot be added\n", | ||
| 241 | (unsigned long long)res->start, (unsigned long long)res->end); | ||
| 242 | kfree(res); | ||
| 243 | res = NULL; | ||
| 244 | } | ||
| 245 | return res; | ||
| 246 | } | ||
| 247 | |||
| 248 | static void release_memory_resource(struct resource *res) | ||
| 249 | { | ||
| 250 | if (!res) | ||
| 251 | return; | ||
| 252 | release_resource(res); | ||
| 253 | kfree(res); | ||
| 254 | return; | ||
| 255 | } | ||
| 256 | |||
| 257 | |||
| 258 | |||
| 259 | int add_memory(int nid, u64 start, u64 size) | ||
| 260 | { | ||
| 261 | pg_data_t *pgdat = NULL; | ||
| 262 | int new_pgdat = 0; | ||
| 263 | struct resource *res; | ||
| 264 | int ret; | ||
| 265 | |||
| 266 | res = register_memory_resource(start, size); | ||
| 267 | if (!res) | ||
| 268 | return -EEXIST; | ||
| 269 | |||
| 270 | if (!node_online(nid)) { | ||
| 271 | pgdat = hotadd_new_pgdat(nid, start); | ||
| 272 | if (!pgdat) | ||
| 273 | return -ENOMEM; | ||
| 274 | new_pgdat = 1; | ||
| 275 | ret = kswapd_run(nid); | ||
| 276 | if (ret) | ||
| 277 | goto error; | ||
| 278 | } | ||
| 279 | |||
| 280 | /* call arch's memory hotadd */ | ||
| 281 | ret = arch_add_memory(nid, start, size); | ||
| 282 | |||
| 283 | if (ret < 0) | ||
| 284 | goto error; | ||
| 285 | |||
| 286 | /* we online node here. we can't roll back from here. */ | ||
| 287 | node_set_online(nid); | ||
| 288 | |||
| 289 | cpuset_track_online_nodes(); | ||
| 290 | |||
| 291 | if (new_pgdat) { | ||
| 292 | ret = register_one_node(nid); | ||
| 293 | /* | ||
| 294 | * If the sysfs file for the new node can't be created, cpus on the node | ||
| 295 | * can't be hot-added. There is no way to roll back now. | ||
| 296 | * So, catch it with BUG_ON(), reluctantly. | ||
| 297 | */ | ||
| 298 | BUG_ON(ret); | ||
| 299 | } | ||
| 300 | |||
| 301 | return ret; | ||
| 302 | error: | ||
| 303 | /* rollback pgdat allocation and others */ | ||
| 304 | if (new_pgdat) | ||
| 305 | rollback_node_hotadd(nid, pgdat); | ||
| 306 | if (res) | ||
| 307 | release_memory_resource(res); | ||
| 308 | |||
| 309 | return ret; | ||
| 310 | } | ||
| 311 | EXPORT_SYMBOL_GPL(add_memory); | ||
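The error path of add_memory() follows the usual acquire-then-rollback idiom: a later failure unwinds the earlier steps in reverse order via a goto label. A minimal, purely illustrative sketch with hypothetical steps:

#include <stdio.h>

static int step_a(void) { return 0; }           /* hypothetical: claim a resource */
static int step_b(void) { return -1; }          /* hypothetical: this step fails */
static void undo_a(void) { printf("undo a\n"); }

/* Acquire in order, roll back in reverse on failure. */
static int bring_up(void)
{
        int ret;

        ret = step_a();
        if (ret)
                return ret;

        ret = step_b();
        if (ret)
                goto error;

        return 0;

error:
        undo_a();
        return ret;
}

int main(void)
{
        printf("bring_up: %d\n", bring_up());
        return 0;
}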
| diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ec4a1a950df9..cf18f0942553 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -105,7 +105,7 @@ static struct kmem_cache *sn_cache; | |||
| 105 | 105 | ||
| 106 | /* Highest zone. A specific allocation for a zone below that is not | 106 | /* Highest zone. A specific allocation for a zone below that is not | 
| 107 | policied. */ | 107 | policied. */ | 
| 108 | int policy_zone = ZONE_DMA; | 108 | enum zone_type policy_zone = ZONE_DMA; | 
| 109 | 109 | ||
| 110 | struct mempolicy default_policy = { | 110 | struct mempolicy default_policy = { | 
| 111 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 111 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 
| @@ -137,7 +137,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) | |||
| 137 | static struct zonelist *bind_zonelist(nodemask_t *nodes) | 137 | static struct zonelist *bind_zonelist(nodemask_t *nodes) | 
| 138 | { | 138 | { | 
| 139 | struct zonelist *zl; | 139 | struct zonelist *zl; | 
| 140 | int num, max, nd, k; | 140 | int num, max, nd; | 
| 141 | enum zone_type k; | ||
| 141 | 142 | ||
| 142 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); | 143 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); | 
| 143 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); | 144 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); | 
| @@ -148,12 +149,16 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) | |||
| 148 | lower zones etc. Avoid empty zones because the memory allocator | 149 | lower zones etc. Avoid empty zones because the memory allocator | 
| 149 | doesn't like them. If you implement node hot removal you | 150 | doesn't like them. If you implement node hot removal you | 
| 150 | have to fix that. */ | 151 | have to fix that. */ | 
| 151 | for (k = policy_zone; k >= 0; k--) { | 152 | k = policy_zone; | 
| 153 | while (1) { | ||
| 152 | for_each_node_mask(nd, *nodes) { | 154 | for_each_node_mask(nd, *nodes) { | 
| 153 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | 155 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | 
| 154 | if (z->present_pages > 0) | 156 | if (z->present_pages > 0) | 
| 155 | zl->zones[num++] = z; | 157 | zl->zones[num++] = z; | 
| 156 | } | 158 | } | 
| 159 | if (k == 0) | ||
| 160 | break; | ||
| 161 | k--; | ||
| 157 | } | 162 | } | 
| 158 | zl->zones[num] = NULL; | 163 | zl->zones[num] = NULL; | 
| 159 | return zl; | 164 | return zl; | 
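The rewrite from a descending for-loop to an explicit break matters because k is now an unsigned enum type: `k >= 0` is always true, and `k--` at zero wraps around. A minimal illustration of the safe countdown pattern:

#include <stdio.h>

/* Count k down from 'top' to 0 inclusive without underflowing an unsigned type. */
static void count_down(unsigned int top)
{
        unsigned int k = top;

        /* for (k = top; k >= 0; k--) would loop forever: k >= 0 is always true */
        while (1) {
                printf("zone index %u\n", k);
                if (k == 0)
                        break;
                k--;
        }
}

int main(void)
{
        count_down(2);          /* prints 2, 1, 0 and stops */
        return 0;
}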
| @@ -482,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) | |||
| 482 | switch (p->policy) { | 487 | switch (p->policy) { | 
| 483 | case MPOL_BIND: | 488 | case MPOL_BIND: | 
| 484 | for (i = 0; p->v.zonelist->zones[i]; i++) | 489 | for (i = 0; p->v.zonelist->zones[i]; i++) | 
| 485 | node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, | 490 | node_set(zone_to_nid(p->v.zonelist->zones[i]), | 
| 486 | *nodes); | 491 | *nodes); | 
| 487 | break; | 492 | break; | 
| 488 | case MPOL_DEFAULT: | 493 | case MPOL_DEFAULT: | 
| @@ -632,6 +637,10 @@ int do_migrate_pages(struct mm_struct *mm, | |||
| 632 | 637 | ||
| 633 | down_read(&mm->mmap_sem); | 638 | down_read(&mm->mmap_sem); | 
| 634 | 639 | ||
| 640 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); | ||
| 641 | if (err) | ||
| 642 | goto out; | ||
| 643 | |||
| 635 | /* | 644 | /* | 
| 636 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 645 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 
| 637 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 646 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 
| @@ -691,7 +700,7 @@ int do_migrate_pages(struct mm_struct *mm, | |||
| 691 | if (err < 0) | 700 | if (err < 0) | 
| 692 | break; | 701 | break; | 
| 693 | } | 702 | } | 
| 694 | 703 | out: | |
| 695 | up_read(&mm->mmap_sem); | 704 | up_read(&mm->mmap_sem); | 
| 696 | if (err < 0) | 705 | if (err < 0) | 
| 697 | return err; | 706 | return err; | 
| @@ -1127,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
| 1127 | */ | 1136 | */ | 
| 1128 | unsigned slab_node(struct mempolicy *policy) | 1137 | unsigned slab_node(struct mempolicy *policy) | 
| 1129 | { | 1138 | { | 
| 1130 | switch (policy->policy) { | 1139 | int pol = policy ? policy->policy : MPOL_DEFAULT; | 
| 1140 | |||
| 1141 | switch (pol) { | ||
| 1131 | case MPOL_INTERLEAVE: | 1142 | case MPOL_INTERLEAVE: | 
| 1132 | return interleave_nodes(policy); | 1143 | return interleave_nodes(policy); | 
| 1133 | 1144 | ||
| @@ -1136,7 +1147,7 @@ unsigned slab_node(struct mempolicy *policy) | |||
| 1136 | * Follow bind policy behavior and start allocation at the | 1147 | * Follow bind policy behavior and start allocation at the | 
| 1137 | * first node. | 1148 | * first node. | 
| 1138 | */ | 1149 | */ | 
| 1139 | return policy->v.zonelist->zones[0]->zone_pgdat->node_id; | 1150 | return zone_to_nid(policy->v.zonelist->zones[0]); | 
| 1140 | 1151 | ||
| 1141 | case MPOL_PREFERRED: | 1152 | case MPOL_PREFERRED: | 
| 1142 | if (policy->v.preferred_node >= 0) | 1153 | if (policy->v.preferred_node >= 0) | 
| @@ -1172,7 +1183,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
| 1172 | if (vma) { | 1183 | if (vma) { | 
| 1173 | unsigned long off; | 1184 | unsigned long off; | 
| 1174 | 1185 | ||
| 1175 | off = vma->vm_pgoff; | 1186 | /* | 
| 1187 | * for small pages, there is no difference between | ||
| 1188 | * shift and PAGE_SHIFT, so the bit-shift is safe. | ||
| 1189 | * for huge pages, since vm_pgoff is in units of small | ||
| 1190 | * pages, we need to shift off the always 0 bits to get | ||
| 1191 | * a useful offset. | ||
| 1192 | */ | ||
| 1193 | BUG_ON(shift < PAGE_SHIFT); | ||
| 1194 | off = vma->vm_pgoff >> (shift - PAGE_SHIFT); | ||
| 1176 | off += (addr - vma->vm_start) >> shift; | 1195 | off += (addr - vma->vm_start) >> shift; | 
| 1177 | return offset_il_node(pol, vma, off); | 1196 | return offset_il_node(pol, vma, off); | 
| 1178 | } else | 1197 | } else | 
| @@ -1205,10 +1224,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
| 1205 | 1224 | ||
| 1206 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); | 1225 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); | 
| 1207 | page = __alloc_pages(gfp, order, zl); | 1226 | page = __alloc_pages(gfp, order, zl); | 
| 1208 | if (page && page_zone(page) == zl->zones[0]) { | 1227 | if (page && page_zone(page) == zl->zones[0]) | 
| 1209 | zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; | 1228 | inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); | 
| 1210 | put_cpu(); | ||
| 1211 | } | ||
| 1212 | return page; | 1229 | return page; | 
| 1213 | } | 1230 | } | 
| 1214 | 1231 | ||
| @@ -1275,7 +1292,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
| 1275 | 1292 | ||
| 1276 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | 1293 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | 
| 1277 | cpuset_update_task_memory_state(); | 1294 | cpuset_update_task_memory_state(); | 
| 1278 | if (!pol || in_interrupt()) | 1295 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 
| 1279 | pol = &default_policy; | 1296 | pol = &default_policy; | 
| 1280 | if (pol->policy == MPOL_INTERLEAVE) | 1297 | if (pol->policy == MPOL_INTERLEAVE) | 
| 1281 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 1298 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 
| @@ -1634,7 +1651,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) | |||
| 1634 | 1651 | ||
| 1635 | nodes_clear(nodes); | 1652 | nodes_clear(nodes); | 
| 1636 | for (z = pol->v.zonelist->zones; *z; z++) | 1653 | for (z = pol->v.zonelist->zones; *z; z++) | 
| 1637 | node_set((*z)->zone_pgdat->node_id, nodes); | 1654 | node_set(zone_to_nid(*z), nodes); | 
| 1638 | nodes_remap(tmp, nodes, *mpolmask, *newmask); | 1655 | nodes_remap(tmp, nodes, *mpolmask, *newmask); | 
| 1639 | nodes = tmp; | 1656 | nodes = tmp; | 
| 1640 | 1657 | ||
| @@ -1817,7 +1834,7 @@ static inline void check_huge_range(struct vm_area_struct *vma, | |||
| 1817 | 1834 | ||
| 1818 | int show_numa_map(struct seq_file *m, void *v) | 1835 | int show_numa_map(struct seq_file *m, void *v) | 
| 1819 | { | 1836 | { | 
| 1820 | struct task_struct *task = m->private; | 1837 | struct proc_maps_private *priv = m->private; | 
| 1821 | struct vm_area_struct *vma = v; | 1838 | struct vm_area_struct *vma = v; | 
| 1822 | struct numa_maps *md; | 1839 | struct numa_maps *md; | 
| 1823 | struct file *file = vma->vm_file; | 1840 | struct file *file = vma->vm_file; | 
| @@ -1833,7 +1850,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
| 1833 | return 0; | 1850 | return 0; | 
| 1834 | 1851 | ||
| 1835 | mpol_to_str(buffer, sizeof(buffer), | 1852 | mpol_to_str(buffer, sizeof(buffer), | 
| 1836 | get_vma_policy(task, vma, vma->vm_start)); | 1853 | get_vma_policy(priv->task, vma, vma->vm_start)); | 
| 1837 | 1854 | ||
| 1838 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 1855 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 
| 1839 | 1856 | ||
| @@ -1887,7 +1904,7 @@ out: | |||
| 1887 | kfree(md); | 1904 | kfree(md); | 
| 1888 | 1905 | ||
| 1889 | if (m->count < m->size) | 1906 | if (m->count < m->size) | 
| 1890 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | 1907 | m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; | 
| 1891 | return 0; | 1908 | return 0; | 
| 1892 | } | 1909 | } | 
| 1893 | 1910 | ||
| diff --git a/mm/mempool.c b/mm/mempool.c index fe6e05289cc5..ccd8cb8cd41f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
| @@ -238,8 +238,13 @@ repeat_alloc: | |||
| 238 | init_wait(&wait); | 238 | init_wait(&wait); | 
| 239 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); | 239 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); | 
| 240 | smp_mb(); | 240 | smp_mb(); | 
| 241 | if (!pool->curr_nr) | 241 | if (!pool->curr_nr) { | 
| 242 | io_schedule(); | 242 | /* | 
| 243 | * FIXME: this should be io_schedule(). The timeout is there | ||
| 244 | * as a workaround for some DM problems in 2.6.18. | ||
| 245 | */ | ||
| 246 | io_schedule_timeout(5*HZ); | ||
| 247 | } | ||
| 243 | finish_wait(&pool->wait, &wait); | 248 | finish_wait(&pool->wait, &wait); | 
| 244 | 249 | ||
| 245 | goto repeat_alloc; | 250 | goto repeat_alloc; | 
| diff --git a/mm/migrate.c b/mm/migrate.c index 1c2a71aa05cd..20a8c2687b1e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -616,15 +616,13 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 616 | /* | 616 | /* | 
| 617 | * Establish migration ptes or remove ptes | 617 | * Establish migration ptes or remove ptes | 
| 618 | */ | 618 | */ | 
| 619 | if (try_to_unmap(page, 1) != SWAP_FAIL) { | 619 | try_to_unmap(page, 1); | 
| 620 | if (!page_mapped(page)) | 620 | if (!page_mapped(page)) | 
| 621 | rc = move_to_new_page(newpage, page); | 621 | rc = move_to_new_page(newpage, page); | 
| 622 | } else | ||
| 623 | /* A vma has VM_LOCKED set -> permanent failure */ | ||
| 624 | rc = -EPERM; | ||
| 625 | 622 | ||
| 626 | if (rc) | 623 | if (rc) | 
| 627 | remove_migration_ptes(page, page); | 624 | remove_migration_ptes(page, page); | 
| 625 | |||
| 628 | unlock: | 626 | unlock: | 
| 629 | unlock_page(page); | 627 | unlock_page(page); | 
| 630 | 628 | ||
| @@ -743,7 +741,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
| 743 | 741 | ||
| 744 | *result = &pm->status; | 742 | *result = &pm->status; | 
| 745 | 743 | ||
| 746 | return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); | 744 | return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0); | 
| 747 | } | 745 | } | 
| 748 | 746 | ||
| 749 | /* | 747 | /* | 
| @@ -976,3 +974,23 @@ out2: | |||
| 976 | } | 974 | } | 
| 977 | #endif | 975 | #endif | 
| 978 | 976 | ||
| 977 | /* | ||
| 978 | * Call migration functions in the vma_ops that may prepare | ||
| 979 | * memory in a vm for migration. migration functions may perform | ||
| 980 | * the migration for vmas that do not have an underlying page struct. | ||
| 981 | */ | ||
| 982 | int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | ||
| 983 | const nodemask_t *from, unsigned long flags) | ||
| 984 | { | ||
| 985 | struct vm_area_struct *vma; | ||
| 986 | int err = 0; | ||
| 987 | |||
| 988 | for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { | ||
| 989 | if (vma->vm_ops && vma->vm_ops->migrate) { | ||
| 990 | err = vma->vm_ops->migrate(vma, to, from, flags); | ||
| 991 | if (err) | ||
| 992 | break; | ||
| 993 | } | ||
| 994 | } | ||
| 995 | return err; | ||
| 996 | } | ||
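A simplified userspace analog of the walk migrate_vmas() performs: iterate a linked list and call an optional per-element hook, stopping at the first error. The types and hook here are stand-ins, and the loop terminates on the plain list end rather than reproducing the kernel's exact condition.

#include <stddef.h>
#include <stdio.h>

struct area {
        struct area *next;
        int (*migrate)(struct area *a);         /* optional per-area hook */
};

/* Walk the list and invoke the hook where present; stop at the first error. */
static int migrate_all(struct area *head)
{
        struct area *a;
        int err = 0;

        for (a = head; a && !err; a = a->next)
                if (a->migrate)
                        err = a->migrate(a);
        return err;
}

static int ok_hook(struct area *a) { (void)a; return 0; }

int main(void)
{
        struct area second = { NULL, NULL };
        struct area first = { &second, ok_hook };

        printf("migrate_all: %d\n", migrate_all(&first));
        return 0;
}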
| @@ -30,6 +30,10 @@ | |||
| 30 | #include <asm/cacheflush.h> | 30 | #include <asm/cacheflush.h> | 
| 31 | #include <asm/tlb.h> | 31 | #include <asm/tlb.h> | 
| 32 | 32 | ||
| 33 | #ifndef arch_mmap_check | ||
| 34 | #define arch_mmap_check(addr, len, flags) (0) | ||
| 35 | #endif | ||
| 36 | |||
| 33 | static void unmap_region(struct mm_struct *mm, | 37 | static void unmap_region(struct mm_struct *mm, | 
| 34 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 38 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 
| 35 | unsigned long start, unsigned long end); | 39 | unsigned long start, unsigned long end); | 
| @@ -60,6 +64,13 @@ pgprot_t protection_map[16] = { | |||
| 60 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 | 64 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 | 
| 61 | }; | 65 | }; | 
| 62 | 66 | ||
| 67 | pgprot_t vm_get_page_prot(unsigned long vm_flags) | ||
| 68 | { | ||
| 69 | return protection_map[vm_flags & | ||
| 70 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
| 71 | } | ||
| 72 | EXPORT_SYMBOL(vm_get_page_prot); | ||
| 73 | |||
| 63 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 74 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 
| 64 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 75 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 
| 65 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 76 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 
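vm_get_page_prot() above is a table lookup keyed by the low protection bits of vm_flags. This standalone sketch shows the same mask-and-index pattern with made-up flag values and a made-up table; it is not the kernel's protection_map.

#include <stdio.h>

#define F_READ   0x1
#define F_WRITE  0x2
#define F_EXEC   0x4
#define F_SHARED 0x8

/* Hypothetical 16-entry table indexed by the four flag bits above. */
static const char *prot_map[16] = {
        [0]                         = "---p",
        [F_READ]                    = "r--p",
        [F_READ | F_WRITE]          = "rw-p",
        [F_READ | F_WRITE | F_EXEC] = "rwxp",
        [F_READ | F_SHARED]         = "r--s",
        /* remaining combinations left NULL for brevity */
};

static const char *get_prot(unsigned long flags)
{
        /* mask off everything except the bits the table is indexed by */
        return prot_map[flags & (F_READ | F_WRITE | F_EXEC | F_SHARED)];
}

int main(void)
{
        printf("%s\n", get_prot(F_READ | F_WRITE | 0x100));  /* extra bits masked off: rw-p */
        return 0;
}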
| @@ -96,7 +107,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
| 96 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 107 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 
| 97 | unsigned long n; | 108 | unsigned long n; | 
| 98 | 109 | ||
| 99 | free = get_page_cache_size(); | 110 | free = global_page_state(NR_FILE_PAGES); | 
| 100 | free += nr_swap_pages; | 111 | free += nr_swap_pages; | 
| 101 | 112 | ||
| 102 | /* | 113 | /* | 
| @@ -105,7 +116,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
| 105 | * which are reclaimable, under pressure. The dentry | 116 | * which are reclaimable, under pressure. The dentry | 
| 106 | * cache and most inode caches should fall into this | 117 | * cache and most inode caches should fall into this | 
| 107 | */ | 118 | */ | 
| 108 | free += atomic_read(&slab_reclaim_pages); | 119 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 
| 109 | 120 | ||
| 110 | /* | 121 | /* | 
| 111 | * Leave the last 3% for root | 122 | * Leave the last 3% for root | 
| @@ -913,6 +924,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
| 913 | if (!len) | 924 | if (!len) | 
| 914 | return -EINVAL; | 925 | return -EINVAL; | 
| 915 | 926 | ||
| 927 | error = arch_mmap_check(addr, len, flags); | ||
| 928 | if (error) | ||
| 929 | return error; | ||
| 930 | |||
| 916 | /* Careful about overflows.. */ | 931 | /* Careful about overflows.. */ | 
| 917 | len = PAGE_ALIGN(len); | 932 | len = PAGE_ALIGN(len); | 
| 918 | if (!len || len > TASK_SIZE) | 933 | if (!len || len > TASK_SIZE) | 
| @@ -1090,12 +1105,6 @@ munmap_back: | |||
| 1090 | goto free_vma; | 1105 | goto free_vma; | 
| 1091 | } | 1106 | } | 
| 1092 | 1107 | ||
| 1093 | /* Don't make the VMA automatically writable if it's shared, but the | ||
| 1094 | * backer wishes to know when pages are first written to */ | ||
| 1095 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
| 1096 | vma->vm_page_prot = | ||
| 1097 | protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; | ||
| 1098 | |||
| 1099 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform | 1108 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform | 
| 1100 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) | 1109 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) | 
| 1101 | * that memory reservation must be checked; but that reservation | 1110 | * that memory reservation must be checked; but that reservation | 
| @@ -1113,6 +1122,10 @@ munmap_back: | |||
| 1113 | pgoff = vma->vm_pgoff; | 1122 | pgoff = vma->vm_pgoff; | 
| 1114 | vm_flags = vma->vm_flags; | 1123 | vm_flags = vma->vm_flags; | 
| 1115 | 1124 | ||
| 1125 | if (vma_wants_writenotify(vma)) | ||
| 1126 | vma->vm_page_prot = | ||
| 1127 | protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; | ||
| 1128 | |||
| 1116 | if (!file || !vma_merge(mm, prev, addr, vma->vm_end, | 1129 | if (!file || !vma_merge(mm, prev, addr, vma->vm_end, | 
| 1117 | vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { | 1130 | vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { | 
| 1118 | file = vma->vm_file; | 1131 | file = vma->vm_file; | 
| @@ -1859,6 +1872,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 1859 | unsigned long flags; | 1872 | unsigned long flags; | 
| 1860 | struct rb_node ** rb_link, * rb_parent; | 1873 | struct rb_node ** rb_link, * rb_parent; | 
| 1861 | pgoff_t pgoff = addr >> PAGE_SHIFT; | 1874 | pgoff_t pgoff = addr >> PAGE_SHIFT; | 
| 1875 | int error; | ||
| 1862 | 1876 | ||
| 1863 | len = PAGE_ALIGN(len); | 1877 | len = PAGE_ALIGN(len); | 
| 1864 | if (!len) | 1878 | if (!len) | 
| @@ -1867,6 +1881,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 1867 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) | 1881 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) | 
| 1868 | return -EINVAL; | 1882 | return -EINVAL; | 
| 1869 | 1883 | ||
| 1884 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | ||
| 1885 | |||
| 1886 | error = arch_mmap_check(addr, len, flags); | ||
| 1887 | if (error) | ||
| 1888 | return error; | ||
| 1889 | |||
| 1870 | /* | 1890 | /* | 
| 1871 | * mlock MCL_FUTURE? | 1891 | * mlock MCL_FUTURE? | 
| 1872 | */ | 1892 | */ | 
| @@ -1907,8 +1927,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 1907 | if (security_vm_enough_memory(len >> PAGE_SHIFT)) | 1927 | if (security_vm_enough_memory(len >> PAGE_SHIFT)) | 
| 1908 | return -ENOMEM; | 1928 | return -ENOMEM; | 
| 1909 | 1929 | ||
| 1910 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | ||
| 1911 | |||
| 1912 | /* Can we just expand an old private anonymous mapping? */ | 1930 | /* Can we just expand an old private anonymous mapping? */ | 
| 1913 | if (vma_merge(mm, prev, addr, addr + len, flags, | 1931 | if (vma_merge(mm, prev, addr, addr + len, flags, | 
| 1914 | NULL, NULL, pgoff, NULL)) | 1932 | NULL, NULL, pgoff, NULL)) | 
| diff --git a/mm/mmzone.c b/mm/mmzone.c index b022370e612e..febea1c98168 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
| @@ -5,7 +5,6 @@ | |||
| 5 | */ | 5 | */ | 
| 6 | 6 | ||
| 7 | 7 | ||
| 8 | #include <linux/config.h> | ||
| 9 | #include <linux/stddef.h> | 8 | #include <linux/stddef.h> | 
| 10 | #include <linux/mmzone.h> | 9 | #include <linux/mmzone.h> | 
| 11 | #include <linux/module.h> | 10 | #include <linux/module.h> | 
| @@ -15,7 +14,7 @@ struct pglist_data *first_online_pgdat(void) | |||
| 15 | return NODE_DATA(first_online_node); | 14 | return NODE_DATA(first_online_node); | 
| 16 | } | 15 | } | 
| 17 | 16 | ||
| 18 | EXPORT_SYMBOL(first_online_pgdat); | 17 | EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ | 
| 19 | 18 | ||
| 20 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | 19 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | 
| 21 | { | 20 | { | 
| @@ -25,7 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | |||
| 25 | return NULL; | 24 | return NULL; | 
| 26 | return NODE_DATA(nid); | 25 | return NODE_DATA(nid); | 
| 27 | } | 26 | } | 
| 28 | EXPORT_SYMBOL(next_online_pgdat); | 27 | EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ | 
| 29 | 28 | ||
| 30 | 29 | ||
| 31 | /* | 30 | /* | 
| @@ -46,5 +45,5 @@ struct zone *next_zone(struct zone *zone) | |||
| 46 | } | 45 | } | 
| 47 | return zone; | 46 | return zone; | 
| 48 | } | 47 | } | 
| 49 | EXPORT_SYMBOL(next_zone); | 48 | EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */ | 
| 50 | 49 | ||
| diff --git a/mm/mprotect.c b/mm/mprotect.c index 638edabaff71..955f9d0e38aa 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -27,7 +27,8 @@ | |||
| 27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> | 
| 28 | 28 | ||
| 29 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 29 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 
| 30 | unsigned long addr, unsigned long end, pgprot_t newprot) | 30 | unsigned long addr, unsigned long end, pgprot_t newprot, | 
| 31 | int dirty_accountable) | ||
| 31 | { | 32 | { | 
| 32 | pte_t *pte, oldpte; | 33 | pte_t *pte, oldpte; | 
| 33 | spinlock_t *ptl; | 34 | spinlock_t *ptl; | 
| @@ -42,7 +43,14 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 42 | * bits by wiping the pte and then setting the new pte | 43 | * bits by wiping the pte and then setting the new pte | 
| 43 | * into place. | 44 | * into place. | 
| 44 | */ | 45 | */ | 
| 45 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); | 46 | ptent = ptep_get_and_clear(mm, addr, pte); | 
| 47 | ptent = pte_modify(ptent, newprot); | ||
| 48 | /* | ||
| 49 | * Avoid taking write faults for pages we know to be | ||
| 50 | * dirty. | ||
| 51 | */ | ||
| 52 | if (dirty_accountable && pte_dirty(ptent)) | ||
| 53 | ptent = pte_mkwrite(ptent); | ||
| 46 | set_pte_at(mm, addr, pte, ptent); | 54 | set_pte_at(mm, addr, pte, ptent); | 
| 47 | lazy_mmu_prot_update(ptent); | 55 | lazy_mmu_prot_update(ptent); | 
| 48 | #ifdef CONFIG_MIGRATION | 56 | #ifdef CONFIG_MIGRATION | 
| @@ -66,7 +74,8 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 66 | } | 74 | } | 
| 67 | 75 | ||
| 68 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | 76 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | 
| 69 | unsigned long addr, unsigned long end, pgprot_t newprot) | 77 | unsigned long addr, unsigned long end, pgprot_t newprot, | 
| 78 | int dirty_accountable) | ||
| 70 | { | 79 | { | 
| 71 | pmd_t *pmd; | 80 | pmd_t *pmd; | 
| 72 | unsigned long next; | 81 | unsigned long next; | 
| @@ -76,12 +85,13 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
| 76 | next = pmd_addr_end(addr, end); | 85 | next = pmd_addr_end(addr, end); | 
| 77 | if (pmd_none_or_clear_bad(pmd)) | 86 | if (pmd_none_or_clear_bad(pmd)) | 
| 78 | continue; | 87 | continue; | 
| 79 | change_pte_range(mm, pmd, addr, next, newprot); | 88 | change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); | 
| 80 | } while (pmd++, addr = next, addr != end); | 89 | } while (pmd++, addr = next, addr != end); | 
| 81 | } | 90 | } | 
| 82 | 91 | ||
| 83 | static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | 92 | static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | 
| 84 | unsigned long addr, unsigned long end, pgprot_t newprot) | 93 | unsigned long addr, unsigned long end, pgprot_t newprot, | 
| 94 | int dirty_accountable) | ||
| 85 | { | 95 | { | 
| 86 | pud_t *pud; | 96 | pud_t *pud; | 
| 87 | unsigned long next; | 97 | unsigned long next; | 
| @@ -91,12 +101,13 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
| 91 | next = pud_addr_end(addr, end); | 101 | next = pud_addr_end(addr, end); | 
| 92 | if (pud_none_or_clear_bad(pud)) | 102 | if (pud_none_or_clear_bad(pud)) | 
| 93 | continue; | 103 | continue; | 
| 94 | change_pmd_range(mm, pud, addr, next, newprot); | 104 | change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); | 
| 95 | } while (pud++, addr = next, addr != end); | 105 | } while (pud++, addr = next, addr != end); | 
| 96 | } | 106 | } | 
| 97 | 107 | ||
| 98 | static void change_protection(struct vm_area_struct *vma, | 108 | static void change_protection(struct vm_area_struct *vma, | 
| 99 | unsigned long addr, unsigned long end, pgprot_t newprot) | 109 | unsigned long addr, unsigned long end, pgprot_t newprot, | 
| 110 | int dirty_accountable) | ||
| 100 | { | 111 | { | 
| 101 | struct mm_struct *mm = vma->vm_mm; | 112 | struct mm_struct *mm = vma->vm_mm; | 
| 102 | pgd_t *pgd; | 113 | pgd_t *pgd; | 
| @@ -110,7 +121,7 @@ static void change_protection(struct vm_area_struct *vma, | |||
| 110 | next = pgd_addr_end(addr, end); | 121 | next = pgd_addr_end(addr, end); | 
| 111 | if (pgd_none_or_clear_bad(pgd)) | 122 | if (pgd_none_or_clear_bad(pgd)) | 
| 112 | continue; | 123 | continue; | 
| 113 | change_pud_range(mm, pgd, addr, next, newprot); | 124 | change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); | 
| 114 | } while (pgd++, addr = next, addr != end); | 125 | } while (pgd++, addr = next, addr != end); | 
| 115 | flush_tlb_range(vma, start, end); | 126 | flush_tlb_range(vma, start, end); | 
| 116 | } | 127 | } | 
| @@ -123,10 +134,9 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
| 123 | unsigned long oldflags = vma->vm_flags; | 134 | unsigned long oldflags = vma->vm_flags; | 
| 124 | long nrpages = (end - start) >> PAGE_SHIFT; | 135 | long nrpages = (end - start) >> PAGE_SHIFT; | 
| 125 | unsigned long charged = 0; | 136 | unsigned long charged = 0; | 
| 126 | unsigned int mask; | ||
| 127 | pgprot_t newprot; | ||
| 128 | pgoff_t pgoff; | 137 | pgoff_t pgoff; | 
| 129 | int error; | 138 | int error; | 
| 139 | int dirty_accountable = 0; | ||
| 130 | 140 | ||
| 131 | if (newflags == oldflags) { | 141 | if (newflags == oldflags) { | 
| 132 | *pprev = vma; | 142 | *pprev = vma; | 
| @@ -176,24 +186,23 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
| 176 | } | 186 | } | 
| 177 | 187 | ||
| 178 | success: | 188 | success: | 
| 179 | /* Don't make the VMA automatically writable if it's shared, but the | ||
| 180 | * backer wishes to know when pages are first written to */ | ||
| 181 | mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED; | ||
| 182 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
| 183 | mask &= ~VM_SHARED; | ||
| 184 | |||
| 185 | newprot = protection_map[newflags & mask]; | ||
| 186 | |||
| 187 | /* | 189 | /* | 
| 188 | * vm_flags and vm_page_prot are protected by the mmap_sem | 190 | * vm_flags and vm_page_prot are protected by the mmap_sem | 
| 189 | * held in write mode. | 191 | * held in write mode. | 
| 190 | */ | 192 | */ | 
| 191 | vma->vm_flags = newflags; | 193 | vma->vm_flags = newflags; | 
| 192 | vma->vm_page_prot = newprot; | 194 | vma->vm_page_prot = protection_map[newflags & | 
| 195 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
| 196 | if (vma_wants_writenotify(vma)) { | ||
| 197 | vma->vm_page_prot = protection_map[newflags & | ||
| 198 | (VM_READ|VM_WRITE|VM_EXEC)]; | ||
| 199 | dirty_accountable = 1; | ||
| 200 | } | ||
| 201 | |||
| 193 | if (is_vm_hugetlb_page(vma)) | 202 | if (is_vm_hugetlb_page(vma)) | 
| 194 | hugetlb_change_protection(vma, start, end, newprot); | 203 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | 
| 195 | else | 204 | else | 
| 196 | change_protection(vma, start, end, newprot); | 205 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | 
| 197 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 206 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 
| 198 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 207 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 
| 199 | return 0; | 208 | return 0; | 
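The mprotect_fixup() change above write-protects shared mappings that want write notification (vma_wants_writenotify()) and lets change_pte_range() re-enable write access only for PTEs already known dirty, so each clean page still takes one fault on its first write. A rough userspace analogue of that first-write notification, offered purely as an illustration of the idea and not of the kernel mechanism: map pages read-only, catch the first write in a SIGSEGV handler, record the page as dirty and re-enable PROT_WRITE. This is a Linux-specific demo; calling mprotect() from a signal handler is tolerated here but is not formally async-signal-safe.

#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static size_t pagesz;
static volatile sig_atomic_t dirty_pages;

/* the first write to a read-only page lands here: count it, allow writes */
static void on_segv(int sig, siginfo_t *si, void *ctx)
{
        char *page = (char *)((unsigned long)si->si_addr &
                              ~(unsigned long)(pagesz - 1));
        (void)sig; (void)ctx;
        dirty_pages++;
        mprotect(page, pagesz, PROT_READ | PROT_WRITE);
}

int main(void)
{
        struct sigaction sa = { 0 };
        char *region;

        pagesz = (size_t)sysconf(_SC_PAGESIZE);
        sa.sa_flags = SA_SIGINFO;
        sa.sa_sigaction = on_segv;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGSEGV, &sa, NULL);

        region = mmap(NULL, 4 * pagesz, PROT_READ,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (region == MAP_FAILED)
                return 1;

        region[0] = 1;              /* faults once, then the store retries */
        region[2 * pagesz] = 1;     /* faults on a second page */
        region[1] = 2;              /* same page as the first write: no fault */

        printf("pages dirtied: %d\n", (int)dirty_pages);    /* 2 */
        return 0;
}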
| diff --git a/mm/mremap.c b/mm/mremap.c index 1903bdf65e42..7c15cf3373ad 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -97,7 +97,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
| 97 | new_pte = pte_offset_map_nested(new_pmd, new_addr); | 97 | new_pte = pte_offset_map_nested(new_pmd, new_addr); | 
| 98 | new_ptl = pte_lockptr(mm, new_pmd); | 98 | new_ptl = pte_lockptr(mm, new_pmd); | 
| 99 | if (new_ptl != old_ptl) | 99 | if (new_ptl != old_ptl) | 
| 100 | spin_lock(new_ptl); | 100 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); | 
| 101 | 101 | ||
| 102 | for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, | 102 | for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, | 
| 103 | new_pte++, new_addr += PAGE_SIZE) { | 103 | new_pte++, new_addr += PAGE_SIZE) { | 
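move_ptes() may need both the source and destination page-table locks, and the two can turn out to be the same spinlock; the change takes the second one with spin_lock_nested(..., SINGLE_DEPTH_NESTING) so lockdep accepts the legitimate double acquisition. The same shape in a plain pthread sketch, ignoring the lock-ordering questions lockdep also covers, just to show the take-the-second-lock-only-if-distinct pattern:

#include <pthread.h>
#include <stdio.h>

/* move an item between two buckets that may share a lock */
struct bucket {
        pthread_mutex_t lock;
        int count;
};

static void move_one(struct bucket *from, struct bucket *to)
{
        pthread_mutex_lock(&from->lock);
        if (&to->lock != &from->lock)       /* take the second lock only if distinct */
                pthread_mutex_lock(&to->lock);

        from->count--;
        to->count++;

        if (&to->lock != &from->lock)
                pthread_mutex_unlock(&to->lock);
        pthread_mutex_unlock(&from->lock);
}

int main(void)
{
        struct bucket a = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct bucket b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        move_one(&a, &b);
        move_one(&b, &b);                   /* same bucket: single-lock path */
        printf("a=%d b=%d\n", a.count, b.count);   /* a=0 b=1 */
        return 0;
}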
| diff --git a/mm/msync.c b/mm/msync.c index d083544df21b..358d73cf7b78 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
| @@ -7,149 +7,33 @@ | |||
| 7 | /* | 7 | /* | 
| 8 | * The msync() system call. | 8 | * The msync() system call. | 
| 9 | */ | 9 | */ | 
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/pagemap.h> | ||
| 12 | #include <linux/fs.h> | 10 | #include <linux/fs.h> | 
| 13 | #include <linux/mm.h> | 11 | #include <linux/mm.h> | 
| 14 | #include <linux/mman.h> | 12 | #include <linux/mman.h> | 
| 15 | #include <linux/hugetlb.h> | ||
| 16 | #include <linux/writeback.h> | ||
| 17 | #include <linux/file.h> | 13 | #include <linux/file.h> | 
| 18 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> | 
| 19 | 15 | ||
| 20 | #include <asm/pgtable.h> | ||
| 21 | #include <asm/tlbflush.h> | ||
| 22 | |||
| 23 | static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | ||
| 24 | unsigned long addr, unsigned long end) | ||
| 25 | { | ||
| 26 | pte_t *pte; | ||
| 27 | spinlock_t *ptl; | ||
| 28 | int progress = 0; | ||
| 29 | unsigned long ret = 0; | ||
| 30 | |||
| 31 | again: | ||
| 32 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
| 33 | do { | ||
| 34 | struct page *page; | ||
| 35 | |||
| 36 | if (progress >= 64) { | ||
| 37 | progress = 0; | ||
| 38 | if (need_resched() || need_lockbreak(ptl)) | ||
| 39 | break; | ||
| 40 | } | ||
| 41 | progress++; | ||
| 42 | if (!pte_present(*pte)) | ||
| 43 | continue; | ||
| 44 | if (!pte_maybe_dirty(*pte)) | ||
| 45 | continue; | ||
| 46 | page = vm_normal_page(vma, addr, *pte); | ||
| 47 | if (!page) | ||
| 48 | continue; | ||
| 49 | if (ptep_clear_flush_dirty(vma, addr, pte) || | ||
| 50 | page_test_and_clear_dirty(page)) | ||
| 51 | ret += set_page_dirty(page); | ||
| 52 | progress += 3; | ||
| 53 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 54 | pte_unmap_unlock(pte - 1, ptl); | ||
| 55 | cond_resched(); | ||
| 56 | if (addr != end) | ||
| 57 | goto again; | ||
| 58 | return ret; | ||
| 59 | } | ||
| 60 | |||
| 61 | static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, | ||
| 62 | pud_t *pud, unsigned long addr, unsigned long end) | ||
| 63 | { | ||
| 64 | pmd_t *pmd; | ||
| 65 | unsigned long next; | ||
| 66 | unsigned long ret = 0; | ||
| 67 | |||
| 68 | pmd = pmd_offset(pud, addr); | ||
| 69 | do { | ||
| 70 | next = pmd_addr_end(addr, end); | ||
| 71 | if (pmd_none_or_clear_bad(pmd)) | ||
| 72 | continue; | ||
| 73 | ret += msync_pte_range(vma, pmd, addr, next); | ||
| 74 | } while (pmd++, addr = next, addr != end); | ||
| 75 | return ret; | ||
| 76 | } | ||
| 77 | |||
| 78 | static inline unsigned long msync_pud_range(struct vm_area_struct *vma, | ||
| 79 | pgd_t *pgd, unsigned long addr, unsigned long end) | ||
| 80 | { | ||
| 81 | pud_t *pud; | ||
| 82 | unsigned long next; | ||
| 83 | unsigned long ret = 0; | ||
| 84 | |||
| 85 | pud = pud_offset(pgd, addr); | ||
| 86 | do { | ||
| 87 | next = pud_addr_end(addr, end); | ||
| 88 | if (pud_none_or_clear_bad(pud)) | ||
| 89 | continue; | ||
| 90 | ret += msync_pmd_range(vma, pud, addr, next); | ||
| 91 | } while (pud++, addr = next, addr != end); | ||
| 92 | return ret; | ||
| 93 | } | ||
| 94 | |||
| 95 | static unsigned long msync_page_range(struct vm_area_struct *vma, | ||
| 96 | unsigned long addr, unsigned long end) | ||
| 97 | { | ||
| 98 | pgd_t *pgd; | ||
| 99 | unsigned long next; | ||
| 100 | unsigned long ret = 0; | ||
| 101 | |||
| 102 | /* For hugepages we can't go walking the page table normally, | ||
| 103 | * but that's ok, hugetlbfs is memory based, so we don't need | ||
| 104 | * to do anything more on an msync(). | ||
| 105 | */ | ||
| 106 | if (vma->vm_flags & VM_HUGETLB) | ||
| 107 | return 0; | ||
| 108 | |||
| 109 | BUG_ON(addr >= end); | ||
| 110 | pgd = pgd_offset(vma->vm_mm, addr); | ||
| 111 | flush_cache_range(vma, addr, end); | ||
| 112 | do { | ||
| 113 | next = pgd_addr_end(addr, end); | ||
| 114 | if (pgd_none_or_clear_bad(pgd)) | ||
| 115 | continue; | ||
| 116 | ret += msync_pud_range(vma, pgd, addr, next); | ||
| 117 | } while (pgd++, addr = next, addr != end); | ||
| 118 | return ret; | ||
| 119 | } | ||
| 120 | |||
| 121 | /* | 16 | /* | 
| 122 | * MS_SYNC syncs the entire file - including mappings. | 17 | * MS_SYNC syncs the entire file - including mappings. | 
| 123 | * | 18 | * | 
| 124 | * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just | 19 | * MS_ASYNC does not start I/O (it used to, up to 2.5.67). | 
| 125 | marks the relevant pages dirty. The application may now run fsync() to | 20 | * Nor does it mark the relevant pages dirty (it used to, up to 2.6.17). | 
| 21 | * Now it doesn't do anything, since dirty pages are properly tracked. | ||
| 22 | * | ||
| 23 | * The application may now run fsync() to | ||
| 126 | * write out the dirty pages and wait on the writeout and check the result. | 24 | * write out the dirty pages and wait on the writeout and check the result. | 
| 127 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start | 25 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start | 
| 128 | * async writeout immediately. | 26 | * async writeout immediately. | 
| 129 | * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to | 27 | * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to | 
| 130 | * applications. | 28 | * applications. | 
| 131 | */ | 29 | */ | 
| 132 | static int msync_interval(struct vm_area_struct *vma, unsigned long addr, | ||
| 133 | unsigned long end, int flags, | ||
| 134 | unsigned long *nr_pages_dirtied) | ||
| 135 | { | ||
| 136 | struct file *file = vma->vm_file; | ||
| 137 | |||
| 138 | if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) | ||
| 139 | return -EBUSY; | ||
| 140 | |||
| 141 | if (file && (vma->vm_flags & VM_SHARED)) | ||
| 142 | *nr_pages_dirtied = msync_page_range(vma, addr, end); | ||
| 143 | return 0; | ||
| 144 | } | ||
| 145 | |||
| 146 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | 30 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | 
| 147 | { | 31 | { | 
| 148 | unsigned long end; | 32 | unsigned long end; | 
| 33 | struct mm_struct *mm = current->mm; | ||
| 149 | struct vm_area_struct *vma; | 34 | struct vm_area_struct *vma; | 
| 150 | int unmapped_error = 0; | 35 | int unmapped_error = 0; | 
| 151 | int error = -EINVAL; | 36 | int error = -EINVAL; | 
| 152 | int done = 0; | ||
| 153 | 37 | ||
| 154 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) | 38 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) | 
| 155 | goto out; | 39 | goto out; | 
| @@ -169,64 +53,50 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
| 169 | * If the interval [start,end) covers some unmapped address ranges, | 53 | * If the interval [start,end) covers some unmapped address ranges, | 
| 170 | * just ignore them, but return -ENOMEM at the end. | 54 | * just ignore them, but return -ENOMEM at the end. | 
| 171 | */ | 55 | */ | 
| 172 | down_read(¤t->mm->mmap_sem); | 56 | down_read(&mm->mmap_sem); | 
| 173 | vma = find_vma(current->mm, start); | 57 | vma = find_vma(mm, start); | 
| 174 | if (!vma) { | 58 | for (;;) { | 
| 175 | error = -ENOMEM; | ||
| 176 | goto out_unlock; | ||
| 177 | } | ||
| 178 | do { | ||
| 179 | unsigned long nr_pages_dirtied = 0; | ||
| 180 | struct file *file; | 59 | struct file *file; | 
| 181 | 60 | ||
| 61 | /* Still start < end. */ | ||
| 62 | error = -ENOMEM; | ||
| 63 | if (!vma) | ||
| 64 | goto out_unlock; | ||
| 182 | /* Here start < vma->vm_end. */ | 65 | /* Here start < vma->vm_end. */ | 
| 183 | if (start < vma->vm_start) { | 66 | if (start < vma->vm_start) { | 
| 184 | unmapped_error = -ENOMEM; | ||
| 185 | start = vma->vm_start; | 67 | start = vma->vm_start; | 
| 68 | if (start >= end) | ||
| 69 | goto out_unlock; | ||
| 70 | unmapped_error = -ENOMEM; | ||
| 186 | } | 71 | } | 
| 187 | /* Here vma->vm_start <= start < vma->vm_end. */ | 72 | /* Here vma->vm_start <= start < vma->vm_end. */ | 
| 188 | if (end <= vma->vm_end) { | 73 | if ((flags & MS_INVALIDATE) && | 
| 189 | if (start < end) { | 74 | (vma->vm_flags & VM_LOCKED)) { | 
| 190 | error = msync_interval(vma, start, end, flags, | 75 | error = -EBUSY; | 
| 191 | &nr_pages_dirtied); | 76 | goto out_unlock; | 
| 192 | if (error) | ||
| 193 | goto out_unlock; | ||
| 194 | } | ||
| 195 | error = unmapped_error; | ||
| 196 | done = 1; | ||
| 197 | } else { | ||
| 198 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | ||
| 199 | error = msync_interval(vma, start, vma->vm_end, flags, | ||
| 200 | &nr_pages_dirtied); | ||
| 201 | if (error) | ||
| 202 | goto out_unlock; | ||
| 203 | } | 77 | } | 
| 204 | file = vma->vm_file; | 78 | file = vma->vm_file; | 
| 205 | start = vma->vm_end; | 79 | start = vma->vm_end; | 
| 206 | if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { | 80 | if ((flags & MS_SYNC) && file && | 
| 207 | get_file(file); | ||
| 208 | up_read(¤t->mm->mmap_sem); | ||
| 209 | balance_dirty_pages_ratelimited_nr(file->f_mapping, | ||
| 210 | nr_pages_dirtied); | ||
| 211 | fput(file); | ||
| 212 | down_read(¤t->mm->mmap_sem); | ||
| 213 | vma = find_vma(current->mm, start); | ||
| 214 | } else if ((flags & MS_SYNC) && file && | ||
| 215 | (vma->vm_flags & VM_SHARED)) { | 81 | (vma->vm_flags & VM_SHARED)) { | 
| 216 | get_file(file); | 82 | get_file(file); | 
| 217 | up_read(¤t->mm->mmap_sem); | 83 | up_read(&mm->mmap_sem); | 
| 218 | error = do_fsync(file, 0); | 84 | error = do_fsync(file, 0); | 
| 219 | fput(file); | 85 | fput(file); | 
| 220 | down_read(¤t->mm->mmap_sem); | 86 | if (error || start >= end) | 
| 221 | if (error) | 87 | goto out; | 
| 222 | goto out_unlock; | 88 | down_read(&mm->mmap_sem); | 
| 223 | vma = find_vma(current->mm, start); | 89 | vma = find_vma(mm, start); | 
| 224 | } else { | 90 | } else { | 
| 91 | if (start >= end) { | ||
| 92 | error = 0; | ||
| 93 | goto out_unlock; | ||
| 94 | } | ||
| 225 | vma = vma->vm_next; | 95 | vma = vma->vm_next; | 
| 226 | } | 96 | } | 
| 227 | } while (vma && !done); | 97 | } | 
| 228 | out_unlock: | 98 | out_unlock: | 
| 229 | up_read(¤t->mm->mmap_sem); | 99 | up_read(&mm->mmap_sem); | 
| 230 | out: | 100 | out: | 
| 231 | return error; | 101 | return error ? : unmapped_error; | 
| 232 | } | 102 | } | 
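With dirty pages tracked by the writeback accounting, the rewritten sys_msync() matches the comment block above: MS_ASYNC neither starts I/O nor marks anything dirty, and MS_SYNC walks the covered VMAs calling do_fsync() on shared file mappings. From userspace the documented usage pattern looks like this (the file name is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4096;
        int fd = open("example.dat", O_RDWR | O_CREAT, 0644);
        char *map;

        if (fd < 0 || ftruncate(fd, (off_t)len) < 0)
                return 1;

        map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        memcpy(map, "hello", 5);

        /* MS_ASYNC: no I/O is started; the dirty pages are already tracked */
        msync(map, len, MS_ASYNC);

        /* the application may then write out and wait explicitly ... */
        fsync(fd);

        /* ... or request synchronous writeout of the mapping directly */
        msync(map, len, MS_SYNC);

        munmap(map, len);
        close(fd);
        return 0;
}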
| diff --git a/mm/nommu.c b/mm/nommu.c index 029fadac0fb5..564540662192 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp) | |||
| 122 | } | 122 | } | 
| 123 | 123 | ||
| 124 | /* | 124 | /* | 
| 125 | * The nommu dodgy version :-) | 125 | * get a list of pages in an address range belonging to the specified process | 
| 126 | * and indicate the VMA that covers each page | ||
| 127 | * - this is potentially dodgy as we may end up incrementing the page count of a | ||
| 128 | * slab page or a secondary page from a compound page | ||
| 129 | * - don't permit access to VMAs that don't support it, such as I/O mappings | ||
| 126 | */ | 130 | */ | 
| 127 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 131 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 
| 128 | unsigned long start, int len, int write, int force, | 132 | unsigned long start, int len, int write, int force, | 
| 129 | struct page **pages, struct vm_area_struct **vmas) | 133 | struct page **pages, struct vm_area_struct **vmas) | 
| 130 | { | 134 | { | 
| 135 | struct vm_area_struct *vma; | ||
| 136 | unsigned long vm_flags; | ||
| 131 | int i; | 137 | int i; | 
| 132 | static struct vm_area_struct dummy_vma; | 138 | |
| 139 | /* calculate required read or write permissions. | ||
| 140 | * - if 'force' is set, we only require the "MAY" flags. | ||
| 141 | */ | ||
| 142 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | ||
| 143 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | ||
| 133 | 144 | ||
| 134 | for (i = 0; i < len; i++) { | 145 | for (i = 0; i < len; i++) { | 
| 146 | vma = find_vma(mm, start); | ||
| 147 | if (!vma) | ||
| 148 | goto finish_or_fault; | ||
| 149 | |||
| 150 | /* protect what we can, including chardevs */ | ||
| 151 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || | ||
| 152 | !(vm_flags & vma->vm_flags)) | ||
| 153 | goto finish_or_fault; | ||
| 154 | |||
| 135 | if (pages) { | 155 | if (pages) { | 
| 136 | pages[i] = virt_to_page(start); | 156 | pages[i] = virt_to_page(start); | 
| 137 | if (pages[i]) | 157 | if (pages[i]) | 
| 138 | page_cache_get(pages[i]); | 158 | page_cache_get(pages[i]); | 
| 139 | } | 159 | } | 
| 140 | if (vmas) | 160 | if (vmas) | 
| 141 | vmas[i] = &dummy_vma; | 161 | vmas[i] = vma; | 
| 142 | start += PAGE_SIZE; | 162 | start += PAGE_SIZE; | 
| 143 | } | 163 | } | 
| 144 | return(i); | 164 | |
| 165 | return i; | ||
| 166 | |||
| 167 | finish_or_fault: | ||
| 168 | return i ? : -EFAULT; | ||
| 145 | } | 169 | } | 
| 146 | 170 | ||
| 147 | EXPORT_SYMBOL(get_user_pages); | 171 | EXPORT_SYMBOL(get_user_pages); | 
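The new nommu get_user_pages() folds the read/write/force decision into two mask operations: first require the full access bits, then, if force is set, keep only the MAY bits so a forced access merely needs the mapping to be potentially readable or writable. A tiny mirror of that computation with locally defined flag values (they follow the usual kernel encoding but are only demo constants here):

#include <stdio.h>

#define VM_READ     0x1UL
#define VM_WRITE    0x2UL
#define VM_MAYREAD  0x10UL
#define VM_MAYWRITE 0x20UL

/* mirrors the vm_flags computation in the nommu get_user_pages() */
static unsigned long gup_required_flags(int write, int force)
{
        unsigned long f;

        f = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
        f &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
        return f;
}

int main(void)
{
        /* a forced write only needs VM_MAYWRITE on the VMA */
        printf("%#lx\n", gup_required_flags(1, 1));   /* 0x20 */
        /* a plain read needs VM_READ */
        printf("%#lx\n", gup_required_flags(0, 0));   /* 0x1 */
        return 0;
}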
| @@ -286,6 +310,77 @@ static void show_process_blocks(void) | |||
| 286 | } | 310 | } | 
| 287 | #endif /* DEBUG */ | 311 | #endif /* DEBUG */ | 
| 288 | 312 | ||
| 313 | /* | ||
| 314 | * add a VMA into a process's mm_struct in the appropriate place in the list | ||
| 315 | * - should be called with mm->mmap_sem held writelocked | ||
| 316 | */ | ||
| 317 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) | ||
| 318 | { | ||
| 319 | struct vm_list_struct **ppv; | ||
| 320 | |||
| 321 | for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) | ||
| 322 | if ((*ppv)->vma->vm_start > vml->vma->vm_start) | ||
| 323 | break; | ||
| 324 | |||
| 325 | vml->next = *ppv; | ||
| 326 | *ppv = vml; | ||
| 327 | } | ||
| 328 | |||
| 329 | /* | ||
| 330 | * look up the first VMA in which addr resides, NULL if none | ||
| 331 | * - should be called with mm->mmap_sem at least held readlocked | ||
| 332 | */ | ||
| 333 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
| 334 | { | ||
| 335 | struct vm_list_struct *loop, *vml; | ||
| 336 | |||
| 337 | /* search the vm_start ordered list */ | ||
| 338 | vml = NULL; | ||
| 339 | for (loop = mm->context.vmlist; loop; loop = loop->next) { | ||
| 340 | if (loop->vma->vm_start > addr) | ||
| 341 | break; | ||
| 342 | vml = loop; | ||
| 343 | } | ||
| 344 | |||
| 345 | if (vml && vml->vma->vm_end > addr) | ||
| 346 | return vml->vma; | ||
| 347 | |||
| 348 | return NULL; | ||
| 349 | } | ||
| 350 | EXPORT_SYMBOL(find_vma); | ||
| 351 | |||
| 352 | /* | ||
| 353 | * find a VMA | ||
| 354 | * - we don't extend stack VMAs under NOMMU conditions | ||
| 355 | */ | ||
| 356 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
| 357 | { | ||
| 358 | return find_vma(mm, addr); | ||
| 359 | } | ||
| 360 | |||
| 361 | /* | ||
| 362 | * look up the first VMA that exactly matches addr | ||
| 363 | * - should be called with mm->mmap_sem at least held readlocked | ||
| 364 | */ | ||
| 365 | static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | ||
| 366 | unsigned long addr) | ||
| 367 | { | ||
| 368 | struct vm_list_struct *vml; | ||
| 369 | |||
| 370 | /* search the vm_start ordered list */ | ||
| 371 | for (vml = mm->context.vmlist; vml; vml = vml->next) { | ||
| 372 | if (vml->vma->vm_start == addr) | ||
| 373 | return vml->vma; | ||
| 374 | if (vml->vma->vm_start > addr) | ||
| 375 | break; | ||
| 376 | } | ||
| 377 | |||
| 378 | return NULL; | ||
| 379 | } | ||
| 380 | |||
| 381 | /* | ||
| 382 | * find a VMA in the global tree | ||
| 383 | */ | ||
| 289 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | 384 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | 
| 290 | { | 385 | { | 
| 291 | struct vm_area_struct *vma; | 386 | struct vm_area_struct *vma; | 
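add_vma_to_mm() and find_vma() in the hunk above keep a process's mappings on a singly linked list sorted by vm_start and scan it linearly, which stays cheap for the small VMA counts typical of nommu targets. The same structure in miniature, with integer ranges standing in for VMAs:

#include <stdio.h>

struct range {
        unsigned long start, end;       /* covers [start, end) */
        struct range *next;
};

/* insert keeping the list sorted by start, as add_vma_to_mm() does */
static void insert_range(struct range **head, struct range *r)
{
        struct range **pp;

        for (pp = head; *pp; pp = &(*pp)->next)
                if ((*pp)->start > r->start)
                        break;
        r->next = *pp;
        *pp = r;
}

/* last range starting at or below addr, accepted only if it still covers addr */
static struct range *find_range(struct range *head, unsigned long addr)
{
        struct range *best = NULL, *loop;

        for (loop = head; loop; loop = loop->next) {
                if (loop->start > addr)
                        break;
                best = loop;
        }
        return (best && best->end > addr) ? best : NULL;
}

int main(void)
{
        struct range a = { 0x1000, 0x2000, NULL };
        struct range b = { 0x4000, 0x5000, NULL };
        struct range *head = NULL;

        insert_range(&head, &b);
        insert_range(&head, &a);
        printf("0x1800 -> %s\n", find_range(head, 0x1800) ? "found" : "none");
        printf("0x3000 -> %s\n", find_range(head, 0x3000) ? "found" : "none");
        return 0;
}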
| @@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | |||
| 305 | return NULL; | 400 | return NULL; | 
| 306 | } | 401 | } | 
| 307 | 402 | ||
| 403 | /* | ||
| 404 | * add a VMA in the global tree | ||
| 405 | */ | ||
| 308 | static void add_nommu_vma(struct vm_area_struct *vma) | 406 | static void add_nommu_vma(struct vm_area_struct *vma) | 
| 309 | { | 407 | { | 
| 310 | struct vm_area_struct *pvma; | 408 | struct vm_area_struct *pvma; | 
| @@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma) | |||
| 351 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); | 449 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); | 
| 352 | } | 450 | } | 
| 353 | 451 | ||
| 452 | /* | ||
| 453 | * delete a VMA from the global list | ||
| 454 | */ | ||
| 354 | static void delete_nommu_vma(struct vm_area_struct *vma) | 455 | static void delete_nommu_vma(struct vm_area_struct *vma) | 
| 355 | { | 456 | { | 
| 356 | struct address_space *mapping; | 457 | struct address_space *mapping; | 
| @@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 828 | realalloc += kobjsize(vml); | 929 | realalloc += kobjsize(vml); | 
| 829 | askedalloc += sizeof(*vml); | 930 | askedalloc += sizeof(*vml); | 
| 830 | 931 | ||
| 831 | vml->next = current->mm->context.vmlist; | 932 | add_vma_to_mm(current->mm, vml); | 
| 832 | current->mm->context.vmlist = vml; | ||
| 833 | 933 | ||
| 834 | up_write(&nommu_vma_sem); | 934 | up_write(&nommu_vma_sem); | 
| 835 | 935 | ||
| @@ -908,6 +1008,11 @@ static void put_vma(struct vm_area_struct *vma) | |||
| 908 | } | 1008 | } | 
| 909 | } | 1009 | } | 
| 910 | 1010 | ||
| 1011 | /* | ||
| 1012 | * release a mapping | ||
| 1013 | * - under NOMMU conditions the parameters must exactly match the mapping to | ||
| 1014 | * be removed | ||
| 1015 | */ | ||
| 911 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | 1016 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | 
| 912 | { | 1017 | { | 
| 913 | struct vm_list_struct *vml, **parent; | 1018 | struct vm_list_struct *vml, **parent; | 
| @@ -917,10 +1022,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
| 917 | printk("do_munmap:\n"); | 1022 | printk("do_munmap:\n"); | 
| 918 | #endif | 1023 | #endif | 
| 919 | 1024 | ||
| 920 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) | 1025 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { | 
| 1026 | if ((*parent)->vma->vm_start > addr) | ||
| 1027 | break; | ||
| 921 | if ((*parent)->vma->vm_start == addr && | 1028 | if ((*parent)->vma->vm_start == addr && | 
| 922 | ((len == 0) || ((*parent)->vma->vm_end == end))) | 1029 | ((len == 0) || ((*parent)->vma->vm_end == end))) | 
| 923 | goto found; | 1030 | goto found; | 
| 1031 | } | ||
| 924 | 1032 | ||
| 925 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", | 1033 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", | 
| 926 | current->pid, current->comm, (void *) addr); | 1034 | current->pid, current->comm, (void *) addr); | 
| @@ -946,7 +1054,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
| 946 | return 0; | 1054 | return 0; | 
| 947 | } | 1055 | } | 
| 948 | 1056 | ||
| 949 | /* Release all mmaps. */ | 1057 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | 
| 1058 | { | ||
| 1059 | int ret; | ||
| 1060 | struct mm_struct *mm = current->mm; | ||
| 1061 | |||
| 1062 | down_write(&mm->mmap_sem); | ||
| 1063 | ret = do_munmap(mm, addr, len); | ||
| 1064 | up_write(&mm->mmap_sem); | ||
| 1065 | return ret; | ||
| 1066 | } | ||
| 1067 | |||
| 1068 | /* | ||
| 1069 | * Release all mappings | ||
| 1070 | */ | ||
| 950 | void exit_mmap(struct mm_struct * mm) | 1071 | void exit_mmap(struct mm_struct * mm) | 
| 951 | { | 1072 | { | 
| 952 | struct vm_list_struct *tmp; | 1073 | struct vm_list_struct *tmp; | 
| @@ -973,37 +1094,26 @@ void exit_mmap(struct mm_struct * mm) | |||
| 973 | } | 1094 | } | 
| 974 | } | 1095 | } | 
| 975 | 1096 | ||
| 976 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | ||
| 977 | { | ||
| 978 | int ret; | ||
| 979 | struct mm_struct *mm = current->mm; | ||
| 980 | |||
| 981 | down_write(&mm->mmap_sem); | ||
| 982 | ret = do_munmap(mm, addr, len); | ||
| 983 | up_write(&mm->mmap_sem); | ||
| 984 | return ret; | ||
| 985 | } | ||
| 986 | |||
| 987 | unsigned long do_brk(unsigned long addr, unsigned long len) | 1097 | unsigned long do_brk(unsigned long addr, unsigned long len) | 
| 988 | { | 1098 | { | 
| 989 | return -ENOMEM; | 1099 | return -ENOMEM; | 
| 990 | } | 1100 | } | 
| 991 | 1101 | ||
| 992 | /* | 1102 | /* | 
| 993 | * Expand (or shrink) an existing mapping, potentially moving it at the | 1103 | * expand (or shrink) an existing mapping, potentially moving it at the same | 
| 994 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 1104 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 
| 995 | * | 1105 | * | 
| 996 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise | 1106 | * under NOMMU conditions, we only permit changing a mapping's size, and only | 
| 997 | * This option implies MREMAP_MAYMOVE. | 1107 | * as long as it stays within the hole allocated by the kmalloc() call in | 
| 1108 | * do_mmap_pgoff() and the block is not shareable | ||
| 998 | * | 1109 | * | 
| 999 | * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the | 1110 | * MREMAP_FIXED is not supported under NOMMU conditions | 
| 1000 | * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable | ||
| 1001 | */ | 1111 | */ | 
| 1002 | unsigned long do_mremap(unsigned long addr, | 1112 | unsigned long do_mremap(unsigned long addr, | 
| 1003 | unsigned long old_len, unsigned long new_len, | 1113 | unsigned long old_len, unsigned long new_len, | 
| 1004 | unsigned long flags, unsigned long new_addr) | 1114 | unsigned long flags, unsigned long new_addr) | 
| 1005 | { | 1115 | { | 
| 1006 | struct vm_list_struct *vml = NULL; | 1116 | struct vm_area_struct *vma; | 
| 1007 | 1117 | ||
| 1008 | /* insanity checks first */ | 1118 | /* insanity checks first */ | 
| 1009 | if (new_len == 0) | 1119 | if (new_len == 0) | 
| @@ -1012,64 +1122,53 @@ unsigned long do_mremap(unsigned long addr, | |||
| 1012 | if (flags & MREMAP_FIXED && new_addr != addr) | 1122 | if (flags & MREMAP_FIXED && new_addr != addr) | 
| 1013 | return (unsigned long) -EINVAL; | 1123 | return (unsigned long) -EINVAL; | 
| 1014 | 1124 | ||
| 1015 | for (vml = current->mm->context.vmlist; vml; vml = vml->next) | 1125 | vma = find_vma_exact(current->mm, addr); | 
| 1016 | if (vml->vma->vm_start == addr) | 1126 | if (!vma) | 
| 1017 | goto found; | 1127 | return (unsigned long) -EINVAL; | 
| 1018 | |||
| 1019 | return (unsigned long) -EINVAL; | ||
| 1020 | 1128 | ||
| 1021 | found: | 1129 | if (vma->vm_end != vma->vm_start + old_len) | 
| 1022 | if (vml->vma->vm_end != vml->vma->vm_start + old_len) | ||
| 1023 | return (unsigned long) -EFAULT; | 1130 | return (unsigned long) -EFAULT; | 
| 1024 | 1131 | ||
| 1025 | if (vml->vma->vm_flags & VM_MAYSHARE) | 1132 | if (vma->vm_flags & VM_MAYSHARE) | 
| 1026 | return (unsigned long) -EPERM; | 1133 | return (unsigned long) -EPERM; | 
| 1027 | 1134 | ||
| 1028 | if (new_len > kobjsize((void *) addr)) | 1135 | if (new_len > kobjsize((void *) addr)) | 
| 1029 | return (unsigned long) -ENOMEM; | 1136 | return (unsigned long) -ENOMEM; | 
| 1030 | 1137 | ||
| 1031 | /* all checks complete - do it */ | 1138 | /* all checks complete - do it */ | 
| 1032 | vml->vma->vm_end = vml->vma->vm_start + new_len; | 1139 | vma->vm_end = vma->vm_start + new_len; | 
| 1033 | 1140 | ||
| 1034 | askedalloc -= old_len; | 1141 | askedalloc -= old_len; | 
| 1035 | askedalloc += new_len; | 1142 | askedalloc += new_len; | 
| 1036 | 1143 | ||
| 1037 | return vml->vma->vm_start; | 1144 | return vma->vm_start; | 
| 1038 | } | 1145 | } | 
| 1039 | 1146 | ||
| 1040 | /* | 1147 | asmlinkage unsigned long sys_mremap(unsigned long addr, | 
| 1041 | * Look up the first VMA which satisfies addr < vm_end, NULL if none | 1148 | unsigned long old_len, unsigned long new_len, | 
| 1042 | */ | 1149 | unsigned long flags, unsigned long new_addr) | 
| 1043 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
| 1044 | { | 1150 | { | 
| 1045 | struct vm_list_struct *vml; | 1151 | unsigned long ret; | 
| 1046 | |||
| 1047 | for (vml = mm->context.vmlist; vml; vml = vml->next) | ||
| 1048 | if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end) | ||
| 1049 | return vml->vma; | ||
| 1050 | 1152 | ||
| 1051 | return NULL; | 1153 | down_write(¤t->mm->mmap_sem); | 
| 1154 | ret = do_mremap(addr, old_len, new_len, flags, new_addr); | ||
| 1155 | up_write(¤t->mm->mmap_sem); | ||
| 1156 | return ret; | ||
| 1052 | } | 1157 | } | 
| 1053 | 1158 | ||
| 1054 | EXPORT_SYMBOL(find_vma); | ||
| 1055 | |||
| 1056 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1159 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 
| 1057 | unsigned int foll_flags) | 1160 | unsigned int foll_flags) | 
| 1058 | { | 1161 | { | 
| 1059 | return NULL; | 1162 | return NULL; | 
| 1060 | } | 1163 | } | 
| 1061 | 1164 | ||
| 1062 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
| 1063 | { | ||
| 1064 | return NULL; | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | 1165 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | 
| 1068 | unsigned long to, unsigned long size, pgprot_t prot) | 1166 | unsigned long to, unsigned long size, pgprot_t prot) | 
| 1069 | { | 1167 | { | 
| 1070 | vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; | 1168 | vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; | 
| 1071 | return 0; | 1169 | return 0; | 
| 1072 | } | 1170 | } | 
| 1171 | EXPORT_SYMBOL(remap_pfn_range); | ||
| 1073 | 1172 | ||
| 1074 | void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | 1173 | void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | 
| 1075 | { | 1174 | { | 
| @@ -1090,6 +1189,7 @@ void unmap_mapping_range(struct address_space *mapping, | |||
| 1090 | int even_cows) | 1189 | int even_cows) | 
| 1091 | { | 1190 | { | 
| 1092 | } | 1191 | } | 
| 1192 | EXPORT_SYMBOL(unmap_mapping_range); | ||
| 1093 | 1193 | ||
| 1094 | /* | 1194 | /* | 
| 1095 | * Check that a process has enough memory to allocate a new virtual | 1195 | * Check that a process has enough memory to allocate a new virtual | 
| @@ -1122,7 +1222,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
| 1122 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 1222 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 
| 1123 | unsigned long n; | 1223 | unsigned long n; | 
| 1124 | 1224 | ||
| 1125 | free = get_page_cache_size(); | 1225 | free = global_page_state(NR_FILE_PAGES); | 
| 1126 | free += nr_swap_pages; | 1226 | free += nr_swap_pages; | 
| 1127 | 1227 | ||
| 1128 | /* | 1228 | /* | 
| @@ -1131,7 +1231,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
| 1131 | * which are reclaimable, under pressure. The dentry | 1231 | * which are reclaimable, under pressure. The dentry | 
| 1132 | * cache and most inode caches should fall into this | 1232 | * cache and most inode caches should fall into this | 
| 1133 | */ | 1233 | */ | 
| 1134 | free += atomic_read(&slab_reclaim_pages); | 1234 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 
| 1135 | 1235 | ||
| 1136 | /* | 1236 | /* | 
| 1137 | * Leave the last 3% for root | 1237 | * Leave the last 3% for root | 
| @@ -1204,3 +1304,44 @@ struct page *filemap_nopage(struct vm_area_struct *area, | |||
| 1204 | BUG(); | 1304 | BUG(); | 
| 1205 | return NULL; | 1305 | return NULL; | 
| 1206 | } | 1306 | } | 
| 1307 | |||
| 1308 | /* | ||
| 1309 | * Access another process' address space. | ||
| 1310 | * - source/target buffer must be kernel space | ||
| 1311 | */ | ||
| 1312 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
| 1313 | { | ||
| 1314 | struct vm_area_struct *vma; | ||
| 1315 | struct mm_struct *mm; | ||
| 1316 | |||
| 1317 | if (addr + len < addr) | ||
| 1318 | return 0; | ||
| 1319 | |||
| 1320 | mm = get_task_mm(tsk); | ||
| 1321 | if (!mm) | ||
| 1322 | return 0; | ||
| 1323 | |||
| 1324 | down_read(&mm->mmap_sem); | ||
| 1325 | |||
| 1326 | /* the access must start within one of the target process's mappings */ | ||
| 1327 | vma = find_vma(mm, addr); | ||
| 1328 | if (vma) { | ||
| 1329 | /* don't overrun this mapping */ | ||
| 1330 | if (addr + len >= vma->vm_end) | ||
| 1331 | len = vma->vm_end - addr; | ||
| 1332 | |||
| 1333 | /* only read or write mappings where it is permitted */ | ||
| 1334 | if (write && vma->vm_flags & VM_MAYWRITE) | ||
| 1335 | len -= copy_to_user((void *) addr, buf, len); | ||
| 1336 | else if (!write && vma->vm_flags & VM_MAYREAD) | ||
| 1337 | len -= copy_from_user(buf, (void *) addr, len); | ||
| 1338 | else | ||
| 1339 | len = 0; | ||
| 1340 | } else { | ||
| 1341 | len = 0; | ||
| 1342 | } | ||
| 1343 | |||
| 1344 | up_read(&mm->mmap_sem); | ||
| 1345 | mmput(mm); | ||
| 1346 | return len; | ||
| 1347 | } | ||
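The new access_process_vm() is the nommu counterpart of the helper that ptrace and /proc use to copy data in and out of another process's address space, clamped so an access never runs past the end of the mapping it starts in. As a hedged userspace illustration of the same idea, a process can read its own memory through /proc/self/mem (reading another PID's mem file additionally requires ptrace permission over the target, and 32-bit builds should enable 64-bit file offsets):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char secret[] = "mapped bytes";
        char copy[sizeof(secret)];
        int fd = open("/proc/self/mem", O_RDONLY);

        if (fd < 0)
                return 1;

        /* read our own address space through the proc interface */
        if (pread(fd, copy, sizeof(copy), (off_t)(unsigned long)&secret)
            != (ssize_t)sizeof(copy))
                return 1;

        printf("%s\n", copy);   /* prints "mapped bytes" */
        close(fd);
        return 0;
}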
| diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d46ed0f1dc06..20f41b082e16 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -21,6 +21,8 @@ | |||
| 21 | #include <linux/timex.h> | 21 | #include <linux/timex.h> | 
| 22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> | 
| 23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> | 
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/notifier.h> | ||
| 24 | 26 | ||
| 25 | int sysctl_panic_on_oom; | 27 | int sysctl_panic_on_oom; | 
| 26 | /* #define DEBUG */ | 28 | /* #define DEBUG */ | 
| @@ -58,6 +60,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 58 | } | 60 | } | 
| 59 | 61 | ||
| 60 | /* | 62 | /* | 
| 63 | * swapoff can easily use up all memory, so kill those first. | ||
| 64 | */ | ||
| 65 | if (p->flags & PF_SWAPOFF) | ||
| 66 | return ULONG_MAX; | ||
| 67 | |||
| 68 | /* | ||
| 61 | * The memory size of the process is the basis for the badness. | 69 | * The memory size of the process is the basis for the badness. | 
| 62 | */ | 70 | */ | 
| 63 | points = mm->total_vm; | 71 | points = mm->total_vm; | 
| @@ -127,6 +135,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 127 | points /= 4; | 135 | points /= 4; | 
| 128 | 136 | ||
| 129 | /* | 137 | /* | 
| 138 | * If p's nodes don't overlap ours, it may still help to kill p | ||
| 139 | * because p may have allocated or otherwise mapped memory on | ||
| 140 | * this node before. However it will be less likely. | ||
| 141 | */ | ||
| 142 | if (!cpuset_excl_nodes_overlap(p)) | ||
| 143 | points /= 8; | ||
| 144 | |||
| 145 | /* | ||
| 130 | * Adjust the score by oomkilladj. | 146 | * Adjust the score by oomkilladj. | 
| 131 | */ | 147 | */ | 
| 132 | if (p->oomkilladj) { | 148 | if (p->oomkilladj) { | 
| @@ -161,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 161 | 177 | ||
| 162 | for (z = zonelist->zones; *z; z++) | 178 | for (z = zonelist->zones; *z; z++) | 
| 163 | if (cpuset_zone_allowed(*z, gfp_mask)) | 179 | if (cpuset_zone_allowed(*z, gfp_mask)) | 
| 164 | node_clear((*z)->zone_pgdat->node_id, | 180 | node_clear(zone_to_nid(*z), nodes); | 
| 165 | nodes); | ||
| 166 | else | 181 | else | 
| 167 | return CONSTRAINT_CPUSET; | 182 | return CONSTRAINT_CPUSET; | 
| 168 | 183 | ||
| @@ -189,27 +204,49 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
| 189 | do_posix_clock_monotonic_gettime(&uptime); | 204 | do_posix_clock_monotonic_gettime(&uptime); | 
| 190 | do_each_thread(g, p) { | 205 | do_each_thread(g, p) { | 
| 191 | unsigned long points; | 206 | unsigned long points; | 
| 192 | int releasing; | ||
| 193 | 207 | ||
| 194 | /* skip the init task with pid == 1 */ | 208 | /* | 
| 195 | if (p->pid == 1) | 209 | * skip kernel threads and tasks which have already released | 
| 196 | continue; | 210 | * their mm. | 
| 197 | if (p->oomkilladj == OOM_DISABLE) | 211 | */ | 
| 212 | if (!p->mm) | ||
| 198 | continue; | 213 | continue; | 
| 199 | /* If p's nodes don't overlap ours, it won't help to kill p. */ | 214 | /* skip the init task */ | 
| 200 | if (!cpuset_excl_nodes_overlap(p)) | 215 | if (is_init(p)) | 
| 201 | continue; | 216 | continue; | 
| 202 | 217 | ||
| 203 | /* | 218 | /* | 
| 219 | * This task already has access to memory reserves and is | ||
| 220 | * being killed. Don't allow any other task access to the | ||
| 221 | * memory reserve. | ||
| 222 | * | ||
| 223 | * Note: this may have a chance of deadlock if it gets | ||
| 224 | * blocked waiting for another task which itself is waiting | ||
| 225 | * for memory. Is there a better alternative? | ||
| 226 | */ | ||
| 227 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | ||
| 228 | return ERR_PTR(-1UL); | ||
| 229 | |||
| 230 | /* | ||
| 204 | * This is in the process of releasing memory so wait for it | 231 | * This is in the process of releasing memory so wait for it | 
| 205 | * to finish before killing some other task by mistake. | 232 | * to finish before killing some other task by mistake. | 
| 233 | * | ||
| 234 | * However, if p is the current task, we allow the 'kill' to | ||
| 235 | * go ahead if it is exiting: this will simply set TIF_MEMDIE, | ||
| 236 | * which will allow it to gain access to memory reserves in | ||
| 237 | * the process of exiting and releasing its resources. | ||
| 238 | * Otherwise we could get an easy OOM deadlock. | ||
| 206 | */ | 239 | */ | 
| 207 | releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || | 240 | if (p->flags & PF_EXITING) { | 
| 208 | p->flags & PF_EXITING; | 241 | if (p != current) | 
| 209 | if (releasing && !(p->flags & PF_DEAD)) | 242 | return ERR_PTR(-1UL); | 
| 210 | return ERR_PTR(-1UL); | 243 | |
| 211 | if (p->flags & PF_SWAPOFF) | 244 | chosen = p; | 
| 212 | return p; | 245 | *ppoints = ULONG_MAX; | 
| 246 | } | ||
| 247 | |||
| 248 | if (p->oomkilladj == OOM_DISABLE) | ||
| 249 | continue; | ||
| 213 | 250 | ||
| 214 | points = badness(p, uptime.tv_sec); | 251 | points = badness(p, uptime.tv_sec); | 
| 215 | if (points > *ppoints || !chosen) { | 252 | if (points > *ppoints || !chosen) { | 
| @@ -217,32 +254,33 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
| 217 | *ppoints = points; | 254 | *ppoints = points; | 
| 218 | } | 255 | } | 
| 219 | } while_each_thread(g, p); | 256 | } while_each_thread(g, p); | 
| 257 | |||
| 220 | return chosen; | 258 | return chosen; | 
| 221 | } | 259 | } | 
| 222 | 260 | ||
| 223 | /** | 261 | /** | 
| 224 | * We must be careful though to never send SIGKILL a process with | 262 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO | 
| 225 | * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that | 263 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 
| 226 | * we select a process with CAP_SYS_RAW_IO set). | 264 | * set. | 
| 227 | */ | 265 | */ | 
| 228 | static void __oom_kill_task(task_t *p, const char *message) | 266 | static void __oom_kill_task(struct task_struct *p, const char *message) | 
| 229 | { | 267 | { | 
| 230 | if (p->pid == 1) { | 268 | if (is_init(p)) { | 
| 231 | WARN_ON(1); | 269 | WARN_ON(1); | 
| 232 | printk(KERN_WARNING "tried to kill init!\n"); | 270 | printk(KERN_WARNING "tried to kill init!\n"); | 
| 233 | return; | 271 | return; | 
| 234 | } | 272 | } | 
| 235 | 273 | ||
| 236 | task_lock(p); | 274 | if (!p->mm) { | 
| 237 | if (!p->mm || p->mm == &init_mm) { | ||
| 238 | WARN_ON(1); | 275 | WARN_ON(1); | 
| 239 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | 276 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | 
| 240 | task_unlock(p); | ||
| 241 | return; | 277 | return; | 
| 242 | } | 278 | } | 
| 243 | task_unlock(p); | 279 | |
| 244 | printk(KERN_ERR "%s: Killed process %d (%s).\n", | 280 | if (message) { | 
| 281 | printk(KERN_ERR "%s: Killed process %d (%s).\n", | ||
| 245 | message, p->pid, p->comm); | 282 | message, p->pid, p->comm); | 
| 283 | } | ||
| 246 | 284 | ||
| 247 | /* | 285 | /* | 
| 248 | * We give our sacrificial lamb high priority and access to | 286 | * We give our sacrificial lamb high priority and access to | 
| @@ -255,10 +293,10 @@ static void __oom_kill_task(task_t *p, const char *message) | |||
| 255 | force_sig(SIGKILL, p); | 293 | force_sig(SIGKILL, p); | 
| 256 | } | 294 | } | 
| 257 | 295 | ||
| 258 | static int oom_kill_task(task_t *p, const char *message) | 296 | static int oom_kill_task(struct task_struct *p, const char *message) | 
| 259 | { | 297 | { | 
| 260 | struct mm_struct *mm; | 298 | struct mm_struct *mm; | 
| 261 | task_t * g, * q; | 299 | struct task_struct *g, *q; | 
| 262 | 300 | ||
| 263 | mm = p->mm; | 301 | mm = p->mm; | 
| 264 | 302 | ||
| @@ -271,7 +309,7 @@ static int oom_kill_task(task_t *p, const char *message) | |||
| 271 | * However, this is of no concern to us. | 309 | * However, this is of no concern to us. | 
| 272 | */ | 310 | */ | 
| 273 | 311 | ||
| 274 | if (mm == NULL || mm == &init_mm) | 312 | if (mm == NULL) | 
| 275 | return 1; | 313 | return 1; | 
| 276 | 314 | ||
| 277 | __oom_kill_task(p, message); | 315 | __oom_kill_task(p, message); | 
| @@ -293,8 +331,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
| 293 | struct task_struct *c; | 331 | struct task_struct *c; | 
| 294 | struct list_head *tsk; | 332 | struct list_head *tsk; | 
| 295 | 333 | ||
| 296 | printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " | 334 | /* | 
| 297 | "children.\n", p->pid, p->comm, points); | 335 | * If the task is already exiting, don't alarm the sysadmin or kill | 
| 336 | * its children or threads, just set TIF_MEMDIE so it can die quickly | ||
| 337 | */ | ||
| 338 | if (p->flags & PF_EXITING) { | ||
| 339 | __oom_kill_task(p, NULL); | ||
| 340 | return 0; | ||
| 341 | } | ||
| 342 | |||
| 343 | printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" | ||
| 344 | " and children.\n", p->pid, p->comm, points); | ||
| 298 | /* Try to kill a child first */ | 345 | /* Try to kill a child first */ | 
| 299 | list_for_each(tsk, &p->children) { | 346 | list_for_each(tsk, &p->children) { | 
| 300 | c = list_entry(tsk, struct task_struct, sibling); | 347 | c = list_entry(tsk, struct task_struct, sibling); | 
| @@ -306,6 +353,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
| 306 | return oom_kill_task(p, message); | 353 | return oom_kill_task(p, message); | 
| 307 | } | 354 | } | 
| 308 | 355 | ||
| 356 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | ||
| 357 | |||
| 358 | int register_oom_notifier(struct notifier_block *nb) | ||
| 359 | { | ||
| 360 | return blocking_notifier_chain_register(&oom_notify_list, nb); | ||
| 361 | } | ||
| 362 | EXPORT_SYMBOL_GPL(register_oom_notifier); | ||
| 363 | |||
| 364 | int unregister_oom_notifier(struct notifier_block *nb) | ||
| 365 | { | ||
| 366 | return blocking_notifier_chain_unregister(&oom_notify_list, nb); | ||
| 367 | } | ||
| 368 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); | ||
| 369 | |||
| 309 | /** | 370 | /** | 
| 310 | * out_of_memory - kill the "best" process when we run out of memory | 371 | * out_of_memory - kill the "best" process when we run out of memory | 
| 311 | * | 372 | * | 
| @@ -316,12 +377,19 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
| 316 | */ | 377 | */ | 
| 317 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 378 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 
| 318 | { | 379 | { | 
| 319 | task_t *p; | 380 | struct task_struct *p; | 
| 320 | unsigned long points = 0; | 381 | unsigned long points = 0; | 
| 382 | unsigned long freed = 0; | ||
| 383 | |||
| 384 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | ||
| 385 | if (freed > 0) | ||
| 386 | /* Got some memory back in the last second. */ | ||
| 387 | return; | ||
| 321 | 388 | ||
| 322 | if (printk_ratelimit()) { | 389 | if (printk_ratelimit()) { | 
| 323 | printk("oom-killer: gfp_mask=0x%x, order=%d\n", | 390 | printk(KERN_WARNING "%s invoked oom-killer: " | 
| 324 | gfp_mask, order); | 391 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | 
| 392 | current->comm, gfp_mask, order, current->oomkilladj); | ||
| 325 | dump_stack(); | 393 | dump_stack(); | 
| 326 | show_mem(); | 394 | show_mem(); | 
| 327 | } | 395 | } | 
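The register_oom_notifier()/unregister_oom_notifier() pair added above lets a subsystem hand memory back before out_of_memory() picks a victim: the chain is called first, and if the callbacks report any freed pages the kill is skipped. A minimal sketch of a client module, assuming a hypothetical my_cache_shrink() that returns the number of pages it released and assuming the notifier declarations live in linux/swap.h at this point in the tree:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/swap.h>         /* assumed home of register_oom_notifier() here */

/* hypothetical cache shrinker; returns the number of pages released */
extern unsigned long my_cache_shrink(void);

static int my_oom_notify(struct notifier_block *nb,
                         unsigned long action, void *parm)
{
        unsigned long *freed = parm;

        /* report how much we gave back; out_of_memory() sums the total */
        *freed += my_cache_shrink();
        return NOTIFY_OK;
}

static struct notifier_block my_oom_nb = {
        .notifier_call = my_oom_notify,
};

static int __init my_init(void)
{
        return register_oom_notifier(&my_oom_nb);
}

static void __exit my_exit(void)
{
        unregister_oom_notifier(&my_oom_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");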
| diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8ccf6f1b1473..488b7088557c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/backing-dev.h> | 23 | #include <linux/backing-dev.h> | 
| 24 | #include <linux/blkdev.h> | 24 | #include <linux/blkdev.h> | 
| 25 | #include <linux/mpage.h> | 25 | #include <linux/mpage.h> | 
| 26 | #include <linux/rmap.h> | ||
| 26 | #include <linux/percpu.h> | 27 | #include <linux/percpu.h> | 
| 27 | #include <linux/notifier.h> | 28 | #include <linux/notifier.h> | 
| 28 | #include <linux/smp.h> | 29 | #include <linux/smp.h> | 
| @@ -45,7 +46,6 @@ | |||
| 45 | */ | 46 | */ | 
| 46 | static long ratelimit_pages = 32; | 47 | static long ratelimit_pages = 32; | 
| 47 | 48 | ||
| 48 | static long total_pages; /* The total number of pages in the machine. */ | ||
| 49 | static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ | 49 | static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ | 
| 50 | 50 | ||
| 51 | /* | 51 | /* | 
| @@ -99,22 +99,6 @@ EXPORT_SYMBOL(laptop_mode); | |||
| 99 | 99 | ||
| 100 | static void background_writeout(unsigned long _min_pages); | 100 | static void background_writeout(unsigned long _min_pages); | 
| 101 | 101 | ||
| 102 | struct writeback_state | ||
| 103 | { | ||
| 104 | unsigned long nr_dirty; | ||
| 105 | unsigned long nr_unstable; | ||
| 106 | unsigned long nr_mapped; | ||
| 107 | unsigned long nr_writeback; | ||
| 108 | }; | ||
| 109 | |||
| 110 | static void get_writeback_state(struct writeback_state *wbs) | ||
| 111 | { | ||
| 112 | wbs->nr_dirty = read_page_state(nr_dirty); | ||
| 113 | wbs->nr_unstable = read_page_state(nr_unstable); | ||
| 114 | wbs->nr_mapped = read_page_state(nr_mapped); | ||
| 115 | wbs->nr_writeback = read_page_state(nr_writeback); | ||
| 116 | } | ||
| 117 | |||
| 118 | /* | 102 | /* | 
| 119 | * Work out the current dirty-memory clamping and background writeout | 103 | * Work out the current dirty-memory clamping and background writeout | 
| 120 | * thresholds. | 104 | * thresholds. | 
| @@ -133,19 +117,17 @@ static void get_writeback_state(struct writeback_state *wbs) | |||
| 133 | * clamping level. | 117 | * clamping level. | 
| 134 | */ | 118 | */ | 
| 135 | static void | 119 | static void | 
| 136 | get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | 120 | get_dirty_limits(long *pbackground, long *pdirty, | 
| 137 | struct address_space *mapping) | 121 | struct address_space *mapping) | 
| 138 | { | 122 | { | 
| 139 | int background_ratio; /* Percentages */ | 123 | int background_ratio; /* Percentages */ | 
| 140 | int dirty_ratio; | 124 | int dirty_ratio; | 
| 141 | int unmapped_ratio; | 125 | int unmapped_ratio; | 
| 142 | long background; | 126 | long background; | 
| 143 | long dirty; | 127 | long dirty; | 
| 144 | unsigned long available_memory = total_pages; | 128 | unsigned long available_memory = vm_total_pages; | 
| 145 | struct task_struct *tsk; | 129 | struct task_struct *tsk; | 
| 146 | 130 | ||
| 147 | get_writeback_state(wbs); | ||
| 148 | |||
| 149 | #ifdef CONFIG_HIGHMEM | 131 | #ifdef CONFIG_HIGHMEM | 
| 150 | /* | 132 | /* | 
| 151 | * If this mapping can only allocate from low memory, | 133 | * If this mapping can only allocate from low memory, | 
| @@ -156,7 +138,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | |||
| 156 | #endif | 138 | #endif | 
| 157 | 139 | ||
| 158 | 140 | ||
| 159 | unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; | 141 | unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + | 
| 142 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
| 143 | vm_total_pages; | ||
| 160 | 144 | ||
| 161 | dirty_ratio = vm_dirty_ratio; | 145 | dirty_ratio = vm_dirty_ratio; | 
| 162 | if (dirty_ratio > unmapped_ratio / 2) | 146 | if (dirty_ratio > unmapped_ratio / 2) | 
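get_dirty_limits() now reads its inputs straight from global_page_state() instead of the removed writeback_state snapshot: the unmapped ratio is the share of memory not taken by mapped file or anonymous pages, and vm_dirty_ratio is capped against half of it (the body of the clamp falls outside this hunk, so the assignment below is an assumption). Rough numbers as a sketch:

#include <stdio.h>

int main(void)
{
        unsigned long vm_total_pages = 1000000;     /* example machine size */
        unsigned long nr_file_mapped = 300000;
        unsigned long nr_anon_pages  = 500000;
        int vm_dirty_ratio = 40;
        int unmapped_ratio, dirty_ratio;

        unmapped_ratio = 100 - (int)(((nr_file_mapped + nr_anon_pages) * 100)
                                     / vm_total_pages);
        dirty_ratio = vm_dirty_ratio;
        if (dirty_ratio > unmapped_ratio / 2)       /* clamp body assumed */
                dirty_ratio = unmapped_ratio / 2;

        /* 80% of memory mapped -> only 10% may be dirty in this example */
        printf("unmapped=%d%% dirty limit=%d%%\n", unmapped_ratio, dirty_ratio);
        return 0;
}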
| @@ -189,7 +173,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | |||
| 189 | */ | 173 | */ | 
| 190 | static void balance_dirty_pages(struct address_space *mapping) | 174 | static void balance_dirty_pages(struct address_space *mapping) | 
| 191 | { | 175 | { | 
| 192 | struct writeback_state wbs; | ||
| 193 | long nr_reclaimable; | 176 | long nr_reclaimable; | 
| 194 | long background_thresh; | 177 | long background_thresh; | 
| 195 | long dirty_thresh; | 178 | long dirty_thresh; | 
| @@ -207,11 +190,12 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 207 | .range_cyclic = 1, | 190 | .range_cyclic = 1, | 
| 208 | }; | 191 | }; | 
| 209 | 192 | ||
| 210 | get_dirty_limits(&wbs, &background_thresh, | 193 | get_dirty_limits(&background_thresh, &dirty_thresh, mapping); | 
| 211 | &dirty_thresh, mapping); | 194 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 
| 212 | nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; | 195 | global_page_state(NR_UNSTABLE_NFS); | 
| 213 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | 196 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= | 
| 214 | break; | 197 | dirty_thresh) | 
| 198 | break; | ||
| 215 | 199 | ||
| 216 | if (!dirty_exceeded) | 200 | if (!dirty_exceeded) | 
| 217 | dirty_exceeded = 1; | 201 | dirty_exceeded = 1; | 
| @@ -224,11 +208,14 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 224 | */ | 208 | */ | 
| 225 | if (nr_reclaimable) { | 209 | if (nr_reclaimable) { | 
| 226 | writeback_inodes(&wbc); | 210 | writeback_inodes(&wbc); | 
| 227 | get_dirty_limits(&wbs, &background_thresh, | 211 | get_dirty_limits(&background_thresh, | 
| 228 | &dirty_thresh, mapping); | 212 | &dirty_thresh, mapping); | 
| 229 | nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; | 213 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 
| 230 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | 214 | global_page_state(NR_UNSTABLE_NFS); | 
| 231 | break; | 215 | if (nr_reclaimable + | 
| 216 | global_page_state(NR_WRITEBACK) | ||
| 217 | <= dirty_thresh) | ||
| 218 | break; | ||
| 232 | pages_written += write_chunk - wbc.nr_to_write; | 219 | pages_written += write_chunk - wbc.nr_to_write; | 
| 233 | if (pages_written >= write_chunk) | 220 | if (pages_written >= write_chunk) | 
| 234 | break; /* We've done our duty */ | 221 | break; /* We've done our duty */ | 
| @@ -236,8 +223,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 236 | blk_congestion_wait(WRITE, HZ/10); | 223 | blk_congestion_wait(WRITE, HZ/10); | 
| 237 | } | 224 | } | 
| 238 | 225 | ||
| 239 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) | 226 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) | 
| 240 | dirty_exceeded = 0; | 227 | <= dirty_thresh && dirty_exceeded) | 
| 228 | dirty_exceeded = 0; | ||
| 241 | 229 | ||
| 242 | if (writeback_in_progress(bdi)) | 230 | if (writeback_in_progress(bdi)) | 
| 243 | return; /* pdflush is already working this queue */ | 231 | return; /* pdflush is already working this queue */ | 
| @@ -255,6 +243,16 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 255 | pdflush_operation(background_writeout, 0); | 243 | pdflush_operation(background_writeout, 0); | 
| 256 | } | 244 | } | 
| 257 | 245 | ||
| 246 | void set_page_dirty_balance(struct page *page) | ||
| 247 | { | ||
| 248 | if (set_page_dirty(page)) { | ||
| 249 | struct address_space *mapping = page_mapping(page); | ||
| 250 | |||
| 251 | if (mapping) | ||
| 252 | balance_dirty_pages_ratelimited(mapping); | ||
| 253 | } | ||
| 254 | } | ||
| 255 | |||
| 258 | /** | 256 | /** | 
| 259 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 257 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 
| 260 | * @mapping: address_space which was dirtied | 258 | * @mapping: address_space which was dirtied | 
| @@ -299,12 +297,11 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | |||
| 299 | 297 | ||
| 300 | void throttle_vm_writeout(void) | 298 | void throttle_vm_writeout(void) | 
| 301 | { | 299 | { | 
| 302 | struct writeback_state wbs; | ||
| 303 | long background_thresh; | 300 | long background_thresh; | 
| 304 | long dirty_thresh; | 301 | long dirty_thresh; | 
| 305 | 302 | ||
| 306 | for ( ; ; ) { | 303 | for ( ; ; ) { | 
| 307 | get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); | 304 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); | 
| 308 | 305 | ||
| 309 | /* | 306 | /* | 
| 310 | * Boost the allowable dirty threshold a bit for page | 307 | * Boost the allowable dirty threshold a bit for page | 
| @@ -312,8 +309,9 @@ void throttle_vm_writeout(void) | |||
| 312 | */ | 309 | */ | 
| 313 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ | 310 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ | 
| 314 | 311 | ||
| 315 | if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) | 312 | if (global_page_state(NR_UNSTABLE_NFS) + | 
| 316 | break; | 313 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 
| 314 | break; | ||
| 317 | blk_congestion_wait(WRITE, HZ/10); | 315 | blk_congestion_wait(WRITE, HZ/10); | 
| 318 | } | 316 | } | 
| 319 | } | 317 | } | 
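Joining the wrapped lines of the two preceding hunks, the converted throttling test reads, in one piece, as the following simplified sketch (the helper name is ours; the real code is inlined in balance_dirty_pages() and throttle_vm_writeout()):

/* Sketch: wait until reclaimable + writeback pages drop below the limit. */
static void wait_below_dirty_limit(long dirty_thresh)
{
        for ( ; ; ) {
                long nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                      global_page_state(NR_UNSTABLE_NFS);

                if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= dirty_thresh)
                        break;
                blk_congestion_wait(WRITE, HZ/10);      /* back off, then re-check */
        }
}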
| @@ -336,12 +334,12 @@ static void background_writeout(unsigned long _min_pages) | |||
| 336 | }; | 334 | }; | 
| 337 | 335 | ||
| 338 | for ( ; ; ) { | 336 | for ( ; ; ) { | 
| 339 | struct writeback_state wbs; | ||
| 340 | long background_thresh; | 337 | long background_thresh; | 
| 341 | long dirty_thresh; | 338 | long dirty_thresh; | 
| 342 | 339 | ||
| 343 | get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); | 340 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); | 
| 344 | if (wbs.nr_dirty + wbs.nr_unstable < background_thresh | 341 | if (global_page_state(NR_FILE_DIRTY) + | 
| 342 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | ||
| 345 | && min_pages <= 0) | 343 | && min_pages <= 0) | 
| 346 | break; | 344 | break; | 
| 347 | wbc.encountered_congestion = 0; | 345 | wbc.encountered_congestion = 0; | 
| @@ -365,12 +363,9 @@ static void background_writeout(unsigned long _min_pages) | |||
| 365 | */ | 363 | */ | 
| 366 | int wakeup_pdflush(long nr_pages) | 364 | int wakeup_pdflush(long nr_pages) | 
| 367 | { | 365 | { | 
| 368 | if (nr_pages == 0) { | 366 | if (nr_pages == 0) | 
| 369 | struct writeback_state wbs; | 367 | nr_pages = global_page_state(NR_FILE_DIRTY) + | 
| 370 | 368 | global_page_state(NR_UNSTABLE_NFS); | |
| 371 | get_writeback_state(&wbs); | ||
| 372 | nr_pages = wbs.nr_dirty + wbs.nr_unstable; | ||
| 373 | } | ||
| 374 | return pdflush_operation(background_writeout, nr_pages); | 369 | return pdflush_operation(background_writeout, nr_pages); | 
| 375 | } | 370 | } | 
| 376 | 371 | ||
| @@ -401,7 +396,6 @@ static void wb_kupdate(unsigned long arg) | |||
| 401 | unsigned long start_jif; | 396 | unsigned long start_jif; | 
| 402 | unsigned long next_jif; | 397 | unsigned long next_jif; | 
| 403 | long nr_to_write; | 398 | long nr_to_write; | 
| 404 | struct writeback_state wbs; | ||
| 405 | struct writeback_control wbc = { | 399 | struct writeback_control wbc = { | 
| 406 | .bdi = NULL, | 400 | .bdi = NULL, | 
| 407 | .sync_mode = WB_SYNC_NONE, | 401 | .sync_mode = WB_SYNC_NONE, | 
| @@ -414,11 +408,11 @@ static void wb_kupdate(unsigned long arg) | |||
| 414 | 408 | ||
| 415 | sync_supers(); | 409 | sync_supers(); | 
| 416 | 410 | ||
| 417 | get_writeback_state(&wbs); | ||
| 418 | oldest_jif = jiffies - dirty_expire_interval; | 411 | oldest_jif = jiffies - dirty_expire_interval; | 
| 419 | start_jif = jiffies; | 412 | start_jif = jiffies; | 
| 420 | next_jif = start_jif + dirty_writeback_interval; | 413 | next_jif = start_jif + dirty_writeback_interval; | 
| 421 | nr_to_write = wbs.nr_dirty + wbs.nr_unstable + | 414 | nr_to_write = global_page_state(NR_FILE_DIRTY) + | 
| 415 | global_page_state(NR_UNSTABLE_NFS) + | ||
| 422 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 416 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 
| 423 | while (nr_to_write > 0) { | 417 | while (nr_to_write > 0) { | 
| 424 | wbc.encountered_congestion = 0; | 418 | wbc.encountered_congestion = 0; | 
| @@ -507,23 +501,23 @@ void laptop_sync_completion(void) | |||
| 507 | * will write six megabyte chunks, max. | 501 | * will write six megabyte chunks, max. | 
| 508 | */ | 502 | */ | 
| 509 | 503 | ||
| 510 | static void set_ratelimit(void) | 504 | void writeback_set_ratelimit(void) | 
| 511 | { | 505 | { | 
| 512 | ratelimit_pages = total_pages / (num_online_cpus() * 32); | 506 | ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); | 
| 513 | if (ratelimit_pages < 16) | 507 | if (ratelimit_pages < 16) | 
| 514 | ratelimit_pages = 16; | 508 | ratelimit_pages = 16; | 
| 515 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) | 509 | if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) | 
| 516 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | 510 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | 
| 517 | } | 511 | } | 
| 518 | 512 | ||
| 519 | static int | 513 | static int __cpuinit | 
| 520 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) | 514 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) | 
| 521 | { | 515 | { | 
| 522 | set_ratelimit(); | 516 | writeback_set_ratelimit(); | 
| 523 | return 0; | 517 | return 0; | 
| 524 | } | 518 | } | 
| 525 | 519 | ||
| 526 | static struct notifier_block ratelimit_nb = { | 520 | static struct notifier_block __cpuinitdata ratelimit_nb = { | 
| 527 | .notifier_call = ratelimit_handler, | 521 | .notifier_call = ratelimit_handler, | 
| 528 | .next = NULL, | 522 | .next = NULL, | 
| 529 | }; | 523 | }; | 
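To make the writeback_set_ratelimit() sizing concrete, here is an illustrative calculation (assuming 4KB pages; the numbers are examples, not taken from the patch):

/*
 * vm_total_pages = 262144 (1GB of 4KB pages), num_online_cpus() = 4:
 *      ratelimit_pages = 262144 / (4 * 32) = 2048 pages = 8MB
 * 8MB exceeds the 4096 * 1024 byte cap, so the value is clamped:
 *      ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE = 1024 pages = 4MB
 * so each CPU dirties at most roughly 4MB before balance_dirty_pages() runs.
 */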
| @@ -538,9 +532,7 @@ void __init page_writeback_init(void) | |||
| 538 | long buffer_pages = nr_free_buffer_pages(); | 532 | long buffer_pages = nr_free_buffer_pages(); | 
| 539 | long correction; | 533 | long correction; | 
| 540 | 534 | ||
| 541 | total_pages = nr_free_pagecache_pages(); | 535 | correction = (100 * 4 * buffer_pages) / vm_total_pages; | 
| 542 | |||
| 543 | correction = (100 * 4 * buffer_pages) / total_pages; | ||
| 544 | 536 | ||
| 545 | if (correction < 100) { | 537 | if (correction < 100) { | 
| 546 | dirty_background_ratio *= correction; | 538 | dirty_background_ratio *= correction; | 
| @@ -554,7 +546,7 @@ void __init page_writeback_init(void) | |||
| 554 | vm_dirty_ratio = 1; | 546 | vm_dirty_ratio = 1; | 
| 555 | } | 547 | } | 
| 556 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); | 548 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); | 
| 557 | set_ratelimit(); | 549 | writeback_set_ratelimit(); | 
| 558 | register_cpu_notifier(&ratelimit_nb); | 550 | register_cpu_notifier(&ratelimit_nb); | 
| 559 | } | 551 | } | 
| 560 | 552 | ||
| @@ -566,7 +558,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
| 566 | return 0; | 558 | return 0; | 
| 567 | wbc->for_writepages = 1; | 559 | wbc->for_writepages = 1; | 
| 568 | if (mapping->a_ops->writepages) | 560 | if (mapping->a_ops->writepages) | 
| 569 | ret = mapping->a_ops->writepages(mapping, wbc); | 561 | ret = mapping->a_ops->writepages(mapping, wbc); | 
| 570 | else | 562 | else | 
| 571 | ret = generic_writepages(mapping, wbc); | 563 | ret = generic_writepages(mapping, wbc); | 
| 572 | wbc->for_writepages = 0; | 564 | wbc->for_writepages = 0; | 
| @@ -640,7 +632,8 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
| 640 | if (mapping2) { /* Race with truncate? */ | 632 | if (mapping2) { /* Race with truncate? */ | 
| 641 | BUG_ON(mapping2 != mapping); | 633 | BUG_ON(mapping2 != mapping); | 
| 642 | if (mapping_cap_account_dirty(mapping)) | 634 | if (mapping_cap_account_dirty(mapping)) | 
| 643 | inc_page_state(nr_dirty); | 635 | __inc_zone_page_state(page, | 
| 636 | NR_FILE_DIRTY); | ||
| 644 | radix_tree_tag_set(&mapping->page_tree, | 637 | radix_tree_tag_set(&mapping->page_tree, | 
| 645 | page_index(page), PAGECACHE_TAG_DIRTY); | 638 | page_index(page), PAGECACHE_TAG_DIRTY); | 
| 646 | } | 639 | } | 
| @@ -705,7 +698,7 @@ int set_page_dirty_lock(struct page *page) | |||
| 705 | { | 698 | { | 
| 706 | int ret; | 699 | int ret; | 
| 707 | 700 | ||
| 708 | lock_page(page); | 701 | lock_page_nosync(page); | 
| 709 | ret = set_page_dirty(page); | 702 | ret = set_page_dirty(page); | 
| 710 | unlock_page(page); | 703 | unlock_page(page); | 
| 711 | return ret; | 704 | return ret; | 
| @@ -728,8 +721,14 @@ int test_clear_page_dirty(struct page *page) | |||
| 728 | page_index(page), | 721 | page_index(page), | 
| 729 | PAGECACHE_TAG_DIRTY); | 722 | PAGECACHE_TAG_DIRTY); | 
| 730 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 723 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 
| 731 | if (mapping_cap_account_dirty(mapping)) | 724 | /* | 
| 732 | dec_page_state(nr_dirty); | 725 | * We can continue to use `mapping' here because the | 
| 726 | * page is locked, which pins the address_space | ||
| 727 | */ | ||
| 728 | if (mapping_cap_account_dirty(mapping)) { | ||
| 729 | page_mkclean(page); | ||
| 730 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
| 731 | } | ||
| 733 | return 1; | 732 | return 1; | 
| 734 | } | 733 | } | 
| 735 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 734 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 
| @@ -759,8 +758,10 @@ int clear_page_dirty_for_io(struct page *page) | |||
| 759 | 758 | ||
| 760 | if (mapping) { | 759 | if (mapping) { | 
| 761 | if (TestClearPageDirty(page)) { | 760 | if (TestClearPageDirty(page)) { | 
| 762 | if (mapping_cap_account_dirty(mapping)) | 761 | if (mapping_cap_account_dirty(mapping)) { | 
| 763 | dec_page_state(nr_dirty); | 762 | page_mkclean(page); | 
| 763 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
| 764 | } | ||
| 764 | return 1; | 765 | return 1; | 
| 765 | } | 766 | } | 
| 766 | return 0; | 767 | return 0; | 
| @@ -818,6 +819,15 @@ int test_set_page_writeback(struct page *page) | |||
| 818 | EXPORT_SYMBOL(test_set_page_writeback); | 819 | EXPORT_SYMBOL(test_set_page_writeback); | 
| 819 | 820 | ||
| 820 | /* | 821 | /* | 
| 822 | * Wakes up tasks that are being throttled due to writeback congestion | ||
| 823 | */ | ||
| 824 | void writeback_congestion_end(void) | ||
| 825 | { | ||
| 826 | blk_congestion_end(WRITE); | ||
| 827 | } | ||
| 828 | EXPORT_SYMBOL(writeback_congestion_end); | ||
| 829 | |||
| 830 | /* | ||
| 821 | * Return true if any of the pages in the mapping are marked with the | 831 | * Return true if any of the pages in the mapping are marked with the | 
| 822 | * passed tag. | 832 | * passed tag. | 
| 823 | */ | 833 | */ | 
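Taken together, the page-writeback.c hunks above pair every NR_FILE_DIRTY increment with a decrement plus a page_mkclean() call whenever the mapping accounts dirty pages. A condensed sketch of the pattern (the helper names are illustrative, not functions added by this patch):

/* Sketch: keep per-zone dirty accounting balanced across dirty/clean transitions. */
static void sketch_account_dirtied(struct page *page, struct address_space *mapping)
{
        if (mapping_cap_account_dirty(mapping))
                __inc_zone_page_state(page, NR_FILE_DIRTY);     /* page became dirty */
}

static void sketch_account_cleaned(struct page *page, struct address_space *mapping)
{
        if (mapping_cap_account_dirty(mapping)) {
                page_mkclean(page);                             /* write-protect mapped ptes */
                dec_zone_page_state(page, NR_FILE_DIRTY);       /* page became clean */
        }
}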
| diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 423db0db7c02..4f59d90b81e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -14,7 +14,6 @@ | |||
| 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 
| 15 | */ | 15 | */ | 
| 16 | 16 | ||
| 17 | #include <linux/config.h> | ||
| 18 | #include <linux/stddef.h> | 17 | #include <linux/stddef.h> | 
| 19 | #include <linux/mm.h> | 18 | #include <linux/mm.h> | 
| 20 | #include <linux/swap.h> | 19 | #include <linux/swap.h> | 
| @@ -38,6 +37,8 @@ | |||
| 38 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> | 
| 39 | #include <linux/mempolicy.h> | 38 | #include <linux/mempolicy.h> | 
| 40 | #include <linux/stop_machine.h> | 39 | #include <linux/stop_machine.h> | 
| 40 | #include <linux/sort.h> | ||
| 41 | #include <linux/pfn.h> | ||
| 41 | 42 | ||
| 42 | #include <asm/tlbflush.h> | 43 | #include <asm/tlbflush.h> | 
| 43 | #include <asm/div64.h> | 44 | #include <asm/div64.h> | 
| @@ -52,7 +53,6 @@ EXPORT_SYMBOL(node_online_map); | |||
| 52 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 53 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 
| 53 | EXPORT_SYMBOL(node_possible_map); | 54 | EXPORT_SYMBOL(node_possible_map); | 
| 54 | unsigned long totalram_pages __read_mostly; | 55 | unsigned long totalram_pages __read_mostly; | 
| 55 | unsigned long totalhigh_pages __read_mostly; | ||
| 56 | unsigned long totalreserve_pages __read_mostly; | 56 | unsigned long totalreserve_pages __read_mostly; | 
| 57 | long nr_swap_pages; | 57 | long nr_swap_pages; | 
| 58 | int percpu_pagelist_fraction; | 58 | int percpu_pagelist_fraction; | 
| @@ -70,7 +70,15 @@ static void __free_pages_ok(struct page *page, unsigned int order); | |||
| 70 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 70 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 
| 71 | * don't need any ZONE_NORMAL reservation | 71 | * don't need any ZONE_NORMAL reservation | 
| 72 | */ | 72 | */ | 
| 73 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; | 73 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | 
| 74 | 256, | ||
| 75 | #ifdef CONFIG_ZONE_DMA32 | ||
| 76 | 256, | ||
| 77 | #endif | ||
| 78 | #ifdef CONFIG_HIGHMEM | ||
| 79 | 32 | ||
| 80 | #endif | ||
| 81 | }; | ||
| 74 | 82 | ||
| 75 | EXPORT_SYMBOL(totalram_pages); | 83 | EXPORT_SYMBOL(totalram_pages); | 
| 76 | 84 | ||
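For reference, these ratios feed the usual lowmem_reserve calculation; an illustrative example (numbers invented for the example, not from the patch):

/*
 * A node with 900000 HighMem pages and sysctl_lowmem_reserve_ratio = 32
 * for ZONE_NORMAL makes the Normal zone hold back
 *      900000 / 32 = 28125 pages
 * from allocations that could equally well have been satisfied from HighMem.
 */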
| @@ -81,11 +89,53 @@ EXPORT_SYMBOL(totalram_pages); | |||
| 81 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | 89 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | 
| 82 | EXPORT_SYMBOL(zone_table); | 90 | EXPORT_SYMBOL(zone_table); | 
| 83 | 91 | ||
| 84 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; | 92 | static char *zone_names[MAX_NR_ZONES] = { | 
| 93 | "DMA", | ||
| 94 | #ifdef CONFIG_ZONE_DMA32 | ||
| 95 | "DMA32", | ||
| 96 | #endif | ||
| 97 | "Normal", | ||
| 98 | #ifdef CONFIG_HIGHMEM | ||
| 99 | "HighMem" | ||
| 100 | #endif | ||
| 101 | }; | ||
| 102 | |||
| 85 | int min_free_kbytes = 1024; | 103 | int min_free_kbytes = 1024; | 
| 86 | 104 | ||
| 87 | unsigned long __meminitdata nr_kernel_pages; | 105 | unsigned long __meminitdata nr_kernel_pages; | 
| 88 | unsigned long __meminitdata nr_all_pages; | 106 | unsigned long __meminitdata nr_all_pages; | 
| 107 | static unsigned long __initdata dma_reserve; | ||
| 108 | |||
| 109 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
| 110 | /* | ||
| 111 | * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct | ||
| 112 | * ranges of memory (RAM) that may be registered with add_active_range(). | ||
| 113 | * Ranges passed to add_active_range() will be merged if possible | ||
| 114 | * so the number of times add_active_range() can be called is | ||
| 115 | * related to the number of nodes and the number of holes | ||
| 116 | */ | ||
| 117 | #ifdef CONFIG_MAX_ACTIVE_REGIONS | ||
| 118 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ | ||
| 119 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS | ||
| 120 | #else | ||
| 121 | #if MAX_NUMNODES >= 32 | ||
| 122 | /* If there can be many nodes, allow up to 50 holes per node */ | ||
| 123 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) | ||
| 124 | #else | ||
| 125 | /* By default, allow up to 256 distinct regions */ | ||
| 126 | #define MAX_ACTIVE_REGIONS 256 | ||
| 127 | #endif | ||
| 128 | #endif | ||
| 129 | |||
| 130 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; | ||
| 131 | int __initdata nr_nodemap_entries; | ||
| 132 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | ||
| 133 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | ||
| 134 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 135 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | ||
| 136 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | ||
| 137 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
| 138 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
| 89 | 139 | ||
| 90 | #ifdef CONFIG_DEBUG_VM | 140 | #ifdef CONFIG_DEBUG_VM | 
| 91 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 141 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 
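For context, an architecture selecting CONFIG_ARCH_POPULATES_NODE_MAP is expected to fill early_node_map[] during early boot through add_active_range(), an interface added by this series (its definition is not in the hunks shown here). A hedged sketch, with a hypothetical arch hook:

/* Sketch: register one contiguous RAM range on a node; overlapping or
 * adjacent ranges get merged, up to MAX_ACTIVE_REGIONS entries in total. */
static void __init example_register_ram(unsigned int nid,
                                        unsigned long start_pfn,
                                        unsigned long end_pfn)
{
        add_active_range(nid, start_pfn, end_pfn);
}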
| @@ -128,7 +178,6 @@ static int bad_range(struct zone *zone, struct page *page) | |||
| 128 | 178 | ||
| 129 | return 0; | 179 | return 0; | 
| 130 | } | 180 | } | 
| 131 | |||
| 132 | #else | 181 | #else | 
| 133 | static inline int bad_range(struct zone *zone, struct page *page) | 182 | static inline int bad_range(struct zone *zone, struct page *page) | 
| 134 | { | 183 | { | 
| @@ -219,12 +268,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 219 | { | 268 | { | 
| 220 | int i; | 269 | int i; | 
| 221 | 270 | ||
| 222 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | 271 | VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | 
| 223 | /* | 272 | /* | 
| 224 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | 273 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | 
| 225 | * and __GFP_HIGHMEM from hard or soft interrupt context. | 274 | * and __GFP_HIGHMEM from hard or soft interrupt context. | 
| 226 | */ | 275 | */ | 
| 227 | BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | 276 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | 
| 228 | for (i = 0; i < (1 << order); i++) | 277 | for (i = 0; i < (1 << order); i++) | 
| 229 | clear_highpage(page + i); | 278 | clear_highpage(page + i); | 
| 230 | } | 279 | } | 
| @@ -266,7 +315,7 @@ static inline void rmv_page_order(struct page *page) | |||
| 266 | * satisfies the following equation: | 315 | * satisfies the following equation: | 
| 267 | * P = B & ~(1 << O) | 316 | * P = B & ~(1 << O) | 
| 268 | * | 317 | * | 
| 269 | * Assumption: *_mem_map is contigious at least up to MAX_ORDER | 318 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 
| 270 | */ | 319 | */ | 
| 271 | static inline struct page * | 320 | static inline struct page * | 
| 272 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | 321 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | 
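A quick numeric check of the buddy relation in the comment above (order O = 2, purely illustrative):

/*
 * Buddy pair at order 2: B1 = 8 and B2 = 12, since B2 = B1 ^ (1 << 2) = 8 ^ 4 = 12.
 * Both collapse onto the same parent block:
 *      P = B1 & ~(1 << 2) = 8 & ~4 = 8
 *      P = B2 & ~(1 << 2) = 12 & ~4 = 8
 * which is exactly the relation the comment describes.
 */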
| @@ -348,8 +397,8 @@ static inline void __free_one_page(struct page *page, | |||
| 348 | 397 | ||
| 349 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 398 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 
| 350 | 399 | ||
| 351 | BUG_ON(page_idx & (order_size - 1)); | 400 | VM_BUG_ON(page_idx & (order_size - 1)); | 
| 352 | BUG_ON(bad_range(zone, page)); | 401 | VM_BUG_ON(bad_range(zone, page)); | 
| 353 | 402 | ||
| 354 | zone->free_pages += order_size; | 403 | zone->free_pages += order_size; | 
| 355 | while (order < MAX_ORDER-1) { | 404 | while (order < MAX_ORDER-1) { | 
| @@ -422,7 +471,7 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
| 422 | while (count--) { | 471 | while (count--) { | 
| 423 | struct page *page; | 472 | struct page *page; | 
| 424 | 473 | ||
| 425 | BUG_ON(list_empty(list)); | 474 | VM_BUG_ON(list_empty(list)); | 
| 426 | page = list_entry(list->prev, struct page, lru); | 475 | page = list_entry(list->prev, struct page, lru); | 
| 427 | /* have to delete it as __free_one_page list manipulates */ | 476 | /* have to delete it as __free_one_page list manipulates */ | 
| 428 | list_del(&page->lru); | 477 | list_del(&page->lru); | 
| @@ -433,9 +482,11 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
| 433 | 482 | ||
| 434 | static void free_one_page(struct zone *zone, struct page *page, int order) | 483 | static void free_one_page(struct zone *zone, struct page *page, int order) | 
| 435 | { | 484 | { | 
| 436 | LIST_HEAD(list); | 485 | spin_lock(&zone->lock); | 
| 437 | list_add(&page->lru, &list); | 486 | zone->all_unreclaimable = 0; | 
| 438 | free_pages_bulk(zone, 1, &list, order); | 487 | zone->pages_scanned = 0; | 
| 488 | __free_one_page(page, zone ,order); | ||
| 489 | spin_unlock(&zone->lock); | ||
| 439 | } | 490 | } | 
| 440 | 491 | ||
| 441 | static void __free_pages_ok(struct page *page, unsigned int order) | 492 | static void __free_pages_ok(struct page *page, unsigned int order) | 
| @@ -446,8 +497,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 446 | 497 | ||
| 447 | arch_free_page(page, order); | 498 | arch_free_page(page, order); | 
| 448 | if (!PageHighMem(page)) | 499 | if (!PageHighMem(page)) | 
| 449 | mutex_debug_check_no_locks_freed(page_address(page), | 500 | debug_check_no_locks_freed(page_address(page), | 
| 450 | PAGE_SIZE<<order); | 501 | PAGE_SIZE<<order); | 
| 451 | 502 | ||
| 452 | for (i = 0 ; i < (1 << order) ; ++i) | 503 | for (i = 0 ; i < (1 << order) ; ++i) | 
| 453 | reserved += free_pages_check(page + i); | 504 | reserved += free_pages_check(page + i); | 
| @@ -456,7 +507,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 456 | 507 | ||
| 457 | kernel_map_pages(page, 1 << order, 0); | 508 | kernel_map_pages(page, 1 << order, 0); | 
| 458 | local_irq_save(flags); | 509 | local_irq_save(flags); | 
| 459 | __mod_page_state(pgfree, 1 << order); | 510 | __count_vm_events(PGFREE, 1 << order); | 
| 460 | free_one_page(page_zone(page), page, order); | 511 | free_one_page(page_zone(page), page, order); | 
| 461 | local_irq_restore(flags); | 512 | local_irq_restore(flags); | 
| 462 | } | 513 | } | 
| @@ -513,7 +564,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
| 513 | area--; | 564 | area--; | 
| 514 | high--; | 565 | high--; | 
| 515 | size >>= 1; | 566 | size >>= 1; | 
| 516 | BUG_ON(bad_range(zone, &page[size])); | 567 | VM_BUG_ON(bad_range(zone, &page[size])); | 
| 517 | list_add(&page[size].lru, &area->free_list); | 568 | list_add(&page[size].lru, &area->free_list); | 
| 518 | area->nr_free++; | 569 | area->nr_free++; | 
| 519 | set_page_order(&page[size], high); | 570 | set_page_order(&page[size], high); | 
| @@ -616,19 +667,23 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
| 616 | #ifdef CONFIG_NUMA | 667 | #ifdef CONFIG_NUMA | 
| 617 | /* | 668 | /* | 
| 618 | * Called from the slab reaper to drain pagesets on a particular node that | 669 | * Called from the slab reaper to drain pagesets on a particular node that | 
| 619 | * belong to the currently executing processor. | 670 | * belongs to the currently executing processor. | 
| 620 | * Note that this function must be called with the thread pinned to | 671 | * Note that this function must be called with the thread pinned to | 
| 621 | * a single processor. | 672 | * a single processor. | 
| 622 | */ | 673 | */ | 
| 623 | void drain_node_pages(int nodeid) | 674 | void drain_node_pages(int nodeid) | 
| 624 | { | 675 | { | 
| 625 | int i, z; | 676 | int i; | 
| 677 | enum zone_type z; | ||
| 626 | unsigned long flags; | 678 | unsigned long flags; | 
| 627 | 679 | ||
| 628 | for (z = 0; z < MAX_NR_ZONES; z++) { | 680 | for (z = 0; z < MAX_NR_ZONES; z++) { | 
| 629 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 681 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 
| 630 | struct per_cpu_pageset *pset; | 682 | struct per_cpu_pageset *pset; | 
| 631 | 683 | ||
| 684 | if (!populated_zone(zone)) | ||
| 685 | continue; | ||
| 686 | |||
| 632 | pset = zone_pcp(zone, smp_processor_id()); | 687 | pset = zone_pcp(zone, smp_processor_id()); | 
| 633 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 688 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 
| 634 | struct per_cpu_pages *pcp; | 689 | struct per_cpu_pages *pcp; | 
| @@ -673,7 +728,8 @@ static void __drain_pages(unsigned int cpu) | |||
| 673 | 728 | ||
| 674 | void mark_free_pages(struct zone *zone) | 729 | void mark_free_pages(struct zone *zone) | 
| 675 | { | 730 | { | 
| 676 | unsigned long zone_pfn, flags; | 731 | unsigned long pfn, max_zone_pfn; | 
| 732 | unsigned long flags; | ||
| 677 | int order; | 733 | int order; | 
| 678 | struct list_head *curr; | 734 | struct list_head *curr; | 
| 679 | 735 | ||
| @@ -681,18 +737,25 @@ void mark_free_pages(struct zone *zone) | |||
| 681 | return; | 737 | return; | 
| 682 | 738 | ||
| 683 | spin_lock_irqsave(&zone->lock, flags); | 739 | spin_lock_irqsave(&zone->lock, flags); | 
| 684 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 740 | |
| 685 | ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); | 741 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 
| 742 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | ||
| 743 | if (pfn_valid(pfn)) { | ||
| 744 | struct page *page = pfn_to_page(pfn); | ||
| 745 | |||
| 746 | if (!PageNosave(page)) | ||
| 747 | ClearPageNosaveFree(page); | ||
| 748 | } | ||
| 686 | 749 | ||
| 687 | for (order = MAX_ORDER - 1; order >= 0; --order) | 750 | for (order = MAX_ORDER - 1; order >= 0; --order) | 
| 688 | list_for_each(curr, &zone->free_area[order].free_list) { | 751 | list_for_each(curr, &zone->free_area[order].free_list) { | 
| 689 | unsigned long start_pfn, i; | 752 | unsigned long i; | 
| 690 | 753 | ||
| 691 | start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 754 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 
| 755 | for (i = 0; i < (1UL << order); i++) | ||
| 756 | SetPageNosaveFree(pfn_to_page(pfn + i)); | ||
| 757 | } | ||
| 692 | 758 | ||
| 693 | for (i=0; i < (1<<order); i++) | ||
| 694 | SetPageNosaveFree(pfn_to_page(start_pfn+i)); | ||
| 695 | } | ||
| 696 | spin_unlock_irqrestore(&zone->lock, flags); | 759 | spin_unlock_irqrestore(&zone->lock, flags); | 
| 697 | } | 760 | } | 
| 698 | 761 | ||
| @@ -709,27 +772,6 @@ void drain_local_pages(void) | |||
| 709 | } | 772 | } | 
| 710 | #endif /* CONFIG_PM */ | 773 | #endif /* CONFIG_PM */ | 
| 711 | 774 | ||
| 712 | static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) | ||
| 713 | { | ||
| 714 | #ifdef CONFIG_NUMA | ||
| 715 | pg_data_t *pg = z->zone_pgdat; | ||
| 716 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | ||
| 717 | struct per_cpu_pageset *p; | ||
| 718 | |||
| 719 | p = zone_pcp(z, cpu); | ||
| 720 | if (pg == orig) { | ||
| 721 | p->numa_hit++; | ||
| 722 | } else { | ||
| 723 | p->numa_miss++; | ||
| 724 | zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; | ||
| 725 | } | ||
| 726 | if (pg == NODE_DATA(numa_node_id())) | ||
| 727 | p->local_node++; | ||
| 728 | else | ||
| 729 | p->other_node++; | ||
| 730 | #endif | ||
| 731 | } | ||
| 732 | |||
| 733 | /* | 775 | /* | 
| 734 | * Free a 0-order page | 776 | * Free a 0-order page | 
| 735 | */ | 777 | */ | 
| @@ -750,7 +792,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
| 750 | 792 | ||
| 751 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 793 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 
| 752 | local_irq_save(flags); | 794 | local_irq_save(flags); | 
| 753 | __inc_page_state(pgfree); | 795 | __count_vm_event(PGFREE); | 
| 754 | list_add(&page->lru, &pcp->list); | 796 | list_add(&page->lru, &pcp->list); | 
| 755 | pcp->count++; | 797 | pcp->count++; | 
| 756 | if (pcp->count >= pcp->high) { | 798 | if (pcp->count >= pcp->high) { | 
| @@ -783,8 +825,8 @@ void split_page(struct page *page, unsigned int order) | |||
| 783 | { | 825 | { | 
| 784 | int i; | 826 | int i; | 
| 785 | 827 | ||
| 786 | BUG_ON(PageCompound(page)); | 828 | VM_BUG_ON(PageCompound(page)); | 
| 787 | BUG_ON(!page_count(page)); | 829 | VM_BUG_ON(!page_count(page)); | 
| 788 | for (i = 1; i < (1 << order); i++) | 830 | for (i = 1; i < (1 << order); i++) | 
| 789 | set_page_refcounted(page + i); | 831 | set_page_refcounted(page + i); | 
| 790 | } | 832 | } | 
| @@ -826,12 +868,12 @@ again: | |||
| 826 | goto failed; | 868 | goto failed; | 
| 827 | } | 869 | } | 
| 828 | 870 | ||
| 829 | __mod_page_state_zone(zone, pgalloc, 1 << order); | 871 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 
| 830 | zone_statistics(zonelist, zone, cpu); | 872 | zone_statistics(zonelist, zone); | 
| 831 | local_irq_restore(flags); | 873 | local_irq_restore(flags); | 
| 832 | put_cpu(); | 874 | put_cpu(); | 
| 833 | 875 | ||
| 834 | BUG_ON(bad_range(zone, page)); | 876 | VM_BUG_ON(bad_range(zone, page)); | 
| 835 | if (prep_new_page(page, order, gfp_flags)) | 877 | if (prep_new_page(page, order, gfp_flags)) | 
| 836 | goto again; | 878 | goto again; | 
| 837 | return page; | 879 | return page; | 
| @@ -892,32 +934,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
| 892 | struct zone **z = zonelist->zones; | 934 | struct zone **z = zonelist->zones; | 
| 893 | struct page *page = NULL; | 935 | struct page *page = NULL; | 
| 894 | int classzone_idx = zone_idx(*z); | 936 | int classzone_idx = zone_idx(*z); | 
| 937 | struct zone *zone; | ||
| 895 | 938 | ||
| 896 | /* | 939 | /* | 
| 897 | * Go through the zonelist once, looking for a zone with enough free. | 940 | * Go through the zonelist once, looking for a zone with enough free. | 
| 898 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 941 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 
| 899 | */ | 942 | */ | 
| 900 | do { | 943 | do { | 
| 944 | zone = *z; | ||
| 945 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | ||
| 946 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | ||
| 947 | break; | ||
| 901 | if ((alloc_flags & ALLOC_CPUSET) && | 948 | if ((alloc_flags & ALLOC_CPUSET) && | 
| 902 | !cpuset_zone_allowed(*z, gfp_mask)) | 949 | !cpuset_zone_allowed(zone, gfp_mask)) | 
| 903 | continue; | 950 | continue; | 
| 904 | 951 | ||
| 905 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 952 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 
| 906 | unsigned long mark; | 953 | unsigned long mark; | 
| 907 | if (alloc_flags & ALLOC_WMARK_MIN) | 954 | if (alloc_flags & ALLOC_WMARK_MIN) | 
| 908 | mark = (*z)->pages_min; | 955 | mark = zone->pages_min; | 
| 909 | else if (alloc_flags & ALLOC_WMARK_LOW) | 956 | else if (alloc_flags & ALLOC_WMARK_LOW) | 
| 910 | mark = (*z)->pages_low; | 957 | mark = zone->pages_low; | 
| 911 | else | 958 | else | 
| 912 | mark = (*z)->pages_high; | 959 | mark = zone->pages_high; | 
| 913 | if (!zone_watermark_ok(*z, order, mark, | 960 | if (!zone_watermark_ok(zone , order, mark, | 
| 914 | classzone_idx, alloc_flags)) | 961 | classzone_idx, alloc_flags)) | 
| 915 | if (!zone_reclaim_mode || | 962 | if (!zone_reclaim_mode || | 
| 916 | !zone_reclaim(*z, gfp_mask, order)) | 963 | !zone_reclaim(zone, gfp_mask, order)) | 
| 917 | continue; | 964 | continue; | 
| 918 | } | 965 | } | 
| 919 | 966 | ||
| 920 | page = buffered_rmqueue(zonelist, *z, order, gfp_mask); | 967 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); | 
| 921 | if (page) { | 968 | if (page) { | 
| 922 | break; | 969 | break; | 
| 923 | } | 970 | } | 
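The new NUMA_BUILD / __GFP_THISNODE check above makes the free-list scan stop once it leaves the first node of the zonelist, so a node-local request is not silently satisfied elsewhere. A caller opts in roughly like this (minimal sketch):

/* Sketch: try to allocate strictly on node nid; a NULL return means the
 * request could not be met from that node's zones in this pass. */
static struct page *alloc_page_on_node(int nid)
{
        return alloc_pages_node(nid, GFP_KERNEL | __GFP_THISNODE, 0);
}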
| @@ -957,8 +1004,7 @@ restart: | |||
| 957 | goto got_pg; | 1004 | goto got_pg; | 
| 958 | 1005 | ||
| 959 | do { | 1006 | do { | 
| 960 | if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) | 1007 | wakeup_kswapd(*z, order); | 
| 961 | wakeup_kswapd(*z, order); | ||
| 962 | } while (*(++z)); | 1008 | } while (*(++z)); | 
| 963 | 1009 | ||
| 964 | /* | 1010 | /* | 
| @@ -1106,7 +1152,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) | |||
| 1106 | * get_zeroed_page() returns a 32-bit address, which cannot represent | 1152 | * get_zeroed_page() returns a 32-bit address, which cannot represent | 
| 1107 | * a highmem page | 1153 | * a highmem page | 
| 1108 | */ | 1154 | */ | 
| 1109 | BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | 1155 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | 
| 1110 | 1156 | ||
| 1111 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | 1157 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | 
| 1112 | if (page) | 1158 | if (page) | 
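The VM_BUG_ON above enforces the documented restriction that get_zeroed_page() cannot hand back highmem, since it returns a kernel virtual address. A legitimate caller looks roughly like this (sketch):

/* Sketch: allocate one zeroed lowmem page and release it again. */
static int example_use_zeroed_page(void)
{
        unsigned long addr = get_zeroed_page(GFP_KERNEL);       /* never __GFP_HIGHMEM */

        if (!addr)
                return -ENOMEM;
        /* ... use the page through its kernel virtual address ... */
        free_page(addr);
        return 0;
}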
| @@ -1139,7 +1185,7 @@ EXPORT_SYMBOL(__free_pages); | |||
| 1139 | fastcall void free_pages(unsigned long addr, unsigned int order) | 1185 | fastcall void free_pages(unsigned long addr, unsigned int order) | 
| 1140 | { | 1186 | { | 
| 1141 | if (addr != 0) { | 1187 | if (addr != 0) { | 
| 1142 | BUG_ON(!virt_addr_valid((void *)addr)); | 1188 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 
| 1143 | __free_pages(virt_to_page((void *)addr), order); | 1189 | __free_pages(virt_to_page((void *)addr), order); | 
| 1144 | } | 1190 | } | 
| 1145 | } | 1191 | } | 
| @@ -1165,7 +1211,8 @@ EXPORT_SYMBOL(nr_free_pages); | |||
| 1165 | #ifdef CONFIG_NUMA | 1211 | #ifdef CONFIG_NUMA | 
| 1166 | unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) | 1212 | unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) | 
| 1167 | { | 1213 | { | 
| 1168 | unsigned int i, sum = 0; | 1214 | unsigned int sum = 0; | 
| 1215 | enum zone_type i; | ||
| 1169 | 1216 | ||
| 1170 | for (i = 0; i < MAX_NR_ZONES; i++) | 1217 | for (i = 0; i < MAX_NR_ZONES; i++) | 
| 1171 | sum += pgdat->node_zones[i].free_pages; | 1218 | sum += pgdat->node_zones[i].free_pages; | 
| @@ -1210,161 +1257,10 @@ unsigned int nr_free_pagecache_pages(void) | |||
| 1210 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); | 1257 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); | 
| 1211 | } | 1258 | } | 
| 1212 | 1259 | ||
| 1213 | #ifdef CONFIG_HIGHMEM | 1260 | static inline void show_node(struct zone *zone) | 
| 1214 | unsigned int nr_free_highpages (void) | ||
| 1215 | { | ||
| 1216 | pg_data_t *pgdat; | ||
| 1217 | unsigned int pages = 0; | ||
| 1218 | |||
| 1219 | for_each_online_pgdat(pgdat) | ||
| 1220 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; | ||
| 1221 | |||
| 1222 | return pages; | ||
| 1223 | } | ||
| 1224 | #endif | ||
| 1225 | |||
| 1226 | #ifdef CONFIG_NUMA | ||
| 1227 | static void show_node(struct zone *zone) | ||
| 1228 | { | ||
| 1229 | printk("Node %d ", zone->zone_pgdat->node_id); | ||
| 1230 | } | ||
| 1231 | #else | ||
| 1232 | #define show_node(zone) do { } while (0) | ||
| 1233 | #endif | ||
| 1234 | |||
| 1235 | /* | ||
| 1236 | * Accumulate the page_state information across all CPUs. | ||
| 1237 | * The result is unavoidably approximate - it can change | ||
| 1238 | * during and after execution of this function. | ||
| 1239 | */ | ||
| 1240 | static DEFINE_PER_CPU(struct page_state, page_states) = {0}; | ||
| 1241 | |||
| 1242 | atomic_t nr_pagecache = ATOMIC_INIT(0); | ||
| 1243 | EXPORT_SYMBOL(nr_pagecache); | ||
| 1244 | #ifdef CONFIG_SMP | ||
| 1245 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | ||
| 1246 | #endif | ||
| 1247 | |||
| 1248 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | ||
| 1249 | { | ||
| 1250 | unsigned cpu; | ||
| 1251 | |||
| 1252 | memset(ret, 0, nr * sizeof(unsigned long)); | ||
| 1253 | cpus_and(*cpumask, *cpumask, cpu_online_map); | ||
| 1254 | |||
| 1255 | for_each_cpu_mask(cpu, *cpumask) { | ||
| 1256 | unsigned long *in; | ||
| 1257 | unsigned long *out; | ||
| 1258 | unsigned off; | ||
| 1259 | unsigned next_cpu; | ||
| 1260 | |||
| 1261 | in = (unsigned long *)&per_cpu(page_states, cpu); | ||
| 1262 | |||
| 1263 | next_cpu = next_cpu(cpu, *cpumask); | ||
| 1264 | if (likely(next_cpu < NR_CPUS)) | ||
| 1265 | prefetch(&per_cpu(page_states, next_cpu)); | ||
| 1266 | |||
| 1267 | out = (unsigned long *)ret; | ||
| 1268 | for (off = 0; off < nr; off++) | ||
| 1269 | *out++ += *in++; | ||
| 1270 | } | ||
| 1271 | } | ||
| 1272 | |||
| 1273 | void get_page_state_node(struct page_state *ret, int node) | ||
| 1274 | { | ||
| 1275 | int nr; | ||
| 1276 | cpumask_t mask = node_to_cpumask(node); | ||
| 1277 | |||
| 1278 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | ||
| 1279 | nr /= sizeof(unsigned long); | ||
| 1280 | |||
| 1281 | __get_page_state(ret, nr+1, &mask); | ||
| 1282 | } | ||
| 1283 | |||
| 1284 | void get_page_state(struct page_state *ret) | ||
| 1285 | { | ||
| 1286 | int nr; | ||
| 1287 | cpumask_t mask = CPU_MASK_ALL; | ||
| 1288 | |||
| 1289 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | ||
| 1290 | nr /= sizeof(unsigned long); | ||
| 1291 | |||
| 1292 | __get_page_state(ret, nr + 1, &mask); | ||
| 1293 | } | ||
| 1294 | |||
| 1295 | void get_full_page_state(struct page_state *ret) | ||
| 1296 | { | ||
| 1297 | cpumask_t mask = CPU_MASK_ALL; | ||
| 1298 | |||
| 1299 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | ||
| 1300 | } | ||
| 1301 | |||
| 1302 | unsigned long read_page_state_offset(unsigned long offset) | ||
| 1303 | { | ||
| 1304 | unsigned long ret = 0; | ||
| 1305 | int cpu; | ||
| 1306 | |||
| 1307 | for_each_online_cpu(cpu) { | ||
| 1308 | unsigned long in; | ||
| 1309 | |||
| 1310 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; | ||
| 1311 | ret += *((unsigned long *)in); | ||
| 1312 | } | ||
| 1313 | return ret; | ||
| 1314 | } | ||
| 1315 | |||
| 1316 | void __mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
| 1317 | { | 1261 | { | 
| 1318 | void *ptr; | 1262 | if (NUMA_BUILD) | 
| 1319 | 1263 | printk("Node %ld ", zone_to_nid(zone)); | |
| 1320 | ptr = &__get_cpu_var(page_states); | ||
| 1321 | *(unsigned long *)(ptr + offset) += delta; | ||
| 1322 | } | ||
| 1323 | EXPORT_SYMBOL(__mod_page_state_offset); | ||
| 1324 | |||
| 1325 | void mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
| 1326 | { | ||
| 1327 | unsigned long flags; | ||
| 1328 | void *ptr; | ||
| 1329 | |||
| 1330 | local_irq_save(flags); | ||
| 1331 | ptr = &__get_cpu_var(page_states); | ||
| 1332 | *(unsigned long *)(ptr + offset) += delta; | ||
| 1333 | local_irq_restore(flags); | ||
| 1334 | } | ||
| 1335 | EXPORT_SYMBOL(mod_page_state_offset); | ||
| 1336 | |||
| 1337 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | ||
| 1338 | unsigned long *free, struct pglist_data *pgdat) | ||
| 1339 | { | ||
| 1340 | struct zone *zones = pgdat->node_zones; | ||
| 1341 | int i; | ||
| 1342 | |||
| 1343 | *active = 0; | ||
| 1344 | *inactive = 0; | ||
| 1345 | *free = 0; | ||
| 1346 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
| 1347 | *active += zones[i].nr_active; | ||
| 1348 | *inactive += zones[i].nr_inactive; | ||
| 1349 | *free += zones[i].free_pages; | ||
| 1350 | } | ||
| 1351 | } | ||
| 1352 | |||
| 1353 | void get_zone_counts(unsigned long *active, | ||
| 1354 | unsigned long *inactive, unsigned long *free) | ||
| 1355 | { | ||
| 1356 | struct pglist_data *pgdat; | ||
| 1357 | |||
| 1358 | *active = 0; | ||
| 1359 | *inactive = 0; | ||
| 1360 | *free = 0; | ||
| 1361 | for_each_online_pgdat(pgdat) { | ||
| 1362 | unsigned long l, m, n; | ||
| 1363 | __get_zone_counts(&l, &m, &n, pgdat); | ||
| 1364 | *active += l; | ||
| 1365 | *inactive += m; | ||
| 1366 | *free += n; | ||
| 1367 | } | ||
| 1368 | } | 1264 | } | 
| 1369 | 1265 | ||
| 1370 | void si_meminfo(struct sysinfo *val) | 1266 | void si_meminfo(struct sysinfo *val) | 
| @@ -1373,13 +1269,8 @@ void si_meminfo(struct sysinfo *val) | |||
| 1373 | val->sharedram = 0; | 1269 | val->sharedram = 0; | 
| 1374 | val->freeram = nr_free_pages(); | 1270 | val->freeram = nr_free_pages(); | 
| 1375 | val->bufferram = nr_blockdev_pages(); | 1271 | val->bufferram = nr_blockdev_pages(); | 
| 1376 | #ifdef CONFIG_HIGHMEM | ||
| 1377 | val->totalhigh = totalhigh_pages; | 1272 | val->totalhigh = totalhigh_pages; | 
| 1378 | val->freehigh = nr_free_highpages(); | 1273 | val->freehigh = nr_free_highpages(); | 
| 1379 | #else | ||
| 1380 | val->totalhigh = 0; | ||
| 1381 | val->freehigh = 0; | ||
| 1382 | #endif | ||
| 1383 | val->mem_unit = PAGE_SIZE; | 1274 | val->mem_unit = PAGE_SIZE; | 
| 1384 | } | 1275 | } | 
| 1385 | 1276 | ||
| @@ -1392,8 +1283,13 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
| 1392 | 1283 | ||
| 1393 | val->totalram = pgdat->node_present_pages; | 1284 | val->totalram = pgdat->node_present_pages; | 
| 1394 | val->freeram = nr_free_pages_pgdat(pgdat); | 1285 | val->freeram = nr_free_pages_pgdat(pgdat); | 
| 1286 | #ifdef CONFIG_HIGHMEM | ||
| 1395 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 1287 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 
| 1396 | val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; | 1288 | val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; | 
| 1289 | #else | ||
| 1290 | val->totalhigh = 0; | ||
| 1291 | val->freehigh = 0; | ||
| 1292 | #endif | ||
| 1397 | val->mem_unit = PAGE_SIZE; | 1293 | val->mem_unit = PAGE_SIZE; | 
| 1398 | } | 1294 | } | 
| 1399 | #endif | 1295 | #endif | 
| @@ -1407,60 +1303,54 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
| 1407 | */ | 1303 | */ | 
| 1408 | void show_free_areas(void) | 1304 | void show_free_areas(void) | 
| 1409 | { | 1305 | { | 
| 1410 | struct page_state ps; | 1306 | int cpu; | 
| 1411 | int cpu, temperature; | ||
| 1412 | unsigned long active; | 1307 | unsigned long active; | 
| 1413 | unsigned long inactive; | 1308 | unsigned long inactive; | 
| 1414 | unsigned long free; | 1309 | unsigned long free; | 
| 1415 | struct zone *zone; | 1310 | struct zone *zone; | 
| 1416 | 1311 | ||
| 1417 | for_each_zone(zone) { | 1312 | for_each_zone(zone) { | 
| 1418 | show_node(zone); | 1313 | if (!populated_zone(zone)) | 
| 1419 | printk("%s per-cpu:", zone->name); | ||
| 1420 | |||
| 1421 | if (!populated_zone(zone)) { | ||
| 1422 | printk(" empty\n"); | ||
| 1423 | continue; | 1314 | continue; | 
| 1424 | } else | 1315 | |
| 1425 | printk("\n"); | 1316 | show_node(zone); | 
| 1317 | printk("%s per-cpu:\n", zone->name); | ||
| 1426 | 1318 | ||
| 1427 | for_each_online_cpu(cpu) { | 1319 | for_each_online_cpu(cpu) { | 
| 1428 | struct per_cpu_pageset *pageset; | 1320 | struct per_cpu_pageset *pageset; | 
| 1429 | 1321 | ||
| 1430 | pageset = zone_pcp(zone, cpu); | 1322 | pageset = zone_pcp(zone, cpu); | 
| 1431 | 1323 | ||
| 1432 | for (temperature = 0; temperature < 2; temperature++) | 1324 | printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " | 
| 1433 | printk("cpu %d %s: high %d, batch %d used:%d\n", | 1325 | "Cold: hi:%5d, btch:%4d usd:%4d\n", | 
| 1434 | cpu, | 1326 | cpu, pageset->pcp[0].high, | 
| 1435 | temperature ? "cold" : "hot", | 1327 | pageset->pcp[0].batch, pageset->pcp[0].count, | 
| 1436 | pageset->pcp[temperature].high, | 1328 | pageset->pcp[1].high, pageset->pcp[1].batch, | 
| 1437 | pageset->pcp[temperature].batch, | 1329 | pageset->pcp[1].count); | 
| 1438 | pageset->pcp[temperature].count); | ||
| 1439 | } | 1330 | } | 
| 1440 | } | 1331 | } | 
| 1441 | 1332 | ||
| 1442 | get_page_state(&ps); | ||
| 1443 | get_zone_counts(&active, &inactive, &free); | 1333 | get_zone_counts(&active, &inactive, &free); | 
| 1444 | 1334 | ||
| 1445 | printk("Free pages: %11ukB (%ukB HighMem)\n", | ||
| 1446 | K(nr_free_pages()), | ||
| 1447 | K(nr_free_highpages())); | ||
| 1448 | |||
| 1449 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " | 1335 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " | 
| 1450 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", | 1336 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", | 
| 1451 | active, | 1337 | active, | 
| 1452 | inactive, | 1338 | inactive, | 
| 1453 | ps.nr_dirty, | 1339 | global_page_state(NR_FILE_DIRTY), | 
| 1454 | ps.nr_writeback, | 1340 | global_page_state(NR_WRITEBACK), | 
| 1455 | ps.nr_unstable, | 1341 | global_page_state(NR_UNSTABLE_NFS), | 
| 1456 | nr_free_pages(), | 1342 | nr_free_pages(), | 
| 1457 | ps.nr_slab, | 1343 | global_page_state(NR_SLAB_RECLAIMABLE) + | 
| 1458 | ps.nr_mapped, | 1344 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 
| 1459 | ps.nr_page_table_pages); | 1345 | global_page_state(NR_FILE_MAPPED), | 
| 1346 | global_page_state(NR_PAGETABLE)); | ||
| 1460 | 1347 | ||
| 1461 | for_each_zone(zone) { | 1348 | for_each_zone(zone) { | 
| 1462 | int i; | 1349 | int i; | 
| 1463 | 1350 | ||
| 1351 | if (!populated_zone(zone)) | ||
| 1352 | continue; | ||
| 1353 | |||
| 1464 | show_node(zone); | 1354 | show_node(zone); | 
| 1465 | printk("%s" | 1355 | printk("%s" | 
| 1466 | " free:%lukB" | 1356 | " free:%lukB" | 
| @@ -1493,12 +1383,11 @@ void show_free_areas(void) | |||
| 1493 | for_each_zone(zone) { | 1383 | for_each_zone(zone) { | 
| 1494 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 1384 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 
| 1495 | 1385 | ||
| 1386 | if (!populated_zone(zone)) | ||
| 1387 | continue; | ||
| 1388 | |||
| 1496 | show_node(zone); | 1389 | show_node(zone); | 
| 1497 | printk("%s: ", zone->name); | 1390 | printk("%s: ", zone->name); | 
| 1498 | if (!populated_zone(zone)) { | ||
| 1499 | printk("empty\n"); | ||
| 1500 | continue; | ||
| 1501 | } | ||
| 1502 | 1391 | ||
| 1503 | spin_lock_irqsave(&zone->lock, flags); | 1392 | spin_lock_irqsave(&zone->lock, flags); | 
| 1504 | for (order = 0; order < MAX_ORDER; order++) { | 1393 | for (order = 0; order < MAX_ORDER; order++) { | 
| @@ -1520,39 +1409,25 @@ void show_free_areas(void) | |||
| 1520 | * Add all populated zones of a node to the zonelist. | 1409 | * Add all populated zones of a node to the zonelist. | 
| 1521 | */ | 1410 | */ | 
| 1522 | static int __meminit build_zonelists_node(pg_data_t *pgdat, | 1411 | static int __meminit build_zonelists_node(pg_data_t *pgdat, | 
| 1523 | struct zonelist *zonelist, int nr_zones, int zone_type) | 1412 | struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) | 
| 1524 | { | 1413 | { | 
| 1525 | struct zone *zone; | 1414 | struct zone *zone; | 
| 1526 | 1415 | ||
| 1527 | BUG_ON(zone_type > ZONE_HIGHMEM); | 1416 | BUG_ON(zone_type >= MAX_NR_ZONES); | 
| 1417 | zone_type++; | ||
| 1528 | 1418 | ||
| 1529 | do { | 1419 | do { | 
| 1420 | zone_type--; | ||
| 1530 | zone = pgdat->node_zones + zone_type; | 1421 | zone = pgdat->node_zones + zone_type; | 
| 1531 | if (populated_zone(zone)) { | 1422 | if (populated_zone(zone)) { | 
| 1532 | #ifndef CONFIG_HIGHMEM | ||
| 1533 | BUG_ON(zone_type > ZONE_NORMAL); | ||
| 1534 | #endif | ||
| 1535 | zonelist->zones[nr_zones++] = zone; | 1423 | zonelist->zones[nr_zones++] = zone; | 
| 1536 | check_highest_zone(zone_type); | 1424 | check_highest_zone(zone_type); | 
| 1537 | } | 1425 | } | 
| 1538 | zone_type--; | ||
| 1539 | 1426 | ||
| 1540 | } while (zone_type >= 0); | 1427 | } while (zone_type); | 
| 1541 | return nr_zones; | 1428 | return nr_zones; | 
| 1542 | } | 1429 | } | 
| 1543 | 1430 | ||
| 1544 | static inline int highest_zone(int zone_bits) | ||
| 1545 | { | ||
| 1546 | int res = ZONE_NORMAL; | ||
| 1547 | if (zone_bits & (__force int)__GFP_HIGHMEM) | ||
| 1548 | res = ZONE_HIGHMEM; | ||
| 1549 | if (zone_bits & (__force int)__GFP_DMA32) | ||
| 1550 | res = ZONE_DMA32; | ||
| 1551 | if (zone_bits & (__force int)__GFP_DMA) | ||
| 1552 | res = ZONE_DMA; | ||
| 1553 | return res; | ||
| 1554 | } | ||
| 1555 | |||
| 1556 | #ifdef CONFIG_NUMA | 1431 | #ifdef CONFIG_NUMA | 
| 1557 | #define MAX_NODE_LOAD (num_online_nodes()) | 1432 | #define MAX_NODE_LOAD (num_online_nodes()) | 
| 1558 | static int __meminitdata node_load[MAX_NUMNODES]; | 1433 | static int __meminitdata node_load[MAX_NUMNODES]; | 
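To make the reworked build_zonelists_node() ordering concrete: the function now walks zone types downwards from the requested type, skipping unpopulated zones. An illustrative result for a fully populated i386-style node with HighMem:

/*
 * build_zonelists_node(pgdat, zonelist, 0, ZONE_HIGHMEM) fills
 *      zonelist->zones[] = { HighMem, Normal, DMA, NULL }
 * zones that fail populated_zone() are simply left out of the list.
 */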
| @@ -1618,13 +1493,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) | |||
| 1618 | 1493 | ||
| 1619 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1494 | static void __meminit build_zonelists(pg_data_t *pgdat) | 
| 1620 | { | 1495 | { | 
| 1621 | int i, j, k, node, local_node; | 1496 | int j, node, local_node; | 
| 1497 | enum zone_type i; | ||
| 1622 | int prev_node, load; | 1498 | int prev_node, load; | 
| 1623 | struct zonelist *zonelist; | 1499 | struct zonelist *zonelist; | 
| 1624 | nodemask_t used_mask; | 1500 | nodemask_t used_mask; | 
| 1625 | 1501 | ||
| 1626 | /* initialize zonelists */ | 1502 | /* initialize zonelists */ | 
| 1627 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1503 | for (i = 0; i < MAX_NR_ZONES; i++) { | 
| 1628 | zonelist = pgdat->node_zonelists + i; | 1504 | zonelist = pgdat->node_zonelists + i; | 
| 1629 | zonelist->zones[0] = NULL; | 1505 | zonelist->zones[0] = NULL; | 
| 1630 | } | 1506 | } | 
| @@ -1654,13 +1530,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
| 1654 | node_load[node] += load; | 1530 | node_load[node] += load; | 
| 1655 | prev_node = node; | 1531 | prev_node = node; | 
| 1656 | load--; | 1532 | load--; | 
| 1657 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1533 | for (i = 0; i < MAX_NR_ZONES; i++) { | 
| 1658 | zonelist = pgdat->node_zonelists + i; | 1534 | zonelist = pgdat->node_zonelists + i; | 
| 1659 | for (j = 0; zonelist->zones[j] != NULL; j++); | 1535 | for (j = 0; zonelist->zones[j] != NULL; j++); | 
| 1660 | 1536 | ||
| 1661 | k = highest_zone(i); | 1537 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 
| 1662 | |||
| 1663 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | ||
| 1664 | zonelist->zones[j] = NULL; | 1538 | zonelist->zones[j] = NULL; | 
| 1665 | } | 1539 | } | 
| 1666 | } | 1540 | } | 
| @@ -1670,17 +1544,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
| 1670 | 1544 | ||
| 1671 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1545 | static void __meminit build_zonelists(pg_data_t *pgdat) | 
| 1672 | { | 1546 | { | 
| 1673 | int i, j, k, node, local_node; | 1547 | int node, local_node; | 
| 1548 | enum zone_type i,j; | ||
| 1674 | 1549 | ||
| 1675 | local_node = pgdat->node_id; | 1550 | local_node = pgdat->node_id; | 
| 1676 | for (i = 0; i < GFP_ZONETYPES; i++) { | 1551 | for (i = 0; i < MAX_NR_ZONES; i++) { | 
| 1677 | struct zonelist *zonelist; | 1552 | struct zonelist *zonelist; | 
| 1678 | 1553 | ||
| 1679 | zonelist = pgdat->node_zonelists + i; | 1554 | zonelist = pgdat->node_zonelists + i; | 
| 1680 | 1555 | ||
| 1681 | j = 0; | 1556 | j = build_zonelists_node(pgdat, zonelist, 0, i); | 
| 1682 | k = highest_zone(i); | ||
| 1683 | j = build_zonelists_node(pgdat, zonelist, j, k); | ||
| 1684 | /* | 1557 | /* | 
| 1685 | * Now we build the zonelist so that it contains the zones | 1558 | * Now we build the zonelist so that it contains the zones | 
| 1686 | * of all the other nodes. | 1559 | * of all the other nodes. | 
| @@ -1692,12 +1565,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
| 1692 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 1565 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 
| 1693 | if (!node_online(node)) | 1566 | if (!node_online(node)) | 
| 1694 | continue; | 1567 | continue; | 
| 1695 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | 1568 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 
| 1696 | } | 1569 | } | 
| 1697 | for (node = 0; node < local_node; node++) { | 1570 | for (node = 0; node < local_node; node++) { | 
| 1698 | if (!node_online(node)) | 1571 | if (!node_online(node)) | 
| 1699 | continue; | 1572 | continue; | 
| 1700 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); | 1573 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 
| 1701 | } | 1574 | } | 
| 1702 | 1575 | ||
| 1703 | zonelist->zones[j] = NULL; | 1576 | zonelist->zones[j] = NULL; | 
| @@ -1718,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy) | |||
| 1718 | void __meminit build_all_zonelists(void) | 1591 | void __meminit build_all_zonelists(void) | 
| 1719 | { | 1592 | { | 
| 1720 | if (system_state == SYSTEM_BOOTING) { | 1593 | if (system_state == SYSTEM_BOOTING) { | 
| 1721 | __build_all_zonelists(0); | 1594 | __build_all_zonelists(NULL); | 
| 1722 | cpuset_init_current_mems_allowed(); | 1595 | cpuset_init_current_mems_allowed(); | 
| 1723 | } else { | 1596 | } else { | 
| 1724 | /* we have to stop all cpus to guarantee there is no user | 1597 | /* we have to stop all cpus to guarantee there is no user | 
| @@ -1799,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
| 1799 | 1672 | ||
| 1800 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 1673 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 
| 1801 | 1674 | ||
| 1802 | static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | ||
| 1803 | unsigned long *zones_size, unsigned long *zholes_size) | ||
| 1804 | { | ||
| 1805 | unsigned long realtotalpages, totalpages = 0; | ||
| 1806 | int i; | ||
| 1807 | |||
| 1808 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 1809 | totalpages += zones_size[i]; | ||
| 1810 | pgdat->node_spanned_pages = totalpages; | ||
| 1811 | |||
| 1812 | realtotalpages = totalpages; | ||
| 1813 | if (zholes_size) | ||
| 1814 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 1815 | realtotalpages -= zholes_size[i]; | ||
| 1816 | pgdat->node_present_pages = realtotalpages; | ||
| 1817 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); | ||
| 1818 | } | ||
| 1819 | |||
| 1820 | |||
| 1821 | /* | 1675 | /* | 
| 1822 | * Initially all pages are reserved - free ones are freed | 1676 | * Initially all pages are reserved - free ones are freed | 
| 1823 | * up by free_all_bootmem() once the early boot process is | 1677 | * up by free_all_bootmem() once the early boot process is | 
| @@ -1858,8 +1712,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
| 1858 | } | 1712 | } | 
| 1859 | 1713 | ||
| 1860 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | 1714 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | 
| 1861 | void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | 1715 | void zonetable_add(struct zone *zone, int nid, enum zone_type zid, | 
| 1862 | unsigned long size) | 1716 | unsigned long pfn, unsigned long size) | 
| 1863 | { | 1717 | { | 
| 1864 | unsigned long snum = pfn_to_section_nr(pfn); | 1718 | unsigned long snum = pfn_to_section_nr(pfn); | 
| 1865 | unsigned long end = pfn_to_section_nr(pfn + size); | 1719 | unsigned long end = pfn_to_section_nr(pfn + size); | 
| @@ -1975,6 +1829,9 @@ static int __cpuinit process_zones(int cpu) | |||
| 1975 | 1829 | ||
| 1976 | for_each_zone(zone) { | 1830 | for_each_zone(zone) { | 
| 1977 | 1831 | ||
| 1832 | if (!populated_zone(zone)) | ||
| 1833 | continue; | ||
| 1834 | |||
| 1978 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 1835 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 
| 1979 | GFP_KERNEL, cpu_to_node(cpu)); | 1836 | GFP_KERNEL, cpu_to_node(cpu)); | 
| 1980 | if (!zone_pcp(zone, cpu)) | 1837 | if (!zone_pcp(zone, cpu)) | 
| @@ -2005,12 +1862,14 @@ static inline void free_zone_pagesets(int cpu) | |||
| 2005 | for_each_zone(zone) { | 1862 | for_each_zone(zone) { | 
| 2006 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | 1863 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | 
| 2007 | 1864 | ||
| 1865 | /* Free per_cpu_pageset if it is slab allocated */ | ||
| 1866 | if (pset != &boot_pageset[cpu]) | ||
| 1867 | kfree(pset); | ||
| 2008 | zone_pcp(zone, cpu) = NULL; | 1868 | zone_pcp(zone, cpu) = NULL; | 
| 2009 | kfree(pset); | ||
| 2010 | } | 1869 | } | 
| 2011 | } | 1870 | } | 
| 2012 | 1871 | ||
| 2013 | static int pageset_cpuup_callback(struct notifier_block *nfb, | 1872 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | 
| 2014 | unsigned long action, | 1873 | unsigned long action, | 
| 2015 | void *hcpu) | 1874 | void *hcpu) | 
| 2016 | { | 1875 | { | 
| @@ -2032,7 +1891,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb, | |||
| 2032 | return ret; | 1891 | return ret; | 
| 2033 | } | 1892 | } | 
| 2034 | 1893 | ||
| 2035 | static struct notifier_block pageset_notifier = | 1894 | static struct notifier_block __cpuinitdata pageset_notifier = | 
| 2036 | { &pageset_cpuup_callback, NULL, 0 }; | 1895 | { &pageset_cpuup_callback, NULL, 0 }; | 
| 2037 | 1896 | ||
| 2038 | void __init setup_per_cpu_pageset(void) | 1897 | void __init setup_per_cpu_pageset(void) | 
| @@ -2132,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
| 2132 | return 0; | 1991 | return 0; | 
| 2133 | } | 1992 | } | 
| 2134 | 1993 | ||
| 1994 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
| 1995 | /* | ||
| 1996 | * Basic iterator support. Return the first range of PFNs for a node | ||
| 1997 | * Note: nid == MAX_NUMNODES returns first region regardless of node | ||
| 1998 | */ | ||
| 1999 | static int __init first_active_region_index_in_nid(int nid) | ||
| 2000 | { | ||
| 2001 | int i; | ||
| 2002 | |||
| 2003 | for (i = 0; i < nr_nodemap_entries; i++) | ||
| 2004 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) | ||
| 2005 | return i; | ||
| 2006 | |||
| 2007 | return -1; | ||
| 2008 | } | ||
| 2009 | |||
| 2010 | /* | ||
| 2011 | * Basic iterator support. Return the next active range of PFNs for a node | ||
| 2012 | * Note: nid == MAX_NUMNODES returns next region regardless of node | ||
| 2013 | */ | ||
| 2014 | static int __init next_active_region_index_in_nid(int index, int nid) | ||
| 2015 | { | ||
| 2016 | for (index = index + 1; index < nr_nodemap_entries; index++) | ||
| 2017 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | ||
| 2018 | return index; | ||
| 2019 | |||
| 2020 | return -1; | ||
| 2021 | } | ||
| 2022 | |||
| 2023 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | ||
| 2024 | /* | ||
| 2025 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | ||
| 2026 | * Architectures may implement their own version but if add_active_range() | ||
| 2027 | * was used and there are no special requirements, this is a convenient | ||
| 2028 | * alternative | ||
| 2029 | */ | ||
| 2030 | int __init early_pfn_to_nid(unsigned long pfn) | ||
| 2031 | { | ||
| 2032 | int i; | ||
| 2033 | |||
| 2034 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
| 2035 | unsigned long start_pfn = early_node_map[i].start_pfn; | ||
| 2036 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
| 2037 | |||
| 2038 | if (start_pfn <= pfn && pfn < end_pfn) | ||
| 2039 | return early_node_map[i].nid; | ||
| 2040 | } | ||
| 2041 | |||
| 2042 | return 0; | ||
| 2043 | } | ||
| 2044 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | ||
| 2045 | |||
| 2046 | /* Basic iterator support to walk early_node_map[] */ | ||
| 2047 | #define for_each_active_range_index_in_nid(i, nid) \ | ||
| 2048 | for (i = first_active_region_index_in_nid(nid); i != -1; \ | ||
| 2049 | i = next_active_region_index_in_nid(i, nid)) | ||
| 2050 | |||
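The iterator above is just a linear scan of early_node_map[] filtered by node id, with MAX_NUMNODES acting as a wildcard. A minimal userspace sketch of the same pattern (hypothetical map contents and stand-in constants, not the kernel structures):

#include <stdio.h>

#define MAX_NUMNODES 4                  /* stand-in for the kernel constant */

struct range { int nid; unsigned long start_pfn, end_pfn; };

/* Hypothetical early_node_map contents, already registered by an arch */
static struct range map[] = {
        { 0, 0x000, 0x100 }, { 1, 0x100, 0x180 }, { 0, 0x200, 0x280 },
};
static const int nr_entries = sizeof(map) / sizeof(map[0]);

static int first_index_in_nid(int nid)
{
        for (int i = 0; i < nr_entries; i++)
                if (nid == MAX_NUMNODES || map[i].nid == nid)
                        return i;
        return -1;
}

static int next_index_in_nid(int i, int nid)
{
        for (i = i + 1; i < nr_entries; i++)
                if (nid == MAX_NUMNODES || map[i].nid == nid)
                        return i;
        return -1;
}

/* Same shape as for_each_active_range_index_in_nid() */
#define for_each_index_in_nid(i, nid) \
        for (i = first_index_in_nid(nid); i != -1; i = next_index_in_nid(i, nid))

int main(void)
{
        int i;

        /* Walk only node 0's ranges */
        for_each_index_in_nid(i, 0)
                printf("node %d: %#lx-%#lx\n", map[i].nid,
                       map[i].start_pfn, map[i].end_pfn);
        return 0;
}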
| 2051 | /** | ||
| 2052 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | ||
| 2053 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed | ||
| 2054 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | ||
| 2055 | * | ||
| 2056 | * If an architecture guarantees that all ranges registered with | ||
| 2057 | * add_active_ranges() contain no holes and may be freed, this | ||
| 2058 | * function may be used instead of calling free_bootmem() manually. | ||
| 2059 | */ | ||
| 2060 | void __init free_bootmem_with_active_regions(int nid, | ||
| 2061 | unsigned long max_low_pfn) | ||
| 2062 | { | ||
| 2063 | int i; | ||
| 2064 | |||
| 2065 | for_each_active_range_index_in_nid(i, nid) { | ||
| 2066 | unsigned long size_pages = 0; | ||
| 2067 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
| 2068 | |||
| 2069 | if (early_node_map[i].start_pfn >= max_low_pfn) | ||
| 2070 | continue; | ||
| 2071 | |||
| 2072 | if (end_pfn > max_low_pfn) | ||
| 2073 | end_pfn = max_low_pfn; | ||
| 2074 | |||
| 2075 | size_pages = end_pfn - early_node_map[i].start_pfn; | ||
| 2076 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), | ||
| 2077 | PFN_PHYS(early_node_map[i].start_pfn), | ||
| 2078 | size_pages << PAGE_SHIFT); | ||
| 2079 | } | ||
| 2080 | } | ||
| 2081 | |||
| 2082 | /** | ||
| 2083 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | ||
| 2084 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used | ||
| 2085 | * | ||
| 2086 | * If an architecture guarantees that all ranges registered with | ||
| 2087 | * add_active_ranges() contain no holes and may be freed, this | ||
| 2088 | * function may be used instead of calling memory_present() manually. | ||
| 2089 | */ | ||
| 2090 | void __init sparse_memory_present_with_active_regions(int nid) | ||
| 2091 | { | ||
| 2092 | int i; | ||
| 2093 | |||
| 2094 | for_each_active_range_index_in_nid(i, nid) | ||
| 2095 | memory_present(early_node_map[i].nid, | ||
| 2096 | early_node_map[i].start_pfn, | ||
| 2097 | early_node_map[i].end_pfn); | ||
| 2098 | } | ||
| 2099 | |||
| 2100 | /** | ||
| 2101 | * push_node_boundaries - Push node boundaries to at least the requested boundary | ||
| 2102 | * @nid: The nid of the node to push the boundary for | ||
| 2103 | * @start_pfn: The start pfn of the node | ||
| 2104 | * @end_pfn: The end pfn of the node | ||
| 2105 | * | ||
| 2106 | * In reserve-based hot-add, a mem_map is allocated that is unused until hot-add | ||
| 2107 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | ||
| 2108 | * be hotplugged even though no physical memory exists. This function allows | ||
| 2109 | * an arch to push out the node boundaries so mem_map is allocated that can | ||
| 2110 | * be used later. | ||
| 2111 | */ | ||
| 2112 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 2113 | void __init push_node_boundaries(unsigned int nid, | ||
| 2114 | unsigned long start_pfn, unsigned long end_pfn) | ||
| 2115 | { | ||
| 2116 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
| 2117 | nid, start_pfn, end_pfn); | ||
| 2118 | |||
| 2119 | /* Initialise the boundary for this node if necessary */ | ||
| 2120 | if (node_boundary_end_pfn[nid] == 0) | ||
| 2121 | node_boundary_start_pfn[nid] = -1UL; | ||
| 2122 | |||
| 2123 | /* Update the boundaries */ | ||
| 2124 | if (node_boundary_start_pfn[nid] > start_pfn) | ||
| 2125 | node_boundary_start_pfn[nid] = start_pfn; | ||
| 2126 | if (node_boundary_end_pfn[nid] < end_pfn) | ||
| 2127 | node_boundary_end_pfn[nid] = end_pfn; | ||
| 2128 | } | ||
| 2129 | |||
| 2130 | /* If necessary, push the node boundary out for reserve hotadd */ | ||
| 2131 | static void __init account_node_boundary(unsigned int nid, | ||
| 2132 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
| 2133 | { | ||
| 2134 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
| 2135 | nid, *start_pfn, *end_pfn); | ||
| 2136 | |||
| 2137 | /* Return if boundary information has not been provided */ | ||
| 2138 | if (node_boundary_end_pfn[nid] == 0) | ||
| 2139 | return; | ||
| 2140 | |||
| 2141 | /* Check the boundaries and update if necessary */ | ||
| 2142 | if (node_boundary_start_pfn[nid] < *start_pfn) | ||
| 2143 | *start_pfn = node_boundary_start_pfn[nid]; | ||
| 2144 | if (node_boundary_end_pfn[nid] > *end_pfn) | ||
| 2145 | *end_pfn = node_boundary_end_pfn[nid]; | ||
| 2146 | } | ||
| 2147 | #else | ||
| 2148 | void __init push_node_boundaries(unsigned int nid, | ||
| 2149 | unsigned long start_pfn, unsigned long end_pfn) {} | ||
| 2150 | |||
| 2151 | static void __init account_node_boundary(unsigned int nid, | ||
| 2152 | unsigned long *start_pfn, unsigned long *end_pfn) {} | ||
| 2153 | #endif | ||
| 2154 | |||
| 2155 | |||
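The boundary-push logic records the widest start/end PFN seen for each node and later widens a reported range out to that recorded boundary. A standalone model of the two helpers, assuming two nodes and illustrative PFN values (not the kernel arrays):

#include <stdio.h>

/* Per-node boundaries; an end of 0 means "no boundary recorded", as in the kernel */
static unsigned long bound_start[2], bound_end[2];

static void push_boundary(int nid, unsigned long start, unsigned long end)
{
        if (bound_end[nid] == 0)
                bound_start[nid] = ~0UL;
        if (bound_start[nid] > start)
                bound_start[nid] = start;
        if (bound_end[nid] < end)
                bound_end[nid] = end;
}

static void account_boundary(int nid, unsigned long *start, unsigned long *end)
{
        if (bound_end[nid] == 0)
                return;                         /* no boundary provided */
        if (bound_start[nid] < *start)
                *start = bound_start[nid];
        if (bound_end[nid] > *end)
                *end = bound_end[nid];
}

int main(void)
{
        unsigned long start = 0x100, end = 0x200;

        /* e.g. SRAT said node 0 may later span 0x80..0x300 */
        push_boundary(0, 0x80, 0x300);
        account_boundary(0, &start, &end);
        printf("node 0 widened to %#lx-%#lx\n", start, end);
        return 0;
}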
| 2156 | /** | ||
| 2157 | * get_pfn_range_for_nid - Return the start and end page frames for a node | ||
| 2158 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned | ||
| 2159 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn | ||
| 2160 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn | ||
| 2161 | * | ||
| 2162 | * It returns the start and end page frame of a node based on information | ||
| 2163 | * provided by an arch calling add_active_range(). If called for a node | ||
| 2164 | * with no available memory, a warning is printed and the start and end | ||
| 2165 | * PFNs will be 0 | ||
| 2166 | */ | ||
| 2167 | void __init get_pfn_range_for_nid(unsigned int nid, | ||
| 2168 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
| 2169 | { | ||
| 2170 | int i; | ||
| 2171 | *start_pfn = -1UL; | ||
| 2172 | *end_pfn = 0; | ||
| 2173 | |||
| 2174 | for_each_active_range_index_in_nid(i, nid) { | ||
| 2175 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); | ||
| 2176 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | ||
| 2177 | } | ||
| 2178 | |||
| 2179 | if (*start_pfn == -1UL) { | ||
| 2180 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
| 2181 | *start_pfn = 0; | ||
| 2182 | } | ||
| 2183 | |||
| 2184 | /* Push the node boundaries out if requested */ | ||
| 2185 | account_node_boundary(nid, start_pfn, end_pfn); | ||
| 2186 | } | ||
| 2187 | |||
| 2188 | /* | ||
| 2189 | * Return the number of pages a zone spans in a node, including holes | ||
| 2190 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | ||
| 2191 | */ | ||
| 2192 | unsigned long __init zone_spanned_pages_in_node(int nid, | ||
| 2193 | unsigned long zone_type, | ||
| 2194 | unsigned long *ignored) | ||
| 2195 | { | ||
| 2196 | unsigned long node_start_pfn, node_end_pfn; | ||
| 2197 | unsigned long zone_start_pfn, zone_end_pfn; | ||
| 2198 | |||
| 2199 | /* Get the start and end of the node and zone */ | ||
| 2200 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
| 2201 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | ||
| 2202 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | ||
| 2203 | |||
| 2204 | /* Check that this node has pages within the zone's required range */ | ||
| 2205 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | ||
| 2206 | return 0; | ||
| 2207 | |||
| 2208 | /* Move the zone boundaries inside the node if necessary */ | ||
| 2209 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | ||
| 2210 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | ||
| 2211 | |||
| 2212 | /* Return the spanned pages */ | ||
| 2213 | return zone_end_pfn - zone_start_pfn; | ||
| 2214 | } | ||
| 2215 | |||
| 2216 | /* | ||
| 2217 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | ||
| 2218 | * then all holes in the requested range will be accounted for | ||
| 2219 | */ | ||
| 2220 | unsigned long __init __absent_pages_in_range(int nid, | ||
| 2221 | unsigned long range_start_pfn, | ||
| 2222 | unsigned long range_end_pfn) | ||
| 2223 | { | ||
| 2224 | int i = 0; | ||
| 2225 | unsigned long prev_end_pfn = 0, hole_pages = 0; | ||
| 2226 | unsigned long start_pfn; | ||
| 2227 | |||
| 2228 | /* Find the end_pfn of the first active range of pfns in the node */ | ||
| 2229 | i = first_active_region_index_in_nid(nid); | ||
| 2230 | if (i == -1) | ||
| 2231 | return 0; | ||
| 2232 | |||
| 2233 | /* Account for ranges before physical memory on this node */ | ||
| 2234 | if (early_node_map[i].start_pfn > range_start_pfn) | ||
| 2235 | hole_pages = early_node_map[i].start_pfn - range_start_pfn; | ||
| 2236 | |||
| 2237 | prev_end_pfn = early_node_map[i].start_pfn; | ||
| 2238 | |||
| 2239 | /* Find all holes for the zone within the node */ | ||
| 2240 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { | ||
| 2241 | |||
| 2242 | /* No need to continue if prev_end_pfn is outside the zone */ | ||
| 2243 | if (prev_end_pfn >= range_end_pfn) | ||
| 2244 | break; | ||
| 2245 | |||
| 2246 | /* Make sure the end of the zone is not within the hole */ | ||
| 2247 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); | ||
| 2248 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); | ||
| 2249 | |||
| 2250 | /* Update the hole size count and move on */ | ||
| 2251 | if (start_pfn > range_start_pfn) { | ||
| 2252 | BUG_ON(prev_end_pfn > start_pfn); | ||
| 2253 | hole_pages += start_pfn - prev_end_pfn; | ||
| 2254 | } | ||
| 2255 | prev_end_pfn = early_node_map[i].end_pfn; | ||
| 2256 | } | ||
| 2257 | |||
| 2258 | /* Account for ranges past physical memory on this node */ | ||
| 2259 | if (range_end_pfn > prev_end_pfn) | ||
| 2260 | hole_pages = range_end_pfn - | ||
| 2261 | max(range_start_pfn, prev_end_pfn); | ||
| 2262 | |||
| 2263 | return hole_pages; | ||
| 2264 | } | ||
| 2265 | |||
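The hole accounting walks the sorted active ranges and sums the gaps between them, clamped to the requested range. A simplified userspace model of the same arithmetic (hypothetical ranges, node filtering omitted):

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* Hypothetical, already-sorted active ranges inside one node */
static struct range active[] = { { 0x000, 0x100 }, { 0x180, 0x200 } };

static unsigned long absent_pages(unsigned long start, unsigned long end)
{
        unsigned long prev_end = start, holes = 0;

        for (int i = 0; i < 2; i++) {
                unsigned long s = active[i].start_pfn;
                unsigned long e = active[i].end_pfn;

                if (prev_end >= end)
                        break;
                if (s > prev_end)               /* gap before this range */
                        holes += (s < end ? s : end) - prev_end;
                prev_end = e;
        }
        if (end > prev_end)                     /* gap after the last range */
                holes += end - prev_end;
        return holes;
}

int main(void)
{
        /* 0x100..0x180 is the only hole: 0x80 pages */
        printf("holes in [0,0x200): %#lx pages\n", absent_pages(0, 0x200));
        return 0;
}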
| 2266 | /** | ||
| 2267 | * absent_pages_in_range - Return number of page frames in holes within a range | ||
| 2268 | * @start_pfn: The start PFN to start searching for holes | ||
| 2269 | * @end_pfn: The end PFN to stop searching for holes | ||
| 2270 | * | ||
| 2271 | * It returns the number of page frames in memory holes within a range | ||
| 2272 | */ | ||
| 2273 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | ||
| 2274 | unsigned long end_pfn) | ||
| 2275 | { | ||
| 2276 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | ||
| 2277 | } | ||
| 2278 | |||
| 2279 | /* Return the number of page frames in holes in a zone on a node */ | ||
| 2280 | unsigned long __init zone_absent_pages_in_node(int nid, | ||
| 2281 | unsigned long zone_type, | ||
| 2282 | unsigned long *ignored) | ||
| 2283 | { | ||
| 2284 | unsigned long node_start_pfn, node_end_pfn; | ||
| 2285 | unsigned long zone_start_pfn, zone_end_pfn; | ||
| 2286 | |||
| 2287 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
| 2288 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], | ||
| 2289 | node_start_pfn); | ||
| 2290 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], | ||
| 2291 | node_end_pfn); | ||
| 2292 | |||
| 2293 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | ||
| 2294 | } | ||
| 2295 | |||
| 2296 | /* Return the zone index a PFN is in */ | ||
| 2297 | int memmap_zone_idx(struct page *lmem_map) | ||
| 2298 | { | ||
| 2299 | int i; | ||
| 2300 | unsigned long phys_addr = virt_to_phys(lmem_map); | ||
| 2301 | unsigned long pfn = phys_addr >> PAGE_SHIFT; | ||
| 2302 | |||
| 2303 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 2304 | if (pfn < arch_zone_highest_possible_pfn[i]) | ||
| 2305 | break; | ||
| 2306 | |||
| 2307 | return i; | ||
| 2308 | } | ||
| 2309 | #else | ||
| 2310 | static inline unsigned long zone_spanned_pages_in_node(int nid, | ||
| 2311 | unsigned long zone_type, | ||
| 2312 | unsigned long *zones_size) | ||
| 2313 | { | ||
| 2314 | return zones_size[zone_type]; | ||
| 2315 | } | ||
| 2316 | |||
| 2317 | static inline unsigned long zone_absent_pages_in_node(int nid, | ||
| 2318 | unsigned long zone_type, | ||
| 2319 | unsigned long *zholes_size) | ||
| 2320 | { | ||
| 2321 | if (!zholes_size) | ||
| 2322 | return 0; | ||
| 2323 | |||
| 2324 | return zholes_size[zone_type]; | ||
| 2325 | } | ||
| 2326 | |||
| 2327 | static inline int memmap_zone_idx(struct page *lmem_map) | ||
| 2328 | { | ||
| 2329 | return MAX_NR_ZONES; | ||
| 2330 | } | ||
| 2331 | #endif | ||
| 2332 | |||
| 2333 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, | ||
| 2334 | unsigned long *zones_size, unsigned long *zholes_size) | ||
| 2335 | { | ||
| 2336 | unsigned long realtotalpages, totalpages = 0; | ||
| 2337 | enum zone_type i; | ||
| 2338 | |||
| 2339 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 2340 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | ||
| 2341 | zones_size); | ||
| 2342 | pgdat->node_spanned_pages = totalpages; | ||
| 2343 | |||
| 2344 | realtotalpages = totalpages; | ||
| 2345 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 2346 | realtotalpages -= | ||
| 2347 | zone_absent_pages_in_node(pgdat->node_id, i, | ||
| 2348 | zholes_size); | ||
| 2349 | pgdat->node_present_pages = realtotalpages; | ||
| 2350 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | ||
| 2351 | realtotalpages); | ||
| 2352 | } | ||
| 2353 | |||
| 2135 | /* | 2354 | /* | 
| 2136 | * Set up the zone data structures: | 2355 | * Set up the zone data structures: | 
| 2137 | * - mark all pages reserved | 2356 | * - mark all pages reserved | 
| @@ -2141,7 +2360,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
| 2141 | static void __meminit free_area_init_core(struct pglist_data *pgdat, | 2360 | static void __meminit free_area_init_core(struct pglist_data *pgdat, | 
| 2142 | unsigned long *zones_size, unsigned long *zholes_size) | 2361 | unsigned long *zones_size, unsigned long *zholes_size) | 
| 2143 | { | 2362 | { | 
| 2144 | unsigned long j; | 2363 | enum zone_type j; | 
| 2145 | int nid = pgdat->node_id; | 2364 | int nid = pgdat->node_id; | 
| 2146 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 2365 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 
| 2147 | int ret; | 2366 | int ret; | 
| @@ -2153,18 +2372,47 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
| 2153 | 2372 | ||
| 2154 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2373 | for (j = 0; j < MAX_NR_ZONES; j++) { | 
| 2155 | struct zone *zone = pgdat->node_zones + j; | 2374 | struct zone *zone = pgdat->node_zones + j; | 
| 2156 | unsigned long size, realsize; | 2375 | unsigned long size, realsize, memmap_pages; | 
| 2157 | 2376 | ||
| 2158 | realsize = size = zones_size[j]; | 2377 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 
| 2159 | if (zholes_size) | 2378 | realsize = size - zone_absent_pages_in_node(nid, j, | 
| 2160 | realsize -= zholes_size[j]; | 2379 | zholes_size); | 
| 2161 | 2380 | ||
| 2162 | if (j < ZONE_HIGHMEM) | 2381 | /* | 
| 2382 | * Adjust realsize so that it accounts for how much memory | ||
| 2383 | * is used by this zone for memmap. This affects the watermark | ||
| 2384 | * and per-cpu initialisations | ||
| 2385 | */ | ||
| 2386 | memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; | ||
| 2387 | if (realsize >= memmap_pages) { | ||
| 2388 | realsize -= memmap_pages; | ||
| 2389 | printk(KERN_DEBUG | ||
| 2390 | " %s zone: %lu pages used for memmap\n", | ||
| 2391 | zone_names[j], memmap_pages); | ||
| 2392 | } else | ||
| 2393 | printk(KERN_WARNING | ||
| 2394 | " %s zone: %lu pages exceeds realsize %lu\n", | ||
| 2395 | zone_names[j], memmap_pages, realsize); | ||
| 2396 | |||
| 2397 | /* Account for reserved DMA pages */ | ||
| 2398 | if (j == ZONE_DMA && realsize > dma_reserve) { | ||
| 2399 | realsize -= dma_reserve; | ||
| 2400 | printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", | ||
| 2401 | dma_reserve); | ||
| 2402 | } | ||
| 2403 | |||
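The memmap adjustment above is plain arithmetic: the number of whole pages needed to hold one struct page for every page the zone spans. A hedged example with illustrative numbers only (4 KiB pages and a 56-byte struct page; the real size is arch- and config-dependent):

#include <stdio.h>

int main(void)
{
        unsigned long size = 262144;            /* a 1 GiB zone at 4 KiB pages */
        unsigned long page_shift = 12;
        unsigned long struct_page_size = 56;    /* illustrative value */
        unsigned long memmap_pages;

        /* Same arithmetic as free_area_init_core() */
        memmap_pages = (size * struct_page_size) >> page_shift;
        printf("memmap consumes %lu of %lu pages\n", memmap_pages, size);
        return 0;
}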
| 2404 | if (!is_highmem_idx(j)) | ||
| 2163 | nr_kernel_pages += realsize; | 2405 | nr_kernel_pages += realsize; | 
| 2164 | nr_all_pages += realsize; | 2406 | nr_all_pages += realsize; | 
| 2165 | 2407 | ||
| 2166 | zone->spanned_pages = size; | 2408 | zone->spanned_pages = size; | 
| 2167 | zone->present_pages = realsize; | 2409 | zone->present_pages = realsize; | 
| 2410 | #ifdef CONFIG_NUMA | ||
| 2411 | zone->node = nid; | ||
| 2412 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | ||
| 2413 | / 100; | ||
| 2414 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | ||
| 2415 | #endif | ||
| 2168 | zone->name = zone_names[j]; | 2416 | zone->name = zone_names[j]; | 
| 2169 | spin_lock_init(&zone->lock); | 2417 | spin_lock_init(&zone->lock); | 
| 2170 | spin_lock_init(&zone->lru_lock); | 2418 | spin_lock_init(&zone->lru_lock); | 
| @@ -2181,6 +2429,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
| 2181 | zone->nr_scan_inactive = 0; | 2429 | zone->nr_scan_inactive = 0; | 
| 2182 | zone->nr_active = 0; | 2430 | zone->nr_active = 0; | 
| 2183 | zone->nr_inactive = 0; | 2431 | zone->nr_inactive = 0; | 
| 2432 | zap_zone_vm_stats(zone); | ||
| 2184 | atomic_set(&zone->reclaim_in_progress, 0); | 2433 | atomic_set(&zone->reclaim_in_progress, 0); | 
| 2185 | if (!size) | 2434 | if (!size) | 
| 2186 | continue; | 2435 | continue; | 
| @@ -2222,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
| 2222 | /* | 2471 | /* | 
| 2223 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2472 | * With no DISCONTIG, the global mem_map is just set as node 0's | 
| 2224 | */ | 2473 | */ | 
| 2225 | if (pgdat == NODE_DATA(0)) | 2474 | if (pgdat == NODE_DATA(0)) { | 
| 2226 | mem_map = NODE_DATA(0)->node_mem_map; | 2475 | mem_map = NODE_DATA(0)->node_mem_map; | 
| 2476 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
| 2477 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | ||
| 2478 | mem_map -= pgdat->node_start_pfn; | ||
| 2479 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
| 2480 | } | ||
| 2227 | #endif | 2481 | #endif | 
| 2228 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2482 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 
| 2229 | } | 2483 | } | 
| @@ -2234,327 +2488,280 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, | |||
| 2234 | { | 2488 | { | 
| 2235 | pgdat->node_id = nid; | 2489 | pgdat->node_id = nid; | 
| 2236 | pgdat->node_start_pfn = node_start_pfn; | 2490 | pgdat->node_start_pfn = node_start_pfn; | 
| 2237 | calculate_zone_totalpages(pgdat, zones_size, zholes_size); | 2491 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 
| 2238 | 2492 | ||
| 2239 | alloc_node_mem_map(pgdat); | 2493 | alloc_node_mem_map(pgdat); | 
| 2240 | 2494 | ||
| 2241 | free_area_init_core(pgdat, zones_size, zholes_size); | 2495 | free_area_init_core(pgdat, zones_size, zholes_size); | 
| 2242 | } | 2496 | } | 
| 2243 | 2497 | ||
| 2244 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 2498 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 
| 2245 | static bootmem_data_t contig_bootmem_data; | 2499 | /** | 
| 2246 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 2500 | * add_active_range - Register a range of PFNs backed by physical memory | 
| 2501 | * @nid: The node ID the range resides on | ||
| 2502 | * @start_pfn: The start PFN of the available physical memory | ||
| 2503 | * @end_pfn: The end PFN of the available physical memory | ||
| 2504 | * | ||
| 2505 | * These ranges are stored in an early_node_map[] and later used by | ||
| 2506 | * free_area_init_nodes() to calculate zone sizes and holes. If the | ||
| 2507 | * range spans a memory hole, it is up to the architecture to ensure | ||
| 2508 | * the memory is not freed by the bootmem allocator. If possible | ||
| 2509 | * the range being registered will be merged with existing ranges. | ||
| 2510 | */ | ||
| 2511 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, | ||
| 2512 | unsigned long end_pfn) | ||
| 2513 | { | ||
| 2514 | int i; | ||
| 2247 | 2515 | ||
| 2248 | EXPORT_SYMBOL(contig_page_data); | 2516 | printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " | 
| 2249 | #endif | 2517 | "%d entries of %d used\n", | 
| 2518 | nid, start_pfn, end_pfn, | ||
| 2519 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | ||
| 2250 | 2520 | ||
| 2251 | void __init free_area_init(unsigned long *zones_size) | 2521 | /* Merge with existing active regions if possible */ | 
| 2252 | { | 2522 | for (i = 0; i < nr_nodemap_entries; i++) { | 
| 2253 | free_area_init_node(0, NODE_DATA(0), zones_size, | 2523 | if (early_node_map[i].nid != nid) | 
| 2254 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 2524 | continue; | 
| 2255 | } | ||
| 2256 | 2525 | ||
| 2257 | #ifdef CONFIG_PROC_FS | 2526 | /* Skip if an existing region covers this new one */ | 
| 2527 | if (start_pfn >= early_node_map[i].start_pfn && | ||
| 2528 | end_pfn <= early_node_map[i].end_pfn) | ||
| 2529 | return; | ||
| 2258 | 2530 | ||
| 2259 | #include <linux/seq_file.h> | 2531 | /* Merge forward if suitable */ | 
| 2532 | if (start_pfn <= early_node_map[i].end_pfn && | ||
| 2533 | end_pfn > early_node_map[i].end_pfn) { | ||
| 2534 | early_node_map[i].end_pfn = end_pfn; | ||
| 2535 | return; | ||
| 2536 | } | ||
| 2260 | 2537 | ||
| 2261 | static void *frag_start(struct seq_file *m, loff_t *pos) | 2538 | /* Merge backward if suitable */ | 
| 2262 | { | 2539 | if (start_pfn < early_node_map[i].end_pfn && | 
| 2263 | pg_data_t *pgdat; | 2540 | end_pfn >= early_node_map[i].start_pfn) { | 
| 2264 | loff_t node = *pos; | 2541 | early_node_map[i].start_pfn = start_pfn; | 
| 2265 | for (pgdat = first_online_pgdat(); | 2542 | return; | 
| 2266 | pgdat && node; | 2543 | } | 
| 2267 | pgdat = next_online_pgdat(pgdat)) | 2544 | } | 
| 2268 | --node; | ||
| 2269 | 2545 | ||
| 2270 | return pgdat; | 2546 | /* Check that early_node_map is large enough */ | 
| 2547 | if (i >= MAX_ACTIVE_REGIONS) { | ||
| 2548 | printk(KERN_CRIT "More than %d memory regions, truncating\n", | ||
| 2549 | MAX_ACTIVE_REGIONS); | ||
| 2550 | return; | ||
| 2551 | } | ||
| 2552 | |||
| 2553 | early_node_map[i].nid = nid; | ||
| 2554 | early_node_map[i].start_pfn = start_pfn; | ||
| 2555 | early_node_map[i].end_pfn = end_pfn; | ||
| 2556 | nr_nodemap_entries = i + 1; | ||
| 2271 | } | 2557 | } | 
| 2272 | 2558 | ||
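add_active_range() tries to grow an existing entry for the same node before appending a new one. A compact userspace sketch of the three merge cases (already covered, merge forward, merge backward), with hypothetical ranges and the MAX_ACTIVE_REGIONS overflow check omitted:

#include <stdio.h>

struct range { int nid; unsigned long start, end; };
static struct range map[16];                    /* hypothetical early_node_map */
static int nr;

/* Simplified model of the merge rules in add_active_range() */
static void add_range(int nid, unsigned long start, unsigned long end)
{
        int i;

        for (i = 0; i < nr; i++) {
                if (map[i].nid != nid)
                        continue;
                if (start >= map[i].start && end <= map[i].end)
                        return;                         /* already covered */
                if (start <= map[i].end && end > map[i].end) {
                        map[i].end = end;               /* merge forward */
                        return;
                }
                if (start < map[i].end && end >= map[i].start) {
                        map[i].start = start;           /* merge backward */
                        return;
                }
        }
        map[nr++] = (struct range){ nid, start, end };  /* new entry */
}

int main(void)
{
        add_range(0, 0x000, 0x100);
        add_range(0, 0x100, 0x180);     /* merges forward with the first entry */
        add_range(1, 0x200, 0x300);
        for (int i = 0; i < nr; i++)
                printf("%d: %#lx-%#lx\n", map[i].nid, map[i].start, map[i].end);
        return 0;
}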
| 2273 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | 2559 | /** | 
| 2560 | * shrink_active_range - Shrink an existing registered range of PFNs | ||
| 2561 | * @nid: The node id the range is on that should be shrunk | ||
| 2562 | * @old_end_pfn: The old end PFN of the range | ||
| 2563 | * @new_end_pfn: The new end PFN of the range | ||
| 2564 | * | ||
| 2565 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. | ||
| 2566 | * The map is kept at the end of the physical page range that has already been | ||
| 2567 | * registered with add_active_range(). This function allows an arch to shrink | ||
| 2568 | * an existing registered range. | ||
| 2569 | */ | ||
| 2570 | void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, | ||
| 2571 | unsigned long new_end_pfn) | ||
| 2274 | { | 2572 | { | 
| 2275 | pg_data_t *pgdat = (pg_data_t *)arg; | 2573 | int i; | 
| 2276 | 2574 | ||
| 2277 | (*pos)++; | 2575 | /* Find the old active region end and shrink */ | 
| 2278 | return next_online_pgdat(pgdat); | 2576 | for_each_active_range_index_in_nid(i, nid) | 
| 2577 | if (early_node_map[i].end_pfn == old_end_pfn) { | ||
| 2578 | early_node_map[i].end_pfn = new_end_pfn; | ||
| 2579 | break; | ||
| 2580 | } | ||
| 2279 | } | 2581 | } | 
| 2280 | 2582 | ||
| 2281 | static void frag_stop(struct seq_file *m, void *arg) | 2583 | /** | 
| 2584 | * remove_all_active_ranges - Remove all currently registered regions | ||
| 2585 | * During discovery, it may be found that a table like SRAT is invalid | ||
| 2586 | * and an alternative discovery method must be used. This function removes | ||
| 2587 | * all currently registered regions. | ||
| 2588 | */ | ||
| 2589 | void __init remove_all_active_ranges() | ||
| 2282 | { | 2590 | { | 
| 2591 | memset(early_node_map, 0, sizeof(early_node_map)); | ||
| 2592 | nr_nodemap_entries = 0; | ||
| 2593 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 2594 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | ||
| 2595 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | ||
| 2596 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
| 2283 | } | 2597 | } | 
| 2284 | 2598 | ||
| 2285 | /* | 2599 | /* Compare two active node_active_regions */ | 
| 2286 | * This walks the free areas for each zone. | 2600 | static int __init cmp_node_active_region(const void *a, const void *b) | 
| 2287 | */ | ||
| 2288 | static int frag_show(struct seq_file *m, void *arg) | ||
| 2289 | { | 2601 | { | 
| 2290 | pg_data_t *pgdat = (pg_data_t *)arg; | 2602 | struct node_active_region *arange = (struct node_active_region *)a; | 
| 2291 | struct zone *zone; | 2603 | struct node_active_region *brange = (struct node_active_region *)b; | 
| 2292 | struct zone *node_zones = pgdat->node_zones; | ||
| 2293 | unsigned long flags; | ||
| 2294 | int order; | ||
| 2295 | 2604 | ||
| 2296 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 2605 | /* Done this way to avoid overflows */ | 
| 2297 | if (!populated_zone(zone)) | 2606 | if (arange->start_pfn > brange->start_pfn) | 
| 2298 | continue; | 2607 | return 1; | 
| 2608 | if (arange->start_pfn < brange->start_pfn) | ||
| 2609 | return -1; | ||
| 2299 | 2610 | ||
| 2300 | spin_lock_irqsave(&zone->lock, flags); | ||
| 2301 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
| 2302 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 2303 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
| 2304 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 2305 | seq_putc(m, '\n'); | ||
| 2306 | } | ||
| 2307 | return 0; | 2611 | return 0; | 
| 2308 | } | 2612 | } | 
| 2309 | 2613 | ||
| 2310 | struct seq_operations fragmentation_op = { | 2614 | /* sort the node_map by start_pfn */ | 
| 2311 | .start = frag_start, | 2615 | static void __init sort_node_map(void) | 
| 2312 | .next = frag_next, | ||
| 2313 | .stop = frag_stop, | ||
| 2314 | .show = frag_show, | ||
| 2315 | }; | ||
| 2316 | |||
| 2317 | /* | ||
| 2318 | * Output information about zones in @pgdat. | ||
| 2319 | */ | ||
| 2320 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
| 2321 | { | 2616 | { | 
| 2322 | pg_data_t *pgdat = arg; | 2617 | sort(early_node_map, (size_t)nr_nodemap_entries, | 
| 2323 | struct zone *zone; | 2618 | sizeof(struct node_active_region), | 
| 2324 | struct zone *node_zones = pgdat->node_zones; | 2619 | cmp_node_active_region, NULL); | 
| 2325 | unsigned long flags; | 2620 | } | 
| 2326 | |||
| 2327 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | ||
| 2328 | int i; | ||
| 2329 | 2621 | ||
| 2330 | if (!populated_zone(zone)) | 2622 | /* Find the lowest pfn for a node. This depends on a sorted early_node_map */ | 
| 2331 | continue; | 2623 | unsigned long __init find_min_pfn_for_node(unsigned long nid) | 
| 2624 | { | ||
| 2625 | int i; | ||
| 2332 | 2626 | ||
| 2333 | spin_lock_irqsave(&zone->lock, flags); | 2627 | /* Assuming a sorted map, the first range found has the starting pfn */ | 
| 2334 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | 2628 | for_each_active_range_index_in_nid(i, nid) | 
| 2335 | seq_printf(m, | 2629 | return early_node_map[i].start_pfn; | 
| 2336 | "\n pages free %lu" | ||
| 2337 | "\n min %lu" | ||
| 2338 | "\n low %lu" | ||
| 2339 | "\n high %lu" | ||
| 2340 | "\n active %lu" | ||
| 2341 | "\n inactive %lu" | ||
| 2342 | "\n scanned %lu (a: %lu i: %lu)" | ||
| 2343 | "\n spanned %lu" | ||
| 2344 | "\n present %lu", | ||
| 2345 | zone->free_pages, | ||
| 2346 | zone->pages_min, | ||
| 2347 | zone->pages_low, | ||
| 2348 | zone->pages_high, | ||
| 2349 | zone->nr_active, | ||
| 2350 | zone->nr_inactive, | ||
| 2351 | zone->pages_scanned, | ||
| 2352 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
| 2353 | zone->spanned_pages, | ||
| 2354 | zone->present_pages); | ||
| 2355 | seq_printf(m, | ||
| 2356 | "\n protection: (%lu", | ||
| 2357 | zone->lowmem_reserve[0]); | ||
| 2358 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | ||
| 2359 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | ||
| 2360 | seq_printf(m, | ||
| 2361 | ")" | ||
| 2362 | "\n pagesets"); | ||
| 2363 | for_each_online_cpu(i) { | ||
| 2364 | struct per_cpu_pageset *pageset; | ||
| 2365 | int j; | ||
| 2366 | 2630 | ||
| 2367 | pageset = zone_pcp(zone, i); | 2631 | printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); | 
| 2368 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
| 2369 | if (pageset->pcp[j].count) | ||
| 2370 | break; | ||
| 2371 | } | ||
| 2372 | if (j == ARRAY_SIZE(pageset->pcp)) | ||
| 2373 | continue; | ||
| 2374 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
| 2375 | seq_printf(m, | ||
| 2376 | "\n cpu: %i pcp: %i" | ||
| 2377 | "\n count: %i" | ||
| 2378 | "\n high: %i" | ||
| 2379 | "\n batch: %i", | ||
| 2380 | i, j, | ||
| 2381 | pageset->pcp[j].count, | ||
| 2382 | pageset->pcp[j].high, | ||
| 2383 | pageset->pcp[j].batch); | ||
| 2384 | } | ||
| 2385 | #ifdef CONFIG_NUMA | ||
| 2386 | seq_printf(m, | ||
| 2387 | "\n numa_hit: %lu" | ||
| 2388 | "\n numa_miss: %lu" | ||
| 2389 | "\n numa_foreign: %lu" | ||
| 2390 | "\n interleave_hit: %lu" | ||
| 2391 | "\n local_node: %lu" | ||
| 2392 | "\n other_node: %lu", | ||
| 2393 | pageset->numa_hit, | ||
| 2394 | pageset->numa_miss, | ||
| 2395 | pageset->numa_foreign, | ||
| 2396 | pageset->interleave_hit, | ||
| 2397 | pageset->local_node, | ||
| 2398 | pageset->other_node); | ||
| 2399 | #endif | ||
| 2400 | } | ||
| 2401 | seq_printf(m, | ||
| 2402 | "\n all_unreclaimable: %u" | ||
| 2403 | "\n prev_priority: %i" | ||
| 2404 | "\n temp_priority: %i" | ||
| 2405 | "\n start_pfn: %lu", | ||
| 2406 | zone->all_unreclaimable, | ||
| 2407 | zone->prev_priority, | ||
| 2408 | zone->temp_priority, | ||
| 2409 | zone->zone_start_pfn); | ||
| 2410 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 2411 | seq_putc(m, '\n'); | ||
| 2412 | } | ||
| 2413 | return 0; | 2632 | return 0; | 
| 2414 | } | 2633 | } | 
| 2415 | 2634 | ||
| 2416 | struct seq_operations zoneinfo_op = { | 2635 | /** | 
| 2417 | .start = frag_start, /* iterate over all zones. The same as in | 2636 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 
| 2418 | * fragmentation. */ | 2637 | * | 
| 2419 | .next = frag_next, | 2638 | * It returns the minimum PFN based on information provided via | 
| 2420 | .stop = frag_stop, | 2639 | * add_active_range() | 
| 2421 | .show = zoneinfo_show, | 2640 | */ | 
| 2422 | }; | 2641 | unsigned long __init find_min_pfn_with_active_regions(void) | 
| 2423 | 2642 | { | |
| 2424 | static char *vmstat_text[] = { | 2643 | return find_min_pfn_for_node(MAX_NUMNODES); | 
| 2425 | "nr_dirty", | 2644 | } | 
| 2426 | "nr_writeback", | ||
| 2427 | "nr_unstable", | ||
| 2428 | "nr_page_table_pages", | ||
| 2429 | "nr_mapped", | ||
| 2430 | "nr_slab", | ||
| 2431 | |||
| 2432 | "pgpgin", | ||
| 2433 | "pgpgout", | ||
| 2434 | "pswpin", | ||
| 2435 | "pswpout", | ||
| 2436 | |||
| 2437 | "pgalloc_high", | ||
| 2438 | "pgalloc_normal", | ||
| 2439 | "pgalloc_dma32", | ||
| 2440 | "pgalloc_dma", | ||
| 2441 | |||
| 2442 | "pgfree", | ||
| 2443 | "pgactivate", | ||
| 2444 | "pgdeactivate", | ||
| 2445 | |||
| 2446 | "pgfault", | ||
| 2447 | "pgmajfault", | ||
| 2448 | |||
| 2449 | "pgrefill_high", | ||
| 2450 | "pgrefill_normal", | ||
| 2451 | "pgrefill_dma32", | ||
| 2452 | "pgrefill_dma", | ||
| 2453 | |||
| 2454 | "pgsteal_high", | ||
| 2455 | "pgsteal_normal", | ||
| 2456 | "pgsteal_dma32", | ||
| 2457 | "pgsteal_dma", | ||
| 2458 | |||
| 2459 | "pgscan_kswapd_high", | ||
| 2460 | "pgscan_kswapd_normal", | ||
| 2461 | "pgscan_kswapd_dma32", | ||
| 2462 | "pgscan_kswapd_dma", | ||
| 2463 | |||
| 2464 | "pgscan_direct_high", | ||
| 2465 | "pgscan_direct_normal", | ||
| 2466 | "pgscan_direct_dma32", | ||
| 2467 | "pgscan_direct_dma", | ||
| 2468 | |||
| 2469 | "pginodesteal", | ||
| 2470 | "slabs_scanned", | ||
| 2471 | "kswapd_steal", | ||
| 2472 | "kswapd_inodesteal", | ||
| 2473 | "pageoutrun", | ||
| 2474 | "allocstall", | ||
| 2475 | |||
| 2476 | "pgrotated", | ||
| 2477 | "nr_bounce", | ||
| 2478 | }; | ||
| 2479 | 2645 | ||
| 2480 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | 2646 | /** | 
| 2647 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | ||
| 2648 | * | ||
| 2649 | * It returns the maximum PFN based on information provided via | ||
| 2650 | * add_active_range() | ||
| 2651 | */ | ||
| 2652 | unsigned long __init find_max_pfn_with_active_regions(void) | ||
| 2481 | { | 2653 | { | 
| 2482 | struct page_state *ps; | 2654 | int i; | 
| 2655 | unsigned long max_pfn = 0; | ||
| 2483 | 2656 | ||
| 2484 | if (*pos >= ARRAY_SIZE(vmstat_text)) | 2657 | for (i = 0; i < nr_nodemap_entries; i++) | 
| 2485 | return NULL; | 2658 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | 
| 2486 | 2659 | ||
| 2487 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); | 2660 | return max_pfn; | 
| 2488 | m->private = ps; | ||
| 2489 | if (!ps) | ||
| 2490 | return ERR_PTR(-ENOMEM); | ||
| 2491 | get_full_page_state(ps); | ||
| 2492 | ps->pgpgin /= 2; /* sectors -> kbytes */ | ||
| 2493 | ps->pgpgout /= 2; | ||
| 2494 | return (unsigned long *)ps + *pos; | ||
| 2495 | } | 2661 | } | 
| 2496 | 2662 | ||
| 2497 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) | 2663 | /** | 
| 2498 | { | 2664 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 
| 2499 | (*pos)++; | 2665 | * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA | 
| 2500 | if (*pos >= ARRAY_SIZE(vmstat_text)) | 2666 | * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32 | 
| 2501 | return NULL; | 2667 | * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL | 
| 2502 | return (unsigned long *)m->private + *pos; | 2668 | * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM | 
| 2503 | } | 2669 | * | 
| 2670 | * This will call free_area_init_node() for each active node in the system. | ||
| 2671 | * Using the page ranges provided by add_active_range(), the size of each | ||
| 2672 | * zone in each node and of its holes is calculated. If the maximum PFNs of | ||
| 2673 | * two adjacent zones match, the higher zone is assumed to be empty. | ||
| 2674 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | ||
| 2675 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone | ||
| 2676 | * starts where the previous one ended. For example, ZONE_DMA32 starts | ||
| 2677 | * at arch_max_dma_pfn. | ||
| 2678 | */ | ||
| 2679 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | ||
| 2680 | { | ||
| 2681 | unsigned long nid; | ||
| 2682 | enum zone_type i; | ||
| 2683 | |||
| 2684 | /* Record where the zone boundaries are */ | ||
| 2685 | memset(arch_zone_lowest_possible_pfn, 0, | ||
| 2686 | sizeof(arch_zone_lowest_possible_pfn)); | ||
| 2687 | memset(arch_zone_highest_possible_pfn, 0, | ||
| 2688 | sizeof(arch_zone_highest_possible_pfn)); | ||
| 2689 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | ||
| 2690 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | ||
| 2691 | for (i = 1; i < MAX_NR_ZONES; i++) { | ||
| 2692 | arch_zone_lowest_possible_pfn[i] = | ||
| 2693 | arch_zone_highest_possible_pfn[i-1]; | ||
| 2694 | arch_zone_highest_possible_pfn[i] = | ||
| 2695 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | ||
| 2696 | } | ||
| 2504 | 2697 | ||
| 2505 | static int vmstat_show(struct seq_file *m, void *arg) | 2698 | /* Regions in the early_node_map can be in any order */ | 
| 2506 | { | 2699 | sort_node_map(); | 
| 2507 | unsigned long *l = arg; | ||
| 2508 | unsigned long off = l - (unsigned long *)m->private; | ||
| 2509 | 2700 | ||
| 2510 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | 2701 | /* Print out the zone ranges */ | 
| 2511 | return 0; | 2702 | printk("Zone PFN ranges:\n"); | 
| 2703 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 2704 | printk(" %-8s %8lu -> %8lu\n", | ||
| 2705 | zone_names[i], | ||
| 2706 | arch_zone_lowest_possible_pfn[i], | ||
| 2707 | arch_zone_highest_possible_pfn[i]); | ||
| 2708 | |||
| 2709 | /* Print out the early_node_map[] */ | ||
| 2710 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); | ||
| 2711 | for (i = 0; i < nr_nodemap_entries; i++) | ||
| 2712 | printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, | ||
| 2713 | early_node_map[i].start_pfn, | ||
| 2714 | early_node_map[i].end_pfn); | ||
| 2715 | |||
| 2716 | /* Initialise every node */ | ||
| 2717 | for_each_online_node(nid) { | ||
| 2718 | pg_data_t *pgdat = NODE_DATA(nid); | ||
| 2719 | free_area_init_node(nid, pgdat, NULL, | ||
| 2720 | find_min_pfn_for_node(nid), NULL); | ||
| 2721 | } | ||
| 2512 | } | 2722 | } | 
| 2723 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
| 2513 | 2724 | ||
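free_area_init_nodes() turns the per-zone maximum PFNs into contiguous [low, high) ranges: each zone starts where the previous one ended, and equal adjacent maxima produce an empty zone. A small sketch with made-up PFN values showing that derivation:

#include <stdio.h>

#define MAX_NR_ZONES 4

int main(void)
{
        /* Hypothetical max PFN per zone: DMA, DMA32, NORMAL, HIGHMEM */
        unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0x1000, 0x1000, 0x40000, 0x40000 };
        unsigned long low[MAX_NR_ZONES], high[MAX_NR_ZONES];
        unsigned long min_pfn = 0;      /* would come from the active ranges */

        low[0] = min_pfn;
        high[0] = max_zone_pfn[0];
        for (int i = 1; i < MAX_NR_ZONES; i++) {
                low[i] = high[i - 1];
                high[i] = max_zone_pfn[i] > low[i] ? max_zone_pfn[i] : low[i];
        }
        for (int i = 0; i < MAX_NR_ZONES; i++)
                printf("zone %d: %#lx -> %#lx%s\n", i, low[i], high[i],
                       low[i] == high[i] ? " (empty)" : "");
        return 0;
}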
| 2514 | static void vmstat_stop(struct seq_file *m, void *arg) | 2725 | /** | 
| 2726 | * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA | ||
| 2727 | * @new_dma_reserve - The number of pages to mark reserved | ||
| 2728 | * | ||
| 2729 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | ||
| 2730 | * In the DMA zone, a significant percentage may be consumed by kernel image | ||
| 2731 | * and other unfreeable allocations which can skew the watermarks badly. This | ||
| 2732 | * function may optionally be used to account for unfreeable pages in | ||
| 2733 | * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize | ||
| 2734 | */ | ||
| 2735 | void __init set_dma_reserve(unsigned long new_dma_reserve) | ||
| 2515 | { | 2736 | { | 
| 2516 | kfree(m->private); | 2737 | dma_reserve = new_dma_reserve; | 
| 2517 | m->private = NULL; | ||
| 2518 | } | 2738 | } | 
| 2519 | 2739 | ||
| 2520 | struct seq_operations vmstat_op = { | 2740 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 
| 2521 | .start = vmstat_start, | 2741 | static bootmem_data_t contig_bootmem_data; | 
| 2522 | .next = vmstat_next, | 2742 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 
| 2523 | .stop = vmstat_stop, | 2743 | |
| 2524 | .show = vmstat_show, | 2744 | EXPORT_SYMBOL(contig_page_data); | 
| 2525 | }; | 2745 | #endif | 
| 2526 | 2746 | ||
| 2527 | #endif /* CONFIG_PROC_FS */ | 2747 | void __init free_area_init(unsigned long *zones_size) | 
| 2748 | { | ||
| 2749 | free_area_init_node(0, NODE_DATA(0), zones_size, | ||
| 2750 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | ||
| 2751 | } | ||
| 2528 | 2752 | ||
| 2529 | #ifdef CONFIG_HOTPLUG_CPU | 2753 | #ifdef CONFIG_HOTPLUG_CPU | 
| 2530 | static int page_alloc_cpu_notify(struct notifier_block *self, | 2754 | static int page_alloc_cpu_notify(struct notifier_block *self, | 
| 2531 | unsigned long action, void *hcpu) | 2755 | unsigned long action, void *hcpu) | 
| 2532 | { | 2756 | { | 
| 2533 | int cpu = (unsigned long)hcpu; | 2757 | int cpu = (unsigned long)hcpu; | 
| 2534 | long *count; | ||
| 2535 | unsigned long *src, *dest; | ||
| 2536 | 2758 | ||
| 2537 | if (action == CPU_DEAD) { | 2759 | if (action == CPU_DEAD) { | 
| 2538 | int i; | ||
| 2539 | |||
| 2540 | /* Drain local pagecache count. */ | ||
| 2541 | count = &per_cpu(nr_pagecache_local, cpu); | ||
| 2542 | atomic_add(*count, &nr_pagecache); | ||
| 2543 | *count = 0; | ||
| 2544 | local_irq_disable(); | 2760 | local_irq_disable(); | 
| 2545 | __drain_pages(cpu); | 2761 | __drain_pages(cpu); | 
| 2546 | 2762 | vm_events_fold_cpu(cpu); | |
| 2547 | /* Add dead cpu's page_states to our own. */ | ||
| 2548 | dest = (unsigned long *)&__get_cpu_var(page_states); | ||
| 2549 | src = (unsigned long *)&per_cpu(page_states, cpu); | ||
| 2550 | |||
| 2551 | for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); | ||
| 2552 | i++) { | ||
| 2553 | dest[i] += src[i]; | ||
| 2554 | src[i] = 0; | ||
| 2555 | } | ||
| 2556 | |||
| 2557 | local_irq_enable(); | 2763 | local_irq_enable(); | 
| 2764 | refresh_cpu_vm_stats(cpu); | ||
| 2558 | } | 2765 | } | 
| 2559 | return NOTIFY_OK; | 2766 | return NOTIFY_OK; | 
| 2560 | } | 2767 | } | 
| @@ -2573,7 +2780,7 @@ static void calculate_totalreserve_pages(void) | |||
| 2573 | { | 2780 | { | 
| 2574 | struct pglist_data *pgdat; | 2781 | struct pglist_data *pgdat; | 
| 2575 | unsigned long reserve_pages = 0; | 2782 | unsigned long reserve_pages = 0; | 
| 2576 | int i, j; | 2783 | enum zone_type i, j; | 
| 2577 | 2784 | ||
| 2578 | for_each_online_pgdat(pgdat) { | 2785 | for_each_online_pgdat(pgdat) { | 
| 2579 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2786 | for (i = 0; i < MAX_NR_ZONES; i++) { | 
| @@ -2606,7 +2813,7 @@ static void calculate_totalreserve_pages(void) | |||
| 2606 | static void setup_per_zone_lowmem_reserve(void) | 2813 | static void setup_per_zone_lowmem_reserve(void) | 
| 2607 | { | 2814 | { | 
| 2608 | struct pglist_data *pgdat; | 2815 | struct pglist_data *pgdat; | 
| 2609 | int j, idx; | 2816 | enum zone_type j, idx; | 
| 2610 | 2817 | ||
| 2611 | for_each_online_pgdat(pgdat) { | 2818 | for_each_online_pgdat(pgdat) { | 
| 2612 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2819 | for (j = 0; j < MAX_NR_ZONES; j++) { | 
| @@ -2615,9 +2822,12 @@ static void setup_per_zone_lowmem_reserve(void) | |||
| 2615 | 2822 | ||
| 2616 | zone->lowmem_reserve[j] = 0; | 2823 | zone->lowmem_reserve[j] = 0; | 
| 2617 | 2824 | ||
| 2618 | for (idx = j-1; idx >= 0; idx--) { | 2825 | idx = j; | 
| 2826 | while (idx) { | ||
| 2619 | struct zone *lower_zone; | 2827 | struct zone *lower_zone; | 
| 2620 | 2828 | ||
| 2829 | idx--; | ||
| 2830 | |||
| 2621 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | 2831 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | 
| 2622 | sysctl_lowmem_reserve_ratio[idx] = 1; | 2832 | sysctl_lowmem_reserve_ratio[idx] = 1; | 
| 2623 | 2833 | ||
| @@ -2746,6 +2956,40 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | |||
| 2746 | return 0; | 2956 | return 0; | 
| 2747 | } | 2957 | } | 
| 2748 | 2958 | ||
| 2959 | #ifdef CONFIG_NUMA | ||
| 2960 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | ||
| 2961 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
| 2962 | { | ||
| 2963 | struct zone *zone; | ||
| 2964 | int rc; | ||
| 2965 | |||
| 2966 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
| 2967 | if (rc) | ||
| 2968 | return rc; | ||
| 2969 | |||
| 2970 | for_each_zone(zone) | ||
| 2971 | zone->min_unmapped_pages = (zone->present_pages * | ||
| 2972 | sysctl_min_unmapped_ratio) / 100; | ||
| 2973 | return 0; | ||
| 2974 | } | ||
| 2975 | |||
| 2976 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | ||
| 2977 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
| 2978 | { | ||
| 2979 | struct zone *zone; | ||
| 2980 | int rc; | ||
| 2981 | |||
| 2982 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
| 2983 | if (rc) | ||
| 2984 | return rc; | ||
| 2985 | |||
| 2986 | for_each_zone(zone) | ||
| 2987 | zone->min_slab_pages = (zone->present_pages * | ||
| 2988 | sysctl_min_slab_ratio) / 100; | ||
| 2989 | return 0; | ||
| 2990 | } | ||
| 2991 | #endif | ||
| 2992 | |||
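Both handlers just rescale a per-zone page threshold from the new percentage. The arithmetic, shown with a hypothetical 1 GiB zone and a 1% ratio (values illustrative only):

#include <stdio.h>

int main(void)
{
        unsigned long present_pages = 262144;   /* hypothetical zone size */
        int sysctl_min_unmapped_ratio = 1;      /* percent, as written via sysctl */

        /* Same scaling the handler applies to every zone */
        unsigned long min_unmapped_pages =
                present_pages * sysctl_min_unmapped_ratio / 100;
        printf("zone_reclaim keeps at least %lu pages unmapped\n",
               min_unmapped_pages);
        return 0;
}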
| 2749 | /* | 2993 | /* | 
| 2750 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around | 2994 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around | 
| 2751 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() | 2995 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() | 
| @@ -2789,7 +3033,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
| 2789 | return 0; | 3033 | return 0; | 
| 2790 | } | 3034 | } | 
| 2791 | 3035 | ||
| 2792 | __initdata int hashdist = HASHDIST_DEFAULT; | 3036 | int hashdist = HASHDIST_DEFAULT; | 
| 2793 | 3037 | ||
| 2794 | #ifdef CONFIG_NUMA | 3038 | #ifdef CONFIG_NUMA | 
| 2795 | static int __init set_hashdist(char *str) | 3039 | static int __init set_hashdist(char *str) | 
| diff --git a/mm/page_io.c b/mm/page_io.c index bb2b0d53889c..d4840ecbf8f9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -52,14 +52,29 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) | |||
| 52 | if (bio->bi_size) | 52 | if (bio->bi_size) | 
| 53 | return 1; | 53 | return 1; | 
| 54 | 54 | ||
| 55 | if (!uptodate) | 55 | if (!uptodate) { | 
| 56 | SetPageError(page); | 56 | SetPageError(page); | 
| 57 | /* | ||
| 58 | * We failed to write the page out to swap-space. | ||
| 59 | * Re-dirty the page in order to avoid it being reclaimed. | ||
| 60 | * Also print a dire warning that things will go BAD (tm) | ||
| 61 | * very quickly. | ||
| 62 | * | ||
| 63 | * Also clear PG_reclaim to avoid rotate_reclaimable_page() | ||
| 64 | */ | ||
| 65 | set_page_dirty(page); | ||
| 66 | printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", | ||
| 67 | imajor(bio->bi_bdev->bd_inode), | ||
| 68 | iminor(bio->bi_bdev->bd_inode), | ||
| 69 | (unsigned long long)bio->bi_sector); | ||
| 70 | ClearPageReclaim(page); | ||
| 71 | } | ||
| 57 | end_page_writeback(page); | 72 | end_page_writeback(page); | 
| 58 | bio_put(bio); | 73 | bio_put(bio); | 
| 59 | return 0; | 74 | return 0; | 
| 60 | } | 75 | } | 
| 61 | 76 | ||
| 62 | static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) | 77 | int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) | 
| 63 | { | 78 | { | 
| 64 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 79 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 
| 65 | struct page *page = bio->bi_io_vec[0].bv_page; | 80 | struct page *page = bio->bi_io_vec[0].bv_page; | 
| @@ -70,6 +85,10 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) | |||
| 70 | if (!uptodate) { | 85 | if (!uptodate) { | 
| 71 | SetPageError(page); | 86 | SetPageError(page); | 
| 72 | ClearPageUptodate(page); | 87 | ClearPageUptodate(page); | 
| 88 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", | ||
| 89 | imajor(bio->bi_bdev->bd_inode), | ||
| 90 | iminor(bio->bi_bdev->bd_inode), | ||
| 91 | (unsigned long long)bio->bi_sector); | ||
| 73 | } else { | 92 | } else { | 
| 74 | SetPageUptodate(page); | 93 | SetPageUptodate(page); | 
| 75 | } | 94 | } | 
| @@ -101,7 +120,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
| 101 | } | 120 | } | 
| 102 | if (wbc->sync_mode == WB_SYNC_ALL) | 121 | if (wbc->sync_mode == WB_SYNC_ALL) | 
| 103 | rw |= (1 << BIO_RW_SYNC); | 122 | rw |= (1 << BIO_RW_SYNC); | 
| 104 | inc_page_state(pswpout); | 123 | count_vm_event(PSWPOUT); | 
| 105 | set_page_writeback(page); | 124 | set_page_writeback(page); | 
| 106 | unlock_page(page); | 125 | unlock_page(page); | 
| 107 | submit_bio(rw, bio); | 126 | submit_bio(rw, bio); | 
| @@ -123,7 +142,7 @@ int swap_readpage(struct file *file, struct page *page) | |||
| 123 | ret = -ENOMEM; | 142 | ret = -ENOMEM; | 
| 124 | goto out; | 143 | goto out; | 
| 125 | } | 144 | } | 
| 126 | inc_page_state(pswpin); | 145 | count_vm_event(PSWPIN); | 
| 127 | submit_bio(READ, bio); | 146 | submit_bio(READ, bio); | 
| 128 | out: | 147 | out: | 
| 129 | return ret; | 148 | return ret; | 
| @@ -137,10 +156,12 @@ out: | |||
| 137 | * We use end_swap_bio_read() even for writes, because it happens to do what | 156 | * We use end_swap_bio_read() even for writes, because it happens to do what | 
| 138 | * we want. | 157 | * we want. | 
| 139 | */ | 158 | */ | 
| 140 | int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) | 159 | int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, | 
| 160 | struct bio **bio_chain) | ||
| 141 | { | 161 | { | 
| 142 | struct bio *bio; | 162 | struct bio *bio; | 
| 143 | int ret = 0; | 163 | int ret = 0; | 
| 164 | int bio_rw; | ||
| 144 | 165 | ||
| 145 | lock_page(page); | 166 | lock_page(page); | 
| 146 | 167 | ||
| @@ -151,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) | |||
| 151 | goto out; | 172 | goto out; | 
| 152 | } | 173 | } | 
| 153 | 174 | ||
| 154 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | 175 | bio_rw = rw; | 
| 155 | wait_on_page_locked(page); | 176 | if (!bio_chain) | 
| 156 | 177 | bio_rw |= (1 << BIO_RW_SYNC); | |
| 157 | if (!PageUptodate(page) || PageError(page)) | 178 | if (bio_chain) | 
| 158 | ret = -EIO; | 179 | bio_get(bio); | 
| 180 | submit_bio(bio_rw, bio); | ||
| 181 | if (bio_chain == NULL) { | ||
| 182 | wait_on_page_locked(page); | ||
| 183 | |||
| 184 | if (!PageUptodate(page) || PageError(page)) | ||
| 185 | ret = -EIO; | ||
| 186 | } | ||
| 187 | if (bio_chain) { | ||
| 188 | bio->bi_private = *bio_chain; | ||
| 189 | *bio_chain = bio; | ||
| 190 | } | ||
| 159 | out: | 191 | out: | 
| 160 | return ret; | 192 | return ret; | 
| 161 | } | 193 | } | 
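The bio_chain parameter switches rw_swap_page_sync() from wait-for-completion to queue-and-collect: the submitted bio is linked through bi_private into a caller-owned chain to be waited on later (used by swsusp). A rough userspace analogue of that pattern, not the block-layer API:

#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for a chained asynchronous request */
struct req {
        int sector;
        struct req *next;       /* plays the role of bio->bi_private here */
};

/* Submit one request: wait if no chain was given, otherwise queue it */
static int submit(int sector, struct req **chain)
{
        struct req *r = malloc(sizeof(*r));

        if (!r)
                return -1;
        r->sector = sector;
        r->next = NULL;
        if (!chain) {
                /* synchronous path: "wait" and complete immediately */
                printf("sync write of sector %d done\n", sector);
                free(r);
                return 0;
        }
        /* asynchronous path: link into the caller's chain for a later wait */
        r->next = *chain;
        *chain = r;
        return 0;
}

int main(void)
{
        struct req *chain = NULL;

        submit(1, NULL);                /* old rw_swap_page_sync() behaviour */
        submit(2, &chain);              /* new chained behaviour */
        submit(3, &chain);
        for (struct req *r = chain, *n; r; r = n) {
                n = r->next;
                printf("waiting on queued sector %d\n", r->sector);
                free(r);
        }
        return 0;
}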
| diff --git a/mm/pdflush.c b/mm/pdflush.c index df7e50b8f70c..b02102feeb4b 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
| @@ -104,21 +104,20 @@ static int __pdflush(struct pdflush_work *my_work) | |||
| 104 | list_move(&my_work->list, &pdflush_list); | 104 | list_move(&my_work->list, &pdflush_list); | 
| 105 | my_work->when_i_went_to_sleep = jiffies; | 105 | my_work->when_i_went_to_sleep = jiffies; | 
| 106 | spin_unlock_irq(&pdflush_lock); | 106 | spin_unlock_irq(&pdflush_lock); | 
| 107 | |||
| 108 | schedule(); | 107 | schedule(); | 
| 109 | if (try_to_freeze()) { | 108 | try_to_freeze(); | 
| 110 | spin_lock_irq(&pdflush_lock); | ||
| 111 | continue; | ||
| 112 | } | ||
| 113 | |||
| 114 | spin_lock_irq(&pdflush_lock); | 109 | spin_lock_irq(&pdflush_lock); | 
| 115 | if (!list_empty(&my_work->list)) { | 110 | if (!list_empty(&my_work->list)) { | 
| 116 | printk("pdflush: bogus wakeup!\n"); | 111 | /* | 
| 112 | * Someone woke us up, but without removing our control | ||
| 113 | * structure from the global list. swsusp will do this | ||
| 114 | * in try_to_freeze()->refrigerator(). Handle it. | ||
| 115 | */ | ||
| 117 | my_work->fn = NULL; | 116 | my_work->fn = NULL; | 
| 118 | continue; | 117 | continue; | 
| 119 | } | 118 | } | 
| 120 | if (my_work->fn == NULL) { | 119 | if (my_work->fn == NULL) { | 
| 121 | printk("pdflush: NULL work function\n"); | 120 | printk("pdflush: bogus wakeup\n"); | 
| 122 | continue; | 121 | continue; | 
| 123 | } | 122 | } | 
| 124 | spin_unlock_irq(&pdflush_lock); | 123 | spin_unlock_irq(&pdflush_lock); | 
| diff --git a/mm/readahead.c b/mm/readahead.c index 0f142a40984b..aa7ec424656a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -118,8 +118,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra) | |||
| 118 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) | 118 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) | 
| 119 | 119 | ||
| 120 | /** | 120 | /** | 
| 121 | * read_cache_pages - populate an address space with some pages, and | 121 | * read_cache_pages - populate an address space with some pages & start reads against them | 
| 122 | * start reads against them. | ||
| 123 | * @mapping: the address_space | 122 | * @mapping: the address_space | 
| 124 | * @pages: The address of a list_head which contains the target pages. These | 123 | * @pages: The address of a list_head which contains the target pages. These | 
| 125 | * pages have their ->index populated and are otherwise uninitialised. | 124 | * pages have their ->index populated and are otherwise uninitialised. | 
| @@ -182,14 +181,11 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
| 182 | list_del(&page->lru); | 181 | list_del(&page->lru); | 
| 183 | if (!add_to_page_cache(page, mapping, | 182 | if (!add_to_page_cache(page, mapping, | 
| 184 | page->index, GFP_KERNEL)) { | 183 | page->index, GFP_KERNEL)) { | 
| 185 | ret = mapping->a_ops->readpage(filp, page); | 184 | mapping->a_ops->readpage(filp, page); | 
| 186 | if (ret != AOP_TRUNCATED_PAGE) { | 185 | if (!pagevec_add(&lru_pvec, page)) | 
| 187 | if (!pagevec_add(&lru_pvec, page)) | 186 | __pagevec_lru_add(&lru_pvec); | 
| 188 | __pagevec_lru_add(&lru_pvec); | 187 | } else | 
| 189 | continue; | 188 | page_cache_release(page); | 
| 190 | } /* else fall through to release */ | ||
| 191 | } | ||
| 192 | page_cache_release(page); | ||
| 193 | } | 189 | } | 
| 194 | pagevec_lru_add(&lru_pvec); | 190 | pagevec_lru_add(&lru_pvec); | 
| 195 | ret = 0; | 191 | ret = 0; | 
| @@ -394,8 +390,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 394 | * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' | 390 | * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' | 
| 395 | * is set wait till the read completes. Otherwise attempt to read without | 391 | * is set wait till the read completes. Otherwise attempt to read without | 
| 396 | * blocking. | 392 | * blocking. | 
| 397 | * Returns 1 meaning 'success' if read is succesfull without switching off | 393 | * Returns 1 meaning 'success' if read is successful without switching off | 
| 398 | * readhaead mode. Otherwise return failure. | 394 | * readahead mode. Otherwise return failure. | 
| 399 | */ | 395 | */ | 
| 400 | static int | 396 | static int | 
| 401 | blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, | 397 | blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, | 
| @@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked) | |||
| 434 | return referenced; | 434 | return referenced; | 
| 435 | } | 435 | } | 
| 436 | 436 | ||
| 437 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | ||
| 438 | { | ||
| 439 | struct mm_struct *mm = vma->vm_mm; | ||
| 440 | unsigned long address; | ||
| 441 | pte_t *pte, entry; | ||
| 442 | spinlock_t *ptl; | ||
| 443 | int ret = 0; | ||
| 444 | |||
| 445 | address = vma_address(page, vma); | ||
| 446 | if (address == -EFAULT) | ||
| 447 | goto out; | ||
| 448 | |||
| 449 | pte = page_check_address(page, mm, address, &ptl); | ||
| 450 | if (!pte) | ||
| 451 | goto out; | ||
| 452 | |||
| 453 | if (!pte_dirty(*pte) && !pte_write(*pte)) | ||
| 454 | goto unlock; | ||
| 455 | |||
| 456 | entry = ptep_get_and_clear(mm, address, pte); | ||
| 457 | entry = pte_mkclean(entry); | ||
| 458 | entry = pte_wrprotect(entry); | ||
| 459 | ptep_establish(vma, address, pte, entry); | ||
| 460 | lazy_mmu_prot_update(entry); | ||
| 461 | ret = 1; | ||
| 462 | |||
| 463 | unlock: | ||
| 464 | pte_unmap_unlock(pte, ptl); | ||
| 465 | out: | ||
| 466 | return ret; | ||
| 467 | } | ||
| 468 | |||
| 469 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | ||
| 470 | { | ||
| 471 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 472 | struct vm_area_struct *vma; | ||
| 473 | struct prio_tree_iter iter; | ||
| 474 | int ret = 0; | ||
| 475 | |||
| 476 | BUG_ON(PageAnon(page)); | ||
| 477 | |||
| 478 | spin_lock(&mapping->i_mmap_lock); | ||
| 479 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
| 480 | if (vma->vm_flags & VM_SHARED) | ||
| 481 | ret += page_mkclean_one(page, vma); | ||
| 482 | } | ||
| 483 | spin_unlock(&mapping->i_mmap_lock); | ||
| 484 | return ret; | ||
| 485 | } | ||
| 486 | |||
| 487 | int page_mkclean(struct page *page) | ||
| 488 | { | ||
| 489 | int ret = 0; | ||
| 490 | |||
| 491 | BUG_ON(!PageLocked(page)); | ||
| 492 | |||
| 493 | if (page_mapped(page)) { | ||
| 494 | struct address_space *mapping = page_mapping(page); | ||
| 495 | if (mapping) | ||
| 496 | ret = page_mkclean_file(mapping, page); | ||
| 497 | } | ||
| 498 | |||
| 499 | return ret; | ||
| 500 | } | ||
| 501 | |||
| 437 | /** | 502 | /** | 
| 438 | * page_set_anon_rmap - setup new anonymous rmap | 503 | * page_set_anon_rmap - setup new anonymous rmap | 
| 439 | * @page: the page to add the mapping to | 504 | * @page: the page to add the mapping to | 
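The page_mkclean() family added above walks every shared, file-backed mapping of a locked page, write-protects and cleans the PTEs, and returns the number of mappings that were dirty or writable. A plausible caller pairs it with set_page_dirty() so PTE-level dirty state is folded into the struct page before writeback; the sketch below is illustrative only (the helper name example_clean_ptes_before_writeback() is invented and is not part of this patch):

    /*
     * Hypothetical caller sketch: transfer any pte-level dirty state into
     * the struct page before writeback, so that a later store through the
     * now write-protected pte faults and re-dirties the page.
     */
    static void example_clean_ptes_before_writeback(struct page *page)
    {
    	BUG_ON(!PageLocked(page));	/* page_mkclean() asserts this too */

    	if (page_mkclean(page))		/* some pte was dirty or writable */
    		set_page_dirty(page);	/* keep the page-level dirty bit */
    }

The write-protect step is what makes later modifications observable again: the next store faults, and the fault path re-marks both the PTE and the page dirty.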
| @@ -455,7 +520,7 @@ static void __page_set_anon_rmap(struct page *page, | |||
| 455 | * nr_mapped state can be updated without turning off | 520 | * nr_mapped state can be updated without turning off | 
| 456 | * interrupts because it is not modified via interrupt. | 521 | * interrupts because it is not modified via interrupt. | 
| 457 | */ | 522 | */ | 
| 458 | __inc_page_state(nr_mapped); | 523 | __inc_zone_page_state(page, NR_ANON_PAGES); | 
| 459 | } | 524 | } | 
| 460 | 525 | ||
| 461 | /** | 526 | /** | 
| @@ -499,7 +564,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
| 499 | void page_add_file_rmap(struct page *page) | 564 | void page_add_file_rmap(struct page *page) | 
| 500 | { | 565 | { | 
| 501 | if (atomic_inc_and_test(&page->_mapcount)) | 566 | if (atomic_inc_and_test(&page->_mapcount)) | 
| 502 | __inc_page_state(nr_mapped); | 567 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 
| 503 | } | 568 | } | 
| 504 | 569 | ||
| 505 | /** | 570 | /** | 
| @@ -531,7 +596,8 @@ void page_remove_rmap(struct page *page) | |||
| 531 | */ | 596 | */ | 
| 532 | if (page_test_and_clear_dirty(page)) | 597 | if (page_test_and_clear_dirty(page)) | 
| 533 | set_page_dirty(page); | 598 | set_page_dirty(page); | 
| 534 | __dec_page_state(nr_mapped); | 599 | __dec_zone_page_state(page, | 
| 600 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | ||
| 535 | } | 601 | } | 
| 536 | } | 602 | } | 
| 537 | 603 | ||
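The rmap hunks above retire the global nr_mapped page-state counter in favour of the per-zone NR_ANON_PAGES and NR_FILE_MAPPED items, updated with the new __inc_zone_page_state()/__dec_zone_page_state() helpers, with the unmap path selecting the counter via PageAnon(). Reading such a counter back system-wide goes through the zoned-VM-counter accessors; a minimal, hedged sketch, assuming global_page_state() sums the per-zone values as elsewhere in this series (the helper name is invented):

    /* Illustrative only: total mapped pages, summed across all zones. */
    static unsigned long example_total_mapped_pages(void)
    {
    	return global_page_state(NR_FILE_MAPPED) +
    	       global_page_state(NR_ANON_PAGES);
    }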
| @@ -562,9 +628,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 562 | * If it's recently referenced (perhaps page_referenced | 628 | * If it's recently referenced (perhaps page_referenced | 
| 563 | * skipped over this mm) then we should reactivate it. | 629 | * skipped over this mm) then we should reactivate it. | 
| 564 | */ | 630 | */ | 
| 565 | if ((vma->vm_flags & VM_LOCKED) || | 631 | if (!migration && ((vma->vm_flags & VM_LOCKED) || | 
| 566 | (ptep_clear_flush_young(vma, address, pte) | 632 | (ptep_clear_flush_young(vma, address, pte)))) { | 
| 567 | && !migration)) { | ||
| 568 | ret = SWAP_FAIL; | 633 | ret = SWAP_FAIL; | 
| 569 | goto out_unmap; | 634 | goto out_unmap; | 
| 570 | } | 635 | } | 
| @@ -771,7 +836,7 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
| 771 | 836 | ||
| 772 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 837 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 
| 773 | shared.vm_set.list) { | 838 | shared.vm_set.list) { | 
| 774 | if (vma->vm_flags & VM_LOCKED) | 839 | if ((vma->vm_flags & VM_LOCKED) && !migration) | 
| 775 | continue; | 840 | continue; | 
| 776 | cursor = (unsigned long) vma->vm_private_data; | 841 | cursor = (unsigned long) vma->vm_private_data; | 
| 777 | if (cursor > max_nl_cursor) | 842 | if (cursor > max_nl_cursor) | 
| @@ -805,7 +870,7 @@ static int try_to_unmap_file(struct page *page, int migration) | |||
| 805 | do { | 870 | do { | 
| 806 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 871 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 
| 807 | shared.vm_set.list) { | 872 | shared.vm_set.list) { | 
| 808 | if (vma->vm_flags & VM_LOCKED) | 873 | if ((vma->vm_flags & VM_LOCKED) && !migration) | 
| 809 | continue; | 874 | continue; | 
| 810 | cursor = (unsigned long) vma->vm_private_data; | 875 | cursor = (unsigned long) vma->vm_private_data; | 
| 811 | while ( cursor < max_nl_cursor && | 876 | while ( cursor < max_nl_cursor && | 
| diff --git a/mm/shmem.c b/mm/shmem.c index 84b5cf9b63c5..b96de69f236b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -23,11 +23,11 @@ | |||
| 23 | * which makes it a completely usable filesystem. | 23 | * which makes it a completely usable filesystem. | 
| 24 | */ | 24 | */ | 
| 25 | 25 | ||
| 26 | #include <linux/config.h> | ||
| 27 | #include <linux/module.h> | 26 | #include <linux/module.h> | 
| 28 | #include <linux/init.h> | 27 | #include <linux/init.h> | 
| 29 | #include <linux/devfs_fs_kernel.h> | ||
| 30 | #include <linux/fs.h> | 28 | #include <linux/fs.h> | 
| 29 | #include <linux/xattr.h> | ||
| 30 | #include <linux/generic_acl.h> | ||
| 31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> | 
| 32 | #include <linux/mman.h> | 32 | #include <linux/mman.h> | 
| 33 | #include <linux/file.h> | 33 | #include <linux/file.h> | 
| @@ -47,6 +47,7 @@ | |||
| 47 | #include <linux/namei.h> | 47 | #include <linux/namei.h> | 
| 48 | #include <linux/ctype.h> | 48 | #include <linux/ctype.h> | 
| 49 | #include <linux/migrate.h> | 49 | #include <linux/migrate.h> | 
| 50 | #include <linux/highmem.h> | ||
| 50 | 51 | ||
| 51 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> | 
| 52 | #include <asm/div64.h> | 53 | #include <asm/div64.h> | 
| @@ -174,10 +175,11 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) | |||
| 174 | } | 175 | } | 
| 175 | 176 | ||
| 176 | static struct super_operations shmem_ops; | 177 | static struct super_operations shmem_ops; | 
| 177 | static struct address_space_operations shmem_aops; | 178 | static const struct address_space_operations shmem_aops; | 
| 178 | static struct file_operations shmem_file_operations; | 179 | static struct file_operations shmem_file_operations; | 
| 179 | static struct inode_operations shmem_inode_operations; | 180 | static struct inode_operations shmem_inode_operations; | 
| 180 | static struct inode_operations shmem_dir_inode_operations; | 181 | static struct inode_operations shmem_dir_inode_operations; | 
| 182 | static struct inode_operations shmem_special_inode_operations; | ||
| 181 | static struct vm_operations_struct shmem_vm_ops; | 183 | static struct vm_operations_struct shmem_vm_ops; | 
| 182 | 184 | ||
| 183 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | 185 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | 
| @@ -638,7 +640,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
| 638 | struct page *page = NULL; | 640 | struct page *page = NULL; | 
| 639 | int error; | 641 | int error; | 
| 640 | 642 | ||
| 641 | if (attr->ia_valid & ATTR_SIZE) { | 643 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { | 
| 642 | if (attr->ia_size < inode->i_size) { | 644 | if (attr->ia_size < inode->i_size) { | 
| 643 | /* | 645 | /* | 
| 644 | * If truncating down to a partial page, then | 646 | * If truncating down to a partial page, then | 
| @@ -671,6 +673,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
| 671 | error = inode_change_ok(inode, attr); | 673 | error = inode_change_ok(inode, attr); | 
| 672 | if (!error) | 674 | if (!error) | 
| 673 | error = inode_setattr(inode, attr); | 675 | error = inode_setattr(inode, attr); | 
| 676 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 677 | if (!error && (attr->ia_valid & ATTR_MODE)) | ||
| 678 | error = generic_acl_chmod(inode, &shmem_acl_ops); | ||
| 679 | #endif | ||
| 674 | if (page) | 680 | if (page) | 
| 675 | page_cache_release(page); | 681 | page_cache_release(page); | 
| 676 | return error; | 682 | return error; | 
| @@ -1046,12 +1052,12 @@ repeat: | |||
| 1046 | swappage = lookup_swap_cache(swap); | 1052 | swappage = lookup_swap_cache(swap); | 
| 1047 | if (!swappage) { | 1053 | if (!swappage) { | 
| 1048 | shmem_swp_unmap(entry); | 1054 | shmem_swp_unmap(entry); | 
| 1049 | spin_unlock(&info->lock); | ||
| 1050 | /* here we actually do the io */ | 1055 | /* here we actually do the io */ | 
| 1051 | if (type && *type == VM_FAULT_MINOR) { | 1056 | if (type && *type == VM_FAULT_MINOR) { | 
| 1052 | inc_page_state(pgmajfault); | 1057 | __count_vm_event(PGMAJFAULT); | 
| 1053 | *type = VM_FAULT_MAJOR; | 1058 | *type = VM_FAULT_MAJOR; | 
| 1054 | } | 1059 | } | 
| 1060 | spin_unlock(&info->lock); | ||
| 1055 | swappage = shmem_swapin(info, swap, idx); | 1061 | swappage = shmem_swapin(info, swap, idx); | 
| 1056 | if (!swappage) { | 1062 | if (!swappage) { | 
| 1057 | spin_lock(&info->lock); | 1063 | spin_lock(&info->lock); | 
| @@ -1352,7 +1358,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
| 1352 | inode->i_mode = mode; | 1358 | inode->i_mode = mode; | 
| 1353 | inode->i_uid = current->fsuid; | 1359 | inode->i_uid = current->fsuid; | 
| 1354 | inode->i_gid = current->fsgid; | 1360 | inode->i_gid = current->fsgid; | 
| 1355 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
| 1356 | inode->i_blocks = 0; | 1361 | inode->i_blocks = 0; | 
| 1357 | inode->i_mapping->a_ops = &shmem_aops; | 1362 | inode->i_mapping->a_ops = &shmem_aops; | 
| 1358 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 1363 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 
| @@ -1364,6 +1369,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
| 1364 | 1369 | ||
| 1365 | switch (mode & S_IFMT) { | 1370 | switch (mode & S_IFMT) { | 
| 1366 | default: | 1371 | default: | 
| 1372 | inode->i_op = &shmem_special_inode_operations; | ||
| 1367 | init_special_inode(inode, mode, dev); | 1373 | init_special_inode(inode, mode, dev); | 
| 1368 | break; | 1374 | break; | 
| 1369 | case S_IFREG: | 1375 | case S_IFREG: | 
| @@ -1684,7 +1690,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
| 1684 | iput(inode); | 1690 | iput(inode); | 
| 1685 | return error; | 1691 | return error; | 
| 1686 | } | 1692 | } | 
| 1687 | error = 0; | 1693 | } | 
| 1694 | error = shmem_acl_init(inode, dir); | ||
| 1695 | if (error) { | ||
| 1696 | iput(inode); | ||
| 1697 | return error; | ||
| 1688 | } | 1698 | } | 
| 1689 | if (dir->i_mode & S_ISGID) { | 1699 | if (dir->i_mode & S_ISGID) { | 
| 1690 | inode->i_gid = dir->i_gid; | 1700 | inode->i_gid = dir->i_gid; | 
| @@ -1899,6 +1909,53 @@ static struct inode_operations shmem_symlink_inode_operations = { | |||
| 1899 | .put_link = shmem_put_link, | 1909 | .put_link = shmem_put_link, | 
| 1900 | }; | 1910 | }; | 
| 1901 | 1911 | ||
| 1912 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 1913 | /** | ||
| 1914 | * Superblocks without xattr inode operations will get security.* xattr | ||
| 1915 | * support from the VFS "for free". As soon as we have any other xattrs | ||
| 1916 | * like ACLs, we also need to implement the security.* handlers at | ||
| 1917 | * filesystem level, though. | ||
| 1918 | */ | ||
| 1919 | |||
| 1920 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, | ||
| 1921 | size_t list_len, const char *name, | ||
| 1922 | size_t name_len) | ||
| 1923 | { | ||
| 1924 | return security_inode_listsecurity(inode, list, list_len); | ||
| 1925 | } | ||
| 1926 | |||
| 1927 | static int shmem_xattr_security_get(struct inode *inode, const char *name, | ||
| 1928 | void *buffer, size_t size) | ||
| 1929 | { | ||
| 1930 | if (strcmp(name, "") == 0) | ||
| 1931 | return -EINVAL; | ||
| 1932 | return security_inode_getsecurity(inode, name, buffer, size, | ||
| 1933 | -EOPNOTSUPP); | ||
| 1934 | } | ||
| 1935 | |||
| 1936 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | ||
| 1937 | const void *value, size_t size, int flags) | ||
| 1938 | { | ||
| 1939 | if (strcmp(name, "") == 0) | ||
| 1940 | return -EINVAL; | ||
| 1941 | return security_inode_setsecurity(inode, name, value, size, flags); | ||
| 1942 | } | ||
| 1943 | |||
| 1944 | struct xattr_handler shmem_xattr_security_handler = { | ||
| 1945 | .prefix = XATTR_SECURITY_PREFIX, | ||
| 1946 | .list = shmem_xattr_security_list, | ||
| 1947 | .get = shmem_xattr_security_get, | ||
| 1948 | .set = shmem_xattr_security_set, | ||
| 1949 | }; | ||
| 1950 | |||
| 1951 | static struct xattr_handler *shmem_xattr_handlers[] = { | ||
| 1952 | &shmem_xattr_acl_access_handler, | ||
| 1953 | &shmem_xattr_acl_default_handler, | ||
| 1954 | &shmem_xattr_security_handler, | ||
| 1955 | NULL | ||
| 1956 | }; | ||
| 1957 | #endif | ||
| 1958 | |||
| 1902 | static int shmem_parse_options(char *options, int *mode, uid_t *uid, | 1959 | static int shmem_parse_options(char *options, int *mode, uid_t *uid, | 
| 1903 | gid_t *gid, unsigned long *blocks, unsigned long *inodes, | 1960 | gid_t *gid, unsigned long *blocks, unsigned long *inodes, | 
| 1904 | int *policy, nodemask_t *policy_nodes) | 1961 | int *policy, nodemask_t *policy_nodes) | 
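The comment at the top of the CONFIG_TMPFS_POSIX_ACL block above notes that registering any xattr handler obliges the filesystem to also supply the security.* handler, because the generic_*xattr inode operations dispatch purely by prefix through sb->s_xattr. A hedged illustration of that dispatch follows; example_resolve() is a simplified stand-in for the VFS lookup, not the real helper:

    /*
     * Simplified view of how generic_getxattr()/generic_setxattr() pick a
     * handler: match the attribute name against each handler's prefix and
     * hand the handler only the remainder of the name.
     */
    static struct xattr_handler *example_resolve(struct xattr_handler **handlers,
    					     const char **name)
    {
    	struct xattr_handler **h;

    	for (h = handlers; *h; h++) {
    		size_t len = strlen((*h)->prefix);

    		if (!strncmp(*name, (*h)->prefix, len)) {
    			*name += len;		/* handler sees the suffix only */
    			return *h;
    		}
    	}
    	return NULL;
    }

This is also why the ACL handlers below reject any non-empty remaining name with -EINVAL: for "system.posix_acl_access" the whole name is the prefix, so a conforming request reaches the handler with an empty string.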
| @@ -2096,6 +2153,10 @@ static int shmem_fill_super(struct super_block *sb, | |||
| 2096 | sb->s_magic = TMPFS_MAGIC; | 2153 | sb->s_magic = TMPFS_MAGIC; | 
| 2097 | sb->s_op = &shmem_ops; | 2154 | sb->s_op = &shmem_ops; | 
| 2098 | sb->s_time_gran = 1; | 2155 | sb->s_time_gran = 1; | 
| 2156 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 2157 | sb->s_xattr = shmem_xattr_handlers; | ||
| 2158 | sb->s_flags |= MS_POSIXACL; | ||
| 2159 | #endif | ||
| 2099 | 2160 | ||
| 2100 | inode = shmem_get_inode(sb, S_IFDIR | mode, 0); | 2161 | inode = shmem_get_inode(sb, S_IFDIR | mode, 0); | 
| 2101 | if (!inode) | 2162 | if (!inode) | 
| @@ -2132,6 +2193,7 @@ static void shmem_destroy_inode(struct inode *inode) | |||
| 2132 | /* only struct inode is valid if it's an inline symlink */ | 2193 | /* only struct inode is valid if it's an inline symlink */ | 
| 2133 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2194 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 
| 2134 | } | 2195 | } | 
| 2196 | shmem_acl_destroy_inode(inode); | ||
| 2135 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2197 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 
| 2136 | } | 2198 | } | 
| 2137 | 2199 | ||
| @@ -2143,6 +2205,10 @@ static void init_once(void *foo, struct kmem_cache *cachep, | |||
| 2143 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 2205 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 
| 2144 | SLAB_CTOR_CONSTRUCTOR) { | 2206 | SLAB_CTOR_CONSTRUCTOR) { | 
| 2145 | inode_init_once(&p->vfs_inode); | 2207 | inode_init_once(&p->vfs_inode); | 
| 2208 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 2209 | p->i_acl = NULL; | ||
| 2210 | p->i_default_acl = NULL; | ||
| 2211 | #endif | ||
| 2146 | } | 2212 | } | 
| 2147 | } | 2213 | } | 
| 2148 | 2214 | ||
| @@ -2158,11 +2224,10 @@ static int init_inodecache(void) | |||
| 2158 | 2224 | ||
| 2159 | static void destroy_inodecache(void) | 2225 | static void destroy_inodecache(void) | 
| 2160 | { | 2226 | { | 
| 2161 | if (kmem_cache_destroy(shmem_inode_cachep)) | 2227 | kmem_cache_destroy(shmem_inode_cachep); | 
| 2162 | printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); | ||
| 2163 | } | 2228 | } | 
| 2164 | 2229 | ||
| 2165 | static struct address_space_operations shmem_aops = { | 2230 | static const struct address_space_operations shmem_aops = { | 
| 2166 | .writepage = shmem_writepage, | 2231 | .writepage = shmem_writepage, | 
| 2167 | .set_page_dirty = __set_page_dirty_nobuffers, | 2232 | .set_page_dirty = __set_page_dirty_nobuffers, | 
| 2168 | #ifdef CONFIG_TMPFS | 2233 | #ifdef CONFIG_TMPFS | 
| @@ -2187,6 +2252,14 @@ static struct inode_operations shmem_inode_operations = { | |||
| 2187 | .truncate = shmem_truncate, | 2252 | .truncate = shmem_truncate, | 
| 2188 | .setattr = shmem_notify_change, | 2253 | .setattr = shmem_notify_change, | 
| 2189 | .truncate_range = shmem_truncate_range, | 2254 | .truncate_range = shmem_truncate_range, | 
| 2255 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 2256 | .setxattr = generic_setxattr, | ||
| 2257 | .getxattr = generic_getxattr, | ||
| 2258 | .listxattr = generic_listxattr, | ||
| 2259 | .removexattr = generic_removexattr, | ||
| 2260 | .permission = shmem_permission, | ||
| 2261 | #endif | ||
| 2262 | |||
| 2190 | }; | 2263 | }; | 
| 2191 | 2264 | ||
| 2192 | static struct inode_operations shmem_dir_inode_operations = { | 2265 | static struct inode_operations shmem_dir_inode_operations = { | 
| @@ -2201,6 +2274,25 @@ static struct inode_operations shmem_dir_inode_operations = { | |||
| 2201 | .mknod = shmem_mknod, | 2274 | .mknod = shmem_mknod, | 
| 2202 | .rename = shmem_rename, | 2275 | .rename = shmem_rename, | 
| 2203 | #endif | 2276 | #endif | 
| 2277 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 2278 | .setattr = shmem_notify_change, | ||
| 2279 | .setxattr = generic_setxattr, | ||
| 2280 | .getxattr = generic_getxattr, | ||
| 2281 | .listxattr = generic_listxattr, | ||
| 2282 | .removexattr = generic_removexattr, | ||
| 2283 | .permission = shmem_permission, | ||
| 2284 | #endif | ||
| 2285 | }; | ||
| 2286 | |||
| 2287 | static struct inode_operations shmem_special_inode_operations = { | ||
| 2288 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 2289 | .setattr = shmem_notify_change, | ||
| 2290 | .setxattr = generic_setxattr, | ||
| 2291 | .getxattr = generic_getxattr, | ||
| 2292 | .listxattr = generic_listxattr, | ||
| 2293 | .removexattr = generic_removexattr, | ||
| 2294 | .permission = shmem_permission, | ||
| 2295 | #endif | ||
| 2204 | }; | 2296 | }; | 
| 2205 | 2297 | ||
| 2206 | static struct super_operations shmem_ops = { | 2298 | static struct super_operations shmem_ops = { | 
| @@ -2252,10 +2344,8 @@ static int __init init_tmpfs(void) | |||
| 2252 | printk(KERN_ERR "Could not register tmpfs\n"); | 2344 | printk(KERN_ERR "Could not register tmpfs\n"); | 
| 2253 | goto out2; | 2345 | goto out2; | 
| 2254 | } | 2346 | } | 
| 2255 | #ifdef CONFIG_TMPFS | 2347 | |
| 2256 | devfs_mk_dir("shm"); | 2348 | shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, | 
| 2257 | #endif | ||
| 2258 | shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER, | ||
| 2259 | tmpfs_fs_type.name, NULL); | 2349 | tmpfs_fs_type.name, NULL); | 
| 2260 | if (IS_ERR(shm_mnt)) { | 2350 | if (IS_ERR(shm_mnt)) { | 
| 2261 | error = PTR_ERR(shm_mnt); | 2351 | error = PTR_ERR(shm_mnt); | 
| diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c new file mode 100644 index 000000000000..c946bf468718 --- /dev/null +++ b/mm/shmem_acl.c | |||
| @@ -0,0 +1,197 @@ | |||
| 1 | /* | ||
| 2 | * mm/shmem_acl.c | ||
| 3 | * | ||
| 4 | * (C) 2005 Andreas Gruenbacher <agruen@suse.de> | ||
| 5 | * | ||
| 6 | * This file is released under the GPL. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/fs.h> | ||
| 10 | #include <linux/shmem_fs.h> | ||
| 11 | #include <linux/xattr.h> | ||
| 12 | #include <linux/generic_acl.h> | ||
| 13 | |||
| 14 | /** | ||
| 15 | * shmem_get_acl - generic_acl_operations->getacl() operation | ||
| 16 | */ | ||
| 17 | static struct posix_acl * | ||
| 18 | shmem_get_acl(struct inode *inode, int type) | ||
| 19 | { | ||
| 20 | struct posix_acl *acl = NULL; | ||
| 21 | |||
| 22 | spin_lock(&inode->i_lock); | ||
| 23 | switch(type) { | ||
| 24 | case ACL_TYPE_ACCESS: | ||
| 25 | acl = posix_acl_dup(SHMEM_I(inode)->i_acl); | ||
| 26 | break; | ||
| 27 | |||
| 28 | case ACL_TYPE_DEFAULT: | ||
| 29 | acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); | ||
| 30 | break; | ||
| 31 | } | ||
| 32 | spin_unlock(&inode->i_lock); | ||
| 33 | |||
| 34 | return acl; | ||
| 35 | } | ||
| 36 | |||
| 37 | /** | ||
| 38 | * shmem_set_acl - generic_acl_operations->setacl() operation | ||
| 39 | */ | ||
| 40 | static void | ||
| 41 | shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
| 42 | { | ||
| 43 | struct posix_acl *free = NULL; | ||
| 44 | |||
| 45 | spin_lock(&inode->i_lock); | ||
| 46 | switch(type) { | ||
| 47 | case ACL_TYPE_ACCESS: | ||
| 48 | free = SHMEM_I(inode)->i_acl; | ||
| 49 | SHMEM_I(inode)->i_acl = posix_acl_dup(acl); | ||
| 50 | break; | ||
| 51 | |||
| 52 | case ACL_TYPE_DEFAULT: | ||
| 53 | free = SHMEM_I(inode)->i_default_acl; | ||
| 54 | SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); | ||
| 55 | break; | ||
| 56 | } | ||
| 57 | spin_unlock(&inode->i_lock); | ||
| 58 | posix_acl_release(free); | ||
| 59 | } | ||
| 60 | |||
| 61 | struct generic_acl_operations shmem_acl_ops = { | ||
| 62 | .getacl = shmem_get_acl, | ||
| 63 | .setacl = shmem_set_acl, | ||
| 64 | }; | ||
| 65 | |||
| 66 | /** | ||
| 67 | * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, | ||
| 68 | * shmem_xattr_acl_access_handler - plumbing code to implement the | ||
| 69 | * system.posix_acl_access xattr using the generic acl functions. | ||
| 70 | */ | ||
| 71 | |||
| 72 | static size_t | ||
| 73 | shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, | ||
| 74 | const char *name, size_t name_len) | ||
| 75 | { | ||
| 76 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, | ||
| 77 | list, list_size); | ||
| 78 | } | ||
| 79 | |||
| 80 | static int | ||
| 81 | shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, | ||
| 82 | size_t size) | ||
| 83 | { | ||
| 84 | if (strcmp(name, "") != 0) | ||
| 85 | return -EINVAL; | ||
| 86 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, | ||
| 87 | size); | ||
| 88 | } | ||
| 89 | |||
| 90 | static int | ||
| 91 | shmem_set_acl_access(struct inode *inode, const char *name, const void *value, | ||
| 92 | size_t size, int flags) | ||
| 93 | { | ||
| 94 | if (strcmp(name, "") != 0) | ||
| 95 | return -EINVAL; | ||
| 96 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, | ||
| 97 | size); | ||
| 98 | } | ||
| 99 | |||
| 100 | struct xattr_handler shmem_xattr_acl_access_handler = { | ||
| 101 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
| 102 | .list = shmem_list_acl_access, | ||
| 103 | .get = shmem_get_acl_access, | ||
| 104 | .set = shmem_set_acl_access, | ||
| 105 | }; | ||
| 106 | |||
| 107 | /** | ||
| 108 | * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, | ||
| 109 | * shmem_xattr_acl_default_handler - plumbing code to implement the | ||
| 110 | * system.posix_acl_default xattr using the generic acl functions. | ||
| 111 | */ | ||
| 112 | |||
| 113 | static size_t | ||
| 114 | shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, | ||
| 115 | const char *name, size_t name_len) | ||
| 116 | { | ||
| 117 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, | ||
| 118 | list, list_size); | ||
| 119 | } | ||
| 120 | |||
| 121 | static int | ||
| 122 | shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, | ||
| 123 | size_t size) | ||
| 124 | { | ||
| 125 | if (strcmp(name, "") != 0) | ||
| 126 | return -EINVAL; | ||
| 127 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, | ||
| 128 | size); | ||
| 129 | } | ||
| 130 | |||
| 131 | static int | ||
| 132 | shmem_set_acl_default(struct inode *inode, const char *name, const void *value, | ||
| 133 | size_t size, int flags) | ||
| 134 | { | ||
| 135 | if (strcmp(name, "") != 0) | ||
| 136 | return -EINVAL; | ||
| 137 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, | ||
| 138 | size); | ||
| 139 | } | ||
| 140 | |||
| 141 | struct xattr_handler shmem_xattr_acl_default_handler = { | ||
| 142 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
| 143 | .list = shmem_list_acl_default, | ||
| 144 | .get = shmem_get_acl_default, | ||
| 145 | .set = shmem_set_acl_default, | ||
| 146 | }; | ||
| 147 | |||
| 148 | /** | ||
| 149 | * shmem_acl_init - Initialize the acl(s) of a new inode | ||
| 150 | */ | ||
| 151 | int | ||
| 152 | shmem_acl_init(struct inode *inode, struct inode *dir) | ||
| 153 | { | ||
| 154 | return generic_acl_init(inode, dir, &shmem_acl_ops); | ||
| 155 | } | ||
| 156 | |||
| 157 | /** | ||
| 158 | * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode | ||
| 159 | * | ||
| 160 | * This is done before destroying the actual inode. | ||
| 161 | */ | ||
| 162 | |||
| 163 | void | ||
| 164 | shmem_acl_destroy_inode(struct inode *inode) | ||
| 165 | { | ||
| 166 | if (SHMEM_I(inode)->i_acl) | ||
| 167 | posix_acl_release(SHMEM_I(inode)->i_acl); | ||
| 168 | SHMEM_I(inode)->i_acl = NULL; | ||
| 169 | if (SHMEM_I(inode)->i_default_acl) | ||
| 170 | posix_acl_release(SHMEM_I(inode)->i_default_acl); | ||
| 171 | SHMEM_I(inode)->i_default_acl = NULL; | ||
| 172 | } | ||
| 173 | |||
| 174 | /** | ||
| 175 | * shmem_check_acl - check_acl() callback for generic_permission() | ||
| 176 | */ | ||
| 177 | static int | ||
| 178 | shmem_check_acl(struct inode *inode, int mask) | ||
| 179 | { | ||
| 180 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | ||
| 181 | |||
| 182 | if (acl) { | ||
| 183 | int error = posix_acl_permission(inode, acl, mask); | ||
| 184 | posix_acl_release(acl); | ||
| 185 | return error; | ||
| 186 | } | ||
| 187 | return -EAGAIN; | ||
| 188 | } | ||
| 189 | |||
| 190 | /** | ||
| 191 | * shmem_permission - permission() inode operation | ||
| 192 | */ | ||
| 193 | int | ||
| 194 | shmem_permission(struct inode *inode, int mask, struct nameidata *nd) | ||
| 195 | { | ||
| 196 | return generic_permission(inode, mask, shmem_check_acl); | ||
| 197 | } | ||
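shmem_permission() above hands shmem_check_acl() to generic_permission(), whose callback contract is: return 0 to grant, a negative error such as -EACCES to deny, and -EAGAIN to mean "no ACL, fall back to the mode bits". The sketch below is a deliberately simplified, non-authoritative view of that contract (capability overrides and read-only checks are omitted; the function name is invented):

    /* Illustrative contract of generic_permission()'s check_acl callback. */
    static int example_generic_permission(struct inode *inode, int mask,
    		int (*check_acl)(struct inode *, int))
    {
    	umode_t mode = inode->i_mode;

    	if (current->fsuid == inode->i_uid)
    		mode >>= 6;			/* owner class */
    	else {
    		if (check_acl) {
    			int error = check_acl(inode, mask);
    			if (error != -EAGAIN)
    				return error;	/* ACL decided: 0 or -EACCES */
    		}
    		if (in_group_p(inode->i_gid))
    			mode >>= 3;		/* group class */
    	}

    	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
    		return 0;
    	return -EACCES;
    }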
| @@ -89,6 +89,7 @@ | |||
| 89 | #include <linux/config.h> | 89 | #include <linux/config.h> | 
| 90 | #include <linux/slab.h> | 90 | #include <linux/slab.h> | 
| 91 | #include <linux/mm.h> | 91 | #include <linux/mm.h> | 
| 92 | #include <linux/poison.h> | ||
| 92 | #include <linux/swap.h> | 93 | #include <linux/swap.h> | 
| 93 | #include <linux/cache.h> | 94 | #include <linux/cache.h> | 
| 94 | #include <linux/interrupt.h> | 95 | #include <linux/interrupt.h> | 
| @@ -106,6 +107,7 @@ | |||
| 106 | #include <linux/nodemask.h> | 107 | #include <linux/nodemask.h> | 
| 107 | #include <linux/mempolicy.h> | 108 | #include <linux/mempolicy.h> | 
| 108 | #include <linux/mutex.h> | 109 | #include <linux/mutex.h> | 
| 110 | #include <linux/rtmutex.h> | ||
| 109 | 111 | ||
| 110 | #include <asm/uaccess.h> | 112 | #include <asm/uaccess.h> | 
| 111 | #include <asm/cacheflush.h> | 113 | #include <asm/cacheflush.h> | 
| @@ -307,6 +309,13 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; | |||
| 307 | #define SIZE_AC 1 | 309 | #define SIZE_AC 1 | 
| 308 | #define SIZE_L3 (1 + MAX_NUMNODES) | 310 | #define SIZE_L3 (1 + MAX_NUMNODES) | 
| 309 | 311 | ||
| 312 | static int drain_freelist(struct kmem_cache *cache, | ||
| 313 | struct kmem_list3 *l3, int tofree); | ||
| 314 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | ||
| 315 | int node); | ||
| 316 | static int enable_cpucache(struct kmem_cache *cachep); | ||
| 317 | static void cache_reap(void *unused); | ||
| 318 | |||
| 310 | /* | 319 | /* | 
| 311 | * This function must be completely optimized away if a constant is passed to | 320 | * This function must be completely optimized away if a constant is passed to | 
| 312 | * it. Mostly the same as what is in linux/slab.h except it returns an index. | 321 | * it. Mostly the same as what is in linux/slab.h except it returns an index. | 
| @@ -454,7 +463,7 @@ struct kmem_cache { | |||
| 454 | #define STATS_DEC_ACTIVE(x) ((x)->num_active--) | 463 | #define STATS_DEC_ACTIVE(x) ((x)->num_active--) | 
| 455 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) | 464 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) | 
| 456 | #define STATS_INC_GROWN(x) ((x)->grown++) | 465 | #define STATS_INC_GROWN(x) ((x)->grown++) | 
| 457 | #define STATS_INC_REAPED(x) ((x)->reaped++) | 466 | #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) | 
| 458 | #define STATS_SET_HIGH(x) \ | 467 | #define STATS_SET_HIGH(x) \ | 
| 459 | do { \ | 468 | do { \ | 
| 460 | if ((x)->num_active > (x)->high_mark) \ | 469 | if ((x)->num_active > (x)->high_mark) \ | 
| @@ -478,7 +487,7 @@ struct kmem_cache { | |||
| 478 | #define STATS_DEC_ACTIVE(x) do { } while (0) | 487 | #define STATS_DEC_ACTIVE(x) do { } while (0) | 
| 479 | #define STATS_INC_ALLOCED(x) do { } while (0) | 488 | #define STATS_INC_ALLOCED(x) do { } while (0) | 
| 480 | #define STATS_INC_GROWN(x) do { } while (0) | 489 | #define STATS_INC_GROWN(x) do { } while (0) | 
| 481 | #define STATS_INC_REAPED(x) do { } while (0) | 490 | #define STATS_ADD_REAPED(x,y) do { } while (0) | 
| 482 | #define STATS_SET_HIGH(x) do { } while (0) | 491 | #define STATS_SET_HIGH(x) do { } while (0) | 
| 483 | #define STATS_INC_ERR(x) do { } while (0) | 492 | #define STATS_INC_ERR(x) do { } while (0) | 
| 484 | #define STATS_INC_NODEALLOCS(x) do { } while (0) | 493 | #define STATS_INC_NODEALLOCS(x) do { } while (0) | 
| @@ -492,17 +501,6 @@ struct kmem_cache { | |||
| 492 | #endif | 501 | #endif | 
| 493 | 502 | ||
| 494 | #if DEBUG | 503 | #if DEBUG | 
| 495 | /* | ||
| 496 | * Magic nums for obj red zoning. | ||
| 497 | * Placed in the first word before and the first word after an obj. | ||
| 498 | */ | ||
| 499 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ | ||
| 500 | #define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ | ||
| 501 | |||
| 502 | /* ...and for poisoning */ | ||
| 503 | #define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ | ||
| 504 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ | ||
| 505 | #define POISON_END 0xa5 /* end-byte of poisoning */ | ||
| 506 | 504 | ||
| 507 | /* | 505 | /* | 
| 508 | * memory layout of objects: | 506 | * memory layout of objects: | 
| @@ -676,17 +674,66 @@ static struct kmem_cache cache_cache = { | |||
| 676 | #endif | 674 | #endif | 
| 677 | }; | 675 | }; | 
| 678 | 676 | ||
| 679 | /* Guard access to the cache-chain. */ | 677 | #define BAD_ALIEN_MAGIC 0x01020304ul | 
| 680 | static DEFINE_MUTEX(cache_chain_mutex); | 678 | |
| 681 | static struct list_head cache_chain; | 679 | #ifdef CONFIG_LOCKDEP | 
| 682 | 680 | ||
| 683 | /* | 681 | /* | 
| 684 | * vm_enough_memory() looks at this to determine how many slab-allocated pages | 682 | * Slab sometimes uses the kmalloc slabs to store the slab headers | 
| 685 | * are possibly freeable under pressure | 683 | * for other slabs "off slab". | 
| 684 | * The locking for this is tricky in that it nests within the locks | ||
| 685 | * of all other slabs in a few places; to deal with this special | ||
| 686 | * locking we put on-slab caches into a separate lock-class. | ||
| 686 | * | 687 | * | 
| 687 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab | 688 | * We set lock class for alien array caches which are up during init. | 
| 689 | * The lock annotation will be lost if all cpus of a node goes down and | ||
| 690 | * then comes back up during hotplug | ||
| 688 | */ | 691 | */ | 
| 689 | atomic_t slab_reclaim_pages; | 692 | static struct lock_class_key on_slab_l3_key; | 
| 693 | static struct lock_class_key on_slab_alc_key; | ||
| 694 | |||
| 695 | static inline void init_lock_keys(void) | ||
| 696 | |||
| 697 | { | ||
| 698 | int q; | ||
| 699 | struct cache_sizes *s = malloc_sizes; | ||
| 700 | |||
| 701 | while (s->cs_size != ULONG_MAX) { | ||
| 702 | for_each_node(q) { | ||
| 703 | struct array_cache **alc; | ||
| 704 | int r; | ||
| 705 | struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; | ||
| 706 | if (!l3 || OFF_SLAB(s->cs_cachep)) | ||
| 707 | continue; | ||
| 708 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | ||
| 709 | alc = l3->alien; | ||
| 710 | /* | ||
| 711 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
| 712 | * should go away when common slab code is taught to | ||
| 713 | * work even without alien caches. | ||
| 714 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
| 715 | * for alloc_alien_cache, | ||
| 716 | */ | ||
| 717 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
| 718 | continue; | ||
| 719 | for_each_node(r) { | ||
| 720 | if (alc[r]) | ||
| 721 | lockdep_set_class(&alc[r]->lock, | ||
| 722 | &on_slab_alc_key); | ||
| 723 | } | ||
| 724 | } | ||
| 725 | s++; | ||
| 726 | } | ||
| 727 | } | ||
| 728 | #else | ||
| 729 | static inline void init_lock_keys(void) | ||
| 730 | { | ||
| 731 | } | ||
| 732 | #endif | ||
| 733 | |||
| 734 | /* Guard access to the cache-chain. */ | ||
| 735 | static DEFINE_MUTEX(cache_chain_mutex); | ||
| 736 | static struct list_head cache_chain; | ||
| 690 | 737 | ||
| 691 | /* | 738 | /* | 
| 692 | * chicken and egg problem: delay the per-cpu array allocation | 739 | * chicken and egg problem: delay the per-cpu array allocation | 
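init_lock_keys() above puts the l3 list locks and alien array-cache locks of the kmalloc caches into dedicated lockdep classes, because off-slab slab descriptors make those locks nest inside the locks of other caches and would otherwise trip lockdep's recursion check. The re-classing idiom itself is small; a generic, hedged sketch with invented names:

    /* Illustrative only: give one lock its own lockdep class. */
    static struct lock_class_key example_nested_key;

    static void example_init_nested_lock(spinlock_t *lock)
    {
    	spin_lock_init(lock);
    	/* Taking this lock while a default-class lock of the same type is
    	 * held will no longer be reported as recursive locking. */
    	lockdep_set_class(lock, &example_nested_key);
    }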
| @@ -709,12 +756,6 @@ int slab_is_available(void) | |||
| 709 | 756 | ||
| 710 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 757 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 
| 711 | 758 | ||
| 712 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | ||
| 713 | int node); | ||
| 714 | static void enable_cpucache(struct kmem_cache *cachep); | ||
| 715 | static void cache_reap(void *unused); | ||
| 716 | static int __node_shrink(struct kmem_cache *cachep, int node); | ||
| 717 | |||
| 718 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 759 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 
| 719 | { | 760 | { | 
| 720 | return cachep->array[smp_processor_id()]; | 761 | return cachep->array[smp_processor_id()]; | 
| @@ -745,11 +786,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, | |||
| 745 | return csizep->cs_cachep; | 786 | return csizep->cs_cachep; | 
| 746 | } | 787 | } | 
| 747 | 788 | ||
| 748 | struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) | 789 | static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) | 
| 749 | { | 790 | { | 
| 750 | return __find_general_cachep(size, gfpflags); | 791 | return __find_general_cachep(size, gfpflags); | 
| 751 | } | 792 | } | 
| 752 | EXPORT_SYMBOL(kmem_find_general_cachep); | ||
| 753 | 793 | ||
| 754 | static size_t slab_mgmt_size(size_t nr_objs, size_t align) | 794 | static size_t slab_mgmt_size(size_t nr_objs, size_t align) | 
| 755 | { | 795 | { | 
| @@ -932,7 +972,39 @@ static int transfer_objects(struct array_cache *to, | |||
| 932 | return nr; | 972 | return nr; | 
| 933 | } | 973 | } | 
| 934 | 974 | ||
| 935 | #ifdef CONFIG_NUMA | 975 | #ifndef CONFIG_NUMA | 
| 976 | |||
| 977 | #define drain_alien_cache(cachep, alien) do { } while (0) | ||
| 978 | #define reap_alien(cachep, l3) do { } while (0) | ||
| 979 | |||
| 980 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | ||
| 981 | { | ||
| 982 | return (struct array_cache **)BAD_ALIEN_MAGIC; | ||
| 983 | } | ||
| 984 | |||
| 985 | static inline void free_alien_cache(struct array_cache **ac_ptr) | ||
| 986 | { | ||
| 987 | } | ||
| 988 | |||
| 989 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
| 990 | { | ||
| 991 | return 0; | ||
| 992 | } | ||
| 993 | |||
| 994 | static inline void *alternate_node_alloc(struct kmem_cache *cachep, | ||
| 995 | gfp_t flags) | ||
| 996 | { | ||
| 997 | return NULL; | ||
| 998 | } | ||
| 999 | |||
| 1000 | static inline void *__cache_alloc_node(struct kmem_cache *cachep, | ||
| 1001 | gfp_t flags, int nodeid) | ||
| 1002 | { | ||
| 1003 | return NULL; | ||
| 1004 | } | ||
| 1005 | |||
| 1006 | #else /* CONFIG_NUMA */ | ||
| 1007 | |||
| 936 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 1008 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 
| 937 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 1009 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 
| 938 | 1010 | ||
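The reshuffle above also makes the !CONFIG_NUMA stub of alloc_alien_cache() return the named BAD_ALIEN_MAGIC constant rather than a bare 0x01020304ul, so init_lock_keys() can recognise and skip the fake pointer. The underlying pattern, inline no-op stubs that keep callers free of #ifdef clutter, looks like this in general (illustrative names only):

    struct example;					/* opaque, illustrative */

    #ifdef CONFIG_EXAMPLE_FEATURE
    int example_feature_prepare(struct example *e);	/* real version elsewhere */
    #else
    static inline int example_feature_prepare(struct example *e)
    {
    	return 0;				/* feature compiled out: no-op */
    }
    #endif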
| @@ -1061,29 +1133,9 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
| 1061 | } | 1133 | } | 
| 1062 | return 1; | 1134 | return 1; | 
| 1063 | } | 1135 | } | 
| 1064 | |||
| 1065 | #else | ||
| 1066 | |||
| 1067 | #define drain_alien_cache(cachep, alien) do { } while (0) | ||
| 1068 | #define reap_alien(cachep, l3) do { } while (0) | ||
| 1069 | |||
| 1070 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | ||
| 1071 | { | ||
| 1072 | return (struct array_cache **) 0x01020304ul; | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | static inline void free_alien_cache(struct array_cache **ac_ptr) | ||
| 1076 | { | ||
| 1077 | } | ||
| 1078 | |||
| 1079 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
| 1080 | { | ||
| 1081 | return 0; | ||
| 1082 | } | ||
| 1083 | |||
| 1084 | #endif | 1136 | #endif | 
| 1085 | 1137 | ||
| 1086 | static int cpuup_callback(struct notifier_block *nfb, | 1138 | static int __cpuinit cpuup_callback(struct notifier_block *nfb, | 
| 1087 | unsigned long action, void *hcpu) | 1139 | unsigned long action, void *hcpu) | 
| 1088 | { | 1140 | { | 
| 1089 | long cpu = (long)hcpu; | 1141 | long cpu = (long)hcpu; | 
| @@ -1250,10 +1302,7 @@ free_array_cache: | |||
| 1250 | l3 = cachep->nodelists[node]; | 1302 | l3 = cachep->nodelists[node]; | 
| 1251 | if (!l3) | 1303 | if (!l3) | 
| 1252 | continue; | 1304 | continue; | 
| 1253 | spin_lock_irq(&l3->list_lock); | 1305 | drain_freelist(cachep, l3, l3->free_objects); | 
| 1254 | /* free slabs belonging to this node */ | ||
| 1255 | __node_shrink(cachep, node); | ||
| 1256 | spin_unlock_irq(&l3->list_lock); | ||
| 1257 | } | 1306 | } | 
| 1258 | mutex_unlock(&cache_chain_mutex); | 1307 | mutex_unlock(&cache_chain_mutex); | 
| 1259 | break; | 1308 | break; | 
| @@ -1265,7 +1314,9 @@ bad: | |||
| 1265 | return NOTIFY_BAD; | 1314 | return NOTIFY_BAD; | 
| 1266 | } | 1315 | } | 
| 1267 | 1316 | ||
| 1268 | static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | 1317 | static struct notifier_block __cpuinitdata cpucache_notifier = { | 
| 1318 | &cpuup_callback, NULL, 0 | ||
| 1319 | }; | ||
| 1269 | 1320 | ||
| 1270 | /* | 1321 | /* | 
| 1271 | * swap the static kmem_list3 with kmalloced memory | 1322 | * swap the static kmem_list3 with kmalloced memory | 
| @@ -1281,6 +1332,11 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, | |||
| 1281 | 1332 | ||
| 1282 | local_irq_disable(); | 1333 | local_irq_disable(); | 
| 1283 | memcpy(ptr, list, sizeof(struct kmem_list3)); | 1334 | memcpy(ptr, list, sizeof(struct kmem_list3)); | 
| 1335 | /* | ||
| 1336 | * Do not assume that spinlocks can be initialized via memcpy: | ||
| 1337 | */ | ||
| 1338 | spin_lock_init(&ptr->list_lock); | ||
| 1339 | |||
| 1284 | MAKE_ALL_LISTS(cachep, ptr, nodeid); | 1340 | MAKE_ALL_LISTS(cachep, ptr, nodeid); | 
| 1285 | cachep->nodelists[nodeid] = ptr; | 1341 | cachep->nodelists[nodeid] = ptr; | 
| 1286 | local_irq_enable(); | 1342 | local_irq_enable(); | 
| @@ -1407,7 +1463,7 @@ void __init kmem_cache_init(void) | |||
| 1407 | } | 1463 | } | 
| 1408 | /* 4) Replace the bootstrap head arrays */ | 1464 | /* 4) Replace the bootstrap head arrays */ | 
| 1409 | { | 1465 | { | 
| 1410 | void *ptr; | 1466 | struct array_cache *ptr; | 
| 1411 | 1467 | ||
| 1412 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1468 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 
| 1413 | 1469 | ||
| @@ -1415,6 +1471,11 @@ void __init kmem_cache_init(void) | |||
| 1415 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); | 1471 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); | 
| 1416 | memcpy(ptr, cpu_cache_get(&cache_cache), | 1472 | memcpy(ptr, cpu_cache_get(&cache_cache), | 
| 1417 | sizeof(struct arraycache_init)); | 1473 | sizeof(struct arraycache_init)); | 
| 1474 | /* | ||
| 1475 | * Do not assume that spinlocks can be initialized via memcpy: | ||
| 1476 | */ | ||
| 1477 | spin_lock_init(&ptr->lock); | ||
| 1478 | |||
| 1418 | cache_cache.array[smp_processor_id()] = ptr; | 1479 | cache_cache.array[smp_processor_id()] = ptr; | 
| 1419 | local_irq_enable(); | 1480 | local_irq_enable(); | 
| 1420 | 1481 | ||
| @@ -1425,6 +1486,11 @@ void __init kmem_cache_init(void) | |||
| 1425 | != &initarray_generic.cache); | 1486 | != &initarray_generic.cache); | 
| 1426 | memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), | 1487 | memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), | 
| 1427 | sizeof(struct arraycache_init)); | 1488 | sizeof(struct arraycache_init)); | 
| 1489 | /* | ||
| 1490 | * Do not assume that spinlocks can be initialized via memcpy: | ||
| 1491 | */ | ||
| 1492 | spin_lock_init(&ptr->lock); | ||
| 1493 | |||
| 1428 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = | 1494 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = | 
| 1429 | ptr; | 1495 | ptr; | 
| 1430 | local_irq_enable(); | 1496 | local_irq_enable(); | 
| @@ -1453,10 +1519,15 @@ void __init kmem_cache_init(void) | |||
| 1453 | struct kmem_cache *cachep; | 1519 | struct kmem_cache *cachep; | 
| 1454 | mutex_lock(&cache_chain_mutex); | 1520 | mutex_lock(&cache_chain_mutex); | 
| 1455 | list_for_each_entry(cachep, &cache_chain, next) | 1521 | list_for_each_entry(cachep, &cache_chain, next) | 
| 1456 | enable_cpucache(cachep); | 1522 | if (enable_cpucache(cachep)) | 
| 1523 | BUG(); | ||
| 1457 | mutex_unlock(&cache_chain_mutex); | 1524 | mutex_unlock(&cache_chain_mutex); | 
| 1458 | } | 1525 | } | 
| 1459 | 1526 | ||
| 1527 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
| 1528 | init_lock_keys(); | ||
| 1529 | |||
| 1530 | |||
| 1460 | /* Done! */ | 1531 | /* Done! */ | 
| 1461 | g_cpucache_up = FULL; | 1532 | g_cpucache_up = FULL; | 
| 1462 | 1533 | ||
| @@ -1505,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1505 | */ | 1576 | */ | 
| 1506 | flags |= __GFP_COMP; | 1577 | flags |= __GFP_COMP; | 
| 1507 | #endif | 1578 | #endif | 
| 1508 | flags |= cachep->gfpflags; | 1579 | |
| 1580 | /* | ||
| 1581 | * Under NUMA we want memory on the indicated node. We will handle | ||
| 1582 | * the needed fallback ourselves since we want to serve from our | ||
| 1583 | * per node object lists first for other nodes. | ||
| 1584 | */ | ||
| 1585 | flags |= cachep->gfpflags | GFP_THISNODE; | ||
| 1509 | 1586 | ||
| 1510 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1587 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 
| 1511 | if (!page) | 1588 | if (!page) | 
| @@ -1513,8 +1590,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1513 | 1590 | ||
| 1514 | nr_pages = (1 << cachep->gfporder); | 1591 | nr_pages = (1 << cachep->gfporder); | 
| 1515 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1592 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 
| 1516 | atomic_add(nr_pages, &slab_reclaim_pages); | 1593 | add_zone_page_state(page_zone(page), | 
| 1517 | add_page_state(nr_slab, nr_pages); | 1594 | NR_SLAB_RECLAIMABLE, nr_pages); | 
| 1595 | else | ||
| 1596 | add_zone_page_state(page_zone(page), | ||
| 1597 | NR_SLAB_UNRECLAIMABLE, nr_pages); | ||
| 1518 | for (i = 0; i < nr_pages; i++) | 1598 | for (i = 0; i < nr_pages; i++) | 
| 1519 | __SetPageSlab(page + i); | 1599 | __SetPageSlab(page + i); | 
| 1520 | return page_address(page); | 1600 | return page_address(page); | 
| @@ -1529,17 +1609,20 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
| 1529 | struct page *page = virt_to_page(addr); | 1609 | struct page *page = virt_to_page(addr); | 
| 1530 | const unsigned long nr_freed = i; | 1610 | const unsigned long nr_freed = i; | 
| 1531 | 1611 | ||
| 1612 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
| 1613 | sub_zone_page_state(page_zone(page), | ||
| 1614 | NR_SLAB_RECLAIMABLE, nr_freed); | ||
| 1615 | else | ||
| 1616 | sub_zone_page_state(page_zone(page), | ||
| 1617 | NR_SLAB_UNRECLAIMABLE, nr_freed); | ||
| 1532 | while (i--) { | 1618 | while (i--) { | 
| 1533 | BUG_ON(!PageSlab(page)); | 1619 | BUG_ON(!PageSlab(page)); | 
| 1534 | __ClearPageSlab(page); | 1620 | __ClearPageSlab(page); | 
| 1535 | page++; | 1621 | page++; | 
| 1536 | } | 1622 | } | 
| 1537 | sub_page_state(nr_slab, nr_freed); | ||
| 1538 | if (current->reclaim_state) | 1623 | if (current->reclaim_state) | 
| 1539 | current->reclaim_state->reclaimed_slab += nr_freed; | 1624 | current->reclaim_state->reclaimed_slab += nr_freed; | 
| 1540 | free_pages((unsigned long)addr, cachep->gfporder); | 1625 | free_pages((unsigned long)addr, cachep->gfporder); | 
| 1541 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
| 1542 | atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); | ||
| 1543 | } | 1626 | } | 
| 1544 | 1627 | ||
| 1545 | static void kmem_rcu_free(struct rcu_head *head) | 1628 | static void kmem_rcu_free(struct rcu_head *head) | 
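kmem_getpages() and kmem_freepages() above now account slab pages per zone, and both sides must derive the counter, NR_SLAB_RECLAIMABLE or NR_SLAB_UNRECLAIMABLE, from the same SLAB_RECLAIM_ACCOUNT flag or the statistics drift. A hedged helper sketch that captures the pairing (the helper is invented, not part of the patch):

    /* Illustrative only: one place to pick the counter for both paths. */
    static inline void example_mod_slab_state(struct kmem_cache *cachep,
    					  struct page *page, long nr_pages)
    {
    	enum zone_stat_item item = (cachep->flags & SLAB_RECLAIM_ACCOUNT) ?
    			NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE;

    	if (nr_pages >= 0)
    		add_zone_page_state(page_zone(page), item, nr_pages);
    	else
    		sub_zone_page_state(page_zone(page), item, -nr_pages);
    }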
| @@ -1600,10 +1683,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) | |||
| 1600 | static void dump_line(char *data, int offset, int limit) | 1683 | static void dump_line(char *data, int offset, int limit) | 
| 1601 | { | 1684 | { | 
| 1602 | int i; | 1685 | int i; | 
| 1686 | unsigned char error = 0; | ||
| 1687 | int bad_count = 0; | ||
| 1688 | |||
| 1603 | printk(KERN_ERR "%03x:", offset); | 1689 | printk(KERN_ERR "%03x:", offset); | 
| 1604 | for (i = 0; i < limit; i++) | 1690 | for (i = 0; i < limit; i++) { | 
| 1691 | if (data[offset + i] != POISON_FREE) { | ||
| 1692 | error = data[offset + i]; | ||
| 1693 | bad_count++; | ||
| 1694 | } | ||
| 1605 | printk(" %02x", (unsigned char)data[offset + i]); | 1695 | printk(" %02x", (unsigned char)data[offset + i]); | 
| 1696 | } | ||
| 1606 | printk("\n"); | 1697 | printk("\n"); | 
| 1698 | |||
| 1699 | if (bad_count == 1) { | ||
| 1700 | error ^= POISON_FREE; | ||
| 1701 | if (!(error & (error - 1))) { | ||
| 1702 | printk(KERN_ERR "Single bit error detected. Probably " | ||
| 1703 | "bad RAM.\n"); | ||
| 1704 | #ifdef CONFIG_X86 | ||
| 1705 | printk(KERN_ERR "Run memtest86+ or a similar memory " | ||
| 1706 | "test tool.\n"); | ||
| 1707 | #else | ||
| 1708 | printk(KERN_ERR "Run a memory test tool.\n"); | ||
| 1709 | #endif | ||
| 1710 | } | ||
| 1711 | } | ||
| 1607 | } | 1712 | } | 
| 1608 | #endif | 1713 | #endif | 
| 1609 | 1714 | ||
| @@ -1796,6 +1901,27 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) | |||
| 1796 | } | 1901 | } | 
| 1797 | } | 1902 | } | 
| 1798 | 1903 | ||
| 1904 | static void __kmem_cache_destroy(struct kmem_cache *cachep) | ||
| 1905 | { | ||
| 1906 | int i; | ||
| 1907 | struct kmem_list3 *l3; | ||
| 1908 | |||
| 1909 | for_each_online_cpu(i) | ||
| 1910 | kfree(cachep->array[i]); | ||
| 1911 | |||
| 1912 | /* NUMA: free the list3 structures */ | ||
| 1913 | for_each_online_node(i) { | ||
| 1914 | l3 = cachep->nodelists[i]; | ||
| 1915 | if (l3) { | ||
| 1916 | kfree(l3->shared); | ||
| 1917 | free_alien_cache(l3->alien); | ||
| 1918 | kfree(l3); | ||
| 1919 | } | ||
| 1920 | } | ||
| 1921 | kmem_cache_free(&cache_cache, cachep); | ||
| 1922 | } | ||
| 1923 | |||
| 1924 | |||
| 1799 | /** | 1925 | /** | 
| 1800 | * calculate_slab_order - calculate size (page order) of slabs | 1926 | * calculate_slab_order - calculate size (page order) of slabs | 
| 1801 | * @cachep: pointer to the cache that is being created | 1927 | * @cachep: pointer to the cache that is being created | 
| @@ -1866,12 +1992,11 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
| 1866 | return left_over; | 1992 | return left_over; | 
| 1867 | } | 1993 | } | 
| 1868 | 1994 | ||
| 1869 | static void setup_cpu_cache(struct kmem_cache *cachep) | 1995 | static int setup_cpu_cache(struct kmem_cache *cachep) | 
| 1870 | { | 1996 | { | 
| 1871 | if (g_cpucache_up == FULL) { | 1997 | if (g_cpucache_up == FULL) | 
| 1872 | enable_cpucache(cachep); | 1998 | return enable_cpucache(cachep); | 
| 1873 | return; | 1999 | |
| 1874 | } | ||
| 1875 | if (g_cpucache_up == NONE) { | 2000 | if (g_cpucache_up == NONE) { | 
| 1876 | /* | 2001 | /* | 
| 1877 | * Note: the first kmem_cache_create must create the cache | 2002 | * Note: the first kmem_cache_create must create the cache | 
| @@ -1918,6 +2043,7 @@ static void setup_cpu_cache(struct kmem_cache *cachep) | |||
| 1918 | cpu_cache_get(cachep)->touched = 0; | 2043 | cpu_cache_get(cachep)->touched = 0; | 
| 1919 | cachep->batchcount = 1; | 2044 | cachep->batchcount = 1; | 
| 1920 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | 2045 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | 
| 2046 | return 0; | ||
| 1921 | } | 2047 | } | 
| 1922 | 2048 | ||
| 1923 | /** | 2049 | /** | 
| @@ -2059,6 +2185,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2059 | } else { | 2185 | } else { | 
| 2060 | ralign = BYTES_PER_WORD; | 2186 | ralign = BYTES_PER_WORD; | 
| 2061 | } | 2187 | } | 
| 2188 | |||
| 2189 | /* | ||
| 2190 | * Redzoning and user store require word alignment. Note this will be | ||
| 2191 | * overridden by architecture or caller mandated alignment if either | ||
| 2192 | * is greater than BYTES_PER_WORD. | ||
| 2193 | */ | ||
| 2194 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) | ||
| 2195 | ralign = BYTES_PER_WORD; | ||
| 2196 | |||
| 2062 | /* 2) arch mandated alignment: disables debug if necessary */ | 2197 | /* 2) arch mandated alignment: disables debug if necessary */ | 
| 2063 | if (ralign < ARCH_SLAB_MINALIGN) { | 2198 | if (ralign < ARCH_SLAB_MINALIGN) { | 
| 2064 | ralign = ARCH_SLAB_MINALIGN; | 2199 | ralign = ARCH_SLAB_MINALIGN; | 
| @@ -2072,8 +2207,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2072 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 2207 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 
| 2073 | } | 2208 | } | 
| 2074 | /* | 2209 | /* | 
| 2075 | * 4) Store it. Note that the debug code below can reduce | 2210 | * 4) Store it. | 
| 2076 | * the alignment to BYTES_PER_WORD. | ||
| 2077 | */ | 2211 | */ | 
| 2078 | align = ralign; | 2212 | align = ralign; | 
| 2079 | 2213 | ||
| @@ -2085,20 +2219,19 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2085 | #if DEBUG | 2219 | #if DEBUG | 
| 2086 | cachep->obj_size = size; | 2220 | cachep->obj_size = size; | 
| 2087 | 2221 | ||
| 2222 | /* | ||
| 2223 | * Both debugging options require word-alignment which is calculated | ||
| 2224 | * into align above. | ||
| 2225 | */ | ||
| 2088 | if (flags & SLAB_RED_ZONE) { | 2226 | if (flags & SLAB_RED_ZONE) { | 
| 2089 | /* redzoning only works with word aligned caches */ | ||
| 2090 | align = BYTES_PER_WORD; | ||
| 2091 | |||
| 2092 | /* add space for red zone words */ | 2227 | /* add space for red zone words */ | 
| 2093 | cachep->obj_offset += BYTES_PER_WORD; | 2228 | cachep->obj_offset += BYTES_PER_WORD; | 
| 2094 | size += 2 * BYTES_PER_WORD; | 2229 | size += 2 * BYTES_PER_WORD; | 
| 2095 | } | 2230 | } | 
| 2096 | if (flags & SLAB_STORE_USER) { | 2231 | if (flags & SLAB_STORE_USER) { | 
| 2097 | /* user store requires word alignment and | 2232 | /* user store requires one word storage behind the end of | 
| 2098 | * one word storage behind the end of the real | 2233 | * the real object. | 
| 2099 | * object. | ||
| 2100 | */ | 2234 | */ | 
| 2101 | align = BYTES_PER_WORD; | ||
| 2102 | size += BYTES_PER_WORD; | 2235 | size += BYTES_PER_WORD; | 
| 2103 | } | 2236 | } | 
| 2104 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2237 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 
| @@ -2162,14 +2295,26 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2162 | cachep->gfpflags |= GFP_DMA; | 2295 | cachep->gfpflags |= GFP_DMA; | 
| 2163 | cachep->buffer_size = size; | 2296 | cachep->buffer_size = size; | 
| 2164 | 2297 | ||
| 2165 | if (flags & CFLGS_OFF_SLAB) | 2298 | if (flags & CFLGS_OFF_SLAB) { | 
| 2166 | cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); | 2299 | cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); | 
| 2300 | /* | ||
| 2301 | * This is a possibility for one of the malloc_sizes caches. | ||
| 2302 | * But since we go off slab only for object size greater than | ||
| 2303 | * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, | ||
| 2304 | * this should not happen at all. | ||
| 2305 | * But leave a BUG_ON for some lucky dude. | ||
| 2306 | */ | ||
| 2307 | BUG_ON(!cachep->slabp_cache); | ||
| 2308 | } | ||
| 2167 | cachep->ctor = ctor; | 2309 | cachep->ctor = ctor; | 
| 2168 | cachep->dtor = dtor; | 2310 | cachep->dtor = dtor; | 
| 2169 | cachep->name = name; | 2311 | cachep->name = name; | 
| 2170 | 2312 | ||
| 2171 | 2313 | if (setup_cpu_cache(cachep)) { | |
| 2172 | setup_cpu_cache(cachep); | 2314 | __kmem_cache_destroy(cachep); | 
| 2315 | cachep = NULL; | ||
| 2316 | goto oops; | ||
| 2317 | } | ||
| 2173 | 2318 | ||
| 2174 | /* cache setup completed, link it into the list */ | 2319 | /* cache setup completed, link it into the list */ | 
| 2175 | list_add(&cachep->next, &cache_chain); | 2320 | list_add(&cachep->next, &cache_chain); | 
| @@ -2255,32 +2400,45 @@ static void drain_cpu_caches(struct kmem_cache *cachep) | |||
| 2255 | } | 2400 | } | 
| 2256 | } | 2401 | } | 
| 2257 | 2402 | ||
| 2258 | static int __node_shrink(struct kmem_cache *cachep, int node) | 2403 | /* | 
| 2404 | * Remove slabs from the list of free slabs. | ||
| 2405 | * Specify the number of slabs to drain in tofree. | ||
| 2406 | * | ||
| 2407 | * Returns the actual number of slabs released. | ||
| 2408 | */ | ||
| 2409 | static int drain_freelist(struct kmem_cache *cache, | ||
| 2410 | struct kmem_list3 *l3, int tofree) | ||
| 2259 | { | 2411 | { | 
| 2412 | struct list_head *p; | ||
| 2413 | int nr_freed; | ||
| 2260 | struct slab *slabp; | 2414 | struct slab *slabp; | 
| 2261 | struct kmem_list3 *l3 = cachep->nodelists[node]; | ||
| 2262 | int ret; | ||
| 2263 | 2415 | ||
| 2264 | for (;;) { | 2416 | nr_freed = 0; | 
| 2265 | struct list_head *p; | 2417 | while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { | 
| 2266 | 2418 | ||
| 2419 | spin_lock_irq(&l3->list_lock); | ||
| 2267 | p = l3->slabs_free.prev; | 2420 | p = l3->slabs_free.prev; | 
| 2268 | if (p == &l3->slabs_free) | 2421 | if (p == &l3->slabs_free) { | 
| 2269 | break; | 2422 | spin_unlock_irq(&l3->list_lock); | 
| 2423 | goto out; | ||
| 2424 | } | ||
| 2270 | 2425 | ||
| 2271 | slabp = list_entry(l3->slabs_free.prev, struct slab, list); | 2426 | slabp = list_entry(p, struct slab, list); | 
| 2272 | #if DEBUG | 2427 | #if DEBUG | 
| 2273 | BUG_ON(slabp->inuse); | 2428 | BUG_ON(slabp->inuse); | 
| 2274 | #endif | 2429 | #endif | 
| 2275 | list_del(&slabp->list); | 2430 | list_del(&slabp->list); | 
| 2276 | 2431 | /* | |
| 2277 | l3->free_objects -= cachep->num; | 2432 | * Safe to drop the lock. The slab is no longer linked | 
| 2433 | * to the cache. | ||
| 2434 | */ | ||
| 2435 | l3->free_objects -= cache->num; | ||
| 2278 | spin_unlock_irq(&l3->list_lock); | 2436 | spin_unlock_irq(&l3->list_lock); | 
| 2279 | slab_destroy(cachep, slabp); | 2437 | slab_destroy(cache, slabp); | 
| 2280 | spin_lock_irq(&l3->list_lock); | 2438 | nr_freed++; | 
| 2281 | } | 2439 | } | 
| 2282 | ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); | 2440 | out: | 
| 2283 | return ret; | 2441 | return nr_freed; | 
| 2284 | } | 2442 | } | 
| 2285 | 2443 | ||
| 2286 | static int __cache_shrink(struct kmem_cache *cachep) | 2444 | static int __cache_shrink(struct kmem_cache *cachep) | 
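drain_freelist() above replaces __node_shrink(): it unlinks at most tofree completely free slabs from l3->slabs_free, drops the list lock around each slab_destroy(), and returns how many slabs it actually released. Callers can drain everything, as the CPU-down path does by passing l3->free_objects, or trim a bounded amount; a hedged sketch of a bounded trim (the function and the limit of 8 slabs are illustrative, not taken from this patch):

    /* Illustrative only: release at most a few free slabs on this node. */
    static int example_trim_node(struct kmem_cache *cachep, struct kmem_list3 *l3)
    {
    	return drain_freelist(cachep, l3, 8) != 0;	/* 1 if anything freed */
    }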
| @@ -2293,11 +2451,13 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
| 2293 | check_irq_on(); | 2451 | check_irq_on(); | 
| 2294 | for_each_online_node(i) { | 2452 | for_each_online_node(i) { | 
| 2295 | l3 = cachep->nodelists[i]; | 2453 | l3 = cachep->nodelists[i]; | 
| 2296 | if (l3) { | 2454 | if (!l3) | 
| 2297 | spin_lock_irq(&l3->list_lock); | 2455 | continue; | 
| 2298 | ret += __node_shrink(cachep, i); | 2456 | |
| 2299 | spin_unlock_irq(&l3->list_lock); | 2457 | drain_freelist(cachep, l3, l3->free_objects); | 
| 2300 | } | 2458 | |
| 2459 | ret += !list_empty(&l3->slabs_full) || | ||
| 2460 | !list_empty(&l3->slabs_partial); | ||
| 2301 | } | 2461 | } | 
| 2302 | return (ret ? 1 : 0); | 2462 | return (ret ? 1 : 0); | 
| 2303 | } | 2463 | } | 
| @@ -2322,7 +2482,6 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
| 2322 | * @cachep: the cache to destroy | 2482 | * @cachep: the cache to destroy | 
| 2323 | * | 2483 | * | 
| 2324 | * Remove a struct kmem_cache object from the slab cache. | 2484 | * Remove a struct kmem_cache object from the slab cache. | 
| 2325 | * Returns 0 on success. | ||
| 2326 | * | 2485 | * | 
| 2327 | * It is expected this function will be called by a module when it is | 2486 | * It is expected this function will be called by a module when it is | 
| 2328 | * unloaded. This will remove the cache completely, and avoid a duplicate | 2487 | * unloaded. This will remove the cache completely, and avoid a duplicate | 
| @@ -2334,11 +2493,8 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
| 2334 | * The caller must guarantee that no one will allocate memory from the cache | 2493 | * The caller must guarantee that no one will allocate memory from the cache |
| 2335 | * during the kmem_cache_destroy(). | 2494 | * during the kmem_cache_destroy(). | 
| 2336 | */ | 2495 | */ | 
| 2337 | int kmem_cache_destroy(struct kmem_cache *cachep) | 2496 | void kmem_cache_destroy(struct kmem_cache *cachep) | 
| 2338 | { | 2497 | { | 
| 2339 | int i; | ||
| 2340 | struct kmem_list3 *l3; | ||
| 2341 | |||
| 2342 | BUG_ON(!cachep || in_interrupt()); | 2498 | BUG_ON(!cachep || in_interrupt()); | 
| 2343 | 2499 | ||
| 2344 | /* Don't let CPUs to come and go */ | 2500 | /* Don't let CPUs to come and go */ | 
| @@ -2358,31 +2514,28 @@ int kmem_cache_destroy(struct kmem_cache *cachep) | |||
| 2358 | list_add(&cachep->next, &cache_chain); | 2514 | list_add(&cachep->next, &cache_chain); | 
| 2359 | mutex_unlock(&cache_chain_mutex); | 2515 | mutex_unlock(&cache_chain_mutex); | 
| 2360 | unlock_cpu_hotplug(); | 2516 | unlock_cpu_hotplug(); | 
| 2361 | return 1; | 2517 | return; | 
| 2362 | } | 2518 | } | 
| 2363 | 2519 | ||
| 2364 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 2520 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 
| 2365 | synchronize_rcu(); | 2521 | synchronize_rcu(); | 
| 2366 | 2522 | ||
| 2367 | for_each_online_cpu(i) | 2523 | __kmem_cache_destroy(cachep); | 
| 2368 | kfree(cachep->array[i]); | ||
| 2369 | |||
| 2370 | /* NUMA: free the list3 structures */ | ||
| 2371 | for_each_online_node(i) { | ||
| 2372 | l3 = cachep->nodelists[i]; | ||
| 2373 | if (l3) { | ||
| 2374 | kfree(l3->shared); | ||
| 2375 | free_alien_cache(l3->alien); | ||
| 2376 | kfree(l3); | ||
| 2377 | } | ||
| 2378 | } | ||
| 2379 | kmem_cache_free(&cache_cache, cachep); | ||
| 2380 | unlock_cpu_hotplug(); | 2524 | unlock_cpu_hotplug(); | 
| 2381 | return 0; | ||
| 2382 | } | 2525 | } | 
| 2383 | EXPORT_SYMBOL(kmem_cache_destroy); | 2526 | EXPORT_SYMBOL(kmem_cache_destroy); | 
| 2384 | 2527 | ||
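kmem_cache_destroy() now returns void, and both it and the setup_cpu_cache() failure path in kmem_cache_create() funnel into __kmem_cache_destroy(), so the teardown of per-CPU arrays and per-node lists cannot diverge between the two callers. A hedged userspace sketch of that shape follows; demo_cache, the sizes, and every helper name are invented for the example, not the kernel's.

#include <stdlib.h>

#define NR_CPUS_DEMO	4
#define NR_NODES_DEMO	2

struct demo_cache {
	void *array[NR_CPUS_DEMO];	/* per-cpu caches */
	void *nodelists[NR_NODES_DEMO];	/* per-node lists */
};

/* One shared teardown, used by the failed-create path and by destroy. */
static void __demo_cache_destroy(struct demo_cache *c)
{
	int i;

	for (i = 0; i < NR_CPUS_DEMO; i++)
		free(c->array[i]);
	for (i = 0; i < NR_NODES_DEMO; i++)
		free(c->nodelists[i]);
	free(c);
}

static struct demo_cache *demo_cache_create(int fail_cpu_setup)
{
	struct demo_cache *c = calloc(1, sizeof(*c));
	int i;

	if (!c)
		return NULL;
	for (i = 0; i < NR_NODES_DEMO; i++)
		c->nodelists[i] = calloc(1, 64);
	if (fail_cpu_setup) {		/* mirrors setup_cpu_cache() failing */
		__demo_cache_destroy(c);
		return NULL;
	}
	for (i = 0; i < NR_CPUS_DEMO; i++)
		c->array[i] = calloc(1, 64);
	return c;
}

/* Destroy no longer reports a result, matching the int -> void change. */
static void demo_cache_destroy(struct demo_cache *c)
{
	__demo_cache_destroy(c);
}

int main(void)
{
	struct demo_cache *c = demo_cache_create(0);
	if (c)
		demo_cache_destroy(c);
	return 0;
}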
| 2385 | /* Get the memory for a slab management obj. */ | 2528 | /* | 
| 2529 | * Get the memory for a slab management obj. | ||
| 2530 | * For a slab cache when the slab descriptor is off-slab, slab descriptors | ||
| 2531 | * always come from malloc_sizes caches. The slab descriptor cannot | ||
| 2532 | * come from the same cache which is getting created because, | ||
| 2533 | * when we are searching for an appropriate cache for these | ||
| 2534 | * descriptors in kmem_cache_create, we search through the malloc_sizes array. | ||
| 2535 | * If we are creating a malloc_sizes cache here it would not be visible to | ||
| 2536 | * kmem_find_general_cachep till the initialization is complete. | ||
| 2537 | * Hence we cannot have slabp_cache same as the original cache. | ||
| 2538 | */ | ||
| 2386 | static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | 2539 | static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | 
| 2387 | int colour_off, gfp_t local_flags, | 2540 | int colour_off, gfp_t local_flags, | 
| 2388 | int nodeid) | 2541 | int nodeid) | 
| @@ -2915,14 +3068,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 2915 | void *objp; | 3068 | void *objp; | 
| 2916 | struct array_cache *ac; | 3069 | struct array_cache *ac; | 
| 2917 | 3070 | ||
| 2918 | #ifdef CONFIG_NUMA | ||
| 2919 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { | ||
| 2920 | objp = alternate_node_alloc(cachep, flags); | ||
| 2921 | if (objp != NULL) | ||
| 2922 | return objp; | ||
| 2923 | } | ||
| 2924 | #endif | ||
| 2925 | |||
| 2926 | check_irq_off(); | 3071 | check_irq_off(); | 
| 2927 | ac = cpu_cache_get(cachep); | 3072 | ac = cpu_cache_get(cachep); | 
| 2928 | if (likely(ac->avail)) { | 3073 | if (likely(ac->avail)) { | 
| @@ -2940,12 +3085,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, | |||
| 2940 | gfp_t flags, void *caller) | 3085 | gfp_t flags, void *caller) | 
| 2941 | { | 3086 | { | 
| 2942 | unsigned long save_flags; | 3087 | unsigned long save_flags; | 
| 2943 | void *objp; | 3088 | void *objp = NULL; | 
| 2944 | 3089 | ||
| 2945 | cache_alloc_debugcheck_before(cachep, flags); | 3090 | cache_alloc_debugcheck_before(cachep, flags); | 
| 2946 | 3091 | ||
| 2947 | local_irq_save(save_flags); | 3092 | local_irq_save(save_flags); | 
| 2948 | objp = ____cache_alloc(cachep, flags); | 3093 | |
| 3094 | if (unlikely(NUMA_BUILD && | ||
| 3095 | current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) | ||
| 3096 | objp = alternate_node_alloc(cachep, flags); | ||
| 3097 | |||
| 3098 | if (!objp) | ||
| 3099 | objp = ____cache_alloc(cachep, flags); | ||
| 3100 | /* | ||
| 3101 | * We may just have run out of memory on the local node. | ||
| 3102 | * __cache_alloc_node() knows how to locate memory on other nodes | ||
| 3103 | */ | ||
| 3104 | if (NUMA_BUILD && !objp) | ||
| 3105 | objp = __cache_alloc_node(cachep, flags, numa_node_id()); | ||
| 2949 | local_irq_restore(save_flags); | 3106 | local_irq_restore(save_flags); | 
| 2950 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 3107 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 
| 2951 | caller); | 3108 | caller); | 
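With this hunk the NUMA policy check moves out of ____cache_alloc() and into __cache_alloc(), giving one fixed ordering: the policy-selected node first (for PF_SPREAD_SLAB/PF_MEMPOLICY tasks), then the per-CPU array cache, then other nodes if the local node is out of memory. A sketch of that ordering with stubbed-out helpers; every name below is an invented stand-in, not a kernel function.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool policy_spreads_allocations(void) { return false; }	/* PF_SPREAD_SLAB | PF_MEMPOLICY */
static void *alloc_from_policy_node(void)    { return NULL; }
static void *alloc_from_cpu_cache(void)      { return malloc(32); }
static void *alloc_from_other_nodes(void)    { return malloc(32); }

static void *tiered_alloc(void)
{
	void *obj = NULL;

	if (policy_spreads_allocations())
		obj = alloc_from_policy_node();	/* alternate_node_alloc() */
	if (!obj)
		obj = alloc_from_cpu_cache();	/* ____cache_alloc() */
	if (!obj)				/* local node exhausted */
		obj = alloc_from_other_nodes();	/* __cache_alloc_node() */
	return obj;
}

int main(void)
{
	void *p = tiered_alloc();
	printf("got %p\n", p);
	free(p);
	return 0;
}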
| @@ -2964,7 +3121,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 2964 | { | 3121 | { | 
| 2965 | int nid_alloc, nid_here; | 3122 | int nid_alloc, nid_here; | 
| 2966 | 3123 | ||
| 2967 | if (in_interrupt()) | 3124 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 
| 2968 | return NULL; | 3125 | return NULL; | 
| 2969 | nid_alloc = nid_here = numa_node_id(); | 3126 | nid_alloc = nid_here = numa_node_id(); | 
| 2970 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3127 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 
| @@ -2977,6 +3134,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 2977 | } | 3134 | } | 
| 2978 | 3135 | ||
| 2979 | /* | 3136 | /* | 
| 3137 | * Fallback function if there was no memory available and no objects on a | ||
| 3138 | * certain node and we are allowed to fall back. We mimic the behavior of | ||
| 3139 | * the page allocator. We fall back according to a zonelist determined by | ||
| 3140 | * the policy layer while obeying cpuset constraints. | ||
| 3141 | */ | ||
| 3142 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | ||
| 3143 | { | ||
| 3144 | struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) | ||
| 3145 | ->node_zonelists[gfp_zone(flags)]; | ||
| 3146 | struct zone **z; | ||
| 3147 | void *obj = NULL; | ||
| 3148 | |||
| 3149 | for (z = zonelist->zones; *z && !obj; z++) | ||
| 3150 | if (zone_idx(*z) <= ZONE_NORMAL && | ||
| 3151 | cpuset_zone_allowed(*z, flags)) | ||
| 3152 | obj = __cache_alloc_node(cache, | ||
| 3153 | flags | __GFP_THISNODE, | ||
| 3154 | zone_to_nid(*z)); | ||
| 3155 | return obj; | ||
| 3156 | } | ||
| 3157 | |||
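fallback_alloc() above mirrors the page allocator: walk the zonelist chosen by the memory policy, skip zones the cpuset forbids, and take the first node that can satisfy a __GFP_THISNODE allocation. A compressed userspace model of that walk follows; the zone count, the "cpuset" rule, and all helpers are made up for the example.

#include <stdlib.h>

#define DEMO_ZONES 3

static int zone_allowed(int zone)	{ return zone != 1; }	/* pretend the cpuset bans zone 1 */
static int zone_to_node_demo(int zone)	{ return zone; }	/* 1:1 mapping for the demo */
static void *alloc_on_node(int node)	{ return node == 2 ? malloc(32) : NULL; }

static void *fallback_alloc_demo(void)
{
	void *obj = NULL;
	int z;

	for (z = 0; z < DEMO_ZONES && !obj; z++)
		if (zone_allowed(z))
			obj = alloc_on_node(zone_to_node_demo(z));
	return obj;	/* NULL only if every allowed node failed */
}

int main(void)
{
	void *obj = fallback_alloc_demo();
	free(obj);
	return obj ? 0 : 1;
}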
| 3158 | /* | ||
| 2980 | * A interface to enable slab creation on nodeid | 3159 | * A interface to enable slab creation on nodeid | 
| 2981 | */ | 3160 | */ | 
| 2982 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 3161 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 
| @@ -3029,11 +3208,15 @@ retry: | |||
| 3029 | must_grow: | 3208 | must_grow: | 
| 3030 | spin_unlock(&l3->list_lock); | 3209 | spin_unlock(&l3->list_lock); | 
| 3031 | x = cache_grow(cachep, flags, nodeid); | 3210 | x = cache_grow(cachep, flags, nodeid); | 
| 3211 | if (x) | ||
| 3212 | goto retry; | ||
| 3032 | 3213 | ||
| 3033 | if (!x) | 3214 | if (!(flags & __GFP_THISNODE)) | 
| 3034 | return NULL; | 3215 | /* Unable to grow the cache. Fall back to other nodes. */ | 
| 3216 | return fallback_alloc(cachep, flags); | ||
| 3217 | |||
| 3218 | return NULL; | ||
| 3035 | 3219 | ||
| 3036 | goto retry; | ||
| 3037 | done: | 3220 | done: | 
| 3038 | return obj; | 3221 | return obj; | 
| 3039 | } | 3222 | } | 
| @@ -3066,6 +3249,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
| 3066 | if (slabp->inuse == 0) { | 3249 | if (slabp->inuse == 0) { | 
| 3067 | if (l3->free_objects > l3->free_limit) { | 3250 | if (l3->free_objects > l3->free_limit) { | 
| 3068 | l3->free_objects -= cachep->num; | 3251 | l3->free_objects -= cachep->num; | 
| 3252 | /* No need to drop any previously held | ||
| 3253 | * lock here, even if we have an off-slab slab | ||
| 3254 | * descriptor it is guaranteed to come from | ||
| 3255 | * a different cache, refer to comments before | ||
| 3256 | * alloc_slabmgmt. | ||
| 3257 | */ | ||
| 3069 | slab_destroy(cachep, slabp); | 3258 | slab_destroy(cachep, slabp); | 
| 3070 | } else { | 3259 | } else { | 
| 3071 | list_add(&slabp->list, &l3->slabs_free); | 3260 | list_add(&slabp->list, &l3->slabs_free); | 
| @@ -3171,7 +3360,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3171 | EXPORT_SYMBOL(kmem_cache_alloc); | 3360 | EXPORT_SYMBOL(kmem_cache_alloc); | 
| 3172 | 3361 | ||
| 3173 | /** | 3362 | /** | 
| 3174 | * kmem_cache_alloc - Allocate an object. The memory is set to zero. | 3363 | * kmem_cache_zalloc - Allocate an object. The memory is set to zero. | 
| 3175 | * @cache: The cache to allocate from. | 3364 | * @cache: The cache to allocate from. | 
| 3176 | * @flags: See kmalloc(). | 3365 | * @flags: See kmalloc(). | 
| 3177 | * | 3366 | * | 
| @@ -3264,7 +3453,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 3264 | } | 3453 | } | 
| 3265 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3454 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 
| 3266 | 3455 | ||
| 3267 | void *kmalloc_node(size_t size, gfp_t flags, int node) | 3456 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 
| 3268 | { | 3457 | { | 
| 3269 | struct kmem_cache *cachep; | 3458 | struct kmem_cache *cachep; | 
| 3270 | 3459 | ||
| @@ -3273,7 +3462,7 @@ void *kmalloc_node(size_t size, gfp_t flags, int node) | |||
| 3273 | return NULL; | 3462 | return NULL; | 
| 3274 | return kmem_cache_alloc_node(cachep, flags, node); | 3463 | return kmem_cache_alloc_node(cachep, flags, node); | 
| 3275 | } | 3464 | } | 
| 3276 | EXPORT_SYMBOL(kmalloc_node); | 3465 | EXPORT_SYMBOL(__kmalloc_node); | 
| 3277 | #endif | 3466 | #endif | 
| 3278 | 3467 | ||
| 3279 | /** | 3468 | /** | 
| @@ -3317,55 +3506,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) | |||
| 3317 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3506 | EXPORT_SYMBOL(__kmalloc_track_caller); | 
| 3318 | #endif | 3507 | #endif | 
| 3319 | 3508 | ||
| 3320 | #ifdef CONFIG_SMP | ||
| 3321 | /** | ||
| 3322 | * __alloc_percpu - allocate one copy of the object for every present | ||
| 3323 | * cpu in the system, zeroing them. | ||
| 3324 | * Objects should be dereferenced using the per_cpu_ptr macro only. | ||
| 3325 | * | ||
| 3326 | * @size: how many bytes of memory are required. | ||
| 3327 | */ | ||
| 3328 | void *__alloc_percpu(size_t size) | ||
| 3329 | { | ||
| 3330 | int i; | ||
| 3331 | struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); | ||
| 3332 | |||
| 3333 | if (!pdata) | ||
| 3334 | return NULL; | ||
| 3335 | |||
| 3336 | /* | ||
| 3337 | * Cannot use for_each_online_cpu since a cpu may come online | ||
| 3338 | * and we have no way of figuring out how to fix the array | ||
| 3339 | * that we have allocated then.... | ||
| 3340 | */ | ||
| 3341 | for_each_possible_cpu(i) { | ||
| 3342 | int node = cpu_to_node(i); | ||
| 3343 | |||
| 3344 | if (node_online(node)) | ||
| 3345 | pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); | ||
| 3346 | else | ||
| 3347 | pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); | ||
| 3348 | |||
| 3349 | if (!pdata->ptrs[i]) | ||
| 3350 | goto unwind_oom; | ||
| 3351 | memset(pdata->ptrs[i], 0, size); | ||
| 3352 | } | ||
| 3353 | |||
| 3354 | /* Catch derefs w/o wrappers */ | ||
| 3355 | return (void *)(~(unsigned long)pdata); | ||
| 3356 | |||
| 3357 | unwind_oom: | ||
| 3358 | while (--i >= 0) { | ||
| 3359 | if (!cpu_possible(i)) | ||
| 3360 | continue; | ||
| 3361 | kfree(pdata->ptrs[i]); | ||
| 3362 | } | ||
| 3363 | kfree(pdata); | ||
| 3364 | return NULL; | ||
| 3365 | } | ||
| 3366 | EXPORT_SYMBOL(__alloc_percpu); | ||
| 3367 | #endif | ||
| 3368 | |||
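The __alloc_percpu()/free_percpu() pair deleted here was consolidated into its own file around this time rather than dropped. The pattern itself, one zeroed allocation per possible CPU with the returned handle bit-complemented so an unwrapped dereference faults, looks roughly like the userspace sketch below; NR_CPUS_DEMO and all demo_* names are invented, and the complement trick is implementation-defined in portable C, shown only to mirror the kernel's intent.

#include <stdint.h>
#include <stdlib.h>

#define NR_CPUS_DEMO 4

struct percpu_demo {
	void *ptrs[NR_CPUS_DEMO];
};

static void *alloc_percpu_demo(size_t size)
{
	struct percpu_demo *pd = malloc(sizeof(*pd));
	int i;

	if (!pd)
		return NULL;
	for (i = 0; i < NR_CPUS_DEMO; i++) {
		pd->ptrs[i] = calloc(1, size);	/* zeroed copy per CPU */
		if (!pd->ptrs[i])
			goto unwind;
	}
	return (void *)(~(uintptr_t)pd);	/* catch derefs without the wrapper */

unwind:
	while (--i >= 0)
		free(pd->ptrs[i]);
	free(pd);
	return NULL;
}

static void *percpu_ptr_demo(void *handle, int cpu)
{
	struct percpu_demo *pd = (void *)(~(uintptr_t)handle);
	return pd->ptrs[cpu];
}

static void free_percpu_demo(void *handle)
{
	struct percpu_demo *pd = (void *)(~(uintptr_t)handle);
	int i;

	for (i = 0; i < NR_CPUS_DEMO; i++)
		free(pd->ptrs[i]);
	free(pd);
}

int main(void)
{
	void *h = alloc_percpu_demo(sizeof(long));
	if (h) {
		*(long *)percpu_ptr_demo(h, 0) = 42;
		free_percpu_demo(h);
	}
	return 0;
}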
| 3369 | /** | 3509 | /** | 
| 3370 | * kmem_cache_free - Deallocate an object | 3510 | * kmem_cache_free - Deallocate an object | 
| 3371 | * @cachep: The cache the allocation was from. | 3511 | * @cachep: The cache the allocation was from. | 
| @@ -3405,35 +3545,12 @@ void kfree(const void *objp) | |||
| 3405 | local_irq_save(flags); | 3545 | local_irq_save(flags); | 
| 3406 | kfree_debugcheck(objp); | 3546 | kfree_debugcheck(objp); | 
| 3407 | c = virt_to_cache(objp); | 3547 | c = virt_to_cache(objp); | 
| 3408 | mutex_debug_check_no_locks_freed(objp, obj_size(c)); | 3548 | debug_check_no_locks_freed(objp, obj_size(c)); | 
| 3409 | __cache_free(c, (void *)objp); | 3549 | __cache_free(c, (void *)objp); | 
| 3410 | local_irq_restore(flags); | 3550 | local_irq_restore(flags); | 
| 3411 | } | 3551 | } | 
| 3412 | EXPORT_SYMBOL(kfree); | 3552 | EXPORT_SYMBOL(kfree); | 
| 3413 | 3553 | ||
| 3414 | #ifdef CONFIG_SMP | ||
| 3415 | /** | ||
| 3416 | * free_percpu - free previously allocated percpu memory | ||
| 3417 | * @objp: pointer returned by alloc_percpu. | ||
| 3418 | * | ||
| 3419 | * Don't free memory not originally allocated by alloc_percpu() | ||
| 3420 | * The complemented objp is to check for that. | ||
| 3421 | */ | ||
| 3422 | void free_percpu(const void *objp) | ||
| 3423 | { | ||
| 3424 | int i; | ||
| 3425 | struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); | ||
| 3426 | |||
| 3427 | /* | ||
| 3428 | * We allocate for all cpus so we cannot use for online cpu here. | ||
| 3429 | */ | ||
| 3430 | for_each_possible_cpu(i) | ||
| 3431 | kfree(p->ptrs[i]); | ||
| 3432 | kfree(p); | ||
| 3433 | } | ||
| 3434 | EXPORT_SYMBOL(free_percpu); | ||
| 3435 | #endif | ||
| 3436 | |||
| 3437 | unsigned int kmem_cache_size(struct kmem_cache *cachep) | 3554 | unsigned int kmem_cache_size(struct kmem_cache *cachep) | 
| 3438 | { | 3555 | { | 
| 3439 | return obj_size(cachep); | 3556 | return obj_size(cachep); | 
| @@ -3550,22 +3667,26 @@ static void do_ccupdate_local(void *info) | |||
| 3550 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3667 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 
| 3551 | int batchcount, int shared) | 3668 | int batchcount, int shared) | 
| 3552 | { | 3669 | { | 
| 3553 | struct ccupdate_struct new; | 3670 | struct ccupdate_struct *new; | 
| 3554 | int i, err; | 3671 | int i; | 
| 3672 | |||
| 3673 | new = kzalloc(sizeof(*new), GFP_KERNEL); | ||
| 3674 | if (!new) | ||
| 3675 | return -ENOMEM; | ||
| 3555 | 3676 | ||
| 3556 | memset(&new.new, 0, sizeof(new.new)); | ||
| 3557 | for_each_online_cpu(i) { | 3677 | for_each_online_cpu(i) { | 
| 3558 | new.new[i] = alloc_arraycache(cpu_to_node(i), limit, | 3678 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, | 
| 3559 | batchcount); | 3679 | batchcount); | 
| 3560 | if (!new.new[i]) { | 3680 | if (!new->new[i]) { | 
| 3561 | for (i--; i >= 0; i--) | 3681 | for (i--; i >= 0; i--) | 
| 3562 | kfree(new.new[i]); | 3682 | kfree(new->new[i]); | 
| 3683 | kfree(new); | ||
| 3563 | return -ENOMEM; | 3684 | return -ENOMEM; | 
| 3564 | } | 3685 | } | 
| 3565 | } | 3686 | } | 
| 3566 | new.cachep = cachep; | 3687 | new->cachep = cachep; | 
| 3567 | 3688 | ||
| 3568 | on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); | 3689 | on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); | 
| 3569 | 3690 | ||
| 3570 | check_irq_on(); | 3691 | check_irq_on(); | 
| 3571 | cachep->batchcount = batchcount; | 3692 | cachep->batchcount = batchcount; | 
| @@ -3573,7 +3694,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
| 3573 | cachep->shared = shared; | 3694 | cachep->shared = shared; | 
| 3574 | 3695 | ||
| 3575 | for_each_online_cpu(i) { | 3696 | for_each_online_cpu(i) { | 
| 3576 | struct array_cache *ccold = new.new[i]; | 3697 | struct array_cache *ccold = new->new[i]; | 
| 3577 | if (!ccold) | 3698 | if (!ccold) | 
| 3578 | continue; | 3699 | continue; | 
| 3579 | spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); | 3700 | spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); | 
| @@ -3581,18 +3702,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
| 3581 | spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); | 3702 | spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); | 
| 3582 | kfree(ccold); | 3703 | kfree(ccold); | 
| 3583 | } | 3704 | } | 
| 3584 | 3705 | kfree(new); | |
| 3585 | err = alloc_kmemlist(cachep); | 3706 | return alloc_kmemlist(cachep); | 
| 3586 | if (err) { | ||
| 3587 | printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", | ||
| 3588 | cachep->name, -err); | ||
| 3589 | BUG(); | ||
| 3590 | } | ||
| 3591 | return 0; | ||
| 3592 | } | 3707 | } | 
| 3593 | 3708 | ||
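do_tune_cpucache() above stops building its ccupdate_struct on the stack (NR_CPUS pointers can be several kilobytes) and kzalloc()s it instead, and a failed alloc_kmemlist() now propagates an error rather than BUG(). A small sketch of the heap-allocate-and-unwind shape, with invented sizes and names:

#include <errno.h>
#include <stdlib.h>

#define NR_CPUS_DEMO 4

struct ccupdate_demo {
	void *new[NR_CPUS_DEMO];
};

static int tune_cpucache_demo(size_t percpu_bytes)
{
	struct ccupdate_demo *new = calloc(1, sizeof(*new));	/* was on the stack */
	int i;

	if (!new)
		return -ENOMEM;
	for (i = 0; i < NR_CPUS_DEMO; i++) {
		new->new[i] = calloc(1, percpu_bytes);
		if (!new->new[i]) {
			for (i--; i >= 0; i--)		/* unwind, do not crash */
				free(new->new[i]);
			free(new);
			return -ENOMEM;
		}
	}
	/* ... the kernel swaps the fresh arrays in on each CPU here and is
	 * left holding the old ones in new->new[]; those get freed: */
	for (i = 0; i < NR_CPUS_DEMO; i++)
		free(new->new[i]);
	free(new);
	return 0;
}

int main(void)
{
	return tune_cpucache_demo(128) ? 1 : 0;
}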
| 3594 | /* Called with cache_chain_mutex held always */ | 3709 | /* Called with cache_chain_mutex held always */ | 
| 3595 | static void enable_cpucache(struct kmem_cache *cachep) | 3710 | static int enable_cpucache(struct kmem_cache *cachep) | 
| 3596 | { | 3711 | { | 
| 3597 | int err; | 3712 | int err; | 
| 3598 | int limit, shared; | 3713 | int limit, shared; | 
| @@ -3644,6 +3759,7 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
| 3644 | if (err) | 3759 | if (err) | 
| 3645 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 3760 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 
| 3646 | cachep->name, -err); | 3761 | cachep->name, -err); | 
| 3762 | return err; | ||
| 3647 | } | 3763 | } | 
| 3648 | 3764 | ||
| 3649 | /* | 3765 | /* | 
| @@ -3701,10 +3817,6 @@ static void cache_reap(void *unused) | |||
| 3701 | } | 3817 | } | 
| 3702 | 3818 | ||
| 3703 | list_for_each_entry(searchp, &cache_chain, next) { | 3819 | list_for_each_entry(searchp, &cache_chain, next) { | 
| 3704 | struct list_head *p; | ||
| 3705 | int tofree; | ||
| 3706 | struct slab *slabp; | ||
| 3707 | |||
| 3708 | check_irq_on(); | 3820 | check_irq_on(); | 
| 3709 | 3821 | ||
| 3710 | /* | 3822 | /* | 
| @@ -3729,47 +3841,22 @@ static void cache_reap(void *unused) | |||
| 3729 | 3841 | ||
| 3730 | drain_array(searchp, l3, l3->shared, 0, node); | 3842 | drain_array(searchp, l3, l3->shared, 0, node); | 
| 3731 | 3843 | ||
| 3732 | if (l3->free_touched) { | 3844 | if (l3->free_touched) | 
| 3733 | l3->free_touched = 0; | 3845 | l3->free_touched = 0; | 
| 3734 | goto next; | 3846 | else { | 
| 3735 | } | 3847 | int freed; | 
| 3736 | 3848 | ||
| 3737 | tofree = (l3->free_limit + 5 * searchp->num - 1) / | 3849 | freed = drain_freelist(searchp, l3, (l3->free_limit + | 
| 3738 | (5 * searchp->num); | 3850 | 5 * searchp->num - 1) / (5 * searchp->num)); | 
| 3739 | do { | 3851 | STATS_ADD_REAPED(searchp, freed); | 
| 3740 | /* | 3852 | } | 
| 3741 | * Do not lock if there are no free blocks. | ||
| 3742 | */ | ||
| 3743 | if (list_empty(&l3->slabs_free)) | ||
| 3744 | break; | ||
| 3745 | |||
| 3746 | spin_lock_irq(&l3->list_lock); | ||
| 3747 | p = l3->slabs_free.next; | ||
| 3748 | if (p == &(l3->slabs_free)) { | ||
| 3749 | spin_unlock_irq(&l3->list_lock); | ||
| 3750 | break; | ||
| 3751 | } | ||
| 3752 | |||
| 3753 | slabp = list_entry(p, struct slab, list); | ||
| 3754 | BUG_ON(slabp->inuse); | ||
| 3755 | list_del(&slabp->list); | ||
| 3756 | STATS_INC_REAPED(searchp); | ||
| 3757 | |||
| 3758 | /* | ||
| 3759 | * Safe to drop the lock. The slab is no longer linked | ||
| 3760 | * to the cache. searchp cannot disappear, we hold | ||
| 3761 | * cache_chain_lock | ||
| 3762 | */ | ||
| 3763 | l3->free_objects -= searchp->num; | ||
| 3764 | spin_unlock_irq(&l3->list_lock); | ||
| 3765 | slab_destroy(searchp, slabp); | ||
| 3766 | } while (--tofree > 0); | ||
| 3767 | next: | 3853 | next: | 
| 3768 | cond_resched(); | 3854 | cond_resched(); | 
| 3769 | } | 3855 | } | 
| 3770 | check_irq_on(); | 3856 | check_irq_on(); | 
| 3771 | mutex_unlock(&cache_chain_mutex); | 3857 | mutex_unlock(&cache_chain_mutex); | 
| 3772 | next_reap_node(); | 3858 | next_reap_node(); | 
| 3859 | refresh_cpu_vm_stats(smp_processor_id()); | ||
| 3773 | /* Set up the next iteration */ | 3860 | /* Set up the next iteration */ | 
| 3774 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3861 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 
| 3775 | } | 3862 | } | 
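cache_reap() now treats l3->free_touched purely as an aging hint: a recently touched list only gets the flag cleared and is skipped for one pass, while an untouched list is handed to drain_freelist() with a budget of about a fifth of the free limit, expressed in slabs and rounded up. The budget arithmetic from the hunk above, pulled out into a tiny runnable example with arbitrary numbers:

#include <stdio.h>

static int reap_budget(int free_limit, int objs_per_slab)
{
	/* (free_limit + 5*num - 1) / (5*num): ceiling of free_limit / (5*num) */
	return (free_limit + 5 * objs_per_slab - 1) / (5 * objs_per_slab);
}

int main(void)
{
	int free_touched = 0;

	if (free_touched)
		free_touched = 0;	/* aging hint: skip draining this pass */
	else
		printf("would drain up to %d slabs\n", reap_budget(120, 16));
	return 0;
}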
| @@ -4133,6 +4220,7 @@ static int leaks_show(struct seq_file *m, void *p) | |||
| 4133 | show_symbol(m, n[2*i+2]); | 4220 | show_symbol(m, n[2*i+2]); | 
| 4134 | seq_putc(m, '\n'); | 4221 | seq_putc(m, '\n'); | 
| 4135 | } | 4222 | } | 
| 4223 | |||
| 4136 | return 0; | 4224 | return 0; | 
| 4137 | } | 4225 | } | 
| 4138 | 4226 | ||
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -29,7 +29,6 @@ | |||
| 29 | * essentially no allocation space overhead. | 29 | * essentially no allocation space overhead. | 
| 30 | */ | 30 | */ | 
| 31 | 31 | ||
| 32 | #include <linux/config.h> | ||
| 33 | #include <linux/slab.h> | 32 | #include <linux/slab.h> | 
| 34 | #include <linux/mm.h> | 33 | #include <linux/mm.h> | 
| 35 | #include <linux/cache.h> | 34 | #include <linux/cache.h> | 
| @@ -271,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
| 271 | } | 270 | } | 
| 272 | EXPORT_SYMBOL(kmem_cache_create); | 271 | EXPORT_SYMBOL(kmem_cache_create); | 
| 273 | 272 | ||
| 274 | int kmem_cache_destroy(struct kmem_cache *c) | 273 | void kmem_cache_destroy(struct kmem_cache *c) | 
| 275 | { | 274 | { | 
| 276 | slob_free(c, sizeof(struct kmem_cache)); | 275 | slob_free(c, sizeof(struct kmem_cache)); | 
| 277 | return 0; | ||
| 278 | } | 276 | } | 
| 279 | EXPORT_SYMBOL(kmem_cache_destroy); | 277 | EXPORT_SYMBOL(kmem_cache_destroy); | 
| 280 | 278 | ||
| @@ -340,52 +338,3 @@ void kmem_cache_init(void) | |||
| 340 | 338 | ||
| 341 | mod_timer(&slob_timer, jiffies + HZ); | 339 | mod_timer(&slob_timer, jiffies + HZ); | 
| 342 | } | 340 | } | 
| 343 | |||
| 344 | atomic_t slab_reclaim_pages = ATOMIC_INIT(0); | ||
| 345 | EXPORT_SYMBOL(slab_reclaim_pages); | ||
| 346 | |||
| 347 | #ifdef CONFIG_SMP | ||
| 348 | |||
| 349 | void *__alloc_percpu(size_t size) | ||
| 350 | { | ||
| 351 | int i; | ||
| 352 | struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); | ||
| 353 | |||
| 354 | if (!pdata) | ||
| 355 | return NULL; | ||
| 356 | |||
| 357 | for_each_possible_cpu(i) { | ||
| 358 | pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); | ||
| 359 | if (!pdata->ptrs[i]) | ||
| 360 | goto unwind_oom; | ||
| 361 | memset(pdata->ptrs[i], 0, size); | ||
| 362 | } | ||
| 363 | |||
| 364 | /* Catch derefs w/o wrappers */ | ||
| 365 | return (void *) (~(unsigned long) pdata); | ||
| 366 | |||
| 367 | unwind_oom: | ||
| 368 | while (--i >= 0) { | ||
| 369 | if (!cpu_possible(i)) | ||
| 370 | continue; | ||
| 371 | kfree(pdata->ptrs[i]); | ||
| 372 | } | ||
| 373 | kfree(pdata); | ||
| 374 | return NULL; | ||
| 375 | } | ||
| 376 | EXPORT_SYMBOL(__alloc_percpu); | ||
| 377 | |||
| 378 | void | ||
| 379 | free_percpu(const void *objp) | ||
| 380 | { | ||
| 381 | int i; | ||
| 382 | struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); | ||
| 383 | |||
| 384 | for_each_possible_cpu(i) | ||
| 385 | kfree(p->ptrs[i]); | ||
| 386 | |||
| 387 | kfree(p); | ||
| 388 | } | ||
| 389 | EXPORT_SYMBOL(free_percpu); | ||
| 390 | |||
| 391 | #endif | ||
| diff --git a/mm/sparse.c b/mm/sparse.c index e0a3fe48aa37..86c52ab80878 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -1,7 +1,6 @@ | |||
| 1 | /* | 1 | /* | 
| 2 | * sparse memory mappings. | 2 | * sparse memory mappings. | 
| 3 | */ | 3 | */ | 
| 4 | #include <linux/config.h> | ||
| 5 | #include <linux/mm.h> | 4 | #include <linux/mm.h> | 
| 6 | #include <linux/mmzone.h> | 5 | #include <linux/mmzone.h> | 
| 7 | #include <linux/bootmem.h> | 6 | #include <linux/bootmem.h> | 
| @@ -45,7 +44,7 @@ static struct mem_section *sparse_index_alloc(int nid) | |||
| 45 | 44 | ||
| 46 | static int sparse_index_init(unsigned long section_nr, int nid) | 45 | static int sparse_index_init(unsigned long section_nr, int nid) | 
| 47 | { | 46 | { | 
| 48 | static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; | 47 | static DEFINE_SPINLOCK(index_init_lock); | 
| 49 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 48 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 
| 50 | struct mem_section *section; | 49 | struct mem_section *section; | 
| 51 | int ret = 0; | 50 | int ret = 0; | 
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -34,6 +34,25 @@ | |||
| 34 | /* How many pages do we try to swap or page in/out together? */ | 34 | /* How many pages do we try to swap or page in/out together? */ | 
| 35 | int page_cluster; | 35 | int page_cluster; | 
| 36 | 36 | ||
| 37 | /* | ||
| 38 | * This path almost never happens for VM activity - pages are normally | ||
| 39 | * freed via pagevecs. But it gets used by networking. | ||
| 40 | */ | ||
| 41 | static void fastcall __page_cache_release(struct page *page) | ||
| 42 | { | ||
| 43 | if (PageLRU(page)) { | ||
| 44 | unsigned long flags; | ||
| 45 | struct zone *zone = page_zone(page); | ||
| 46 | |||
| 47 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
| 48 | VM_BUG_ON(!PageLRU(page)); | ||
| 49 | __ClearPageLRU(page); | ||
| 50 | del_page_from_lru(zone, page); | ||
| 51 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
| 52 | } | ||
| 53 | free_hot_page(page); | ||
| 54 | } | ||
| 55 | |||
| 37 | static void put_compound_page(struct page *page) | 56 | static void put_compound_page(struct page *page) | 
| 38 | { | 57 | { | 
| 39 | page = (struct page *)page_private(page); | 58 | page = (struct page *)page_private(page); | 
| @@ -54,6 +73,26 @@ void put_page(struct page *page) | |||
| 54 | } | 73 | } | 
| 55 | EXPORT_SYMBOL(put_page); | 74 | EXPORT_SYMBOL(put_page); | 
| 56 | 75 | ||
| 76 | /** | ||
| 77 | * put_pages_list(): release a list of pages | ||
| 78 | * | ||
| 79 | * Release a list of pages which are strung together on page.lru. Currently | ||
| 80 | * used by read_cache_pages() and related error recovery code. | ||
| 81 | * | ||
| 82 | * @pages: list of pages threaded on page->lru | ||
| 83 | */ | ||
| 84 | void put_pages_list(struct list_head *pages) | ||
| 85 | { | ||
| 86 | while (!list_empty(pages)) { | ||
| 87 | struct page *victim; | ||
| 88 | |||
| 89 | victim = list_entry(pages->prev, struct page, lru); | ||
| 90 | list_del(&victim->lru); | ||
| 91 | page_cache_release(victim); | ||
| 92 | } | ||
| 93 | } | ||
| 94 | EXPORT_SYMBOL(put_pages_list); | ||
| 95 | |||
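put_pages_list() is a new helper: pop pages threaded on page->lru off the tail of the list and drop one reference each. The same loop over an invented refcounted node type, as a userspace sketch:

#include <stdlib.h>

struct demo_page {
	struct demo_page *prev, *next;	/* plays the role of page->lru */
	int refcount;
};

static void demo_page_release(struct demo_page *p)
{
	if (--p->refcount == 0)
		free(p);
}

static void put_pages_list_demo(struct demo_page *head)
{
	while (head->prev != head) {		/* !list_empty() */
		struct demo_page *victim = head->prev;

		victim->prev->next = victim->next;	/* list_del() */
		victim->next->prev = victim->prev;
		demo_page_release(victim);		/* page_cache_release() */
	}
}

int main(void)
{
	struct demo_page head = { &head, &head, 0 };
	int i;

	for (i = 0; i < 3; i++) {
		struct demo_page *p = malloc(sizeof(*p));
		p->refcount = 1;
		p->next = &head;
		p->prev = head.prev;
		p->prev->next = p;
		head.prev = p;
	}
	put_pages_list_demo(&head);
	return 0;
}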
| 57 | /* | 96 | /* | 
| 58 | * Writeback is about to end against a page which has been marked for immediate | 97 | * Writeback is about to end against a page which has been marked for immediate | 
| 59 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 98 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 
| @@ -86,9 +125,8 @@ int rotate_reclaimable_page(struct page *page) | |||
| 86 | zone = page_zone(page); | 125 | zone = page_zone(page); | 
| 87 | spin_lock_irqsave(&zone->lru_lock, flags); | 126 | spin_lock_irqsave(&zone->lru_lock, flags); | 
| 88 | if (PageLRU(page) && !PageActive(page)) { | 127 | if (PageLRU(page) && !PageActive(page)) { | 
| 89 | list_del(&page->lru); | 128 | list_move_tail(&page->lru, &zone->inactive_list); | 
| 90 | list_add_tail(&page->lru, &zone->inactive_list); | 129 | __count_vm_event(PGROTATED); | 
| 91 | inc_page_state(pgrotated); | ||
| 92 | } | 130 | } | 
| 93 | if (!test_clear_page_writeback(page)) | 131 | if (!test_clear_page_writeback(page)) | 
| 94 | BUG(); | 132 | BUG(); | 
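Besides folding list_del() plus list_add_tail() into list_move_tail(), this hunk switches from the old global page_state fields to VM event counters (__count_vm_event(PGROTATED)): per-CPU slots that are bumped without locking and summed when read. A toy model of that accounting; the enum values, array sizes, and the explicit cpu argument are inventions of the example.

#include <stdio.h>

enum demo_vm_event { DEMO_PGROTATED, DEMO_PGACTIVATE, DEMO_NR_EVENTS };

#define NR_CPUS_DEMO 4

static unsigned long demo_events[NR_CPUS_DEMO][DEMO_NR_EVENTS];

static void demo_count_vm_event(int cpu, enum demo_vm_event ev)
{
	demo_events[cpu][ev]++;		/* no lock: the slot is CPU-local */
}

static unsigned long demo_sum_vm_event(enum demo_vm_event ev)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		sum += demo_events[cpu][ev];	/* fold per-CPU slots on read */
	return sum;
}

int main(void)
{
	demo_count_vm_event(0, DEMO_PGROTATED);
	demo_count_vm_event(2, DEMO_PGROTATED);
	printf("PGROTATED = %lu\n", demo_sum_vm_event(DEMO_PGROTATED));
	return 0;
}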
| @@ -108,7 +146,7 @@ void fastcall activate_page(struct page *page) | |||
| 108 | del_page_from_inactive_list(zone, page); | 146 | del_page_from_inactive_list(zone, page); | 
| 109 | SetPageActive(page); | 147 | SetPageActive(page); | 
| 110 | add_page_to_active_list(zone, page); | 148 | add_page_to_active_list(zone, page); | 
| 111 | inc_page_state(pgactivate); | 149 | __count_vm_event(PGACTIVATE); | 
| 112 | } | 150 | } | 
| 113 | spin_unlock_irq(&zone->lru_lock); | 151 | spin_unlock_irq(&zone->lru_lock); | 
| 114 | } | 152 | } | 
| @@ -204,26 +242,6 @@ int lru_add_drain_all(void) | |||
| 204 | #endif | 242 | #endif | 
| 205 | 243 | ||
| 206 | /* | 244 | /* | 
| 207 | * This path almost never happens for VM activity - pages are normally | ||
| 208 | * freed via pagevecs. But it gets used by networking. | ||
| 209 | */ | ||
| 210 | void fastcall __page_cache_release(struct page *page) | ||
| 211 | { | ||
| 212 | if (PageLRU(page)) { | ||
| 213 | unsigned long flags; | ||
| 214 | struct zone *zone = page_zone(page); | ||
| 215 | |||
| 216 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
| 217 | BUG_ON(!PageLRU(page)); | ||
| 218 | __ClearPageLRU(page); | ||
| 219 | del_page_from_lru(zone, page); | ||
| 220 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
| 221 | } | ||
| 222 | free_hot_page(page); | ||
| 223 | } | ||
| 224 | EXPORT_SYMBOL(__page_cache_release); | ||
| 225 | |||
| 226 | /* | ||
| 227 | * Batched page_cache_release(). Decrement the reference count on all the | 245 | * Batched page_cache_release(). Decrement the reference count on all the | 
| 228 | * passed pages. If it fell to zero then remove the page from the LRU and | 246 | * passed pages. If it fell to zero then remove the page from the LRU and | 
| 229 | * free it. | 247 | * free it. | 
| @@ -265,7 +283,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 265 | zone = pagezone; | 283 | zone = pagezone; | 
| 266 | spin_lock_irq(&zone->lru_lock); | 284 | spin_lock_irq(&zone->lru_lock); | 
| 267 | } | 285 | } | 
| 268 | BUG_ON(!PageLRU(page)); | 286 | VM_BUG_ON(!PageLRU(page)); | 
| 269 | __ClearPageLRU(page); | 287 | __ClearPageLRU(page); | 
| 270 | del_page_from_lru(zone, page); | 288 | del_page_from_lru(zone, page); | 
| 271 | } | 289 | } | 
| @@ -318,7 +336,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec) | |||
| 318 | for (i = 0; i < pagevec_count(pvec); i++) { | 336 | for (i = 0; i < pagevec_count(pvec); i++) { | 
| 319 | struct page *page = pvec->pages[i]; | 337 | struct page *page = pvec->pages[i]; | 
| 320 | 338 | ||
| 321 | BUG_ON(PageLRU(page)); | 339 | VM_BUG_ON(PageLRU(page)); | 
| 322 | if (put_page_testzero(page)) | 340 | if (put_page_testzero(page)) | 
| 323 | pagevec_add(&pages_to_free, page); | 341 | pagevec_add(&pages_to_free, page); | 
| 324 | } | 342 | } | 
| @@ -345,7 +363,7 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
| 345 | zone = pagezone; | 363 | zone = pagezone; | 
| 346 | spin_lock_irq(&zone->lru_lock); | 364 | spin_lock_irq(&zone->lru_lock); | 
| 347 | } | 365 | } | 
| 348 | BUG_ON(PageLRU(page)); | 366 | VM_BUG_ON(PageLRU(page)); | 
| 349 | SetPageLRU(page); | 367 | SetPageLRU(page); | 
| 350 | add_page_to_inactive_list(zone, page); | 368 | add_page_to_inactive_list(zone, page); | 
| 351 | } | 369 | } | 
| @@ -372,9 +390,9 @@ void __pagevec_lru_add_active(struct pagevec *pvec) | |||
| 372 | zone = pagezone; | 390 | zone = pagezone; | 
| 373 | spin_lock_irq(&zone->lru_lock); | 391 | spin_lock_irq(&zone->lru_lock); | 
| 374 | } | 392 | } | 
| 375 | BUG_ON(PageLRU(page)); | 393 | VM_BUG_ON(PageLRU(page)); | 
| 376 | SetPageLRU(page); | 394 | SetPageLRU(page); | 
| 377 | BUG_ON(PageActive(page)); | 395 | VM_BUG_ON(PageActive(page)); | 
| 378 | SetPageActive(page); | 396 | SetPageActive(page); | 
| 379 | add_page_to_active_list(zone, page); | 397 | add_page_to_active_list(zone, page); | 
| 380 | } | 398 | } | 
| diff --git a/mm/swap_state.c b/mm/swap_state.c index e0e1583f32c2..5f7cf2a4cb55 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -24,7 +24,7 @@ | |||
| 24 | * vmscan's shrink_list, to make sync_page look nicer, and to allow | 24 | * vmscan's shrink_list, to make sync_page look nicer, and to allow | 
| 25 | * future use of radix_tree tags in the swap cache. | 25 | * future use of radix_tree tags in the swap cache. | 
| 26 | */ | 26 | */ | 
| 27 | static struct address_space_operations swap_aops = { | 27 | static const struct address_space_operations swap_aops = { | 
| 28 | .writepage = swap_writepage, | 28 | .writepage = swap_writepage, | 
| 29 | .sync_page = block_sync_page, | 29 | .sync_page = block_sync_page, | 
| 30 | .set_page_dirty = __set_page_dirty_nobuffers, | 30 | .set_page_dirty = __set_page_dirty_nobuffers, | 
| @@ -38,7 +38,7 @@ static struct backing_dev_info swap_backing_dev_info = { | |||
| 38 | 38 | ||
| 39 | struct address_space swapper_space = { | 39 | struct address_space swapper_space = { | 
| 40 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 40 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 
| 41 | .tree_lock = RW_LOCK_UNLOCKED, | 41 | .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), | 
| 42 | .a_ops = &swap_aops, | 42 | .a_ops = &swap_aops, | 
| 43 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | 43 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | 
| 44 | .backing_dev_info = &swap_backing_dev_info, | 44 | .backing_dev_info = &swap_backing_dev_info, | 
| @@ -87,7 +87,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
| 87 | SetPageSwapCache(page); | 87 | SetPageSwapCache(page); | 
| 88 | set_page_private(page, entry.val); | 88 | set_page_private(page, entry.val); | 
| 89 | total_swapcache_pages++; | 89 | total_swapcache_pages++; | 
| 90 | pagecache_acct(1); | 90 | __inc_zone_page_state(page, NR_FILE_PAGES); | 
| 91 | } | 91 | } | 
| 92 | write_unlock_irq(&swapper_space.tree_lock); | 92 | write_unlock_irq(&swapper_space.tree_lock); | 
| 93 | radix_tree_preload_end(); | 93 | radix_tree_preload_end(); | 
| @@ -132,7 +132,7 @@ void __delete_from_swap_cache(struct page *page) | |||
| 132 | set_page_private(page, 0); | 132 | set_page_private(page, 0); | 
| 133 | ClearPageSwapCache(page); | 133 | ClearPageSwapCache(page); | 
| 134 | total_swapcache_pages--; | 134 | total_swapcache_pages--; | 
| 135 | pagecache_acct(-1); | 135 | __dec_zone_page_state(page, NR_FILE_PAGES); | 
| 136 | INC_CACHE_INFO(del_total); | 136 | INC_CACHE_INFO(del_total); | 
| 137 | } | 137 | } | 
| 138 | 138 | ||
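Two things change in mm/swap_state.c: swap-cache pages are now accounted in the per-zone NR_FILE_PAGES counter instead of the old pagecache_acct(), and swapper_space.tree_lock is initialized with __RW_LOCK_UNLOCKED(name), which gives the statically allocated lock a per-instance initializer the lock validator can work with. The userspace analogue of statically initializing a lock embedded in a larger object, with invented names:

#include <pthread.h>
#include <stdio.h>

struct demo_address_space {
	pthread_rwlock_t tree_lock;
	unsigned long nrpages;
};

/* Static initialization: usable without a runtime init call. */
static struct demo_address_space demo_swapper_space = {
	.tree_lock = PTHREAD_RWLOCK_INITIALIZER,
	.nrpages   = 0,
};

int main(void)
{
	pthread_rwlock_wrlock(&demo_swapper_space.tree_lock);
	demo_swapper_space.nrpages++;	/* mutate only under the writer lock */
	pthread_rwlock_unlock(&demo_swapper_space.tree_lock);
	printf("nrpages = %lu\n", demo_swapper_space.nrpages);
	return 0;
}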
| diff --git a/mm/swapfile.c b/mm/swapfile.c index cc367f7e75d8..a15def63f28f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -5,7 +5,6 @@ | |||
| 5 | * Swap reorganised 29.12.95, Stephen Tweedie | 5 | * Swap reorganised 29.12.95, Stephen Tweedie | 
| 6 | */ | 6 | */ | 
| 7 | 7 | ||
| 8 | #include <linux/config.h> | ||
| 9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> | 
| 10 | #include <linux/hugetlb.h> | 9 | #include <linux/hugetlb.h> | 
| 11 | #include <linux/mman.h> | 10 | #include <linux/mman.h> | 
| @@ -443,11 +442,12 @@ int swap_type_of(dev_t device) | |||
| 443 | 442 | ||
| 444 | if (!(swap_info[i].flags & SWP_WRITEOK)) | 443 | if (!(swap_info[i].flags & SWP_WRITEOK)) | 
| 445 | continue; | 444 | continue; | 
| 445 | |||
| 446 | if (!device) { | 446 | if (!device) { | 
| 447 | spin_unlock(&swap_lock); | 447 | spin_unlock(&swap_lock); | 
| 448 | return i; | 448 | return i; | 
| 449 | } | 449 | } | 
| 450 | inode = swap_info->swap_file->f_dentry->d_inode; | 450 | inode = swap_info[i].swap_file->f_dentry->d_inode; | 
| 451 | if (S_ISBLK(inode->i_mode) && | 451 | if (S_ISBLK(inode->i_mode) && | 
| 452 | device == MKDEV(imajor(inode), iminor(inode))) { | 452 | device == MKDEV(imajor(inode), iminor(inode))) { | 
| 453 | spin_unlock(&swap_lock); | 453 | spin_unlock(&swap_lock); | 
| @@ -1723,13 +1723,14 @@ get_swap_info_struct(unsigned type) | |||
| 1723 | */ | 1723 | */ | 
| 1724 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | 1724 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | 
| 1725 | { | 1725 | { | 
| 1726 | int ret = 0, i = 1 << page_cluster; | 1726 | int our_page_cluster = page_cluster; | 
| 1727 | int ret = 0, i = 1 << our_page_cluster; | ||
| 1727 | unsigned long toff; | 1728 | unsigned long toff; | 
| 1728 | struct swap_info_struct *swapdev = swp_type(entry) + swap_info; | 1729 | struct swap_info_struct *swapdev = swp_type(entry) + swap_info; | 
| 1729 | 1730 | ||
| 1730 | if (!page_cluster) /* no readahead */ | 1731 | if (!our_page_cluster) /* no readahead */ | 
| 1731 | return 0; | 1732 | return 0; | 
| 1732 | toff = (swp_offset(entry) >> page_cluster) << page_cluster; | 1733 | toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; | 
| 1733 | if (!toff) /* first page is swap header */ | 1734 | if (!toff) /* first page is swap header */ | 
| 1734 | toff++, i--; | 1735 | toff++, i--; | 
| 1735 | *offset = toff; | 1736 | *offset = toff; | 
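valid_swaphandles() now reads the page_cluster sysctl once into a local, so the shift used for rounding and the loop bound cannot disagree if the tunable changes mid-call; the hunk before it also fixes swap_type_of() to index swap_info[i] instead of always looking at the first entry. The snapshot-the-tunable pattern in miniature, with demo names and arbitrary values:

#include <stdio.h>

static volatile int demo_page_cluster = 3;	/* may be rewritten at any time */

static int demo_swap_readahead_window(unsigned long offset, unsigned long *start)
{
	int cluster = demo_page_cluster;	/* snapshot once, use consistently */

	if (!cluster)				/* readahead disabled */
		return 0;
	*start = (offset >> cluster) << cluster;	/* round down to the cluster */
	return 1 << cluster;			/* pages in the window */
}

int main(void)
{
	unsigned long start;
	int n = demo_swap_readahead_window(37, &start);

	printf("read %d pages starting at %lu\n", n, start);
	return 0;
}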
| diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index f9d6a9cc91c4..5f2cbf0f153c 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
| @@ -12,7 +12,6 @@ | |||
| 12 | 12 | ||
| 13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> | 
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> | 
| 15 | #include <linux/devfs_fs_kernel.h> | ||
| 16 | #include <linux/vfs.h> | 15 | #include <linux/vfs.h> | 
| 17 | #include <linux/mount.h> | 16 | #include <linux/mount.h> | 
| 18 | #include <linux/file.h> | 17 | #include <linux/file.h> | 
| @@ -33,9 +32,6 @@ static int __init init_tmpfs(void) | |||
| 33 | { | 32 | { | 
| 34 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 33 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 
| 35 | 34 | ||
| 36 | #ifdef CONFIG_TMPFS | ||
| 37 | devfs_mk_dir("shm"); | ||
| 38 | #endif | ||
| 39 | shm_mnt = kern_mount(&tmpfs_fs_type); | 35 | shm_mnt = kern_mount(&tmpfs_fs_type); | 
| 40 | BUG_ON(IS_ERR(shm_mnt)); | 36 | BUG_ON(IS_ERR(shm_mnt)); | 
| 41 | 37 | ||
| diff --git a/mm/truncate.c b/mm/truncate.c index cf1b015df4a7..a654928323dc 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> | 
| 11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> | 
| 12 | #include <linux/swap.h> | ||
| 12 | #include <linux/module.h> | 13 | #include <linux/module.h> | 
| 13 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> | 
| 14 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> | 
| @@ -52,33 +53,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
| 52 | /* | 53 | /* | 
| 53 | * This is for invalidate_inode_pages(). That function can be called at | 54 | * This is for invalidate_inode_pages(). That function can be called at | 
| 54 | * any time, and is not supposed to throw away dirty pages. But pages can | 55 | * any time, and is not supposed to throw away dirty pages. But pages can | 
| 55 | * be marked dirty at any time too. So we re-check the dirtiness inside | 56 | * be marked dirty at any time too, so use remove_mapping which safely | 
| 56 | * ->tree_lock. That provides exclusion against the __set_page_dirty | 57 | * discards clean, unused pages. | 
| 57 | * functions. | ||
| 58 | * | 58 | * | 
| 59 | * Returns non-zero if the page was successfully invalidated. | 59 | * Returns non-zero if the page was successfully invalidated. | 
| 60 | */ | 60 | */ | 
| 61 | static int | 61 | static int | 
| 62 | invalidate_complete_page(struct address_space *mapping, struct page *page) | 62 | invalidate_complete_page(struct address_space *mapping, struct page *page) | 
| 63 | { | 63 | { | 
| 64 | int ret; | ||
| 65 | |||
| 64 | if (page->mapping != mapping) | 66 | if (page->mapping != mapping) | 
| 65 | return 0; | 67 | return 0; | 
| 66 | 68 | ||
| 67 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | 69 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | 
| 68 | return 0; | 70 | return 0; | 
| 69 | 71 | ||
| 70 | write_lock_irq(&mapping->tree_lock); | 72 | ret = remove_mapping(mapping, page); | 
| 71 | if (PageDirty(page)) { | ||
| 72 | write_unlock_irq(&mapping->tree_lock); | ||
| 73 | return 0; | ||
| 74 | } | ||
| 75 | |||
| 76 | BUG_ON(PagePrivate(page)); | ||
| 77 | __remove_from_page_cache(page); | ||
| 78 | write_unlock_irq(&mapping->tree_lock); | ||
| 79 | ClearPageUptodate(page); | 73 | ClearPageUptodate(page); | 
| 80 | page_cache_release(page); /* pagecache ref */ | 74 | |
| 81 | return 1; | 75 | return ret; | 
| 82 | } | 76 | } | 
| 83 | 77 | ||
| 84 | /** | 78 | /** | 
| diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35f8553f893a..1ac191ce5641 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -24,6 +24,9 @@ | |||
| 24 | DEFINE_RWLOCK(vmlist_lock); | 24 | DEFINE_RWLOCK(vmlist_lock); | 
| 25 | struct vm_struct *vmlist; | 25 | struct vm_struct *vmlist; | 
| 26 | 26 | ||
| 27 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
| 28 | int node); | ||
| 29 | |||
| 27 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 30 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 
| 28 | { | 31 | { | 
| 29 | pte_t *pte; | 32 | pte_t *pte; | 
| @@ -238,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | |||
| 238 | 241 | ||
| 239 | /** | 242 | /** | 
| 240 | * get_vm_area - reserve a contingous kernel virtual area | 243 | * get_vm_area - reserve a contingous kernel virtual area | 
| 241 | * | ||
| 242 | * @size: size of the area | 244 | * @size: size of the area | 
| 243 | * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC | 245 | * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC | 
| 244 | * | 246 | * | 
| @@ -270,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr) | |||
| 270 | } | 272 | } | 
| 271 | 273 | ||
| 272 | /* Caller must hold vmlist_lock */ | 274 | /* Caller must hold vmlist_lock */ | 
| 273 | struct vm_struct *__remove_vm_area(void *addr) | 275 | static struct vm_struct *__remove_vm_area(void *addr) | 
| 274 | { | 276 | { | 
| 275 | struct vm_struct **p, *tmp; | 277 | struct vm_struct **p, *tmp; | 
| 276 | 278 | ||
| @@ -293,7 +295,6 @@ found: | |||
| 293 | 295 | ||
| 294 | /** | 296 | /** | 
| 295 | * remove_vm_area - find and remove a contingous kernel virtual area | 297 | * remove_vm_area - find and remove a contingous kernel virtual area | 
| 296 | * | ||
| 297 | * @addr: base address | 298 | * @addr: base address | 
| 298 | * | 299 | * | 
| 299 | * Search for the kernel VM area starting at @addr, and remove it. | 300 | * Search for the kernel VM area starting at @addr, and remove it. | 
| @@ -330,6 +331,8 @@ void __vunmap(void *addr, int deallocate_pages) | |||
| 330 | return; | 331 | return; | 
| 331 | } | 332 | } | 
| 332 | 333 | ||
| 334 | debug_check_no_locks_freed(addr, area->size); | ||
| 335 | |||
| 333 | if (deallocate_pages) { | 336 | if (deallocate_pages) { | 
| 334 | int i; | 337 | int i; | 
| 335 | 338 | ||
| @@ -338,7 +341,7 @@ void __vunmap(void *addr, int deallocate_pages) | |||
| 338 | __free_page(area->pages[i]); | 341 | __free_page(area->pages[i]); | 
| 339 | } | 342 | } | 
| 340 | 343 | ||
| 341 | if (area->nr_pages > PAGE_SIZE/sizeof(struct page *)) | 344 | if (area->flags & VM_VPAGES) | 
| 342 | vfree(area->pages); | 345 | vfree(area->pages); | 
| 343 | else | 346 | else | 
| 344 | kfree(area->pages); | 347 | kfree(area->pages); | 
| @@ -350,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages) | |||
| 350 | 353 | ||
| 351 | /** | 354 | /** | 
| 352 | * vfree - release memory allocated by vmalloc() | 355 | * vfree - release memory allocated by vmalloc() | 
| 353 | * | ||
| 354 | * @addr: memory base address | 356 | * @addr: memory base address | 
| 355 | * | 357 | * | 
| 356 | * Free the virtually contiguous memory area starting at @addr, as | 358 | * Free the virtually contiguous memory area starting at @addr, as | 
| @@ -368,7 +370,6 @@ EXPORT_SYMBOL(vfree); | |||
| 368 | 370 | ||
| 369 | /** | 371 | /** | 
| 370 | * vunmap - release virtual mapping obtained by vmap() | 372 | * vunmap - release virtual mapping obtained by vmap() | 
| 371 | * | ||
| 372 | * @addr: memory base address | 373 | * @addr: memory base address | 
| 373 | * | 374 | * | 
| 374 | * Free the virtually contiguous memory area starting at @addr, | 375 | * Free the virtually contiguous memory area starting at @addr, | 
| @@ -385,7 +386,6 @@ EXPORT_SYMBOL(vunmap); | |||
| 385 | 386 | ||
| 386 | /** | 387 | /** | 
| 387 | * vmap - map an array of pages into virtually contiguous space | 388 | * vmap - map an array of pages into virtually contiguous space | 
| 388 | * | ||
| 389 | * @pages: array of page pointers | 389 | * @pages: array of page pointers | 
| 390 | * @count: number of pages to map | 390 | * @count: number of pages to map | 
| 391 | * @flags: vm_area->flags | 391 | * @flags: vm_area->flags | 
| @@ -425,9 +425,10 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
| 425 | 425 | ||
| 426 | area->nr_pages = nr_pages; | 426 | area->nr_pages = nr_pages; | 
| 427 | /* Please note that the recursion is strictly bounded. */ | 427 | /* Please note that the recursion is strictly bounded. */ | 
| 428 | if (array_size > PAGE_SIZE) | 428 | if (array_size > PAGE_SIZE) { | 
| 429 | pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); | 429 | pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); | 
| 430 | else | 430 | area->flags |= VM_VPAGES; | 
| 431 | } else | ||
| 431 | pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); | 432 | pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); | 
| 432 | area->pages = pages; | 433 | area->pages = pages; | 
| 433 | if (!area->pages) { | 434 | if (!area->pages) { | 
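Rather than re-deriving at free time whether the pages[] array itself was vmalloc'd, __vmalloc_area_node() now records that fact in a new VM_VPAGES flag and __vunmap() simply tests it. A userspace sketch of remembering the allocator instead of recomputing it, with mmap standing in for the large-array path; the threshold and flag name are invented.

#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define DEMO_LARGE	(64 * 1024)
#define DEMO_VPAGES	0x1	/* "array came from the large allocator" */

struct demo_area {
	void **pages;
	size_t array_size;
	unsigned int flags;
};

static int demo_alloc_page_array(struct demo_area *area, size_t nr_pages)
{
	area->array_size = nr_pages * sizeof(void *);
	area->flags = 0;
	if (area->array_size > DEMO_LARGE) {
		area->pages = mmap(NULL, area->array_size, PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (area->pages == MAP_FAILED)
			return -1;
		area->flags |= DEMO_VPAGES;	/* remember which allocator was used */
	} else {
		area->pages = malloc(area->array_size);
		if (!area->pages)
			return -1;
	}
	memset(area->pages, 0, area->array_size);
	return 0;
}

static void demo_free_page_array(struct demo_area *area)
{
	if (area->flags & DEMO_VPAGES)		/* no size recomputation needed */
		munmap(area->pages, area->array_size);
	else
		free(area->pages);
}

int main(void)
{
	struct demo_area a;

	if (demo_alloc_page_array(&a, 32) == 0)
		demo_free_page_array(&a);
	return 0;
}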
| @@ -465,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
| 465 | 466 | ||
| 466 | /** | 467 | /** | 
| 467 | * __vmalloc_node - allocate virtually contiguous memory | 468 | * __vmalloc_node - allocate virtually contiguous memory | 
| 468 | * | ||
| 469 | * @size: allocation size | 469 | * @size: allocation size | 
| 470 | * @gfp_mask: flags for the page level allocator | 470 | * @gfp_mask: flags for the page level allocator | 
| 471 | * @prot: protection mask for the allocated pages | 471 | * @prot: protection mask for the allocated pages | 
| @@ -475,8 +475,8 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
| 475 | * allocator with @gfp_mask flags. Map them into contiguous | 475 | * allocator with @gfp_mask flags. Map them into contiguous | 
| 476 | * kernel virtual space, using a pagetable protection of @prot. | 476 | * kernel virtual space, using a pagetable protection of @prot. | 
| 477 | */ | 477 | */ | 
| 478 | void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 478 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 
| 479 | int node) | 479 | int node) | 
| 480 | { | 480 | { | 
| 481 | struct vm_struct *area; | 481 | struct vm_struct *area; | 
| 482 | 482 | ||
| @@ -490,7 +490,6 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
| 490 | 490 | ||
| 491 | return __vmalloc_area_node(area, gfp_mask, prot, node); | 491 | return __vmalloc_area_node(area, gfp_mask, prot, node); | 
| 492 | } | 492 | } | 
| 493 | EXPORT_SYMBOL(__vmalloc_node); | ||
| 494 | 493 | ||
| 495 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 494 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 
| 496 | { | 495 | { | 
| @@ -500,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc); | |||
| 500 | 499 | ||
| 501 | /** | 500 | /** | 
| 502 | * vmalloc - allocate virtually contiguous memory | 501 | * vmalloc - allocate virtually contiguous memory | 
| 503 | * | ||
| 504 | * @size: allocation size | 502 | * @size: allocation size | 
| 505 | * | ||
| 506 | * Allocate enough pages to cover @size from the page level | 503 | * Allocate enough pages to cover @size from the page level | 
| 507 | * allocator and map them into contiguous kernel virtual space. | 504 | * allocator and map them into contiguous kernel virtual space. | 
| 508 | * | 505 | * | 
| @@ -516,11 +513,11 @@ void *vmalloc(unsigned long size) | |||
| 516 | EXPORT_SYMBOL(vmalloc); | 513 | EXPORT_SYMBOL(vmalloc); | 
| 517 | 514 | ||
| 518 | /** | 515 | /** | 
| 519 | * vmalloc_user - allocate virtually contiguous memory which has | 516 | * vmalloc_user - allocate zeroed virtually contiguous memory for userspace | 
| 520 | * been zeroed so it can be mapped to userspace without | 517 | * @size: allocation size | 
| 521 | * leaking data. | ||
| 522 | * | 518 | * | 
| 523 | * @size: allocation size | 519 | * The resulting memory area is zeroed so it can be mapped to userspace | 
| 520 | * without leaking data. | ||
| 524 | */ | 521 | */ | 
| 525 | void *vmalloc_user(unsigned long size) | 522 | void *vmalloc_user(unsigned long size) | 
| 526 | { | 523 | { | 
| @@ -539,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user); | |||
| 539 | 536 | ||
| 540 | /** | 537 | /** | 
| 541 | * vmalloc_node - allocate memory on a specific node | 538 | * vmalloc_node - allocate memory on a specific node | 
| 542 | * | ||
| 543 | * @size: allocation size | 539 | * @size: allocation size | 
| 544 | * @node: numa node | 540 | * @node: numa node | 
| 545 | * | 541 | * | 
| @@ -561,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node); | |||
| 561 | 557 | ||
| 562 | /** | 558 | /** | 
| 563 | * vmalloc_exec - allocate virtually contiguous, executable memory | 559 | * vmalloc_exec - allocate virtually contiguous, executable memory | 
| 564 | * | ||
| 565 | * @size: allocation size | 560 | * @size: allocation size | 
| 566 | * | 561 | * | 
| 567 | * Kernel-internal function to allocate enough pages to cover @size | 562 | * Kernel-internal function to allocate enough pages to cover @size | 
| @@ -579,7 +574,6 @@ void *vmalloc_exec(unsigned long size) | |||
| 579 | 574 | ||
| 580 | /** | 575 | /** | 
| 581 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 576 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 
| 582 | * | ||
| 583 | * @size: allocation size | 577 | * @size: allocation size | 
| 584 | * | 578 | * | 
| 585 | * Allocate enough 32bit PA addressable pages to cover @size from the | 579 | * Allocate enough 32bit PA addressable pages to cover @size from the | 
| @@ -592,11 +586,11 @@ void *vmalloc_32(unsigned long size) | |||
| 592 | EXPORT_SYMBOL(vmalloc_32); | 586 | EXPORT_SYMBOL(vmalloc_32); | 
| 593 | 587 | ||
| 594 | /** | 588 | /** | 
| 595 | * vmalloc_32_user - allocate virtually contiguous memory (32bit | 589 | * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory | 
| 596 | * addressable) which is zeroed so it can be | ||
| 597 | * mapped to userspace without leaking data. | ||
| 598 | * | ||
| 599 | * @size: allocation size | 590 | * @size: allocation size | 
| 591 | * | ||
| 592 | * The resulting memory area is 32bit addressable and zeroed so it can be | ||
| 593 | * mapped to userspace without leaking data. | ||
| 600 | */ | 594 | */ | 
| 601 | void *vmalloc_32_user(unsigned long size) | 595 | void *vmalloc_32_user(unsigned long size) | 
| 602 | { | 596 | { | 
| @@ -690,7 +684,6 @@ finished: | |||
| 690 | 684 | ||
| 691 | /** | 685 | /** | 
| 692 | * remap_vmalloc_range - map vmalloc pages to userspace | 686 | * remap_vmalloc_range - map vmalloc pages to userspace | 
| 693 | * | ||
| 694 | * @vma: vma to cover (map full range of vma) | 687 | * @vma: vma to cover (map full range of vma) | 
| 695 | * @addr: vmalloc memory | 688 | * @addr: vmalloc memory | 
| 696 | * @pgoff: number of pages into addr before first page to map | 689 | * @pgoff: number of pages into addr before first page to map | 
| diff --git a/mm/vmscan.c b/mm/vmscan.c index 72babac71dea..eca70310adb2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> | 
| 20 | #include <linux/init.h> | 20 | #include <linux/init.h> | 
| 21 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> | 
| 22 | #include <linux/vmstat.h> | ||
| 22 | #include <linux/file.h> | 23 | #include <linux/file.h> | 
| 23 | #include <linux/writeback.h> | 24 | #include <linux/writeback.h> | 
| 24 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> | 
| @@ -34,6 +35,7 @@ | |||
| 34 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> | 
| 35 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> | 
| 36 | #include <linux/delay.h> | 37 | #include <linux/delay.h> | 
| 38 | #include <linux/kthread.h> | ||
| 37 | 39 | ||
| 38 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> | 
| 39 | #include <asm/div64.h> | 41 | #include <asm/div64.h> | 
| @@ -46,8 +48,6 @@ struct scan_control { | |||
| 46 | /* Incremented by the number of inactive pages that were scanned */ | 48 | /* Incremented by the number of inactive pages that were scanned */ | 
| 47 | unsigned long nr_scanned; | 49 | unsigned long nr_scanned; | 
| 48 | 50 | ||
| 49 | unsigned long nr_mapped; /* From page_state */ | ||
| 50 | |||
| 51 | /* This context's GFP mask */ | 51 | /* This context's GFP mask */ | 
| 52 | gfp_t gfp_mask; | 52 | gfp_t gfp_mask; | 
| 53 | 53 | ||
| @@ -63,6 +63,8 @@ struct scan_control { | |||
| 63 | int swap_cluster_max; | 63 | int swap_cluster_max; | 
| 64 | 64 | ||
| 65 | int swappiness; | 65 | int swappiness; | 
| 66 | |||
| 67 | int all_unreclaimable; | ||
| 66 | }; | 68 | }; | 
| 67 | 69 | ||
| 68 | /* | 70 | /* | 
| @@ -216,7 +218,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
| 216 | break; | 218 | break; | 
| 217 | if (shrink_ret < nr_before) | 219 | if (shrink_ret < nr_before) | 
| 218 | ret += nr_before - shrink_ret; | 220 | ret += nr_before - shrink_ret; | 
| 219 | mod_page_state(slabs_scanned, this_scan); | 221 | count_vm_events(SLABS_SCANNED, this_scan); | 
| 220 | total_scan -= this_scan; | 222 | total_scan -= this_scan; | 
| 221 | 223 | ||
| 222 | cond_resched(); | 224 | cond_resched(); | 
| @@ -369,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
| 369 | /* synchronous write or broken a_ops? */ | 371 | /* synchronous write or broken a_ops? */ | 
| 370 | ClearPageReclaim(page); | 372 | ClearPageReclaim(page); | 
| 371 | } | 373 | } | 
| 372 | 374 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | |
| 373 | return PAGE_SUCCESS; | 375 | return PAGE_SUCCESS; | 
| 374 | } | 376 | } | 
| 375 | 377 | ||
| @@ -378,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
| 378 | 380 | ||
| 379 | int remove_mapping(struct address_space *mapping, struct page *page) | 381 | int remove_mapping(struct address_space *mapping, struct page *page) | 
| 380 | { | 382 | { | 
| 381 | if (!mapping) | 383 | BUG_ON(!PageLocked(page)); | 
| 382 | return 0; /* truncate got there first */ | 384 | BUG_ON(mapping != page_mapping(page)); | 
| 383 | 385 | ||
| 384 | write_lock_irq(&mapping->tree_lock); | 386 | write_lock_irq(&mapping->tree_lock); | 
| 385 | |||
| 386 | /* | 387 | /* | 
| 387 | * The non-racy check for busy page. It is critical to check | 388 | * The non racy check for a busy page. | 
| 388 | * PageDirty _after_ making sure that the page is freeable and | 389 | * | 
| 389 | * not in use by anybody. (pagecache + us == 2) | 390 | * Must be careful with the order of the tests. When someone has | 
| 391 | * a ref to the page, it may be possible that they dirty it then | ||
| 392 | * drop the reference. So if PageDirty is tested before page_count | ||
| 393 | * here, then the following race may occur: | ||
| 394 | * | ||
| 395 | * get_user_pages(&page); | ||
| 396 | * [user mapping goes away] | ||
| 397 | * write_to(page); | ||
| 398 | * !PageDirty(page) [good] | ||
| 399 | * SetPageDirty(page); | ||
| 400 | * put_page(page); | ||
| 401 | * !page_count(page) [good, discard it] | ||
| 402 | * | ||
| 403 | * [oops, our write_to data is lost] | ||
| 404 | * | ||
| 405 | * Reversing the order of the tests ensures such a situation cannot | ||
| 406 | * escape unnoticed. The smp_rmb is needed to ensure the page->flags | ||
| 407 | * load is not satisfied before that of page->_count. | ||
| 408 | * | ||
| 409 | * Note that if SetPageDirty is always performed via set_page_dirty, | ||
| 410 | * and thus under tree_lock, then this ordering is not required. | ||
| 390 | */ | 411 | */ | 
| 391 | if (unlikely(page_count(page) != 2)) | 412 | if (unlikely(page_count(page) != 2)) | 
| 392 | goto cannot_free; | 413 | goto cannot_free; | 
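The new comment block spells out why remove_mapping() must test page_count() before PageDirty(). Below is a minimal userspace sketch of that ordering, using C11 atomics in place of the kernel's page refcount and smp_rmb(); the struct and function names are invented for illustration and are not kernel API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Analogue of a page: a reference count plus a dirty flag. */
struct buf {
	atomic_int refcount;	/* stands in for page->_count */
	atomic_bool dirty;	/* stands in for PageDirty    */
};

/* Writer side: dirty the buffer, then drop the reference.
 * The release ordering makes the dirty store visible before the
 * decremented count, mirroring SetPageDirty() before put_page(). */
static void writer_put(struct buf *b)
{
	atomic_store_explicit(&b->dirty, true, memory_order_relaxed);
	atomic_fetch_sub_explicit(&b->refcount, 1, memory_order_release);
}

/* Reclaimer side: the same test order as remove_mapping(). */
static bool try_free(struct buf *b)
{
	/* 1. Are we the only holder? (kernel: page_count(page) != 2) */
	if (atomic_load_explicit(&b->refcount, memory_order_acquire) != 1)
		return false;
	/* 2. Only now look at the dirty bit (kernel: smp_rmb(); PageDirty).
	 * The acquire load above keeps this load from being satisfied first. */
	if (atomic_load_explicit(&b->dirty, memory_order_relaxed))
		return false;
	return true;	/* clean and unreferenced: safe to discard */
}

int main(void)
{
	struct buf b = { .refcount = 2, .dirty = false };	/* us + one writer */

	writer_put(&b);				/* writer dirties, then drops its ref */
	printf("freeable: %d\n", try_free(&b));	/* prints 0: the dirty bit was seen */
	return 0;
}

Testing the flags first and the count second would allow the lost-write interleaving shown in the comment; the reversed order plus the read barrier closes that window.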
| @@ -441,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 441 | if (TestSetPageLocked(page)) | 462 | if (TestSetPageLocked(page)) | 
| 442 | goto keep; | 463 | goto keep; | 
| 443 | 464 | ||
| 444 | BUG_ON(PageActive(page)); | 465 | VM_BUG_ON(PageActive(page)); | 
| 445 | 466 | ||
| 446 | sc->nr_scanned++; | 467 | sc->nr_scanned++; | 
| 447 | 468 | ||
| @@ -548,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 548 | goto free_it; | 569 | goto free_it; | 
| 549 | } | 570 | } | 
| 550 | 571 | ||
| 551 | if (!remove_mapping(mapping, page)) | 572 | if (!mapping || !remove_mapping(mapping, page)) | 
| 552 | goto keep_locked; | 573 | goto keep_locked; | 
| 553 | 574 | ||
| 554 | free_it: | 575 | free_it: | 
| @@ -565,12 +586,12 @@ keep_locked: | |||
| 565 | unlock_page(page); | 586 | unlock_page(page); | 
| 566 | keep: | 587 | keep: | 
| 567 | list_add(&page->lru, &ret_pages); | 588 | list_add(&page->lru, &ret_pages); | 
| 568 | BUG_ON(PageLRU(page)); | 589 | VM_BUG_ON(PageLRU(page)); | 
| 569 | } | 590 | } | 
| 570 | list_splice(&ret_pages, page_list); | 591 | list_splice(&ret_pages, page_list); | 
| 571 | if (pagevec_count(&freed_pvec)) | 592 | if (pagevec_count(&freed_pvec)) | 
| 572 | __pagevec_release_nonlru(&freed_pvec); | 593 | __pagevec_release_nonlru(&freed_pvec); | 
| 573 | mod_page_state(pgactivate, pgactivate); | 594 | count_vm_events(PGACTIVATE, pgactivate); | 
| 574 | return nr_reclaimed; | 595 | return nr_reclaimed; | 
| 575 | } | 596 | } | 
| 576 | 597 | ||
| @@ -604,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 604 | page = lru_to_page(src); | 625 | page = lru_to_page(src); | 
| 605 | prefetchw_prev_lru_page(page, src, flags); | 626 | prefetchw_prev_lru_page(page, src, flags); | 
| 606 | 627 | ||
| 607 | BUG_ON(!PageLRU(page)); | 628 | VM_BUG_ON(!PageLRU(page)); | 
| 608 | 629 | ||
| 609 | list_del(&page->lru); | 630 | list_del(&page->lru); | 
| 610 | target = src; | 631 | target = src; | 
| @@ -660,11 +681,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 660 | nr_reclaimed += nr_freed; | 681 | nr_reclaimed += nr_freed; | 
| 661 | local_irq_disable(); | 682 | local_irq_disable(); | 
| 662 | if (current_is_kswapd()) { | 683 | if (current_is_kswapd()) { | 
| 663 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | 684 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); | 
| 664 | __mod_page_state(kswapd_steal, nr_freed); | 685 | __count_vm_events(KSWAPD_STEAL, nr_freed); | 
| 665 | } else | 686 | } else | 
| 666 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | 687 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); | 
| 667 | __mod_page_state_zone(zone, pgsteal, nr_freed); | 688 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); | 
| 668 | 689 | ||
| 669 | if (nr_taken == 0) | 690 | if (nr_taken == 0) | 
| 670 | goto done; | 691 | goto done; | 
| @@ -675,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 675 | */ | 696 | */ | 
| 676 | while (!list_empty(&page_list)) { | 697 | while (!list_empty(&page_list)) { | 
| 677 | page = lru_to_page(&page_list); | 698 | page = lru_to_page(&page_list); | 
| 678 | BUG_ON(PageLRU(page)); | 699 | VM_BUG_ON(PageLRU(page)); | 
| 679 | SetPageLRU(page); | 700 | SetPageLRU(page); | 
| 680 | list_del(&page->lru); | 701 | list_del(&page->lru); | 
| 681 | if (PageActive(page)) | 702 | if (PageActive(page)) | 
| @@ -696,6 +717,11 @@ done: | |||
| 696 | return nr_reclaimed; | 717 | return nr_reclaimed; | 
| 697 | } | 718 | } | 
| 698 | 719 | ||
| 720 | static inline int zone_is_near_oom(struct zone *zone) | ||
| 721 | { | ||
| 722 | return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; | ||
| 723 | } | ||
| 724 | |||
| 699 | /* | 725 | /* | 
| 700 | * This moves pages from the active list to the inactive list. | 726 | * This moves pages from the active list to the inactive list. | 
| 701 | * | 727 | * | 
| @@ -731,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 731 | long distress; | 757 | long distress; | 
| 732 | long swap_tendency; | 758 | long swap_tendency; | 
| 733 | 759 | ||
| 760 | if (zone_is_near_oom(zone)) | ||
| 761 | goto force_reclaim_mapped; | ||
| 762 | |||
| 734 | /* | 763 | /* | 
| 735 | * `distress' is a measure of how much trouble we're having | 764 | * `distress' is a measure of how much trouble we're having | 
| 736 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | 765 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | 
| @@ -743,7 +772,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 743 | * how much memory | 772 | * how much memory | 
| 744 | * is mapped. | 773 | * is mapped. | 
| 745 | */ | 774 | */ | 
| 746 | mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages; | 775 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + | 
| 776 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
| 777 | vm_total_pages; | ||
| 747 | 778 | ||
| 748 | /* | 779 | /* | 
| 749 | * Now decide how much we really want to unmap some pages. The | 780 | * Now decide how much we really want to unmap some pages. The | 
| @@ -764,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 764 | * memory onto the inactive list. | 795 | * memory onto the inactive list. | 
| 765 | */ | 796 | */ | 
| 766 | if (swap_tendency >= 100) | 797 | if (swap_tendency >= 100) | 
| 798 | force_reclaim_mapped: | ||
| 767 | reclaim_mapped = 1; | 799 | reclaim_mapped = 1; | 
| 768 | } | 800 | } | 
| 769 | 801 | ||
| @@ -796,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 796 | while (!list_empty(&l_inactive)) { | 828 | while (!list_empty(&l_inactive)) { | 
| 797 | page = lru_to_page(&l_inactive); | 829 | page = lru_to_page(&l_inactive); | 
| 798 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 830 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 
| 799 | BUG_ON(PageLRU(page)); | 831 | VM_BUG_ON(PageLRU(page)); | 
| 800 | SetPageLRU(page); | 832 | SetPageLRU(page); | 
| 801 | BUG_ON(!PageActive(page)); | 833 | VM_BUG_ON(!PageActive(page)); | 
| 802 | ClearPageActive(page); | 834 | ClearPageActive(page); | 
| 803 | 835 | ||
| 804 | list_move(&page->lru, &zone->inactive_list); | 836 | list_move(&page->lru, &zone->inactive_list); | 
| @@ -826,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 826 | while (!list_empty(&l_active)) { | 858 | while (!list_empty(&l_active)) { | 
| 827 | page = lru_to_page(&l_active); | 859 | page = lru_to_page(&l_active); | 
| 828 | prefetchw_prev_lru_page(page, &l_active, flags); | 860 | prefetchw_prev_lru_page(page, &l_active, flags); | 
| 829 | BUG_ON(PageLRU(page)); | 861 | VM_BUG_ON(PageLRU(page)); | 
| 830 | SetPageLRU(page); | 862 | SetPageLRU(page); | 
| 831 | BUG_ON(!PageActive(page)); | 863 | VM_BUG_ON(!PageActive(page)); | 
| 832 | list_move(&page->lru, &zone->active_list); | 864 | list_move(&page->lru, &zone->active_list); | 
| 833 | pgmoved++; | 865 | pgmoved++; | 
| 834 | if (!pagevec_add(&pvec, page)) { | 866 | if (!pagevec_add(&pvec, page)) { | 
| @@ -840,11 +872,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 840 | } | 872 | } | 
| 841 | } | 873 | } | 
| 842 | zone->nr_active += pgmoved; | 874 | zone->nr_active += pgmoved; | 
| 843 | spin_unlock(&zone->lru_lock); | ||
| 844 | 875 | ||
| 845 | __mod_page_state_zone(zone, pgrefill, pgscanned); | 876 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 
| 846 | __mod_page_state(pgdeactivate, pgdeactivate); | 877 | __count_vm_events(PGDEACTIVATE, pgdeactivate); | 
| 847 | local_irq_enable(); | 878 | spin_unlock_irq(&zone->lru_lock); | 
| 848 | 879 | ||
| 849 | pagevec_release(&pvec); | 880 | pagevec_release(&pvec); | 
| 850 | } | 881 | } | 
| @@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
| 925 | unsigned long nr_reclaimed = 0; | 956 | unsigned long nr_reclaimed = 0; | 
| 926 | int i; | 957 | int i; | 
| 927 | 958 | ||
| 959 | sc->all_unreclaimable = 1; | ||
| 928 | for (i = 0; zones[i] != NULL; i++) { | 960 | for (i = 0; zones[i] != NULL; i++) { | 
| 929 | struct zone *zone = zones[i]; | 961 | struct zone *zone = zones[i]; | 
| 930 | 962 | ||
| @@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
| 941 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 973 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 
| 942 | continue; /* Let kswapd poll it */ | 974 | continue; /* Let kswapd poll it */ | 
| 943 | 975 | ||
| 976 | sc->all_unreclaimable = 0; | ||
| 977 | |||
| 944 | nr_reclaimed += shrink_zone(priority, zone, sc); | 978 | nr_reclaimed += shrink_zone(priority, zone, sc); | 
| 945 | } | 979 | } | 
| 946 | return nr_reclaimed; | 980 | return nr_reclaimed; | 
| @@ -976,7 +1010,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
| 976 | .swappiness = vm_swappiness, | 1010 | .swappiness = vm_swappiness, | 
| 977 | }; | 1011 | }; | 
| 978 | 1012 | ||
| 979 | inc_page_state(allocstall); | 1013 | count_vm_event(ALLOCSTALL); | 
| 980 | 1014 | ||
| 981 | for (i = 0; zones[i] != NULL; i++) { | 1015 | for (i = 0; zones[i] != NULL; i++) { | 
| 982 | struct zone *zone = zones[i]; | 1016 | struct zone *zone = zones[i]; | 
| @@ -989,7 +1023,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
| 989 | } | 1023 | } | 
| 990 | 1024 | ||
| 991 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 1025 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 
| 992 | sc.nr_mapped = read_page_state(nr_mapped); | ||
| 993 | sc.nr_scanned = 0; | 1026 | sc.nr_scanned = 0; | 
| 994 | if (!priority) | 1027 | if (!priority) | 
| 995 | disable_swap_token(); | 1028 | disable_swap_token(); | 
| @@ -1022,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
| 1022 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | 1055 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | 
| 1023 | blk_congestion_wait(WRITE, HZ/10); | 1056 | blk_congestion_wait(WRITE, HZ/10); | 
| 1024 | } | 1057 | } | 
| 1058 | /* top priority shrink_caches still had more to do? don't OOM, then */ | ||
| 1059 | if (!sc.all_unreclaimable) | ||
| 1060 | ret = 1; | ||
| 1025 | out: | 1061 | out: | 
| 1026 | for (i = 0; zones[i] != 0; i++) { | 1062 | for (i = 0; zones[i] != 0; i++) { | 
| 1027 | struct zone *zone = zones[i]; | 1063 | struct zone *zone = zones[i]; | 
| @@ -1074,9 +1110,7 @@ loop_again: | |||
| 1074 | total_scanned = 0; | 1110 | total_scanned = 0; | 
| 1075 | nr_reclaimed = 0; | 1111 | nr_reclaimed = 0; | 
| 1076 | sc.may_writepage = !laptop_mode; | 1112 | sc.may_writepage = !laptop_mode; | 
| 1077 | sc.nr_mapped = read_page_state(nr_mapped); | 1113 | count_vm_event(PAGEOUTRUN); | 
| 1078 | |||
| 1079 | inc_page_state(pageoutrun); | ||
| 1080 | 1114 | ||
| 1081 | for (i = 0; i < pgdat->nr_zones; i++) { | 1115 | for (i = 0; i < pgdat->nr_zones; i++) { | 
| 1082 | struct zone *zone = pgdat->node_zones + i; | 1116 | struct zone *zone = pgdat->node_zones + i; | 
| @@ -1156,7 +1190,7 @@ scan: | |||
| 1156 | if (zone->all_unreclaimable) | 1190 | if (zone->all_unreclaimable) | 
| 1157 | continue; | 1191 | continue; | 
| 1158 | if (nr_slab == 0 && zone->pages_scanned >= | 1192 | if (nr_slab == 0 && zone->pages_scanned >= | 
| 1159 | (zone->nr_active + zone->nr_inactive) * 4) | 1193 | (zone->nr_active + zone->nr_inactive) * 6) | 
| 1160 | zone->all_unreclaimable = 1; | 1194 | zone->all_unreclaimable = 1; | 
| 1161 | /* | 1195 | /* | 
| 1162 | * If we've done a decent amount of scanning and | 1196 | * If we've done a decent amount of scanning and | 
| @@ -1223,7 +1257,6 @@ static int kswapd(void *p) | |||
| 1223 | }; | 1257 | }; | 
| 1224 | cpumask_t cpumask; | 1258 | cpumask_t cpumask; | 
| 1225 | 1259 | ||
| 1226 | daemonize("kswapd%d", pgdat->node_id); | ||
| 1227 | cpumask = node_to_cpumask(pgdat->node_id); | 1260 | cpumask = node_to_cpumask(pgdat->node_id); | 
| 1228 | if (!cpus_empty(cpumask)) | 1261 | if (!cpus_empty(cpumask)) | 
| 1229 | set_cpus_allowed(tsk, cpumask); | 1262 | set_cpus_allowed(tsk, cpumask); | 
| @@ -1365,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
| 1365 | for_each_zone(zone) | 1398 | for_each_zone(zone) | 
| 1366 | lru_pages += zone->nr_active + zone->nr_inactive; | 1399 | lru_pages += zone->nr_active + zone->nr_inactive; | 
| 1367 | 1400 | ||
| 1368 | nr_slab = read_page_state(nr_slab); | 1401 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 
| 1369 | /* If slab caches are huge, it's better to hit them first */ | 1402 | /* If slab caches are huge, it's better to hit them first */ | 
| 1370 | while (nr_slab >= lru_pages) { | 1403 | while (nr_slab >= lru_pages) { | 
| 1371 | reclaim_state.reclaimed_slab = 0; | 1404 | reclaim_state.reclaimed_slab = 0; | 
| @@ -1407,9 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
| 1407 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | 1440 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | 
| 1408 | unsigned long nr_to_scan = nr_pages - ret; | 1441 | unsigned long nr_to_scan = nr_pages - ret; | 
| 1409 | 1442 | ||
| 1410 | sc.nr_mapped = read_page_state(nr_mapped); | ||
| 1411 | sc.nr_scanned = 0; | 1443 | sc.nr_scanned = 0; | 
| 1412 | |||
| 1413 | ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); | 1444 | ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); | 
| 1414 | if (ret >= nr_pages) | 1445 | if (ret >= nr_pages) | 
| 1415 | goto out; | 1446 | goto out; | 
| @@ -1450,7 +1481,7 @@ out: | |||
| 1450 | not required for correctness. So if the last cpu in a node goes | 1481 | not required for correctness. So if the last cpu in a node goes | 
| 1451 | away, we get changed to run anywhere: as the first one comes back, | 1482 | away, we get changed to run anywhere: as the first one comes back, | 
| 1452 | restore their cpu bindings. */ | 1483 | restore their cpu bindings. */ | 
| 1453 | static int cpu_callback(struct notifier_block *nfb, | 1484 | static int __devinit cpu_callback(struct notifier_block *nfb, | 
| 1454 | unsigned long action, void *hcpu) | 1485 | unsigned long action, void *hcpu) | 
| 1455 | { | 1486 | { | 
| 1456 | pg_data_t *pgdat; | 1487 | pg_data_t *pgdat; | 
| @@ -1468,20 +1499,35 @@ static int cpu_callback(struct notifier_block *nfb, | |||
| 1468 | } | 1499 | } | 
| 1469 | #endif /* CONFIG_HOTPLUG_CPU */ | 1500 | #endif /* CONFIG_HOTPLUG_CPU */ | 
| 1470 | 1501 | ||
| 1502 | /* | ||
| 1503 | * This kswapd start function will be called by init and node-hot-add. | ||
| 1504 | * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added. | ||
| 1505 | */ | ||
| 1506 | int kswapd_run(int nid) | ||
| 1507 | { | ||
| 1508 | pg_data_t *pgdat = NODE_DATA(nid); | ||
| 1509 | int ret = 0; | ||
| 1510 | |||
| 1511 | if (pgdat->kswapd) | ||
| 1512 | return 0; | ||
| 1513 | |||
| 1514 | pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); | ||
| 1515 | if (IS_ERR(pgdat->kswapd)) { | ||
| 1516 | /* failure at boot is fatal */ | ||
| 1517 | BUG_ON(system_state == SYSTEM_BOOTING); | ||
| 1518 | printk("Failed to start kswapd on node %d\n",nid); | ||
| 1519 | ret = -1; | ||
| 1520 | } | ||
| 1521 | return ret; | ||
| 1522 | } | ||
| 1523 | |||
| 1471 | static int __init kswapd_init(void) | 1524 | static int __init kswapd_init(void) | 
| 1472 | { | 1525 | { | 
| 1473 | pg_data_t *pgdat; | 1526 | int nid; | 
| 1474 | 1527 | ||
| 1475 | swap_setup(); | 1528 | swap_setup(); | 
| 1476 | for_each_online_pgdat(pgdat) { | 1529 | for_each_online_node(nid) | 
| 1477 | pid_t pid; | 1530 | kswapd_run(nid); | 
| 1478 | |||
| 1479 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); | ||
| 1480 | BUG_ON(pid < 0); | ||
| 1481 | read_lock(&tasklist_lock); | ||
| 1482 | pgdat->kswapd = find_task_by_pid(pid); | ||
| 1483 | read_unlock(&tasklist_lock); | ||
| 1484 | } | ||
| 1485 | hotcpu_notifier(cpu_callback, 0); | 1531 | hotcpu_notifier(cpu_callback, 0); | 
| 1486 | return 0; | 1532 | return 0; | 
| 1487 | } | 1533 | } | 
| @@ -1494,10 +1540,6 @@ module_init(kswapd_init) | |||
| 1494 | * | 1540 | * | 
| 1495 | * If non-zero call zone_reclaim when the number of free pages falls below | 1541 | * If non-zero call zone_reclaim when the number of free pages falls below | 
| 1496 | * the watermarks. | 1542 | * the watermarks. | 
| 1497 | * | ||
| 1498 | * In the future we may add flags to the mode. However, the page allocator | ||
| 1499 | * should only have to check that zone_reclaim_mode != 0 before calling | ||
| 1500 | * zone_reclaim(). | ||
| 1501 | */ | 1543 | */ | 
| 1502 | int zone_reclaim_mode __read_mostly; | 1544 | int zone_reclaim_mode __read_mostly; | 
| 1503 | 1545 | ||
| @@ -1505,12 +1547,6 @@ int zone_reclaim_mode __read_mostly; | |||
| 1505 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ | 1547 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ | 
| 1506 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 1548 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 
| 1507 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 1549 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 
| 1508 | #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ | ||
| 1509 | |||
| 1510 | /* | ||
| 1511 | * Mininum time between zone reclaim scans | ||
| 1512 | */ | ||
| 1513 | int zone_reclaim_interval __read_mostly = 30*HZ; | ||
| 1514 | 1550 | ||
| 1515 | /* | 1551 | /* | 
| 1516 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 1552 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 
| @@ -1520,6 +1556,18 @@ int zone_reclaim_interval __read_mostly = 30*HZ; | |||
| 1520 | #define ZONE_RECLAIM_PRIORITY 4 | 1556 | #define ZONE_RECLAIM_PRIORITY 4 | 
| 1521 | 1557 | ||
| 1522 | /* | 1558 | /* | 
| 1559 | * Percentage of pages in a zone that must be unmapped for zone_reclaim to | ||
| 1560 | * occur. | ||
| 1561 | */ | ||
| 1562 | int sysctl_min_unmapped_ratio = 1; | ||
| 1563 | |||
| 1564 | /* | ||
| 1565 | * If the number of slab pages in a zone grows beyond this percentage then | ||
| 1566 | * slab reclaim needs to occur. | ||
| 1567 | */ | ||
| 1568 | int sysctl_min_slab_ratio = 5; | ||
| 1569 | |||
| 1570 | /* | ||
| 1523 | * Try to free up some pages from this zone through reclaim. | 1571 | * Try to free up some pages from this zone through reclaim. | 
| 1524 | */ | 1572 | */ | 
| 1525 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1573 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 
| @@ -1533,12 +1581,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1533 | struct scan_control sc = { | 1581 | struct scan_control sc = { | 
| 1534 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 1582 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 
| 1535 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 1583 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 
| 1536 | .nr_mapped = read_page_state(nr_mapped), | ||
| 1537 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 1584 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 
| 1538 | SWAP_CLUSTER_MAX), | 1585 | SWAP_CLUSTER_MAX), | 
| 1539 | .gfp_mask = gfp_mask, | 1586 | .gfp_mask = gfp_mask, | 
| 1540 | .swappiness = vm_swappiness, | 1587 | .swappiness = vm_swappiness, | 
| 1541 | }; | 1588 | }; | 
| 1589 | unsigned long slab_reclaimable; | ||
| 1542 | 1590 | ||
| 1543 | disable_swap_token(); | 1591 | disable_swap_token(); | 
| 1544 | cond_resched(); | 1592 | cond_resched(); | 
| @@ -1551,43 +1599,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1551 | reclaim_state.reclaimed_slab = 0; | 1599 | reclaim_state.reclaimed_slab = 0; | 
| 1552 | p->reclaim_state = &reclaim_state; | 1600 | p->reclaim_state = &reclaim_state; | 
| 1553 | 1601 | ||
| 1554 | /* | 1602 | if (zone_page_state(zone, NR_FILE_PAGES) - | 
| 1555 | * Free memory by calling shrink zone with increasing priorities | 1603 | zone_page_state(zone, NR_FILE_MAPPED) > | 
| 1556 | * until we have enough memory freed. | 1604 | zone->min_unmapped_pages) { | 
| 1557 | */ | 1605 | /* | 
| 1558 | priority = ZONE_RECLAIM_PRIORITY; | 1606 | * Free memory by calling shrink zone with increasing | 
| 1559 | do { | 1607 | * priorities until we have enough memory freed. | 
| 1560 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1608 | */ | 
| 1561 | priority--; | 1609 | priority = ZONE_RECLAIM_PRIORITY; | 
| 1562 | } while (priority >= 0 && nr_reclaimed < nr_pages); | 1610 | do { | 
| 1611 | nr_reclaimed += shrink_zone(priority, zone, &sc); | ||
| 1612 | priority--; | ||
| 1613 | } while (priority >= 0 && nr_reclaimed < nr_pages); | ||
| 1614 | } | ||
| 1563 | 1615 | ||
| 1564 | if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { | 1616 | slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 
| 1617 | if (slab_reclaimable > zone->min_slab_pages) { | ||
| 1565 | /* | 1618 | /* | 
| 1566 | * shrink_slab() does not currently allow us to determine how | 1619 | * shrink_slab() does not currently allow us to determine how | 
| 1567 | * many pages were freed in this zone. So we just shake the slab | 1620 | * many pages were freed in this zone. So we take the current | 
| 1568 | * a bit and then go off node for this particular allocation | 1621 | * number of slab pages and shake the slab until it is reduced | 
| 1569 | * despite possibly having freed enough memory to allocate in | 1622 | * by the same nr_pages that we used for reclaiming unmapped | 
| 1570 | * this zone. If we freed local memory then the next | 1623 | * pages. | 
| 1571 | * allocations will be local again. | ||
| 1572 | * | 1624 | * | 
| 1573 | * shrink_slab will free memory on all zones and may take | 1625 | * Note that shrink_slab will free memory on all zones and may | 
| 1574 | * a long time. | 1626 | * take a long time. | 
| 1575 | */ | 1627 | */ | 
| 1576 | shrink_slab(sc.nr_scanned, gfp_mask, order); | 1628 | while (shrink_slab(sc.nr_scanned, gfp_mask, order) && | 
| 1577 | } | 1629 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) > | 
| 1630 | slab_reclaimable - nr_pages) | ||
| 1631 | ; | ||
| 1578 | 1632 | ||
| 1579 | p->reclaim_state = NULL; | ||
| 1580 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | ||
| 1581 | |||
| 1582 | if (nr_reclaimed == 0) { | ||
| 1583 | /* | 1633 | /* | 
| 1584 | * We were unable to reclaim enough pages to stay on node. We | 1634 | * Update nr_reclaimed by the number of slab pages we | 
| 1585 | * now allow off node accesses for a certain time period before | 1635 | * reclaimed from this zone. | 
| 1586 | * trying again to reclaim pages from the local zone. | ||
| 1587 | */ | 1636 | */ | 
| 1588 | zone->last_unsuccessful_zone_reclaim = jiffies; | 1637 | nr_reclaimed += slab_reclaimable - | 
| 1638 | zone_page_state(zone, NR_SLAB_RECLAIMABLE); | ||
| 1589 | } | 1639 | } | 
| 1590 | 1640 | ||
| 1641 | p->reclaim_state = NULL; | ||
| 1642 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | ||
| 1591 | return nr_reclaimed >= nr_pages; | 1643 | return nr_reclaimed >= nr_pages; | 
| 1592 | } | 1644 | } | 
| 1593 | 1645 | ||
| @@ -1597,14 +1649,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1597 | int node_id; | 1649 | int node_id; | 
| 1598 | 1650 | ||
| 1599 | /* | 1651 | /* | 
| 1600 | * Do not reclaim if there was a recent unsuccessful attempt at zone | 1652 | * Zone reclaim reclaims unmapped file backed pages and | 
| 1601 | * reclaim. In that case we let allocations go off node for the | 1653 | * slab pages if we are over the defined limits. | 
| 1602 | * zone_reclaim_interval. Otherwise we would scan for each off-node | 1654 | * | 
| 1603 | * page allocation. | 1655 | * A small portion of unmapped file backed pages is needed for | 
| 1656 | * file I/O otherwise pages read by file I/O will be immediately | ||
| 1657 | * thrown out if the zone is overallocated. So we do not reclaim | ||
| 1658 | * if less than a specified percentage of the zone is used by | ||
| 1659 | * unmapped file backed pages. | ||
| 1604 | */ | 1660 | */ | 
| 1605 | if (time_before(jiffies, | 1661 | if (zone_page_state(zone, NR_FILE_PAGES) - | 
| 1606 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | 1662 | zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages | 
| 1607 | return 0; | 1663 | && zone_page_state(zone, NR_SLAB_RECLAIMABLE) | 
| 1664 | <= zone->min_slab_pages) | ||
| 1665 | return 0; | ||
| 1608 | 1666 | ||
| 1609 | /* | 1667 | /* | 
| 1610 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | 1668 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | 
| @@ -1623,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1623 | * over remote processors and spread off node memory allocations | 1681 | * over remote processors and spread off node memory allocations | 
| 1624 | * as wide as possible. | 1682 | * as wide as possible. | 
| 1625 | */ | 1683 | */ | 
| 1626 | node_id = zone->zone_pgdat->node_id; | 1684 | node_id = zone_to_nid(zone); | 
| 1627 | mask = node_to_cpumask(node_id); | 1685 | mask = node_to_cpumask(node_id); | 
| 1628 | if (!cpus_empty(mask) && node_id != numa_node_id()) | 1686 | if (!cpus_empty(mask) && node_id != numa_node_id()) | 
| 1629 | return 0; | 1687 | return 0; | 
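The reworked zone_reclaim() path above only proceeds when the zone has more unmapped pagecache than min_unmapped_pages or more reclaimable slab than min_slab_pages. A standalone sketch of that gate with invented numbers; the per-zone limits are assumed here to be roughly 1% and 5% of the zone, following the sysctl_min_unmapped_ratio and sysctl_min_slab_ratio defaults introduced above.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* Invented counters for a ~256K-page (1 GB with 4 KB pages) zone. */
	unsigned long nr_file_pages       = 120000;	/* pagecache pages in zone */
	unsigned long nr_file_mapped      = 118000;	/* of those, mapped        */
	unsigned long nr_slab_reclaimable =   3000;
	unsigned long min_unmapped_pages  =   2621;	/* ~1% of the zone         */
	unsigned long min_slab_pages      =  13107;	/* ~5% of the zone         */

	bool unmapped_low = nr_file_pages - nr_file_mapped <= min_unmapped_pages;
	bool slab_low     = nr_slab_reclaimable <= min_slab_pages;

	/* Both limits respected: zone_reclaim() returns 0 and the allocation
	 * simply falls back to another zone or node instead of scanning. */
	printf("skip zone_reclaim: %s\n", (unmapped_low && slab_low) ? "yes" : "no");
	return 0;
}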
| diff --git a/mm/vmstat.c b/mm/vmstat.c new file mode 100644 index 000000000000..a2b6a9f96e5c --- /dev/null +++ b/mm/vmstat.c | |||
| @@ -0,0 +1,706 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/vmstat.c | ||
| 3 | * | ||
| 4 | * Manages VM statistics | ||
| 5 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
| 6 | * | ||
| 7 | * zoned VM statistics | ||
| 8 | * Copyright (C) 2006 Silicon Graphics, Inc., | ||
| 9 | * Christoph Lameter <christoph@lameter.com> | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/config.h> | ||
| 13 | #include <linux/mm.h> | ||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/cpu.h> | ||
| 16 | |||
| 17 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | ||
| 18 | unsigned long *free, struct pglist_data *pgdat) | ||
| 19 | { | ||
| 20 | struct zone *zones = pgdat->node_zones; | ||
| 21 | int i; | ||
| 22 | |||
| 23 | *active = 0; | ||
| 24 | *inactive = 0; | ||
| 25 | *free = 0; | ||
| 26 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
| 27 | *active += zones[i].nr_active; | ||
| 28 | *inactive += zones[i].nr_inactive; | ||
| 29 | *free += zones[i].free_pages; | ||
| 30 | } | ||
| 31 | } | ||
| 32 | |||
| 33 | void get_zone_counts(unsigned long *active, | ||
| 34 | unsigned long *inactive, unsigned long *free) | ||
| 35 | { | ||
| 36 | struct pglist_data *pgdat; | ||
| 37 | |||
| 38 | *active = 0; | ||
| 39 | *inactive = 0; | ||
| 40 | *free = 0; | ||
| 41 | for_each_online_pgdat(pgdat) { | ||
| 42 | unsigned long l, m, n; | ||
| 43 | __get_zone_counts(&l, &m, &n, pgdat); | ||
| 44 | *active += l; | ||
| 45 | *inactive += m; | ||
| 46 | *free += n; | ||
| 47 | } | ||
| 48 | } | ||
| 49 | |||
| 50 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
| 51 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | ||
| 52 | EXPORT_PER_CPU_SYMBOL(vm_event_states); | ||
| 53 | |||
| 54 | static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) | ||
| 55 | { | ||
| 56 | int cpu = 0; | ||
| 57 | int i; | ||
| 58 | |||
| 59 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); | ||
| 60 | |||
| 61 | cpu = first_cpu(*cpumask); | ||
| 62 | while (cpu < NR_CPUS) { | ||
| 63 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); | ||
| 64 | |||
| 65 | cpu = next_cpu(cpu, *cpumask); | ||
| 66 | |||
| 67 | if (cpu < NR_CPUS) | ||
| 68 | prefetch(&per_cpu(vm_event_states, cpu)); | ||
| 69 | |||
| 70 | |||
| 71 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) | ||
| 72 | ret[i] += this->event[i]; | ||
| 73 | } | ||
| 74 | } | ||
| 75 | |||
| 76 | /* | ||
| 77 | * Accumulate the vm event counters across all CPUs. | ||
| 78 | * The result is unavoidably approximate - it can change | ||
| 79 | * during and after execution of this function. | ||
| 80 | */ | ||
| 81 | void all_vm_events(unsigned long *ret) | ||
| 82 | { | ||
| 83 | sum_vm_events(ret, &cpu_online_map); | ||
| 84 | } | ||
| 85 | EXPORT_SYMBOL_GPL(all_vm_events); | ||
| 86 | |||
| 87 | #ifdef CONFIG_HOTPLUG | ||
| 88 | /* | ||
| 89 | * Fold the foreign cpu events into our own. | ||
| 90 | * | ||
| 91 | * This is adding to the events on one processor | ||
| 92 | * but keeps the global counts constant. | ||
| 93 | */ | ||
| 94 | void vm_events_fold_cpu(int cpu) | ||
| 95 | { | ||
| 96 | struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); | ||
| 97 | int i; | ||
| 98 | |||
| 99 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { | ||
| 100 | count_vm_events(i, fold_state->event[i]); | ||
| 101 | fold_state->event[i] = 0; | ||
| 102 | } | ||
| 103 | } | ||
| 104 | #endif /* CONFIG_HOTPLUG */ | ||
| 105 | |||
| 106 | #endif /* CONFIG_VM_EVENT_COUNTERS */ | ||
| 107 | |||
| 108 | /* | ||
| 109 | * Manage combined zone based / global counters | ||
| 110 | * | ||
| 111 | * vm_stat contains the global counters | ||
| 112 | */ | ||
| 113 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | ||
| 114 | EXPORT_SYMBOL(vm_stat); | ||
| 115 | |||
| 116 | #ifdef CONFIG_SMP | ||
| 117 | |||
| 118 | static int calculate_threshold(struct zone *zone) | ||
| 119 | { | ||
| 120 | int threshold; | ||
| 121 | int mem; /* memory in 128 MB units */ | ||
| 122 | |||
| 123 | /* | ||
| 124 | * The threshold scales with the number of processors and the amount | ||
| 125 | * of memory per zone. More memory means that we can defer updates for | ||
| 126 | * longer, more processors could lead to more contention. | ||
| 127 | * fls() is used to have a cheap way of logarithmic scaling. | ||
| 128 | * | ||
| 129 | * Some sample thresholds: | ||
| 130 | * | ||
| 131 | * Threshold Processors (fls) Zonesize fls(mem+1) | ||
| 132 | * ------------------------------------------------------------------ | ||
| 133 | * 8 1 1 0.9-1 GB 4 | ||
| 134 | * 16 2 2 0.9-1 GB 4 | ||
| 135 | * 20 2 2 1-2 GB 5 | ||
| 136 | * 24 2 2 2-4 GB 6 | ||
| 137 | * 28 2 2 4-8 GB 7 | ||
| 138 | * 32 2 2 8-16 GB 8 | ||
| 139 | * 4 2 2 <128M 1 | ||
| 140 | * 30 4 3 2-4 GB 5 | ||
| 141 | * 48 4 3 8-16 GB 8 | ||
| 142 | * 32 8 4 1-2 GB 4 | ||
| 143 | * 32 8 4 0.9-1GB 4 | ||
| 144 | * 10 16 5 <128M 1 | ||
| 145 | * 40 16 5 900M 4 | ||
| 146 | * 70 64 7 2-4 GB 5 | ||
| 147 | * 84 64 7 4-8 GB 6 | ||
| 148 | * 108 512 9 4-8 GB 6 | ||
| 149 | * 125 1024 10 8-16 GB 8 | ||
| 150 | * 125 1024 10 16-32 GB 9 | ||
| 151 | */ | ||
| 152 | |||
| 153 | mem = zone->present_pages >> (27 - PAGE_SHIFT); | ||
| 154 | |||
| 155 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); | ||
| 156 | |||
| 157 | /* | ||
| 158 | * Maximum threshold is 125 | ||
| 159 | */ | ||
| 160 | threshold = min(125, threshold); | ||
| 161 | |||
| 162 | return threshold; | ||
| 163 | } | ||
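A worked example of the calculate_threshold() formula, as a userspace sketch; fls() is reimplemented locally as a stand-in for the kernel helper, and the 2 GB zone with 4 KB pages is an invented configuration.

#include <stdio.h>

/* Position of the most significant set bit, 0 for x == 0 (like kernel fls()). */
static int fls(unsigned int x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned int cpus = 4;
	unsigned long zone_pages = 1UL << 19;		/* 2 GB of 4 KB pages */
	unsigned int mem = zone_pages >> (27 - 12);	/* zone size in 128 MB units */
	int threshold = 2 * fls(cpus) * (1 + fls(mem));

	if (threshold > 125)	/* same cap as the kernel code */
		threshold = 125;

	/* cpus=4 -> fls=3, mem=16 -> fls=5, so threshold = 2 * 3 * 6 = 36. */
	printf("stat_threshold = %d\n", threshold);
	return 0;
}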
| 164 | |||
| 165 | /* | ||
| 166 | * Refresh the thresholds for each zone. | ||
| 167 | */ | ||
| 168 | static void refresh_zone_stat_thresholds(void) | ||
| 169 | { | ||
| 170 | struct zone *zone; | ||
| 171 | int cpu; | ||
| 172 | int threshold; | ||
| 173 | |||
| 174 | for_each_zone(zone) { | ||
| 175 | |||
| 176 | if (!zone->present_pages) | ||
| 177 | continue; | ||
| 178 | |||
| 179 | threshold = calculate_threshold(zone); | ||
| 180 | |||
| 181 | for_each_online_cpu(cpu) | ||
| 182 | zone_pcp(zone, cpu)->stat_threshold = threshold; | ||
| 183 | } | ||
| 184 | } | ||
| 185 | |||
| 186 | /* | ||
| 187 | * For use when we know that interrupts are disabled. | ||
| 188 | */ | ||
| 189 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
| 190 | int delta) | ||
| 191 | { | ||
| 192 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | ||
| 193 | s8 *p = pcp->vm_stat_diff + item; | ||
| 194 | long x; | ||
| 195 | |||
| 196 | x = delta + *p; | ||
| 197 | |||
| 198 | if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { | ||
| 199 | zone_page_state_add(x, zone, item); | ||
| 200 | x = 0; | ||
| 201 | } | ||
| 202 | *p = x; | ||
| 203 | } | ||
| 204 | EXPORT_SYMBOL(__mod_zone_page_state); | ||
| 205 | |||
| 206 | /* | ||
| 207 | * For an unknown interrupt state | ||
| 208 | */ | ||
| 209 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
| 210 | int delta) | ||
| 211 | { | ||
| 212 | unsigned long flags; | ||
| 213 | |||
| 214 | local_irq_save(flags); | ||
| 215 | __mod_zone_page_state(zone, item, delta); | ||
| 216 | local_irq_restore(flags); | ||
| 217 | } | ||
| 218 | EXPORT_SYMBOL(mod_zone_page_state); | ||
| 219 | |||
| 220 | /* | ||
| 221 | * Optimized increment and decrement functions. | ||
| 222 | * | ||
| 223 | * These are only for a single page and therefore can take a struct page * | ||
| 224 | * argument instead of struct zone *. This allows the inclusion of the code | ||
| 225 | * generated for page_zone(page) into the optimized functions. | ||
| 226 | * | ||
| 227 | * No overflow check is necessary and therefore the differential can be | ||
| 228 | * incremented or decremented in place which may allow the compilers to | ||
| 229 | * generate better code. | ||
| 230 | * The increment or decrement is known and therefore one boundary check can | ||
| 231 | * be omitted. | ||
| 232 | * | ||
| 233 | * NOTE: These functions are very performance sensitive. Change only | ||
| 234 | * with care. | ||
| 235 | * | ||
| 236 | * Some processors have inc/dec instructions that are atomic vs an interrupt. | ||
| 237 | * However, the code must first determine the differential location in a zone | ||
| 238 | * based on the processor number and then inc/dec the counter. There is no | ||
| 239 | * guarantee without disabling preemption that the processor will not change | ||
| 240 | * in between and therefore the atomicity vs. interrupt cannot be exploited | ||
| 241 | * in a useful way here. | ||
| 242 | */ | ||
| 243 | static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
| 244 | { | ||
| 245 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | ||
| 246 | s8 *p = pcp->vm_stat_diff + item; | ||
| 247 | |||
| 248 | (*p)++; | ||
| 249 | |||
| 250 | if (unlikely(*p > pcp->stat_threshold)) { | ||
| 251 | int overstep = pcp->stat_threshold / 2; | ||
| 252 | |||
| 253 | zone_page_state_add(*p + overstep, zone, item); | ||
| 254 | *p = -overstep; | ||
| 255 | } | ||
| 256 | } | ||
| 257 | |||
| 258 | void __inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
| 259 | { | ||
| 260 | __inc_zone_state(page_zone(page), item); | ||
| 261 | } | ||
| 262 | EXPORT_SYMBOL(__inc_zone_page_state); | ||
| 263 | |||
| 264 | void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
| 265 | { | ||
| 266 | struct zone *zone = page_zone(page); | ||
| 267 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | ||
| 268 | s8 *p = pcp->vm_stat_diff + item; | ||
| 269 | |||
| 270 | (*p)--; | ||
| 271 | |||
| 272 | if (unlikely(*p < - pcp->stat_threshold)) { | ||
| 273 | int overstep = pcp->stat_threshold / 2; | ||
| 274 | |||
| 275 | zone_page_state_add(*p - overstep, zone, item); | ||
| 276 | *p = overstep; | ||
| 277 | } | ||
| 278 | } | ||
| 279 | EXPORT_SYMBOL(__dec_zone_page_state); | ||
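All of the counter helpers above follow one pattern: accumulate small deltas in a per-cpu s8 and fold them into the shared atomic only once they cross stat_threshold, so the hot path almost never touches the global cacheline. A self-contained userspace sketch of that pattern, with a single "cpu", a fixed threshold, and invented names.

#include <stdatomic.h>
#include <stdio.h>

#define STAT_THRESHOLD 32		/* fixed here; per-zone in the kernel */

static atomic_long global_counter;	/* analogue of vm_stat[item] */

struct pcpu {
	signed char diff;		/* analogue of pcp->vm_stat_diff[item] */
};

/* Analogue of __mod_zone_page_state(): cheap local update, rare global fold. */
static void mod_state(struct pcpu *pcp, int delta)
{
	long x = delta + pcp->diff;

	if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
		atomic_fetch_add(&global_counter, x);	/* fold into the global */
		x = 0;
	}
	pcp->diff = x;
}

int main(void)
{
	struct pcpu cpu0 = { 0 };
	int i;

	for (i = 0; i < 100; i++)
		mod_state(&cpu0, 1);

	/* Three folds of 33 each: global = 99, the remaining 1 sits locally. */
	printf("global=%ld local=%d\n",
	       atomic_load(&global_counter), cpu0.diff);
	return 0;
}

Readers of the global counter may therefore see a value that lags by up to the threshold per cpu, which is exactly the imprecision the zoned counters accept in exchange for scalability.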
| 280 | |||
| 281 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
| 282 | { | ||
| 283 | unsigned long flags; | ||
| 284 | |||
| 285 | local_irq_save(flags); | ||
| 286 | __inc_zone_state(zone, item); | ||
| 287 | local_irq_restore(flags); | ||
| 288 | } | ||
| 289 | |||
| 290 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
| 291 | { | ||
| 292 | unsigned long flags; | ||
| 293 | struct zone *zone; | ||
| 294 | |||
| 295 | zone = page_zone(page); | ||
| 296 | local_irq_save(flags); | ||
| 297 | __inc_zone_state(zone, item); | ||
| 298 | local_irq_restore(flags); | ||
| 299 | } | ||
| 300 | EXPORT_SYMBOL(inc_zone_page_state); | ||
| 301 | |||
| 302 | void dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
| 303 | { | ||
| 304 | unsigned long flags; | ||
| 305 | |||
| 306 | local_irq_save(flags); | ||
| 307 | __dec_zone_page_state(page, item); | ||
| 308 | local_irq_restore(flags); | ||
| 309 | } | ||
| 310 | EXPORT_SYMBOL(dec_zone_page_state); | ||
| 311 | |||
| 312 | /* | ||
| 313 | * Update the zone counters for one cpu. | ||
| 314 | */ | ||
| 315 | void refresh_cpu_vm_stats(int cpu) | ||
| 316 | { | ||
| 317 | struct zone *zone; | ||
| 318 | int i; | ||
| 319 | unsigned long flags; | ||
| 320 | |||
| 321 | for_each_zone(zone) { | ||
| 322 | struct per_cpu_pageset *pcp; | ||
| 323 | |||
| 324 | if (!populated_zone(zone)) | ||
| 325 | continue; | ||
| 326 | |||
| 327 | pcp = zone_pcp(zone, cpu); | ||
| 328 | |||
| 329 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
| 330 | if (pcp->vm_stat_diff[i]) { | ||
| 331 | local_irq_save(flags); | ||
| 332 | zone_page_state_add(pcp->vm_stat_diff[i], | ||
| 333 | zone, i); | ||
| 334 | pcp->vm_stat_diff[i] = 0; | ||
| 335 | local_irq_restore(flags); | ||
| 336 | } | ||
| 337 | } | ||
| 338 | } | ||
| 339 | |||
| 340 | static void __refresh_cpu_vm_stats(void *dummy) | ||
| 341 | { | ||
| 342 | refresh_cpu_vm_stats(smp_processor_id()); | ||
| 343 | } | ||
| 344 | |||
| 345 | /* | ||
| 346 | * Consolidate all counters. | ||
| 347 | * | ||
| 348 | * Note that the result is less inaccurate but still inaccurate | ||
| 349 | * if concurrent processes are allowed to run. | ||
| 350 | */ | ||
| 351 | void refresh_vm_stats(void) | ||
| 352 | { | ||
| 353 | on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); | ||
| 354 | } | ||
| 355 | EXPORT_SYMBOL(refresh_vm_stats); | ||
| 356 | |||
| 357 | #endif | ||
| 358 | |||
| 359 | #ifdef CONFIG_NUMA | ||
| 360 | /* | ||
| 361 | * zonelist = the list of zones passed to the allocator | ||
| 362 | * z = the zone from which the allocation occurred. | ||
| 363 | * | ||
| 364 | * Must be called with interrupts disabled. | ||
| 365 | */ | ||
| 366 | void zone_statistics(struct zonelist *zonelist, struct zone *z) | ||
| 367 | { | ||
| 368 | if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { | ||
| 369 | __inc_zone_state(z, NUMA_HIT); | ||
| 370 | } else { | ||
| 371 | __inc_zone_state(z, NUMA_MISS); | ||
| 372 | __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); | ||
| 373 | } | ||
| 374 | if (z->node == numa_node_id()) | ||
| 375 | __inc_zone_state(z, NUMA_LOCAL); | ||
| 376 | else | ||
| 377 | __inc_zone_state(z, NUMA_OTHER); | ||
| 378 | } | ||
| 379 | #endif | ||
| 380 | |||
| 381 | #ifdef CONFIG_PROC_FS | ||
| 382 | |||
| 383 | #include <linux/seq_file.h> | ||
| 384 | |||
| 385 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
| 386 | { | ||
| 387 | pg_data_t *pgdat; | ||
| 388 | loff_t node = *pos; | ||
| 389 | for (pgdat = first_online_pgdat(); | ||
| 390 | pgdat && node; | ||
| 391 | pgdat = next_online_pgdat(pgdat)) | ||
| 392 | --node; | ||
| 393 | |||
| 394 | return pgdat; | ||
| 395 | } | ||
| 396 | |||
| 397 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
| 398 | { | ||
| 399 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 400 | |||
| 401 | (*pos)++; | ||
| 402 | return next_online_pgdat(pgdat); | ||
| 403 | } | ||
| 404 | |||
| 405 | static void frag_stop(struct seq_file *m, void *arg) | ||
| 406 | { | ||
| 407 | } | ||
| 408 | |||
| 409 | /* | ||
| 410 | * This walks the free areas for each zone. | ||
| 411 | */ | ||
| 412 | static int frag_show(struct seq_file *m, void *arg) | ||
| 413 | { | ||
| 414 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 415 | struct zone *zone; | ||
| 416 | struct zone *node_zones = pgdat->node_zones; | ||
| 417 | unsigned long flags; | ||
| 418 | int order; | ||
| 419 | |||
| 420 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
| 421 | if (!populated_zone(zone)) | ||
| 422 | continue; | ||
| 423 | |||
| 424 | spin_lock_irqsave(&zone->lock, flags); | ||
| 425 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
| 426 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 427 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
| 428 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 429 | seq_putc(m, '\n'); | ||
| 430 | } | ||
| 431 | return 0; | ||
| 432 | } | ||
| 433 | |||
| 434 | struct seq_operations fragmentation_op = { | ||
| 435 | .start = frag_start, | ||
| 436 | .next = frag_next, | ||
| 437 | .stop = frag_stop, | ||
| 438 | .show = frag_show, | ||
| 439 | }; | ||
| 440 | |||
| 441 | #ifdef CONFIG_ZONE_DMA32 | ||
| 442 | #define TEXT_FOR_DMA32(xx) xx "_dma32", | ||
| 443 | #else | ||
| 444 | #define TEXT_FOR_DMA32(xx) | ||
| 445 | #endif | ||
| 446 | |||
| 447 | #ifdef CONFIG_HIGHMEM | ||
| 448 | #define TEXT_FOR_HIGHMEM(xx) xx "_high", | ||
| 449 | #else | ||
| 450 | #define TEXT_FOR_HIGHMEM(xx) | ||
| 451 | #endif | ||
| 452 | |||
| 453 | #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ | ||
| 454 | TEXT_FOR_HIGHMEM(xx) | ||
| 455 | |||
| 456 | static char *vmstat_text[] = { | ||
| 457 | /* Zoned VM counters */ | ||
| 458 | "nr_anon_pages", | ||
| 459 | "nr_mapped", | ||
| 460 | "nr_file_pages", | ||
| 461 | "nr_slab_reclaimable", | ||
| 462 | "nr_slab_unreclaimable", | ||
| 463 | "nr_page_table_pages", | ||
| 464 | "nr_dirty", | ||
| 465 | "nr_writeback", | ||
| 466 | "nr_unstable", | ||
| 467 | "nr_bounce", | ||
| 468 | "nr_vmscan_write", | ||
| 469 | |||
| 470 | #ifdef CONFIG_NUMA | ||
| 471 | "numa_hit", | ||
| 472 | "numa_miss", | ||
| 473 | "numa_foreign", | ||
| 474 | "numa_interleave", | ||
| 475 | "numa_local", | ||
| 476 | "numa_other", | ||
| 477 | #endif | ||
| 478 | |||
| 479 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
| 480 | "pgpgin", | ||
| 481 | "pgpgout", | ||
| 482 | "pswpin", | ||
| 483 | "pswpout", | ||
| 484 | |||
| 485 | TEXTS_FOR_ZONES("pgalloc") | ||
| 486 | |||
| 487 | "pgfree", | ||
| 488 | "pgactivate", | ||
| 489 | "pgdeactivate", | ||
| 490 | |||
| 491 | "pgfault", | ||
| 492 | "pgmajfault", | ||
| 493 | |||
| 494 | TEXTS_FOR_ZONES("pgrefill") | ||
| 495 | TEXTS_FOR_ZONES("pgsteal") | ||
| 496 | TEXTS_FOR_ZONES("pgscan_kswapd") | ||
| 497 | TEXTS_FOR_ZONES("pgscan_direct") | ||
| 498 | |||
| 499 | "pginodesteal", | ||
| 500 | "slabs_scanned", | ||
| 501 | "kswapd_steal", | ||
| 502 | "kswapd_inodesteal", | ||
| 503 | "pageoutrun", | ||
| 504 | "allocstall", | ||
| 505 | |||
| 506 | "pgrotated", | ||
| 507 | #endif | ||
| 508 | }; | ||
| 509 | |||
| 510 | /* | ||
| 511 | * Output information about zones in @pgdat. | ||
| 512 | */ | ||
| 513 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
| 514 | { | ||
| 515 | pg_data_t *pgdat = arg; | ||
| 516 | struct zone *zone; | ||
| 517 | struct zone *node_zones = pgdat->node_zones; | ||
| 518 | unsigned long flags; | ||
| 519 | |||
| 520 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | ||
| 521 | int i; | ||
| 522 | |||
| 523 | if (!populated_zone(zone)) | ||
| 524 | continue; | ||
| 525 | |||
| 526 | spin_lock_irqsave(&zone->lock, flags); | ||
| 527 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | ||
| 528 | seq_printf(m, | ||
| 529 | "\n pages free %lu" | ||
| 530 | "\n min %lu" | ||
| 531 | "\n low %lu" | ||
| 532 | "\n high %lu" | ||
| 533 | "\n active %lu" | ||
| 534 | "\n inactive %lu" | ||
| 535 | "\n scanned %lu (a: %lu i: %lu)" | ||
| 536 | "\n spanned %lu" | ||
| 537 | "\n present %lu", | ||
| 538 | zone->free_pages, | ||
| 539 | zone->pages_min, | ||
| 540 | zone->pages_low, | ||
| 541 | zone->pages_high, | ||
| 542 | zone->nr_active, | ||
| 543 | zone->nr_inactive, | ||
| 544 | zone->pages_scanned, | ||
| 545 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
| 546 | zone->spanned_pages, | ||
| 547 | zone->present_pages); | ||
| 548 | |||
| 549 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
| 550 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | ||
| 551 | zone_page_state(zone, i)); | ||
| 552 | |||
| 553 | seq_printf(m, | ||
| 554 | "\n protection: (%lu", | ||
| 555 | zone->lowmem_reserve[0]); | ||
| 556 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | ||
| 557 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | ||
| 558 | seq_printf(m, | ||
| 559 | ")" | ||
| 560 | "\n pagesets"); | ||
| 561 | for_each_online_cpu(i) { | ||
| 562 | struct per_cpu_pageset *pageset; | ||
| 563 | int j; | ||
| 564 | |||
| 565 | pageset = zone_pcp(zone, i); | ||
| 566 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
| 567 | if (pageset->pcp[j].count) | ||
| 568 | break; | ||
| 569 | } | ||
| 570 | if (j == ARRAY_SIZE(pageset->pcp)) | ||
| 571 | continue; | ||
| 572 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
| 573 | seq_printf(m, | ||
| 574 | "\n cpu: %i pcp: %i" | ||
| 575 | "\n count: %i" | ||
| 576 | "\n high: %i" | ||
| 577 | "\n batch: %i", | ||
| 578 | i, j, | ||
| 579 | pageset->pcp[j].count, | ||
| 580 | pageset->pcp[j].high, | ||
| 581 | pageset->pcp[j].batch); | ||
| 582 | } | ||
| 583 | #ifdef CONFIG_SMP | ||
| 584 | seq_printf(m, "\n vm stats threshold: %d", | ||
| 585 | pageset->stat_threshold); | ||
| 586 | #endif | ||
| 587 | } | ||
| 588 | seq_printf(m, | ||
| 589 | "\n all_unreclaimable: %u" | ||
| 590 | "\n prev_priority: %i" | ||
| 591 | "\n temp_priority: %i" | ||
| 592 | "\n start_pfn: %lu", | ||
| 593 | zone->all_unreclaimable, | ||
| 594 | zone->prev_priority, | ||
| 595 | zone->temp_priority, | ||
| 596 | zone->zone_start_pfn); | ||
| 597 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 598 | seq_putc(m, '\n'); | ||
| 599 | } | ||
| 600 | return 0; | ||
| 601 | } | ||
| 602 | |||
| 603 | struct seq_operations zoneinfo_op = { | ||
| 604 | .start = frag_start, /* iterate over all zones. The same as in | ||
| 605 | * fragmentation. */ | ||
| 606 | .next = frag_next, | ||
| 607 | .stop = frag_stop, | ||
| 608 | .show = zoneinfo_show, | ||
| 609 | }; | ||
| 610 | |||
| 611 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | ||
| 612 | { | ||
| 613 | unsigned long *v; | ||
| 614 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
| 615 | unsigned long *e; | ||
| 616 | #endif | ||
| 617 | int i; | ||
| 618 | |||
| 619 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
| 620 | return NULL; | ||
| 621 | |||
| 622 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
| 623 | v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) | ||
| 624 | + sizeof(struct vm_event_state), GFP_KERNEL); | ||
| 625 | #else | ||
| 626 | v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), | ||
| 627 | GFP_KERNEL); | ||
| 628 | #endif | ||
| 629 | m->private = v; | ||
| 630 | if (!v) | ||
| 631 | return ERR_PTR(-ENOMEM); | ||
| 632 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
| 633 | v[i] = global_page_state(i); | ||
| 634 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
| 635 | e = v + NR_VM_ZONE_STAT_ITEMS; | ||
| 636 | all_vm_events(e); | ||
| 637 | e[PGPGIN] /= 2; /* sectors -> kbytes */ | ||
| 638 | e[PGPGOUT] /= 2; | ||
| 639 | #endif | ||
| 640 | return v + *pos; | ||
| 641 | } | ||
| 642 | |||
| 643 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) | ||
| 644 | { | ||
| 645 | (*pos)++; | ||
| 646 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
| 647 | return NULL; | ||
| 648 | return (unsigned long *)m->private + *pos; | ||
| 649 | } | ||
| 650 | |||
| 651 | static int vmstat_show(struct seq_file *m, void *arg) | ||
| 652 | { | ||
| 653 | unsigned long *l = arg; | ||
| 654 | unsigned long off = l - (unsigned long *)m->private; | ||
| 655 | |||
| 656 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | ||
| 657 | return 0; | ||
| 658 | } | ||
| 659 | |||
| 660 | static void vmstat_stop(struct seq_file *m, void *arg) | ||
| 661 | { | ||
| 662 | kfree(m->private); | ||
| 663 | m->private = NULL; | ||
| 664 | } | ||
| 665 | |||
| 666 | struct seq_operations vmstat_op = { | ||
| 667 | .start = vmstat_start, | ||
| 668 | .next = vmstat_next, | ||
| 669 | .stop = vmstat_stop, | ||
| 670 | .show = vmstat_show, | ||
| 671 | }; | ||
| 672 | |||
| 673 | #endif /* CONFIG_PROC_FS */ | ||
| 674 | |||
| 675 | #ifdef CONFIG_SMP | ||
| 676 | /* | ||
| 677 | * Use the cpu notifier to ensure that the thresholds are recalculated | ||
| 678 | * when necessary. | ||
| 679 | */ | ||
| 680 | static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | ||
| 681 | unsigned long action, | ||
| 682 | void *hcpu) | ||
| 683 | { | ||
| 684 | switch (action) { | ||
| 685 | case CPU_UP_PREPARE: | ||
| 686 | case CPU_UP_CANCELED: | ||
| 687 | case CPU_DEAD: | ||
| 688 | refresh_zone_stat_thresholds(); | ||
| 689 | break; | ||
| 690 | default: | ||
| 691 | break; | ||
| 692 | } | ||
| 693 | return NOTIFY_OK; | ||
| 694 | } | ||
| 695 | |||
| 696 | static struct notifier_block __cpuinitdata vmstat_notifier = | ||
| 697 | { &vmstat_cpuup_callback, NULL, 0 }; | ||
| 698 | |||
| 699 | int __init setup_vmstat(void) | ||
| 700 | { | ||
| 701 | refresh_zone_stat_thresholds(); | ||
| 702 | register_cpu_notifier(&vmstat_notifier); | ||
| 703 | return 0; | ||
| 704 | } | ||
| 705 | module_init(setup_vmstat) | ||
| 706 | #endif | ||
