From caeab084deb61cd2d51cb8facc0e894a5b406aa4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 12 Mar 2008 23:57:49 -0700 Subject: slub page alloc fallback: Enable interrupts for GFP_WAIT. The fallback path needs to enable interrupts like done for the other page allocator calls. This was not necessary with the alternate fast path since we handled irq enable/disable in the slow path. The regular fastpath handles irq enable/disable around calls to the slow path so we need to restore the proper status before calling the page allocator from the slowpath. Signed-off-by: Christoph Lameter --- mm/slub.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 96d63eb3ab17..ca71d5b81e4a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1536,9 +1536,15 @@ new_slab: * That is only possible if certain conditions are met that are being * checked when a slab is created. */ - if (!(gfpflags & __GFP_NORETRY) && (s->flags & __PAGE_ALLOC_FALLBACK)) - return kmalloc_large(s->objsize, gfpflags); - + if (!(gfpflags & __GFP_NORETRY) && + (s->flags & __PAGE_ALLOC_FALLBACK)) { + if (gfpflags & __GFP_WAIT) + local_irq_enable(); + object = kmalloc_large(s->objsize, gfpflags); + if (gfpflags & __GFP_WAIT) + local_irq_disable(); + return object; + } return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) -- cgit v1.2.2 From 8a03feab32dceb78b9b1edf220e833d36d416b00 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Feb 2008 05:46:50 -0500 Subject: [PATCH] double dput() on failure exit in tiny-shmem Signed-off-by: Al Viro --- mm/tiny-shmem.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 702083638c16..a63ee939178b 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -89,6 +89,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) close_file: put_filp(file); + return ERR_PTR(error); + put_dentry: dput(dentry); 
put_memory: -- cgit v1.2.2 From 7682486b3ee06f800d5b11033371c7c5e92e3057 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:40 -0700 Subject: mm: fix various kernel-doc comments Fix various kernel-doc notation in mm/: filemap.c: add function short description; convert 2 to kernel-doc fremap.c: change parameter 'prot' to @prot pagewalk.c: change "-" in function parameters to ":" slab.c: fix short description of kmem_ptr_validate() swap.c: fix description & parameters of put_pages_list() swap_state.c: fix function parameters vmalloc.c: change "@returns" to "Returns:" since that is not a parameter Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 20 +++++++++++++++++--- mm/fremap.c | 2 +- mm/pagewalk.c | 10 +++++----- mm/slab.c | 5 ++--- mm/swap.c | 5 ++--- mm/swap_state.c | 2 ++ mm/vmalloc.c | 6 ++++-- 7 files changed, 33 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index df343d1e6345..07e9d9258b48 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -343,7 +343,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range); /** - * sync_page_range_nolock + * sync_page_range_nolock - write & wait on all pages in the passed range without locking * @inode: target inode * @mapping: target address_space * @pos: beginning offset in pages to write @@ -611,7 +611,10 @@ int __lock_page_killable(struct page *page) sync_page_killable, TASK_KILLABLE); } -/* +/** + * __lock_page_nosync - get a lock on the page, without calling sync_page() + * @page: the page to lock + * * Variant of lock_page that does not require the caller to hold a reference * on the page's mapping. 
*/ @@ -1538,9 +1541,20 @@ repeat: return page; } -/* +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. */ struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, diff --git a/mm/fremap.c b/mm/fremap.c index 69a37c2bdf81..07a9c82ce1a3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -113,7 +113,7 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, * mmap()/mremap() it does not create any new vmas. The new mappings are * also safe across swapout. * - * NOTE: the 'prot' parameter right now is ignored (but must be zero), + * NOTE: the @prot parameter right now is ignored (but must be zero), * and the vma's default protection is used. Arbitrary protections * might be implemented in the future. 
*/ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b4f27d22da91..1cf1417ef8b7 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -77,11 +77,11 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, /** * walk_page_range - walk a memory map's page tables with a callback - * @mm - memory map to walk - * @addr - starting address - * @end - ending address - * @walk - set of callbacks to invoke for each level of the tree - * @private - private data passed to the callback function + * @mm: memory map to walk + * @addr: starting address + * @end: ending address + * @walk: set of callbacks to invoke for each level of the tree + * @private: private data passed to the callback function * * Recursively walk the page table for the memory area in a VMA, * calling supplied callbacks. Callbacks are called in-order (first diff --git a/mm/slab.c b/mm/slab.c index e6c698f55674..bb4070e1079f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3624,12 +3624,11 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_ptr_validate - check if an untrusted pointer might - * be a slab entry. + * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. * @cachep: the cache we're checking against * @ptr: pointer to validate * - * This verifies that the untrusted pointer looks sane: + * This verifies that the untrusted pointer looks sane; * it is _not_ a guarantee that the pointer is actually * part of the slab cache in question, but it at least * validates that the pointer can be dereferenced and diff --git a/mm/swap.c b/mm/swap.c index d4ec59aa5c46..aa1139ccf3a7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -78,12 +78,11 @@ void put_page(struct page *page) EXPORT_SYMBOL(put_page); /** - * put_pages_list(): release a list of pages + * put_pages_list() - release a list of pages + * @pages: list of pages threaded on page->lru * * Release a list of pages which are strung together on page.lru. 
Currently * used by read_cache_pages() and related error recovery code. - * - * @pages: list of pages threaded on page->lru */ void put_pages_list(struct list_head *pages) { diff --git a/mm/swap_state.c b/mm/swap_state.c index ec42f01a8d02..50757ee3f9f3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -115,6 +115,7 @@ void __delete_from_swap_cache(struct page *page) /** * add_to_swap - allocate swap space for a page * @page: page we want to move to swap + * @gfp_mask: memory allocation flags * * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. @@ -315,6 +316,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags * @vma: user vma this address belongs to * @addr: target address for mempolicy * diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 950c0be9ca81..ecf91f8034bf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -757,7 +757,8 @@ finished: * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map - * @returns: 0 for success, -Exxx on failure + * + * Returns: 0 for success, -Exxx on failure * * This function checks that addr is a valid vmalloc'ed area, and * that it is big enough to cover the vma. Will return failure if @@ -829,7 +830,8 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) /** * alloc_vm_area - allocate a range of kernel address space * @size: size of the area - * @returns: NULL on failure, vm_struct on success + * + * Returns: NULL on failure, vm_struct on success * * This function reserves a range of kernel address space, and * allocates pagetables to map that range. 
No actual mappings -- cgit v1.2.2 From 46711810200c50e639ffc52e755b3dba9b4c82a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:41 -0700 Subject: mm/shmem and tiny-shmem: fix some kernel-doc Convert tiny-shmem.c function comments to kernel-doc. Add parameters and convert/fix other kernel-doc in shmem.c. Signed-off-by: Randy Dunlap Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 25 ++++++++++--------------- mm/tiny-shmem.c | 8 +++----- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 3372bc579e89..f514dd392cd9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -244,9 +244,8 @@ static void shmem_free_inode(struct super_block *sb) } } -/* +/** * shmem_recalc_inode - recalculate the size of an inode - * * @inode: inode to recalc * * We have to calculate the free blocks since the mm can drop @@ -270,9 +269,8 @@ static void shmem_recalc_inode(struct inode *inode) } } -/* +/** * shmem_swp_entry - find the swap vector position in the info structure - * * @info: info structure for the inode * @index: index of the page to find * @page: optional page to add to the structure. Has to be preset to @@ -374,13 +372,13 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns } } -/* +/** * shmem_swp_alloc - get the position of the swap entry for the page. - * If it does not exist allocate the entry. - * * @info: info structure for the inode * @index: index of the page to find * @sgp: check and recheck i_size? skip allocation? + * + * If the entry does not exist, allocate it. 
*/ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) { @@ -440,9 +438,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long return entry; } -/* +/** * shmem_free_swp - free some swap entries in a directory - * * @dir: pointer to the directory * @edir: pointer after last entry of the directory * @punch_lock: pointer to spinlock when needed for the holepunch case @@ -2022,7 +2019,7 @@ static const struct inode_operations shmem_symlink_inode_operations = { }; #ifdef CONFIG_TMPFS_POSIX_ACL -/** +/* * Superblocks without xattr inode operations will get security.* xattr * support from the VFS "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at @@ -2561,12 +2558,11 @@ out4: } module_init(init_tmpfs) -/* +/** * shmem_file_setup - get an unlinked file living in tmpfs - * * @name: name for dentry (to be seen in /proc/<pid>/maps * @size: size to be set for the file - * + * @flags: vm_flags */ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) { @@ -2621,9 +2617,8 @@ put_memory: return ERR_PTR(error); } -/* +/** * shmem_zero_setup - setup a shared anonymous mapping - * * @vma: the vma to be mmapped is prepared by do_mmap_pgoff */ int shmem_zero_setup(struct vm_area_struct *vma) diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 702083638c16..f0f55875dd6a 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -39,12 +39,11 @@ static int __init init_tmpfs(void) } module_init(init_tmpfs) -/* +/** * shmem_file_setup - get an unlinked file living in tmpfs - * * @name: name for dentry (to be seen in /proc/<pid>/maps * @size: size to be set for the file - * + * @flags: vm_flags */ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) { @@ -95,9 +94,8 @@ put_memory: return ERR_PTR(error); } -/* +/** * shmem_zero_setup - setup a shared anonymous mapping - * * @vma: the vma to be mmapped is prepared 
by do_mmap_pgoff */ int shmem_zero_setup(struct vm_area_struct *vma) -- cgit v1.2.2 From 1b578df02207a67a29e8ced4db3b36d89df52fef Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:42 -0700 Subject: mm/oom_kill: fix kernel-doc Fix kernel-doc notation in oom_kill.c. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 44b2da11bf43..f255eda693b0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -37,6 +37,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex); * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate * @uptime: current uptime in seconds + * @mem: target memory controller * * The formula used is relatively simple and documented inline in the * function. The main rationale is that we want to select a good task @@ -264,6 +265,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, } /** + * dump_tasks - dump current memory state of all system tasks + * @mem: target memory controller + * * Dumps the current memory state of all system tasks, excluding kernel threads. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj * score, and name. @@ -298,7 +302,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } while_each_thread(g, p); } -/** +/* * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO * set. 
@@ -504,6 +508,9 @@ void clear_zonelist_oom(struct zonelist *zonelist) /** * out_of_memory - kill the "best" process when we run out of memory + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) -- cgit v1.2.2 From 77f6078aa8945a18a7780694940e52be0322c2b8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:42 -0700 Subject: mm: highmem kernel-doc additions Add kernel-doc comments to highmem.c. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 35d47733cde4..7da4a7b6af11 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,8 +104,9 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } -/* Flush all unused kmap mappings in order to remove stray - mappings. */ +/** + * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings + */ void kmap_flush_unused(void) { spin_lock(&kmap_lock); @@ -163,6 +164,14 @@ start: return vaddr; } +/** + * kmap_high - map a highmem page into memory + * @page: &struct page to map + * + * Returns the page's virtual memory address. + * + * We cannot call this from interrupts, as it may block. + */ void *kmap_high(struct page *page) { unsigned long vaddr; @@ -170,8 +179,6 @@ void *kmap_high(struct page *page) /* * For highmem pages, we can't trust "virtual" until * after we have the lock. 
- * - * We cannot call this from interrupts, as it may block */ spin_lock(&kmap_lock); vaddr = (unsigned long)page_address(page); @@ -185,6 +192,10 @@ void *kmap_high(struct page *page) EXPORT_SYMBOL(kmap_high); +/** + * kunmap_high - unmap a highmem page from memory + * @page: &struct page to unmap + */ void kunmap_high(struct page *page) { unsigned long vaddr; @@ -259,6 +270,12 @@ static struct page_address_slot *page_slot(struct page *page) return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; } +/** + * page_address - get the mapped virtual address of a page + * @page: &struct page to get the virtual address of + * + * Returns the page's virtual address. + */ void *page_address(struct page *page) { unsigned long flags; @@ -288,6 +305,11 @@ done: EXPORT_SYMBOL(page_address); +/** + * set_page_address - set a page's virtual address + * @page: &struct page to set + * @virtual: virtual address to use + */ void set_page_address(struct page *page, void *virtual) { unsigned long flags; -- cgit v1.2.2 From 43d8eac44f28d384d2377dcdd1407f51f79dda55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:43 -0700 Subject: mm: rmap kernel-doc fixes Correct kernel-doc function names and parameters in rmap.c. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 0c9a2df06c39..997f06907b6d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -335,6 +335,7 @@ static int page_referenced_anon(struct page *page, /** * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. + * @mem_cont: target memory controller * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. 
This is done by following the page->mapping @@ -402,6 +403,7 @@ static int page_referenced_file(struct page *page, * page_referenced - test if the page was referenced * @page: the page to test * @is_locked: caller holds lock on the page + * @mem_cont: target memory controller * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. @@ -506,7 +508,7 @@ int page_mkclean(struct page *page) EXPORT_SYMBOL_GPL(page_mkclean); /** - * page_set_anon_rmap - setup new anonymous rmap + * __page_set_anon_rmap - setup new anonymous rmap * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped @@ -530,7 +532,7 @@ static void __page_set_anon_rmap(struct page *page, } /** - * page_set_anon_rmap - sanity check anonymous rmap addition + * __page_check_anon_rmap - sanity check anonymous rmap addition * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped @@ -583,7 +585,7 @@ void page_add_anon_rmap(struct page *page, } } -/* +/** * page_add_new_anon_rmap - add pte mapping to a new anonymous page * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added @@ -623,6 +625,8 @@ void page_add_file_rmap(struct page *page) /** * page_dup_rmap - duplicate pte mapping to a page * @page: the page to add the mapping to + * @vma: the vm area being duplicated + * @address: the user virtual address mapped * * For copy_page_range only: minimal extract from page_add_file_rmap / * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's @@ -642,6 +646,7 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from + * @vma: the vm area in which the mapping is removed * * The caller needs to hold the pte lock. 
*/ @@ -890,6 +895,7 @@ static int try_to_unmap_anon(struct page *page, int migration) /** * try_to_unmap_file - unmap file page using the object-based rmap method * @page: the page to unmap + * @migration: migration flag * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. @@ -986,6 +992,7 @@ out: /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped + * @migration: migration flag * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. -- cgit v1.2.2 From 52ea27eb4cd5f250f33638029a134ff03c5e6bbb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 19 Mar 2008 17:00:45 -0700 Subject: memcgroup: fix check for thread being a group leader in memcgroup The check t->tgid == t->pid is not the blessed way to check whether a task is a group leader. This is not only about code beauty, but also about pid namespace fixes - both the tgid and the pid fields on the task_struct are (slowly :( ) becoming deprecated. 
Besides, the thread_group_leader() macro makes only one dereference :) Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b9f6cae938e..9b648bd63451 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1079,7 +1079,7 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, * Only thread group leaders are allowed to migrate, the mm_struct is * in effect owned by the leader */ - if (p->tgid != p->pid) + if (!thread_group_leader(p)) goto out; css_get(&mem->css); -- cgit v1.2.2 From f7850d932fc69cb4bad83117f0bef1a658cce350 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:01:02 -0700 Subject: mm/readahead: fix kernel-doc notation Fix kernel-doc notation in mm/readahead.c. Change ":" to ";" so that it doesn't get treated as a doc section heading. Move the comment block ending "*/" to a line by itself so that the text on that last line is not lost (dropped). Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index c9c50ca1ec38..8762e8988972 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -443,9 +443,10 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * pagecache pages * * page_cache_async_ondemand() should be called when a page is used which - * has the PG_readahead flag: this is a marker to suggest that the application + * has the PG_readahead flag; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in - * more pages. */ + * more pages. 
+ */ void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, -- cgit v1.2.2 From 5a982cbc7b3fe6cf72266f319286f29963c71b9e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 24 Mar 2008 12:29:45 -0700 Subject: mm: fix boundary checking in free_bootmem_core With NUMA enabled, some callers could have a range of memory on one node but try to free it on another node. This can cause some pages to be freed wrongly. For example: when we try to allocate 128g boot ram early for gart/swiotlb, and free that range later so gart/swiotlb can get some range afterwards. With this patch, we don't need to care which node holds the range, just loop to call free_bootmem_node for all online nodes. This patch makes free_bootmem_core() more robust by trimming the sidx and eidx according to the RAM range that the node has, and makes free_bootmem_core() handle the out-of-range case. We could use bdata_list to make sure the range can be freed for sure. So next time, we don't need to loop online nodes and could use free_bootmem directly. 
Signed-off-by: Yinghai Lu Cc: Andi Kleen Cc: Yasunori Goto Cc: KAMEZAWA Hiroyuki Acked-by: Ingo Molnar Tested-by: Ingo Molnar Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index f6ff4337b424..2ccea700968f 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -125,6 +125,7 @@ static int __init reserve_bootmem_core(bootmem_data_t *bdata, BUG_ON(!size); BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); + BUG_ON(addr < bdata->node_boot_start); sidx = PFN_DOWN(addr - bdata->node_boot_start); eidx = PFN_UP(addr + size - bdata->node_boot_start); @@ -156,21 +157,31 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long sidx, eidx; unsigned long i; + BUG_ON(!size); + + /* out range */ + if (addr + size < bdata->node_boot_start || + PFN_DOWN(addr) > bdata->node_low_pfn) + return; /* * round down end of usable mem, partially free pages are * considered reserved. */ - BUG_ON(!size); - BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn); - if (addr < bdata->last_success) + if (addr >= bdata->node_boot_start && addr < bdata->last_success) bdata->last_success = addr; /* - * Round up the beginning of the address. + * Round up to index to the range. 
*/ - sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); + if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start)) + sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); + else + sidx = 0; + eidx = PFN_DOWN(addr + size - bdata->node_boot_start); + if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) + eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); for (i = sidx; i < eidx; i++) { if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) @@ -421,7 +432,9 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, void __init free_bootmem(unsigned long addr, unsigned long size) { - free_bootmem_core(NODE_DATA(0)->bdata, addr, size); + bootmem_data_t *bdata; + list_for_each_entry(bdata, &bdata_list, list) + free_bootmem_core(bdata, addr, size); } unsigned long __init free_all_bootmem(void) -- cgit v1.2.2 From 4dd4b920218326231156c7991ce5b94afad841c3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 24 Mar 2008 12:29:52 -0700 Subject: revert "kswapd should only wait on IO if there is IO" Revert commit f1a9ee758de7de1e040de849fdef46e6802ea117: Author: Rik van Riel Date: Thu Feb 7 00:14:08 2008 -0800 kswapd should only wait on IO if there is IO The current kswapd (and try_to_free_pages) code has an oddity where the code will wait on IO, even if there is no IO in flight. This problem is notable especially when the system scans through many unfreeable pages, causing unnecessary stalls in the VM. Additionally, tasks without __GFP_FS or __GFP_IO in the direct reclaim path will sleep if a significant number of pages are encountered that should be written out. This gives kswapd a chance to write out those pages, while the direct reclaim task sleeps. Signed-off-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Because of large latencies and interactivity problems reported by Carlos, here: http://lkml.org/lkml/2008/3/22/211 Cc: Rik van Riel Cc: "Carlos R. 
Mafra" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 45711585684e..4046434046e6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -70,13 +70,6 @@ struct scan_control { int order; - /* - * Pages that have (or should have) IO pending. If we run into - * a lot of these, we're better off waiting a little for IO to - * finish rather than scanning more pages in the VM. - */ - int nr_io_pages; - /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -512,10 +505,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) wait_on_page_writeback(page); - else { - sc->nr_io_pages++; + else goto keep_locked; - } } referenced = page_referenced(page, 1, sc->mem_cgroup); @@ -554,10 +545,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) goto keep_locked; - if (!may_enter_fs) { - sc->nr_io_pages++; + if (!may_enter_fs) goto keep_locked; - } if (!sc->may_writepage) goto keep_locked; @@ -568,10 +557,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) { - sc->nr_io_pages++; + if (PageWriteback(page) || PageDirty(page)) goto keep; - } /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. 
@@ -1344,7 +1331,6 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; - sc->nr_io_pages = 0; if (!priority) disable_swap_token(); nr_reclaimed += shrink_zones(priority, zones, sc); @@ -1379,8 +1365,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, } /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2 && - sc->nr_io_pages > sc->swap_cluster_max) + if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); } /* top priority shrink_caches still had more to do? don't OOM, then */ @@ -1514,7 +1499,6 @@ loop_again: if (!priority) disable_swap_token(); - sc.nr_io_pages = 0; all_zones_ok = 1; /* @@ -1607,8 +1591,7 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2 && - sc.nr_io_pages > sc.swap_cluster_max) + if (total_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); /* -- cgit v1.2.2 From 53625b4204753b904addd40ca96d9ba802e6977d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 19 Mar 2008 13:42:07 -0700 Subject: count_partial() is not used if !SLUB_DEBUG and !CONFIG_SLABINFO Avoid warnings about unused functions if neither SLUB_DEBUG nor CONFIG_SLABINFO is defined. This patch will be reversed when slab defrag is merged since slab defrag requires count_partial() to determine the fragmentation status of slab caches. 
Signed-off-by: Christoph Lameter --- mm/slub.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ca71d5b81e4a..b72bc98e2dc1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2685,6 +2685,7 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); +#if defined(SLUB_DEBUG) || defined(CONFIG_SLABINFO) static unsigned long count_partial(struct kmem_cache_node *n) { unsigned long flags; @@ -2697,6 +2698,7 @@ static unsigned long count_partial(struct kmem_cache_node *n) spin_unlock_irqrestore(&n->list_lock, flags); return x; } +#endif /* * kmem_cache_shrink removes empty slabs from the partial lists and sorts -- cgit v1.2.2 From ec1f5eeeb5a79a0d48036de649a3498da42db565 Mon Sep 17 00:00:00 2001 From: Daniel Yeisley Date: Tue, 25 Mar 2008 23:59:08 +0200 Subject: slab: fix cache_cache bootstrap in kmem_cache_init() Commit 556a169dab38b5100df6f4a45b655dddd3db94c1 ("slab: fix bootstrap on memoryless node") introduced bootstrap-time cache_cache list3s for all nodes but forgot that initkmem_list3 needs to be accessed by [somevalue + node]. This patch fixes list_add() corruption in mm/slab.c seen on the ES7000. 
Cc: Mel Gorman Cc: Olaf Hering Cc: Christoph Lameter Signed-off-by: Dan Yeisley Signed-off-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index bb4070e1079f..04b308c3bc54 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1481,7 +1481,7 @@ void __init kmem_cache_init(void) list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE]; + cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* * struct kmem_cache size depends on nr_node_ids, which @@ -1602,7 +1602,7 @@ void __init kmem_cache_init(void) int nid; for_each_online_node(nid) { - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], nid); + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); init_list(malloc_sizes[INDEX_AC].cs_cachep, &initkmem_list3[SIZE_AC + nid], nid); -- cgit v1.2.2 From a1de09195b294c6a4c5dec8c8defd0a2688d3f75 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 26 Mar 2008 14:37:53 -0700 Subject: hugetlb: indicate surplus huge page counts in per-node meminfo Currently we show the surplus hugetlb pool state in /proc/meminfo, but not in the per-node meminfo files, even though we track the information on a per-node basis. Printing it there can help track down dynamic pool bugs including the one in the follow-on patch. 
Signed-off-by: Nishanth Aravamudan Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 74c1b6b0b37b..40d841cb5126 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -671,9 +671,11 @@ int hugetlb_report_node_meminfo(int nid, char *buf) { return sprintf(buf, "Node %d HugePages_Total: %5u\n" - "Node %d HugePages_Free: %5u\n", + "Node %d HugePages_Free: %5u\n" + "Node %d HugePages_Surp: %5u\n", nid, nr_huge_pages_node[nid], - nid, free_huge_pages_node[nid]); + nid, free_huge_pages_node[nid], + nid, surplus_huge_pages_node[nid]); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ -- cgit v1.2.2 From 11320d17ce4ecf8002dc8f9b6f1e49cd18e45a94 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 26 Mar 2008 14:40:20 -0700 Subject: hugetlb: fix potential livelock in return_unused_surplus_hugepages() Running the counters testcase from libhugetlbfs results in the following on 2.6.25-rc5 and 2.6.25-rc5-mm1: BUG: soft lockup - CPU#3 stuck for 61s! 
[counters:10531] NIP: c0000000000d1f3c LR: c0000000000d1f2c CTR: c0000000001b5088 REGS: c000005db12cb360 TRAP: 0901 Not tainted (2.6.25-rc5-autokern1) MSR: 8000000000009032 CR: 48008448 XER: 20000000 TASK = c000005dbf3d6000[10531] 'counters' THREAD: c000005db12c8000 CPU: 3 GPR00: 0000000000000004 c000005db12cb5e0 c000000000879228 0000000000000004 GPR04: 0000000000000010 0000000000000000 0000000000200200 0000000000100100 GPR08: c0000000008aba10 000000000000ffff 0000000000000004 0000000000000000 GPR12: 0000000028000442 c000000000770080 NIP [c0000000000d1f3c] .return_unused_surplus_pages+0x84/0x18c LR [c0000000000d1f2c] .return_unused_surplus_pages+0x74/0x18c Call Trace: [c000005db12cb5e0] [c000005db12cb670] 0xc000005db12cb670 (unreliable) [c000005db12cb670] [c0000000000d24c4] .hugetlb_acct_memory+0x2e0/0x354 [c000005db12cb740] [c0000000001b5048] .truncate_hugepages+0x1d4/0x214 [c000005db12cb890] [c0000000001b50a4] .hugetlbfs_delete_inode+0x1c/0x3c [c000005db12cb920] [c000000000103fd8] .generic_delete_inode+0xf8/0x1c0 [c000005db12cb9b0] [c0000000001b5100] .hugetlbfs_drop_inode+0x3c/0x24c [c000005db12cba50] [c00000000010287c] .iput+0xdc/0xf8 [c000005db12cbad0] [c0000000000fee54] .dentry_iput+0x12c/0x194 [c000005db12cbb60] [c0000000000ff050] .d_kill+0x6c/0xa4 [c000005db12cbbf0] [c0000000000ffb74] .dput+0x18c/0x1b0 [c000005db12cbc70] [c0000000000e9e98] .__fput+0x1a4/0x1e8 [c000005db12cbd10] [c0000000000e61ec] .filp_close+0xb8/0xe0 [c000005db12cbda0] [c0000000000e62d0] .sys_close+0xbc/0x134 [c000005db12cbe30] [c00000000000872c] syscall_exit+0x0/0x40 Instruction dump: ebbe8038 38800010 e8bf0002 3bbd0008 7fa3eb78 38a50001 7ca507b4 4818df25 60000000 38800010 38a00000 7c601b78 <7fa3eb78> 2f800010 409d0008 38000010 This was tracked down to a potential livelock in return_unused_surplus_hugepages(). In the case where we have surplus pages on some node, but no free pages on the same node, we may never break out of the loop. 
To avoid this livelock, terminate the search if we iterate a number of times equal to the number of online nodes without freeing a page. Thanks to Andy Whitcroft and Adam Litke for helping with debugging and the patch. Signed-off-by: Nishanth Aravamudan Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 40d841cb5126..51c9e2c01640 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -401,12 +401,20 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) struct page *page; unsigned long nr_pages; + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes. Iterate across all nodes until we + * can no longer free unreserved surplus pages. This occurs when + * the nodes with surplus pages have no free pages. + */ + unsigned long remaining_iterations = num_online_nodes(); + /* Uncommit the reservation */ resv_huge_pages -= unused_resv_pages; nr_pages = min(unused_resv_pages, surplus_huge_pages); - while (nr_pages) { + while (remaining_iterations-- && nr_pages) { nid = next_node(nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); @@ -424,6 +432,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) surplus_huge_pages--; surplus_huge_pages_node[nid]--; nr_pages--; + remaining_iterations = num_online_nodes(); } } } -- cgit v1.2.2 From e72e9c23ee025a4c063ca112ba0a6059f9ecc9b7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 27 Mar 2008 20:56:33 -0700 Subject: Revert "SLUB: remove useless masking of GFP_ZERO" This reverts commit 3811dbf67162bd08412f1b0e02e554f353e93bdb. The masking was not at all useless, and it was sensible. We handle GFP_ZERO in the caller, and passing it down to any page allocator logic is buggy and wrong. 
Signed-off-by: Linus Torvalds --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index b72bc98e2dc1..84ed734b96b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1470,6 +1470,9 @@ static void *__slab_alloc(struct kmem_cache *s, void **object; struct page *new; + /* We handle __GFP_ZERO in the caller */ + gfpflags &= ~__GFP_ZERO; + if (!c->page) goto new_slab; -- cgit v1.2.2 From 9dce07f1a441b77a15631cf0ed0238e0baa7ed64 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 29 Mar 2008 03:07:28 +0000 Subject: NULL noise: fs/*, mm/*, kernel/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- mm/sparse-vmemmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index cd75b21dd4c3..99c4f36eb8a3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -76,7 +76,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t entry; void *p = vmemmap_alloc_block(PAGE_SIZE, node); if (!p) - return 0; + return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); set_pte_at(&init_mm, addr, pte, entry); } @@ -89,7 +89,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) if (pmd_none(*pmd)) { void *p = vmemmap_alloc_block(PAGE_SIZE, node); if (!p) - return 0; + return NULL; pmd_populate_kernel(&init_mm, pmd, p); } return pmd; @@ -101,7 +101,7 @@ pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) if (pud_none(*pud)) { void *p = vmemmap_alloc_block(PAGE_SIZE, node); if (!p) - return 0; + return NULL; pud_populate(&init_mm, pud, p); } return pud; @@ -113,7 +113,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) if (pgd_none(*pgd)) { void *p = vmemmap_alloc_block(PAGE_SIZE, node); if (!p) - return 0; + return NULL; pgd_populate(&init_mm, pgd, p); } return pgd; -- cgit v1.2.2 From 00460dd5f4b886f72699f2245206c935f9fd4b82 Mon 
Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 1 Apr 2008 12:07:41 -0700 Subject: Fix undefined count_partial if !CONFIG_SLABINFO Fix a small typo in the patch recently merged to avoid the unused symbol message for count_partial(). Discussion thread with confirmation of the fix at http://marc.info/?t=120696854400001&r=1&w=2 The typo is in the check for whether we need the count_partial function, which was introduced by 53625b4204753b904addd40ca96d9ba802e6977d Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 84ed734b96b3..acc975fcc8cc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2688,7 +2688,7 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); -#if defined(SLUB_DEBUG) || defined(CONFIG_SLABINFO) +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLABINFO) static unsigned long count_partial(struct kmem_cache_node *n) { unsigned long flags; -- cgit v1.2.2