From 00e4d116a7ef94eb910be037912b0b2fc09f608b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Fri, 22 Sep 2006 09:33:58 +0100 Subject: [DCCP]: Allow default/fallback service code. This has been discussed on dccp@vger and removes the necessity for applications to supply service codes in each and every case. If an application does not want to provide a service code, that's fine, it will be given 0. Otherwise, service codes can be set via socket options as before. This patch has been tested using various client/server configurations (including listening on multiple service codes). Signed-off-by: Gerrit Renker Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/dccp.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 2d7671c92c0b..29f9291e45c0 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -404,6 +404,7 @@ struct dccp_service_list { }; #define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) +#define DCCP_SERVICE_CODE_IS_ABSENT 0 static inline int dccp_list_has_service(const struct dccp_service_list *sl, const __be32 service) @@ -484,11 +485,6 @@ static inline struct dccp_minisock *dccp_msk(const struct sock *sk) return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; } -static inline int dccp_service_not_initialized(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_service == DCCP_SERVICE_INVALID_VALUE; -} - static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { -- cgit v1.2.2 From b83eff641ed39bd631535b9a8971e161b056f541 Mon Sep 17 00:00:00 2001 From: Ian McDonald Date: Fri, 22 Sep 2006 14:25:36 +1200 Subject: [DCCP]: Introduce constants for CCID numbers Signed-off-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/dccp.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 29f9291e45c0..d6f4ec467a4b 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -169,6 +169,12 @@ enum { DCCPO_MAX_CCID_SPECIFIC = 255, }; +/* DCCP CCIDS */ +enum { + DCCPC_CCID2 = 2, + DCCPC_CCID3 = 3, +}; + /* DCCP features */ enum { DCCPF_RESERVED = 0, @@ -320,7 +326,7 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) /* initial values for each feature */ #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 #define DCCPF_INITIAL_ACK_RATIO 2 -#define DCCPF_INITIAL_CCID 2 +#define DCCPF_INITIAL_CCID DCCPC_CCID2 #define DCCPF_INITIAL_SEND_ACK_VECTOR 1 /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 -- cgit v1.2.2 From 0d6c7ae22d4fadbc83677ea1a6144eea8911e319 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Sun, 24 Sep 2006 19:28:47 -0700 Subject: [NETFILTER]: Add dscp,DSCP headers to header-y This patch adds xt_dscp.h and xt_DSCP.h to the kernel headers which are exported via 'make headers_install'. These are necessary for userspace to add rules using dscp match and DSCP target. Signed-off-by: Yasuyuki Kozakai Signed-off-by: David S. Miller --- include/linux/netfilter/Kbuild | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 9a285cecf249..312bd2ffee33 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -10,6 +10,8 @@ header-y += xt_connmark.h header-y += xt_CONNMARK.h header-y += xt_conntrack.h header-y += xt_dccp.h +header-y += xt_dscp.h +header-y += xt_DSCP.h header-y += xt_esp.h header-y += xt_helper.h header-y += xt_length.h -- cgit v1.2.2 From 632bbfeee4f042c05bc65150b4433a297d3fe387 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Mon, 25 Sep 2006 23:30:53 -0700 Subject: [PATCH] trigger a syntax error if percpu macros are incorrectly used get_cpu_var()/per_cpu()/__get_cpu_var() arguments must be simple identifiers. Otherwise the arch dependent implementations might break. This patch enforces the correct usage of the macros by producing a syntax error if the variable is not a simple identifier. Signed-off-by: Jan Blunck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index cb9039a21f2a..f926490a7d8b 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -11,8 +11,14 @@ #define PERCPU_ENOUGH_ROOM 32768 #endif -/* Must be an lvalue. */ -#define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); })) +/* + * Must be an lvalue. Since @var must be a simple identifier, + * we force a syntax error here if it isn't. + */ +#define get_cpu_var(var) (*({ \ + extern int simple_indentifier_##var(void); \ + preempt_disable(); \ + &__get_cpu_var(var); })) #define put_cpu_var(var) preempt_enable() #ifdef CONFIG_SMP -- cgit v1.2.2 From a6ca1b99ed434f3fb41bbed647ed36c0420501e5 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Mon, 25 Sep 2006 23:30:55 -0700 Subject: [PATCH] update to the kernel kmap/kunmap API Give non-highmem architectures access to the kmap API for the purposes of overriding (this is what the attached patch does). The proposal is that we should now require all architectures with coherence issues to manage data coherence via the kmap/kunmap API. Thus driver writers never have to write code like kmap(page) modify data in page flush_kernel_dcache_page(page) kunmap(page) instead, kmap/kunmap will manage the coherence and driver (and filesystem) writers don't need to worry about how to flush between kmap and kunmap. For most architectures, the page only needs to be flushed if it was actually written to *and* there are user mappings of it, so the best implementation looks to be: clear the page dirty pte bit in the kernel page tables on kmap and on kunmap, check page->mappings for user maps, and then the dirty bit, and only flush if it both has user mappings and is dirty. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 85ce7ef9a512..42620e723abb 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -29,6 +29,7 @@ unsigned int nr_free_highpages(void); static inline unsigned int nr_free_highpages(void) { return 0; } +#ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) { might_sleep(); @@ -41,6 +42,7 @@ static inline void *kmap(struct page *page) #define kunmap_atomic(addr, idx) do { } while (0) #define kmap_atomic_pfn(pfn, idx) page_address(pfn_to_page(pfn)) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) +#endif #endif /* CONFIG_HIGHMEM */ -- cgit v1.2.2 From 725d704ecaca4a43f067092c140d4f3271cf2856 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:30:55 -0700 Subject: [PATCH] mm: VM_BUG_ON Introduce a VM_BUG_ON, which is turned on with CONFIG_DEBUG_VM. Use this in the lightweight, inline refcounting functions; PageLRU and PageActive checks in vmscan, because they're pretty well confined to vmscan. And in page allocate/free fastpaths which can be the hottest parts of the kernel for kbuilds. Unlike BUG_ON, VM_BUG_ON must not be used to execute statements with side-effects, and should not be used outside core mm code. Signed-off-by: Nick Piggin Cc: Hugh Dickins Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 224178a000d2..7d20b25c58fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -278,6 +278,12 @@ struct page { */ #include +#ifdef CONFIG_DEBUG_VM +#define VM_BUG_ON(cond) BUG_ON(cond) +#else +#define VM_BUG_ON(condition) do { } while(0) +#endif + /* * Methods to modify the page usage count. * @@ -297,7 +303,7 @@ struct page { */ static inline int put_page_testzero(struct page *page) { - BUG_ON(atomic_read(&page->_count) == 0); + VM_BUG_ON(atomic_read(&page->_count) == 0); return atomic_dec_and_test(&page->_count); } @@ -307,6 +313,7 @@ static inline int put_page_testzero(struct page *page) */ static inline int get_page_unless_zero(struct page *page) { + VM_BUG_ON(PageCompound(page)); return atomic_inc_not_zero(&page->_count); } @@ -323,6 +330,7 @@ static inline void get_page(struct page *page) { if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); + VM_BUG_ON(atomic_read(&page->_count) == 0); atomic_inc(&page->_count); } -- cgit v1.2.2 From d08b3851da41d0ee60851f2c75b118e1f7a5fc89 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Sep 2006 23:30:57 -0700 Subject: [PATCH] mm: tracking shared dirty pages Tracking of dirty pages in shared writeable mmap()s. The idea is simple: write protect clean shared writeable pages, catch the write-fault, make writeable and set dirty. On page write-back clean all the PTE dirty bits and write protect them once again. The implementation is a tad harder, mainly because the default backing_dev_info capabilities were too loosely maintained. Hence it is not enough to test the backing_dev_info for cap_account_dirty. The current heuristic is as follows, a VMA is eligible when: - its shared writeable (vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED) - it is not a 'special' mapping (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) == 0 - the backing_dev_info is cap_account_dirty mapping_cap_account_dirty(vma->vm_file->f_mapping) - f_op->mmap() didn't change the default page protection Page from remap_pfn_range() are explicitly excluded because their COW semantics are already horrid enough (see vm_normal_page() in do_wp_page()) and because they don't have a backing store anyway. mprotect() is taught about the new behaviour as well. However it overrides the last condition. Cleaning the pages on write-back is done with page_mkclean() a new rmap call. It can be called on any page, but is currently only implemented for mapped pages, if the page is found the be of a VMA that accounts dirty pages it will also wrprotect the PTE. Finally, in fs/buffers.c:try_to_free_buffers(); remove clear_page_dirty() from under ->private_lock. This seems to be safe, since ->private_lock is used to serialize access to the buffers, not the page itself. This is needed because clear_page_dirty() will call into page_mkclean() and would thereby violate locking order. [dhowells@redhat.com: Provide a page_mkclean() implementation for NOMMU] Signed-off-by: Peter Zijlstra Cc: Hugh Dickins Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/rmap.h | 14 ++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d20b25c58fc..449841413cf1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -15,6 +15,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -810,6 +811,39 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); +/* + * Some shared mappigns will want the pages marked read-only + * to track write events. If so, we'll downgrade vm_page_prot + * to the private version (using protection_map[] without the + * VM_SHARED bit). + */ +static inline int vma_wants_writenotify(struct vm_area_struct *vma) +{ + unsigned int vm_flags = vma->vm_flags; + + /* If it was private or non-writable, the write bit is already clear */ + if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + return 0; + + /* The backer wishes to know when pages are first written to? */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) + return 1; + + /* The open routine did something to the protections already? */ + if (pgprot_val(vma->vm_page_prot) != + pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)])) + return 0; + + /* Specialty mapping? */ + if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + return 0; + + /* Can the mapping track the dirty pages? */ + return vma->vm_file && vma->vm_file->f_mapping && + mapping_cap_account_dirty(vma->vm_file->f_mapping); +} + extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)); int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf97b0900014..db2c1df4fef9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -103,6 +103,14 @@ pte_t *page_check_address(struct page *, struct mm_struct *, */ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); +/* + * Cleans the PTEs of shared mappings. + * (and since clean PTEs should also be readonly, write protects them too) + * + * returns the number of cleaned PTEs. + */ +int page_mkclean(struct page *); + #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) @@ -112,6 +120,12 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); #define page_referenced(page,l) TestClearPageReferenced(page) #define try_to_unmap(page, refs) SWAP_FAIL +static inline int page_mkclean(struct page *page) +{ + return 0; +} + + #endif /* CONFIG_MMU */ /* -- cgit v1.2.2 From edc79b2a46ed854595e40edcf3f8b37f9f14aa3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Sep 2006 23:30:58 -0700 Subject: [PATCH] mm: balance dirty pages Now that we can detect writers of shared mappings, throttle them. Avoids OOM by surprise. Signed-off-by: Peter Zijlstra Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0422036af4eb..56a23a0e7f2e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -116,6 +116,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, loff_t pos, loff_t count); int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, loff_t pos, loff_t count); +void set_page_dirty_balance(struct page *page); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl -- cgit v1.2.2 From b221385bc41d6789edde3d2fa0cb20d5045730eb Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 25 Sep 2006 23:31:02 -0700 Subject: [PATCH] mm/: make functions static This patch makes the following needlessly global functions static: - slab.c: kmem_find_general_cachep() - swap.c: __page_cache_release() - vmalloc.c: __vmalloc_node() Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- include/linux/slab.h | 2 -- include/linux/vmalloc.h | 2 -- 3 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 449841413cf1..45678b036955 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -318,8 +318,6 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } -extern void FASTCALL(__page_cache_release(struct page *)); - static inline int page_count(struct page *page) { if (unlikely(PageCompound(page))) diff --git a/include/linux/slab.h b/include/linux/slab.h index 45ad55b70d1c..193c03c547ec 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -67,7 +67,6 @@ extern void *kmem_cache_zalloc(struct kmem_cache *, gfp_t); extern void kmem_cache_free(kmem_cache_t *, void *); extern unsigned int kmem_cache_size(kmem_cache_t *); extern const char *kmem_cache_name(kmem_cache_t *); -extern kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags); /* Size description struct for general caches. */ struct cache_sizes { @@ -223,7 +222,6 @@ extern int FASTCALL(kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)); /* SLOB allocator routines */ void kmem_cache_init(void); -struct kmem_cache *kmem_find_general_cachep(size_t, gfp_t gfpflags); struct kmem_cache *kmem_cache_create(const char *c, size_t, size_t, unsigned long, void (*)(void *, struct kmem_cache *, unsigned long), diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 71b6363caaaf..dee88c6b6fa7 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -44,8 +44,6 @@ extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, - pgprot_t prot, int node); extern void vfree(void *addr); extern void *vmap(struct page **pages, unsigned int count, -- cgit v1.2.2 From 2d1a07d487d8b36658404839cdf03a974968cefd Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 25 Sep 2006 23:31:03 -0700 Subject: [PATCH] bootmem: remove useless __init in header file __init in headers is pretty useless because the compiler doesn't check it, and they get out of sync relatively frequently. So if you see an __init in a header file, it's quite unreliable and you need to check the definition anyway. Signed-off-by: Franck Bui-Huu Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index e319c649e4fd..c7124d4ce7cf 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -41,23 +41,23 @@ typedef struct bootmem_data { struct list_head list; } bootmem_data_t; -extern unsigned long __init bootmem_bootmap_pages (unsigned long); -extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend); -extern void __init free_bootmem (unsigned long addr, unsigned long size); -extern void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal); -extern void * __init __alloc_bootmem_nopanic (unsigned long size, unsigned long align, unsigned long goal); -extern void * __init __alloc_bootmem_low(unsigned long size, +extern unsigned long bootmem_bootmap_pages (unsigned long); +extern unsigned long init_bootmem (unsigned long addr, unsigned long memend); +extern void free_bootmem (unsigned long addr, unsigned long size); +extern void * __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal); +extern void * __alloc_bootmem_nopanic (unsigned long size, unsigned long align, unsigned long goal); +extern void * __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); -extern void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, +extern void * __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -extern void * __init __alloc_bootmem_core(struct bootmem_data *bdata, +extern void * __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -extern void __init reserve_bootmem (unsigned long addr, unsigned long size); +extern void reserve_bootmem (unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ __alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low(x) \ @@ -67,12 +67,12 @@ extern void __init reserve_bootmem (unsigned long addr, unsigned long size); #define alloc_bootmem_low_pages(x) \ __alloc_bootmem_low((x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ -extern unsigned long __init free_all_bootmem (void); -extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -extern unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn); -extern void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size); -extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size); -extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat); +extern unsigned long free_all_bootmem (void); +extern void * __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +extern unsigned long init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn); +extern void reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size); +extern void free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size); +extern unsigned long free_all_bootmem_node (pg_data_t *pgdat); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) @@ -94,14 +94,14 @@ static inline void *alloc_remap(int nid, unsigned long size) extern unsigned long __meminitdata nr_kernel_pages; extern unsigned long nr_all_pages; -extern void *__init alloc_large_system_hash(const char *tablename, - unsigned long bucketsize, - unsigned long numentries, - int scale, - int flags, - unsigned int *_hash_shift, - unsigned int *_hash_mask, - unsigned long limit); +extern void * alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long limit); #define HASH_HIGHMEM 0x00000001 /* Consider highmem? */ #define HASH_EARLY 0x00000002 /* Allocating during early boot? */ -- cgit v1.2.2 From 71fb2e8f8753b66b1f4295aa264a2eb4e69381e8 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 25 Sep 2006 23:31:05 -0700 Subject: [PATCH] bootmem: remove useless parentheses in bootmem header file Signed-off-by: Franck Bui-Huu Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index c7124d4ce7cf..fa2523f47628 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -59,13 +59,13 @@ extern void * __alloc_bootmem_core(struct bootmem_data *bdata, #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem (unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ - __alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low(x) \ - __alloc_bootmem_low((x), SMP_CACHE_BYTES, 0) + __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ - __alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_low((x), PAGE_SIZE, 0) + __alloc_bootmem_low(x, PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern unsigned long free_all_bootmem (void); extern void * __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); @@ -75,11 +75,11 @@ extern void free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned lo extern unsigned long free_all_bootmem_node (pg_data_t *pgdat); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem_node(pgdat, x) \ - __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(pgdat, x) \ - __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages_node(pgdat, x) \ - __alloc_bootmem_low_node((pgdat), (x), PAGE_SIZE, 0) + __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP -- cgit v1.2.2 From bb0923a66820718f636736b22ce47372f79e3400 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 25 Sep 2006 23:31:05 -0700 Subject: [PATCH] bootmem: limit to 80 columns width Signed-off-by: Franck Bui-Huu Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index fa2523f47628..1e8dddfb3083 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -44,18 +44,24 @@ typedef struct bootmem_data { extern unsigned long bootmem_bootmap_pages (unsigned long); extern unsigned long init_bootmem (unsigned long addr, unsigned long memend); extern void free_bootmem (unsigned long addr, unsigned long size); -extern void * __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal); -extern void * __alloc_bootmem_nopanic (unsigned long size, unsigned long align, unsigned long goal); +extern void * __alloc_bootmem (unsigned long size, + unsigned long align, + unsigned long goal); +extern void * __alloc_bootmem_nopanic (unsigned long size, + unsigned long align, + unsigned long goal); extern void * __alloc_bootmem_low(unsigned long size, - unsigned long align, - unsigned long goal); + unsigned long align, + unsigned long goal); extern void * __alloc_bootmem_low_node(pg_data_t *pgdat, - unsigned long size, - unsigned long align, - unsigned long goal); + unsigned long size, + unsigned long align, + unsigned long goal); extern void * __alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem (unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ @@ -68,10 +74,20 @@ extern void reserve_bootmem (unsigned long addr, unsigned long size); __alloc_bootmem_low(x, PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern unsigned long free_all_bootmem (void); -extern void * __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -extern unsigned long init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn); -extern void reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size); -extern void free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size); +extern void * __alloc_bootmem_node (pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal); +extern unsigned long init_bootmem_node (pg_data_t *pgdat, + unsigned long freepfn, + unsigned long startpfn, + unsigned long endpfn); +extern void reserve_bootmem_node (pg_data_t *pgdat, + unsigned long physaddr, + unsigned long size); +extern void free_bootmem_node (pg_data_t *pgdat, + unsigned long addr, + unsigned long size); extern unsigned long free_all_bootmem_node (pg_data_t *pgdat); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem_node(pgdat, x) \ -- cgit v1.2.2 From e786e86a542ccc1133f333402526ad00b9c088ae Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 25 Sep 2006 23:31:06 -0700 Subject: [PATCH] bootmem: remove useless headers inclusions Signed-off-by: Franck Bui-Huu Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 1e8dddfb3083..b2067e4a4486 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -4,11 +4,8 @@ #ifndef _LINUX_BOOTMEM_H #define _LINUX_BOOTMEM_H -#include -#include -#include -#include #include +#include /* * simple boot-time physical memory area allocator. -- cgit v1.2.2 From f71bf0cac730ccb5ebcdf21747db75ae0445ccde Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 25 Sep 2006 23:31:08 -0700 Subject: [PATCH] bootmem: miscellaneous coding style fixes It fixes various coding style issues, specially when spaces are useless. For example '*' go next to the function name. Signed-off-by: Franck Bui-Huu Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 95 +++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index b2067e4a4486..31e9abb6d977 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -38,29 +38,30 @@ typedef struct bootmem_data { struct list_head list; } bootmem_data_t; -extern unsigned long bootmem_bootmap_pages (unsigned long); -extern unsigned long init_bootmem (unsigned long addr, unsigned long memend); -extern void free_bootmem (unsigned long addr, unsigned long size); -extern void * __alloc_bootmem (unsigned long size, - unsigned long align, - unsigned long goal); -extern void * __alloc_bootmem_nopanic (unsigned long size, - unsigned long align, - unsigned long goal); -extern void * __alloc_bootmem_low(unsigned long size, +extern unsigned long bootmem_bootmap_pages(unsigned long); +extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); +extern void free_bootmem(unsigned long addr, unsigned long size); +extern void *__alloc_bootmem(unsigned long size, + unsigned long align, + unsigned long goal); +extern void *__alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); +extern void *__alloc_bootmem_low(unsigned long size, + unsigned long align, + unsigned long goal); +extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal); +extern void *__alloc_bootmem_core(struct bootmem_data *bdata, + unsigned long size, unsigned long align, - unsigned long goal); -extern void * __alloc_bootmem_low_node(pg_data_t *pgdat, - unsigned long size, - unsigned long align, - unsigned long goal); -extern void * __alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit); + unsigned long goal, + unsigned long limit); + #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -extern void reserve_bootmem (unsigned long addr, unsigned long size); +extern void reserve_bootmem(unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low(x) \ @@ -70,22 +71,24 @@ extern void reserve_bootmem (unsigned long addr, unsigned long size); #define alloc_bootmem_low_pages(x) \ __alloc_bootmem_low(x, PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ -extern unsigned long free_all_bootmem (void); -extern void * __alloc_bootmem_node (pg_data_t *pgdat, - unsigned long size, - unsigned long align, - unsigned long goal); -extern unsigned long init_bootmem_node (pg_data_t *pgdat, - unsigned long freepfn, - unsigned long startpfn, - unsigned long endpfn); -extern void reserve_bootmem_node (pg_data_t *pgdat, - unsigned long physaddr, - unsigned long size); -extern void free_bootmem_node (pg_data_t *pgdat, - unsigned long addr, - unsigned long size); -extern unsigned long free_all_bootmem_node (pg_data_t *pgdat); + +extern unsigned long free_all_bootmem(void); +extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); +extern void *__alloc_bootmem_node(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal); +extern unsigned long init_bootmem_node(pg_data_t *pgdat, + unsigned long freepfn, + unsigned long startpfn, + unsigned long endpfn); +extern void reserve_bootmem_node(pg_data_t *pgdat, + unsigned long physaddr, + unsigned long size); +extern void free_bootmem_node(pg_data_t *pgdat, + unsigned long addr, + unsigned long size); + #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) @@ -102,19 +105,19 @@ static inline void *alloc_remap(int nid, unsigned long size) { return NULL; } -#endif +#endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */ extern unsigned long __meminitdata nr_kernel_pages; extern unsigned long nr_all_pages; -extern void * alloc_large_system_hash(const char *tablename, - unsigned long bucketsize, - unsigned long numentries, - int scale, - int flags, - unsigned int *_hash_shift, - unsigned int *_hash_mask, - unsigned long limit); +extern void *alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long limit); #define HASH_HIGHMEM 0x00000001 /* Consider highmem? */ #define HASH_EARLY 0x00000002 /* Allocating during early boot? */ -- cgit v1.2.2 From c1f60a5a419cc60aff27daffb150f5a3a3a79ef4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:11 -0700 Subject: [PATCH] reduce MAX_NR_ZONES: move HIGHMEM counters into highmem.c/.h Move totalhigh_pages and nr_free_highpages() into highmem.c/.h Move the totalhigh_pages definition into highmem.c/.h. Move the nr_free_highpages function into highmem.c [yoichi_yuasa@tripeaks.co.jp: build fix] Signed-off-by: Christoph Lameter Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 3 +++ include/linux/swap.h | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 42620e723abb..fd7d12daa94f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -24,11 +24,14 @@ static inline void flush_kernel_dcache_page(struct page *page) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); +extern unsigned long totalhigh_pages; #else /* CONFIG_HIGHMEM */ static inline unsigned int nr_free_highpages(void) { return 0; } +#define totalhigh_pages 0 + #ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 5e59184c9096..34a6bc3e6cf3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -162,7 +162,6 @@ extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct * /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; -extern unsigned long totalhigh_pages; extern unsigned long totalreserve_pages; extern long nr_swap_pages; extern unsigned int nr_free_pages(void); -- cgit v1.2.2 From 2f1b6248682f8b39ca3c7e549dfc216d26c4109b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:13 -0700 Subject: [PATCH] reduce MAX_NR_ZONES: use enum to define zones, reformat and comment Use enum for zones and reformat zones dependent information Add comments explaning the use of zones and add a zones_t type for zone numbers. Line up information that will be #ifdefd by the following patches. [akpm@osdl.org: comment cleanups] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++--- include/linux/mmzone.h | 66 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 51 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 45678b036955..2db4229a0066 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -470,7 +470,7 @@ void split_page(struct page *page, unsigned int order); #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) -static inline unsigned long page_zonenum(struct page *page) +static inline enum zone_type page_zonenum(struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } @@ -499,11 +499,12 @@ static inline unsigned long page_to_section(struct page *page) return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } -static inline void set_page_zone(struct page *page, unsigned long zone) +static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } + static inline void set_page_node(struct page *page, unsigned long node) { page->flags &= ~(NODES_MASK << NODES_PGSHIFT); @@ -515,7 +516,7 @@ static inline void set_page_section(struct page *page, unsigned long section) page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } -static inline void set_page_links(struct page *page, unsigned long zone, +static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f45163c528e8..03a5a6eb0ffa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -88,14 +88,53 @@ struct per_cpu_pageset { #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) #endif -#define ZONE_DMA 0 -#define ZONE_DMA32 1 -#define ZONE_NORMAL 2 -#define ZONE_HIGHMEM 3 +enum zone_type { + /* + * ZONE_DMA is used when there are devices that are not able + * to do DMA to all of addressable memory (ZONE_NORMAL). Then we + * carve out the portion of memory that is needed for these devices. + * The range is arch specific. + * + * Some examples + * + * Architecture Limit + * --------------------------- + * parisc, ia64, sparc <4G + * s390 <2G + * arm26 <48M + * arm Various + * alpha Unlimited or 0-16MB. + * + * i386, x86_64 and multiple other arches + * <16M. + */ + ZONE_DMA, + /* + * x86_64 needs two ZONE_DMAs because it supports devices that are + * only able to do DMA to the lower 16M but also 32 bit devices that + * can only do DMA areas below 4G. + */ + ZONE_DMA32, + /* + * Normal addressable memory is in ZONE_NORMAL. DMA operations can be + * performed on pages in ZONE_NORMAL if the DMA devices support + * transfers to all addressable memory. + */ + ZONE_NORMAL, + /* + * A memory area that is only addressable by the kernel through + * mapping portions into its own address space. This is for example + * used by i386 to allow the kernel to address the memory beyond + * 900MB. The kernel will set up special mappings (page + * table entries on i386) for each page that the kernel needs to + * access. + */ + ZONE_HIGHMEM, -#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */ -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ + MAX_NR_ZONES +}; +#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ /* * When a memory allocation must conform to specific limitations (such @@ -126,16 +165,6 @@ struct per_cpu_pageset { /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ -/* - * On machines where it is needed (eg PCs) we divide physical memory - * into multiple physical zones. On a 32bit PC we have 4 zones: - * - * ZONE_DMA < 16 MB ISA DMA capable memory - * ZONE_DMA32 0 MB Empty - * ZONE_NORMAL 16-896 MB direct mapped by the kernel - * ZONE_HIGHMEM > 896 MB only page cache and user processes - */ - struct zone { /* Fields commonly accessed by the page allocator */ unsigned long free_pages; @@ -266,7 +295,6 @@ struct zone { char *name; } ____cacheline_internodealigned_in_smp; - /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the @@ -373,12 +401,12 @@ static inline int populated_zone(struct zone *zone) return (!!zone->present_pages); } -static inline int is_highmem_idx(int idx) +static inline int is_highmem_idx(enum zone_type idx) { return (idx == ZONE_HIGHMEM); } -static inline int is_normal_idx(int idx) +static inline int is_normal_idx(enum zone_type idx) { return (idx == ZONE_NORMAL); } -- cgit v1.2.2 From fb0e7942bdcbbd2f90e61cb4cfa4fa892a873f8a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:13 -0700 Subject: [PATCH] reduce MAX_NR_ZONES: make ZONE_DMA32 optional Make ZONE_DMA32 optional - Add #ifdefs around ZONE_DMA32 specific code and definitions. - Add CONFIG_ZONE_DMA32 config option and use that for x86_64 that alone needs this zone. - Remove the use of CONFIG_DMA_IS_DMA32 and CONFIG_DMA_IS_NORMAL for ia64 and fix up the way per node ZVCs are calculated. - Fall back to prior GFP_ZONEMASK of 0x03 if there is no DMA32 zone. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- include/linux/mmzone.h | 16 +++++++++++++--- include/linux/vmstat.h | 4 +--- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index cc9e60844484..14610b56c132 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -13,7 +13,7 @@ struct vm_area_struct; /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */ #define __GFP_DMA ((__force gfp_t)0x01u) #define __GFP_HIGHMEM ((__force gfp_t)0x02u) -#ifdef CONFIG_DMA_IS_DMA32 +#ifndef CONFIG_ZONE_DMA32 #define __GFP_DMA32 ((__force gfp_t)0x01) /* ZONE_DMA is ZONE_DMA32 */ #elif BITS_PER_LONG < 64 #define __GFP_DMA32 ((__force gfp_t)0x00) /* ZONE_NORMAL is ZONE_DMA32 */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 03a5a6eb0ffa..adae3c915938 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -109,12 +109,14 @@ enum zone_type { * <16M. */ ZONE_DMA, +#ifdef CONFIG_ZONE_DMA32 /* * x86_64 needs two ZONE_DMAs because it supports devices that are * only able to do DMA to the lower 16M but also 32 bit devices that * can only do DMA areas below 4G. */ ZONE_DMA32, +#endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support @@ -161,9 +163,13 @@ enum zone_type { * * NOTE! Make sure this matches the zones in */ -#define GFP_ZONEMASK 0x07 -/* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ -#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ +#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ + +#ifdef CONFIG_ZONE_DMA32 +#define GFP_ZONEMASK 0x07 +#else +#define GFP_ZONEMASK 0x03 +#endif struct zone { /* Fields commonly accessed by the page allocator */ @@ -429,7 +435,11 @@ static inline int is_normal(struct zone *zone) static inline int is_dma32(struct zone *zone) { +#ifdef CONFIG_ZONE_DMA32 return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; +#else + return 0; +#endif } static inline int is_dma(struct zone *zone) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 2d9b1b60798a..9c6e62c56ec2 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -124,12 +124,10 @@ static inline unsigned long node_page_state(int node, struct zone *zones = NODE_DATA(node)->node_zones; return -#ifndef CONFIG_DMA_IS_NORMAL -#if !defined(CONFIG_DMA_IS_DMA32) && BITS_PER_LONG >= 64 +#ifdef CONFIG_ZONE_DMA32 zone_page_state(&zones[ZONE_DMA32], item) + #endif zone_page_state(&zones[ZONE_NORMAL], item) + -#endif #ifdef CONFIG_HIGHMEM zone_page_state(&zones[ZONE_HIGHMEM], item) + #endif -- cgit v1.2.2 From e53ef38d05dd59ed281a35590e4a5b64d8ff4c52 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:14 -0700 Subject: [PATCH] reduce MAX_NR_ZONES: make ZONE_HIGHMEM optional Make ZONE_HIGHMEM optional - ifdef out code and definitions related to CONFIG_HIGHMEM - __GFP_HIGHMEM falls back to normal allocations if there is no ZONE_HIGHMEM - GFP_ZONEMASK becomes 0x01 if there is no DMA32 and no HIGHMEM zone. [jdike@addtoit.com: build fix] Signed-off-by: Jeff Dike Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 18 ++++++++++-------- include/linux/mmzone.h | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 14610b56c132..2a2153ebfe0b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -9,17 +9,19 @@ struct vm_area_struct; /* * GFP bitmasks.. + * + * Zone modifiers (see linux/mmzone.h - low three bits) + * + * These may be masked by GFP_ZONEMASK to make allocations with this bit + * set fall back to ZONE_NORMAL. + * + * Do not put any conditional on these. If necessary modify the definitions + * without the underscores and use the consistently. The definitions here may + * be used in bit comparisons. */ -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */ #define __GFP_DMA ((__force gfp_t)0x01u) #define __GFP_HIGHMEM ((__force gfp_t)0x02u) -#ifndef CONFIG_ZONE_DMA32 -#define __GFP_DMA32 ((__force gfp_t)0x01) /* ZONE_DMA is ZONE_DMA32 */ -#elif BITS_PER_LONG < 64 -#define __GFP_DMA32 ((__force gfp_t)0x00) /* ZONE_NORMAL is ZONE_DMA32 */ -#else -#define __GFP_DMA32 ((__force gfp_t)0x04) /* Has own ZONE_DMA32 */ -#endif +#define __GFP_DMA32 ((__force gfp_t)0x04u) /* * Action modifiers - doesn't change the zoning diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index adae3c915938..76d33e688593 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -123,6 +123,7 @@ enum zone_type { * transfers to all addressable memory. */ ZONE_NORMAL, +#ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example @@ -132,11 +133,10 @@ enum zone_type { * access. */ ZONE_HIGHMEM, - +#endif MAX_NR_ZONES }; -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ /* * When a memory allocation must conform to specific limitations (such @@ -163,12 +163,34 @@ enum zone_type { * * NOTE! Make sure this matches the zones in */ -#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ #ifdef CONFIG_ZONE_DMA32 + +#ifdef CONFIG_HIGHMEM +#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ #define GFP_ZONEMASK 0x07 +#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ #else +#define GFP_ZONETYPES ((0x07 + 1) / 2 + 1) /* Loner */ +/* Mask __GFP_HIGHMEM */ +#define GFP_ZONEMASK 0x05 +#define ZONES_SHIFT 2 +#endif + +#else +#ifdef CONFIG_HIGHMEM + #define GFP_ZONEMASK 0x03 +#define ZONES_SHIFT 2 +#define GFP_ZONETYPES 3 + +#else + +#define GFP_ZONEMASK 0x01 +#define ZONES_SHIFT 1 +#define GFP_ZONETYPES 2 + +#endif #endif struct zone { @@ -409,7 +431,11 @@ static inline int populated_zone(struct zone *zone) static inline int is_highmem_idx(enum zone_type idx) { +#ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM); +#else + return 0; +#endif } static inline int is_normal_idx(enum zone_type idx) @@ -425,7 +451,11 @@ static inline int is_normal_idx(enum zone_type idx) */ static inline int is_highmem(struct zone *zone) { +#ifdef CONFIG_HIGHMEM return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; +#else + return 0; +#endif } static inline int is_normal(struct zone *zone) -- cgit v1.2.2 From 27bf71c2a7e596ed34e9bf2d4a5030321a09a1ad Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:15 -0700 Subject: [PATCH] reduce MAX_NR_ZONES: remove display of counters for unconfigured zones eventcounters: Do not display counters for zones that are not available on an arch Do not define or display counters for the DMA32 and the HIGHMEM zone if such zones were not configured. [akpm@osdl.org: s390 fix] [heiko.carstens@de.ibm.com: s390 fix] Signed-off-by: Christoph Lameter Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9c6e62c56ec2..176c7f797339 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -18,7 +18,19 @@ * generated will simply be the increment of a global address. */ -#define FOR_ALL_ZONES(x) x##_DMA, x##_DMA32, x##_NORMAL, x##_HIGH +#ifdef CONFIG_ZONE_DMA32 +#define DMA32_ZONE(xx) xx##_DMA32, +#else +#define DMA32_ZONE(xx) +#endif + +#ifdef CONFIG_HIGHMEM +#define HIGHMEM_ZONE(xx) , xx##_HIGH +#else +#define HIGHMEM_ZONE(xx) +#endif + +#define FOR_ALL_ZONES(xx) xx##_DMA, DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), -- cgit v1.2.2 From 4e4785bcf0c8503224fa6c17d8e0228de781bff6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:17 -0700 Subject: [PATCH] mempolicies: fix policy_zone check There is a check in zonelist_policy that compares pieces of the bitmap obtained from a gfp mask via GFP_ZONETYPES with a zone number in function zonelist_policy(). The bitmap is an ORed mask of __GFP_DMA, __GFP_DMA32 and __GFP_HIGHMEM. The policy_zone is a zone number with the possible values of ZONE_DMA, ZONE_DMA32, ZONE_HIGHMEM and ZONE_NORMAL. These are two different domains of values. For some reason seemed to work before the zone reduction patchset (It definitely works on SGI boxes since we just have one zone and the check cannot fail). With the zone reduction patchset this check definitely fails on systems with two zones if the system actually has memory in both zones. This is because ZONE_NORMAL is selected using no __GFP flag at all and thus gfp_zone(gfpmask) == 0. ZONE_DMA is selected when __GFP_DMA is set. __GFP_DMA is 0x01. So gfp_zone(gfpmask) == 1. policy_zone is set to ZONE_NORMAL (==1) if ZONE_NORMAL and ZONE_DMA are populated. For ZONE_NORMAL gfp_zone() yields 0 which is < policy_zone(ZONE_NORMAL) and so policy is not applied to regular memory allocations! Instead gfp_zone(__GFP_DMA) == 1 which results in policy being applied to DMA allocations! What we realy want in that place is to establish the highest allowable zone for a given gfp_mask. If the highest zone is higher or equal to the policy_zone then memory policies need to be applied. We have such a highest_zone() function in page_alloc.c. So move the highest_zone() function from mm/page_alloc.c into include/linux/gfp.h. On the way we simplify the function and use the new zone_type that was also introduced with the zone reduction patchset plus we also specify the right type for the gfp flags parameter. Signed-off-by: Christoph Lameter Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2a2153ebfe0b..a0992d392d79 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -85,6 +85,21 @@ static inline int gfp_zone(gfp_t gfp) return zone; } +static inline enum zone_type highest_zone(gfp_t flags) +{ + if (flags & __GFP_DMA) + return ZONE_DMA; +#ifdef CONFIG_ZONE_DMA32 + if (flags & __GFP_DMA32) + return ZONE_DMA32; +#endif +#ifdef CONFIG_HIGHMEM + if (flags & __GFP_HIGHMEM) + return ZONE_HIGHMEM; +#endif + return ZONE_NORMAL; +} + /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such -- cgit v1.2.2 From 2f6726e54a9410e2e4cee864947c05e954051916 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:18 -0700 Subject: [PATCH] Apply type enum zone_type After we have done this we can now do some typing cleanup. The memory policy layer keeps a policy_zone that specifies the zone that gets memory policies applied. This variable can now be of type enum zone_type. The check_highest_zone function and the build_zonelists funnctionm must then also take a enum zone_type parameter. Plus there are a number of loops over zones that also should use zone_type. We run into some troubles at some points with functions that need a zone_type variable to become -1. Fix that up. [pj@sgi.com: fix set_mempolicy() crash] Signed-off-by: Christoph Lameter Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 72440f0a443d..09f0f575ddff 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -162,9 +162,9 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr); extern unsigned slab_node(struct mempolicy *policy); -extern int policy_zone; +extern enum zone_type policy_zone; -static inline void check_highest_zone(int k) +static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone) policy_zone = k; -- cgit v1.2.2 From 19655d3487001d7df0e10e9cbfc27c758b77c2b5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:19 -0700 Subject: [PATCH] linearly index zone->node_zonelists[] I wonder why we need this bitmask indexing into zone->node_zonelists[]? We always start with the highest zone and then include all lower zones if we build zonelists. Are there really cases where we need allocation from ZONE_DMA or ZONE_HIGHMEM but not ZONE_NORMAL? It seems that the current implementation of highest_zone() makes that already impossible. If we go linear on the index then gfp_zone() == highest_zone() and a lot of definitions fall by the wayside. We can now revert back to the use of gfp_zone() in mempolicy.c ;-) Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 +----------- include/linux/mmzone.h | 52 +++++--------------------------------------------- 2 files changed, 6 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a0992d392d79..63ab88a36f39 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -12,9 +12,6 @@ struct vm_area_struct; * * Zone modifiers (see linux/mmzone.h - low three bits) * - * These may be masked by GFP_ZONEMASK to make allocations with this bit - * set fall back to ZONE_NORMAL. - * * Do not put any conditional on these. If necessary modify the definitions * without the underscores and use the consistently. The definitions here may * be used in bit comparisons. @@ -78,14 +75,7 @@ struct vm_area_struct; #define GFP_DMA32 __GFP_DMA32 -static inline int gfp_zone(gfp_t gfp) -{ - int zone = GFP_ZONEMASK & (__force int) gfp; - BUG_ON(zone >= GFP_ZONETYPES); - return zone; -} - -static inline enum zone_type highest_zone(gfp_t flags) +static inline enum zone_type gfp_zone(gfp_t flags) { if (flags & __GFP_DMA) return ZONE_DMA; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 76d33e688593..7fe317164b73 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -137,60 +137,18 @@ enum zone_type { MAX_NR_ZONES }; - /* * When a memory allocation must conform to specific limitations (such * as being suitable for DMA) the caller will pass in hints to the * allocator in the gfp_mask, in the zone modifier bits. These bits * are used to select a priority ordered list of memory zones which - * match the requested limits. GFP_ZONEMASK defines which bits within - * the gfp_mask should be considered as zone modifiers. Each valid - * combination of the zone modifier bits has a corresponding list - * of zones (in node_zonelists). Thus for two zone modifiers there - * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will - * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible - * combinations of zone modifiers in "zone modifier space". - * - * As an optimisation any zone modifier bits which are only valid when - * no other zone modifier bits are set (loners) should be placed in - * the highest order bits of this field. This allows us to reduce the - * extent of the zonelists thus saving space. For example in the case - * of three zone modifier bits, we could require up to eight zonelists. - * If the left most zone modifier is a "loner" then the highest valid - * zonelist would be four allowing us to allocate only five zonelists. - * Use the first form for GFP_ZONETYPES when the left most bit is not - * a "loner", otherwise use the second. - * - * NOTE! Make sure this matches the zones in + * match the requested limits. See gfp_zone() in include/linux/gfp.h */ -#ifdef CONFIG_ZONE_DMA32 - -#ifdef CONFIG_HIGHMEM -#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ -#define GFP_ZONEMASK 0x07 -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ -#else -#define GFP_ZONETYPES ((0x07 + 1) / 2 + 1) /* Loner */ -/* Mask __GFP_HIGHMEM */ -#define GFP_ZONEMASK 0x05 -#define ZONES_SHIFT 2 -#endif - -#else -#ifdef CONFIG_HIGHMEM - -#define GFP_ZONEMASK 0x03 -#define ZONES_SHIFT 2 -#define GFP_ZONETYPES 3 - +#if !defined(CONFIG_ZONE_DMA32) && !defined(CONFIG_HIGHMEM) +#define ZONES_SHIFT 1 #else - -#define GFP_ZONEMASK 0x01 -#define ZONES_SHIFT 1 -#define GFP_ZONETYPES 2 - -#endif +#define ZONES_SHIFT 2 #endif struct zone { @@ -360,7 +318,7 @@ struct zonelist { struct bootmem_data; typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; - struct zonelist node_zonelists[GFP_ZONETYPES]; + struct zonelist node_zonelists[MAX_NR_ZONES]; int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP struct page *node_mem_map; -- cgit v1.2.2 From 8bc719d3cab8414938f9ea6e33b58d8810d18068 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 25 Sep 2006 23:31:20 -0700 Subject: [PATCH] out of memory notifier Add a notifer chain to the out of memory killer. If one of the registered callbacks could release some memory, do not kill the process but return and retry the allocation that forced the oom killer to run. The purpose of the notifier is to add a safety net in the presence of memory ballooners. If the resource manager inflated the balloon to a size where memory allocations can not be satisfied anymore, it is better to deflate the balloon a bit instead of killing processes. The implementation for the s390 ballooner is included. [akpm@osdl.org: cleanups] Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 34a6bc3e6cf3..32db06c8ffe0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -10,6 +10,8 @@ #include #include +struct notifier_block; + #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 @@ -156,6 +158,8 @@ struct swap_list_t { /* linux/mm/oom_kill.c */ extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); +extern int register_oom_notifier(struct notifier_block *nb); +extern int unregister_oom_notifier(struct notifier_block *nb); /* linux/mm/memory.c */ extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); -- cgit v1.2.2 From 7ff6f08295d90ab20d25200ef485ebb45b1b8d71 Mon Sep 17 00:00:00 2001 From: Martin Peschke Date: Mon, 25 Sep 2006 23:31:21 -0700 Subject: [PATCH] CPU hotplug compatible alloc_percpu() This patch splits alloc_percpu() up into two phases. Likewise for free_percpu(). This allows clients to limit initial allocations to online cpu's, and to populate or depopulate per-cpu data at run time as needed: struct my_struct *obj; /* initial allocation for online cpu's */ obj = percpu_alloc(sizeof(struct my_struct), GFP_KERNEL); ... /* populate per-cpu data for cpu coming online */ ptr = percpu_populate(obj, sizeof(struct my_struct), GFP_KERNEL, cpu); ... /* access per-cpu object */ ptr = percpu_ptr(obj, smp_processor_id()); ... /* depopulate per-cpu data for cpu going offline */ percpu_depopulate(obj, cpu); ... /* final removal */ percpu_free(obj); Signed-off-by: Martin Peschke Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 79 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index f926490a7d8b..3835a9642f13 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -1,9 +1,12 @@ #ifndef __LINUX_PERCPU_H #define __LINUX_PERCPU_H + #include /* For preempt_disable() */ #include /* For kmalloc() */ #include #include /* For memset() */ +#include + #include /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */ @@ -27,39 +30,77 @@ struct percpu_data { void *ptrs[NR_CPUS]; }; +#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) /* - * Use this to get to a cpu's version of the per-cpu object allocated using - * alloc_percpu. Non-atomic access to the current CPU's version should + * Use this to get to a cpu's version of the per-cpu object dynamically + * allocated. Non-atomic access to the current CPU's version should * probably be combined with get_cpu()/put_cpu(). */ -#define per_cpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ +#define percpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -extern void *__alloc_percpu(size_t size); -extern void free_percpu(const void *); +extern void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu); +extern void percpu_depopulate(void *__pdata, int cpu); +extern int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, + cpumask_t *mask); +extern void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask); +extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); +extern void percpu_free(void *__pdata); #else /* CONFIG_SMP */ -#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) + +static inline void percpu_depopulate(void *__pdata, int cpu) +{ +} + +static inline void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) +{ +} -static inline void *__alloc_percpu(size_t size) +static inline void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, + int cpu) { - void *ret = kmalloc(size, GFP_KERNEL); - if (ret) - memset(ret, 0, size); - return ret; + return percpu_ptr(__pdata, cpu); } -static inline void free_percpu(const void *ptr) -{ - kfree(ptr); + +static inline int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, + cpumask_t *mask) +{ + return 0; +} + +static inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +{ + return kzalloc(size, gfp); +} + +static inline void percpu_free(void *__pdata) +{ + kfree(__pdata); } #endif /* CONFIG_SMP */ -/* Simple wrapper for the common case: zeros memory. */ -#define alloc_percpu(type) ((type *)(__alloc_percpu(sizeof(type)))) +#define percpu_populate_mask(__pdata, size, gfp, mask) \ + __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) +#define percpu_depopulate_mask(__pdata, mask) \ + __percpu_depopulate_mask((__pdata), &(mask)) +#define percpu_alloc_mask(size, gfp, mask) \ + __percpu_alloc_mask((size), (gfp), &(mask)) + +#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map) + +/* (legacy) interface for use without CPU hotplug handling */ + +#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \ + cpu_possible_map) +#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) +#define free_percpu(ptr) percpu_free((ptr)) +#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) #endif /* __LINUX_PERCPU_H */ -- cgit v1.2.2 From db37648cd6ce9b828abd6d49aa3d269926ee7b7d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:31:24 -0700 Subject: [PATCH] mm: non syncing lock_page() lock_page needs the caller to have a reference on the page->mapping inode due to sync_page, ergo set_page_dirty_lock is obviously buggy according to its comments. Solve it by introducing a new lock_page_nosync which does not do a sync_page. akpm: unpleasant solution to an unpleasant problem. If it goes wrong it could cause great slowdowns while the lock_page() caller waits for kblockd to perform the unplug. And if a filesystem has special sync_page() requirements (none presently do), permanent hangs are possible. otoh, set_page_dirty_lock() is usually (always?) called against userspace pages. They are always up-to-date, so there shouldn't be any pending read I/O against these pages. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0a2f5d27f60e..64f950925151 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -130,14 +130,29 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, } extern void FASTCALL(__lock_page(struct page *page)); +extern void FASTCALL(__lock_page_nosync(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); +/* + * lock_page may only be called if we have the page's inode pinned. + */ static inline void lock_page(struct page *page) { might_sleep(); if (TestSetPageLocked(page)) __lock_page(page); } + +/* + * lock_page_nosync should only be used if we can't pin the page's inode. + * Doesn't play quite so well with block device plugging. + */ +static inline void lock_page_nosync(struct page *page) +{ + might_sleep(); + if (TestSetPageLocked(page)) + __lock_page_nosync(page); +} /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. -- cgit v1.2.2 From da6052f7b33abe55fbfd7d2213815f58c00a88d4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 25 Sep 2006 23:31:35 -0700 Subject: [PATCH] update some mm/ comments Let's try to keep mm/ comments more useful and up to date. This is a start. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 68 +++++++++++++++++++++++++++------------------- include/linux/page-flags.h | 35 ++++++++++++++---------- 2 files changed, 60 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2db4229a0066..f2018775b995 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -219,7 +219,8 @@ struct inode; * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using - * a page. + * a page, though if it is a pagecache page, rmap structures can tell us + * who is mapping it. */ struct page { unsigned long flags; /* Atomic flags, some possibly @@ -299,8 +300,7 @@ struct page { */ /* - * Drop a ref, return true if the logical refcount fell to zero (the page has - * no users) + * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { @@ -356,43 +356,55 @@ void split_page(struct page *page, unsigned int order); * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. - * page_count() == 1 means the page is used for exactly one purpose - * (e.g. a private data page of one process). + * page_count() > 0 means the page has been allocated. * - * A page may be used for kmalloc() or anyone else who does a - * __get_free_page(). In this case the page_count() is at least 1, and - * all other fields are unused but should be 0 or NULL. The - * management of this page is the responsibility of the one who uses - * it. + * Pages are allocated by the slab allocator in order to provide memory + * to kmalloc and kmem_cache_alloc. In this case, the management of the + * page, and the fields in 'struct page' are the responsibility of mm/slab.c + * unless a particular usage is carefully commented. (the responsibility of + * freeing the kmalloc memory is the caller's, of course). * - * The other pages (we may call them "process pages") are completely + * A page may be used by anyone else who does a __get_free_page(). + * In this case, page_count still tracks the references, and should only + * be used through the normal accessor functions. The top bits of page->flags + * and page->virtual store page management information, but all other fields + * are unused and could be used privately, carefully. The management of this + * page is the responsibility of the one who allocated it, and those who have + * subsequently been given references to it. + * + * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * - * A page may belong to an inode's memory mapping. In this case, - * page->mapping is the pointer to the inode, and page->index is the - * file offset of the page, in units of PAGE_CACHE_SIZE. + * A pagecache page contains an opaque `private' member, which belongs to the + * page's address_space. Usually, this is the address of a circular list of + * the page's disk buffers. PG_private must be set to tell the VM to call + * into the filesystem to release these pages. * - * A page contains an opaque `private' member, which belongs to the - * page's address_space. Usually, this is the address of a circular - * list of the page's disk buffers. + * A page may belong to an inode's memory mapping. In this case, page->mapping + * is the pointer to the inode, and page->index is the file offset of the page, + * in units of PAGE_CACHE_SIZE. * - * For pages belonging to inodes, the page_count() is the number of - * attaches, plus 1 if `private' contains something, plus one for - * the page cache itself. + * If pagecache pages are not associated with an inode, they are said to be + * anonymous pages. These may become associated with the swapcache, and in that + * case PG_swapcache is set, and page->private is an offset into the swapcache. * - * Instead of keeping dirty/clean pages in per address-space lists, we instead - * now tag pages as dirty/under writeback in the radix tree. + * In either case (swapcache or inode backed), the pagecache itself holds one + * reference to the page. Setting PG_private should also increment the + * refcount. The each user mapping also has a reference to the page. * - * There is also a per-mapping radix tree mapping index to the page - * in memory if present. The tree is rooted at mapping->root. + * The pagecache pages are stored in a per-mapping radix tree, which is + * rooted at mapping->page_tree, and indexed by offset. + * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space + * lists, we instead now tag pages as dirty/writeback in the radix tree. * - * All process pages can do I/O: + * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need - * to be written to disk, - * - private pages which have been modified may need to be swapped out - * to swap space and (later) to be read back into memory. + * to be written back to the inode on disk, + * - anonymous pages (including MAP_PRIVATE file mappings) which have been + * modified may need to be swapped out to swap space and (later) to be read + * back into memory. */ /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5748642e9f36..9d7921dd50f0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -13,24 +13,25 @@ * PG_reserved is set for special pages, which can never be swapped out. Some * of them might not even exist (eg empty_bad_page)... * - * The PG_private bitflag is set if page->private contains a valid value. + * The PG_private bitflag is set on pagecache pages if they contain filesystem + * specific data (which is normally at page->private). It can be used by + * private allocations for its own usage. * - * During disk I/O, PG_locked is used. This bit is set before I/O and - * reset when I/O completes. page_waitqueue(page) is a wait queue of all tasks - * waiting for the I/O on this page to complete. + * During initiation of disk I/O, PG_locked is set. This bit is set before I/O + * and cleared when writeback _starts_ or when read _completes_. PG_writeback + * is set before writeback starts and cleared when it finishes. + * + * PG_locked also pins a page in pagecache, and blocks truncation of the file + * while it is held. + * + * page_waitqueue(page) is a wait queue of all tasks waiting for the page + * to become unlocked. * * PG_uptodate tells whether the page's contents is valid. When a read * completes, the page becomes uptodate, unless a disk I/O error happened. * - * For choosing which pages to swap out, inode pages carry a PG_referenced bit, - * which is set any time the system accesses that page through the (mapping, - * index) hash table. This referenced bit, together with the referenced bit - * in the page tables, is used to manipulate page->age and move the page across - * the active, inactive_dirty and inactive_clean lists. - * - * Note that the referenced bit, the page->lru list_head and the active, - * inactive_dirty and inactive_clean lists are protected by the - * zone->lru_lock, and *NOT* by the usual PG_locked bit! + * PG_referenced, PG_reclaim are used for page reclaim for anonymous and + * file-backed pagecache (see mm/vmscan.c). * * PG_error is set to indicate that an I/O error occurred on this page. * @@ -42,6 +43,10 @@ * space, they need to be kmapped separately for doing IO on the pages. The * struct page (these bits with information) are always mapped into kernel * address space... + * + * PG_buddy is set to indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + * */ /* @@ -74,7 +79,7 @@ #define PG_checked 8 /* kill me in 2.5.. */ #define PG_arch_1 9 #define PG_reserved 10 -#define PG_private 11 /* Has something at ->private */ +#define PG_private 11 /* If pagecache, has fs-private data */ #define PG_writeback 12 /* Page is under writeback */ #define PG_nosave 13 /* Used for system suspend/resume */ @@ -83,7 +88,7 @@ #define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ #define PG_reclaim 17 /* To be reclaimed asap */ -#define PG_nosave_free 18 /* Free, should not be written */ +#define PG_nosave_free 18 /* Used for system suspend/resume */ #define PG_buddy 19 /* Page is free, on buddy lists */ -- cgit v1.2.2 From dbe5e69d2d6e591996ea2b817b887d03b60bb143 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Sep 2006 23:31:36 -0700 Subject: [PATCH] slab: optimize kmalloc_node the same way as kmalloc [akpm@osdl.org: export fix] Signed-off-by: Christoph Hellwig Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 193c03c547ec..2f6bef6a98c9 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -202,7 +202,30 @@ extern int slab_is_available(void); #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node(kmem_cache_t *, gfp_t flags, int node); -extern void *kmalloc_node(size_t size, gfp_t flags, int node); +extern void *__kmalloc_node(size_t size, gfp_t flags, int node); + +static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +{ + if (__builtin_constant_p(size)) { + int i = 0; +#define CACHE(x) \ + if (size <= x) \ + goto found; \ + else \ + i++; +#include "kmalloc_sizes.h" +#undef CACHE + { + extern void __you_cannot_kmalloc_that_much(void); + __you_cannot_kmalloc_that_much(); + } +found: + return kmem_cache_alloc_node((flags & GFP_DMA) ? + malloc_sizes[i].cs_dmacachep : + malloc_sizes[i].cs_cachep, flags, node); + } + return __kmalloc_node(size, flags, node); +} #else static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int node) { -- cgit v1.2.2 From 9b819d204cf602eab1a53a9ec4b8d2ca51e02a1d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:40 -0700 Subject: [PATCH] Add __GFP_THISNODE to avoid fallback to other nodes and ignore cpuset/memory policy restrictions Add a new gfp flag __GFP_THISNODE to avoid fallback to other nodes. This flag is essential if a kernel component requires memory to be located on a certain node. It will be needed for alloc_pages_node() to force allocation on the indicated node and for alloc_pages() to force allocation on the current node. Signed-off-by: Christoph Lameter Cc: Andy Whitcroft Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 63ab88a36f39..0eda5b6dedbd 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -45,6 +45,7 @@ struct vm_area_struct; #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ +#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -53,7 +54,7 @@ struct vm_area_struct; #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ - __GFP_NOMEMALLOC|__GFP_HARDWALL) + __GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_THISNODE) /* This equals 0, but use constants in case they ever change */ #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) -- cgit v1.2.2 From 980128f223fa3c75e3ebdde650c9f1bcabd4c0a2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:46 -0700 Subject: [PATCH] Define easier to handle GFP_THISNODE In many places we will need to use the same combination of flags. Specify a single GFP_THISNODE definition for ease of use in gfp.h. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0eda5b6dedbd..8b34aabfe4c6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -67,6 +67,8 @@ struct vm_area_struct; #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ __GFP_HIGHMEM) +#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) + /* Flag - indicates that the buffer will be suitable for DMA. Ignored on some platforms, used as appropriate on others */ -- cgit v1.2.2 From 8417bba4b151346ed475fcc923693c9e3be89063 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:51 -0700 Subject: [PATCH] Replace min_unmapped_ratio by min_unmapped_pages in struct zone *_pages is a better description of the role of the variable. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7fe317164b73..a703527e2b45 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -169,7 +169,7 @@ struct zone { /* * zone reclaim becomes active if more unmapped pages exist. */ - unsigned long min_unmapped_ratio; + unsigned long min_unmapped_pages; struct per_cpu_pageset *pageset[NR_CPUS]; #else struct per_cpu_pageset pageset[NR_CPUS]; -- cgit v1.2.2 From 972d1a7b140569084439a81265a0f15b74e924e0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:51 -0700 Subject: [PATCH] ZVC: Support NR_SLAB_RECLAIMABLE / NR_SLAB_UNRECLAIMABLE Remove the atomic counter for slab_reclaim_pages and replace the counter and NR_SLAB with two ZVC counter that account for unreclaimable and reclaimable slab pages: NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE. Change the check in vmscan.c to refer to to NR_SLAB_RECLAIMABLE. The intend seems to be to check for slab pages that could be freed. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- include/linux/slab.h | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a703527e2b45..08c41b9f92e0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -51,7 +51,8 @@ enum zone_stat_item { NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, - NR_SLAB, /* Pages used by slab allocator */ + NR_SLAB_RECLAIMABLE, + NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_FILE_DIRTY, NR_WRITEBACK, diff --git a/include/linux/slab.h b/include/linux/slab.h index 2f6bef6a98c9..66d6eb78d1c6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -284,8 +284,6 @@ extern kmem_cache_t *fs_cachep; extern kmem_cache_t *sighand_cachep; extern kmem_cache_t *bio_cachep; -extern atomic_t slab_reclaim_pages; - #endif /* __KERNEL__ */ #endif /* _LINUX_SLAB_H */ -- cgit v1.2.2 From 0ff38490c836dc379ff7ec45b10a15a662f4e5f6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:52 -0700 Subject: [PATCH] zone_reclaim: dynamic slab reclaim Currently one can enable slab reclaim by setting an explicit option in /proc/sys/vm/zone_reclaim_mode. Slab reclaim is then used as a final option if the freeing of unmapped file backed pages is not enough to free enough pages to allow a local allocation. However, that means that the slab can grow excessively and that most memory of a node may be used by slabs. We have had a case where a machine with 46GB of memory was using 40-42GB for slab. Zone reclaim was effective in dealing with pagecache pages. However, slab reclaim was only done during global reclaim (which is a bit rare on NUMA systems). This patch implements slab reclaim during zone reclaim. Zone reclaim occurs if there is a danger of an off node allocation. At that point we 1. Shrink the per node page cache if the number of pagecache pages is more than min_unmapped_ratio percent of pages in a zone. 2. Shrink the slab cache if the number of the nodes reclaimable slab pages (patch depends on earlier one that implements that counter) are more than min_slab_ratio (a new /proc/sys/vm tunable). The shrinking of the slab cache is a bit problematic since it is not node specific. So we simply calculate what point in the slab we want to reach (current per node slab use minus the number of pages that neeed to be allocated) and then repeately run the global reclaim until that is unsuccessful or we have reached the limit. I hope we will have zone based slab reclaim at some point which will make that easier. The default for the min_slab_ratio is 5% Also remove the slab option from /proc/sys/vm/zone_reclaim_mode. [akpm@osdl.org: cleanups] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 +++ include/linux/swap.h | 1 + include/linux/sysctl.h | 1 + 3 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 08c41b9f92e0..3693f1a52788 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -171,6 +171,7 @@ struct zone { * zone reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; + unsigned long min_slab_pages; struct per_cpu_pageset *pageset[NR_CPUS]; #else struct per_cpu_pageset pageset[NR_CPUS]; @@ -448,6 +449,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file void __user *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); #include /* Returns the number of the current Node. */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 32db06c8ffe0..a2f5ad7c2d2e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -193,6 +193,7 @@ extern long vm_total_pages; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; extern int sysctl_min_unmapped_ratio; +extern int sysctl_min_slab_ratio; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 736ed917a4f8..eca555781d05 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -191,6 +191,7 @@ enum VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */ VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ + VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ }; -- cgit v1.2.2 From 89fa30242facca249aead2aac03c4c69764f911c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:55 -0700 Subject: [PATCH] NUMA: Add zone_to_nid function There are many places where we need to determine the node of a zone. Currently we use a difficult to read sequence of pointer dereferencing. Put that into an inline function and use throughout VM. Maybe we can find a way to optimize the lookup in the future. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f2018775b995..856f0ee7e84a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -499,12 +499,17 @@ static inline struct zone *page_zone(struct page *page) return zone_table[page_zone_id(page)]; } +static inline unsigned long zone_to_nid(struct zone *zone) +{ + return zone->zone_pgdat->node_id; +} + static inline unsigned long page_to_nid(struct page *page) { if (FLAGS_HAS_NODE) return (page->flags >> NODES_PGSHIFT) & NODES_MASK; else - return page_zone(page)->zone_pgdat->node_id; + return zone_to_nid(page_zone(page)); } static inline unsigned long page_to_section(struct page *page) { -- cgit v1.2.2 From 62bac0185ad3dfef11d9602980445c54d45199c6 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 25 Sep 2006 23:31:56 -0700 Subject: [PATCH] selinux: eliminate selinux_task_ctxid Eliminate selinux_task_ctxid since it duplicates selinux_task_get_sid. Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/selinux.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/selinux.h b/include/linux/selinux.h index aad4e390d6a5..79e4707ca772 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -69,16 +69,6 @@ int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op, */ void selinux_audit_set_callback(int (*callback)(void)); -/** - * selinux_task_ctxid - determine a context ID for a process. - * @tsk: the task object - * @ctxid: ID value returned via this - * - * On return, ctxid will contain an ID for the context. This value - * should only be used opaquely. - */ -void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid); - /** * selinux_ctxid_to_string - map a security context ID to a string * @ctxid: security context ID to be converted. @@ -166,11 +156,6 @@ static inline void selinux_audit_set_callback(int (*callback)(void)) return; } -static inline void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid) -{ - *ctxid = 0; -} - static inline int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen) { *ctx = NULL; -- cgit v1.2.2 From 1a70cd40cb291c25b67ec0da715a49d76719329d Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 25 Sep 2006 23:31:57 -0700 Subject: [PATCH] selinux: rename selinux_ctxid_to_string Rename selinux_ctxid_to_string to selinux_sid_to_string to be consistent with other interfaces. Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/selinux.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 79e4707ca772..df9098de4c99 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -70,8 +70,8 @@ int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op, void selinux_audit_set_callback(int (*callback)(void)); /** - * selinux_ctxid_to_string - map a security context ID to a string - * @ctxid: security context ID to be converted. + * selinux_sid_to_string - map a security context ID to a string + * @sid: security context ID to be converted. * @ctx: address of context string to be returned * @ctxlen: length of returned context string. * @@ -79,7 +79,7 @@ void selinux_audit_set_callback(int (*callback)(void)); * string will be allocated internally, and the caller must call * kfree() on it after use. */ -int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen); +int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen); /** * selinux_get_inode_sid - get the inode's security context ID @@ -156,7 +156,7 @@ static inline void selinux_audit_set_callback(int (*callback)(void)) return; } -static inline int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen) +static inline int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen) { *ctx = NULL; *ctxlen = 0; -- cgit v1.2.2 From 9a2f44f01a67a6ecca71515af999895b45a2aeb0 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 25 Sep 2006 23:31:58 -0700 Subject: [PATCH] selinux: replace ctxid with sid in selinux_audit_rule_match interface Replace ctxid with sid in selinux_audit_rule_match interface for consistency with other interfaces. Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/selinux.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/selinux.h b/include/linux/selinux.h index df9098de4c99..d1b7ca6c1c57 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -46,7 +46,7 @@ void selinux_audit_rule_free(struct selinux_audit_rule *rule); /** * selinux_audit_rule_match - determine if a context ID matches a rule. - * @ctxid: the context ID to check + * @sid: the context ID to check * @field: the field this rule refers to * @op: the operater the rule uses * @rule: pointer to the audit rule to check against @@ -55,7 +55,7 @@ void selinux_audit_rule_free(struct selinux_audit_rule *rule); * Returns 1 if the context id matches the rule, 0 if it does not, and * -errno on failure. */ -int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op, +int selinux_audit_rule_match(u32 sid, u32 field, u32 op, struct selinux_audit_rule *rule, struct audit_context *actx); @@ -144,7 +144,7 @@ static inline void selinux_audit_rule_free(struct selinux_audit_rule *rule) return; } -static inline int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op, +static inline int selinux_audit_rule_match(u32 sid, u32 field, u32 op, struct selinux_audit_rule *rule, struct audit_context *actx) { -- cgit v1.2.2 From af8c65b57aaa4ae321af34dbfc5ca7f5625263fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Sep 2006 23:32:07 -0700 Subject: [PATCH] FRV: permit __do_IRQ() to be dispensed with Permit __do_IRQ() to be dispensed with based on a configuration option. Signed-off-by: David Howells Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index fbf6d901e9c2..48d3cb3b6a47 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -320,7 +320,9 @@ handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, * Monolithic do_IRQ implementation. * (is an explicit fastcall, because i386 4KSTACKS calls it from assembly) */ +#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs); +#endif /* * Architectures call this to let the generic IRQ layer @@ -332,10 +334,14 @@ static inline void generic_handle_irq(unsigned int irq, struct pt_regs *regs) { struct irq_desc *desc = irq_desc + irq; +#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ + desc->handle_irq(irq, desc, regs); +#else if (likely(desc->handle_irq)) desc->handle_irq(irq, desc, regs); else __do_IRQ(irq, regs); +#endif } /* Handling of unhandled and spurious interrupts: */ -- cgit v1.2.2 From 5f97f7f9400de47ae837170bb274e90ad3934386 Mon Sep 17 00:00:00 2001 From: Haavard Skinnemoen Date: Mon, 25 Sep 2006 23:32:13 -0700 Subject: [PATCH] avr32 architecture This adds support for the Atmel AVR32 architecture as well as the AT32AP7000 CPU and the AT32STK1000 development board. AVR32 is a new high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular emphasis on low power consumption and high code density. The AVR32 architecture is not binary compatible with earlier 8-bit AVR architectures. The AVR32 architecture, including the instruction set, is described by the AVR32 Architecture Manual, available from http://www.atmel.com/dyn/resources/prod_documents/doc32000.pdf The Atmel AT32AP7000 is the first CPU implementing the AVR32 architecture. It features a 7-stage pipeline, 16KB instruction and data caches and a full Memory Management Unit. It also comes with a large set of integrated peripherals, many of which are shared with the AT91 ARM-based controllers from Atmel. Full data sheet is available from http://www.atmel.com/dyn/resources/prod_documents/doc32003.pdf while the CPU core implementation including caches and MMU is documented by the AVR32 AP Technical Reference, available from http://www.atmel.com/dyn/resources/prod_documents/doc32001.pdf Information about the AT32STK1000 development board can be found at http://www.atmel.com/dyn/products/tools_card.asp?tool_id=3918 including a BSP CD image with an earlier version of this patch, development tools (binaries and source/patches) and a root filesystem image suitable for booting from SD card. Alternatively, there's a preliminary "getting started" guide available at http://avr32linux.org/twiki/bin/view/Main/GettingStarted which provides links to the sources and patches you will need in order to set up a cross-compiling environment for avr32-linux. This patch, as well as the other patches included with the BSP and the toolchain patches, is actively supported by Atmel Corporation. [dmccr@us.ibm.com: Fix more pxx_page macro locations] [bunk@stusta.de: fix `make defconfig'] Signed-off-by: Haavard Skinnemoen Signed-off-by: Adrian Bunk Signed-off-by: Dave McCracken Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/elf-em.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/elf-em.h b/include/linux/elf-em.h index 6a5796c81c90..666e0a5f00fc 100644 --- a/include/linux/elf-em.h +++ b/include/linux/elf-em.h @@ -31,6 +31,7 @@ #define EM_M32R 88 /* Renesas M32R */ #define EM_H8_300 46 /* Renesas H8/300,300H,H8S */ #define EM_FRV 0x5441 /* Fujitsu FR-V */ +#define EM_AVR32 0x18ad /* Atmel AVR32 */ /* * This is an interim value that we will use until the committee comes -- cgit v1.2.2 From 9c9b8b388296ad5a306ab238dc677cfe6ff4cb12 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 25 Sep 2006 23:32:26 -0700 Subject: [PATCH] x86: put .note.* sections into a PT_NOTE segment in vmlinux This patch will pack any .note.* section into a PT_NOTE segment in the output file. To do this, we tell ld that we need a PT_NOTE segment. This requires us to start explicitly mapping sections to segments, so we also need to explicitly create PT_LOAD segments for text and data, and map the sections to them appropriately. Fortunately, each section will default to its previous section's segment, so it doesn't take many changes to vmlinux.lds.S. This only changes i386 for now, but I presume the corresponding changes for other architectures will be as simple. This change also adds , which defines C and Assembler macros for actually creating ELF notes. Signed-off-by: Jeremy Fitzhardinge Cc: Eric W. Biederman Cc: Hollis Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/elfnote.h | 88 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 include/linux/elfnote.h (limited to 'include/linux') diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h new file mode 100644 index 000000000000..16f9f8ebffd9 --- /dev/null +++ b/include/linux/elfnote.h @@ -0,0 +1,88 @@ +#ifndef _LINUX_ELFNOTE_H +#define _LINUX_ELFNOTE_H +/* + * Helper macros to generate ELF Note structures, which are put into a + * PT_NOTE segment of the final vmlinux image. These are useful for + * including name-value pairs of metadata into the kernel binary (or + * modules?) for use by external programs. + * + * Each note has three parts: a name, a type and a desc. The name is + * intended to distinguish the note's originator, so it would be a + * company, project, subsystem, etc; it must be in a suitable form for + * use in a section name. The type is an integer which is used to tag + * the data, and is considered to be within the "name" namespace (so + * "FooCo"'s type 42 is distinct from "BarProj"'s type 42). The + * "desc" field is the actual data. There are no constraints on the + * desc field's contents, though typically they're fairly small. + * + * All notes from a given NAME are put into a section named + * .note.NAME. When the kernel image is finally linked, all the notes + * are packed into a single .notes section, which is mapped into the + * PT_NOTE segment. Because notes for a given name are grouped into + * the same section, they'll all be adjacent the output file. + * + * This file defines macros for both C and assembler use. Their + * syntax is slightly different, but they're semantically similar. + * + * See the ELF specification for more detail about ELF notes. + */ + +#ifdef __ASSEMBLER__ +/* + * Generate a structure with the same shape as Elf{32,64}_Nhdr (which + * turn out to be the same size and shape), followed by the name and + * desc data with appropriate padding. The 'desc' argument includes + * the assembler pseudo op defining the type of the data: .asciz + * "hello, world" + */ +.macro ELFNOTE name type desc:vararg +.pushsection ".note.\name" + .align 4 + .long 2f - 1f /* namesz */ + .long 4f - 3f /* descsz */ + .long \type +1:.asciz "\name" +2:.align 4 +3:\desc +4:.align 4 +.popsection +.endm +#else /* !__ASSEMBLER__ */ +#include +/* + * Use an anonymous structure which matches the shape of + * Elf{32,64}_Nhdr, but includes the name and desc data. The size and + * type of name and desc depend on the macro arguments. "name" must + * be a literal string, and "desc" must be passed by value. You may + * only define one note per line, since __LINE__ is used to generate + * unique symbols. + */ +#define _ELFNOTE_PASTE(a,b) a##b +#define _ELFNOTE(size, name, unique, type, desc) \ + static const struct { \ + struct elf##size##_note _nhdr; \ + unsigned char _name[sizeof(name)] \ + __attribute__((aligned(sizeof(Elf##size##_Word)))); \ + typeof(desc) _desc \ + __attribute__((aligned(sizeof(Elf##size##_Word)))); \ + } _ELFNOTE_PASTE(_note_, unique) \ + __attribute_used__ \ + __attribute__((section(".note." name), \ + aligned(sizeof(Elf##size##_Word)), \ + unused)) = { \ + { \ + sizeof(name), \ + sizeof(desc), \ + type, \ + }, \ + name, \ + desc \ + } +#define ELFNOTE(size, name, type, desc) \ + _ELFNOTE(size, name, __LINE__, type, desc) + +#define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc) +#define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc) +#endif /* __ASSEMBLER__ */ + +#endif /* _LINUX_ELFNOTE_H */ -- cgit v1.2.2 From 5091e746848f74c9a2c0579b4ef8d8cd1a6b135d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 25 Sep 2006 23:32:28 -0700 Subject: [PATCH] Translate asm version of ELFNOTE macro into preprocessor macro I've come across some problems with the assembly version of the ELFNOTE macro currently in -mm. (in x86-put-note-sections-into-a-pt_note-segment-in-vmlinux.patch) The first is that older gas does not support :varargs in .macro definitions (in my testing 2.17 does while 2.15 does not, I don't know when it became supported). The Changes file says binutils >= 2.12 so I think we need to avoid using it. There are no other uses in mainline or -mm. Old gas appears to just ignore it so you get "too many arguments" type errors. Secondly it seems that passing strings as arguments to assembler macros is broken without varargs. It looks like they get unquoted or each character is treated as a separate argument or something and this causes all manner of grief. I think this is because of the use of -traditional when compiling assembly files. Therefore I have translated the assembler macro into a pre-processor macro. I added the desctype as a separate argument instead of including it with the descdata as the previous version did since -traditional means the ELFNOTE definition after the #else needs to have the same number of arguments (I think so anyway, the -traditional CPP semantics are pretty fscking strange!). With this patch I am able to define elfnotes in assembly like this with both old and new assemblers. ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) Which seems reasonable enough. Signed-off-by: Ian Campbell Acked-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/elfnote.h | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 16f9f8ebffd9..67396db141e8 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -31,22 +31,24 @@ /* * Generate a structure with the same shape as Elf{32,64}_Nhdr (which * turn out to be the same size and shape), followed by the name and - * desc data with appropriate padding. The 'desc' argument includes - * the assembler pseudo op defining the type of the data: .asciz - * "hello, world" + * desc data with appropriate padding. The 'desctype' argument is the + * assembler pseudo op defining the type of the data e.g. .asciz while + * 'descdata' is the data itself e.g. "hello, world". + * + * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two") + * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef) */ -.macro ELFNOTE name type desc:vararg -.pushsection ".note.\name" - .align 4 - .long 2f - 1f /* namesz */ - .long 4f - 3f /* descsz */ - .long \type -1:.asciz "\name" -2:.align 4 -3:\desc -4:.align 4 -.popsection -.endm +#define ELFNOTE(name, type, desctype, descdata) \ +.pushsection .note.name ; \ + .align 4 ; \ + .long 2f - 1f /* namesz */ ; \ + .long 4f - 3f /* descsz */ ; \ + .long type ; \ +1:.asciz "name" ; \ +2:.align 4 ; \ +3:desctype descdata ; \ +4:.align 4 ; \ +.popsection ; #else /* !__ASSEMBLER__ */ #include /* -- cgit v1.2.2 From a3bc0dbc81d36fd38991b4373f6de8e1a507605a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Sep 2006 23:32:33 -0700 Subject: [PATCH] smp_call_function_single() cleanup If we're going to implement smp_call_function_single() on three architecture with the same prototype then it should have a declaration in a non-arch-specific header file. Move it into . Cc: Stephane Eranian Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 837e8bce1349..51649987f691 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -53,6 +53,9 @@ extern void smp_cpus_done(unsigned int max_cpus); */ int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); +int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, + int retry, int wait); + /* * Call a function on all processors */ -- cgit v1.2.2 From 930631edd4b1fe2781d9fe90edbe35d89dfc94cc Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 25 Sep 2006 23:32:40 -0700 Subject: [PATCH] add DIV_ROUND_UP() Add the DIV_ROUND_UP() helper macro: divide `n' by `d', rounding up. Stolen from the gfs2 tree(!) because the swsusp patches need it. Signed-off-by: Steven Whitehouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b2ae4fdce8b..e44a37e2c71c 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -33,6 +33,7 @@ extern const char linux_banner[]; #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define ALIGN(x,a) (((x)+(a)-1UL)&~((a)-1UL)) #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) #define KERN_EMERG "<0>" /* system is unusable */ -- cgit v1.2.2 From ab954160350c91c77ae03740ef90458c3ad5412c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Sep 2006 23:32:42 -0700 Subject: [PATCH] swsusp: write speedup Switch the swsusp writeout code from 4k-at-a-time to 4MB-at-a-time. Crufty old PIII testbox: 12.9 MB/s -> 20.9 MB/s Sony Vaio: 14.7 MB/s -> 26.5 MB/s The implementation is crude. A better one would use larger BIOs, but wouldn't gain any performance. The memcpys will be mostly pipelined with the IO and basically come for free. The ENOMEM path has not been tested. It should be. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index a2f5ad7c2d2e..3d434cbffe26 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -12,6 +12,8 @@ struct notifier_block; +struct bio; + #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 @@ -216,7 +218,8 @@ extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); /* linux/mm/page_io.c */ extern int swap_readpage(struct file *, struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); -extern int rw_swap_page_sync(int, swp_entry_t, struct page *); +extern int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, + struct bio **bio_chain); /* linux/mm/swap_state.c */ extern struct address_space swapper_space; -- cgit v1.2.2 From 546e0d271941dd1ff6961e2a1f7eac75f1fc277e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Sep 2006 23:32:44 -0700 Subject: [PATCH] swsusp: read speedup Implement async reads for swsusp resuming. Crufty old PIII testbox: 15.7 MB/s -> 20.3 MB/s Sony Vaio: 14.6 MB/s -> 33.3 MB/s I didn't implement the post-resume bio_set_pages_dirty(). I don't really understand why resume needs to run set_page_dirty() against these pages. It might be a worry that this code modifies PG_Uptodate, PG_Error and PG_Locked against the image pages. Can this possibly affect the resumed-into kernel? Hopefully not, if we're atomically restoring its mem_map? Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Jens Axboe Cc: Laurent Riffard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3d434cbffe26..e7c36ba2a2db 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -220,6 +220,7 @@ extern int swap_readpage(struct file *, struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, struct bio **bio_chain); +extern int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err); /* linux/mm/swap_state.c */ extern struct address_space swapper_space; -- cgit v1.2.2 From e3920fb42c8ddfe63befb54d95c0e13eabacea9b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:48 -0700 Subject: [PATCH] Disable CPU hotplug during suspend The current suspend code has to be run on one CPU, so we use the CPU hotplug to take the non-boot CPUs offline on SMP machines. However, we should also make sure that these CPUs will not be enabled by someone else after we have disabled them. The functions disable_nonboot_cpus() and enable_nonboot_cpus() are moved to kernel/cpu.c, because they now refer to some stuff in there that should better be static. Also it's better if disable_nonboot_cpus() returns an error instead of panicking if something goes wrong, and enable_nonboot_cpus() has no reason to panic(), because the CPUs may have been enabled by the userland before it tries to take them online. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 8 ++++++++ include/linux/suspend.h | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 8fb344a9abd8..3fef7d67aedc 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -89,4 +89,12 @@ int cpu_down(unsigned int cpu); static inline int cpu_is_offline(int cpu) { return 0; } #endif +#ifdef CONFIG_SUSPEND_SMP +extern int disable_nonboot_cpus(void); +extern void enable_nonboot_cpus(void); +#else +static inline int disable_nonboot_cpus(void) { return 0; } +static inline void enable_nonboot_cpus(void) {} +#endif + #endif /* _LINUX_CPU_H_ */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 96e31aa64cc7..6e8a06c950f4 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -57,14 +57,6 @@ static inline int software_suspend(void) } #endif /* CONFIG_PM */ -#ifdef CONFIG_SUSPEND_SMP -extern void disable_nonboot_cpus(void); -extern void enable_nonboot_cpus(void); -#else -static inline void disable_nonboot_cpus(void) {} -static inline void enable_nonboot_cpus(void) {} -#endif - void save_processor_state(void); void restore_processor_state(void); struct saved_context; -- cgit v1.2.2 From dcbb5a54f6e3984efa24772394f2225b11495c55 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:51 -0700 Subject: [PATCH] swsusp: clean up suspend header Remove some things that are no longer used or defined elsewhere from suspend.h and make the inline version of software_suspend() return the right error code. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/suspend.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 6e8a06c950f4..c11cacf1a13b 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -10,11 +10,11 @@ #include /* page backup entry */ -typedef struct pbe { +struct pbe { unsigned long address; /* address of the copy */ unsigned long orig_address; /* original address of page */ struct pbe *next; -} suspend_pagedir_t; +}; #define for_each_pbe(pbe, pblist) \ for (pbe = pblist ; pbe ; pbe = pbe->next) @@ -25,15 +25,6 @@ typedef struct pbe { #define for_each_pb_page(pbe, pblist) \ for (pbe = pblist ; pbe ; pbe = (pbe+PB_PAGE_SKIP)->next) - -#define SWAP_FILENAME_MAXLENGTH 32 - - -extern dev_t swsusp_resume_device; - -/* mm/vmscan.c */ -extern int shrink_mem(void); - /* mm/page_alloc.c */ extern void drain_local_pages(void); extern void mark_free_pages(struct zone *zone); @@ -53,7 +44,7 @@ static inline void pm_restore_console(void) {} static inline int software_suspend(void) { printk("Warning: fake suspend called\n"); - return -EPERM; + return -ENOSYS; } #endif /* CONFIG_PM */ -- cgit v1.2.2 From 940864ddabdb180e02041c4dcd46ba6f9eee732f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:55 -0700 Subject: [PATCH] swsusp: Use memory bitmaps during resume Make swsusp use memory bitmaps to store its internal information during the resume phase of the suspend-resume cycle. If the pfns of saveable pages are saved during the suspend phase instead of the kernel virtual addresses of these pages, we can use them during the resume phase directly to set the corresponding bits in a memory bitmap. Then, this bitmap is used to mark the page frames corresponding to the pages that were saveable before the suspend (aka "unsafe" page frames). Next, we allocate as many page frames as needed to store the entire suspend image and make sure that there will be some extra free "safe" page frames for the list of PBEs constructed later. Subsequently, the image is loaded and, if possible, the data loaded from it are written into their "original" page frames (ie. the ones they had occupied before the suspend). The image data that cannot be written into their "original" page frames are loaded into "safe" page frames and their "original" kernel virtual addresses, as well as the addresses of the "safe" pages containing their copies, are stored in a list of PBEs. Finally, the list of PBEs is used to copy the remaining image data into their "original" page frames (this is done atomically, by the architecture-dependent parts of swsusp). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/suspend.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index c11cacf1a13b..b1237f16ecde 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -16,15 +16,6 @@ struct pbe { struct pbe *next; }; -#define for_each_pbe(pbe, pblist) \ - for (pbe = pblist ; pbe ; pbe = pbe->next) - -#define PBES_PER_PAGE (PAGE_SIZE/sizeof(struct pbe)) -#define PB_PAGE_SKIP (PBES_PER_PAGE-1) - -#define for_each_pb_page(pbe, pblist) \ - for (pbe = pblist ; pbe ; pbe = (pbe+PB_PAGE_SKIP)->next) - /* mm/page_alloc.c */ extern void drain_local_pages(void); extern void mark_free_pages(struct zone *zone); -- cgit v1.2.2 From c8eb8b4025175f967af0ba8e933f23aa9954dc35 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:56 -0700 Subject: [PATCH] PM: make it possible to disable console suspending Change suspend_console() so that it waits for all consoles to flush the remaining messages and make it possible to switch the console suspending off with the help of a Kconfig option. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Stefan Seyfried Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 3bdf2155e565..76a1807726eb 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -120,9 +120,14 @@ extern void console_stop(struct console *); extern void console_start(struct console *); extern int is_console_locked(void); +#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); +#else +static inline void suspend_console(void) {} +static inline void resume_console(void) {} +#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */ /* Some debug stub to catch some of the obvious races in the VT code */ #if 1 -- cgit v1.2.2 From c5c6ba4e08ab9c9e390a0f3a7d9a5c332f5cc6ef Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:58 -0700 Subject: [PATCH] PM: Add pm_trace switch Add the pm_trace attribute in /sys/power which has to be explicitly set to one to really enable the "PM tracing" code compiled in when CONFIG_PM_TRACE is set (which modifies the machine's CMOS clock in unpredictable ways). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/resume-trace.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h index a376bd4ade39..81e9299ca148 100644 --- a/include/linux/resume-trace.h +++ b/include/linux/resume-trace.h @@ -3,21 +3,25 @@ #ifdef CONFIG_PM_TRACE +extern int pm_trace_enabled; + struct device; extern void set_trace_device(struct device *); extern void generate_resume_trace(void *tracedata, unsigned int user); #define TRACE_DEVICE(dev) set_trace_device(dev) -#define TRACE_RESUME(user) do { \ - void *tracedata; \ - asm volatile("movl $1f,%0\n" \ - ".section .tracedata,\"a\"\n" \ - "1:\t.word %c1\n" \ - "\t.long %c2\n" \ - ".previous" \ - :"=r" (tracedata) \ - : "i" (__LINE__), "i" (__FILE__)); \ - generate_resume_trace(tracedata, user); \ +#define TRACE_RESUME(user) do { \ + if (pm_trace_enabled) { \ + void *tracedata; \ + asm volatile("movl $1f,%0\n" \ + ".section .tracedata,\"a\"\n" \ + "1:\t.word %c1\n" \ + "\t.long %c2\n" \ + ".previous" \ + :"=r" (tracedata) \ + : "i" (__LINE__), "i" (__FILE__)); \ + generate_resume_trace(tracedata, user); \ + } \ } while (0) #else -- cgit v1.2.2