From 1edf223485c42c99655dcd001db1e46ad5e5d2d7 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Tue, 10 Jan 2012 15:06:57 -0800
Subject: mm/page-writeback.c: make determine_dirtyable_memory static again

The tracing ring-buffer used this function briefly, but not anymore.
Make it local to the writeback code again.

Also, move the function so that no forward declaration needs to be
reintroduced.

Signed-off-by: Johannes Weiner
Acked-by: Mel Gorman
Reviewed-by: Michal Hocko
Cc: Wu Fengguang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/writeback.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a378c295851f..34a005515fef 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -138,8 +138,6 @@ extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
 
-extern unsigned long determine_dirtyable_memory(void);
-
 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
--
cgit v1.2.2

From cc59850ef940e4ee6a765d28b439b9bafe07cf63 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Tue, 10 Jan 2012 15:07:04 -0800
Subject: mm: add free_hot_cold_page_list() helper

This patch adds the helper free_hot_cold_page_list() to free a list of
0-order pages.  It frees pages directly from the list, without a
temporary page-vector.  It also calls trace_mm_pagevec_free() to
simulate pagevec_free() behaviour.

bloat-o-meter:

add/remove: 1/1 grow/shrink: 1/3 up/down: 267/-295 (-28)
function                     old     new   delta
free_hot_cold_page_list        -     264    +264
get_page_from_freelist      2129    2132      +3
__pagevec_free               243     239      -4
split_free_page              380     373      -7
release_pages                606     510     -96
free_page_list               188       -    -188

Signed-off-by: Konstantin Khlebnikov
Cc: Mel Gorman
Cc: KOSAKI Motohiro
Acked-by: Minchan Kim
Acked-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3a76faf6a3ee..656295865d58 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -358,6 +358,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 extern void free_hot_cold_page(struct page *page, int cold);
+extern void free_hot_cold_page_list(struct list_head *list, int cold);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
--
cgit v1.2.2

From da066ad3570b88e7dee82e76a06ee9a7adffcf0d Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Tue, 10 Jan 2012 15:07:06 -0800
Subject: mm: remove unused pagevec_free

It is not exported and nobody uses it anymore.
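
For reference, the replacement pattern is the free_hot_cold_page_list()
helper introduced by the previous commit.  A sketch of a hypothetical
caller (illustrative only; the list name below is made up):

	LIST_HEAD(pages_to_free);

	/* ... detach 0-order pages onto pages_to_free ... */
	free_hot_cold_page_list(&pages_to_free, 1);	/* 1 == cold */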
Signed-off-by: Konstantin Khlebnikov
Cc: Mel Gorman
Cc: KOSAKI Motohiro
Reviewed-by: Minchan Kim
Acked-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/pagevec.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index bab82f4c571c..ed17024d2ebe 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -21,7 +21,6 @@ struct pagevec {
 };
 
 void __pagevec_release(struct pagevec *pvec);
-void __pagevec_free(struct pagevec *pvec);
 void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
 void pagevec_strip(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
@@ -67,12 +66,6 @@ static inline void pagevec_release(struct pagevec *pvec)
 	__pagevec_release(pvec);
 }
 
-static inline void pagevec_free(struct pagevec *pvec)
-{
-	if (pagevec_count(pvec))
-		__pagevec_free(pvec);
-}
-
 static inline void __pagevec_lru_add_anon(struct pagevec *pvec)
 {
 	____pagevec_lru_add(pvec, LRU_INACTIVE_ANON);
--
cgit v1.2.2

From b413d48aa70605701c0b395b2e350ca15f5d643a Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Tue, 10 Jan 2012 15:07:09 -0800
Subject: mm-tracepoint: rename page-free events

Rename mm_page_free_direct to mm_page_free and mm_pagevec_free to
mm_page_free_batched.

Since v2.6.33-5426-gc475dab the kernel triggers mm_page_free_direct for
all freed pages, not only for directly freed ones, so let's name it
properly.  For pages freed via page-list we also trigger the
mm_page_free_batched event.

Signed-off-by: Konstantin Khlebnikov
Cc: Mel Gorman
Cc: KOSAKI Motohiro
Reviewed-by: Minchan Kim
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/trace/events/kmem.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index a9c87ad8331c..5f889f16b0c8 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -147,7 +147,7 @@ DEFINE_EVENT(kmem_free, kmem_cache_free,
 	TP_ARGS(call_site, ptr)
 );
 
-TRACE_EVENT(mm_page_free_direct,
+TRACE_EVENT(mm_page_free,
 
 	TP_PROTO(struct page *page, unsigned int order),
 
@@ -169,7 +169,7 @@ TRACE_EVENT(mm_page_free_direct,
 		__entry->order)
 );
 
-TRACE_EVENT(mm_pagevec_free,
+TRACE_EVENT(mm_page_free_batched,
 
 	TP_PROTO(struct page *page, int cold),
 
--
cgit v1.2.2

From f90ac3982a78d36f894824636beeef13361d7c59 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Tue, 10 Jan 2012 15:07:15 -0800
Subject: mm: avoid livelock on !__GFP_FS allocations

Colin Cross reported:

  Under the following conditions, __alloc_pages_slowpath can loop
  forever:

    gfp_mask & __GFP_WAIT is true
    gfp_mask & __GFP_FS is false
    reclaim and compaction make no progress
    order <= PAGE_ALLOC_COSTLY_ORDER

  These conditions happen very often during suspend and resume, when
  pm_restrict_gfp_mask() effectively converts all GFP_KERNEL
  allocations into __GFP_WAIT.

  The oom killer is not run because gfp_mask & __GFP_FS is false, but
  should_alloc_retry will always return true when order is less than
  PAGE_ALLOC_COSTLY_ORDER.

In his fix, he avoided retrying the allocation if reclaim made no
progress and __GFP_FS was not set.  The problem is that this would
result in GFP_NOIO allocations failing that previously succeeded, which
would be very unfortunate.
The big difference between GFP_NOIO and suspend converting GFP_KERNEL
to behave like GFP_NOIO is that normally flushers will be cleaning
pages and kswapd reclaims pages, allowing GFP_NOIO to succeed after a
short delay.  The same does not necessarily apply during suspend as the
storage device may be suspended.

This patch special cases the suspend case to fail the page allocation
if reclaim cannot make progress, and adds some documentation on how
gfp_allowed_mask is currently used.  Failing allocations like this may
cause suspend to abort, but that is better than a livelock.

[mgorman@suse.de: Rework fix to be suspend specific]
[rientjes@google.com: Move suspended device check to should_alloc_retry]
Reported-by: Colin Cross
Signed-off-by: Mel Gorman
Acked-by: David Rientjes
Cc: Minchan Kim
Cc: Pekka Enberg
Cc: KAMEZAWA Hiroyuki
Cc: Andrea Arcangeli
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 656295865d58..91812df1351a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -368,9 +368,25 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(void);
 void drain_local_pages(void *dummy);
 
+/*
+ * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
+ * GFP flags are used before interrupts are enabled. Once interrupts are
+ * enabled, it is set to __GFP_BITS_MASK while the system is running. During
+ * hibernation, it is used by PM to avoid I/O during memory allocation while
+ * devices are suspended.
+ */
 extern gfp_t gfp_allowed_mask;
 
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+#ifdef CONFIG_PM_SLEEP
+extern bool pm_suspended_storage(void);
+#else
+static inline bool pm_suspended_storage(void)
+{
+	return false;
+}
+#endif /* CONFIG_PM_SLEEP */
+
 #endif /* __LINUX_GFP_H */
--
cgit v1.2.2

From 1399ff86f2a2bbacbbe68fa00c5f8c752b344723 Mon Sep 17 00:00:00 2001
From: David Daney
Date: Tue, 10 Jan 2012 15:07:25 -0800
Subject: kernel.h: add BUILD_BUG() macro

We can place this in definitions that we expect the compiler to remove
by dead code elimination.  If this assertion fails, we get a nice error
message at build time.

The GCC function attribute error("message") was added in version 4.3,
so we define a new macro __linktime_error(message) to expand to this
for GCC-4.3 and later.  This will give us an error diagnostic from the
compiler on the line that fails.  For other compilers
__linktime_error(message) expands to nothing, and we have to be content
with a link-time error, but at least we will still get a build error.

BUILD_BUG() expands to the undefined function __build_bug_failed() and
will fail at link time if the compiler ever emits code for it.  On
GCC-4.3 and later, attribute((error())) is used so that the failure
will be noted at compile time instead.
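
A typical use puts BUILD_BUG() in a branch that constant folding must
prove dead.  A hypothetical sketch (this macro is made up for
illustration and is not part of the patch):

	/* Zero a field whose size must be at most 8 bytes.  sizeof() is
	 * a compile-time constant, so the compiler either removes the
	 * BUILD_BUG() branch entirely or the build/link fails. */
	#define zero_field(ptr)					\
	do {							\
		if (sizeof(*(ptr)) > 8)				\
			BUILD_BUG();				\
		memset((ptr), 0, sizeof(*(ptr)));		\
	} while (0)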
Signed-off-by: David Daney
Acked-by: David Rientjes
Cc: DM
Cc: Ralf Baechle
Acked-by: David Howells
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/compiler-gcc4.h |  1 +
 include/linux/compiler.h      |  4 +++-
 include/linux/kernel.h        | 16 ++++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index dfadc96e9d63..2f4079175afb 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -29,6 +29,7 @@
    the kernel context */
 #define __cold			__attribute__((__cold__))
 
+#define __linktime_error(message) __attribute__((__error__(message)))
 
 #if __GNUC_MINOR__ >= 5
 /*

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 320d6c94ff84..4a243546d142 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -293,7 +293,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 #ifndef __compiletime_error
 # define __compiletime_error(message)
 #endif
-
+#ifndef __linktime_error
+# define __linktime_error(message)
+#endif
 /*
  * Prevent the compiler from merging or refetching accesses.  The compiler
  * is also forbidden from reordering successive instances of ACCESS_ONCE(),

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e8b1597b5cf2..f48e8a528544 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -665,6 +665,7 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #define BUILD_BUG_ON_ZERO(e) (0)
 #define BUILD_BUG_ON_NULL(e) ((void*)0)
 #define BUILD_BUG_ON(condition)
+#define BUILD_BUG() (0)
 #else /* __CHECKER__ */
 
 /* Force a compilation error if a constant expression is not a power of 2 */
@@ -703,6 +704,21 @@ extern int __build_bug_on_failed;
 	if (condition) __build_bug_on_failed = 1;	\
 } while(0)
 #endif
+
+/**
+ * BUILD_BUG - break compile if used.
+ *
+ * If you have some code that you expect the compiler to eliminate at
+ * build time, you should use BUILD_BUG to detect if it is
+ * unexpectedly used.
+ */
+#define BUILD_BUG()						\
+	do {							\
+		extern void __build_bug_failed(void)		\
+			__linktime_error("BUILD_BUG failed");	\
+		__build_bug_failed();				\
+	} while (0)
+
 #endif /* __CHECKER__ */
 
 /* Trap pasters of __FUNCTION__ at compile-time */
--
cgit v1.2.2

From c0a32fc5a2e470d0b02597b23ad79a317735253e Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 10 Jan 2012 15:07:28 -0800
Subject: mm: more intensive memory corruption debugging

With CONFIG_DEBUG_PAGEALLOC configured, the CPU will generate an
exception on access (read, write) to an unallocated page, which permits
us to catch code which corrupts memory.  However the kernel is trying
to maximise memory usage, hence there are usually few free pages in the
system and buggy code usually corrupts some crucial data.

This patch changes the buddy allocator to keep more free/protected
pages and to interlace free/protected and allocated pages to increase
the probability of catching corruption.

When the kernel is compiled with CONFIG_DEBUG_PAGEALLOC,
debug_guardpage_minorder defines the minimum order used by the page
allocator to grant a request.  The requested size will be returned with
the remaining pages used as guard pages.

The default value of debug_guardpage_minorder is zero: no change from
current behaviour.

[akpm@linux-foundation.org: tweak documentation, s/flg/flag/]
Signed-off-by: Stanislaw Gruszka
Cc: Mel Gorman
Cc: Andrea Arcangeli
Cc: "Rafael J. Wysocki"
Wysocki" Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 17 +++++++++++++++++ include/linux/page-debug-flags.h | 4 +++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d9b4c9813bd..5568553a41fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1618,5 +1618,22 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +#ifdef CONFIG_DEBUG_PAGEALLOC +extern unsigned int _debug_guardpage_minorder; + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool page_is_guard(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool page_is_guard(struct page *page) { return false; } +#endif /* CONFIG_DEBUG_PAGEALLOC */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h index b0638fd91e92..22691f614043 100644 --- a/include/linux/page-debug-flags.h +++ b/include/linux/page-debug-flags.h @@ -13,6 +13,7 @@ enum page_debug_flags { PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ + PAGE_DEBUG_FLAG_GUARD, }; /* @@ -21,7 +22,8 @@ enum page_debug_flags { */ #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS -#if !defined(CONFIG_PAGE_POISONING) \ +#if !defined(CONFIG_PAGE_POISONING) && \ + !defined(CONFIG_PAGE_GUARD) \ /* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ #error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! #endif -- cgit v1.2.2 From f6d7e0cb3ecc248e98fa11d83253f6174bd7e085 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 10 Jan 2012 15:07:38 -0800 Subject: mm, debug: test for online nid when allocating on single node Calling alloc_pages_exact_node() means the allocation only passes the zonelist of a single node into the page allocator. If that node isn't online, it's zonelist may never have been initialized causing a strange oops that may not immediately be clear. I recently debugged an issue where node 0 wasn't online and an allocator was passing 0 to alloc_pages_exact_node() and it resulted in a NULL pointer on zonelist->_zoneref. If CONFIG_DEBUG_VM is enabled, though, it would be nice to catch this a bit earlier. 
Signed-off-by: David Rientjes
Acked-by: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 91812df1351a..66f172fdf5fe 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -313,7 +313,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
-	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
 
 	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
 }
--
cgit v1.2.2

From ab8fabd46f811d5153d8a0cd2fac9a0d41fb593d Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Tue, 10 Jan 2012 15:07:42 -0800
Subject: mm: exclude reserved pages from dirtyable memory

Per-zone dirty limits try to distribute page cache pages allocated for
writing across zones in proportion to the individual zone sizes, to
reduce the likelihood of reclaim having to write back individual pages
from the LRU lists in order to make progress.

This patch:

The amount of dirtyable pages should not include the full number of
free pages: there is a number of reserved pages that the page allocator
and kswapd always try to keep free.

The closer (reclaimable pages - dirty pages) is to the number of
reserved pages, the more likely it becomes for reclaim to run into
dirty pages:

	+----------+ ---
	|   anon   |  |
	+----------+  |
	|          |  |
	|          |  -- dirty limit new    -- flusher new
	|   file   |  |                     |
	|          |  |                     |
	|          |  -- dirty limit old    -- flusher old
	|          |  |
	+----------+ --- reclaim
	| reserved |
	+----------+
	|  kernel  |
	+----------+

This patch introduces a per-zone dirty reserve that takes both the
lowmem reserve as well as the high watermark of the zone into account,
and a global sum of those per-zone values that is subtracted from the
global amount of dirtyable pages.  The lowmem reserve is unavailable to
page cache allocations and kswapd tries to keep the high watermark
free.  We don't want to end up in a situation where reclaim has to
clean pages in order to balance zones.

Not treating reserved pages as dirtyable on a global level is only a
conceptual fix.  In reality, dirty pages are not distributed equally
across zones and reclaim runs into dirty pages on a regular basis.  But
it is important to get this right before tackling the problem on a
per-zone level, where the distance between reclaim and the dirty pages
is mostly much smaller in absolute numbers.

[akpm@linux-foundation.org: fix highmem build]
Signed-off-by: Johannes Weiner
Reviewed-by: Rik van Riel
Reviewed-by: Michal Hocko
Reviewed-by: Minchan Kim
Acked-by: Mel Gorman
Cc: KAMEZAWA Hiroyuki
Cc: Christoph Hellwig
Cc: Wu Fengguang
Cc: Dave Chinner
Cc: Jan Kara
Cc: Shaohua Li
Cc: Chris Mason
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 6 ++++++
 include/linux/swap.h   | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3ac040f19369..ca6ca92418a6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -317,6 +317,12 @@ struct zone {
 	 */
 	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
+	/*
+	 * This is a per-zone reserve of pages that should not be
+	 * considered dirtyable memory.
+	 */
+	unsigned long		dirty_balance_reserve;
+
 #ifdef CONFIG_NUMA
 	int node;
 	/*

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1e22e126d2ac..06061a7f8e69 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -207,6 +207,7 @@ struct swap_list_t {
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
+extern unsigned long dirty_balance_reserve;
 extern unsigned int nr_free_buffer_pages(void);
 extern unsigned int nr_free_pagecache_pages(void);
--
cgit v1.2.2

From a756cf5908530e8b40bdf569eb48b40139e8d7fd Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Tue, 10 Jan 2012 15:07:49 -0800
Subject: mm: try to distribute dirty pages fairly across zones

The maximum number of dirty pages that exist in the system at any time
is determined by a number of pages considered dirtyable and a
user-configured percentage of those, or an absolute number in bytes.

This number of dirtyable pages is the sum of memory provided by all the
zones in the system minus their lowmem reserves and high watermarks, so
that the system can retain a healthy number of free pages without
having to reclaim dirty pages.

But there is a flaw in that we have a zoned page allocator which does
not care about the global state but rather the state of individual
memory zones.  And right now there is nothing that prevents one zone
from filling up with dirty pages while other zones are spared, which
frequently leads to situations where kswapd, in order to restore the
watermark of free pages, does indeed have to write pages from that
zone's LRU list.  This can interfere so badly with IO from the flusher
threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
requests from reclaim already, taking away the VM's only possibility to
keep such a zone balanced, aside from hoping the flushers will soon
clean pages from that zone.

Enter per-zone dirty limits.  They are to a zone's dirtyable memory
what the global limit is to the global amount of dirtyable memory, and
try to make sure that no single zone receives more than its fair share
of the globally allowed dirty pages in the first place.  As the number
of pages considered dirtyable excludes the zones' lowmem reserves and
high watermarks, the maximum number of dirty pages in a zone is such
that the zone can always be balanced without requiring page cleaning.

As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to pass
__GFP_WRITE when they know in advance that the page will be written to
and become dirty soon.  The page allocator will then attempt to
allocate from the first zone of the zonelist - which on NUMA is
determined by the task's NUMA memory policy - that has not exceeded its
dirty limit.

At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case.  With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations.  Workloads
that previously could fit their dirty pages completely in the higher
zone may be forced to allocate from lower zones, but the amount of
pages that "spill over" are limited themselves by the lower zones'
dirty constraints, and thus unlikely to become a problem.
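
To make the allocation-time hint concrete: a write path that knows the
page will be dirtied soon could pass the new flag as below (hypothetical
call site, shown for illustration only):

	/* Ask for a zone that is still below its dirty limit, since we
	 * are about to dirty this page cache page. */
	page = __page_cache_alloc(mapping_gfp_mask(mapping) | __GFP_WRITE);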
For now, the problem of unfair dirty page distribution remains for NUMA
configurations where the zones allowed for allocation are in sum not
big enough to trigger the global dirty limits, wake up the flusher
threads and remedy the situation.  Because of this, an allocation that
could not succeed on any of the considered zones is allowed to ignore
the dirty limits before going into direct reclaim or even failing the
allocation, until a future patch changes the global dirty throttling
and flusher thread activation so that they take individual zone states
into account.

Test results:

15M DMA + 3246M DMA32 + 504M Normal = 3765M memory
40% dirty ratio
16G USB thumb drive
10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15))

		seconds  nr_vmscan_write
		(stddev)	        min|     median|        max
xfs
vanilla:  549.747( 3.492)	  0.000|      0.000|      0.000
patched:  550.996( 3.802)	  0.000|      0.000|      0.000

fuse-ntfs
vanilla: 1183.094(53.178)     54349.000|  59341.000|  65163.000
patched:  558.049(17.914)	  0.000|      0.000|     43.000

btrfs
vanilla:  573.679(14.015)    156657.000| 460178.000| 606926.000
patched:  563.365(11.368)	  0.000|      0.000|   1362.000

ext4
vanilla:  561.197(15.782)	  0.000|2725438.000|4143837.000
patched:  568.806(17.496)	  0.000|      0.000|      0.000

Signed-off-by: Johannes Weiner
Reviewed-by: Minchan Kim
Acked-by: Mel Gorman
Reviewed-by: Michal Hocko
Tested-by: Wu Fengguang
Cc: KAMEZAWA Hiroyuki
Cc: Christoph Hellwig
Cc: Dave Chinner
Cc: Jan Kara
Cc: Shaohua Li
Cc: Rik van Riel
Cc: Chris Mason
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h       | 4 +++-
 include/linux/writeback.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 66f172fdf5fe..581e74b7df95 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,6 +36,7 @@ struct vm_area_struct;
 #endif
 #define ___GFP_NO_KSWAPD	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
+#define ___GFP_WRITE		0x1000000u
 
 /*
  * GFP bitmasks..
@@ -85,6 +86,7 @@ struct vm_area_struct;
 #define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
+#define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
@@ -92,7 +94,7 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 24	/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25	/* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 34a005515fef..6dff47304971 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -124,6 +124,7 @@ void laptop_mode_timer_fn(unsigned long data);
 static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
+bool zone_dirty_ok(struct zone *zone);
 
 extern unsigned long global_dirty_limit;
--
cgit v1.2.2

From 948f017b093a9baac23855fcd920d3a970b71bb6 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Tue, 10 Jan 2012 15:08:05 -0800
Subject: mremap: enforce rmap src/dst vma ordering in case of vma_merge()
 succeeding in copy_vma()

migrate was doing an rmap_walk with speculative lock-less access on
pagetables.  That could lead to it not serializing properly against
mremap PT locks.
But a second problem remains in the order of vmas in the same_anon_vma
list used by the rmap_walk.

If vma_merge succeeds in copy_vma, the src vma could be placed after
the dst vma in the same_anon_vma list.  That could still lead to
migrate missing some ptes.

This patch adds an anon_vma_moveto_tail() function to force the dst vma
to the end of the list before mremap starts, to solve the problem.

If the mremap is very large and there are a lot of parents or children
sharing the anon_vma root lock, this should still scale better than
taking the anon_vma root lock around every pte copy practically for the
whole duration of mremap.

Update: Hugh noticed special care is needed in the error path where
move_page_tables goes in the reverse direction; a second
anon_vma_moveto_tail() call is needed in the error path.

This program exercises the anon_vma_moveto_tail.  (The original posting
did not define SIZE or list the headers; the ones added below are
assumptions - any SIZE that is a multiple of the hugepage size works.)

===
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/time.h>

#define SIZE (8*1024*1024)	/* assumed; not in the original program */

int main()
{
	static struct timeval oldstamp, newstamp;
	long diffsec;
	char *p, *p2, *p3, *p4;

	if (posix_memalign((void **)&p, 2*1024*1024, SIZE))
		perror("memalign"), exit(1);
	if (posix_memalign((void **)&p2, 2*1024*1024, SIZE))
		perror("memalign"), exit(1);
	if (posix_memalign((void **)&p3, 2*1024*1024, SIZE))
		perror("memalign"), exit(1);

	memset(p, 0xff, SIZE);
	printf("%p\n", p);
	memset(p2, 0xff, SIZE);
	memset(p3, 0x77, 4096);

	if (memcmp(p, p2, SIZE))
		printf("error\n");

	p4 = mremap(p+SIZE/2, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p3);
	if (p4 != p3)
		perror("mremap"), exit(1);

	p4 = mremap(p4, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p+SIZE/2);
	if (p4 != p+SIZE/2)
		perror("mremap"), exit(1);

	if (memcmp(p, p2, SIZE))
		printf("error\n");

	printf("ok\n");
	return 0;
}
===

$ perf probe -a anon_vma_moveto_tail
Add new event:
  probe:anon_vma_moveto_tail (on anon_vma_moveto_tail)

You can now use it on all perf tools, such as:

	perf record -e probe:anon_vma_moveto_tail -aR sleep 1

$ perf record -e probe:anon_vma_moveto_tail -aR ./anon_vma_moveto_tail
0x7f2ca2800000
ok
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.043 MB perf.data (~1860 samples) ]
$ perf report --stdio
   100.00%  anon_vma_moveto  [kernel.kallsyms]  [k] anon_vma_moveto_tail

Signed-off-by: Andrea Arcangeli
Reported-by: Nai Xia
Acked-by: Mel Gorman
Cc: Hugh Dickins
Cc: Pawel Sikora
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/rmap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 2148b122779b..1afb9954bbf1 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -120,6 +120,7 @@ void anon_vma_init(void);	/* create anon_vma_cachep */
 int  anon_vma_prepare(struct vm_area_struct *);
 void unlink_anon_vmas(struct vm_area_struct *);
 int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
+void anon_vma_moveto_tail(struct vm_area_struct *);
 int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
 void __anon_vma_link(struct vm_area_struct *);
--
cgit v1.2.2

From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Tue, 10 Jan 2012 15:08:09 -0800
Subject: tracepoint: add tracepoints for debugging oom_score_adj

oom_score_adj is used for guarding processes from the OOM killer.  One
problem is that it's inherited at fork().  When a daemon sets
oom_score_adj and forks children, it's hard to know where the value was
set.

To help debugging, this patch adds three tracepoints:
 - creating a new task
 - renaming a task (exec)
 - setting oom_score_adj

To debug, users need to enable the tracepoints.  Filtering may be
useful, as in:

  # EVENT=/sys/kernel/debug/tracing/events/task/
  # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter
  # echo "oom_score_adj != 0" > $EVENT/task_rename/filter
  # echo 1 > $EVENT/enable
  # EVENT=/sys/kernel/debug/tracing/events/oom/
  # echo 1 > $EVENT/enable

The output will look like this:

  # grep oom /sys/kernel/debug/tracing/trace
  bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000
  bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000
  ls-7729   [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000
  bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000
  grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000

Signed-off-by: KAMEZAWA Hiroyuki
Cc: KOSAKI Motohiro
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/trace/events/oom.h  | 33 ++++++++++++++++++++++++
 include/trace/events/task.h | 61 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 include/trace/events/oom.h
 create mode 100644 include/trace/events/task.h

(limited to 'include')

diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
new file mode 100644
index 000000000000..dd4ba3b92002
--- /dev/null
+++ b/include/trace/events/oom.h
@@ -0,0 +1,33 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM oom
+
+#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OOM_H
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(oom_score_adj_update,
+
+	TP_PROTO(struct task_struct *task),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__field(	pid_t,	pid)
+		__array(	char,	comm,	TASK_COMM_LEN )
+		__field(	int,	oom_score_adj)
+	),
+
+	TP_fast_assign(
+		__entry->pid = task->pid;
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->oom_score_adj = task->signal->oom_score_adj;
+	),
+
+	TP_printk("pid=%d comm=%s oom_score_adj=%d",
+		__entry->pid, __entry->comm, __entry->oom_score_adj)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

diff --git a/include/trace/events/task.h b/include/trace/events/task.h
new file mode 100644
index 000000000000..b53add02e929
--- /dev/null
+++ b/include/trace/events/task.h
@@ -0,0 +1,61 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM task
+
+#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TASK_H
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(task_newtask,
+
+	TP_PROTO(struct task_struct *task, unsigned long clone_flags),
+
+	TP_ARGS(task, clone_flags),
+
+	TP_STRUCT__entry(
+		__field(	pid_t,	pid)
+		__array(	char,	comm, TASK_COMM_LEN)
+		__field( unsigned long, clone_flags)
+		__field(	int,	oom_score_adj)
+	),
+
+	TP_fast_assign(
+		__entry->pid = task->pid;
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->clone_flags = clone_flags;
+		__entry->oom_score_adj = task->signal->oom_score_adj;
+	),
+
+	TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d",
+		__entry->pid, __entry->comm,
+		__entry->clone_flags, __entry->oom_score_adj)
+);
+
+TRACE_EVENT(task_rename,
+
+	TP_PROTO(struct task_struct *task, char *comm),
+
+	TP_ARGS(task, comm),
+
+	TP_STRUCT__entry(
+		__field(	pid_t,	pid)
+		__array(	char, oldcomm, TASK_COMM_LEN)
+		__array(	char, newcomm, TASK_COMM_LEN)
+		__field(	int, oom_score_adj)
+	),
+
+	TP_fast_assign(
+		__entry->pid = task->pid;
+		memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN);
+		memcpy(entry->newcomm, comm, TASK_COMM_LEN);
+		__entry->oom_score_adj = task->signal->oom_score_adj;
+	),
+
+	TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d",
+		__entry->pid, __entry->oldcomm,
+		__entry->newcomm, __entry->oom_score_adj)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--
cgit v1.2.2

From fcfb4dcc9698f932836aa63ba0d82e7dbd300fb3 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Tue, 10 Jan 2012 15:08:21 -0800
Subject: mm/mempolicy.c: mpol_equal(): use bool

mpol_equal() logically returns a boolean.  Use a bool type to slightly
improve readability.

Signed-off-by: KOSAKI Motohiro
Cc: Stephen Wilson
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mempolicy.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 7978eec1b7d9..7c727a90d70d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -164,11 +164,11 @@ static inline void mpol_get(struct mempolicy *pol)
 	atomic_inc(&pol->refcnt);
 }
 
-extern int __mpol_equal(struct mempolicy *a, struct mempolicy *b);
-static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
+extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
+static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
 	if (a == b)
-		return 1;
+		return true;
 	return __mpol_equal(a, b);
 }
 
@@ -257,9 +257,9 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 
 struct mempolicy {};
 
-static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
+static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
-	return 1;
+	return true;
 }
 
 static inline void mpol_put(struct mempolicy *p)
--
cgit v1.2.2

From a6d511e5155406cd214d3af3ff9cffc69548b006 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 10 Jan 2012 15:09:40 -0800
Subject: leds: add driver for TCA6507 LED controller

TI's TCA6507 is the LED driver in the GTA04 Openmoko motherboard.  The
driver provides full support for brightness levels and hardware
blinking.

This driver can drive each of 7 outputs as an LED or a GPIO output, and
provides hardware-assisted blinking.

[akpm@linux-foundation.org: fix __mod_i2c_device_table alias]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: NeilBrown
Cc: Richard Purdie
Cc: Randy Dunlap
Cc: Dan Carpenter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/leds-tca6507.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 include/linux/leds-tca6507.h

(limited to 'include')

diff --git a/include/linux/leds-tca6507.h b/include/linux/leds-tca6507.h
new file mode 100644
index 000000000000..dcabf4fa2aef
--- /dev/null
+++ b/include/linux/leds-tca6507.h
@@ -0,0 +1,34 @@
+/*
+ * TCA6507 LED chip driver.
+ *
+ * Copyright (C) 2011 Neil Brown
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */

+#ifndef __LINUX_TCA6507_H
+#define __LINUX_TCA6507_H
+#include <linux/leds.h>
+
+struct tca6507_platform_data {
+	struct led_platform_data leds;
+#ifdef CONFIG_GPIOLIB
+	int gpio_base;
+	void (*setup)(unsigned gpio_base, unsigned ngpio);
+#endif
+};
+
+#define TCA6507_MAKE_GPIO 1
+#endif /* __LINUX_TCA6507_H*/
--
cgit v1.2.2

From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Tue, 10 Jan 2012 15:11:17 -0800
Subject: signal: add block_sigmask() for adding sigmask to current->blocked

Abstract the code sequence for adding a signal handler's sa_mask to
current->blocked because the sequence is identical for all
architectures.  Furthermore, in the past some architectures actually
got this code wrong, so introduce a wrapper that all architectures can
use.

Signed-off-by: Matt Fleming
Signed-off-by: Oleg Nesterov
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: H. Peter Anvin
Cc: Tejun Heo
Cc: "David S. Miller"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/signal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index a822300a253b..7987ce74874b 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -254,6 +254,7 @@ extern void set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
 extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 				 struct pt_regs *regs, void *cookie);
+extern void block_sigmask(struct k_sigaction *ka, int signr);
 extern void exit_signals(struct task_struct *tsk);
 
 extern struct kmem_cache *sighand_cachep;
--
cgit v1.2.2

From 7773fbc54182a90cd248656619c7d33859e5f91d Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Tue, 10 Jan 2012 15:11:20 -0800
Subject: procfs: make proc_get_link to use dentry instead of inode

Prepare the ground for the next "map_files" patch, which needs the name
of a link file to analyse.

Signed-off-by: Cyrill Gorcunov
Cc: Pavel Emelyanov
Cc: Tejun Heo
Cc: Vasiliy Kulikov
Cc: "Kirill A. Shutemov"
Cc: Alexey Dobriyan
Cc: Al Viro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/proc_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 6d9e575519cc..85c507306239 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations utsns_operations;
 extern const struct proc_ns_operations ipcns_operations;
 
 union proc_op {
-	int (*proc_get_link)(struct inode *, struct path *);
+	int (*proc_get_link)(struct dentry *, struct path *);
 	int (*proc_read)(struct task_struct *task, char *page);
 	int (*proc_show)(struct seq_file *m,
		struct pid_namespace *ns, struct pid *pid,
--
cgit v1.2.2

From 640708a2cff7f81e246243b0073c66e6ece7e53e Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Tue, 10 Jan 2012 15:11:23 -0800
Subject: procfs: introduce the /proc/<pid>/map_files/ directory

This one behaves similarly to the /proc/<pid>/fd/ one - it contains
symlinks, one for each mapping backed by a file; the name of a symlink
is "vma->vm_start-vma->vm_end", and the target is the file.  Opening a
symlink results in a file that points exactly to the same inode as the
vma's one.
For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* the checkpointing process in three ways:

1. When dumping a task's mappings we know the exact file that is mapped
   by a particular region.  We do this by opening the
   /proc/$pid/map_files/$address symlink the way we do with file
   descriptors.

2. This also helps in determining which anonymous shared mappings are
   shared with each other, by comparing their inodes.

3. When restoring a set of processes in which two of them share a
   mapping, we map the memory in the first one and then open its
   /proc/$pid/map_files/$address file and map it in the second task.

Using /proc/$pid/maps for this is quite inconvenient since it requires
repeated re-reading and re-parsing of this text file, which slows down
the restore procedure significantly.  Also, as pointed out in (3), it
is way easier to use the top-level shared mapping in children as
/proc/$pid/map_files/$address when needed.

[akpm@linux-foundation.org: coding-style fixes]
[gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE]
Signed-off-by: Pavel Emelyanov
Signed-off-by: Cyrill Gorcunov
Reviewed-by: Vasiliy Kulikov
Reviewed-by: "Kirill A. Shutemov"
Cc: Tejun Heo
Cc: Alexey Dobriyan
Cc: Al Viro
Cc: Pavel Machek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5568553a41fd..6eba2cc016c9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1482,6 +1482,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
+static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
+				unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else
--
cgit v1.2.2

From 0499680a42141d86417a8fbaa8c8db806bea1201 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov
Date: Tue, 10 Jan 2012 15:11:31 -0800
Subject: procfs: add hidepid= and gid= mount options

Add support for mount options to restrict access to /proc/PID/
directories.  The default backward-compatible "relaxed" behaviour is
left untouched.

The first mount option is called "hidepid" and its value defines how
much info about processes we want to be available to non-owners:

hidepid=0 (default) means the old behavior - anybody may read all
world-readable /proc/PID/* files.

hidepid=1 means users may not access any /proc/<pid>/ directories but
their own.  Sensitive files like cmdline, sched*, status are now
protected against other users.  As permission checking is done in
proc_pid_permission() and files' permissions are left untouched,
programs expecting specific file modes are not confused.

hidepid=2 means hidepid=1 plus all /proc/PID/ will be invisible to
other users.
It doesn't mean that it hides whether a process exists (that can be
learned by other means, e.g. by "kill -0 $PID"), but it hides the
process's euid and egid.  It complicates an intruder's task of
gathering info about running processes: whether some daemon runs with
elevated privileges, whether another user runs some sensitive program,
whether other users run any program at all, etc.

gid=XXX defines a group that will be able to gather all processes' info
(as in hidepid=0 mode).  This group should be used instead of putting a
nonroot user in the sudoers file or something similar.  However,
untrusted users (like daemons, etc.) which are not supposed to monitor
the tasks in the whole system should not be added to the group.

hidepid=1 or higher is designed to restrict access to procfs files
which might reveal some sensitive private information like precise
keystroke timings:

http://www.openwall.com/lists/oss-security/2011/11/05/3

hidepid=1/2 doesn't break userspace monitoring tools.  ps, top, pgrep,
and conky gracefully handle EPERM/ENOENT and behave as if the current
user is the only user running processes.  pstree shows the process
subtree which contains the "pstree" process.

Note: the patch doesn't deal with setuid/setgid issues of keeping
preopened descriptors of procfs files (like
https://lkml.org/lkml/2011/2/7/368).  We rely on the fact that the
leaked information, like the scheduling counters of setuid apps,
doesn't threaten anybody's privacy - only the user who started the
setuid program may read the counters.

Signed-off-by: Vasiliy Kulikov
Cc: Alexey Dobriyan
Cc: Al Viro
Cc: Randy Dunlap
Cc: "H. Peter Anvin"
Cc: Greg KH
Cc: Theodore Tso
Cc: Alan Cox
Cc: James Morris
Cc: Oleg Nesterov
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/pid_namespace.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d10326246a..e7cf6669ac34 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -30,6 +30,8 @@ struct pid_namespace {
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;
 #endif
+	gid_t pid_gid;
+	int hide_pid;
 };
 
 extern struct pid_namespace init_pid_ns;
--
cgit v1.2.2

From b196be89cdc14a88cc637cdad845a75c5886c82d Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 10 Jan 2012 15:11:35 -0800
Subject: workqueue: make alloc_workqueue() take printf fmt and args for name

alloc_workqueue() currently expects the passed-in @name pointer to
remain accessible.  This is inconvenient and a bit silly given that the
whole wq is being dynamically allocated.  This patch updates
alloc_workqueue() and friends to take a printf format string instead of
an opaque string, with matching varargs at the end.  The name is
allocated together with the wq and formatted.

alloc_ordered_workqueue() is converted to a macro to unify varargs
handling with alloc_workqueue(), and, while at it, a comment is added
to alloc_workqueue().

None of the current in-kernel users pass in a string with '%' as a
constant name, so this change shouldn't cause any problem.
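
To illustrate the new calling convention (hypothetical driver code; the
names below are made up):

	/* Before: the caller had to keep the formatted name alive itself. */
	snprintf(dev->wq_name, sizeof(dev->wq_name), "mydev/%d", dev->id);
	wq = alloc_workqueue(dev->wq_name, WQ_MEM_RECLAIM, 1);

	/* After: the name is formatted and stored with the workqueue. */
	wq = alloc_workqueue("mydev/%d", WQ_MEM_RECLAIM, 1, dev->id);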
[akpm@linux-foundation.org: use __printf]
Signed-off-by: Tejun Heo
Suggested-by: Christoph Hellwig
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/workqueue.h | 47 +++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0d556deb497b..eb8b9f15f2e0 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -297,32 +297,50 @@ extern struct workqueue_struct *system_unbound_wq;
 extern struct workqueue_struct *system_freezable_wq;
 
 extern struct workqueue_struct *
-__alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
-		      struct lock_class_key *key, const char *lock_name);
+__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
+	struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6);
 
+/**
+ * alloc_workqueue - allocate a workqueue
+ * @fmt: printf format for the name of the workqueue
+ * @flags: WQ_* flags
+ * @max_active: max in-flight work items, 0 for default
+ * @args: args for @fmt
+ *
+ * Allocate a workqueue with the specified parameters.  For detailed
+ * information on WQ_* flags, please refer to Documentation/workqueue.txt.
+ *
+ * The __lock_name macro dance is to guarantee that a single lock_class_key
+ * doesn't end up with different names, which isn't allowed by lockdep.
+ *
+ * RETURNS:
+ * Pointer to the allocated workqueue on success, %NULL on failure.
+ */
 #ifdef CONFIG_LOCKDEP
-#define alloc_workqueue(name, flags, max_active)		\
+#define alloc_workqueue(fmt, flags, max_active, args...)	\
 ({								\
 	static struct lock_class_key __key;			\
 	const char *__lock_name;				\
 								\
-	if (__builtin_constant_p(name))				\
-		__lock_name = (name);				\
+	if (__builtin_constant_p(fmt))				\
+		__lock_name = (fmt);				\
 	else							\
-		__lock_name = #name;				\
+		__lock_name = #fmt;				\
 								\
-	__alloc_workqueue_key((name), (flags), (max_active),	\
-			      &__key, __lock_name);		\
+	__alloc_workqueue_key((fmt), (flags), (max_active),	\
+			      &__key, __lock_name, ##args);	\
 })
 #else
-#define alloc_workqueue(name, flags, max_active)		\
-	__alloc_workqueue_key((name), (flags), (max_active), NULL, NULL)
+#define alloc_workqueue(fmt, flags, max_active, args...)	\
+	__alloc_workqueue_key((fmt), (flags), (max_active),	\
+			      NULL, NULL, ##args)
 #endif
 
 /**
  * alloc_ordered_workqueue - allocate an ordered workqueue
- * @name: name of the workqueue
+ * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
+ * @args: args for @fmt
  *
  * Allocate an ordered workqueue.  An ordered workqueue executes at
  * most one work item at any given time in the queued order.  They are
@@ -331,11 +349,8 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
  * RETURNS:
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
-static inline struct workqueue_struct *
-alloc_ordered_workqueue(const char *name, unsigned int flags)
-{
-	return alloc_workqueue(name, WQ_UNBOUND | flags, 1);
-}
+#define alloc_ordered_workqueue(fmt, flags, args...)		\
+	alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args)
 
 #define create_workqueue(name)					\
 	alloc_workqueue((name), WQ_MEM_RECLAIM, 1)
--
cgit v1.2.2