From 80a05b9ffa7dc13f6693902dd8999a2b61a3a0d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Mar 2010 17:34:14 +0100 Subject: clockevents: Sanitize min_delta_ns adjustment and prevent overflows The current logic which handles clock events programming failures can increase min_delta_ns unlimited and even can cause overflows. Sanitize it by: - prevent zero increase when min_delta_ns == 1 - limiting min_delta_ns to a jiffie - bail out if the jiffie limit is hit - add retries stats for /proc/timer_list so we can gather data Reported-by: Uwe Kleine-Koenig Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 0cf725bdd2a1..fc53492b6ad7 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -73,6 +73,7 @@ enum clock_event_nofitiers { * @list: list head for the management code * @mode: operating mode assigned by the management code * @next_event: local storage for the next event in oneshot mode + * @retries: number of forced programming retries */ struct clock_event_device { const char *name; @@ -93,6 +94,7 @@ struct clock_event_device { struct list_head list; enum clock_event_mode mode; ktime_t next_event; + unsigned long retries; }; /* -- cgit v1.2.2 From e3818b8dce2a934cd1521dbc4827e5238d8f45d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Mar 2010 17:03:43 -0700 Subject: rcu: Make rcu_read_lock_bh_held() allow for disabled BH Disabling BH can stand in for rcu_read_lock_bh(), and this patch updates rcu_read_lock_bh_held() to allow for this. In order to avoid include-file hell, this function is moved out of line to kernel/rcupdate.c. This fixes a false positive RCU warning. Reported-by: Arnd Bergmann Reported-by: Eric Dumazet Signed-off-by: Paul E. McKenney Acked-by: Lai Jiangshan Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20100316000343.GA25857@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/rcupdate.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3024050c82a1..e1bdc4bfd275 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -123,22 +123,11 @@ static inline int rcu_read_lock_held(void) return lock_is_held(&rcu_lock_map); } -/** - * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? - * - * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in - * an RCU-bh read-side critical section. In absence of CONFIG_PROVE_LOCKING, - * this assumes we are in an RCU-bh read-side critical section unless it can - * prove otherwise. - * - * Check rcu_scheduler_active to prevent false positives during boot. +/* + * rcu_read_lock_bh_held() is defined out of line to avoid #include-file + * hell. */ -static inline int rcu_read_lock_bh_held(void) -{ - if (!debug_lockdep_rcu_enabled()) - return 1; - return lock_is_held(&rcu_bh_lock_map); -} +extern int rcu_read_lock_bh_held(void); /** * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section? -- cgit v1.2.2 From 7f5b774275df8c76a959eae7488128b637fcbfc8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 16 Mar 2010 17:00:29 +0800 Subject: rcu: Fix tracepoints & lockdep false positive tracepoint.h uses rcu_dereference(), which triggers this warning: [ 0.701161] =================================================== [ 0.702211] [ INFO: suspicious rcu_dereference_check() usage. ] [ 0.702716] --------------------------------------------------- [ 0.703203] include/trace/events/workqueue.h:68 invoked rcu_dereference_check() without protection! [ 0.703971] [ 0.703990] other info that might help us debug this: [ 0.703993] [ 0.705590] [ 0.705604] rcu_scheduler_active = 1, debug_locks = 0 [ 0.706712] 1 lock held by swapper/1: [ 0.707229] #0: (cpu_add_remove_lock){+.+.+.}, at: [] cpu_maps_update_begin+0x14/0x20 [ 0.710097] [ 0.710106] stack backtrace: [ 0.712602] Pid: 1, comm: swapper Not tainted 2.6.34-rc1-tip-01613-g72662bb #168 [ 0.713231] Call Trace: [ 0.713997] [] lockdep_rcu_dereference+0x9d/0xb0 [ 0.714746] [] create_workqueue_thread+0x107/0x110 [ 0.715353] [] ? worker_thread+0x0/0x340 [ 0.715845] [] __create_workqueue_key+0x138/0x240 [ 0.716427] [] ? cpu_maps_update_done+0x12/0x20 [ 0.717012] [] init_workqueues+0x6f/0x80 [ 0.717530] [] kernel_init+0x102/0x1f0 [ 0.717570] [] ? kernel_init+0x0/0x1f0 [ 0.718944] [] kernel_thread_helper+0x6/0x10 Signed-off-by: Lai Jiangshan Cc: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4B9F48AD.4000404@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index f59604ed0ec6..78b4bd3be496 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -49,7 +49,7 @@ struct tracepoint { void **it_func; \ \ rcu_read_lock_sched_notrace(); \ - it_func = rcu_dereference((tp)->funcs); \ + it_func = rcu_dereference_sched((tp)->funcs); \ if (it_func) { \ do { \ ((void(*)(proto))(*it_func))(args); \ -- cgit v1.2.2 From 0cff810f54b3b52075c27f7a7021d5b195264b6c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 18 Mar 2010 12:25:33 -0700 Subject: rcu: Fix local_irq_disable() CONFIG_PROVE_RCU=y false positives It is documented that local_irq_disable() also delimits RCU_SCHED read-site critical sections. See the document of synchronize_sched() or Documentation/RCU/whatisRCU.txt. So we have to test irqs_disabled() in rcu_read_lock_sched_held(). Otherwise rcu-lockdep brings incorrect complaint. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: eric.dumazet@gmail.com LKML-Reference: <1268940334-10892-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/rcupdate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e1bdc4bfd275..872a98e13d6a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -149,7 +149,7 @@ static inline int rcu_read_lock_sched_held(void) return 1; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0; + return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); } #else /* #ifdef CONFIG_PREEMPT */ static inline int rcu_read_lock_sched_held(void) @@ -180,7 +180,7 @@ static inline int rcu_read_lock_bh_held(void) #ifdef CONFIG_PREEMPT static inline int rcu_read_lock_sched_held(void) { - return !rcu_scheduler_active || preempt_count() != 0; + return !rcu_scheduler_active || preempt_count() != 0 || irqs_disabled(); } #else /* #ifdef CONFIG_PREEMPT */ static inline int rcu_read_lock_sched_held(void) -- cgit v1.2.2 From 66f1207bce10fd80ee8ce99b67d617644612f05e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 11 Mar 2010 17:01:09 -0700 Subject: resources: add interfaces that return conflict information request_resource() and insert_resource() only return success or failure, which no information about what existing resource conflicted with the proposed new reservation. This patch adds request_resource_conflict() and insert_resource_conflict(), which return the conflicting resource. Callers may use this for better error messages or to adjust the new resource and retry the request. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- include/linux/ioport.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 71ab79da7e7f..26fad187d661 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -112,12 +112,14 @@ struct resource_list { extern struct resource ioport_resource; extern struct resource iomem_resource; +extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct resource *new); extern int release_resource(struct resource *new); void release_child_resources(struct resource *new); extern void reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name); +extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new); extern int insert_resource(struct resource *parent, struct resource *new); extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new); extern int allocate_resource(struct resource *root, struct resource *new, -- cgit v1.2.2 From 03e6d819c2cb2cc8ce5642669a0a7c72336ee7a2 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 23 Mar 2010 20:40:50 +0000 Subject: skbuff: remove unused dma_head & dma_maps fields The dma map fields in the skb_shared_info structure no longer has any users and can be dropped since it is making the skb_shared_info unecessarily larger. Running slabtop show that we were using 4K slabs for the skb->head on x86_64 w/ an allocation size of 1522. It turns out that the dma_head and dma_maps array made skb_shared large enough that we had crossed over the 2k boundary with standard frames and as such we were using 4k blocks of memory for all skbs. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 03f816a9b659..124f90cd5a38 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -190,9 +190,6 @@ struct skb_shared_info { atomic_t dataref; unsigned short nr_frags; unsigned short gso_size; -#ifdef CONFIG_HAS_DMA - dma_addr_t dma_head; -#endif /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; unsigned short gso_type; @@ -201,9 +198,6 @@ struct skb_shared_info { struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; skb_frag_t frags[MAX_SKB_FRAGS]; -#ifdef CONFIG_HAS_DMA - dma_addr_t dma_maps[MAX_SKB_FRAGS]; -#endif /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ void * destructor_arg; -- cgit v1.2.2 From 9c13886665c43600bd0af4b38e33c654e648e078 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Thu, 25 Mar 2010 11:17:26 +0100 Subject: netfilter: ip6table_raw: fix table priority The order of the IPv6 raw table is currently reversed, that makes impossible to use the NOTRACK target in IPv6: for example if someone enters ip6tables -t raw -A PREROUTING -p tcp --dport 80 -j NOTRACK and if we receive fragmented packets then the first fragment will be untracked and thus skip nf_ct_frag6_gather (and conntrack), while all subsequent fragments enter nf_ct_frag6_gather and reassembly will never successfully be finished. Singed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- include/linux/netfilter_ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index d654873aa25a..1f7e300094cd 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -59,6 +59,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_FIRST = INT_MIN, NF_IP6_PRI_CONNTRACK_DEFRAG = -400, + NF_IP6_PRI_RAW = -300, NF_IP6_PRI_SELINUX_FIRST = -225, NF_IP6_PRI_CONNTRACK = -200, NF_IP6_PRI_MANGLE = -150, -- cgit v1.2.2 From 5a7aadfe2fcb0f69e2acc1fbefe22a096e792fc9 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Fri, 26 Mar 2010 23:51:44 +0100 Subject: Freezer: Fix buggy resume test for tasks frozen with cgroup freezer When the cgroup freezer is used to freeze tasks we do not want to thaw those tasks during resume. Currently we test the cgroup freezer state of the resuming tasks to see if the cgroup is FROZEN. If so then we don't thaw the task. However, the FREEZING state also indicates that the task should remain frozen. This also avoids a problem pointed out by Oren Ladaan: the freezer state transition from FREEZING to FROZEN is updated lazily when userspace reads or writes the freezer.state file in the cgroup filesystem. This means that resume will thaw tasks in cgroups which should be in the FROZEN state if there is no read/write of the freezer.state file to trigger this transition before suspend. NOTE: Another "simple" solution would be to always update the cgroup freezer state during resume. However it's a bad choice for several reasons: Updating the cgroup freezer state is somewhat expensive because it requires walking all the tasks in the cgroup and checking if they are each frozen. Worse, this could easily make resume run in N^2 time where N is the number of tasks in the cgroup. Finally, updating the freezer state from this code path requires trickier locking because of the way locks must be ordered. Instead of updating the freezer state we rely on the fact that lazy updates only manage the transition from FREEZING to FROZEN. We know that a cgroup with the FREEZING state may actually be FROZEN so test for that state too. This makes sense in the resume path even for partially-frozen cgroups -- those that really are FREEZING but not FROZEN. Reported-by: Oren Ladaan Signed-off-by: Matt Helsley Cc: stable@kernel.org Signed-off-by: Rafael J. Wysocki --- include/linux/freezer.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 5a361f85cfec..da7e52b099f3 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -64,9 +64,12 @@ extern bool freeze_task(struct task_struct *p, bool sig_only); extern void cancel_freezing(struct task_struct *p); #ifdef CONFIG_CGROUP_FREEZER -extern int cgroup_frozen(struct task_struct *task); +extern int cgroup_freezing_or_frozen(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline int cgroup_frozen(struct task_struct *task) { return 0; } +static inline int cgroup_freezing_or_frozen(struct task_struct *task) +{ + return 0; +} #endif /* !CONFIG_CGROUP_FREEZER */ /* -- cgit v1.2.2 From 71c5c1595c04852d6fbf3c4882b47b30b61a4d32 Mon Sep 17 00:00:00 2001 From: Brandon L Black Date: Fri, 26 Mar 2010 16:18:03 +0000 Subject: net: Add MSG_WAITFORONE flag to recvmmsg Add new flag MSG_WAITFORONE for the recvmmsg() syscall. When this flag is specified for a blocking socket, recvmmsg() will only block until at least 1 packet is available. The default behavior is to block until all vlen packets are available. This flag has no effect on non-blocking sockets or when used in combination with MSG_DONTWAIT. Signed-off-by: Brandon L Black Acked-by: Ulrich Drepper Acked-by: Eric Dumazet Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/socket.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 7b3aae2052a6..354cc5617f8b 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -255,6 +255,7 @@ struct ucred { #define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ +#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_EOF MSG_FIN -- cgit v1.2.2 From a53f4f9efaeb1d87cfae066346979d4d70e1abe9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 29 Mar 2010 13:08:52 +0100 Subject: SLOW_WORK: CONFIG_SLOW_WORK_PROC should be CONFIG_SLOW_WORK_DEBUG CONFIG_SLOW_WORK_PROC was changed to CONFIG_SLOW_WORK_DEBUG, but not in all instances. Change the remaining instances. This makes the debugfs file display the time mark and the owner's description again. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/linux/fscache-cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 7be0c6fbe880..c57db27ac861 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -105,7 +105,7 @@ struct fscache_operation { /* operation releaser */ fscache_operation_release_t release; -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG const char *name; /* operation name */ const char *state; /* operation state */ #define fscache_set_op_name(OP, N) do { (OP)->name = (N); } while(0) -- cgit v1.2.2 From c36207a4624f15020f2918324405c1c88a5d4cbc Mon Sep 17 00:00:00 2001 From: viresh kumar Date: Mon, 29 Mar 2010 05:28:32 +0100 Subject: ARM: 5999/1: Including device.h and resource.h header files in linux/amba/bus.h linux/amba/bus.h have dependencies on linux/device.h and linux/resource.h, but it doesn't include them. We get compilation errors in our files which include bus.h but doesn't include device.h and resource.h. This patch includes device.h and resource.h in linux/amba/bus.h file. Signed-off-by: Viresh Kumar Acked-by: Linux Walleij Signed-off-by: Russell King --- include/linux/amba/bus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 6816be6c3f77..8b1038607831 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -14,6 +14,9 @@ #ifndef ASMARM_AMBA_H #define ASMARM_AMBA_H +#include +#include + #define AMBA_NR_IRQS 2 struct amba_device { -- cgit v1.2.2 From 367d6acceaacff1adc44f121543effb9c060e575 Mon Sep 17 00:00:00 2001 From: viresh kumar Date: Mon, 29 Mar 2010 05:29:56 +0100 Subject: ARM: 6003/1: removing compilation warning from pl061.h pl061.h is using u8 type. including in pl061.h to avoid warning. Signed-off-by: Viresh Kumar Acked-by: Baruch Siach Signed-off-by: Russell King --- include/linux/amba/pl061.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amba/pl061.h b/include/linux/amba/pl061.h index b4fbd9862606..5ddd9ad4b19c 100644 --- a/include/linux/amba/pl061.h +++ b/include/linux/amba/pl061.h @@ -1,3 +1,5 @@ +#include + /* platform data for the PL061 GPIO driver */ struct pl061_platform_data { -- cgit v1.2.2 From de329820e920cd9cfbc2127cad26a37026260cce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Mar 2010 14:30:19 -0700 Subject: ext3: fix broken handling of EXT3_STATE_NEW In commit 9df93939b735 ("ext3: Use bitops to read/modify EXT3_I(inode)->i_state") ext3 changed its internal 'i_state' variable to use bitops for its state handling. However, unline the same ext4 change, it didn't actually change the name of the field when it changed the semantics of it. As a result, an old use of 'i_state' remained in fs/ext3/ialloc.c that initialized the field to EXT3_STATE_NEW. And that does not work _at_all_ when we're now working with individually named bits rather than values that get masked. So the code tried to mark the state to be new, but in actual fact set the field to EXT3_STATE_JDATA. Which makes no sense at all, and screws up all the code that checks whether the inode was newly allocated. In particular, it made the xattr code unhappy, and caused various random behavior, like apparently https://bugzilla.redhat.com/show_bug.cgi?id=577911 So fix the initialization, and rename the field to match ext4 so that we don't have this happen again. Cc: James Morris Cc: Stephen Smalley Cc: Daniel J Walsh Cc: Eric Paris Cc: Jan Kara Signed-off-by: Linus Torvalds --- include/linux/ext3_fs.h | 6 +++--- include/linux/ext3_fs_i.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index cac84b006667..5f494b465097 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -565,17 +565,17 @@ enum { static inline int ext3_test_inode_state(struct inode *inode, int bit) { - return test_bit(bit, &EXT3_I(inode)->i_state); + return test_bit(bit, &EXT3_I(inode)->i_state_flags); } static inline void ext3_set_inode_state(struct inode *inode, int bit) { - set_bit(bit, &EXT3_I(inode)->i_state); + set_bit(bit, &EXT3_I(inode)->i_state_flags); } static inline void ext3_clear_inode_state(struct inode *inode, int bit) { - clear_bit(bit, &EXT3_I(inode)->i_state); + clear_bit(bit, &EXT3_I(inode)->i_state_flags); } #else /* Assume that user mode programs are passing in an ext3fs superblock, not diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index 7679acdb519a..f42c098aed8d 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h @@ -87,7 +87,7 @@ struct ext3_inode_info { * near to their parent directory's inode. */ __u32 i_block_group; - unsigned long i_state; /* Dynamic state flags for ext3 */ + unsigned long i_state_flags; /* Dynamic state flags for ext3 */ /* block reservation info */ struct ext3_block_alloc_info *i_block_alloc_info; -- cgit v1.2.2 From e49a5bd38159dfb1928fd25b173bc9de4bbadb21 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Mar 2010 19:40:03 +0100 Subject: perf: Use hot regs with software sched switch/migrate events Scheduler's task migration events don't work because they always pass NULL regs perf_sw_event(). The event hence gets filtered in perf_swevent_add(). Scheduler's context switches events use task_pt_regs() to get the context when the event occured which is a wrong thing to do as this won't give us the place in the kernel where we went to sleep but the place where we left userspace. The result is even more wrong if we switch from a kernel thread. Use the hot regs snapshot for both events as they belong to the non-interrupt/exception based events family. Unlike page faults or so that provide the regs matching the exact origin of the event, we need to save the current context. This makes the task migration event working and fix the context switch callchains and origin ip. Example: perf record -a -e cs Before: 10.91% ksoftirqd/0 0 [k] 0000000000000000 | --- (nil) perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_swevent_ctx_event do_perf_sw_event __perf_sw_event perf_event_task_sched_out schedule run_ksoftirqd kthread kernel_thread_helper After: 23.77% hald-addon-stor [kernel.kallsyms] [k] schedule | --- schedule | |--60.00%-- schedule_timeout | wait_for_common | wait_for_completion | blk_execute_rq | scsi_execute | scsi_execute_req | sr_test_unit_ready | | | |--66.67%-- sr_media_change | | media_changed | | cdrom_media_changed | | sr_block_media_changed | | check_disk_change | | cdrom_open v2: Always build perf_arch_fetch_caller_regs() now that software events need that too. They don't need it from modules, unlike trace events, so we keep the EXPORT_SYMBOL in trace_event_perf.c Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar Cc: David Miller --- include/linux/perf_event.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 95477038a72a..c8e375440403 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -842,13 +842,6 @@ extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); -static inline void -perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) -{ - if (atomic_read(&perf_swevent_enabled[event_id])) - __perf_sw_event(event_id, nr, nmi, regs, addr); -} - extern void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); @@ -887,6 +880,20 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip) return perf_arch_fetch_caller_regs(regs, ip, skip); } +static inline void +perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ + if (atomic_read(&perf_swevent_enabled[event_id])) { + struct pt_regs hot_regs; + + if (!regs) { + perf_fetch_caller_regs(&hot_regs, 1); + regs = &hot_regs; + } + __perf_sw_event(event_id, nr, nmi, regs, addr); + } +} + extern void __perf_event_mmap(struct vm_area_struct *vma); static inline void perf_event_mmap(struct vm_area_struct *vma) -- cgit v1.2.2