Diffstat (limited to 'kernel'):

 kernel/Makefile             |    3
 kernel/cgroup.c             |   20
 kernel/cpu.c                |   44
 kernel/cpu_pm.c             |   16
 kernel/events/Makefile      |    3
 kernel/events/uprobes.c     | 1667
 kernel/exit.c               |    8
 kernel/fork.c               |   36
 kernel/irq/irqdomain.c      |  106
 kernel/irq/manage.c         |   14
 kernel/kallsyms.c           |   32
 kernel/kcmp.c               |  196
 kernel/kfifo.c              |    1
 kernel/kmod.c               |   30
 kernel/pid.c                |    3
 kernel/pid_namespace.c      |   13
 kernel/res_counter.c        |   10
 kernel/resource.c           |    4
 kernel/signal.c             |   15
 kernel/sys.c                |  213
 kernel/sys_ni.c             |    3
 kernel/time/Kconfig         |   58
 kernel/time/ntp.c           |    8
 kernel/time/timekeeping.c   |    4
 kernel/trace/Kconfig        |   20
 kernel/trace/Makefile       |    2
 kernel/trace/ring_buffer.c  |    5
 kernel/trace/trace.h        |    5
 kernel/trace/trace_kprobe.c |  899
 kernel/trace/trace_probe.c  |  839
 kernel/trace/trace_probe.h  |  161
 kernel/trace/trace_uprobe.c |  788
 kernel/watchdog.c           |   12
 33 files changed, 4194 insertions(+), 1044 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index bf1034008aca..6f3d0ae044b2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -25,6 +25,9 @@ endif
 obj-y += sched/
 obj-y += power/
 
+ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
+obj-$(CONFIG_X86) += kcmp.o
+endif
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
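
The kcmp.o object added here is built only under CONFIG_CHECKPOINT_RESTORE and, for now, only on x86; it implements the kcmp() system call (the 196-line kernel/kcmp.c in the diffstat above), which lets checkpoint/restore tools test whether two processes share a kernel resource. A minimal userspace sketch of how it would be called, assuming a libc that defines SYS_kcmp and the uapi <linux/kcmp.h> header from this series:

	/* illustrative only: compare whether two fds refer to the same struct file */
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/kcmp.h>

	static long kcmp(pid_t pid1, pid_t pid2, int type,
			 unsigned long idx1, unsigned long idx2)
	{
		return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2);
	}

	int main(void)
	{
		int fd2 = dup(1);

		/* 0 means "same object"; 1/2/3 give an ordering, <0 is an error */
		printf("kcmp: %ld\n", kcmp(getpid(), getpid(), KCMP_FILE, 1, fd2));
		return 0;
	}
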
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a0c6af34d500..0f3527d6184a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5132,7 +5132,7 @@ EXPORT_SYMBOL_GPL(css_depth);
  * @root: the css supporsed to be an ancestor of the child.
  *
  * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
+ * this function reads css->id, the caller must hold rcu_read_lock().
  * But, considering usual usage, the csses should be valid objects after test.
  * Assuming that the caller will do some action to the child if this returns
  * returns true, the caller must take "child";s reference count.
@@ -5144,18 +5144,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 {
 	struct css_id *child_id;
 	struct css_id *root_id;
-	bool ret = true;
 
-	rcu_read_lock();
 	child_id = rcu_dereference(child->id);
+	if (!child_id)
+		return false;
 	root_id = rcu_dereference(root->id);
-	if (!child_id
-	    || !root_id
-	    || (child_id->depth < root_id->depth)
-	    || (child_id->stack[root_id->depth] != root_id->id))
-		ret = false;
-	rcu_read_unlock();
-	return ret;
+	if (!root_id)
+		return false;
+	if (child_id->depth < root_id->depth)
+		return false;
+	if (child_id->stack[root_id->depth] != root_id->id)
+		return false;
+	return true;
 }
 
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
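
With the rcu_read_lock()/rcu_read_unlock() pair moved out of css_is_ancestor(), the burden documented in the rewritten comment falls on the caller. A minimal sketch of the resulting calling convention (the wrapper name is illustrative, not from this patch):

	static bool child_is_under(struct cgroup_subsys_state *child,
				   struct cgroup_subsys_state *root)
	{
		bool ret;

		rcu_read_lock();	/* caller now provides the RCU protection */
		ret = css_is_ancestor(child, root);
		rcu_read_unlock();

		return ret;
	}
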
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0e6353cf147a..a4eb5227a19e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,7 +10,10 @@
 #include <linux/sched.h>
 #include <linux/unistd.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
+#include <linux/rcupdate.h>
 #include <linux/export.h>
+#include <linux/bug.h>
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
@@ -173,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
+/**
+ * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
+ * @cpu: a CPU id
+ *
+ * This function walks all processes, finds a valid mm struct for each one and
+ * then clears a corresponding bit in mm's cpumask. While this all sounds
+ * trivial, there are various non-obvious corner cases, which this function
+ * tries to solve in a safe manner.
+ *
+ * Also note that the function uses a somewhat relaxed locking scheme, so it may
+ * be called only for an already offlined CPU.
+ */
+void clear_tasks_mm_cpumask(int cpu)
+{
+	struct task_struct *p;
+
+	/*
+	 * This function is called after the cpu is taken down and marked
+	 * offline, so its not like new tasks will ever get this cpu set in
+	 * their mm mask. -- Peter Zijlstra
+	 * Thus, we may use rcu_read_lock() here, instead of grabbing
+	 * full-fledged tasklist_lock.
+	 */
+	WARN_ON(cpu_online(cpu));
+	rcu_read_lock();
+	for_each_process(p) {
+		struct task_struct *t;
+
+		/*
+		 * Main thread might exit, but other threads may still have
+		 * a valid mm. Find one.
+		 */
+		t = find_lock_task_mm(p);
+		if (!t)
+			continue;
+		cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+		task_unlock(t);
+	}
+	rcu_read_unlock();
+}
+
 static inline void check_for_tasks(int cpu)
 {
 	struct task_struct *p;
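
clear_tasks_mm_cpumask() is intended for architecture CPU-hotplug code; per the WARN_ON(cpu_online(cpu)) above, it is safe only once the CPU has already been taken down. A hypothetical arch-side teardown path might use it like this (the surrounding steps are assumptions; only the helper and its precondition come from this hunk):

	static void example_cpu_die(unsigned int cpu)
	{
		BUG_ON(cpu_online(cpu));	/* precondition for the relaxed locking */

		/* drop the dead CPU from every mm's cpumask */
		clear_tasks_mm_cpumask(cpu);

		/* ... arch-specific cache flush and teardown would follow ... */
	}
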
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 249152e15308..9656a3c36503 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
 
 /**
- * cpm_pm_enter - CPU low power entry notifier
+ * cpu_pm_enter - CPU low power entry notifier
  *
  * Notifies listeners that a single CPU is entering a low power state that may
  * cause some blocks in the same power domain as the cpu to reset.
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
  * Must be called on the affected CPU with interrupts disabled. Platform is
  * responsible for ensuring that cpu_pm_enter is not called twice on the same
  * CPU before cpu_pm_exit is called. Notified drivers can include VFP
- * co-processor, interrupt controller and it's PM extensions, local CPU
+ * co-processor, interrupt controller and its PM extensions, local CPU
  * timers context save/restore which shouldn't be interrupted. Hence it
  * must be called with interrupts disabled.
  *
@@ -115,13 +115,13 @@ int cpu_pm_enter(void)
 EXPORT_SYMBOL_GPL(cpu_pm_enter);
 
 /**
- * cpm_pm_exit - CPU low power exit notifier
+ * cpu_pm_exit - CPU low power exit notifier
  *
  * Notifies listeners that a single CPU is exiting a low power state that may
  * have caused some blocks in the same power domain as the cpu to reset.
  *
  * Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
  * shouldn't be interrupted. Hence it must be called with interrupts disabled.
  *
  * Return conditions are same as __raw_notifier_call_chain.
@@ -139,7 +139,7 @@ int cpu_pm_exit(void)
 EXPORT_SYMBOL_GPL(cpu_pm_exit);
 
 /**
- * cpm_cluster_pm_enter - CPU cluster low power entry notifier
+ * cpu_cluster_pm_enter - CPU cluster low power entry notifier
  *
  * Notifies listeners that all cpus in a power domain are entering a low power
  * state that may cause some blocks in the same power domain to reset.
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
  * Must be called after cpu_pm_enter has been called on all cpus in the power
  * domain, and before cpu_pm_exit has been called on any cpu in the power
  * domain. Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
  * shouldn't be interrupted. Hence it must be called with interrupts disabled.
  *
  * Must be called with interrupts disabled.
@@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void)
 EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
 
 /**
- * cpm_cluster_pm_exit - CPU cluster low power exit notifier
+ * cpu_cluster_pm_exit - CPU cluster low power exit notifier
  *
  * Notifies listeners that all cpus in a power domain are exiting form a
  * low power state that may have caused some blocks in the same power domain
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
  * Must be called after cpu_pm_exit has been called on all cpus in the power
  * domain, and before cpu_pm_exit has been called on any cpu in the power
  * domain. Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
  * shouldn't be interrupted. Hence it must be called with interrupts disabled.
  *
  * Return conditions are same as __raw_notifier_call_chain.
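
These hunks only rename the kerneldoc titles to match the real function names (cpu_pm_enter() and friends) and fix the recurring it's/its confusion. The pairing contract the comments describe looks like this in a platform idle path — a sketch, with the power-down step itself left as a platform assumption:

	static void example_enter_deep_idle(void)
	{
		local_irq_disable();		/* both calls require IRQs off */

		if (!cpu_pm_enter()) {		/* notify VFP, GIC, timer drivers */
			/* ... enter the CPU low power state here ... */
			cpu_pm_exit();		/* let drivers restore context */
		}

		local_irq_enable();
	}
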
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 22d901f9caf4..103f5d147b2f 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg
 endif
 
 obj-y := core.o ring_buffer.o callchain.o
+
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
new file mode 100644
index 000000000000..985be4d80fe8
--- /dev/null
+++ b/kernel/events/uprobes.c
@@ -0,0 +1,1667 @@
+/*
+ * User-space Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2012
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>	/* read_mapping_page */
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/rmap.h>		/* anon_vma_prepare */
+#include <linux/mmu_notifier.h>	/* set_pte_at_notify */
+#include <linux/swap.h>		/* try_to_free_swap */
+#include <linux/ptrace.h>	/* user_enable_single_step */
+#include <linux/kdebug.h>	/* notifier mechanism */
+
+#include <linux/uprobes.h>
+
+#define UINSNS_PER_PAGE		(PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
+#define MAX_UPROBE_XOL_SLOTS	UINSNS_PER_PAGE
+
+static struct srcu_struct uprobes_srcu;
+static struct rb_root uprobes_tree = RB_ROOT;
+
+static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
+
+#define UPROBES_HASH_SZ	13
+
+/* serialize (un)register */
+static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
+
+#define uprobes_hash(v)	(&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+
+/* serialize uprobe->pending_list */
+static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
+#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+
+/*
+ * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
+ * events active at this time. Probably a fine grained per inode count is
+ * better?
+ */
+static atomic_t uprobe_events = ATOMIC_INIT(0);
+
+/*
+ * Maintain a temporary per vma info that can be used to search if a vma
+ * has already been handled. This structure is introduced since extending
+ * vm_area_struct wasnt recommended.
+ */
+struct vma_info {
+	struct list_head	probe_list;
+	struct mm_struct	*mm;
+	loff_t			vaddr;
+};
+
+struct uprobe {
+	struct rb_node		rb_node;	/* node in the rb tree */
+	atomic_t		ref;
+	struct rw_semaphore	consumer_rwsem;
+	struct list_head	pending_list;
+	struct uprobe_consumer	*consumers;
+	struct inode		*inode;		/* Also hold a ref to inode */
+	loff_t			offset;
+	int			flags;
+	struct arch_uprobe	arch;
+};
+
+/*
+ * valid_vma: Verify if the specified vma is an executable vma
+ * Relax restrictions while unregistering: vm_flags might have
+ * changed after breakpoint was inserted.
+ *	- is_register: indicates if we are in register context.
+ *	- Return 1 if the specified virtual address is in an
+ *	  executable vma.
+ */
+static bool valid_vma(struct vm_area_struct *vma, bool is_register)
+{
+	if (!vma->vm_file)
+		return false;
+
+	if (!is_register)
+		return true;
+
+	if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC))
+		return true;
+
+	return false;
+}
+
+static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
+{
+	loff_t vaddr;
+
+	vaddr = vma->vm_start + offset;
+	vaddr -= vma->vm_pgoff << PAGE_SHIFT;
+
+	return vaddr;
+}
+
+/**
+ * __replace_page - replace page in vma by new page.
+ * based on replace_page in mm/ksm.c
+ *
+ * @vma:   vma that holds the pte pointing to page
+ * @page:  the cowed page we are replacing by kpage
+ * @kpage: the modified page we replace page by
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	unsigned long addr;
+	int err = -EFAULT;
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!ptep)
+		goto out;
+
+	get_page(kpage);
+	page_add_new_anon_rmap(kpage, vma, addr);
+
+	if (!PageAnon(page)) {
+		dec_mm_counter(mm, MM_FILEPAGES);
+		inc_mm_counter(mm, MM_ANONPAGES);
+	}
+
+	flush_cache_page(vma, addr, pte_pfn(*ptep));
+	ptep_clear_flush(vma, addr, ptep);
+	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+	page_remove_rmap(page);
+	if (!page_mapped(page))
+		try_to_free_swap(page);
+	put_page(page);
+	pte_unmap_unlock(ptep, ptl);
+	err = 0;
+
+out:
+	return err;
+}
+
+/**
+ * is_swbp_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_swbp_insn
+ * Returns true if @insn is a breakpoint instruction.
+ */
+bool __weak is_swbp_insn(uprobe_opcode_t *insn)
+{
+	return *insn == UPROBE_SWBP_INSN;
+}
+
+/*
+ * NOTE:
+ * Expect the breakpoint instruction to be the smallest size instruction for
+ * the architecture. If an arch has variable length instruction and the
+ * breakpoint instruction is not of the smallest length instruction
+ * supported by that architecture then we need to modify read_opcode /
+ * write_opcode accordingly. This would never be a problem for archs that
+ * have fixed length instructions.
+ */
+
+/*
+ * write_opcode - write the opcode at a given virtual address.
+ * @auprobe: arch breakpointing information.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @vaddr.
+ *
+ * Called with mm->mmap_sem held (for read and with a reference to
+ * mm).
+ *
+ * For mm @mm, write the opcode at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
+			unsigned long vaddr, uprobe_opcode_t opcode)
+{
+	struct page *old_page, *new_page;
+	struct address_space *mapping;
+	void *vaddr_old, *vaddr_new;
+	struct vm_area_struct *vma;
+	struct uprobe *uprobe;
+	loff_t addr;
+	int ret;
+
+	/* Read the page with vaddr into memory */
+	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
+	if (ret <= 0)
+		return ret;
+
+	ret = -EINVAL;
+
+	/*
+	 * We are interested in text pages only. Our pages of interest
+	 * should be mapped for read and execute only. We desist from
+	 * adding probes in write mapped pages since the breakpoints
+	 * might end up in the file copy.
+	 */
+	if (!valid_vma(vma, is_swbp_insn(&opcode)))
+		goto put_out;
+
+	uprobe = container_of(auprobe, struct uprobe, arch);
+	mapping = uprobe->inode->i_mapping;
+	if (mapping != vma->vm_file->f_mapping)
+		goto put_out;
+
+	addr = vma_address(vma, uprobe->offset);
+	if (vaddr != (unsigned long)addr)
+		goto put_out;
+
+	ret = -ENOMEM;
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+	if (!new_page)
+		goto put_out;
+
+	__SetPageUptodate(new_page);
+
+	/*
+	 * lock page will serialize against do_wp_page()'s
+	 * PageAnon() handling
+	 */
+	lock_page(old_page);
+	/* copy the page now that we've got it stable */
+	vaddr_old = kmap_atomic(old_page);
+	vaddr_new = kmap_atomic(new_page);
+
+	memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
+
+	/* poke the new insn in, ASSUMES we don't cross page boundary */
+	vaddr &= ~PAGE_MASK;
+	BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+	memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+	kunmap_atomic(vaddr_new);
+	kunmap_atomic(vaddr_old);
+
+	ret = anon_vma_prepare(vma);
+	if (ret)
+		goto unlock_out;
+
+	lock_page(new_page);
+	ret = __replace_page(vma, old_page, new_page);
+	unlock_page(new_page);
+
+unlock_out:
+	unlock_page(old_page);
+	page_cache_release(new_page);
+
+put_out:
+	put_page(old_page);
+
+	return ret;
+}
+
+/**
+ * read_opcode - read the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to read the opcode.
+ * @opcode: location to store the read opcode.
+ *
+ * Called with mm->mmap_sem held (for read and with a reference to
+ * mm.
+ *
+ * For mm @mm, read the opcode at @vaddr and store it in @opcode.
+ * Return 0 (success) or a negative errno.
+ */
+static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
+{
+	struct page *page;
+	void *vaddr_new;
+	int ret;
+
+	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL);
+	if (ret <= 0)
+		return ret;
+
+	lock_page(page);
+	vaddr_new = kmap_atomic(page);
+	vaddr &= ~PAGE_MASK;
+	memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
+	kunmap_atomic(vaddr_new);
+	unlock_page(page);
+
+	put_page(page);
+
+	return 0;
+}
+
+static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+	uprobe_opcode_t opcode;
+	int result;
+
+	result = read_opcode(mm, vaddr, &opcode);
+	if (result)
+		return result;
+
+	if (is_swbp_insn(&opcode))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * set_swbp - store breakpoint at a given address.
+ * @auprobe: arch specific probepoint information.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, store the breakpoint instruction at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+	int result;
+
+	result = is_swbp_at_addr(mm, vaddr);
+	if (result == 1)
+		return -EEXIST;
+
+	if (result)
+		return result;
+
+	return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+}
+
+/**
+ * set_orig_insn - Restore the original instruction.
+ * @mm: the probed process address space.
+ * @auprobe: arch specific probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ * @verify: if true, verify existance of breakpoint instruction.
+ *
+ * For mm @mm, restore the original opcode (opcode) at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify)
+{
+	if (verify) {
+		int result;
+
+		result = is_swbp_at_addr(mm, vaddr);
+		if (!result)
+			return -EINVAL;
+
+		if (result != 1)
+			return result;
+	}
+	return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+}
+
+static int match_uprobe(struct uprobe *l, struct uprobe *r)
+{
+	if (l->inode < r->inode)
+		return -1;
+
+	if (l->inode > r->inode)
+		return 1;
+
+	if (l->offset < r->offset)
+		return -1;
+
+	if (l->offset > r->offset)
+		return 1;
+
+	return 0;
+}
+
+static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe u = { .inode = inode, .offset = offset };
+	struct rb_node *n = uprobes_tree.rb_node;
+	struct uprobe *uprobe;
+	int match;
+
+	while (n) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		match = match_uprobe(&u, uprobe);
+		if (!match) {
+			atomic_inc(&uprobe->ref);
+			return uprobe;
+		}
+
+		if (match < 0)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Find a uprobe corresponding to a given inode:offset
+ * Acquires uprobes_treelock
+ */
+static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe *uprobe;
+	unsigned long flags;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	uprobe = __find_uprobe(inode, offset);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+
+	return uprobe;
+}
+
+static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
+{
+	struct rb_node **p = &uprobes_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct uprobe *u;
+	int match;
+
+	while (*p) {
+		parent = *p;
+		u = rb_entry(parent, struct uprobe, rb_node);
+		match = match_uprobe(uprobe, u);
+		if (!match) {
+			atomic_inc(&u->ref);
+			return u;
+		}
+
+		if (match < 0)
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+
+	}
+
+	u = NULL;
+	rb_link_node(&uprobe->rb_node, parent, p);
+	rb_insert_color(&uprobe->rb_node, &uprobes_tree);
+	/* get access + creation ref */
+	atomic_set(&uprobe->ref, 2);
+
+	return u;
+}
+
+/*
+ * Acquire uprobes_treelock.
+ * Matching uprobe already exists in rbtree;
+ *	increment (access refcount) and return the matching uprobe.
+ *
+ * No matching uprobe; insert the uprobe in rb_tree;
+ *	get a double refcount (access + creation) and return NULL.
+ */
+static struct uprobe *insert_uprobe(struct uprobe *uprobe)
+{
+	unsigned long flags;
+	struct uprobe *u;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	u = __insert_uprobe(uprobe);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+
+	/* For now assume that the instruction need not be single-stepped */
+	uprobe->flags |= UPROBE_SKIP_SSTEP;
+
+	return u;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+	if (atomic_dec_and_test(&uprobe->ref))
+		kfree(uprobe);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe *uprobe, *cur_uprobe;
+
+	uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
+	if (!uprobe)
+		return NULL;
+
+	uprobe->inode = igrab(inode);
+	uprobe->offset = offset;
+	init_rwsem(&uprobe->consumer_rwsem);
+	INIT_LIST_HEAD(&uprobe->pending_list);
+
+	/* add to uprobes_tree, sorted on inode:offset */
+	cur_uprobe = insert_uprobe(uprobe);
+
+	/* a uprobe exists for this inode:offset combination */
+	if (cur_uprobe) {
+		kfree(uprobe);
+		uprobe = cur_uprobe;
+		iput(inode);
+	} else {
+		atomic_inc(&uprobe_events);
+	}
+
+	return uprobe;
+}
+
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+	struct uprobe_consumer *uc;
+
+	if (!(uprobe->flags & UPROBE_RUN_HANDLER))
+		return;
+
+	down_read(&uprobe->consumer_rwsem);
+	for (uc = uprobe->consumers; uc; uc = uc->next) {
+		if (!uc->filter || uc->filter(uc, current))
+			uc->handler(uc, regs);
+	}
+	up_read(&uprobe->consumer_rwsem);
+}
+
+/* Returns the previous consumer */
+static struct uprobe_consumer *
+consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+	down_write(&uprobe->consumer_rwsem);
+	uc->next = uprobe->consumers;
+	uprobe->consumers = uc;
+	up_write(&uprobe->consumer_rwsem);
+
+	return uc->next;
+}
+
+/*
+ * For uprobe @uprobe, delete the consumer @uc.
+ * Return true if the @uc is deleted successfully
+ * or return false.
+ */
+static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+	struct uprobe_consumer **con;
+	bool ret = false;
+
+	down_write(&uprobe->consumer_rwsem);
+	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
+		if (*con == uc) {
+			*con = uc->next;
+			ret = true;
+			break;
+		}
+	}
+	up_write(&uprobe->consumer_rwsem);
+
+	return ret;
+}
+
+static int
+__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn,
+			unsigned long nbytes, unsigned long offset)
+{
+	struct file *filp = vma->vm_file;
+	struct page *page;
+	void *vaddr;
+	unsigned long off1;
+	unsigned long idx;
+
+	if (!filp)
+		return -EINVAL;
+
+	idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT);
+	off1 = offset &= ~PAGE_MASK;
+
+	/*
+	 * Ensure that the page that has the original instruction is
+	 * populated and in page-cache.
+	 */
+	page = read_mapping_page(mapping, idx, filp);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	vaddr = kmap_atomic(page);
+	memcpy(insn, vaddr + off1, nbytes);
+	kunmap_atomic(vaddr);
+	page_cache_release(page);
+
+	return 0;
+}
+
+static int
+copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
+{
+	struct address_space *mapping;
+	unsigned long nbytes;
+	int bytes;
+
+	addr &= ~PAGE_MASK;
+	nbytes = PAGE_SIZE - addr;
+	mapping = uprobe->inode->i_mapping;
+
+	/* Instruction at end of binary; copy only available bytes */
+	if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
+		bytes = uprobe->inode->i_size - uprobe->offset;
+	else
+		bytes = MAX_UINSN_BYTES;
+
+	/* Instruction at the page-boundary; copy bytes in second page */
+	if (nbytes < bytes) {
+		if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes,
+				bytes - nbytes, uprobe->offset + nbytes))
+			return -ENOMEM;
+
+		bytes = nbytes;
+	}
+	return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset);
+}
+
+/*
+ * How mm->uprobes_state.count gets updated
+ * uprobe_mmap() increments the count if
+ *	- it successfully adds a breakpoint.
+ *	- it cannot add a breakpoint, but sees that there is a underlying
+ *	  breakpoint (via a is_swbp_at_addr()).
+ *
+ * uprobe_munmap() decrements the count if
+ *	- it sees a underlying breakpoint, (via is_swbp_at_addr)
+ *	  (Subsequent uprobe_unregister wouldnt find the breakpoint
+ *	  unless a uprobe_mmap kicks in, since the old vma would be
+ *	  dropped just after uprobe_munmap.)
+ *
+ * uprobe_register increments the count if:
+ *	- it successfully adds a breakpoint.
+ *
+ * uprobe_unregister decrements the count if:
+ *	- it sees a underlying breakpoint and removes successfully.
+ *	  (via is_swbp_at_addr)
+ *	  (Subsequent uprobe_munmap wouldnt find the breakpoint
+ *	  since there is no underlying breakpoint after the
+ *	  breakpoint removal.)
+ */
+static int
+install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
+			struct vm_area_struct *vma, loff_t vaddr)
+{
+	unsigned long addr;
+	int ret;
+
+	/*
+	 * If probe is being deleted, unregister thread could be done with
+	 * the vma-rmap-walk through. Adding a probe now can be fatal since
+	 * nobody will be able to cleanup. Also we could be from fork or
+	 * mremap path, where the probe might have already been inserted.
+	 * Hence behave as if probe already existed.
+	 */
+	if (!uprobe->consumers)
+		return -EEXIST;
+
+	addr = (unsigned long)vaddr;
+
+	if (!(uprobe->flags & UPROBE_COPY_INSN)) {
+		ret = copy_insn(uprobe, vma, addr);
+		if (ret)
+			return ret;
+
+		if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+			return -EEXIST;
+
+		ret = arch_uprobe_analyze_insn(&uprobe->arch, mm);
+		if (ret)
+			return ret;
+
+		uprobe->flags |= UPROBE_COPY_INSN;
+	}
+
+	/*
+	 * Ideally, should be updating the probe count after the breakpoint
+	 * has been successfully inserted. However a thread could hit the
+	 * breakpoint we just inserted even before the probe count is
+	 * incremented. If this is the first breakpoint placed, breakpoint
+	 * notifier might ignore uprobes and pass the trap to the thread.
+	 * Hence increment before and decrement on failure.
+	 */
+	atomic_inc(&mm->uprobes_state.count);
+	ret = set_swbp(&uprobe->arch, mm, addr);
+	if (ret)
+		atomic_dec(&mm->uprobes_state.count);
+
+	return ret;
+}
+
+static void
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr)
+{
+	if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true))
+		atomic_dec(&mm->uprobes_state.count);
+}
+
+/*
+ * There could be threads that have hit the breakpoint and are entering the
+ * notifier code and trying to acquire the uprobes_treelock. The thread
+ * calling delete_uprobe() that is removing the uprobe from the rb_tree can
+ * race with these threads and might acquire the uprobes_treelock compared
+ * to some of the breakpoint hit threads. In such a case, the breakpoint
+ * hit threads will not find the uprobe. The current unregistering thread
+ * waits till all other threads have hit a breakpoint, to acquire the
+ * uprobes_treelock before the uprobe is removed from the rbtree.
+ */
+static void delete_uprobe(struct uprobe *uprobe)
+{
+	unsigned long flags;
+
+	synchronize_srcu(&uprobes_srcu);
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	rb_erase(&uprobe->rb_node, &uprobes_tree);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+	iput(uprobe->inode);
+	put_uprobe(uprobe);
+	atomic_dec(&uprobe_events);
+}
+
+static struct vma_info *
+__find_next_vma_info(struct address_space *mapping, struct list_head *head,
+			struct vma_info *vi, loff_t offset, bool is_register)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *vma;
+	struct vma_info *tmpvi;
+	unsigned long pgoff;
+	int existing_vma;
+	loff_t vaddr;
+
+	pgoff = offset >> PAGE_SHIFT;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (!valid_vma(vma, is_register))
+			continue;
+
+		existing_vma = 0;
+		vaddr = vma_address(vma, offset);
+
+		list_for_each_entry(tmpvi, head, probe_list) {
+			if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
+				existing_vma = 1;
+				break;
+			}
+		}
+
+		/*
+		 * Another vma needs a probe to be installed. However skip
+		 * installing the probe if the vma is about to be unlinked.
+		 */
+		if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
+			vi->mm = vma->vm_mm;
+			vi->vaddr = vaddr;
+			list_add(&vi->probe_list, head);
+
+			return vi;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Iterate in the rmap prio tree and find a vma where a probe has not
+ * yet been inserted.
+ */
+static struct vma_info *
+find_next_vma_info(struct address_space *mapping, struct list_head *head,
+			loff_t offset, bool is_register)
+{
+	struct vma_info *vi, *retvi;
+
+	vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
+	if (!vi)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&mapping->i_mmap_mutex);
+	retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
+	mutex_unlock(&mapping->i_mmap_mutex);
+
+	if (!retvi)
+		kfree(vi);
+
+	return retvi;
+}
+
+static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+{
+	struct list_head try_list;
+	struct vm_area_struct *vma;
+	struct address_space *mapping;
+	struct vma_info *vi, *tmpvi;
+	struct mm_struct *mm;
+	loff_t vaddr;
+	int ret;
+
+	mapping = uprobe->inode->i_mapping;
+	INIT_LIST_HEAD(&try_list);
+
+	ret = 0;
+
+	for (;;) {
+		vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
+		if (!vi)
+			break;
+
+		if (IS_ERR(vi)) {
+			ret = PTR_ERR(vi);
+			break;
+		}
+
+		mm = vi->mm;
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, (unsigned long)vi->vaddr);
+		if (!vma || !valid_vma(vma, is_register)) {
+			list_del(&vi->probe_list);
+			kfree(vi);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			continue;
+		}
+		vaddr = vma_address(vma, uprobe->offset);
+		if (vma->vm_file->f_mapping->host != uprobe->inode ||
+						vaddr != vi->vaddr) {
+			list_del(&vi->probe_list);
+			kfree(vi);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			continue;
+		}
+
+		if (is_register)
+			ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
+		else
+			remove_breakpoint(uprobe, mm, vi->vaddr);
+
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+		if (is_register) {
+			if (ret && ret == -EEXIST)
+				ret = 0;
+			if (ret)
+				break;
+		}
+	}
+
+	list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
+		list_del(&vi->probe_list);
+		kfree(vi);
+	}
+
+	return ret;
+}
+
+static int __uprobe_register(struct uprobe *uprobe)
+{
+	return register_for_each_vma(uprobe, true);
+}
+
+static void __uprobe_unregister(struct uprobe *uprobe)
+{
+	if (!register_for_each_vma(uprobe, false))
+		delete_uprobe(uprobe);
+
+	/* TODO : cant unregister? schedule a worker thread */
+}
+
+/*
+ * uprobe_register - register a probe
+ * @inode: the file in which the probe has to be placed.
+ * @offset: offset from the start of the file.
+ * @uc: information on howto handle the probe..
+ *
+ * Apart from the access refcount, uprobe_register() takes a creation
+ * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
+ * inserted into the rbtree (i.e first consumer for a @inode:@offset
+ * tuple). Creation refcount stops uprobe_unregister from freeing the
+ * @uprobe even before the register operation is complete. Creation
+ * refcount is released when the last @uc for the @uprobe
+ * unregisters.
+ *
+ * Return errno if it cannot successully install probes
+ * else return 0 (success)
+ */
+int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+	struct uprobe *uprobe;
+	int ret;
+
+	if (!inode || !uc || uc->next)
+		return -EINVAL;
+
+	if (offset > i_size_read(inode))
+		return -EINVAL;
+
+	ret = 0;
+	mutex_lock(uprobes_hash(inode));
+	uprobe = alloc_uprobe(inode, offset);
+
+	if (uprobe && !consumer_add(uprobe, uc)) {
+		ret = __uprobe_register(uprobe);
+		if (ret) {
+			uprobe->consumers = NULL;
+			__uprobe_unregister(uprobe);
+		} else {
+			uprobe->flags |= UPROBE_RUN_HANDLER;
+		}
+	}
+
+	mutex_unlock(uprobes_hash(inode));
+	put_uprobe(uprobe);
+
+	return ret;
+}
+
+/*
+ * uprobe_unregister - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+	struct uprobe *uprobe;
+
+	if (!inode || !uc)
+		return;
+
+	uprobe = find_uprobe(inode, offset);
+	if (!uprobe)
+		return;
+
+	mutex_lock(uprobes_hash(inode));
+
+	if (consumer_del(uprobe, uc)) {
+		if (!uprobe->consumers) {
+			__uprobe_unregister(uprobe);
+			uprobe->flags &= ~UPROBE_RUN_HANDLER;
+		}
+	}
+
+	mutex_unlock(uprobes_hash(inode));
+	if (uprobe)
+		put_uprobe(uprobe);
+}
+
+/*
+ * Of all the nodes that correspond to the given inode, return the node
+ * with the least offset.
+ */
+static struct rb_node *find_least_offset_node(struct inode *inode)
+{
+	struct uprobe u = { .inode = inode, .offset = 0};
+	struct rb_node *n = uprobes_tree.rb_node;
+	struct rb_node *close_node = NULL;
+	struct uprobe *uprobe;
+	int match;
+
+	while (n) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		match = match_uprobe(&u, uprobe);
+
+		if (uprobe->inode == inode)
+			close_node = n;
+
+		if (!match)
+			return close_node;
+
+		if (match < 0)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+	}
+
+	return close_node;
+}
+
+/*
+ * For a given inode, build a list of probes that need to be inserted.
+ */
+static void build_probe_list(struct inode *inode, struct list_head *head)
+{
+	struct uprobe *uprobe;
+	unsigned long flags;
+	struct rb_node *n;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+
+	n = find_least_offset_node(inode);
+
+	for (; n; n = rb_next(n)) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		if (uprobe->inode != inode)
+			break;
+
+		list_add(&uprobe->pending_list, head);
+		atomic_inc(&uprobe->ref);
+	}
+
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+}
+
+/*
+ * Called from mmap_region.
+ * called with mm->mmap_sem acquired.
+ *
+ * Return -ve no if we fail to insert probes and we cannot
+ * bail-out.
+ * Return 0 otherwise. i.e:
+ *
+ *	- successful insertion of probes
+ *	- (or) no possible probes to be inserted.
+ *	- (or) insertion of probes failed but we can bail-out.
+ */
+int uprobe_mmap(struct vm_area_struct *vma)
+{
+	struct list_head tmp_list;
+	struct uprobe *uprobe, *u;
+	struct inode *inode;
+	int ret, count;
+
+	if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+		return 0;
+
+	inode = vma->vm_file->f_mapping->host;
+	if (!inode)
+		return 0;
+
+	INIT_LIST_HEAD(&tmp_list);
+	mutex_lock(uprobes_mmap_hash(inode));
+	build_probe_list(inode, &tmp_list);
+
+	ret = 0;
+	count = 0;
+
+	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+		loff_t vaddr;
+
+		list_del(&uprobe->pending_list);
+		if (!ret) {
+			vaddr = vma_address(vma, uprobe->offset);
+
+			if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
+				put_uprobe(uprobe);
+				continue;
+			}
+
+			ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+
+			/* Ignore double add: */
+			if (ret == -EEXIST) {
+				ret = 0;
+
+				if (!is_swbp_at_addr(vma->vm_mm, vaddr))
+					continue;
+
+				/*
+				 * Unable to insert a breakpoint, but
+				 * breakpoint lies underneath. Increment the
+				 * probe count.
+				 */
+				atomic_inc(&vma->vm_mm->uprobes_state.count);
+			}
+
+			if (!ret)
+				count++;
+		}
+		put_uprobe(uprobe);
+	}
+
+	mutex_unlock(uprobes_mmap_hash(inode));
+
+	if (ret)
+		atomic_sub(count, &vma->vm_mm->uprobes_state.count);
+
+	return ret;
+}
+
+/*
+ * Called in context of a munmap of a vma.
+ */
+void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+	struct list_head tmp_list;
+	struct uprobe *uprobe, *u;
+	struct inode *inode;
+
+	if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+		return;
+
+	if (!atomic_read(&vma->vm_mm->uprobes_state.count))
+		return;
+
+	inode = vma->vm_file->f_mapping->host;
+	if (!inode)
+		return;
+
+	INIT_LIST_HEAD(&tmp_list);
+	mutex_lock(uprobes_mmap_hash(inode));
+	build_probe_list(inode, &tmp_list);
+
+	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+		loff_t vaddr;
+
+		list_del(&uprobe->pending_list);
+		vaddr = vma_address(vma, uprobe->offset);
+
+		if (vaddr >= start && vaddr < end) {
+			/*
+			 * An unregister could have removed the probe before
+			 * unmap. So check before we decrement the count.
+			 */
+			if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
+				atomic_dec(&vma->vm_mm->uprobes_state.count);
+		}
+		put_uprobe(uprobe);
+	}
+	mutex_unlock(uprobes_mmap_hash(inode));
+}
+
+/* Slot allocation for XOL */
+static int xol_add_vma(struct xol_area *area)
+{
+	struct mm_struct *mm;
+	int ret;
+
+	area->page = alloc_page(GFP_HIGHUSER);
+	if (!area->page)
+		return -ENOMEM;
+
+	ret = -EALREADY;
+	mm = current->mm;
+
+	down_write(&mm->mmap_sem);
+	if (mm->uprobes_state.xol_area)
+		goto fail;
+
+	ret = -ENOMEM;
+
+	/* Try to map as high as possible, this is only a hint. */
+	area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
+	if (area->vaddr & ~PAGE_MASK) {
+		ret = area->vaddr;
+		goto fail;
+	}
+
+	ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+				VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
+	if (ret)
+		goto fail;
+
+	smp_wmb();	/* pairs with get_xol_area() */
+	mm->uprobes_state.xol_area = area;
+	ret = 0;
+
+fail:
+	up_write(&mm->mmap_sem);
+	if (ret)
+		__free_page(area->page);
+
+	return ret;
+}
+
+static struct xol_area *get_xol_area(struct mm_struct *mm)
+{
+	struct xol_area *area;
+
+	area = mm->uprobes_state.xol_area;
+	smp_read_barrier_depends();	/* pairs with wmb in xol_add_vma() */
+
+	return area;
+}
+
+/*
+ * xol_alloc_area - Allocate process's xol_area.
+ * This area will be used for storing instructions for execution out of
+ * line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *xol_alloc_area(void)
+{
+	struct xol_area *area;
+
+	area = kzalloc(sizeof(*area), GFP_KERNEL);
+	if (unlikely(!area))
+		return NULL;
+
+	area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
+
+	if (!area->bitmap)
+		goto fail;
+
+	init_waitqueue_head(&area->wq);
+	if (!xol_add_vma(area))
+		return area;
+
+fail:
+	kfree(area->bitmap);
+	kfree(area);
+
+	return get_xol_area(current->mm);
+}
+
+/*
+ * uprobe_clear_state - Free the area allocated for slots.
+ */
+void uprobe_clear_state(struct mm_struct *mm)
+{
+	struct xol_area *area = mm->uprobes_state.xol_area;
+
+	if (!area)
+		return;
+
+	put_page(area->page);
+	kfree(area->bitmap);
+	kfree(area);
+}
+
+/*
+ * uprobe_reset_state - Free the area allocated for slots.
+ */
+void uprobe_reset_state(struct mm_struct *mm)
+{
+	mm->uprobes_state.xol_area = NULL;
+	atomic_set(&mm->uprobes_state.count, 0);
+}
+
+/*
+ *  - search for a free slot.
+ */
+static unsigned long xol_take_insn_slot(struct xol_area *area)
+{
+	unsigned long slot_addr;
+	int slot_nr;
+
+	do {
+		slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+		if (slot_nr < UINSNS_PER_PAGE) {
+			if (!test_and_set_bit(slot_nr, area->bitmap))
+				break;
+
+			slot_nr = UINSNS_PER_PAGE;
+			continue;
+		}
+		wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
+	} while (slot_nr >= UINSNS_PER_PAGE);
+
+	slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
+	atomic_inc(&area->slot_count);
+
+	return slot_addr;
+}
+
+/*
+ * xol_get_insn_slot - If was not allocated a slot, then
+ * allocate a slot.
+ * Returns the allocated slot address or 0.
+ */
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
+{
+	struct xol_area *area;
+	unsigned long offset;
+	void *vaddr;
+
+	area = get_xol_area(current->mm);
+	if (!area) {
+		area = xol_alloc_area();
+		if (!area)
+			return 0;
+	}
+	current->utask->xol_vaddr = xol_take_insn_slot(area);
+
+	/*
+	 * Initialize the slot if xol_vaddr points to valid
+	 * instruction slot.
+	 */
+	if (unlikely(!current->utask->xol_vaddr))
+		return 0;
+
+	current->utask->vaddr = slot_addr;
+	offset = current->utask->xol_vaddr & ~PAGE_MASK;
+	vaddr = kmap_atomic(area->page);
+	memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
+	kunmap_atomic(vaddr);
+
+	return current->utask->xol_vaddr;
+}
+
+/*
+ * xol_free_insn_slot - If slot was earlier allocated by
+ * @xol_get_insn_slot(), make the slot available for
+ * subsequent requests.
+ */
+static void xol_free_insn_slot(struct task_struct *tsk)
+{
+	struct xol_area *area;
+	unsigned long vma_end;
+	unsigned long slot_addr;
+
+	if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+		return;
+
+	slot_addr = tsk->utask->xol_vaddr;
+
+	if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
+		return;
+
+	area = tsk->mm->uprobes_state.xol_area;
+	vma_end = area->vaddr + PAGE_SIZE;
+	if (area->vaddr <= slot_addr && slot_addr < vma_end) {
+		unsigned long offset;
+		int slot_nr;
+
+		offset = slot_addr - area->vaddr;
+		slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+		if (slot_nr >= UINSNS_PER_PAGE)
+			return;
+
+		clear_bit(slot_nr, area->bitmap);
+		atomic_dec(&area->slot_count);
+		if (waitqueue_active(&area->wq))
+			wake_up(&area->wq);
+
+		tsk->utask->xol_vaddr = 0;
+	}
+}
+
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
+}
+
+/*
+ * Called with no locks held.
+ * Called in context of a exiting or a exec-ing thread.
+ */
+void uprobe_free_utask(struct task_struct *t)
+{
+	struct uprobe_task *utask = t->utask;
+
+	if (t->uprobe_srcu_id != -1)
+		srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
+
+	if (!utask)
+		return;
+
+	if (utask->active_uprobe)
+		put_uprobe(utask->active_uprobe);
+
+	xol_free_insn_slot(t);
+	kfree(utask);
+	t->utask = NULL;
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t)
+{
+	t->utask = NULL;
+	t->uprobe_srcu_id = -1;
+}
+
+/*
+ * Allocate a uprobe_task object for the task.
+ * Called when the thread hits a breakpoint for the first time.
+ *
+ * Returns:
+ * - pointer to new uprobe_task on success
+ * - NULL otherwise
+ */
+static struct uprobe_task *add_utask(void)
+{
+	struct uprobe_task *utask;
+
+	utask = kzalloc(sizeof *utask, GFP_KERNEL);
+	if (unlikely(!utask))
+		return NULL;
+
+	utask->active_uprobe = NULL;
+	current->utask = utask;
+	return utask;
+}
+
+/* Prepare to single-step probed instruction out of line. */
+static int
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+{
+	if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
+		return 0;
+
+	return -EFAULT;
+}
+
+/*
+ * If we are singlestepping, then ensure this thread is not connected to
+ * non-fatal signals until completion of singlestep. When xol insn itself
+ * triggers the signal, restart the original insn even if the task is
+ * already SIGKILL'ed (since coredump should report the correct ip). This
+ * is even more important if the task has a handler for SIGSEGV/etc, The
+ * _same_ instruction should be repeated again after return from the signal
+ * handler, and SSTEP can never finish in this case.
+ */
+bool uprobe_deny_signal(void)
+{
+	struct task_struct *t = current;
+	struct uprobe_task *utask = t->utask;
+
+	if (likely(!utask || !utask->active_uprobe))
+		return false;
+
+	WARN_ON_ONCE(utask->state != UTASK_SSTEP);
+
+	if (signal_pending(t)) {
+		spin_lock_irq(&t->sighand->siglock);
+		clear_tsk_thread_flag(t, TIF_SIGPENDING);
+		spin_unlock_irq(&t->sighand->siglock);
+
+		if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
+			utask->state = UTASK_SSTEP_TRAPPED;
+			set_tsk_thread_flag(t, TIF_UPROBE);
+			set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Avoid singlestepping the original instruction if the original instruction
+ * is a NOP or can be emulated.
+ */
+static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
+{
+	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+		return true;
+
+	uprobe->flags &= ~UPROBE_SKIP_SSTEP;
+	return false;
+}
+
+/*
+ * Run handler and ask thread to singlestep.
+ * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
+ */
+static void handle_swbp(struct pt_regs *regs)
+{
+	struct vm_area_struct *vma;
+	struct uprobe_task *utask;
+	struct uprobe *uprobe;
+	struct mm_struct *mm;
+	unsigned long bp_vaddr;
+
+	uprobe = NULL;
+	bp_vaddr = uprobe_get_swbp_addr(regs);
+	mm = current->mm;
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, bp_vaddr);
+
+	if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
+		struct inode *inode;
+		loff_t offset;
+
+		inode = vma->vm_file->f_mapping->host;
+		offset = bp_vaddr - vma->vm_start;
+		offset += (vma->vm_pgoff << PAGE_SHIFT);
+		uprobe = find_uprobe(inode, offset);
+	}
+
+	srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
+	current->uprobe_srcu_id = -1;
+	up_read(&mm->mmap_sem);
+
+	if (!uprobe) {
+		/* No matching uprobe; signal SIGTRAP. */
+		send_sig(SIGTRAP, current, 0);
+		return;
+	}
+
+	utask = current->utask;
+	if (!utask) {
+		utask = add_utask();
+		/* Cannot allocate; re-execute the instruction. */
+		if (!utask)
+			goto cleanup_ret;
+	}
+	utask->active_uprobe = uprobe;
+	handler_chain(uprobe, regs);
+	if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs))
+		goto cleanup_ret;
+
+	utask->state = UTASK_SSTEP;
+	if (!pre_ssout(uprobe, regs, bp_vaddr)) {
+		user_enable_single_step(current);
+		return;
+	}
+
+cleanup_ret:
+	if (utask) {
+		utask->active_uprobe = NULL;
+		utask->state = UTASK_RUNNING;
+	}
+	if (uprobe) {
+		if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
+
+			/*
+			 * cannot singlestep; cannot skip instruction;
+			 * re-execute the instruction.
+			 */
+			instruction_pointer_set(regs, bp_vaddr);
+
+		put_uprobe(uprobe);
+	}
+}
1555
1556/*
1557 * Perform required fix-ups and disable singlestep.
1558 * Allow pending signals to take effect.
1559 */
1560static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1561{
1562 struct uprobe *uprobe;
1563
1564 uprobe = utask->active_uprobe;
1565 if (utask->state == UTASK_SSTEP_ACK)
1566 arch_uprobe_post_xol(&uprobe->arch, regs);
1567 else if (utask->state == UTASK_SSTEP_TRAPPED)
1568 arch_uprobe_abort_xol(&uprobe->arch, regs);
1569 else
1570 WARN_ON_ONCE(1);
1571
1572 put_uprobe(uprobe);
1573 utask->active_uprobe = NULL;
1574 utask->state = UTASK_RUNNING;
1575 user_disable_single_step(current);
1576 xol_free_insn_slot(current);
1577
1578 spin_lock_irq(&current->sighand->siglock);
1579 recalc_sigpending(); /* see uprobe_deny_signal() */
1580 spin_unlock_irq(&current->sighand->siglock);
1581}
1582
1583/*
1584 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on
1585 * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and
1586 * allows the thread to return from interrupt.
1587 *
1588 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and
1589 * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from
1590 * interrupt.
1591 *
1592 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1593 * uprobe_notify_resume().
1594 */
1595void uprobe_notify_resume(struct pt_regs *regs)
1596{
1597 struct uprobe_task *utask;
1598
1599 utask = current->utask;
1600 if (!utask || utask->state == UTASK_BP_HIT)
1601 handle_swbp(regs);
1602 else
1603 handle_singlestep(utask, regs);
1604}
1605
1606/*
1607 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
1608 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
1609 */
1610int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1611{
1612 struct uprobe_task *utask;
1613
1614 if (!current->mm || !atomic_read(&current->mm->uprobes_state.count))
1615 /* task is currently not uprobed */
1616 return 0;
1617
1618 utask = current->utask;
1619 if (utask)
1620 utask->state = UTASK_BP_HIT;
1621
1622 set_thread_flag(TIF_UPROBE);
1623 current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
1624
1625 return 1;
1626}
1627
1628/*
1629 * uprobe_post_sstep_notifier gets called in interrupt context as part of the
1630 * notifier mechanism. Set the TIF_UPROBE flag and indicate singlestep completion.
1631 */
1632int uprobe_post_sstep_notifier(struct pt_regs *regs)
1633{
1634 struct uprobe_task *utask = current->utask;
1635
1636 if (!current->mm || !utask || !utask->active_uprobe)
1637 /* task is currently not uprobed */
1638 return 0;
1639
1640 utask->state = UTASK_SSTEP_ACK;
1641 set_thread_flag(TIF_UPROBE);
1642 return 1;
1643}
1644
1645static struct notifier_block uprobe_exception_nb = {
1646 .notifier_call = arch_uprobe_exception_notify,
1647 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
1648};
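The die notifier above only works because each arch translates its trap types into the pre/post singlestep notifiers. A sketch modelled on the x86 arch_uprobe_exception_notify() of this series (details vary per architecture):

	int arch_uprobe_exception_notify(struct notifier_block *self,
					 unsigned long val, void *data)
	{
		struct die_args *args = data;
		struct pt_regs *regs = args->regs;
		int ret = NOTIFY_DONE;

		/* We are only interested in userspace traps. */
		if (regs && !user_mode_vm(regs))
			return NOTIFY_DONE;

		switch (val) {
		case DIE_INT3:		/* breakpoint instruction hit */
			if (uprobe_pre_sstep_notifier(regs))
				ret = NOTIFY_STOP;
			break;
		case DIE_DEBUG:		/* single-step trap */
			if (uprobe_post_sstep_notifier(regs))
				ret = NOTIFY_STOP;
			break;
		default:
			break;
		}
		return ret;
	}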
1649
1650static int __init init_uprobes(void)
1651{
1652 int i;
1653
1654 for (i = 0; i < UPROBES_HASH_SZ; i++) {
1655 mutex_init(&uprobes_mutex[i]);
1656 mutex_init(&uprobes_mmap_mutex[i]);
1657 }
1658 init_srcu_struct(&uprobes_srcu);
1659
1660 return register_die_notifier(&uprobe_exception_nb);
1661}
1662module_init(init_uprobes);
1663
1664static void __exit exit_uprobes(void)
1665{
1666}
1667module_exit(exit_uprobes);
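For context, a hedged sketch of how a kernel-side consumer attaches to the API this file implements; uprobe_register()/uprobe_unregister() and struct uprobe_consumer come from this series, while the helper names are illustrative:

	#include <linux/kernel.h>
	#include <linux/ptrace.h>
	#include <linux/uprobes.h>

	static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
	{
		pr_info("uprobe hit, ip=%lx\n", instruction_pointer(regs));
		return 0;
	}

	static struct uprobe_consumer my_consumer = {
		.handler = my_handler,
	};

	/* @inode: the probed executable/library; @offset: file offset of the insn */
	static int attach_probe(struct inode *inode, loff_t offset)
	{
		return uprobe_register(inode, offset, &my_consumer);
	}

	static void detach_probe(struct inode *inode, loff_t offset)
	{
		uprobe_unregister(inode, offset, &my_consumer);
	}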
diff --git a/kernel/exit.c b/kernel/exit.c
index 3ecd096e5d4d..34867cc5b42a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -884,9 +884,9 @@ static void check_stack_usage(void)
884 884
885 spin_lock(&low_water_lock); 885 spin_lock(&low_water_lock);
886 if (free < lowest_to_date) { 886 if (free < lowest_to_date) {
887 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " 887 printk(KERN_WARNING "%s (%d) used greatest stack depth: "
888 "left\n", 888 "%lu bytes left\n",
889 current->comm, free); 889 current->comm, task_pid_nr(current), free);
890 lowest_to_date = free; 890 lowest_to_date = free;
891 } 891 }
892 spin_unlock(&low_water_lock); 892 spin_unlock(&low_water_lock);
@@ -1215,7 +1215,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1215 unsigned long state; 1215 unsigned long state;
1216 int retval, status, traced; 1216 int retval, status, traced;
1217 pid_t pid = task_pid_vnr(p); 1217 pid_t pid = task_pid_vnr(p);
1218 uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); 1218 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1219 struct siginfo __user *infop; 1219 struct siginfo __user *infop;
1220 1220
1221 if (!likely(wo->wo_flags & WEXITED)) 1221 if (!likely(wo->wo_flags & WEXITED))
diff --git a/kernel/fork.c b/kernel/fork.c
index a46db217a589..ab5211b9e622 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -69,6 +69,7 @@
69#include <linux/oom.h> 69#include <linux/oom.h>
70#include <linux/khugepaged.h> 70#include <linux/khugepaged.h>
71#include <linux/signalfd.h> 71#include <linux/signalfd.h>
72#include <linux/uprobes.h>
72 73
73#include <asm/pgtable.h> 74#include <asm/pgtable.h>
74#include <asm/pgalloc.h> 75#include <asm/pgalloc.h>
@@ -385,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
385 } 386 }
386 charge = 0; 387 charge = 0;
387 if (mpnt->vm_flags & VM_ACCOUNT) { 388 if (mpnt->vm_flags & VM_ACCOUNT) {
388 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 389 unsigned long len;
390 len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
389 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 391 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
390 goto fail_nomem; 392 goto fail_nomem;
391 charge = len; 393 charge = len;
@@ -451,6 +453,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
451 453
452 if (retval) 454 if (retval)
453 goto out; 455 goto out;
456
457 if (file && uprobe_mmap(tmp))
458 goto out;
454 } 459 }
455 /* a new mm has just been created */ 460 /* a new mm has just been created */
456 arch_dup_mmap(oldmm, mm); 461 arch_dup_mmap(oldmm, mm);
@@ -599,6 +604,7 @@ void mmput(struct mm_struct *mm)
599 might_sleep(); 604 might_sleep();
600 605
601 if (atomic_dec_and_test(&mm->mm_users)) { 606 if (atomic_dec_and_test(&mm->mm_users)) {
607 uprobe_clear_state(mm);
602 exit_aio(mm); 608 exit_aio(mm);
603 ksm_exit(mm); 609 ksm_exit(mm);
604 khugepaged_exit(mm); /* must run before exit_mmap */ 610 khugepaged_exit(mm); /* must run before exit_mmap */
@@ -609,7 +615,6 @@ void mmput(struct mm_struct *mm)
609 list_del(&mm->mmlist); 615 list_del(&mm->mmlist);
610 spin_unlock(&mmlist_lock); 616 spin_unlock(&mmlist_lock);
611 } 617 }
612 put_swap_token(mm);
613 if (mm->binfmt) 618 if (mm->binfmt)
614 module_put(mm->binfmt->module); 619 module_put(mm->binfmt->module);
615 mmdrop(mm); 620 mmdrop(mm);
@@ -777,12 +782,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
777 exit_pi_state_list(tsk); 782 exit_pi_state_list(tsk);
778#endif 783#endif
779 784
785 uprobe_free_utask(tsk);
786
780 /* Get rid of any cached register state */ 787 /* Get rid of any cached register state */
781 deactivate_mm(tsk, mm); 788 deactivate_mm(tsk, mm);
782 789
783 if (tsk->vfork_done)
784 complete_vfork_done(tsk);
785
786 /* 790 /*
787 * If we're exiting normally, clear a user-space tid field if 791 * If we're exiting normally, clear a user-space tid field if
788 * requested. We leave this alone when dying by signal, to leave 792 * requested. We leave this alone when dying by signal, to leave
@@ -803,6 +807,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
803 } 807 }
804 tsk->clear_child_tid = NULL; 808 tsk->clear_child_tid = NULL;
805 } 809 }
810
811 /*
812 * All done, finally we can wake up parent and return this mm to him.
813 * Also kthread_stop() uses this completion for synchronization.
814 */
815 if (tsk->vfork_done)
816 complete_vfork_done(tsk);
806} 817}
807 818
808/* 819/*
@@ -824,13 +835,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
824 memcpy(mm, oldmm, sizeof(*mm)); 835 memcpy(mm, oldmm, sizeof(*mm));
825 mm_init_cpumask(mm); 836 mm_init_cpumask(mm);
826 837
827 /* Initializing for Swap token stuff */
828 mm->token_priority = 0;
829 mm->last_interval = 0;
830
831#ifdef CONFIG_TRANSPARENT_HUGEPAGE 838#ifdef CONFIG_TRANSPARENT_HUGEPAGE
832 mm->pmd_huge_pte = NULL; 839 mm->pmd_huge_pte = NULL;
833#endif 840#endif
841 uprobe_reset_state(mm);
834 842
835 if (!mm_init(mm, tsk)) 843 if (!mm_init(mm, tsk))
836 goto fail_nomem; 844 goto fail_nomem;
@@ -905,10 +913,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
905 goto fail_nomem; 913 goto fail_nomem;
906 914
907good_mm: 915good_mm:
908 /* Initializing for Swap token stuff */
909 mm->token_priority = 0;
910 mm->last_interval = 0;
911
912 tsk->mm = mm; 916 tsk->mm = mm;
913 tsk->active_mm = mm; 917 tsk->active_mm = mm;
914 return 0; 918 return 0;
@@ -976,9 +980,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
976 * Share io context with parent, if CLONE_IO is set 980 * Share io context with parent, if CLONE_IO is set
977 */ 981 */
978 if (clone_flags & CLONE_IO) { 982 if (clone_flags & CLONE_IO) {
979 tsk->io_context = ioc_task_link(ioc); 983 ioc_task_link(ioc);
980 if (unlikely(!tsk->io_context)) 984 tsk->io_context = ioc;
981 return -ENOMEM;
982 } else if (ioprio_valid(ioc->ioprio)) { 985 } else if (ioprio_valid(ioc->ioprio)) {
983 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); 986 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
984 if (unlikely(!new_ioc)) 987 if (unlikely(!new_ioc))
@@ -1373,6 +1376,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1373 INIT_LIST_HEAD(&p->pi_state_list); 1376 INIT_LIST_HEAD(&p->pi_state_list);
1374 p->pi_state_cache = NULL; 1377 p->pi_state_cache = NULL;
1375#endif 1378#endif
1379 uprobe_copy_process(p);
1376 /* 1380 /*
1377 * sigaltstack should be cleared when sharing the same VM 1381 * sigaltstack should be cleared when sharing the same VM
1378 */ 1382 */
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 0e0ba5f840b2..41c1564103f1 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) "irq: " fmt
2
1#include <linux/debugfs.h> 3#include <linux/debugfs.h>
2#include <linux/hardirq.h> 4#include <linux/hardirq.h>
3#include <linux/interrupt.h> 5#include <linux/interrupt.h>
@@ -56,14 +58,73 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
56 return domain; 58 return domain;
57} 59}
58 60
61static void irq_domain_free(struct irq_domain *domain)
62{
63 of_node_put(domain->of_node);
64 kfree(domain);
65}
66
59static void irq_domain_add(struct irq_domain *domain) 67static void irq_domain_add(struct irq_domain *domain)
60{ 68{
61 mutex_lock(&irq_domain_mutex); 69 mutex_lock(&irq_domain_mutex);
62 list_add(&domain->link, &irq_domain_list); 70 list_add(&domain->link, &irq_domain_list);
63 mutex_unlock(&irq_domain_mutex); 71 mutex_unlock(&irq_domain_mutex);
64 pr_debug("irq: Allocated domain of type %d @0x%p\n", 72 pr_debug("Allocated domain of type %d @0x%p\n",
73 domain->revmap_type, domain);
74}
75
76/**
77 * irq_domain_remove() - Remove an irq domain.
78 * @domain: domain to remove
79 *
80 * This routine is used to remove an irq domain. The caller must ensure
81 * that all mappings within the domain have been disposed of prior to
82 * use, depending on the revmap type.
83 */
84void irq_domain_remove(struct irq_domain *domain)
85{
86 mutex_lock(&irq_domain_mutex);
87
88 switch (domain->revmap_type) {
89 case IRQ_DOMAIN_MAP_LEGACY:
90 /*
91 * Legacy domains don't manage their own irq_desc
92 * allocations, we expect the caller to handle irq_desc
93 * freeing on their own.
94 */
95 break;
96 case IRQ_DOMAIN_MAP_TREE:
97 /*
98 * radix_tree_delete() takes care of destroying the root
99 * node when all entries are removed. Shout if there are
100 * any mappings left.
101 */
102 WARN_ON(domain->revmap_data.tree.height);
103 break;
104 case IRQ_DOMAIN_MAP_LINEAR:
105 kfree(domain->revmap_data.linear.revmap);
106 domain->revmap_data.linear.size = 0;
107 break;
108 case IRQ_DOMAIN_MAP_NOMAP:
109 break;
110 }
111
112 list_del(&domain->link);
113
114 /*
 115 * If the domain going away is the default one, reset it.
116 */
117 if (unlikely(irq_default_domain == domain))
118 irq_set_default_host(NULL);
119
120 mutex_unlock(&irq_domain_mutex);
121
122 pr_debug("Removed domain of type %d @0x%p\n",
65 domain->revmap_type, domain); 123 domain->revmap_type, domain);
124
125 irq_domain_free(domain);
66} 126}
127EXPORT_SYMBOL_GPL(irq_domain_remove);
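A sketch of the intended pairing: create a domain at probe time, dispose of all mappings, then call the new irq_domain_remove() on teardown. The driver names, the domain size, and the chip choice are illustrative:

	#include <linux/irq.h>
	#include <linux/irqdomain.h>

	static struct irq_domain *my_domain;

	static int my_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw)
	{
		irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
		return 0;
	}

	static const struct irq_domain_ops my_ops = {
		.map	= my_map,
		.xlate	= irq_domain_xlate_onecell,
	};

	static int my_probe(struct device_node *node)
	{
		my_domain = irq_domain_add_linear(node, 32, &my_ops, NULL);
		return my_domain ? 0 : -ENOMEM;
	}

	static void my_teardown(void)
	{
		/* All mappings must already be disposed of, as noted above. */
		irq_domain_remove(my_domain);
	}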
67 128
68static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, 129static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
69 irq_hw_number_t hwirq) 130 irq_hw_number_t hwirq)
@@ -117,8 +178,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
117 178
118 if (WARN_ON(!irq_data || irq_data->domain)) { 179 if (WARN_ON(!irq_data || irq_data->domain)) {
119 mutex_unlock(&irq_domain_mutex); 180 mutex_unlock(&irq_domain_mutex);
120 of_node_put(domain->of_node); 181 irq_domain_free(domain);
121 kfree(domain);
122 return NULL; 182 return NULL;
123 } 183 }
124 } 184 }
@@ -152,10 +212,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
152 irq_domain_add(domain); 212 irq_domain_add(domain);
153 return domain; 213 return domain;
154} 214}
215EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
155 216
156/** 217/**
 157 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. 218 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
158 * @of_node: pointer to interrupt controller's device tree node. 219 * @of_node: pointer to interrupt controller's device tree node.
220 * @size: Number of interrupts in the domain.
159 * @ops: map/unmap domain callbacks 221 * @ops: map/unmap domain callbacks
160 * @host_data: Controller private data pointer 222 * @host_data: Controller private data pointer
161 */ 223 */
@@ -181,6 +243,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
181 irq_domain_add(domain); 243 irq_domain_add(domain);
182 return domain; 244 return domain;
183} 245}
246EXPORT_SYMBOL_GPL(irq_domain_add_linear);
184 247
185struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, 248struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
186 unsigned int max_irq, 249 unsigned int max_irq,
@@ -195,6 +258,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
195 } 258 }
196 return domain; 259 return domain;
197} 260}
261EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
198 262
199/** 263/**
200 * irq_domain_add_tree() 264 * irq_domain_add_tree()
@@ -216,6 +280,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
216 } 280 }
217 return domain; 281 return domain;
218} 282}
283EXPORT_SYMBOL_GPL(irq_domain_add_tree);
219 284
220/** 285/**
221 * irq_find_host() - Locates a domain for a given device node 286 * irq_find_host() - Locates a domain for a given device node
@@ -259,10 +324,11 @@ EXPORT_SYMBOL_GPL(irq_find_host);
259 */ 324 */
260void irq_set_default_host(struct irq_domain *domain) 325void irq_set_default_host(struct irq_domain *domain)
261{ 326{
262 pr_debug("irq: Default domain set to @0x%p\n", domain); 327 pr_debug("Default domain set to @0x%p\n", domain);
263 328
264 irq_default_domain = domain; 329 irq_default_domain = domain;
265} 330}
331EXPORT_SYMBOL_GPL(irq_set_default_host);
266 332
267static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, 333static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
268 irq_hw_number_t hwirq) 334 irq_hw_number_t hwirq)
@@ -272,7 +338,7 @@ static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
272 irq_data->hwirq = hwirq; 338 irq_data->hwirq = hwirq;
273 irq_data->domain = domain; 339 irq_data->domain = domain;
274 if (domain->ops->map(domain, virq, hwirq)) { 340 if (domain->ops->map(domain, virq, hwirq)) {
275 pr_debug("irq: -> mapping failed, freeing\n"); 341 pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq);
276 irq_data->domain = NULL; 342 irq_data->domain = NULL;
277 irq_data->hwirq = 0; 343 irq_data->hwirq = 0;
278 return -1; 344 return -1;
@@ -303,7 +369,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
303 369
304 virq = irq_alloc_desc_from(1, 0); 370 virq = irq_alloc_desc_from(1, 0);
305 if (!virq) { 371 if (!virq) {
306 pr_debug("irq: create_direct virq allocation failed\n"); 372 pr_debug("create_direct virq allocation failed\n");
307 return 0; 373 return 0;
308 } 374 }
309 if (virq >= domain->revmap_data.nomap.max_irq) { 375 if (virq >= domain->revmap_data.nomap.max_irq) {
@@ -312,7 +378,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
312 irq_free_desc(virq); 378 irq_free_desc(virq);
313 return 0; 379 return 0;
314 } 380 }
315 pr_debug("irq: create_direct obtained virq %d\n", virq); 381 pr_debug("create_direct obtained virq %d\n", virq);
316 382
317 if (irq_setup_virq(domain, virq, virq)) { 383 if (irq_setup_virq(domain, virq, virq)) {
318 irq_free_desc(virq); 384 irq_free_desc(virq);
@@ -321,6 +387,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
321 387
322 return virq; 388 return virq;
323} 389}
390EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
324 391
325/** 392/**
326 * irq_create_mapping() - Map a hardware interrupt into linux irq space 393 * irq_create_mapping() - Map a hardware interrupt into linux irq space
@@ -338,23 +405,23 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
338 unsigned int hint; 405 unsigned int hint;
339 int virq; 406 int virq;
340 407
341 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); 408 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
342 409
 343 /* Look for default domain if necessary */ 410 /* Look for default domain if necessary */
344 if (domain == NULL) 411 if (domain == NULL)
345 domain = irq_default_domain; 412 domain = irq_default_domain;
346 if (domain == NULL) { 413 if (domain == NULL) {
347 printk(KERN_WARNING "irq_create_mapping called for" 414 pr_warning("irq_create_mapping called for"
348 " NULL domain, hwirq=%lx\n", hwirq); 415 " NULL domain, hwirq=%lx\n", hwirq);
349 WARN_ON(1); 416 WARN_ON(1);
350 return 0; 417 return 0;
351 } 418 }
352 pr_debug("irq: -> using domain @%p\n", domain); 419 pr_debug("-> using domain @%p\n", domain);
353 420
354 /* Check if mapping already exists */ 421 /* Check if mapping already exists */
355 virq = irq_find_mapping(domain, hwirq); 422 virq = irq_find_mapping(domain, hwirq);
356 if (virq) { 423 if (virq) {
357 pr_debug("irq: -> existing mapping on virq %d\n", virq); 424 pr_debug("-> existing mapping on virq %d\n", virq);
358 return virq; 425 return virq;
359 } 426 }
360 427
@@ -370,7 +437,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
370 if (virq <= 0) 437 if (virq <= 0)
371 virq = irq_alloc_desc_from(1, 0); 438 virq = irq_alloc_desc_from(1, 0);
372 if (virq <= 0) { 439 if (virq <= 0) {
373 pr_debug("irq: -> virq allocation failed\n"); 440 pr_debug("-> virq allocation failed\n");
374 return 0; 441 return 0;
375 } 442 }
376 443
@@ -380,7 +447,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
380 return 0; 447 return 0;
381 } 448 }
382 449
383 pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", 450 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
384 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); 451 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
385 452
386 return virq; 453 return virq;
@@ -409,8 +476,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
409 if (intsize > 0) 476 if (intsize > 0)
410 return intspec[0]; 477 return intspec[0];
411#endif 478#endif
412 printk(KERN_WARNING "irq: no irq domain found for %s !\n", 479 pr_warning("no irq domain found for %s !\n",
413 controller->full_name); 480 controller->full_name);
414 return 0; 481 return 0;
415 } 482 }
416 483
@@ -560,6 +627,7 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
560 */ 627 */
561 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); 628 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
562} 629}
630EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup);
563 631
564/** 632/**
565 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. 633 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
@@ -584,6 +652,7 @@ void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
584 mutex_unlock(&revmap_trees_mutex); 652 mutex_unlock(&revmap_trees_mutex);
585 } 653 }
586} 654}
655EXPORT_SYMBOL_GPL(irq_radix_revmap_insert);
587 656
588/** 657/**
589 * irq_linear_revmap() - Find a linux irq from a hw irq number. 658 * irq_linear_revmap() - Find a linux irq from a hw irq number.
@@ -617,6 +686,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain,
617 686
618 return revmap[hwirq]; 687 return revmap[hwirq];
619} 688}
689EXPORT_SYMBOL_GPL(irq_linear_revmap);
620 690
621#ifdef CONFIG_IRQ_DOMAIN_DEBUG 691#ifdef CONFIG_IRQ_DOMAIN_DEBUG
622static int virq_debug_show(struct seq_file *m, void *private) 692static int virq_debug_show(struct seq_file *m, void *private)
@@ -691,8 +761,8 @@ static int __init irq_debugfs_init(void)
691__initcall(irq_debugfs_init); 761__initcall(irq_debugfs_init);
692#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ 762#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
693 763
694int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, 764static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
695 irq_hw_number_t hwirq) 765 irq_hw_number_t hwirq)
696{ 766{
697 return 0; 767 return 0;
698} 768}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4d1f8f897414..ea0c6c2ae6f7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -7,6 +7,8 @@
7 * This file contains driver APIs to the irq subsystem. 7 * This file contains driver APIs to the irq subsystem.
8 */ 8 */
9 9
10#define pr_fmt(fmt) "genirq: " fmt
11
10#include <linux/irq.h> 12#include <linux/irq.h>
11#include <linux/kthread.h> 13#include <linux/kthread.h>
12#include <linux/module.h> 14#include <linux/module.h>
@@ -566,7 +568,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
566 * IRQF_TRIGGER_* but the PIC does not support multiple 568 * IRQF_TRIGGER_* but the PIC does not support multiple
567 * flow-types? 569 * flow-types?
568 */ 570 */
569 pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, 571 pr_debug("No set_type function for IRQ %d (%s)\n", irq,
570 chip ? (chip->name ? : "unknown") : "unknown"); 572 chip ? (chip->name ? : "unknown") : "unknown");
571 return 0; 573 return 0;
572 } 574 }
@@ -601,7 +603,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
601 ret = 0; 603 ret = 0;
602 break; 604 break;
603 default: 605 default:
604 pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", 606 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
605 flags, irq, chip->irq_set_type); 607 flags, irq, chip->irq_set_type);
606 } 608 }
607 if (unmask) 609 if (unmask)
@@ -785,7 +787,7 @@ static void irq_thread_dtor(struct task_work *unused)
785 787
786 action = kthread_data(tsk); 788 action = kthread_data(tsk);
787 789
788 pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 790 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
789 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 791 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
790 792
791 793
@@ -1042,7 +1044,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1042 * has. The type flags are unreliable as the 1044 * has. The type flags are unreliable as the
1043 * underlying chip implementation can override them. 1045 * underlying chip implementation can override them.
1044 */ 1046 */
1045 pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", 1047 pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1046 irq); 1048 irq);
1047 ret = -EINVAL; 1049 ret = -EINVAL;
1048 goto out_mask; 1050 goto out_mask;
@@ -1093,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1093 1095
1094 if (nmsk != omsk) 1096 if (nmsk != omsk)
1095 /* hope the handler works with current trigger mode */ 1097 /* hope the handler works with current trigger mode */
1096 pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", 1098 pr_warning("irq %d uses trigger mode %u; requested %u\n",
1097 irq, nmsk, omsk); 1099 irq, nmsk, omsk);
1098 } 1100 }
1099 1101
@@ -1131,7 +1133,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1131 1133
1132mismatch: 1134mismatch:
1133 if (!(new->flags & IRQF_PROBE_SHARED)) { 1135 if (!(new->flags & IRQF_PROBE_SHARED)) {
1134 pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", 1136 pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
1135 irq, new->flags, new->name, old->flags, old->name); 1137 irq, new->flags, new->name, old->flags, old->name);
1136#ifdef CONFIG_DEBUG_SHIRQ 1138#ifdef CONFIG_DEBUG_SHIRQ
1137 dump_stack(); 1139 dump_stack();
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 079f1d39a8b8..2169feeba529 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345static int __sprint_symbol(char *buffer, unsigned long address, 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset) 346 int symbol_offset, int add_offset)
347{ 347{
348 char *modname; 348 char *modname;
349 const char *name; 349 const char *name;
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address,
358 if (name != buffer) 358 if (name != buffer)
359 strcpy(buffer, name); 359 strcpy(buffer, name);
360 len = strlen(buffer); 360 len = strlen(buffer);
361 buffer += len;
362 offset -= symbol_offset; 361 offset -= symbol_offset;
363 362
363 if (add_offset)
364 len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
365
364 if (modname) 366 if (modname)
365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); 367 len += sprintf(buffer + len, " [%s]", modname);
366 else
367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
368 368
369 return len; 369 return len;
370} 370}
@@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address,
382 */ 382 */
383int sprint_symbol(char *buffer, unsigned long address) 383int sprint_symbol(char *buffer, unsigned long address)
384{ 384{
385 return __sprint_symbol(buffer, address, 0); 385 return __sprint_symbol(buffer, address, 0, 1);
386} 386}
387
388EXPORT_SYMBOL_GPL(sprint_symbol); 387EXPORT_SYMBOL_GPL(sprint_symbol);
389 388
390/** 389/**
390 * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
391 * @buffer: buffer to be stored
392 * @address: address to lookup
393 *
394 * This function looks up a kernel symbol with @address and stores its name
395 * and module name to @buffer if possible. If no symbol was found, just saves
396 * its @address as is.
397 *
398 * This function returns the number of bytes stored in @buffer.
399 */
400int sprint_symbol_no_offset(char *buffer, unsigned long address)
401{
402 return __sprint_symbol(buffer, address, 0, 0);
403}
404EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
405
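Side by side, the two helpers differ only in the offset suffix; a sketch, with illustrative output strings:

	#include <linux/kallsyms.h>

	static void show(unsigned long addr)
	{
		char buf[KSYM_SYMBOL_LEN];

		sprint_symbol(buf, addr);		/* e.g. "do_fork+0x1c/0x340 [module]" */
		sprint_symbol_no_offset(buf, addr);	/* e.g. "do_fork [module]" */
	}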
406/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer 407 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored 408 * @buffer: buffer to be stored
393 * @address: address to lookup 409 * @address: address to lookup
@@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
403 */ 419 */
404int sprint_backtrace(char *buffer, unsigned long address) 420int sprint_backtrace(char *buffer, unsigned long address)
405{ 421{
406 return __sprint_symbol(buffer, address, -1); 422 return __sprint_symbol(buffer, address, -1, 1);
407} 423}
408 424
409/* Look up a kernel symbol and print it to the kernel messages. */ 425/* Look up a kernel symbol and print it to the kernel messages. */
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
new file mode 100644
index 000000000000..30b7b225306c
--- /dev/null
+++ b/kernel/kcmp.c
@@ -0,0 +1,196 @@
1#include <linux/kernel.h>
2#include <linux/syscalls.h>
3#include <linux/fdtable.h>
4#include <linux/string.h>
5#include <linux/random.h>
6#include <linux/module.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/cache.h>
10#include <linux/bug.h>
11#include <linux/err.h>
12#include <linux/kcmp.h>
13
14#include <asm/unistd.h>
15
16/*
17 * We don't expose the real in-memory order of objects for security reasons.
18 * Still, the comparison results should be suitable for sorting. So we
19 * obfuscate kernel pointer values and compare the products instead.
20 *
21 * The obfuscation is done in two steps. First we xor the kernel pointer with
22 * a random value, which puts the pointer into a new position in a reordered
23 * space. Second, we multiply the xored value by a large odd random number to
24 * permute its bits even more (the odd multiplier guarantees that the product
25 * is unique even after the high bits are truncated, since any odd number is
26 * relatively prime to 2^n).
27 *
28 * Note also that the obfuscation itself is invisible to userspace and, if
29 * needed, it can be changed to an alternate scheme.
30 */
31static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
32
33static long kptr_obfuscate(long v, int type)
34{
35 return (v ^ cookies[type][0]) * cookies[type][1];
36}
37
38/*
39 * 0 - equal, i.e. v1 = v2
40 * 1 - less than, i.e. v1 < v2
41 * 2 - greater than, i.e. v1 > v2
42 * 3 - not equal but ordering unavailable (reserved for future)
43 */
44static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
45{
46 long ret;
47
48 ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
49
50 return (ret < 0) | ((ret > 0) << 1);
51}
52
53/* The caller must have pinned the task */
54static struct file *
55get_file_raw_ptr(struct task_struct *task, unsigned int idx)
56{
57 struct file *file = NULL;
58
59 task_lock(task);
60 rcu_read_lock();
61
62 if (task->files)
63 file = fcheck_files(task->files, idx);
64
65 rcu_read_unlock();
66 task_unlock(task);
67
68 return file;
69}
70
71static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
72{
73 if (likely(m2 != m1))
74 mutex_unlock(m2);
75 mutex_unlock(m1);
76}
77
78static int kcmp_lock(struct mutex *m1, struct mutex *m2)
79{
80 int err;
81
82 if (m2 > m1)
83 swap(m1, m2);
84
85 err = mutex_lock_killable(m1);
86 if (!err && likely(m1 != m2)) {
87 err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
88 if (err)
89 mutex_unlock(m1);
90 }
91
92 return err;
93}
94
95SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
96 unsigned long, idx1, unsigned long, idx2)
97{
98 struct task_struct *task1, *task2;
99 int ret;
100
101 rcu_read_lock();
102
103 /*
 104 * Tasks are looked up in the caller's PID namespace only.
105 */
106 task1 = find_task_by_vpid(pid1);
107 task2 = find_task_by_vpid(pid2);
108 if (!task1 || !task2)
109 goto err_no_task;
110
111 get_task_struct(task1);
112 get_task_struct(task2);
113
114 rcu_read_unlock();
115
116 /*
117 * One should have enough rights to inspect task details.
118 */
119 ret = kcmp_lock(&task1->signal->cred_guard_mutex,
120 &task2->signal->cred_guard_mutex);
121 if (ret)
122 goto err;
123 if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
124 !ptrace_may_access(task2, PTRACE_MODE_READ)) {
125 ret = -EPERM;
126 goto err_unlock;
127 }
128
129 switch (type) {
130 case KCMP_FILE: {
131 struct file *filp1, *filp2;
132
133 filp1 = get_file_raw_ptr(task1, idx1);
134 filp2 = get_file_raw_ptr(task2, idx2);
135
136 if (filp1 && filp2)
137 ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
138 else
139 ret = -EBADF;
140 break;
141 }
142 case KCMP_VM:
143 ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
144 break;
145 case KCMP_FILES:
146 ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
147 break;
148 case KCMP_FS:
149 ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
150 break;
151 case KCMP_SIGHAND:
152 ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
153 break;
154 case KCMP_IO:
155 ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
156 break;
157 case KCMP_SYSVSEM:
158#ifdef CONFIG_SYSVIPC
159 ret = kcmp_ptr(task1->sysvsem.undo_list,
160 task2->sysvsem.undo_list,
161 KCMP_SYSVSEM);
162#else
163 ret = -EOPNOTSUPP;
164#endif
165 break;
166 default:
167 ret = -EINVAL;
168 break;
169 }
170
171err_unlock:
172 kcmp_unlock(&task1->signal->cred_guard_mutex,
173 &task2->signal->cred_guard_mutex);
174err:
175 put_task_struct(task1);
176 put_task_struct(task2);
177
178 return ret;
179
180err_no_task:
181 rcu_read_unlock();
182 return -ESRCH;
183}
184
185static __init int kcmp_cookies_init(void)
186{
187 int i;
188
189 get_random_bytes(cookies, sizeof(cookies));
190
191 for (i = 0; i < KCMP_TYPES; i++)
192 cookies[i][1] |= (~(~0UL >> 1) | 1);
193
194 return 0;
195}
196arch_initcall(kcmp_cookies_init);
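From userspace the new syscall is reached through syscall(2), since no libc wrapper exists yet; a hedged sketch (in this series the syscall is wired up on x86 only):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/kcmp.h>

	static long kcmp(pid_t pid1, pid_t pid2, int type,
			 unsigned long idx1, unsigned long idx2)
	{
		return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
	}

	int main(void)
	{
		pid_t pid = getpid();

		/* 0: same object, 1/2: ordered "less"/"greater", negative: error */
		printf("fd 0 vs fd 1: %ld\n", kcmp(pid, pid, KCMP_FILE, 0, 1));
		printf("mm vs mm:     %ld\n", kcmp(pid, pid, KCMP_VM, 0, 0));
		return 0;
	}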
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index c744b88c44e2..59dcf5b81d24 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
402 return max; 402 return max;
403 return len; 403 return len;
404} 404}
405EXPORT_SYMBOL(__kfifo_max_r);
405 406
406#define __KFIFO_PEEK(data, out, mask) \ 407#define __KFIFO_PEEK(data, out, mask) \
407 ((data)[(out) & (mask)]) 408 ((data)[(out) & (mask)])
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 05698a7415fe..ff2c7cb86d77 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -221,13 +221,12 @@ fail:
221 return 0; 221 return 0;
222} 222}
223 223
224void call_usermodehelper_freeinfo(struct subprocess_info *info) 224static void call_usermodehelper_freeinfo(struct subprocess_info *info)
225{ 225{
226 if (info->cleanup) 226 if (info->cleanup)
227 (*info->cleanup)(info); 227 (*info->cleanup)(info);
228 kfree(info); 228 kfree(info);
229} 229}
230EXPORT_SYMBOL(call_usermodehelper_freeinfo);
231 230
232static void umh_complete(struct subprocess_info *sub_info) 231static void umh_complete(struct subprocess_info *sub_info)
233{ 232{
@@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
410 409
411/** 410/**
412 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. 411 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
413 * depth: New value to assign to usermodehelper_disabled. 412 * @depth: New value to assign to usermodehelper_disabled.
414 * 413 *
415 * Change the value of usermodehelper_disabled (under umhelper_sem locked for 414 * Change the value of usermodehelper_disabled (under umhelper_sem locked for
416 * writing) and wakeup tasks waiting for it to change. 415 * writing) and wakeup tasks waiting for it to change.
@@ -479,6 +478,7 @@ static void helper_unlock(void)
479 * structure. This should be passed to call_usermodehelper_exec to 478 * structure. This should be passed to call_usermodehelper_exec to
480 * exec the process and free the structure. 479 * exec the process and free the structure.
481 */ 480 */
481static
482struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, 482struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
483 char **envp, gfp_t gfp_mask) 483 char **envp, gfp_t gfp_mask)
484{ 484{
@@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
494 out: 494 out:
495 return sub_info; 495 return sub_info;
496} 496}
497EXPORT_SYMBOL(call_usermodehelper_setup);
498 497
499/** 498/**
500 * call_usermodehelper_setfns - set a cleanup/init function 499 * call_usermodehelper_setfns - set a cleanup/init function
@@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
512 * Function must be runnable in either a process context or the 511 * Function must be runnable in either a process context or the
513 * context in which call_usermodehelper_exec is called. 512 * context in which call_usermodehelper_exec is called.
514 */ 513 */
514static
515void call_usermodehelper_setfns(struct subprocess_info *info, 515void call_usermodehelper_setfns(struct subprocess_info *info,
516 int (*init)(struct subprocess_info *info, struct cred *new), 516 int (*init)(struct subprocess_info *info, struct cred *new),
517 void (*cleanup)(struct subprocess_info *info), 517 void (*cleanup)(struct subprocess_info *info),
@@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info,
521 info->init = init; 521 info->init = init;
522 info->data = data; 522 info->data = data;
523} 523}
524EXPORT_SYMBOL(call_usermodehelper_setfns);
525 524
526/** 525/**
527 * call_usermodehelper_exec - start a usermode application 526 * call_usermodehelper_exec - start a usermode application
@@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
535 * asynchronously if wait is not set, and runs as a child of keventd. 534 * asynchronously if wait is not set, and runs as a child of keventd.
536 * (ie. it runs with full root capabilities). 535 * (ie. it runs with full root capabilities).
537 */ 536 */
537static
538int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 538int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
539{ 539{
540 DECLARE_COMPLETION_ONSTACK(done); 540 DECLARE_COMPLETION_ONSTACK(done);
@@ -576,7 +576,25 @@ unlock:
576 helper_unlock(); 576 helper_unlock();
577 return retval; 577 return retval;
578} 578}
579EXPORT_SYMBOL(call_usermodehelper_exec); 579
580int call_usermodehelper_fns(
581 char *path, char **argv, char **envp, int wait,
582 int (*init)(struct subprocess_info *info, struct cred *new),
583 void (*cleanup)(struct subprocess_info *), void *data)
584{
585 struct subprocess_info *info;
586 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
587
588 info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
589
590 if (info == NULL)
591 return -ENOMEM;
592
593 call_usermodehelper_setfns(info, init, cleanup, data);
594
595 return call_usermodehelper_exec(info, wait);
596}
597EXPORT_SYMBOL(call_usermodehelper_fns);
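Callers now make one call instead of the old setup/setfns/exec triple; a hedged usage sketch with an illustrative helper path:

	static int run_my_helper(void)
	{
		char *argv[] = { "/sbin/my-helper", "--oneshot", NULL };
		char *envp[] = { "HOME=/",
				 "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

		/* init/cleanup callbacks and private data are optional */
		return call_usermodehelper_fns(argv[0], argv, envp,
					       UMH_WAIT_PROC, NULL, NULL, NULL);
	}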
580 598
581static int proc_cap_handler(struct ctl_table *table, int write, 599static int proc_cap_handler(struct ctl_table *table, int write,
582 void __user *buffer, size_t *lenp, loff_t *ppos) 600 void __user *buffer, size_t *lenp, loff_t *ppos)
diff --git a/kernel/pid.c b/kernel/pid.c
index 9f08dfabaf13..e86b291ad834 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -547,7 +547,8 @@ void __init pidhash_init(void)
547 547
548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
549 HASH_EARLY | HASH_SMALL, 549 HASH_EARLY | HASH_SMALL,
550 &pidhash_shift, NULL, 4096); 550 &pidhash_shift, NULL,
551 0, 4096);
551 pidhash_size = 1U << pidhash_shift; 552 pidhash_size = 1U << pidhash_shift;
552 553
553 for (i = 0; i < pidhash_size; i++) 554 for (i = 0; i < pidhash_size; i++)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 57bc1fd35b3c..16b20e38c4a1 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
149{ 149{
150 int nr; 150 int nr;
151 int rc; 151 int rc;
152 struct task_struct *task; 152 struct task_struct *task, *me = current;
153
154 /* Ignore SIGCHLD causing any terminated children to autoreap */
155 spin_lock_irq(&me->sighand->siglock);
156 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
157 spin_unlock_irq(&me->sighand->siglock);
153 158
154 /* 159 /*
155 * The last thread in the cgroup-init thread group is terminating. 160 * The last thread in the cgroup-init thread group is terminating.
@@ -191,6 +196,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 196 return;
192} 197}
193 198
199#ifdef CONFIG_CHECKPOINT_RESTORE
194static int pid_ns_ctl_handler(struct ctl_table *table, int write, 200static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos) 201 void __user *buffer, size_t *lenp, loff_t *ppos)
196{ 202{
@@ -218,8 +224,8 @@ static struct ctl_table pid_ns_ctl_table[] = {
218 }, 224 },
219 { } 225 { }
220}; 226};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; 227static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
228#endif /* CONFIG_CHECKPOINT_RESTORE */
223 229
224int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) 230int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
225{ 231{
@@ -253,7 +259,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
253static __init int pid_namespaces_init(void) 259static __init int pid_namespaces_init(void)
254{ 260{
255 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 261 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
262
263#ifdef CONFIG_CHECKPOINT_RESTORE
256 register_sysctl_paths(kern_path, pid_ns_ctl_table); 264 register_sysctl_paths(kern_path, pid_ns_ctl_table);
265#endif
257 return 0; 266 return 0;
258} 267}
259 268
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bebe2b170d49..ad581aa2369a 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -94,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
94 counter->usage -= val; 94 counter->usage -= val;
95} 95}
96 96
97void res_counter_uncharge(struct res_counter *counter, unsigned long val) 97void res_counter_uncharge_until(struct res_counter *counter,
98 struct res_counter *top,
99 unsigned long val)
98{ 100{
99 unsigned long flags; 101 unsigned long flags;
100 struct res_counter *c; 102 struct res_counter *c;
101 103
102 local_irq_save(flags); 104 local_irq_save(flags);
103 for (c = counter; c != NULL; c = c->parent) { 105 for (c = counter; c != top; c = c->parent) {
104 spin_lock(&c->lock); 106 spin_lock(&c->lock);
105 res_counter_uncharge_locked(c, val); 107 res_counter_uncharge_locked(c, val);
106 spin_unlock(&c->lock); 108 spin_unlock(&c->lock);
@@ -108,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
108 local_irq_restore(flags); 110 local_irq_restore(flags);
109} 111}
110 112
113void res_counter_uncharge(struct res_counter *counter, unsigned long val)
114{
115 res_counter_uncharge_until(counter, NULL, val);
116}
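The _until variant exists so a charge can be migrated rather than fully released: uncharge from a child up to, but excluding, an ancestor that keeps the usage. A hedged sketch of that reparenting pattern (names illustrative):

	static void reparent_usage(struct res_counter *child, unsigned long val)
	{
		/* Uncharges child..(parent exclusive); the parent keeps the charge. */
		res_counter_uncharge_until(child, child->parent, val);
	}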
111 117
112static inline unsigned long long * 118static inline unsigned long long *
113res_counter_member(struct res_counter *counter, int member) 119res_counter_member(struct res_counter *counter, int member)
diff --git a/kernel/resource.c b/kernel/resource.c
index 7e8ea66a8c01..e1d2b8ee76d5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -515,8 +515,8 @@ out:
515 * @root: root resource descriptor 515 * @root: root resource descriptor
516 * @new: resource descriptor desired by caller 516 * @new: resource descriptor desired by caller
517 * @size: requested resource region size 517 * @size: requested resource region size
518 * @min: minimum size to allocate 518 * @min: minimum boundary to allocate
519 * @max: maximum size to allocate 519 * @max: maximum boundary to allocate
520 * @align: alignment requested, in bytes 520 * @align: alignment requested, in bytes
521 * @alignf: alignment function, optional, called if not NULL 521 * @alignf: alignment function, optional, called if not NULL
522 * @alignf_data: arbitrary data to pass to the @alignf function 522 * @alignf_data: arbitrary data to pass to the @alignf function
diff --git a/kernel/signal.c b/kernel/signal.c
index 4dbf00dfb359..08dfbd748cd2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -29,6 +29,7 @@
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h> 31#include <linux/user_namespace.h>
32#include <linux/uprobes.h>
32#define CREATE_TRACE_POINTS 33#define CREATE_TRACE_POINTS
33#include <trace/events/signal.h> 34#include <trace/events/signal.h>
34 35
@@ -1655,19 +1656,18 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1655 info.si_signo = sig; 1656 info.si_signo = sig;
1656 info.si_errno = 0; 1657 info.si_errno = 0;
1657 /* 1658 /*
1658 * we are under tasklist_lock here so our parent is tied to 1659 * We are under tasklist_lock here so our parent is tied to
1659 * us and cannot exit and release its namespace. 1660 * us and cannot change.
1660 * 1661 *
1661 * the only it can is to switch its nsproxy with sys_unshare, 1662 * task_active_pid_ns will always return the same pid namespace
1662 * bu uncharing pid namespaces is not allowed, so we'll always 1663 * until a task passes through release_task.
1663 * see relevant namespace
1664 * 1664 *
1665 * write_lock() currently calls preempt_disable() which is the 1665 * write_lock() currently calls preempt_disable() which is the
1666 * same as rcu_read_lock(), but according to Oleg, this is not 1666 * same as rcu_read_lock(), but according to Oleg, this is not
1667 * correct to rely on this 1667 * correct to rely on this
1668 */ 1668 */
1669 rcu_read_lock(); 1669 rcu_read_lock();
1670 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1670 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
1671 info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), 1671 info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
1672 task_uid(tsk)); 1672 task_uid(tsk));
1673 rcu_read_unlock(); 1673 rcu_read_unlock();
@@ -2191,6 +2191,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2191 struct signal_struct *signal = current->signal; 2191 struct signal_struct *signal = current->signal;
2192 int signr; 2192 int signr;
2193 2193
2194 if (unlikely(uprobe_deny_signal()))
2195 return 0;
2196
2194relock: 2197relock:
2195 /* 2198 /*
2196 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2199 * We'll jump back here after any time we were stopped in TASK_STOPPED.
diff --git a/kernel/sys.c b/kernel/sys.c
index 6df42624e454..9ff89cb9657a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -36,6 +36,8 @@
36#include <linux/personality.h> 36#include <linux/personality.h>
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/file.h>
40#include <linux/mount.h>
39#include <linux/gfp.h> 41#include <linux/gfp.h>
40#include <linux/syscore_ops.h> 42#include <linux/syscore_ops.h>
41#include <linux/version.h> 43#include <linux/version.h>
@@ -1378,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1378 memcpy(u->nodename, tmp, len); 1380 memcpy(u->nodename, tmp, len);
1379 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1381 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1380 errno = 0; 1382 errno = 0;
1383 uts_proc_notify(UTS_PROC_HOSTNAME);
1381 } 1384 }
1382 uts_proc_notify(UTS_PROC_HOSTNAME);
1383 up_write(&uts_sem); 1385 up_write(&uts_sem);
1384 return errno; 1386 return errno;
1385} 1387}
@@ -1429,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1429 memcpy(u->domainname, tmp, len); 1431 memcpy(u->domainname, tmp, len);
1430 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1432 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1431 errno = 0; 1433 errno = 0;
1434 uts_proc_notify(UTS_PROC_DOMAINNAME);
1432 } 1435 }
1433 uts_proc_notify(UTS_PROC_DOMAINNAME);
1434 up_write(&uts_sem); 1436 up_write(&uts_sem);
1435 return errno; 1437 return errno;
1436} 1438}
@@ -1784,77 +1786,102 @@ SYSCALL_DEFINE1(umask, int, mask)
1784} 1786}
1785 1787
1786#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static bool vma_flags_mismatch(struct vm_area_struct *vma,
1790 unsigned long required,
1791 unsigned long banned)
1792{
1793 return (vma->vm_flags & required) != required ||
1794 (vma->vm_flags & banned);
1795}
1796
1797static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1798{
1799 struct file *exe_file;
1800 struct dentry *dentry;
1801 int err;
1802
1803 /*
1804 * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's
1805 * remain. So perform a quick test first.
1806 */
1807 if (mm->num_exe_file_vmas)
1808 return -EBUSY;
1809
1810 exe_file = fget(fd);
1811 if (!exe_file)
1812 return -EBADF;
1813
1814 dentry = exe_file->f_path.dentry;
1815
1816 /*
1817 * Because the original mm->exe_file points to an executable file, make
1818 * sure the new one is executable as well, so the overall picture stays
1819 * consistent.
1820 */
1821 err = -EACCES;
1822 if (!S_ISREG(dentry->d_inode->i_mode) ||
1823 exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1824 goto exit;
1825
1826 err = inode_permission(dentry->d_inode, MAY_EXEC);
1827 if (err)
1828 goto exit;
1829
1830 /*
1831 * The symlink can be changed only once, to disallow the arbitrary
1832 * transitions malicious software might attempt. This means one can
1833 * snapshot all running processes and monitor /proc/pid/exe changes
1834 * to notice unusual activity if needed.
1835 */
1836 down_write(&mm->mmap_sem);
1837 if (likely(!mm->exe_file))
1838 set_mm_exe_file(mm, exe_file);
1839 else
1840 err = -EBUSY;
1841 up_write(&mm->mmap_sem);
1842
1843exit:
1844 fput(exe_file);
1845 return err;
1846}
1847
1787static int prctl_set_mm(int opt, unsigned long addr, 1848static int prctl_set_mm(int opt, unsigned long addr,
1788 unsigned long arg4, unsigned long arg5) 1849 unsigned long arg4, unsigned long arg5)
1789{ 1850{
1790 unsigned long rlim = rlimit(RLIMIT_DATA); 1851 unsigned long rlim = rlimit(RLIMIT_DATA);
1791 unsigned long vm_req_flags;
1792 unsigned long vm_bad_flags;
1793 struct vm_area_struct *vma;
1794 int error = 0;
1795 struct mm_struct *mm = current->mm; 1852 struct mm_struct *mm = current->mm;
1853 struct vm_area_struct *vma;
1854 int error;
1796 1855
1797 if (arg4 | arg5) 1856 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
1798 return -EINVAL; 1857 return -EINVAL;
1799 1858
1800 if (!capable(CAP_SYS_RESOURCE)) 1859 if (!capable(CAP_SYS_RESOURCE))
1801 return -EPERM; 1860 return -EPERM;
1802 1861
1862 if (opt == PR_SET_MM_EXE_FILE)
1863 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1864
1803 if (addr >= TASK_SIZE) 1865 if (addr >= TASK_SIZE)
1804 return -EINVAL; 1866 return -EINVAL;
1805 1867
1868 error = -EINVAL;
1869
1806 down_read(&mm->mmap_sem); 1870 down_read(&mm->mmap_sem);
1807 vma = find_vma(mm, addr); 1871 vma = find_vma(mm, addr);
1808 1872
1809 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1810 /* It must be existing VMA */
1811 if (!vma || vma->vm_start > addr)
1812 goto out;
1813 }
1814
1815 error = -EINVAL;
1816 switch (opt) { 1873 switch (opt) {
1817 case PR_SET_MM_START_CODE: 1874 case PR_SET_MM_START_CODE:
1875 mm->start_code = addr;
1876 break;
1818 case PR_SET_MM_END_CODE: 1877 case PR_SET_MM_END_CODE:
1819 vm_req_flags = VM_READ | VM_EXEC; 1878 mm->end_code = addr;
1820 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1821
1822 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1823 (vma->vm_flags & vm_bad_flags))
1824 goto out;
1825
1826 if (opt == PR_SET_MM_START_CODE)
1827 mm->start_code = addr;
1828 else
1829 mm->end_code = addr;
1830 break; 1879 break;
1831
1832 case PR_SET_MM_START_DATA: 1880 case PR_SET_MM_START_DATA:
1833 case PR_SET_MM_END_DATA: 1881 mm->start_data = addr;
1834 vm_req_flags = VM_READ | VM_WRITE;
1835 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1836
1837 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1838 (vma->vm_flags & vm_bad_flags))
1839 goto out;
1840
1841 if (opt == PR_SET_MM_START_DATA)
1842 mm->start_data = addr;
1843 else
1844 mm->end_data = addr;
1845 break; 1882 break;
1846 1883 case PR_SET_MM_END_DATA:
1847 case PR_SET_MM_START_STACK: 1884 mm->end_data = addr;
1848
1849#ifdef CONFIG_STACK_GROWSUP
1850 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1851#else
1852 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1853#endif
1854 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1855 goto out;
1856
1857 mm->start_stack = addr;
1858 break; 1885 break;
1859 1886
1860 case PR_SET_MM_START_BRK: 1887 case PR_SET_MM_START_BRK:
@@ -1881,16 +1908,77 @@ static int prctl_set_mm(int opt, unsigned long addr,
1881 mm->brk = addr; 1908 mm->brk = addr;
1882 break; 1909 break;
1883 1910
1911 /*
1912 * If the command line arguments and environment
1913 * are placed somewhere else on the stack, we can
1914 * set them up here: ARG_START/END to set up the
1915 * command line arguments and ENV_START/END for
1916 * the environment.
1917 */
1918 case PR_SET_MM_START_STACK:
1919 case PR_SET_MM_ARG_START:
1920 case PR_SET_MM_ARG_END:
1921 case PR_SET_MM_ENV_START:
1922 case PR_SET_MM_ENV_END:
1923 if (!vma) {
1924 error = -EFAULT;
1925 goto out;
1926 }
1927#ifdef CONFIG_STACK_GROWSUP
1928 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
1929#else
1930 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
1931#endif
1932 goto out;
1933 if (opt == PR_SET_MM_START_STACK)
1934 mm->start_stack = addr;
1935 else if (opt == PR_SET_MM_ARG_START)
1936 mm->arg_start = addr;
1937 else if (opt == PR_SET_MM_ARG_END)
1938 mm->arg_end = addr;
1939 else if (opt == PR_SET_MM_ENV_START)
1940 mm->env_start = addr;
1941 else if (opt == PR_SET_MM_ENV_END)
1942 mm->env_end = addr;
1943 break;
1944
1945 /*
1946 * This doesn't move the auxiliary vector itself
1947 * since it's pinned to mm_struct, but it allows
1948 * filling the vector with new values. It's up
1949 * to the caller to provide sane values here,
1950 * otherwise userspace tools which use this
1951 * vector might misbehave.
1952 */
1953 case PR_SET_MM_AUXV: {
1954 unsigned long user_auxv[AT_VECTOR_SIZE];
1955
1956 if (arg4 > sizeof(user_auxv))
1957 goto out;
1958 up_read(&mm->mmap_sem);
1959
1960 if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
1961 return -EFAULT;
1962
1963 /* Make sure the last entry is always AT_NULL */
1964 user_auxv[AT_VECTOR_SIZE - 2] = 0;
1965 user_auxv[AT_VECTOR_SIZE - 1] = 0;
1966
1967 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1968
1969 task_lock(current);
1970 memcpy(mm->saved_auxv, user_auxv, arg4);
1971 task_unlock(current);
1972
1973 return 0;
1974 }
1884 default: 1975 default:
1885 error = -EINVAL;
1886 goto out; 1976 goto out;
1887 } 1977 }
1888 1978
1889 error = 0; 1979 error = 0;
1890
1891out: 1980out:
1892 up_read(&mm->mmap_sem); 1981 up_read(&mm->mmap_sem);
1893
1894 return error; 1982 return error;
1895} 1983}
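From the restorer's side this is driven via prctl(2); a hedged sketch with illustrative addresses (note that arg4/arg5 must be zero except for PR_SET_MM_AUXV):

	#include <sys/prctl.h>
	#include <linux/prctl.h>	/* PR_SET_MM_* constants from this series */

	static int restore_mm_layout(unsigned long arg_start,
				     unsigned long arg_end, int exe_fd)
	{
		if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, arg_start, 0, 0))
			return -1;
		if (prctl(PR_SET_MM, PR_SET_MM_ARG_END, arg_end, 0, 0))
			return -1;
		/* One-shot, and only while no VM_EXECUTABLE vmas remain (see above). */
		return prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, exe_fd, 0, 0);
	}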
1896#else /* CONFIG_CHECKPOINT_RESTORE */ 1984#else /* CONFIG_CHECKPOINT_RESTORE */
@@ -2114,7 +2202,6 @@ int orderly_poweroff(bool force)
2114 NULL 2202 NULL
2115 }; 2203 };
2116 int ret = -ENOMEM; 2204 int ret = -ENOMEM;
2117 struct subprocess_info *info;
2118 2205
2119 if (argv == NULL) { 2206 if (argv == NULL) {
2120 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2207 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
@@ -2122,18 +2209,16 @@ int orderly_poweroff(bool force)
2122 goto out; 2209 goto out;
2123 } 2210 }
2124 2211
2125 info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); 2212 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
2126 if (info == NULL) { 2213 NULL, argv_cleanup, NULL);
2127 argv_free(argv); 2214out:
2128 goto out; 2215 if (likely(!ret))
2129 } 2216 return 0;
2130
2131 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
2132 2217
2133 ret = call_usermodehelper_exec(info, UMH_NO_WAIT); 2218 if (ret == -ENOMEM)
2219 argv_free(argv);
2134 2220
2135 out: 2221 if (force) {
2136 if (ret && force) {
2137 printk(KERN_WARNING "Failed to start orderly shutdown: " 2222 printk(KERN_WARNING "Failed to start orderly shutdown: "
2138 "forcing the issue\n"); 2223 "forcing the issue\n");
2139 2224
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16430d7..dbff751e4086 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark);
203cond_syscall(sys_name_to_handle_at); 203cond_syscall(sys_name_to_handle_at);
204cond_syscall(sys_open_by_handle_at); 204cond_syscall(sys_open_by_handle_at);
205cond_syscall(compat_sys_open_by_handle_at); 205cond_syscall(compat_sys_open_by_handle_at);
206
207/* compare kernel pointers */
208cond_syscall(sys_kcmp);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a20dc8a3c949..fd42bd452b75 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -2,6 +2,55 @@
2# Timer subsystem related configuration options 2# Timer subsystem related configuration options
3# 3#
4 4
5# Options selectable by arch Kconfig
6
7# Watchdog function for clocksources to detect instabilities
8config CLOCKSOURCE_WATCHDOG
9 bool
10
11# Architecture has extra clocksource data
12config ARCH_CLOCKSOURCE_DATA
13 bool
14
15# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL
17 bool
18
19# ktime_t scalar 64bit nsec representation
20config KTIME_SCALAR
21 bool
22
23# Old style timekeeping
24config ARCH_USES_GETTIMEOFFSET
25 bool
26
27# The generic clock events infrastructure
28config GENERIC_CLOCKEVENTS
29 bool
30
31# Migration helper. Builds, but does not invoke
32config GENERIC_CLOCKEVENTS_BUILD
33 bool
34 default y
35 depends on GENERIC_CLOCKEVENTS
36
37# Clockevents broadcasting infrastructure
38config GENERIC_CLOCKEVENTS_BROADCAST
39 bool
40 depends on GENERIC_CLOCKEVENTS
41
42# Automatically adjust the min. reprogramming time for
43# clock event device
44config GENERIC_CLOCKEVENTS_MIN_ADJUST
45 bool
46
47# Generic update of CMOS clock
48config GENERIC_CMOS_UPDATE
49 bool
50
51if GENERIC_CLOCKEVENTS
52menu "Timers subsystem"
53
5# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is 54# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
6# only related to the tick functionality. Oneshot clockevent devices 55# only related to the tick functionality. Oneshot clockevent devices
 7# are supported independent of this. 56# are supported independent of this.
@@ -26,10 +75,5 @@ config HIGH_RES_TIMERS
26 hardware is not capable then this option only increases 75 hardware is not capable then this option only increases
27 the size of the kernel image. 76 the size of the kernel image.
28 77
29config GENERIC_CLOCKEVENTS_BUILD 78endmenu
30 bool 79endif
31 default y
32 depends on GENERIC_CLOCKEVENTS
33
34config GENERIC_CLOCKEVENTS_MIN_ADJUST
35 bool
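None of the switches above are user-selectable; an architecture opts in from its own Kconfig, along these illustrative lines:

	config MY_ARCH
		bool
		select GENERIC_CLOCKEVENTS
		select GENERIC_CLOCKEVENTS_BROADCAST
		select GENERIC_CMOS_UPDATE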
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f03fd83b170b..70b33abcc7bb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -412,6 +412,7 @@ int second_overflow(unsigned long secs)
412 if (secs % 86400 == 0) { 412 if (secs % 86400 == 0) {
413 leap = -1; 413 leap = -1;
414 time_state = TIME_OOP; 414 time_state = TIME_OOP;
415 time_tai++;
415 printk(KERN_NOTICE 416 printk(KERN_NOTICE
416 "Clock: inserting leap second 23:59:60 UTC\n"); 417 "Clock: inserting leap second 23:59:60 UTC\n");
417 } 418 }
@@ -426,7 +427,6 @@ int second_overflow(unsigned long secs)
426 } 427 }
427 break; 428 break;
428 case TIME_OOP: 429 case TIME_OOP:
429 time_tai++;
430 time_state = TIME_WAIT; 430 time_state = TIME_WAIT;
431 break; 431 break;
432 432
@@ -473,8 +473,6 @@ int second_overflow(unsigned long secs)
473 << NTP_SCALE_SHIFT; 473 << NTP_SCALE_SHIFT;
474 time_adjust = 0; 474 time_adjust = 0;
475 475
476
477
478out: 476out:
479 spin_unlock_irqrestore(&ntp_lock, flags); 477 spin_unlock_irqrestore(&ntp_lock, flags);
480 478
@@ -559,10 +557,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
559 /* only set allowed bits */ 557 /* only set allowed bits */
560 time_status &= STA_RONLY; 558 time_status &= STA_RONLY;
561 time_status |= txc->status & ~STA_RONLY; 559 time_status |= txc->status & ~STA_RONLY;
562
563} 560}
561
564/* 562/*
565 * Called with the xtime lock held, so we can access and modify 563 * Called with ntp_lock held, so we can access and modify
566 * all the global NTP state: 564 * all the global NTP state:
567 */ 565 */
568static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) 566static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
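
The net effect of the ntp.c hunks is that time_tai is bumped when the leap second is actually inserted (on entry to TIME_OOP) rather than when the state machine later leaves that state, so the TAI offset reported to user space is already correct during second 23:59:60. A small query sketch using only standard adjtimex(2) fields; struct timex's tai member is assumed available:

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;
	int state;

	memset(&tx, 0, sizeof(tx));	/* modes = 0: read-only query */
	state = adjtimex(&tx);
	if (state < 0) {
		perror("adjtimex");
		return 1;
	}

	/* TIME_OOP is returned while the leap second is being
	 * inserted; tx.tai now already holds the new offset. */
	printf("state=%d (TIME_OOP=%d) tai=%d\n", state, TIME_OOP, tx.tai);
	return 0;
}
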
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d66b21308f7c..6e46cacf5969 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -240,7 +240,6 @@ void getnstimeofday(struct timespec *ts)
240 240
241 timespec_add_ns(ts, nsecs); 241 timespec_add_ns(ts, nsecs);
242} 242}
243
244EXPORT_SYMBOL(getnstimeofday); 243EXPORT_SYMBOL(getnstimeofday);
245 244
246ktime_t ktime_get(void) 245ktime_t ktime_get(void)
@@ -357,8 +356,8 @@ void do_gettimeofday(struct timeval *tv)
357 tv->tv_sec = now.tv_sec; 356 tv->tv_sec = now.tv_sec;
358 tv->tv_usec = now.tv_nsec/1000; 357 tv->tv_usec = now.tv_nsec/1000;
359} 358}
360
361EXPORT_SYMBOL(do_gettimeofday); 359EXPORT_SYMBOL(do_gettimeofday);
360
362/** 361/**
363 * do_settimeofday - Sets the time of day 362 * do_settimeofday - Sets the time of day
364 * @tv: pointer to the timespec variable containing the new time 363 * @tv: pointer to the timespec variable containing the new time
@@ -392,7 +391,6 @@ int do_settimeofday(const struct timespec *tv)
392 391
393 return 0; 392 return 0;
394} 393}
395
396EXPORT_SYMBOL(do_settimeofday); 394EXPORT_SYMBOL(do_settimeofday);
397 395
398 396
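
The timekeeping.c changes are pure layout cleanup: stray blank lines are dropped so each EXPORT_SYMBOL() sits directly beneath the function it exports, the kernel's usual convention. For reference, a minimal module sketch consuming two of those exports (illustrative only, not part of the patch):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/time.h>

static int __init ts_demo_init(void)
{
	struct timespec ts;
	struct timeval tv;

	getnstimeofday(&ts);	/* nanosecond-resolution wall time */
	do_gettimeofday(&tv);	/* microsecond-resolution wall time */

	pr_info("ts=%ld.%09ld tv=%ld.%06ld\n",
		ts.tv_sec, ts.tv_nsec, tv.tv_sec, tv.tv_usec);
	return 0;
}

static void __exit ts_demo_exit(void)
{
}

module_init(ts_demo_init);
module_exit(ts_demo_exit);
MODULE_LICENSE("GPL");
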
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f347ac91292d..8c4c07071cc5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -372,6 +372,7 @@ config KPROBE_EVENT
372 depends on HAVE_REGS_AND_STACK_ACCESS_API 372 depends on HAVE_REGS_AND_STACK_ACCESS_API
373 bool "Enable kprobes-based dynamic events" 373 bool "Enable kprobes-based dynamic events"
374 select TRACING 374 select TRACING
375 select PROBE_EVENTS
375 default y 376 default y
376 help 377 help
377 This allows the user to add tracing events (similar to tracepoints) 378 This allows the user to add tracing events (similar to tracepoints)
@@ -384,6 +385,25 @@ config KPROBE_EVENT
384 This option is also required by perf-probe subcommand of perf tools. 385 This option is also required by perf-probe subcommand of perf tools.
385 If you want to use perf tools, this option is strongly recommended. 386 If you want to use perf tools, this option is strongly recommended.
386 387
388config UPROBE_EVENT
389 bool "Enable uprobes-based dynamic events"
390 depends on ARCH_SUPPORTS_UPROBES
391 depends on MMU
392 select UPROBES
393 select PROBE_EVENTS
394 select TRACING
395 default n
396 help
 397 This allows the user to add tracing events (similar to
 398 tracepoints) on top of user space applications on the fly via
 399 the trace events interface. Those events can be inserted
 400 wherever uprobes can probe, and can record various registers.
 401 This option is required if you plan to use the perf-probe
 402 subcommand of perf tools on user space applications.
403
404config PROBE_EVENTS
405 def_bool n
406
387config DYNAMIC_FTRACE 407config DYNAMIC_FTRACE
388 bool "enable/disable ftrace tracepoints dynamically" 408 bool "enable/disable ftrace tracepoints dynamically"
389 depends on FUNCTION_TRACER 409 depends on FUNCTION_TRACER
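
UPROBE_EVENT wires the new user-space probes into the same trace-events interface kprobes use; the control file is uprobe_events in the tracing directory, registered by trace_uprobe.c later in this diff. A user-space sketch of adding one event, assuming debugfs is mounted at /sys/kernel/debug; the /bin/bash path and the 0x4245c0 offset are made-up placeholders (resolve a real symbol offset with nm or objdump), and per this series only %register fetch arguments are accepted:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical target: probe /bin/bash at offset 0x4245c0,
	 * recording the %ax register at that point */
	const char *cmd = "p:myuprobe /bin/bash:0x4245c0 %ax\n";
	int fd = open("/sys/kernel/debug/tracing/uprobe_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);
	return 0;
}
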
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index b3afe0e76f79..b831087c8200 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -60,5 +60,7 @@ endif
60ifeq ($(CONFIG_TRACING),y) 60ifeq ($(CONFIG_TRACING),y)
61obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 61obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
62endif 62endif
63obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
64obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
63 65
64libftrace-y := ftrace.o 66libftrace-y := ftrace.o
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6420cda62336..1d0f6a8a0e5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1486,6 +1486,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1486 if (!buffer) 1486 if (!buffer)
1487 return size; 1487 return size;
1488 1488
1489 /* Make sure the requested buffer exists */
1490 if (cpu_id != RING_BUFFER_ALL_CPUS &&
1491 !cpumask_test_cpu(cpu_id, buffer->cpumask))
1492 return size;
1493
1489 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1494 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1490 size *= BUF_PAGE_SIZE; 1495 size *= BUF_PAGE_SIZE;
1491 1496
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6c6f7933eede..5aec220d2de0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -103,6 +103,11 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 103 unsigned long ret_ip;
104}; 104};
105 105
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
106/* 111/*
107 * trace_flag_type is an enumeration that holds different 112 * trace_flag_type is an enumeration that holds different
108 * states when a trace occurs. These are: 113 * states when a trace occurs. These are:
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 580a05ec926b..b31d3d5699fe 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,547 +19,15 @@
19 19
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <asm/bitsperlong.h>
35
36#include "trace.h"
37#include "trace_output.h"
38
39#define MAX_TRACE_ARGS 128
40#define MAX_ARGSTR_LEN 63
41#define MAX_EVENT_NAME_LEN 64
42#define MAX_STRING_SIZE PATH_MAX
43#define KPROBE_EVENT_SYSTEM "kprobes"
44
45/* Reserved field names */
46#define FIELD_STRING_IP "__probe_ip"
47#define FIELD_STRING_RETIP "__probe_ret_ip"
48#define FIELD_STRING_FUNC "__probe_func"
49
50const char *reserved_field_names[] = {
51 "common_type",
52 "common_flags",
53 "common_preempt_count",
54 "common_pid",
55 "common_tgid",
56 FIELD_STRING_IP,
57 FIELD_STRING_RETIP,
58 FIELD_STRING_FUNC,
59};
60
61/* Printing function type */
62typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
63 void *);
64#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
65#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
66
67/* Printing in basic type function template */
68#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
69static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
70 const char *name, \
71 void *data, void *ent)\
72{ \
73 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
74} \
75static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
76
77DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
78DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
80DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
82DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
83DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
84DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
85
86/* data_rloc: data relative location, compatible with u32 */
87#define make_data_rloc(len, roffs) \
88 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
89#define get_rloc_len(dl) ((u32)(dl) >> 16)
90#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
91
92static inline void *get_rloc_data(u32 *dl)
93{
94 return (u8 *)dl + get_rloc_offs(*dl);
95}
96
97/* For data_loc conversion */
98static inline void *get_loc_data(u32 *dl, void *ent)
99{
100 return (u8 *)ent + get_rloc_offs(*dl);
101}
102
103/*
104 * Convert data_rloc to data_loc:
105 * data_rloc stores the offset from data_rloc itself, but data_loc
106 * stores the offset from event entry.
107 */
108#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
109
110/* For defining macros, define string/string_size types */
111typedef u32 string;
112typedef u32 string_size;
113
114/* Print type function for string type */
115static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
116 const char *name,
117 void *data, void *ent)
118{
119 int len = *(u32 *)data >> 16;
120
121 if (!len)
122 return trace_seq_printf(s, " %s=(fault)", name);
123 else
124 return trace_seq_printf(s, " %s=\"%s\"", name,
125 (const char *)get_loc_data(data, ent));
126}
127static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
128
129/* Data fetch function type */
130typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
131
132struct fetch_param {
133 fetch_func_t fn;
134 void *data;
135};
136
137static __kprobes void call_fetch(struct fetch_param *fprm,
138 struct pt_regs *regs, void *dest)
139{
140 return fprm->fn(regs, fprm->data, dest);
141}
142
143#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
144/*
145 * Define macro for basic types - we don't need to define s* types,
146 * because we only need to care about the bit width at recording time.
147 */
148#define DEFINE_BASIC_FETCH_FUNCS(method) \
149DEFINE_FETCH_##method(u8) \
150DEFINE_FETCH_##method(u16) \
151DEFINE_FETCH_##method(u32) \
152DEFINE_FETCH_##method(u64)
153
154#define CHECK_FETCH_FUNCS(method, fn) \
155 (((FETCH_FUNC_NAME(method, u8) == fn) || \
156 (FETCH_FUNC_NAME(method, u16) == fn) || \
157 (FETCH_FUNC_NAME(method, u32) == fn) || \
158 (FETCH_FUNC_NAME(method, u64) == fn) || \
159 (FETCH_FUNC_NAME(method, string) == fn) || \
160 (FETCH_FUNC_NAME(method, string_size) == fn)) \
161 && (fn != NULL))
162
163/* Data fetch function templates */
164#define DEFINE_FETCH_reg(type) \
165static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
166 void *offset, void *dest) \
167{ \
168 *(type *)dest = (type)regs_get_register(regs, \
169 (unsigned int)((unsigned long)offset)); \
170}
171DEFINE_BASIC_FETCH_FUNCS(reg)
172/* No string on the register */
173#define fetch_reg_string NULL
174#define fetch_reg_string_size NULL
175
176#define DEFINE_FETCH_stack(type) \
177static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
178 void *offset, void *dest) \
179{ \
180 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
181 (unsigned int)((unsigned long)offset)); \
182}
183DEFINE_BASIC_FETCH_FUNCS(stack)
184/* No string on the stack entry */
185#define fetch_stack_string NULL
186#define fetch_stack_string_size NULL
187
188#define DEFINE_FETCH_retval(type) \
189static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
190 void *dummy, void *dest) \
191{ \
192 *(type *)dest = (type)regs_return_value(regs); \
193}
194DEFINE_BASIC_FETCH_FUNCS(retval)
195/* No string on the retval */
196#define fetch_retval_string NULL
197#define fetch_retval_string_size NULL
198
199#define DEFINE_FETCH_memory(type) \
200static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
201 void *addr, void *dest) \
202{ \
203 type retval; \
204 if (probe_kernel_address(addr, retval)) \
205 *(type *)dest = 0; \
206 else \
207 *(type *)dest = retval; \
208}
209DEFINE_BASIC_FETCH_FUNCS(memory)
210/*
211 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
212 * length and relative data location.
213 */
214static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
215 void *addr, void *dest)
216{
217 long ret;
218 int maxlen = get_rloc_len(*(u32 *)dest);
219 u8 *dst = get_rloc_data(dest);
220 u8 *src = addr;
221 mm_segment_t old_fs = get_fs();
222 if (!maxlen)
223 return;
224 /*
225 * Try to get the string again, since the string can change while
226 * we are probing.
227 */
228 set_fs(KERNEL_DS);
229 pagefault_disable();
230 do
231 ret = __copy_from_user_inatomic(dst++, src++, 1);
232 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
233 dst[-1] = '\0';
234 pagefault_enable();
235 set_fs(old_fs);
236
237 if (ret < 0) { /* Failed to fetch string */
238 ((u8 *)get_rloc_data(dest))[0] = '\0';
239 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
240 } else
241 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
242 get_rloc_offs(*(u32 *)dest));
243}
244/* Return the length of the string -- including the null terminator byte */
245static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
246 void *addr, void *dest)
247{
248 int ret, len = 0;
249 u8 c;
250 mm_segment_t old_fs = get_fs();
251
252 set_fs(KERNEL_DS);
253 pagefault_disable();
254 do {
255 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
256 len++;
257 } while (c && ret == 0 && len < MAX_STRING_SIZE);
258 pagefault_enable();
259 set_fs(old_fs);
260
261 if (ret < 0) /* Failed to check the length */
262 *(u32 *)dest = 0;
263 else
264 *(u32 *)dest = len;
265}
266
267/* Memory fetching by symbol */
268struct symbol_cache {
269 char *symbol;
270 long offset;
271 unsigned long addr;
272};
273
274static unsigned long update_symbol_cache(struct symbol_cache *sc)
275{
276 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
277 if (sc->addr)
278 sc->addr += sc->offset;
279 return sc->addr;
280}
281
282static void free_symbol_cache(struct symbol_cache *sc)
283{
284 kfree(sc->symbol);
285 kfree(sc);
286}
287
288static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
289{
290 struct symbol_cache *sc;
291
292 if (!sym || strlen(sym) == 0)
293 return NULL;
294 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
295 if (!sc)
296 return NULL;
297
298 sc->symbol = kstrdup(sym, GFP_KERNEL);
299 if (!sc->symbol) {
300 kfree(sc);
301 return NULL;
302 }
303 sc->offset = offset;
304 22
305 update_symbol_cache(sc); 23#include "trace_probe.h"
306 return sc;
307}
308
309#define DEFINE_FETCH_symbol(type) \
310static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
311 void *data, void *dest) \
312{ \
313 struct symbol_cache *sc = data; \
314 if (sc->addr) \
315 fetch_memory_##type(regs, (void *)sc->addr, dest); \
316 else \
317 *(type *)dest = 0; \
318}
319DEFINE_BASIC_FETCH_FUNCS(symbol)
320DEFINE_FETCH_symbol(string)
321DEFINE_FETCH_symbol(string_size)
322
323/* Dereference memory access function */
324struct deref_fetch_param {
325 struct fetch_param orig;
326 long offset;
327};
328
329#define DEFINE_FETCH_deref(type) \
330static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
331 void *data, void *dest) \
332{ \
333 struct deref_fetch_param *dprm = data; \
334 unsigned long addr; \
335 call_fetch(&dprm->orig, regs, &addr); \
336 if (addr) { \
337 addr += dprm->offset; \
338 fetch_memory_##type(regs, (void *)addr, dest); \
339 } else \
340 *(type *)dest = 0; \
341}
342DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size)
345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
355{
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 free_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 free_symbol_cache(data->orig.data);
360 kfree(data);
361}
362
363/* Bitfield fetch function */
364struct bitfield_fetch_param {
365 struct fetch_param orig;
366 unsigned char hi_shift;
367 unsigned char low_shift;
368};
369 24
370#define DEFINE_FETCH_bitfield(type) \ 25#define KPROBE_EVENT_SYSTEM "kprobes"
371static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
372 void *data, void *dest) \
373{ \
374 struct bitfield_fetch_param *bprm = data; \
375 type buf = 0; \
376 call_fetch(&bprm->orig, regs, &buf); \
377 if (buf) { \
378 buf <<= bprm->hi_shift; \
379 buf >>= bprm->low_shift; \
380 } \
381 *(type *)dest = buf; \
382}
383DEFINE_BASIC_FETCH_FUNCS(bitfield)
384#define fetch_bitfield_string NULL
385#define fetch_bitfield_string_size NULL
386
387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
402{
403 /*
404 * Don't check the bitfield itself, because this must be the
405 * last fetch function.
406 */
407 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
408 free_deref_fetch_param(data->orig.data);
409 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
410 free_symbol_cache(data->orig.data);
411 kfree(data);
412}
413
414/* Default (unsigned long) fetch type */
415#define __DEFAULT_FETCH_TYPE(t) u##t
416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
417#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
418#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
419
420/* Fetch types */
421enum {
422 FETCH_MTD_reg = 0,
423 FETCH_MTD_stack,
424 FETCH_MTD_retval,
425 FETCH_MTD_memory,
426 FETCH_MTD_symbol,
427 FETCH_MTD_deref,
428 FETCH_MTD_bitfield,
429 FETCH_MTD_END,
430};
431
432#define ASSIGN_FETCH_FUNC(method, type) \
433 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
434
435#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
436 {.name = _name, \
437 .size = _size, \
438 .is_signed = sign, \
439 .print = PRINT_TYPE_FUNC_NAME(ptype), \
440 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
441 .fmttype = _fmttype, \
442 .fetch = { \
443ASSIGN_FETCH_FUNC(reg, ftype), \
444ASSIGN_FETCH_FUNC(stack, ftype), \
445ASSIGN_FETCH_FUNC(retval, ftype), \
446ASSIGN_FETCH_FUNC(memory, ftype), \
447ASSIGN_FETCH_FUNC(symbol, ftype), \
448ASSIGN_FETCH_FUNC(deref, ftype), \
449ASSIGN_FETCH_FUNC(bitfield, ftype), \
450 } \
451 }
452
453#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
454 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
455
456#define FETCH_TYPE_STRING 0
457#define FETCH_TYPE_STRSIZE 1
458
459/* Fetch type information table */
460static const struct fetch_type {
461 const char *name; /* Name of type */
462 size_t size; /* Byte size of type */
463 int is_signed; /* Signed flag */
464 print_type_func_t print; /* Print functions */
465 const char *fmt; /* Format string */
466 const char *fmttype; /* Name in format file */
467 /* Fetch functions */
468 fetch_func_t fetch[FETCH_MTD_END];
469} fetch_type_table[] = {
470 /* Special types */
471 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
472 sizeof(u32), 1, "__data_loc char[]"),
473 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
474 string_size, sizeof(u32), 0, "u32"),
475 /* Basic types */
476 ASSIGN_FETCH_TYPE(u8, u8, 0),
477 ASSIGN_FETCH_TYPE(u16, u16, 0),
478 ASSIGN_FETCH_TYPE(u32, u32, 0),
479 ASSIGN_FETCH_TYPE(u64, u64, 0),
480 ASSIGN_FETCH_TYPE(s8, u8, 1),
481 ASSIGN_FETCH_TYPE(s16, u16, 1),
482 ASSIGN_FETCH_TYPE(s32, u32, 1),
483 ASSIGN_FETCH_TYPE(s64, u64, 1),
484};
485
486static const struct fetch_type *find_fetch_type(const char *type)
487{
488 int i;
489
490 if (!type)
491 type = DEFAULT_FETCH_TYPE_STR;
492
493 /* Special case: bitfield */
494 if (*type == 'b') {
495 unsigned long bs;
496 type = strchr(type, '/');
497 if (!type)
498 goto fail;
499 type++;
500 if (strict_strtoul(type, 0, &bs))
501 goto fail;
502 switch (bs) {
503 case 8:
504 return find_fetch_type("u8");
505 case 16:
506 return find_fetch_type("u16");
507 case 32:
508 return find_fetch_type("u32");
509 case 64:
510 return find_fetch_type("u64");
511 default:
512 goto fail;
513 }
514 }
515
516 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
517 if (strcmp(type, fetch_type_table[i].name) == 0)
518 return &fetch_type_table[i];
519fail:
520 return NULL;
521}
522
523/* Special function: only accepts unsigned long */
524static __kprobes void fetch_stack_address(struct pt_regs *regs,
525 void *dummy, void *dest)
526{
527 *(unsigned long *)dest = kernel_stack_pointer(regs);
528}
529
530static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
531 fetch_func_t orig_fn)
532{
533 int i;
534
535 if (type != &fetch_type_table[FETCH_TYPE_STRING])
536 return NULL; /* Only string type needs size function */
537 for (i = 0; i < FETCH_MTD_END; i++)
538 if (type->fetch[i] == orig_fn)
539 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
540
541 WARN_ON(1); /* This should not happen */
542 return NULL;
543}
544 26
545/** 27/**
546 * Kprobe event core functions 28 * Kprobe event core functions
547 */ 29 */
548 30
549struct probe_arg {
550 struct fetch_param fetch;
551 struct fetch_param fetch_size;
552 unsigned int offset; /* Offset from argument entry */
553 const char *name; /* Name of this argument */
554 const char *comm; /* Command of this argument */
555 const struct fetch_type *type; /* Type of this argument */
556};
557
558/* Flags for trace_probe */
559#define TP_FLAG_TRACE 1
560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
562
563struct trace_probe { 31struct trace_probe {
564 struct list_head list; 32 struct list_head list;
565 struct kretprobe rp; /* Use rp.kp for kprobe use */ 33 struct kretprobe rp; /* Use rp.kp for kprobe use */
@@ -631,18 +99,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
631static int kretprobe_dispatcher(struct kretprobe_instance *ri, 99static int kretprobe_dispatcher(struct kretprobe_instance *ri,
632 struct pt_regs *regs); 100 struct pt_regs *regs);
633 101
634/* Check the name is good for event/group/fields */
635static int is_good_name(const char *name)
636{
637 if (!isalpha(*name) && *name != '_')
638 return 0;
639 while (*++name != '\0') {
640 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
641 return 0;
642 }
643 return 1;
644}
645
646/* 102/*
647 * Allocate new trace_probe and initialize it (including kprobes). 103 * Allocate new trace_probe and initialize it (including kprobes).
648 */ 104 */
@@ -651,7 +107,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
651 void *addr, 107 void *addr,
652 const char *symbol, 108 const char *symbol,
653 unsigned long offs, 109 unsigned long offs,
654 int nargs, int is_return) 110 int nargs, bool is_return)
655{ 111{
656 struct trace_probe *tp; 112 struct trace_probe *tp;
657 int ret = -ENOMEM; 113 int ret = -ENOMEM;
@@ -702,34 +158,12 @@ error:
702 return ERR_PTR(ret); 158 return ERR_PTR(ret);
703} 159}
704 160
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
715static void free_probe_arg(struct probe_arg *arg)
716{
717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
718 free_bitfield_fetch_param(arg->fetch.data);
719 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
720 free_deref_fetch_param(arg->fetch.data);
721 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
722 free_symbol_cache(arg->fetch.data);
723 kfree(arg->name);
724 kfree(arg->comm);
725}
726
727static void free_trace_probe(struct trace_probe *tp) 161static void free_trace_probe(struct trace_probe *tp)
728{ 162{
729 int i; 163 int i;
730 164
731 for (i = 0; i < tp->nr_args; i++) 165 for (i = 0; i < tp->nr_args; i++)
732 free_probe_arg(&tp->args[i]); 166 traceprobe_free_probe_arg(&tp->args[i]);
733 167
734 kfree(tp->call.class->system); 168 kfree(tp->call.class->system);
735 kfree(tp->call.name); 169 kfree(tp->call.name);
@@ -787,7 +221,7 @@ static int __register_trace_probe(struct trace_probe *tp)
787 return -EINVAL; 221 return -EINVAL;
788 222
789 for (i = 0; i < tp->nr_args; i++) 223 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]); 224 traceprobe_update_arg(&tp->args[i]);
791 225
792 /* Set/clear disabled flag according to tp->flag */ 226 /* Set/clear disabled flag according to tp->flag */
793 if (trace_probe_is_enabled(tp)) 227 if (trace_probe_is_enabled(tp))
@@ -919,227 +353,6 @@ static struct notifier_block trace_probe_module_nb = {
919 .priority = 1 /* Invoked after kprobe module callback */ 353 .priority = 1 /* Invoked after kprobe module callback */
920}; 354};
921 355
922/* Split symbol and offset. */
923static int split_symbol_offset(char *symbol, unsigned long *offset)
924{
925 char *tmp;
926 int ret;
927
928 if (!offset)
929 return -EINVAL;
930
931 tmp = strchr(symbol, '+');
932 if (tmp) {
933 /* skip sign because strict_strtol doesn't accept '+' */
934 ret = strict_strtoul(tmp + 1, 0, offset);
935 if (ret)
936 return ret;
937 *tmp = '\0';
938 } else
939 *offset = 0;
940 return 0;
941}
942
943#define PARAM_MAX_ARGS 16
944#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
945
946static int parse_probe_vars(char *arg, const struct fetch_type *t,
947 struct fetch_param *f, int is_return)
948{
949 int ret = 0;
950 unsigned long param;
951
952 if (strcmp(arg, "retval") == 0) {
953 if (is_return)
954 f->fn = t->fetch[FETCH_MTD_retval];
955 else
956 ret = -EINVAL;
957 } else if (strncmp(arg, "stack", 5) == 0) {
958 if (arg[5] == '\0') {
959 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
960 f->fn = fetch_stack_address;
961 else
962 ret = -EINVAL;
963 } else if (isdigit(arg[5])) {
964 ret = strict_strtoul(arg + 5, 10, &param);
965 if (ret || param > PARAM_MAX_STACK)
966 ret = -EINVAL;
967 else {
968 f->fn = t->fetch[FETCH_MTD_stack];
969 f->data = (void *)param;
970 }
971 } else
972 ret = -EINVAL;
973 } else
974 ret = -EINVAL;
975 return ret;
976}
977
978/* Recursive argument parser */
979static int __parse_probe_arg(char *arg, const struct fetch_type *t,
980 struct fetch_param *f, int is_return)
981{
982 int ret = 0;
983 unsigned long param;
984 long offset;
985 char *tmp;
986
987 switch (arg[0]) {
988 case '$':
989 ret = parse_probe_vars(arg + 1, t, f, is_return);
990 break;
991 case '%': /* named register */
992 ret = regs_query_register_offset(arg + 1);
993 if (ret >= 0) {
994 f->fn = t->fetch[FETCH_MTD_reg];
995 f->data = (void *)(unsigned long)ret;
996 ret = 0;
997 }
998 break;
999 case '@': /* memory or symbol */
1000 if (isdigit(arg[1])) {
1001 ret = strict_strtoul(arg + 1, 0, &param);
1002 if (ret)
1003 break;
1004 f->fn = t->fetch[FETCH_MTD_memory];
1005 f->data = (void *)param;
1006 } else {
1007 ret = split_symbol_offset(arg + 1, &offset);
1008 if (ret)
1009 break;
1010 f->data = alloc_symbol_cache(arg + 1, offset);
1011 if (f->data)
1012 f->fn = t->fetch[FETCH_MTD_symbol];
1013 }
1014 break;
1015 case '+': /* deref memory */
1016 arg++; /* Skip '+', because strict_strtol() rejects it. */
1017 case '-':
1018 tmp = strchr(arg, '(');
1019 if (!tmp)
1020 break;
1021 *tmp = '\0';
1022 ret = strict_strtol(arg, 0, &offset);
1023 if (ret)
1024 break;
1025 arg = tmp + 1;
1026 tmp = strrchr(arg, ')');
1027 if (tmp) {
1028 struct deref_fetch_param *dprm;
1029 const struct fetch_type *t2 = find_fetch_type(NULL);
1030 *tmp = '\0';
1031 dprm = kzalloc(sizeof(struct deref_fetch_param),
1032 GFP_KERNEL);
1033 if (!dprm)
1034 return -ENOMEM;
1035 dprm->offset = offset;
1036 ret = __parse_probe_arg(arg, t2, &dprm->orig,
1037 is_return);
1038 if (ret)
1039 kfree(dprm);
1040 else {
1041 f->fn = t->fetch[FETCH_MTD_deref];
1042 f->data = (void *)dprm;
1043 }
1044 }
1045 break;
1046 }
1047 if (!ret && !f->fn) { /* Parsed, but no fetch method found */
1048 pr_info("%s type has no corresponding fetch method.\n",
1049 t->name);
1050 ret = -EINVAL;
1051 }
1052 return ret;
1053}
1054
1055#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
1056
1057/* Bitfield type needs to be parsed into a fetch function */
1058static int __parse_bitfield_probe_arg(const char *bf,
1059 const struct fetch_type *t,
1060 struct fetch_param *f)
1061{
1062 struct bitfield_fetch_param *bprm;
1063 unsigned long bw, bo;
1064 char *tail;
1065
1066 if (*bf != 'b')
1067 return 0;
1068
1069 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1070 if (!bprm)
1071 return -ENOMEM;
1072 bprm->orig = *f;
1073 f->fn = t->fetch[FETCH_MTD_bitfield];
1074 f->data = (void *)bprm;
1075
1076 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
1077 if (bw == 0 || *tail != '@')
1078 return -EINVAL;
1079
1080 bf = tail + 1;
1081 bo = simple_strtoul(bf, &tail, 0);
1082 if (tail == bf || *tail != '/')
1083 return -EINVAL;
1084
1085 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
1086 bprm->low_shift = bprm->hi_shift + bo;
1087 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
1088}
1089
1090/* String length checking wrapper */
1091static int parse_probe_arg(char *arg, struct trace_probe *tp,
1092 struct probe_arg *parg, int is_return)
1093{
1094 const char *t;
1095 int ret;
1096
1097 if (strlen(arg) > MAX_ARGSTR_LEN) {
1098 pr_info("Argument is too long.: %s\n", arg);
1099 return -ENOSPC;
1100 }
1101 parg->comm = kstrdup(arg, GFP_KERNEL);
1102 if (!parg->comm) {
1103 pr_info("Failed to allocate memory for command '%s'.\n", arg);
1104 return -ENOMEM;
1105 }
1106 t = strchr(parg->comm, ':');
1107 if (t) {
1108 arg[t - parg->comm] = '\0';
1109 t++;
1110 }
1111 parg->type = find_fetch_type(t);
1112 if (!parg->type) {
1113 pr_info("Unsupported type: %s\n", t);
1114 return -EINVAL;
1115 }
1116 parg->offset = tp->size;
1117 tp->size += parg->type->size;
1118 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
1119 if (ret >= 0 && t != NULL)
1120 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
1121 if (ret >= 0) {
1122 parg->fetch_size.fn = get_fetch_size_function(parg->type,
1123 parg->fetch.fn);
1124 parg->fetch_size.data = parg->fetch.data;
1125 }
1126 return ret;
1127}
1128
1129/* Return 1 if name is reserved or already used by another argument */
1130static int conflict_field_name(const char *name,
1131 struct probe_arg *args, int narg)
1132{
1133 int i;
1134 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
1135 if (strcmp(reserved_field_names[i], name) == 0)
1136 return 1;
1137 for (i = 0; i < narg; i++)
1138 if (strcmp(args[i].name, name) == 0)
1139 return 1;
1140 return 0;
1141}
1142
1143static int create_trace_probe(int argc, char **argv) 356static int create_trace_probe(int argc, char **argv)
1144{ 357{
1145 /* 358 /*
@@ -1162,7 +375,7 @@ static int create_trace_probe(int argc, char **argv)
1162 */ 375 */
1163 struct trace_probe *tp; 376 struct trace_probe *tp;
1164 int i, ret = 0; 377 int i, ret = 0;
1165 int is_return = 0, is_delete = 0; 378 bool is_return = false, is_delete = false;
1166 char *symbol = NULL, *event = NULL, *group = NULL; 379 char *symbol = NULL, *event = NULL, *group = NULL;
1167 char *arg; 380 char *arg;
1168 unsigned long offset = 0; 381 unsigned long offset = 0;
@@ -1171,11 +384,11 @@ static int create_trace_probe(int argc, char **argv)
1171 384
1172 /* argc must be >= 1 */ 385 /* argc must be >= 1 */
1173 if (argv[0][0] == 'p') 386 if (argv[0][0] == 'p')
1174 is_return = 0; 387 is_return = false;
1175 else if (argv[0][0] == 'r') 388 else if (argv[0][0] == 'r')
1176 is_return = 1; 389 is_return = true;
1177 else if (argv[0][0] == '-') 390 else if (argv[0][0] == '-')
1178 is_delete = 1; 391 is_delete = true;
1179 else { 392 else {
1180 pr_info("Probe definition must be started with 'p', 'r' or" 393 pr_info("Probe definition must be started with 'p', 'r' or"
1181 " '-'.\n"); 394 " '-'.\n");
@@ -1240,7 +453,7 @@ static int create_trace_probe(int argc, char **argv)
1240 /* a symbol specified */ 453 /* a symbol specified */
1241 symbol = argv[1]; 454 symbol = argv[1];
1242 /* TODO: support .init module functions */ 455 /* TODO: support .init module functions */
1243 ret = split_symbol_offset(symbol, &offset); 456 ret = traceprobe_split_symbol_offset(symbol, &offset);
1244 if (ret) { 457 if (ret) {
1245 pr_info("Failed to parse symbol.\n"); 458 pr_info("Failed to parse symbol.\n");
1246 return ret; 459 return ret;
@@ -1302,7 +515,8 @@ static int create_trace_probe(int argc, char **argv)
1302 goto error; 515 goto error;
1303 } 516 }
1304 517
1305 if (conflict_field_name(tp->args[i].name, tp->args, i)) { 518 if (traceprobe_conflict_field_name(tp->args[i].name,
519 tp->args, i)) {
1306 pr_info("Argument[%d] name '%s' conflicts with " 520 pr_info("Argument[%d] name '%s' conflicts with "
1307 "another field.\n", i, argv[i]); 521 "another field.\n", i, argv[i]);
1308 ret = -EINVAL; 522 ret = -EINVAL;
@@ -1310,7 +524,8 @@ static int create_trace_probe(int argc, char **argv)
1310 } 524 }
1311 525
1312 /* Parse fetch argument */ 526 /* Parse fetch argument */
1313 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); 527 ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i],
528 is_return, true);
1314 if (ret) { 529 if (ret) {
1315 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 530 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
1316 goto error; 531 goto error;
@@ -1412,70 +627,11 @@ static int probes_open(struct inode *inode, struct file *file)
1412 return seq_open(file, &probes_seq_op); 627 return seq_open(file, &probes_seq_op);
1413} 628}
1414 629
1415static int command_trace_probe(const char *buf)
1416{
1417 char **argv;
1418 int argc = 0, ret = 0;
1419
1420 argv = argv_split(GFP_KERNEL, buf, &argc);
1421 if (!argv)
1422 return -ENOMEM;
1423
1424 if (argc)
1425 ret = create_trace_probe(argc, argv);
1426
1427 argv_free(argv);
1428 return ret;
1429}
1430
1431#define WRITE_BUFSIZE 4096
1432
1433static ssize_t probes_write(struct file *file, const char __user *buffer, 630static ssize_t probes_write(struct file *file, const char __user *buffer,
1434 size_t count, loff_t *ppos) 631 size_t count, loff_t *ppos)
1435{ 632{
1436 char *kbuf, *tmp; 633 return traceprobe_probes_write(file, buffer, count, ppos,
1437 int ret; 634 create_trace_probe);
1438 size_t done;
1439 size_t size;
1440
1441 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
1442 if (!kbuf)
1443 return -ENOMEM;
1444
1445 ret = done = 0;
1446 while (done < count) {
1447 size = count - done;
1448 if (size >= WRITE_BUFSIZE)
1449 size = WRITE_BUFSIZE - 1;
1450 if (copy_from_user(kbuf, buffer + done, size)) {
1451 ret = -EFAULT;
1452 goto out;
1453 }
1454 kbuf[size] = '\0';
1455 tmp = strchr(kbuf, '\n');
1456 if (tmp) {
1457 *tmp = '\0';
1458 size = tmp - kbuf + 1;
1459 } else if (done + size < count) {
1460 pr_warning("Line length is too long: "
1461 "Should be less than %d.", WRITE_BUFSIZE);
1462 ret = -EINVAL;
1463 goto out;
1464 }
1465 done += size;
1466 /* Remove comments */
1467 tmp = strchr(kbuf, '#');
1468 if (tmp)
1469 *tmp = '\0';
1470
1471 ret = command_trace_probe(kbuf);
1472 if (ret)
1473 goto out;
1474 }
1475 ret = done;
1476out:
1477 kfree(kbuf);
1478 return ret;
1479} 635}
1480 636
1481static const struct file_operations kprobe_events_ops = { 637static const struct file_operations kprobe_events_ops = {
@@ -1711,16 +867,6 @@ partial:
1711 return TRACE_TYPE_PARTIAL_LINE; 867 return TRACE_TYPE_PARTIAL_LINE;
1712} 868}
1713 869
1714#undef DEFINE_FIELD
1715#define DEFINE_FIELD(type, item, name, is_signed) \
1716 do { \
1717 ret = trace_define_field(event_call, #type, name, \
1718 offsetof(typeof(field), item), \
1719 sizeof(field.item), is_signed, \
1720 FILTER_OTHER); \
1721 if (ret) \
1722 return ret; \
1723 } while (0)
1724 870
1725static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 871static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1726{ 872{
@@ -2051,8 +1197,9 @@ static __init int kprobe_trace_self_tests_init(void)
2051 1197
2052 pr_info("Testing kprobe tracing: "); 1198 pr_info("Testing kprobe tracing: ");
2053 1199
2054 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1200 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
2055 "$stack $stack0 +0($stack)"); 1201 "$stack $stack0 +0($stack)",
1202 create_trace_probe);
2056 if (WARN_ON_ONCE(ret)) { 1203 if (WARN_ON_ONCE(ret)) {
2057 pr_warning("error on probing function entry.\n"); 1204 pr_warning("error on probing function entry.\n");
2058 warn++; 1205 warn++;
@@ -2066,8 +1213,8 @@ static __init int kprobe_trace_self_tests_init(void)
2066 enable_trace_probe(tp, TP_FLAG_TRACE); 1213 enable_trace_probe(tp, TP_FLAG_TRACE);
2067 } 1214 }
2068 1215
2069 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1216 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
2070 "$retval"); 1217 "$retval", create_trace_probe);
2071 if (WARN_ON_ONCE(ret)) { 1218 if (WARN_ON_ONCE(ret)) {
2072 pr_warning("error on probing function return.\n"); 1219 pr_warning("error on probing function return.\n");
2073 warn++; 1220 warn++;
@@ -2101,13 +1248,13 @@ static __init int kprobe_trace_self_tests_init(void)
2101 } else 1248 } else
2102 disable_trace_probe(tp, TP_FLAG_TRACE); 1249 disable_trace_probe(tp, TP_FLAG_TRACE);
2103 1250
2104 ret = command_trace_probe("-:testprobe"); 1251 ret = traceprobe_command("-:testprobe", create_trace_probe);
2105 if (WARN_ON_ONCE(ret)) { 1252 if (WARN_ON_ONCE(ret)) {
2106 pr_warning("error on deleting a probe.\n"); 1253 pr_warning("error on deleting a probe.\n");
2107 warn++; 1254 warn++;
2108 } 1255 }
2109 1256
2110 ret = command_trace_probe("-:testprobe2"); 1257 ret = traceprobe_command("-:testprobe2", create_trace_probe);
2111 if (WARN_ON_ONCE(ret)) { 1258 if (WARN_ON_ONCE(ret)) {
2112 pr_warning("error on deleting a probe.\n"); 1259 pr_warning("error on deleting a probe.\n");
2113 warn++; 1260 warn++;
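
With the parsing core moved into trace_probe.c (next file), trace_kprobe.c keeps only the kprobe-specific glue, and the self-tests above exercise the shared traceprobe_command() path. The same probe syntax works from user space through the kprobe_events file; a sketch assuming debugfs at /sys/kernel/debug, with do_sys_open used as an example target symbol:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Append one command line to kprobe_events, the same syntax the
 * in-kernel self-test feeds to traceprobe_command() above. */
static int kprobe_cmd(const char *cmd)
{
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);
	ssize_t n = -1;

	if (fd >= 0) {
		n = write(fd, cmd, strlen(cmd));
		close(fd);
	}
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* probe entry of do_sys_open, recording the first stack
	 * entry and a dereference of the stack address */
	if (kprobe_cmd("p:myprobe do_sys_open $stack0 +0($stack)\n"))
		perror("add probe");
	else if (kprobe_cmd("-:myprobe\n"))	/* delete it again */
		perror("del probe");
	return 0;
}
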
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
new file mode 100644
index 000000000000..daa9980153af
--- /dev/null
+++ b/kernel/trace/trace_probe.c
@@ -0,0 +1,839 @@
1/*
2 * Common code for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.c written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include "trace_probe.h"
26
27const char *reserved_field_names[] = {
28 "common_type",
29 "common_flags",
30 "common_preempt_count",
31 "common_pid",
32 "common_tgid",
33 FIELD_STRING_IP,
34 FIELD_STRING_RETIP,
35 FIELD_STRING_FUNC,
36};
37
38/* Printing function type */
39#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
40#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
41
42/* Printing in basic type function template */
43#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
44static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
45 const char *name, \
46 void *data, void *ent)\
47{ \
48 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
49} \
50static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
51
52DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
53DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
54DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
55DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
56DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
57DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
58DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
59DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
60
61static inline void *get_rloc_data(u32 *dl)
62{
63 return (u8 *)dl + get_rloc_offs(*dl);
64}
65
66/* For data_loc conversion */
67static inline void *get_loc_data(u32 *dl, void *ent)
68{
69 return (u8 *)ent + get_rloc_offs(*dl);
70}
71
72/* For defining macros, define string/string_size types */
73typedef u32 string;
74typedef u32 string_size;
75
76/* Print type function for string type */
77static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
78 const char *name,
79 void *data, void *ent)
80{
81 int len = *(u32 *)data >> 16;
82
83 if (!len)
84 return trace_seq_printf(s, " %s=(fault)", name);
85 else
86 return trace_seq_printf(s, " %s=\"%s\"", name,
87 (const char *)get_loc_data(data, ent));
88}
89
90static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
91
92#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
93/*
 94 * Define macro for basic types - we don't need to define s* types,
 95 * because we only need to care about the bit width at recording time.
96 */
97#define DEFINE_BASIC_FETCH_FUNCS(method) \
98DEFINE_FETCH_##method(u8) \
99DEFINE_FETCH_##method(u16) \
100DEFINE_FETCH_##method(u32) \
101DEFINE_FETCH_##method(u64)
102
103#define CHECK_FETCH_FUNCS(method, fn) \
104 (((FETCH_FUNC_NAME(method, u8) == fn) || \
105 (FETCH_FUNC_NAME(method, u16) == fn) || \
106 (FETCH_FUNC_NAME(method, u32) == fn) || \
107 (FETCH_FUNC_NAME(method, u64) == fn) || \
108 (FETCH_FUNC_NAME(method, string) == fn) || \
109 (FETCH_FUNC_NAME(method, string_size) == fn)) \
110 && (fn != NULL))
111
112/* Data fetch function templates */
113#define DEFINE_FETCH_reg(type) \
114static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
115 void *offset, void *dest) \
116{ \
117 *(type *)dest = (type)regs_get_register(regs, \
118 (unsigned int)((unsigned long)offset)); \
119}
120DEFINE_BASIC_FETCH_FUNCS(reg)
121/* No string on the register */
122#define fetch_reg_string NULL
123#define fetch_reg_string_size NULL
124
125#define DEFINE_FETCH_stack(type) \
126static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
127 void *offset, void *dest) \
128{ \
129 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
130 (unsigned int)((unsigned long)offset)); \
131}
132DEFINE_BASIC_FETCH_FUNCS(stack)
133/* No string on the stack entry */
134#define fetch_stack_string NULL
135#define fetch_stack_string_size NULL
136
137#define DEFINE_FETCH_retval(type) \
138static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
139 void *dummy, void *dest) \
140{ \
141 *(type *)dest = (type)regs_return_value(regs); \
142}
143DEFINE_BASIC_FETCH_FUNCS(retval)
144/* No string on the retval */
145#define fetch_retval_string NULL
146#define fetch_retval_string_size NULL
147
148#define DEFINE_FETCH_memory(type) \
149static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
150 void *addr, void *dest) \
151{ \
152 type retval; \
153 if (probe_kernel_address(addr, retval)) \
154 *(type *)dest = 0; \
155 else \
156 *(type *)dest = retval; \
157}
158DEFINE_BASIC_FETCH_FUNCS(memory)
159/*
160 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
161 * length and relative data location.
162 */
163static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
164 void *addr, void *dest)
165{
166 long ret;
167 int maxlen = get_rloc_len(*(u32 *)dest);
168 u8 *dst = get_rloc_data(dest);
169 u8 *src = addr;
170 mm_segment_t old_fs = get_fs();
171
172 if (!maxlen)
173 return;
174
175 /*
 176 * Try to get the string again, since the string can change while
 177 * we are probing.
178 */
179 set_fs(KERNEL_DS);
180 pagefault_disable();
181
182 do
183 ret = __copy_from_user_inatomic(dst++, src++, 1);
184 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
185
186 dst[-1] = '\0';
187 pagefault_enable();
188 set_fs(old_fs);
189
190 if (ret < 0) { /* Failed to fetch string */
191 ((u8 *)get_rloc_data(dest))[0] = '\0';
192 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
193 } else {
194 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
195 get_rloc_offs(*(u32 *)dest));
196 }
197}
198
 199/* Return the length of the string -- including the null terminator byte */
200static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
201 void *addr, void *dest)
202{
203 mm_segment_t old_fs;
204 int ret, len = 0;
205 u8 c;
206
207 old_fs = get_fs();
208 set_fs(KERNEL_DS);
209 pagefault_disable();
210
211 do {
212 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
213 len++;
214 } while (c && ret == 0 && len < MAX_STRING_SIZE);
215
216 pagefault_enable();
217 set_fs(old_fs);
218
219 if (ret < 0) /* Failed to check the length */
220 *(u32 *)dest = 0;
221 else
222 *(u32 *)dest = len;
223}
224
225/* Memory fetching by symbol */
226struct symbol_cache {
227 char *symbol;
228 long offset;
229 unsigned long addr;
230};
231
232static unsigned long update_symbol_cache(struct symbol_cache *sc)
233{
234 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
235
236 if (sc->addr)
237 sc->addr += sc->offset;
238
239 return sc->addr;
240}
241
242static void free_symbol_cache(struct symbol_cache *sc)
243{
244 kfree(sc->symbol);
245 kfree(sc);
246}
247
248static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
249{
250 struct symbol_cache *sc;
251
252 if (!sym || strlen(sym) == 0)
253 return NULL;
254
255 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
256 if (!sc)
257 return NULL;
258
259 sc->symbol = kstrdup(sym, GFP_KERNEL);
260 if (!sc->symbol) {
261 kfree(sc);
262 return NULL;
263 }
264 sc->offset = offset;
265 update_symbol_cache(sc);
266
267 return sc;
268}
269
270#define DEFINE_FETCH_symbol(type) \
271static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
272 void *data, void *dest) \
273{ \
274 struct symbol_cache *sc = data; \
275 if (sc->addr) \
276 fetch_memory_##type(regs, (void *)sc->addr, dest); \
277 else \
278 *(type *)dest = 0; \
279}
280DEFINE_BASIC_FETCH_FUNCS(symbol)
281DEFINE_FETCH_symbol(string)
282DEFINE_FETCH_symbol(string_size)
283
284/* Dereference memory access function */
285struct deref_fetch_param {
286 struct fetch_param orig;
287 long offset;
288};
289
290#define DEFINE_FETCH_deref(type) \
291static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
292 void *data, void *dest) \
293{ \
294 struct deref_fetch_param *dprm = data; \
295 unsigned long addr; \
296 call_fetch(&dprm->orig, regs, &addr); \
297 if (addr) { \
298 addr += dprm->offset; \
299 fetch_memory_##type(regs, (void *)addr, dest); \
300 } else \
301 *(type *)dest = 0; \
302}
303DEFINE_BASIC_FETCH_FUNCS(deref)
304DEFINE_FETCH_deref(string)
305DEFINE_FETCH_deref(string_size)
306
307static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
308{
309 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
310 update_deref_fetch_param(data->orig.data);
311 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
312 update_symbol_cache(data->orig.data);
313}
314
315static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
316{
317 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
318 free_deref_fetch_param(data->orig.data);
319 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
320 free_symbol_cache(data->orig.data);
321 kfree(data);
322}
323
324/* Bitfield fetch function */
325struct bitfield_fetch_param {
326 struct fetch_param orig;
327 unsigned char hi_shift;
328 unsigned char low_shift;
329};
330
331#define DEFINE_FETCH_bitfield(type) \
332static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
333 void *data, void *dest) \
334{ \
335 struct bitfield_fetch_param *bprm = data; \
336 type buf = 0; \
337 call_fetch(&bprm->orig, regs, &buf); \
338 if (buf) { \
339 buf <<= bprm->hi_shift; \
340 buf >>= bprm->low_shift; \
341 } \
342 *(type *)dest = buf; \
343}
344
345DEFINE_BASIC_FETCH_FUNCS(bitfield)
346#define fetch_bitfield_string NULL
347#define fetch_bitfield_string_size NULL
348
349static __kprobes void
350update_bitfield_fetch_param(struct bitfield_fetch_param *data)
351{
352 /*
353 * Don't check the bitfield itself, because this must be the
354 * last fetch function.
355 */
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 update_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 update_symbol_cache(data->orig.data);
360}
361
362static __kprobes void
363free_bitfield_fetch_param(struct bitfield_fetch_param *data)
364{
365 /*
366 * Don't check the bitfield itself, because this must be the
367 * last fetch function.
368 */
369 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
370 free_deref_fetch_param(data->orig.data);
371 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
372 free_symbol_cache(data->orig.data);
373
374 kfree(data);
375}
376
377/* Default (unsigned long) fetch type */
378#define __DEFAULT_FETCH_TYPE(t) u##t
379#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
380#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
381#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
382
383#define ASSIGN_FETCH_FUNC(method, type) \
384 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
385
386#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
387 {.name = _name, \
388 .size = _size, \
389 .is_signed = sign, \
390 .print = PRINT_TYPE_FUNC_NAME(ptype), \
391 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
392 .fmttype = _fmttype, \
393 .fetch = { \
394ASSIGN_FETCH_FUNC(reg, ftype), \
395ASSIGN_FETCH_FUNC(stack, ftype), \
396ASSIGN_FETCH_FUNC(retval, ftype), \
397ASSIGN_FETCH_FUNC(memory, ftype), \
398ASSIGN_FETCH_FUNC(symbol, ftype), \
399ASSIGN_FETCH_FUNC(deref, ftype), \
400ASSIGN_FETCH_FUNC(bitfield, ftype), \
401 } \
402 }
403
404#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
405 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
406
407#define FETCH_TYPE_STRING 0
408#define FETCH_TYPE_STRSIZE 1
409
410/* Fetch type information table */
411static const struct fetch_type fetch_type_table[] = {
412 /* Special types */
413 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
414 sizeof(u32), 1, "__data_loc char[]"),
415 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
416 string_size, sizeof(u32), 0, "u32"),
417 /* Basic types */
418 ASSIGN_FETCH_TYPE(u8, u8, 0),
419 ASSIGN_FETCH_TYPE(u16, u16, 0),
420 ASSIGN_FETCH_TYPE(u32, u32, 0),
421 ASSIGN_FETCH_TYPE(u64, u64, 0),
422 ASSIGN_FETCH_TYPE(s8, u8, 1),
423 ASSIGN_FETCH_TYPE(s16, u16, 1),
424 ASSIGN_FETCH_TYPE(s32, u32, 1),
425 ASSIGN_FETCH_TYPE(s64, u64, 1),
426};
427
428static const struct fetch_type *find_fetch_type(const char *type)
429{
430 int i;
431
432 if (!type)
433 type = DEFAULT_FETCH_TYPE_STR;
434
435 /* Special case: bitfield */
436 if (*type == 'b') {
437 unsigned long bs;
438
439 type = strchr(type, '/');
440 if (!type)
441 goto fail;
442
443 type++;
444 if (strict_strtoul(type, 0, &bs))
445 goto fail;
446
447 switch (bs) {
448 case 8:
449 return find_fetch_type("u8");
450 case 16:
451 return find_fetch_type("u16");
452 case 32:
453 return find_fetch_type("u32");
454 case 64:
455 return find_fetch_type("u64");
456 default:
457 goto fail;
458 }
459 }
460
461 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
462 if (strcmp(type, fetch_type_table[i].name) == 0)
463 return &fetch_type_table[i];
464
465fail:
466 return NULL;
467}
468
 469/* Special function: only accepts unsigned long */
470static __kprobes void fetch_stack_address(struct pt_regs *regs,
471 void *dummy, void *dest)
472{
473 *(unsigned long *)dest = kernel_stack_pointer(regs);
474}
475
476static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
477 fetch_func_t orig_fn)
478{
479 int i;
480
481 if (type != &fetch_type_table[FETCH_TYPE_STRING])
482 return NULL; /* Only string type needs size function */
483
484 for (i = 0; i < FETCH_MTD_END; i++)
485 if (type->fetch[i] == orig_fn)
486 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
487
488 WARN_ON(1); /* This should not happen */
489
490 return NULL;
491}
492
493/* Split symbol and offset. */
494int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
495{
496 char *tmp;
497 int ret;
498
499 if (!offset)
500 return -EINVAL;
501
502 tmp = strchr(symbol, '+');
503 if (tmp) {
504 /* skip sign because strict_strtol doesn't accept '+' */
505 ret = strict_strtoul(tmp + 1, 0, offset);
506 if (ret)
507 return ret;
508
509 *tmp = '\0';
510 } else
511 *offset = 0;
512
513 return 0;
514}
515
516#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
517
518static int parse_probe_vars(char *arg, const struct fetch_type *t,
519 struct fetch_param *f, bool is_return)
520{
521 int ret = 0;
522 unsigned long param;
523
524 if (strcmp(arg, "retval") == 0) {
525 if (is_return)
526 f->fn = t->fetch[FETCH_MTD_retval];
527 else
528 ret = -EINVAL;
529 } else if (strncmp(arg, "stack", 5) == 0) {
530 if (arg[5] == '\0') {
531 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
532 f->fn = fetch_stack_address;
533 else
534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) {
536 ret = strict_strtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL;
539 else {
540 f->fn = t->fetch[FETCH_MTD_stack];
541 f->data = (void *)param;
542 }
543 } else
544 ret = -EINVAL;
545 } else
546 ret = -EINVAL;
547
548 return ret;
549}
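/*
 * Illustrative note (annotation, not part of the patch): the "$"
 * variables accepted above are
 *
 *   $retval  - the probed function's return value; rejected with
 *              -EINVAL unless this is a return probe
 *   $stack   - the stack address itself, default fetch type only
 *   $stackN  - the N-th entry on the kernel stack, with N limited
 *              to PARAM_MAX_STACK
 */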
550
551/* Recursive argument parser */
552static int parse_probe_arg(char *arg, const struct fetch_type *t,
553 struct fetch_param *f, bool is_return, bool is_kprobe)
554{
555 unsigned long param;
556 long offset;
557 char *tmp;
558 int ret;
559
560 ret = 0;
561
 562 /* For now, uprobe_events supports only register arguments */
563 if (!is_kprobe && arg[0] != '%')
564 return -EINVAL;
565
566 switch (arg[0]) {
567 case '$':
568 ret = parse_probe_vars(arg + 1, t, f, is_return);
569 break;
570
571 case '%': /* named register */
572 ret = regs_query_register_offset(arg + 1);
573 if (ret >= 0) {
574 f->fn = t->fetch[FETCH_MTD_reg];
575 f->data = (void *)(unsigned long)ret;
576 ret = 0;
577 }
578 break;
579
580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) {
582 ret = strict_strtoul(arg + 1, 0, &param);
583 if (ret)
584 break;
585
586 f->fn = t->fetch[FETCH_MTD_memory];
587 f->data = (void *)param;
588 } else {
589 ret = traceprobe_split_symbol_offset(arg + 1, &offset);
590 if (ret)
591 break;
592
593 f->data = alloc_symbol_cache(arg + 1, offset);
594 if (f->data)
595 f->fn = t->fetch[FETCH_MTD_symbol];
596 }
597 break;
598
599 case '+': /* deref memory */
600 arg++; /* Skip '+', because strict_strtol() rejects it. */
601 case '-':
602 tmp = strchr(arg, '(');
603 if (!tmp)
604 break;
605
606 *tmp = '\0';
607 ret = strict_strtol(arg, 0, &offset);
608
609 if (ret)
610 break;
611
612 arg = tmp + 1;
613 tmp = strrchr(arg, ')');
614
615 if (tmp) {
616 struct deref_fetch_param *dprm;
617 const struct fetch_type *t2;
618
619 t2 = find_fetch_type(NULL);
620 *tmp = '\0';
621 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
622
623 if (!dprm)
624 return -ENOMEM;
625
626 dprm->offset = offset;
627 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
628 is_kprobe);
629 if (ret)
630 kfree(dprm);
631 else {
632 f->fn = t->fetch[FETCH_MTD_deref];
633 f->data = (void *)dprm;
634 }
635 }
636 break;
637 }
638	if (!ret && !f->fn) { /* Parsed, but no fetch method found */
639 pr_info("%s type has no corresponding fetch method.\n", t->name);
640 ret = -EINVAL;
641 }
642
643 return ret;
644}
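The '+'/'-' case is the recursive one: an argument such as "+8(%sp)" is split into an offset and a base argument, and the base is fed back into parse_probe_arg(). A userspace sketch of just the string surgery, assuming the same "offset(base)" convention:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char arg[] = "+8(%sp)";
	char *p = arg + 1;			/* skip '+', as the code above does */
	char *open = strchr(p, '(');
	char *close = strrchr(p, ')');
	long off;

	if (!open || !close)
		return 1;
	*open = *close = '\0';
	off = strtol(p, NULL, 0);
	/* The kernel then parses "%sp" recursively as the base fetch. */
	printf("offset=%ld base=%s\n", off, open + 1);	/* offset=8 base=%sp */
	return 0;
}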
645
646#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
647
648/* Bitfield type needs to be parsed into a fetch function */
649static int __parse_bitfield_probe_arg(const char *bf,
650 const struct fetch_type *t,
651 struct fetch_param *f)
652{
653 struct bitfield_fetch_param *bprm;
654 unsigned long bw, bo;
655 char *tail;
656
657 if (*bf != 'b')
658 return 0;
659
660 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
661 if (!bprm)
662 return -ENOMEM;
663
664 bprm->orig = *f;
665 f->fn = t->fetch[FETCH_MTD_bitfield];
666 f->data = (void *)bprm;
667	bw = simple_strtoul(bf + 1, &tail, 0);	/* simple_strtoul() stops at the '@' */
668
669 if (bw == 0 || *tail != '@')
670 return -EINVAL;
671
672 bf = tail + 1;
673 bo = simple_strtoul(bf, &tail, 0);
674
675 if (tail == bf || *tail != '/')
676 return -EINVAL;
677
678 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
679 bprm->low_shift = bprm->hi_shift + bo;
680
681 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
682}
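As a worked example of the shift math: for a spec like "b4@2/32" (a 4-bit field at bit offset 2, counted from the LSB, in a 32-bit container), hi_shift = 32 - (4 + 2) = 26 and low_shift = 26 + 2 = 28, so a left shift followed by a right shift strips the bits on both sides of the field. A minimal userspace check under those assumptions:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t container_bits = 32, bw = 4, bo = 2;
	uint32_t hi_shift = container_bits - (bw + bo);	/* 26 */
	uint32_t low_shift = hi_shift + bo;		/* 28 */
	uint32_t buf = 0x3c;	/* bits 2..5 set, i.e. field value 0xf */

	buf <<= hi_shift;	/* drop the bits above the field */
	buf >>= low_shift;	/* drop the bits below it and realign */
	assert(buf == 0xf);
	return 0;
}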
683
684/* Length-checking wrapper that parses one probe argument */
685int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
686 struct probe_arg *parg, bool is_return, bool is_kprobe)
687{
688 const char *t;
689 int ret;
690
691 if (strlen(arg) > MAX_ARGSTR_LEN) {
692		pr_info("Argument is too long: %s\n", arg);
693 return -ENOSPC;
694 }
695 parg->comm = kstrdup(arg, GFP_KERNEL);
696 if (!parg->comm) {
697 pr_info("Failed to allocate memory for command '%s'.\n", arg);
698 return -ENOMEM;
699 }
700 t = strchr(parg->comm, ':');
701 if (t) {
702 arg[t - parg->comm] = '\0';
703 t++;
704 }
705 parg->type = find_fetch_type(t);
706 if (!parg->type) {
707 pr_info("Unsupported type: %s\n", t);
708 return -EINVAL;
709 }
710 parg->offset = *size;
711 *size += parg->type->size;
712 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
713
714 if (ret >= 0 && t != NULL)
715 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
716
717 if (ret >= 0) {
718 parg->fetch_size.fn = get_fetch_size_function(parg->type,
719 parg->fetch.fn);
720 parg->fetch_size.data = parg->fetch.data;
721 }
722
723 return ret;
724}
725
726/* Return 1 if name is reserved or already used by another argument */
727int traceprobe_conflict_field_name(const char *name,
728 struct probe_arg *args, int narg)
729{
730 int i;
731
732 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
733 if (strcmp(reserved_field_names[i], name) == 0)
734 return 1;
735
736 for (i = 0; i < narg; i++)
737 if (strcmp(args[i].name, name) == 0)
738 return 1;
739
740 return 0;
741}
742
743void traceprobe_update_arg(struct probe_arg *arg)
744{
745 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
746 update_bitfield_fetch_param(arg->fetch.data);
747 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
748 update_deref_fetch_param(arg->fetch.data);
749 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
750 update_symbol_cache(arg->fetch.data);
751}
752
753void traceprobe_free_probe_arg(struct probe_arg *arg)
754{
755 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
756 free_bitfield_fetch_param(arg->fetch.data);
757 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
758 free_deref_fetch_param(arg->fetch.data);
759 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
760 free_symbol_cache(arg->fetch.data);
761
762 kfree(arg->name);
763 kfree(arg->comm);
764}
765
766int traceprobe_command(const char *buf, int (*createfn)(int, char **))
767{
768 char **argv;
769 int argc, ret;
770
771 argc = 0;
772 ret = 0;
773 argv = argv_split(GFP_KERNEL, buf, &argc);
774 if (!argv)
775 return -ENOMEM;
776
777 if (argc)
778 ret = createfn(argc, argv);
779
780 argv_free(argv);
781
782 return ret;
783}
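traceprobe_command() simply tokenizes one command line and hands argc/argv to the caller's constructor. A rough userspace analogue, with strtok() standing in for argv_split() and a hypothetical probe spec:

#include <stdio.h>
#include <string.h>

static int create_stub(int argc, char **argv)
{
	int i;

	for (i = 0; i < argc; i++)
		printf("argv[%d]=%s\n", i, argv[i]);
	return 0;
}

int main(void)
{
	char buf[] = "p:myevent /bin/true:0x400";	/* hypothetical spec */
	char *argv[16], *tok;
	int argc = 0;

	for (tok = strtok(buf, " \t"); tok && argc < 16; tok = strtok(NULL, " \t"))
		argv[argc++] = tok;

	return argc ? create_stub(argc, argv) : 0;
}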
784
785#define WRITE_BUFSIZE 4096
786
787ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
788 size_t count, loff_t *ppos,
789 int (*createfn)(int, char **))
790{
791 char *kbuf, *tmp;
792 int ret = 0;
793 size_t done = 0;
794 size_t size;
795
796 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
797 if (!kbuf)
798 return -ENOMEM;
799
800 while (done < count) {
801 size = count - done;
802
803 if (size >= WRITE_BUFSIZE)
804 size = WRITE_BUFSIZE - 1;
805
806 if (copy_from_user(kbuf, buffer + done, size)) {
807 ret = -EFAULT;
808 goto out;
809 }
810 kbuf[size] = '\0';
811 tmp = strchr(kbuf, '\n');
812
813 if (tmp) {
814 *tmp = '\0';
815 size = tmp - kbuf + 1;
816 } else if (done + size < count) {
817			pr_warning("Line is too long: "
818				   "should be shorter than %d.", WRITE_BUFSIZE);
819 ret = -EINVAL;
820 goto out;
821 }
822 done += size;
823 /* Remove comments */
824 tmp = strchr(kbuf, '#');
825
826 if (tmp)
827 *tmp = '\0';
828
829 ret = traceprobe_command(kbuf, createfn);
830 if (ret)
831 goto out;
832 }
833 ret = done;
834
835out:
836 kfree(kbuf);
837
838 return ret;
839}
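From userspace, this handler is driven by plain write()s; newline-separated commands and '#' comments are both accepted, as the loop above shows. A minimal sketch, assuming debugfs is mounted at the usual /sys/kernel/debug and using a made-up probe offset:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *cmd =
		"# comment lines are stripped by the write handler\n"
		"p:bash_probe /bin/bash:0x4245c0\n";	/* hypothetical offset */
	int fd = open("/sys/kernel/debug/tracing/uprobe_events", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);
	return 0;
}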
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
new file mode 100644
index 000000000000..933708677814
--- /dev/null
+++ b/kernel/trace/trace_probe.h
@@ -0,0 +1,161 @@
1/*
2 * Common header file for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.h written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include <linux/seq_file.h>
26#include <linux/slab.h>
27#include <linux/smp.h>
28#include <linux/debugfs.h>
29#include <linux/types.h>
30#include <linux/string.h>
31#include <linux/ctype.h>
32#include <linux/ptrace.h>
33#include <linux/perf_event.h>
34#include <linux/kprobes.h>
35#include <linux/stringify.h>
36#include <linux/limits.h>
37#include <linux/uaccess.h>
38#include <asm/bitsperlong.h>
39
40#include "trace.h"
41#include "trace_output.h"
42
43#define MAX_TRACE_ARGS 128
44#define MAX_ARGSTR_LEN 63
45#define MAX_EVENT_NAME_LEN 64
46#define MAX_STRING_SIZE PATH_MAX
47
48/* Reserved field names */
49#define FIELD_STRING_IP "__probe_ip"
50#define FIELD_STRING_RETIP "__probe_ret_ip"
51#define FIELD_STRING_FUNC "__probe_func"
52
53#undef DEFINE_FIELD
54#define DEFINE_FIELD(type, item, name, is_signed) \
55 do { \
56 ret = trace_define_field(event_call, #type, name, \
57 offsetof(typeof(field), item), \
58 sizeof(field.item), is_signed, \
59 FILTER_OTHER); \
60 if (ret) \
61 return ret; \
62 } while (0)
63
64
65/* Flags for trace_probe */
66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70
71
72/* data_rloc: data relative location, compatible with u32 */
73#define make_data_rloc(len, roffs) \
74 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
75#define get_rloc_len(dl) ((u32)(dl) >> 16)
76#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
77
78/*
79 * Convert data_rloc to data_loc:
80 * data_rloc stores the offset from data_rloc itself, but data_loc
81 * stores the offset from event entry.
82 */
83#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
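The packing is a plain 16/16 split of a u32: length in the high half, offset in the low half. A round-trip check of the same macros in userspace:

#include <assert.h>
#include <stdint.h>

#define make_data_rloc(len, roffs)	\
	(((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
#define get_rloc_len(dl)	((uint32_t)(dl) >> 16)
#define get_rloc_offs(dl)	((uint32_t)(dl) & 0xffff)

int main(void)
{
	uint32_t dl = make_data_rloc(24, 8);	/* 24-byte string, offset 8 */

	assert(get_rloc_len(dl) == 24);
	assert(get_rloc_offs(dl) == 8);
	return 0;
}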
84
85/* Data fetch function type */
86typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
87/* Printing function type */
88typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
89
90/* Fetch types */
91enum {
92 FETCH_MTD_reg = 0,
93 FETCH_MTD_stack,
94 FETCH_MTD_retval,
95 FETCH_MTD_memory,
96 FETCH_MTD_symbol,
97 FETCH_MTD_deref,
98 FETCH_MTD_bitfield,
99 FETCH_MTD_END,
100};
101
102/* Fetch type information table */
103struct fetch_type {
104 const char *name; /* Name of type */
105 size_t size; /* Byte size of type */
106 int is_signed; /* Signed flag */
107 print_type_func_t print; /* Print functions */
108	const char *fmt;	/* Format string */
109 const char *fmttype; /* Name in format file */
110 /* Fetch functions */
111 fetch_func_t fetch[FETCH_MTD_END];
112};
113
114struct fetch_param {
115 fetch_func_t fn;
116 void *data;
117};
118
119struct probe_arg {
120 struct fetch_param fetch;
121 struct fetch_param fetch_size;
122 unsigned int offset; /* Offset from argument entry */
123 const char *name; /* Name of this argument */
124 const char *comm; /* Command of this argument */
125 const struct fetch_type *type; /* Type of this argument */
126};
127
128static inline __kprobes void call_fetch(struct fetch_param *fprm,
129 struct pt_regs *regs, void *dest)
130{
131 return fprm->fn(regs, fprm->data, dest);
132}
133
134/* Check that the name is usable for event/group/field names */
135static inline int is_good_name(const char *name)
136{
137 if (!isalpha(*name) && *name != '_')
138 return 0;
139 while (*++name != '\0') {
140 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
141 return 0;
142 }
143 return 1;
144}
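In other words, a good name is a C identifier. A few userspace spot checks of the same predicate (with the ctype calls cast for strict correctness):

#include <assert.h>
#include <ctype.h>

static int is_good_name(const char *name)
{
	if (!isalpha((unsigned char)*name) && *name != '_')
		return 0;
	while (*++name != '\0') {
		if (!isalnum((unsigned char)*name) && *name != '_')
			return 0;
	}
	return 1;
}

int main(void)
{
	assert(is_good_name("my_event1"));
	assert(!is_good_name("1st_event"));	/* must not start with a digit */
	assert(!is_good_name("bad-name"));	/* '-' is rejected */
	return 0;
}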
145
146extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
147 struct probe_arg *parg, bool is_return, bool is_kprobe);
148
149extern int traceprobe_conflict_field_name(const char *name,
150 struct probe_arg *args, int narg);
151
152extern void traceprobe_update_arg(struct probe_arg *arg);
153extern void traceprobe_free_probe_arg(struct probe_arg *arg);
154
155extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
156
157extern ssize_t traceprobe_probes_write(struct file *file,
158 const char __user *buffer, size_t count, loff_t *ppos,
159 int (*createfn)(int, char**));
160
161extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
new file mode 100644
index 000000000000..2b36ac68549e
--- /dev/null
+++ b/kernel/trace/trace_uprobe.c
@@ -0,0 +1,788 @@
1/*
2 * uprobes-based tracing events
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * Copyright (C) IBM Corporation, 2010-2012
18 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
19 */
20
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/uprobes.h>
24#include <linux/namei.h>
25
26#include "trace_probe.h"
27
28#define UPROBE_EVENT_SYSTEM "uprobes"
29
30/*
31 * uprobe event core functions
32 */
33struct trace_uprobe;
34struct uprobe_trace_consumer {
35 struct uprobe_consumer cons;
36 struct trace_uprobe *tu;
37};
38
39struct trace_uprobe {
40 struct list_head list;
41 struct ftrace_event_class class;
42 struct ftrace_event_call call;
43 struct uprobe_trace_consumer *consumer;
44 struct inode *inode;
45 char *filename;
46 unsigned long offset;
47 unsigned long nhit;
48 unsigned int flags; /* For TP_FLAG_* */
49 ssize_t size; /* trace entry size */
50 unsigned int nr_args;
51 struct probe_arg args[];
52};
53
54#define SIZEOF_TRACE_UPROBE(n) \
55 (offsetof(struct trace_uprobe, args) + \
56 (sizeof(struct probe_arg) * (n)))
57
58static int register_uprobe_event(struct trace_uprobe *tu);
59static void unregister_uprobe_event(struct trace_uprobe *tu);
60
61static DEFINE_MUTEX(uprobe_lock);
62static LIST_HEAD(uprobe_list);
63
64static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
65
66/*
67 * Allocate a new trace_uprobe and initialize it.
68 */
69static struct trace_uprobe *
70alloc_trace_uprobe(const char *group, const char *event, int nargs)
71{
72 struct trace_uprobe *tu;
73
74 if (!event || !is_good_name(event))
75 return ERR_PTR(-EINVAL);
76
77 if (!group || !is_good_name(group))
78 return ERR_PTR(-EINVAL);
79
80 tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
81 if (!tu)
82 return ERR_PTR(-ENOMEM);
83
84 tu->call.class = &tu->class;
85 tu->call.name = kstrdup(event, GFP_KERNEL);
86 if (!tu->call.name)
87 goto error;
88
89 tu->class.system = kstrdup(group, GFP_KERNEL);
90 if (!tu->class.system)
91 goto error;
92
93 INIT_LIST_HEAD(&tu->list);
94 return tu;
95
96error:
97 kfree(tu->call.name);
98 kfree(tu);
99
100 return ERR_PTR(-ENOMEM);
101}
102
103static void free_trace_uprobe(struct trace_uprobe *tu)
104{
105 int i;
106
107 for (i = 0; i < tu->nr_args; i++)
108 traceprobe_free_probe_arg(&tu->args[i]);
109
110 iput(tu->inode);
111 kfree(tu->call.class->system);
112 kfree(tu->call.name);
113 kfree(tu->filename);
114 kfree(tu);
115}
116
117static struct trace_uprobe *find_probe_event(const char *event, const char *group)
118{
119 struct trace_uprobe *tu;
120
121 list_for_each_entry(tu, &uprobe_list, list)
122 if (strcmp(tu->call.name, event) == 0 &&
123 strcmp(tu->call.class->system, group) == 0)
124 return tu;
125
126 return NULL;
127}
128
129/* Unregister a trace_uprobe and probe_event; caller must hold uprobe_lock */
130static void unregister_trace_uprobe(struct trace_uprobe *tu)
131{
132 list_del(&tu->list);
133 unregister_uprobe_event(tu);
134 free_trace_uprobe(tu);
135}
136
137/* Register a trace_uprobe and probe_event */
138static int register_trace_uprobe(struct trace_uprobe *tu)
139{
140 struct trace_uprobe *old_tp;
141 int ret;
142
143 mutex_lock(&uprobe_lock);
144
145 /* register as an event */
146 old_tp = find_probe_event(tu->call.name, tu->call.class->system);
147 if (old_tp)
148 /* delete old event */
149 unregister_trace_uprobe(old_tp);
150
151 ret = register_uprobe_event(tu);
152 if (ret) {
153		pr_warning("Failed to register probe event (%d)\n", ret);
154 goto end;
155 }
156
157 list_add_tail(&tu->list, &uprobe_list);
158
159end:
160 mutex_unlock(&uprobe_lock);
161
162 return ret;
163}
164
165/*
166 * Argument syntax:
167 * - Add uprobe: p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
168 *
169 * - Remove uprobe: -:[GRP/]EVENT
170 */
171static int create_trace_uprobe(int argc, char **argv)
172{
173 struct trace_uprobe *tu;
174 struct inode *inode;
175 char *arg, *event, *group, *filename;
176 char buf[MAX_EVENT_NAME_LEN];
177 struct path path;
178 unsigned long offset;
179 bool is_delete;
180 int i, ret;
181
182 inode = NULL;
183 ret = 0;
184 is_delete = false;
185 event = NULL;
186 group = NULL;
187
188 /* argc must be >= 1 */
189 if (argv[0][0] == '-')
190 is_delete = true;
191 else if (argv[0][0] != 'p') {
192		pr_info("Probe definition must start with 'p' or '-'.\n");
193 return -EINVAL;
194 }
195
196 if (argv[0][1] == ':') {
197 event = &argv[0][2];
198 arg = strchr(event, '/');
199
200 if (arg) {
201 group = event;
202 event = arg + 1;
203 event[-1] = '\0';
204
205 if (strlen(group) == 0) {
206 pr_info("Group name is not specified\n");
207 return -EINVAL;
208 }
209 }
210 if (strlen(event) == 0) {
211 pr_info("Event name is not specified\n");
212 return -EINVAL;
213 }
214 }
215 if (!group)
216 group = UPROBE_EVENT_SYSTEM;
217
218 if (is_delete) {
219 if (!event) {
220 pr_info("Delete command needs an event name.\n");
221 return -EINVAL;
222 }
223 mutex_lock(&uprobe_lock);
224 tu = find_probe_event(event, group);
225
226 if (!tu) {
227 mutex_unlock(&uprobe_lock);
228 pr_info("Event %s/%s doesn't exist.\n", group, event);
229 return -ENOENT;
230 }
231 /* delete an event */
232 unregister_trace_uprobe(tu);
233 mutex_unlock(&uprobe_lock);
234 return 0;
235 }
236
237 if (argc < 2) {
238 pr_info("Probe point is not specified.\n");
239 return -EINVAL;
240 }
241 if (isdigit(argv[1][0])) {
242		pr_info("Probe point must have a filename.\n");
243 return -EINVAL;
244 }
245 arg = strchr(argv[1], ':');
246 if (!arg)
247 goto fail_address_parse;
248
249 *arg++ = '\0';
250 filename = argv[1];
251 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
252 if (ret)
253 goto fail_address_parse;
254
255 ret = strict_strtoul(arg, 0, &offset);
256 if (ret)
257 goto fail_address_parse;
258
259 inode = igrab(path.dentry->d_inode);
260
261 argc -= 2;
262 argv += 2;
263
264 /* setup a probe */
265 if (!event) {
266 char *tail = strrchr(filename, '/');
267 char *ptr;
268
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL);
270 if (!ptr) {
271 ret = -ENOMEM;
272 goto fail_address_parse;
273 }
274
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_");
277 if (ptr)
278 *ptr = '\0';
279
280 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
281 event = buf;
282 kfree(tail);
283 }
284
285 tu = alloc_trace_uprobe(group, event, argc);
286 if (IS_ERR(tu)) {
287		pr_info("Failed to allocate trace_uprobe (%d)\n", (int)PTR_ERR(tu));
288 ret = PTR_ERR(tu);
289 goto fail_address_parse;
290 }
291 tu->offset = offset;
292 tu->inode = inode;
293 tu->filename = kstrdup(filename, GFP_KERNEL);
294
295 if (!tu->filename) {
296 pr_info("Failed to allocate filename.\n");
297 ret = -ENOMEM;
298 goto error;
299 }
300
301 /* parse arguments */
302 ret = 0;
303 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
304 /* Increment count for freeing args in error case */
305 tu->nr_args++;
306
307 /* Parse argument name */
308 arg = strchr(argv[i], '=');
309 if (arg) {
310 *arg++ = '\0';
311 tu->args[i].name = kstrdup(argv[i], GFP_KERNEL);
312 } else {
313 arg = argv[i];
314 /* If argument name is omitted, set "argN" */
315 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
316 tu->args[i].name = kstrdup(buf, GFP_KERNEL);
317 }
318
319 if (!tu->args[i].name) {
320 pr_info("Failed to allocate argument[%d] name.\n", i);
321 ret = -ENOMEM;
322 goto error;
323 }
324
325 if (!is_good_name(tu->args[i].name)) {
326 pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name);
327 ret = -EINVAL;
328 goto error;
329 }
330
331 if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) {
332 pr_info("Argument[%d] name '%s' conflicts with "
333 "another field.\n", i, argv[i]);
334 ret = -EINVAL;
335 goto error;
336 }
337
338 /* Parse fetch argument */
339 ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false);
340 if (ret) {
341 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
342 goto error;
343 }
344 }
345
346 ret = register_trace_uprobe(tu);
347 if (ret)
348 goto error;
349 return 0;
350
351error:
352 free_trace_uprobe(tu);
353 return ret;
354
355fail_address_parse:
356 if (inode)
357 iput(inode);
358
359 pr_info("Failed to parse address.\n");
360
361 return ret;
362}
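When no event name is given, the code above derives one from the file's basename, cut at the first '.', '-' or '_', plus the offset. A userspace sketch of that naming rule with a hypothetical library path:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[64], base[] = "libc-2.15.so";	/* basename already taken */
	char *cut = strpbrk(base, ".-_");	/* cut at '.', '-' or '_' */

	if (cut)
		*cut = '\0';
	snprintf(name, sizeof(name), "%c_%s_0x%lx", 'p', base, 0x4245c0UL);
	printf("%s\n", name);	/* p_libc_0x4245c0 */
	return 0;
}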
363
364static void cleanup_all_probes(void)
365{
366 struct trace_uprobe *tu;
367
368 mutex_lock(&uprobe_lock);
369 while (!list_empty(&uprobe_list)) {
370 tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
371 unregister_trace_uprobe(tu);
372 }
373 mutex_unlock(&uprobe_lock);
374}
375
376/* Probes listing interfaces */
377static void *probes_seq_start(struct seq_file *m, loff_t *pos)
378{
379 mutex_lock(&uprobe_lock);
380 return seq_list_start(&uprobe_list, *pos);
381}
382
383static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
384{
385 return seq_list_next(v, &uprobe_list, pos);
386}
387
388static void probes_seq_stop(struct seq_file *m, void *v)
389{
390 mutex_unlock(&uprobe_lock);
391}
392
393static int probes_seq_show(struct seq_file *m, void *v)
394{
395 struct trace_uprobe *tu = v;
396 int i;
397
398 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
399 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
400
401 for (i = 0; i < tu->nr_args; i++)
402 seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm);
403
404 seq_printf(m, "\n");
405 return 0;
406}
407
408static const struct seq_operations probes_seq_op = {
409 .start = probes_seq_start,
410 .next = probes_seq_next,
411 .stop = probes_seq_stop,
412 .show = probes_seq_show
413};
414
415static int probes_open(struct inode *inode, struct file *file)
416{
417 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
418 cleanup_all_probes();
419
420 return seq_open(file, &probes_seq_op);
421}
422
423static ssize_t probes_write(struct file *file, const char __user *buffer,
424 size_t count, loff_t *ppos)
425{
426 return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
427}
428
429static const struct file_operations uprobe_events_ops = {
430 .owner = THIS_MODULE,
431 .open = probes_open,
432 .read = seq_read,
433 .llseek = seq_lseek,
434 .release = seq_release,
435 .write = probes_write,
436};
437
438/* Probes profiling interfaces */
439static int probes_profile_seq_show(struct seq_file *m, void *v)
440{
441 struct trace_uprobe *tu = v;
442
443 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit);
444 return 0;
445}
446
447static const struct seq_operations profile_seq_op = {
448 .start = probes_seq_start,
449 .next = probes_seq_next,
450 .stop = probes_seq_stop,
451 .show = probes_profile_seq_show
452};
453
454static int profile_open(struct inode *inode, struct file *file)
455{
456 return seq_open(file, &profile_seq_op);
457}
458
459static const struct file_operations uprobe_profile_ops = {
460 .owner = THIS_MODULE,
461 .open = profile_open,
462 .read = seq_read,
463 .llseek = seq_lseek,
464 .release = seq_release,
465};
466
467/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{
470 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event;
472 struct ring_buffer *buffer;
473 u8 *data;
474 int size, i, pc;
475 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call;
477
478 tu->nhit++;
479
480 local_save_flags(irq_flags);
481 pc = preempt_count();
482
483 size = sizeof(*entry) + tu->size;
484
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc);
487 if (!event)
488 return;
489
490 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
492 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495
496 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
498}
499
500/* Event entry printers */
501static enum print_line_t
502print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
503{
504 struct uprobe_trace_entry_head *field;
505 struct trace_seq *s = &iter->seq;
506 struct trace_uprobe *tu;
507 u8 *data;
508 int i;
509
510 field = (struct uprobe_trace_entry_head *)iter->ent;
511 tu = container_of(event, struct trace_uprobe, call.event);
512
513 if (!trace_seq_printf(s, "%s: (", tu->call.name))
514 goto partial;
515
516 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
517 goto partial;
518
519 if (!trace_seq_puts(s, ")"))
520 goto partial;
521
522 data = (u8 *)&field[1];
523 for (i = 0; i < tu->nr_args; i++) {
524 if (!tu->args[i].type->print(s, tu->args[i].name,
525 data + tu->args[i].offset, field))
526 goto partial;
527 }
528
529 if (trace_seq_puts(s, "\n"))
530 return TRACE_TYPE_HANDLED;
531
532partial:
533 return TRACE_TYPE_PARTIAL_LINE;
534}
535
536static int probe_event_enable(struct trace_uprobe *tu, int flag)
537{
538 struct uprobe_trace_consumer *utc;
539 int ret = 0;
540
541 if (!tu->inode || tu->consumer)
542 return -EINTR;
543
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
545 if (!utc)
546		return -ENOMEM;
547
548 utc->cons.handler = uprobe_dispatcher;
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555
556 tu->flags |= flag;
557 utc->tu = tu;
558 tu->consumer = utc;
559
560 return 0;
561}
562
563static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{
565 if (!tu->inode || !tu->consumer)
566 return;
567
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
569 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572}
573
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
575{
576 int ret, i;
577 struct uprobe_trace_entry_head field;
578 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
579
580 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
581 /* Set argument names as fields */
582 for (i = 0; i < tu->nr_args; i++) {
583 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
584 tu->args[i].name,
585 sizeof(field) + tu->args[i].offset,
586 tu->args[i].type->size,
587 tu->args[i].type->is_signed,
588 FILTER_OTHER);
589
590 if (ret)
591 return ret;
592 }
593 return 0;
594}
595
596#define LEN_OR_ZERO (len ? len - pos : 0)
597static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
598{
599 const char *fmt, *arg;
600 int i;
601 int pos = 0;
602
603 fmt = "(%lx)";
604 arg = "REC->" FIELD_STRING_IP;
605
606 /* When len=0, we just calculate the needed length */
607
608 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
609
610 for (i = 0; i < tu->nr_args; i++) {
611 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
612 tu->args[i].name, tu->args[i].type->fmt);
613 }
614
615 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
616
617 for (i = 0; i < tu->nr_args; i++) {
618 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
619 tu->args[i].name);
620 }
621
622 return pos; /* return the length of print_fmt */
623}
624#undef LEN_OR_ZERO
625
626static int set_print_fmt(struct trace_uprobe *tu)
627{
628 char *print_fmt;
629 int len;
630
631 /* First: called with 0 length to calculate the needed length */
632 len = __set_print_fmt(tu, NULL, 0);
633 print_fmt = kmalloc(len + 1, GFP_KERNEL);
634 if (!print_fmt)
635 return -ENOMEM;
636
637 /* Second: actually write the @print_fmt */
638 __set_print_fmt(tu, print_fmt, len + 1);
639 tu->call.print_fmt = print_fmt;
640
641 return 0;
642}
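The two-pass pattern (measure with len 0, then allocate and write) is standard snprintf() sizing. A self-contained userspace sketch that builds the print_fmt a one-argument probe of the default type would get:

#include <stdio.h>
#include <stdlib.h>

#define LEN_OR_ZERO (len ? len - pos : 0)

static int build_fmt(char *buf, int len)
{
	int pos = 0;

	pos += snprintf(buf ? buf + pos : NULL, LEN_OR_ZERO,
			"\"(%%lx) arg1=%%lx\"");
	pos += snprintf(buf ? buf + pos : NULL, LEN_OR_ZERO,
			", REC->__probe_ip, REC->arg1");
	return pos;	/* needed length, NUL excluded */
}

int main(void)
{
	int len = build_fmt(NULL, 0);		/* pass 1: measure only */
	char *fmt = malloc(len + 1);

	if (!fmt)
		return 1;
	build_fmt(fmt, len + 1);		/* pass 2: actually write */
	printf("%s\n", fmt);	/* "(%lx) arg1=%lx", REC->__probe_ip, REC->arg1 */
	free(fmt);
	return 0;
}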
643
644#ifdef CONFIG_PERF_EVENTS
645/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{
648 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry;
650 struct hlist_head *head;
651 u8 *data;
652 int size, __size, i;
653 int rctx;
654
655 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return;
660
661 preempt_disable();
662
663 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
664 if (!entry)
665 goto out;
666
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
668 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
671
672 head = this_cpu_ptr(call->perf_events);
673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
674
675 out:
676 preempt_enable();
677}
678#endif /* CONFIG_PERF_EVENTS */
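The size arithmetic above accounts for the u32 size header that perf prepends to each record: it rounds the total up to a u64 boundary and then subtracts the header again. A quick userspace check of that invariant, with a made-up entry size:

#include <assert.h>
#include <stdint.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	uint64_t entry_size = 22;	/* hypothetical sizeof(*entry) + tu->size */
	uint64_t size = ALIGN(entry_size + sizeof(uint32_t), sizeof(uint64_t))
			- sizeof(uint32_t);

	assert(size == 28);				/* padded payload */
	assert((size + sizeof(uint32_t)) % 8 == 0);	/* header + payload */
	return 0;
}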
679
680static
681int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
682{
683 struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
684
685 switch (type) {
686 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE);
688
689 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE);
691 return 0;
692
693#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE);
696
697 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0;
700#endif
701 default:
702 return 0;
703 }
704 return 0;
705}
706
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu;
711
712 utc = container_of(con, struct uprobe_trace_consumer, cons);
713 tu = utc->tu;
714 if (!tu || tu->consumer != utc)
715 return 0;
716
717 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs);
719
720#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs);
723#endif
724 return 0;
725}
726
727static struct trace_event_functions uprobe_funcs = {
728 .trace = print_uprobe_event
729};
730
731static int register_uprobe_event(struct trace_uprobe *tu)
732{
733 struct ftrace_event_call *call = &tu->call;
734 int ret;
735
736 /* Initialize ftrace_event_call */
737 INIT_LIST_HEAD(&call->class->fields);
738 call->event.funcs = &uprobe_funcs;
739 call->class->define_fields = uprobe_event_define_fields;
740
741 if (set_print_fmt(tu) < 0)
742 return -ENOMEM;
743
744 ret = register_ftrace_event(&call->event);
745 if (!ret) {
746 kfree(call->print_fmt);
747 return -ENODEV;
748 }
749 call->flags = 0;
750 call->class->reg = trace_uprobe_register;
751 call->data = tu;
752 ret = trace_add_event_call(call);
753
754 if (ret) {
755 pr_info("Failed to register uprobe event: %s\n", call->name);
756 kfree(call->print_fmt);
757 unregister_ftrace_event(&call->event);
758 }
759
760 return ret;
761}
762
763static void unregister_uprobe_event(struct trace_uprobe *tu)
764{
765 /* tu->event is unregistered in trace_remove_event_call() */
766 trace_remove_event_call(&tu->call);
767 kfree(tu->call.print_fmt);
768 tu->call.print_fmt = NULL;
769}
770
771/* Make a trace interface for controlling probe points */
772static __init int init_uprobe_trace(void)
773{
774 struct dentry *d_tracer;
775
776 d_tracer = tracing_init_dentry();
777 if (!d_tracer)
778 return 0;
779
780 trace_create_file("uprobe_events", 0644, d_tracer,
781 NULL, &uprobe_events_ops);
782 /* Profile interface */
783 trace_create_file("uprobe_profile", 0444, d_tracer,
784 NULL, &uprobe_profile_ops);
785 return 0;
786}
787
788fs_initcall(init_uprobe_trace);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index df30ee08bdd4..e5e1d85b8c7c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25 25
26#include <asm/irq_regs.h> 26#include <asm/irq_regs.h>
27#include <linux/kvm_para.h>
27#include <linux/perf_event.h> 28#include <linux/perf_event.h>
28 29
29int watchdog_enabled = 1; 30int watchdog_enabled = 1;
@@ -280,6 +281,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
280 __this_cpu_write(softlockup_touch_sync, false); 281 __this_cpu_write(softlockup_touch_sync, false);
281 sched_clock_tick(); 282 sched_clock_tick();
282 } 283 }
284
285 /* Clear the guest paused flag on watchdog reset */
286 kvm_check_and_clear_guest_paused();
283 __touch_watchdog(); 287 __touch_watchdog();
284 return HRTIMER_RESTART; 288 return HRTIMER_RESTART;
285 } 289 }
@@ -292,6 +296,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
292 */ 296 */
293 duration = is_softlockup(touch_ts); 297 duration = is_softlockup(touch_ts);
294 if (unlikely(duration)) { 298 if (unlikely(duration)) {
299 /*
300	 * If a virtual machine is stopped by the host, it can look to
301	 * the watchdog like a soft lockup; check whether the host
302	 * stopped the VM before issuing the warning.
303 */
304 if (kvm_check_and_clear_guest_paused())
305 return HRTIMER_RESTART;
306
295 /* only warn once */ 307 /* only warn once */
296 if (__this_cpu_read(soft_watchdog_warn) == true) 308 if (__this_cpu_read(soft_watchdog_warn) == true)
297 return HRTIMER_RESTART; 309 return HRTIMER_RESTART;