author    Anton Altaparmakov <aia21@cantab.net>  2005-06-25 09:27:27 -0400
committer Anton Altaparmakov <aia21@cantab.net>  2005-06-25 09:27:27 -0400
commit    38b22b6e9f46ab8f73ef5734f0e0a000766a9258 (patch)
tree      2ccc41ef55918d3af43e444bde7648562a031559 /kernel
parent    3357d4c75f1fb67e7304998c4ad4e9a9fed66fa4 (diff)
parent    b3e112bcc19abd8e9657dca34a87316786e096f3 (diff)
Automerge with /usr/src/ntfs-2.6.git.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz        46
-rw-r--r--  kernel/cpuset.c           8
-rw-r--r--  kernel/exit.c            16
-rw-r--r--  kernel/irq/spurious.c     2
-rw-r--r--  kernel/kmod.c            17
-rw-r--r--  kernel/kprobes.c        288
-rw-r--r--  kernel/module.c          97
-rw-r--r--  kernel/posix-timers.c    34
-rw-r--r--  kernel/power/swsusp.c     2
-rw-r--r--  kernel/printk.c          12
-rw-r--r--  kernel/sched.c            4
-rw-r--r--  kernel/signal.c           1
-rw-r--r--  kernel/sys.c            110
-rw-r--r--  kernel/sysctl.c           9
-rw-r--r--  kernel/timer.c          351
15 files changed, 718 insertions(+), 279 deletions(-)
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 000000000000..248e1c396f8b
--- /dev/null
+++ b/kernel/Kconfig.hz
@@ -0,0 +1,46 @@
+#
+# Timer Interrupt Frequency Configuration
+#
+
+choice
+	prompt "Timer frequency"
+	default HZ_250
+	help
+	 Allows the configuration of the timer frequency. It is customary
+	 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
+	 beneficial for servers and NUMA systems that do not need to have
+	 a fast response for user interaction and that may experience bus
+	 contention and cacheline bounces as a result of timer interrupts.
+	 Note that the timer interrupt occurs on each processor in an SMP
+	 environment leading to NR_CPUS * HZ number of timer interrupts
+	 per second.
+
+
+	config HZ_100
+		bool "100 HZ"
+	help
+	  100 HZ is a typical choice for servers, SMP and NUMA systems
+	  with lots of processors that may show reduced performance if
+	  too many timer interrupts are occurring.
+
+	config HZ_250
+		bool "250 HZ"
+	help
+	 250 HZ is a good compromise choice allowing server performance
+	 while also showing good interactive responsiveness even
+	 on SMP and NUMA systems.
+
+	config HZ_1000
+		bool "1000 HZ"
+	help
+	 1000 HZ is the preferred choice for desktop systems and other
+	 systems requiring fast interactive responses to events.
+
+endchoice
+
+config HZ
+	int
+	default 100 if HZ_100
+	default 250 if HZ_250
+	default 1000 if HZ_1000
+
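As context for the choice above: HZ sets the tick length that all jiffies arithmetic is based on, so at HZ=250 a tick is 4 ms and a 100 ms timeout is 25 ticks. A minimal sketch of that conversion (hypothetical helper for illustration; the real kernel provides msecs_to_jiffies() for this):

#include <linux/jiffies.h>	/* HZ, jiffies */

/* Hypothetical helper, illustration only: round a millisecond
 * timeout up to whole ticks so it never expires early. */
static inline unsigned long ms_to_ticks(unsigned int ms)
{
	return (ms * (unsigned long)HZ + 999) / 1000;
}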
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00e8f2575512..79dd929f4084 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = {
 
 static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
 {
-	struct qstr qstr;
-	struct dentry *d;
-
-	qstr.name = name;
-	qstr.len = strlen(name);
-	qstr.hash = full_name_hash(name, qstr.len);
-	d = lookup_hash(&qstr, parent);
+	struct dentry *d = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(d))
 		d->d_op = &cpuset_dops;
 	return d;
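The replacement works because lookup_one_len() bundles the qstr setup and hashed lookup that cpuset_get_dentry() used to open-code. A simplified sketch of that shape, offered as an illustration rather than the fs/namei.c source (the real function also validates the name and checks permission on the base directory):

/* Simplified sketch, not the real implementation. */
static struct dentry *lookup_one_len_sketch(const char *name,
					    struct dentry *base, int len)
{
	struct qstr this = {
		.name = name,
		.len  = len,
		.hash = full_name_hash(name, len),
	};

	return lookup_hash(&this, base);
}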
diff --git a/kernel/exit.c b/kernel/exit.c
index 2ef2ad540201..3ebcd60a19c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,11 @@ repeat:
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
 	__exit_sighand(p);
+	/*
+	 * Note that the fastpath in sys_times depends on __exit_signal having
+	 * updated the counters before a task is removed from the tasklist of
+	 * the process by __unhash_process.
+	 */
 	__unhash_process(p);
 
 	/*
@@ -793,6 +798,17 @@ fastcall NORET_TYPE void do_exit(long code)
 		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
 	}
 
+	/*
+	 * We're taking recursive faults here in do_exit. Safest is to just
+	 * leave this task alone and wait for reboot.
+	 */
+	if (unlikely(tsk->flags & PF_EXITING)) {
+		printk(KERN_ALERT
+			"Fixing recursive fault but reboot is needed!\n");
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule();
+	}
+
 	tsk->flags |= PF_EXITING;
 
 	/*
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f6297c306905..ba039e827d58 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -45,7 +45,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 	}
 }
 
-void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 {
 	static int count = 100;
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index eed53d4f5230..44166e3bb8af 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,6 +120,7 @@ struct subprocess_info {
 	char *path;
 	char **argv;
 	char **envp;
+	struct key *ring;
 	int wait;
 	int retval;
 };
@@ -130,16 +131,21 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
+	struct key *old_session;
 	int retval;
 
-	/* Unblock all signals. */
+	/* Unblock all signals and set the session keyring. */
+	key_get(sub_info->ring);
 	flush_signals(current);
 	spin_lock_irq(&current->sighand->siglock);
+	old_session = __install_session_keyring(current, sub_info->ring);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
+	key_put(old_session);
+
 	/* We can run anywhere, unlike our parent keventd(). */
 	set_cpus_allowed(current, CPU_MASK_ALL);
 
@@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data)
 }
 
 /**
- * call_usermodehelper - start a usermode application
+ * call_usermodehelper_keys - start a usermode application
  * @path: pathname for the application
  * @argv: null-terminated argument list
  * @envp: null-terminated environment list
+ * @session_keyring: session keyring for process (NULL for an empty keyring)
  * @wait: wait for the application to finish and return status.
  *
  * Runs a user-space application. The application is started
@@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data)
  * Must be called from process context. Returns a negative error code
  * if program was not execed successfully, or 0.
  */
-int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+int call_usermodehelper_keys(char *path, char **argv, char **envp,
+			     struct key *session_keyring, int wait)
 {
 	DECLARE_COMPLETION(done);
 	struct subprocess_info sub_info = {
@@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 		.path = path,
 		.argv = argv,
 		.envp = envp,
+		.ring = session_keyring,
 		.wait = wait,
 		.retval = 0,
 	};
@@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
-EXPORT_SYMBOL(call_usermodehelper);
+EXPORT_SYMBOL(call_usermodehelper_keys);
 
 void __init usermodehelper_init(void)
 {
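Callers of the old call_usermodehelper() keep working if it stays as a thin wrapper passing a NULL session keyring; a sketch of that compatibility shim (assumed to live in include/linux/kmod.h, it is not part of this hunk):

/* Assumed compatibility wrapper: old callers get the default
 * (empty) session keyring behaviour. */
static inline int call_usermodehelper(char *path, char **argv,
				      char **envp, int wait)
{
	return call_usermodehelper_keys(path, argv, envp, NULL, wait);
}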
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 037142b72a49..334f37472c56 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -27,6 +27,9 @@
  *		interface to access function arguments.
  * 2004-Sep	Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
  *		exceptions notifier to be first on the priority list.
+ * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> added function-return probes.
  */
 #include <linux/kprobes.h>
 #include <linux/spinlock.h>
@@ -41,6 +44,7 @@
 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
 
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
+static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
 unsigned int kprobe_cpu = NR_CPUS;
 static DEFINE_SPINLOCK(kprobe_lock);
@@ -78,22 +82,23 @@ struct kprobe *get_kprobe(void *addr)
  * Aggregate handlers for multiple kprobes support - these handlers
  * take care of invoking the individual kprobe handlers on p->list
  */
-int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp;
 
 	list_for_each_entry(kp, &p->list, list) {
 		if (kp->pre_handler) {
 			curr_kprobe = kp;
-			kp->pre_handler(kp, regs);
-			curr_kprobe = NULL;
+			if (kp->pre_handler(kp, regs))
+				return 1;
 		}
+		curr_kprobe = NULL;
 	}
 	return 0;
 }
 
-void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 			unsigned long flags)
 {
 	struct kprobe *kp;
 
@@ -107,7 +112,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 		return;
 }
 
-int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+			      int trapnr)
 {
 	/*
 	 * if we faulted "during" the execution of a user specified
@@ -120,19 +126,191 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
 	return 0;
 }
 
+static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe *kp = curr_kprobe;
+	if (curr_kprobe && kp->break_handler) {
+		if (kp->break_handler(kp, regs)) {
+			curr_kprobe = NULL;
+			return 1;
+		}
+	}
+	curr_kprobe = NULL;
+	return 0;
+}
+
+struct kprobe trampoline_p = {
+	.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+	.pre_handler = trampoline_probe_handler,
+	.post_handler = trampoline_post_handler
+};
+
+struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
+{
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+	hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
+		return ri;
+	return NULL;
+}
+
+static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
+{
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+	hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
+		return ri;
+	return NULL;
+}
+
+struct kretprobe_instance *get_rp_inst(void *sara)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct task_struct *tsk;
+	struct kretprobe_instance *ri;
+
+	tsk = arch_get_kprobe_task(sara);
+	head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+	hlist_for_each_entry(ri, node, head, hlist) {
+		if (ri->stack_addr == sara)
+			return ri;
+	}
+	return NULL;
+}
+
+void add_rp_inst(struct kretprobe_instance *ri)
+{
+	struct task_struct *tsk;
+	/*
+	 * Remove rp inst off the free list -
+	 * Add it back when probed function returns
+	 */
+	hlist_del(&ri->uflist);
+	tsk = arch_get_kprobe_task(ri->stack_addr);
+	/* Add rp inst onto table */
+	INIT_HLIST_NODE(&ri->hlist);
+	hlist_add_head(&ri->hlist,
+			&kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]);
+
+	/* Also add this rp inst to the used list. */
+	INIT_HLIST_NODE(&ri->uflist);
+	hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+}
+
+void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+	/* remove rp inst off the rprobe_inst_table */
+	hlist_del(&ri->hlist);
+	if (ri->rp) {
+		/* remove rp inst off the used list */
+		hlist_del(&ri->uflist);
+		/* put rp inst back onto the free list */
+		INIT_HLIST_NODE(&ri->uflist);
+		hlist_add_head(&ri->uflist, &ri->rp->free_instances);
+	} else
+		/* Unregistering */
+		kfree(ri);
+}
+
+struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
+{
+	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+}
+
+struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk)
+{
+	struct task_struct *tsk;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+
+	head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)];
+
+	hlist_for_each_entry(ri, node, head, hlist) {
+		tsk = arch_get_kprobe_task(ri->stack_addr);
+		if (tsk == tk)
+			return ri;
+	}
+	return NULL;
+}
+
+/*
+ * This function is called from do_exit or do_execv when task tk's stack is
+ * about to be recycled. Recycle any function-return probe instances
+ * associated with this task. These represent probed functions that have
+ * been called but may never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+	unsigned long flags = 0;
+	spin_lock_irqsave(&kprobe_lock, flags);
+	arch_kprobe_flush_task(tk);
+	spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+
+	/*TODO: consider to only swap the RA after the last pre_handler fired */
+	arch_prepare_kretprobe(rp, regs);
+	return 0;
+}
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+	struct kretprobe_instance *ri;
+	while ((ri = get_free_rp_inst(rp)) != NULL) {
+		hlist_del(&ri->uflist);
+		kfree(ri);
+	}
+}
+
+/*
+ * Keep all fields in the kprobe consistent
+ */
+static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+	memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
+	memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+}
+
+/*
+ * Add the new probe to old_p->list. Fail if this is the
+ * second jprobe at the address - two jprobes can't coexist
+ */
+static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+	struct kprobe *kp;
+
+	if (p->break_handler) {
+		list_for_each_entry(kp, &old_p->list, list) {
+			if (kp->break_handler)
+				return -EEXIST;
+		}
+		list_add_tail(&p->list, &old_p->list);
+	} else
+		list_add(&p->list, &old_p->list);
+	return 0;
+}
+
 /*
  * Fill in the required fields of the "manager kprobe". Replace the
  * earlier kprobe in the hlist with the manager kprobe
  */
 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 {
+	copy_kprobe(p, ap);
 	ap->addr = p->addr;
-	ap->opcode = p->opcode;
-	memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
-
 	ap->pre_handler = aggr_pre_handler;
 	ap->post_handler = aggr_post_handler;
 	ap->fault_handler = aggr_fault_handler;
+	ap->break_handler = aggr_break_handler;
 
 	INIT_LIST_HEAD(&ap->list);
 	list_add(&p->list, &ap->list);
@@ -153,16 +331,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
 	int ret = 0;
 	struct kprobe *ap;
 
-	if (old_p->break_handler || p->break_handler) {
-		ret = -EEXIST;	/* kprobe and jprobe can't (yet) coexist */
-	} else if (old_p->pre_handler == aggr_pre_handler) {
-		list_add(&p->list, &old_p->list);
+	if (old_p->pre_handler == aggr_pre_handler) {
+		copy_kprobe(old_p, p);
+		ret = add_new_kprobe(old_p, p);
 	} else {
 		ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC);
 		if (!ap)
 			return -ENOMEM;
 		add_aggr_kprobe(ap, old_p);
-		list_add(&p->list, &ap->list);
+		copy_kprobe(ap, p);
+		ret = add_new_kprobe(ap, p);
 	}
 	return ret;
 }
@@ -170,10 +348,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
 /* kprobe removal house-keeping routines */
 static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
 {
-	*p->addr = p->opcode;
+	arch_disarm_kprobe(p);
 	hlist_del(&p->hlist);
-	flush_icache_range((unsigned long) p->addr,
-			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 	arch_remove_kprobe(p);
 }
@@ -200,6 +376,7 @@ int register_kprobe(struct kprobe *p)
 	}
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
+	p->nmissed = 0;
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
 		goto out;
@@ -210,10 +387,8 @@ int register_kprobe(struct kprobe *p)
 	hlist_add_head(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	p->opcode = *p->addr;
-	*p->addr = BREAKPOINT_INSTRUCTION;
-	flush_icache_range((unsigned long) p->addr,
-			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+	arch_arm_kprobe(p);
+
 out:
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 rm_kprobe:
@@ -257,16 +432,82 @@ void unregister_jprobe(struct jprobe *jp)
 	unregister_kprobe(&jp->kp);
 }
 
+#ifdef ARCH_SUPPORTS_KRETPROBES
+
+int register_kretprobe(struct kretprobe *rp)
+{
+	int ret = 0;
+	struct kretprobe_instance *inst;
+	int i;
+
+	rp->kp.pre_handler = pre_handler_kretprobe;
+
+	/* Pre-allocate memory for max kretprobe instances */
+	if (rp->maxactive <= 0) {
+#ifdef CONFIG_PREEMPT
+		rp->maxactive = max(10, 2 * NR_CPUS);
+#else
+		rp->maxactive = NR_CPUS;
+#endif
+	}
+	INIT_HLIST_HEAD(&rp->used_instances);
+	INIT_HLIST_HEAD(&rp->free_instances);
+	for (i = 0; i < rp->maxactive; i++) {
+		inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
+		if (inst == NULL) {
+			free_rp_inst(rp);
+			return -ENOMEM;
+		}
+		INIT_HLIST_NODE(&inst->uflist);
+		hlist_add_head(&inst->uflist, &rp->free_instances);
+	}
+
+	rp->nmissed = 0;
+	/* Establish function entry probe point */
+	if ((ret = register_kprobe(&rp->kp)) != 0)
+		free_rp_inst(rp);
+	return ret;
+}
+
+#else /* ARCH_SUPPORTS_KRETPROBES */
+
+int register_kretprobe(struct kretprobe *rp)
+{
+	return -ENOSYS;
+}
+
+#endif /* ARCH_SUPPORTS_KRETPROBES */
+
+void unregister_kretprobe(struct kretprobe *rp)
+{
+	unsigned long flags;
+	struct kretprobe_instance *ri;
+
+	unregister_kprobe(&rp->kp);
+	/* No race here */
+	spin_lock_irqsave(&kprobe_lock, flags);
+	free_rp_inst(rp);
+	while ((ri = get_used_rp_inst(rp)) != NULL) {
+		ri->rp = NULL;
+		hlist_del(&ri->uflist);
+	}
+	spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
 static int __init init_kprobes(void)
 {
 	int i, err = 0;
 
 	/* FIXME allocate the probe table, currently defined statically */
 	/* initialize all list heads */
-	for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		INIT_HLIST_HEAD(&kprobe_table[i]);
+		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+	}
 
 	err = register_die_notifier(&kprobe_exceptions_nb);
+	/* Register the trampoline probe for return probe */
+	register_kprobe(&trampoline_p);
 	return err;
 }
 
@@ -277,3 +518,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe);
 EXPORT_SYMBOL_GPL(register_jprobe);
 EXPORT_SYMBOL_GPL(unregister_jprobe);
 EXPORT_SYMBOL_GPL(jprobe_return);
+EXPORT_SYMBOL_GPL(register_kretprobe);
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+
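For orientation, a hedged usage sketch of the new return-probe API; the .handler field and its signature are assumptions drawn from the contemporary kprobes interface, and do_fork stands in for any probed function:

#include <linux/kprobes.h>

/* Assumed return-handler signature: invoked via the trampoline when
 * the probed function returns. */
static int my_ret_handler(struct kretprobe_instance *ri,
			  struct pt_regs *regs)
{
	/* inspect the return state here */
	return 0;
}

static struct kretprobe my_rp = {
	.kp.addr   = (kprobe_opcode_t *) do_fork,	/* function to probe */
	.handler   = my_ret_handler,
	.maxactive = 0,	/* <= 0: register_kretprobe() picks a default */
};

/* register_kretprobe(&my_rp) pre-allocates maxactive instances and
 * plants a kprobe at function entry; unregister_kretprobe(&my_rp)
 * reclaims them, orphaning any still-outstanding returns. */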
diff --git a/kernel/module.c b/kernel/module.c
index a566745dde62..068e271ab3a5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -35,6 +35,7 @@
 #include <linux/notifier.h>
 #include <linux/stop_machine.h>
 #include <linux/device.h>
+#include <linux/string.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
@@ -370,6 +371,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_MODULE_UNLOAD
+#define MODINFO_ATTR(field)	\
+static void setup_modinfo_##field(struct module *mod, const char *s)  \
+{                                                                     \
+	mod->field = kstrdup(s, GFP_KERNEL);                          \
+}                                                                     \
+static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
+	                struct module *mod, char *buffer)             \
+{                                                                     \
+	return sprintf(buffer, "%s\n", mod->field);                   \
+}                                                                     \
+static int modinfo_##field##_exists(struct module *mod)               \
+{                                                                     \
+	return mod->field != NULL;                                    \
+}                                                                     \
+static void free_modinfo_##field(struct module *mod)                  \
+{                                                                     \
+	kfree(mod->field);                                            \
+	mod->field = NULL;                                            \
+}                                                                     \
+static struct module_attribute modinfo_##field = {                    \
+	.attr = { .name = __stringify(field), .mode = 0444,          \
+		  .owner = THIS_MODULE },                             \
+	.show = show_modinfo_##field,                                 \
+	.setup = setup_modinfo_##field,                               \
+	.test = modinfo_##field##_exists,                             \
+	.free = free_modinfo_##field,                                 \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static struct module_attribute *modinfo_attrs[] = {
+	&modinfo_version,
+	&modinfo_srcversion,
+	NULL,
+};
+
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
@@ -692,7 +730,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp)
 	return 0;
 }
 
-int set_obsolete(const char *val, struct kernel_param *kp)
+static int set_obsolete(const char *val, struct kernel_param *kp)
 {
 	unsigned int min, max;
 	unsigned int size, maxsize;
@@ -1031,6 +1069,32 @@ static void module_remove_refcnt_attr(struct module *mod)
 }
 #endif
 
+#ifdef CONFIG_MODULE_UNLOAD
+static int module_add_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	int error = 0;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
+		if (!attr->test ||
+		    (attr->test && attr->test(mod)))
+			error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
+	}
+	return error;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+		attr->free(mod);
+	}
+}
+#endif
 
 static int mod_sysfs_setup(struct module *mod,
 			   struct kernel_param *kparam,
@@ -1056,6 +1120,12 @@ static int mod_sysfs_setup(struct module *mod,
 	if (err)
 		goto out_unreg;
 
+#ifdef CONFIG_MODULE_UNLOAD
+	err = module_add_modinfo_attrs(mod);
+	if (err)
+		goto out_unreg;
+#endif
+
 	return 0;
 
 out_unreg:
@@ -1066,6 +1136,9 @@ out:
 
 static void mod_kobject_remove(struct module *mod)
 {
+#ifdef CONFIG_MODULE_UNLOAD
+	module_remove_modinfo_attrs(mod);
+#endif
 	module_remove_refcnt_attr(mod);
 	module_param_sysfs_remove(mod);
 
@@ -1311,6 +1384,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
 	return NULL;
 }
 
+#ifdef CONFIG_MODULE_UNLOAD
+static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
+			  unsigned int infoindex)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (attr->setup)
+			attr->setup(mod,
+				    get_modinfo(sechdrs,
+						infoindex,
+						attr->attr.name));
+	}
+}
+#endif
+
 #ifdef CONFIG_KALLSYMS
 int is_exported(const char *name, const struct module *mod)
 {
@@ -1615,6 +1705,11 @@ static struct module *load_module(void __user *umod,
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
+#ifdef CONFIG_MODULE_UNLOAD
+	/* Set up MODINFO_ATTR fields */
+	setup_modinfo(mod, sechdrs, infoindex);
+#endif
+
 	/* Fix up syms, so that st_value is a pointer to location. */
 	err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
 			       mod);
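The MODINFO_ATTR() expansions above publish .modinfo strings under /sys/module/<name>/; a short sketch of the module-side source of those strings (standard macros; srcversion is normally computed by modpost rather than declared by hand):

#include <linux/module.h>

/* MODULE_VERSION() emits a "version=" string into the .modinfo ELF
 * section; load_module() -> setup_modinfo() copies it into
 * mod->version, and it then appears as /sys/module/example/version. */
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");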
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index cabb63fc9e16..5b7b4736d82b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -89,23 +89,6 @@ static struct idr posix_timers_id;
 static DEFINE_SPINLOCK(idr_lock);
 
 /*
- * Just because the timer is not in the timer list does NOT mean it is
- * inactive. It could be in the "fire" routine getting a new expire time.
- */
-#define TIMER_INACTIVE 1
-
-#ifdef CONFIG_SMP
-# define timer_active(tmr) \
-		((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
-# define set_timer_inactive(tmr) \
-	do { \
-		(tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
-	} while (0)
-#else
-# define timer_active(tmr) BARFY	// error to use outside of SMP
-# define set_timer_inactive(tmr) do { } while (0)
-#endif
-/*
  * we assume that the new SIGEV_THREAD_ID shares no bits with the other
  * SIGEV values. Here we put out an error if this assumption fails.
  */
@@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer)
 	init_timer(&new_timer->it.real.timer);
 	new_timer->it.real.timer.data = (unsigned long) new_timer;
 	new_timer->it.real.timer.function = posix_timer_fn;
-	set_timer_inactive(new_timer);
 	return 0;
 }
 
@@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data)
 	int do_notify = 1;
 
 	spin_lock_irqsave(&timr->it_lock, flags);
-	set_timer_inactive(timr);
 	if (!list_empty(&timr->it.real.abs_timer_entry)) {
 		spin_lock(&abs_list.lock);
 		do {
@@ -983,8 +964,8 @@ common_timer_set(struct k_itimer *timr, int flags,
 	 * careful here. If smp we could be in the "fire" routine which will
 	 * be spinning as we hold the lock. But this is ONLY an SMP issue.
 	 */
+	if (try_to_del_timer_sync(&timr->it.real.timer) < 0) {
 #ifdef CONFIG_SMP
-	if (timer_active(timr) && !del_timer(&timr->it.real.timer))
 		/*
 		 * It can only be active if on an other cpu. Since
 		 * we have cleared the interval stuff above, it should
@@ -994,11 +975,9 @@ common_timer_set(struct k_itimer *timr, int flags,
 		 * a "retry" exit status.
 		 */
 		return TIMER_RETRY;
-
-	set_timer_inactive(timr);
-#else
-	del_timer(&timr->it.real.timer);
 #endif
+	}
+
 	remove_from_abslist(timr);
 
 	timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
@@ -1083,8 +1062,9 @@ retry:
 static inline int common_timer_del(struct k_itimer *timer)
 {
 	timer->it.real.incr = 0;
+
+	if (try_to_del_timer_sync(&timer->it.real.timer) < 0) {
 #ifdef CONFIG_SMP
-	if (timer_active(timer) && !del_timer(&timer->it.real.timer))
 		/*
 		 * It can only be active if on an other cpu. Since
 		 * we have cleared the interval stuff above, it should
@@ -1094,9 +1074,9 @@ static inline int common_timer_del(struct k_itimer *timer)
 		 * a "retry" exit status.
 		 */
 		return TIMER_RETRY;
-#else
-	del_timer(&timer->it.real.timer);
 #endif
+	}
+
 	remove_from_abslist(timer);
 
 	return 0;
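Both call sites above lean on try_to_del_timer_sync()'s three-way result; a hedged restatement of the contract as used here (illustrative helper, not from the patch):

/* Illustrative only:
 *   ret < 0  - handler is running on another CPU (SMP only); the
 *              caller backs out and retries via TIMER_RETRY,
 *   ret == 0 - timer was not pending,
 *   ret == 1 - timer was pending and has been detached. */
static int illustrate_timer_del(struct k_itimer *t)
{
	if (try_to_del_timer_sync(&t->it.real.timer) < 0)
		return TIMER_RETRY;	/* unreachable on UP */
	return 0;			/* timer is definitely inactive */
}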
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 90b3b68dee3f..53f9f8720ee4 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -81,7 +81,7 @@ static int nr_copy_pages_check;
 extern char resume_file[];
 
 /* Local variables that should not be affected by save */
-unsigned int nr_copy_pages __nosavedata = 0;
+static unsigned int nr_copy_pages __nosavedata = 0;
 
 /* Suspend pagedir is allocated before final copy, therefore it
    must be freed after resume
diff --git a/kernel/printk.c b/kernel/printk.c
index 01b58d7d17ff..3a442bfb8bee 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -876,8 +876,10 @@ void register_console(struct console * console)
 			break;
 		console->flags |= CON_ENABLED;
 		console->index = console_cmdline[i].index;
-		if (i == preferred_console)
+		if (i == selected_console) {
 			console->flags |= CON_CONSDEV;
+			preferred_console = selected_console;
+		}
 		break;
 	}
 
@@ -897,6 +899,8 @@ void register_console(struct console * console)
 	if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
 		console->next = console_drivers;
 		console_drivers = console;
+		if (console->next)
+			console->next->flags &= ~CON_CONSDEV;
 	} else {
 		console->next = console_drivers->next;
 		console_drivers->next = console;
@@ -937,10 +941,14 @@ int unregister_console(struct console * console)
 	/* If last console is removed, we re-enable picking the first
 	 * one that gets registered. Without that, pmac early boot console
 	 * would prevent fbcon from taking over.
+	 *
+	 * If this isn't the last console and it has CON_CONSDEV set, we
+	 * need to set it on the next preferred console.
 	 */
 	if (console_drivers == NULL)
 		preferred_console = selected_console;
-
+	else if (console->flags & CON_CONSDEV)
+		console_drivers->flags |= CON_CONSDEV;
 
 	release_console_sem();
 	return res;
diff --git a/kernel/sched.c b/kernel/sched.c
index deca041fc364..76080d142e3d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2576,7 +2576,7 @@ void fastcall add_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-	BUG_ON(((int)preempt_count() < 0));
+	BUG_ON((preempt_count() < 0));
 	preempt_count() += val;
 	/*
 	 * Spinlock count overflowing soon?
@@ -2869,7 +2869,7 @@ need_resched:
 
 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
 {
-	task_t *p = curr->task;
+	task_t *p = curr->private;
 	return try_to_wake_up(p, mode, sync);
 }
 
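default_wake_function() now reads the renamed wait-queue field; a sketch of how an entry is wired so ->private carries the sleeping task (field names assumed from this rename, the initializer itself is not in this hunk):

/* Sketch under the assumption that wait_queue_t's old ->task slot is
 * now the void *private member: default_wake_function() hands it
 * straight to try_to_wake_up() when the queue is woken. */
static inline void init_wait_entry_sketch(wait_queue_t *q,
					  struct task_struct *p)
{
	q->flags = 0;
	q->private = p;		/* formerly q->task */
	q->func = default_wake_function;
}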
diff --git a/kernel/signal.c b/kernel/signal.c
index c89821b69ae3..d1258729a5f9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -213,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 fastcall void recalc_sigpending_tsk(struct task_struct *t)
 {
 	if (t->signal->group_stop_count > 0 ||
+	    (t->flags & PF_FREEZE) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked))
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
diff --git a/kernel/sys.c b/kernel/sys.c
index f006632c2ba7..da24bc1292db 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -525,7 +525,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 	}
 	if (new_egid != old_egid)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	if (rgid != (gid_t) -1 ||
@@ -556,7 +556,7 @@ asmlinkage long sys_setgid(gid_t gid)
 	{
 		if(old_egid != gid)
 		{
-			current->mm->dumpable=0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->gid = current->egid = current->sgid = current->fsgid = gid;
@@ -565,7 +565,7 @@ asmlinkage long sys_setgid(gid_t gid)
 	{
 		if(old_egid != gid)
 		{
-			current->mm->dumpable=0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->egid = current->fsgid = gid;
@@ -596,7 +596,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
 
 	if(dumpclear)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->uid = new_ruid;
@@ -653,7 +653,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 
 	if (new_euid != old_euid)
 	{
-		current->mm->dumpable=0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->fsuid = current->euid = new_euid;
@@ -703,7 +703,7 @@ asmlinkage long sys_setuid(uid_t uid)
 
 	if (old_euid != uid)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->fsuid = current->euid = uid;
@@ -748,7 +748,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 	if (euid != (uid_t) -1) {
 		if (euid != current->euid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->euid = euid;
@@ -798,7 +798,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 	if (egid != (gid_t) -1) {
 		if (egid != current->egid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->egid = egid;
@@ -845,7 +845,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
 	{
 		if (uid != old_fsuid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->fsuid = uid;
@@ -875,7 +875,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
 	{
 		if (gid != old_fsgid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->fsgid = gid;
@@ -894,35 +894,69 @@ asmlinkage long sys_times(struct tms __user * tbuf)
 	 */
 	if (tbuf) {
 		struct tms tmp;
-		struct task_struct *tsk = current;
-		struct task_struct *t;
 		cputime_t utime, stime, cutime, cstime;
 
-		read_lock(&tasklist_lock);
-		utime = tsk->signal->utime;
-		stime = tsk->signal->stime;
-		t = tsk;
-		do {
-			utime = cputime_add(utime, t->utime);
-			stime = cputime_add(stime, t->stime);
-			t = next_thread(t);
-		} while (t != tsk);
-
-		/*
-		 * While we have tasklist_lock read-locked, no dying thread
-		 * can be updating current->signal->[us]time. Instead,
-		 * we got their counts included in the live thread loop.
-		 * However, another thread can come in right now and
-		 * do a wait call that updates current->signal->c[us]time.
-		 * To make sure we always see that pair updated atomically,
-		 * we take the siglock around fetching them.
-		 */
-		spin_lock_irq(&tsk->sighand->siglock);
-		cutime = tsk->signal->cutime;
-		cstime = tsk->signal->cstime;
-		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
+#ifdef CONFIG_SMP
+		if (thread_group_empty(current)) {
+			/*
+			 * Single thread case without the use of any locks.
+			 *
+			 * We may race with release_task if two threads are
+			 * executing. However, release task first adds up the
+			 * counters (__exit_signal) before removing the task
+			 * from the process tasklist (__unhash_process).
+			 * __exit_signal also acquires and releases the
+			 * siglock which results in the proper memory ordering
+			 * so that the list modifications are always visible
+			 * after the counters have been updated.
+			 *
+			 * If the counters have been updated by the second thread
+			 * but the thread has not yet been removed from the list
+			 * then the other branch will be executing which will
+			 * block on tasklist_lock until the exit handling of the
+			 * other task is finished.
+			 *
+			 * This also implies that the sighand->siglock cannot
+			 * be held by another processor. So we can also
+			 * skip acquiring that lock.
+			 */
+			utime = cputime_add(current->signal->utime, current->utime);
+			stime = cputime_add(current->signal->utime, current->stime);
+			cutime = current->signal->cutime;
+			cstime = current->signal->cstime;
+		} else
+#endif
+		{
+
+			/* Process with multiple threads */
+			struct task_struct *tsk = current;
+			struct task_struct *t;
 
+			read_lock(&tasklist_lock);
+			utime = tsk->signal->utime;
+			stime = tsk->signal->stime;
+			t = tsk;
+			do {
+				utime = cputime_add(utime, t->utime);
+				stime = cputime_add(stime, t->stime);
+				t = next_thread(t);
+			} while (t != tsk);
+
+			/*
+			 * While we have tasklist_lock read-locked, no dying thread
+			 * can be updating current->signal->[us]time. Instead,
+			 * we got their counts included in the live thread loop.
+			 * However, another thread can come in right now and
+			 * do a wait call that updates current->signal->c[us]time.
+			 * To make sure we always see that pair updated atomically,
+			 * we take the siglock around fetching them.
+			 */
+			spin_lock_irq(&tsk->sighand->siglock);
+			cutime = tsk->signal->cutime;
+			cstime = tsk->signal->cstime;
+			spin_unlock_irq(&tsk->sighand->siglock);
+			read_unlock(&tasklist_lock);
+		}
 		tmp.tms_utime = cputime_to_clock_t(utime);
 		tmp.tms_stime = cputime_to_clock_t(stime);
 		tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1225,7 +1259,7 @@ static void groups_sort(struct group_info *group_info)
 }
 
 /* a simple bsearch */
-static int groups_search(struct group_info *group_info, gid_t grp)
+int groups_search(struct group_info *group_info, gid_t grp)
 {
 	int left, right;
 
@@ -1652,7 +1686,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			error = 1;
 			break;
 		case PR_SET_DUMPABLE:
-			if (arg2 != 0 && arg2 != 1) {
+			if (arg2 < 0 || arg2 > 2) {
 				error = -EINVAL;
 				break;
 			}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 701d12c63068..24a4d12d5aa9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio;
 extern int max_threads;
 extern int sysrq_enabled;
 extern int core_uses_pid;
+extern int suid_dumpable;
 extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
@@ -950,6 +951,14 @@ static ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= KERN_SETUID_DUMPABLE,
+		.procname	= "suid_dumpable",
+		.data		= &suid_dumpable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
 
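Together with the widened PR_SET_DUMPABLE range in kernel/sys.c above, this makes the setuid core-dump policy tunable; a hedged userspace sketch of both knobs (the /proc path follows from fs_table, and the reading of value 2 as a root-only "suidsafe" mode is an assumption):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/fs/suid_dumpable", "w");

	if (f) {
		fputs("2\n", f);	/* system-wide default policy */
		fclose(f);
	}
	/* per-process override; 0, 1 and 2 are accepted after this patch */
	prctl(PR_SET_DUMPABLE, 2);
	return 0;
}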
diff --git a/kernel/timer.c b/kernel/timer.c
index 207aa4f0aa10..51ff917c9590 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec);
 #define TVN_MASK (TVN_SIZE - 1)
 #define TVR_MASK (TVR_SIZE - 1)
 
+struct timer_base_s {
+	spinlock_t lock;
+	struct timer_list *running_timer;
+};
+
 typedef struct tvec_s {
 	struct list_head vec[TVN_SIZE];
 } tvec_t;
@@ -66,9 +71,8 @@ typedef struct tvec_root_s {
 } tvec_root_t;
 
 struct tvec_t_base_s {
-	spinlock_t lock;
+	struct timer_base_s t_base;
 	unsigned long timer_jiffies;
-	struct timer_list *running_timer;
 	tvec_root_t tv1;
 	tvec_t tv2;
 	tvec_t tv3;
@@ -77,18 +81,16 @@ struct tvec_t_base_s {
 } ____cacheline_aligned_in_smp;
 
 typedef struct tvec_t_base_s tvec_base_t;
+static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
 {
 #ifdef CONFIG_SMP
-	base->running_timer = timer;
+	base->t_base.running_timer = timer;
 #endif
 }
 
-/* Fake initialization */
-static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
-
 static void check_timer_failed(struct timer_list *timer)
 {
 	static int whine_count;
@@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer)
 	/*
 	 * Now fix it up
 	 */
-	spin_lock_init(&timer->lock);
 	timer->magic = TIMER_MAGIC;
 }
 
@@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 	list_add_tail(&timer->entry, vec);
 }
 
+typedef struct timer_base_s timer_base_t;
+/*
+ * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
+ * at compile time, and we need timer->base to lock the timer.
+ */
+timer_base_t __init_timer_base
+	____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
+EXPORT_SYMBOL(__init_timer_base);
+
+/***
+ * init_timer - initialize a timer.
+ * @timer: the timer to be initialized
+ *
+ * init_timer() must be done to a timer prior calling *any* of the
+ * other timer functions.
+ */
+void fastcall init_timer(struct timer_list *timer)
+{
+	timer->entry.next = NULL;
+	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
+	timer->magic = TIMER_MAGIC;
+}
+EXPORT_SYMBOL(init_timer);
+
+static inline void detach_timer(struct timer_list *timer,
+					int clear_pending)
+{
+	struct list_head *entry = &timer->entry;
+
+	__list_del(entry->prev, entry->next);
+	if (clear_pending)
+		entry->next = NULL;
+	entry->prev = LIST_POISON2;
+}
+
+/*
+ * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on ->tvX lists.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static timer_base_t *lock_timer_base(struct timer_list *timer,
+					unsigned long *flags)
+{
+	timer_base_t *base;
+
+	for (;;) {
+		base = timer->base;
+		if (likely(base != NULL)) {
+			spin_lock_irqsave(&base->lock, *flags);
+			if (likely(base == timer->base))
+				return base;
+			/* The timer has migrated to another CPU */
+			spin_unlock_irqrestore(&base->lock, *flags);
+		}
+		cpu_relax();
+	}
+}
+
 int __mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	tvec_base_t *old_base, *new_base;
+	timer_base_t *base;
+	tvec_base_t *new_base;
 	unsigned long flags;
 	int ret = 0;
 
 	BUG_ON(!timer->function);
-
 	check_timer(timer);
 
-	spin_lock_irqsave(&timer->lock, flags);
+	base = lock_timer_base(timer, &flags);
+
+	if (timer_pending(timer)) {
+		detach_timer(timer, 0);
+		ret = 1;
+	}
+
 	new_base = &__get_cpu_var(tvec_bases);
-repeat:
-	old_base = timer->base;
 
-	/*
-	 * Prevent deadlocks via ordering by old_base < new_base.
-	 */
-	if (old_base && (new_base != old_base)) {
-		if (old_base < new_base) {
-			spin_lock(&new_base->lock);
-			spin_lock(&old_base->lock);
-		} else {
-			spin_lock(&old_base->lock);
-			spin_lock(&new_base->lock);
-		}
+	if (base != &new_base->t_base) {
 		/*
-		 * The timer base might have been cancelled while we were
-		 * trying to take the lock(s):
+		 * We are trying to schedule the timer on the local CPU.
+		 * However we can't change timer's base while it is running,
+		 * otherwise del_timer_sync() can't detect that the timer's
+		 * handler yet has not finished. This also guarantees that
+		 * the timer is serialized wrt itself.
 		 */
-		if (timer->base != old_base) {
-			spin_unlock(&new_base->lock);
-			spin_unlock(&old_base->lock);
-			goto repeat;
-		}
-	} else {
-		spin_lock(&new_base->lock);
-		if (timer->base != old_base) {
-			spin_unlock(&new_base->lock);
-			goto repeat;
+		if (unlikely(base->running_timer == timer)) {
+			/* The timer remains on a former base */
+			new_base = container_of(base, tvec_base_t, t_base);
+		} else {
+			/* See the comment in lock_timer_base() */
+			timer->base = NULL;
+			spin_unlock(&base->lock);
+			spin_lock(&new_base->t_base.lock);
+			timer->base = &new_base->t_base;
 		}
 	}
 
-	/*
-	 * Delete the previous timeout (if there was any), and install
-	 * the new one:
-	 */
-	if (old_base) {
-		list_del(&timer->entry);
-		ret = 1;
-	}
 	timer->expires = expires;
 	internal_add_timer(new_base, timer);
-	timer->base = new_base;
-
-	if (old_base && (new_base != old_base))
-		spin_unlock(&old_base->lock);
-	spin_unlock(&new_base->lock);
-	spin_unlock_irqrestore(&timer->lock, flags);
+	spin_unlock_irqrestore(&new_base->t_base.lock, flags);
 
 	return ret;
 }
@@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
 {
 	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
 	unsigned long flags;
 
 	BUG_ON(timer_pending(timer) || !timer->function);
 
 	check_timer(timer);
 
-	spin_lock_irqsave(&base->lock, flags);
+	spin_lock_irqsave(&base->t_base.lock, flags);
+	timer->base = &base->t_base;
 	internal_add_timer(base, timer);
-	timer->base = base;
-	spin_unlock_irqrestore(&base->lock, flags);
+	spin_unlock_irqrestore(&base->t_base.lock, flags);
 }
 
 
@@ -295,109 +344,84 @@ EXPORT_SYMBOL(mod_timer);
295 */ 344 */
296int del_timer(struct timer_list *timer) 345int del_timer(struct timer_list *timer)
297{ 346{
347 timer_base_t *base;
298 unsigned long flags; 348 unsigned long flags;
299 tvec_base_t *base; 349 int ret = 0;
300 350
301 check_timer(timer); 351 check_timer(timer);
302 352
303repeat: 353 if (timer_pending(timer)) {
304 base = timer->base; 354 base = lock_timer_base(timer, &flags);
305 if (!base) 355 if (timer_pending(timer)) {
306 return 0; 356 detach_timer(timer, 1);
307 spin_lock_irqsave(&base->lock, flags); 357 ret = 1;
308 if (base != timer->base) { 358 }
309 spin_unlock_irqrestore(&base->lock, flags); 359 spin_unlock_irqrestore(&base->lock, flags);
310 goto repeat;
311 } 360 }
312 list_del(&timer->entry);
313 /* Need to make sure that anybody who sees a NULL base also sees the list ops */
314 smp_wmb();
315 timer->base = NULL;
316 spin_unlock_irqrestore(&base->lock, flags);
317 361
318 return 1; 362 return ret;
319} 363}
320 364
321EXPORT_SYMBOL(del_timer); 365EXPORT_SYMBOL(del_timer);
322 366
323#ifdef CONFIG_SMP 367#ifdef CONFIG_SMP
324/*** 368/*
325 * del_timer_sync - deactivate a timer and wait for the handler to finish. 369 * This function tries to deactivate a timer. Upon successful (ret >= 0)
326 * @timer: the timer to be deactivated 370 * exit the timer is not queued and the handler is not running on any CPU.
327 *
328 * This function only differs from del_timer() on SMP: besides deactivating
329 * the timer it also makes sure the handler has finished executing on other
330 * CPUs.
331 *
332 * Synchronization rules: callers must prevent restarting of the timer,
333 * otherwise this function is meaningless. It must not be called from
334 * interrupt contexts. The caller must not hold locks which would prevent
335 * completion of the timer's handler. Upon exit the timer is not queued and
336 * the handler is not running on any CPU.
337 *
338 * The function returns whether it has deactivated a pending timer or not.
339 * 371 *
340 * del_timer_sync() is slow and complicated because it copes with timer 372 * It must not be called from interrupt contexts.
341 * handlers which re-arm the timer (periodic timers). If the timer handler
342 * is known to not do this (a single shot timer) then use
343 * del_singleshot_timer_sync() instead.
344 */ 373 */
345int del_timer_sync(struct timer_list *timer) 374int try_to_del_timer_sync(struct timer_list *timer)
346{ 375{
347 tvec_base_t *base; 376 timer_base_t *base;
348 int i, ret = 0; 377 unsigned long flags;
378 int ret = -1;
349 379
350 check_timer(timer); 380 base = lock_timer_base(timer, &flags);
351 381
352del_again: 382 if (base->running_timer == timer)
353 ret += del_timer(timer); 383 goto out;
354 384
355 for_each_online_cpu(i) { 385 ret = 0;
356 base = &per_cpu(tvec_bases, i); 386 if (timer_pending(timer)) {
357 if (base->running_timer == timer) { 387 detach_timer(timer, 1);
358 while (base->running_timer == timer) { 388 ret = 1;
359 cpu_relax();
360 preempt_check_resched();
361 }
362 break;
363 }
364 } 389 }
365 smp_rmb(); 390out:
366 if (timer_pending(timer)) 391 spin_unlock_irqrestore(&base->lock, flags);
367 goto del_again;
368 392
369 return ret; 393 return ret;
370} 394}
371EXPORT_SYMBOL(del_timer_sync);
372 395
373/*** 396/***
374 * del_singleshot_timer_sync - deactivate a non-recursive timer 397 * del_timer_sync - deactivate a timer and wait for the handler to finish.
375 * @timer: the timer to be deactivated 398 * @timer: the timer to be deactivated
376 * 399 *
377 * This function is an optimization of del_timer_sync for the case where the 400 * This function only differs from del_timer() on SMP: besides deactivating
378 * caller can guarantee the timer does not reschedule itself in its timer 401 * the timer it also makes sure the handler has finished executing on other
379 * function. 402 * CPUs.
380 * 403 *
381 * Synchronization rules: callers must prevent restarting of the timer, 404 * Synchronization rules: callers must prevent restarting of the timer,
382 * otherwise this function is meaningless. It must not be called from 405 * otherwise this function is meaningless. It must not be called from
383 * interrupt contexts. The caller must not hold locks which wold prevent 406 * interrupt contexts. The caller must not hold locks which would prevent
384 * completion of the timer's handler. Upon exit the timer is not queued and 407 * completion of the timer's handler. The timer's handler must not call
385 * the handler is not running on any CPU. 408 * add_timer_on(). Upon exit the timer is not queued and the handler is
409 * not running on any CPU.
386 * 410 *
387 * The function returns whether it has deactivated a pending timer or not. 411 * The function returns whether it has deactivated a pending timer or not.
388 */ 412 */
389int del_singleshot_timer_sync(struct timer_list *timer) 413int del_timer_sync(struct timer_list *timer)
390{ 414{
391 int ret = del_timer(timer); 415 check_timer(timer);
392 416
393 if (!ret) { 417 for (;;) {
394 ret = del_timer_sync(timer); 418 int ret = try_to_del_timer_sync(timer);
395 BUG_ON(ret); 419 if (ret >= 0)
420 return ret;
396 } 421 }
397
398 return ret;
399} 422}
400EXPORT_SYMBOL(del_singleshot_timer_sync); 423
424EXPORT_SYMBOL(del_timer_sync);
401#endif 425#endif
402 426
403static int cascade(tvec_base_t *base, tvec_t *tv, int index) 427static int cascade(tvec_base_t *base, tvec_t *tv, int index)
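
This hunk is the heart of the patch: the old del_timer_sync(), which looped over every online CPU's base and restarted from scratch whenever the timer re-armed, is replaced by try_to_del_timer_sync() plus a retry loop. The tri-state return is what makes the loop work: -1 means the handler is running right now, 0 means the timer was already idle, 1 means a pending timer was dequeued. A compact user-space model of the same shape follows; all names are hypothetical, and sched_yield() stands in for the kernel's preemption point.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct base {
	pthread_mutex_t lock;
	struct timer *running;	/* handler currently executing, if any */
};

struct timer {
	struct base *base;
	int pending;
};

/* -1: handler running, 0: was idle, 1: dequeued a pending timer. */
static int try_del_model(struct timer *t)
{
	struct base *b = t->base;
	int ret = -1;

	pthread_mutex_lock(&b->lock);
	if (b->running != t) {
		ret = t->pending;
		t->pending = 0;		/* detach */
	}
	pthread_mutex_unlock(&b->lock);
	return ret;
}

static int del_sync_model(struct timer *t)
{
	for (;;) {
		int ret = try_del_model(t);

		if (ret >= 0)
			return ret;	/* not queued, handler not running */
		sched_yield();		/* running elsewhere: wait it out */
	}
}

int main(void)
{
	struct base b = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct timer t = { &b, 1 };

	printf("del_sync_model() -> %d\n", del_sync_model(&t));
	return 0;
}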
@@ -415,7 +439,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
415 struct timer_list *tmp; 439 struct timer_list *tmp;
416 440
417 tmp = list_entry(curr, struct timer_list, entry); 441 tmp = list_entry(curr, struct timer_list, entry);
418 BUG_ON(tmp->base != base); 442 BUG_ON(tmp->base != &base->t_base);
419 curr = curr->next; 443 curr = curr->next;
420 internal_add_timer(base, tmp); 444 internal_add_timer(base, tmp);
421 } 445 }
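
cascade() itself is unchanged apart from the t_base indirection: when the lowest wheel level wraps, it re-files each timer from a tv2..tv5 bucket back through internal_add_timer(). Which bucket is due is pure bit arithmetic on timer_jiffies. A worked example follows; the wheel geometry (TVR_BITS=8, TVN_BITS=6, the classic 2.6 values) is restated here as an assumption, not read from this diff.

#include <stdio.h>

#define TVR_BITS 8
#define TVN_BITS 6
#define TVR_MASK ((1UL << TVR_BITS) - 1)
#define TVN_MASK ((1UL << TVN_BITS) - 1)

/* Model of INDEX(N): which tv(N+2) bucket is due at timer_jiffies. */
static unsigned long idx(unsigned long timer_jiffies, int n)
{
	return (timer_jiffies >> (TVR_BITS + n * TVN_BITS)) & TVN_MASK;
}

int main(void)
{
	unsigned long j = 0x12345678UL;
	int n;

	printf("tv1 slot %lu\n", j & TVR_MASK);
	for (n = 0; n < 4; n++)
		printf("tv%d slot %lu\n", n + 2, idx(j, n));
	return 0;
}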
@@ -437,7 +461,7 @@ static inline void __run_timers(tvec_base_t *base)
437{ 461{
438 struct timer_list *timer; 462 struct timer_list *timer;
439 463
440 spin_lock_irq(&base->lock); 464 spin_lock_irq(&base->t_base.lock);
441 while (time_after_eq(jiffies, base->timer_jiffies)) { 465 while (time_after_eq(jiffies, base->timer_jiffies)) {
442 struct list_head work_list = LIST_HEAD_INIT(work_list); 466 struct list_head work_list = LIST_HEAD_INIT(work_list);
443 struct list_head *head = &work_list; 467 struct list_head *head = &work_list;
@@ -453,8 +477,7 @@ static inline void __run_timers(tvec_base_t *base)
453 cascade(base, &base->tv5, INDEX(3)); 477 cascade(base, &base->tv5, INDEX(3));
454 ++base->timer_jiffies; 478 ++base->timer_jiffies;
455 list_splice_init(base->tv1.vec + index, &work_list); 479 list_splice_init(base->tv1.vec + index, &work_list);
456repeat: 480 while (!list_empty(head)) {
457 if (!list_empty(head)) {
458 void (*fn)(unsigned long); 481 void (*fn)(unsigned long);
459 unsigned long data; 482 unsigned long data;
460 483
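
The `goto repeat` drain of work_list becomes a plain while loop here, and (as the next hunk shows) the running timer is detached via detach_timer() under the lock before the lock is dropped for the callback. The reusable pattern: detach one entry while locked, drop the lock around the call so handlers may take other locks or re-add timers, retake it to pick the next entry. A user-space sketch of exactly that loop, with hypothetical work-queue names:

#include <pthread.h>
#include <stdio.h>

struct work {
	struct work *next;
	void (*fn)(unsigned long);
	unsigned long data;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct work *due;	/* expired entries, protected by lock */

static void drain(void)
{
	pthread_mutex_lock(&lock);
	while (due) {
		struct work *w = due;

		due = w->next;			/* detach under the lock */
		pthread_mutex_unlock(&lock);	/* never call out locked */
		w->fn(w->data);
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
}

static void say(unsigned long data)
{
	printf("ran %lu\n", data);
}

int main(void)
{
	struct work w = { NULL, say, 42 };

	due = &w;
	drain();
	return 0;
}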
@@ -462,25 +485,26 @@ repeat:
462 fn = timer->function; 485 fn = timer->function;
463 data = timer->data; 486 data = timer->data;
464 487
465 list_del(&timer->entry);
466 set_running_timer(base, timer); 488 set_running_timer(base, timer);
467 smp_wmb(); 489 detach_timer(timer, 1);
468 timer->base = NULL; 490 spin_unlock_irq(&base->t_base.lock);
469 spin_unlock_irq(&base->lock);
470 { 491 {
471 u32 preempt_count = preempt_count(); 492 int preempt_count = preempt_count();
472 fn(data); 493 fn(data);
473 if (preempt_count != preempt_count()) { 494 if (preempt_count != preempt_count()) {
474 printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); 495 printk(KERN_WARNING "huh, entered %p "
496 "with preempt_count %08x, exited"
497 " with %08x?\n",
498 fn, preempt_count,
499 preempt_count());
475 BUG(); 500 BUG();
476 } 501 }
477 } 502 }
478 spin_lock_irq(&base->lock); 503 spin_lock_irq(&base->t_base.lock);
479 goto repeat;
480 } 504 }
481 } 505 }
482 set_running_timer(base, NULL); 506 set_running_timer(base, NULL);
483 spin_unlock_irq(&base->lock); 507 spin_unlock_irq(&base->t_base.lock);
484} 508}
485 509
486#ifdef CONFIG_NO_IDLE_HZ 510#ifdef CONFIG_NO_IDLE_HZ
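
The preempt_count sanity check survives the rewrite, now as a properly wrapped KERN_WARNING printk followed by BUG(), with the count held in a plain int rather than u32. The idea generalizes to any callback dispatcher: snapshot a balance counter, run the callback, and treat an imbalance as a bug in the callback rather than the dispatcher. A user-space analogue, with abort() standing in for BUG() and a deliberately buggy handler:

#include <stdio.h>
#include <stdlib.h>

static int fake_preempt_count;	/* stand-in for the kernel's counter */

static void buggy_handler(unsigned long data)
{
	(void)data;
	fake_preempt_count++;	/* "forgot" to drop what it took */
}

static void run_one(void (*fn)(unsigned long), unsigned long data)
{
	int count = fake_preempt_count;

	fn(data);
	if (count != fake_preempt_count) {
		fprintf(stderr, "huh, entered %p with %08x, exited with %08x?\n",
			(void *)fn, count, fake_preempt_count);
		abort();	/* the kernel BUG()s here */
	}
}

int main(void)
{
	run_one(buggy_handler, 0);
	return 0;
}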
@@ -499,7 +523,7 @@ unsigned long next_timer_interrupt(void)
499 int i, j; 523 int i, j;
500 524
501 base = &__get_cpu_var(tvec_bases); 525 base = &__get_cpu_var(tvec_bases);
502 spin_lock(&base->lock); 526 spin_lock(&base->t_base.lock);
503 expires = base->timer_jiffies + (LONG_MAX >> 1); 527 expires = base->timer_jiffies + (LONG_MAX >> 1);
504 list = 0; 528 list = 0;
505 529
@@ -547,7 +571,7 @@ found:
547 expires = nte->expires; 571 expires = nte->expires;
548 } 572 }
549 } 573 }
550 spin_unlock(&base->lock); 574 spin_unlock(&base->t_base.lock);
551 return expires; 575 return expires;
552} 576}
553#endif 577#endif
@@ -1286,9 +1310,9 @@ static void __devinit init_timers_cpu(int cpu)
1286{ 1310{
1287 int j; 1311 int j;
1288 tvec_base_t *base; 1312 tvec_base_t *base;
1289 1313
1290 base = &per_cpu(tvec_bases, cpu); 1314 base = &per_cpu(tvec_bases, cpu);
1291 spin_lock_init(&base->lock); 1315 spin_lock_init(&base->t_base.lock);
1292 for (j = 0; j < TVN_SIZE; j++) { 1316 for (j = 0; j < TVN_SIZE; j++) {
1293 INIT_LIST_HEAD(base->tv5.vec + j); 1317 INIT_LIST_HEAD(base->tv5.vec + j);
1294 INIT_LIST_HEAD(base->tv4.vec + j); 1318 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1302,22 +1326,16 @@ static void __devinit init_timers_cpu(int cpu)
1302} 1326}
1303 1327
1304#ifdef CONFIG_HOTPLUG_CPU 1328#ifdef CONFIG_HOTPLUG_CPU
1305static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1329static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1306{ 1330{
1307 struct timer_list *timer; 1331 struct timer_list *timer;
1308 1332
1309 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1310 timer = list_entry(head->next, struct timer_list, entry); 1334 timer = list_entry(head->next, struct timer_list, entry);
1311 /* We're locking backwards from __mod_timer order here, 1335 detach_timer(timer, 0);
1312 beware deadlock. */ 1336 timer->base = &new_base->t_base;
1313 if (!spin_trylock(&timer->lock))
1314 return 0;
1315 list_del(&timer->entry);
1316 internal_add_timer(new_base, timer); 1337 internal_add_timer(new_base, timer);
1317 timer->base = new_base;
1318 spin_unlock(&timer->lock);
1319 } 1338 }
1320 return 1;
1321} 1339}
1322 1340
1323static void __devinit migrate_timers(int cpu) 1341static void __devinit migrate_timers(int cpu)
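
migrate_timer_list() loses its trylock, its failure path and its return value: since the caller now holds both base locks for the whole migration, each timer can simply be detached, re-pointed at the new base, and requeued. A sketch of that unconditional re-homing follows, using toy singly linked lists where the real code walks struct list_head buckets; all names are hypothetical.

#include <pthread.h>
#include <stdio.h>

struct base {
	pthread_mutex_t lock;
	struct timer *q;	/* the base's queue */
};

struct timer {
	struct timer *next;
	struct base *base;
};

/* Caller holds both bases' locks, so no trylock/back-off is needed. */
static void migrate_list(struct base *new_base, struct timer **head)
{
	while (*head) {
		struct timer *t = *head;

		*head = t->next;	/* detach from the dead CPU's bucket */
		t->base = new_base;	/* re-home ... */
		t->next = new_base->q;	/* ... and requeue on the new base */
		new_base->q = t;
	}
}

int main(void)
{
	struct base nb = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct timer t = { NULL, NULL };
	struct timer *old_q = &t;

	migrate_list(&nb, &old_q);
	printf("migrated, new queue head %p\n", (void *)nb.q);
	return 0;
}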
@@ -1331,39 +1349,24 @@ static void __devinit migrate_timers(int cpu)
1331 new_base = &get_cpu_var(tvec_bases); 1349 new_base = &get_cpu_var(tvec_bases);
1332 1350
1333 local_irq_disable(); 1351 local_irq_disable();
1334again: 1352 spin_lock(&new_base->t_base.lock);
1335 /* Prevent deadlocks via ordering by old_base < new_base. */ 1353 spin_lock(&old_base->t_base.lock);
1336 if (old_base < new_base) {
1337 spin_lock(&new_base->lock);
1338 spin_lock(&old_base->lock);
1339 } else {
1340 spin_lock(&old_base->lock);
1341 spin_lock(&new_base->lock);
1342 }
1343 1354
1344 if (old_base->running_timer) 1355 if (old_base->t_base.running_timer)
1345 BUG(); 1356 BUG();
1346 for (i = 0; i < TVR_SIZE; i++) 1357 for (i = 0; i < TVR_SIZE; i++)
1347 if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) 1358 migrate_timer_list(new_base, old_base->tv1.vec + i);
1348 goto unlock_again; 1359 for (i = 0; i < TVN_SIZE; i++) {
1349 for (i = 0; i < TVN_SIZE; i++) 1360 migrate_timer_list(new_base, old_base->tv2.vec + i);
1350 if (!migrate_timer_list(new_base, old_base->tv2.vec + i) 1361 migrate_timer_list(new_base, old_base->tv3.vec + i);
1351 || !migrate_timer_list(new_base, old_base->tv3.vec + i) 1362 migrate_timer_list(new_base, old_base->tv4.vec + i);
1352 || !migrate_timer_list(new_base, old_base->tv4.vec + i) 1363 migrate_timer_list(new_base, old_base->tv5.vec + i);
1353 || !migrate_timer_list(new_base, old_base->tv5.vec + i)) 1364 }
1354 goto unlock_again; 1365
1355 spin_unlock(&old_base->lock); 1366 spin_unlock(&old_base->t_base.lock);
1356 spin_unlock(&new_base->lock); 1367 spin_unlock(&new_base->t_base.lock);
1357 local_irq_enable(); 1368 local_irq_enable();
1358 put_cpu_var(tvec_bases); 1369 put_cpu_var(tvec_bases);
1359 return;
1360
1361unlock_again:
1362 /* Avoid deadlock with __mod_timer, by backing off. */
1363 spin_unlock(&old_base->lock);
1364 spin_unlock(&new_base->lock);
1365 cpu_relax();
1366 goto again;
1367} 1370}
1368#endif /* CONFIG_HOTPLUG_CPU */ 1371#endif /* CONFIG_HOTPLUG_CPU */
1369 1372
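
Finally, migrate_timers() drops the address-ordered locking and the unlock_again back-off. As far as this patch goes, this path is now the only one that ever holds two base locks at once (__mod_timer() drops the old base's lock before taking the new one), and the source base belongs to an already-dead CPU, so a fixed new-then-old order cannot deadlock. The idiom the deleted code used is still the standard answer whenever two parties can each take both locks, so it is worth recording; shown here as a user-space helper with the hypothetical name lock_pair (callers must pass two distinct mutexes):

#include <pthread.h>
#include <stdint.h>

/* Take two mutexes in a globally consistent (address) order so that
 * concurrent callers can never deadlock ABBA-style. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m1, &m2);
	pthread_mutex_unlock(&m1);
	pthread_mutex_unlock(&m2);
	return 0;
}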