Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile             3
-rw-r--r--  kernel/acct.c              12
-rw-r--r--  kernel/audit.c            162
-rw-r--r--  kernel/audit.h             10
-rw-r--r--  kernel/auditfilter.c      289
-rw-r--r--  kernel/auditsc.c          269
-rw-r--r--  kernel/compat.c            23
-rw-r--r--  kernel/cpu.c               29
-rw-r--r--  kernel/cpuset.c            92
-rw-r--r--  kernel/exit.c             151
-rw-r--r--  kernel/extable.c            2
-rw-r--r--  kernel/fork.c             151
-rw-r--r--  kernel/futex.c            174
-rw-r--r--  kernel/futex_compat.c     144
-rw-r--r--  kernel/hrtimer.c           60
-rw-r--r--  kernel/irq/Makefile         3
-rw-r--r--  kernel/irq/manage.c         6
-rw-r--r--  kernel/irq/migration.c      5
-rw-r--r--  kernel/kmod.c               2
-rw-r--r--  kernel/kprobes.c            3
-rw-r--r--  kernel/module.c            35
-rw-r--r--  kernel/panic.c              5
-rw-r--r--  kernel/params.c             2
-rw-r--r--  kernel/pid.c              250
-rw-r--r--  kernel/power/Kconfig        2
-rw-r--r--  kernel/power/main.c         2
-rw-r--r--  kernel/power/pm.c          20
-rw-r--r--  kernel/power/process.c      3
-rw-r--r--  kernel/power/snapshot.c     9
-rw-r--r--  kernel/printk.c             6
-rw-r--r--  kernel/profile.c           55
-rw-r--r--  kernel/ptrace.c            75
-rw-r--r--  kernel/rcupdate.c          23
-rw-r--r--  kernel/rcutorture.c         4
-rw-r--r--  kernel/sched.c            240
-rw-r--r--  kernel/signal.c           356
-rw-r--r--  kernel/softirq.c            4
-rw-r--r--  kernel/softlockup.c         6
-rw-r--r--  kernel/sys.c              419
-rw-r--r--  kernel/sys_ni.c            16
-rw-r--r--  kernel/time.c               8
-rw-r--r--  kernel/timer.c            146
-rw-r--r--  kernel/uid16.c             59
-rw-r--r--  kernel/workqueue.c          2
44 files changed, 2121 insertions, 1216 deletions
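
Most of the audit.c and auditfilter.c hunks below repeat one pattern: when the netlink sender carries an SELinux SID, convert it to a context string with selinux_ctxid_to_string() and log it as subj=, falling back to the raw ssid= (or the plain auid=-only message) when no SID or no context string is available. The following is a minimal editorial sketch of that pattern only, assuming the 2.6.x interfaces used in these hunks (selinux_ctxid_to_string(), audit_log()); the helper name audit_log_config_change() is hypothetical, and the patch itself open-codes this logic at each call site rather than using such a helper.

	#include <linux/audit.h>
	#include <linux/selinux.h>
	#include <linux/slab.h>
	#include <linux/types.h>

	/* Hypothetical helper illustrating the logging pattern added below. */
	static void audit_log_config_change(const char *what, int val, int old,
					    uid_t loginuid, u32 sid)
	{
		if (sid) {
			char *ctx = NULL;
			u32 len;

			/* Translate the sender's SID into a printable context. */
			if (selinux_ctxid_to_string(sid, &ctx, &len))
				/* Conversion failed: fall back to the raw SID. */
				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
					  "%s=%d old=%d by auid=%u ssid=%u",
					  what, val, old, loginuid, sid);
			else
				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
					  "%s=%d old=%d by auid=%u subj=%s",
					  what, val, old, loginuid, ctx);
			kfree(ctx);
		} else
			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
				  "%s=%d old=%d by auid=%u",
				  what, val, old, loginuid);
	}
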
diff --git a/kernel/Makefile b/kernel/Makefile
index ff1c11dc12..58908f9d15 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
12 12
13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
14obj-$(CONFIG_FUTEX) += futex.o 14obj-$(CONFIG_FUTEX) += futex.o
15ifeq ($(CONFIG_COMPAT),y)
16obj-$(CONFIG_FUTEX) += futex_compat.o
17endif
15obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 18obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
16obj-$(CONFIG_SMP) += cpu.o spinlock.o 19obj-$(CONFIG_SMP) += cpu.o spinlock.o
17obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 20obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51..b327f4d201 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
449 /* calculate run_time in nsec*/ 449 /* calculate run_time in nsec*/
450 do_posix_clock_monotonic_gettime(&uptime); 450 do_posix_clock_monotonic_gettime(&uptime);
451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; 451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
452 run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 452 run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
453 + current->start_time.tv_nsec; 453 + current->group_leader->start_time.tv_nsec;
454 /* convert nsec -> AHZ */ 454 /* convert nsec -> AHZ */
455 elapsed = nsec_to_AHZ(run_time); 455 elapsed = nsec_to_AHZ(run_time);
456#if ACCT_VERSION==3 456#if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
469#endif 469#endif
470 do_div(elapsed, AHZ); 470 do_div(elapsed, AHZ);
471 ac.ac_btime = xtime.tv_sec - elapsed; 471 ac.ac_btime = xtime.tv_sec - elapsed;
472 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, 472 jiffies = cputime_to_jiffies(cputime_add(current->utime,
473 current->signal->utime)); 473 current->signal->utime));
474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); 474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
475 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, 475 jiffies = cputime_to_jiffies(cputime_add(current->stime,
476 current->signal->stime)); 476 current->signal->stime));
477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); 477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
478 /* we really need to bite the bullet and change layout */ 478 /* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
524 ac.ac_minflt = encode_comp_t(current->signal->min_flt + 524 ac.ac_minflt = encode_comp_t(current->signal->min_flt +
525 current->group_leader->min_flt); 525 current->min_flt);
526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt + 526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
527 current->group_leader->maj_flt); 527 current->maj_flt);
528 ac.ac_swaps = encode_comp_t(0); 528 ac.ac_swaps = encode_comp_t(0);
529 ac.ac_exitcode = exitcode; 529 ac.ac_exitcode = exitcode;
530 530
diff --git a/kernel/audit.c b/kernel/audit.c
index 04fe2e301b..df57b493e1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -55,6 +55,9 @@
55#include <net/netlink.h> 55#include <net/netlink.h>
56#include <linux/skbuff.h> 56#include <linux/skbuff.h>
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h>
59
60#include "audit.h"
58 61
59/* No auditing will take place until audit_initialized != 0. 62/* No auditing will take place until audit_initialized != 0.
60 * (Initialization happens after skb_init is called.) */ 63 * (Initialization happens after skb_init is called.) */
@@ -227,49 +230,103 @@ void audit_log_lost(const char *message)
227 } 230 }
228} 231}
229 232
230static int audit_set_rate_limit(int limit, uid_t loginuid) 233static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
231{ 234{
232 int old = audit_rate_limit; 235 int old = audit_rate_limit;
233 audit_rate_limit = limit; 236
234 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 237 if (sid) {
238 char *ctx = NULL;
239 u32 len;
240 int rc;
241 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
242 return rc;
243 else
244 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
245 "audit_rate_limit=%d old=%d by auid=%u subj=%s",
246 limit, old, loginuid, ctx);
247 kfree(ctx);
248 } else
249 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
235 "audit_rate_limit=%d old=%d by auid=%u", 250 "audit_rate_limit=%d old=%d by auid=%u",
236 audit_rate_limit, old, loginuid); 251 limit, old, loginuid);
252 audit_rate_limit = limit;
237 return old; 253 return old;
238} 254}
239 255
240static int audit_set_backlog_limit(int limit, uid_t loginuid) 256static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
241{ 257{
242 int old = audit_backlog_limit; 258 int old = audit_backlog_limit;
243 audit_backlog_limit = limit; 259
244 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 260 if (sid) {
261 char *ctx = NULL;
262 u32 len;
263 int rc;
264 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
265 return rc;
266 else
267 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
268 "audit_backlog_limit=%d old=%d by auid=%u subj=%s",
269 limit, old, loginuid, ctx);
270 kfree(ctx);
271 } else
272 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
245 "audit_backlog_limit=%d old=%d by auid=%u", 273 "audit_backlog_limit=%d old=%d by auid=%u",
246 audit_backlog_limit, old, loginuid); 274 limit, old, loginuid);
275 audit_backlog_limit = limit;
247 return old; 276 return old;
248} 277}
249 278
250static int audit_set_enabled(int state, uid_t loginuid) 279static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
251{ 280{
252 int old = audit_enabled; 281 int old = audit_enabled;
282
253 if (state != 0 && state != 1) 283 if (state != 0 && state != 1)
254 return -EINVAL; 284 return -EINVAL;
255 audit_enabled = state; 285
256 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 286 if (sid) {
287 char *ctx = NULL;
288 u32 len;
289 int rc;
290 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
291 return rc;
292 else
293 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
294 "audit_enabled=%d old=%d by auid=%u subj=%s",
295 state, old, loginuid, ctx);
296 kfree(ctx);
297 } else
298 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
257 "audit_enabled=%d old=%d by auid=%u", 299 "audit_enabled=%d old=%d by auid=%u",
258 audit_enabled, old, loginuid); 300 state, old, loginuid);
301 audit_enabled = state;
259 return old; 302 return old;
260} 303}
261 304
262static int audit_set_failure(int state, uid_t loginuid) 305static int audit_set_failure(int state, uid_t loginuid, u32 sid)
263{ 306{
264 int old = audit_failure; 307 int old = audit_failure;
308
265 if (state != AUDIT_FAIL_SILENT 309 if (state != AUDIT_FAIL_SILENT
266 && state != AUDIT_FAIL_PRINTK 310 && state != AUDIT_FAIL_PRINTK
267 && state != AUDIT_FAIL_PANIC) 311 && state != AUDIT_FAIL_PANIC)
268 return -EINVAL; 312 return -EINVAL;
269 audit_failure = state; 313
270 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 314 if (sid) {
315 char *ctx = NULL;
316 u32 len;
317 int rc;
318 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
319 return rc;
320 else
321 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
322 "audit_failure=%d old=%d by auid=%u subj=%s",
323 state, old, loginuid, ctx);
324 kfree(ctx);
325 } else
326 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
271 "audit_failure=%d old=%d by auid=%u", 327 "audit_failure=%d old=%d by auid=%u",
272 audit_failure, old, loginuid); 328 state, old, loginuid);
329 audit_failure = state;
273 return old; 330 return old;
274} 331}
275 332
@@ -387,7 +444,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
387 444
388static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 445static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
389{ 446{
390 u32 uid, pid, seq; 447 u32 uid, pid, seq, sid;
391 void *data; 448 void *data;
392 struct audit_status *status_get, status_set; 449 struct audit_status *status_get, status_set;
393 int err; 450 int err;
@@ -413,6 +470,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
413 pid = NETLINK_CREDS(skb)->pid; 470 pid = NETLINK_CREDS(skb)->pid;
414 uid = NETLINK_CREDS(skb)->uid; 471 uid = NETLINK_CREDS(skb)->uid;
415 loginuid = NETLINK_CB(skb).loginuid; 472 loginuid = NETLINK_CB(skb).loginuid;
473 sid = NETLINK_CB(skb).sid;
416 seq = nlh->nlmsg_seq; 474 seq = nlh->nlmsg_seq;
417 data = NLMSG_DATA(nlh); 475 data = NLMSG_DATA(nlh);
418 476
@@ -433,25 +491,43 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
433 return -EINVAL; 491 return -EINVAL;
434 status_get = (struct audit_status *)data; 492 status_get = (struct audit_status *)data;
435 if (status_get->mask & AUDIT_STATUS_ENABLED) { 493 if (status_get->mask & AUDIT_STATUS_ENABLED) {
436 err = audit_set_enabled(status_get->enabled, loginuid); 494 err = audit_set_enabled(status_get->enabled,
495 loginuid, sid);
437 if (err < 0) return err; 496 if (err < 0) return err;
438 } 497 }
439 if (status_get->mask & AUDIT_STATUS_FAILURE) { 498 if (status_get->mask & AUDIT_STATUS_FAILURE) {
440 err = audit_set_failure(status_get->failure, loginuid); 499 err = audit_set_failure(status_get->failure,
500 loginuid, sid);
441 if (err < 0) return err; 501 if (err < 0) return err;
442 } 502 }
443 if (status_get->mask & AUDIT_STATUS_PID) { 503 if (status_get->mask & AUDIT_STATUS_PID) {
444 int old = audit_pid; 504 int old = audit_pid;
505 if (sid) {
506 char *ctx = NULL;
507 u32 len;
508 int rc;
509 if ((rc = selinux_ctxid_to_string(
510 sid, &ctx, &len)))
511 return rc;
512 else
513 audit_log(NULL, GFP_KERNEL,
514 AUDIT_CONFIG_CHANGE,
515 "audit_pid=%d old=%d by auid=%u subj=%s",
516 status_get->pid, old,
517 loginuid, ctx);
518 kfree(ctx);
519 } else
520 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
521 "audit_pid=%d old=%d by auid=%u",
522 status_get->pid, old, loginuid);
445 audit_pid = status_get->pid; 523 audit_pid = status_get->pid;
446 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
447 "audit_pid=%d old=%d by auid=%u",
448 audit_pid, old, loginuid);
449 } 524 }
450 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 525 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
451 audit_set_rate_limit(status_get->rate_limit, loginuid); 526 audit_set_rate_limit(status_get->rate_limit,
527 loginuid, sid);
452 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 528 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
453 audit_set_backlog_limit(status_get->backlog_limit, 529 audit_set_backlog_limit(status_get->backlog_limit,
454 loginuid); 530 loginuid, sid);
455 break; 531 break;
456 case AUDIT_USER: 532 case AUDIT_USER:
457 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 533 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
@@ -465,8 +541,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
465 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 541 ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
466 if (ab) { 542 if (ab) {
467 audit_log_format(ab, 543 audit_log_format(ab,
468 "user pid=%d uid=%u auid=%u msg='%.1024s'", 544 "user pid=%d uid=%u auid=%u",
469 pid, uid, loginuid, (char *)data); 545 pid, uid, loginuid);
546 if (sid) {
547 char *ctx = NULL;
548 u32 len;
549 if (selinux_ctxid_to_string(
550 sid, &ctx, &len)) {
551 audit_log_format(ab,
552 " ssid=%u", sid);
553 /* Maybe call audit_panic? */
554 } else
555 audit_log_format(ab,
556 " subj=%s", ctx);
557 kfree(ctx);
558 }
559 audit_log_format(ab, " msg='%.1024s'",
560 (char *)data);
470 audit_set_pid(ab, pid); 561 audit_set_pid(ab, pid);
471 audit_log_end(ab); 562 audit_log_end(ab);
472 } 563 }
@@ -480,7 +571,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
480 case AUDIT_LIST: 571 case AUDIT_LIST:
481 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 572 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
482 uid, seq, data, nlmsg_len(nlh), 573 uid, seq, data, nlmsg_len(nlh),
483 loginuid); 574 loginuid, sid);
484 break; 575 break;
485 case AUDIT_ADD_RULE: 576 case AUDIT_ADD_RULE:
486 case AUDIT_DEL_RULE: 577 case AUDIT_DEL_RULE:
@@ -490,7 +581,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
490 case AUDIT_LIST_RULES: 581 case AUDIT_LIST_RULES:
491 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 582 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
492 uid, seq, data, nlmsg_len(nlh), 583 uid, seq, data, nlmsg_len(nlh),
493 loginuid); 584 loginuid, sid);
494 break; 585 break;
495 case AUDIT_SIGNAL_INFO: 586 case AUDIT_SIGNAL_INFO:
496 sig_data.uid = audit_sig_uid; 587 sig_data.uid = audit_sig_uid;
@@ -564,6 +655,11 @@ static int __init audit_init(void)
564 skb_queue_head_init(&audit_skb_queue); 655 skb_queue_head_init(&audit_skb_queue);
565 audit_initialized = 1; 656 audit_initialized = 1;
566 audit_enabled = audit_default; 657 audit_enabled = audit_default;
658
659 /* Register the callback with selinux. This callback will be invoked
660 * when a new policy is loaded. */
661 selinux_audit_set_callback(&selinux_audit_rule_update);
662
567 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 663 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
568 return 0; 664 return 0;
569} 665}
@@ -578,7 +674,7 @@ static int __init audit_enable(char *str)
578 audit_initialized ? "" : " (after initialization)"); 674 audit_initialized ? "" : " (after initialization)");
579 if (audit_initialized) 675 if (audit_initialized)
580 audit_enabled = audit_default; 676 audit_enabled = audit_default;
581 return 0; 677 return 1;
582} 678}
583 679
584__setup("audit=", audit_enable); 680__setup("audit=", audit_enable);
diff --git a/kernel/audit.h b/kernel/audit.h
index bc5392076e..6f733920fd 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -54,9 +54,11 @@ enum audit_state {
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_field { 56struct audit_field {
57 u32 type; 57 u32 type;
58 u32 val; 58 u32 val;
59 u32 op; 59 u32 op;
60 char *se_str;
61 struct selinux_audit_rule *se_rule;
60}; 62};
61 63
62struct audit_krule { 64struct audit_krule {
@@ -86,3 +88,5 @@ extern void audit_send_reply(int pid, int seq, int type,
86extern void audit_log_lost(const char *message); 88extern void audit_log_lost(const char *message);
87extern void audit_panic(const char *message); 89extern void audit_panic(const char *message);
88extern struct mutex audit_netlink_mutex; 90extern struct mutex audit_netlink_mutex;
91
92extern int selinux_audit_rule_update(void);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d3a8539f3a..7c134906d6 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -23,6 +23,7 @@
23#include <linux/audit.h> 23#include <linux/audit.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/netlink.h> 25#include <linux/netlink.h>
26#include <linux/selinux.h>
26#include "audit.h" 27#include "audit.h"
27 28
28/* There are three lists of rules -- one to search at task creation 29/* There are three lists of rules -- one to search at task creation
@@ -42,6 +43,13 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
42 43
43static inline void audit_free_rule(struct audit_entry *e) 44static inline void audit_free_rule(struct audit_entry *e)
44{ 45{
46 int i;
47 if (e->rule.fields)
48 for (i = 0; i < e->rule.field_count; i++) {
49 struct audit_field *f = &e->rule.fields[i];
50 kfree(f->se_str);
51 selinux_audit_rule_free(f->se_rule);
52 }
45 kfree(e->rule.fields); 53 kfree(e->rule.fields);
46 kfree(e); 54 kfree(e);
47} 55}
@@ -52,9 +60,29 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
52 audit_free_rule(e); 60 audit_free_rule(e);
53} 61}
54 62
63/* Initialize an audit filterlist entry. */
64static inline struct audit_entry *audit_init_entry(u32 field_count)
65{
66 struct audit_entry *entry;
67 struct audit_field *fields;
68
69 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
70 if (unlikely(!entry))
71 return NULL;
72
73 fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
74 if (unlikely(!fields)) {
75 kfree(entry);
76 return NULL;
77 }
78 entry->rule.fields = fields;
79
80 return entry;
81}
82
55/* Unpack a filter field's string representation from user-space 83/* Unpack a filter field's string representation from user-space
56 * buffer. */ 84 * buffer. */
57static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len) 85static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
58{ 86{
59 char *str; 87 char *str;
60 88
@@ -84,7 +112,6 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
84{ 112{
85 unsigned listnr; 113 unsigned listnr;
86 struct audit_entry *entry; 114 struct audit_entry *entry;
87 struct audit_field *fields;
88 int i, err; 115 int i, err;
89 116
90 err = -EINVAL; 117 err = -EINVAL;
@@ -108,23 +135,14 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
108 goto exit_err; 135 goto exit_err;
109 136
110 err = -ENOMEM; 137 err = -ENOMEM;
111 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 138 entry = audit_init_entry(rule->field_count);
112 if (unlikely(!entry)) 139 if (!entry)
113 goto exit_err;
114 fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL);
115 if (unlikely(!fields)) {
116 kfree(entry);
117 goto exit_err; 140 goto exit_err;
118 }
119
120 memset(&entry->rule, 0, sizeof(struct audit_krule));
121 memset(fields, 0, sizeof(struct audit_field));
122 141
123 entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; 142 entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
124 entry->rule.listnr = listnr; 143 entry->rule.listnr = listnr;
125 entry->rule.action = rule->action; 144 entry->rule.action = rule->action;
126 entry->rule.field_count = rule->field_count; 145 entry->rule.field_count = rule->field_count;
127 entry->rule.fields = fields;
128 146
129 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 147 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
130 entry->rule.mask[i] = rule->mask[i]; 148 entry->rule.mask[i] = rule->mask[i];
@@ -150,15 +168,20 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
150 for (i = 0; i < rule->field_count; i++) { 168 for (i = 0; i < rule->field_count; i++) {
151 struct audit_field *f = &entry->rule.fields[i]; 169 struct audit_field *f = &entry->rule.fields[i];
152 170
153 if (rule->fields[i] & AUDIT_UNUSED_BITS) {
154 err = -EINVAL;
155 goto exit_free;
156 }
157
158 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); 171 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
159 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
160 f->val = rule->values[i]; 173 f->val = rule->values[i];
161 174
175 if (f->type & AUDIT_UNUSED_BITS ||
176 f->type == AUDIT_SE_USER ||
177 f->type == AUDIT_SE_ROLE ||
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free;
183 }
184
162 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; 185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
163 186
164 /* Support for legacy operators where 187 /* Support for legacy operators where
@@ -188,8 +211,9 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
188 int err = 0; 211 int err = 0;
189 struct audit_entry *entry; 212 struct audit_entry *entry;
190 void *bufp; 213 void *bufp;
191 /* size_t remain = datasz - sizeof(struct audit_rule_data); */ 214 size_t remain = datasz - sizeof(struct audit_rule_data);
192 int i; 215 int i;
216 char *str;
193 217
194 entry = audit_to_entry_common((struct audit_rule *)data); 218 entry = audit_to_entry_common((struct audit_rule *)data);
195 if (IS_ERR(entry)) 219 if (IS_ERR(entry))
@@ -207,10 +231,35 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
207 231
208 f->op = data->fieldflags[i] & AUDIT_OPERATORS; 232 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
209 f->type = data->fields[i]; 233 f->type = data->fields[i];
234 f->val = data->values[i];
235 f->se_str = NULL;
236 f->se_rule = NULL;
210 switch(f->type) { 237 switch(f->type) {
211 /* call type-specific conversion routines here */ 238 case AUDIT_SE_USER:
212 default: 239 case AUDIT_SE_ROLE:
213 f->val = data->values[i]; 240 case AUDIT_SE_TYPE:
241 case AUDIT_SE_SEN:
242 case AUDIT_SE_CLR:
243 str = audit_unpack_string(&bufp, &remain, f->val);
244 if (IS_ERR(str))
245 goto exit_free;
246 entry->rule.buflen += f->val;
247
248 err = selinux_audit_rule_init(f->type, f->op, str,
249 &f->se_rule);
250 /* Keep currently invalid fields around in case they
251 * become valid after a policy reload. */
252 if (err == -EINVAL) {
253 printk(KERN_WARNING "audit rule for selinux "
254 "\'%s\' is invalid\n", str);
255 err = 0;
256 }
257 if (err) {
258 kfree(str);
259 goto exit_free;
260 } else
261 f->se_str = str;
262 break;
214 } 263 }
215 } 264 }
216 265
@@ -286,7 +335,14 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
286 data->fields[i] = f->type; 335 data->fields[i] = f->type;
287 data->fieldflags[i] = f->op; 336 data->fieldflags[i] = f->op;
288 switch(f->type) { 337 switch(f->type) {
289 /* call type-specific conversion routines here */ 338 case AUDIT_SE_USER:
339 case AUDIT_SE_ROLE:
340 case AUDIT_SE_TYPE:
341 case AUDIT_SE_SEN:
342 case AUDIT_SE_CLR:
343 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str);
345 break;
290 default: 346 default:
291 data->values[i] = f->val; 347 data->values[i] = f->val;
292 } 348 }
@@ -314,7 +370,14 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
314 return 1; 370 return 1;
315 371
316 switch(a->fields[i].type) { 372 switch(a->fields[i].type) {
317 /* call type-specific comparison routines here */ 373 case AUDIT_SE_USER:
374 case AUDIT_SE_ROLE:
375 case AUDIT_SE_TYPE:
376 case AUDIT_SE_SEN:
377 case AUDIT_SE_CLR:
378 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
379 return 1;
380 break;
318 default: 381 default:
319 if (a->fields[i].val != b->fields[i].val) 382 if (a->fields[i].val != b->fields[i].val)
320 return 1; 383 return 1;
@@ -328,6 +391,81 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
328 return 0; 391 return 0;
329} 392}
330 393
394/* Duplicate selinux field information. The se_rule is opaque, so must be
395 * re-initialized. */
396static inline int audit_dupe_selinux_field(struct audit_field *df,
397 struct audit_field *sf)
398{
399 int ret = 0;
400 char *se_str;
401
402 /* our own copy of se_str */
403 se_str = kstrdup(sf->se_str, GFP_KERNEL);
404 if (unlikely(IS_ERR(se_str)))
405 return -ENOMEM;
406 df->se_str = se_str;
407
408 /* our own (refreshed) copy of se_rule */
409 ret = selinux_audit_rule_init(df->type, df->op, df->se_str,
410 &df->se_rule);
411 /* Keep currently invalid fields around in case they
412 * become valid after a policy reload. */
413 if (ret == -EINVAL) {
414 printk(KERN_WARNING "audit rule for selinux \'%s\' is "
415 "invalid\n", df->se_str);
416 ret = 0;
417 }
418
419 return ret;
420}
421
422/* Duplicate an audit rule. This will be a deep copy with the exception
423 * of the watch - that pointer is carried over. The selinux specific fields
424 * will be updated in the copy. The point is to be able to replace the old
425 * rule with the new rule in the filterlist, then free the old rule. */
426static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
427{
428 u32 fcount = old->field_count;
429 struct audit_entry *entry;
430 struct audit_krule *new;
431 int i, err = 0;
432
433 entry = audit_init_entry(fcount);
434 if (unlikely(!entry))
435 return ERR_PTR(-ENOMEM);
436
437 new = &entry->rule;
438 new->vers_ops = old->vers_ops;
439 new->flags = old->flags;
440 new->listnr = old->listnr;
441 new->action = old->action;
442 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
443 new->mask[i] = old->mask[i];
444 new->buflen = old->buflen;
445 new->field_count = old->field_count;
446 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
447
448 /* deep copy this information, updating the se_rule fields, because
449 * the originals will all be freed when the old rule is freed. */
450 for (i = 0; i < fcount; i++) {
451 switch (new->fields[i].type) {
452 case AUDIT_SE_USER:
453 case AUDIT_SE_ROLE:
454 case AUDIT_SE_TYPE:
455 case AUDIT_SE_SEN:
456 case AUDIT_SE_CLR:
457 err = audit_dupe_selinux_field(&new->fields[i],
458 &old->fields[i]);
459 }
460 if (err) {
461 audit_free_rule(entry);
462 return ERR_PTR(err);
463 }
464 }
465
466 return entry;
467}
468
331/* Add rule to given filterlist if not a duplicate. Protected by 469/* Add rule to given filterlist if not a duplicate. Protected by
332 * audit_netlink_mutex. */ 470 * audit_netlink_mutex. */
333static inline int audit_add_rule(struct audit_entry *entry, 471static inline int audit_add_rule(struct audit_entry *entry,
@@ -448,9 +586,10 @@ static int audit_list_rules(void *_dest)
448 * @data: payload data 586 * @data: payload data
449 * @datasz: size of payload data 587 * @datasz: size of payload data
450 * @loginuid: loginuid of sender 588 * @loginuid: loginuid of sender
589 * @sid: SE Linux Security ID of sender
451 */ 590 */
452int audit_receive_filter(int type, int pid, int uid, int seq, void *data, 591int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
453 size_t datasz, uid_t loginuid) 592 size_t datasz, uid_t loginuid, u32 sid)
454{ 593{
455 struct task_struct *tsk; 594 struct task_struct *tsk;
456 int *dest; 595 int *dest;
@@ -493,9 +632,23 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
493 632
494 err = audit_add_rule(entry, 633 err = audit_add_rule(entry,
495 &audit_filter_list[entry->rule.listnr]); 634 &audit_filter_list[entry->rule.listnr]);
496 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 635 if (sid) {
497 "auid=%u add rule to list=%d res=%d\n", 636 char *ctx = NULL;
498 loginuid, entry->rule.listnr, !err); 637 u32 len;
638 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
639 /* Maybe call audit_panic? */
640 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
641 "auid=%u ssid=%u add rule to list=%d res=%d",
642 loginuid, sid, entry->rule.listnr, !err);
643 } else
644 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
645 "auid=%u subj=%s add rule to list=%d res=%d",
646 loginuid, ctx, entry->rule.listnr, !err);
647 kfree(ctx);
648 } else
649 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
650 "auid=%u add rule to list=%d res=%d",
651 loginuid, entry->rule.listnr, !err);
499 652
500 if (err) 653 if (err)
501 audit_free_rule(entry); 654 audit_free_rule(entry);
@@ -511,9 +664,24 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
511 664
512 err = audit_del_rule(entry, 665 err = audit_del_rule(entry,
513 &audit_filter_list[entry->rule.listnr]); 666 &audit_filter_list[entry->rule.listnr]);
514 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 667
515 "auid=%u remove rule from list=%d res=%d\n", 668 if (sid) {
516 loginuid, entry->rule.listnr, !err); 669 char *ctx = NULL;
670 u32 len;
671 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
672 /* Maybe call audit_panic? */
673 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
674 "auid=%u ssid=%u remove rule from list=%d res=%d",
675 loginuid, sid, entry->rule.listnr, !err);
676 } else
677 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
678 "auid=%u subj=%s remove rule from list=%d res=%d",
679 loginuid, ctx, entry->rule.listnr, !err);
680 kfree(ctx);
681 } else
682 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
683 "auid=%u remove rule from list=%d res=%d",
684 loginuid, entry->rule.listnr, !err);
517 685
518 audit_free_rule(entry); 686 audit_free_rule(entry);
519 break; 687 break;
@@ -628,3 +796,62 @@ unlock_and_return:
628 rcu_read_unlock(); 796 rcu_read_unlock();
629 return result; 797 return result;
630} 798}
799
800/* Check to see if the rule contains any selinux fields. Returns 1 if there
801 are selinux fields specified in the rule, 0 otherwise. */
802static inline int audit_rule_has_selinux(struct audit_krule *rule)
803{
804 int i;
805
806 for (i = 0; i < rule->field_count; i++) {
807 struct audit_field *f = &rule->fields[i];
808 switch (f->type) {
809 case AUDIT_SE_USER:
810 case AUDIT_SE_ROLE:
811 case AUDIT_SE_TYPE:
812 case AUDIT_SE_SEN:
813 case AUDIT_SE_CLR:
814 return 1;
815 }
816 }
817
818 return 0;
819}
820
821/* This function will re-initialize the se_rule field of all applicable rules.
822 * It will traverse the filter lists serarching for rules that contain selinux
823 * specific filter fields. When such a rule is found, it is copied, the
824 * selinux field is re-initialized, and the old rule is replaced with the
825 * updated rule. */
826int selinux_audit_rule_update(void)
827{
828 struct audit_entry *entry, *n, *nentry;
829 int i, err = 0;
830
831 /* audit_netlink_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex);
833
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule))
837 continue;
838
839 nentry = audit_dupe_rule(&entry->rule);
840 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the
842 * return value */
843 if (!err)
844 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters");
846 list_del_rcu(&entry->list);
847 } else {
848 list_replace_rcu(&entry->list, &nentry->list);
849 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 }
852 }
853
854 mutex_unlock(&audit_netlink_mutex);
855
856 return err;
857}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7f160df21a..1c03a4ed1b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -58,6 +58,7 @@
58#include <linux/security.h> 58#include <linux/security.h>
59#include <linux/list.h> 59#include <linux/list.h>
60#include <linux/tty.h> 60#include <linux/tty.h>
61#include <linux/selinux.h>
61 62
62#include "audit.h" 63#include "audit.h"
63 64
@@ -89,7 +90,7 @@ struct audit_names {
89 uid_t uid; 90 uid_t uid;
90 gid_t gid; 91 gid_t gid;
91 dev_t rdev; 92 dev_t rdev;
92 char *ctx; 93 u32 osid;
93}; 94};
94 95
95struct audit_aux_data { 96struct audit_aux_data {
@@ -106,7 +107,7 @@ struct audit_aux_data_ipcctl {
106 uid_t uid; 107 uid_t uid;
107 gid_t gid; 108 gid_t gid;
108 mode_t mode; 109 mode_t mode;
109 char *ctx; 110 u32 osid;
110}; 111};
111 112
112struct audit_aux_data_socketcall { 113struct audit_aux_data_socketcall {
@@ -167,7 +168,8 @@ static int audit_filter_rules(struct task_struct *tsk,
167 struct audit_context *ctx, 168 struct audit_context *ctx,
168 enum audit_state *state) 169 enum audit_state *state)
169{ 170{
170 int i, j; 171 int i, j, need_sid = 1;
172 u32 sid;
171 173
172 for (i = 0; i < rule->field_count; i++) { 174 for (i = 0; i < rule->field_count; i++) {
173 struct audit_field *f = &rule->fields[i]; 175 struct audit_field *f = &rule->fields[i];
@@ -257,6 +259,27 @@ static int audit_filter_rules(struct task_struct *tsk,
257 if (ctx) 259 if (ctx)
258 result = audit_comparator(ctx->loginuid, f->op, f->val); 260 result = audit_comparator(ctx->loginuid, f->op, f->val);
259 break; 261 break;
262 case AUDIT_SE_USER:
263 case AUDIT_SE_ROLE:
264 case AUDIT_SE_TYPE:
265 case AUDIT_SE_SEN:
266 case AUDIT_SE_CLR:
267 /* NOTE: this may return negative values indicating
268 a temporary error. We simply treat this as a
269 match for now to avoid losing information that
270 may be wanted. An error message will also be
271 logged upon error */
272 if (f->se_rule) {
273 if (need_sid) {
274 selinux_task_ctxid(tsk, &sid);
275 need_sid = 0;
276 }
277 result = selinux_audit_rule_match(sid, f->type,
278 f->op,
279 f->se_rule,
280 ctx);
281 }
282 break;
260 case AUDIT_ARG0: 283 case AUDIT_ARG0:
261 case AUDIT_ARG1: 284 case AUDIT_ARG1:
262 case AUDIT_ARG2: 285 case AUDIT_ARG2:
@@ -329,7 +352,6 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
329 return AUDIT_BUILD_CONTEXT; 352 return AUDIT_BUILD_CONTEXT;
330} 353}
331 354
332/* This should be called with task_lock() held. */
333static inline struct audit_context *audit_get_context(struct task_struct *tsk, 355static inline struct audit_context *audit_get_context(struct task_struct *tsk,
334 int return_valid, 356 int return_valid,
335 int return_code) 357 int return_code)
@@ -391,9 +413,6 @@ static inline void audit_free_names(struct audit_context *context)
391#endif 413#endif
392 414
393 for (i = 0; i < context->name_count; i++) { 415 for (i = 0; i < context->name_count; i++) {
394 char *p = context->names[i].ctx;
395 context->names[i].ctx = NULL;
396 kfree(p);
397 if (context->names[i].name) 416 if (context->names[i].name)
398 __putname(context->names[i].name); 417 __putname(context->names[i].name);
399 } 418 }
@@ -416,11 +435,6 @@ static inline void audit_free_aux(struct audit_context *context)
416 dput(axi->dentry); 435 dput(axi->dentry);
417 mntput(axi->mnt); 436 mntput(axi->mnt);
418 } 437 }
419 if ( aux->type == AUDIT_IPC ) {
420 struct audit_aux_data_ipcctl *axi = (void *)aux;
421 if (axi->ctx)
422 kfree(axi->ctx);
423 }
424 438
425 context->aux = aux->next; 439 context->aux = aux->next;
426 kfree(aux); 440 kfree(aux);
@@ -506,7 +520,7 @@ static inline void audit_free_context(struct audit_context *context)
506 printk(KERN_ERR "audit: freed %d contexts\n", count); 520 printk(KERN_ERR "audit: freed %d contexts\n", count);
507} 521}
508 522
509static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) 523static void audit_log_task_context(struct audit_buffer *ab)
510{ 524{
511 char *ctx = NULL; 525 char *ctx = NULL;
512 ssize_t len = 0; 526 ssize_t len = 0;
@@ -518,7 +532,7 @@ static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask)
518 return; 532 return;
519 } 533 }
520 534
521 ctx = kmalloc(len, gfp_mask); 535 ctx = kmalloc(len, GFP_KERNEL);
522 if (!ctx) 536 if (!ctx)
523 goto error_path; 537 goto error_path;
524 538
@@ -536,47 +550,46 @@ error_path:
536 return; 550 return;
537} 551}
538 552
539static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask) 553static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
540{ 554{
541 char name[sizeof(current->comm)]; 555 char name[sizeof(tsk->comm)];
542 struct mm_struct *mm = current->mm; 556 struct mm_struct *mm = tsk->mm;
543 struct vm_area_struct *vma; 557 struct vm_area_struct *vma;
544 558
545 get_task_comm(name, current); 559 /* tsk == current */
560
561 get_task_comm(name, tsk);
546 audit_log_format(ab, " comm="); 562 audit_log_format(ab, " comm=");
547 audit_log_untrustedstring(ab, name); 563 audit_log_untrustedstring(ab, name);
548 564
549 if (!mm) 565 if (mm) {
550 return; 566 down_read(&mm->mmap_sem);
551 567 vma = mm->mmap;
552 /* 568 while (vma) {
553 * this is brittle; all callers that pass GFP_ATOMIC will have 569 if ((vma->vm_flags & VM_EXECUTABLE) &&
554 * NULL current->mm and we won't get here. 570 vma->vm_file) {
555 */ 571 audit_log_d_path(ab, "exe=",
556 down_read(&mm->mmap_sem); 572 vma->vm_file->f_dentry,
557 vma = mm->mmap; 573 vma->vm_file->f_vfsmnt);
558 while (vma) { 574 break;
559 if ((vma->vm_flags & VM_EXECUTABLE) && 575 }
560 vma->vm_file) { 576 vma = vma->vm_next;
561 audit_log_d_path(ab, "exe=",
562 vma->vm_file->f_dentry,
563 vma->vm_file->f_vfsmnt);
564 break;
565 } 577 }
566 vma = vma->vm_next; 578 up_read(&mm->mmap_sem);
567 } 579 }
568 up_read(&mm->mmap_sem); 580 audit_log_task_context(ab);
569 audit_log_task_context(ab, gfp_mask);
570} 581}
571 582
572static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) 583static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
573{ 584{
574 int i; 585 int i, call_panic = 0;
575 struct audit_buffer *ab; 586 struct audit_buffer *ab;
576 struct audit_aux_data *aux; 587 struct audit_aux_data *aux;
577 const char *tty; 588 const char *tty;
578 589
579 ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); 590 /* tsk == current */
591
592 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
580 if (!ab) 593 if (!ab)
581 return; /* audit_panic has been called */ 594 return; /* audit_panic has been called */
582 audit_log_format(ab, "arch=%x syscall=%d", 595 audit_log_format(ab, "arch=%x syscall=%d",
@@ -587,8 +600,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
587 audit_log_format(ab, " success=%s exit=%ld", 600 audit_log_format(ab, " success=%s exit=%ld",
588 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 601 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
589 context->return_code); 602 context->return_code);
590 if (current->signal->tty && current->signal->tty->name) 603 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
591 tty = current->signal->tty->name; 604 tty = tsk->signal->tty->name;
592 else 605 else
593 tty = "(none)"; 606 tty = "(none)";
594 audit_log_format(ab, 607 audit_log_format(ab,
@@ -607,12 +620,12 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
607 context->gid, 620 context->gid,
608 context->euid, context->suid, context->fsuid, 621 context->euid, context->suid, context->fsuid,
609 context->egid, context->sgid, context->fsgid, tty); 622 context->egid, context->sgid, context->fsgid, tty);
610 audit_log_task_info(ab, gfp_mask); 623 audit_log_task_info(ab, tsk);
611 audit_log_end(ab); 624 audit_log_end(ab);
612 625
613 for (aux = context->aux; aux; aux = aux->next) { 626 for (aux = context->aux; aux; aux = aux->next) {
614 627
615 ab = audit_log_start(context, gfp_mask, aux->type); 628 ab = audit_log_start(context, GFP_KERNEL, aux->type);
616 if (!ab) 629 if (!ab)
617 continue; /* audit_panic has been called */ 630 continue; /* audit_panic has been called */
618 631
@@ -620,8 +633,39 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
620 case AUDIT_IPC: { 633 case AUDIT_IPC: {
621 struct audit_aux_data_ipcctl *axi = (void *)aux; 634 struct audit_aux_data_ipcctl *axi = (void *)aux;
622 audit_log_format(ab, 635 audit_log_format(ab,
623 " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s", 636 " qbytes=%lx iuid=%u igid=%u mode=%x",
624 axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx); 637 axi->qbytes, axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) {
639 char *ctx = NULL;
640 u32 len;
641 if (selinux_ctxid_to_string(
642 axi->osid, &ctx, &len)) {
643 audit_log_format(ab, " osid=%u",
644 axi->osid);
645 call_panic = 1;
646 } else
647 audit_log_format(ab, " obj=%s", ctx);
648 kfree(ctx);
649 }
650 break; }
651
652 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) {
658 char *ctx = NULL;
659 u32 len;
660 if (selinux_ctxid_to_string(
661 axi->osid, &ctx, &len)) {
662 audit_log_format(ab, " osid=%u",
663 axi->osid);
664 call_panic = 1;
665 } else
666 audit_log_format(ab, " obj=%s", ctx);
667 kfree(ctx);
668 }
625 break; } 669 break; }
626 670
627 case AUDIT_SOCKETCALL: { 671 case AUDIT_SOCKETCALL: {
@@ -649,7 +693,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
649 } 693 }
650 694
651 if (context->pwd && context->pwdmnt) { 695 if (context->pwd && context->pwdmnt) {
652 ab = audit_log_start(context, gfp_mask, AUDIT_CWD); 696 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
653 if (ab) { 697 if (ab) {
654 audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); 698 audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
655 audit_log_end(ab); 699 audit_log_end(ab);
@@ -659,7 +703,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
659 unsigned long ino = context->names[i].ino; 703 unsigned long ino = context->names[i].ino;
660 unsigned long pino = context->names[i].pino; 704 unsigned long pino = context->names[i].pino;
661 705
662 ab = audit_log_start(context, gfp_mask, AUDIT_PATH); 706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
663 if (!ab) 707 if (!ab)
664 continue; /* audit_panic has been called */ 708 continue; /* audit_panic has been called */
665 709
@@ -685,32 +729,35 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
685 context->names[i].gid, 729 context->names[i].gid,
686 MAJOR(context->names[i].rdev), 730 MAJOR(context->names[i].rdev),
687 MINOR(context->names[i].rdev)); 731 MINOR(context->names[i].rdev));
688 if (context->names[i].ctx) { 732 if (context->names[i].osid != 0) {
689 audit_log_format(ab, " obj=%s", 733 char *ctx = NULL;
690 context->names[i].ctx); 734 u32 len;
735 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u",
738 context->names[i].osid);
739 call_panic = 2;
740 } else
741 audit_log_format(ab, " obj=%s", ctx);
742 kfree(ctx);
691 } 743 }
692 744
693 audit_log_end(ab); 745 audit_log_end(ab);
694 } 746 }
747 if (call_panic)
748 audit_panic("error converting sid to string");
695} 749}
696 750
697/** 751/**
698 * audit_free - free a per-task audit context 752 * audit_free - free a per-task audit context
699 * @tsk: task whose audit context block to free 753 * @tsk: task whose audit context block to free
700 * 754 *
701 * Called from copy_process and __put_task_struct. 755 * Called from copy_process and do_exit
702 */ 756 */
703void audit_free(struct task_struct *tsk) 757void audit_free(struct task_struct *tsk)
704{ 758{
705 struct audit_context *context; 759 struct audit_context *context;
706 760
707 /*
708 * No need to lock the task - when we execute audit_free()
709 * then the task has no external references anymore, and
710 * we are tearing it down. (The locking also confuses
711 * DEBUG_LOCKDEP - this freeing may occur in softirq
712 * contexts as well, via RCU.)
713 */
714 context = audit_get_context(tsk, 0, 0); 761 context = audit_get_context(tsk, 0, 0);
715 if (likely(!context)) 762 if (likely(!context))
716 return; 763 return;
@@ -719,8 +766,9 @@ void audit_free(struct task_struct *tsk)
719 * function (e.g., exit_group), then free context block. 766 * function (e.g., exit_group), then free context block.
720 * We use GFP_ATOMIC here because we might be doing this 767 * We use GFP_ATOMIC here because we might be doing this
721 * in the context of the idle thread */ 768 * in the context of the idle thread */
769 /* that can happen only if we are called from do_exit() */
722 if (context->in_syscall && context->auditable) 770 if (context->in_syscall && context->auditable)
723 audit_log_exit(context, GFP_ATOMIC); 771 audit_log_exit(context, tsk);
724 772
725 audit_free_context(context); 773 audit_free_context(context);
726} 774}
@@ -743,10 +791,11 @@ void audit_free(struct task_struct *tsk)
743 * will only be written if another part of the kernel requests that it 791 * will only be written if another part of the kernel requests that it
744 * be written). 792 * be written).
745 */ 793 */
746void audit_syscall_entry(struct task_struct *tsk, int arch, int major, 794void audit_syscall_entry(int arch, int major,
747 unsigned long a1, unsigned long a2, 795 unsigned long a1, unsigned long a2,
748 unsigned long a3, unsigned long a4) 796 unsigned long a3, unsigned long a4)
749{ 797{
798 struct task_struct *tsk = current;
750 struct audit_context *context = tsk->audit_context; 799 struct audit_context *context = tsk->audit_context;
751 enum audit_state state; 800 enum audit_state state;
752 801
@@ -824,22 +873,18 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
824 * message), then write out the syscall information. In call cases, 873 * message), then write out the syscall information. In call cases,
825 * free the names stored from getname(). 874 * free the names stored from getname().
826 */ 875 */
827void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) 876void audit_syscall_exit(int valid, long return_code)
828{ 877{
878 struct task_struct *tsk = current;
829 struct audit_context *context; 879 struct audit_context *context;
830 880
831 get_task_struct(tsk);
832 task_lock(tsk);
833 context = audit_get_context(tsk, valid, return_code); 881 context = audit_get_context(tsk, valid, return_code);
834 task_unlock(tsk);
835 882
836 /* Not having a context here is ok, since the parent may have
837 * called __put_task_struct. */
838 if (likely(!context)) 883 if (likely(!context))
839 goto out; 884 return;
840 885
841 if (context->in_syscall && context->auditable) 886 if (context->in_syscall && context->auditable)
842 audit_log_exit(context, GFP_KERNEL); 887 audit_log_exit(context, tsk);
843 888
844 context->in_syscall = 0; 889 context->in_syscall = 0;
845 context->auditable = 0; 890 context->auditable = 0;
@@ -854,8 +899,6 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
854 audit_free_aux(context); 899 audit_free_aux(context);
855 tsk->audit_context = context; 900 tsk->audit_context = context;
856 } 901 }
857 out:
858 put_task_struct(tsk);
859} 902}
860 903
861/** 904/**
@@ -936,40 +979,11 @@ void audit_putname(const char *name)
936#endif 979#endif
937} 980}
938 981
939void audit_inode_context(int idx, const struct inode *inode) 982static void audit_inode_context(int idx, const struct inode *inode)
940{ 983{
941 struct audit_context *context = current->audit_context; 984 struct audit_context *context = current->audit_context;
942 const char *suffix = security_inode_xattr_getsuffix();
943 char *ctx = NULL;
944 int len = 0;
945
946 if (!suffix)
947 goto ret;
948
949 len = security_inode_getsecurity(inode, suffix, NULL, 0, 0);
950 if (len == -EOPNOTSUPP)
951 goto ret;
952 if (len < 0)
953 goto error_path;
954
955 ctx = kmalloc(len, GFP_KERNEL);
956 if (!ctx)
957 goto error_path;
958 985
959 len = security_inode_getsecurity(inode, suffix, ctx, len, 0); 986 selinux_get_inode_sid(inode, &context->names[idx].osid);
960 if (len < 0)
961 goto error_path;
962
963 kfree(context->names[idx].ctx);
964 context->names[idx].ctx = ctx;
965 goto ret;
966
967error_path:
968 if (ctx)
969 kfree(ctx);
970 audit_panic("error in audit_inode_context");
971ret:
972 return;
973} 987}
974 988
975 989
@@ -1155,40 +1169,37 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1155 return ctx ? ctx->loginuid : -1; 1169 return ctx ? ctx->loginuid : -1;
1156} 1170}
1157 1171
1158static char *audit_ipc_context(struct kern_ipc_perm *ipcp) 1172/**
1173 * audit_ipc_obj - record audit data for ipc object
1174 * @ipcp: ipc permissions
1175 *
1176 * Returns 0 for success or NULL context or < 0 on error.
1177 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1159{ 1179{
1180 struct audit_aux_data_ipcctl *ax;
1160 struct audit_context *context = current->audit_context; 1181 struct audit_context *context = current->audit_context;
1161 char *ctx = NULL;
1162 int len = 0;
1163 1182
1164 if (likely(!context)) 1183 if (likely(!context))
1165 return NULL; 1184 return 0;
1166
1167 len = security_ipc_getsecurity(ipcp, NULL, 0);
1168 if (len == -EOPNOTSUPP)
1169 goto ret;
1170 if (len < 0)
1171 goto error_path;
1172
1173 ctx = kmalloc(len, GFP_ATOMIC);
1174 if (!ctx)
1175 goto error_path;
1176 1185
1177 len = security_ipc_getsecurity(ipcp, ctx, len); 1186 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1178 if (len < 0) 1187 if (!ax)
1179 goto error_path; 1188 return -ENOMEM;
1180 1189
1181 return ctx; 1190 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode;
1193 selinux_get_ipc_sid(ipcp, &ax->osid);
1182 1194
1183error_path: 1195 ax->d.type = AUDIT_IPC;
1184 kfree(ctx); 1196 ax->d.next = context->aux;
1185 audit_panic("error in audit_ipc_context"); 1197 context->aux = (void *)ax;
1186ret: 1198 return 0;
1187 return NULL;
1188} 1199}
1189 1200
1190/** 1201/**
1191 * audit_ipc_perms - record audit data for ipc 1202 * audit_ipc_set_perm - record audit data for new ipc permissions
1192 * @qbytes: msgq bytes 1203 * @qbytes: msgq bytes
1193 * @uid: msgq user id 1204 * @uid: msgq user id
1194 * @gid: msgq group id 1205 * @gid: msgq group id
@@ -1196,7 +1207,7 @@ ret:
1196 * 1207 *
1197 * Returns 0 for success or NULL context or < 0 on error. 1208 * Returns 0 for success or NULL context or < 0 on error.
1198 */ 1209 */
1199int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) 1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
1200{ 1211{
1201 struct audit_aux_data_ipcctl *ax; 1212 struct audit_aux_data_ipcctl *ax;
1202 struct audit_context *context = current->audit_context; 1213 struct audit_context *context = current->audit_context;
@@ -1212,9 +1223,9 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, str
1212 ax->uid = uid; 1223 ax->uid = uid;
1213 ax->gid = gid; 1224 ax->gid = gid;
1214 ax->mode = mode; 1225 ax->mode = mode;
1215 ax->ctx = audit_ipc_context(ipcp); 1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1216 1227
1217 ax->d.type = AUDIT_IPC; 1228 ax->d.type = AUDIT_IPC_SET_PERM;
1218 ax->d.next = context->aux; 1229 ax->d.next = context->aux;
1219 context->aux = (void *)ax; 1230 context->aux = (void *)ax;
1220 return 0; 1231 return 0;
diff --git a/kernel/compat.c b/kernel/compat.c
index b9bdd1271f..c1601a84f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,7 +17,6 @@
17#include <linux/time.h> 17#include <linux/time.h>
18#include <linux/signal.h> 18#include <linux/signal.h>
19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ 19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
20#include <linux/futex.h> /* for FUTEX_WAIT */
21#include <linux/syscalls.h> 20#include <linux/syscalls.h>
22#include <linux/unistd.h> 21#include <linux/unistd.h>
23#include <linux/security.h> 22#include <linux/security.h>
@@ -239,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
239 return ret; 238 return ret;
240} 239}
241 240
242#ifdef CONFIG_FUTEX
243asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
244 struct compat_timespec __user *utime, u32 __user *uaddr2,
245 int val3)
246{
247 struct timespec t;
248 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
249 int val2 = 0;
250
251 if ((op == FUTEX_WAIT) && utime) {
252 if (get_compat_timespec(&t, utime))
253 return -EFAULT;
254 timeout = timespec_to_jiffies(&t) + 1;
255 }
256 if (op >= FUTEX_REQUEUE)
257 val2 = (int) (unsigned long) utime;
258
259 return do_futex((unsigned long)uaddr, op, val, timeout,
260 (unsigned long)uaddr2, val2, val3);
261}
262#endif
263
264asmlinkage long compat_sys_setrlimit(unsigned int resource, 241asmlinkage long compat_sys_setrlimit(unsigned int resource,
265 struct compat_rlimit __user *rlim) 242 struct compat_rlimit __user *rlim)
266{ 243{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8be22bd809..fe2b8d0bfe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DECLARE_MUTEX(cpucontrol);
20 20
21static struct notifier_block *cpu_chain; 21static BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 int ret; 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75
76 if ((ret = lock_cpu_hotplug_interruptible()) != 0)
77 return ret;
78 ret = notifier_chain_register(&cpu_chain, nb);
79 unlock_cpu_hotplug();
80 return ret;
81} 75}
82EXPORT_SYMBOL(register_cpu_notifier); 76EXPORT_SYMBOL(register_cpu_notifier);
83 77
84void unregister_cpu_notifier(struct notifier_block *nb) 78void unregister_cpu_notifier(struct notifier_block *nb)
85{ 79{
86 lock_cpu_hotplug(); 80 blocking_notifier_chain_unregister(&cpu_chain, nb);
87 notifier_chain_unregister(&cpu_chain, nb);
88 unlock_cpu_hotplug();
89} 81}
90EXPORT_SYMBOL(unregister_cpu_notifier); 82EXPORT_SYMBOL(unregister_cpu_notifier);
91 83
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
141 goto out; 133 goto out;
142 } 134 }
143 135
144 err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 136 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
145 (void *)(long)cpu); 137 (void *)(long)cpu);
146 if (err == NOTIFY_BAD) { 138 if (err == NOTIFY_BAD) {
147 printk("%s: attempt to take down CPU %u failed\n", 139 printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
159 p = __stop_machine_run(take_cpu_down, NULL, cpu); 151 p = __stop_machine_run(take_cpu_down, NULL, cpu);
160 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
161 /* CPU didn't die: tell everyone. Can't complain. */ 153 /* CPU didn't die: tell everyone. Can't complain. */
162 if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 154 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
163 (void *)(long)cpu) == NOTIFY_BAD) 155 (void *)(long)cpu) == NOTIFY_BAD)
164 BUG(); 156 BUG();
165 157
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
182 put_cpu(); 174 put_cpu();
183 175
184 /* CPU is completely dead: tell everyone. Too late to complain. */ 176 /* CPU is completely dead: tell everyone. Too late to complain. */
185 if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) 177 if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
186 == NOTIFY_BAD) 178 (void *)(long)cpu) == NOTIFY_BAD)
187 BUG(); 179 BUG();
188 180
189 check_for_tasks(cpu); 181 check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
211 goto out; 203 goto out;
212 } 204 }
213 205
214 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 206 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
215 if (ret == NOTIFY_BAD) { 207 if (ret == NOTIFY_BAD) {
216 printk("%s: attempt to bring up CPU %u failed\n", 208 printk("%s: attempt to bring up CPU %u failed\n",
217 __FUNCTION__, cpu); 209 __FUNCTION__, cpu);
@@ -226,11 +218,12 @@ int __devinit cpu_up(unsigned int cpu)
226 BUG_ON(!cpu_online(cpu)); 218 BUG_ON(!cpu_online(cpu));
227 219
228 /* Now call notifier in preparation. */ 220 /* Now call notifier in preparation. */
229 notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 221 blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
230 222
231out_notify: 223out_notify:
232 if (ret != 0) 224 if (ret != 0)
233 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); 225 blocking_notifier_call_chain(&cpu_chain,
226 CPU_UP_CANCELED, hcpu);
234out: 227out:
235 unlock_cpu_hotplug(); 228 unlock_cpu_hotplug();
236 return ret; 229 return ret;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 18aea1bd12..ab81fdd457 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -616,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
616 * current->cpuset if a task has its memory placement changed. 616 * current->cpuset if a task has its memory placement changed.
617 * Do not call this routine if in_interrupt(). 617 * Do not call this routine if in_interrupt().
618 * 618 *
619 * Call without callback_mutex or task_lock() held. May be called 619 * Call without callback_mutex or task_lock() held. May be
620 * with or without manage_mutex held. Doesn't need task_lock to guard 620 * called with or without manage_mutex held. Thanks in part to
621 * against another task changing a non-NULL cpuset pointer to NULL, 621 * 'the_top_cpuset_hack', the tasks cpuset pointer will never
622 * as that is only done by a task on itself, and if the current task 622 * be NULL. This routine also might acquire callback_mutex and
623 * is here, it is not simultaneously in the exit code NULL'ing its
624 * cpuset pointer. This routine also might acquire callback_mutex and
625 * current->mm->mmap_sem during call. 623 * current->mm->mmap_sem during call.
626 * 624 *
627 * Reading current->cpuset->mems_generation doesn't need task_lock 625 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -836,6 +834,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
836} 834}
837 835
838/* 836/*
837 * cpuset_migrate_mm
838 *
839 * Migrate memory region from one set of nodes to another.
840 *
841 * Temporarily set tasks mems_allowed to target nodes of migration,
842 * so that the migration code can allocate pages on these nodes.
843 *
844 * Call holding manage_mutex, so our current->cpuset won't change
845 * during this call, as manage_mutex holds off any attach_task()
846 * calls. Therefore we don't need to take task_lock around the
847 * call to guarantee_online_mems(), as we know no one is changing
848 * our tasks cpuset.
849 *
850 * Hold callback_mutex around the two modifications of our tasks
851 * mems_allowed to synchronize with cpuset_mems_allowed().
852 *
853 * While the mm_struct we are migrating is typically from some
854 * other task, the task_struct mems_allowed that we are hacking
855 * is for our current task, which must allocate new pages for that
856 * migrating memory region.
857 *
858 * We call cpuset_update_task_memory_state() before hacking
859 * our tasks mems_allowed, so that we are assured of being in
860 * sync with our tasks cpuset, and in particular, callbacks to
861 * cpuset_update_task_memory_state() from nested page allocations
862 * won't see any mismatch of our cpuset and task mems_generation
863 * values, so won't overwrite our hacked tasks mems_allowed
864 * nodemask.
865 */
866
867static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
868 const nodemask_t *to)
869{
870 struct task_struct *tsk = current;
871
872 cpuset_update_task_memory_state();
873
874 mutex_lock(&callback_mutex);
875 tsk->mems_allowed = *to;
876 mutex_unlock(&callback_mutex);
877
878 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
879
880 mutex_lock(&callback_mutex);
881 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
882 mutex_unlock(&callback_mutex);
883}
884
885/*
839 * Handle user request to change the 'mems' memory placement 886 * Handle user request to change the 'mems' memory placement
840 * of a cpuset. Needs to validate the request, update the 887 * of a cpuset. Needs to validate the request, update the
841 * cpusets mems_allowed and mems_generation, and for each 888 * cpusets mems_allowed and mems_generation, and for each
@@ -947,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
947 struct mm_struct *mm = mmarray[i]; 994 struct mm_struct *mm = mmarray[i];
948 995
949 mpol_rebind_mm(mm, &cs->mems_allowed); 996 mpol_rebind_mm(mm, &cs->mems_allowed);
950 if (migrate) { 997 if (migrate)
951 do_migrate_pages(mm, &oldmem, &cs->mems_allowed, 998 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
952 MPOL_MF_MOVE_ALL);
953 }
954 mmput(mm); 999 mmput(mm);
955 } 1000 }
956 1001
@@ -1185,11 +1230,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1185 mm = get_task_mm(tsk); 1230 mm = get_task_mm(tsk);
1186 if (mm) { 1231 if (mm) {
1187 mpol_rebind_mm(mm, &to); 1232 mpol_rebind_mm(mm, &to);
1233 if (is_memory_migrate(cs))
1234 cpuset_migrate_mm(mm, &from, &to);
1188 mmput(mm); 1235 mmput(mm);
1189 } 1236 }
1190 1237
1191 if (is_memory_migrate(cs))
1192 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1193 put_task_struct(tsk); 1238 put_task_struct(tsk);
1194 synchronize_rcu(); 1239 synchronize_rcu();
1195 if (atomic_dec_and_test(&oldcs->count)) 1240 if (atomic_dec_and_test(&oldcs->count))
@@ -2186,19 +2231,25 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2186 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2231 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2187 * short of memory, might require taking the callback_mutex mutex. 2232 * short of memory, might require taking the callback_mutex mutex.
2188 * 2233 *
2189 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 2234 * The first call here from mm/page_alloc:get_page_from_freelist()
2190 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 2235 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
2191 * hardwall cpusets - no allocation on a node outside the cpuset is 2236 * no allocation on a node outside the cpuset is allowed (unless in
2192 * allowed (unless in interrupt, of course). 2237 * interrupt, of course).
2193 * 2238 *
2194 * The second loop doesn't even call here for GFP_ATOMIC requests 2239 * The second pass through get_page_from_freelist() doesn't even call
2195 * (if the __alloc_pages() local variable 'wait' is set). That check 2240 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
2196 * and the checks below have the combined affect in the second loop of 2241 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
2197 * the __alloc_pages() routine that: 2242 * in alloc_flags. That logic and the checks below have the combined
2243 * affect that:
2198 * in_interrupt - any node ok (current task context irrelevant) 2244 * in_interrupt - any node ok (current task context irrelevant)
2199 * GFP_ATOMIC - any node ok 2245 * GFP_ATOMIC - any node ok
2200 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2246 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
2201 * GFP_USER - only nodes in current tasks mems allowed ok. 2247 * GFP_USER - only nodes in current tasks mems allowed ok.
2248 *
2249 * Rule:
2250 * Don't call cpuset_zone_allowed() if you can't sleep, unless you
2251 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2252 * the code that might scan up ancestor cpusets and sleep.
2202 **/ 2253 **/
2203 2254
2204int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2255int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
@@ -2210,6 +2261,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2210 if (in_interrupt()) 2261 if (in_interrupt())
2211 return 1; 2262 return 1;
2212 node = z->zone_pgdat->node_id; 2263 node = z->zone_pgdat->node_id;
2264 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2213 if (node_isset(node, current->mems_allowed)) 2265 if (node_isset(node, current->mems_allowed))
2214 return 1; 2266 return 1;
2215 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2267 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
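The rule in the comment above is easiest to see from a caller's side. A hedged sketch, assuming the usual cpuset_zone_allowed() wrapper from <linux/cpuset.h> that funnels into __cpuset_zone_allowed(); try_this_zone() is a made-up placeholder for whatever the caller does with an allowed zone.

/* Hedged sketch of the calling rule stated above. */
static struct page *pick_zone_example(struct zone *z, gfp_t gfp_mask)
{
        /*
         * A caller that cannot sleep must OR in __GFP_HARDWALL, so the
         * check stops at the hardwall test instead of scanning (and
         * possibly sleeping on) ancestor cpusets.
         */
        if (!(gfp_mask & __GFP_WAIT))
                gfp_mask |= __GFP_HARDWALL;

        if (!cpuset_zone_allowed(z, gfp_mask))
                return NULL;

        return try_this_zone(z, gfp_mask);      /* hypothetical next step */
}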
diff --git a/kernel/exit.c b/kernel/exit.c
index 8037405e13..e95b932822 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,8 +29,13 @@
29#include <linux/cpuset.h> 29#include <linux/cpuset.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/posix-timers.h>
32#include <linux/cn_proc.h> 33#include <linux/cn_proc.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/futex.h>
36#include <linux/compat.h>
37#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */
34 39
35#include <asm/uaccess.h> 40#include <asm/uaccess.h>
36#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -48,15 +53,85 @@ static void __unhash_process(struct task_struct *p)
48{ 53{
49 nr_threads--; 54 nr_threads--;
50 detach_pid(p, PIDTYPE_PID); 55 detach_pid(p, PIDTYPE_PID);
51 detach_pid(p, PIDTYPE_TGID);
52 if (thread_group_leader(p)) { 56 if (thread_group_leader(p)) {
53 detach_pid(p, PIDTYPE_PGID); 57 detach_pid(p, PIDTYPE_PGID);
54 detach_pid(p, PIDTYPE_SID); 58 detach_pid(p, PIDTYPE_SID);
55 if (p->pid) 59
56 __get_cpu_var(process_counts)--; 60 list_del_rcu(&p->tasks);
61 __get_cpu_var(process_counts)--;
57 } 62 }
63 list_del_rcu(&p->thread_group);
64 remove_parent(p);
65}
66
67/*
68 * This function expects the tasklist_lock write-locked.
69 */
70static void __exit_signal(struct task_struct *tsk)
71{
72 struct signal_struct *sig = tsk->signal;
73 struct sighand_struct *sighand;
74
75 BUG_ON(!sig);
76 BUG_ON(!atomic_read(&sig->count));
77
78 rcu_read_lock();
79 sighand = rcu_dereference(tsk->sighand);
80 spin_lock(&sighand->siglock);
58 81
59 REMOVE_LINKS(p); 82 posix_cpu_timers_exit(tsk);
83 if (atomic_dec_and_test(&sig->count))
84 posix_cpu_timers_exit_group(tsk);
85 else {
86 /*
87 * If there is any task waiting for the group exit
88 * then notify it:
89 */
90 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
91 wake_up_process(sig->group_exit_task);
92 sig->group_exit_task = NULL;
93 }
94 if (tsk == sig->curr_target)
95 sig->curr_target = next_thread(tsk);
96 /*
97 * Accumulate here the counters for all threads but the
98 * group leader as they die, so they can be added into
99 * the process-wide totals when those are taken.
100 * The group leader stays around as a zombie as long
101 * as there are other threads. When it gets reaped,
102 * the exit.c code will add its counts into these totals.
103 * We won't ever get here for the group leader, since it
104 * will have been the last reference on the signal_struct.
105 */
106 sig->utime = cputime_add(sig->utime, tsk->utime);
107 sig->stime = cputime_add(sig->stime, tsk->stime);
108 sig->min_flt += tsk->min_flt;
109 sig->maj_flt += tsk->maj_flt;
110 sig->nvcsw += tsk->nvcsw;
111 sig->nivcsw += tsk->nivcsw;
112 sig->sched_time += tsk->sched_time;
113 sig = NULL; /* Marker for below. */
114 }
115
116 __unhash_process(tsk);
117
118 tsk->signal = NULL;
119 tsk->sighand = NULL;
120 spin_unlock(&sighand->siglock);
121 rcu_read_unlock();
122
123 __cleanup_sighand(sighand);
124 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
125 flush_sigqueue(&tsk->pending);
126 if (sig) {
127 flush_sigqueue(&sig->shared_pending);
128 __cleanup_signal(sig);
129 }
130}
131
132static void delayed_put_task_struct(struct rcu_head *rhp)
133{
134 put_task_struct(container_of(rhp, struct task_struct, rcu));
60} 135}
61 136
62void release_task(struct task_struct * p) 137void release_task(struct task_struct * p)
@@ -65,21 +140,14 @@ void release_task(struct task_struct * p)
65 task_t *leader; 140 task_t *leader;
66 struct dentry *proc_dentry; 141 struct dentry *proc_dentry;
67 142
68repeat: 143repeat:
69 atomic_dec(&p->user->processes); 144 atomic_dec(&p->user->processes);
70 spin_lock(&p->proc_lock); 145 spin_lock(&p->proc_lock);
71 proc_dentry = proc_pid_unhash(p); 146 proc_dentry = proc_pid_unhash(p);
72 write_lock_irq(&tasklist_lock); 147 write_lock_irq(&tasklist_lock);
73 if (unlikely(p->ptrace)) 148 ptrace_unlink(p);
74 __ptrace_unlink(p);
75 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
76 __exit_signal(p); 150 __exit_signal(p);
77 /*
78 * Note that the fastpath in sys_times depends on __exit_signal having
79 * updated the counters before a task is removed from the tasklist of
80 * the process by __unhash_process.
81 */
82 __unhash_process(p);
83 151
84 /* 152 /*
85 * If we are the last non-leader member of the thread 153 * If we are the last non-leader member of the thread
@@ -107,28 +175,13 @@ repeat:
107 spin_unlock(&p->proc_lock); 175 spin_unlock(&p->proc_lock);
108 proc_pid_flush(proc_dentry); 176 proc_pid_flush(proc_dentry);
109 release_thread(p); 177 release_thread(p);
110 put_task_struct(p); 178 call_rcu(&p->rcu, delayed_put_task_struct);
111 179
112 p = leader; 180 p = leader;
113 if (unlikely(zap_leader)) 181 if (unlikely(zap_leader))
114 goto repeat; 182 goto repeat;
115} 183}
116 184
117/* we are using it only for SMP init */
118
119void unhash_process(struct task_struct *p)
120{
121 struct dentry *proc_dentry;
122
123 spin_lock(&p->proc_lock);
124 proc_dentry = proc_pid_unhash(p);
125 write_lock_irq(&tasklist_lock);
126 __unhash_process(p);
127 write_unlock_irq(&tasklist_lock);
128 spin_unlock(&p->proc_lock);
129 proc_pid_flush(proc_dentry);
130}
131
132/* 185/*
133 * This checks not only the pgrp, but falls back on the pid if no 186 * This checks not only the pgrp, but falls back on the pid if no
134 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 187 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
@@ -236,10 +289,10 @@ static void reparent_to_init(void)
236 289
237 ptrace_unlink(current); 290 ptrace_unlink(current);
238 /* Reparent to init */ 291 /* Reparent to init */
239 REMOVE_LINKS(current); 292 remove_parent(current);
240 current->parent = child_reaper; 293 current->parent = child_reaper;
241 current->real_parent = child_reaper; 294 current->real_parent = child_reaper;
242 SET_LINKS(current); 295 add_parent(current);
243 296
244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 297 /* Set the exit signal to SIGCHLD so we signal init on exit */
245 current->exit_signal = SIGCHLD; 298 current->exit_signal = SIGCHLD;
@@ -536,13 +589,13 @@ static void exit_mm(struct task_struct * tsk)
536 mmput(mm); 589 mmput(mm);
537} 590}
538 591
539static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) 592static inline void choose_new_parent(task_t *p, task_t *reaper)
540{ 593{
541 /* 594 /*
542 * Make sure we're not reparenting to ourselves and that 595 * Make sure we're not reparenting to ourselves and that
543 * the parent is not a zombie. 596 * the parent is not a zombie.
544 */ 597 */
545 BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); 598 BUG_ON(p == reaper || reaper->exit_state);
546 p->real_parent = reaper; 599 p->real_parent = reaper;
547} 600}
548 601
@@ -567,9 +620,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
567 * anyway, so let go of it. 620 * anyway, so let go of it.
568 */ 621 */
569 p->ptrace = 0; 622 p->ptrace = 0;
570 list_del_init(&p->sibling); 623 remove_parent(p);
571 p->parent = p->real_parent; 624 p->parent = p->real_parent;
572 list_add_tail(&p->sibling, &p->parent->children); 625 add_parent(p);
573 626
574 /* If we'd notified the old parent about this child's death, 627 /* If we'd notified the old parent about this child's death,
575 * also notify the new parent. 628 * also notify the new parent.
@@ -643,7 +696,7 @@ static void forget_original_parent(struct task_struct * father,
643 696
644 if (father == p->real_parent) { 697 if (father == p->real_parent) {
645 /* reparent with a reaper, real father it's us */ 698 /* reparent with a reaper, real father it's us */
646 choose_new_parent(p, reaper, child_reaper); 699 choose_new_parent(p, reaper);
647 reparent_thread(p, father, 0); 700 reparent_thread(p, father, 0);
648 } else { 701 } else {
649 /* reparent ptraced task to its real parent */ 702 /* reparent ptraced task to its real parent */
@@ -664,7 +717,7 @@ static void forget_original_parent(struct task_struct * father,
664 } 717 }
665 list_for_each_safe(_p, _n, &father->ptrace_children) { 718 list_for_each_safe(_p, _n, &father->ptrace_children) {
666 p = list_entry(_p,struct task_struct,ptrace_list); 719 p = list_entry(_p,struct task_struct,ptrace_list);
667 choose_new_parent(p, reaper, child_reaper); 720 choose_new_parent(p, reaper);
668 reparent_thread(p, father, 1); 721 reparent_thread(p, father, 1);
669 } 722 }
670} 723}
@@ -805,7 +858,7 @@ fastcall NORET_TYPE void do_exit(long code)
805 panic("Aiee, killing interrupt handler!"); 858 panic("Aiee, killing interrupt handler!");
806 if (unlikely(!tsk->pid)) 859 if (unlikely(!tsk->pid))
807 panic("Attempted to kill the idle task!"); 860 panic("Attempted to kill the idle task!");
808 if (unlikely(tsk->pid == 1)) 861 if (unlikely(tsk == child_reaper))
809 panic("Attempted to kill init!"); 862 panic("Attempted to kill init!");
810 863
811 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 864 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
@@ -852,6 +905,14 @@ fastcall NORET_TYPE void do_exit(long code)
852 exit_itimers(tsk->signal); 905 exit_itimers(tsk->signal);
853 acct_process(code); 906 acct_process(code);
854 } 907 }
908 if (unlikely(tsk->robust_list))
909 exit_robust_list(tsk);
910#ifdef CONFIG_COMPAT
911 if (unlikely(tsk->compat_robust_list))
912 compat_exit_robust_list(tsk);
913#endif
914 if (unlikely(tsk->audit_context))
915 audit_free(tsk);
855 exit_mm(tsk); 916 exit_mm(tsk);
856 917
857 exit_sem(tsk); 918 exit_sem(tsk);
@@ -884,6 +945,9 @@ fastcall NORET_TYPE void do_exit(long code)
884 if (tsk->io_context) 945 if (tsk->io_context)
885 exit_io_context(); 946 exit_io_context();
886 947
948 if (tsk->splice_pipe)
949 __free_pipe_info(tsk->splice_pipe);
950
887 /* PF_DEAD causes final put_task_struct after we schedule. */ 951 /* PF_DEAD causes final put_task_struct after we schedule. */
888 preempt_disable(); 952 preempt_disable();
889 BUG_ON(tsk->flags & PF_DEAD); 953 BUG_ON(tsk->flags & PF_DEAD);
@@ -912,13 +976,6 @@ asmlinkage long sys_exit(int error_code)
912 do_exit((error_code&0xff)<<8); 976 do_exit((error_code&0xff)<<8);
913} 977}
914 978
915task_t fastcall *next_thread(const task_t *p)
916{
917 return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
918}
919
920EXPORT_SYMBOL(next_thread);
921
922/* 979/*
923 * Take down every thread in the group. This is called by fatal signals 980 * Take down every thread in the group. This is called by fatal signals
924 * as well as by sys_exit_group (below). 981 * as well as by sys_exit_group (below).
@@ -933,7 +990,6 @@ do_group_exit(int exit_code)
933 else if (!thread_group_empty(current)) { 990 else if (!thread_group_empty(current)) {
934 struct signal_struct *const sig = current->signal; 991 struct signal_struct *const sig = current->signal;
935 struct sighand_struct *const sighand = current->sighand; 992 struct sighand_struct *const sighand = current->sighand;
936 read_lock(&tasklist_lock);
937 spin_lock_irq(&sighand->siglock); 993 spin_lock_irq(&sighand->siglock);
938 if (sig->flags & SIGNAL_GROUP_EXIT) 994 if (sig->flags & SIGNAL_GROUP_EXIT)
939 /* Another thread got here before we took the lock. */ 995 /* Another thread got here before we took the lock. */
@@ -943,7 +999,6 @@ do_group_exit(int exit_code)
943 zap_other_threads(current); 999 zap_other_threads(current);
944 } 1000 }
945 spin_unlock_irq(&sighand->siglock); 1001 spin_unlock_irq(&sighand->siglock);
946 read_unlock(&tasklist_lock);
947 } 1002 }
948 1003
949 do_exit(exit_code); 1004 do_exit(exit_code);
@@ -1273,7 +1328,7 @@ bail_ref:
1273 1328
1274 /* move to end of parent's list to avoid starvation */ 1329 /* move to end of parent's list to avoid starvation */
1275 remove_parent(p); 1330 remove_parent(p);
1276 add_parent(p, p->parent); 1331 add_parent(p);
1277 1332
1278 write_unlock_irq(&tasklist_lock); 1333 write_unlock_irq(&tasklist_lock);
1279 1334
diff --git a/kernel/extable.c b/kernel/extable.c
index 7501b531ce..7fe2628553 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,7 +40,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 40 return e;
41} 41}
42 42
43static int core_kernel_text(unsigned long addr) 43int core_kernel_text(unsigned long addr)
44{ 44{
45 if (addr >= (unsigned long)_stext && 45 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 46 addr <= (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index e0a2b449de..ac8100e308 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
84#endif 84#endif
85 85
86/* SLAB cache for signal_struct structures (tsk->signal) */ 86/* SLAB cache for signal_struct structures (tsk->signal) */
87kmem_cache_t *signal_cachep; 87static kmem_cache_t *signal_cachep;
88 88
89/* SLAB cache for sighand_struct structures (tsk->sighand) */ 89/* SLAB cache for sighand_struct structures (tsk->sighand) */
90kmem_cache_t *sighand_cachep; 90kmem_cache_t *sighand_cachep;
@@ -108,16 +108,12 @@ void free_task(struct task_struct *tsk)
108} 108}
109EXPORT_SYMBOL(free_task); 109EXPORT_SYMBOL(free_task);
110 110
111void __put_task_struct_cb(struct rcu_head *rhp) 111void __put_task_struct(struct task_struct *tsk)
112{ 112{
113 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
114
115 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); 113 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
116 WARN_ON(atomic_read(&tsk->usage)); 114 WARN_ON(atomic_read(&tsk->usage));
117 WARN_ON(tsk == current); 115 WARN_ON(tsk == current);
118 116
119 if (unlikely(tsk->audit_context))
120 audit_free(tsk);
121 security_task_free(tsk); 117 security_task_free(tsk);
122 free_uid(tsk->user); 118 free_uid(tsk->user);
123 put_group_info(tsk->group_info); 119 put_group_info(tsk->group_info);
@@ -182,6 +178,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
182 atomic_set(&tsk->usage,2); 178 atomic_set(&tsk->usage,2);
183 atomic_set(&tsk->fs_excl, 0); 179 atomic_set(&tsk->fs_excl, 0);
184 tsk->btrace_seq = 0; 180 tsk->btrace_seq = 0;
181 tsk->splice_pipe = NULL;
185 return tsk; 182 return tsk;
186} 183}
187 184
@@ -721,7 +718,7 @@ out_release:
721 free_fdset (new_fdt->open_fds, new_fdt->max_fdset); 718 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
722 free_fd_array(new_fdt->fd, new_fdt->max_fds); 719 free_fd_array(new_fdt->fd, new_fdt->max_fds);
723 kmem_cache_free(files_cachep, newf); 720 kmem_cache_free(files_cachep, newf);
724 goto out; 721 return NULL;
725} 722}
726 723
727static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 724static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
@@ -786,14 +783,6 @@ int unshare_files(void)
786 783
787EXPORT_SYMBOL(unshare_files); 784EXPORT_SYMBOL(unshare_files);
788 785
789void sighand_free_cb(struct rcu_head *rhp)
790{
791 struct sighand_struct *sp;
792
793 sp = container_of(rhp, struct sighand_struct, rcu);
794 kmem_cache_free(sighand_cachep, sp);
795}
796
797static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 786static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
798{ 787{
799 struct sighand_struct *sig; 788 struct sighand_struct *sig;
@@ -806,12 +795,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
806 rcu_assign_pointer(tsk->sighand, sig); 795 rcu_assign_pointer(tsk->sighand, sig);
807 if (!sig) 796 if (!sig)
808 return -ENOMEM; 797 return -ENOMEM;
809 spin_lock_init(&sig->siglock);
810 atomic_set(&sig->count, 1); 798 atomic_set(&sig->count, 1);
811 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 799 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
812 return 0; 800 return 0;
813} 801}
814 802
803void __cleanup_sighand(struct sighand_struct *sighand)
804{
805 if (atomic_dec_and_test(&sighand->count))
806 kmem_cache_free(sighand_cachep, sighand);
807}
808
815static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 809static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
816{ 810{
817 struct signal_struct *sig; 811 struct signal_struct *sig;
@@ -881,6 +875,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
881 return 0; 875 return 0;
882} 876}
883 877
878void __cleanup_signal(struct signal_struct *sig)
879{
880 exit_thread_group_keys(sig);
881 kmem_cache_free(signal_cachep, sig);
882}
883
884static inline void cleanup_signal(struct task_struct *tsk)
885{
886 struct signal_struct *sig = tsk->signal;
887
888 atomic_dec(&sig->live);
889
890 if (atomic_dec_and_test(&sig->count))
891 __cleanup_signal(sig);
892}
893
884static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 894static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
885{ 895{
886 unsigned long new_flags = p->flags; 896 unsigned long new_flags = p->flags;
@@ -1061,7 +1071,10 @@ static task_t *copy_process(unsigned long clone_flags,
1061 * Clear TID on mm_release()? 1071 * Clear TID on mm_release()?
1062 */ 1072 */
1063 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1073 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1064 1074 p->robust_list = NULL;
1075#ifdef CONFIG_COMPAT
1076 p->compat_robust_list = NULL;
1077#endif
1065 /* 1078 /*
1066 * sigaltstack should be cleared when sharing the same VM 1079 * sigaltstack should be cleared when sharing the same VM
1067 */ 1080 */
@@ -1092,6 +1105,7 @@ static task_t *copy_process(unsigned long clone_flags,
1092 * We dont wake it up yet. 1105 * We dont wake it up yet.
1093 */ 1106 */
1094 p->group_leader = p; 1107 p->group_leader = p;
1108 INIT_LIST_HEAD(&p->thread_group);
1095 INIT_LIST_HEAD(&p->ptrace_children); 1109 INIT_LIST_HEAD(&p->ptrace_children);
1096 INIT_LIST_HEAD(&p->ptrace_list); 1110 INIT_LIST_HEAD(&p->ptrace_list);
1097 1111
@@ -1115,16 +1129,6 @@ static task_t *copy_process(unsigned long clone_flags,
1115 !cpu_online(task_cpu(p)))) 1129 !cpu_online(task_cpu(p))))
1116 set_task_cpu(p, smp_processor_id()); 1130 set_task_cpu(p, smp_processor_id());
1117 1131
1118 /*
1119 * Check for pending SIGKILL! The new thread should not be allowed
1120 * to slip out of an OOM kill. (or normal SIGKILL.)
1121 */
1122 if (sigismember(&current->pending.signal, SIGKILL)) {
1123 write_unlock_irq(&tasklist_lock);
1124 retval = -EINTR;
1125 goto bad_fork_cleanup_namespace;
1126 }
1127
1128 /* CLONE_PARENT re-uses the old parent */ 1132 /* CLONE_PARENT re-uses the old parent */
1129 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1133 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1130 p->real_parent = current->real_parent; 1134 p->real_parent = current->real_parent;
@@ -1133,6 +1137,23 @@ static task_t *copy_process(unsigned long clone_flags,
1133 p->parent = p->real_parent; 1137 p->parent = p->real_parent;
1134 1138
1135 spin_lock(&current->sighand->siglock); 1139 spin_lock(&current->sighand->siglock);
1140
1141 /*
1142 * Process group and session signals need to be delivered to just the
1143 * parent before the fork or both the parent and the child after the
1144 * fork. Restart if a signal comes in before we add the new process to
1145 * it's process group.
1146 * A fatal signal pending means that current will exit, so the new
1147 * thread can't slip out of an OOM kill (or normal SIGKILL).
1148 */
1149 recalc_sigpending();
1150 if (signal_pending(current)) {
1151 spin_unlock(&current->sighand->siglock);
1152 write_unlock_irq(&tasklist_lock);
1153 retval = -ERESTARTNOINTR;
1154 goto bad_fork_cleanup_namespace;
1155 }
1156
1136 if (clone_flags & CLONE_THREAD) { 1157 if (clone_flags & CLONE_THREAD) {
1137 /* 1158 /*
1138 * Important: if an exit-all has been started then 1159 * Important: if an exit-all has been started then
@@ -1145,17 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags,
1145 retval = -EAGAIN; 1166 retval = -EAGAIN;
1146 goto bad_fork_cleanup_namespace; 1167 goto bad_fork_cleanup_namespace;
1147 } 1168 }
1148 p->group_leader = current->group_leader;
1149 1169
1150 if (current->signal->group_stop_count > 0) { 1170 p->group_leader = current->group_leader;
1151 /* 1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1152 * There is an all-stop in progress for the group.
1153 * We ourselves will stop as soon as we check signals.
1154 * Make the new thread part of that group stop too.
1155 */
1156 current->signal->group_stop_count++;
1157 set_tsk_thread_flag(p, TIF_SIGPENDING);
1158 }
1159 1172
1160 if (!cputime_eq(current->signal->it_virt_expires, 1173 if (!cputime_eq(current->signal->it_virt_expires,
1161 cputime_zero) || 1174 cputime_zero) ||
@@ -1178,23 +1191,25 @@ static task_t *copy_process(unsigned long clone_flags,
1178 */ 1191 */
1179 p->ioprio = current->ioprio; 1192 p->ioprio = current->ioprio;
1180 1193
1181 SET_LINKS(p); 1194 if (likely(p->pid)) {
1182 if (unlikely(p->ptrace & PT_PTRACED)) 1195 add_parent(p);
1183 __ptrace_link(p, current->parent); 1196 if (unlikely(p->ptrace & PT_PTRACED))
1184 1197 __ptrace_link(p, current->parent);
1185 if (thread_group_leader(p)) { 1198
1186 p->signal->tty = current->signal->tty; 1199 if (thread_group_leader(p)) {
1187 p->signal->pgrp = process_group(current); 1200 p->signal->tty = current->signal->tty;
1188 p->signal->session = current->signal->session; 1201 p->signal->pgrp = process_group(current);
1189 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1202 p->signal->session = current->signal->session;
1190 attach_pid(p, PIDTYPE_SID, p->signal->session); 1203 attach_pid(p, PIDTYPE_PGID, process_group(p));
1191 if (p->pid) 1204 attach_pid(p, PIDTYPE_SID, p->signal->session);
1205
1206 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1192 __get_cpu_var(process_counts)++; 1207 __get_cpu_var(process_counts)++;
1208 }
1209 attach_pid(p, PIDTYPE_PID, p->pid);
1210 nr_threads++;
1193 } 1211 }
1194 attach_pid(p, PIDTYPE_TGID, p->tgid);
1195 attach_pid(p, PIDTYPE_PID, p->pid);
1196 1212
1197 nr_threads++;
1198 total_forks++; 1213 total_forks++;
1199 spin_unlock(&current->sighand->siglock); 1214 spin_unlock(&current->sighand->siglock);
1200 write_unlock_irq(&tasklist_lock); 1215 write_unlock_irq(&tasklist_lock);
@@ -1209,9 +1224,9 @@ bad_fork_cleanup_mm:
1209 if (p->mm) 1224 if (p->mm)
1210 mmput(p->mm); 1225 mmput(p->mm);
1211bad_fork_cleanup_signal: 1226bad_fork_cleanup_signal:
1212 exit_signal(p); 1227 cleanup_signal(p);
1213bad_fork_cleanup_sighand: 1228bad_fork_cleanup_sighand:
1214 exit_sighand(p); 1229 __cleanup_sighand(p->sighand);
1215bad_fork_cleanup_fs: 1230bad_fork_cleanup_fs:
1216 exit_fs(p); /* blocking */ 1231 exit_fs(p); /* blocking */
1217bad_fork_cleanup_files: 1232bad_fork_cleanup_files:
@@ -1258,7 +1273,7 @@ task_t * __devinit fork_idle(int cpu)
1258 if (!task) 1273 if (!task)
1259 return ERR_PTR(-ENOMEM); 1274 return ERR_PTR(-ENOMEM);
1260 init_idle(task, cpu); 1275 init_idle(task, cpu);
1261 unhash_process(task); 1276
1262 return task; 1277 return task;
1263} 1278}
1264 1279
@@ -1293,17 +1308,19 @@ long do_fork(unsigned long clone_flags,
1293{ 1308{
1294 struct task_struct *p; 1309 struct task_struct *p;
1295 int trace = 0; 1310 int trace = 0;
1296 long pid = alloc_pidmap(); 1311 struct pid *pid = alloc_pid();
1312 long nr;
1297 1313
1298 if (pid < 0) 1314 if (!pid)
1299 return -EAGAIN; 1315 return -EAGAIN;
1316 nr = pid->nr;
1300 if (unlikely(current->ptrace)) { 1317 if (unlikely(current->ptrace)) {
1301 trace = fork_traceflag (clone_flags); 1318 trace = fork_traceflag (clone_flags);
1302 if (trace) 1319 if (trace)
1303 clone_flags |= CLONE_PTRACE; 1320 clone_flags |= CLONE_PTRACE;
1304 } 1321 }
1305 1322
1306 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1323 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1307 /* 1324 /*
1308 * Do this prior waking up the new thread - the thread pointer 1325 * Do this prior waking up the new thread - the thread pointer
1309 * might get invalid after that point, if the thread exits quickly. 1326 * might get invalid after that point, if the thread exits quickly.
@@ -1330,7 +1347,7 @@ long do_fork(unsigned long clone_flags,
1330 p->state = TASK_STOPPED; 1347 p->state = TASK_STOPPED;
1331 1348
1332 if (unlikely (trace)) { 1349 if (unlikely (trace)) {
1333 current->ptrace_message = pid; 1350 current->ptrace_message = nr;
1334 ptrace_notify ((trace << 8) | SIGTRAP); 1351 ptrace_notify ((trace << 8) | SIGTRAP);
1335 } 1352 }
1336 1353
@@ -1340,21 +1357,31 @@ long do_fork(unsigned long clone_flags,
1340 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1357 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1341 } 1358 }
1342 } else { 1359 } else {
1343 free_pidmap(pid); 1360 free_pid(pid);
1344 pid = PTR_ERR(p); 1361 nr = PTR_ERR(p);
1345 } 1362 }
1346 return pid; 1363 return nr;
1347} 1364}
1348 1365
1349#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1366#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1350#define ARCH_MIN_MMSTRUCT_ALIGN 0 1367#define ARCH_MIN_MMSTRUCT_ALIGN 0
1351#endif 1368#endif
1352 1369
1370static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
1371{
1372 struct sighand_struct *sighand = data;
1373
1374 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1375 SLAB_CTOR_CONSTRUCTOR)
1376 spin_lock_init(&sighand->siglock);
1377}
1378
1353void __init proc_caches_init(void) 1379void __init proc_caches_init(void)
1354{ 1380{
1355 sighand_cachep = kmem_cache_create("sighand_cache", 1381 sighand_cachep = kmem_cache_create("sighand_cache",
1356 sizeof(struct sighand_struct), 0, 1382 sizeof(struct sighand_struct), 0,
1357 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1383 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1384 sighand_ctor, NULL);
1358 signal_cachep = kmem_cache_create("signal_cache", 1385 signal_cachep = kmem_cache_create("signal_cache",
1359 sizeof(struct signal_struct), 0, 1386 sizeof(struct signal_struct), 0,
1360 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1387 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
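The combination used here — SLAB_DESTROY_BY_RCU plus a constructor that initializes siglock exactly once per slab object — is what makes lockless dereferences of tsk->sighand safe: the object may be recycled for another sighand_struct, but its memory stays type-stable until an RCU grace period has passed, so a reader can take the lock and then re-check the pointer. A hedged sketch of that reader-side pattern (the function name is illustrative, not something this patch adds):

#include <linux/sched.h>
#include <linux/rcupdate.h>

static struct sighand_struct *lock_sighand_sketch(struct task_struct *tsk,
                                                  unsigned long *flags)
{
        struct sighand_struct *sighand;

        for (;;) {
                rcu_read_lock();
                sighand = rcu_dereference(tsk->sighand);
                if (unlikely(sighand == NULL)) {
                        rcu_read_unlock();
                        break;          /* task is past __exit_signal() */
                }

                spin_lock_irqsave(&sighand->siglock, *flags);
                if (likely(sighand == tsk->sighand)) {
                        rcu_read_unlock();
                        break;          /* still attached: keep the lock */
                }
                /* the object was recycled under us: drop it and retry */
                spin_unlock_irqrestore(&sighand->siglock, *flags);
                rcu_read_unlock();
        }
        return sighand;
}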
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f9780..5699c51205 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
8 * Removed page pinning, fix privately mapped COW pages and other cleanups 8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier 9 * (C) Copyright 2003, 2004 Jamie Lokier
10 * 10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
11 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
12 * enough at me, Linus for the original (flawed) idea, Matthew 16 * enough at me, Linus for the original (flawed) idea, Matthew
13 * Kirkwood for proof-of-concept implementation. 17 * Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,172 @@ error:
829 goto out; 833 goto out;
830} 834}
831 835
836/*
837 * Support for robust futexes: the kernel cleans up held futexes at
838 * thread exit time.
839 *
840 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after
847 * acquiring the lock, but just before it could have added itself to
848 * the list. There can only be one such pending lock.
849 */
850
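The list described above lives entirely in userspace; the kernel only records its address per thread. A hedged userspace sketch of the registration step follows — in practice glibc/NPTL does this for every thread, and the struct layout and SYS_set_robust_list constant are taken from the headers this series introduces (older userspace headers would need the raw __NR_set_robust_list number instead).

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stddef.h>

/* Hedged sketch: register an initially empty per-thread robust list.
 * An empty list is circular: head.list.next points back at the head. */
static struct robust_list_head robust_head = {
        .list            = { .next = &robust_head.list },
        .futex_offset    = 0,   /* offset from a list entry to its futex word */
        .list_op_pending = NULL,
};

static long register_robust_list(void)
{
        return syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
}

At exit time the kernel walks whatever the registered head points to (exit_robust_list() below), marks each futex word still owned by the dying thread with FUTEX_OWNER_DIED, and wakes one waiter if FUTEX_WAITERS is set.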
851/**
852 * sys_set_robust_list - set the robust-futex list head of a task
853 * @head: pointer to the list-head
854 * @len: length of the list-head, as userspace expects
855 */
856asmlinkage long
857sys_set_robust_list(struct robust_list_head __user *head,
858 size_t len)
859{
860 /*
861 * The kernel knows only one size for now:
862 */
863 if (unlikely(len != sizeof(*head)))
864 return -EINVAL;
865
866 current->robust_list = head;
867
868 return 0;
869}
870
871/**
872 * sys_get_robust_list - get the robust-futex list head of a task
873 * @pid: pid of the process [zero for current task]
874 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
875 * @len_ptr: pointer to a length field, the kernel fills in the header size
876 */
877asmlinkage long
878sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
879 size_t __user *len_ptr)
880{
881 struct robust_list_head *head;
882 unsigned long ret;
883
884 if (!pid)
885 head = current->robust_list;
886 else {
887 struct task_struct *p;
888
889 ret = -ESRCH;
890 read_lock(&tasklist_lock);
891 p = find_task_by_pid(pid);
892 if (!p)
893 goto err_unlock;
894 ret = -EPERM;
895 if ((current->euid != p->euid) && (current->euid != p->uid) &&
896 !capable(CAP_SYS_PTRACE))
897 goto err_unlock;
898 head = p->robust_list;
899 read_unlock(&tasklist_lock);
900 }
901
902 if (put_user(sizeof(*head), len_ptr))
903 return -EFAULT;
904 return put_user(head, head_ptr);
905
906err_unlock:
907 read_unlock(&tasklist_lock);
908
909 return ret;
910}
911
912/*
913 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so:
915 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{
918 u32 uval;
919
920retry:
921 if (get_user(uval, uaddr))
922 return -1;
923
924 if ((uval & FUTEX_TID_MASK) == curr->pid) {
925 /*
926 * Ok, this dying thread is truly holding a futex
927 * of interest. Set the OWNER_DIED bit atomically
928 * via cmpxchg, and if the value had FUTEX_WAITERS
929 * set, wake up a waiter (if any). (We have to do a
930 * futex_wake() even if OWNER_DIED is already set -
931 * to handle the rare but possible case of recursive
932 * thread-death.) The rest of the cleanup is done in
933 * userspace.
934 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval)
937 goto retry;
938
939 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1);
941 }
942 return 0;
943}
944
945/*
946 * Walk curr->robust_list (very carefully, it's a userspace list!)
947 * and mark any locks found there dead, and notify any waiters.
948 *
949 * We silently return on any sign of list-walking problem.
950 */
951void exit_robust_list(struct task_struct *curr)
952{
953 struct robust_list_head __user *head = curr->robust_list;
954 struct robust_list __user *entry, *pending;
955 unsigned int limit = ROBUST_LIST_LIMIT;
956 unsigned long futex_offset;
957
958 /*
959 * Fetch the list head (which was registered earlier, via
960 * sys_set_robust_list()):
961 */
962 if (get_user(entry, &head->list.next))
963 return;
964 /*
965 * Fetch the relative futex offset:
966 */
967 if (get_user(futex_offset, &head->futex_offset))
968 return;
969 /*
970 * Fetch any possibly pending lock-add first, and handle it
971 * if it exists:
972 */
973 if (get_user(pending, &head->list_op_pending))
974 return;
975 if (pending)
976 handle_futex_death((void *)pending + futex_offset, curr);
977
978 while (entry != &head->list) {
979 /*
980 * A pending lock might already be on the list, so
981 * don't process it twice:
982 */
983 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset,
985 curr))
986 return;
987 /*
988 * Fetch the next entry in the list:
989 */
990 if (get_user(entry, &entry->next))
991 return;
992 /*
993 * Avoid excessively long or circular lists:
994 */
995 if (!--limit)
996 break;
997
998 cond_resched();
999 }
1000}
1001
832long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
833 unsigned long uaddr2, int val2, int val3) 1003 unsigned long uaddr2, int val2, int val3)
834{ 1004{
@@ -869,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
869 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
870 int val2 = 0; 1040 int val2 = 0;
871 1041
872 if ((op == FUTEX_WAIT) && utime) { 1042 if (utime && (op == FUTEX_WAIT)) {
873 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1043 if (copy_from_user(&t, utime, sizeof(t)) != 0)
874 return -EFAULT; 1044 return -EFAULT;
1045 if (!timespec_valid(&t))
1046 return -EINVAL;
875 timeout = timespec_to_jiffies(&t) + 1; 1047 timeout = timespec_to_jiffies(&t) + 1;
876 } 1048 }
877 /* 1049 /*
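Seen from userspace, the new timespec_valid() check turns a malformed relative timeout into an explicit EINVAL instead of silently converting it to jiffies. A hedged sketch (raw syscall shown only for illustration; uaddr and expected are whatever futex word and value the caller is waiting on):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>

/* Hedged sketch: FUTEX_WAIT with a relative timeout. With the change
 * above, tv_nsec >= 1000000000 or a negative tv_sec now fails with
 * EINVAL before the value is turned into a jiffies timeout. */
static long futex_wait(int *uaddr, int expected, const struct timespec *rel)
{
        return syscall(SYS_futex, uaddr, FUTEX_WAIT, expected, rel, NULL, 0);
}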
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 0000000000..1ab6a0ea3d
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,144 @@
1/*
2 * linux/kernel/futex_compat.c
3 *
4 * Futex compatibility routines.
5 *
6 * Copyright 2006, Red Hat, Inc., Ingo Molnar
7 */
8
9#include <linux/linkage.h>
10#include <linux/compat.h>
11#include <linux/futex.h>
12
13#include <asm/uaccess.h>
14
15/*
16 * Walk curr->compat_robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters.
18 *
19 * We silently return on any sign of list-walking problem.
20 */
21void compat_exit_robust_list(struct task_struct *curr)
22{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending;
25 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset;
28
29 /*
30 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()):
32 */
33 if (get_user(uentry, &head->list.next))
34 return;
35 entry = compat_ptr(uentry);
36 /*
37 * Fetch the relative futex offset:
38 */
39 if (get_user(futex_offset, &head->futex_offset))
40 return;
41 /*
42 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists:
44 */
45 if (get_user(upending, &head->list_op_pending))
46 return;
47 pending = compat_ptr(upending);
48 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr);
50
51 while (compat_ptr(uentry) != &head->list) {
52 /*
53 * A pending lock might already be on the list, so
54 * don't process it twice:
55 */
56 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset,
58 curr))
59 return;
60
61 /*
62 * Fetch the next entry in the list:
63 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next))
65 return;
66 entry = compat_ptr(uentry);
67 /*
68 * Avoid excessively long or circular lists:
69 */
70 if (!--limit)
71 break;
72
73 cond_resched();
74 }
75}
76
77asmlinkage long
78compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
79 compat_size_t len)
80{
81 if (unlikely(len != sizeof(*head)))
82 return -EINVAL;
83
84 current->compat_robust_list = head;
85
86 return 0;
87}
88
89asmlinkage long
90compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
91 compat_size_t __user *len_ptr)
92{
93 struct compat_robust_list_head *head;
94 unsigned long ret;
95
96 if (!pid)
97 head = current->compat_robust_list;
98 else {
99 struct task_struct *p;
100
101 ret = -ESRCH;
102 read_lock(&tasklist_lock);
103 p = find_task_by_pid(pid);
104 if (!p)
105 goto err_unlock;
106 ret = -EPERM;
107 if ((current->euid != p->euid) && (current->euid != p->uid) &&
108 !capable(CAP_SYS_PTRACE))
109 goto err_unlock;
110 head = p->compat_robust_list;
111 read_unlock(&tasklist_lock);
112 }
113
114 if (put_user(sizeof(*head), len_ptr))
115 return -EFAULT;
116 return put_user(ptr_to_compat(head), head_ptr);
117
118err_unlock:
119 read_unlock(&tasklist_lock);
120
121 return ret;
122}
123
124asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
125 struct compat_timespec __user *utime, u32 __user *uaddr2,
126 u32 val3)
127{
128 struct timespec t;
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0;
131
132 if (utime && (op == FUTEX_WAIT)) {
133 if (get_compat_timespec(&t, utime))
134 return -EFAULT;
135 if (!timespec_valid(&t))
136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1;
138 }
139 if (op >= FUTEX_REQUEUE)
140 val2 = (int) (unsigned long) utime;
141
142 return do_futex((unsigned long)uaddr, op, val, timeout,
143 (unsigned long)uaddr2, val2, val3);
144}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0237a556eb..01fa2ae98a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -456,6 +456,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
456 456
457 return ret; 457 return ret;
458} 458}
459EXPORT_SYMBOL_GPL(hrtimer_start);
459 460
460/** 461/**
461 * hrtimer_try_to_cancel - try to deactivate a timer 462 * hrtimer_try_to_cancel - try to deactivate a timer
@@ -484,6 +485,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
484 return ret; 485 return ret;
485 486
486} 487}
488EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
487 489
488/** 490/**
489 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 491 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
@@ -501,8 +503,10 @@ int hrtimer_cancel(struct hrtimer *timer)
501 503
502 if (ret >= 0) 504 if (ret >= 0)
503 return ret; 505 return ret;
506 cpu_relax();
504 } 507 }
505} 508}
509EXPORT_SYMBOL_GPL(hrtimer_cancel);
506 510
507/** 511/**
508 * hrtimer_get_remaining - get remaining time for the timer 512 * hrtimer_get_remaining - get remaining time for the timer
@@ -521,6 +525,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
521 525
522 return rem; 526 return rem;
523} 527}
528EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
524 529
525#ifdef CONFIG_NO_IDLE_HZ 530#ifdef CONFIG_NO_IDLE_HZ
526/** 531/**
@@ -579,6 +584,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
579 timer->base = &bases[clock_id]; 584 timer->base = &bases[clock_id];
580 timer->node.rb_parent = HRTIMER_INACTIVE; 585 timer->node.rb_parent = HRTIMER_INACTIVE;
581} 586}
587EXPORT_SYMBOL_GPL(hrtimer_init);
582 588
583/** 589/**
584 * hrtimer_get_res - get the timer resolution for a clock 590 * hrtimer_get_res - get the timer resolution for a clock
@@ -598,6 +604,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
598 604
599 return 0; 605 return 0;
600} 606}
607EXPORT_SYMBOL_GPL(hrtimer_get_res);
601 608
602/* 609/*
603 * Expire the per base hrtimer-queue: 610 * Expire the per base hrtimer-queue:
@@ -606,6 +613,9 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
606{ 613{
607 struct rb_node *node; 614 struct rb_node *node;
608 615
616 if (!base->first)
617 return;
618
609 if (base->get_softirq_time) 619 if (base->get_softirq_time)
610 base->softirq_time = base->get_softirq_time(); 620 base->softirq_time = base->get_softirq_time();
611 621
@@ -655,29 +665,28 @@ void hrtimer_run_queues(void)
655/* 665/*
656 * Sleep related functions: 666 * Sleep related functions:
657 */ 667 */
658 668static int hrtimer_wakeup(struct hrtimer *timer)
659struct sleep_hrtimer {
660 struct hrtimer timer;
661 struct task_struct *task;
662 int expired;
663};
664
665static int nanosleep_wakeup(struct hrtimer *timer)
666{ 669{
667 struct sleep_hrtimer *t = 670 struct hrtimer_sleeper *t =
668 container_of(timer, struct sleep_hrtimer, timer); 671 container_of(timer, struct hrtimer_sleeper, timer);
672 struct task_struct *task = t->task;
669 673
670 t->expired = 1; 674 t->task = NULL;
671 wake_up_process(t->task); 675 if (task)
676 wake_up_process(task);
672 677
673 return HRTIMER_NORESTART; 678 return HRTIMER_NORESTART;
674} 679}
675 680
676static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode) 681void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
677{ 682{
678 t->timer.function = nanosleep_wakeup; 683 sl->timer.function = hrtimer_wakeup;
679 t->task = current; 684 sl->task = task;
680 t->expired = 0; 685}
686
687static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
688{
689 hrtimer_init_sleeper(t, current);
681 690
682 do { 691 do {
683 set_current_state(TASK_INTERRUPTIBLE); 692 set_current_state(TASK_INTERRUPTIBLE);
@@ -685,18 +694,17 @@ static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
685 694
686 schedule(); 695 schedule();
687 696
688 if (unlikely(!t->expired)) { 697 hrtimer_cancel(&t->timer);
689 hrtimer_cancel(&t->timer); 698 mode = HRTIMER_ABS;
690 mode = HRTIMER_ABS; 699
691 } 700 } while (t->task && !signal_pending(current));
692 } while (!t->expired && !signal_pending(current));
693 701
694 return t->expired; 702 return t->task == NULL;
695} 703}
696 704
697static long __sched nanosleep_restart(struct restart_block *restart) 705static long __sched nanosleep_restart(struct restart_block *restart)
698{ 706{
699 struct sleep_hrtimer t; 707 struct hrtimer_sleeper t;
700 struct timespec __user *rmtp; 708 struct timespec __user *rmtp;
701 struct timespec tu; 709 struct timespec tu;
702 ktime_t time; 710 ktime_t time;
@@ -729,7 +737,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
729 const enum hrtimer_mode mode, const clockid_t clockid) 737 const enum hrtimer_mode mode, const clockid_t clockid)
730{ 738{
731 struct restart_block *restart; 739 struct restart_block *restart;
732 struct sleep_hrtimer t; 740 struct hrtimer_sleeper t;
733 struct timespec tu; 741 struct timespec tu;
734 ktime_t rem; 742 ktime_t rem;
735 743
@@ -834,7 +842,7 @@ static void migrate_hrtimers(int cpu)
834} 842}
835#endif /* CONFIG_HOTPLUG_CPU */ 843#endif /* CONFIG_HOTPLUG_CPU */
836 844
837static int __devinit hrtimer_cpu_notify(struct notifier_block *self, 845static int hrtimer_cpu_notify(struct notifier_block *self,
838 unsigned long action, void *hcpu) 846 unsigned long action, void *hcpu)
839{ 847{
840 long cpu = (long)hcpu; 848 long cpu = (long)hcpu;
@@ -858,7 +866,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
858 return NOTIFY_OK; 866 return NOTIFY_OK;
859} 867}
860 868
861static struct notifier_block __devinitdata hrtimers_nb = { 869static struct notifier_block hrtimers_nb = {
862 .notifier_call = hrtimer_cpu_notify, 870 .notifier_call = hrtimer_cpu_notify,
863}; 871};
864 872
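The hrtimer_sleeper introduced here bundles the timer with the task to wake, and hrtimer_wakeup() clears t.task when it fires, which is how callers distinguish expiry from an interruption. A hedged sketch of a kernel-side caller, modelled on do_nanosleep() above (the function itself is illustrative, not part of this patch):

/* Hedged sketch: sleep until an absolute CLOCK_MONOTONIC expiry using
 * the new hrtimer_sleeper. Returns 1 if the timer expired, 0 if the
 * sleep was interrupted by a signal. */
static int sleep_until_sketch(ktime_t expires)
{
        struct hrtimer_sleeper t;

        hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_ABS);
        t.timer.expires = expires;
        hrtimer_init_sleeper(&t, current);

        do {
                set_current_state(TASK_INTERRUPTIBLE);
                hrtimer_start(&t.timer, t.timer.expires, HRTIMER_ABS);

                schedule();

                hrtimer_cancel(&t.timer);
        } while (t.task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);
        return t.task == NULL;
}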
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2b33f852be..9f77f50d81 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,4 +1,5 @@
1 1
2obj-y := handle.o manage.o spurious.o migration.o 2obj-y := handle.o manage.o spurious.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac766ad573..1279e34995 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -246,8 +246,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
246 246
247mismatch: 247mismatch:
248 spin_unlock_irqrestore(&desc->lock, flags); 248 spin_unlock_irqrestore(&desc->lock, flags);
249 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); 249 if (!(new->flags & SA_PROBEIRQ)) {
250 dump_stack(); 250 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
251 dump_stack();
252 }
251 return -EBUSY; 253 return -EBUSY;
252} 254}
253 255
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 52a8655fa0..134f9f2e0e 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,6 +1,5 @@
1#include <linux/irq.h>
2 1
3#if defined(CONFIG_GENERIC_PENDING_IRQ) 2#include <linux/irq.h>
4 3
5void set_pending_irq(unsigned int irq, cpumask_t mask) 4void set_pending_irq(unsigned int irq, cpumask_t mask)
6{ 5{
@@ -61,5 +60,3 @@ void move_native_irq(int irq)
61 } 60 }
62 cpus_clear(pending_irq_cpumask[irq]); 61 cpus_clear(pending_irq_cpumask[irq]);
63} 62}
64
65#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 51a892063a..20a997c73c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data)
170 sa.sa.sa_handler = SIG_IGN; 170 sa.sa.sa_handler = SIG_IGN;
171 sa.sa.sa_flags = 0; 171 sa.sa.sa_flags = 0;
172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); 172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
173 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); 173 do_sigaction(SIGCHLD, &sa, NULL);
174 allow_signal(SIGCHLD); 174 allow_signal(SIGCHLD);
175 175
176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1156eb0977..1fbf466a29 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -585,6 +585,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
585 int i; 585 int i;
586 586
587 rp->kp.pre_handler = pre_handler_kretprobe; 587 rp->kp.pre_handler = pre_handler_kretprobe;
588 rp->kp.post_handler = NULL;
589 rp->kp.fault_handler = NULL;
590 rp->kp.break_handler = NULL;
588 591
589 /* Pre-allocate memory for max kretprobe instances */ 592 /* Pre-allocate memory for max kretprobe instances */
590 if (rp->maxactive <= 0) { 593 if (rp->maxactive <= 0) {
diff --git a/kernel/module.c b/kernel/module.c
index ddfe45ac2f..bbe04862e1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -64,26 +64,17 @@ static DEFINE_SPINLOCK(modlist_lock);
64static DEFINE_MUTEX(module_mutex); 64static DEFINE_MUTEX(module_mutex);
65static LIST_HEAD(modules); 65static LIST_HEAD(modules);
66 66
67static DEFINE_MUTEX(notify_mutex); 67static BLOCKING_NOTIFIER_HEAD(module_notify_list);
68static struct notifier_block * module_notify_list;
69 68
70int register_module_notifier(struct notifier_block * nb) 69int register_module_notifier(struct notifier_block * nb)
71{ 70{
72 int err; 71 return blocking_notifier_chain_register(&module_notify_list, nb);
73 mutex_lock(&notify_mutex);
74 err = notifier_chain_register(&module_notify_list, nb);
75 mutex_unlock(&notify_mutex);
76 return err;
77} 72}
78EXPORT_SYMBOL(register_module_notifier); 73EXPORT_SYMBOL(register_module_notifier);
79 74
80int unregister_module_notifier(struct notifier_block * nb) 75int unregister_module_notifier(struct notifier_block * nb)
81{ 76{
82 int err; 77 return blocking_notifier_chain_unregister(&module_notify_list, nb);
83 mutex_lock(&notify_mutex);
84 err = notifier_chain_unregister(&module_notify_list, nb);
85 mutex_unlock(&notify_mutex);
86 return err;
87} 78}
88EXPORT_SYMBOL(unregister_module_notifier); 79EXPORT_SYMBOL(unregister_module_notifier);
89 80
@@ -136,7 +127,7 @@ extern const unsigned long __start___kcrctab_gpl_future[];
136#ifndef CONFIG_MODVERSIONS 127#ifndef CONFIG_MODVERSIONS
137#define symversion(base, idx) NULL 128#define symversion(base, idx) NULL
138#else 129#else
139#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) 130#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
140#endif 131#endif
141 132
142/* lookup symbol in given range of kernel_symbols */ 133/* lookup symbol in given range of kernel_symbols */
@@ -714,14 +705,14 @@ EXPORT_SYMBOL(__symbol_put);
714 705
715void symbol_put_addr(void *addr) 706void symbol_put_addr(void *addr)
716{ 707{
717 unsigned long flags; 708 struct module *modaddr;
718 709
719 spin_lock_irqsave(&modlist_lock, flags); 710 if (core_kernel_text((unsigned long)addr))
720 if (!kernel_text_address((unsigned long)addr)) 711 return;
721 BUG();
722 712
723 module_put(module_text_address((unsigned long)addr)); 713 if (!(modaddr = module_text_address((unsigned long)addr)))
724 spin_unlock_irqrestore(&modlist_lock, flags); 714 BUG();
715 module_put(modaddr);
725} 716}
726EXPORT_SYMBOL_GPL(symbol_put_addr); 717EXPORT_SYMBOL_GPL(symbol_put_addr);
727 718
@@ -1263,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
1263 || strcmp(license, "GPL v2") == 0 1254 || strcmp(license, "GPL v2") == 0
1264 || strcmp(license, "GPL and additional rights") == 0 1255 || strcmp(license, "GPL and additional rights") == 0
1265 || strcmp(license, "Dual BSD/GPL") == 0 1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1266 || strcmp(license, "Dual MPL/GPL") == 0); 1258 || strcmp(license, "Dual MPL/GPL") == 0);
1267} 1259}
1268 1260
@@ -1816,9 +1808,8 @@ sys_init_module(void __user *umod,
1816 /* Drop lock so they can recurse */ 1808 /* Drop lock so they can recurse */
1817 mutex_unlock(&module_mutex); 1809 mutex_unlock(&module_mutex);
1818 1810
1819 mutex_lock(&notify_mutex); 1811 blocking_notifier_call_chain(&module_notify_list,
1820 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1812 MODULE_STATE_COMING, mod);
1821 mutex_unlock(&notify_mutex);
1822 1813
1823 /* Start the module */ 1814 /* Start the module */
1824 if (mod->init != NULL) 1815 if (mod->init != NULL)
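The module.c hunks above drop the private notify_mutex plus open-coded notifier list in favour of BLOCKING_NOTIFIER_HEAD(), whose register/unregister/call helpers carry their own locking. A minimal sketch of a client of the unchanged external interface (the callback name and body are illustrative, not part of this patch):

static int my_module_event(struct notifier_block *nb,
			   unsigned long state, void *data)
{
	struct module *mod = data;

	if (state == MODULE_STATE_COMING)
		printk(KERN_INFO "module %s coming up\n", mod->name);
	return NOTIFY_OK;
}

static struct notifier_block my_module_nb = {
	.notifier_call	= my_module_event,
};

	/* typically called from an __init function */
	register_module_notifier(&my_module_nb);

Existing callers need no source change; only the chain's representation and locking move into the generic notifier code.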
diff --git a/kernel/panic.c b/kernel/panic.c
index acd95adddb..cc2a4c9c36 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,9 +27,8 @@ static int pause_on_oops_flag;
27static DEFINE_SPINLOCK(pause_on_oops_lock); 27static DEFINE_SPINLOCK(pause_on_oops_lock);
28 28
29int panic_timeout; 29int panic_timeout;
30EXPORT_SYMBOL(panic_timeout);
31 30
32struct notifier_block *panic_notifier_list; 31ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
33 32
34EXPORT_SYMBOL(panic_notifier_list); 33EXPORT_SYMBOL(panic_notifier_list);
35 34
@@ -97,7 +96,7 @@ NORET_TYPE void panic(const char * fmt, ...)
97 smp_send_stop(); 96 smp_send_stop();
98#endif 97#endif
99 98
100 notifier_call_chain(&panic_notifier_list, 0, buf); 99 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
101 100
102 if (!panic_blink) 101 if (!panic_blink)
103 panic_blink = no_blink; 102 panic_blink = no_blink;
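panic_notifier_list likewise becomes an ATOMIC_NOTIFIER_HEAD, so panic() can walk it via atomic_notifier_call_chain() without taking a blocking lock in a context that may be holding spinlocks or running with interrupts off. A hedged sketch of how a driver would now hook the chain (the callback name is illustrative):

static int my_panic_event(struct notifier_block *nb,
			  unsigned long event, void *msg)
{
	/* msg points at the formatted panic message buffer */
	return NOTIFY_DONE;
}

static struct notifier_block my_panic_nb = {
	.notifier_call	= my_panic_event,
};

	atomic_notifier_chain_register(&panic_notifier_list, &my_panic_nb);

Note that EXPORT_SYMBOL(panic_timeout) is also removed here, so modules can no longer reference that variable directly.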
diff --git a/kernel/params.c b/kernel/params.c
index 9de637a5c8..af43ecdc8d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,7 +31,7 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34static inline int dash2underscore(char c) 34static inline char dash2underscore(char c)
35{ 35{
36 if (c == '-') 36 if (c == '-')
37 return '_'; 37 return '_';
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc072469..eeb836b65c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
28#include <linux/hash.h> 28#include <linux/hash.h>
29 29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX]; 31static struct hlist_head *pid_hash;
32static int pidhash_shift; 32static int pidhash_shift;
33static kmem_cache_t *pid_cachep;
33 34
34int pid_max = PID_MAX_DEFAULT; 35int pid_max = PID_MAX_DEFAULT;
35int last_pid; 36int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 61static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; 62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62 63
64/*
65 * Note: disable interrupts while the pidmap_lock is held as an
66 * interrupt might come in and do read_lock(&tasklist_lock).
67 *
68 * If we don't disable interrupts there is a nasty deadlock between
69 * detach_pid()->free_pid() and another cpu that does
70 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
71 * read_lock(&tasklist_lock);
72 *
73 * After we clean up the tasklist_lock and know there are no
74 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen.
76 */
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64 78
65fastcall void free_pidmap(int pid) 79static fastcall void free_pidmap(int pid)
66{ 80{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK; 82 int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
71 atomic_inc(&map->nr_free); 85 atomic_inc(&map->nr_free);
72} 86}
73 87
74int alloc_pidmap(void) 88static int alloc_pidmap(void)
75{ 89{
76 int i, offset, max_scan, pid, last = last_pid; 90 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map; 91 pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
89 * Free the page if someone raced with us 103 * Free the page if someone raced with us
90 * installing it: 104 * installing it:
91 */ 105 */
92 spin_lock(&pidmap_lock); 106 spin_lock_irq(&pidmap_lock);
93 if (map->page) 107 if (map->page)
94 free_page(page); 108 free_page(page);
95 else 109 else
96 map->page = (void *)page; 110 map->page = (void *)page;
97 spin_unlock(&pidmap_lock); 111 spin_unlock_irq(&pidmap_lock);
98 if (unlikely(!map->page)) 112 if (unlikely(!map->page))
99 break; 113 break;
100 } 114 }
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
131 return -1; 145 return -1;
132} 146}
133 147
134struct pid * fastcall find_pid(enum pid_type type, int nr) 148fastcall void put_pid(struct pid *pid)
149{
150 if (!pid)
151 return;
152 if ((atomic_read(&pid->count) == 1) ||
153 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid);
155}
156
157static void delayed_put_pid(struct rcu_head *rhp)
158{
159 struct pid *pid = container_of(rhp, struct pid, rcu);
160 put_pid(pid);
161}
162
163fastcall void free_pid(struct pid *pid)
164{
165 /* We can be called with write_lock_irq(&tasklist_lock) held */
166 unsigned long flags;
167
168 spin_lock_irqsave(&pidmap_lock, flags);
169 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags);
171
172 free_pidmap(pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid);
174}
175
176struct pid *alloc_pid(void)
177{
178 struct pid *pid;
179 enum pid_type type;
180 int nr = -1;
181
182 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
183 if (!pid)
184 goto out;
185
186 nr = alloc_pidmap();
187 if (nr < 0)
188 goto out_free;
189
190 atomic_set(&pid->count, 1);
191 pid->nr = nr;
192 for (type = 0; type < PIDTYPE_MAX; ++type)
193 INIT_HLIST_HEAD(&pid->tasks[type]);
194
195 spin_lock_irq(&pidmap_lock);
196 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
197 spin_unlock_irq(&pidmap_lock);
198
199out:
200 return pid;
201
202out_free:
203 kmem_cache_free(pid_cachep, pid);
204 pid = NULL;
205 goto out;
206}
207
208struct pid * fastcall find_pid(int nr)
135{ 209{
136 struct hlist_node *elem; 210 struct hlist_node *elem;
137 struct pid *pid; 211 struct pid *pid;
138 212
139 hlist_for_each_entry_rcu(pid, elem, 213 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 214 &pid_hash[pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 215 if (pid->nr == nr)
142 return pid; 216 return pid;
143 } 217 }
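struct pid is now a stand-alone, reference-counted object: alloc_pid() hashes it under pidmap_lock with interrupts disabled (see the comment above), free_pid() unhashes it and defers the final put through RCU, and find_pid() walks the hash under RCU without taking a reference. A sketch of the resulting lookup rules, assuming get_pid() is the reference-taking helper added by the matching header change (not shown in this kernel/ diff):

	struct pid *pid;

	rcu_read_lock();
	pid = find_pid(nr);		/* RCU-protected, no reference held */
	if (pid)
		get_pid(pid);		/* pin it before leaving the RCU section */
	rcu_read_unlock();

	/* ... use pid ... */
	put_pid(pid);			/* put_pid(NULL) is a no-op */

find_get_pid() below wraps exactly this pattern.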
@@ -146,105 +220,80 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
146 220
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{ 222{
149 struct pid *pid, *task_pid; 223 struct pid_link *link;
150 224 struct pid *pid;
151 task_pid = &task->pids[type];
152 pid = find_pid(type, nr);
153 task_pid->nr = nr;
154 if (pid == NULL) {
155 INIT_LIST_HEAD(&task_pid->pid_list);
156 hlist_add_head_rcu(&task_pid->pid_chain,
157 &pid_hash[type][pid_hashfn(nr)]);
158 } else {
159 INIT_HLIST_NODE(&task_pid->pid_chain);
160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
161 }
162
163 return 0;
164}
165
166static fastcall int __detach_pid(task_t *task, enum pid_type type)
167{
168 struct pid *pid, *pid_next;
169 int nr = 0;
170
171 pid = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) {
173 225
174 if (list_empty(&pid->pid_list)) { 226 WARN_ON(!task->pid); /* to be removed soon */
175 nr = pid->nr; 227 WARN_ON(!nr); /* to be removed soon */
176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_next->pid_chain);
183 }
184 }
185 228
186 list_del_rcu(&pid->pid_list); 229 link = &task->pids[type];
187 pid->nr = 0; 230 link->pid = pid = find_pid(nr);
231 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
188 232
189 return nr; 233 return 0;
190} 234}
191 235
192void fastcall detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(task_t *task, enum pid_type type)
193{ 237{
194 int tmp, nr; 238 struct pid_link *link;
239 struct pid *pid;
240 int tmp;
195 241
196 nr = __detach_pid(task, type); 242 link = &task->pids[type];
197 if (!nr) 243 pid = link->pid;
198 return; 244
245 hlist_del_rcu(&link->node);
246 link->pid = NULL;
199 247
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 248 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
201 if (tmp != type && find_pid(tmp, nr)) 249 if (!hlist_empty(&pid->tasks[tmp]))
202 return; 250 return;
203 251
204 free_pidmap(nr); 252 free_pid(pid);
205} 253}
206 254
207task_t *find_task_by_pid_type(int type, int nr) 255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
208{ 256{
209 struct pid *pid; 257 struct task_struct *result = NULL;
210 258 if (pid) {
211 pid = find_pid(type, nr); 259 struct hlist_node *first;
212 if (!pid) 260 first = rcu_dereference(pid->tasks[type].first);
213 return NULL; 261 if (first)
262 result = hlist_entry(first, struct task_struct, pids[(type)].node);
263 }
264 return result;
265}
214 266
215 return pid_task(&pid->pid_list, type); 267/*
268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
269 */
270task_t *find_task_by_pid_type(int type, int nr)
271{
272 return pid_task(find_pid(nr), type);
216} 273}
217 274
218EXPORT_SYMBOL(find_task_by_pid_type); 275EXPORT_SYMBOL(find_task_by_pid_type);
219 276
220/* 277struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
221 * This function switches the PIDs if a non-leader thread calls 278{
222 * sys_execve() - this must be done without releasing the PID. 279 struct task_struct *result;
223 * (which a detach_pid() would eventually do.) 280 rcu_read_lock();
224 */ 281 result = pid_task(pid, type);
225void switch_exec_pids(task_t *leader, task_t *thread) 282 if (result)
283 get_task_struct(result);
284 rcu_read_unlock();
285 return result;
286}
287
288struct pid *find_get_pid(pid_t nr)
226{ 289{
227 __detach_pid(leader, PIDTYPE_PID); 290 struct pid *pid;
228 __detach_pid(leader, PIDTYPE_TGID); 291
229 __detach_pid(leader, PIDTYPE_PGID); 292 rcu_read_lock();
230 __detach_pid(leader, PIDTYPE_SID); 293 pid = get_pid(find_pid(nr));
231 294 rcu_read_unlock();
232 __detach_pid(thread, PIDTYPE_PID); 295
233 __detach_pid(thread, PIDTYPE_TGID); 296 return pid;
234
235 leader->pid = leader->tgid = thread->pid;
236 thread->pid = thread->tgid;
237
238 attach_pid(thread, PIDTYPE_PID, thread->pid);
239 attach_pid(thread, PIDTYPE_TGID, thread->tgid);
240 attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
241 attach_pid(thread, PIDTYPE_SID, thread->signal->session);
242 list_add_tail(&thread->tasks, &init_task.tasks);
243
244 attach_pid(leader, PIDTYPE_PID, leader->pid);
245 attach_pid(leader, PIDTYPE_TGID, leader->tgid);
246 attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
247 attach_pid(leader, PIDTYPE_SID, leader->signal->session);
248} 297}
249 298
250/* 299/*
@@ -254,7 +303,7 @@ void switch_exec_pids(task_t *leader, task_t *thread)
254 */ 303 */
255void __init pidhash_init(void) 304void __init pidhash_init(void)
256{ 305{
257 int i, j, pidhash_size; 306 int i, pidhash_size;
258 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 307 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
259 308
260 pidhash_shift = max(4, fls(megabytes * 4)); 309 pidhash_shift = max(4, fls(megabytes * 4));
@@ -263,30 +312,23 @@ void __init pidhash_init(void)
263 312
264 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 313 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
265 pidhash_size, pidhash_shift, 314 pidhash_size, pidhash_shift,
266 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); 315 pidhash_size * sizeof(struct hlist_head));
267 316
268 for (i = 0; i < PIDTYPE_MAX; i++) { 317 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
269 pid_hash[i] = alloc_bootmem(pidhash_size * 318 if (!pid_hash)
270 sizeof(*(pid_hash[i]))); 319 panic("Could not alloc pidhash!\n");
271 if (!pid_hash[i]) 320 for (i = 0; i < pidhash_size; i++)
272 panic("Could not alloc pidhash!\n"); 321 INIT_HLIST_HEAD(&pid_hash[i]);
273 for (j = 0; j < pidhash_size; j++)
274 INIT_HLIST_HEAD(&pid_hash[i][j]);
275 }
276} 322}
277 323
278void __init pidmap_init(void) 324void __init pidmap_init(void)
279{ 325{
280 int i;
281
282 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); 326 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
327 /* Reserve PID 0. We never call free_pidmap(0) */
283 set_bit(0, pidmap_array->page); 328 set_bit(0, pidmap_array->page);
284 atomic_dec(&pidmap_array->nr_free); 329 atomic_dec(&pidmap_array->nr_free);
285 330
286 /* 331 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
287 * Allocate PID 0, and hash it via all PID types: 332 __alignof__(struct pid),
288 */ 333 SLAB_PANIC, NULL, NULL);
289
290 for (i = 0; i < PIDTYPE_MAX; i++)
291 attach_pid(current, i, 0);
292} 334}
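pid_task() converts a struct pid back into a task for a given pid type by taking the first entry of the per-type hlist, and is only valid under rcu_read_lock() or a read-held tasklist_lock; get_pid_task() does that and additionally takes a task reference so the result survives past the RCU section. Illustrative use, with error handling kept minimal:

	struct task_struct *tsk;

	tsk = get_pid_task(pid, PIDTYPE_PID);	/* NULL if no task is attached */
	if (tsk) {
		/* ... inspect or signal tsk ... */
		put_task_struct(tsk);
	}

put_task_struct() is the existing counterpart of the get_task_struct() call inside get_pid_task().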
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9fd8d4f035..ce0dfb8f4a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -41,7 +41,7 @@ config SOFTWARE_SUSPEND
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
42 ---help--- 42 ---help---
43 Enable the possibility of suspending the machine. 43 Enable the possibility of suspending the machine.
44 It doesn't need APM. 44 It doesn't need ACPI or APM.
45 You may suspend your machine by 'swsusp' or 'shutdown -z <time>' 45 You may suspend your machine by 'swsusp' or 'shutdown -z <time>'
46 (patch for sysvinit needed). 46 (patch for sysvinit needed).
47 47
diff --git a/kernel/power/main.c b/kernel/power/main.c
index ee371f50cc..a6d9ef4600 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -272,7 +272,7 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
272 if (*s && !strncmp(buf, *s, len)) 272 if (*s && !strncmp(buf, *s, len))
273 break; 273 break;
274 } 274 }
275 if (*s) 275 if (state < PM_SUSPEND_MAX && *s)
276 error = enter_state(state); 276 error = enter_state(state);
277 else 277 else
278 error = -EINVAL; 278 error = -EINVAL;
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 0f6908cce1..84063ac8fc 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,25 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
75 return dev; 75 return dev;
76} 76}
77 77
78/**
79 * pm_unregister - unregister a device with power management
80 * @dev: device to unregister
81 *
82 * Remove a device from the power management notification lists. The
83 * dev passed must be a handle previously returned by pm_register.
84 */
85
86void pm_unregister(struct pm_dev *dev)
87{
88 if (dev) {
89 mutex_lock(&pm_devs_lock);
90 list_del(&dev->entry);
91 mutex_unlock(&pm_devs_lock);
92
93 kfree(dev);
94 }
95}
96
97static void __pm_unregister(struct pm_dev *dev) 78static void __pm_unregister(struct pm_dev *dev)
98{ 79{
99 if (dev) { 80 if (dev) {
@@ -258,7 +239,6 @@ int pm_send_all(pm_request_t rqst, void *data)
258} 239}
259 240
260EXPORT_SYMBOL(pm_register); 241EXPORT_SYMBOL(pm_register);
261EXPORT_SYMBOL(pm_unregister);
262EXPORT_SYMBOL(pm_unregister_all); 242EXPORT_SYMBOL(pm_unregister_all);
263EXPORT_SYMBOL(pm_send_all); 243EXPORT_SYMBOL(pm_send_all);
264EXPORT_SYMBOL(pm_active); 244EXPORT_SYMBOL(pm_active);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ac7c35fad..b2a5f671d6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -26,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
26 (p->flags & PF_NOFREEZE) || 26 (p->flags & PF_NOFREEZE) ||
27 (p->exit_state == EXIT_ZOMBIE) || 27 (p->exit_state == EXIT_ZOMBIE) ||
28 (p->exit_state == EXIT_DEAD) || 28 (p->exit_state == EXIT_DEAD) ||
29 (p->state == TASK_STOPPED) || 29 (p->state == TASK_STOPPED))
30 (p->state == TASK_TRACED))
31 return 0; 30 return 0;
32 return 1; 31 return 1;
33} 32}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c5863d02c8..3eeedbb13b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -240,14 +240,15 @@ static void copy_data_pages(struct pbe *pblist)
240 * free_pagedir - free pages allocated with alloc_pagedir() 240 * free_pagedir - free pages allocated with alloc_pagedir()
241 */ 241 */
242 242
243static void free_pagedir(struct pbe *pblist) 243static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
244{ 244{
245 struct pbe *pbe; 245 struct pbe *pbe;
246 246
247 while (pblist) { 247 while (pblist) {
248 pbe = (pblist + PB_PAGE_SKIP)->next; 248 pbe = (pblist + PB_PAGE_SKIP)->next;
249 ClearPageNosave(virt_to_page(pblist)); 249 ClearPageNosave(virt_to_page(pblist));
250 ClearPageNosaveFree(virt_to_page(pblist)); 250 if (clear_nosave_free)
251 ClearPageNosaveFree(virt_to_page(pblist));
251 free_page((unsigned long)pblist); 252 free_page((unsigned long)pblist);
252 pblist = pbe; 253 pblist = pbe;
253 } 254 }
@@ -389,7 +390,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
389 pbe->next = alloc_image_page(gfp_mask, safe_needed); 390 pbe->next = alloc_image_page(gfp_mask, safe_needed);
390 } 391 }
391 if (!pbe) { /* get_zeroed_page() failed */ 392 if (!pbe) { /* get_zeroed_page() failed */
392 free_pagedir(pblist); 393 free_pagedir(pblist, 1);
393 pblist = NULL; 394 pblist = NULL;
394 } else 395 } else
395 create_pbe_list(pblist, nr_pages); 396 create_pbe_list(pblist, nr_pages);
@@ -736,7 +737,7 @@ static int create_image(struct snapshot_handle *handle)
736 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
737 if (pblist) 738 if (pblist)
738 copy_page_backup_list(pblist, p); 739 copy_page_backup_list(pblist, p);
739 free_pagedir(p); 740 free_pagedir(p, 0);
740 if (!pblist) 741 if (!pblist)
741 error = -ENOMEM; 742 error = -ENOMEM;
742 } 743 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 8cc19431e7..c056f33244 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -360,8 +360,7 @@ static void call_console_drivers(unsigned long start, unsigned long end)
360 unsigned long cur_index, start_print; 360 unsigned long cur_index, start_print;
361 static int msg_level = -1; 361 static int msg_level = -1;
362 362
363 if (((long)(start - end)) > 0) 363 BUG_ON(((long)(start - end)) > 0);
364 BUG();
365 364
366 cur_index = start; 365 cur_index = start;
367 start_print = start; 366 start_print = start;
@@ -708,8 +707,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
708 */ 707 */
709void acquire_console_sem(void) 708void acquire_console_sem(void)
710{ 709{
711 if (in_interrupt()) 710 BUG_ON(in_interrupt());
712 BUG();
713 down(&console_sem); 711 down(&console_sem);
714 console_locked = 1; 712 console_locked = 1;
715 console_may_schedule = 1; 713 console_may_schedule = 1;
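The printk.c changes, and several of the same kind below in ptrace.c and signal.c, are mechanical conversions of the two-line "if (cond) BUG();" form to the equivalent one-line assertion:

	BUG_ON(in_interrupt());		/* was: if (in_interrupt()) BUG(); */

Behaviour is unchanged; the invariant is just stated more directly.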
diff --git a/kernel/profile.c b/kernel/profile.c
index ad81f799a9..68afe121e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -87,72 +87,52 @@ void __init profile_init(void)
87 87
88#ifdef CONFIG_PROFILING 88#ifdef CONFIG_PROFILING
89 89
90static DECLARE_RWSEM(profile_rwsem); 90static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
91static DEFINE_RWLOCK(handoff_lock); 91static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
92static struct notifier_block * task_exit_notifier; 92static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
93static struct notifier_block * task_free_notifier;
94static struct notifier_block * munmap_notifier;
95 93
96void profile_task_exit(struct task_struct * task) 94void profile_task_exit(struct task_struct * task)
97{ 95{
98 down_read(&profile_rwsem); 96 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
99 notifier_call_chain(&task_exit_notifier, 0, task);
100 up_read(&profile_rwsem);
101} 97}
102 98
103int profile_handoff_task(struct task_struct * task) 99int profile_handoff_task(struct task_struct * task)
104{ 100{
105 int ret; 101 int ret;
106 read_lock(&handoff_lock); 102 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
107 ret = notifier_call_chain(&task_free_notifier, 0, task);
108 read_unlock(&handoff_lock);
109 return (ret == NOTIFY_OK) ? 1 : 0; 103 return (ret == NOTIFY_OK) ? 1 : 0;
110} 104}
111 105
112void profile_munmap(unsigned long addr) 106void profile_munmap(unsigned long addr)
113{ 107{
114 down_read(&profile_rwsem); 108 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
115 notifier_call_chain(&munmap_notifier, 0, (void *)addr);
116 up_read(&profile_rwsem);
117} 109}
118 110
119int task_handoff_register(struct notifier_block * n) 111int task_handoff_register(struct notifier_block * n)
120{ 112{
121 int err = -EINVAL; 113 return atomic_notifier_chain_register(&task_free_notifier, n);
122
123 write_lock(&handoff_lock);
124 err = notifier_chain_register(&task_free_notifier, n);
125 write_unlock(&handoff_lock);
126 return err;
127} 114}
128 115
129int task_handoff_unregister(struct notifier_block * n) 116int task_handoff_unregister(struct notifier_block * n)
130{ 117{
131 int err = -EINVAL; 118 return atomic_notifier_chain_unregister(&task_free_notifier, n);
132
133 write_lock(&handoff_lock);
134 err = notifier_chain_unregister(&task_free_notifier, n);
135 write_unlock(&handoff_lock);
136 return err;
137} 119}
138 120
139int profile_event_register(enum profile_type type, struct notifier_block * n) 121int profile_event_register(enum profile_type type, struct notifier_block * n)
140{ 122{
141 int err = -EINVAL; 123 int err = -EINVAL;
142 124
143 down_write(&profile_rwsem);
144
145 switch (type) { 125 switch (type) {
146 case PROFILE_TASK_EXIT: 126 case PROFILE_TASK_EXIT:
147 err = notifier_chain_register(&task_exit_notifier, n); 127 err = blocking_notifier_chain_register(
128 &task_exit_notifier, n);
148 break; 129 break;
149 case PROFILE_MUNMAP: 130 case PROFILE_MUNMAP:
150 err = notifier_chain_register(&munmap_notifier, n); 131 err = blocking_notifier_chain_register(
132 &munmap_notifier, n);
151 break; 133 break;
152 } 134 }
153 135
154 up_write(&profile_rwsem);
155
156 return err; 136 return err;
157} 137}
158 138
@@ -161,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
161{ 141{
162 int err = -EINVAL; 142 int err = -EINVAL;
163 143
164 down_write(&profile_rwsem);
165
166 switch (type) { 144 switch (type) {
167 case PROFILE_TASK_EXIT: 145 case PROFILE_TASK_EXIT:
168 err = notifier_chain_unregister(&task_exit_notifier, n); 146 err = blocking_notifier_chain_unregister(
147 &task_exit_notifier, n);
169 break; 148 break;
170 case PROFILE_MUNMAP: 149 case PROFILE_MUNMAP:
171 err = notifier_chain_unregister(&munmap_notifier, n); 150 err = blocking_notifier_chain_unregister(
151 &munmap_notifier, n);
172 break; 152 break;
173 } 153 }
174 154
175 up_write(&profile_rwsem);
176 return err; 155 return err;
177} 156}
178 157
@@ -320,7 +299,7 @@ out:
320} 299}
321 300
322#ifdef CONFIG_HOTPLUG_CPU 301#ifdef CONFIG_HOTPLUG_CPU
323static int __devinit profile_cpu_callback(struct notifier_block *info, 302static int profile_cpu_callback(struct notifier_block *info,
324 unsigned long action, void *__cpu) 303 unsigned long action, void *__cpu)
325{ 304{
326 int node, cpu = (unsigned long)__cpu; 305 int node, cpu = (unsigned long)__cpu;
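profile.c picks the chain flavour per calling context: task_free_notifier becomes an ATOMIC_NOTIFIER_HEAD because profile_handoff_task() can run where sleeping is not allowed, while the task-exit and munmap chains may sleep and therefore use BLOCKING_NOTIFIER_HEAD. The external profile_event_register() interface is unchanged; an illustrative client (the callback name and body are not from this patch):

static int my_task_exit_event(struct notifier_block *nb,
			      unsigned long val, void *data)
{
	struct task_struct *task = data;

	/* ... record per-task statistics before the task goes away ... */
	return NOTIFY_OK;
}

static struct notifier_block my_exit_nb = {
	.notifier_call	= my_task_exit_event,
};

	profile_event_register(PROFILE_TASK_EXIT, &my_exit_nb);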
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d95a72c927..921c22ad16 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -30,14 +30,13 @@
30 */ 30 */
31void __ptrace_link(task_t *child, task_t *new_parent) 31void __ptrace_link(task_t *child, task_t *new_parent)
32{ 32{
33 if (!list_empty(&child->ptrace_list)) 33 BUG_ON(!list_empty(&child->ptrace_list));
34 BUG();
35 if (child->parent == new_parent) 34 if (child->parent == new_parent)
36 return; 35 return;
37 list_add(&child->ptrace_list, &child->parent->ptrace_children); 36 list_add(&child->ptrace_list, &child->parent->ptrace_children);
38 REMOVE_LINKS(child); 37 remove_parent(child);
39 child->parent = new_parent; 38 child->parent = new_parent;
40 SET_LINKS(child); 39 add_parent(child);
41} 40}
42 41
43/* 42/*
@@ -57,10 +56,6 @@ void ptrace_untrace(task_t *child)
57 signal_wake_up(child, 1); 56 signal_wake_up(child, 1);
58 } 57 }
59 } 58 }
60 if (child->signal->flags & SIGNAL_GROUP_EXIT) {
61 sigaddset(&child->pending.signal, SIGKILL);
62 signal_wake_up(child, 1);
63 }
64 spin_unlock(&child->sighand->siglock); 59 spin_unlock(&child->sighand->siglock);
65} 60}
66 61
@@ -77,12 +72,13 @@ void __ptrace_unlink(task_t *child)
77 child->ptrace = 0; 72 child->ptrace = 0;
78 if (!list_empty(&child->ptrace_list)) { 73 if (!list_empty(&child->ptrace_list)) {
79 list_del_init(&child->ptrace_list); 74 list_del_init(&child->ptrace_list);
80 REMOVE_LINKS(child); 75 remove_parent(child);
81 child->parent = child->real_parent; 76 child->parent = child->real_parent;
82 SET_LINKS(child); 77 add_parent(child);
83 } 78 }
84 79
85 ptrace_untrace(child); 80 if (child->state == TASK_TRACED)
81 ptrace_untrace(child);
86} 82}
87 83
88/* 84/*
@@ -152,12 +148,34 @@ int ptrace_may_attach(struct task_struct *task)
152int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
153{ 149{
154 int retval; 150 int retval;
155 task_lock(task); 151
156 retval = -EPERM; 152 retval = -EPERM;
157 if (task->pid <= 1) 153 if (task->pid <= 1)
158 goto bad; 154 goto out;
159 if (task->tgid == current->tgid) 155 if (task->tgid == current->tgid)
160 goto bad; 156 goto out;
157
158repeat:
159 /*
160 * Nasty, nasty.
161 *
162 * We want to hold both the task-lock and the
163 * tasklist_lock for writing at the same time.
164 * But that's against the rules (tasklist_lock
165 * is taken for reading by interrupts on other
166 * cpu's that may have task_lock).
167 */
168 task_lock(task);
169 local_irq_disable();
170 if (!write_trylock(&tasklist_lock)) {
171 local_irq_enable();
172 task_unlock(task);
173 do {
174 cpu_relax();
175 } while (!write_can_lock(&tasklist_lock));
176 goto repeat;
177 }
178
161 /* the same process cannot be attached many times */ 179 /* the same process cannot be attached many times */
162 if (task->ptrace & PT_PTRACED) 180 if (task->ptrace & PT_PTRACED)
163 goto bad; 181 goto bad;
@@ -170,17 +188,15 @@ int ptrace_attach(struct task_struct *task)
170 ? PT_ATTACHED : 0); 188 ? PT_ATTACHED : 0);
171 if (capable(CAP_SYS_PTRACE)) 189 if (capable(CAP_SYS_PTRACE))
172 task->ptrace |= PT_PTRACE_CAP; 190 task->ptrace |= PT_PTRACE_CAP;
173 task_unlock(task);
174 191
175 write_lock_irq(&tasklist_lock);
176 __ptrace_link(task, current); 192 __ptrace_link(task, current);
177 write_unlock_irq(&tasklist_lock);
178 193
179 force_sig_specific(SIGSTOP, task); 194 force_sig_specific(SIGSTOP, task);
180 return 0;
181 195
182bad: 196bad:
197 write_unlock_irq(&tasklist_lock);
183 task_unlock(task); 198 task_unlock(task);
199out:
184 return retval; 200 return retval;
185} 201}
186 202
@@ -421,21 +437,22 @@ int ptrace_request(struct task_struct *child, long request,
421 */ 437 */
422int ptrace_traceme(void) 438int ptrace_traceme(void)
423{ 439{
424 int ret; 440 int ret = -EPERM;
425 441
426 /* 442 /*
427 * Are we already being traced? 443 * Are we already being traced?
428 */ 444 */
429 if (current->ptrace & PT_PTRACED) 445 task_lock(current);
430 return -EPERM; 446 if (!(current->ptrace & PT_PTRACED)) {
431 ret = security_ptrace(current->parent, current); 447 ret = security_ptrace(current->parent, current);
432 if (ret) 448 /*
433 return -EPERM; 449 * Set the ptrace bit in the process ptrace flags.
434 /* 450 */
435 * Set the ptrace bit in the process ptrace flags. 451 if (!ret)
436 */ 452 current->ptrace |= PT_PTRACED;
437 current->ptrace |= PT_PTRACED; 453 }
438 return 0; 454 task_unlock(current);
455 return ret;
439} 456}
440 457
441/** 458/**
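The rewritten ptrace_attach() has to hold task_lock() and a write-locked tasklist_lock at the same time, but cannot nest them in the obvious order because, as the in-code comment explains, tasklist_lock is taken for reading from interrupt context on CPUs that may already hold a task lock. The trylock-and-back-off loop it uses is a general pattern; stripped of the ptrace specifics it looks roughly like this (the lock names are placeholders, not real kernel locks):

static void lock_both(spinlock_t *inner, rwlock_t *outer)
{
repeat:
	spin_lock(inner);			/* stands in for task_lock(task) */
	local_irq_disable();
	if (!write_trylock(outer)) {		/* stands in for tasklist_lock */
		local_irq_enable();
		spin_unlock(inner);
		do {
			cpu_relax();
		} while (!write_can_lock(outer));
		goto repeat;
	}
	/* returns with both locks held and interrupts disabled; the caller
	 * releases them in reverse order, as ptrace_attach() does above */
}

Dropping both locks before spinning keeps this from deadlocking against a CPU that already owns the rwlock and is waiting for the inner lock.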
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 13458bbaa1..2058f88c7b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -479,12 +479,31 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
479 return 0; 479 return 0;
480} 480}
481 481
482/*
483 * Check to see if there is any immediate RCU-related work to be done
484 * by the current CPU, returning 1 if so. This function is part of the
485 * RCU implementation; it is -not- an exported member of the RCU API.
486 */
482int rcu_pending(int cpu) 487int rcu_pending(int cpu)
483{ 488{
484 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || 489 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
485 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); 490 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
486} 491}
487 492
493/*
494 * Check to see if any future RCU-related work will need to be done
495 * by the current CPU, even if none need be done immediately, returning
496 * 1 if so. This function is part of the RCU implementation; it is -not-
497 * an exported member of the RCU API.
498 */
499int rcu_needs_cpu(int cpu)
500{
501 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
502 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
503
504 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
505}
506
488void rcu_check_callbacks(int cpu, int user) 507void rcu_check_callbacks(int cpu, int user)
489{ 508{
490 if (user || 509 if (user ||
@@ -520,7 +539,7 @@ static void __devinit rcu_online_cpu(int cpu)
520 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
521} 540}
522 541
523static int __devinit rcu_cpu_notify(struct notifier_block *self, 542static int rcu_cpu_notify(struct notifier_block *self,
524 unsigned long action, void *hcpu) 543 unsigned long action, void *hcpu)
525{ 544{
526 long cpu = (long)hcpu; 545 long cpu = (long)hcpu;
@@ -537,7 +556,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
537 return NOTIFY_OK; 556 return NOTIFY_OK;
538} 557}
539 558
540static struct notifier_block __devinitdata rcu_nb = { 559static struct notifier_block rcu_nb = {
541 .notifier_call = rcu_cpu_notify, 560 .notifier_call = rcu_cpu_notify,
542}; 561};
543 562
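rcu_needs_cpu() answers a broader question than rcu_pending(): not just "is there RCU work right now" but "does this CPU still owe RCU anything", i.e. does either control block have callbacks on its current list. Presumably this exists so code that wants to put a CPU into a long sleep can check with RCU first; a hedged illustration of such a caller (not part of this patch):

	int cpu = smp_processor_id();

	if (!rcu_needs_cpu(cpu)) {
		/* neither rcu nor rcu_bh has callbacks queued on this CPU,
		 * and rcu_pending() reports no immediate work either */
	}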
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b4b362b5ba..8154e7589d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
303 303
304 for_each_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -535,7 +535,7 @@ rcu_torture_init(void)
535 atomic_set(&n_rcu_torture_error, 0); 535 atomic_set(&n_rcu_torture_error, 0);
536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
537 atomic_set(&rcu_torture_wcount[i], 0); 537 atomic_set(&rcu_torture_wcount[i], 0);
538 for_each_cpu(cpu) { 538 for_each_possible_cpu(cpu) {
539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
540 per_cpu(rcu_torture_count, cpu)[i] = 0; 540 per_cpu(rcu_torture_count, cpu)[i] = 0;
541 per_cpu(rcu_torture_batch, cpu)[i] = 0; 541 per_cpu(rcu_torture_batch, cpu)[i] = 0;
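The for_each_cpu() to for_each_possible_cpu() renames here and in sched.c below are part of a tree-wide cleanup: the iterator's name now states that it walks every possible CPU, not just the online ones, which is what is wanted when summing per-CPU counters that a since-offlined CPU may still hold. The idiom, with an illustrative counter that is not in this patch:

static DEFINE_PER_CPU(long, my_counter);	/* illustrative only */

static long my_counter_total(void)
{
	long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu(my_counter, cpu);
	return sum;
}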
diff --git a/kernel/sched.c b/kernel/sched.c
index 78acdefecc..c13f1bd2df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -145,7 +145,8 @@
145 (v1) * (v2_max) / (v1_max) 145 (v1) * (v2_max) / (v1_max)
146 146
147#define DELTA(p) \ 147#define DELTA(p) \
148 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 148 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
149 INTERACTIVE_DELTA)
149 150
150#define TASK_INTERACTIVE(p) \ 151#define TASK_INTERACTIVE(p) \
151 ((p)->prio <= (p)->static_prio - DELTA(p)) 152 ((p)->prio <= (p)->static_prio - DELTA(p))
@@ -666,9 +667,13 @@ static int effective_prio(task_t *p)
666/* 667/*
667 * __activate_task - move a task to the runqueue. 668 * __activate_task - move a task to the runqueue.
668 */ 669 */
669static inline void __activate_task(task_t *p, runqueue_t *rq) 670static void __activate_task(task_t *p, runqueue_t *rq)
670{ 671{
671 enqueue_task(p, rq->active); 672 prio_array_t *target = rq->active;
673
674 if (batch_task(p))
675 target = rq->expired;
676 enqueue_task(p, target);
672 rq->nr_running++; 677 rq->nr_running++;
673} 678}
674 679
@@ -687,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
687 unsigned long long __sleep_time = now - p->timestamp; 692 unsigned long long __sleep_time = now - p->timestamp;
688 unsigned long sleep_time; 693 unsigned long sleep_time;
689 694
690 if (unlikely(p->policy == SCHED_BATCH)) 695 if (batch_task(p))
691 sleep_time = 0; 696 sleep_time = 0;
692 else { 697 else {
693 if (__sleep_time > NS_MAX_SLEEP_AVG) 698 if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -699,21 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
699 if (likely(sleep_time > 0)) { 704 if (likely(sleep_time > 0)) {
700 /* 705 /*
701 * User tasks that sleep a long time are categorised as 706 * User tasks that sleep a long time are categorised as
702 * idle and will get just interactive status to stay active & 707 * idle. They will only have their sleep_avg increased to a
703 * prevent them suddenly becoming cpu hogs and starving 708 * level that makes them just interactive priority to stay
704 * other processes. 709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
705 */ 711 */
706 if (p->mm && p->activated != -1 && 712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
707 sleep_time > INTERACTIVE_SLEEP(p)) { 713 unsigned long ceiling;
708 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 714
709 DEF_TIMESLICE); 715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
716 DEF_TIMESLICE);
717 if (p->sleep_avg < ceiling)
718 p->sleep_avg = ceiling;
710 } else { 719 } else {
711 /* 720 /*
712 * Tasks waking from uninterruptible sleep are 721 * Tasks waking from uninterruptible sleep are
713 * limited in their sleep_avg rise as they 722 * limited in their sleep_avg rise as they
714 * are likely to be waiting on I/O 723 * are likely to be waiting on I/O
715 */ 724 */
716 if (p->activated == -1 && p->mm) { 725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
717 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
718 sleep_time = 0; 727 sleep_time = 0;
719 else if (p->sleep_avg + sleep_time >= 728 else if (p->sleep_avg + sleep_time >=
@@ -768,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
768 * This checks to make sure it's not an uninterruptible task 777 * This checks to make sure it's not an uninterruptible task
769 * that is now waking up. 778 * that is now waking up.
770 */ 779 */
771 if (!p->activated) { 780 if (p->sleep_type == SLEEP_NORMAL) {
772 /* 781 /*
773 * Tasks which were woken up by interrupts (ie. hw events) 782 * Tasks which were woken up by interrupts (ie. hw events)
774 * are most likely of interactive nature. So we give them 783 * are most likely of interactive nature. So we give them
@@ -777,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
777 * on a CPU, first time around: 786 * on a CPU, first time around:
778 */ 787 */
779 if (in_interrupt()) 788 if (in_interrupt())
780 p->activated = 2; 789 p->sleep_type = SLEEP_INTERRUPTED;
781 else { 790 else {
782 /* 791 /*
783 * Normal first-time wakeups get a credit too for 792 * Normal first-time wakeups get a credit too for
784 * on-runqueue time, but it will be weighted down: 793 * on-runqueue time, but it will be weighted down:
785 */ 794 */
786 p->activated = 1; 795 p->sleep_type = SLEEP_INTERACTIVE;
787 } 796 }
788 } 797 }
789 p->timestamp = now; 798 p->timestamp = now;
@@ -1271,19 +1280,19 @@ out_activate:
1271 * Tasks on involuntary sleep don't earn 1280 * Tasks on involuntary sleep don't earn
1272 * sleep_avg beyond just interactive state. 1281 * sleep_avg beyond just interactive state.
1273 */ 1282 */
1274 p->activated = -1; 1283 p->sleep_type = SLEEP_NONINTERACTIVE;
1275 } 1284 } else
1276 1285
1277 /* 1286 /*
1278 * Tasks that have marked their sleep as noninteractive get 1287 * Tasks that have marked their sleep as noninteractive get
1279 * woken up without updating their sleep average. (i.e. their 1288 * woken up with their sleep average not weighted in an
1280 * sleep is handled in a priority-neutral manner, no priority 1289 * interactive way.
1281 * boost and no penalty.)
1282 */ 1290 */
1283 if (old_state & TASK_NONINTERACTIVE) 1291 if (old_state & TASK_NONINTERACTIVE)
1284 __activate_task(p, rq); 1292 p->sleep_type = SLEEP_NONINTERACTIVE;
1285 else 1293
1286 activate_task(p, rq, cpu == this_cpu); 1294
1295 activate_task(p, rq, cpu == this_cpu);
1287 /* 1296 /*
1288 * Sync wakeups (i.e. those types of wakeups where the waker 1297 * Sync wakeups (i.e. those types of wakeups where the waker
1289 * has indicated that it will leave the CPU in short order) 1298 * has indicated that it will leave the CPU in short order)
@@ -1624,7 +1633,7 @@ unsigned long nr_uninterruptible(void)
1624{ 1633{
1625 unsigned long i, sum = 0; 1634 unsigned long i, sum = 0;
1626 1635
1627 for_each_cpu(i) 1636 for_each_possible_cpu(i)
1628 sum += cpu_rq(i)->nr_uninterruptible; 1637 sum += cpu_rq(i)->nr_uninterruptible;
1629 1638
1630 /* 1639 /*
@@ -1641,7 +1650,7 @@ unsigned long long nr_context_switches(void)
1641{ 1650{
1642 unsigned long long i, sum = 0; 1651 unsigned long long i, sum = 0;
1643 1652
1644 for_each_cpu(i) 1653 for_each_possible_cpu(i)
1645 sum += cpu_rq(i)->nr_switches; 1654 sum += cpu_rq(i)->nr_switches;
1646 1655
1647 return sum; 1656 return sum;
@@ -1651,12 +1660,27 @@ unsigned long nr_iowait(void)
1651{ 1660{
1652 unsigned long i, sum = 0; 1661 unsigned long i, sum = 0;
1653 1662
1654 for_each_cpu(i) 1663 for_each_possible_cpu(i)
1655 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1664 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1656 1665
1657 return sum; 1666 return sum;
1658} 1667}
1659 1668
1669unsigned long nr_active(void)
1670{
1671 unsigned long i, running = 0, uninterruptible = 0;
1672
1673 for_each_online_cpu(i) {
1674 running += cpu_rq(i)->nr_running;
1675 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1676 }
1677
1678 if (unlikely((long)uninterruptible < 0))
1679 uninterruptible = 0;
1680
1681 return running + uninterruptible;
1682}
1683
1660#ifdef CONFIG_SMP 1684#ifdef CONFIG_SMP
1661 1685
1662/* 1686/*
@@ -2859,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count);
2859 2883
2860#endif 2884#endif
2861 2885
2886static inline int interactive_sleep(enum sleep_type sleep_type)
2887{
2888 return (sleep_type == SLEEP_INTERACTIVE ||
2889 sleep_type == SLEEP_INTERRUPTED);
2890}
2891
2862/* 2892/*
2863 * schedule() is the main scheduler function. 2893 * schedule() is the main scheduler function.
2864 */ 2894 */
@@ -2878,13 +2908,11 @@ asmlinkage void __sched schedule(void)
2878 * schedule() atomically, we ignore that path for now. 2908 * schedule() atomically, we ignore that path for now.
2879 * Otherwise, whine if we are scheduling when we should not be. 2909 * Otherwise, whine if we are scheduling when we should not be.
2880 */ 2910 */
2881 if (likely(!current->exit_state)) { 2911 if (unlikely(in_atomic() && !current->exit_state)) {
2882 if (unlikely(in_atomic())) { 2912 printk(KERN_ERR "BUG: scheduling while atomic: "
2883 printk(KERN_ERR "BUG: scheduling while atomic: " 2913 "%s/0x%08x/%d\n",
2884 "%s/0x%08x/%d\n", 2914 current->comm, preempt_count(), current->pid);
2885 current->comm, preempt_count(), current->pid); 2915 dump_stack();
2886 dump_stack();
2887 }
2888 } 2916 }
2889 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2917 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2890 2918
@@ -2984,12 +3012,12 @@ go_idle:
2984 queue = array->queue + idx; 3012 queue = array->queue + idx;
2985 next = list_entry(queue->next, task_t, run_list); 3013 next = list_entry(queue->next, task_t, run_list);
2986 3014
2987 if (!rt_task(next) && next->activated > 0) { 3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
2988 unsigned long long delta = now - next->timestamp; 3016 unsigned long long delta = now - next->timestamp;
2989 if (unlikely((long long)(now - next->timestamp) < 0)) 3017 if (unlikely((long long)(now - next->timestamp) < 0))
2990 delta = 0; 3018 delta = 0;
2991 3019
2992 if (next->activated == 1) 3020 if (next->sleep_type == SLEEP_INTERACTIVE)
2993 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3021 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2994 3022
2995 array = next->array; 3023 array = next->array;
@@ -2999,10 +3027,9 @@ go_idle:
2999 dequeue_task(next, array); 3027 dequeue_task(next, array);
3000 next->prio = new_prio; 3028 next->prio = new_prio;
3001 enqueue_task(next, array); 3029 enqueue_task(next, array);
3002 } else 3030 }
3003 requeue_task(next, array);
3004 } 3031 }
3005 next->activated = 0; 3032 next->sleep_type = SLEEP_NORMAL;
3006switch_tasks: 3033switch_tasks:
3007 if (next == rq->idle) 3034 if (next == rq->idle)
3008 schedstat_inc(rq, sched_goidle); 3035 schedstat_inc(rq, sched_goidle);
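The scheduler hunks replace the bare integer p->activated (-1, 0, 1, 2) with a descriptive p->sleep_type. Judging from the call sites above, the enum added to the headers (which sit outside this kernel/ diff) presumably reads roughly:

enum sleep_type {
	SLEEP_NORMAL,		/* was activated == 0  */
	SLEEP_NONINTERACTIVE,	/* was activated == -1 */
	SLEEP_INTERACTIVE,	/* was activated == 1  */
	SLEEP_INTERRUPTED,	/* was activated == 2  */
};

so interactive_sleep() simply tests for the last two states instead of "activated > 0".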
@@ -4761,7 +4788,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4761/* Register at highest priority so that task migration (migrate_all_tasks) 4788/* Register at highest priority so that task migration (migrate_all_tasks)
4762 * happens before everything else. 4789 * happens before everything else.
4763 */ 4790 */
4764static struct notifier_block __devinitdata migration_notifier = { 4791static struct notifier_block migration_notifier = {
4765 .notifier_call = migration_call, 4792 .notifier_call = migration_call,
4766 .priority = 10 4793 .priority = 10
4767}; 4794};
@@ -5575,11 +5602,31 @@ static int cpu_to_cpu_group(int cpu)
5575} 5602}
5576#endif 5603#endif
5577 5604
5605#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS];
5608#endif
5609
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5611static int cpu_to_core_group(int cpu)
5612{
5613 return first_cpu(cpu_sibling_map[cpu]);
5614}
5615#elif defined(CONFIG_SCHED_MC)
5616static int cpu_to_core_group(int cpu)
5617{
5618 return cpu;
5619}
5620#endif
5621
5578static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5622static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5579static struct sched_group sched_group_phys[NR_CPUS]; 5623static struct sched_group sched_group_phys[NR_CPUS];
5580static int cpu_to_phys_group(int cpu) 5624static int cpu_to_phys_group(int cpu)
5581{ 5625{
5582#ifdef CONFIG_SCHED_SMT 5626#if defined(CONFIG_SCHED_MC)
5627 cpumask_t mask = cpu_coregroup_map(cpu);
5628 return first_cpu(mask);
5629#elif defined(CONFIG_SCHED_SMT)
5583 return first_cpu(cpu_sibling_map[cpu]); 5630 return first_cpu(cpu_sibling_map[cpu]);
5584#else 5631#else
5585 return cpu; 5632 return cpu;
@@ -5602,6 +5649,32 @@ static int cpu_to_allnodes_group(int cpu)
5602{ 5649{
5603 return cpu_to_node(cpu); 5650 return cpu_to_node(cpu);
5604} 5651}
5652static void init_numa_sched_groups_power(struct sched_group *group_head)
5653{
5654 struct sched_group *sg = group_head;
5655 int j;
5656
5657 if (!sg)
5658 return;
5659next_sg:
5660 for_each_cpu_mask(j, sg->cpumask) {
5661 struct sched_domain *sd;
5662
5663 sd = &per_cpu(phys_domains, j);
5664 if (j != first_cpu(sd->groups->cpumask)) {
5665 /*
5666 * Only add "power" once for each
5667 * physical package.
5668 */
5669 continue;
5670 }
5671
5672 sg->cpu_power += sd->groups->cpu_power;
5673 }
5674 sg = sg->next;
5675 if (sg != group_head)
5676 goto next_sg;
5677}
5605#endif 5678#endif
5606 5679
5607/* 5680/*
@@ -5677,6 +5750,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
5677 sd->parent = p; 5750 sd->parent = p;
5678 sd->groups = &sched_group_phys[group]; 5751 sd->groups = &sched_group_phys[group];
5679 5752
5753#ifdef CONFIG_SCHED_MC
5754 p = sd;
5755 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i);
5757 *sd = SD_MC_INIT;
5758 sd->span = cpu_coregroup_map(i);
5759 cpus_and(sd->span, sd->span, *cpu_map);
5760 sd->parent = p;
5761 sd->groups = &sched_group_core[group];
5762#endif
5763
5680#ifdef CONFIG_SCHED_SMT 5764#ifdef CONFIG_SCHED_SMT
5681 p = sd; 5765 p = sd;
5682 sd = &per_cpu(cpu_domains, i); 5766 sd = &per_cpu(cpu_domains, i);
@@ -5702,6 +5786,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
5702 } 5786 }
5703#endif 5787#endif
5704 5788
5789#ifdef CONFIG_SCHED_MC
5790 /* Set up multi-core groups */
5791 for_each_cpu_mask(i, *cpu_map) {
5792 cpumask_t this_core_map = cpu_coregroup_map(i);
5793 cpus_and(this_core_map, this_core_map, *cpu_map);
5794 if (i != first_cpu(this_core_map))
5795 continue;
5796 init_sched_build_groups(sched_group_core, this_core_map,
5797 &cpu_to_core_group);
5798 }
5799#endif
5800
5801
5705 /* Set up physical groups */ 5802 /* Set up physical groups */
5706 for (i = 0; i < MAX_NUMNODES; i++) { 5803 for (i = 0; i < MAX_NUMNODES; i++) {
5707 cpumask_t nodemask = node_to_cpumask(i); 5804 cpumask_t nodemask = node_to_cpumask(i);
@@ -5798,51 +5895,38 @@ void build_sched_domains(const cpumask_t *cpu_map)
5798 power = SCHED_LOAD_SCALE; 5895 power = SCHED_LOAD_SCALE;
5799 sd->groups->cpu_power = power; 5896 sd->groups->cpu_power = power;
5800#endif 5897#endif
5898#ifdef CONFIG_SCHED_MC
5899 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power;
5801 5903
5802 sd = &per_cpu(phys_domains, i); 5904 sd = &per_cpu(phys_domains, i);
5905
5906 /*
5907 * This has to be < 2 * SCHED_LOAD_SCALE
5908 * Lets keep it SCHED_LOAD_SCALE, so that
5909 * while calculating NUMA group's cpu_power
5910 * we can simply do
5911 * numa_group->cpu_power += phys_group->cpu_power;
5912 *
5913 * See "only add power once for each physical pkg"
5914 * comment below
5915 */
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else
5918 sd = &per_cpu(phys_domains, i);
5803 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5804 (cpus_weight(sd->groups->cpumask)-1) / 10; 5920 (cpus_weight(sd->groups->cpumask)-1) / 10;
5805 sd->groups->cpu_power = power; 5921 sd->groups->cpu_power = power;
5806
5807#ifdef CONFIG_NUMA
5808 sd = &per_cpu(allnodes_domains, i);
5809 if (sd->groups) {
5810 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5811 (cpus_weight(sd->groups->cpumask)-1) / 10;
5812 sd->groups->cpu_power = power;
5813 }
5814#endif 5922#endif
5815 } 5923 }
5816 5924
5817#ifdef CONFIG_NUMA 5925#ifdef CONFIG_NUMA
5818 for (i = 0; i < MAX_NUMNODES; i++) { 5926 for (i = 0; i < MAX_NUMNODES; i++)
5819 struct sched_group *sg = sched_group_nodes[i]; 5927 init_numa_sched_groups_power(sched_group_nodes[i]);
5820 int j;
5821 5928
5822 if (sg == NULL) 5929 init_numa_sched_groups_power(sched_group_allnodes);
5823 continue;
5824next_sg:
5825 for_each_cpu_mask(j, sg->cpumask) {
5826 struct sched_domain *sd;
5827 int power;
5828
5829 sd = &per_cpu(phys_domains, j);
5830 if (j != first_cpu(sd->groups->cpumask)) {
5831 /*
5832 * Only add "power" once for each
5833 * physical package.
5834 */
5835 continue;
5836 }
5837 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5838 (cpus_weight(sd->groups->cpumask)-1) / 10;
5839
5840 sg->cpu_power += power;
5841 }
5842 sg = sg->next;
5843 if (sg != sched_group_nodes[i])
5844 goto next_sg;
5845 }
5846#endif 5930#endif
5847 5931
5848 /* Attach the domains */ 5932 /* Attach the domains */
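With CONFIG_SCHED_MC the core-level domain scales cpu_power by the number of sibling CPUs, while the physical level is pinned at exactly SCHED_LOAD_SCALE so the NUMA code can just accumulate it. As a worked example, assuming SCHED_LOAD_SCALE is 128 and a package with two cores:

	/* core domain, 2 CPUs in the group */
	power = SCHED_LOAD_SCALE
		+ (cpus_weight(sd->groups->cpumask) - 1) * SCHED_LOAD_SCALE / 10;
	/* = 128 + (2 - 1) * 128 / 10 = 128 + 12 = 140 */

	/* physical domain under SCHED_MC: kept at exactly SCHED_LOAD_SCALE */
	sd->groups->cpu_power = SCHED_LOAD_SCALE;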
@@ -5850,6 +5934,8 @@ next_sg:
5850 struct sched_domain *sd; 5934 struct sched_domain *sd;
5851#ifdef CONFIG_SCHED_SMT 5935#ifdef CONFIG_SCHED_SMT
5852 sd = &per_cpu(cpu_domains, i); 5936 sd = &per_cpu(cpu_domains, i);
5937#elif defined(CONFIG_SCHED_MC)
5938 sd = &per_cpu(core_domains, i);
5853#else 5939#else
5854 sd = &per_cpu(phys_domains, i); 5940 sd = &per_cpu(phys_domains, i);
5855#endif 5941#endif
@@ -6022,7 +6108,7 @@ void __init sched_init(void)
6022 runqueue_t *rq; 6108 runqueue_t *rq;
6023 int i, j, k; 6109 int i, j, k;
6024 6110
6025 for_each_cpu(i) { 6111 for_each_possible_cpu(i) {
6026 prio_array_t *array; 6112 prio_array_t *array;
6027 6113
6028 rq = cpu_rq(i); 6114 rq = cpu_rq(i);
diff --git a/kernel/signal.c b/kernel/signal.c
index 75f7341b0c..e5f8aea78f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/posix-timers.h>
26#include <linux/signal.h> 25#include <linux/signal.h>
27#include <linux/audit.h> 26#include <linux/audit.h>
28#include <linux/capability.h> 27#include <linux/capability.h>
@@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep;
147#define sig_kernel_stop(sig) \ 146#define sig_kernel_stop(sig) \
148 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) 147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
149 148
149#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
150
150#define sig_user_defined(t, signr) \ 151#define sig_user_defined(t, signr) \
151 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ 152 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
152 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) 153 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
@@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q)
292 kmem_cache_free(sigqueue_cachep, q); 293 kmem_cache_free(sigqueue_cachep, q);
293} 294}
294 295
295static void flush_sigqueue(struct sigpending *queue) 296void flush_sigqueue(struct sigpending *queue)
296{ 297{
297 struct sigqueue *q; 298 struct sigqueue *q;
298 299
@@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue)
307/* 308/*
308 * Flush all pending signals for a task. 309 * Flush all pending signals for a task.
309 */ 310 */
310 311void flush_signals(struct task_struct *t)
311void
312flush_signals(struct task_struct *t)
313{ 312{
314 unsigned long flags; 313 unsigned long flags;
315 314
@@ -321,109 +320,6 @@ flush_signals(struct task_struct *t)
321} 320}
322 321
323/* 322/*
324 * This function expects the tasklist_lock write-locked.
325 */
326void __exit_sighand(struct task_struct *tsk)
327{
328 struct sighand_struct * sighand = tsk->sighand;
329
330 /* Ok, we're done with the signal handlers */
331 tsk->sighand = NULL;
332 if (atomic_dec_and_test(&sighand->count))
333 sighand_free(sighand);
334}
335
336void exit_sighand(struct task_struct *tsk)
337{
338 write_lock_irq(&tasklist_lock);
339 rcu_read_lock();
340 if (tsk->sighand != NULL) {
341 struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
342 spin_lock(&sighand->siglock);
343 __exit_sighand(tsk);
344 spin_unlock(&sighand->siglock);
345 }
346 rcu_read_unlock();
347 write_unlock_irq(&tasklist_lock);
348}
349
350/*
351 * This function expects the tasklist_lock write-locked.
352 */
353void __exit_signal(struct task_struct *tsk)
354{
355 struct signal_struct * sig = tsk->signal;
356 struct sighand_struct * sighand;
357
358 if (!sig)
359 BUG();
360 if (!atomic_read(&sig->count))
361 BUG();
362 rcu_read_lock();
363 sighand = rcu_dereference(tsk->sighand);
364 spin_lock(&sighand->siglock);
365 posix_cpu_timers_exit(tsk);
366 if (atomic_dec_and_test(&sig->count)) {
367 posix_cpu_timers_exit_group(tsk);
368 tsk->signal = NULL;
369 __exit_sighand(tsk);
370 spin_unlock(&sighand->siglock);
371 flush_sigqueue(&sig->shared_pending);
372 } else {
373 /*
374 * If there is any task waiting for the group exit
375 * then notify it:
376 */
377 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
378 wake_up_process(sig->group_exit_task);
379 sig->group_exit_task = NULL;
380 }
381 if (tsk == sig->curr_target)
382 sig->curr_target = next_thread(tsk);
383 tsk->signal = NULL;
384 /*
385 * Accumulate here the counters for all threads but the
386 * group leader as they die, so they can be added into
387 * the process-wide totals when those are taken.
388 * The group leader stays around as a zombie as long
389 * as there are other threads. When it gets reaped,
390 * the exit.c code will add its counts into these totals.
391 * We won't ever get here for the group leader, since it
392 * will have been the last reference on the signal_struct.
393 */
394 sig->utime = cputime_add(sig->utime, tsk->utime);
395 sig->stime = cputime_add(sig->stime, tsk->stime);
396 sig->min_flt += tsk->min_flt;
397 sig->maj_flt += tsk->maj_flt;
398 sig->nvcsw += tsk->nvcsw;
399 sig->nivcsw += tsk->nivcsw;
400 sig->sched_time += tsk->sched_time;
401 __exit_sighand(tsk);
402 spin_unlock(&sighand->siglock);
403 sig = NULL; /* Marker for below. */
404 }
405 rcu_read_unlock();
406 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
407 flush_sigqueue(&tsk->pending);
408 if (sig) {
409 /*
410 * We are cleaning up the signal_struct here.
411 */
412 exit_thread_group_keys(sig);
413 kmem_cache_free(signal_cachep, sig);
414 }
415}
416
417void exit_signal(struct task_struct *tsk)
418{
419 atomic_dec(&tsk->signal->live);
420
421 write_lock_irq(&tasklist_lock);
422 __exit_signal(tsk);
423 write_unlock_irq(&tasklist_lock);
424}
425
426/*
427 * Flush all handlers for a task. 323 * Flush all handlers for a task.
428 */ 324 */
429 325
@@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
695} 591}
696 592
697/* forward decl */ 593/* forward decl */
698static void do_notify_parent_cldstop(struct task_struct *tsk, 594static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
699 int to_self,
700 int why);
701 595
702/* 596/*
703 * Handle magic process-wide effects of stop/continue signals. 597 * Handle magic process-wide effects of stop/continue signals.
@@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
747 p->signal->group_stop_count = 0; 641 p->signal->group_stop_count = 0;
748 p->signal->flags = SIGNAL_STOP_CONTINUED; 642 p->signal->flags = SIGNAL_STOP_CONTINUED;
749 spin_unlock(&p->sighand->siglock); 643 spin_unlock(&p->sighand->siglock);
750 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); 644 do_notify_parent_cldstop(p, CLD_STOPPED);
751 spin_lock(&p->sighand->siglock); 645 spin_lock(&p->sighand->siglock);
752 } 646 }
753 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 647 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
788 p->signal->flags = SIGNAL_STOP_CONTINUED; 682 p->signal->flags = SIGNAL_STOP_CONTINUED;
789 p->signal->group_exit_code = 0; 683 p->signal->group_exit_code = 0;
790 spin_unlock(&p->sighand->siglock); 684 spin_unlock(&p->sighand->siglock);
791 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); 685 do_notify_parent_cldstop(p, CLD_CONTINUED);
792 spin_lock(&p->sighand->siglock); 686 spin_lock(&p->sighand->siglock);
793 } else { 687 } else {
794 /* 688 /*
@@ -875,8 +769,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
875{ 769{
876 int ret = 0; 770 int ret = 0;
877 771
878 if (!irqs_disabled()) 772 BUG_ON(!irqs_disabled());
879 BUG();
880 assert_spin_locked(&t->sighand->siglock); 773 assert_spin_locked(&t->sighand->siglock);
881 774
882 /* Short-circuit ignored signals. */ 775 /* Short-circuit ignored signals. */
@@ -975,7 +868,6 @@ __group_complete_signal(int sig, struct task_struct *p)
975 if (t == NULL) 868 if (t == NULL)
976 /* restart balancing at this thread */ 869 /* restart balancing at this thread */
977 t = p->signal->curr_target = p; 870 t = p->signal->curr_target = p;
978 BUG_ON(t->tgid != p->tgid);
979 871
980 while (!wants_signal(sig, t)) { 872 while (!wants_signal(sig, t)) {
981 t = next_thread(t); 873 t = next_thread(t);
@@ -1120,27 +1012,37 @@ void zap_other_threads(struct task_struct *p)
1120/* 1012/*
1121 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 1013 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
1122 */ 1014 */
1015struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1016{
1017 struct sighand_struct *sighand;
1018
1019 for (;;) {
1020 sighand = rcu_dereference(tsk->sighand);
1021 if (unlikely(sighand == NULL))
1022 break;
1023
1024 spin_lock_irqsave(&sighand->siglock, *flags);
1025 if (likely(sighand == tsk->sighand))
1026 break;
1027 spin_unlock_irqrestore(&sighand->siglock, *flags);
1028 }
1029
1030 return sighand;
1031}
1032
1123int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1033int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1124{ 1034{
1125 unsigned long flags; 1035 unsigned long flags;
1126 struct sighand_struct *sp;
1127 int ret; 1036 int ret;
1128 1037
1129retry:
1130 ret = check_kill_permission(sig, info, p); 1038 ret = check_kill_permission(sig, info, p);
1131 if (!ret && sig && (sp = rcu_dereference(p->sighand))) { 1039
1132 spin_lock_irqsave(&sp->siglock, flags); 1040 if (!ret && sig) {
1133 if (p->sighand != sp) { 1041 ret = -ESRCH;
1134 spin_unlock_irqrestore(&sp->siglock, flags); 1042 if (lock_task_sighand(p, &flags)) {
1135 goto retry; 1043 ret = __group_send_sig_info(sig, info, p);
1136 } 1044 unlock_task_sighand(p, &flags);
1137 if ((atomic_read(&sp->count) == 0) ||
1138 (atomic_read(&p->usage) == 0)) {
1139 spin_unlock_irqrestore(&sp->siglock, flags);
1140 return -ESRCH;
1141 } 1045 }
1142 ret = __group_send_sig_info(sig, info, p);
1143 spin_unlock_irqrestore(&sp->siglock, flags);
1144 } 1046 }
1145 1047
1146 return ret; 1048 return ret;
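The hunk above replaces the open-coded retry loop in group_send_sig_info() with the new lock_task_sighand() helper, which pins tsk->sighand against a concurrent exec() or exit while taking siglock. A minimal sketch of the calling pattern, assuming unlock_task_sighand() is the matching spin_unlock_irqrestore() wrapper added alongside it (not visible in these hunks) and that both end up declared in <linux/sched.h>; the helper name touch_task_signals is illustrative only:

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

/* Illustrative caller, not part of the patch. */
static int touch_task_signals(struct task_struct *tsk)
{
        unsigned long flags;
        int ret = -ESRCH;

        rcu_read_lock();                /* per the comment above: rcu_read_lock() or tasklist_lock */
        if (lock_task_sighand(tsk, &flags)) {
                /* tsk->sighand cannot change under us; siglock is held. */
                ret = 0;                /* ... operate on tsk->signal / tsk->sighand ... */
                unlock_task_sighand(tsk, &flags);
        }
        rcu_read_unlock();
        return ret;
}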
@@ -1189,7 +1091,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1189 struct task_struct *p; 1091 struct task_struct *p;
1190 1092
1191 rcu_read_lock(); 1093 rcu_read_lock();
1192 if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { 1094 if (unlikely(sig_needs_tasklist(sig))) {
1193 read_lock(&tasklist_lock); 1095 read_lock(&tasklist_lock);
1194 acquired_tasklist_lock = 1; 1096 acquired_tasklist_lock = 1;
1195 } 1097 }
@@ -1405,12 +1307,10 @@ void sigqueue_free(struct sigqueue *q)
1405 __sigqueue_free(q); 1307 __sigqueue_free(q);
1406} 1308}
1407 1309
1408int 1310int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1409send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1410{ 1311{
1411 unsigned long flags; 1312 unsigned long flags;
1412 int ret = 0; 1313 int ret = 0;
1413 struct sighand_struct *sh;
1414 1314
1415 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1315 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1416 1316
@@ -1424,48 +1324,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1424 */ 1324 */
1425 rcu_read_lock(); 1325 rcu_read_lock();
1426 1326
1427 if (unlikely(p->flags & PF_EXITING)) { 1327 if (!likely(lock_task_sighand(p, &flags))) {
1428 ret = -1; 1328 ret = -1;
1429 goto out_err; 1329 goto out_err;
1430 } 1330 }
1431 1331
1432retry:
1433 sh = rcu_dereference(p->sighand);
1434
1435 spin_lock_irqsave(&sh->siglock, flags);
1436 if (p->sighand != sh) {
1437 /* We raced with exec() in a multithreaded process... */
1438 spin_unlock_irqrestore(&sh->siglock, flags);
1439 goto retry;
1440 }
1441
1442 /*
1443 * We do the check here again to handle the following scenario:
1444 *
1445 * CPU 0 CPU 1
1446 * send_sigqueue
1447 * check PF_EXITING
1448 * interrupt exit code running
1449 * __exit_signal
1450 * lock sighand->siglock
1451 * unlock sighand->siglock
1452 * lock sh->siglock
1453 * add(tsk->pending) flush_sigqueue(tsk->pending)
1454 *
1455 */
1456
1457 if (unlikely(p->flags & PF_EXITING)) {
1458 ret = -1;
1459 goto out;
1460 }
1461
1462 if (unlikely(!list_empty(&q->list))) { 1332 if (unlikely(!list_empty(&q->list))) {
1463 /* 1333 /*
1464 * If an SI_TIMER entry is already queue just increment 1334 * If an SI_TIMER entry is already queue just increment
1465 * the overrun count. 1335 * the overrun count.
1466 */ 1336 */
1467 if (q->info.si_code != SI_TIMER) 1337 BUG_ON(q->info.si_code != SI_TIMER);
1468 BUG();
1469 q->info.si_overrun++; 1338 q->info.si_overrun++;
1470 goto out; 1339 goto out;
1471 } 1340 }
@@ -1481,7 +1350,7 @@ retry:
1481 signal_wake_up(p, sig == SIGKILL); 1350 signal_wake_up(p, sig == SIGKILL);
1482 1351
1483out: 1352out:
1484 spin_unlock_irqrestore(&sh->siglock, flags); 1353 unlock_task_sighand(p, &flags);
1485out_err: 1354out_err:
1486 rcu_read_unlock(); 1355 rcu_read_unlock();
1487 1356
@@ -1513,8 +1382,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1513 * the overrun count. Other uses should not try to 1382 * the overrun count. Other uses should not try to
1514 * send the signal multiple times. 1383 * send the signal multiple times.
1515 */ 1384 */
1516 if (q->info.si_code != SI_TIMER) 1385 BUG_ON(q->info.si_code != SI_TIMER);
1517 BUG();
1518 q->info.si_overrun++; 1386 q->info.si_overrun++;
1519 goto out; 1387 goto out;
1520 } 1388 }
@@ -1613,14 +1481,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1613 spin_unlock_irqrestore(&psig->siglock, flags); 1481 spin_unlock_irqrestore(&psig->siglock, flags);
1614} 1482}
1615 1483
1616static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) 1484static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1617{ 1485{
1618 struct siginfo info; 1486 struct siginfo info;
1619 unsigned long flags; 1487 unsigned long flags;
1620 struct task_struct *parent; 1488 struct task_struct *parent;
1621 struct sighand_struct *sighand; 1489 struct sighand_struct *sighand;
1622 1490
1623 if (to_self) 1491 if (tsk->ptrace & PT_PTRACED)
1624 parent = tsk->parent; 1492 parent = tsk->parent;
1625 else { 1493 else {
1626 tsk = tsk->group_leader; 1494 tsk = tsk->group_leader;
@@ -1689,13 +1557,14 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1689 /* Let the debugger run. */ 1557 /* Let the debugger run. */
1690 set_current_state(TASK_TRACED); 1558 set_current_state(TASK_TRACED);
1691 spin_unlock_irq(&current->sighand->siglock); 1559 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze();
1692 read_lock(&tasklist_lock); 1561 read_lock(&tasklist_lock);
1693 if (likely(current->ptrace & PT_PTRACED) && 1562 if (likely(current->ptrace & PT_PTRACED) &&
1694 likely(current->parent != current->real_parent || 1563 likely(current->parent != current->real_parent ||
1695 !(current->ptrace & PT_ATTACHED)) && 1564 !(current->ptrace & PT_ATTACHED)) &&
1696 (likely(current->parent->signal != current->signal) || 1565 (likely(current->parent->signal != current->signal) ||
1697 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1698 do_notify_parent_cldstop(current, 1, CLD_TRAPPED); 1567 do_notify_parent_cldstop(current, CLD_TRAPPED);
1699 read_unlock(&tasklist_lock); 1568 read_unlock(&tasklist_lock);
1700 schedule(); 1569 schedule();
1701 } else { 1570 } else {
@@ -1744,25 +1613,17 @@ void ptrace_notify(int exit_code)
1744static void 1613static void
1745finish_stop(int stop_count) 1614finish_stop(int stop_count)
1746{ 1615{
1747 int to_self;
1748
1749 /* 1616 /*
1750 * If there are no other threads in the group, or if there is 1617 * If there are no other threads in the group, or if there is
1751 * a group stop in progress and we are the last to stop, 1618 * a group stop in progress and we are the last to stop,
1752 * report to the parent. When ptraced, every thread reports itself. 1619 * report to the parent. When ptraced, every thread reports itself.
1753 */ 1620 */
1754 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) 1621 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
1755 to_self = 1; 1622 read_lock(&tasklist_lock);
1756 else if (stop_count == 0) 1623 do_notify_parent_cldstop(current, CLD_STOPPED);
1757 to_self = 0; 1624 read_unlock(&tasklist_lock);
1758 else 1625 }
1759 goto out;
1760
1761 read_lock(&tasklist_lock);
1762 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1763 read_unlock(&tasklist_lock);
1764 1626
1765out:
1766 schedule(); 1627 schedule();
1767 /* 1628 /*
1768 * Now we don't run again until continued. 1629 * Now we don't run again until continued.
@@ -1776,12 +1637,10 @@ out:
1776 * Returns nonzero if we've actually stopped and released the siglock. 1637 * Returns nonzero if we've actually stopped and released the siglock.
1777 * Returns zero if we didn't stop and still hold the siglock. 1638 * Returns zero if we didn't stop and still hold the siglock.
1778 */ 1639 */
1779static int 1640static int do_signal_stop(int signr)
1780do_signal_stop(int signr)
1781{ 1641{
1782 struct signal_struct *sig = current->signal; 1642 struct signal_struct *sig = current->signal;
1783 struct sighand_struct *sighand = current->sighand; 1643 int stop_count;
1784 int stop_count = -1;
1785 1644
1786 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) 1645 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1787 return 0; 1646 return 0;
@@ -1791,86 +1650,37 @@ do_signal_stop(int signr)
1791 * There is a group stop in progress. We don't need to 1650 * There is a group stop in progress. We don't need to
1792 * start another one. 1651 * start another one.
1793 */ 1652 */
1794 signr = sig->group_exit_code;
1795 stop_count = --sig->group_stop_count; 1653 stop_count = --sig->group_stop_count;
1796 current->exit_code = signr; 1654 } else {
1797 set_current_state(TASK_STOPPED);
1798 if (stop_count == 0)
1799 sig->flags = SIGNAL_STOP_STOPPED;
1800 spin_unlock_irq(&sighand->siglock);
1801 }
1802 else if (thread_group_empty(current)) {
1803 /*
1804 * Lock must be held through transition to stopped state.
1805 */
1806 current->exit_code = current->signal->group_exit_code = signr;
1807 set_current_state(TASK_STOPPED);
1808 sig->flags = SIGNAL_STOP_STOPPED;
1809 spin_unlock_irq(&sighand->siglock);
1810 }
1811 else {
1812 /* 1655 /*
1813 * There is no group stop already in progress. 1656 * There is no group stop already in progress.
1814 * We must initiate one now, but that requires 1657 * We must initiate one now.
1815 * dropping siglock to get both the tasklist lock
1816 * and siglock again in the proper order. Note that
1817 * this allows an intervening SIGCONT to be posted.
1818 * We need to check for that and bail out if necessary.
1819 */ 1658 */
1820 struct task_struct *t; 1659 struct task_struct *t;
1821 1660
1822 spin_unlock_irq(&sighand->siglock); 1661 sig->group_exit_code = signr;
1823 1662
1824 /* signals can be posted during this window */ 1663 stop_count = 0;
1825 1664 for (t = next_thread(current); t != current; t = next_thread(t))
1826 read_lock(&tasklist_lock);
1827 spin_lock_irq(&sighand->siglock);
1828
1829 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
1830 /* 1665 /*
1831 * Another stop or continue happened while we 1666 * Setting state to TASK_STOPPED for a group
1832 * didn't have the lock. We can just swallow this 1667 * stop is always done with the siglock held,
1833 * signal now. If we raced with a SIGCONT, that 1668 * so this check has no races.
1834 * should have just cleared it now. If we raced
1835 * with another processor delivering a stop signal,
1836 * then the SIGCONT that wakes us up should clear it.
1837 */ 1669 */
1838 read_unlock(&tasklist_lock); 1670 if (!t->exit_state &&
1839 return 0; 1671 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1840 } 1672 stop_count++;
1841 1673 signal_wake_up(t, 0);
1842 if (sig->group_stop_count == 0) { 1674 }
1843 sig->group_exit_code = signr; 1675 sig->group_stop_count = stop_count;
1844 stop_count = 0;
1845 for (t = next_thread(current); t != current;
1846 t = next_thread(t))
1847 /*
1848 * Setting state to TASK_STOPPED for a group
1849 * stop is always done with the siglock held,
1850 * so this check has no races.
1851 */
1852 if (!t->exit_state &&
1853 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1854 stop_count++;
1855 signal_wake_up(t, 0);
1856 }
1857 sig->group_stop_count = stop_count;
1858 }
1859 else {
1860 /* A race with another thread while unlocked. */
1861 signr = sig->group_exit_code;
1862 stop_count = --sig->group_stop_count;
1863 }
1864
1865 current->exit_code = signr;
1866 set_current_state(TASK_STOPPED);
1867 if (stop_count == 0)
1868 sig->flags = SIGNAL_STOP_STOPPED;
1869
1870 spin_unlock_irq(&sighand->siglock);
1871 read_unlock(&tasklist_lock);
1872 } 1676 }
1873 1677
1678 if (stop_count == 0)
1679 sig->flags = SIGNAL_STOP_STOPPED;
1680 current->exit_code = sig->group_exit_code;
1681 __set_current_state(TASK_STOPPED);
1682
1683 spin_unlock_irq(&current->sighand->siglock);
1874 finish_stop(stop_count); 1684 finish_stop(stop_count);
1875 return 1; 1685 return 1;
1876} 1686}
@@ -1944,9 +1754,9 @@ relock:
1944 /* Let the debugger run. */ 1754 /* Let the debugger run. */
1945 ptrace_stop(signr, signr, info); 1755 ptrace_stop(signr, signr, info);
1946 1756
1947 /* We're back. Did the debugger cancel the sig or group_exit? */ 1757 /* We're back. Did the debugger cancel the sig? */
1948 signr = current->exit_code; 1758 signr = current->exit_code;
1949 if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) 1759 if (signr == 0)
1950 continue; 1760 continue;
1951 1761
1952 current->exit_code = 0; 1762 current->exit_code = 0;
@@ -1990,7 +1800,7 @@ relock:
1990 continue; 1800 continue;
1991 1801
1992 /* Init gets no signals it doesn't want. */ 1802 /* Init gets no signals it doesn't want. */
1993 if (current->pid == 1) 1803 if (current == child_reaper)
1994 continue; 1804 continue;
1995 1805
1996 if (sig_kernel_stop(signr)) { 1806 if (sig_kernel_stop(signr)) {
@@ -2430,8 +2240,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2430 return kill_proc_info(sig, &info, pid); 2240 return kill_proc_info(sig, &info, pid);
2431} 2241}
2432 2242
2433int 2243int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2434do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2435{ 2244{
2436 struct k_sigaction *k; 2245 struct k_sigaction *k;
2437 sigset_t mask; 2246 sigset_t mask;
@@ -2457,6 +2266,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2457 if (act) { 2266 if (act) {
2458 sigdelsetmask(&act->sa.sa_mask, 2267 sigdelsetmask(&act->sa.sa_mask,
2459 sigmask(SIGKILL) | sigmask(SIGSTOP)); 2268 sigmask(SIGKILL) | sigmask(SIGSTOP));
2269 *k = *act;
2460 /* 2270 /*
2461 * POSIX 3.3.1.3: 2271 * POSIX 3.3.1.3:
2462 * "Setting a signal action to SIG_IGN for a signal that is 2272 * "Setting a signal action to SIG_IGN for a signal that is
@@ -2469,19 +2279,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2469 * be discarded, whether or not it is blocked" 2279 * be discarded, whether or not it is blocked"
2470 */ 2280 */
2471 if (act->sa.sa_handler == SIG_IGN || 2281 if (act->sa.sa_handler == SIG_IGN ||
2472 (act->sa.sa_handler == SIG_DFL && 2282 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2473 sig_kernel_ignore(sig))) {
2474 /*
2475 * This is a fairly rare case, so we only take the
2476 * tasklist_lock once we're sure we'll need it.
2477 * Now we must do this little unlock and relock
2478 * dance to maintain the lock hierarchy.
2479 */
2480 struct task_struct *t = current; 2283 struct task_struct *t = current;
2481 spin_unlock_irq(&t->sighand->siglock);
2482 read_lock(&tasklist_lock);
2483 spin_lock_irq(&t->sighand->siglock);
2484 *k = *act;
2485 sigemptyset(&mask); 2284 sigemptyset(&mask);
2486 sigaddset(&mask, sig); 2285 sigaddset(&mask, sig);
2487 rm_from_queue_full(&mask, &t->signal->shared_pending); 2286 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2490,12 +2289,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2490 recalc_sigpending_tsk(t); 2289 recalc_sigpending_tsk(t);
2491 t = next_thread(t); 2290 t = next_thread(t);
2492 } while (t != current); 2291 } while (t != current);
2493 spin_unlock_irq(&current->sighand->siglock);
2494 read_unlock(&tasklist_lock);
2495 return 0;
2496 } 2292 }
2497
2498 *k = *act;
2499 } 2293 }
2500 2294
2501 spin_unlock_irq(&current->sighand->siglock); 2295 spin_unlock_irq(&current->sighand->siglock);
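The do_sigaction() hunks above now implement the quoted POSIX 3.3.1.3 rule entirely under siglock, without the old unlock/relock dance through tasklist_lock. A small userspace sketch of the behaviour being preserved, namely that installing SIG_IGN discards a pending, blocked signal (illustrative only, not part of the patch):

#include <signal.h>
#include <stdio.h>

int main(void)
{
        sigset_t blocked, pending;
        struct sigaction sa;

        sigemptyset(&blocked);
        sigaddset(&blocked, SIGUSR1);
        sigprocmask(SIG_BLOCK, &blocked, NULL);
        raise(SIGUSR1);                 /* SIGUSR1 is now pending and blocked */

        sa.sa_handler = SIG_IGN;        /* kernel side: rm_from_queue_full() drops it */
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = 0;
        sigaction(SIGUSR1, &sa, NULL);

        sigpending(&pending);
        printf("still pending: %d\n", sigismember(&pending, SIGUSR1)); /* prints 0 */
        return 0;
}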
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ec8fed42a8..336f92d64e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
446} 446}
447#endif /* CONFIG_HOTPLUG_CPU */ 447#endif /* CONFIG_HOTPLUG_CPU */
448 448
449static int __devinit cpu_callback(struct notifier_block *nfb, 449static int cpu_callback(struct notifier_block *nfb,
450 unsigned long action, 450 unsigned long action,
451 void *hcpu) 451 void *hcpu)
452{ 452{
@@ -484,7 +484,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
484 return NOTIFY_OK; 484 return NOTIFY_OK;
485} 485}
486 486
487static struct notifier_block __devinitdata cpu_nfb = { 487static struct notifier_block cpu_nfb = {
488 .notifier_call = cpu_callback 488 .notifier_call = cpu_callback
489}; 489};
490 490
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9b3d5847e..14c7faf029 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int __devinit 107static int
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -140,7 +140,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
140 return NOTIFY_OK; 140 return NOTIFY_OK;
141} 141}
142 142
143static struct notifier_block __devinitdata cpu_nfb = { 143static struct notifier_block cpu_nfb = {
144 .notifier_call = cpu_callback 144 .notifier_call = cpu_callback
145}; 145};
146 146
@@ -152,5 +152,5 @@ __init void spawn_softlockup_task(void)
152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
153 register_cpu_notifier(&cpu_nfb); 153 register_cpu_notifier(&cpu_nfb);
154 154
155 notifier_chain_register(&panic_notifier_list, &panic_block); 155 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
156} 156}
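The softirq.c and softlockup.c hunks drop the __devinit/__devinitdata annotations from the CPU-hotplug callbacks and move the soft-lockup panic hook onto the new atomic notifier chain. A sketch of a client of that API, modelled on the atomic_notifier_chain_register() call above; the my_* names are illustrative, and panic_notifier_list is assumed to be redeclared as an atomic_notifier_head in <linux/kernel.h> as part of this conversion:

#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int my_panic_event(struct notifier_block *this,
                          unsigned long event, void *ptr)
{
        /* Atomic chain: may run with interrupts off, must not sleep.
         * ptr carries the panic message string. */
        printk(KERN_EMERG "panic callback: %s\n", (char *)ptr);
        return NOTIFY_DONE;
}

static struct notifier_block my_panic_block = {
        .notifier_call = my_panic_event,
};

static int __init my_panic_init(void)
{
        atomic_notifier_chain_register(&panic_notifier_list, &my_panic_block);
        return 0;
}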
diff --git a/kernel/sys.c b/kernel/sys.c
index 38bc73ede2..0b6ec0e793 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
95 * and the like. 95 * and the like.
96 */ 96 */
97 97
98static struct notifier_block *reboot_notifier_list; 98static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
99static DEFINE_RWLOCK(notifier_lock); 99
100/*
101 * Notifier chain core routines. The exported routines below
102 * are layered on top of these, with appropriate locking added.
103 */
104
105static int notifier_chain_register(struct notifier_block **nl,
106 struct notifier_block *n)
107{
108 while ((*nl) != NULL) {
109 if (n->priority > (*nl)->priority)
110 break;
111 nl = &((*nl)->next);
112 }
113 n->next = *nl;
114 rcu_assign_pointer(*nl, n);
115 return 0;
116}
117
118static int notifier_chain_unregister(struct notifier_block **nl,
119 struct notifier_block *n)
120{
121 while ((*nl) != NULL) {
122 if ((*nl) == n) {
123 rcu_assign_pointer(*nl, n->next);
124 return 0;
125 }
126 nl = &((*nl)->next);
127 }
128 return -ENOENT;
129}
130
131static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v)
133{
134 int ret = NOTIFY_DONE;
135 struct notifier_block *nb;
136
137 nb = rcu_dereference(*nl);
138 while (nb) {
139 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break;
142 nb = rcu_dereference(nb->next);
143 }
144 return ret;
145}
146
147/*
148 * Atomic notifier chain routines. Registration and unregistration
149 * use a mutex, and call_chain is synchronized by RCU (no locks).
150 */
100 151
101/** 152/**
102 * notifier_chain_register - Add notifier to a notifier chain 153 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
103 * @list: Pointer to root list pointer 154 * @nh: Pointer to head of the atomic notifier chain
104 * @n: New entry in notifier chain 155 * @n: New entry in notifier chain
105 * 156 *
106 * Adds a notifier to a notifier chain. 157 * Adds a notifier to an atomic notifier chain.
107 * 158 *
108 * Currently always returns zero. 159 * Currently always returns zero.
109 */ 160 */
161
162int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
163 struct notifier_block *n)
164{
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&nh->lock, flags);
169 ret = notifier_chain_register(&nh->head, n);
170 spin_unlock_irqrestore(&nh->lock, flags);
171 return ret;
172}
173
174EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
175
176/**
177 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
178 * @nh: Pointer to head of the atomic notifier chain
179 * @n: Entry to remove from notifier chain
180 *
181 * Removes a notifier from an atomic notifier chain.
182 *
183 * Returns zero on success or %-ENOENT on failure.
184 */
185int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
186 struct notifier_block *n)
187{
188 unsigned long flags;
189 int ret;
190
191 spin_lock_irqsave(&nh->lock, flags);
192 ret = notifier_chain_unregister(&nh->head, n);
193 spin_unlock_irqrestore(&nh->lock, flags);
194 synchronize_rcu();
195 return ret;
196}
197
198EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
199
200/**
201 * atomic_notifier_call_chain - Call functions in an atomic notifier chain
202 * @nh: Pointer to head of the atomic notifier chain
203 * @val: Value passed unmodified to notifier function
204 * @v: Pointer passed unmodified to notifier function
205 *
206 * Calls each function in a notifier chain in turn. The functions
207 * run in an atomic context, so they must not block.
208 * This routine uses RCU to synchronize with changes to the chain.
209 *
210 * If the return value of the notifier can be and'ed
211 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
212 * will return immediately, with the return value of
213 * the notifier function which halted execution.
214 * Otherwise the return value is the return value
215 * of the last notifier function called.
216 */
110 217
111int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) 218int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
219 unsigned long val, void *v)
112{ 220{
113 write_lock(&notifier_lock); 221 int ret;
114 while(*list) 222
115 { 223 rcu_read_lock();
116 if(n->priority > (*list)->priority) 224 ret = notifier_call_chain(&nh->head, val, v);
117 break; 225 rcu_read_unlock();
118 list= &((*list)->next); 226 return ret;
119 }
120 n->next = *list;
121 *list=n;
122 write_unlock(&notifier_lock);
123 return 0;
124} 227}
125 228
126EXPORT_SYMBOL(notifier_chain_register); 229EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
230
231/*
232 * Blocking notifier chain routines. All access to the chain is
233 * synchronized by an rwsem.
234 */
127 235
128/** 236/**
129 * notifier_chain_unregister - Remove notifier from a notifier chain 237 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
130 * @nl: Pointer to root list pointer 238 * @nh: Pointer to head of the blocking notifier chain
131 * @n: New entry in notifier chain 239 * @n: New entry in notifier chain
132 * 240 *
133 * Removes a notifier from a notifier chain. 241 * Adds a notifier to a blocking notifier chain.
242 * Must be called in process context.
134 * 243 *
135 * Returns zero on success, or %-ENOENT on failure. 244 * Currently always returns zero.
136 */ 245 */
137 246
138int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) 247int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
248 struct notifier_block *n)
139{ 249{
140 write_lock(&notifier_lock); 250 int ret;
141 while((*nl)!=NULL) 251
142 { 252 /*
143 if((*nl)==n) 253 * This code gets used during boot-up, when task switching is
144 { 254 * not yet working and interrupts must remain disabled. At
145 *nl=n->next; 255 * such times we must not call down_write().
146 write_unlock(&notifier_lock); 256 */
147 return 0; 257 if (unlikely(system_state == SYSTEM_BOOTING))
148 } 258 return notifier_chain_register(&nh->head, n);
149 nl=&((*nl)->next); 259
150 } 260 down_write(&nh->rwsem);
151 write_unlock(&notifier_lock); 261 ret = notifier_chain_register(&nh->head, n);
152 return -ENOENT; 262 up_write(&nh->rwsem);
263 return ret;
153} 264}
154 265
155EXPORT_SYMBOL(notifier_chain_unregister); 266EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
156 267
157/** 268/**
158 * notifier_call_chain - Call functions in a notifier chain 269 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
159 * @n: Pointer to root pointer of notifier chain 270 * @nh: Pointer to head of the blocking notifier chain
271 * @n: Entry to remove from notifier chain
272 *
273 * Removes a notifier from a blocking notifier chain.
274 * Must be called from process context.
275 *
276 * Returns zero on success or %-ENOENT on failure.
277 */
278int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
279 struct notifier_block *n)
280{
281 int ret;
282
283 /*
284 * This code gets used during boot-up, when task switching is
285 * not yet working and interrupts must remain disabled. At
286 * such times we must not call down_write().
287 */
288 if (unlikely(system_state == SYSTEM_BOOTING))
289 return notifier_chain_unregister(&nh->head, n);
290
291 down_write(&nh->rwsem);
292 ret = notifier_chain_unregister(&nh->head, n);
293 up_write(&nh->rwsem);
294 return ret;
295}
296
297EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
298
299/**
300 * blocking_notifier_call_chain - Call functions in a blocking notifier chain
301 * @nh: Pointer to head of the blocking notifier chain
160 * @val: Value passed unmodified to notifier function 302 * @val: Value passed unmodified to notifier function
161 * @v: Pointer passed unmodified to notifier function 303 * @v: Pointer passed unmodified to notifier function
162 * 304 *
163 * Calls each function in a notifier chain in turn. 305 * Calls each function in a notifier chain in turn. The functions
306 * run in a process context, so they are allowed to block.
164 * 307 *
165 * If the return value of the notifier can be and'd 308 * If the return value of the notifier can be and'ed
166 * with %NOTIFY_STOP_MASK, then notifier_call_chain 309 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
167 * will return immediately, with the return value of 310 * will return immediately, with the return value of
168 * the notifier function which halted execution. 311 * the notifier function which halted execution.
169 * Otherwise, the return value is the return value 312 * Otherwise the return value is the return value
170 * of the last notifier function called. 313 * of the last notifier function called.
171 */ 314 */
172 315
173int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 316int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
317 unsigned long val, void *v)
174{ 318{
175 int ret=NOTIFY_DONE; 319 int ret;
176 struct notifier_block *nb = *n;
177 320
178 while(nb) 321 down_read(&nh->rwsem);
179 { 322 ret = notifier_call_chain(&nh->head, val, v);
180 ret=nb->notifier_call(nb,val,v); 323 up_read(&nh->rwsem);
181 if(ret&NOTIFY_STOP_MASK)
182 {
183 return ret;
184 }
185 nb=nb->next;
186 }
187 return ret; 324 return ret;
188} 325}
189 326
190EXPORT_SYMBOL(notifier_call_chain); 327EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
328
329/*
330 * Raw notifier chain routines. There is no protection;
331 * the caller must provide it. Use at your own risk!
332 */
333
334/**
335 * raw_notifier_chain_register - Add notifier to a raw notifier chain
336 * @nh: Pointer to head of the raw notifier chain
337 * @n: New entry in notifier chain
338 *
339 * Adds a notifier to a raw notifier chain.
340 * All locking must be provided by the caller.
341 *
342 * Currently always returns zero.
343 */
344
345int raw_notifier_chain_register(struct raw_notifier_head *nh,
346 struct notifier_block *n)
347{
348 return notifier_chain_register(&nh->head, n);
349}
350
351EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
352
353/**
354 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
355 * @nh: Pointer to head of the raw notifier chain
356 * @n: Entry to remove from notifier chain
357 *
358 * Removes a notifier from a raw notifier chain.
359 * All locking must be provided by the caller.
360 *
361 * Returns zero on success or %-ENOENT on failure.
362 */
363int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
364 struct notifier_block *n)
365{
366 return notifier_chain_unregister(&nh->head, n);
367}
368
369EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
370
371/**
372 * raw_notifier_call_chain - Call functions in a raw notifier chain
373 * @nh: Pointer to head of the raw notifier chain
374 * @val: Value passed unmodified to notifier function
375 * @v: Pointer passed unmodified to notifier function
376 *
377 * Calls each function in a notifier chain in turn. The functions
378 * run in an undefined context.
379 * All locking must be provided by the caller.
380 *
381 * If the return value of the notifier can be and'ed
382 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain
383 * will return immediately, with the return value of
384 * the notifier function which halted execution.
385 * Otherwise the return value is the return value
386 * of the last notifier function called.
387 */
388
389int raw_notifier_call_chain(struct raw_notifier_head *nh,
390 unsigned long val, void *v)
391{
392 return notifier_call_chain(&nh->head, val, v);
393}
394
395EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
191 396
192/** 397/**
193 * register_reboot_notifier - Register function to be called at reboot time 398 * register_reboot_notifier - Register function to be called at reboot time
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
196 * Registers a function with the list of functions 401 * Registers a function with the list of functions
197 * to be called at reboot time. 402 * to be called at reboot time.
198 * 403 *
199 * Currently always returns zero, as notifier_chain_register 404 * Currently always returns zero, as blocking_notifier_chain_register
200 * always returns zero. 405 * always returns zero.
201 */ 406 */
202 407
203int register_reboot_notifier(struct notifier_block * nb) 408int register_reboot_notifier(struct notifier_block * nb)
204{ 409{
205 return notifier_chain_register(&reboot_notifier_list, nb); 410 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
206} 411}
207 412
208EXPORT_SYMBOL(register_reboot_notifier); 413EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,7 +424,7 @@ EXPORT_SYMBOL(register_reboot_notifier);
219 424
220int unregister_reboot_notifier(struct notifier_block * nb) 425int unregister_reboot_notifier(struct notifier_block * nb)
221{ 426{
222 return notifier_chain_unregister(&reboot_notifier_list, nb); 427 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
223} 428}
224 429
225EXPORT_SYMBOL(unregister_reboot_notifier); 430EXPORT_SYMBOL(unregister_reboot_notifier);
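With reboot_notifier_list converted to a BLOCKING_NOTIFIER_HEAD, reboot callbacks are invoked from process context via blocking_notifier_call_chain() and may sleep. A registration sketch against the exported helpers above; the my_* names are illustrative, and SYS_RESTART/SYS_HALT/SYS_POWER_OFF are the event codes passed by kernel_restart_prepare() and kernel_shutdown_prepare() further down:

#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int my_reboot_event(struct notifier_block *nb,
                           unsigned long code, void *cmd)
{
        /* Blocking chain: sleeping (e.g. flushing state to disk) is
         * permitted here.  cmd is the restart command string for
         * SYS_RESTART and may be NULL. */
        printk(KERN_INFO "reboot event %lu, cmd %s\n",
               code, cmd ? (char *)cmd : "(none)");
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call  = my_reboot_event,
        .priority       = 0,            /* chains are kept sorted by priority */
};

static int __init my_reboot_init(void)
{
        return register_reboot_notifier(&my_reboot_nb);
}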
@@ -380,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
380 585
381void kernel_restart_prepare(char *cmd) 586void kernel_restart_prepare(char *cmd)
382{ 587{
383 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
384 system_state = SYSTEM_RESTART; 589 system_state = SYSTEM_RESTART;
385 device_shutdown(); 590 device_shutdown();
386} 591}
@@ -430,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
430 635
431void kernel_shutdown_prepare(enum system_states state) 636void kernel_shutdown_prepare(enum system_states state)
432{ 637{
433 notifier_call_chain(&reboot_notifier_list, 638 blocking_notifier_call_chain(&reboot_notifier_list,
434 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 639 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
435 system_state = state; 640 system_state = state;
436 device_shutdown(); 641 device_shutdown();
@@ -997,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf)
997 */ 1202 */
998 if (tbuf) { 1203 if (tbuf) {
999 struct tms tmp; 1204 struct tms tmp;
1205 struct task_struct *tsk = current;
1206 struct task_struct *t;
1000 cputime_t utime, stime, cutime, cstime; 1207 cputime_t utime, stime, cutime, cstime;
1001 1208
1002#ifdef CONFIG_SMP 1209 spin_lock_irq(&tsk->sighand->siglock);
1003 if (thread_group_empty(current)) { 1210 utime = tsk->signal->utime;
1004 /* 1211 stime = tsk->signal->stime;
1005 * Single thread case without the use of any locks. 1212 t = tsk;
1006 * 1213 do {
1007 * We may race with release_task if two threads are 1214 utime = cputime_add(utime, t->utime);
1008 * executing. However, release task first adds up the 1215 stime = cputime_add(stime, t->stime);
1009 * counters (__exit_signal) before removing the task 1216 t = next_thread(t);
1010 * from the process tasklist (__unhash_process). 1217 } while (t != tsk);
1011 * __exit_signal also acquires and releases the
1012 * siglock which results in the proper memory ordering
1013 * so that the list modifications are always visible
1014 * after the counters have been updated.
1015 *
1016 * If the counters have been updated by the second thread
1017 * but the thread has not yet been removed from the list
1018 * then the other branch will be executing which will
1019 * block on tasklist_lock until the exit handling of the
1020 * other task is finished.
1021 *
1022 * This also implies that the sighand->siglock cannot
1023 * be held by another processor. So we can also
1024 * skip acquiring that lock.
1025 */
1026 utime = cputime_add(current->signal->utime, current->utime);
1027 stime = cputime_add(current->signal->utime, current->stime);
1028 cutime = current->signal->cutime;
1029 cstime = current->signal->cstime;
1030 } else
1031#endif
1032 {
1033
1034 /* Process with multiple threads */
1035 struct task_struct *tsk = current;
1036 struct task_struct *t;
1037 1218
1038 read_lock(&tasklist_lock); 1219 cutime = tsk->signal->cutime;
1039 utime = tsk->signal->utime; 1220 cstime = tsk->signal->cstime;
1040 stime = tsk->signal->stime; 1221 spin_unlock_irq(&tsk->sighand->siglock);
1041 t = tsk;
1042 do {
1043 utime = cputime_add(utime, t->utime);
1044 stime = cputime_add(stime, t->stime);
1045 t = next_thread(t);
1046 } while (t != tsk);
1047 1222
1048 /*
1049 * While we have tasklist_lock read-locked, no dying thread
1050 * can be updating current->signal->[us]time. Instead,
1051 * we got their counts included in the live thread loop.
1052 * However, another thread can come in right now and
1053 * do a wait call that updates current->signal->c[us]time.
1054 * To make sure we always see that pair updated atomically,
1055 * we take the siglock around fetching them.
1056 */
1057 spin_lock_irq(&tsk->sighand->siglock);
1058 cutime = tsk->signal->cutime;
1059 cstime = tsk->signal->cstime;
1060 spin_unlock_irq(&tsk->sighand->siglock);
1061 read_unlock(&tasklist_lock);
1062 }
1063 tmp.tms_utime = cputime_to_clock_t(utime); 1223 tmp.tms_utime = cputime_to_clock_t(utime);
1064 tmp.tms_stime = cputime_to_clock_t(stime); 1224 tmp.tms_stime = cputime_to_clock_t(stime);
1065 tmp.tms_cutime = cputime_to_clock_t(cutime); 1225 tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1212,18 +1372,29 @@ asmlinkage long sys_getsid(pid_t pid)
1212asmlinkage long sys_setsid(void) 1372asmlinkage long sys_setsid(void)
1213{ 1373{
1214 struct task_struct *group_leader = current->group_leader; 1374 struct task_struct *group_leader = current->group_leader;
1215 struct pid *pid; 1375 pid_t session;
1216 int err = -EPERM; 1376 int err = -EPERM;
1217 1377
1218 mutex_lock(&tty_mutex); 1378 mutex_lock(&tty_mutex);
1219 write_lock_irq(&tasklist_lock); 1379 write_lock_irq(&tasklist_lock);
1220 1380
1221 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1381 /* Fail if I am already a session leader */
1222 if (pid) 1382 if (group_leader->signal->leader)
1383 goto out;
1384
1385 session = group_leader->pid;
1386 /* Fail if a process group id already exists that equals the
1387 * proposed session id.
1388 *
1389 * Don't check if session id == 1 because kernel threads use this
1390 * session id and so the check will always fail and make it so
1391 * init cannot successfully call setsid.
1392 */
1393 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
1223 goto out; 1394 goto out;
1224 1395
1225 group_leader->signal->leader = 1; 1396 group_leader->signal->leader = 1;
1226 __set_special_pids(group_leader->pid, group_leader->pid); 1397 __set_special_pids(session, session);
1227 group_leader->signal->tty = NULL; 1398 group_leader->signal->tty = NULL;
1228 group_leader->signal->tty_old_pgrp = 0; 1399 group_leader->signal->tty_old_pgrp = 0;
1229 err = process_group(group_leader); 1400 err = process_group(group_leader);
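The sys_setsid() hunk replaces the find_pid(PIDTYPE_PGID, ...) lookup with an explicit session-leader check plus a scan for a clashing process-group id, with the session-id-1 carve-out so init itself can still call setsid(). A userspace sketch of the visible semantics (illustrative only):

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
        pid_t child;

        /* A foreground job is typically its own process-group leader, so a
         * pgrp equal to our pid already exists and the check above fails. */
        if (setsid() == (pid_t)-1)
                perror("setsid() as group leader");     /* expect EPERM */

        child = fork();
        if (child == 0) {
                /* The child is not a group leader; no pgrp carries its pid,
                 * so setsid() creates a new session and process group. */
                printf("child session id: %d\n", (int)setsid());
                _exit(0);
        }
        waitpid(child, NULL, 0);
        return 0;
}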
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6..5433195040 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
42cond_syscall(sys_socketcall); 42cond_syscall(sys_socketcall);
43cond_syscall(sys_futex); 43cond_syscall(sys_futex);
44cond_syscall(compat_sys_futex); 44cond_syscall(compat_sys_futex);
45cond_syscall(sys_set_robust_list);
46cond_syscall(compat_sys_set_robust_list);
47cond_syscall(sys_get_robust_list);
48cond_syscall(compat_sys_get_robust_list);
45cond_syscall(sys_epoll_create); 49cond_syscall(sys_epoll_create);
46cond_syscall(sys_epoll_ctl); 50cond_syscall(sys_epoll_ctl);
47cond_syscall(sys_epoll_wait); 51cond_syscall(sys_epoll_wait);
@@ -116,3 +120,15 @@ cond_syscall(sys32_sysctl);
116cond_syscall(ppc_rtas); 120cond_syscall(ppc_rtas);
117cond_syscall(sys_spu_run); 121cond_syscall(sys_spu_run);
118cond_syscall(sys_spu_create); 122cond_syscall(sys_spu_create);
123
124/* mmu depending weak syscall entries */
125cond_syscall(sys_mprotect);
126cond_syscall(sys_msync);
127cond_syscall(sys_mlock);
128cond_syscall(sys_munlock);
129cond_syscall(sys_mlockall);
130cond_syscall(sys_munlockall);
131cond_syscall(sys_mincore);
132cond_syscall(sys_madvise);
133cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages);
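The new cond_syscall() entries give the robust-futex and MMU-only system calls weak fallbacks, so a kernel built without the corresponding support resolves them to sys_ni_syscall() and returns -ENOSYS rather than failing to link. A userspace probe sketch, assuming the architecture headers define __NR_get_robust_list (illustrative only):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
#ifdef __NR_get_robust_list
        long ret = syscall(__NR_get_robust_list, 0, NULL, NULL);

        if (ret == -1 && errno == ENOSYS)
                printf("get_robust_list falls through to sys_ni_syscall\n");
        else
                printf("get_robust_list is wired up (ret=%ld errno=%d)\n",
                       ret, errno);
#else
        printf("__NR_get_robust_list not defined on this architecture\n");
#endif
        return 0;
}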
diff --git a/kernel/time.c b/kernel/time.c
index ff8e7019c4..b00ddc71ce 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -410,7 +410,7 @@ EXPORT_SYMBOL(current_kernel_time);
410 * current_fs_time - Return FS time 410 * current_fs_time - Return FS time
411 * @sb: Superblock. 411 * @sb: Superblock.
412 * 412 *
413 * Return the current time truncated to the time granuality supported by 413 * Return the current time truncated to the time granularity supported by
414 * the fs. 414 * the fs.
415 */ 415 */
416struct timespec current_fs_time(struct super_block *sb) 416struct timespec current_fs_time(struct super_block *sb)
@@ -421,11 +421,11 @@ struct timespec current_fs_time(struct super_block *sb)
421EXPORT_SYMBOL(current_fs_time); 421EXPORT_SYMBOL(current_fs_time);
422 422
423/** 423/**
424 * timespec_trunc - Truncate timespec to a granuality 424 * timespec_trunc - Truncate timespec to a granularity
425 * @t: Timespec 425 * @t: Timespec
426 * @gran: Granuality in ns. 426 * @gran: Granularity in ns.
427 * 427 *
428 * Truncate a timespec to a granuality. gran must be smaller than a second. 428 * Truncate a timespec to a granularity. gran must be smaller than a second.
429 * Always rounds down. 429 * Always rounds down.
430 * 430 *
431 * This function should be only used for timestamps returned by 431 * This function should be only used for timestamps returned by
diff --git a/kernel/timer.c b/kernel/timer.c
index ab189dd187..9e49deed46 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
54/* 54/*
55 * per-CPU timer vector definitions: 55 * per-CPU timer vector definitions:
56 */ 56 */
57
58#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 57#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
59#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 58#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
60#define TVN_SIZE (1 << TVN_BITS) 59#define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
62#define TVN_MASK (TVN_SIZE - 1) 61#define TVN_MASK (TVN_SIZE - 1)
63#define TVR_MASK (TVR_SIZE - 1) 62#define TVR_MASK (TVR_SIZE - 1)
64 63
65struct timer_base_s {
66 spinlock_t lock;
67 struct timer_list *running_timer;
68};
69
70typedef struct tvec_s { 64typedef struct tvec_s {
71 struct list_head vec[TVN_SIZE]; 65 struct list_head vec[TVN_SIZE];
72} tvec_t; 66} tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
76} tvec_root_t; 70} tvec_root_t;
77 71
78struct tvec_t_base_s { 72struct tvec_t_base_s {
79 struct timer_base_s t_base; 73 spinlock_t lock;
74 struct timer_list *running_timer;
80 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
81 tvec_root_t tv1; 76 tvec_root_t tv1;
82 tvec_t tv2; 77 tvec_t tv2;
@@ -86,14 +81,16 @@ struct tvec_t_base_s {
86} ____cacheline_aligned_in_smp; 81} ____cacheline_aligned_in_smp;
87 82
88typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
89static DEFINE_PER_CPU(tvec_base_t *, tvec_bases); 84
90static tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
91 88
92static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
93 struct timer_list *timer) 90 struct timer_list *timer)
94{ 91{
95#ifdef CONFIG_SMP 92#ifdef CONFIG_SMP
96 base->t_base.running_timer = timer; 93 base->running_timer = timer;
97#endif 94#endif
98} 95}
99 96
@@ -139,15 +136,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
139 list_add_tail(&timer->entry, vec); 136 list_add_tail(&timer->entry, vec);
140} 137}
141 138
142typedef struct timer_base_s timer_base_t;
143/*
144 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
145 * at compile time, and we need timer->base to lock the timer.
146 */
147timer_base_t __init_timer_base
148 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
149EXPORT_SYMBOL(__init_timer_base);
150
151/*** 139/***
152 * init_timer - initialize a timer. 140 * init_timer - initialize a timer.
153 * @timer: the timer to be initialized 141 * @timer: the timer to be initialized
@@ -158,7 +146,7 @@ EXPORT_SYMBOL(__init_timer_base);
158void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
159{ 147{
160 timer->entry.next = NULL; 148 timer->entry.next = NULL;
161 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base; 149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
162} 150}
163EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
164 152
@@ -174,7 +162,7 @@ static inline void detach_timer(struct timer_list *timer,
174} 162}
175 163
176/* 164/*
177 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 165 * We are using hashed locking: holding per_cpu(tvec_bases).lock
178 * means that all timers which are tied to this base via timer->base are 166 * means that all timers which are tied to this base via timer->base are
179 * locked, and the base itself is locked too. 167 * locked, and the base itself is locked too.
180 * 168 *
@@ -185,10 +173,10 @@ static inline void detach_timer(struct timer_list *timer,
185 * possible to set timer->base = NULL and drop the lock: the timer remains 173 * possible to set timer->base = NULL and drop the lock: the timer remains
186 * locked. 174 * locked.
187 */ 175 */
188static timer_base_t *lock_timer_base(struct timer_list *timer, 176static tvec_base_t *lock_timer_base(struct timer_list *timer,
189 unsigned long *flags) 177 unsigned long *flags)
190{ 178{
191 timer_base_t *base; 179 tvec_base_t *base;
192 180
193 for (;;) { 181 for (;;) {
194 base = timer->base; 182 base = timer->base;
@@ -205,8 +193,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
205 193
206int __mod_timer(struct timer_list *timer, unsigned long expires) 194int __mod_timer(struct timer_list *timer, unsigned long expires)
207{ 195{
208 timer_base_t *base; 196 tvec_base_t *base, *new_base;
209 tvec_base_t *new_base;
210 unsigned long flags; 197 unsigned long flags;
211 int ret = 0; 198 int ret = 0;
212 199
@@ -221,7 +208,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
221 208
222 new_base = __get_cpu_var(tvec_bases); 209 new_base = __get_cpu_var(tvec_bases);
223 210
224 if (base != &new_base->t_base) { 211 if (base != new_base) {
225 /* 212 /*
226 * We are trying to schedule the timer on the local CPU. 213 * We are trying to schedule the timer on the local CPU.
227 * However we can't change timer's base while it is running, 214 * However we can't change timer's base while it is running,
@@ -229,21 +216,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
229 * handler yet has not finished. This also guarantees that 216 * handler yet has not finished. This also guarantees that
230 * the timer is serialized wrt itself. 217 * the timer is serialized wrt itself.
231 */ 218 */
232 if (unlikely(base->running_timer == timer)) { 219 if (likely(base->running_timer != timer)) {
233 /* The timer remains on a former base */
234 new_base = container_of(base, tvec_base_t, t_base);
235 } else {
236 /* See the comment in lock_timer_base() */ 220 /* See the comment in lock_timer_base() */
237 timer->base = NULL; 221 timer->base = NULL;
238 spin_unlock(&base->lock); 222 spin_unlock(&base->lock);
239 spin_lock(&new_base->t_base.lock); 223 base = new_base;
240 timer->base = &new_base->t_base; 224 spin_lock(&base->lock);
225 timer->base = base;
241 } 226 }
242 } 227 }
243 228
244 timer->expires = expires; 229 timer->expires = expires;
245 internal_add_timer(new_base, timer); 230 internal_add_timer(base, timer);
246 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 231 spin_unlock_irqrestore(&base->lock, flags);
247 232
248 return ret; 233 return ret;
249} 234}
@@ -263,10 +248,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
263 unsigned long flags; 248 unsigned long flags;
264 249
265 BUG_ON(timer_pending(timer) || !timer->function); 250 BUG_ON(timer_pending(timer) || !timer->function);
266 spin_lock_irqsave(&base->t_base.lock, flags); 251 spin_lock_irqsave(&base->lock, flags);
267 timer->base = &base->t_base; 252 timer->base = base;
268 internal_add_timer(base, timer); 253 internal_add_timer(base, timer);
269 spin_unlock_irqrestore(&base->t_base.lock, flags); 254 spin_unlock_irqrestore(&base->lock, flags);
270} 255}
271 256
272 257
@@ -319,7 +304,7 @@ EXPORT_SYMBOL(mod_timer);
319 */ 304 */
320int del_timer(struct timer_list *timer) 305int del_timer(struct timer_list *timer)
321{ 306{
322 timer_base_t *base; 307 tvec_base_t *base;
323 unsigned long flags; 308 unsigned long flags;
324 int ret = 0; 309 int ret = 0;
325 310
@@ -346,7 +331,7 @@ EXPORT_SYMBOL(del_timer);
346 */ 331 */
347int try_to_del_timer_sync(struct timer_list *timer) 332int try_to_del_timer_sync(struct timer_list *timer)
348{ 333{
349 timer_base_t *base; 334 tvec_base_t *base;
350 unsigned long flags; 335 unsigned long flags;
351 int ret = -1; 336 int ret = -1;
352 337
@@ -410,7 +395,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
410 struct timer_list *tmp; 395 struct timer_list *tmp;
411 396
412 tmp = list_entry(curr, struct timer_list, entry); 397 tmp = list_entry(curr, struct timer_list, entry);
413 BUG_ON(tmp->base != &base->t_base); 398 BUG_ON(tmp->base != base);
414 curr = curr->next; 399 curr = curr->next;
415 internal_add_timer(base, tmp); 400 internal_add_timer(base, tmp);
416 } 401 }
@@ -432,7 +417,7 @@ static inline void __run_timers(tvec_base_t *base)
432{ 417{
433 struct timer_list *timer; 418 struct timer_list *timer;
434 419
435 spin_lock_irq(&base->t_base.lock); 420 spin_lock_irq(&base->lock);
436 while (time_after_eq(jiffies, base->timer_jiffies)) { 421 while (time_after_eq(jiffies, base->timer_jiffies)) {
437 struct list_head work_list = LIST_HEAD_INIT(work_list); 422 struct list_head work_list = LIST_HEAD_INIT(work_list);
438 struct list_head *head = &work_list; 423 struct list_head *head = &work_list;
@@ -458,7 +443,7 @@ static inline void __run_timers(tvec_base_t *base)
458 443
459 set_running_timer(base, timer); 444 set_running_timer(base, timer);
460 detach_timer(timer, 1); 445 detach_timer(timer, 1);
461 spin_unlock_irq(&base->t_base.lock); 446 spin_unlock_irq(&base->lock);
462 { 447 {
463 int preempt_count = preempt_count(); 448 int preempt_count = preempt_count();
464 fn(data); 449 fn(data);
@@ -471,11 +456,11 @@ static inline void __run_timers(tvec_base_t *base)
471 BUG(); 456 BUG();
472 } 457 }
473 } 458 }
474 spin_lock_irq(&base->t_base.lock); 459 spin_lock_irq(&base->lock);
475 } 460 }
476 } 461 }
477 set_running_timer(base, NULL); 462 set_running_timer(base, NULL);
478 spin_unlock_irq(&base->t_base.lock); 463 spin_unlock_irq(&base->lock);
479} 464}
480 465
481#ifdef CONFIG_NO_IDLE_HZ 466#ifdef CONFIG_NO_IDLE_HZ
@@ -506,7 +491,7 @@ unsigned long next_timer_interrupt(void)
506 hr_expires += jiffies; 491 hr_expires += jiffies;
507 492
508 base = __get_cpu_var(tvec_bases); 493 base = __get_cpu_var(tvec_bases);
509 spin_lock(&base->t_base.lock); 494 spin_lock(&base->lock);
510 expires = base->timer_jiffies + (LONG_MAX >> 1); 495 expires = base->timer_jiffies + (LONG_MAX >> 1);
511 list = NULL; 496 list = NULL;
512 497
@@ -554,7 +539,23 @@ found:
554 expires = nte->expires; 539 expires = nte->expires;
555 } 540 }
556 } 541 }
557 spin_unlock(&base->t_base.lock); 542 spin_unlock(&base->lock);
543
544 /*
545 * It can happen that other CPUs service timer IRQs and increment
546 * jiffies, but we have not yet got a local timer tick to process
547 * the timer wheels. In that case, the expiry time can be before
548 * jiffies, but since the high-resolution timer here is relative to
549 * jiffies, the default expression when high-resolution timers are
550 * not active,
551 *
552 * time_before(MAX_JIFFY_OFFSET + jiffies, expires)
553 *
554 * would falsely evaluate to true. If that is the case, just
555 * return jiffies so that we can immediately fire the local timer
556 */
557 if (time_before(expires, jiffies))
558 return jiffies;
558 559
559 if (time_before(hr_expires, expires)) 560 if (time_before(hr_expires, expires))
560 return hr_expires; 561 return hr_expires;
@@ -841,7 +842,7 @@ void update_process_times(int user_tick)
841 */ 842 */
842static unsigned long count_active_tasks(void) 843static unsigned long count_active_tasks(void)
843{ 844{
844 return (nr_running() + nr_uninterruptible()) * FIXED_1; 845 return nr_active() * FIXED_1;
845} 846}
846 847
847/* 848/*
@@ -1240,29 +1241,37 @@ static int __devinit init_timers_cpu(int cpu)
1240{ 1241{
1241 int j; 1242 int j;
1242 tvec_base_t *base; 1243 tvec_base_t *base;
1244 static char __devinitdata tvec_base_done[NR_CPUS];
1243 1245
1244 base = per_cpu(tvec_bases, cpu); 1246 if (!tvec_base_done[cpu]) {
1245 if (!base) {
1246 static char boot_done; 1247 static char boot_done;
1247 1248
1248 /*
1249 * Cannot do allocation in init_timers as that runs before the
1250 * allocator initializes (and would waste memory if there are
1251 * more possible CPUs than will ever be installed/brought up).
1252 */
1253 if (boot_done) { 1249 if (boot_done) {
1250 /*
1251 * The APs use this path later in boot
1252 */
1254 base = kmalloc_node(sizeof(*base), GFP_KERNEL, 1253 base = kmalloc_node(sizeof(*base), GFP_KERNEL,
1255 cpu_to_node(cpu)); 1254 cpu_to_node(cpu));
1256 if (!base) 1255 if (!base)
1257 return -ENOMEM; 1256 return -ENOMEM;
1258 memset(base, 0, sizeof(*base)); 1257 memset(base, 0, sizeof(*base));
1258 per_cpu(tvec_bases, cpu) = base;
1259 } else { 1259 } else {
1260 base = &boot_tvec_bases; 1260 /*
1261 * This is for the boot CPU - we use compile-time
1262 * static initialisation because per-cpu memory isn't
1263 * ready yet and because the memory allocators are not
1264 * initialised either.
1265 */
1261 boot_done = 1; 1266 boot_done = 1;
1267 base = &boot_tvec_bases;
1262 } 1268 }
1263 per_cpu(tvec_bases, cpu) = base; 1269 tvec_base_done[cpu] = 1;
1270 } else {
1271 base = per_cpu(tvec_bases, cpu);
1264 } 1272 }
1265 spin_lock_init(&base->t_base.lock); 1273
1274 spin_lock_init(&base->lock);
1266 for (j = 0; j < TVN_SIZE; j++) { 1275 for (j = 0; j < TVN_SIZE; j++) {
1267 INIT_LIST_HEAD(base->tv5.vec + j); 1276 INIT_LIST_HEAD(base->tv5.vec + j);
1268 INIT_LIST_HEAD(base->tv4.vec + j); 1277 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1284,7 +1293,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1284 while (!list_empty(head)) { 1293 while (!list_empty(head)) {
1285 timer = list_entry(head->next, struct timer_list, entry); 1294 timer = list_entry(head->next, struct timer_list, entry);
1286 detach_timer(timer, 0); 1295 detach_timer(timer, 0);
1287 timer->base = &new_base->t_base; 1296 timer->base = new_base;
1288 internal_add_timer(new_base, timer); 1297 internal_add_timer(new_base, timer);
1289 } 1298 }
1290} 1299}
@@ -1300,11 +1309,11 @@ static void __devinit migrate_timers(int cpu)
1300 new_base = get_cpu_var(tvec_bases); 1309 new_base = get_cpu_var(tvec_bases);
1301 1310
1302 local_irq_disable(); 1311 local_irq_disable();
1303 spin_lock(&new_base->t_base.lock); 1312 spin_lock(&new_base->lock);
1304 spin_lock(&old_base->t_base.lock); 1313 spin_lock(&old_base->lock);
1314
1315 BUG_ON(old_base->running_timer);
1305 1316
1306 if (old_base->t_base.running_timer)
1307 BUG();
1308 for (i = 0; i < TVR_SIZE; i++) 1317 for (i = 0; i < TVR_SIZE; i++)
1309 migrate_timer_list(new_base, old_base->tv1.vec + i); 1318 migrate_timer_list(new_base, old_base->tv1.vec + i);
1310 for (i = 0; i < TVN_SIZE; i++) { 1319 for (i = 0; i < TVN_SIZE; i++) {
@@ -1314,14 +1323,14 @@ static void __devinit migrate_timers(int cpu)
1314 migrate_timer_list(new_base, old_base->tv5.vec + i); 1323 migrate_timer_list(new_base, old_base->tv5.vec + i);
1315 } 1324 }
1316 1325
1317 spin_unlock(&old_base->t_base.lock); 1326 spin_unlock(&old_base->lock);
1318 spin_unlock(&new_base->t_base.lock); 1327 spin_unlock(&new_base->lock);
1319 local_irq_enable(); 1328 local_irq_enable();
1320 put_cpu_var(tvec_bases); 1329 put_cpu_var(tvec_bases);
1321} 1330}
1322#endif /* CONFIG_HOTPLUG_CPU */ 1331#endif /* CONFIG_HOTPLUG_CPU */
1323 1332
1324static int __devinit timer_cpu_notify(struct notifier_block *self, 1333static int timer_cpu_notify(struct notifier_block *self,
1325 unsigned long action, void *hcpu) 1334 unsigned long action, void *hcpu)
1326{ 1335{
1327 long cpu = (long)hcpu; 1336 long cpu = (long)hcpu;
@@ -1341,7 +1350,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
1341 return NOTIFY_OK; 1350 return NOTIFY_OK;
1342} 1351}
1343 1352
1344static struct notifier_block __devinitdata timers_nb = { 1353static struct notifier_block timers_nb = {
1345 .notifier_call = timer_cpu_notify, 1354 .notifier_call = timer_cpu_notify,
1346}; 1355};
1347 1356
@@ -1471,7 +1480,7 @@ static void time_interpolator_update(long delta_nsec)
1471 */ 1480 */
1472 if (jiffies % INTERPOLATOR_ADJUST == 0) 1481 if (jiffies % INTERPOLATOR_ADJUST == 0)
1473 { 1482 {
1474 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) 1483 if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
1475 time_interpolator->nsec_per_cyc--; 1484 time_interpolator->nsec_per_cyc--;
1476 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) 1485 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1477 time_interpolator->nsec_per_cyc++; 1486 time_interpolator->nsec_per_cyc++;
@@ -1495,8 +1504,7 @@ register_time_interpolator(struct time_interpolator *ti)
1495 unsigned long flags; 1504 unsigned long flags;
1496 1505
1497 /* Sanity check */ 1506 /* Sanity check */
1498 if (ti->frequency == 0 || ti->mask == 0) 1507 BUG_ON(ti->frequency == 0 || ti->mask == 0);
1499 BUG();
1500 1508
1501 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; 1509 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1502 spin_lock(&time_interpolator_lock); 1510 spin_lock(&time_interpolator_lock);
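The kernel/timer.c hunks fold the old timer_base_s into tvec_t_base_s, so timer->base now points directly at the per-CPU base and the __init_timer_base indirection disappears. The timer API itself is unchanged; a minimal usage sketch against it (the my_* names are illustrative):

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
        /* Runs in softirq context on the CPU whose tvec base queued it. */
}

static void my_timer_start(void)
{
        init_timer(&my_timer);          /* base = this CPU's tvec_bases entry */
        my_timer.function = my_timer_fn;
        my_timer.data = 0;
        my_timer.expires = jiffies + HZ;
        add_timer(&my_timer);           /* later: mod_timer(&my_timer, jiffies + 2*HZ) */
}

/* Tear-down should use del_timer_sync() (or try_to_del_timer_sync() where
 * waiting is not possible) so that a handler still running on another CPU
 * has finished before my_timer goes away. */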
diff --git a/kernel/uid16.c b/kernel/uid16.c
index aa25605027..187e2a4238 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -20,43 +20,67 @@
20 20
21asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) 21asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
22{ 22{
23 return sys_chown(filename, low2highuid(user), low2highgid(group)); 23 long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
24 /* avoid REGPARM breakage on x86: */
25 prevent_tail_call(ret);
26 return ret;
24} 27}
25 28
26asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) 29asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
27{ 30{
28 return sys_lchown(filename, low2highuid(user), low2highgid(group)); 31 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
32 /* avoid REGPARM breakage on x86: */
33 prevent_tail_call(ret);
34 return ret;
29} 35}
30 36
31asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) 37asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
32{ 38{
33 return sys_fchown(fd, low2highuid(user), low2highgid(group)); 39 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
40 /* avoid REGPARM breakage on x86: */
41 prevent_tail_call(ret);
42 return ret;
34} 43}
35 44
36asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) 45asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
37{ 46{
38 return sys_setregid(low2highgid(rgid), low2highgid(egid)); 47 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
48 /* avoid REGPARM breakage on x86: */
49 prevent_tail_call(ret);
50 return ret;
39} 51}
40 52
41asmlinkage long sys_setgid16(old_gid_t gid) 53asmlinkage long sys_setgid16(old_gid_t gid)
42{ 54{
43 return sys_setgid(low2highgid(gid)); 55 long ret = sys_setgid(low2highgid(gid));
56 /* avoid REGPARM breakage on x86: */
57 prevent_tail_call(ret);
58 return ret;
44} 59}
45 60
46asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) 61asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
47{ 62{
48 return sys_setreuid(low2highuid(ruid), low2highuid(euid)); 63 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
64 /* avoid REGPARM breakage on x86: */
65 prevent_tail_call(ret);
66 return ret;
49} 67}
50 68
51asmlinkage long sys_setuid16(old_uid_t uid) 69asmlinkage long sys_setuid16(old_uid_t uid)
52{ 70{
53 return sys_setuid(low2highuid(uid)); 71 long ret = sys_setuid(low2highuid(uid));
72 /* avoid REGPARM breakage on x86: */
73 prevent_tail_call(ret);
74 return ret;
54} 75}
55 76
56asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) 77asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
57{ 78{
58 return sys_setresuid(low2highuid(ruid), low2highuid(euid), 79 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
59 low2highuid(suid)); 80 low2highuid(suid));
81 /* avoid REGPARM breakage on x86: */
82 prevent_tail_call(ret);
83 return ret;
60} 84}
61 85
62asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 86asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
@@ -72,8 +96,11 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
72 96
73asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) 97asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
74{ 98{
75 return sys_setresgid(low2highgid(rgid), low2highgid(egid), 99 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
76 low2highgid(sgid)); 100 low2highgid(sgid));
101 /* avoid REGPARM breakage on x86: */
102 prevent_tail_call(ret);
103 return ret;
77} 104}
78 105
79asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
@@ -89,12 +116,18 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
89 116
90asmlinkage long sys_setfsuid16(old_uid_t uid) 117asmlinkage long sys_setfsuid16(old_uid_t uid)
91{ 118{
92 return sys_setfsuid(low2highuid(uid)); 119 long ret = sys_setfsuid(low2highuid(uid));
120 /* avoid REGPARM breakage on x86: */
121 prevent_tail_call(ret);
122 return ret;
93} 123}
94 124
95asmlinkage long sys_setfsgid16(old_gid_t gid) 125asmlinkage long sys_setfsgid16(old_gid_t gid)
96{ 126{
97 return sys_setfsgid(low2highgid(gid)); 127 long ret = sys_setfsgid(low2highgid(gid));
128 /* avoid REGPARM breakage on x86: */
129 prevent_tail_call(ret);
130 return ret;
98} 131}
99 132
100 static int groups16_to_user(old_gid_t __user *grouplist, 133 static int groups16_to_user(old_gid_t __user *grouplist,
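All of the sys_*16() wrappers above get the same treatment: the return value of the underlying syscall is captured, passed through prevent_tail_call(), and only then returned, per the in-line comment about REGPARM breakage on x86. The point is to stop gcc from compiling the wrapper's call as a tail call; for asmlinkage functions the incoming argument slots are the saved user registers on the kernel stack, and a tail call lets the compiler reuse those slots, which can clobber values restored to user space on syscall exit. The sketch below shows the shape of the pattern in plain userspace C; the macro here is an illustrative stand-in built from an empty asm, not the kernel's actual prevent_tail_call() definition.

#include <stdio.h>

/* Illustrative stand-in: routing `ret` through an asm the optimizer
 * cannot see into forces code to run after the call returns, so the
 * call cannot be emitted as a tail call (a bare jump). */
#define prevent_tail_call(ret) __asm__ __volatile__("" : "=r" (ret) : "0" (ret))

static long do_lowlevel(long x)      /* stands in for the real sys_*() call */
{
	return x + 1;
}

long wrapper16(long x)               /* shaped like the sys_*16() wrappers */
{
	long ret = do_lowlevel(x);
	/* avoid turning the call above into a tail call: */
	prevent_tail_call(ret);
	return ret;
}

int main(void)
{
	printf("%ld\n", wrapper16(41));
	return 0;
}
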
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e9e464a903..880fb415a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -547,7 +547,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
547} 547}
548 548
549/* We're holding the cpucontrol mutex here */ 549/* We're holding the cpucontrol mutex here */
550 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 550 static int workqueue_cpu_callback(struct notifier_block *nfb,
551 unsigned long action, 551 unsigned long action,
552 void *hcpu) 552 void *hcpu)
553{ 553{
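This workqueue hunk and the timer hunks earlier make the same change: the __devinit/__devinitdata annotations come off the CPU-hotplug notifier callback and its notifier_block, since those have to stay valid long after boot and must not live in sections that some configurations discard once initialization is done. For readers following along without the kernel headers, the sketch below is a simplified userspace analogue of the callback-plus-notifier_block shape being edited here; it is an illustration of the shape only, not the kernel's notifier API.

#include <stdio.h>

#define NOTIFY_OK 0x0001

/* Minimal stand-in for the kernel's struct notifier_block: a callback
 * plus a link so several blocks can be chained together. */
struct notifier_block {
	int (*notifier_call)(struct notifier_block *self,
			     unsigned long action, void *data);
	struct notifier_block *next;
};

/* Shaped like timer_cpu_notify()/workqueue_cpu_callback() above. */
static int demo_cpu_notify(struct notifier_block *self,
			   unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	printf("notifier: action %lu for cpu %ld\n", action, cpu);
	return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_cpu_notify,
};

/* Walk a chain of blocks, handing each one the action and data. */
static void call_chain(struct notifier_block *nb,
		       unsigned long action, void *data)
{
	for (; nb; nb = nb->next)
		nb->notifier_call(nb, action, data);
}

int main(void)
{
	call_chain(&demo_nb, 1UL /* e.g. a "CPU coming up" action */, (void *)2L);
	return 0;
}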