Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/audit.c | 249
-rw-r--r--  kernel/audit.h | 13
-rw-r--r--  kernel/auditfilter.c | 55
-rw-r--r--  kernel/auditsc.c | 40
-rw-r--r--  kernel/bounds.c | 6
-rw-r--r--  kernel/cgroup.c | 333
-rw-r--r--  kernel/cgroup_debug.c | 20
-rw-r--r--  kernel/compat.c | 6
-rw-r--r--  kernel/configs.c | 7
-rw-r--r--  kernel/cpu.c | 44
-rw-r--r--  kernel/cpuset.c | 355
-rw-r--r--  kernel/dma.c | 7
-rw-r--r--  kernel/exit.c | 150
-rw-r--r--  kernel/fork.c | 51
-rw-r--r--  kernel/futex.c | 17
-rw-r--r--  kernel/hrtimer.c | 177
-rw-r--r--  kernel/irq/devres.c | 1
-rw-r--r--  kernel/irq/manage.c | 50
-rw-r--r--  kernel/irq/spurious.c | 4
-rw-r--r--  kernel/kallsyms.c | 6
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 1
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 9
-rw-r--r--  kernel/lockdep_proc.c | 16
-rw-r--r--  kernel/marker.c | 3
-rw-r--r--  kernel/module.c | 319
-rw-r--r--  kernel/notifier.c | 38
-rw-r--r--  kernel/ns_cgroup.c | 2
-rw-r--r--  kernel/nsproxy.c | 12
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/pid.c | 41
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/posix-cpu-timers.c | 11
-rw-r--r--  kernel/posix-timers.c | 6
-rw-r--r--  kernel/power/Kconfig | 10
-rw-r--r--  kernel/power/Makefile | 1
-rw-r--r--  kernel/power/pm.c | 205
-rw-r--r--  kernel/printk.c | 122
-rw-r--r--  kernel/profile.c | 4
-rw-r--r--  kernel/ptrace.c | 15
-rw-r--r--  kernel/rcutorture.c | 1
-rw-r--r--  kernel/relay.c | 37
-rw-r--r--  kernel/res_counter.c | 10
-rw-r--r--  kernel/resource.c | 10
-rw-r--r--  kernel/sched.c | 68
-rw-r--r--  kernel/sched_debug.c | 9
-rw-r--r--  kernel/signal.c | 646
-rw-r--r--  kernel/softirq.c | 20
-rw-r--r--  kernel/sys.c | 110
-rw-r--r--  kernel/sysctl.c | 176
-rw-r--r--  kernel/taskstats.c | 6
-rw-r--r--  kernel/time.c | 55
-rw-r--r--  kernel/time/ntp.c | 398
-rw-r--r--  kernel/time/timekeeping.c | 17
-rw-r--r--  kernel/time/timer_list.c | 5
-rw-r--r--  kernel/time/timer_stats.c | 5
-rw-r--r--  kernel/timer.c | 153
-rw-r--r--  kernel/user.c | 27
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/utsname.c | 1
-rw-r--r--  kernel/workqueue.c | 32
63 files changed, 2323 insertions, 1889 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c5f081132a4..188c43223f52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o 12 notifier.o ksysfs.o pm_qos_params.o
13 13
14obj-$(CONFIG_SYSCTL) += sysctl_check.o 14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
15obj-$(CONFIG_STACKTRACE) += stacktrace.o 15obj-$(CONFIG_STACKTRACE) += stacktrace.o
16obj-y += time/ 16obj-y += time/
17obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 17obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index a7b16086d36f..b7d3709cc452 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,6 +126,8 @@ static int audit_freelist_count;
126static LIST_HEAD(audit_freelist); 126static LIST_HEAD(audit_freelist);
127 127
128static struct sk_buff_head audit_skb_queue; 128static struct sk_buff_head audit_skb_queue;
129/* queue of skbs to send to auditd when/if it comes back */
130static struct sk_buff_head audit_skb_hold_queue;
129static struct task_struct *kauditd_task; 131static struct task_struct *kauditd_task;
130static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); 132static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
131static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
@@ -154,6 +156,11 @@ struct audit_buffer {
154 gfp_t gfp_mask; 156 gfp_t gfp_mask;
155}; 157};
156 158
159struct audit_reply {
160 int pid;
161 struct sk_buff *skb;
162};
163
157static void audit_set_pid(struct audit_buffer *ab, pid_t pid) 164static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
158{ 165{
159 if (ab) { 166 if (ab) {
@@ -252,14 +259,15 @@ void audit_log_lost(const char *message)
252} 259}
253 260
254static int audit_log_config_change(char *function_name, int new, int old, 261static int audit_log_config_change(char *function_name, int new, int old,
255 uid_t loginuid, u32 sid, int allow_changes) 262 uid_t loginuid, u32 sessionid, u32 sid,
263 int allow_changes)
256{ 264{
257 struct audit_buffer *ab; 265 struct audit_buffer *ab;
258 int rc = 0; 266 int rc = 0;
259 267
260 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 268 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
261 audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new, 269 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
262 old, loginuid); 270 old, loginuid, sessionid);
263 if (sid) { 271 if (sid) {
264 char *ctx = NULL; 272 char *ctx = NULL;
265 u32 len; 273 u32 len;
@@ -279,7 +287,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
279} 287}
280 288
281static int audit_do_config_change(char *function_name, int *to_change, 289static int audit_do_config_change(char *function_name, int *to_change,
282 int new, uid_t loginuid, u32 sid) 290 int new, uid_t loginuid, u32 sessionid,
291 u32 sid)
283{ 292{
284 int allow_changes, rc = 0, old = *to_change; 293 int allow_changes, rc = 0, old = *to_change;
285 294
@@ -290,8 +299,8 @@ static int audit_do_config_change(char *function_name, int *to_change,
290 allow_changes = 1; 299 allow_changes = 1;
291 300
292 if (audit_enabled != AUDIT_OFF) { 301 if (audit_enabled != AUDIT_OFF) {
293 rc = audit_log_config_change(function_name, new, old, 302 rc = audit_log_config_change(function_name, new, old, loginuid,
294 loginuid, sid, allow_changes); 303 sessionid, sid, allow_changes);
295 if (rc) 304 if (rc)
296 allow_changes = 0; 305 allow_changes = 0;
297 } 306 }
@@ -305,26 +314,28 @@ static int audit_do_config_change(char *function_name, int *to_change,
305 return rc; 314 return rc;
306} 315}
307 316
308static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) 317static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
318 u32 sid)
309{ 319{
310 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, 320 return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
311 limit, loginuid, sid); 321 limit, loginuid, sessionid, sid);
312} 322}
313 323
314static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) 324static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
325 u32 sid)
315{ 326{
316 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, 327 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
317 limit, loginuid, sid); 328 limit, loginuid, sessionid, sid);
318} 329}
319 330
320static int audit_set_enabled(int state, uid_t loginuid, u32 sid) 331static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
321{ 332{
322 int rc; 333 int rc;
323 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 334 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
324 return -EINVAL; 335 return -EINVAL;
325 336
326 rc = audit_do_config_change("audit_enabled", &audit_enabled, state, 337 rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
327 loginuid, sid); 338 loginuid, sessionid, sid);
328 339
329 if (!rc) 340 if (!rc)
330 audit_ever_enabled |= !!state; 341 audit_ever_enabled |= !!state;
@@ -332,7 +343,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
332 return rc; 343 return rc;
333} 344}
334 345
335static int audit_set_failure(int state, uid_t loginuid, u32 sid) 346static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
336{ 347{
337 if (state != AUDIT_FAIL_SILENT 348 if (state != AUDIT_FAIL_SILENT
338 && state != AUDIT_FAIL_PRINTK 349 && state != AUDIT_FAIL_PRINTK
@@ -340,7 +351,43 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
340 return -EINVAL; 351 return -EINVAL;
341 352
342 return audit_do_config_change("audit_failure", &audit_failure, state, 353 return audit_do_config_change("audit_failure", &audit_failure, state,
343 loginuid, sid); 354 loginuid, sessionid, sid);
355}
356
357/*
358 * Queue skbs to be sent to auditd when/if it comes back. These skbs should
359 * already have been sent via printk/syslog and so if these messages are dropped
360 * it is not a huge concern since we already passed the audit_log_lost()
361 * notification and stuff. This is just nice to get audit messages during
362 * boot before auditd is running or messages generated while auditd is stopped.
363 * This only holds messages if audit_default is set, aka booting with audit=1
364 * or building your kernel that way.
365 */
366static void audit_hold_skb(struct sk_buff *skb)
367{
368 if (audit_default &&
369 skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
370 skb_queue_tail(&audit_skb_hold_queue, skb);
371 else
372 kfree_skb(skb);
373}
374
375static void kauditd_send_skb(struct sk_buff *skb)
376{
377 int err;
378 /* take a reference in case we can't send it and we want to hold it */
379 skb_get(skb);
380 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
381 if (err < 0) {
382 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
383 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
384 audit_log_lost("auditd disappeared\n");
385 audit_pid = 0;
386 /* we might get lucky and get this in the next auditd */
387 audit_hold_skb(skb);
388 } else
389 /* drop the extra reference if sent ok */
390 kfree_skb(skb);
344} 391}
345 392
346static int kauditd_thread(void *dummy) 393static int kauditd_thread(void *dummy)
@@ -349,24 +396,41 @@ static int kauditd_thread(void *dummy)
349 396
350 set_freezable(); 397 set_freezable();
351 while (!kthread_should_stop()) { 398 while (!kthread_should_stop()) {
399 /*
400 * if auditd just started drain the queue of messages already
401 * sent to syslog/printk. remember loss here is ok. we already
402 * called audit_log_lost() if it didn't go out normally. so the
403 * race between the skb_dequeue and the next check for audit_pid
404 * doesn't matter.
405 *
406 * if you ever find kauditd to be too slow we can get a perf win
407 * by doing our own locking and keeping better track if there
408 * are messages in this queue. I don't see the need now, but
409 * in 5 years when I want to play with this again I'll see this
410 * note and still have no friggin idea what i'm thinking today.
411 */
412 if (audit_default && audit_pid) {
413 skb = skb_dequeue(&audit_skb_hold_queue);
414 if (unlikely(skb)) {
415 while (skb && audit_pid) {
416 kauditd_send_skb(skb);
417 skb = skb_dequeue(&audit_skb_hold_queue);
418 }
419 }
420 }
421
352 skb = skb_dequeue(&audit_skb_queue); 422 skb = skb_dequeue(&audit_skb_queue);
353 wake_up(&audit_backlog_wait); 423 wake_up(&audit_backlog_wait);
354 if (skb) { 424 if (skb) {
355 if (audit_pid) { 425 if (audit_pid)
356 int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 426 kauditd_send_skb(skb);
357 if (err < 0) { 427 else {
358 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
359 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
360 audit_log_lost("auditd dissapeared\n");
361 audit_pid = 0;
362 }
363 } else {
364 if (printk_ratelimit()) 428 if (printk_ratelimit())
365 printk(KERN_NOTICE "%s\n", skb->data + 429 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
366 NLMSG_SPACE(0));
367 else 430 else
368 audit_log_lost("printk limit exceeded\n"); 431 audit_log_lost("printk limit exceeded\n");
369 kfree_skb(skb); 432
433 audit_hold_skb(skb);
370 } 434 }
371 } else { 435 } else {
372 DECLARE_WAITQUEUE(wait, current); 436 DECLARE_WAITQUEUE(wait, current);
@@ -385,13 +449,13 @@ static int kauditd_thread(void *dummy)
385 return 0; 449 return 0;
386} 450}
387 451
388static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) 452static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
389{ 453{
390 struct task_struct *tsk; 454 struct task_struct *tsk;
391 int err; 455 int err;
392 456
393 read_lock(&tasklist_lock); 457 read_lock(&tasklist_lock);
394 tsk = find_task_by_pid(pid); 458 tsk = find_task_by_vpid(pid);
395 err = -ESRCH; 459 err = -ESRCH;
396 if (!tsk) 460 if (!tsk)
397 goto out; 461 goto out;
@@ -404,7 +468,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
404 if (err) 468 if (err)
405 goto out; 469 goto out;
406 470
407 tty_audit_push_task(tsk, loginuid); 471 tty_audit_push_task(tsk, loginuid, sessionid);
408out: 472out:
409 read_unlock(&tasklist_lock); 473 read_unlock(&tasklist_lock);
410 return err; 474 return err;
@@ -469,6 +533,19 @@ nlmsg_failure: /* Used by NLMSG_PUT */
469 return NULL; 533 return NULL;
470} 534}
471 535
536static int audit_send_reply_thread(void *arg)
537{
538 struct audit_reply *reply = (struct audit_reply *)arg;
539
540 mutex_lock(&audit_cmd_mutex);
541 mutex_unlock(&audit_cmd_mutex);
542
543 /* Ignore failure. It'll only happen if the sender goes away,
544 because our timeout is set to infinite. */
545 netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
546 kfree(reply);
547 return 0;
548}
472/** 549/**
473 * audit_send_reply - send an audit reply message via netlink 550 * audit_send_reply - send an audit reply message via netlink
474 * @pid: process id to send reply to 551 * @pid: process id to send reply to
@@ -485,14 +562,26 @@ nlmsg_failure: /* Used by NLMSG_PUT */
485void audit_send_reply(int pid, int seq, int type, int done, int multi, 562void audit_send_reply(int pid, int seq, int type, int done, int multi,
486 void *payload, int size) 563 void *payload, int size)
487{ 564{
488 struct sk_buff *skb; 565 struct sk_buff *skb;
566 struct task_struct *tsk;
567 struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
568 GFP_KERNEL);
569
570 if (!reply)
571 return;
572
489 skb = audit_make_reply(pid, seq, type, done, multi, payload, size); 573 skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
490 if (!skb) 574 if (!skb)
491 return; 575 return;
492 /* Ignore failure. It'll only happen if the sender goes away, 576
493 because our timeout is set to infinite. */ 577 reply->pid = pid;
494 netlink_unicast(audit_sock, skb, pid, 0); 578 reply->skb = skb;
495 return; 579
580 tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
581 if (IS_ERR(tsk)) {
582 kfree(reply);
583 kfree_skb(skb);
584 }
496} 585}
497 586
498/* 587/*
@@ -534,7 +623,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
534} 623}
535 624
536static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, 625static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
537 u32 pid, u32 uid, uid_t auid, u32 sid) 626 u32 pid, u32 uid, uid_t auid, u32 ses,
627 u32 sid)
538{ 628{
539 int rc = 0; 629 int rc = 0;
540 char *ctx = NULL; 630 char *ctx = NULL;
@@ -546,8 +636,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
546 } 636 }
547 637
548 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 638 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
549 audit_log_format(*ab, "user pid=%d uid=%u auid=%u", 639 audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
550 pid, uid, auid); 640 pid, uid, auid, ses);
551 if (sid) { 641 if (sid) {
552 rc = security_secid_to_secctx(sid, &ctx, &len); 642 rc = security_secid_to_secctx(sid, &ctx, &len);
553 if (rc) 643 if (rc)
@@ -570,6 +660,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
570 struct audit_buffer *ab; 660 struct audit_buffer *ab;
571 u16 msg_type = nlh->nlmsg_type; 661 u16 msg_type = nlh->nlmsg_type;
572 uid_t loginuid; /* loginuid of sender */ 662 uid_t loginuid; /* loginuid of sender */
663 u32 sessionid;
573 struct audit_sig_info *sig_data; 664 struct audit_sig_info *sig_data;
574 char *ctx = NULL; 665 char *ctx = NULL;
575 u32 len; 666 u32 len;
@@ -591,6 +682,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
591 pid = NETLINK_CREDS(skb)->pid; 682 pid = NETLINK_CREDS(skb)->pid;
592 uid = NETLINK_CREDS(skb)->uid; 683 uid = NETLINK_CREDS(skb)->uid;
593 loginuid = NETLINK_CB(skb).loginuid; 684 loginuid = NETLINK_CB(skb).loginuid;
685 sessionid = NETLINK_CB(skb).sessionid;
594 sid = NETLINK_CB(skb).sid; 686 sid = NETLINK_CB(skb).sid;
595 seq = nlh->nlmsg_seq; 687 seq = nlh->nlmsg_seq;
596 data = NLMSG_DATA(nlh); 688 data = NLMSG_DATA(nlh);
@@ -613,12 +705,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
613 status_get = (struct audit_status *)data; 705 status_get = (struct audit_status *)data;
614 if (status_get->mask & AUDIT_STATUS_ENABLED) { 706 if (status_get->mask & AUDIT_STATUS_ENABLED) {
615 err = audit_set_enabled(status_get->enabled, 707 err = audit_set_enabled(status_get->enabled,
616 loginuid, sid); 708 loginuid, sessionid, sid);
617 if (err < 0) return err; 709 if (err < 0) return err;
618 } 710 }
619 if (status_get->mask & AUDIT_STATUS_FAILURE) { 711 if (status_get->mask & AUDIT_STATUS_FAILURE) {
620 err = audit_set_failure(status_get->failure, 712 err = audit_set_failure(status_get->failure,
621 loginuid, sid); 713 loginuid, sessionid, sid);
622 if (err < 0) return err; 714 if (err < 0) return err;
623 } 715 }
624 if (status_get->mask & AUDIT_STATUS_PID) { 716 if (status_get->mask & AUDIT_STATUS_PID) {
@@ -627,17 +719,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
627 if (audit_enabled != AUDIT_OFF) 719 if (audit_enabled != AUDIT_OFF)
628 audit_log_config_change("audit_pid", new_pid, 720 audit_log_config_change("audit_pid", new_pid,
629 audit_pid, loginuid, 721 audit_pid, loginuid,
630 sid, 1); 722 sessionid, sid, 1);
631 723
632 audit_pid = new_pid; 724 audit_pid = new_pid;
633 audit_nlk_pid = NETLINK_CB(skb).pid; 725 audit_nlk_pid = NETLINK_CB(skb).pid;
634 } 726 }
635 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 727 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
636 err = audit_set_rate_limit(status_get->rate_limit, 728 err = audit_set_rate_limit(status_get->rate_limit,
637 loginuid, sid); 729 loginuid, sessionid, sid);
638 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 730 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
639 err = audit_set_backlog_limit(status_get->backlog_limit, 731 err = audit_set_backlog_limit(status_get->backlog_limit,
640 loginuid, sid); 732 loginuid, sessionid, sid);
641 break; 733 break;
642 case AUDIT_USER: 734 case AUDIT_USER:
643 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 735 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -649,12 +741,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
649 if (err == 1) { 741 if (err == 1) {
650 err = 0; 742 err = 0;
651 if (msg_type == AUDIT_USER_TTY) { 743 if (msg_type == AUDIT_USER_TTY) {
652 err = audit_prepare_user_tty(pid, loginuid); 744 err = audit_prepare_user_tty(pid, loginuid,
745 sessionid);
653 if (err) 746 if (err)
654 break; 747 break;
655 } 748 }
656 audit_log_common_recv_msg(&ab, msg_type, pid, uid, 749 audit_log_common_recv_msg(&ab, msg_type, pid, uid,
657 loginuid, sid); 750 loginuid, sessionid, sid);
658 751
659 if (msg_type != AUDIT_USER_TTY) 752 if (msg_type != AUDIT_USER_TTY)
660 audit_log_format(ab, " msg='%.1024s'", 753 audit_log_format(ab, " msg='%.1024s'",
@@ -664,8 +757,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
664 757
665 audit_log_format(ab, " msg="); 758 audit_log_format(ab, " msg=");
666 size = nlmsg_len(nlh); 759 size = nlmsg_len(nlh);
667 audit_log_n_untrustedstring(ab, size, 760 audit_log_n_untrustedstring(ab, data, size);
668 data);
669 } 761 }
670 audit_set_pid(ab, pid); 762 audit_set_pid(ab, pid);
671 audit_log_end(ab); 763 audit_log_end(ab);
@@ -677,7 +769,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
677 return -EINVAL; 769 return -EINVAL;
678 if (audit_enabled == AUDIT_LOCKED) { 770 if (audit_enabled == AUDIT_LOCKED) {
679 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 771 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
680 uid, loginuid, sid); 772 uid, loginuid, sessionid, sid);
681 773
682 audit_log_format(ab, " audit_enabled=%d res=0", 774 audit_log_format(ab, " audit_enabled=%d res=0",
683 audit_enabled); 775 audit_enabled);
@@ -688,7 +780,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
688 case AUDIT_LIST: 780 case AUDIT_LIST:
689 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 781 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
690 uid, seq, data, nlmsg_len(nlh), 782 uid, seq, data, nlmsg_len(nlh),
691 loginuid, sid); 783 loginuid, sessionid, sid);
692 break; 784 break;
693 case AUDIT_ADD_RULE: 785 case AUDIT_ADD_RULE:
694 case AUDIT_DEL_RULE: 786 case AUDIT_DEL_RULE:
@@ -696,7 +788,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
696 return -EINVAL; 788 return -EINVAL;
697 if (audit_enabled == AUDIT_LOCKED) { 789 if (audit_enabled == AUDIT_LOCKED) {
698 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 790 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
699 uid, loginuid, sid); 791 uid, loginuid, sessionid, sid);
700 792
701 audit_log_format(ab, " audit_enabled=%d res=0", 793 audit_log_format(ab, " audit_enabled=%d res=0",
702 audit_enabled); 794 audit_enabled);
@@ -707,13 +799,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
707 case AUDIT_LIST_RULES: 799 case AUDIT_LIST_RULES:
708 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 800 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
709 uid, seq, data, nlmsg_len(nlh), 801 uid, seq, data, nlmsg_len(nlh),
710 loginuid, sid); 802 loginuid, sessionid, sid);
711 break; 803 break;
712 case AUDIT_TRIM: 804 case AUDIT_TRIM:
713 audit_trim_trees(); 805 audit_trim_trees();
714 806
715 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 807 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
716 uid, loginuid, sid); 808 uid, loginuid, sessionid, sid);
717 809
718 audit_log_format(ab, " op=trim res=1"); 810 audit_log_format(ab, " op=trim res=1");
719 audit_log_end(ab); 811 audit_log_end(ab);
@@ -721,21 +813,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
721 case AUDIT_MAKE_EQUIV: { 813 case AUDIT_MAKE_EQUIV: {
722 void *bufp = data; 814 void *bufp = data;
723 u32 sizes[2]; 815 u32 sizes[2];
724 size_t len = nlmsg_len(nlh); 816 size_t msglen = nlmsg_len(nlh);
725 char *old, *new; 817 char *old, *new;
726 818
727 err = -EINVAL; 819 err = -EINVAL;
728 if (len < 2 * sizeof(u32)) 820 if (msglen < 2 * sizeof(u32))
729 break; 821 break;
730 memcpy(sizes, bufp, 2 * sizeof(u32)); 822 memcpy(sizes, bufp, 2 * sizeof(u32));
731 bufp += 2 * sizeof(u32); 823 bufp += 2 * sizeof(u32);
732 len -= 2 * sizeof(u32); 824 msglen -= 2 * sizeof(u32);
733 old = audit_unpack_string(&bufp, &len, sizes[0]); 825 old = audit_unpack_string(&bufp, &msglen, sizes[0]);
734 if (IS_ERR(old)) { 826 if (IS_ERR(old)) {
735 err = PTR_ERR(old); 827 err = PTR_ERR(old);
736 break; 828 break;
737 } 829 }
738 new = audit_unpack_string(&bufp, &len, sizes[1]); 830 new = audit_unpack_string(&bufp, &msglen, sizes[1]);
739 if (IS_ERR(new)) { 831 if (IS_ERR(new)) {
740 err = PTR_ERR(new); 832 err = PTR_ERR(new);
741 kfree(old); 833 kfree(old);
@@ -745,7 +837,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
745 err = audit_tag_tree(old, new); 837 err = audit_tag_tree(old, new);
746 838
747 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 839 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
748 uid, loginuid, sid); 840 uid, loginuid, sessionid, sid);
749 841
750 audit_log_format(ab, " op=make_equiv old="); 842 audit_log_format(ab, " op=make_equiv old=");
751 audit_log_untrustedstring(ab, old); 843 audit_log_untrustedstring(ab, old);
@@ -779,7 +871,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
779 struct task_struct *tsk; 871 struct task_struct *tsk;
780 872
781 read_lock(&tasklist_lock); 873 read_lock(&tasklist_lock);
782 tsk = find_task_by_pid(pid); 874 tsk = find_task_by_vpid(pid);
783 if (!tsk) 875 if (!tsk)
784 err = -ESRCH; 876 err = -ESRCH;
785 else { 877 else {
@@ -802,7 +894,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
802 if (s->enabled != 0 && s->enabled != 1) 894 if (s->enabled != 0 && s->enabled != 1)
803 return -EINVAL; 895 return -EINVAL;
804 read_lock(&tasklist_lock); 896 read_lock(&tasklist_lock);
805 tsk = find_task_by_pid(pid); 897 tsk = find_task_by_vpid(pid);
806 if (!tsk) 898 if (!tsk)
807 err = -ESRCH; 899 err = -ESRCH;
808 else { 900 else {
@@ -877,6 +969,7 @@ static int __init audit_init(void)
877 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 969 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
878 970
879 skb_queue_head_init(&audit_skb_queue); 971 skb_queue_head_init(&audit_skb_queue);
972 skb_queue_head_init(&audit_skb_hold_queue);
880 audit_initialized = 1; 973 audit_initialized = 1;
881 audit_enabled = audit_default; 974 audit_enabled = audit_default;
882 audit_ever_enabled |= !!audit_default; 975 audit_ever_enabled |= !!audit_default;
@@ -1199,7 +1292,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
1199 * This function will take the passed buf and convert it into a string of 1292 * This function will take the passed buf and convert it into a string of
1200 * ascii hex digits. The new string is placed onto the skb. 1293 * ascii hex digits. The new string is placed onto the skb.
1201 */ 1294 */
1202void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, 1295void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
1203 size_t len) 1296 size_t len)
1204{ 1297{
1205 int i, avail, new_len; 1298 int i, avail, new_len;
@@ -1235,8 +1328,8 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
1235 * Format a string of no more than slen characters into the audit buffer, 1328 * Format a string of no more than slen characters into the audit buffer,
1236 * enclosed in quote marks. 1329 * enclosed in quote marks.
1237 */ 1330 */
1238static void audit_log_n_string(struct audit_buffer *ab, size_t slen, 1331void audit_log_n_string(struct audit_buffer *ab, const char *string,
1239 const char *string) 1332 size_t slen)
1240{ 1333{
1241 int avail, new_len; 1334 int avail, new_len;
1242 unsigned char *ptr; 1335 unsigned char *ptr;
@@ -1292,13 +1385,13 @@ int audit_string_contains_control(const char *string, size_t len)
1292 * The caller specifies the number of characters in the string to log, which may 1385 * The caller specifies the number of characters in the string to log, which may
1293 * or may not be the entire string. 1386 * or may not be the entire string.
1294 */ 1387 */
1295void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, 1388void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
1296 const char *string) 1389 size_t len)
1297{ 1390{
1298 if (audit_string_contains_control(string, len)) 1391 if (audit_string_contains_control(string, len))
1299 audit_log_hex(ab, string, len); 1392 audit_log_n_hex(ab, string, len);
1300 else 1393 else
1301 audit_log_n_string(ab, len, string); 1394 audit_log_n_string(ab, string, len);
1302} 1395}
1303 1396
1304/** 1397/**
@@ -1311,7 +1404,7 @@ void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1311 */ 1404 */
1312void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1405void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
1313{ 1406{
1314 audit_log_n_untrustedstring(ab, strlen(string), string); 1407 audit_log_n_untrustedstring(ab, string, strlen(string));
1315} 1408}
1316 1409
1317/* This is a helper-function to print the escaped d_path */ 1410/* This is a helper-function to print the escaped d_path */
@@ -1355,19 +1448,23 @@ void audit_log_end(struct audit_buffer *ab)
1355 audit_log_lost("rate limit exceeded"); 1448 audit_log_lost("rate limit exceeded");
1356 } else { 1449 } else {
1357 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1450 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1451 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
1452
1358 if (audit_pid) { 1453 if (audit_pid) {
1359 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
1360 skb_queue_tail(&audit_skb_queue, ab->skb); 1454 skb_queue_tail(&audit_skb_queue, ab->skb);
1361 ab->skb = NULL;
1362 wake_up_interruptible(&kauditd_wait); 1455 wake_up_interruptible(&kauditd_wait);
1363 } else if (nlh->nlmsg_type != AUDIT_EOE) { 1456 } else {
1364 if (printk_ratelimit()) { 1457 if (nlh->nlmsg_type != AUDIT_EOE) {
1365 printk(KERN_NOTICE "type=%d %s\n", 1458 if (printk_ratelimit()) {
1366 nlh->nlmsg_type, 1459 printk(KERN_NOTICE "type=%d %s\n",
1367 ab->skb->data + NLMSG_SPACE(0)); 1460 nlh->nlmsg_type,
1368 } else 1461 ab->skb->data + NLMSG_SPACE(0));
1369 audit_log_lost("printk limit exceeded\n"); 1462 } else
1463 audit_log_lost("printk limit exceeded\n");
1464 }
1465 audit_hold_skb(ab->skb);
1370 } 1466 }
1467 ab->skb = NULL;
1371 } 1468 }
1372 audit_buffer_free(ab); 1469 audit_buffer_free(ab);
1373} 1470}
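Note on the string-logging hunks above: besides threading the session id through, the audit.c changes settle the helpers on a (buffer, length) argument order and rename audit_log_hex() to audit_log_n_hex(). The policy those helpers implement is: log a string quoted if it is plain printable ASCII, otherwise hex-encode it. A minimal standalone sketch of that policy (userspace C, illustrative names only, not kernel code):

#include <stdio.h>
#include <string.h>

/* Roughly the test audit_string_contains_control() applies: a double quote
 * or anything outside printable ASCII 0x21..0x7e marks the string untrusted. */
static int contains_control(const unsigned char *s, size_t len)
{
        size_t i;
        for (i = 0; i < len; i++)
                if (s[i] == '"' || s[i] < 0x21 || s[i] > 0x7e)
                        return 1;
        return 0;
}

/* Quote clean strings, hex-encode suspicious ones: the behaviour behind
 * audit_log_n_untrustedstring(ab, string, len) after this patch. */
static void log_untrusted(const char *s, size_t len)
{
        size_t i;
        if (contains_control((const unsigned char *)s, len)) {
                for (i = 0; i < len; i++)
                        printf("%02X", (unsigned char)s[i]);  /* hex, no quotes */
        } else {
                printf("\"%.*s\"", (int)len, s);              /* quoted verbatim */
        }
        putchar('\n');
}

int main(void)
{
        log_untrusted("passwd", strlen("passwd"));  /* prints "passwd" */
        log_untrusted("a b", 3);                    /* space forces hex: 612062 */
        return 0;
}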
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cfc54ee3e1f..9d6717412fec 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,6 +74,11 @@ struct audit_entry {
74 struct audit_krule rule; 74 struct audit_krule rule;
75}; 75};
76 76
77#ifdef CONFIG_AUDIT
78extern int audit_enabled;
79extern int audit_ever_enabled;
80#endif
81
77extern int audit_pid; 82extern int audit_pid;
78 83
79#define AUDIT_INODE_BUCKETS 32 84#define AUDIT_INODE_BUCKETS 32
@@ -104,6 +109,9 @@ struct audit_netlink_list {
104int audit_send_list(void *); 109int audit_send_list(void *);
105 110
106struct inotify_watch; 111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
107extern void audit_free_parent(struct inotify_watch *); 115extern void audit_free_parent(struct inotify_watch *);
108extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, 116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
109 const char *, struct inode *); 117 const char *, struct inode *);
@@ -111,6 +119,7 @@ extern int selinux_audit_rule_update(void);
111 119
112extern struct mutex audit_filter_mutex; 120extern struct mutex audit_filter_mutex;
113extern void audit_free_rule_rcu(struct rcu_head *); 121extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[];
114 123
115#ifdef CONFIG_AUDIT_TREE 124#ifdef CONFIG_AUDIT_TREE
116extern struct audit_chunk *audit_tree_lookup(const struct inode *); 125extern struct audit_chunk *audit_tree_lookup(const struct inode *);
@@ -137,6 +146,10 @@ extern void audit_put_tree(struct audit_tree *);
137 146
138extern char *audit_unpack_string(void **, size_t *, size_t); 147extern char *audit_unpack_string(void **, size_t *, size_t);
139 148
149extern pid_t audit_sig_pid;
150extern uid_t audit_sig_uid;
151extern u32 audit_sig_sid;
152
140#ifdef CONFIG_AUDITSYSCALL 153#ifdef CONFIG_AUDITSYSCALL
141extern int __audit_signal_info(int sig, struct task_struct *t); 154extern int __audit_signal_info(int sig, struct task_struct *t);
142static inline int audit_signal_info(int sig, struct task_struct *t) 155static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 28fef6bf8534..0e0bd27e6512 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -89,14 +89,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
89 89
90DEFINE_MUTEX(audit_filter_mutex); 90DEFINE_MUTEX(audit_filter_mutex);
91 91
92/* Inotify handle */
93extern struct inotify_handle *audit_ih;
94
95/* Inotify events we care about. */ 92/* Inotify events we care about. */
96#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 93#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
97 94
98extern int audit_enabled;
99
100void audit_free_parent(struct inotify_watch *i_watch) 95void audit_free_parent(struct inotify_watch *i_watch)
101{ 96{
102 struct audit_parent *parent; 97 struct audit_parent *parent;
@@ -272,7 +267,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
272 return -EINVAL; 267 return -EINVAL;
273 268
274 watch = audit_init_watch(path); 269 watch = audit_init_watch(path);
275 if (unlikely(IS_ERR(watch))) 270 if (IS_ERR(watch))
276 return PTR_ERR(watch); 271 return PTR_ERR(watch);
277 272
278 audit_get_watch(watch); 273 audit_get_watch(watch);
@@ -422,7 +417,7 @@ exit_err:
422static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 417static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
423{ 418{
424 struct audit_entry *entry; 419 struct audit_entry *entry;
425 struct audit_field *f; 420 struct audit_field *ino_f;
426 int err = 0; 421 int err = 0;
427 int i; 422 int i;
428 423
@@ -483,6 +478,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
483 if (f->val & ~15) 478 if (f->val & ~15)
484 goto exit_free; 479 goto exit_free;
485 break; 480 break;
481 case AUDIT_FILETYPE:
482 if ((f->val & ~S_IFMT) > S_IFMT)
483 goto exit_free;
484 break;
486 case AUDIT_INODE: 485 case AUDIT_INODE:
487 err = audit_to_inode(&entry->rule, f); 486 err = audit_to_inode(&entry->rule, f);
488 if (err) 487 if (err)
@@ -504,9 +503,9 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
504 } 503 }
505 } 504 }
506 505
507 f = entry->rule.inode_f; 506 ino_f = entry->rule.inode_f;
508 if (f) { 507 if (ino_f) {
509 switch(f->op) { 508 switch(ino_f->op) {
510 case AUDIT_NOT_EQUAL: 509 case AUDIT_NOT_EQUAL:
511 entry->rule.inode_f = NULL; 510 entry->rule.inode_f = NULL;
512 case AUDIT_EQUAL: 511 case AUDIT_EQUAL:
@@ -531,7 +530,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
531{ 530{
532 int err = 0; 531 int err = 0;
533 struct audit_entry *entry; 532 struct audit_entry *entry;
534 struct audit_field *f; 533 struct audit_field *ino_f;
535 void *bufp; 534 void *bufp;
536 size_t remain = datasz - sizeof(struct audit_rule_data); 535 size_t remain = datasz - sizeof(struct audit_rule_data);
537 int i; 536 int i;
@@ -654,14 +653,18 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
654 if (f->val & ~15) 653 if (f->val & ~15)
655 goto exit_free; 654 goto exit_free;
656 break; 655 break;
656 case AUDIT_FILETYPE:
657 if ((f->val & ~S_IFMT) > S_IFMT)
658 goto exit_free;
659 break;
657 default: 660 default:
658 goto exit_free; 661 goto exit_free;
659 } 662 }
660 } 663 }
661 664
662 f = entry->rule.inode_f; 665 ino_f = entry->rule.inode_f;
663 if (f) { 666 if (ino_f) {
664 switch(f->op) { 667 switch(ino_f->op) {
665 case AUDIT_NOT_EQUAL: 668 case AUDIT_NOT_EQUAL:
666 entry->rule.inode_f = NULL; 669 entry->rule.inode_f = NULL;
667 case AUDIT_EQUAL: 670 case AUDIT_EQUAL:
@@ -848,7 +851,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
848 return ERR_PTR(-ENOMEM); 851 return ERR_PTR(-ENOMEM);
849 852
850 new = audit_init_watch(path); 853 new = audit_init_watch(path);
851 if (unlikely(IS_ERR(new))) { 854 if (IS_ERR(new)) {
852 kfree(path); 855 kfree(path);
853 goto out; 856 goto out;
854 } 857 }
@@ -989,7 +992,7 @@ static void audit_update_watch(struct audit_parent *parent,
989 audit_set_auditable(current->audit_context); 992 audit_set_auditable(current->audit_context);
990 993
991 nwatch = audit_dupe_watch(owatch); 994 nwatch = audit_dupe_watch(owatch);
992 if (unlikely(IS_ERR(nwatch))) { 995 if (IS_ERR(nwatch)) {
993 mutex_unlock(&audit_filter_mutex); 996 mutex_unlock(&audit_filter_mutex);
994 audit_panic("error updating watch, skipping"); 997 audit_panic("error updating watch, skipping");
995 return; 998 return;
@@ -1004,7 +1007,7 @@ static void audit_update_watch(struct audit_parent *parent,
1004 list_del_rcu(&oentry->list); 1007 list_del_rcu(&oentry->list);
1005 1008
1006 nentry = audit_dupe_rule(&oentry->rule, nwatch); 1009 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1007 if (unlikely(IS_ERR(nentry))) 1010 if (IS_ERR(nentry))
1008 audit_panic("error updating watch, removing"); 1011 audit_panic("error updating watch, removing");
1009 else { 1012 else {
1010 int h = audit_hash_ino((u32)ino); 1013 int h = audit_hash_ino((u32)ino);
@@ -1500,8 +1503,9 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1500} 1503}
1501 1504
1502/* Log rule additions and removals */ 1505/* Log rule additions and removals */
1503static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, 1506static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1504 struct audit_krule *rule, int res) 1507 char *action, struct audit_krule *rule,
1508 int res)
1505{ 1509{
1506 struct audit_buffer *ab; 1510 struct audit_buffer *ab;
1507 1511
@@ -1511,7 +1515,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1511 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1515 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1512 if (!ab) 1516 if (!ab)
1513 return; 1517 return;
1514 audit_log_format(ab, "auid=%u", loginuid); 1518 audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
1515 if (sid) { 1519 if (sid) {
1516 char *ctx = NULL; 1520 char *ctx = NULL;
1517 u32 len; 1521 u32 len;
@@ -1543,7 +1547,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1543 * @sid: SE Linux Security ID of sender 1547 * @sid: SE Linux Security ID of sender
1544 */ 1548 */
1545int audit_receive_filter(int type, int pid, int uid, int seq, void *data, 1549int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1546 size_t datasz, uid_t loginuid, u32 sid) 1550 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
1547{ 1551{
1548 struct task_struct *tsk; 1552 struct task_struct *tsk;
1549 struct audit_netlink_list *dest; 1553 struct audit_netlink_list *dest;
@@ -1590,7 +1594,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1590 1594
1591 err = audit_add_rule(entry, 1595 err = audit_add_rule(entry,
1592 &audit_filter_list[entry->rule.listnr]); 1596 &audit_filter_list[entry->rule.listnr]);
1593 audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); 1597 audit_log_rule_change(loginuid, sessionid, sid, "add",
1598 &entry->rule, !err);
1594 1599
1595 if (err) 1600 if (err)
1596 audit_free_rule(entry); 1601 audit_free_rule(entry);
@@ -1606,8 +1611,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1606 1611
1607 err = audit_del_rule(entry, 1612 err = audit_del_rule(entry,
1608 &audit_filter_list[entry->rule.listnr]); 1613 &audit_filter_list[entry->rule.listnr]);
1609 audit_log_rule_change(loginuid, sid, "remove", &entry->rule, 1614 audit_log_rule_change(loginuid, sessionid, sid, "remove",
1610 !err); 1615 &entry->rule, !err);
1611 1616
1612 audit_free_rule(entry); 1617 audit_free_rule(entry);
1613 break; 1618 break;
@@ -1785,7 +1790,7 @@ int audit_update_lsm_rules(void)
1785 watch = entry->rule.watch; 1790 watch = entry->rule.watch;
1786 tree = entry->rule.tree; 1791 tree = entry->rule.tree;
1787 nentry = audit_dupe_rule(&entry->rule, watch); 1792 nentry = audit_dupe_rule(&entry->rule, watch);
1788 if (unlikely(IS_ERR(nentry))) { 1793 if (IS_ERR(nentry)) {
1789 /* save the first error encountered for the 1794 /* save the first error encountered for the
1790 * return value */ 1795 * return value */
1791 if (!err) 1796 if (!err)
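Several auditfilter.c hunks above only drop unlikely() around IS_ERR(); IS_ERR() already contains the branch hint, so the wrapper was redundant. The surrounding watch code relies on the usual kernel convention of returning an errno value encoded inside a pointer. A simplified, self-contained illustration of that convention (userspace stand-ins, not the kernel's err.h; make_watch() is a made-up constructor in the style of audit_init_watch()):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Userspace stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
static void *ERR_PTR(long err)        { return (void *)err; }
static long  PTR_ERR(const void *ptr) { return (long)ptr; }
static int   IS_ERR(const void *ptr)
{
        /* the top MAX_ERRNO addresses are reserved for errno values */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *make_watch(const char *path)
{
        if (!path)
                return ERR_PTR(-EINVAL);  /* error travels in the pointer */
        return (void *)path;              /* stand-in for a real object */
}

int main(void)
{
        void *w = make_watch(NULL);
        if (IS_ERR(w))
                printf("failed: %ld\n", PTR_ERR(w));  /* failed: -22 on Linux */
        return 0;
}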
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 56e56ed594a8..c10e7aae04d7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -68,9 +68,6 @@
68 68
69#include "audit.h" 69#include "audit.h"
70 70
71extern struct list_head audit_filter_list[];
72extern int audit_ever_enabled;
73
74/* AUDIT_NAMES is the number of slots we reserve in the audit_context 71/* AUDIT_NAMES is the number of slots we reserve in the audit_context
75 * for saving names from getname(). */ 72 * for saving names from getname(). */
76#define AUDIT_NAMES 20 73#define AUDIT_NAMES 20
@@ -283,6 +280,19 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
283 } 280 }
284} 281}
285 282
283static int audit_match_filetype(struct audit_context *ctx, int which)
284{
285 unsigned index = which & ~S_IFMT;
286 mode_t mode = which & S_IFMT;
287 if (index >= ctx->name_count)
288 return 0;
289 if (ctx->names[index].ino == -1)
290 return 0;
291 if ((ctx->names[index].mode ^ mode) & S_IFMT)
292 return 0;
293 return 1;
294}
295
286/* 296/*
287 * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; 297 * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
288 * ->first_trees points to its beginning, ->trees - to the current end of data. 298 * ->first_trees points to its beginning, ->trees - to the current end of data.
@@ -592,6 +602,9 @@ static int audit_filter_rules(struct task_struct *tsk,
592 case AUDIT_PERM: 602 case AUDIT_PERM:
593 result = audit_match_perm(ctx, f->val); 603 result = audit_match_perm(ctx, f->val);
594 break; 604 break;
605 case AUDIT_FILETYPE:
606 result = audit_match_filetype(ctx, f->val);
607 break;
595 } 608 }
596 609
597 if (!result) 610 if (!result)
@@ -1095,7 +1108,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1095 audit_log_format(*ab, "[%d]", i); 1108 audit_log_format(*ab, "[%d]", i);
1096 audit_log_format(*ab, "="); 1109 audit_log_format(*ab, "=");
1097 if (has_cntl) 1110 if (has_cntl)
1098 audit_log_hex(*ab, buf, to_send); 1111 audit_log_n_hex(*ab, buf, to_send);
1099 else 1112 else
1100 audit_log_format(*ab, "\"%s\"", buf); 1113 audit_log_format(*ab, "\"%s\"", buf);
1101 audit_log_format(*ab, "\n"); 1114 audit_log_format(*ab, "\n");
@@ -1296,7 +1309,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1296 break; } 1309 break; }
1297 1310
1298 case AUDIT_SOCKETCALL: { 1311 case AUDIT_SOCKETCALL: {
1299 int i;
1300 struct audit_aux_data_socketcall *axs = (void *)aux; 1312 struct audit_aux_data_socketcall *axs = (void *)aux;
1301 audit_log_format(ab, "nargs=%d", axs->nargs); 1313 audit_log_format(ab, "nargs=%d", axs->nargs);
1302 for (i=0; i<axs->nargs; i++) 1314 for (i=0; i<axs->nargs; i++)
@@ -1307,7 +1319,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1307 struct audit_aux_data_sockaddr *axs = (void *)aux; 1319 struct audit_aux_data_sockaddr *axs = (void *)aux;
1308 1320
1309 audit_log_format(ab, "saddr="); 1321 audit_log_format(ab, "saddr=");
1310 audit_log_hex(ab, axs->a, axs->len); 1322 audit_log_n_hex(ab, axs->a, axs->len);
1311 break; } 1323 break; }
1312 1324
1313 case AUDIT_FD_PAIR: { 1325 case AUDIT_FD_PAIR: {
@@ -1321,7 +1333,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1321 1333
1322 for (aux = context->aux_pids; aux; aux = aux->next) { 1334 for (aux = context->aux_pids; aux; aux = aux->next) {
1323 struct audit_aux_data_pids *axs = (void *)aux; 1335 struct audit_aux_data_pids *axs = (void *)aux;
1324 int i;
1325 1336
1326 for (i = 0; i < axs->pid_count; i++) 1337 for (i = 0; i < axs->pid_count; i++)
1327 if (audit_log_pid_context(context, axs->target_pid[i], 1338 if (audit_log_pid_context(context, axs->target_pid[i],
@@ -1371,8 +1382,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1371 default: 1382 default:
1372 /* log the name's directory component */ 1383 /* log the name's directory component */
1373 audit_log_format(ab, " name="); 1384 audit_log_format(ab, " name=");
1374 audit_log_n_untrustedstring(ab, n->name_len, 1385 audit_log_n_untrustedstring(ab, n->name,
1375 n->name); 1386 n->name_len);
1376 } 1387 }
1377 } else 1388 } else
1378 audit_log_format(ab, " name=(null)"); 1389 audit_log_format(ab, " name=(null)");
@@ -1596,7 +1607,7 @@ static inline void handle_one(const struct inode *inode)
1596 if (likely(put_tree_ref(context, chunk))) 1607 if (likely(put_tree_ref(context, chunk)))
1597 return; 1608 return;
1598 if (unlikely(!grow_tree_refs(context))) { 1609 if (unlikely(!grow_tree_refs(context))) {
1599 printk(KERN_WARNING "out of memory, audit has lost a tree reference"); 1610 printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
1600 audit_set_auditable(context); 1611 audit_set_auditable(context);
1601 audit_put_chunk(chunk); 1612 audit_put_chunk(chunk);
1602 unroll_tree_refs(context, p, count); 1613 unroll_tree_refs(context, p, count);
@@ -1656,7 +1667,7 @@ retry:
1656 } 1667 }
1657 /* too bad */ 1668 /* too bad */
1658 printk(KERN_WARNING 1669 printk(KERN_WARNING
1659 "out of memory, audit has lost a tree reference"); 1670 "out of memory, audit has lost a tree reference\n");
1660 unroll_tree_refs(context, p, count); 1671 unroll_tree_refs(context, p, count);
1661 audit_set_auditable(context); 1672 audit_set_auditable(context);
1662 return; 1673 return;
@@ -1752,13 +1763,13 @@ static int audit_inc_name_count(struct audit_context *context,
1752 if (context->name_count >= AUDIT_NAMES) { 1763 if (context->name_count >= AUDIT_NAMES) {
1753 if (inode) 1764 if (inode)
1754 printk(KERN_DEBUG "name_count maxed, losing inode data: " 1765 printk(KERN_DEBUG "name_count maxed, losing inode data: "
1755 "dev=%02x:%02x, inode=%lu", 1766 "dev=%02x:%02x, inode=%lu\n",
1756 MAJOR(inode->i_sb->s_dev), 1767 MAJOR(inode->i_sb->s_dev),
1757 MINOR(inode->i_sb->s_dev), 1768 MINOR(inode->i_sb->s_dev),
1758 inode->i_ino); 1769 inode->i_ino);
1759 1770
1760 else 1771 else
1761 printk(KERN_DEBUG "name_count maxed, losing inode data"); 1772 printk(KERN_DEBUG "name_count maxed, losing inode data\n");
1762 return 1; 1773 return 1;
1763 } 1774 }
1764 context->name_count++; 1775 context->name_count++;
@@ -2361,9 +2372,6 @@ int __audit_signal_info(int sig, struct task_struct *t)
2361 struct audit_aux_data_pids *axp; 2372 struct audit_aux_data_pids *axp;
2362 struct task_struct *tsk = current; 2373 struct task_struct *tsk = current;
2363 struct audit_context *ctx = tsk->audit_context; 2374 struct audit_context *ctx = tsk->audit_context;
2364 extern pid_t audit_sig_pid;
2365 extern uid_t audit_sig_uid;
2366 extern u32 audit_sig_sid;
2367 2375
2368 if (audit_pid && t->tgid == audit_pid) { 2376 if (audit_pid && t->tgid == audit_pid) {
2369 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { 2377 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
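The new AUDIT_FILETYPE support spans the two files above: auditfilter.c validates the rule value and audit_match_filetype() in auditsc.c interprets it, with the S_IFMT bits naming the expected file type and the remaining low bits selecting which collected name the check applies to. A small userspace sketch of that packing, using the standard S_IFMT macros (the helper names here are made up for illustration):

#include <stdio.h>
#include <sys/stat.h>

/* Pack "names[index] must have file type 'mode'" into one rule value, the
 * same split audit_match_filetype() performs on f->val. */
static unsigned int filetype_val(unsigned int index, mode_t mode)
{
        return (index & ~S_IFMT) | (mode & S_IFMT);
}

static void unpack(unsigned int val)
{
        unsigned int index = val & ~S_IFMT;   /* which names[] entry */
        unsigned int type  = val & S_IFMT;    /* expected S_IF* file type */
        printf("names[%u] must have type %#o\n", index, type);
}

int main(void)
{
        unpack(filetype_val(0, S_IFREG));     /* names[0]: regular file */
        unpack(filetype_val(1, S_IFDIR));     /* names[1]: directory */
        return 0;
}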
diff --git a/kernel/bounds.c b/kernel/bounds.c
index c3c55544db2f..3c5301381837 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -8,11 +8,7 @@
8/* Include headers that define the enum constants of interest */ 8/* Include headers that define the enum constants of interest */
9#include <linux/page-flags.h> 9#include <linux/page-flags.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11 11#include <linux/kbuild.h>
12#define DEFINE(sym, val) \
13 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
14
15#define BLANK() asm volatile("\n->" : : )
16 12
17void foo(void) 13void foo(void)
18{ 14{
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6d8de051382b..fbc6fc8949b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -44,6 +44,7 @@
44#include <linux/kmod.h> 44#include <linux/kmod.h>
45#include <linux/delayacct.h> 45#include <linux/delayacct.h>
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h>
47 48
48#include <asm/atomic.h> 49#include <asm/atomic.h>
49 50
@@ -118,17 +119,7 @@ static int root_count;
118 * be called. 119 * be called.
119 */ 120 */
120static int need_forkexit_callback; 121static int need_forkexit_callback;
121 122static int need_mm_owner_callback __read_mostly;
122/* bits in struct cgroup flags field */
123enum {
124 /* Control Group is dead */
125 CGRP_REMOVED,
126 /* Control Group has previously had a child cgroup or a task,
127 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
128 CGRP_RELEASABLE,
129 /* Control Group requires release notifications to userspace */
130 CGRP_NOTIFY_ON_RELEASE,
131};
132 123
133/* convenient tests for these bits */ 124/* convenient tests for these bits */
134inline int cgroup_is_removed(const struct cgroup *cgrp) 125inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link;
204static DEFINE_RWLOCK(css_set_lock); 195static DEFINE_RWLOCK(css_set_lock);
205static int css_set_count; 196static int css_set_count;
206 197
198/* hash table for cgroup groups. This improves the performance of
199 * finding an existing css_set */
200#define CSS_SET_HASH_BITS 7
201#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
202static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
203
204static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
205{
206 int i;
207 int index;
208 unsigned long tmp = 0UL;
209
210 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
211 tmp += (unsigned long)css[i];
212 tmp = (tmp >> 16) ^ tmp;
213
214 index = hash_long(tmp, CSS_SET_HASH_BITS);
215
216 return &css_set_table[index];
217}
218
207/* We don't maintain the lists running through each css_set to its 219/* We don't maintain the lists running through each css_set to its
208 * task until after the first call to cgroup_iter_start(). This 220 * task until after the first call to cgroup_iter_start(). This
209 * reduces the fork()/exit() overhead for people who have cgroups 221 * reduces the fork()/exit() overhead for people who have cgroups
@@ -230,7 +242,7 @@ static int use_task_css_set_links;
230static void unlink_css_set(struct css_set *cg) 242static void unlink_css_set(struct css_set *cg)
231{ 243{
232 write_lock(&css_set_lock); 244 write_lock(&css_set_lock);
233 list_del(&cg->list); 245 hlist_del(&cg->hlist);
234 css_set_count--; 246 css_set_count--;
235 while (!list_empty(&cg->cg_links)) { 247 while (!list_empty(&cg->cg_links)) {
236 struct cg_cgroup_link *link; 248 struct cg_cgroup_link *link;
@@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg)
295/* 307/*
296 * find_existing_css_set() is a helper for 308 * find_existing_css_set() is a helper for
297 * find_css_set(), and checks to see whether an existing 309 * find_css_set(), and checks to see whether an existing
298 * css_set is suitable. This currently walks a linked-list for 310 * css_set is suitable.
299 * simplicity; a later patch will use a hash table for better
300 * performance
301 * 311 *
302 * oldcg: the cgroup group that we're using before the cgroup 312 * oldcg: the cgroup group that we're using before the cgroup
303 * transition 313 * transition
@@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set(
314{ 324{
315 int i; 325 int i;
316 struct cgroupfs_root *root = cgrp->root; 326 struct cgroupfs_root *root = cgrp->root;
317 struct list_head *l = &init_css_set.list; 327 struct hlist_head *hhead;
328 struct hlist_node *node;
329 struct css_set *cg;
318 330
319 /* Built the set of subsystem state objects that we want to 331 /* Built the set of subsystem state objects that we want to
320 * see in the new css_set */ 332 * see in the new css_set */
@@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set(
331 } 343 }
332 } 344 }
333 345
334 /* Look through existing cgroup groups to find one to reuse */ 346 hhead = css_set_hash(template);
335 do { 347 hlist_for_each_entry(cg, node, hhead, hlist) {
336 struct css_set *cg =
337 list_entry(l, struct css_set, list);
338
339 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 348 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
340 /* All subsystems matched */ 349 /* All subsystems matched */
341 return cg; 350 return cg;
342 } 351 }
343 /* Try the next cgroup group */ 352 }
344 l = l->next;
345 } while (l != &init_css_set.list);
346 353
347 /* No existing cgroup group matched */ 354 /* No existing cgroup group matched */
348 return NULL; 355 return NULL;
@@ -404,6 +411,8 @@ static struct css_set *find_css_set(
404 struct list_head tmp_cg_links; 411 struct list_head tmp_cg_links;
405 struct cg_cgroup_link *link; 412 struct cg_cgroup_link *link;
406 413
414 struct hlist_head *hhead;
415
407 /* First see if we already have a cgroup group that matches 416 /* First see if we already have a cgroup group that matches
408 * the desired set */ 417 * the desired set */
409 write_lock(&css_set_lock); 418 write_lock(&css_set_lock);
@@ -428,6 +437,7 @@ static struct css_set *find_css_set(
428 kref_init(&res->ref); 437 kref_init(&res->ref);
429 INIT_LIST_HEAD(&res->cg_links); 438 INIT_LIST_HEAD(&res->cg_links);
430 INIT_LIST_HEAD(&res->tasks); 439 INIT_LIST_HEAD(&res->tasks);
440 INIT_HLIST_NODE(&res->hlist);
431 441
432 /* Copy the set of subsystem state objects generated in 442 /* Copy the set of subsystem state objects generated in
433 * find_existing_css_set() */ 443 * find_existing_css_set() */
@@ -467,9 +477,12 @@ static struct css_set *find_css_set(
467 477
468 BUG_ON(!list_empty(&tmp_cg_links)); 478 BUG_ON(!list_empty(&tmp_cg_links));
469 479
470 /* Link this cgroup group into the list */
471 list_add(&res->list, &init_css_set.list);
472 css_set_count++; 480 css_set_count++;
481
482 /* Add this cgroup group to the hash table */
483 hhead = css_set_hash(res->subsys);
484 hlist_add_head(&res->hlist, hhead);
485
473 write_unlock(&css_set_lock); 486 write_unlock(&css_set_lock);
474 487
475 return res; 488 return res;
@@ -562,7 +575,7 @@ static struct inode_operations cgroup_dir_inode_operations;
562static struct file_operations proc_cgroupstats_operations; 575static struct file_operations proc_cgroupstats_operations;
563 576
564static struct backing_dev_info cgroup_backing_dev_info = { 577static struct backing_dev_info cgroup_backing_dev_info = {
565 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 578 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
566}; 579};
567 580
568static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 581static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
@@ -948,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
948 int ret = 0; 961 int ret = 0;
949 struct super_block *sb; 962 struct super_block *sb;
950 struct cgroupfs_root *root; 963 struct cgroupfs_root *root;
951 struct list_head tmp_cg_links, *l; 964 struct list_head tmp_cg_links;
952 INIT_LIST_HEAD(&tmp_cg_links); 965 INIT_LIST_HEAD(&tmp_cg_links);
953 966
954 /* First find the desired set of subsystems */ 967 /* First find the desired set of subsystems */
@@ -990,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
990 /* New superblock */ 1003 /* New superblock */
991 struct cgroup *cgrp = &root->top_cgroup; 1004 struct cgroup *cgrp = &root->top_cgroup;
992 struct inode *inode; 1005 struct inode *inode;
1006 int i;
993 1007
994 BUG_ON(sb->s_root != NULL); 1008 BUG_ON(sb->s_root != NULL);
995 1009
@@ -1034,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1034 /* Link the top cgroup in this hierarchy into all 1048 /* Link the top cgroup in this hierarchy into all
1035 * the css_set objects */ 1049 * the css_set objects */
1036 write_lock(&css_set_lock); 1050 write_lock(&css_set_lock);
1037 l = &init_css_set.list; 1051 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
1038 do { 1052 struct hlist_head *hhead = &css_set_table[i];
1053 struct hlist_node *node;
1039 struct css_set *cg; 1054 struct css_set *cg;
1040 struct cg_cgroup_link *link; 1055
1041 cg = list_entry(l, struct css_set, list); 1056 hlist_for_each_entry(cg, node, hhead, hlist) {
1042 BUG_ON(list_empty(&tmp_cg_links)); 1057 struct cg_cgroup_link *link;
1043 link = list_entry(tmp_cg_links.next, 1058
1044 struct cg_cgroup_link, 1059 BUG_ON(list_empty(&tmp_cg_links));
1045 cgrp_link_list); 1060 link = list_entry(tmp_cg_links.next,
1046 list_del(&link->cgrp_link_list); 1061 struct cg_cgroup_link,
1047 link->cg = cg; 1062 cgrp_link_list);
1048 list_add(&link->cgrp_link_list, 1063 list_del(&link->cgrp_link_list);
1049 &root->top_cgroup.css_sets); 1064 link->cg = cg;
1050 list_add(&link->cg_link_list, &cg->cg_links); 1065 list_add(&link->cgrp_link_list,
1051 l = l->next; 1066 &root->top_cgroup.css_sets);
1052 } while (l != &init_css_set.list); 1067 list_add(&link->cg_link_list, &cg->cg_links);
1068 }
1069 }
1053 write_unlock(&css_set_lock); 1070 write_unlock(&css_set_lock);
1054 1071
1055 free_cg_links(&tmp_cg_links); 1072 free_cg_links(&tmp_cg_links);
@@ -1307,18 +1324,16 @@ enum cgroup_filetype {
1307 FILE_DIR, 1324 FILE_DIR,
1308 FILE_TASKLIST, 1325 FILE_TASKLIST,
1309 FILE_NOTIFY_ON_RELEASE, 1326 FILE_NOTIFY_ON_RELEASE,
1310 FILE_RELEASABLE,
1311 FILE_RELEASE_AGENT, 1327 FILE_RELEASE_AGENT,
1312}; 1328};
1313 1329
1314static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, 1330static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1315 struct file *file, 1331 struct file *file,
1316 const char __user *userbuf, 1332 const char __user *userbuf,
1317 size_t nbytes, loff_t *unused_ppos) 1333 size_t nbytes, loff_t *unused_ppos)
1318{ 1334{
1319 char buffer[64]; 1335 char buffer[64];
1320 int retval = 0; 1336 int retval = 0;
1321 u64 val;
1322 char *end; 1337 char *end;
1323 1338
1324 if (!nbytes) 1339 if (!nbytes)
@@ -1329,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
1329 return -EFAULT; 1344 return -EFAULT;
1330 1345
1331 buffer[nbytes] = 0; /* nul-terminate */ 1346 buffer[nbytes] = 0; /* nul-terminate */
1332 1347 strstrip(buffer);
1333 /* strip newline if necessary */ 1348 if (cft->write_u64) {
1334 if (nbytes && (buffer[nbytes-1] == '\n')) 1349 u64 val = simple_strtoull(buffer, &end, 0);
1335 buffer[nbytes-1] = 0; 1350 if (*end)
1336 val = simple_strtoull(buffer, &end, 0); 1351 return -EINVAL;
1337 if (*end) 1352 retval = cft->write_u64(cgrp, cft, val);
1338 return -EINVAL; 1353 } else {
1339 1354 s64 val = simple_strtoll(buffer, &end, 0);
1340 /* Pass to subsystem */ 1355 if (*end)
1341 retval = cft->write_uint(cgrp, cft, val); 1356 return -EINVAL;
1357 retval = cft->write_s64(cgrp, cft, val);
1358 }
1342 if (!retval) 1359 if (!retval)
1343 retval = nbytes; 1360 retval = nbytes;
1344 return retval; 1361 return retval;
@@ -1419,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1419 return -ENODEV; 1436 return -ENODEV;
1420 if (cft->write) 1437 if (cft->write)
1421 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1438 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1422 if (cft->write_uint) 1439 if (cft->write_u64 || cft->write_s64)
1423 return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); 1440 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
1441 if (cft->trigger) {
1442 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
1443 return ret ? ret : nbytes;
1444 }
1424 return -EINVAL; 1445 return -EINVAL;
1425} 1446}
1426 1447
1427static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, 1448static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1428 struct file *file, 1449 struct file *file,
1429 char __user *buf, size_t nbytes, 1450 char __user *buf, size_t nbytes,
1430 loff_t *ppos) 1451 loff_t *ppos)
1431{ 1452{
1432 char tmp[64]; 1453 char tmp[64];
1433 u64 val = cft->read_uint(cgrp, cft); 1454 u64 val = cft->read_u64(cgrp, cft);
1434 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1455 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1435 1456
1436 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1457 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1437} 1458}
1438 1459
1460static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1461 struct file *file,
1462 char __user *buf, size_t nbytes,
1463 loff_t *ppos)
1464{
1465 char tmp[64];
1466 s64 val = cft->read_s64(cgrp, cft);
1467 int len = sprintf(tmp, "%lld\n", (long long) val);
1468
1469 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1470}
1471
1439static ssize_t cgroup_common_file_read(struct cgroup *cgrp, 1472static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1440 struct cftype *cft, 1473 struct cftype *cft,
1441 struct file *file, 1474 struct file *file,
@@ -1490,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1490 1523
1491 if (cft->read) 1524 if (cft->read)
1492 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 1525 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1493 if (cft->read_uint) 1526 if (cft->read_u64)
1494 return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); 1527 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
1528 if (cft->read_s64)
1529 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
1495 return -EINVAL; 1530 return -EINVAL;
1496} 1531}
1497 1532
1533/*
1534 * seqfile ops/methods for returning structured data. Currently just
1535 * supports string->u64 maps, but can be extended in future.
1536 */
1537
1538struct cgroup_seqfile_state {
1539 struct cftype *cft;
1540 struct cgroup *cgroup;
1541};
1542
1543static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
1544{
1545 struct seq_file *sf = cb->state;
1546 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
1547}
1548
1549static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1550{
1551 struct cgroup_seqfile_state *state = m->private;
1552 struct cftype *cft = state->cft;
1553 if (cft->read_map) {
1554 struct cgroup_map_cb cb = {
1555 .fill = cgroup_map_add,
1556 .state = m,
1557 };
1558 return cft->read_map(state->cgroup, cft, &cb);
1559 }
1560 return cft->read_seq_string(state->cgroup, cft, m);
1561}
1562
1563int cgroup_seqfile_release(struct inode *inode, struct file *file)
1564{
1565 struct seq_file *seq = file->private_data;
1566 kfree(seq->private);
1567 return single_release(inode, file);
1568}
1569
1570static struct file_operations cgroup_seqfile_operations = {
1571 .read = seq_read,
1572 .llseek = seq_lseek,
1573 .release = cgroup_seqfile_release,
1574};
1575
1498static int cgroup_file_open(struct inode *inode, struct file *file) 1576static int cgroup_file_open(struct inode *inode, struct file *file)
1499{ 1577{
1500 int err; 1578 int err;
@@ -1507,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1507 cft = __d_cft(file->f_dentry); 1585 cft = __d_cft(file->f_dentry);
1508 if (!cft) 1586 if (!cft)
1509 return -ENODEV; 1587 return -ENODEV;
1510 if (cft->open) 1588 if (cft->read_map || cft->read_seq_string) {
1589 struct cgroup_seqfile_state *state =
1590 kzalloc(sizeof(*state), GFP_USER);
1591 if (!state)
1592 return -ENOMEM;
1593 state->cft = cft;
1594 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
1595 file->f_op = &cgroup_seqfile_operations;
1596 err = single_open(file, cgroup_seqfile_show, state);
1597 if (err < 0)
1598 kfree(state);
1599 } else if (cft->open)
1511 err = cft->open(inode, file); 1600 err = cft->open(inode, file);
1512 else 1601 else
1513 err = 0; 1602 err = 0;
@@ -1715,7 +1804,7 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
1715 * The tasklist_lock is not held here, as do_each_thread() and 1804 * The tasklist_lock is not held here, as do_each_thread() and
1716 * while_each_thread() are protected by RCU. 1805 * while_each_thread() are protected by RCU.
1717 */ 1806 */
1718void cgroup_enable_task_cg_lists(void) 1807static void cgroup_enable_task_cg_lists(void)
1719{ 1808{
1720 struct task_struct *p, *g; 1809 struct task_struct *p, *g;
1721 write_lock(&css_set_lock); 1810 write_lock(&css_set_lock);
@@ -1913,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
1913 2002
1914 if (heap->size) { 2003 if (heap->size) {
1915 for (i = 0; i < heap->size; i++) { 2004 for (i = 0; i < heap->size; i++) {
1916 struct task_struct *p = heap->ptrs[i]; 2005 struct task_struct *q = heap->ptrs[i];
1917 if (i == 0) { 2006 if (i == 0) {
1918 latest_time = p->start_time; 2007 latest_time = q->start_time;
1919 latest_task = p; 2008 latest_task = q;
1920 } 2009 }
1921 /* Process the task per the caller's callback */ 2010 /* Process the task per the caller's callback */
1922 scan->process_task(p, scan); 2011 scan->process_task(q, scan);
1923 put_task_struct(p); 2012 put_task_struct(q);
1924 } 2013 }
1925 /* 2014 /*
1926 * If we had to process any tasks at all, scan again 2015 * If we had to process any tasks at all, scan again
@@ -2138,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2138 return notify_on_release(cgrp); 2227 return notify_on_release(cgrp);
2139} 2228}
2140 2229
2141static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
2142{
2143 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
2144}
2145
2146/* 2230/*
2147 * for the common functions, 'private' gives the type of file 2231 * for the common functions, 'private' gives the type of file
2148 */ 2232 */
@@ -2158,16 +2242,10 @@ static struct cftype files[] = {
2158 2242
2159 { 2243 {
2160 .name = "notify_on_release", 2244 .name = "notify_on_release",
2161 .read_uint = cgroup_read_notify_on_release, 2245 .read_u64 = cgroup_read_notify_on_release,
2162 .write = cgroup_common_file_write, 2246 .write = cgroup_common_file_write,
2163 .private = FILE_NOTIFY_ON_RELEASE, 2247 .private = FILE_NOTIFY_ON_RELEASE,
2164 }, 2248 },
2165
2166 {
2167 .name = "releasable",
2168 .read_uint = cgroup_read_releasable,
2169 .private = FILE_RELEASABLE,
2170 }
2171}; 2249};
2172 2250
2173static struct cftype cft_release_agent = { 2251static struct cftype cft_release_agent = {
@@ -2401,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2401 return 0; 2479 return 0;
2402} 2480}
2403 2481
2404static void cgroup_init_subsys(struct cgroup_subsys *ss) 2482static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2405{ 2483{
2406 struct cgroup_subsys_state *css; 2484 struct cgroup_subsys_state *css;
2407 struct list_head *l;
2408 2485
2409 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2486 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2410 2487
@@ -2415,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
2415 BUG_ON(IS_ERR(css)); 2492 BUG_ON(IS_ERR(css));
2416 init_cgroup_css(css, ss, dummytop); 2493 init_cgroup_css(css, ss, dummytop);
2417 2494
2418 /* Update all cgroup groups to contain a subsys 2495 /* Update the init_css_set to contain a subsys
2419 * pointer to this state - since the subsystem is 2496 * pointer to this state - since the subsystem is
2420 * newly registered, all tasks and hence all cgroup 2497 * newly registered, all tasks and hence the
2421 * groups are in the subsystem's top cgroup. */ 2498 * init_css_set is in the subsystem's top cgroup. */
2422 write_lock(&css_set_lock); 2499 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2423 l = &init_css_set.list;
2424 do {
2425 struct css_set *cg =
2426 list_entry(l, struct css_set, list);
2427 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2428 l = l->next;
2429 } while (l != &init_css_set.list);
2430 write_unlock(&css_set_lock);
2431
2432 /* If this subsystem requested that it be notified with fork
2433 * events, we should send it one now for every process in the
2434 * system */
2435 if (ss->fork) {
2436 struct task_struct *g, *p;
2437
2438 read_lock(&tasklist_lock);
2439 do_each_thread(g, p) {
2440 ss->fork(ss, p);
2441 } while_each_thread(g, p);
2442 read_unlock(&tasklist_lock);
2443 }
2444 2500
2445 need_forkexit_callback |= ss->fork || ss->exit; 2501 need_forkexit_callback |= ss->fork || ss->exit;
2502 need_mm_owner_callback |= !!ss->mm_owner_changed;
2503
2504 /* At system boot, before all subsystems have been
2505 * registered, no tasks have been forked, so we don't
2506 * need to invoke fork callbacks here. */
2507 BUG_ON(!list_empty(&init_task.tasks));
2446 2508
2447 ss->active = 1; 2509 ss->active = 1;
2448} 2510}
@@ -2458,9 +2520,9 @@ int __init cgroup_init_early(void)
2458 int i; 2520 int i;
2459 kref_init(&init_css_set.ref); 2521 kref_init(&init_css_set.ref);
2460 kref_get(&init_css_set.ref); 2522 kref_get(&init_css_set.ref);
2461 INIT_LIST_HEAD(&init_css_set.list);
2462 INIT_LIST_HEAD(&init_css_set.cg_links); 2523 INIT_LIST_HEAD(&init_css_set.cg_links);
2463 INIT_LIST_HEAD(&init_css_set.tasks); 2524 INIT_LIST_HEAD(&init_css_set.tasks);
2525 INIT_HLIST_NODE(&init_css_set.hlist);
2464 css_set_count = 1; 2526 css_set_count = 1;
2465 init_cgroup_root(&rootnode); 2527 init_cgroup_root(&rootnode);
2466 list_add(&rootnode.root_list, &roots); 2528 list_add(&rootnode.root_list, &roots);
@@ -2473,6 +2535,9 @@ int __init cgroup_init_early(void)
2473 list_add(&init_css_set_link.cg_link_list, 2535 list_add(&init_css_set_link.cg_link_list,
2474 &init_css_set.cg_links); 2536 &init_css_set.cg_links);
2475 2537
2538 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
2539 INIT_HLIST_HEAD(&css_set_table[i]);
2540
2476 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2541 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2477 struct cgroup_subsys *ss = subsys[i]; 2542 struct cgroup_subsys *ss = subsys[i];
2478 2543
@@ -2502,7 +2567,7 @@ int __init cgroup_init(void)
2502{ 2567{
2503 int err; 2568 int err;
2504 int i; 2569 int i;
2505 struct proc_dir_entry *entry; 2570 struct hlist_head *hhead;
2506 2571
2507 err = bdi_init(&cgroup_backing_dev_info); 2572 err = bdi_init(&cgroup_backing_dev_info);
2508 if (err) 2573 if (err)
@@ -2514,13 +2579,15 @@ int __init cgroup_init(void)
2514 cgroup_init_subsys(ss); 2579 cgroup_init_subsys(ss);
2515 } 2580 }
2516 2581
2582 /* Add init_css_set to the hash table */
2583 hhead = css_set_hash(init_css_set.subsys);
2584 hlist_add_head(&init_css_set.hlist, hhead);
2585
2517 err = register_filesystem(&cgroup_fs_type); 2586 err = register_filesystem(&cgroup_fs_type);
2518 if (err < 0) 2587 if (err < 0)
2519 goto out; 2588 goto out;
2520 2589
2521 entry = create_proc_entry("cgroups", 0, NULL); 2590 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
2522 if (entry)
2523 entry->proc_fops = &proc_cgroupstats_operations;
2524 2591
2525out: 2592out:
2526 if (err) 2593 if (err)
@@ -2683,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
2683 } 2750 }
2684} 2751}
2685 2752
2753#ifdef CONFIG_MM_OWNER
2754/**
2755 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2756 * @p: the new owner
2757 *
2758 * Called on every change to mm->owner. mm_init_owner() does not
2759 * invoke this routine, since it assigns the mm->owner the first time
2760 * and does not change it.
2761 */
2762void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2763{
2764 struct cgroup *oldcgrp, *newcgrp;
2765
2766 if (need_mm_owner_callback) {
2767 int i;
2768 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2769 struct cgroup_subsys *ss = subsys[i];
2770 oldcgrp = task_cgroup(old, ss->subsys_id);
2771 newcgrp = task_cgroup(new, ss->subsys_id);
2772 if (oldcgrp == newcgrp)
2773 continue;
2774 if (ss->mm_owner_changed)
2775 ss->mm_owner_changed(ss, oldcgrp, newcgrp);
2776 }
2777 }
2778}
2779#endif /* CONFIG_MM_OWNER */
2780
2686/** 2781/**
2687 * cgroup_post_fork - called on a new task after adding it to the task list 2782 * cgroup_post_fork - called on a new task after adding it to the task list
2688 * @child: the task in question 2783 * @child: the task in question
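The cgroup.c hunks above replace the old read_uint/write_uint callbacks with typed read_u64/read_s64/write_u64/write_s64 handlers, and cgroup_write_X64() now strips the input buffer and rejects anything the simple_strtoull/simple_strtoll end pointer did not consume. Below is a minimal userspace sketch of that strict-parse idiom; it is an analogue of the kernel code, not the code itself, and parse_u64_strict() plus its whitespace handling are invented for illustration.

/* Userspace analogue of the strict integer parsing in cgroup_write_X64:
 * accept the number only if strtoull consumed the whole (whitespace-
 * stripped) buffer.  Helper names are illustrative, not kernel APIs. */
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_u64_strict(const char *buf, unsigned long long *out)
{
	char tmp[64];
	char *end;
	size_t len;

	/* strip leading/trailing whitespace, in the spirit of strstrip() */
	while (isspace((unsigned char)*buf))
		buf++;
	len = strlen(buf);
	while (len && isspace((unsigned char)buf[len - 1]))
		len--;
	if (len >= sizeof(tmp))
		return -EINVAL;
	memcpy(tmp, buf, len);
	tmp[len] = '\0';

	errno = 0;
	*out = strtoull(tmp, &end, 0);
	if (errno || end == tmp || *end)	/* trailing junk => reject */
		return -EINVAL;
	return 0;
}

int main(void)
{
	unsigned long long val;

	printf("\"42\\n\" -> %d\n", parse_u64_strict("42\n", &val));	/* 0, val = 42 */
	printf("\"42x\" -> %d\n", parse_u64_strict("42x", &val));	/* -EINVAL */
	return 0;
}

The end-pointer check is what turns a trailing character or a second number into -EINVAL rather than a silently truncated value; the write_u64/write_s64 split in the hunk lets the same common writer handle both unsigned and signed files.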
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index 37301e877cb0..c3dc3aba4c02 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/ccontainer_debug.c - Example cgroup subsystem that 2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info 3 * exposes debug info
4 * 4 *
5 * Copyright (C) Google Inc, 2007 5 * Copyright (C) Google Inc, 2007
@@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
62 return count; 62 return count;
63} 63}
64 64
65static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
66{
67 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
68}
69
65static struct cftype files[] = { 70static struct cftype files[] = {
66 { 71 {
67 .name = "cgroup_refcount", 72 .name = "cgroup_refcount",
68 .read_uint = cgroup_refcount_read, 73 .read_u64 = cgroup_refcount_read,
69 }, 74 },
70 { 75 {
71 .name = "taskcount", 76 .name = "taskcount",
72 .read_uint = taskcount_read, 77 .read_u64 = taskcount_read,
73 }, 78 },
74 79
75 { 80 {
76 .name = "current_css_set", 81 .name = "current_css_set",
77 .read_uint = current_css_set_read, 82 .read_u64 = current_css_set_read,
78 }, 83 },
79 84
80 { 85 {
81 .name = "current_css_set_refcount", 86 .name = "current_css_set_refcount",
82 .read_uint = current_css_set_refcount_read, 87 .read_u64 = current_css_set_refcount_read,
83 }, 88 },
89
90 {
91 .name = "releasable",
92 .read_u64 = releasable_read,
93 }
84}; 94};
85 95
86static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
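The cgroup_debug.c hunks pick up the "releasable" file dropped from the core files[] table and, like the other debug files, expose it through a read_u64 callback that simply returns a flag bit as 0 or 1. A tiny standalone sketch of that flag-as-u64 accessor follows; the FLAG_* names and the flags word are invented, not kernel symbols.

/* Sketch of exposing individual flag bits as 0/1 values, in the spirit
 * of releasable_read() above.  All identifiers are made up. */
#include <stdio.h>

enum { FLAG_RELEASABLE = 0, FLAG_NOTIFY = 1 };

static unsigned long flags = 1UL << FLAG_RELEASABLE;

static unsigned long long read_flag_u64(int bit)
{
	return (flags >> bit) & 1UL;	/* always reads back as 0 or 1 */
}

int main(void)
{
	printf("releasable %llu\n", read_flag_u64(FLAG_RELEASABLE));	/* 1 */
	printf("notify     %llu\n", read_flag_u64(FLAG_NOTIFY));	/* 0 */
	return 0;
}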
diff --git a/kernel/compat.c b/kernel/compat.c
index e1ef04870c2a..32c254a8ab9a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -898,7 +898,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
898 898
899 current->state = TASK_INTERRUPTIBLE; 899 current->state = TASK_INTERRUPTIBLE;
900 schedule(); 900 schedule();
901 set_thread_flag(TIF_RESTORE_SIGMASK); 901 set_restore_sigmask();
902 return -ERESTARTNOHAND; 902 return -ERESTARTNOHAND;
903} 903}
904#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 904#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
@@ -955,7 +955,8 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
955 __put_user(txc.jitcnt, &utp->jitcnt) || 955 __put_user(txc.jitcnt, &utp->jitcnt) ||
956 __put_user(txc.calcnt, &utp->calcnt) || 956 __put_user(txc.calcnt, &utp->calcnt) ||
957 __put_user(txc.errcnt, &utp->errcnt) || 957 __put_user(txc.errcnt, &utp->errcnt) ||
958 __put_user(txc.stbcnt, &utp->stbcnt)) 958 __put_user(txc.stbcnt, &utp->stbcnt) ||
959 __put_user(txc.tai, &utp->tai))
959 ret = -EFAULT; 960 ret = -EFAULT;
960 961
961 return ret; 962 return ret;
@@ -1080,4 +1081,3 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1080 1081
1081 return 0; 1082 return 0;
1082} 1083}
1083
diff --git a/kernel/configs.c b/kernel/configs.c
index e84d3f9c6c7b..4c345210ed8c 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,12 +79,11 @@ static int __init ikconfig_init(void)
79 struct proc_dir_entry *entry; 79 struct proc_dir_entry *entry;
80 80
81 /* create the current config file */ 81 /* create the current config file */
82 entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, 82 entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
83 &proc_root); 83 &ikconfig_file_ops);
84 if (!entry) 84 if (!entry)
85 return -ENOMEM; 85 return -ENOMEM;
86 86
87 entry->proc_fops = &ikconfig_file_ops;
88 entry->size = kernel_config_data_size; 87 entry->size = kernel_config_data_size;
89 88
90 return 0; 89 return 0;
@@ -95,7 +94,7 @@ static int __init ikconfig_init(void)
95 94
96static void __exit ikconfig_cleanup(void) 95static void __exit ikconfig_cleanup(void)
97{ 96{
98 remove_proc_entry("config.gz", &proc_root); 97 remove_proc_entry("config.gz", NULL);
99} 98}
100 99
101module_init(ikconfig_init); 100module_init(ikconfig_init);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2011ad8d2697..c77bc3a1c722 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -33,17 +33,13 @@ static struct {
33 * an ongoing cpu hotplug operation. 33 * an ongoing cpu hotplug operation.
34 */ 34 */
35 int refcount; 35 int refcount;
36 wait_queue_head_t writer_queue;
37} cpu_hotplug; 36} cpu_hotplug;
38 37
39#define writer_exists() (cpu_hotplug.active_writer != NULL)
40
41void __init cpu_hotplug_init(void) 38void __init cpu_hotplug_init(void)
42{ 39{
43 cpu_hotplug.active_writer = NULL; 40 cpu_hotplug.active_writer = NULL;
44 mutex_init(&cpu_hotplug.lock); 41 mutex_init(&cpu_hotplug.lock);
45 cpu_hotplug.refcount = 0; 42 cpu_hotplug.refcount = 0;
46 init_waitqueue_head(&cpu_hotplug.writer_queue);
47} 43}
48 44
49#ifdef CONFIG_HOTPLUG_CPU 45#ifdef CONFIG_HOTPLUG_CPU
@@ -65,11 +61,8 @@ void put_online_cpus(void)
65 if (cpu_hotplug.active_writer == current) 61 if (cpu_hotplug.active_writer == current)
66 return; 62 return;
67 mutex_lock(&cpu_hotplug.lock); 63 mutex_lock(&cpu_hotplug.lock);
68 cpu_hotplug.refcount--; 64 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
69 65 wake_up_process(cpu_hotplug.active_writer);
70 if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
71 wake_up(&cpu_hotplug.writer_queue);
72
73 mutex_unlock(&cpu_hotplug.lock); 66 mutex_unlock(&cpu_hotplug.lock);
74 67
75} 68}
@@ -98,8 +91,8 @@ void cpu_maps_update_done(void)
98 * Note that during a cpu-hotplug operation, the new readers, if any, 91 * Note that during a cpu-hotplug operation, the new readers, if any,
99 * will be blocked by the cpu_hotplug.lock 92 * will be blocked by the cpu_hotplug.lock
100 * 93 *
101 * Since cpu_maps_update_begin is always called after invoking 94 * Since cpu_hotplug_begin() is always called after invoking
102 * cpu_maps_update_begin, we can be sure that only one writer is active. 95 * cpu_maps_update_begin(), we can be sure that only one writer is active.
103 * 96 *
104 * Note that theoretically, there is a possibility of a livelock: 97 * Note that theoretically, there is a possibility of a livelock:
105 * - Refcount goes to zero, last reader wakes up the sleeping 98 * - Refcount goes to zero, last reader wakes up the sleeping
@@ -115,19 +108,16 @@ void cpu_maps_update_done(void)
115 */ 108 */
116static void cpu_hotplug_begin(void) 109static void cpu_hotplug_begin(void)
117{ 110{
118 DECLARE_WAITQUEUE(wait, current);
119
120 mutex_lock(&cpu_hotplug.lock);
121
122 cpu_hotplug.active_writer = current; 111 cpu_hotplug.active_writer = current;
123 add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); 112
124 while (cpu_hotplug.refcount) { 113 for (;;) {
125 set_current_state(TASK_UNINTERRUPTIBLE); 114 mutex_lock(&cpu_hotplug.lock);
115 if (likely(!cpu_hotplug.refcount))
116 break;
117 __set_current_state(TASK_UNINTERRUPTIBLE);
126 mutex_unlock(&cpu_hotplug.lock); 118 mutex_unlock(&cpu_hotplug.lock);
127 schedule(); 119 schedule();
128 mutex_lock(&cpu_hotplug.lock);
129 } 120 }
130 remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
131} 121}
132 122
133static void cpu_hotplug_done(void) 123static void cpu_hotplug_done(void)
@@ -136,7 +126,7 @@ static void cpu_hotplug_done(void)
136 mutex_unlock(&cpu_hotplug.lock); 126 mutex_unlock(&cpu_hotplug.lock);
137} 127}
138/* Need to know about CPUs going up/down? */ 128/* Need to know about CPUs going up/down? */
139int __cpuinit register_cpu_notifier(struct notifier_block *nb) 129int __ref register_cpu_notifier(struct notifier_block *nb)
140{ 130{
141 int ret; 131 int ret;
142 cpu_maps_update_begin(); 132 cpu_maps_update_begin();
@@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb)
149 139
150EXPORT_SYMBOL(register_cpu_notifier); 140EXPORT_SYMBOL(register_cpu_notifier);
151 141
152void unregister_cpu_notifier(struct notifier_block *nb) 142void __ref unregister_cpu_notifier(struct notifier_block *nb)
153{ 143{
154 cpu_maps_update_begin(); 144 cpu_maps_update_begin();
155 raw_notifier_chain_unregister(&cpu_chain, nb); 145 raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -180,7 +170,7 @@ struct take_cpu_down_param {
180}; 170};
181 171
182/* Take this CPU down. */ 172/* Take this CPU down. */
183static int take_cpu_down(void *_param) 173static int __ref take_cpu_down(void *_param)
184{ 174{
185 struct take_cpu_down_param *param = _param; 175 struct take_cpu_down_param *param = _param;
186 int err; 176 int err;
@@ -199,7 +189,7 @@ static int take_cpu_down(void *_param)
199} 189}
200 190
201/* Requires cpu_add_remove_lock to be held */ 191/* Requires cpu_add_remove_lock to be held */
202static int _cpu_down(unsigned int cpu, int tasks_frozen) 192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
203{ 193{
204 int err, nr_calls = 0; 194 int err, nr_calls = 0;
205 struct task_struct *p; 195 struct task_struct *p;
@@ -225,7 +215,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
225 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 215 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
226 hcpu, nr_calls, NULL); 216 hcpu, nr_calls, NULL);
227 printk("%s: attempt to take down CPU %u failed\n", 217 printk("%s: attempt to take down CPU %u failed\n",
228 __FUNCTION__, cpu); 218 __func__, cpu);
229 err = -EINVAL; 219 err = -EINVAL;
230 goto out_release; 220 goto out_release;
231 } 221 }
@@ -274,7 +264,7 @@ out_release:
274 return err; 264 return err;
275} 265}
276 266
277int cpu_down(unsigned int cpu) 267int __ref cpu_down(unsigned int cpu)
278{ 268{
279 int err = 0; 269 int err = 0;
280 270
@@ -305,7 +295,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
305 if (ret == NOTIFY_BAD) { 295 if (ret == NOTIFY_BAD) {
306 nr_calls--; 296 nr_calls--;
307 printk("%s: attempt to bring up CPU %u failed\n", 297 printk("%s: attempt to bring up CPU %u failed\n",
308 __FUNCTION__, cpu); 298 __func__, cpu);
309 ret = -EINVAL; 299 ret = -EINVAL;
310 goto out_notify; 300 goto out_notify;
311 } 301 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 48a976c52cf5..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
127typedef enum { 127typedef enum {
128 CS_CPU_EXCLUSIVE, 128 CS_CPU_EXCLUSIVE,
129 CS_MEM_EXCLUSIVE, 129 CS_MEM_EXCLUSIVE,
130 CS_MEM_HARDWALL,
130 CS_MEMORY_MIGRATE, 131 CS_MEMORY_MIGRATE,
131 CS_SCHED_LOAD_BALANCE, 132 CS_SCHED_LOAD_BALANCE,
132 CS_SPREAD_PAGE, 133 CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
144 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 145 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
145} 146}
146 147
148static inline int is_mem_hardwall(const struct cpuset *cs)
149{
150 return test_bit(CS_MEM_HARDWALL, &cs->flags);
151}
152
147static inline int is_sched_load_balance(const struct cpuset *cs) 153static inline int is_sched_load_balance(const struct cpuset *cs)
148{ 154{
149 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 155 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -735,7 +741,8 @@ static inline int started_after(void *p1, void *p2)
735 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 741 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
736 * words, if its mask is not equal to its cpuset's mask). 742 * words, if its mask is not equal to its cpuset's mask).
737 */ 743 */
738int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 744static int cpuset_test_cpumask(struct task_struct *tsk,
745 struct cgroup_scanner *scan)
739{ 746{
740 return !cpus_equal(tsk->cpus_allowed, 747 return !cpus_equal(tsk->cpus_allowed,
741 (cgroup_cs(scan->cg))->cpus_allowed); 748 (cgroup_cs(scan->cg))->cpus_allowed);
@@ -752,7 +759,8 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
752 * We don't need to re-check for the cgroup/cpuset membership, since we're 759 * We don't need to re-check for the cgroup/cpuset membership, since we're
753 * holding cgroup_lock() at this point. 760 * holding cgroup_lock() at this point.
754 */ 761 */
755void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 762static void cpuset_change_cpumask(struct task_struct *tsk,
763 struct cgroup_scanner *scan)
756{ 764{
757 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); 765 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
758} 766}
@@ -1023,19 +1031,6 @@ int current_cpuset_is_being_rebound(void)
1023 return task_cs(current) == cpuset_being_rebound; 1031 return task_cs(current) == cpuset_being_rebound;
1024} 1032}
1025 1033
1026/*
1027 * Call with cgroup_mutex held.
1028 */
1029
1030static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1031{
1032 if (simple_strtoul(buf, NULL, 10) != 0)
1033 cpuset_memory_pressure_enabled = 1;
1034 else
1035 cpuset_memory_pressure_enabled = 0;
1036 return 0;
1037}
1038
1039static int update_relax_domain_level(struct cpuset *cs, char *buf) 1034static int update_relax_domain_level(struct cpuset *cs, char *buf)
1040{ 1035{
1041 int val = simple_strtol(buf, NULL, 10); 1036 int val = simple_strtol(buf, NULL, 10);
@@ -1053,25 +1048,20 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
1053 1048
1054/* 1049/*
1055 * update_flag - read a 0 or a 1 in a file and update associated flag 1050 * update_flag - read a 0 or a 1 in a file and update associated flag
1056 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1051 * bit: the bit to update (see cpuset_flagbits_t)
1057 * CS_SCHED_LOAD_BALANCE, 1052 * cs: the cpuset to update
1058 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, 1053 * turning_on: whether the flag is being set or cleared
1059 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
1060 * cs: the cpuset to update
1061 * buf: the buffer where we read the 0 or 1
1062 * 1054 *
1063 * Call with cgroup_mutex held. 1055 * Call with cgroup_mutex held.
1064 */ 1056 */
1065 1057
1066static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1058static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1059 int turning_on)
1067{ 1060{
1068 int turning_on;
1069 struct cpuset trialcs; 1061 struct cpuset trialcs;
1070 int err; 1062 int err;
1071 int cpus_nonempty, balance_flag_changed; 1063 int cpus_nonempty, balance_flag_changed;
1072 1064
1073 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1074
1075 trialcs = *cs; 1065 trialcs = *cs;
1076 if (turning_on) 1066 if (turning_on)
1077 set_bit(bit, &trialcs.flags); 1067 set_bit(bit, &trialcs.flags);
@@ -1241,6 +1231,7 @@ typedef enum {
1241 FILE_MEMLIST, 1231 FILE_MEMLIST,
1242 FILE_CPU_EXCLUSIVE, 1232 FILE_CPU_EXCLUSIVE,
1243 FILE_MEM_EXCLUSIVE, 1233 FILE_MEM_EXCLUSIVE,
1234 FILE_MEM_HARDWALL,
1244 FILE_SCHED_LOAD_BALANCE, 1235 FILE_SCHED_LOAD_BALANCE,
1245 FILE_SCHED_RELAX_DOMAIN_LEVEL, 1236 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1246 FILE_MEMORY_PRESSURE_ENABLED, 1237 FILE_MEMORY_PRESSURE_ENABLED,
@@ -1289,46 +1280,71 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1289 case FILE_MEMLIST: 1280 case FILE_MEMLIST:
1290 retval = update_nodemask(cs, buffer); 1281 retval = update_nodemask(cs, buffer);
1291 break; 1282 break;
1283 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1284 retval = update_relax_domain_level(cs, buffer);
1285 break;
1286 default:
1287 retval = -EINVAL;
1288 goto out2;
1289 }
1290
1291 if (retval == 0)
1292 retval = nbytes;
1293out2:
1294 cgroup_unlock();
1295out1:
1296 kfree(buffer);
1297 return retval;
1298}
1299
1300static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1301{
1302 int retval = 0;
1303 struct cpuset *cs = cgroup_cs(cgrp);
1304 cpuset_filetype_t type = cft->private;
1305
1306 cgroup_lock();
1307
1308 if (cgroup_is_removed(cgrp)) {
1309 cgroup_unlock();
1310 return -ENODEV;
1311 }
1312
1313 switch (type) {
1292 case FILE_CPU_EXCLUSIVE: 1314 case FILE_CPU_EXCLUSIVE:
1293 retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); 1315 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1294 break; 1316 break;
1295 case FILE_MEM_EXCLUSIVE: 1317 case FILE_MEM_EXCLUSIVE:
1296 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); 1318 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1297 break; 1319 break;
1298 case FILE_SCHED_LOAD_BALANCE: 1320 case FILE_MEM_HARDWALL:
1299 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); 1321 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1300 break; 1322 break;
1301 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1323 case FILE_SCHED_LOAD_BALANCE:
1302 retval = update_relax_domain_level(cs, buffer); 1324 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1303 break; 1325 break;
1304 case FILE_MEMORY_MIGRATE: 1326 case FILE_MEMORY_MIGRATE:
1305 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1327 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1306 break; 1328 break;
1307 case FILE_MEMORY_PRESSURE_ENABLED: 1329 case FILE_MEMORY_PRESSURE_ENABLED:
1308 retval = update_memory_pressure_enabled(cs, buffer); 1330 cpuset_memory_pressure_enabled = !!val;
1309 break; 1331 break;
1310 case FILE_MEMORY_PRESSURE: 1332 case FILE_MEMORY_PRESSURE:
1311 retval = -EACCES; 1333 retval = -EACCES;
1312 break; 1334 break;
1313 case FILE_SPREAD_PAGE: 1335 case FILE_SPREAD_PAGE:
1314 retval = update_flag(CS_SPREAD_PAGE, cs, buffer); 1336 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1315 cs->mems_generation = cpuset_mems_generation++; 1337 cs->mems_generation = cpuset_mems_generation++;
1316 break; 1338 break;
1317 case FILE_SPREAD_SLAB: 1339 case FILE_SPREAD_SLAB:
1318 retval = update_flag(CS_SPREAD_SLAB, cs, buffer); 1340 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1319 cs->mems_generation = cpuset_mems_generation++; 1341 cs->mems_generation = cpuset_mems_generation++;
1320 break; 1342 break;
1321 default: 1343 default:
1322 retval = -EINVAL; 1344 retval = -EINVAL;
1323 goto out2; 1345 break;
1324 } 1346 }
1325
1326 if (retval == 0)
1327 retval = nbytes;
1328out2:
1329 cgroup_unlock(); 1347 cgroup_unlock();
1330out1:
1331 kfree(buffer);
1332 return retval; 1348 return retval;
1333} 1349}
1334 1350
@@ -1390,33 +1406,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
1390 case FILE_MEMLIST: 1406 case FILE_MEMLIST:
1391 s += cpuset_sprintf_memlist(s, cs); 1407 s += cpuset_sprintf_memlist(s, cs);
1392 break; 1408 break;
1393 case FILE_CPU_EXCLUSIVE:
1394 *s++ = is_cpu_exclusive(cs) ? '1' : '0';
1395 break;
1396 case FILE_MEM_EXCLUSIVE:
1397 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1398 break;
1399 case FILE_SCHED_LOAD_BALANCE:
1400 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1401 break;
1402 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1409 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1403 s += sprintf(s, "%d", cs->relax_domain_level); 1410 s += sprintf(s, "%d", cs->relax_domain_level);
1404 break; 1411 break;
1405 case FILE_MEMORY_MIGRATE:
1406 *s++ = is_memory_migrate(cs) ? '1' : '0';
1407 break;
1408 case FILE_MEMORY_PRESSURE_ENABLED:
1409 *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1410 break;
1411 case FILE_MEMORY_PRESSURE:
1412 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1413 break;
1414 case FILE_SPREAD_PAGE:
1415 *s++ = is_spread_page(cs) ? '1' : '0';
1416 break;
1417 case FILE_SPREAD_SLAB:
1418 *s++ = is_spread_slab(cs) ? '1' : '0';
1419 break;
1420 default: 1412 default:
1421 retval = -EINVAL; 1413 retval = -EINVAL;
1422 goto out; 1414 goto out;
@@ -1429,121 +1421,137 @@ out:
1429 return retval; 1421 return retval;
1430} 1422}
1431 1423
1432 1424static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1433 1425{
1426 struct cpuset *cs = cgroup_cs(cont);
1427 cpuset_filetype_t type = cft->private;
1428 switch (type) {
1429 case FILE_CPU_EXCLUSIVE:
1430 return is_cpu_exclusive(cs);
1431 case FILE_MEM_EXCLUSIVE:
1432 return is_mem_exclusive(cs);
1433 case FILE_MEM_HARDWALL:
1434 return is_mem_hardwall(cs);
1435 case FILE_SCHED_LOAD_BALANCE:
1436 return is_sched_load_balance(cs);
1437 case FILE_MEMORY_MIGRATE:
1438 return is_memory_migrate(cs);
1439 case FILE_MEMORY_PRESSURE_ENABLED:
1440 return cpuset_memory_pressure_enabled;
1441 case FILE_MEMORY_PRESSURE:
1442 return fmeter_getrate(&cs->fmeter);
1443 case FILE_SPREAD_PAGE:
1444 return is_spread_page(cs);
1445 case FILE_SPREAD_SLAB:
1446 return is_spread_slab(cs);
1447 default:
1448 BUG();
1449 }
1450}
1434 1451
1435 1452
1436/* 1453/*
1437 * for the common functions, 'private' gives the type of file 1454 * for the common functions, 'private' gives the type of file
1438 */ 1455 */
1439 1456
1440static struct cftype cft_cpus = { 1457static struct cftype files[] = {
1441 .name = "cpus", 1458 {
1442 .read = cpuset_common_file_read, 1459 .name = "cpus",
1443 .write = cpuset_common_file_write, 1460 .read = cpuset_common_file_read,
1444 .private = FILE_CPULIST, 1461 .write = cpuset_common_file_write,
1445}; 1462 .private = FILE_CPULIST,
1446 1463 },
1447static struct cftype cft_mems = { 1464
1448 .name = "mems", 1465 {
1449 .read = cpuset_common_file_read, 1466 .name = "mems",
1450 .write = cpuset_common_file_write, 1467 .read = cpuset_common_file_read,
1451 .private = FILE_MEMLIST, 1468 .write = cpuset_common_file_write,
1452}; 1469 .private = FILE_MEMLIST,
1453 1470 },
1454static struct cftype cft_cpu_exclusive = { 1471
1455 .name = "cpu_exclusive", 1472 {
1456 .read = cpuset_common_file_read, 1473 .name = "cpu_exclusive",
1457 .write = cpuset_common_file_write, 1474 .read_u64 = cpuset_read_u64,
1458 .private = FILE_CPU_EXCLUSIVE, 1475 .write_u64 = cpuset_write_u64,
1459}; 1476 .private = FILE_CPU_EXCLUSIVE,
1460 1477 },
1461static struct cftype cft_mem_exclusive = { 1478
1462 .name = "mem_exclusive", 1479 {
1463 .read = cpuset_common_file_read, 1480 .name = "mem_exclusive",
1464 .write = cpuset_common_file_write, 1481 .read_u64 = cpuset_read_u64,
1465 .private = FILE_MEM_EXCLUSIVE, 1482 .write_u64 = cpuset_write_u64,
1466}; 1483 .private = FILE_MEM_EXCLUSIVE,
1467 1484 },
1468static struct cftype cft_sched_load_balance = { 1485
1469 .name = "sched_load_balance", 1486 {
1470 .read = cpuset_common_file_read, 1487 .name = "mem_hardwall",
1471 .write = cpuset_common_file_write, 1488 .read_u64 = cpuset_read_u64,
1472 .private = FILE_SCHED_LOAD_BALANCE, 1489 .write_u64 = cpuset_write_u64,
1473}; 1490 .private = FILE_MEM_HARDWALL,
1474 1491 },
1475static struct cftype cft_sched_relax_domain_level = { 1492
1476 .name = "sched_relax_domain_level", 1493 {
1477 .read = cpuset_common_file_read, 1494 .name = "sched_load_balance",
1478 .write = cpuset_common_file_write, 1495 .read_u64 = cpuset_read_u64,
1479 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 1496 .write_u64 = cpuset_write_u64,
1480}; 1497 .private = FILE_SCHED_LOAD_BALANCE,
1481 1498 },
1482static struct cftype cft_memory_migrate = { 1499
1483 .name = "memory_migrate", 1500 {
1484 .read = cpuset_common_file_read, 1501 .name = "sched_relax_domain_level",
1485 .write = cpuset_common_file_write, 1502 .read_u64 = cpuset_read_u64,
1486 .private = FILE_MEMORY_MIGRATE, 1503 .write_u64 = cpuset_write_u64,
1504 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1505 },
1506
1507 {
1508 .name = "memory_migrate",
1509 .read_u64 = cpuset_read_u64,
1510 .write_u64 = cpuset_write_u64,
1511 .private = FILE_MEMORY_MIGRATE,
1512 },
1513
1514 {
1515 .name = "memory_pressure",
1516 .read_u64 = cpuset_read_u64,
1517 .write_u64 = cpuset_write_u64,
1518 .private = FILE_MEMORY_PRESSURE,
1519 },
1520
1521 {
1522 .name = "memory_spread_page",
1523 .read_u64 = cpuset_read_u64,
1524 .write_u64 = cpuset_write_u64,
1525 .private = FILE_SPREAD_PAGE,
1526 },
1527
1528 {
1529 .name = "memory_spread_slab",
1530 .read_u64 = cpuset_read_u64,
1531 .write_u64 = cpuset_write_u64,
1532 .private = FILE_SPREAD_SLAB,
1533 },
1487}; 1534};
1488 1535
1489static struct cftype cft_memory_pressure_enabled = { 1536static struct cftype cft_memory_pressure_enabled = {
1490 .name = "memory_pressure_enabled", 1537 .name = "memory_pressure_enabled",
1491 .read = cpuset_common_file_read, 1538 .read_u64 = cpuset_read_u64,
1492 .write = cpuset_common_file_write, 1539 .write_u64 = cpuset_write_u64,
1493 .private = FILE_MEMORY_PRESSURE_ENABLED, 1540 .private = FILE_MEMORY_PRESSURE_ENABLED,
1494}; 1541};
1495 1542
1496static struct cftype cft_memory_pressure = {
1497 .name = "memory_pressure",
1498 .read = cpuset_common_file_read,
1499 .write = cpuset_common_file_write,
1500 .private = FILE_MEMORY_PRESSURE,
1501};
1502
1503static struct cftype cft_spread_page = {
1504 .name = "memory_spread_page",
1505 .read = cpuset_common_file_read,
1506 .write = cpuset_common_file_write,
1507 .private = FILE_SPREAD_PAGE,
1508};
1509
1510static struct cftype cft_spread_slab = {
1511 .name = "memory_spread_slab",
1512 .read = cpuset_common_file_read,
1513 .write = cpuset_common_file_write,
1514 .private = FILE_SPREAD_SLAB,
1515};
1516
1517static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) 1543static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1518{ 1544{
1519 int err; 1545 int err;
1520 1546
1521 if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0) 1547 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1522 return err; 1548 if (err)
1523 if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1524 return err;
1525 if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1526 return err;
1527 if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1528 return err;
1529 if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1530 return err;
1531 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1532 return err;
1533 if ((err = cgroup_add_file(cont, ss,
1534 &cft_sched_relax_domain_level)) < 0)
1535 return err;
1536 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1537 return err;
1538 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1539 return err;
1540 if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1541 return err; 1549 return err;
1542 /* memory_pressure_enabled is in root cpuset only */ 1550 /* memory_pressure_enabled is in root cpuset only */
1543 if (err == 0 && !cont->parent) 1551 if (!cont->parent)
1544 err = cgroup_add_file(cont, ss, 1552 err = cgroup_add_file(cont, ss,
1545 &cft_memory_pressure_enabled); 1553 &cft_memory_pressure_enabled);
1546 return 0; 1554 return err;
1547} 1555}
1548 1556
1549/* 1557/*
@@ -1643,7 +1651,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1643 cpuset_update_task_memory_state(); 1651 cpuset_update_task_memory_state();
1644 1652
1645 if (is_sched_load_balance(cs)) 1653 if (is_sched_load_balance(cs))
1646 update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); 1654 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1647 1655
1648 number_of_cpusets--; 1656 number_of_cpusets--;
1649 kfree(cs); 1657 kfree(cs);
@@ -1708,7 +1716,8 @@ int __init cpuset_init(void)
1708 * Called by cgroup_scan_tasks() for each task in a cgroup. 1716 * Called by cgroup_scan_tasks() for each task in a cgroup.
1709 * Return nonzero to stop the walk through the tasks. 1717 * Return nonzero to stop the walk through the tasks.
1710 */ 1718 */
1711void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) 1719static void cpuset_do_move_task(struct task_struct *tsk,
1720 struct cgroup_scanner *scan)
1712{ 1721{
1713 struct cpuset_hotplug_scanner *chsp; 1722 struct cpuset_hotplug_scanner *chsp;
1714 1723
@@ -1970,14 +1979,14 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1970} 1979}
1971 1980
1972/* 1981/*
1973 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 1982 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
1974 * ancestor to the specified cpuset. Call holding callback_mutex. 1983 * mem_hardwall ancestor to the specified cpuset. Call holding
1975 * If no ancestor is mem_exclusive (an unusual configuration), then 1984 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
1976 * returns the root cpuset. 1985 * (an unusual configuration), then returns the root cpuset.
1977 */ 1986 */
1978static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) 1987static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
1979{ 1988{
1980 while (!is_mem_exclusive(cs) && cs->parent) 1989 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
1981 cs = cs->parent; 1990 cs = cs->parent;
1982 return cs; 1991 return cs;
1983} 1992}
@@ -1991,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1991 * __GFP_THISNODE is set, yes, we can always allocate. If zone 2000 * __GFP_THISNODE is set, yes, we can always allocate. If zone
1992 * z's node is in our tasks mems_allowed, yes. If it's not a 2001 * z's node is in our tasks mems_allowed, yes. If it's not a
1993 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2002 * __GFP_HARDWALL request and this zone's nodes is in the nearest
1994 * mem_exclusive cpuset ancestor to this tasks cpuset, yes. 2003 * hardwalled cpuset ancestor to this tasks cpuset, yes.
1995 * If the task has been OOM killed and has access to memory reserves 2004 * If the task has been OOM killed and has access to memory reserves
1996 * as specified by the TIF_MEMDIE flag, yes. 2005 * as specified by the TIF_MEMDIE flag, yes.
1997 * Otherwise, no. 2006 * Otherwise, no.
@@ -2014,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2014 * and do not allow allocations outside the current tasks cpuset 2023 * and do not allow allocations outside the current tasks cpuset
2015 * unless the task has been OOM killed as is marked TIF_MEMDIE. 2024 * unless the task has been OOM killed as is marked TIF_MEMDIE.
2016 * GFP_KERNEL allocations are not so marked, so can escape to the 2025 * GFP_KERNEL allocations are not so marked, so can escape to the
2017 * nearest enclosing mem_exclusive ancestor cpuset. 2026 * nearest enclosing hardwalled ancestor cpuset.
2018 * 2027 *
2019 * Scanning up parent cpusets requires callback_mutex. The 2028 * Scanning up parent cpusets requires callback_mutex. The
2020 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2029 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2037,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2037 * in_interrupt - any node ok (current task context irrelevant) 2046 * in_interrupt - any node ok (current task context irrelevant)
2038 * GFP_ATOMIC - any node ok 2047 * GFP_ATOMIC - any node ok
2039 * TIF_MEMDIE - any node ok 2048 * TIF_MEMDIE - any node ok
2040 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2049 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2041 * GFP_USER - only nodes in current tasks mems allowed ok. 2050 * GFP_USER - only nodes in current tasks mems allowed ok.
2042 * 2051 *
2043 * Rule: 2052 * Rule:
@@ -2074,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2074 mutex_lock(&callback_mutex); 2083 mutex_lock(&callback_mutex);
2075 2084
2076 task_lock(current); 2085 task_lock(current);
2077 cs = nearest_exclusive_ancestor(task_cs(current)); 2086 cs = nearest_hardwall_ancestor(task_cs(current));
2078 task_unlock(current); 2087 task_unlock(current);
2079 2088
2080 allowed = node_isset(node, cs->mems_allowed); 2089 allowed = node_isset(node, cs->mems_allowed);
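Most of the cpuset.c churn above collapses ten single-purpose cftype globals into one files[] array registered with cgroup_add_files(), with every boolean and counter file routed through cpuset_read_u64()/cpuset_write_u64() switching on cft->private. The standalone sketch below shows the same table-driven dispatch shape; the struct names, file ids and state layout are invented for illustration.

/* Table-driven control-file dispatch: one table, one handler keyed by id. */
#include <stdio.h>
#include <stddef.h>

enum file_id { FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_SCHED_LOAD_BALANCE };

struct state {
	unsigned int cpu_exclusive:1;
	unsigned int mem_exclusive:1;
	unsigned int sched_load_balance:1;
};

struct ctl_file {
	const char *name;
	enum file_id id;
};

/* one table instead of one global per file */
static const struct ctl_file files[] = {
	{ "cpu_exclusive",      FILE_CPU_EXCLUSIVE },
	{ "mem_exclusive",      FILE_MEM_EXCLUSIVE },
	{ "sched_load_balance", FILE_SCHED_LOAD_BALANCE },
};

static unsigned long long read_u64(const struct state *s, enum file_id id)
{
	switch (id) {
	case FILE_CPU_EXCLUSIVE:      return s->cpu_exclusive;
	case FILE_MEM_EXCLUSIVE:      return s->mem_exclusive;
	case FILE_SCHED_LOAD_BALANCE: return s->sched_load_balance;
	}
	return 0;
}

int main(void)
{
	struct state s = { .sched_load_balance = 1 };
	size_t i;

	/* "registration" and readback in one loop over the table */
	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++)
		printf("%s %llu\n", files[i].name, read_u64(&s, files[i].id));
	return 0;
}

With this shape, adding a control file means one table row and one switch case, which is roughly what the new mem_hardwall entry adds in the hunks above on top of its CS_MEM_HARDWALL flag bit and helpers.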
diff --git a/kernel/dma.c b/kernel/dma.c
index 6a82bb716dac..d2c60a822790 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = {
149 149
150static int __init proc_dma_init(void) 150static int __init proc_dma_init(void)
151{ 151{
152 struct proc_dir_entry *e; 152 proc_create("dma", 0, NULL, &proc_dma_operations);
153
154 e = create_proc_entry("dma", 0, NULL);
155 if (e)
156 e->proc_fops = &proc_dma_operations;
157
158 return 0; 153 return 0;
159} 154}
160 155
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a9d98c641ac..1510f78a0ffa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -19,6 +19,7 @@
19#include <linux/acct.h> 19#include <linux/acct.h>
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h>
22#include <linux/binfmts.h> 23#include <linux/binfmts.h>
23#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
24#include <linux/pid_namespace.h> 25#include <linux/pid_namespace.h>
@@ -52,6 +53,11 @@
52 53
53static void exit_mm(struct task_struct * tsk); 54static void exit_mm(struct task_struct * tsk);
54 55
56static inline int task_detached(struct task_struct *p)
57{
58 return p->exit_signal == -1;
59}
60
55static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p)
56{ 62{
57 nr_threads--; 63 nr_threads--;
@@ -160,7 +166,7 @@ repeat:
160 zap_leader = 0; 166 zap_leader = 0;
161 leader = p->group_leader; 167 leader = p->group_leader;
162 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 168 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
163 BUG_ON(leader->exit_signal == -1); 169 BUG_ON(task_detached(leader));
164 do_notify_parent(leader, leader->exit_signal); 170 do_notify_parent(leader, leader->exit_signal);
165 /* 171 /*
166 * If we were the last child thread and the leader has 172 * If we were the last child thread and the leader has
@@ -170,7 +176,7 @@ repeat:
170 * do_notify_parent() will have marked it self-reaping in 176 * do_notify_parent() will have marked it self-reaping in
171 * that case. 177 * that case.
172 */ 178 */
173 zap_leader = (leader->exit_signal == -1); 179 zap_leader = task_detached(leader);
174 } 180 }
175 181
176 write_unlock_irq(&tasklist_lock); 182 write_unlock_irq(&tasklist_lock);
@@ -329,13 +335,11 @@ void __set_special_pids(struct pid *pid)
329 pid_t nr = pid_nr(pid); 335 pid_t nr = pid_nr(pid);
330 336
331 if (task_session(curr) != pid) { 337 if (task_session(curr) != pid) {
332 detach_pid(curr, PIDTYPE_SID); 338 change_pid(curr, PIDTYPE_SID, pid);
333 attach_pid(curr, PIDTYPE_SID, pid);
334 set_task_session(curr, nr); 339 set_task_session(curr, nr);
335 } 340 }
336 if (task_pgrp(curr) != pid) { 341 if (task_pgrp(curr) != pid) {
337 detach_pid(curr, PIDTYPE_PGID); 342 change_pid(curr, PIDTYPE_PGID, pid);
338 attach_pid(curr, PIDTYPE_PGID, pid);
339 set_task_pgrp(curr, nr); 343 set_task_pgrp(curr, nr);
340 } 344 }
341} 345}
@@ -557,6 +561,88 @@ void exit_fs(struct task_struct *tsk)
557 561
558EXPORT_SYMBOL_GPL(exit_fs); 562EXPORT_SYMBOL_GPL(exit_fs);
559 563
564#ifdef CONFIG_MM_OWNER
565/*
566 * Task p is exiting and it owned mm, lets find a new owner for it
567 */
568static inline int
569mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
570{
571 /*
572 * If there are other users of the mm and the owner (us) is exiting
573 * we need to find a new owner to take on the responsibility.
574 */
575 if (!mm)
576 return 0;
577 if (atomic_read(&mm->mm_users) <= 1)
578 return 0;
579 if (mm->owner != p)
580 return 0;
581 return 1;
582}
583
584void mm_update_next_owner(struct mm_struct *mm)
585{
586 struct task_struct *c, *g, *p = current;
587
588retry:
589 if (!mm_need_new_owner(mm, p))
590 return;
591
592 read_lock(&tasklist_lock);
593 /*
594 * Search in the children
595 */
596 list_for_each_entry(c, &p->children, sibling) {
597 if (c->mm == mm)
598 goto assign_new_owner;
599 }
600
601 /*
602 * Search in the siblings
603 */
604 list_for_each_entry(c, &p->parent->children, sibling) {
605 if (c->mm == mm)
606 goto assign_new_owner;
607 }
608
609 /*
610 * Search through everything else. We should not get
611 * here often
612 */
613 do_each_thread(g, c) {
614 if (c->mm == mm)
615 goto assign_new_owner;
616 } while_each_thread(g, c);
617
618 read_unlock(&tasklist_lock);
619 return;
620
621assign_new_owner:
622 BUG_ON(c == p);
623 get_task_struct(c);
624 /*
625 * The task_lock protects c->mm from changing.
626 * We always want mm->owner->mm == mm
627 */
628 task_lock(c);
629 /*
630 * Delay read_unlock() till we have the task_lock()
631 * to ensure that c does not slip away underneath us
632 */
633 read_unlock(&tasklist_lock);
634 if (c->mm != mm) {
635 task_unlock(c);
636 put_task_struct(c);
637 goto retry;
638 }
639 cgroup_mm_owner_callbacks(mm->owner, c);
640 mm->owner = c;
641 task_unlock(c);
642 put_task_struct(c);
643}
644#endif /* CONFIG_MM_OWNER */
645
560/* 646/*
561 * Turn us into a lazy TLB process if we 647 * Turn us into a lazy TLB process if we
562 * aren't already.. 648 * aren't already..
@@ -596,6 +682,7 @@ static void exit_mm(struct task_struct * tsk)
596 /* We don't want this task to be frozen prematurely */ 682 /* We don't want this task to be frozen prematurely */
597 clear_freeze_flag(tsk); 683 clear_freeze_flag(tsk);
598 task_unlock(tsk); 684 task_unlock(tsk);
685 mm_update_next_owner(mm);
599 mmput(mm); 686 mmput(mm);
600} 687}
601 688
@@ -610,7 +697,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
610 if (unlikely(traced)) { 697 if (unlikely(traced)) {
611 /* Preserve ptrace links if someone else is tracing this child. */ 698 /* Preserve ptrace links if someone else is tracing this child. */
612 list_del_init(&p->ptrace_list); 699 list_del_init(&p->ptrace_list);
613 if (p->parent != p->real_parent) 700 if (ptrace_reparented(p))
614 list_add(&p->ptrace_list, &p->real_parent->ptrace_children); 701 list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
615 } else { 702 } else {
616 /* If this child is being traced, then we're the one tracing it 703 /* If this child is being traced, then we're the one tracing it
@@ -634,18 +721,18 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
634 /* If this is a threaded reparent there is no need to 721 /* If this is a threaded reparent there is no need to
635 * notify anyone anything has happened. 722 * notify anyone anything has happened.
636 */ 723 */
637 if (p->real_parent->group_leader == father->group_leader) 724 if (same_thread_group(p->real_parent, father))
638 return; 725 return;
639 726
640 /* We don't want people slaying init. */ 727 /* We don't want people slaying init. */
641 if (p->exit_signal != -1) 728 if (!task_detached(p))
642 p->exit_signal = SIGCHLD; 729 p->exit_signal = SIGCHLD;
643 730
644 /* If we'd notified the old parent about this child's death, 731 /* If we'd notified the old parent about this child's death,
645 * also notify the new parent. 732 * also notify the new parent.
646 */ 733 */
647 if (!traced && p->exit_state == EXIT_ZOMBIE && 734 if (!traced && p->exit_state == EXIT_ZOMBIE &&
648 p->exit_signal != -1 && thread_group_empty(p)) 735 !task_detached(p) && thread_group_empty(p))
649 do_notify_parent(p, p->exit_signal); 736 do_notify_parent(p, p->exit_signal);
650 737
651 kill_orphaned_pgrp(p, father); 738 kill_orphaned_pgrp(p, father);
@@ -698,18 +785,18 @@ static void forget_original_parent(struct task_struct *father)
698 } else { 785 } else {
699 /* reparent ptraced task to its real parent */ 786 /* reparent ptraced task to its real parent */
700 __ptrace_unlink (p); 787 __ptrace_unlink (p);
701 if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && 788 if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
702 thread_group_empty(p)) 789 thread_group_empty(p))
703 do_notify_parent(p, p->exit_signal); 790 do_notify_parent(p, p->exit_signal);
704 } 791 }
705 792
706 /* 793 /*
707 * if the ptraced child is a zombie with exit_signal == -1 794 * if the ptraced child is a detached zombie we must collect
708 * we must collect it before we exit, or it will remain 795 * it before we exit, or it will remain zombie forever since
709 * zombie forever since we prevented it from self-reap itself 796 * we prevented it from self-reap itself while it was being
710 * while it was being traced by us, to be able to see it in wait4. 797 * traced by us, to be able to see it in wait4.
711 */ 798 */
712 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) 799 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
713 list_add(&p->ptrace_list, &ptrace_dead); 800 list_add(&p->ptrace_list, &ptrace_dead);
714 } 801 }
715 802
@@ -766,29 +853,30 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
766 * we have changed execution domain as these two values started 853 * we have changed execution domain as these two values started
767 * the same after a fork. 854 * the same after a fork.
768 */ 855 */
769 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && 856 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
770 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 857 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
771 tsk->self_exec_id != tsk->parent_exec_id) 858 tsk->self_exec_id != tsk->parent_exec_id) &&
772 && !capable(CAP_KILL)) 859 !capable(CAP_KILL))
773 tsk->exit_signal = SIGCHLD; 860 tsk->exit_signal = SIGCHLD;
774 861
775
776 /* If something other than our normal parent is ptracing us, then 862 /* If something other than our normal parent is ptracing us, then
777 * send it a SIGCHLD instead of honoring exit_signal. exit_signal 863 * send it a SIGCHLD instead of honoring exit_signal. exit_signal
778 * only has special meaning to our real parent. 864 * only has special meaning to our real parent.
779 */ 865 */
780 if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { 866 if (!task_detached(tsk) && thread_group_empty(tsk)) {
781 int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; 867 int signal = ptrace_reparented(tsk) ?
868 SIGCHLD : tsk->exit_signal;
782 do_notify_parent(tsk, signal); 869 do_notify_parent(tsk, signal);
783 } else if (tsk->ptrace) { 870 } else if (tsk->ptrace) {
784 do_notify_parent(tsk, SIGCHLD); 871 do_notify_parent(tsk, SIGCHLD);
785 } 872 }
786 873
787 state = EXIT_ZOMBIE; 874 state = EXIT_ZOMBIE;
788 if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) 875 if (task_detached(tsk) && likely(!tsk->ptrace))
789 state = EXIT_DEAD; 876 state = EXIT_DEAD;
790 tsk->exit_state = state; 877 tsk->exit_state = state;
791 878
879 /* mt-exec, de_thread() is waiting for us */
792 if (thread_group_leader(tsk) && 880 if (thread_group_leader(tsk) &&
793 tsk->signal->notify_count < 0 && 881 tsk->signal->notify_count < 0 &&
794 tsk->signal->group_exit_task) 882 tsk->signal->group_exit_task)
@@ -1032,12 +1120,13 @@ asmlinkage long sys_exit(int error_code)
1032NORET_TYPE void 1120NORET_TYPE void
1033do_group_exit(int exit_code) 1121do_group_exit(int exit_code)
1034{ 1122{
1123 struct signal_struct *sig = current->signal;
1124
1035 BUG_ON(exit_code & 0x80); /* core dumps don't get here */ 1125 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1036 1126
1037 if (current->signal->flags & SIGNAL_GROUP_EXIT) 1127 if (signal_group_exit(sig))
1038 exit_code = current->signal->group_exit_code; 1128 exit_code = sig->group_exit_code;
1039 else if (!thread_group_empty(current)) { 1129 else if (!thread_group_empty(current)) {
1040 struct signal_struct *const sig = current->signal;
1041 struct sighand_struct *const sighand = current->sighand; 1130 struct sighand_struct *const sighand = current->sighand;
1042 spin_lock_irq(&sighand->siglock); 1131 spin_lock_irq(&sighand->siglock);
1043 if (signal_group_exit(sig)) 1132 if (signal_group_exit(sig))
@@ -1089,7 +1178,7 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1089 * Do not consider detached threads that are 1178 * Do not consider detached threads that are
1090 * not ptraced: 1179 * not ptraced:
1091 */ 1180 */
1092 if (p->exit_signal == -1 && !p->ptrace) 1181 if (task_detached(p) && !p->ptrace)
1093 return 0; 1182 return 0;
1094 1183
1095 /* Wait for all children (clone and not) if __WALL is set; 1184 /* Wait for all children (clone and not) if __WALL is set;
@@ -1179,8 +1268,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1179 return 0; 1268 return 0;
1180 } 1269 }
1181 1270
1182 /* traced means p->ptrace, but not vice versa */ 1271 traced = ptrace_reparented(p);
1183 traced = (p->real_parent != p->parent);
1184 1272
1185 if (likely(!traced)) { 1273 if (likely(!traced)) {
1186 struct signal_struct *psig; 1274 struct signal_struct *psig;
@@ -1281,9 +1369,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1281 * If it's still not detached after that, don't release 1369 * If it's still not detached after that, don't release
1282 * it now. 1370 * it now.
1283 */ 1371 */
1284 if (p->exit_signal != -1) { 1372 if (!task_detached(p)) {
1285 do_notify_parent(p, p->exit_signal); 1373 do_notify_parent(p, p->exit_signal);
1286 if (p->exit_signal != -1) { 1374 if (!task_detached(p)) {
1287 p->exit_state = EXIT_ZOMBIE; 1375 p->exit_state = EXIT_ZOMBIE;
1288 p = NULL; 1376 p = NULL;
1289 } 1377 }
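
The exit.c hunks above replace the open-coded p->exit_signal == -1 tests with task_detached() and the p->real_parent != p->parent test with ptrace_reparented(). The helpers themselves are defined outside this diff (presumably in include/linux/sched.h); a minimal sketch of what they amount to, so the hunks read one-for-one:

    /* sketch only -- the real definitions are not part of this diff */
    static inline int task_detached(struct task_struct *p)
    {
            /* a detached task self-reaps and never signals its parent */
            return p->exit_signal == -1;
    }

    static inline int ptrace_reparented(struct task_struct *child)
    {
            /* true when a tracer other than the real parent holds the child */
            return child->real_parent != child->parent;
    }

With these expansions in mind the conversions are intended as a pure readability cleanup of the exit-notification paths, not a behaviour change.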
diff --git a/kernel/fork.c b/kernel/fork.c
index 6067e429f281..933e60ebccae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -22,6 +22,7 @@
22#include <linux/mempolicy.h> 22#include <linux/mempolicy.h>
23#include <linux/sem.h> 23#include <linux/sem.h>
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/fdtable.h>
25#include <linux/key.h> 26#include <linux/key.h>
26#include <linux/binfmts.h> 27#include <linux/binfmts.h>
27#include <linux/mman.h> 28#include <linux/mman.h>
@@ -381,14 +382,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
381 mm->ioctx_list = NULL; 382 mm->ioctx_list = NULL;
382 mm->free_area_cache = TASK_UNMAPPED_BASE; 383 mm->free_area_cache = TASK_UNMAPPED_BASE;
383 mm->cached_hole_size = ~0UL; 384 mm->cached_hole_size = ~0UL;
384 mm_init_cgroup(mm, p); 385 mm_init_owner(mm, p);
385 386
386 if (likely(!mm_alloc_pgd(mm))) { 387 if (likely(!mm_alloc_pgd(mm))) {
387 mm->def_flags = 0; 388 mm->def_flags = 0;
388 return mm; 389 return mm;
389 } 390 }
390 391
391 mm_free_cgroup(mm);
392 free_mm(mm); 392 free_mm(mm);
393 return NULL; 393 return NULL;
394} 394}
@@ -432,13 +432,13 @@ void mmput(struct mm_struct *mm)
432 if (atomic_dec_and_test(&mm->mm_users)) { 432 if (atomic_dec_and_test(&mm->mm_users)) {
433 exit_aio(mm); 433 exit_aio(mm);
434 exit_mmap(mm); 434 exit_mmap(mm);
435 set_mm_exe_file(mm, NULL);
435 if (!list_empty(&mm->mmlist)) { 436 if (!list_empty(&mm->mmlist)) {
436 spin_lock(&mmlist_lock); 437 spin_lock(&mmlist_lock);
437 list_del(&mm->mmlist); 438 list_del(&mm->mmlist);
438 spin_unlock(&mmlist_lock); 439 spin_unlock(&mmlist_lock);
439 } 440 }
440 put_swap_token(mm); 441 put_swap_token(mm);
441 mm_free_cgroup(mm);
442 mmdrop(mm); 442 mmdrop(mm);
443 } 443 }
444} 444}
@@ -545,6 +545,8 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
545 if (init_new_context(tsk, mm)) 545 if (init_new_context(tsk, mm))
546 goto fail_nocontext; 546 goto fail_nocontext;
547 547
548 dup_mm_exe_file(oldmm, mm);
549
548 err = dup_mmap(mm, oldmm); 550 err = dup_mmap(mm, oldmm);
549 if (err) 551 if (err)
550 goto free_pt; 552 goto free_pt;
@@ -891,7 +893,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
891 sig->group_exit_code = 0; 893 sig->group_exit_code = 0;
892 sig->group_exit_task = NULL; 894 sig->group_exit_task = NULL;
893 sig->group_stop_count = 0; 895 sig->group_stop_count = 0;
894 sig->curr_target = NULL; 896 sig->curr_target = tsk;
895 init_sigpending(&sig->shared_pending); 897 init_sigpending(&sig->shared_pending);
896 INIT_LIST_HEAD(&sig->posix_timers); 898 INIT_LIST_HEAD(&sig->posix_timers);
897 899
@@ -982,6 +984,13 @@ static void rt_mutex_init_task(struct task_struct *p)
982#endif 984#endif
983} 985}
984 986
987#ifdef CONFIG_MM_OWNER
988void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
989{
990 mm->owner = p;
991}
992#endif /* CONFIG_MM_OWNER */
993
985/* 994/*
986 * This creates a new process as a copy of the old one, 995 * This creates a new process as a copy of the old one,
987 * but does not actually start it yet. 996 * but does not actually start it yet.
@@ -1664,18 +1673,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1664} 1673}
1665 1674
1666/* 1675/*
1667 * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
1668 * supported yet
1669 */
1670static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
1671{
1672 if (unshare_flags & CLONE_SYSVSEM)
1673 return -EINVAL;
1674
1675 return 0;
1676}
1677
1678/*
1679 * unshare allows a process to 'unshare' part of the process 1676 * unshare allows a process to 'unshare' part of the process
1680 * context which was originally shared using clone. copy_* 1677 * context which was originally shared using clone. copy_*
1681 * functions used by do_fork() cannot be used here directly 1678 * functions used by do_fork() cannot be used here directly
@@ -1690,8 +1687,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1690 struct sighand_struct *new_sigh = NULL; 1687 struct sighand_struct *new_sigh = NULL;
1691 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1688 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1692 struct files_struct *fd, *new_fd = NULL; 1689 struct files_struct *fd, *new_fd = NULL;
1693 struct sem_undo_list *new_ulist = NULL;
1694 struct nsproxy *new_nsproxy = NULL; 1690 struct nsproxy *new_nsproxy = NULL;
1691 int do_sysvsem = 0;
1695 1692
1696 check_unshare_flags(&unshare_flags); 1693 check_unshare_flags(&unshare_flags);
1697 1694
@@ -1703,6 +1700,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1703 CLONE_NEWNET)) 1700 CLONE_NEWNET))
1704 goto bad_unshare_out; 1701 goto bad_unshare_out;
1705 1702
1703 /*
1704 * CLONE_NEWIPC must also detach from the undolist: after switching
1705 * to a new ipc namespace, the semaphore arrays from the old
1706 * namespace are unreachable.
1707 */
1708 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1709 do_sysvsem = 1;
1706 if ((err = unshare_thread(unshare_flags))) 1710 if ((err = unshare_thread(unshare_flags)))
1707 goto bad_unshare_out; 1711 goto bad_unshare_out;
1708 if ((err = unshare_fs(unshare_flags, &new_fs))) 1712 if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1713,13 +1717,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1713 goto bad_unshare_cleanup_sigh; 1717 goto bad_unshare_cleanup_sigh;
1714 if ((err = unshare_fd(unshare_flags, &new_fd))) 1718 if ((err = unshare_fd(unshare_flags, &new_fd)))
1715 goto bad_unshare_cleanup_vm; 1719 goto bad_unshare_cleanup_vm;
1716 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1717 goto bad_unshare_cleanup_fd;
1718 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1720 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1719 new_fs))) 1721 new_fs)))
1720 goto bad_unshare_cleanup_semundo; 1722 goto bad_unshare_cleanup_fd;
1721 1723
1722 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1724 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
1725 if (do_sysvsem) {
1726 /*
1727 * CLONE_SYSVSEM is equivalent to sys_exit().
1728 */
1729 exit_sem(current);
1730 }
1723 1731
1724 if (new_nsproxy) { 1732 if (new_nsproxy) {
1725 switch_task_namespaces(current, new_nsproxy); 1733 switch_task_namespaces(current, new_nsproxy);
@@ -1755,7 +1763,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1755 if (new_nsproxy) 1763 if (new_nsproxy)
1756 put_nsproxy(new_nsproxy); 1764 put_nsproxy(new_nsproxy);
1757 1765
1758bad_unshare_cleanup_semundo:
1759bad_unshare_cleanup_fd: 1766bad_unshare_cleanup_fd:
1760 if (new_fd) 1767 if (new_fd)
1761 put_files_struct(new_fd); 1768 put_files_struct(new_fd);
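
The removal of unshare_semundo() plus the new do_sysvsem path means unshare(CLONE_SYSVSEM) is now accepted: the kernel simply detaches the caller's SysV semaphore undo list via exit_sem(), and CLONE_NEWIPC implies the same detach. A small, hypothetical userspace program (not from the patch) showing the call that used to fail:

    #define _GNU_SOURCE
    #include <sched.h>      /* unshare(), CLONE_SYSVSEM */
    #include <stdio.h>

    int main(void)
    {
            /* previously rejected with EINVAL by unshare_semundo();
             * with this patch the undo list is dropped, exactly as
             * exit_sem() would do for the task at exit time */
            if (unshare(CLONE_SYSVSEM) == -1) {
                    perror("unshare(CLONE_SYSVSEM)");
                    return 1;
            }
            return 0;
    }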
diff --git a/kernel/futex.c b/kernel/futex.c
index e43945e995f5..98092c9817f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1266,11 +1266,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1266 if (!abs_time) 1266 if (!abs_time)
1267 schedule(); 1267 schedule();
1268 else { 1268 else {
1269 hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1269 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
1270 HRTIMER_MODE_ABS);
1270 hrtimer_init_sleeper(&t, current); 1271 hrtimer_init_sleeper(&t, current);
1271 t.timer.expires = *abs_time; 1272 t.timer.expires = *abs_time;
1272 1273
1273 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); 1274 hrtimer_start(&t.timer, t.timer.expires,
1275 HRTIMER_MODE_ABS);
1274 if (!hrtimer_active(&t.timer)) 1276 if (!hrtimer_active(&t.timer))
1275 t.task = NULL; 1277 t.task = NULL;
1276 1278
@@ -1286,6 +1288,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1286 1288
1287 /* Flag if a timeout occured */ 1289 /* Flag if a timeout occured */
1288 rem = (t.task == NULL); 1290 rem = (t.task == NULL);
1291
1292 destroy_hrtimer_on_stack(&t.timer);
1289 } 1293 }
1290 } 1294 }
1291 __set_current_state(TASK_RUNNING); 1295 __set_current_state(TASK_RUNNING);
@@ -1367,7 +1371,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1367 1371
1368 if (time) { 1372 if (time) {
1369 to = &timeout; 1373 to = &timeout;
1370 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1374 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1375 HRTIMER_MODE_ABS);
1371 hrtimer_init_sleeper(to, current); 1376 hrtimer_init_sleeper(to, current);
1372 to->timer.expires = *time; 1377 to->timer.expires = *time;
1373 } 1378 }
@@ -1581,6 +1586,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1581 unqueue_me_pi(&q); 1586 unqueue_me_pi(&q);
1582 futex_unlock_mm(fshared); 1587 futex_unlock_mm(fshared);
1583 1588
1589 if (to)
1590 destroy_hrtimer_on_stack(&to->timer);
1584 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1591 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1585 1592
1586 out_unlock_release_sem: 1593 out_unlock_release_sem:
@@ -1588,6 +1595,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1588 1595
1589 out_release_sem: 1596 out_release_sem:
1590 futex_unlock_mm(fshared); 1597 futex_unlock_mm(fshared);
1598 if (to)
1599 destroy_hrtimer_on_stack(&to->timer);
1591 return ret; 1600 return ret;
1592 1601
1593 uaddr_faulted: 1602 uaddr_faulted:
@@ -1615,6 +1624,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1615 if (!ret && (uval != -EFAULT)) 1624 if (!ret && (uval != -EFAULT))
1616 goto retry; 1625 goto retry;
1617 1626
1627 if (to)
1628 destroy_hrtimer_on_stack(&to->timer);
1618 return ret; 1629 return ret;
1619} 1630}
1620 1631
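
Both futex paths now follow the on-stack hrtimer discipline introduced by the hrtimer.c changes further down: initialise with hrtimer_init_on_stack() and pair every exit path with destroy_hrtimer_on_stack(), so the debug-objects code (when enabled) can track the stack object's lifetime. A condensed sketch of the pattern, with the sleep and wakeup handling of futex_wait() elided:

    struct hrtimer_sleeper t;

    hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
    hrtimer_init_sleeper(&t, current);
    t.timer.expires = *abs_time;

    hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
    /* ... sleep, handle wakeup or timeout ... */
    hrtimer_cancel(&t.timer);

    /* must be paired with *_init_on_stack before the frame goes away */
    destroy_hrtimer_on_stack(&t.timer);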
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index dea4c9124ac8..9af1d6a8095e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,7 @@
43#include <linux/tick.h> 43#include <linux/tick.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45#include <linux/err.h> 45#include <linux/err.h>
46#include <linux/debugobjects.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
@@ -342,6 +343,115 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
342 return res; 343 return res;
343} 344}
344 345
346#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
347
348static struct debug_obj_descr hrtimer_debug_descr;
349
350/*
351 * fixup_init is called when:
352 * - an active object is initialized
353 */
354static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
355{
356 struct hrtimer *timer = addr;
357
358 switch (state) {
359 case ODEBUG_STATE_ACTIVE:
360 hrtimer_cancel(timer);
361 debug_object_init(timer, &hrtimer_debug_descr);
362 return 1;
363 default:
364 return 0;
365 }
366}
367
368/*
369 * fixup_activate is called when:
370 * - an active object is activated
371 * - an unknown object is activated (might be a statically initialized object)
372 */
373static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
374{
375 switch (state) {
376
377 case ODEBUG_STATE_NOTAVAILABLE:
378 WARN_ON_ONCE(1);
379 return 0;
380
381 case ODEBUG_STATE_ACTIVE:
382 WARN_ON(1);
383
384 default:
385 return 0;
386 }
387}
388
389/*
390 * fixup_free is called when:
391 * - an active object is freed
392 */
393static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
394{
395 struct hrtimer *timer = addr;
396
397 switch (state) {
398 case ODEBUG_STATE_ACTIVE:
399 hrtimer_cancel(timer);
400 debug_object_free(timer, &hrtimer_debug_descr);
401 return 1;
402 default:
403 return 0;
404 }
405}
406
407static struct debug_obj_descr hrtimer_debug_descr = {
408 .name = "hrtimer",
409 .fixup_init = hrtimer_fixup_init,
410 .fixup_activate = hrtimer_fixup_activate,
411 .fixup_free = hrtimer_fixup_free,
412};
413
414static inline void debug_hrtimer_init(struct hrtimer *timer)
415{
416 debug_object_init(timer, &hrtimer_debug_descr);
417}
418
419static inline void debug_hrtimer_activate(struct hrtimer *timer)
420{
421 debug_object_activate(timer, &hrtimer_debug_descr);
422}
423
424static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
425{
426 debug_object_deactivate(timer, &hrtimer_debug_descr);
427}
428
429static inline void debug_hrtimer_free(struct hrtimer *timer)
430{
431 debug_object_free(timer, &hrtimer_debug_descr);
432}
433
434static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
435 enum hrtimer_mode mode);
436
437void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
438 enum hrtimer_mode mode)
439{
440 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
441 __hrtimer_init(timer, clock_id, mode);
442}
443
444void destroy_hrtimer_on_stack(struct hrtimer *timer)
445{
446 debug_object_free(timer, &hrtimer_debug_descr);
447}
448
449#else
450static inline void debug_hrtimer_init(struct hrtimer *timer) { }
451static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
452static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
453#endif
454
345/* 455/*
346 * Check, whether the timer is on the callback pending list 456 * Check, whether the timer is on the callback pending list
347 */ 457 */
@@ -567,6 +677,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
567 /* Timer is expired, act upon the callback mode */ 677 /* Timer is expired, act upon the callback mode */
568 switch(timer->cb_mode) { 678 switch(timer->cb_mode) {
569 case HRTIMER_CB_IRQSAFE_NO_RESTART: 679 case HRTIMER_CB_IRQSAFE_NO_RESTART:
680 debug_hrtimer_deactivate(timer);
570 /* 681 /*
571 * We can call the callback from here. No restart 682 * We can call the callback from here. No restart
572 * happens, so no danger of recursion 683 * happens, so no danger of recursion
@@ -581,6 +692,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
581 * the tick timer in the softirq ! The calling site 692 * the tick timer in the softirq ! The calling site
582 * takes care of this. 693 * takes care of this.
583 */ 694 */
695 debug_hrtimer_deactivate(timer);
584 return 1; 696 return 1;
585 case HRTIMER_CB_IRQSAFE: 697 case HRTIMER_CB_IRQSAFE:
586 case HRTIMER_CB_SOFTIRQ: 698 case HRTIMER_CB_SOFTIRQ:
@@ -735,6 +847,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
735 struct hrtimer *entry; 847 struct hrtimer *entry;
736 int leftmost = 1; 848 int leftmost = 1;
737 849
850 debug_hrtimer_activate(timer);
851
738 /* 852 /*
739 * Find the right place in the rbtree: 853 * Find the right place in the rbtree:
740 */ 854 */
@@ -831,6 +945,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
831 * reprogramming happens in the interrupt handler. This is a 945 * reprogramming happens in the interrupt handler. This is a
832 * rare case and less expensive than a smp call. 946 * rare case and less expensive than a smp call.
833 */ 947 */
948 debug_hrtimer_deactivate(timer);
834 timer_stats_hrtimer_clear_start_info(timer); 949 timer_stats_hrtimer_clear_start_info(timer);
835 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 950 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
836 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 951 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -878,6 +993,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
878 tim = ktime_add_safe(tim, base->resolution); 993 tim = ktime_add_safe(tim, base->resolution);
879#endif 994#endif
880 } 995 }
996
881 timer->expires = tim; 997 timer->expires = tim;
882 998
883 timer_stats_hrtimer_set_start_info(timer); 999 timer_stats_hrtimer_set_start_info(timer);
@@ -1011,14 +1127,8 @@ ktime_t hrtimer_get_next_event(void)
1011} 1127}
1012#endif 1128#endif
1013 1129
1014/** 1130static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1015 * hrtimer_init - initialize a timer to the given clock 1131 enum hrtimer_mode mode)
1016 * @timer: the timer to be initialized
1017 * @clock_id: the clock to be used
1018 * @mode: timer mode abs/rel
1019 */
1020void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1021 enum hrtimer_mode mode)
1022{ 1132{
1023 struct hrtimer_cpu_base *cpu_base; 1133 struct hrtimer_cpu_base *cpu_base;
1024 1134
@@ -1039,6 +1149,19 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1039 memset(timer->start_comm, 0, TASK_COMM_LEN); 1149 memset(timer->start_comm, 0, TASK_COMM_LEN);
1040#endif 1150#endif
1041} 1151}
1152
1153/**
1154 * hrtimer_init - initialize a timer to the given clock
1155 * @timer: the timer to be initialized
1156 * @clock_id: the clock to be used
1157 * @mode: timer mode abs/rel
1158 */
1159void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1160 enum hrtimer_mode mode)
1161{
1162 debug_hrtimer_init(timer);
1163 __hrtimer_init(timer, clock_id, mode);
1164}
1042EXPORT_SYMBOL_GPL(hrtimer_init); 1165EXPORT_SYMBOL_GPL(hrtimer_init);
1043 1166
1044/** 1167/**
@@ -1072,6 +1195,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1072 timer = list_entry(cpu_base->cb_pending.next, 1195 timer = list_entry(cpu_base->cb_pending.next,
1073 struct hrtimer, cb_entry); 1196 struct hrtimer, cb_entry);
1074 1197
1198 debug_hrtimer_deactivate(timer);
1075 timer_stats_account_hrtimer(timer); 1199 timer_stats_account_hrtimer(timer);
1076 1200
1077 fn = timer->function; 1201 fn = timer->function;
@@ -1120,6 +1244,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1120 enum hrtimer_restart (*fn)(struct hrtimer *); 1244 enum hrtimer_restart (*fn)(struct hrtimer *);
1121 int restart; 1245 int restart;
1122 1246
1247 debug_hrtimer_deactivate(timer);
1123 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1248 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1124 timer_stats_account_hrtimer(timer); 1249 timer_stats_account_hrtimer(timer);
1125 1250
@@ -1378,22 +1503,27 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1378{ 1503{
1379 struct hrtimer_sleeper t; 1504 struct hrtimer_sleeper t;
1380 struct timespec __user *rmtp; 1505 struct timespec __user *rmtp;
1506 int ret = 0;
1381 1507
1382 hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS); 1508 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
1509 HRTIMER_MODE_ABS);
1383 t.timer.expires.tv64 = restart->nanosleep.expires; 1510 t.timer.expires.tv64 = restart->nanosleep.expires;
1384 1511
1385 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1512 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1386 return 0; 1513 goto out;
1387 1514
1388 rmtp = restart->nanosleep.rmtp; 1515 rmtp = restart->nanosleep.rmtp;
1389 if (rmtp) { 1516 if (rmtp) {
1390 int ret = update_rmtp(&t.timer, rmtp); 1517 ret = update_rmtp(&t.timer, rmtp);
1391 if (ret <= 0) 1518 if (ret <= 0)
1392 return ret; 1519 goto out;
1393 } 1520 }
1394 1521
1395 /* The other values in restart are already filled in */ 1522 /* The other values in restart are already filled in */
1396 return -ERESTART_RESTARTBLOCK; 1523 ret = -ERESTART_RESTARTBLOCK;
1524out:
1525 destroy_hrtimer_on_stack(&t.timer);
1526 return ret;
1397} 1527}
1398 1528
1399long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, 1529long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
@@ -1401,20 +1531,23 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1401{ 1531{
1402 struct restart_block *restart; 1532 struct restart_block *restart;
1403 struct hrtimer_sleeper t; 1533 struct hrtimer_sleeper t;
1534 int ret = 0;
1404 1535
1405 hrtimer_init(&t.timer, clockid, mode); 1536 hrtimer_init_on_stack(&t.timer, clockid, mode);
1406 t.timer.expires = timespec_to_ktime(*rqtp); 1537 t.timer.expires = timespec_to_ktime(*rqtp);
1407 if (do_nanosleep(&t, mode)) 1538 if (do_nanosleep(&t, mode))
1408 return 0; 1539 goto out;
1409 1540
1410 /* Absolute timers do not update the rmtp value and restart: */ 1541 /* Absolute timers do not update the rmtp value and restart: */
1411 if (mode == HRTIMER_MODE_ABS) 1542 if (mode == HRTIMER_MODE_ABS) {
1412 return -ERESTARTNOHAND; 1543 ret = -ERESTARTNOHAND;
1544 goto out;
1545 }
1413 1546
1414 if (rmtp) { 1547 if (rmtp) {
1415 int ret = update_rmtp(&t.timer, rmtp); 1548 ret = update_rmtp(&t.timer, rmtp);
1416 if (ret <= 0) 1549 if (ret <= 0)
1417 return ret; 1550 goto out;
1418 } 1551 }
1419 1552
1420 restart = &current_thread_info()->restart_block; 1553 restart = &current_thread_info()->restart_block;
@@ -1423,7 +1556,10 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1423 restart->nanosleep.rmtp = rmtp; 1556 restart->nanosleep.rmtp = rmtp;
1424 restart->nanosleep.expires = t.timer.expires.tv64; 1557 restart->nanosleep.expires = t.timer.expires.tv64;
1425 1558
1426 return -ERESTART_RESTARTBLOCK; 1559 ret = -ERESTART_RESTARTBLOCK;
1560out:
1561 destroy_hrtimer_on_stack(&t.timer);
1562 return ret;
1427} 1563}
1428 1564
1429asmlinkage long 1565asmlinkage long
@@ -1468,6 +1604,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1468 while ((node = rb_first(&old_base->active))) { 1604 while ((node = rb_first(&old_base->active))) {
1469 timer = rb_entry(node, struct hrtimer, node); 1605 timer = rb_entry(node, struct hrtimer, node);
1470 BUG_ON(hrtimer_callback_running(timer)); 1606 BUG_ON(hrtimer_callback_running(timer));
1607 debug_hrtimer_deactivate(timer);
1471 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1608 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
1472 timer->base = new_base; 1609 timer->base = new_base;
1473 /* 1610 /*
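
The class of bug the new CONFIG_DEBUG_OBJECTS_TIMERS hooks are aimed at is a stack-allocated timer that outlives its frame. An illustrative, deliberately buggy and hypothetical example of what the fixup callbacks can now flag:

    static enum hrtimer_restart noop_fn(struct hrtimer *t)
    {
            return HRTIMER_NORESTART;
    }

    static void buggy(void)
    {
            struct hrtimer t;               /* lives on this stack frame */

            hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            t.function = noop_fn;
            hrtimer_start(&t, ktime_set(1, 0), HRTIMER_MODE_REL);
            /* returning with the timer still queued leaves a dangling
             * rbtree node; with debugobjects the still-active object can
             * be detected later (e.g. when the memory is reused) instead
             * of causing silent corruption */
    }

Correct users cancel the timer and use the hrtimer_init_on_stack()/destroy_hrtimer_on_stack() pair, as the nanosleep and futex conversions in this series do.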
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6d9204f3a370..38a25b8d8bff 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/interrupt.h> 2#include <linux/interrupt.h>
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/gfp.h>
4 5
5/* 6/*
6 * Device resource management aware IRQ request/free implementation. 7 * Device resource management aware IRQ request/free implementation.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a01464287..46d6611a33bb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/slab.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -149,6 +150,26 @@ void disable_irq(unsigned int irq)
149} 150}
150EXPORT_SYMBOL(disable_irq); 151EXPORT_SYMBOL(disable_irq);
151 152
153static void __enable_irq(struct irq_desc *desc, unsigned int irq)
154{
155 switch (desc->depth) {
156 case 0:
157 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
158 WARN_ON(1);
159 break;
160 case 1: {
161 unsigned int status = desc->status & ~IRQ_DISABLED;
162
163 /* Prevent probing on this irq: */
164 desc->status = status | IRQ_NOPROBE;
165 check_irq_resend(desc, irq);
166 /* fall-through */
167 }
168 default:
169 desc->depth--;
170 }
171}
172
152/** 173/**
153 * enable_irq - enable handling of an irq 174 * enable_irq - enable handling of an irq
154 * @irq: Interrupt to enable 175 * @irq: Interrupt to enable
@@ -168,22 +189,7 @@ void enable_irq(unsigned int irq)
168 return; 189 return;
169 190
170 spin_lock_irqsave(&desc->lock, flags); 191 spin_lock_irqsave(&desc->lock, flags);
171 switch (desc->depth) { 192 __enable_irq(desc, irq);
172 case 0:
173 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
174 WARN_ON(1);
175 break;
176 case 1: {
177 unsigned int status = desc->status & ~IRQ_DISABLED;
178
179 /* Prevent probing on this irq: */
180 desc->status = status | IRQ_NOPROBE;
181 check_irq_resend(desc, irq);
182 /* fall-through */
183 }
184 default:
185 desc->depth--;
186 }
187 spin_unlock_irqrestore(&desc->lock, flags); 193 spin_unlock_irqrestore(&desc->lock, flags);
188} 194}
189EXPORT_SYMBOL(enable_irq); 195EXPORT_SYMBOL(enable_irq);
@@ -364,7 +370,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
364 compat_irq_chip_set_default_handler(desc); 370 compat_irq_chip_set_default_handler(desc);
365 371
366 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 372 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
367 IRQ_INPROGRESS); 373 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
368 374
369 if (!(desc->status & IRQ_NOAUTOEN)) { 375 if (!(desc->status & IRQ_NOAUTOEN)) {
370 desc->depth = 0; 376 desc->depth = 0;
@@ -380,6 +386,16 @@ int setup_irq(unsigned int irq, struct irqaction *new)
380 /* Reset broken irq detection when installing new handler */ 386 /* Reset broken irq detection when installing new handler */
381 desc->irq_count = 0; 387 desc->irq_count = 0;
382 desc->irqs_unhandled = 0; 388 desc->irqs_unhandled = 0;
389
390 /*
391 * Check whether we disabled the irq via the spurious handler
392 * before. Reenable it and give it another chance.
393 */
394 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
395 desc->status &= ~IRQ_SPURIOUS_DISABLED;
396 __enable_irq(desc, irq);
397 }
398
383 spin_unlock_irqrestore(&desc->lock, flags); 399 spin_unlock_irqrestore(&desc->lock, flags);
384 400
385 new->irq = irq; 401 new->irq = irq;
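
__enable_irq() is a straight extraction of the old enable_irq() body so that setup_irq() can reuse it when clearing IRQ_SPURIOUS_DISABLED for a newly shared handler. The depth counting it implements is easiest to see as a sequence (driver-side view, hypothetical irq number):

    disable_irq(irq);   /* depth 0 -> 1, line is masked                   */
    disable_irq(irq);   /* depth 1 -> 2, still masked                     */
    enable_irq(irq);    /* depth 2 -> 1, still masked                     */
    enable_irq(irq);    /* depth 1 -> 0, unmask + check_irq_resend()      */
    enable_irq(irq);    /* depth already 0: "Unbalanced enable" warning   */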
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 088dabbf2d6a..c66d3f10e853 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -209,8 +209,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
209 * Now kill the IRQ 209 * Now kill the IRQ
210 */ 210 */
211 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 211 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
212 desc->status |= IRQ_DISABLED; 212 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
213 desc->depth = 1; 213 desc->depth++;
214 desc->chip->disable(irq); 214 desc->chip->disable(irq);
215 } 215 }
216 desc->irqs_unhandled = 0; 216 desc->irqs_unhandled = 0;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f091d13def00..6fc0040f3e3a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = {
472 472
473static int __init kallsyms_init(void) 473static int __init kallsyms_init(void)
474{ 474{
475 struct proc_dir_entry *entry; 475 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
476
477 entry = create_proc_entry("kallsyms", 0444, NULL);
478 if (entry)
479 entry->proc_fops = &kallsyms_operations;
480 return 0; 476 return 0;
481} 477}
482__initcall(kallsyms_init); 478__initcall(kallsyms_init);
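
This is the first of several conversions in this series (latency_stats and the lockdep files below get the same treatment) from the two-step create_proc_entry() plus proc_fops assignment to the combined proc_create() helper, which closes the window where the entry is visible with no file operations attached. Schematically, with a hypothetical "foo" entry and foo_fops:

    /* old, racy pattern */
    struct proc_dir_entry *entry;

    entry = create_proc_entry("foo", 0444, NULL);
    if (entry)
            entry->proc_fops = &foo_fops;   /* entry already visible here */

    /* new pattern: registered together with its fops in one step */
    proc_create("foo", 0444, NULL, &foo_fops);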
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cb85c79989b4..1c5fcacbcf33 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1217,7 +1217,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1217 } 1217 }
1218 1218
1219 /* match ? */ 1219 /* match ? */
1220 if (system_ram >= start && system_ram <= end) { 1220 if (system_ram >= start && system_ram < end) {
1221 *crash_size = size; 1221 *crash_size = size;
1222 break; 1222 break;
1223 } 1223 }
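
The comparison change makes each crashkernel=range:size entry a half-open interval [start, end). A worked example of where that matters, assuming a hypothetical command line and exactly 2G of system RAM:

    /* crashkernel=0-2G:128M,2G-4G:256M, system_ram == 2G (hypothetical)
     *
     *   old test: 2G >= 0  && 2G <= 2G  -> first entry wins, 128M reserved
     *   new test: 2G >= 0  && 2G <  2G  -> no match
     *             2G >= 2G && 2G <  4G  -> second entry wins, 256M reserved
     */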
diff --git a/kernel/kmod.c b/kernel/kmod.c
index e2764047ec03..8df97d3dfda8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 28#include <linux/completion.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include <linux/mount.h> 33#include <linux/mount.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 92cf6930ab51..bd1b9ea024e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -98,7 +98,7 @@ static void create_kthread(struct kthread_create_info *create)
98 struct sched_param param = { .sched_priority = 0 }; 98 struct sched_param param = { .sched_priority = 0 };
99 wait_for_completion(&create->started); 99 wait_for_completion(&create->started);
100 read_lock(&tasklist_lock); 100 read_lock(&tasklist_lock);
101 create->result = find_task_by_pid(pid); 101 create->result = find_task_by_pid_ns(pid, &init_pid_ns);
102 read_unlock(&tasklist_lock); 102 read_unlock(&tasklist_lock);
103 /* 103 /*
104 * root may have changed our (kthreadd's) priority or CPU mask. 104 * root may have changed our (kthreadd's) priority or CPU mask.
@@ -144,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
144 144
145 spin_lock(&kthread_create_lock); 145 spin_lock(&kthread_create_lock);
146 list_add_tail(&create.list, &kthread_create_list); 146 list_add_tail(&create.list, &kthread_create_list);
147 wake_up_process(kthreadd_task);
148 spin_unlock(&kthread_create_lock); 147 spin_unlock(&kthread_create_lock);
149 148
149 wake_up_process(kthreadd_task);
150 wait_for_completion(&create.done); 150 wait_for_completion(&create.done);
151 151
152 if (!IS_ERR(create.result)) { 152 if (!IS_ERR(create.result)) {
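
Besides the find_task_by_pid_ns() conversion, the kthread_create() hunk moves wake_up_process() outside kthread_create_lock. This is the usual "publish under the lock, wake after dropping it" idiom; a generic sketch with hypothetical names, not code from this patch:

    spin_lock(&queue_lock);
    list_add_tail(&req->list, &queue);      /* request is now visible */
    spin_unlock(&queue_lock);

    /* the worker re-checks the queue after waking, so waking outside
     * the lock is safe and keeps the critical section short */
    wake_up_process(worker_task);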
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 7c74dab0d21b..5e7b45c56923 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -233,14 +233,7 @@ static struct file_operations lstats_fops = {
233 233
234static int __init init_lstats_procfs(void) 234static int __init init_lstats_procfs(void)
235{ 235{
236 struct proc_dir_entry *pe; 236 proc_create("latency_stats", 0644, NULL, &lstats_fops);
237
238 pe = create_proc_entry("latency_stats", 0644, NULL);
239 if (!pe)
240 return -ENOMEM;
241
242 pe->proc_fops = &lstats_fops;
243
244 return 0; 237 return 0;
245} 238}
246__initcall(init_lstats_procfs); 239__initcall(init_lstats_procfs);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8a135bd163c2..dc5d29648d85 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = {
660 660
661static int __init lockdep_proc_init(void) 661static int __init lockdep_proc_init(void)
662{ 662{
663 struct proc_dir_entry *entry; 663 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
664 664 proc_create("lockdep_stats", S_IRUSR, NULL,
665 entry = create_proc_entry("lockdep", S_IRUSR, NULL); 665 &proc_lockdep_stats_operations);
666 if (entry)
667 entry->proc_fops = &proc_lockdep_operations;
668
669 entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
670 if (entry)
671 entry->proc_fops = &proc_lockdep_stats_operations;
672 666
673#ifdef CONFIG_LOCK_STAT 667#ifdef CONFIG_LOCK_STAT
674 entry = create_proc_entry("lock_stat", S_IRUSR, NULL); 668 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
675 if (entry)
676 entry->proc_fops = &proc_lock_stat_operations;
677#endif 669#endif
678 670
679 return 0; 671 return 0;
diff --git a/kernel/marker.c b/kernel/marker.c
index 005b95954593..b5a9fe1d50d5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -23,12 +23,13 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/marker.h> 24#include <linux/marker.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h>
26 27
27extern struct marker __start___markers[]; 28extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 29extern struct marker __stop___markers[];
29 30
30/* Set to 1 to enable marker debug output */ 31/* Set to 1 to enable marker debug output */
31const int marker_debug; 32static const int marker_debug;
32 33
33/* 34/*
34 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
diff --git a/kernel/module.c b/kernel/module.c
index 8d6cccc6c3cf..8674a390a2e8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -164,131 +164,140 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
164 return NULL; 164 return NULL;
165} 165}
166 166
167static void printk_unused_warning(const char *name) 167static bool always_ok(bool gplok, bool warn, const char *name)
168{ 168{
169 printk(KERN_WARNING "Symbol %s is marked as UNUSED, " 169 return true;
170 "however this module is using it.\n", name);
171 printk(KERN_WARNING "This symbol will go away in the future.\n");
172 printk(KERN_WARNING "Please evalute if this is the right api to use, "
173 "and if it really is, submit a report the linux kernel "
174 "mailinglist together with submitting your code for "
175 "inclusion.\n");
176} 170}
177 171
178/* Find a symbol, return value, crc and module which owns it */ 172static bool printk_unused_warning(bool gplok, bool warn, const char *name)
179static unsigned long __find_symbol(const char *name,
180 struct module **owner,
181 const unsigned long **crc,
182 int gplok)
183{ 173{
184 struct module *mod; 174 if (warn) {
185 const struct kernel_symbol *ks; 175 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
186 176 "however this module is using it.\n", name);
187 /* Core kernel first. */ 177 printk(KERN_WARNING
188 *owner = NULL; 178 "This symbol will go away in the future.\n");
189 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); 179 printk(KERN_WARNING
190 if (ks) { 180 "Please evalute if this is the right api to use and if "
191 *crc = symversion(__start___kcrctab, (ks - __start___ksymtab)); 181 "it really is, submit a report the linux kernel "
192 return ks->value; 182 "mailinglist together with submitting your code for "
183 "inclusion.\n");
193 } 184 }
194 if (gplok) { 185 return true;
195 ks = lookup_symbol(name, __start___ksymtab_gpl, 186}
196 __stop___ksymtab_gpl); 187
197 if (ks) { 188static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
198 *crc = symversion(__start___kcrctab_gpl, 189{
199 (ks - __start___ksymtab_gpl)); 190 if (!gplok)
200 return ks->value; 191 return false;
201 } 192 return printk_unused_warning(gplok, warn, name);
202 } 193}
203 ks = lookup_symbol(name, __start___ksymtab_gpl_future, 194
204 __stop___ksymtab_gpl_future); 195static bool gpl_only(bool gplok, bool warn, const char *name)
205 if (ks) { 196{
206 if (!gplok) { 197 return gplok;
207 printk(KERN_WARNING "Symbol %s is being used " 198}
208 "by a non-GPL module, which will not " 199
209 "be allowed in the future\n", name); 200static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
210 printk(KERN_WARNING "Please see the file " 201{
211 "Documentation/feature-removal-schedule.txt " 202 if (!gplok && warn) {
212 "in the kernel source tree for more " 203 printk(KERN_WARNING "Symbol %s is being used "
213 "details.\n"); 204 "by a non-GPL module, which will not "
214 } 205 "be allowed in the future\n", name);
215 *crc = symversion(__start___kcrctab_gpl_future, 206 printk(KERN_WARNING "Please see the file "
216 (ks - __start___ksymtab_gpl_future)); 207 "Documentation/feature-removal-schedule.txt "
217 return ks->value; 208 "in the kernel source tree for more details.\n");
218 } 209 }
210 return true;
211}
219 212
220 ks = lookup_symbol(name, __start___ksymtab_unused, 213struct symsearch {
221 __stop___ksymtab_unused); 214 const struct kernel_symbol *start, *stop;
222 if (ks) { 215 const unsigned long *crcs;
223 printk_unused_warning(name); 216 bool (*check)(bool gplok, bool warn, const char *name);
224 *crc = symversion(__start___kcrctab_unused, 217};
225 (ks - __start___ksymtab_unused)); 218
226 return ks->value; 219/* Look through this array of symbol tables for a symbol match which
220 * passes the check function. */
221static const struct kernel_symbol *search_symarrays(const struct symsearch *arr,
222 unsigned int num,
223 const char *name,
224 bool gplok,
225 bool warn,
226 const unsigned long **crc)
227{
228 unsigned int i;
229 const struct kernel_symbol *ks;
230
231 for (i = 0; i < num; i++) {
232 ks = lookup_symbol(name, arr[i].start, arr[i].stop);
233 if (!ks || !arr[i].check(gplok, warn, name))
234 continue;
235
236 if (crc)
237 *crc = symversion(arr[i].crcs, ks - arr[i].start);
238 return ks;
227 } 239 }
240 return NULL;
241}
242
243/* Find a symbol, return value, (optional) crc and (optional) module
244 * which owns it */
245static unsigned long find_symbol(const char *name,
246 struct module **owner,
247 const unsigned long **crc,
248 bool gplok,
249 bool warn)
250{
251 struct module *mod;
252 const struct kernel_symbol *ks;
253 const struct symsearch arr[] = {
254 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
255 always_ok },
256 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
257 __start___kcrctab_gpl, gpl_only },
258 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
259 __start___kcrctab_gpl_future, warn_if_not_gpl },
260 { __start___ksymtab_unused, __stop___ksymtab_unused,
261 __start___kcrctab_unused, printk_unused_warning },
262 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
263 __start___kcrctab_unused_gpl, gpl_only_unused_warning },
264 };
228 265
229 if (gplok) 266 /* Core kernel first. */
230 ks = lookup_symbol(name, __start___ksymtab_unused_gpl, 267 ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc);
231 __stop___ksymtab_unused_gpl);
232 if (ks) { 268 if (ks) {
233 printk_unused_warning(name); 269 if (owner)
234 *crc = symversion(__start___kcrctab_unused_gpl, 270 *owner = NULL;
235 (ks - __start___ksymtab_unused_gpl));
236 return ks->value; 271 return ks->value;
237 } 272 }
238 273
239 /* Now try modules. */ 274 /* Now try modules. */
240 list_for_each_entry(mod, &modules, list) { 275 list_for_each_entry(mod, &modules, list) {
241 *owner = mod; 276 struct symsearch arr[] = {
242 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); 277 { mod->syms, mod->syms + mod->num_syms, mod->crcs,
243 if (ks) { 278 always_ok },
244 *crc = symversion(mod->crcs, (ks - mod->syms)); 279 { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
245 return ks->value; 280 mod->gpl_crcs, gpl_only },
246 } 281 { mod->gpl_future_syms,
247 282 mod->gpl_future_syms + mod->num_gpl_future_syms,
248 if (gplok) { 283 mod->gpl_future_crcs, warn_if_not_gpl },
249 ks = lookup_symbol(name, mod->gpl_syms, 284 { mod->unused_syms,
250 mod->gpl_syms + mod->num_gpl_syms); 285 mod->unused_syms + mod->num_unused_syms,
251 if (ks) { 286 mod->unused_crcs, printk_unused_warning },
252 *crc = symversion(mod->gpl_crcs, 287 { mod->unused_gpl_syms,
253 (ks - mod->gpl_syms)); 288 mod->unused_gpl_syms + mod->num_unused_gpl_syms,
254 return ks->value; 289 mod->unused_gpl_crcs, gpl_only_unused_warning },
255 } 290 };
256 } 291
257 ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); 292 ks = search_symarrays(arr, ARRAY_SIZE(arr),
293 name, gplok, warn, crc);
258 if (ks) { 294 if (ks) {
259 printk_unused_warning(name); 295 if (owner)
260 *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); 296 *owner = mod;
261 return ks->value;
262 }
263
264 if (gplok) {
265 ks = lookup_symbol(name, mod->unused_gpl_syms,
266 mod->unused_gpl_syms + mod->num_unused_gpl_syms);
267 if (ks) {
268 printk_unused_warning(name);
269 *crc = symversion(mod->unused_gpl_crcs,
270 (ks - mod->unused_gpl_syms));
271 return ks->value;
272 }
273 }
274 ks = lookup_symbol(name, mod->gpl_future_syms,
275 (mod->gpl_future_syms +
276 mod->num_gpl_future_syms));
277 if (ks) {
278 if (!gplok) {
279 printk(KERN_WARNING "Symbol %s is being used "
280 "by a non-GPL module, which will not "
281 "be allowed in the future\n", name);
282 printk(KERN_WARNING "Please see the file "
283 "Documentation/feature-removal-schedule.txt "
284 "in the kernel source tree for more "
285 "details.\n");
286 }
287 *crc = symversion(mod->gpl_future_crcs,
288 (ks - mod->gpl_future_syms));
289 return ks->value; 297 return ks->value;
290 } 298 }
291 } 299 }
300
292 DEBUGP("Failed to find symbol %s\n", name); 301 DEBUGP("Failed to find symbol %s\n", name);
293 return -ENOENT; 302 return -ENOENT;
294} 303}
@@ -736,12 +745,13 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
736 if (!forced && module_refcount(mod) != 0) 745 if (!forced && module_refcount(mod) != 0)
737 wait_for_zero_refcount(mod); 746 wait_for_zero_refcount(mod);
738 747
748 mutex_unlock(&module_mutex);
739 /* Final destruction now noone is using it. */ 749 /* Final destruction now noone is using it. */
740 if (mod->exit != NULL) { 750 if (mod->exit != NULL)
741 mutex_unlock(&module_mutex);
742 mod->exit(); 751 mod->exit();
743 mutex_lock(&module_mutex); 752 blocking_notifier_call_chain(&module_notify_list,
744 } 753 MODULE_STATE_GOING, mod);
754 mutex_lock(&module_mutex);
745 /* Store the name of the last unloaded module for diagnostic purposes */ 755 /* Store the name of the last unloaded module for diagnostic purposes */
746 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 756 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
747 free_module(mod); 757 free_module(mod);
@@ -777,10 +787,9 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
777void __symbol_put(const char *symbol) 787void __symbol_put(const char *symbol)
778{ 788{
779 struct module *owner; 789 struct module *owner;
780 const unsigned long *crc;
781 790
782 preempt_disable(); 791 preempt_disable();
783 if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1))) 792 if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
784 BUG(); 793 BUG();
785 module_put(owner); 794 module_put(owner);
786 preempt_enable(); 795 preempt_enable();
@@ -924,13 +933,10 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
924 struct module *mod) 933 struct module *mod)
925{ 934{
926 const unsigned long *crc; 935 const unsigned long *crc;
927 struct module *owner;
928 936
929 if (IS_ERR_VALUE(__find_symbol("struct_module", 937 if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
930 &owner, &crc, 1)))
931 BUG(); 938 BUG();
932 return check_version(sechdrs, versindex, "struct_module", mod, 939 return check_version(sechdrs, versindex, "struct_module", mod, crc);
933 crc);
934} 940}
935 941
936/* First part is kernel version, which we ignore. */ 942/* First part is kernel version, which we ignore. */
@@ -974,8 +980,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
974 unsigned long ret; 980 unsigned long ret;
975 const unsigned long *crc; 981 const unsigned long *crc;
976 982
977 ret = __find_symbol(name, &owner, &crc, 983 ret = find_symbol(name, &owner, &crc,
978 !(mod->taints & TAINT_PROPRIETARY_MODULE)); 984 !(mod->taints & TAINT_PROPRIETARY_MODULE), true);
979 if (!IS_ERR_VALUE(ret)) { 985 if (!IS_ERR_VALUE(ret)) {
980 /* use_module can fail due to OOM, 986 /* use_module can fail due to OOM,
981 or module initialization or unloading */ 987 or module initialization or unloading */
@@ -991,6 +997,20 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
991 * J. Corbet <corbet@lwn.net> 997 * J. Corbet <corbet@lwn.net>
992 */ 998 */
993#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 999#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1000struct module_sect_attr
1001{
1002 struct module_attribute mattr;
1003 char *name;
1004 unsigned long address;
1005};
1006
1007struct module_sect_attrs
1008{
1009 struct attribute_group grp;
1010 unsigned int nsections;
1011 struct module_sect_attr attrs[0];
1012};
1013
994static ssize_t module_sect_show(struct module_attribute *mattr, 1014static ssize_t module_sect_show(struct module_attribute *mattr,
995 struct module *mod, char *buf) 1015 struct module *mod, char *buf)
996{ 1016{
@@ -1001,7 +1021,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1001 1021
1002static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1022static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1003{ 1023{
1004 int section; 1024 unsigned int section;
1005 1025
1006 for (section = 0; section < sect_attrs->nsections; section++) 1026 for (section = 0; section < sect_attrs->nsections; section++)
1007 kfree(sect_attrs->attrs[section].name); 1027 kfree(sect_attrs->attrs[section].name);
@@ -1362,10 +1382,9 @@ void *__symbol_get(const char *symbol)
1362{ 1382{
1363 struct module *owner; 1383 struct module *owner;
1364 unsigned long value; 1384 unsigned long value;
1365 const unsigned long *crc;
1366 1385
1367 preempt_disable(); 1386 preempt_disable();
1368 value = __find_symbol(symbol, &owner, &crc, 1); 1387 value = find_symbol(symbol, &owner, NULL, true, true);
1369 if (IS_ERR_VALUE(value)) 1388 if (IS_ERR_VALUE(value))
1370 value = 0; 1389 value = 0;
1371 else if (strong_try_module_get(owner)) 1390 else if (strong_try_module_get(owner))
@@ -1382,33 +1401,33 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1382 */ 1401 */
1383static int verify_export_symbols(struct module *mod) 1402static int verify_export_symbols(struct module *mod)
1384{ 1403{
1385 const char *name = NULL; 1404 unsigned int i;
1386 unsigned long i, ret = 0;
1387 struct module *owner; 1405 struct module *owner;
1388 const unsigned long *crc; 1406 const struct kernel_symbol *s;
1389 1407 struct {
1390 for (i = 0; i < mod->num_syms; i++) 1408 const struct kernel_symbol *sym;
1391 if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name, 1409 unsigned int num;
1392 &owner, &crc, 1))) { 1410 } arr[] = {
1393 name = mod->syms[i].name; 1411 { mod->syms, mod->num_syms },
1394 ret = -ENOEXEC; 1412 { mod->gpl_syms, mod->num_gpl_syms },
1395 goto dup; 1413 { mod->gpl_future_syms, mod->num_gpl_future_syms },
1396 } 1414 { mod->unused_syms, mod->num_unused_syms },
1415 { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
1416 };
1397 1417
1398 for (i = 0; i < mod->num_gpl_syms; i++) 1418 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1399 if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name, 1419 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1400 &owner, &crc, 1))) { 1420 if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
1401 name = mod->gpl_syms[i].name; 1421 NULL, true, false))) {
1402 ret = -ENOEXEC; 1422 printk(KERN_ERR
1403 goto dup; 1423 "%s: exports duplicate symbol %s"
1424 " (owned by %s)\n",
1425 mod->name, s->name, module_name(owner));
1426 return -ENOEXEC;
1427 }
1404 } 1428 }
1405 1429 }
1406dup: 1430 return 0;
1407 if (ret)
1408 printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n",
1409 mod->name, name, module_name(owner));
1410
1411 return ret;
1412} 1431}
1413 1432
1414/* Change all symbols so that st_value encodes the pointer directly. */ 1433/* Change all symbols so that st_value encodes the pointer directly. */
@@ -1814,8 +1833,9 @@ static struct module *load_module(void __user *umod,
1814 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); 1833 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1815#endif 1834#endif
1816 1835
1817 /* Don't keep modinfo section */ 1836 /* Don't keep modinfo and version sections. */
1818 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1837 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1838 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1819#ifdef CONFIG_KALLSYMS 1839#ifdef CONFIG_KALLSYMS
1820 /* Keep symbol and string tables for decoding later. */ 1840 /* Keep symbol and string tables for decoding later. */
1821 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1841 sechdrs[symindex].sh_flags |= SHF_ALLOC;
@@ -1977,7 +1997,8 @@ static struct module *load_module(void __user *umod,
1977 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; 1997 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
1978 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; 1998 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
1979 if (unusedgplcrcindex) 1999 if (unusedgplcrcindex)
1980 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; 2000 mod->unused_gpl_crcs
2001 = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1981 2002
1982#ifdef CONFIG_MODVERSIONS 2003#ifdef CONFIG_MODVERSIONS
1983 if ((mod->num_syms && !crcindex) || 2004 if ((mod->num_syms && !crcindex) ||
@@ -2171,6 +2192,8 @@ sys_init_module(void __user *umod,
2171 mod->state = MODULE_STATE_GOING; 2192 mod->state = MODULE_STATE_GOING;
2172 synchronize_sched(); 2193 synchronize_sched();
2173 module_put(mod); 2194 module_put(mod);
2195 blocking_notifier_call_chain(&module_notify_list,
2196 MODULE_STATE_GOING, mod);
2174 mutex_lock(&module_mutex); 2197 mutex_lock(&module_mutex);
2175 free_module(mod); 2198 free_module(mod);
2176 mutex_unlock(&module_mutex); 2199 mutex_unlock(&module_mutex);
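
The net effect of the module.c rework is that symbol lookup is now table-driven (struct symsearch plus per-class check callbacks) and that the owner and crc outputs of find_symbol() are optional. find_symbol() is file-local, so the following is only a reading aid with a hypothetical symbol name, not a call that can be made from elsewhere:

    struct module *owner;
    const unsigned long *crc;
    unsigned long addr;

    addr = find_symbol("some_exported_symbol",
                       &owner,      /* may be NULL if the caller doesn't care */
                       &crc,        /* may be NULL if the caller doesn't care */
                       true,        /* gplok: requester is GPL-compatible      */
                       false);      /* warn: suppress deprecation printks      */
    if (IS_ERR_VALUE(addr))
            ;                       /* not exported by core kernel or modules */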
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 643360d1bb14..823be11584ef 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl,
31 return 0; 31 return 0;
32} 32}
33 33
34static int notifier_chain_cond_register(struct notifier_block **nl,
35 struct notifier_block *n)
36{
37 while ((*nl) != NULL) {
38 if ((*nl) == n)
39 return 0;
40 if (n->priority > (*nl)->priority)
41 break;
42 nl = &((*nl)->next);
43 }
44 n->next = *nl;
45 rcu_assign_pointer(*nl, n);
46 return 0;
47}
48
34static int notifier_chain_unregister(struct notifier_block **nl, 49static int notifier_chain_unregister(struct notifier_block **nl,
35 struct notifier_block *n) 50 struct notifier_block *n)
36{ 51{
@@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
205EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); 220EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
206 221
207/** 222/**
223 * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
224 * @nh: Pointer to head of the blocking notifier chain
225 * @n: New entry in notifier chain
226 *
227 * Adds a notifier to a blocking notifier chain, only if not already
228 * present in the chain.
229 * Must be called in process context.
230 *
231 * Currently always returns zero.
232 */
233int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
234 struct notifier_block *n)
235{
236 int ret;
237
238 down_write(&nh->rwsem);
239 ret = notifier_chain_cond_register(&nh->head, n);
240 up_write(&nh->rwsem);
241 return ret;
242}
243EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
244
245/**
208 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain 246 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
209 * @nh: Pointer to head of the blocking notifier chain 247 * @nh: Pointer to head of the blocking notifier chain
210 * @n: Entry to remove from notifier chain 248 * @n: Entry to remove from notifier chain
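
blocking_notifier_chain_cond_register() behaves like the unconditional variant except that re-registering an already-linked block is a no-op, which makes it safe for registration paths that may run more than once. A hypothetical usage sketch, reusing the module_notify_list chain that appears in the module.c hunks above:

    static int my_module_event(struct notifier_block *nb,
                               unsigned long action, void *data)
    {
            /* react to MODULE_STATE_* transitions here */
            return NOTIFY_OK;
    }

    static struct notifier_block my_module_nb = {
            .notifier_call = my_module_event,
    };

    /* may be reached repeatedly; the block is linked at most once */
    blocking_notifier_chain_cond_register(&module_notify_list, &my_module_nb);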
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index aead4d69f62b..48d7ed6fc3a4 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,8 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/slab.h>
11#include <linux/nsproxy.h>
10 12
11struct ns_cgroup { 13struct ns_cgroup {
12 struct cgroup_subsys_state css; 14 struct cgroup_subsys_state css;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5d332cf8c63..adc785146a1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
139 goto out; 139 goto out;
140 } 140 }
141 141
142 /*
143 * CLONE_NEWIPC must detach from the undolist: after switching
144 * to a new ipc namespace, the semaphore arrays from the old
145 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
146 * means share undolist with parent, so we must forbid using
147 * it along with CLONE_NEWIPC.
148 */
149 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
150 err = -EINVAL;
151 goto out;
152 }
153
142 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 154 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
143 if (IS_ERR(new_ns)) { 155 if (IS_ERR(new_ns)) {
144 err = PTR_ERR(new_ns); 156 err = PTR_ERR(new_ns);
diff --git a/kernel/panic.c b/kernel/panic.c
index 24af9f8bac99..425567f45b9f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic);
153 * 'M' - System experienced a machine check exception. 153 * 'M' - System experienced a machine check exception.
154 * 'B' - System has hit bad_page. 154 * 'B' - System has hit bad_page.
155 * 'U' - Userspace-defined naughtiness. 155 * 'U' - Userspace-defined naughtiness.
156 * 'A' - ACPI table overridden.
157 * 'W' - Taint on warning.
156 * 158 *
157 * The string is overwritten by the next call to print_taint(). 159 * The string is overwritten by the next call to print_taint().
158 */ 160 */
@@ -161,7 +163,7 @@ const char *print_tainted(void)
161{ 163{
162 static char buf[20]; 164 static char buf[20];
163 if (tainted) { 165 if (tainted) {
164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", 166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c",
165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -170,7 +172,8 @@ const char *print_tainted(void)
170 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 172 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
171 tainted & TAINT_USER ? 'U' : ' ', 173 tainted & TAINT_USER ? 'U' : ' ',
172 tainted & TAINT_DIE ? 'D' : ' ', 174 tainted & TAINT_DIE ? 'D' : ' ',
173 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); 175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ',
176 tainted & TAINT_WARN ? 'W' : ' ');
174 } 177 }
175 else 178 else
176 snprintf(buf, sizeof(buf), "Not tainted"); 179 snprintf(buf, sizeof(buf), "Not tainted");
@@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line)
312 print_modules(); 315 print_modules();
313 dump_stack(); 316 dump_stack();
314 print_oops_end_marker(); 317 print_oops_end_marker();
318 add_taint(TAINT_WARN);
315} 319}
316EXPORT_SYMBOL(warn_on_slowpath); 320EXPORT_SYMBOL(warn_on_slowpath);
317#endif 321#endif
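
The documentation comment gains the 'A' flag (already printed before) and the new 'W' taint, and the format string grows from nine to ten flag characters. The existing static char buf[20] still fits, but only just:

    /* "Tainted: " is 9 characters, plus 10 flag characters and the
     * terminating NUL: 9 + 10 + 1 = 20 == sizeof(buf), so snprintf()
     * does not truncate -- an eleventh taint flag would. */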
diff --git a/kernel/pid.c b/kernel/pid.c
index 477691576b33..20d59fa2d493 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -111,10 +111,11 @@ EXPORT_SYMBOL(is_container_init);
111 111
112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
113 113
114static void free_pidmap(struct pid_namespace *pid_ns, int pid) 114static void free_pidmap(struct upid *upid)
115{ 115{
116 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; 116 int nr = upid->nr;
117 int offset = pid & BITS_PER_PAGE_MASK; 117 struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
118 int offset = nr & BITS_PER_PAGE_MASK;
118 119
119 clear_bit(offset, map->page); 120 clear_bit(offset, map->page);
120 atomic_inc(&map->nr_free); 121 atomic_inc(&map->nr_free);
@@ -232,7 +233,7 @@ void free_pid(struct pid *pid)
232 spin_unlock_irqrestore(&pidmap_lock, flags); 233 spin_unlock_irqrestore(&pidmap_lock, flags);
233 234
234 for (i = 0; i <= pid->level; i++) 235 for (i = 0; i <= pid->level; i++)
235 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); 236 free_pidmap(pid->numbers + i);
236 237
237 call_rcu(&pid->rcu, delayed_put_pid); 238 call_rcu(&pid->rcu, delayed_put_pid);
238} 239}
@@ -278,8 +279,8 @@ out:
278 return pid; 279 return pid;
279 280
280out_free: 281out_free:
281 for (i++; i <= ns->level; i++) 282 while (++i <= ns->level)
282 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); 283 free_pidmap(pid->numbers + i);
283 284
284 kmem_cache_free(ns->pid_cachep, pid); 285 kmem_cache_free(ns->pid_cachep, pid);
285 pid = NULL; 286 pid = NULL;
@@ -316,7 +317,7 @@ EXPORT_SYMBOL_GPL(find_pid);
316/* 317/*
317 * attach_pid() must be called with the tasklist_lock write-held. 318 * attach_pid() must be called with the tasklist_lock write-held.
318 */ 319 */
319int attach_pid(struct task_struct *task, enum pid_type type, 320void attach_pid(struct task_struct *task, enum pid_type type,
320 struct pid *pid) 321 struct pid *pid)
321{ 322{
322 struct pid_link *link; 323 struct pid_link *link;
@@ -324,11 +325,10 @@ int attach_pid(struct task_struct *task, enum pid_type type,
324 link = &task->pids[type]; 325 link = &task->pids[type];
325 link->pid = pid; 326 link->pid = pid;
326 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 327 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
327
328 return 0;
329} 328}
330 329
331void detach_pid(struct task_struct *task, enum pid_type type) 330static void __change_pid(struct task_struct *task, enum pid_type type,
331 struct pid *new)
332{ 332{
333 struct pid_link *link; 333 struct pid_link *link;
334 struct pid *pid; 334 struct pid *pid;
@@ -338,7 +338,7 @@ void detach_pid(struct task_struct *task, enum pid_type type)
338 pid = link->pid; 338 pid = link->pid;
339 339
340 hlist_del_rcu(&link->node); 340 hlist_del_rcu(&link->node);
341 link->pid = NULL; 341 link->pid = new;
342 342
343 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 343 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
344 if (!hlist_empty(&pid->tasks[tmp])) 344 if (!hlist_empty(&pid->tasks[tmp]))
@@ -347,13 +347,24 @@ void detach_pid(struct task_struct *task, enum pid_type type)
347 free_pid(pid); 347 free_pid(pid);
348} 348}
349 349
350void detach_pid(struct task_struct *task, enum pid_type type)
351{
352 __change_pid(task, type, NULL);
353}
354
355void change_pid(struct task_struct *task, enum pid_type type,
356 struct pid *pid)
357{
358 __change_pid(task, type, pid);
359 attach_pid(task, type, pid);
360}
361
350/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 362/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
351void transfer_pid(struct task_struct *old, struct task_struct *new, 363void transfer_pid(struct task_struct *old, struct task_struct *new,
352 enum pid_type type) 364 enum pid_type type)
353{ 365{
354 new->pids[type].pid = old->pids[type].pid; 366 new->pids[type].pid = old->pids[type].pid;
355 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); 367 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
356 old->pids[type].pid = NULL;
357} 368}
358 369
359struct task_struct *pid_task(struct pid *pid, enum pid_type type) 370struct task_struct *pid_task(struct pid *pid, enum pid_type type)
@@ -380,12 +391,6 @@ struct task_struct *find_task_by_pid_type_ns(int type, int nr,
380 391
381EXPORT_SYMBOL(find_task_by_pid_type_ns); 392EXPORT_SYMBOL(find_task_by_pid_type_ns);
382 393
383struct task_struct *find_task_by_pid(pid_t nr)
384{
385 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
386}
387EXPORT_SYMBOL(find_task_by_pid);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 394struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 395{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 396 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
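A userspace mock of the reworked free path: free_pidmap() now takes a struct upid, which already pairs the numeric id with its owning namespace, instead of two separate arguments. The struct layouts, array sizes and plain bit twiddling below are simplified stand-ins for the kernel's pidmap pages, clear_bit() and atomic_inc().

#include <stdio.h>

#define BITS_PER_PAGE      (4096 * 8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)

struct pidmap { int nr_free; unsigned char page[BITS_PER_PAGE / 8]; };
struct pid_namespace { struct pidmap pidmap[4]; };
struct upid { int nr; struct pid_namespace *ns; };

static void free_pidmap(struct upid *upid)
{
        int nr = upid->nr;
        struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
        int offset = nr & BITS_PER_PAGE_MASK;

        map->page[offset / 8] &= ~(1u << (offset % 8));   /* clear_bit() */
        map->nr_free++;                                   /* atomic_inc() */
}

int main(void)
{
        static struct pid_namespace ns;
        struct upid u = { .nr = 123, .ns = &ns };

        ns.pidmap[0].page[123 / 8] |= 1u << (123 % 8);    /* pretend 123 is in use */
        free_pidmap(&u);
        printf("bit cleared, nr_free=%d\n", ns.pidmap[0].nr_free);
        return 0;
}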
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 5ca37fa50beb..98702b4b8851 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -66,7 +66,7 @@ err_alloc:
66 return NULL; 66 return NULL;
67} 67}
68 68
69static struct pid_namespace *create_pid_namespace(int level) 69static struct pid_namespace *create_pid_namespace(unsigned int level)
70{ 70{
71 struct pid_namespace *ns; 71 struct pid_namespace *ns;
72 int i; 72 int i;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ae5c6c147c4b..f1525ad06cb3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -4,8 +4,9 @@
4 4
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/posix-timers.h> 6#include <linux/posix-timers.h>
7#include <asm/uaccess.h>
8#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/math64.h>
9#include <asm/uaccess.h>
9 10
10static int check_clock(const clockid_t which_clock) 11static int check_clock(const clockid_t which_clock)
11{ 12{
@@ -47,12 +48,10 @@ static void sample_to_timespec(const clockid_t which_clock,
47 union cpu_time_count cpu, 48 union cpu_time_count cpu,
48 struct timespec *tp) 49 struct timespec *tp)
49{ 50{
50 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 51 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
51 tp->tv_sec = div_long_long_rem(cpu.sched, 52 *tp = ns_to_timespec(cpu.sched);
52 NSEC_PER_SEC, &tp->tv_nsec); 53 else
53 } else {
54 cputime_to_timespec(cpu.cpu, tp); 54 cputime_to_timespec(cpu.cpu, tp);
55 }
56} 55}
57 56
58static inline int cpu_time_before(const clockid_t which_clock, 57static inline int cpu_time_before(const clockid_t which_clock,
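The CPUCLOCK_SCHED branch now simply splits a nanosecond count into seconds and nanoseconds via ns_to_timespec(). A userspace sketch of that conversion, for positive values only (the kernel helper also normalizes negatives):

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000LL

static struct timespec ns_to_timespec(long long nsec)
{
        struct timespec ts;

        ts.tv_sec = (time_t)(nsec / NSEC_PER_SEC);
        ts.tv_nsec = (long)(nsec % NSEC_PER_SEC);
        return ts;
}

int main(void)
{
        struct timespec ts = ns_to_timespec(2500000001LL);

        printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);  /* 2.500000001 */
        return 0;
}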
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 8476956ffd92..dbd8398ddb0b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -310,8 +310,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
310 310
311 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 311 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
312 struct task_struct *leader; 312 struct task_struct *leader;
313 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, 313 int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
314 timr->it_process);
315 314
316 if (likely(ret >= 0)) 315 if (likely(ret >= 0))
317 return ret; 316 return ret;
@@ -322,8 +321,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
322 timr->it_process = leader; 321 timr->it_process = leader;
323 } 322 }
324 323
325 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, 324 return send_sigqueue(timr->sigq, timr->it_process, 1);
326 timr->it_process);
327} 325}
328EXPORT_SYMBOL_GPL(posix_timer_event); 326EXPORT_SYMBOL_GPL(posix_timer_event);
329 327
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6233f3b4ae66..b45da40e8d25 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,16 +19,6 @@ config PM
19 will issue the hlt instruction if nothing is to be done, thereby 19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY
23 bool "Legacy Power Management API (DEPRECATED)"
24 depends on PM
25 default n
26 ---help---
27 Support for pm_register() and friends. This old API is obsoleted
28 by the driver model.
29
30 If unsure, say N.
31
32config PM_DEBUG 22config PM_DEBUG
33 bool "Power Management Debug Support" 23 bool "Power Management Debug Support"
34 depends on PM 24 depends on PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f7dfff28ecdb..597823b5b700 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o 6obj-y := main.o
7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_PM_SLEEP) += process.o console.o 7obj-$(CONFIG_PM_SLEEP) += process.o console.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
10 9
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
deleted file mode 100644
index 60c73fa670d5..000000000000
--- a/kernel/power/pm.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 * pm.c - Power management interface
3 *
4 * Copyright (C) 2000 Andrew Henroid
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20#include <linux/init.h>
21#include <linux/module.h>
22#include <linux/spinlock.h>
23#include <linux/mm.h>
24#include <linux/slab.h>
25#include <linux/pm.h>
26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h>
28#include <linux/mutex.h>
29
30/*
31 * Locking notes:
32 * pm_devs_lock can be a semaphore providing pm ops are not called
33 * from an interrupt handler (already a bad idea so no change here). Each
34 * change must be protected so that an unlink of an entry doesn't clash
35 * with a pm send - which is permitted to sleep in the current architecture
36 *
37 * Module unloads clashing with pm events now work out safely, the module
38 * unload path will block until the event has been sent. It may well block
39 * until a resume but that will be fine.
40 */
41
42static DEFINE_MUTEX(pm_devs_lock);
43static LIST_HEAD(pm_devs);
44
45/**
46 * pm_register - register a device with power management
47 * @type: device type
48 * @id: device ID
49 * @callback: callback function
50 *
51 * Add a device to the list of devices that wish to be notified about
52 * power management events. A &pm_dev structure is returned on success,
53 * on failure the return is %NULL.
54 *
55 * The callback function will be called in process context and
56 * it may sleep.
57 */
58
59struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id,
61 pm_callback callback)
62{
63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) {
65 dev->type = type;
66 dev->id = id;
67 dev->callback = callback;
68
69 mutex_lock(&pm_devs_lock);
70 list_add(&dev->entry, &pm_devs);
71 mutex_unlock(&pm_devs_lock);
72 }
73 return dev;
74}
75
76/**
77 * pm_send - send request to a single device
78 * @dev: device to send to
79 * @rqst: power management request
80 * @data: data for the callback
81 *
82 * Issue a power management request to a given device. The
83 * %PM_SUSPEND and %PM_RESUME events are handled specially. The
84 * data field must hold the intended next state. No call is made
85 * if the state matches.
86 *
87 * BUGS: what stops two power management requests occurring in parallel
88 * and conflicting.
89 *
90 * WARNING: Calling pm_send directly is not generally recommended, in
91 * particular there is no locking against the pm_dev going away. The
92 * caller must maintain all needed locking or have 'inside knowledge'
93 * on the safety. Also remember that this function is not locked against
94 * pm_unregister. This means that you must handle SMP races on callback
95 * execution and unload yourself.
96 */
97
98static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
99{
100 int status = 0;
101 unsigned long prev_state, next_state;
102
103 if (in_interrupt())
104 BUG();
105
106 switch (rqst) {
107 case PM_SUSPEND:
108 case PM_RESUME:
109 prev_state = dev->state;
110 next_state = (unsigned long) data;
111 if (prev_state != next_state) {
112 if (dev->callback)
113 status = (*dev->callback)(dev, rqst, data);
114 if (!status) {
115 dev->state = next_state;
116 dev->prev_state = prev_state;
117 }
118 }
119 else {
120 dev->prev_state = prev_state;
121 }
122 break;
123 default:
124 if (dev->callback)
125 status = (*dev->callback)(dev, rqst, data);
126 break;
127 }
128 return status;
129}
130
131/*
132 * Undo incomplete request
133 */
134static void pm_undo_all(struct pm_dev *last)
135{
136 struct list_head *entry = last->entry.prev;
137 while (entry != &pm_devs) {
138 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
139 if (dev->state != dev->prev_state) {
140 /* previous state was zero (running) resume or
141 * previous state was non-zero (suspended) suspend
142 */
143 pm_request_t undo = (dev->prev_state
144 ? PM_SUSPEND:PM_RESUME);
145 pm_send(dev, undo, (void*) dev->prev_state);
146 }
147 entry = entry->prev;
148 }
149}
150
151/**
152 * pm_send_all - send request to all managed devices
153 * @rqst: power management request
154 * @data: data for the callback
155 *
156 * Issue a power management request to a all devices. The
157 * %PM_SUSPEND events are handled specially. Any device is
158 * permitted to fail a suspend by returning a non zero (error)
159 * value from its callback function. If any device vetoes a
160 * suspend request then all other devices that have suspended
161 * during the processing of this request are restored to their
162 * previous state.
163 *
164 * WARNING: This function takes the pm_devs_lock. The lock is not dropped until
165 * the callbacks have completed. This prevents races against pm locking
166 * functions, races against module unload pm_unregister code. It does
167 * mean however that you must not issue pm_ functions within the callback
168 * or you will deadlock and users will hate you.
169 *
170 * Zero is returned on success. If a suspend fails then the status
171 * from the device that vetoes the suspend is returned.
172 *
173 * BUGS: what stops two power management requests occurring in parallel
174 * and conflicting.
175 */
176
177int pm_send_all(pm_request_t rqst, void *data)
178{
179 struct list_head *entry;
180
181 mutex_lock(&pm_devs_lock);
182 entry = pm_devs.next;
183 while (entry != &pm_devs) {
184 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
185 if (dev->callback) {
186 int status = pm_send(dev, rqst, data);
187 if (status) {
188 /* return devices to previous state on
189 * failed suspend request
190 */
191 if (rqst == PM_SUSPEND)
192 pm_undo_all(dev);
193 mutex_unlock(&pm_devs_lock);
194 return status;
195 }
196 }
197 entry = entry->next;
198 }
199 mutex_unlock(&pm_devs_lock);
200 return 0;
201}
202
203EXPORT_SYMBOL(pm_register);
204EXPORT_SYMBOL(pm_send_all);
205
diff --git a/kernel/printk.c b/kernel/printk.c
index bdd4ea8c3f2b..8fb01c32aa3b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -111,6 +111,9 @@ struct console_cmdline
111 char name[8]; /* Name of the driver */ 111 char name[8]; /* Name of the driver */
112 int index; /* Minor dev. to use */ 112 int index; /* Minor dev. to use */
113 char *options; /* Options for the driver */ 113 char *options; /* Options for the driver */
114#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
115 char *brl_options; /* Options for braille driver */
116#endif
114}; 117};
115 118
116#define MAX_CMDLINECONSOLES 8 119#define MAX_CMDLINECONSOLES 8
@@ -808,15 +811,60 @@ static void call_console_drivers(unsigned start, unsigned end)
808 811
809#endif 812#endif
810 813
814static int __add_preferred_console(char *name, int idx, char *options,
815 char *brl_options)
816{
817 struct console_cmdline *c;
818 int i;
819
820 /*
821 * See if this tty is not yet registered, and
822 * if we have a slot free.
823 */
824 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
825 if (strcmp(console_cmdline[i].name, name) == 0 &&
826 console_cmdline[i].index == idx) {
827 if (!brl_options)
828 selected_console = i;
829 return 0;
830 }
831 if (i == MAX_CMDLINECONSOLES)
832 return -E2BIG;
833 if (!brl_options)
834 selected_console = i;
835 c = &console_cmdline[i];
836 strlcpy(c->name, name, sizeof(c->name));
837 c->options = options;
838#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
839 c->brl_options = brl_options;
840#endif
841 c->index = idx;
842 return 0;
843}
811/* 844/*
812 * Set up a list of consoles. Called from init/main.c 845 * Set up a list of consoles. Called from init/main.c
813 */ 846 */
814static int __init console_setup(char *str) 847static int __init console_setup(char *str)
815{ 848{
816 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ 849 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
817 char *s, *options; 850 char *s, *options, *brl_options = NULL;
818 int idx; 851 int idx;
819 852
853#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
854 if (!memcmp(str, "brl,", 4)) {
855 brl_options = "";
856 str += 4;
857 } else if (!memcmp(str, "brl=", 4)) {
858 brl_options = str + 4;
859 str = strchr(brl_options, ',');
860 if (!str) {
861 printk(KERN_ERR "need port name after brl=\n");
862 return 1;
863 }
864 *(str++) = 0;
865 }
866#endif
867
820 /* 868 /*
821 * Decode str into name, index, options. 869 * Decode str into name, index, options.
822 */ 870 */
@@ -841,7 +889,7 @@ static int __init console_setup(char *str)
841 idx = simple_strtoul(s, NULL, 10); 889 idx = simple_strtoul(s, NULL, 10);
842 *s = 0; 890 *s = 0;
843 891
844 add_preferred_console(buf, idx, options); 892 __add_preferred_console(buf, idx, options, brl_options);
845 return 1; 893 return 1;
846} 894}
847__setup("console=", console_setup); 895__setup("console=", console_setup);
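A sketch of the new "brl," / "brl=" prefix handling in console_setup(), reduced to a standalone parser: strip the braille options, if any, before the usual name/index/options split. strncmp stands in for the kernel's memcmp, and the sample command-line strings are hypothetical.

#include <stdio.h>
#include <string.h>

static int parse_console_arg(char *str, char **brl_options)
{
        *brl_options = NULL;

        if (!strncmp(str, "brl,", 4)) {
                *brl_options = "";
                str += 4;
        } else if (!strncmp(str, "brl=", 4)) {
                *brl_options = str + 4;
                str = strchr(*brl_options, ',');
                if (!str) {
                        fprintf(stderr, "need port name after brl=\n");
                        return -1;
                }
                *str++ = 0;
        }
        printf("console='%s' brl_options='%s'\n",
               str, *brl_options ? *brl_options : "(none)");
        return 0;
}

int main(void)
{
        char a[] = "brl,ttyS0,115200";
        char b[] = "brl=usb,ttyS0";
        char *brl;

        parse_console_arg(a, &brl);     /* console='ttyS0,115200', brl='' */
        parse_console_arg(b, &brl);     /* console='ttyS0', brl='usb' */
        return 0;
}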
@@ -861,28 +909,7 @@ __setup("console=", console_setup);
861 */ 909 */
862int add_preferred_console(char *name, int idx, char *options) 910int add_preferred_console(char *name, int idx, char *options)
863{ 911{
864 struct console_cmdline *c; 912 return __add_preferred_console(name, idx, options, NULL);
865 int i;
866
867 /*
868 * See if this tty is not yet registered, and
869 * if we have a slot free.
870 */
871 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
872 if (strcmp(console_cmdline[i].name, name) == 0 &&
873 console_cmdline[i].index == idx) {
874 selected_console = i;
875 return 0;
876 }
877 if (i == MAX_CMDLINECONSOLES)
878 return -E2BIG;
879 selected_console = i;
880 c = &console_cmdline[i];
881 memcpy(c->name, name, sizeof(c->name));
882 c->name[sizeof(c->name) - 1] = 0;
883 c->options = options;
884 c->index = idx;
885 return 0;
886} 913}
887 914
888int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) 915int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
@@ -894,7 +921,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
894 if (strcmp(console_cmdline[i].name, name) == 0 && 921 if (strcmp(console_cmdline[i].name, name) == 0 &&
895 console_cmdline[i].index == idx) { 922 console_cmdline[i].index == idx) {
896 c = &console_cmdline[i]; 923 c = &console_cmdline[i];
897 memcpy(c->name, name_new, sizeof(c->name)); 924 strlcpy(c->name, name_new, sizeof(c->name));
898 c->name[sizeof(c->name) - 1] = 0; 925 c->name[sizeof(c->name) - 1] = 0;
899 c->options = options; 926 c->options = options;
900 c->index = idx_new; 927 c->index = idx_new;
@@ -1163,6 +1190,16 @@ void register_console(struct console *console)
1163 continue; 1190 continue;
1164 if (console->index < 0) 1191 if (console->index < 0)
1165 console->index = console_cmdline[i].index; 1192 console->index = console_cmdline[i].index;
1193#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1194 if (console_cmdline[i].brl_options) {
1195 console->flags |= CON_BRL;
1196 braille_register_console(console,
1197 console_cmdline[i].index,
1198 console_cmdline[i].options,
1199 console_cmdline[i].brl_options);
1200 return;
1201 }
1202#endif
1166 if (console->setup && 1203 if (console->setup &&
1167 console->setup(console, console_cmdline[i].options) != 0) 1204 console->setup(console, console_cmdline[i].options) != 0)
1168 break; 1205 break;
@@ -1221,6 +1258,11 @@ int unregister_console(struct console *console)
1221 struct console *a, *b; 1258 struct console *a, *b;
1222 int res = 1; 1259 int res = 1;
1223 1260
1261#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1262 if (console->flags & CON_BRL)
1263 return braille_unregister_console(console);
1264#endif
1265
1224 acquire_console_sem(); 1266 acquire_console_sem();
1225 if (console_drivers == console) { 1267 if (console_drivers == console) {
1226 console_drivers=console->next; 1268 console_drivers=console->next;
@@ -1272,8 +1314,8 @@ late_initcall(disable_boot_consoles);
1272 */ 1314 */
1273void tty_write_message(struct tty_struct *tty, char *msg) 1315void tty_write_message(struct tty_struct *tty, char *msg)
1274{ 1316{
1275 if (tty && tty->driver->write) 1317 if (tty && tty->ops->write)
1276 tty->driver->write(tty, msg, strlen(msg)); 1318 tty->ops->write(tty, msg, strlen(msg));
1277 return; 1319 return;
1278} 1320}
1279 1321
@@ -1287,31 +1329,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1287 */ 1329 */
1288int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1330int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1289{ 1331{
1290 static DEFINE_SPINLOCK(ratelimit_lock); 1332 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1291 static unsigned toks = 10 * 5 * HZ;
1292 static unsigned long last_msg;
1293 static int missed;
1294 unsigned long flags;
1295 unsigned long now = jiffies;
1296
1297 spin_lock_irqsave(&ratelimit_lock, flags);
1298 toks += now - last_msg;
1299 last_msg = now;
1300 if (toks > (ratelimit_burst * ratelimit_jiffies))
1301 toks = ratelimit_burst * ratelimit_jiffies;
1302 if (toks >= ratelimit_jiffies) {
1303 int lost = missed;
1304
1305 missed = 0;
1306 toks -= ratelimit_jiffies;
1307 spin_unlock_irqrestore(&ratelimit_lock, flags);
1308 if (lost)
1309 printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
1310 return 1;
1311 }
1312 missed++;
1313 spin_unlock_irqrestore(&ratelimit_lock, flags);
1314 return 0;
1315} 1333}
1316EXPORT_SYMBOL(__printk_ratelimit); 1334EXPORT_SYMBOL(__printk_ratelimit);
1317 1335
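The open-coded token bucket removed from __printk_ratelimit() now lives in the shared __ratelimit() helper. Its logic, as a userspace sketch that counts in abstract ticks instead of jiffies and starts with an empty bucket rather than the kernel's initial credit:

#include <stdio.h>

static int ratelimit(long now, long interval, long burst)
{
        static long toks;
        static long last;
        static int missed;

        toks += now - last;                     /* earn credit with elapsed time */
        last = now;
        if (toks > burst * interval)
                toks = burst * interval;        /* cap the burst */
        if (toks >= interval) {
                toks -= interval;               /* spend credit on this message */
                if (missed)
                        printf("  %d messages suppressed\n", missed);
                missed = 0;
                return 1;
        }
        missed++;
        return 0;
}

int main(void)
{
        for (long t = 0; t < 20; t++)
                if (ratelimit(t, 5, 2))
                        printf("t=%ld: message printed\n", t);
        return 0;
}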
diff --git a/kernel/profile.c b/kernel/profile.c
index 606d7387265c..ae7ead82cbc9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -587,10 +587,10 @@ static int __init create_proc_profile(void)
587 return 0; 587 return 0;
588 if (create_hash_tables()) 588 if (create_hash_tables())
589 return -1; 589 return -1;
590 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); 590 entry = proc_create("profile", S_IWUSR | S_IRUGO,
591 NULL, &proc_profile_operations);
591 if (!entry) 592 if (!entry)
592 return 0; 593 return 0;
593 entry->proc_fops = &proc_profile_operations;
594 entry->size = (1+prof_len) * sizeof(atomic_t); 594 entry->size = (1+prof_len) * sizeof(atomic_t);
595 hotcpu_notifier(profile_cpu_callback, 0); 595 hotcpu_notifier(profile_cpu_callback, 0);
596 return 0; 596 return 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dac4b4e57293..6c19e94fd0a5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -73,7 +73,7 @@ void __ptrace_unlink(struct task_struct *child)
73 BUG_ON(!child->ptrace); 73 BUG_ON(!child->ptrace);
74 74
75 child->ptrace = 0; 75 child->ptrace = 0;
76 if (!list_empty(&child->ptrace_list)) { 76 if (ptrace_reparented(child)) {
77 list_del_init(&child->ptrace_list); 77 list_del_init(&child->ptrace_list);
78 remove_parent(child); 78 remove_parent(child);
79 child->parent = child->real_parent; 79 child->parent = child->real_parent;
@@ -168,8 +168,6 @@ int ptrace_attach(struct task_struct *task)
168 audit_ptrace(task); 168 audit_ptrace(task);
169 169
170 retval = -EPERM; 170 retval = -EPERM;
171 if (task->pid <= 1)
172 goto out;
173 if (same_thread_group(task, current)) 171 if (same_thread_group(task, current))
174 goto out; 172 goto out;
175 173
@@ -208,8 +206,7 @@ repeat:
208 206
209 __ptrace_link(task, current); 207 __ptrace_link(task, current);
210 208
211 force_sig_specific(SIGSTOP, task); 209 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
212
213bad: 210bad:
214 write_unlock_irqrestore(&tasklist_lock, flags); 211 write_unlock_irqrestore(&tasklist_lock, flags);
215 task_unlock(task); 212 task_unlock(task);
@@ -522,12 +519,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
522{ 519{
523 struct task_struct *child; 520 struct task_struct *child;
524 521
525 /*
526 * Tracing init is not allowed.
527 */
528 if (pid == 1)
529 return ERR_PTR(-EPERM);
530
531 read_lock(&tasklist_lock); 522 read_lock(&tasklist_lock);
532 child = find_task_by_vpid(pid); 523 child = find_task_by_vpid(pid);
533 if (child) 524 if (child)
@@ -543,7 +534,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
543#define arch_ptrace_attach(child) do { } while (0) 534#define arch_ptrace_attach(child) do { } while (0)
544#endif 535#endif
545 536
546#ifndef __ARCH_SYS_PTRACE
547asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 537asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
548{ 538{
549 struct task_struct *child; 539 struct task_struct *child;
@@ -591,7 +581,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
591 unlock_kernel(); 581 unlock_kernel();
592 return ret; 582 return ret;
593} 583}
594#endif /* __ARCH_SYS_PTRACE */
595 584
596int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 585int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
597{ 586{
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 47894f919d4e..33acc424667e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -45,6 +45,7 @@
45#include <linux/byteorder/swabb.h> 45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h>
48 49
49MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/relay.c b/kernel/relay.c
index d6204a485818..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = {
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
67 67
68/*
69 * allocate an array of pointers of struct page
70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{
73 struct page **array;
74 size_t pa_size = n_pages * sizeof(struct page *);
75
76 if (pa_size > PAGE_SIZE) {
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84}
85
86/*
87 * free an array of pointers of struct page
88 */
89static void relay_free_page_array(struct page **array)
90{
91 if (is_vmalloc_addr(array))
92 vfree(array);
93 else
94 kfree(array);
95}
96
68/** 97/**
69 * relay_mmap_buf: - mmap channel buffer to process address space 98 * relay_mmap_buf: - mmap channel buffer to process address space
70 * @buf: relay channel buffer 99 * @buf: relay channel buffer
@@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
109 *size = PAGE_ALIGN(*size); 138 *size = PAGE_ALIGN(*size);
110 n_pages = *size >> PAGE_SHIFT; 139 n_pages = *size >> PAGE_SHIFT;
111 140
112 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); 141 buf->page_array = relay_alloc_page_array(n_pages);
113 if (!buf->page_array) 142 if (!buf->page_array)
114 return NULL; 143 return NULL;
115 144
@@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
130depopulate: 159depopulate:
131 for (j = 0; j < i; j++) 160 for (j = 0; j < i; j++)
132 __free_page(buf->page_array[j]); 161 __free_page(buf->page_array[j]);
133 kfree(buf->page_array); 162 relay_free_page_array(buf->page_array);
134 return NULL; 163 return NULL;
135} 164}
136 165
@@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
189 vunmap(buf->start); 218 vunmap(buf->start);
190 for (i = 0; i < buf->page_count; i++) 219 for (i = 0; i < buf->page_count; i++)
191 __free_page(buf->page_array[i]); 220 __free_page(buf->page_array[i]);
192 kfree(buf->page_array); 221 relay_free_page_array(buf->page_array);
193 } 222 }
194 chan->buf[buf->cpu] = NULL; 223 chan->buf[buf->cpu] = NULL;
195 kfree(buf->padding); 224 kfree(buf->padding);
@@ -1162,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
1162 ret = 0; 1191 ret = 0;
1163 spliced = 0; 1192 spliced = 0;
1164 1193
1165 while (len) { 1194 while (len && !spliced) {
1166 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1167 if (ret < 0) 1196 if (ret < 0)
1168 break; 1197 break;
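A userspace analogue of relay_alloc_page_array(): once the pointer array itself outgrows one page, fall back to a vmalloc-style allocation. calloc()/malloc() stand in for kzalloc()/vmalloc(), and the single free() in main glosses over the kfree()/vfree() split that relay_free_page_array() has to make in the kernel.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

static void *fake_kzalloc(size_t size)
{
        return calloc(1, size);
}

static void *fake_vmalloc(size_t size)
{
        void *p = malloc(size);

        if (p)
                memset(p, 0, size);
        return p;
}

static void **alloc_page_array(unsigned int n_pages, int *vmalloced)
{
        size_t pa_size = n_pages * sizeof(void *);

        /* same policy as the patch: vmalloc once the array exceeds a page */
        *vmalloced = pa_size > PAGE_SIZE;
        return *vmalloced ? fake_vmalloc(pa_size) : fake_kzalloc(pa_size);
}

int main(void)
{
        int vm;
        void **small = alloc_page_array(16, &vm);

        printf("16 pages   -> %s\n", vm ? "vmalloc" : "kzalloc");
        free(small);

        void **big = alloc_page_array(1024, &vm);

        printf("1024 pages -> %s\n", vm ? "vmalloc" : "kzalloc");
        free(big);
        return 0;
}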
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index efbfc0fc232f..d3c61b4ebef2 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,6 +10,7 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
13#include <linux/res_counter.h> 14#include <linux/res_counter.h>
14#include <linux/uaccess.h> 15#include <linux/uaccess.h>
15 16
@@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
27 } 28 }
28 29
29 counter->usage += val; 30 counter->usage += val;
31 if (counter->usage > counter->max_usage)
32 counter->max_usage = counter->usage;
30 return 0; 33 return 0;
31} 34}
32 35
@@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member)
65 switch (member) { 68 switch (member) {
66 case RES_USAGE: 69 case RES_USAGE:
67 return &counter->usage; 70 return &counter->usage;
71 case RES_MAX_USAGE:
72 return &counter->max_usage;
68 case RES_LIMIT: 73 case RES_LIMIT:
69 return &counter->limit; 74 return &counter->limit;
70 case RES_FAILCNT: 75 case RES_FAILCNT:
@@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
92 pos, buf, s - buf); 97 pos, buf, s - buf);
93} 98}
94 99
100u64 res_counter_read_u64(struct res_counter *counter, int member)
101{
102 return *res_counter_member(counter, member);
103}
104
95ssize_t res_counter_write(struct res_counter *counter, int member, 105ssize_t res_counter_write(struct res_counter *counter, int member,
96 const char __user *userbuf, size_t nbytes, loff_t *pos, 106 const char __user *userbuf, size_t nbytes, loff_t *pos,
97 int (*write_strategy)(char *st_buf, unsigned long long *val)) 107 int (*write_strategy)(char *st_buf, unsigned long long *val))
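A mock of the charge path with the new max_usage watermark: every successful charge may raise the high-water mark that RES_MAX_USAGE now exposes, and an uncharge never lowers it. The field types and the -1 error code are simplified stand-ins for the kernel's counter and -ENOMEM.

#include <stdio.h>

struct res_counter {
        unsigned long long usage, max_usage, limit, failcnt;
};

static int res_counter_charge_locked(struct res_counter *c, unsigned long val)
{
        if (c->usage + val > c->limit) {
                c->failcnt++;
                return -1;                      /* -ENOMEM in the kernel */
        }
        c->usage += val;
        if (c->usage > c->max_usage)
                c->max_usage = c->usage;        /* track the watermark */
        return 0;
}

int main(void)
{
        struct res_counter c = { .limit = 100 };

        res_counter_charge_locked(&c, 60);
        res_counter_charge_locked(&c, 30);
        c.usage -= 50;                          /* uncharge */
        printf("usage=%llu max_usage=%llu\n", c.usage, c.max_usage);
        return 0;
}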
diff --git a/kernel/resource.c b/kernel/resource.c
index cee12cc47cab..74af2d7cb5a1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = {
131 131
132static int __init ioresources_init(void) 132static int __init ioresources_init(void)
133{ 133{
134 struct proc_dir_entry *entry; 134 proc_create("ioports", 0, NULL, &proc_ioports_operations);
135 135 proc_create("iomem", 0, NULL, &proc_iomem_operations);
136 entry = create_proc_entry("ioports", 0, NULL);
137 if (entry)
138 entry->proc_fops = &proc_ioports_operations;
139 entry = create_proc_entry("iomem", 0, NULL);
140 if (entry)
141 entry->proc_fops = &proc_iomem_operations;
142 return 0; 136 return 0;
143} 137}
144__initcall(ioresources_init); 138__initcall(ioresources_init);
diff --git a/kernel/sched.c b/kernel/sched.c
index 740fb409e5bb..34bcc5bc120e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8025,7 +8025,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8025 8025
8026 se->my_q = cfs_rq; 8026 se->my_q = cfs_rq;
8027 se->load.weight = tg->shares; 8027 se->load.weight = tg->shares;
8028 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 8028 se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight);
8029 se->parent = parent; 8029 se->parent = parent;
8030} 8030}
8031#endif 8031#endif
@@ -8692,7 +8692,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8692 dequeue_entity(cfs_rq, se, 0); 8692 dequeue_entity(cfs_rq, se, 0);
8693 8693
8694 se->load.weight = shares; 8694 se->load.weight = shares;
8695 se->load.inv_weight = div64_64((1ULL<<32), shares); 8695 se->load.inv_weight = div64_u64((1ULL<<32), shares);
8696 8696
8697 if (on_rq) 8697 if (on_rq)
8698 enqueue_entity(cfs_rq, se, 0); 8698 enqueue_entity(cfs_rq, se, 0);
@@ -8787,7 +8787,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8787 if (runtime == RUNTIME_INF) 8787 if (runtime == RUNTIME_INF)
8788 return 1ULL << 16; 8788 return 1ULL << 16;
8789 8789
8790 return div64_64(runtime << 16, period); 8790 return div64_u64(runtime << 16, period);
8791} 8791}
8792 8792
8793#ifdef CONFIG_CGROUP_SCHED 8793#ifdef CONFIG_CGROUP_SCHED
@@ -9057,13 +9057,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9057} 9057}
9058 9058
9059#ifdef CONFIG_FAIR_GROUP_SCHED 9059#ifdef CONFIG_FAIR_GROUP_SCHED
9060static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 9060static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9061 u64 shareval) 9061 u64 shareval)
9062{ 9062{
9063 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9063 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9064} 9064}
9065 9065
9066static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) 9066static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9067{ 9067{
9068 struct task_group *tg = cgroup_tg(cgrp); 9068 struct task_group *tg = cgroup_tg(cgrp);
9069 9069
@@ -9073,48 +9073,14 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
9073 9073
9074#ifdef CONFIG_RT_GROUP_SCHED 9074#ifdef CONFIG_RT_GROUP_SCHED
9075static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9075static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9076 struct file *file, 9076 s64 val)
9077 const char __user *userbuf,
9078 size_t nbytes, loff_t *unused_ppos)
9079{ 9077{
9080 char buffer[64]; 9078 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9081 int retval = 0;
9082 s64 val;
9083 char *end;
9084
9085 if (!nbytes)
9086 return -EINVAL;
9087 if (nbytes >= sizeof(buffer))
9088 return -E2BIG;
9089 if (copy_from_user(buffer, userbuf, nbytes))
9090 return -EFAULT;
9091
9092 buffer[nbytes] = 0; /* nul-terminate */
9093
9094 /* strip newline if necessary */
9095 if (nbytes && (buffer[nbytes-1] == '\n'))
9096 buffer[nbytes-1] = 0;
9097 val = simple_strtoll(buffer, &end, 0);
9098 if (*end)
9099 return -EINVAL;
9100
9101 /* Pass to subsystem */
9102 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9103 if (!retval)
9104 retval = nbytes;
9105 return retval;
9106} 9079}
9107 9080
9108static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, 9081static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9109 struct file *file,
9110 char __user *buf, size_t nbytes,
9111 loff_t *ppos)
9112{ 9082{
9113 char tmp[64]; 9083 return sched_group_rt_runtime(cgroup_tg(cgrp));
9114 long val = sched_group_rt_runtime(cgroup_tg(cgrp));
9115 int len = sprintf(tmp, "%ld\n", val);
9116
9117 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
9118} 9084}
9119 9085
9120static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 9086static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
@@ -9133,20 +9099,20 @@ static struct cftype cpu_files[] = {
9133#ifdef CONFIG_FAIR_GROUP_SCHED 9099#ifdef CONFIG_FAIR_GROUP_SCHED
9134 { 9100 {
9135 .name = "shares", 9101 .name = "shares",
9136 .read_uint = cpu_shares_read_uint, 9102 .read_u64 = cpu_shares_read_u64,
9137 .write_uint = cpu_shares_write_uint, 9103 .write_u64 = cpu_shares_write_u64,
9138 }, 9104 },
9139#endif 9105#endif
9140#ifdef CONFIG_RT_GROUP_SCHED 9106#ifdef CONFIG_RT_GROUP_SCHED
9141 { 9107 {
9142 .name = "rt_runtime_us", 9108 .name = "rt_runtime_us",
9143 .read = cpu_rt_runtime_read, 9109 .read_s64 = cpu_rt_runtime_read,
9144 .write = cpu_rt_runtime_write, 9110 .write_s64 = cpu_rt_runtime_write,
9145 }, 9111 },
9146 { 9112 {
9147 .name = "rt_period_us", 9113 .name = "rt_period_us",
9148 .read_uint = cpu_rt_period_read_uint, 9114 .read_u64 = cpu_rt_period_read_uint,
9149 .write_uint = cpu_rt_period_write_uint, 9115 .write_u64 = cpu_rt_period_write_uint,
9150 }, 9116 },
9151#endif 9117#endif
9152}; 9118};
@@ -9277,8 +9243,8 @@ out:
9277static struct cftype files[] = { 9243static struct cftype files[] = {
9278 { 9244 {
9279 .name = "usage", 9245 .name = "usage",
9280 .read_uint = cpuusage_read, 9246 .read_u64 = cpuusage_read,
9281 .write_uint = cpuusage_write, 9247 .write_u64 = cpuusage_write,
9282 }, 9248 },
9283}; 9249};
9284 9250
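The div64_64 → div64_u64 call sites compute a fixed-point reciprocal, inv_weight = 2^32 / weight, so that later divisions by the weight can be done as a multiply and a shift. A quick userspace check of that identity; the NICE_0_LOAD-style weight and the delta value are just sample numbers.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t weight = 1024;                       /* NICE_0_LOAD-style weight */
        uint64_t inv_weight = (1ULL << 32) / weight;  /* what div64_u64() computes */
        uint64_t delta = 3000000;                     /* sample runtime delta */

        printf("exact:  %llu\n", (unsigned long long)(delta / weight));
        printf("approx: %llu\n",
               (unsigned long long)((delta * inv_weight) >> 32));
        return 0;
}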
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index f3f4af4b8b0f..6b4a12558e88 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -277,12 +277,9 @@ static int __init init_sched_debug_procfs(void)
277{ 277{
278 struct proc_dir_entry *pe; 278 struct proc_dir_entry *pe;
279 279
280 pe = create_proc_entry("sched_debug", 0644, NULL); 280 pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
281 if (!pe) 281 if (!pe)
282 return -ENOMEM; 282 return -ENOMEM;
283
284 pe->proc_fops = &sched_debug_fops;
285
286 return 0; 283 return 0;
287} 284}
288 285
@@ -360,8 +357,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
360 357
361 avg_per_cpu = p->se.sum_exec_runtime; 358 avg_per_cpu = p->se.sum_exec_runtime;
362 if (p->se.nr_migrations) { 359 if (p->se.nr_migrations) {
363 avg_per_cpu = div64_64(avg_per_cpu, 360 avg_per_cpu = div64_u64(avg_per_cpu,
364 p->se.nr_migrations); 361 p->se.nr_migrations);
365 } else { 362 } else {
366 avg_per_cpu = -1LL; 363 avg_per_cpu = -1LL;
367 } 364 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 64ad0ed15992..72bb4f51f963 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,11 +39,19 @@
39 39
40static struct kmem_cache *sigqueue_cachep; 40static struct kmem_cache *sigqueue_cachep;
41 41
42static int __sig_ignored(struct task_struct *t, int sig)
43{
44 void __user *handler;
45
46 /* Is it explicitly or implicitly ignored? */
47
48 handler = t->sighand->action[sig - 1].sa.sa_handler;
49 return handler == SIG_IGN ||
50 (handler == SIG_DFL && sig_kernel_ignore(sig));
51}
42 52
43static int sig_ignored(struct task_struct *t, int sig) 53static int sig_ignored(struct task_struct *t, int sig)
44{ 54{
45 void __user * handler;
46
47 /* 55 /*
48 * Tracers always want to know about signals.. 56 * Tracers always want to know about signals..
49 */ 57 */
@@ -58,10 +66,7 @@ static int sig_ignored(struct task_struct *t, int sig)
58 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 66 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
59 return 0; 67 return 0;
60 68
61 /* Is it explicitly or implicitly ignored? */ 69 return __sig_ignored(t, sig);
62 handler = t->sighand->action[sig-1].sa.sa_handler;
63 return handler == SIG_IGN ||
64 (handler == SIG_DFL && sig_kernel_ignore(sig));
65} 70}
66 71
67/* 72/*
@@ -372,7 +377,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
372 */ 377 */
373int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 378int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
374{ 379{
375 int signr = 0; 380 int signr;
376 381
377 /* We only dequeue private signals from ourselves, we don't let 382 /* We only dequeue private signals from ourselves, we don't let
378 * signalfd steal them 383 * signalfd steal them
@@ -405,8 +410,12 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
405 } 410 }
406 } 411 }
407 } 412 }
413
408 recalc_sigpending(); 414 recalc_sigpending();
409 if (signr && unlikely(sig_kernel_stop(signr))) { 415 if (!signr)
416 return 0;
417
418 if (unlikely(sig_kernel_stop(signr))) {
410 /* 419 /*
411 * Set a marker that we have dequeued a stop signal. Our 420 * Set a marker that we have dequeued a stop signal. Our
412 * caller might release the siglock and then the pending 421 * caller might release the siglock and then the pending
@@ -422,9 +431,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
422 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 431 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
423 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 432 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
424 } 433 }
425 if (signr && 434 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
426 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
427 info->si_sys_private) {
428 /* 435 /*
429 * Release the siglock to ensure proper locking order 436 * Release the siglock to ensure proper locking order
430 * of timer locks outside of siglocks. Note, we leave 437 * of timer locks outside of siglocks. Note, we leave
@@ -526,21 +533,34 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
526static int check_kill_permission(int sig, struct siginfo *info, 533static int check_kill_permission(int sig, struct siginfo *info,
527 struct task_struct *t) 534 struct task_struct *t)
528{ 535{
529 int error = -EINVAL; 536 struct pid *sid;
537 int error;
538
530 if (!valid_signal(sig)) 539 if (!valid_signal(sig))
531 return error; 540 return -EINVAL;
532 541
533 if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { 542 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
534 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 543 return 0;
535 if (error) 544
536 return error; 545 error = audit_signal_info(sig, t); /* Let audit system see the signal */
537 error = -EPERM; 546 if (error)
538 if (((sig != SIGCONT) ||
539 (task_session_nr(current) != task_session_nr(t)))
540 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
541 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
542 && !capable(CAP_KILL))
543 return error; 547 return error;
548
549 if ((current->euid ^ t->suid) && (current->euid ^ t->uid) &&
550 (current->uid ^ t->suid) && (current->uid ^ t->uid) &&
551 !capable(CAP_KILL)) {
552 switch (sig) {
553 case SIGCONT:
554 sid = task_session(t);
555 /*
556 * We don't return the error if sid == NULL. The
557 * task was unhashed, the caller must notice this.
558 */
559 if (!sid || sid == task_session(current))
560 break;
561 default:
562 return -EPERM;
563 }
544 } 564 }
545 565
546 return security_task_kill(t, info, sig, 0); 566 return security_task_kill(t, info, sig, 0);
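A mock of the reworked permission predicate: an unrelated user may not signal the task unless it holds CAP_KILL, with SIGCONT aimed at a task in the sender's own session as the one exception. The cred struct, integer session ids and the -1 return are simplified stand-ins for task credentials, struct pid sessions and -EPERM.

#include <signal.h>
#include <stdio.h>

struct cred { unsigned uid, euid, suid; int session; int cap_kill; };

static int may_signal(int sig, const struct cred *cur, const struct cred *t)
{
        if ((cur->euid ^ t->suid) && (cur->euid ^ t->uid) &&
            (cur->uid ^ t->suid) && (cur->uid ^ t->uid) && !cur->cap_kill) {
                /* SIGCONT within the sender's own session is still allowed */
                if (!(sig == SIGCONT && cur->session == t->session))
                        return -1;              /* -EPERM in the kernel */
        }
        return 0;
}

int main(void)
{
        struct cred me    = { .uid = 1000, .euid = 1000, .suid = 1000, .session = 7 };
        struct cred other = { .uid = 1001, .euid = 1001, .suid = 1001, .session = 7 };

        printf("SIGTERM: %d\n", may_signal(SIGTERM, &me, &other));  /* denied */
        printf("SIGCONT: %d\n", may_signal(SIGCONT, &me, &other));  /* allowed */
        return 0;
}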
@@ -550,62 +570,44 @@ static int check_kill_permission(int sig, struct siginfo *info,
550static void do_notify_parent_cldstop(struct task_struct *tsk, int why); 570static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
551 571
552/* 572/*
553 * Handle magic process-wide effects of stop/continue signals. 573 * Handle magic process-wide effects of stop/continue signals. Unlike
554 * Unlike the signal actions, these happen immediately at signal-generation 574 * the signal actions, these happen immediately at signal-generation
555 * time regardless of blocking, ignoring, or handling. This does the 575 * time regardless of blocking, ignoring, or handling. This does the
556 * actual continuing for SIGCONT, but not the actual stopping for stop 576 * actual continuing for SIGCONT, but not the actual stopping for stop
557 * signals. The process stop is done as a signal action for SIG_DFL. 577 * signals. The process stop is done as a signal action for SIG_DFL.
578 *
579 * Returns true if the signal should be actually delivered, otherwise
580 * it should be dropped.
558 */ 581 */
559static void handle_stop_signal(int sig, struct task_struct *p) 582static int prepare_signal(int sig, struct task_struct *p)
560{ 583{
584 struct signal_struct *signal = p->signal;
561 struct task_struct *t; 585 struct task_struct *t;
562 586
563 if (p->signal->flags & SIGNAL_GROUP_EXIT) 587 if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
564 /* 588 /*
565 * The process is in the middle of dying already. 589 * The process is in the middle of dying, nothing to do.
566 */ 590 */
567 return; 591 } else if (sig_kernel_stop(sig)) {
568
569 if (sig_kernel_stop(sig)) {
570 /* 592 /*
571 * This is a stop signal. Remove SIGCONT from all queues. 593 * This is a stop signal. Remove SIGCONT from all queues.
572 */ 594 */
573 rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); 595 rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
574 t = p; 596 t = p;
575 do { 597 do {
576 rm_from_queue(sigmask(SIGCONT), &t->pending); 598 rm_from_queue(sigmask(SIGCONT), &t->pending);
577 t = next_thread(t); 599 } while_each_thread(p, t);
578 } while (t != p);
579 } else if (sig == SIGCONT) { 600 } else if (sig == SIGCONT) {
601 unsigned int why;
580 /* 602 /*
581 * Remove all stop signals from all queues, 603 * Remove all stop signals from all queues,
582 * and wake all threads. 604 * and wake all threads.
583 */ 605 */
584 if (unlikely(p->signal->group_stop_count > 0)) { 606 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
585 /*
586 * There was a group stop in progress. We'll
587 * pretend it finished before we got here. We are
588 * obliged to report it to the parent: if the
589 * SIGSTOP happened "after" this SIGCONT, then it
590 * would have cleared this pending SIGCONT. If it
591 * happened "before" this SIGCONT, then the parent
592 * got the SIGCHLD about the stop finishing before
593 * the continue happened. We do the notification
594 * now, and it's as if the stop had finished and
595 * the SIGCHLD was pending on entry to this kill.
596 */
597 p->signal->group_stop_count = 0;
598 p->signal->flags = SIGNAL_STOP_CONTINUED;
599 spin_unlock(&p->sighand->siglock);
600 do_notify_parent_cldstop(p, CLD_STOPPED);
601 spin_lock(&p->sighand->siglock);
602 }
603 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
604 t = p; 607 t = p;
605 do { 608 do {
606 unsigned int state; 609 unsigned int state;
607 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 610 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
608
609 /* 611 /*
610 * If there is a handler for SIGCONT, we must make 612 * If there is a handler for SIGCONT, we must make
611 * sure that no thread returns to user mode before 613 * sure that no thread returns to user mode before
@@ -615,7 +617,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
615 * running the handler. With the TIF_SIGPENDING 617 * running the handler. With the TIF_SIGPENDING
616 * flag set, the thread will pause and acquire the 618 * flag set, the thread will pause and acquire the
617 * siglock that we hold now and until we've queued 619 * siglock that we hold now and until we've queued
618 * the pending signal. 620 * the pending signal.
619 * 621 *
620 * Wake up the stopped thread _after_ setting 622 * Wake up the stopped thread _after_ setting
621 * TIF_SIGPENDING 623 * TIF_SIGPENDING
@@ -626,49 +628,163 @@ static void handle_stop_signal(int sig, struct task_struct *p)
626 state |= TASK_INTERRUPTIBLE; 628 state |= TASK_INTERRUPTIBLE;
627 } 629 }
628 wake_up_state(t, state); 630 wake_up_state(t, state);
631 } while_each_thread(p, t);
629 632
630 t = next_thread(t); 633 /*
631 } while (t != p); 634 * Notify the parent with CLD_CONTINUED if we were stopped.
635 *
636 * If we were in the middle of a group stop, we pretend it
637 * was already finished, and then continued. Since SIGCHLD
638 * doesn't queue we report only CLD_STOPPED, as if the next
639 * CLD_CONTINUED was dropped.
640 */
641 why = 0;
642 if (signal->flags & SIGNAL_STOP_STOPPED)
643 why |= SIGNAL_CLD_CONTINUED;
644 else if (signal->group_stop_count)
645 why |= SIGNAL_CLD_STOPPED;
632 646
633 if (p->signal->flags & SIGNAL_STOP_STOPPED) { 647 if (why) {
634 /* 648 /*
635 * We were in fact stopped, and are now continued. 649 * The first thread which returns from finish_stop()
636 * Notify the parent with CLD_CONTINUED. 650 * will take ->siglock, notice SIGNAL_CLD_MASK, and
651 * notify its parent. See get_signal_to_deliver().
637 */ 652 */
638 p->signal->flags = SIGNAL_STOP_CONTINUED; 653 signal->flags = why | SIGNAL_STOP_CONTINUED;
639 p->signal->group_exit_code = 0; 654 signal->group_stop_count = 0;
640 spin_unlock(&p->sighand->siglock); 655 signal->group_exit_code = 0;
641 do_notify_parent_cldstop(p, CLD_CONTINUED);
642 spin_lock(&p->sighand->siglock);
643 } else { 656 } else {
644 /* 657 /*
645 * We are not stopped, but there could be a stop 658 * We are not stopped, but there could be a stop
646 * signal in the middle of being processed after 659 * signal in the middle of being processed after
647 * being removed from the queue. Clear that too. 660 * being removed from the queue. Clear that too.
648 */ 661 */
649 p->signal->flags = 0; 662 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
650 } 663 }
651 } else if (sig == SIGKILL) { 664 }
665
666 return !sig_ignored(p, sig);
667}
668
669/*
670 * Test if P wants to take SIG. After we've checked all threads with this,
671 * it's equivalent to finding no threads not blocking SIG. Any threads not
672 * blocking SIG were ruled out because they are not running and already
673 * have pending signals. Such threads will dequeue from the shared queue
674 * as soon as they're available, so putting the signal on the shared queue
675 * will be equivalent to sending it to one such thread.
676 */
677static inline int wants_signal(int sig, struct task_struct *p)
678{
679 if (sigismember(&p->blocked, sig))
680 return 0;
681 if (p->flags & PF_EXITING)
682 return 0;
683 if (sig == SIGKILL)
684 return 1;
685 if (task_is_stopped_or_traced(p))
686 return 0;
687 return task_curr(p) || !signal_pending(p);
688}
689
690static void complete_signal(int sig, struct task_struct *p, int group)
691{
692 struct signal_struct *signal = p->signal;
693 struct task_struct *t;
694
695 /*
696 * Now find a thread we can wake up to take the signal off the queue.
697 *
698 * If the main thread wants the signal, it gets first crack.
699 * Probably the least surprising to the average bear.
700 */
701 if (wants_signal(sig, p))
702 t = p;
703 else if (!group || thread_group_empty(p))
704 /*
705 * There is just one thread and it does not need to be woken.
706 * It will dequeue unblocked signals before it runs again.
707 */
708 return;
709 else {
652 /* 710 /*
653 * Make sure that any pending stop signal already dequeued 711 * Otherwise try to find a suitable thread.
654 * is undone by the wakeup for SIGKILL.
655 */ 712 */
656 p->signal->flags = 0; 713 t = signal->curr_target;
714 while (!wants_signal(sig, t)) {
715 t = next_thread(t);
716 if (t == signal->curr_target)
717 /*
718 * No thread needs to be woken.
719 * Any eligible threads will see
720 * the signal in the queue soon.
721 */
722 return;
723 }
724 signal->curr_target = t;
657 } 725 }
726
727 /*
728 * Found a killable thread. If the signal will be fatal,
729 * then start taking the whole group down immediately.
730 */
731 if (sig_fatal(p, sig) &&
732 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
733 !sigismember(&t->real_blocked, sig) &&
734 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
735 /*
736 * This signal will be fatal to the whole group.
737 */
738 if (!sig_kernel_coredump(sig)) {
739 /*
740 * Start a group exit and wake everybody up.
741 * This way we don't have other threads
742 * running and doing things after a slower
743 * thread has the fatal signal pending.
744 */
745 signal->flags = SIGNAL_GROUP_EXIT;
746 signal->group_exit_code = sig;
747 signal->group_stop_count = 0;
748 t = p;
749 do {
750 sigaddset(&t->pending.signal, SIGKILL);
751 signal_wake_up(t, 1);
752 } while_each_thread(p, t);
753 return;
754 }
755 }
756
757 /*
758 * The signal is already in the shared-pending queue.
759 * Tell the chosen thread to wake up and dequeue it.
760 */
761 signal_wake_up(t, sig == SIGKILL);
762 return;
763}
764
765static inline int legacy_queue(struct sigpending *signals, int sig)
766{
767 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
658} 768}
659 769
660static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 770static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
661 struct sigpending *signals) 771 int group)
662{ 772{
663 struct sigqueue * q = NULL; 773 struct sigpending *pending;
664 int ret = 0; 774 struct sigqueue *q;
775
776 assert_spin_locked(&t->sighand->siglock);
777 if (!prepare_signal(sig, t))
778 return 0;
665 779
780 pending = group ? &t->signal->shared_pending : &t->pending;
666 /* 781 /*
667 * Deliver the signal to listening signalfds. This must be called 782 * Short-circuit ignored signals and support queuing
668 * with the sighand lock held. 783 * exactly one non-rt signal, so that we can get more
784 * detailed information about the cause of the signal.
669 */ 785 */
670 signalfd_notify(t, sig); 786 if (legacy_queue(pending, sig))
671 787 return 0;
672 /* 788 /*
673 * fast-pathed signals for kernel-internal things like SIGSTOP 789 * fast-pathed signals for kernel-internal things like SIGSTOP
674 * or SIGKILL. 790 * or SIGKILL.
@@ -688,7 +804,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
688 (is_si_special(info) || 804 (is_si_special(info) ||
689 info->si_code >= 0))); 805 info->si_code >= 0)));
690 if (q) { 806 if (q) {
691 list_add_tail(&q->list, &signals->list); 807 list_add_tail(&q->list, &pending->list);
692 switch ((unsigned long) info) { 808 switch ((unsigned long) info) {
693 case (unsigned long) SEND_SIG_NOINFO: 809 case (unsigned long) SEND_SIG_NOINFO:
694 q->info.si_signo = sig; 810 q->info.si_signo = sig;
@@ -718,13 +834,12 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
718 } 834 }
719 835
720out_set: 836out_set:
721 sigaddset(&signals->signal, sig); 837 signalfd_notify(t, sig);
722 return ret; 838 sigaddset(&pending->signal, sig);
839 complete_signal(sig, t, group);
840 return 0;
723} 841}
724 842
725#define LEGACY_QUEUE(sigptr, sig) \
726 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
727
728int print_fatal_signals; 843int print_fatal_signals;
729 844
730static void print_fatal_signal(struct pt_regs *regs, int signr) 845static void print_fatal_signal(struct pt_regs *regs, int signr)
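The LEGACY_QUEUE() macro removed here survives as legacy_queue() inside send_signal(): a blocked non-RT signal is recorded at most once, while RT signals queue. That behaviour is observable from userspace; a small demonstration with SIGUSR1 versus SIGRTMIN, sending each twice while blocked and counting deliveries on unblock.

#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t hits;

static void handler(int sig)
{
        (void)sig;
        hits++;
}

static int count_deliveries(int sig)
{
        union sigval val = { .sival_int = 0 };
        sigset_t set;

        hits = 0;
        signal(sig, handler);
        sigemptyset(&set);
        sigaddset(&set, sig);
        sigprocmask(SIG_BLOCK, &set, NULL);
        sigqueue(getpid(), sig, val);           /* sent twice while blocked... */
        sigqueue(getpid(), sig, val);
        sigprocmask(SIG_UNBLOCK, &set, NULL);   /* ...delivered here */
        return hits;
}

int main(void)
{
        printf("SIGUSR1:  %d delivery(ies)\n", count_deliveries(SIGUSR1));
        printf("SIGRTMIN: %d delivery(ies)\n", count_deliveries(SIGRTMIN));
        return 0;
}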
@@ -757,29 +872,16 @@ static int __init setup_print_fatal_signals(char *str)
757 872
758__setup("print-fatal-signals=", setup_print_fatal_signals); 873__setup("print-fatal-signals=", setup_print_fatal_signals);
759 874
875int
876__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
877{
878 return send_signal(sig, info, p, 1);
879}
880
760static int 881static int
761specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) 882specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
762{ 883{
763 int ret = 0; 884 return send_signal(sig, info, t, 0);
764
765 BUG_ON(!irqs_disabled());
766 assert_spin_locked(&t->sighand->siglock);
767
768 /* Short-circuit ignored signals. */
769 if (sig_ignored(t, sig))
770 goto out;
771
772 /* Support queueing exactly one non-rt signal, so that we
773 can get more detailed information about the cause of
774 the signal. */
775 if (LEGACY_QUEUE(&t->pending, sig))
776 goto out;
777
778 ret = send_signal(sig, info, t, &t->pending);
779 if (!ret && !sigismember(&t->blocked, sig))
780 signal_wake_up(t, sig == SIGKILL);
781out:
782 return ret;
783} 885}
784 886
785/* 887/*
@@ -790,7 +892,8 @@ out:
790 * since we do not want to have a signal handler that was blocked 892 * since we do not want to have a signal handler that was blocked
791 * be invoked when user space had explicitly blocked it. 893 * be invoked when user space had explicitly blocked it.
792 * 894 *
793 * We don't want to have recursive SIGSEGV's etc, for example. 895 * We don't want to have recursive SIGSEGV's etc, for example,
896 * that is why we also clear SIGNAL_UNKILLABLE.
794 */ 897 */
795int 898int
796force_sig_info(int sig, struct siginfo *info, struct task_struct *t) 899force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
@@ -810,6 +913,8 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
810 recalc_sigpending_and_wake(t); 913 recalc_sigpending_and_wake(t);
811 } 914 }
812 } 915 }
916 if (action->sa.sa_handler == SIG_DFL)
917 t->signal->flags &= ~SIGNAL_UNKILLABLE;
813 ret = specific_send_sig_info(sig, info, t); 918 ret = specific_send_sig_info(sig, info, t);
814 spin_unlock_irqrestore(&t->sighand->siglock, flags); 919 spin_unlock_irqrestore(&t->sighand->siglock, flags);
815 920
@@ -823,134 +928,6 @@ force_sig_specific(int sig, struct task_struct *t)
823} 928}
824 929
825/* 930/*
826 * Test if P wants to take SIG. After we've checked all threads with this,
827 * it's equivalent to finding no threads not blocking SIG. Any threads not
828 * blocking SIG were ruled out because they are not running and already
829 * have pending signals. Such threads will dequeue from the shared queue
830 * as soon as they're available, so putting the signal on the shared queue
831 * will be equivalent to sending it to one such thread.
832 */
833static inline int wants_signal(int sig, struct task_struct *p)
834{
835 if (sigismember(&p->blocked, sig))
836 return 0;
837 if (p->flags & PF_EXITING)
838 return 0;
839 if (sig == SIGKILL)
840 return 1;
841 if (task_is_stopped_or_traced(p))
842 return 0;
843 return task_curr(p) || !signal_pending(p);
844}
845
846static void
847__group_complete_signal(int sig, struct task_struct *p)
848{
849 struct task_struct *t;
850
851 /*
852 * Now find a thread we can wake up to take the signal off the queue.
853 *
854 * If the main thread wants the signal, it gets first crack.
855 * Probably the least surprising to the average bear.
856 */
857 if (wants_signal(sig, p))
858 t = p;
859 else if (thread_group_empty(p))
860 /*
861 * There is just one thread and it does not need to be woken.
862 * It will dequeue unblocked signals before it runs again.
863 */
864 return;
865 else {
866 /*
867 * Otherwise try to find a suitable thread.
868 */
869 t = p->signal->curr_target;
870 if (t == NULL)
871 /* restart balancing at this thread */
872 t = p->signal->curr_target = p;
873
874 while (!wants_signal(sig, t)) {
875 t = next_thread(t);
876 if (t == p->signal->curr_target)
877 /*
878 * No thread needs to be woken.
879 * Any eligible threads will see
880 * the signal in the queue soon.
881 */
882 return;
883 }
884 p->signal->curr_target = t;
885 }
886
887 /*
888 * Found a killable thread. If the signal will be fatal,
889 * then start taking the whole group down immediately.
890 */
891 if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) &&
892 !sigismember(&t->real_blocked, sig) &&
893 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
894 /*
895 * This signal will be fatal to the whole group.
896 */
897 if (!sig_kernel_coredump(sig)) {
898 /*
899 * Start a group exit and wake everybody up.
900 * This way we don't have other threads
901 * running and doing things after a slower
902 * thread has the fatal signal pending.
903 */
904 p->signal->flags = SIGNAL_GROUP_EXIT;
905 p->signal->group_exit_code = sig;
906 p->signal->group_stop_count = 0;
907 t = p;
908 do {
909 sigaddset(&t->pending.signal, SIGKILL);
910 signal_wake_up(t, 1);
911 } while_each_thread(p, t);
912 return;
913 }
914 }
915
916 /*
917 * The signal is already in the shared-pending queue.
918 * Tell the chosen thread to wake up and dequeue it.
919 */
920 signal_wake_up(t, sig == SIGKILL);
921 return;
922}
923
924int
925__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
926{
927 int ret = 0;
928
929 assert_spin_locked(&p->sighand->siglock);
930 handle_stop_signal(sig, p);
931
932 /* Short-circuit ignored signals. */
933 if (sig_ignored(p, sig))
934 return ret;
935
936 if (LEGACY_QUEUE(&p->signal->shared_pending, sig))
937 /* This is a non-RT signal and we already have one queued. */
938 return ret;
939
940 /*
941 * Put this signal on the shared-pending queue, or fail with EAGAIN.
942 * We always use the shared queue for process-wide signals,
943 * to avoid several races.
944 */
945 ret = send_signal(sig, info, p, &p->signal->shared_pending);
946 if (unlikely(ret))
947 return ret;
948
949 __group_complete_signal(sig, p);
950 return 0;
951}
952
953/*
954 * Nuke all other threads in the group. 931 * Nuke all other threads in the group.
955 */ 932 */
956void zap_other_threads(struct task_struct *p) 933void zap_other_threads(struct task_struct *p)
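Note: the bulk removal above is the flip side of the new complete_signal() path called from send_signal(): the wants_signal()-style round-robin walk over signal->curr_target survives there, it just no longer has to be duplicated per call site. Below is a minimal user-space sketch of that selection policy, using plain arrays and bitmasks instead of the kernel's task lists (struct thread, pick_target and friends are illustrative names, not kernel APIs).

#include <stdio.h>

/* Illustrative model of a thread: a blocked-signal mask and an "exiting" flag. */
struct thread {
    unsigned long long blocked;   /* bit N set => signal N is blocked */
    int exiting;
};

static int wants_signal(int sig, const struct thread *t)
{
    if (t->blocked & (1ULL << sig))
        return 0;
    if (t->exiting)
        return 0;
    return 1;
}

/*
 * Round-robin pick: start at *curr, return the index of the first thread
 * willing to take the signal, or -1 if every thread blocks it.
 */
static int pick_target(int sig, struct thread *g, int n, int *curr)
{
    int i, t = *curr;

    for (i = 0; i < n; i++, t = (t + 1) % n) {
        if (wants_signal(sig, &g[t])) {
            *curr = t;
            return t;
        }
    }
    return -1;
}

int main(void)
{
    struct thread group[3] = {
        { .blocked = 1ULL << 10 },   /* blocks signal 10 */
        { .blocked = 1ULL << 10 },
        { .blocked = 0 },            /* will take it */
    };
    int curr = 0;

    printf("signal 10 goes to thread %d\n", pick_target(10, group, 3, &curr));
    printf("signal 12 goes to thread %d\n", pick_target(12, group, 3, &curr));
    return 0;
}

Only the blocked-mask and exiting checks are modelled here; the real wants_signal() also special-cases SIGKILL and stopped/traced tasks.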
@@ -978,13 +955,11 @@ int __fatal_signal_pending(struct task_struct *tsk)
978} 955}
979EXPORT_SYMBOL(__fatal_signal_pending); 956EXPORT_SYMBOL(__fatal_signal_pending);
980 957
981/*
982 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
983 */
984struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 958struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
985{ 959{
986 struct sighand_struct *sighand; 960 struct sighand_struct *sighand;
987 961
962 rcu_read_lock();
988 for (;;) { 963 for (;;) {
989 sighand = rcu_dereference(tsk->sighand); 964 sighand = rcu_dereference(tsk->sighand);
990 if (unlikely(sighand == NULL)) 965 if (unlikely(sighand == NULL))
@@ -995,6 +970,7 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
995 break; 970 break;
996 spin_unlock_irqrestore(&sighand->siglock, *flags); 971 spin_unlock_irqrestore(&sighand->siglock, *flags);
997 } 972 }
973 rcu_read_unlock();
998 974
999 return sighand; 975 return sighand;
1000} 976}
@@ -1043,9 +1019,6 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1043 struct task_struct *p; 1019 struct task_struct *p;
1044 1020
1045 rcu_read_lock(); 1021 rcu_read_lock();
1046 if (unlikely(sig_needs_tasklist(sig)))
1047 read_lock(&tasklist_lock);
1048
1049retry: 1022retry:
1050 p = pid_task(pid, PIDTYPE_PID); 1023 p = pid_task(pid, PIDTYPE_PID);
1051 if (p) { 1024 if (p) {
@@ -1059,10 +1032,8 @@ retry:
1059 */ 1032 */
1060 goto retry; 1033 goto retry;
1061 } 1034 }
1062
1063 if (unlikely(sig_needs_tasklist(sig)))
1064 read_unlock(&tasklist_lock);
1065 rcu_read_unlock(); 1035 rcu_read_unlock();
1036
1066 return error; 1037 return error;
1067} 1038}
1068 1039
@@ -1159,8 +1130,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1159 */ 1130 */
1160 1131
1161/* 1132/*
1162 * These two are the most common entry points. They send a signal 1133 * The caller must ensure the task can't exit.
1163 * just to the specific thread.
1164 */ 1134 */
1165int 1135int
1166send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1136send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
@@ -1175,17 +1145,9 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1175 if (!valid_signal(sig)) 1145 if (!valid_signal(sig))
1176 return -EINVAL; 1146 return -EINVAL;
1177 1147
1178 /*
1179 * We need the tasklist lock even for the specific
1180 * thread case (when we don't need to follow the group
1181 * lists) in order to avoid races with "p->sighand"
1182 * going away or changing from under us.
1183 */
1184 read_lock(&tasklist_lock);
1185 spin_lock_irqsave(&p->sighand->siglock, flags); 1148 spin_lock_irqsave(&p->sighand->siglock, flags);
1186 ret = specific_send_sig_info(sig, info, p); 1149 ret = specific_send_sig_info(sig, info, p);
1187 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1150 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1188 read_unlock(&tasklist_lock);
1189 return ret; 1151 return ret;
1190} 1152}
1191 1153
@@ -1291,28 +1253,24 @@ void sigqueue_free(struct sigqueue *q)
1291 __sigqueue_free(q); 1253 __sigqueue_free(q);
1292} 1254}
1293 1255
1294int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) 1256int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1295{ 1257{
1258 int sig = q->info.si_signo;
1259 struct sigpending *pending;
1296 unsigned long flags; 1260 unsigned long flags;
1297 int ret = 0; 1261 int ret;
1298 1262
1299 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1263 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1300 1264
1301 /* 1265 ret = -1;
1302 * The rcu based delayed sighand destroy makes it possible to 1266 if (!likely(lock_task_sighand(t, &flags)))
1303 * run this without tasklist lock held. The task struct itself 1267 goto ret;
1304 * cannot go away as create_timer did get_task_struct().
1305 *
1306 * We return -1, when the task is marked exiting, so
1307 * posix_timer_event can redirect it to the group leader
1308 */
1309 rcu_read_lock();
1310 1268
1311 if (!likely(lock_task_sighand(p, &flags))) { 1269 ret = 1; /* the signal is ignored */
1312 ret = -1; 1270 if (!prepare_signal(sig, t))
1313 goto out_err; 1271 goto out;
1314 }
1315 1272
1273 ret = 0;
1316 if (unlikely(!list_empty(&q->list))) { 1274 if (unlikely(!list_empty(&q->list))) {
1317 /* 1275 /*
1318 * If an SI_TIMER entry is already queued just increment 1276
@@ -1322,77 +1280,15 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1322 q->info.si_overrun++; 1280 q->info.si_overrun++;
1323 goto out; 1281 goto out;
1324 } 1282 }
1325 /* Short-circuit ignored signals. */
1326 if (sig_ignored(p, sig)) {
1327 ret = 1;
1328 goto out;
1329 }
1330 /*
1331 * Deliver the signal to listening signalfds. This must be called
1332 * with the sighand lock held.
1333 */
1334 signalfd_notify(p, sig);
1335
1336 list_add_tail(&q->list, &p->pending.list);
1337 sigaddset(&p->pending.signal, sig);
1338 if (!sigismember(&p->blocked, sig))
1339 signal_wake_up(p, sig == SIGKILL);
1340
1341out:
1342 unlock_task_sighand(p, &flags);
1343out_err:
1344 rcu_read_unlock();
1345
1346 return ret;
1347}
1348
1349int
1350send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1351{
1352 unsigned long flags;
1353 int ret = 0;
1354
1355 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1356
1357 read_lock(&tasklist_lock);
1358 /* Since it_lock is held, p->sighand cannot be NULL. */
1359 spin_lock_irqsave(&p->sighand->siglock, flags);
1360 handle_stop_signal(sig, p);
1361
1362 /* Short-circuit ignored signals. */
1363 if (sig_ignored(p, sig)) {
1364 ret = 1;
1365 goto out;
1366 }
1367 1283
1368 if (unlikely(!list_empty(&q->list))) { 1284 signalfd_notify(t, sig);
1369 /* 1285 pending = group ? &t->signal->shared_pending : &t->pending;
1370 * If an SI_TIMER entry is already queue just increment 1286 list_add_tail(&q->list, &pending->list);
1371 * the overrun count. Other uses should not try to 1287 sigaddset(&pending->signal, sig);
1372 * send the signal multiple times. 1288 complete_signal(sig, t, group);
1373 */
1374 BUG_ON(q->info.si_code != SI_TIMER);
1375 q->info.si_overrun++;
1376 goto out;
1377 }
1378 /*
1379 * Deliver the signal to listening signalfds. This must be called
1380 * with the sighand lock held.
1381 */
1382 signalfd_notify(p, sig);
1383
1384 /*
1385 * Put this signal on the shared-pending queue.
1386 * We always use the shared queue for process-wide signals,
1387 * to avoid several races.
1388 */
1389 list_add_tail(&q->list, &p->signal->shared_pending.list);
1390 sigaddset(&p->signal->shared_pending.signal, sig);
1391
1392 __group_complete_signal(sig, p);
1393out: 1289out:
1394 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1290 unlock_task_sighand(t, &flags);
1395 read_unlock(&tasklist_lock); 1291ret:
1396 return ret; 1292 return ret;
1397} 1293}
1398 1294
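Note: send_sigqueue() and send_group_sigqueue() collapse into one helper that picks the pending list with the group flag and, for a preallocated SI_TIMER entry that is still queued, only bumps si_overrun. A rough user-space model of that queue-once/count-overruns behaviour follows (the struct names and fields are invented for the sketch, not the kernel's sigqueue).

#include <stdio.h>

/* Illustrative stand-in for a preallocated timer signal entry. */
struct sigqueue_model {
    int sig;
    int overrun;
    int queued;   /* already on a pending list? */
};

struct pending_model {
    struct sigqueue_model *entries[16];
    int count;
};

/* Queue the entry on the chosen list, or just count an overrun if it is
 * already queued -- mirroring the SI_TIMER special case above. */
static void send_queued(struct sigqueue_model *q, struct pending_model *priv,
                        struct pending_model *shared, int group)
{
    struct pending_model *pending = group ? shared : priv;

    if (q->queued) {
        q->overrun++;
        return;
    }
    pending->entries[pending->count++] = q;
    q->queued = 1;
}

int main(void)
{
    struct pending_model priv = { .count = 0 }, shared = { .count = 0 };
    struct sigqueue_model timer_sig = { .sig = 34 };

    send_queued(&timer_sig, &priv, &shared, 1);   /* lands on shared list */
    send_queued(&timer_sig, &priv, &shared, 1);   /* already queued: overrun */
    printf("shared=%d private=%d overrun=%d\n",
           shared.count, priv.count, timer_sig.overrun);
    return 0;
}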
@@ -1723,8 +1619,9 @@ static int do_signal_stop(int signr)
1723 } else { 1619 } else {
1724 struct task_struct *t; 1620 struct task_struct *t;
1725 1621
1726 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1622 if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
1727 unlikely(sig->group_exit_task)) 1623 != SIGNAL_STOP_DEQUEUED) ||
1624 unlikely(signal_group_exit(sig)))
1728 return 0; 1625 return 0;
1729 /* 1626 /*
1730 * There is no group stop already in progress. 1627 * There is no group stop already in progress.
@@ -1799,8 +1696,9 @@ static int ptrace_signal(int signr, siginfo_t *info,
1799int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 1696int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1800 struct pt_regs *regs, void *cookie) 1697 struct pt_regs *regs, void *cookie)
1801{ 1698{
1802 sigset_t *mask = &current->blocked; 1699 struct sighand_struct *sighand = current->sighand;
1803 int signr = 0; 1700 struct signal_struct *signal = current->signal;
1701 int signr;
1804 1702
1805relock: 1703relock:
1806 /* 1704 /*
@@ -1811,16 +1709,32 @@ relock:
1811 */ 1709 */
1812 try_to_freeze(); 1710 try_to_freeze();
1813 1711
1814 spin_lock_irq(&current->sighand->siglock); 1712 spin_lock_irq(&sighand->siglock);
1713 /*
1714 * Every stopped thread goes here after wakeup. Check to see if
1715 * we should notify the parent, prepare_signal(SIGCONT) encodes
1716 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1717 */
1718 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1719 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1720 ? CLD_CONTINUED : CLD_STOPPED;
1721 signal->flags &= ~SIGNAL_CLD_MASK;
1722 spin_unlock_irq(&sighand->siglock);
1723
1724 read_lock(&tasklist_lock);
1725 do_notify_parent_cldstop(current->group_leader, why);
1726 read_unlock(&tasklist_lock);
1727 goto relock;
1728 }
1729
1815 for (;;) { 1730 for (;;) {
1816 struct k_sigaction *ka; 1731 struct k_sigaction *ka;
1817 1732
1818 if (unlikely(current->signal->group_stop_count > 0) && 1733 if (unlikely(signal->group_stop_count > 0) &&
1819 do_signal_stop(0)) 1734 do_signal_stop(0))
1820 goto relock; 1735 goto relock;
1821 1736
1822 signr = dequeue_signal(current, mask, info); 1737 signr = dequeue_signal(current, &current->blocked, info);
1823
1824 if (!signr) 1738 if (!signr)
1825 break; /* will return 0 */ 1739 break; /* will return 0 */
1826 1740
@@ -1830,7 +1744,7 @@ relock:
1830 continue; 1744 continue;
1831 } 1745 }
1832 1746
1833 ka = &current->sighand->action[signr-1]; 1747 ka = &sighand->action[signr-1];
1834 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1748 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1835 continue; 1749 continue;
1836 if (ka->sa.sa_handler != SIG_DFL) { 1750 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1852,7 +1766,8 @@ relock:
1852 /* 1766 /*
1853 * Global init gets no signals it doesn't want. 1767 * Global init gets no signals it doesn't want.
1854 */ 1768 */
1855 if (is_global_init(current)) 1769 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1770 !signal_group_exit(signal))
1856 continue; 1771 continue;
1857 1772
1858 if (sig_kernel_stop(signr)) { 1773 if (sig_kernel_stop(signr)) {
@@ -1867,14 +1782,14 @@ relock:
1867 * We need to check for that and bail out if necessary. 1782 * We need to check for that and bail out if necessary.
1868 */ 1783 */
1869 if (signr != SIGSTOP) { 1784 if (signr != SIGSTOP) {
1870 spin_unlock_irq(&current->sighand->siglock); 1785 spin_unlock_irq(&sighand->siglock);
1871 1786
1872 /* signals can be posted during this window */ 1787 /* signals can be posted during this window */
1873 1788
1874 if (is_current_pgrp_orphaned()) 1789 if (is_current_pgrp_orphaned())
1875 goto relock; 1790 goto relock;
1876 1791
1877 spin_lock_irq(&current->sighand->siglock); 1792 spin_lock_irq(&sighand->siglock);
1878 } 1793 }
1879 1794
1880 if (likely(do_signal_stop(signr))) { 1795 if (likely(do_signal_stop(signr))) {
@@ -1889,15 +1804,16 @@ relock:
1889 continue; 1804 continue;
1890 } 1805 }
1891 1806
1892 spin_unlock_irq(&current->sighand->siglock); 1807 spin_unlock_irq(&sighand->siglock);
1893 1808
1894 /* 1809 /*
1895 * Anything else is fatal, maybe with a core dump. 1810 * Anything else is fatal, maybe with a core dump.
1896 */ 1811 */
1897 current->flags |= PF_SIGNALED; 1812 current->flags |= PF_SIGNALED;
1898 if ((signr != SIGKILL) && print_fatal_signals) 1813
1899 print_fatal_signal(regs, signr);
1900 if (sig_kernel_coredump(signr)) { 1814 if (sig_kernel_coredump(signr)) {
1815 if (print_fatal_signals)
1816 print_fatal_signal(regs, signr);
1901 /* 1817 /*
1902 * If it was able to dump core, this kills all 1818 * If it was able to dump core, this kills all
1903 * other threads in the group and synchronizes with 1819 * other threads in the group and synchronizes with
@@ -1915,7 +1831,7 @@ relock:
1915 do_group_exit(signr); 1831 do_group_exit(signr);
1916 /* NOTREACHED */ 1832 /* NOTREACHED */
1917 } 1833 }
1918 spin_unlock_irq(&current->sighand->siglock); 1834 spin_unlock_irq(&sighand->siglock);
1919 return signr; 1835 return signr;
1920} 1836}
1921 1837
@@ -2259,6 +2175,7 @@ static int do_tkill(int tgid, int pid, int sig)
2259 int error; 2175 int error;
2260 struct siginfo info; 2176 struct siginfo info;
2261 struct task_struct *p; 2177 struct task_struct *p;
2178 unsigned long flags;
2262 2179
2263 error = -ESRCH; 2180 error = -ESRCH;
2264 info.si_signo = sig; 2181 info.si_signo = sig;
@@ -2267,22 +2184,24 @@ static int do_tkill(int tgid, int pid, int sig)
2267 info.si_pid = task_tgid_vnr(current); 2184 info.si_pid = task_tgid_vnr(current);
2268 info.si_uid = current->uid; 2185 info.si_uid = current->uid;
2269 2186
2270 read_lock(&tasklist_lock); 2187 rcu_read_lock();
2271 p = find_task_by_vpid(pid); 2188 p = find_task_by_vpid(pid);
2272 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2189 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2273 error = check_kill_permission(sig, &info, p); 2190 error = check_kill_permission(sig, &info, p);
2274 /* 2191 /*
2275 * The null signal is a permissions and process existence 2192 * The null signal is a permissions and process existence
2276 * probe. No signal is actually delivered. 2193 * probe. No signal is actually delivered.
2194 *
2195 * If lock_task_sighand() fails we pretend the task dies
2196 * after receiving the signal. The window is tiny, and the
2197 * signal is private anyway.
2277 */ 2198 */
2278 if (!error && sig && p->sighand) { 2199 if (!error && sig && lock_task_sighand(p, &flags)) {
2279 spin_lock_irq(&p->sighand->siglock);
2280 handle_stop_signal(sig, p);
2281 error = specific_send_sig_info(sig, &info, p); 2200 error = specific_send_sig_info(sig, &info, p);
2282 spin_unlock_irq(&p->sighand->siglock); 2201 unlock_task_sighand(p, &flags);
2283 } 2202 }
2284 } 2203 }
2285 read_unlock(&tasklist_lock); 2204 rcu_read_unlock();
2286 2205
2287 return error; 2206 return error;
2288} 2207}
@@ -2339,13 +2258,14 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2339 2258
2340int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2259int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2341{ 2260{
2261 struct task_struct *t = current;
2342 struct k_sigaction *k; 2262 struct k_sigaction *k;
2343 sigset_t mask; 2263 sigset_t mask;
2344 2264
2345 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 2265 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
2346 return -EINVAL; 2266 return -EINVAL;
2347 2267
2348 k = &current->sighand->action[sig-1]; 2268 k = &t->sighand->action[sig-1];
2349 2269
2350 spin_lock_irq(&current->sighand->siglock); 2270 spin_lock_irq(&current->sighand->siglock);
2351 if (oact) 2271 if (oact)
@@ -2366,9 +2286,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2366 * (for example, SIGCHLD), shall cause the pending signal to 2286 * (for example, SIGCHLD), shall cause the pending signal to
2367 * be discarded, whether or not it is blocked" 2287 * be discarded, whether or not it is blocked"
2368 */ 2288 */
2369 if (act->sa.sa_handler == SIG_IGN || 2289 if (__sig_ignored(t, sig)) {
2370 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2371 struct task_struct *t = current;
2372 sigemptyset(&mask); 2290 sigemptyset(&mask);
2373 sigaddset(&mask, sig); 2291 sigaddset(&mask, sig);
2374 rm_from_queue_full(&mask, &t->signal->shared_pending); 2292 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2623,7 +2541,7 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2623 2541
2624 current->state = TASK_INTERRUPTIBLE; 2542 current->state = TASK_INTERRUPTIBLE;
2625 schedule(); 2543 schedule();
2626 set_thread_flag(TIF_RESTORE_SIGMASK); 2544 set_restore_sigmask();
2627 return -ERESTARTNOHAND; 2545 return -ERESTARTNOHAND;
2628} 2546}
2629#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 2547#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
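Note: throughout the rewritten signal.c the old LEGACY_QUEUE() macro becomes the legacy_queue() helper, but the rule it encodes is unchanged: signals below SIGRTMIN coalesce, so at most one instance sits in a pending set, while real-time signals are queued per occurrence. A stand-alone illustration of that coalescing check (the helper below is a sketch of the same test, not the kernel function):

#include <stdio.h>
#include <signal.h>

/* A non-RT signal that is already pending is simply dropped. */
static int legacy_queue(const sigset_t *pending, int sig)
{
    return sig < SIGRTMIN && sigismember(pending, sig);
}

int main(void)
{
    sigset_t pending;
    int dropped = 0;

    sigemptyset(&pending);

    /* Send SIGCHLD twice, then an RT signal twice. */
    int sends[] = { SIGCHLD, SIGCHLD, SIGRTMIN, SIGRTMIN };
    for (unsigned i = 0; i < sizeof(sends) / sizeof(sends[0]); i++) {
        int sig = sends[i];
        if (legacy_queue(&pending, sig)) {
            dropped++;      /* second SIGCHLD is coalesced */
            continue;
        }
        sigaddset(&pending, sig);
    }
    printf("dropped %d duplicate non-RT signal(s)\n", dropped);
    return 0;
}

In the real kernel the second RT signal would get its own sigqueue entry; the sigset bit here only marks presence, which is all the coalescing test needs.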
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3c44956ee7e2..36e061740047 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -589,16 +589,20 @@ static void takeover_tasklets(unsigned int cpu)
589 local_irq_disable(); 589 local_irq_disable();
590 590
591 /* Find end, append list for that CPU. */ 591 /* Find end, append list for that CPU. */
592 *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head; 592 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
593 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 593 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head;
594 per_cpu(tasklet_vec, cpu).head = NULL; 594 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
595 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 595 per_cpu(tasklet_vec, cpu).head = NULL;
596 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
597 }
596 raise_softirq_irqoff(TASKLET_SOFTIRQ); 598 raise_softirq_irqoff(TASKLET_SOFTIRQ);
597 599
598 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 600 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
599 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 601 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
600 per_cpu(tasklet_hi_vec, cpu).head = NULL; 602 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
601 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 603 per_cpu(tasklet_hi_vec, cpu).head = NULL;
604 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
605 }
602 raise_softirq_irqoff(HI_SOFTIRQ); 606 raise_softirq_irqoff(HI_SOFTIRQ);
603 607
604 local_irq_enable(); 608 local_irq_enable();
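Note: the takeover_tasklets() fix only splices a dead CPU's list when it is non-empty. An empty per-CPU tasklet list has head == NULL and tail == &head, so an unconditional splice would leave this CPU's tail pointing into the other CPU's per-cpu area. The same head-plus-tail-pointer pattern in a self-contained sketch (list_takeover() below is illustrative, not the kernel code):

#include <stdio.h>

struct node {
    struct node *next;
    int val;
};

/* Head pointer plus a pointer to the last "next" field, as tasklet_vec does. */
struct list {
    struct node *head;
    struct node **tail;
};

static void list_init(struct list *l)
{
    l->head = NULL;
    l->tail = &l->head;
}

static void list_append(struct list *l, struct node *n)
{
    n->next = NULL;
    *l->tail = n;
    l->tail = &n->next;
}

/* Splice src onto dst -- but only if src actually has entries, otherwise
 * dst->tail would end up pointing at src->head. */
static void list_takeover(struct list *dst, struct list *src)
{
    if (src->tail != &src->head) {
        *dst->tail = src->head;
        dst->tail = src->tail;
        src->head = NULL;
        src->tail = &src->head;
    }
}

int main(void)
{
    struct list a, b;
    struct node n1 = { .val = 1 }, n2 = { .val = 2 };

    list_init(&a);
    list_init(&b);
    list_append(&a, &n1);
    list_takeover(&a, &b);   /* b is empty: nothing to do */
    list_append(&b, &n2);
    list_takeover(&a, &b);   /* moves n2 behind n1 */

    for (struct node *n = a.head; n; n = n->next)
        printf("%d\n", n->val);
    return 0;
}

Dropping the emptiness check reproduces the bug in this sketch: after the first (empty) takeover, a's tail would point at b.head, so later appends to a would silently land on b.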
diff --git a/kernel/sys.c b/kernel/sys.c
index f2a451366953..895d2d4c9493 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -978,8 +978,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
978 goto out; 978 goto out;
979 979
980 if (task_pgrp(p) != pgrp) { 980 if (task_pgrp(p) != pgrp) {
981 detach_pid(p, PIDTYPE_PGID); 981 change_pid(p, PIDTYPE_PGID, pgrp);
982 attach_pid(p, PIDTYPE_PGID, pgrp);
983 set_task_pgrp(p, pid_nr(pgrp)); 982 set_task_pgrp(p, pid_nr(pgrp));
984 } 983 }
985 984
@@ -992,54 +991,67 @@ out:
992 991
993asmlinkage long sys_getpgid(pid_t pid) 992asmlinkage long sys_getpgid(pid_t pid)
994{ 993{
994 struct task_struct *p;
995 struct pid *grp;
996 int retval;
997
998 rcu_read_lock();
995 if (!pid) 999 if (!pid)
996 return task_pgrp_vnr(current); 1000 grp = task_pgrp(current);
997 else { 1001 else {
998 int retval;
999 struct task_struct *p;
1000
1001 read_lock(&tasklist_lock);
1002 p = find_task_by_vpid(pid);
1003 retval = -ESRCH; 1002 retval = -ESRCH;
1004 if (p) { 1003 p = find_task_by_vpid(pid);
1005 retval = security_task_getpgid(p); 1004 if (!p)
1006 if (!retval) 1005 goto out;
1007 retval = task_pgrp_vnr(p); 1006 grp = task_pgrp(p);
1008 } 1007 if (!grp)
1009 read_unlock(&tasklist_lock); 1008 goto out;
1010 return retval; 1009
1010 retval = security_task_getpgid(p);
1011 if (retval)
1012 goto out;
1011 } 1013 }
1014 retval = pid_vnr(grp);
1015out:
1016 rcu_read_unlock();
1017 return retval;
1012} 1018}
1013 1019
1014#ifdef __ARCH_WANT_SYS_GETPGRP 1020#ifdef __ARCH_WANT_SYS_GETPGRP
1015 1021
1016asmlinkage long sys_getpgrp(void) 1022asmlinkage long sys_getpgrp(void)
1017{ 1023{
1018 /* SMP - assuming writes are word atomic this is fine */ 1024 return sys_getpgid(0);
1019 return task_pgrp_vnr(current);
1020} 1025}
1021 1026
1022#endif 1027#endif
1023 1028
1024asmlinkage long sys_getsid(pid_t pid) 1029asmlinkage long sys_getsid(pid_t pid)
1025{ 1030{
1031 struct task_struct *p;
1032 struct pid *sid;
1033 int retval;
1034
1035 rcu_read_lock();
1026 if (!pid) 1036 if (!pid)
1027 return task_session_vnr(current); 1037 sid = task_session(current);
1028 else { 1038 else {
1029 int retval;
1030 struct task_struct *p;
1031
1032 rcu_read_lock();
1033 p = find_task_by_vpid(pid);
1034 retval = -ESRCH; 1039 retval = -ESRCH;
1035 if (p) { 1040 p = find_task_by_vpid(pid);
1036 retval = security_task_getsid(p); 1041 if (!p)
1037 if (!retval) 1042 goto out;
1038 retval = task_session_vnr(p); 1043 sid = task_session(p);
1039 } 1044 if (!sid)
1040 rcu_read_unlock(); 1045 goto out;
1041 return retval; 1046
1047 retval = security_task_getsid(p);
1048 if (retval)
1049 goto out;
1042 } 1050 }
1051 retval = pid_vnr(sid);
1052out:
1053 rcu_read_unlock();
1054 return retval;
1043} 1055}
1044 1056
1045asmlinkage long sys_setsid(void) 1057asmlinkage long sys_setsid(void)
@@ -1545,6 +1557,19 @@ out:
1545 * 1557 *
1546 */ 1558 */
1547 1559
1560static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
1561 cputime_t *utimep, cputime_t *stimep)
1562{
1563 *utimep = cputime_add(*utimep, t->utime);
1564 *stimep = cputime_add(*stimep, t->stime);
1565 r->ru_nvcsw += t->nvcsw;
1566 r->ru_nivcsw += t->nivcsw;
1567 r->ru_minflt += t->min_flt;
1568 r->ru_majflt += t->maj_flt;
1569 r->ru_inblock += task_io_get_inblock(t);
1570 r->ru_oublock += task_io_get_oublock(t);
1571}
1572
1548static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1573static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1549{ 1574{
1550 struct task_struct *t; 1575 struct task_struct *t;
@@ -1554,12 +1579,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1554 memset((char *) r, 0, sizeof *r); 1579 memset((char *) r, 0, sizeof *r);
1555 utime = stime = cputime_zero; 1580 utime = stime = cputime_zero;
1556 1581
1557 rcu_read_lock(); 1582 if (who == RUSAGE_THREAD) {
1558 if (!lock_task_sighand(p, &flags)) { 1583 accumulate_thread_rusage(p, r, &utime, &stime);
1559 rcu_read_unlock(); 1584 goto out;
1560 return;
1561 } 1585 }
1562 1586
1587 if (!lock_task_sighand(p, &flags))
1588 return;
1589
1563 switch (who) { 1590 switch (who) {
1564 case RUSAGE_BOTH: 1591 case RUSAGE_BOTH:
1565 case RUSAGE_CHILDREN: 1592 case RUSAGE_CHILDREN:
@@ -1586,14 +1613,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1586 r->ru_oublock += p->signal->oublock; 1613 r->ru_oublock += p->signal->oublock;
1587 t = p; 1614 t = p;
1588 do { 1615 do {
1589 utime = cputime_add(utime, t->utime); 1616 accumulate_thread_rusage(t, r, &utime, &stime);
1590 stime = cputime_add(stime, t->stime);
1591 r->ru_nvcsw += t->nvcsw;
1592 r->ru_nivcsw += t->nivcsw;
1593 r->ru_minflt += t->min_flt;
1594 r->ru_majflt += t->maj_flt;
1595 r->ru_inblock += task_io_get_inblock(t);
1596 r->ru_oublock += task_io_get_oublock(t);
1597 t = next_thread(t); 1617 t = next_thread(t);
1598 } while (t != p); 1618 } while (t != p);
1599 break; 1619 break;
@@ -1601,10 +1621,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1601 default: 1621 default:
1602 BUG(); 1622 BUG();
1603 } 1623 }
1604
1605 unlock_task_sighand(p, &flags); 1624 unlock_task_sighand(p, &flags);
1606 rcu_read_unlock();
1607 1625
1626out:
1608 cputime_to_timeval(utime, &r->ru_utime); 1627 cputime_to_timeval(utime, &r->ru_utime);
1609 cputime_to_timeval(stime, &r->ru_stime); 1628 cputime_to_timeval(stime, &r->ru_stime);
1610} 1629}
@@ -1618,7 +1637,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1618 1637
1619asmlinkage long sys_getrusage(int who, struct rusage __user *ru) 1638asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
1620{ 1639{
1621 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) 1640 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1641 who != RUSAGE_THREAD)
1622 return -EINVAL; 1642 return -EINVAL;
1623 return getrusage(current, who, ru); 1643 return getrusage(current, who, ru);
1624} 1644}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fd3364827ccf..d7ffdc59816a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -38,6 +38,7 @@
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/initrd.h> 40#include <linux/initrd.h>
41#include <linux/key.h>
41#include <linux/times.h> 42#include <linux/times.h>
42#include <linux/limits.h> 43#include <linux/limits.h>
43#include <linux/dcache.h> 44#include <linux/dcache.h>
@@ -144,12 +145,6 @@ extern int no_unaligned_warning;
144extern int max_lock_depth; 145extern int max_lock_depth;
145#endif 146#endif
146 147
147#ifdef CONFIG_SYSCTL_SYSCALL
148static int parse_table(int __user *, int, void __user *, size_t __user *,
149 void __user *, size_t, struct ctl_table *);
150#endif
151
152
153#ifdef CONFIG_PROC_SYSCTL 148#ifdef CONFIG_PROC_SYSCTL
154static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 149static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
155 void __user *buffer, size_t *lenp, loff_t *ppos); 150 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -809,6 +804,14 @@ static struct ctl_table kern_table[] = {
809 .proc_handler = &proc_dostring, 804 .proc_handler = &proc_dostring,
810 .strategy = &sysctl_string, 805 .strategy = &sysctl_string,
811 }, 806 },
807#ifdef CONFIG_KEYS
808 {
809 .ctl_name = CTL_UNNUMBERED,
810 .procname = "keys",
811 .mode = 0555,
812 .child = key_sysctls,
813 },
814#endif
812/* 815/*
813 * NOTE: do not add new entries to this table unless you have read 816 * NOTE: do not add new entries to this table unless you have read
814 * Documentation/sysctl/ctl_unnumbered.txt 817 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1430,6 +1433,76 @@ void register_sysctl_root(struct ctl_table_root *root)
1430} 1433}
1431 1434
1432#ifdef CONFIG_SYSCTL_SYSCALL 1435#ifdef CONFIG_SYSCTL_SYSCALL
1436/* Perform the actual read/write of a sysctl table entry. */
1437static int do_sysctl_strategy(struct ctl_table_root *root,
1438 struct ctl_table *table,
1439 int __user *name, int nlen,
1440 void __user *oldval, size_t __user *oldlenp,
1441 void __user *newval, size_t newlen)
1442{
1443 int op = 0, rc;
1444
1445 if (oldval)
1446 op |= 004;
1447 if (newval)
1448 op |= 002;
1449 if (sysctl_perm(root, table, op))
1450 return -EPERM;
1451
1452 if (table->strategy) {
1453 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1454 newval, newlen);
1455 if (rc < 0)
1456 return rc;
1457 if (rc > 0)
1458 return 0;
1459 }
1460
1461 /* If there is no strategy routine, or if the strategy returns
1462 * zero, proceed with automatic r/w */
1463 if (table->data && table->maxlen) {
1464 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1465 newval, newlen);
1466 if (rc < 0)
1467 return rc;
1468 }
1469 return 0;
1470}
1471
1472static int parse_table(int __user *name, int nlen,
1473 void __user *oldval, size_t __user *oldlenp,
1474 void __user *newval, size_t newlen,
1475 struct ctl_table_root *root,
1476 struct ctl_table *table)
1477{
1478 int n;
1479repeat:
1480 if (!nlen)
1481 return -ENOTDIR;
1482 if (get_user(n, name))
1483 return -EFAULT;
1484 for ( ; table->ctl_name || table->procname; table++) {
1485 if (!table->ctl_name)
1486 continue;
1487 if (n == table->ctl_name) {
1488 int error;
1489 if (table->child) {
1490 if (sysctl_perm(root, table, 001))
1491 return -EPERM;
1492 name++;
1493 nlen--;
1494 table = table->child;
1495 goto repeat;
1496 }
1497 error = do_sysctl_strategy(root, table, name, nlen,
1498 oldval, oldlenp,
1499 newval, newlen);
1500 return error;
1501 }
1502 }
1503 return -ENOTDIR;
1504}
1505
1433int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1506int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1434 void __user *newval, size_t newlen) 1507 void __user *newval, size_t newlen)
1435{ 1508{
@@ -1447,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1447 for (head = sysctl_head_next(NULL); head; 1520 for (head = sysctl_head_next(NULL); head;
1448 head = sysctl_head_next(head)) { 1521 head = sysctl_head_next(head)) {
1449 error = parse_table(name, nlen, oldval, oldlenp, 1522 error = parse_table(name, nlen, oldval, oldlenp,
1450 newval, newlen, head->ctl_table); 1523 newval, newlen,
1524 head->root, head->ctl_table);
1451 if (error != -ENOTDIR) { 1525 if (error != -ENOTDIR) {
1452 sysctl_head_finish(head); 1526 sysctl_head_finish(head);
1453 break; 1527 break;
@@ -1493,84 +1567,22 @@ static int test_perm(int mode, int op)
1493 return -EACCES; 1567 return -EACCES;
1494} 1568}
1495 1569
1496int sysctl_perm(struct ctl_table *table, int op) 1570int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1497{ 1571{
1498 int error; 1572 int error;
1573 int mode;
1574
1499 error = security_sysctl(table, op); 1575 error = security_sysctl(table, op);
1500 if (error) 1576 if (error)
1501 return error; 1577 return error;
1502 return test_perm(table->mode, op);
1503}
1504
1505#ifdef CONFIG_SYSCTL_SYSCALL
1506static int parse_table(int __user *name, int nlen,
1507 void __user *oldval, size_t __user *oldlenp,
1508 void __user *newval, size_t newlen,
1509 struct ctl_table *table)
1510{
1511 int n;
1512repeat:
1513 if (!nlen)
1514 return -ENOTDIR;
1515 if (get_user(n, name))
1516 return -EFAULT;
1517 for ( ; table->ctl_name || table->procname; table++) {
1518 if (!table->ctl_name)
1519 continue;
1520 if (n == table->ctl_name) {
1521 int error;
1522 if (table->child) {
1523 if (sysctl_perm(table, 001))
1524 return -EPERM;
1525 name++;
1526 nlen--;
1527 table = table->child;
1528 goto repeat;
1529 }
1530 error = do_sysctl_strategy(table, name, nlen,
1531 oldval, oldlenp,
1532 newval, newlen);
1533 return error;
1534 }
1535 }
1536 return -ENOTDIR;
1537}
1538 1578
1539/* Perform the actual read/write of a sysctl table entry. */ 1579 if (root->permissions)
1540int do_sysctl_strategy (struct ctl_table *table, 1580 mode = root->permissions(root, current->nsproxy, table);
1541 int __user *name, int nlen, 1581 else
1542 void __user *oldval, size_t __user *oldlenp, 1582 mode = table->mode;
1543 void __user *newval, size_t newlen)
1544{
1545 int op = 0, rc;
1546
1547 if (oldval)
1548 op |= 004;
1549 if (newval)
1550 op |= 002;
1551 if (sysctl_perm(table, op))
1552 return -EPERM;
1553 1583
1554 if (table->strategy) { 1584 return test_perm(mode, op);
1555 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1556 newval, newlen);
1557 if (rc < 0)
1558 return rc;
1559 if (rc > 0)
1560 return 0;
1561 }
1562
1563 /* If there is no strategy routine, or if the strategy returns
1564 * zero, proceed with automatic r/w */
1565 if (table->data && table->maxlen) {
1566 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1567 newval, newlen);
1568 if (rc < 0)
1569 return rc;
1570 }
1571 return 0;
1572} 1585}
1573#endif /* CONFIG_SYSCTL_SYSCALL */
1574 1586
1575static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1587static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1576{ 1588{
@@ -1583,9 +1595,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1583 1595
1584static __init int sysctl_init(void) 1596static __init int sysctl_init(void)
1585{ 1597{
1586 int err;
1587 sysctl_set_parent(NULL, root_table); 1598 sysctl_set_parent(NULL, root_table);
1588 err = sysctl_check_table(current->nsproxy, root_table); 1599#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1600 {
1601 int err;
1602 err = sysctl_check_table(current->nsproxy, root_table);
1603 }
1604#endif
1589 return 0; 1605 return 0;
1590} 1606}
1591 1607
@@ -1712,10 +1728,12 @@ struct ctl_table_header *__register_sysctl_paths(
1712 header->unregistering = NULL; 1728 header->unregistering = NULL;
1713 header->root = root; 1729 header->root = root;
1714 sysctl_set_parent(NULL, header->ctl_table); 1730 sysctl_set_parent(NULL, header->ctl_table);
1731#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1715 if (sysctl_check_table(namespaces, header->ctl_table)) { 1732 if (sysctl_check_table(namespaces, header->ctl_table)) {
1716 kfree(header); 1733 kfree(header);
1717 return NULL; 1734 return NULL;
1718 } 1735 }
1736#endif
1719 spin_lock(&sysctl_lock); 1737 spin_lock(&sysctl_lock);
1720 header_list = lookup_header_list(root, namespaces); 1738 header_list = lookup_header_list(root, namespaces);
1721 list_add_tail(&header->ctl_entry, header_list); 1739 list_add_tail(&header->ctl_entry, header_list);
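Note: sysctl_perm() now takes the ctl_table_root so a root, such as a per-namespace one, can supply a permissions callback that computes the effective mode before the usual bit test against the requested op (004 read, 002 write, 001 directory search). A compact user-space model of that lookup order, with a simplified callback signature and a test_perm() that pretends the caller owns the entry (all names here are illustrative):

#include <stdio.h>

struct table_model;

/* A "root" may provide a hook that decides the effective mode. */
struct root_model {
    int (*permissions)(const struct table_model *table);
};

struct table_model {
    const char *name;
    int mode;   /* octal rwx bits, as in sysctl tables */
};

/* 004 = read, 002 = write, 001 = search. */
static int test_perm(int mode, int op)
{
    mode >>= 6;   /* pretend we are the owner, as root would be */
    return (mode & op & 007) == op ? 0 : -1;
}

static int sysctl_perm(const struct root_model *root,
                       const struct table_model *table, int op)
{
    int mode = root->permissions ? root->permissions(table) : table->mode;

    return test_perm(mode, op);
}

/* Example override: a read-only view of every table under this root. */
static int readonly_view(const struct table_model *table)
{
    return table->mode & 0444;
}

int main(void)
{
    struct table_model t = { "demo", 0644 };
    struct root_model plain = { NULL };
    struct root_model ro = { readonly_view };

    printf("write via plain root: %d\n", sysctl_perm(&plain, &t, 002));
    printf("write via ro root:    %d\n", sysctl_perm(&ro, &t, 002));
    return 0;
}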
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 07e86a828073..4a23517169a6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -183,7 +183,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
183 183
184 if (!tsk) { 184 if (!tsk) {
185 rcu_read_lock(); 185 rcu_read_lock();
186 tsk = find_task_by_pid(pid); 186 tsk = find_task_by_vpid(pid);
187 if (tsk) 187 if (tsk)
188 get_task_struct(tsk); 188 get_task_struct(tsk);
189 rcu_read_unlock(); 189 rcu_read_unlock();
@@ -230,7 +230,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
230 */ 230 */
231 rcu_read_lock(); 231 rcu_read_lock();
232 if (!first) 232 if (!first)
233 first = find_task_by_pid(tgid); 233 first = find_task_by_vpid(tgid);
234 234
235 if (!first || !lock_task_sighand(first, &flags)) 235 if (!first || !lock_task_sighand(first, &flags))
236 goto out; 236 goto out;
@@ -547,7 +547,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
547 if (!stats) 547 if (!stats)
548 goto err; 548 goto err;
549 549
550 rc = fill_pid(tsk->pid, tsk, stats); 550 rc = fill_pid(-1, tsk, stats);
551 if (rc < 0) 551 if (rc < 0)
552 goto err; 552 goto err;
553 553
diff --git a/kernel/time.c b/kernel/time.c
index 35d373a98782..cbe0d5a222ff 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,6 +35,8 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40#include <asm/unistd.h> 42#include <asm/unistd.h>
@@ -390,13 +392,17 @@ EXPORT_SYMBOL(set_normalized_timespec);
390struct timespec ns_to_timespec(const s64 nsec) 392struct timespec ns_to_timespec(const s64 nsec)
391{ 393{
392 struct timespec ts; 394 struct timespec ts;
395 s32 rem;
393 396
394 if (!nsec) 397 if (!nsec)
395 return (struct timespec) {0, 0}; 398 return (struct timespec) {0, 0};
396 399
397 ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); 400 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
398 if (unlikely(nsec < 0)) 401 if (unlikely(rem < 0)) {
399 set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); 402 ts.tv_sec--;
403 rem += NSEC_PER_SEC;
404 }
405 ts.tv_nsec = rem;
400 406
401 return ts; 407 return ts;
402} 408}
@@ -526,8 +532,10 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
526 * Convert jiffies to nanoseconds and separate with 532 * Convert jiffies to nanoseconds and separate with
527 * one divide. 533 * one divide.
528 */ 534 */
529 u64 nsec = (u64)jiffies * TICK_NSEC; 535 u32 rem;
530 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); 536 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
537 NSEC_PER_SEC, &rem);
538 value->tv_nsec = rem;
531} 539}
532EXPORT_SYMBOL(jiffies_to_timespec); 540EXPORT_SYMBOL(jiffies_to_timespec);
533 541
@@ -565,12 +573,11 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
565 * Convert jiffies to nanoseconds and separate with 573 * Convert jiffies to nanoseconds and separate with
566 * one divide. 574 * one divide.
567 */ 575 */
568 u64 nsec = (u64)jiffies * TICK_NSEC; 576 u32 rem;
569 long tv_usec;
570 577
571 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); 578 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
572 tv_usec /= NSEC_PER_USEC; 579 NSEC_PER_SEC, &rem);
573 value->tv_usec = tv_usec; 580 value->tv_usec = rem / NSEC_PER_USEC;
574} 581}
575EXPORT_SYMBOL(jiffies_to_timeval); 582EXPORT_SYMBOL(jiffies_to_timeval);
576 583
@@ -586,9 +593,7 @@ clock_t jiffies_to_clock_t(long x)
586 return x / (HZ / USER_HZ); 593 return x / (HZ / USER_HZ);
587# endif 594# endif
588#else 595#else
589 u64 tmp = (u64)x * TICK_NSEC; 596 return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
590 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
591 return (long)tmp;
592#endif 597#endif
593} 598}
594EXPORT_SYMBOL(jiffies_to_clock_t); 599EXPORT_SYMBOL(jiffies_to_clock_t);
@@ -600,16 +605,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
600 return ~0UL; 605 return ~0UL;
601 return x * (HZ / USER_HZ); 606 return x * (HZ / USER_HZ);
602#else 607#else
603 u64 jif;
604
605 /* Don't worry about loss of precision here .. */ 608 /* Don't worry about loss of precision here .. */
606 if (x >= ~0UL / HZ * USER_HZ) 609 if (x >= ~0UL / HZ * USER_HZ)
607 return ~0UL; 610 return ~0UL;
608 611
609 /* .. but do try to contain it here */ 612 /* .. but do try to contain it here */
610 jif = x * (u64) HZ; 613 return div_u64((u64)x * HZ, USER_HZ);
611 do_div(jif, USER_HZ);
612 return jif;
613#endif 614#endif
614} 615}
615EXPORT_SYMBOL(clock_t_to_jiffies); 616EXPORT_SYMBOL(clock_t_to_jiffies);
@@ -618,10 +619,9 @@ u64 jiffies_64_to_clock_t(u64 x)
618{ 619{
619#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 620#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
620# if HZ < USER_HZ 621# if HZ < USER_HZ
621 x *= USER_HZ; 622 x = div_u64(x * USER_HZ, HZ);
622 do_div(x, HZ);
623# elif HZ > USER_HZ 623# elif HZ > USER_HZ
624 do_div(x, HZ / USER_HZ); 624 x = div_u64(x, HZ / USER_HZ);
625# else 625# else
626 /* Nothing to do */ 626 /* Nothing to do */
627# endif 627# endif
@@ -631,8 +631,7 @@ u64 jiffies_64_to_clock_t(u64 x)
631 * but even this doesn't overflow in hundreds of years 631 * but even this doesn't overflow in hundreds of years
632 * in 64 bits, so.. 632 * in 64 bits, so..
633 */ 633 */
634 x *= TICK_NSEC; 634 x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
635 do_div(x, (NSEC_PER_SEC / USER_HZ));
636#endif 635#endif
637 return x; 636 return x;
638} 637}
@@ -641,21 +640,17 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t);
641u64 nsec_to_clock_t(u64 x) 640u64 nsec_to_clock_t(u64 x)
642{ 641{
643#if (NSEC_PER_SEC % USER_HZ) == 0 642#if (NSEC_PER_SEC % USER_HZ) == 0
644 do_div(x, (NSEC_PER_SEC / USER_HZ)); 643 return div_u64(x, NSEC_PER_SEC / USER_HZ);
645#elif (USER_HZ % 512) == 0 644#elif (USER_HZ % 512) == 0
646 x *= USER_HZ/512; 645 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
647 do_div(x, (NSEC_PER_SEC / 512));
648#else 646#else
649 /* 647 /*
650 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, 648 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
651 * overflow after 64.99 years. 649 * overflow after 64.99 years.
652 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 650 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
653 */ 651 */
654 x *= 9; 652 return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
655 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
656 USER_HZ));
657#endif 653#endif
658 return x;
659} 654}
660 655
661#if (BITS_PER_LONG < 64) 656#if (BITS_PER_LONG < 64)
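Note: the time.c conversions switch from open-coded do_div() sequences to the div_u64()/div_s64_rem() helpers from linux/math64.h; the one subtlety is that signed division can hand back a negative remainder, which ns_to_timespec() folds into the previous second. A stand-alone version of that normalization, with plain C division standing in for div_s64_rem():

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define NSEC_PER_SEC 1000000000LL

struct ts {
    int64_t tv_sec;
    long tv_nsec;
};

/* C99 division truncates toward zero, so for negative input the remainder
 * is negative; fold it back into the previous second, as ns_to_timespec does. */
static struct ts ns_to_ts(int64_t nsec)
{
    struct ts t;
    int64_t rem;

    t.tv_sec = nsec / NSEC_PER_SEC;
    rem = nsec % NSEC_PER_SEC;
    if (rem < 0) {
        t.tv_sec--;
        rem += NSEC_PER_SEC;
    }
    t.tv_nsec = (long)rem;
    return t;
}

int main(void)
{
    struct ts a = ns_to_ts(1500000000LL);   /* 1.5 s */
    struct ts b = ns_to_ts(-1);             /* 1 ns before the epoch:
                                               tv_sec = -1, tv_nsec = 999999999 */

    printf("%" PRId64 " s + %09ld ns\n", a.tv_sec, a.tv_nsec);
    printf("%" PRId64 " s + %09ld ns\n", b.tv_sec, b.tv_nsec);
    return 0;
}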
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5fd9b9469770..5125ddd8196b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,7 +15,8 @@
15#include <linux/jiffies.h> 15#include <linux/jiffies.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/capability.h> 17#include <linux/capability.h>
18#include <asm/div64.h> 18#include <linux/math64.h>
19#include <linux/clocksource.h>
19#include <asm/timex.h> 20#include <asm/timex.h>
20 21
21/* 22/*
@@ -23,11 +24,14 @@
23 */ 24 */
24unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ 25unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
25unsigned long tick_nsec; /* ACTHZ period (nsec) */ 26unsigned long tick_nsec; /* ACTHZ period (nsec) */
26static u64 tick_length, tick_length_base; 27u64 tick_length;
28static u64 tick_length_base;
29
30static struct hrtimer leap_timer;
27 31
28#define MAX_TICKADJ 500 /* microsecs */ 32#define MAX_TICKADJ 500 /* microsecs */
29#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 33#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
30 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) 34 NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
31 35
32/* 36/*
33 * phase-lock loop variables 37 * phase-lock loop variables
@@ -35,11 +39,12 @@ static u64 tick_length, tick_length_base;
35/* TIME_ERROR prevents overwriting the CMOS clock */ 39/* TIME_ERROR prevents overwriting the CMOS clock */
36static int time_state = TIME_OK; /* clock synchronization status */ 40static int time_state = TIME_OK; /* clock synchronization status */
37int time_status = STA_UNSYNC; /* clock status bits */ 41int time_status = STA_UNSYNC; /* clock status bits */
38static s64 time_offset; /* time adjustment (ns) */ 42static long time_tai; /* TAI offset (s) */
43static s64 time_offset; /* time adjustment (ns) */
39static long time_constant = 2; /* pll time constant */ 44static long time_constant = 2; /* pll time constant */
40long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 45long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
41long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 46long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
42long time_freq; /* frequency offset (scaled ppm)*/ 47static s64 time_freq; /* frequency offset (scaled ns/s)*/
43static long time_reftime; /* time at last adjustment (s) */ 48static long time_reftime; /* time at last adjustment (s) */
44long time_adjust; 49long time_adjust;
45static long ntp_tick_adj; 50static long ntp_tick_adj;
@@ -47,16 +52,56 @@ static long ntp_tick_adj;
47static void ntp_update_frequency(void) 52static void ntp_update_frequency(void)
48{ 53{
49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 54 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 << TICK_LENGTH_SHIFT; 55 << NTP_SCALE_SHIFT;
51 second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; 56 second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 57 second_length += time_freq;
53 58
54 tick_length_base = second_length; 59 tick_length_base = second_length;
55 60
56 do_div(second_length, HZ); 61 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT; 62 tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
63}
64
65static void ntp_update_offset(long offset)
66{
67 long mtemp;
68 s64 freq_adj;
69
70 if (!(time_status & STA_PLL))
71 return;
58 72
59 do_div(tick_length_base, NTP_INTERVAL_FREQ); 73 if (!(time_status & STA_NANO))
74 offset *= NSEC_PER_USEC;
75
76 /*
77 * Scale the phase adjustment and
78 * clamp to the operating range.
79 */
80 offset = min(offset, MAXPHASE);
81 offset = max(offset, -MAXPHASE);
82
83 /*
84 * Select how the frequency is to be controlled
85 * and in which mode (PLL or FLL).
86 */
87 if (time_status & STA_FREQHOLD || time_reftime == 0)
88 time_reftime = xtime.tv_sec;
89 mtemp = xtime.tv_sec - time_reftime;
90 time_reftime = xtime.tv_sec;
91
92 freq_adj = (s64)offset * mtemp;
93 freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
94 time_status &= ~STA_MODE;
95 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
96 freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
97 mtemp);
98 time_status |= STA_MODE;
99 }
100 freq_adj += time_freq;
101 freq_adj = min(freq_adj, MAXFREQ_SCALED);
102 time_freq = max(freq_adj, -MAXFREQ_SCALED);
103
104 time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
60} 105}
61 106
62/** 107/**
@@ -78,62 +123,70 @@ void ntp_clear(void)
78} 123}
79 124
80/* 125/*
81 * this routine handles the overflow of the microsecond field 126 * Leap second processing. If in leap-insert state at the end of the
82 * 127 * day, the system clock is set back one second; if in leap-delete
83 * The tricky bits of code to handle the accurate clock support 128 * state, the system clock is set ahead one second.
84 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
85 * They were originally developed for SUN and DEC kernels.
86 * All the kudos should go to Dave for this stuff.
87 */ 129 */
88void second_overflow(void) 130static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
89{ 131{
90 long time_adj; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
91 133
92 /* Bump the maxerror field */ 134 write_seqlock_irq(&xtime_lock);
93 time_maxerror += MAXFREQ >> SHIFT_USEC;
94 if (time_maxerror > NTP_PHASE_LIMIT) {
95 time_maxerror = NTP_PHASE_LIMIT;
96 time_status |= STA_UNSYNC;
97 }
98 135
99 /*
100 * Leap second processing. If in leap-insert state at the end of the
101 * day, the system clock is set back one second; if in leap-delete
102 * state, the system clock is set ahead one second. The microtime()
103 * routine or external clock driver will insure that reported time is
104 * always monotonic. The ugly divides should be replaced.
105 */
106 switch (time_state) { 136 switch (time_state) {
107 case TIME_OK: 137 case TIME_OK:
108 if (time_status & STA_INS)
109 time_state = TIME_INS;
110 else if (time_status & STA_DEL)
111 time_state = TIME_DEL;
112 break; 138 break;
113 case TIME_INS: 139 case TIME_INS:
114 if (xtime.tv_sec % 86400 == 0) { 140 xtime.tv_sec--;
115 xtime.tv_sec--; 141 wall_to_monotonic.tv_sec++;
116 wall_to_monotonic.tv_sec++; 142 time_state = TIME_OOP;
117 time_state = TIME_OOP; 143 printk(KERN_NOTICE "Clock: "
118 printk(KERN_NOTICE "Clock: inserting leap second " 144 "inserting leap second 23:59:60 UTC\n");
119 "23:59:60 UTC\n"); 145 leap_timer.expires = ktime_add_ns(leap_timer.expires,
120 } 146 NSEC_PER_SEC);
147 res = HRTIMER_RESTART;
121 break; 148 break;
122 case TIME_DEL: 149 case TIME_DEL:
123 if ((xtime.tv_sec + 1) % 86400 == 0) { 150 xtime.tv_sec++;
124 xtime.tv_sec++; 151 time_tai--;
125 wall_to_monotonic.tv_sec--; 152 wall_to_monotonic.tv_sec--;
126 time_state = TIME_WAIT; 153 time_state = TIME_WAIT;
127 printk(KERN_NOTICE "Clock: deleting leap second " 154 printk(KERN_NOTICE "Clock: "
128 "23:59:59 UTC\n"); 155 "deleting leap second 23:59:59 UTC\n");
129 }
130 break; 156 break;
131 case TIME_OOP: 157 case TIME_OOP:
158 time_tai++;
132 time_state = TIME_WAIT; 159 time_state = TIME_WAIT;
133 break; 160 /* fall through */
134 case TIME_WAIT: 161 case TIME_WAIT:
135 if (!(time_status & (STA_INS | STA_DEL))) 162 if (!(time_status & (STA_INS | STA_DEL)))
136 time_state = TIME_OK; 163 time_state = TIME_OK;
164 break;
165 }
166 update_vsyscall(&xtime, clock);
167
168 write_sequnlock_irq(&xtime_lock);
169
170 return res;
171}
172
173/*
174 * this routine handles the overflow of the microsecond field
175 *
176 * The tricky bits of code to handle the accurate clock support
177 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
178 * They were originally developed for SUN and DEC kernels.
179 * All the kudos should go to Dave for this stuff.
180 */
181void second_overflow(void)
182{
183 s64 time_adj;
184
185 /* Bump the maxerror field */
186 time_maxerror += MAXFREQ / NSEC_PER_USEC;
187 if (time_maxerror > NTP_PHASE_LIMIT) {
188 time_maxerror = NTP_PHASE_LIMIT;
189 time_status |= STA_UNSYNC;
137 } 190 }
138 191
139 /* 192 /*
@@ -143,7 +196,7 @@ void second_overflow(void)
143 tick_length = tick_length_base; 196 tick_length = tick_length_base;
144 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); 197 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
145 time_offset -= time_adj; 198 time_offset -= time_adj;
146 tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); 199 tick_length += time_adj;
147 200
148 if (unlikely(time_adjust)) { 201 if (unlikely(time_adjust)) {
149 if (time_adjust > MAX_TICKADJ) { 202 if (time_adjust > MAX_TICKADJ) {
@@ -154,25 +207,12 @@ void second_overflow(void)
154 tick_length -= MAX_TICKADJ_SCALED; 207 tick_length -= MAX_TICKADJ_SCALED;
155 } else { 208 } else {
156 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 209 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
157 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; 210 NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
158 time_adjust = 0; 211 time_adjust = 0;
159 } 212 }
160 } 213 }
161} 214}
162 215
163/*
164 * Return how long ticks are at the moment, that is, how much time
165 * update_wall_time_one_tick will add to xtime next time we call it
166 * (assuming no calls to do_adjtimex in the meantime).
167 * The return value is in fixed-point nanoseconds shifted by the
168 * specified number of bits to the right of the binary point.
169 * This function has no side-effects.
170 */
171u64 current_tick_length(void)
172{
173 return tick_length;
174}
175
176#ifdef CONFIG_GENERIC_CMOS_UPDATE 216#ifdef CONFIG_GENERIC_CMOS_UPDATE
177 217
178/* Disable the cmos update - used by virtualization and embedded */ 218/* Disable the cmos update - used by virtualization and embedded */
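Note: second_overflow() still slews adjtime() requests at no more than MAX_TICKADJ microseconds per second, only the bookkeeping now uses the NTP_SCALE_SHIFT fixed-point format. The clamping policy modelled in plain microseconds (a sketch of the rate limit, not the kernel's fixed-point arithmetic; step_adjust is an invented name):

#include <stdio.h>

#define MAX_TICKADJ 500   /* microseconds applied per second, as in ntp.c */

/* Consume the outstanding adjtime() offset in +/-MAX_TICKADJ steps and
 * report how much each "second" of ticks is lengthened or shortened. */
static long step_adjust(long *time_adjust)
{
    long step;

    if (*time_adjust > MAX_TICKADJ)
        step = MAX_TICKADJ;
    else if (*time_adjust < -MAX_TICKADJ)
        step = -MAX_TICKADJ;
    else
        step = *time_adjust;

    *time_adjust -= step;
    return step;
}

int main(void)
{
    long adjust = 1200;   /* microseconds requested via adjtime() */

    while (adjust) {
        long applied = step_adjust(&adjust);
        printf("apply %+ld us this second, %ld us left\n", applied, adjust);
    }
    return 0;
}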
@@ -236,8 +276,8 @@ static inline void notify_cmos_timer(void) { }
236 */ 276 */
237int do_adjtimex(struct timex *txc) 277int do_adjtimex(struct timex *txc)
238{ 278{
239 long mtemp, save_adjust, rem; 279 struct timespec ts;
240 s64 freq_adj, temp64; 280 long save_adjust, sec;
241 int result; 281 int result;
242 282
243 /* In order to modify anything, you gotta be super-user! */ 283 /* In order to modify anything, you gotta be super-user! */
@@ -247,147 +287,132 @@ int do_adjtimex(struct timex *txc)
247 /* Now we validate the data before disabling interrupts */ 287 /* Now we validate the data before disabling interrupts */
248 288
249 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { 289 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
250 /* singleshot must not be used with any other mode bits */ 290 /* singleshot must not be used with any other mode bits */
251 if (txc->modes != ADJ_OFFSET_SINGLESHOT && 291 if (txc->modes & ~ADJ_OFFSET_SS_READ)
252 txc->modes != ADJ_OFFSET_SS_READ)
253 return -EINVAL; 292 return -EINVAL;
254 } 293 }
255 294
256 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
257 /* adjustment Offset limited to +- .512 seconds */
258 if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
259 return -EINVAL;
260
261 /* if the quartz is off by more than 10% something is VERY wrong ! */ 295 /* if the quartz is off by more than 10% something is VERY wrong ! */
262 if (txc->modes & ADJ_TICK) 296 if (txc->modes & ADJ_TICK)
263 if (txc->tick < 900000/USER_HZ || 297 if (txc->tick < 900000/USER_HZ ||
264 txc->tick > 1100000/USER_HZ) 298 txc->tick > 1100000/USER_HZ)
265 return -EINVAL; 299 return -EINVAL;
266 300
301 if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
302 hrtimer_cancel(&leap_timer);
303 getnstimeofday(&ts);
304
267 write_seqlock_irq(&xtime_lock); 305 write_seqlock_irq(&xtime_lock);
268 result = time_state; /* mostly `TIME_OK' */
269 306
270 /* Save for later - semantics of adjtime is to return old value */ 307 /* Save for later - semantics of adjtime is to return old value */
271 save_adjust = time_adjust; 308 save_adjust = time_adjust;
272 309
273#if 0 /* STA_CLOCKERR is never set yet */
274 time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
275#endif
276 /* If there are input parameters, then process them */ 310 /* If there are input parameters, then process them */
277 if (txc->modes) 311 if (txc->modes) {
278 { 312 if (txc->modes & ADJ_STATUS) {
279 if (txc->modes & ADJ_STATUS) /* only set allowed bits */ 313 if ((time_status & STA_PLL) &&
280 time_status = (txc->status & ~STA_RONLY) | 314 !(txc->status & STA_PLL)) {
281 (time_status & STA_RONLY); 315 time_state = TIME_OK;
282 316 time_status = STA_UNSYNC;
283 if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ 317 }
284 if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { 318 /* only set allowed bits */
285 result = -EINVAL; 319 time_status &= STA_RONLY;
286 goto leave; 320 time_status |= txc->status & ~STA_RONLY;
287 } 321
288 time_freq = ((s64)txc->freq * NSEC_PER_USEC) 322 switch (time_state) {
289 >> (SHIFT_USEC - SHIFT_NSEC); 323 case TIME_OK:
290 } 324 start_timer:
291 325 sec = ts.tv_sec;
292 if (txc->modes & ADJ_MAXERROR) { 326 if (time_status & STA_INS) {
293 if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { 327 time_state = TIME_INS;
294 result = -EINVAL; 328 sec += 86400 - sec % 86400;
295 goto leave; 329 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
330 } else if (time_status & STA_DEL) {
331 time_state = TIME_DEL;
332 sec += 86400 - (sec + 1) % 86400;
333 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
334 }
335 break;
336 case TIME_INS:
337 case TIME_DEL:
338 time_state = TIME_OK;
339 goto start_timer;
340 break;
341 case TIME_WAIT:
342 if (!(time_status & (STA_INS | STA_DEL)))
343 time_state = TIME_OK;
344 break;
345 case TIME_OOP:
346 hrtimer_restart(&leap_timer);
347 break;
348 }
296 } 349 }
297 time_maxerror = txc->maxerror;
298 }
299 350
300 if (txc->modes & ADJ_ESTERROR) { 351 if (txc->modes & ADJ_NANO)
301 if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { 352 time_status |= STA_NANO;
302 result = -EINVAL; 353 if (txc->modes & ADJ_MICRO)
303 goto leave; 354 time_status &= ~STA_NANO;
355
356 if (txc->modes & ADJ_FREQUENCY) {
357 time_freq = (s64)txc->freq * PPM_SCALE;
358 time_freq = min(time_freq, MAXFREQ_SCALED);
359 time_freq = max(time_freq, -MAXFREQ_SCALED);
304 } 360 }
305 time_esterror = txc->esterror;
306 }
307 361
308 if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ 362 if (txc->modes & ADJ_MAXERROR)
309 if (txc->constant < 0) { /* NTP v4 uses values > 6 */ 363 time_maxerror = txc->maxerror;
310 result = -EINVAL; 364 if (txc->modes & ADJ_ESTERROR)
311 goto leave; 365 time_esterror = txc->esterror;
366
367 if (txc->modes & ADJ_TIMECONST) {
368 time_constant = txc->constant;
369 if (!(time_status & STA_NANO))
370 time_constant += 4;
371 time_constant = min(time_constant, (long)MAXTC);
372 time_constant = max(time_constant, 0l);
312 } 373 }
313 time_constant = min(txc->constant + 4, (long)MAXTC);
314 }
315 374
316 if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ 375 if (txc->modes & ADJ_TAI && txc->constant > 0)
317 if (txc->modes == ADJ_OFFSET_SINGLESHOT) { 376 time_tai = txc->constant;
318 /* adjtime() is independent from ntp_adjtime() */ 377
319 time_adjust = txc->offset; 378 if (txc->modes & ADJ_OFFSET) {
379 if (txc->modes == ADJ_OFFSET_SINGLESHOT)
380 /* adjtime() is independent from ntp_adjtime() */
381 time_adjust = txc->offset;
382 else
383 ntp_update_offset(txc->offset);
320 } 384 }
321 else if (time_status & STA_PLL) { 385 if (txc->modes & ADJ_TICK)
322 time_offset = txc->offset * NSEC_PER_USEC; 386 tick_usec = txc->tick;
323 387
324 /* 388 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
325 * Scale the phase adjustment and 389 ntp_update_frequency();
326 * clamp to the operating range. 390 }
327 */ 391
328 time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC); 392 result = time_state; /* mostly `TIME_OK' */
329 time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC); 393 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
330
331 /*
332 * Select whether the frequency is to be controlled
333 * and in which mode (PLL or FLL). Clamp to the operating
334 * range. Ugly multiply/divide should be replaced someday.
335 */
336
337 if (time_status & STA_FREQHOLD || time_reftime == 0)
338 time_reftime = xtime.tv_sec;
339 mtemp = xtime.tv_sec - time_reftime;
340 time_reftime = xtime.tv_sec;
341
342 freq_adj = time_offset * mtemp;
343 freq_adj = shift_right(freq_adj, time_constant * 2 +
344 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
345 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
346 u64 utemp64;
347 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
348 if (time_offset < 0) {
349 utemp64 = -temp64;
350 do_div(utemp64, mtemp);
351 freq_adj -= utemp64;
352 } else {
353 utemp64 = temp64;
354 do_div(utemp64, mtemp);
355 freq_adj += utemp64;
356 }
357 }
358 freq_adj += time_freq;
359 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
360 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
361 time_offset = div_long_long_rem_signed(time_offset,
362 NTP_INTERVAL_FREQ,
363 &rem);
364 time_offset <<= SHIFT_UPDATE;
365 } /* STA_PLL */
366 } /* txc->modes & ADJ_OFFSET */
367 if (txc->modes & ADJ_TICK)
368 tick_usec = txc->tick;
369
370 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
371 ntp_update_frequency();
372 } /* txc->modes */
373leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
374 result = TIME_ERROR; 394 result = TIME_ERROR;
375 395
376 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || 396 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
377 (txc->modes == ADJ_OFFSET_SS_READ)) 397 (txc->modes == ADJ_OFFSET_SS_READ))
378 txc->offset = save_adjust; 398 txc->offset = save_adjust;
379 else 399 else {
380 txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * 400 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
381 NTP_INTERVAL_FREQ / 1000; 401 NTP_SCALE_SHIFT);
382 txc->freq = (time_freq / NSEC_PER_USEC) << 402 if (!(time_status & STA_NANO))
383 (SHIFT_USEC - SHIFT_NSEC); 403 txc->offset /= NSEC_PER_USEC;
404 }
405 txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
406 (s64)PPM_SCALE_INV,
407 NTP_SCALE_SHIFT);
384 txc->maxerror = time_maxerror; 408 txc->maxerror = time_maxerror;
385 txc->esterror = time_esterror; 409 txc->esterror = time_esterror;
386 txc->status = time_status; 410 txc->status = time_status;
387 txc->constant = time_constant; 411 txc->constant = time_constant;
388 txc->precision = 1; 412 txc->precision = 1;
389 txc->tolerance = MAXFREQ; 413 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
390 txc->tick = tick_usec; 414 txc->tick = tick_usec;
415 txc->tai = time_tai;
391 416
392 /* PPS is not implemented, so these are zero */ 417 /* PPS is not implemented, so these are zero */
393 txc->ppsfreq = 0; 418 txc->ppsfreq = 0;
@@ -399,9 +424,15 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
399 txc->errcnt = 0; 424 txc->errcnt = 0;
400 txc->stbcnt = 0; 425 txc->stbcnt = 0;
401 write_sequnlock_irq(&xtime_lock); 426 write_sequnlock_irq(&xtime_lock);
402 do_gettimeofday(&txc->time); 427
428 txc->time.tv_sec = ts.tv_sec;
429 txc->time.tv_usec = ts.tv_nsec;
430 if (!(time_status & STA_NANO))
431 txc->time.tv_usec /= NSEC_PER_USEC;
432
403 notify_cmos_timer(); 433 notify_cmos_timer();
404 return(result); 434
435 return result;
405} 436}
406 437
407static int __init ntp_tick_adj_setup(char *str) 438static int __init ntp_tick_adj_setup(char *str)
@@ -411,3 +442,10 @@ static int __init ntp_tick_adj_setup(char *str)
411} 442}
412 443
413__setup("ntp_tick_adj=", ntp_tick_adj_setup); 444__setup("ntp_tick_adj=", ntp_tick_adj_setup);
445
446void __init ntp_init(void)
447{
448 ntp_clear();
449 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
450 leap_timer.function = ntp_leap_second;
451}
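Aside: the reworked do_adjtimex() above is what user space reaches through the adjtimex(2) syscall. The following is only a sketch of a read-only query, written under the assumption that the installed <sys/timex.h> matches these kernel changes; it shows how the nanosecond vs. microsecond reporting controlled by STA_NANO surfaces to callers.

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = { .modes = 0 };       /* no modes set: query only */
        int state = adjtimex(&tx);

        if (state == -1) {
                perror("adjtimex");
                return 1;
        }
        /* offset is reported in ns when STA_NANO is set, us otherwise */
        printf("clock state %d, status 0x%x, offset %ld %s\n",
               state, tx.status, tx.offset,
               (tx.status & STA_NANO) ? "ns" : "us");
        return 0;
}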
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2d6087c7cf98..e91c29f961c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -53,7 +53,7 @@ void update_xtime_cache(u64 nsec)
53 timespec_add_ns(&xtime_cache, nsec); 53 timespec_add_ns(&xtime_cache, nsec);
54} 54}
55 55
56static struct clocksource *clock; /* pointer to current clocksource */ 56struct clocksource *clock;
57 57
58 58
59#ifdef CONFIG_GENERIC_TIME 59#ifdef CONFIG_GENERIC_TIME
@@ -246,7 +246,7 @@ void __init timekeeping_init(void)
246 246
247 write_seqlock_irqsave(&xtime_lock, flags); 247 write_seqlock_irqsave(&xtime_lock, flags);
248 248
249 ntp_clear(); 249 ntp_init();
250 250
251 clock = clocksource_get_next(); 251 clock = clocksource_get_next();
252 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 252 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -371,7 +371,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
371 * here. This is tuned so that an error of about 1 msec is adjusted 371 * here. This is tuned so that an error of about 1 msec is adjusted
372 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 372 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
373 */ 373 */
374 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); 374 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
375 error2 = abs(error2); 375 error2 = abs(error2);
376 for (look_ahead = 0; error2 > 0; look_ahead++) 376 for (look_ahead = 0; error2 > 0; look_ahead++)
377 error2 >>= 2; 377 error2 >>= 2;
@@ -380,8 +380,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
380 * Now calculate the error in (1 << look_ahead) ticks, but first 380 * Now calculate the error in (1 << look_ahead) ticks, but first
381 * remove the single look ahead already included in the error. 381 * remove the single look ahead already included in the error.
382 */ 382 */
383 tick_error = current_tick_length() >> 383 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
384 (TICK_LENGTH_SHIFT - clock->shift + 1);
385 tick_error -= clock->xtime_interval >> 1; 384 tick_error -= clock->xtime_interval >> 1;
386 error = ((error - tick_error) >> look_ahead) + tick_error; 385 error = ((error - tick_error) >> look_ahead) + tick_error;
387 386
@@ -412,7 +411,7 @@ static void clocksource_adjust(s64 offset)
412 s64 error, interval = clock->cycle_interval; 411 s64 error, interval = clock->cycle_interval;
413 int adj; 412 int adj;
414 413
415 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); 414 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
416 if (error > interval) { 415 if (error > interval) {
417 error >>= 2; 416 error >>= 2;
418 if (likely(error <= interval)) 417 if (likely(error <= interval))
@@ -434,7 +433,7 @@ static void clocksource_adjust(s64 offset)
434 clock->xtime_interval += interval; 433 clock->xtime_interval += interval;
435 clock->xtime_nsec -= offset; 434 clock->xtime_nsec -= offset;
436 clock->error -= (interval - offset) << 435 clock->error -= (interval - offset) <<
437 (TICK_LENGTH_SHIFT - clock->shift); 436 (NTP_SCALE_SHIFT - clock->shift);
438} 437}
439 438
440/** 439/**
@@ -473,8 +472,8 @@ void update_wall_time(void)
473 } 472 }
474 473
475 /* accumulate error between NTP and clock interval */ 474 /* accumulate error between NTP and clock interval */
476 clock->error += current_tick_length(); 475 clock->error += tick_length;
477 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); 476 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
478 } 477 }
479 478
480 /* correct the clock when NTP error is too big */ 479 /* correct the clock when NTP error is too big */
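Aside: the TICK_LENGTH_SHIFT to NTP_SCALE_SHIFT rename keeps the same fixed-point scheme, with tick_length and clock->error carrying nanoseconds scaled up by the shift. The user-space sketch below is an illustration only, not kernel code: the shift value and numbers are assumptions, and the bookkeeping is simplified to plain nanoseconds. It shows how such an accumulator drifts when the clocksource interval falls slightly short of the NTP tick length.

#include <stdint.h>
#include <stdio.h>

#define SCALE_SHIFT 32                  /* stands in for NTP_SCALE_SHIFT */

int main(void)
{
        /* ideal tick length: 1 ms per tick, in scaled nanoseconds */
        int64_t tick_length = (int64_t)1000000 << SCALE_SHIFT;
        int64_t xtime_interval_ns = 999999;     /* clocksource is 1 ns short */
        int64_t error = 0;
        int tick;

        for (tick = 0; tick < 1000; tick++)
                error += tick_length -
                         ((int64_t)xtime_interval_ns << SCALE_SHIFT);

        /* after 1000 ticks the clock lags by about 1000 ns */
        printf("accumulated error: %lld ns\n",
               (long long)(error >> SCALE_SHIFT));
        return 0;
}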
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 67fe8fc21fb1..a40e20fd0001 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void)
278{ 278{
279 struct proc_dir_entry *pe; 279 struct proc_dir_entry *pe;
280 280
281 pe = create_proc_entry("timer_list", 0644, NULL); 281 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
282 if (!pe) 282 if (!pe)
283 return -ENOMEM; 283 return -ENOMEM;
284
285 pe->proc_fops = &timer_list_fops;
286
287 return 0; 284 return 0;
288} 285}
289__initcall(init_timer_list_procfs); 286__initcall(init_timer_list_procfs);
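Aside: both procfs hunks in this series replace the two-step create_proc_entry()-then-assign-proc_fops pattern with proc_create(), which installs the file_operations together with the entry. A sketch of the same pattern in a hypothetical module follows; the entry name, fops, and show routine are invented for illustration.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
        seq_printf(m, "hello from example\n");
        return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
        .owner   = THIS_MODULE,
        .open    = example_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

static int __init example_init(void)
{
        /* entry is created with its file_operations already in place */
        if (!proc_create("example", 0444, NULL, &example_fops))
                return -ENOMEM;
        return 0;
}

static void __exit example_exit(void)
{
        remove_proc_entry("example", NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");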
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 417da8c5bc72..c994530d166d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void)
415{ 415{
416 struct proc_dir_entry *pe; 416 struct proc_dir_entry *pe;
417 417
418 pe = create_proc_entry("timer_stats", 0644, NULL); 418 pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
419 if (!pe) 419 if (!pe)
420 return -ENOMEM; 420 return -ENOMEM;
421
422 pe->proc_fops = &tstats_fops;
423
424 return 0; 421 return 0;
425} 422}
426__initcall(init_tstats_procfs); 423__initcall(init_tstats_procfs);
diff --git a/kernel/timer.c b/kernel/timer.c
index f3d35d4ea42e..ceacc6626572 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -320,14 +320,130 @@ static void timer_stats_account_timer(struct timer_list *timer)
320static void timer_stats_account_timer(struct timer_list *timer) {} 320static void timer_stats_account_timer(struct timer_list *timer) {}
321#endif 321#endif
322 322
323/** 323#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
324 * init_timer - initialize a timer. 324
325 * @timer: the timer to be initialized 325static struct debug_obj_descr timer_debug_descr;
326 * 326
327 * init_timer() must be done to a timer prior calling *any* of the 327/*
328 * other timer functions. 328 * fixup_init is called when:
329 * - an active object is initialized
329 */ 330 */
330void init_timer(struct timer_list *timer) 331static int timer_fixup_init(void *addr, enum debug_obj_state state)
332{
333 struct timer_list *timer = addr;
334
335 switch (state) {
336 case ODEBUG_STATE_ACTIVE:
337 del_timer_sync(timer);
338 debug_object_init(timer, &timer_debug_descr);
339 return 1;
340 default:
341 return 0;
342 }
343}
344
345/*
346 * fixup_activate is called when:
347 * - an active object is activated
348 * - an unknown object is activated (might be a statically initialized object)
349 */
350static int timer_fixup_activate(void *addr, enum debug_obj_state state)
351{
352 struct timer_list *timer = addr;
353
354 switch (state) {
355
356 case ODEBUG_STATE_NOTAVAILABLE:
357 /*
358 * This is not really a fixup. The timer was
359 * statically initialized. We just make sure that it
360 * is tracked in the object tracker.
361 */
362 if (timer->entry.next == NULL &&
363 timer->entry.prev == TIMER_ENTRY_STATIC) {
364 debug_object_init(timer, &timer_debug_descr);
365 debug_object_activate(timer, &timer_debug_descr);
366 return 0;
367 } else {
368 WARN_ON_ONCE(1);
369 }
370 return 0;
371
372 case ODEBUG_STATE_ACTIVE:
373 WARN_ON(1);
374
375 default:
376 return 0;
377 }
378}
379
380/*
381 * fixup_free is called when:
382 * - an active object is freed
383 */
384static int timer_fixup_free(void *addr, enum debug_obj_state state)
385{
386 struct timer_list *timer = addr;
387
388 switch (state) {
389 case ODEBUG_STATE_ACTIVE:
390 del_timer_sync(timer);
391 debug_object_free(timer, &timer_debug_descr);
392 return 1;
393 default:
394 return 0;
395 }
396}
397
398static struct debug_obj_descr timer_debug_descr = {
399 .name = "timer_list",
400 .fixup_init = timer_fixup_init,
401 .fixup_activate = timer_fixup_activate,
402 .fixup_free = timer_fixup_free,
403};
404
405static inline void debug_timer_init(struct timer_list *timer)
406{
407 debug_object_init(timer, &timer_debug_descr);
408}
409
410static inline void debug_timer_activate(struct timer_list *timer)
411{
412 debug_object_activate(timer, &timer_debug_descr);
413}
414
415static inline void debug_timer_deactivate(struct timer_list *timer)
416{
417 debug_object_deactivate(timer, &timer_debug_descr);
418}
419
420static inline void debug_timer_free(struct timer_list *timer)
421{
422 debug_object_free(timer, &timer_debug_descr);
423}
424
425static void __init_timer(struct timer_list *timer);
426
427void init_timer_on_stack(struct timer_list *timer)
428{
429 debug_object_init_on_stack(timer, &timer_debug_descr);
430 __init_timer(timer);
431}
432EXPORT_SYMBOL_GPL(init_timer_on_stack);
433
434void destroy_timer_on_stack(struct timer_list *timer)
435{
436 debug_object_free(timer, &timer_debug_descr);
437}
438EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
439
440#else
441static inline void debug_timer_init(struct timer_list *timer) { }
442static inline void debug_timer_activate(struct timer_list *timer) { }
443static inline void debug_timer_deactivate(struct timer_list *timer) { }
444#endif
445
446static void __init_timer(struct timer_list *timer)
331{ 447{
332 timer->entry.next = NULL; 448 timer->entry.next = NULL;
333 timer->base = __raw_get_cpu_var(tvec_bases); 449 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -337,6 +453,19 @@ void init_timer(struct timer_list *timer)
337 memset(timer->start_comm, 0, TASK_COMM_LEN); 453 memset(timer->start_comm, 0, TASK_COMM_LEN);
338#endif 454#endif
339} 455}
456
457/**
458 * init_timer - initialize a timer.
459 * @timer: the timer to be initialized
460 *
461 * init_timer() must be done to a timer prior calling *any* of the
462 * other timer functions.
463 */
464void init_timer(struct timer_list *timer)
465{
466 debug_timer_init(timer);
467 __init_timer(timer);
468}
340EXPORT_SYMBOL(init_timer); 469EXPORT_SYMBOL(init_timer);
341 470
342void init_timer_deferrable(struct timer_list *timer) 471void init_timer_deferrable(struct timer_list *timer)
@@ -351,6 +480,8 @@ static inline void detach_timer(struct timer_list *timer,
351{ 480{
352 struct list_head *entry = &timer->entry; 481 struct list_head *entry = &timer->entry;
353 482
483 debug_timer_deactivate(timer);
484
354 __list_del(entry->prev, entry->next); 485 __list_del(entry->prev, entry->next);
355 if (clear_pending) 486 if (clear_pending)
356 entry->next = NULL; 487 entry->next = NULL;
@@ -405,6 +536,8 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
405 ret = 1; 536 ret = 1;
406 } 537 }
407 538
539 debug_timer_activate(timer);
540
408 new_base = __get_cpu_var(tvec_bases); 541 new_base = __get_cpu_var(tvec_bases);
409 542
410 if (base != new_base) { 543 if (base != new_base) {
@@ -450,6 +583,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
450 BUG_ON(timer_pending(timer) || !timer->function); 583 BUG_ON(timer_pending(timer) || !timer->function);
451 spin_lock_irqsave(&base->lock, flags); 584 spin_lock_irqsave(&base->lock, flags);
452 timer_set_base(timer, base); 585 timer_set_base(timer, base);
586 debug_timer_activate(timer);
453 internal_add_timer(base, timer); 587 internal_add_timer(base, timer);
454 /* 588 /*
455 * Check whether the other CPU is idle and needs to be 589 * Check whether the other CPU is idle and needs to be
@@ -1086,11 +1220,14 @@ signed long __sched schedule_timeout(signed long timeout)
1086 1220
1087 expire = timeout + jiffies; 1221 expire = timeout + jiffies;
1088 1222
1089 setup_timer(&timer, process_timeout, (unsigned long)current); 1223 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1090 __mod_timer(&timer, expire); 1224 __mod_timer(&timer, expire);
1091 schedule(); 1225 schedule();
1092 del_singleshot_timer_sync(&timer); 1226 del_singleshot_timer_sync(&timer);
1093 1227
1228 /* Remove the timer from the object tracker */
1229 destroy_timer_on_stack(&timer);
1230
1094 timeout = expire - jiffies; 1231 timeout = expire - jiffies;
1095 1232
1096 out: 1233 out:
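Aside: with CONFIG_DEBUG_OBJECTS_TIMERS, timers living on the stack have to be announced to and removed from the object tracker, which is what the schedule_timeout() hunk above now does. A minimal sketch of that pattern in a hypothetical caller (function names and the timing are invented):

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/kernel.h>

static void example_timeout(unsigned long data)
{
        printk(KERN_INFO "example timer fired, data=%lu\n", data);
}

static void example_run_stack_timer(void)
{
        struct timer_list timer;

        /* on-stack timers must use the *_on_stack helpers so the object
         * tracker knows this memory is about to disappear */
        setup_timer_on_stack(&timer, example_timeout, 0);
        mod_timer(&timer, jiffies + HZ / 2);

        msleep(1000);           /* let the timer fire while we wait */

        del_timer_sync(&timer);
        destroy_timer_on_stack(&timer);
}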
diff --git a/kernel/user.c b/kernel/user.c
index debce602bfdd..865ecf57a096 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -53,10 +53,6 @@ struct user_struct root_user = {
53 .files = ATOMIC_INIT(0), 53 .files = ATOMIC_INIT(0),
54 .sigpending = ATOMIC_INIT(0), 54 .sigpending = ATOMIC_INIT(0),
55 .locked_shm = 0, 55 .locked_shm = 0,
56#ifdef CONFIG_KEYS
57 .uid_keyring = &root_user_keyring,
58 .session_keyring = &root_session_keyring,
59#endif
60#ifdef CONFIG_USER_SCHED 56#ifdef CONFIG_USER_SCHED
61 .tg = &init_task_group, 57 .tg = &init_task_group,
62#endif 58#endif
@@ -388,7 +384,7 @@ void free_uid(struct user_struct *up)
388 local_irq_restore(flags); 384 local_irq_restore(flags);
389} 385}
390 386
391struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 387struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
392{ 388{
393 struct hlist_head *hashent = uidhashentry(ns, uid); 389 struct hlist_head *hashent = uidhashentry(ns, uid);
394 struct user_struct *up, *new; 390 struct user_struct *up, *new;
@@ -403,29 +399,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
403 spin_unlock_irq(&uidhash_lock); 399 spin_unlock_irq(&uidhash_lock);
404 400
405 if (!up) { 401 if (!up) {
406 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); 402 new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
407 if (!new) 403 if (!new)
408 goto out_unlock; 404 goto out_unlock;
409 405
410 new->uid = uid; 406 new->uid = uid;
411 atomic_set(&new->__count, 1); 407 atomic_set(&new->__count, 1);
412 atomic_set(&new->processes, 0);
413 atomic_set(&new->files, 0);
414 atomic_set(&new->sigpending, 0);
415#ifdef CONFIG_INOTIFY_USER
416 atomic_set(&new->inotify_watches, 0);
417 atomic_set(&new->inotify_devs, 0);
418#endif
419#ifdef CONFIG_POSIX_MQUEUE
420 new->mq_bytes = 0;
421#endif
422 new->locked_shm = 0;
423
424 if (alloc_uid_keyring(new, current) < 0)
425 goto out_free_user;
426 408
427 if (sched_create_user(new) < 0) 409 if (sched_create_user(new) < 0)
428 goto out_put_keys; 410 goto out_free_user;
429 411
430 if (uids_user_create(new)) 412 if (uids_user_create(new))
431 goto out_destoy_sched; 413 goto out_destoy_sched;
@@ -459,9 +441,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
459 441
460out_destoy_sched: 442out_destoy_sched:
461 sched_destroy_user(new); 443 sched_destroy_user(new);
462out_put_keys:
463 key_put(new->uid_keyring);
464 key_put(new->session_keyring);
465out_free_user: 444out_free_user:
466 kmem_cache_free(uid_cachep, new); 445 kmem_cache_free(uid_cachep, new);
467out_unlock: 446out_unlock:
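Aside: kmem_cache_zalloc() hands back zero-filled memory, which is what lets alloc_uid() above drop its long list of explicit zero initializations. A hypothetical sketch of the same pattern (the struct, cache, and function names are invented):

#include <linux/module.h>
#include <linux/slab.h>

struct example {
        atomic_t        refs;
        unsigned long   flags;
};

static struct kmem_cache *example_cachep;

static struct example *example_alloc(void)
{
        /* zero-filled on return, so .flags needs no explicit init;
         * only the non-zero reference count has to be set */
        struct example *e = kmem_cache_zalloc(example_cachep, GFP_KERNEL);

        if (!e)
                return NULL;
        atomic_set(&e->refs, 1);
        return e;
}

static int __init example_init(void)
{
        example_cachep = kmem_cache_create("example_cache",
                                           sizeof(struct example),
                                           0, 0, NULL);
        return example_cachep ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
        kmem_cache_destroy(example_cachep);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");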
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4c9006275df7..a9ab0596de44 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/version.h> 9#include <linux/version.h>
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/slab.h>
11#include <linux/user_namespace.h> 12#include <linux/user_namespace.h>
12 13
13/* 14/*
@@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref)
73 release_uids(ns); 74 release_uids(ns);
74 kfree(ns); 75 kfree(ns);
75} 76}
77EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 816d7b24fa03..64d398f12444 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h> 15#include <linux/version.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h>
17 18
18/* 19/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 20 * Clone a new ns copying an original utsname, setting refcount to 1
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 00ff4d08e370..29fc39f1029c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
158 * 158 *
159 * Returns 0 if @work was already on a queue, non-zero otherwise. 159 * Returns 0 if @work was already on a queue, non-zero otherwise.
160 * 160 *
161 * We queue the work to the CPU it was submitted, but there is no 161 * We queue the work to the CPU on which it was submitted, but if the CPU dies
162 * guarantee that it will be processed by that CPU. 162 * it can be processed by another CPU.
163 */ 163 */
164int queue_work(struct workqueue_struct *wq, struct work_struct *work) 164int queue_work(struct workqueue_struct *wq, struct work_struct *work)
165{ 165{
@@ -195,7 +195,6 @@ static void delayed_work_timer_fn(unsigned long __data)
195int queue_delayed_work(struct workqueue_struct *wq, 195int queue_delayed_work(struct workqueue_struct *wq,
196 struct delayed_work *dwork, unsigned long delay) 196 struct delayed_work *dwork, unsigned long delay)
197{ 197{
198 timer_stats_timer_set_start_info(&dwork->timer);
199 if (delay == 0) 198 if (delay == 0)
200 return queue_work(wq, &dwork->work); 199 return queue_work(wq, &dwork->work);
201 200
@@ -219,11 +218,12 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
219 struct timer_list *timer = &dwork->timer; 218 struct timer_list *timer = &dwork->timer;
220 struct work_struct *work = &dwork->work; 219 struct work_struct *work = &dwork->work;
221 220
222 timer_stats_timer_set_start_info(&dwork->timer);
223 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 221 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
224 BUG_ON(timer_pending(timer)); 222 BUG_ON(timer_pending(timer));
225 BUG_ON(!list_empty(&work->entry)); 223 BUG_ON(!list_empty(&work->entry));
226 224
225 timer_stats_timer_set_start_info(&dwork->timer);
226
227 /* This stores cwq for the moment, for the timer_fn */ 227 /* This stores cwq for the moment, for the timer_fn */
228 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 228 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
229 timer->expires = jiffies + delay; 229 timer->expires = jiffies + delay;
@@ -247,7 +247,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
247 if (cwq->run_depth > 3) { 247 if (cwq->run_depth > 3) {
248 /* morton gets to eat his hat */ 248 /* morton gets to eat his hat */
249 printk("%s: recursion depth exceeded: %d\n", 249 printk("%s: recursion depth exceeded: %d\n",
250 __FUNCTION__, cwq->run_depth); 250 __func__, cwq->run_depth);
251 dump_stack(); 251 dump_stack();
252 } 252 }
253 while (!list_empty(&cwq->worklist)) { 253 while (!list_empty(&cwq->worklist)) {
@@ -564,7 +564,6 @@ EXPORT_SYMBOL(schedule_work);
564int schedule_delayed_work(struct delayed_work *dwork, 564int schedule_delayed_work(struct delayed_work *dwork,
565 unsigned long delay) 565 unsigned long delay)
566{ 566{
567 timer_stats_timer_set_start_info(&dwork->timer);
568 return queue_delayed_work(keventd_wq, dwork, delay); 567 return queue_delayed_work(keventd_wq, dwork, delay);
569} 568}
570EXPORT_SYMBOL(schedule_delayed_work); 569EXPORT_SYMBOL(schedule_delayed_work);
@@ -581,7 +580,6 @@ EXPORT_SYMBOL(schedule_delayed_work);
581int schedule_delayed_work_on(int cpu, 580int schedule_delayed_work_on(int cpu,
582 struct delayed_work *dwork, unsigned long delay) 581 struct delayed_work *dwork, unsigned long delay)
583{ 582{
584 timer_stats_timer_set_start_info(&dwork->timer);
585 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 583 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
586} 584}
587EXPORT_SYMBOL(schedule_delayed_work_on); 585EXPORT_SYMBOL(schedule_delayed_work_on);
@@ -772,7 +770,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
772} 770}
773EXPORT_SYMBOL_GPL(__create_workqueue_key); 771EXPORT_SYMBOL_GPL(__create_workqueue_key);
774 772
775static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
776{ 774{
777 /* 775 /*
778 * Our caller is either destroy_workqueue() or CPU_DEAD, 776 * Our caller is either destroy_workqueue() or CPU_DEAD,
@@ -808,19 +806,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
808void destroy_workqueue(struct workqueue_struct *wq) 806void destroy_workqueue(struct workqueue_struct *wq)
809{ 807{
810 const cpumask_t *cpu_map = wq_cpu_map(wq); 808 const cpumask_t *cpu_map = wq_cpu_map(wq);
811 struct cpu_workqueue_struct *cwq;
812 int cpu; 809 int cpu;
813 810
814 get_online_cpus(); 811 get_online_cpus();
815 spin_lock(&workqueue_lock); 812 spin_lock(&workqueue_lock);
816 list_del(&wq->list); 813 list_del(&wq->list);
817 spin_unlock(&workqueue_lock); 814 spin_unlock(&workqueue_lock);
818 put_online_cpus();
819 815
820 for_each_cpu_mask(cpu, *cpu_map) { 816 for_each_cpu_mask(cpu, *cpu_map)
821 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 817 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
822 cleanup_workqueue_thread(cwq, cpu); 818 put_online_cpus();
823 }
824 819
825 free_percpu(wq->cpu_wq); 820 free_percpu(wq->cpu_wq);
826 kfree(wq); 821 kfree(wq);
@@ -838,7 +833,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
838 action &= ~CPU_TASKS_FROZEN; 833 action &= ~CPU_TASKS_FROZEN;
839 834
840 switch (action) { 835 switch (action) {
841
842 case CPU_UP_PREPARE: 836 case CPU_UP_PREPARE:
843 cpu_set(cpu, cpu_populated_map); 837 cpu_set(cpu, cpu_populated_map);
844 } 838 }
@@ -861,11 +855,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
861 case CPU_UP_CANCELED: 855 case CPU_UP_CANCELED:
862 start_workqueue_thread(cwq, -1); 856 start_workqueue_thread(cwq, -1);
863 case CPU_DEAD: 857 case CPU_DEAD:
864 cleanup_workqueue_thread(cwq, cpu); 858 cleanup_workqueue_thread(cwq);
865 break; 859 break;
866 } 860 }
867 } 861 }
868 862
863 switch (action) {
864 case CPU_UP_CANCELED:
865 case CPU_DEAD:
866 cpu_clear(cpu, cpu_populated_map);
867 }
868
869 return NOTIFY_OK; 869 return NOTIFY_OK;
870} 870}
871 871
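Aside: the timer_stats start-info tagging now happens once inside queue_delayed_work_on(), so the schedule_delayed_work*() wrappers and their callers need no extra bookkeeping. Typical usage stays as simple as this hypothetical module (all names invented):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void example_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(example_work, example_work_fn);

static void example_work_fn(struct work_struct *work)
{
        printk(KERN_INFO "example delayed work ran\n");
}

static int __init example_init(void)
{
        /* runs on keventd about one second from now; timer_stats
         * tagging happens inside queue_delayed_work_on() */
        schedule_delayed_work(&example_work, HZ);
        return 0;
}

static void __exit example_exit(void)
{
        cancel_delayed_work_sync(&example_work);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");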