path: root/kernel
author     Steve French <sfrench@us.ibm.com>   2008-05-06 13:55:32 -0400
committer  Steve French <sfrench@us.ibm.com>   2008-05-06 13:55:32 -0400
commit     a815752ac0ffdb910e92958d41d28f4fb28e5296 (patch)
tree       a3aa16a282354da0debe8e3a3a7ed8aac6e54001 /kernel
parent     5ade9deaaa3e1f7291467d97b238648e43eae15e (diff)
parent     a15306365a16380f3bafee9e181ba01231d4acd7 (diff)
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |    4
-rw-r--r--  kernel/audit.c               |  249
-rw-r--r--  kernel/audit.h               |   13
-rw-r--r--  kernel/auditfilter.c         |   55
-rw-r--r--  kernel/auditsc.c             |   40
-rw-r--r--  kernel/bounds.c              |    6
-rw-r--r--  kernel/cgroup.c              |  333
-rw-r--r--  kernel/cgroup_debug.c        |   20
-rw-r--r--  kernel/compat.c              |    6
-rw-r--r--  kernel/configs.c             |    7
-rw-r--r--  kernel/cpu.c                 |   44
-rw-r--r--  kernel/cpuset.c              |  358
-rw-r--r--  kernel/dma.c                 |    7
-rw-r--r--  kernel/exit.c                |  150
-rw-r--r--  kernel/fork.c                |   51
-rw-r--r--  kernel/futex.c               |  193
-rw-r--r--  kernel/hrtimer.c             |  205
-rw-r--r--  kernel/irq/devres.c          |    1
-rw-r--r--  kernel/irq/manage.c          |   50
-rw-r--r--  kernel/irq/spurious.c        |    4
-rw-r--r--  kernel/kallsyms.c            |    6
-rw-r--r--  kernel/kexec.c               |    2
-rw-r--r--  kernel/kgdb.c                |    8
-rw-r--r--  kernel/kmod.c                |    1
-rw-r--r--  kernel/kthread.c             |    4
-rw-r--r--  kernel/latencytop.c          |    9
-rw-r--r--  kernel/lockdep_proc.c        |   16
-rw-r--r--  kernel/marker.c              |    3
-rw-r--r--  kernel/module.c              |  363
-rw-r--r--  kernel/notifier.c            |   38
-rw-r--r--  kernel/ns_cgroup.c           |    2
-rw-r--r--  kernel/nsproxy.c             |   12
-rw-r--r--  kernel/panic.c               |    8
-rw-r--r--  kernel/pid.c                 |   41
-rw-r--r--  kernel/pid_namespace.c       |    4
-rw-r--r--  kernel/posix-cpu-timers.c    |   11
-rw-r--r--  kernel/posix-timers.c        |    6
-rw-r--r--  kernel/power/Kconfig         |   10
-rw-r--r--  kernel/power/Makefile        |    1
-rw-r--r--  kernel/power/pm.c            |  205
-rw-r--r--  kernel/printk.c              |  122
-rw-r--r--  kernel/profile.c             |    4
-rw-r--r--  kernel/ptrace.c              |   22
-rw-r--r--  kernel/rcutorture.c          |    1
-rw-r--r--  kernel/relay.c               |   37
-rw-r--r--  kernel/res_counter.c         |   10
-rw-r--r--  kernel/resource.c            |   10
-rw-r--r--  kernel/sched.c               |  387
-rw-r--r--  kernel/sched_clock.c         |  236
-rw-r--r--  kernel/sched_debug.c         |   16
-rw-r--r--  kernel/sched_fair.c          |   39
-rw-r--r--  kernel/sched_idletask.c      |    2
-rw-r--r--  kernel/sched_rt.c            |    9
-rw-r--r--  kernel/signal.c              |  646
-rw-r--r--  kernel/softirq.c             |   20
-rw-r--r--  kernel/sys.c                 |  110
-rw-r--r--  kernel/sysctl.c              |  176
-rw-r--r--  kernel/taskstats.c           |    6
-rw-r--r--  kernel/time.c                |   63
-rw-r--r--  kernel/time/clocksource.c    |    4
-rw-r--r--  kernel/time/ntp.c            |  398
-rw-r--r--  kernel/time/timekeeping.c    |   17
-rw-r--r--  kernel/time/timer_list.c     |    5
-rw-r--r--  kernel/time/timer_stats.c    |    5
-rw-r--r--  kernel/timeconst.pl          |  120
-rw-r--r--  kernel/timer.c               |  153
-rw-r--r--  kernel/user.c                |   27
-rw-r--r--  kernel/user_namespace.c      |    2
-rw-r--r--  kernel/utsname.c             |    1
-rw-r--r--  kernel/workqueue.c           |   32
70 files changed, 2816 insertions, 2410 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c5f081132a4..1c9938addb9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,9 +9,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
-obj-$(CONFIG_SYSCTL) += sysctl_check.o
+obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index a7b16086d36f..b7d3709cc452 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,6 +126,8 @@ static int audit_freelist_count;
 static LIST_HEAD(audit_freelist);
 
 static struct sk_buff_head audit_skb_queue;
+/* queue of skbs to send to auditd when/if it comes back */
+static struct sk_buff_head audit_skb_hold_queue;
 static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
@@ -154,6 +156,11 @@ struct audit_buffer {
 	gfp_t		     gfp_mask;
 };
 
+struct audit_reply {
+	int pid;
+	struct sk_buff *skb;
+};
+
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 {
 	if (ab) {
@@ -252,14 +259,15 @@ void audit_log_lost(const char *message)
 }
 
 static int audit_log_config_change(char *function_name, int new, int old,
-				   uid_t loginuid, u32 sid, int allow_changes)
+				   uid_t loginuid, u32 sessionid, u32 sid,
+				   int allow_changes)
 {
 	struct audit_buffer *ab;
 	int rc = 0;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-	audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new,
-			 old, loginuid);
+	audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
+			 old, loginuid, sessionid);
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
@@ -279,7 +287,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
 }
 
 static int audit_do_config_change(char *function_name, int *to_change,
-				  int new, uid_t loginuid, u32 sid)
+				  int new, uid_t loginuid, u32 sessionid,
+				  u32 sid)
 {
 	int allow_changes, rc = 0, old = *to_change;
 
@@ -290,8 +299,8 @@ static int audit_do_config_change(char *function_name, int *to_change,
 		allow_changes = 1;
 
 	if (audit_enabled != AUDIT_OFF) {
-		rc = audit_log_config_change(function_name, new, old,
-					     loginuid, sid, allow_changes);
+		rc = audit_log_config_change(function_name, new, old, loginuid,
+					     sessionid, sid, allow_changes);
 		if (rc)
 			allow_changes = 0;
 	}
@@ -305,26 +314,28 @@ static int audit_do_config_change(char *function_name, int *to_change,
 	return rc;
 }
 
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
+static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
+				u32 sid)
 {
 	return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
-				      limit, loginuid, sid);
+				      limit, loginuid, sessionid, sid);
 }
 
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
+static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
+				   u32 sid)
 {
 	return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
-				      limit, loginuid, sid);
+				      limit, loginuid, sessionid, sid);
 }
 
-static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
+static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	int rc;
 	if (state < AUDIT_OFF || state > AUDIT_LOCKED)
 		return -EINVAL;
 
 	rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
-				    loginuid, sid);
+				    loginuid, sessionid, sid);
 
 	if (!rc)
 		audit_ever_enabled |= !!state;
@@ -332,7 +343,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 	return rc;
 }
 
-static int audit_set_failure(int state, uid_t loginuid, u32 sid)
+static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	if (state != AUDIT_FAIL_SILENT
 	    && state != AUDIT_FAIL_PRINTK
@@ -340,7 +351,43 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 		return -EINVAL;
 
 	return audit_do_config_change("audit_failure", &audit_failure, state,
-				      loginuid, sid);
+				      loginuid, sessionid, sid);
+}
+
+/*
+ * Queue skbs to be sent to auditd when/if it comes back.  These skbs should
+ * already have been sent via prink/syslog and so if these messages are dropped
+ * it is not a huge concern since we already passed the audit_log_lost()
+ * notification and stuff.  This is just nice to get audit messages during
+ * boot before auditd is running or messages generated while auditd is stopped.
+ * This only holds messages is audit_default is set, aka booting with audit=1
+ * or building your kernel that way.
+ */
+static void audit_hold_skb(struct sk_buff *skb)
+{
+	if (audit_default &&
+	    skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+		skb_queue_tail(&audit_skb_hold_queue, skb);
+	else
+		kfree_skb(skb);
+}
+
+static void kauditd_send_skb(struct sk_buff *skb)
+{
+	int err;
+	/* take a reference in case we can't send it and we want to hold it */
+	skb_get(skb);
+	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+	if (err < 0) {
+		BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+		audit_log_lost("auditd dissapeared\n");
+		audit_pid = 0;
+		/* we might get lucky and get this in the next auditd */
+		audit_hold_skb(skb);
+	} else
+		/* drop the extra reference if sent ok */
+		kfree_skb(skb);
 }
 
 static int kauditd_thread(void *dummy)
@@ -349,24 +396,41 @@ static int kauditd_thread(void *dummy)
 
 	set_freezable();
 	while (!kthread_should_stop()) {
+		/*
+		 * if auditd just started drain the queue of messages already
+		 * sent to syslog/printk.  remember loss here is ok.  we already
+		 * called audit_log_lost() if it didn't go out normally.  so the
+		 * race between the skb_dequeue and the next check for audit_pid
+		 * doesn't matter.
+		 *
+		 * if you ever find kauditd to be too slow we can get a perf win
+		 * by doing our own locking and keeping better track if there
+		 * are messages in this queue.  I don't see the need now, but
+		 * in 5 years when I want to play with this again I'll see this
+		 * note and still have no friggin idea what i'm thinking today.
+		 */
+		if (audit_default && audit_pid) {
+			skb = skb_dequeue(&audit_skb_hold_queue);
+			if (unlikely(skb)) {
+				while (skb && audit_pid) {
+					kauditd_send_skb(skb);
+					skb = skb_dequeue(&audit_skb_hold_queue);
+				}
+			}
+		}
+
 		skb = skb_dequeue(&audit_skb_queue);
 		wake_up(&audit_backlog_wait);
 		if (skb) {
-			if (audit_pid) {
-				int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
-				if (err < 0) {
-					BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
-					printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-					audit_log_lost("auditd dissapeared\n");
-					audit_pid = 0;
-				}
-			} else {
+			if (audit_pid)
+				kauditd_send_skb(skb);
+			else {
 				if (printk_ratelimit())
-					printk(KERN_NOTICE "%s\n", skb->data +
-						NLMSG_SPACE(0));
+					printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
 				else
 					audit_log_lost("printk limit exceeded\n");
-				kfree_skb(skb);
+
+				audit_hold_skb(skb);
 			}
 		} else {
 			DECLARE_WAITQUEUE(wait, current);
@@ -385,13 +449,13 @@ static int kauditd_thread(void *dummy)
 	return 0;
 }
 
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
+static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
 {
 	struct task_struct *tsk;
 	int err;
 
 	read_lock(&tasklist_lock);
-	tsk = find_task_by_pid(pid);
+	tsk = find_task_by_vpid(pid);
 	err = -ESRCH;
 	if (!tsk)
 		goto out;
@@ -404,7 +468,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
 	if (err)
 		goto out;
 
-	tty_audit_push_task(tsk, loginuid);
+	tty_audit_push_task(tsk, loginuid, sessionid);
 out:
 	read_unlock(&tasklist_lock);
 	return err;
@@ -469,6 +533,19 @@ nlmsg_failure: /* Used by NLMSG_PUT */
 	return NULL;
 }
 
+static int audit_send_reply_thread(void *arg)
+{
+	struct audit_reply *reply = (struct audit_reply *)arg;
+
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	/* Ignore failure. It'll only happen if the sender goes away,
+	   because our timeout is set to infinite. */
+	netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+	kfree(reply);
+	return 0;
+}
 /**
  * audit_send_reply - send an audit reply message via netlink
  * @pid: process id to send reply to
@@ -485,14 +562,26 @@ nlmsg_failure: /* Used by NLMSG_PUT */
 void audit_send_reply(int pid, int seq, int type, int done, int multi,
 		      void *payload, int size)
 {
 	struct sk_buff *skb;
+	struct task_struct *tsk;
+	struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
+					    GFP_KERNEL);
+
+	if (!reply)
+		return;
+
 	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
 		return;
-	/* Ignore failure. It'll only happen if the sender goes away,
-	   because our timeout is set to infinite. */
-	netlink_unicast(audit_sock, skb, pid, 0);
-	return;
+
+	reply->pid = pid;
+	reply->skb = skb;
+
+	tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
+	if (IS_ERR(tsk)) {
+		kfree(reply);
+		kfree_skb(skb);
+	}
 }
 
 /*
@@ -534,7 +623,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 }
 
 static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
-				     u32 pid, u32 uid, uid_t auid, u32 sid)
+				     u32 pid, u32 uid, uid_t auid, u32 ses,
+				     u32 sid)
 {
 	int rc = 0;
 	char *ctx = NULL;
@@ -546,8 +636,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
 	}
 
 	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
-	audit_log_format(*ab, "user pid=%d uid=%u auid=%u",
-			 pid, uid, auid);
+	audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+			 pid, uid, auid, ses);
 	if (sid) {
 		rc = security_secid_to_secctx(sid, &ctx, &len);
 		if (rc)
@@ -570,6 +660,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
+	u32			sessionid;
 	struct audit_sig_info	*sig_data;
 	char			*ctx = NULL;
 	u32			len;
@@ -591,6 +682,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	pid  = NETLINK_CREDS(skb)->pid;
 	uid  = NETLINK_CREDS(skb)->uid;
 	loginuid = NETLINK_CB(skb).loginuid;
+	sessionid = NETLINK_CB(skb).sessionid;
 	sid  = NETLINK_CB(skb).sid;
 	seq  = nlh->nlmsg_seq;
 	data = NLMSG_DATA(nlh);
@@ -613,12 +705,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		status_get   = (struct audit_status *)data;
 		if (status_get->mask & AUDIT_STATUS_ENABLED) {
 			err = audit_set_enabled(status_get->enabled,
-						loginuid, sid);
+						loginuid, sessionid, sid);
 			if (err < 0) return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_FAILURE) {
 			err = audit_set_failure(status_get->failure,
-						loginuid, sid);
+						loginuid, sessionid, sid);
 			if (err < 0) return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_PID) {
@@ -627,17 +719,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			if (audit_enabled != AUDIT_OFF)
 				audit_log_config_change("audit_pid", new_pid,
 							audit_pid, loginuid,
-							sid, 1);
+							sessionid, sid, 1);
 
 			audit_pid = new_pid;
 			audit_nlk_pid = NETLINK_CB(skb).pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
 			err = audit_set_rate_limit(status_get->rate_limit,
-						   loginuid, sid);
+						   loginuid, sessionid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
 			err = audit_set_backlog_limit(status_get->backlog_limit,
-						      loginuid, sid);
+						      loginuid, sessionid, sid);
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -649,12 +741,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (err == 1) {
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
-				err = audit_prepare_user_tty(pid, loginuid);
+				err = audit_prepare_user_tty(pid, loginuid,
+							     sessionid);
 				if (err)
 					break;
 			}
 			audit_log_common_recv_msg(&ab, msg_type, pid, uid,
-						  loginuid, sid);
+						  loginuid, sessionid, sid);
 
 			if (msg_type != AUDIT_USER_TTY)
 				audit_log_format(ab, " msg='%.1024s'",
@@ -664,8 +757,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 				audit_log_format(ab, " msg=");
 				size = nlmsg_len(nlh);
-				audit_log_n_untrustedstring(ab, size,
-							    data);
+				audit_log_n_untrustedstring(ab, data, size);
 			}
 			audit_set_pid(ab, pid);
 			audit_log_end(ab);
@@ -677,7 +769,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
 			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-						  uid, loginuid, sid);
+						  uid, loginuid, sessionid, sid);
 
 			audit_log_format(ab, " audit_enabled=%d res=0",
 					 audit_enabled);
@@ -688,7 +780,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_LIST:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
-					   loginuid, sid);
+					   loginuid, sessionid, sid);
 		break;
 	case AUDIT_ADD_RULE:
 	case AUDIT_DEL_RULE:
@@ -696,7 +788,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
 			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-						  uid, loginuid, sid);
+						  uid, loginuid, sessionid, sid);
 
 			audit_log_format(ab, " audit_enabled=%d res=0",
 					 audit_enabled);
@@ -707,13 +799,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_LIST_RULES:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
-					   loginuid, sid);
+					   loginuid, sessionid, sid);
 		break;
 	case AUDIT_TRIM:
 		audit_trim_trees();
 
 		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-					  uid, loginuid, sid);
+					  uid, loginuid, sessionid, sid);
 
 		audit_log_format(ab, " op=trim res=1");
 		audit_log_end(ab);
@@ -721,21 +813,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_MAKE_EQUIV: {
 		void *bufp = data;
 		u32 sizes[2];
-		size_t len = nlmsg_len(nlh);
+		size_t msglen = nlmsg_len(nlh);
 		char *old, *new;
 
 		err = -EINVAL;
-		if (len < 2 * sizeof(u32))
+		if (msglen < 2 * sizeof(u32))
 			break;
 		memcpy(sizes, bufp, 2 * sizeof(u32));
 		bufp += 2 * sizeof(u32);
-		len -= 2 * sizeof(u32);
-		old = audit_unpack_string(&bufp, &len, sizes[0]);
+		msglen -= 2 * sizeof(u32);
+		old = audit_unpack_string(&bufp, &msglen, sizes[0]);
 		if (IS_ERR(old)) {
 			err = PTR_ERR(old);
 			break;
 		}
-		new = audit_unpack_string(&bufp, &len, sizes[1]);
+		new = audit_unpack_string(&bufp, &msglen, sizes[1]);
 		if (IS_ERR(new)) {
 			err = PTR_ERR(new);
 			kfree(old);
@@ -745,7 +837,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		err = audit_tag_tree(old, new);
 
 		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-					  uid, loginuid, sid);
+					  uid, loginuid, sessionid, sid);
 
 		audit_log_format(ab, " op=make_equiv old=");
 		audit_log_untrustedstring(ab, old);
@@ -779,7 +871,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		struct task_struct *tsk;
 
 		read_lock(&tasklist_lock);
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (!tsk)
 			err = -ESRCH;
 		else {
@@ -802,7 +894,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (s->enabled != 0 && s->enabled != 1)
 			return -EINVAL;
 		read_lock(&tasklist_lock);
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (!tsk)
 			err = -ESRCH;
 		else {
@@ -877,6 +969,7 @@ static int __init audit_init(void)
 	audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 
 	skb_queue_head_init(&audit_skb_queue);
+	skb_queue_head_init(&audit_skb_hold_queue);
 	audit_initialized = 1;
 	audit_enabled = audit_default;
 	audit_ever_enabled |= !!audit_default;
@@ -1199,7 +1292,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
  * This function will take the passed buf and convert it into a string of
  * ascii hex digits. The new string is placed onto the skb.
  */
-void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
+void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
 		size_t len)
 {
 	int i, avail, new_len;
@@ -1235,8 +1328,8 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
  * Format a string of no more than slen characters into the audit buffer,
 * enclosed in quote marks.
 */
-static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
-			       const char *string)
+void audit_log_n_string(struct audit_buffer *ab, const char *string,
+			size_t slen)
 {
 	int avail, new_len;
 	unsigned char *ptr;
@@ -1292,13 +1385,13 @@ int audit_string_contains_control(const char *string, size_t len)
 * The caller specifies the number of characters in the string to log, which may
 * or may not be the entire string.
 */
-void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
-				 const char *string)
+void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
+				 size_t len)
 {
 	if (audit_string_contains_control(string, len))
-		audit_log_hex(ab, string, len);
+		audit_log_n_hex(ab, string, len);
 	else
-		audit_log_n_string(ab, len, string);
+		audit_log_n_string(ab, string, len);
 }
 
 /**
@@ -1311,7 +1404,7 @@ void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
 */
 void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 {
-	audit_log_n_untrustedstring(ab, strlen(string), string);
+	audit_log_n_untrustedstring(ab, string, strlen(string));
 }
 
 /* This is a helper-function to print the escaped d_path */
@@ -1355,19 +1448,23 @@ void audit_log_end(struct audit_buffer *ab)
 		audit_log_lost("rate limit exceeded");
 	} else {
 		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+		nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
 		if (audit_pid) {
-			nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
 			skb_queue_tail(&audit_skb_queue, ab->skb);
-			ab->skb = NULL;
 			wake_up_interruptible(&kauditd_wait);
-		} else if (nlh->nlmsg_type != AUDIT_EOE) {
-			if (printk_ratelimit()) {
-				printk(KERN_NOTICE "type=%d %s\n",
-					nlh->nlmsg_type,
-					ab->skb->data + NLMSG_SPACE(0));
-			} else
-				audit_log_lost("printk limit exceeded\n");
+		} else {
+			if (nlh->nlmsg_type != AUDIT_EOE) {
+				if (printk_ratelimit()) {
+					printk(KERN_NOTICE "type=%d %s\n",
+						nlh->nlmsg_type,
+						ab->skb->data + NLMSG_SPACE(0));
+				} else
+					audit_log_lost("printk limit exceeded\n");
+			}
+			audit_hold_skb(ab->skb);
 		}
+		ab->skb = NULL;
 	}
 	audit_buffer_free(ab);
 }
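The kernel/audit.c hunks above introduce a bounded hold queue: records produced while auditd is absent are kept on audit_skb_hold_queue (up to audit_backlog_limit) and drained by kauditd once a daemon registers again. Below is a minimal user-space sketch of that pattern, assuming a plain singly linked FIFO and an illustrative backlog limit; it is not kernel code, and none of these names come from the commit.

/* Standalone illustration of the audit "hold queue" pattern: records produced
 * while the consumer (auditd) is absent are kept in a bounded FIFO and drained
 * once the consumer registers again.  Plain C, no kernel APIs; all names are
 * illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct record {
	struct record *next;
	char msg[64];
};

static struct record *head, *tail;          /* FIFO of held records */
static unsigned int held, backlog_limit = 4; /* illustrative limit */
static int consumer_present;                 /* stand-in for audit_pid != 0 */

static void hold_record(const char *msg)
{
	struct record *r;

	if (held >= backlog_limit) {         /* bounded: drop when full */
		fprintf(stderr, "dropped: %s\n", msg);
		return;
	}
	r = calloc(1, sizeof(*r));
	if (!r)
		return;
	snprintf(r->msg, sizeof(r->msg), "%s", msg);
	if (tail)
		tail->next = r;
	else
		head = r;
	tail = r;
	held++;
}

static void emit(const char *msg)
{
	if (consumer_present)
		printf("delivered: %s\n", msg); /* netlink_unicast() analogue */
	else
		hold_record(msg);               /* audit_hold_skb() analogue */
}

static void drain(void)                         /* run when consumer returns */
{
	while (head) {
		struct record *r = head;
		head = r->next;
		printf("delivered (late): %s\n", r->msg);
		free(r);
	}
	tail = NULL;
	held = 0;
}

int main(void)
{
	emit("boot message 1");
	emit("boot message 2");
	consumer_present = 1;   /* auditd started */
	drain();
	emit("runtime message");
	return 0;
}

Dropping when the queue is full mirrors the kernel's choice: these records were already passed to printk/syslog, so losing them from the hold queue is acceptable.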
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cfc54ee3e1f..9d6717412fec 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,6 +74,11 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
+#ifdef CONFIG_AUDIT
+extern int audit_enabled;
+extern int audit_ever_enabled;
+#endif
+
 extern int audit_pid;
 
 #define AUDIT_INODE_BUCKETS	32
@@ -104,6 +109,9 @@ struct audit_netlink_list {
 int audit_send_list(void *);
 
 struct inotify_watch;
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
 extern void audit_free_parent(struct inotify_watch *);
 extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
 				const char *, struct inode *);
@@ -111,6 +119,7 @@ extern int selinux_audit_rule_update(void);
 
 extern struct mutex audit_filter_mutex;
 extern void audit_free_rule_rcu(struct rcu_head *);
+extern struct list_head audit_filter_list[];
 
 #ifdef CONFIG_AUDIT_TREE
 extern struct audit_chunk *audit_tree_lookup(const struct inode *);
@@ -137,6 +146,10 @@ extern void audit_put_tree(struct audit_tree *);
 
 extern char *audit_unpack_string(void **, size_t *, size_t);
 
+extern pid_t audit_sig_pid;
+extern uid_t audit_sig_uid;
+extern u32 audit_sig_sid;
+
 #ifdef CONFIG_AUDITSYSCALL
 extern int __audit_signal_info(int sig, struct task_struct *t);
 static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 28fef6bf8534..0e0bd27e6512 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -89,14 +89,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 
 DEFINE_MUTEX(audit_filter_mutex);
 
-/* Inotify handle */
-extern struct inotify_handle *audit_ih;
-
 /* Inotify events we care about. */
 #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
 
-extern int audit_enabled;
-
 void audit_free_parent(struct inotify_watch *i_watch)
 {
 	struct audit_parent *parent;
@@ -272,7 +267,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
 		return -EINVAL;
 
 	watch = audit_init_watch(path);
-	if (unlikely(IS_ERR(watch)))
+	if (IS_ERR(watch))
 		return PTR_ERR(watch);
 
 	audit_get_watch(watch);
@@ -422,7 +417,7 @@ exit_err:
 static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 {
 	struct audit_entry *entry;
-	struct audit_field *f;
+	struct audit_field *ino_f;
 	int err = 0;
 	int i;
 
@@ -483,6 +478,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 			if (f->val & ~15)
 				goto exit_free;
 			break;
+		case AUDIT_FILETYPE:
+			if ((f->val & ~S_IFMT) > S_IFMT)
+				goto exit_free;
+			break;
 		case AUDIT_INODE:
 			err = audit_to_inode(&entry->rule, f);
 			if (err)
@@ -504,9 +503,9 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		}
 	}
 
-	f = entry->rule.inode_f;
-	if (f) {
-		switch(f->op) {
+	ino_f = entry->rule.inode_f;
+	if (ino_f) {
+		switch(ino_f->op) {
 		case AUDIT_NOT_EQUAL:
 			entry->rule.inode_f = NULL;
 		case AUDIT_EQUAL:
@@ -531,7 +530,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 {
 	int err = 0;
 	struct audit_entry *entry;
-	struct audit_field *f;
+	struct audit_field *ino_f;
 	void *bufp;
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
@@ -654,14 +653,18 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			if (f->val & ~15)
 				goto exit_free;
 			break;
+		case AUDIT_FILETYPE:
+			if ((f->val & ~S_IFMT) > S_IFMT)
+				goto exit_free;
+			break;
 		default:
 			goto exit_free;
 		}
 	}
 
-	f = entry->rule.inode_f;
-	if (f) {
-		switch(f->op) {
+	ino_f = entry->rule.inode_f;
+	if (ino_f) {
+		switch(ino_f->op) {
 		case AUDIT_NOT_EQUAL:
 			entry->rule.inode_f = NULL;
 		case AUDIT_EQUAL:
@@ -848,7 +851,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
 		return ERR_PTR(-ENOMEM);
 
 	new = audit_init_watch(path);
-	if (unlikely(IS_ERR(new))) {
+	if (IS_ERR(new)) {
 		kfree(path);
 		goto out;
 	}
@@ -989,7 +992,7 @@ static void audit_update_watch(struct audit_parent *parent,
 			audit_set_auditable(current->audit_context);
 
 	nwatch = audit_dupe_watch(owatch);
-	if (unlikely(IS_ERR(nwatch))) {
+	if (IS_ERR(nwatch)) {
 		mutex_unlock(&audit_filter_mutex);
 		audit_panic("error updating watch, skipping");
 		return;
@@ -1004,7 +1007,7 @@ static void audit_update_watch(struct audit_parent *parent,
 		list_del_rcu(&oentry->list);
 
 		nentry = audit_dupe_rule(&oentry->rule, nwatch);
-		if (unlikely(IS_ERR(nentry)))
+		if (IS_ERR(nentry))
 			audit_panic("error updating watch, removing");
 		else {
 			int h = audit_hash_ino((u32)ino);
@@ -1500,8 +1503,9 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
 }
 
 /* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
-				  struct audit_krule *rule, int res)
+static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
+				  char *action, struct audit_krule *rule,
+				  int res)
 {
 	struct audit_buffer *ab;
 
@@ -1511,7 +1515,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
 	if (!ab)
 		return;
-	audit_log_format(ab, "auid=%u", loginuid);
+	audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
@@ -1543,7 +1547,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 * @sid: SE Linux Security ID of sender
 */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-			 size_t datasz, uid_t loginuid, u32 sid)
+			 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	struct task_struct *tsk;
 	struct audit_netlink_list *dest;
@@ -1590,7 +1594,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_add_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err);
+		audit_log_rule_change(loginuid, sessionid, sid, "add",
+				      &entry->rule, !err);
 
 		if (err)
 			audit_free_rule(entry);
@@ -1606,8 +1611,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_del_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		audit_log_rule_change(loginuid, sid, "remove", &entry->rule,
-				      !err);
+		audit_log_rule_change(loginuid, sessionid, sid, "remove",
				      &entry->rule, !err);
 
 		audit_free_rule(entry);
 		break;
@@ -1785,7 +1790,7 @@ int audit_update_lsm_rules(void)
 		watch = entry->rule.watch;
 		tree = entry->rule.tree;
 		nentry = audit_dupe_rule(&entry->rule, watch);
-		if (unlikely(IS_ERR(nentry))) {
+		if (IS_ERR(nentry)) {
 			/* save the first error encountered for the
 			 * return value */
 			if (!err)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 56e56ed594a8..c10e7aae04d7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -68,9 +68,6 @@
 
 #include "audit.h"
 
-extern struct list_head audit_filter_list[];
-extern int audit_ever_enabled;
-
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
 * for saving names from getname(). */
 #define AUDIT_NAMES    20
@@ -283,6 +280,19 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
 	}
 }
 
+static int audit_match_filetype(struct audit_context *ctx, int which)
+{
+	unsigned index = which & ~S_IFMT;
+	mode_t mode = which & S_IFMT;
+	if (index >= ctx->name_count)
+		return 0;
+	if (ctx->names[index].ino == -1)
+		return 0;
+	if ((ctx->names[index].mode ^ mode) & S_IFMT)
+		return 0;
+	return 1;
+}
+
 /*
 * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
 * ->first_trees points to its beginning, ->trees - to the current end of data.
@@ -592,6 +602,9 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_PERM:
 			result = audit_match_perm(ctx, f->val);
 			break;
+		case AUDIT_FILETYPE:
+			result = audit_match_filetype(ctx, f->val);
+			break;
 		}
 
 		if (!result)
@@ -1095,7 +1108,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		audit_log_format(*ab, "[%d]", i);
 		audit_log_format(*ab, "=");
 		if (has_cntl)
-			audit_log_hex(*ab, buf, to_send);
+			audit_log_n_hex(*ab, buf, to_send);
 		else
 			audit_log_format(*ab, "\"%s\"", buf);
 		audit_log_format(*ab, "\n");
@@ -1296,7 +1309,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		break; }
 
 	case AUDIT_SOCKETCALL: {
-		int i;
 		struct audit_aux_data_socketcall *axs = (void *)aux;
 		audit_log_format(ab, "nargs=%d", axs->nargs);
 		for (i=0; i<axs->nargs; i++)
@@ -1307,7 +1319,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		struct audit_aux_data_sockaddr *axs = (void *)aux;
 
 		audit_log_format(ab, "saddr=");
-		audit_log_hex(ab, axs->a, axs->len);
+		audit_log_n_hex(ab, axs->a, axs->len);
 		break; }
 
 	case AUDIT_FD_PAIR: {
@@ -1321,7 +1333,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 	for (aux = context->aux_pids; aux; aux = aux->next) {
 		struct audit_aux_data_pids *axs = (void *)aux;
-		int i;
 
 		for (i = 0; i < axs->pid_count; i++)
 			if (audit_log_pid_context(context, axs->target_pid[i],
@@ -1371,8 +1382,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			default:
 				/* log the name's directory component */
 				audit_log_format(ab, " name=");
-				audit_log_n_untrustedstring(ab, n->name_len,
-							    n->name);
+				audit_log_n_untrustedstring(ab, n->name,
+							    n->name_len);
 			}
 		} else
 			audit_log_format(ab, " name=(null)");
@@ -1596,7 +1607,7 @@ static inline void handle_one(const struct inode *inode)
 	if (likely(put_tree_ref(context, chunk)))
 		return;
 	if (unlikely(!grow_tree_refs(context))) {
-		printk(KERN_WARNING "out of memory, audit has lost a tree reference");
+		printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
 		audit_set_auditable(context);
 		audit_put_chunk(chunk);
 		unroll_tree_refs(context, p, count);
@@ -1656,7 +1667,7 @@ retry:
 	}
 	/* too bad */
 	printk(KERN_WARNING
-		"out of memory, audit has lost a tree reference");
+		"out of memory, audit has lost a tree reference\n");
 	unroll_tree_refs(context, p, count);
 	audit_set_auditable(context);
 	return;
@@ -1752,13 +1763,13 @@ static int audit_inc_name_count(struct audit_context *context,
 	if (context->name_count >= AUDIT_NAMES) {
 		if (inode)
 			printk(KERN_DEBUG "name_count maxed, losing inode data: "
-			       "dev=%02x:%02x, inode=%lu",
+			       "dev=%02x:%02x, inode=%lu\n",
 			       MAJOR(inode->i_sb->s_dev),
 			       MINOR(inode->i_sb->s_dev),
 			       inode->i_ino);
 
 		else
-			printk(KERN_DEBUG "name_count maxed, losing inode data");
+			printk(KERN_DEBUG "name_count maxed, losing inode data\n");
 		return 1;
 	}
 	context->name_count++;
@@ -2361,9 +2372,6 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	struct audit_aux_data_pids *axp;
 	struct task_struct *tsk = current;
 	struct audit_context *ctx = tsk->audit_context;
-	extern pid_t audit_sig_pid;
-	extern uid_t audit_sig_uid;
-	extern u32 audit_sig_sid;
 
 	if (audit_pid && t->tgid == audit_pid) {
 		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
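The kernel/auditsc.c hunks above add AUDIT_FILETYPE matching: only the S_IFMT bits of a recorded inode mode are compared against the file type encoded in the rule value. A small user-space sketch of that comparison follows; it is not kernel code, and the example path is an assumption used only for illustration.

/* Standalone sketch of the AUDIT_FILETYPE test added above: a rule value
 * carries a file type in its S_IFMT bits, and a recorded mode matches when
 * those bits agree.  User-space C; the path below is illustrative.
 */
#include <stdio.h>
#include <sys/stat.h>

static int filetype_matches(mode_t recorded_mode, mode_t rule_val)
{
	/* same test as the kernel hunk: bits differ only outside S_IFMT => match */
	return ((recorded_mode ^ rule_val) & S_IFMT) == 0;
}

int main(void)
{
	struct stat st;

	if (stat("/etc/passwd", &st) != 0) {
		perror("stat");
		return 1;
	}
	printf("regular-file rule matches: %d\n",
	       filetype_matches(st.st_mode, S_IFREG));
	printf("directory rule matches:    %d\n",
	       filetype_matches(st.st_mode, S_IFDIR));
	return 0;
}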
diff --git a/kernel/bounds.c b/kernel/bounds.c
index c3c55544db2f..3c5301381837 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -8,11 +8,7 @@
 /* Include headers that define the enum constants of interest */
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
-
-#define DEFINE(sym, val) \
-	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : : )
+#include <linux/kbuild.h>
 
 void foo(void)
 {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6d8de051382b..fbc6fc8949b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -44,6 +44,7 @@
44#include <linux/kmod.h> 44#include <linux/kmod.h>
45#include <linux/delayacct.h> 45#include <linux/delayacct.h>
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h>
47 48
48#include <asm/atomic.h> 49#include <asm/atomic.h>
49 50
@@ -118,17 +119,7 @@ static int root_count;
118 * be called. 119 * be called.
119 */ 120 */
120static int need_forkexit_callback; 121static int need_forkexit_callback;
121 122static int need_mm_owner_callback __read_mostly;
122/* bits in struct cgroup flags field */
123enum {
124 /* Control Group is dead */
125 CGRP_REMOVED,
126 /* Control Group has previously had a child cgroup or a task,
127 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
128 CGRP_RELEASABLE,
129 /* Control Group requires release notifications to userspace */
130 CGRP_NOTIFY_ON_RELEASE,
131};
132 123
133/* convenient tests for these bits */ 124/* convenient tests for these bits */
134inline int cgroup_is_removed(const struct cgroup *cgrp) 125inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link;
204static DEFINE_RWLOCK(css_set_lock); 195static DEFINE_RWLOCK(css_set_lock);
205static int css_set_count; 196static int css_set_count;
206 197
198/* hash table for cgroup groups. This improves the performance to
199 * find an existing css_set */
200#define CSS_SET_HASH_BITS 7
201#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
202static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
203
204static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
205{
206 int i;
207 int index;
208 unsigned long tmp = 0UL;
209
210 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
211 tmp += (unsigned long)css[i];
212 tmp = (tmp >> 16) ^ tmp;
213
214 index = hash_long(tmp, CSS_SET_HASH_BITS);
215
216 return &css_set_table[index];
217}
218
207/* We don't maintain the lists running through each css_set to its 219/* We don't maintain the lists running through each css_set to its
208 * task until after the first call to cgroup_iter_start(). This 220 * task until after the first call to cgroup_iter_start(). This
209 * reduces the fork()/exit() overhead for people who have cgroups 221 * reduces the fork()/exit() overhead for people who have cgroups
@@ -230,7 +242,7 @@ static int use_task_css_set_links;
230static void unlink_css_set(struct css_set *cg) 242static void unlink_css_set(struct css_set *cg)
231{ 243{
232 write_lock(&css_set_lock); 244 write_lock(&css_set_lock);
233 list_del(&cg->list); 245 hlist_del(&cg->hlist);
234 css_set_count--; 246 css_set_count--;
235 while (!list_empty(&cg->cg_links)) { 247 while (!list_empty(&cg->cg_links)) {
236 struct cg_cgroup_link *link; 248 struct cg_cgroup_link *link;
@@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg)
295/* 307/*
296 * find_existing_css_set() is a helper for 308 * find_existing_css_set() is a helper for
297 * find_css_set(), and checks to see whether an existing 309 * find_css_set(), and checks to see whether an existing
298 * css_set is suitable. This currently walks a linked-list for 310 * css_set is suitable.
299 * simplicity; a later patch will use a hash table for better
300 * performance
301 * 311 *
302 * oldcg: the cgroup group that we're using before the cgroup 312 * oldcg: the cgroup group that we're using before the cgroup
303 * transition 313 * transition
@@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set(
314{ 324{
315 int i; 325 int i;
316 struct cgroupfs_root *root = cgrp->root; 326 struct cgroupfs_root *root = cgrp->root;
317 struct list_head *l = &init_css_set.list; 327 struct hlist_head *hhead;
328 struct hlist_node *node;
329 struct css_set *cg;
318 330
319 /* Built the set of subsystem state objects that we want to 331 /* Built the set of subsystem state objects that we want to
320 * see in the new css_set */ 332 * see in the new css_set */
@@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set(
331 } 343 }
332 } 344 }
333 345
334 /* Look through existing cgroup groups to find one to reuse */ 346 hhead = css_set_hash(template);
335 do { 347 hlist_for_each_entry(cg, node, hhead, hlist) {
336 struct css_set *cg =
337 list_entry(l, struct css_set, list);
338
339 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 348 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
340 /* All subsystems matched */ 349 /* All subsystems matched */
341 return cg; 350 return cg;
342 } 351 }
343 /* Try the next cgroup group */ 352 }
344 l = l->next;
345 } while (l != &init_css_set.list);
346 353
347 /* No existing cgroup group matched */ 354 /* No existing cgroup group matched */
348 return NULL; 355 return NULL;
@@ -404,6 +411,8 @@ static struct css_set *find_css_set(
404 struct list_head tmp_cg_links; 411 struct list_head tmp_cg_links;
405 struct cg_cgroup_link *link; 412 struct cg_cgroup_link *link;
406 413
414 struct hlist_head *hhead;
415
407 /* First see if we already have a cgroup group that matches 416 /* First see if we already have a cgroup group that matches
408 * the desired set */ 417 * the desired set */
409 write_lock(&css_set_lock); 418 write_lock(&css_set_lock);
@@ -428,6 +437,7 @@ static struct css_set *find_css_set(
428 kref_init(&res->ref); 437 kref_init(&res->ref);
429 INIT_LIST_HEAD(&res->cg_links); 438 INIT_LIST_HEAD(&res->cg_links);
430 INIT_LIST_HEAD(&res->tasks); 439 INIT_LIST_HEAD(&res->tasks);
440 INIT_HLIST_NODE(&res->hlist);
431 441
432 /* Copy the set of subsystem state objects generated in 442 /* Copy the set of subsystem state objects generated in
433 * find_existing_css_set() */ 443 * find_existing_css_set() */
@@ -467,9 +477,12 @@ static struct css_set *find_css_set(
467 477
468 BUG_ON(!list_empty(&tmp_cg_links)); 478 BUG_ON(!list_empty(&tmp_cg_links));
469 479
470 /* Link this cgroup group into the list */
471 list_add(&res->list, &init_css_set.list);
472 css_set_count++; 480 css_set_count++;
481
482 /* Add this cgroup group to the hash table */
483 hhead = css_set_hash(res->subsys);
484 hlist_add_head(&res->hlist, hhead);
485
473 write_unlock(&css_set_lock); 486 write_unlock(&css_set_lock);
474 487
475 return res; 488 return res;
@@ -562,7 +575,7 @@ static struct inode_operations cgroup_dir_inode_operations;
562static struct file_operations proc_cgroupstats_operations; 575static struct file_operations proc_cgroupstats_operations;
563 576
564static struct backing_dev_info cgroup_backing_dev_info = { 577static struct backing_dev_info cgroup_backing_dev_info = {
565 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 578 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
566}; 579};
567 580
568static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 581static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
@@ -948,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
948 int ret = 0; 961 int ret = 0;
949 struct super_block *sb; 962 struct super_block *sb;
950 struct cgroupfs_root *root; 963 struct cgroupfs_root *root;
951 struct list_head tmp_cg_links, *l; 964 struct list_head tmp_cg_links;
952 INIT_LIST_HEAD(&tmp_cg_links); 965 INIT_LIST_HEAD(&tmp_cg_links);
953 966
954 /* First find the desired set of subsystems */ 967 /* First find the desired set of subsystems */
@@ -990,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
990 /* New superblock */ 1003 /* New superblock */
991 struct cgroup *cgrp = &root->top_cgroup; 1004 struct cgroup *cgrp = &root->top_cgroup;
992 struct inode *inode; 1005 struct inode *inode;
1006 int i;
993 1007
994 BUG_ON(sb->s_root != NULL); 1008 BUG_ON(sb->s_root != NULL);
995 1009
@@ -1034,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1034 /* Link the top cgroup in this hierarchy into all 1048 /* Link the top cgroup in this hierarchy into all
1035 * the css_set objects */ 1049 * the css_set objects */
1036 write_lock(&css_set_lock); 1050 write_lock(&css_set_lock);
1037 l = &init_css_set.list; 1051 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
1038 do { 1052 struct hlist_head *hhead = &css_set_table[i];
1053 struct hlist_node *node;
1039 struct css_set *cg; 1054 struct css_set *cg;
1040 struct cg_cgroup_link *link; 1055
1041 cg = list_entry(l, struct css_set, list); 1056 hlist_for_each_entry(cg, node, hhead, hlist) {
1042 BUG_ON(list_empty(&tmp_cg_links)); 1057 struct cg_cgroup_link *link;
1043 link = list_entry(tmp_cg_links.next, 1058
1044 struct cg_cgroup_link, 1059 BUG_ON(list_empty(&tmp_cg_links));
1045 cgrp_link_list); 1060 link = list_entry(tmp_cg_links.next,
1046 list_del(&link->cgrp_link_list); 1061 struct cg_cgroup_link,
1047 link->cg = cg; 1062 cgrp_link_list);
1048 list_add(&link->cgrp_link_list, 1063 list_del(&link->cgrp_link_list);
1049 &root->top_cgroup.css_sets); 1064 link->cg = cg;
1050 list_add(&link->cg_link_list, &cg->cg_links); 1065 list_add(&link->cgrp_link_list,
1051 l = l->next; 1066 &root->top_cgroup.css_sets);
1052 } while (l != &init_css_set.list); 1067 list_add(&link->cg_link_list, &cg->cg_links);
1068 }
1069 }
1053 write_unlock(&css_set_lock); 1070 write_unlock(&css_set_lock);
1054 1071
1055 free_cg_links(&tmp_cg_links); 1072 free_cg_links(&tmp_cg_links);
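
With no global css_set list left, "visit every css_set" becomes "walk every bucket of css_set_table", which is what the rewritten cgroup_get_sb() loop above does under css_set_lock. The same traversal pattern in isolation, as a sketch; the helper name is hypothetical, while css_set_table, struct css_set and the 2.6.25-era four-argument hlist_for_each_entry() are used exactly as in the hunk.

/* Visit every css_set in the system by walking each hash bucket. */
static void for_each_css_set_sketch(void (*fn)(struct css_set *cg))
{
	int i;

	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
		struct hlist_head *hhead = &css_set_table[i];
		struct hlist_node *node;
		struct css_set *cg;

		hlist_for_each_entry(cg, node, hhead, hlist)
			fn(cg);	/* caller is expected to hold css_set_lock */
	}
}
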
@@ -1307,18 +1324,16 @@ enum cgroup_filetype {
1307 FILE_DIR, 1324 FILE_DIR,
1308 FILE_TASKLIST, 1325 FILE_TASKLIST,
1309 FILE_NOTIFY_ON_RELEASE, 1326 FILE_NOTIFY_ON_RELEASE,
1310 FILE_RELEASABLE,
1311 FILE_RELEASE_AGENT, 1327 FILE_RELEASE_AGENT,
1312}; 1328};
1313 1329
1314static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, 1330static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1315 struct file *file, 1331 struct file *file,
1316 const char __user *userbuf, 1332 const char __user *userbuf,
1317 size_t nbytes, loff_t *unused_ppos) 1333 size_t nbytes, loff_t *unused_ppos)
1318{ 1334{
1319 char buffer[64]; 1335 char buffer[64];
1320 int retval = 0; 1336 int retval = 0;
1321 u64 val;
1322 char *end; 1337 char *end;
1323 1338
1324 if (!nbytes) 1339 if (!nbytes)
@@ -1329,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
1329 return -EFAULT; 1344 return -EFAULT;
1330 1345
1331 buffer[nbytes] = 0; /* nul-terminate */ 1346 buffer[nbytes] = 0; /* nul-terminate */
1332 1347 strstrip(buffer);
1333 /* strip newline if necessary */ 1348 if (cft->write_u64) {
1334 if (nbytes && (buffer[nbytes-1] == '\n')) 1349 u64 val = simple_strtoull(buffer, &end, 0);
1335 buffer[nbytes-1] = 0; 1350 if (*end)
1336 val = simple_strtoull(buffer, &end, 0); 1351 return -EINVAL;
1337 if (*end) 1352 retval = cft->write_u64(cgrp, cft, val);
1338 return -EINVAL; 1353 } else {
1339 1354 s64 val = simple_strtoll(buffer, &end, 0);
1340 /* Pass to subsystem */ 1355 if (*end)
1341 retval = cft->write_uint(cgrp, cft, val); 1356 return -EINVAL;
1357 retval = cft->write_s64(cgrp, cft, val);
1358 }
1342 if (!retval) 1359 if (!retval)
1343 retval = nbytes; 1360 retval = nbytes;
1344 return retval; 1361 return retval;
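
cgroup_write_uint() becomes cgroup_write_X64(): it strips whitespace, parses with simple_strtoull() or simple_strtoll() depending on whether the cftype supplies write_u64 or write_s64, rejects trailing garbage with -EINVAL, and on success the caller reports the whole write as consumed. Below is a sketch of a hypothetical signed handler pair that would plug into this path; the foo_* names are illustrative, not from this patch.

/* Hypothetical per-cgroup signed knob served through read_s64/write_s64. */
static s64 foo_nice_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* placeholder: return the stored value */
}

static int foo_nice_write(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
	if (val < -20 || val > 19)
		return -EINVAL;	/* propagated back to the writer */
	/* store val in the subsystem's per-cgroup state here */
	return 0;		/* 0 lets the caller return nbytes */
}

static struct cftype foo_nice_file = {
	.name = "nice",
	.read_s64 = foo_nice_read,
	.write_s64 = foo_nice_write,
};
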
@@ -1419,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1419 return -ENODEV; 1436 return -ENODEV;
1420 if (cft->write) 1437 if (cft->write)
1421 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1438 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1422 if (cft->write_uint) 1439 if (cft->write_u64 || cft->write_s64)
1423 return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); 1440 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
1441 if (cft->trigger) {
1442 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
1443 return ret ? ret : nbytes;
1444 }
1424 return -EINVAL; 1445 return -EINVAL;
1425} 1446}
1426 1447
1427static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, 1448static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1428 struct file *file, 1449 struct file *file,
1429 char __user *buf, size_t nbytes, 1450 char __user *buf, size_t nbytes,
1430 loff_t *ppos) 1451 loff_t *ppos)
1431{ 1452{
1432 char tmp[64]; 1453 char tmp[64];
1433 u64 val = cft->read_uint(cgrp, cft); 1454 u64 val = cft->read_u64(cgrp, cft);
1434 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1455 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1435 1456
1436 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1457 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1437} 1458}
1438 1459
1460static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1461 struct file *file,
1462 char __user *buf, size_t nbytes,
1463 loff_t *ppos)
1464{
1465 char tmp[64];
1466 s64 val = cft->read_s64(cgrp, cft);
1467 int len = sprintf(tmp, "%lld\n", (long long) val);
1468
1469 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1470}
1471
1439static ssize_t cgroup_common_file_read(struct cgroup *cgrp, 1472static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1440 struct cftype *cft, 1473 struct cftype *cft,
1441 struct file *file, 1474 struct file *file,
@@ -1490,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1490 1523
1491 if (cft->read) 1524 if (cft->read)
1492 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 1525 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1493 if (cft->read_uint) 1526 if (cft->read_u64)
1494 return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); 1527 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
1528 if (cft->read_s64)
1529 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
1495 return -EINVAL; 1530 return -EINVAL;
1496} 1531}
1497 1532
1533/*
1534 * seqfile ops/methods for returning structured data. Currently just
1535 * supports string->u64 maps, but can be extended in future.
1536 */
1537
1538struct cgroup_seqfile_state {
1539 struct cftype *cft;
1540 struct cgroup *cgroup;
1541};
1542
1543static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
1544{
1545 struct seq_file *sf = cb->state;
1546 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
1547}
1548
1549static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1550{
1551 struct cgroup_seqfile_state *state = m->private;
1552 struct cftype *cft = state->cft;
1553 if (cft->read_map) {
1554 struct cgroup_map_cb cb = {
1555 .fill = cgroup_map_add,
1556 .state = m,
1557 };
1558 return cft->read_map(state->cgroup, cft, &cb);
1559 }
1560 return cft->read_seq_string(state->cgroup, cft, m);
1561}
1562
1563int cgroup_seqfile_release(struct inode *inode, struct file *file)
1564{
1565 struct seq_file *seq = file->private_data;
1566 kfree(seq->private);
1567 return single_release(inode, file);
1568}
1569
1570static struct file_operations cgroup_seqfile_operations = {
1571 .read = seq_read,
1572 .llseek = seq_lseek,
1573 .release = cgroup_seqfile_release,
1574};
1575
1498static int cgroup_file_open(struct inode *inode, struct file *file) 1576static int cgroup_file_open(struct inode *inode, struct file *file)
1499{ 1577{
1500 int err; 1578 int err;
@@ -1507,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1507 cft = __d_cft(file->f_dentry); 1585 cft = __d_cft(file->f_dentry);
1508 if (!cft) 1586 if (!cft)
1509 return -ENODEV; 1587 return -ENODEV;
1510 if (cft->open) 1588 if (cft->read_map || cft->read_seq_string) {
1589 struct cgroup_seqfile_state *state =
1590 kzalloc(sizeof(*state), GFP_USER);
1591 if (!state)
1592 return -ENOMEM;
1593 state->cft = cft;
1594 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
1595 file->f_op = &cgroup_seqfile_operations;
1596 err = single_open(file, cgroup_seqfile_show, state);
1597 if (err < 0)
1598 kfree(state);
1599 } else if (cft->open)
1511 err = cft->open(inode, file); 1600 err = cft->open(inode, file);
1512 else 1601 else
1513 err = 0; 1602 err = 0;
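
The seqfile support added above gives cftypes two new read hooks: read_map for key/value output and read_seq_string for free-form seq_file output. cgroup_file_open() swaps in cgroup_seqfile_operations for any file that sets either one, and cgroup_seqfile_release() frees the per-open state. A sketch of a hypothetical read_map handler; the bar_* names are illustrative, not from this patch.

/* Emit a few named counters; each fill() becomes one "key value" line. */
static int bar_stat_read_map(struct cgroup *cgrp, struct cftype *cft,
			     struct cgroup_map_cb *cb)
{
	cb->fill(cb, "hits", 0);	/* placeholder values */
	cb->fill(cb, "misses", 0);
	return 0;
}

static struct cftype bar_stat_file = {
	.name = "stat",
	.read_map = bar_stat_read_map,
};
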
@@ -1715,7 +1804,7 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
1715 * The tasklist_lock is not held here, as do_each_thread() and 1804 * The tasklist_lock is not held here, as do_each_thread() and
1716 * while_each_thread() are protected by RCU. 1805 * while_each_thread() are protected by RCU.
1717 */ 1806 */
1718void cgroup_enable_task_cg_lists(void) 1807static void cgroup_enable_task_cg_lists(void)
1719{ 1808{
1720 struct task_struct *p, *g; 1809 struct task_struct *p, *g;
1721 write_lock(&css_set_lock); 1810 write_lock(&css_set_lock);
@@ -1913,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
1913 2002
1914 if (heap->size) { 2003 if (heap->size) {
1915 for (i = 0; i < heap->size; i++) { 2004 for (i = 0; i < heap->size; i++) {
1916 struct task_struct *p = heap->ptrs[i]; 2005 struct task_struct *q = heap->ptrs[i];
1917 if (i == 0) { 2006 if (i == 0) {
1918 latest_time = p->start_time; 2007 latest_time = q->start_time;
1919 latest_task = p; 2008 latest_task = q;
1920 } 2009 }
1921 /* Process the task per the caller's callback */ 2010 /* Process the task per the caller's callback */
1922 scan->process_task(p, scan); 2011 scan->process_task(q, scan);
1923 put_task_struct(p); 2012 put_task_struct(q);
1924 } 2013 }
1925 /* 2014 /*
1926 * If we had to process any tasks at all, scan again 2015 * If we had to process any tasks at all, scan again
@@ -2138,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2138 return notify_on_release(cgrp); 2227 return notify_on_release(cgrp);
2139} 2228}
2140 2229
2141static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
2142{
2143 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
2144}
2145
2146/* 2230/*
2147 * for the common functions, 'private' gives the type of file 2231 * for the common functions, 'private' gives the type of file
2148 */ 2232 */
@@ -2158,16 +2242,10 @@ static struct cftype files[] = {
2158 2242
2159 { 2243 {
2160 .name = "notify_on_release", 2244 .name = "notify_on_release",
2161 .read_uint = cgroup_read_notify_on_release, 2245 .read_u64 = cgroup_read_notify_on_release,
2162 .write = cgroup_common_file_write, 2246 .write = cgroup_common_file_write,
2163 .private = FILE_NOTIFY_ON_RELEASE, 2247 .private = FILE_NOTIFY_ON_RELEASE,
2164 }, 2248 },
2165
2166 {
2167 .name = "releasable",
2168 .read_uint = cgroup_read_releasable,
2169 .private = FILE_RELEASABLE,
2170 }
2171}; 2249};
2172 2250
2173static struct cftype cft_release_agent = { 2251static struct cftype cft_release_agent = {
@@ -2401,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2401 return 0; 2479 return 0;
2402} 2480}
2403 2481
2404static void cgroup_init_subsys(struct cgroup_subsys *ss) 2482static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2405{ 2483{
2406 struct cgroup_subsys_state *css; 2484 struct cgroup_subsys_state *css;
2407 struct list_head *l;
2408 2485
2409 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2486 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2410 2487
@@ -2415,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
2415 BUG_ON(IS_ERR(css)); 2492 BUG_ON(IS_ERR(css));
2416 init_cgroup_css(css, ss, dummytop); 2493 init_cgroup_css(css, ss, dummytop);
2417 2494
2418 /* Update all cgroup groups to contain a subsys 2495 /* Update the init_css_set to contain a subsys
2419 * pointer to this state - since the subsystem is 2496 * pointer to this state - since the subsystem is
2420 * newly registered, all tasks and hence all cgroup 2497 * newly registered, all tasks and hence the
2421 * groups are in the subsystem's top cgroup. */ 2498 * init_css_set is in the subsystem's top cgroup. */
2422 write_lock(&css_set_lock); 2499 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2423 l = &init_css_set.list;
2424 do {
2425 struct css_set *cg =
2426 list_entry(l, struct css_set, list);
2427 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2428 l = l->next;
2429 } while (l != &init_css_set.list);
2430 write_unlock(&css_set_lock);
2431
2432 /* If this subsystem requested that it be notified with fork
2433 * events, we should send it one now for every process in the
2434 * system */
2435 if (ss->fork) {
2436 struct task_struct *g, *p;
2437
2438 read_lock(&tasklist_lock);
2439 do_each_thread(g, p) {
2440 ss->fork(ss, p);
2441 } while_each_thread(g, p);
2442 read_unlock(&tasklist_lock);
2443 }
2444 2500
2445 need_forkexit_callback |= ss->fork || ss->exit; 2501 need_forkexit_callback |= ss->fork || ss->exit;
2502 need_mm_owner_callback |= !!ss->mm_owner_changed;
2503
2504 /* At system boot, before all subsystems have been
2505 * registered, no tasks have been forked, so we don't
2506 * need to invoke fork callbacks here. */
2507 BUG_ON(!list_empty(&init_task.tasks));
2446 2508
2447 ss->active = 1; 2509 ss->active = 1;
2448} 2510}
@@ -2458,9 +2520,9 @@ int __init cgroup_init_early(void)
2458 int i; 2520 int i;
2459 kref_init(&init_css_set.ref); 2521 kref_init(&init_css_set.ref);
2460 kref_get(&init_css_set.ref); 2522 kref_get(&init_css_set.ref);
2461 INIT_LIST_HEAD(&init_css_set.list);
2462 INIT_LIST_HEAD(&init_css_set.cg_links); 2523 INIT_LIST_HEAD(&init_css_set.cg_links);
2463 INIT_LIST_HEAD(&init_css_set.tasks); 2524 INIT_LIST_HEAD(&init_css_set.tasks);
2525 INIT_HLIST_NODE(&init_css_set.hlist);
2464 css_set_count = 1; 2526 css_set_count = 1;
2465 init_cgroup_root(&rootnode); 2527 init_cgroup_root(&rootnode);
2466 list_add(&rootnode.root_list, &roots); 2528 list_add(&rootnode.root_list, &roots);
@@ -2473,6 +2535,9 @@ int __init cgroup_init_early(void)
2473 list_add(&init_css_set_link.cg_link_list, 2535 list_add(&init_css_set_link.cg_link_list,
2474 &init_css_set.cg_links); 2536 &init_css_set.cg_links);
2475 2537
2538 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
2539 INIT_HLIST_HEAD(&css_set_table[i]);
2540
2476 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2541 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2477 struct cgroup_subsys *ss = subsys[i]; 2542 struct cgroup_subsys *ss = subsys[i];
2478 2543
@@ -2502,7 +2567,7 @@ int __init cgroup_init(void)
2502{ 2567{
2503 int err; 2568 int err;
2504 int i; 2569 int i;
2505 struct proc_dir_entry *entry; 2570 struct hlist_head *hhead;
2506 2571
2507 err = bdi_init(&cgroup_backing_dev_info); 2572 err = bdi_init(&cgroup_backing_dev_info);
2508 if (err) 2573 if (err)
@@ -2514,13 +2579,15 @@ int __init cgroup_init(void)
2514 cgroup_init_subsys(ss); 2579 cgroup_init_subsys(ss);
2515 } 2580 }
2516 2581
2582 /* Add init_css_set to the hash table */
2583 hhead = css_set_hash(init_css_set.subsys);
2584 hlist_add_head(&init_css_set.hlist, hhead);
2585
2517 err = register_filesystem(&cgroup_fs_type); 2586 err = register_filesystem(&cgroup_fs_type);
2518 if (err < 0) 2587 if (err < 0)
2519 goto out; 2588 goto out;
2520 2589
2521 entry = create_proc_entry("cgroups", 0, NULL); 2590 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
2522 if (entry)
2523 entry->proc_fops = &proc_cgroupstats_operations;
2524 2591
2525out: 2592out:
2526 if (err) 2593 if (err)
@@ -2683,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
2683 } 2750 }
2684} 2751}
2685 2752
2753#ifdef CONFIG_MM_OWNER
2754/**
2755 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2756 * @p: the new owner
2757 *
2758 * Called on every change to mm->owner. mm_init_owner() does not
2759 * invoke this routine, since it assigns the mm->owner the first time
2760 * and does not change it.
2761 */
2762void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2763{
2764 struct cgroup *oldcgrp, *newcgrp;
2765
2766 if (need_mm_owner_callback) {
2767 int i;
2768 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2769 struct cgroup_subsys *ss = subsys[i];
2770 oldcgrp = task_cgroup(old, ss->subsys_id);
2771 newcgrp = task_cgroup(new, ss->subsys_id);
2772 if (oldcgrp == newcgrp)
2773 continue;
2774 if (ss->mm_owner_changed)
2775 ss->mm_owner_changed(ss, oldcgrp, newcgrp);
2776 }
2777 }
2778}
2779#endif /* CONFIG_MM_OWNER */
2780
2686/** 2781/**
2687 * cgroup_post_fork - called on a new task after adding it to the task list 2782 * cgroup_post_fork - called on a new task after adding it to the task list
2688 * @child: the task in question 2783 * @child: the task in question
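
cgroup_mm_owner_callbacks() above fans an mm->owner change out to every subsystem that registered an mm_owner_changed hook; need_mm_owner_callback is set in cgroup_init_subsys() earlier in this diff, and kernel/exit.c below calls into this path from mm_update_next_owner(). The sketch below shows how a hypothetical subsystem might wire up that hook; the memctl_* names are illustrative, and the hook is assumed to return void since only its call site appears in these hunks.

/* Hypothetical subsystem reacting to mm->owner moving between cgroups. */
static void memctl_mm_owner_changed(struct cgroup_subsys *ss,
				    struct cgroup *old, struct cgroup *new)
{
	/*
	 * Both cgroups were already resolved with task_cgroup() by the
	 * caller; accounting state could be migrated from old to new here.
	 */
}

struct cgroup_subsys memctl_subsys = {
	.name			= "memctl",
	.mm_owner_changed	= memctl_mm_owner_changed,
	/* .create, .destroy, .subsys_id etc. omitted from this sketch */
};
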
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index 37301e877cb0..c3dc3aba4c02 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/ccontainer_debug.c - Example cgroup subsystem that 2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info 3 * exposes debug info
4 * 4 *
5 * Copyright (C) Google Inc, 2007 5 * Copyright (C) Google Inc, 2007
@@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
62 return count; 62 return count;
63} 63}
64 64
65static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
66{
67 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
68}
69
65static struct cftype files[] = { 70static struct cftype files[] = {
66 { 71 {
67 .name = "cgroup_refcount", 72 .name = "cgroup_refcount",
68 .read_uint = cgroup_refcount_read, 73 .read_u64 = cgroup_refcount_read,
69 }, 74 },
70 { 75 {
71 .name = "taskcount", 76 .name = "taskcount",
72 .read_uint = taskcount_read, 77 .read_u64 = taskcount_read,
73 }, 78 },
74 79
75 { 80 {
76 .name = "current_css_set", 81 .name = "current_css_set",
77 .read_uint = current_css_set_read, 82 .read_u64 = current_css_set_read,
78 }, 83 },
79 84
80 { 85 {
81 .name = "current_css_set_refcount", 86 .name = "current_css_set_refcount",
82 .read_uint = current_css_set_refcount_read, 87 .read_u64 = current_css_set_refcount_read,
83 }, 88 },
89
90 {
91 .name = "releasable",
92 .read_u64 = releasable_read,
93 }
84}; 94};
85 95
86static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/compat.c b/kernel/compat.c
index e1ef04870c2a..32c254a8ab9a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -898,7 +898,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
898 898
899 current->state = TASK_INTERRUPTIBLE; 899 current->state = TASK_INTERRUPTIBLE;
900 schedule(); 900 schedule();
901 set_thread_flag(TIF_RESTORE_SIGMASK); 901 set_restore_sigmask();
902 return -ERESTARTNOHAND; 902 return -ERESTARTNOHAND;
903} 903}
904#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 904#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
@@ -955,7 +955,8 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
955 __put_user(txc.jitcnt, &utp->jitcnt) || 955 __put_user(txc.jitcnt, &utp->jitcnt) ||
956 __put_user(txc.calcnt, &utp->calcnt) || 956 __put_user(txc.calcnt, &utp->calcnt) ||
957 __put_user(txc.errcnt, &utp->errcnt) || 957 __put_user(txc.errcnt, &utp->errcnt) ||
958 __put_user(txc.stbcnt, &utp->stbcnt)) 958 __put_user(txc.stbcnt, &utp->stbcnt) ||
959 __put_user(txc.tai, &utp->tai))
959 ret = -EFAULT; 960 ret = -EFAULT;
960 961
961 return ret; 962 return ret;
@@ -1080,4 +1081,3 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1080 1081
1081 return 0; 1082 return 0;
1082} 1083}
1083
diff --git a/kernel/configs.c b/kernel/configs.c
index e84d3f9c6c7b..4c345210ed8c 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,12 +79,11 @@ static int __init ikconfig_init(void)
79 struct proc_dir_entry *entry; 79 struct proc_dir_entry *entry;
80 80
81 /* create the current config file */ 81 /* create the current config file */
82 entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, 82 entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
83 &proc_root); 83 &ikconfig_file_ops);
84 if (!entry) 84 if (!entry)
85 return -ENOMEM; 85 return -ENOMEM;
86 86
87 entry->proc_fops = &ikconfig_file_ops;
88 entry->size = kernel_config_data_size; 87 entry->size = kernel_config_data_size;
89 88
90 return 0; 89 return 0;
@@ -95,7 +94,7 @@ static int __init ikconfig_init(void)
95 94
96static void __exit ikconfig_cleanup(void) 95static void __exit ikconfig_cleanup(void)
97{ 96{
98 remove_proc_entry("config.gz", &proc_root); 97 remove_proc_entry("config.gz", NULL);
99} 98}
100 99
101module_init(ikconfig_init); 100module_init(ikconfig_init);
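
This is the same create_proc_entry() to proc_create() conversion that recurs elsewhere in the series (the cgroups entry above and kernel/dma.c below): the file_operations pointer is handed over at creation time rather than patched into the entry afterwards, so the entry is never visible without its fops. A minimal sketch of the new idiom with a seq_file-backed read; the example_* names are illustrative, not from this patch.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

static const struct file_operations example_proc_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init example_init(void)
{
	/* fops supplied up front; a NULL parent means the /proc root */
	if (!proc_create("example", 0444, NULL, &example_proc_fops))
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	remove_proc_entry("example", NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
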
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2011ad8d2697..c77bc3a1c722 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -33,17 +33,13 @@ static struct {
33 * an ongoing cpu hotplug operation. 33 * an ongoing cpu hotplug operation.
34 */ 34 */
35 int refcount; 35 int refcount;
36 wait_queue_head_t writer_queue;
37} cpu_hotplug; 36} cpu_hotplug;
38 37
39#define writer_exists() (cpu_hotplug.active_writer != NULL)
40
41void __init cpu_hotplug_init(void) 38void __init cpu_hotplug_init(void)
42{ 39{
43 cpu_hotplug.active_writer = NULL; 40 cpu_hotplug.active_writer = NULL;
44 mutex_init(&cpu_hotplug.lock); 41 mutex_init(&cpu_hotplug.lock);
45 cpu_hotplug.refcount = 0; 42 cpu_hotplug.refcount = 0;
46 init_waitqueue_head(&cpu_hotplug.writer_queue);
47} 43}
48 44
49#ifdef CONFIG_HOTPLUG_CPU 45#ifdef CONFIG_HOTPLUG_CPU
@@ -65,11 +61,8 @@ void put_online_cpus(void)
65 if (cpu_hotplug.active_writer == current) 61 if (cpu_hotplug.active_writer == current)
66 return; 62 return;
67 mutex_lock(&cpu_hotplug.lock); 63 mutex_lock(&cpu_hotplug.lock);
68 cpu_hotplug.refcount--; 64 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
69 65 wake_up_process(cpu_hotplug.active_writer);
70 if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
71 wake_up(&cpu_hotplug.writer_queue);
72
73 mutex_unlock(&cpu_hotplug.lock); 66 mutex_unlock(&cpu_hotplug.lock);
74 67
75} 68}
@@ -98,8 +91,8 @@ void cpu_maps_update_done(void)
98 * Note that during a cpu-hotplug operation, the new readers, if any, 91 * Note that during a cpu-hotplug operation, the new readers, if any,
99 * will be blocked by the cpu_hotplug.lock 92 * will be blocked by the cpu_hotplug.lock
100 * 93 *
101 * Since cpu_maps_update_begin is always called after invoking 94 * Since cpu_hotplug_begin() is always called after invoking
102 * cpu_maps_update_begin, we can be sure that only one writer is active. 95 * cpu_maps_update_begin(), we can be sure that only one writer is active.
103 * 96 *
104 * Note that theoretically, there is a possibility of a livelock: 97 * Note that theoretically, there is a possibility of a livelock:
105 * - Refcount goes to zero, last reader wakes up the sleeping 98 * - Refcount goes to zero, last reader wakes up the sleeping
@@ -115,19 +108,16 @@ void cpu_maps_update_done(void)
115 */ 108 */
116static void cpu_hotplug_begin(void) 109static void cpu_hotplug_begin(void)
117{ 110{
118 DECLARE_WAITQUEUE(wait, current);
119
120 mutex_lock(&cpu_hotplug.lock);
121
122 cpu_hotplug.active_writer = current; 111 cpu_hotplug.active_writer = current;
123 add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); 112
124 while (cpu_hotplug.refcount) { 113 for (;;) {
125 set_current_state(TASK_UNINTERRUPTIBLE); 114 mutex_lock(&cpu_hotplug.lock);
115 if (likely(!cpu_hotplug.refcount))
116 break;
117 __set_current_state(TASK_UNINTERRUPTIBLE);
126 mutex_unlock(&cpu_hotplug.lock); 118 mutex_unlock(&cpu_hotplug.lock);
127 schedule(); 119 schedule();
128 mutex_lock(&cpu_hotplug.lock);
129 } 120 }
130 remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
131} 121}
132 122
133static void cpu_hotplug_done(void) 123static void cpu_hotplug_done(void)
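
The rewrite above drops the dedicated writer_queue: the would-be writer publishes itself in cpu_hotplug.active_writer and sleeps in TASK_UNINTERRUPTIBLE until put_online_cpus() (earlier hunk) drops the last reference and calls wake_up_process() on it. The same sleep-until-a-refcount-drains shape in isolation, as an illustrative sketch with generic names; the reader-acquire side mirrors what get_online_cpus() is assumed to do and is not taken from the hunks shown here.

#include <linux/mutex.h>
#include <linux/sched.h>

struct drain {
	struct mutex		lock;
	int			refcount;
	struct task_struct	*waiter;	/* parked writer, or NULL */
};

static void drain_get(struct drain *d)		/* reader side (assumed) */
{
	mutex_lock(&d->lock);
	d->refcount++;
	mutex_unlock(&d->lock);
}

static void drain_put(struct drain *d)		/* last reader wakes writer */
{
	mutex_lock(&d->lock);
	if (!--d->refcount && d->waiter)
		wake_up_process(d->waiter);
	mutex_unlock(&d->lock);
}

static void drain_wait(struct drain *d)		/* writer side */
{
	d->waiter = current;
	for (;;) {
		mutex_lock(&d->lock);
		if (!d->refcount)
			break;			/* returns with lock held */
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&d->lock);
		schedule();			/* woken by the last put */
	}
}
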
@@ -136,7 +126,7 @@ static void cpu_hotplug_done(void)
136 mutex_unlock(&cpu_hotplug.lock); 126 mutex_unlock(&cpu_hotplug.lock);
137} 127}
138/* Need to know about CPUs going up/down? */ 128/* Need to know about CPUs going up/down? */
139int __cpuinit register_cpu_notifier(struct notifier_block *nb) 129int __ref register_cpu_notifier(struct notifier_block *nb)
140{ 130{
141 int ret; 131 int ret;
142 cpu_maps_update_begin(); 132 cpu_maps_update_begin();
@@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb)
149 139
150EXPORT_SYMBOL(register_cpu_notifier); 140EXPORT_SYMBOL(register_cpu_notifier);
151 141
152void unregister_cpu_notifier(struct notifier_block *nb) 142void __ref unregister_cpu_notifier(struct notifier_block *nb)
153{ 143{
154 cpu_maps_update_begin(); 144 cpu_maps_update_begin();
155 raw_notifier_chain_unregister(&cpu_chain, nb); 145 raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -180,7 +170,7 @@ struct take_cpu_down_param {
180}; 170};
181 171
182/* Take this CPU down. */ 172/* Take this CPU down. */
183static int take_cpu_down(void *_param) 173static int __ref take_cpu_down(void *_param)
184{ 174{
185 struct take_cpu_down_param *param = _param; 175 struct take_cpu_down_param *param = _param;
186 int err; 176 int err;
@@ -199,7 +189,7 @@ static int take_cpu_down(void *_param)
199} 189}
200 190
201/* Requires cpu_add_remove_lock to be held */ 191/* Requires cpu_add_remove_lock to be held */
202static int _cpu_down(unsigned int cpu, int tasks_frozen) 192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
203{ 193{
204 int err, nr_calls = 0; 194 int err, nr_calls = 0;
205 struct task_struct *p; 195 struct task_struct *p;
@@ -225,7 +215,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
225 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 215 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
226 hcpu, nr_calls, NULL); 216 hcpu, nr_calls, NULL);
227 printk("%s: attempt to take down CPU %u failed\n", 217 printk("%s: attempt to take down CPU %u failed\n",
228 __FUNCTION__, cpu); 218 __func__, cpu);
229 err = -EINVAL; 219 err = -EINVAL;
230 goto out_release; 220 goto out_release;
231 } 221 }
@@ -274,7 +264,7 @@ out_release:
274 return err; 264 return err;
275} 265}
276 266
277int cpu_down(unsigned int cpu) 267int __ref cpu_down(unsigned int cpu)
278{ 268{
279 int err = 0; 269 int err = 0;
280 270
@@ -305,7 +295,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
305 if (ret == NOTIFY_BAD) { 295 if (ret == NOTIFY_BAD) {
306 nr_calls--; 296 nr_calls--;
307 printk("%s: attempt to bring up CPU %u failed\n", 297 printk("%s: attempt to bring up CPU %u failed\n",
308 __FUNCTION__, cpu); 298 __func__, cpu);
309 ret = -EINVAL; 299 ret = -EINVAL;
310 goto out_notify; 300 goto out_notify;
311 } 301 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 024888bb9814..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
127typedef enum { 127typedef enum {
128 CS_CPU_EXCLUSIVE, 128 CS_CPU_EXCLUSIVE,
129 CS_MEM_EXCLUSIVE, 129 CS_MEM_EXCLUSIVE,
130 CS_MEM_HARDWALL,
130 CS_MEMORY_MIGRATE, 131 CS_MEMORY_MIGRATE,
131 CS_SCHED_LOAD_BALANCE, 132 CS_SCHED_LOAD_BALANCE,
132 CS_SPREAD_PAGE, 133 CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
144 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 145 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
145} 146}
146 147
148static inline int is_mem_hardwall(const struct cpuset *cs)
149{
150 return test_bit(CS_MEM_HARDWALL, &cs->flags);
151}
152
147static inline int is_sched_load_balance(const struct cpuset *cs) 153static inline int is_sched_load_balance(const struct cpuset *cs)
148{ 154{
149 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 155 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -735,7 +741,8 @@ static inline int started_after(void *p1, void *p2)
735 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 741 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
736 * words, if its mask is not equal to its cpuset's mask). 742 * words, if its mask is not equal to its cpuset's mask).
737 */ 743 */
738int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 744static int cpuset_test_cpumask(struct task_struct *tsk,
745 struct cgroup_scanner *scan)
739{ 746{
740 return !cpus_equal(tsk->cpus_allowed, 747 return !cpus_equal(tsk->cpus_allowed,
741 (cgroup_cs(scan->cg))->cpus_allowed); 748 (cgroup_cs(scan->cg))->cpus_allowed);
@@ -752,7 +759,8 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
752 * We don't need to re-check for the cgroup/cpuset membership, since we're 759 * We don't need to re-check for the cgroup/cpuset membership, since we're
753 * holding cgroup_lock() at this point. 760 * holding cgroup_lock() at this point.
754 */ 761 */
755void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 762static void cpuset_change_cpumask(struct task_struct *tsk,
763 struct cgroup_scanner *scan)
756{ 764{
757 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); 765 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
758} 766}
@@ -1023,19 +1031,6 @@ int current_cpuset_is_being_rebound(void)
1023 return task_cs(current) == cpuset_being_rebound; 1031 return task_cs(current) == cpuset_being_rebound;
1024} 1032}
1025 1033
1026/*
1027 * Call with cgroup_mutex held.
1028 */
1029
1030static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1031{
1032 if (simple_strtoul(buf, NULL, 10) != 0)
1033 cpuset_memory_pressure_enabled = 1;
1034 else
1035 cpuset_memory_pressure_enabled = 0;
1036 return 0;
1037}
1038
1039static int update_relax_domain_level(struct cpuset *cs, char *buf) 1034static int update_relax_domain_level(struct cpuset *cs, char *buf)
1040{ 1035{
1041 int val = simple_strtol(buf, NULL, 10); 1036 int val = simple_strtol(buf, NULL, 10);
@@ -1053,25 +1048,20 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
1053 1048
1054/* 1049/*
1055 * update_flag - read a 0 or a 1 in a file and update associated flag 1050 * update_flag - read a 0 or a 1 in a file and update associated flag
1056 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1051 * bit: the bit to update (see cpuset_flagbits_t)
1057 * CS_SCHED_LOAD_BALANCE, 1052 * cs: the cpuset to update
1058 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, 1053 * turning_on: whether the flag is being set or cleared
1059 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
1060 * cs: the cpuset to update
1061 * buf: the buffer where we read the 0 or 1
1062 * 1054 *
1063 * Call with cgroup_mutex held. 1055 * Call with cgroup_mutex held.
1064 */ 1056 */
1065 1057
1066static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1058static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1059 int turning_on)
1067{ 1060{
1068 int turning_on;
1069 struct cpuset trialcs; 1061 struct cpuset trialcs;
1070 int err; 1062 int err;
1071 int cpus_nonempty, balance_flag_changed; 1063 int cpus_nonempty, balance_flag_changed;
1072 1064
1073 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1074
1075 trialcs = *cs; 1065 trialcs = *cs;
1076 if (turning_on) 1066 if (turning_on)
1077 set_bit(bit, &trialcs.flags); 1067 set_bit(bit, &trialcs.flags);
@@ -1241,6 +1231,7 @@ typedef enum {
1241 FILE_MEMLIST, 1231 FILE_MEMLIST,
1242 FILE_CPU_EXCLUSIVE, 1232 FILE_CPU_EXCLUSIVE,
1243 FILE_MEM_EXCLUSIVE, 1233 FILE_MEM_EXCLUSIVE,
1234 FILE_MEM_HARDWALL,
1244 FILE_SCHED_LOAD_BALANCE, 1235 FILE_SCHED_LOAD_BALANCE,
1245 FILE_SCHED_RELAX_DOMAIN_LEVEL, 1236 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1246 FILE_MEMORY_PRESSURE_ENABLED, 1237 FILE_MEMORY_PRESSURE_ENABLED,
@@ -1265,7 +1256,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1265 return -E2BIG; 1256 return -E2BIG;
1266 1257
1267 /* +1 for nul-terminator */ 1258 /* +1 for nul-terminator */
1268 if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) 1259 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1260 if (!buffer)
1269 return -ENOMEM; 1261 return -ENOMEM;
1270 1262
1271 if (copy_from_user(buffer, userbuf, nbytes)) { 1263 if (copy_from_user(buffer, userbuf, nbytes)) {
@@ -1288,46 +1280,71 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1288 case FILE_MEMLIST: 1280 case FILE_MEMLIST:
1289 retval = update_nodemask(cs, buffer); 1281 retval = update_nodemask(cs, buffer);
1290 break; 1282 break;
1283 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1284 retval = update_relax_domain_level(cs, buffer);
1285 break;
1286 default:
1287 retval = -EINVAL;
1288 goto out2;
1289 }
1290
1291 if (retval == 0)
1292 retval = nbytes;
1293out2:
1294 cgroup_unlock();
1295out1:
1296 kfree(buffer);
1297 return retval;
1298}
1299
1300static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1301{
1302 int retval = 0;
1303 struct cpuset *cs = cgroup_cs(cgrp);
1304 cpuset_filetype_t type = cft->private;
1305
1306 cgroup_lock();
1307
1308 if (cgroup_is_removed(cgrp)) {
1309 cgroup_unlock();
1310 return -ENODEV;
1311 }
1312
1313 switch (type) {
1291 case FILE_CPU_EXCLUSIVE: 1314 case FILE_CPU_EXCLUSIVE:
1292 retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); 1315 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1293 break; 1316 break;
1294 case FILE_MEM_EXCLUSIVE: 1317 case FILE_MEM_EXCLUSIVE:
1295 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); 1318 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1296 break; 1319 break;
1297 case FILE_SCHED_LOAD_BALANCE: 1320 case FILE_MEM_HARDWALL:
1298 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); 1321 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1299 break; 1322 break;
1300 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1323 case FILE_SCHED_LOAD_BALANCE:
1301 retval = update_relax_domain_level(cs, buffer); 1324 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1302 break; 1325 break;
1303 case FILE_MEMORY_MIGRATE: 1326 case FILE_MEMORY_MIGRATE:
1304 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1327 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1305 break; 1328 break;
1306 case FILE_MEMORY_PRESSURE_ENABLED: 1329 case FILE_MEMORY_PRESSURE_ENABLED:
1307 retval = update_memory_pressure_enabled(cs, buffer); 1330 cpuset_memory_pressure_enabled = !!val;
1308 break; 1331 break;
1309 case FILE_MEMORY_PRESSURE: 1332 case FILE_MEMORY_PRESSURE:
1310 retval = -EACCES; 1333 retval = -EACCES;
1311 break; 1334 break;
1312 case FILE_SPREAD_PAGE: 1335 case FILE_SPREAD_PAGE:
1313 retval = update_flag(CS_SPREAD_PAGE, cs, buffer); 1336 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1314 cs->mems_generation = cpuset_mems_generation++; 1337 cs->mems_generation = cpuset_mems_generation++;
1315 break; 1338 break;
1316 case FILE_SPREAD_SLAB: 1339 case FILE_SPREAD_SLAB:
1317 retval = update_flag(CS_SPREAD_SLAB, cs, buffer); 1340 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1318 cs->mems_generation = cpuset_mems_generation++; 1341 cs->mems_generation = cpuset_mems_generation++;
1319 break; 1342 break;
1320 default: 1343 default:
1321 retval = -EINVAL; 1344 retval = -EINVAL;
1322 goto out2; 1345 break;
1323 } 1346 }
1324
1325 if (retval == 0)
1326 retval = nbytes;
1327out2:
1328 cgroup_unlock(); 1347 cgroup_unlock();
1329out1:
1330 kfree(buffer);
1331 return retval; 1348 return retval;
1332} 1349}
1333 1350
@@ -1389,33 +1406,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
1389 case FILE_MEMLIST: 1406 case FILE_MEMLIST:
1390 s += cpuset_sprintf_memlist(s, cs); 1407 s += cpuset_sprintf_memlist(s, cs);
1391 break; 1408 break;
1392 case FILE_CPU_EXCLUSIVE:
1393 *s++ = is_cpu_exclusive(cs) ? '1' : '0';
1394 break;
1395 case FILE_MEM_EXCLUSIVE:
1396 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1397 break;
1398 case FILE_SCHED_LOAD_BALANCE:
1399 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1400 break;
1401 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1409 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1402 s += sprintf(s, "%d", cs->relax_domain_level); 1410 s += sprintf(s, "%d", cs->relax_domain_level);
1403 break; 1411 break;
1404 case FILE_MEMORY_MIGRATE:
1405 *s++ = is_memory_migrate(cs) ? '1' : '0';
1406 break;
1407 case FILE_MEMORY_PRESSURE_ENABLED:
1408 *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1409 break;
1410 case FILE_MEMORY_PRESSURE:
1411 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1412 break;
1413 case FILE_SPREAD_PAGE:
1414 *s++ = is_spread_page(cs) ? '1' : '0';
1415 break;
1416 case FILE_SPREAD_SLAB:
1417 *s++ = is_spread_slab(cs) ? '1' : '0';
1418 break;
1419 default: 1412 default:
1420 retval = -EINVAL; 1413 retval = -EINVAL;
1421 goto out; 1414 goto out;
@@ -1428,121 +1421,137 @@ out:
1428 return retval; 1421 return retval;
1429} 1422}
1430 1423
1431 1424static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1432 1425{
1426 struct cpuset *cs = cgroup_cs(cont);
1427 cpuset_filetype_t type = cft->private;
1428 switch (type) {
1429 case FILE_CPU_EXCLUSIVE:
1430 return is_cpu_exclusive(cs);
1431 case FILE_MEM_EXCLUSIVE:
1432 return is_mem_exclusive(cs);
1433 case FILE_MEM_HARDWALL:
1434 return is_mem_hardwall(cs);
1435 case FILE_SCHED_LOAD_BALANCE:
1436 return is_sched_load_balance(cs);
1437 case FILE_MEMORY_MIGRATE:
1438 return is_memory_migrate(cs);
1439 case FILE_MEMORY_PRESSURE_ENABLED:
1440 return cpuset_memory_pressure_enabled;
1441 case FILE_MEMORY_PRESSURE:
1442 return fmeter_getrate(&cs->fmeter);
1443 case FILE_SPREAD_PAGE:
1444 return is_spread_page(cs);
1445 case FILE_SPREAD_SLAB:
1446 return is_spread_slab(cs);
1447 default:
1448 BUG();
1449 }
1450}
1433 1451
1434 1452
1435/* 1453/*
1436 * for the common functions, 'private' gives the type of file 1454 * for the common functions, 'private' gives the type of file
1437 */ 1455 */
1438 1456
1439static struct cftype cft_cpus = { 1457static struct cftype files[] = {
1440 .name = "cpus", 1458 {
1441 .read = cpuset_common_file_read, 1459 .name = "cpus",
1442 .write = cpuset_common_file_write, 1460 .read = cpuset_common_file_read,
1443 .private = FILE_CPULIST, 1461 .write = cpuset_common_file_write,
1444}; 1462 .private = FILE_CPULIST,
1445 1463 },
1446static struct cftype cft_mems = { 1464
1447 .name = "mems", 1465 {
1448 .read = cpuset_common_file_read, 1466 .name = "mems",
1449 .write = cpuset_common_file_write, 1467 .read = cpuset_common_file_read,
1450 .private = FILE_MEMLIST, 1468 .write = cpuset_common_file_write,
1451}; 1469 .private = FILE_MEMLIST,
1452 1470 },
1453static struct cftype cft_cpu_exclusive = { 1471
1454 .name = "cpu_exclusive", 1472 {
1455 .read = cpuset_common_file_read, 1473 .name = "cpu_exclusive",
1456 .write = cpuset_common_file_write, 1474 .read_u64 = cpuset_read_u64,
1457 .private = FILE_CPU_EXCLUSIVE, 1475 .write_u64 = cpuset_write_u64,
1458}; 1476 .private = FILE_CPU_EXCLUSIVE,
1459 1477 },
1460static struct cftype cft_mem_exclusive = { 1478
1461 .name = "mem_exclusive", 1479 {
1462 .read = cpuset_common_file_read, 1480 .name = "mem_exclusive",
1463 .write = cpuset_common_file_write, 1481 .read_u64 = cpuset_read_u64,
1464 .private = FILE_MEM_EXCLUSIVE, 1482 .write_u64 = cpuset_write_u64,
1465}; 1483 .private = FILE_MEM_EXCLUSIVE,
1466 1484 },
1467static struct cftype cft_sched_load_balance = { 1485
1468 .name = "sched_load_balance", 1486 {
1469 .read = cpuset_common_file_read, 1487 .name = "mem_hardwall",
1470 .write = cpuset_common_file_write, 1488 .read_u64 = cpuset_read_u64,
1471 .private = FILE_SCHED_LOAD_BALANCE, 1489 .write_u64 = cpuset_write_u64,
1472}; 1490 .private = FILE_MEM_HARDWALL,
1473 1491 },
1474static struct cftype cft_sched_relax_domain_level = { 1492
1475 .name = "sched_relax_domain_level", 1493 {
1476 .read = cpuset_common_file_read, 1494 .name = "sched_load_balance",
1477 .write = cpuset_common_file_write, 1495 .read_u64 = cpuset_read_u64,
1478 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 1496 .write_u64 = cpuset_write_u64,
1479}; 1497 .private = FILE_SCHED_LOAD_BALANCE,
1480 1498 },
1481static struct cftype cft_memory_migrate = { 1499
1482 .name = "memory_migrate", 1500 {
1483 .read = cpuset_common_file_read, 1501 .name = "sched_relax_domain_level",
1484 .write = cpuset_common_file_write, 1502 .read_u64 = cpuset_read_u64,
1485 .private = FILE_MEMORY_MIGRATE, 1503 .write_u64 = cpuset_write_u64,
1504 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1505 },
1506
1507 {
1508 .name = "memory_migrate",
1509 .read_u64 = cpuset_read_u64,
1510 .write_u64 = cpuset_write_u64,
1511 .private = FILE_MEMORY_MIGRATE,
1512 },
1513
1514 {
1515 .name = "memory_pressure",
1516 .read_u64 = cpuset_read_u64,
1517 .write_u64 = cpuset_write_u64,
1518 .private = FILE_MEMORY_PRESSURE,
1519 },
1520
1521 {
1522 .name = "memory_spread_page",
1523 .read_u64 = cpuset_read_u64,
1524 .write_u64 = cpuset_write_u64,
1525 .private = FILE_SPREAD_PAGE,
1526 },
1527
1528 {
1529 .name = "memory_spread_slab",
1530 .read_u64 = cpuset_read_u64,
1531 .write_u64 = cpuset_write_u64,
1532 .private = FILE_SPREAD_SLAB,
1533 },
1486}; 1534};
1487 1535
1488static struct cftype cft_memory_pressure_enabled = { 1536static struct cftype cft_memory_pressure_enabled = {
1489 .name = "memory_pressure_enabled", 1537 .name = "memory_pressure_enabled",
1490 .read = cpuset_common_file_read, 1538 .read_u64 = cpuset_read_u64,
1491 .write = cpuset_common_file_write, 1539 .write_u64 = cpuset_write_u64,
1492 .private = FILE_MEMORY_PRESSURE_ENABLED, 1540 .private = FILE_MEMORY_PRESSURE_ENABLED,
1493}; 1541};
1494 1542
1495static struct cftype cft_memory_pressure = {
1496 .name = "memory_pressure",
1497 .read = cpuset_common_file_read,
1498 .write = cpuset_common_file_write,
1499 .private = FILE_MEMORY_PRESSURE,
1500};
1501
1502static struct cftype cft_spread_page = {
1503 .name = "memory_spread_page",
1504 .read = cpuset_common_file_read,
1505 .write = cpuset_common_file_write,
1506 .private = FILE_SPREAD_PAGE,
1507};
1508
1509static struct cftype cft_spread_slab = {
1510 .name = "memory_spread_slab",
1511 .read = cpuset_common_file_read,
1512 .write = cpuset_common_file_write,
1513 .private = FILE_SPREAD_SLAB,
1514};
1515
1516static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) 1543static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1517{ 1544{
1518 int err; 1545 int err;
1519 1546
1520 if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0) 1547 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1521 return err; 1548 if (err)
1522 if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1523 return err;
1524 if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1525 return err;
1526 if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1527 return err;
1528 if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1529 return err;
1530 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1531 return err;
1532 if ((err = cgroup_add_file(cont, ss,
1533 &cft_sched_relax_domain_level)) < 0)
1534 return err;
1535 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1536 return err;
1537 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1538 return err;
1539 if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1540 return err; 1549 return err;
1541 /* memory_pressure_enabled is in root cpuset only */ 1550 /* memory_pressure_enabled is in root cpuset only */
1542 if (err == 0 && !cont->parent) 1551 if (!cont->parent)
1543 err = cgroup_add_file(cont, ss, 1552 err = cgroup_add_file(cont, ss,
1544 &cft_memory_pressure_enabled); 1553 &cft_memory_pressure_enabled);
1545 return 0; 1554 return err;
1546} 1555}
1547 1556
1548/* 1557/*
@@ -1642,7 +1651,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1642 cpuset_update_task_memory_state(); 1651 cpuset_update_task_memory_state();
1643 1652
1644 if (is_sched_load_balance(cs)) 1653 if (is_sched_load_balance(cs))
1645 update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); 1654 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1646 1655
1647 number_of_cpusets--; 1656 number_of_cpusets--;
1648 kfree(cs); 1657 kfree(cs);
@@ -1707,7 +1716,8 @@ int __init cpuset_init(void)
1707 * Called by cgroup_scan_tasks() for each task in a cgroup. 1716 * Called by cgroup_scan_tasks() for each task in a cgroup.
1708 * Return nonzero to stop the walk through the tasks. 1717 * Return nonzero to stop the walk through the tasks.
1709 */ 1718 */
1710void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) 1719static void cpuset_do_move_task(struct task_struct *tsk,
1720 struct cgroup_scanner *scan)
1711{ 1721{
1712 struct cpuset_hotplug_scanner *chsp; 1722 struct cpuset_hotplug_scanner *chsp;
1713 1723
@@ -1969,14 +1979,14 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1969} 1979}
1970 1980
1971/* 1981/*
1972 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 1982 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
1973 * ancestor to the specified cpuset. Call holding callback_mutex. 1983 * mem_hardwall ancestor to the specified cpuset. Call holding
1974 * If no ancestor is mem_exclusive (an unusual configuration), then 1984 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
1975 * returns the root cpuset. 1985 * (an unusual configuration), then returns the root cpuset.
1976 */ 1986 */
1977static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) 1987static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
1978{ 1988{
1979 while (!is_mem_exclusive(cs) && cs->parent) 1989 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
1980 cs = cs->parent; 1990 cs = cs->parent;
1981 return cs; 1991 return cs;
1982} 1992}
@@ -1990,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1990 * __GFP_THISNODE is set, yes, we can always allocate. If zone 2000 * __GFP_THISNODE is set, yes, we can always allocate. If zone
1991 * z's node is in our tasks mems_allowed, yes. If it's not a 2001 * z's node is in our tasks mems_allowed, yes. If it's not a
1992 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2002 * __GFP_HARDWALL request and this zone's nodes is in the nearest
1993 * mem_exclusive cpuset ancestor to this tasks cpuset, yes. 2003 * hardwalled cpuset ancestor to this tasks cpuset, yes.
1994 * If the task has been OOM killed and has access to memory reserves 2004 * If the task has been OOM killed and has access to memory reserves
1995 * as specified by the TIF_MEMDIE flag, yes. 2005 * as specified by the TIF_MEMDIE flag, yes.
1996 * Otherwise, no. 2006 * Otherwise, no.
@@ -2013,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2013 * and do not allow allocations outside the current tasks cpuset 2023 * and do not allow allocations outside the current tasks cpuset
2014 * unless the task has been OOM killed as is marked TIF_MEMDIE. 2024 * unless the task has been OOM killed as is marked TIF_MEMDIE.
2015 * GFP_KERNEL allocations are not so marked, so can escape to the 2025 * GFP_KERNEL allocations are not so marked, so can escape to the
2016 * nearest enclosing mem_exclusive ancestor cpuset. 2026 * nearest enclosing hardwalled ancestor cpuset.
2017 * 2027 *
2018 * Scanning up parent cpusets requires callback_mutex. The 2028 * Scanning up parent cpusets requires callback_mutex. The
2019 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2029 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2036,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2036 * in_interrupt - any node ok (current task context irrelevant) 2046 * in_interrupt - any node ok (current task context irrelevant)
2037 * GFP_ATOMIC - any node ok 2047 * GFP_ATOMIC - any node ok
2038 * TIF_MEMDIE - any node ok 2048 * TIF_MEMDIE - any node ok
2039 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2049 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2040 * GFP_USER - only nodes in current tasks mems allowed ok. 2050 * GFP_USER - only nodes in current tasks mems allowed ok.
2041 * 2051 *
2042 * Rule: 2052 * Rule:
@@ -2073,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2073 mutex_lock(&callback_mutex); 2083 mutex_lock(&callback_mutex);
2074 2084
2075 task_lock(current); 2085 task_lock(current);
2076 cs = nearest_exclusive_ancestor(task_cs(current)); 2086 cs = nearest_hardwall_ancestor(task_cs(current));
2077 task_unlock(current); 2087 task_unlock(current);
2078 2088
2079 allowed = node_isset(node, cs->mems_allowed); 2089 allowed = node_isset(node, cs->mems_allowed);
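
Besides the new mem_hardwall flag and the nearest_hardwall_ancestor() rename, the cpuset conversion above collapses ten per-file cftype singletons into one files[] array registered with a single cgroup_add_files() call in cpuset_populate(). A sketch of a hypothetical subsystem using the same array-based registration; the baz_* names are illustrative, not from this patch.

/* Hypothetical control file plus array-based registration. */
static u64 baz_enabled_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* placeholder: report the stored flag */
}

static int baz_enabled_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	return 0;	/* placeholder: accept and store !!val */
}

static struct cftype baz_files[] = {
	{
		.name = "enabled",
		.read_u64 = baz_enabled_read,
		.write_u64 = baz_enabled_write,
	},
};

static int baz_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, baz_files, ARRAY_SIZE(baz_files));
}
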
diff --git a/kernel/dma.c b/kernel/dma.c
index 6a82bb716dac..d2c60a822790 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = {
149 149
150static int __init proc_dma_init(void) 150static int __init proc_dma_init(void)
151{ 151{
152 struct proc_dir_entry *e; 152 proc_create("dma", 0, NULL, &proc_dma_operations);
153
154 e = create_proc_entry("dma", 0, NULL);
155 if (e)
156 e->proc_fops = &proc_dma_operations;
157
158 return 0; 153 return 0;
159} 154}
160 155
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a9d98c641ac..1510f78a0ffa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -19,6 +19,7 @@
19#include <linux/acct.h> 19#include <linux/acct.h>
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h>
22#include <linux/binfmts.h> 23#include <linux/binfmts.h>
23#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
24#include <linux/pid_namespace.h> 25#include <linux/pid_namespace.h>
@@ -52,6 +53,11 @@
52 53
53static void exit_mm(struct task_struct * tsk); 54static void exit_mm(struct task_struct * tsk);
54 55
56static inline int task_detached(struct task_struct *p)
57{
58 return p->exit_signal == -1;
59}
60
55static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p)
56{ 62{
57 nr_threads--; 63 nr_threads--;
@@ -160,7 +166,7 @@ repeat:
160 zap_leader = 0; 166 zap_leader = 0;
161 leader = p->group_leader; 167 leader = p->group_leader;
162 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 168 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
163 BUG_ON(leader->exit_signal == -1); 169 BUG_ON(task_detached(leader));
164 do_notify_parent(leader, leader->exit_signal); 170 do_notify_parent(leader, leader->exit_signal);
165 /* 171 /*
166 * If we were the last child thread and the leader has 172 * If we were the last child thread and the leader has
@@ -170,7 +176,7 @@ repeat:
170 * do_notify_parent() will have marked it self-reaping in 176 * do_notify_parent() will have marked it self-reaping in
171 * that case. 177 * that case.
172 */ 178 */
173 zap_leader = (leader->exit_signal == -1); 179 zap_leader = task_detached(leader);
174 } 180 }
175 181
176 write_unlock_irq(&tasklist_lock); 182 write_unlock_irq(&tasklist_lock);
@@ -329,13 +335,11 @@ void __set_special_pids(struct pid *pid)
329 pid_t nr = pid_nr(pid); 335 pid_t nr = pid_nr(pid);
330 336
331 if (task_session(curr) != pid) { 337 if (task_session(curr) != pid) {
332 detach_pid(curr, PIDTYPE_SID); 338 change_pid(curr, PIDTYPE_SID, pid);
333 attach_pid(curr, PIDTYPE_SID, pid);
334 set_task_session(curr, nr); 339 set_task_session(curr, nr);
335 } 340 }
336 if (task_pgrp(curr) != pid) { 341 if (task_pgrp(curr) != pid) {
337 detach_pid(curr, PIDTYPE_PGID); 342 change_pid(curr, PIDTYPE_PGID, pid);
338 attach_pid(curr, PIDTYPE_PGID, pid);
339 set_task_pgrp(curr, nr); 343 set_task_pgrp(curr, nr);
340 } 344 }
341} 345}
@@ -557,6 +561,88 @@ void exit_fs(struct task_struct *tsk)
557 561
558EXPORT_SYMBOL_GPL(exit_fs); 562EXPORT_SYMBOL_GPL(exit_fs);
559 563
564#ifdef CONFIG_MM_OWNER
565/*
566 * Task p is exiting and it owned mm, lets find a new owner for it
567 */
568static inline int
569mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
570{
571 /*
572 * If there are other users of the mm and the owner (us) is exiting
573 * we need to find a new owner to take on the responsibility.
574 */
575 if (!mm)
576 return 0;
577 if (atomic_read(&mm->mm_users) <= 1)
578 return 0;
579 if (mm->owner != p)
580 return 0;
581 return 1;
582}
583
584void mm_update_next_owner(struct mm_struct *mm)
585{
586 struct task_struct *c, *g, *p = current;
587
588retry:
589 if (!mm_need_new_owner(mm, p))
590 return;
591
592 read_lock(&tasklist_lock);
593 /*
594 * Search in the children
595 */
596 list_for_each_entry(c, &p->children, sibling) {
597 if (c->mm == mm)
598 goto assign_new_owner;
599 }
600
601 /*
602 * Search in the siblings
603 */
604 list_for_each_entry(c, &p->parent->children, sibling) {
605 if (c->mm == mm)
606 goto assign_new_owner;
607 }
608
609 /*
610 * Search through everything else. We should not get
611 * here often
612 */
613 do_each_thread(g, c) {
614 if (c->mm == mm)
615 goto assign_new_owner;
616 } while_each_thread(g, c);
617
618 read_unlock(&tasklist_lock);
619 return;
620
621assign_new_owner:
622 BUG_ON(c == p);
623 get_task_struct(c);
624 /*
625 * The task_lock protects c->mm from changing.
626 * We always want mm->owner->mm == mm
627 */
628 task_lock(c);
629 /*
630 * Delay read_unlock() till we have the task_lock()
631 * to ensure that c does not slip away underneath us
632 */
633 read_unlock(&tasklist_lock);
634 if (c->mm != mm) {
635 task_unlock(c);
636 put_task_struct(c);
637 goto retry;
638 }
639 cgroup_mm_owner_callbacks(mm->owner, c);
640 mm->owner = c;
641 task_unlock(c);
642 put_task_struct(c);
643}
644#endif /* CONFIG_MM_OWNER */
645
560/* 646/*
561 * Turn us into a lazy TLB process if we 647 * Turn us into a lazy TLB process if we
562 * aren't already.. 648 * aren't already..
@@ -596,6 +682,7 @@ static void exit_mm(struct task_struct * tsk)
596 /* We don't want this task to be frozen prematurely */ 682 /* We don't want this task to be frozen prematurely */
597 clear_freeze_flag(tsk); 683 clear_freeze_flag(tsk);
598 task_unlock(tsk); 684 task_unlock(tsk);
685 mm_update_next_owner(mm);
599 mmput(mm); 686 mmput(mm);
600} 687}
601 688
@@ -610,7 +697,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
610 if (unlikely(traced)) { 697 if (unlikely(traced)) {
611 /* Preserve ptrace links if someone else is tracing this child. */ 698 /* Preserve ptrace links if someone else is tracing this child. */
612 list_del_init(&p->ptrace_list); 699 list_del_init(&p->ptrace_list);
613 if (p->parent != p->real_parent) 700 if (ptrace_reparented(p))
614 list_add(&p->ptrace_list, &p->real_parent->ptrace_children); 701 list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
615 } else { 702 } else {
616 /* If this child is being traced, then we're the one tracing it 703 /* If this child is being traced, then we're the one tracing it
@@ -634,18 +721,18 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
634 /* If this is a threaded reparent there is no need to 721 /* If this is a threaded reparent there is no need to
635 * notify anyone anything has happened. 722 * notify anyone anything has happened.
636 */ 723 */
637 if (p->real_parent->group_leader == father->group_leader) 724 if (same_thread_group(p->real_parent, father))
638 return; 725 return;
639 726
640 /* We don't want people slaying init. */ 727 /* We don't want people slaying init. */
641 if (p->exit_signal != -1) 728 if (!task_detached(p))
642 p->exit_signal = SIGCHLD; 729 p->exit_signal = SIGCHLD;
643 730
644 /* If we'd notified the old parent about this child's death, 731 /* If we'd notified the old parent about this child's death,
645 * also notify the new parent. 732 * also notify the new parent.
646 */ 733 */
647 if (!traced && p->exit_state == EXIT_ZOMBIE && 734 if (!traced && p->exit_state == EXIT_ZOMBIE &&
648 p->exit_signal != -1 && thread_group_empty(p)) 735 !task_detached(p) && thread_group_empty(p))
649 do_notify_parent(p, p->exit_signal); 736 do_notify_parent(p, p->exit_signal);
650 737
651 kill_orphaned_pgrp(p, father); 738 kill_orphaned_pgrp(p, father);
@@ -698,18 +785,18 @@ static void forget_original_parent(struct task_struct *father)
698 } else { 785 } else {
699 /* reparent ptraced task to its real parent */ 786 /* reparent ptraced task to its real parent */
700 __ptrace_unlink (p); 787 __ptrace_unlink (p);
701 if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && 788 if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
702 thread_group_empty(p)) 789 thread_group_empty(p))
703 do_notify_parent(p, p->exit_signal); 790 do_notify_parent(p, p->exit_signal);
704 } 791 }
705 792
706 /* 793 /*
707 * if the ptraced child is a zombie with exit_signal == -1 794 * if the ptraced child is a detached zombie we must collect
708 * we must collect it before we exit, or it will remain 795 * it before we exit, or it will remain zombie forever since
709 * zombie forever since we prevented it from self-reap itself 796 * we prevented it from self-reap itself while it was being
710 * while it was being traced by us, to be able to see it in wait4. 797 * traced by us, to be able to see it in wait4.
711 */ 798 */
712 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) 799 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
713 list_add(&p->ptrace_list, &ptrace_dead); 800 list_add(&p->ptrace_list, &ptrace_dead);
714 } 801 }
715 802
@@ -766,29 +853,30 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
766 * we have changed execution domain as these two values started 853 * we have changed execution domain as these two values started
767 * the same after a fork. 854 * the same after a fork.
768 */ 855 */
769 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && 856 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
770 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 857 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
771 tsk->self_exec_id != tsk->parent_exec_id) 858 tsk->self_exec_id != tsk->parent_exec_id) &&
772 && !capable(CAP_KILL)) 859 !capable(CAP_KILL))
773 tsk->exit_signal = SIGCHLD; 860 tsk->exit_signal = SIGCHLD;
774 861
775
776 /* If something other than our normal parent is ptracing us, then 862 /* If something other than our normal parent is ptracing us, then
777 * send it a SIGCHLD instead of honoring exit_signal. exit_signal 863 * send it a SIGCHLD instead of honoring exit_signal. exit_signal
778 * only has special meaning to our real parent. 864 * only has special meaning to our real parent.
779 */ 865 */
780 if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { 866 if (!task_detached(tsk) && thread_group_empty(tsk)) {
781 int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; 867 int signal = ptrace_reparented(tsk) ?
868 SIGCHLD : tsk->exit_signal;
782 do_notify_parent(tsk, signal); 869 do_notify_parent(tsk, signal);
783 } else if (tsk->ptrace) { 870 } else if (tsk->ptrace) {
784 do_notify_parent(tsk, SIGCHLD); 871 do_notify_parent(tsk, SIGCHLD);
785 } 872 }
786 873
787 state = EXIT_ZOMBIE; 874 state = EXIT_ZOMBIE;
788 if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) 875 if (task_detached(tsk) && likely(!tsk->ptrace))
789 state = EXIT_DEAD; 876 state = EXIT_DEAD;
790 tsk->exit_state = state; 877 tsk->exit_state = state;
791 878
879 /* mt-exec, de_thread() is waiting for us */
792 if (thread_group_leader(tsk) && 880 if (thread_group_leader(tsk) &&
793 tsk->signal->notify_count < 0 && 881 tsk->signal->notify_count < 0 &&
794 tsk->signal->group_exit_task) 882 tsk->signal->group_exit_task)
@@ -1032,12 +1120,13 @@ asmlinkage long sys_exit(int error_code)
1032NORET_TYPE void 1120NORET_TYPE void
1033do_group_exit(int exit_code) 1121do_group_exit(int exit_code)
1034{ 1122{
1123 struct signal_struct *sig = current->signal;
1124
1035 BUG_ON(exit_code & 0x80); /* core dumps don't get here */ 1125 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1036 1126
1037 if (current->signal->flags & SIGNAL_GROUP_EXIT) 1127 if (signal_group_exit(sig))
1038 exit_code = current->signal->group_exit_code; 1128 exit_code = sig->group_exit_code;
1039 else if (!thread_group_empty(current)) { 1129 else if (!thread_group_empty(current)) {
1040 struct signal_struct *const sig = current->signal;
1041 struct sighand_struct *const sighand = current->sighand; 1130 struct sighand_struct *const sighand = current->sighand;
1042 spin_lock_irq(&sighand->siglock); 1131 spin_lock_irq(&sighand->siglock);
1043 if (signal_group_exit(sig)) 1132 if (signal_group_exit(sig))
@@ -1089,7 +1178,7 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1089 * Do not consider detached threads that are 1178 * Do not consider detached threads that are
1090 * not ptraced: 1179 * not ptraced:
1091 */ 1180 */
1092 if (p->exit_signal == -1 && !p->ptrace) 1181 if (task_detached(p) && !p->ptrace)
1093 return 0; 1182 return 0;
1094 1183
1095 /* Wait for all children (clone and not) if __WALL is set; 1184 /* Wait for all children (clone and not) if __WALL is set;
@@ -1179,8 +1268,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1179 return 0; 1268 return 0;
1180 } 1269 }
1181 1270
1182 /* traced means p->ptrace, but not vice versa */ 1271 traced = ptrace_reparented(p);
1183 traced = (p->real_parent != p->parent);
1184 1272
1185 if (likely(!traced)) { 1273 if (likely(!traced)) {
1186 struct signal_struct *psig; 1274 struct signal_struct *psig;
@@ -1281,9 +1369,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1281 * If it's still not detached after that, don't release 1369 * If it's still not detached after that, don't release
1282 * it now. 1370 * it now.
1283 */ 1371 */
1284 if (p->exit_signal != -1) { 1372 if (!task_detached(p)) {
1285 do_notify_parent(p, p->exit_signal); 1373 do_notify_parent(p, p->exit_signal);
1286 if (p->exit_signal != -1) { 1374 if (!task_detached(p)) {
1287 p->exit_state = EXIT_ZOMBIE; 1375 p->exit_state = EXIT_ZOMBIE;
1288 p = NULL; 1376 p = NULL;
1289 } 1377 }
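The exit.c hunks above swap open-coded tests for small predicate helpers. Judging from the expressions they replace on the old side of the diff, the helpers amount to roughly the following; a minimal sketch of their intent, not the exact header definitions (same_thread_group() and signal_group_exit() are analogous wrappers over the group-leader and SIGNAL_GROUP_EXIT checks they supersede):

    static inline int task_detached(struct task_struct *p)
    {
            /* exit_signal == -1 means "never notify the parent" */
            return p->exit_signal == -1;
    }

    static inline int ptrace_reparented(struct task_struct *p)
    {
            /* traced by someone other than the real parent */
            return p->real_parent != p->parent;
    }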
diff --git a/kernel/fork.c b/kernel/fork.c
index 6067e429f281..933e60ebccae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -22,6 +22,7 @@
22#include <linux/mempolicy.h> 22#include <linux/mempolicy.h>
23#include <linux/sem.h> 23#include <linux/sem.h>
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/fdtable.h>
25#include <linux/key.h> 26#include <linux/key.h>
26#include <linux/binfmts.h> 27#include <linux/binfmts.h>
27#include <linux/mman.h> 28#include <linux/mman.h>
@@ -381,14 +382,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
381 mm->ioctx_list = NULL; 382 mm->ioctx_list = NULL;
382 mm->free_area_cache = TASK_UNMAPPED_BASE; 383 mm->free_area_cache = TASK_UNMAPPED_BASE;
383 mm->cached_hole_size = ~0UL; 384 mm->cached_hole_size = ~0UL;
384 mm_init_cgroup(mm, p); 385 mm_init_owner(mm, p);
385 386
386 if (likely(!mm_alloc_pgd(mm))) { 387 if (likely(!mm_alloc_pgd(mm))) {
387 mm->def_flags = 0; 388 mm->def_flags = 0;
388 return mm; 389 return mm;
389 } 390 }
390 391
391 mm_free_cgroup(mm);
392 free_mm(mm); 392 free_mm(mm);
393 return NULL; 393 return NULL;
394} 394}
@@ -432,13 +432,13 @@ void mmput(struct mm_struct *mm)
432 if (atomic_dec_and_test(&mm->mm_users)) { 432 if (atomic_dec_and_test(&mm->mm_users)) {
433 exit_aio(mm); 433 exit_aio(mm);
434 exit_mmap(mm); 434 exit_mmap(mm);
435 set_mm_exe_file(mm, NULL);
435 if (!list_empty(&mm->mmlist)) { 436 if (!list_empty(&mm->mmlist)) {
436 spin_lock(&mmlist_lock); 437 spin_lock(&mmlist_lock);
437 list_del(&mm->mmlist); 438 list_del(&mm->mmlist);
438 spin_unlock(&mmlist_lock); 439 spin_unlock(&mmlist_lock);
439 } 440 }
440 put_swap_token(mm); 441 put_swap_token(mm);
441 mm_free_cgroup(mm);
442 mmdrop(mm); 442 mmdrop(mm);
443 } 443 }
444} 444}
@@ -545,6 +545,8 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
545 if (init_new_context(tsk, mm)) 545 if (init_new_context(tsk, mm))
546 goto fail_nocontext; 546 goto fail_nocontext;
547 547
548 dup_mm_exe_file(oldmm, mm);
549
548 err = dup_mmap(mm, oldmm); 550 err = dup_mmap(mm, oldmm);
549 if (err) 551 if (err)
550 goto free_pt; 552 goto free_pt;
@@ -891,7 +893,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
891 sig->group_exit_code = 0; 893 sig->group_exit_code = 0;
892 sig->group_exit_task = NULL; 894 sig->group_exit_task = NULL;
893 sig->group_stop_count = 0; 895 sig->group_stop_count = 0;
894 sig->curr_target = NULL; 896 sig->curr_target = tsk;
895 init_sigpending(&sig->shared_pending); 897 init_sigpending(&sig->shared_pending);
896 INIT_LIST_HEAD(&sig->posix_timers); 898 INIT_LIST_HEAD(&sig->posix_timers);
897 899
@@ -982,6 +984,13 @@ static void rt_mutex_init_task(struct task_struct *p)
982#endif 984#endif
983} 985}
984 986
987#ifdef CONFIG_MM_OWNER
988void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
989{
990 mm->owner = p;
991}
992#endif /* CONFIG_MM_OWNER */
993
985/* 994/*
986 * This creates a new process as a copy of the old one, 995 * This creates a new process as a copy of the old one,
987 * but does not actually start it yet. 996 * but does not actually start it yet.
@@ -1664,18 +1673,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1664} 1673}
1665 1674
1666/* 1675/*
1667 * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
1668 * supported yet
1669 */
1670static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
1671{
1672 if (unshare_flags & CLONE_SYSVSEM)
1673 return -EINVAL;
1674
1675 return 0;
1676}
1677
1678/*
1679 * unshare allows a process to 'unshare' part of the process 1676 * unshare allows a process to 'unshare' part of the process
1680 * context which was originally shared using clone. copy_* 1677 * context which was originally shared using clone. copy_*
1681 * functions used by do_fork() cannot be used here directly 1678 * functions used by do_fork() cannot be used here directly
@@ -1690,8 +1687,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1690 struct sighand_struct *new_sigh = NULL; 1687 struct sighand_struct *new_sigh = NULL;
1691 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1688 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1692 struct files_struct *fd, *new_fd = NULL; 1689 struct files_struct *fd, *new_fd = NULL;
1693 struct sem_undo_list *new_ulist = NULL;
1694 struct nsproxy *new_nsproxy = NULL; 1690 struct nsproxy *new_nsproxy = NULL;
1691 int do_sysvsem = 0;
1695 1692
1696 check_unshare_flags(&unshare_flags); 1693 check_unshare_flags(&unshare_flags);
1697 1694
@@ -1703,6 +1700,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1703 CLONE_NEWNET)) 1700 CLONE_NEWNET))
1704 goto bad_unshare_out; 1701 goto bad_unshare_out;
1705 1702
1703 /*
1704 * CLONE_NEWIPC must also detach from the undolist: after switching
1705 * to a new ipc namespace, the semaphore arrays from the old
1706 * namespace are unreachable.
1707 */
1708 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1709 do_sysvsem = 1;
1706 if ((err = unshare_thread(unshare_flags))) 1710 if ((err = unshare_thread(unshare_flags)))
1707 goto bad_unshare_out; 1711 goto bad_unshare_out;
1708 if ((err = unshare_fs(unshare_flags, &new_fs))) 1712 if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1713,13 +1717,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1713 goto bad_unshare_cleanup_sigh; 1717 goto bad_unshare_cleanup_sigh;
1714 if ((err = unshare_fd(unshare_flags, &new_fd))) 1718 if ((err = unshare_fd(unshare_flags, &new_fd)))
1715 goto bad_unshare_cleanup_vm; 1719 goto bad_unshare_cleanup_vm;
1716 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1717 goto bad_unshare_cleanup_fd;
1718 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1720 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1719 new_fs))) 1721 new_fs)))
1720 goto bad_unshare_cleanup_semundo; 1722 goto bad_unshare_cleanup_fd;
1721 1723
1722 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1724 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
1725 if (do_sysvsem) {
1726 /*
1727 * CLONE_SYSVSEM is equivalent to sys_exit().
1728 */
1729 exit_sem(current);
1730 }
1723 1731
1724 if (new_nsproxy) { 1732 if (new_nsproxy) {
1725 switch_task_namespaces(current, new_nsproxy); 1733 switch_task_namespaces(current, new_nsproxy);
@@ -1755,7 +1763,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1755 if (new_nsproxy) 1763 if (new_nsproxy)
1756 put_nsproxy(new_nsproxy); 1764 put_nsproxy(new_nsproxy);
1757 1765
1758bad_unshare_cleanup_semundo:
1759bad_unshare_cleanup_fd: 1766bad_unshare_cleanup_fd:
1760 if (new_fd) 1767 if (new_fd)
1761 put_files_struct(new_fd); 1768 put_files_struct(new_fd);
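With unshare_semundo() gone, sys_unshare() accepts CLONE_SYSVSEM and treats it (and, per the new comment, CLONE_NEWIPC) as a request to drop the caller's SysV semaphore undo list via exit_sem(). A minimal userspace check of the flag; on kernels without this change the same call fails with EINVAL, which is what the removed unshare_semundo() returned:

    /* gcc -o unshare_sysvsem unshare_sysvsem.c */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            /* Detach from the SysV semaphore undo list, as if this process
             * had already exited as far as SEM_UNDO bookkeeping goes. */
            if (unshare(CLONE_SYSVSEM) != 0) {
                    perror("unshare(CLONE_SYSVSEM)");  /* EINVAL before this change */
                    return EXIT_FAILURE;
            }
            puts("SysV semaphore undo list detached");
            return EXIT_SUCCESS;
    }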
diff --git a/kernel/futex.c b/kernel/futex.c
index e43945e995f5..449def8074fe 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -104,10 +104,6 @@ struct futex_q {
104 /* Key which the futex is hashed on: */ 104 /* Key which the futex is hashed on: */
105 union futex_key key; 105 union futex_key key;
106 106
107 /* For fd, sigio sent using these: */
108 int fd;
109 struct file *filp;
110
111 /* Optional priority inheritance state: */ 107 /* Optional priority inheritance state: */
112 struct futex_pi_state *pi_state; 108 struct futex_pi_state *pi_state;
113 struct task_struct *task; 109 struct task_struct *task;
@@ -126,9 +122,6 @@ struct futex_hash_bucket {
126 122
127static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
128 124
129/* Futex-fs vfsmount entry: */
130static struct vfsmount *futex_mnt;
131
132/* 125/*
133 * Take mm->mmap_sem, when futex is shared 126 * Take mm->mmap_sem, when futex is shared
134 */ 127 */
@@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
610static void wake_futex(struct futex_q *q) 603static void wake_futex(struct futex_q *q)
611{ 604{
612 plist_del(&q->list, &q->list.plist); 605 plist_del(&q->list, &q->list.plist);
613 if (q->filp)
614 send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
615 /* 606 /*
616 * The lock in wake_up_all() is a crucial memory barrier after the 607 * The lock in wake_up_all() is a crucial memory barrier after the
617 * plist_del() and also before assigning to q->lock_ptr. 608 * plist_del() and also before assigning to q->lock_ptr.
@@ -988,14 +979,10 @@ out:
988} 979}
989 980
990/* The key must be already stored in q->key. */ 981/* The key must be already stored in q->key. */
991static inline struct futex_hash_bucket * 982static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
992queue_lock(struct futex_q *q, int fd, struct file *filp)
993{ 983{
994 struct futex_hash_bucket *hb; 984 struct futex_hash_bucket *hb;
995 985
996 q->fd = fd;
997 q->filp = filp;
998
999 init_waitqueue_head(&q->waiters); 986 init_waitqueue_head(&q->waiters);
1000 987
1001 get_futex_key_refs(&q->key); 988 get_futex_key_refs(&q->key);
@@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
1006 return hb; 993 return hb;
1007} 994}
1008 995
1009static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 996static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1010{ 997{
1011 int prio; 998 int prio;
1012 999
@@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1041 * exactly once. They are called with the hashed spinlock held. 1028 * exactly once. They are called with the hashed spinlock held.
1042 */ 1029 */
1043 1030
1044/* The key must be already stored in q->key. */
1045static void queue_me(struct futex_q *q, int fd, struct file *filp)
1046{
1047 struct futex_hash_bucket *hb;
1048
1049 hb = queue_lock(q, fd, filp);
1050 __queue_me(q, hb);
1051}
1052
1053/* Return 1 if we were still queued (ie. 0 means we were woken) */ 1031/* Return 1 if we were still queued (ie. 0 means we were woken) */
1054static int unqueue_me(struct futex_q *q) 1032static int unqueue_me(struct futex_q *q)
1055{ 1033{
@@ -1194,7 +1172,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1194 if (unlikely(ret != 0)) 1172 if (unlikely(ret != 0))
1195 goto out_release_sem; 1173 goto out_release_sem;
1196 1174
1197 hb = queue_lock(&q, -1, NULL); 1175 hb = queue_lock(&q);
1198 1176
1199 /* 1177 /*
1200 * Access the page AFTER the futex is queued. 1178 * Access the page AFTER the futex is queued.
@@ -1238,7 +1216,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1238 goto out_unlock_release_sem; 1216 goto out_unlock_release_sem;
1239 1217
1240 /* Only actually queue if *uaddr contained val. */ 1218 /* Only actually queue if *uaddr contained val. */
1241 __queue_me(&q, hb); 1219 queue_me(&q, hb);
1242 1220
1243 /* 1221 /*
1244 * Now the futex is queued and we have checked the data, we 1222 * Now the futex is queued and we have checked the data, we
@@ -1266,11 +1244,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1266 if (!abs_time) 1244 if (!abs_time)
1267 schedule(); 1245 schedule();
1268 else { 1246 else {
1269 hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1247 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
1248 HRTIMER_MODE_ABS);
1270 hrtimer_init_sleeper(&t, current); 1249 hrtimer_init_sleeper(&t, current);
1271 t.timer.expires = *abs_time; 1250 t.timer.expires = *abs_time;
1272 1251
1273 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); 1252 hrtimer_start(&t.timer, t.timer.expires,
1253 HRTIMER_MODE_ABS);
1274 if (!hrtimer_active(&t.timer)) 1254 if (!hrtimer_active(&t.timer))
1275 t.task = NULL; 1255 t.task = NULL;
1276 1256
@@ -1286,6 +1266,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1286 1266
1287 /* Flag if a timeout occured */ 1267 /* Flag if a timeout occured */
1288 rem = (t.task == NULL); 1268 rem = (t.task == NULL);
1269
1270 destroy_hrtimer_on_stack(&t.timer);
1289 } 1271 }
1290 } 1272 }
1291 __set_current_state(TASK_RUNNING); 1273 __set_current_state(TASK_RUNNING);
@@ -1367,7 +1349,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1367 1349
1368 if (time) { 1350 if (time) {
1369 to = &timeout; 1351 to = &timeout;
1370 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1352 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1353 HRTIMER_MODE_ABS);
1371 hrtimer_init_sleeper(to, current); 1354 hrtimer_init_sleeper(to, current);
1372 to->timer.expires = *time; 1355 to->timer.expires = *time;
1373 } 1356 }
@@ -1381,7 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1381 goto out_release_sem; 1364 goto out_release_sem;
1382 1365
1383 retry_unlocked: 1366 retry_unlocked:
1384 hb = queue_lock(&q, -1, NULL); 1367 hb = queue_lock(&q);
1385 1368
1386 retry_locked: 1369 retry_locked:
1387 ret = lock_taken = 0; 1370 ret = lock_taken = 0;
@@ -1494,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1494 /* 1477 /*
1495 * Only actually queue now that the atomic ops are done: 1478 * Only actually queue now that the atomic ops are done:
1496 */ 1479 */
1497 __queue_me(&q, hb); 1480 queue_me(&q, hb);
1498 1481
1499 /* 1482 /*
1500 * Now the futex is queued and we have checked the data, we 1483 * Now the futex is queued and we have checked the data, we
@@ -1581,6 +1564,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1581 unqueue_me_pi(&q); 1564 unqueue_me_pi(&q);
1582 futex_unlock_mm(fshared); 1565 futex_unlock_mm(fshared);
1583 1566
1567 if (to)
1568 destroy_hrtimer_on_stack(&to->timer);
1584 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1569 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1585 1570
1586 out_unlock_release_sem: 1571 out_unlock_release_sem:
@@ -1588,6 +1573,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1588 1573
1589 out_release_sem: 1574 out_release_sem:
1590 futex_unlock_mm(fshared); 1575 futex_unlock_mm(fshared);
1576 if (to)
1577 destroy_hrtimer_on_stack(&to->timer);
1591 return ret; 1578 return ret;
1592 1579
1593 uaddr_faulted: 1580 uaddr_faulted:
@@ -1615,6 +1602,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1615 if (!ret && (uval != -EFAULT)) 1602 if (!ret && (uval != -EFAULT))
1616 goto retry; 1603 goto retry;
1617 1604
1605 if (to)
1606 destroy_hrtimer_on_stack(&to->timer);
1618 return ret; 1607 return ret;
1619} 1608}
1620 1609
@@ -1735,121 +1724,6 @@ pi_faulted:
1735 return ret; 1724 return ret;
1736} 1725}
1737 1726
1738static int futex_close(struct inode *inode, struct file *filp)
1739{
1740 struct futex_q *q = filp->private_data;
1741
1742 unqueue_me(q);
1743 kfree(q);
1744
1745 return 0;
1746}
1747
1748/* This is one-shot: once it's gone off you need a new fd */
1749static unsigned int futex_poll(struct file *filp,
1750 struct poll_table_struct *wait)
1751{
1752 struct futex_q *q = filp->private_data;
1753 int ret = 0;
1754
1755 poll_wait(filp, &q->waiters, wait);
1756
1757 /*
1758 * plist_node_empty() is safe here without any lock.
1759 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
1760 */
1761 if (plist_node_empty(&q->list))
1762 ret = POLLIN | POLLRDNORM;
1763
1764 return ret;
1765}
1766
1767static const struct file_operations futex_fops = {
1768 .release = futex_close,
1769 .poll = futex_poll,
1770};
1771
1772/*
1773 * Signal allows caller to avoid the race which would occur if they
1774 * set the sigio stuff up afterwards.
1775 */
1776static int futex_fd(u32 __user *uaddr, int signal)
1777{
1778 struct futex_q *q;
1779 struct file *filp;
1780 int ret, err;
1781 struct rw_semaphore *fshared;
1782 static unsigned long printk_interval;
1783
1784 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1785 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1786 "will be removed from the kernel in June 2007\n",
1787 current->comm);
1788 }
1789
1790 ret = -EINVAL;
1791 if (!valid_signal(signal))
1792 goto out;
1793
1794 ret = get_unused_fd();
1795 if (ret < 0)
1796 goto out;
1797 filp = get_empty_filp();
1798 if (!filp) {
1799 put_unused_fd(ret);
1800 ret = -ENFILE;
1801 goto out;
1802 }
1803 filp->f_op = &futex_fops;
1804 filp->f_path.mnt = mntget(futex_mnt);
1805 filp->f_path.dentry = dget(futex_mnt->mnt_root);
1806 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1807
1808 if (signal) {
1809 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
1810 if (err < 0) {
1811 goto error;
1812 }
1813 filp->f_owner.signum = signal;
1814 }
1815
1816 q = kmalloc(sizeof(*q), GFP_KERNEL);
1817 if (!q) {
1818 err = -ENOMEM;
1819 goto error;
1820 }
1821 q->pi_state = NULL;
1822
1823 fshared = &current->mm->mmap_sem;
1824 down_read(fshared);
1825 err = get_futex_key(uaddr, fshared, &q->key);
1826
1827 if (unlikely(err != 0)) {
1828 up_read(fshared);
1829 kfree(q);
1830 goto error;
1831 }
1832
1833 /*
1834 * queue_me() must be called before releasing mmap_sem, because
1835 * key->shared.inode needs to be referenced while holding it.
1836 */
1837 filp->private_data = q;
1838
1839 queue_me(q, ret, filp);
1840 up_read(fshared);
1841
1842 /* Now we map fd to filp, so userspace can access it */
1843 fd_install(ret, filp);
1844out:
1845 return ret;
1846error:
1847 put_unused_fd(ret);
1848 put_filp(filp);
1849 ret = err;
1850 goto out;
1851}
1852
1853/* 1727/*
1854 * Support for robust futexes: the kernel cleans up held futexes at 1728 * Support for robust futexes: the kernel cleans up held futexes at
1855 * thread exit time. 1729 * thread exit time.
@@ -2081,10 +1955,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2081 case FUTEX_WAKE_BITSET: 1955 case FUTEX_WAKE_BITSET:
2082 ret = futex_wake(uaddr, fshared, val, val3); 1956 ret = futex_wake(uaddr, fshared, val, val3);
2083 break; 1957 break;
2084 case FUTEX_FD:
2085 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
2086 ret = futex_fd(uaddr, val);
2087 break;
2088 case FUTEX_REQUEUE: 1958 case FUTEX_REQUEUE:
2089 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 1959 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
2090 break; 1960 break;
@@ -2145,19 +2015,6 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2145 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2015 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2146} 2016}
2147 2017
2148static int futexfs_get_sb(struct file_system_type *fs_type,
2149 int flags, const char *dev_name, void *data,
2150 struct vfsmount *mnt)
2151{
2152 return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
2153}
2154
2155static struct file_system_type futex_fs_type = {
2156 .name = "futexfs",
2157 .get_sb = futexfs_get_sb,
2158 .kill_sb = kill_anon_super,
2159};
2160
2161static int __init futex_init(void) 2018static int __init futex_init(void)
2162{ 2019{
2163 u32 curval; 2020 u32 curval;
@@ -2182,16 +2039,6 @@ static int __init futex_init(void)
2182 spin_lock_init(&futex_queues[i].lock); 2039 spin_lock_init(&futex_queues[i].lock);
2183 } 2040 }
2184 2041
2185 i = register_filesystem(&futex_fs_type);
2186 if (i)
2187 return i;
2188
2189 futex_mnt = kern_mount(&futex_fs_type);
2190 if (IS_ERR(futex_mnt)) {
2191 unregister_filesystem(&futex_fs_type);
2192 return PTR_ERR(futex_mnt);
2193 }
2194
2195 return 0; 2042 return 0;
2196} 2043}
2197__initcall(futex_init); 2044__initcall(futex_init);
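This drops the FUTEX_FD multiplexer op together with futexfs, the futex file_operations and the fd/filp plumbing in struct futex_q; queue_lock() and queue_me() lose their fd arguments accordingly. What remains is the plain futex() system call, used roughly like this from userspace (futex() has no glibc wrapper, hence syscall()):

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Sleep only if *addr still equals 'expected'; returns when woken,
     * or immediately with EWOULDBLOCK if the value already changed. */
    static long fwait(int *addr, int expected)
    {
            return syscall(SYS_futex, addr, FUTEX_WAIT, expected, NULL, NULL, 0);
    }

    /* Wake at most 'nr' tasks blocked on addr. */
    static long fwake(int *addr, int nr)
    {
            return syscall(SYS_futex, addr, FUTEX_WAKE, nr, NULL, NULL, 0);
    }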
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e379ef0e9c20..421be5fe5cc7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,7 @@
43#include <linux/tick.h> 43#include <linux/tick.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45#include <linux/err.h> 45#include <linux/err.h>
46#include <linux/debugobjects.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
@@ -153,15 +154,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
153} 154}
154 155
155/* 156/*
156 * Helper function to check, whether the timer is running the callback
157 * function
158 */
159static inline int hrtimer_callback_running(struct hrtimer *timer)
160{
161 return timer->state & HRTIMER_STATE_CALLBACK;
162}
163
164/*
165 * Functions and macros which are different for UP/SMP systems are kept in a 157 * Functions and macros which are different for UP/SMP systems are kept in a
166 * single place 158 * single place
167 */ 159 */
@@ -342,6 +334,115 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
342 return res; 334 return res;
343} 335}
344 336
337#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
338
339static struct debug_obj_descr hrtimer_debug_descr;
340
341/*
342 * fixup_init is called when:
343 * - an active object is initialized
344 */
345static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
346{
347 struct hrtimer *timer = addr;
348
349 switch (state) {
350 case ODEBUG_STATE_ACTIVE:
351 hrtimer_cancel(timer);
352 debug_object_init(timer, &hrtimer_debug_descr);
353 return 1;
354 default:
355 return 0;
356 }
357}
358
359/*
360 * fixup_activate is called when:
361 * - an active object is activated
362 * - an unknown object is activated (might be a statically initialized object)
363 */
364static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
365{
366 switch (state) {
367
368 case ODEBUG_STATE_NOTAVAILABLE:
369 WARN_ON_ONCE(1);
370 return 0;
371
372 case ODEBUG_STATE_ACTIVE:
373 WARN_ON(1);
374
375 default:
376 return 0;
377 }
378}
379
380/*
381 * fixup_free is called when:
382 * - an active object is freed
383 */
384static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
385{
386 struct hrtimer *timer = addr;
387
388 switch (state) {
389 case ODEBUG_STATE_ACTIVE:
390 hrtimer_cancel(timer);
391 debug_object_free(timer, &hrtimer_debug_descr);
392 return 1;
393 default:
394 return 0;
395 }
396}
397
398static struct debug_obj_descr hrtimer_debug_descr = {
399 .name = "hrtimer",
400 .fixup_init = hrtimer_fixup_init,
401 .fixup_activate = hrtimer_fixup_activate,
402 .fixup_free = hrtimer_fixup_free,
403};
404
405static inline void debug_hrtimer_init(struct hrtimer *timer)
406{
407 debug_object_init(timer, &hrtimer_debug_descr);
408}
409
410static inline void debug_hrtimer_activate(struct hrtimer *timer)
411{
412 debug_object_activate(timer, &hrtimer_debug_descr);
413}
414
415static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
416{
417 debug_object_deactivate(timer, &hrtimer_debug_descr);
418}
419
420static inline void debug_hrtimer_free(struct hrtimer *timer)
421{
422 debug_object_free(timer, &hrtimer_debug_descr);
423}
424
425static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
426 enum hrtimer_mode mode);
427
428void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
429 enum hrtimer_mode mode)
430{
431 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
432 __hrtimer_init(timer, clock_id, mode);
433}
434
435void destroy_hrtimer_on_stack(struct hrtimer *timer)
436{
437 debug_object_free(timer, &hrtimer_debug_descr);
438}
439
440#else
441static inline void debug_hrtimer_init(struct hrtimer *timer) { }
442static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
443static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
444#endif
445
345/* 446/*
346 * Check, whether the timer is on the callback pending list 447 * Check, whether the timer is on the callback pending list
347 */ 448 */
@@ -567,6 +668,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
567 /* Timer is expired, act upon the callback mode */ 668 /* Timer is expired, act upon the callback mode */
568 switch(timer->cb_mode) { 669 switch(timer->cb_mode) {
569 case HRTIMER_CB_IRQSAFE_NO_RESTART: 670 case HRTIMER_CB_IRQSAFE_NO_RESTART:
671 debug_hrtimer_deactivate(timer);
570 /* 672 /*
571 * We can call the callback from here. No restart 673 * We can call the callback from here. No restart
572 * happens, so no danger of recursion 674 * happens, so no danger of recursion
@@ -581,6 +683,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
581 * the tick timer in the softirq ! The calling site 683 * the tick timer in the softirq ! The calling site
582 * takes care of this. 684 * takes care of this.
583 */ 685 */
686 debug_hrtimer_deactivate(timer);
584 return 1; 687 return 1;
585 case HRTIMER_CB_IRQSAFE: 688 case HRTIMER_CB_IRQSAFE:
586 case HRTIMER_CB_SOFTIRQ: 689 case HRTIMER_CB_SOFTIRQ:
@@ -590,7 +693,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
590 list_add_tail(&timer->cb_entry, 693 list_add_tail(&timer->cb_entry,
591 &base->cpu_base->cb_pending); 694 &base->cpu_base->cb_pending);
592 timer->state = HRTIMER_STATE_PENDING; 695 timer->state = HRTIMER_STATE_PENDING;
593 raise_softirq(HRTIMER_SOFTIRQ);
594 return 1; 696 return 1;
595 default: 697 default:
596 BUG(); 698 BUG();
@@ -633,6 +735,11 @@ static int hrtimer_switch_to_hres(void)
633 return 1; 735 return 1;
634} 736}
635 737
738static inline void hrtimer_raise_softirq(void)
739{
740 raise_softirq(HRTIMER_SOFTIRQ);
741}
742
636#else 743#else
637 744
638static inline int hrtimer_hres_active(void) { return 0; } 745static inline int hrtimer_hres_active(void) { return 0; }
@@ -651,6 +758,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
651{ 758{
652 return 0; 759 return 0;
653} 760}
761static inline void hrtimer_raise_softirq(void) { }
654 762
655#endif /* CONFIG_HIGH_RES_TIMERS */ 763#endif /* CONFIG_HIGH_RES_TIMERS */
656 764
@@ -730,6 +838,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
730 struct hrtimer *entry; 838 struct hrtimer *entry;
731 int leftmost = 1; 839 int leftmost = 1;
732 840
841 debug_hrtimer_activate(timer);
842
733 /* 843 /*
734 * Find the right place in the rbtree: 844 * Find the right place in the rbtree:
735 */ 845 */
@@ -826,6 +936,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
826 * reprogramming happens in the interrupt handler. This is a 936 * reprogramming happens in the interrupt handler. This is a
827 * rare case and less expensive than a smp call. 937 * rare case and less expensive than a smp call.
828 */ 938 */
939 debug_hrtimer_deactivate(timer);
829 timer_stats_hrtimer_clear_start_info(timer); 940 timer_stats_hrtimer_clear_start_info(timer);
830 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 941 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
831 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 942 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -850,7 +961,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
850{ 961{
851 struct hrtimer_clock_base *base, *new_base; 962 struct hrtimer_clock_base *base, *new_base;
852 unsigned long flags; 963 unsigned long flags;
853 int ret; 964 int ret, raise;
854 965
855 base = lock_hrtimer_base(timer, &flags); 966 base = lock_hrtimer_base(timer, &flags);
856 967
@@ -873,6 +984,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
873 tim = ktime_add_safe(tim, base->resolution); 984 tim = ktime_add_safe(tim, base->resolution);
874#endif 985#endif
875 } 986 }
987
876 timer->expires = tim; 988 timer->expires = tim;
877 989
878 timer_stats_hrtimer_set_start_info(timer); 990 timer_stats_hrtimer_set_start_info(timer);
@@ -884,8 +996,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
884 enqueue_hrtimer(timer, new_base, 996 enqueue_hrtimer(timer, new_base,
885 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 997 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
886 998
999 /*
1000 * The timer may be expired and moved to the cb_pending
1001 * list. We can not raise the softirq with base lock held due
1002 * to a possible deadlock with runqueue lock.
1003 */
1004 raise = timer->state == HRTIMER_STATE_PENDING;
1005
887 unlock_hrtimer_base(timer, &flags); 1006 unlock_hrtimer_base(timer, &flags);
888 1007
1008 if (raise)
1009 hrtimer_raise_softirq();
1010
889 return ret; 1011 return ret;
890} 1012}
891EXPORT_SYMBOL_GPL(hrtimer_start); 1013EXPORT_SYMBOL_GPL(hrtimer_start);
@@ -996,14 +1118,8 @@ ktime_t hrtimer_get_next_event(void)
996} 1118}
997#endif 1119#endif
998 1120
999/** 1121static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1000 * hrtimer_init - initialize a timer to the given clock 1122 enum hrtimer_mode mode)
1001 * @timer: the timer to be initialized
1002 * @clock_id: the clock to be used
1003 * @mode: timer mode abs/rel
1004 */
1005void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1006 enum hrtimer_mode mode)
1007{ 1123{
1008 struct hrtimer_cpu_base *cpu_base; 1124 struct hrtimer_cpu_base *cpu_base;
1009 1125
@@ -1024,6 +1140,19 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1024 memset(timer->start_comm, 0, TASK_COMM_LEN); 1140 memset(timer->start_comm, 0, TASK_COMM_LEN);
1025#endif 1141#endif
1026} 1142}
1143
1144/**
1145 * hrtimer_init - initialize a timer to the given clock
1146 * @timer: the timer to be initialized
1147 * @clock_id: the clock to be used
1148 * @mode: timer mode abs/rel
1149 */
1150void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1151 enum hrtimer_mode mode)
1152{
1153 debug_hrtimer_init(timer);
1154 __hrtimer_init(timer, clock_id, mode);
1155}
1027EXPORT_SYMBOL_GPL(hrtimer_init); 1156EXPORT_SYMBOL_GPL(hrtimer_init);
1028 1157
1029/** 1158/**
@@ -1057,6 +1186,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1057 timer = list_entry(cpu_base->cb_pending.next, 1186 timer = list_entry(cpu_base->cb_pending.next,
1058 struct hrtimer, cb_entry); 1187 struct hrtimer, cb_entry);
1059 1188
1189 debug_hrtimer_deactivate(timer);
1060 timer_stats_account_hrtimer(timer); 1190 timer_stats_account_hrtimer(timer);
1061 1191
1062 fn = timer->function; 1192 fn = timer->function;
@@ -1105,6 +1235,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1105 enum hrtimer_restart (*fn)(struct hrtimer *); 1235 enum hrtimer_restart (*fn)(struct hrtimer *);
1106 int restart; 1236 int restart;
1107 1237
1238 debug_hrtimer_deactivate(timer);
1108 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1239 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1109 timer_stats_account_hrtimer(timer); 1240 timer_stats_account_hrtimer(timer);
1110 1241
@@ -1363,22 +1494,27 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1363{ 1494{
1364 struct hrtimer_sleeper t; 1495 struct hrtimer_sleeper t;
1365 struct timespec __user *rmtp; 1496 struct timespec __user *rmtp;
1497 int ret = 0;
1366 1498
1367 hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS); 1499 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
1500 HRTIMER_MODE_ABS);
1368 t.timer.expires.tv64 = restart->nanosleep.expires; 1501 t.timer.expires.tv64 = restart->nanosleep.expires;
1369 1502
1370 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1503 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1371 return 0; 1504 goto out;
1372 1505
1373 rmtp = restart->nanosleep.rmtp; 1506 rmtp = restart->nanosleep.rmtp;
1374 if (rmtp) { 1507 if (rmtp) {
1375 int ret = update_rmtp(&t.timer, rmtp); 1508 ret = update_rmtp(&t.timer, rmtp);
1376 if (ret <= 0) 1509 if (ret <= 0)
1377 return ret; 1510 goto out;
1378 } 1511 }
1379 1512
1380 /* The other values in restart are already filled in */ 1513 /* The other values in restart are already filled in */
1381 return -ERESTART_RESTARTBLOCK; 1514 ret = -ERESTART_RESTARTBLOCK;
1515out:
1516 destroy_hrtimer_on_stack(&t.timer);
1517 return ret;
1382} 1518}
1383 1519
1384long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, 1520long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
@@ -1386,20 +1522,23 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1386{ 1522{
1387 struct restart_block *restart; 1523 struct restart_block *restart;
1388 struct hrtimer_sleeper t; 1524 struct hrtimer_sleeper t;
1525 int ret = 0;
1389 1526
1390 hrtimer_init(&t.timer, clockid, mode); 1527 hrtimer_init_on_stack(&t.timer, clockid, mode);
1391 t.timer.expires = timespec_to_ktime(*rqtp); 1528 t.timer.expires = timespec_to_ktime(*rqtp);
1392 if (do_nanosleep(&t, mode)) 1529 if (do_nanosleep(&t, mode))
1393 return 0; 1530 goto out;
1394 1531
1395 /* Absolute timers do not update the rmtp value and restart: */ 1532 /* Absolute timers do not update the rmtp value and restart: */
1396 if (mode == HRTIMER_MODE_ABS) 1533 if (mode == HRTIMER_MODE_ABS) {
1397 return -ERESTARTNOHAND; 1534 ret = -ERESTARTNOHAND;
1535 goto out;
1536 }
1398 1537
1399 if (rmtp) { 1538 if (rmtp) {
1400 int ret = update_rmtp(&t.timer, rmtp); 1539 ret = update_rmtp(&t.timer, rmtp);
1401 if (ret <= 0) 1540 if (ret <= 0)
1402 return ret; 1541 goto out;
1403 } 1542 }
1404 1543
1405 restart = &current_thread_info()->restart_block; 1544 restart = &current_thread_info()->restart_block;
@@ -1408,7 +1547,10 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1408 restart->nanosleep.rmtp = rmtp; 1547 restart->nanosleep.rmtp = rmtp;
1409 restart->nanosleep.expires = t.timer.expires.tv64; 1548 restart->nanosleep.expires = t.timer.expires.tv64;
1410 1549
1411 return -ERESTART_RESTARTBLOCK; 1550 ret = -ERESTART_RESTARTBLOCK;
1551out:
1552 destroy_hrtimer_on_stack(&t.timer);
1553 return ret;
1412} 1554}
1413 1555
1414asmlinkage long 1556asmlinkage long
@@ -1453,6 +1595,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1453 while ((node = rb_first(&old_base->active))) { 1595 while ((node = rb_first(&old_base->active))) {
1454 timer = rb_entry(node, struct hrtimer, node); 1596 timer = rb_entry(node, struct hrtimer, node);
1455 BUG_ON(hrtimer_callback_running(timer)); 1597 BUG_ON(hrtimer_callback_running(timer));
1598 debug_hrtimer_deactivate(timer);
1456 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1599 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
1457 timer->base = new_base; 1600 timer->base = new_base;
1458 /* 1601 /*
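The new CONFIG_DEBUG_OBJECTS_TIMERS hooks track each hrtimer's lifetime, so timers that live on the stack must be announced with hrtimer_init_on_stack() and retired with destroy_hrtimer_on_stack() before the frame goes away; the futex and nanosleep conversions above do exactly that on every exit path. A condensed sketch of the pattern those call sites follow (restart and error handling omitted):

    static void sleep_until(ktime_t abs_expiry)
    {
            struct hrtimer_sleeper t;

            /* On-stack timer: tell the debugobjects code about it explicitly. */
            hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
            hrtimer_init_sleeper(&t, current);
            t.timer.expires = abs_expiry;

            set_current_state(TASK_UNINTERRUPTIBLE);
            hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
            if (hrtimer_active(&t.timer))
                    schedule();             /* the sleeper callback wakes us */
            hrtimer_cancel(&t.timer);
            __set_current_state(TASK_RUNNING);

            /* Must happen before the stack frame vanishes. */
            destroy_hrtimer_on_stack(&t.timer);
    }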
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6d9204f3a370..38a25b8d8bff 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/interrupt.h> 2#include <linux/interrupt.h>
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/gfp.h>
4 5
5/* 6/*
6 * Device resource management aware IRQ request/free implementation. 7 * Device resource management aware IRQ request/free implementation.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a01464287..46d6611a33bb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/slab.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -149,6 +150,26 @@ void disable_irq(unsigned int irq)
149} 150}
150EXPORT_SYMBOL(disable_irq); 151EXPORT_SYMBOL(disable_irq);
151 152
153static void __enable_irq(struct irq_desc *desc, unsigned int irq)
154{
155 switch (desc->depth) {
156 case 0:
157 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
158 WARN_ON(1);
159 break;
160 case 1: {
161 unsigned int status = desc->status & ~IRQ_DISABLED;
162
163 /* Prevent probing on this irq: */
164 desc->status = status | IRQ_NOPROBE;
165 check_irq_resend(desc, irq);
166 /* fall-through */
167 }
168 default:
169 desc->depth--;
170 }
171}
172
152/** 173/**
153 * enable_irq - enable handling of an irq 174 * enable_irq - enable handling of an irq
154 * @irq: Interrupt to enable 175 * @irq: Interrupt to enable
@@ -168,22 +189,7 @@ void enable_irq(unsigned int irq)
168 return; 189 return;
169 190
170 spin_lock_irqsave(&desc->lock, flags); 191 spin_lock_irqsave(&desc->lock, flags);
171 switch (desc->depth) { 192 __enable_irq(desc, irq);
172 case 0:
173 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
174 WARN_ON(1);
175 break;
176 case 1: {
177 unsigned int status = desc->status & ~IRQ_DISABLED;
178
179 /* Prevent probing on this irq: */
180 desc->status = status | IRQ_NOPROBE;
181 check_irq_resend(desc, irq);
182 /* fall-through */
183 }
184 default:
185 desc->depth--;
186 }
187 spin_unlock_irqrestore(&desc->lock, flags); 193 spin_unlock_irqrestore(&desc->lock, flags);
188} 194}
189EXPORT_SYMBOL(enable_irq); 195EXPORT_SYMBOL(enable_irq);
@@ -364,7 +370,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
364 compat_irq_chip_set_default_handler(desc); 370 compat_irq_chip_set_default_handler(desc);
365 371
366 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 372 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
367 IRQ_INPROGRESS); 373 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
368 374
369 if (!(desc->status & IRQ_NOAUTOEN)) { 375 if (!(desc->status & IRQ_NOAUTOEN)) {
370 desc->depth = 0; 376 desc->depth = 0;
@@ -380,6 +386,16 @@ int setup_irq(unsigned int irq, struct irqaction *new)
380 /* Reset broken irq detection when installing new handler */ 386 /* Reset broken irq detection when installing new handler */
381 desc->irq_count = 0; 387 desc->irq_count = 0;
382 desc->irqs_unhandled = 0; 388 desc->irqs_unhandled = 0;
389
390 /*
391 * Check whether we disabled the irq via the spurious handler
392 * before. Reenable it and give it another chance.
393 */
394 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
395 desc->status &= ~IRQ_SPURIOUS_DISABLED;
396 __enable_irq(desc, irq);
397 }
398
383 spin_unlock_irqrestore(&desc->lock, flags); 399 spin_unlock_irqrestore(&desc->lock, flags);
384 400
385 new->irq = irq; 401 new->irq = irq;
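The depth handling moves out of enable_irq() into __enable_irq() so setup_irq() can reuse it: when a new shared handler is registered on a line the spurious-IRQ detector had shut down (now flagged IRQ_SPURIOUS_DISABLED, see the spurious.c hunk below), the line is given another chance. The depth counter also means driver-side disable/enable calls nest; roughly, for a hypothetical device:

    static void mydev_reprogram(struct mydev *dev)   /* hypothetical driver */
    {
            disable_irq(dev->irq);    /* depth 0 -> 1: line masked, handlers finished */
            disable_irq(dev->irq);    /* depth 1 -> 2: nests, still masked */

            mydev_write_config(dev);  /* no interrupt handler can run here */

            enable_irq(dev->irq);     /* depth 2 -> 1: still masked */
            enable_irq(dev->irq);     /* depth 1 -> 0: line live again */
            /* one more enable_irq() would hit the "Unbalanced enable
             * for IRQ" warning in __enable_irq() */
    }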
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 088dabbf2d6a..c66d3f10e853 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -209,8 +209,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
209 * Now kill the IRQ 209 * Now kill the IRQ
210 */ 210 */
211 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 211 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
212 desc->status |= IRQ_DISABLED; 212 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
213 desc->depth = 1; 213 desc->depth++;
214 desc->chip->disable(irq); 214 desc->chip->disable(irq);
215 } 215 }
216 desc->irqs_unhandled = 0; 216 desc->irqs_unhandled = 0;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f091d13def00..6fc0040f3e3a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = {
472 472
473static int __init kallsyms_init(void) 473static int __init kallsyms_init(void)
474{ 474{
475 struct proc_dir_entry *entry; 475 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
476
477 entry = create_proc_entry("kallsyms", 0444, NULL);
478 if (entry)
479 entry->proc_fops = &kallsyms_operations;
480 return 0; 476 return 0;
481} 477}
482__initcall(kallsyms_init); 478__initcall(kallsyms_init);
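This hunk, like the latencytop.c and lockdep_proc.c hunks further down, folds the two-step create_proc_entry() plus proc_fops assignment into a single proc_create() call, so the /proc entry is never visible without its file operations. The shape of the conversion, shown for a hypothetical "example" entry:

    static int __init example_init(void)
    {
            /* Before: the entry existed before proc_fops was filled in.
             *     entry = create_proc_entry("example", 0444, NULL);
             *     if (entry)
             *             entry->proc_fops = &example_fops;
             */

            /* After: name, mode, parent and fops are registered in one call. */
            proc_create("example", 0444, NULL, &example_fops);
            return 0;
    }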
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cb85c79989b4..1c5fcacbcf33 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1217,7 +1217,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1217 } 1217 }
1218 1218
1219 /* match ? */ 1219 /* match ? */
1220 if (system_ram >= start && system_ram <= end) { 1220 if (system_ram >= start && system_ram < end) {
1221 *crash_size = size; 1221 *crash_size = size;
1222 break; 1222 break;
1223 } 1223 }
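The comparison change makes each crashkernel=<range>:<size> entry a half-open interval [start, end), so a machine whose RAM size lands exactly on a boundary is matched by the following range instead of the earlier one. With the documented example syntax (the sizes below are only for illustration):

    crashkernel=512M-2G:64M,2G-:128M    with    system_ram == 2G

        old test (system_ram <= end):  2G matches 512M-2G  ->  reserve  64M
        new test (system_ram <  end):  2G matches 2G-      ->  reserve 128M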
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 1bd0ec1c80b2..39e31a036f5b 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -61,7 +61,7 @@ struct kgdb_state {
61 int err_code; 61 int err_code;
62 int cpu; 62 int cpu;
63 int pass_exception; 63 int pass_exception;
64 long threadid; 64 unsigned long threadid;
65 long kgdb_usethreadid; 65 long kgdb_usethreadid;
66 struct pt_regs *linux_regs; 66 struct pt_regs *linux_regs;
67}; 67};
@@ -146,7 +146,7 @@ atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
146 * the other CPUs might interfere with your debugging context, so 146 * the other CPUs might interfere with your debugging context, so
147 * use this with care: 147 * use this with care:
148 */ 148 */
149int kgdb_do_roundup = 1; 149static int kgdb_do_roundup = 1;
150 150
151static int __init opt_nokgdbroundup(char *str) 151static int __init opt_nokgdbroundup(char *str)
152{ 152{
@@ -438,7 +438,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
438 * While we find nice hex chars, build a long_val. 438 * While we find nice hex chars, build a long_val.
439 * Return number of chars processed. 439 * Return number of chars processed.
440 */ 440 */
441int kgdb_hex2long(char **ptr, long *long_val) 441int kgdb_hex2long(char **ptr, unsigned long *long_val)
442{ 442{
443 int hex_val; 443 int hex_val;
444 int num = 0; 444 int num = 0;
@@ -709,7 +709,7 @@ int kgdb_isremovedbreak(unsigned long addr)
709 return 0; 709 return 0;
710} 710}
711 711
712int remove_all_break(void) 712static int remove_all_break(void)
713{ 713{
714 unsigned long addr; 714 unsigned long addr;
715 int error; 715 int error;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index e2764047ec03..8df97d3dfda8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 28#include <linux/completion.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include <linux/mount.h> 33#include <linux/mount.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 92cf6930ab51..bd1b9ea024e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -98,7 +98,7 @@ static void create_kthread(struct kthread_create_info *create)
98 struct sched_param param = { .sched_priority = 0 }; 98 struct sched_param param = { .sched_priority = 0 };
99 wait_for_completion(&create->started); 99 wait_for_completion(&create->started);
100 read_lock(&tasklist_lock); 100 read_lock(&tasklist_lock);
101 create->result = find_task_by_pid(pid); 101 create->result = find_task_by_pid_ns(pid, &init_pid_ns);
102 read_unlock(&tasklist_lock); 102 read_unlock(&tasklist_lock);
103 /* 103 /*
104 * root may have changed our (kthreadd's) priority or CPU mask. 104 * root may have changed our (kthreadd's) priority or CPU mask.
@@ -144,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
144 144
145 spin_lock(&kthread_create_lock); 145 spin_lock(&kthread_create_lock);
146 list_add_tail(&create.list, &kthread_create_list); 146 list_add_tail(&create.list, &kthread_create_list);
147 wake_up_process(kthreadd_task);
148 spin_unlock(&kthread_create_lock); 147 spin_unlock(&kthread_create_lock);
149 148
149 wake_up_process(kthreadd_task);
150 wait_for_completion(&create.done); 150 wait_for_completion(&create.done);
151 151
152 if (!IS_ERR(create.result)) { 152 if (!IS_ERR(create.result)) {
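Besides the pid-namespace-aware lookup, kthread_create() now queues its request under kthread_create_lock but wakes kthreadd only after dropping the lock, so the freshly woken thread does not immediately contend on the spinlock its waker still holds. The general shape of that change, with hypothetical names:

    static void submit_request(struct request *req)   /* illustrative only */
    {
            /* Publish the work under the lock... */
            spin_lock(&pending_lock);
            list_add_tail(&req->list, &pending_list);
            spin_unlock(&pending_lock);

            /* ...and wake the consumer after dropping it, so it can run
             * right away without bouncing off pending_lock. */
            wake_up_process(consumer_task);
    }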
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 7c74dab0d21b..5e7b45c56923 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -233,14 +233,7 @@ static struct file_operations lstats_fops = {
233 233
234static int __init init_lstats_procfs(void) 234static int __init init_lstats_procfs(void)
235{ 235{
236 struct proc_dir_entry *pe; 236 proc_create("latency_stats", 0644, NULL, &lstats_fops);
237
238 pe = create_proc_entry("latency_stats", 0644, NULL);
239 if (!pe)
240 return -ENOMEM;
241
242 pe->proc_fops = &lstats_fops;
243
244 return 0; 237 return 0;
245} 238}
246__initcall(init_lstats_procfs); 239__initcall(init_lstats_procfs);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8a135bd163c2..dc5d29648d85 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = {
660 660
661static int __init lockdep_proc_init(void) 661static int __init lockdep_proc_init(void)
662{ 662{
663 struct proc_dir_entry *entry; 663 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
664 664 proc_create("lockdep_stats", S_IRUSR, NULL,
665 entry = create_proc_entry("lockdep", S_IRUSR, NULL); 665 &proc_lockdep_stats_operations);
666 if (entry)
667 entry->proc_fops = &proc_lockdep_operations;
668
669 entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
670 if (entry)
671 entry->proc_fops = &proc_lockdep_stats_operations;
672 666
673#ifdef CONFIG_LOCK_STAT 667#ifdef CONFIG_LOCK_STAT
674 entry = create_proc_entry("lock_stat", S_IRUSR, NULL); 668 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
675 if (entry)
676 entry->proc_fops = &proc_lock_stat_operations;
677#endif 669#endif
678 670
679 return 0; 671 return 0;
diff --git a/kernel/marker.c b/kernel/marker.c
index 005b95954593..b5a9fe1d50d5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -23,12 +23,13 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/marker.h> 24#include <linux/marker.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h>
26 27
27extern struct marker __start___markers[]; 28extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 29extern struct marker __stop___markers[];
29 30
30/* Set to 1 to enable marker debug output */ 31/* Set to 1 to enable marker debug output */
31const int marker_debug; 32static const int marker_debug;
32 33
33/* 34/*
34 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
diff --git a/kernel/module.c b/kernel/module.c
index 8d6cccc6c3cf..8e4528c9909f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -164,131 +164,140 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
164 return NULL; 164 return NULL;
165} 165}
166 166
167static void printk_unused_warning(const char *name) 167static bool always_ok(bool gplok, bool warn, const char *name)
168{ 168{
169 printk(KERN_WARNING "Symbol %s is marked as UNUSED, " 169 return true;
170 "however this module is using it.\n", name);
171 printk(KERN_WARNING "This symbol will go away in the future.\n");
172 printk(KERN_WARNING "Please evalute if this is the right api to use, "
173 "and if it really is, submit a report the linux kernel "
174 "mailinglist together with submitting your code for "
175 "inclusion.\n");
176} 170}
177 171
178/* Find a symbol, return value, crc and module which owns it */ 172static bool printk_unused_warning(bool gplok, bool warn, const char *name)
179static unsigned long __find_symbol(const char *name,
180 struct module **owner,
181 const unsigned long **crc,
182 int gplok)
183{ 173{
184 struct module *mod; 174 if (warn) {
185 const struct kernel_symbol *ks; 175 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
186 176 "however this module is using it.\n", name);
187 /* Core kernel first. */ 177 printk(KERN_WARNING
188 *owner = NULL; 178 "This symbol will go away in the future.\n");
189 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); 179 printk(KERN_WARNING
190 if (ks) { 180 "Please evalute if this is the right api to use and if "
191 *crc = symversion(__start___kcrctab, (ks - __start___ksymtab)); 181 "it really is, submit a report the linux kernel "
192 return ks->value; 182 "mailinglist together with submitting your code for "
193 } 183 "inclusion.\n");
194 if (gplok) {
195 ks = lookup_symbol(name, __start___ksymtab_gpl,
196 __stop___ksymtab_gpl);
197 if (ks) {
198 *crc = symversion(__start___kcrctab_gpl,
199 (ks - __start___ksymtab_gpl));
200 return ks->value;
201 }
202 } 184 }
203 ks = lookup_symbol(name, __start___ksymtab_gpl_future, 185 return true;
204 __stop___ksymtab_gpl_future); 186}
205 if (ks) { 187
206 if (!gplok) { 188static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
207 printk(KERN_WARNING "Symbol %s is being used " 189{
208 "by a non-GPL module, which will not " 190 if (!gplok)
209 "be allowed in the future\n", name); 191 return false;
210 printk(KERN_WARNING "Please see the file " 192 return printk_unused_warning(gplok, warn, name);
211 "Documentation/feature-removal-schedule.txt " 193}
212 "in the kernel source tree for more " 194
213 "details.\n"); 195static bool gpl_only(bool gplok, bool warn, const char *name)
214 } 196{
215 *crc = symversion(__start___kcrctab_gpl_future, 197 return gplok;
216 (ks - __start___ksymtab_gpl_future)); 198}
217 return ks->value; 199
200static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
201{
202 if (!gplok && warn) {
203 printk(KERN_WARNING "Symbol %s is being used "
204 "by a non-GPL module, which will not "
205 "be allowed in the future\n", name);
206 printk(KERN_WARNING "Please see the file "
207 "Documentation/feature-removal-schedule.txt "
208 "in the kernel source tree for more details.\n");
218 } 209 }
210 return true;
211}
219 212
220 ks = lookup_symbol(name, __start___ksymtab_unused, 213struct symsearch {
221 __stop___ksymtab_unused); 214 const struct kernel_symbol *start, *stop;
222 if (ks) { 215 const unsigned long *crcs;
223 printk_unused_warning(name); 216 bool (*check)(bool gplok, bool warn, const char *name);
224 *crc = symversion(__start___kcrctab_unused, 217};
225 (ks - __start___ksymtab_unused)); 218
226 return ks->value; 219/* Look through this array of symbol tables for a symbol match which
220 * passes the check function. */
221static const struct kernel_symbol *search_symarrays(const struct symsearch *arr,
222 unsigned int num,
223 const char *name,
224 bool gplok,
225 bool warn,
226 const unsigned long **crc)
227{
228 unsigned int i;
229 const struct kernel_symbol *ks;
230
231 for (i = 0; i < num; i++) {
232 ks = lookup_symbol(name, arr[i].start, arr[i].stop);
233 if (!ks || !arr[i].check(gplok, warn, name))
234 continue;
235
236 if (crc)
237 *crc = symversion(arr[i].crcs, ks - arr[i].start);
238 return ks;
227 } 239 }
240 return NULL;
241}
228 242
229 if (gplok) 243/* Find a symbol, return value, (optional) crc and (optional) module
230 ks = lookup_symbol(name, __start___ksymtab_unused_gpl, 244 * which owns it */
231 __stop___ksymtab_unused_gpl); 245static unsigned long find_symbol(const char *name,
246 struct module **owner,
247 const unsigned long **crc,
248 bool gplok,
249 bool warn)
250{
251 struct module *mod;
252 const struct kernel_symbol *ks;
253 const struct symsearch arr[] = {
254 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
255 always_ok },
256 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
257 __start___kcrctab_gpl, gpl_only },
258 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
259 __start___kcrctab_gpl_future, warn_if_not_gpl },
260 { __start___ksymtab_unused, __stop___ksymtab_unused,
261 __start___kcrctab_unused, printk_unused_warning },
262 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
263 __start___kcrctab_unused_gpl, gpl_only_unused_warning },
264 };
265
266 /* Core kernel first. */
267 ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc);
232 if (ks) { 268 if (ks) {
233 printk_unused_warning(name); 269 if (owner)
234 *crc = symversion(__start___kcrctab_unused_gpl, 270 *owner = NULL;
235 (ks - __start___ksymtab_unused_gpl));
236 return ks->value; 271 return ks->value;
237 } 272 }
238 273
239 /* Now try modules. */ 274 /* Now try modules. */
240 list_for_each_entry(mod, &modules, list) { 275 list_for_each_entry(mod, &modules, list) {
241 *owner = mod; 276 struct symsearch arr[] = {
242 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); 277 { mod->syms, mod->syms + mod->num_syms, mod->crcs,
278 always_ok },
279 { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
280 mod->gpl_crcs, gpl_only },
281 { mod->gpl_future_syms,
282 mod->gpl_future_syms + mod->num_gpl_future_syms,
283 mod->gpl_future_crcs, warn_if_not_gpl },
284 { mod->unused_syms,
285 mod->unused_syms + mod->num_unused_syms,
286 mod->unused_crcs, printk_unused_warning },
287 { mod->unused_gpl_syms,
288 mod->unused_gpl_syms + mod->num_unused_gpl_syms,
289 mod->unused_gpl_crcs, gpl_only_unused_warning },
290 };
291
292 ks = search_symarrays(arr, ARRAY_SIZE(arr),
293 name, gplok, warn, crc);
243 if (ks) { 294 if (ks) {
244 *crc = symversion(mod->crcs, (ks - mod->syms)); 295 if (owner)
245 return ks->value; 296 *owner = mod;
246 }
247
248 if (gplok) {
249 ks = lookup_symbol(name, mod->gpl_syms,
250 mod->gpl_syms + mod->num_gpl_syms);
251 if (ks) {
252 *crc = symversion(mod->gpl_crcs,
253 (ks - mod->gpl_syms));
254 return ks->value;
255 }
256 }
257 ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
258 if (ks) {
259 printk_unused_warning(name);
260 *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
261 return ks->value;
262 }
263
264 if (gplok) {
265 ks = lookup_symbol(name, mod->unused_gpl_syms,
266 mod->unused_gpl_syms + mod->num_unused_gpl_syms);
267 if (ks) {
268 printk_unused_warning(name);
269 *crc = symversion(mod->unused_gpl_crcs,
270 (ks - mod->unused_gpl_syms));
271 return ks->value;
272 }
273 }
274 ks = lookup_symbol(name, mod->gpl_future_syms,
275 (mod->gpl_future_syms +
276 mod->num_gpl_future_syms));
277 if (ks) {
278 if (!gplok) {
279 printk(KERN_WARNING "Symbol %s is being used "
280 "by a non-GPL module, which will not "
281 "be allowed in the future\n", name);
282 printk(KERN_WARNING "Please see the file "
283 "Documentation/feature-removal-schedule.txt "
284 "in the kernel source tree for more "
285 "details.\n");
286 }
287 *crc = symversion(mod->gpl_future_crcs,
288 (ks - mod->gpl_future_syms));
289 return ks->value; 297 return ks->value;
290 } 298 }
291 } 299 }
300
292 DEBUGP("Failed to find symbol %s\n", name); 301 DEBUGP("Failed to find symbol %s\n", name);
293 return -ENOENT; 302 return -ENOENT;
294} 303}
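
The hunk above replaces five nearly identical lookup blocks with one table of {start, stop, crcs, check} entries walked by search_symarrays(). A minimal userspace sketch of the same table-driven pattern, with illustrative types and check callbacks rather than the kernel's:

/* Sketch: table-driven lookup over several symbol arrays, in the spirit
 * of the search_symarrays() refactor above.  Userspace approximation. */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

struct sym { const char *name; unsigned long value; };

struct symsearch {
    const struct sym *start, *stop;
    bool (*check)(bool gplok, const char *name);   /* per-table policy */
};

static bool always_ok(bool gplok, const char *name)
{
    (void)gplok; (void)name;
    return true;
}

static bool gpl_only(bool gplok, const char *name)
{
    (void)name;
    return gplok;
}

static const struct sym *lookup(const char *name,
                                const struct sym *start, const struct sym *stop)
{
    for (const struct sym *s = start; s < stop; s++)
        if (strcmp(s->name, name) == 0)
            return s;
    return NULL;
}

/* One loop replaces a chain of copy-pasted per-table blocks. */
static const struct sym *search_symarrays(const struct symsearch *arr,
                                          size_t num, const char *name,
                                          bool gplok)
{
    for (size_t i = 0; i < num; i++) {
        const struct sym *s = lookup(name, arr[i].start, arr[i].stop);
        if (s && arr[i].check(gplok, name))
            return s;
    }
    return NULL;
}

int main(void)
{
    static const struct sym plain[] = { { "plain_sym", 1 } };
    static const struct sym gpl[]   = { { "gpl_sym",   2 } };
    const struct symsearch arr[] = {
        { plain, plain + 1, always_ok },
        { gpl,   gpl + 1,   gpl_only  },
    };
    const struct sym *s = search_symarrays(arr, 2, "gpl_sym", false);

    printf("%s\n", s ? "found" : "not found (non-GPL caller)");
    return 0;
}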
@@ -736,12 +745,13 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
736 if (!forced && module_refcount(mod) != 0) 745 if (!forced && module_refcount(mod) != 0)
737 wait_for_zero_refcount(mod); 746 wait_for_zero_refcount(mod);
738 747
748 mutex_unlock(&module_mutex);
739 /* Final destruction now noone is using it. */ 749 /* Final destruction now noone is using it. */
740 if (mod->exit != NULL) { 750 if (mod->exit != NULL)
741 mutex_unlock(&module_mutex);
742 mod->exit(); 751 mod->exit();
743 mutex_lock(&module_mutex); 752 blocking_notifier_call_chain(&module_notify_list,
744 } 753 MODULE_STATE_GOING, mod);
754 mutex_lock(&module_mutex);
745 /* Store the name of the last unloaded module for diagnostic purposes */ 755 /* Store the name of the last unloaded module for diagnostic purposes */
746 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 756 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
747 free_module(mod); 757 free_module(mod);
@@ -777,10 +787,9 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
777void __symbol_put(const char *symbol) 787void __symbol_put(const char *symbol)
778{ 788{
779 struct module *owner; 789 struct module *owner;
780 const unsigned long *crc;
781 790
782 preempt_disable(); 791 preempt_disable();
783 if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1))) 792 if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
784 BUG(); 793 BUG();
785 module_put(owner); 794 module_put(owner);
786 preempt_enable(); 795 preempt_enable();
@@ -881,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = {
881 890
882static const char vermagic[] = VERMAGIC_STRING; 891static const char vermagic[] = VERMAGIC_STRING;
883 892
893static int try_to_force_load(struct module *mod, const char *symname)
894{
895#ifdef CONFIG_MODULE_FORCE_LOAD
896 if (!(tainted & TAINT_FORCED_MODULE))
897 printk("%s: no version for \"%s\" found: kernel tainted.\n",
898 mod->name, symname);
899 add_taint_module(mod, TAINT_FORCED_MODULE);
900 return 0;
901#else
902 return -ENOEXEC;
903#endif
904}
905
884#ifdef CONFIG_MODVERSIONS 906#ifdef CONFIG_MODVERSIONS
885static int check_version(Elf_Shdr *sechdrs, 907static int check_version(Elf_Shdr *sechdrs,
886 unsigned int versindex, 908 unsigned int versindex,
@@ -905,18 +927,18 @@ static int check_version(Elf_Shdr *sechdrs,
905 927
906 if (versions[i].crc == *crc) 928 if (versions[i].crc == *crc)
907 return 1; 929 return 1;
908 printk("%s: disagrees about version of symbol %s\n",
909 mod->name, symname);
910 DEBUGP("Found checksum %lX vs module %lX\n", 930 DEBUGP("Found checksum %lX vs module %lX\n",
911 *crc, versions[i].crc); 931 *crc, versions[i].crc);
912 return 0; 932 goto bad_version;
913 } 933 }
914 /* Not in module's version table. OK, but that taints the kernel. */ 934
915 if (!(tainted & TAINT_FORCED_MODULE)) 935 if (!try_to_force_load(mod, symname))
916 printk("%s: no version for \"%s\" found: kernel tainted.\n", 936 return 1;
917 mod->name, symname); 937
918 add_taint_module(mod, TAINT_FORCED_MODULE); 938bad_version:
919 return 1; 939 printk("%s: disagrees about version of symbol %s\n",
940 mod->name, symname);
941 return 0;
920} 942}
921 943
922static inline int check_modstruct_version(Elf_Shdr *sechdrs, 944static inline int check_modstruct_version(Elf_Shdr *sechdrs,
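
try_to_force_load() above centralizes the "taint and continue, or refuse" decision that check_version() and load_module() previously open-coded, gated on CONFIG_MODULE_FORCE_LOAD. A hedged userspace sketch of that compile-time gate; the flag names mimic the kernel's, the rest is invented for illustration:

/* Sketch: force-load either taints and continues, or is refused outright,
 * depending on a build-time option. */
#include <stdio.h>

/* #define CONFIG_MODULE_FORCE_LOAD */   /* define to allow forced loads */

#define TAINT_FORCED_MODULE (1UL << 1)

static unsigned long tainted;

static int try_to_force_load(const char *mod, const char *what)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
    if (!(tainted & TAINT_FORCED_MODULE))
        printf("%s: no version for \"%s\" found: kernel tainted.\n",
               mod, what);
    tainted |= TAINT_FORCED_MODULE;
    return 0;                       /* carry on, but remember the taint */
#else
    (void)mod; (void)what;
    return -1;                      /* the kernel returns -ENOEXEC here */
#endif
}

int main(void)
{
    int err = try_to_force_load("example", "magic");

    printf("%s, taint mask %#lx\n",
           err ? "load refused" : "loaded with taint", tainted);
    return 0;
}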
@@ -924,13 +946,10 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
924 struct module *mod) 946 struct module *mod)
925{ 947{
926 const unsigned long *crc; 948 const unsigned long *crc;
927 struct module *owner;
928 949
929 if (IS_ERR_VALUE(__find_symbol("struct_module", 950 if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
930 &owner, &crc, 1)))
931 BUG(); 951 BUG();
932 return check_version(sechdrs, versindex, "struct_module", mod, 952 return check_version(sechdrs, versindex, "struct_module", mod, crc);
933 crc);
934} 953}
935 954
936/* First part is kernel version, which we ignore. */ 955/* First part is kernel version, which we ignore. */
@@ -974,8 +993,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
974 unsigned long ret; 993 unsigned long ret;
975 const unsigned long *crc; 994 const unsigned long *crc;
976 995
977 ret = __find_symbol(name, &owner, &crc, 996 ret = find_symbol(name, &owner, &crc,
978 !(mod->taints & TAINT_PROPRIETARY_MODULE)); 997 !(mod->taints & TAINT_PROPRIETARY_MODULE), true);
979 if (!IS_ERR_VALUE(ret)) { 998 if (!IS_ERR_VALUE(ret)) {
980 /* use_module can fail due to OOM, 999 /* use_module can fail due to OOM,
981 or module initialization or unloading */ 1000 or module initialization or unloading */
@@ -991,6 +1010,20 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
991 * J. Corbet <corbet@lwn.net> 1010 * J. Corbet <corbet@lwn.net>
992 */ 1011 */
993#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1013struct module_sect_attr
1014{
1015 struct module_attribute mattr;
1016 char *name;
1017 unsigned long address;
1018};
1019
1020struct module_sect_attrs
1021{
1022 struct attribute_group grp;
1023 unsigned int nsections;
1024 struct module_sect_attr attrs[0];
1025};
1026
994static ssize_t module_sect_show(struct module_attribute *mattr, 1027static ssize_t module_sect_show(struct module_attribute *mattr,
995 struct module *mod, char *buf) 1028 struct module *mod, char *buf)
996{ 1029{
@@ -1001,7 +1034,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1001 1034
1002static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1035static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1003{ 1036{
1004 int section; 1037 unsigned int section;
1005 1038
1006 for (section = 0; section < sect_attrs->nsections; section++) 1039 for (section = 0; section < sect_attrs->nsections; section++)
1007 kfree(sect_attrs->attrs[section].name); 1040 kfree(sect_attrs->attrs[section].name);
@@ -1362,10 +1395,9 @@ void *__symbol_get(const char *symbol)
1362{ 1395{
1363 struct module *owner; 1396 struct module *owner;
1364 unsigned long value; 1397 unsigned long value;
1365 const unsigned long *crc;
1366 1398
1367 preempt_disable(); 1399 preempt_disable();
1368 value = __find_symbol(symbol, &owner, &crc, 1); 1400 value = find_symbol(symbol, &owner, NULL, true, true);
1369 if (IS_ERR_VALUE(value)) 1401 if (IS_ERR_VALUE(value))
1370 value = 0; 1402 value = 0;
1371 else if (strong_try_module_get(owner)) 1403 else if (strong_try_module_get(owner))
@@ -1382,33 +1414,33 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1382 */ 1414 */
1383static int verify_export_symbols(struct module *mod) 1415static int verify_export_symbols(struct module *mod)
1384{ 1416{
1385 const char *name = NULL; 1417 unsigned int i;
1386 unsigned long i, ret = 0;
1387 struct module *owner; 1418 struct module *owner;
1388 const unsigned long *crc; 1419 const struct kernel_symbol *s;
1389 1420 struct {
1390 for (i = 0; i < mod->num_syms; i++) 1421 const struct kernel_symbol *sym;
1391 if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name, 1422 unsigned int num;
1392 &owner, &crc, 1))) { 1423 } arr[] = {
1393 name = mod->syms[i].name; 1424 { mod->syms, mod->num_syms },
1394 ret = -ENOEXEC; 1425 { mod->gpl_syms, mod->num_gpl_syms },
1395 goto dup; 1426 { mod->gpl_future_syms, mod->num_gpl_future_syms },
1396 } 1427 { mod->unused_syms, mod->num_unused_syms },
1428 { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
1429 };
1397 1430
1398 for (i = 0; i < mod->num_gpl_syms; i++) 1431 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1399 if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name, 1432 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1400 &owner, &crc, 1))) { 1433 if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
1401 name = mod->gpl_syms[i].name; 1434 NULL, true, false))) {
1402 ret = -ENOEXEC; 1435 printk(KERN_ERR
1403 goto dup; 1436 "%s: exports duplicate symbol %s"
1437 " (owned by %s)\n",
1438 mod->name, s->name, module_name(owner));
1439 return -ENOEXEC;
1440 }
1404 } 1441 }
1405 1442 }
1406dup: 1443 return 0;
1407 if (ret)
1408 printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n",
1409 mod->name, name, module_name(owner));
1410
1411 return ret;
1412} 1444}
1413 1445
1414/* Change all symbols so that st_value encodes the pointer directly. */ 1446/* Change all symbols so that st_value encodes the pointer directly. */
@@ -1814,8 +1846,9 @@ static struct module *load_module(void __user *umod,
1814 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); 1846 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1815#endif 1847#endif
1816 1848
1817 /* Don't keep modinfo section */ 1849 /* Don't keep modinfo and version sections. */
1818 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1850 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1851 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1819#ifdef CONFIG_KALLSYMS 1852#ifdef CONFIG_KALLSYMS
1820 /* Keep symbol and string tables for decoding later. */ 1853 /* Keep symbol and string tables for decoding later. */
1821 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1854 sechdrs[symindex].sh_flags |= SHF_ALLOC;
@@ -1833,9 +1866,9 @@ static struct module *load_module(void __user *umod,
1833 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1866 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1834 /* This is allowed: modprobe --force will invalidate it. */ 1867 /* This is allowed: modprobe --force will invalidate it. */
1835 if (!modmagic) { 1868 if (!modmagic) {
1836 add_taint_module(mod, TAINT_FORCED_MODULE); 1869 err = try_to_force_load(mod, "magic");
1837 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1870 if (err)
1838 mod->name); 1871 goto free_hdr;
1839 } else if (!same_magic(modmagic, vermagic)) { 1872 } else if (!same_magic(modmagic, vermagic)) {
1840 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 1873 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
1841 mod->name, modmagic, vermagic); 1874 mod->name, modmagic, vermagic);
@@ -1977,7 +2010,8 @@ static struct module *load_module(void __user *umod,
1977 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; 2010 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
1978 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; 2011 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
1979 if (unusedgplcrcindex) 2012 if (unusedgplcrcindex)
1980 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; 2013 mod->unused_gpl_crcs
2014 = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1981 2015
1982#ifdef CONFIG_MODVERSIONS 2016#ifdef CONFIG_MODVERSIONS
1983 if ((mod->num_syms && !crcindex) || 2017 if ((mod->num_syms && !crcindex) ||
@@ -1985,9 +2019,10 @@ static struct module *load_module(void __user *umod,
1985 (mod->num_gpl_future_syms && !gplfuturecrcindex) || 2019 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
1986 (mod->num_unused_syms && !unusedcrcindex) || 2020 (mod->num_unused_syms && !unusedcrcindex) ||
1987 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { 2021 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
1988 printk(KERN_WARNING "%s: No versions for exported symbols." 2022 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
1989 " Tainting kernel.\n", mod->name); 2023 err = try_to_force_load(mod, "nocrc");
1990 add_taint_module(mod, TAINT_FORCED_MODULE); 2024 if (err)
2025 goto cleanup;
1991 } 2026 }
1992#endif 2027#endif
1993 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); 2028 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
@@ -2171,6 +2206,8 @@ sys_init_module(void __user *umod,
2171 mod->state = MODULE_STATE_GOING; 2206 mod->state = MODULE_STATE_GOING;
2172 synchronize_sched(); 2207 synchronize_sched();
2173 module_put(mod); 2208 module_put(mod);
2209 blocking_notifier_call_chain(&module_notify_list,
2210 MODULE_STATE_GOING, mod);
2174 mutex_lock(&module_mutex); 2211 mutex_lock(&module_mutex);
2175 free_module(mod); 2212 free_module(mod);
2176 mutex_unlock(&module_mutex); 2213 mutex_unlock(&module_mutex);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 643360d1bb14..823be11584ef 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl,
31 return 0; 31 return 0;
32} 32}
33 33
34static int notifier_chain_cond_register(struct notifier_block **nl,
35 struct notifier_block *n)
36{
37 while ((*nl) != NULL) {
38 if ((*nl) == n)
39 return 0;
40 if (n->priority > (*nl)->priority)
41 break;
42 nl = &((*nl)->next);
43 }
44 n->next = *nl;
45 rcu_assign_pointer(*nl, n);
46 return 0;
47}
48
34static int notifier_chain_unregister(struct notifier_block **nl, 49static int notifier_chain_unregister(struct notifier_block **nl,
35 struct notifier_block *n) 50 struct notifier_block *n)
36{ 51{
@@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
205EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); 220EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
206 221
207/** 222/**
223 * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
224 * @nh: Pointer to head of the blocking notifier chain
225 * @n: New entry in notifier chain
226 *
227 * Adds a notifier to a blocking notifier chain, only if not already
228 * present in the chain.
229 * Must be called in process context.
230 *
231 * Currently always returns zero.
232 */
233int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
234 struct notifier_block *n)
235{
236 int ret;
237
238 down_write(&nh->rwsem);
239 ret = notifier_chain_cond_register(&nh->head, n);
240 up_write(&nh->rwsem);
241 return ret;
242}
243EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
244
245/**
208 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain 246 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
209 * @nh: Pointer to head of the blocking notifier chain 247 * @nh: Pointer to head of the blocking notifier chain
210 * @n: Entry to remove from notifier chain 248 * @n: Entry to remove from notifier chain
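
notifier_chain_cond_register() above walks the chain in priority order and bails out if the block is already registered, so repeated registrations are harmless. A userspace sketch of that insert-unless-present pattern on a plain singly linked list (no RCU, no locking):

/* Sketch of "insert by priority unless already present", as used by
 * notifier_chain_cond_register() above. */
#include <stdio.h>
#include <stddef.h>

struct nb { int priority; struct nb *next; };

static int cond_register(struct nb **nl, struct nb *n)
{
    while (*nl != NULL) {
        if (*nl == n)
            return 0;               /* already on the chain: nothing to do */
        if (n->priority > (*nl)->priority)
            break;                  /* keep the list sorted, highest first */
        nl = &(*nl)->next;
    }
    n->next = *nl;
    *nl = n;                        /* the kernel uses rcu_assign_pointer() */
    return 0;
}

int main(void)
{
    struct nb a = { .priority = 10 }, b = { .priority = 5 };
    struct nb *head = NULL;

    cond_register(&head, &b);
    cond_register(&head, &a);
    cond_register(&head, &a);       /* duplicate: silently ignored */

    for (struct nb *p = head; p; p = p->next)
        printf("prio %d\n", p->priority);   /* prints 10, then 5 */
    return 0;
}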
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index aead4d69f62b..48d7ed6fc3a4 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,8 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/slab.h>
11#include <linux/nsproxy.h>
10 12
11struct ns_cgroup { 13struct ns_cgroup {
12 struct cgroup_subsys_state css; 14 struct cgroup_subsys_state css;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5d332cf8c63..adc785146a1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
139 goto out; 139 goto out;
140 } 140 }
141 141
142 /*
143 * CLONE_NEWIPC must detach from the undolist: after switching
144 * to a new ipc namespace, the semaphore arrays from the old
145 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
146 * means share undolist with parent, so we must forbid using
147 * it along with CLONE_NEWIPC.
148 */
149 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
150 err = -EINVAL;
151 goto out;
152 }
153
142 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 154 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
143 if (IS_ERR(new_ns)) { 155 if (IS_ERR(new_ns)) {
144 err = PTR_ERR(new_ns); 156 err = PTR_ERR(new_ns);
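
The copy_namespaces() check above rejects CLONE_NEWIPC together with CLONE_SYSVSEM, since an undo list shared with the parent would point into a namespace the child can no longer reach. A small userspace sketch of that mutual-exclusion check; the flag values are shown for illustration only:

/* Sketch: refusing mutually exclusive clone flags, as the CLONE_NEWIPC
 * vs CLONE_SYSVSEM check added above does. */
#include <stdio.h>
#include <errno.h>

#define CLONE_SYSVSEM  0x00040000UL   /* share SysV semaphore undo list */
#define CLONE_NEWIPC   0x08000000UL   /* new IPC namespace */

static int check_clone_flags(unsigned long flags)
{
    /* A fresh IPC namespace makes the old semaphore arrays unreachable,
     * so sharing the undo list with the parent is refused. */
    if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM))
        return -EINVAL;
    return 0;
}

int main(void)
{
    printf("%d\n", check_clone_flags(CLONE_NEWIPC | CLONE_SYSVSEM)); /* -EINVAL */
    printf("%d\n", check_clone_flags(CLONE_NEWIPC));                 /* 0 */
    return 0;
}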
diff --git a/kernel/panic.c b/kernel/panic.c
index 24af9f8bac99..425567f45b9f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic);
153 * 'M' - System experienced a machine check exception. 153 * 'M' - System experienced a machine check exception.
154 * 'B' - System has hit bad_page. 154 * 'B' - System has hit bad_page.
155 * 'U' - Userspace-defined naughtiness. 155 * 'U' - Userspace-defined naughtiness.
156 * 'A' - ACPI table overridden.
157 * 'W' - Taint on warning.
156 * 158 *
157 * The string is overwritten by the next call to print_taint(). 159 * The string is overwritten by the next call to print_taint().
158 */ 160 */
@@ -161,7 +163,7 @@ const char *print_tainted(void)
161{ 163{
162 static char buf[20]; 164 static char buf[20];
163 if (tainted) { 165 if (tainted) {
164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", 166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c",
165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -170,7 +172,8 @@ const char *print_tainted(void)
170 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 172 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
171 tainted & TAINT_USER ? 'U' : ' ', 173 tainted & TAINT_USER ? 'U' : ' ',
172 tainted & TAINT_DIE ? 'D' : ' ', 174 tainted & TAINT_DIE ? 'D' : ' ',
173 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); 175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ',
176 tainted & TAINT_WARN ? 'W' : ' ');
174 } 177 }
175 else 178 else
176 snprintf(buf, sizeof(buf), "Not tainted"); 179 snprintf(buf, sizeof(buf), "Not tainted");
@@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line)
312 print_modules(); 315 print_modules();
313 dump_stack(); 316 dump_stack();
314 print_oops_end_marker(); 317 print_oops_end_marker();
318 add_taint(TAINT_WARN);
315} 319}
316EXPORT_SYMBOL(warn_on_slowpath); 320EXPORT_SYMBOL(warn_on_slowpath);
317#endif 321#endif
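
print_tainted() above gains a tenth flag character, 'W' for TAINT_WARN, set whenever warn_on_slowpath() fires. A hedged sketch of producing such a string from a flag table rather than one long format string; the bit values and buffer size here are illustrative only:

/* Sketch: building a taint string from a bitmask via a flag table. */
#include <stdio.h>

#define T_PROPRIETARY (1UL << 0)
#define T_FORCED      (1UL << 1)
#define T_WARN        (1UL << 9)

static const struct { unsigned long bit; char set, clear; } flags[] = {
    { T_PROPRIETARY, 'P', 'G' },
    { T_FORCED,      'F', ' ' },
    { T_WARN,        'W', ' ' },
};

static const char *print_tainted(unsigned long tainted)
{
    static char buf[32];
    size_t i, n = 0;

    if (!tainted)
        return "Not tainted";
    n += snprintf(buf + n, sizeof(buf) - n, "Tainted: ");
    for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++)
        n += snprintf(buf + n, sizeof(buf) - n, "%c",
                      (tainted & flags[i].bit) ? flags[i].set : flags[i].clear);
    return buf;
}

int main(void)
{
    printf("%s\n", print_tainted(T_FORCED | T_WARN));   /* "Tainted: GFW" */
    return 0;
}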
diff --git a/kernel/pid.c b/kernel/pid.c
index 477691576b33..20d59fa2d493 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -111,10 +111,11 @@ EXPORT_SYMBOL(is_container_init);
111 111
112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
113 113
114static void free_pidmap(struct pid_namespace *pid_ns, int pid) 114static void free_pidmap(struct upid *upid)
115{ 115{
116 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; 116 int nr = upid->nr;
117 int offset = pid & BITS_PER_PAGE_MASK; 117 struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
118 int offset = nr & BITS_PER_PAGE_MASK;
118 119
119 clear_bit(offset, map->page); 120 clear_bit(offset, map->page);
120 atomic_inc(&map->nr_free); 121 atomic_inc(&map->nr_free);
@@ -232,7 +233,7 @@ void free_pid(struct pid *pid)
232 spin_unlock_irqrestore(&pidmap_lock, flags); 233 spin_unlock_irqrestore(&pidmap_lock, flags);
233 234
234 for (i = 0; i <= pid->level; i++) 235 for (i = 0; i <= pid->level; i++)
235 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); 236 free_pidmap(pid->numbers + i);
236 237
237 call_rcu(&pid->rcu, delayed_put_pid); 238 call_rcu(&pid->rcu, delayed_put_pid);
238} 239}
@@ -278,8 +279,8 @@ out:
278 return pid; 279 return pid;
279 280
280out_free: 281out_free:
281 for (i++; i <= ns->level; i++) 282 while (++i <= ns->level)
282 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); 283 free_pidmap(pid->numbers + i);
283 284
284 kmem_cache_free(ns->pid_cachep, pid); 285 kmem_cache_free(ns->pid_cachep, pid);
285 pid = NULL; 286 pid = NULL;
@@ -316,7 +317,7 @@ EXPORT_SYMBOL_GPL(find_pid);
316/* 317/*
317 * attach_pid() must be called with the tasklist_lock write-held. 318 * attach_pid() must be called with the tasklist_lock write-held.
318 */ 319 */
319int attach_pid(struct task_struct *task, enum pid_type type, 320void attach_pid(struct task_struct *task, enum pid_type type,
320 struct pid *pid) 321 struct pid *pid)
321{ 322{
322 struct pid_link *link; 323 struct pid_link *link;
@@ -324,11 +325,10 @@ int attach_pid(struct task_struct *task, enum pid_type type,
324 link = &task->pids[type]; 325 link = &task->pids[type];
325 link->pid = pid; 326 link->pid = pid;
326 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 327 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
327
328 return 0;
329} 328}
330 329
331void detach_pid(struct task_struct *task, enum pid_type type) 330static void __change_pid(struct task_struct *task, enum pid_type type,
331 struct pid *new)
332{ 332{
333 struct pid_link *link; 333 struct pid_link *link;
334 struct pid *pid; 334 struct pid *pid;
@@ -338,7 +338,7 @@ void detach_pid(struct task_struct *task, enum pid_type type)
338 pid = link->pid; 338 pid = link->pid;
339 339
340 hlist_del_rcu(&link->node); 340 hlist_del_rcu(&link->node);
341 link->pid = NULL; 341 link->pid = new;
342 342
343 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 343 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
344 if (!hlist_empty(&pid->tasks[tmp])) 344 if (!hlist_empty(&pid->tasks[tmp]))
@@ -347,13 +347,24 @@ void detach_pid(struct task_struct *task, enum pid_type type)
347 free_pid(pid); 347 free_pid(pid);
348} 348}
349 349
350void detach_pid(struct task_struct *task, enum pid_type type)
351{
352 __change_pid(task, type, NULL);
353}
354
355void change_pid(struct task_struct *task, enum pid_type type,
356 struct pid *pid)
357{
358 __change_pid(task, type, pid);
359 attach_pid(task, type, pid);
360}
361
350/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 362/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
351void transfer_pid(struct task_struct *old, struct task_struct *new, 363void transfer_pid(struct task_struct *old, struct task_struct *new,
352 enum pid_type type) 364 enum pid_type type)
353{ 365{
354 new->pids[type].pid = old->pids[type].pid; 366 new->pids[type].pid = old->pids[type].pid;
355 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); 367 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
356 old->pids[type].pid = NULL;
357} 368}
358 369
359struct task_struct *pid_task(struct pid *pid, enum pid_type type) 370struct task_struct *pid_task(struct pid *pid, enum pid_type type)
@@ -380,12 +391,6 @@ struct task_struct *find_task_by_pid_type_ns(int type, int nr,
380 391
381EXPORT_SYMBOL(find_task_by_pid_type_ns); 392EXPORT_SYMBOL(find_task_by_pid_type_ns);
382 393
383struct task_struct *find_task_by_pid(pid_t nr)
384{
385 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
386}
387EXPORT_SYMBOL(find_task_by_pid);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 394struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 395{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 396 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
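
The pid.c changes above fold detach_pid() into a more general __change_pid(task, type, new), with detach being the new == NULL case and change_pid() being "replace, then attach". A much-simplified userspace sketch of that shape; the struct layout and reference counting here are invented stand-ins, not the kernel's:

/* Sketch: detach expressed as change-to-NULL, mirroring the refactor. */
#include <stdio.h>
#include <stdlib.h>

struct pid  { int nr; int users; };
struct task { struct pid *pid; };

static struct pid *get_pid(struct pid *p)
{
    if (p)
        p->users++;
    return p;
}

static void put_pid(struct pid *p)
{
    if (p && --p->users == 0) {
        printf("freeing pid %d\n", p->nr);
        free(p);
    }
}

static void attach_pid(struct task *t, struct pid *p)
{
    t->pid = get_pid(p);
}

static void __change_pid(struct task *t, struct pid *new)
{
    struct pid *old = t->pid;

    t->pid = new;               /* new == NULL is the detach case */
    put_pid(old);
}

static void detach_pid(struct task *t)               { __change_pid(t, NULL); }
static void change_pid(struct task *t, struct pid *p)
{
    __change_pid(t, p);
    attach_pid(t, p);
}

int main(void)
{
    struct pid *a = calloc(1, sizeof(*a));
    struct pid *b = calloc(1, sizeof(*b));
    struct task t = { NULL };

    a->nr = 100;
    b->nr = 200;

    attach_pid(&t, a);          /* task holds pid 100 */
    change_pid(&t, b);          /* frees pid 100, task now holds pid 200 */
    detach_pid(&t);             /* frees pid 200 */
    return 0;
}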
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d792b66d854..98702b4b8851 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -66,7 +66,7 @@ err_alloc:
66 return NULL; 66 return NULL;
67} 67}
68 68
69static struct pid_namespace *create_pid_namespace(int level) 69static struct pid_namespace *create_pid_namespace(unsigned int level)
70{ 70{
71 struct pid_namespace *ns; 71 struct pid_namespace *ns;
72 int i; 72 int i;
@@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level)
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 93
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 94 for (i = 1; i < PIDMAP_ENTRIES; i++) {
95 ns->pidmap[i].page = 0; 95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 } 97 }
98 98
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ae5c6c147c4b..f1525ad06cb3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -4,8 +4,9 @@
4 4
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/posix-timers.h> 6#include <linux/posix-timers.h>
7#include <asm/uaccess.h>
8#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/math64.h>
9#include <asm/uaccess.h>
9 10
10static int check_clock(const clockid_t which_clock) 11static int check_clock(const clockid_t which_clock)
11{ 12{
@@ -47,12 +48,10 @@ static void sample_to_timespec(const clockid_t which_clock,
47 union cpu_time_count cpu, 48 union cpu_time_count cpu,
48 struct timespec *tp) 49 struct timespec *tp)
49{ 50{
50 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 51 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
51 tp->tv_sec = div_long_long_rem(cpu.sched, 52 *tp = ns_to_timespec(cpu.sched);
52 NSEC_PER_SEC, &tp->tv_nsec); 53 else
53 } else {
54 cputime_to_timespec(cpu.cpu, tp); 54 cputime_to_timespec(cpu.cpu, tp);
55 }
56} 55}
57 56
58static inline int cpu_time_before(const clockid_t which_clock, 57static inline int cpu_time_before(const clockid_t which_clock,
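
The posix-cpu-timers hunk above drops the open-coded div_long_long_rem() in favour of ns_to_timespec(). The conversion itself is just a 64-bit divide and remainder; a userspace equivalent for reference (the kernel helper additionally normalizes negative values):

/* Sketch: the nanoseconds -> { seconds, nanoseconds } split performed by
 * ns_to_timespec() in the hunk above. */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000LL

static struct timespec ns_to_ts(int64_t ns)
{
    struct timespec ts;

    ts.tv_sec  = ns / NSEC_PER_SEC;
    ts.tv_nsec = ns % NSEC_PER_SEC;
    return ts;
}

int main(void)
{
    struct timespec ts = ns_to_ts(1500000000LL);    /* 1.5 s */

    printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
    return 0;
}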
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 8476956ffd92..dbd8398ddb0b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -310,8 +310,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
310 310
311 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 311 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
312 struct task_struct *leader; 312 struct task_struct *leader;
313 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, 313 int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
314 timr->it_process);
315 314
316 if (likely(ret >= 0)) 315 if (likely(ret >= 0))
317 return ret; 316 return ret;
@@ -322,8 +321,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
322 timr->it_process = leader; 321 timr->it_process = leader;
323 } 322 }
324 323
325 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, 324 return send_sigqueue(timr->sigq, timr->it_process, 1);
326 timr->it_process);
327} 325}
328EXPORT_SYMBOL_GPL(posix_timer_event); 326EXPORT_SYMBOL_GPL(posix_timer_event);
329 327
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6233f3b4ae66..b45da40e8d25 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,16 +19,6 @@ config PM
19 will issue the hlt instruction if nothing is to be done, thereby 19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY
23 bool "Legacy Power Management API (DEPRECATED)"
24 depends on PM
25 default n
26 ---help---
27 Support for pm_register() and friends. This old API is obsoleted
28 by the driver model.
29
30 If unsure, say N.
31
32config PM_DEBUG 22config PM_DEBUG
33 bool "Power Management Debug Support" 23 bool "Power Management Debug Support"
34 depends on PM 24 depends on PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f7dfff28ecdb..597823b5b700 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o 6obj-y := main.o
7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_PM_SLEEP) += process.o console.o 7obj-$(CONFIG_PM_SLEEP) += process.o console.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
10 9
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
deleted file mode 100644
index 60c73fa670d5..000000000000
--- a/kernel/power/pm.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 * pm.c - Power management interface
3 *
4 * Copyright (C) 2000 Andrew Henroid
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20#include <linux/init.h>
21#include <linux/module.h>
22#include <linux/spinlock.h>
23#include <linux/mm.h>
24#include <linux/slab.h>
25#include <linux/pm.h>
26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h>
28#include <linux/mutex.h>
29
30/*
31 * Locking notes:
32 * pm_devs_lock can be a semaphore providing pm ops are not called
33 * from an interrupt handler (already a bad idea so no change here). Each
34 * change must be protected so that an unlink of an entry doesn't clash
35 * with a pm send - which is permitted to sleep in the current architecture
36 *
37 * Module unloads clashing with pm events now work out safely, the module
38 * unload path will block until the event has been sent. It may well block
39 * until a resume but that will be fine.
40 */
41
42static DEFINE_MUTEX(pm_devs_lock);
43static LIST_HEAD(pm_devs);
44
45/**
46 * pm_register - register a device with power management
47 * @type: device type
48 * @id: device ID
49 * @callback: callback function
50 *
51 * Add a device to the list of devices that wish to be notified about
52 * power management events. A &pm_dev structure is returned on success,
53 * on failure the return is %NULL.
54 *
55 * The callback function will be called in process context and
56 * it may sleep.
57 */
58
59struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id,
61 pm_callback callback)
62{
63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) {
65 dev->type = type;
66 dev->id = id;
67 dev->callback = callback;
68
69 mutex_lock(&pm_devs_lock);
70 list_add(&dev->entry, &pm_devs);
71 mutex_unlock(&pm_devs_lock);
72 }
73 return dev;
74}
75
76/**
77 * pm_send - send request to a single device
78 * @dev: device to send to
79 * @rqst: power management request
80 * @data: data for the callback
81 *
82 * Issue a power management request to a given device. The
83 * %PM_SUSPEND and %PM_RESUME events are handled specially. The
84 * data field must hold the intended next state. No call is made
85 * if the state matches.
86 *
87 * BUGS: what stops two power management requests occurring in parallel
88 * and conflicting.
89 *
90 * WARNING: Calling pm_send directly is not generally recommended, in
91 * particular there is no locking against the pm_dev going away. The
92 * caller must maintain all needed locking or have 'inside knowledge'
93 * on the safety. Also remember that this function is not locked against
94 * pm_unregister. This means that you must handle SMP races on callback
95 * execution and unload yourself.
96 */
97
98static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
99{
100 int status = 0;
101 unsigned long prev_state, next_state;
102
103 if (in_interrupt())
104 BUG();
105
106 switch (rqst) {
107 case PM_SUSPEND:
108 case PM_RESUME:
109 prev_state = dev->state;
110 next_state = (unsigned long) data;
111 if (prev_state != next_state) {
112 if (dev->callback)
113 status = (*dev->callback)(dev, rqst, data);
114 if (!status) {
115 dev->state = next_state;
116 dev->prev_state = prev_state;
117 }
118 }
119 else {
120 dev->prev_state = prev_state;
121 }
122 break;
123 default:
124 if (dev->callback)
125 status = (*dev->callback)(dev, rqst, data);
126 break;
127 }
128 return status;
129}
130
131/*
132 * Undo incomplete request
133 */
134static void pm_undo_all(struct pm_dev *last)
135{
136 struct list_head *entry = last->entry.prev;
137 while (entry != &pm_devs) {
138 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
139 if (dev->state != dev->prev_state) {
140 /* previous state was zero (running) resume or
141 * previous state was non-zero (suspended) suspend
142 */
143 pm_request_t undo = (dev->prev_state
144 ? PM_SUSPEND:PM_RESUME);
145 pm_send(dev, undo, (void*) dev->prev_state);
146 }
147 entry = entry->prev;
148 }
149}
150
151/**
152 * pm_send_all - send request to all managed devices
153 * @rqst: power management request
154 * @data: data for the callback
155 *
156 * Issue a power management request to a all devices. The
157 * %PM_SUSPEND events are handled specially. Any device is
158 * permitted to fail a suspend by returning a non zero (error)
159 * value from its callback function. If any device vetoes a
160 * suspend request then all other devices that have suspended
161 * during the processing of this request are restored to their
162 * previous state.
163 *
164 * WARNING: This function takes the pm_devs_lock. The lock is not dropped until
165 * the callbacks have completed. This prevents races against pm locking
166 * functions, races against module unload pm_unregister code. It does
167 * mean however that you must not issue pm_ functions within the callback
168 * or you will deadlock and users will hate you.
169 *
170 * Zero is returned on success. If a suspend fails then the status
171 * from the device that vetoes the suspend is returned.
172 *
173 * BUGS: what stops two power management requests occurring in parallel
174 * and conflicting.
175 */
176
177int pm_send_all(pm_request_t rqst, void *data)
178{
179 struct list_head *entry;
180
181 mutex_lock(&pm_devs_lock);
182 entry = pm_devs.next;
183 while (entry != &pm_devs) {
184 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
185 if (dev->callback) {
186 int status = pm_send(dev, rqst, data);
187 if (status) {
188 /* return devices to previous state on
189 * failed suspend request
190 */
191 if (rqst == PM_SUSPEND)
192 pm_undo_all(dev);
193 mutex_unlock(&pm_devs_lock);
194 return status;
195 }
196 }
197 entry = entry->next;
198 }
199 mutex_unlock(&pm_devs_lock);
200 return 0;
201}
202
203EXPORT_SYMBOL(pm_register);
204EXPORT_SYMBOL(pm_send_all);
205
diff --git a/kernel/printk.c b/kernel/printk.c
index bdd4ea8c3f2b..8fb01c32aa3b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -111,6 +111,9 @@ struct console_cmdline
111 char name[8]; /* Name of the driver */ 111 char name[8]; /* Name of the driver */
112 int index; /* Minor dev. to use */ 112 int index; /* Minor dev. to use */
113 char *options; /* Options for the driver */ 113 char *options; /* Options for the driver */
114#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
115 char *brl_options; /* Options for braille driver */
116#endif
114}; 117};
115 118
116#define MAX_CMDLINECONSOLES 8 119#define MAX_CMDLINECONSOLES 8
@@ -808,15 +811,60 @@ static void call_console_drivers(unsigned start, unsigned end)
808 811
809#endif 812#endif
810 813
814static int __add_preferred_console(char *name, int idx, char *options,
815 char *brl_options)
816{
817 struct console_cmdline *c;
818 int i;
819
820 /*
821 * See if this tty is not yet registered, and
822 * if we have a slot free.
823 */
824 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
825 if (strcmp(console_cmdline[i].name, name) == 0 &&
826 console_cmdline[i].index == idx) {
827 if (!brl_options)
828 selected_console = i;
829 return 0;
830 }
831 if (i == MAX_CMDLINECONSOLES)
832 return -E2BIG;
833 if (!brl_options)
834 selected_console = i;
835 c = &console_cmdline[i];
836 strlcpy(c->name, name, sizeof(c->name));
837 c->options = options;
838#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
839 c->brl_options = brl_options;
840#endif
841 c->index = idx;
842 return 0;
843}
811/* 844/*
812 * Set up a list of consoles. Called from init/main.c 845 * Set up a list of consoles. Called from init/main.c
813 */ 846 */
814static int __init console_setup(char *str) 847static int __init console_setup(char *str)
815{ 848{
816 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ 849 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
817 char *s, *options; 850 char *s, *options, *brl_options = NULL;
818 int idx; 851 int idx;
819 852
853#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
854 if (!memcmp(str, "brl,", 4)) {
855 brl_options = "";
856 str += 4;
857 } else if (!memcmp(str, "brl=", 4)) {
858 brl_options = str + 4;
859 str = strchr(brl_options, ',');
860 if (!str) {
861 printk(KERN_ERR "need port name after brl=\n");
862 return 1;
863 }
864 *(str++) = 0;
865 }
866#endif
867
820 /* 868 /*
821 * Decode str into name, index, options. 869 * Decode str into name, index, options.
822 */ 870 */
@@ -841,7 +889,7 @@ static int __init console_setup(char *str)
841 idx = simple_strtoul(s, NULL, 10); 889 idx = simple_strtoul(s, NULL, 10);
842 *s = 0; 890 *s = 0;
843 891
844 add_preferred_console(buf, idx, options); 892 __add_preferred_console(buf, idx, options, brl_options);
845 return 1; 893 return 1;
846} 894}
847__setup("console=", console_setup); 895__setup("console=", console_setup);
@@ -861,28 +909,7 @@ __setup("console=", console_setup);
861 */ 909 */
862int add_preferred_console(char *name, int idx, char *options) 910int add_preferred_console(char *name, int idx, char *options)
863{ 911{
864 struct console_cmdline *c; 912 return __add_preferred_console(name, idx, options, NULL);
865 int i;
866
867 /*
868 * See if this tty is not yet registered, and
869 * if we have a slot free.
870 */
871 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
872 if (strcmp(console_cmdline[i].name, name) == 0 &&
873 console_cmdline[i].index == idx) {
874 selected_console = i;
875 return 0;
876 }
877 if (i == MAX_CMDLINECONSOLES)
878 return -E2BIG;
879 selected_console = i;
880 c = &console_cmdline[i];
881 memcpy(c->name, name, sizeof(c->name));
882 c->name[sizeof(c->name) - 1] = 0;
883 c->options = options;
884 c->index = idx;
885 return 0;
886} 913}
887 914
888int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) 915int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
@@ -894,7 +921,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
894 if (strcmp(console_cmdline[i].name, name) == 0 && 921 if (strcmp(console_cmdline[i].name, name) == 0 &&
895 console_cmdline[i].index == idx) { 922 console_cmdline[i].index == idx) {
896 c = &console_cmdline[i]; 923 c = &console_cmdline[i];
897 memcpy(c->name, name_new, sizeof(c->name)); 924 strlcpy(c->name, name_new, sizeof(c->name));
898 c->name[sizeof(c->name) - 1] = 0; 925 c->name[sizeof(c->name) - 1] = 0;
899 c->options = options; 926 c->options = options;
900 c->index = idx_new; 927 c->index = idx_new;
@@ -1163,6 +1190,16 @@ void register_console(struct console *console)
1163 continue; 1190 continue;
1164 if (console->index < 0) 1191 if (console->index < 0)
1165 console->index = console_cmdline[i].index; 1192 console->index = console_cmdline[i].index;
1193#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1194 if (console_cmdline[i].brl_options) {
1195 console->flags |= CON_BRL;
1196 braille_register_console(console,
1197 console_cmdline[i].index,
1198 console_cmdline[i].options,
1199 console_cmdline[i].brl_options);
1200 return;
1201 }
1202#endif
1166 if (console->setup && 1203 if (console->setup &&
1167 console->setup(console, console_cmdline[i].options) != 0) 1204 console->setup(console, console_cmdline[i].options) != 0)
1168 break; 1205 break;
@@ -1221,6 +1258,11 @@ int unregister_console(struct console *console)
1221 struct console *a, *b; 1258 struct console *a, *b;
1222 int res = 1; 1259 int res = 1;
1223 1260
1261#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1262 if (console->flags & CON_BRL)
1263 return braille_unregister_console(console);
1264#endif
1265
1224 acquire_console_sem(); 1266 acquire_console_sem();
1225 if (console_drivers == console) { 1267 if (console_drivers == console) {
1226 console_drivers=console->next; 1268 console_drivers=console->next;
@@ -1272,8 +1314,8 @@ late_initcall(disable_boot_consoles);
1272 */ 1314 */
1273void tty_write_message(struct tty_struct *tty, char *msg) 1315void tty_write_message(struct tty_struct *tty, char *msg)
1274{ 1316{
1275 if (tty && tty->driver->write) 1317 if (tty && tty->ops->write)
1276 tty->driver->write(tty, msg, strlen(msg)); 1318 tty->ops->write(tty, msg, strlen(msg));
1277 return; 1319 return;
1278} 1320}
1279 1321
@@ -1287,31 +1329,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1287 */ 1329 */
1288int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1330int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1289{ 1331{
1290 static DEFINE_SPINLOCK(ratelimit_lock); 1332 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1291 static unsigned toks = 10 * 5 * HZ;
1292 static unsigned long last_msg;
1293 static int missed;
1294 unsigned long flags;
1295 unsigned long now = jiffies;
1296
1297 spin_lock_irqsave(&ratelimit_lock, flags);
1298 toks += now - last_msg;
1299 last_msg = now;
1300 if (toks > (ratelimit_burst * ratelimit_jiffies))
1301 toks = ratelimit_burst * ratelimit_jiffies;
1302 if (toks >= ratelimit_jiffies) {
1303 int lost = missed;
1304
1305 missed = 0;
1306 toks -= ratelimit_jiffies;
1307 spin_unlock_irqrestore(&ratelimit_lock, flags);
1308 if (lost)
1309 printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
1310 return 1;
1311 }
1312 missed++;
1313 spin_unlock_irqrestore(&ratelimit_lock, flags);
1314 return 0;
1315} 1333}
1316EXPORT_SYMBOL(__printk_ratelimit); 1334EXPORT_SYMBOL(__printk_ratelimit);
1317 1335
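
The final printk.c hunk replaces the open-coded token bucket in __printk_ratelimit() with a call to a shared __ratelimit() helper. The removed logic is a classic token-bucket rate limiter; a standalone userspace sketch of the same idea, using seconds in place of jiffies and with no locking:

/* Sketch: token-bucket rate limiting as __printk_ratelimit() used to
 * open-code (and now delegates).  Not the kernel API. */
#include <stdio.h>
#include <time.h>

static int ratelimit(long interval, long burst)
{
    static long toks;           /* accumulated credit, in seconds */
    static time_t last;
    static int missed, init;
    time_t now = time(NULL);

    if (!init) {                /* start with a full bucket */
        toks = burst * interval;
        last = now;
        init = 1;
    }

    toks += now - last;
    last = now;
    if (toks > burst * interval)
        toks = burst * interval;        /* cap the bucket */

    if (toks >= interval) {
        toks -= interval;
        if (missed)
            printf("%d messages suppressed\n", missed);
        missed = 0;
        return 1;                       /* caller may print */
    }
    missed++;
    return 0;                           /* suppressed */
}

int main(void)
{
    for (int i = 0; i < 10; i++)
        if (ratelimit(5, 2))            /* at most a burst of 2 per 5 s */
            printf("message %d\n", i);
    return 0;
}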
diff --git a/kernel/profile.c b/kernel/profile.c
index 606d7387265c..ae7ead82cbc9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -587,10 +587,10 @@ static int __init create_proc_profile(void)
587 return 0; 587 return 0;
588 if (create_hash_tables()) 588 if (create_hash_tables())
589 return -1; 589 return -1;
590 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); 590 entry = proc_create("profile", S_IWUSR | S_IRUGO,
591 NULL, &proc_profile_operations);
591 if (!entry) 592 if (!entry)
592 return 0; 593 return 0;
593 entry->proc_fops = &proc_profile_operations;
594 entry->size = (1+prof_len) * sizeof(atomic_t); 594 entry->size = (1+prof_len) * sizeof(atomic_t);
595 hotcpu_notifier(profile_cpu_callback, 0); 595 hotcpu_notifier(profile_cpu_callback, 0);
596 return 0; 596 return 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 67e392ed5496..6c19e94fd0a5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -73,7 +73,7 @@ void __ptrace_unlink(struct task_struct *child)
73 BUG_ON(!child->ptrace); 73 BUG_ON(!child->ptrace);
74 74
75 child->ptrace = 0; 75 child->ptrace = 0;
76 if (!list_empty(&child->ptrace_list)) { 76 if (ptrace_reparented(child)) {
77 list_del_init(&child->ptrace_list); 77 list_del_init(&child->ptrace_list);
78 remove_parent(child); 78 remove_parent(child);
79 child->parent = child->real_parent; 79 child->parent = child->real_parent;
@@ -168,8 +168,6 @@ int ptrace_attach(struct task_struct *task)
168 audit_ptrace(task); 168 audit_ptrace(task);
169 169
170 retval = -EPERM; 170 retval = -EPERM;
171 if (task->pid <= 1)
172 goto out;
173 if (same_thread_group(task, current)) 171 if (same_thread_group(task, current))
174 goto out; 172 goto out;
175 173
@@ -208,8 +206,7 @@ repeat:
208 206
209 __ptrace_link(task, current); 207 __ptrace_link(task, current);
210 208
211 force_sig_specific(SIGSTOP, task); 209 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
212
213bad: 210bad:
214 write_unlock_irqrestore(&tasklist_lock, flags); 211 write_unlock_irqrestore(&tasklist_lock, flags);
215 task_unlock(task); 212 task_unlock(task);
@@ -522,12 +519,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
522{ 519{
523 struct task_struct *child; 520 struct task_struct *child;
524 521
525 /*
526 * Tracing init is not allowed.
527 */
528 if (pid == 1)
529 return ERR_PTR(-EPERM);
530
531 read_lock(&tasklist_lock); 522 read_lock(&tasklist_lock);
532 child = find_task_by_vpid(pid); 523 child = find_task_by_vpid(pid);
533 if (child) 524 if (child)
@@ -543,7 +534,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
543#define arch_ptrace_attach(child) do { } while (0) 534#define arch_ptrace_attach(child) do { } while (0)
544#endif 535#endif
545 536
546#ifndef __ARCH_SYS_PTRACE
547asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 537asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
548{ 538{
549 struct task_struct *child; 539 struct task_struct *child;
@@ -591,7 +581,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
591 unlock_kernel(); 581 unlock_kernel();
592 return ret; 582 return ret;
593} 583}
594#endif /* __ARCH_SYS_PTRACE */
595 584
596int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 585int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
597{ 586{
@@ -612,7 +601,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
612 return (copied == sizeof(data)) ? 0 : -EIO; 601 return (copied == sizeof(data)) ? 0 : -EIO;
613} 602}
614 603
615#ifdef CONFIG_COMPAT 604#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE
616#include <linux/compat.h> 605#include <linux/compat.h>
617 606
618int compat_ptrace_request(struct task_struct *child, compat_long_t request, 607int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -667,7 +656,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
667 return ret; 656 return ret;
668} 657}
669 658
670#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
671asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, 659asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
672 compat_long_t addr, compat_long_t data) 660 compat_long_t addr, compat_long_t data)
673{ 661{
@@ -710,6 +698,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
710 unlock_kernel(); 698 unlock_kernel();
711 return ret; 699 return ret;
712} 700}
713#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ 701#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */
714
715#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 47894f919d4e..33acc424667e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -45,6 +45,7 @@
45#include <linux/byteorder/swabb.h> 45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h>
48 49
49MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/relay.c b/kernel/relay.c
index d6204a485818..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = {
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
67 67
68/*
69 * allocate an array of pointers of struct page
70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{
73 struct page **array;
74 size_t pa_size = n_pages * sizeof(struct page *);
75
76 if (pa_size > PAGE_SIZE) {
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84}
85
86/*
87 * free an array of pointers of struct page
88 */
89static void relay_free_page_array(struct page **array)
90{
91 if (is_vmalloc_addr(array))
92 vfree(array);
93 else
94 kfree(array);
95}
96
68/** 97/**
69 * relay_mmap_buf: - mmap channel buffer to process address space 98 * relay_mmap_buf: - mmap channel buffer to process address space
70 * @buf: relay channel buffer 99 * @buf: relay channel buffer
@@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
109 *size = PAGE_ALIGN(*size); 138 *size = PAGE_ALIGN(*size);
110 n_pages = *size >> PAGE_SHIFT; 139 n_pages = *size >> PAGE_SHIFT;
111 140
112 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); 141 buf->page_array = relay_alloc_page_array(n_pages);
113 if (!buf->page_array) 142 if (!buf->page_array)
114 return NULL; 143 return NULL;
115 144
@@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
130depopulate: 159depopulate:
131 for (j = 0; j < i; j++) 160 for (j = 0; j < i; j++)
132 __free_page(buf->page_array[j]); 161 __free_page(buf->page_array[j]);
133 kfree(buf->page_array); 162 relay_free_page_array(buf->page_array);
134 return NULL; 163 return NULL;
135} 164}
136 165
@@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
189 vunmap(buf->start); 218 vunmap(buf->start);
190 for (i = 0; i < buf->page_count; i++) 219 for (i = 0; i < buf->page_count; i++)
191 __free_page(buf->page_array[i]); 220 __free_page(buf->page_array[i]);
192 kfree(buf->page_array); 221 relay_free_page_array(buf->page_array);
193 } 222 }
194 chan->buf[buf->cpu] = NULL; 223 chan->buf[buf->cpu] = NULL;
195 kfree(buf->padding); 224 kfree(buf->padding);
@@ -1162,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
1162 ret = 0; 1191 ret = 0;
1163 spliced = 0; 1192 spliced = 0;
1164 1193
1165 while (len) { 1194 while (len && !spliced) {
1166 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1167 if (ret < 0) 1196 if (ret < 0)
1168 break; 1197 break;
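
relay_alloc_page_array() above switches to vmalloc() once the page-pointer array would exceed a page, and relay_free_page_array() picks the matching deallocator via is_vmalloc_addr(). Userspace has no such split, so this sketch only mimics the shape of the decision: choose a path by size at allocation time and make the free side honour it; the threshold and tagging are purely illustrative:

/* Sketch: pick an allocation path by size, free via the matching path. */
#include <stdio.h>
#include <stdlib.h>

#define SMALL_LIMIT 4096        /* stand-in for PAGE_SIZE */

struct tagged_buf {
    int big;                    /* which path produced the buffer */
    void *mem;
};

static struct tagged_buf alloc_array(size_t n, size_t elem)
{
    struct tagged_buf b = { 0, NULL };

    b.big = n * elem > SMALL_LIMIT;
    /* Both paths use calloc here; the tag only records which branch the
     * kernel code would have taken (vmalloc vs kzalloc). */
    b.mem = calloc(n, elem);
    return b;
}

static void free_array(struct tagged_buf *b)
{
    printf("freeing via %s path\n", b->big ? "vmalloc" : "kmalloc");
    free(b->mem);
    b->mem = NULL;
}

int main(void)
{
    struct tagged_buf small = alloc_array(16, sizeof(void *));
    struct tagged_buf big   = alloc_array(100000, sizeof(void *));

    free_array(&small);         /* "kmalloc" path */
    free_array(&big);           /* "vmalloc" path */
    return 0;
}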
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index efbfc0fc232f..d3c61b4ebef2 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,6 +10,7 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
13#include <linux/res_counter.h> 14#include <linux/res_counter.h>
14#include <linux/uaccess.h> 15#include <linux/uaccess.h>
15 16
@@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
27 } 28 }
28 29
29 counter->usage += val; 30 counter->usage += val;
31 if (counter->usage > counter->max_usage)
32 counter->max_usage = counter->usage;
30 return 0; 33 return 0;
31} 34}
32 35
@@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member)
65 switch (member) { 68 switch (member) {
66 case RES_USAGE: 69 case RES_USAGE:
67 return &counter->usage; 70 return &counter->usage;
71 case RES_MAX_USAGE:
72 return &counter->max_usage;
68 case RES_LIMIT: 73 case RES_LIMIT:
69 return &counter->limit; 74 return &counter->limit;
70 case RES_FAILCNT: 75 case RES_FAILCNT:
@@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
92 pos, buf, s - buf); 97 pos, buf, s - buf);
93} 98}
94 99
100u64 res_counter_read_u64(struct res_counter *counter, int member)
101{
102 return *res_counter_member(counter, member);
103}
104
95ssize_t res_counter_write(struct res_counter *counter, int member, 105ssize_t res_counter_write(struct res_counter *counter, int member,
96 const char __user *userbuf, size_t nbytes, loff_t *pos, 106 const char __user *userbuf, size_t nbytes, loff_t *pos,
97 int (*write_strategy)(char *st_buf, unsigned long long *val)) 107 int (*write_strategy)(char *st_buf, unsigned long long *val))
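
The res_counter hunks add a max_usage high-water mark, updated in res_counter_charge_locked(), plus res_counter_read_u64() to expose it. A hedged userspace sketch of a counter with usage, peak, limit and failure count; no locking here, whereas the kernel version runs under counter->lock:

/* Sketch: a resource counter with limit, usage, high-water mark and
 * failure count, mirroring the fields touched above. */
#include <stdio.h>

struct res_counter {
    unsigned long usage;
    unsigned long max_usage;    /* high-water mark added by the patch */
    unsigned long limit;
    unsigned long failcnt;
};

static int res_counter_charge(struct res_counter *c, unsigned long val)
{
    if (c->usage + val > c->limit) {
        c->failcnt++;
        return -1;              /* the kernel returns -ENOMEM here */
    }
    c->usage += val;
    if (c->usage > c->max_usage)
        c->max_usage = c->usage;
    return 0;
}

static void res_counter_uncharge(struct res_counter *c, unsigned long val)
{
    c->usage -= val;            /* max_usage deliberately keeps the peak */
}

int main(void)
{
    struct res_counter c = { .limit = 100 };

    res_counter_charge(&c, 60);
    res_counter_charge(&c, 30);
    res_counter_uncharge(&c, 50);
    res_counter_charge(&c, 70);          /* fails: would exceed the limit */

    printf("usage=%lu max=%lu failcnt=%lu\n",
           c.usage, c.max_usage, c.failcnt);   /* usage=40 max=90 failcnt=1 */
    return 0;
}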
diff --git a/kernel/resource.c b/kernel/resource.c
index cee12cc47cab..74af2d7cb5a1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = {
131 131
132static int __init ioresources_init(void) 132static int __init ioresources_init(void)
133{ 133{
134 struct proc_dir_entry *entry; 134 proc_create("ioports", 0, NULL, &proc_ioports_operations);
135 135 proc_create("iomem", 0, NULL, &proc_iomem_operations);
136 entry = create_proc_entry("ioports", 0, NULL);
137 if (entry)
138 entry->proc_fops = &proc_ioports_operations;
139 entry = create_proc_entry("iomem", 0, NULL);
140 if (entry)
141 entry->proc_fops = &proc_iomem_operations;
142 return 0; 136 return 0;
143} 137}
144__initcall(ioresources_init); 138__initcall(ioresources_init);
diff --git a/kernel/sched.c b/kernel/sched.c
index 740fb409e5bb..58fb8af15776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77/* 77/*
78 * Scheduler clock - returns current time in nanosec units.
79 * This is default implementation.
80 * Architectures and sub-architectures can override this.
81 */
82unsigned long long __attribute__((weak)) sched_clock(void)
83{
84 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
85}
86
87/*
88 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
89 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
90 * and back. 80 * and back.
@@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
242} 232}
243#endif 233#endif
244 234
235/*
236 * sched_domains_mutex serializes calls to arch_init_sched_domains,
237 * detach_destroy_domains and partition_sched_domains.
238 */
239static DEFINE_MUTEX(sched_domains_mutex);
240
245#ifdef CONFIG_GROUP_SCHED 241#ifdef CONFIG_GROUP_SCHED
246 242
247#include <linux/cgroup.h> 243#include <linux/cgroup.h>
@@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
308 */ 304 */
309static DEFINE_SPINLOCK(task_group_lock); 305static DEFINE_SPINLOCK(task_group_lock);
310 306
311/* doms_cur_mutex serializes access to doms_cur[] array */
312static DEFINE_MUTEX(doms_cur_mutex);
313
314#ifdef CONFIG_FAIR_GROUP_SCHED 307#ifdef CONFIG_FAIR_GROUP_SCHED
315#ifdef CONFIG_USER_SCHED 308#ifdef CONFIG_USER_SCHED
316# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -318,7 +311,13 @@ static DEFINE_MUTEX(doms_cur_mutex);
318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
319#endif 312#endif
320 313
314/*
315 * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
316 * (The default weight is 1024 - so there's no practical
317 * limitation from this.)
318 */
321#define MIN_SHARES 2 319#define MIN_SHARES 2
320#define MAX_SHARES (ULONG_MAX - 1)
322 321
323static int init_task_group_load = INIT_TASK_GROUP_LOAD; 322static int init_task_group_load = INIT_TASK_GROUP_LOAD;
324#endif 323#endif
@@ -358,21 +357,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
358#endif 357#endif
359} 358}
360 359
361static inline void lock_doms_cur(void)
362{
363 mutex_lock(&doms_cur_mutex);
364}
365
366static inline void unlock_doms_cur(void)
367{
368 mutex_unlock(&doms_cur_mutex);
369}
370
371#else 360#else
372 361
373static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 362static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
374static inline void lock_doms_cur(void) { }
375static inline void unlock_doms_cur(void) { }
376 363
377#endif /* CONFIG_GROUP_SCHED */ 364#endif /* CONFIG_GROUP_SCHED */
378 365
@@ -560,13 +547,7 @@ struct rq {
560 unsigned long next_balance; 547 unsigned long next_balance;
561 struct mm_struct *prev_mm; 548 struct mm_struct *prev_mm;
562 549
563 u64 clock, prev_clock_raw; 550 u64 clock;
564 s64 clock_max_delta;
565
566 unsigned int clock_warps, clock_overflows, clock_underflows;
567 u64 idle_clock;
568 unsigned int clock_deep_idle_events;
569 u64 tick_timestamp;
570 551
571 atomic_t nr_iowait; 552 atomic_t nr_iowait;
572 553
@@ -631,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
631#endif 612#endif
632} 613}
633 614
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
660/*
661 * Update the per-runqueue clock, as finegrained as the platform can give
662 * us, but without assuming monotonicity, etc.:
663 */
664static void __update_rq_clock(struct rq *rq)
665{
666 u64 prev_raw = rq->prev_clock_raw;
667 u64 now = sched_clock();
668 s64 delta = now - prev_raw;
669 u64 clock = rq->clock;
670
671#ifdef CONFIG_SCHED_DEBUG
672 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
673#endif
674 /*
675 * Protect against sched_clock() occasionally going backwards:
676 */
677 if (unlikely(delta < 0)) {
678 clock++;
679 rq->clock_warps++;
680 } else {
681 /*
682 * Catch too large forward jumps too:
683 */
684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
685 u64 max_time = rq->tick_timestamp + max_jump;
686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
690 else
691 clock++;
692 rq->clock_overflows++;
693 } else {
694 if (unlikely(delta > rq->clock_max_delta))
695 rq->clock_max_delta = delta;
696 clock += delta;
697 }
698 }
699
700 rq->prev_clock_raw = now;
701 rq->clock = clock;
702}
703
704static void update_rq_clock(struct rq *rq)
705{
706 if (likely(smp_processor_id() == cpu_of(rq)))
707 __update_rq_clock(rq);
708}
709
710/* 615/*
711 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 616 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
712 * See detach_destroy_domains: synchronize_sched for details. 617 * See detach_destroy_domains: synchronize_sched for details.
@@ -722,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
722#define task_rq(p) cpu_rq(task_cpu(p)) 627#define task_rq(p) cpu_rq(task_cpu(p))
723#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 628#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
724 629
630static inline void update_rq_clock(struct rq *rq)
631{
632 rq->clock = sched_clock_cpu(cpu_of(rq));
633}
634
725/* 635/*
726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 636 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
727 */ 637 */
@@ -757,14 +667,14 @@ const_debug unsigned int sysctl_sched_features =
757#define SCHED_FEAT(name, enabled) \ 667#define SCHED_FEAT(name, enabled) \
758 #name , 668 #name ,
759 669
760__read_mostly char *sched_feat_names[] = { 670static __read_mostly char *sched_feat_names[] = {
761#include "sched_features.h" 671#include "sched_features.h"
762 NULL 672 NULL
763}; 673};
764 674
765#undef SCHED_FEAT 675#undef SCHED_FEAT
766 676
767int sched_feat_open(struct inode *inode, struct file *filp) 677static int sched_feat_open(struct inode *inode, struct file *filp)
768{ 678{
769 filp->private_data = inode->i_private; 679 filp->private_data = inode->i_private;
770 return 0; 680 return 0;
@@ -899,7 +809,7 @@ static inline u64 global_rt_runtime(void)
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 809 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900} 810}
901 811
902static const unsigned long long time_sync_thresh = 100000; 812unsigned long long time_sync_thresh = 100000;
903 813
904static DEFINE_PER_CPU(unsigned long long, time_offset); 814static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); 815static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
@@ -913,11 +823,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
913static DEFINE_SPINLOCK(time_sync_lock); 823static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time; 824static unsigned long long prev_global_time;
915 825
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) 826static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
917{ 827{
918 unsigned long flags; 828 /*
919 829 * We want this inlined, to not get tracer function calls
920 spin_lock_irqsave(&time_sync_lock, flags); 830 * in this critical section:
831 */
832 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
833 __raw_spin_lock(&time_sync_lock.raw_lock);
921 834
922 if (time < prev_global_time) { 835 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time; 836 per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -926,7 +839,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
926 prev_global_time = time; 839 prev_global_time = time;
927 } 840 }
928 841
929 spin_unlock_irqrestore(&time_sync_lock, flags); 842 __raw_spin_unlock(&time_sync_lock.raw_lock);
843 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
930 844
931 return time; 845 return time;
932} 846}
@@ -934,8 +848,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
934static unsigned long long __cpu_clock(int cpu) 848static unsigned long long __cpu_clock(int cpu)
935{ 849{
936 unsigned long long now; 850 unsigned long long now;
937 unsigned long flags;
938 struct rq *rq;
939 851
940 /* 852 /*
941 * Only call sched_clock() if the scheduler has already been 853 * Only call sched_clock() if the scheduler has already been
@@ -944,11 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
944 if (unlikely(!scheduler_running)) 856 if (unlikely(!scheduler_running))
945 return 0; 857 return 0;
946 858
947 local_irq_save(flags); 859 now = sched_clock_cpu(cpu);
948 rq = cpu_rq(cpu);
949 update_rq_clock(rq);
950 now = rq->clock;
951 local_irq_restore(flags);
952 860
953 return now; 861 return now;
954} 862}
@@ -960,13 +868,18 @@ static unsigned long long __cpu_clock(int cpu)
960unsigned long long cpu_clock(int cpu) 868unsigned long long cpu_clock(int cpu)
961{ 869{
962 unsigned long long prev_cpu_time, time, delta_time; 870 unsigned long long prev_cpu_time, time, delta_time;
871 unsigned long flags;
963 872
873 local_irq_save(flags);
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu); 874 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); 875 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time; 876 delta_time = time-prev_cpu_time;
967 877
968 if (unlikely(delta_time > time_sync_thresh)) 878 if (unlikely(delta_time > time_sync_thresh)) {
969 time = __sync_cpu_clock(time, cpu); 879 time = __sync_cpu_clock(time, cpu);
880 per_cpu(prev_cpu_time, cpu) = time;
881 }
882 local_irq_restore(flags);
970 883
971 return time; 884 return time;
972} 885}
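
The two hunks above rework cpu_clock(): the whole read-adjust-sync sequence now runs with interrupts disabled, and a successful sync also refreshes prev_cpu_time so the time_sync_thresh test is measured against the last synced value. The bookkeeping in __sync_cpu_clock() is easy to lose in the diff, so here is a minimal userspace sketch of just that arithmetic; prev_global_time, cpu_offset and the numbers are stand-ins for the kernel's per-cpu variables, and the global lock is left out.

#include <stdio.h>

/* Last clock value any CPU has reported; the kernel guards this with
 * time_sync_lock, which is omitted here. */
static unsigned long long prev_global_time = 1000000;

/* cpu_offset stands in for per_cpu(time_offset, cpu). */
static unsigned long long sync_cpu_clock(unsigned long long time,
                                         unsigned long long *cpu_offset)
{
        if (time < prev_global_time)
                *cpu_offset += prev_global_time - time; /* catch this CPU up for next time */
        else
                prev_global_time = time;                /* this CPU now defines "latest"   */

        return time;                                    /* the current reading is unchanged */
}

int main(void)
{
        unsigned long long offset = 0;

        /* This CPU's view is 50us behind the last globally reported value: */
        sync_cpu_clock(950000, &offset);
        printf("time_offset is now %llu ns\n", offset);  /* prints 50000 */
        return 0;
}

Note that the value handed back is not adjusted; only later readings on that CPU see the enlarged offset.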
@@ -1117,43 +1030,6 @@ static struct rq *this_rq_lock(void)
1117 return rq; 1030 return rq;
1118} 1031}
1119 1032
1120/*
1121 * We are going deep-idle (irqs are disabled):
1122 */
1123void sched_clock_idle_sleep_event(void)
1124{
1125 struct rq *rq = cpu_rq(smp_processor_id());
1126
1127 spin_lock(&rq->lock);
1128 __update_rq_clock(rq);
1129 spin_unlock(&rq->lock);
1130 rq->clock_deep_idle_events++;
1131}
1132EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
1133
1134/*
1135 * We just idled delta nanoseconds (called with irqs disabled):
1136 */
1137void sched_clock_idle_wakeup_event(u64 delta_ns)
1138{
1139 struct rq *rq = cpu_rq(smp_processor_id());
1140 u64 now = sched_clock();
1141
1142 rq->idle_clock += delta_ns;
1143 /*
1144 * Override the previous timestamp and ignore all
1145 * sched_clock() deltas that occured while we idled,
1146 * and use the PM-provided delta_ns to advance the
1147 * rq clock:
1148 */
1149 spin_lock(&rq->lock);
1150 rq->prev_clock_raw = now;
1151 rq->clock += delta_ns;
1152 spin_unlock(&rq->lock);
1153 touch_softlockup_watchdog();
1154}
1155EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
1156
1157static void __resched_task(struct task_struct *p, int tif_bit); 1033static void __resched_task(struct task_struct *p, int tif_bit);
1158 1034
1159static inline void resched_task(struct task_struct *p) 1035static inline void resched_task(struct task_struct *p)
@@ -1189,6 +1065,7 @@ static inline void resched_rq(struct rq *rq)
1189enum { 1065enum {
1190 HRTICK_SET, /* re-program hrtick_timer */ 1066 HRTICK_SET, /* re-program hrtick_timer */
1191 HRTICK_RESET, /* not a new slice */ 1067 HRTICK_RESET, /* not a new slice */
1068 HRTICK_BLOCK, /* stop hrtick operations */
1192}; 1069};
1193 1070
1194/* 1071/*
@@ -1200,6 +1077,8 @@ static inline int hrtick_enabled(struct rq *rq)
1200{ 1077{
1201 if (!sched_feat(HRTICK)) 1078 if (!sched_feat(HRTICK))
1202 return 0; 1079 return 0;
1080 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
1081 return 0;
1203 return hrtimer_is_hres_active(&rq->hrtick_timer); 1082 return hrtimer_is_hres_active(&rq->hrtick_timer);
1204} 1083}
1205 1084
@@ -1275,14 +1154,70 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1275 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1154 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1276 1155
1277 spin_lock(&rq->lock); 1156 spin_lock(&rq->lock);
1278 __update_rq_clock(rq); 1157 update_rq_clock(rq);
1279 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1158 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1280 spin_unlock(&rq->lock); 1159 spin_unlock(&rq->lock);
1281 1160
1282 return HRTIMER_NORESTART; 1161 return HRTIMER_NORESTART;
1283} 1162}
1284 1163
1285static inline void init_rq_hrtick(struct rq *rq) 1164static void hotplug_hrtick_disable(int cpu)
1165{
1166 struct rq *rq = cpu_rq(cpu);
1167 unsigned long flags;
1168
1169 spin_lock_irqsave(&rq->lock, flags);
1170 rq->hrtick_flags = 0;
1171 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1172 spin_unlock_irqrestore(&rq->lock, flags);
1173
1174 hrtick_clear(rq);
1175}
1176
1177static void hotplug_hrtick_enable(int cpu)
1178{
1179 struct rq *rq = cpu_rq(cpu);
1180 unsigned long flags;
1181
1182 spin_lock_irqsave(&rq->lock, flags);
1183 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1184 spin_unlock_irqrestore(&rq->lock, flags);
1185}
1186
1187static int
1188hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1189{
1190 int cpu = (int)(long)hcpu;
1191
1192 switch (action) {
1193 case CPU_UP_CANCELED:
1194 case CPU_UP_CANCELED_FROZEN:
1195 case CPU_DOWN_PREPARE:
1196 case CPU_DOWN_PREPARE_FROZEN:
1197 case CPU_DEAD:
1198 case CPU_DEAD_FROZEN:
1199 hotplug_hrtick_disable(cpu);
1200 return NOTIFY_OK;
1201
1202 case CPU_UP_PREPARE:
1203 case CPU_UP_PREPARE_FROZEN:
1204 case CPU_DOWN_FAILED:
1205 case CPU_DOWN_FAILED_FROZEN:
1206 case CPU_ONLINE:
1207 case CPU_ONLINE_FROZEN:
1208 hotplug_hrtick_enable(cpu);
1209 return NOTIFY_OK;
1210 }
1211
1212 return NOTIFY_DONE;
1213}
1214
1215static void init_hrtick(void)
1216{
1217 hotcpu_notifier(hotplug_hrtick, 0);
1218}
1219
1220static void init_rq_hrtick(struct rq *rq)
1286{ 1221{
1287 rq->hrtick_flags = 0; 1222 rq->hrtick_flags = 0;
1288 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1223 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -1319,6 +1254,10 @@ static inline void init_rq_hrtick(struct rq *rq)
1319void hrtick_resched(void) 1254void hrtick_resched(void)
1320{ 1255{
1321} 1256}
1257
1258static inline void init_hrtick(void)
1259{
1260}
1322#endif 1261#endif
1323 1262
1324/* 1263/*
@@ -1438,8 +1377,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1438{ 1377{
1439 u64 tmp; 1378 u64 tmp;
1440 1379
1441 if (unlikely(!lw->inv_weight)) 1380 if (!lw->inv_weight)
1442 lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); 1381 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
1443 1382
1444 tmp = (u64)delta_exec * weight; 1383 tmp = (u64)delta_exec * weight;
1445 /* 1384 /*
@@ -1748,6 +1687,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1748 1687
1749 if (shares < MIN_SHARES) 1688 if (shares < MIN_SHARES)
1750 shares = MIN_SHARES; 1689 shares = MIN_SHARES;
1690 else if (shares > MAX_SHARES)
1691 shares = MAX_SHARES;
1751 1692
1752 __set_se_shares(tg->se[tcpu], shares); 1693 __set_se_shares(tg->se[tcpu], shares);
1753} 1694}
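
Both weight-related changes above guard the same fixed-point scheme. calc_delta_mine() scales a runtime delta by weight/lw->weight using a precomputed inverse of roughly 2^32 / lw->weight, so the leading "1 +" keeps inv_weight from ever collapsing to zero, and the new MAX_SHARES of ULONG_MAX - 1 keeps the divisor lw->weight + 1 from wrapping to zero. The userspace sketch below only mimics the shape of that calculation: WMULT_CONST is approximated as 2^32 - 1 and the kernel's rounding and overflow handling are dropped, so treat it as an illustration rather than the exact kernel path.

#include <stdio.h>

int main(void)
{
        unsigned long long delta_exec = 1000000;  /* 1ms of runtime to be scaled      */
        unsigned long weight          = 1024;     /* entity weight (NICE_0_LOAD)      */
        unsigned long lw_weight       = 2048;     /* total load of the queue          */

        /* Same shape as the new initialisation: roughly 2^32 / lw_weight, and
         * the leading "1 +" guarantees the inverse never ends up as zero. */
        unsigned long inv_weight = 1 + (0xffffffffUL - lw_weight / 2) / (lw_weight + 1);

        /* calc_delta_mine(): delta * weight * inv_weight >> 32, with the
         * kernel's rounding and overflow special cases omitted. */
        unsigned long long scaled = (delta_exec * weight * inv_weight) >> 32;

        printf("%llu\n", scaled);                 /* ~500000: about half the runtime */
        return 0;
}

With the queue at twice the default weight, the task is credited with roughly half of the elapsed time, which is the proportional-share behaviour the clamping is protecting.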
@@ -4339,8 +4280,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4339 struct rq *rq = this_rq(); 4280 struct rq *rq = this_rq();
4340 cputime64_t tmp; 4281 cputime64_t tmp;
4341 4282
4342 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) 4283 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4343 return account_guest_time(p, cputime); 4284 account_guest_time(p, cputime);
4285 return;
4286 }
4344 4287
4345 p->stime = cputime_add(p->stime, cputime); 4288 p->stime = cputime_add(p->stime, cputime);
4346 4289
@@ -4404,19 +4347,11 @@ void scheduler_tick(void)
4404 int cpu = smp_processor_id(); 4347 int cpu = smp_processor_id();
4405 struct rq *rq = cpu_rq(cpu); 4348 struct rq *rq = cpu_rq(cpu);
4406 struct task_struct *curr = rq->curr; 4349 struct task_struct *curr = rq->curr;
4407 u64 next_tick = rq->tick_timestamp + TICK_NSEC; 4350
4351 sched_clock_tick();
4408 4352
4409 spin_lock(&rq->lock); 4353 spin_lock(&rq->lock);
4410 __update_rq_clock(rq); 4354 update_rq_clock(rq);
4411 /*
4412 * Let rq->clock advance by at least TICK_NSEC:
4413 */
4414 if (unlikely(rq->clock < next_tick)) {
4415 rq->clock = next_tick;
4416 rq->clock_underflows++;
4417 }
4418 rq->tick_timestamp = rq->clock;
4419 update_last_tick_seen(rq);
4420 update_cpu_load(rq); 4355 update_cpu_load(rq);
4421 curr->sched_class->task_tick(rq, curr, 0); 4356 curr->sched_class->task_tick(rq, curr, 0);
4422 spin_unlock(&rq->lock); 4357 spin_unlock(&rq->lock);
@@ -4570,7 +4505,7 @@ need_resched_nonpreemptible:
4570 * Do the rq-clock update outside the rq lock: 4505 * Do the rq-clock update outside the rq lock:
4571 */ 4506 */
4572 local_irq_disable(); 4507 local_irq_disable();
4573 __update_rq_clock(rq); 4508 update_rq_clock(rq);
4574 spin_lock(&rq->lock); 4509 spin_lock(&rq->lock);
4575 clear_tsk_need_resched(prev); 4510 clear_tsk_need_resched(prev);
4576 4511
@@ -4595,9 +4530,9 @@ need_resched_nonpreemptible:
4595 prev->sched_class->put_prev_task(rq, prev); 4530 prev->sched_class->put_prev_task(rq, prev);
4596 next = pick_next_task(rq, prev); 4531 next = pick_next_task(rq, prev);
4597 4532
4598 sched_info_switch(prev, next);
4599
4600 if (likely(prev != next)) { 4533 if (likely(prev != next)) {
4534 sched_info_switch(prev, next);
4535
4601 rq->nr_switches++; 4536 rq->nr_switches++;
4602 rq->curr = next; 4537 rq->curr = next;
4603 ++*switch_count; 4538 ++*switch_count;
@@ -7755,7 +7690,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7755{ 7690{
7756 int i, j; 7691 int i, j;
7757 7692
7758 lock_doms_cur(); 7693 mutex_lock(&sched_domains_mutex);
7759 7694
7760 /* always unregister in case we don't destroy any domains */ 7695 /* always unregister in case we don't destroy any domains */
7761 unregister_sched_domain_sysctl(); 7696 unregister_sched_domain_sysctl();
@@ -7804,7 +7739,7 @@ match2:
7804 7739
7805 register_sched_domain_sysctl(); 7740 register_sched_domain_sysctl();
7806 7741
7807 unlock_doms_cur(); 7742 mutex_unlock(&sched_domains_mutex);
7808} 7743}
7809 7744
7810#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7745#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7813,8 +7748,10 @@ int arch_reinit_sched_domains(void)
7813 int err; 7748 int err;
7814 7749
7815 get_online_cpus(); 7750 get_online_cpus();
7751 mutex_lock(&sched_domains_mutex);
7816 detach_destroy_domains(&cpu_online_map); 7752 detach_destroy_domains(&cpu_online_map);
7817 err = arch_init_sched_domains(&cpu_online_map); 7753 err = arch_init_sched_domains(&cpu_online_map);
7754 mutex_unlock(&sched_domains_mutex);
7818 put_online_cpus(); 7755 put_online_cpus();
7819 7756
7820 return err; 7757 return err;
@@ -7932,13 +7869,16 @@ void __init sched_init_smp(void)
7932 BUG_ON(sched_group_nodes_bycpu == NULL); 7869 BUG_ON(sched_group_nodes_bycpu == NULL);
7933#endif 7870#endif
7934 get_online_cpus(); 7871 get_online_cpus();
7872 mutex_lock(&sched_domains_mutex);
7935 arch_init_sched_domains(&cpu_online_map); 7873 arch_init_sched_domains(&cpu_online_map);
7936 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7874 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7937 if (cpus_empty(non_isolated_cpus)) 7875 if (cpus_empty(non_isolated_cpus))
7938 cpu_set(smp_processor_id(), non_isolated_cpus); 7876 cpu_set(smp_processor_id(), non_isolated_cpus);
7877 mutex_unlock(&sched_domains_mutex);
7939 put_online_cpus(); 7878 put_online_cpus();
7940 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7879 /* XXX: Theoretical race here - CPU may be hotplugged now */
7941 hotcpu_notifier(update_sched_domains, 0); 7880 hotcpu_notifier(update_sched_domains, 0);
7881 init_hrtick();
7942 7882
7943 /* Move init over to a non-isolated CPU */ 7883 /* Move init over to a non-isolated CPU */
7944 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 7884 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
@@ -8025,7 +7965,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8025 7965
8026 se->my_q = cfs_rq; 7966 se->my_q = cfs_rq;
8027 se->load.weight = tg->shares; 7967 se->load.weight = tg->shares;
8028 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 7968 se->load.inv_weight = 0;
8029 se->parent = parent; 7969 se->parent = parent;
8030} 7970}
8031#endif 7971#endif
@@ -8149,8 +8089,6 @@ void __init sched_init(void)
8149 spin_lock_init(&rq->lock); 8089 spin_lock_init(&rq->lock);
8150 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8090 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8151 rq->nr_running = 0; 8091 rq->nr_running = 0;
8152 rq->clock = 1;
8153 update_last_tick_seen(rq);
8154 init_cfs_rq(&rq->cfs, rq); 8092 init_cfs_rq(&rq->cfs, rq);
8155 init_rt_rq(&rq->rt, rq); 8093 init_rt_rq(&rq->rt, rq);
8156#ifdef CONFIG_FAIR_GROUP_SCHED 8094#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8294,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
8294static void normalize_task(struct rq *rq, struct task_struct *p) 8232static void normalize_task(struct rq *rq, struct task_struct *p)
8295{ 8233{
8296 int on_rq; 8234 int on_rq;
8235
8297 update_rq_clock(rq); 8236 update_rq_clock(rq);
8298 on_rq = p->se.on_rq; 8237 on_rq = p->se.on_rq;
8299 if (on_rq) 8238 if (on_rq)
@@ -8325,7 +8264,6 @@ void normalize_rt_tasks(void)
8325 p->se.sleep_start = 0; 8264 p->se.sleep_start = 0;
8326 p->se.block_start = 0; 8265 p->se.block_start = 0;
8327#endif 8266#endif
8328 task_rq(p)->clock = 0;
8329 8267
8330 if (!rt_task(p)) { 8268 if (!rt_task(p)) {
8331 /* 8269 /*
@@ -8692,7 +8630,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8692 dequeue_entity(cfs_rq, se, 0); 8630 dequeue_entity(cfs_rq, se, 0);
8693 8631
8694 se->load.weight = shares; 8632 se->load.weight = shares;
8695 se->load.inv_weight = div64_64((1ULL<<32), shares); 8633 se->load.inv_weight = 0;
8696 8634
8697 if (on_rq) 8635 if (on_rq)
8698 enqueue_entity(cfs_rq, se, 0); 8636 enqueue_entity(cfs_rq, se, 0);
@@ -8722,13 +8660,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8722 if (!tg->se[0]) 8660 if (!tg->se[0])
8723 return -EINVAL; 8661 return -EINVAL;
8724 8662
8725 /*
8726 * A weight of 0 or 1 can cause arithmetics problems.
8727 * (The default weight is 1024 - so there's no practical
8728 * limitation from this.)
8729 */
8730 if (shares < MIN_SHARES) 8663 if (shares < MIN_SHARES)
8731 shares = MIN_SHARES; 8664 shares = MIN_SHARES;
8665 else if (shares > MAX_SHARES)
8666 shares = MAX_SHARES;
8732 8667
8733 mutex_lock(&shares_mutex); 8668 mutex_lock(&shares_mutex);
8734 if (tg->shares == shares) 8669 if (tg->shares == shares)
@@ -8753,7 +8688,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8753 * force a rebalance 8688 * force a rebalance
8754 */ 8689 */
8755 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8690 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8756 set_se_shares(tg->se[i], shares/nr_cpu_ids); 8691 set_se_shares(tg->se[i], shares);
8757 } 8692 }
8758 8693
8759 /* 8694 /*
@@ -8787,7 +8722,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8787 if (runtime == RUNTIME_INF) 8722 if (runtime == RUNTIME_INF)
8788 return 1ULL << 16; 8723 return 1ULL << 16;
8789 8724
8790 return div64_64(runtime << 16, period); 8725 return div64_u64(runtime << 16, period);
8791} 8726}
8792 8727
8793#ifdef CONFIG_CGROUP_SCHED 8728#ifdef CONFIG_CGROUP_SCHED
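
div64_64() has been renamed div64_u64() tree-wide; the call in to_ratio() is otherwise unchanged and still produces a 16.16 fixed-point ratio of runtime to period. A quick userspace check of that representation, with plain 64-bit division standing in for div64_u64():

#include <stdio.h>

/* Same signature as the kernel helper shown in the hunk header above. */
static unsigned long to_ratio(unsigned long long period, unsigned long long runtime)
{
        return (unsigned long)((runtime << 16) / period);
}

int main(void)
{
        /* 950ms of -rt runtime per 1000ms period, in 16.16 fixed point: */
        printf("%lu / %d\n", to_ratio(1000000000ULL, 950000000ULL), 1 << 16);
        /* prints "62259 / 65536", i.e. roughly 0.95 */
        return 0;
}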
@@ -9057,13 +8992,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9057} 8992}
9058 8993
9059#ifdef CONFIG_FAIR_GROUP_SCHED 8994#ifdef CONFIG_FAIR_GROUP_SCHED
9060static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8995static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9061 u64 shareval) 8996 u64 shareval)
9062{ 8997{
9063 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8998 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9064} 8999}
9065 9000
9066static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) 9001static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9067{ 9002{
9068 struct task_group *tg = cgroup_tg(cgrp); 9003 struct task_group *tg = cgroup_tg(cgrp);
9069 9004
@@ -9073,48 +9008,14 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
9073 9008
9074#ifdef CONFIG_RT_GROUP_SCHED 9009#ifdef CONFIG_RT_GROUP_SCHED
9075static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9010static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9076 struct file *file, 9011 s64 val)
9077 const char __user *userbuf,
9078 size_t nbytes, loff_t *unused_ppos)
9079{ 9012{
9080 char buffer[64]; 9013 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9081 int retval = 0;
9082 s64 val;
9083 char *end;
9084
9085 if (!nbytes)
9086 return -EINVAL;
9087 if (nbytes >= sizeof(buffer))
9088 return -E2BIG;
9089 if (copy_from_user(buffer, userbuf, nbytes))
9090 return -EFAULT;
9091
9092 buffer[nbytes] = 0; /* nul-terminate */
9093
9094 /* strip newline if necessary */
9095 if (nbytes && (buffer[nbytes-1] == '\n'))
9096 buffer[nbytes-1] = 0;
9097 val = simple_strtoll(buffer, &end, 0);
9098 if (*end)
9099 return -EINVAL;
9100
9101 /* Pass to subsystem */
9102 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9103 if (!retval)
9104 retval = nbytes;
9105 return retval;
9106} 9014}
9107 9015
9108static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, 9016static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9109 struct file *file,
9110 char __user *buf, size_t nbytes,
9111 loff_t *ppos)
9112{ 9017{
9113 char tmp[64]; 9018 return sched_group_rt_runtime(cgroup_tg(cgrp));
9114 long val = sched_group_rt_runtime(cgroup_tg(cgrp));
9115 int len = sprintf(tmp, "%ld\n", val);
9116
9117 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
9118} 9019}
9119 9020
9120static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 9021static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
@@ -9133,20 +9034,20 @@ static struct cftype cpu_files[] = {
9133#ifdef CONFIG_FAIR_GROUP_SCHED 9034#ifdef CONFIG_FAIR_GROUP_SCHED
9134 { 9035 {
9135 .name = "shares", 9036 .name = "shares",
9136 .read_uint = cpu_shares_read_uint, 9037 .read_u64 = cpu_shares_read_u64,
9137 .write_uint = cpu_shares_write_uint, 9038 .write_u64 = cpu_shares_write_u64,
9138 }, 9039 },
9139#endif 9040#endif
9140#ifdef CONFIG_RT_GROUP_SCHED 9041#ifdef CONFIG_RT_GROUP_SCHED
9141 { 9042 {
9142 .name = "rt_runtime_us", 9043 .name = "rt_runtime_us",
9143 .read = cpu_rt_runtime_read, 9044 .read_s64 = cpu_rt_runtime_read,
9144 .write = cpu_rt_runtime_write, 9045 .write_s64 = cpu_rt_runtime_write,
9145 }, 9046 },
9146 { 9047 {
9147 .name = "rt_period_us", 9048 .name = "rt_period_us",
9148 .read_uint = cpu_rt_period_read_uint, 9049 .read_u64 = cpu_rt_period_read_uint,
9149 .write_uint = cpu_rt_period_write_uint, 9050 .write_u64 = cpu_rt_period_write_uint,
9150 }, 9051 },
9151#endif 9052#endif
9152}; 9053};
@@ -9277,8 +9178,8 @@ out:
9277static struct cftype files[] = { 9178static struct cftype files[] = {
9278 { 9179 {
9279 .name = "usage", 9180 .name = "usage",
9280 .read_uint = cpuusage_read, 9181 .read_u64 = cpuusage_read,
9281 .write_uint = cpuusage_write, 9182 .write_u64 = cpuusage_write,
9282 }, 9183 },
9283}; 9184};
9284 9185
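
The last few hunks of sched.c drop the hand-rolled read()/write() file methods for the cgroup attributes in favour of the typed cftype callbacks (read_u64/write_u64, read_s64/write_s64), letting the cgroup core do the user-buffer copying and number parsing. For reference, a minimal sketch of what a subsystem file looks like with the typed interface; the demo_* names and the global value are made up, only the callback signatures and the .read_u64/.write_u64 members mirror what the patch itself uses:

#include <linux/cgroup.h>
#include <linux/errno.h>

/* Hypothetical state; a real controller would look this up from the cgroup. */
static u64 demo_value;

static u64 demo_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
        return demo_value;
}

static int demo_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 val)
{
        if (val > 1000000)
                return -EINVAL;         /* the core hands any error back to write() */
        demo_value = val;
        return 0;
}

static struct cftype demo_files[] = {
        {
                .name      = "value",
                .read_u64  = demo_read_u64,
                .write_u64 = demo_write_u64,
        },
};

From userspace nothing changes: the file still reads back as a decimal string and accepts one on write, but the subsystem no longer re-implements the parsing and error handling.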
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 000000000000..9c597e37f7de
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com>
9 *
10 * Create a semi stable clock from a mixture of other events, including:
11 * - gtod
12 * - jiffies
13 * - sched_clock()
14 * - explicit idle events
15 *
16 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
17 * making it monotonic and keeping it within an expected window. This window
18 * is set up using jiffies.
19 *
20 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
21 * that is otherwise invisible (TSC gets stopped).
22 *
23 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
24 * consistent between cpus (never more than 1 jiffies difference).
25 */
26#include <linux/sched.h>
27#include <linux/percpu.h>
28#include <linux/spinlock.h>
29#include <linux/ktime.h>
30#include <linux/module.h>
31
32
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34
35struct sched_clock_data {
36 /*
37 * Raw spinlock - this is a special case: this might be called
38 * from within instrumentation code so we don't want to do any
39 * instrumentation ourselves.
40 */
41 raw_spinlock_t lock;
42
43 unsigned long prev_jiffies;
44 u64 prev_raw;
45 u64 tick_raw;
46 u64 tick_gtod;
47 u64 clock;
48};
49
50static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
51
52static inline struct sched_clock_data *this_scd(void)
53{
54 return &__get_cpu_var(sched_clock_data);
55}
56
57static inline struct sched_clock_data *cpu_sdc(int cpu)
58{
59 return &per_cpu(sched_clock_data, cpu);
60}
61
62void sched_clock_init(void)
63{
64 u64 ktime_now = ktime_to_ns(ktime_get());
65 u64 now = 0;
66 int cpu;
67
68 for_each_possible_cpu(cpu) {
69 struct sched_clock_data *scd = cpu_sdc(cpu);
70
71 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
72 scd->prev_jiffies = jiffies;
73 scd->prev_raw = now;
74 scd->tick_raw = now;
75 scd->tick_gtod = ktime_now;
76 scd->clock = ktime_now;
77 }
78}
79
80/*
81 * update the percpu scd from the raw @now value
82 *
83 * - filter out backward motion
84 * - use jiffies to generate a min,max window to clip the raw values
85 */
86static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
87{
88 unsigned long now_jiffies = jiffies;
89 long delta_jiffies = now_jiffies - scd->prev_jiffies;
90 u64 clock = scd->clock;
91 u64 min_clock, max_clock;
92 s64 delta = now - scd->prev_raw;
93
94 WARN_ON_ONCE(!irqs_disabled());
95 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
96
97 if (unlikely(delta < 0)) {
98 clock++;
99 goto out;
100 }
101
102 max_clock = min_clock + TICK_NSEC;
103
104 if (unlikely(clock + delta > max_clock)) {
105 if (clock < max_clock)
106 clock = max_clock;
107 else
108 clock++;
109 } else {
110 clock += delta;
111 }
112
113 out:
114 if (unlikely(clock < min_clock))
115 clock = min_clock;
116
117 scd->prev_raw = now;
118 scd->prev_jiffies = now_jiffies;
119 scd->clock = clock;
120}
121
122static void lock_double_clock(struct sched_clock_data *data1,
123 struct sched_clock_data *data2)
124{
125 if (data1 < data2) {
126 __raw_spin_lock(&data1->lock);
127 __raw_spin_lock(&data2->lock);
128 } else {
129 __raw_spin_lock(&data2->lock);
130 __raw_spin_lock(&data1->lock);
131 }
132}
133
134u64 sched_clock_cpu(int cpu)
135{
136 struct sched_clock_data *scd = cpu_sdc(cpu);
137 u64 now, clock;
138
139 WARN_ON_ONCE(!irqs_disabled());
140 now = sched_clock();
141
142 if (cpu != raw_smp_processor_id()) {
143 /*
144 * in order to update a remote cpu's clock based on our
145 * unstable raw time, rebase it against:
146 * tick_raw (offset between raw counters)
147 * tick_gtod (tick offset between cpus)
148 */
149 struct sched_clock_data *my_scd = this_scd();
150
151 lock_double_clock(scd, my_scd);
152
153 now -= my_scd->tick_raw;
154 now += scd->tick_raw;
155
156 now -= my_scd->tick_gtod;
157 now += scd->tick_gtod;
158
159 __raw_spin_unlock(&my_scd->lock);
160 } else {
161 __raw_spin_lock(&scd->lock);
162 }
163
164 __update_sched_clock(scd, now);
165 clock = scd->clock;
166
167 __raw_spin_unlock(&scd->lock);
168
169 return clock;
170}
171
172void sched_clock_tick(void)
173{
174 struct sched_clock_data *scd = this_scd();
175 u64 now, now_gtod;
176
177 WARN_ON_ONCE(!irqs_disabled());
178
179 now = sched_clock();
180 now_gtod = ktime_to_ns(ktime_get());
181
182 __raw_spin_lock(&scd->lock);
183 __update_sched_clock(scd, now);
184 /*
185 * update tick_gtod after __update_sched_clock() because that will
186 * already observe 1 new jiffy; adding a new tick_gtod to that would
187 * increase the clock by 2 jiffies.
188 */
189 scd->tick_raw = now;
190 scd->tick_gtod = now_gtod;
191 __raw_spin_unlock(&scd->lock);
192}
193
194/*
195 * We are going deep-idle (irqs are disabled):
196 */
197void sched_clock_idle_sleep_event(void)
198{
199 sched_clock_cpu(smp_processor_id());
200}
201EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
202
203/*
204 * We just idled delta nanoseconds (called with irqs disabled):
205 */
206void sched_clock_idle_wakeup_event(u64 delta_ns)
207{
208 struct sched_clock_data *scd = this_scd();
209 u64 now = sched_clock();
210
211 /*
212 * Override the previous timestamp and ignore all
213 * sched_clock() deltas that occurred while we idled,
214 * and use the PM-provided delta_ns to advance the
215 * rq clock:
216 */
217 __raw_spin_lock(&scd->lock);
218 scd->prev_raw = now;
219 scd->clock += delta_ns;
220 __raw_spin_unlock(&scd->lock);
221
222 touch_softlockup_watchdog();
223}
224EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
225
226#endif
227
228/*
229 * Scheduler clock - returns current time in nanosec units.
230 * This is default implementation.
231 * Architectures and sub-architectures can override this.
232 */
233unsigned long long __attribute__((weak)) sched_clock(void)
234{
235 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
236}
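
The heart of the new file is __update_sched_clock(): raw sched_clock() deltas are only trusted inside a window derived from gtod and jiffies, so a TSC that stalls or jumps can only move the per-cpu clock within a span that starts at the gtod/jiffies estimate of "now" and extends one tick beyond it. The standalone sketch below reproduces just that clamping; TICK_NSEC is taken as 1,000,000 ns (HZ=1000) and the per-cpu structure is flattened into plain parameters, so it is a model of the logic rather than the kernel code itself.

#include <stdio.h>

#define TICK_NSEC 1000000ULL

/* Mirror of the clamping in __update_sched_clock(), with the state passed in
 * explicitly instead of living in a struct sched_clock_data. */
static unsigned long long clip_clock(unsigned long long clock,
                                     long long delta,              /* raw sched_clock() delta */
                                     unsigned long long tick_gtod, /* gtod at the last tick   */
                                     long delta_jiffies)           /* jiffies since that tick */
{
        unsigned long long min_clock = tick_gtod + delta_jiffies * TICK_NSEC;
        unsigned long long max_clock = min_clock + TICK_NSEC;

        if (delta < 0)
                clock++;                                /* raw clock went backwards */
        else if (clock + delta > max_clock)
                clock = clock < max_clock ? max_clock : clock + 1;
        else
                clock += delta;

        if (clock < min_clock)                          /* never fall behind the window */
                clock = min_clock;

        return clock;
}

int main(void)
{
        /* One jiffy after a gtod sample of 5ms, the raw clock claims a 40ms
         * jump; the result is clipped to min_clock + TICK_NSEC = 7000000. */
        printf("%llu\n", clip_clock(5000000ULL, 40000000LL, 5000000ULL, 1));
        return 0;
}

The cross-CPU path in sched_clock_cpu() feeds the same clamp: before updating, the raw value is rebased with the remote CPU's tick_raw and tick_gtod offsets so both CPUs are measured against the same tick.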
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index f3f4af4b8b0f..5f06118fbc31 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
204 PN(next_balance); 204 PN(next_balance);
205 P(curr->pid); 205 P(curr->pid);
206 PN(clock); 206 PN(clock);
207 PN(idle_clock);
208 PN(prev_clock_raw);
209 P(clock_warps);
210 P(clock_overflows);
211 P(clock_underflows);
212 P(clock_deep_idle_events);
213 PN(clock_max_delta);
214 P(cpu_load[0]); 207 P(cpu_load[0]);
215 P(cpu_load[1]); 208 P(cpu_load[1]);
216 P(cpu_load[2]); 209 P(cpu_load[2]);
@@ -277,12 +270,9 @@ static int __init init_sched_debug_procfs(void)
277{ 270{
278 struct proc_dir_entry *pe; 271 struct proc_dir_entry *pe;
279 272
280 pe = create_proc_entry("sched_debug", 0644, NULL); 273 pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
281 if (!pe) 274 if (!pe)
282 return -ENOMEM; 275 return -ENOMEM;
283
284 pe->proc_fops = &sched_debug_fops;
285
286 return 0; 276 return 0;
287} 277}
288 278
@@ -360,8 +350,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
360 350
361 avg_per_cpu = p->se.sum_exec_runtime; 351 avg_per_cpu = p->se.sum_exec_runtime;
362 if (p->se.nr_migrations) { 352 if (p->se.nr_migrations) {
363 avg_per_cpu = div64_64(avg_per_cpu, 353 avg_per_cpu = div64_u64(avg_per_cpu,
364 p->se.nr_migrations); 354 p->se.nr_migrations);
365 } else { 355 } else {
366 avg_per_cpu = -1LL; 356 avg_per_cpu = -1LL;
367 } 357 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 89fa32b4edf2..c863663d204d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
682 * Update run-time statistics of the 'current'. 682 * Update run-time statistics of the 'current'.
683 */ 683 */
684 update_curr(cfs_rq); 684 update_curr(cfs_rq);
685 account_entity_enqueue(cfs_rq, se);
685 686
686 if (wakeup) { 687 if (wakeup) {
687 place_entity(cfs_rq, se, 0); 688 place_entity(cfs_rq, se, 0);
@@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
692 check_spread(cfs_rq, se); 693 check_spread(cfs_rq, se);
693 if (se != cfs_rq->curr) 694 if (se != cfs_rq->curr)
694 __enqueue_entity(cfs_rq, se); 695 __enqueue_entity(cfs_rq, se);
695 account_entity_enqueue(cfs_rq, se);
696} 696}
697 697
698static void update_avg(u64 *avg, u64 sample) 698static void update_avg(u64 *avg, u64 sample)
@@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
841 * queued ticks are scheduled to match the slice, so don't bother 841 * queued ticks are scheduled to match the slice, so don't bother
842 * validating it and just reschedule. 842 * validating it and just reschedule.
843 */ 843 */
844 if (queued) 844 if (queued) {
845 return resched_task(rq_of(cfs_rq)->curr); 845 resched_task(rq_of(cfs_rq)->curr);
846 return;
847 }
846 /* 848 /*
847 * don't let the period tick interfere with the hrtick preemption 849 * don't let the period tick interfere with the hrtick preemption
848 */ 850 */
@@ -957,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
957 return; 959 return;
958 960
959 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 961 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
960 __update_rq_clock(rq); 962 update_rq_clock(rq);
961 /* 963 /*
962 * Update run-time statistics of the 'current'. 964 * Update run-time statistics of the 'current'.
963 */ 965 */
@@ -1007,7 +1009,7 @@ static int wake_idle(int cpu, struct task_struct *p)
1007 * sibling runqueue info. This will avoid the checks and cache miss 1009 * sibling runqueue info. This will avoid the checks and cache miss
1008 * penalties associated with that. 1010 * penalties associated with that.
1009 */ 1011 */
1010 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) 1012 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1011 return cpu; 1013 return cpu;
1012 1014
1013 for_each_domain(cpu, sd) { 1015 for_each_domain(cpu, sd) {
@@ -1611,30 +1613,6 @@ static const struct sched_class fair_sched_class = {
1611}; 1613};
1612 1614
1613#ifdef CONFIG_SCHED_DEBUG 1615#ifdef CONFIG_SCHED_DEBUG
1614static void
1615print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
1616{
1617 struct sched_entity *se;
1618
1619 if (!cfs_rq)
1620 return;
1621
1622 list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
1623 int i;
1624
1625 for (i = depth; i; i--)
1626 seq_puts(m, " ");
1627
1628 seq_printf(m, "%lu %s %lu\n",
1629 se->load.weight,
1630 entity_is_task(se) ? "T" : "G",
1631 calc_delta_weight(SCHED_LOAD_SCALE, se)
1632 );
1633 if (!entity_is_task(se))
1634 print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
1635 }
1636}
1637
1638static void print_cfs_stats(struct seq_file *m, int cpu) 1616static void print_cfs_stats(struct seq_file *m, int cpu)
1639{ 1617{
1640 struct cfs_rq *cfs_rq; 1618 struct cfs_rq *cfs_rq;
@@ -1642,9 +1620,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1642 rcu_read_lock(); 1620 rcu_read_lock();
1643 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1621 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1644 print_cfs_rq(m, cpu, cfs_rq); 1622 print_cfs_rq(m, cpu, cfs_rq);
1645
1646 seq_printf(m, "\nWeight tree:\n");
1647 print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
1648 rcu_read_unlock(); 1623 rcu_read_unlock();
1649} 1624}
1650#endif 1625#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 2bcafa375633..3a4f92dbbe66 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
99/* 99/*
100 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
101 */ 101 */
102const struct sched_class idle_sched_class = { 102static const struct sched_class idle_sched_class = {
103 /* .next is NULL */ 103 /* .next is NULL */
104 /* no enqueue/yield_task for idle tasks */ 104 /* no enqueue/yield_task for idle tasks */
105 105
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c2730a5a4f05..060e87b0cb1c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq)
1098 } 1098 }
1099} 1099}
1100 1100
1101 1101/*
1102 * If we are not running and we are not going to reschedule soon, we should
1103 * try to push tasks away now
1104 */
1102static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1105static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1103{ 1106{
1104 if (!task_running(rq, p) && 1107 if (!task_running(rq, p) &&
1105 (p->prio >= rq->rt.highest_prio) && 1108 !test_tsk_need_resched(rq->curr) &&
1106 rq->rt.overloaded) 1109 rq->rt.overloaded)
1107 push_rt_tasks(rq); 1110 push_rt_tasks(rq);
1108} 1111}
@@ -1309,7 +1312,7 @@ static void set_curr_task_rt(struct rq *rq)
1309 p->se.exec_start = rq->clock; 1312 p->se.exec_start = rq->clock;
1310} 1313}
1311 1314
1312const struct sched_class rt_sched_class = { 1315static const struct sched_class rt_sched_class = {
1313 .next = &fair_sched_class, 1316 .next = &fair_sched_class,
1314 .enqueue_task = enqueue_task_rt, 1317 .enqueue_task = enqueue_task_rt,
1315 .dequeue_task = dequeue_task_rt, 1318 .dequeue_task = dequeue_task_rt,
diff --git a/kernel/signal.c b/kernel/signal.c
index 64ad0ed15992..72bb4f51f963 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,11 +39,19 @@
39 39
40static struct kmem_cache *sigqueue_cachep; 40static struct kmem_cache *sigqueue_cachep;
41 41
42static int __sig_ignored(struct task_struct *t, int sig)
43{
44 void __user *handler;
45
46 /* Is it explicitly or implicitly ignored? */
47
48 handler = t->sighand->action[sig - 1].sa.sa_handler;
49 return handler == SIG_IGN ||
50 (handler == SIG_DFL && sig_kernel_ignore(sig));
51}
42 52
43static int sig_ignored(struct task_struct *t, int sig) 53static int sig_ignored(struct task_struct *t, int sig)
44{ 54{
45 void __user * handler;
46
47 /* 55 /*
48 * Tracers always want to know about signals.. 56 * Tracers always want to know about signals..
49 */ 57 */
@@ -58,10 +66,7 @@ static int sig_ignored(struct task_struct *t, int sig)
58 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 66 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
59 return 0; 67 return 0;
60 68
61 /* Is it explicitly or implicitly ignored? */ 69 return __sig_ignored(t, sig);
62 handler = t->sighand->action[sig-1].sa.sa_handler;
63 return handler == SIG_IGN ||
64 (handler == SIG_DFL && sig_kernel_ignore(sig));
65} 70}
66 71
67/* 72/*
@@ -372,7 +377,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
372 */ 377 */
373int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 378int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
374{ 379{
375 int signr = 0; 380 int signr;
376 381
377 /* We only dequeue private signals from ourselves, we don't let 382 /* We only dequeue private signals from ourselves, we don't let
378 * signalfd steal them 383 * signalfd steal them
@@ -405,8 +410,12 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
405 } 410 }
406 } 411 }
407 } 412 }
413
408 recalc_sigpending(); 414 recalc_sigpending();
409 if (signr && unlikely(sig_kernel_stop(signr))) { 415 if (!signr)
416 return 0;
417
418 if (unlikely(sig_kernel_stop(signr))) {
410 /* 419 /*
411 * Set a marker that we have dequeued a stop signal. Our 420 * Set a marker that we have dequeued a stop signal. Our
412 * caller might release the siglock and then the pending 421 * caller might release the siglock and then the pending
@@ -422,9 +431,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
422 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 431 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
423 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 432 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
424 } 433 }
425 if (signr && 434 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
426 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
427 info->si_sys_private) {
428 /* 435 /*
429 * Release the siglock to ensure proper locking order 436 * Release the siglock to ensure proper locking order
430 * of timer locks outside of siglocks. Note, we leave 437 * of timer locks outside of siglocks. Note, we leave
@@ -526,21 +533,34 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
526static int check_kill_permission(int sig, struct siginfo *info, 533static int check_kill_permission(int sig, struct siginfo *info,
527 struct task_struct *t) 534 struct task_struct *t)
528{ 535{
529 int error = -EINVAL; 536 struct pid *sid;
537 int error;
538
530 if (!valid_signal(sig)) 539 if (!valid_signal(sig))
531 return error; 540 return -EINVAL;
532 541
533 if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { 542 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
534 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 543 return 0;
535 if (error) 544
536 return error; 545 error = audit_signal_info(sig, t); /* Let audit system see the signal */
537 error = -EPERM; 546 if (error)
538 if (((sig != SIGCONT) ||
539 (task_session_nr(current) != task_session_nr(t)))
540 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
541 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
542 && !capable(CAP_KILL))
543 return error; 547 return error;
548
549 if ((current->euid ^ t->suid) && (current->euid ^ t->uid) &&
550 (current->uid ^ t->suid) && (current->uid ^ t->uid) &&
551 !capable(CAP_KILL)) {
552 switch (sig) {
553 case SIGCONT:
554 sid = task_session(t);
555 /*
556 * We don't return the error if sid == NULL. The
557 * task was unhashed, the caller must notice this.
558 */
559 if (!sid || sid == task_session(current))
560 break;
561 default:
562 return -EPERM;
563 }
544 } 564 }
545 565
546 return security_task_kill(t, info, sig, 0); 566 return security_task_kill(t, info, sig, 0);
@@ -550,62 +570,44 @@ static int check_kill_permission(int sig, struct siginfo *info,
550static void do_notify_parent_cldstop(struct task_struct *tsk, int why); 570static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
551 571
552/* 572/*
553 * Handle magic process-wide effects of stop/continue signals. 573 * Handle magic process-wide effects of stop/continue signals. Unlike
554 * Unlike the signal actions, these happen immediately at signal-generation 574 * the signal actions, these happen immediately at signal-generation
555 * time regardless of blocking, ignoring, or handling. This does the 575 * time regardless of blocking, ignoring, or handling. This does the
556 * actual continuing for SIGCONT, but not the actual stopping for stop 576 * actual continuing for SIGCONT, but not the actual stopping for stop
557 * signals. The process stop is done as a signal action for SIG_DFL. 577 * signals. The process stop is done as a signal action for SIG_DFL.
578 *
579 * Returns true if the signal should be actually delivered, otherwise
580 * it should be dropped.
558 */ 581 */
559static void handle_stop_signal(int sig, struct task_struct *p) 582static int prepare_signal(int sig, struct task_struct *p)
560{ 583{
584 struct signal_struct *signal = p->signal;
561 struct task_struct *t; 585 struct task_struct *t;
562 586
563 if (p->signal->flags & SIGNAL_GROUP_EXIT) 587 if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
564 /* 588 /*
565 * The process is in the middle of dying already. 589 * The process is in the middle of dying, nothing to do.
566 */ 590 */
567 return; 591 } else if (sig_kernel_stop(sig)) {
568
569 if (sig_kernel_stop(sig)) {
570 /* 592 /*
571 * This is a stop signal. Remove SIGCONT from all queues. 593 * This is a stop signal. Remove SIGCONT from all queues.
572 */ 594 */
573 rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); 595 rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
574 t = p; 596 t = p;
575 do { 597 do {
576 rm_from_queue(sigmask(SIGCONT), &t->pending); 598 rm_from_queue(sigmask(SIGCONT), &t->pending);
577 t = next_thread(t); 599 } while_each_thread(p, t);
578 } while (t != p);
579 } else if (sig == SIGCONT) { 600 } else if (sig == SIGCONT) {
601 unsigned int why;
580 /* 602 /*
581 * Remove all stop signals from all queues, 603 * Remove all stop signals from all queues,
582 * and wake all threads. 604 * and wake all threads.
583 */ 605 */
584 if (unlikely(p->signal->group_stop_count > 0)) { 606 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
585 /*
586 * There was a group stop in progress. We'll
587 * pretend it finished before we got here. We are
588 * obliged to report it to the parent: if the
589 * SIGSTOP happened "after" this SIGCONT, then it
590 * would have cleared this pending SIGCONT. If it
591 * happened "before" this SIGCONT, then the parent
592 * got the SIGCHLD about the stop finishing before
593 * the continue happened. We do the notification
594 * now, and it's as if the stop had finished and
595 * the SIGCHLD was pending on entry to this kill.
596 */
597 p->signal->group_stop_count = 0;
598 p->signal->flags = SIGNAL_STOP_CONTINUED;
599 spin_unlock(&p->sighand->siglock);
600 do_notify_parent_cldstop(p, CLD_STOPPED);
601 spin_lock(&p->sighand->siglock);
602 }
603 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
604 t = p; 607 t = p;
605 do { 608 do {
606 unsigned int state; 609 unsigned int state;
607 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 610 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
608
609 /* 611 /*
610 * If there is a handler for SIGCONT, we must make 612 * If there is a handler for SIGCONT, we must make
611 * sure that no thread returns to user mode before 613 * sure that no thread returns to user mode before
@@ -615,7 +617,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
615 * running the handler. With the TIF_SIGPENDING 617 * running the handler. With the TIF_SIGPENDING
616 * flag set, the thread will pause and acquire the 618 * flag set, the thread will pause and acquire the
617 * siglock that we hold now and until we've queued 619 * siglock that we hold now and until we've queued
618 * the pending signal. 620 * the pending signal.
619 * 621 *
620 * Wake up the stopped thread _after_ setting 622 * Wake up the stopped thread _after_ setting
621 * TIF_SIGPENDING 623 * TIF_SIGPENDING
@@ -626,49 +628,163 @@ static void handle_stop_signal(int sig, struct task_struct *p)
626 state |= TASK_INTERRUPTIBLE; 628 state |= TASK_INTERRUPTIBLE;
627 } 629 }
628 wake_up_state(t, state); 630 wake_up_state(t, state);
631 } while_each_thread(p, t);
629 632
630 t = next_thread(t); 633 /*
631 } while (t != p); 634 * Notify the parent with CLD_CONTINUED if we were stopped.
635 *
636 * If we were in the middle of a group stop, we pretend it
637 * was already finished, and then continued. Since SIGCHLD
638 * doesn't queue we report only CLD_STOPPED, as if the next
639 * CLD_CONTINUED was dropped.
640 */
641 why = 0;
642 if (signal->flags & SIGNAL_STOP_STOPPED)
643 why |= SIGNAL_CLD_CONTINUED;
644 else if (signal->group_stop_count)
645 why |= SIGNAL_CLD_STOPPED;
632 646
633 if (p->signal->flags & SIGNAL_STOP_STOPPED) { 647 if (why) {
634 /* 648 /*
635 * We were in fact stopped, and are now continued. 649 * The first thread which returns from finish_stop()
636 * Notify the parent with CLD_CONTINUED. 650 * will take ->siglock, notice SIGNAL_CLD_MASK, and
651 * notify its parent. See get_signal_to_deliver().
637 */ 652 */
638 p->signal->flags = SIGNAL_STOP_CONTINUED; 653 signal->flags = why | SIGNAL_STOP_CONTINUED;
639 p->signal->group_exit_code = 0; 654 signal->group_stop_count = 0;
640 spin_unlock(&p->sighand->siglock); 655 signal->group_exit_code = 0;
641 do_notify_parent_cldstop(p, CLD_CONTINUED);
642 spin_lock(&p->sighand->siglock);
643 } else { 656 } else {
644 /* 657 /*
645 * We are not stopped, but there could be a stop 658 * We are not stopped, but there could be a stop
646 * signal in the middle of being processed after 659 * signal in the middle of being processed after
647 * being removed from the queue. Clear that too. 660 * being removed from the queue. Clear that too.
648 */ 661 */
649 p->signal->flags = 0; 662 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
650 } 663 }
651 } else if (sig == SIGKILL) { 664 }
665
666 return !sig_ignored(p, sig);
667}
668
669/*
670 * Test if P wants to take SIG. After we've checked all threads with this,
671 * it's equivalent to finding no threads not blocking SIG. Any threads not
672 * blocking SIG were ruled out because they are not running and already
673 * have pending signals. Such threads will dequeue from the shared queue
674 * as soon as they're available, so putting the signal on the shared queue
675 * will be equivalent to sending it to one such thread.
676 */
677static inline int wants_signal(int sig, struct task_struct *p)
678{
679 if (sigismember(&p->blocked, sig))
680 return 0;
681 if (p->flags & PF_EXITING)
682 return 0;
683 if (sig == SIGKILL)
684 return 1;
685 if (task_is_stopped_or_traced(p))
686 return 0;
687 return task_curr(p) || !signal_pending(p);
688}
689
690static void complete_signal(int sig, struct task_struct *p, int group)
691{
692 struct signal_struct *signal = p->signal;
693 struct task_struct *t;
694
695 /*
696 * Now find a thread we can wake up to take the signal off the queue.
697 *
698 * If the main thread wants the signal, it gets first crack.
699 * Probably the least surprising to the average bear.
700 */
701 if (wants_signal(sig, p))
702 t = p;
703 else if (!group || thread_group_empty(p))
704 /*
705 * There is just one thread and it does not need to be woken.
706 * It will dequeue unblocked signals before it runs again.
707 */
708 return;
709 else {
652 /* 710 /*
653 * Make sure that any pending stop signal already dequeued 711 * Otherwise try to find a suitable thread.
654 * is undone by the wakeup for SIGKILL.
655 */ 712 */
656 p->signal->flags = 0; 713 t = signal->curr_target;
714 while (!wants_signal(sig, t)) {
715 t = next_thread(t);
716 if (t == signal->curr_target)
717 /*
718 * No thread needs to be woken.
719 * Any eligible threads will see
720 * the signal in the queue soon.
721 */
722 return;
723 }
724 signal->curr_target = t;
657 } 725 }
726
727 /*
728 * Found a killable thread. If the signal will be fatal,
729 * then start taking the whole group down immediately.
730 */
731 if (sig_fatal(p, sig) &&
732 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
733 !sigismember(&t->real_blocked, sig) &&
734 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
735 /*
736 * This signal will be fatal to the whole group.
737 */
738 if (!sig_kernel_coredump(sig)) {
739 /*
740 * Start a group exit and wake everybody up.
741 * This way we don't have other threads
742 * running and doing things after a slower
743 * thread has the fatal signal pending.
744 */
745 signal->flags = SIGNAL_GROUP_EXIT;
746 signal->group_exit_code = sig;
747 signal->group_stop_count = 0;
748 t = p;
749 do {
750 sigaddset(&t->pending.signal, SIGKILL);
751 signal_wake_up(t, 1);
752 } while_each_thread(p, t);
753 return;
754 }
755 }
756
757 /*
758 * The signal is already in the shared-pending queue.
759 * Tell the chosen thread to wake up and dequeue it.
760 */
761 signal_wake_up(t, sig == SIGKILL);
762 return;
763}
764
765static inline int legacy_queue(struct sigpending *signals, int sig)
766{
767 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
658} 768}
659 769
660static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 770static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
661 struct sigpending *signals) 771 int group)
662{ 772{
663 struct sigqueue * q = NULL; 773 struct sigpending *pending;
664 int ret = 0; 774 struct sigqueue *q;
775
776 assert_spin_locked(&t->sighand->siglock);
777 if (!prepare_signal(sig, t))
778 return 0;
665 779
780 pending = group ? &t->signal->shared_pending : &t->pending;
666 /* 781 /*
667 * Deliver the signal to listening signalfds. This must be called 782 * Short-circuit ignored signals and support queuing
668 * with the sighand lock held. 783 * exactly one non-rt signal, so that we can get more
784 * detailed information about the cause of the signal.
669 */ 785 */
670 signalfd_notify(t, sig); 786 if (legacy_queue(pending, sig))
671 787 return 0;
672 /* 788 /*
673 * fast-pathed signals for kernel-internal things like SIGSTOP 789 * fast-pathed signals for kernel-internal things like SIGSTOP
674 * or SIGKILL. 790 * or SIGKILL.
@@ -688,7 +804,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
688 (is_si_special(info) || 804 (is_si_special(info) ||
689 info->si_code >= 0))); 805 info->si_code >= 0)));
690 if (q) { 806 if (q) {
691 list_add_tail(&q->list, &signals->list); 807 list_add_tail(&q->list, &pending->list);
692 switch ((unsigned long) info) { 808 switch ((unsigned long) info) {
693 case (unsigned long) SEND_SIG_NOINFO: 809 case (unsigned long) SEND_SIG_NOINFO:
694 q->info.si_signo = sig; 810 q->info.si_signo = sig;
@@ -718,13 +834,12 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
718 } 834 }
719 835
720out_set: 836out_set:
721 sigaddset(&signals->signal, sig); 837 signalfd_notify(t, sig);
722 return ret; 838 sigaddset(&pending->signal, sig);
839 complete_signal(sig, t, group);
840 return 0;
723} 841}
724 842
725#define LEGACY_QUEUE(sigptr, sig) \
726 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
727
728int print_fatal_signals; 843int print_fatal_signals;
729 844
730static void print_fatal_signal(struct pt_regs *regs, int signr) 845static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -757,29 +872,16 @@ static int __init setup_print_fatal_signals(char *str)
757 872
758__setup("print-fatal-signals=", setup_print_fatal_signals); 873__setup("print-fatal-signals=", setup_print_fatal_signals);
759 874
875int
876__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
877{
878 return send_signal(sig, info, p, 1);
879}
880
760static int 881static int
761specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) 882specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
762{ 883{
763 int ret = 0; 884 return send_signal(sig, info, t, 0);
764
765 BUG_ON(!irqs_disabled());
766 assert_spin_locked(&t->sighand->siglock);
767
768 /* Short-circuit ignored signals. */
769 if (sig_ignored(t, sig))
770 goto out;
771
772 /* Support queueing exactly one non-rt signal, so that we
773 can get more detailed information about the cause of
774 the signal. */
775 if (LEGACY_QUEUE(&t->pending, sig))
776 goto out;
777
778 ret = send_signal(sig, info, t, &t->pending);
779 if (!ret && !sigismember(&t->blocked, sig))
780 signal_wake_up(t, sig == SIGKILL);
781out:
782 return ret;
783} 885}
784 886
785/* 887/*
@@ -790,7 +892,8 @@ out:
790 * since we do not want to have a signal handler that was blocked 892 * since we do not want to have a signal handler that was blocked
791 * be invoked when user space had explicitly blocked it. 893 * be invoked when user space had explicitly blocked it.
792 * 894 *
793 * We don't want to have recursive SIGSEGV's etc, for example. 895 * We don't want to have recursive SIGSEGV's etc, for example,
896 * that is why we also clear SIGNAL_UNKILLABLE.
794 */ 897 */
795int 898int
796force_sig_info(int sig, struct siginfo *info, struct task_struct *t) 899force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
@@ -810,6 +913,8 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
810 recalc_sigpending_and_wake(t); 913 recalc_sigpending_and_wake(t);
811 } 914 }
812 } 915 }
916 if (action->sa.sa_handler == SIG_DFL)
917 t->signal->flags &= ~SIGNAL_UNKILLABLE;
813 ret = specific_send_sig_info(sig, info, t); 918 ret = specific_send_sig_info(sig, info, t);
814 spin_unlock_irqrestore(&t->sighand->siglock, flags); 919 spin_unlock_irqrestore(&t->sighand->siglock, flags);
815 920
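
Although send_signal() now serves both the per-thread and the process-wide case, the queuing semantics earlier in this file are unchanged: legacy_queue() still lets a classic (non-realtime) signal exist at most once on a pending queue, while realtime signals are queued per instance. That rule is directly observable from userspace; the short test below blocks one signal of each kind, sends each twice, then unblocks them. It is illustrative only, and the exact delivery order is unspecified.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t hits[128];

static void handler(int sig)
{
        hits[sig]++;
}

int main(void)
{
        struct sigaction sa;
        sigset_t block;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGTERM, &sa, NULL);
        sigaction(SIGRTMIN, &sa, NULL);

        sigemptyset(&block);
        sigaddset(&block, SIGTERM);
        sigaddset(&block, SIGRTMIN);
        sigprocmask(SIG_BLOCK, &block, NULL);      /* hold both signals pending */

        kill(getpid(), SIGTERM);
        kill(getpid(), SIGTERM);                   /* coalesced with the pending one */
        kill(getpid(), SIGRTMIN);
        kill(getpid(), SIGRTMIN);                  /* queued as a second instance    */

        sigprocmask(SIG_UNBLOCK, &block, NULL);    /* deliver whatever is pending */

        printf("SIGTERM: %d, SIGRTMIN: %d\n", (int)hits[SIGTERM], (int)hits[SIGRTMIN]);
        return 0;
}

Typically this prints one SIGTERM delivery against two SIGRTMIN deliveries, which is the "exactly one non-rt signal" behaviour the comment in send_signal() refers to.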
@@ -823,134 +928,6 @@ force_sig_specific(int sig, struct task_struct *t)
823} 928}
824 929
825/* 930/*
826 * Test if P wants to take SIG. After we've checked all threads with this,
827 * it's equivalent to finding no threads not blocking SIG. Any threads not
828 * blocking SIG were ruled out because they are not running and already
829 * have pending signals. Such threads will dequeue from the shared queue
830 * as soon as they're available, so putting the signal on the shared queue
831 * will be equivalent to sending it to one such thread.
832 */
833static inline int wants_signal(int sig, struct task_struct *p)
834{
835 if (sigismember(&p->blocked, sig))
836 return 0;
837 if (p->flags & PF_EXITING)
838 return 0;
839 if (sig == SIGKILL)
840 return 1;
841 if (task_is_stopped_or_traced(p))
842 return 0;
843 return task_curr(p) || !signal_pending(p);
844}
845
846static void
847__group_complete_signal(int sig, struct task_struct *p)
848{
849 struct task_struct *t;
850
851 /*
852 * Now find a thread we can wake up to take the signal off the queue.
853 *
854 * If the main thread wants the signal, it gets first crack.
855 * Probably the least surprising to the average bear.
856 */
857 if (wants_signal(sig, p))
858 t = p;
859 else if (thread_group_empty(p))
860 /*
861 * There is just one thread and it does not need to be woken.
862 * It will dequeue unblocked signals before it runs again.
863 */
864 return;
865 else {
866 /*
867 * Otherwise try to find a suitable thread.
868 */
869 t = p->signal->curr_target;
870 if (t == NULL)
871 /* restart balancing at this thread */
872 t = p->signal->curr_target = p;
873
874 while (!wants_signal(sig, t)) {
875 t = next_thread(t);
876 if (t == p->signal->curr_target)
877 /*
878 * No thread needs to be woken.
879 * Any eligible threads will see
880 * the signal in the queue soon.
881 */
882 return;
883 }
884 p->signal->curr_target = t;
885 }
886
887 /*
888 * Found a killable thread. If the signal will be fatal,
889 * then start taking the whole group down immediately.
890 */
891 if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) &&
892 !sigismember(&t->real_blocked, sig) &&
893 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
894 /*
895 * This signal will be fatal to the whole group.
896 */
897 if (!sig_kernel_coredump(sig)) {
898 /*
899 * Start a group exit and wake everybody up.
900 * This way we don't have other threads
901 * running and doing things after a slower
902 * thread has the fatal signal pending.
903 */
904 p->signal->flags = SIGNAL_GROUP_EXIT;
905 p->signal->group_exit_code = sig;
906 p->signal->group_stop_count = 0;
907 t = p;
908 do {
909 sigaddset(&t->pending.signal, SIGKILL);
910 signal_wake_up(t, 1);
911 } while_each_thread(p, t);
912 return;
913 }
914 }
915
916 /*
917 * The signal is already in the shared-pending queue.
918 * Tell the chosen thread to wake up and dequeue it.
919 */
920 signal_wake_up(t, sig == SIGKILL);
921 return;
922}
923
924int
925__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
926{
927 int ret = 0;
928
929 assert_spin_locked(&p->sighand->siglock);
930 handle_stop_signal(sig, p);
931
932 /* Short-circuit ignored signals. */
933 if (sig_ignored(p, sig))
934 return ret;
935
936 if (LEGACY_QUEUE(&p->signal->shared_pending, sig))
937 /* This is a non-RT signal and we already have one queued. */
938 return ret;
939
940 /*
941 * Put this signal on the shared-pending queue, or fail with EAGAIN.
942 * We always use the shared queue for process-wide signals,
943 * to avoid several races.
944 */
945 ret = send_signal(sig, info, p, &p->signal->shared_pending);
946 if (unlikely(ret))
947 return ret;
948
949 __group_complete_signal(sig, p);
950 return 0;
951}
952
953/*
954 * Nuke all other threads in the group. 931 * Nuke all other threads in the group.
955 */ 932 */
956void zap_other_threads(struct task_struct *p) 933void zap_other_threads(struct task_struct *p)
@@ -978,13 +955,11 @@ int __fatal_signal_pending(struct task_struct *tsk)
978} 955}
979EXPORT_SYMBOL(__fatal_signal_pending); 956EXPORT_SYMBOL(__fatal_signal_pending);
980 957
981/*
982 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
983 */
984struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 958struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
985{ 959{
986 struct sighand_struct *sighand; 960 struct sighand_struct *sighand;
987 961
962 rcu_read_lock();
988 for (;;) { 963 for (;;) {
989 sighand = rcu_dereference(tsk->sighand); 964 sighand = rcu_dereference(tsk->sighand);
990 if (unlikely(sighand == NULL)) 965 if (unlikely(sighand == NULL))
@@ -995,6 +970,7 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
995 break; 970 break;
996 spin_unlock_irqrestore(&sighand->siglock, *flags); 971 spin_unlock_irqrestore(&sighand->siglock, *flags);
997 } 972 }
973 rcu_read_unlock();
998 974
999 return sighand; 975 return sighand;
1000} 976}
@@ -1043,9 +1019,6 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1043 struct task_struct *p; 1019 struct task_struct *p;
1044 1020
1045 rcu_read_lock(); 1021 rcu_read_lock();
1046 if (unlikely(sig_needs_tasklist(sig)))
1047 read_lock(&tasklist_lock);
1048
1049retry: 1022retry:
1050 p = pid_task(pid, PIDTYPE_PID); 1023 p = pid_task(pid, PIDTYPE_PID);
1051 if (p) { 1024 if (p) {
@@ -1059,10 +1032,8 @@ retry:
1059 */ 1032 */
1060 goto retry; 1033 goto retry;
1061 } 1034 }
1062
1063 if (unlikely(sig_needs_tasklist(sig)))
1064 read_unlock(&tasklist_lock);
1065 rcu_read_unlock(); 1035 rcu_read_unlock();
1036
1066 return error; 1037 return error;
1067} 1038}
1068 1039
@@ -1159,8 +1130,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1159 */ 1130 */
1160 1131
1161/* 1132/*
1162 * These two are the most common entry points. They send a signal 1133 * The caller must ensure the task can't exit.
1163 * just to the specific thread.
1164 */ 1134 */
1165int 1135int
1166send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1136send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
@@ -1175,17 +1145,9 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1175 if (!valid_signal(sig)) 1145 if (!valid_signal(sig))
1176 return -EINVAL; 1146 return -EINVAL;
1177 1147
1178 /*
1179 * We need the tasklist lock even for the specific
1180 * thread case (when we don't need to follow the group
1181 * lists) in order to avoid races with "p->sighand"
1182 * going away or changing from under us.
1183 */
1184 read_lock(&tasklist_lock);
1185 spin_lock_irqsave(&p->sighand->siglock, flags); 1148 spin_lock_irqsave(&p->sighand->siglock, flags);
1186 ret = specific_send_sig_info(sig, info, p); 1149 ret = specific_send_sig_info(sig, info, p);
1187 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1150 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1188 read_unlock(&tasklist_lock);
1189 return ret; 1151 return ret;
1190} 1152}
1191 1153
@@ -1291,28 +1253,24 @@ void sigqueue_free(struct sigqueue *q)
1291 __sigqueue_free(q); 1253 __sigqueue_free(q);
1292} 1254}
1293 1255
1294int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) 1256int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1295{ 1257{
1258 int sig = q->info.si_signo;
1259 struct sigpending *pending;
1296 unsigned long flags; 1260 unsigned long flags;
1297 int ret = 0; 1261 int ret;
1298 1262
1299 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1263 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1300 1264
1301 /* 1265 ret = -1;
1302 * The rcu based delayed sighand destroy makes it possible to 1266 if (!likely(lock_task_sighand(t, &flags)))
1303 * run this without tasklist lock held. The task struct itself 1267 goto ret;
1304 * cannot go away as create_timer did get_task_struct().
1305 *
1306 * We return -1, when the task is marked exiting, so
1307 * posix_timer_event can redirect it to the group leader
1308 */
1309 rcu_read_lock();
1310 1268
1311 if (!likely(lock_task_sighand(p, &flags))) { 1269 ret = 1; /* the signal is ignored */
1312 ret = -1; 1270 if (!prepare_signal(sig, t))
1313 goto out_err; 1271 goto out;
1314 }
1315 1272
1273 ret = 0;
1316 if (unlikely(!list_empty(&q->list))) { 1274 if (unlikely(!list_empty(&q->list))) {
1317 /* 1275 /*
1318 * If an SI_TIMER entry is already queue just increment 1276 * If an SI_TIMER entry is already queue just increment
@@ -1322,77 +1280,15 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1322 q->info.si_overrun++; 1280 q->info.si_overrun++;
1323 goto out; 1281 goto out;
1324 } 1282 }
1325 /* Short-circuit ignored signals. */
1326 if (sig_ignored(p, sig)) {
1327 ret = 1;
1328 goto out;
1329 }
1330 /*
1331 * Deliver the signal to listening signalfds. This must be called
1332 * with the sighand lock held.
1333 */
1334 signalfd_notify(p, sig);
1335
1336 list_add_tail(&q->list, &p->pending.list);
1337 sigaddset(&p->pending.signal, sig);
1338 if (!sigismember(&p->blocked, sig))
1339 signal_wake_up(p, sig == SIGKILL);
1340
1341out:
1342 unlock_task_sighand(p, &flags);
1343out_err:
1344 rcu_read_unlock();
1345
1346 return ret;
1347}
1348
1349int
1350send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1351{
1352 unsigned long flags;
1353 int ret = 0;
1354
1355 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1356
1357 read_lock(&tasklist_lock);
1358 /* Since it_lock is held, p->sighand cannot be NULL. */
1359 spin_lock_irqsave(&p->sighand->siglock, flags);
1360 handle_stop_signal(sig, p);
1361
1362 /* Short-circuit ignored signals. */
1363 if (sig_ignored(p, sig)) {
1364 ret = 1;
1365 goto out;
1366 }
1367 1283
1368 if (unlikely(!list_empty(&q->list))) { 1284 signalfd_notify(t, sig);
1369 /* 1285 pending = group ? &t->signal->shared_pending : &t->pending;
1370 * If an SI_TIMER entry is already queue just increment 1286 list_add_tail(&q->list, &pending->list);
1371 * the overrun count. Other uses should not try to 1287 sigaddset(&pending->signal, sig);
1372 * send the signal multiple times. 1288 complete_signal(sig, t, group);
1373 */
1374 BUG_ON(q->info.si_code != SI_TIMER);
1375 q->info.si_overrun++;
1376 goto out;
1377 }
1378 /*
1379 * Deliver the signal to listening signalfds. This must be called
1380 * with the sighand lock held.
1381 */
1382 signalfd_notify(p, sig);
1383
1384 /*
1385 * Put this signal on the shared-pending queue.
1386 * We always use the shared queue for process-wide signals,
1387 * to avoid several races.
1388 */
1389 list_add_tail(&q->list, &p->signal->shared_pending.list);
1390 sigaddset(&p->signal->shared_pending.signal, sig);
1391
1392 __group_complete_signal(sig, p);
1393out: 1289out:
1394 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1290 unlock_task_sighand(t, &flags);
1395 read_unlock(&tasklist_lock); 1291ret:
1396 return ret; 1292 return ret;
1397} 1293}
1398 1294
@@ -1723,8 +1619,9 @@ static int do_signal_stop(int signr)
1723 } else { 1619 } else {
1724 struct task_struct *t; 1620 struct task_struct *t;
1725 1621
1726 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1622 if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
1727 unlikely(sig->group_exit_task)) 1623 != SIGNAL_STOP_DEQUEUED) ||
1624 unlikely(signal_group_exit(sig)))
1728 return 0; 1625 return 0;
1729 /* 1626 /*
1730 * There is no group stop already in progress. 1627 * There is no group stop already in progress.
@@ -1799,8 +1696,9 @@ static int ptrace_signal(int signr, siginfo_t *info,
1799int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 1696int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1800 struct pt_regs *regs, void *cookie) 1697 struct pt_regs *regs, void *cookie)
1801{ 1698{
1802 sigset_t *mask = &current->blocked; 1699 struct sighand_struct *sighand = current->sighand;
1803 int signr = 0; 1700 struct signal_struct *signal = current->signal;
1701 int signr;
1804 1702
1805relock: 1703relock:
1806 /* 1704 /*
@@ -1811,16 +1709,32 @@ relock:
1811 */ 1709 */
1812 try_to_freeze(); 1710 try_to_freeze();
1813 1711
1814 spin_lock_irq(&current->sighand->siglock); 1712 spin_lock_irq(&sighand->siglock);
1713 /*
1714 * Every stopped thread goes here after wakeup. Check to see if
1715 * we should notify the parent, prepare_signal(SIGCONT) encodes
1716 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1717 */
1718 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1719 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1720 ? CLD_CONTINUED : CLD_STOPPED;
1721 signal->flags &= ~SIGNAL_CLD_MASK;
1722 spin_unlock_irq(&sighand->siglock);
1723
1724 read_lock(&tasklist_lock);
1725 do_notify_parent_cldstop(current->group_leader, why);
1726 read_unlock(&tasklist_lock);
1727 goto relock;
1728 }
1729
1815 for (;;) { 1730 for (;;) {
1816 struct k_sigaction *ka; 1731 struct k_sigaction *ka;
1817 1732
1818 if (unlikely(current->signal->group_stop_count > 0) && 1733 if (unlikely(signal->group_stop_count > 0) &&
1819 do_signal_stop(0)) 1734 do_signal_stop(0))
1820 goto relock; 1735 goto relock;
1821 1736
1822 signr = dequeue_signal(current, mask, info); 1737 signr = dequeue_signal(current, &current->blocked, info);
1823
1824 if (!signr) 1738 if (!signr)
1825 break; /* will return 0 */ 1739 break; /* will return 0 */
1826 1740
@@ -1830,7 +1744,7 @@ relock:
1830 continue; 1744 continue;
1831 } 1745 }
1832 1746
1833 ka = &current->sighand->action[signr-1]; 1747 ka = &sighand->action[signr-1];
1834 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1748 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1835 continue; 1749 continue;
1836 if (ka->sa.sa_handler != SIG_DFL) { 1750 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1852,7 +1766,8 @@ relock:
1852 /* 1766 /*
1853 * Global init gets no signals it doesn't want. 1767 * Global init gets no signals it doesn't want.
1854 */ 1768 */
1855 if (is_global_init(current)) 1769 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1770 !signal_group_exit(signal))
1856 continue; 1771 continue;
1857 1772
1858 if (sig_kernel_stop(signr)) { 1773 if (sig_kernel_stop(signr)) {
@@ -1867,14 +1782,14 @@ relock:
1867 * We need to check for that and bail out if necessary. 1782 * We need to check for that and bail out if necessary.
1868 */ 1783 */
1869 if (signr != SIGSTOP) { 1784 if (signr != SIGSTOP) {
1870 spin_unlock_irq(&current->sighand->siglock); 1785 spin_unlock_irq(&sighand->siglock);
1871 1786
1872 /* signals can be posted during this window */ 1787 /* signals can be posted during this window */
1873 1788
1874 if (is_current_pgrp_orphaned()) 1789 if (is_current_pgrp_orphaned())
1875 goto relock; 1790 goto relock;
1876 1791
1877 spin_lock_irq(&current->sighand->siglock); 1792 spin_lock_irq(&sighand->siglock);
1878 } 1793 }
1879 1794
1880 if (likely(do_signal_stop(signr))) { 1795 if (likely(do_signal_stop(signr))) {
@@ -1889,15 +1804,16 @@ relock:
1889 continue; 1804 continue;
1890 } 1805 }
1891 1806
1892 spin_unlock_irq(&current->sighand->siglock); 1807 spin_unlock_irq(&sighand->siglock);
1893 1808
1894 /* 1809 /*
1895 * Anything else is fatal, maybe with a core dump. 1810 * Anything else is fatal, maybe with a core dump.
1896 */ 1811 */
1897 current->flags |= PF_SIGNALED; 1812 current->flags |= PF_SIGNALED;
1898 if ((signr != SIGKILL) && print_fatal_signals) 1813
1899 print_fatal_signal(regs, signr);
1900 if (sig_kernel_coredump(signr)) { 1814 if (sig_kernel_coredump(signr)) {
1815 if (print_fatal_signals)
1816 print_fatal_signal(regs, signr);
1901 /* 1817 /*
1902 * If it was able to dump core, this kills all 1818 * If it was able to dump core, this kills all
1903 * other threads in the group and synchronizes with 1819 * other threads in the group and synchronizes with
@@ -1915,7 +1831,7 @@ relock:
1915 do_group_exit(signr); 1831 do_group_exit(signr);
1916 /* NOTREACHED */ 1832 /* NOTREACHED */
1917 } 1833 }
1918 spin_unlock_irq(&current->sighand->siglock); 1834 spin_unlock_irq(&sighand->siglock);
1919 return signr; 1835 return signr;
1920} 1836}
1921 1837
@@ -2259,6 +2175,7 @@ static int do_tkill(int tgid, int pid, int sig)
2259 int error; 2175 int error;
2260 struct siginfo info; 2176 struct siginfo info;
2261 struct task_struct *p; 2177 struct task_struct *p;
2178 unsigned long flags;
2262 2179
2263 error = -ESRCH; 2180 error = -ESRCH;
2264 info.si_signo = sig; 2181 info.si_signo = sig;
@@ -2267,22 +2184,24 @@ static int do_tkill(int tgid, int pid, int sig)
2267 info.si_pid = task_tgid_vnr(current); 2184 info.si_pid = task_tgid_vnr(current);
2268 info.si_uid = current->uid; 2185 info.si_uid = current->uid;
2269 2186
2270 read_lock(&tasklist_lock); 2187 rcu_read_lock();
2271 p = find_task_by_vpid(pid); 2188 p = find_task_by_vpid(pid);
2272 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2189 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2273 error = check_kill_permission(sig, &info, p); 2190 error = check_kill_permission(sig, &info, p);
2274 /* 2191 /*
2275 * The null signal is a permissions and process existence 2192 * The null signal is a permissions and process existence
2276 * probe. No signal is actually delivered. 2193 * probe. No signal is actually delivered.
2194 *
2195 * If lock_task_sighand() fails we pretend the task dies
2196 * after receiving the signal. The window is tiny, and the
2197 * signal is private anyway.
2277 */ 2198 */
2278 if (!error && sig && p->sighand) { 2199 if (!error && sig && lock_task_sighand(p, &flags)) {
2279 spin_lock_irq(&p->sighand->siglock);
2280 handle_stop_signal(sig, p);
2281 error = specific_send_sig_info(sig, &info, p); 2200 error = specific_send_sig_info(sig, &info, p);
2282 spin_unlock_irq(&p->sighand->siglock); 2201 unlock_task_sighand(p, &flags);
2283 } 2202 }
2284 } 2203 }
2285 read_unlock(&tasklist_lock); 2204 rcu_read_unlock();
2286 2205
2287 return error; 2206 return error;
2288} 2207}
@@ -2339,13 +2258,14 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2339 2258
2340int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2259int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2341{ 2260{
2261 struct task_struct *t = current;
2342 struct k_sigaction *k; 2262 struct k_sigaction *k;
2343 sigset_t mask; 2263 sigset_t mask;
2344 2264
2345 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 2265 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
2346 return -EINVAL; 2266 return -EINVAL;
2347 2267
2348 k = &current->sighand->action[sig-1]; 2268 k = &t->sighand->action[sig-1];
2349 2269
2350 spin_lock_irq(&current->sighand->siglock); 2270 spin_lock_irq(&current->sighand->siglock);
2351 if (oact) 2271 if (oact)
@@ -2366,9 +2286,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2366 * (for example, SIGCHLD), shall cause the pending signal to 2286 * (for example, SIGCHLD), shall cause the pending signal to
2367 * be discarded, whether or not it is blocked" 2287 * be discarded, whether or not it is blocked"
2368 */ 2288 */
2369 if (act->sa.sa_handler == SIG_IGN || 2289 if (__sig_ignored(t, sig)) {
2370 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2371 struct task_struct *t = current;
2372 sigemptyset(&mask); 2290 sigemptyset(&mask);
2373 sigaddset(&mask, sig); 2291 sigaddset(&mask, sig);
2374 rm_from_queue_full(&mask, &t->signal->shared_pending); 2292 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2623,7 +2541,7 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2623 2541
2624 current->state = TASK_INTERRUPTIBLE; 2542 current->state = TASK_INTERRUPTIBLE;
2625 schedule(); 2543 schedule();
2626 set_thread_flag(TIF_RESTORE_SIGMASK); 2544 set_restore_sigmask();
2627 return -ERESTARTNOHAND; 2545 return -ERESTARTNOHAND;
2628} 2546}
2629#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 2547#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
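The kernel/signal.c hunks above fold the private and group delivery paths into a single send_signal()/complete_signal() pair, drop tasklist_lock from the sending side, and make lock_task_sighand() take rcu_read_lock()/rcu_read_unlock() itself. A minimal sketch of the caller pattern this converges on, modeled on the do_tkill() change above (sketch_send_private_signal is a hypothetical name; specific_send_sig_info() is static to signal.c, so this only illustrates in-file use):

static int sketch_send_private_signal(pid_t vpid, int sig, struct siginfo *info)
{
	struct task_struct *p;
	unsigned long flags;
	int error = -ESRCH;

	rcu_read_lock();			/* keeps the looked-up task valid */
	p = find_task_by_vpid(vpid);
	if (p && lock_task_sighand(p, &flags)) {
		error = specific_send_sig_info(sig, info, p);
		unlock_task_sighand(p, &flags);
	}
	rcu_read_unlock();
	return error;
}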
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3c44956ee7e2..36e061740047 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -589,16 +589,20 @@ static void takeover_tasklets(unsigned int cpu)
589 local_irq_disable(); 589 local_irq_disable();
590 590
591 /* Find end, append list for that CPU. */ 591 /* Find end, append list for that CPU. */
592 *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head; 592 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
593 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 593 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head;
594 per_cpu(tasklet_vec, cpu).head = NULL; 594 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
595 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 595 per_cpu(tasklet_vec, cpu).head = NULL;
596 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
597 }
596 raise_softirq_irqoff(TASKLET_SOFTIRQ); 598 raise_softirq_irqoff(TASKLET_SOFTIRQ);
597 599
598 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 600 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
599 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 601 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
600 per_cpu(tasklet_hi_vec, cpu).head = NULL; 602 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
601 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 603 per_cpu(tasklet_hi_vec, cpu).head = NULL;
604 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
605 }
602 raise_softirq_irqoff(HI_SOFTIRQ); 606 raise_softirq_irqoff(HI_SOFTIRQ);
603 607
604 local_irq_enable(); 608 local_irq_enable();
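The takeover_tasklets() change guards each splice with an emptiness check. The point is the tail pointer: an empty per-CPU list has tail == &head, so splicing it unconditionally would leave the local tail pointing into the dead CPU's per-cpu area, and later tasklet_schedule() calls would append to the wrong list. A stripped-down sketch of the same head/tail-pointer splice (sketch_* names are illustrative only):

struct sketch_item {
	struct sketch_item *next;
};

struct sketch_list {
	struct sketch_item *head;
	struct sketch_item **tail;	/* &head when empty, else &last->next */
};

static void sketch_splice(struct sketch_list *dst, struct sketch_list *src)
{
	if (src->tail == &src->head)	/* empty source: nothing to take over */
		return;

	*dst->tail = src->head;		/* hook src's chain onto dst's last ->next */
	dst->tail = src->tail;		/* dst now ends where src ended */
	src->head = NULL;		/* put src back into the empty state */
	src->tail = &src->head;
}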
diff --git a/kernel/sys.c b/kernel/sys.c
index f2a451366953..895d2d4c9493 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -978,8 +978,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
978 goto out; 978 goto out;
979 979
980 if (task_pgrp(p) != pgrp) { 980 if (task_pgrp(p) != pgrp) {
981 detach_pid(p, PIDTYPE_PGID); 981 change_pid(p, PIDTYPE_PGID, pgrp);
982 attach_pid(p, PIDTYPE_PGID, pgrp);
983 set_task_pgrp(p, pid_nr(pgrp)); 982 set_task_pgrp(p, pid_nr(pgrp));
984 } 983 }
985 984
@@ -992,54 +991,67 @@ out:
992 991
993asmlinkage long sys_getpgid(pid_t pid) 992asmlinkage long sys_getpgid(pid_t pid)
994{ 993{
994 struct task_struct *p;
995 struct pid *grp;
996 int retval;
997
998 rcu_read_lock();
995 if (!pid) 999 if (!pid)
996 return task_pgrp_vnr(current); 1000 grp = task_pgrp(current);
997 else { 1001 else {
998 int retval;
999 struct task_struct *p;
1000
1001 read_lock(&tasklist_lock);
1002 p = find_task_by_vpid(pid);
1003 retval = -ESRCH; 1002 retval = -ESRCH;
1004 if (p) { 1003 p = find_task_by_vpid(pid);
1005 retval = security_task_getpgid(p); 1004 if (!p)
1006 if (!retval) 1005 goto out;
1007 retval = task_pgrp_vnr(p); 1006 grp = task_pgrp(p);
1008 } 1007 if (!grp)
1009 read_unlock(&tasklist_lock); 1008 goto out;
1010 return retval; 1009
1010 retval = security_task_getpgid(p);
1011 if (retval)
1012 goto out;
1011 } 1013 }
1014 retval = pid_vnr(grp);
1015out:
1016 rcu_read_unlock();
1017 return retval;
1012} 1018}
1013 1019
1014#ifdef __ARCH_WANT_SYS_GETPGRP 1020#ifdef __ARCH_WANT_SYS_GETPGRP
1015 1021
1016asmlinkage long sys_getpgrp(void) 1022asmlinkage long sys_getpgrp(void)
1017{ 1023{
1018 /* SMP - assuming writes are word atomic this is fine */ 1024 return sys_getpgid(0);
1019 return task_pgrp_vnr(current);
1020} 1025}
1021 1026
1022#endif 1027#endif
1023 1028
1024asmlinkage long sys_getsid(pid_t pid) 1029asmlinkage long sys_getsid(pid_t pid)
1025{ 1030{
1031 struct task_struct *p;
1032 struct pid *sid;
1033 int retval;
1034
1035 rcu_read_lock();
1026 if (!pid) 1036 if (!pid)
1027 return task_session_vnr(current); 1037 sid = task_session(current);
1028 else { 1038 else {
1029 int retval;
1030 struct task_struct *p;
1031
1032 rcu_read_lock();
1033 p = find_task_by_vpid(pid);
1034 retval = -ESRCH; 1039 retval = -ESRCH;
1035 if (p) { 1040 p = find_task_by_vpid(pid);
1036 retval = security_task_getsid(p); 1041 if (!p)
1037 if (!retval) 1042 goto out;
1038 retval = task_session_vnr(p); 1043 sid = task_session(p);
1039 } 1044 if (!sid)
1040 rcu_read_unlock(); 1045 goto out;
1041 return retval; 1046
1047 retval = security_task_getsid(p);
1048 if (retval)
1049 goto out;
1042 } 1050 }
1051 retval = pid_vnr(sid);
1052out:
1053 rcu_read_unlock();
1054 return retval;
1043} 1055}
1044 1056
1045asmlinkage long sys_setsid(void) 1057asmlinkage long sys_setsid(void)
@@ -1545,6 +1557,19 @@ out:
1545 * 1557 *
1546 */ 1558 */
1547 1559
1560static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
1561 cputime_t *utimep, cputime_t *stimep)
1562{
1563 *utimep = cputime_add(*utimep, t->utime);
1564 *stimep = cputime_add(*stimep, t->stime);
1565 r->ru_nvcsw += t->nvcsw;
1566 r->ru_nivcsw += t->nivcsw;
1567 r->ru_minflt += t->min_flt;
1568 r->ru_majflt += t->maj_flt;
1569 r->ru_inblock += task_io_get_inblock(t);
1570 r->ru_oublock += task_io_get_oublock(t);
1571}
1572
1548static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1573static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1549{ 1574{
1550 struct task_struct *t; 1575 struct task_struct *t;
@@ -1554,12 +1579,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1554 memset((char *) r, 0, sizeof *r); 1579 memset((char *) r, 0, sizeof *r);
1555 utime = stime = cputime_zero; 1580 utime = stime = cputime_zero;
1556 1581
1557 rcu_read_lock(); 1582 if (who == RUSAGE_THREAD) {
1558 if (!lock_task_sighand(p, &flags)) { 1583 accumulate_thread_rusage(p, r, &utime, &stime);
1559 rcu_read_unlock(); 1584 goto out;
1560 return;
1561 } 1585 }
1562 1586
1587 if (!lock_task_sighand(p, &flags))
1588 return;
1589
1563 switch (who) { 1590 switch (who) {
1564 case RUSAGE_BOTH: 1591 case RUSAGE_BOTH:
1565 case RUSAGE_CHILDREN: 1592 case RUSAGE_CHILDREN:
@@ -1586,14 +1613,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1586 r->ru_oublock += p->signal->oublock; 1613 r->ru_oublock += p->signal->oublock;
1587 t = p; 1614 t = p;
1588 do { 1615 do {
1589 utime = cputime_add(utime, t->utime); 1616 accumulate_thread_rusage(t, r, &utime, &stime);
1590 stime = cputime_add(stime, t->stime);
1591 r->ru_nvcsw += t->nvcsw;
1592 r->ru_nivcsw += t->nivcsw;
1593 r->ru_minflt += t->min_flt;
1594 r->ru_majflt += t->maj_flt;
1595 r->ru_inblock += task_io_get_inblock(t);
1596 r->ru_oublock += task_io_get_oublock(t);
1597 t = next_thread(t); 1617 t = next_thread(t);
1598 } while (t != p); 1618 } while (t != p);
1599 break; 1619 break;
@@ -1601,10 +1621,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1601 default: 1621 default:
1602 BUG(); 1622 BUG();
1603 } 1623 }
1604
1605 unlock_task_sighand(p, &flags); 1624 unlock_task_sighand(p, &flags);
1606 rcu_read_unlock();
1607 1625
1626out:
1608 cputime_to_timeval(utime, &r->ru_utime); 1627 cputime_to_timeval(utime, &r->ru_utime);
1609 cputime_to_timeval(stime, &r->ru_stime); 1628 cputime_to_timeval(stime, &r->ru_stime);
1610} 1629}
@@ -1618,7 +1637,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1618 1637
1619asmlinkage long sys_getrusage(int who, struct rusage __user *ru) 1638asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
1620{ 1639{
1621 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) 1640 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1641 who != RUSAGE_THREAD)
1622 return -EINVAL; 1642 return -EINVAL;
1623 return getrusage(current, who, ru); 1643 return getrusage(current, who, ru);
1624} 1644}
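The getrusage() hunks factor the per-thread totals into accumulate_thread_rusage() and accept a new RUSAGE_THREAD selector that reports only the calling thread. A hypothetical user-space example follows; the fallback RUSAGE_THREAD value of 1 is an assumption for libc headers that predate this feature:

#include <stdio.h>
#include <sys/resource.h>

#ifndef RUSAGE_THREAD
#define RUSAGE_THREAD	1	/* assumed ABI value; not in older headers */
#endif

int main(void)
{
	struct rusage ru;

	/* Per-thread usage, not the whole process as with RUSAGE_SELF. */
	if (getrusage(RUSAGE_THREAD, &ru) == 0)
		printf("thread: %ld minor faults, %ld voluntary switches\n",
		       ru.ru_minflt, ru.ru_nvcsw);
	return 0;
}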
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fd3364827ccf..d7ffdc59816a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -38,6 +38,7 @@
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/initrd.h> 40#include <linux/initrd.h>
41#include <linux/key.h>
41#include <linux/times.h> 42#include <linux/times.h>
42#include <linux/limits.h> 43#include <linux/limits.h>
43#include <linux/dcache.h> 44#include <linux/dcache.h>
@@ -144,12 +145,6 @@ extern int no_unaligned_warning;
144extern int max_lock_depth; 145extern int max_lock_depth;
145#endif 146#endif
146 147
147#ifdef CONFIG_SYSCTL_SYSCALL
148static int parse_table(int __user *, int, void __user *, size_t __user *,
149 void __user *, size_t, struct ctl_table *);
150#endif
151
152
153#ifdef CONFIG_PROC_SYSCTL 148#ifdef CONFIG_PROC_SYSCTL
154static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 149static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
155 void __user *buffer, size_t *lenp, loff_t *ppos); 150 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -809,6 +804,14 @@ static struct ctl_table kern_table[] = {
809 .proc_handler = &proc_dostring, 804 .proc_handler = &proc_dostring,
810 .strategy = &sysctl_string, 805 .strategy = &sysctl_string,
811 }, 806 },
807#ifdef CONFIG_KEYS
808 {
809 .ctl_name = CTL_UNNUMBERED,
810 .procname = "keys",
811 .mode = 0555,
812 .child = key_sysctls,
813 },
814#endif
812/* 815/*
813 * NOTE: do not add new entries to this table unless you have read 816 * NOTE: do not add new entries to this table unless you have read
814 * Documentation/sysctl/ctl_unnumbered.txt 817 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1430,6 +1433,76 @@ void register_sysctl_root(struct ctl_table_root *root)
1430} 1433}
1431 1434
1432#ifdef CONFIG_SYSCTL_SYSCALL 1435#ifdef CONFIG_SYSCTL_SYSCALL
1436/* Perform the actual read/write of a sysctl table entry. */
1437static int do_sysctl_strategy(struct ctl_table_root *root,
1438 struct ctl_table *table,
1439 int __user *name, int nlen,
1440 void __user *oldval, size_t __user *oldlenp,
1441 void __user *newval, size_t newlen)
1442{
1443 int op = 0, rc;
1444
1445 if (oldval)
1446 op |= 004;
1447 if (newval)
1448 op |= 002;
1449 if (sysctl_perm(root, table, op))
1450 return -EPERM;
1451
1452 if (table->strategy) {
1453 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1454 newval, newlen);
1455 if (rc < 0)
1456 return rc;
1457 if (rc > 0)
1458 return 0;
1459 }
1460
1461 /* If there is no strategy routine, or if the strategy returns
1462 * zero, proceed with automatic r/w */
1463 if (table->data && table->maxlen) {
1464 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1465 newval, newlen);
1466 if (rc < 0)
1467 return rc;
1468 }
1469 return 0;
1470}
1471
1472static int parse_table(int __user *name, int nlen,
1473 void __user *oldval, size_t __user *oldlenp,
1474 void __user *newval, size_t newlen,
1475 struct ctl_table_root *root,
1476 struct ctl_table *table)
1477{
1478 int n;
1479repeat:
1480 if (!nlen)
1481 return -ENOTDIR;
1482 if (get_user(n, name))
1483 return -EFAULT;
1484 for ( ; table->ctl_name || table->procname; table++) {
1485 if (!table->ctl_name)
1486 continue;
1487 if (n == table->ctl_name) {
1488 int error;
1489 if (table->child) {
1490 if (sysctl_perm(root, table, 001))
1491 return -EPERM;
1492 name++;
1493 nlen--;
1494 table = table->child;
1495 goto repeat;
1496 }
1497 error = do_sysctl_strategy(root, table, name, nlen,
1498 oldval, oldlenp,
1499 newval, newlen);
1500 return error;
1501 }
1502 }
1503 return -ENOTDIR;
1504}
1505
1433int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1506int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1434 void __user *newval, size_t newlen) 1507 void __user *newval, size_t newlen)
1435{ 1508{
@@ -1447,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1447 for (head = sysctl_head_next(NULL); head; 1520 for (head = sysctl_head_next(NULL); head;
1448 head = sysctl_head_next(head)) { 1521 head = sysctl_head_next(head)) {
1449 error = parse_table(name, nlen, oldval, oldlenp, 1522 error = parse_table(name, nlen, oldval, oldlenp,
1450 newval, newlen, head->ctl_table); 1523 newval, newlen,
1524 head->root, head->ctl_table);
1451 if (error != -ENOTDIR) { 1525 if (error != -ENOTDIR) {
1452 sysctl_head_finish(head); 1526 sysctl_head_finish(head);
1453 break; 1527 break;
@@ -1493,84 +1567,22 @@ static int test_perm(int mode, int op)
1493 return -EACCES; 1567 return -EACCES;
1494} 1568}
1495 1569
1496int sysctl_perm(struct ctl_table *table, int op) 1570int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1497{ 1571{
1498 int error; 1572 int error;
1573 int mode;
1574
1499 error = security_sysctl(table, op); 1575 error = security_sysctl(table, op);
1500 if (error) 1576 if (error)
1501 return error; 1577 return error;
1502 return test_perm(table->mode, op);
1503}
1504
1505#ifdef CONFIG_SYSCTL_SYSCALL
1506static int parse_table(int __user *name, int nlen,
1507 void __user *oldval, size_t __user *oldlenp,
1508 void __user *newval, size_t newlen,
1509 struct ctl_table *table)
1510{
1511 int n;
1512repeat:
1513 if (!nlen)
1514 return -ENOTDIR;
1515 if (get_user(n, name))
1516 return -EFAULT;
1517 for ( ; table->ctl_name || table->procname; table++) {
1518 if (!table->ctl_name)
1519 continue;
1520 if (n == table->ctl_name) {
1521 int error;
1522 if (table->child) {
1523 if (sysctl_perm(table, 001))
1524 return -EPERM;
1525 name++;
1526 nlen--;
1527 table = table->child;
1528 goto repeat;
1529 }
1530 error = do_sysctl_strategy(table, name, nlen,
1531 oldval, oldlenp,
1532 newval, newlen);
1533 return error;
1534 }
1535 }
1536 return -ENOTDIR;
1537}
1538 1578
1539/* Perform the actual read/write of a sysctl table entry. */ 1579 if (root->permissions)
1540int do_sysctl_strategy (struct ctl_table *table, 1580 mode = root->permissions(root, current->nsproxy, table);
1541 int __user *name, int nlen, 1581 else
1542 void __user *oldval, size_t __user *oldlenp, 1582 mode = table->mode;
1543 void __user *newval, size_t newlen)
1544{
1545 int op = 0, rc;
1546
1547 if (oldval)
1548 op |= 004;
1549 if (newval)
1550 op |= 002;
1551 if (sysctl_perm(table, op))
1552 return -EPERM;
1553 1583
1554 if (table->strategy) { 1584 return test_perm(mode, op);
1555 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1556 newval, newlen);
1557 if (rc < 0)
1558 return rc;
1559 if (rc > 0)
1560 return 0;
1561 }
1562
1563 /* If there is no strategy routine, or if the strategy returns
1564 * zero, proceed with automatic r/w */
1565 if (table->data && table->maxlen) {
1566 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1567 newval, newlen);
1568 if (rc < 0)
1569 return rc;
1570 }
1571 return 0;
1572} 1585}
1573#endif /* CONFIG_SYSCTL_SYSCALL */
1574 1586
1575static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1587static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1576{ 1588{
@@ -1583,9 +1595,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1583 1595
1584static __init int sysctl_init(void) 1596static __init int sysctl_init(void)
1585{ 1597{
1586 int err;
1587 sysctl_set_parent(NULL, root_table); 1598 sysctl_set_parent(NULL, root_table);
1588 err = sysctl_check_table(current->nsproxy, root_table); 1599#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1600 {
1601 int err;
1602 err = sysctl_check_table(current->nsproxy, root_table);
1603 }
1604#endif
1589 return 0; 1605 return 0;
1590} 1606}
1591 1607
@@ -1712,10 +1728,12 @@ struct ctl_table_header *__register_sysctl_paths(
1712 header->unregistering = NULL; 1728 header->unregistering = NULL;
1713 header->root = root; 1729 header->root = root;
1714 sysctl_set_parent(NULL, header->ctl_table); 1730 sysctl_set_parent(NULL, header->ctl_table);
1731#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1715 if (sysctl_check_table(namespaces, header->ctl_table)) { 1732 if (sysctl_check_table(namespaces, header->ctl_table)) {
1716 kfree(header); 1733 kfree(header);
1717 return NULL; 1734 return NULL;
1718 } 1735 }
1736#endif
1719 spin_lock(&sysctl_lock); 1737 spin_lock(&sysctl_lock);
1720 header_list = lookup_header_list(root, namespaces); 1738 header_list = lookup_header_list(root, namespaces);
1721 list_add_tail(&header->ctl_entry, header_list); 1739 list_add_tail(&header->ctl_entry, header_list);
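Beyond moving parse_table()/do_sysctl_strategy() fully under CONFIG_SYSCTL_SYSCALL, the sysctl hunks thread the ctl_table_root through the binary-sysctl path and let a root override per-table permissions: sysctl_perm() now calls root->permissions(root, current->nsproxy, table) when the hook is set and tests the returned mode instead of table->mode. A hypothetical root using the hook might look like the sketch below (demo_* names are illustrative and the other ctl_table_root fields are omitted); it would be plugged in with register_sysctl_root(&demo_root):

static int demo_permissions(struct ctl_table_root *root,
			    struct nsproxy *namespaces,
			    struct ctl_table *table)
{
	/* e.g. drop write permission outside the initial namespace */
	if (namespaces != &init_nsproxy)
		return table->mode & ~0222;
	return table->mode;
}

static struct ctl_table_root demo_root = {
	.permissions	= demo_permissions,
};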
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 07e86a828073..4a23517169a6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -183,7 +183,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
183 183
184 if (!tsk) { 184 if (!tsk) {
185 rcu_read_lock(); 185 rcu_read_lock();
186 tsk = find_task_by_pid(pid); 186 tsk = find_task_by_vpid(pid);
187 if (tsk) 187 if (tsk)
188 get_task_struct(tsk); 188 get_task_struct(tsk);
189 rcu_read_unlock(); 189 rcu_read_unlock();
@@ -230,7 +230,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
230 */ 230 */
231 rcu_read_lock(); 231 rcu_read_lock();
232 if (!first) 232 if (!first)
233 first = find_task_by_pid(tgid); 233 first = find_task_by_vpid(tgid);
234 234
235 if (!first || !lock_task_sighand(first, &flags)) 235 if (!first || !lock_task_sighand(first, &flags))
236 goto out; 236 goto out;
@@ -547,7 +547,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
547 if (!stats) 547 if (!stats)
548 goto err; 548 goto err;
549 549
550 rc = fill_pid(tsk->pid, tsk, stats); 550 rc = fill_pid(-1, tsk, stats);
551 if (rc < 0) 551 if (rc < 0)
552 goto err; 552 goto err;
553 553
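Switching taskstats from find_task_by_pid() to find_task_by_vpid() makes the lookup relative to the caller's pid namespace, and fill_pid(-1, tsk, stats) in taskstats_exit() makes it explicit that the pid argument is unused when the task is already known. The lookup pattern this relies on, shown standalone (sketch_get_task is a hypothetical name):

static struct task_struct *sketch_get_task(pid_t vpid)
{
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_vpid(vpid);	/* resolved in current's pid namespace */
	if (tsk)
		get_task_struct(tsk);	/* pin it before dropping RCU */
	rcu_read_unlock();

	return tsk;			/* caller does put_task_struct() when done */
}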
diff --git a/kernel/time.c b/kernel/time.c
index 35d373a98782..6a08660b4fac 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,6 +35,8 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40#include <asm/unistd.h> 42#include <asm/unistd.h>
@@ -244,7 +246,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
244 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); 246 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
245#else 247#else
246# if BITS_PER_LONG == 32 248# if BITS_PER_LONG == 32
247 return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; 249 return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
248# else 250# else
249 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; 251 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
250# endif 252# endif
@@ -260,7 +262,7 @@ unsigned int inline jiffies_to_usecs(const unsigned long j)
260 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); 262 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
261#else 263#else
262# if BITS_PER_LONG == 32 264# if BITS_PER_LONG == 32
263 return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; 265 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
264# else 266# else
265 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; 267 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
266# endif 268# endif
@@ -390,13 +392,17 @@ EXPORT_SYMBOL(set_normalized_timespec);
390struct timespec ns_to_timespec(const s64 nsec) 392struct timespec ns_to_timespec(const s64 nsec)
391{ 393{
392 struct timespec ts; 394 struct timespec ts;
395 s32 rem;
393 396
394 if (!nsec) 397 if (!nsec)
395 return (struct timespec) {0, 0}; 398 return (struct timespec) {0, 0};
396 399
397 ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); 400 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
398 if (unlikely(nsec < 0)) 401 if (unlikely(rem < 0)) {
399 set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); 402 ts.tv_sec--;
403 rem += NSEC_PER_SEC;
404 }
405 ts.tv_nsec = rem;
400 406
401 return ts; 407 return ts;
402} 408}
@@ -470,7 +476,7 @@ unsigned long msecs_to_jiffies(const unsigned int m)
470 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) 476 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
471 return MAX_JIFFY_OFFSET; 477 return MAX_JIFFY_OFFSET;
472 478
473 return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) 479 return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
474 >> MSEC_TO_HZ_SHR32; 480 >> MSEC_TO_HZ_SHR32;
475#endif 481#endif
476} 482}
@@ -485,7 +491,7 @@ unsigned long usecs_to_jiffies(const unsigned int u)
485#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) 491#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
486 return u * (HZ / USEC_PER_SEC); 492 return u * (HZ / USEC_PER_SEC);
487#else 493#else
488 return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) 494 return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
489 >> USEC_TO_HZ_SHR32; 495 >> USEC_TO_HZ_SHR32;
490#endif 496#endif
491} 497}
@@ -526,8 +532,10 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
526 * Convert jiffies to nanoseconds and separate with 532 * Convert jiffies to nanoseconds and separate with
527 * one divide. 533 * one divide.
528 */ 534 */
529 u64 nsec = (u64)jiffies * TICK_NSEC; 535 u32 rem;
530 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); 536 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
537 NSEC_PER_SEC, &rem);
538 value->tv_nsec = rem;
531} 539}
532EXPORT_SYMBOL(jiffies_to_timespec); 540EXPORT_SYMBOL(jiffies_to_timespec);
533 541
@@ -565,12 +573,11 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
565 * Convert jiffies to nanoseconds and separate with 573 * Convert jiffies to nanoseconds and separate with
566 * one divide. 574 * one divide.
567 */ 575 */
568 u64 nsec = (u64)jiffies * TICK_NSEC; 576 u32 rem;
569 long tv_usec;
570 577
571 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); 578 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
572 tv_usec /= NSEC_PER_USEC; 579 NSEC_PER_SEC, &rem);
573 value->tv_usec = tv_usec; 580 value->tv_usec = rem / NSEC_PER_USEC;
574} 581}
575EXPORT_SYMBOL(jiffies_to_timeval); 582EXPORT_SYMBOL(jiffies_to_timeval);
576 583
@@ -586,9 +593,7 @@ clock_t jiffies_to_clock_t(long x)
586 return x / (HZ / USER_HZ); 593 return x / (HZ / USER_HZ);
587# endif 594# endif
588#else 595#else
589 u64 tmp = (u64)x * TICK_NSEC; 596 return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
590 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
591 return (long)tmp;
592#endif 597#endif
593} 598}
594EXPORT_SYMBOL(jiffies_to_clock_t); 599EXPORT_SYMBOL(jiffies_to_clock_t);
@@ -600,16 +605,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
600 return ~0UL; 605 return ~0UL;
601 return x * (HZ / USER_HZ); 606 return x * (HZ / USER_HZ);
602#else 607#else
603 u64 jif;
604
605 /* Don't worry about loss of precision here .. */ 608 /* Don't worry about loss of precision here .. */
606 if (x >= ~0UL / HZ * USER_HZ) 609 if (x >= ~0UL / HZ * USER_HZ)
607 return ~0UL; 610 return ~0UL;
608 611
609 /* .. but do try to contain it here */ 612 /* .. but do try to contain it here */
610 jif = x * (u64) HZ; 613 return div_u64((u64)x * HZ, USER_HZ);
611 do_div(jif, USER_HZ);
612 return jif;
613#endif 614#endif
614} 615}
615EXPORT_SYMBOL(clock_t_to_jiffies); 616EXPORT_SYMBOL(clock_t_to_jiffies);
@@ -618,10 +619,9 @@ u64 jiffies_64_to_clock_t(u64 x)
618{ 619{
619#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 620#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
620# if HZ < USER_HZ 621# if HZ < USER_HZ
621 x *= USER_HZ; 622 x = div_u64(x * USER_HZ, HZ);
622 do_div(x, HZ);
623# elif HZ > USER_HZ 623# elif HZ > USER_HZ
624 do_div(x, HZ / USER_HZ); 624 x = div_u64(x, HZ / USER_HZ);
625# else 625# else
626 /* Nothing to do */ 626 /* Nothing to do */
627# endif 627# endif
@@ -631,8 +631,7 @@ u64 jiffies_64_to_clock_t(u64 x)
631 * but even this doesn't overflow in hundreds of years 631 * but even this doesn't overflow in hundreds of years
632 * in 64 bits, so.. 632 * in 64 bits, so..
633 */ 633 */
634 x *= TICK_NSEC; 634 x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
635 do_div(x, (NSEC_PER_SEC / USER_HZ));
636#endif 635#endif
637 return x; 636 return x;
638} 637}
@@ -641,21 +640,17 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t);
641u64 nsec_to_clock_t(u64 x) 640u64 nsec_to_clock_t(u64 x)
642{ 641{
643#if (NSEC_PER_SEC % USER_HZ) == 0 642#if (NSEC_PER_SEC % USER_HZ) == 0
644 do_div(x, (NSEC_PER_SEC / USER_HZ)); 643 return div_u64(x, NSEC_PER_SEC / USER_HZ);
645#elif (USER_HZ % 512) == 0 644#elif (USER_HZ % 512) == 0
646 x *= USER_HZ/512; 645 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
647 do_div(x, (NSEC_PER_SEC / 512));
648#else 646#else
649 /* 647 /*
650 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, 648 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
651 * overflow after 64.99 years. 649 * overflow after 64.99 years.
652 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 650 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
653 */ 651 */
654 x *= 9; 652 return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
655 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
656 USER_HZ));
657#endif 653#endif
658 return x;
659} 654}
660 655
661#if (BITS_PER_LONG < 64) 656#if (BITS_PER_LONG < 64)
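The kernel/time.c hunks replace the open-coded do_div()/div_long_long_rem() arithmetic with the div_u64()/div_s64_rem() helpers from the new <linux/math64.h>, and drop the explicit (u64) casts around the timeconst multipliers (presumably redundant once timeconst.pl generates 64-bit constants). The ns_to_timespec() change is the representative pattern; a self-contained sketch of it:

#include <linux/math64.h>
#include <linux/time.h>

static struct timespec sketch_ns_to_timespec(s64 nsec)
{
	struct timespec ts;
	s32 rem;

	ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
	if (rem < 0) {
		/* div_s64_rem() truncates toward zero, so a negative input
		 * leaves a negative remainder; normalize into [0, 1s). */
		ts.tv_sec--;
		rem += NSEC_PER_SEC;
	}
	ts.tv_nsec = rem;

	return ts;
}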
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 73961f35fdc8..dadde5361f32 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -471,10 +471,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
471/* 471/*
472 * Sysfs setup bits: 472 * Sysfs setup bits:
473 */ 473 */
474static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, 474static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
475 sysfs_override_clocksource); 475 sysfs_override_clocksource);
476 476
477static SYSDEV_ATTR(available_clocksource, 0600, 477static SYSDEV_ATTR(available_clocksource, 0444,
478 sysfs_show_available_clocksources, NULL); 478 sysfs_show_available_clocksources, NULL);
479 479
480static struct sysdev_class clocksource_sysclass = { 480static struct sysdev_class clocksource_sysclass = {
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5fd9b9469770..5125ddd8196b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,7 +15,8 @@
15#include <linux/jiffies.h> 15#include <linux/jiffies.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/capability.h> 17#include <linux/capability.h>
18#include <asm/div64.h> 18#include <linux/math64.h>
19#include <linux/clocksource.h>
19#include <asm/timex.h> 20#include <asm/timex.h>
20 21
21/* 22/*
@@ -23,11 +24,14 @@
23 */ 24 */
24unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ 25unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
25unsigned long tick_nsec; /* ACTHZ period (nsec) */ 26unsigned long tick_nsec; /* ACTHZ period (nsec) */
26static u64 tick_length, tick_length_base; 27u64 tick_length;
28static u64 tick_length_base;
29
30static struct hrtimer leap_timer;
27 31
28#define MAX_TICKADJ 500 /* microsecs */ 32#define MAX_TICKADJ 500 /* microsecs */
29#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 33#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
30 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) 34 NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
31 35
32/* 36/*
33 * phase-lock loop variables 37 * phase-lock loop variables
@@ -35,11 +39,12 @@ static u64 tick_length, tick_length_base;
35/* TIME_ERROR prevents overwriting the CMOS clock */ 39/* TIME_ERROR prevents overwriting the CMOS clock */
36static int time_state = TIME_OK; /* clock synchronization status */ 40static int time_state = TIME_OK; /* clock synchronization status */
37int time_status = STA_UNSYNC; /* clock status bits */ 41int time_status = STA_UNSYNC; /* clock status bits */
38static s64 time_offset; /* time adjustment (ns) */ 42static long time_tai; /* TAI offset (s) */
43static s64 time_offset; /* time adjustment (ns) */
39static long time_constant = 2; /* pll time constant */ 44static long time_constant = 2; /* pll time constant */
40long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 45long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
41long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 46long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
42long time_freq; /* frequency offset (scaled ppm)*/ 47static s64 time_freq; /* frequency offset (scaled ns/s)*/
43static long time_reftime; /* time at last adjustment (s) */ 48static long time_reftime; /* time at last adjustment (s) */
44long time_adjust; 49long time_adjust;
45static long ntp_tick_adj; 50static long ntp_tick_adj;
@@ -47,16 +52,56 @@ static long ntp_tick_adj;
47static void ntp_update_frequency(void) 52static void ntp_update_frequency(void)
48{ 53{
49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 54 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 << TICK_LENGTH_SHIFT; 55 << NTP_SCALE_SHIFT;
51 second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; 56 second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 57 second_length += time_freq;
53 58
54 tick_length_base = second_length; 59 tick_length_base = second_length;
55 60
56 do_div(second_length, HZ); 61 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT; 62 tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
63}
64
65static void ntp_update_offset(long offset)
66{
67 long mtemp;
68 s64 freq_adj;
69
70 if (!(time_status & STA_PLL))
71 return;
58 72
59 do_div(tick_length_base, NTP_INTERVAL_FREQ); 73 if (!(time_status & STA_NANO))
74 offset *= NSEC_PER_USEC;
75
76 /*
77 * Scale the phase adjustment and
78 * clamp to the operating range.
79 */
80 offset = min(offset, MAXPHASE);
81 offset = max(offset, -MAXPHASE);
82
83 /*
84 * Select how the frequency is to be controlled
85 * and in which mode (PLL or FLL).
86 */
87 if (time_status & STA_FREQHOLD || time_reftime == 0)
88 time_reftime = xtime.tv_sec;
89 mtemp = xtime.tv_sec - time_reftime;
90 time_reftime = xtime.tv_sec;
91
92 freq_adj = (s64)offset * mtemp;
93 freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
94 time_status &= ~STA_MODE;
95 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
96 freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
97 mtemp);
98 time_status |= STA_MODE;
99 }
100 freq_adj += time_freq;
101 freq_adj = min(freq_adj, MAXFREQ_SCALED);
102 time_freq = max(freq_adj, -MAXFREQ_SCALED);
103
104 time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
60} 105}
61 106
62/** 107/**
@@ -78,62 +123,70 @@ void ntp_clear(void)
78} 123}
79 124
80/* 125/*
81 * this routine handles the overflow of the microsecond field 126 * Leap second processing. If in leap-insert state at the end of the
82 * 127 * day, the system clock is set back one second; if in leap-delete
83 * The tricky bits of code to handle the accurate clock support 128 * state, the system clock is set ahead one second.
84 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
85 * They were originally developed for SUN and DEC kernels.
86 * All the kudos should go to Dave for this stuff.
87 */ 129 */
88void second_overflow(void) 130static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
89{ 131{
90 long time_adj; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
91 133
92 /* Bump the maxerror field */ 134 write_seqlock_irq(&xtime_lock);
93 time_maxerror += MAXFREQ >> SHIFT_USEC;
94 if (time_maxerror > NTP_PHASE_LIMIT) {
95 time_maxerror = NTP_PHASE_LIMIT;
96 time_status |= STA_UNSYNC;
97 }
98 135
99 /*
100 * Leap second processing. If in leap-insert state at the end of the
101 * day, the system clock is set back one second; if in leap-delete
102 * state, the system clock is set ahead one second. The microtime()
103 * routine or external clock driver will insure that reported time is
104 * always monotonic. The ugly divides should be replaced.
105 */
106 switch (time_state) { 136 switch (time_state) {
107 case TIME_OK: 137 case TIME_OK:
108 if (time_status & STA_INS)
109 time_state = TIME_INS;
110 else if (time_status & STA_DEL)
111 time_state = TIME_DEL;
112 break; 138 break;
113 case TIME_INS: 139 case TIME_INS:
114 if (xtime.tv_sec % 86400 == 0) { 140 xtime.tv_sec--;
115 xtime.tv_sec--; 141 wall_to_monotonic.tv_sec++;
116 wall_to_monotonic.tv_sec++; 142 time_state = TIME_OOP;
117 time_state = TIME_OOP; 143 printk(KERN_NOTICE "Clock: "
118 printk(KERN_NOTICE "Clock: inserting leap second " 144 "inserting leap second 23:59:60 UTC\n");
119 "23:59:60 UTC\n"); 145 leap_timer.expires = ktime_add_ns(leap_timer.expires,
120 } 146 NSEC_PER_SEC);
147 res = HRTIMER_RESTART;
121 break; 148 break;
122 case TIME_DEL: 149 case TIME_DEL:
123 if ((xtime.tv_sec + 1) % 86400 == 0) { 150 xtime.tv_sec++;
124 xtime.tv_sec++; 151 time_tai--;
125 wall_to_monotonic.tv_sec--; 152 wall_to_monotonic.tv_sec--;
126 time_state = TIME_WAIT; 153 time_state = TIME_WAIT;
127 printk(KERN_NOTICE "Clock: deleting leap second " 154 printk(KERN_NOTICE "Clock: "
128 "23:59:59 UTC\n"); 155 "deleting leap second 23:59:59 UTC\n");
129 }
130 break; 156 break;
131 case TIME_OOP: 157 case TIME_OOP:
158 time_tai++;
132 time_state = TIME_WAIT; 159 time_state = TIME_WAIT;
133 break; 160 /* fall through */
134 case TIME_WAIT: 161 case TIME_WAIT:
135 if (!(time_status & (STA_INS | STA_DEL))) 162 if (!(time_status & (STA_INS | STA_DEL)))
136 time_state = TIME_OK; 163 time_state = TIME_OK;
164 break;
165 }
166 update_vsyscall(&xtime, clock);
167
168 write_sequnlock_irq(&xtime_lock);
169
170 return res;
171}
172
173/*
174 * this routine handles the overflow of the microsecond field
175 *
176 * The tricky bits of code to handle the accurate clock support
177 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
178 * They were originally developed for SUN and DEC kernels.
179 * All the kudos should go to Dave for this stuff.
180 */
181void second_overflow(void)
182{
183 s64 time_adj;
184
185 /* Bump the maxerror field */
186 time_maxerror += MAXFREQ / NSEC_PER_USEC;
187 if (time_maxerror > NTP_PHASE_LIMIT) {
188 time_maxerror = NTP_PHASE_LIMIT;
189 time_status |= STA_UNSYNC;
137 } 190 }
138 191
139 /* 192 /*
@@ -143,7 +196,7 @@ void second_overflow(void)
143 tick_length = tick_length_base; 196 tick_length = tick_length_base;
144 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); 197 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
145 time_offset -= time_adj; 198 time_offset -= time_adj;
146 tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); 199 tick_length += time_adj;
147 200
148 if (unlikely(time_adjust)) { 201 if (unlikely(time_adjust)) {
149 if (time_adjust > MAX_TICKADJ) { 202 if (time_adjust > MAX_TICKADJ) {
@@ -154,25 +207,12 @@ void second_overflow(void)
154 tick_length -= MAX_TICKADJ_SCALED; 207 tick_length -= MAX_TICKADJ_SCALED;
155 } else { 208 } else {
156 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 209 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
157 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; 210 NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
158 time_adjust = 0; 211 time_adjust = 0;
159 } 212 }
160 } 213 }
161} 214}
162 215
163/*
164 * Return how long ticks are at the moment, that is, how much time
165 * update_wall_time_one_tick will add to xtime next time we call it
166 * (assuming no calls to do_adjtimex in the meantime).
167 * The return value is in fixed-point nanoseconds shifted by the
168 * specified number of bits to the right of the binary point.
169 * This function has no side-effects.
170 */
171u64 current_tick_length(void)
172{
173 return tick_length;
174}
175
176#ifdef CONFIG_GENERIC_CMOS_UPDATE 216#ifdef CONFIG_GENERIC_CMOS_UPDATE
177 217
178/* Disable the cmos update - used by virtualization and embedded */ 218/* Disable the cmos update - used by virtualization and embedded */
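The ntp.c hunks above replace the per-tick "xtime.tv_sec % 86400" polling in second_overflow() with an hrtimer: ntp_leap_second() applies the leap step under xtime_lock and returns HRTIMER_RESTART only in the insert case, so it fires once more a second later to finish the TIME_OOP bookkeeping. The do_adjtimex() hunk further down arms that timer at the next UTC midnight; a hedged sketch of that arithmetic (sketch_arm_leap_timer is a hypothetical helper):

static void sketch_arm_leap_timer(struct hrtimer *timer, time_t now_sec)
{
	/* 86400 seconds per UTC day; round up to the next midnight. */
	time_t expires = now_sec + (86400 - now_sec % 86400);

	hrtimer_start(timer, ktime_set(expires, 0), HRTIMER_MODE_ABS);
}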
@@ -236,8 +276,8 @@ static inline void notify_cmos_timer(void) { }
236 */ 276 */
237int do_adjtimex(struct timex *txc) 277int do_adjtimex(struct timex *txc)
238{ 278{
239 long mtemp, save_adjust, rem; 279 struct timespec ts;
240 s64 freq_adj, temp64; 280 long save_adjust, sec;
241 int result; 281 int result;
242 282
243 /* In order to modify anything, you gotta be super-user! */ 283 /* In order to modify anything, you gotta be super-user! */
@@ -247,147 +287,132 @@ int do_adjtimex(struct timex *txc)
247 /* Now we validate the data before disabling interrupts */ 287 /* Now we validate the data before disabling interrupts */
248 288
249 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { 289 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
250 /* singleshot must not be used with any other mode bits */ 290 /* singleshot must not be used with any other mode bits */
251 if (txc->modes != ADJ_OFFSET_SINGLESHOT && 291 if (txc->modes & ~ADJ_OFFSET_SS_READ)
252 txc->modes != ADJ_OFFSET_SS_READ)
253 return -EINVAL; 292 return -EINVAL;
254 } 293 }
255 294
256 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
257 /* adjustment Offset limited to +- .512 seconds */
258 if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
259 return -EINVAL;
260
261 /* if the quartz is off by more than 10% something is VERY wrong ! */ 295 /* if the quartz is off by more than 10% something is VERY wrong ! */
262 if (txc->modes & ADJ_TICK) 296 if (txc->modes & ADJ_TICK)
263 if (txc->tick < 900000/USER_HZ || 297 if (txc->tick < 900000/USER_HZ ||
264 txc->tick > 1100000/USER_HZ) 298 txc->tick > 1100000/USER_HZ)
265 return -EINVAL; 299 return -EINVAL;
266 300
301 if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
302 hrtimer_cancel(&leap_timer);
303 getnstimeofday(&ts);
304
267 write_seqlock_irq(&xtime_lock); 305 write_seqlock_irq(&xtime_lock);
268 result = time_state; /* mostly `TIME_OK' */
269 306
270 /* Save for later - semantics of adjtime is to return old value */ 307 /* Save for later - semantics of adjtime is to return old value */
271 save_adjust = time_adjust; 308 save_adjust = time_adjust;
272 309
273#if 0 /* STA_CLOCKERR is never set yet */
274 time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
275#endif
276 /* If there are input parameters, then process them */ 310 /* If there are input parameters, then process them */
277 if (txc->modes) 311 if (txc->modes) {
278 { 312 if (txc->modes & ADJ_STATUS) {
279 if (txc->modes & ADJ_STATUS) /* only set allowed bits */ 313 if ((time_status & STA_PLL) &&
280 time_status = (txc->status & ~STA_RONLY) | 314 !(txc->status & STA_PLL)) {
281 (time_status & STA_RONLY); 315 time_state = TIME_OK;
282 316 time_status = STA_UNSYNC;
283 if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ 317 }
284 if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { 318 /* only set allowed bits */
285 result = -EINVAL; 319 time_status &= STA_RONLY;
286 goto leave; 320 time_status |= txc->status & ~STA_RONLY;
287 } 321
288 time_freq = ((s64)txc->freq * NSEC_PER_USEC) 322 switch (time_state) {
289 >> (SHIFT_USEC - SHIFT_NSEC); 323 case TIME_OK:
290 } 324 start_timer:
291 325 sec = ts.tv_sec;
292 if (txc->modes & ADJ_MAXERROR) { 326 if (time_status & STA_INS) {
293 if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { 327 time_state = TIME_INS;
294 result = -EINVAL; 328 sec += 86400 - sec % 86400;
295 goto leave; 329 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
330 } else if (time_status & STA_DEL) {
331 time_state = TIME_DEL;
332 sec += 86400 - (sec + 1) % 86400;
333 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
334 }
335 break;
336 case TIME_INS:
337 case TIME_DEL:
338 time_state = TIME_OK;
339 goto start_timer;
340 break;
341 case TIME_WAIT:
342 if (!(time_status & (STA_INS | STA_DEL)))
343 time_state = TIME_OK;
344 break;
345 case TIME_OOP:
346 hrtimer_restart(&leap_timer);
347 break;
348 }
296 } 349 }
297 time_maxerror = txc->maxerror;
298 }
299 350
300 if (txc->modes & ADJ_ESTERROR) { 351 if (txc->modes & ADJ_NANO)
301 if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { 352 time_status |= STA_NANO;
302 result = -EINVAL; 353 if (txc->modes & ADJ_MICRO)
303 goto leave; 354 time_status &= ~STA_NANO;
355
356 if (txc->modes & ADJ_FREQUENCY) {
357 time_freq = (s64)txc->freq * PPM_SCALE;
358 time_freq = min(time_freq, MAXFREQ_SCALED);
359 time_freq = max(time_freq, -MAXFREQ_SCALED);
304 } 360 }
305 time_esterror = txc->esterror;
306 }
307 361
308 if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ 362 if (txc->modes & ADJ_MAXERROR)
309 if (txc->constant < 0) { /* NTP v4 uses values > 6 */ 363 time_maxerror = txc->maxerror;
310 result = -EINVAL; 364 if (txc->modes & ADJ_ESTERROR)
311 goto leave; 365 time_esterror = txc->esterror;
366
367 if (txc->modes & ADJ_TIMECONST) {
368 time_constant = txc->constant;
369 if (!(time_status & STA_NANO))
370 time_constant += 4;
371 time_constant = min(time_constant, (long)MAXTC);
372 time_constant = max(time_constant, 0l);
312 } 373 }
313 time_constant = min(txc->constant + 4, (long)MAXTC);
314 }
315 374
316 if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ 375 if (txc->modes & ADJ_TAI && txc->constant > 0)
317 if (txc->modes == ADJ_OFFSET_SINGLESHOT) { 376 time_tai = txc->constant;
318 /* adjtime() is independent from ntp_adjtime() */ 377
319 time_adjust = txc->offset; 378 if (txc->modes & ADJ_OFFSET) {
379 if (txc->modes == ADJ_OFFSET_SINGLESHOT)
380 /* adjtime() is independent from ntp_adjtime() */
381 time_adjust = txc->offset;
382 else
383 ntp_update_offset(txc->offset);
320 } 384 }
321 else if (time_status & STA_PLL) { 385 if (txc->modes & ADJ_TICK)
322 time_offset = txc->offset * NSEC_PER_USEC; 386 tick_usec = txc->tick;
323 387
324 /* 388 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
325 * Scale the phase adjustment and 389 ntp_update_frequency();
326 * clamp to the operating range. 390 }
327 */ 391
328 time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC); 392 result = time_state; /* mostly `TIME_OK' */
329 time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC); 393 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
330
331 /*
332 * Select whether the frequency is to be controlled
333 * and in which mode (PLL or FLL). Clamp to the operating
334 * range. Ugly multiply/divide should be replaced someday.
335 */
336
337 if (time_status & STA_FREQHOLD || time_reftime == 0)
338 time_reftime = xtime.tv_sec;
339 mtemp = xtime.tv_sec - time_reftime;
340 time_reftime = xtime.tv_sec;
341
342 freq_adj = time_offset * mtemp;
343 freq_adj = shift_right(freq_adj, time_constant * 2 +
344 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
345 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
346 u64 utemp64;
347 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
348 if (time_offset < 0) {
349 utemp64 = -temp64;
350 do_div(utemp64, mtemp);
351 freq_adj -= utemp64;
352 } else {
353 utemp64 = temp64;
354 do_div(utemp64, mtemp);
355 freq_adj += utemp64;
356 }
357 }
358 freq_adj += time_freq;
359 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
360 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
361 time_offset = div_long_long_rem_signed(time_offset,
362 NTP_INTERVAL_FREQ,
363 &rem);
364 time_offset <<= SHIFT_UPDATE;
365 } /* STA_PLL */
366 } /* txc->modes & ADJ_OFFSET */
367 if (txc->modes & ADJ_TICK)
368 tick_usec = txc->tick;
369
370 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
371 ntp_update_frequency();
372 } /* txc->modes */
373leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
374 result = TIME_ERROR; 394 result = TIME_ERROR;
375 395
376 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || 396 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
377 (txc->modes == ADJ_OFFSET_SS_READ)) 397 (txc->modes == ADJ_OFFSET_SS_READ))
378 txc->offset = save_adjust; 398 txc->offset = save_adjust;
379 else 399 else {
380 txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * 400 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
381 NTP_INTERVAL_FREQ / 1000; 401 NTP_SCALE_SHIFT);
382 txc->freq = (time_freq / NSEC_PER_USEC) << 402 if (!(time_status & STA_NANO))
383 (SHIFT_USEC - SHIFT_NSEC); 403 txc->offset /= NSEC_PER_USEC;
404 }
405 txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
406 (s64)PPM_SCALE_INV,
407 NTP_SCALE_SHIFT);
384 txc->maxerror = time_maxerror; 408 txc->maxerror = time_maxerror;
385 txc->esterror = time_esterror; 409 txc->esterror = time_esterror;
386 txc->status = time_status; 410 txc->status = time_status;
387 txc->constant = time_constant; 411 txc->constant = time_constant;
388 txc->precision = 1; 412 txc->precision = 1;
389 txc->tolerance = MAXFREQ; 413 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
390 txc->tick = tick_usec; 414 txc->tick = tick_usec;
415 txc->tai = time_tai;
391 416
392 /* PPS is not implemented, so these are zero */ 417 /* PPS is not implemented, so these are zero */
393 txc->ppsfreq = 0; 418 txc->ppsfreq = 0;
@@ -399,9 +424,15 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
399 txc->errcnt = 0; 424 txc->errcnt = 0;
400 txc->stbcnt = 0; 425 txc->stbcnt = 0;
401 write_sequnlock_irq(&xtime_lock); 426 write_sequnlock_irq(&xtime_lock);
402 do_gettimeofday(&txc->time); 427
428 txc->time.tv_sec = ts.tv_sec;
429 txc->time.tv_usec = ts.tv_nsec;
430 if (!(time_status & STA_NANO))
431 txc->time.tv_usec /= NSEC_PER_USEC;
432
403 notify_cmos_timer(); 433 notify_cmos_timer();
404 return(result); 434
435 return result;
405} 436}
406 437
407static int __init ntp_tick_adj_setup(char *str) 438static int __init ntp_tick_adj_setup(char *str)
@@ -411,3 +442,10 @@ static int __init ntp_tick_adj_setup(char *str)
411} 442}
412 443
413__setup("ntp_tick_adj=", ntp_tick_adj_setup); 444__setup("ntp_tick_adj=", ntp_tick_adj_setup);
445
446void __init ntp_init(void)
447{
448 ntp_clear();
449 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
450 leap_timer.function = ntp_leap_second;
451}
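
(For context on the do_adjtimex() rework above: the user-visible struct timex still carries frequency in 16-bit scaled parts-per-million; only the in-kernel representation behind time_freq and NTP_SCALE_SHIFT changed. A minimal user-space sketch, assuming the glibc adjtimex() wrapper and not part of this patch, that reads the resulting clock state back out:

        #include <stdio.h>
        #include <sys/timex.h>

        int main(void)
        {
                struct timex tx = { .modes = 0 };       /* read-only query */
                int state = adjtimex(&tx);

                if (state == -1) {
                        perror("adjtimex");
                        return 1;
                }
                /* tx.freq is parts-per-million with a 16-bit fractional part */
                printf("state=%d freq=%.3f ppm maxerror=%ld us status=0x%x\n",
                       state, tx.freq / 65536.0, tx.maxerror, tx.status);
                return 0;
        }

On a machine disciplined by ntpd this typically reports state 0 (TIME_OK) and a small fractional ppm value.)
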
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2d6087c7cf98..e91c29f961c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -53,7 +53,7 @@ void update_xtime_cache(u64 nsec)
53 timespec_add_ns(&xtime_cache, nsec); 53 timespec_add_ns(&xtime_cache, nsec);
54} 54}
55 55
56static struct clocksource *clock; /* pointer to current clocksource */ 56struct clocksource *clock;
57 57
58 58
59#ifdef CONFIG_GENERIC_TIME 59#ifdef CONFIG_GENERIC_TIME
@@ -246,7 +246,7 @@ void __init timekeeping_init(void)
246 246
247 write_seqlock_irqsave(&xtime_lock, flags); 247 write_seqlock_irqsave(&xtime_lock, flags);
248 248
249 ntp_clear(); 249 ntp_init();
250 250
251 clock = clocksource_get_next(); 251 clock = clocksource_get_next();
252 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 252 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -371,7 +371,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
371 * here. This is tuned so that an error of about 1 msec is adjusted 371 * here. This is tuned so that an error of about 1 msec is adjusted
372 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 372 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
373 */ 373 */
374 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); 374 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
375 error2 = abs(error2); 375 error2 = abs(error2);
376 for (look_ahead = 0; error2 > 0; look_ahead++) 376 for (look_ahead = 0; error2 > 0; look_ahead++)
377 error2 >>= 2; 377 error2 >>= 2;
@@ -380,8 +380,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
380 * Now calculate the error in (1 << look_ahead) ticks, but first 380 * Now calculate the error in (1 << look_ahead) ticks, but first
381 * remove the single look ahead already included in the error. 381 * remove the single look ahead already included in the error.
382 */ 382 */
383 tick_error = current_tick_length() >> 383 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
384 (TICK_LENGTH_SHIFT - clock->shift + 1);
385 tick_error -= clock->xtime_interval >> 1; 384 tick_error -= clock->xtime_interval >> 1;
386 error = ((error - tick_error) >> look_ahead) + tick_error; 385 error = ((error - tick_error) >> look_ahead) + tick_error;
387 386
@@ -412,7 +411,7 @@ static void clocksource_adjust(s64 offset)
412 s64 error, interval = clock->cycle_interval; 411 s64 error, interval = clock->cycle_interval;
413 int adj; 412 int adj;
414 413
415 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); 414 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
416 if (error > interval) { 415 if (error > interval) {
417 error >>= 2; 416 error >>= 2;
418 if (likely(error <= interval)) 417 if (likely(error <= interval))
@@ -434,7 +433,7 @@ static void clocksource_adjust(s64 offset)
434 clock->xtime_interval += interval; 433 clock->xtime_interval += interval;
435 clock->xtime_nsec -= offset; 434 clock->xtime_nsec -= offset;
436 clock->error -= (interval - offset) << 435 clock->error -= (interval - offset) <<
437 (TICK_LENGTH_SHIFT - clock->shift); 436 (NTP_SCALE_SHIFT - clock->shift);
438} 437}
439 438
440/** 439/**
@@ -473,8 +472,8 @@ void update_wall_time(void)
473 } 472 }
474 473
475 /* accumulate error between NTP and clock interval */ 474 /* accumulate error between NTP and clock interval */
476 clock->error += current_tick_length(); 475 clock->error += tick_length;
477 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); 476 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
478 } 477 }
479 478
480 /* correct the clock when NTP error is too big */ 479 /* correct the clock when NTP error is too big */
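
(The update_wall_time() hunk above accumulates the NTP/clocksource mismatch in a fixed-point error term scaled by NTP_SCALE_SHIFT. A stand-alone sketch with made-up constants -- the DEMO_* names and values are illustrative, not the kernel's -- of how that accumulation behaves over one simulated second of 4 ms ticks:

        #include <stdint.h>
        #include <stdio.h>

        /* Illustrative stand-ins for NTP_SCALE_SHIFT and a clocksource shift. */
        #define DEMO_NTP_SCALE_SHIFT    32
        #define DEMO_CLOCK_SHIFT        22

        int main(void)
        {
                /* One NTP tick in nanoseconds, scaled up by DEMO_NTP_SCALE_SHIFT. */
                int64_t tick_length    = (int64_t)4000000 << DEMO_NTP_SCALE_SHIFT;
                /* What the clocksource actually accumulates per interval. */
                int64_t xtime_interval = (int64_t)3999990 << DEMO_CLOCK_SHIFT;
                int64_t error = 0;

                for (int i = 0; i < 250; i++) {
                        error += tick_length;
                        error -= xtime_interval <<
                                 (DEMO_NTP_SCALE_SHIFT - DEMO_CLOCK_SHIFT);
                }
                /* Remaining error after one second, back in plain nanoseconds. */
                printf("accumulated error: %lld ns\n",
                       (long long)(error >> DEMO_NTP_SCALE_SHIFT));
                return 0;
        }

With a 10 ns per-tick mismatch over 250 ticks the demo prints roughly 2500 ns, the kind of residual that clocksource_adjust() then works against.)
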
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 67fe8fc21fb1..a40e20fd0001 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void)
278{ 278{
279 struct proc_dir_entry *pe; 279 struct proc_dir_entry *pe;
280 280
281 pe = create_proc_entry("timer_list", 0644, NULL); 281 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
282 if (!pe) 282 if (!pe)
283 return -ENOMEM; 283 return -ENOMEM;
284
285 pe->proc_fops = &timer_list_fops;
286
287 return 0; 284 return 0;
288} 285}
289__initcall(init_timer_list_procfs); 286__initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 417da8c5bc72..c994530d166d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void)
415{ 415{
416 struct proc_dir_entry *pe; 416 struct proc_dir_entry *pe;
417 417
418 pe = create_proc_entry("timer_stats", 0644, NULL); 418 pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
419 if (!pe) 419 if (!pe)
420 return -ENOMEM; 420 return -ENOMEM;
421
422 pe->proc_fops = &tstats_fops;
423
424 return 0; 421 return 0;
425} 422}
426__initcall(init_tstats_procfs); 423__initcall(init_tstats_procfs);
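
(Both timer_list.c and timer_stats.c above switch from create_proc_entry() plus a separate ->proc_fops assignment to a single proc_create() call, which closes the window where the entry is visible without its file_operations. A self-contained module sketch of the same pattern -- the demo_* names are hypothetical, only the API usage mirrors the hunks:

        #include <linux/module.h>
        #include <linux/init.h>
        #include <linux/proc_fs.h>
        #include <linux/seq_file.h>

        static int demo_show(struct seq_file *m, void *v)
        {
                seq_printf(m, "hello from demo\n");
                return 0;
        }

        static int demo_open(struct inode *inode, struct file *file)
        {
                return single_open(file, demo_show, NULL);
        }

        static const struct file_operations demo_fops = {
                .owner   = THIS_MODULE,
                .open    = demo_open,
                .read    = seq_read,
                .llseek  = seq_lseek,
                .release = single_release,
        };

        static int __init demo_init(void)
        {
                /* proc_create() registers the entry together with its
                 * file_operations, so it is never visible half-initialized. */
                if (!proc_create("demo_entry", 0444, NULL, &demo_fops))
                        return -ENOMEM;
                return 0;
        }

        static void __exit demo_exit(void)
        {
                remove_proc_entry("demo_entry", NULL);
        }

        module_init(demo_init);
        module_exit(demo_exit);
        MODULE_LICENSE("GPL");
)
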
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 41468035473c..eb51d76e058a 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -1,7 +1,7 @@
1#!/usr/bin/perl 1#!/usr/bin/perl
2# ----------------------------------------------------------------------- 2# -----------------------------------------------------------------------
3# 3#
4# Copyright 2007 rPath, Inc. - All Rights Reserved 4# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5# 5#
6# This file is part of the Linux kernel, and is made available under 6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your 7# the terms of the GNU General Public License version 2 or (at your
@@ -20,198 +20,138 @@
20%canned_values = ( 20%canned_values = (
21 24 => [ 21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26, 22 '0xa6aaaaab','0x2aaaaaa',26,
23 '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58,
24 125,3, 23 125,3,
25 '0xc49ba5e4','0x1fbe76c8b4',37, 24 '0xc49ba5e4','0x1fbe76c8b4',37,
26 '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69,
27 3,125, 25 3,125,
28 '0xa2c2aaab','0xaaaa',16, 26 '0xa2c2aaab','0xaaaa',16,
29 '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48,
30 125000,3, 27 125000,3,
31 '0xc9539b89','0x7fffbce4217d',47, 28 '0xc9539b89','0x7fffbce4217d',47,
32 '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79,
33 3,125000, 29 3,125000,
34 ], 32 => [ 30 ], 32 => [
35 '0xfa000000','0x6000000',27, 31 '0xfa000000','0x6000000',27,
36 '0xfa00000000000000','0x600000000000000',59,
37 125,4, 32 125,4,
38 '0x83126e98','0xfdf3b645a',36, 33 '0x83126e98','0xfdf3b645a',36,
39 '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68,
40 4,125, 34 4,125,
41 '0xf4240000','0x0',17, 35 '0xf4240000','0x0',17,
42 '0xf424000000000000','0x0',49,
43 31250,1, 36 31250,1,
44 '0x8637bd06','0x3fff79c842fa',46, 37 '0x8637bd06','0x3fff79c842fa',46,
45 '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78,
46 1,31250, 38 1,31250,
47 ], 48 => [ 39 ], 48 => [
48 '0xa6aaaaab','0x6aaaaaa',27, 40 '0xa6aaaaab','0x6aaaaaa',27,
49 '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59,
50 125,6, 41 125,6,
51 '0xc49ba5e4','0xfdf3b645a',36, 42 '0xc49ba5e4','0xfdf3b645a',36,
52 '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68,
53 6,125, 43 6,125,
54 '0xa2c2aaab','0x15555',17, 44 '0xa2c2aaab','0x15555',17,
55 '0xa2c2aaaaaaaaaaab','0x1555555555555',49,
56 62500,3, 45 62500,3,
57 '0xc9539b89','0x3fffbce4217d',46, 46 '0xc9539b89','0x3fffbce4217d',46,
58 '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78,
59 3,62500, 47 3,62500,
60 ], 64 => [ 48 ], 64 => [
61 '0xfa000000','0xe000000',28, 49 '0xfa000000','0xe000000',28,
62 '0xfa00000000000000','0xe00000000000000',60,
63 125,8, 50 125,8,
64 '0x83126e98','0x7ef9db22d',35, 51 '0x83126e98','0x7ef9db22d',35,
65 '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67,
66 8,125, 52 8,125,
67 '0xf4240000','0x0',18, 53 '0xf4240000','0x0',18,
68 '0xf424000000000000','0x0',50,
69 15625,1, 54 15625,1,
70 '0x8637bd06','0x1fff79c842fa',45, 55 '0x8637bd06','0x1fff79c842fa',45,
71 '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77,
72 1,15625, 56 1,15625,
73 ], 100 => [ 57 ], 100 => [
74 '0xa0000000','0x0',28, 58 '0xa0000000','0x0',28,
75 '0xa000000000000000','0x0',60,
76 10,1, 59 10,1,
77 '0xcccccccd','0x733333333',35, 60 '0xcccccccd','0x733333333',35,
78 '0xcccccccccccccccd','0x73333333333333333',67,
79 1,10, 61 1,10,
80 '0x9c400000','0x0',18, 62 '0x9c400000','0x0',18,
81 '0x9c40000000000000','0x0',50,
82 10000,1, 63 10000,1,
83 '0xd1b71759','0x1fff2e48e8a7',45, 64 '0xd1b71759','0x1fff2e48e8a7',45,
84 '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77,
85 1,10000, 65 1,10000,
86 ], 122 => [ 66 ], 122 => [
87 '0x8325c53f','0xfbcda3a',28, 67 '0x8325c53f','0xfbcda3a',28,
88 '0x8325c53ef368eb05','0xfbcda3ac10c9714',60,
89 500,61, 68 500,61,
90 '0xf9db22d1','0x7fbe76c8b',35, 69 '0xf9db22d1','0x7fbe76c8b',35,
91 '0xf9db22d0e560418a','0x7fbe76c8b43958106',67,
92 61,500, 70 61,500,
93 '0x8012e2a0','0x3ef36',18, 71 '0x8012e2a0','0x3ef36',18,
94 '0x8012e29f79b47583','0x3ef368eb04325',50,
95 500000,61, 72 500000,61,
96 '0xffda4053','0x1ffffbce4217',45, 73 '0xffda4053','0x1ffffbce4217',45,
97 '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77,
98 61,500000, 74 61,500000,
99 ], 128 => [ 75 ], 128 => [
100 '0xfa000000','0x1e000000',29, 76 '0xfa000000','0x1e000000',29,
101 '0xfa00000000000000','0x1e00000000000000',61,
102 125,16, 77 125,16,
103 '0x83126e98','0x3f7ced916',34, 78 '0x83126e98','0x3f7ced916',34,
104 '0x83126e978d4fdf3c','0x3f7ced916872b020c',66,
105 16,125, 79 16,125,
106 '0xf4240000','0x40000',19, 80 '0xf4240000','0x40000',19,
107 '0xf424000000000000','0x4000000000000',51,
108 15625,2, 81 15625,2,
109 '0x8637bd06','0xfffbce4217d',44, 82 '0x8637bd06','0xfffbce4217d',44,
110 '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76,
111 2,15625, 83 2,15625,
112 ], 200 => [ 84 ], 200 => [
113 '0xa0000000','0x0',29, 85 '0xa0000000','0x0',29,
114 '0xa000000000000000','0x0',61,
115 5,1, 86 5,1,
116 '0xcccccccd','0x333333333',34, 87 '0xcccccccd','0x333333333',34,
117 '0xcccccccccccccccd','0x33333333333333333',66,
118 1,5, 88 1,5,
119 '0x9c400000','0x0',19, 89 '0x9c400000','0x0',19,
120 '0x9c40000000000000','0x0',51,
121 5000,1, 90 5000,1,
122 '0xd1b71759','0xfff2e48e8a7',44, 91 '0xd1b71759','0xfff2e48e8a7',44,
123 '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76,
124 1,5000, 92 1,5000,
125 ], 250 => [ 93 ], 250 => [
126 '0x80000000','0x0',29, 94 '0x80000000','0x0',29,
127 '0x8000000000000000','0x0',61,
128 4,1, 95 4,1,
129 '0x80000000','0x180000000',33, 96 '0x80000000','0x180000000',33,
130 '0x8000000000000000','0x18000000000000000',65,
131 1,4, 97 1,4,
132 '0xfa000000','0x0',20, 98 '0xfa000000','0x0',20,
133 '0xfa00000000000000','0x0',52,
134 4000,1, 99 4000,1,
135 '0x83126e98','0x7ff7ced9168',43, 100 '0x83126e98','0x7ff7ced9168',43,
136 '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75,
137 1,4000, 101 1,4000,
138 ], 256 => [ 102 ], 256 => [
139 '0xfa000000','0x3e000000',30, 103 '0xfa000000','0x3e000000',30,
140 '0xfa00000000000000','0x3e00000000000000',62,
141 125,32, 104 125,32,
142 '0x83126e98','0x1fbe76c8b',33, 105 '0x83126e98','0x1fbe76c8b',33,
143 '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65,
144 32,125, 106 32,125,
145 '0xf4240000','0xc0000',20, 107 '0xf4240000','0xc0000',20,
146 '0xf424000000000000','0xc000000000000',52,
147 15625,4, 108 15625,4,
148 '0x8637bd06','0x7ffde7210be',43, 109 '0x8637bd06','0x7ffde7210be',43,
149 '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75,
150 4,15625, 110 4,15625,
151 ], 300 => [ 111 ], 300 => [
152 '0xd5555556','0x2aaaaaaa',30, 112 '0xd5555556','0x2aaaaaaa',30,
153 '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62,
154 10,3, 113 10,3,
155 '0x9999999a','0x1cccccccc',33, 114 '0x9999999a','0x1cccccccc',33,
156 '0x999999999999999a','0x1cccccccccccccccc',65,
157 3,10, 115 3,10,
158 '0xd0555556','0xaaaaa',20, 116 '0xd0555556','0xaaaaa',20,
159 '0xd055555555555556','0xaaaaaaaaaaaaa',52,
160 10000,3, 117 10000,3,
161 '0x9d495183','0x7ffcb923a29',43, 118 '0x9d495183','0x7ffcb923a29',43,
162 '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75,
163 3,10000, 119 3,10000,
164 ], 512 => [ 120 ], 512 => [
165 '0xfa000000','0x7e000000',31, 121 '0xfa000000','0x7e000000',31,
166 '0xfa00000000000000','0x7e00000000000000',63,
167 125,64, 122 125,64,
168 '0x83126e98','0xfdf3b645',32, 123 '0x83126e98','0xfdf3b645',32,
169 '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64,
170 64,125, 124 64,125,
171 '0xf4240000','0x1c0000',21, 125 '0xf4240000','0x1c0000',21,
172 '0xf424000000000000','0x1c000000000000',53,
173 15625,8, 126 15625,8,
174 '0x8637bd06','0x3ffef39085f',42, 127 '0x8637bd06','0x3ffef39085f',42,
175 '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74,
176 8,15625, 128 8,15625,
177 ], 1000 => [ 129 ], 1000 => [
178 '0x80000000','0x0',31, 130 '0x80000000','0x0',31,
179 '0x8000000000000000','0x0',63,
180 1,1, 131 1,1,
181 '0x80000000','0x0',31, 132 '0x80000000','0x0',31,
182 '0x8000000000000000','0x0',63,
183 1,1, 133 1,1,
184 '0xfa000000','0x0',22, 134 '0xfa000000','0x0',22,
185 '0xfa00000000000000','0x0',54,
186 1000,1, 135 1000,1,
187 '0x83126e98','0x1ff7ced9168',41, 136 '0x83126e98','0x1ff7ced9168',41,
188 '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73,
189 1,1000, 137 1,1000,
190 ], 1024 => [ 138 ], 1024 => [
191 '0xfa000000','0xfe000000',32, 139 '0xfa000000','0xfe000000',32,
192 '0xfa00000000000000','0xfe00000000000000',64,
193 125,128, 140 125,128,
194 '0x83126e98','0x7ef9db22',31, 141 '0x83126e98','0x7ef9db22',31,
195 '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63,
196 128,125, 142 128,125,
197 '0xf4240000','0x3c0000',22, 143 '0xf4240000','0x3c0000',22,
198 '0xf424000000000000','0x3c000000000000',54,
199 15625,16, 144 15625,16,
200 '0x8637bd06','0x1fff79c842f',41, 145 '0x8637bd06','0x1fff79c842f',41,
201 '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73,
202 16,15625, 146 16,15625,
203 ], 1200 => [ 147 ], 1200 => [
204 '0xd5555556','0xd5555555',32, 148 '0xd5555556','0xd5555555',32,
205 '0xd555555555555556','0xd555555555555555',64,
206 5,6, 149 5,6,
207 '0x9999999a','0x66666666',31, 150 '0x9999999a','0x66666666',31,
208 '0x999999999999999a','0x6666666666666666',63,
209 6,5, 151 6,5,
210 '0xd0555556','0x2aaaaa',22, 152 '0xd0555556','0x2aaaaa',22,
211 '0xd055555555555556','0x2aaaaaaaaaaaaa',54,
212 2500,3, 153 2500,3,
213 '0x9d495183','0x1ffcb923a29',41, 154 '0x9d495183','0x1ffcb923a29',41,
214 '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73,
215 3,2500, 155 3,2500,
216 ] 156 ]
217); 157);
@@ -264,6 +204,15 @@ sub fmuls($$$) {
264 return 0; 204 return 0;
265} 205}
266 206
207# Generate a hex value if the result fits in 64 bits;
208# otherwise skip.
209sub bignum_hex($) {
210 my($x) = @_;
211 my $s = $x->as_hex();
212
213 return (length($s) > 18) ? undef : $s;
214}
215
267# Provides mul, adj, and shr factors for a specific 216# Provides mul, adj, and shr factors for a specific
268# (bit, time, hz) combination 217# (bit, time, hz) combination
269sub muladj($$$) { 218sub muladj($$$) {
@@ -271,7 +220,7 @@ sub muladj($$$) {
271 my $s = fmuls($b, $t, $hz); 220 my $s = fmuls($b, $t, $hz);
272 my $m = fmul($s, $t, $hz); 221 my $m = fmul($s, $t, $hz);
273 my $a = fadj($s, $t, $hz); 222 my $a = fadj($s, $t, $hz);
274 return ($m->as_hex(), $a->as_hex(), $s); 223 return (bignum_hex($m), bignum_hex($a), $s);
275} 224}
276 225
277# Provides numerator, denominator values 226# Provides numerator, denominator values
@@ -288,12 +237,10 @@ sub conversions($$) {
288 237
289 # HZ_TO_xx 238 # HZ_TO_xx
290 push(@val, muladj(32, $t, $hz)); 239 push(@val, muladj(32, $t, $hz));
291 push(@val, muladj(64, $t, $hz));
292 push(@val, numden($t, $hz)); 240 push(@val, numden($t, $hz));
293 241
294 # xx_TO_HZ 242 # xx_TO_HZ
295 push(@val, muladj(32, $hz, $t)); 243 push(@val, muladj(32, $hz, $t));
296 push(@val, muladj(64, $hz, $t));
297 push(@val, numden($hz, $t)); 244 push(@val, numden($hz, $t));
298 245
299 return @val; 246 return @val;
@@ -318,6 +265,19 @@ sub compute_values($) {
318 return @val; 265 return @val;
319} 266}
320 267
268sub outputval($$)
269{
270 my($name, $val) = @_;
271 my $csuf;
272
273 if (defined($val)) {
274 if ($name !~ /SHR/) {
275 $val = "U64_C($val)";
276 }
277 printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
278 }
279}
280
321sub output($@) 281sub output($@)
322{ 282{
323 my($hz, @val) = @_; 283 my($hz, @val) = @_;
@@ -331,6 +291,7 @@ sub output($@)
331 print "\n"; 291 print "\n";
332 292
333 print "#include <linux/param.h>\n"; 293 print "#include <linux/param.h>\n";
294 print "#include <linux/types.h>\n";
334 295
335 print "\n"; 296 print "\n";
336 print "#if HZ != $hz\n"; 297 print "#if HZ != $hz\n";
@@ -340,15 +301,13 @@ sub output($@)
340 301
341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', 302 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
342 'HZ_TO_USEC','USEC_TO_HZ') { 303 'HZ_TO_USEC','USEC_TO_HZ') {
343 foreach $bit (32, 64) { 304 foreach $bit (32) {
344 foreach $suf ('MUL', 'ADJ', 'SHR') { 305 foreach $suf ('MUL', 'ADJ', 'SHR') {
345 printf "#define %-23s %s\n", 306 outputval("${pfx}_$suf$bit", shift(@val));
346 "${pfx}_$suf$bit", shift(@val);
347 } 307 }
348 } 308 }
349 foreach $suf ('NUM', 'DEN') { 309 foreach $suf ('NUM', 'DEN') {
350 printf "#define %-23s %s\n", 310 outputval("${pfx}_$suf", shift(@val));
351 "${pfx}_$suf", shift(@val);
352 } 311 }
353 } 312 }
354 313
@@ -356,6 +315,23 @@ sub output($@)
356 print "#endif /* KERNEL_TIMECONST_H */\n"; 315 print "#endif /* KERNEL_TIMECONST_H */\n";
357} 316}
358 317
318# Pretty-print Perl values
319sub perlvals(@) {
320 my $v;
321 my @l = ();
322
323 foreach $v (@_) {
324 if (!defined($v)) {
325 push(@l, 'undef');
326 } elsif ($v =~ /^0x/) {
327 push(@l, "\'".$v."\'");
328 } else {
329 push(@l, $v.'');
330 }
331 }
332 return join(',', @l);
333}
334
359($hz) = @ARGV; 335($hz) = @ARGV;
360 336
361# Use this to generate the %canned_values structure 337# Use this to generate the %canned_values structure
@@ -373,15 +349,15 @@ if ($hz eq '--can') {
373 print "$pf$hz => [\n"; 349 print "$pf$hz => [\n";
374 while (scalar(@values)) { 350 while (scalar(@values)) {
375 my $bit; 351 my $bit;
376 foreach $bit (32, 64) { 352 foreach $bit (32) {
377 my $m = shift(@values); 353 my $m = shift(@values);
378 my $a = shift(@values); 354 my $a = shift(@values);
379 my $s = shift(@values); 355 my $s = shift(@values);
380 print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n"; 356 print "\t\t", perlvals($m,$a,$s), ",\n";
381 } 357 }
382 my $n = shift(@values); 358 my $n = shift(@values);
383 my $d = shift(@values); 359 my $d = shift(@values);
384 print "\t\t",$n,',',$d,",\n"; 360 print "\t\t", perlvals($n,$d), ",\n";
385 } 361 }
386 print "\t]"; 362 print "\t]";
387 $pf = ', '; 363 $pf = ', ';
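
(After this change timeconst.pl emits only the 32-bit multiply/adjust/shift triples plus the NUM/DEN pairs, wraps the constants in U64_C(), and skips any define whose value would not fit in 64 bits. Using the HZ=250 row from the canned table above, a small user-space sketch of how such a triple turns a division into a multiply-add-shift; the function name is illustrative, the real helpers live in kernel/time.c:

        #include <stdint.h>
        #include <stdio.h>

        /* MSEC_TO_HZ constants for HZ=250, taken from the table above. */
        #define MSEC_TO_HZ_MUL32        UINT64_C(0x80000000)
        #define MSEC_TO_HZ_ADJ32        UINT64_C(0x180000000)
        #define MSEC_TO_HZ_SHR32        33

        static uint32_t msec_to_jiffies_demo(uint32_t msec)
        {
                /* Multiply-add-shift replaces a runtime division by 1000/HZ. */
                return (uint32_t)((MSEC_TO_HZ_MUL32 * msec + MSEC_TO_HZ_ADJ32)
                                  >> MSEC_TO_HZ_SHR32);
        }

        int main(void)
        {
                for (uint32_t m = 1; m <= 12; m += 3)
                        printf("%u ms -> %u jiffies at HZ=250\n",
                               m, msec_to_jiffies_demo(m));
                return 0;
        }

The ADJ term biases the result upward, so for example 1 ms still maps to at least one jiffy even though a jiffy is 4 ms at HZ=250.)
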
diff --git a/kernel/timer.c b/kernel/timer.c
index f3d35d4ea42e..ceacc6626572 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -320,14 +320,130 @@ static void timer_stats_account_timer(struct timer_list *timer)
320static void timer_stats_account_timer(struct timer_list *timer) {} 320static void timer_stats_account_timer(struct timer_list *timer) {}
321#endif 321#endif
322 322
323/** 323#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
324 * init_timer - initialize a timer. 324
325 * @timer: the timer to be initialized 325static struct debug_obj_descr timer_debug_descr;
326 * 326
327 * init_timer() must be done to a timer prior calling *any* of the 327/*
328 * other timer functions. 328 * fixup_init is called when:
329 * - an active object is initialized
329 */ 330 */
330void init_timer(struct timer_list *timer) 331static int timer_fixup_init(void *addr, enum debug_obj_state state)
332{
333 struct timer_list *timer = addr;
334
335 switch (state) {
336 case ODEBUG_STATE_ACTIVE:
337 del_timer_sync(timer);
338 debug_object_init(timer, &timer_debug_descr);
339 return 1;
340 default:
341 return 0;
342 }
343}
344
345/*
346 * fixup_activate is called when:
347 * - an active object is activated
348 * - an unknown object is activated (might be a statically initialized object)
349 */
350static int timer_fixup_activate(void *addr, enum debug_obj_state state)
351{
352 struct timer_list *timer = addr;
353
354 switch (state) {
355
356 case ODEBUG_STATE_NOTAVAILABLE:
357 /*
358 * This is not really a fixup. The timer was
359 * statically initialized. We just make sure that it
360 * is tracked in the object tracker.
361 */
362 if (timer->entry.next == NULL &&
363 timer->entry.prev == TIMER_ENTRY_STATIC) {
364 debug_object_init(timer, &timer_debug_descr);
365 debug_object_activate(timer, &timer_debug_descr);
366 return 0;
367 } else {
368 WARN_ON_ONCE(1);
369 }
370 return 0;
371
372 case ODEBUG_STATE_ACTIVE:
373 WARN_ON(1);
374
375 default:
376 return 0;
377 }
378}
379
380/*
381 * fixup_free is called when:
382 * - an active object is freed
383 */
384static int timer_fixup_free(void *addr, enum debug_obj_state state)
385{
386 struct timer_list *timer = addr;
387
388 switch (state) {
389 case ODEBUG_STATE_ACTIVE:
390 del_timer_sync(timer);
391 debug_object_free(timer, &timer_debug_descr);
392 return 1;
393 default:
394 return 0;
395 }
396}
397
398static struct debug_obj_descr timer_debug_descr = {
399 .name = "timer_list",
400 .fixup_init = timer_fixup_init,
401 .fixup_activate = timer_fixup_activate,
402 .fixup_free = timer_fixup_free,
403};
404
405static inline void debug_timer_init(struct timer_list *timer)
406{
407 debug_object_init(timer, &timer_debug_descr);
408}
409
410static inline void debug_timer_activate(struct timer_list *timer)
411{
412 debug_object_activate(timer, &timer_debug_descr);
413}
414
415static inline void debug_timer_deactivate(struct timer_list *timer)
416{
417 debug_object_deactivate(timer, &timer_debug_descr);
418}
419
420static inline void debug_timer_free(struct timer_list *timer)
421{
422 debug_object_free(timer, &timer_debug_descr);
423}
424
425static void __init_timer(struct timer_list *timer);
426
427void init_timer_on_stack(struct timer_list *timer)
428{
429 debug_object_init_on_stack(timer, &timer_debug_descr);
430 __init_timer(timer);
431}
432EXPORT_SYMBOL_GPL(init_timer_on_stack);
433
434void destroy_timer_on_stack(struct timer_list *timer)
435{
436 debug_object_free(timer, &timer_debug_descr);
437}
438EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
439
440#else
441static inline void debug_timer_init(struct timer_list *timer) { }
442static inline void debug_timer_activate(struct timer_list *timer) { }
443static inline void debug_timer_deactivate(struct timer_list *timer) { }
444#endif
445
446static void __init_timer(struct timer_list *timer)
331{ 447{
332 timer->entry.next = NULL; 448 timer->entry.next = NULL;
333 timer->base = __raw_get_cpu_var(tvec_bases); 449 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -337,6 +453,19 @@ void init_timer(struct timer_list *timer)
337 memset(timer->start_comm, 0, TASK_COMM_LEN); 453 memset(timer->start_comm, 0, TASK_COMM_LEN);
338#endif 454#endif
339} 455}
456
457/**
458 * init_timer - initialize a timer.
459 * @timer: the timer to be initialized
460 *
461 * init_timer() must be done to a timer prior calling *any* of the
462 * other timer functions.
463 */
464void init_timer(struct timer_list *timer)
465{
466 debug_timer_init(timer);
467 __init_timer(timer);
468}
340EXPORT_SYMBOL(init_timer); 469EXPORT_SYMBOL(init_timer);
341 470
342void init_timer_deferrable(struct timer_list *timer) 471void init_timer_deferrable(struct timer_list *timer)
@@ -351,6 +480,8 @@ static inline void detach_timer(struct timer_list *timer,
351{ 480{
352 struct list_head *entry = &timer->entry; 481 struct list_head *entry = &timer->entry;
353 482
483 debug_timer_deactivate(timer);
484
354 __list_del(entry->prev, entry->next); 485 __list_del(entry->prev, entry->next);
355 if (clear_pending) 486 if (clear_pending)
356 entry->next = NULL; 487 entry->next = NULL;
@@ -405,6 +536,8 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
405 ret = 1; 536 ret = 1;
406 } 537 }
407 538
539 debug_timer_activate(timer);
540
408 new_base = __get_cpu_var(tvec_bases); 541 new_base = __get_cpu_var(tvec_bases);
409 542
410 if (base != new_base) { 543 if (base != new_base) {
@@ -450,6 +583,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
450 BUG_ON(timer_pending(timer) || !timer->function); 583 BUG_ON(timer_pending(timer) || !timer->function);
451 spin_lock_irqsave(&base->lock, flags); 584 spin_lock_irqsave(&base->lock, flags);
452 timer_set_base(timer, base); 585 timer_set_base(timer, base);
586 debug_timer_activate(timer);
453 internal_add_timer(base, timer); 587 internal_add_timer(base, timer);
454 /* 588 /*
455 * Check whether the other CPU is idle and needs to be 589 * Check whether the other CPU is idle and needs to be
@@ -1086,11 +1220,14 @@ signed long __sched schedule_timeout(signed long timeout)
1086 1220
1087 expire = timeout + jiffies; 1221 expire = timeout + jiffies;
1088 1222
1089 setup_timer(&timer, process_timeout, (unsigned long)current); 1223 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1090 __mod_timer(&timer, expire); 1224 __mod_timer(&timer, expire);
1091 schedule(); 1225 schedule();
1092 del_singleshot_timer_sync(&timer); 1226 del_singleshot_timer_sync(&timer);
1093 1227
1228 /* Remove the timer from the object tracker */
1229 destroy_timer_on_stack(&timer);
1230
1094 timeout = expire - jiffies; 1231 timeout = expire - jiffies;
1095 1232
1096 out: 1233 out:
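
(The schedule_timeout() hunk above shows the intended life cycle for on-stack timers under CONFIG_DEBUG_OBJECTS_TIMERS: setup_timer_on_stack() or init_timer_on_stack() registers the object, and destroy_timer_on_stack() must run before the stack frame disappears. A rough in-kernel sketch of the same pattern -- the demo_* names are hypothetical, not from this patch:

        #include <linux/timer.h>
        #include <linux/sched.h>

        static void demo_timer_fn(unsigned long data)
        {
                wake_up_process((struct task_struct *)data);
        }

        /* Sleep until @expires using a timer that lives on this stack frame. */
        static void demo_sleep_until(unsigned long expires)
        {
                struct timer_list t;

                /* Registers the timer with the object tracker as stack-based. */
                setup_timer_on_stack(&t, demo_timer_fn, (unsigned long)current);

                set_current_state(TASK_UNINTERRUPTIBLE);
                mod_timer(&t, expires);
                schedule();

                del_singleshot_timer_sync(&t);
                /* Must pair with the _on_stack init, otherwise the tracker
                 * later sees an object freed without being destroyed. */
                destroy_timer_on_stack(&t);
        }
)
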
diff --git a/kernel/user.c b/kernel/user.c
index debce602bfdd..865ecf57a096 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -53,10 +53,6 @@ struct user_struct root_user = {
53 .files = ATOMIC_INIT(0), 53 .files = ATOMIC_INIT(0),
54 .sigpending = ATOMIC_INIT(0), 54 .sigpending = ATOMIC_INIT(0),
55 .locked_shm = 0, 55 .locked_shm = 0,
56#ifdef CONFIG_KEYS
57 .uid_keyring = &root_user_keyring,
58 .session_keyring = &root_session_keyring,
59#endif
60#ifdef CONFIG_USER_SCHED 56#ifdef CONFIG_USER_SCHED
61 .tg = &init_task_group, 57 .tg = &init_task_group,
62#endif 58#endif
@@ -388,7 +384,7 @@ void free_uid(struct user_struct *up)
388 local_irq_restore(flags); 384 local_irq_restore(flags);
389} 385}
390 386
391struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 387struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
392{ 388{
393 struct hlist_head *hashent = uidhashentry(ns, uid); 389 struct hlist_head *hashent = uidhashentry(ns, uid);
394 struct user_struct *up, *new; 390 struct user_struct *up, *new;
@@ -403,29 +399,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
403 spin_unlock_irq(&uidhash_lock); 399 spin_unlock_irq(&uidhash_lock);
404 400
405 if (!up) { 401 if (!up) {
406 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); 402 new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
407 if (!new) 403 if (!new)
408 goto out_unlock; 404 goto out_unlock;
409 405
410 new->uid = uid; 406 new->uid = uid;
411 atomic_set(&new->__count, 1); 407 atomic_set(&new->__count, 1);
412 atomic_set(&new->processes, 0);
413 atomic_set(&new->files, 0);
414 atomic_set(&new->sigpending, 0);
415#ifdef CONFIG_INOTIFY_USER
416 atomic_set(&new->inotify_watches, 0);
417 atomic_set(&new->inotify_devs, 0);
418#endif
419#ifdef CONFIG_POSIX_MQUEUE
420 new->mq_bytes = 0;
421#endif
422 new->locked_shm = 0;
423
424 if (alloc_uid_keyring(new, current) < 0)
425 goto out_free_user;
426 408
427 if (sched_create_user(new) < 0) 409 if (sched_create_user(new) < 0)
428 goto out_put_keys; 410 goto out_free_user;
429 411
430 if (uids_user_create(new)) 412 if (uids_user_create(new))
431 goto out_destoy_sched; 413 goto out_destoy_sched;
@@ -459,9 +441,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
459 441
460out_destoy_sched: 442out_destoy_sched:
461 sched_destroy_user(new); 443 sched_destroy_user(new);
462out_put_keys:
463 key_put(new->uid_keyring);
464 key_put(new->session_keyring);
465out_free_user: 444out_free_user:
466 kmem_cache_free(uid_cachep, new); 445 kmem_cache_free(uid_cachep, new);
467out_unlock: 446out_unlock:
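
(The alloc_uid() change above relies on kmem_cache_zalloc() handing back a zeroed object, which is what lets the long run of atomic_set(..., 0) and "= 0" initializations disappear. A tiny illustrative fragment of the same idiom; the struct and names are made up and the cache is assumed to exist:

        #include <linux/slab.h>

        struct demo_user {
                int refs;
                unsigned long locked_shm;
                unsigned long mq_bytes;
        };

        static struct kmem_cache *demo_cachep;

        static struct demo_user *demo_alloc_user(void)
        {
                /* Zeroed allocation: every field starts at 0/NULL, so only
                 * non-zero initial values need explicit assignment. */
                struct demo_user *new = kmem_cache_zalloc(demo_cachep, GFP_KERNEL);

                if (new)
                        new->refs = 1;
                return new;
        }
)
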
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4c9006275df7..a9ab0596de44 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/version.h> 9#include <linux/version.h>
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/slab.h>
11#include <linux/user_namespace.h> 12#include <linux/user_namespace.h>
12 13
13/* 14/*
@@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref)
73 release_uids(ns); 74 release_uids(ns);
74 kfree(ns); 75 kfree(ns);
75} 76}
77EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 816d7b24fa03..64d398f12444 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h> 15#include <linux/version.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h>
17 18
18/* 19/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 20 * Clone a new ns copying an original utsname, setting refcount to 1
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 00ff4d08e370..29fc39f1029c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
158 * 158 *
159 * Returns 0 if @work was already on a queue, non-zero otherwise. 159 * Returns 0 if @work was already on a queue, non-zero otherwise.
160 * 160 *
161 * We queue the work to the CPU it was submitted, but there is no 161 * We queue the work to the CPU on which it was submitted, but if the CPU dies
162 * guarantee that it will be processed by that CPU. 162 * it can be processed by another CPU.
163 */ 163 */
164int queue_work(struct workqueue_struct *wq, struct work_struct *work) 164int queue_work(struct workqueue_struct *wq, struct work_struct *work)
165{ 165{
@@ -195,7 +195,6 @@ static void delayed_work_timer_fn(unsigned long __data)
195int queue_delayed_work(struct workqueue_struct *wq, 195int queue_delayed_work(struct workqueue_struct *wq,
196 struct delayed_work *dwork, unsigned long delay) 196 struct delayed_work *dwork, unsigned long delay)
197{ 197{
198 timer_stats_timer_set_start_info(&dwork->timer);
199 if (delay == 0) 198 if (delay == 0)
200 return queue_work(wq, &dwork->work); 199 return queue_work(wq, &dwork->work);
201 200
@@ -219,11 +218,12 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
219 struct timer_list *timer = &dwork->timer; 218 struct timer_list *timer = &dwork->timer;
220 struct work_struct *work = &dwork->work; 219 struct work_struct *work = &dwork->work;
221 220
222 timer_stats_timer_set_start_info(&dwork->timer);
223 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 221 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
224 BUG_ON(timer_pending(timer)); 222 BUG_ON(timer_pending(timer));
225 BUG_ON(!list_empty(&work->entry)); 223 BUG_ON(!list_empty(&work->entry));
226 224
225 timer_stats_timer_set_start_info(&dwork->timer);
226
227 /* This stores cwq for the moment, for the timer_fn */ 227 /* This stores cwq for the moment, for the timer_fn */
228 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 228 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
229 timer->expires = jiffies + delay; 229 timer->expires = jiffies + delay;
@@ -247,7 +247,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
247 if (cwq->run_depth > 3) { 247 if (cwq->run_depth > 3) {
248 /* morton gets to eat his hat */ 248 /* morton gets to eat his hat */
249 printk("%s: recursion depth exceeded: %d\n", 249 printk("%s: recursion depth exceeded: %d\n",
250 __FUNCTION__, cwq->run_depth); 250 __func__, cwq->run_depth);
251 dump_stack(); 251 dump_stack();
252 } 252 }
253 while (!list_empty(&cwq->worklist)) { 253 while (!list_empty(&cwq->worklist)) {
@@ -564,7 +564,6 @@ EXPORT_SYMBOL(schedule_work);
564int schedule_delayed_work(struct delayed_work *dwork, 564int schedule_delayed_work(struct delayed_work *dwork,
565 unsigned long delay) 565 unsigned long delay)
566{ 566{
567 timer_stats_timer_set_start_info(&dwork->timer);
568 return queue_delayed_work(keventd_wq, dwork, delay); 567 return queue_delayed_work(keventd_wq, dwork, delay);
569} 568}
570EXPORT_SYMBOL(schedule_delayed_work); 569EXPORT_SYMBOL(schedule_delayed_work);
@@ -581,7 +580,6 @@ EXPORT_SYMBOL(schedule_delayed_work);
581int schedule_delayed_work_on(int cpu, 580int schedule_delayed_work_on(int cpu,
582 struct delayed_work *dwork, unsigned long delay) 581 struct delayed_work *dwork, unsigned long delay)
583{ 582{
584 timer_stats_timer_set_start_info(&dwork->timer);
585 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 583 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
586} 584}
587EXPORT_SYMBOL(schedule_delayed_work_on); 585EXPORT_SYMBOL(schedule_delayed_work_on);
@@ -772,7 +770,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
772} 770}
773EXPORT_SYMBOL_GPL(__create_workqueue_key); 771EXPORT_SYMBOL_GPL(__create_workqueue_key);
774 772
775static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
776{ 774{
777 /* 775 /*
778 * Our caller is either destroy_workqueue() or CPU_DEAD, 776 * Our caller is either destroy_workqueue() or CPU_DEAD,
@@ -808,19 +806,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
808void destroy_workqueue(struct workqueue_struct *wq) 806void destroy_workqueue(struct workqueue_struct *wq)
809{ 807{
810 const cpumask_t *cpu_map = wq_cpu_map(wq); 808 const cpumask_t *cpu_map = wq_cpu_map(wq);
811 struct cpu_workqueue_struct *cwq;
812 int cpu; 809 int cpu;
813 810
814 get_online_cpus(); 811 get_online_cpus();
815 spin_lock(&workqueue_lock); 812 spin_lock(&workqueue_lock);
816 list_del(&wq->list); 813 list_del(&wq->list);
817 spin_unlock(&workqueue_lock); 814 spin_unlock(&workqueue_lock);
818 put_online_cpus();
819 815
820 for_each_cpu_mask(cpu, *cpu_map) { 816 for_each_cpu_mask(cpu, *cpu_map)
821 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 817 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
822 cleanup_workqueue_thread(cwq, cpu); 818 put_online_cpus();
823 }
824 819
825 free_percpu(wq->cpu_wq); 820 free_percpu(wq->cpu_wq);
826 kfree(wq); 821 kfree(wq);
@@ -838,7 +833,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
838 action &= ~CPU_TASKS_FROZEN; 833 action &= ~CPU_TASKS_FROZEN;
839 834
840 switch (action) { 835 switch (action) {
841
842 case CPU_UP_PREPARE: 836 case CPU_UP_PREPARE:
843 cpu_set(cpu, cpu_populated_map); 837 cpu_set(cpu, cpu_populated_map);
844 } 838 }
@@ -861,11 +855,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
861 case CPU_UP_CANCELED: 855 case CPU_UP_CANCELED:
862 start_workqueue_thread(cwq, -1); 856 start_workqueue_thread(cwq, -1);
863 case CPU_DEAD: 857 case CPU_DEAD:
864 cleanup_workqueue_thread(cwq, cpu); 858 cleanup_workqueue_thread(cwq);
865 break; 859 break;
866 } 860 }
867 } 861 }
868 862
863 switch (action) {
864 case CPU_UP_CANCELED:
865 case CPU_DEAD:
866 cpu_clear(cpu, cpu_populated_map);
867 }
868
869 return NOTIFY_OK; 869 return NOTIFY_OK;
870} 870}
871 871
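
(To tie the workqueue hunks together: the timer_stats bookkeeping for delayed work now happens once, inside queue_delayed_work_on() after the PENDING bit has been won, and the updated queue_work() comment states explicitly that work queued on one CPU may run on another if that CPU goes down. A small hedged module sketch of the delayed-work API these hunks touch -- the demo_* names are hypothetical:

        #include <linux/module.h>
        #include <linux/init.h>
        #include <linux/workqueue.h>

        static void demo_work_fn(struct work_struct *work)
        {
                printk(KERN_INFO "demo delayed work executed\n");
        }

        static DECLARE_DELAYED_WORK(demo_dwork, demo_work_fn);

        static int __init demo_init(void)
        {
                /* Queued on the submitting CPU, but it may be processed by
                 * another CPU if the original one goes offline. */
                schedule_delayed_work(&demo_dwork, HZ / 2);
                return 0;
        }

        static void __exit demo_exit(void)
        {
                cancel_delayed_work_sync(&demo_dwork);
        }

        module_init(demo_init);
        module_exit(demo_exit);
        MODULE_LICENSE("GPL");
)
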