Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 279
-rw-r--r--  kernel/audit.h | 38
-rw-r--r--  kernel/auditfilter.c | 154
-rw-r--r--  kernel/auditsc.c | 114
-rw-r--r--  kernel/bounds.c | 19
-rw-r--r--  kernel/cgroup.c | 387
-rw-r--r--  kernel/cgroup_debug.c | 20
-rw-r--r--  kernel/compat.c | 23
-rw-r--r--  kernel/configs.c | 7
-rw-r--r--  kernel/cpu.c | 50
-rw-r--r--  kernel/cpuset.c | 444
-rw-r--r--  kernel/dma.c | 7
-rw-r--r--  kernel/exit.c | 185
-rw-r--r--  kernel/fork.c | 152
-rw-r--r--  kernel/futex.c | 23
-rw-r--r--  kernel/futex_compat.c | 2
-rw-r--r--  kernel/hrtimer.c | 294
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/devres.c | 1
-rw-r--r--  kernel/irq/manage.c | 50
-rw-r--r--  kernel/irq/spurious.c | 4
-rw-r--r--  kernel/kallsyms.c | 6
-rw-r--r--  kernel/kexec.c | 6
-rw-r--r--  kernel/kgdb.c | 1700
-rw-r--r--  kernel/kmod.c | 3
-rw-r--r--  kernel/kprobes.c | 349
-rw-r--r--  kernel/kthread.c | 6
-rw-r--r--  kernel/latencytop.c | 36
-rw-r--r--  kernel/lockdep_proc.c | 16
-rw-r--r--  kernel/marker.c | 12
-rw-r--r--  kernel/module.c | 322
-rw-r--r--  kernel/notifier.c | 38
-rw-r--r--  kernel/ns_cgroup.c | 2
-rw-r--r--  kernel/nsproxy.c | 12
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/pid.c | 41
-rw-r--r--  kernel/pid_namespace.c | 4
-rw-r--r--  kernel/posix-cpu-timers.c | 41
-rw-r--r--  kernel/posix-timers.c | 7
-rw-r--r--  kernel/power/Kconfig | 10
-rw-r--r--  kernel/power/Makefile | 1
-rw-r--r--  kernel/power/console.c | 27
-rw-r--r--  kernel/power/pm.c | 205
-rw-r--r--  kernel/printk.c | 139
-rw-r--r--  kernel/profile.c | 5
-rw-r--r--  kernel/ptrace.c | 70
-rw-r--r--  kernel/rcupreempt.c | 4
-rw-r--r--  kernel/rcutorture.c | 16
-rw-r--r--  kernel/relay.c | 44
-rw-r--r--  kernel/res_counter.c | 10
-rw-r--r--  kernel/resource.c | 28
-rw-r--r--  kernel/sched.c | 1966
-rw-r--r--  kernel/sched_debug.c | 45
-rw-r--r--  kernel/sched_fair.c | 578
-rw-r--r--  kernel/sched_features.h | 10
-rw-r--r--  kernel/sched_rt.c | 227
-rw-r--r--  kernel/sched_stats.h | 8
-rw-r--r--  kernel/semaphore.c | 264
-rw-r--r--  kernel/signal.c | 719
-rw-r--r--  kernel/softirq.c | 67
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sys.c | 150
-rw-r--r--  kernel/sysctl.c | 191
-rw-r--r--  kernel/taskstats.c | 6
-rw-r--r--  kernel/time.c | 64
-rw-r--r--  kernel/time/clocksource.c | 32
-rw-r--r--  kernel/time/ntp.c | 398
-rw-r--r--  kernel/time/tick-broadcast.c | 4
-rw-r--r--  kernel/time/tick-common.c | 4
-rw-r--r--  kernel/time/tick-oneshot.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 12
-rw-r--r--  kernel/time/timekeeping.c | 19
-rw-r--r--  kernel/time/timer_list.c | 5
-rw-r--r--  kernel/time/timer_stats.c | 5
-rw-r--r--  kernel/timeconst.pl | 120
-rw-r--r--  kernel/timer.c | 179
-rw-r--r--  kernel/uid16.c | 22
-rw-r--r--  kernel/user.c | 57
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/utsname.c | 1
-rw-r--r--  kernel/workqueue.c | 30
82 files changed, 7504 insertions, 3117 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c584c55a6e9..188c43223f52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,10 +8,10 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o 12 notifier.o ksysfs.o pm_qos_params.o
13 13
14obj-$(CONFIG_SYSCTL) += sysctl_check.o 14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
15obj-$(CONFIG_STACKTRACE) += stacktrace.o 15obj-$(CONFIG_STACKTRACE) += stacktrace.o
16obj-y += time/ 16obj-y += time/
17obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 17obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
53obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 53obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
54obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 54obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
55obj-$(CONFIG_KPROBES) += kprobes.o 55obj-$(CONFIG_KPROBES) += kprobes.o
56obj-$(CONFIG_KGDB) += kgdb.o
56obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 57obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
57obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 58obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
58obj-$(CONFIG_SECCOMP) += seccomp.o 59obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/audit.c b/kernel/audit.c
index be55cb503633..b7d3709cc452 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -21,7 +21,7 @@
21 * 21 *
22 * Written by Rickard E. (Rik) Faith <faith@redhat.com> 22 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
23 * 23 *
24 * Goals: 1) Integrate fully with SELinux. 24 * Goals: 1) Integrate fully with Security Modules.
25 * 2) Minimal run-time overhead: 25 * 2) Minimal run-time overhead:
26 * a) Minimal when syscall auditing is disabled (audit_enable=0). 26 * a) Minimal when syscall auditing is disabled (audit_enable=0).
27 * b) Small when syscall auditing is enabled and no audit record 27 * b) Small when syscall auditing is enabled and no audit record
@@ -55,7 +55,6 @@
55#include <net/netlink.h> 55#include <net/netlink.h>
56#include <linux/skbuff.h> 56#include <linux/skbuff.h>
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h>
59#include <linux/inotify.h> 58#include <linux/inotify.h>
60#include <linux/freezer.h> 59#include <linux/freezer.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
@@ -127,6 +126,8 @@ static int audit_freelist_count;
127static LIST_HEAD(audit_freelist); 126static LIST_HEAD(audit_freelist);
128 127
129static struct sk_buff_head audit_skb_queue; 128static struct sk_buff_head audit_skb_queue;
129/* queue of skbs to send to auditd when/if it comes back */
130static struct sk_buff_head audit_skb_hold_queue;
130static struct task_struct *kauditd_task; 131static struct task_struct *kauditd_task;
131static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); 132static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
132static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
@@ -155,6 +156,11 @@ struct audit_buffer {
155 gfp_t gfp_mask; 156 gfp_t gfp_mask;
156}; 157};
157 158
159struct audit_reply {
160 int pid;
161 struct sk_buff *skb;
162};
163
158static void audit_set_pid(struct audit_buffer *ab, pid_t pid) 164static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
159{ 165{
160 if (ab) { 166 if (ab) {
@@ -253,25 +259,26 @@ void audit_log_lost(const char *message)
253} 259}
254 260
255static int audit_log_config_change(char *function_name, int new, int old, 261static int audit_log_config_change(char *function_name, int new, int old,
256 uid_t loginuid, u32 sid, int allow_changes) 262 uid_t loginuid, u32 sessionid, u32 sid,
263 int allow_changes)
257{ 264{
258 struct audit_buffer *ab; 265 struct audit_buffer *ab;
259 int rc = 0; 266 int rc = 0;
260 267
261 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 268 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
262 audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new, 269 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
263 old, loginuid); 270 old, loginuid, sessionid);
264 if (sid) { 271 if (sid) {
265 char *ctx = NULL; 272 char *ctx = NULL;
266 u32 len; 273 u32 len;
267 274
268 rc = selinux_sid_to_string(sid, &ctx, &len); 275 rc = security_secid_to_secctx(sid, &ctx, &len);
269 if (rc) { 276 if (rc) {
270 audit_log_format(ab, " sid=%u", sid); 277 audit_log_format(ab, " sid=%u", sid);
271 allow_changes = 0; /* Something weird, deny request */ 278 allow_changes = 0; /* Something weird, deny request */
272 } else { 279 } else {
273 audit_log_format(ab, " subj=%s", ctx); 280 audit_log_format(ab, " subj=%s", ctx);
274 kfree(ctx); 281 security_release_secctx(ctx, len);
275 } 282 }
276 } 283 }
277 audit_log_format(ab, " res=%d", allow_changes); 284 audit_log_format(ab, " res=%d", allow_changes);
@@ -280,7 +287,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
280} 287}
281 288
282static int audit_do_config_change(char *function_name, int *to_change, 289static int audit_do_config_change(char *function_name, int *to_change,
283 int new, uid_t loginuid, u32 sid) 290 int new, uid_t loginuid, u32 sessionid,
291 u32 sid)
284{ 292{
285 int allow_changes, rc = 0, old = *to_change; 293 int allow_changes, rc = 0, old = *to_change;
286 294
@@ -291,8 +299,8 @@ static int audit_do_config_change(char *function_name, int *to_change,
291 allow_changes = 1; 299 allow_changes = 1;
292 300
293 if (audit_enabled != AUDIT_OFF) { 301 if (audit_enabled != AUDIT_OFF) {
294 rc = audit_log_config_change(function_name, new, old, 302 rc = audit_log_config_change(function_name, new, old, loginuid,
295 loginuid, sid, allow_changes); 303 sessionid, sid, allow_changes);
296 if (rc) 304 if (rc)
297 allow_changes = 0; 305 allow_changes = 0;
298 } 306 }
@@ -306,26 +314,28 @@ static int audit_do_config_change(char *function_name, int *to_change,
306 return rc; 314 return rc;
307} 315}
308 316
309static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) 317static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
318 u32 sid)
310{ 319{
311 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, 320 return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
312 limit, loginuid, sid); 321 limit, loginuid, sessionid, sid);
313} 322}
314 323
315static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) 324static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
325 u32 sid)
316{ 326{
317 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, 327 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
318 limit, loginuid, sid); 328 limit, loginuid, sessionid, sid);
319} 329}
320 330
321static int audit_set_enabled(int state, uid_t loginuid, u32 sid) 331static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
322{ 332{
323 int rc; 333 int rc;
324 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 334 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
325 return -EINVAL; 335 return -EINVAL;
326 336
327 rc = audit_do_config_change("audit_enabled", &audit_enabled, state, 337 rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
328 loginuid, sid); 338 loginuid, sessionid, sid);
329 339
330 if (!rc) 340 if (!rc)
331 audit_ever_enabled |= !!state; 341 audit_ever_enabled |= !!state;
@@ -333,7 +343,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
333 return rc; 343 return rc;
334} 344}
335 345
336static int audit_set_failure(int state, uid_t loginuid, u32 sid) 346static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
337{ 347{
338 if (state != AUDIT_FAIL_SILENT 348 if (state != AUDIT_FAIL_SILENT
339 && state != AUDIT_FAIL_PRINTK 349 && state != AUDIT_FAIL_PRINTK
@@ -341,7 +351,43 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
341 return -EINVAL; 351 return -EINVAL;
342 352
343 return audit_do_config_change("audit_failure", &audit_failure, state, 353 return audit_do_config_change("audit_failure", &audit_failure, state,
344 loginuid, sid); 354 loginuid, sessionid, sid);
355}
356
357/*
358 * Queue skbs to be sent to auditd when/if it comes back. These skbs should
359 * already have been sent via prink/syslog and so if these messages are dropped
360 * it is not a huge concern since we already passed the audit_log_lost()
361 * notification and stuff. This is just nice to get audit messages during
362 * boot before auditd is running or messages generated while auditd is stopped.
363 * This only holds messages is audit_default is set, aka booting with audit=1
364 * or building your kernel that way.
365 */
366static void audit_hold_skb(struct sk_buff *skb)
367{
368 if (audit_default &&
369 skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
370 skb_queue_tail(&audit_skb_hold_queue, skb);
371 else
372 kfree_skb(skb);
373}
374
375static void kauditd_send_skb(struct sk_buff *skb)
376{
377 int err;
378 /* take a reference in case we can't send it and we want to hold it */
379 skb_get(skb);
380 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
381 if (err < 0) {
382 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
383 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
384 audit_log_lost("auditd dissapeared\n");
385 audit_pid = 0;
386 /* we might get lucky and get this in the next auditd */
387 audit_hold_skb(skb);
388 } else
389 /* drop the extra reference if sent ok */
390 kfree_skb(skb);
345} 391}
346 392
347static int kauditd_thread(void *dummy) 393static int kauditd_thread(void *dummy)
@@ -350,24 +396,41 @@ static int kauditd_thread(void *dummy)
350 396
351 set_freezable(); 397 set_freezable();
352 while (!kthread_should_stop()) { 398 while (!kthread_should_stop()) {
399 /*
400 * if auditd just started drain the queue of messages already
401 * sent to syslog/printk. remember loss here is ok. we already
402 * called audit_log_lost() if it didn't go out normally. so the
403 * race between the skb_dequeue and the next check for audit_pid
404 * doesn't matter.
405 *
406 * if you ever find kauditd to be too slow we can get a perf win
407 * by doing our own locking and keeping better track if there
408 * are messages in this queue. I don't see the need now, but
409 * in 5 years when I want to play with this again I'll see this
410 * note and still have no friggin idea what i'm thinking today.
411 */
412 if (audit_default && audit_pid) {
413 skb = skb_dequeue(&audit_skb_hold_queue);
414 if (unlikely(skb)) {
415 while (skb && audit_pid) {
416 kauditd_send_skb(skb);
417 skb = skb_dequeue(&audit_skb_hold_queue);
418 }
419 }
420 }
421
353 skb = skb_dequeue(&audit_skb_queue); 422 skb = skb_dequeue(&audit_skb_queue);
354 wake_up(&audit_backlog_wait); 423 wake_up(&audit_backlog_wait);
355 if (skb) { 424 if (skb) {
356 if (audit_pid) { 425 if (audit_pid)
357 int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 426 kauditd_send_skb(skb);
358 if (err < 0) { 427 else {
359 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
360 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
361 audit_log_lost("auditd dissapeared\n");
362 audit_pid = 0;
363 }
364 } else {
365 if (printk_ratelimit()) 428 if (printk_ratelimit())
366 printk(KERN_NOTICE "%s\n", skb->data + 429 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
367 NLMSG_SPACE(0));
368 else 430 else
369 audit_log_lost("printk limit exceeded\n"); 431 audit_log_lost("printk limit exceeded\n");
370 kfree_skb(skb); 432
433 audit_hold_skb(skb);
371 } 434 }
372 } else { 435 } else {
373 DECLARE_WAITQUEUE(wait, current); 436 DECLARE_WAITQUEUE(wait, current);
@@ -386,13 +449,13 @@ static int kauditd_thread(void *dummy)
386 return 0; 449 return 0;
387} 450}
388 451
389static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) 452static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
390{ 453{
391 struct task_struct *tsk; 454 struct task_struct *tsk;
392 int err; 455 int err;
393 456
394 read_lock(&tasklist_lock); 457 read_lock(&tasklist_lock);
395 tsk = find_task_by_pid(pid); 458 tsk = find_task_by_vpid(pid);
396 err = -ESRCH; 459 err = -ESRCH;
397 if (!tsk) 460 if (!tsk)
398 goto out; 461 goto out;
@@ -405,7 +468,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
405 if (err) 468 if (err)
406 goto out; 469 goto out;
407 470
408 tty_audit_push_task(tsk, loginuid); 471 tty_audit_push_task(tsk, loginuid, sessionid);
409out: 472out:
410 read_unlock(&tasklist_lock); 473 read_unlock(&tasklist_lock);
411 return err; 474 return err;
@@ -470,6 +533,19 @@ nlmsg_failure: /* Used by NLMSG_PUT */
470 return NULL; 533 return NULL;
471} 534}
472 535
536static int audit_send_reply_thread(void *arg)
537{
538 struct audit_reply *reply = (struct audit_reply *)arg;
539
540 mutex_lock(&audit_cmd_mutex);
541 mutex_unlock(&audit_cmd_mutex);
542
543 /* Ignore failure. It'll only happen if the sender goes away,
544 because our timeout is set to infinite. */
545 netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
546 kfree(reply);
547 return 0;
548}
473/** 549/**
474 * audit_send_reply - send an audit reply message via netlink 550 * audit_send_reply - send an audit reply message via netlink
475 * @pid: process id to send reply to 551 * @pid: process id to send reply to
@@ -486,14 +562,26 @@ nlmsg_failure: /* Used by NLMSG_PUT */
486void audit_send_reply(int pid, int seq, int type, int done, int multi, 562void audit_send_reply(int pid, int seq, int type, int done, int multi,
487 void *payload, int size) 563 void *payload, int size)
488{ 564{
489 struct sk_buff *skb; 565 struct sk_buff *skb;
566 struct task_struct *tsk;
567 struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
568 GFP_KERNEL);
569
570 if (!reply)
571 return;
572
490 skb = audit_make_reply(pid, seq, type, done, multi, payload, size); 573 skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
491 if (!skb) 574 if (!skb)
492 return; 575 return;
493 /* Ignore failure. It'll only happen if the sender goes away, 576
494 because our timeout is set to infinite. */ 577 reply->pid = pid;
495 netlink_unicast(audit_sock, skb, pid, 0); 578 reply->skb = skb;
496 return; 579
580 tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
581 if (IS_ERR(tsk)) {
582 kfree(reply);
583 kfree_skb(skb);
584 }
497} 585}
498 586
499/* 587/*
@@ -535,7 +623,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
535} 623}
536 624
537static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, 625static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
538 u32 pid, u32 uid, uid_t auid, u32 sid) 626 u32 pid, u32 uid, uid_t auid, u32 ses,
627 u32 sid)
539{ 628{
540 int rc = 0; 629 int rc = 0;
541 char *ctx = NULL; 630 char *ctx = NULL;
@@ -547,15 +636,16 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
547 } 636 }
548 637
549 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 638 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
550 audit_log_format(*ab, "user pid=%d uid=%u auid=%u", 639 audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
551 pid, uid, auid); 640 pid, uid, auid, ses);
552 if (sid) { 641 if (sid) {
553 rc = selinux_sid_to_string(sid, &ctx, &len); 642 rc = security_secid_to_secctx(sid, &ctx, &len);
554 if (rc) 643 if (rc)
555 audit_log_format(*ab, " ssid=%u", sid); 644 audit_log_format(*ab, " ssid=%u", sid);
556 else 645 else {
557 audit_log_format(*ab, " subj=%s", ctx); 646 audit_log_format(*ab, " subj=%s", ctx);
558 kfree(ctx); 647 security_release_secctx(ctx, len);
648 }
559 } 649 }
560 650
561 return rc; 651 return rc;
@@ -570,6 +660,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
570 struct audit_buffer *ab; 660 struct audit_buffer *ab;
571 u16 msg_type = nlh->nlmsg_type; 661 u16 msg_type = nlh->nlmsg_type;
572 uid_t loginuid; /* loginuid of sender */ 662 uid_t loginuid; /* loginuid of sender */
663 u32 sessionid;
573 struct audit_sig_info *sig_data; 664 struct audit_sig_info *sig_data;
574 char *ctx = NULL; 665 char *ctx = NULL;
575 u32 len; 666 u32 len;
@@ -591,6 +682,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
591 pid = NETLINK_CREDS(skb)->pid; 682 pid = NETLINK_CREDS(skb)->pid;
592 uid = NETLINK_CREDS(skb)->uid; 683 uid = NETLINK_CREDS(skb)->uid;
593 loginuid = NETLINK_CB(skb).loginuid; 684 loginuid = NETLINK_CB(skb).loginuid;
685 sessionid = NETLINK_CB(skb).sessionid;
594 sid = NETLINK_CB(skb).sid; 686 sid = NETLINK_CB(skb).sid;
595 seq = nlh->nlmsg_seq; 687 seq = nlh->nlmsg_seq;
596 data = NLMSG_DATA(nlh); 688 data = NLMSG_DATA(nlh);
@@ -613,12 +705,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
613 status_get = (struct audit_status *)data; 705 status_get = (struct audit_status *)data;
614 if (status_get->mask & AUDIT_STATUS_ENABLED) { 706 if (status_get->mask & AUDIT_STATUS_ENABLED) {
615 err = audit_set_enabled(status_get->enabled, 707 err = audit_set_enabled(status_get->enabled,
616 loginuid, sid); 708 loginuid, sessionid, sid);
617 if (err < 0) return err; 709 if (err < 0) return err;
618 } 710 }
619 if (status_get->mask & AUDIT_STATUS_FAILURE) { 711 if (status_get->mask & AUDIT_STATUS_FAILURE) {
620 err = audit_set_failure(status_get->failure, 712 err = audit_set_failure(status_get->failure,
621 loginuid, sid); 713 loginuid, sessionid, sid);
622 if (err < 0) return err; 714 if (err < 0) return err;
623 } 715 }
624 if (status_get->mask & AUDIT_STATUS_PID) { 716 if (status_get->mask & AUDIT_STATUS_PID) {
@@ -627,17 +719,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
627 if (audit_enabled != AUDIT_OFF) 719 if (audit_enabled != AUDIT_OFF)
628 audit_log_config_change("audit_pid", new_pid, 720 audit_log_config_change("audit_pid", new_pid,
629 audit_pid, loginuid, 721 audit_pid, loginuid,
630 sid, 1); 722 sessionid, sid, 1);
631 723
632 audit_pid = new_pid; 724 audit_pid = new_pid;
633 audit_nlk_pid = NETLINK_CB(skb).pid; 725 audit_nlk_pid = NETLINK_CB(skb).pid;
634 } 726 }
635 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 727 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
636 err = audit_set_rate_limit(status_get->rate_limit, 728 err = audit_set_rate_limit(status_get->rate_limit,
637 loginuid, sid); 729 loginuid, sessionid, sid);
638 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 730 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
639 err = audit_set_backlog_limit(status_get->backlog_limit, 731 err = audit_set_backlog_limit(status_get->backlog_limit,
640 loginuid, sid); 732 loginuid, sessionid, sid);
641 break; 733 break;
642 case AUDIT_USER: 734 case AUDIT_USER:
643 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 735 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -649,12 +741,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
649 if (err == 1) { 741 if (err == 1) {
650 err = 0; 742 err = 0;
651 if (msg_type == AUDIT_USER_TTY) { 743 if (msg_type == AUDIT_USER_TTY) {
652 err = audit_prepare_user_tty(pid, loginuid); 744 err = audit_prepare_user_tty(pid, loginuid,
745 sessionid);
653 if (err) 746 if (err)
654 break; 747 break;
655 } 748 }
656 audit_log_common_recv_msg(&ab, msg_type, pid, uid, 749 audit_log_common_recv_msg(&ab, msg_type, pid, uid,
657 loginuid, sid); 750 loginuid, sessionid, sid);
658 751
659 if (msg_type != AUDIT_USER_TTY) 752 if (msg_type != AUDIT_USER_TTY)
660 audit_log_format(ab, " msg='%.1024s'", 753 audit_log_format(ab, " msg='%.1024s'",
@@ -664,8 +757,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
664 757
665 audit_log_format(ab, " msg="); 758 audit_log_format(ab, " msg=");
666 size = nlmsg_len(nlh); 759 size = nlmsg_len(nlh);
667 audit_log_n_untrustedstring(ab, size, 760 audit_log_n_untrustedstring(ab, data, size);
668 data);
669 } 761 }
670 audit_set_pid(ab, pid); 762 audit_set_pid(ab, pid);
671 audit_log_end(ab); 763 audit_log_end(ab);
@@ -677,7 +769,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
677 return -EINVAL; 769 return -EINVAL;
678 if (audit_enabled == AUDIT_LOCKED) { 770 if (audit_enabled == AUDIT_LOCKED) {
679 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 771 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
680 uid, loginuid, sid); 772 uid, loginuid, sessionid, sid);
681 773
682 audit_log_format(ab, " audit_enabled=%d res=0", 774 audit_log_format(ab, " audit_enabled=%d res=0",
683 audit_enabled); 775 audit_enabled);
@@ -688,7 +780,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
688 case AUDIT_LIST: 780 case AUDIT_LIST:
689 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 781 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
690 uid, seq, data, nlmsg_len(nlh), 782 uid, seq, data, nlmsg_len(nlh),
691 loginuid, sid); 783 loginuid, sessionid, sid);
692 break; 784 break;
693 case AUDIT_ADD_RULE: 785 case AUDIT_ADD_RULE:
694 case AUDIT_DEL_RULE: 786 case AUDIT_DEL_RULE:
@@ -696,7 +788,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
696 return -EINVAL; 788 return -EINVAL;
697 if (audit_enabled == AUDIT_LOCKED) { 789 if (audit_enabled == AUDIT_LOCKED) {
698 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 790 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
699 uid, loginuid, sid); 791 uid, loginuid, sessionid, sid);
700 792
701 audit_log_format(ab, " audit_enabled=%d res=0", 793 audit_log_format(ab, " audit_enabled=%d res=0",
702 audit_enabled); 794 audit_enabled);
@@ -707,13 +799,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
707 case AUDIT_LIST_RULES: 799 case AUDIT_LIST_RULES:
708 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 800 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
709 uid, seq, data, nlmsg_len(nlh), 801 uid, seq, data, nlmsg_len(nlh),
710 loginuid, sid); 802 loginuid, sessionid, sid);
711 break; 803 break;
712 case AUDIT_TRIM: 804 case AUDIT_TRIM:
713 audit_trim_trees(); 805 audit_trim_trees();
714 806
715 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 807 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
716 uid, loginuid, sid); 808 uid, loginuid, sessionid, sid);
717 809
718 audit_log_format(ab, " op=trim res=1"); 810 audit_log_format(ab, " op=trim res=1");
719 audit_log_end(ab); 811 audit_log_end(ab);
@@ -721,21 +813,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
721 case AUDIT_MAKE_EQUIV: { 813 case AUDIT_MAKE_EQUIV: {
722 void *bufp = data; 814 void *bufp = data;
723 u32 sizes[2]; 815 u32 sizes[2];
724 size_t len = nlmsg_len(nlh); 816 size_t msglen = nlmsg_len(nlh);
725 char *old, *new; 817 char *old, *new;
726 818
727 err = -EINVAL; 819 err = -EINVAL;
728 if (len < 2 * sizeof(u32)) 820 if (msglen < 2 * sizeof(u32))
729 break; 821 break;
730 memcpy(sizes, bufp, 2 * sizeof(u32)); 822 memcpy(sizes, bufp, 2 * sizeof(u32));
731 bufp += 2 * sizeof(u32); 823 bufp += 2 * sizeof(u32);
732 len -= 2 * sizeof(u32); 824 msglen -= 2 * sizeof(u32);
733 old = audit_unpack_string(&bufp, &len, sizes[0]); 825 old = audit_unpack_string(&bufp, &msglen, sizes[0]);
734 if (IS_ERR(old)) { 826 if (IS_ERR(old)) {
735 err = PTR_ERR(old); 827 err = PTR_ERR(old);
736 break; 828 break;
737 } 829 }
738 new = audit_unpack_string(&bufp, &len, sizes[1]); 830 new = audit_unpack_string(&bufp, &msglen, sizes[1]);
739 if (IS_ERR(new)) { 831 if (IS_ERR(new)) {
740 err = PTR_ERR(new); 832 err = PTR_ERR(new);
741 kfree(old); 833 kfree(old);
@@ -745,7 +837,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
745 err = audit_tag_tree(old, new); 837 err = audit_tag_tree(old, new);
746 838
747 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, 839 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
748 uid, loginuid, sid); 840 uid, loginuid, sessionid, sid);
749 841
750 audit_log_format(ab, " op=make_equiv old="); 842 audit_log_format(ab, " op=make_equiv old=");
751 audit_log_untrustedstring(ab, old); 843 audit_log_untrustedstring(ab, old);
@@ -758,18 +850,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
758 break; 850 break;
759 } 851 }
760 case AUDIT_SIGNAL_INFO: 852 case AUDIT_SIGNAL_INFO:
761 err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); 853 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
762 if (err) 854 if (err)
763 return err; 855 return err;
764 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 856 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
765 if (!sig_data) { 857 if (!sig_data) {
766 kfree(ctx); 858 security_release_secctx(ctx, len);
767 return -ENOMEM; 859 return -ENOMEM;
768 } 860 }
769 sig_data->uid = audit_sig_uid; 861 sig_data->uid = audit_sig_uid;
770 sig_data->pid = audit_sig_pid; 862 sig_data->pid = audit_sig_pid;
771 memcpy(sig_data->ctx, ctx, len); 863 memcpy(sig_data->ctx, ctx, len);
772 kfree(ctx); 864 security_release_secctx(ctx, len);
773 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 865 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
774 0, 0, sig_data, sizeof(*sig_data) + len); 866 0, 0, sig_data, sizeof(*sig_data) + len);
775 kfree(sig_data); 867 kfree(sig_data);
@@ -779,7 +871,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
779 struct task_struct *tsk; 871 struct task_struct *tsk;
780 872
781 read_lock(&tasklist_lock); 873 read_lock(&tasklist_lock);
782 tsk = find_task_by_pid(pid); 874 tsk = find_task_by_vpid(pid);
783 if (!tsk) 875 if (!tsk)
784 err = -ESRCH; 876 err = -ESRCH;
785 else { 877 else {
@@ -802,7 +894,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
802 if (s->enabled != 0 && s->enabled != 1) 894 if (s->enabled != 0 && s->enabled != 1)
803 return -EINVAL; 895 return -EINVAL;
804 read_lock(&tasklist_lock); 896 read_lock(&tasklist_lock);
805 tsk = find_task_by_pid(pid); 897 tsk = find_task_by_vpid(pid);
806 if (!tsk) 898 if (!tsk)
807 err = -ESRCH; 899 err = -ESRCH;
808 else { 900 else {
@@ -877,14 +969,11 @@ static int __init audit_init(void)
877 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 969 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
878 970
879 skb_queue_head_init(&audit_skb_queue); 971 skb_queue_head_init(&audit_skb_queue);
972 skb_queue_head_init(&audit_skb_hold_queue);
880 audit_initialized = 1; 973 audit_initialized = 1;
881 audit_enabled = audit_default; 974 audit_enabled = audit_default;
882 audit_ever_enabled |= !!audit_default; 975 audit_ever_enabled |= !!audit_default;
883 976
884 /* Register the callback with selinux. This callback will be invoked
885 * when a new policy is loaded. */
886 selinux_audit_set_callback(&selinux_audit_rule_update);
887
888 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 977 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
889 978
890#ifdef CONFIG_AUDITSYSCALL 979#ifdef CONFIG_AUDITSYSCALL
@@ -1203,7 +1292,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
1203 * This function will take the passed buf and convert it into a string of 1292 * This function will take the passed buf and convert it into a string of
1204 * ascii hex digits. The new string is placed onto the skb. 1293 * ascii hex digits. The new string is placed onto the skb.
1205 */ 1294 */
1206void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, 1295void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
1207 size_t len) 1296 size_t len)
1208{ 1297{
1209 int i, avail, new_len; 1298 int i, avail, new_len;
@@ -1239,8 +1328,8 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
1239 * Format a string of no more than slen characters into the audit buffer, 1328 * Format a string of no more than slen characters into the audit buffer,
1240 * enclosed in quote marks. 1329 * enclosed in quote marks.
1241 */ 1330 */
1242static void audit_log_n_string(struct audit_buffer *ab, size_t slen, 1331void audit_log_n_string(struct audit_buffer *ab, const char *string,
1243 const char *string) 1332 size_t slen)
1244{ 1333{
1245 int avail, new_len; 1334 int avail, new_len;
1246 unsigned char *ptr; 1335 unsigned char *ptr;
@@ -1269,8 +1358,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1269 1358
1270/** 1359/**
1271 * audit_string_contains_control - does a string need to be logged in hex 1360 * audit_string_contains_control - does a string need to be logged in hex
1272 * @string - string to be checked 1361 * @string: string to be checked
1273 * @len - max length of the string to check 1362 * @len: max length of the string to check
1274 */ 1363 */
1275int audit_string_contains_control(const char *string, size_t len) 1364int audit_string_contains_control(const char *string, size_t len)
1276{ 1365{
@@ -1285,7 +1374,7 @@ int audit_string_contains_control(const char *string, size_t len)
1285/** 1374/**
1286 * audit_log_n_untrustedstring - log a string that may contain random characters 1375 * audit_log_n_untrustedstring - log a string that may contain random characters
1287 * @ab: audit_buffer 1376 * @ab: audit_buffer
1288 * @len: lenth of string (not including trailing null) 1377 * @len: length of string (not including trailing null)
1289 * @string: string to be logged 1378 * @string: string to be logged
1290 * 1379 *
1291 * This code will escape a string that is passed to it if the string 1380 * This code will escape a string that is passed to it if the string
@@ -1296,13 +1385,13 @@ int audit_string_contains_control(const char *string, size_t len)
1296 * The caller specifies the number of characters in the string to log, which may 1385 * The caller specifies the number of characters in the string to log, which may
1297 * or may not be the entire string. 1386 * or may not be the entire string.
1298 */ 1387 */
1299void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, 1388void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
1300 const char *string) 1389 size_t len)
1301{ 1390{
1302 if (audit_string_contains_control(string, len)) 1391 if (audit_string_contains_control(string, len))
1303 audit_log_hex(ab, string, len); 1392 audit_log_n_hex(ab, string, len);
1304 else 1393 else
1305 audit_log_n_string(ab, len, string); 1394 audit_log_n_string(ab, string, len);
1306} 1395}
1307 1396
1308/** 1397/**
@@ -1315,7 +1404,7 @@ void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1315 */ 1404 */
1316void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1405void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
1317{ 1406{
1318 audit_log_n_untrustedstring(ab, strlen(string), string); 1407 audit_log_n_untrustedstring(ab, string, strlen(string));
1319} 1408}
1320 1409
1321/* This is a helper-function to print the escaped d_path */ 1410/* This is a helper-function to print the escaped d_path */
@@ -1359,19 +1448,23 @@ void audit_log_end(struct audit_buffer *ab)
1359 audit_log_lost("rate limit exceeded"); 1448 audit_log_lost("rate limit exceeded");
1360 } else { 1449 } else {
1361 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1450 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1451 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
1452
1362 if (audit_pid) { 1453 if (audit_pid) {
1363 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
1364 skb_queue_tail(&audit_skb_queue, ab->skb); 1454 skb_queue_tail(&audit_skb_queue, ab->skb);
1365 ab->skb = NULL;
1366 wake_up_interruptible(&kauditd_wait); 1455 wake_up_interruptible(&kauditd_wait);
1367 } else if (nlh->nlmsg_type != AUDIT_EOE) { 1456 } else {
1368 if (printk_ratelimit()) { 1457 if (nlh->nlmsg_type != AUDIT_EOE) {
1369 printk(KERN_NOTICE "type=%d %s\n", 1458 if (printk_ratelimit()) {
1370 nlh->nlmsg_type, 1459 printk(KERN_NOTICE "type=%d %s\n",
1371 ab->skb->data + NLMSG_SPACE(0)); 1460 nlh->nlmsg_type,
1372 } else 1461 ab->skb->data + NLMSG_SPACE(0));
1373 audit_log_lost("printk limit exceeded\n"); 1462 } else
1463 audit_log_lost("printk limit exceeded\n");
1464 }
1465 audit_hold_skb(ab->skb);
1374 } 1466 }
1467 ab->skb = NULL;
1375 } 1468 }
1376 audit_buffer_free(ab); 1469 audit_buffer_free(ab);
1377} 1470}
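
Before the audit.h changes, a quick illustration of the secctx pattern that the audit.c hunks above apply in several places: security_secid_to_secctx() and security_release_secctx() replace the old selinux_sid_to_string()/kfree() pair. The helper name audit_log_subj_ctx below is hypothetical; this is a minimal sketch distilled from the diff, not code in the patch.

/* Hypothetical helper, sketching the secctx pattern used by the hunks above. */
static void audit_log_subj_ctx(struct audit_buffer *ab, u32 sid)
{
	char *ctx = NULL;
	u32 len;

	if (!sid)
		return;
	if (security_secid_to_secctx(sid, &ctx, &len)) {
		/* conversion failed: fall back to the raw secid */
		audit_log_format(ab, " ssid=%u", sid);
	} else {
		audit_log_format(ab, " subj=%s", ctx);
		/* the context string belongs to the LSM; don't kfree() it */
		security_release_secctx(ctx, len);
	}
}
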
diff --git a/kernel/audit.h b/kernel/audit.h
index 2554bd524fd1..9d6717412fec 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -65,40 +65,20 @@ struct audit_watch {
65 struct list_head rules; /* associated rules */ 65 struct list_head rules; /* associated rules */
66}; 66};
67 67
68struct audit_field {
69 u32 type;
70 u32 val;
71 u32 op;
72 char *se_str;
73 struct selinux_audit_rule *se_rule;
74};
75
76struct audit_tree; 68struct audit_tree;
77struct audit_chunk; 69struct audit_chunk;
78 70
79struct audit_krule {
80 int vers_ops;
81 u32 flags;
82 u32 listnr;
83 u32 action;
84 u32 mask[AUDIT_BITMASK_SIZE];
85 u32 buflen; /* for data alloc on list rules */
86 u32 field_count;
87 char *filterkey; /* ties events to rules */
88 struct audit_field *fields;
89 struct audit_field *arch_f; /* quick access to arch field */
90 struct audit_field *inode_f; /* quick access to an inode field */
91 struct audit_watch *watch; /* associated watch */
92 struct audit_tree *tree; /* associated watched tree */
93 struct list_head rlist; /* entry in audit_{watch,tree}.rules list */
94};
95
96struct audit_entry { 71struct audit_entry {
97 struct list_head list; 72 struct list_head list;
98 struct rcu_head rcu; 73 struct rcu_head rcu;
99 struct audit_krule rule; 74 struct audit_krule rule;
100}; 75};
101 76
77#ifdef CONFIG_AUDIT
78extern int audit_enabled;
79extern int audit_ever_enabled;
80#endif
81
102extern int audit_pid; 82extern int audit_pid;
103 83
104#define AUDIT_INODE_BUCKETS 32 84#define AUDIT_INODE_BUCKETS 32
@@ -129,6 +109,9 @@ struct audit_netlink_list {
129int audit_send_list(void *); 109int audit_send_list(void *);
130 110
131struct inotify_watch; 111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
132extern void audit_free_parent(struct inotify_watch *); 115extern void audit_free_parent(struct inotify_watch *);
133extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, 116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
134 const char *, struct inode *); 117 const char *, struct inode *);
@@ -136,6 +119,7 @@ extern int selinux_audit_rule_update(void);
136 119
137extern struct mutex audit_filter_mutex; 120extern struct mutex audit_filter_mutex;
138extern void audit_free_rule_rcu(struct rcu_head *); 121extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[];
139 123
140#ifdef CONFIG_AUDIT_TREE 124#ifdef CONFIG_AUDIT_TREE
141extern struct audit_chunk *audit_tree_lookup(const struct inode *); 125extern struct audit_chunk *audit_tree_lookup(const struct inode *);
@@ -162,6 +146,10 @@ extern void audit_put_tree(struct audit_tree *);
162 146
163extern char *audit_unpack_string(void **, size_t *, size_t); 147extern char *audit_unpack_string(void **, size_t *, size_t);
164 148
149extern pid_t audit_sig_pid;
150extern uid_t audit_sig_uid;
151extern u32 audit_sig_sid;
152
165#ifdef CONFIG_AUDITSYSCALL 153#ifdef CONFIG_AUDITSYSCALL
166extern int __audit_signal_info(int sig, struct task_struct *t); 154extern int __audit_signal_info(int sig, struct task_struct *t);
167static inline int audit_signal_info(int sig, struct task_struct *t) 155static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 2f2914b7cc30..0e0bd27e6512 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -28,7 +28,7 @@
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/selinux.h> 31#include <linux/security.h>
32#include "audit.h" 32#include "audit.h"
33 33
34/* 34/*
@@ -38,7 +38,7 @@
38 * Synchronizes writes and blocking reads of audit's filterlist 38 * Synchronizes writes and blocking reads of audit's filterlist
39 * data. Rcu is used to traverse the filterlist and access 39 * data. Rcu is used to traverse the filterlist and access
40 * contents of structs audit_entry, audit_watch and opaque 40 * contents of structs audit_entry, audit_watch and opaque
41 * selinux rules during filtering. If modified, these structures 41 * LSM rules during filtering. If modified, these structures
42 * must be copied and replace their counterparts in the filterlist. 42 * must be copied and replace their counterparts in the filterlist.
43 * An audit_parent struct is not accessed during filtering, so may 43 * An audit_parent struct is not accessed during filtering, so may
44 * be written directly provided audit_filter_mutex is held. 44 * be written directly provided audit_filter_mutex is held.
@@ -89,14 +89,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
89 89
90DEFINE_MUTEX(audit_filter_mutex); 90DEFINE_MUTEX(audit_filter_mutex);
91 91
92/* Inotify handle */
93extern struct inotify_handle *audit_ih;
94
95/* Inotify events we care about. */ 92/* Inotify events we care about. */
96#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 93#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
97 94
98extern int audit_enabled;
99
100void audit_free_parent(struct inotify_watch *i_watch) 95void audit_free_parent(struct inotify_watch *i_watch)
101{ 96{
102 struct audit_parent *parent; 97 struct audit_parent *parent;
@@ -139,8 +134,8 @@ static inline void audit_free_rule(struct audit_entry *e)
139 if (e->rule.fields) 134 if (e->rule.fields)
140 for (i = 0; i < e->rule.field_count; i++) { 135 for (i = 0; i < e->rule.field_count; i++) {
141 struct audit_field *f = &e->rule.fields[i]; 136 struct audit_field *f = &e->rule.fields[i];
142 kfree(f->se_str); 137 kfree(f->lsm_str);
143 selinux_audit_rule_free(f->se_rule); 138 security_audit_rule_free(f->lsm_rule);
144 } 139 }
145 kfree(e->rule.fields); 140 kfree(e->rule.fields);
146 kfree(e->rule.filterkey); 141 kfree(e->rule.filterkey);
@@ -272,7 +267,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
272 return -EINVAL; 267 return -EINVAL;
273 268
274 watch = audit_init_watch(path); 269 watch = audit_init_watch(path);
275 if (unlikely(IS_ERR(watch))) 270 if (IS_ERR(watch))
276 return PTR_ERR(watch); 271 return PTR_ERR(watch);
277 272
278 audit_get_watch(watch); 273 audit_get_watch(watch);
@@ -422,7 +417,7 @@ exit_err:
422static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 417static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
423{ 418{
424 struct audit_entry *entry; 419 struct audit_entry *entry;
425 struct audit_field *f; 420 struct audit_field *ino_f;
426 int err = 0; 421 int err = 0;
427 int i; 422 int i;
428 423
@@ -483,6 +478,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
483 if (f->val & ~15) 478 if (f->val & ~15)
484 goto exit_free; 479 goto exit_free;
485 break; 480 break;
481 case AUDIT_FILETYPE:
482 if ((f->val & ~S_IFMT) > S_IFMT)
483 goto exit_free;
484 break;
486 case AUDIT_INODE: 485 case AUDIT_INODE:
487 err = audit_to_inode(&entry->rule, f); 486 err = audit_to_inode(&entry->rule, f);
488 if (err) 487 if (err)
@@ -504,9 +503,9 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
504 } 503 }
505 } 504 }
506 505
507 f = entry->rule.inode_f; 506 ino_f = entry->rule.inode_f;
508 if (f) { 507 if (ino_f) {
509 switch(f->op) { 508 switch(ino_f->op) {
510 case AUDIT_NOT_EQUAL: 509 case AUDIT_NOT_EQUAL:
511 entry->rule.inode_f = NULL; 510 entry->rule.inode_f = NULL;
512 case AUDIT_EQUAL: 511 case AUDIT_EQUAL:
@@ -531,7 +530,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
531{ 530{
532 int err = 0; 531 int err = 0;
533 struct audit_entry *entry; 532 struct audit_entry *entry;
534 struct audit_field *f; 533 struct audit_field *ino_f;
535 void *bufp; 534 void *bufp;
536 size_t remain = datasz - sizeof(struct audit_rule_data); 535 size_t remain = datasz - sizeof(struct audit_rule_data);
537 int i; 536 int i;
@@ -554,8 +553,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
554 f->op = data->fieldflags[i] & AUDIT_OPERATORS; 553 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
555 f->type = data->fields[i]; 554 f->type = data->fields[i];
556 f->val = data->values[i]; 555 f->val = data->values[i];
557 f->se_str = NULL; 556 f->lsm_str = NULL;
558 f->se_rule = NULL; 557 f->lsm_rule = NULL;
559 switch(f->type) { 558 switch(f->type) {
560 case AUDIT_PID: 559 case AUDIT_PID:
561 case AUDIT_UID: 560 case AUDIT_UID:
@@ -597,12 +596,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
597 goto exit_free; 596 goto exit_free;
598 entry->rule.buflen += f->val; 597 entry->rule.buflen += f->val;
599 598
600 err = selinux_audit_rule_init(f->type, f->op, str, 599 err = security_audit_rule_init(f->type, f->op, str,
601 &f->se_rule); 600 (void **)&f->lsm_rule);
602 /* Keep currently invalid fields around in case they 601 /* Keep currently invalid fields around in case they
603 * become valid after a policy reload. */ 602 * become valid after a policy reload. */
604 if (err == -EINVAL) { 603 if (err == -EINVAL) {
605 printk(KERN_WARNING "audit rule for selinux " 604 printk(KERN_WARNING "audit rule for LSM "
606 "\'%s\' is invalid\n", str); 605 "\'%s\' is invalid\n", str);
607 err = 0; 606 err = 0;
608 } 607 }
@@ -610,7 +609,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
610 kfree(str); 609 kfree(str);
611 goto exit_free; 610 goto exit_free;
612 } else 611 } else
613 f->se_str = str; 612 f->lsm_str = str;
614 break; 613 break;
615 case AUDIT_WATCH: 614 case AUDIT_WATCH:
616 str = audit_unpack_string(&bufp, &remain, f->val); 615 str = audit_unpack_string(&bufp, &remain, f->val);
@@ -654,14 +653,18 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
654 if (f->val & ~15) 653 if (f->val & ~15)
655 goto exit_free; 654 goto exit_free;
656 break; 655 break;
656 case AUDIT_FILETYPE:
657 if ((f->val & ~S_IFMT) > S_IFMT)
658 goto exit_free;
659 break;
657 default: 660 default:
658 goto exit_free; 661 goto exit_free;
659 } 662 }
660 } 663 }
661 664
662 f = entry->rule.inode_f; 665 ino_f = entry->rule.inode_f;
663 if (f) { 666 if (ino_f) {
664 switch(f->op) { 667 switch(ino_f->op) {
665 case AUDIT_NOT_EQUAL: 668 case AUDIT_NOT_EQUAL:
666 entry->rule.inode_f = NULL; 669 entry->rule.inode_f = NULL;
667 case AUDIT_EQUAL: 670 case AUDIT_EQUAL:
@@ -754,7 +757,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
754 case AUDIT_OBJ_LEV_LOW: 757 case AUDIT_OBJ_LEV_LOW:
755 case AUDIT_OBJ_LEV_HIGH: 758 case AUDIT_OBJ_LEV_HIGH:
756 data->buflen += data->values[i] = 759 data->buflen += data->values[i] =
757 audit_pack_string(&bufp, f->se_str); 760 audit_pack_string(&bufp, f->lsm_str);
758 break; 761 break;
759 case AUDIT_WATCH: 762 case AUDIT_WATCH:
760 data->buflen += data->values[i] = 763 data->buflen += data->values[i] =
@@ -806,7 +809,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
806 case AUDIT_OBJ_TYPE: 809 case AUDIT_OBJ_TYPE:
807 case AUDIT_OBJ_LEV_LOW: 810 case AUDIT_OBJ_LEV_LOW:
808 case AUDIT_OBJ_LEV_HIGH: 811 case AUDIT_OBJ_LEV_HIGH:
809 if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) 812 if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str))
810 return 1; 813 return 1;
811 break; 814 break;
812 case AUDIT_WATCH: 815 case AUDIT_WATCH:
@@ -848,7 +851,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
848 return ERR_PTR(-ENOMEM); 851 return ERR_PTR(-ENOMEM);
849 852
850 new = audit_init_watch(path); 853 new = audit_init_watch(path);
851 if (unlikely(IS_ERR(new))) { 854 if (IS_ERR(new)) {
852 kfree(path); 855 kfree(path);
853 goto out; 856 goto out;
854 } 857 }
@@ -862,28 +865,28 @@ out:
862 return new; 865 return new;
863} 866}
864 867
865/* Duplicate selinux field information. The se_rule is opaque, so must be 868/* Duplicate LSM field information. The lsm_rule is opaque, so must be
866 * re-initialized. */ 869 * re-initialized. */
867static inline int audit_dupe_selinux_field(struct audit_field *df, 870static inline int audit_dupe_lsm_field(struct audit_field *df,
868 struct audit_field *sf) 871 struct audit_field *sf)
869{ 872{
870 int ret = 0; 873 int ret = 0;
871 char *se_str; 874 char *lsm_str;
872 875
873 /* our own copy of se_str */ 876 /* our own copy of lsm_str */
874 se_str = kstrdup(sf->se_str, GFP_KERNEL); 877 lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL);
875 if (unlikely(!se_str)) 878 if (unlikely(!lsm_str))
876 return -ENOMEM; 879 return -ENOMEM;
877 df->se_str = se_str; 880 df->lsm_str = lsm_str;
878 881
879 /* our own (refreshed) copy of se_rule */ 882 /* our own (refreshed) copy of lsm_rule */
880 ret = selinux_audit_rule_init(df->type, df->op, df->se_str, 883 ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
881 &df->se_rule); 884 (void **)&df->lsm_rule);
882 /* Keep currently invalid fields around in case they 885 /* Keep currently invalid fields around in case they
883 * become valid after a policy reload. */ 886 * become valid after a policy reload. */
884 if (ret == -EINVAL) { 887 if (ret == -EINVAL) {
885 printk(KERN_WARNING "audit rule for selinux \'%s\' is " 888 printk(KERN_WARNING "audit rule for LSM \'%s\' is "
886 "invalid\n", df->se_str); 889 "invalid\n", df->lsm_str);
887 ret = 0; 890 ret = 0;
888 } 891 }
889 892
@@ -891,7 +894,7 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
891} 894}
892 895
893/* Duplicate an audit rule. This will be a deep copy with the exception 896/* Duplicate an audit rule. This will be a deep copy with the exception
894 * of the watch - that pointer is carried over. The selinux specific fields 897 * of the watch - that pointer is carried over. The LSM specific fields
895 * will be updated in the copy. The point is to be able to replace the old 898 * will be updated in the copy. The point is to be able to replace the old
896 * rule with the new rule in the filterlist, then free the old rule. 899 * rule with the new rule in the filterlist, then free the old rule.
897 * The rlist element is undefined; list manipulations are handled apart from 900 * The rlist element is undefined; list manipulations are handled apart from
@@ -930,7 +933,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
930 new->tree = old->tree; 933 new->tree = old->tree;
931 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); 934 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
932 935
933 /* deep copy this information, updating the se_rule fields, because 936 /* deep copy this information, updating the lsm_rule fields, because
934 * the originals will all be freed when the old rule is freed. */ 937 * the originals will all be freed when the old rule is freed. */
935 for (i = 0; i < fcount; i++) { 938 for (i = 0; i < fcount; i++) {
936 switch (new->fields[i].type) { 939 switch (new->fields[i].type) {
@@ -944,7 +947,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
944 case AUDIT_OBJ_TYPE: 947 case AUDIT_OBJ_TYPE:
945 case AUDIT_OBJ_LEV_LOW: 948 case AUDIT_OBJ_LEV_LOW:
946 case AUDIT_OBJ_LEV_HIGH: 949 case AUDIT_OBJ_LEV_HIGH:
947 err = audit_dupe_selinux_field(&new->fields[i], 950 err = audit_dupe_lsm_field(&new->fields[i],
948 &old->fields[i]); 951 &old->fields[i]);
949 break; 952 break;
950 case AUDIT_FILTERKEY: 953 case AUDIT_FILTERKEY:
@@ -989,7 +992,7 @@ static void audit_update_watch(struct audit_parent *parent,
989 audit_set_auditable(current->audit_context); 992 audit_set_auditable(current->audit_context);
990 993
991 nwatch = audit_dupe_watch(owatch); 994 nwatch = audit_dupe_watch(owatch);
992 if (unlikely(IS_ERR(nwatch))) { 995 if (IS_ERR(nwatch)) {
993 mutex_unlock(&audit_filter_mutex); 996 mutex_unlock(&audit_filter_mutex);
994 audit_panic("error updating watch, skipping"); 997 audit_panic("error updating watch, skipping");
995 return; 998 return;
@@ -1004,7 +1007,7 @@ static void audit_update_watch(struct audit_parent *parent,
1004 list_del_rcu(&oentry->list); 1007 list_del_rcu(&oentry->list);
1005 1008
1006 nentry = audit_dupe_rule(&oentry->rule, nwatch); 1009 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1007 if (unlikely(IS_ERR(nentry))) 1010 if (IS_ERR(nentry))
1008 audit_panic("error updating watch, removing"); 1011 audit_panic("error updating watch, removing");
1009 else { 1012 else {
1010 int h = audit_hash_ino((u32)ino); 1013 int h = audit_hash_ino((u32)ino);
@@ -1500,8 +1503,9 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1500} 1503}
1501 1504
1502/* Log rule additions and removals */ 1505/* Log rule additions and removals */
1503static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, 1506static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1504 struct audit_krule *rule, int res) 1507 char *action, struct audit_krule *rule,
1508 int res)
1505{ 1509{
1506 struct audit_buffer *ab; 1510 struct audit_buffer *ab;
1507 1511
@@ -1511,15 +1515,16 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1511 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1515 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1512 if (!ab) 1516 if (!ab)
1513 return; 1517 return;
1514 audit_log_format(ab, "auid=%u", loginuid); 1518 audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
1515 if (sid) { 1519 if (sid) {
1516 char *ctx = NULL; 1520 char *ctx = NULL;
1517 u32 len; 1521 u32 len;
1518 if (selinux_sid_to_string(sid, &ctx, &len)) 1522 if (security_secid_to_secctx(sid, &ctx, &len))
1519 audit_log_format(ab, " ssid=%u", sid); 1523 audit_log_format(ab, " ssid=%u", sid);
1520 else 1524 else {
1521 audit_log_format(ab, " subj=%s", ctx); 1525 audit_log_format(ab, " subj=%s", ctx);
1522 kfree(ctx); 1526 security_release_secctx(ctx, len);
1527 }
1523 } 1528 }
1524 audit_log_format(ab, " op=%s rule key=", action); 1529 audit_log_format(ab, " op=%s rule key=", action);
1525 if (rule->filterkey) 1530 if (rule->filterkey)
@@ -1542,7 +1547,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1542 * @sid: SE Linux Security ID of sender 1547 * @sid: SE Linux Security ID of sender
1543 */ 1548 */
1544int audit_receive_filter(int type, int pid, int uid, int seq, void *data, 1549int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1545 size_t datasz, uid_t loginuid, u32 sid) 1550 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
1546{ 1551{
1547 struct task_struct *tsk; 1552 struct task_struct *tsk;
1548 struct audit_netlink_list *dest; 1553 struct audit_netlink_list *dest;
@@ -1589,7 +1594,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1589 1594
1590 err = audit_add_rule(entry, 1595 err = audit_add_rule(entry,
1591 &audit_filter_list[entry->rule.listnr]); 1596 &audit_filter_list[entry->rule.listnr]);
1592 audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); 1597 audit_log_rule_change(loginuid, sessionid, sid, "add",
1598 &entry->rule, !err);
1593 1599
1594 if (err) 1600 if (err)
1595 audit_free_rule(entry); 1601 audit_free_rule(entry);
@@ -1605,8 +1611,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1605 1611
1606 err = audit_del_rule(entry, 1612 err = audit_del_rule(entry,
1607 &audit_filter_list[entry->rule.listnr]); 1613 &audit_filter_list[entry->rule.listnr]);
1608 audit_log_rule_change(loginuid, sid, "remove", &entry->rule, 1614 audit_log_rule_change(loginuid, sessionid, sid, "remove",
1609 !err); 1615 &entry->rule, !err);
1610 1616
1611 audit_free_rule(entry); 1617 audit_free_rule(entry);
1612 break; 1618 break;
@@ -1761,38 +1767,12 @@ unlock_and_return:
1761 return result; 1767 return result;
1762} 1768}
1763 1769
1764/* Check to see if the rule contains any selinux fields. Returns 1 if there 1770/* This function will re-initialize the lsm_rule field of all applicable rules.
1765 are selinux fields specified in the rule, 0 otherwise. */ 1771 * It will traverse the filter lists serarching for rules that contain LSM
1766static inline int audit_rule_has_selinux(struct audit_krule *rule)
1767{
1768 int i;
1769
1770 for (i = 0; i < rule->field_count; i++) {
1771 struct audit_field *f = &rule->fields[i];
1772 switch (f->type) {
1773 case AUDIT_SUBJ_USER:
1774 case AUDIT_SUBJ_ROLE:
1775 case AUDIT_SUBJ_TYPE:
1776 case AUDIT_SUBJ_SEN:
1777 case AUDIT_SUBJ_CLR:
1778 case AUDIT_OBJ_USER:
1779 case AUDIT_OBJ_ROLE:
1780 case AUDIT_OBJ_TYPE:
1781 case AUDIT_OBJ_LEV_LOW:
1782 case AUDIT_OBJ_LEV_HIGH:
1783 return 1;
1784 }
1785 }
1786
1787 return 0;
1788}
1789
1790/* This function will re-initialize the se_rule field of all applicable rules.
1791 * It will traverse the filter lists serarching for rules that contain selinux
1792 * specific filter fields. When such a rule is found, it is copied, the 1772 * specific filter fields. When such a rule is found, it is copied, the
1793 * selinux field is re-initialized, and the old rule is replaced with the 1773 * LSM field is re-initialized, and the old rule is replaced with the
1794 * updated rule. */ 1774 * updated rule. */
1795int selinux_audit_rule_update(void) 1775int audit_update_lsm_rules(void)
1796{ 1776{
1797 struct audit_entry *entry, *n, *nentry; 1777 struct audit_entry *entry, *n, *nentry;
1798 struct audit_watch *watch; 1778 struct audit_watch *watch;
@@ -1804,18 +1784,18 @@ int selinux_audit_rule_update(void)
1804 1784
1805 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1785 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
1806 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1786 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
1807 if (!audit_rule_has_selinux(&entry->rule)) 1787 if (!security_audit_rule_known(&entry->rule))
1808 continue; 1788 continue;
1809 1789
1810 watch = entry->rule.watch; 1790 watch = entry->rule.watch;
1811 tree = entry->rule.tree; 1791 tree = entry->rule.tree;
1812 nentry = audit_dupe_rule(&entry->rule, watch); 1792 nentry = audit_dupe_rule(&entry->rule, watch);
1813 if (unlikely(IS_ERR(nentry))) { 1793 if (IS_ERR(nentry)) {
1814 /* save the first error encountered for the 1794 /* save the first error encountered for the
1815 * return value */ 1795 * return value */
1816 if (!err) 1796 if (!err)
1817 err = PTR_ERR(nentry); 1797 err = PTR_ERR(nentry);
1818 audit_panic("error updating selinux filters"); 1798 audit_panic("error updating LSM filters");
1819 if (watch) 1799 if (watch)
1820 list_del(&entry->rule.rlist); 1800 list_del(&entry->rule.rlist);
1821 list_del_rcu(&entry->list); 1801 list_del_rcu(&entry->list);
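The removed audit_rule_has_selinux() helper is not lost: its per-field scan now sits behind the LSM-neutral security_audit_rule_known() hook, so audit_update_lsm_rules() no longer carries SELinux-specific knowledge. A minimal sketch of what that hook is expected to check, assuming the same field types the deleted helper tested (the real implementation belongs to the active LSM, e.g. SELinux):

    /* Sketch only: mirrors the deleted audit_rule_has_selinux() scan. */
    static int example_audit_rule_known(struct audit_krule *rule)
    {
            int i;

            for (i = 0; i < rule->field_count; i++) {
                    switch (rule->fields[i].type) {
                    case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE:
                    case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN:
                    case AUDIT_SUBJ_CLR:  case AUDIT_OBJ_USER:
                    case AUDIT_OBJ_ROLE:  case AUDIT_OBJ_TYPE:
                    case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH:
                            return 1;       /* rule carries LSM fields */
                    }
            }
            return 0;
    }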
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 782262e4107d..c10e7aae04d7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -61,7 +61,6 @@
61#include <linux/security.h> 61#include <linux/security.h>
62#include <linux/list.h> 62#include <linux/list.h>
63#include <linux/tty.h> 63#include <linux/tty.h>
64#include <linux/selinux.h>
65#include <linux/binfmts.h> 64#include <linux/binfmts.h>
66#include <linux/highmem.h> 65#include <linux/highmem.h>
67#include <linux/syscalls.h> 66#include <linux/syscalls.h>
@@ -69,9 +68,6 @@
69 68
70#include "audit.h" 69#include "audit.h"
71 70
72extern struct list_head audit_filter_list[];
73extern int audit_ever_enabled;
74
75/* AUDIT_NAMES is the number of slots we reserve in the audit_context 71/* AUDIT_NAMES is the number of slots we reserve in the audit_context
76 * for saving names from getname(). */ 72 * for saving names from getname(). */
77#define AUDIT_NAMES 20 73#define AUDIT_NAMES 20
@@ -284,6 +280,19 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
284 } 280 }
285} 281}
286 282
283static int audit_match_filetype(struct audit_context *ctx, int which)
284{
285 unsigned index = which & ~S_IFMT;
286 mode_t mode = which & S_IFMT;
287 if (index >= ctx->name_count)
288 return 0;
289 if (ctx->names[index].ino == -1)
290 return 0;
291 if ((ctx->names[index].mode ^ mode) & S_IFMT)
292 return 0;
293 return 1;
294}
295
287/* 296/*
288 * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; 297 * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
289 * ->first_trees points to its beginning, ->trees - to the current end of data. 298 * ->first_trees points to its beginning, ->trees - to the current end of data.
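The new audit_match_filetype() treats the rule value as a packed pair: the non-S_IFMT bits select a slot in ctx->names[] and the S_IFMT bits carry the expected file type. A standalone illustration of that packing, with a hypothetical rule value:

    #include <stdio.h>
    #include <sys/stat.h>

    /* Illustrative only: pack a names[] slot index together with an S_IFMT
     * file type the way audit_match_filetype() unpacks its rule value. */
    int main(void)
    {
            unsigned which = 1 | S_IFREG;   /* hypothetical value: slot 1, regular file */
            unsigned index = which & ~S_IFMT;
            unsigned mode  = which & S_IFMT;

            printf("names[] slot=%u expected type=%#o\n", index, mode);
            return 0;
    }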
@@ -528,14 +537,14 @@ static int audit_filter_rules(struct task_struct *tsk,
528 match for now to avoid losing information that 537 match for now to avoid losing information that
529 may be wanted. An error message will also be 538 may be wanted. An error message will also be
530 logged upon error */ 539 logged upon error */
531 if (f->se_rule) { 540 if (f->lsm_rule) {
532 if (need_sid) { 541 if (need_sid) {
533 selinux_get_task_sid(tsk, &sid); 542 security_task_getsecid(tsk, &sid);
534 need_sid = 0; 543 need_sid = 0;
535 } 544 }
536 result = selinux_audit_rule_match(sid, f->type, 545 result = security_audit_rule_match(sid, f->type,
537 f->op, 546 f->op,
538 f->se_rule, 547 f->lsm_rule,
539 ctx); 548 ctx);
540 } 549 }
541 break; 550 break;
@@ -546,18 +555,18 @@ static int audit_filter_rules(struct task_struct *tsk,
546 case AUDIT_OBJ_LEV_HIGH: 555 case AUDIT_OBJ_LEV_HIGH:
547 /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR 556 /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
548 also applies here */ 557 also applies here */
549 if (f->se_rule) { 558 if (f->lsm_rule) {
550 /* Find files that match */ 559 /* Find files that match */
551 if (name) { 560 if (name) {
552 result = selinux_audit_rule_match( 561 result = security_audit_rule_match(
553 name->osid, f->type, f->op, 562 name->osid, f->type, f->op,
554 f->se_rule, ctx); 563 f->lsm_rule, ctx);
555 } else if (ctx) { 564 } else if (ctx) {
556 for (j = 0; j < ctx->name_count; j++) { 565 for (j = 0; j < ctx->name_count; j++) {
557 if (selinux_audit_rule_match( 566 if (security_audit_rule_match(
558 ctx->names[j].osid, 567 ctx->names[j].osid,
559 f->type, f->op, 568 f->type, f->op,
560 f->se_rule, ctx)) { 569 f->lsm_rule, ctx)) {
561 ++result; 570 ++result;
562 break; 571 break;
563 } 572 }
@@ -570,7 +579,7 @@ static int audit_filter_rules(struct task_struct *tsk,
570 aux = aux->next) { 579 aux = aux->next) {
571 if (aux->type == AUDIT_IPC) { 580 if (aux->type == AUDIT_IPC) {
572 struct audit_aux_data_ipcctl *axi = (void *)aux; 581 struct audit_aux_data_ipcctl *axi = (void *)aux;
573 if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { 582 if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
574 ++result; 583 ++result;
575 break; 584 break;
576 } 585 }
@@ -593,6 +602,9 @@ static int audit_filter_rules(struct task_struct *tsk,
593 case AUDIT_PERM: 602 case AUDIT_PERM:
594 result = audit_match_perm(ctx, f->val); 603 result = audit_match_perm(ctx, f->val);
595 break; 604 break;
605 case AUDIT_FILETYPE:
606 result = audit_match_filetype(ctx, f->val);
607 break;
596 } 608 }
597 609
598 if (!result) 610 if (!result)
@@ -885,11 +897,11 @@ void audit_log_task_context(struct audit_buffer *ab)
885 int error; 897 int error;
886 u32 sid; 898 u32 sid;
887 899
888 selinux_get_task_sid(current, &sid); 900 security_task_getsecid(current, &sid);
889 if (!sid) 901 if (!sid)
890 return; 902 return;
891 903
892 error = selinux_sid_to_string(sid, &ctx, &len); 904 error = security_secid_to_secctx(sid, &ctx, &len);
893 if (error) { 905 if (error) {
894 if (error != -EINVAL) 906 if (error != -EINVAL)
895 goto error_path; 907 goto error_path;
@@ -897,7 +909,7 @@ void audit_log_task_context(struct audit_buffer *ab)
897 } 909 }
898 910
899 audit_log_format(ab, " subj=%s", ctx); 911 audit_log_format(ab, " subj=%s", ctx);
900 kfree(ctx); 912 security_release_secctx(ctx, len);
901 return; 913 return;
902 914
903error_path: 915error_path:
@@ -941,7 +953,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
941 u32 sid, char *comm) 953 u32 sid, char *comm)
942{ 954{
943 struct audit_buffer *ab; 955 struct audit_buffer *ab;
944 char *s = NULL; 956 char *ctx = NULL;
945 u32 len; 957 u32 len;
946 int rc = 0; 958 int rc = 0;
947 959
@@ -951,15 +963,16 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
951 963
952 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, 964 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
953 uid, sessionid); 965 uid, sessionid);
954 if (selinux_sid_to_string(sid, &s, &len)) { 966 if (security_secid_to_secctx(sid, &ctx, &len)) {
955 audit_log_format(ab, " obj=(none)"); 967 audit_log_format(ab, " obj=(none)");
956 rc = 1; 968 rc = 1;
957 } else 969 } else {
958 audit_log_format(ab, " obj=%s", s); 970 audit_log_format(ab, " obj=%s", ctx);
971 security_release_secctx(ctx, len);
972 }
959 audit_log_format(ab, " ocomm="); 973 audit_log_format(ab, " ocomm=");
960 audit_log_untrustedstring(ab, comm); 974 audit_log_untrustedstring(ab, comm);
961 audit_log_end(ab); 975 audit_log_end(ab);
962 kfree(s);
963 976
964 return rc; 977 return rc;
965} 978}
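This hunk is one instance of a pattern repeated throughout the file: a label obtained from security_secid_to_secctx() must be returned with security_release_secctx() rather than kfree(), because the owning LSM controls how the buffer was allocated. A minimal sketch of the pattern, assuming kernel context with the audit and security headers available:

    /* Sketch of the conversion pattern used throughout this series. */
    static void log_task_label(struct audit_buffer *ab, struct task_struct *tsk)
    {
            char *ctx = NULL;
            u32 sid, len;

            security_task_getsecid(tsk, &sid);
            if (!sid)
                    return;
            if (security_secid_to_secctx(sid, &ctx, &len))
                    return;                         /* no label available */
            audit_log_format(ab, " subj=%s", ctx);
            security_release_secctx(ctx, len);      /* not kfree(): the LSM owns the buffer */
    }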
@@ -1095,7 +1108,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1095 audit_log_format(*ab, "[%d]", i); 1108 audit_log_format(*ab, "[%d]", i);
1096 audit_log_format(*ab, "="); 1109 audit_log_format(*ab, "=");
1097 if (has_cntl) 1110 if (has_cntl)
1098 audit_log_hex(*ab, buf, to_send); 1111 audit_log_n_hex(*ab, buf, to_send);
1099 else 1112 else
1100 audit_log_format(*ab, "\"%s\"", buf); 1113 audit_log_format(*ab, "\"%s\"", buf);
1101 audit_log_format(*ab, "\n"); 1114 audit_log_format(*ab, "\n");
@@ -1271,14 +1284,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1271 if (axi->osid != 0) { 1284 if (axi->osid != 0) {
1272 char *ctx = NULL; 1285 char *ctx = NULL;
1273 u32 len; 1286 u32 len;
1274 if (selinux_sid_to_string( 1287 if (security_secid_to_secctx(
1275 axi->osid, &ctx, &len)) { 1288 axi->osid, &ctx, &len)) {
1276 audit_log_format(ab, " osid=%u", 1289 audit_log_format(ab, " osid=%u",
1277 axi->osid); 1290 axi->osid);
1278 call_panic = 1; 1291 call_panic = 1;
1279 } else 1292 } else {
1280 audit_log_format(ab, " obj=%s", ctx); 1293 audit_log_format(ab, " obj=%s", ctx);
1281 kfree(ctx); 1294 security_release_secctx(ctx, len);
1295 }
1282 } 1296 }
1283 break; } 1297 break; }
1284 1298
@@ -1295,7 +1309,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1295 break; } 1309 break; }
1296 1310
1297 case AUDIT_SOCKETCALL: { 1311 case AUDIT_SOCKETCALL: {
1298 int i;
1299 struct audit_aux_data_socketcall *axs = (void *)aux; 1312 struct audit_aux_data_socketcall *axs = (void *)aux;
1300 audit_log_format(ab, "nargs=%d", axs->nargs); 1313 audit_log_format(ab, "nargs=%d", axs->nargs);
1301 for (i=0; i<axs->nargs; i++) 1314 for (i=0; i<axs->nargs; i++)
@@ -1306,7 +1319,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1306 struct audit_aux_data_sockaddr *axs = (void *)aux; 1319 struct audit_aux_data_sockaddr *axs = (void *)aux;
1307 1320
1308 audit_log_format(ab, "saddr="); 1321 audit_log_format(ab, "saddr=");
1309 audit_log_hex(ab, axs->a, axs->len); 1322 audit_log_n_hex(ab, axs->a, axs->len);
1310 break; } 1323 break; }
1311 1324
1312 case AUDIT_FD_PAIR: { 1325 case AUDIT_FD_PAIR: {
@@ -1320,7 +1333,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1320 1333
1321 for (aux = context->aux_pids; aux; aux = aux->next) { 1334 for (aux = context->aux_pids; aux; aux = aux->next) {
1322 struct audit_aux_data_pids *axs = (void *)aux; 1335 struct audit_aux_data_pids *axs = (void *)aux;
1323 int i;
1324 1336
1325 for (i = 0; i < axs->pid_count; i++) 1337 for (i = 0; i < axs->pid_count; i++)
1326 if (audit_log_pid_context(context, axs->target_pid[i], 1338 if (audit_log_pid_context(context, axs->target_pid[i],
@@ -1370,8 +1382,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1370 default: 1382 default:
1371 /* log the name's directory component */ 1383 /* log the name's directory component */
1372 audit_log_format(ab, " name="); 1384 audit_log_format(ab, " name=");
1373 audit_log_n_untrustedstring(ab, n->name_len, 1385 audit_log_n_untrustedstring(ab, n->name,
1374 n->name); 1386 n->name_len);
1375 } 1387 }
1376 } else 1388 } else
1377 audit_log_format(ab, " name=(null)"); 1389 audit_log_format(ab, " name=(null)");
@@ -1392,13 +1404,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1392 if (n->osid != 0) { 1404 if (n->osid != 0) {
1393 char *ctx = NULL; 1405 char *ctx = NULL;
1394 u32 len; 1406 u32 len;
1395 if (selinux_sid_to_string( 1407 if (security_secid_to_secctx(
1396 n->osid, &ctx, &len)) { 1408 n->osid, &ctx, &len)) {
1397 audit_log_format(ab, " osid=%u", n->osid); 1409 audit_log_format(ab, " osid=%u", n->osid);
1398 call_panic = 2; 1410 call_panic = 2;
1399 } else 1411 } else {
1400 audit_log_format(ab, " obj=%s", ctx); 1412 audit_log_format(ab, " obj=%s", ctx);
1401 kfree(ctx); 1413 security_release_secctx(ctx, len);
1414 }
1402 } 1415 }
1403 1416
1404 audit_log_end(ab); 1417 audit_log_end(ab);
@@ -1594,7 +1607,7 @@ static inline void handle_one(const struct inode *inode)
1594 if (likely(put_tree_ref(context, chunk))) 1607 if (likely(put_tree_ref(context, chunk)))
1595 return; 1608 return;
1596 if (unlikely(!grow_tree_refs(context))) { 1609 if (unlikely(!grow_tree_refs(context))) {
1597 printk(KERN_WARNING "out of memory, audit has lost a tree reference"); 1610 printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
1598 audit_set_auditable(context); 1611 audit_set_auditable(context);
1599 audit_put_chunk(chunk); 1612 audit_put_chunk(chunk);
1600 unroll_tree_refs(context, p, count); 1613 unroll_tree_refs(context, p, count);
@@ -1654,7 +1667,7 @@ retry:
1654 } 1667 }
1655 /* too bad */ 1668 /* too bad */
1656 printk(KERN_WARNING 1669 printk(KERN_WARNING
1657 "out of memory, audit has lost a tree reference"); 1670 "out of memory, audit has lost a tree reference\n");
1658 unroll_tree_refs(context, p, count); 1671 unroll_tree_refs(context, p, count);
1659 audit_set_auditable(context); 1672 audit_set_auditable(context);
1660 return; 1673 return;
@@ -1750,13 +1763,13 @@ static int audit_inc_name_count(struct audit_context *context,
1750 if (context->name_count >= AUDIT_NAMES) { 1763 if (context->name_count >= AUDIT_NAMES) {
1751 if (inode) 1764 if (inode)
1752 printk(KERN_DEBUG "name_count maxed, losing inode data: " 1765 printk(KERN_DEBUG "name_count maxed, losing inode data: "
1753 "dev=%02x:%02x, inode=%lu", 1766 "dev=%02x:%02x, inode=%lu\n",
1754 MAJOR(inode->i_sb->s_dev), 1767 MAJOR(inode->i_sb->s_dev),
1755 MINOR(inode->i_sb->s_dev), 1768 MINOR(inode->i_sb->s_dev),
1756 inode->i_ino); 1769 inode->i_ino);
1757 1770
1758 else 1771 else
1759 printk(KERN_DEBUG "name_count maxed, losing inode data"); 1772 printk(KERN_DEBUG "name_count maxed, losing inode data\n");
1760 return 1; 1773 return 1;
1761 } 1774 }
1762 context->name_count++; 1775 context->name_count++;
@@ -1775,7 +1788,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode
1775 name->uid = inode->i_uid; 1788 name->uid = inode->i_uid;
1776 name->gid = inode->i_gid; 1789 name->gid = inode->i_gid;
1777 name->rdev = inode->i_rdev; 1790 name->rdev = inode->i_rdev;
1778 selinux_get_inode_sid(inode, &name->osid); 1791 security_inode_getsecid(inode, &name->osid);
1779} 1792}
1780 1793
1781/** 1794/**
@@ -2190,8 +2203,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2190 ax->uid = ipcp->uid; 2203 ax->uid = ipcp->uid;
2191 ax->gid = ipcp->gid; 2204 ax->gid = ipcp->gid;
2192 ax->mode = ipcp->mode; 2205 ax->mode = ipcp->mode;
2193 selinux_get_ipc_sid(ipcp, &ax->osid); 2206 security_ipc_getsecid(ipcp, &ax->osid);
2194
2195 ax->d.type = AUDIT_IPC; 2207 ax->d.type = AUDIT_IPC;
2196 ax->d.next = context->aux; 2208 ax->d.next = context->aux;
2197 context->aux = (void *)ax; 2209 context->aux = (void *)ax;
@@ -2343,7 +2355,7 @@ void __audit_ptrace(struct task_struct *t)
2343 context->target_auid = audit_get_loginuid(t); 2355 context->target_auid = audit_get_loginuid(t);
2344 context->target_uid = t->uid; 2356 context->target_uid = t->uid;
2345 context->target_sessionid = audit_get_sessionid(t); 2357 context->target_sessionid = audit_get_sessionid(t);
2346 selinux_get_task_sid(t, &context->target_sid); 2358 security_task_getsecid(t, &context->target_sid);
2347 memcpy(context->target_comm, t->comm, TASK_COMM_LEN); 2359 memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
2348} 2360}
2349 2361
@@ -2360,9 +2372,6 @@ int __audit_signal_info(int sig, struct task_struct *t)
2360 struct audit_aux_data_pids *axp; 2372 struct audit_aux_data_pids *axp;
2361 struct task_struct *tsk = current; 2373 struct task_struct *tsk = current;
2362 struct audit_context *ctx = tsk->audit_context; 2374 struct audit_context *ctx = tsk->audit_context;
2363 extern pid_t audit_sig_pid;
2364 extern uid_t audit_sig_uid;
2365 extern u32 audit_sig_sid;
2366 2375
2367 if (audit_pid && t->tgid == audit_pid) { 2376 if (audit_pid && t->tgid == audit_pid) {
2368 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { 2377 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
@@ -2371,7 +2380,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2371 audit_sig_uid = tsk->loginuid; 2380 audit_sig_uid = tsk->loginuid;
2372 else 2381 else
2373 audit_sig_uid = tsk->uid; 2382 audit_sig_uid = tsk->uid;
2374 selinux_get_task_sid(tsk, &audit_sig_sid); 2383 security_task_getsecid(tsk, &audit_sig_sid);
2375 } 2384 }
2376 if (!audit_signals || audit_dummy_context()) 2385 if (!audit_signals || audit_dummy_context())
2377 return 0; 2386 return 0;
@@ -2384,7 +2393,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2384 ctx->target_auid = audit_get_loginuid(t); 2393 ctx->target_auid = audit_get_loginuid(t);
2385 ctx->target_uid = t->uid; 2394 ctx->target_uid = t->uid;
2386 ctx->target_sessionid = audit_get_sessionid(t); 2395 ctx->target_sessionid = audit_get_sessionid(t);
2387 selinux_get_task_sid(t, &ctx->target_sid); 2396 security_task_getsecid(t, &ctx->target_sid);
2388 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); 2397 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
2389 return 0; 2398 return 0;
2390 } 2399 }
@@ -2405,7 +2414,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2405 axp->target_auid[axp->pid_count] = audit_get_loginuid(t); 2414 axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
2406 axp->target_uid[axp->pid_count] = t->uid; 2415 axp->target_uid[axp->pid_count] = t->uid;
2407 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); 2416 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
2408 selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); 2417 security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
2409 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); 2418 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
2410 axp->pid_count++; 2419 axp->pid_count++;
2411 2420
@@ -2435,16 +2444,17 @@ void audit_core_dumps(long signr)
2435 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2436 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2445 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2437 auid, current->uid, current->gid, sessionid); 2446 auid, current->uid, current->gid, sessionid);
2438 selinux_get_task_sid(current, &sid); 2447 security_task_getsecid(current, &sid);
2439 if (sid) { 2448 if (sid) {
2440 char *ctx = NULL; 2449 char *ctx = NULL;
2441 u32 len; 2450 u32 len;
2442 2451
2443 if (selinux_sid_to_string(sid, &ctx, &len)) 2452 if (security_secid_to_secctx(sid, &ctx, &len))
2444 audit_log_format(ab, " ssid=%u", sid); 2453 audit_log_format(ab, " ssid=%u", sid);
2445 else 2454 else {
2446 audit_log_format(ab, " subj=%s", ctx); 2455 audit_log_format(ab, " subj=%s", ctx);
2447 kfree(ctx); 2456 security_release_secctx(ctx, len);
2457 }
2448 } 2458 }
2449 audit_log_format(ab, " pid=%d comm=", current->pid); 2459 audit_log_format(ab, " pid=%d comm=", current->pid);
2450 audit_log_untrustedstring(ab, current->comm); 2460 audit_log_untrustedstring(ab, current->comm);
diff --git a/kernel/bounds.c b/kernel/bounds.c
new file mode 100644
index 000000000000..3c5301381837
--- /dev/null
+++ b/kernel/bounds.c
@@ -0,0 +1,19 @@
1/*
2 * Generate definitions needed by the preprocessor.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#define __GENERATING_BOUNDS_H
8/* Include headers that define the enum constants of interest */
9#include <linux/page-flags.h>
10#include <linux/mmzone.h>
11#include <linux/kbuild.h>
12
13void foo(void)
14{
15 /* The enum constants to put into include/linux/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
18 /* End of constants */
19}
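bounds.c is never linked into the kernel; it is compiled to assembly and post-processed into include/linux/bounds.h. The DEFINE() used above comes from linux/kbuild.h and is expected to look roughly like the sketch below (the exact macro text is an assumption here, not part of this patch):

    /* Assumed shape of DEFINE() from include/linux/kbuild.h: emit a marker
     * line into the generated assembly that the build scripts grep back out. */
    #define DEFINE(sym, val) \
            asm volatile("\n->" #sym " %0 " #val : : "i" (val))

    /* The extracted markers become lines such as
     *     #define NR_PAGEFLAGS <number of page flag bits>
     * in include/linux/bounds.h, usable from assembly and early C code. */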
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e9c2fb01e89b..fbc6fc8949b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -44,6 +44,7 @@
44#include <linux/kmod.h> 44#include <linux/kmod.h>
45#include <linux/delayacct.h> 45#include <linux/delayacct.h>
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h>
47 48
48#include <asm/atomic.h> 49#include <asm/atomic.h>
49 50
@@ -118,17 +119,7 @@ static int root_count;
118 * be called. 119 * be called.
119 */ 120 */
120static int need_forkexit_callback; 121static int need_forkexit_callback;
121 122static int need_mm_owner_callback __read_mostly;
122/* bits in struct cgroup flags field */
123enum {
124 /* Control Group is dead */
125 CGRP_REMOVED,
126 /* Control Group has previously had a child cgroup or a task,
127 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
128 CGRP_RELEASABLE,
129 /* Control Group requires release notifications to userspace */
130 CGRP_NOTIFY_ON_RELEASE,
131};
132 123
133/* convenient tests for these bits */ 124/* convenient tests for these bits */
134inline int cgroup_is_removed(const struct cgroup *cgrp) 125inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link;
204static DEFINE_RWLOCK(css_set_lock); 195static DEFINE_RWLOCK(css_set_lock);
205static int css_set_count; 196static int css_set_count;
206 197
 198/* hash table for cgroup groups. This speeds up the lookup of an
 199 * existing css_set */
200#define CSS_SET_HASH_BITS 7
201#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
202static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
203
204static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
205{
206 int i;
207 int index;
208 unsigned long tmp = 0UL;
209
210 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
211 tmp += (unsigned long)css[i];
212 tmp = (tmp >> 16) ^ tmp;
213
214 index = hash_long(tmp, CSS_SET_HASH_BITS);
215
216 return &css_set_table[index];
217}
218
207/* We don't maintain the lists running through each css_set to its 219/* We don't maintain the lists running through each css_set to its
208 * task until after the first call to cgroup_iter_start(). This 220 * task until after the first call to cgroup_iter_start(). This
209 * reduces the fork()/exit() overhead for people who have cgroups 221 * reduces the fork()/exit() overhead for people who have cgroups
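The css_set lookup now uses a 128-bucket (1 << CSS_SET_HASH_BITS) hash table keyed on the subsystem state pointers instead of a linear walk of every css_set. A standalone illustration of the folding idea, using a plain modulo where the kernel uses hash_long():

    /* Standalone illustration of the hash above: sum the subsystem state
     * pointers, fold the high bits down, then pick a bucket.  hash_long()
     * in the kernel is a multiplicative hash; the modulo here is only a
     * stand-in for the demonstration. */
    #include <stdio.h>

    #define CSS_SET_HASH_BITS 7
    #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)   /* 128 buckets */

    int main(void)
    {
            int dummy[3];                     /* stand-ins for css pointers */
            void *css[] = { &dummy[0], &dummy[1], &dummy[2] };
            unsigned long tmp = 0UL;
            int i;

            for (i = 0; i < 3; i++)
                    tmp += (unsigned long)css[i];
            tmp = (tmp >> 16) ^ tmp;

            printf("bucket %lu of %d\n", tmp % CSS_SET_TABLE_SIZE, CSS_SET_TABLE_SIZE);
            return 0;
    }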
@@ -230,7 +242,7 @@ static int use_task_css_set_links;
230static void unlink_css_set(struct css_set *cg) 242static void unlink_css_set(struct css_set *cg)
231{ 243{
232 write_lock(&css_set_lock); 244 write_lock(&css_set_lock);
233 list_del(&cg->list); 245 hlist_del(&cg->hlist);
234 css_set_count--; 246 css_set_count--;
235 while (!list_empty(&cg->cg_links)) { 247 while (!list_empty(&cg->cg_links)) {
236 struct cg_cgroup_link *link; 248 struct cg_cgroup_link *link;
@@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg)
295/* 307/*
296 * find_existing_css_set() is a helper for 308 * find_existing_css_set() is a helper for
297 * find_css_set(), and checks to see whether an existing 309 * find_css_set(), and checks to see whether an existing
298 * css_set is suitable. This currently walks a linked-list for 310 * css_set is suitable.
299 * simplicity; a later patch will use a hash table for better
300 * performance
301 * 311 *
302 * oldcg: the cgroup group that we're using before the cgroup 312 * oldcg: the cgroup group that we're using before the cgroup
303 * transition 313 * transition
@@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set(
314{ 324{
315 int i; 325 int i;
316 struct cgroupfs_root *root = cgrp->root; 326 struct cgroupfs_root *root = cgrp->root;
317 struct list_head *l = &init_css_set.list; 327 struct hlist_head *hhead;
328 struct hlist_node *node;
329 struct css_set *cg;
318 330
319 /* Built the set of subsystem state objects that we want to 331 /* Built the set of subsystem state objects that we want to
320 * see in the new css_set */ 332 * see in the new css_set */
@@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set(
331 } 343 }
332 } 344 }
333 345
334 /* Look through existing cgroup groups to find one to reuse */ 346 hhead = css_set_hash(template);
335 do { 347 hlist_for_each_entry(cg, node, hhead, hlist) {
336 struct css_set *cg =
337 list_entry(l, struct css_set, list);
338
339 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 348 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
340 /* All subsystems matched */ 349 /* All subsystems matched */
341 return cg; 350 return cg;
342 } 351 }
343 /* Try the next cgroup group */ 352 }
344 l = l->next;
345 } while (l != &init_css_set.list);
346 353
347 /* No existing cgroup group matched */ 354 /* No existing cgroup group matched */
348 return NULL; 355 return NULL;
@@ -404,6 +411,8 @@ static struct css_set *find_css_set(
404 struct list_head tmp_cg_links; 411 struct list_head tmp_cg_links;
405 struct cg_cgroup_link *link; 412 struct cg_cgroup_link *link;
406 413
414 struct hlist_head *hhead;
415
407 /* First see if we already have a cgroup group that matches 416 /* First see if we already have a cgroup group that matches
408 * the desired set */ 417 * the desired set */
409 write_lock(&css_set_lock); 418 write_lock(&css_set_lock);
@@ -428,6 +437,7 @@ static struct css_set *find_css_set(
428 kref_init(&res->ref); 437 kref_init(&res->ref);
429 INIT_LIST_HEAD(&res->cg_links); 438 INIT_LIST_HEAD(&res->cg_links);
430 INIT_LIST_HEAD(&res->tasks); 439 INIT_LIST_HEAD(&res->tasks);
440 INIT_HLIST_NODE(&res->hlist);
431 441
432 /* Copy the set of subsystem state objects generated in 442 /* Copy the set of subsystem state objects generated in
433 * find_existing_css_set() */ 443 * find_existing_css_set() */
@@ -467,9 +477,12 @@ static struct css_set *find_css_set(
467 477
468 BUG_ON(!list_empty(&tmp_cg_links)); 478 BUG_ON(!list_empty(&tmp_cg_links));
469 479
470 /* Link this cgroup group into the list */
471 list_add(&res->list, &init_css_set.list);
472 css_set_count++; 480 css_set_count++;
481
482 /* Add this cgroup group to the hash table */
483 hhead = css_set_hash(res->subsys);
484 hlist_add_head(&res->hlist, hhead);
485
473 write_unlock(&css_set_lock); 486 write_unlock(&css_set_lock);
474 487
475 return res; 488 return res;
@@ -562,7 +575,7 @@ static struct inode_operations cgroup_dir_inode_operations;
562static struct file_operations proc_cgroupstats_operations; 575static struct file_operations proc_cgroupstats_operations;
563 576
564static struct backing_dev_info cgroup_backing_dev_info = { 577static struct backing_dev_info cgroup_backing_dev_info = {
565 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 578 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
566}; 579};
567 580
568static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 581static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
@@ -782,7 +795,14 @@ static int parse_cgroupfs_options(char *data,
782 if (!*token) 795 if (!*token)
783 return -EINVAL; 796 return -EINVAL;
784 if (!strcmp(token, "all")) { 797 if (!strcmp(token, "all")) {
785 opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1; 798 /* Add all non-disabled subsystems */
799 int i;
800 opts->subsys_bits = 0;
801 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
802 struct cgroup_subsys *ss = subsys[i];
803 if (!ss->disabled)
804 opts->subsys_bits |= 1ul << i;
805 }
786 } else if (!strcmp(token, "noprefix")) { 806 } else if (!strcmp(token, "noprefix")) {
787 set_bit(ROOT_NOPREFIX, &opts->flags); 807 set_bit(ROOT_NOPREFIX, &opts->flags);
788 } else if (!strncmp(token, "release_agent=", 14)) { 808 } else if (!strncmp(token, "release_agent=", 14)) {
@@ -800,7 +820,8 @@ static int parse_cgroupfs_options(char *data,
800 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 820 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
801 ss = subsys[i]; 821 ss = subsys[i];
802 if (!strcmp(token, ss->name)) { 822 if (!strcmp(token, ss->name)) {
803 set_bit(i, &opts->subsys_bits); 823 if (!ss->disabled)
824 set_bit(i, &opts->subsys_bits);
804 break; 825 break;
805 } 826 }
806 } 827 }
@@ -940,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
940 int ret = 0; 961 int ret = 0;
941 struct super_block *sb; 962 struct super_block *sb;
942 struct cgroupfs_root *root; 963 struct cgroupfs_root *root;
943 struct list_head tmp_cg_links, *l; 964 struct list_head tmp_cg_links;
944 INIT_LIST_HEAD(&tmp_cg_links); 965 INIT_LIST_HEAD(&tmp_cg_links);
945 966
946 /* First find the desired set of subsystems */ 967 /* First find the desired set of subsystems */
@@ -982,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
982 /* New superblock */ 1003 /* New superblock */
983 struct cgroup *cgrp = &root->top_cgroup; 1004 struct cgroup *cgrp = &root->top_cgroup;
984 struct inode *inode; 1005 struct inode *inode;
1006 int i;
985 1007
986 BUG_ON(sb->s_root != NULL); 1008 BUG_ON(sb->s_root != NULL);
987 1009
@@ -1026,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1026 /* Link the top cgroup in this hierarchy into all 1048 /* Link the top cgroup in this hierarchy into all
1027 * the css_set objects */ 1049 * the css_set objects */
1028 write_lock(&css_set_lock); 1050 write_lock(&css_set_lock);
1029 l = &init_css_set.list; 1051 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
1030 do { 1052 struct hlist_head *hhead = &css_set_table[i];
1053 struct hlist_node *node;
1031 struct css_set *cg; 1054 struct css_set *cg;
1032 struct cg_cgroup_link *link; 1055
1033 cg = list_entry(l, struct css_set, list); 1056 hlist_for_each_entry(cg, node, hhead, hlist) {
1034 BUG_ON(list_empty(&tmp_cg_links)); 1057 struct cg_cgroup_link *link;
1035 link = list_entry(tmp_cg_links.next, 1058
1036 struct cg_cgroup_link, 1059 BUG_ON(list_empty(&tmp_cg_links));
1037 cgrp_link_list); 1060 link = list_entry(tmp_cg_links.next,
1038 list_del(&link->cgrp_link_list); 1061 struct cg_cgroup_link,
1039 link->cg = cg; 1062 cgrp_link_list);
1040 list_add(&link->cgrp_link_list, 1063 list_del(&link->cgrp_link_list);
1041 &root->top_cgroup.css_sets); 1064 link->cg = cg;
1042 list_add(&link->cg_link_list, &cg->cg_links); 1065 list_add(&link->cgrp_link_list,
1043 l = l->next; 1066 &root->top_cgroup.css_sets);
1044 } while (l != &init_css_set.list); 1067 list_add(&link->cg_link_list, &cg->cg_links);
1068 }
1069 }
1045 write_unlock(&css_set_lock); 1070 write_unlock(&css_set_lock);
1046 1071
1047 free_cg_links(&tmp_cg_links); 1072 free_cg_links(&tmp_cg_links);
@@ -1299,18 +1324,16 @@ enum cgroup_filetype {
1299 FILE_DIR, 1324 FILE_DIR,
1300 FILE_TASKLIST, 1325 FILE_TASKLIST,
1301 FILE_NOTIFY_ON_RELEASE, 1326 FILE_NOTIFY_ON_RELEASE,
1302 FILE_RELEASABLE,
1303 FILE_RELEASE_AGENT, 1327 FILE_RELEASE_AGENT,
1304}; 1328};
1305 1329
1306static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, 1330static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1307 struct file *file, 1331 struct file *file,
1308 const char __user *userbuf, 1332 const char __user *userbuf,
1309 size_t nbytes, loff_t *unused_ppos) 1333 size_t nbytes, loff_t *unused_ppos)
1310{ 1334{
1311 char buffer[64]; 1335 char buffer[64];
1312 int retval = 0; 1336 int retval = 0;
1313 u64 val;
1314 char *end; 1337 char *end;
1315 1338
1316 if (!nbytes) 1339 if (!nbytes)
@@ -1321,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
1321 return -EFAULT; 1344 return -EFAULT;
1322 1345
1323 buffer[nbytes] = 0; /* nul-terminate */ 1346 buffer[nbytes] = 0; /* nul-terminate */
1324 1347 strstrip(buffer);
1325 /* strip newline if necessary */ 1348 if (cft->write_u64) {
1326 if (nbytes && (buffer[nbytes-1] == '\n')) 1349 u64 val = simple_strtoull(buffer, &end, 0);
1327 buffer[nbytes-1] = 0; 1350 if (*end)
1328 val = simple_strtoull(buffer, &end, 0); 1351 return -EINVAL;
1329 if (*end) 1352 retval = cft->write_u64(cgrp, cft, val);
1330 return -EINVAL; 1353 } else {
1331 1354 s64 val = simple_strtoll(buffer, &end, 0);
1332 /* Pass to subsystem */ 1355 if (*end)
1333 retval = cft->write_uint(cgrp, cft, val); 1356 return -EINVAL;
1357 retval = cft->write_s64(cgrp, cft, val);
1358 }
1334 if (!retval) 1359 if (!retval)
1335 retval = nbytes; 1360 retval = nbytes;
1336 return retval; 1361 return retval;
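The old write_uint handler is split here into write_u64 and write_s64 (and, below, read_uint into read_u64/read_s64), with one common helper parsing the buffer and dispatching on whichever handler the cftype provides. A hypothetical subsystem file using the new fields; foo_css() and the ->limit member are illustrative, only the cftype fields come from this patch:

    static u64 foo_limit_read(struct cgroup *cgrp, struct cftype *cft)
    {
            return foo_css(cgrp)->limit;            /* hypothetical helper */
    }

    static int foo_limit_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
    {
            foo_css(cgrp)->limit = val;
            return 0;
    }

    static struct cftype foo_files[] = {
            {
                    .name = "limit",
                    .read_u64 = foo_limit_read,
                    .write_u64 = foo_limit_write,
            },
    };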
@@ -1411,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1411 return -ENODEV; 1436 return -ENODEV;
1412 if (cft->write) 1437 if (cft->write)
1413 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1438 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1414 if (cft->write_uint) 1439 if (cft->write_u64 || cft->write_s64)
1415 return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); 1440 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
1441 if (cft->trigger) {
1442 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
1443 return ret ? ret : nbytes;
1444 }
1416 return -EINVAL; 1445 return -EINVAL;
1417} 1446}
1418 1447
1419static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, 1448static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1420 struct file *file, 1449 struct file *file,
1421 char __user *buf, size_t nbytes, 1450 char __user *buf, size_t nbytes,
1422 loff_t *ppos) 1451 loff_t *ppos)
1423{ 1452{
1424 char tmp[64]; 1453 char tmp[64];
1425 u64 val = cft->read_uint(cgrp, cft); 1454 u64 val = cft->read_u64(cgrp, cft);
1426 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1455 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1427 1456
1428 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1457 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1429} 1458}
1430 1459
1460static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1461 struct file *file,
1462 char __user *buf, size_t nbytes,
1463 loff_t *ppos)
1464{
1465 char tmp[64];
1466 s64 val = cft->read_s64(cgrp, cft);
1467 int len = sprintf(tmp, "%lld\n", (long long) val);
1468
1469 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1470}
1471
1431static ssize_t cgroup_common_file_read(struct cgroup *cgrp, 1472static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1432 struct cftype *cft, 1473 struct cftype *cft,
1433 struct file *file, 1474 struct file *file,
@@ -1482,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1482 1523
1483 if (cft->read) 1524 if (cft->read)
1484 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 1525 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1485 if (cft->read_uint) 1526 if (cft->read_u64)
1486 return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); 1527 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
1528 if (cft->read_s64)
1529 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
1487 return -EINVAL; 1530 return -EINVAL;
1488} 1531}
1489 1532
1533/*
1534 * seqfile ops/methods for returning structured data. Currently just
1535 * supports string->u64 maps, but can be extended in future.
1536 */
1537
1538struct cgroup_seqfile_state {
1539 struct cftype *cft;
1540 struct cgroup *cgroup;
1541};
1542
1543static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
1544{
1545 struct seq_file *sf = cb->state;
1546 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
1547}
1548
1549static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1550{
1551 struct cgroup_seqfile_state *state = m->private;
1552 struct cftype *cft = state->cft;
1553 if (cft->read_map) {
1554 struct cgroup_map_cb cb = {
1555 .fill = cgroup_map_add,
1556 .state = m,
1557 };
1558 return cft->read_map(state->cgroup, cft, &cb);
1559 }
1560 return cft->read_seq_string(state->cgroup, cft, m);
1561}
1562
1563int cgroup_seqfile_release(struct inode *inode, struct file *file)
1564{
1565 struct seq_file *seq = file->private_data;
1566 kfree(seq->private);
1567 return single_release(inode, file);
1568}
1569
1570static struct file_operations cgroup_seqfile_operations = {
1571 .read = seq_read,
1572 .llseek = seq_lseek,
1573 .release = cgroup_seqfile_release,
1574};
1575
1490static int cgroup_file_open(struct inode *inode, struct file *file) 1576static int cgroup_file_open(struct inode *inode, struct file *file)
1491{ 1577{
1492 int err; 1578 int err;
@@ -1499,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1499 cft = __d_cft(file->f_dentry); 1585 cft = __d_cft(file->f_dentry);
1500 if (!cft) 1586 if (!cft)
1501 return -ENODEV; 1587 return -ENODEV;
1502 if (cft->open) 1588 if (cft->read_map || cft->read_seq_string) {
1589 struct cgroup_seqfile_state *state =
1590 kzalloc(sizeof(*state), GFP_USER);
1591 if (!state)
1592 return -ENOMEM;
1593 state->cft = cft;
1594 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
1595 file->f_op = &cgroup_seqfile_operations;
1596 err = single_open(file, cgroup_seqfile_show, state);
1597 if (err < 0)
1598 kfree(state);
1599 } else if (cft->open)
1503 err = cft->open(inode, file); 1600 err = cft->open(inode, file);
1504 else 1601 else
1505 err = 0; 1602 err = 0;
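Together with the open-time dispatch just above, the new seqfile path lets a control file return structured key/value data instead of a single integer: cgroup_file_open() routes any cftype with read_map or read_seq_string to cgroup_seqfile_operations, and cgroup_seqfile_show() passes it a cgroup_map_cb whose ->fill is cgroup_map_add(). A hypothetical map-style file built on this hook (the statistic names are illustrative):

    static int foo_stats_read_map(struct cgroup *cgrp, struct cftype *cft,
                                  struct cgroup_map_cb *cb)
    {
            cb->fill(cb, "total", 0);     /* a real subsystem would report counters */
            cb->fill(cb, "max", 0);
            return 0;
    }

    static struct cftype foo_stats_file = {
            .name = "stats",
            .read_map = foo_stats_read_map,
    };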
@@ -1707,14 +1804,19 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
1707 * The tasklist_lock is not held here, as do_each_thread() and 1804 * The tasklist_lock is not held here, as do_each_thread() and
1708 * while_each_thread() are protected by RCU. 1805 * while_each_thread() are protected by RCU.
1709 */ 1806 */
1710void cgroup_enable_task_cg_lists(void) 1807static void cgroup_enable_task_cg_lists(void)
1711{ 1808{
1712 struct task_struct *p, *g; 1809 struct task_struct *p, *g;
1713 write_lock(&css_set_lock); 1810 write_lock(&css_set_lock);
1714 use_task_css_set_links = 1; 1811 use_task_css_set_links = 1;
1715 do_each_thread(g, p) { 1812 do_each_thread(g, p) {
1716 task_lock(p); 1813 task_lock(p);
1717 if (list_empty(&p->cg_list)) 1814 /*
1815 * We should check if the process is exiting, otherwise
1816 * it will race with cgroup_exit() in that the list
1817 * entry won't be deleted though the process has exited.
1818 */
1819 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
1718 list_add(&p->cg_list, &p->cgroups->tasks); 1820 list_add(&p->cg_list, &p->cgroups->tasks);
1719 task_unlock(p); 1821 task_unlock(p);
1720 } while_each_thread(g, p); 1822 } while_each_thread(g, p);
@@ -1900,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
1900 2002
1901 if (heap->size) { 2003 if (heap->size) {
1902 for (i = 0; i < heap->size; i++) { 2004 for (i = 0; i < heap->size; i++) {
1903 struct task_struct *p = heap->ptrs[i]; 2005 struct task_struct *q = heap->ptrs[i];
1904 if (i == 0) { 2006 if (i == 0) {
1905 latest_time = p->start_time; 2007 latest_time = q->start_time;
1906 latest_task = p; 2008 latest_task = q;
1907 } 2009 }
1908 /* Process the task per the caller's callback */ 2010 /* Process the task per the caller's callback */
1909 scan->process_task(p, scan); 2011 scan->process_task(q, scan);
1910 put_task_struct(p); 2012 put_task_struct(q);
1911 } 2013 }
1912 /* 2014 /*
1913 * If we had to process any tasks at all, scan again 2015 * If we had to process any tasks at all, scan again
@@ -2082,7 +2184,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2082 2184
2083 kfree(pidarray); 2185 kfree(pidarray);
2084 } else { 2186 } else {
2085 ctr->buf = 0; 2187 ctr->buf = NULL;
2086 ctr->bufsz = 0; 2188 ctr->bufsz = 0;
2087 } 2189 }
2088 file->private_data = ctr; 2190 file->private_data = ctr;
@@ -2125,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2125 return notify_on_release(cgrp); 2227 return notify_on_release(cgrp);
2126} 2228}
2127 2229
2128static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
2129{
2130 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
2131}
2132
2133/* 2230/*
2134 * for the common functions, 'private' gives the type of file 2231 * for the common functions, 'private' gives the type of file
2135 */ 2232 */
@@ -2145,16 +2242,10 @@ static struct cftype files[] = {
2145 2242
2146 { 2243 {
2147 .name = "notify_on_release", 2244 .name = "notify_on_release",
2148 .read_uint = cgroup_read_notify_on_release, 2245 .read_u64 = cgroup_read_notify_on_release,
2149 .write = cgroup_common_file_write, 2246 .write = cgroup_common_file_write,
2150 .private = FILE_NOTIFY_ON_RELEASE, 2247 .private = FILE_NOTIFY_ON_RELEASE,
2151 }, 2248 },
2152
2153 {
2154 .name = "releasable",
2155 .read_uint = cgroup_read_releasable,
2156 .private = FILE_RELEASABLE,
2157 }
2158}; 2249};
2159 2250
2160static struct cftype cft_release_agent = { 2251static struct cftype cft_release_agent = {
@@ -2388,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2388 return 0; 2479 return 0;
2389} 2480}
2390 2481
2391static void cgroup_init_subsys(struct cgroup_subsys *ss) 2482static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2392{ 2483{
2393 struct cgroup_subsys_state *css; 2484 struct cgroup_subsys_state *css;
2394 struct list_head *l;
2395 2485
2396 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2486 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2397 2487
@@ -2402,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
2402 BUG_ON(IS_ERR(css)); 2492 BUG_ON(IS_ERR(css));
2403 init_cgroup_css(css, ss, dummytop); 2493 init_cgroup_css(css, ss, dummytop);
2404 2494
2405 /* Update all cgroup groups to contain a subsys 2495 /* Update the init_css_set to contain a subsys
2406 * pointer to this state - since the subsystem is 2496 * pointer to this state - since the subsystem is
2407 * newly registered, all tasks and hence all cgroup 2497 * newly registered, all tasks and hence the
2408 * groups are in the subsystem's top cgroup. */ 2498 * init_css_set is in the subsystem's top cgroup. */
2409 write_lock(&css_set_lock); 2499 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2410 l = &init_css_set.list;
2411 do {
2412 struct css_set *cg =
2413 list_entry(l, struct css_set, list);
2414 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2415 l = l->next;
2416 } while (l != &init_css_set.list);
2417 write_unlock(&css_set_lock);
2418
2419 /* If this subsystem requested that it be notified with fork
2420 * events, we should send it one now for every process in the
2421 * system */
2422 if (ss->fork) {
2423 struct task_struct *g, *p;
2424
2425 read_lock(&tasklist_lock);
2426 do_each_thread(g, p) {
2427 ss->fork(ss, p);
2428 } while_each_thread(g, p);
2429 read_unlock(&tasklist_lock);
2430 }
2431 2500
2432 need_forkexit_callback |= ss->fork || ss->exit; 2501 need_forkexit_callback |= ss->fork || ss->exit;
2502 need_mm_owner_callback |= !!ss->mm_owner_changed;
2503
2504 /* At system boot, before all subsystems have been
2505 * registered, no tasks have been forked, so we don't
2506 * need to invoke fork callbacks here. */
2507 BUG_ON(!list_empty(&init_task.tasks));
2433 2508
2434 ss->active = 1; 2509 ss->active = 1;
2435} 2510}
@@ -2445,9 +2520,9 @@ int __init cgroup_init_early(void)
2445 int i; 2520 int i;
2446 kref_init(&init_css_set.ref); 2521 kref_init(&init_css_set.ref);
2447 kref_get(&init_css_set.ref); 2522 kref_get(&init_css_set.ref);
2448 INIT_LIST_HEAD(&init_css_set.list);
2449 INIT_LIST_HEAD(&init_css_set.cg_links); 2523 INIT_LIST_HEAD(&init_css_set.cg_links);
2450 INIT_LIST_HEAD(&init_css_set.tasks); 2524 INIT_LIST_HEAD(&init_css_set.tasks);
2525 INIT_HLIST_NODE(&init_css_set.hlist);
2451 css_set_count = 1; 2526 css_set_count = 1;
2452 init_cgroup_root(&rootnode); 2527 init_cgroup_root(&rootnode);
2453 list_add(&rootnode.root_list, &roots); 2528 list_add(&rootnode.root_list, &roots);
@@ -2460,6 +2535,9 @@ int __init cgroup_init_early(void)
2460 list_add(&init_css_set_link.cg_link_list, 2535 list_add(&init_css_set_link.cg_link_list,
2461 &init_css_set.cg_links); 2536 &init_css_set.cg_links);
2462 2537
2538 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
2539 INIT_HLIST_HEAD(&css_set_table[i]);
2540
2463 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2541 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2464 struct cgroup_subsys *ss = subsys[i]; 2542 struct cgroup_subsys *ss = subsys[i];
2465 2543
@@ -2489,7 +2567,7 @@ int __init cgroup_init(void)
2489{ 2567{
2490 int err; 2568 int err;
2491 int i; 2569 int i;
2492 struct proc_dir_entry *entry; 2570 struct hlist_head *hhead;
2493 2571
2494 err = bdi_init(&cgroup_backing_dev_info); 2572 err = bdi_init(&cgroup_backing_dev_info);
2495 if (err) 2573 if (err)
@@ -2501,13 +2579,15 @@ int __init cgroup_init(void)
2501 cgroup_init_subsys(ss); 2579 cgroup_init_subsys(ss);
2502 } 2580 }
2503 2581
2582 /* Add init_css_set to the hash table */
2583 hhead = css_set_hash(init_css_set.subsys);
2584 hlist_add_head(&init_css_set.hlist, hhead);
2585
2504 err = register_filesystem(&cgroup_fs_type); 2586 err = register_filesystem(&cgroup_fs_type);
2505 if (err < 0) 2587 if (err < 0)
2506 goto out; 2588 goto out;
2507 2589
2508 entry = create_proc_entry("cgroups", 0, NULL); 2590 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
2509 if (entry)
2510 entry->proc_fops = &proc_cgroupstats_operations;
2511 2591
2512out: 2592out:
2513 if (err) 2593 if (err)
@@ -2561,6 +2641,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2561 /* Skip this hierarchy if it has no active subsystems */ 2641 /* Skip this hierarchy if it has no active subsystems */
2562 if (!root->actual_subsys_bits) 2642 if (!root->actual_subsys_bits)
2563 continue; 2643 continue;
2644 seq_printf(m, "%lu:", root->subsys_bits);
2564 for_each_subsys(root, ss) 2645 for_each_subsys(root, ss)
2565 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2646 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
2566 seq_putc(m, ':'); 2647 seq_putc(m, ':');
@@ -2600,13 +2681,13 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
2600{ 2681{
2601 int i; 2682 int i;
2602 2683
2603 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n"); 2684 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
2604 mutex_lock(&cgroup_mutex); 2685 mutex_lock(&cgroup_mutex);
2605 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2686 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2606 struct cgroup_subsys *ss = subsys[i]; 2687 struct cgroup_subsys *ss = subsys[i];
2607 seq_printf(m, "%s\t%lu\t%d\n", 2688 seq_printf(m, "%s\t%lu\t%d\t%d\n",
2608 ss->name, ss->root->subsys_bits, 2689 ss->name, ss->root->subsys_bits,
2609 ss->root->number_of_cgroups); 2690 ss->root->number_of_cgroups, !ss->disabled);
2610 } 2691 }
2611 mutex_unlock(&cgroup_mutex); 2692 mutex_unlock(&cgroup_mutex);
2612 return 0; 2693 return 0;
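With the extra ss->disabled column, /proc/cgroups output would look roughly like the lines below (subsystem names and counts purely illustrative):

    #subsys_name    hierarchy   num_cgroups   enabled
    debug           0           1             0
    cpuset          1           4             1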
@@ -2614,7 +2695,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
2614 2695
2615static int cgroupstats_open(struct inode *inode, struct file *file) 2696static int cgroupstats_open(struct inode *inode, struct file *file)
2616{ 2697{
2617 return single_open(file, proc_cgroupstats_show, 0); 2698 return single_open(file, proc_cgroupstats_show, NULL);
2618} 2699}
2619 2700
2620static struct file_operations proc_cgroupstats_operations = { 2701static struct file_operations proc_cgroupstats_operations = {
@@ -2669,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
2669 } 2750 }
2670} 2751}
2671 2752
2753#ifdef CONFIG_MM_OWNER
2754/**
2755 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2756 * @p: the new owner
2757 *
2758 * Called on every change to mm->owner. mm_init_owner() does not
2759 * invoke this routine, since it assigns the mm->owner the first time
2760 * and does not change it.
2761 */
2762void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2763{
2764 struct cgroup *oldcgrp, *newcgrp;
2765
2766 if (need_mm_owner_callback) {
2767 int i;
2768 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2769 struct cgroup_subsys *ss = subsys[i];
2770 oldcgrp = task_cgroup(old, ss->subsys_id);
2771 newcgrp = task_cgroup(new, ss->subsys_id);
2772 if (oldcgrp == newcgrp)
2773 continue;
2774 if (ss->mm_owner_changed)
2775 ss->mm_owner_changed(ss, oldcgrp, newcgrp);
2776 }
2777 }
2778}
2779#endif /* CONFIG_MM_OWNER */
2780
2672/** 2781/**
2673 * cgroup_post_fork - called on a new task after adding it to the task list 2782 * cgroup_post_fork - called on a new task after adding it to the task list
2674 * @child: the task in question 2783 * @child: the task in question
@@ -3010,3 +3119,27 @@ static void cgroup_release_agent(struct work_struct *work)
3010 spin_unlock(&release_list_lock); 3119 spin_unlock(&release_list_lock);
3011 mutex_unlock(&cgroup_mutex); 3120 mutex_unlock(&cgroup_mutex);
3012} 3121}
3122
3123static int __init cgroup_disable(char *str)
3124{
3125 int i;
3126 char *token;
3127
3128 while ((token = strsep(&str, ",")) != NULL) {
3129 if (!*token)
3130 continue;
3131
3132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3133 struct cgroup_subsys *ss = subsys[i];
3134
3135 if (!strcmp(token, ss->name)) {
3136 ss->disabled = 1;
3137 printk(KERN_INFO "Disabling %s control group"
3138 " subsystem\n", ss->name);
3139 break;
3140 }
3141 }
3142 }
3143 return 1;
3144}
3145__setup("cgroup_disable=", cgroup_disable);
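Usage sketch for the new boot parameter (subsystem names illustrative): passing cgroup_disable=debug on the kernel command line marks that subsystem disabled before any hierarchy is mounted, and a comma-separated list such as cgroup_disable=debug,cpuset disables several at once; disabled subsystems are then skipped by the "all" mount option and refused by name in parse_cgroupfs_options().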
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index 37301e877cb0..c3dc3aba4c02 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/ccontainer_debug.c - Example cgroup subsystem that 2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info 3 * exposes debug info
4 * 4 *
5 * Copyright (C) Google Inc, 2007 5 * Copyright (C) Google Inc, 2007
@@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
62 return count; 62 return count;
63} 63}
64 64
65static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
66{
67 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
68}
69
65static struct cftype files[] = { 70static struct cftype files[] = {
66 { 71 {
67 .name = "cgroup_refcount", 72 .name = "cgroup_refcount",
68 .read_uint = cgroup_refcount_read, 73 .read_u64 = cgroup_refcount_read,
69 }, 74 },
70 { 75 {
71 .name = "taskcount", 76 .name = "taskcount",
72 .read_uint = taskcount_read, 77 .read_u64 = taskcount_read,
73 }, 78 },
74 79
75 { 80 {
76 .name = "current_css_set", 81 .name = "current_css_set",
77 .read_uint = current_css_set_read, 82 .read_u64 = current_css_set_read,
78 }, 83 },
79 84
80 { 85 {
81 .name = "current_css_set_refcount", 86 .name = "current_css_set_refcount",
82 .read_uint = current_css_set_refcount_read, 87 .read_u64 = current_css_set_refcount_read,
83 }, 88 },
89
90 {
91 .name = "releasable",
92 .read_u64 = releasable_read,
93 }
84}; 94};
85 95
86static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/compat.c b/kernel/compat.c
index 5f0e201bcfd3..32c254a8ab9a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -47,15 +47,14 @@ static long compat_nanosleep_restart(struct restart_block *restart)
47 mm_segment_t oldfs; 47 mm_segment_t oldfs;
48 long ret; 48 long ret;
49 49
50 rmtp = (struct compat_timespec __user *)(restart->arg1); 50 restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
51 restart->arg1 = (unsigned long)&rmt;
52 oldfs = get_fs(); 51 oldfs = get_fs();
53 set_fs(KERNEL_DS); 52 set_fs(KERNEL_DS);
54 ret = hrtimer_nanosleep_restart(restart); 53 ret = hrtimer_nanosleep_restart(restart);
55 set_fs(oldfs); 54 set_fs(oldfs);
56 55
57 if (ret) { 56 if (ret) {
58 restart->arg1 = (unsigned long)rmtp; 57 rmtp = restart->nanosleep.compat_rmtp;
59 58
60 if (rmtp && put_compat_timespec(&rmt, rmtp)) 59 if (rmtp && put_compat_timespec(&rmt, rmtp))
61 return -EFAULT; 60 return -EFAULT;
@@ -89,7 +88,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
89 = &current_thread_info()->restart_block; 88 = &current_thread_info()->restart_block;
90 89
91 restart->fn = compat_nanosleep_restart; 90 restart->fn = compat_nanosleep_restart;
92 restart->arg1 = (unsigned long)rmtp; 91 restart->nanosleep.compat_rmtp = rmtp;
93 92
94 if (rmtp && put_compat_timespec(&rmt, rmtp)) 93 if (rmtp && put_compat_timespec(&rmt, rmtp))
95 return -EFAULT; 94 return -EFAULT;
@@ -446,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
446 if (retval) 445 if (retval)
447 return retval; 446 return retval;
448 447
449 return sched_setaffinity(pid, new_mask); 448 return sched_setaffinity(pid, &new_mask);
450} 449}
451 450
452asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, 451asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
@@ -607,9 +606,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
607 long err; 606 long err;
608 mm_segment_t oldfs; 607 mm_segment_t oldfs;
609 struct timespec tu; 608 struct timespec tu;
610 struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); 609 struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
611 610
612 restart->arg1 = (unsigned long) &tu; 611 restart->nanosleep.rmtp = (struct timespec __user *) &tu;
613 oldfs = get_fs(); 612 oldfs = get_fs();
614 set_fs(KERNEL_DS); 613 set_fs(KERNEL_DS);
615 err = clock_nanosleep_restart(restart); 614 err = clock_nanosleep_restart(restart);
@@ -621,7 +620,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
621 620
622 if (err == -ERESTART_RESTARTBLOCK) { 621 if (err == -ERESTART_RESTARTBLOCK) {
623 restart->fn = compat_clock_nanosleep_restart; 622 restart->fn = compat_clock_nanosleep_restart;
624 restart->arg1 = (unsigned long) rmtp; 623 restart->nanosleep.compat_rmtp = rmtp;
625 } 624 }
626 return err; 625 return err;
627} 626}
@@ -652,7 +651,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
652 if (err == -ERESTART_RESTARTBLOCK) { 651 if (err == -ERESTART_RESTARTBLOCK) {
653 restart = &current_thread_info()->restart_block; 652 restart = &current_thread_info()->restart_block;
654 restart->fn = compat_clock_nanosleep_restart; 653 restart->fn = compat_clock_nanosleep_restart;
655 restart->arg1 = (unsigned long) rmtp; 654 restart->nanosleep.compat_rmtp = rmtp;
656 } 655 }
657 return err; 656 return err;
658} 657}
@@ -899,7 +898,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
899 898
900 current->state = TASK_INTERRUPTIBLE; 899 current->state = TASK_INTERRUPTIBLE;
901 schedule(); 900 schedule();
902 set_thread_flag(TIF_RESTORE_SIGMASK); 901 set_restore_sigmask();
903 return -ERESTARTNOHAND; 902 return -ERESTARTNOHAND;
904} 903}
905#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 904#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
@@ -956,7 +955,8 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
956 __put_user(txc.jitcnt, &utp->jitcnt) || 955 __put_user(txc.jitcnt, &utp->jitcnt) ||
957 __put_user(txc.calcnt, &utp->calcnt) || 956 __put_user(txc.calcnt, &utp->calcnt) ||
958 __put_user(txc.errcnt, &utp->errcnt) || 957 __put_user(txc.errcnt, &utp->errcnt) ||
959 __put_user(txc.stbcnt, &utp->stbcnt)) 958 __put_user(txc.stbcnt, &utp->stbcnt) ||
959 __put_user(txc.tai, &utp->tai))
960 ret = -EFAULT; 960 ret = -EFAULT;
961 961
962 return ret; 962 return ret;
@@ -1081,4 +1081,3 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1081 1081
1082 return 0; 1082 return 0;
1083} 1083}
1084
diff --git a/kernel/configs.c b/kernel/configs.c
index e84d3f9c6c7b..4c345210ed8c 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,12 +79,11 @@ static int __init ikconfig_init(void)
79 struct proc_dir_entry *entry; 79 struct proc_dir_entry *entry;
80 80
81 /* create the current config file */ 81 /* create the current config file */
82 entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, 82 entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
83 &proc_root); 83 &ikconfig_file_ops);
84 if (!entry) 84 if (!entry)
85 return -ENOMEM; 85 return -ENOMEM;
86 86
87 entry->proc_fops = &ikconfig_file_ops;
88 entry->size = kernel_config_data_size; 87 entry->size = kernel_config_data_size;
89 88
90 return 0; 89 return 0;
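The ikconfig change is one instance of the generic conversion in this series: the two-step create_proc_entry() plus proc_fops assignment becomes a single proc_create() call, with a NULL parent standing in for the old &proc_root. A sketch of the pattern with a hypothetical "foo" entry:

    /* Hypothetical "foo" proc file showing the new single-call registration
     * (previously: create_proc_entry() followed by entry->proc_fops = ...). */
    static const struct file_operations foo_proc_fops;   /* .read etc. defined elsewhere */

    static int __init foo_proc_init(void)
    {
            if (!proc_create("foo", 0, NULL, &foo_proc_fops))
                    return -ENOMEM;       /* NULL parent == the old &proc_root */
            return 0;
    }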
@@ -95,7 +94,7 @@ static int __init ikconfig_init(void)
95 94
96static void __exit ikconfig_cleanup(void) 95static void __exit ikconfig_cleanup(void)
97{ 96{
98 remove_proc_entry("config.gz", &proc_root); 97 remove_proc_entry("config.gz", NULL);
99} 98}
100 99
101module_init(ikconfig_init); 100module_init(ikconfig_init);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2eff3f63abed..c77bc3a1c722 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -33,17 +33,13 @@ static struct {
33 * an ongoing cpu hotplug operation. 33 * an ongoing cpu hotplug operation.
34 */ 34 */
35 int refcount; 35 int refcount;
36 wait_queue_head_t writer_queue;
37} cpu_hotplug; 36} cpu_hotplug;
38 37
39#define writer_exists() (cpu_hotplug.active_writer != NULL)
40
41void __init cpu_hotplug_init(void) 38void __init cpu_hotplug_init(void)
42{ 39{
43 cpu_hotplug.active_writer = NULL; 40 cpu_hotplug.active_writer = NULL;
44 mutex_init(&cpu_hotplug.lock); 41 mutex_init(&cpu_hotplug.lock);
45 cpu_hotplug.refcount = 0; 42 cpu_hotplug.refcount = 0;
46 init_waitqueue_head(&cpu_hotplug.writer_queue);
47} 43}
48 44
49#ifdef CONFIG_HOTPLUG_CPU 45#ifdef CONFIG_HOTPLUG_CPU
@@ -65,11 +61,8 @@ void put_online_cpus(void)
65 if (cpu_hotplug.active_writer == current) 61 if (cpu_hotplug.active_writer == current)
66 return; 62 return;
67 mutex_lock(&cpu_hotplug.lock); 63 mutex_lock(&cpu_hotplug.lock);
68 cpu_hotplug.refcount--; 64 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
69 65 wake_up_process(cpu_hotplug.active_writer);
70 if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
71 wake_up(&cpu_hotplug.writer_queue);
72
73 mutex_unlock(&cpu_hotplug.lock); 66 mutex_unlock(&cpu_hotplug.lock);
74 67
75} 68}
@@ -98,8 +91,8 @@ void cpu_maps_update_done(void)
98 * Note that during a cpu-hotplug operation, the new readers, if any, 91 * Note that during a cpu-hotplug operation, the new readers, if any,
99 * will be blocked by the cpu_hotplug.lock 92 * will be blocked by the cpu_hotplug.lock
100 * 93 *
101 * Since cpu_maps_update_begin is always called after invoking 94 * Since cpu_hotplug_begin() is always called after invoking
102 * cpu_maps_update_begin, we can be sure that only one writer is active. 95 * cpu_maps_update_begin(), we can be sure that only one writer is active.
103 * 96 *
104 * Note that theoretically, there is a possibility of a livelock: 97 * Note that theoretically, there is a possibility of a livelock:
105 * - Refcount goes to zero, last reader wakes up the sleeping 98 * - Refcount goes to zero, last reader wakes up the sleeping
@@ -115,19 +108,16 @@ void cpu_maps_update_done(void)
115 */ 108 */
116static void cpu_hotplug_begin(void) 109static void cpu_hotplug_begin(void)
117{ 110{
118 DECLARE_WAITQUEUE(wait, current);
119
120 mutex_lock(&cpu_hotplug.lock);
121
122 cpu_hotplug.active_writer = current; 111 cpu_hotplug.active_writer = current;
123 add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); 112
124 while (cpu_hotplug.refcount) { 113 for (;;) {
125 set_current_state(TASK_UNINTERRUPTIBLE); 114 mutex_lock(&cpu_hotplug.lock);
115 if (likely(!cpu_hotplug.refcount))
116 break;
117 __set_current_state(TASK_UNINTERRUPTIBLE);
126 mutex_unlock(&cpu_hotplug.lock); 118 mutex_unlock(&cpu_hotplug.lock);
127 schedule(); 119 schedule();
128 mutex_lock(&cpu_hotplug.lock);
129 } 120 }
130 remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
131} 121}
132 122
133static void cpu_hotplug_done(void) 123static void cpu_hotplug_done(void)
@@ -136,7 +126,7 @@ static void cpu_hotplug_done(void)
136 mutex_unlock(&cpu_hotplug.lock); 126 mutex_unlock(&cpu_hotplug.lock);
137} 127}
138/* Need to know about CPUs going up/down? */ 128/* Need to know about CPUs going up/down? */
139int __cpuinit register_cpu_notifier(struct notifier_block *nb) 129int __ref register_cpu_notifier(struct notifier_block *nb)
140{ 130{
141 int ret; 131 int ret;
142 cpu_maps_update_begin(); 132 cpu_maps_update_begin();
@@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb)
149 139
150EXPORT_SYMBOL(register_cpu_notifier); 140EXPORT_SYMBOL(register_cpu_notifier);
151 141
152void unregister_cpu_notifier(struct notifier_block *nb) 142void __ref unregister_cpu_notifier(struct notifier_block *nb)
153{ 143{
154 cpu_maps_update_begin(); 144 cpu_maps_update_begin();
155 raw_notifier_chain_unregister(&cpu_chain, nb); 145 raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -180,7 +170,7 @@ struct take_cpu_down_param {
180}; 170};
181 171
182/* Take this CPU down. */ 172/* Take this CPU down. */
183static int take_cpu_down(void *_param) 173static int __ref take_cpu_down(void *_param)
184{ 174{
185 struct take_cpu_down_param *param = _param; 175 struct take_cpu_down_param *param = _param;
186 int err; 176 int err;
@@ -199,7 +189,7 @@ static int take_cpu_down(void *_param)
199} 189}
200 190
201/* Requires cpu_add_remove_lock to be held */ 191/* Requires cpu_add_remove_lock to be held */
202static int _cpu_down(unsigned int cpu, int tasks_frozen) 192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
203{ 193{
204 int err, nr_calls = 0; 194 int err, nr_calls = 0;
205 struct task_struct *p; 195 struct task_struct *p;
@@ -225,16 +215,16 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
225 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 215 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
226 hcpu, nr_calls, NULL); 216 hcpu, nr_calls, NULL);
227 printk("%s: attempt to take down CPU %u failed\n", 217 printk("%s: attempt to take down CPU %u failed\n",
228 __FUNCTION__, cpu); 218 __func__, cpu);
229 err = -EINVAL; 219 err = -EINVAL;
230 goto out_release; 220 goto out_release;
231 } 221 }
232 222
233 /* Ensure that we are not runnable on dying cpu */ 223 /* Ensure that we are not runnable on dying cpu */
234 old_allowed = current->cpus_allowed; 224 old_allowed = current->cpus_allowed;
235 tmp = CPU_MASK_ALL; 225 cpus_setall(tmp);
236 cpu_clear(cpu, tmp); 226 cpu_clear(cpu, tmp);
237 set_cpus_allowed(current, tmp); 227 set_cpus_allowed_ptr(current, &tmp);
238 228
239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 229 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
240 230
@@ -268,13 +258,13 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
268out_thread: 258out_thread:
269 err = kthread_stop(p); 259 err = kthread_stop(p);
270out_allowed: 260out_allowed:
271 set_cpus_allowed(current, old_allowed); 261 set_cpus_allowed_ptr(current, &old_allowed);
272out_release: 262out_release:
273 cpu_hotplug_done(); 263 cpu_hotplug_done();
274 return err; 264 return err;
275} 265}
276 266
277int cpu_down(unsigned int cpu) 267int __ref cpu_down(unsigned int cpu)
278{ 268{
279 int err = 0; 269 int err = 0;
280 270
@@ -305,7 +295,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
305 if (ret == NOTIFY_BAD) { 295 if (ret == NOTIFY_BAD) {
306 nr_calls--; 296 nr_calls--;
307 printk("%s: attempt to bring up CPU %u failed\n", 297 printk("%s: attempt to bring up CPU %u failed\n",
308 __FUNCTION__, cpu); 298 __func__, cpu);
309 ret = -EINVAL; 299 ret = -EINVAL;
310 goto out_notify; 300 goto out_notify;
311 } 301 }
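
Taken together, the cpu.c hunks replace the writer waitqueue with a leaner handshake: cpu_hotplug_begin() publishes itself as active_writer and then loops, taking the lock, checking the reader refcount and sleeping in TASK_UNINTERRUPTIBLE, while the last put_online_cpus() wakes it directly with wake_up_process(). Condensed from the hunks above (not standalone code, just the shape of the protocol):

/* writer side: cpu_hotplug_begin() */
cpu_hotplug.active_writer = current;
for (;;) {
	mutex_lock(&cpu_hotplug.lock);
	if (likely(!cpu_hotplug.refcount))
		break;		/* readers drained; lock stays held until cpu_hotplug_done() */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	mutex_unlock(&cpu_hotplug.lock);
	schedule();		/* woken by the last reader */
}

/* reader side: put_online_cpus() */
mutex_lock(&cpu_hotplug.lock);
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
	wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);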
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a1b61f414228..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -98,6 +98,9 @@ struct cpuset {
98 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
99 int pn; 99 int pn;
100 100
101 /* for custom sched domain */
102 int relax_domain_level;
103
101 /* used for walking a cpuset heirarchy */ 104 /* used for walking a cpuset heirarchy */
102 struct list_head stack_list; 105 struct list_head stack_list;
103}; 106};
@@ -124,6 +127,7 @@ struct cpuset_hotplug_scanner {
124typedef enum { 127typedef enum {
125 CS_CPU_EXCLUSIVE, 128 CS_CPU_EXCLUSIVE,
126 CS_MEM_EXCLUSIVE, 129 CS_MEM_EXCLUSIVE,
130 CS_MEM_HARDWALL,
127 CS_MEMORY_MIGRATE, 131 CS_MEMORY_MIGRATE,
128 CS_SCHED_LOAD_BALANCE, 132 CS_SCHED_LOAD_BALANCE,
129 CS_SPREAD_PAGE, 133 CS_SPREAD_PAGE,
@@ -141,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
141 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 145 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
142} 146}
143 147
148static inline int is_mem_hardwall(const struct cpuset *cs)
149{
150 return test_bit(CS_MEM_HARDWALL, &cs->flags);
151}
152
144static inline int is_sched_load_balance(const struct cpuset *cs) 153static inline int is_sched_load_balance(const struct cpuset *cs)
145{ 154{
146 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 155 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -478,6 +487,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
478 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 487 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
479} 488}
480 489
490static void
491update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
492{
493 if (!dattr)
494 return;
495 if (dattr->relax_domain_level < c->relax_domain_level)
496 dattr->relax_domain_level = c->relax_domain_level;
497 return;
498}
499
481/* 500/*
482 * rebuild_sched_domains() 501 * rebuild_sched_domains()
483 * 502 *
@@ -553,12 +572,14 @@ static void rebuild_sched_domains(void)
553 int csn; /* how many cpuset ptrs in csa so far */ 572 int csn; /* how many cpuset ptrs in csa so far */
554 int i, j, k; /* indices for partition finding loops */ 573 int i, j, k; /* indices for partition finding loops */
555 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 574 cpumask_t *doms; /* resulting partition; i.e. sched domains */
575 struct sched_domain_attr *dattr; /* attributes for custom domains */
556 int ndoms; /* number of sched domains in result */ 576 int ndoms; /* number of sched domains in result */
557 int nslot; /* next empty doms[] cpumask_t slot */ 577 int nslot; /* next empty doms[] cpumask_t slot */
558 578
559 q = NULL; 579 q = NULL;
560 csa = NULL; 580 csa = NULL;
561 doms = NULL; 581 doms = NULL;
582 dattr = NULL;
562 583
563 /* Special case for the 99% of systems with one, full, sched domain */ 584 /* Special case for the 99% of systems with one, full, sched domain */
564 if (is_sched_load_balance(&top_cpuset)) { 585 if (is_sched_load_balance(&top_cpuset)) {
@@ -566,6 +587,11 @@ static void rebuild_sched_domains(void)
566 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 587 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
567 if (!doms) 588 if (!doms)
568 goto rebuild; 589 goto rebuild;
590 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
591 if (dattr) {
592 *dattr = SD_ATTR_INIT;
593 update_domain_attr(dattr, &top_cpuset);
594 }
569 *doms = top_cpuset.cpus_allowed; 595 *doms = top_cpuset.cpus_allowed;
570 goto rebuild; 596 goto rebuild;
571 } 597 }
@@ -622,6 +648,7 @@ restart:
622 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 648 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
623 if (!doms) 649 if (!doms)
624 goto rebuild; 650 goto rebuild;
651 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
625 652
626 for (nslot = 0, i = 0; i < csn; i++) { 653 for (nslot = 0, i = 0; i < csn; i++) {
627 struct cpuset *a = csa[i]; 654 struct cpuset *a = csa[i];
@@ -644,12 +671,15 @@ restart:
644 } 671 }
645 672
646 cpus_clear(*dp); 673 cpus_clear(*dp);
674 if (dattr)
675 *(dattr + nslot) = SD_ATTR_INIT;
647 for (j = i; j < csn; j++) { 676 for (j = i; j < csn; j++) {
648 struct cpuset *b = csa[j]; 677 struct cpuset *b = csa[j];
649 678
650 if (apn == b->pn) { 679 if (apn == b->pn) {
651 cpus_or(*dp, *dp, b->cpus_allowed); 680 cpus_or(*dp, *dp, b->cpus_allowed);
652 b->pn = -1; 681 b->pn = -1;
682 update_domain_attr(dattr, b);
653 } 683 }
654 } 684 }
655 nslot++; 685 nslot++;
@@ -660,7 +690,7 @@ restart:
660rebuild: 690rebuild:
661 /* Have scheduler rebuild sched domains */ 691 /* Have scheduler rebuild sched domains */
662 get_online_cpus(); 692 get_online_cpus();
663 partition_sched_domains(ndoms, doms); 693 partition_sched_domains(ndoms, doms, dattr);
664 put_online_cpus(); 694 put_online_cpus();
665 695
666done: 696done:
@@ -668,6 +698,7 @@ done:
668 kfifo_free(q); 698 kfifo_free(q);
669 kfree(csa); 699 kfree(csa);
670 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 700 /* Don't kfree(doms) -- partition_sched_domains() does that. */
701 /* Don't kfree(dattr) -- partition_sched_domains() does that. */
671} 702}
672 703
673static inline int started_after_time(struct task_struct *t1, 704static inline int started_after_time(struct task_struct *t1,
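
The new relax_domain_level is aggregated per sched domain: update_domain_attr() above keeps the maximum level requested by any cpuset that lands in the same partition, and the resulting sched_domain_attr array rides along into partition_sched_domains(). A tiny standalone demo of that aggregation rule, with the kernel types mocked up for illustration:

#include <stdio.h>

struct fake_attr   { int relax_domain_level; };
struct fake_cpuset { int relax_domain_level; };

/* same max-aggregation as update_domain_attr() in the hunk above */
static void update_attr(struct fake_attr *dattr, const struct fake_cpuset *c)
{
	if (!dattr)
		return;
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
}

int main(void)
{
	/* -1 is the "no request" default the cpuset code starts from */
	struct fake_cpuset partition[] = { { -1 }, { 2 }, { 5 }, { 0 } };
	struct fake_attr attr = { .relax_domain_level = -1 };
	size_t i;

	for (i = 0; i < sizeof(partition) / sizeof(partition[0]); i++)
		update_attr(&attr, &partition[i]);

	printf("domain relax level: %d\n", attr.relax_domain_level);	/* prints 5 */
	return 0;
}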
@@ -710,7 +741,8 @@ static inline int started_after(void *p1, void *p2)
710 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 741 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
711 * words, if its mask is not equal to its cpuset's mask). 742 * words, if its mask is not equal to its cpuset's mask).
712 */ 743 */
713int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 744static int cpuset_test_cpumask(struct task_struct *tsk,
745 struct cgroup_scanner *scan)
714{ 746{
715 return !cpus_equal(tsk->cpus_allowed, 747 return !cpus_equal(tsk->cpus_allowed,
716 (cgroup_cs(scan->cg))->cpus_allowed); 748 (cgroup_cs(scan->cg))->cpus_allowed);
@@ -727,9 +759,10 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
727 * We don't need to re-check for the cgroup/cpuset membership, since we're 759 * We don't need to re-check for the cgroup/cpuset membership, since we're
728 * holding cgroup_lock() at this point. 760 * holding cgroup_lock() at this point.
729 */ 761 */
730void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 762static void cpuset_change_cpumask(struct task_struct *tsk,
763 struct cgroup_scanner *scan)
731{ 764{
732 set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); 765 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
733} 766}
734 767
735/** 768/**
@@ -916,7 +949,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
916 cs->mems_generation = cpuset_mems_generation++; 949 cs->mems_generation = cpuset_mems_generation++;
917 mutex_unlock(&callback_mutex); 950 mutex_unlock(&callback_mutex);
918 951
919 cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ 952 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
920 953
921 fudge = 10; /* spare mmarray[] slots */ 954 fudge = 10; /* spare mmarray[] slots */
922 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 955 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
@@ -967,7 +1000,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
967 * rebind the vma mempolicies of each mm in mmarray[] to their 1000 * rebind the vma mempolicies of each mm in mmarray[] to their
968 * new cpuset, and release that mm. The mpol_rebind_mm() 1001 * new cpuset, and release that mm. The mpol_rebind_mm()
969 * call takes mmap_sem, which we couldn't take while holding 1002 * call takes mmap_sem, which we couldn't take while holding
970 * tasklist_lock. Forks can happen again now - the mpol_copy() 1003 * tasklist_lock. Forks can happen again now - the mpol_dup()
971 * cpuset_being_rebound check will catch such forks, and rebind 1004 * cpuset_being_rebound check will catch such forks, and rebind
972 * their vma mempolicies too. Because we still hold the global 1005 * their vma mempolicies too. Because we still hold the global
973 * cgroup_mutex, we know that no other rebind effort will 1006 * cgroup_mutex, we know that no other rebind effort will
@@ -998,40 +1031,37 @@ int current_cpuset_is_being_rebound(void)
998 return task_cs(current) == cpuset_being_rebound; 1031 return task_cs(current) == cpuset_being_rebound;
999} 1032}
1000 1033
1001/* 1034static int update_relax_domain_level(struct cpuset *cs, char *buf)
1002 * Call with cgroup_mutex held.
1003 */
1004
1005static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1006{ 1035{
1007 if (simple_strtoul(buf, NULL, 10) != 0) 1036 int val = simple_strtol(buf, NULL, 10);
1008 cpuset_memory_pressure_enabled = 1; 1037
1009 else 1038 if (val < 0)
1010 cpuset_memory_pressure_enabled = 0; 1039 val = -1;
1040
1041 if (val != cs->relax_domain_level) {
1042 cs->relax_domain_level = val;
1043 rebuild_sched_domains();
1044 }
1045
1011 return 0; 1046 return 0;
1012} 1047}
1013 1048
1014/* 1049/*
1015 * update_flag - read a 0 or a 1 in a file and update associated flag 1050 * update_flag - read a 0 or a 1 in a file and update associated flag
1016 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1051 * bit: the bit to update (see cpuset_flagbits_t)
1017 * CS_SCHED_LOAD_BALANCE, 1052 * cs: the cpuset to update
1018 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, 1053 * turning_on: whether the flag is being set or cleared
1019 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
1020 * cs: the cpuset to update
1021 * buf: the buffer where we read the 0 or 1
1022 * 1054 *
1023 * Call with cgroup_mutex held. 1055 * Call with cgroup_mutex held.
1024 */ 1056 */
1025 1057
1026static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1058static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1059 int turning_on)
1027{ 1060{
1028 int turning_on;
1029 struct cpuset trialcs; 1061 struct cpuset trialcs;
1030 int err; 1062 int err;
1031 int cpus_nonempty, balance_flag_changed; 1063 int cpus_nonempty, balance_flag_changed;
1032 1064
1033 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1034
1035 trialcs = *cs; 1065 trialcs = *cs;
1036 if (turning_on) 1066 if (turning_on)
1037 set_bit(bit, &trialcs.flags); 1067 set_bit(bit, &trialcs.flags);
@@ -1178,7 +1208,7 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1178 1208
1179 mutex_lock(&callback_mutex); 1209 mutex_lock(&callback_mutex);
1180 guarantee_online_cpus(cs, &cpus); 1210 guarantee_online_cpus(cs, &cpus);
1181 set_cpus_allowed(tsk, cpus); 1211 set_cpus_allowed_ptr(tsk, &cpus);
1182 mutex_unlock(&callback_mutex); 1212 mutex_unlock(&callback_mutex);
1183 1213
1184 from = oldcs->mems_allowed; 1214 from = oldcs->mems_allowed;
@@ -1201,7 +1231,9 @@ typedef enum {
1201 FILE_MEMLIST, 1231 FILE_MEMLIST,
1202 FILE_CPU_EXCLUSIVE, 1232 FILE_CPU_EXCLUSIVE,
1203 FILE_MEM_EXCLUSIVE, 1233 FILE_MEM_EXCLUSIVE,
1234 FILE_MEM_HARDWALL,
1204 FILE_SCHED_LOAD_BALANCE, 1235 FILE_SCHED_LOAD_BALANCE,
1236 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1205 FILE_MEMORY_PRESSURE_ENABLED, 1237 FILE_MEMORY_PRESSURE_ENABLED,
1206 FILE_MEMORY_PRESSURE, 1238 FILE_MEMORY_PRESSURE,
1207 FILE_SPREAD_PAGE, 1239 FILE_SPREAD_PAGE,
@@ -1224,7 +1256,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1224 return -E2BIG; 1256 return -E2BIG;
1225 1257
1226 /* +1 for nul-terminator */ 1258 /* +1 for nul-terminator */
1227 if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) 1259 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1260 if (!buffer)
1228 return -ENOMEM; 1261 return -ENOMEM;
1229 1262
1230 if (copy_from_user(buffer, userbuf, nbytes)) { 1263 if (copy_from_user(buffer, userbuf, nbytes)) {
@@ -1247,43 +1280,71 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1247 case FILE_MEMLIST: 1280 case FILE_MEMLIST:
1248 retval = update_nodemask(cs, buffer); 1281 retval = update_nodemask(cs, buffer);
1249 break; 1282 break;
1283 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1284 retval = update_relax_domain_level(cs, buffer);
1285 break;
1286 default:
1287 retval = -EINVAL;
1288 goto out2;
1289 }
1290
1291 if (retval == 0)
1292 retval = nbytes;
1293out2:
1294 cgroup_unlock();
1295out1:
1296 kfree(buffer);
1297 return retval;
1298}
1299
1300static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1301{
1302 int retval = 0;
1303 struct cpuset *cs = cgroup_cs(cgrp);
1304 cpuset_filetype_t type = cft->private;
1305
1306 cgroup_lock();
1307
1308 if (cgroup_is_removed(cgrp)) {
1309 cgroup_unlock();
1310 return -ENODEV;
1311 }
1312
1313 switch (type) {
1250 case FILE_CPU_EXCLUSIVE: 1314 case FILE_CPU_EXCLUSIVE:
1251 retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); 1315 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1252 break; 1316 break;
1253 case FILE_MEM_EXCLUSIVE: 1317 case FILE_MEM_EXCLUSIVE:
1254 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); 1318 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1319 break;
1320 case FILE_MEM_HARDWALL:
1321 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1255 break; 1322 break;
1256 case FILE_SCHED_LOAD_BALANCE: 1323 case FILE_SCHED_LOAD_BALANCE:
1257 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); 1324 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1258 break; 1325 break;
1259 case FILE_MEMORY_MIGRATE: 1326 case FILE_MEMORY_MIGRATE:
1260 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1327 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1261 break; 1328 break;
1262 case FILE_MEMORY_PRESSURE_ENABLED: 1329 case FILE_MEMORY_PRESSURE_ENABLED:
1263 retval = update_memory_pressure_enabled(cs, buffer); 1330 cpuset_memory_pressure_enabled = !!val;
1264 break; 1331 break;
1265 case FILE_MEMORY_PRESSURE: 1332 case FILE_MEMORY_PRESSURE:
1266 retval = -EACCES; 1333 retval = -EACCES;
1267 break; 1334 break;
1268 case FILE_SPREAD_PAGE: 1335 case FILE_SPREAD_PAGE:
1269 retval = update_flag(CS_SPREAD_PAGE, cs, buffer); 1336 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1270 cs->mems_generation = cpuset_mems_generation++; 1337 cs->mems_generation = cpuset_mems_generation++;
1271 break; 1338 break;
1272 case FILE_SPREAD_SLAB: 1339 case FILE_SPREAD_SLAB:
1273 retval = update_flag(CS_SPREAD_SLAB, cs, buffer); 1340 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1274 cs->mems_generation = cpuset_mems_generation++; 1341 cs->mems_generation = cpuset_mems_generation++;
1275 break; 1342 break;
1276 default: 1343 default:
1277 retval = -EINVAL; 1344 retval = -EINVAL;
1278 goto out2; 1345 break;
1279 } 1346 }
1280
1281 if (retval == 0)
1282 retval = nbytes;
1283out2:
1284 cgroup_unlock(); 1347 cgroup_unlock();
1285out1:
1286 kfree(buffer);
1287 return retval; 1348 return retval;
1288} 1349}
1289 1350
@@ -1345,29 +1406,8 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
1345 case FILE_MEMLIST: 1406 case FILE_MEMLIST:
1346 s += cpuset_sprintf_memlist(s, cs); 1407 s += cpuset_sprintf_memlist(s, cs);
1347 break; 1408 break;
1348 case FILE_CPU_EXCLUSIVE: 1409 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1349 *s++ = is_cpu_exclusive(cs) ? '1' : '0'; 1410 s += sprintf(s, "%d", cs->relax_domain_level);
1350 break;
1351 case FILE_MEM_EXCLUSIVE:
1352 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1353 break;
1354 case FILE_SCHED_LOAD_BALANCE:
1355 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1356 break;
1357 case FILE_MEMORY_MIGRATE:
1358 *s++ = is_memory_migrate(cs) ? '1' : '0';
1359 break;
1360 case FILE_MEMORY_PRESSURE_ENABLED:
1361 *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1362 break;
1363 case FILE_MEMORY_PRESSURE:
1364 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1365 break;
1366 case FILE_SPREAD_PAGE:
1367 *s++ = is_spread_page(cs) ? '1' : '0';
1368 break;
1369 case FILE_SPREAD_SLAB:
1370 *s++ = is_spread_slab(cs) ? '1' : '0';
1371 break; 1411 break;
1372 default: 1412 default:
1373 retval = -EINVAL; 1413 retval = -EINVAL;
@@ -1381,111 +1421,137 @@ out:
1381 return retval; 1421 return retval;
1382} 1422}
1383 1423
1384 1424static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1385 1425{
1426 struct cpuset *cs = cgroup_cs(cont);
1427 cpuset_filetype_t type = cft->private;
1428 switch (type) {
1429 case FILE_CPU_EXCLUSIVE:
1430 return is_cpu_exclusive(cs);
1431 case FILE_MEM_EXCLUSIVE:
1432 return is_mem_exclusive(cs);
1433 case FILE_MEM_HARDWALL:
1434 return is_mem_hardwall(cs);
1435 case FILE_SCHED_LOAD_BALANCE:
1436 return is_sched_load_balance(cs);
1437 case FILE_MEMORY_MIGRATE:
1438 return is_memory_migrate(cs);
1439 case FILE_MEMORY_PRESSURE_ENABLED:
1440 return cpuset_memory_pressure_enabled;
1441 case FILE_MEMORY_PRESSURE:
1442 return fmeter_getrate(&cs->fmeter);
1443 case FILE_SPREAD_PAGE:
1444 return is_spread_page(cs);
1445 case FILE_SPREAD_SLAB:
1446 return is_spread_slab(cs);
1447 default:
1448 BUG();
1449 }
1450}
1386 1451
1387 1452
1388/* 1453/*
1389 * for the common functions, 'private' gives the type of file 1454 * for the common functions, 'private' gives the type of file
1390 */ 1455 */
1391 1456
1392static struct cftype cft_cpus = { 1457static struct cftype files[] = {
1393 .name = "cpus", 1458 {
1394 .read = cpuset_common_file_read, 1459 .name = "cpus",
1395 .write = cpuset_common_file_write, 1460 .read = cpuset_common_file_read,
1396 .private = FILE_CPULIST, 1461 .write = cpuset_common_file_write,
1397}; 1462 .private = FILE_CPULIST,
1398 1463 },
1399static struct cftype cft_mems = { 1464
1400 .name = "mems", 1465 {
1401 .read = cpuset_common_file_read, 1466 .name = "mems",
1402 .write = cpuset_common_file_write, 1467 .read = cpuset_common_file_read,
1403 .private = FILE_MEMLIST, 1468 .write = cpuset_common_file_write,
1404}; 1469 .private = FILE_MEMLIST,
1405 1470 },
1406static struct cftype cft_cpu_exclusive = { 1471
1407 .name = "cpu_exclusive", 1472 {
1408 .read = cpuset_common_file_read, 1473 .name = "cpu_exclusive",
1409 .write = cpuset_common_file_write, 1474 .read_u64 = cpuset_read_u64,
1410 .private = FILE_CPU_EXCLUSIVE, 1475 .write_u64 = cpuset_write_u64,
1411}; 1476 .private = FILE_CPU_EXCLUSIVE,
1412 1477 },
1413static struct cftype cft_mem_exclusive = { 1478
1414 .name = "mem_exclusive", 1479 {
1415 .read = cpuset_common_file_read, 1480 .name = "mem_exclusive",
1416 .write = cpuset_common_file_write, 1481 .read_u64 = cpuset_read_u64,
1417 .private = FILE_MEM_EXCLUSIVE, 1482 .write_u64 = cpuset_write_u64,
1418}; 1483 .private = FILE_MEM_EXCLUSIVE,
1419 1484 },
1420static struct cftype cft_sched_load_balance = { 1485
1421 .name = "sched_load_balance", 1486 {
1422 .read = cpuset_common_file_read, 1487 .name = "mem_hardwall",
1423 .write = cpuset_common_file_write, 1488 .read_u64 = cpuset_read_u64,
1424 .private = FILE_SCHED_LOAD_BALANCE, 1489 .write_u64 = cpuset_write_u64,
1425}; 1490 .private = FILE_MEM_HARDWALL,
1426 1491 },
1427static struct cftype cft_memory_migrate = { 1492
1428 .name = "memory_migrate", 1493 {
1429 .read = cpuset_common_file_read, 1494 .name = "sched_load_balance",
1430 .write = cpuset_common_file_write, 1495 .read_u64 = cpuset_read_u64,
1431 .private = FILE_MEMORY_MIGRATE, 1496 .write_u64 = cpuset_write_u64,
1497 .private = FILE_SCHED_LOAD_BALANCE,
1498 },
1499
1500 {
1501 .name = "sched_relax_domain_level",
1502 .read_u64 = cpuset_read_u64,
1503 .write_u64 = cpuset_write_u64,
1504 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1505 },
1506
1507 {
1508 .name = "memory_migrate",
1509 .read_u64 = cpuset_read_u64,
1510 .write_u64 = cpuset_write_u64,
1511 .private = FILE_MEMORY_MIGRATE,
1512 },
1513
1514 {
1515 .name = "memory_pressure",
1516 .read_u64 = cpuset_read_u64,
1517 .write_u64 = cpuset_write_u64,
1518 .private = FILE_MEMORY_PRESSURE,
1519 },
1520
1521 {
1522 .name = "memory_spread_page",
1523 .read_u64 = cpuset_read_u64,
1524 .write_u64 = cpuset_write_u64,
1525 .private = FILE_SPREAD_PAGE,
1526 },
1527
1528 {
1529 .name = "memory_spread_slab",
1530 .read_u64 = cpuset_read_u64,
1531 .write_u64 = cpuset_write_u64,
1532 .private = FILE_SPREAD_SLAB,
1533 },
1432}; 1534};
1433 1535
1434static struct cftype cft_memory_pressure_enabled = { 1536static struct cftype cft_memory_pressure_enabled = {
1435 .name = "memory_pressure_enabled", 1537 .name = "memory_pressure_enabled",
1436 .read = cpuset_common_file_read, 1538 .read_u64 = cpuset_read_u64,
1437 .write = cpuset_common_file_write, 1539 .write_u64 = cpuset_write_u64,
1438 .private = FILE_MEMORY_PRESSURE_ENABLED, 1540 .private = FILE_MEMORY_PRESSURE_ENABLED,
1439}; 1541};
1440 1542
1441static struct cftype cft_memory_pressure = {
1442 .name = "memory_pressure",
1443 .read = cpuset_common_file_read,
1444 .write = cpuset_common_file_write,
1445 .private = FILE_MEMORY_PRESSURE,
1446};
1447
1448static struct cftype cft_spread_page = {
1449 .name = "memory_spread_page",
1450 .read = cpuset_common_file_read,
1451 .write = cpuset_common_file_write,
1452 .private = FILE_SPREAD_PAGE,
1453};
1454
1455static struct cftype cft_spread_slab = {
1456 .name = "memory_spread_slab",
1457 .read = cpuset_common_file_read,
1458 .write = cpuset_common_file_write,
1459 .private = FILE_SPREAD_SLAB,
1460};
1461
1462static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) 1543static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1463{ 1544{
1464 int err; 1545 int err;
1465 1546
1466 if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0) 1547 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1467 return err; 1548 if (err)
1468 if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1469 return err;
1470 if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1471 return err;
1472 if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1473 return err;
1474 if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1475 return err;
1476 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1477 return err;
1478 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1479 return err;
1480 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1481 return err;
1482 if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1483 return err; 1549 return err;
1484 /* memory_pressure_enabled is in root cpuset only */ 1550 /* memory_pressure_enabled is in root cpuset only */
1485 if (err == 0 && !cont->parent) 1551 if (!cont->parent)
1486 err = cgroup_add_file(cont, ss, 1552 err = cgroup_add_file(cont, ss,
1487 &cft_memory_pressure_enabled); 1553 &cft_memory_pressure_enabled);
1488 return 0; 1554 return err;
1489} 1555}
1490 1556
1491/* 1557/*
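
With the boolean knobs moved to read_u64/write_u64 handlers and registered in one go via cgroup_add_files(), each control file takes a plain decimal integer. A small userspace sketch that flips one of them; the mount point (/dev/cpuset) and the unprefixed file name are assumptions about how the cpuset hierarchy is mounted on a particular system (a cgroup mount may prefix the names with "cpuset."):

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define CS_DIR "/dev/cpuset"	/* assumed mount point; adjust as needed */

static int write_flag(const char *file, int val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", CS_DIR, file);
	f = fopen(path, "w");
	if (!f) {
		fprintf(stderr, "%s: %s\n", path, strerror(errno));
		return -1;
	}
	fprintf(f, "%d\n", val);	/* parsed by the write_u64 handler */
	return fclose(f);
}

int main(void)
{
	/* example: turn off load balancing in the top cpuset */
	return write_flag("sched_load_balance", 0) ? 1 : 0;
}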
@@ -1555,10 +1621,11 @@ static struct cgroup_subsys_state *cpuset_create(
1555 if (is_spread_slab(parent)) 1621 if (is_spread_slab(parent))
1556 set_bit(CS_SPREAD_SLAB, &cs->flags); 1622 set_bit(CS_SPREAD_SLAB, &cs->flags);
1557 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1623 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1558 cs->cpus_allowed = CPU_MASK_NONE; 1624 cpus_clear(cs->cpus_allowed);
1559 cs->mems_allowed = NODE_MASK_NONE; 1625 nodes_clear(cs->mems_allowed);
1560 cs->mems_generation = cpuset_mems_generation++; 1626 cs->mems_generation = cpuset_mems_generation++;
1561 fmeter_init(&cs->fmeter); 1627 fmeter_init(&cs->fmeter);
1628 cs->relax_domain_level = -1;
1562 1629
1563 cs->parent = parent; 1630 cs->parent = parent;
1564 number_of_cpusets++; 1631 number_of_cpusets++;
@@ -1584,7 +1651,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1584 cpuset_update_task_memory_state(); 1651 cpuset_update_task_memory_state();
1585 1652
1586 if (is_sched_load_balance(cs)) 1653 if (is_sched_load_balance(cs))
1587 update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); 1654 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1588 1655
1589 number_of_cpusets--; 1656 number_of_cpusets--;
1590 kfree(cs); 1657 kfree(cs);
@@ -1625,12 +1692,13 @@ int __init cpuset_init(void)
1625{ 1692{
1626 int err = 0; 1693 int err = 0;
1627 1694
1628 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1695 cpus_setall(top_cpuset.cpus_allowed);
1629 top_cpuset.mems_allowed = NODE_MASK_ALL; 1696 nodes_setall(top_cpuset.mems_allowed);
1630 1697
1631 fmeter_init(&top_cpuset.fmeter); 1698 fmeter_init(&top_cpuset.fmeter);
1632 top_cpuset.mems_generation = cpuset_mems_generation++; 1699 top_cpuset.mems_generation = cpuset_mems_generation++;
1633 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1700 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1701 top_cpuset.relax_domain_level = -1;
1634 1702
1635 err = register_filesystem(&cpuset_fs_type); 1703 err = register_filesystem(&cpuset_fs_type);
1636 if (err < 0) 1704 if (err < 0)
@@ -1648,7 +1716,8 @@ int __init cpuset_init(void)
1648 * Called by cgroup_scan_tasks() for each task in a cgroup. 1716 * Called by cgroup_scan_tasks() for each task in a cgroup.
1649 * Return nonzero to stop the walk through the tasks. 1717 * Return nonzero to stop the walk through the tasks.
1650 */ 1718 */
1651void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) 1719static void cpuset_do_move_task(struct task_struct *tsk,
1720 struct cgroup_scanner *scan)
1652{ 1721{
1653 struct cpuset_hotplug_scanner *chsp; 1722 struct cpuset_hotplug_scanner *chsp;
1654 1723
@@ -1844,6 +1913,7 @@ void __init cpuset_init_smp(void)
1844 1913
1845 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 1914 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1846 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1915 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1916 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
1847 * 1917 *
1848 * Description: Returns the cpumask_t cpus_allowed of the cpuset 1918 * Description: Returns the cpumask_t cpus_allowed of the cpuset
1849 * attached to the specified @tsk. Guaranteed to return some non-empty 1919 * attached to the specified @tsk. Guaranteed to return some non-empty
@@ -1851,35 +1921,27 @@ void __init cpuset_init_smp(void)
1851 * tasks cpuset. 1921 * tasks cpuset.
1852 **/ 1922 **/
1853 1923
1854cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) 1924void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
1855{ 1925{
1856 cpumask_t mask;
1857
1858 mutex_lock(&callback_mutex); 1926 mutex_lock(&callback_mutex);
1859 mask = cpuset_cpus_allowed_locked(tsk); 1927 cpuset_cpus_allowed_locked(tsk, pmask);
1860 mutex_unlock(&callback_mutex); 1928 mutex_unlock(&callback_mutex);
1861
1862 return mask;
1863} 1929}
1864 1930
1865/** 1931/**
1866 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 1932 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1867 * Must be called with callback_mutex held. 1933 * Must be called with callback_mutex held.
1868 **/ 1934 **/
1869cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) 1935void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
1870{ 1936{
1871 cpumask_t mask;
1872
1873 task_lock(tsk); 1937 task_lock(tsk);
1874 guarantee_online_cpus(task_cs(tsk), &mask); 1938 guarantee_online_cpus(task_cs(tsk), pmask);
1875 task_unlock(tsk); 1939 task_unlock(tsk);
1876
1877 return mask;
1878} 1940}
1879 1941
1880void cpuset_init_current_mems_allowed(void) 1942void cpuset_init_current_mems_allowed(void)
1881{ 1943{
1882 current->mems_allowed = NODE_MASK_ALL; 1944 nodes_setall(current->mems_allowed);
1883} 1945}
1884 1946
1885/** 1947/**
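
cpuset_cpus_allowed() and its _locked variant now fill a caller-supplied cpumask_t instead of returning one by value, which keeps large NR_CPUS masks out of on-stack return values and pairs naturally with set_cpus_allowed_ptr(). A call-site sketch (kernel context assumed; the fallback choice is purely illustrative):

	cpumask_t mask;

	cpuset_cpus_allowed(tsk, &mask);	/* fill, don't return by value */
	if (set_cpus_allowed_ptr(tsk, &mask))
		set_cpus_allowed_ptr(tsk, &cpu_online_map);	/* illustrative last resort */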
@@ -1906,33 +1968,25 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
1906} 1968}
1907 1969
1908/** 1970/**
1909 * cpuset_zonelist_valid_mems_allowed - check zonelist vs. current mems_allowed 1971 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
1910 * @zl: the zonelist to be checked 1972 * @nodemask: the nodemask to be checked
1911 * 1973 *
1912 * Are any of the nodes on zonelist zl allowed in current->mems_allowed? 1974 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
1913 */ 1975 */
1914int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) 1976int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1915{ 1977{
1916 int i; 1978 return nodes_intersects(*nodemask, current->mems_allowed);
1917
1918 for (i = 0; zl->zones[i]; i++) {
1919 int nid = zone_to_nid(zl->zones[i]);
1920
1921 if (node_isset(nid, current->mems_allowed))
1922 return 1;
1923 }
1924 return 0;
1925} 1979}
1926 1980
1927/* 1981/*
1928 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 1982 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
1929 * ancestor to the specified cpuset. Call holding callback_mutex. 1983 * mem_hardwall ancestor to the specified cpuset. Call holding
1930 * If no ancestor is mem_exclusive (an unusual configuration), then 1984 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
1931 * returns the root cpuset. 1985 * (an unusual configuration), then returns the root cpuset.
1932 */ 1986 */
1933static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) 1987static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
1934{ 1988{
1935 while (!is_mem_exclusive(cs) && cs->parent) 1989 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
1936 cs = cs->parent; 1990 cs = cs->parent;
1937 return cs; 1991 return cs;
1938} 1992}
@@ -1946,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1946 * __GFP_THISNODE is set, yes, we can always allocate. If zone 2000 * __GFP_THISNODE is set, yes, we can always allocate. If zone
1947 * z's node is in our tasks mems_allowed, yes. If it's not a 2001 * z's node is in our tasks mems_allowed, yes. If it's not a
1948 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2002 * __GFP_HARDWALL request and this zone's nodes is in the nearest
1949 * mem_exclusive cpuset ancestor to this tasks cpuset, yes. 2003 * hardwalled cpuset ancestor to this tasks cpuset, yes.
1950 * If the task has been OOM killed and has access to memory reserves 2004 * If the task has been OOM killed and has access to memory reserves
1951 * as specified by the TIF_MEMDIE flag, yes. 2005 * as specified by the TIF_MEMDIE flag, yes.
1952 * Otherwise, no. 2006 * Otherwise, no.
@@ -1969,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1969 * and do not allow allocations outside the current tasks cpuset 2023 * and do not allow allocations outside the current tasks cpuset
1970 * unless the task has been OOM killed as is marked TIF_MEMDIE. 2024 * unless the task has been OOM killed as is marked TIF_MEMDIE.
1971 * GFP_KERNEL allocations are not so marked, so can escape to the 2025 * GFP_KERNEL allocations are not so marked, so can escape to the
1972 * nearest enclosing mem_exclusive ancestor cpuset. 2026 * nearest enclosing hardwalled ancestor cpuset.
1973 * 2027 *
1974 * Scanning up parent cpusets requires callback_mutex. The 2028 * Scanning up parent cpusets requires callback_mutex. The
1975 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2029 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -1992,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1992 * in_interrupt - any node ok (current task context irrelevant) 2046 * in_interrupt - any node ok (current task context irrelevant)
1993 * GFP_ATOMIC - any node ok 2047 * GFP_ATOMIC - any node ok
1994 * TIF_MEMDIE - any node ok 2048 * TIF_MEMDIE - any node ok
1995 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2049 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
1996 * GFP_USER - only nodes in current tasks mems allowed ok. 2050 * GFP_USER - only nodes in current tasks mems allowed ok.
1997 * 2051 *
1998 * Rule: 2052 * Rule:
@@ -2029,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2029 mutex_lock(&callback_mutex); 2083 mutex_lock(&callback_mutex);
2030 2084
2031 task_lock(current); 2085 task_lock(current);
2032 cs = nearest_exclusive_ancestor(task_cs(current)); 2086 cs = nearest_hardwall_ancestor(task_cs(current));
2033 task_unlock(current); 2087 task_unlock(current);
2034 2088
2035 allowed = node_isset(node, cs->mems_allowed); 2089 allowed = node_isset(node, cs->mems_allowed);
@@ -2261,8 +2315,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2261 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, 2315 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count,
2262 task->cpus_allowed); 2316 task->cpus_allowed);
2263 seq_printf(m, "\n"); 2317 seq_printf(m, "\n");
2318 seq_printf(m, "Cpus_allowed_list:\t");
2319 m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count,
2320 task->cpus_allowed);
2321 seq_printf(m, "\n");
2264 seq_printf(m, "Mems_allowed:\t"); 2322 seq_printf(m, "Mems_allowed:\t");
2265 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, 2323 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count,
2266 task->mems_allowed); 2324 task->mems_allowed);
2267 seq_printf(m, "\n"); 2325 seq_printf(m, "\n");
2326 seq_printf(m, "Mems_allowed_list:\t");
2327 m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count,
2328 task->mems_allowed);
2329 seq_printf(m, "\n");
2268} 2330}
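
The final cpuset.c hunk adds list-format twins (Cpus_allowed_list, Mems_allowed_list) next to the existing hex masks in /proc/<pid>/status. A minimal reader that prints both forms for the current process:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("/proc/self/status");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "Cpus_allowed", 12) ||
		    !strncmp(line, "Mems_allowed", 12))
			fputs(line, stdout);	/* mask and _list variants */
	}
	fclose(f);
	return 0;
}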
diff --git a/kernel/dma.c b/kernel/dma.c
index 6a82bb716dac..d2c60a822790 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = {
149 149
150static int __init proc_dma_init(void) 150static int __init proc_dma_init(void)
151{ 151{
152 struct proc_dir_entry *e; 152 proc_create("dma", 0, NULL, &proc_dma_operations);
153
154 e = create_proc_entry("dma", 0, NULL);
155 if (e)
156 e->proc_fops = &proc_dma_operations;
157
158 return 0; 153 return 0;
159} 154}
160 155
diff --git a/kernel/exit.c b/kernel/exit.c
index 53872bf993fa..1510f78a0ffa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -19,6 +19,7 @@
19#include <linux/acct.h> 19#include <linux/acct.h>
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h>
22#include <linux/binfmts.h> 23#include <linux/binfmts.h>
23#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
24#include <linux/pid_namespace.h> 25#include <linux/pid_namespace.h>
@@ -52,6 +53,11 @@
52 53
53static void exit_mm(struct task_struct * tsk); 54static void exit_mm(struct task_struct * tsk);
54 55
56static inline int task_detached(struct task_struct *p)
57{
58 return p->exit_signal == -1;
59}
60
55static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p)
56{ 62{
57 nr_threads--; 63 nr_threads--;
@@ -160,7 +166,7 @@ repeat:
160 zap_leader = 0; 166 zap_leader = 0;
161 leader = p->group_leader; 167 leader = p->group_leader;
162 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 168 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
163 BUG_ON(leader->exit_signal == -1); 169 BUG_ON(task_detached(leader));
164 do_notify_parent(leader, leader->exit_signal); 170 do_notify_parent(leader, leader->exit_signal);
165 /* 171 /*
166 * If we were the last child thread and the leader has 172 * If we were the last child thread and the leader has
@@ -170,7 +176,7 @@ repeat:
170 * do_notify_parent() will have marked it self-reaping in 176 * do_notify_parent() will have marked it self-reaping in
171 * that case. 177 * that case.
172 */ 178 */
173 zap_leader = (leader->exit_signal == -1); 179 zap_leader = task_detached(leader);
174 } 180 }
175 181
176 write_unlock_irq(&tasklist_lock); 182 write_unlock_irq(&tasklist_lock);
@@ -329,13 +335,11 @@ void __set_special_pids(struct pid *pid)
329 pid_t nr = pid_nr(pid); 335 pid_t nr = pid_nr(pid);
330 336
331 if (task_session(curr) != pid) { 337 if (task_session(curr) != pid) {
332 detach_pid(curr, PIDTYPE_SID); 338 change_pid(curr, PIDTYPE_SID, pid);
333 attach_pid(curr, PIDTYPE_SID, pid);
334 set_task_session(curr, nr); 339 set_task_session(curr, nr);
335 } 340 }
336 if (task_pgrp(curr) != pid) { 341 if (task_pgrp(curr) != pid) {
337 detach_pid(curr, PIDTYPE_PGID); 342 change_pid(curr, PIDTYPE_PGID, pid);
338 attach_pid(curr, PIDTYPE_PGID, pid);
339 set_task_pgrp(curr, nr); 343 set_task_pgrp(curr, nr);
340 } 344 }
341} 345}
@@ -507,10 +511,9 @@ void put_files_struct(struct files_struct *files)
507 } 511 }
508} 512}
509 513
510EXPORT_SYMBOL(put_files_struct); 514void reset_files_struct(struct files_struct *files)
511
512void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
513{ 515{
516 struct task_struct *tsk = current;
514 struct files_struct *old; 517 struct files_struct *old;
515 518
516 old = tsk->files; 519 old = tsk->files;
@@ -519,9 +522,8 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
519 task_unlock(tsk); 522 task_unlock(tsk);
520 put_files_struct(old); 523 put_files_struct(old);
521} 524}
522EXPORT_SYMBOL(reset_files_struct);
523 525
524static void __exit_files(struct task_struct *tsk) 526void exit_files(struct task_struct *tsk)
525{ 527{
526 struct files_struct * files = tsk->files; 528 struct files_struct * files = tsk->files;
527 529
@@ -533,12 +535,7 @@ static void __exit_files(struct task_struct *tsk)
533 } 535 }
534} 536}
535 537
536void exit_files(struct task_struct *tsk) 538void put_fs_struct(struct fs_struct *fs)
537{
538 __exit_files(tsk);
539}
540
541static void __put_fs_struct(struct fs_struct *fs)
542{ 539{
543 /* No need to hold fs->lock if we are killing it */ 540 /* No need to hold fs->lock if we are killing it */
544 if (atomic_dec_and_test(&fs->count)) { 541 if (atomic_dec_and_test(&fs->count)) {
@@ -550,12 +547,7 @@ static void __put_fs_struct(struct fs_struct *fs)
550 } 547 }
551} 548}
552 549
553void put_fs_struct(struct fs_struct *fs) 550void exit_fs(struct task_struct *tsk)
554{
555 __put_fs_struct(fs);
556}
557
558static void __exit_fs(struct task_struct *tsk)
559{ 551{
560 struct fs_struct * fs = tsk->fs; 552 struct fs_struct * fs = tsk->fs;
561 553
@@ -563,16 +555,93 @@ static void __exit_fs(struct task_struct *tsk)
563 task_lock(tsk); 555 task_lock(tsk);
564 tsk->fs = NULL; 556 tsk->fs = NULL;
565 task_unlock(tsk); 557 task_unlock(tsk);
566 __put_fs_struct(fs); 558 put_fs_struct(fs);
567 } 559 }
568} 560}
569 561
570void exit_fs(struct task_struct *tsk) 562EXPORT_SYMBOL_GPL(exit_fs);
563
564#ifdef CONFIG_MM_OWNER
565/*
566 * Task p is exiting and it owned mm, lets find a new owner for it
567 */
568static inline int
569mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
571{ 570{
572 __exit_fs(tsk); 571 /*
572 * If there are other users of the mm and the owner (us) is exiting
573 * we need to find a new owner to take on the responsibility.
574 */
575 if (!mm)
576 return 0;
577 if (atomic_read(&mm->mm_users) <= 1)
578 return 0;
579 if (mm->owner != p)
580 return 0;
581 return 1;
573} 582}
574 583
575EXPORT_SYMBOL_GPL(exit_fs); 584void mm_update_next_owner(struct mm_struct *mm)
585{
586 struct task_struct *c, *g, *p = current;
587
588retry:
589 if (!mm_need_new_owner(mm, p))
590 return;
591
592 read_lock(&tasklist_lock);
593 /*
594 * Search in the children
595 */
596 list_for_each_entry(c, &p->children, sibling) {
597 if (c->mm == mm)
598 goto assign_new_owner;
599 }
600
601 /*
602 * Search in the siblings
603 */
604 list_for_each_entry(c, &p->parent->children, sibling) {
605 if (c->mm == mm)
606 goto assign_new_owner;
607 }
608
609 /*
610 * Search through everything else. We should not get
611 * here often
612 */
613 do_each_thread(g, c) {
614 if (c->mm == mm)
615 goto assign_new_owner;
616 } while_each_thread(g, c);
617
618 read_unlock(&tasklist_lock);
619 return;
620
621assign_new_owner:
622 BUG_ON(c == p);
623 get_task_struct(c);
624 /*
625 * The task_lock protects c->mm from changing.
626 * We always want mm->owner->mm == mm
627 */
628 task_lock(c);
629 /*
630 * Delay read_unlock() till we have the task_lock()
631 * to ensure that c does not slip away underneath us
632 */
633 read_unlock(&tasklist_lock);
634 if (c->mm != mm) {
635 task_unlock(c);
636 put_task_struct(c);
637 goto retry;
638 }
639 cgroup_mm_owner_callbacks(mm->owner, c);
640 mm->owner = c;
641 task_unlock(c);
642 put_task_struct(c);
643}
644#endif /* CONFIG_MM_OWNER */
576 645
577/* 646/*
578 * Turn us into a lazy TLB process if we 647 * Turn us into a lazy TLB process if we
@@ -613,6 +682,7 @@ static void exit_mm(struct task_struct * tsk)
613 /* We don't want this task to be frozen prematurely */ 682 /* We don't want this task to be frozen prematurely */
614 clear_freeze_flag(tsk); 683 clear_freeze_flag(tsk);
615 task_unlock(tsk); 684 task_unlock(tsk);
685 mm_update_next_owner(mm);
616 mmput(mm); 686 mmput(mm);
617} 687}
618 688
@@ -627,7 +697,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
627 if (unlikely(traced)) { 697 if (unlikely(traced)) {
628 /* Preserve ptrace links if someone else is tracing this child. */ 698 /* Preserve ptrace links if someone else is tracing this child. */
629 list_del_init(&p->ptrace_list); 699 list_del_init(&p->ptrace_list);
630 if (p->parent != p->real_parent) 700 if (ptrace_reparented(p))
631 list_add(&p->ptrace_list, &p->real_parent->ptrace_children); 701 list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
632 } else { 702 } else {
633 /* If this child is being traced, then we're the one tracing it 703 /* If this child is being traced, then we're the one tracing it
@@ -651,18 +721,18 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
651 /* If this is a threaded reparent there is no need to 721 /* If this is a threaded reparent there is no need to
652 * notify anyone anything has happened. 722 * notify anyone anything has happened.
653 */ 723 */
654 if (p->real_parent->group_leader == father->group_leader) 724 if (same_thread_group(p->real_parent, father))
655 return; 725 return;
656 726
657 /* We don't want people slaying init. */ 727 /* We don't want people slaying init. */
658 if (p->exit_signal != -1) 728 if (!task_detached(p))
659 p->exit_signal = SIGCHLD; 729 p->exit_signal = SIGCHLD;
660 730
661 /* If we'd notified the old parent about this child's death, 731 /* If we'd notified the old parent about this child's death,
662 * also notify the new parent. 732 * also notify the new parent.
663 */ 733 */
664 if (!traced && p->exit_state == EXIT_ZOMBIE && 734 if (!traced && p->exit_state == EXIT_ZOMBIE &&
665 p->exit_signal != -1 && thread_group_empty(p)) 735 !task_detached(p) && thread_group_empty(p))
666 do_notify_parent(p, p->exit_signal); 736 do_notify_parent(p, p->exit_signal);
667 737
668 kill_orphaned_pgrp(p, father); 738 kill_orphaned_pgrp(p, father);
@@ -715,18 +785,18 @@ static void forget_original_parent(struct task_struct *father)
715 } else { 785 } else {
716 /* reparent ptraced task to its real parent */ 786 /* reparent ptraced task to its real parent */
717 __ptrace_unlink (p); 787 __ptrace_unlink (p);
718 if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && 788 if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
719 thread_group_empty(p)) 789 thread_group_empty(p))
720 do_notify_parent(p, p->exit_signal); 790 do_notify_parent(p, p->exit_signal);
721 } 791 }
722 792
723 /* 793 /*
724 * if the ptraced child is a zombie with exit_signal == -1 794 * if the ptraced child is a detached zombie we must collect
725 * we must collect it before we exit, or it will remain 795 * it before we exit, or it will remain zombie forever since
726 * zombie forever since we prevented it from self-reap itself 796 * we prevented it from self-reap itself while it was being
727 * while it was being traced by us, to be able to see it in wait4. 797 * traced by us, to be able to see it in wait4.
728 */ 798 */
729 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) 799 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
730 list_add(&p->ptrace_list, &ptrace_dead); 800 list_add(&p->ptrace_list, &ptrace_dead);
731 } 801 }
732 802
@@ -783,29 +853,30 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
783 * we have changed execution domain as these two values started 853 * we have changed execution domain as these two values started
784 * the same after a fork. 854 * the same after a fork.
785 */ 855 */
786 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && 856 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
787 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 857 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
788 tsk->self_exec_id != tsk->parent_exec_id) 858 tsk->self_exec_id != tsk->parent_exec_id) &&
789 && !capable(CAP_KILL)) 859 !capable(CAP_KILL))
790 tsk->exit_signal = SIGCHLD; 860 tsk->exit_signal = SIGCHLD;
791 861
792
793 /* If something other than our normal parent is ptracing us, then 862 /* If something other than our normal parent is ptracing us, then
794 * send it a SIGCHLD instead of honoring exit_signal. exit_signal 863 * send it a SIGCHLD instead of honoring exit_signal. exit_signal
795 * only has special meaning to our real parent. 864 * only has special meaning to our real parent.
796 */ 865 */
797 if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { 866 if (!task_detached(tsk) && thread_group_empty(tsk)) {
798 int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; 867 int signal = ptrace_reparented(tsk) ?
868 SIGCHLD : tsk->exit_signal;
799 do_notify_parent(tsk, signal); 869 do_notify_parent(tsk, signal);
800 } else if (tsk->ptrace) { 870 } else if (tsk->ptrace) {
801 do_notify_parent(tsk, SIGCHLD); 871 do_notify_parent(tsk, SIGCHLD);
802 } 872 }
803 873
804 state = EXIT_ZOMBIE; 874 state = EXIT_ZOMBIE;
805 if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) 875 if (task_detached(tsk) && likely(!tsk->ptrace))
806 state = EXIT_DEAD; 876 state = EXIT_DEAD;
807 tsk->exit_state = state; 877 tsk->exit_state = state;
808 878
879 /* mt-exec, de_thread() is waiting for us */
809 if (thread_group_leader(tsk) && 880 if (thread_group_leader(tsk) &&
810 tsk->signal->notify_count < 0 && 881 tsk->signal->notify_count < 0 &&
811 tsk->signal->group_exit_task) 882 tsk->signal->group_exit_task)
@@ -967,8 +1038,8 @@ NORET_TYPE void do_exit(long code)
967 if (group_dead) 1038 if (group_dead)
968 acct_process(); 1039 acct_process();
969 exit_sem(tsk); 1040 exit_sem(tsk);
970 __exit_files(tsk); 1041 exit_files(tsk);
971 __exit_fs(tsk); 1042 exit_fs(tsk);
972 check_stack_usage(); 1043 check_stack_usage();
973 exit_thread(); 1044 exit_thread();
974 cgroup_exit(tsk, 1); 1045 cgroup_exit(tsk, 1);
@@ -984,7 +1055,7 @@ NORET_TYPE void do_exit(long code)
984 proc_exit_connector(tsk); 1055 proc_exit_connector(tsk);
985 exit_notify(tsk, group_dead); 1056 exit_notify(tsk, group_dead);
986#ifdef CONFIG_NUMA 1057#ifdef CONFIG_NUMA
987 mpol_free(tsk->mempolicy); 1058 mpol_put(tsk->mempolicy);
988 tsk->mempolicy = NULL; 1059 tsk->mempolicy = NULL;
989#endif 1060#endif
990#ifdef CONFIG_FUTEX 1061#ifdef CONFIG_FUTEX
@@ -1049,12 +1120,13 @@ asmlinkage long sys_exit(int error_code)
1049NORET_TYPE void 1120NORET_TYPE void
1050do_group_exit(int exit_code) 1121do_group_exit(int exit_code)
1051{ 1122{
1123 struct signal_struct *sig = current->signal;
1124
1052 BUG_ON(exit_code & 0x80); /* core dumps don't get here */ 1125 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1053 1126
1054 if (current->signal->flags & SIGNAL_GROUP_EXIT) 1127 if (signal_group_exit(sig))
1055 exit_code = current->signal->group_exit_code; 1128 exit_code = sig->group_exit_code;
1056 else if (!thread_group_empty(current)) { 1129 else if (!thread_group_empty(current)) {
1057 struct signal_struct *const sig = current->signal;
1058 struct sighand_struct *const sighand = current->sighand; 1130 struct sighand_struct *const sighand = current->sighand;
1059 spin_lock_irq(&sighand->siglock); 1131 spin_lock_irq(&sighand->siglock);
1060 if (signal_group_exit(sig)) 1132 if (signal_group_exit(sig))
@@ -1106,7 +1178,7 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1106 * Do not consider detached threads that are 1178 * Do not consider detached threads that are
1107 * not ptraced: 1179 * not ptraced:
1108 */ 1180 */
1109 if (p->exit_signal == -1 && !p->ptrace) 1181 if (task_detached(p) && !p->ptrace)
1110 return 0; 1182 return 0;
1111 1183
1112 /* Wait for all children (clone and not) if __WALL is set; 1184 /* Wait for all children (clone and not) if __WALL is set;
@@ -1196,8 +1268,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1196 return 0; 1268 return 0;
1197 } 1269 }
1198 1270
1199 /* traced means p->ptrace, but not vice versa */ 1271 traced = ptrace_reparented(p);
1200 traced = (p->real_parent != p->parent);
1201 1272
1202 if (likely(!traced)) { 1273 if (likely(!traced)) {
1203 struct signal_struct *psig; 1274 struct signal_struct *psig;
@@ -1298,9 +1369,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1298 * If it's still not detached after that, don't release 1369 * If it's still not detached after that, don't release
1299 * it now. 1370 * it now.
1300 */ 1371 */
1301 if (p->exit_signal != -1) { 1372 if (!task_detached(p)) {
1302 do_notify_parent(p, p->exit_signal); 1373 do_notify_parent(p, p->exit_signal);
1303 if (p->exit_signal != -1) { 1374 if (!task_detached(p)) {
1304 p->exit_state = EXIT_ZOMBIE; 1375 p->exit_state = EXIT_ZOMBIE;
1305 p = NULL; 1376 p = NULL;
1306 } 1377 }
@@ -1608,7 +1679,7 @@ asmlinkage long sys_waitid(int which, pid_t upid,
1608 put_pid(pid); 1679 put_pid(pid);
1609 1680
1610 /* avoid REGPARM breakage on x86: */ 1681 /* avoid REGPARM breakage on x86: */
1611 prevent_tail_call(ret); 1682 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1612 return ret; 1683 return ret;
1613} 1684}
1614 1685
@@ -1640,7 +1711,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
1640 put_pid(pid); 1711 put_pid(pid);
1641 1712
1642 /* avoid REGPARM breakage on x86: */ 1713 /* avoid REGPARM breakage on x86: */
1643 prevent_tail_call(ret); 1714 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1644 return ret; 1715 return ret;
1645} 1716}
1646 1717
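
Much of the exit.c churn above is about when a dead child may be reaped: task_detached() replaces the open-coded exit_signal == -1 checks, and the wait paths at the end switch from prevent_tail_call() to asmlinkage_protect(). From userspace the same lifecycle is exercised through the wait family those last hunks touch; a small waitid() sketch:

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info;
	pid_t pid = fork();

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0)
		_exit(42);	/* child stays a zombie until the parent collects it */

	/* goes through sys_waitid() and reaps the zombie */
	if (waitid(P_PID, pid, &info, WEXITED) == -1) {
		perror("waitid");
		return 1;
	}
	printf("child %d exited with status %d\n", info.si_pid, info.si_status);
	return 0;
}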
diff --git a/kernel/fork.c b/kernel/fork.c
index dd249c37b3a3..933e60ebccae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -22,6 +22,7 @@
22#include <linux/mempolicy.h> 22#include <linux/mempolicy.h>
23#include <linux/sem.h> 23#include <linux/sem.h>
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/fdtable.h>
25#include <linux/key.h> 26#include <linux/key.h>
26#include <linux/binfmts.h> 27#include <linux/binfmts.h>
27#include <linux/mman.h> 28#include <linux/mman.h>
@@ -132,6 +133,14 @@ void __put_task_struct(struct task_struct *tsk)
132 free_task(tsk); 133 free_task(tsk);
133} 134}
134 135
136/*
137 * macro override instead of weak attribute alias, to workaround
138 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
139 */
140#ifndef arch_task_cache_init
141#define arch_task_cache_init()
142#endif
143
135void __init fork_init(unsigned long mempages) 144void __init fork_init(unsigned long mempages)
136{ 145{
137#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 146#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
@@ -144,6 +153,9 @@ void __init fork_init(unsigned long mempages)
144 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 153 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
145#endif 154#endif
146 155
156 /* do the arch specific task caches init */
157 arch_task_cache_init();
158
147 /* 159 /*
148 * The default maximum number of threads is set to a safe 160 * The default maximum number of threads is set to a safe
149 * value: the thread structures can take up at most half 161 * value: the thread structures can take up at most half
@@ -163,6 +175,13 @@ void __init fork_init(unsigned long mempages)
163 init_task.signal->rlim[RLIMIT_NPROC]; 175 init_task.signal->rlim[RLIMIT_NPROC];
164} 176}
165 177
178int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
179 struct task_struct *src)
180{
181 *dst = *src;
182 return 0;
183}
184
166static struct task_struct *dup_task_struct(struct task_struct *orig) 185static struct task_struct *dup_task_struct(struct task_struct *orig)
167{ 186{
168 struct task_struct *tsk; 187 struct task_struct *tsk;
@@ -181,15 +200,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
181 return NULL; 200 return NULL;
182 } 201 }
183 202
184 *tsk = *orig; 203 err = arch_dup_task_struct(tsk, orig);
204 if (err)
205 goto out;
206
185 tsk->stack = ti; 207 tsk->stack = ti;
186 208
187 err = prop_local_init_single(&tsk->dirties); 209 err = prop_local_init_single(&tsk->dirties);
188 if (err) { 210 if (err)
189 free_thread_info(ti); 211 goto out;
190 free_task_struct(tsk);
191 return NULL;
192 }
193 212
194 setup_thread_stack(tsk, orig); 213 setup_thread_stack(tsk, orig);
195 214
@@ -205,6 +224,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
205#endif 224#endif
206 tsk->splice_pipe = NULL; 225 tsk->splice_pipe = NULL;
207 return tsk; 226 return tsk;
227
228out:
229 free_thread_info(ti);
230 free_task_struct(tsk);
231 return NULL;
208} 232}
209 233
210#ifdef CONFIG_MMU 234#ifdef CONFIG_MMU
@@ -256,7 +280,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
256 if (!tmp) 280 if (!tmp)
257 goto fail_nomem; 281 goto fail_nomem;
258 *tmp = *mpnt; 282 *tmp = *mpnt;
259 pol = mpol_copy(vma_policy(mpnt)); 283 pol = mpol_dup(vma_policy(mpnt));
260 retval = PTR_ERR(pol); 284 retval = PTR_ERR(pol);
261 if (IS_ERR(pol)) 285 if (IS_ERR(pol))
262 goto fail_nomem_policy; 286 goto fail_nomem_policy;
@@ -358,14 +382,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
358 mm->ioctx_list = NULL; 382 mm->ioctx_list = NULL;
359 mm->free_area_cache = TASK_UNMAPPED_BASE; 383 mm->free_area_cache = TASK_UNMAPPED_BASE;
360 mm->cached_hole_size = ~0UL; 384 mm->cached_hole_size = ~0UL;
361 mm_init_cgroup(mm, p); 385 mm_init_owner(mm, p);
362 386
363 if (likely(!mm_alloc_pgd(mm))) { 387 if (likely(!mm_alloc_pgd(mm))) {
364 mm->def_flags = 0; 388 mm->def_flags = 0;
365 return mm; 389 return mm;
366 } 390 }
367 391
368 mm_free_cgroup(mm);
369 free_mm(mm); 392 free_mm(mm);
370 return NULL; 393 return NULL;
371} 394}
@@ -394,7 +417,6 @@ void __mmdrop(struct mm_struct *mm)
394{ 417{
395 BUG_ON(mm == &init_mm); 418 BUG_ON(mm == &init_mm);
396 mm_free_pgd(mm); 419 mm_free_pgd(mm);
397 mm_free_cgroup(mm);
398 destroy_context(mm); 420 destroy_context(mm);
399 free_mm(mm); 421 free_mm(mm);
400} 422}
@@ -410,6 +432,7 @@ void mmput(struct mm_struct *mm)
410 if (atomic_dec_and_test(&mm->mm_users)) { 432 if (atomic_dec_and_test(&mm->mm_users)) {
411 exit_aio(mm); 433 exit_aio(mm);
412 exit_mmap(mm); 434 exit_mmap(mm);
435 set_mm_exe_file(mm, NULL);
413 if (!list_empty(&mm->mmlist)) { 436 if (!list_empty(&mm->mmlist)) {
414 spin_lock(&mmlist_lock); 437 spin_lock(&mmlist_lock);
415 list_del(&mm->mmlist); 438 list_del(&mm->mmlist);
@@ -498,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
498 * Allocate a new mm structure and copy contents from the 521 * Allocate a new mm structure and copy contents from the
499 * mm structure of the passed in task structure. 522 * mm structure of the passed in task structure.
500 */ 523 */
501static struct mm_struct *dup_mm(struct task_struct *tsk) 524struct mm_struct *dup_mm(struct task_struct *tsk)
502{ 525{
503 struct mm_struct *mm, *oldmm = current->mm; 526 struct mm_struct *mm, *oldmm = current->mm;
504 int err; 527 int err;
@@ -522,6 +545,8 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
522 if (init_new_context(tsk, mm)) 545 if (init_new_context(tsk, mm))
523 goto fail_nocontext; 546 goto fail_nocontext;
524 547
548 dup_mm_exe_file(oldmm, mm);
549
525 err = dup_mmap(mm, oldmm); 550 err = dup_mmap(mm, oldmm);
526 if (err) 551 if (err)
527 goto free_pt; 552 goto free_pt;
@@ -782,12 +807,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
782 goto out; 807 goto out;
783 } 808 }
784 809
785 /*
786 * Note: we may be using current for both targets (See exec.c)
787 * This works because we cache current->files (old) as oldf. Don't
788 * break this.
789 */
790 tsk->files = NULL;
791 newf = dup_fd(oldf, &error); 810 newf = dup_fd(oldf, &error);
792 if (!newf) 811 if (!newf)
793 goto out; 812 goto out;
@@ -823,34 +842,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
823 return 0; 842 return 0;
824} 843}
825 844
826/*
827 * Helper to unshare the files of the current task.
828 * We don't want to expose copy_files internals to
829 * the exec layer of the kernel.
830 */
831
832int unshare_files(void)
833{
834 struct files_struct *files = current->files;
835 int rc;
836
837 BUG_ON(!files);
838
839 /* This can race but the race causes us to copy when we don't
840 need to and drop the copy */
841 if(atomic_read(&files->count) == 1)
842 {
843 atomic_inc(&files->count);
844 return 0;
845 }
846 rc = copy_files(0, current);
847 if(rc)
848 current->files = files;
849 return rc;
850}
851
852EXPORT_SYMBOL(unshare_files);
853
854static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) 845static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
855{ 846{
856 struct sighand_struct *sig; 847 struct sighand_struct *sig;
@@ -902,7 +893,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
902 sig->group_exit_code = 0; 893 sig->group_exit_code = 0;
903 sig->group_exit_task = NULL; 894 sig->group_exit_task = NULL;
904 sig->group_stop_count = 0; 895 sig->group_stop_count = 0;
905 sig->curr_target = NULL; 896 sig->curr_target = tsk;
906 init_sigpending(&sig->shared_pending); 897 init_sigpending(&sig->shared_pending);
907 INIT_LIST_HEAD(&sig->posix_timers); 898 INIT_LIST_HEAD(&sig->posix_timers);
908 899
@@ -993,6 +984,13 @@ static void rt_mutex_init_task(struct task_struct *p)
993#endif 984#endif
994} 985}
995 986
987#ifdef CONFIG_MM_OWNER
988void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
989{
990 mm->owner = p;
991}
992#endif /* CONFIG_MM_OWNER */
993
996/* 994/*
997 * This creates a new process as a copy of the old one, 995 * This creates a new process as a copy of the old one,
998 * but does not actually start it yet. 996 * but does not actually start it yet.
@@ -1127,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1127 p->audit_context = NULL; 1125 p->audit_context = NULL;
1128 cgroup_fork(p); 1126 cgroup_fork(p);
1129#ifdef CONFIG_NUMA 1127#ifdef CONFIG_NUMA
1130 p->mempolicy = mpol_copy(p->mempolicy); 1128 p->mempolicy = mpol_dup(p->mempolicy);
1131 if (IS_ERR(p->mempolicy)) { 1129 if (IS_ERR(p->mempolicy)) {
1132 retval = PTR_ERR(p->mempolicy); 1130 retval = PTR_ERR(p->mempolicy);
1133 p->mempolicy = NULL; 1131 p->mempolicy = NULL;
@@ -1385,7 +1383,7 @@ bad_fork_cleanup_security:
1385 security_task_free(p); 1383 security_task_free(p);
1386bad_fork_cleanup_policy: 1384bad_fork_cleanup_policy:
1387#ifdef CONFIG_NUMA 1385#ifdef CONFIG_NUMA
1388 mpol_free(p->mempolicy); 1386 mpol_put(p->mempolicy);
1389bad_fork_cleanup_cgroup: 1387bad_fork_cleanup_cgroup:
1390#endif 1388#endif
1391 cgroup_exit(p, cgroup_callbacks_done); 1389 cgroup_exit(p, cgroup_callbacks_done);
@@ -1675,18 +1673,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1675} 1673}
1676 1674
1677/* 1675/*
1678 * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
1679 * supported yet
1680 */
1681static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
1682{
1683 if (unshare_flags & CLONE_SYSVSEM)
1684 return -EINVAL;
1685
1686 return 0;
1687}
1688
1689/*
1690 * unshare allows a process to 'unshare' part of the process 1676 * unshare allows a process to 'unshare' part of the process
1691 * context which was originally shared using clone. copy_* 1677 * context which was originally shared using clone. copy_*
1692 * functions used by do_fork() cannot be used here directly 1678 * functions used by do_fork() cannot be used here directly
@@ -1701,8 +1687,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1701 struct sighand_struct *new_sigh = NULL; 1687 struct sighand_struct *new_sigh = NULL;
1702 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1688 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1703 struct files_struct *fd, *new_fd = NULL; 1689 struct files_struct *fd, *new_fd = NULL;
1704 struct sem_undo_list *new_ulist = NULL;
1705 struct nsproxy *new_nsproxy = NULL; 1690 struct nsproxy *new_nsproxy = NULL;
1691 int do_sysvsem = 0;
1706 1692
1707 check_unshare_flags(&unshare_flags); 1693 check_unshare_flags(&unshare_flags);
1708 1694
@@ -1714,6 +1700,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1714 CLONE_NEWNET)) 1700 CLONE_NEWNET))
1715 goto bad_unshare_out; 1701 goto bad_unshare_out;
1716 1702
1703 /*
1704 * CLONE_NEWIPC must also detach from the undolist: after switching
1705 * to a new ipc namespace, the semaphore arrays from the old
1706 * namespace are unreachable.
1707 */
1708 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1709 do_sysvsem = 1;
1717 if ((err = unshare_thread(unshare_flags))) 1710 if ((err = unshare_thread(unshare_flags)))
1718 goto bad_unshare_out; 1711 goto bad_unshare_out;
1719 if ((err = unshare_fs(unshare_flags, &new_fs))) 1712 if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1724,13 +1717,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1724 goto bad_unshare_cleanup_sigh; 1717 goto bad_unshare_cleanup_sigh;
1725 if ((err = unshare_fd(unshare_flags, &new_fd))) 1718 if ((err = unshare_fd(unshare_flags, &new_fd)))
1726 goto bad_unshare_cleanup_vm; 1719 goto bad_unshare_cleanup_vm;
1727 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1728 goto bad_unshare_cleanup_fd;
1729 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1720 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1730 new_fs))) 1721 new_fs)))
1731 goto bad_unshare_cleanup_semundo; 1722 goto bad_unshare_cleanup_fd;
1732 1723
1733 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1724 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
1725 if (do_sysvsem) {
1726 /*
 1727 * For the undo list, CLONE_SYSVSEM is handled like exit: drop it via exit_sem().
1728 */
1729 exit_sem(current);
1730 }
1734 1731
1735 if (new_nsproxy) { 1732 if (new_nsproxy) {
1736 switch_task_namespaces(current, new_nsproxy); 1733 switch_task_namespaces(current, new_nsproxy);
@@ -1766,7 +1763,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1766 if (new_nsproxy) 1763 if (new_nsproxy)
1767 put_nsproxy(new_nsproxy); 1764 put_nsproxy(new_nsproxy);
1768 1765
1769bad_unshare_cleanup_semundo:
1770bad_unshare_cleanup_fd: 1766bad_unshare_cleanup_fd:
1771 if (new_fd) 1767 if (new_fd)
1772 put_files_struct(new_fd); 1768 put_files_struct(new_fd);
@@ -1788,3 +1784,27 @@ bad_unshare_cleanup_thread:
1788bad_unshare_out: 1784bad_unshare_out:
1789 return err; 1785 return err;
1790} 1786}
1787
1788/*
1789 * Helper to unshare the files of the current task.
1790 * We don't want to expose copy_files internals to
1791 * the exec layer of the kernel.
1792 */
1793
1794int unshare_files(struct files_struct **displaced)
1795{
1796 struct task_struct *task = current;
1797 struct files_struct *copy = NULL;
1798 int error;
1799
1800 error = unshare_fd(CLONE_FILES, &copy);
1801 if (error || !copy) {
1802 *displaced = NULL;
1803 return error;
1804 }
1805 *displaced = task->files;
1806 task_lock(task);
1807 task->files = copy;
1808 task_unlock(task);
1809 return 0;
1810}
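The fork.c diff above also changes the contract of unshare_files(): instead of hiding a racy copy behind a refcount check, it hands the displaced files_struct back to the caller, who must drop it once the old table is no longer needed. A usage sketch under that contract; the calling function is hypothetical:

/* Hypothetical caller of the new unshare_files(); only the contract is real. */
static int example_private_files(void)
{
	struct files_struct *displaced;
	int error;

	error = unshare_files(&displaced);
	if (error)
		return error;

	/* current->files is now private to this task; do the sensitive work. */

	if (displaced)			/* NULL when the table was not shared */
		put_files_struct(displaced);
	return 0;
}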
diff --git a/kernel/futex.c b/kernel/futex.c
index 06968cd79200..98092c9817f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -281,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
281 */ 281 */
282static void get_futex_key_refs(union futex_key *key) 282static void get_futex_key_refs(union futex_key *key)
283{ 283{
284 if (key->both.ptr == 0) 284 if (key->both.ptr == NULL)
285 return; 285 return;
286 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 286 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
287 case FUT_OFF_INODE: 287 case FUT_OFF_INODE:
@@ -1266,11 +1266,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1266 if (!abs_time) 1266 if (!abs_time)
1267 schedule(); 1267 schedule();
1268 else { 1268 else {
1269 hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1269 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
1270 HRTIMER_MODE_ABS);
1270 hrtimer_init_sleeper(&t, current); 1271 hrtimer_init_sleeper(&t, current);
1271 t.timer.expires = *abs_time; 1272 t.timer.expires = *abs_time;
1272 1273
1273 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); 1274 hrtimer_start(&t.timer, t.timer.expires,
1275 HRTIMER_MODE_ABS);
1274 if (!hrtimer_active(&t.timer)) 1276 if (!hrtimer_active(&t.timer))
1275 t.task = NULL; 1277 t.task = NULL;
1276 1278
@@ -1286,6 +1288,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1286 1288
1287 /* Flag if a timeout occurred */ 1289
1288 rem = (t.task == NULL); 1290 rem = (t.task == NULL);
1291
1292 destroy_hrtimer_on_stack(&t.timer);
1289 } 1293 }
1290 } 1294 }
1291 __set_current_state(TASK_RUNNING); 1295 __set_current_state(TASK_RUNNING);
@@ -1367,7 +1371,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1367 1371
1368 if (time) { 1372 if (time) {
1369 to = &timeout; 1373 to = &timeout;
1370 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1374 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1375 HRTIMER_MODE_ABS);
1371 hrtimer_init_sleeper(to, current); 1376 hrtimer_init_sleeper(to, current);
1372 to->timer.expires = *time; 1377 to->timer.expires = *time;
1373 } 1378 }
@@ -1581,6 +1586,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1581 unqueue_me_pi(&q); 1586 unqueue_me_pi(&q);
1582 futex_unlock_mm(fshared); 1587 futex_unlock_mm(fshared);
1583 1588
1589 if (to)
1590 destroy_hrtimer_on_stack(&to->timer);
1584 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1591 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1585 1592
1586 out_unlock_release_sem: 1593 out_unlock_release_sem:
@@ -1588,6 +1595,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1588 1595
1589 out_release_sem: 1596 out_release_sem:
1590 futex_unlock_mm(fshared); 1597 futex_unlock_mm(fshared);
1598 if (to)
1599 destroy_hrtimer_on_stack(&to->timer);
1591 return ret; 1600 return ret;
1592 1601
1593 uaddr_faulted: 1602 uaddr_faulted:
@@ -1615,6 +1624,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1615 if (!ret && (uval != -EFAULT)) 1624 if (!ret && (uval != -EFAULT))
1616 goto retry; 1625 goto retry;
1617 1626
1627 if (to)
1628 destroy_hrtimer_on_stack(&to->timer);
1618 return ret; 1629 return ret;
1619} 1630}
1620 1631
@@ -2158,7 +2169,7 @@ static struct file_system_type futex_fs_type = {
2158 .kill_sb = kill_anon_super, 2169 .kill_sb = kill_anon_super,
2159}; 2170};
2160 2171
2161static int __init init(void) 2172static int __init futex_init(void)
2162{ 2173{
2163 u32 curval; 2174 u32 curval;
2164 int i; 2175 int i;
@@ -2194,4 +2205,4 @@ static int __init init(void)
2194 2205
2195 return 0; 2206 return 0;
2196} 2207}
2197__initcall(init); 2208__initcall(futex_init);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index ff90f049f8f6..04ac3a9e42cf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
30 return 0; 30 return 0;
31} 31}
32 32
33static void __user *futex_uaddr(struct robust_list *entry, 33static void __user *futex_uaddr(struct robust_list __user *entry,
34 compat_long_t futex_offset) 34 compat_long_t futex_offset)
35{ 35{
36 compat_uptr_t base = ptr_to_compat(entry); 36 compat_uptr_t base = ptr_to_compat(entry);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 98bee013f71f..9af1d6a8095e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,7 @@
43#include <linux/tick.h> 43#include <linux/tick.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45#include <linux/err.h> 45#include <linux/err.h>
46#include <linux/debugobjects.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
@@ -342,6 +343,115 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
342 return res; 343 return res;
343} 344}
344 345
346#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
347
348static struct debug_obj_descr hrtimer_debug_descr;
349
350/*
351 * fixup_init is called when:
352 * - an active object is initialized
353 */
354static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
355{
356 struct hrtimer *timer = addr;
357
358 switch (state) {
359 case ODEBUG_STATE_ACTIVE:
360 hrtimer_cancel(timer);
361 debug_object_init(timer, &hrtimer_debug_descr);
362 return 1;
363 default:
364 return 0;
365 }
366}
367
368/*
369 * fixup_activate is called when:
370 * - an active object is activated
371 * - an unknown object is activated (might be a statically initialized object)
372 */
373static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
374{
375 switch (state) {
376
377 case ODEBUG_STATE_NOTAVAILABLE:
378 WARN_ON_ONCE(1);
379 return 0;
380
381 case ODEBUG_STATE_ACTIVE:
382 WARN_ON(1);
383
384 default:
385 return 0;
386 }
387}
388
389/*
390 * fixup_free is called when:
391 * - an active object is freed
392 */
393static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
394{
395 struct hrtimer *timer = addr;
396
397 switch (state) {
398 case ODEBUG_STATE_ACTIVE:
399 hrtimer_cancel(timer);
400 debug_object_free(timer, &hrtimer_debug_descr);
401 return 1;
402 default:
403 return 0;
404 }
405}
406
407static struct debug_obj_descr hrtimer_debug_descr = {
408 .name = "hrtimer",
409 .fixup_init = hrtimer_fixup_init,
410 .fixup_activate = hrtimer_fixup_activate,
411 .fixup_free = hrtimer_fixup_free,
412};
413
414static inline void debug_hrtimer_init(struct hrtimer *timer)
415{
416 debug_object_init(timer, &hrtimer_debug_descr);
417}
418
419static inline void debug_hrtimer_activate(struct hrtimer *timer)
420{
421 debug_object_activate(timer, &hrtimer_debug_descr);
422}
423
424static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
425{
426 debug_object_deactivate(timer, &hrtimer_debug_descr);
427}
428
429static inline void debug_hrtimer_free(struct hrtimer *timer)
430{
431 debug_object_free(timer, &hrtimer_debug_descr);
432}
433
434static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
435 enum hrtimer_mode mode);
436
437void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
438 enum hrtimer_mode mode)
439{
440 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
441 __hrtimer_init(timer, clock_id, mode);
442}
443
444void destroy_hrtimer_on_stack(struct hrtimer *timer)
445{
446 debug_object_free(timer, &hrtimer_debug_descr);
447}
448
449#else
450static inline void debug_hrtimer_init(struct hrtimer *timer) { }
451static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
452static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
453#endif
454
345/* 455/*
346 * Check, whether the timer is on the callback pending list 456 * Check, whether the timer is on the callback pending list
347 */ 457 */
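The block above teaches the debugobjects infrastructure about hrtimers: timers that live on the stack must be announced with hrtimer_init_on_stack() and retired with destroy_hrtimer_on_stack() before their frame goes away, which is exactly the pattern the futex and nanosleep hunks in this series adopt. A minimal sketch of that lifecycle, modelled on the sleeper usage in futex_wait(); the 100 ms delay is an arbitrary example value:

/* On-stack hrtimer sleeper lifecycle; mirrors the pattern used by futex_wait(). */
static void example_sleep_100ms(void)
{
	struct hrtimer_sleeper t;

	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_init_sleeper(&t, current);	/* installs hrtimer_wakeup() */

	set_current_state(TASK_UNINTERRUPTIBLE);
	hrtimer_start(&t.timer, ktime_set(0, 100 * NSEC_PER_MSEC),
		      HRTIMER_MODE_REL);
	if (likely(t.task))
		schedule();			/* woken when the timer fires */
	__set_current_state(TASK_RUNNING);

	hrtimer_cancel(&t.timer);
	destroy_hrtimer_on_stack(&t.timer);	/* mandatory for on-stack timers */
}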
@@ -567,6 +677,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
567 /* Timer is expired, act upon the callback mode */ 677 /* Timer is expired, act upon the callback mode */
568 switch(timer->cb_mode) { 678 switch(timer->cb_mode) {
569 case HRTIMER_CB_IRQSAFE_NO_RESTART: 679 case HRTIMER_CB_IRQSAFE_NO_RESTART:
680 debug_hrtimer_deactivate(timer);
570 /* 681 /*
571 * We can call the callback from here. No restart 682 * We can call the callback from here. No restart
572 * happens, so no danger of recursion 683 * happens, so no danger of recursion
@@ -581,6 +692,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
581 * the tick timer in the softirq ! The calling site 692 * the tick timer in the softirq ! The calling site
582 * takes care of this. 693 * takes care of this.
583 */ 694 */
695 debug_hrtimer_deactivate(timer);
584 return 1; 696 return 1;
585 case HRTIMER_CB_IRQSAFE: 697 case HRTIMER_CB_IRQSAFE:
586 case HRTIMER_CB_SOFTIRQ: 698 case HRTIMER_CB_SOFTIRQ:
@@ -590,7 +702,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
590 list_add_tail(&timer->cb_entry, 702 list_add_tail(&timer->cb_entry,
591 &base->cpu_base->cb_pending); 703 &base->cpu_base->cb_pending);
592 timer->state = HRTIMER_STATE_PENDING; 704 timer->state = HRTIMER_STATE_PENDING;
593 raise_softirq(HRTIMER_SOFTIRQ);
594 return 1; 705 return 1;
595 default: 706 default:
596 BUG(); 707 BUG();
@@ -633,6 +744,11 @@ static int hrtimer_switch_to_hres(void)
633 return 1; 744 return 1;
634} 745}
635 746
747static inline void hrtimer_raise_softirq(void)
748{
749 raise_softirq(HRTIMER_SOFTIRQ);
750}
751
636#else 752#else
637 753
638static inline int hrtimer_hres_active(void) { return 0; } 754static inline int hrtimer_hres_active(void) { return 0; }
@@ -651,6 +767,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
651{ 767{
652 return 0; 768 return 0;
653} 769}
770static inline void hrtimer_raise_softirq(void) { }
654 771
655#endif /* CONFIG_HIGH_RES_TIMERS */ 772#endif /* CONFIG_HIGH_RES_TIMERS */
656 773
@@ -730,6 +847,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
730 struct hrtimer *entry; 847 struct hrtimer *entry;
731 int leftmost = 1; 848 int leftmost = 1;
732 849
850 debug_hrtimer_activate(timer);
851
733 /* 852 /*
734 * Find the right place in the rbtree: 853 * Find the right place in the rbtree:
735 */ 854 */
@@ -826,6 +945,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
826 * reprogramming happens in the interrupt handler. This is a 945 * reprogramming happens in the interrupt handler. This is a
827 * rare case and less expensive than a smp call. 946 * rare case and less expensive than a smp call.
828 */ 947 */
948 debug_hrtimer_deactivate(timer);
829 timer_stats_hrtimer_clear_start_info(timer); 949 timer_stats_hrtimer_clear_start_info(timer);
830 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 950 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
831 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 951 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -850,7 +970,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
850{ 970{
851 struct hrtimer_clock_base *base, *new_base; 971 struct hrtimer_clock_base *base, *new_base;
852 unsigned long flags; 972 unsigned long flags;
853 int ret; 973 int ret, raise;
854 974
855 base = lock_hrtimer_base(timer, &flags); 975 base = lock_hrtimer_base(timer, &flags);
856 976
@@ -873,6 +993,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
873 tim = ktime_add_safe(tim, base->resolution); 993 tim = ktime_add_safe(tim, base->resolution);
874#endif 994#endif
875 } 995 }
996
876 timer->expires = tim; 997 timer->expires = tim;
877 998
878 timer_stats_hrtimer_set_start_info(timer); 999 timer_stats_hrtimer_set_start_info(timer);
@@ -884,8 +1005,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
884 enqueue_hrtimer(timer, new_base, 1005 enqueue_hrtimer(timer, new_base,
885 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 1006 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
886 1007
1008 /*
1009 * The timer may be expired and moved to the cb_pending
1010 * list. We can not raise the softirq with base lock held due
1011 * to a possible deadlock with runqueue lock.
1012 */
1013 raise = timer->state == HRTIMER_STATE_PENDING;
1014
887 unlock_hrtimer_base(timer, &flags); 1015 unlock_hrtimer_base(timer, &flags);
888 1016
1017 if (raise)
1018 hrtimer_raise_softirq();
1019
889 return ret; 1020 return ret;
890} 1021}
891EXPORT_SYMBOL_GPL(hrtimer_start); 1022EXPORT_SYMBOL_GPL(hrtimer_start);
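Note the ordering introduced in the hunk above: whether the timer landed on the cb_pending list is sampled while the base lock is still held, but the softirq is raised only after unlock_hrtimer_base(), because raising it under the base lock could deadlock against the runqueue lock. The same decide-under-the-lock, act-after-dropping-it shape in isolation; the lock and the pending flag stand in for the hrtimer base lock and HRTIMER_STATE_PENDING:

/* Generic form of the deferral used in hrtimer_start() above. */
static void example_deferred_raise(spinlock_t *lock, const int *pending)
{
	unsigned long flags;
	int raise;

	spin_lock_irqsave(lock, flags);
	/* ... queue the work, possibly marking it pending ... */
	raise = *pending;
	spin_unlock_irqrestore(lock, flags);

	if (raise)
		raise_softirq(HRTIMER_SOFTIRQ);	/* no spinlock held here */
}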
@@ -996,14 +1127,8 @@ ktime_t hrtimer_get_next_event(void)
996} 1127}
997#endif 1128#endif
998 1129
999/** 1130static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1000 * hrtimer_init - initialize a timer to the given clock 1131 enum hrtimer_mode mode)
1001 * @timer: the timer to be initialized
1002 * @clock_id: the clock to be used
1003 * @mode: timer mode abs/rel
1004 */
1005void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1006 enum hrtimer_mode mode)
1007{ 1132{
1008 struct hrtimer_cpu_base *cpu_base; 1133 struct hrtimer_cpu_base *cpu_base;
1009 1134
@@ -1024,6 +1149,19 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1024 memset(timer->start_comm, 0, TASK_COMM_LEN); 1149 memset(timer->start_comm, 0, TASK_COMM_LEN);
1025#endif 1150#endif
1026} 1151}
1152
1153/**
1154 * hrtimer_init - initialize a timer to the given clock
1155 * @timer: the timer to be initialized
1156 * @clock_id: the clock to be used
1157 * @mode: timer mode abs/rel
1158 */
1159void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1160 enum hrtimer_mode mode)
1161{
1162 debug_hrtimer_init(timer);
1163 __hrtimer_init(timer, clock_id, mode);
1164}
1027EXPORT_SYMBOL_GPL(hrtimer_init); 1165EXPORT_SYMBOL_GPL(hrtimer_init);
1028 1166
1029/** 1167/**
@@ -1057,6 +1195,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1057 timer = list_entry(cpu_base->cb_pending.next, 1195 timer = list_entry(cpu_base->cb_pending.next,
1058 struct hrtimer, cb_entry); 1196 struct hrtimer, cb_entry);
1059 1197
1198 debug_hrtimer_deactivate(timer);
1060 timer_stats_account_hrtimer(timer); 1199 timer_stats_account_hrtimer(timer);
1061 1200
1062 fn = timer->function; 1201 fn = timer->function;
@@ -1080,8 +1219,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1080 * If the timer was rearmed on another CPU, reprogram 1219 * If the timer was rearmed on another CPU, reprogram
1081 * the event device. 1220 * the event device.
1082 */ 1221 */
1083 if (timer->base->first == &timer->node) 1222 struct hrtimer_clock_base *base = timer->base;
1084 hrtimer_reprogram(timer, timer->base); 1223
1224 if (base->first == &timer->node &&
1225 hrtimer_reprogram(timer, base)) {
1226 /*
1227 * Timer is expired. Thus move it from tree to
1228 * pending list again.
1229 */
1230 __remove_hrtimer(timer, base,
1231 HRTIMER_STATE_PENDING, 0);
1232 list_add_tail(&timer->cb_entry,
1233 &base->cpu_base->cb_pending);
1234 }
1085 } 1235 }
1086 } 1236 }
1087 spin_unlock_irq(&cpu_base->lock); 1237 spin_unlock_irq(&cpu_base->lock);
@@ -1094,6 +1244,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1094 enum hrtimer_restart (*fn)(struct hrtimer *); 1244 enum hrtimer_restart (*fn)(struct hrtimer *);
1095 int restart; 1245 int restart;
1096 1246
1247 debug_hrtimer_deactivate(timer);
1097 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1248 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1098 timer_stats_account_hrtimer(timer); 1249 timer_stats_account_hrtimer(timer);
1099 1250
@@ -1238,51 +1389,50 @@ void hrtimer_run_pending(void)
1238/* 1389/*
1239 * Called from hardirq context every jiffy 1390 * Called from hardirq context every jiffy
1240 */ 1391 */
1241static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, 1392void hrtimer_run_queues(void)
1242 int index)
1243{ 1393{
1244 struct rb_node *node; 1394 struct rb_node *node;
1245 struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; 1395 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1396 struct hrtimer_clock_base *base;
1397 int index, gettime = 1;
1246 1398
1247 if (!base->first) 1399 if (hrtimer_hres_active())
1248 return; 1400 return;
1249 1401
1250 if (base->get_softirq_time) 1402 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1251 base->softirq_time = base->get_softirq_time(); 1403 base = &cpu_base->clock_base[index];
1252
1253 spin_lock(&cpu_base->lock);
1254
1255 while ((node = base->first)) {
1256 struct hrtimer *timer;
1257
1258 timer = rb_entry(node, struct hrtimer, node);
1259 if (base->softirq_time.tv64 <= timer->expires.tv64)
1260 break;
1261 1404
1262 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { 1405 if (!base->first)
1263 __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
1264 list_add_tail(&timer->cb_entry,
1265 &base->cpu_base->cb_pending);
1266 continue; 1406 continue;
1407
1408 if (base->get_softirq_time)
1409 base->softirq_time = base->get_softirq_time();
1410 else if (gettime) {
1411 hrtimer_get_softirq_time(cpu_base);
1412 gettime = 0;
1267 } 1413 }
1268 1414
1269 __run_hrtimer(timer); 1415 spin_lock(&cpu_base->lock);
1270 }
1271 spin_unlock(&cpu_base->lock);
1272}
1273 1416
1274void hrtimer_run_queues(void) 1417 while ((node = base->first)) {
1275{ 1418 struct hrtimer *timer;
1276 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1277 int i;
1278 1419
1279 if (hrtimer_hres_active()) 1420 timer = rb_entry(node, struct hrtimer, node);
1280 return; 1421 if (base->softirq_time.tv64 <= timer->expires.tv64)
1422 break;
1281 1423
1282 hrtimer_get_softirq_time(cpu_base); 1424 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1425 __remove_hrtimer(timer, base,
1426 HRTIMER_STATE_PENDING, 0);
1427 list_add_tail(&timer->cb_entry,
1428 &base->cpu_base->cb_pending);
1429 continue;
1430 }
1283 1431
1284 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1432 __run_hrtimer(timer);
1285 run_hrtimer_queue(cpu_base, i); 1433 }
1434 spin_unlock(&cpu_base->lock);
1435 }
1286} 1436}
1287 1437
1288/* 1438/*
@@ -1353,22 +1503,27 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1353{ 1503{
1354 struct hrtimer_sleeper t; 1504 struct hrtimer_sleeper t;
1355 struct timespec __user *rmtp; 1505 struct timespec __user *rmtp;
1506 int ret = 0;
1356 1507
1357 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); 1508 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
1358 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1509 HRTIMER_MODE_ABS);
1510 t.timer.expires.tv64 = restart->nanosleep.expires;
1359 1511
1360 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1512 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1361 return 0; 1513 goto out;
1362 1514
1363 rmtp = (struct timespec __user *)restart->arg1; 1515 rmtp = restart->nanosleep.rmtp;
1364 if (rmtp) { 1516 if (rmtp) {
1365 int ret = update_rmtp(&t.timer, rmtp); 1517 ret = update_rmtp(&t.timer, rmtp);
1366 if (ret <= 0) 1518 if (ret <= 0)
1367 return ret; 1519 goto out;
1368 } 1520 }
1369 1521
1370 /* The other values in restart are already filled in */ 1522 /* The other values in restart are already filled in */
1371 return -ERESTART_RESTARTBLOCK; 1523 ret = -ERESTART_RESTARTBLOCK;
1524out:
1525 destroy_hrtimer_on_stack(&t.timer);
1526 return ret;
1372} 1527}
1373 1528
1374long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, 1529long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
@@ -1376,30 +1531,35 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1376{ 1531{
1377 struct restart_block *restart; 1532 struct restart_block *restart;
1378 struct hrtimer_sleeper t; 1533 struct hrtimer_sleeper t;
1534 int ret = 0;
1379 1535
1380 hrtimer_init(&t.timer, clockid, mode); 1536 hrtimer_init_on_stack(&t.timer, clockid, mode);
1381 t.timer.expires = timespec_to_ktime(*rqtp); 1537 t.timer.expires = timespec_to_ktime(*rqtp);
1382 if (do_nanosleep(&t, mode)) 1538 if (do_nanosleep(&t, mode))
1383 return 0; 1539 goto out;
1384 1540
1385 /* Absolute timers do not update the rmtp value and restart: */ 1541 /* Absolute timers do not update the rmtp value and restart: */
1386 if (mode == HRTIMER_MODE_ABS) 1542 if (mode == HRTIMER_MODE_ABS) {
1387 return -ERESTARTNOHAND; 1543 ret = -ERESTARTNOHAND;
1544 goto out;
1545 }
1388 1546
1389 if (rmtp) { 1547 if (rmtp) {
1390 int ret = update_rmtp(&t.timer, rmtp); 1548 ret = update_rmtp(&t.timer, rmtp);
1391 if (ret <= 0) 1549 if (ret <= 0)
1392 return ret; 1550 goto out;
1393 } 1551 }
1394 1552
1395 restart = &current_thread_info()->restart_block; 1553 restart = &current_thread_info()->restart_block;
1396 restart->fn = hrtimer_nanosleep_restart; 1554 restart->fn = hrtimer_nanosleep_restart;
1397 restart->arg0 = (unsigned long) t.timer.base->index; 1555 restart->nanosleep.index = t.timer.base->index;
1398 restart->arg1 = (unsigned long) rmtp; 1556 restart->nanosleep.rmtp = rmtp;
1399 restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; 1557 restart->nanosleep.expires = t.timer.expires.tv64;
1400 restart->arg3 = t.timer.expires.tv64 >> 32;
1401 1558
1402 return -ERESTART_RESTARTBLOCK; 1559 ret = -ERESTART_RESTARTBLOCK;
1560out:
1561 destroy_hrtimer_on_stack(&t.timer);
1562 return ret;
1403} 1563}
1404 1564
1405asmlinkage long 1565asmlinkage long
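The nanosleep paths above stop splitting the restart state across restart->arg0..arg3 and instead use dedicated fields of the restart block. Inferred purely from the three fields referenced here (nanosleep.index, nanosleep.rmtp, nanosleep.expires), that member is assumed to look roughly like this; the authoritative layout is the restart_block definition in the headers:

/* Assumed shape of restart_block's nanosleep member, inferred from the
 * assignments above (clock base index, user rmtp pointer, ktime .tv64). */
struct example_restart_nanosleep {
	clockid_t		index;		/* clock to restart the sleep on */
	struct timespec __user	*rmtp;		/* where to report remaining time */
	u64			expires;	/* absolute expiry in nanoseconds */
};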
@@ -1425,7 +1585,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1425 int i; 1585 int i;
1426 1586
1427 spin_lock_init(&cpu_base->lock); 1587 spin_lock_init(&cpu_base->lock);
1428 lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
1429 1588
1430 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1589 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1431 cpu_base->clock_base[i].cpu_base = cpu_base; 1590 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1445,6 +1604,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1445 while ((node = rb_first(&old_base->active))) { 1604 while ((node = rb_first(&old_base->active))) {
1446 timer = rb_entry(node, struct hrtimer, node); 1605 timer = rb_entry(node, struct hrtimer, node);
1447 BUG_ON(hrtimer_callback_running(timer)); 1606 BUG_ON(hrtimer_callback_running(timer));
1607 debug_hrtimer_deactivate(timer);
1448 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1608 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
1449 timer->base = new_base; 1609 timer->base = new_base;
1450 /* 1610 /*
@@ -1466,16 +1626,16 @@ static void migrate_hrtimers(int cpu)
1466 tick_cancel_sched_timer(cpu); 1626 tick_cancel_sched_timer(cpu);
1467 1627
1468 local_irq_disable(); 1628 local_irq_disable();
1469 double_spin_lock(&new_base->lock, &old_base->lock, 1629 spin_lock(&new_base->lock);
1470 smp_processor_id() < cpu); 1630 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1471 1631
1472 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1632 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1473 migrate_hrtimer_list(&old_base->clock_base[i], 1633 migrate_hrtimer_list(&old_base->clock_base[i],
1474 &new_base->clock_base[i]); 1634 &new_base->clock_base[i]);
1475 } 1635 }
1476 1636
1477 double_spin_unlock(&new_base->lock, &old_base->lock, 1637 spin_unlock(&old_base->lock);
1478 smp_processor_id() < cpu); 1638 spin_unlock(&new_base->lock);
1479 local_irq_enable(); 1639 local_irq_enable();
1480 put_cpu_var(hrtimer_bases); 1640 put_cpu_var(hrtimer_bases);
1481} 1641}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fdb3fbe2b0c4..964964baefa2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq)
47 desc->irq_count = 0; 47 desc->irq_count = 0;
48 desc->irqs_unhandled = 0; 48 desc->irqs_unhandled = 0;
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50 desc->affinity = CPU_MASK_ALL; 50 cpus_setall(desc->affinity);
51#endif 51#endif
52 spin_unlock_irqrestore(&desc->lock, flags); 52 spin_unlock_irqrestore(&desc->lock, flags);
53} 53}
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6d9204f3a370..38a25b8d8bff 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/interrupt.h> 2#include <linux/interrupt.h>
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/gfp.h>
4 5
5/* 6/*
6 * Device resource management aware IRQ request/free implementation. 7 * Device resource management aware IRQ request/free implementation.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a01464287..46d6611a33bb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/slab.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -149,6 +150,26 @@ void disable_irq(unsigned int irq)
149} 150}
150EXPORT_SYMBOL(disable_irq); 151EXPORT_SYMBOL(disable_irq);
151 152
153static void __enable_irq(struct irq_desc *desc, unsigned int irq)
154{
155 switch (desc->depth) {
156 case 0:
157 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
158 WARN_ON(1);
159 break;
160 case 1: {
161 unsigned int status = desc->status & ~IRQ_DISABLED;
162
163 /* Prevent probing on this irq: */
164 desc->status = status | IRQ_NOPROBE;
165 check_irq_resend(desc, irq);
166 /* fall-through */
167 }
168 default:
169 desc->depth--;
170 }
171}
172
152/** 173/**
153 * enable_irq - enable handling of an irq 174 * enable_irq - enable handling of an irq
154 * @irq: Interrupt to enable 175 * @irq: Interrupt to enable
@@ -168,22 +189,7 @@ void enable_irq(unsigned int irq)
168 return; 189 return;
169 190
170 spin_lock_irqsave(&desc->lock, flags); 191 spin_lock_irqsave(&desc->lock, flags);
171 switch (desc->depth) { 192 __enable_irq(desc, irq);
172 case 0:
173 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
174 WARN_ON(1);
175 break;
176 case 1: {
177 unsigned int status = desc->status & ~IRQ_DISABLED;
178
179 /* Prevent probing on this irq: */
180 desc->status = status | IRQ_NOPROBE;
181 check_irq_resend(desc, irq);
182 /* fall-through */
183 }
184 default:
185 desc->depth--;
186 }
187 spin_unlock_irqrestore(&desc->lock, flags); 193 spin_unlock_irqrestore(&desc->lock, flags);
188} 194}
189EXPORT_SYMBOL(enable_irq); 195EXPORT_SYMBOL(enable_irq);
@@ -364,7 +370,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
364 compat_irq_chip_set_default_handler(desc); 370 compat_irq_chip_set_default_handler(desc);
365 371
366 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 372 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
367 IRQ_INPROGRESS); 373 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
368 374
369 if (!(desc->status & IRQ_NOAUTOEN)) { 375 if (!(desc->status & IRQ_NOAUTOEN)) {
370 desc->depth = 0; 376 desc->depth = 0;
@@ -380,6 +386,16 @@ int setup_irq(unsigned int irq, struct irqaction *new)
380 /* Reset broken irq detection when installing new handler */ 386 /* Reset broken irq detection when installing new handler */
381 desc->irq_count = 0; 387 desc->irq_count = 0;
382 desc->irqs_unhandled = 0; 388 desc->irqs_unhandled = 0;
389
390 /*
391 * Check whether we disabled the irq via the spurious handler
392 * before. Reenable it and give it another chance.
393 */
394 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
395 desc->status &= ~IRQ_SPURIOUS_DISABLED;
396 __enable_irq(desc, irq);
397 }
398
383 spin_unlock_irqrestore(&desc->lock, flags); 399 spin_unlock_irqrestore(&desc->lock, flags);
384 400
385 new->irq = irq; 401 new->irq = irq;
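Together with the spurious.c change just below, this gives a misbehaving shared interrupt a second chance: note_interrupt() marks it IRQ_SPURIOUS_DISABLED and bumps the disable depth instead of pinning it to 1, and setup_irq() re-enables it via __enable_irq() when another handler is registered. The depth-counting contract that __enable_irq() enforces, as a hedged illustration (IRQ 17 is arbitrary):

/* Depth-counting contract of disable_irq()/enable_irq(); IRQ 17 is arbitrary. */
static void example_irq_depth(void)
{
	disable_irq(17);	/* depth 0 -> 1: line masked */
	disable_irq(17);	/* depth 1 -> 2: still masked */

	enable_irq(17);		/* depth 2 -> 1: still masked */
	enable_irq(17);		/* depth 1 -> 0: unmasked; a pending IRQ may be resent */

	/* One more enable_irq(17) here would hit the "Unbalanced enable" warning. */
}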
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 088dabbf2d6a..c66d3f10e853 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -209,8 +209,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
209 * Now kill the IRQ 209 * Now kill the IRQ
210 */ 210 */
211 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 211 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
212 desc->status |= IRQ_DISABLED; 212 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
213 desc->depth = 1; 213 desc->depth++;
214 desc->chip->disable(irq); 214 desc->chip->disable(irq);
215 } 215 }
216 desc->irqs_unhandled = 0; 216 desc->irqs_unhandled = 0;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f091d13def00..6fc0040f3e3a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = {
472 472
473static int __init kallsyms_init(void) 473static int __init kallsyms_init(void)
474{ 474{
475 struct proc_dir_entry *entry; 475 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
476
477 entry = create_proc_entry("kallsyms", 0444, NULL);
478 if (entry)
479 entry->proc_fops = &kallsyms_operations;
480 return 0; 476 return 0;
481} 477}
482__initcall(kallsyms_init); 478__initcall(kallsyms_init);
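kallsyms_init now registers its /proc entry with the single-call proc_create() instead of create_proc_entry() followed by a proc_fops assignment. The same conversion on a hypothetical file, for reference:

/* Hypothetical /proc file showing the create_proc_entry() -> proc_create()
 * conversion; example_fops is a placeholder. */
static const struct file_operations example_fops;

static int __init example_proc_init(void)
{
	proc_create("example", 0444, NULL, &example_fops);
	return 0;
}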
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 06a0e2775651..1c5fcacbcf33 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -29,7 +29,6 @@
29#include <asm/uaccess.h> 29#include <asm/uaccess.h>
30#include <asm/io.h> 30#include <asm/io.h>
31#include <asm/system.h> 31#include <asm/system.h>
32#include <asm/semaphore.h>
33#include <asm/sections.h> 32#include <asm/sections.h>
34 33
35/* Per cpu memory for storing cpu states in case of system crash. */ 34/* Per cpu memory for storing cpu states in case of system crash. */
@@ -1218,7 +1217,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1218 } 1217 }
1219 1218
1220 /* match ? */ 1219 /* match ? */
1221 if (system_ram >= start && system_ram <= end) { 1220 if (system_ram >= start && system_ram < end) {
1222 *crash_size = size; 1221 *crash_size = size;
1223 break; 1222 break;
1224 } 1223 }
@@ -1406,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1406 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1405 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1407 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1406 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1408 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1407 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1408 VMCOREINFO_NUMBER(PG_lru);
1409 VMCOREINFO_NUMBER(PG_private);
1410 VMCOREINFO_NUMBER(PG_swapcache);
1409 1411
1410 arch_crash_save_vmcoreinfo(); 1412 arch_crash_save_vmcoreinfo();
1411 1413
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
new file mode 100644
index 000000000000..1bd0ec1c80b2
--- /dev/null
+++ b/kernel/kgdb.c
@@ -0,0 +1,1700 @@
1/*
2 * KGDB stub.
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2008 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/reboot.h>
41#include <linux/string.h>
42#include <linux/delay.h>
43#include <linux/sched.h>
44#include <linux/sysrq.h>
45#include <linux/init.h>
46#include <linux/kgdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55
56static int kgdb_break_asap;
57
58struct kgdb_state {
59 int ex_vector;
60 int signo;
61 int err_code;
62 int cpu;
63 int pass_exception;
64 long threadid;
65 long kgdb_usethreadid;
66 struct pt_regs *linux_regs;
67};
68
69static struct debuggerinfo_struct {
70 void *debuggerinfo;
71 struct task_struct *task;
72} kgdb_info[NR_CPUS];
73
74/**
75 * kgdb_connected - Is a host GDB connected to us?
76 */
77int kgdb_connected;
78EXPORT_SYMBOL_GPL(kgdb_connected);
79
80/* All the KGDB handlers are installed */
81static int kgdb_io_module_registered;
82
83/* Guard for recursive entry */
84static int exception_level;
85
86static struct kgdb_io *kgdb_io_ops;
87static DEFINE_SPINLOCK(kgdb_registration_lock);
88
89/* kgdb console driver is loaded */
90static int kgdb_con_registered;
91/* determine if kgdb console output should be used */
92static int kgdb_use_con;
93
94static int __init opt_kgdb_con(char *str)
95{
96 kgdb_use_con = 1;
97 return 0;
98}
99
100early_param("kgdbcon", opt_kgdb_con);
101
102module_param(kgdb_use_con, int, 0644);
103
104/*
105 * Holds information about breakpoints in a kernel. These breakpoints are
106 * added and removed by gdb.
107 */
108static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
110};
111
112/*
113 * The CPU# of the active CPU, or -1 if none:
114 */
115atomic_t kgdb_active = ATOMIC_INIT(-1);
116
117/*
118 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
119 * bootup code (which might not have percpu set up yet):
120 */
121static atomic_t passive_cpu_wait[NR_CPUS];
122static atomic_t cpu_in_kgdb[NR_CPUS];
123atomic_t kgdb_setting_breakpoint;
124
125struct task_struct *kgdb_usethread;
126struct task_struct *kgdb_contthread;
127
128int kgdb_single_step;
129
130/* Our I/O buffers. */
131static char remcom_in_buffer[BUFMAX];
132static char remcom_out_buffer[BUFMAX];
133
134/* Storage for the registers, in GDB format. */
135static unsigned long gdb_regs[(NUMREGBYTES +
136 sizeof(unsigned long) - 1) /
137 sizeof(unsigned long)];
138
139/* to keep track of the CPU which is doing the single stepping */
140atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
141
142/*
143 * If you are debugging a problem where roundup (the collection of
144 * all other CPUs) is a problem [this should be extremely rare],
145 * then use the nokgdbroundup option to avoid roundup. In that case
146 * the other CPUs might interfere with your debugging context, so
147 * use this with care:
148 */
149int kgdb_do_roundup = 1;
150
151static int __init opt_nokgdbroundup(char *str)
152{
153 kgdb_do_roundup = 0;
154
155 return 0;
156}
157
158early_param("nokgdbroundup", opt_nokgdbroundup);
159
160/*
161 * Finally, some KGDB code :-)
162 */
163
164/*
165 * Weak aliases for breakpoint management,
166 * can be overridden by architectures when needed:
167 */
168int __weak kgdb_validate_break_address(unsigned long addr)
169{
170 char tmp_variable[BREAK_INSTR_SIZE];
171
172 return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE);
173}
174
175int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
176{
177 int err;
178
179 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
180 if (err)
181 return err;
182
183 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
184 BREAK_INSTR_SIZE);
185}
186
187int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
188{
189 return probe_kernel_write((char *)addr,
190 (char *)bundle, BREAK_INSTR_SIZE);
191}
192
193unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
194{
195 return instruction_pointer(regs);
196}
197
198int __weak kgdb_arch_init(void)
199{
200 return 0;
201}
202
203int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
204{
205 return 0;
206}
207
208void __weak
209kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
210{
211 return;
212}
213
214/**
215 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
216 * @regs: Current &struct pt_regs.
217 *
218 * This function will be called if the particular architecture must
219 * disable hardware debugging while it is processing gdb packets or
220 * handling an exception.
221 */
222void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
223{
224}
225
226/*
227 * GDB remote protocol parser:
228 */
229
230static const char hexchars[] = "0123456789abcdef";
231
232static int hex(char ch)
233{
234 if ((ch >= 'a') && (ch <= 'f'))
235 return ch - 'a' + 10;
236 if ((ch >= '0') && (ch <= '9'))
237 return ch - '0';
238 if ((ch >= 'A') && (ch <= 'F'))
239 return ch - 'A' + 10;
240 return -1;
241}
242
243/* scan for the sequence $<data>#<checksum> */
244static void get_packet(char *buffer)
245{
246 unsigned char checksum;
247 unsigned char xmitcsum;
248 int count;
249 char ch;
250
251 do {
252 /*
253 * Spin and wait around for the start character, ignore all
254 * other characters:
255 */
256 while ((ch = (kgdb_io_ops->read_char())) != '$')
257 /* nothing */;
258
259 kgdb_connected = 1;
260 checksum = 0;
261 xmitcsum = -1;
262
263 count = 0;
264
265 /*
266 * now, read until a # or end of buffer is found:
267 */
268 while (count < (BUFMAX - 1)) {
269 ch = kgdb_io_ops->read_char();
270 if (ch == '#')
271 break;
272 checksum = checksum + ch;
273 buffer[count] = ch;
274 count = count + 1;
275 }
276 buffer[count] = 0;
277
278 if (ch == '#') {
279 xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
280 xmitcsum += hex(kgdb_io_ops->read_char());
281
282 if (checksum != xmitcsum)
283 /* failed checksum */
284 kgdb_io_ops->write_char('-');
285 else
286 /* successful transfer */
287 kgdb_io_ops->write_char('+');
288 if (kgdb_io_ops->flush)
289 kgdb_io_ops->flush();
290 }
291 } while (checksum != xmitcsum);
292}
293
294/*
295 * Send the packet in buffer.
296 * Check for gdb connection if asked for.
297 */
298static void put_packet(char *buffer)
299{
300 unsigned char checksum;
301 int count;
302 char ch;
303
304 /*
305 * $<packet info>#<checksum>.
306 */
307 while (1) {
308 kgdb_io_ops->write_char('$');
309 checksum = 0;
310 count = 0;
311
312 while ((ch = buffer[count])) {
313 kgdb_io_ops->write_char(ch);
314 checksum += ch;
315 count++;
316 }
317
318 kgdb_io_ops->write_char('#');
319 kgdb_io_ops->write_char(hexchars[checksum >> 4]);
320 kgdb_io_ops->write_char(hexchars[checksum & 0xf]);
321 if (kgdb_io_ops->flush)
322 kgdb_io_ops->flush();
323
324 /* Now see what we get in reply. */
325 ch = kgdb_io_ops->read_char();
326
327 if (ch == 3)
328 ch = kgdb_io_ops->read_char();
329
330 /* If we get an ACK, we are done. */
331 if (ch == '+')
332 return;
333
334 /*
335 * If we get the start of another packet, this means
336 * that GDB is attempting to reconnect. We will NAK
337 * the packet being sent, and stop trying to send this
338 * packet.
339 */
340 if (ch == '$') {
341 kgdb_io_ops->write_char('-');
342 if (kgdb_io_ops->flush)
343 kgdb_io_ops->flush();
344 return;
345 }
346 }
347}
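/*
 * Illustration only, not part of the stub: packets produced by put_packet()
 * and consumed by get_packet() are framed as $<payload>#<checksum>, where the
 * checksum is the low eight bits of the sum of the payload bytes, sent as two
 * hex digits. The one-byte payload "g" sums to 0x67, so it is sent as "$g#67".
 * A hypothetical encoder for that framing:
 */
static int example_frame_packet(char *out, size_t len, const char *payload)
{
	unsigned char checksum = 0;
	const char *p;

	for (p = payload; *p; p++)
		checksum += *p;

	return snprintf(out, len, "$%s#%c%c", payload,
			hexchars[checksum >> 4], hexchars[checksum & 0xf]);
}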
348
349static char *pack_hex_byte(char *pkt, u8 byte)
350{
351 *pkt++ = hexchars[byte >> 4];
352 *pkt++ = hexchars[byte & 0xf];
353
354 return pkt;
355}
356
357/*
358 * Convert the memory pointed to by mem into hex, placing result in buf.
359 * Return a pointer to the last char put in buf (null). May return an error.
360 */
361int kgdb_mem2hex(char *mem, char *buf, int count)
362{
363 char *tmp;
364 int err;
365
366 /*
367 * We use the upper half of buf as an intermediate buffer for the
368 * raw memory copy. Hex conversion will work against this one.
369 */
370 tmp = buf + count;
371
372 err = probe_kernel_read(tmp, mem, count);
373 if (!err) {
374 while (count > 0) {
375 buf = pack_hex_byte(buf, *tmp);
376 tmp++;
377 count--;
378 }
379
380 *buf = 0;
381 }
382
383 return err;
384}
385
386/*
387 * Copy the binary array pointed to by buf into mem, undoing the escaping
388 * of '$', '#' and 0x7d (sent as 0x7d followed by the byte XORed with 0x20).
389 * Returns 0 on success, or the error from probe_kernel_write().
390 */
391static int kgdb_ebin2mem(char *buf, char *mem, int count)
392{
393 int err = 0;
394 char c;
395
396 while (count-- > 0) {
397 c = *buf++;
398 if (c == 0x7d)
399 c = *buf++ ^ 0x20;
400
401 err = probe_kernel_write(mem, &c, 1);
402 if (err)
403 break;
404
405 mem++;
406 }
407
408 return err;
409}
410
411/*
412 * Convert the hex array pointed to by buf into binary to be placed in mem.
413 * Returns 0 on success after writing count bytes to mem, or the
414 * error from probe_kernel_write().
415 */
416int kgdb_hex2mem(char *buf, char *mem, int count)
417{
418 char *tmp_raw;
419 char *tmp_hex;
420
421 /*
422 * We use the upper half of buf as an intermediate buffer for the
423 * raw memory that is converted from hex.
424 */
425 tmp_raw = buf + count * 2;
426
427 tmp_hex = tmp_raw - 1;
428 while (tmp_hex >= buf) {
429 tmp_raw--;
430 *tmp_raw = hex(*tmp_hex--);
431 *tmp_raw |= hex(*tmp_hex--) << 4;
432 }
433
434 return probe_kernel_write(mem, tmp_raw, count);
435}
436
437/*
438 * While we find nice hex chars, build a long_val.
439 * Return number of chars processed.
440 */
441int kgdb_hex2long(char **ptr, long *long_val)
442{
443 int hex_val;
444 int num = 0;
445
446 *long_val = 0;
447
448 while (**ptr) {
449 hex_val = hex(**ptr);
450 if (hex_val < 0)
451 break;
452
453 *long_val = (*long_val << 4) | hex_val;
454 num++;
455 (*ptr)++;
456 }
457
458 return num;
459}
460
461/* Write memory due to an 'M' or 'X' packet. */
462static int write_mem_msg(int binary)
463{
464 char *ptr = &remcom_in_buffer[1];
465 unsigned long addr;
466 unsigned long length;
467 int err;
468
469 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
470 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
471 if (binary)
472 err = kgdb_ebin2mem(ptr, (char *)addr, length);
473 else
474 err = kgdb_hex2mem(ptr, (char *)addr, length);
475 if (err)
476 return err;
477 if (CACHE_FLUSH_IS_SAFE)
478 flush_icache_range(addr, addr + length + 1);
479 return 0;
480 }
481
482 return -EINVAL;
483}
484
485static void error_packet(char *pkt, int error)
486{
487 error = -error;
488 pkt[0] = 'E';
489 pkt[1] = hexchars[(error / 10)];
490 pkt[2] = hexchars[(error % 10)];
491 pkt[3] = '\0';
492}
493
494/*
495 * Thread ID accessors. We represent a flat TID space to GDB, where
496 * the per CPU idle threads (which under Linux all have PID 0) are
497 * remapped to negative TIDs.
498 */
499
500#define BUF_THREAD_ID_SIZE 16
501
502static char *pack_threadid(char *pkt, unsigned char *id)
503{
504 char *limit;
505
506 limit = pkt + BUF_THREAD_ID_SIZE;
507 while (pkt < limit)
508 pkt = pack_hex_byte(pkt, *id++);
509
510 return pkt;
511}
512
513static void int_to_threadref(unsigned char *id, int value)
514{
515 unsigned char *scan;
516 int i = 4;
517
518 scan = (unsigned char *)id;
519 while (i--)
520 *scan++ = 0;
521 *scan++ = (value >> 24) & 0xff;
522 *scan++ = (value >> 16) & 0xff;
523 *scan++ = (value >> 8) & 0xff;
524 *scan++ = (value & 0xff);
525}
526
527static struct task_struct *getthread(struct pt_regs *regs, int tid)
528{
529 /*
530 * Non-positive TIDs are remapped idle tasks:
531 */
532 if (tid <= 0)
533 return idle_task(-tid);
534
535 /*
536 * find_task_by_pid_ns() does not take the tasklist lock anymore
537 * but is nicely RCU locked - hence is a pretty resilient
538 * thing to use:
539 */
540 return find_task_by_pid_ns(tid, &init_pid_ns);
541}
542
543/*
544 * CPU debug state control:
545 */
546
547#ifdef CONFIG_SMP
548static void kgdb_wait(struct pt_regs *regs)
549{
550 unsigned long flags;
551 int cpu;
552
553 local_irq_save(flags);
554 cpu = raw_smp_processor_id();
555 kgdb_info[cpu].debuggerinfo = regs;
556 kgdb_info[cpu].task = current;
557 /*
558 * Make sure the above info reaches the primary CPU before
559 * our cpu_in_kgdb[] flag setting does:
560 */
561 smp_wmb();
562 atomic_set(&cpu_in_kgdb[cpu], 1);
563
564 /* Wait till primary CPU is done with debugging */
565 while (atomic_read(&passive_cpu_wait[cpu]))
566 cpu_relax();
567
568 kgdb_info[cpu].debuggerinfo = NULL;
569 kgdb_info[cpu].task = NULL;
570
571 /* fix up hardware debug registers on local cpu */
572 if (arch_kgdb_ops.correct_hw_break)
573 arch_kgdb_ops.correct_hw_break();
574
575 /* Signal the primary CPU that we are done: */
576 atomic_set(&cpu_in_kgdb[cpu], 0);
577 clocksource_touch_watchdog();
578 local_irq_restore(flags);
579}
580#endif
581
582/*
583 * Some architectures need cache flushes when we set/clear a
584 * breakpoint:
585 */
586static void kgdb_flush_swbreak_addr(unsigned long addr)
587{
588 if (!CACHE_FLUSH_IS_SAFE)
589 return;
590
591 if (current->mm && current->mm->mmap_cache) {
592 flush_cache_range(current->mm->mmap_cache,
593 addr, addr + BREAK_INSTR_SIZE);
594 }
595 /* Force flush instruction cache if it was outside the mm */
596 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
597}
598
599/*
600 * SW breakpoint management:
601 */
602static int kgdb_activate_sw_breakpoints(void)
603{
604 unsigned long addr;
605 int error = 0;
606 int i;
607
608 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
609 if (kgdb_break[i].state != BP_SET)
610 continue;
611
612 addr = kgdb_break[i].bpt_addr;
613 error = kgdb_arch_set_breakpoint(addr,
614 kgdb_break[i].saved_instr);
615 if (error)
616 return error;
617
618 kgdb_flush_swbreak_addr(addr);
619 kgdb_break[i].state = BP_ACTIVE;
620 }
621 return 0;
622}
623
624static int kgdb_set_sw_break(unsigned long addr)
625{
626 int err = kgdb_validate_break_address(addr);
627 int breakno = -1;
628 int i;
629
630 if (err)
631 return err;
632
633 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
634 if ((kgdb_break[i].state == BP_SET) &&
635 (kgdb_break[i].bpt_addr == addr))
636 return -EEXIST;
637 }
638 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
639 if (kgdb_break[i].state == BP_REMOVED &&
640 kgdb_break[i].bpt_addr == addr) {
641 breakno = i;
642 break;
643 }
644 }
645
646 if (breakno == -1) {
647 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
648 if (kgdb_break[i].state == BP_UNDEFINED) {
649 breakno = i;
650 break;
651 }
652 }
653 }
654
655 if (breakno == -1)
656 return -E2BIG;
657
658 kgdb_break[breakno].state = BP_SET;
659 kgdb_break[breakno].type = BP_BREAKPOINT;
660 kgdb_break[breakno].bpt_addr = addr;
661
662 return 0;
663}
664
665static int kgdb_deactivate_sw_breakpoints(void)
666{
667 unsigned long addr;
668 int error = 0;
669 int i;
670
671 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
672 if (kgdb_break[i].state != BP_ACTIVE)
673 continue;
674 addr = kgdb_break[i].bpt_addr;
675 error = kgdb_arch_remove_breakpoint(addr,
676 kgdb_break[i].saved_instr);
677 if (error)
678 return error;
679
680 kgdb_flush_swbreak_addr(addr);
681 kgdb_break[i].state = BP_SET;
682 }
683 return 0;
684}
685
686static int kgdb_remove_sw_break(unsigned long addr)
687{
688 int i;
689
690 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
691 if ((kgdb_break[i].state == BP_SET) &&
692 (kgdb_break[i].bpt_addr == addr)) {
693 kgdb_break[i].state = BP_REMOVED;
694 return 0;
695 }
696 }
697 return -ENOENT;
698}
699
700int kgdb_isremovedbreak(unsigned long addr)
701{
702 int i;
703
704 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
705 if ((kgdb_break[i].state == BP_REMOVED) &&
706 (kgdb_break[i].bpt_addr == addr))
707 return 1;
708 }
709 return 0;
710}
711
712int remove_all_break(void)
713{
714 unsigned long addr;
715 int error;
716 int i;
717
718 /* Clear memory breakpoints. */
719 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
720 if (kgdb_break[i].state != BP_ACTIVE)
721 goto setundefined;
722 addr = kgdb_break[i].bpt_addr;
723 error = kgdb_arch_remove_breakpoint(addr,
724 kgdb_break[i].saved_instr);
725 if (error)
726 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
727 addr);
728setundefined:
729 kgdb_break[i].state = BP_UNDEFINED;
730 }
731
732 /* Clear hardware breakpoints. */
733 if (arch_kgdb_ops.remove_all_hw_break)
734 arch_kgdb_ops.remove_all_hw_break();
735
736 return 0;
737}
738
739/*
740 * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs:
741 */
742static inline int shadow_pid(int realpid)
743{
744 if (realpid)
745 return realpid;
746
747 return -1-raw_smp_processor_id();
748}
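So, with purely illustrative numbers, ordinary tasks keep their real PID while each CPU's idle thread (PID 0) gets its own negative TID:

	/*
	 * shadow_pid(1234)          == 1234   (normal task)
	 * shadow_pid(0) on CPU 0    == -1     (idle thread)
	 * shadow_pid(0) on CPU 2    == -3
	 */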
749
750static char gdbmsgbuf[BUFMAX + 1];
751
752static void kgdb_msg_write(const char *s, int len)
753{
754 char *bufptr;
755 int wcount;
756 int i;
757
758 /* 'O'utput */
759 gdbmsgbuf[0] = 'O';
760
761 /* Fill and send buffers... */
762 while (len > 0) {
763 bufptr = gdbmsgbuf + 1;
764
765 /* Calculate how many this time */
766 if ((len << 1) > (BUFMAX - 2))
767 wcount = (BUFMAX - 2) >> 1;
768 else
769 wcount = len;
770
771 /* Pack in hex chars */
772 for (i = 0; i < wcount; i++)
773 bufptr = pack_hex_byte(bufptr, s[i]);
774 *bufptr = '\0';
775
776 /* Move up */
777 s += wcount;
778 len -= wcount;
779
780 /* Write packet */
781 put_packet(gdbmsgbuf);
782 }
783}
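As an example (string invented), console output is wrapped in 'O' packets with the text hex-encoded, two hex digits per character:

	/*
	 * kgdb_msg_write("Hi\n", 3)  ->  put_packet("O48690a")
	 * ('H' == 0x48, 'i' == 0x69, '\n' == 0x0a)
	 */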
784
785/*
786 * Return true if there is a valid kgdb I/O module. If no debugger
787 * is attached, a message can also be printed to the console telling
788 * the user that KGDB is waiting for the debugger to attach.
789 *
790 * Pass print_wait as true only when called from inside the core
791 * kgdb_handle_exception(), because that path will wait for the
792 * debugger to attach.
793 */
794static int kgdb_io_ready(int print_wait)
795{
796 if (!kgdb_io_ops)
797 return 0;
798 if (kgdb_connected)
799 return 1;
800 if (atomic_read(&kgdb_setting_breakpoint))
801 return 1;
802 if (print_wait)
803 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
804 return 1;
805}
806
807/*
808 * All the functions that start with gdb_cmd are the various
809 * operations to implement the handlers for the gdbserial protocol
810 * where KGDB is communicating with an external debugger
811 */
812
813/* Handle the '?' status packets */
814static void gdb_cmd_status(struct kgdb_state *ks)
815{
816 /*
817 * We know that this packet is only sent
818 * during initial connect. So to be safe,
819 * we clear out our breakpoints now in case
820 * GDB is reconnecting.
821 */
822 remove_all_break();
823
824 remcom_out_buffer[0] = 'S';
825 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
826}
827
828/* Handle the 'g' get registers request */
829static void gdb_cmd_getregs(struct kgdb_state *ks)
830{
831 struct task_struct *thread;
832 void *local_debuggerinfo;
833 int i;
834
835 thread = kgdb_usethread;
836 if (!thread) {
837 thread = kgdb_info[ks->cpu].task;
838 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
839 } else {
840 local_debuggerinfo = NULL;
841 for (i = 0; i < NR_CPUS; i++) {
842 /*
843		 * Try to find the task on some other,
844		 * or possibly this, CPU. If we do not
845		 * find the matching task then we
846		 * approximate the results.
847 */
848 if (thread == kgdb_info[i].task)
849 local_debuggerinfo = kgdb_info[i].debuggerinfo;
850 }
851 }
852
853 /*
854 * All threads that don't have debuggerinfo should be
855 * in __schedule() sleeping, since all other CPUs
856 * are in kgdb_wait, and thus have debuggerinfo.
857 */
858 if (local_debuggerinfo) {
859 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
860 } else {
861 /*
862 * Pull stuff saved during switch_to; nothing
863 * else is accessible (or even particularly
864 * relevant).
865 *
866 * This should be enough for a stack trace.
867 */
868 sleeping_thread_to_gdb_regs(gdb_regs, thread);
869 }
870 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
871}
872
873/* Handle the 'G' set registers request */
874static void gdb_cmd_setregs(struct kgdb_state *ks)
875{
876 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
877
878 if (kgdb_usethread && kgdb_usethread != current) {
879 error_packet(remcom_out_buffer, -EINVAL);
880 } else {
881 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
882 strcpy(remcom_out_buffer, "OK");
883 }
884}
885
886/* Handle the 'm' memory read bytes */
887static void gdb_cmd_memread(struct kgdb_state *ks)
888{
889 char *ptr = &remcom_in_buffer[1];
890 unsigned long length;
891 unsigned long addr;
892 int err;
893
894 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
895 kgdb_hex2long(&ptr, &length) > 0) {
896 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
897 if (err)
898 error_packet(remcom_out_buffer, err);
899 } else {
900 error_packet(remcom_out_buffer, -EINVAL);
901 }
902}
903
904/* Handle the 'M' memory write bytes */
905static void gdb_cmd_memwrite(struct kgdb_state *ks)
906{
907 int err = write_mem_msg(0);
908
909 if (err)
910 error_packet(remcom_out_buffer, err);
911 else
912 strcpy(remcom_out_buffer, "OK");
913}
914
915/* Handle the 'X' memory binary write bytes */
916static void gdb_cmd_binwrite(struct kgdb_state *ks)
917{
918 int err = write_mem_msg(1);
919
920 if (err)
921 error_packet(remcom_out_buffer, err);
922 else
923 strcpy(remcom_out_buffer, "OK");
924}
925
926/* Handle the 'D' or 'k', detach or kill packets */
927static void gdb_cmd_detachkill(struct kgdb_state *ks)
928{
929 int error;
930
931 /* The detach case */
932 if (remcom_in_buffer[0] == 'D') {
933 error = remove_all_break();
934 if (error < 0) {
935 error_packet(remcom_out_buffer, error);
936 } else {
937 strcpy(remcom_out_buffer, "OK");
938 kgdb_connected = 0;
939 }
940 put_packet(remcom_out_buffer);
941 } else {
942 /*
943 * Assume the kill case, with no exit code checking,
944 * trying to force detach the debugger:
945 */
946 remove_all_break();
947 kgdb_connected = 0;
948 }
949}
950
951/* Handle the 'R' reboot packets */
952static int gdb_cmd_reboot(struct kgdb_state *ks)
953{
954 /* For now, only honor R0 */
955 if (strcmp(remcom_in_buffer, "R0") == 0) {
956 printk(KERN_CRIT "Executing emergency reboot\n");
957 strcpy(remcom_out_buffer, "OK");
958 put_packet(remcom_out_buffer);
959
960 /*
961 * Execution should not return from
962 * machine_emergency_restart()
963 */
964 machine_emergency_restart();
965 kgdb_connected = 0;
966
967 return 1;
968 }
969 return 0;
970}
971
972/* Handle the 'q' query packets */
973static void gdb_cmd_query(struct kgdb_state *ks)
974{
975 struct task_struct *thread;
976 unsigned char thref[8];
977 char *ptr;
978 int i;
979
980 switch (remcom_in_buffer[1]) {
981 case 's':
982 case 'f':
983 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
984 error_packet(remcom_out_buffer, -EINVAL);
985 break;
986 }
987
988 if (remcom_in_buffer[1] == 'f')
989 ks->threadid = 1;
990
991 remcom_out_buffer[0] = 'm';
992 ptr = remcom_out_buffer + 1;
993
994 for (i = 0; i < 17; ks->threadid++) {
995 thread = getthread(ks->linux_regs, ks->threadid);
996 if (thread) {
997 int_to_threadref(thref, ks->threadid);
998 pack_threadid(ptr, thref);
999 ptr += BUF_THREAD_ID_SIZE;
1000 *(ptr++) = ',';
1001 i++;
1002 }
1003 }
1004 *(--ptr) = '\0';
1005 break;
1006
1007 case 'C':
1008 /* Current thread id */
1009 strcpy(remcom_out_buffer, "QC");
1010 ks->threadid = shadow_pid(current->pid);
1011 int_to_threadref(thref, ks->threadid);
1012 pack_threadid(remcom_out_buffer + 2, thref);
1013 break;
1014 case 'T':
1015 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
1016 error_packet(remcom_out_buffer, -EINVAL);
1017 break;
1018 }
1019 ks->threadid = 0;
1020 ptr = remcom_in_buffer + 17;
1021 kgdb_hex2long(&ptr, &ks->threadid);
1022 if (!getthread(ks->linux_regs, ks->threadid)) {
1023 error_packet(remcom_out_buffer, -EINVAL);
1024 break;
1025 }
1026 if (ks->threadid > 0) {
1027 kgdb_mem2hex(getthread(ks->linux_regs,
1028 ks->threadid)->comm,
1029 remcom_out_buffer, 16);
1030 } else {
1031 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1032
1033 sprintf(tmpstr, "Shadow task %d for pid 0",
1034 (int)(-ks->threadid-1));
1035 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1036 }
1037 break;
1038 }
1039}
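A rough sketch of the query exchanges handled above (thread IDs invented):

	/*
	 * qfThreadInfo / qsThreadInfo  ->  "m<tid>,<tid>,..."   thread list
	 * qC                           ->  "QC<tid>"            current thread
	 * qThreadExtraInfo,<tid>       ->  hex-encoded task comm, or the
	 *                                  "Shadow task ..." text for idle TIDs
	 */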
1040
1041/* Handle the 'H' task query packets */
1042static void gdb_cmd_task(struct kgdb_state *ks)
1043{
1044 struct task_struct *thread;
1045 char *ptr;
1046
1047 switch (remcom_in_buffer[1]) {
1048 case 'g':
1049 ptr = &remcom_in_buffer[2];
1050 kgdb_hex2long(&ptr, &ks->threadid);
1051 thread = getthread(ks->linux_regs, ks->threadid);
1052 if (!thread && ks->threadid > 0) {
1053 error_packet(remcom_out_buffer, -EINVAL);
1054 break;
1055 }
1056 kgdb_usethread = thread;
1057 ks->kgdb_usethreadid = ks->threadid;
1058 strcpy(remcom_out_buffer, "OK");
1059 break;
1060 case 'c':
1061 ptr = &remcom_in_buffer[2];
1062 kgdb_hex2long(&ptr, &ks->threadid);
1063 if (!ks->threadid) {
1064 kgdb_contthread = NULL;
1065 } else {
1066 thread = getthread(ks->linux_regs, ks->threadid);
1067 if (!thread && ks->threadid > 0) {
1068 error_packet(remcom_out_buffer, -EINVAL);
1069 break;
1070 }
1071 kgdb_contthread = thread;
1072 }
1073 strcpy(remcom_out_buffer, "OK");
1074 break;
1075 }
1076}
1077
1078/* Handle the 'T' thread query packets */
1079static void gdb_cmd_thread(struct kgdb_state *ks)
1080{
1081 char *ptr = &remcom_in_buffer[1];
1082 struct task_struct *thread;
1083
1084 kgdb_hex2long(&ptr, &ks->threadid);
1085 thread = getthread(ks->linux_regs, ks->threadid);
1086 if (thread)
1087 strcpy(remcom_out_buffer, "OK");
1088 else
1089 error_packet(remcom_out_buffer, -EINVAL);
1090}
1091
1092/* Handle the 'z' or 'Z' breakpoint remove or set packets */
1093static void gdb_cmd_break(struct kgdb_state *ks)
1094{
1095 /*
1096	 * Since GDB 5.3, the remote protocol defines type '0' as a software
1097	 * breakpoint and '1' as a hardware breakpoint, so follow that here.
1098 */
1099 char *bpt_type = &remcom_in_buffer[1];
1100 char *ptr = &remcom_in_buffer[2];
1101 unsigned long addr;
1102 unsigned long length;
1103 int error = 0;
1104
1105 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
1106 /* Unsupported */
1107 if (*bpt_type > '4')
1108 return;
1109 } else {
1110 if (*bpt_type != '0' && *bpt_type != '1')
1111 /* Unsupported. */
1112 return;
1113 }
1114
1115 /*
1116 * Test if this is a hardware breakpoint, and
1117 * if we support it:
1118 */
1119 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
1120 /* Unsupported. */
1121 return;
1122
1123 if (*(ptr++) != ',') {
1124 error_packet(remcom_out_buffer, -EINVAL);
1125 return;
1126 }
1127 if (!kgdb_hex2long(&ptr, &addr)) {
1128 error_packet(remcom_out_buffer, -EINVAL);
1129 return;
1130 }
1131 if (*(ptr++) != ',' ||
1132 !kgdb_hex2long(&ptr, &length)) {
1133 error_packet(remcom_out_buffer, -EINVAL);
1134 return;
1135 }
1136
1137 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
1138 error = kgdb_set_sw_break(addr);
1139 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
1140 error = kgdb_remove_sw_break(addr);
1141 else if (remcom_in_buffer[0] == 'Z')
1142 error = arch_kgdb_ops.set_hw_breakpoint(addr,
1143 (int)length, *bpt_type - '0');
1144 else if (remcom_in_buffer[0] == 'z')
1145 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
1146 (int) length, *bpt_type - '0');
1147
1148 if (error == 0)
1149 strcpy(remcom_out_buffer, "OK");
1150 else
1151 error_packet(remcom_out_buffer, error);
1152}
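For reference (address invented), the breakpoint packets decoded above look like:

	/*
	 * Z0,c01a2b3c,1   set a software breakpoint at 0xc01a2b3c
	 * z0,c01a2b3c,1   remove that software breakpoint
	 * Z1..Z4 / z1..z4 hardware break/watchpoints, forwarded to
	 *                 arch_kgdb_ops when the architecture supports them
	 */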
1153
1154/* Handle the 'C' signal / exception passing packets */
1155static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1156{
1157 /* C09 == pass exception
1158 * C15 == detach kgdb, pass exception
1159 */
1160 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
1161
1162 ks->pass_exception = 1;
1163 remcom_in_buffer[0] = 'c';
1164
1165 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
1166
1167 ks->pass_exception = 1;
1168 remcom_in_buffer[0] = 'D';
1169 remove_all_break();
1170 kgdb_connected = 0;
1171 return 1;
1172
1173 } else {
1174 error_packet(remcom_out_buffer, -EINVAL);
1175 return 0;
1176 }
1177
1178 /* Indicate fall through */
1179 return -1;
1180}
1181
1182/*
1183 * This function performs all gdbserial command processing.
1184 */
1185static int gdb_serial_stub(struct kgdb_state *ks)
1186{
1187 int error = 0;
1188 int tmp;
1189
1190 /* Clear the out buffer. */
1191 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1192
1193 if (kgdb_connected) {
1194 unsigned char thref[8];
1195 char *ptr;
1196
1197 /* Reply to host that an exception has occurred */
1198 ptr = remcom_out_buffer;
1199 *ptr++ = 'T';
1200 ptr = pack_hex_byte(ptr, ks->signo);
1201 ptr += strlen(strcpy(ptr, "thread:"));
1202 int_to_threadref(thref, shadow_pid(current->pid));
1203 ptr = pack_threadid(ptr, thref);
1204 *ptr++ = ';';
1205 put_packet(remcom_out_buffer);
1206 }
1207
1208 kgdb_usethread = kgdb_info[ks->cpu].task;
1209 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
1210 ks->pass_exception = 0;
1211
1212 while (1) {
1213 error = 0;
1214
1215 /* Clear the out buffer. */
1216 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1217
1218 get_packet(remcom_in_buffer);
1219
1220 switch (remcom_in_buffer[0]) {
1221 case '?': /* gdbserial status */
1222 gdb_cmd_status(ks);
1223 break;
1224 case 'g': /* return the value of the CPU registers */
1225 gdb_cmd_getregs(ks);
1226 break;
1227 case 'G': /* set the value of the CPU registers - return OK */
1228 gdb_cmd_setregs(ks);
1229 break;
1230 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
1231 gdb_cmd_memread(ks);
1232 break;
1233 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1234 gdb_cmd_memwrite(ks);
1235 break;
1236 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1237 gdb_cmd_binwrite(ks);
1238 break;
1239 /* kill or detach. KGDB should treat this like a
1240 * continue.
1241 */
1242 case 'D': /* Debugger detach */
1243 case 'k': /* Debugger detach via kill */
1244 gdb_cmd_detachkill(ks);
1245 goto default_handle;
1246 case 'R': /* Reboot */
1247 if (gdb_cmd_reboot(ks))
1248 goto default_handle;
1249 break;
1250 case 'q': /* query command */
1251 gdb_cmd_query(ks);
1252 break;
1253 case 'H': /* task related */
1254 gdb_cmd_task(ks);
1255 break;
1256 case 'T': /* Query thread status */
1257 gdb_cmd_thread(ks);
1258 break;
1259 case 'z': /* Break point remove */
1260 case 'Z': /* Break point set */
1261 gdb_cmd_break(ks);
1262 break;
1263 case 'C': /* Exception passing */
1264 tmp = gdb_cmd_exception_pass(ks);
1265 if (tmp > 0)
1266 goto default_handle;
1267 if (tmp == 0)
1268 break;
1269 /* Fall through on tmp < 0 */
1270 case 'c': /* Continue packet */
1271 case 's': /* Single step packet */
1272 if (kgdb_contthread && kgdb_contthread != current) {
1273 /* Can't switch threads in kgdb */
1274 error_packet(remcom_out_buffer, -EINVAL);
1275 break;
1276 }
1277 kgdb_activate_sw_breakpoints();
1278 /* Fall through to default processing */
1279 default:
1280default_handle:
1281 error = kgdb_arch_handle_exception(ks->ex_vector,
1282 ks->signo,
1283 ks->err_code,
1284 remcom_in_buffer,
1285 remcom_out_buffer,
1286 ks->linux_regs);
1287 /*
1288 * Leave cmd processing on error, detach,
1289 * kill, continue, or single step.
1290 */
1291 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
1292 remcom_in_buffer[0] == 'k') {
1293 error = 0;
1294 goto kgdb_exit;
1295 }
1296
1297 }
1298
1299 /* reply to the request */
1300 put_packet(remcom_out_buffer);
1301 }
1302
1303kgdb_exit:
1304 if (ks->pass_exception)
1305 error = 1;
1306 return error;
1307}
1308
1309static int kgdb_reenter_check(struct kgdb_state *ks)
1310{
1311 unsigned long addr;
1312
1313 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
1314 return 0;
1315
1316 /* Panic on recursive debugger calls: */
1317 exception_level++;
1318 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
1319 kgdb_deactivate_sw_breakpoints();
1320
1321 /*
1322	 * If the breakpoint was successfully removed at the place the
1323	 * exception occurred, try to recover and print a warning to the
1324	 * end user, because the user planted a breakpoint in a place
1325	 * that KGDB needs in order to function.
1326 */
1327 if (kgdb_remove_sw_break(addr) == 0) {
1328 exception_level = 0;
1329 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1330 kgdb_activate_sw_breakpoints();
1331 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
1332 addr);
1333 WARN_ON_ONCE(1);
1334
1335 return 1;
1336 }
1337 remove_all_break();
1338 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1339
1340 if (exception_level > 1) {
1341 dump_stack();
1342 panic("Recursive entry to debugger");
1343 }
1344
1345 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
1346 dump_stack();
1347 panic("Recursive entry to debugger");
1348
1349 return 1;
1350}
1351
1352/*
1353 * kgdb_handle_exception() - main entry point from a kernel exception
1354 *
1355 * Locking hierarchy:
1356 * interface locks, if any (begin_session)
1357 * kgdb lock (kgdb_active)
1358 */
1359int
1360kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1361{
1362 struct kgdb_state kgdb_var;
1363 struct kgdb_state *ks = &kgdb_var;
1364 unsigned long flags;
1365 int error = 0;
1366 int i, cpu;
1367
1368 ks->cpu = raw_smp_processor_id();
1369 ks->ex_vector = evector;
1370 ks->signo = signo;
1371 ks->ex_vector = evector;
1372 ks->err_code = ecode;
1373 ks->kgdb_usethreadid = 0;
1374 ks->linux_regs = regs;
1375
1376 if (kgdb_reenter_check(ks))
1377 return 0; /* Ouch, double exception ! */
1378
1379acquirelock:
1380 /*
1381 * Interrupts will be restored by the 'trap return' code, except when
1382 * single stepping.
1383 */
1384 local_irq_save(flags);
1385
1386 cpu = raw_smp_processor_id();
1387
1388 /*
1389 * Acquire the kgdb_active lock:
1390 */
1391 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
1392 cpu_relax();
1393
1394 /*
1395 * Do not start the debugger connection on this CPU if the last
1396 * instance of the exception handler wanted to come into the
1397 * debugger on a different CPU via a single step
1398 */
1399 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1400 atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
1401
1402 atomic_set(&kgdb_active, -1);
1403 clocksource_touch_watchdog();
1404 local_irq_restore(flags);
1405
1406 goto acquirelock;
1407 }
1408
1409 if (!kgdb_io_ready(1)) {
1410 error = 1;
1411 goto kgdb_restore; /* No I/O connection, so resume the system */
1412 }
1413
1414 /*
1415 * Don't enter if we have hit a removed breakpoint.
1416 */
1417 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
1418 goto kgdb_restore;
1419
1420 /* Call the I/O driver's pre_exception routine */
1421 if (kgdb_io_ops->pre_exception)
1422 kgdb_io_ops->pre_exception();
1423
1424 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1425 kgdb_info[ks->cpu].task = current;
1426
1427 kgdb_disable_hw_debug(ks->linux_regs);
1428
1429 /*
1430	 * Get the passive CPU lock which will hold all the non-primary
1431	 * CPUs in a spin state while the debugger is active.
1432 */
1433 if (!kgdb_single_step || !kgdb_contthread) {
1434 for (i = 0; i < NR_CPUS; i++)
1435 atomic_set(&passive_cpu_wait[i], 1);
1436 }
1437
1438 /*
1439 * spin_lock code is good enough as a barrier so we don't
1440 * need one here:
1441 */
1442 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1443
1444#ifdef CONFIG_SMP
1445 /* Signal the other CPUs to enter kgdb_wait() */
1446 if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
1447 kgdb_roundup_cpus(flags);
1448#endif
1449
1450 /*
1451 * Wait for the other CPUs to be notified and be waiting for us:
1452 */
1453 for_each_online_cpu(i) {
1454 while (!atomic_read(&cpu_in_kgdb[i]))
1455 cpu_relax();
1456 }
1457
1458 /*
1459 * At this point the primary processor is completely
1460 * in the debugger and all secondary CPUs are quiescent
1461 */
1462 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1463 kgdb_deactivate_sw_breakpoints();
1464 kgdb_single_step = 0;
1465 kgdb_contthread = NULL;
1466 exception_level = 0;
1467
1468 /* Talk to debugger with gdbserial protocol */
1469 error = gdb_serial_stub(ks);
1470
1471 /* Call the I/O driver's post_exception routine */
1472 if (kgdb_io_ops->post_exception)
1473 kgdb_io_ops->post_exception();
1474
1475 kgdb_info[ks->cpu].debuggerinfo = NULL;
1476 kgdb_info[ks->cpu].task = NULL;
1477 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1478
1479 if (!kgdb_single_step || !kgdb_contthread) {
1480 for (i = NR_CPUS-1; i >= 0; i--)
1481 atomic_set(&passive_cpu_wait[i], 0);
1482 /*
1483 * Wait till all the CPUs have quit
1484 * from the debugger.
1485 */
1486 for_each_online_cpu(i) {
1487 while (atomic_read(&cpu_in_kgdb[i]))
1488 cpu_relax();
1489 }
1490 }
1491
1492kgdb_restore:
1493 /* Free kgdb_active */
1494 atomic_set(&kgdb_active, -1);
1495 clocksource_touch_watchdog();
1496 local_irq_restore(flags);
1497
1498 return error;
1499}
1500
1501int kgdb_nmicallback(int cpu, void *regs)
1502{
1503#ifdef CONFIG_SMP
1504 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1505 atomic_read(&kgdb_active) != cpu &&
1506 atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) {
1507 kgdb_wait((struct pt_regs *)regs);
1508 return 0;
1509 }
1510#endif
1511 return 1;
1512}
1513
1514void kgdb_console_write(struct console *co, const char *s, unsigned count)
1515{
1516 unsigned long flags;
1517
1518	/* If we're debugging, or KGDB has not connected, don't try
1519	 * to print. */
1520 if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
1521 return;
1522
1523 local_irq_save(flags);
1524 kgdb_msg_write(s, count);
1525 local_irq_restore(flags);
1526}
1527
1528static struct console kgdbcons = {
1529 .name = "kgdb",
1530 .write = kgdb_console_write,
1531 .flags = CON_PRINTBUFFER | CON_ENABLED,
1532 .index = -1,
1533};
1534
1535#ifdef CONFIG_MAGIC_SYSRQ
1536static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1537{
1538 if (!kgdb_io_ops) {
1539 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
1540 return;
1541 }
1542 if (!kgdb_connected)
1543 printk(KERN_CRIT "Entering KGDB\n");
1544
1545 kgdb_breakpoint();
1546}
1547
1548static struct sysrq_key_op sysrq_gdb_op = {
1549 .handler = sysrq_handle_gdb,
1550 .help_msg = "Gdb",
1551 .action_msg = "GDB",
1552};
1553#endif
1554
1555static void kgdb_register_callbacks(void)
1556{
1557 if (!kgdb_io_module_registered) {
1558 kgdb_io_module_registered = 1;
1559 kgdb_arch_init();
1560#ifdef CONFIG_MAGIC_SYSRQ
1561 register_sysrq_key('g', &sysrq_gdb_op);
1562#endif
1563 if (kgdb_use_con && !kgdb_con_registered) {
1564 register_console(&kgdbcons);
1565 kgdb_con_registered = 1;
1566 }
1567 }
1568}
1569
1570static void kgdb_unregister_callbacks(void)
1571{
1572 /*
1573 * When this routine is called KGDB should unregister from the
1574 * panic handler and clean up, making sure it is not handling any
1575 * break exceptions at the time.
1576 */
1577 if (kgdb_io_module_registered) {
1578 kgdb_io_module_registered = 0;
1579 kgdb_arch_exit();
1580#ifdef CONFIG_MAGIC_SYSRQ
1581 unregister_sysrq_key('g', &sysrq_gdb_op);
1582#endif
1583 if (kgdb_con_registered) {
1584 unregister_console(&kgdbcons);
1585 kgdb_con_registered = 0;
1586 }
1587 }
1588}
1589
1590static void kgdb_initial_breakpoint(void)
1591{
1592 kgdb_break_asap = 0;
1593
1594 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
1595 kgdb_breakpoint();
1596}
1597
1598/**
1599 * kgdb_register_io_module - register KGDB IO module
1600 * @new_kgdb_io_ops: the io ops vector
1601 *
1602 * Register it with the KGDB core.
1603 */
1604int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
1605{
1606 int err;
1607
1608 spin_lock(&kgdb_registration_lock);
1609
1610 if (kgdb_io_ops) {
1611 spin_unlock(&kgdb_registration_lock);
1612
1613 printk(KERN_ERR "kgdb: Another I/O driver is already "
1614 "registered with KGDB.\n");
1615 return -EBUSY;
1616 }
1617
1618 if (new_kgdb_io_ops->init) {
1619 err = new_kgdb_io_ops->init();
1620 if (err) {
1621 spin_unlock(&kgdb_registration_lock);
1622 return err;
1623 }
1624 }
1625
1626 kgdb_io_ops = new_kgdb_io_ops;
1627
1628 spin_unlock(&kgdb_registration_lock);
1629
1630 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
1631 new_kgdb_io_ops->name);
1632
1633 /* Arm KGDB now. */
1634 kgdb_register_callbacks();
1635
1636 if (kgdb_break_asap)
1637 kgdb_initial_breakpoint();
1638
1639 return 0;
1640}
1641EXPORT_SYMBOL_GPL(kgdb_register_io_module);
1642
1643/**
1644 * kgdb_unregister_io_module - unregister KGDB IO module
1645 * @old_kgdb_io_ops: the io ops vector
1646 *
1647 * Unregister it with the KGDB core.
1648 */
1649void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
1650{
1651 BUG_ON(kgdb_connected);
1652
1653 /*
1654 * KGDB is no longer able to communicate out, so
1655 * unregister our callbacks and reset state.
1656 */
1657 kgdb_unregister_callbacks();
1658
1659 spin_lock(&kgdb_registration_lock);
1660
1661 WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
1662 kgdb_io_ops = NULL;
1663
1664 spin_unlock(&kgdb_registration_lock);
1665
1666 printk(KERN_INFO
1667 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1668 old_kgdb_io_ops->name);
1669}
1670EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1671
1672/**
1673 * kgdb_breakpoint - generate breakpoint exception
1674 *
1675 * This function will generate a breakpoint exception. It is used at the
1676 * beginning of a program to sync up with a debugger and can be used
1677 * otherwise as a quick means to stop program execution and "break" into
1678 * the debugger.
1679 */
1680void kgdb_breakpoint(void)
1681{
1682 atomic_set(&kgdb_setting_breakpoint, 1);
1683 wmb(); /* Sync point before breakpoint */
1684 arch_kgdb_breakpoint();
1685 wmb(); /* Sync point after breakpoint */
1686 atomic_set(&kgdb_setting_breakpoint, 0);
1687}
1688EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1689
1690static int __init opt_kgdb_wait(char *str)
1691{
1692 kgdb_break_asap = 1;
1693
1694 if (kgdb_io_module_registered)
1695 kgdb_initial_breakpoint();
1696
1697 return 0;
1698}
1699
1700early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 22be3ff3f363..8df97d3dfda8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 28#include <linux/completion.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include <linux/mount.h> 33#include <linux/mount.h>
@@ -165,7 +166,7 @@ static int ____call_usermodehelper(void *data)
165 } 166 }
166 167
167 /* We can run anywhere, unlike our parent keventd(). */ 168 /* We can run anywhere, unlike our parent keventd(). */
168 set_cpus_allowed(current, CPU_MASK_ALL); 169 set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
169 170
170 /* 171 /*
171 * Our parent is keventd, which runs with elevated scheduling priority. 172 * Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fcfb580c3afc..1e0250cb9486 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74 74
75/*
76 * Normally, functions that we'd want to prohibit kprobes in, are marked
77 * __kprobes. But, there are cases where such functions already belong to
78 * a different section (__sched for preempt_schedule)
79 *
80 * For such cases, we now have a blacklist
81 */
82struct kprobe_blackpoint kprobe_blacklist[] = {
83 {"preempt_schedule",},
84 {NULL} /* Terminator */
85};
86
75#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 87#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
76/* 88/*
77 * kprobe->ainsn.insn points to the copy of the instruction to be 89 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -417,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp)
417 } 429 }
418} 430}
419 431
432static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
433{
434 unsigned long flags;
435 struct kretprobe_instance *ri;
436 struct hlist_node *pos, *next;
437 /* No race here */
438 spin_lock_irqsave(&kretprobe_lock, flags);
439 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
440 ri->rp = NULL;
441 hlist_del(&ri->uflist);
442 }
443 spin_unlock_irqrestore(&kretprobe_lock, flags);
444 free_rp_inst(rp);
445}
446
420/* 447/*
421 * Keep all fields in the kprobe consistent 448 * Keep all fields in the kprobe consistent
422 */ 449 */
@@ -492,9 +519,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
492 519
493static int __kprobes in_kprobes_functions(unsigned long addr) 520static int __kprobes in_kprobes_functions(unsigned long addr)
494{ 521{
522 struct kprobe_blackpoint *kb;
523
495 if (addr >= (unsigned long)__kprobes_text_start && 524 if (addr >= (unsigned long)__kprobes_text_start &&
496 addr < (unsigned long)__kprobes_text_end) 525 addr < (unsigned long)__kprobes_text_end)
497 return -EINVAL; 526 return -EINVAL;
527 /*
528 * If there exists a kprobe_blacklist, verify and
529 * fail any probe registration in the prohibited area
530 */
531 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
532 if (kb->start_addr) {
533 if (addr >= kb->start_addr &&
534 addr < (kb->start_addr + kb->range))
535 return -EINVAL;
536 }
537 }
498 return 0; 538 return 0;
499} 539}
500 540
@@ -555,6 +595,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
555 } 595 }
556 596
557 p->nmissed = 0; 597 p->nmissed = 0;
598 INIT_LIST_HEAD(&p->list);
558 mutex_lock(&kprobe_mutex); 599 mutex_lock(&kprobe_mutex);
559 old_p = get_kprobe(p->addr); 600 old_p = get_kprobe(p->addr);
560 if (old_p) { 601 if (old_p) {
@@ -581,35 +622,28 @@ out:
581 return ret; 622 return ret;
582} 623}
583 624
584int __kprobes register_kprobe(struct kprobe *p) 625/*
585{ 626 * Unregister a kprobe without a scheduler synchronization.
586 return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); 627 */
587} 628static int __kprobes __unregister_kprobe_top(struct kprobe *p)
588
589void __kprobes unregister_kprobe(struct kprobe *p)
590{ 629{
591 struct module *mod;
592 struct kprobe *old_p, *list_p; 630 struct kprobe *old_p, *list_p;
593 int cleanup_p;
594 631
595 mutex_lock(&kprobe_mutex);
596 old_p = get_kprobe(p->addr); 632 old_p = get_kprobe(p->addr);
597 if (unlikely(!old_p)) { 633 if (unlikely(!old_p))
598 mutex_unlock(&kprobe_mutex); 634 return -EINVAL;
599 return; 635
600 }
601 if (p != old_p) { 636 if (p != old_p) {
602 list_for_each_entry_rcu(list_p, &old_p->list, list) 637 list_for_each_entry_rcu(list_p, &old_p->list, list)
603 if (list_p == p) 638 if (list_p == p)
604 /* kprobe p is a valid probe */ 639 /* kprobe p is a valid probe */
605 goto valid_p; 640 goto valid_p;
606 mutex_unlock(&kprobe_mutex); 641 return -EINVAL;
607 return;
608 } 642 }
609valid_p: 643valid_p:
610 if (old_p == p || 644 if (old_p == p ||
611 (old_p->pre_handler == aggr_pre_handler && 645 (old_p->pre_handler == aggr_pre_handler &&
612 p->list.next == &old_p->list && p->list.prev == &old_p->list)) { 646 list_is_singular(&old_p->list))) {
613 /* 647 /*
614 * Only probe on the hash list. Disarm only if kprobes are 648 * Only probe on the hash list. Disarm only if kprobes are
615 * enabled - otherwise, the breakpoint would already have 649 * enabled - otherwise, the breakpoint would already have
@@ -618,43 +652,97 @@ valid_p:
618 if (kprobe_enabled) 652 if (kprobe_enabled)
619 arch_disarm_kprobe(p); 653 arch_disarm_kprobe(p);
620 hlist_del_rcu(&old_p->hlist); 654 hlist_del_rcu(&old_p->hlist);
621 cleanup_p = 1;
622 } else { 655 } else {
656 if (p->break_handler)
657 old_p->break_handler = NULL;
658 if (p->post_handler) {
659 list_for_each_entry_rcu(list_p, &old_p->list, list) {
660 if ((list_p != p) && (list_p->post_handler))
661 goto noclean;
662 }
663 old_p->post_handler = NULL;
664 }
665noclean:
623 list_del_rcu(&p->list); 666 list_del_rcu(&p->list);
624 cleanup_p = 0;
625 } 667 }
668 return 0;
669}
626 670
627 mutex_unlock(&kprobe_mutex); 671static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
672{
673 struct module *mod;
674 struct kprobe *old_p;
628 675
629 synchronize_sched();
630 if (p->mod_refcounted) { 676 if (p->mod_refcounted) {
631 mod = module_text_address((unsigned long)p->addr); 677 mod = module_text_address((unsigned long)p->addr);
632 if (mod) 678 if (mod)
633 module_put(mod); 679 module_put(mod);
634 } 680 }
635 681
636 if (cleanup_p) { 682 if (list_empty(&p->list) || list_is_singular(&p->list)) {
637 if (p != old_p) { 683 if (!list_empty(&p->list)) {
638 list_del_rcu(&p->list); 684 /* "p" is the last child of an aggr_kprobe */
685 old_p = list_entry(p->list.next, struct kprobe, list);
686 list_del(&p->list);
639 kfree(old_p); 687 kfree(old_p);
640 } 688 }
641 arch_remove_kprobe(p); 689 arch_remove_kprobe(p);
642 } else { 690 }
643 mutex_lock(&kprobe_mutex); 691}
644 if (p->break_handler) 692
645 old_p->break_handler = NULL; 693static int __register_kprobes(struct kprobe **kps, int num,
646 if (p->post_handler){ 694 unsigned long called_from)
647 list_for_each_entry_rcu(list_p, &old_p->list, list){ 695{
648 if (list_p->post_handler){ 696 int i, ret = 0;
649 cleanup_p = 2; 697
650 break; 698 if (num <= 0)
651 } 699 return -EINVAL;
652 } 700 for (i = 0; i < num; i++) {
653 if (cleanup_p == 0) 701 ret = __register_kprobe(kps[i], called_from);
654 old_p->post_handler = NULL; 702 if (ret < 0 && i > 0) {
703 unregister_kprobes(kps, i);
704 break;
655 } 705 }
656 mutex_unlock(&kprobe_mutex);
657 } 706 }
707 return ret;
708}
709
710/*
711 * Registration and unregistration functions for kprobe.
712 */
713int __kprobes register_kprobe(struct kprobe *p)
714{
715 return __register_kprobes(&p, 1,
716 (unsigned long)__builtin_return_address(0));
717}
718
719void __kprobes unregister_kprobe(struct kprobe *p)
720{
721 unregister_kprobes(&p, 1);
722}
723
724int __kprobes register_kprobes(struct kprobe **kps, int num)
725{
726 return __register_kprobes(kps, num,
727 (unsigned long)__builtin_return_address(0));
728}
729
730void __kprobes unregister_kprobes(struct kprobe **kps, int num)
731{
732 int i;
733
734 if (num <= 0)
735 return;
736 mutex_lock(&kprobe_mutex);
737 for (i = 0; i < num; i++)
738 if (__unregister_kprobe_top(kps[i]) < 0)
739 kps[i]->addr = NULL;
740 mutex_unlock(&kprobe_mutex);
741
742 synchronize_sched();
743 for (i = 0; i < num; i++)
744 if (kps[i]->addr)
745 __unregister_kprobe_bottom(kps[i]);
658} 746}
659 747
660static struct notifier_block kprobe_exceptions_nb = { 748static struct notifier_block kprobe_exceptions_nb = {
@@ -667,24 +755,69 @@ unsigned long __weak arch_deref_entry_point(void *entry)
667 return (unsigned long)entry; 755 return (unsigned long)entry;
668} 756}
669 757
670int __kprobes register_jprobe(struct jprobe *jp) 758static int __register_jprobes(struct jprobe **jps, int num,
759 unsigned long called_from)
671{ 760{
672 unsigned long addr = arch_deref_entry_point(jp->entry); 761 struct jprobe *jp;
762 int ret = 0, i;
673 763
674 if (!kernel_text_address(addr)) 764 if (num <= 0)
675 return -EINVAL; 765 return -EINVAL;
766 for (i = 0; i < num; i++) {
767 unsigned long addr;
768 jp = jps[i];
769 addr = arch_deref_entry_point(jp->entry);
770
771 if (!kernel_text_address(addr))
772 ret = -EINVAL;
773 else {
774 /* Todo: Verify probepoint is a function entry point */
775 jp->kp.pre_handler = setjmp_pre_handler;
776 jp->kp.break_handler = longjmp_break_handler;
777 ret = __register_kprobe(&jp->kp, called_from);
778 }
779 if (ret < 0 && i > 0) {
780 unregister_jprobes(jps, i);
781 break;
782 }
783 }
784 return ret;
785}
676 786
677 /* Todo: Verify probepoint is a function entry point */ 787int __kprobes register_jprobe(struct jprobe *jp)
678 jp->kp.pre_handler = setjmp_pre_handler; 788{
679 jp->kp.break_handler = longjmp_break_handler; 789 return __register_jprobes(&jp, 1,
680
681 return __register_kprobe(&jp->kp,
682 (unsigned long)__builtin_return_address(0)); 790 (unsigned long)__builtin_return_address(0));
683} 791}
684 792
685void __kprobes unregister_jprobe(struct jprobe *jp) 793void __kprobes unregister_jprobe(struct jprobe *jp)
686{ 794{
687 unregister_kprobe(&jp->kp); 795 unregister_jprobes(&jp, 1);
796}
797
798int __kprobes register_jprobes(struct jprobe **jps, int num)
799{
800 return __register_jprobes(jps, num,
801 (unsigned long)__builtin_return_address(0));
802}
803
804void __kprobes unregister_jprobes(struct jprobe **jps, int num)
805{
806 int i;
807
808 if (num <= 0)
809 return;
810 mutex_lock(&kprobe_mutex);
811 for (i = 0; i < num; i++)
812 if (__unregister_kprobe_top(&jps[i]->kp) < 0)
813 jps[i]->kp.addr = NULL;
814 mutex_unlock(&kprobe_mutex);
815
816 synchronize_sched();
817 for (i = 0; i < num; i++) {
818 if (jps[i]->kp.addr)
819 __unregister_kprobe_bottom(&jps[i]->kp);
820 }
688} 821}
689 822
690#ifdef CONFIG_KRETPROBES 823#ifdef CONFIG_KRETPROBES
@@ -725,7 +858,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
725 return 0; 858 return 0;
726} 859}
727 860
728int __kprobes register_kretprobe(struct kretprobe *rp) 861static int __kprobes __register_kretprobe(struct kretprobe *rp,
862 unsigned long called_from)
729{ 863{
730 int ret = 0; 864 int ret = 0;
731 struct kretprobe_instance *inst; 865 struct kretprobe_instance *inst;
@@ -771,46 +905,101 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
771 905
772 rp->nmissed = 0; 906 rp->nmissed = 0;
773 /* Establish function entry probe point */ 907 /* Establish function entry probe point */
774 if ((ret = __register_kprobe(&rp->kp, 908 ret = __register_kprobe(&rp->kp, called_from);
775 (unsigned long)__builtin_return_address(0))) != 0) 909 if (ret != 0)
776 free_rp_inst(rp); 910 free_rp_inst(rp);
777 return ret; 911 return ret;
778} 912}
779 913
914static int __register_kretprobes(struct kretprobe **rps, int num,
915 unsigned long called_from)
916{
917 int ret = 0, i;
918
919 if (num <= 0)
920 return -EINVAL;
921 for (i = 0; i < num; i++) {
922 ret = __register_kretprobe(rps[i], called_from);
923 if (ret < 0 && i > 0) {
924 unregister_kretprobes(rps, i);
925 break;
926 }
927 }
928 return ret;
929}
930
931int __kprobes register_kretprobe(struct kretprobe *rp)
932{
933 return __register_kretprobes(&rp, 1,
934 (unsigned long)__builtin_return_address(0));
935}
936
937void __kprobes unregister_kretprobe(struct kretprobe *rp)
938{
939 unregister_kretprobes(&rp, 1);
940}
941
942int __kprobes register_kretprobes(struct kretprobe **rps, int num)
943{
944 return __register_kretprobes(rps, num,
945 (unsigned long)__builtin_return_address(0));
946}
947
948void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
949{
950 int i;
951
952 if (num <= 0)
953 return;
954 mutex_lock(&kprobe_mutex);
955 for (i = 0; i < num; i++)
956 if (__unregister_kprobe_top(&rps[i]->kp) < 0)
957 rps[i]->kp.addr = NULL;
958 mutex_unlock(&kprobe_mutex);
959
960 synchronize_sched();
961 for (i = 0; i < num; i++) {
962 if (rps[i]->kp.addr) {
963 __unregister_kprobe_bottom(&rps[i]->kp);
964 cleanup_rp_inst(rps[i]);
965 }
966 }
967}
968
780#else /* CONFIG_KRETPROBES */ 969#else /* CONFIG_KRETPROBES */
781int __kprobes register_kretprobe(struct kretprobe *rp) 970int __kprobes register_kretprobe(struct kretprobe *rp)
782{ 971{
783 return -ENOSYS; 972 return -ENOSYS;
784} 973}
785 974
786static int __kprobes pre_handler_kretprobe(struct kprobe *p, 975int __kprobes register_kretprobes(struct kretprobe **rps, int num)
787 struct pt_regs *regs)
788{ 976{
789 return 0; 977 return -ENOSYS;
790} 978}
791#endif /* CONFIG_KRETPROBES */
792
793void __kprobes unregister_kretprobe(struct kretprobe *rp) 979void __kprobes unregister_kretprobe(struct kretprobe *rp)
794{ 980{
795 unsigned long flags; 981}
796 struct kretprobe_instance *ri;
797 struct hlist_node *pos, *next;
798 982
799 unregister_kprobe(&rp->kp); 983void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
984{
985}
800 986
801 /* No race here */ 987static int __kprobes pre_handler_kretprobe(struct kprobe *p,
802 spin_lock_irqsave(&kretprobe_lock, flags); 988 struct pt_regs *regs)
803 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { 989{
804 ri->rp = NULL; 990 return 0;
805 hlist_del(&ri->uflist);
806 }
807 spin_unlock_irqrestore(&kretprobe_lock, flags);
808 free_rp_inst(rp);
809} 991}
810 992
993#endif /* CONFIG_KRETPROBES */
994
811static int __init init_kprobes(void) 995static int __init init_kprobes(void)
812{ 996{
813 int i, err = 0; 997 int i, err = 0;
998 unsigned long offset = 0, size = 0;
999 char *modname, namebuf[128];
1000 const char *symbol_name;
1001 void *addr;
1002 struct kprobe_blackpoint *kb;
814 1003
815 /* FIXME allocate the probe table, currently defined statically */ 1004 /* FIXME allocate the probe table, currently defined statically */
816 /* initialize all list heads */ 1005 /* initialize all list heads */
@@ -819,6 +1008,28 @@ static int __init init_kprobes(void)
819 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1008 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
820 } 1009 }
821 1010
1011 /*
1012 * Lookup and populate the kprobe_blacklist.
1013 *
1014 * Unlike the kretprobe blacklist, we'll need to determine
1015 * the range of addresses that belong to the said functions,
1016 * since a kprobe need not necessarily be at the beginning
1017 * of a function.
1018 */
1019 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
1020 kprobe_lookup_name(kb->name, addr);
1021 if (!addr)
1022 continue;
1023
1024 kb->start_addr = (unsigned long)addr;
1025 symbol_name = kallsyms_lookup(kb->start_addr,
1026 &size, &offset, &modname, namebuf);
1027 if (!symbol_name)
1028 kb->range = 0;
1029 else
1030 kb->range = size;
1031 }
1032
822 if (kretprobe_blacklist_size) { 1033 if (kretprobe_blacklist_size) {
823 /* lookup the function address from its name */ 1034 /* lookup the function address from its name */
824 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1035 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
@@ -1066,8 +1277,12 @@ module_init(init_kprobes);
1066 1277
1067EXPORT_SYMBOL_GPL(register_kprobe); 1278EXPORT_SYMBOL_GPL(register_kprobe);
1068EXPORT_SYMBOL_GPL(unregister_kprobe); 1279EXPORT_SYMBOL_GPL(unregister_kprobe);
1280EXPORT_SYMBOL_GPL(register_kprobes);
1281EXPORT_SYMBOL_GPL(unregister_kprobes);
1069EXPORT_SYMBOL_GPL(register_jprobe); 1282EXPORT_SYMBOL_GPL(register_jprobe);
1070EXPORT_SYMBOL_GPL(unregister_jprobe); 1283EXPORT_SYMBOL_GPL(unregister_jprobe);
1284EXPORT_SYMBOL_GPL(register_jprobes);
1285EXPORT_SYMBOL_GPL(unregister_jprobes);
1071#ifdef CONFIG_KPROBES 1286#ifdef CONFIG_KPROBES
1072EXPORT_SYMBOL_GPL(jprobe_return); 1287EXPORT_SYMBOL_GPL(jprobe_return);
1073#endif 1288#endif
@@ -1075,4 +1290,6 @@ EXPORT_SYMBOL_GPL(jprobe_return);
1075#ifdef CONFIG_KPROBES 1290#ifdef CONFIG_KPROBES
1076EXPORT_SYMBOL_GPL(register_kretprobe); 1291EXPORT_SYMBOL_GPL(register_kretprobe);
1077EXPORT_SYMBOL_GPL(unregister_kretprobe); 1292EXPORT_SYMBOL_GPL(unregister_kretprobe);
1293EXPORT_SYMBOL_GPL(register_kretprobes);
1294EXPORT_SYMBOL_GPL(unregister_kretprobes);
1078#endif 1295#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0ac887882f90..bd1b9ea024e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,7 +13,6 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h>
17 16
18#define KTHREAD_NICE_LEVEL (-5) 17#define KTHREAD_NICE_LEVEL (-5)
19 18
@@ -99,7 +98,7 @@ static void create_kthread(struct kthread_create_info *create)
99 struct sched_param param = { .sched_priority = 0 }; 98 struct sched_param param = { .sched_priority = 0 };
100 wait_for_completion(&create->started); 99 wait_for_completion(&create->started);
101 read_lock(&tasklist_lock); 100 read_lock(&tasklist_lock);
102 create->result = find_task_by_pid(pid); 101 create->result = find_task_by_pid_ns(pid, &init_pid_ns);
103 read_unlock(&tasklist_lock); 102 read_unlock(&tasklist_lock);
104 /* 103 /*
105 * root may have changed our (kthreadd's) priority or CPU mask. 104 * root may have changed our (kthreadd's) priority or CPU mask.
@@ -145,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
145 144
146 spin_lock(&kthread_create_lock); 145 spin_lock(&kthread_create_lock);
147 list_add_tail(&create.list, &kthread_create_list); 146 list_add_tail(&create.list, &kthread_create_list);
148 wake_up_process(kthreadd_task);
149 spin_unlock(&kthread_create_lock); 147 spin_unlock(&kthread_create_lock);
150 148
149 wake_up_process(kthreadd_task);
151 wait_for_completion(&create.done); 150 wait_for_completion(&create.done);
152 151
153 if (!IS_ERR(create.result)) { 152 if (!IS_ERR(create.result)) {
@@ -180,6 +179,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
180 wait_task_inactive(k); 179 wait_task_inactive(k);
181 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
182 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1;
183} 183}
184EXPORT_SYMBOL(kthread_bind); 184EXPORT_SYMBOL(kthread_bind);
185 185
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b4e3c85abe74..5e7b45c56923 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
64 return; 64 return;
65 65
66 for (i = 0; i < MAXLR; i++) { 66 for (i = 0; i < MAXLR; i++) {
67 int q; 67 int q, same = 1;
68 int same = 1; 68
69 /* Nothing stored: */ 69 /* Nothing stored: */
70 if (!latency_record[i].backtrace[0]) { 70 if (!latency_record[i].backtrace[0]) {
71 if (firstnonnull > i) 71 if (firstnonnull > i)
@@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
73 continue; 73 continue;
74 } 74 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
76 if (latency_record[i].backtrace[q] != 76 unsigned long record = lat->backtrace[q];
77 lat->backtrace[q]) 77
78 if (latency_record[i].backtrace[q] != record) {
78 same = 0; 79 same = 0;
79 if (same && lat->backtrace[q] == 0)
80 break; 80 break;
81 if (same && lat->backtrace[q] == ULONG_MAX) 81 }
82
83 /* 0 and ULONG_MAX entries mean end of backtrace: */
84 if (record == 0 || record == ULONG_MAX)
82 break; 85 break;
83 } 86 }
84 if (same) { 87 if (same) {
@@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
143 for (i = 0; i < LT_SAVECOUNT ; i++) { 146 for (i = 0; i < LT_SAVECOUNT ; i++) {
144 struct latency_record *mylat; 147 struct latency_record *mylat;
145 int same = 1; 148 int same = 1;
149
146 mylat = &tsk->latency_record[i]; 150 mylat = &tsk->latency_record[i];
147 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 151 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
148 if (mylat->backtrace[q] != 152 unsigned long record = lat.backtrace[q];
149 lat.backtrace[q]) 153
154 if (mylat->backtrace[q] != record) {
150 same = 0; 155 same = 0;
151 if (same && lat.backtrace[q] == 0)
152 break; 156 break;
153 if (same && lat.backtrace[q] == ULONG_MAX) 157 }
158
159 /* 0 and ULONG_MAX entries mean end of backtrace: */
160 if (record == 0 || record == ULONG_MAX)
154 break; 161 break;
155 } 162 }
156 if (same) { 163 if (same) {
@@ -226,14 +233,7 @@ static struct file_operations lstats_fops = {
226 233
227static int __init init_lstats_procfs(void) 234static int __init init_lstats_procfs(void)
228{ 235{
229 struct proc_dir_entry *pe; 236 proc_create("latency_stats", 0644, NULL, &lstats_fops);
230
231 pe = create_proc_entry("latency_stats", 0644, NULL);
232 if (!pe)
233 return -ENOMEM;
234
235 pe->proc_fops = &lstats_fops;
236
237 return 0; 237 return 0;
238} 238}
239__initcall(init_lstats_procfs); 239__initcall(init_lstats_procfs);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8a135bd163c2..dc5d29648d85 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = {
660 660
661static int __init lockdep_proc_init(void) 661static int __init lockdep_proc_init(void)
662{ 662{
663 struct proc_dir_entry *entry; 663 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
664 664 proc_create("lockdep_stats", S_IRUSR, NULL,
665 entry = create_proc_entry("lockdep", S_IRUSR, NULL); 665 &proc_lockdep_stats_operations);
666 if (entry)
667 entry->proc_fops = &proc_lockdep_operations;
668
669 entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
670 if (entry)
671 entry->proc_fops = &proc_lockdep_stats_operations;
672 666
673#ifdef CONFIG_LOCK_STAT 667#ifdef CONFIG_LOCK_STAT
674 entry = create_proc_entry("lock_stat", S_IRUSR, NULL); 668 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
675 if (entry)
676 entry->proc_fops = &proc_lock_stat_operations;
677#endif 669#endif
678 670
679 return 0; 671 return 0;
diff --git a/kernel/marker.c b/kernel/marker.c
index 041c33e3e95c..b5a9fe1d50d5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -23,12 +23,13 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/marker.h> 24#include <linux/marker.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h>
26 27
27extern struct marker __start___markers[]; 28extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 29extern struct marker __stop___markers[];
29 30
30/* Set to 1 to enable marker debug output */ 31/* Set to 1 to enable marker debug output */
31const int marker_debug; 32static const int marker_debug;
32 33
33/* 34/*
34 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
@@ -671,6 +672,9 @@ int marker_probe_register(const char *name, const char *format,
671 entry->rcu_pending = 1; 672 entry->rcu_pending = 1;
672 /* write rcu_pending before calling the RCU callback */ 673 /* write rcu_pending before calling the RCU callback */
673 smp_wmb(); 674 smp_wmb();
675#ifdef CONFIG_PREEMPT_RCU
676 synchronize_sched(); /* Until we have the call_rcu_sched() */
677#endif
674 call_rcu(&entry->rcu, free_old_closure); 678 call_rcu(&entry->rcu, free_old_closure);
675end: 679end:
676 mutex_unlock(&markers_mutex); 680 mutex_unlock(&markers_mutex);
@@ -714,6 +718,9 @@ int marker_probe_unregister(const char *name,
714 entry->rcu_pending = 1; 718 entry->rcu_pending = 1;
715 /* write rcu_pending before calling the RCU callback */ 719 /* write rcu_pending before calling the RCU callback */
716 smp_wmb(); 720 smp_wmb();
721#ifdef CONFIG_PREEMPT_RCU
722 synchronize_sched(); /* Until we have the call_rcu_sched() */
723#endif
717 call_rcu(&entry->rcu, free_old_closure); 724 call_rcu(&entry->rcu, free_old_closure);
718 remove_marker(name); /* Ignore busy error message */ 725 remove_marker(name); /* Ignore busy error message */
719 ret = 0; 726 ret = 0;
@@ -792,6 +799,9 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
792 entry->rcu_pending = 1; 799 entry->rcu_pending = 1;
793 /* write rcu_pending before calling the RCU callback */ 800 /* write rcu_pending before calling the RCU callback */
794 smp_wmb(); 801 smp_wmb();
802#ifdef CONFIG_PREEMPT_RCU
803 synchronize_sched(); /* Until we have the call_rcu_sched() */
804#endif
795 call_rcu(&entry->rcu, free_old_closure); 805 call_rcu(&entry->rcu, free_old_closure);
796 remove_marker(entry->name); /* Ignore busy error message */ 806 remove_marker(entry->name); /* Ignore busy error message */
797end: 807end:
diff --git a/kernel/module.c b/kernel/module.c
index 5d437bffd8dc..8674a390a2e8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
43#include <linux/mutex.h> 43#include <linux/mutex.h>
44#include <linux/unwind.h> 44#include <linux/unwind.h>
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46#include <asm/semaphore.h>
47#include <asm/cacheflush.h> 46#include <asm/cacheflush.h>
48#include <linux/license.h> 47#include <linux/license.h>
49#include <asm/sections.h> 48#include <asm/sections.h>
@@ -165,131 +164,140 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
165 return NULL; 164 return NULL;
166} 165}
167 166
168static void printk_unused_warning(const char *name) 167static bool always_ok(bool gplok, bool warn, const char *name)
169{ 168{
170 printk(KERN_WARNING "Symbol %s is marked as UNUSED, " 169 return true;
171 "however this module is using it.\n", name);
172 printk(KERN_WARNING "This symbol will go away in the future.\n");
173 printk(KERN_WARNING "Please evalute if this is the right api to use, "
174 "and if it really is, submit a report the linux kernel "
175 "mailinglist together with submitting your code for "
176 "inclusion.\n");
177} 170}
178 171
179/* Find a symbol, return value, crc and module which owns it */ 172static bool printk_unused_warning(bool gplok, bool warn, const char *name)
180static unsigned long __find_symbol(const char *name,
181 struct module **owner,
182 const unsigned long **crc,
183 int gplok)
184{ 173{
185 struct module *mod; 174 if (warn) {
186 const struct kernel_symbol *ks; 175 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
187 176 "however this module is using it.\n", name);
188 /* Core kernel first. */ 177 printk(KERN_WARNING
189 *owner = NULL; 178 "This symbol will go away in the future.\n");
190 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); 179 printk(KERN_WARNING
191 if (ks) { 180 "Please evalute if this is the right api to use and if "
192 *crc = symversion(__start___kcrctab, (ks - __start___ksymtab)); 181 "it really is, submit a report the linux kernel "
193 return ks->value; 182 "mailinglist together with submitting your code for "
183 "inclusion.\n");
194 } 184 }
195 if (gplok) { 185 return true;
196 ks = lookup_symbol(name, __start___ksymtab_gpl, 186}
197 __stop___ksymtab_gpl); 187
198 if (ks) { 188static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
199 *crc = symversion(__start___kcrctab_gpl, 189{
200 (ks - __start___ksymtab_gpl)); 190 if (!gplok)
201 return ks->value; 191 return false;
202 } 192 return printk_unused_warning(gplok, warn, name);
203 } 193}
204 ks = lookup_symbol(name, __start___ksymtab_gpl_future, 194
205 __stop___ksymtab_gpl_future); 195static bool gpl_only(bool gplok, bool warn, const char *name)
206 if (ks) { 196{
207 if (!gplok) { 197 return gplok;
208 printk(KERN_WARNING "Symbol %s is being used " 198}
209 "by a non-GPL module, which will not " 199
210 "be allowed in the future\n", name); 200static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
211 printk(KERN_WARNING "Please see the file " 201{
212 "Documentation/feature-removal-schedule.txt " 202 if (!gplok && warn) {
213 "in the kernel source tree for more " 203 printk(KERN_WARNING "Symbol %s is being used "
214 "details.\n"); 204 "by a non-GPL module, which will not "
215 } 205 "be allowed in the future\n", name);
216 *crc = symversion(__start___kcrctab_gpl_future, 206 printk(KERN_WARNING "Please see the file "
217 (ks - __start___ksymtab_gpl_future)); 207 "Documentation/feature-removal-schedule.txt "
218 return ks->value; 208 "in the kernel source tree for more details.\n");
219 } 209 }
210 return true;
211}
220 212
221 ks = lookup_symbol(name, __start___ksymtab_unused, 213struct symsearch {
222 __stop___ksymtab_unused); 214 const struct kernel_symbol *start, *stop;
223 if (ks) { 215 const unsigned long *crcs;
224 printk_unused_warning(name); 216 bool (*check)(bool gplok, bool warn, const char *name);
225 *crc = symversion(__start___kcrctab_unused, 217};
226 (ks - __start___ksymtab_unused)); 218
227 return ks->value; 219/* Look through this array of symbol tables for a symbol match which
220 * passes the check function. */
221static const struct kernel_symbol *search_symarrays(const struct symsearch *arr,
222 unsigned int num,
223 const char *name,
224 bool gplok,
225 bool warn,
226 const unsigned long **crc)
227{
228 unsigned int i;
229 const struct kernel_symbol *ks;
230
231 for (i = 0; i < num; i++) {
232 ks = lookup_symbol(name, arr[i].start, arr[i].stop);
233 if (!ks || !arr[i].check(gplok, warn, name))
234 continue;
235
236 if (crc)
237 *crc = symversion(arr[i].crcs, ks - arr[i].start);
238 return ks;
228 } 239 }
240 return NULL;
241}
242
243/* Find a symbol, return value, (optional) crc and (optional) module
244 * which owns it */
245static unsigned long find_symbol(const char *name,
246 struct module **owner,
247 const unsigned long **crc,
248 bool gplok,
249 bool warn)
250{
251 struct module *mod;
252 const struct kernel_symbol *ks;
253 const struct symsearch arr[] = {
254 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
255 always_ok },
256 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
257 __start___kcrctab_gpl, gpl_only },
258 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
259 __start___kcrctab_gpl_future, warn_if_not_gpl },
260 { __start___ksymtab_unused, __stop___ksymtab_unused,
261 __start___kcrctab_unused, printk_unused_warning },
262 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
263 __start___kcrctab_unused_gpl, gpl_only_unused_warning },
264 };
229 265
230 if (gplok) 266 /* Core kernel first. */
231 ks = lookup_symbol(name, __start___ksymtab_unused_gpl, 267 ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc);
232 __stop___ksymtab_unused_gpl);
233 if (ks) { 268 if (ks) {
234 printk_unused_warning(name); 269 if (owner)
235 *crc = symversion(__start___kcrctab_unused_gpl, 270 *owner = NULL;
236 (ks - __start___ksymtab_unused_gpl));
237 return ks->value; 271 return ks->value;
238 } 272 }
239 273
240 /* Now try modules. */ 274 /* Now try modules. */
241 list_for_each_entry(mod, &modules, list) { 275 list_for_each_entry(mod, &modules, list) {
242 *owner = mod; 276 struct symsearch arr[] = {
243 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); 277 { mod->syms, mod->syms + mod->num_syms, mod->crcs,
244 if (ks) { 278 always_ok },
245 *crc = symversion(mod->crcs, (ks - mod->syms)); 279 { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
246 return ks->value; 280 mod->gpl_crcs, gpl_only },
247 } 281 { mod->gpl_future_syms,
248 282 mod->gpl_future_syms + mod->num_gpl_future_syms,
249 if (gplok) { 283 mod->gpl_future_crcs, warn_if_not_gpl },
250 ks = lookup_symbol(name, mod->gpl_syms, 284 { mod->unused_syms,
251 mod->gpl_syms + mod->num_gpl_syms); 285 mod->unused_syms + mod->num_unused_syms,
252 if (ks) { 286 mod->unused_crcs, printk_unused_warning },
253 *crc = symversion(mod->gpl_crcs, 287 { mod->unused_gpl_syms,
254 (ks - mod->gpl_syms)); 288 mod->unused_gpl_syms + mod->num_unused_gpl_syms,
255 return ks->value; 289 mod->unused_gpl_crcs, gpl_only_unused_warning },
256 } 290 };
257 } 291
258 ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); 292 ks = search_symarrays(arr, ARRAY_SIZE(arr),
293 name, gplok, warn, crc);
259 if (ks) { 294 if (ks) {
260 printk_unused_warning(name); 295 if (owner)
261 *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); 296 *owner = mod;
262 return ks->value;
263 }
264
265 if (gplok) {
266 ks = lookup_symbol(name, mod->unused_gpl_syms,
267 mod->unused_gpl_syms + mod->num_unused_gpl_syms);
268 if (ks) {
269 printk_unused_warning(name);
270 *crc = symversion(mod->unused_gpl_crcs,
271 (ks - mod->unused_gpl_syms));
272 return ks->value;
273 }
274 }
275 ks = lookup_symbol(name, mod->gpl_future_syms,
276 (mod->gpl_future_syms +
277 mod->num_gpl_future_syms));
278 if (ks) {
279 if (!gplok) {
280 printk(KERN_WARNING "Symbol %s is being used "
281 "by a non-GPL module, which will not "
282 "be allowed in the future\n", name);
283 printk(KERN_WARNING "Please see the file "
284 "Documentation/feature-removal-schedule.txt "
285 "in the kernel source tree for more "
286 "details.\n");
287 }
288 *crc = symversion(mod->gpl_future_crcs,
289 (ks - mod->gpl_future_syms));
290 return ks->value; 297 return ks->value;
291 } 298 }
292 } 299 }
300
293 DEBUGP("Failed to find symbol %s\n", name); 301 DEBUGP("Failed to find symbol %s\n", name);
294 return -ENOENT; 302 return -ENOENT;
295} 303}
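
The rewrite above collapses five special-cased lookups into one loop over an array of { start, stop, crcs, check } descriptors, so the plain, GPL-only, future-GPL and unused export tables share the search code and differ only in their policy callback; the same array shape is then reused per module further down. The following standalone sketch shows that table-plus-callback pattern with invented symbol tables and names; it illustrates the shape, not the kernel's find_symbol().

/* Standalone sketch of the table-driven lookup pattern, with
 * invented example data. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct sym { const char *name; unsigned long value; };

struct symsearch {
        const struct sym *start, *stop;
        bool (*check)(bool gplok, const char *name);    /* per-table policy */
};

static bool always_ok(bool gplok, const char *name)
{
        return true;
}

static bool gpl_only(bool gplok, const char *name)
{
        if (!gplok)
                printf("%s: GPL-only symbol refused\n", name);
        return gplok;
}

static const struct sym *search(const struct symsearch *arr, unsigned int num,
                                const char *name, bool gplok)
{
        for (unsigned int i = 0; i < num; i++)
                for (const struct sym *s = arr[i].start; s < arr[i].stop; s++)
                        if (!strcmp(s->name, name) && arr[i].check(gplok, name))
                                return s;
        return NULL;
}

int main(void)
{
        static const struct sym plain[] = { { "plain_helper", 0x1000 } };
        static const struct sym gpl[]   = { { "gpl_helper",   0x2000 } };
        const struct symsearch arr[] = {
                { plain, plain + 1, always_ok },
                { gpl,   gpl + 1,   gpl_only  },
        };

        const struct sym *s = search(arr, 2, "gpl_helper", false);
        printf("lookup result: %s\n", s ? "found" : "not found");
        return 0;
}
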
@@ -664,7 +672,7 @@ static void free_module(struct module *mod);
664 672
665static void wait_for_zero_refcount(struct module *mod) 673static void wait_for_zero_refcount(struct module *mod)
666{ 674{
667 /* Since we might sleep for some time, drop the semaphore first */ 675 /* Since we might sleep for some time, release the mutex first */
668 mutex_unlock(&module_mutex); 676 mutex_unlock(&module_mutex);
669 for (;;) { 677 for (;;) {
670 DEBUGP("Looking at refcount...\n"); 678 DEBUGP("Looking at refcount...\n");
@@ -737,12 +745,13 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
737 if (!forced && module_refcount(mod) != 0) 745 if (!forced && module_refcount(mod) != 0)
738 wait_for_zero_refcount(mod); 746 wait_for_zero_refcount(mod);
739 747
748 mutex_unlock(&module_mutex);
740 /* Final destruction now noone is using it. */ 749 /* Final destruction now noone is using it. */
741 if (mod->exit != NULL) { 750 if (mod->exit != NULL)
742 mutex_unlock(&module_mutex);
743 mod->exit(); 751 mod->exit();
744 mutex_lock(&module_mutex); 752 blocking_notifier_call_chain(&module_notify_list,
745 } 753 MODULE_STATE_GOING, mod);
754 mutex_lock(&module_mutex);
746 /* Store the name of the last unloaded module for diagnostic purposes */ 755 /* Store the name of the last unloaded module for diagnostic purposes */
747 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 756 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
748 free_module(mod); 757 free_module(mod);
@@ -778,10 +787,9 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
778void __symbol_put(const char *symbol) 787void __symbol_put(const char *symbol)
779{ 788{
780 struct module *owner; 789 struct module *owner;
781 const unsigned long *crc;
782 790
783 preempt_disable(); 791 preempt_disable();
784 if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1))) 792 if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
785 BUG(); 793 BUG();
786 module_put(owner); 794 module_put(owner);
787 preempt_enable(); 795 preempt_enable();
@@ -925,13 +933,10 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
925 struct module *mod) 933 struct module *mod)
926{ 934{
927 const unsigned long *crc; 935 const unsigned long *crc;
928 struct module *owner;
929 936
930 if (IS_ERR_VALUE(__find_symbol("struct_module", 937 if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
931 &owner, &crc, 1)))
932 BUG(); 938 BUG();
933 return check_version(sechdrs, versindex, "struct_module", mod, 939 return check_version(sechdrs, versindex, "struct_module", mod, crc);
934 crc);
935} 940}
936 941
937/* First part is kernel version, which we ignore. */ 942/* First part is kernel version, which we ignore. */
@@ -975,8 +980,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
975 unsigned long ret; 980 unsigned long ret;
976 const unsigned long *crc; 981 const unsigned long *crc;
977 982
978 ret = __find_symbol(name, &owner, &crc, 983 ret = find_symbol(name, &owner, &crc,
979 !(mod->taints & TAINT_PROPRIETARY_MODULE)); 984 !(mod->taints & TAINT_PROPRIETARY_MODULE), true);
980 if (!IS_ERR_VALUE(ret)) { 985 if (!IS_ERR_VALUE(ret)) {
981 /* use_module can fail due to OOM, 986 /* use_module can fail due to OOM,
982 or module initialization or unloading */ 987 or module initialization or unloading */
@@ -992,6 +997,20 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
992 * J. Corbet <corbet@lwn.net> 997 * J. Corbet <corbet@lwn.net>
993 */ 998 */
994#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 999#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1000struct module_sect_attr
1001{
1002 struct module_attribute mattr;
1003 char *name;
1004 unsigned long address;
1005};
1006
1007struct module_sect_attrs
1008{
1009 struct attribute_group grp;
1010 unsigned int nsections;
1011 struct module_sect_attr attrs[0];
1012};
1013
995static ssize_t module_sect_show(struct module_attribute *mattr, 1014static ssize_t module_sect_show(struct module_attribute *mattr,
996 struct module *mod, char *buf) 1015 struct module *mod, char *buf)
997{ 1016{
@@ -1002,7 +1021,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1002 1021
1003static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1022static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1004{ 1023{
1005 int section; 1024 unsigned int section;
1006 1025
1007 for (section = 0; section < sect_attrs->nsections; section++) 1026 for (section = 0; section < sect_attrs->nsections; section++)
1008 kfree(sect_attrs->attrs[section].name); 1027 kfree(sect_attrs->attrs[section].name);
@@ -1363,10 +1382,9 @@ void *__symbol_get(const char *symbol)
1363{ 1382{
1364 struct module *owner; 1383 struct module *owner;
1365 unsigned long value; 1384 unsigned long value;
1366 const unsigned long *crc;
1367 1385
1368 preempt_disable(); 1386 preempt_disable();
1369 value = __find_symbol(symbol, &owner, &crc, 1); 1387 value = find_symbol(symbol, &owner, NULL, true, true);
1370 if (IS_ERR_VALUE(value)) 1388 if (IS_ERR_VALUE(value))
1371 value = 0; 1389 value = 0;
1372 else if (strong_try_module_get(owner)) 1390 else if (strong_try_module_get(owner))
@@ -1383,33 +1401,33 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1383 */ 1401 */
1384static int verify_export_symbols(struct module *mod) 1402static int verify_export_symbols(struct module *mod)
1385{ 1403{
1386 const char *name = NULL; 1404 unsigned int i;
1387 unsigned long i, ret = 0;
1388 struct module *owner; 1405 struct module *owner;
1389 const unsigned long *crc; 1406 const struct kernel_symbol *s;
1390 1407 struct {
1391 for (i = 0; i < mod->num_syms; i++) 1408 const struct kernel_symbol *sym;
1392 if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name, 1409 unsigned int num;
1393 &owner, &crc, 1))) { 1410 } arr[] = {
1394 name = mod->syms[i].name; 1411 { mod->syms, mod->num_syms },
1395 ret = -ENOEXEC; 1412 { mod->gpl_syms, mod->num_gpl_syms },
1396 goto dup; 1413 { mod->gpl_future_syms, mod->num_gpl_future_syms },
1397 } 1414 { mod->unused_syms, mod->num_unused_syms },
1415 { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
1416 };
1398 1417
1399 for (i = 0; i < mod->num_gpl_syms; i++) 1418 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1400 if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name, 1419 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1401 &owner, &crc, 1))) { 1420 if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
1402 name = mod->gpl_syms[i].name; 1421 NULL, true, false))) {
1403 ret = -ENOEXEC; 1422 printk(KERN_ERR
1404 goto dup; 1423 "%s: exports duplicate symbol %s"
1424 " (owned by %s)\n",
1425 mod->name, s->name, module_name(owner));
1426 return -ENOEXEC;
1427 }
1405 } 1428 }
1406 1429 }
1407dup: 1430 return 0;
1408 if (ret)
1409 printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n",
1410 mod->name, name, module_name(owner));
1411
1412 return ret;
1413} 1431}
1414 1432
1415/* Change all symbols so that st_value encodes the pointer directly. */ 1433/* Change all symbols so that st_value encodes the pointer directly. */
@@ -1815,8 +1833,9 @@ static struct module *load_module(void __user *umod,
1815 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); 1833 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1816#endif 1834#endif
1817 1835
1818 /* Don't keep modinfo section */ 1836 /* Don't keep modinfo and version sections. */
1819 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1837 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1838 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1820#ifdef CONFIG_KALLSYMS 1839#ifdef CONFIG_KALLSYMS
1821 /* Keep symbol and string tables for decoding later. */ 1840 /* Keep symbol and string tables for decoding later. */
1822 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1841 sechdrs[symindex].sh_flags |= SHF_ALLOC;
@@ -1978,7 +1997,8 @@ static struct module *load_module(void __user *umod,
1978 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; 1997 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
1979 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; 1998 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
1980 if (unusedgplcrcindex) 1999 if (unusedgplcrcindex)
1981 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; 2000 mod->unused_gpl_crcs
2001 = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1982 2002
1983#ifdef CONFIG_MODVERSIONS 2003#ifdef CONFIG_MODVERSIONS
1984 if ((mod->num_syms && !crcindex) || 2004 if ((mod->num_syms && !crcindex) ||
@@ -2172,6 +2192,8 @@ sys_init_module(void __user *umod,
2172 mod->state = MODULE_STATE_GOING; 2192 mod->state = MODULE_STATE_GOING;
2173 synchronize_sched(); 2193 synchronize_sched();
2174 module_put(mod); 2194 module_put(mod);
2195 blocking_notifier_call_chain(&module_notify_list,
2196 MODULE_STATE_GOING, mod);
2175 mutex_lock(&module_mutex); 2197 mutex_lock(&module_mutex);
2176 free_module(mod); 2198 free_module(mod);
2177 mutex_unlock(&module_mutex); 2199 mutex_unlock(&module_mutex);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 643360d1bb14..823be11584ef 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl,
31 return 0; 31 return 0;
32} 32}
33 33
34static int notifier_chain_cond_register(struct notifier_block **nl,
35 struct notifier_block *n)
36{
37 while ((*nl) != NULL) {
38 if ((*nl) == n)
39 return 0;
40 if (n->priority > (*nl)->priority)
41 break;
42 nl = &((*nl)->next);
43 }
44 n->next = *nl;
45 rcu_assign_pointer(*nl, n);
46 return 0;
47}
48
34static int notifier_chain_unregister(struct notifier_block **nl, 49static int notifier_chain_unregister(struct notifier_block **nl,
35 struct notifier_block *n) 50 struct notifier_block *n)
36{ 51{
@@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
205EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); 220EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
206 221
207/** 222/**
223 * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
224 * @nh: Pointer to head of the blocking notifier chain
225 * @n: New entry in notifier chain
226 *
227 * Adds a notifier to a blocking notifier chain, only if not already
228 * present in the chain.
229 * Must be called in process context.
230 *
231 * Currently always returns zero.
232 */
233int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
234 struct notifier_block *n)
235{
236 int ret;
237
238 down_write(&nh->rwsem);
239 ret = notifier_chain_cond_register(&nh->head, n);
240 up_write(&nh->rwsem);
241 return ret;
242}
243EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
244
245/**
208 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain 246 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
209 * @nh: Pointer to head of the blocking notifier chain 247 * @nh: Pointer to head of the blocking notifier chain
210 * @n: Entry to remove from notifier chain 248 * @n: Entry to remove from notifier chain
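
notifier_chain_cond_register() is the same priority-ordered insertion as the existing register helper, except it walks the chain first and returns quietly if the block is already linked, so repeated registration becomes a harmless no-op. A standalone sketch of that "insert in priority order unless already present" pattern on a plain singly linked list follows; the types are invented and there is no RCU here.

/* Standalone sketch: priority-sorted insert that tolerates duplicate
 * registration, mirroring notifier_chain_cond_register(). */
#include <stdio.h>

struct block { int priority; struct block *next; };

static void cond_register(struct block **nl, struct block *n)
{
        while (*nl != NULL) {
                if (*nl == n)                   /* already registered: no-op */
                        return;
                if (n->priority > (*nl)->priority)
                        break;                  /* higher priority goes first */
                nl = &(*nl)->next;
        }
        n->next = *nl;
        *nl = n;
}

int main(void)
{
        struct block a = { 10 }, b = { 20 }, c = { 15 };
        struct block *head = NULL;

        cond_register(&head, &a);
        cond_register(&head, &b);
        cond_register(&head, &c);
        cond_register(&head, &b);               /* duplicate: chain unchanged */

        for (struct block *p = head; p; p = p->next)
                printf("priority %d\n", p->priority);   /* prints 20, 15, 10 */
        return 0;
}
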
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index aead4d69f62b..48d7ed6fc3a4 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,8 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/slab.h>
11#include <linux/nsproxy.h>
10 12
11struct ns_cgroup { 13struct ns_cgroup {
12 struct cgroup_subsys_state css; 14 struct cgroup_subsys_state css;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5d332cf8c63..adc785146a1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
139 goto out; 139 goto out;
140 } 140 }
141 141
142 /*
143 * CLONE_NEWIPC must detach from the undolist: after switching
144 * to a new ipc namespace, the semaphore arrays from the old
145 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
146 * means share undolist with parent, so we must forbid using
147 * it along with CLONE_NEWIPC.
148 */
149 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
150 err = -EINVAL;
151 goto out;
152 }
153
142 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 154 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
143 if (IS_ERR(new_ns)) { 155 if (IS_ERR(new_ns)) {
144 err = PTR_ERR(new_ns); 156 err = PTR_ERR(new_ns);
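
The new check in copy_namespaces() rejects the flag combination up front: CLONE_SYSVSEM asks to share the parent's semaphore undo list, but a fresh IPC namespace makes those semaphore arrays unreachable, so the two requests cannot both be honoured. A tiny standalone sketch of the same mutual-exclusion test; the flag values below are made up for illustration, not the real clone flag bits.

/* Standalone sketch: rejecting mutually exclusive flags, with
 * invented flag values. */
#include <errno.h>
#include <stdio.h>

#define FLAG_NEWIPC  0x1u    /* stand-in for CLONE_NEWIPC  */
#define FLAG_SYSVSEM 0x2u    /* stand-in for CLONE_SYSVSEM */

static int check_flags(unsigned long flags)
{
        /* a new IPC namespace cannot share the parent's undo list */
        if ((flags & FLAG_NEWIPC) && (flags & FLAG_SYSVSEM))
                return -EINVAL;
        return 0;
}

int main(void)
{
        printf("NEWIPC alone     -> %d\n", check_flags(FLAG_NEWIPC));
        printf("NEWIPC + SYSVSEM -> %d\n", check_flags(FLAG_NEWIPC | FLAG_SYSVSEM));
        return 0;
}
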
diff --git a/kernel/panic.c b/kernel/panic.c
index 24af9f8bac99..425567f45b9f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic);
153 * 'M' - System experienced a machine check exception. 153 * 'M' - System experienced a machine check exception.
154 * 'B' - System has hit bad_page. 154 * 'B' - System has hit bad_page.
155 * 'U' - Userspace-defined naughtiness. 155 * 'U' - Userspace-defined naughtiness.
156 * 'A' - ACPI table overridden.
157 * 'W' - Taint on warning.
156 * 158 *
157 * The string is overwritten by the next call to print_taint(). 159 * The string is overwritten by the next call to print_taint().
158 */ 160 */
@@ -161,7 +163,7 @@ const char *print_tainted(void)
161{ 163{
162 static char buf[20]; 164 static char buf[20];
163 if (tainted) { 165 if (tainted) {
164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", 166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c",
165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -170,7 +172,8 @@ const char *print_tainted(void)
170 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 172 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
171 tainted & TAINT_USER ? 'U' : ' ', 173 tainted & TAINT_USER ? 'U' : ' ',
172 tainted & TAINT_DIE ? 'D' : ' ', 174 tainted & TAINT_DIE ? 'D' : ' ',
173 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); 175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ',
176 tainted & TAINT_WARN ? 'W' : ' ');
174 } 177 }
175 else 178 else
176 snprintf(buf, sizeof(buf), "Not tainted"); 179 snprintf(buf, sizeof(buf), "Not tainted");
@@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line)
312 print_modules(); 315 print_modules();
313 dump_stack(); 316 dump_stack();
314 print_oops_end_marker(); 317 print_oops_end_marker();
318 add_taint(TAINT_WARN);
315} 319}
316EXPORT_SYMBOL(warn_on_slowpath); 320EXPORT_SYMBOL(warn_on_slowpath);
317#endif 321#endif
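
With TAINT_WARN added, print_tainted() now prints ten flag characters, so the format string gains a tenth %c and warn_on_slowpath() sets the new bit after dumping its warning. The existing 20-byte buffer still fits: "Tainted: " is 9 characters, plus 10 flags and the terminating NUL. The standalone sketch below shows the same bitmask-to-letters formatting with a cut-down, invented set of taint bits.

/* Standalone sketch: formatting a taint bitmask as flag letters,
 * with an invented, shortened set of bits. */
#include <stdio.h>

#define TAINT_PROPRIETARY (1u << 0)
#define TAINT_FORCED      (1u << 1)
#define TAINT_WARN        (1u << 2)

static const char *format_tainted(unsigned int tainted, char *buf, size_t len)
{
        if (tainted)
                snprintf(buf, len, "Tainted: %c%c%c",
                         tainted & TAINT_PROPRIETARY ? 'P' : 'G',
                         tainted & TAINT_FORCED      ? 'F' : ' ',
                         tainted & TAINT_WARN        ? 'W' : ' ');
        else
                snprintf(buf, len, "Not tainted");
        return buf;
}

int main(void)
{
        char buf[20];

        printf("%s\n", format_tainted(TAINT_WARN, buf, sizeof(buf)));
        printf("%s\n", format_tainted(0, buf, sizeof(buf)));
        return 0;
}
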
diff --git a/kernel/pid.c b/kernel/pid.c
index 477691576b33..20d59fa2d493 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -111,10 +111,11 @@ EXPORT_SYMBOL(is_container_init);
111 111
112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
113 113
114static void free_pidmap(struct pid_namespace *pid_ns, int pid) 114static void free_pidmap(struct upid *upid)
115{ 115{
116 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; 116 int nr = upid->nr;
117 int offset = pid & BITS_PER_PAGE_MASK; 117 struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
118 int offset = nr & BITS_PER_PAGE_MASK;
118 119
119 clear_bit(offset, map->page); 120 clear_bit(offset, map->page);
120 atomic_inc(&map->nr_free); 121 atomic_inc(&map->nr_free);
@@ -232,7 +233,7 @@ void free_pid(struct pid *pid)
232 spin_unlock_irqrestore(&pidmap_lock, flags); 233 spin_unlock_irqrestore(&pidmap_lock, flags);
233 234
234 for (i = 0; i <= pid->level; i++) 235 for (i = 0; i <= pid->level; i++)
235 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); 236 free_pidmap(pid->numbers + i);
236 237
237 call_rcu(&pid->rcu, delayed_put_pid); 238 call_rcu(&pid->rcu, delayed_put_pid);
238} 239}
@@ -278,8 +279,8 @@ out:
278 return pid; 279 return pid;
279 280
280out_free: 281out_free:
281 for (i++; i <= ns->level; i++) 282 while (++i <= ns->level)
282 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); 283 free_pidmap(pid->numbers + i);
283 284
284 kmem_cache_free(ns->pid_cachep, pid); 285 kmem_cache_free(ns->pid_cachep, pid);
285 pid = NULL; 286 pid = NULL;
@@ -316,7 +317,7 @@ EXPORT_SYMBOL_GPL(find_pid);
316/* 317/*
317 * attach_pid() must be called with the tasklist_lock write-held. 318 * attach_pid() must be called with the tasklist_lock write-held.
318 */ 319 */
319int attach_pid(struct task_struct *task, enum pid_type type, 320void attach_pid(struct task_struct *task, enum pid_type type,
320 struct pid *pid) 321 struct pid *pid)
321{ 322{
322 struct pid_link *link; 323 struct pid_link *link;
@@ -324,11 +325,10 @@ int attach_pid(struct task_struct *task, enum pid_type type,
324 link = &task->pids[type]; 325 link = &task->pids[type];
325 link->pid = pid; 326 link->pid = pid;
326 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 327 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
327
328 return 0;
329} 328}
330 329
331void detach_pid(struct task_struct *task, enum pid_type type) 330static void __change_pid(struct task_struct *task, enum pid_type type,
331 struct pid *new)
332{ 332{
333 struct pid_link *link; 333 struct pid_link *link;
334 struct pid *pid; 334 struct pid *pid;
@@ -338,7 +338,7 @@ void detach_pid(struct task_struct *task, enum pid_type type)
338 pid = link->pid; 338 pid = link->pid;
339 339
340 hlist_del_rcu(&link->node); 340 hlist_del_rcu(&link->node);
341 link->pid = NULL; 341 link->pid = new;
342 342
343 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 343 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
344 if (!hlist_empty(&pid->tasks[tmp])) 344 if (!hlist_empty(&pid->tasks[tmp]))
@@ -347,13 +347,24 @@ void detach_pid(struct task_struct *task, enum pid_type type)
347 free_pid(pid); 347 free_pid(pid);
348} 348}
349 349
350void detach_pid(struct task_struct *task, enum pid_type type)
351{
352 __change_pid(task, type, NULL);
353}
354
355void change_pid(struct task_struct *task, enum pid_type type,
356 struct pid *pid)
357{
358 __change_pid(task, type, pid);
359 attach_pid(task, type, pid);
360}
361
350/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 362/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
351void transfer_pid(struct task_struct *old, struct task_struct *new, 363void transfer_pid(struct task_struct *old, struct task_struct *new,
352 enum pid_type type) 364 enum pid_type type)
353{ 365{
354 new->pids[type].pid = old->pids[type].pid; 366 new->pids[type].pid = old->pids[type].pid;
355 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); 367 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
356 old->pids[type].pid = NULL;
357} 368}
358 369
359struct task_struct *pid_task(struct pid *pid, enum pid_type type) 370struct task_struct *pid_task(struct pid *pid, enum pid_type type)
@@ -380,12 +391,6 @@ struct task_struct *find_task_by_pid_type_ns(int type, int nr,
380 391
381EXPORT_SYMBOL(find_task_by_pid_type_ns); 392EXPORT_SYMBOL(find_task_by_pid_type_ns);
382 393
383struct task_struct *find_task_by_pid(pid_t nr)
384{
385 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
386}
387EXPORT_SYMBOL(find_task_by_pid);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 394struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 395{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 396 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
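
free_pidmap() now takes the struct upid itself, which already carries both the pid number and its owning namespace, so the two call sites simply pass &pid->numbers[i] instead of unpacking the pair themselves. The underlying bookkeeping stays a per-namespace bitmap: page index nr / BITS_PER_PAGE, bit offset nr & BITS_PER_PAGE_MASK. The standalone sketch below shows that bitmap arithmetic with shrunken sizes and a 32-bit word standing in for a page; it is an illustration, not the kernel allocator.

/* Standalone sketch: bitmap-backed id alloc/free with the same
 * page/offset split as free_pidmap(), using tiny sizes. */
#include <stdio.h>

#define BITS_PER_PAGE      32
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
#define PIDMAP_ENTRIES     4

struct pidmap { unsigned int page; int nr_free; };   /* "page" is one word here */

static struct pidmap pidmap[PIDMAP_ENTRIES];

static void free_nr(int nr)
{
        struct pidmap *map = &pidmap[nr / BITS_PER_PAGE];
        int offset = nr & BITS_PER_PAGE_MASK;

        map->page &= ~(1u << offset);                /* clear_bit()          */
        map->nr_free++;                              /* atomic_inc(&nr_free) */
}

static int alloc_nr(void)
{
        for (int i = 0; i < PIDMAP_ENTRIES; i++) {
                if (!pidmap[i].nr_free)
                        continue;
                for (int off = 0; off < BITS_PER_PAGE; off++) {
                        if (!(pidmap[i].page & (1u << off))) {
                                pidmap[i].page |= 1u << off;
                                pidmap[i].nr_free--;
                                return i * BITS_PER_PAGE + off;
                        }
                }
        }
        return -1;
}

int main(void)
{
        for (int i = 0; i < PIDMAP_ENTRIES; i++)
                pidmap[i].nr_free = BITS_PER_PAGE;

        int a = alloc_nr(), b = alloc_nr();
        printf("allocated %d and %d\n", a, b);
        free_nr(a);
        printf("reallocated %d\n", alloc_nr());      /* reuses the freed slot */
        return 0;
}
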
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d792b66d854..98702b4b8851 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -66,7 +66,7 @@ err_alloc:
66 return NULL; 66 return NULL;
67} 67}
68 68
69static struct pid_namespace *create_pid_namespace(int level) 69static struct pid_namespace *create_pid_namespace(unsigned int level)
70{ 70{
71 struct pid_namespace *ns; 71 struct pid_namespace *ns;
72 int i; 72 int i;
@@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level)
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 93
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 94 for (i = 1; i < PIDMAP_ENTRIES; i++) {
95 ns->pidmap[i].page = 0; 95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 } 97 }
98 98
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 2eae91f954ca..f1525ad06cb3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -4,8 +4,9 @@
4 4
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/posix-timers.h> 6#include <linux/posix-timers.h>
7#include <asm/uaccess.h>
8#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/math64.h>
9#include <asm/uaccess.h>
9 10
10static int check_clock(const clockid_t which_clock) 11static int check_clock(const clockid_t which_clock)
11{ 12{
@@ -47,12 +48,10 @@ static void sample_to_timespec(const clockid_t which_clock,
47 union cpu_time_count cpu, 48 union cpu_time_count cpu,
48 struct timespec *tp) 49 struct timespec *tp)
49{ 50{
50 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 51 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
51 tp->tv_sec = div_long_long_rem(cpu.sched, 52 *tp = ns_to_timespec(cpu.sched);
52 NSEC_PER_SEC, &tp->tv_nsec); 53 else
53 } else {
54 cputime_to_timespec(cpu.cpu, tp); 54 cputime_to_timespec(cpu.cpu, tp);
55 }
56} 55}
57 56
58static inline int cpu_time_before(const clockid_t which_clock, 57static inline int cpu_time_before(const clockid_t which_clock,
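
For CPUCLOCK_SCHED the sample is a raw nanosecond count, so the open-coded div_long_long_rem() split is replaced by ns_to_timespec(), which performs the equivalent seconds/nanoseconds division; the include list is also reshuffled, adding <linux/math64.h>. A tiny standalone sketch of that conversion follows (the struct and helper names are invented).

/* Standalone sketch: splitting a nanosecond count into seconds and
 * nanoseconds, as the ns_to_timespec() call above does. */
#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

struct ts { int64_t tv_sec; long tv_nsec; };

static struct ts ns_to_ts(int64_t ns)
{
        struct ts t;

        t.tv_sec  = ns / NSEC_PER_SEC;          /* whole seconds        */
        t.tv_nsec = (long)(ns % NSEC_PER_SEC);  /* leftover nanoseconds */
        return t;
}

int main(void)
{
        struct ts t = ns_to_ts(2500000000LL);   /* 2.5 seconds */

        printf("%" PRId64 " s, %ld ns\n", t.tv_sec, t.tv_nsec);
        return 0;
}
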
@@ -1087,45 +1086,45 @@ static void check_process_timers(struct task_struct *tsk,
1087 maxfire = 20; 1086 maxfire = 20;
1088 prof_expires = cputime_zero; 1087 prof_expires = cputime_zero;
1089 while (!list_empty(timers)) { 1088 while (!list_empty(timers)) {
1090 struct cpu_timer_list *t = list_first_entry(timers, 1089 struct cpu_timer_list *tl = list_first_entry(timers,
1091 struct cpu_timer_list, 1090 struct cpu_timer_list,
1092 entry); 1091 entry);
1093 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { 1092 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
1094 prof_expires = t->expires.cpu; 1093 prof_expires = tl->expires.cpu;
1095 break; 1094 break;
1096 } 1095 }
1097 t->firing = 1; 1096 tl->firing = 1;
1098 list_move_tail(&t->entry, firing); 1097 list_move_tail(&tl->entry, firing);
1099 } 1098 }
1100 1099
1101 ++timers; 1100 ++timers;
1102 maxfire = 20; 1101 maxfire = 20;
1103 virt_expires = cputime_zero; 1102 virt_expires = cputime_zero;
1104 while (!list_empty(timers)) { 1103 while (!list_empty(timers)) {
1105 struct cpu_timer_list *t = list_first_entry(timers, 1104 struct cpu_timer_list *tl = list_first_entry(timers,
1106 struct cpu_timer_list, 1105 struct cpu_timer_list,
1107 entry); 1106 entry);
1108 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { 1107 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
1109 virt_expires = t->expires.cpu; 1108 virt_expires = tl->expires.cpu;
1110 break; 1109 break;
1111 } 1110 }
1112 t->firing = 1; 1111 tl->firing = 1;
1113 list_move_tail(&t->entry, firing); 1112 list_move_tail(&tl->entry, firing);
1114 } 1113 }
1115 1114
1116 ++timers; 1115 ++timers;
1117 maxfire = 20; 1116 maxfire = 20;
1118 sched_expires = 0; 1117 sched_expires = 0;
1119 while (!list_empty(timers)) { 1118 while (!list_empty(timers)) {
1120 struct cpu_timer_list *t = list_first_entry(timers, 1119 struct cpu_timer_list *tl = list_first_entry(timers,
1121 struct cpu_timer_list, 1120 struct cpu_timer_list,
1122 entry); 1121 entry);
1123 if (!--maxfire || sum_sched_runtime < t->expires.sched) { 1122 if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1124 sched_expires = t->expires.sched; 1123 sched_expires = tl->expires.sched;
1125 break; 1124 break;
1126 } 1125 }
1127 t->firing = 1; 1126 tl->firing = 1;
1128 list_move_tail(&t->entry, firing); 1127 list_move_tail(&tl->entry, firing);
1129 } 1128 }
1130 1129
1131 /* 1130 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a9b04203a66d..dbd8398ddb0b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -37,7 +37,6 @@
37#include <linux/mutex.h> 37#include <linux/mutex.h>
38 38
39#include <asm/uaccess.h> 39#include <asm/uaccess.h>
40#include <asm/semaphore.h>
41#include <linux/list.h> 40#include <linux/list.h>
42#include <linux/init.h> 41#include <linux/init.h>
43#include <linux/compiler.h> 42#include <linux/compiler.h>
@@ -311,8 +310,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
311 310
312 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 311 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
313 struct task_struct *leader; 312 struct task_struct *leader;
314 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, 313 int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
315 timr->it_process);
316 314
317 if (likely(ret >= 0)) 315 if (likely(ret >= 0))
318 return ret; 316 return ret;
@@ -323,8 +321,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
323 timr->it_process = leader; 321 timr->it_process = leader;
324 } 322 }
325 323
326 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, 324 return send_sigqueue(timr->sigq, timr->it_process, 1);
327 timr->it_process);
328} 325}
329EXPORT_SYMBOL_GPL(posix_timer_event); 326EXPORT_SYMBOL_GPL(posix_timer_event);
330 327
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6233f3b4ae66..b45da40e8d25 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,16 +19,6 @@ config PM
19 will issue the hlt instruction if nothing is to be done, thereby 19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY
23 bool "Legacy Power Management API (DEPRECATED)"
24 depends on PM
25 default n
26 ---help---
27 Support for pm_register() and friends. This old API is obsoleted
28 by the driver model.
29
30 If unsure, say N.
31
32config PM_DEBUG 22config PM_DEBUG
33 bool "Power Management Debug Support" 23 bool "Power Management Debug Support"
34 depends on PM 24 depends on PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f7dfff28ecdb..597823b5b700 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o 6obj-y := main.o
7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_PM_SLEEP) += process.o console.o 7obj-$(CONFIG_PM_SLEEP) += process.o console.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
10 9
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 89bcf4973ee5..b8628be2a465 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -7,17 +7,39 @@
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/console.h>
10#include <linux/module.h>
10#include "power.h" 11#include "power.h"
11 12
12#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) 13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
14 15
15static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
16 33
17int pm_prepare_console(void) 34int pm_prepare_console(void)
18{ 35{
19 acquire_console_sem(); 36 acquire_console_sem();
20 37
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
21 orig_fgconsole = fg_console; 43 orig_fgconsole = fg_console;
22 44
23 if (vc_allocate(SUSPEND_CONSOLE)) { 45 if (vc_allocate(SUSPEND_CONSOLE)) {
@@ -50,9 +72,12 @@ int pm_prepare_console(void)
50void pm_restore_console(void) 72void pm_restore_console(void)
51{ 73{
52 acquire_console_sem(); 74 acquire_console_sem();
75 if (disable_vt_switch) {
76 release_console_sem();
77 return;
78 }
53 set_console(orig_fgconsole); 79 set_console(orig_fgconsole);
54 release_console_sem(); 80 release_console_sem();
55 kmsg_redirect = orig_kmsg; 81 kmsg_redirect = orig_kmsg;
56 return;
57} 82}
58#endif 83#endif
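
pm_set_vt_switch() lets a driver declare that the framebuffer restores the video state itself, and both pm_prepare_console() and pm_restore_console() then bail out early when disable_vt_switch is set; the flag is only touched under the console semaphore. The standalone sketch below mirrors that lock-protected disable flag and early-return shape, with a pthread mutex standing in for the console semaphore.

/* Standalone sketch: a lock-protected "skip the work" flag with early
 * returns, mirroring pm_set_vt_switch(). */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t console_lock = PTHREAD_MUTEX_INITIALIZER;
static int disable_vt_switch;

static void set_vt_switch(int do_switch)
{
        pthread_mutex_lock(&console_lock);
        disable_vt_switch = !do_switch;
        pthread_mutex_unlock(&console_lock);
}

static void prepare_console(void)
{
        pthread_mutex_lock(&console_lock);
        if (disable_vt_switch) {                 /* nothing to do: bail early */
                pthread_mutex_unlock(&console_lock);
                return;
        }
        puts("switching to the suspend console");
        pthread_mutex_unlock(&console_lock);
}

int main(void)
{
        prepare_console();      /* switches                     */
        set_vt_switch(0);       /* driver opts out of switching */
        prepare_console();      /* early return, no switch      */
        return 0;
}
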
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
deleted file mode 100644
index 60c73fa670d5..000000000000
--- a/kernel/power/pm.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 * pm.c - Power management interface
3 *
4 * Copyright (C) 2000 Andrew Henroid
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20#include <linux/init.h>
21#include <linux/module.h>
22#include <linux/spinlock.h>
23#include <linux/mm.h>
24#include <linux/slab.h>
25#include <linux/pm.h>
26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h>
28#include <linux/mutex.h>
29
30/*
31 * Locking notes:
32 * pm_devs_lock can be a semaphore providing pm ops are not called
33 * from an interrupt handler (already a bad idea so no change here). Each
34 * change must be protected so that an unlink of an entry doesn't clash
35 * with a pm send - which is permitted to sleep in the current architecture
36 *
37 * Module unloads clashing with pm events now work out safely, the module
38 * unload path will block until the event has been sent. It may well block
39 * until a resume but that will be fine.
40 */
41
42static DEFINE_MUTEX(pm_devs_lock);
43static LIST_HEAD(pm_devs);
44
45/**
46 * pm_register - register a device with power management
47 * @type: device type
48 * @id: device ID
49 * @callback: callback function
50 *
51 * Add a device to the list of devices that wish to be notified about
52 * power management events. A &pm_dev structure is returned on success,
53 * on failure the return is %NULL.
54 *
55 * The callback function will be called in process context and
56 * it may sleep.
57 */
58
59struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id,
61 pm_callback callback)
62{
63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) {
65 dev->type = type;
66 dev->id = id;
67 dev->callback = callback;
68
69 mutex_lock(&pm_devs_lock);
70 list_add(&dev->entry, &pm_devs);
71 mutex_unlock(&pm_devs_lock);
72 }
73 return dev;
74}
75
76/**
77 * pm_send - send request to a single device
78 * @dev: device to send to
79 * @rqst: power management request
80 * @data: data for the callback
81 *
82 * Issue a power management request to a given device. The
83 * %PM_SUSPEND and %PM_RESUME events are handled specially. The
84 * data field must hold the intended next state. No call is made
85 * if the state matches.
86 *
87 * BUGS: what stops two power management requests occurring in parallel
88 * and conflicting.
89 *
90 * WARNING: Calling pm_send directly is not generally recommended, in
91 * particular there is no locking against the pm_dev going away. The
92 * caller must maintain all needed locking or have 'inside knowledge'
93 * on the safety. Also remember that this function is not locked against
94 * pm_unregister. This means that you must handle SMP races on callback
95 * execution and unload yourself.
96 */
97
98static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
99{
100 int status = 0;
101 unsigned long prev_state, next_state;
102
103 if (in_interrupt())
104 BUG();
105
106 switch (rqst) {
107 case PM_SUSPEND:
108 case PM_RESUME:
109 prev_state = dev->state;
110 next_state = (unsigned long) data;
111 if (prev_state != next_state) {
112 if (dev->callback)
113 status = (*dev->callback)(dev, rqst, data);
114 if (!status) {
115 dev->state = next_state;
116 dev->prev_state = prev_state;
117 }
118 }
119 else {
120 dev->prev_state = prev_state;
121 }
122 break;
123 default:
124 if (dev->callback)
125 status = (*dev->callback)(dev, rqst, data);
126 break;
127 }
128 return status;
129}
130
131/*
132 * Undo incomplete request
133 */
134static void pm_undo_all(struct pm_dev *last)
135{
136 struct list_head *entry = last->entry.prev;
137 while (entry != &pm_devs) {
138 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
139 if (dev->state != dev->prev_state) {
140 /* previous state was zero (running) resume or
141 * previous state was non-zero (suspended) suspend
142 */
143 pm_request_t undo = (dev->prev_state
144 ? PM_SUSPEND:PM_RESUME);
145 pm_send(dev, undo, (void*) dev->prev_state);
146 }
147 entry = entry->prev;
148 }
149}
150
151/**
152 * pm_send_all - send request to all managed devices
153 * @rqst: power management request
154 * @data: data for the callback
155 *
156 * Issue a power management request to a all devices. The
157 * %PM_SUSPEND events are handled specially. Any device is
158 * permitted to fail a suspend by returning a non zero (error)
159 * value from its callback function. If any device vetoes a
160 * suspend request then all other devices that have suspended
161 * during the processing of this request are restored to their
162 * previous state.
163 *
164 * WARNING: This function takes the pm_devs_lock. The lock is not dropped until
165 * the callbacks have completed. This prevents races against pm locking
166 * functions, races against module unload pm_unregister code. It does
167 * mean however that you must not issue pm_ functions within the callback
168 * or you will deadlock and users will hate you.
169 *
170 * Zero is returned on success. If a suspend fails then the status
171 * from the device that vetoes the suspend is returned.
172 *
173 * BUGS: what stops two power management requests occurring in parallel
174 * and conflicting.
175 */
176
177int pm_send_all(pm_request_t rqst, void *data)
178{
179 struct list_head *entry;
180
181 mutex_lock(&pm_devs_lock);
182 entry = pm_devs.next;
183 while (entry != &pm_devs) {
184 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
185 if (dev->callback) {
186 int status = pm_send(dev, rqst, data);
187 if (status) {
188 /* return devices to previous state on
189 * failed suspend request
190 */
191 if (rqst == PM_SUSPEND)
192 pm_undo_all(dev);
193 mutex_unlock(&pm_devs_lock);
194 return status;
195 }
196 }
197 entry = entry->next;
198 }
199 mutex_unlock(&pm_devs_lock);
200 return 0;
201}
202
203EXPORT_SYMBOL(pm_register);
204EXPORT_SYMBOL(pm_send_all);
205
diff --git a/kernel/printk.c b/kernel/printk.c
index c46a20a19a15..8fb01c32aa3b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -111,6 +111,9 @@ struct console_cmdline
111 char name[8]; /* Name of the driver */ 111 char name[8]; /* Name of the driver */
112 int index; /* Minor dev. to use */ 112 int index; /* Minor dev. to use */
113 char *options; /* Options for the driver */ 113 char *options; /* Options for the driver */
114#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
115 char *brl_options; /* Options for braille driver */
116#endif
114}; 117};
115 118
116#define MAX_CMDLINECONSOLES 8 119#define MAX_CMDLINECONSOLES 8
@@ -643,8 +646,21 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
643{ 646{
644 int retval = 0; 647 int retval = 0;
645 648
646 if (can_use_console(cpu)) 649 if (!try_acquire_console_sem()) {
647 retval = !try_acquire_console_sem(); 650 retval = 1;
651
652 /*
653 * If we can't use the console, we need to release
654 * the console semaphore by hand to avoid flushing
655 * the buffer. We need to hold the console semaphore
656 * in order to do this test safely.
657 */
658 if (!can_use_console(cpu)) {
659 console_locked = 0;
660 up(&console_sem);
661 retval = 0;
662 }
663 }
648 printk_cpu = UINT_MAX; 664 printk_cpu = UINT_MAX;
649 spin_unlock(&logbuf_lock); 665 spin_unlock(&logbuf_lock);
650 return retval; 666 return retval;
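
The rewritten acquire_console_semaphore_for_printk() now takes the console semaphore with a trylock first and only then checks whether this CPU may use the console; if it may not, the semaphore is released again by hand so the buffer is not flushed, and the caller is told it did not get the console. The standalone sketch below reproduces that trylock-then-verify shape, with a pthread trylock standing in for try_acquire_console_sem() and a plain flag standing in for can_use_console().

/* Standalone sketch: trylock first, verify the resource is usable,
 * and hand the lock back if it is not. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t console_sem = PTHREAD_MUTEX_INITIALIZER;
static bool console_usable = true;      /* e.g. false while this CPU is oopsing */

/* returns 1 if the caller now holds the console and should flush it */
static int acquire_console_for_printk(void)
{
        int got_it = 0;

        if (pthread_mutex_trylock(&console_sem) == 0) {
                got_it = 1;
                if (!console_usable) {
                        /* the test is only safe while holding the lock;
                         * give the lock back instead of flushing */
                        pthread_mutex_unlock(&console_sem);
                        got_it = 0;
                }
        }
        return got_it;
}

int main(void)
{
        printf("usable console:   got_it=%d\n", acquire_console_for_printk());
        pthread_mutex_unlock(&console_sem);     /* caller releases after flushing */

        console_usable = false;
        printf("unusable console: got_it=%d\n", acquire_console_for_printk());
        return 0;
}
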
@@ -795,15 +811,60 @@ static void call_console_drivers(unsigned start, unsigned end)
795 811
796#endif 812#endif
797 813
814static int __add_preferred_console(char *name, int idx, char *options,
815 char *brl_options)
816{
817 struct console_cmdline *c;
818 int i;
819
820 /*
821 * See if this tty is not yet registered, and
822 * if we have a slot free.
823 */
824 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
825 if (strcmp(console_cmdline[i].name, name) == 0 &&
826 console_cmdline[i].index == idx) {
827 if (!brl_options)
828 selected_console = i;
829 return 0;
830 }
831 if (i == MAX_CMDLINECONSOLES)
832 return -E2BIG;
833 if (!brl_options)
834 selected_console = i;
835 c = &console_cmdline[i];
836 strlcpy(c->name, name, sizeof(c->name));
837 c->options = options;
838#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
839 c->brl_options = brl_options;
840#endif
841 c->index = idx;
842 return 0;
843}
798/* 844/*
799 * Set up a list of consoles. Called from init/main.c 845 * Set up a list of consoles. Called from init/main.c
800 */ 846 */
801static int __init console_setup(char *str) 847static int __init console_setup(char *str)
802{ 848{
803 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ 849 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
804 char *s, *options; 850 char *s, *options, *brl_options = NULL;
805 int idx; 851 int idx;
806 852
853#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
854 if (!memcmp(str, "brl,", 4)) {
855 brl_options = "";
856 str += 4;
857 } else if (!memcmp(str, "brl=", 4)) {
858 brl_options = str + 4;
859 str = strchr(brl_options, ',');
860 if (!str) {
861 printk(KERN_ERR "need port name after brl=\n");
862 return 1;
863 }
864 *(str++) = 0;
865 }
866#endif
867
807 /* 868 /*
808 * Decode str into name, index, options. 869 * Decode str into name, index, options.
809 */ 870 */
@@ -828,7 +889,7 @@ static int __init console_setup(char *str)
828 idx = simple_strtoul(s, NULL, 10); 889 idx = simple_strtoul(s, NULL, 10);
829 *s = 0; 890 *s = 0;
830 891
831 add_preferred_console(buf, idx, options); 892 __add_preferred_console(buf, idx, options, brl_options);
832 return 1; 893 return 1;
833} 894}
834__setup("console=", console_setup); 895__setup("console=", console_setup);
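
console_setup() now peels an optional braille prefix off the console= argument before the usual name/index/options split: "brl," selects braille with default options, "brl=<opts>," carries explicit ones, and whatever remains is handed to the new __add_preferred_console() together with brl_options. The standalone sketch below shows that prefix parsing with ordinary libc calls; it only illustrates the string handling, not the rest of console registration.

/* Standalone sketch: stripping an optional "brl," / "brl=..." prefix
 * the way console_setup() does. */
#include <stdio.h>
#include <string.h>

static void parse_console_arg(char *str)
{
        char *brl_options = NULL;

        if (!strncmp(str, "brl,", 4)) {
                brl_options = "";               /* braille, default options  */
                str += 4;
        } else if (!strncmp(str, "brl=", 4)) {
                brl_options = str + 4;          /* explicit braille options  */
                str = strchr(brl_options, ',');
                if (!str) {
                        puts("need port name after brl=");
                        return;
                }
                *str++ = '\0';                  /* terminate the option part */
        }

        printf("console spec '%s', braille options: %s\n",
               str, brl_options ? brl_options : "(none)");
}

int main(void)
{
        char plain[] = "ttyS0,115200";
        char braille[] = "brl=auto,ttyS0,115200";

        parse_console_arg(plain);
        parse_console_arg(braille);
        return 0;
}
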
@@ -848,28 +909,7 @@ __setup("console=", console_setup);
848 */ 909 */
849int add_preferred_console(char *name, int idx, char *options) 910int add_preferred_console(char *name, int idx, char *options)
850{ 911{
851 struct console_cmdline *c; 912 return __add_preferred_console(name, idx, options, NULL);
852 int i;
853
854 /*
855 * See if this tty is not yet registered, and
856 * if we have a slot free.
857 */
858 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
859 if (strcmp(console_cmdline[i].name, name) == 0 &&
860 console_cmdline[i].index == idx) {
861 selected_console = i;
862 return 0;
863 }
864 if (i == MAX_CMDLINECONSOLES)
865 return -E2BIG;
866 selected_console = i;
867 c = &console_cmdline[i];
868 memcpy(c->name, name, sizeof(c->name));
869 c->name[sizeof(c->name) - 1] = 0;
870 c->options = options;
871 c->index = idx;
872 return 0;
873} 913}
874 914
875int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) 915int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
@@ -881,7 +921,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
881 if (strcmp(console_cmdline[i].name, name) == 0 && 921 if (strcmp(console_cmdline[i].name, name) == 0 &&
882 console_cmdline[i].index == idx) { 922 console_cmdline[i].index == idx) {
883 c = &console_cmdline[i]; 923 c = &console_cmdline[i];
884 memcpy(c->name, name_new, sizeof(c->name)); 924 strlcpy(c->name, name_new, sizeof(c->name));
885 c->name[sizeof(c->name) - 1] = 0; 925 c->name[sizeof(c->name) - 1] = 0;
886 c->options = options; 926 c->options = options;
887 c->index = idx_new; 927 c->index = idx_new;
@@ -1150,6 +1190,16 @@ void register_console(struct console *console)
1150 continue; 1190 continue;
1151 if (console->index < 0) 1191 if (console->index < 0)
1152 console->index = console_cmdline[i].index; 1192 console->index = console_cmdline[i].index;
1193#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1194 if (console_cmdline[i].brl_options) {
1195 console->flags |= CON_BRL;
1196 braille_register_console(console,
1197 console_cmdline[i].index,
1198 console_cmdline[i].options,
1199 console_cmdline[i].brl_options);
1200 return;
1201 }
1202#endif
1153 if (console->setup && 1203 if (console->setup &&
1154 console->setup(console, console_cmdline[i].options) != 0) 1204 console->setup(console, console_cmdline[i].options) != 0)
1155 break; 1205 break;
@@ -1208,6 +1258,11 @@ int unregister_console(struct console *console)
1208 struct console *a, *b; 1258 struct console *a, *b;
1209 int res = 1; 1259 int res = 1;
1210 1260
1261#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1262 if (console->flags & CON_BRL)
1263 return braille_unregister_console(console);
1264#endif
1265
1211 acquire_console_sem(); 1266 acquire_console_sem();
1212 if (console_drivers == console) { 1267 if (console_drivers == console) {
1213 console_drivers=console->next; 1268 console_drivers=console->next;
@@ -1259,8 +1314,8 @@ late_initcall(disable_boot_consoles);
1259 */ 1314 */
1260void tty_write_message(struct tty_struct *tty, char *msg) 1315void tty_write_message(struct tty_struct *tty, char *msg)
1261{ 1316{
1262 if (tty && tty->driver->write) 1317 if (tty && tty->ops->write)
1263 tty->driver->write(tty, msg, strlen(msg)); 1318 tty->ops->write(tty, msg, strlen(msg));
1264 return; 1319 return;
1265} 1320}
1266 1321
@@ -1274,31 +1329,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1274 */ 1329 */
1275int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1330int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1276{ 1331{
1277 static DEFINE_SPINLOCK(ratelimit_lock); 1332 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1278 static unsigned toks = 10 * 5 * HZ;
1279 static unsigned long last_msg;
1280 static int missed;
1281 unsigned long flags;
1282 unsigned long now = jiffies;
1283
1284 spin_lock_irqsave(&ratelimit_lock, flags);
1285 toks += now - last_msg;
1286 last_msg = now;
1287 if (toks > (ratelimit_burst * ratelimit_jiffies))
1288 toks = ratelimit_burst * ratelimit_jiffies;
1289 if (toks >= ratelimit_jiffies) {
1290 int lost = missed;
1291
1292 missed = 0;
1293 toks -= ratelimit_jiffies;
1294 spin_unlock_irqrestore(&ratelimit_lock, flags);
1295 if (lost)
1296 printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
1297 return 1;
1298 }
1299 missed++;
1300 spin_unlock_irqrestore(&ratelimit_lock, flags);
1301 return 0;
1302} 1333}
1303EXPORT_SYMBOL(__printk_ratelimit); 1334EXPORT_SYMBOL(__printk_ratelimit);
1304 1335
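
The open-coded token bucket in __printk_ratelimit() moves into the shared __ratelimit() helper, so printk keeps the same behaviour with less duplication: tokens accumulate with elapsed jiffies up to burst * interval, each allowed message spends one interval's worth, and suppressed messages are counted and reported once printing resumes. The standalone sketch below re-implements that token-bucket logic against a fake clock; the names and units are illustrative, not the kernel helper's interface.

/* Standalone sketch: the token-bucket logic the removed
 * __printk_ratelimit() body implemented, driven by a fake clock. */
#include <stdio.h>

#define INTERVAL 10UL             /* "jiffies" one message costs       */
#define BURST     3UL             /* messages allowed in a quick burst */

static unsigned long now;         /* fake clock */

static int ratelimit(void)
{
        static unsigned long toks = BURST * INTERVAL;   /* start with a full bucket */
        static unsigned long last;
        static int missed;

        toks += now - last;                   /* earn tokens for elapsed time */
        last = now;
        if (toks > BURST * INTERVAL)
                toks = BURST * INTERVAL;      /* cap at one full burst        */

        if (toks >= INTERVAL) {
                toks -= INTERVAL;             /* spend a token: allowed       */
                if (missed) {
                        printf("%d messages suppressed\n", missed);
                        missed = 0;
                }
                return 1;
        }
        missed++;                             /* over the limit: dropped      */
        return 0;
}

int main(void)
{
        for (int i = 0; i < 8; i++)
                printf("t=%lu allowed=%d\n", now, ratelimit());

        now += 100;                           /* a quiet period refills the bucket */
        printf("t=%lu allowed=%d\n", now, ratelimit());
        return 0;
}
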
diff --git a/kernel/profile.c b/kernel/profile.c
index 3b7a1b055122..ae7ead82cbc9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,7 +23,6 @@
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <asm/sections.h> 25#include <asm/sections.h>
26#include <asm/semaphore.h>
27#include <asm/irq_regs.h> 26#include <asm/irq_regs.h>
28#include <asm/ptrace.h> 27#include <asm/ptrace.h>
29 28
@@ -588,10 +587,10 @@ static int __init create_proc_profile(void)
588 return 0; 587 return 0;
589 if (create_hash_tables()) 588 if (create_hash_tables())
590 return -1; 589 return -1;
591 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); 590 entry = proc_create("profile", S_IWUSR | S_IRUGO,
591 NULL, &proc_profile_operations);
592 if (!entry) 592 if (!entry)
593 return 0; 593 return 0;
594 entry->proc_fops = &proc_profile_operations;
595 entry->size = (1+prof_len) * sizeof(atomic_t); 594 entry->size = (1+prof_len) * sizeof(atomic_t);
596 hotcpu_notifier(profile_cpu_callback, 0); 595 hotcpu_notifier(profile_cpu_callback, 0);
597 return 0; 596 return 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index fdb34e86f923..6c19e94fd0a5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -73,7 +73,7 @@ void __ptrace_unlink(struct task_struct *child)
73 BUG_ON(!child->ptrace); 73 BUG_ON(!child->ptrace);
74 74
75 child->ptrace = 0; 75 child->ptrace = 0;
76 if (!list_empty(&child->ptrace_list)) { 76 if (ptrace_reparented(child)) {
77 list_del_init(&child->ptrace_list); 77 list_del_init(&child->ptrace_list);
78 remove_parent(child); 78 remove_parent(child);
79 child->parent = child->real_parent; 79 child->parent = child->real_parent;
@@ -168,8 +168,6 @@ int ptrace_attach(struct task_struct *task)
168 audit_ptrace(task); 168 audit_ptrace(task);
169 169
170 retval = -EPERM; 170 retval = -EPERM;
171 if (task->pid <= 1)
172 goto out;
173 if (same_thread_group(task, current)) 171 if (same_thread_group(task, current))
174 goto out; 172 goto out;
175 173
@@ -208,8 +206,7 @@ repeat:
208 206
209 __ptrace_link(task, current); 207 __ptrace_link(task, current);
210 208
211 force_sig_specific(SIGSTOP, task); 209 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
212
213bad: 210bad:
214 write_unlock_irqrestore(&tasklist_lock, flags); 211 write_unlock_irqrestore(&tasklist_lock, flags);
215 task_unlock(task); 212 task_unlock(task);
@@ -323,9 +320,8 @@ static int ptrace_setoptions(struct task_struct *child, long data)
323 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; 320 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
324} 321}
325 322
326static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) 323static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
327{ 324{
328 siginfo_t lastinfo;
329 int error = -ESRCH; 325 int error = -ESRCH;
330 326
331 read_lock(&tasklist_lock); 327 read_lock(&tasklist_lock);
@@ -333,31 +329,25 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
333 error = -EINVAL; 329 error = -EINVAL;
334 spin_lock_irq(&child->sighand->siglock); 330 spin_lock_irq(&child->sighand->siglock);
335 if (likely(child->last_siginfo != NULL)) { 331 if (likely(child->last_siginfo != NULL)) {
336 lastinfo = *child->last_siginfo; 332 *info = *child->last_siginfo;
337 error = 0; 333 error = 0;
338 } 334 }
339 spin_unlock_irq(&child->sighand->siglock); 335 spin_unlock_irq(&child->sighand->siglock);
340 } 336 }
341 read_unlock(&tasklist_lock); 337 read_unlock(&tasklist_lock);
342 if (!error)
343 return copy_siginfo_to_user(data, &lastinfo);
344 return error; 338 return error;
345} 339}
346 340
347static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) 341static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
348{ 342{
349 siginfo_t newinfo;
350 int error = -ESRCH; 343 int error = -ESRCH;
351 344
352 if (copy_from_user(&newinfo, data, sizeof (siginfo_t)))
353 return -EFAULT;
354
355 read_lock(&tasklist_lock); 345 read_lock(&tasklist_lock);
356 if (likely(child->sighand != NULL)) { 346 if (likely(child->sighand != NULL)) {
357 error = -EINVAL; 347 error = -EINVAL;
358 spin_lock_irq(&child->sighand->siglock); 348 spin_lock_irq(&child->sighand->siglock);
359 if (likely(child->last_siginfo != NULL)) { 349 if (likely(child->last_siginfo != NULL)) {
360 *child->last_siginfo = newinfo; 350 *child->last_siginfo = *info;
361 error = 0; 351 error = 0;
362 } 352 }
363 spin_unlock_irq(&child->sighand->siglock); 353 spin_unlock_irq(&child->sighand->siglock);
@@ -424,6 +414,7 @@ int ptrace_request(struct task_struct *child, long request,
424 long addr, long data) 414 long addr, long data)
425{ 415{
426 int ret = -EIO; 416 int ret = -EIO;
417 siginfo_t siginfo;
427 418
428 switch (request) { 419 switch (request) {
429 case PTRACE_PEEKTEXT: 420 case PTRACE_PEEKTEXT:
@@ -442,12 +433,22 @@ int ptrace_request(struct task_struct *child, long request,
442 case PTRACE_GETEVENTMSG: 433 case PTRACE_GETEVENTMSG:
443 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 434 ret = put_user(child->ptrace_message, (unsigned long __user *) data);
444 break; 435 break;
436
445 case PTRACE_GETSIGINFO: 437 case PTRACE_GETSIGINFO:
446 ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); 438 ret = ptrace_getsiginfo(child, &siginfo);
439 if (!ret)
440 ret = copy_siginfo_to_user((siginfo_t __user *) data,
441 &siginfo);
447 break; 442 break;
443
448 case PTRACE_SETSIGINFO: 444 case PTRACE_SETSIGINFO:
449 ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); 445 if (copy_from_user(&siginfo, (siginfo_t __user *) data,
446 sizeof siginfo))
447 ret = -EFAULT;
448 else
449 ret = ptrace_setsiginfo(child, &siginfo);
450 break; 450 break;
451
451 case PTRACE_DETACH: /* detach a process that was attached. */ 452 case PTRACE_DETACH: /* detach a process that was attached. */
452 ret = ptrace_detach(child, data); 453 ret = ptrace_detach(child, data);
453 break; 454 break;
@@ -518,12 +519,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
518{ 519{
519 struct task_struct *child; 520 struct task_struct *child;
520 521
521 /*
522 * Tracing init is not allowed.
523 */
524 if (pid == 1)
525 return ERR_PTR(-EPERM);
526
527 read_lock(&tasklist_lock); 522 read_lock(&tasklist_lock);
528 child = find_task_by_vpid(pid); 523 child = find_task_by_vpid(pid);
529 if (child) 524 if (child)
@@ -539,7 +534,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
539#define arch_ptrace_attach(child) do { } while (0) 534#define arch_ptrace_attach(child) do { } while (0)
540#endif 535#endif
541 536
542#ifndef __ARCH_SYS_PTRACE
543asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 537asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
544{ 538{
545 struct task_struct *child; 539 struct task_struct *child;
@@ -587,7 +581,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
587 unlock_kernel(); 581 unlock_kernel();
588 return ret; 582 return ret;
589} 583}
590#endif /* __ARCH_SYS_PTRACE */
591 584
592int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 585int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
593{ 586{
@@ -608,7 +601,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
608 return (copied == sizeof(data)) ? 0 : -EIO; 601 return (copied == sizeof(data)) ? 0 : -EIO;
609} 602}
610 603
611#ifdef CONFIG_COMPAT 604#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE
612#include <linux/compat.h> 605#include <linux/compat.h>
613 606
614int compat_ptrace_request(struct task_struct *child, compat_long_t request, 607int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -616,6 +609,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
616{ 609{
617 compat_ulong_t __user *datap = compat_ptr(data); 610 compat_ulong_t __user *datap = compat_ptr(data);
618 compat_ulong_t word; 611 compat_ulong_t word;
612 siginfo_t siginfo;
619 int ret; 613 int ret;
620 614
621 switch (request) { 615 switch (request) {
@@ -638,6 +632,23 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
638 ret = put_user((compat_ulong_t) child->ptrace_message, datap); 632 ret = put_user((compat_ulong_t) child->ptrace_message, datap);
639 break; 633 break;
640 634
635 case PTRACE_GETSIGINFO:
636 ret = ptrace_getsiginfo(child, &siginfo);
637 if (!ret)
638 ret = copy_siginfo_to_user32(
639 (struct compat_siginfo __user *) datap,
640 &siginfo);
641 break;
642
643 case PTRACE_SETSIGINFO:
644 memset(&siginfo, 0, sizeof siginfo);
645 if (copy_siginfo_from_user32(
646 &siginfo, (struct compat_siginfo __user *) datap))
647 ret = -EFAULT;
648 else
649 ret = ptrace_setsiginfo(child, &siginfo);
650 break;
651
641 default: 652 default:
642 ret = ptrace_request(child, request, addr, data); 653 ret = ptrace_request(child, request, addr, data);
643 } 654 }
@@ -645,7 +656,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
645 return ret; 656 return ret;
646} 657}
647 658
648#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
649asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, 659asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
650 compat_long_t addr, compat_long_t data) 660 compat_long_t addr, compat_long_t data)
651{ 661{
@@ -688,6 +698,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
688 unlock_kernel(); 698 unlock_kernel();
689 return ret; 699 return ret;
690} 700}
691#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ 701#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */
692
693#endif /* CONFIG_COMPAT */
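
The ptrace.c hunks above move the siginfo user-copy out of ptrace_getsiginfo()/ptrace_setsiginfo() and into the request dispatchers, which is what lets the new compat cases reuse the same helpers via copy_siginfo_to_user32()/copy_siginfo_from_user32(). Below is a minimal userspace sketch of the tracer-side API these paths serve; it is not part of the patch, and error handling is trimmed.

/*
 * Tracer sketch: after the tracee stops on a signal, PTRACE_GETSIGINFO
 * copies the pending siginfo_t out, and PTRACE_SETSIGINFO can rewrite it
 * before the signal is (or is not) delivered.
 */
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {                                 /* tracee */
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGUSR1);                         /* stop under the tracer */
                _exit(0);
        }

        waitpid(pid, NULL, 0);                          /* tracee stopped on SIGUSR1 */

        siginfo_t si;
        if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si) == 0) {
                printf("stopped: signo=%d code=%d\n", si.si_signo, si.si_code);
                si.si_code = SI_USER;                   /* tweak and write it back */
                ptrace(PTRACE_SETSIGINFO, pid, NULL, &si);
        }

        ptrace(PTRACE_CONT, pid, NULL, 0);              /* data 0: suppress the signal */
        waitpid(pid, NULL, 0);
        return 0;
}
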
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e9517014b57c..e1cdf196a515 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1007,10 +1007,10 @@ void __synchronize_sched(void)
1007 if (sched_getaffinity(0, &oldmask) < 0) 1007 if (sched_getaffinity(0, &oldmask) < 0)
1008 oldmask = cpu_possible_map; 1008 oldmask = cpu_possible_map;
1009 for_each_online_cpu(cpu) { 1009 for_each_online_cpu(cpu) {
1010 sched_setaffinity(0, cpumask_of_cpu(cpu)); 1010 sched_setaffinity(0, &cpumask_of_cpu(cpu));
1011 schedule(); 1011 schedule();
1012 } 1012 }
1013 sched_setaffinity(0, oldmask); 1013 sched_setaffinity(0, &oldmask);
1014} 1014}
1015EXPORT_SYMBOL_GPL(__synchronize_sched); 1015EXPORT_SYMBOL_GPL(__synchronize_sched);
1016 1016
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index fd599829e72a..33acc424667e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -45,6 +45,7 @@
45#include <linux/byteorder/swabb.h> 45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h>
48 49
49MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -723,9 +724,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
723 */ 724 */
724static void rcu_torture_shuffle_tasks(void) 725static void rcu_torture_shuffle_tasks(void)
725{ 726{
726 cpumask_t tmp_mask = CPU_MASK_ALL; 727 cpumask_t tmp_mask;
727 int i; 728 int i;
728 729
730 cpus_setall(tmp_mask);
729 get_online_cpus(); 731 get_online_cpus();
730 732
731 /* No point in shuffling if there is only one online CPU (ex: UP) */ 733 /* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -737,25 +739,27 @@ static void rcu_torture_shuffle_tasks(void)
737 if (rcu_idle_cpu != -1) 739 if (rcu_idle_cpu != -1)
738 cpu_clear(rcu_idle_cpu, tmp_mask); 740 cpu_clear(rcu_idle_cpu, tmp_mask);
739 741
740 set_cpus_allowed(current, tmp_mask); 742 set_cpus_allowed_ptr(current, &tmp_mask);
741 743
742 if (reader_tasks) { 744 if (reader_tasks) {
743 for (i = 0; i < nrealreaders; i++) 745 for (i = 0; i < nrealreaders; i++)
744 if (reader_tasks[i]) 746 if (reader_tasks[i])
745 set_cpus_allowed(reader_tasks[i], tmp_mask); 747 set_cpus_allowed_ptr(reader_tasks[i],
748 &tmp_mask);
746 } 749 }
747 750
748 if (fakewriter_tasks) { 751 if (fakewriter_tasks) {
749 for (i = 0; i < nfakewriters; i++) 752 for (i = 0; i < nfakewriters; i++)
750 if (fakewriter_tasks[i]) 753 if (fakewriter_tasks[i])
751 set_cpus_allowed(fakewriter_tasks[i], tmp_mask); 754 set_cpus_allowed_ptr(fakewriter_tasks[i],
755 &tmp_mask);
752 } 756 }
753 757
754 if (writer_task) 758 if (writer_task)
755 set_cpus_allowed(writer_task, tmp_mask); 759 set_cpus_allowed_ptr(writer_task, &tmp_mask);
756 760
757 if (stats_task) 761 if (stats_task)
758 set_cpus_allowed(stats_task, tmp_mask); 762 set_cpus_allowed_ptr(stats_task, &tmp_mask);
759 763
760 if (rcu_idle_cpu == -1) 764 if (rcu_idle_cpu == -1)
761 rcu_idle_cpu = num_online_cpus() - 1; 765 rcu_idle_cpu = num_online_cpus() - 1;
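
The rcupreempt.c and rcutorture.c hunks above follow the tree-wide switch from passing cpumask_t by value (set_cpus_allowed(task, mask), sched_setaffinity(0, mask)) to passing a pointer (set_cpus_allowed_ptr(task, &mask), sched_setaffinity(0, &mask)), and replace the CPU_MASK_ALL initializer with cpus_setall() on a local mask. A rough userspace sketch of why the by-pointer form matters once NR_CPUS gets large; all names below are made up.

#include <stdio.h>
#include <string.h>

#define FAKE_NR_CPUS 4096       /* a cpumask is then 512 bytes */

struct fake_cpumask {
        unsigned long bits[FAKE_NR_CPUS / (8 * sizeof(unsigned long))];
};

static void mask_setall(struct fake_cpumask *m)
{
        memset(m->bits, 0xff, sizeof(m->bits));         /* cpus_setall() analogue */
}

/* by-pointer: no half-kilobyte copy at every call site or on the stack */
static void set_allowed_ptr(int task, const struct fake_cpumask *m)
{
        printf("task %d: mask at %p, %zu bytes, passed without copying\n",
               task, (const void *)m, sizeof(*m));
}

int main(void)
{
        struct fake_cpumask tmp_mask;   /* one mask, reused for every task */

        mask_setall(&tmp_mask);
        for (int task = 0; task < 3; task++)
                set_allowed_ptr(task, &tmp_mask);
        return 0;
}
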
diff --git a/kernel/relay.c b/kernel/relay.c
index 4c035a8a248c..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = {
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
67 67
68/*
 69 * allocate an array of struct page pointers
70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{
73 struct page **array;
74 size_t pa_size = n_pages * sizeof(struct page *);
75
76 if (pa_size > PAGE_SIZE) {
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84}
85
86/*
 87 * free an array of struct page pointers
88 */
89static void relay_free_page_array(struct page **array)
90{
91 if (is_vmalloc_addr(array))
92 vfree(array);
93 else
94 kfree(array);
95}
96
68/** 97/**
69 * relay_mmap_buf: - mmap channel buffer to process address space 98 * relay_mmap_buf: - mmap channel buffer to process address space
70 * @buf: relay channel buffer 99 * @buf: relay channel buffer
@@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
109 *size = PAGE_ALIGN(*size); 138 *size = PAGE_ALIGN(*size);
110 n_pages = *size >> PAGE_SHIFT; 139 n_pages = *size >> PAGE_SHIFT;
111 140
112 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); 141 buf->page_array = relay_alloc_page_array(n_pages);
113 if (!buf->page_array) 142 if (!buf->page_array)
114 return NULL; 143 return NULL;
115 144
@@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
130depopulate: 159depopulate:
131 for (j = 0; j < i; j++) 160 for (j = 0; j < i; j++)
132 __free_page(buf->page_array[j]); 161 __free_page(buf->page_array[j]);
133 kfree(buf->page_array); 162 relay_free_page_array(buf->page_array);
134 return NULL; 163 return NULL;
135} 164}
136 165
@@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
189 vunmap(buf->start); 218 vunmap(buf->start);
190 for (i = 0; i < buf->page_count; i++) 219 for (i = 0; i < buf->page_count; i++)
191 __free_page(buf->page_array[i]); 220 __free_page(buf->page_array[i]);
192 kfree(buf->page_array); 221 relay_free_page_array(buf->page_array);
193 } 222 }
194 chan->buf[buf->cpu] = NULL; 223 chan->buf[buf->cpu] = NULL;
195 kfree(buf->padding); 224 kfree(buf->padding);
@@ -736,7 +765,7 @@ static int relay_file_open(struct inode *inode, struct file *filp)
736 kref_get(&buf->kref); 765 kref_get(&buf->kref);
737 filp->private_data = buf; 766 filp->private_data = buf;
738 767
739 return 0; 768 return nonseekable_open(inode, filp);
740} 769}
741 770
742/** 771/**
@@ -1056,6 +1085,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = {
1056 .get = generic_pipe_buf_get, 1085 .get = generic_pipe_buf_get,
1057}; 1086};
1058 1087
1088static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1089{
1090}
1091
1059/* 1092/*
1060 * subbuf_splice_actor - splice up to one subbuf's worth of data 1093 * subbuf_splice_actor - splice up to one subbuf's worth of data
1061 */ 1094 */
@@ -1083,6 +1116,7 @@ static int subbuf_splice_actor(struct file *in,
1083 .partial = partial, 1116 .partial = partial,
1084 .flags = flags, 1117 .flags = flags,
1085 .ops = &relay_pipe_buf_ops, 1118 .ops = &relay_pipe_buf_ops,
1119 .spd_release = relay_page_release,
1086 }; 1120 };
1087 1121
1088 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1122 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
@@ -1157,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
1157 ret = 0; 1191 ret = 0;
1158 spliced = 0; 1192 spliced = 0;
1159 1193
1160 while (len) { 1194 while (len && !spliced) {
1161 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1162 if (ret < 0) 1196 if (ret < 0)
1163 break; 1197 break;
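
relay_alloc_page_array()/relay_free_page_array(), added at the top of the relay.c diff, switch the page-pointer array from a single kcalloc() to a size-dependent choice: kzalloc() while the array itself fits in one page, vmalloc() beyond that, with is_vmalloc_addr() selecting the matching free path. A tiny standalone calculation of where that crossover sits, assuming 4 KiB pages and 64-bit pointers:

#include <stdio.h>

int main(void)
{
        const unsigned long page_size = 4096;            /* assumption: 4 KiB pages */
        const unsigned long ptr_size  = sizeof(void *);  /* 8 bytes on 64-bit */
        unsigned long n_pages = page_size / ptr_size;    /* largest kzalloc()-able array */

        printf("kzalloc covers up to %lu pages (%lu KiB of buffer), vmalloc beyond\n",
               n_pages, n_pages * page_size / 1024);     /* 512 pages, 2048 KiB */
        return 0;
}
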
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index efbfc0fc232f..d3c61b4ebef2 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,6 +10,7 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
13#include <linux/res_counter.h> 14#include <linux/res_counter.h>
14#include <linux/uaccess.h> 15#include <linux/uaccess.h>
15 16
@@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
27 } 28 }
28 29
29 counter->usage += val; 30 counter->usage += val;
31 if (counter->usage > counter->max_usage)
32 counter->max_usage = counter->usage;
30 return 0; 33 return 0;
31} 34}
32 35
@@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member)
65 switch (member) { 68 switch (member) {
66 case RES_USAGE: 69 case RES_USAGE:
67 return &counter->usage; 70 return &counter->usage;
71 case RES_MAX_USAGE:
72 return &counter->max_usage;
68 case RES_LIMIT: 73 case RES_LIMIT:
69 return &counter->limit; 74 return &counter->limit;
70 case RES_FAILCNT: 75 case RES_FAILCNT:
@@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
92 pos, buf, s - buf); 97 pos, buf, s - buf);
93} 98}
94 99
100u64 res_counter_read_u64(struct res_counter *counter, int member)
101{
102 return *res_counter_member(counter, member);
103}
104
95ssize_t res_counter_write(struct res_counter *counter, int member, 105ssize_t res_counter_write(struct res_counter *counter, int member,
96 const char __user *userbuf, size_t nbytes, loff_t *pos, 106 const char __user *userbuf, size_t nbytes, loff_t *pos,
97 int (*write_strategy)(char *st_buf, unsigned long long *val)) 107 int (*write_strategy)(char *st_buf, unsigned long long *val))
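
The res_counter.c hunks add a max_usage high-water mark updated in res_counter_charge_locked() and a res_counter_read_u64() accessor for it. A small userspace sketch of the ratchet behaviour, with made-up numbers:

#include <stdio.h>

struct counter { unsigned long long usage, max_usage, limit; };

static int charge(struct counter *c, unsigned long long val)
{
        if (c->usage + val > c->limit)
                return -1;                       /* over limit: fail the charge */
        c->usage += val;
        if (c->usage > c->max_usage)             /* remember the peak */
                c->max_usage = c->usage;
        return 0;
}

int main(void)
{
        struct counter c = { .limit = 100 };

        charge(&c, 60);
        c.usage -= 40;                           /* uncharge */
        charge(&c, 30);
        printf("usage=%llu max_usage=%llu\n", c.usage, c.max_usage); /* 50 and 60 */
        return 0;
}
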
diff --git a/kernel/resource.c b/kernel/resource.c
index 82aea814d409..74af2d7cb5a1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = {
131 131
132static int __init ioresources_init(void) 132static int __init ioresources_init(void)
133{ 133{
134 struct proc_dir_entry *entry; 134 proc_create("ioports", 0, NULL, &proc_ioports_operations);
135 135 proc_create("iomem", 0, NULL, &proc_iomem_operations);
136 entry = create_proc_entry("ioports", 0, NULL);
137 if (entry)
138 entry->proc_fops = &proc_ioports_operations;
139 entry = create_proc_entry("iomem", 0, NULL);
140 if (entry)
141 entry->proc_fops = &proc_iomem_operations;
142 return 0; 136 return 0;
143} 137}
144__initcall(ioresources_init); 138__initcall(ioresources_init);
@@ -486,6 +480,24 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
486 480
487EXPORT_SYMBOL(adjust_resource); 481EXPORT_SYMBOL(adjust_resource);
488 482
483/**
484 * resource_alignment - calculate resource's alignment
485 * @res: resource pointer
486 *
487 * Returns alignment on success, 0 (invalid alignment) on failure.
488 */
489resource_size_t resource_alignment(struct resource *res)
490{
491 switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
492 case IORESOURCE_SIZEALIGN:
493 return res->end - res->start + 1;
494 case IORESOURCE_STARTALIGN:
495 return res->start;
496 default:
497 return 0;
498 }
499}
500
489/* 501/*
490 * This is compatibility stuff for IO resources. 502 * This is compatibility stuff for IO resources.
491 * 503 *
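
resource_alignment(), added above, derives a resource's alignment from its flags: IORESOURCE_SIZEALIGN means the resource must be naturally aligned to its own size (end - start + 1), IORESOURCE_STARTALIGN means the requested start address is the alignment, and anything else is treated as invalid. A standalone sketch with hypothetical resources and illustrative flag values (not the real bit definitions):

#include <stdio.h>

#define IORESOURCE_SIZEALIGN  0x1   /* illustrative values only */
#define IORESOURCE_STARTALIGN 0x2

struct res { unsigned long start, end, flags; };

static unsigned long alignment(const struct res *r)
{
        switch (r->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
        case IORESOURCE_SIZEALIGN:
                return r->end - r->start + 1;   /* align to the resource size */
        case IORESOURCE_STARTALIGN:
                return r->start;                /* align to the requested start */
        default:
                return 0;                       /* invalid alignment */
        }
}

int main(void)
{
        struct res bar   = { 0xe0000000, 0xe0000fff, IORESOURCE_SIZEALIGN };
        struct res fixed = { 0x3f8, 0x3ff, IORESOURCE_STARTALIGN };

        printf("bar: 0x%lx, fixed: 0x%lx\n", alignment(&bar), alignment(&fixed));
        return 0;                               /* 0x1000 and 0x3f8 */
}
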
diff --git a/kernel/sched.c b/kernel/sched.c
index 28c73f07efb2..34bcc5bc120e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -66,6 +66,10 @@
66#include <linux/unistd.h> 66#include <linux/unistd.h>
67#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h> 68#include <linux/hrtimer.h>
69#include <linux/tick.h>
70#include <linux/bootmem.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
69 73
70#include <asm/tlb.h> 74#include <asm/tlb.h>
71#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
@@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void)
114 */ 118 */
115#define DEF_TIMESLICE (100 * HZ / 1000) 119#define DEF_TIMESLICE (100 * HZ / 1000)
116 120
121/*
122 * single value that denotes runtime == period, ie unlimited time.
123 */
124#define RUNTIME_INF ((u64)~0ULL)
125
117#ifdef CONFIG_SMP 126#ifdef CONFIG_SMP
118/* 127/*
119 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -155,6 +164,84 @@ struct rt_prio_array {
155 struct list_head queue[MAX_RT_PRIO]; 164 struct list_head queue[MAX_RT_PRIO];
156}; 165};
157 166
167struct rt_bandwidth {
168 /* nests inside the rq lock: */
169 spinlock_t rt_runtime_lock;
170 ktime_t rt_period;
171 u64 rt_runtime;
172 struct hrtimer rt_period_timer;
173};
174
175static struct rt_bandwidth def_rt_bandwidth;
176
177static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
178
179static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
180{
181 struct rt_bandwidth *rt_b =
182 container_of(timer, struct rt_bandwidth, rt_period_timer);
183 ktime_t now;
184 int overrun;
185 int idle = 0;
186
187 for (;;) {
188 now = hrtimer_cb_get_time(timer);
189 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
190
191 if (!overrun)
192 break;
193
194 idle = do_sched_rt_period_timer(rt_b, overrun);
195 }
196
197 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
198}
199
200static
201void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
202{
203 rt_b->rt_period = ns_to_ktime(period);
204 rt_b->rt_runtime = runtime;
205
206 spin_lock_init(&rt_b->rt_runtime_lock);
207
208 hrtimer_init(&rt_b->rt_period_timer,
209 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
210 rt_b->rt_period_timer.function = sched_rt_period_timer;
211 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
212}
213
214static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
215{
216 ktime_t now;
217
218 if (rt_b->rt_runtime == RUNTIME_INF)
219 return;
220
221 if (hrtimer_active(&rt_b->rt_period_timer))
222 return;
223
224 spin_lock(&rt_b->rt_runtime_lock);
225 for (;;) {
226 if (hrtimer_active(&rt_b->rt_period_timer))
227 break;
228
229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
231 hrtimer_start(&rt_b->rt_period_timer,
232 rt_b->rt_period_timer.expires,
233 HRTIMER_MODE_ABS);
234 }
235 spin_unlock(&rt_b->rt_runtime_lock);
236}
237
238#ifdef CONFIG_RT_GROUP_SCHED
239static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
240{
241 hrtimer_cancel(&rt_b->rt_period_timer);
242}
243#endif
244
158#ifdef CONFIG_GROUP_SCHED 245#ifdef CONFIG_GROUP_SCHED
159 246
160#include <linux/cgroup.h> 247#include <linux/cgroup.h>
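
sched_rt_period_timer(), added above, is the usual self-rearming hrtimer pattern: on each expiry it calls hrtimer_forward() in a loop until the timer sits in the future, feeding the returned overrun count into do_sched_rt_period_timer() so every elapsed period refills RT runtime exactly once. A userspace sketch of the overrun arithmetic, using plain nanosecond integers instead of ktime_t (the real helper computes the overrun with a division rather than a loop):

#include <stdio.h>

typedef unsigned long long u64;

static u64 forward(u64 *expires, u64 now, u64 period)
{
        u64 overrun = 0;

        while (*expires <= now) {        /* catch up in whole periods */
                *expires += period;
                overrun++;
        }
        return overrun;
}

int main(void)
{
        u64 period  = 1000000000ULL;     /* 1 s RT period */
        u64 expires = 1 * period;
        u64 now     = 3500000000ULL;     /* handler ran late, at 3.5 s */

        u64 overrun = forward(&expires, now, period);
        printf("overrun=%llu, next expiry=%llu ns\n", overrun, expires);
        /* overrun=3, next expiry at 4 s: three periods' worth of runtime to account */
        return 0;
}
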
@@ -181,29 +268,39 @@ struct task_group {
181 struct sched_rt_entity **rt_se; 268 struct sched_rt_entity **rt_se;
182 struct rt_rq **rt_rq; 269 struct rt_rq **rt_rq;
183 270
184 u64 rt_runtime; 271 struct rt_bandwidth rt_bandwidth;
185#endif 272#endif
186 273
187 struct rcu_head rcu; 274 struct rcu_head rcu;
188 struct list_head list; 275 struct list_head list;
276
277 struct task_group *parent;
278 struct list_head siblings;
279 struct list_head children;
189}; 280};
190 281
282#ifdef CONFIG_USER_SCHED
283
284/*
285 * Root task group.
286 * Every UID task group (including init_task_group aka UID-0) will
287 * be a child to this group.
288 */
289struct task_group root_task_group;
290
191#ifdef CONFIG_FAIR_GROUP_SCHED 291#ifdef CONFIG_FAIR_GROUP_SCHED
192/* Default task group's sched entity on each cpu */ 292/* Default task group's sched entity on each cpu */
193static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 293static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
194/* Default task group's cfs_rq on each cpu */ 294/* Default task group's cfs_rq on each cpu */
195static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 295static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
196
197static struct sched_entity *init_sched_entity_p[NR_CPUS];
198static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
199#endif 296#endif
200 297
201#ifdef CONFIG_RT_GROUP_SCHED 298#ifdef CONFIG_RT_GROUP_SCHED
202static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 299static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
203static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 300static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
204 301#endif
205static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; 302#else
206static struct rt_rq *init_rt_rq_p[NR_CPUS]; 303#define root_task_group init_task_group
207#endif 304#endif
208 305
209/* task_group_lock serializes add/remove of task groups and also changes to 306/* task_group_lock serializes add/remove of task groups and also changes to
@@ -221,23 +318,15 @@ static DEFINE_MUTEX(doms_cur_mutex);
221# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
222#endif 319#endif
223 320
321#define MIN_SHARES 2
322
224static int init_task_group_load = INIT_TASK_GROUP_LOAD; 323static int init_task_group_load = INIT_TASK_GROUP_LOAD;
225#endif 324#endif
226 325
227/* Default task group. 326/* Default task group.
228 * Every task in system belong to this group at bootup. 327 * Every task in system belong to this group at bootup.
229 */ 328 */
230struct task_group init_task_group = { 329struct task_group init_task_group;
231#ifdef CONFIG_FAIR_GROUP_SCHED
232 .se = init_sched_entity_p,
233 .cfs_rq = init_cfs_rq_p,
234#endif
235
236#ifdef CONFIG_RT_GROUP_SCHED
237 .rt_se = init_sched_rt_entity_p,
238 .rt_rq = init_rt_rq_p,
239#endif
240};
241 330
242/* return group to which a task belongs */ 331/* return group to which a task belongs */
243static inline struct task_group *task_group(struct task_struct *p) 332static inline struct task_group *task_group(struct task_struct *p)
@@ -297,8 +386,12 @@ struct cfs_rq {
297 386
298 struct rb_root tasks_timeline; 387 struct rb_root tasks_timeline;
299 struct rb_node *rb_leftmost; 388 struct rb_node *rb_leftmost;
300 struct rb_node *rb_load_balance_curr; 389
301 /* 'curr' points to currently running entity on this cfs_rq. 390 struct list_head tasks;
391 struct list_head *balance_iterator;
392
393 /*
394 * 'curr' points to currently running entity on this cfs_rq.
302 * It is set to NULL otherwise (i.e when none are currently running). 395 * It is set to NULL otherwise (i.e when none are currently running).
303 */ 396 */
304 struct sched_entity *curr, *next; 397 struct sched_entity *curr, *next;
@@ -318,6 +411,43 @@ struct cfs_rq {
318 */ 411 */
319 struct list_head leaf_cfs_rq_list; 412 struct list_head leaf_cfs_rq_list;
320 struct task_group *tg; /* group that "owns" this runqueue */ 413 struct task_group *tg; /* group that "owns" this runqueue */
414
415#ifdef CONFIG_SMP
416 unsigned long task_weight;
417 unsigned long shares;
418 /*
419 * We need space to build a sched_domain wide view of the full task
420 * group tree, in order to avoid depending on dynamic memory allocation
421 * during the load balancing we place this in the per cpu task group
422 * hierarchy. This limits the load balancing to one instance per cpu,
423 * but more should not be needed anyway.
424 */
425 struct aggregate_struct {
426 /*
427 * load = weight(cpus) * f(tg)
428 *
429 * Where f(tg) is the recursive weight fraction assigned to
430 * this group.
431 */
432 unsigned long load;
433
434 /*
435 * part of the group weight distributed to this span.
436 */
437 unsigned long shares;
438
439 /*
440 * The sum of all runqueue weights within this span.
441 */
442 unsigned long rq_weight;
443
444 /*
445 * Weight contributed by tasks; this is the part we can
446 * influence by moving tasks around.
447 */
448 unsigned long task_weight;
449 } aggregate;
450#endif
321#endif 451#endif
322}; 452};
323 453
@@ -334,6 +464,9 @@ struct rt_rq {
334#endif 464#endif
335 int rt_throttled; 465 int rt_throttled;
336 u64 rt_time; 466 u64 rt_time;
467 u64 rt_runtime;
468 /* Nests inside the rq lock: */
469 spinlock_t rt_runtime_lock;
337 470
338#ifdef CONFIG_RT_GROUP_SCHED 471#ifdef CONFIG_RT_GROUP_SCHED
339 unsigned long rt_nr_boosted; 472 unsigned long rt_nr_boosted;
@@ -396,6 +529,7 @@ struct rq {
396 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 529 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
397 unsigned char idle_at_tick; 530 unsigned char idle_at_tick;
398#ifdef CONFIG_NO_HZ 531#ifdef CONFIG_NO_HZ
532 unsigned long last_tick_seen;
399 unsigned char in_nohz_recently; 533 unsigned char in_nohz_recently;
400#endif 534#endif
401 /* capture load from *all* tasks on this cpu: */ 535 /* capture load from *all* tasks on this cpu: */
@@ -405,8 +539,6 @@ struct rq {
405 539
406 struct cfs_rq cfs; 540 struct cfs_rq cfs;
407 struct rt_rq rt; 541 struct rt_rq rt;
408 u64 rt_period_expire;
409 int rt_throttled;
410 542
411#ifdef CONFIG_FAIR_GROUP_SCHED 543#ifdef CONFIG_FAIR_GROUP_SCHED
412 /* list of leaf cfs_rq on this cpu: */ 544 /* list of leaf cfs_rq on this cpu: */
@@ -499,6 +631,32 @@ static inline int cpu_of(struct rq *rq)
499#endif 631#endif
500} 632}
501 633
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
502/* 660/*
503 * Update the per-runqueue clock, as finegrained as the platform can give 661 * Update the per-runqueue clock, as finegrained as the platform can give
504 * us, but without assuming monotonicity, etc.: 662 * us, but without assuming monotonicity, etc.:
@@ -523,9 +681,12 @@ static void __update_rq_clock(struct rq *rq)
523 /* 681 /*
524 * Catch too large forward jumps too: 682 * Catch too large forward jumps too:
525 */ 683 */
526 if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { 684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
527 if (clock < rq->tick_timestamp + TICK_NSEC) 685 u64 max_time = rq->tick_timestamp + max_jump;
528 clock = rq->tick_timestamp + TICK_NSEC; 686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
529 else 690 else
530 clock++; 691 clock++;
531 rq->clock_overflows++; 692 rq->clock_overflows++;
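
With the hunk above, the clock clamp scales with how long a nohz-idle cpu has gone without a tick: the allowed forward jump becomes (jiffies - last_tick_seen + 2) ticks instead of a single TICK_NSEC. A worked example with hypothetical jiffies values, assuming HZ=1000:

#include <stdio.h>

int main(void)
{
        unsigned long long tick_nsec = 1000000;   /* 1 ms tick at HZ=1000 */
        unsigned long jiffies = 1005, last_tick_seen = 1000;
        int nohz_on = 1;

        unsigned long long max_skipped = nohz_on ? jiffies - last_tick_seen + 2 : 1;
        printf("max clock jump: %llu ns\n", max_skipped * tick_nsec);  /* 7000000 */
        return 0;
}
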
@@ -561,23 +722,6 @@ static void update_rq_clock(struct rq *rq)
561#define task_rq(p) cpu_rq(task_cpu(p)) 722#define task_rq(p) cpu_rq(task_cpu(p))
562#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 723#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
563 724
564unsigned long rt_needs_cpu(int cpu)
565{
566 struct rq *rq = cpu_rq(cpu);
567 u64 delta;
568
569 if (!rq->rt_throttled)
570 return 0;
571
572 if (rq->clock > rq->rt_period_expire)
573 return 1;
574
575 delta = rq->rt_period_expire - rq->clock;
576 do_div(delta, NSEC_PER_SEC / HZ);
577
578 return (unsigned long)delta;
579}
580
581/* 725/*
582 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
583 */ 727 */
@@ -590,22 +734,137 @@ unsigned long rt_needs_cpu(int cpu)
590/* 734/*
591 * Debugging: various feature bits 735 * Debugging: various feature bits
592 */ 736 */
737
738#define SCHED_FEAT(name, enabled) \
739 __SCHED_FEAT_##name ,
740
593enum { 741enum {
594 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, 742#include "sched_features.h"
595 SCHED_FEAT_WAKEUP_PREEMPT = 2,
596 SCHED_FEAT_START_DEBIT = 4,
597 SCHED_FEAT_HRTICK = 8,
598 SCHED_FEAT_DOUBLE_TICK = 16,
599}; 743};
600 744
745#undef SCHED_FEAT
746
747#define SCHED_FEAT(name, enabled) \
748 (1UL << __SCHED_FEAT_##name) * enabled |
749
601const_debug unsigned int sysctl_sched_features = 750const_debug unsigned int sysctl_sched_features =
602 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | 751#include "sched_features.h"
603 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 752 0;
604 SCHED_FEAT_START_DEBIT * 1 | 753
605 SCHED_FEAT_HRTICK * 1 | 754#undef SCHED_FEAT
606 SCHED_FEAT_DOUBLE_TICK * 0; 755
756#ifdef CONFIG_SCHED_DEBUG
757#define SCHED_FEAT(name, enabled) \
758 #name ,
759
760__read_mostly char *sched_feat_names[] = {
761#include "sched_features.h"
762 NULL
763};
764
765#undef SCHED_FEAT
766
767int sched_feat_open(struct inode *inode, struct file *filp)
768{
769 filp->private_data = inode->i_private;
770 return 0;
771}
772
773static ssize_t
774sched_feat_read(struct file *filp, char __user *ubuf,
775 size_t cnt, loff_t *ppos)
776{
777 char *buf;
778 int r = 0;
779 int len = 0;
780 int i;
781
782 for (i = 0; sched_feat_names[i]; i++) {
783 len += strlen(sched_feat_names[i]);
784 len += 4;
785 }
786
787 buf = kmalloc(len + 2, GFP_KERNEL);
788 if (!buf)
789 return -ENOMEM;
790
791 for (i = 0; sched_feat_names[i]; i++) {
792 if (sysctl_sched_features & (1UL << i))
793 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
794 else
795 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
796 }
607 797
608#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 798 r += sprintf(buf + r, "\n");
799 WARN_ON(r >= len + 2);
800
801 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
802
803 kfree(buf);
804
805 return r;
806}
807
808static ssize_t
809sched_feat_write(struct file *filp, const char __user *ubuf,
810 size_t cnt, loff_t *ppos)
811{
812 char buf[64];
813 char *cmp = buf;
814 int neg = 0;
815 int i;
816
817 if (cnt > 63)
818 cnt = 63;
819
820 if (copy_from_user(&buf, ubuf, cnt))
821 return -EFAULT;
822
823 buf[cnt] = 0;
824
825 if (strncmp(buf, "NO_", 3) == 0) {
826 neg = 1;
827 cmp += 3;
828 }
829
830 for (i = 0; sched_feat_names[i]; i++) {
831 int len = strlen(sched_feat_names[i]);
832
833 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
834 if (neg)
835 sysctl_sched_features &= ~(1UL << i);
836 else
837 sysctl_sched_features |= (1UL << i);
838 break;
839 }
840 }
841
842 if (!sched_feat_names[i])
843 return -EINVAL;
844
845 filp->f_pos += cnt;
846
847 return cnt;
848}
849
850static struct file_operations sched_feat_fops = {
851 .open = sched_feat_open,
852 .read = sched_feat_read,
853 .write = sched_feat_write,
854};
855
856static __init int sched_init_debug(void)
857{
858 debugfs_create_file("sched_features", 0644, NULL, NULL,
859 &sched_feat_fops);
860
861 return 0;
862}
863late_initcall(sched_init_debug);
864
865#endif
866
867#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
609 868
610/* 869/*
611 * Number of tasks to iterate in a single balance run. 870 * Number of tasks to iterate in a single balance run.
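
The feature-bit rework above is an X-macro: sched_features.h is included three times with different definitions of SCHED_FEAT() to generate the enum of bit positions, the default sysctl_sched_features mask, and the name table that sched_feat_read()/sched_feat_write() expose through debugfs. A self-contained illustration of the same trick, with FEATURE_LIST standing in for the header and a made-up feature list:

#include <stdio.h>

#define FEATURE_LIST \
        SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) \
        SCHED_FEAT(WAKEUP_PREEMPT, 1) \
        SCHED_FEAT(DOUBLE_TICK, 0)

/* pass 1: bit positions */
#define SCHED_FEAT(name, enabled) __SCHED_FEAT_##name,
enum { FEATURE_LIST };
#undef SCHED_FEAT

/* pass 2: default mask */
#define SCHED_FEAT(name, enabled) (1UL << __SCHED_FEAT_##name) * enabled |
static unsigned int sysctl_features = FEATURE_LIST 0;
#undef SCHED_FEAT

/* pass 3: name table */
#define SCHED_FEAT(name, enabled) #name,
static const char *feat_names[] = { FEATURE_LIST NULL };
#undef SCHED_FEAT

int main(void)
{
        for (int i = 0; feat_names[i]; i++)
                printf("%s%s ", (sysctl_features & (1U << i)) ? "" : "NO_",
                       feat_names[i]);
        printf("\n");   /* NEW_FAIR_SLEEPERS WAKEUP_PREEMPT NO_DOUBLE_TICK */
        return 0;
}
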
@@ -627,16 +886,52 @@ static __read_mostly int scheduler_running;
627 */ 886 */
628int sysctl_sched_rt_runtime = 950000; 887int sysctl_sched_rt_runtime = 950000;
629 888
630/* 889static inline u64 global_rt_period(void)
631 * single value that denotes runtime == period, ie unlimited time. 890{
632 */ 891 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
633#define RUNTIME_INF ((u64)~0ULL) 892}
893
894static inline u64 global_rt_runtime(void)
895{
896 if (sysctl_sched_rt_period < 0)
897 return RUNTIME_INF;
898
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900}
901
902static const unsigned long long time_sync_thresh = 100000;
903
904static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
634 906
635/* 907/*
636 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 908 * Global lock which we take every now and then to synchronize
637 * clock constructed from sched_clock(): 909 * the CPUs time. This method is not warp-safe, but it's good
910 * enough to synchronize slowly diverging time sources and thus
911 * it's good enough for tracing:
638 */ 912 */
639unsigned long long cpu_clock(int cpu) 913static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time;
915
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
917{
918 unsigned long flags;
919
920 spin_lock_irqsave(&time_sync_lock, flags);
921
922 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time;
924 time = prev_global_time;
925 } else {
926 prev_global_time = time;
927 }
928
929 spin_unlock_irqrestore(&time_sync_lock, flags);
930
931 return time;
932}
933
934static unsigned long long __cpu_clock(int cpu)
640{ 935{
641 unsigned long long now; 936 unsigned long long now;
642 unsigned long flags; 937 unsigned long flags;
@@ -657,6 +952,24 @@ unsigned long long cpu_clock(int cpu)
657 952
658 return now; 953 return now;
659} 954}
955
956/*
957 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
958 * clock constructed from sched_clock():
959 */
960unsigned long long cpu_clock(int cpu)
961{
962 unsigned long long prev_cpu_time, time, delta_time;
963
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time;
967
968 if (unlikely(delta_time > time_sync_thresh))
969 time = __sync_cpu_clock(time, cpu);
970
971 return time;
972}
660EXPORT_SYMBOL_GPL(cpu_clock); 973EXPORT_SYMBOL_GPL(cpu_clock);
661 974
662#ifndef prepare_arch_switch 975#ifndef prepare_arch_switch
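
The cpu_clock() change above layers a slow synchronization pass on top of the per-cpu clock: each cpu keeps a time_offset, and once its reading drifts more than time_sync_thresh behind the last globally observed time, __sync_cpu_clock() bumps the offset so readings never fall far behind (good enough for tracing, not warp-proof). A userspace sketch of that clamp, with the time_sync_lock locking omitted:

#include <stdio.h>

#define NCPU 2

static unsigned long long time_offset[NCPU];
static unsigned long long prev_global_time;

static unsigned long long sync_cpu_clock(unsigned long long time, int cpu)
{
        if (time < prev_global_time) {
                time_offset[cpu] += prev_global_time - time;  /* catch this cpu up */
                time = prev_global_time;
        } else {
                prev_global_time = time;                      /* new global high-water mark */
        }
        return time;
}

int main(void)
{
        /* cpu1's raw clock lags cpu0 by 300 units */
        printf("%llu\n", sync_cpu_clock(1000 + time_offset[0], 0)); /* 1000 */
        printf("%llu\n", sync_cpu_clock(700 + time_offset[1], 1));  /* clamped to 1000 */
        printf("%llu\n", sync_cpu_clock(900 + time_offset[1], 1));  /* 900 + 300 = 1200 */
        return 0;
}
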
@@ -1052,6 +1365,49 @@ static void resched_cpu(int cpu)
1052 resched_task(cpu_curr(cpu)); 1365 resched_task(cpu_curr(cpu));
1053 spin_unlock_irqrestore(&rq->lock, flags); 1366 spin_unlock_irqrestore(&rq->lock, flags);
1054} 1367}
1368
1369#ifdef CONFIG_NO_HZ
1370/*
1371 * When add_timer_on() enqueues a timer into the timer wheel of an
1372 * idle CPU then this timer might expire before the next timer event
1373 * which is scheduled to wake up that CPU. In case of a completely
1374 * idle system the next event might even be infinite time into the
1375 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1376 * leaves the inner idle loop so the newly added timer is taken into
1377 * account when the CPU goes back to idle and evaluates the timer
1378 * wheel for the next timer event.
1379 */
1380void wake_up_idle_cpu(int cpu)
1381{
1382 struct rq *rq = cpu_rq(cpu);
1383
1384 if (cpu == smp_processor_id())
1385 return;
1386
1387 /*
1388 * This is safe, as this function is called with the timer
1389 * wheel base lock of (cpu) held. When the CPU is on the way
1390 * to idle and has not yet set rq->curr to idle then it will
1391 * be serialized on the timer wheel base lock and take the new
1392 * timer into account automatically.
1393 */
1394 if (rq->curr != rq->idle)
1395 return;
1396
1397 /*
1398 * We can set TIF_RESCHED on the idle task of the other CPU
1399 * lockless. The worst case is that the other CPU runs the
1400 * idle task through an additional NOOP schedule()
1401 */
1402 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1403
1404 /* NEED_RESCHED must be visible before we test polling */
1405 smp_mb();
1406 if (!tsk_is_polling(rq->idle))
1407 smp_send_reschedule(cpu);
1408}
1409#endif
1410
1055#else 1411#else
1056static void __resched_task(struct task_struct *p, int tif_bit) 1412static void __resched_task(struct task_struct *p, int tif_bit)
1057{ 1413{
@@ -1073,6 +1429,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1073 */ 1429 */
1074#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1430#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1075 1431
1432/*
1433 * delta *= weight / lw
1434 */
1076static unsigned long 1435static unsigned long
1077calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1436calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1078 struct load_weight *lw) 1437 struct load_weight *lw)
@@ -1095,12 +1454,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1095 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1454 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1096} 1455}
1097 1456
1098static inline unsigned long
1099calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1100{
1101 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1102}
1103
1104static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1457static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1105{ 1458{
1106 lw->weight += inc; 1459 lw->weight += inc;
@@ -1198,11 +1551,347 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1198static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1551static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1199#endif 1552#endif
1200 1553
1554static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1555{
1556 update_load_add(&rq->load, load);
1557}
1558
1559static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1560{
1561 update_load_sub(&rq->load, load);
1562}
1563
1201#ifdef CONFIG_SMP 1564#ifdef CONFIG_SMP
1202static unsigned long source_load(int cpu, int type); 1565static unsigned long source_load(int cpu, int type);
1203static unsigned long target_load(int cpu, int type); 1566static unsigned long target_load(int cpu, int type);
1204static unsigned long cpu_avg_load_per_task(int cpu); 1567static unsigned long cpu_avg_load_per_task(int cpu);
1205static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1568static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1569
1570#ifdef CONFIG_FAIR_GROUP_SCHED
1571
1572/*
1573 * Group load balancing.
1574 *
1575 * We calculate a few balance domain wide aggregate numbers; load and weight.
1576 * Given the pictures below, and assuming each item has equal weight:
1577 *
1578 * root 1 - thread
1579 * / | \ A - group
1580 * A 1 B
1581 * /|\ / \
1582 * C 2 D 3 4
1583 * | |
1584 * 5 6
1585 *
1586 * load:
1587 * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
1588 * which equals 1/9-th of the total load.
1589 *
1590 * shares:
1591 * The weight of this group on the selected cpus.
1592 *
1593 * rq_weight:
 1594 * Direct sum of each cpu's rq weight, e.g. A would get 3 while
1595 * B would get 2.
1596 *
1597 * task_weight:
1598 * Part of the rq_weight contributed by tasks; all groups except B would
1599 * get 1, B gets 2.
1600 */
1601
1602static inline struct aggregate_struct *
1603aggregate(struct task_group *tg, struct sched_domain *sd)
1604{
1605 return &tg->cfs_rq[sd->first_cpu]->aggregate;
1606}
1607
1608typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
1609
1610/*
1611 * Iterate the full tree, calling @down when first entering a node and @up when
1612 * leaving it for the final time.
1613 */
1614static
1615void aggregate_walk_tree(aggregate_func down, aggregate_func up,
1616 struct sched_domain *sd)
1617{
1618 struct task_group *parent, *child;
1619
1620 rcu_read_lock();
1621 parent = &root_task_group;
1622down:
1623 (*down)(parent, sd);
1624 list_for_each_entry_rcu(child, &parent->children, siblings) {
1625 parent = child;
1626 goto down;
1627
1628up:
1629 continue;
1630 }
1631 (*up)(parent, sd);
1632
1633 child = parent;
1634 parent = parent->parent;
1635 if (parent)
1636 goto up;
1637 rcu_read_unlock();
1638}
1639
1640/*
1641 * Calculate the aggregate runqueue weight.
1642 */
1643static
1644void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
1645{
1646 unsigned long rq_weight = 0;
1647 unsigned long task_weight = 0;
1648 int i;
1649
1650 for_each_cpu_mask(i, sd->span) {
1651 rq_weight += tg->cfs_rq[i]->load.weight;
1652 task_weight += tg->cfs_rq[i]->task_weight;
1653 }
1654
1655 aggregate(tg, sd)->rq_weight = rq_weight;
1656 aggregate(tg, sd)->task_weight = task_weight;
1657}
1658
1659/*
1660 * Compute the weight of this group on the given cpus.
1661 */
1662static
1663void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
1664{
1665 unsigned long shares = 0;
1666 int i;
1667
1668 for_each_cpu_mask(i, sd->span)
1669 shares += tg->cfs_rq[i]->shares;
1670
1671 if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
1672 shares = tg->shares;
1673
1674 aggregate(tg, sd)->shares = shares;
1675}
1676
1677/*
1678 * Compute the load fraction assigned to this group, relies on the aggregate
1679 * weight and this group's parent's load, i.e. top-down.
1680 */
1681static
1682void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
1683{
1684 unsigned long load;
1685
1686 if (!tg->parent) {
1687 int i;
1688
1689 load = 0;
1690 for_each_cpu_mask(i, sd->span)
1691 load += cpu_rq(i)->load.weight;
1692
1693 } else {
1694 load = aggregate(tg->parent, sd)->load;
1695
1696 /*
1697 * shares is our weight in the parent's rq so
1698 * shares/parent->rq_weight gives our fraction of the load
1699 */
1700 load *= aggregate(tg, sd)->shares;
1701 load /= aggregate(tg->parent, sd)->rq_weight + 1;
1702 }
1703
1704 aggregate(tg, sd)->load = load;
1705}
1706
1707static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1708
1709/*
1710 * Calculate and set the cpu's group shares.
1711 */
1712static void
1713__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1714 int tcpu)
1715{
1716 int boost = 0;
1717 unsigned long shares;
1718 unsigned long rq_weight;
1719
1720 if (!tg->se[tcpu])
1721 return;
1722
1723 rq_weight = tg->cfs_rq[tcpu]->load.weight;
1724
1725 /*
1726 * If there are currently no tasks on the cpu pretend there is one of
1727 * average load so that when a new task gets to run here it will not
1728 * get delayed by group starvation.
1729 */
1730 if (!rq_weight) {
1731 boost = 1;
1732 rq_weight = NICE_0_LOAD;
1733 }
1734
1735 /*
1736 * \Sum shares * rq_weight
1737 * shares = -----------------------
1738 * \Sum rq_weight
1739 *
1740 */
1741 shares = aggregate(tg, sd)->shares * rq_weight;
1742 shares /= aggregate(tg, sd)->rq_weight + 1;
1743
1744 /*
1745 * record the actual number of shares, not the boosted amount.
1746 */
1747 tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
1748
1749 if (shares < MIN_SHARES)
1750 shares = MIN_SHARES;
1751
1752 __set_se_shares(tg->se[tcpu], shares);
1753}
1754
1755/*
1756 * Re-adjust the weights on the cpu the task came from and on the cpu the
1757 * task went to.
1758 */
1759static void
1760__move_group_shares(struct task_group *tg, struct sched_domain *sd,
1761 int scpu, int dcpu)
1762{
1763 unsigned long shares;
1764
1765 shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
1766
1767 __update_group_shares_cpu(tg, sd, scpu);
1768 __update_group_shares_cpu(tg, sd, dcpu);
1769
1770 /*
 1771 * ensure we never lose shares due to rounding errors in the
1772 * above redistribution.
1773 */
1774 shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
1775 if (shares)
1776 tg->cfs_rq[dcpu]->shares += shares;
1777}
1778
1779/*
1780 * Because changing a group's shares changes the weight of the super-group
1781 * we need to walk up the tree and change all shares until we hit the root.
1782 */
1783static void
1784move_group_shares(struct task_group *tg, struct sched_domain *sd,
1785 int scpu, int dcpu)
1786{
1787 while (tg) {
1788 __move_group_shares(tg, sd, scpu, dcpu);
1789 tg = tg->parent;
1790 }
1791}
1792
1793static
1794void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
1795{
1796 unsigned long shares = aggregate(tg, sd)->shares;
1797 int i;
1798
1799 for_each_cpu_mask(i, sd->span) {
1800 struct rq *rq = cpu_rq(i);
1801 unsigned long flags;
1802
1803 spin_lock_irqsave(&rq->lock, flags);
1804 __update_group_shares_cpu(tg, sd, i);
1805 spin_unlock_irqrestore(&rq->lock, flags);
1806 }
1807
1808 aggregate_group_shares(tg, sd);
1809
1810 /*
 1811 * ensure we never lose shares due to rounding errors in the
1812 * above redistribution.
1813 */
1814 shares -= aggregate(tg, sd)->shares;
1815 if (shares) {
1816 tg->cfs_rq[sd->first_cpu]->shares += shares;
1817 aggregate(tg, sd)->shares += shares;
1818 }
1819}
1820
1821/*
1822 * Calculate the accumulative weight and recursive load of each task group
1823 * while walking down the tree.
1824 */
1825static
1826void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
1827{
1828 aggregate_group_weight(tg, sd);
1829 aggregate_group_shares(tg, sd);
1830 aggregate_group_load(tg, sd);
1831}
1832
1833/*
1834 * Rebalance the cpu shares while walking back up the tree.
1835 */
1836static
1837void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
1838{
1839 aggregate_group_set_shares(tg, sd);
1840}
1841
1842static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
1843
1844static void __init init_aggregate(void)
1845{
1846 int i;
1847
1848 for_each_possible_cpu(i)
1849 spin_lock_init(&per_cpu(aggregate_lock, i));
1850}
1851
1852static int get_aggregate(struct sched_domain *sd)
1853{
1854 if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
1855 return 0;
1856
1857 aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
1858 return 1;
1859}
1860
1861static void put_aggregate(struct sched_domain *sd)
1862{
1863 spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
1864}
1865
1866static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1867{
1868 cfs_rq->shares = shares;
1869}
1870
1871#else
1872
1873static inline void init_aggregate(void)
1874{
1875}
1876
1877static inline int get_aggregate(struct sched_domain *sd)
1878{
1879 return 0;
1880}
1881
1882static inline void put_aggregate(struct sched_domain *sd)
1883{
1884}
1885#endif
1886
1887#else /* CONFIG_SMP */
1888
1889#ifdef CONFIG_FAIR_GROUP_SCHED
1890static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1891{
1892}
1893#endif
1894
1206#endif /* CONFIG_SMP */ 1895#endif /* CONFIG_SMP */
1207 1896
1208#include "sched_stats.h" 1897#include "sched_stats.h"
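
The aggregate load-balancing code above distributes a group's shares across the cpus of a sched domain in proportion to each cpu's runqueue weight, per the formula in __update_group_shares_cpu(). A worked example with hypothetical numbers (tg->shares = 1024, two cpus whose cfs_rq weights are 3072 and 1024):

#include <stdio.h>

#define MIN_SHARES 2

int main(void)
{
        unsigned long group_shares = 1024;           /* aggregate shares of the group */
        unsigned long rq_weight[2] = { 3072, 1024 }; /* per-cpu cfs_rq load.weight */
        unsigned long total = rq_weight[0] + rq_weight[1];

        for (int cpu = 0; cpu < 2; cpu++) {
                /* shares = \Sum shares * rq_weight / \Sum rq_weight */
                unsigned long shares = group_shares * rq_weight[cpu] / (total + 1);
                if (shares < MIN_SHARES)
                        shares = MIN_SHARES;
                printf("cpu%d: %lu shares\n", cpu, shares);  /* ~768 and ~256 */
        }
        return 0;
}
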
@@ -1215,26 +1904,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1215 1904
1216#define sched_class_highest (&rt_sched_class) 1905#define sched_class_highest (&rt_sched_class)
1217 1906
1218static inline void inc_load(struct rq *rq, const struct task_struct *p) 1907static void inc_nr_running(struct rq *rq)
1219{
1220 update_load_add(&rq->load, p->se.load.weight);
1221}
1222
1223static inline void dec_load(struct rq *rq, const struct task_struct *p)
1224{
1225 update_load_sub(&rq->load, p->se.load.weight);
1226}
1227
1228static void inc_nr_running(struct task_struct *p, struct rq *rq)
1229{ 1908{
1230 rq->nr_running++; 1909 rq->nr_running++;
1231 inc_load(rq, p);
1232} 1910}
1233 1911
1234static void dec_nr_running(struct task_struct *p, struct rq *rq) 1912static void dec_nr_running(struct rq *rq)
1235{ 1913{
1236 rq->nr_running--; 1914 rq->nr_running--;
1237 dec_load(rq, p);
1238} 1915}
1239 1916
1240static void set_load_weight(struct task_struct *p) 1917static void set_load_weight(struct task_struct *p)
@@ -1326,7 +2003,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1326 rq->nr_uninterruptible--; 2003 rq->nr_uninterruptible--;
1327 2004
1328 enqueue_task(rq, p, wakeup); 2005 enqueue_task(rq, p, wakeup);
1329 inc_nr_running(p, rq); 2006 inc_nr_running(rq);
1330} 2007}
1331 2008
1332/* 2009/*
@@ -1338,7 +2015,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1338 rq->nr_uninterruptible++; 2015 rq->nr_uninterruptible++;
1339 2016
1340 dequeue_task(rq, p, sleep); 2017 dequeue_task(rq, p, sleep);
1341 dec_nr_running(p, rq); 2018 dec_nr_running(rq);
1342} 2019}
1343 2020
1344/** 2021/**
@@ -1395,7 +2072,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1395 /* 2072 /*
1396 * Buddy candidates are cache hot: 2073 * Buddy candidates are cache hot:
1397 */ 2074 */
1398 if (&p->se == cfs_rq_of(&p->se)->next) 2075 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
1399 return 1; 2076 return 1;
1400 2077
1401 if (p->sched_class != &fair_sched_class) 2078 if (p->sched_class != &fair_sched_class)
@@ -1685,17 +2362,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1685 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2362 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1686 */ 2363 */
1687static int 2364static int
1688find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 2365find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2366 cpumask_t *tmp)
1689{ 2367{
1690 cpumask_t tmp;
1691 unsigned long load, min_load = ULONG_MAX; 2368 unsigned long load, min_load = ULONG_MAX;
1692 int idlest = -1; 2369 int idlest = -1;
1693 int i; 2370 int i;
1694 2371
1695 /* Traverse only the allowed CPUs */ 2372 /* Traverse only the allowed CPUs */
1696 cpus_and(tmp, group->cpumask, p->cpus_allowed); 2373 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
1697 2374
1698 for_each_cpu_mask(i, tmp) { 2375 for_each_cpu_mask(i, *tmp) {
1699 load = weighted_cpuload(i); 2376 load = weighted_cpuload(i);
1700 2377
1701 if (load < min_load || (load == min_load && i == this_cpu)) { 2378 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1734,7 +2411,7 @@ static int sched_balance_self(int cpu, int flag)
1734 } 2411 }
1735 2412
1736 while (sd) { 2413 while (sd) {
1737 cpumask_t span; 2414 cpumask_t span, tmpmask;
1738 struct sched_group *group; 2415 struct sched_group *group;
1739 int new_cpu, weight; 2416 int new_cpu, weight;
1740 2417
@@ -1750,7 +2427,7 @@ static int sched_balance_self(int cpu, int flag)
1750 continue; 2427 continue;
1751 } 2428 }
1752 2429
1753 new_cpu = find_idlest_cpu(group, t, cpu); 2430 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
1754 if (new_cpu == -1 || new_cpu == cpu) { 2431 if (new_cpu == -1 || new_cpu == cpu) {
1755 /* Now try balancing at a lower domain level of cpu */ 2432 /* Now try balancing at a lower domain level of cpu */
1756 sd = sd->child; 2433 sd = sd->child;
@@ -1796,6 +2473,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1796 long old_state; 2473 long old_state;
1797 struct rq *rq; 2474 struct rq *rq;
1798 2475
2476 if (!sched_feat(SYNC_WAKEUPS))
2477 sync = 0;
2478
1799 smp_wmb(); 2479 smp_wmb();
1800 rq = task_rq_lock(p, &flags); 2480 rq = task_rq_lock(p, &flags);
1801 old_state = p->state; 2481 old_state = p->state;
@@ -1912,6 +2592,7 @@ static void __sched_fork(struct task_struct *p)
1912 2592
1913 INIT_LIST_HEAD(&p->rt.run_list); 2593 INIT_LIST_HEAD(&p->rt.run_list);
1914 p->se.on_rq = 0; 2594 p->se.on_rq = 0;
2595 INIT_LIST_HEAD(&p->se.group_node);
1915 2596
1916#ifdef CONFIG_PREEMPT_NOTIFIERS 2597#ifdef CONFIG_PREEMPT_NOTIFIERS
1917 INIT_HLIST_HEAD(&p->preempt_notifiers); 2598 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -1987,7 +2668,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1987 * management (if any): 2668 * management (if any):
1988 */ 2669 */
1989 p->sched_class->task_new(rq, p); 2670 p->sched_class->task_new(rq, p);
1990 inc_nr_running(p, rq); 2671 inc_nr_running(rq);
1991 } 2672 }
1992 check_preempt_curr(rq, p); 2673 check_preempt_curr(rq, p);
1993#ifdef CONFIG_SMP 2674#ifdef CONFIG_SMP
@@ -2631,7 +3312,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2631static struct sched_group * 3312static struct sched_group *
2632find_busiest_group(struct sched_domain *sd, int this_cpu, 3313find_busiest_group(struct sched_domain *sd, int this_cpu,
2633 unsigned long *imbalance, enum cpu_idle_type idle, 3314 unsigned long *imbalance, enum cpu_idle_type idle,
2634 int *sd_idle, cpumask_t *cpus, int *balance) 3315 int *sd_idle, const cpumask_t *cpus, int *balance)
2635{ 3316{
2636 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3317 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2637 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3318 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2932,7 +3613,7 @@ ret:
2932 */ 3613 */
2933static struct rq * 3614static struct rq *
2934find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3615find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2935 unsigned long imbalance, cpumask_t *cpus) 3616 unsigned long imbalance, const cpumask_t *cpus)
2936{ 3617{
2937 struct rq *busiest = NULL, *rq; 3618 struct rq *busiest = NULL, *rq;
2938 unsigned long max_load = 0; 3619 unsigned long max_load = 0;
@@ -2971,14 +3652,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2971 */ 3652 */
2972static int load_balance(int this_cpu, struct rq *this_rq, 3653static int load_balance(int this_cpu, struct rq *this_rq,
2973 struct sched_domain *sd, enum cpu_idle_type idle, 3654 struct sched_domain *sd, enum cpu_idle_type idle,
2974 int *balance) 3655 int *balance, cpumask_t *cpus)
2975{ 3656{
2976 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3657 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2977 struct sched_group *group; 3658 struct sched_group *group;
2978 unsigned long imbalance; 3659 unsigned long imbalance;
2979 struct rq *busiest; 3660 struct rq *busiest;
2980 cpumask_t cpus = CPU_MASK_ALL;
2981 unsigned long flags; 3661 unsigned long flags;
3662 int unlock_aggregate;
3663
3664 cpus_setall(*cpus);
3665
3666 unlock_aggregate = get_aggregate(sd);
2982 3667
2983 /* 3668 /*
2984 * When power savings policy is enabled for the parent domain, idle 3669 * When power savings policy is enabled for the parent domain, idle
@@ -2994,7 +3679,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2994 3679
2995redo: 3680redo:
2996 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3681 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2997 &cpus, balance); 3682 cpus, balance);
2998 3683
2999 if (*balance == 0) 3684 if (*balance == 0)
3000 goto out_balanced; 3685 goto out_balanced;
@@ -3004,7 +3689,7 @@ redo:
3004 goto out_balanced; 3689 goto out_balanced;
3005 } 3690 }
3006 3691
3007 busiest = find_busiest_queue(group, idle, imbalance, &cpus); 3692 busiest = find_busiest_queue(group, idle, imbalance, cpus);
3008 if (!busiest) { 3693 if (!busiest) {
3009 schedstat_inc(sd, lb_nobusyq[idle]); 3694 schedstat_inc(sd, lb_nobusyq[idle]);
3010 goto out_balanced; 3695 goto out_balanced;
@@ -3037,8 +3722,8 @@ redo:
3037 3722
3038 /* All tasks on this runqueue were pinned by CPU affinity */ 3723 /* All tasks on this runqueue were pinned by CPU affinity */
3039 if (unlikely(all_pinned)) { 3724 if (unlikely(all_pinned)) {
3040 cpu_clear(cpu_of(busiest), cpus); 3725 cpu_clear(cpu_of(busiest), *cpus);
3041 if (!cpus_empty(cpus)) 3726 if (!cpus_empty(*cpus))
3042 goto redo; 3727 goto redo;
3043 goto out_balanced; 3728 goto out_balanced;
3044 } 3729 }
@@ -3095,8 +3780,9 @@ redo:
3095 3780
3096 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3781 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3097 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3782 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3098 return -1; 3783 ld_moved = -1;
3099 return ld_moved; 3784
3785 goto out;
3100 3786
3101out_balanced: 3787out_balanced:
3102 schedstat_inc(sd, lb_balanced[idle]); 3788 schedstat_inc(sd, lb_balanced[idle]);
@@ -3111,8 +3797,13 @@ out_one_pinned:
3111 3797
3112 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3798 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3113 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3799 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3114 return -1; 3800 ld_moved = -1;
3115 return 0; 3801 else
3802 ld_moved = 0;
3803out:
3804 if (unlock_aggregate)
3805 put_aggregate(sd);
3806 return ld_moved;
3116} 3807}
3117 3808
3118/* 3809/*
@@ -3123,7 +3814,8 @@ out_one_pinned:
3123 * this_rq is locked. 3814 * this_rq is locked.
3124 */ 3815 */
3125static int 3816static int
3126load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 3817load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3818 cpumask_t *cpus)
3127{ 3819{
3128 struct sched_group *group; 3820 struct sched_group *group;
3129 struct rq *busiest = NULL; 3821 struct rq *busiest = NULL;
@@ -3131,7 +3823,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3131 int ld_moved = 0; 3823 int ld_moved = 0;
3132 int sd_idle = 0; 3824 int sd_idle = 0;
3133 int all_pinned = 0; 3825 int all_pinned = 0;
3134 cpumask_t cpus = CPU_MASK_ALL; 3826
3827 cpus_setall(*cpus);
3135 3828
3136 /* 3829 /*
3137 * When power savings policy is enabled for the parent domain, idle 3830 * When power savings policy is enabled for the parent domain, idle
@@ -3146,14 +3839,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3146 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3839 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3147redo: 3840redo:
3148 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3841 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3149 &sd_idle, &cpus, NULL); 3842 &sd_idle, cpus, NULL);
3150 if (!group) { 3843 if (!group) {
3151 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 3844 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3152 goto out_balanced; 3845 goto out_balanced;
3153 } 3846 }
3154 3847
3155 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, 3848 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
3156 &cpus);
3157 if (!busiest) { 3849 if (!busiest) {
3158 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 3850 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3159 goto out_balanced; 3851 goto out_balanced;
@@ -3175,8 +3867,8 @@ redo:
3175 spin_unlock(&busiest->lock); 3867 spin_unlock(&busiest->lock);
3176 3868
3177 if (unlikely(all_pinned)) { 3869 if (unlikely(all_pinned)) {
3178 cpu_clear(cpu_of(busiest), cpus); 3870 cpu_clear(cpu_of(busiest), *cpus);
3179 if (!cpus_empty(cpus)) 3871 if (!cpus_empty(*cpus))
3180 goto redo; 3872 goto redo;
3181 } 3873 }
3182 } 3874 }
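The hunks above convert load_balance() and load_balance_newidle() from filling a cpumask_t local on their own stack to filling one supplied by the caller, which matters once NR_CPUS is large enough that a single mask is hundreds of bytes. Below is a minimal standalone sketch of that caller-supplied scratch-mask pattern; mask_t, balance() and the 64-word size are illustrative stand-ins, not kernel types.

#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for a large CPU mask (4096 bits here). */
typedef struct { unsigned long bits[64]; } mask_t;

/* The callee fills the caller-supplied scratch mask instead of declaring
 * its own, so the ~512-byte object never lands on this stack frame. */
static int balance(int busiest_cpu, mask_t *cpus)
{
        memset(cpus, 0xff, sizeof(*cpus));      /* cpus_setall(*cpus)   */
        cpus->bits[busiest_cpu / 64] &= ~(1UL << (busiest_cpu % 64));
        return 0;                               /* cpu_clear(..., *cpus) above */
}

int main(void)
{
        mask_t tmpmask;         /* one scratch object, owned by the caller */
        int cpu;

        for (cpu = 0; cpu < 4; cpu++)
                balance(cpu, &tmpmask);         /* reused across calls */

        printf("done\n");
        return 0;
}

In the patch the owners of that scratch object are idle_balance() and rebalance_domains(), which declare a single tmpmask and hand it down on every iteration, as the hunks further below show.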
@@ -3210,6 +3902,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3210 struct sched_domain *sd; 3902 struct sched_domain *sd;
3211 int pulled_task = -1; 3903 int pulled_task = -1;
3212 unsigned long next_balance = jiffies + HZ; 3904 unsigned long next_balance = jiffies + HZ;
3905 cpumask_t tmpmask;
3213 3906
3214 for_each_domain(this_cpu, sd) { 3907 for_each_domain(this_cpu, sd) {
3215 unsigned long interval; 3908 unsigned long interval;
@@ -3219,8 +3912,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3219 3912
3220 if (sd->flags & SD_BALANCE_NEWIDLE) 3913 if (sd->flags & SD_BALANCE_NEWIDLE)
3221 /* If we've pulled tasks over stop searching: */ 3914 /* If we've pulled tasks over stop searching: */
3222 pulled_task = load_balance_newidle(this_cpu, 3915 pulled_task = load_balance_newidle(this_cpu, this_rq,
3223 this_rq, sd); 3916 sd, &tmpmask);
3224 3917
3225 interval = msecs_to_jiffies(sd->balance_interval); 3918 interval = msecs_to_jiffies(sd->balance_interval);
3226 if (time_after(next_balance, sd->last_balance + interval)) 3919 if (time_after(next_balance, sd->last_balance + interval))
@@ -3379,6 +4072,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3379 /* Earliest time when we have to do rebalance again */ 4072 /* Earliest time when we have to do rebalance again */
3380 unsigned long next_balance = jiffies + 60*HZ; 4073 unsigned long next_balance = jiffies + 60*HZ;
3381 int update_next_balance = 0; 4074 int update_next_balance = 0;
4075 cpumask_t tmp;
3382 4076
3383 for_each_domain(cpu, sd) { 4077 for_each_domain(cpu, sd) {
3384 if (!(sd->flags & SD_LOAD_BALANCE)) 4078 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3402,7 +4096,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3402 } 4096 }
3403 4097
3404 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4098 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3405 if (load_balance(cpu, rq, sd, idle, &balance)) { 4099 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
3406 /* 4100 /*
3407 * We've pulled tasks over so either we're no 4101 * We've pulled tasks over so either we're no
3408 * longer idle, or one of our SMT siblings is 4102 * longer idle, or one of our SMT siblings is
@@ -3518,7 +4212,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
3518 */ 4212 */
3519 int ilb = first_cpu(nohz.cpu_mask); 4213 int ilb = first_cpu(nohz.cpu_mask);
3520 4214
3521 if (ilb != NR_CPUS) 4215 if (ilb < nr_cpu_ids)
3522 resched_cpu(ilb); 4216 resched_cpu(ilb);
3523 } 4217 }
3524 } 4218 }
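This is one of several hunks in the patch that replace comparisons against NR_CPUS with comparisons against nr_cpu_ids, the number of possible CPU ids actually configured, so a mask search reports "nothing found" as any value >= nr_cpu_ids rather than exactly NR_CPUS. A small standalone sketch of that convention; first_set() and nr_ids are made-up names, not kernel symbols, and 64-bit unsigned long is assumed for brevity.

#include <stdio.h>

static int nr_ids = 8;                  /* runtime count, like nr_cpu_ids */

/* Return the first set bit below nr_ids, or nr_ids when none is set. */
static int first_set(const unsigned long *mask)
{
        int i;

        for (i = 0; i < nr_ids; i++)
                if (mask[i / 64] & (1UL << (i % 64)))
                        return i;
        return nr_ids;
}

int main(void)
{
        unsigned long mask[64] = { 0 };         /* 4096 bits, all clear */
        int ilb = first_set(mask);

        if (ilb < nr_ids)               /* was: if (ilb != NR_CPUS) */
                printf("kick cpu %d\n", ilb);
        else
                printf("no candidate cpu\n");
        return 0;
}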
@@ -3722,9 +4416,9 @@ void scheduler_tick(void)
3722 rq->clock_underflows++; 4416 rq->clock_underflows++;
3723 } 4417 }
3724 rq->tick_timestamp = rq->clock; 4418 rq->tick_timestamp = rq->clock;
4419 update_last_tick_seen(rq);
3725 update_cpu_load(rq); 4420 update_cpu_load(rq);
3726 curr->sched_class->task_tick(rq, curr, 0); 4421 curr->sched_class->task_tick(rq, curr, 0);
3727 update_sched_rt_period(rq);
3728 spin_unlock(&rq->lock); 4422 spin_unlock(&rq->lock);
3729 4423
3730#ifdef CONFIG_SMP 4424#ifdef CONFIG_SMP
@@ -4324,10 +5018,8 @@ void set_user_nice(struct task_struct *p, long nice)
4324 goto out_unlock; 5018 goto out_unlock;
4325 } 5019 }
4326 on_rq = p->se.on_rq; 5020 on_rq = p->se.on_rq;
4327 if (on_rq) { 5021 if (on_rq)
4328 dequeue_task(rq, p, 0); 5022 dequeue_task(rq, p, 0);
4329 dec_load(rq, p);
4330 }
4331 5023
4332 p->static_prio = NICE_TO_PRIO(nice); 5024 p->static_prio = NICE_TO_PRIO(nice);
4333 set_load_weight(p); 5025 set_load_weight(p);
@@ -4337,7 +5029,6 @@ void set_user_nice(struct task_struct *p, long nice)
4337 5029
4338 if (on_rq) { 5030 if (on_rq) {
4339 enqueue_task(rq, p, 0); 5031 enqueue_task(rq, p, 0);
4340 inc_load(rq, p);
4341 /* 5032 /*
4342 * If the task increased its priority or is running and 5033 * If the task increased its priority or is running and
4343 * lowered its priority, then reschedule its CPU: 5034 * lowered its priority, then reschedule its CPU:
@@ -4559,7 +5250,7 @@ recheck:
4559 * Do not allow realtime tasks into groups that have no runtime 5250 * Do not allow realtime tasks into groups that have no runtime
4560 * assigned. 5251 * assigned.
4561 */ 5252 */
4562 if (rt_policy(policy) && task_group(p)->rt_runtime == 0) 5253 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
4563 return -EPERM; 5254 return -EPERM;
4564#endif 5255#endif
4565 5256
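The check added above makes sched_setscheduler() return -EPERM for a real-time policy when the caller's task group has rt_bandwidth.rt_runtime == 0, i.e. the group was given no real-time budget. Seen from user space that is indistinguishable from an ordinary privilege failure; a small probe using the standard sched_setscheduler(2) wrapper:

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 1 };

        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                /* EPERM can mean "no privilege" or, with RT group
                 * scheduling, "this task's group has no rt_runtime". */
                printf("SCHED_FIFO rejected: %s\n", strerror(errno));
                return 1;
        }
        printf("now running SCHED_FIFO at priority %d\n", sp.sched_priority);
        return 0;
}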
@@ -4721,9 +5412,10 @@ out_unlock:
4721 return retval; 5412 return retval;
4722} 5413}
4723 5414
4724long sched_setaffinity(pid_t pid, cpumask_t new_mask) 5415long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
4725{ 5416{
4726 cpumask_t cpus_allowed; 5417 cpumask_t cpus_allowed;
5418 cpumask_t new_mask = *in_mask;
4727 struct task_struct *p; 5419 struct task_struct *p;
4728 int retval; 5420 int retval;
4729 5421
@@ -4754,13 +5446,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4754 if (retval) 5446 if (retval)
4755 goto out_unlock; 5447 goto out_unlock;
4756 5448
4757 cpus_allowed = cpuset_cpus_allowed(p); 5449 cpuset_cpus_allowed(p, &cpus_allowed);
4758 cpus_and(new_mask, new_mask, cpus_allowed); 5450 cpus_and(new_mask, new_mask, cpus_allowed);
4759 again: 5451 again:
4760 retval = set_cpus_allowed(p, new_mask); 5452 retval = set_cpus_allowed_ptr(p, &new_mask);
4761 5453
4762 if (!retval) { 5454 if (!retval) {
4763 cpus_allowed = cpuset_cpus_allowed(p); 5455 cpuset_cpus_allowed(p, &cpus_allowed);
4764 if (!cpus_subset(new_mask, cpus_allowed)) { 5456 if (!cpus_subset(new_mask, cpus_allowed)) {
4765 /* 5457 /*
4766 * We must have raced with a concurrent cpuset 5458 * We must have raced with a concurrent cpuset
@@ -4804,7 +5496,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4804 if (retval) 5496 if (retval)
4805 return retval; 5497 return retval;
4806 5498
4807 return sched_setaffinity(pid, new_mask); 5499 return sched_setaffinity(pid, &new_mask);
4808} 5500}
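The hunks above only change the in-kernel plumbing: sched_setaffinity() and cpuset_cpus_allowed() now take the mask by pointer, set_cpus_allowed() becomes set_cpus_allowed_ptr(), and sys_sched_setaffinity() passes &new_mask. The user-visible call is unaffected; for reference, a minimal sched_setaffinity(2) user-space example (glibc wrapper, mask handed over by address):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);                      /* allow CPU 0 only */

        if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("pinned to CPU 0\n");
        return 0;
}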
4809 5501
4810/* 5502/*
@@ -5266,7 +5958,6 @@ static inline void sched_init_granularity(void)
5266 sysctl_sched_latency = limit; 5958 sysctl_sched_latency = limit;
5267 5959
5268 sysctl_sched_wakeup_granularity *= factor; 5960 sysctl_sched_wakeup_granularity *= factor;
5269 sysctl_sched_batch_wakeup_granularity *= factor;
5270} 5961}
5271 5962
5272#ifdef CONFIG_SMP 5963#ifdef CONFIG_SMP
@@ -5295,7 +5986,7 @@ static inline void sched_init_granularity(void)
5295 * task must not exit() & deallocate itself prematurely. The 5986 * task must not exit() & deallocate itself prematurely. The
5296 * call is not atomic; no spinlocks may be held. 5987 * call is not atomic; no spinlocks may be held.
5297 */ 5988 */
5298int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 5989int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5299{ 5990{
5300 struct migration_req req; 5991 struct migration_req req;
5301 unsigned long flags; 5992 unsigned long flags;
@@ -5303,23 +5994,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5303 int ret = 0; 5994 int ret = 0;
5304 5995
5305 rq = task_rq_lock(p, &flags); 5996 rq = task_rq_lock(p, &flags);
5306 if (!cpus_intersects(new_mask, cpu_online_map)) { 5997 if (!cpus_intersects(*new_mask, cpu_online_map)) {
5307 ret = -EINVAL; 5998 ret = -EINVAL;
5308 goto out; 5999 goto out;
5309 } 6000 }
5310 6001
5311 if (p->sched_class->set_cpus_allowed) 6002 if (p->sched_class->set_cpus_allowed)
5312 p->sched_class->set_cpus_allowed(p, &new_mask); 6003 p->sched_class->set_cpus_allowed(p, new_mask);
5313 else { 6004 else {
5314 p->cpus_allowed = new_mask; 6005 p->cpus_allowed = *new_mask;
5315 p->rt.nr_cpus_allowed = cpus_weight(new_mask); 6006 p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
5316 } 6007 }
5317 6008
5318 /* Can the task run on the task's current CPU? If so, we're done */ 6009 /* Can the task run on the task's current CPU? If so, we're done */
5319 if (cpu_isset(task_cpu(p), new_mask)) 6010 if (cpu_isset(task_cpu(p), *new_mask))
5320 goto out; 6011 goto out;
5321 6012
5322 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 6013 if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
5323 /* Need help from migration thread: drop lock and wait. */ 6014 /* Need help from migration thread: drop lock and wait. */
5324 task_rq_unlock(rq, &flags); 6015 task_rq_unlock(rq, &flags);
5325 wake_up_process(rq->migration_thread); 6016 wake_up_process(rq->migration_thread);
@@ -5332,7 +6023,7 @@ out:
5332 6023
5333 return ret; 6024 return ret;
5334} 6025}
5335EXPORT_SYMBOL_GPL(set_cpus_allowed); 6026EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5336 6027
5337/* 6028/*
5338 * Move (not current) task off this cpu, onto dest cpu. We're doing 6029 * Move (not current) task off this cpu, onto dest cpu. We're doing
@@ -5470,12 +6161,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5470 dest_cpu = any_online_cpu(mask); 6161 dest_cpu = any_online_cpu(mask);
5471 6162
5472 /* On any allowed CPU? */ 6163 /* On any allowed CPU? */
5473 if (dest_cpu == NR_CPUS) 6164 if (dest_cpu >= nr_cpu_ids)
5474 dest_cpu = any_online_cpu(p->cpus_allowed); 6165 dest_cpu = any_online_cpu(p->cpus_allowed);
5475 6166
5476 /* No more Mr. Nice Guy. */ 6167 /* No more Mr. Nice Guy. */
5477 if (dest_cpu == NR_CPUS) { 6168 if (dest_cpu >= nr_cpu_ids) {
5478 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); 6169 cpumask_t cpus_allowed;
6170
6171 cpuset_cpus_allowed_locked(p, &cpus_allowed);
5479 /* 6172 /*
5480 * Try to stay on the same cpuset, where the 6173 * Try to stay on the same cpuset, where the
5481 * current cpuset may be a subset of all cpus. 6174 * current cpuset may be a subset of all cpus.
@@ -5511,7 +6204,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5511 */ 6204 */
5512static void migrate_nr_uninterruptible(struct rq *rq_src) 6205static void migrate_nr_uninterruptible(struct rq *rq_src)
5513{ 6206{
5514 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 6207 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
5515 unsigned long flags; 6208 unsigned long flags;
5516 6209
5517 local_irq_save(flags); 6210 local_irq_save(flags);
@@ -5923,20 +6616,16 @@ void __init migration_init(void)
5923 6616
5924#ifdef CONFIG_SMP 6617#ifdef CONFIG_SMP
5925 6618
5926/* Number of possible processor ids */
5927int nr_cpu_ids __read_mostly = NR_CPUS;
5928EXPORT_SYMBOL(nr_cpu_ids);
5929
5930#ifdef CONFIG_SCHED_DEBUG 6619#ifdef CONFIG_SCHED_DEBUG
5931 6620
5932static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) 6621static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6622 cpumask_t *groupmask)
5933{ 6623{
5934 struct sched_group *group = sd->groups; 6624 struct sched_group *group = sd->groups;
5935 cpumask_t groupmask; 6625 char str[256];
5936 char str[NR_CPUS];
5937 6626
5938 cpumask_scnprintf(str, NR_CPUS, sd->span); 6627 cpulist_scnprintf(str, sizeof(str), sd->span);
5939 cpus_clear(groupmask); 6628 cpus_clear(*groupmask);
5940 6629
5941 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6630 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5942 6631
@@ -5980,25 +6669,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
5980 break; 6669 break;
5981 } 6670 }
5982 6671
5983 if (cpus_intersects(groupmask, group->cpumask)) { 6672 if (cpus_intersects(*groupmask, group->cpumask)) {
5984 printk(KERN_CONT "\n"); 6673 printk(KERN_CONT "\n");
5985 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6674 printk(KERN_ERR "ERROR: repeated CPUs\n");
5986 break; 6675 break;
5987 } 6676 }
5988 6677
5989 cpus_or(groupmask, groupmask, group->cpumask); 6678 cpus_or(*groupmask, *groupmask, group->cpumask);
5990 6679
5991 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 6680 cpulist_scnprintf(str, sizeof(str), group->cpumask);
5992 printk(KERN_CONT " %s", str); 6681 printk(KERN_CONT " %s", str);
5993 6682
5994 group = group->next; 6683 group = group->next;
5995 } while (group != sd->groups); 6684 } while (group != sd->groups);
5996 printk(KERN_CONT "\n"); 6685 printk(KERN_CONT "\n");
5997 6686
5998 if (!cpus_equal(sd->span, groupmask)) 6687 if (!cpus_equal(sd->span, *groupmask))
5999 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6688 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6000 6689
6001 if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) 6690 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
6002 printk(KERN_ERR "ERROR: parent span is not a superset " 6691 printk(KERN_ERR "ERROR: parent span is not a superset "
6003 "of domain->span\n"); 6692 "of domain->span\n");
6004 return 0; 6693 return 0;
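Besides moving the group mask off the stack, the two debug hunks above switch the printed form from cpumask_scnprintf() (a raw hex mask) to cpulist_scnprintf() (a readable range list such as "0-2,5") and replace the NR_CPUS-sized string buffer with a fixed 256-byte one. A standalone sketch of the difference between the two output styles; print_list() is illustrative, not the kernel helper.

#include <stdio.h>

/* Print a small CPU bitmap as a range list ("0-2,5"), roughly the
 * cpulist_scnprintf() style, next to the raw hex cpumask style. */
static void print_list(unsigned long mask)
{
        int i, first = 1;

        for (i = 0; i < 64; i++) {
                int j = i;

                if (!(mask & (1UL << i)))
                        continue;
                while (j + 1 < 64 && (mask & (1UL << (j + 1))))
                        j++;
                printf("%s%d", first ? "" : ",", i);
                if (j > i)
                        printf("-%d", j);
                first = 0;
                i = j;
        }
        printf("\n");
}

int main(void)
{
        unsigned long mask = 0x27;              /* CPUs 0, 1, 2 and 5 */

        printf("hex mask : %lx\n", mask);       /* cpumask_scnprintf-like */
        printf("cpu list : ");
        print_list(mask);                       /* prints 0-2,5 */
        return 0;
}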
@@ -6006,6 +6695,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
6006 6695
6007static void sched_domain_debug(struct sched_domain *sd, int cpu) 6696static void sched_domain_debug(struct sched_domain *sd, int cpu)
6008{ 6697{
6698 cpumask_t *groupmask;
6009 int level = 0; 6699 int level = 0;
6010 6700
6011 if (!sd) { 6701 if (!sd) {
@@ -6015,14 +6705,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6015 6705
6016 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6706 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6017 6707
6708 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6709 if (!groupmask) {
6710 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6711 return;
6712 }
6713
6018 for (;;) { 6714 for (;;) {
6019 if (sched_domain_debug_one(sd, cpu, level)) 6715 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6020 break; 6716 break;
6021 level++; 6717 level++;
6022 sd = sd->parent; 6718 sd = sd->parent;
6023 if (!sd) 6719 if (!sd)
6024 break; 6720 break;
6025 } 6721 }
6722 kfree(groupmask);
6026} 6723}
6027#else 6724#else
6028# define sched_domain_debug(sd, cpu) do { } while (0) 6725# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6210,30 +6907,33 @@ __setup("isolcpus=", isolated_cpu_setup);
6210 * and ->cpu_power to 0. 6907 * and ->cpu_power to 0.
6211 */ 6908 */
6212static void 6909static void
6213init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, 6910init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6214 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 6911 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6215 struct sched_group **sg)) 6912 struct sched_group **sg,
6913 cpumask_t *tmpmask),
6914 cpumask_t *covered, cpumask_t *tmpmask)
6216{ 6915{
6217 struct sched_group *first = NULL, *last = NULL; 6916 struct sched_group *first = NULL, *last = NULL;
6218 cpumask_t covered = CPU_MASK_NONE;
6219 int i; 6917 int i;
6220 6918
6221 for_each_cpu_mask(i, span) { 6919 cpus_clear(*covered);
6920
6921 for_each_cpu_mask(i, *span) {
6222 struct sched_group *sg; 6922 struct sched_group *sg;
6223 int group = group_fn(i, cpu_map, &sg); 6923 int group = group_fn(i, cpu_map, &sg, tmpmask);
6224 int j; 6924 int j;
6225 6925
6226 if (cpu_isset(i, covered)) 6926 if (cpu_isset(i, *covered))
6227 continue; 6927 continue;
6228 6928
6229 sg->cpumask = CPU_MASK_NONE; 6929 cpus_clear(sg->cpumask);
6230 sg->__cpu_power = 0; 6930 sg->__cpu_power = 0;
6231 6931
6232 for_each_cpu_mask(j, span) { 6932 for_each_cpu_mask(j, *span) {
6233 if (group_fn(j, cpu_map, NULL) != group) 6933 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6234 continue; 6934 continue;
6235 6935
6236 cpu_set(j, covered); 6936 cpu_set(j, *covered);
6237 cpu_set(j, sg->cpumask); 6937 cpu_set(j, sg->cpumask);
6238 } 6938 }
6239 if (!first) 6939 if (!first)
@@ -6259,7 +6959,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
6259 * 6959 *
6260 * Should use nodemask_t. 6960 * Should use nodemask_t.
6261 */ 6961 */
6262static int find_next_best_node(int node, unsigned long *used_nodes) 6962static int find_next_best_node(int node, nodemask_t *used_nodes)
6263{ 6963{
6264 int i, n, val, min_val, best_node = 0; 6964 int i, n, val, min_val, best_node = 0;
6265 6965
@@ -6273,7 +6973,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
6273 continue; 6973 continue;
6274 6974
6275 /* Skip already used nodes */ 6975 /* Skip already used nodes */
6276 if (test_bit(n, used_nodes)) 6976 if (node_isset(n, *used_nodes))
6277 continue; 6977 continue;
6278 6978
6279 /* Simple min distance search */ 6979 /* Simple min distance search */
@@ -6285,40 +6985,37 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
6285 } 6985 }
6286 } 6986 }
6287 6987
6288 set_bit(best_node, used_nodes); 6988 node_set(best_node, *used_nodes);
6289 return best_node; 6989 return best_node;
6290} 6990}
6291 6991
6292/** 6992/**
6293 * sched_domain_node_span - get a cpumask for a node's sched_domain 6993 * sched_domain_node_span - get a cpumask for a node's sched_domain
6294 * @node: node whose cpumask we're constructing 6994 * @node: node whose cpumask we're constructing
6295 * @size: number of nodes to include in this span 6995 * @span: resulting cpumask
6296 * 6996 *
6297 * Given a node, construct a good cpumask for its sched_domain to span. It 6997 * Given a node, construct a good cpumask for its sched_domain to span. It
6298 * should be one that prevents unnecessary balancing, but also spreads tasks 6998 * should be one that prevents unnecessary balancing, but also spreads tasks
6299 * out optimally. 6999 * out optimally.
6300 */ 7000 */
6301static cpumask_t sched_domain_node_span(int node) 7001static void sched_domain_node_span(int node, cpumask_t *span)
6302{ 7002{
6303 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 7003 nodemask_t used_nodes;
6304 cpumask_t span, nodemask; 7004 node_to_cpumask_ptr(nodemask, node);
6305 int i; 7005 int i;
6306 7006
6307 cpus_clear(span); 7007 cpus_clear(*span);
6308 bitmap_zero(used_nodes, MAX_NUMNODES); 7008 nodes_clear(used_nodes);
6309 7009
6310 nodemask = node_to_cpumask(node); 7010 cpus_or(*span, *span, *nodemask);
6311 cpus_or(span, span, nodemask); 7011 node_set(node, used_nodes);
6312 set_bit(node, used_nodes);
6313 7012
6314 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7013 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6315 int next_node = find_next_best_node(node, used_nodes); 7014 int next_node = find_next_best_node(node, &used_nodes);
6316 7015
6317 nodemask = node_to_cpumask(next_node); 7016 node_to_cpumask_ptr_next(nodemask, next_node);
6318 cpus_or(span, span, nodemask); 7017 cpus_or(*span, *span, *nodemask);
6319 } 7018 }
6320
6321 return span;
6322} 7019}
6323#endif 7020#endif
6324 7021
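sched_domain_node_span() above now fills a caller-provided cpumask and tracks visited nodes in a nodemask_t instead of a raw bitmap, but the construction itself is unchanged: start from the node, then greedily add the CPUs of the nearest not-yet-used nodes until SD_NODES_PER_DOMAIN nodes are covered. A standalone sketch of that greedy span construction; the 4-node distance table, node_cpus[], next_best() and SPAN_NODES are invented for the example.

#include <stdio.h>

#define NODES           4
#define SPAN_NODES      2               /* like SD_NODES_PER_DOMAIN */

/* Invented node-to-node distances and per-node CPU masks. */
static const int dist[NODES][NODES] = {
        { 10, 20, 30, 20 },
        { 20, 10, 20, 30 },
        { 30, 20, 10, 20 },
        { 20, 30, 20, 10 },
};
static const unsigned long node_cpus[NODES] = { 0x3, 0xc, 0x30, 0xc0 };

/* Greedy "closest unused node" pick, as in find_next_best_node(). */
static int next_best(int node, unsigned long *used)
{
        int n, best = -1, best_dist = 1 << 30;

        for (n = 0; n < NODES; n++) {
                if (*used & (1UL << n))
                        continue;
                if (dist[node][n] < best_dist) {
                        best_dist = dist[node][n];
                        best = n;
                }
        }
        if (best >= 0)
                *used |= 1UL << best;
        return best;
}

int main(void)
{
        int i, node = 0;
        unsigned long used = 1UL << node;
        unsigned long span = node_cpus[node];   /* cpus_or(*span, ...) */

        for (i = 1; i < SPAN_NODES; i++) {
                int n = next_best(node, &used);

                if (n < 0)
                        break;
                span |= node_cpus[n];
        }
        printf("span for node %d: 0x%lx\n", node, span);
        return 0;
}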
@@ -6332,7 +7029,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6332static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7029static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6333 7030
6334static int 7031static int
6335cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7032cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7033 cpumask_t *unused)
6336{ 7034{
6337 if (sg) 7035 if (sg)
6338 *sg = &per_cpu(sched_group_cpus, cpu); 7036 *sg = &per_cpu(sched_group_cpus, cpu);
@@ -6350,19 +7048,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6350 7048
6351#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7049#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6352static int 7050static int
6353cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7051cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7052 cpumask_t *mask)
6354{ 7053{
6355 int group; 7054 int group;
6356 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 7055
6357 cpus_and(mask, mask, *cpu_map); 7056 *mask = per_cpu(cpu_sibling_map, cpu);
6358 group = first_cpu(mask); 7057 cpus_and(*mask, *mask, *cpu_map);
7058 group = first_cpu(*mask);
6359 if (sg) 7059 if (sg)
6360 *sg = &per_cpu(sched_group_core, group); 7060 *sg = &per_cpu(sched_group_core, group);
6361 return group; 7061 return group;
6362} 7062}
6363#elif defined(CONFIG_SCHED_MC) 7063#elif defined(CONFIG_SCHED_MC)
6364static int 7064static int
6365cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7065cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7066 cpumask_t *unused)
6366{ 7067{
6367 if (sg) 7068 if (sg)
6368 *sg = &per_cpu(sched_group_core, cpu); 7069 *sg = &per_cpu(sched_group_core, cpu);
@@ -6374,17 +7075,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6374static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7075static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6375 7076
6376static int 7077static int
6377cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7078cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7079 cpumask_t *mask)
6378{ 7080{
6379 int group; 7081 int group;
6380#ifdef CONFIG_SCHED_MC 7082#ifdef CONFIG_SCHED_MC
6381 cpumask_t mask = cpu_coregroup_map(cpu); 7083 *mask = cpu_coregroup_map(cpu);
6382 cpus_and(mask, mask, *cpu_map); 7084 cpus_and(*mask, *mask, *cpu_map);
6383 group = first_cpu(mask); 7085 group = first_cpu(*mask);
6384#elif defined(CONFIG_SCHED_SMT) 7086#elif defined(CONFIG_SCHED_SMT)
6385 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 7087 *mask = per_cpu(cpu_sibling_map, cpu);
6386 cpus_and(mask, mask, *cpu_map); 7088 cpus_and(*mask, *mask, *cpu_map);
6387 group = first_cpu(mask); 7089 group = first_cpu(*mask);
6388#else 7090#else
6389 group = cpu; 7091 group = cpu;
6390#endif 7092#endif
@@ -6400,19 +7102,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6400 * gets dynamically allocated. 7102 * gets dynamically allocated.
6401 */ 7103 */
6402static DEFINE_PER_CPU(struct sched_domain, node_domains); 7104static DEFINE_PER_CPU(struct sched_domain, node_domains);
6403static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 7105static struct sched_group ***sched_group_nodes_bycpu;
6404 7106
6405static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7107static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6406static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7108static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6407 7109
6408static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7110static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6409 struct sched_group **sg) 7111 struct sched_group **sg, cpumask_t *nodemask)
6410{ 7112{
6411 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6412 int group; 7113 int group;
6413 7114
6414 cpus_and(nodemask, nodemask, *cpu_map); 7115 *nodemask = node_to_cpumask(cpu_to_node(cpu));
6415 group = first_cpu(nodemask); 7116 cpus_and(*nodemask, *nodemask, *cpu_map);
7117 group = first_cpu(*nodemask);
6416 7118
6417 if (sg) 7119 if (sg)
6418 *sg = &per_cpu(sched_group_allnodes, group); 7120 *sg = &per_cpu(sched_group_allnodes, group);
@@ -6448,7 +7150,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6448 7150
6449#ifdef CONFIG_NUMA 7151#ifdef CONFIG_NUMA
6450/* Free memory allocated for various sched_group structures */ 7152/* Free memory allocated for various sched_group structures */
6451static void free_sched_groups(const cpumask_t *cpu_map) 7153static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6452{ 7154{
6453 int cpu, i; 7155 int cpu, i;
6454 7156
@@ -6460,11 +7162,11 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6460 continue; 7162 continue;
6461 7163
6462 for (i = 0; i < MAX_NUMNODES; i++) { 7164 for (i = 0; i < MAX_NUMNODES; i++) {
6463 cpumask_t nodemask = node_to_cpumask(i);
6464 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7165 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6465 7166
6466 cpus_and(nodemask, nodemask, *cpu_map); 7167 *nodemask = node_to_cpumask(i);
6467 if (cpus_empty(nodemask)) 7168 cpus_and(*nodemask, *nodemask, *cpu_map);
7169 if (cpus_empty(*nodemask))
6468 continue; 7170 continue;
6469 7171
6470 if (sg == NULL) 7172 if (sg == NULL)
@@ -6482,7 +7184,7 @@ next_sg:
6482 } 7184 }
6483} 7185}
6484#else 7186#else
6485static void free_sched_groups(const cpumask_t *cpu_map) 7187static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6486{ 7188{
6487} 7189}
6488#endif 7190#endif
@@ -6540,13 +7242,106 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6540} 7242}
6541 7243
6542/* 7244/*
7245 * Initializers for schedule domains
7246 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7247 */
7248
7249#define SD_INIT(sd, type) sd_init_##type(sd)
7250#define SD_INIT_FUNC(type) \
7251static noinline void sd_init_##type(struct sched_domain *sd) \
7252{ \
7253 memset(sd, 0, sizeof(*sd)); \
7254 *sd = SD_##type##_INIT; \
7255 sd->level = SD_LV_##type; \
7256}
7257
7258SD_INIT_FUNC(CPU)
7259#ifdef CONFIG_NUMA
7260 SD_INIT_FUNC(ALLNODES)
7261 SD_INIT_FUNC(NODE)
7262#endif
7263#ifdef CONFIG_SCHED_SMT
7264 SD_INIT_FUNC(SIBLING)
7265#endif
7266#ifdef CONFIG_SCHED_MC
7267 SD_INIT_FUNC(MC)
7268#endif
7269
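The SD_INIT_FUNC() block just added generates one small, deliberately non-inlined initializer per domain level instead of assigning the large SD_*_INIT initializers inline in build_sched_domains(), which keeps those temporaries from piling up in a single stack frame. A toy version of the same token-pasting pattern; struct domain, the D_*_INIT defaults and the level names are invented for the example.

#include <stdio.h>
#include <string.h>

struct domain { int level; int flags; };

/* Invented per-level defaults standing in for SD_CPU_INIT and friends. */
#define D_CPU_INIT      (struct domain){ .flags = 0x01 }
#define D_NODE_INIT     (struct domain){ .flags = 0x02 }

enum { LV_CPU, LV_NODE };

#define D_INIT(d, type) d_init_##type(d)
#define D_INIT_FUNC(type, lv)                                           \
static __attribute__((noinline)) void d_init_##type(struct domain *d)  \
{                                                                       \
        memset(d, 0, sizeof(*d));                                       \
        *d = D_##type##_INIT;                                           \
        d->level = lv;                                                  \
}

D_INIT_FUNC(CPU, LV_CPU)
D_INIT_FUNC(NODE, LV_NODE)

int main(void)
{
        struct domain d;

        D_INIT(&d, NODE);
        printf("level=%d flags=0x%x\n", d.level, d.flags);
        return 0;
}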
7270/*
7271 * To minimize stack usage kmalloc room for cpumasks and share the
7272 * space as the usage in build_sched_domains() dictates. Used only
7273 * if the amount of space is significant.
7274 */
7275struct allmasks {
7276 cpumask_t tmpmask; /* make this one first */
7277 union {
7278 cpumask_t nodemask;
7279 cpumask_t this_sibling_map;
7280 cpumask_t this_core_map;
7281 };
7282 cpumask_t send_covered;
7283
7284#ifdef CONFIG_NUMA
7285 cpumask_t domainspan;
7286 cpumask_t covered;
7287 cpumask_t notcovered;
7288#endif
7289};
7290
7291#if NR_CPUS > 128
7292#define SCHED_CPUMASK_ALLOC 1
7293#define SCHED_CPUMASK_FREE(v) kfree(v)
7294#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7295#else
7296#define SCHED_CPUMASK_ALLOC 0
7297#define SCHED_CPUMASK_FREE(v)
7298#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7299#endif
7300
7301#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7302 ((unsigned long)(a) + offsetof(struct allmasks, v))
7303
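struct allmasks and SCHED_CPUMASK_VAR() above replace a handful of on-stack cpumask_t locals in __build_sched_domains() with named slots carved out of one object, heap-allocated only when NR_CPUS > 128. The trick is plain offsetof() arithmetic on a base pointer; a standalone sketch with invented names (mask_t, struct scratch, SCRATCH_VAR):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { unsigned long bits[64]; } mask_t;      /* big-mask stand-in */

/* One allocation holds every scratch mask the function needs. */
struct scratch {
        mask_t tmpmask;
        mask_t nodemask;
        mask_t send_covered;
};

/* As in SCHED_CPUMASK_VAR(): declare a local pointer aimed at the
 * member of the same name inside the shared allocation. */
#define SCRATCH_VAR(v, a) \
        mask_t *v = (mask_t *)((unsigned long)(a) + offsetof(struct scratch, v))

int main(void)
{
        struct scratch *all = malloc(sizeof(*all));

        if (!all)
                return 1;

        SCRATCH_VAR(tmpmask, all);
        SCRATCH_VAR(nodemask, all);

        printf("tmpmask  at %p\n", (void *)tmpmask);
        printf("nodemask at %p\n", (void *)nodemask);
        free(all);
        return 0;
}

The anonymous union in struct allmasks makes the sharing explicit: nodemask, this_sibling_map and this_core_map are never live at the same time in __build_sched_domains(), so they can occupy one slot.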
7304static int default_relax_domain_level = -1;
7305
7306static int __init setup_relax_domain_level(char *str)
7307{
7308 default_relax_domain_level = simple_strtoul(str, NULL, 0);
7309 return 1;
7310}
7311__setup("relax_domain_level=", setup_relax_domain_level);
7312
7313static void set_domain_attribute(struct sched_domain *sd,
7314 struct sched_domain_attr *attr)
7315{
7316 int request;
7317
7318 if (!attr || attr->relax_domain_level < 0) {
7319 if (default_relax_domain_level < 0)
7320 return;
7321 else
7322 request = default_relax_domain_level;
7323 } else
7324 request = attr->relax_domain_level;
7325 if (request < sd->level) {
7326 /* turn off idle balance on this domain */
7327 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
7328 } else {
7329 /* turn on idle balance on this domain */
7330 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
7331 }
7332}
7333
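relax_domain_level= and set_domain_attribute() above give each domain level a switch for idle/newly-idle balancing: a domain whose level is above the requested value loses SD_WAKE_IDLE and SD_BALANCE_NEWIDLE, the rest gain SD_WAKE_IDLE_FAR and SD_BALANCE_NEWIDLE, and a NULL or negative attribute falls back to the boot-parameter default. A standalone sketch of just that flag logic; the F_* values and struct dom are invented.

#include <stdio.h>
#include <stdlib.h>

#define F_WAKE_IDLE             0x1
#define F_WAKE_IDLE_FAR         0x2
#define F_BALANCE_NEWIDLE       0x4

struct dom { int level; unsigned int flags; };

/* Same shape as set_domain_attribute(): a domain whose level exceeds the
 * requested relax level has idle balancing switched off, the rest on. */
static void set_attr(struct dom *d, int request)
{
        if (request < d->level)
                d->flags &= ~(F_WAKE_IDLE | F_BALANCE_NEWIDLE);
        else
                d->flags |= (F_WAKE_IDLE_FAR | F_BALANCE_NEWIDLE);
}

int main(int argc, char **argv)
{
        /* stand-in for parsing the relax_domain_level= boot parameter */
        int request = argc > 1 ? (int)strtoul(argv[1], NULL, 0) : 1;
        struct dom doms[] = { { 0, 0 }, { 1, 0 }, { 2, 0 } };
        int i;

        for (i = 0; i < 3; i++) {
                set_attr(&doms[i], request);
                printf("level %d -> flags 0x%x\n", doms[i].level, doms[i].flags);
        }
        return 0;
}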
7334/*
6543 * Build sched domains for a given set of cpus and attach the sched domains 7335 * Build sched domains for a given set of cpus and attach the sched domains
6544 * to the individual cpus 7336 * to the individual cpus
6545 */ 7337 */
6546static int build_sched_domains(const cpumask_t *cpu_map) 7338static int __build_sched_domains(const cpumask_t *cpu_map,
7339 struct sched_domain_attr *attr)
6547{ 7340{
6548 int i; 7341 int i;
6549 struct root_domain *rd; 7342 struct root_domain *rd;
7343 SCHED_CPUMASK_DECLARE(allmasks);
7344 cpumask_t *tmpmask;
6550#ifdef CONFIG_NUMA 7345#ifdef CONFIG_NUMA
6551 struct sched_group **sched_group_nodes = NULL; 7346 struct sched_group **sched_group_nodes = NULL;
6552 int sd_allnodes = 0; 7347 int sd_allnodes = 0;
@@ -6560,39 +7355,65 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6560 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7355 printk(KERN_WARNING "Can not alloc sched group node list\n");
6561 return -ENOMEM; 7356 return -ENOMEM;
6562 } 7357 }
6563 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6564#endif 7358#endif
6565 7359
6566 rd = alloc_rootdomain(); 7360 rd = alloc_rootdomain();
6567 if (!rd) { 7361 if (!rd) {
6568 printk(KERN_WARNING "Cannot alloc root domain\n"); 7362 printk(KERN_WARNING "Cannot alloc root domain\n");
7363#ifdef CONFIG_NUMA
7364 kfree(sched_group_nodes);
7365#endif
7366 return -ENOMEM;
7367 }
7368
7369#if SCHED_CPUMASK_ALLOC
7370 /* get space for all scratch cpumask variables */
7371 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7372 if (!allmasks) {
7373 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7374 kfree(rd);
7375#ifdef CONFIG_NUMA
7376 kfree(sched_group_nodes);
7377#endif
6569 return -ENOMEM; 7378 return -ENOMEM;
6570 } 7379 }
7380#endif
7381 tmpmask = (cpumask_t *)allmasks;
7382
7383
7384#ifdef CONFIG_NUMA
7385 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7386#endif
6571 7387
6572 /* 7388 /*
6573 * Set up domains for cpus specified by the cpu_map. 7389 * Set up domains for cpus specified by the cpu_map.
6574 */ 7390 */
6575 for_each_cpu_mask(i, *cpu_map) { 7391 for_each_cpu_mask(i, *cpu_map) {
6576 struct sched_domain *sd = NULL, *p; 7392 struct sched_domain *sd = NULL, *p;
6577 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 7393 SCHED_CPUMASK_VAR(nodemask, allmasks);
6578 7394
6579 cpus_and(nodemask, nodemask, *cpu_map); 7395 *nodemask = node_to_cpumask(cpu_to_node(i));
7396 cpus_and(*nodemask, *nodemask, *cpu_map);
6580 7397
6581#ifdef CONFIG_NUMA 7398#ifdef CONFIG_NUMA
6582 if (cpus_weight(*cpu_map) > 7399 if (cpus_weight(*cpu_map) >
6583 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 7400 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
6584 sd = &per_cpu(allnodes_domains, i); 7401 sd = &per_cpu(allnodes_domains, i);
6585 *sd = SD_ALLNODES_INIT; 7402 SD_INIT(sd, ALLNODES);
7403 set_domain_attribute(sd, attr);
6586 sd->span = *cpu_map; 7404 sd->span = *cpu_map;
6587 cpu_to_allnodes_group(i, cpu_map, &sd->groups); 7405 sd->first_cpu = first_cpu(sd->span);
7406 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
6588 p = sd; 7407 p = sd;
6589 sd_allnodes = 1; 7408 sd_allnodes = 1;
6590 } else 7409 } else
6591 p = NULL; 7410 p = NULL;
6592 7411
6593 sd = &per_cpu(node_domains, i); 7412 sd = &per_cpu(node_domains, i);
6594 *sd = SD_NODE_INIT; 7413 SD_INIT(sd, NODE);
6595 sd->span = sched_domain_node_span(cpu_to_node(i)); 7414 set_domain_attribute(sd, attr);
7415 sched_domain_node_span(cpu_to_node(i), &sd->span);
7416 sd->first_cpu = first_cpu(sd->span);
6596 sd->parent = p; 7417 sd->parent = p;
6597 if (p) 7418 if (p)
6598 p->child = sd; 7419 p->child = sd;
@@ -6601,94 +7422,120 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6601 7422
6602 p = sd; 7423 p = sd;
6603 sd = &per_cpu(phys_domains, i); 7424 sd = &per_cpu(phys_domains, i);
6604 *sd = SD_CPU_INIT; 7425 SD_INIT(sd, CPU);
6605 sd->span = nodemask; 7426 set_domain_attribute(sd, attr);
7427 sd->span = *nodemask;
7428 sd->first_cpu = first_cpu(sd->span);
6606 sd->parent = p; 7429 sd->parent = p;
6607 if (p) 7430 if (p)
6608 p->child = sd; 7431 p->child = sd;
6609 cpu_to_phys_group(i, cpu_map, &sd->groups); 7432 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
6610 7433
6611#ifdef CONFIG_SCHED_MC 7434#ifdef CONFIG_SCHED_MC
6612 p = sd; 7435 p = sd;
6613 sd = &per_cpu(core_domains, i); 7436 sd = &per_cpu(core_domains, i);
6614 *sd = SD_MC_INIT; 7437 SD_INIT(sd, MC);
7438 set_domain_attribute(sd, attr);
6615 sd->span = cpu_coregroup_map(i); 7439 sd->span = cpu_coregroup_map(i);
7440 sd->first_cpu = first_cpu(sd->span);
6616 cpus_and(sd->span, sd->span, *cpu_map); 7441 cpus_and(sd->span, sd->span, *cpu_map);
6617 sd->parent = p; 7442 sd->parent = p;
6618 p->child = sd; 7443 p->child = sd;
6619 cpu_to_core_group(i, cpu_map, &sd->groups); 7444 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
6620#endif 7445#endif
6621 7446
6622#ifdef CONFIG_SCHED_SMT 7447#ifdef CONFIG_SCHED_SMT
6623 p = sd; 7448 p = sd;
6624 sd = &per_cpu(cpu_domains, i); 7449 sd = &per_cpu(cpu_domains, i);
6625 *sd = SD_SIBLING_INIT; 7450 SD_INIT(sd, SIBLING);
7451 set_domain_attribute(sd, attr);
6626 sd->span = per_cpu(cpu_sibling_map, i); 7452 sd->span = per_cpu(cpu_sibling_map, i);
7453 sd->first_cpu = first_cpu(sd->span);
6627 cpus_and(sd->span, sd->span, *cpu_map); 7454 cpus_and(sd->span, sd->span, *cpu_map);
6628 sd->parent = p; 7455 sd->parent = p;
6629 p->child = sd; 7456 p->child = sd;
6630 cpu_to_cpu_group(i, cpu_map, &sd->groups); 7457 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
6631#endif 7458#endif
6632 } 7459 }
6633 7460
6634#ifdef CONFIG_SCHED_SMT 7461#ifdef CONFIG_SCHED_SMT
6635 /* Set up CPU (sibling) groups */ 7462 /* Set up CPU (sibling) groups */
6636 for_each_cpu_mask(i, *cpu_map) { 7463 for_each_cpu_mask(i, *cpu_map) {
6637 cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); 7464 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
6638 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 7465 SCHED_CPUMASK_VAR(send_covered, allmasks);
6639 if (i != first_cpu(this_sibling_map)) 7466
7467 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7468 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7469 if (i != first_cpu(*this_sibling_map))
6640 continue; 7470 continue;
6641 7471
6642 init_sched_build_groups(this_sibling_map, cpu_map, 7472 init_sched_build_groups(this_sibling_map, cpu_map,
6643 &cpu_to_cpu_group); 7473 &cpu_to_cpu_group,
7474 send_covered, tmpmask);
6644 } 7475 }
6645#endif 7476#endif
6646 7477
6647#ifdef CONFIG_SCHED_MC 7478#ifdef CONFIG_SCHED_MC
6648 /* Set up multi-core groups */ 7479 /* Set up multi-core groups */
6649 for_each_cpu_mask(i, *cpu_map) { 7480 for_each_cpu_mask(i, *cpu_map) {
6650 cpumask_t this_core_map = cpu_coregroup_map(i); 7481 SCHED_CPUMASK_VAR(this_core_map, allmasks);
6651 cpus_and(this_core_map, this_core_map, *cpu_map); 7482 SCHED_CPUMASK_VAR(send_covered, allmasks);
6652 if (i != first_cpu(this_core_map)) 7483
7484 *this_core_map = cpu_coregroup_map(i);
7485 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7486 if (i != first_cpu(*this_core_map))
6653 continue; 7487 continue;
7488
6654 init_sched_build_groups(this_core_map, cpu_map, 7489 init_sched_build_groups(this_core_map, cpu_map,
6655 &cpu_to_core_group); 7490 &cpu_to_core_group,
7491 send_covered, tmpmask);
6656 } 7492 }
6657#endif 7493#endif
6658 7494
6659 /* Set up physical groups */ 7495 /* Set up physical groups */
6660 for (i = 0; i < MAX_NUMNODES; i++) { 7496 for (i = 0; i < MAX_NUMNODES; i++) {
6661 cpumask_t nodemask = node_to_cpumask(i); 7497 SCHED_CPUMASK_VAR(nodemask, allmasks);
7498 SCHED_CPUMASK_VAR(send_covered, allmasks);
6662 7499
6663 cpus_and(nodemask, nodemask, *cpu_map); 7500 *nodemask = node_to_cpumask(i);
6664 if (cpus_empty(nodemask)) 7501 cpus_and(*nodemask, *nodemask, *cpu_map);
7502 if (cpus_empty(*nodemask))
6665 continue; 7503 continue;
6666 7504
6667 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); 7505 init_sched_build_groups(nodemask, cpu_map,
7506 &cpu_to_phys_group,
7507 send_covered, tmpmask);
6668 } 7508 }
6669 7509
6670#ifdef CONFIG_NUMA 7510#ifdef CONFIG_NUMA
6671 /* Set up node groups */ 7511 /* Set up node groups */
6672 if (sd_allnodes) 7512 if (sd_allnodes) {
6673 init_sched_build_groups(*cpu_map, cpu_map, 7513 SCHED_CPUMASK_VAR(send_covered, allmasks);
6674 &cpu_to_allnodes_group); 7514
7515 init_sched_build_groups(cpu_map, cpu_map,
7516 &cpu_to_allnodes_group,
7517 send_covered, tmpmask);
7518 }
6675 7519
6676 for (i = 0; i < MAX_NUMNODES; i++) { 7520 for (i = 0; i < MAX_NUMNODES; i++) {
6677 /* Set up node groups */ 7521 /* Set up node groups */
6678 struct sched_group *sg, *prev; 7522 struct sched_group *sg, *prev;
6679 cpumask_t nodemask = node_to_cpumask(i); 7523 SCHED_CPUMASK_VAR(nodemask, allmasks);
6680 cpumask_t domainspan; 7524 SCHED_CPUMASK_VAR(domainspan, allmasks);
6681 cpumask_t covered = CPU_MASK_NONE; 7525 SCHED_CPUMASK_VAR(covered, allmasks);
6682 int j; 7526 int j;
6683 7527
6684 cpus_and(nodemask, nodemask, *cpu_map); 7528 *nodemask = node_to_cpumask(i);
6685 if (cpus_empty(nodemask)) { 7529 cpus_clear(*covered);
7530
7531 cpus_and(*nodemask, *nodemask, *cpu_map);
7532 if (cpus_empty(*nodemask)) {
6686 sched_group_nodes[i] = NULL; 7533 sched_group_nodes[i] = NULL;
6687 continue; 7534 continue;
6688 } 7535 }
6689 7536
6690 domainspan = sched_domain_node_span(i); 7537 sched_domain_node_span(i, domainspan);
6691 cpus_and(domainspan, domainspan, *cpu_map); 7538 cpus_and(*domainspan, *domainspan, *cpu_map);
6692 7539
6693 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7540 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6694 if (!sg) { 7541 if (!sg) {
@@ -6697,31 +7544,31 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6697 goto error; 7544 goto error;
6698 } 7545 }
6699 sched_group_nodes[i] = sg; 7546 sched_group_nodes[i] = sg;
6700 for_each_cpu_mask(j, nodemask) { 7547 for_each_cpu_mask(j, *nodemask) {
6701 struct sched_domain *sd; 7548 struct sched_domain *sd;
6702 7549
6703 sd = &per_cpu(node_domains, j); 7550 sd = &per_cpu(node_domains, j);
6704 sd->groups = sg; 7551 sd->groups = sg;
6705 } 7552 }
6706 sg->__cpu_power = 0; 7553 sg->__cpu_power = 0;
6707 sg->cpumask = nodemask; 7554 sg->cpumask = *nodemask;
6708 sg->next = sg; 7555 sg->next = sg;
6709 cpus_or(covered, covered, nodemask); 7556 cpus_or(*covered, *covered, *nodemask);
6710 prev = sg; 7557 prev = sg;
6711 7558
6712 for (j = 0; j < MAX_NUMNODES; j++) { 7559 for (j = 0; j < MAX_NUMNODES; j++) {
6713 cpumask_t tmp, notcovered; 7560 SCHED_CPUMASK_VAR(notcovered, allmasks);
6714 int n = (i + j) % MAX_NUMNODES; 7561 int n = (i + j) % MAX_NUMNODES;
7562 node_to_cpumask_ptr(pnodemask, n);
6715 7563
6716 cpus_complement(notcovered, covered); 7564 cpus_complement(*notcovered, *covered);
6717 cpus_and(tmp, notcovered, *cpu_map); 7565 cpus_and(*tmpmask, *notcovered, *cpu_map);
6718 cpus_and(tmp, tmp, domainspan); 7566 cpus_and(*tmpmask, *tmpmask, *domainspan);
6719 if (cpus_empty(tmp)) 7567 if (cpus_empty(*tmpmask))
6720 break; 7568 break;
6721 7569
6722 nodemask = node_to_cpumask(n); 7570 cpus_and(*tmpmask, *tmpmask, *pnodemask);
6723 cpus_and(tmp, tmp, nodemask); 7571 if (cpus_empty(*tmpmask))
6724 if (cpus_empty(tmp))
6725 continue; 7572 continue;
6726 7573
6727 sg = kmalloc_node(sizeof(struct sched_group), 7574 sg = kmalloc_node(sizeof(struct sched_group),
@@ -6732,9 +7579,9 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6732 goto error; 7579 goto error;
6733 } 7580 }
6734 sg->__cpu_power = 0; 7581 sg->__cpu_power = 0;
6735 sg->cpumask = tmp; 7582 sg->cpumask = *tmpmask;
6736 sg->next = prev->next; 7583 sg->next = prev->next;
6737 cpus_or(covered, covered, tmp); 7584 cpus_or(*covered, *covered, *tmpmask);
6738 prev->next = sg; 7585 prev->next = sg;
6739 prev = sg; 7586 prev = sg;
6740 } 7587 }
@@ -6770,7 +7617,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6770 if (sd_allnodes) { 7617 if (sd_allnodes) {
6771 struct sched_group *sg; 7618 struct sched_group *sg;
6772 7619
6773 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 7620 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
7621 tmpmask);
6774 init_numa_sched_groups_power(sg); 7622 init_numa_sched_groups_power(sg);
6775 } 7623 }
6776#endif 7624#endif
@@ -6788,17 +7636,26 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6788 cpu_attach_domain(sd, rd, i); 7636 cpu_attach_domain(sd, rd, i);
6789 } 7637 }
6790 7638
7639 SCHED_CPUMASK_FREE((void *)allmasks);
6791 return 0; 7640 return 0;
6792 7641
6793#ifdef CONFIG_NUMA 7642#ifdef CONFIG_NUMA
6794error: 7643error:
6795 free_sched_groups(cpu_map); 7644 free_sched_groups(cpu_map, tmpmask);
7645 SCHED_CPUMASK_FREE((void *)allmasks);
6796 return -ENOMEM; 7646 return -ENOMEM;
6797#endif 7647#endif
6798} 7648}
6799 7649
7650static int build_sched_domains(const cpumask_t *cpu_map)
7651{
7652 return __build_sched_domains(cpu_map, NULL);
7653}
7654
6800static cpumask_t *doms_cur; /* current sched domains */ 7655static cpumask_t *doms_cur; /* current sched domains */
6801static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7656static int ndoms_cur; /* number of sched domains in 'doms_cur' */
 7657static struct sched_domain_attr *dattr_cur; /* attributes of custom domains
7658 in 'doms_cur' */
6802 7659
6803/* 7660/*
6804 * Special case: If a kmalloc of a doms_cur partition (array of 7661 * Special case: If a kmalloc of a doms_cur partition (array of
@@ -6826,15 +7683,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
6826 if (!doms_cur) 7683 if (!doms_cur)
6827 doms_cur = &fallback_doms; 7684 doms_cur = &fallback_doms;
6828 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7685 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
7686 dattr_cur = NULL;
6829 err = build_sched_domains(doms_cur); 7687 err = build_sched_domains(doms_cur);
6830 register_sched_domain_sysctl(); 7688 register_sched_domain_sysctl();
6831 7689
6832 return err; 7690 return err;
6833} 7691}
6834 7692
6835static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 7693static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7694 cpumask_t *tmpmask)
6836{ 7695{
6837 free_sched_groups(cpu_map); 7696 free_sched_groups(cpu_map, tmpmask);
6838} 7697}
6839 7698
6840/* 7699/*
@@ -6843,6 +7702,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6843 */ 7702 */
6844static void detach_destroy_domains(const cpumask_t *cpu_map) 7703static void detach_destroy_domains(const cpumask_t *cpu_map)
6845{ 7704{
7705 cpumask_t tmpmask;
6846 int i; 7706 int i;
6847 7707
6848 unregister_sched_domain_sysctl(); 7708 unregister_sched_domain_sysctl();
@@ -6850,7 +7710,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6850 for_each_cpu_mask(i, *cpu_map) 7710 for_each_cpu_mask(i, *cpu_map)
6851 cpu_attach_domain(NULL, &def_root_domain, i); 7711 cpu_attach_domain(NULL, &def_root_domain, i);
6852 synchronize_sched(); 7712 synchronize_sched();
6853 arch_destroy_sched_domains(cpu_map); 7713 arch_destroy_sched_domains(cpu_map, &tmpmask);
7714}
7715
7716/* handle null as "default" */
7717static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7718 struct sched_domain_attr *new, int idx_new)
7719{
7720 struct sched_domain_attr tmp;
7721
7722 /* fast path */
7723 if (!new && !cur)
7724 return 1;
7725
7726 tmp = SD_ATTR_INIT;
7727 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7728 new ? (new + idx_new) : &tmp,
7729 sizeof(struct sched_domain_attr));
6854} 7730}
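dattrs_equal() above treats a NULL attribute array as "all defaults", so partition_sched_domains() can compare an old and a new domain set even when only one side carries explicit attributes: whichever side is NULL is compared through a default-initialized temporary via memcmp(). A standalone sketch of that comparison; struct attr and ATTR_DEFAULT are invented stand-ins for sched_domain_attr and SD_ATTR_INIT.

#include <stdio.h>
#include <string.h>

struct attr { int relax_level; };

#define ATTR_DEFAULT    (struct attr){ .relax_level = -1 }

/* NULL stands for "all defaults": whichever side is NULL is compared
 * through a default-initialized temporary, as dattrs_equal() does. */
static int attrs_equal(struct attr *cur, int idx_cur,
                       struct attr *new, int idx_new)
{
        struct attr tmp = ATTR_DEFAULT;

        if (!new && !cur)
                return 1;

        return !memcmp(cur ? cur + idx_cur : &tmp,
                       new ? new + idx_new : &tmp,
                       sizeof(struct attr));
}

int main(void)
{
        struct attr a[1] = { { .relax_level = -1 } };

        printf("%d\n", attrs_equal(NULL, 0, a, 0));     /* 1: both default   */
        a[0].relax_level = 2;
        printf("%d\n", attrs_equal(NULL, 0, a, 0));     /* 0: now they differ */
        return 0;
}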
6855 7731
6856/* 7732/*
@@ -6874,7 +7750,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6874 * 7750 *
6875 * Call with hotplug lock held 7751 * Call with hotplug lock held
6876 */ 7752 */
6877void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) 7753void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7754 struct sched_domain_attr *dattr_new)
6878{ 7755{
6879 int i, j; 7756 int i, j;
6880 7757
@@ -6887,12 +7764,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6887 ndoms_new = 1; 7764 ndoms_new = 1;
6888 doms_new = &fallback_doms; 7765 doms_new = &fallback_doms;
6889 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7766 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7767 dattr_new = NULL;
6890 } 7768 }
6891 7769
6892 /* Destroy deleted domains */ 7770 /* Destroy deleted domains */
6893 for (i = 0; i < ndoms_cur; i++) { 7771 for (i = 0; i < ndoms_cur; i++) {
6894 for (j = 0; j < ndoms_new; j++) { 7772 for (j = 0; j < ndoms_new; j++) {
6895 if (cpus_equal(doms_cur[i], doms_new[j])) 7773 if (cpus_equal(doms_cur[i], doms_new[j])
7774 && dattrs_equal(dattr_cur, i, dattr_new, j))
6896 goto match1; 7775 goto match1;
6897 } 7776 }
6898 /* no match - a current sched domain not in new doms_new[] */ 7777 /* no match - a current sched domain not in new doms_new[] */
@@ -6904,11 +7783,13 @@ match1:
6904 /* Build new domains */ 7783 /* Build new domains */
6905 for (i = 0; i < ndoms_new; i++) { 7784 for (i = 0; i < ndoms_new; i++) {
6906 for (j = 0; j < ndoms_cur; j++) { 7785 for (j = 0; j < ndoms_cur; j++) {
6907 if (cpus_equal(doms_new[i], doms_cur[j])) 7786 if (cpus_equal(doms_new[i], doms_cur[j])
7787 && dattrs_equal(dattr_new, i, dattr_cur, j))
6908 goto match2; 7788 goto match2;
6909 } 7789 }
6910 /* no match - add a new doms_new */ 7790 /* no match - add a new doms_new */
6911 build_sched_domains(doms_new + i); 7791 __build_sched_domains(doms_new + i,
7792 dattr_new ? dattr_new + i : NULL);
6912match2: 7793match2:
6913 ; 7794 ;
6914 } 7795 }
@@ -6916,7 +7797,9 @@ match2:
6916 /* Remember the new sched domains */ 7797 /* Remember the new sched domains */
6917 if (doms_cur != &fallback_doms) 7798 if (doms_cur != &fallback_doms)
6918 kfree(doms_cur); 7799 kfree(doms_cur);
7800 kfree(dattr_cur); /* kfree(NULL) is safe */
6919 doms_cur = doms_new; 7801 doms_cur = doms_new;
7802 dattr_cur = dattr_new;
6920 ndoms_cur = ndoms_new; 7803 ndoms_cur = ndoms_new;
6921 7804
6922 register_sched_domain_sysctl(); 7805 register_sched_domain_sysctl();
@@ -7043,6 +7926,11 @@ void __init sched_init_smp(void)
7043{ 7926{
7044 cpumask_t non_isolated_cpus; 7927 cpumask_t non_isolated_cpus;
7045 7928
7929#if defined(CONFIG_NUMA)
7930 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7931 GFP_KERNEL);
7932 BUG_ON(sched_group_nodes_bycpu == NULL);
7933#endif
7046 get_online_cpus(); 7934 get_online_cpus();
7047 arch_init_sched_domains(&cpu_online_map); 7935 arch_init_sched_domains(&cpu_online_map);
7048 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7936 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
@@ -7053,7 +7941,7 @@ void __init sched_init_smp(void)
7053 hotcpu_notifier(update_sched_domains, 0); 7941 hotcpu_notifier(update_sched_domains, 0);
7054 7942
7055 /* Move init over to a non-isolated CPU */ 7943 /* Move init over to a non-isolated CPU */
7056 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7944 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
7057 BUG(); 7945 BUG();
7058 sched_init_granularity(); 7946 sched_init_granularity();
7059} 7947}
@@ -7074,6 +7962,7 @@ int in_sched_functions(unsigned long addr)
7074static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7962static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7075{ 7963{
7076 cfs_rq->tasks_timeline = RB_ROOT; 7964 cfs_rq->tasks_timeline = RB_ROOT;
7965 INIT_LIST_HEAD(&cfs_rq->tasks);
7077#ifdef CONFIG_FAIR_GROUP_SCHED 7966#ifdef CONFIG_FAIR_GROUP_SCHED
7078 cfs_rq->rq = rq; 7967 cfs_rq->rq = rq;
7079#endif 7968#endif
@@ -7103,6 +7992,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7103 7992
7104 rt_rq->rt_time = 0; 7993 rt_rq->rt_time = 0;
7105 rt_rq->rt_throttled = 0; 7994 rt_rq->rt_throttled = 0;
7995 rt_rq->rt_runtime = 0;
7996 spin_lock_init(&rt_rq->rt_runtime_lock);
7106 7997
7107#ifdef CONFIG_RT_GROUP_SCHED 7998#ifdef CONFIG_RT_GROUP_SCHED
7108 rt_rq->rt_nr_boosted = 0; 7999 rt_rq->rt_nr_boosted = 0;
@@ -7111,10 +8002,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7111} 8002}
7112 8003
7113#ifdef CONFIG_FAIR_GROUP_SCHED 8004#ifdef CONFIG_FAIR_GROUP_SCHED
7114static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, 8005static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7115 struct cfs_rq *cfs_rq, struct sched_entity *se, 8006 struct sched_entity *se, int cpu, int add,
7116 int cpu, int add) 8007 struct sched_entity *parent)
7117{ 8008{
8009 struct rq *rq = cpu_rq(cpu);
7118 tg->cfs_rq[cpu] = cfs_rq; 8010 tg->cfs_rq[cpu] = cfs_rq;
7119 init_cfs_rq(cfs_rq, rq); 8011 init_cfs_rq(cfs_rq, rq);
7120 cfs_rq->tg = tg; 8012 cfs_rq->tg = tg;
@@ -7122,45 +8014,132 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7122 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 8014 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7123 8015
7124 tg->se[cpu] = se; 8016 tg->se[cpu] = se;
7125 se->cfs_rq = &rq->cfs; 8017 /* se could be NULL for init_task_group */
8018 if (!se)
8019 return;
8020
8021 if (!parent)
8022 se->cfs_rq = &rq->cfs;
8023 else
8024 se->cfs_rq = parent->my_q;
8025
7126 se->my_q = cfs_rq; 8026 se->my_q = cfs_rq;
7127 se->load.weight = tg->shares; 8027 se->load.weight = tg->shares;
7128 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 8028 se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight);
7129 se->parent = NULL; 8029 se->parent = parent;
7130} 8030}
7131#endif 8031#endif
7132 8032
7133#ifdef CONFIG_RT_GROUP_SCHED 8033#ifdef CONFIG_RT_GROUP_SCHED
7134static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, 8034static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7135 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 8035 struct sched_rt_entity *rt_se, int cpu, int add,
7136 int cpu, int add) 8036 struct sched_rt_entity *parent)
7137{ 8037{
8038 struct rq *rq = cpu_rq(cpu);
8039
7138 tg->rt_rq[cpu] = rt_rq; 8040 tg->rt_rq[cpu] = rt_rq;
7139 init_rt_rq(rt_rq, rq); 8041 init_rt_rq(rt_rq, rq);
7140 rt_rq->tg = tg; 8042 rt_rq->tg = tg;
7141 rt_rq->rt_se = rt_se; 8043 rt_rq->rt_se = rt_se;
8044 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7142 if (add) 8045 if (add)
7143 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 8046 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7144 8047
7145 tg->rt_se[cpu] = rt_se; 8048 tg->rt_se[cpu] = rt_se;
8049 if (!rt_se)
8050 return;
8051
8052 if (!parent)
8053 rt_se->rt_rq = &rq->rt;
8054 else
8055 rt_se->rt_rq = parent->my_q;
8056
7146 rt_se->rt_rq = &rq->rt; 8057 rt_se->rt_rq = &rq->rt;
7147 rt_se->my_q = rt_rq; 8058 rt_se->my_q = rt_rq;
7148 rt_se->parent = NULL; 8059 rt_se->parent = parent;
7149 INIT_LIST_HEAD(&rt_se->run_list); 8060 INIT_LIST_HEAD(&rt_se->run_list);
7150} 8061}
7151#endif 8062#endif
7152 8063
7153void __init sched_init(void) 8064void __init sched_init(void)
7154{ 8065{
7155 int highest_cpu = 0;
7156 int i, j; 8066 int i, j;
8067 unsigned long alloc_size = 0, ptr;
8068
8069#ifdef CONFIG_FAIR_GROUP_SCHED
8070 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8071#endif
8072#ifdef CONFIG_RT_GROUP_SCHED
8073 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8074#endif
8075#ifdef CONFIG_USER_SCHED
8076 alloc_size *= 2;
8077#endif
8078 /*
8079 * As sched_init() is called before page_alloc is setup,
8080 * we use alloc_bootmem().
8081 */
8082 if (alloc_size) {
8083 ptr = (unsigned long)alloc_bootmem(alloc_size);
8084
8085#ifdef CONFIG_FAIR_GROUP_SCHED
8086 init_task_group.se = (struct sched_entity **)ptr;
8087 ptr += nr_cpu_ids * sizeof(void **);
8088
8089 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
8090 ptr += nr_cpu_ids * sizeof(void **);
8091
8092#ifdef CONFIG_USER_SCHED
8093 root_task_group.se = (struct sched_entity **)ptr;
8094 ptr += nr_cpu_ids * sizeof(void **);
8095
8096 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8097 ptr += nr_cpu_ids * sizeof(void **);
8098#endif
8099#endif
8100#ifdef CONFIG_RT_GROUP_SCHED
8101 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8102 ptr += nr_cpu_ids * sizeof(void **);
8103
8104 init_task_group.rt_rq = (struct rt_rq **)ptr;
8105 ptr += nr_cpu_ids * sizeof(void **);
8106
8107#ifdef CONFIG_USER_SCHED
8108 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8109 ptr += nr_cpu_ids * sizeof(void **);
8110
8111 root_task_group.rt_rq = (struct rt_rq **)ptr;
8112 ptr += nr_cpu_ids * sizeof(void **);
8113#endif
8114#endif
8115 }
7157 8116
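The alloc_size block added to sched_init() above sizes one boot-time allocation for all the per-cpu pointer arrays the group-scheduling code needs (doubled again under CONFIG_USER_SCHED) and then hands out pieces of it by bumping ptr. A standalone sketch of that single-allocation, pointer-bump carve-up using plain malloc(); the array names are invented.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        int nr_ids = 4;                         /* stand-in for nr_cpu_ids */
        size_t alloc_size = 2 * nr_ids * sizeof(void **);
        unsigned long ptr = (unsigned long)malloc(alloc_size);
        void **se_array, **cfs_rq_array;

        if (!ptr)
                return 1;

        /* Carve two per-cpu pointer arrays out of the one allocation,
         * bumping ptr past each, as sched_init() does with bootmem. */
        se_array = (void **)ptr;
        ptr += nr_ids * sizeof(void **);

        cfs_rq_array = (void **)ptr;
        ptr += nr_ids * sizeof(void **);

        printf("se_array=%p cfs_rq_array=%p\n",
               (void *)se_array, (void *)cfs_rq_array);
        free(se_array);
        return 0;
}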
7158#ifdef CONFIG_SMP 8117#ifdef CONFIG_SMP
8118 init_aggregate();
7159 init_defrootdomain(); 8119 init_defrootdomain();
7160#endif 8120#endif
7161 8121
8122 init_rt_bandwidth(&def_rt_bandwidth,
8123 global_rt_period(), global_rt_runtime());
8124
8125#ifdef CONFIG_RT_GROUP_SCHED
8126 init_rt_bandwidth(&init_task_group.rt_bandwidth,
8127 global_rt_period(), global_rt_runtime());
8128#ifdef CONFIG_USER_SCHED
8129 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8130 global_rt_period(), RUNTIME_INF);
8131#endif
8132#endif
8133
7162#ifdef CONFIG_GROUP_SCHED 8134#ifdef CONFIG_GROUP_SCHED
7163 list_add(&init_task_group.list, &task_groups); 8135 list_add(&init_task_group.list, &task_groups);
8136 INIT_LIST_HEAD(&init_task_group.children);
8137
8138#ifdef CONFIG_USER_SCHED
8139 INIT_LIST_HEAD(&root_task_group.children);
8140 init_task_group.parent = &root_task_group;
8141 list_add(&init_task_group.siblings, &root_task_group.children);
8142#endif
7164#endif 8143#endif
7165 8144
7166 for_each_possible_cpu(i) { 8145 for_each_possible_cpu(i) {
@@ -7171,26 +8150,68 @@ void __init sched_init(void)
7171 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8150 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7172 rq->nr_running = 0; 8151 rq->nr_running = 0;
7173 rq->clock = 1; 8152 rq->clock = 1;
8153 update_last_tick_seen(rq);
7174 init_cfs_rq(&rq->cfs, rq); 8154 init_cfs_rq(&rq->cfs, rq);
7175 init_rt_rq(&rq->rt, rq); 8155 init_rt_rq(&rq->rt, rq);
7176#ifdef CONFIG_FAIR_GROUP_SCHED 8156#ifdef CONFIG_FAIR_GROUP_SCHED
7177 init_task_group.shares = init_task_group_load; 8157 init_task_group.shares = init_task_group_load;
7178 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8158 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7179 init_tg_cfs_entry(rq, &init_task_group, 8159#ifdef CONFIG_CGROUP_SCHED
8160 /*
8161 * How much cpu bandwidth does init_task_group get?
8162 *
8163 * In case of task-groups formed thr' the cgroup filesystem, it
8164 * gets 100% of the cpu resources in the system. This overall
8165 * system cpu resource is divided among the tasks of
8166 * init_task_group and its child task-groups in a fair manner,
8167 * based on each entity's (task or task-group's) weight
8168 * (se->load.weight).
8169 *
8170 * In other words, if init_task_group has 10 tasks of weight
8171 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8172 * then A0's share of the cpu resource is:
8173 *
8174 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8175 *
8176 * We achieve this by letting init_task_group's tasks sit
8177 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8178 */
8179 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
8180#elif defined CONFIG_USER_SCHED
8181 root_task_group.shares = NICE_0_LOAD;
8182 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
8183 /*
8184 * In case of task-groups formed thr' the user id of tasks,
8185 * init_task_group represents tasks belonging to root user.
8186 * Hence it forms a sibling of all subsequent groups formed.
8187 * In this case, init_task_group gets only a fraction of overall
8188 * system cpu resource, based on the weight assigned to root
8189 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
8190 * by letting tasks of init_task_group sit in a separate cfs_rq
8191 * (init_cfs_rq) and having one entity represent this group of
8192 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
8193 */
8194 init_tg_cfs_entry(&init_task_group,
7180 &per_cpu(init_cfs_rq, i), 8195 &per_cpu(init_cfs_rq, i),
7181 &per_cpu(init_sched_entity, i), i, 1); 8196 &per_cpu(init_sched_entity, i), i, 1,
8197 root_task_group.se[i]);
7182 8198
7183#endif 8199#endif
8200#endif /* CONFIG_FAIR_GROUP_SCHED */
8201
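The long comment above works through what share of CPU the CONFIG_CGROUP_SCHED case gives a child group when init_task_group's tasks sit directly in rq->cfs: each child group competes as a single entity of its own weight against every root-level task. The quoted figure checks out; a tiny check of the arithmetic:

#include <stdio.h>

int main(void)
{
        /* The comment's example: 10 tasks of weight 1024 plus child
         * groups A0 and A1, each one entity of weight 1024. */
        const double w = 1024.0;
        const double total = 10 * w + w + w;            /* 12288 */

        printf("A0's bandwidth = %.2f%%\n", 100.0 * w / total); /* 8.33% */
        return 0;
}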
8202 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7184#ifdef CONFIG_RT_GROUP_SCHED 8203#ifdef CONFIG_RT_GROUP_SCHED
7185 init_task_group.rt_runtime =
7186 sysctl_sched_rt_runtime * NSEC_PER_USEC;
7187 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8204 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7188 init_tg_rt_entry(rq, &init_task_group, 8205#ifdef CONFIG_CGROUP_SCHED
8206 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8207#elif defined CONFIG_USER_SCHED
8208 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
8209 init_tg_rt_entry(&init_task_group,
7189 &per_cpu(init_rt_rq, i), 8210 &per_cpu(init_rt_rq, i),
7190 &per_cpu(init_sched_rt_entity, i), i, 1); 8211 &per_cpu(init_sched_rt_entity, i), i, 1,
8212 root_task_group.rt_se[i]);
8213#endif
7191#endif 8214#endif
7192 rq->rt_period_expire = 0;
7193 rq->rt_throttled = 0;
7194 8215
7195 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8216 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7196 rq->cpu_load[j] = 0; 8217 rq->cpu_load[j] = 0;
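
The bandwidth comment in the hunk above can be sanity-checked numerically. The following user-space sketch (not kernel code; the weights are the illustrative values from the comment) reproduces the 8.33% figure:

#include <stdio.h>

/* Reproduce the example from the init_task_group comment: 10 tasks of
 * weight 1024 plus two child groups A0 and A1 of weight 1024 each all
 * compete on the same cfs_rq, so A0 gets weight/total of the cpu.
 */
int main(void)
{
        unsigned long ntasks = 10, task_weight = 1024;
        unsigned long a0 = 1024, a1 = 1024;
        unsigned long total = ntasks * task_weight + a0 + a1;

        printf("A0's bandwidth = %lu / %lu = %.2f%%\n",
               a0, total, 100.0 * a0 / total);        /* prints 8.33% */
        return 0;
}
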
@@ -7207,7 +8228,6 @@ void __init sched_init(void)
7207#endif 8228#endif
7208 init_rq_hrtick(rq); 8229 init_rq_hrtick(rq);
7209 atomic_set(&rq->nr_iowait, 0); 8230 atomic_set(&rq->nr_iowait, 0);
7210 highest_cpu = i;
7211 } 8231 }
7212 8232
7213 set_load_weight(&init_task); 8233 set_load_weight(&init_task);
@@ -7217,7 +8237,6 @@ void __init sched_init(void)
7217#endif 8237#endif
7218 8238
7219#ifdef CONFIG_SMP 8239#ifdef CONFIG_SMP
7220 nr_cpu_ids = highest_cpu + 1;
7221 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8240 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
7222#endif 8241#endif
7223 8242
@@ -7376,8 +8395,6 @@ void set_curr_task(int cpu, struct task_struct *p)
7376 8395
7377#endif 8396#endif
7378 8397
7379#ifdef CONFIG_GROUP_SCHED
7380
7381#ifdef CONFIG_FAIR_GROUP_SCHED 8398#ifdef CONFIG_FAIR_GROUP_SCHED
7382static void free_fair_sched_group(struct task_group *tg) 8399static void free_fair_sched_group(struct task_group *tg)
7383{ 8400{
@@ -7394,17 +8411,18 @@ static void free_fair_sched_group(struct task_group *tg)
7394 kfree(tg->se); 8411 kfree(tg->se);
7395} 8412}
7396 8413
7397static int alloc_fair_sched_group(struct task_group *tg) 8414static
8415int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7398{ 8416{
7399 struct cfs_rq *cfs_rq; 8417 struct cfs_rq *cfs_rq;
7400 struct sched_entity *se; 8418 struct sched_entity *se, *parent_se;
7401 struct rq *rq; 8419 struct rq *rq;
7402 int i; 8420 int i;
7403 8421
7404 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); 8422 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7405 if (!tg->cfs_rq) 8423 if (!tg->cfs_rq)
7406 goto err; 8424 goto err;
7407 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 8425 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7408 if (!tg->se) 8426 if (!tg->se)
7409 goto err; 8427 goto err;
7410 8428
@@ -7423,7 +8441,8 @@ static int alloc_fair_sched_group(struct task_group *tg)
7423 if (!se) 8441 if (!se)
7424 goto err; 8442 goto err;
7425 8443
7426 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); 8444 parent_se = parent ? parent->se[i] : NULL;
8445 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
7427 } 8446 }
7428 8447
7429 return 1; 8448 return 1;
@@ -7447,7 +8466,8 @@ static inline void free_fair_sched_group(struct task_group *tg)
7447{ 8466{
7448} 8467}
7449 8468
7450static inline int alloc_fair_sched_group(struct task_group *tg) 8469static inline
8470int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7451{ 8471{
7452 return 1; 8472 return 1;
7453} 8473}
@@ -7466,6 +8486,8 @@ static void free_rt_sched_group(struct task_group *tg)
7466{ 8486{
7467 int i; 8487 int i;
7468 8488
8489 destroy_rt_bandwidth(&tg->rt_bandwidth);
8490
7469 for_each_possible_cpu(i) { 8491 for_each_possible_cpu(i) {
7470 if (tg->rt_rq) 8492 if (tg->rt_rq)
7471 kfree(tg->rt_rq[i]); 8493 kfree(tg->rt_rq[i]);
@@ -7477,21 +8499,23 @@ static void free_rt_sched_group(struct task_group *tg)
7477 kfree(tg->rt_se); 8499 kfree(tg->rt_se);
7478} 8500}
7479 8501
7480static int alloc_rt_sched_group(struct task_group *tg) 8502static
8503int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7481{ 8504{
7482 struct rt_rq *rt_rq; 8505 struct rt_rq *rt_rq;
7483 struct sched_rt_entity *rt_se; 8506 struct sched_rt_entity *rt_se, *parent_se;
7484 struct rq *rq; 8507 struct rq *rq;
7485 int i; 8508 int i;
7486 8509
7487 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); 8510 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
7488 if (!tg->rt_rq) 8511 if (!tg->rt_rq)
7489 goto err; 8512 goto err;
7490 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); 8513 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
7491 if (!tg->rt_se) 8514 if (!tg->rt_se)
7492 goto err; 8515 goto err;
7493 8516
7494 tg->rt_runtime = 0; 8517 init_rt_bandwidth(&tg->rt_bandwidth,
8518 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
7495 8519
7496 for_each_possible_cpu(i) { 8520 for_each_possible_cpu(i) {
7497 rq = cpu_rq(i); 8521 rq = cpu_rq(i);
@@ -7506,7 +8530,8 @@ static int alloc_rt_sched_group(struct task_group *tg)
7506 if (!rt_se) 8530 if (!rt_se)
7507 goto err; 8531 goto err;
7508 8532
7509 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); 8533 parent_se = parent ? parent->rt_se[i] : NULL;
8534 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
7510 } 8535 }
7511 8536
7512 return 1; 8537 return 1;
@@ -7530,7 +8555,8 @@ static inline void free_rt_sched_group(struct task_group *tg)
7530{ 8555{
7531} 8556}
7532 8557
7533static inline int alloc_rt_sched_group(struct task_group *tg) 8558static inline
8559int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7534{ 8560{
7535 return 1; 8561 return 1;
7536} 8562}
@@ -7544,6 +8570,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7544} 8570}
7545#endif 8571#endif
7546 8572
8573#ifdef CONFIG_GROUP_SCHED
7547static void free_sched_group(struct task_group *tg) 8574static void free_sched_group(struct task_group *tg)
7548{ 8575{
7549 free_fair_sched_group(tg); 8576 free_fair_sched_group(tg);
@@ -7552,7 +8579,7 @@ static void free_sched_group(struct task_group *tg)
7552} 8579}
7553 8580
7554/* allocate runqueue etc for a new task group */ 8581/* allocate runqueue etc for a new task group */
7555struct task_group *sched_create_group(void) 8582struct task_group *sched_create_group(struct task_group *parent)
7556{ 8583{
7557 struct task_group *tg; 8584 struct task_group *tg;
7558 unsigned long flags; 8585 unsigned long flags;
@@ -7562,10 +8589,10 @@ struct task_group *sched_create_group(void)
7562 if (!tg) 8589 if (!tg)
7563 return ERR_PTR(-ENOMEM); 8590 return ERR_PTR(-ENOMEM);
7564 8591
7565 if (!alloc_fair_sched_group(tg)) 8592 if (!alloc_fair_sched_group(tg, parent))
7566 goto err; 8593 goto err;
7567 8594
7568 if (!alloc_rt_sched_group(tg)) 8595 if (!alloc_rt_sched_group(tg, parent))
7569 goto err; 8596 goto err;
7570 8597
7571 spin_lock_irqsave(&task_group_lock, flags); 8598 spin_lock_irqsave(&task_group_lock, flags);
@@ -7574,6 +8601,12 @@ struct task_group *sched_create_group(void)
7574 register_rt_sched_group(tg, i); 8601 register_rt_sched_group(tg, i);
7575 } 8602 }
7576 list_add_rcu(&tg->list, &task_groups); 8603 list_add_rcu(&tg->list, &task_groups);
8604
8605 WARN_ON(!parent); /* root should already exist */
8606
8607 tg->parent = parent;
8608 list_add_rcu(&tg->siblings, &parent->children);
8609 INIT_LIST_HEAD(&tg->children);
7577 spin_unlock_irqrestore(&task_group_lock, flags); 8610 spin_unlock_irqrestore(&task_group_lock, flags);
7578 8611
7579 return tg; 8612 return tg;
@@ -7602,6 +8635,7 @@ void sched_destroy_group(struct task_group *tg)
7602 unregister_rt_sched_group(tg, i); 8635 unregister_rt_sched_group(tg, i);
7603 } 8636 }
7604 list_del_rcu(&tg->list); 8637 list_del_rcu(&tg->list);
8638 list_del_rcu(&tg->siblings);
7605 spin_unlock_irqrestore(&task_group_lock, flags); 8639 spin_unlock_irqrestore(&task_group_lock, flags);
7606 8640
7607 /* wait for possible concurrent references to cfs_rqs complete */ 8641 /* wait for possible concurrent references to cfs_rqs complete */
@@ -7645,27 +8679,34 @@ void sched_move_task(struct task_struct *tsk)
7645 8679
7646 task_rq_unlock(rq, &flags); 8680 task_rq_unlock(rq, &flags);
7647} 8681}
8682#endif
7648 8683
7649#ifdef CONFIG_FAIR_GROUP_SCHED 8684#ifdef CONFIG_FAIR_GROUP_SCHED
7650static void set_se_shares(struct sched_entity *se, unsigned long shares) 8685static void __set_se_shares(struct sched_entity *se, unsigned long shares)
7651{ 8686{
7652 struct cfs_rq *cfs_rq = se->cfs_rq; 8687 struct cfs_rq *cfs_rq = se->cfs_rq;
7653 struct rq *rq = cfs_rq->rq;
7654 int on_rq; 8688 int on_rq;
7655 8689
7656 spin_lock_irq(&rq->lock);
7657
7658 on_rq = se->on_rq; 8690 on_rq = se->on_rq;
7659 if (on_rq) 8691 if (on_rq)
7660 dequeue_entity(cfs_rq, se, 0); 8692 dequeue_entity(cfs_rq, se, 0);
7661 8693
7662 se->load.weight = shares; 8694 se->load.weight = shares;
7663 se->load.inv_weight = div64_64((1ULL<<32), shares); 8695 se->load.inv_weight = div64_u64((1ULL<<32), shares);
7664 8696
7665 if (on_rq) 8697 if (on_rq)
7666 enqueue_entity(cfs_rq, se, 0); 8698 enqueue_entity(cfs_rq, se, 0);
8699}
7667 8700
7668 spin_unlock_irq(&rq->lock); 8701static void set_se_shares(struct sched_entity *se, unsigned long shares)
8702{
8703 struct cfs_rq *cfs_rq = se->cfs_rq;
8704 struct rq *rq = cfs_rq->rq;
8705 unsigned long flags;
8706
8707 spin_lock_irqsave(&rq->lock, flags);
8708 __set_se_shares(se, shares);
8709 spin_unlock_irqrestore(&rq->lock, flags);
7669} 8710}
7670 8711
7671static DEFINE_MUTEX(shares_mutex); 8712static DEFINE_MUTEX(shares_mutex);
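
The inv_weight = div64_u64((1ULL<<32), shares) assignment above caches a 32-bit fixed-point reciprocal so that later weight scaling can be done with a multiply and a shift instead of a division on every update. A user-space sketch of that trick (values are arbitrary, and the real calc_delta_mine() adds overflow handling that this sketch omits):

#include <stdio.h>
#include <stdint.h>

/* Store 2^32/weight once, then scale a delta by NICE_0_LOAD/weight
 * using only a multiply and a shift. NICE_0_LOAD is assumed to be 1024.
 */
int main(void)
{
        uint64_t weight = 2048;                       /* se->load.weight */
        uint64_t inv_weight = (1ULL << 32) / weight;  /* se->load.inv_weight */
        uint64_t delta = 3000000;                     /* some delta, in ns */

        uint64_t scaled = (delta * 1024 * inv_weight) >> 32;

        printf("scaled delta = %llu (exact: %llu)\n",
               (unsigned long long)scaled,
               (unsigned long long)(delta * 1024 / weight));
        return 0;
}
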
@@ -7676,12 +8717,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7676 unsigned long flags; 8717 unsigned long flags;
7677 8718
7678 /* 8719 /*
8720 * We can't change the weight of the root cgroup.
8721 */
8722 if (!tg->se[0])
8723 return -EINVAL;
8724
8725 /*
7679 * A weight of 0 or 1 can cause arithmetic problems. 8726
7680 * (The default weight is 1024 - so there's no practical 8727 * (The default weight is 1024 - so there's no practical
7681 * limitation from this.) 8728 * limitation from this.)
7682 */ 8729 */
7683 if (shares < 2) 8730 if (shares < MIN_SHARES)
7684 shares = 2; 8731 shares = MIN_SHARES;
7685 8732
7686 mutex_lock(&shares_mutex); 8733 mutex_lock(&shares_mutex);
7687 if (tg->shares == shares) 8734 if (tg->shares == shares)
@@ -7690,6 +8737,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7690 spin_lock_irqsave(&task_group_lock, flags); 8737 spin_lock_irqsave(&task_group_lock, flags);
7691 for_each_possible_cpu(i) 8738 for_each_possible_cpu(i)
7692 unregister_fair_sched_group(tg, i); 8739 unregister_fair_sched_group(tg, i);
8740 list_del_rcu(&tg->siblings);
7693 spin_unlock_irqrestore(&task_group_lock, flags); 8741 spin_unlock_irqrestore(&task_group_lock, flags);
7694 8742
7695 /* wait for any ongoing reference to this group to finish */ 8743 /* wait for any ongoing reference to this group to finish */
@@ -7700,8 +8748,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7700 * w/o tripping rebalance_share or load_balance_fair. 8748 * w/o tripping rebalance_share or load_balance_fair.
7701 */ 8749 */
7702 tg->shares = shares; 8750 tg->shares = shares;
7703 for_each_possible_cpu(i) 8751 for_each_possible_cpu(i) {
7704 set_se_shares(tg->se[i], shares); 8752 /*
8753 * force a rebalance
8754 */
8755 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8756 set_se_shares(tg->se[i], shares/nr_cpu_ids);
8757 }
7705 8758
7706 /* 8759 /*
7707 * Enable load balance activity on this group, by inserting it back on 8760 * Enable load balance activity on this group, by inserting it back on
@@ -7710,6 +8763,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7710 spin_lock_irqsave(&task_group_lock, flags); 8763 spin_lock_irqsave(&task_group_lock, flags);
7711 for_each_possible_cpu(i) 8764 for_each_possible_cpu(i)
7712 register_fair_sched_group(tg, i); 8765 register_fair_sched_group(tg, i);
8766 list_add_rcu(&tg->siblings, &tg->parent->children);
7713 spin_unlock_irqrestore(&task_group_lock, flags); 8767 spin_unlock_irqrestore(&task_group_lock, flags);
7714done: 8768done:
7715 mutex_unlock(&shares_mutex); 8769 mutex_unlock(&shares_mutex);
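
Note how the loop in the preceding hunk seeds each per-cpu entity with an equal slice, shares/nr_cpu_ids, after zeroing the cfs_rq shares to force a rebalance; the load balancer is then free to shift weight toward busier cpus. A trivial user-space sketch of the initial split (cpu count and shares are made-up values):

#include <stdio.h>

/* Even per-cpu split of a group's shares, as done before the load
 * balancer redistributes them. Both numbers below are illustrative.
 */
int main(void)
{
        unsigned long shares = 2048, nr_cpu_ids = 4;
        unsigned long cpu;

        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                printf("cpu%lu: se->load.weight = %lu\n",
                       cpu, shares / nr_cpu_ids);
        return 0;
}
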
@@ -7733,29 +8787,61 @@ static unsigned long to_ratio(u64 period, u64 runtime)
7733 if (runtime == RUNTIME_INF) 8787 if (runtime == RUNTIME_INF)
7734 return 1ULL << 16; 8788 return 1ULL << 16;
7735 8789
7736 return div64_64(runtime << 16, period); 8790 return div64_u64(runtime << 16, period);
7737} 8791}
7738 8792
8793#ifdef CONFIG_CGROUP_SCHED
8794static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8795{
8796 struct task_group *tgi, *parent = tg->parent;
8797 unsigned long total = 0;
8798
8799 if (!parent) {
8800 if (global_rt_period() < period)
8801 return 0;
8802
8803 return to_ratio(period, runtime) <
8804 to_ratio(global_rt_period(), global_rt_runtime());
8805 }
8806
8807 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
8808 return 0;
8809
8810 rcu_read_lock();
8811 list_for_each_entry_rcu(tgi, &parent->children, siblings) {
8812 if (tgi == tg)
8813 continue;
8814
8815 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8816 tgi->rt_bandwidth.rt_runtime);
8817 }
8818 rcu_read_unlock();
8819
8820 return total + to_ratio(period, runtime) <
8821 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8822 parent->rt_bandwidth.rt_runtime);
8823}
8824#elif defined CONFIG_USER_SCHED
7739static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8825static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7740{ 8826{
7741 struct task_group *tgi; 8827 struct task_group *tgi;
7742 unsigned long total = 0; 8828 unsigned long total = 0;
7743 unsigned long global_ratio = 8829 unsigned long global_ratio =
7744 to_ratio(sysctl_sched_rt_period, 8830 to_ratio(global_rt_period(), global_rt_runtime());
7745 sysctl_sched_rt_runtime < 0 ?
7746 RUNTIME_INF : sysctl_sched_rt_runtime);
7747 8831
7748 rcu_read_lock(); 8832 rcu_read_lock();
7749 list_for_each_entry_rcu(tgi, &task_groups, list) { 8833 list_for_each_entry_rcu(tgi, &task_groups, list) {
7750 if (tgi == tg) 8834 if (tgi == tg)
7751 continue; 8835 continue;
7752 8836
7753 total += to_ratio(period, tgi->rt_runtime); 8837 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8838 tgi->rt_bandwidth.rt_runtime);
7754 } 8839 }
7755 rcu_read_unlock(); 8840 rcu_read_unlock();
7756 8841
7757 return total + to_ratio(period, runtime) < global_ratio; 8842 return total + to_ratio(period, runtime) < global_ratio;
7758} 8843}
8844#endif
7759 8845
7760/* Must be called with tasklist_lock held */ 8846/* Must be called with tasklist_lock held */
7761static inline int tg_has_rt_tasks(struct task_group *tg) 8847static inline int tg_has_rt_tasks(struct task_group *tg)
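
to_ratio() above expresses runtime/period as a 16.16 fixed-point fraction, and the new cgroup-hierarchy __rt_schedulable() admits a group only if the sum of its siblings' ratios plus its own stays below the parent's ratio. A user-space sketch with made-up periods and runtimes in nanoseconds (the RUNTIME_INF special case is left out):

#include <stdio.h>
#include <stdint.h>

/* 16.16 fixed-point fraction of the period that may be spent running */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
        return (runtime << 16) / period;
}

int main(void)
{
        uint64_t parent  = to_ratio(1000000000ULL, 950000000ULL);  /* 95% */
        uint64_t sibling = to_ratio(1000000000ULL, 400000000ULL);  /* 40% */
        uint64_t new_grp = to_ratio(1000000000ULL, 400000000ULL);  /* 40% */

        printf("parent=%llu children=%llu -> %s\n",
               (unsigned long long)parent,
               (unsigned long long)(sibling + new_grp),
               sibling + new_grp < parent ? "schedulable" : "rejected");
        return 0;
}
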
@@ -7768,19 +8854,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7768 return 0; 8854 return 0;
7769} 8855}
7770 8856
7771int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8857static int tg_set_bandwidth(struct task_group *tg,
8858 u64 rt_period, u64 rt_runtime)
7772{ 8859{
7773 u64 rt_runtime, rt_period; 8860 int i, err = 0;
7774 int err = 0;
7775
7776 rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
7777 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7778 if (rt_runtime_us == -1)
7779 rt_runtime = RUNTIME_INF;
7780 8861
7781 mutex_lock(&rt_constraints_mutex); 8862 mutex_lock(&rt_constraints_mutex);
7782 read_lock(&tasklist_lock); 8863 read_lock(&tasklist_lock);
7783 if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { 8864 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
7784 err = -EBUSY; 8865 err = -EBUSY;
7785 goto unlock; 8866 goto unlock;
7786 } 8867 }
@@ -7788,7 +8869,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7788 err = -EINVAL; 8869 err = -EINVAL;
7789 goto unlock; 8870 goto unlock;
7790 } 8871 }
7791 tg->rt_runtime = rt_runtime; 8872
8873 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8874 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8875 tg->rt_bandwidth.rt_runtime = rt_runtime;
8876
8877 for_each_possible_cpu(i) {
8878 struct rt_rq *rt_rq = tg->rt_rq[i];
8879
8880 spin_lock(&rt_rq->rt_runtime_lock);
8881 rt_rq->rt_runtime = rt_runtime;
8882 spin_unlock(&rt_rq->rt_runtime_lock);
8883 }
8884 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7792 unlock: 8885 unlock:
7793 read_unlock(&tasklist_lock); 8886 read_unlock(&tasklist_lock);
7794 mutex_unlock(&rt_constraints_mutex); 8887 mutex_unlock(&rt_constraints_mutex);
@@ -7796,19 +8889,109 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7796 return err; 8889 return err;
7797} 8890}
7798 8891
8892int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8893{
8894 u64 rt_runtime, rt_period;
8895
8896 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8897 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8898 if (rt_runtime_us < 0)
8899 rt_runtime = RUNTIME_INF;
8900
8901 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8902}
8903
7799long sched_group_rt_runtime(struct task_group *tg) 8904long sched_group_rt_runtime(struct task_group *tg)
7800{ 8905{
7801 u64 rt_runtime_us; 8906 u64 rt_runtime_us;
7802 8907
7803 if (tg->rt_runtime == RUNTIME_INF) 8908 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7804 return -1; 8909 return -1;
7805 8910
7806 rt_runtime_us = tg->rt_runtime; 8911 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7807 do_div(rt_runtime_us, NSEC_PER_USEC); 8912 do_div(rt_runtime_us, NSEC_PER_USEC);
7808 return rt_runtime_us; 8913 return rt_runtime_us;
7809} 8914}
8915
8916int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8917{
8918 u64 rt_runtime, rt_period;
8919
8920 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8921 rt_runtime = tg->rt_bandwidth.rt_runtime;
8922
8923 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8924}
8925
8926long sched_group_rt_period(struct task_group *tg)
8927{
8928 u64 rt_period_us;
8929
8930 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8931 do_div(rt_period_us, NSEC_PER_USEC);
8932 return rt_period_us;
8933}
8934
8935static int sched_rt_global_constraints(void)
8936{
8937 int ret = 0;
8938
8939 mutex_lock(&rt_constraints_mutex);
8940 if (!__rt_schedulable(NULL, 1, 0))
8941 ret = -EINVAL;
8942 mutex_unlock(&rt_constraints_mutex);
8943
8944 return ret;
8945}
8946#else
8947static int sched_rt_global_constraints(void)
8948{
8949 unsigned long flags;
8950 int i;
8951
8952 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8953 for_each_possible_cpu(i) {
8954 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8955
8956 spin_lock(&rt_rq->rt_runtime_lock);
8957 rt_rq->rt_runtime = global_rt_runtime();
8958 spin_unlock(&rt_rq->rt_runtime_lock);
8959 }
8960 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8961
8962 return 0;
8963}
7810#endif 8964#endif
7811#endif /* CONFIG_GROUP_SCHED */ 8965
8966int sched_rt_handler(struct ctl_table *table, int write,
8967 struct file *filp, void __user *buffer, size_t *lenp,
8968 loff_t *ppos)
8969{
8970 int ret;
8971 int old_period, old_runtime;
8972 static DEFINE_MUTEX(mutex);
8973
8974 mutex_lock(&mutex);
8975 old_period = sysctl_sched_rt_period;
8976 old_runtime = sysctl_sched_rt_runtime;
8977
8978 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
8979
8980 if (!ret && write) {
8981 ret = sched_rt_global_constraints();
8982 if (ret) {
8983 sysctl_sched_rt_period = old_period;
8984 sysctl_sched_rt_runtime = old_runtime;
8985 } else {
8986 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8987 def_rt_bandwidth.rt_period =
8988 ns_to_ktime(global_rt_period());
8989 }
8990 }
8991 mutex_unlock(&mutex);
8992
8993 return ret;
8994}
7812 8995
7813#ifdef CONFIG_CGROUP_SCHED 8996#ifdef CONFIG_CGROUP_SCHED
7814 8997
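
sched_group_set_rt_runtime() and sched_group_rt_runtime() above translate between the microsecond values exposed through the cgroup files and the nanosecond values kept in the rt_bandwidth structure, with any negative input mapping to RUNTIME_INF (unlimited). A user-space sketch of that round trip (the sample value is arbitrary):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC   1000ULL
#define RUNTIME_INF     ((uint64_t)~0ULL)

/* Store in ns, report back in us, and map -1 to "unlimited" both ways. */
int main(void)
{
        long rt_runtime_us = 950000;            /* what the user writes */
        uint64_t rt_runtime;

        if (rt_runtime_us < 0)
                rt_runtime = RUNTIME_INF;
        else
                rt_runtime = (uint64_t)rt_runtime_us * NSEC_PER_USEC;

        if (rt_runtime == RUNTIME_INF)
                printf("rt_runtime_us = -1\n");
        else
                printf("rt_runtime_us = %llu\n",
                       (unsigned long long)(rt_runtime / NSEC_PER_USEC));
        return 0;
}
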
@@ -7822,7 +9005,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7822static struct cgroup_subsys_state * 9005static struct cgroup_subsys_state *
7823cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 9006cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7824{ 9007{
7825 struct task_group *tg; 9008 struct task_group *tg, *parent;
7826 9009
7827 if (!cgrp->parent) { 9010 if (!cgrp->parent) {
7828 /* This is early initialization for the top cgroup */ 9011 /* This is early initialization for the top cgroup */
@@ -7830,11 +9013,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7830 return &init_task_group.css; 9013 return &init_task_group.css;
7831 } 9014 }
7832 9015
7833 /* we support only 1-level deep hierarchical scheduler atm */ 9016 parent = cgroup_tg(cgrp->parent);
7834 if (cgrp->parent->parent) 9017 tg = sched_create_group(parent);
7835 return ERR_PTR(-EINVAL);
7836
7837 tg = sched_create_group();
7838 if (IS_ERR(tg)) 9018 if (IS_ERR(tg))
7839 return ERR_PTR(-ENOMEM); 9019 return ERR_PTR(-ENOMEM);
7840 9020
@@ -7858,7 +9038,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7858{ 9038{
7859#ifdef CONFIG_RT_GROUP_SCHED 9039#ifdef CONFIG_RT_GROUP_SCHED
7860 /* Don't accept realtime tasks when there is no way for them to run */ 9040 /* Don't accept realtime tasks when there is no way for them to run */
7861 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) 9041 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
7862 return -EINVAL; 9042 return -EINVAL;
7863#else 9043#else
7864 /* We don't support RT-tasks being in separate groups */ 9044 /* We don't support RT-tasks being in separate groups */
@@ -7877,13 +9057,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7877} 9057}
7878 9058
7879#ifdef CONFIG_FAIR_GROUP_SCHED 9059#ifdef CONFIG_FAIR_GROUP_SCHED
7880static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 9060static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7881 u64 shareval) 9061 u64 shareval)
7882{ 9062{
7883 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9063 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
7884} 9064}
7885 9065
7886static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) 9066static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7887{ 9067{
7888 struct task_group *tg = cgroup_tg(cgrp); 9068 struct task_group *tg = cgroup_tg(cgrp);
7889 9069
@@ -7892,49 +9072,26 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7892#endif 9072#endif
7893 9073
7894#ifdef CONFIG_RT_GROUP_SCHED 9074#ifdef CONFIG_RT_GROUP_SCHED
7895static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9075static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7896 struct file *file, 9076 s64 val)
7897 const char __user *userbuf,
7898 size_t nbytes, loff_t *unused_ppos)
7899{ 9077{
7900 char buffer[64]; 9078 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
7901 int retval = 0; 9079}
7902 s64 val;
7903 char *end;
7904
7905 if (!nbytes)
7906 return -EINVAL;
7907 if (nbytes >= sizeof(buffer))
7908 return -E2BIG;
7909 if (copy_from_user(buffer, userbuf, nbytes))
7910 return -EFAULT;
7911
7912 buffer[nbytes] = 0; /* nul-terminate */
7913
7914 /* strip newline if necessary */
7915 if (nbytes && (buffer[nbytes-1] == '\n'))
7916 buffer[nbytes-1] = 0;
7917 val = simple_strtoll(buffer, &end, 0);
7918 if (*end)
7919 return -EINVAL;
7920 9080
7921 /* Pass to subsystem */ 9081static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
7922 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 9082{
7923 if (!retval) 9083 return sched_group_rt_runtime(cgroup_tg(cgrp));
7924 retval = nbytes;
7925 return retval;
7926} 9084}
7927 9085
7928static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, 9086static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7929 struct file *file, 9087 u64 rt_period_us)
7930 char __user *buf, size_t nbytes,
7931 loff_t *ppos)
7932{ 9088{
7933 char tmp[64]; 9089 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
7934 long val = sched_group_rt_runtime(cgroup_tg(cgrp)); 9090}
7935 int len = sprintf(tmp, "%ld\n", val);
7936 9091
7937 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 9092static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9093{
9094 return sched_group_rt_period(cgroup_tg(cgrp));
7938} 9095}
7939#endif 9096#endif
7940 9097
@@ -7942,15 +9099,20 @@ static struct cftype cpu_files[] = {
7942#ifdef CONFIG_FAIR_GROUP_SCHED 9099#ifdef CONFIG_FAIR_GROUP_SCHED
7943 { 9100 {
7944 .name = "shares", 9101 .name = "shares",
7945 .read_uint = cpu_shares_read_uint, 9102 .read_u64 = cpu_shares_read_u64,
7946 .write_uint = cpu_shares_write_uint, 9103 .write_u64 = cpu_shares_write_u64,
7947 }, 9104 },
7948#endif 9105#endif
7949#ifdef CONFIG_RT_GROUP_SCHED 9106#ifdef CONFIG_RT_GROUP_SCHED
7950 { 9107 {
7951 .name = "rt_runtime_us", 9108 .name = "rt_runtime_us",
7952 .read = cpu_rt_runtime_read, 9109 .read_s64 = cpu_rt_runtime_read,
7953 .write = cpu_rt_runtime_write, 9110 .write_s64 = cpu_rt_runtime_write,
9111 },
9112 {
9113 .name = "rt_period_us",
9114 .read_u64 = cpu_rt_period_read_uint,
9115 .write_u64 = cpu_rt_period_write_uint,
7954 }, 9116 },
7955#endif 9117#endif
7956}; 9118};
@@ -7992,9 +9154,9 @@ struct cpuacct {
7992struct cgroup_subsys cpuacct_subsys; 9154struct cgroup_subsys cpuacct_subsys;
7993 9155
7994/* return cpu accounting group corresponding to this container */ 9156/* return cpu accounting group corresponding to this container */
7995static inline struct cpuacct *cgroup_ca(struct cgroup *cont) 9157static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
7996{ 9158{
7997 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), 9159 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
7998 struct cpuacct, css); 9160 struct cpuacct, css);
7999} 9161}
8000 9162
@@ -8007,7 +9169,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
8007 9169
8008/* create a new cpu accounting group */ 9170/* create a new cpu accounting group */
8009static struct cgroup_subsys_state *cpuacct_create( 9171static struct cgroup_subsys_state *cpuacct_create(
8010 struct cgroup_subsys *ss, struct cgroup *cont) 9172 struct cgroup_subsys *ss, struct cgroup *cgrp)
8011{ 9173{
8012 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 9174 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8013 9175
@@ -8025,18 +9187,18 @@ static struct cgroup_subsys_state *cpuacct_create(
8025 9187
8026/* destroy an existing cpu accounting group */ 9188/* destroy an existing cpu accounting group */
8027static void 9189static void
8028cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 9190cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8029{ 9191{
8030 struct cpuacct *ca = cgroup_ca(cont); 9192 struct cpuacct *ca = cgroup_ca(cgrp);
8031 9193
8032 free_percpu(ca->cpuusage); 9194 free_percpu(ca->cpuusage);
8033 kfree(ca); 9195 kfree(ca);
8034} 9196}
8035 9197
8036/* return total cpu usage (in nanoseconds) of a group */ 9198/* return total cpu usage (in nanoseconds) of a group */
8037static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) 9199static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8038{ 9200{
8039 struct cpuacct *ca = cgroup_ca(cont); 9201 struct cpuacct *ca = cgroup_ca(cgrp);
8040 u64 totalcpuusage = 0; 9202 u64 totalcpuusage = 0;
8041 int i; 9203 int i;
8042 9204
@@ -8055,16 +9217,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
8055 return totalcpuusage; 9217 return totalcpuusage;
8056} 9218}
8057 9219
9220static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9221 u64 reset)
9222{
9223 struct cpuacct *ca = cgroup_ca(cgrp);
9224 int err = 0;
9225 int i;
9226
9227 if (reset) {
9228 err = -EINVAL;
9229 goto out;
9230 }
9231
9232 for_each_possible_cpu(i) {
9233 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9234
9235 spin_lock_irq(&cpu_rq(i)->lock);
9236 *cpuusage = 0;
9237 spin_unlock_irq(&cpu_rq(i)->lock);
9238 }
9239out:
9240 return err;
9241}
9242
8058static struct cftype files[] = { 9243static struct cftype files[] = {
8059 { 9244 {
8060 .name = "usage", 9245 .name = "usage",
8061 .read_uint = cpuusage_read, 9246 .read_u64 = cpuusage_read,
9247 .write_u64 = cpuusage_write,
8062 }, 9248 },
8063}; 9249};
8064 9250
8065static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) 9251static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8066{ 9252{
8067 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 9253 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8068} 9254}
8069 9255
8070/* 9256/*
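
The cpuacct changes above give the usage counter both directions: reading sums the per-cpu counters, and a write is only accepted as a reset, i.e. the value written must be 0 or the write fails with -EINVAL (from a mounted hierarchy this corresponds to writing 0 to the cpuacct.usage file). A user-space sketch of those semantics, with a small array standing in for the percpu allocation:

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define NR_CPUS_DEMO 4

static uint64_t cpuusage[NR_CPUS_DEMO] = { 100, 250, 0, 50 };

/* Read side: total usage is the sum of the per-cpu counters. */
static uint64_t usage_read(void)
{
        uint64_t total = 0;
        int i;

        for (i = 0; i < NR_CPUS_DEMO; i++)
                total += cpuusage[i];
        return total;
}

/* Write side: only a reset (value 0) is allowed. */
static int usage_write(uint64_t reset)
{
        int i;

        if (reset)
                return -EINVAL;
        for (i = 0; i < NR_CPUS_DEMO; i++)
                cpuusage[i] = 0;
        return 0;
}

int main(void)
{
        printf("usage before reset: %llu\n", (unsigned long long)usage_read());
        usage_write(0);
        printf("usage after reset:  %llu\n", (unsigned long long)usage_read());
        return 0;
}
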
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ef358ba07683..6b4a12558e88 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
67 (long long)(p->nvcsw + p->nivcsw), 67 (long long)(p->nvcsw + p->nivcsw),
68 p->prio); 68 p->prio);
69#ifdef CONFIG_SCHEDSTATS 69#ifdef CONFIG_SCHEDSTATS
70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", 70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
71 SPLIT_NS(p->se.vruntime), 71 SPLIT_NS(p->se.vruntime),
72 SPLIT_NS(p->se.sum_exec_runtime), 72 SPLIT_NS(p->se.sum_exec_runtime),
73 SPLIT_NS(p->se.sum_sleep_runtime)); 73 SPLIT_NS(p->se.sum_sleep_runtime));
74#else 74#else
75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", 75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
77#endif 77#endif
78
79#ifdef CONFIG_CGROUP_SCHED
80 {
81 char path[64];
82
83 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
84 SEQ_printf(m, " %s", path);
85 }
86#endif
87 SEQ_printf(m, "\n");
78} 88}
79 89
80static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 90static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
@@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
109 struct sched_entity *last; 119 struct sched_entity *last;
110 unsigned long flags; 120 unsigned long flags;
111 121
112 SEQ_printf(m, "\ncfs_rq\n"); 122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = "";
126 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg;
128
129 if (tg)
130 cgroup = tg->css.cgroup;
131
132 if (cgroup)
133 cgroup_path(cgroup, path, sizeof(path));
134
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
136#endif
113 137
114 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
115 SPLIT_NS(cfs_rq->exec_clock)); 139 SPLIT_NS(cfs_rq->exec_clock));
@@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
143#endif 167#endif
144 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", 168 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
145 cfs_rq->nr_spread_over); 169 cfs_rq->nr_spread_over);
170#ifdef CONFIG_FAIR_GROUP_SCHED
171#ifdef CONFIG_SMP
172 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
173#endif
174#endif
146} 175}
147 176
148static void print_cpu(struct seq_file *m, int cpu) 177static void print_cpu(struct seq_file *m, int cpu)
@@ -214,7 +243,6 @@ static int sched_debug_show(struct seq_file *m, void *v)
214 PN(sysctl_sched_latency); 243 PN(sysctl_sched_latency);
215 PN(sysctl_sched_min_granularity); 244 PN(sysctl_sched_min_granularity);
216 PN(sysctl_sched_wakeup_granularity); 245 PN(sysctl_sched_wakeup_granularity);
217 PN(sysctl_sched_batch_wakeup_granularity);
218 PN(sysctl_sched_child_runs_first); 246 PN(sysctl_sched_child_runs_first);
219 P(sysctl_sched_features); 247 P(sysctl_sched_features);
220#undef PN 248#undef PN
@@ -249,12 +277,9 @@ static int __init init_sched_debug_procfs(void)
249{ 277{
250 struct proc_dir_entry *pe; 278 struct proc_dir_entry *pe;
251 279
252 pe = create_proc_entry("sched_debug", 0644, NULL); 280 pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
253 if (!pe) 281 if (!pe)
254 return -ENOMEM; 282 return -ENOMEM;
255
256 pe->proc_fops = &sched_debug_fops;
257
258 return 0; 283 return 0;
259} 284}
260 285
@@ -332,8 +357,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
332 357
333 avg_per_cpu = p->se.sum_exec_runtime; 358 avg_per_cpu = p->se.sum_exec_runtime;
334 if (p->se.nr_migrations) { 359 if (p->se.nr_migrations) {
335 avg_per_cpu = div64_64(avg_per_cpu, 360 avg_per_cpu = div64_u64(avg_per_cpu,
336 p->se.nr_migrations); 361 p->se.nr_migrations);
337 } else { 362 } else {
338 avg_per_cpu = -1LL; 363 avg_per_cpu = -1LL;
339 } 364 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 86a93376282c..89fa32b4edf2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1;
62unsigned int __read_mostly sysctl_sched_compat_yield; 62unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_BATCH wake-up granularity.
66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 *
68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies.
71 */
72unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
73
74/*
75 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
76 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
77 * 67 *
78 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
79 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
80 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
81 */ 71 */
82unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
83 73
84const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
85 75
@@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
87 * CFS operations on generic schedulable entities: 77 * CFS operations on generic schedulable entities:
88 */ 78 */
89 79
80static inline struct task_struct *task_of(struct sched_entity *se)
81{
82 return container_of(se, struct task_struct, se);
83}
84
90#ifdef CONFIG_FAIR_GROUP_SCHED 85#ifdef CONFIG_FAIR_GROUP_SCHED
91 86
92/* cpu runqueue to which this cfs_rq is attached */ 87/* cpu runqueue to which this cfs_rq is attached */
@@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
98/* An entity is a task if it doesn't "own" a runqueue */ 93/* An entity is a task if it doesn't "own" a runqueue */
99#define entity_is_task(se) (!se->my_q) 94#define entity_is_task(se) (!se->my_q)
100 95
96/* Walk up scheduling entities hierarchy */
97#define for_each_sched_entity(se) \
98 for (; se; se = se->parent)
99
100static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
101{
102 return p->se.cfs_rq;
103}
104
105/* runqueue on which this entity is (to be) queued */
106static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
107{
108 return se->cfs_rq;
109}
110
111/* runqueue "owned" by this group */
112static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
113{
114 return grp->my_q;
115}
116
117/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
118 * another cpu ('this_cpu')
119 */
120static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
121{
122 return cfs_rq->tg->cfs_rq[this_cpu];
123}
124
125/* Iterate through all leaf cfs_rq's on a runqueue */
126#define for_each_leaf_cfs_rq(rq, cfs_rq) \
127 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
128
129/* Do the two (enqueued) entities belong to the same group ? */
130static inline int
131is_same_group(struct sched_entity *se, struct sched_entity *pse)
132{
133 if (se->cfs_rq == pse->cfs_rq)
134 return 1;
135
136 return 0;
137}
138
139static inline struct sched_entity *parent_entity(struct sched_entity *se)
140{
141 return se->parent;
142}
143
101#else /* CONFIG_FAIR_GROUP_SCHED */ 144#else /* CONFIG_FAIR_GROUP_SCHED */
102 145
103static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 146static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
107 150
108#define entity_is_task(se) 1 151#define entity_is_task(se) 1
109 152
110#endif /* CONFIG_FAIR_GROUP_SCHED */ 153#define for_each_sched_entity(se) \
154 for (; se; se = NULL)
111 155
112static inline struct task_struct *task_of(struct sched_entity *se) 156static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
113{ 157{
114 return container_of(se, struct task_struct, se); 158 return &task_rq(p)->cfs;
159}
160
161static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
162{
163 struct task_struct *p = task_of(se);
164 struct rq *rq = task_rq(p);
165
166 return &rq->cfs;
115} 167}
116 168
169/* runqueue "owned" by this group */
170static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
171{
172 return NULL;
173}
174
175static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
176{
177 return &cpu_rq(this_cpu)->cfs;
178}
179
180#define for_each_leaf_cfs_rq(rq, cfs_rq) \
181 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
182
183static inline int
184is_same_group(struct sched_entity *se, struct sched_entity *pse)
185{
186 return 1;
187}
188
189static inline struct sched_entity *parent_entity(struct sched_entity *se)
190{
191 return NULL;
192}
193
194#endif /* CONFIG_FAIR_GROUP_SCHED */
195
117 196
118/************************************************************** 197/**************************************************************
119 * Scheduling class tree data structure manipulation methods: 198 * Scheduling class tree data structure manipulation methods:
@@ -255,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
255#endif 334#endif
256 335
257/* 336/*
337 * delta *= w / rw
338 */
339static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se)
341{
342 for_each_sched_entity(se) {
343 delta = calc_delta_mine(delta,
344 se->load.weight, &cfs_rq_of(se)->load);
345 }
346
347 return delta;
348}
349
350/*
351 * delta *= rw / w
352 */
353static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{
356 for_each_sched_entity(se) {
357 delta = calc_delta_mine(delta,
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360
361 return delta;
362}
363
364/*
258 * The idea is to set a period in which each task runs once. 365 * The idea is to set a period in which each task runs once.
259 * 366 *
260 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 367 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
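
calc_delta_weight() and calc_delta_fair() above compose their w/rw (respectively rw/w) ratios over every level of the group hierarchy, so a task's effective slice of a period is the product of its share at each level. A floating-point user-space sketch of that composition (the kernel does this in fixed point via calc_delta_mine(); all weights below are illustrative):

#include <stdio.h>

/* One task of weight 1024 inside a group of weight 2048 that competes
 * with 6144 worth of other load on the top-level runqueue.
 */
int main(void)
{
        double period = 20000000.0;             /* 20 ms in ns */

        /* level 0: task inside its group's cfs_rq */
        double w_task = 1024.0, rw_group = 1024.0 + 1024.0;
        /* level 1: group entity on the top-level cfs_rq */
        double w_group = 2048.0, rw_root = 2048.0 + 6144.0;

        double slice = period * (w_task / rw_group) * (w_group / rw_root);

        printf("slice = %.0f ns\n", slice);     /* 20ms * 1/2 * 1/4 = 2.5 ms */
        return 0;
}
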
@@ -283,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running)
283 */ 390 */
284static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
285{ 392{
286 return calc_delta_mine(__sched_period(cfs_rq->nr_running), 393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
287 se->load.weight, &cfs_rq->load);
288} 394}
289 395
290/* 396/*
291 * We calculate the vruntime slice. 397 * We calculate the vruntime slice of a task to be inserted
292 * 398 *
293 * vs = s/w = p/rw 399 * vs = s*rw/w = p
294 */ 400 */
295static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) 401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
296{ 402{
297 u64 vslice = __sched_period(nr_running); 403 unsigned long nr_running = cfs_rq->nr_running;
298 404
299 vslice *= NICE_0_LOAD; 405 if (!se->on_rq)
300 do_div(vslice, rq_weight); 406 nr_running++;
301 407
302 return vslice; 408 return __sched_period(nr_running);
303} 409}
304 410
305static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
306{ 424{
307 return __sched_vslice(cfs_rq->load.weight + se->load.weight, 425 struct load_weight lw = {
308 cfs_rq->nr_running + 1); 426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432
433 if (se->load.weight < NICE_0_LOAD)
434 se_lw = &lw;
435
436 delta = calc_delta_mine(delta,
437 cfs_rq_of(se)->load.weight, se_lw);
438 }
439
440 return delta;
309} 441}
310 442
311/* 443/*
@@ -322,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
322 454
323 curr->sum_exec_runtime += delta_exec; 455 curr->sum_exec_runtime += delta_exec;
324 schedstat_add(cfs_rq, exec_clock, delta_exec); 456 schedstat_add(cfs_rq, exec_clock, delta_exec);
325 delta_exec_weighted = delta_exec; 457 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
326 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
327 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
328 &curr->load);
329 }
330 curr->vruntime += delta_exec_weighted; 458 curr->vruntime += delta_exec_weighted;
331} 459}
332 460
@@ -413,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 * Scheduling class queueing methods: 541 * Scheduling class queueing methods:
414 */ 542 */
415 543
544#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
545static void
546add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
547{
548 cfs_rq->task_weight += weight;
549}
550#else
551static inline void
552add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
553{
554}
555#endif
556
416static void 557static void
417account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 558account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
418{ 559{
419 update_load_add(&cfs_rq->load, se->load.weight); 560 update_load_add(&cfs_rq->load, se->load.weight);
561 if (!parent_entity(se))
562 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
563 if (entity_is_task(se))
564 add_cfs_task_weight(cfs_rq, se->load.weight);
420 cfs_rq->nr_running++; 565 cfs_rq->nr_running++;
421 se->on_rq = 1; 566 se->on_rq = 1;
567 list_add(&se->group_node, &cfs_rq->tasks);
422} 568}
423 569
424static void 570static void
425account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 571account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
426{ 572{
427 update_load_sub(&cfs_rq->load, se->load.weight); 573 update_load_sub(&cfs_rq->load, se->load.weight);
574 if (!parent_entity(se))
575 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
576 if (entity_is_task(se))
577 add_cfs_task_weight(cfs_rq, -se->load.weight);
428 cfs_rq->nr_running--; 578 cfs_rq->nr_running--;
429 se->on_rq = 0; 579 se->on_rq = 0;
580 list_del_init(&se->group_node);
430} 581}
431 582
432static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 583static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -511,8 +662,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
511 if (!initial) { 662 if (!initial) {
512 /* sleeps upto a single latency don't count. */ 663 /* sleeps upto a single latency don't count. */
513 if (sched_feat(NEW_FAIR_SLEEPERS)) { 664 if (sched_feat(NEW_FAIR_SLEEPERS)) {
514 vruntime -= calc_delta_fair(sysctl_sched_latency, 665 if (sched_feat(NORMALIZED_SLEEPER))
515 &cfs_rq->load); 666 vruntime -= calc_delta_weight(sysctl_sched_latency, se);
667 else
668 vruntime -= sysctl_sched_latency;
516 } 669 }
517 670
518 /* ensure we never gain time by being placed backwards. */ 671 /* ensure we never gain time by being placed backwards. */
@@ -629,20 +782,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
629 se->prev_sum_exec_runtime = se->sum_exec_runtime; 782 se->prev_sum_exec_runtime = se->sum_exec_runtime;
630} 783}
631 784
785static int
786wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
787
632static struct sched_entity * 788static struct sched_entity *
633pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 789pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
634{ 790{
635 s64 diff, gran;
636
637 if (!cfs_rq->next) 791 if (!cfs_rq->next)
638 return se; 792 return se;
639 793
640 diff = cfs_rq->next->vruntime - se->vruntime; 794 if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
641 if (diff < 0)
642 return se;
643
644 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
645 if (diff > gran)
646 return se; 795 return se;
647 796
648 return cfs_rq->next; 797 return cfs_rq->next;
@@ -710,101 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
710 * CFS operations on tasks: 859 * CFS operations on tasks:
711 */ 860 */
712 861
713#ifdef CONFIG_FAIR_GROUP_SCHED
714
715/* Walk up scheduling entities hierarchy */
716#define for_each_sched_entity(se) \
717 for (; se; se = se->parent)
718
719static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
720{
721 return p->se.cfs_rq;
722}
723
724/* runqueue on which this entity is (to be) queued */
725static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
726{
727 return se->cfs_rq;
728}
729
730/* runqueue "owned" by this group */
731static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
732{
733 return grp->my_q;
734}
735
736/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
737 * another cpu ('this_cpu')
738 */
739static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
740{
741 return cfs_rq->tg->cfs_rq[this_cpu];
742}
743
744/* Iterate thr' all leaf cfs_rq's on a runqueue */
745#define for_each_leaf_cfs_rq(rq, cfs_rq) \
746 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
747
748/* Do the two (enqueued) entities belong to the same group ? */
749static inline int
750is_same_group(struct sched_entity *se, struct sched_entity *pse)
751{
752 if (se->cfs_rq == pse->cfs_rq)
753 return 1;
754
755 return 0;
756}
757
758static inline struct sched_entity *parent_entity(struct sched_entity *se)
759{
760 return se->parent;
761}
762
763#else /* CONFIG_FAIR_GROUP_SCHED */
764
765#define for_each_sched_entity(se) \
766 for (; se; se = NULL)
767
768static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
769{
770 return &task_rq(p)->cfs;
771}
772
773static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
774{
775 struct task_struct *p = task_of(se);
776 struct rq *rq = task_rq(p);
777
778 return &rq->cfs;
779}
780
781/* runqueue "owned" by this group */
782static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
783{
784 return NULL;
785}
786
787static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
788{
789 return &cpu_rq(this_cpu)->cfs;
790}
791
792#define for_each_leaf_cfs_rq(rq, cfs_rq) \
793 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
794
795static inline int
796is_same_group(struct sched_entity *se, struct sched_entity *pse)
797{
798 return 1;
799}
800
801static inline struct sched_entity *parent_entity(struct sched_entity *se)
802{
803 return NULL;
804}
805
806#endif /* CONFIG_FAIR_GROUP_SCHED */
807
808#ifdef CONFIG_SCHED_HRTICK 862#ifdef CONFIG_SCHED_HRTICK
809static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 863static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
810{ 864{
@@ -918,7 +972,7 @@ static void yield_task_fair(struct rq *rq)
918 /* 972 /*
919 * Already in the rightmost position? 973 * Already in the rightmost position?
920 */ 974 */
921 if (unlikely(rightmost->vruntime < se->vruntime)) 975 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
922 return; 976 return;
923 977
924 /* 978 /*
@@ -957,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p)
957 return cpu; 1011 return cpu;
958 1012
959 for_each_domain(cpu, sd) { 1013 for_each_domain(cpu, sd) {
960 if (sd->flags & SD_WAKE_IDLE) { 1014 if ((sd->flags & SD_WAKE_IDLE)
1015 || ((sd->flags & SD_WAKE_IDLE_FAR)
1016 && !task_hot(p, task_rq(p)->clock, sd))) {
961 cpus_and(tmp, sd->span, p->cpus_allowed); 1017 cpus_and(tmp, sd->span, p->cpus_allowed);
962 for_each_cpu_mask(i, tmp) { 1018 for_each_cpu_mask(i, tmp) {
963 if (idle_cpu(i)) { 1019 if (idle_cpu(i)) {
@@ -1101,6 +1157,58 @@ out:
1101} 1157}
1102#endif /* CONFIG_SMP */ 1158#endif /* CONFIG_SMP */
1103 1159
1160static unsigned long wakeup_gran(struct sched_entity *se)
1161{
1162 unsigned long gran = sysctl_sched_wakeup_granularity;
1163
1164 /*
1165 * More easily preempt - nice tasks, while not making it harder for
1166 * + nice tasks.
1167 */
1168 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
1169
1170 return gran;
1171}
1172
1173/*
1174 * Should 'se' preempt 'curr'.
1175 *
1176 * |s1
1177 * |s2
1178 * |s3
1179 * g
1180 * |<--->|c
1181 *
1182 * w(c, s1) = -1
1183 * w(c, s2) = 0
1184 * w(c, s3) = 1
1185 *
1186 */
1187static int
1188wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1189{
1190 s64 gran, vdiff = curr->vruntime - se->vruntime;
1191
1192 if (vdiff < 0)
1193 return -1;
1194
1195 gran = wakeup_gran(curr);
1196 if (vdiff > gran)
1197 return 1;
1198
1199 return 0;
1200}
1201
1202/* return depth at which a sched entity is present in the hierarchy */
1203static inline int depth_se(struct sched_entity *se)
1204{
1205 int depth = 0;
1206
1207 for_each_sched_entity(se)
1208 depth++;
1209
1210 return depth;
1211}
1104 1212
1105/* 1213/*
1106 * Preempt the current task with a newly woken task if needed: 1214 * Preempt the current task with a newly woken task if needed:
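
wakeup_preempt_entity() above classifies the waking entity against the running one into the three bands drawn in the comment: -1 when it is already ahead in vruntime, 0 when it trails by less than one wakeup granularity, and 1 when it trails by more and should preempt. A user-space sketch of that classification (the granularity is a fixed sample value here, whereas the kernel scales it per entity via wakeup_gran()):

#include <stdio.h>

typedef long long s64;

static int wakeup_preempt(s64 curr_vruntime, s64 se_vruntime, s64 gran)
{
        s64 vdiff = curr_vruntime - se_vruntime;

        if (vdiff < 0)          /* waker is to the right of curr: s1 */
                return -1;
        if (vdiff > gran)       /* waker lags by more than gran: s3 */
                return 1;
        return 0;               /* within one granularity: s2 */
}

int main(void)
{
        s64 gran = 10000000;    /* 10 ms worth of vruntime */

        printf("s1: %d\n", wakeup_preempt(100, 200, gran));            /* -1 */
        printf("s2: %d\n", wakeup_preempt(100, 100 -  5000000, gran)); /*  0 */
        printf("s3: %d\n", wakeup_preempt(100, 100 - 20000000, gran)); /*  1 */
        return 0;
}
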
@@ -1110,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1110 struct task_struct *curr = rq->curr; 1218 struct task_struct *curr = rq->curr;
1111 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1219 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1112 struct sched_entity *se = &curr->se, *pse = &p->se; 1220 struct sched_entity *se = &curr->se, *pse = &p->se;
1113 unsigned long gran; 1221 int se_depth, pse_depth;
1114 1222
1115 if (unlikely(rt_prio(p->prio))) { 1223 if (unlikely(rt_prio(p->prio))) {
1116 update_rq_clock(rq); 1224 update_rq_clock(rq);
@@ -1135,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1135 if (!sched_feat(WAKEUP_PREEMPT)) 1243 if (!sched_feat(WAKEUP_PREEMPT))
1136 return; 1244 return;
1137 1245
1138 while (!is_same_group(se, pse)) { 1246 /*
 1247 * A preemption test can only be made between sibling entities that are
 1248 * on the same cfs_rq, i.e. that have a common parent. Walk up the
 1249 * hierarchy of both tasks until we find ancestors that are siblings
 1250 * under a common parent.
1251 */
1252
1253 /* First walk up until both entities are at same depth */
1254 se_depth = depth_se(se);
1255 pse_depth = depth_se(pse);
1256
1257 while (se_depth > pse_depth) {
1258 se_depth--;
1139 se = parent_entity(se); 1259 se = parent_entity(se);
1260 }
1261
1262 while (pse_depth > se_depth) {
1263 pse_depth--;
1140 pse = parent_entity(pse); 1264 pse = parent_entity(pse);
1141 } 1265 }
1142 1266
1143 gran = sysctl_sched_wakeup_granularity; 1267 while (!is_same_group(se, pse)) {
1144 /* 1268 se = parent_entity(se);
1145 * More easily preempt - nice tasks, while not making 1269 pse = parent_entity(pse);
1146 * it harder for + nice tasks. 1270 }
1147 */
1148 if (unlikely(se->load.weight > NICE_0_LOAD))
1149 gran = calc_delta_fair(gran, &se->load);
1150 1271
1151 if (pse->vruntime + gran < se->vruntime) 1272 if (wakeup_preempt_entity(se, pse) == 1)
1152 resched_task(curr); 1273 resched_task(curr);
1153} 1274}
1154 1275
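
The depth-equalizing walk added above handles entities that sit at different levels of the group hierarchy: the deeper one is first walked up to the other's depth, then both are walked in lock-step until they share a cfs_rq and can be compared. A user-space sketch of the same walk, with a minimal stand-in struct for sched_entity (field names here are not the kernel's):

#include <stdio.h>
#include <stddef.h>

struct entity {
        struct entity *parent;
        int cfs_rq;                     /* id of the runqueue this entity is on */
};

static int depth(struct entity *e)
{
        int d = 0;

        for (; e; e = e->parent)
                d++;
        return d;
}

/* Walk *se and *pse up until they are siblings on the same runqueue. */
static void find_comparable(struct entity **se, struct entity **pse)
{
        int sd = depth(*se), pd = depth(*pse);

        while (sd > pd) { *se = (*se)->parent; sd--; }
        while (pd > sd) { *pse = (*pse)->parent; pd--; }
        while ((*se)->cfs_rq != (*pse)->cfs_rq) {
                *se = (*se)->parent;
                *pse = (*pse)->parent;
        }
}

int main(void)
{
        /* root rq id 0; group A (rq 1) holds task t1; task t2 is on the root */
        struct entity grpA = { NULL, 0 };
        struct entity t1 = { &grpA, 1 };
        struct entity t2 = { NULL, 0 };
        struct entity *se = &t1, *pse = &t2;

        find_comparable(&se, &pse);
        printf("compare on cfs_rq %d (se is %s)\n",
               se->cfs_rq, se == &grpA ? "group A's entity" : "t1");
        return 0;
}
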
@@ -1199,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1199 * the current task: 1320 * the current task:
1200 */ 1321 */
1201static struct task_struct * 1322static struct task_struct *
1202__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 1323__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1203{ 1324{
1204 struct task_struct *p; 1325 struct task_struct *p = NULL;
1326 struct sched_entity *se;
1327
1328 if (next == &cfs_rq->tasks)
1329 return NULL;
1330
1331 /* Skip over entities that are not tasks */
1332 do {
1333 se = list_entry(next, struct sched_entity, group_node);
1334 next = next->next;
1335 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1205 1336
1206 if (!curr) 1337 if (next == &cfs_rq->tasks)
1207 return NULL; 1338 return NULL;
1208 1339
1209 p = rb_entry(curr, struct task_struct, se.run_node); 1340 cfs_rq->balance_iterator = next;
1210 cfs_rq->rb_load_balance_curr = rb_next(curr); 1341
1342 if (entity_is_task(se))
1343 p = task_of(se);
1211 1344
1212 return p; 1345 return p;
1213} 1346}
@@ -1216,85 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg)
1216{ 1349{
1217 struct cfs_rq *cfs_rq = arg; 1350 struct cfs_rq *cfs_rq = arg;
1218 1351
1219 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); 1352 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
1220} 1353}
1221 1354
1222static struct task_struct *load_balance_next_fair(void *arg) 1355static struct task_struct *load_balance_next_fair(void *arg)
1223{ 1356{
1224 struct cfs_rq *cfs_rq = arg; 1357 struct cfs_rq *cfs_rq = arg;
1225 1358
1226 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1359 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1227} 1360}
1228 1361
1229#ifdef CONFIG_FAIR_GROUP_SCHED 1362static unsigned long
1230static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 1363__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1364 unsigned long max_load_move, struct sched_domain *sd,
1365 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
1366 struct cfs_rq *cfs_rq)
1231{ 1367{
1232 struct sched_entity *curr; 1368 struct rq_iterator cfs_rq_iterator;
1233 struct task_struct *p;
1234
1235 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1236 return MAX_PRIO;
1237
1238 curr = cfs_rq->curr;
1239 if (!curr)
1240 curr = __pick_next_entity(cfs_rq);
1241 1369
1242 p = task_of(curr); 1370 cfs_rq_iterator.start = load_balance_start_fair;
1371 cfs_rq_iterator.next = load_balance_next_fair;
1372 cfs_rq_iterator.arg = cfs_rq;
1243 1373
1244 return p->prio; 1374 return balance_tasks(this_rq, this_cpu, busiest,
1375 max_load_move, sd, idle, all_pinned,
1376 this_best_prio, &cfs_rq_iterator);
1245} 1377}
1246#endif
1247 1378
1379#ifdef CONFIG_FAIR_GROUP_SCHED
1248static unsigned long 1380static unsigned long
1249load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1381load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1250 unsigned long max_load_move, 1382 unsigned long max_load_move,
1251 struct sched_domain *sd, enum cpu_idle_type idle, 1383 struct sched_domain *sd, enum cpu_idle_type idle,
1252 int *all_pinned, int *this_best_prio) 1384 int *all_pinned, int *this_best_prio)
1253{ 1385{
1254 struct cfs_rq *busy_cfs_rq;
1255 long rem_load_move = max_load_move; 1386 long rem_load_move = max_load_move;
1256 struct rq_iterator cfs_rq_iterator; 1387 int busiest_cpu = cpu_of(busiest);
1257 1388 struct task_group *tg;
1258 cfs_rq_iterator.start = load_balance_start_fair;
1259 cfs_rq_iterator.next = load_balance_next_fair;
1260 1389
1261 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1390 rcu_read_lock();
1262#ifdef CONFIG_FAIR_GROUP_SCHED 1391 list_for_each_entry(tg, &task_groups, list) {
1263 struct cfs_rq *this_cfs_rq;
1264 long imbalance; 1392 long imbalance;
1265 unsigned long maxload; 1393 unsigned long this_weight, busiest_weight;
1394 long rem_load, max_load, moved_load;
1395
1396 /*
1397 * empty group
1398 */
1399 if (!aggregate(tg, sd)->task_weight)
1400 continue;
1401
1402 rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
1403 rem_load /= aggregate(tg, sd)->load + 1;
1404
1405 this_weight = tg->cfs_rq[this_cpu]->task_weight;
1406 busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
1407
1408 imbalance = (busiest_weight - this_weight) / 2;
1266 1409
1267 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1410 if (imbalance < 0)
1411 imbalance = busiest_weight;
1268 1412
1269 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1413 max_load = max(rem_load, imbalance);
1270 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 1414 moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1271 if (imbalance <= 0) 1415 max_load, sd, idle, all_pinned, this_best_prio,
1416 tg->cfs_rq[busiest_cpu]);
1417
1418 if (!moved_load)
1272 continue; 1419 continue;
1273 1420
1274 /* Don't pull more than imbalance/2 */ 1421 move_group_shares(tg, sd, busiest_cpu, this_cpu);
1275 imbalance /= 2;
1276 maxload = min(rem_load_move, imbalance);
1277 1422
1278 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1423 moved_load *= aggregate(tg, sd)->load;
1279#else 1424 moved_load /= aggregate(tg, sd)->rq_weight + 1;
1280# define maxload rem_load_move
1281#endif
1282 /*
1283 * pass busy_cfs_rq argument into
1284 * load_balance_[start|next]_fair iterators
1285 */
1286 cfs_rq_iterator.arg = busy_cfs_rq;
1287 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1288 maxload, sd, idle, all_pinned,
1289 this_best_prio,
1290 &cfs_rq_iterator);
1291 1425
1292 if (rem_load_move <= 0) 1426 rem_load_move -= moved_load;
1427 if (rem_load_move < 0)
1293 break; 1428 break;
1294 } 1429 }
1430 rcu_read_unlock();
1295 1431
1296 return max_load_move - rem_load_move; 1432 return max_load_move - rem_load_move;
1297} 1433}
1434#else
1435static unsigned long
1436load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1437 unsigned long max_load_move,
1438 struct sched_domain *sd, enum cpu_idle_type idle,
1439 int *all_pinned, int *this_best_prio)
1440{
1441 return __load_balance_fair(this_rq, this_cpu, busiest,
1442 max_load_move, sd, idle, all_pinned,
1443 this_best_prio, &busiest->cfs);
1444}
1445#endif
1298 1446
1299static int 1447static int
1300move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1448move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
@@ -1463,16 +1611,40 @@ static const struct sched_class fair_sched_class = {
1463}; 1611};
1464 1612
1465#ifdef CONFIG_SCHED_DEBUG 1613#ifdef CONFIG_SCHED_DEBUG
1614static void
1615print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
1616{
1617 struct sched_entity *se;
1618
1619 if (!cfs_rq)
1620 return;
1621
1622 list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
1623 int i;
1624
1625 for (i = depth; i; i--)
1626 seq_puts(m, " ");
1627
1628 seq_printf(m, "%lu %s %lu\n",
1629 se->load.weight,
1630 entity_is_task(se) ? "T" : "G",
1631 calc_delta_weight(SCHED_LOAD_SCALE, se)
1632 );
1633 if (!entity_is_task(se))
1634 print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
1635 }
1636}
1637
1466static void print_cfs_stats(struct seq_file *m, int cpu) 1638static void print_cfs_stats(struct seq_file *m, int cpu)
1467{ 1639{
1468 struct cfs_rq *cfs_rq; 1640 struct cfs_rq *cfs_rq;
1469 1641
1470#ifdef CONFIG_FAIR_GROUP_SCHED
1471 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1472#endif
1473 rcu_read_lock(); 1642 rcu_read_lock();
1474 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1643 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1475 print_cfs_rq(m, cpu, cfs_rq); 1644 print_cfs_rq(m, cpu, cfs_rq);
1645
1646 seq_printf(m, "\nWeight tree:\n");
1647 print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
1476 rcu_read_unlock(); 1648 rcu_read_unlock();
1477} 1649}
1478#endif 1650#endif
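
The load_balance_fair() rewrite above walks the global task_groups list instead of the per-rq leaf cfs_rqs, scaling the remaining load to move into each group's own weight units before calling __load_balance_fair() and scaling the moved amount back afterwards. A minimal standalone sketch of that arithmetic; all numbers are invented, and in the kernel they come from aggregate(tg, sd) and the per-CPU cfs_rq task weights:

#include <stdio.h>

int main(void)
{
	unsigned long rem_load_move = 2048;	/* load left to move, global units */
	unsigned long rq_weight = 3072;		/* aggregate(tg, sd)->rq_weight */
	unsigned long load = 4096;		/* aggregate(tg, sd)->load */
	long this_weight = 1024, busiest_weight = 2048;

	/* scale the global budget into this group's weight units */
	long rem_load = rem_load_move * rq_weight / (load + 1);

	long imbalance = (busiest_weight - this_weight) / 2;
	if (imbalance < 0)
		imbalance = busiest_weight;

	/* the kernel uses max(), so a large imbalance can exceed rem_load */
	long max_load = rem_load > imbalance ? rem_load : imbalance;

	/* pretend __load_balance_fair() moved half of what it was allowed to */
	long moved_load = max_load / 2;

	/* scale the group-local result back into global units */
	moved_load = moved_load * load / (rq_weight + 1);
	rem_load_move -= moved_load;

	printf("rem_load=%ld max_load=%ld moved=%ld remaining=%lu\n",
	       rem_load, max_load, moved_load, rem_load_move);
	return 0;
}

The +1 in both divisors mirrors the kernel code and simply guards against a zero aggregate weight.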
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
new file mode 100644
index 000000000000..1c7283cb9581
--- /dev/null
+++ b/kernel/sched_features.h
@@ -0,0 +1,10 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(WAKEUP_PREEMPT, 1)
3SCHED_FEAT(START_DEBIT, 1)
4SCHED_FEAT(AFFINE_WAKEUPS, 1)
5SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1)
10SCHED_FEAT(DEADLINE, 1)
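
sched_features.h is a new X-macro style header: sched.c (not shown in this hunk) is expected to #include it several times with different definitions of SCHED_FEAT() to build the feature-bit enum and the default feature mask. That consumer code is only assumed here; the following is a self-contained userspace sketch of the same pattern, with the feature list inlined so it compiles on its own:

#include <stdio.h>

/* the feature list, normally #include'd from sched_features.h */
#define SCHED_FEATURES(F)		\
	F(NEW_FAIR_SLEEPERS, 1)		\
	F(WAKEUP_PREEMPT, 1)		\
	F(HRTICK, 1)			\
	F(DOUBLE_TICK, 0)

/* expansion 1: one enum constant per feature */
#define F_ENUM(name, enabled)	__FEAT_##name,
enum { SCHED_FEATURES(F_ENUM) };

/* expansion 2: OR the default-enabled features into a bitmask */
#define F_MASK(name, enabled)	((enabled) << __FEAT_##name) |
static const unsigned int default_features = SCHED_FEATURES(F_MASK) 0;

int main(void)
{
	printf("HRTICK enabled: %d\n",
	       !!(default_features & (1u << __FEAT_HRTICK)));
	printf("DOUBLE_TICK enabled: %d\n",
	       !!(default_features & (1u << __FEAT_DOUBLE_TICK)));
	return 0;
}

Keeping the list in one header means adding a feature is a one-line change; every expansion site picks it up automatically.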
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0a6d2e516420..c2730a5a4f05 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
62 if (!rt_rq->tg) 62 if (!rt_rq->tg)
63 return RUNTIME_INF; 63 return RUNTIME_INF;
64 64
65 return rt_rq->tg->rt_runtime; 65 return rt_rq->rt_runtime;
66}
67
68static inline u64 sched_rt_period(struct rt_rq *rt_rq)
69{
70 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
66} 71}
67 72
68#define for_each_leaf_rt_rq(rt_rq, rq) \ 73#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
127 return p->prio != p->normal_prio; 132 return p->prio != p->normal_prio;
128} 133}
129 134
135#ifdef CONFIG_SMP
136static inline cpumask_t sched_rt_period_mask(void)
137{
138 return cpu_rq(smp_processor_id())->rd->span;
139}
140#else
141static inline cpumask_t sched_rt_period_mask(void)
142{
143 return cpu_online_map;
144}
145#endif
146
147static inline
148struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
149{
150 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
151}
152
153static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
154{
155 return &rt_rq->tg->rt_bandwidth;
156}
157
130#else 158#else
131 159
132static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 160static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
133{ 161{
134 if (sysctl_sched_rt_runtime == -1) 162 return rt_rq->rt_runtime;
135 return RUNTIME_INF; 163}
136 164
137 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 165static inline u64 sched_rt_period(struct rt_rq *rt_rq)
166{
167 return ktime_to_ns(def_rt_bandwidth.rt_period);
138} 168}
139 169
140#define for_each_leaf_rt_rq(rt_rq, rq) \ 170#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -173,6 +203,102 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
173{ 203{
174 return rt_rq->rt_throttled; 204 return rt_rq->rt_throttled;
175} 205}
206
207static inline cpumask_t sched_rt_period_mask(void)
208{
209 return cpu_online_map;
210}
211
212static inline
213struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
214{
215 return &cpu_rq(cpu)->rt;
216}
217
218static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
219{
220 return &def_rt_bandwidth;
221}
222
223#endif
224
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{
227 int i, idle = 1;
228 cpumask_t span;
229
230 if (rt_b->rt_runtime == RUNTIME_INF)
231 return 1;
232
233 span = sched_rt_period_mask();
234 for_each_cpu_mask(i, span) {
235 int enqueue = 0;
236 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
237 struct rq *rq = rq_of_rt_rq(rt_rq);
238
239 spin_lock(&rq->lock);
240 if (rt_rq->rt_time) {
241 u64 runtime;
242
243 spin_lock(&rt_rq->rt_runtime_lock);
244 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
247 rt_rq->rt_throttled = 0;
248 enqueue = 1;
249 }
250 if (rt_rq->rt_time || rt_rq->rt_nr_running)
251 idle = 0;
252 spin_unlock(&rt_rq->rt_runtime_lock);
253 }
254
255 if (enqueue)
256 sched_rt_rq_enqueue(rt_rq);
257 spin_unlock(&rq->lock);
258 }
259
260 return idle;
261}
262
263#ifdef CONFIG_SMP
264static int balance_runtime(struct rt_rq *rt_rq)
265{
266 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
267 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
268 int i, weight, more = 0;
269 u64 rt_period;
270
271 weight = cpus_weight(rd->span);
272
273 spin_lock(&rt_b->rt_runtime_lock);
274 rt_period = ktime_to_ns(rt_b->rt_period);
275 for_each_cpu_mask(i, rd->span) {
276 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
277 s64 diff;
278
279 if (iter == rt_rq)
280 continue;
281
282 spin_lock(&iter->rt_runtime_lock);
283 diff = iter->rt_runtime - iter->rt_time;
284 if (diff > 0) {
285 do_div(diff, weight);
286 if (rt_rq->rt_runtime + diff > rt_period)
287 diff = rt_period - rt_rq->rt_runtime;
288 iter->rt_runtime -= diff;
289 rt_rq->rt_runtime += diff;
290 more = 1;
291 if (rt_rq->rt_runtime == rt_period) {
292 spin_unlock(&iter->rt_runtime_lock);
293 break;
294 }
295 }
296 spin_unlock(&iter->rt_runtime_lock);
297 }
298 spin_unlock(&rt_b->rt_runtime_lock);
299
300 return more;
301}
176#endif 302#endif
177 303
178static inline int rt_se_prio(struct sched_rt_entity *rt_se) 304static inline int rt_se_prio(struct sched_rt_entity *rt_se)
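
do_sched_rt_period_timer() above replenishes each rt_rq's budget once per period, and balance_runtime() lets a throttled rt_rq borrow unused runtime from its peers in the same root domain, a per-CPU share at a time and never beyond the period. A standalone userspace model of the borrowing loop; the CPU count and numbers are invented:

#include <stdio.h>

#define NCPUS 4

int main(void)
{
	long long rt_period = 1000000;		/* ns per period */
	long long runtime[NCPUS] = { 950000, 950000, 950000, 950000 };
	long long rt_time[NCPUS] = { 950000, 100000, 200000, 0 };
	int target = 0;				/* the throttled rt_rq */

	for (int i = 0; i < NCPUS; i++) {
		if (i == target)
			continue;
		long long diff = runtime[i] - rt_time[i];	/* donor's spare budget */
		if (diff <= 0)
			continue;
		diff /= NCPUS;			/* weight of rd->span */
		if (runtime[target] + diff > rt_period)
			diff = rt_period - runtime[target];
		runtime[i] -= diff;
		runtime[target] += diff;
		if (runtime[target] == rt_period)
			break;
	}

	printf("cpu%d runtime after borrowing: %lld of %lld ns\n",
	       target, runtime[target], rt_period);
	return 0;
}

Dividing each donor's spare time by the span weight keeps a single needy CPU from draining any one peer outright.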
@@ -197,12 +323,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
197 if (rt_rq->rt_throttled) 323 if (rt_rq->rt_throttled)
198 return rt_rq_throttled(rt_rq); 324 return rt_rq_throttled(rt_rq);
199 325
326 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
327 return 0;
328
329#ifdef CONFIG_SMP
200 if (rt_rq->rt_time > runtime) { 330 if (rt_rq->rt_time > runtime) {
201 struct rq *rq = rq_of_rt_rq(rt_rq); 331 int more;
202 332
203 rq->rt_throttled = 1; 333 spin_unlock(&rt_rq->rt_runtime_lock);
204 rt_rq->rt_throttled = 1; 334 more = balance_runtime(rt_rq);
335 spin_lock(&rt_rq->rt_runtime_lock);
205 336
337 if (more)
338 runtime = sched_rt_runtime(rt_rq);
339 }
340#endif
341
342 if (rt_rq->rt_time > runtime) {
343 rt_rq->rt_throttled = 1;
206 if (rt_rq_throttled(rt_rq)) { 344 if (rt_rq_throttled(rt_rq)) {
207 sched_rt_rq_dequeue(rt_rq); 345 sched_rt_rq_dequeue(rt_rq);
208 return 1; 346 return 1;
@@ -212,29 +350,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
212 return 0; 350 return 0;
213} 351}
214 352
215static void update_sched_rt_period(struct rq *rq)
216{
217 struct rt_rq *rt_rq;
218 u64 period;
219
220 while (rq->clock > rq->rt_period_expire) {
221 period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
222 rq->rt_period_expire += period;
223
224 for_each_leaf_rt_rq(rt_rq, rq) {
225 u64 runtime = sched_rt_runtime(rt_rq);
226
227 rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
228 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
229 rt_rq->rt_throttled = 0;
230 sched_rt_rq_enqueue(rt_rq);
231 }
232 }
233
234 rq->rt_throttled = 0;
235 }
236}
237
238/* 353/*
239 * Update the current task's runtime statistics. Skip current tasks that 354 * Update the current task's runtime statistics. Skip current tasks that
240 * are not in our scheduling class. 355 * are not in our scheduling class.
@@ -259,9 +374,15 @@ static void update_curr_rt(struct rq *rq)
259 curr->se.exec_start = rq->clock; 374 curr->se.exec_start = rq->clock;
260 cpuacct_charge(curr, delta_exec); 375 cpuacct_charge(curr, delta_exec);
261 376
262 rt_rq->rt_time += delta_exec; 377 for_each_sched_rt_entity(rt_se) {
263 if (sched_rt_runtime_exceeded(rt_rq)) 378 rt_rq = rt_rq_of_se(rt_se);
264 resched_task(curr); 379
380 spin_lock(&rt_rq->rt_runtime_lock);
381 rt_rq->rt_time += delta_exec;
382 if (sched_rt_runtime_exceeded(rt_rq))
383 resched_task(curr);
384 spin_unlock(&rt_rq->rt_runtime_lock);
385 }
265} 386}
266 387
267static inline 388static inline
@@ -284,6 +405,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
284#ifdef CONFIG_RT_GROUP_SCHED 405#ifdef CONFIG_RT_GROUP_SCHED
285 if (rt_se_boosted(rt_se)) 406 if (rt_se_boosted(rt_se))
286 rt_rq->rt_nr_boosted++; 407 rt_rq->rt_nr_boosted++;
408
409 if (rt_rq->tg)
410 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
411#else
412 start_rt_bandwidth(&def_rt_bandwidth);
287#endif 413#endif
288} 414}
289 415
@@ -353,27 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
353/* 479/*
354 * Because the prio of an upper entry depends on the lower 480 * Because the prio of an upper entry depends on the lower
355 * entries, we must remove entries top - down. 481 * entries, we must remove entries top - down.
356 *
357 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
358 * doesn't matter much for now, as h=2 for GROUP_SCHED.
359 */ 482 */
360static void dequeue_rt_stack(struct task_struct *p) 483static void dequeue_rt_stack(struct task_struct *p)
361{ 484{
362 struct sched_rt_entity *rt_se, *top_se; 485 struct sched_rt_entity *rt_se, *back = NULL;
363 486
364 /* 487 rt_se = &p->rt;
365 * dequeue all, top - down. 488 for_each_sched_rt_entity(rt_se) {
366 */ 489 rt_se->back = back;
367 do { 490 back = rt_se;
368 rt_se = &p->rt; 491 }
369 top_se = NULL; 492
370 for_each_sched_rt_entity(rt_se) { 493 for (rt_se = back; rt_se; rt_se = rt_se->back) {
371 if (on_rt_rq(rt_se)) 494 if (on_rt_rq(rt_se))
372 top_se = rt_se; 495 dequeue_rt_entity(rt_se);
373 } 496 }
374 if (top_se)
375 dequeue_rt_entity(top_se);
376 } while (top_se);
377} 497}
378 498
379/* 499/*
@@ -393,6 +513,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
393 */ 513 */
394 for_each_sched_rt_entity(rt_se) 514 for_each_sched_rt_entity(rt_se)
395 enqueue_rt_entity(rt_se); 515 enqueue_rt_entity(rt_se);
516
517 inc_cpu_load(rq, p->se.load.weight);
396} 518}
397 519
398static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 520static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -412,6 +534,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
412 if (rt_rq && rt_rq->rt_nr_running) 534 if (rt_rq && rt_rq->rt_nr_running)
413 enqueue_rt_entity(rt_se); 535 enqueue_rt_entity(rt_se);
414 } 536 }
537
538 dec_cpu_load(rq, p->se.load.weight);
415} 539}
416 540
417/* 541/*
@@ -1001,7 +1125,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1001 return 0; 1125 return 0;
1002} 1126}
1003 1127
1004static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) 1128static void set_cpus_allowed_rt(struct task_struct *p,
1129 const cpumask_t *new_mask)
1005{ 1130{
1006 int weight = cpus_weight(*new_mask); 1131 int weight = cpus_weight(*new_mask);
1007 1132
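
The dequeue_rt_stack() rewrite a few hunks up replaces the repeated top-down rescans with a single bottom-up pass that records a ->back chain, followed by a second pass that replays the chain top-down, turning O(h^2) work into O(h). A standalone model of the two-pass walk; struct entity is an illustrative stand-in for sched_rt_entity:

#include <stdio.h>

struct entity {
	const char *name;
	struct entity *parent;	/* the only direction the hierarchy can be walked */
	struct entity *back;	/* filled in by the first pass */
};

int main(void)
{
	struct entity root  = { "root",  NULL,   NULL };
	struct entity group = { "group", &root,  NULL };
	struct entity task  = { "task",  &group, NULL };

	/* pass 1: bottom-up, remember the reverse order */
	struct entity *back = NULL;
	for (struct entity *e = &task; e; e = e->parent) {
		e->back = back;
		back = e;
	}

	/* pass 2: top-down, dequeue each level */
	for (struct entity *e = back; e; e = e->back)
		printf("dequeue %s\n", e->name);	/* root, group, task */

	return 0;
}

The same trick works for any hierarchy that can only be walked child to parent.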
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 5b32433e7ee5..5bae2e0c3ff2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -9,6 +9,11 @@
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
11 int cpu; 11 int cpu;
12 int mask_len = NR_CPUS/32 * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
12 17
13 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
14 seq_printf(seq, "timestamp %lu\n", jiffies); 19 seq_printf(seq, "timestamp %lu\n", jiffies);
@@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
36 preempt_disable(); 41 preempt_disable();
37 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
38 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
39 char mask_str[NR_CPUS];
40 44
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 45 cpumask_scnprintf(mask_str, mask_len, sd->span);
42 seq_printf(seq, "domain%d %s", dcount++, mask_str); 46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) { 48 itype++) {
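
The show_schedstat() change above replaces the char mask_str[NR_CPUS] stack array with a kmalloc'ed buffer of NR_CPUS/32 * 9 bytes: cpumask_scnprintf() emits the mask as comma-separated 32-bit words, roughly nine bytes per word (eight hex digits plus a comma or the trailing NUL), so with NR_CPUS=4096 the old on-stack array was a 4 KiB liability while the heap buffer stays near 1 KiB. A quick standalone check of the sizing:

#include <stdio.h>

int main(void)
{
	int sizes[] = { 32, 128, 1024, 4096 };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		int nr_cpus = sizes[i];
		/* 9 bytes per 32-bit chunk of the printed cpumask */
		printf("NR_CPUS=%-5d mask buffer = %4d bytes (old stack array: %d)\n",
		       nr_cpus, nr_cpus / 32 * 9, nr_cpus);
	}
	return 0;
}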
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
new file mode 100644
index 000000000000..5c2942e768cd
--- /dev/null
+++ b/kernel/semaphore.c
@@ -0,0 +1,264 @@
1/*
2 * Copyright (c) 2008 Intel Corporation
3 * Author: Matthew Wilcox <willy@linux.intel.com>
4 *
5 * Distributed under the terms of the GNU GPL, version 2
6 *
7 * This file implements counting semaphores.
8 * A counting semaphore may be acquired 'n' times before sleeping.
9 * See mutex.c for single-acquisition sleeping locks which enforce
10 * rules which allow code to be debugged more easily.
11 */
12
13/*
14 * Some notes on the implementation:
15 *
16 * The spinlock controls access to the other members of the semaphore.
17 * down_trylock() and up() can be called from interrupt context, so we
18 * have to disable interrupts when taking the lock. It turns out various
19 * parts of the kernel expect to be able to use down() on a semaphore in
20 * interrupt context when they know it will succeed, so we have to use
21 * irqsave variants for down(), down_interruptible() and down_killable()
22 * too.
23 *
24 * The ->count variable represents how many more tasks can acquire this
25 * semaphore. If it's zero, there may be tasks waiting on the wait_list.
26 */
27
28#include <linux/compiler.h>
29#include <linux/kernel.h>
30#include <linux/module.h>
31#include <linux/sched.h>
32#include <linux/semaphore.h>
33#include <linux/spinlock.h>
34
35static noinline void __down(struct semaphore *sem);
36static noinline int __down_interruptible(struct semaphore *sem);
37static noinline int __down_killable(struct semaphore *sem);
38static noinline int __down_timeout(struct semaphore *sem, long jiffies);
39static noinline void __up(struct semaphore *sem);
40
41/**
42 * down - acquire the semaphore
43 * @sem: the semaphore to be acquired
44 *
45 * Acquires the semaphore. If no more tasks are allowed to acquire the
46 * semaphore, calling this function will put the task to sleep until the
47 * semaphore is released.
48 *
49 * Use of this function is deprecated, please use down_interruptible() or
50 * down_killable() instead.
51 */
52void down(struct semaphore *sem)
53{
54 unsigned long flags;
55
56 spin_lock_irqsave(&sem->lock, flags);
57 if (likely(sem->count > 0))
58 sem->count--;
59 else
60 __down(sem);
61 spin_unlock_irqrestore(&sem->lock, flags);
62}
63EXPORT_SYMBOL(down);
64
65/**
66 * down_interruptible - acquire the semaphore unless interrupted
67 * @sem: the semaphore to be acquired
68 *
69 * Attempts to acquire the semaphore. If no more tasks are allowed to
70 * acquire the semaphore, calling this function will put the task to sleep.
71 * If the sleep is interrupted by a signal, this function will return -EINTR.
72 * If the semaphore is successfully acquired, this function returns 0.
73 */
74int down_interruptible(struct semaphore *sem)
75{
76 unsigned long flags;
77 int result = 0;
78
79 spin_lock_irqsave(&sem->lock, flags);
80 if (likely(sem->count > 0))
81 sem->count--;
82 else
83 result = __down_interruptible(sem);
84 spin_unlock_irqrestore(&sem->lock, flags);
85
86 return result;
87}
88EXPORT_SYMBOL(down_interruptible);
89
90/**
91 * down_killable - acquire the semaphore unless killed
92 * @sem: the semaphore to be acquired
93 *
94 * Attempts to acquire the semaphore. If no more tasks are allowed to
95 * acquire the semaphore, calling this function will put the task to sleep.
96 * If the sleep is interrupted by a fatal signal, this function will return
97 * -EINTR. If the semaphore is successfully acquired, this function returns
98 * 0.
99 */
100int down_killable(struct semaphore *sem)
101{
102 unsigned long flags;
103 int result = 0;
104
105 spin_lock_irqsave(&sem->lock, flags);
106 if (likely(sem->count > 0))
107 sem->count--;
108 else
109 result = __down_killable(sem);
110 spin_unlock_irqrestore(&sem->lock, flags);
111
112 return result;
113}
114EXPORT_SYMBOL(down_killable);
115
116/**
117 * down_trylock - try to acquire the semaphore, without waiting
118 * @sem: the semaphore to be acquired
119 *
120 * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
121 * been acquired successfully or 1 if it cannot be acquired.
122 *
123 * NOTE: This return value is inverted from both spin_trylock and
124 * mutex_trylock! Be careful about this when converting code.
125 *
126 * Unlike mutex_trylock, this function can be used from interrupt context,
127 * and the semaphore can be released by any task or interrupt.
128 */
129int down_trylock(struct semaphore *sem)
130{
131 unsigned long flags;
132 int count;
133
134 spin_lock_irqsave(&sem->lock, flags);
135 count = sem->count - 1;
136 if (likely(count >= 0))
137 sem->count = count;
138 spin_unlock_irqrestore(&sem->lock, flags);
139
140 return (count < 0);
141}
142EXPORT_SYMBOL(down_trylock);
143
144/**
145 * down_timeout - acquire the semaphore within a specified time
146 * @sem: the semaphore to be acquired
147 * @jiffies: how long to wait before failing
148 *
149 * Attempts to acquire the semaphore. If no more tasks are allowed to
150 * acquire the semaphore, calling this function will put the task to sleep.
151 * If the semaphore is not released within the specified number of jiffies,
152 * this function returns -ETIME. It returns 0 if the semaphore was acquired.
153 */
154int down_timeout(struct semaphore *sem, long jiffies)
155{
156 unsigned long flags;
157 int result = 0;
158
159 spin_lock_irqsave(&sem->lock, flags);
160 if (likely(sem->count > 0))
161 sem->count--;
162 else
163 result = __down_timeout(sem, jiffies);
164 spin_unlock_irqrestore(&sem->lock, flags);
165
166 return result;
167}
168EXPORT_SYMBOL(down_timeout);
169
170/**
171 * up - release the semaphore
172 * @sem: the semaphore to release
173 *
174 * Release the semaphore. Unlike mutexes, up() may be called from any
175 * context and even by tasks which have never called down().
176 */
177void up(struct semaphore *sem)
178{
179 unsigned long flags;
180
181 spin_lock_irqsave(&sem->lock, flags);
182 if (likely(list_empty(&sem->wait_list)))
183 sem->count++;
184 else
185 __up(sem);
186 spin_unlock_irqrestore(&sem->lock, flags);
187}
188EXPORT_SYMBOL(up);
189
190/* Functions for the contended case */
191
192struct semaphore_waiter {
193 struct list_head list;
194 struct task_struct *task;
195 int up;
196};
197
198/*
199 * Because this function is inlined, the 'state' parameter will be
200 * constant, and thus optimised away by the compiler. Likewise the
201 * 'timeout' parameter for the cases without timeouts.
202 */
203static inline int __sched __down_common(struct semaphore *sem, long state,
204 long timeout)
205{
206 struct task_struct *task = current;
207 struct semaphore_waiter waiter;
208
209 list_add_tail(&waiter.list, &sem->wait_list);
210 waiter.task = task;
211 waiter.up = 0;
212
213 for (;;) {
214 if (state == TASK_INTERRUPTIBLE && signal_pending(task))
215 goto interrupted;
216 if (state == TASK_KILLABLE && fatal_signal_pending(task))
217 goto interrupted;
218 if (timeout <= 0)
219 goto timed_out;
220 __set_task_state(task, state);
221 spin_unlock_irq(&sem->lock);
222 timeout = schedule_timeout(timeout);
223 spin_lock_irq(&sem->lock);
224 if (waiter.up)
225 return 0;
226 }
227
228 timed_out:
229 list_del(&waiter.list);
230 return -ETIME;
231
232 interrupted:
233 list_del(&waiter.list);
234 return -EINTR;
235}
236
237static noinline void __sched __down(struct semaphore *sem)
238{
239 __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
240}
241
242static noinline int __sched __down_interruptible(struct semaphore *sem)
243{
244 return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
245}
246
247static noinline int __sched __down_killable(struct semaphore *sem)
248{
249 return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
250}
251
252static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
253{
254 return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
255}
256
257static noinline void __sched __up(struct semaphore *sem)
258{
259 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
260 struct semaphore_waiter, list);
261 list_del(&waiter->list);
262 waiter->up = 1;
263 wake_up_process(waiter->task);
264}
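
For callers, the new generic semaphore keeps the familiar API. A kernel-context sketch (not a standalone program) of typical use; the foo_* names are hypothetical, and sema_init() is assumed to be provided by <linux/semaphore.h> alongside this file:

#include <linux/errno.h>
#include <linux/semaphore.h>

static struct semaphore foo_sem;		/* hypothetical per-device lock */

static void foo_setup(void)
{
	sema_init(&foo_sem, 1);			/* count of 1 == binary semaphore */
}

static int foo_ioctl_path(void)
{
	/* process context: sleep until the semaphore is free, bail on a signal */
	if (down_interruptible(&foo_sem))
		return -EINTR;
	/* ... critical section ... */
	up(&foo_sem);
	return 0;
}

static int foo_irq_path(void)
{
	/* interrupt context: only the non-sleeping variant is safe here;
	 * 0 means acquired, 1 means it is currently held */
	if (down_trylock(&foo_sem))
		return -EBUSY;
	/* ... critical section ... */
	up(&foo_sem);
	return 0;
}

Note the inverted sense of down_trylock() compared with mutex_trylock() and spin_trylock(), as the comment in the file warns.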
diff --git a/kernel/signal.c b/kernel/signal.c
index 6af1210092c3..72bb4f51f963 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,11 +39,19 @@
39 39
40static struct kmem_cache *sigqueue_cachep; 40static struct kmem_cache *sigqueue_cachep;
41 41
42static int __sig_ignored(struct task_struct *t, int sig)
43{
44 void __user *handler;
45
46 /* Is it explicitly or implicitly ignored? */
47
48 handler = t->sighand->action[sig - 1].sa.sa_handler;
49 return handler == SIG_IGN ||
50 (handler == SIG_DFL && sig_kernel_ignore(sig));
51}
42 52
43static int sig_ignored(struct task_struct *t, int sig) 53static int sig_ignored(struct task_struct *t, int sig)
44{ 54{
45 void __user * handler;
46
47 /* 55 /*
48 * Tracers always want to know about signals.. 56 * Tracers always want to know about signals..
49 */ 57 */
@@ -58,10 +66,7 @@ static int sig_ignored(struct task_struct *t, int sig)
58 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 66 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
59 return 0; 67 return 0;
60 68
61 /* Is it explicitly or implicitly ignored? */ 69 return __sig_ignored(t, sig);
62 handler = t->sighand->action[sig-1].sa.sa_handler;
63 return handler == SIG_IGN ||
64 (handler == SIG_DFL && sig_kernel_ignore(sig));
65} 70}
66 71
67/* 72/*
@@ -220,7 +225,7 @@ void flush_signals(struct task_struct *t)
220 unsigned long flags; 225 unsigned long flags;
221 226
222 spin_lock_irqsave(&t->sighand->siglock, flags); 227 spin_lock_irqsave(&t->sighand->siglock, flags);
223 clear_tsk_thread_flag(t,TIF_SIGPENDING); 228 clear_tsk_thread_flag(t, TIF_SIGPENDING);
224 flush_sigqueue(&t->pending); 229 flush_sigqueue(&t->pending);
225 flush_sigqueue(&t->signal->shared_pending); 230 flush_sigqueue(&t->signal->shared_pending);
226 spin_unlock_irqrestore(&t->sighand->siglock, flags); 231 spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -372,7 +377,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
372 */ 377 */
373int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 378int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
374{ 379{
375 int signr = 0; 380 int signr;
376 381
377 /* We only dequeue private signals from ourselves, we don't let 382 /* We only dequeue private signals from ourselves, we don't let
378 * signalfd steal them 383 * signalfd steal them
@@ -405,8 +410,12 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
405 } 410 }
406 } 411 }
407 } 412 }
413
408 recalc_sigpending(); 414 recalc_sigpending();
409 if (signr && unlikely(sig_kernel_stop(signr))) { 415 if (!signr)
416 return 0;
417
418 if (unlikely(sig_kernel_stop(signr))) {
410 /* 419 /*
411 * Set a marker that we have dequeued a stop signal. Our 420 * Set a marker that we have dequeued a stop signal. Our
412 * caller might release the siglock and then the pending 421 * caller might release the siglock and then the pending
@@ -422,9 +431,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
422 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 431 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
423 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 432 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
424 } 433 }
425 if (signr && 434 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
426 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
427 info->si_sys_private){
428 /* 435 /*
429 * Release the siglock to ensure proper locking order 436 * Release the siglock to ensure proper locking order
430 * of timer locks outside of siglocks. Note, we leave 437 * of timer locks outside of siglocks. Note, we leave
@@ -526,21 +533,34 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
526static int check_kill_permission(int sig, struct siginfo *info, 533static int check_kill_permission(int sig, struct siginfo *info,
527 struct task_struct *t) 534 struct task_struct *t)
528{ 535{
529 int error = -EINVAL; 536 struct pid *sid;
537 int error;
538
530 if (!valid_signal(sig)) 539 if (!valid_signal(sig))
531 return error; 540 return -EINVAL;
532 541
533 if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { 542 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
534 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 543 return 0;
535 if (error) 544
536 return error; 545 error = audit_signal_info(sig, t); /* Let audit system see the signal */
537 error = -EPERM; 546 if (error)
538 if (((sig != SIGCONT) ||
539 (task_session_nr(current) != task_session_nr(t)))
540 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
541 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
542 && !capable(CAP_KILL))
543 return error; 547 return error;
548
549 if ((current->euid ^ t->suid) && (current->euid ^ t->uid) &&
550 (current->uid ^ t->suid) && (current->uid ^ t->uid) &&
551 !capable(CAP_KILL)) {
552 switch (sig) {
553 case SIGCONT:
554 sid = task_session(t);
555 /*
556 * We don't return the error if sid == NULL. The
557 * task was unhashed, the caller must notice this.
558 */
559 if (!sid || sid == task_session(current))
560 break;
561 default:
562 return -EPERM;
563 }
544 } 564 }
545 565
546 return security_task_kill(t, info, sig, 0); 566 return security_task_kill(t, info, sig, 0);
@@ -550,62 +570,44 @@ static int check_kill_permission(int sig, struct siginfo *info,
550static void do_notify_parent_cldstop(struct task_struct *tsk, int why); 570static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
551 571
552/* 572/*
553 * Handle magic process-wide effects of stop/continue signals. 573 * Handle magic process-wide effects of stop/continue signals. Unlike
554 * Unlike the signal actions, these happen immediately at signal-generation 574 * the signal actions, these happen immediately at signal-generation
555 * time regardless of blocking, ignoring, or handling. This does the 575 * time regardless of blocking, ignoring, or handling. This does the
556 * actual continuing for SIGCONT, but not the actual stopping for stop 576 * actual continuing for SIGCONT, but not the actual stopping for stop
557 * signals. The process stop is done as a signal action for SIG_DFL. 577 * signals. The process stop is done as a signal action for SIG_DFL.
578 *
579 * Returns true if the signal should be actually delivered, otherwise
580 * it should be dropped.
558 */ 581 */
559static void handle_stop_signal(int sig, struct task_struct *p) 582static int prepare_signal(int sig, struct task_struct *p)
560{ 583{
584 struct signal_struct *signal = p->signal;
561 struct task_struct *t; 585 struct task_struct *t;
562 586
563 if (p->signal->flags & SIGNAL_GROUP_EXIT) 587 if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
564 /* 588 /*
565 * The process is in the middle of dying already. 589 * The process is in the middle of dying, nothing to do.
566 */ 590 */
567 return; 591 } else if (sig_kernel_stop(sig)) {
568
569 if (sig_kernel_stop(sig)) {
570 /* 592 /*
571 * This is a stop signal. Remove SIGCONT from all queues. 593 * This is a stop signal. Remove SIGCONT from all queues.
572 */ 594 */
573 rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); 595 rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
574 t = p; 596 t = p;
575 do { 597 do {
576 rm_from_queue(sigmask(SIGCONT), &t->pending); 598 rm_from_queue(sigmask(SIGCONT), &t->pending);
577 t = next_thread(t); 599 } while_each_thread(p, t);
578 } while (t != p);
579 } else if (sig == SIGCONT) { 600 } else if (sig == SIGCONT) {
601 unsigned int why;
580 /* 602 /*
581 * Remove all stop signals from all queues, 603 * Remove all stop signals from all queues,
582 * and wake all threads. 604 * and wake all threads.
583 */ 605 */
584 if (unlikely(p->signal->group_stop_count > 0)) { 606 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
585 /*
586 * There was a group stop in progress. We'll
587 * pretend it finished before we got here. We are
588 * obliged to report it to the parent: if the
589 * SIGSTOP happened "after" this SIGCONT, then it
590 * would have cleared this pending SIGCONT. If it
591 * happened "before" this SIGCONT, then the parent
592 * got the SIGCHLD about the stop finishing before
593 * the continue happened. We do the notification
594 * now, and it's as if the stop had finished and
595 * the SIGCHLD was pending on entry to this kill.
596 */
597 p->signal->group_stop_count = 0;
598 p->signal->flags = SIGNAL_STOP_CONTINUED;
599 spin_unlock(&p->sighand->siglock);
600 do_notify_parent_cldstop(p, CLD_STOPPED);
601 spin_lock(&p->sighand->siglock);
602 }
603 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
604 t = p; 607 t = p;
605 do { 608 do {
606 unsigned int state; 609 unsigned int state;
607 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 610 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
608
609 /* 611 /*
610 * If there is a handler for SIGCONT, we must make 612 * If there is a handler for SIGCONT, we must make
611 * sure that no thread returns to user mode before 613 * sure that no thread returns to user mode before
@@ -615,7 +617,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
615 * running the handler. With the TIF_SIGPENDING 617 * running the handler. With the TIF_SIGPENDING
616 * flag set, the thread will pause and acquire the 618 * flag set, the thread will pause and acquire the
617 * siglock that we hold now and until we've queued 619 * siglock that we hold now and until we've queued
618 * the pending signal. 620 * the pending signal.
619 * 621 *
620 * Wake up the stopped thread _after_ setting 622 * Wake up the stopped thread _after_ setting
621 * TIF_SIGPENDING 623 * TIF_SIGPENDING
@@ -626,49 +628,163 @@ static void handle_stop_signal(int sig, struct task_struct *p)
626 state |= TASK_INTERRUPTIBLE; 628 state |= TASK_INTERRUPTIBLE;
627 } 629 }
628 wake_up_state(t, state); 630 wake_up_state(t, state);
631 } while_each_thread(p, t);
629 632
630 t = next_thread(t); 633 /*
631 } while (t != p); 634 * Notify the parent with CLD_CONTINUED if we were stopped.
635 *
636 * If we were in the middle of a group stop, we pretend it
637 * was already finished, and then continued. Since SIGCHLD
638 * doesn't queue we report only CLD_STOPPED, as if the next
639 * CLD_CONTINUED was dropped.
640 */
641 why = 0;
642 if (signal->flags & SIGNAL_STOP_STOPPED)
643 why |= SIGNAL_CLD_CONTINUED;
644 else if (signal->group_stop_count)
645 why |= SIGNAL_CLD_STOPPED;
632 646
633 if (p->signal->flags & SIGNAL_STOP_STOPPED) { 647 if (why) {
634 /* 648 /*
635 * We were in fact stopped, and are now continued. 649 * The first thread which returns from finish_stop()
636 * Notify the parent with CLD_CONTINUED. 650 * will take ->siglock, notice SIGNAL_CLD_MASK, and
651 * notify its parent. See get_signal_to_deliver().
637 */ 652 */
638 p->signal->flags = SIGNAL_STOP_CONTINUED; 653 signal->flags = why | SIGNAL_STOP_CONTINUED;
639 p->signal->group_exit_code = 0; 654 signal->group_stop_count = 0;
640 spin_unlock(&p->sighand->siglock); 655 signal->group_exit_code = 0;
641 do_notify_parent_cldstop(p, CLD_CONTINUED);
642 spin_lock(&p->sighand->siglock);
643 } else { 656 } else {
644 /* 657 /*
645 * We are not stopped, but there could be a stop 658 * We are not stopped, but there could be a stop
646 * signal in the middle of being processed after 659 * signal in the middle of being processed after
647 * being removed from the queue. Clear that too. 660 * being removed from the queue. Clear that too.
648 */ 661 */
649 p->signal->flags = 0; 662 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
663 }
664 }
665
666 return !sig_ignored(p, sig);
667}
668
669/*
670 * Test if P wants to take SIG. After we've checked all threads with this,
671 * it's equivalent to finding no threads not blocking SIG. Any threads not
672 * blocking SIG were ruled out because they are not running and already
673 * have pending signals. Such threads will dequeue from the shared queue
674 * as soon as they're available, so putting the signal on the shared queue
675 * will be equivalent to sending it to one such thread.
676 */
677static inline int wants_signal(int sig, struct task_struct *p)
678{
679 if (sigismember(&p->blocked, sig))
680 return 0;
681 if (p->flags & PF_EXITING)
682 return 0;
683 if (sig == SIGKILL)
684 return 1;
685 if (task_is_stopped_or_traced(p))
686 return 0;
687 return task_curr(p) || !signal_pending(p);
688}
689
690static void complete_signal(int sig, struct task_struct *p, int group)
691{
692 struct signal_struct *signal = p->signal;
693 struct task_struct *t;
694
695 /*
696 * Now find a thread we can wake up to take the signal off the queue.
697 *
698 * If the main thread wants the signal, it gets first crack.
699 * Probably the least surprising to the average bear.
700 */
701 if (wants_signal(sig, p))
702 t = p;
703 else if (!group || thread_group_empty(p))
704 /*
705 * There is just one thread and it does not need to be woken.
706 * It will dequeue unblocked signals before it runs again.
707 */
708 return;
709 else {
710 /*
711 * Otherwise try to find a suitable thread.
712 */
713 t = signal->curr_target;
714 while (!wants_signal(sig, t)) {
715 t = next_thread(t);
716 if (t == signal->curr_target)
717 /*
718 * No thread needs to be woken.
719 * Any eligible threads will see
720 * the signal in the queue soon.
721 */
722 return;
650 } 723 }
651 } else if (sig == SIGKILL) { 724 signal->curr_target = t;
725 }
726
727 /*
728 * Found a killable thread. If the signal will be fatal,
729 * then start taking the whole group down immediately.
730 */
731 if (sig_fatal(p, sig) &&
732 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
733 !sigismember(&t->real_blocked, sig) &&
734 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
652 /* 735 /*
653 * Make sure that any pending stop signal already dequeued 736 * This signal will be fatal to the whole group.
654 * is undone by the wakeup for SIGKILL.
655 */ 737 */
656 p->signal->flags = 0; 738 if (!sig_kernel_coredump(sig)) {
739 /*
740 * Start a group exit and wake everybody up.
741 * This way we don't have other threads
742 * running and doing things after a slower
743 * thread has the fatal signal pending.
744 */
745 signal->flags = SIGNAL_GROUP_EXIT;
746 signal->group_exit_code = sig;
747 signal->group_stop_count = 0;
748 t = p;
749 do {
750 sigaddset(&t->pending.signal, SIGKILL);
751 signal_wake_up(t, 1);
752 } while_each_thread(p, t);
753 return;
754 }
657 } 755 }
756
757 /*
758 * The signal is already in the shared-pending queue.
759 * Tell the chosen thread to wake up and dequeue it.
760 */
761 signal_wake_up(t, sig == SIGKILL);
762 return;
763}
764
765static inline int legacy_queue(struct sigpending *signals, int sig)
766{
767 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
658} 768}
659 769
660static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 770static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
661 struct sigpending *signals) 771 int group)
662{ 772{
663 struct sigqueue * q = NULL; 773 struct sigpending *pending;
664 int ret = 0; 774 struct sigqueue *q;
775
776 assert_spin_locked(&t->sighand->siglock);
777 if (!prepare_signal(sig, t))
778 return 0;
665 779
780 pending = group ? &t->signal->shared_pending : &t->pending;
666 /* 781 /*
667 * Deliver the signal to listening signalfds. This must be called 782 * Short-circuit ignored signals and support queuing
668 * with the sighand lock held. 783 * exactly one non-rt signal, so that we can get more
784 * detailed information about the cause of the signal.
669 */ 785 */
670 signalfd_notify(t, sig); 786 if (legacy_queue(pending, sig))
671 787 return 0;
672 /* 788 /*
673 * fast-pathed signals for kernel-internal things like SIGSTOP 789 * fast-pathed signals for kernel-internal things like SIGSTOP
674 * or SIGKILL. 790 * or SIGKILL.
@@ -688,7 +804,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
688 (is_si_special(info) || 804 (is_si_special(info) ||
689 info->si_code >= 0))); 805 info->si_code >= 0)));
690 if (q) { 806 if (q) {
691 list_add_tail(&q->list, &signals->list); 807 list_add_tail(&q->list, &pending->list);
692 switch ((unsigned long) info) { 808 switch ((unsigned long) info) {
693 case (unsigned long) SEND_SIG_NOINFO: 809 case (unsigned long) SEND_SIG_NOINFO:
694 q->info.si_signo = sig; 810 q->info.si_signo = sig;
@@ -718,13 +834,12 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
718 } 834 }
719 835
720out_set: 836out_set:
721 sigaddset(&signals->signal, sig); 837 signalfd_notify(t, sig);
722 return ret; 838 sigaddset(&pending->signal, sig);
839 complete_signal(sig, t, group);
840 return 0;
723} 841}
724 842
725#define LEGACY_QUEUE(sigptr, sig) \
726 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
727
728int print_fatal_signals; 843int print_fatal_signals;
729 844
730static void print_fatal_signal(struct pt_regs *regs, int signr) 845static void print_fatal_signal(struct pt_regs *regs, int signr)
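
With the rework above, every sender funnels through the single send_signal(sig, info, t, group): prepare_signal() handles the stop/continue side effects and the ignored case, legacy_queue() coalesces an already-pending non-realtime signal, and complete_signal() picks the thread to wake. A standalone userspace model of the legacy_queue() coalescing rule; struct pending is a simplified stand-in for struct sigpending:

#include <stdbool.h>
#include <stdio.h>

#define SIGRTMIN 32	/* usual Linux convention */
#define NSIG     64

struct pending {
	bool member[NSIG + 1];	/* stand-in for the sigset_t */
	int queued;		/* stand-in for the sigqueue list length */
};

static bool legacy_queue(struct pending *p, int sig)
{
	return sig < SIGRTMIN && p->member[sig];
}

static void send(struct pending *p, int sig)
{
	if (legacy_queue(p, sig))
		return;		/* classic signal already pending: dropped */
	p->member[sig] = true;
	p->queued++;		/* complete_signal() would run here */
}

int main(void)
{
	struct pending shared = { { false }, 0 };

	send(&shared, 10);		/* SIGUSR1 */
	send(&shared, 10);		/* coalesced */
	send(&shared, SIGRTMIN);	/* realtime: queued */
	send(&shared, SIGRTMIN);	/* realtime: queued again */

	printf("entries queued: %d\n", shared.queued);	/* 3 */
	return 0;
}

Realtime signals keep queueing separate entries so their siginfo payloads are not lost; classic signals collapse to a single pending bit.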
@@ -757,29 +872,16 @@ static int __init setup_print_fatal_signals(char *str)
757 872
758__setup("print-fatal-signals=", setup_print_fatal_signals); 873__setup("print-fatal-signals=", setup_print_fatal_signals);
759 874
875int
876__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
877{
878 return send_signal(sig, info, p, 1);
879}
880
760static int 881static int
761specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) 882specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
762{ 883{
763 int ret = 0; 884 return send_signal(sig, info, t, 0);
764
765 BUG_ON(!irqs_disabled());
766 assert_spin_locked(&t->sighand->siglock);
767
768 /* Short-circuit ignored signals. */
769 if (sig_ignored(t, sig))
770 goto out;
771
772 /* Support queueing exactly one non-rt signal, so that we
773 can get more detailed information about the cause of
774 the signal. */
775 if (LEGACY_QUEUE(&t->pending, sig))
776 goto out;
777
778 ret = send_signal(sig, info, t, &t->pending);
779 if (!ret && !sigismember(&t->blocked, sig))
780 signal_wake_up(t, sig == SIGKILL);
781out:
782 return ret;
783} 885}
784 886
785/* 887/*
@@ -790,7 +892,8 @@ out:
790 * since we do not want to have a signal handler that was blocked 892 * since we do not want to have a signal handler that was blocked
791 * be invoked when user space had explicitly blocked it. 893 * be invoked when user space had explicitly blocked it.
792 * 894 *
793 * We don't want to have recursive SIGSEGV's etc, for example. 895 * We don't want to have recursive SIGSEGV's etc, for example,
896 * that is why we also clear SIGNAL_UNKILLABLE.
794 */ 897 */
795int 898int
796force_sig_info(int sig, struct siginfo *info, struct task_struct *t) 899force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
@@ -810,6 +913,8 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
810 recalc_sigpending_and_wake(t); 913 recalc_sigpending_and_wake(t);
811 } 914 }
812 } 915 }
916 if (action->sa.sa_handler == SIG_DFL)
917 t->signal->flags &= ~SIGNAL_UNKILLABLE;
813 ret = specific_send_sig_info(sig, info, t); 918 ret = specific_send_sig_info(sig, info, t);
814 spin_unlock_irqrestore(&t->sighand->siglock, flags); 919 spin_unlock_irqrestore(&t->sighand->siglock, flags);
815 920
@@ -823,134 +928,6 @@ force_sig_specific(int sig, struct task_struct *t)
823} 928}
824 929
825/* 930/*
826 * Test if P wants to take SIG. After we've checked all threads with this,
827 * it's equivalent to finding no threads not blocking SIG. Any threads not
828 * blocking SIG were ruled out because they are not running and already
829 * have pending signals. Such threads will dequeue from the shared queue
830 * as soon as they're available, so putting the signal on the shared queue
831 * will be equivalent to sending it to one such thread.
832 */
833static inline int wants_signal(int sig, struct task_struct *p)
834{
835 if (sigismember(&p->blocked, sig))
836 return 0;
837 if (p->flags & PF_EXITING)
838 return 0;
839 if (sig == SIGKILL)
840 return 1;
841 if (task_is_stopped_or_traced(p))
842 return 0;
843 return task_curr(p) || !signal_pending(p);
844}
845
846static void
847__group_complete_signal(int sig, struct task_struct *p)
848{
849 struct task_struct *t;
850
851 /*
852 * Now find a thread we can wake up to take the signal off the queue.
853 *
854 * If the main thread wants the signal, it gets first crack.
855 * Probably the least surprising to the average bear.
856 */
857 if (wants_signal(sig, p))
858 t = p;
859 else if (thread_group_empty(p))
860 /*
861 * There is just one thread and it does not need to be woken.
862 * It will dequeue unblocked signals before it runs again.
863 */
864 return;
865 else {
866 /*
867 * Otherwise try to find a suitable thread.
868 */
869 t = p->signal->curr_target;
870 if (t == NULL)
871 /* restart balancing at this thread */
872 t = p->signal->curr_target = p;
873
874 while (!wants_signal(sig, t)) {
875 t = next_thread(t);
876 if (t == p->signal->curr_target)
877 /*
878 * No thread needs to be woken.
879 * Any eligible threads will see
880 * the signal in the queue soon.
881 */
882 return;
883 }
884 p->signal->curr_target = t;
885 }
886
887 /*
888 * Found a killable thread. If the signal will be fatal,
889 * then start taking the whole group down immediately.
890 */
891 if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) &&
892 !sigismember(&t->real_blocked, sig) &&
893 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
894 /*
895 * This signal will be fatal to the whole group.
896 */
897 if (!sig_kernel_coredump(sig)) {
898 /*
899 * Start a group exit and wake everybody up.
900 * This way we don't have other threads
901 * running and doing things after a slower
902 * thread has the fatal signal pending.
903 */
904 p->signal->flags = SIGNAL_GROUP_EXIT;
905 p->signal->group_exit_code = sig;
906 p->signal->group_stop_count = 0;
907 t = p;
908 do {
909 sigaddset(&t->pending.signal, SIGKILL);
910 signal_wake_up(t, 1);
911 } while_each_thread(p, t);
912 return;
913 }
914 }
915
916 /*
917 * The signal is already in the shared-pending queue.
918 * Tell the chosen thread to wake up and dequeue it.
919 */
920 signal_wake_up(t, sig == SIGKILL);
921 return;
922}
923
924int
925__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
926{
927 int ret = 0;
928
929 assert_spin_locked(&p->sighand->siglock);
930 handle_stop_signal(sig, p);
931
932 /* Short-circuit ignored signals. */
933 if (sig_ignored(p, sig))
934 return ret;
935
936 if (LEGACY_QUEUE(&p->signal->shared_pending, sig))
937 /* This is a non-RT signal and we already have one queued. */
938 return ret;
939
940 /*
941 * Put this signal on the shared-pending queue, or fail with EAGAIN.
942 * We always use the shared queue for process-wide signals,
943 * to avoid several races.
944 */
945 ret = send_signal(sig, info, p, &p->signal->shared_pending);
946 if (unlikely(ret))
947 return ret;
948
949 __group_complete_signal(sig, p);
950 return 0;
951}
952
953/*
954 * Nuke all other threads in the group. 931 * Nuke all other threads in the group.
955 */ 932 */
956void zap_other_threads(struct task_struct *p) 933void zap_other_threads(struct task_struct *p)
@@ -978,13 +955,11 @@ int __fatal_signal_pending(struct task_struct *tsk)
978} 955}
979EXPORT_SYMBOL(__fatal_signal_pending); 956EXPORT_SYMBOL(__fatal_signal_pending);
980 957
981/*
982 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
983 */
984struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 958struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
985{ 959{
986 struct sighand_struct *sighand; 960 struct sighand_struct *sighand;
987 961
962 rcu_read_lock();
988 for (;;) { 963 for (;;) {
989 sighand = rcu_dereference(tsk->sighand); 964 sighand = rcu_dereference(tsk->sighand);
990 if (unlikely(sighand == NULL)) 965 if (unlikely(sighand == NULL))
@@ -995,6 +970,7 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
995 break; 970 break;
996 spin_unlock_irqrestore(&sighand->siglock, *flags); 971 spin_unlock_irqrestore(&sighand->siglock, *flags);
997 } 972 }
973 rcu_read_unlock();
998 974
999 return sighand; 975 return sighand;
1000} 976}
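
Since lock_task_sighand() now takes rcu_read_lock() itself, callers only need to handle the NULL return that marks an exiting task. A kernel-context sketch (not standalone) of the expected calling pattern; example_signal_op() is hypothetical:

#include <linux/errno.h>
#include <linux/sched.h>

static int example_signal_op(struct task_struct *tsk)
{
	unsigned long flags;

	if (!lock_task_sighand(tsk, &flags))
		return -ESRCH;	/* tsk->sighand is already gone */

	/* ->siglock held, interrupts disabled: signal state is stable here */

	unlock_task_sighand(tsk, &flags);
	return 0;
}

Callers that used to wrap the call in rcu_read_lock() can now drop it, as the removed comment above indicates.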
@@ -1043,9 +1019,6 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1043 struct task_struct *p; 1019 struct task_struct *p;
1044 1020
1045 rcu_read_lock(); 1021 rcu_read_lock();
1046 if (unlikely(sig_needs_tasklist(sig)))
1047 read_lock(&tasklist_lock);
1048
1049retry: 1022retry:
1050 p = pid_task(pid, PIDTYPE_PID); 1023 p = pid_task(pid, PIDTYPE_PID);
1051 if (p) { 1024 if (p) {
@@ -1059,10 +1032,8 @@ retry:
1059 */ 1032 */
1060 goto retry; 1033 goto retry;
1061 } 1034 }
1062
1063 if (unlikely(sig_needs_tasklist(sig)))
1064 read_unlock(&tasklist_lock);
1065 rcu_read_unlock(); 1035 rcu_read_unlock();
1036
1066 return error; 1037 return error;
1067} 1038}
1068 1039
@@ -1159,8 +1130,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1159 */ 1130 */
1160 1131
1161/* 1132/*
1162 * These two are the most common entry points. They send a signal 1133 * The caller must ensure the task can't exit.
1163 * just to the specific thread.
1164 */ 1134 */
1165int 1135int
1166send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1136send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
@@ -1175,17 +1145,9 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1175 if (!valid_signal(sig)) 1145 if (!valid_signal(sig))
1176 return -EINVAL; 1146 return -EINVAL;
1177 1147
1178 /*
1179 * We need the tasklist lock even for the specific
1180 * thread case (when we don't need to follow the group
1181 * lists) in order to avoid races with "p->sighand"
1182 * going away or changing from under us.
1183 */
1184 read_lock(&tasklist_lock);
1185 spin_lock_irqsave(&p->sighand->siglock, flags); 1148 spin_lock_irqsave(&p->sighand->siglock, flags);
1186 ret = specific_send_sig_info(sig, info, p); 1149 ret = specific_send_sig_info(sig, info, p);
1187 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1150 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1188 read_unlock(&tasklist_lock);
1189 return ret; 1151 return ret;
1190} 1152}
1191 1153
@@ -1291,28 +1253,24 @@ void sigqueue_free(struct sigqueue *q)
1291 __sigqueue_free(q); 1253 __sigqueue_free(q);
1292} 1254}
1293 1255
1294int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) 1256int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1295{ 1257{
1258 int sig = q->info.si_signo;
1259 struct sigpending *pending;
1296 unsigned long flags; 1260 unsigned long flags;
1297 int ret = 0; 1261 int ret;
1298 1262
1299 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1263 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1300 1264
1301 /* 1265 ret = -1;
1302 * The rcu based delayed sighand destroy makes it possible to 1266 if (!likely(lock_task_sighand(t, &flags)))
1303 * run this without tasklist lock held. The task struct itself 1267 goto ret;
1304 * cannot go away as create_timer did get_task_struct().
1305 *
1306 * We return -1, when the task is marked exiting, so
1307 * posix_timer_event can redirect it to the group leader
1308 */
1309 rcu_read_lock();
1310 1268
1311 if (!likely(lock_task_sighand(p, &flags))) { 1269 ret = 1; /* the signal is ignored */
1312 ret = -1; 1270 if (!prepare_signal(sig, t))
1313 goto out_err; 1271 goto out;
1314 }
1315 1272
1273 ret = 0;
1316 if (unlikely(!list_empty(&q->list))) { 1274 if (unlikely(!list_empty(&q->list))) {
1317 /* 1275 /*
1318 * If an SI_TIMER entry is already queued, just increment 1276 * If an SI_TIMER entry is already queued, just increment
@@ -1322,77 +1280,15 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1322 q->info.si_overrun++; 1280 q->info.si_overrun++;
1323 goto out; 1281 goto out;
1324 } 1282 }
1325 /* Short-circuit ignored signals. */
1326 if (sig_ignored(p, sig)) {
1327 ret = 1;
1328 goto out;
1329 }
1330 /*
1331 * Deliver the signal to listening signalfds. This must be called
1332 * with the sighand lock held.
1333 */
1334 signalfd_notify(p, sig);
1335
1336 list_add_tail(&q->list, &p->pending.list);
1337 sigaddset(&p->pending.signal, sig);
1338 if (!sigismember(&p->blocked, sig))
1339 signal_wake_up(p, sig == SIGKILL);
1340
1341out:
1342 unlock_task_sighand(p, &flags);
1343out_err:
1344 rcu_read_unlock();
1345
1346 return ret;
1347}
1348
1349int
1350send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1351{
1352 unsigned long flags;
1353 int ret = 0;
1354
1355 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1356
1357 read_lock(&tasklist_lock);
1358 /* Since it_lock is held, p->sighand cannot be NULL. */
1359 spin_lock_irqsave(&p->sighand->siglock, flags);
1360 handle_stop_signal(sig, p);
1361
1362 /* Short-circuit ignored signals. */
1363 if (sig_ignored(p, sig)) {
1364 ret = 1;
1365 goto out;
1366 }
1367
1368 if (unlikely(!list_empty(&q->list))) {
1369 /*
1370 * If an SI_TIMER entry is already queued, just increment
1371 * the overrun count. Other uses should not try to
1372 * send the signal multiple times.
1373 */
1374 BUG_ON(q->info.si_code != SI_TIMER);
1375 q->info.si_overrun++;
1376 goto out;
1377 }
1378 /*
1379 * Deliver the signal to listening signalfds. This must be called
1380 * with the sighand lock held.
1381 */
1382 signalfd_notify(p, sig);
1383 1283
1384 /* 1284 signalfd_notify(t, sig);
1385 * Put this signal on the shared-pending queue. 1285 pending = group ? &t->signal->shared_pending : &t->pending;
1386 * We always use the shared queue for process-wide signals, 1286 list_add_tail(&q->list, &pending->list);
1387 * to avoid several races. 1287 sigaddset(&pending->signal, sig);
1388 */ 1288 complete_signal(sig, t, group);
1389 list_add_tail(&q->list, &p->signal->shared_pending.list);
1390 sigaddset(&p->signal->shared_pending.signal, sig);
1391
1392 __group_complete_signal(sig, p);
1393out: 1289out:
1394 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1290 unlock_task_sighand(t, &flags);
1395 read_unlock(&tasklist_lock); 1291ret:
1396 return ret; 1292 return ret;
1397} 1293}
1398 1294
@@ -1723,8 +1619,9 @@ static int do_signal_stop(int signr)
1723 } else { 1619 } else {
1724 struct task_struct *t; 1620 struct task_struct *t;
1725 1621
1726 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1622 if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
1727 unlikely(sig->group_exit_task)) 1623 != SIGNAL_STOP_DEQUEUED) ||
1624 unlikely(signal_group_exit(sig)))
1728 return 0; 1625 return 0;
1729 /* 1626 /*
1730 * There is no group stop already in progress. 1627 * There is no group stop already in progress.
@@ -1757,11 +1654,51 @@ static int do_signal_stop(int signr)
1757 return 1; 1654 return 1;
1758} 1655}
1759 1656
1657static int ptrace_signal(int signr, siginfo_t *info,
1658 struct pt_regs *regs, void *cookie)
1659{
1660 if (!(current->ptrace & PT_PTRACED))
1661 return signr;
1662
1663 ptrace_signal_deliver(regs, cookie);
1664
1665 /* Let the debugger run. */
1666 ptrace_stop(signr, 0, info);
1667
1668 /* We're back. Did the debugger cancel the sig? */
1669 signr = current->exit_code;
1670 if (signr == 0)
1671 return signr;
1672
1673 current->exit_code = 0;
1674
1675 /* Update the siginfo structure if the signal has
1676 changed. If the debugger wanted something
1677 specific in the siginfo structure then it should
1678 have updated *info via PTRACE_SETSIGINFO. */
1679 if (signr != info->si_signo) {
1680 info->si_signo = signr;
1681 info->si_errno = 0;
1682 info->si_code = SI_USER;
1683 info->si_pid = task_pid_vnr(current->parent);
1684 info->si_uid = current->parent->uid;
1685 }
1686
1687 /* If the (new) signal is now blocked, requeue it. */
1688 if (sigismember(&current->blocked, signr)) {
1689 specific_send_sig_info(signr, info, current);
1690 signr = 0;
1691 }
1692
1693 return signr;
1694}
1695
1760int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 1696int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1761 struct pt_regs *regs, void *cookie) 1697 struct pt_regs *regs, void *cookie)
1762{ 1698{
1763 sigset_t *mask = &current->blocked; 1699 struct sighand_struct *sighand = current->sighand;
1764 int signr = 0; 1700 struct signal_struct *signal = current->signal;
1701 int signr;
1765 1702
1766relock: 1703relock:
1767 /* 1704 /*
@@ -1772,52 +1709,42 @@ relock:
1772 */ 1709 */
1773 try_to_freeze(); 1710 try_to_freeze();
1774 1711
1775 spin_lock_irq(&current->sighand->siglock); 1712 spin_lock_irq(&sighand->siglock);
1713 /*
1714 * Every stopped thread goes here after wakeup. Check to see if
1715 * we should notify the parent, prepare_signal(SIGCONT) encodes
1716 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1717 */
1718 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1719 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1720 ? CLD_CONTINUED : CLD_STOPPED;
1721 signal->flags &= ~SIGNAL_CLD_MASK;
1722 spin_unlock_irq(&sighand->siglock);
1723
1724 read_lock(&tasklist_lock);
1725 do_notify_parent_cldstop(current->group_leader, why);
1726 read_unlock(&tasklist_lock);
1727 goto relock;
1728 }
1729
1776 for (;;) { 1730 for (;;) {
1777 struct k_sigaction *ka; 1731 struct k_sigaction *ka;
1778 1732
1779 if (unlikely(current->signal->group_stop_count > 0) && 1733 if (unlikely(signal->group_stop_count > 0) &&
1780 do_signal_stop(0)) 1734 do_signal_stop(0))
1781 goto relock; 1735 goto relock;
1782 1736
1783 signr = dequeue_signal(current, mask, info); 1737 signr = dequeue_signal(current, &current->blocked, info);
1784
1785 if (!signr) 1738 if (!signr)
1786 break; /* will return 0 */ 1739 break; /* will return 0 */
1787 1740
1788 if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { 1741 if (signr != SIGKILL) {
1789 ptrace_signal_deliver(regs, cookie); 1742 signr = ptrace_signal(signr, info, regs, cookie);
1790 1743 if (!signr)
1791 /* Let the debugger run. */
1792 ptrace_stop(signr, 0, info);
1793
1794 /* We're back. Did the debugger cancel the sig? */
1795 signr = current->exit_code;
1796 if (signr == 0)
1797 continue;
1798
1799 current->exit_code = 0;
1800
1801 /* Update the siginfo structure if the signal has
1802 changed. If the debugger wanted something
1803 specific in the siginfo structure then it should
1804 have updated *info via PTRACE_SETSIGINFO. */
1805 if (signr != info->si_signo) {
1806 info->si_signo = signr;
1807 info->si_errno = 0;
1808 info->si_code = SI_USER;
1809 info->si_pid = task_pid_vnr(current->parent);
1810 info->si_uid = current->parent->uid;
1811 }
1812
1813 /* If the (new) signal is now blocked, requeue it. */
1814 if (sigismember(&current->blocked, signr)) {
1815 specific_send_sig_info(signr, info, current);
1816 continue; 1744 continue;
1817 }
1818 } 1745 }
1819 1746
1820 ka = &current->sighand->action[signr-1]; 1747 ka = &sighand->action[signr-1];
1821 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1748 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1822 continue; 1749 continue;
1823 if (ka->sa.sa_handler != SIG_DFL) { 1750 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1839,7 +1766,8 @@ relock:
1839 /* 1766 /*
1840 * Global init gets no signals it doesn't want. 1767 * Global init gets no signals it doesn't want.
1841 */ 1768 */
1842 if (is_global_init(current)) 1769 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1770 !signal_group_exit(signal))
1843 continue; 1771 continue;
1844 1772
1845 if (sig_kernel_stop(signr)) { 1773 if (sig_kernel_stop(signr)) {
@@ -1854,14 +1782,14 @@ relock:
1854 * We need to check for that and bail out if necessary. 1782 * We need to check for that and bail out if necessary.
1855 */ 1783 */
1856 if (signr != SIGSTOP) { 1784 if (signr != SIGSTOP) {
1857 spin_unlock_irq(&current->sighand->siglock); 1785 spin_unlock_irq(&sighand->siglock);
1858 1786
1859 /* signals can be posted during this window */ 1787 /* signals can be posted during this window */
1860 1788
1861 if (is_current_pgrp_orphaned()) 1789 if (is_current_pgrp_orphaned())
1862 goto relock; 1790 goto relock;
1863 1791
1864 spin_lock_irq(&current->sighand->siglock); 1792 spin_lock_irq(&sighand->siglock);
1865 } 1793 }
1866 1794
1867 if (likely(do_signal_stop(signr))) { 1795 if (likely(do_signal_stop(signr))) {
@@ -1876,15 +1804,16 @@ relock:
1876 continue; 1804 continue;
1877 } 1805 }
1878 1806
1879 spin_unlock_irq(&current->sighand->siglock); 1807 spin_unlock_irq(&sighand->siglock);
1880 1808
1881 /* 1809 /*
1882 * Anything else is fatal, maybe with a core dump. 1810 * Anything else is fatal, maybe with a core dump.
1883 */ 1811 */
1884 current->flags |= PF_SIGNALED; 1812 current->flags |= PF_SIGNALED;
1885 if ((signr != SIGKILL) && print_fatal_signals) 1813
1886 print_fatal_signal(regs, signr);
1887 if (sig_kernel_coredump(signr)) { 1814 if (sig_kernel_coredump(signr)) {
1815 if (print_fatal_signals)
1816 print_fatal_signal(regs, signr);
1888 /* 1817 /*
1889 * If it was able to dump core, this kills all 1818 * If it was able to dump core, this kills all
1890 * other threads in the group and synchronizes with 1819 * other threads in the group and synchronizes with
@@ -1902,7 +1831,7 @@ relock:
1902 do_group_exit(signr); 1831 do_group_exit(signr);
1903 /* NOTREACHED */ 1832 /* NOTREACHED */
1904 } 1833 }
1905 spin_unlock_irq(&current->sighand->siglock); 1834 spin_unlock_irq(&sighand->siglock);
1906 return signr; 1835 return signr;
1907} 1836}
1908 1837
@@ -2246,6 +2175,7 @@ static int do_tkill(int tgid, int pid, int sig)
2246 int error; 2175 int error;
2247 struct siginfo info; 2176 struct siginfo info;
2248 struct task_struct *p; 2177 struct task_struct *p;
2178 unsigned long flags;
2249 2179
2250 error = -ESRCH; 2180 error = -ESRCH;
2251 info.si_signo = sig; 2181 info.si_signo = sig;
@@ -2254,22 +2184,24 @@ static int do_tkill(int tgid, int pid, int sig)
2254 info.si_pid = task_tgid_vnr(current); 2184 info.si_pid = task_tgid_vnr(current);
2255 info.si_uid = current->uid; 2185 info.si_uid = current->uid;
2256 2186
2257 read_lock(&tasklist_lock); 2187 rcu_read_lock();
2258 p = find_task_by_vpid(pid); 2188 p = find_task_by_vpid(pid);
2259 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2189 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2260 error = check_kill_permission(sig, &info, p); 2190 error = check_kill_permission(sig, &info, p);
2261 /* 2191 /*
2262 * The null signal is a permissions and process existence 2192 * The null signal is a permissions and process existence
2263 * probe. No signal is actually delivered. 2193 * probe. No signal is actually delivered.
2194 *
2195 * If lock_task_sighand() fails we pretend the task dies
2196 * after receiving the signal. The window is tiny, and the
2197 * signal is private anyway.
2264 */ 2198 */
2265 if (!error && sig && p->sighand) { 2199 if (!error && sig && lock_task_sighand(p, &flags)) {
2266 spin_lock_irq(&p->sighand->siglock);
2267 handle_stop_signal(sig, p);
2268 error = specific_send_sig_info(sig, &info, p); 2200 error = specific_send_sig_info(sig, &info, p);
2269 spin_unlock_irq(&p->sighand->siglock); 2201 unlock_task_sighand(p, &flags);
2270 } 2202 }
2271 } 2203 }
2272 read_unlock(&tasklist_lock); 2204 rcu_read_unlock();
2273 2205
2274 return error; 2206 return error;
2275} 2207}
@@ -2326,13 +2258,14 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2326 2258
2327int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2259int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2328{ 2260{
2261 struct task_struct *t = current;
2329 struct k_sigaction *k; 2262 struct k_sigaction *k;
2330 sigset_t mask; 2263 sigset_t mask;
2331 2264
2332 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 2265 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
2333 return -EINVAL; 2266 return -EINVAL;
2334 2267
2335 k = &current->sighand->action[sig-1]; 2268 k = &t->sighand->action[sig-1];
2336 2269
2337 spin_lock_irq(&current->sighand->siglock); 2270 spin_lock_irq(&current->sighand->siglock);
2338 if (oact) 2271 if (oact)
@@ -2353,9 +2286,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2353 * (for example, SIGCHLD), shall cause the pending signal to 2286 * (for example, SIGCHLD), shall cause the pending signal to
2354 * be discarded, whether or not it is blocked" 2287 * be discarded, whether or not it is blocked"
2355 */ 2288 */
2356 if (act->sa.sa_handler == SIG_IGN || 2289 if (__sig_ignored(t, sig)) {
2357 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2358 struct task_struct *t = current;
2359 sigemptyset(&mask); 2290 sigemptyset(&mask);
2360 sigaddset(&mask, sig); 2291 sigaddset(&mask, sig);
2361 rm_from_queue_full(&mask, &t->signal->shared_pending); 2292 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2610,7 +2541,7 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2610 2541
2611 current->state = TASK_INTERRUPTIBLE; 2542 current->state = TASK_INTERRUPTIBLE;
2612 schedule(); 2543 schedule();
2613 set_thread_flag(TIF_RESTORE_SIGMASK); 2544 set_restore_sigmask();
2614 return -ERESTARTNOHAND; 2545 return -ERESTARTNOHAND;
2615} 2546}
2616#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 2547#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
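
The ptrace_signal() helper factored out above is the tracee side of a protocol whose other half is visible from user space: when a traced task stops on a signal, the tracer decides via the data argument of PTRACE_CONT whether the original signal, a substitute, or nothing at all is delivered (signr == 0 is the "did the debugger cancel the sig?" case). A minimal user-space sketch of that interaction, assuming a Linux host; the choice of SIGUSR1 here is arbitrary and not taken from the patch.

    #include <stdio.h>
    #include <signal.h>
    #include <unistd.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    int main(void)
    {
        pid_t pid = fork();

        if (pid == 0) {
            /* Child: ask to be traced, then stop on SIGUSR1. */
            ptrace(PTRACE_TRACEME, 0, NULL, NULL);
            raise(SIGUSR1);
            printf("child: the tracer suppressed SIGUSR1\n");
            _exit(0);
        }

        int status;
        waitpid(pid, &status, 0);               /* signal-delivery stop */
        if (WIFSTOPPED(status))
            printf("parent: child stopped on signal %d\n", WSTOPSIG(status));

        /*
         * Resume with data == 0: delivery is cancelled, which is what
         * ptrace_signal() sees as exit_code == 0.  A non-zero value here
         * would substitute that signal instead, the path on which
         * ptrace_signal() rewrites *info with SI_USER.
         */
        ptrace(PTRACE_CONT, pid, NULL, NULL);

        waitpid(pid, &status, 0);               /* reap the child */
        return 0;
    }
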
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 31e9f2a47928..36e061740047 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
356/* Tasklets */ 356/* Tasklets */
357struct tasklet_head 357struct tasklet_head
358{ 358{
359 struct tasklet_struct *list; 359 struct tasklet_struct *head;
360 struct tasklet_struct **tail;
360}; 361};
361 362
362/* Some compilers disobey section attribute on statics when not 363/* Some compilers disobey section attribute on statics when not
@@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t)
369 unsigned long flags; 370 unsigned long flags;
370 371
371 local_irq_save(flags); 372 local_irq_save(flags);
372 t->next = __get_cpu_var(tasklet_vec).list; 373 t->next = NULL;
373 __get_cpu_var(tasklet_vec).list = t; 374 *__get_cpu_var(tasklet_vec).tail = t;
375 __get_cpu_var(tasklet_vec).tail = &(t->next);
374 raise_softirq_irqoff(TASKLET_SOFTIRQ); 376 raise_softirq_irqoff(TASKLET_SOFTIRQ);
375 local_irq_restore(flags); 377 local_irq_restore(flags);
376} 378}
@@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
382 unsigned long flags; 384 unsigned long flags;
383 385
384 local_irq_save(flags); 386 local_irq_save(flags);
385 t->next = __get_cpu_var(tasklet_hi_vec).list; 387 t->next = NULL;
386 __get_cpu_var(tasklet_hi_vec).list = t; 388 *__get_cpu_var(tasklet_hi_vec).tail = t;
389 __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
387 raise_softirq_irqoff(HI_SOFTIRQ); 390 raise_softirq_irqoff(HI_SOFTIRQ);
388 local_irq_restore(flags); 391 local_irq_restore(flags);
389} 392}
@@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a)
395 struct tasklet_struct *list; 398 struct tasklet_struct *list;
396 399
397 local_irq_disable(); 400 local_irq_disable();
398 list = __get_cpu_var(tasklet_vec).list; 401 list = __get_cpu_var(tasklet_vec).head;
399 __get_cpu_var(tasklet_vec).list = NULL; 402 __get_cpu_var(tasklet_vec).head = NULL;
403 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
400 local_irq_enable(); 404 local_irq_enable();
401 405
402 while (list) { 406 while (list) {
@@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a)
416 } 420 }
417 421
418 local_irq_disable(); 422 local_irq_disable();
419 t->next = __get_cpu_var(tasklet_vec).list; 423 t->next = NULL;
420 __get_cpu_var(tasklet_vec).list = t; 424 *__get_cpu_var(tasklet_vec).tail = t;
425 __get_cpu_var(tasklet_vec).tail = &(t->next);
421 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 426 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
422 local_irq_enable(); 427 local_irq_enable();
423 } 428 }
@@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a)
428 struct tasklet_struct *list; 433 struct tasklet_struct *list;
429 434
430 local_irq_disable(); 435 local_irq_disable();
431 list = __get_cpu_var(tasklet_hi_vec).list; 436 list = __get_cpu_var(tasklet_hi_vec).head;
432 __get_cpu_var(tasklet_hi_vec).list = NULL; 437 __get_cpu_var(tasklet_hi_vec).head = NULL;
438 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head;
433 local_irq_enable(); 439 local_irq_enable();
434 440
435 while (list) { 441 while (list) {
@@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a)
449 } 455 }
450 456
451 local_irq_disable(); 457 local_irq_disable();
452 t->next = __get_cpu_var(tasklet_hi_vec).list; 458 t->next = NULL;
453 __get_cpu_var(tasklet_hi_vec).list = t; 459 *__get_cpu_var(tasklet_hi_vec).tail = t;
460 __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
454 __raise_softirq_irqoff(HI_SOFTIRQ); 461 __raise_softirq_irqoff(HI_SOFTIRQ);
455 local_irq_enable(); 462 local_irq_enable();
456 } 463 }
@@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill);
487 494
488void __init softirq_init(void) 495void __init softirq_init(void)
489{ 496{
497 int cpu;
498
499 for_each_possible_cpu(cpu) {
500 per_cpu(tasklet_vec, cpu).tail =
501 &per_cpu(tasklet_vec, cpu).head;
502 per_cpu(tasklet_hi_vec, cpu).tail =
503 &per_cpu(tasklet_hi_vec, cpu).head;
504 }
505
490 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); 506 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
491 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); 507 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
492} 508}
@@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
555 return; 571 return;
556 572
557 /* CPU is dead, so no lock needed. */ 573 /* CPU is dead, so no lock needed. */
558 for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { 574 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
559 if (*i == t) { 575 if (*i == t) {
560 *i = t->next; 576 *i = t->next;
577 /* If this was the tail element, move the tail ptr */
578 if (*i == NULL)
579 per_cpu(tasklet_vec, cpu).tail = i;
561 return; 580 return;
562 } 581 }
563 } 582 }
@@ -566,20 +585,24 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
566 585
567static void takeover_tasklets(unsigned int cpu) 586static void takeover_tasklets(unsigned int cpu)
568{ 587{
569 struct tasklet_struct **i;
570
571 /* CPU is dead, so no lock needed. */ 588 /* CPU is dead, so no lock needed. */
572 local_irq_disable(); 589 local_irq_disable();
573 590
574 /* Find end, append list for that CPU. */ 591 /* Find end, append list for that CPU. */
575 for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); 592 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
576 *i = per_cpu(tasklet_vec, cpu).list; 593 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head;
577 per_cpu(tasklet_vec, cpu).list = NULL; 594 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
595 per_cpu(tasklet_vec, cpu).head = NULL;
596 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
597 }
578 raise_softirq_irqoff(TASKLET_SOFTIRQ); 598 raise_softirq_irqoff(TASKLET_SOFTIRQ);
579 599
580 for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); 600 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
581 *i = per_cpu(tasklet_hi_vec, cpu).list; 601 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
582 per_cpu(tasklet_hi_vec, cpu).list = NULL; 602 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
603 per_cpu(tasklet_hi_vec, cpu).head = NULL;
604 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
605 }
583 raise_softirq_irqoff(HI_SOFTIRQ); 606 raise_softirq_irqoff(HI_SOFTIRQ);
584 607
585 local_irq_enable(); 608 local_irq_enable();
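
The tasklet_head conversion above trades LIFO head insertion for an O(1) FIFO append: the structure keeps the head pointer plus a tail pointer that always addresses the last next field (or the head itself while the list is empty). A stand-alone sketch of that list shape, with invented names rather than the kernel's:

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        int val;
        struct node *next;
    };

    /* Same shape as the new tasklet_head: head pointer plus tail link. */
    struct list {
        struct node *head;
        struct node **tail;
    };

    static void list_init(struct list *l)
    {
        l->head = NULL;
        l->tail = &l->head;             /* empty: tail points at head */
    }

    static void list_append(struct list *l, struct node *n)
    {
        n->next = NULL;
        *l->tail = n;                   /* link after the current last node */
        l->tail = &n->next;             /* tail now follows the new node */
    }

    int main(void)
    {
        struct list l;
        struct node *n;

        list_init(&l);
        for (int i = 1; i <= 3; i++) {
            n = malloc(sizeof(*n));
            n->val = i;
            list_append(&l, n);
        }

        /* Prints 1 2 3 -- FIFO order, unlike the old head insertion. */
        for (n = l.head; n; n = n->next)
            printf("%d ", n->val);
        printf("\n");
        return 0;
    }

Splicing a second, non-empty list of the same shape (which is what takeover_tasklets() does for a dead CPU) is then just *dst->tail = src->head; dst->tail = src->tail;.
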
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6f4e0e13f70c..0101aeef7ed7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -11,7 +11,6 @@
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12 12
13#include <asm/atomic.h> 13#include <asm/atomic.h>
14#include <asm/semaphore.h>
15#include <asm/uaccess.h> 14#include <asm/uaccess.h>
16 15
17/* Since we effect priority and affinity (both of which are visible 16/* Since we effect priority and affinity (both of which are visible
@@ -35,7 +34,7 @@ static int stopmachine(void *cpu)
35 int irqs_disabled = 0; 34 int irqs_disabled = 0;
36 int prepared = 0; 35 int prepared = 0;
37 36
38 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); 37 set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
39 38
40 /* Ack: we are alive */ 39 /* Ack: we are alive */
41 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 40 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
@@ -135,8 +134,7 @@ static void restart_machine(void)
135 preempt_enable_no_resched(); 134 preempt_enable_no_resched();
136} 135}
137 136
138struct stop_machine_data 137struct stop_machine_data {
139{
140 int (*fn)(void *); 138 int (*fn)(void *);
141 void *data; 139 void *data;
142 struct completion done; 140 struct completion done;
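
set_cpus_allowed_ptr() differs from the old set_cpus_allowed() only in taking a const pointer to the cpumask; with a large NR_CPUS a cpumask_t can be hundreds of bytes, so passing it by value copies all of that at every call. A rough user-space illustration of the difference, with made-up types and gcc's __builtin_popcountl standing in for the kernel's bitmap helpers:

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for a cpumask_t when NR_CPUS is large (here 4096 bits). */
    struct bigmask {
        unsigned long bits[4096 / (8 * sizeof(unsigned long))];
    };

    /* Pass-by-value copies the whole structure for the callee. */
    static int count_by_value(struct bigmask m)
    {
        int n = 0;
        for (size_t i = 0; i < sizeof(m.bits) / sizeof(m.bits[0]); i++)
            n += __builtin_popcountl(m.bits[i]);
        return n;
    }

    /* Pass-by-pointer costs one machine word, whatever NR_CPUS is. */
    static int count_by_ptr(const struct bigmask *m)
    {
        int n = 0;
        for (size_t i = 0; i < sizeof(m->bits) / sizeof(m->bits[0]); i++)
            n += __builtin_popcountl(m->bits[i]);
        return n;
    }

    int main(void)
    {
        struct bigmask m;

        memset(&m, 0, sizeof(m));
        m.bits[0] = 0x5;        /* CPUs 0 and 2 */

        printf("%d %d (mask is %zu bytes)\n",
               count_by_value(m), count_by_ptr(&m), sizeof(m));
        return 0;
    }
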
diff --git a/kernel/sys.c b/kernel/sys.c
index a626116af5db..895d2d4c9493 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -67,6 +67,12 @@
67#ifndef SET_ENDIAN 67#ifndef SET_ENDIAN
68# define SET_ENDIAN(a,b) (-EINVAL) 68# define SET_ENDIAN(a,b) (-EINVAL)
69#endif 69#endif
70#ifndef GET_TSC_CTL
71# define GET_TSC_CTL(a) (-EINVAL)
72#endif
73#ifndef SET_TSC_CTL
74# define SET_TSC_CTL(a) (-EINVAL)
75#endif
70 76
71/* 77/*
72 * this is where the system-wide overflow UID and GID are defined, for 78 * this is where the system-wide overflow UID and GID are defined, for
@@ -972,8 +978,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
972 goto out; 978 goto out;
973 979
974 if (task_pgrp(p) != pgrp) { 980 if (task_pgrp(p) != pgrp) {
975 detach_pid(p, PIDTYPE_PGID); 981 change_pid(p, PIDTYPE_PGID, pgrp);
976 attach_pid(p, PIDTYPE_PGID, pgrp);
977 set_task_pgrp(p, pid_nr(pgrp)); 982 set_task_pgrp(p, pid_nr(pgrp));
978 } 983 }
979 984
@@ -986,54 +991,67 @@ out:
986 991
987asmlinkage long sys_getpgid(pid_t pid) 992asmlinkage long sys_getpgid(pid_t pid)
988{ 993{
994 struct task_struct *p;
995 struct pid *grp;
996 int retval;
997
998 rcu_read_lock();
989 if (!pid) 999 if (!pid)
990 return task_pgrp_vnr(current); 1000 grp = task_pgrp(current);
991 else { 1001 else {
992 int retval;
993 struct task_struct *p;
994
995 read_lock(&tasklist_lock);
996 p = find_task_by_vpid(pid);
997 retval = -ESRCH; 1002 retval = -ESRCH;
998 if (p) { 1003 p = find_task_by_vpid(pid);
999 retval = security_task_getpgid(p); 1004 if (!p)
1000 if (!retval) 1005 goto out;
1001 retval = task_pgrp_vnr(p); 1006 grp = task_pgrp(p);
1002 } 1007 if (!grp)
1003 read_unlock(&tasklist_lock); 1008 goto out;
1004 return retval; 1009
1010 retval = security_task_getpgid(p);
1011 if (retval)
1012 goto out;
1005 } 1013 }
1014 retval = pid_vnr(grp);
1015out:
1016 rcu_read_unlock();
1017 return retval;
1006} 1018}
1007 1019
1008#ifdef __ARCH_WANT_SYS_GETPGRP 1020#ifdef __ARCH_WANT_SYS_GETPGRP
1009 1021
1010asmlinkage long sys_getpgrp(void) 1022asmlinkage long sys_getpgrp(void)
1011{ 1023{
1012 /* SMP - assuming writes are word atomic this is fine */ 1024 return sys_getpgid(0);
1013 return task_pgrp_vnr(current);
1014} 1025}
1015 1026
1016#endif 1027#endif
1017 1028
1018asmlinkage long sys_getsid(pid_t pid) 1029asmlinkage long sys_getsid(pid_t pid)
1019{ 1030{
1031 struct task_struct *p;
1032 struct pid *sid;
1033 int retval;
1034
1035 rcu_read_lock();
1020 if (!pid) 1036 if (!pid)
1021 return task_session_vnr(current); 1037 sid = task_session(current);
1022 else { 1038 else {
1023 int retval;
1024 struct task_struct *p;
1025
1026 rcu_read_lock();
1027 p = find_task_by_vpid(pid);
1028 retval = -ESRCH; 1039 retval = -ESRCH;
1029 if (p) { 1040 p = find_task_by_vpid(pid);
1030 retval = security_task_getsid(p); 1041 if (!p)
1031 if (!retval) 1042 goto out;
1032 retval = task_session_vnr(p); 1043 sid = task_session(p);
1033 } 1044 if (!sid)
1034 rcu_read_unlock(); 1045 goto out;
1035 return retval; 1046
1047 retval = security_task_getsid(p);
1048 if (retval)
1049 goto out;
1036 } 1050 }
1051 retval = pid_vnr(sid);
1052out:
1053 rcu_read_unlock();
1054 return retval;
1037} 1055}
1038 1056
1039asmlinkage long sys_setsid(void) 1057asmlinkage long sys_setsid(void)
@@ -1539,6 +1557,19 @@ out:
1539 * 1557 *
1540 */ 1558 */
1541 1559
1560static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
1561 cputime_t *utimep, cputime_t *stimep)
1562{
1563 *utimep = cputime_add(*utimep, t->utime);
1564 *stimep = cputime_add(*stimep, t->stime);
1565 r->ru_nvcsw += t->nvcsw;
1566 r->ru_nivcsw += t->nivcsw;
1567 r->ru_minflt += t->min_flt;
1568 r->ru_majflt += t->maj_flt;
1569 r->ru_inblock += task_io_get_inblock(t);
1570 r->ru_oublock += task_io_get_oublock(t);
1571}
1572
1542static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1573static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1543{ 1574{
1544 struct task_struct *t; 1575 struct task_struct *t;
@@ -1548,12 +1579,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1548 memset((char *) r, 0, sizeof *r); 1579 memset((char *) r, 0, sizeof *r);
1549 utime = stime = cputime_zero; 1580 utime = stime = cputime_zero;
1550 1581
1551 rcu_read_lock(); 1582 if (who == RUSAGE_THREAD) {
1552 if (!lock_task_sighand(p, &flags)) { 1583 accumulate_thread_rusage(p, r, &utime, &stime);
1553 rcu_read_unlock(); 1584 goto out;
1554 return;
1555 } 1585 }
1556 1586
1587 if (!lock_task_sighand(p, &flags))
1588 return;
1589
1557 switch (who) { 1590 switch (who) {
1558 case RUSAGE_BOTH: 1591 case RUSAGE_BOTH:
1559 case RUSAGE_CHILDREN: 1592 case RUSAGE_CHILDREN:
@@ -1580,14 +1613,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1580 r->ru_oublock += p->signal->oublock; 1613 r->ru_oublock += p->signal->oublock;
1581 t = p; 1614 t = p;
1582 do { 1615 do {
1583 utime = cputime_add(utime, t->utime); 1616 accumulate_thread_rusage(t, r, &utime, &stime);
1584 stime = cputime_add(stime, t->stime);
1585 r->ru_nvcsw += t->nvcsw;
1586 r->ru_nivcsw += t->nivcsw;
1587 r->ru_minflt += t->min_flt;
1588 r->ru_majflt += t->maj_flt;
1589 r->ru_inblock += task_io_get_inblock(t);
1590 r->ru_oublock += task_io_get_oublock(t);
1591 t = next_thread(t); 1617 t = next_thread(t);
1592 } while (t != p); 1618 } while (t != p);
1593 break; 1619 break;
@@ -1595,10 +1621,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1595 default: 1621 default:
1596 BUG(); 1622 BUG();
1597 } 1623 }
1598
1599 unlock_task_sighand(p, &flags); 1624 unlock_task_sighand(p, &flags);
1600 rcu_read_unlock();
1601 1625
1626out:
1602 cputime_to_timeval(utime, &r->ru_utime); 1627 cputime_to_timeval(utime, &r->ru_utime);
1603 cputime_to_timeval(stime, &r->ru_stime); 1628 cputime_to_timeval(stime, &r->ru_stime);
1604} 1629}
@@ -1612,7 +1637,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1612 1637
1613asmlinkage long sys_getrusage(int who, struct rusage __user *ru) 1638asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
1614{ 1639{
1615 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) 1640 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1641 who != RUSAGE_THREAD)
1616 return -EINVAL; 1642 return -EINVAL;
1617 return getrusage(current, who, ru); 1643 return getrusage(current, who, ru);
1618} 1644}
@@ -1626,10 +1652,9 @@ asmlinkage long sys_umask(int mask)
1626asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1652asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1627 unsigned long arg4, unsigned long arg5) 1653 unsigned long arg4, unsigned long arg5)
1628{ 1654{
1629 long error; 1655 long uninitialized_var(error);
1630 1656
1631 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1657 if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
1632 if (error)
1633 return error; 1658 return error;
1634 1659
1635 switch (option) { 1660 switch (option) {
@@ -1682,17 +1707,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1682 error = -EINVAL; 1707 error = -EINVAL;
1683 break; 1708 break;
1684 1709
1685 case PR_GET_KEEPCAPS:
1686 if (current->keep_capabilities)
1687 error = 1;
1688 break;
1689 case PR_SET_KEEPCAPS:
1690 if (arg2 != 0 && arg2 != 1) {
1691 error = -EINVAL;
1692 break;
1693 }
1694 current->keep_capabilities = arg2;
1695 break;
1696 case PR_SET_NAME: { 1710 case PR_SET_NAME: {
1697 struct task_struct *me = current; 1711 struct task_struct *me = current;
1698 unsigned char ncomm[sizeof(me->comm)]; 1712 unsigned char ncomm[sizeof(me->comm)];
@@ -1726,18 +1740,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1726 case PR_SET_SECCOMP: 1740 case PR_SET_SECCOMP:
1727 error = prctl_set_seccomp(arg2); 1741 error = prctl_set_seccomp(arg2);
1728 break; 1742 break;
1729 1743 case PR_GET_TSC:
1730 case PR_CAPBSET_READ: 1744 error = GET_TSC_CTL(arg2);
1731 if (!cap_valid(arg2)) 1745 break;
1732 return -EINVAL; 1746 case PR_SET_TSC:
1733 return !!cap_raised(current->cap_bset, arg2); 1747 error = SET_TSC_CTL(arg2);
1734 case PR_CAPBSET_DROP: 1748 break;
1735#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
1736 return cap_prctl_drop(arg2);
1737#else
1738 return -EINVAL;
1739#endif
1740
1741 default: 1749 default:
1742 error = -EINVAL; 1750 error = -EINVAL;
1743 break; 1751 break;
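
The getrusage() changes above add a RUSAGE_THREAD case: accumulate_thread_rusage() folds in only the calling thread's counters instead of walking the whole thread group. From user space the new mode looks roughly like this (RUSAGE_THREAD needs _GNU_SOURCE with glibc, and a kernel that carries this change):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rusage self, thread;

        /* Whole process (all threads), as before. */
        getrusage(RUSAGE_SELF, &self);

        /* Only the calling thread -- the case added here. */
        if (getrusage(RUSAGE_THREAD, &thread) != 0) {
            perror("getrusage(RUSAGE_THREAD)");
            return 1;
        }

        printf("process user time: %ld.%06lds\n",
               (long)self.ru_utime.tv_sec, (long)self.ru_utime.tv_usec);
        printf("thread  user time: %ld.%06lds\n",
               (long)thread.ru_utime.tv_sec, (long)thread.ru_utime.tv_usec);
        printf("thread voluntary context switches: %ld\n", thread.ru_nvcsw);
        return 0;
    }
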
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2a2d6889bab..d7ffdc59816a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -38,6 +38,7 @@
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/initrd.h> 40#include <linux/initrd.h>
41#include <linux/key.h>
41#include <linux/times.h> 42#include <linux/times.h>
42#include <linux/limits.h> 43#include <linux/limits.h>
43#include <linux/dcache.h> 44#include <linux/dcache.h>
@@ -144,12 +145,6 @@ extern int no_unaligned_warning;
144extern int max_lock_depth; 145extern int max_lock_depth;
145#endif 146#endif
146 147
147#ifdef CONFIG_SYSCTL_SYSCALL
148static int parse_table(int __user *, int, void __user *, size_t __user *,
149 void __user *, size_t, struct ctl_table *);
150#endif
151
152
153#ifdef CONFIG_PROC_SYSCTL 148#ifdef CONFIG_PROC_SYSCTL
154static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 149static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
155 void __user *buffer, size_t *lenp, loff_t *ppos); 150 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -270,17 +265,6 @@ static struct ctl_table kern_table[] = {
270 }, 265 },
271 { 266 {
272 .ctl_name = CTL_UNNUMBERED, 267 .ctl_name = CTL_UNNUMBERED,
273 .procname = "sched_batch_wakeup_granularity_ns",
274 .data = &sysctl_sched_batch_wakeup_granularity,
275 .maxlen = sizeof(unsigned int),
276 .mode = 0644,
277 .proc_handler = &proc_dointvec_minmax,
278 .strategy = &sysctl_intvec,
279 .extra1 = &min_wakeup_granularity_ns,
280 .extra2 = &max_wakeup_granularity_ns,
281 },
282 {
283 .ctl_name = CTL_UNNUMBERED,
284 .procname = "sched_child_runs_first", 268 .procname = "sched_child_runs_first",
285 .data = &sysctl_sched_child_runs_first, 269 .data = &sysctl_sched_child_runs_first,
286 .maxlen = sizeof(unsigned int), 270 .maxlen = sizeof(unsigned int),
@@ -318,7 +302,7 @@ static struct ctl_table kern_table[] = {
318 .data = &sysctl_sched_rt_period, 302 .data = &sysctl_sched_rt_period,
319 .maxlen = sizeof(unsigned int), 303 .maxlen = sizeof(unsigned int),
320 .mode = 0644, 304 .mode = 0644,
321 .proc_handler = &proc_dointvec, 305 .proc_handler = &sched_rt_handler,
322 }, 306 },
323 { 307 {
324 .ctl_name = CTL_UNNUMBERED, 308 .ctl_name = CTL_UNNUMBERED,
@@ -326,7 +310,7 @@ static struct ctl_table kern_table[] = {
326 .data = &sysctl_sched_rt_runtime, 310 .data = &sysctl_sched_rt_runtime,
327 .maxlen = sizeof(int), 311 .maxlen = sizeof(int),
328 .mode = 0644, 312 .mode = 0644,
329 .proc_handler = &proc_dointvec, 313 .proc_handler = &sched_rt_handler,
330 }, 314 },
331 { 315 {
332 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
@@ -820,6 +804,14 @@ static struct ctl_table kern_table[] = {
820 .proc_handler = &proc_dostring, 804 .proc_handler = &proc_dostring,
821 .strategy = &sysctl_string, 805 .strategy = &sysctl_string,
822 }, 806 },
807#ifdef CONFIG_KEYS
808 {
809 .ctl_name = CTL_UNNUMBERED,
810 .procname = "keys",
811 .mode = 0555,
812 .child = key_sysctls,
813 },
814#endif
823/* 815/*
824 * NOTE: do not add new entries to this table unless you have read 816 * NOTE: do not add new entries to this table unless you have read
825 * Documentation/sysctl/ctl_unnumbered.txt 817 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1441,6 +1433,76 @@ void register_sysctl_root(struct ctl_table_root *root)
1441} 1433}
1442 1434
1443#ifdef CONFIG_SYSCTL_SYSCALL 1435#ifdef CONFIG_SYSCTL_SYSCALL
1436/* Perform the actual read/write of a sysctl table entry. */
1437static int do_sysctl_strategy(struct ctl_table_root *root,
1438 struct ctl_table *table,
1439 int __user *name, int nlen,
1440 void __user *oldval, size_t __user *oldlenp,
1441 void __user *newval, size_t newlen)
1442{
1443 int op = 0, rc;
1444
1445 if (oldval)
1446 op |= 004;
1447 if (newval)
1448 op |= 002;
1449 if (sysctl_perm(root, table, op))
1450 return -EPERM;
1451
1452 if (table->strategy) {
1453 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1454 newval, newlen);
1455 if (rc < 0)
1456 return rc;
1457 if (rc > 0)
1458 return 0;
1459 }
1460
1461 /* If there is no strategy routine, or if the strategy returns
1462 * zero, proceed with automatic r/w */
1463 if (table->data && table->maxlen) {
1464 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1465 newval, newlen);
1466 if (rc < 0)
1467 return rc;
1468 }
1469 return 0;
1470}
1471
1472static int parse_table(int __user *name, int nlen,
1473 void __user *oldval, size_t __user *oldlenp,
1474 void __user *newval, size_t newlen,
1475 struct ctl_table_root *root,
1476 struct ctl_table *table)
1477{
1478 int n;
1479repeat:
1480 if (!nlen)
1481 return -ENOTDIR;
1482 if (get_user(n, name))
1483 return -EFAULT;
1484 for ( ; table->ctl_name || table->procname; table++) {
1485 if (!table->ctl_name)
1486 continue;
1487 if (n == table->ctl_name) {
1488 int error;
1489 if (table->child) {
1490 if (sysctl_perm(root, table, 001))
1491 return -EPERM;
1492 name++;
1493 nlen--;
1494 table = table->child;
1495 goto repeat;
1496 }
1497 error = do_sysctl_strategy(root, table, name, nlen,
1498 oldval, oldlenp,
1499 newval, newlen);
1500 return error;
1501 }
1502 }
1503 return -ENOTDIR;
1504}
1505
1444int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1506int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1445 void __user *newval, size_t newlen) 1507 void __user *newval, size_t newlen)
1446{ 1508{
@@ -1458,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1458 for (head = sysctl_head_next(NULL); head; 1520 for (head = sysctl_head_next(NULL); head;
1459 head = sysctl_head_next(head)) { 1521 head = sysctl_head_next(head)) {
1460 error = parse_table(name, nlen, oldval, oldlenp, 1522 error = parse_table(name, nlen, oldval, oldlenp,
1461 newval, newlen, head->ctl_table); 1523 newval, newlen,
1524 head->root, head->ctl_table);
1462 if (error != -ENOTDIR) { 1525 if (error != -ENOTDIR) {
1463 sysctl_head_finish(head); 1526 sysctl_head_finish(head);
1464 break; 1527 break;
@@ -1504,84 +1567,22 @@ static int test_perm(int mode, int op)
1504 return -EACCES; 1567 return -EACCES;
1505} 1568}
1506 1569
1507int sysctl_perm(struct ctl_table *table, int op) 1570int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1508{ 1571{
1509 int error; 1572 int error;
1573 int mode;
1574
1510 error = security_sysctl(table, op); 1575 error = security_sysctl(table, op);
1511 if (error) 1576 if (error)
1512 return error; 1577 return error;
1513 return test_perm(table->mode, op);
1514}
1515 1578
1516#ifdef CONFIG_SYSCTL_SYSCALL 1579 if (root->permissions)
1517static int parse_table(int __user *name, int nlen, 1580 mode = root->permissions(root, current->nsproxy, table);
1518 void __user *oldval, size_t __user *oldlenp, 1581 else
1519 void __user *newval, size_t newlen, 1582 mode = table->mode;
1520 struct ctl_table *table)
1521{
1522 int n;
1523repeat:
1524 if (!nlen)
1525 return -ENOTDIR;
1526 if (get_user(n, name))
1527 return -EFAULT;
1528 for ( ; table->ctl_name || table->procname; table++) {
1529 if (!table->ctl_name)
1530 continue;
1531 if (n == table->ctl_name) {
1532 int error;
1533 if (table->child) {
1534 if (sysctl_perm(table, 001))
1535 return -EPERM;
1536 name++;
1537 nlen--;
1538 table = table->child;
1539 goto repeat;
1540 }
1541 error = do_sysctl_strategy(table, name, nlen,
1542 oldval, oldlenp,
1543 newval, newlen);
1544 return error;
1545 }
1546 }
1547 return -ENOTDIR;
1548}
1549
1550/* Perform the actual read/write of a sysctl table entry. */
1551int do_sysctl_strategy (struct ctl_table *table,
1552 int __user *name, int nlen,
1553 void __user *oldval, size_t __user *oldlenp,
1554 void __user *newval, size_t newlen)
1555{
1556 int op = 0, rc;
1557 1583
1558 if (oldval) 1584 return test_perm(mode, op);
1559 op |= 004;
1560 if (newval)
1561 op |= 002;
1562 if (sysctl_perm(table, op))
1563 return -EPERM;
1564
1565 if (table->strategy) {
1566 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1567 newval, newlen);
1568 if (rc < 0)
1569 return rc;
1570 if (rc > 0)
1571 return 0;
1572 }
1573
1574 /* If there is no strategy routine, or if the strategy returns
1575 * zero, proceed with automatic r/w */
1576 if (table->data && table->maxlen) {
1577 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1578 newval, newlen);
1579 if (rc < 0)
1580 return rc;
1581 }
1582 return 0;
1583} 1585}
1584#endif /* CONFIG_SYSCTL_SYSCALL */
1585 1586
1586static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1587static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1587{ 1588{
@@ -1594,9 +1595,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1594 1595
1595static __init int sysctl_init(void) 1596static __init int sysctl_init(void)
1596{ 1597{
1597 int err;
1598 sysctl_set_parent(NULL, root_table); 1598 sysctl_set_parent(NULL, root_table);
1599 err = sysctl_check_table(current->nsproxy, root_table); 1599#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1600 {
1601 int err;
1602 err = sysctl_check_table(current->nsproxy, root_table);
1603 }
1604#endif
1600 return 0; 1605 return 0;
1601} 1606}
1602 1607
@@ -1723,10 +1728,12 @@ struct ctl_table_header *__register_sysctl_paths(
1723 header->unregistering = NULL; 1728 header->unregistering = NULL;
1724 header->root = root; 1729 header->root = root;
1725 sysctl_set_parent(NULL, header->ctl_table); 1730 sysctl_set_parent(NULL, header->ctl_table);
1731#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1726 if (sysctl_check_table(namespaces, header->ctl_table)) { 1732 if (sysctl_check_table(namespaces, header->ctl_table)) {
1727 kfree(header); 1733 kfree(header);
1728 return NULL; 1734 return NULL;
1729 } 1735 }
1736#endif
1730 spin_lock(&sysctl_lock); 1737 spin_lock(&sysctl_lock);
1731 header_list = lookup_header_list(root, namespaces); 1738 header_list = lookup_header_list(root, namespaces);
1732 list_add_tail(&header->ctl_entry, header_list); 1739 list_add_tail(&header->ctl_entry, header_list);
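
The reworked sysctl_perm() above consults an optional per-root permissions callback and only falls back to the table's static mode when none is registered, so a sysctl root (a namespace-aware one, say) can tighten access without editing every ctl_table. The callback-with-fallback pattern in isolation, as a self-contained sketch with invented names:

    #include <stdio.h>

    struct table {
        const char *name;
        int mode;                       /* default permission bits */
    };

    struct root {
        /* Optional hook; NULL means "use table->mode as-is". */
        int (*permissions)(struct root *root, struct table *t);
    };

    static int effective_mode(struct root *root, struct table *t)
    {
        if (root->permissions)
            return root->permissions(root, t);
        return t->mode;
    }

    /* Example override: a read-only root masks out the write bits. */
    static int readonly_perms(struct root *root, struct table *t)
    {
        (void)root;
        return t->mode & 0444;
    }

    int main(void)
    {
        struct table t = { "kernel.hostname", 0644 };
        struct root plain = { NULL };
        struct root ro = { readonly_perms };

        printf("plain root: %o\n", (unsigned)effective_mode(&plain, &t));
        printf("ro root:    %o\n", (unsigned)effective_mode(&ro, &t));
        return 0;
    }
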
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 07e86a828073..4a23517169a6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -183,7 +183,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
183 183
184 if (!tsk) { 184 if (!tsk) {
185 rcu_read_lock(); 185 rcu_read_lock();
186 tsk = find_task_by_pid(pid); 186 tsk = find_task_by_vpid(pid);
187 if (tsk) 187 if (tsk)
188 get_task_struct(tsk); 188 get_task_struct(tsk);
189 rcu_read_unlock(); 189 rcu_read_unlock();
@@ -230,7 +230,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
230 */ 230 */
231 rcu_read_lock(); 231 rcu_read_lock();
232 if (!first) 232 if (!first)
233 first = find_task_by_pid(tgid); 233 first = find_task_by_vpid(tgid);
234 234
235 if (!first || !lock_task_sighand(first, &flags)) 235 if (!first || !lock_task_sighand(first, &flags))
236 goto out; 236 goto out;
@@ -547,7 +547,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
547 if (!stats) 547 if (!stats)
548 goto err; 548 goto err;
549 549
550 rc = fill_pid(tsk->pid, tsk, stats); 550 rc = fill_pid(-1, tsk, stats);
551 if (rc < 0) 551 if (rc < 0)
552 goto err; 552 goto err;
553 553
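
find_task_by_vpid() interprets the pid in the caller's pid namespace rather than as a global pid, and taskstats_exit() now passes -1 because it already holds the task_struct. A toy model of the vpid idea; real pid namespaces are hierarchical and hash-based, so the flat table below is purely illustrative:

    #include <stdio.h>

    struct task {
        int global_pid;         /* unique across the whole system */
        int ns;                 /* pid namespace the task lives in */
        int vpid;               /* the pid as seen inside that namespace */
    };

    static struct task tasks[] = {
        { 1001, 0, 1001 },      /* init namespace: vpid == global pid */
        { 1002, 1, 1    },      /* "init" of container namespace 1 */
        { 1003, 1, 2    },
    };

    /* Look a task up by the pid value the caller's namespace uses. */
    static struct task *find_task_by_vpid_model(int caller_ns, int vpid)
    {
        for (size_t i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
            if (tasks[i].ns == caller_ns && tasks[i].vpid == vpid)
                return &tasks[i];
        return NULL;
    }

    int main(void)
    {
        struct task *t = find_task_by_vpid_model(1, 2);

        if (t)
            printf("vpid 2 in ns 1 is global pid %d\n", t->global_pid);
        return 0;
    }
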
diff --git a/kernel/time.c b/kernel/time.c
index a5ec013b6c80..6a08660b4fac 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,6 +35,8 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40#include <asm/unistd.h> 42#include <asm/unistd.h>
@@ -244,7 +246,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
244 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); 246 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
245#else 247#else
246# if BITS_PER_LONG == 32 248# if BITS_PER_LONG == 32
247 return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; 249 return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
248# else 250# else
249 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; 251 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
250# endif 252# endif
@@ -260,7 +262,7 @@ unsigned int inline jiffies_to_usecs(const unsigned long j)
260 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); 262 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
261#else 263#else
262# if BITS_PER_LONG == 32 264# if BITS_PER_LONG == 32
263 return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; 265 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
264# else 266# else
265 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; 267 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
266# endif 268# endif
@@ -379,6 +381,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
379 ts->tv_sec = sec; 381 ts->tv_sec = sec;
380 ts->tv_nsec = nsec; 382 ts->tv_nsec = nsec;
381} 383}
384EXPORT_SYMBOL(set_normalized_timespec);
382 385
383/** 386/**
384 * ns_to_timespec - Convert nanoseconds to timespec 387 * ns_to_timespec - Convert nanoseconds to timespec
@@ -389,13 +392,17 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
389struct timespec ns_to_timespec(const s64 nsec) 392struct timespec ns_to_timespec(const s64 nsec)
390{ 393{
391 struct timespec ts; 394 struct timespec ts;
395 s32 rem;
392 396
393 if (!nsec) 397 if (!nsec)
394 return (struct timespec) {0, 0}; 398 return (struct timespec) {0, 0};
395 399
396 ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); 400 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
397 if (unlikely(nsec < 0)) 401 if (unlikely(rem < 0)) {
398 set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); 402 ts.tv_sec--;
403 rem += NSEC_PER_SEC;
404 }
405 ts.tv_nsec = rem;
399 406
400 return ts; 407 return ts;
401} 408}
@@ -469,7 +476,7 @@ unsigned long msecs_to_jiffies(const unsigned int m)
469 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) 476 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
470 return MAX_JIFFY_OFFSET; 477 return MAX_JIFFY_OFFSET;
471 478
472 return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) 479 return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
473 >> MSEC_TO_HZ_SHR32; 480 >> MSEC_TO_HZ_SHR32;
474#endif 481#endif
475} 482}
@@ -484,7 +491,7 @@ unsigned long usecs_to_jiffies(const unsigned int u)
484#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) 491#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
485 return u * (HZ / USEC_PER_SEC); 492 return u * (HZ / USEC_PER_SEC);
486#else 493#else
487 return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) 494 return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
488 >> USEC_TO_HZ_SHR32; 495 >> USEC_TO_HZ_SHR32;
489#endif 496#endif
490} 497}
@@ -525,8 +532,10 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
525 * Convert jiffies to nanoseconds and separate with 532 * Convert jiffies to nanoseconds and separate with
526 * one divide. 533 * one divide.
527 */ 534 */
528 u64 nsec = (u64)jiffies * TICK_NSEC; 535 u32 rem;
529 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); 536 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
537 NSEC_PER_SEC, &rem);
538 value->tv_nsec = rem;
530} 539}
531EXPORT_SYMBOL(jiffies_to_timespec); 540EXPORT_SYMBOL(jiffies_to_timespec);
532 541
@@ -564,12 +573,11 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
564 * Convert jiffies to nanoseconds and separate with 573 * Convert jiffies to nanoseconds and separate with
565 * one divide. 574 * one divide.
566 */ 575 */
567 u64 nsec = (u64)jiffies * TICK_NSEC; 576 u32 rem;
568 long tv_usec;
569 577
570 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); 578 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
571 tv_usec /= NSEC_PER_USEC; 579 NSEC_PER_SEC, &rem);
572 value->tv_usec = tv_usec; 580 value->tv_usec = rem / NSEC_PER_USEC;
573} 581}
574EXPORT_SYMBOL(jiffies_to_timeval); 582EXPORT_SYMBOL(jiffies_to_timeval);
575 583
@@ -585,9 +593,7 @@ clock_t jiffies_to_clock_t(long x)
585 return x / (HZ / USER_HZ); 593 return x / (HZ / USER_HZ);
586# endif 594# endif
587#else 595#else
588 u64 tmp = (u64)x * TICK_NSEC; 596 return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
589 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
590 return (long)tmp;
591#endif 597#endif
592} 598}
593EXPORT_SYMBOL(jiffies_to_clock_t); 599EXPORT_SYMBOL(jiffies_to_clock_t);
@@ -599,16 +605,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
599 return ~0UL; 605 return ~0UL;
600 return x * (HZ / USER_HZ); 606 return x * (HZ / USER_HZ);
601#else 607#else
602 u64 jif;
603
604 /* Don't worry about loss of precision here .. */ 608 /* Don't worry about loss of precision here .. */
605 if (x >= ~0UL / HZ * USER_HZ) 609 if (x >= ~0UL / HZ * USER_HZ)
606 return ~0UL; 610 return ~0UL;
607 611
608 /* .. but do try to contain it here */ 612 /* .. but do try to contain it here */
609 jif = x * (u64) HZ; 613 return div_u64((u64)x * HZ, USER_HZ);
610 do_div(jif, USER_HZ);
611 return jif;
612#endif 614#endif
613} 615}
614EXPORT_SYMBOL(clock_t_to_jiffies); 616EXPORT_SYMBOL(clock_t_to_jiffies);
@@ -617,10 +619,9 @@ u64 jiffies_64_to_clock_t(u64 x)
617{ 619{
618#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 620#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
619# if HZ < USER_HZ 621# if HZ < USER_HZ
620 x *= USER_HZ; 622 x = div_u64(x * USER_HZ, HZ);
621 do_div(x, HZ);
622# elif HZ > USER_HZ 623# elif HZ > USER_HZ
623 do_div(x, HZ / USER_HZ); 624 x = div_u64(x, HZ / USER_HZ);
624# else 625# else
625 /* Nothing to do */ 626 /* Nothing to do */
626# endif 627# endif
@@ -630,8 +631,7 @@ u64 jiffies_64_to_clock_t(u64 x)
630 * but even this doesn't overflow in hundreds of years 631 * but even this doesn't overflow in hundreds of years
631 * in 64 bits, so.. 632 * in 64 bits, so..
632 */ 633 */
633 x *= TICK_NSEC; 634 x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
634 do_div(x, (NSEC_PER_SEC / USER_HZ));
635#endif 635#endif
636 return x; 636 return x;
637} 637}
@@ -640,21 +640,17 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t);
640u64 nsec_to_clock_t(u64 x) 640u64 nsec_to_clock_t(u64 x)
641{ 641{
642#if (NSEC_PER_SEC % USER_HZ) == 0 642#if (NSEC_PER_SEC % USER_HZ) == 0
643 do_div(x, (NSEC_PER_SEC / USER_HZ)); 643 return div_u64(x, NSEC_PER_SEC / USER_HZ);
644#elif (USER_HZ % 512) == 0 644#elif (USER_HZ % 512) == 0
645 x *= USER_HZ/512; 645 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
646 do_div(x, (NSEC_PER_SEC / 512));
647#else 646#else
648 /* 647 /*
649 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, 648 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
650 * overflow after 64.99 years. 649 * overflow after 64.99 years.
651 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 650 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
652 */ 651 */
653 x *= 9; 652 return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
654 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
655 USER_HZ));
656#endif 653#endif
657 return x;
658} 654}
659 655
660#if (BITS_PER_LONG < 64) 656#if (BITS_PER_LONG < 64)
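
The conversions above replace open-coded do_div() sequences with the math64.h helpers (div_u64, div_u64_rem, div_s64_rem), which return the quotient and hand the remainder back through a pointer; ns_to_timespec() additionally folds a negative remainder into the previous second so tv_nsec stays in [0, NSEC_PER_SEC). The same arithmetic in plain user-space C, with / and % standing in for the kernel helpers:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000LL

    struct ts {
        long long sec;
        long nsec;
    };

    /* Split signed nanoseconds into {sec, nsec} with 0 <= nsec < 1e9. */
    static struct ts ns_to_ts(int64_t nsec)
    {
        struct ts t;
        int64_t rem;

        t.sec = nsec / NSEC_PER_SEC;    /* truncates toward zero */
        rem   = nsec % NSEC_PER_SEC;
        if (rem < 0) {                  /* the same fix-up as above */
            t.sec--;
            rem += NSEC_PER_SEC;
        }
        t.nsec = (long)rem;
        return t;
    }

    int main(void)
    {
        struct ts a = ns_to_ts(1500000000LL);   /*  1.5s -> { 1, 500000000} */
        struct ts b = ns_to_ts(-1500000000LL);  /* -1.5s -> {-2, 500000000} */

        printf("%lld + %09ld ns\n", a.sec, a.nsec);
        printf("%lld + %09ld ns\n", b.sec, b.nsec);
        return 0;
    }
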
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 278534bbca95..73961f35fdc8 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,8 +141,16 @@ static void clocksource_watchdog(unsigned long data)
141 } 141 }
142 142
143 if (!list_empty(&watchdog_list)) { 143 if (!list_empty(&watchdog_list)) {
144 __mod_timer(&watchdog_timer, 144 /*
145 watchdog_timer.expires + WATCHDOG_INTERVAL); 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other.
147 */
148 int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
149
150 if (next_cpu >= NR_CPUS)
151 next_cpu = first_cpu(cpu_online_map);
152 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu);
146 } 154 }
147 spin_unlock(&watchdog_lock); 155 spin_unlock(&watchdog_lock);
148} 156}
@@ -164,7 +172,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
164 if (!started && watchdog) { 172 if (!started && watchdog) {
165 watchdog_last = watchdog->read(); 173 watchdog_last = watchdog->read();
166 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 174 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
167 add_timer(&watchdog_timer); 175 add_timer_on(&watchdog_timer,
176 first_cpu(cpu_online_map));
168 } 177 }
169 } else { 178 } else {
170 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -174,7 +183,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
174 if (watchdog) 183 if (watchdog)
175 del_timer(&watchdog_timer); 184 del_timer(&watchdog_timer);
176 watchdog = cs; 185 watchdog = cs;
177 init_timer_deferrable(&watchdog_timer); 186 init_timer(&watchdog_timer);
178 watchdog_timer.function = clocksource_watchdog; 187 watchdog_timer.function = clocksource_watchdog;
179 188
180 /* Reset watchdog cycles */ 189 /* Reset watchdog cycles */
@@ -185,7 +194,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
185 watchdog_last = watchdog->read(); 194 watchdog_last = watchdog->read();
186 watchdog_timer.expires = 195 watchdog_timer.expires =
187 jiffies + WATCHDOG_INTERVAL; 196 jiffies + WATCHDOG_INTERVAL;
188 add_timer(&watchdog_timer); 197 add_timer_on(&watchdog_timer,
198 first_cpu(cpu_online_map));
189 } 199 }
190 } 200 }
191 } 201 }
@@ -222,6 +232,18 @@ void clocksource_resume(void)
222} 232}
223 233
224/** 234/**
235 * clocksource_touch_watchdog - Update watchdog
236 *
237 * Update the watchdog after exception contexts such as kgdb so as not
238 * to incorrectly trip the watchdog.
239 *
240 */
241void clocksource_touch_watchdog(void)
242{
243 clocksource_resume_watchdog();
244}
245
246/**
225 * clocksource_get_next - Returns the selected clocksource 247 * clocksource_get_next - Returns the selected clocksource
226 * 248 *
227 */ 249 */
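
With add_timer_on(), the watchdog timer above re-arms on the next online CPU each interval instead of staying wherever it last ran, so (as the new comment puts it) the CPUs are checked for staying synchronized to each other, not only against the watchdog clocksource. The wrap-around selection, sketched over a plain array of online CPU ids with invented names:

    #include <stdio.h>

    /* Pretend these CPUs are online; the kernel walks cpu_online_map. */
    static const int online[] = { 0, 2, 3, 5 };
    #define NONLINE ((int)(sizeof(online) / sizeof(online[0])))

    /* Return the online CPU after 'cur', wrapping to the first one. */
    static int next_online_cpu(int cur)
    {
        for (int i = 0; i < NONLINE; i++)
            if (online[i] > cur)
                return online[i];
        return online[0];               /* wrapped past the last one */
    }

    int main(void)
    {
        int cpu = 0;

        /* Eight watchdog periods: prints 2 3 5 0 2 3 5 0 */
        for (int i = 0; i < 8; i++) {
            cpu = next_online_cpu(cpu);
            printf("%d ", cpu);
        }
        printf("\n");
        return 0;
    }
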
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5fd9b9469770..5125ddd8196b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,7 +15,8 @@
15#include <linux/jiffies.h> 15#include <linux/jiffies.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/capability.h> 17#include <linux/capability.h>
18#include <asm/div64.h> 18#include <linux/math64.h>
19#include <linux/clocksource.h>
19#include <asm/timex.h> 20#include <asm/timex.h>
20 21
21/* 22/*
@@ -23,11 +24,14 @@
23 */ 24 */
24unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ 25unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
25unsigned long tick_nsec; /* ACTHZ period (nsec) */ 26unsigned long tick_nsec; /* ACTHZ period (nsec) */
26static u64 tick_length, tick_length_base; 27u64 tick_length;
28static u64 tick_length_base;
29
30static struct hrtimer leap_timer;
27 31
28#define MAX_TICKADJ 500 /* microsecs */ 32#define MAX_TICKADJ 500 /* microsecs */
29#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 33#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
30 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) 34 NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
31 35
32/* 36/*
33 * phase-lock loop variables 37 * phase-lock loop variables
@@ -35,11 +39,12 @@ static u64 tick_length, tick_length_base;
35/* TIME_ERROR prevents overwriting the CMOS clock */ 39/* TIME_ERROR prevents overwriting the CMOS clock */
36static int time_state = TIME_OK; /* clock synchronization status */ 40static int time_state = TIME_OK; /* clock synchronization status */
37int time_status = STA_UNSYNC; /* clock status bits */ 41int time_status = STA_UNSYNC; /* clock status bits */
38static s64 time_offset; /* time adjustment (ns) */ 42static long time_tai; /* TAI offset (s) */
43static s64 time_offset; /* time adjustment (ns) */
39static long time_constant = 2; /* pll time constant */ 44static long time_constant = 2; /* pll time constant */
40long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 45long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
41long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 46long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
42long time_freq; /* frequency offset (scaled ppm)*/ 47static s64 time_freq; /* frequency offset (scaled ns/s)*/
43static long time_reftime; /* time at last adjustment (s) */ 48static long time_reftime; /* time at last adjustment (s) */
44long time_adjust; 49long time_adjust;
45static long ntp_tick_adj; 50static long ntp_tick_adj;
@@ -47,16 +52,56 @@ static long ntp_tick_adj;
47static void ntp_update_frequency(void) 52static void ntp_update_frequency(void)
48{ 53{
49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 54 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 << TICK_LENGTH_SHIFT; 55 << NTP_SCALE_SHIFT;
51 second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; 56 second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 57 second_length += time_freq;
53 58
54 tick_length_base = second_length; 59 tick_length_base = second_length;
55 60
56 do_div(second_length, HZ); 61 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT; 62 tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
63}
64
65static void ntp_update_offset(long offset)
66{
67 long mtemp;
68 s64 freq_adj;
69
70 if (!(time_status & STA_PLL))
71 return;
58 72
59 do_div(tick_length_base, NTP_INTERVAL_FREQ); 73 if (!(time_status & STA_NANO))
74 offset *= NSEC_PER_USEC;
75
76 /*
77 * Scale the phase adjustment and
78 * clamp to the operating range.
79 */
80 offset = min(offset, MAXPHASE);
81 offset = max(offset, -MAXPHASE);
82
83 /*
84 * Select how the frequency is to be controlled
85 * and in which mode (PLL or FLL).
86 */
87 if (time_status & STA_FREQHOLD || time_reftime == 0)
88 time_reftime = xtime.tv_sec;
89 mtemp = xtime.tv_sec - time_reftime;
90 time_reftime = xtime.tv_sec;
91
92 freq_adj = (s64)offset * mtemp;
93 freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
94 time_status &= ~STA_MODE;
95 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
96 freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
97 mtemp);
98 time_status |= STA_MODE;
99 }
100 freq_adj += time_freq;
101 freq_adj = min(freq_adj, MAXFREQ_SCALED);
102 time_freq = max(freq_adj, -MAXFREQ_SCALED);
103
104 time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
60} 105}
61 106
62/** 107/**
@@ -78,62 +123,70 @@ void ntp_clear(void)
78} 123}
79 124
80/* 125/*
81 * this routine handles the overflow of the microsecond field 126 * Leap second processing. If in leap-insert state at the end of the
82 * 127 * day, the system clock is set back one second; if in leap-delete
83 * The tricky bits of code to handle the accurate clock support 128 * state, the system clock is set ahead one second.
84 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
85 * They were originally developed for SUN and DEC kernels.
86 * All the kudos should go to Dave for this stuff.
87 */ 129 */
88void second_overflow(void) 130static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
89{ 131{
90 long time_adj; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
91 133
92 /* Bump the maxerror field */ 134 write_seqlock_irq(&xtime_lock);
93 time_maxerror += MAXFREQ >> SHIFT_USEC;
94 if (time_maxerror > NTP_PHASE_LIMIT) {
95 time_maxerror = NTP_PHASE_LIMIT;
96 time_status |= STA_UNSYNC;
97 }
98 135
99 /*
100 * Leap second processing. If in leap-insert state at the end of the
101 * day, the system clock is set back one second; if in leap-delete
102 * state, the system clock is set ahead one second. The microtime()
103 * routine or external clock driver will insure that reported time is
104 * always monotonic. The ugly divides should be replaced.
105 */
106 switch (time_state) { 136 switch (time_state) {
107 case TIME_OK: 137 case TIME_OK:
108 if (time_status & STA_INS)
109 time_state = TIME_INS;
110 else if (time_status & STA_DEL)
111 time_state = TIME_DEL;
112 break; 138 break;
113 case TIME_INS: 139 case TIME_INS:
114 if (xtime.tv_sec % 86400 == 0) { 140 xtime.tv_sec--;
115 xtime.tv_sec--; 141 wall_to_monotonic.tv_sec++;
116 wall_to_monotonic.tv_sec++; 142 time_state = TIME_OOP;
117 time_state = TIME_OOP; 143 printk(KERN_NOTICE "Clock: "
118 printk(KERN_NOTICE "Clock: inserting leap second " 144 "inserting leap second 23:59:60 UTC\n");
119 "23:59:60 UTC\n"); 145 leap_timer.expires = ktime_add_ns(leap_timer.expires,
120 } 146 NSEC_PER_SEC);
147 res = HRTIMER_RESTART;
121 break; 148 break;
122 case TIME_DEL: 149 case TIME_DEL:
123 if ((xtime.tv_sec + 1) % 86400 == 0) { 150 xtime.tv_sec++;
124 xtime.tv_sec++; 151 time_tai--;
125 wall_to_monotonic.tv_sec--; 152 wall_to_monotonic.tv_sec--;
126 time_state = TIME_WAIT; 153 time_state = TIME_WAIT;
127 printk(KERN_NOTICE "Clock: deleting leap second " 154 printk(KERN_NOTICE "Clock: "
128 "23:59:59 UTC\n"); 155 "deleting leap second 23:59:59 UTC\n");
129 }
130 break; 156 break;
131 case TIME_OOP: 157 case TIME_OOP:
158 time_tai++;
132 time_state = TIME_WAIT; 159 time_state = TIME_WAIT;
133 break; 160 /* fall through */
134 case TIME_WAIT: 161 case TIME_WAIT:
135 if (!(time_status & (STA_INS | STA_DEL))) 162 if (!(time_status & (STA_INS | STA_DEL)))
136 time_state = TIME_OK; 163 time_state = TIME_OK;
164 break;
165 }
166 update_vsyscall(&xtime, clock);
167
168 write_sequnlock_irq(&xtime_lock);
169
170 return res;
171}
172
173/*
174 * this routine handles the overflow of the microsecond field
175 *
176 * The tricky bits of code to handle the accurate clock support
177 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
178 * They were originally developed for SUN and DEC kernels.
179 * All the kudos should go to Dave for this stuff.
180 */
181void second_overflow(void)
182{
183 s64 time_adj;
184
185 /* Bump the maxerror field */
186 time_maxerror += MAXFREQ / NSEC_PER_USEC;
187 if (time_maxerror > NTP_PHASE_LIMIT) {
188 time_maxerror = NTP_PHASE_LIMIT;
189 time_status |= STA_UNSYNC;
137 } 190 }
138 191
139 /* 192 /*
@@ -143,7 +196,7 @@ void second_overflow(void)
143 tick_length = tick_length_base; 196 tick_length = tick_length_base;
144 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); 197 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
145 time_offset -= time_adj; 198 time_offset -= time_adj;
146 tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); 199 tick_length += time_adj;
147 200
148 if (unlikely(time_adjust)) { 201 if (unlikely(time_adjust)) {
149 if (time_adjust > MAX_TICKADJ) { 202 if (time_adjust > MAX_TICKADJ) {
@@ -154,25 +207,12 @@ void second_overflow(void)
154 tick_length -= MAX_TICKADJ_SCALED; 207 tick_length -= MAX_TICKADJ_SCALED;
155 } else { 208 } else {
156 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 209 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
157 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; 210 NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
158 time_adjust = 0; 211 time_adjust = 0;
159 } 212 }
160 } 213 }
161} 214}
162 215
163/*
164 * Return how long ticks are at the moment, that is, how much time
165 * update_wall_time_one_tick will add to xtime next time we call it
166 * (assuming no calls to do_adjtimex in the meantime).
167 * The return value is in fixed-point nanoseconds shifted by the
168 * specified number of bits to the right of the binary point.
169 * This function has no side-effects.
170 */
171u64 current_tick_length(void)
172{
173 return tick_length;
174}
175
176#ifdef CONFIG_GENERIC_CMOS_UPDATE 216#ifdef CONFIG_GENERIC_CMOS_UPDATE
177 217
178/* Disable the cmos update - used by virtualization and embedded */ 218/* Disable the cmos update - used by virtualization and embedded */
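
ntp_leap_second() above runs from an hrtimer armed for midnight UTC and steps a small state machine: TIME_INS repeats the last second (and re-arms the timer once), TIME_DEL skips a second and lowers the TAI offset, and TIME_OOP/TIME_WAIT drain back to TIME_OK once STA_INS/STA_DEL are cleared. Those transitions in isolation, as a compilable sketch that leaves out the xtime/wall_to_monotonic bookkeeping:

    #include <stdio.h>

    enum state { TIME_OK, TIME_INS, TIME_DEL, TIME_OOP, TIME_WAIT };

    /* One firing of the leap timer; returns 1 if it should re-arm. */
    static int leap_tick(enum state *s, int sta_ins_or_del, long *utc, long *tai)
    {
        switch (*s) {
        case TIME_INS:                  /* 23:59:60 -- repeat a second */
            (*utc)--;
            *s = TIME_OOP;
            return 1;                   /* fire again one second later */
        case TIME_DEL:                  /* skip 23:59:59 */
            (*utc)++;
            (*tai)--;
            *s = TIME_WAIT;
            return 0;
        case TIME_OOP:                  /* the inserted second is over */
            (*tai)++;
            *s = TIME_WAIT;
            /* fall through */
        case TIME_WAIT:
            if (!sta_ins_or_del)
                *s = TIME_OK;
            return 0;
        default:                        /* TIME_OK: nothing to do */
            return 0;
        }
    }

    int main(void)
    {
        enum state s = TIME_INS;
        long utc = 915148800, tai = 32; /* arbitrary example values */

        /* First firing inserts the second, the second firing ends it. */
        while (leap_tick(&s, 0, &utc, &tai))
            ;
        printf("state=%d utc=%ld tai=%ld\n", s, utc, tai);
        return 0;
    }
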
@@ -236,8 +276,8 @@ static inline void notify_cmos_timer(void) { }
236 */ 276 */
237int do_adjtimex(struct timex *txc) 277int do_adjtimex(struct timex *txc)
238{ 278{
239 long mtemp, save_adjust, rem; 279 struct timespec ts;
240 s64 freq_adj, temp64; 280 long save_adjust, sec;
241 int result; 281 int result;
242 282
243 /* In order to modify anything, you gotta be super-user! */ 283 /* In order to modify anything, you gotta be super-user! */
@@ -247,147 +287,132 @@ int do_adjtimex(struct timex *txc)
247 /* Now we validate the data before disabling interrupts */ 287 /* Now we validate the data before disabling interrupts */
248 288
249 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { 289 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
250 /* singleshot must not be used with any other mode bits */ 290 /* singleshot must not be used with any other mode bits */
251 if (txc->modes != ADJ_OFFSET_SINGLESHOT && 291 if (txc->modes & ~ADJ_OFFSET_SS_READ)
252 txc->modes != ADJ_OFFSET_SS_READ)
253 return -EINVAL; 292 return -EINVAL;
254 } 293 }
255 294
256 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
257 /* adjustment Offset limited to +- .512 seconds */
258 if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
259 return -EINVAL;
260
261 /* if the quartz is off by more than 10% something is VERY wrong ! */ 295 /* if the quartz is off by more than 10% something is VERY wrong ! */
262 if (txc->modes & ADJ_TICK) 296 if (txc->modes & ADJ_TICK)
263 if (txc->tick < 900000/USER_HZ || 297 if (txc->tick < 900000/USER_HZ ||
264 txc->tick > 1100000/USER_HZ) 298 txc->tick > 1100000/USER_HZ)
265 return -EINVAL; 299 return -EINVAL;
266 300
301 if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
302 hrtimer_cancel(&leap_timer);
303 getnstimeofday(&ts);
304
267 write_seqlock_irq(&xtime_lock); 305 write_seqlock_irq(&xtime_lock);
268 result = time_state; /* mostly `TIME_OK' */
269 306
270 /* Save for later - semantics of adjtime is to return old value */ 307 /* Save for later - semantics of adjtime is to return old value */
271 save_adjust = time_adjust; 308 save_adjust = time_adjust;
272 309
273#if 0 /* STA_CLOCKERR is never set yet */
274 time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
275#endif
276 /* If there are input parameters, then process them */ 310 /* If there are input parameters, then process them */
277 if (txc->modes) 311 if (txc->modes) {
278 { 312 if (txc->modes & ADJ_STATUS) {
279 if (txc->modes & ADJ_STATUS) /* only set allowed bits */ 313 if ((time_status & STA_PLL) &&
280 time_status = (txc->status & ~STA_RONLY) | 314 !(txc->status & STA_PLL)) {
281 (time_status & STA_RONLY); 315 time_state = TIME_OK;
282 316 time_status = STA_UNSYNC;
283 if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ 317 }
284 if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { 318 /* only set allowed bits */
285 result = -EINVAL; 319 time_status &= STA_RONLY;
286 goto leave; 320 time_status |= txc->status & ~STA_RONLY;
287 } 321
288 time_freq = ((s64)txc->freq * NSEC_PER_USEC) 322 switch (time_state) {
289 >> (SHIFT_USEC - SHIFT_NSEC); 323 case TIME_OK:
290 } 324 start_timer:
291 325 sec = ts.tv_sec;
292 if (txc->modes & ADJ_MAXERROR) { 326 if (time_status & STA_INS) {
293 if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { 327 time_state = TIME_INS;
294 result = -EINVAL; 328 sec += 86400 - sec % 86400;
295 goto leave; 329 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
330 } else if (time_status & STA_DEL) {
331 time_state = TIME_DEL;
332 sec += 86400 - (sec + 1) % 86400;
333 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
334 }
335 break;
336 case TIME_INS:
337 case TIME_DEL:
338 time_state = TIME_OK;
339 goto start_timer;
340 break;
341 case TIME_WAIT:
342 if (!(time_status & (STA_INS | STA_DEL)))
343 time_state = TIME_OK;
344 break;
345 case TIME_OOP:
346 hrtimer_restart(&leap_timer);
347 break;
348 }
296 } 349 }
297 time_maxerror = txc->maxerror;
298 }
299 350
300 if (txc->modes & ADJ_ESTERROR) { 351 if (txc->modes & ADJ_NANO)
301 if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { 352 time_status |= STA_NANO;
302 result = -EINVAL; 353 if (txc->modes & ADJ_MICRO)
303 goto leave; 354 time_status &= ~STA_NANO;
355
356 if (txc->modes & ADJ_FREQUENCY) {
357 time_freq = (s64)txc->freq * PPM_SCALE;
358 time_freq = min(time_freq, MAXFREQ_SCALED);
359 time_freq = max(time_freq, -MAXFREQ_SCALED);
304 } 360 }
305 time_esterror = txc->esterror;
306 }
307 361
308 if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ 362 if (txc->modes & ADJ_MAXERROR)
309 if (txc->constant < 0) { /* NTP v4 uses values > 6 */ 363 time_maxerror = txc->maxerror;
310 result = -EINVAL; 364 if (txc->modes & ADJ_ESTERROR)
311 goto leave; 365 time_esterror = txc->esterror;
366
367 if (txc->modes & ADJ_TIMECONST) {
368 time_constant = txc->constant;
369 if (!(time_status & STA_NANO))
370 time_constant += 4;
371 time_constant = min(time_constant, (long)MAXTC);
372 time_constant = max(time_constant, 0l);
312 } 373 }
313 time_constant = min(txc->constant + 4, (long)MAXTC);
314 }
315 374
316 if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ 375 if (txc->modes & ADJ_TAI && txc->constant > 0)
317 if (txc->modes == ADJ_OFFSET_SINGLESHOT) { 376 time_tai = txc->constant;
318 /* adjtime() is independent from ntp_adjtime() */ 377
319 time_adjust = txc->offset; 378 if (txc->modes & ADJ_OFFSET) {
379 if (txc->modes == ADJ_OFFSET_SINGLESHOT)
380 /* adjtime() is independent from ntp_adjtime() */
381 time_adjust = txc->offset;
382 else
383 ntp_update_offset(txc->offset);
320 } 384 }
321 else if (time_status & STA_PLL) { 385 if (txc->modes & ADJ_TICK)
322 time_offset = txc->offset * NSEC_PER_USEC; 386 tick_usec = txc->tick;
323 387
324 /* 388 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
325 * Scale the phase adjustment and 389 ntp_update_frequency();
326 * clamp to the operating range. 390 }
327 */ 391
328 time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC); 392 result = time_state; /* mostly `TIME_OK' */
329 time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC); 393 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
330
331 /*
332 * Select whether the frequency is to be controlled
333 * and in which mode (PLL or FLL). Clamp to the operating
334 * range. Ugly multiply/divide should be replaced someday.
335 */
336
337 if (time_status & STA_FREQHOLD || time_reftime == 0)
338 time_reftime = xtime.tv_sec;
339 mtemp = xtime.tv_sec - time_reftime;
340 time_reftime = xtime.tv_sec;
341
342 freq_adj = time_offset * mtemp;
343 freq_adj = shift_right(freq_adj, time_constant * 2 +
344 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
345 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
346 u64 utemp64;
347 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
348 if (time_offset < 0) {
349 utemp64 = -temp64;
350 do_div(utemp64, mtemp);
351 freq_adj -= utemp64;
352 } else {
353 utemp64 = temp64;
354 do_div(utemp64, mtemp);
355 freq_adj += utemp64;
356 }
357 }
358 freq_adj += time_freq;
359 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
360 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
361 time_offset = div_long_long_rem_signed(time_offset,
362 NTP_INTERVAL_FREQ,
363 &rem);
364 time_offset <<= SHIFT_UPDATE;
365 } /* STA_PLL */
366 } /* txc->modes & ADJ_OFFSET */
367 if (txc->modes & ADJ_TICK)
368 tick_usec = txc->tick;
369
370 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
371 ntp_update_frequency();
372 } /* txc->modes */
373leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
374 result = TIME_ERROR; 394 result = TIME_ERROR;
375 395
376 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || 396 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
377 (txc->modes == ADJ_OFFSET_SS_READ)) 397 (txc->modes == ADJ_OFFSET_SS_READ))
378 txc->offset = save_adjust; 398 txc->offset = save_adjust;
379 else 399 else {
380 txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * 400 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
381 NTP_INTERVAL_FREQ / 1000; 401 NTP_SCALE_SHIFT);
382 txc->freq = (time_freq / NSEC_PER_USEC) << 402 if (!(time_status & STA_NANO))
383 (SHIFT_USEC - SHIFT_NSEC); 403 txc->offset /= NSEC_PER_USEC;
404 }
405 txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
406 (s64)PPM_SCALE_INV,
407 NTP_SCALE_SHIFT);
384 txc->maxerror = time_maxerror; 408 txc->maxerror = time_maxerror;
385 txc->esterror = time_esterror; 409 txc->esterror = time_esterror;
386 txc->status = time_status; 410 txc->status = time_status;
387 txc->constant = time_constant; 411 txc->constant = time_constant;
388 txc->precision = 1; 412 txc->precision = 1;
389 txc->tolerance = MAXFREQ; 413 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
390 txc->tick = tick_usec; 414 txc->tick = tick_usec;
415 txc->tai = time_tai;
391 416
392 /* PPS is not implemented, so these are zero */ 417 /* PPS is not implemented, so these are zero */
393 txc->ppsfreq = 0; 418 txc->ppsfreq = 0;
@@ -399,9 +424,15 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
399 txc->errcnt = 0; 424 txc->errcnt = 0;
400 txc->stbcnt = 0; 425 txc->stbcnt = 0;
401 write_sequnlock_irq(&xtime_lock); 426 write_sequnlock_irq(&xtime_lock);
402 do_gettimeofday(&txc->time); 427
428 txc->time.tv_sec = ts.tv_sec;
429 txc->time.tv_usec = ts.tv_nsec;
430 if (!(time_status & STA_NANO))
431 txc->time.tv_usec /= NSEC_PER_USEC;
432
403 notify_cmos_timer(); 433 notify_cmos_timer();
404 return(result); 434
435 return result;
405} 436}
406 437
407static int __init ntp_tick_adj_setup(char *str) 438static int __init ntp_tick_adj_setup(char *str)
@@ -411,3 +442,10 @@ static int __init ntp_tick_adj_setup(char *str)
411} 442}
412 443
413__setup("ntp_tick_adj=", ntp_tick_adj_setup); 444__setup("ntp_tick_adj=", ntp_tick_adj_setup);
445
446void __init ntp_init(void)
447{
448 ntp_clear();
449 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
450 leap_timer.function = ntp_leap_second;
451}
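Note on the ntp.c hunks above: do_adjtimex() now drives leap seconds through the leap_timer hrtimer that ntp_init() sets up, so arming STA_INS or STA_DEL via ADJ_STATUS schedules the callback for the next UTC midnight instead of polling in second_overflow(). A minimal user-space sketch of how that path is exercised (assumes <sys/timex.h> and root privileges; the TIME_OK case above suggests STA_INS alone is enough to arm the timer):

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { 0 };

	tx.modes = ADJ_STATUS;     /* only touch the status word */
	tx.status = STA_INS;       /* request a leap-second insertion */

	int state = adjtimex(&tx); /* returns TIME_OK, TIME_INS, ... or -1 */
	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("clock state %d, status 0x%x\n", state, tx.status);
	return 0;
}

Clearing the bit again in a later ADJ_STATUS call cancels the pending leap; the hrtimer_cancel() above runs before the state machine is re-entered.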
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e1bd50cbbf5d..57a1f02e5ec0 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -14,7 +14,7 @@
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/irq.h> 17#include <linux/interrupt.h>
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
@@ -262,7 +262,7 @@ out:
262void tick_broadcast_on_off(unsigned long reason, int *oncpu) 262void tick_broadcast_on_off(unsigned long reason, int *oncpu)
263{ 263{
264 if (!cpu_isset(*oncpu, cpu_online_map)) 264 if (!cpu_isset(*oncpu, cpu_online_map))
265 printk(KERN_ERR "tick-braodcast: ignoring broadcast for " 265 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
266 "offline CPU #%d\n", *oncpu); 266 "offline CPU #%d\n", *oncpu);
267 else 267 else
268 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 268 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 1bea399a9ef0..4f3886562b8c 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -14,12 +14,14 @@
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/irq.h> 17#include <linux/interrupt.h>
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include <asm/irq_regs.h>
24
23#include "tick-internal.h" 25#include "tick-internal.h"
24 26
25/* 27/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0258d3115d54..450c04935b66 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -14,7 +14,7 @@
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/irq.h> 17#include <linux/interrupt.h>
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 686da821d376..b854a895591e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -158,9 +158,8 @@ void tick_nohz_stop_idle(int cpu)
158 } 158 }
159} 159}
160 160
161static ktime_t tick_nohz_start_idle(int cpu) 161static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
162{ 162{
163 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
164 ktime_t now, delta; 163 ktime_t now, delta;
165 164
166 now = ktime_get(); 165 now = ktime_get();
@@ -192,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
192void tick_nohz_stop_sched_tick(void) 191void tick_nohz_stop_sched_tick(void)
193{ 192{
194 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
195 unsigned long rt_jiffies;
196 struct tick_sched *ts; 194 struct tick_sched *ts;
197 ktime_t last_update, expires, now; 195 ktime_t last_update, expires, now;
198 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 196 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -201,8 +199,8 @@ void tick_nohz_stop_sched_tick(void)
201 local_irq_save(flags); 199 local_irq_save(flags);
202 200
203 cpu = smp_processor_id(); 201 cpu = smp_processor_id();
204 now = tick_nohz_start_idle(cpu);
205 ts = &per_cpu(tick_cpu_sched, cpu); 202 ts = &per_cpu(tick_cpu_sched, cpu);
203 now = tick_nohz_start_idle(ts);
206 204
207 /* 205 /*
208 * If this cpu is offline and it is the one which updates 206 * If this cpu is offline and it is the one which updates
@@ -222,7 +220,6 @@ void tick_nohz_stop_sched_tick(void)
222 if (need_resched()) 220 if (need_resched())
223 goto end; 221 goto end;
224 222
225 cpu = smp_processor_id();
226 if (unlikely(local_softirq_pending())) { 223 if (unlikely(local_softirq_pending())) {
227 static int ratelimit; 224 static int ratelimit;
228 225
@@ -245,10 +242,6 @@ void tick_nohz_stop_sched_tick(void)
245 next_jiffies = get_next_timer_interrupt(last_jiffies); 242 next_jiffies = get_next_timer_interrupt(last_jiffies);
246 delta_jiffies = next_jiffies - last_jiffies; 243 delta_jiffies = next_jiffies - last_jiffies;
247 244
248 rt_jiffies = rt_needs_cpu(cpu);
249 if (rt_jiffies && rt_jiffies < delta_jiffies)
250 delta_jiffies = rt_jiffies;
251
252 if (rcu_needs_cpu(cpu)) 245 if (rcu_needs_cpu(cpu))
253 delta_jiffies = 1; 246 delta_jiffies = 1;
254 /* 247 /*
@@ -400,6 +393,7 @@ void tick_nohz_restart_sched_tick(void)
400 sub_preempt_count(HARDIRQ_OFFSET); 393 sub_preempt_count(HARDIRQ_OFFSET);
401 } 394 }
402 395
396 touch_softlockup_watchdog();
403 /* 397 /*
404 * Cancel the scheduled timer and restore the tick 398 * Cancel the scheduled timer and restore the tick
405 */ 399 */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a3fa587c350c..e91c29f961c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -53,7 +53,7 @@ void update_xtime_cache(u64 nsec)
53 timespec_add_ns(&xtime_cache, nsec); 53 timespec_add_ns(&xtime_cache, nsec);
54} 54}
55 55
56static struct clocksource *clock; /* pointer to current clocksource */ 56struct clocksource *clock;
57 57
58 58
59#ifdef CONFIG_GENERIC_TIME 59#ifdef CONFIG_GENERIC_TIME
@@ -178,6 +178,7 @@ static void change_clocksource(void)
178 if (clock == new) 178 if (clock == new)
179 return; 179 return;
180 180
181 new->cycle_last = 0;
181 now = clocksource_read(new); 182 now = clocksource_read(new);
182 nsec = __get_nsec_offset(); 183 nsec = __get_nsec_offset();
183 timespec_add_ns(&xtime, nsec); 184 timespec_add_ns(&xtime, nsec);
@@ -245,7 +246,7 @@ void __init timekeeping_init(void)
245 246
246 write_seqlock_irqsave(&xtime_lock, flags); 247 write_seqlock_irqsave(&xtime_lock, flags);
247 248
248 ntp_clear(); 249 ntp_init();
249 250
250 clock = clocksource_get_next(); 251 clock = clocksource_get_next();
251 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 252 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -295,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev)
295 timespec_add_ns(&xtime, timekeeping_suspend_nsecs); 296 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
296 update_xtime_cache(0); 297 update_xtime_cache(0);
297 /* re-base the last cycle value */ 298 /* re-base the last cycle value */
299 clock->cycle_last = 0;
298 clock->cycle_last = clocksource_read(clock); 300 clock->cycle_last = clocksource_read(clock);
299 clock->error = 0; 301 clock->error = 0;
300 timekeeping_suspended = 0; 302 timekeeping_suspended = 0;
@@ -369,7 +371,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
369 * here. This is tuned so that an error of about 1 msec is adjusted 371 * here. This is tuned so that an error of about 1 msec is adjusted
370 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 372 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
371 */ 373 */
372 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); 374 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
373 error2 = abs(error2); 375 error2 = abs(error2);
374 for (look_ahead = 0; error2 > 0; look_ahead++) 376 for (look_ahead = 0; error2 > 0; look_ahead++)
375 error2 >>= 2; 377 error2 >>= 2;
@@ -378,8 +380,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
378 * Now calculate the error in (1 << look_ahead) ticks, but first 380 * Now calculate the error in (1 << look_ahead) ticks, but first
379 * remove the single look ahead already included in the error. 381 * remove the single look ahead already included in the error.
380 */ 382 */
381 tick_error = current_tick_length() >> 383 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
382 (TICK_LENGTH_SHIFT - clock->shift + 1);
383 tick_error -= clock->xtime_interval >> 1; 384 tick_error -= clock->xtime_interval >> 1;
384 error = ((error - tick_error) >> look_ahead) + tick_error; 385 error = ((error - tick_error) >> look_ahead) + tick_error;
385 386
@@ -410,7 +411,7 @@ static void clocksource_adjust(s64 offset)
410 s64 error, interval = clock->cycle_interval; 411 s64 error, interval = clock->cycle_interval;
411 int adj; 412 int adj;
412 413
413 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); 414 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
414 if (error > interval) { 415 if (error > interval) {
415 error >>= 2; 416 error >>= 2;
416 if (likely(error <= interval)) 417 if (likely(error <= interval))
@@ -432,7 +433,7 @@ static void clocksource_adjust(s64 offset)
432 clock->xtime_interval += interval; 433 clock->xtime_interval += interval;
433 clock->xtime_nsec -= offset; 434 clock->xtime_nsec -= offset;
434 clock->error -= (interval - offset) << 435 clock->error -= (interval - offset) <<
435 (TICK_LENGTH_SHIFT - clock->shift); 436 (NTP_SCALE_SHIFT - clock->shift);
436} 437}
437 438
438/** 439/**
@@ -471,8 +472,8 @@ void update_wall_time(void)
471 } 472 }
472 473
473 /* accumulate error between NTP and clock interval */ 474 /* accumulate error between NTP and clock interval */
474 clock->error += current_tick_length(); 475 clock->error += tick_length;
475 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); 476 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
476 } 477 }
477 478
478 /* correct the clock when NTP error is too big */ 479 /* correct the clock when NTP error is too big */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 67fe8fc21fb1..a40e20fd0001 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void)
278{ 278{
279 struct proc_dir_entry *pe; 279 struct proc_dir_entry *pe;
280 280
281 pe = create_proc_entry("timer_list", 0644, NULL); 281 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
282 if (!pe) 282 if (!pe)
283 return -ENOMEM; 283 return -ENOMEM;
284
285 pe->proc_fops = &timer_list_fops;
286
287 return 0; 284 return 0;
288} 285}
289__initcall(init_timer_list_procfs); 286__initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 417da8c5bc72..c994530d166d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void)
415{ 415{
416 struct proc_dir_entry *pe; 416 struct proc_dir_entry *pe;
417 417
418 pe = create_proc_entry("timer_stats", 0644, NULL); 418 pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
419 if (!pe) 419 if (!pe)
420 return -ENOMEM; 420 return -ENOMEM;
421
422 pe->proc_fops = &tstats_fops;
423
424 return 0; 421 return 0;
425} 422}
426__initcall(init_tstats_procfs); 423__initcall(init_tstats_procfs);
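The timer_list.c and timer_stats.c hunks above are the same mechanical conversion from create_proc_entry() plus a proc_fops assignment to a single proc_create() call. A generic sketch of the pattern for reference (the "example" name, example_show and example_fops are hypothetical):

#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init example_init(void)
{
	/* one call registers the entry and its file_operations */
	return proc_create("example", 0444, NULL, &example_fops) ? 0 : -ENOMEM;
}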
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 41468035473c..eb51d76e058a 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -1,7 +1,7 @@
1#!/usr/bin/perl 1#!/usr/bin/perl
2# ----------------------------------------------------------------------- 2# -----------------------------------------------------------------------
3# 3#
4# Copyright 2007 rPath, Inc. - All Rights Reserved 4# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5# 5#
6# This file is part of the Linux kernel, and is made available under 6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your 7# the terms of the GNU General Public License version 2 or (at your
@@ -20,198 +20,138 @@
20%canned_values = ( 20%canned_values = (
21 24 => [ 21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26, 22 '0xa6aaaaab','0x2aaaaaa',26,
23 '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58,
24 125,3, 23 125,3,
25 '0xc49ba5e4','0x1fbe76c8b4',37, 24 '0xc49ba5e4','0x1fbe76c8b4',37,
26 '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69,
27 3,125, 25 3,125,
28 '0xa2c2aaab','0xaaaa',16, 26 '0xa2c2aaab','0xaaaa',16,
29 '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48,
30 125000,3, 27 125000,3,
31 '0xc9539b89','0x7fffbce4217d',47, 28 '0xc9539b89','0x7fffbce4217d',47,
32 '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79,
33 3,125000, 29 3,125000,
34 ], 32 => [ 30 ], 32 => [
35 '0xfa000000','0x6000000',27, 31 '0xfa000000','0x6000000',27,
36 '0xfa00000000000000','0x600000000000000',59,
37 125,4, 32 125,4,
38 '0x83126e98','0xfdf3b645a',36, 33 '0x83126e98','0xfdf3b645a',36,
39 '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68,
40 4,125, 34 4,125,
41 '0xf4240000','0x0',17, 35 '0xf4240000','0x0',17,
42 '0xf424000000000000','0x0',49,
43 31250,1, 36 31250,1,
44 '0x8637bd06','0x3fff79c842fa',46, 37 '0x8637bd06','0x3fff79c842fa',46,
45 '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78,
46 1,31250, 38 1,31250,
47 ], 48 => [ 39 ], 48 => [
48 '0xa6aaaaab','0x6aaaaaa',27, 40 '0xa6aaaaab','0x6aaaaaa',27,
49 '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59,
50 125,6, 41 125,6,
51 '0xc49ba5e4','0xfdf3b645a',36, 42 '0xc49ba5e4','0xfdf3b645a',36,
52 '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68,
53 6,125, 43 6,125,
54 '0xa2c2aaab','0x15555',17, 44 '0xa2c2aaab','0x15555',17,
55 '0xa2c2aaaaaaaaaaab','0x1555555555555',49,
56 62500,3, 45 62500,3,
57 '0xc9539b89','0x3fffbce4217d',46, 46 '0xc9539b89','0x3fffbce4217d',46,
58 '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78,
59 3,62500, 47 3,62500,
60 ], 64 => [ 48 ], 64 => [
61 '0xfa000000','0xe000000',28, 49 '0xfa000000','0xe000000',28,
62 '0xfa00000000000000','0xe00000000000000',60,
63 125,8, 50 125,8,
64 '0x83126e98','0x7ef9db22d',35, 51 '0x83126e98','0x7ef9db22d',35,
65 '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67,
66 8,125, 52 8,125,
67 '0xf4240000','0x0',18, 53 '0xf4240000','0x0',18,
68 '0xf424000000000000','0x0',50,
69 15625,1, 54 15625,1,
70 '0x8637bd06','0x1fff79c842fa',45, 55 '0x8637bd06','0x1fff79c842fa',45,
71 '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77,
72 1,15625, 56 1,15625,
73 ], 100 => [ 57 ], 100 => [
74 '0xa0000000','0x0',28, 58 '0xa0000000','0x0',28,
75 '0xa000000000000000','0x0',60,
76 10,1, 59 10,1,
77 '0xcccccccd','0x733333333',35, 60 '0xcccccccd','0x733333333',35,
78 '0xcccccccccccccccd','0x73333333333333333',67,
79 1,10, 61 1,10,
80 '0x9c400000','0x0',18, 62 '0x9c400000','0x0',18,
81 '0x9c40000000000000','0x0',50,
82 10000,1, 63 10000,1,
83 '0xd1b71759','0x1fff2e48e8a7',45, 64 '0xd1b71759','0x1fff2e48e8a7',45,
84 '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77,
85 1,10000, 65 1,10000,
86 ], 122 => [ 66 ], 122 => [
87 '0x8325c53f','0xfbcda3a',28, 67 '0x8325c53f','0xfbcda3a',28,
88 '0x8325c53ef368eb05','0xfbcda3ac10c9714',60,
89 500,61, 68 500,61,
90 '0xf9db22d1','0x7fbe76c8b',35, 69 '0xf9db22d1','0x7fbe76c8b',35,
91 '0xf9db22d0e560418a','0x7fbe76c8b43958106',67,
92 61,500, 70 61,500,
93 '0x8012e2a0','0x3ef36',18, 71 '0x8012e2a0','0x3ef36',18,
94 '0x8012e29f79b47583','0x3ef368eb04325',50,
95 500000,61, 72 500000,61,
96 '0xffda4053','0x1ffffbce4217',45, 73 '0xffda4053','0x1ffffbce4217',45,
97 '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77,
98 61,500000, 74 61,500000,
99 ], 128 => [ 75 ], 128 => [
100 '0xfa000000','0x1e000000',29, 76 '0xfa000000','0x1e000000',29,
101 '0xfa00000000000000','0x1e00000000000000',61,
102 125,16, 77 125,16,
103 '0x83126e98','0x3f7ced916',34, 78 '0x83126e98','0x3f7ced916',34,
104 '0x83126e978d4fdf3c','0x3f7ced916872b020c',66,
105 16,125, 79 16,125,
106 '0xf4240000','0x40000',19, 80 '0xf4240000','0x40000',19,
107 '0xf424000000000000','0x4000000000000',51,
108 15625,2, 81 15625,2,
109 '0x8637bd06','0xfffbce4217d',44, 82 '0x8637bd06','0xfffbce4217d',44,
110 '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76,
111 2,15625, 83 2,15625,
112 ], 200 => [ 84 ], 200 => [
113 '0xa0000000','0x0',29, 85 '0xa0000000','0x0',29,
114 '0xa000000000000000','0x0',61,
115 5,1, 86 5,1,
116 '0xcccccccd','0x333333333',34, 87 '0xcccccccd','0x333333333',34,
117 '0xcccccccccccccccd','0x33333333333333333',66,
118 1,5, 88 1,5,
119 '0x9c400000','0x0',19, 89 '0x9c400000','0x0',19,
120 '0x9c40000000000000','0x0',51,
121 5000,1, 90 5000,1,
122 '0xd1b71759','0xfff2e48e8a7',44, 91 '0xd1b71759','0xfff2e48e8a7',44,
123 '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76,
124 1,5000, 92 1,5000,
125 ], 250 => [ 93 ], 250 => [
126 '0x80000000','0x0',29, 94 '0x80000000','0x0',29,
127 '0x8000000000000000','0x0',61,
128 4,1, 95 4,1,
129 '0x80000000','0x180000000',33, 96 '0x80000000','0x180000000',33,
130 '0x8000000000000000','0x18000000000000000',65,
131 1,4, 97 1,4,
132 '0xfa000000','0x0',20, 98 '0xfa000000','0x0',20,
133 '0xfa00000000000000','0x0',52,
134 4000,1, 99 4000,1,
135 '0x83126e98','0x7ff7ced9168',43, 100 '0x83126e98','0x7ff7ced9168',43,
136 '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75,
137 1,4000, 101 1,4000,
138 ], 256 => [ 102 ], 256 => [
139 '0xfa000000','0x3e000000',30, 103 '0xfa000000','0x3e000000',30,
140 '0xfa00000000000000','0x3e00000000000000',62,
141 125,32, 104 125,32,
142 '0x83126e98','0x1fbe76c8b',33, 105 '0x83126e98','0x1fbe76c8b',33,
143 '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65,
144 32,125, 106 32,125,
145 '0xf4240000','0xc0000',20, 107 '0xf4240000','0xc0000',20,
146 '0xf424000000000000','0xc000000000000',52,
147 15625,4, 108 15625,4,
148 '0x8637bd06','0x7ffde7210be',43, 109 '0x8637bd06','0x7ffde7210be',43,
149 '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75,
150 4,15625, 110 4,15625,
151 ], 300 => [ 111 ], 300 => [
152 '0xd5555556','0x2aaaaaaa',30, 112 '0xd5555556','0x2aaaaaaa',30,
153 '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62,
154 10,3, 113 10,3,
155 '0x9999999a','0x1cccccccc',33, 114 '0x9999999a','0x1cccccccc',33,
156 '0x999999999999999a','0x1cccccccccccccccc',65,
157 3,10, 115 3,10,
158 '0xd0555556','0xaaaaa',20, 116 '0xd0555556','0xaaaaa',20,
159 '0xd055555555555556','0xaaaaaaaaaaaaa',52,
160 10000,3, 117 10000,3,
161 '0x9d495183','0x7ffcb923a29',43, 118 '0x9d495183','0x7ffcb923a29',43,
162 '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75,
163 3,10000, 119 3,10000,
164 ], 512 => [ 120 ], 512 => [
165 '0xfa000000','0x7e000000',31, 121 '0xfa000000','0x7e000000',31,
166 '0xfa00000000000000','0x7e00000000000000',63,
167 125,64, 122 125,64,
168 '0x83126e98','0xfdf3b645',32, 123 '0x83126e98','0xfdf3b645',32,
169 '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64,
170 64,125, 124 64,125,
171 '0xf4240000','0x1c0000',21, 125 '0xf4240000','0x1c0000',21,
172 '0xf424000000000000','0x1c000000000000',53,
173 15625,8, 126 15625,8,
174 '0x8637bd06','0x3ffef39085f',42, 127 '0x8637bd06','0x3ffef39085f',42,
175 '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74,
176 8,15625, 128 8,15625,
177 ], 1000 => [ 129 ], 1000 => [
178 '0x80000000','0x0',31, 130 '0x80000000','0x0',31,
179 '0x8000000000000000','0x0',63,
180 1,1, 131 1,1,
181 '0x80000000','0x0',31, 132 '0x80000000','0x0',31,
182 '0x8000000000000000','0x0',63,
183 1,1, 133 1,1,
184 '0xfa000000','0x0',22, 134 '0xfa000000','0x0',22,
185 '0xfa00000000000000','0x0',54,
186 1000,1, 135 1000,1,
187 '0x83126e98','0x1ff7ced9168',41, 136 '0x83126e98','0x1ff7ced9168',41,
188 '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73,
189 1,1000, 137 1,1000,
190 ], 1024 => [ 138 ], 1024 => [
191 '0xfa000000','0xfe000000',32, 139 '0xfa000000','0xfe000000',32,
192 '0xfa00000000000000','0xfe00000000000000',64,
193 125,128, 140 125,128,
194 '0x83126e98','0x7ef9db22',31, 141 '0x83126e98','0x7ef9db22',31,
195 '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63,
196 128,125, 142 128,125,
197 '0xf4240000','0x3c0000',22, 143 '0xf4240000','0x3c0000',22,
198 '0xf424000000000000','0x3c000000000000',54,
199 15625,16, 144 15625,16,
200 '0x8637bd06','0x1fff79c842f',41, 145 '0x8637bd06','0x1fff79c842f',41,
201 '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73,
202 16,15625, 146 16,15625,
203 ], 1200 => [ 147 ], 1200 => [
204 '0xd5555556','0xd5555555',32, 148 '0xd5555556','0xd5555555',32,
205 '0xd555555555555556','0xd555555555555555',64,
206 5,6, 149 5,6,
207 '0x9999999a','0x66666666',31, 150 '0x9999999a','0x66666666',31,
208 '0x999999999999999a','0x6666666666666666',63,
209 6,5, 151 6,5,
210 '0xd0555556','0x2aaaaa',22, 152 '0xd0555556','0x2aaaaa',22,
211 '0xd055555555555556','0x2aaaaaaaaaaaaa',54,
212 2500,3, 153 2500,3,
213 '0x9d495183','0x1ffcb923a29',41, 154 '0x9d495183','0x1ffcb923a29',41,
214 '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73,
215 3,2500, 155 3,2500,
216 ] 156 ]
217); 157);
@@ -264,6 +204,15 @@ sub fmuls($$$) {
264 return 0; 204 return 0;
265} 205}
266 206
207# Generate a hex value if the result fits in 64 bits;
208# otherwise skip.
209sub bignum_hex($) {
210 my($x) = @_;
211 my $s = $x->as_hex();
212
213 return (length($s) > 18) ? undef : $s;
214}
215
267# Provides mul, adj, and shr factors for a specific 216# Provides mul, adj, and shr factors for a specific
268# (bit, time, hz) combination 217# (bit, time, hz) combination
269sub muladj($$$) { 218sub muladj($$$) {
@@ -271,7 +220,7 @@ sub muladj($$$) {
271 my $s = fmuls($b, $t, $hz); 220 my $s = fmuls($b, $t, $hz);
272 my $m = fmul($s, $t, $hz); 221 my $m = fmul($s, $t, $hz);
273 my $a = fadj($s, $t, $hz); 222 my $a = fadj($s, $t, $hz);
274 return ($m->as_hex(), $a->as_hex(), $s); 223 return (bignum_hex($m), bignum_hex($a), $s);
275} 224}
276 225
277# Provides numerator, denominator values 226# Provides numerator, denominator values
@@ -288,12 +237,10 @@ sub conversions($$) {
288 237
289 # HZ_TO_xx 238 # HZ_TO_xx
290 push(@val, muladj(32, $t, $hz)); 239 push(@val, muladj(32, $t, $hz));
291 push(@val, muladj(64, $t, $hz));
292 push(@val, numden($t, $hz)); 240 push(@val, numden($t, $hz));
293 241
294 # xx_TO_HZ 242 # xx_TO_HZ
295 push(@val, muladj(32, $hz, $t)); 243 push(@val, muladj(32, $hz, $t));
296 push(@val, muladj(64, $hz, $t));
297 push(@val, numden($hz, $t)); 244 push(@val, numden($hz, $t));
298 245
299 return @val; 246 return @val;
@@ -318,6 +265,19 @@ sub compute_values($) {
318 return @val; 265 return @val;
319} 266}
320 267
268sub outputval($$)
269{
270 my($name, $val) = @_;
271 my $csuf;
272
273 if (defined($val)) {
274 if ($name !~ /SHR/) {
275 $val = "U64_C($val)";
276 }
277 printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
278 }
279}
280
321sub output($@) 281sub output($@)
322{ 282{
323 my($hz, @val) = @_; 283 my($hz, @val) = @_;
@@ -331,6 +291,7 @@ sub output($@)
331 print "\n"; 291 print "\n";
332 292
333 print "#include <linux/param.h>\n"; 293 print "#include <linux/param.h>\n";
294 print "#include <linux/types.h>\n";
334 295
335 print "\n"; 296 print "\n";
336 print "#if HZ != $hz\n"; 297 print "#if HZ != $hz\n";
@@ -340,15 +301,13 @@ sub output($@)
340 301
341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', 302 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
342 'HZ_TO_USEC','USEC_TO_HZ') { 303 'HZ_TO_USEC','USEC_TO_HZ') {
343 foreach $bit (32, 64) { 304 foreach $bit (32) {
344 foreach $suf ('MUL', 'ADJ', 'SHR') { 305 foreach $suf ('MUL', 'ADJ', 'SHR') {
345 printf "#define %-23s %s\n", 306 outputval("${pfx}_$suf$bit", shift(@val));
346 "${pfx}_$suf$bit", shift(@val);
347 } 307 }
348 } 308 }
349 foreach $suf ('NUM', 'DEN') { 309 foreach $suf ('NUM', 'DEN') {
350 printf "#define %-23s %s\n", 310 outputval("${pfx}_$suf", shift(@val));
351 "${pfx}_$suf", shift(@val);
352 } 311 }
353 } 312 }
354 313
@@ -356,6 +315,23 @@ sub output($@)
356 print "#endif /* KERNEL_TIMECONST_H */\n"; 315 print "#endif /* KERNEL_TIMECONST_H */\n";
357} 316}
358 317
318# Pretty-print Perl values
319sub perlvals(@) {
320 my $v;
321 my @l = ();
322
323 foreach $v (@_) {
324 if (!defined($v)) {
325 push(@l, 'undef');
326 } elsif ($v =~ /^0x/) {
327 push(@l, "\'".$v."\'");
328 } else {
329 push(@l, $v.'');
330 }
331 }
332 return join(',', @l);
333}
334
359($hz) = @ARGV; 335($hz) = @ARGV;
360 336
361# Use this to generate the %canned_values structure 337# Use this to generate the %canned_values structure
@@ -373,15 +349,15 @@ if ($hz eq '--can') {
373 print "$pf$hz => [\n"; 349 print "$pf$hz => [\n";
374 while (scalar(@values)) { 350 while (scalar(@values)) {
375 my $bit; 351 my $bit;
376 foreach $bit (32, 64) { 352 foreach $bit (32) {
377 my $m = shift(@values); 353 my $m = shift(@values);
378 my $a = shift(@values); 354 my $a = shift(@values);
379 my $s = shift(@values); 355 my $s = shift(@values);
380 print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n"; 356 print "\t\t", perlvals($m,$a,$s), ",\n";
381 } 357 }
382 my $n = shift(@values); 358 my $n = shift(@values);
383 my $d = shift(@values); 359 my $d = shift(@values);
384 print "\t\t",$n,',',$d,",\n"; 360 print "\t\t", perlvals($n,$d), ",\n";
385 } 361 }
386 print "\t]"; 362 print "\t]";
387 $pf = ', '; 363 $pf = ', ';
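timeconst.pl now emits only the 32-bit MUL/ADJ/SHR triples (wrapped in U64_C() by outputval) and leaves a constant undefined when it would not fit in 64 bits. The triples turn a division into a multiply and shift; a user-space sketch using the HZ=100 HZ_TO_MSEC entry from the canned table above (the way jiffies.h consumes these constants is paraphrased here, not copied):

#include <stdint.h>
#include <stdio.h>

/* From the canned table for HZ=100: MUL32 = 0xa0000000, ADJ32 = 0, SHR32 = 28,
 * i.e. "multiply by 10" expressed as a multiply-and-shift. */
#define HZ_TO_MSEC_MUL32 UINT64_C(0xa0000000)
#define HZ_TO_MSEC_ADJ32 UINT64_C(0x0)
#define HZ_TO_MSEC_SHR32 28

static uint64_t jiffies_to_msecs_demo(uint32_t j)
{
	return ((uint64_t)j * HZ_TO_MSEC_MUL32 + HZ_TO_MSEC_ADJ32) >> HZ_TO_MSEC_SHR32;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)jiffies_to_msecs_demo(250)); /* prints 2500 */
	return 0;
}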
diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a25f88b..ceacc6626572 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -320,14 +320,130 @@ static void timer_stats_account_timer(struct timer_list *timer)
320static void timer_stats_account_timer(struct timer_list *timer) {} 320static void timer_stats_account_timer(struct timer_list *timer) {}
321#endif 321#endif
322 322
323/** 323#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
324 * init_timer - initialize a timer. 324
325 * @timer: the timer to be initialized 325static struct debug_obj_descr timer_debug_descr;
326 * 326
327 * init_timer() must be done to a timer prior calling *any* of the 327/*
328 * other timer functions. 328 * fixup_init is called when:
329 * - an active object is initialized
329 */ 330 */
330void init_timer(struct timer_list *timer) 331static int timer_fixup_init(void *addr, enum debug_obj_state state)
332{
333 struct timer_list *timer = addr;
334
335 switch (state) {
336 case ODEBUG_STATE_ACTIVE:
337 del_timer_sync(timer);
338 debug_object_init(timer, &timer_debug_descr);
339 return 1;
340 default:
341 return 0;
342 }
343}
344
345/*
346 * fixup_activate is called when:
347 * - an active object is activated
348 * - an unknown object is activated (might be a statically initialized object)
349 */
350static int timer_fixup_activate(void *addr, enum debug_obj_state state)
351{
352 struct timer_list *timer = addr;
353
354 switch (state) {
355
356 case ODEBUG_STATE_NOTAVAILABLE:
357 /*
358 * This is not really a fixup. The timer was
359 * statically initialized. We just make sure that it
360 * is tracked in the object tracker.
361 */
362 if (timer->entry.next == NULL &&
363 timer->entry.prev == TIMER_ENTRY_STATIC) {
364 debug_object_init(timer, &timer_debug_descr);
365 debug_object_activate(timer, &timer_debug_descr);
366 return 0;
367 } else {
368 WARN_ON_ONCE(1);
369 }
370 return 0;
371
372 case ODEBUG_STATE_ACTIVE:
373 WARN_ON(1);
374
375 default:
376 return 0;
377 }
378}
379
380/*
381 * fixup_free is called when:
382 * - an active object is freed
383 */
384static int timer_fixup_free(void *addr, enum debug_obj_state state)
385{
386 struct timer_list *timer = addr;
387
388 switch (state) {
389 case ODEBUG_STATE_ACTIVE:
390 del_timer_sync(timer);
391 debug_object_free(timer, &timer_debug_descr);
392 return 1;
393 default:
394 return 0;
395 }
396}
397
398static struct debug_obj_descr timer_debug_descr = {
399 .name = "timer_list",
400 .fixup_init = timer_fixup_init,
401 .fixup_activate = timer_fixup_activate,
402 .fixup_free = timer_fixup_free,
403};
404
405static inline void debug_timer_init(struct timer_list *timer)
406{
407 debug_object_init(timer, &timer_debug_descr);
408}
409
410static inline void debug_timer_activate(struct timer_list *timer)
411{
412 debug_object_activate(timer, &timer_debug_descr);
413}
414
415static inline void debug_timer_deactivate(struct timer_list *timer)
416{
417 debug_object_deactivate(timer, &timer_debug_descr);
418}
419
420static inline void debug_timer_free(struct timer_list *timer)
421{
422 debug_object_free(timer, &timer_debug_descr);
423}
424
425static void __init_timer(struct timer_list *timer);
426
427void init_timer_on_stack(struct timer_list *timer)
428{
429 debug_object_init_on_stack(timer, &timer_debug_descr);
430 __init_timer(timer);
431}
432EXPORT_SYMBOL_GPL(init_timer_on_stack);
433
434void destroy_timer_on_stack(struct timer_list *timer)
435{
436 debug_object_free(timer, &timer_debug_descr);
437}
438EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
439
440#else
441static inline void debug_timer_init(struct timer_list *timer) { }
442static inline void debug_timer_activate(struct timer_list *timer) { }
443static inline void debug_timer_deactivate(struct timer_list *timer) { }
444#endif
445
446static void __init_timer(struct timer_list *timer)
331{ 447{
332 timer->entry.next = NULL; 448 timer->entry.next = NULL;
333 timer->base = __raw_get_cpu_var(tvec_bases); 449 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -337,6 +453,19 @@ void init_timer(struct timer_list *timer)
337 memset(timer->start_comm, 0, TASK_COMM_LEN); 453 memset(timer->start_comm, 0, TASK_COMM_LEN);
338#endif 454#endif
339} 455}
456
457/**
458 * init_timer - initialize a timer.
459 * @timer: the timer to be initialized
460 *
461 * init_timer() must be done to a timer prior calling *any* of the
462 * other timer functions.
463 */
464void init_timer(struct timer_list *timer)
465{
466 debug_timer_init(timer);
467 __init_timer(timer);
468}
340EXPORT_SYMBOL(init_timer); 469EXPORT_SYMBOL(init_timer);
341 470
342void init_timer_deferrable(struct timer_list *timer) 471void init_timer_deferrable(struct timer_list *timer)
@@ -351,6 +480,8 @@ static inline void detach_timer(struct timer_list *timer,
351{ 480{
352 struct list_head *entry = &timer->entry; 481 struct list_head *entry = &timer->entry;
353 482
483 debug_timer_deactivate(timer);
484
354 __list_del(entry->prev, entry->next); 485 __list_del(entry->prev, entry->next);
355 if (clear_pending) 486 if (clear_pending)
356 entry->next = NULL; 487 entry->next = NULL;
@@ -405,6 +536,8 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
405 ret = 1; 536 ret = 1;
406 } 537 }
407 538
539 debug_timer_activate(timer);
540
408 new_base = __get_cpu_var(tvec_bases); 541 new_base = __get_cpu_var(tvec_bases);
409 542
410 if (base != new_base) { 543 if (base != new_base) {
@@ -450,11 +583,20 @@ void add_timer_on(struct timer_list *timer, int cpu)
450 BUG_ON(timer_pending(timer) || !timer->function); 583 BUG_ON(timer_pending(timer) || !timer->function);
451 spin_lock_irqsave(&base->lock, flags); 584 spin_lock_irqsave(&base->lock, flags);
452 timer_set_base(timer, base); 585 timer_set_base(timer, base);
586 debug_timer_activate(timer);
453 internal_add_timer(base, timer); 587 internal_add_timer(base, timer);
588 /*
589 * Check whether the other CPU is idle and needs to be
590 * triggered to reevaluate the timer wheel when nohz is
591 * active. We are protected against the other CPU fiddling
592 * with the timer by holding the timer base lock. This also
593 * makes sure that a CPU on the way to idle can not evaluate
594 * the timer wheel.
595 */
596 wake_up_idle_cpu(cpu);
454 spin_unlock_irqrestore(&base->lock, flags); 597 spin_unlock_irqrestore(&base->lock, flags);
455} 598}
456 599
457
458/** 600/**
459 * mod_timer - modify a timer's timeout 601 * mod_timer - modify a timer's timeout
460 * @timer: the timer to be modified 602 * @timer: the timer to be modified
@@ -1078,11 +1220,14 @@ signed long __sched schedule_timeout(signed long timeout)
1078 1220
1079 expire = timeout + jiffies; 1221 expire = timeout + jiffies;
1080 1222
1081 setup_timer(&timer, process_timeout, (unsigned long)current); 1223 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1082 __mod_timer(&timer, expire); 1224 __mod_timer(&timer, expire);
1083 schedule(); 1225 schedule();
1084 del_singleshot_timer_sync(&timer); 1226 del_singleshot_timer_sync(&timer);
1085 1227
1228 /* Remove the timer from the object tracker */
1229 destroy_timer_on_stack(&timer);
1230
1086 timeout = expire - jiffies; 1231 timeout = expire - jiffies;
1087 1232
1088 out: 1233 out:
@@ -1220,13 +1365,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1220 return 0; 1365 return 0;
1221} 1366}
1222 1367
1223/*
1224 * lockdep: we want to track each per-CPU base as a separate lock-class,
1225 * but timer-bases are kmalloc()-ed, so we need to attach separate
1226 * keys to them:
1227 */
1228static struct lock_class_key base_lock_keys[NR_CPUS];
1229
1230static int __cpuinit init_timers_cpu(int cpu) 1368static int __cpuinit init_timers_cpu(int cpu)
1231{ 1369{
1232 int j; 1370 int j;
@@ -1269,7 +1407,6 @@ static int __cpuinit init_timers_cpu(int cpu)
1269 } 1407 }
1270 1408
1271 spin_lock_init(&base->lock); 1409 spin_lock_init(&base->lock);
1272 lockdep_set_class(&base->lock, base_lock_keys + cpu);
1273 1410
1274 for (j = 0; j < TVN_SIZE; j++) { 1411 for (j = 0; j < TVN_SIZE; j++) {
1275 INIT_LIST_HEAD(base->tv5.vec + j); 1412 INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1308,8 +1445,8 @@ static void __cpuinit migrate_timers(int cpu)
1308 new_base = get_cpu_var(tvec_bases); 1445 new_base = get_cpu_var(tvec_bases);
1309 1446
1310 local_irq_disable(); 1447 local_irq_disable();
1311 double_spin_lock(&new_base->lock, &old_base->lock, 1448 spin_lock(&new_base->lock);
1312 smp_processor_id() < cpu); 1449 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1313 1450
1314 BUG_ON(old_base->running_timer); 1451 BUG_ON(old_base->running_timer);
1315 1452
@@ -1322,8 +1459,8 @@ static void __cpuinit migrate_timers(int cpu)
1322 migrate_timer_list(new_base, old_base->tv5.vec + i); 1459 migrate_timer_list(new_base, old_base->tv5.vec + i);
1323 } 1460 }
1324 1461
1325 double_spin_unlock(&new_base->lock, &old_base->lock, 1462 spin_unlock(&old_base->lock);
1326 smp_processor_id() < cpu); 1463 spin_unlock(&new_base->lock);
1327 local_irq_enable(); 1464 local_irq_enable();
1328 put_cpu_var(tvec_bases); 1465 put_cpu_var(tvec_bases);
1329} 1466}
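With CONFIG_DEBUG_OBJECTS_TIMERS, timers that live on the stack have to be announced to the object tracker, which is what the schedule_timeout() hunk above does. A kernel-side sketch of the same pairing (my_callback and wait_a_bit are made-up names; this shows the usage pattern, not code from the patch):

#include <linux/timer.h>
#include <linux/jiffies.h>

static void my_callback(unsigned long data)
{
	/* runs in softirq context when the timer expires */
}

static void wait_a_bit(unsigned long delay)
{
	struct timer_list t;

	setup_timer_on_stack(&t, my_callback, 0);
	mod_timer(&t, jiffies + delay);

	/* ... do whatever we are waiting alongside ... */

	del_timer_sync(&t);
	destroy_timer_on_stack(&t);   /* drop it from the object tracker */
}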
diff --git a/kernel/uid16.c b/kernel/uid16.c
index dd308ba4e03b..3e41c1673e2f 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi
21{ 21{
22 long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); 22 long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
23 /* avoid REGPARM breakage on x86: */ 23 /* avoid REGPARM breakage on x86: */
24 prevent_tail_call(ret); 24 asmlinkage_protect(3, ret, filename, user, group);
25 return ret; 25 return ret;
26} 26}
27 27
@@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g
29{ 29{
30 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); 30 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
31 /* avoid REGPARM breakage on x86: */ 31 /* avoid REGPARM breakage on x86: */
32 prevent_tail_call(ret); 32 asmlinkage_protect(3, ret, filename, user, group);
33 return ret; 33 return ret;
34} 34}
35 35
@@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
37{ 37{
38 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); 38 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
39 /* avoid REGPARM breakage on x86: */ 39 /* avoid REGPARM breakage on x86: */
40 prevent_tail_call(ret); 40 asmlinkage_protect(3, ret, fd, user, group);
41 return ret; 41 return ret;
42} 42}
43 43
@@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
45{ 45{
46 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); 46 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
47 /* avoid REGPARM breakage on x86: */ 47 /* avoid REGPARM breakage on x86: */
48 prevent_tail_call(ret); 48 asmlinkage_protect(2, ret, rgid, egid);
49 return ret; 49 return ret;
50} 50}
51 51
@@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid)
53{ 53{
54 long ret = sys_setgid(low2highgid(gid)); 54 long ret = sys_setgid(low2highgid(gid));
55 /* avoid REGPARM breakage on x86: */ 55 /* avoid REGPARM breakage on x86: */
56 prevent_tail_call(ret); 56 asmlinkage_protect(1, ret, gid);
57 return ret; 57 return ret;
58} 58}
59 59
@@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
61{ 61{
62 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); 62 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
63 /* avoid REGPARM breakage on x86: */ 63 /* avoid REGPARM breakage on x86: */
64 prevent_tail_call(ret); 64 asmlinkage_protect(2, ret, ruid, euid);
65 return ret; 65 return ret;
66} 66}
67 67
@@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid)
69{ 69{
70 long ret = sys_setuid(low2highuid(uid)); 70 long ret = sys_setuid(low2highuid(uid));
71 /* avoid REGPARM breakage on x86: */ 71 /* avoid REGPARM breakage on x86: */
72 prevent_tail_call(ret); 72 asmlinkage_protect(1, ret, uid);
73 return ret; 73 return ret;
74} 74}
75 75
@@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
78 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 78 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
79 low2highuid(suid)); 79 low2highuid(suid));
80 /* avoid REGPARM breakage on x86: */ 80 /* avoid REGPARM breakage on x86: */
81 prevent_tail_call(ret); 81 asmlinkage_protect(3, ret, ruid, euid, suid);
82 return ret; 82 return ret;
83} 83}
84 84
@@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
98 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), 98 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
99 low2highgid(sgid)); 99 low2highgid(sgid));
100 /* avoid REGPARM breakage on x86: */ 100 /* avoid REGPARM breakage on x86: */
101 prevent_tail_call(ret); 101 asmlinkage_protect(3, ret, rgid, egid, sgid);
102 return ret; 102 return ret;
103} 103}
104 104
@@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid)
117{ 117{
118 long ret = sys_setfsuid(low2highuid(uid)); 118 long ret = sys_setfsuid(low2highuid(uid));
119 /* avoid REGPARM breakage on x86: */ 119 /* avoid REGPARM breakage on x86: */
120 prevent_tail_call(ret); 120 asmlinkage_protect(1, ret, uid);
121 return ret; 121 return ret;
122} 122}
123 123
@@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid)
125{ 125{
126 long ret = sys_setfsgid(low2highgid(gid)); 126 long ret = sys_setfsgid(low2highgid(gid));
127 /* avoid REGPARM breakage on x86: */ 127 /* avoid REGPARM breakage on x86: */
128 prevent_tail_call(ret); 128 asmlinkage_protect(1, ret, gid);
129 return ret; 129 return ret;
130} 130}
131 131
diff --git a/kernel/user.c b/kernel/user.c
index 7132022a040c..865ecf57a096 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -53,10 +53,6 @@ struct user_struct root_user = {
53 .files = ATOMIC_INIT(0), 53 .files = ATOMIC_INIT(0),
54 .sigpending = ATOMIC_INIT(0), 54 .sigpending = ATOMIC_INIT(0),
55 .locked_shm = 0, 55 .locked_shm = 0,
56#ifdef CONFIG_KEYS
57 .uid_keyring = &root_user_keyring,
58 .session_keyring = &root_session_keyring,
59#endif
60#ifdef CONFIG_USER_SCHED 56#ifdef CONFIG_USER_SCHED
61 .tg = &init_task_group, 57 .tg = &init_task_group,
62#endif 58#endif
@@ -101,7 +97,7 @@ static int sched_create_user(struct user_struct *up)
101{ 97{
102 int rc = 0; 98 int rc = 0;
103 99
104 up->tg = sched_create_group(); 100 up->tg = sched_create_group(&root_task_group);
105 if (IS_ERR(up->tg)) 101 if (IS_ERR(up->tg))
106 rc = -ENOMEM; 102 rc = -ENOMEM;
107 103
@@ -193,6 +189,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
193 189
194static struct kobj_attribute cpu_rt_runtime_attr = 190static struct kobj_attribute cpu_rt_runtime_attr =
195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); 191 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
192
193static ssize_t cpu_rt_period_show(struct kobject *kobj,
194 struct kobj_attribute *attr,
195 char *buf)
196{
197 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
198
199 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
200}
201
202static ssize_t cpu_rt_period_store(struct kobject *kobj,
203 struct kobj_attribute *attr,
204 const char *buf, size_t size)
205{
206 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
207 unsigned long rt_period;
208 int rc;
209
210 sscanf(buf, "%lu", &rt_period);
211
212 rc = sched_group_set_rt_period(up->tg, rt_period);
213
214 return (rc ? rc : size);
215}
216
217static struct kobj_attribute cpu_rt_period_attr =
218 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
196#endif 219#endif
197 220
198/* default attributes per uid directory */ 221/* default attributes per uid directory */
@@ -202,6 +225,7 @@ static struct attribute *uids_attributes[] = {
202#endif 225#endif
203#ifdef CONFIG_RT_GROUP_SCHED 226#ifdef CONFIG_RT_GROUP_SCHED
204 &cpu_rt_runtime_attr.attr, 227 &cpu_rt_runtime_attr.attr,
228 &cpu_rt_period_attr.attr,
205#endif 229#endif
206 NULL 230 NULL
207}; 231};
@@ -360,7 +384,7 @@ void free_uid(struct user_struct *up)
360 local_irq_restore(flags); 384 local_irq_restore(flags);
361} 385}
362 386
363struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 387struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
364{ 388{
365 struct hlist_head *hashent = uidhashentry(ns, uid); 389 struct hlist_head *hashent = uidhashentry(ns, uid);
366 struct user_struct *up, *new; 390 struct user_struct *up, *new;
@@ -375,29 +399,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
375 spin_unlock_irq(&uidhash_lock); 399 spin_unlock_irq(&uidhash_lock);
376 400
377 if (!up) { 401 if (!up) {
378 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); 402 new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
379 if (!new) 403 if (!new)
380 goto out_unlock; 404 goto out_unlock;
381 405
382 new->uid = uid; 406 new->uid = uid;
383 atomic_set(&new->__count, 1); 407 atomic_set(&new->__count, 1);
384 atomic_set(&new->processes, 0);
385 atomic_set(&new->files, 0);
386 atomic_set(&new->sigpending, 0);
387#ifdef CONFIG_INOTIFY_USER
388 atomic_set(&new->inotify_watches, 0);
389 atomic_set(&new->inotify_devs, 0);
390#endif
391#ifdef CONFIG_POSIX_MQUEUE
392 new->mq_bytes = 0;
393#endif
394 new->locked_shm = 0;
395
396 if (alloc_uid_keyring(new, current) < 0)
397 goto out_free_user;
398 408
399 if (sched_create_user(new) < 0) 409 if (sched_create_user(new) < 0)
400 goto out_put_keys; 410 goto out_free_user;
401 411
402 if (uids_user_create(new)) 412 if (uids_user_create(new))
403 goto out_destoy_sched; 413 goto out_destoy_sched;
@@ -431,9 +441,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
431 441
432out_destoy_sched: 442out_destoy_sched:
433 sched_destroy_user(new); 443 sched_destroy_user(new);
434out_put_keys:
435 key_put(new->uid_keyring);
436 key_put(new->session_keyring);
437out_free_user: 444out_free_user:
438 kmem_cache_free(uid_cachep, new); 445 kmem_cache_free(uid_cachep, new);
439out_unlock: 446out_unlock:
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4c9006275df7..a9ab0596de44 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/version.h> 9#include <linux/version.h>
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/slab.h>
11#include <linux/user_namespace.h> 12#include <linux/user_namespace.h>
12 13
13/* 14/*
@@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref)
73 release_uids(ns); 74 release_uids(ns);
74 kfree(ns); 75 kfree(ns);
75} 76}
77EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 816d7b24fa03..64d398f12444 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h> 15#include <linux/version.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h>
17 18
18/* 19/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 20 * Clone a new ns copying an original utsname, setting refcount to 1
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ff06611655af..29fc39f1029c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
158 * 158 *
159 * Returns 0 if @work was already on a queue, non-zero otherwise. 159 * Returns 0 if @work was already on a queue, non-zero otherwise.
160 * 160 *
161 * We queue the work to the CPU it was submitted, but there is no 161 * We queue the work to the CPU on which it was submitted, but if the CPU dies
162 * guarantee that it will be processed by that CPU. 162 * it can be processed by another CPU.
163 */ 163 */
164int queue_work(struct workqueue_struct *wq, struct work_struct *work) 164int queue_work(struct workqueue_struct *wq, struct work_struct *work)
165{ 165{
@@ -195,7 +195,6 @@ static void delayed_work_timer_fn(unsigned long __data)
195int queue_delayed_work(struct workqueue_struct *wq, 195int queue_delayed_work(struct workqueue_struct *wq,
196 struct delayed_work *dwork, unsigned long delay) 196 struct delayed_work *dwork, unsigned long delay)
197{ 197{
198 timer_stats_timer_set_start_info(&dwork->timer);
199 if (delay == 0) 198 if (delay == 0)
200 return queue_work(wq, &dwork->work); 199 return queue_work(wq, &dwork->work);
201 200
@@ -223,6 +222,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
223 BUG_ON(timer_pending(timer)); 222 BUG_ON(timer_pending(timer));
224 BUG_ON(!list_empty(&work->entry)); 223 BUG_ON(!list_empty(&work->entry));
225 224
225 timer_stats_timer_set_start_info(&dwork->timer);
226
226 /* This stores cwq for the moment, for the timer_fn */ 227 /* This stores cwq for the moment, for the timer_fn */
227 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 228 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
228 timer->expires = jiffies + delay; 229 timer->expires = jiffies + delay;
@@ -246,7 +247,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
246 if (cwq->run_depth > 3) { 247 if (cwq->run_depth > 3) {
247 /* morton gets to eat his hat */ 248 /* morton gets to eat his hat */
248 printk("%s: recursion depth exceeded: %d\n", 249 printk("%s: recursion depth exceeded: %d\n",
249 __FUNCTION__, cwq->run_depth); 250 __func__, cwq->run_depth);
250 dump_stack(); 251 dump_stack();
251 } 252 }
252 while (!list_empty(&cwq->worklist)) { 253 while (!list_empty(&cwq->worklist)) {
@@ -563,7 +564,6 @@ EXPORT_SYMBOL(schedule_work);
563int schedule_delayed_work(struct delayed_work *dwork, 564int schedule_delayed_work(struct delayed_work *dwork,
564 unsigned long delay) 565 unsigned long delay)
565{ 566{
566 timer_stats_timer_set_start_info(&dwork->timer);
567 return queue_delayed_work(keventd_wq, dwork, delay); 567 return queue_delayed_work(keventd_wq, dwork, delay);
568} 568}
569EXPORT_SYMBOL(schedule_delayed_work); 569EXPORT_SYMBOL(schedule_delayed_work);
@@ -770,7 +770,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
770} 770}
771EXPORT_SYMBOL_GPL(__create_workqueue_key); 771EXPORT_SYMBOL_GPL(__create_workqueue_key);
772 772
773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
774{ 774{
775 /* 775 /*
776 * Our caller is either destroy_workqueue() or CPU_DEAD, 776 * Our caller is either destroy_workqueue() or CPU_DEAD,
@@ -806,19 +806,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
806void destroy_workqueue(struct workqueue_struct *wq) 806void destroy_workqueue(struct workqueue_struct *wq)
807{ 807{
808 const cpumask_t *cpu_map = wq_cpu_map(wq); 808 const cpumask_t *cpu_map = wq_cpu_map(wq);
809 struct cpu_workqueue_struct *cwq;
810 int cpu; 809 int cpu;
811 810
812 get_online_cpus(); 811 get_online_cpus();
813 spin_lock(&workqueue_lock); 812 spin_lock(&workqueue_lock);
814 list_del(&wq->list); 813 list_del(&wq->list);
815 spin_unlock(&workqueue_lock); 814 spin_unlock(&workqueue_lock);
816 put_online_cpus();
817 815
818 for_each_cpu_mask(cpu, *cpu_map) { 816 for_each_cpu_mask(cpu, *cpu_map)
819 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 817 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
820 cleanup_workqueue_thread(cwq, cpu); 818 put_online_cpus();
821 }
822 819
823 free_percpu(wq->cpu_wq); 820 free_percpu(wq->cpu_wq);
824 kfree(wq); 821 kfree(wq);
@@ -836,7 +833,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
836 action &= ~CPU_TASKS_FROZEN; 833 action &= ~CPU_TASKS_FROZEN;
837 834
838 switch (action) { 835 switch (action) {
839
840 case CPU_UP_PREPARE: 836 case CPU_UP_PREPARE:
841 cpu_set(cpu, cpu_populated_map); 837 cpu_set(cpu, cpu_populated_map);
842 } 838 }
@@ -859,11 +855,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
859 case CPU_UP_CANCELED: 855 case CPU_UP_CANCELED:
860 start_workqueue_thread(cwq, -1); 856 start_workqueue_thread(cwq, -1);
861 case CPU_DEAD: 857 case CPU_DEAD:
862 cleanup_workqueue_thread(cwq, cpu); 858 cleanup_workqueue_thread(cwq);
863 break; 859 break;
864 } 860 }
865 } 861 }
866 862
863 switch (action) {
864 case CPU_UP_CANCELED:
865 case CPU_DEAD:
866 cpu_clear(cpu, cpu_populated_map);
867 }
868
867 return NOTIFY_OK; 869 return NOTIFY_OK;
868} 870}
869 871
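For reference, the delayed-work entry points touched above are used as below; timer_stats accounting now happens once inside queue_delayed_work_on(), so callers only queue. my_work_fn, kick_it and the one-second delay are illustrative:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_work_fn(struct work_struct *work)
{
	/* runs in keventd (workqueue) context */
}

static DECLARE_DELAYED_WORK(my_work, my_work_fn);

static void kick_it(void)
{
	/* queue on the shared keventd workqueue, roughly one second from now */
	schedule_delayed_work(&my_work, HZ);
}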