Diffstat (limited to 'kernel')
82 files changed, 7504 insertions, 3117 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c584c55a6e9..188c43223f52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -8,10 +8,10 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
| 8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
| 9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
| 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
| 11 | hrtimer.o rwsem.o nsproxy.o srcu.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
| 12 | notifier.o ksysfs.o pm_qos_params.o | 12 | notifier.o ksysfs.o pm_qos_params.o |
| 13 | 13 | ||
| 14 | obj-$(CONFIG_SYSCTL) += sysctl_check.o | 14 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o |
| 15 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 15 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
| 16 | obj-y += time/ | 16 | obj-y += time/ |
| 17 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | 17 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o |
| @@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | |||
| 53 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 53 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
| 54 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | 54 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o |
| 55 | obj-$(CONFIG_KPROBES) += kprobes.o | 55 | obj-$(CONFIG_KPROBES) += kprobes.o |
| 56 | obj-$(CONFIG_KGDB) += kgdb.o | ||
| 56 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | 57 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o |
| 57 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 58 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
| 58 | obj-$(CONFIG_SECCOMP) += seccomp.o | 59 | obj-$(CONFIG_SECCOMP) += seccomp.o |
diff --git a/kernel/audit.c b/kernel/audit.c
index be55cb503633..b7d3709cc452 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -21,7 +21,7 @@ | |||
| 21 | * | 21 | * |
| 22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> | 22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> |
| 23 | * | 23 | * |
| 24 | * Goals: 1) Integrate fully with SELinux. | 24 | * Goals: 1) Integrate fully with Security Modules. |
| 25 | * 2) Minimal run-time overhead: | 25 | * 2) Minimal run-time overhead: |
| 26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). | 26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). |
| 27 | * b) Small when syscall auditing is enabled and no audit record | 27 | * b) Small when syscall auditing is enabled and no audit record |
| @@ -55,7 +55,6 @@ | |||
| 55 | #include <net/netlink.h> | 55 | #include <net/netlink.h> |
| 56 | #include <linux/skbuff.h> | 56 | #include <linux/skbuff.h> |
| 57 | #include <linux/netlink.h> | 57 | #include <linux/netlink.h> |
| 58 | #include <linux/selinux.h> | ||
| 59 | #include <linux/inotify.h> | 58 | #include <linux/inotify.h> |
| 60 | #include <linux/freezer.h> | 59 | #include <linux/freezer.h> |
| 61 | #include <linux/tty.h> | 60 | #include <linux/tty.h> |
| @@ -127,6 +126,8 @@ static int audit_freelist_count; | |||
| 127 | static LIST_HEAD(audit_freelist); | 126 | static LIST_HEAD(audit_freelist); |
| 128 | 127 | ||
| 129 | static struct sk_buff_head audit_skb_queue; | 128 | static struct sk_buff_head audit_skb_queue; |
| 129 | /* queue of skbs to send to auditd when/if it comes back */ | ||
| 130 | static struct sk_buff_head audit_skb_hold_queue; | ||
| 130 | static struct task_struct *kauditd_task; | 131 | static struct task_struct *kauditd_task; |
| 131 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); | 132 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); |
| 132 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); | 133 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); |
| @@ -155,6 +156,11 @@ struct audit_buffer { | |||
| 155 | gfp_t gfp_mask; | 156 | gfp_t gfp_mask; |
| 156 | }; | 157 | }; |
| 157 | 158 | ||
| 159 | struct audit_reply { | ||
| 160 | int pid; | ||
| 161 | struct sk_buff *skb; | ||
| 162 | }; | ||
| 163 | |||
| 158 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 164 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
| 159 | { | 165 | { |
| 160 | if (ab) { | 166 | if (ab) { |
| @@ -253,25 +259,26 @@ void audit_log_lost(const char *message) | |||
| 253 | } | 259 | } |
| 254 | 260 | ||
| 255 | static int audit_log_config_change(char *function_name, int new, int old, | 261 | static int audit_log_config_change(char *function_name, int new, int old, |
| 256 | uid_t loginuid, u32 sid, int allow_changes) | 262 | uid_t loginuid, u32 sessionid, u32 sid, |
| 263 | int allow_changes) | ||
| 257 | { | 264 | { |
| 258 | struct audit_buffer *ab; | 265 | struct audit_buffer *ab; |
| 259 | int rc = 0; | 266 | int rc = 0; |
| 260 | 267 | ||
| 261 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 268 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
| 262 | audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new, | 269 | audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, |
| 263 | old, loginuid); | 270 | old, loginuid, sessionid); |
| 264 | if (sid) { | 271 | if (sid) { |
| 265 | char *ctx = NULL; | 272 | char *ctx = NULL; |
| 266 | u32 len; | 273 | u32 len; |
| 267 | 274 | ||
| 268 | rc = selinux_sid_to_string(sid, &ctx, &len); | 275 | rc = security_secid_to_secctx(sid, &ctx, &len); |
| 269 | if (rc) { | 276 | if (rc) { |
| 270 | audit_log_format(ab, " sid=%u", sid); | 277 | audit_log_format(ab, " sid=%u", sid); |
| 271 | allow_changes = 0; /* Something weird, deny request */ | 278 | allow_changes = 0; /* Something weird, deny request */ |
| 272 | } else { | 279 | } else { |
| 273 | audit_log_format(ab, " subj=%s", ctx); | 280 | audit_log_format(ab, " subj=%s", ctx); |
| 274 | kfree(ctx); | 281 | security_release_secctx(ctx, len); |
| 275 | } | 282 | } |
| 276 | } | 283 | } |
| 277 | audit_log_format(ab, " res=%d", allow_changes); | 284 | audit_log_format(ab, " res=%d", allow_changes); |
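A recurring change in this file: the SELinux-only selinux_sid_to_string()/kfree() pair is replaced by the LSM-neutral security_secid_to_secctx()/security_release_secctx(), so the context buffer is handed back to the security module instead of being freed directly. A minimal kernel-style sketch of the new pattern follows; it is not an excerpt from the patch, the helper name is hypothetical, and it assumes the prototypes used by this series.

#include <linux/audit.h>
#include <linux/security.h>

/* Sketch: log the subject context for @sid into @ab, returning the
 * buffer to the LSM rather than calling kfree() on it. */
static void audit_log_subj_ctx(struct audit_buffer *ab, u32 sid)
{
        char *ctx = NULL;
        u32 len;

        if (security_secid_to_secctx(sid, &ctx, &len)) {
                audit_log_format(ab, " ssid=%u", sid);  /* lookup failed */
                return;
        }
        audit_log_format(ab, " subj=%s", ctx);
        security_release_secctx(ctx, len);              /* not kfree(ctx) */
}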
| @@ -280,7 +287,8 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
| 280 | } | 287 | } |
| 281 | 288 | ||
| 282 | static int audit_do_config_change(char *function_name, int *to_change, | 289 | static int audit_do_config_change(char *function_name, int *to_change, |
| 283 | int new, uid_t loginuid, u32 sid) | 290 | int new, uid_t loginuid, u32 sessionid, |
| 291 | u32 sid) | ||
| 284 | { | 292 | { |
| 285 | int allow_changes, rc = 0, old = *to_change; | 293 | int allow_changes, rc = 0, old = *to_change; |
| 286 | 294 | ||
| @@ -291,8 +299,8 @@ static int audit_do_config_change(char *function_name, int *to_change, | |||
| 291 | allow_changes = 1; | 299 | allow_changes = 1; |
| 292 | 300 | ||
| 293 | if (audit_enabled != AUDIT_OFF) { | 301 | if (audit_enabled != AUDIT_OFF) { |
| 294 | rc = audit_log_config_change(function_name, new, old, | 302 | rc = audit_log_config_change(function_name, new, old, loginuid, |
| 295 | loginuid, sid, allow_changes); | 303 | sessionid, sid, allow_changes); |
| 296 | if (rc) | 304 | if (rc) |
| 297 | allow_changes = 0; | 305 | allow_changes = 0; |
| 298 | } | 306 | } |
| @@ -306,26 +314,28 @@ static int audit_do_config_change(char *function_name, int *to_change, | |||
| 306 | return rc; | 314 | return rc; |
| 307 | } | 315 | } |
| 308 | 316 | ||
| 309 | static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) | 317 | static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, |
| 318 | u32 sid) | ||
| 310 | { | 319 | { |
| 311 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, | 320 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, |
| 312 | limit, loginuid, sid); | 321 | limit, loginuid, sessionid, sid); |
| 313 | } | 322 | } |
| 314 | 323 | ||
| 315 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) | 324 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, |
| 325 | u32 sid) | ||
| 316 | { | 326 | { |
| 317 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, | 327 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, |
| 318 | limit, loginuid, sid); | 328 | limit, loginuid, sessionid, sid); |
| 319 | } | 329 | } |
| 320 | 330 | ||
| 321 | static int audit_set_enabled(int state, uid_t loginuid, u32 sid) | 331 | static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) |
| 322 | { | 332 | { |
| 323 | int rc; | 333 | int rc; |
| 324 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) | 334 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) |
| 325 | return -EINVAL; | 335 | return -EINVAL; |
| 326 | 336 | ||
| 327 | rc = audit_do_config_change("audit_enabled", &audit_enabled, state, | 337 | rc = audit_do_config_change("audit_enabled", &audit_enabled, state, |
| 328 | loginuid, sid); | 338 | loginuid, sessionid, sid); |
| 329 | 339 | ||
| 330 | if (!rc) | 340 | if (!rc) |
| 331 | audit_ever_enabled |= !!state; | 341 | audit_ever_enabled |= !!state; |
| @@ -333,7 +343,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) | |||
| 333 | return rc; | 343 | return rc; |
| 334 | } | 344 | } |
| 335 | 345 | ||
| 336 | static int audit_set_failure(int state, uid_t loginuid, u32 sid) | 346 | static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) |
| 337 | { | 347 | { |
| 338 | if (state != AUDIT_FAIL_SILENT | 348 | if (state != AUDIT_FAIL_SILENT |
| 339 | && state != AUDIT_FAIL_PRINTK | 349 | && state != AUDIT_FAIL_PRINTK |
| @@ -341,7 +351,43 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) | |||
| 341 | return -EINVAL; | 351 | return -EINVAL; |
| 342 | 352 | ||
| 343 | return audit_do_config_change("audit_failure", &audit_failure, state, | 353 | return audit_do_config_change("audit_failure", &audit_failure, state, |
| 344 | loginuid, sid); | 354 | loginuid, sessionid, sid); |
| 355 | } | ||
| 356 | |||
| 357 | /* | ||
| 358 | * Queue skbs to be sent to auditd when/if it comes back. These skbs should | ||
| 359 | * already have been sent via prink/syslog and so if these messages are dropped | ||
| 360 | * it is not a huge concern since we already passed the audit_log_lost() | ||
| 361 | * notification and stuff. This is just nice to get audit messages during | ||
| 362 | * boot before auditd is running or messages generated while auditd is stopped. | ||
| 363 | * This only holds messages is audit_default is set, aka booting with audit=1 | ||
| 364 | * or building your kernel that way. | ||
| 365 | */ | ||
| 366 | static void audit_hold_skb(struct sk_buff *skb) | ||
| 367 | { | ||
| 368 | if (audit_default && | ||
| 369 | skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) | ||
| 370 | skb_queue_tail(&audit_skb_hold_queue, skb); | ||
| 371 | else | ||
| 372 | kfree_skb(skb); | ||
| 373 | } | ||
| 374 | |||
| 375 | static void kauditd_send_skb(struct sk_buff *skb) | ||
| 376 | { | ||
| 377 | int err; | ||
| 378 | /* take a reference in case we can't send it and we want to hold it */ | ||
| 379 | skb_get(skb); | ||
| 380 | err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); | ||
| 381 | if (err < 0) { | ||
| 382 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ | ||
| 383 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | ||
| 384 | audit_log_lost("auditd dissapeared\n"); | ||
| 385 | audit_pid = 0; | ||
| 386 | /* we might get lucky and get this in the next auditd */ | ||
| 387 | audit_hold_skb(skb); | ||
| 388 | } else | ||
| 389 | /* drop the extra reference if sent ok */ | ||
| 390 | kfree_skb(skb); | ||
| 345 | } | 391 | } |
| 346 | 392 | ||
| 347 | static int kauditd_thread(void *dummy) | 393 | static int kauditd_thread(void *dummy) |
| @@ -350,24 +396,41 @@ static int kauditd_thread(void *dummy) | |||
| 350 | 396 | ||
| 351 | set_freezable(); | 397 | set_freezable(); |
| 352 | while (!kthread_should_stop()) { | 398 | while (!kthread_should_stop()) { |
| 399 | /* | ||
| 400 | * if auditd just started drain the queue of messages already | ||
| 401 | * sent to syslog/printk. remember loss here is ok. we already | ||
| 402 | * called audit_log_lost() if it didn't go out normally. so the | ||
| 403 | * race between the skb_dequeue and the next check for audit_pid | ||
| 404 | * doesn't matter. | ||
| 405 | * | ||
| 406 | * if you ever find kauditd to be too slow we can get a perf win | ||
| 407 | * by doing our own locking and keeping better track if there | ||
| 408 | * are messages in this queue. I don't see the need now, but | ||
| 409 | * in 5 years when I want to play with this again I'll see this | ||
| 410 | * note and still have no friggin idea what i'm thinking today. | ||
| 411 | */ | ||
| 412 | if (audit_default && audit_pid) { | ||
| 413 | skb = skb_dequeue(&audit_skb_hold_queue); | ||
| 414 | if (unlikely(skb)) { | ||
| 415 | while (skb && audit_pid) { | ||
| 416 | kauditd_send_skb(skb); | ||
| 417 | skb = skb_dequeue(&audit_skb_hold_queue); | ||
| 418 | } | ||
| 419 | } | ||
| 420 | } | ||
| 421 | |||
| 353 | skb = skb_dequeue(&audit_skb_queue); | 422 | skb = skb_dequeue(&audit_skb_queue); |
| 354 | wake_up(&audit_backlog_wait); | 423 | wake_up(&audit_backlog_wait); |
| 355 | if (skb) { | 424 | if (skb) { |
| 356 | if (audit_pid) { | 425 | if (audit_pid) |
| 357 | int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); | 426 | kauditd_send_skb(skb); |
| 358 | if (err < 0) { | 427 | else { |
| 359 | BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ | ||
| 360 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | ||
| 361 | audit_log_lost("auditd dissapeared\n"); | ||
| 362 | audit_pid = 0; | ||
| 363 | } | ||
| 364 | } else { | ||
| 365 | if (printk_ratelimit()) | 428 | if (printk_ratelimit()) |
| 366 | printk(KERN_NOTICE "%s\n", skb->data + | 429 | printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); |
| 367 | NLMSG_SPACE(0)); | ||
| 368 | else | 430 | else |
| 369 | audit_log_lost("printk limit exceeded\n"); | 431 | audit_log_lost("printk limit exceeded\n"); |
| 370 | kfree_skb(skb); | 432 | |
| 433 | audit_hold_skb(skb); | ||
| 371 | } | 434 | } |
| 372 | } else { | 435 | } else { |
| 373 | DECLARE_WAITQUEUE(wait, current); | 436 | DECLARE_WAITQUEUE(wait, current); |
| @@ -386,13 +449,13 @@ static int kauditd_thread(void *dummy) | |||
| 386 | return 0; | 449 | return 0; |
| 387 | } | 450 | } |
| 388 | 451 | ||
| 389 | static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) | 452 | static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) |
| 390 | { | 453 | { |
| 391 | struct task_struct *tsk; | 454 | struct task_struct *tsk; |
| 392 | int err; | 455 | int err; |
| 393 | 456 | ||
| 394 | read_lock(&tasklist_lock); | 457 | read_lock(&tasklist_lock); |
| 395 | tsk = find_task_by_pid(pid); | 458 | tsk = find_task_by_vpid(pid); |
| 396 | err = -ESRCH; | 459 | err = -ESRCH; |
| 397 | if (!tsk) | 460 | if (!tsk) |
| 398 | goto out; | 461 | goto out; |
| @@ -405,7 +468,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) | |||
| 405 | if (err) | 468 | if (err) |
| 406 | goto out; | 469 | goto out; |
| 407 | 470 | ||
| 408 | tty_audit_push_task(tsk, loginuid); | 471 | tty_audit_push_task(tsk, loginuid, sessionid); |
| 409 | out: | 472 | out: |
| 410 | read_unlock(&tasklist_lock); | 473 | read_unlock(&tasklist_lock); |
| 411 | return err; | 474 | return err; |
| @@ -470,6 +533,19 @@ nlmsg_failure: /* Used by NLMSG_PUT */ | |||
| 470 | return NULL; | 533 | return NULL; |
| 471 | } | 534 | } |
| 472 | 535 | ||
| 536 | static int audit_send_reply_thread(void *arg) | ||
| 537 | { | ||
| 538 | struct audit_reply *reply = (struct audit_reply *)arg; | ||
| 539 | |||
| 540 | mutex_lock(&audit_cmd_mutex); | ||
| 541 | mutex_unlock(&audit_cmd_mutex); | ||
| 542 | |||
| 543 | /* Ignore failure. It'll only happen if the sender goes away, | ||
| 544 | because our timeout is set to infinite. */ | ||
| 545 | netlink_unicast(audit_sock, reply->skb, reply->pid, 0); | ||
| 546 | kfree(reply); | ||
| 547 | return 0; | ||
| 548 | } | ||
| 473 | /** | 549 | /** |
| 474 | * audit_send_reply - send an audit reply message via netlink | 550 | * audit_send_reply - send an audit reply message via netlink |
| 475 | * @pid: process id to send reply to | 551 | * @pid: process id to send reply to |
| @@ -486,14 +562,26 @@ nlmsg_failure: /* Used by NLMSG_PUT */ | |||
| 486 | void audit_send_reply(int pid, int seq, int type, int done, int multi, | 562 | void audit_send_reply(int pid, int seq, int type, int done, int multi, |
| 487 | void *payload, int size) | 563 | void *payload, int size) |
| 488 | { | 564 | { |
| 489 | struct sk_buff *skb; | 565 | struct sk_buff *skb; |
| 566 | struct task_struct *tsk; | ||
| 567 | struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), | ||
| 568 | GFP_KERNEL); | ||
| 569 | |||
| 570 | if (!reply) | ||
| 571 | return; | ||
| 572 | |||
| 490 | skb = audit_make_reply(pid, seq, type, done, multi, payload, size); | 573 | skb = audit_make_reply(pid, seq, type, done, multi, payload, size); |
| 491 | if (!skb) | 574 | if (!skb) |
| 492 | return; | 575 | return; |
| 493 | /* Ignore failure. It'll only happen if the sender goes away, | 576 | |
| 494 | because our timeout is set to infinite. */ | 577 | reply->pid = pid; |
| 495 | netlink_unicast(audit_sock, skb, pid, 0); | 578 | reply->skb = skb; |
| 496 | return; | 579 | |
| 580 | tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); | ||
| 581 | if (IS_ERR(tsk)) { | ||
| 582 | kfree(reply); | ||
| 583 | kfree_skb(skb); | ||
| 584 | } | ||
| 497 | } | 585 | } |
| 498 | 586 | ||
| 499 | /* | 587 | /* |
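audit_send_reply() now defers the netlink unicast to a short-lived kthread instead of sending from audit_receive_msg(), which runs with audit_cmd_mutex held and can block when the requester's socket buffer is full. The empty lock/unlock pair in audit_send_reply_thread() reads as a completion barrier: the reply goes out only after the command that asked for it has finished and released the mutex. A small userspace illustration of that idiom (all names hypothetical, not from the patch):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t cmd_mutex = PTHREAD_MUTEX_INITIALIZER;

/* The lock/unlock pair does no work of its own; it simply waits until
 * the current holder of cmd_mutex (the command dispatcher) is done. */
static void *reply_worker(void *msg)
{
        pthread_mutex_lock(&cmd_mutex);
        pthread_mutex_unlock(&cmd_mutex);
        printf("reply delivered after dispatch completed: %s\n", (char *)msg);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_mutex_lock(&cmd_mutex);         /* dispatcher starts handling a command */
        pthread_create(&t, NULL, reply_worker, "ACK");
        sleep(1);                               /* ...still processing the command... */
        pthread_mutex_unlock(&cmd_mutex);       /* reply may be sent now */
        pthread_join(t, NULL);
        return 0;
}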
| @@ -535,7 +623,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
| 535 | } | 623 | } |
| 536 | 624 | ||
| 537 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | 625 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, |
| 538 | u32 pid, u32 uid, uid_t auid, u32 sid) | 626 | u32 pid, u32 uid, uid_t auid, u32 ses, |
| 627 | u32 sid) | ||
| 539 | { | 628 | { |
| 540 | int rc = 0; | 629 | int rc = 0; |
| 541 | char *ctx = NULL; | 630 | char *ctx = NULL; |
| @@ -547,15 +636,16 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
| 547 | } | 636 | } |
| 548 | 637 | ||
| 549 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 638 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
| 550 | audit_log_format(*ab, "user pid=%d uid=%u auid=%u", | 639 | audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", |
| 551 | pid, uid, auid); | 640 | pid, uid, auid, ses); |
| 552 | if (sid) { | 641 | if (sid) { |
| 553 | rc = selinux_sid_to_string(sid, &ctx, &len); | 642 | rc = security_secid_to_secctx(sid, &ctx, &len); |
| 554 | if (rc) | 643 | if (rc) |
| 555 | audit_log_format(*ab, " ssid=%u", sid); | 644 | audit_log_format(*ab, " ssid=%u", sid); |
| 556 | else | 645 | else { |
| 557 | audit_log_format(*ab, " subj=%s", ctx); | 646 | audit_log_format(*ab, " subj=%s", ctx); |
| 558 | kfree(ctx); | 647 | security_release_secctx(ctx, len); |
| 648 | } | ||
| 559 | } | 649 | } |
| 560 | 650 | ||
| 561 | return rc; | 651 | return rc; |
| @@ -570,6 +660,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 570 | struct audit_buffer *ab; | 660 | struct audit_buffer *ab; |
| 571 | u16 msg_type = nlh->nlmsg_type; | 661 | u16 msg_type = nlh->nlmsg_type; |
| 572 | uid_t loginuid; /* loginuid of sender */ | 662 | uid_t loginuid; /* loginuid of sender */ |
| 663 | u32 sessionid; | ||
| 573 | struct audit_sig_info *sig_data; | 664 | struct audit_sig_info *sig_data; |
| 574 | char *ctx = NULL; | 665 | char *ctx = NULL; |
| 575 | u32 len; | 666 | u32 len; |
| @@ -591,6 +682,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 591 | pid = NETLINK_CREDS(skb)->pid; | 682 | pid = NETLINK_CREDS(skb)->pid; |
| 592 | uid = NETLINK_CREDS(skb)->uid; | 683 | uid = NETLINK_CREDS(skb)->uid; |
| 593 | loginuid = NETLINK_CB(skb).loginuid; | 684 | loginuid = NETLINK_CB(skb).loginuid; |
| 685 | sessionid = NETLINK_CB(skb).sessionid; | ||
| 594 | sid = NETLINK_CB(skb).sid; | 686 | sid = NETLINK_CB(skb).sid; |
| 595 | seq = nlh->nlmsg_seq; | 687 | seq = nlh->nlmsg_seq; |
| 596 | data = NLMSG_DATA(nlh); | 688 | data = NLMSG_DATA(nlh); |
| @@ -613,12 +705,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 613 | status_get = (struct audit_status *)data; | 705 | status_get = (struct audit_status *)data; |
| 614 | if (status_get->mask & AUDIT_STATUS_ENABLED) { | 706 | if (status_get->mask & AUDIT_STATUS_ENABLED) { |
| 615 | err = audit_set_enabled(status_get->enabled, | 707 | err = audit_set_enabled(status_get->enabled, |
| 616 | loginuid, sid); | 708 | loginuid, sessionid, sid); |
| 617 | if (err < 0) return err; | 709 | if (err < 0) return err; |
| 618 | } | 710 | } |
| 619 | if (status_get->mask & AUDIT_STATUS_FAILURE) { | 711 | if (status_get->mask & AUDIT_STATUS_FAILURE) { |
| 620 | err = audit_set_failure(status_get->failure, | 712 | err = audit_set_failure(status_get->failure, |
| 621 | loginuid, sid); | 713 | loginuid, sessionid, sid); |
| 622 | if (err < 0) return err; | 714 | if (err < 0) return err; |
| 623 | } | 715 | } |
| 624 | if (status_get->mask & AUDIT_STATUS_PID) { | 716 | if (status_get->mask & AUDIT_STATUS_PID) { |
| @@ -627,17 +719,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 627 | if (audit_enabled != AUDIT_OFF) | 719 | if (audit_enabled != AUDIT_OFF) |
| 628 | audit_log_config_change("audit_pid", new_pid, | 720 | audit_log_config_change("audit_pid", new_pid, |
| 629 | audit_pid, loginuid, | 721 | audit_pid, loginuid, |
| 630 | sid, 1); | 722 | sessionid, sid, 1); |
| 631 | 723 | ||
| 632 | audit_pid = new_pid; | 724 | audit_pid = new_pid; |
| 633 | audit_nlk_pid = NETLINK_CB(skb).pid; | 725 | audit_nlk_pid = NETLINK_CB(skb).pid; |
| 634 | } | 726 | } |
| 635 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 727 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) |
| 636 | err = audit_set_rate_limit(status_get->rate_limit, | 728 | err = audit_set_rate_limit(status_get->rate_limit, |
| 637 | loginuid, sid); | 729 | loginuid, sessionid, sid); |
| 638 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | 730 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) |
| 639 | err = audit_set_backlog_limit(status_get->backlog_limit, | 731 | err = audit_set_backlog_limit(status_get->backlog_limit, |
| 640 | loginuid, sid); | 732 | loginuid, sessionid, sid); |
| 641 | break; | 733 | break; |
| 642 | case AUDIT_USER: | 734 | case AUDIT_USER: |
| 643 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: | 735 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: |
| @@ -649,12 +741,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 649 | if (err == 1) { | 741 | if (err == 1) { |
| 650 | err = 0; | 742 | err = 0; |
| 651 | if (msg_type == AUDIT_USER_TTY) { | 743 | if (msg_type == AUDIT_USER_TTY) { |
| 652 | err = audit_prepare_user_tty(pid, loginuid); | 744 | err = audit_prepare_user_tty(pid, loginuid, |
| 745 | sessionid); | ||
| 653 | if (err) | 746 | if (err) |
| 654 | break; | 747 | break; |
| 655 | } | 748 | } |
| 656 | audit_log_common_recv_msg(&ab, msg_type, pid, uid, | 749 | audit_log_common_recv_msg(&ab, msg_type, pid, uid, |
| 657 | loginuid, sid); | 750 | loginuid, sessionid, sid); |
| 658 | 751 | ||
| 659 | if (msg_type != AUDIT_USER_TTY) | 752 | if (msg_type != AUDIT_USER_TTY) |
| 660 | audit_log_format(ab, " msg='%.1024s'", | 753 | audit_log_format(ab, " msg='%.1024s'", |
| @@ -664,8 +757,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 664 | 757 | ||
| 665 | audit_log_format(ab, " msg="); | 758 | audit_log_format(ab, " msg="); |
| 666 | size = nlmsg_len(nlh); | 759 | size = nlmsg_len(nlh); |
| 667 | audit_log_n_untrustedstring(ab, size, | 760 | audit_log_n_untrustedstring(ab, data, size); |
| 668 | data); | ||
| 669 | } | 761 | } |
| 670 | audit_set_pid(ab, pid); | 762 | audit_set_pid(ab, pid); |
| 671 | audit_log_end(ab); | 763 | audit_log_end(ab); |
| @@ -677,7 +769,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 677 | return -EINVAL; | 769 | return -EINVAL; |
| 678 | if (audit_enabled == AUDIT_LOCKED) { | 770 | if (audit_enabled == AUDIT_LOCKED) { |
| 679 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 771 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, |
| 680 | uid, loginuid, sid); | 772 | uid, loginuid, sessionid, sid); |
| 681 | 773 | ||
| 682 | audit_log_format(ab, " audit_enabled=%d res=0", | 774 | audit_log_format(ab, " audit_enabled=%d res=0", |
| 683 | audit_enabled); | 775 | audit_enabled); |
| @@ -688,7 +780,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 688 | case AUDIT_LIST: | 780 | case AUDIT_LIST: |
| 689 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, | 781 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, |
| 690 | uid, seq, data, nlmsg_len(nlh), | 782 | uid, seq, data, nlmsg_len(nlh), |
| 691 | loginuid, sid); | 783 | loginuid, sessionid, sid); |
| 692 | break; | 784 | break; |
| 693 | case AUDIT_ADD_RULE: | 785 | case AUDIT_ADD_RULE: |
| 694 | case AUDIT_DEL_RULE: | 786 | case AUDIT_DEL_RULE: |
| @@ -696,7 +788,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 696 | return -EINVAL; | 788 | return -EINVAL; |
| 697 | if (audit_enabled == AUDIT_LOCKED) { | 789 | if (audit_enabled == AUDIT_LOCKED) { |
| 698 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 790 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, |
| 699 | uid, loginuid, sid); | 791 | uid, loginuid, sessionid, sid); |
| 700 | 792 | ||
| 701 | audit_log_format(ab, " audit_enabled=%d res=0", | 793 | audit_log_format(ab, " audit_enabled=%d res=0", |
| 702 | audit_enabled); | 794 | audit_enabled); |
| @@ -707,13 +799,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 707 | case AUDIT_LIST_RULES: | 799 | case AUDIT_LIST_RULES: |
| 708 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, | 800 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, |
| 709 | uid, seq, data, nlmsg_len(nlh), | 801 | uid, seq, data, nlmsg_len(nlh), |
| 710 | loginuid, sid); | 802 | loginuid, sessionid, sid); |
| 711 | break; | 803 | break; |
| 712 | case AUDIT_TRIM: | 804 | case AUDIT_TRIM: |
| 713 | audit_trim_trees(); | 805 | audit_trim_trees(); |
| 714 | 806 | ||
| 715 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 807 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, |
| 716 | uid, loginuid, sid); | 808 | uid, loginuid, sessionid, sid); |
| 717 | 809 | ||
| 718 | audit_log_format(ab, " op=trim res=1"); | 810 | audit_log_format(ab, " op=trim res=1"); |
| 719 | audit_log_end(ab); | 811 | audit_log_end(ab); |
| @@ -721,21 +813,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 721 | case AUDIT_MAKE_EQUIV: { | 813 | case AUDIT_MAKE_EQUIV: { |
| 722 | void *bufp = data; | 814 | void *bufp = data; |
| 723 | u32 sizes[2]; | 815 | u32 sizes[2]; |
| 724 | size_t len = nlmsg_len(nlh); | 816 | size_t msglen = nlmsg_len(nlh); |
| 725 | char *old, *new; | 817 | char *old, *new; |
| 726 | 818 | ||
| 727 | err = -EINVAL; | 819 | err = -EINVAL; |
| 728 | if (len < 2 * sizeof(u32)) | 820 | if (msglen < 2 * sizeof(u32)) |
| 729 | break; | 821 | break; |
| 730 | memcpy(sizes, bufp, 2 * sizeof(u32)); | 822 | memcpy(sizes, bufp, 2 * sizeof(u32)); |
| 731 | bufp += 2 * sizeof(u32); | 823 | bufp += 2 * sizeof(u32); |
| 732 | len -= 2 * sizeof(u32); | 824 | msglen -= 2 * sizeof(u32); |
| 733 | old = audit_unpack_string(&bufp, &len, sizes[0]); | 825 | old = audit_unpack_string(&bufp, &msglen, sizes[0]); |
| 734 | if (IS_ERR(old)) { | 826 | if (IS_ERR(old)) { |
| 735 | err = PTR_ERR(old); | 827 | err = PTR_ERR(old); |
| 736 | break; | 828 | break; |
| 737 | } | 829 | } |
| 738 | new = audit_unpack_string(&bufp, &len, sizes[1]); | 830 | new = audit_unpack_string(&bufp, &msglen, sizes[1]); |
| 739 | if (IS_ERR(new)) { | 831 | if (IS_ERR(new)) { |
| 740 | err = PTR_ERR(new); | 832 | err = PTR_ERR(new); |
| 741 | kfree(old); | 833 | kfree(old); |
| @@ -745,7 +837,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 745 | err = audit_tag_tree(old, new); | 837 | err = audit_tag_tree(old, new); |
| 746 | 838 | ||
| 747 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 839 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, |
| 748 | uid, loginuid, sid); | 840 | uid, loginuid, sessionid, sid); |
| 749 | 841 | ||
| 750 | audit_log_format(ab, " op=make_equiv old="); | 842 | audit_log_format(ab, " op=make_equiv old="); |
| 751 | audit_log_untrustedstring(ab, old); | 843 | audit_log_untrustedstring(ab, old); |
| @@ -758,18 +850,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 758 | break; | 850 | break; |
| 759 | } | 851 | } |
| 760 | case AUDIT_SIGNAL_INFO: | 852 | case AUDIT_SIGNAL_INFO: |
| 761 | err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); | 853 | err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); |
| 762 | if (err) | 854 | if (err) |
| 763 | return err; | 855 | return err; |
| 764 | sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); | 856 | sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); |
| 765 | if (!sig_data) { | 857 | if (!sig_data) { |
| 766 | kfree(ctx); | 858 | security_release_secctx(ctx, len); |
| 767 | return -ENOMEM; | 859 | return -ENOMEM; |
| 768 | } | 860 | } |
| 769 | sig_data->uid = audit_sig_uid; | 861 | sig_data->uid = audit_sig_uid; |
| 770 | sig_data->pid = audit_sig_pid; | 862 | sig_data->pid = audit_sig_pid; |
| 771 | memcpy(sig_data->ctx, ctx, len); | 863 | memcpy(sig_data->ctx, ctx, len); |
| 772 | kfree(ctx); | 864 | security_release_secctx(ctx, len); |
| 773 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, | 865 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, |
| 774 | 0, 0, sig_data, sizeof(*sig_data) + len); | 866 | 0, 0, sig_data, sizeof(*sig_data) + len); |
| 775 | kfree(sig_data); | 867 | kfree(sig_data); |
| @@ -779,7 +871,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 779 | struct task_struct *tsk; | 871 | struct task_struct *tsk; |
| 780 | 872 | ||
| 781 | read_lock(&tasklist_lock); | 873 | read_lock(&tasklist_lock); |
| 782 | tsk = find_task_by_pid(pid); | 874 | tsk = find_task_by_vpid(pid); |
| 783 | if (!tsk) | 875 | if (!tsk) |
| 784 | err = -ESRCH; | 876 | err = -ESRCH; |
| 785 | else { | 877 | else { |
| @@ -802,7 +894,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 802 | if (s->enabled != 0 && s->enabled != 1) | 894 | if (s->enabled != 0 && s->enabled != 1) |
| 803 | return -EINVAL; | 895 | return -EINVAL; |
| 804 | read_lock(&tasklist_lock); | 896 | read_lock(&tasklist_lock); |
| 805 | tsk = find_task_by_pid(pid); | 897 | tsk = find_task_by_vpid(pid); |
| 806 | if (!tsk) | 898 | if (!tsk) |
| 807 | err = -ESRCH; | 899 | err = -ESRCH; |
| 808 | else { | 900 | else { |
| @@ -877,14 +969,11 @@ static int __init audit_init(void) | |||
| 877 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; | 969 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; |
| 878 | 970 | ||
| 879 | skb_queue_head_init(&audit_skb_queue); | 971 | skb_queue_head_init(&audit_skb_queue); |
| 972 | skb_queue_head_init(&audit_skb_hold_queue); | ||
| 880 | audit_initialized = 1; | 973 | audit_initialized = 1; |
| 881 | audit_enabled = audit_default; | 974 | audit_enabled = audit_default; |
| 882 | audit_ever_enabled |= !!audit_default; | 975 | audit_ever_enabled |= !!audit_default; |
| 883 | 976 | ||
| 884 | /* Register the callback with selinux. This callback will be invoked | ||
| 885 | * when a new policy is loaded. */ | ||
| 886 | selinux_audit_set_callback(&selinux_audit_rule_update); | ||
| 887 | |||
| 888 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); | 977 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); |
| 889 | 978 | ||
| 890 | #ifdef CONFIG_AUDITSYSCALL | 979 | #ifdef CONFIG_AUDITSYSCALL |
| @@ -1203,7 +1292,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) | |||
| 1203 | * This function will take the passed buf and convert it into a string of | 1292 | * This function will take the passed buf and convert it into a string of |
| 1204 | * ascii hex digits. The new string is placed onto the skb. | 1293 | * ascii hex digits. The new string is placed onto the skb. |
| 1205 | */ | 1294 | */ |
| 1206 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | 1295 | void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, |
| 1207 | size_t len) | 1296 | size_t len) |
| 1208 | { | 1297 | { |
| 1209 | int i, avail, new_len; | 1298 | int i, avail, new_len; |
| @@ -1239,8 +1328,8 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | |||
| 1239 | * Format a string of no more than slen characters into the audit buffer, | 1328 | * Format a string of no more than slen characters into the audit buffer, |
| 1240 | * enclosed in quote marks. | 1329 | * enclosed in quote marks. |
| 1241 | */ | 1330 | */ |
| 1242 | static void audit_log_n_string(struct audit_buffer *ab, size_t slen, | 1331 | void audit_log_n_string(struct audit_buffer *ab, const char *string, |
| 1243 | const char *string) | 1332 | size_t slen) |
| 1244 | { | 1333 | { |
| 1245 | int avail, new_len; | 1334 | int avail, new_len; |
| 1246 | unsigned char *ptr; | 1335 | unsigned char *ptr; |
| @@ -1269,8 +1358,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, | |||
| 1269 | 1358 | ||
| 1270 | /** | 1359 | /** |
| 1271 | * audit_string_contains_control - does a string need to be logged in hex | 1360 | * audit_string_contains_control - does a string need to be logged in hex |
| 1272 | * @string - string to be checked | 1361 | * @string: string to be checked |
| 1273 | * @len - max length of the string to check | 1362 | * @len: max length of the string to check |
| 1274 | */ | 1363 | */ |
| 1275 | int audit_string_contains_control(const char *string, size_t len) | 1364 | int audit_string_contains_control(const char *string, size_t len) |
| 1276 | { | 1365 | { |
| @@ -1285,7 +1374,7 @@ int audit_string_contains_control(const char *string, size_t len) | |||
| 1285 | /** | 1374 | /** |
| 1286 | * audit_log_n_untrustedstring - log a string that may contain random characters | 1375 | * audit_log_n_untrustedstring - log a string that may contain random characters |
| 1287 | * @ab: audit_buffer | 1376 | * @ab: audit_buffer |
| 1288 | * @len: lenth of string (not including trailing null) | 1377 | * @len: length of string (not including trailing null) |
| 1289 | * @string: string to be logged | 1378 | * @string: string to be logged |
| 1290 | * | 1379 | * |
| 1291 | * This code will escape a string that is passed to it if the string | 1380 | * This code will escape a string that is passed to it if the string |
| @@ -1296,13 +1385,13 @@ int audit_string_contains_control(const char *string, size_t len) | |||
| 1296 | * The caller specifies the number of characters in the string to log, which may | 1385 | * The caller specifies the number of characters in the string to log, which may |
| 1297 | * or may not be the entire string. | 1386 | * or may not be the entire string. |
| 1298 | */ | 1387 | */ |
| 1299 | void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, | 1388 | void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, |
| 1300 | const char *string) | 1389 | size_t len) |
| 1301 | { | 1390 | { |
| 1302 | if (audit_string_contains_control(string, len)) | 1391 | if (audit_string_contains_control(string, len)) |
| 1303 | audit_log_hex(ab, string, len); | 1392 | audit_log_n_hex(ab, string, len); |
| 1304 | else | 1393 | else |
| 1305 | audit_log_n_string(ab, len, string); | 1394 | audit_log_n_string(ab, string, len); |
| 1306 | } | 1395 | } |
| 1307 | 1396 | ||
| 1308 | /** | 1397 | /** |
| @@ -1315,7 +1404,7 @@ void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, | |||
| 1315 | */ | 1404 | */ |
| 1316 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | 1405 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) |
| 1317 | { | 1406 | { |
| 1318 | audit_log_n_untrustedstring(ab, strlen(string), string); | 1407 | audit_log_n_untrustedstring(ab, string, strlen(string)); |
| 1319 | } | 1408 | } |
| 1320 | 1409 | ||
| 1321 | /* This is a helper-function to print the escaped d_path */ | 1410 | /* This is a helper-function to print the escaped d_path */ |
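This block renames audit_log_hex() to audit_log_n_hex(), makes audit_log_n_string() non-static, and flips the argument order of the string helpers to (buffer, string, length). A short sketch of a call site under the new signatures, modelled on the AUDIT_USER_TTY path earlier in the patch; the wrapper name is hypothetical.

#include <linux/audit.h>

/* Sketch: log an untrusted, possibly binary buffer of @size bytes. */
static void audit_log_user_msg(struct audit_buffer *ab,
                               const char *data, size_t size)
{
        audit_log_format(ab, " msg=");
        /* falls back to hex if control characters are present; note the
         * string now comes before its length (old order was len, string) */
        audit_log_n_untrustedstring(ab, data, size);
}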
| @@ -1359,19 +1448,23 @@ void audit_log_end(struct audit_buffer *ab) | |||
| 1359 | audit_log_lost("rate limit exceeded"); | 1448 | audit_log_lost("rate limit exceeded"); |
| 1360 | } else { | 1449 | } else { |
| 1361 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 1450 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); |
| 1451 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); | ||
| 1452 | |||
| 1362 | if (audit_pid) { | 1453 | if (audit_pid) { |
| 1363 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); | ||
| 1364 | skb_queue_tail(&audit_skb_queue, ab->skb); | 1454 | skb_queue_tail(&audit_skb_queue, ab->skb); |
| 1365 | ab->skb = NULL; | ||
| 1366 | wake_up_interruptible(&kauditd_wait); | 1455 | wake_up_interruptible(&kauditd_wait); |
| 1367 | } else if (nlh->nlmsg_type != AUDIT_EOE) { | 1456 | } else { |
| 1368 | if (printk_ratelimit()) { | 1457 | if (nlh->nlmsg_type != AUDIT_EOE) { |
| 1369 | printk(KERN_NOTICE "type=%d %s\n", | 1458 | if (printk_ratelimit()) { |
| 1370 | nlh->nlmsg_type, | 1459 | printk(KERN_NOTICE "type=%d %s\n", |
| 1371 | ab->skb->data + NLMSG_SPACE(0)); | 1460 | nlh->nlmsg_type, |
| 1372 | } else | 1461 | ab->skb->data + NLMSG_SPACE(0)); |
| 1373 | audit_log_lost("printk limit exceeded\n"); | 1462 | } else |
| 1463 | audit_log_lost("printk limit exceeded\n"); | ||
| 1464 | } | ||
| 1465 | audit_hold_skb(ab->skb); | ||
| 1374 | } | 1466 | } |
| 1467 | ab->skb = NULL; | ||
| 1375 | } | 1468 | } |
| 1376 | audit_buffer_free(ab); | 1469 | audit_buffer_free(ab); |
| 1377 | } | 1470 | } |
diff --git a/kernel/audit.h b/kernel/audit.h
index 2554bd524fd1..9d6717412fec 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
| @@ -65,40 +65,20 @@ struct audit_watch { | |||
| 65 | struct list_head rules; /* associated rules */ | 65 | struct list_head rules; /* associated rules */ |
| 66 | }; | 66 | }; |
| 67 | 67 | ||
| 68 | struct audit_field { | ||
| 69 | u32 type; | ||
| 70 | u32 val; | ||
| 71 | u32 op; | ||
| 72 | char *se_str; | ||
| 73 | struct selinux_audit_rule *se_rule; | ||
| 74 | }; | ||
| 75 | |||
| 76 | struct audit_tree; | 68 | struct audit_tree; |
| 77 | struct audit_chunk; | 69 | struct audit_chunk; |
| 78 | 70 | ||
| 79 | struct audit_krule { | ||
| 80 | int vers_ops; | ||
| 81 | u32 flags; | ||
| 82 | u32 listnr; | ||
| 83 | u32 action; | ||
| 84 | u32 mask[AUDIT_BITMASK_SIZE]; | ||
| 85 | u32 buflen; /* for data alloc on list rules */ | ||
| 86 | u32 field_count; | ||
| 87 | char *filterkey; /* ties events to rules */ | ||
| 88 | struct audit_field *fields; | ||
| 89 | struct audit_field *arch_f; /* quick access to arch field */ | ||
| 90 | struct audit_field *inode_f; /* quick access to an inode field */ | ||
| 91 | struct audit_watch *watch; /* associated watch */ | ||
| 92 | struct audit_tree *tree; /* associated watched tree */ | ||
| 93 | struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ | ||
| 94 | }; | ||
| 95 | |||
| 96 | struct audit_entry { | 71 | struct audit_entry { |
| 97 | struct list_head list; | 72 | struct list_head list; |
| 98 | struct rcu_head rcu; | 73 | struct rcu_head rcu; |
| 99 | struct audit_krule rule; | 74 | struct audit_krule rule; |
| 100 | }; | 75 | }; |
| 101 | 76 | ||
| 77 | #ifdef CONFIG_AUDIT | ||
| 78 | extern int audit_enabled; | ||
| 79 | extern int audit_ever_enabled; | ||
| 80 | #endif | ||
| 81 | |||
| 102 | extern int audit_pid; | 82 | extern int audit_pid; |
| 103 | 83 | ||
| 104 | #define AUDIT_INODE_BUCKETS 32 | 84 | #define AUDIT_INODE_BUCKETS 32 |
| @@ -129,6 +109,9 @@ struct audit_netlink_list { | |||
| 129 | int audit_send_list(void *); | 109 | int audit_send_list(void *); |
| 130 | 110 | ||
| 131 | struct inotify_watch; | 111 | struct inotify_watch; |
| 112 | /* Inotify handle */ | ||
| 113 | extern struct inotify_handle *audit_ih; | ||
| 114 | |||
| 132 | extern void audit_free_parent(struct inotify_watch *); | 115 | extern void audit_free_parent(struct inotify_watch *); |
| 133 | extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, | 116 | extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, |
| 134 | const char *, struct inode *); | 117 | const char *, struct inode *); |
| @@ -136,6 +119,7 @@ extern int selinux_audit_rule_update(void); | |||
| 136 | 119 | ||
| 137 | extern struct mutex audit_filter_mutex; | 120 | extern struct mutex audit_filter_mutex; |
| 138 | extern void audit_free_rule_rcu(struct rcu_head *); | 121 | extern void audit_free_rule_rcu(struct rcu_head *); |
| 122 | extern struct list_head audit_filter_list[]; | ||
| 139 | 123 | ||
| 140 | #ifdef CONFIG_AUDIT_TREE | 124 | #ifdef CONFIG_AUDIT_TREE |
| 141 | extern struct audit_chunk *audit_tree_lookup(const struct inode *); | 125 | extern struct audit_chunk *audit_tree_lookup(const struct inode *); |
| @@ -162,6 +146,10 @@ extern void audit_put_tree(struct audit_tree *); | |||
| 162 | 146 | ||
| 163 | extern char *audit_unpack_string(void **, size_t *, size_t); | 147 | extern char *audit_unpack_string(void **, size_t *, size_t); |
| 164 | 148 | ||
| 149 | extern pid_t audit_sig_pid; | ||
| 150 | extern uid_t audit_sig_uid; | ||
| 151 | extern u32 audit_sig_sid; | ||
| 152 | |||
| 165 | #ifdef CONFIG_AUDITSYSCALL | 153 | #ifdef CONFIG_AUDITSYSCALL |
| 166 | extern int __audit_signal_info(int sig, struct task_struct *t); | 154 | extern int __audit_signal_info(int sig, struct task_struct *t); |
| 167 | static inline int audit_signal_info(int sig, struct task_struct *t) | 155 | static inline int audit_signal_info(int sig, struct task_struct *t) |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 2f2914b7cc30..0e0bd27e6512 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
| @@ -28,7 +28,7 @@ | |||
| 28 | #include <linux/netlink.h> | 28 | #include <linux/netlink.h> |
| 29 | #include <linux/sched.h> | 29 | #include <linux/sched.h> |
| 30 | #include <linux/inotify.h> | 30 | #include <linux/inotify.h> |
| 31 | #include <linux/selinux.h> | 31 | #include <linux/security.h> |
| 32 | #include "audit.h" | 32 | #include "audit.h" |
| 33 | 33 | ||
| 34 | /* | 34 | /* |
| @@ -38,7 +38,7 @@ | |||
| 38 | * Synchronizes writes and blocking reads of audit's filterlist | 38 | * Synchronizes writes and blocking reads of audit's filterlist |
| 39 | * data. Rcu is used to traverse the filterlist and access | 39 | * data. Rcu is used to traverse the filterlist and access |
| 40 | * contents of structs audit_entry, audit_watch and opaque | 40 | * contents of structs audit_entry, audit_watch and opaque |
| 41 | * selinux rules during filtering. If modified, these structures | 41 | * LSM rules during filtering. If modified, these structures |
| 42 | * must be copied and replace their counterparts in the filterlist. | 42 | * must be copied and replace their counterparts in the filterlist. |
| 43 | * An audit_parent struct is not accessed during filtering, so may | 43 | * An audit_parent struct is not accessed during filtering, so may |
| 44 | * be written directly provided audit_filter_mutex is held. | 44 | * be written directly provided audit_filter_mutex is held. |
| @@ -89,14 +89,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { | |||
| 89 | 89 | ||
| 90 | DEFINE_MUTEX(audit_filter_mutex); | 90 | DEFINE_MUTEX(audit_filter_mutex); |
| 91 | 91 | ||
| 92 | /* Inotify handle */ | ||
| 93 | extern struct inotify_handle *audit_ih; | ||
| 94 | |||
| 95 | /* Inotify events we care about. */ | 92 | /* Inotify events we care about. */ |
| 96 | #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF | 93 | #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF |
| 97 | 94 | ||
| 98 | extern int audit_enabled; | ||
| 99 | |||
| 100 | void audit_free_parent(struct inotify_watch *i_watch) | 95 | void audit_free_parent(struct inotify_watch *i_watch) |
| 101 | { | 96 | { |
| 102 | struct audit_parent *parent; | 97 | struct audit_parent *parent; |
| @@ -139,8 +134,8 @@ static inline void audit_free_rule(struct audit_entry *e) | |||
| 139 | if (e->rule.fields) | 134 | if (e->rule.fields) |
| 140 | for (i = 0; i < e->rule.field_count; i++) { | 135 | for (i = 0; i < e->rule.field_count; i++) { |
| 141 | struct audit_field *f = &e->rule.fields[i]; | 136 | struct audit_field *f = &e->rule.fields[i]; |
| 142 | kfree(f->se_str); | 137 | kfree(f->lsm_str); |
| 143 | selinux_audit_rule_free(f->se_rule); | 138 | security_audit_rule_free(f->lsm_rule); |
| 144 | } | 139 | } |
| 145 | kfree(e->rule.fields); | 140 | kfree(e->rule.fields); |
| 146 | kfree(e->rule.filterkey); | 141 | kfree(e->rule.filterkey); |
| @@ -272,7 +267,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, | |||
| 272 | return -EINVAL; | 267 | return -EINVAL; |
| 273 | 268 | ||
| 274 | watch = audit_init_watch(path); | 269 | watch = audit_init_watch(path); |
| 275 | if (unlikely(IS_ERR(watch))) | 270 | if (IS_ERR(watch)) |
| 276 | return PTR_ERR(watch); | 271 | return PTR_ERR(watch); |
| 277 | 272 | ||
| 278 | audit_get_watch(watch); | 273 | audit_get_watch(watch); |
| @@ -422,7 +417,7 @@ exit_err: | |||
| 422 | static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | 417 | static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) |
| 423 | { | 418 | { |
| 424 | struct audit_entry *entry; | 419 | struct audit_entry *entry; |
| 425 | struct audit_field *f; | 420 | struct audit_field *ino_f; |
| 426 | int err = 0; | 421 | int err = 0; |
| 427 | int i; | 422 | int i; |
| 428 | 423 | ||
| @@ -483,6 +478,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
| 483 | if (f->val & ~15) | 478 | if (f->val & ~15) |
| 484 | goto exit_free; | 479 | goto exit_free; |
| 485 | break; | 480 | break; |
| 481 | case AUDIT_FILETYPE: | ||
| 482 | if ((f->val & ~S_IFMT) > S_IFMT) | ||
| 483 | goto exit_free; | ||
| 484 | break; | ||
| 486 | case AUDIT_INODE: | 485 | case AUDIT_INODE: |
| 487 | err = audit_to_inode(&entry->rule, f); | 486 | err = audit_to_inode(&entry->rule, f); |
| 488 | if (err) | 487 | if (err) |
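Both rule parsers (audit_rule_to_entry() here and audit_data_to_entry() below) gain the same sanity check for the new AUDIT_FILETYPE field, bounding the value against the S_IFMT file-type mask. A standalone userspace illustration of which values that exact predicate accepts; this is illustration only, not kernel code.

#include <stdio.h>
#include <sys/stat.h>

/* Same predicate as the patch: reject values whose bits stray far
 * outside the S_IFMT (0170000) file-type mask. */
static int filetype_val_ok(unsigned int val)
{
        return !((val & ~S_IFMT) > S_IFMT);
}

int main(void)
{
        printf("S_IFREG    -> %d\n", filetype_val_ok(S_IFREG));    /* 1: accepted */
        printf("S_IFSOCK   -> %d\n", filetype_val_ok(S_IFSOCK));   /* 1: accepted */
        printf("0xdeadbeef -> %d\n", filetype_val_ok(0xdeadbeef)); /* 0: rejected */
        return 0;
}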
| @@ -504,9 +503,9 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
| 504 | } | 503 | } |
| 505 | } | 504 | } |
| 506 | 505 | ||
| 507 | f = entry->rule.inode_f; | 506 | ino_f = entry->rule.inode_f; |
| 508 | if (f) { | 507 | if (ino_f) { |
| 509 | switch(f->op) { | 508 | switch(ino_f->op) { |
| 510 | case AUDIT_NOT_EQUAL: | 509 | case AUDIT_NOT_EQUAL: |
| 511 | entry->rule.inode_f = NULL; | 510 | entry->rule.inode_f = NULL; |
| 512 | case AUDIT_EQUAL: | 511 | case AUDIT_EQUAL: |
| @@ -531,7 +530,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 531 | { | 530 | { |
| 532 | int err = 0; | 531 | int err = 0; |
| 533 | struct audit_entry *entry; | 532 | struct audit_entry *entry; |
| 534 | struct audit_field *f; | 533 | struct audit_field *ino_f; |
| 535 | void *bufp; | 534 | void *bufp; |
| 536 | size_t remain = datasz - sizeof(struct audit_rule_data); | 535 | size_t remain = datasz - sizeof(struct audit_rule_data); |
| 537 | int i; | 536 | int i; |
| @@ -554,8 +553,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 554 | f->op = data->fieldflags[i] & AUDIT_OPERATORS; | 553 | f->op = data->fieldflags[i] & AUDIT_OPERATORS; |
| 555 | f->type = data->fields[i]; | 554 | f->type = data->fields[i]; |
| 556 | f->val = data->values[i]; | 555 | f->val = data->values[i]; |
| 557 | f->se_str = NULL; | 556 | f->lsm_str = NULL; |
| 558 | f->se_rule = NULL; | 557 | f->lsm_rule = NULL; |
| 559 | switch(f->type) { | 558 | switch(f->type) { |
| 560 | case AUDIT_PID: | 559 | case AUDIT_PID: |
| 561 | case AUDIT_UID: | 560 | case AUDIT_UID: |
| @@ -597,12 +596,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 597 | goto exit_free; | 596 | goto exit_free; |
| 598 | entry->rule.buflen += f->val; | 597 | entry->rule.buflen += f->val; |
| 599 | 598 | ||
| 600 | err = selinux_audit_rule_init(f->type, f->op, str, | 599 | err = security_audit_rule_init(f->type, f->op, str, |
| 601 | &f->se_rule); | 600 | (void **)&f->lsm_rule); |
| 602 | /* Keep currently invalid fields around in case they | 601 | /* Keep currently invalid fields around in case they |
| 603 | * become valid after a policy reload. */ | 602 | * become valid after a policy reload. */ |
| 604 | if (err == -EINVAL) { | 603 | if (err == -EINVAL) { |
| 605 | printk(KERN_WARNING "audit rule for selinux " | 604 | printk(KERN_WARNING "audit rule for LSM " |
| 606 | "\'%s\' is invalid\n", str); | 605 | "\'%s\' is invalid\n", str); |
| 607 | err = 0; | 606 | err = 0; |
| 608 | } | 607 | } |
| @@ -610,7 +609,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 610 | kfree(str); | 609 | kfree(str); |
| 611 | goto exit_free; | 610 | goto exit_free; |
| 612 | } else | 611 | } else |
| 613 | f->se_str = str; | 612 | f->lsm_str = str; |
| 614 | break; | 613 | break; |
| 615 | case AUDIT_WATCH: | 614 | case AUDIT_WATCH: |
| 616 | str = audit_unpack_string(&bufp, &remain, f->val); | 615 | str = audit_unpack_string(&bufp, &remain, f->val); |
| @@ -654,14 +653,18 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 654 | if (f->val & ~15) | 653 | if (f->val & ~15) |
| 655 | goto exit_free; | 654 | goto exit_free; |
| 656 | break; | 655 | break; |
| 656 | case AUDIT_FILETYPE: | ||
| 657 | if ((f->val & ~S_IFMT) > S_IFMT) | ||
| 658 | goto exit_free; | ||
| 659 | break; | ||
| 657 | default: | 660 | default: |
| 658 | goto exit_free; | 661 | goto exit_free; |
| 659 | } | 662 | } |
| 660 | } | 663 | } |
| 661 | 664 | ||
| 662 | f = entry->rule.inode_f; | 665 | ino_f = entry->rule.inode_f; |
| 663 | if (f) { | 666 | if (ino_f) { |
| 664 | switch(f->op) { | 667 | switch(ino_f->op) { |
| 665 | case AUDIT_NOT_EQUAL: | 668 | case AUDIT_NOT_EQUAL: |
| 666 | entry->rule.inode_f = NULL; | 669 | entry->rule.inode_f = NULL; |
| 667 | case AUDIT_EQUAL: | 670 | case AUDIT_EQUAL: |
| @@ -754,7 +757,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
| 754 | case AUDIT_OBJ_LEV_LOW: | 757 | case AUDIT_OBJ_LEV_LOW: |
| 755 | case AUDIT_OBJ_LEV_HIGH: | 758 | case AUDIT_OBJ_LEV_HIGH: |
| 756 | data->buflen += data->values[i] = | 759 | data->buflen += data->values[i] = |
| 757 | audit_pack_string(&bufp, f->se_str); | 760 | audit_pack_string(&bufp, f->lsm_str); |
| 758 | break; | 761 | break; |
| 759 | case AUDIT_WATCH: | 762 | case AUDIT_WATCH: |
| 760 | data->buflen += data->values[i] = | 763 | data->buflen += data->values[i] = |
| @@ -806,7 +809,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
| 806 | case AUDIT_OBJ_TYPE: | 809 | case AUDIT_OBJ_TYPE: |
| 807 | case AUDIT_OBJ_LEV_LOW: | 810 | case AUDIT_OBJ_LEV_LOW: |
| 808 | case AUDIT_OBJ_LEV_HIGH: | 811 | case AUDIT_OBJ_LEV_HIGH: |
| 809 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) | 812 | if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) |
| 810 | return 1; | 813 | return 1; |
| 811 | break; | 814 | break; |
| 812 | case AUDIT_WATCH: | 815 | case AUDIT_WATCH: |
| @@ -848,7 +851,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old) | |||
| 848 | return ERR_PTR(-ENOMEM); | 851 | return ERR_PTR(-ENOMEM); |
| 849 | 852 | ||
| 850 | new = audit_init_watch(path); | 853 | new = audit_init_watch(path); |
| 851 | if (unlikely(IS_ERR(new))) { | 854 | if (IS_ERR(new)) { |
| 852 | kfree(path); | 855 | kfree(path); |
| 853 | goto out; | 856 | goto out; |
| 854 | } | 857 | } |
| @@ -862,28 +865,28 @@ out: | |||
| 862 | return new; | 865 | return new; |
| 863 | } | 866 | } |
| 864 | 867 | ||
| 865 | /* Duplicate selinux field information. The se_rule is opaque, so must be | 868 | /* Duplicate LSM field information. The lsm_rule is opaque, so must be |
| 866 | * re-initialized. */ | 869 | * re-initialized. */ |
| 867 | static inline int audit_dupe_selinux_field(struct audit_field *df, | 870 | static inline int audit_dupe_lsm_field(struct audit_field *df, |
| 868 | struct audit_field *sf) | 871 | struct audit_field *sf) |
| 869 | { | 872 | { |
| 870 | int ret = 0; | 873 | int ret = 0; |
| 871 | char *se_str; | 874 | char *lsm_str; |
| 872 | 875 | ||
| 873 | /* our own copy of se_str */ | 876 | /* our own copy of lsm_str */ |
| 874 | se_str = kstrdup(sf->se_str, GFP_KERNEL); | 877 | lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); |
| 875 | if (unlikely(!se_str)) | 878 | if (unlikely(!lsm_str)) |
| 876 | return -ENOMEM; | 879 | return -ENOMEM; |
| 877 | df->se_str = se_str; | 880 | df->lsm_str = lsm_str; |
| 878 | 881 | ||
| 879 | /* our own (refreshed) copy of se_rule */ | 882 | /* our own (refreshed) copy of lsm_rule */ |
| 880 | ret = selinux_audit_rule_init(df->type, df->op, df->se_str, | 883 | ret = security_audit_rule_init(df->type, df->op, df->lsm_str, |
| 881 | &df->se_rule); | 884 | (void **)&df->lsm_rule); |
| 882 | /* Keep currently invalid fields around in case they | 885 | /* Keep currently invalid fields around in case they |
| 883 | * become valid after a policy reload. */ | 886 | * become valid after a policy reload. */ |
| 884 | if (ret == -EINVAL) { | 887 | if (ret == -EINVAL) { |
| 885 | printk(KERN_WARNING "audit rule for selinux \'%s\' is " | 888 | printk(KERN_WARNING "audit rule for LSM \'%s\' is " |
| 886 | "invalid\n", df->se_str); | 889 | "invalid\n", df->lsm_str); |
| 887 | ret = 0; | 890 | ret = 0; |
| 888 | } | 891 | } |
| 889 | 892 | ||
| @@ -891,7 +894,7 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, | |||
| 891 | } | 894 | } |
| 892 | 895 | ||
| 893 | /* Duplicate an audit rule. This will be a deep copy with the exception | 896 | /* Duplicate an audit rule. This will be a deep copy with the exception |
| 894 | * of the watch - that pointer is carried over. The selinux specific fields | 897 | * of the watch - that pointer is carried over. The LSM specific fields |
| 895 | * will be updated in the copy. The point is to be able to replace the old | 898 | * will be updated in the copy. The point is to be able to replace the old |
| 896 | * rule with the new rule in the filterlist, then free the old rule. | 899 | * rule with the new rule in the filterlist, then free the old rule. |
| 897 | * The rlist element is undefined; list manipulations are handled apart from | 900 | * The rlist element is undefined; list manipulations are handled apart from |
| @@ -930,7 +933,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
| 930 | new->tree = old->tree; | 933 | new->tree = old->tree; |
| 931 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); | 934 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); |
| 932 | 935 | ||
| 933 | /* deep copy this information, updating the se_rule fields, because | 936 | /* deep copy this information, updating the lsm_rule fields, because |
| 934 | * the originals will all be freed when the old rule is freed. */ | 937 | * the originals will all be freed when the old rule is freed. */ |
| 935 | for (i = 0; i < fcount; i++) { | 938 | for (i = 0; i < fcount; i++) { |
| 936 | switch (new->fields[i].type) { | 939 | switch (new->fields[i].type) { |
| @@ -944,7 +947,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
| 944 | case AUDIT_OBJ_TYPE: | 947 | case AUDIT_OBJ_TYPE: |
| 945 | case AUDIT_OBJ_LEV_LOW: | 948 | case AUDIT_OBJ_LEV_LOW: |
| 946 | case AUDIT_OBJ_LEV_HIGH: | 949 | case AUDIT_OBJ_LEV_HIGH: |
| 947 | err = audit_dupe_selinux_field(&new->fields[i], | 950 | err = audit_dupe_lsm_field(&new->fields[i], |
| 948 | &old->fields[i]); | 951 | &old->fields[i]); |
| 949 | break; | 952 | break; |
| 950 | case AUDIT_FILTERKEY: | 953 | case AUDIT_FILTERKEY: |
| @@ -989,7 +992,7 @@ static void audit_update_watch(struct audit_parent *parent, | |||
| 989 | audit_set_auditable(current->audit_context); | 992 | audit_set_auditable(current->audit_context); |
| 990 | 993 | ||
| 991 | nwatch = audit_dupe_watch(owatch); | 994 | nwatch = audit_dupe_watch(owatch); |
| 992 | if (unlikely(IS_ERR(nwatch))) { | 995 | if (IS_ERR(nwatch)) { |
| 993 | mutex_unlock(&audit_filter_mutex); | 996 | mutex_unlock(&audit_filter_mutex); |
| 994 | audit_panic("error updating watch, skipping"); | 997 | audit_panic("error updating watch, skipping"); |
| 995 | return; | 998 | return; |
| @@ -1004,7 +1007,7 @@ static void audit_update_watch(struct audit_parent *parent, | |||
| 1004 | list_del_rcu(&oentry->list); | 1007 | list_del_rcu(&oentry->list); |
| 1005 | 1008 | ||
| 1006 | nentry = audit_dupe_rule(&oentry->rule, nwatch); | 1009 | nentry = audit_dupe_rule(&oentry->rule, nwatch); |
| 1007 | if (unlikely(IS_ERR(nentry))) | 1010 | if (IS_ERR(nentry)) |
| 1008 | audit_panic("error updating watch, removing"); | 1011 | audit_panic("error updating watch, removing"); |
| 1009 | else { | 1012 | else { |
| 1010 | int h = audit_hash_ino((u32)ino); | 1013 | int h = audit_hash_ino((u32)ino); |
| @@ -1500,8 +1503,9 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) | |||
| 1500 | } | 1503 | } |
| 1501 | 1504 | ||
| 1502 | /* Log rule additions and removals */ | 1505 | /* Log rule additions and removals */ |
| 1503 | static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, | 1506 | static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, |
| 1504 | struct audit_krule *rule, int res) | 1507 | char *action, struct audit_krule *rule, |
| 1508 | int res) | ||
| 1505 | { | 1509 | { |
| 1506 | struct audit_buffer *ab; | 1510 | struct audit_buffer *ab; |
| 1507 | 1511 | ||
| @@ -1511,15 +1515,16 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, | |||
| 1511 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 1515 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
| 1512 | if (!ab) | 1516 | if (!ab) |
| 1513 | return; | 1517 | return; |
| 1514 | audit_log_format(ab, "auid=%u", loginuid); | 1518 | audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); |
| 1515 | if (sid) { | 1519 | if (sid) { |
| 1516 | char *ctx = NULL; | 1520 | char *ctx = NULL; |
| 1517 | u32 len; | 1521 | u32 len; |
| 1518 | if (selinux_sid_to_string(sid, &ctx, &len)) | 1522 | if (security_secid_to_secctx(sid, &ctx, &len)) |
| 1519 | audit_log_format(ab, " ssid=%u", sid); | 1523 | audit_log_format(ab, " ssid=%u", sid); |
| 1520 | else | 1524 | else { |
| 1521 | audit_log_format(ab, " subj=%s", ctx); | 1525 | audit_log_format(ab, " subj=%s", ctx); |
| 1522 | kfree(ctx); | 1526 | security_release_secctx(ctx, len); |
| 1527 | } | ||
| 1523 | } | 1528 | } |
| 1524 | audit_log_format(ab, " op=%s rule key=", action); | 1529 | audit_log_format(ab, " op=%s rule key=", action); |
| 1525 | if (rule->filterkey) | 1530 | if (rule->filterkey) |
| @@ -1542,7 +1547,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, | |||
| 1542 | * @sid: SE Linux Security ID of sender | 1547 | * @sid: SE Linux Security ID of sender |
| 1543 | */ | 1548 | */ |
| 1544 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | 1549 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data, |
| 1545 | size_t datasz, uid_t loginuid, u32 sid) | 1550 | size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) |
| 1546 | { | 1551 | { |
| 1547 | struct task_struct *tsk; | 1552 | struct task_struct *tsk; |
| 1548 | struct audit_netlink_list *dest; | 1553 | struct audit_netlink_list *dest; |
| @@ -1589,7 +1594,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
| 1589 | 1594 | ||
| 1590 | err = audit_add_rule(entry, | 1595 | err = audit_add_rule(entry, |
| 1591 | &audit_filter_list[entry->rule.listnr]); | 1596 | &audit_filter_list[entry->rule.listnr]); |
| 1592 | audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); | 1597 | audit_log_rule_change(loginuid, sessionid, sid, "add", |
| 1598 | &entry->rule, !err); | ||
| 1593 | 1599 | ||
| 1594 | if (err) | 1600 | if (err) |
| 1595 | audit_free_rule(entry); | 1601 | audit_free_rule(entry); |
| @@ -1605,8 +1611,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
| 1605 | 1611 | ||
| 1606 | err = audit_del_rule(entry, | 1612 | err = audit_del_rule(entry, |
| 1607 | &audit_filter_list[entry->rule.listnr]); | 1613 | &audit_filter_list[entry->rule.listnr]); |
| 1608 | audit_log_rule_change(loginuid, sid, "remove", &entry->rule, | 1614 | audit_log_rule_change(loginuid, sessionid, sid, "remove", |
| 1609 | !err); | 1615 | &entry->rule, !err); |
| 1610 | 1616 | ||
| 1611 | audit_free_rule(entry); | 1617 | audit_free_rule(entry); |
| 1612 | break; | 1618 | break; |
| @@ -1761,38 +1767,12 @@ unlock_and_return: | |||
| 1761 | return result; | 1767 | return result; |
| 1762 | } | 1768 | } |
| 1763 | 1769 | ||
| 1764 | /* Check to see if the rule contains any selinux fields. Returns 1 if there | 1770 | /* This function will re-initialize the lsm_rule field of all applicable rules. |
| 1765 | are selinux fields specified in the rule, 0 otherwise. */ | 1771 | * It will traverse the filter lists searching for rules that contain LSM |
| 1766 | static inline int audit_rule_has_selinux(struct audit_krule *rule) | ||
| 1767 | { | ||
| 1768 | int i; | ||
| 1769 | |||
| 1770 | for (i = 0; i < rule->field_count; i++) { | ||
| 1771 | struct audit_field *f = &rule->fields[i]; | ||
| 1772 | switch (f->type) { | ||
| 1773 | case AUDIT_SUBJ_USER: | ||
| 1774 | case AUDIT_SUBJ_ROLE: | ||
| 1775 | case AUDIT_SUBJ_TYPE: | ||
| 1776 | case AUDIT_SUBJ_SEN: | ||
| 1777 | case AUDIT_SUBJ_CLR: | ||
| 1778 | case AUDIT_OBJ_USER: | ||
| 1779 | case AUDIT_OBJ_ROLE: | ||
| 1780 | case AUDIT_OBJ_TYPE: | ||
| 1781 | case AUDIT_OBJ_LEV_LOW: | ||
| 1782 | case AUDIT_OBJ_LEV_HIGH: | ||
| 1783 | return 1; | ||
| 1784 | } | ||
| 1785 | } | ||
| 1786 | |||
| 1787 | return 0; | ||
| 1788 | } | ||
| 1789 | |||
| 1790 | /* This function will re-initialize the se_rule field of all applicable rules. | ||
| 1791 | * It will traverse the filter lists searching for rules that contain selinux | ||
| 1792 | * specific filter fields. When such a rule is found, it is copied, the | 1772 | * specific filter fields. When such a rule is found, it is copied, the |
| 1793 | * selinux field is re-initialized, and the old rule is replaced with the | 1773 | * LSM field is re-initialized, and the old rule is replaced with the |
| 1794 | * updated rule. */ | 1774 | * updated rule. */ |
| 1795 | int selinux_audit_rule_update(void) | 1775 | int audit_update_lsm_rules(void) |
| 1796 | { | 1776 | { |
| 1797 | struct audit_entry *entry, *n, *nentry; | 1777 | struct audit_entry *entry, *n, *nentry; |
| 1798 | struct audit_watch *watch; | 1778 | struct audit_watch *watch; |
| @@ -1804,18 +1784,18 @@ int selinux_audit_rule_update(void) | |||
| 1804 | 1784 | ||
| 1805 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { | 1785 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { |
| 1806 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { | 1786 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { |
| 1807 | if (!audit_rule_has_selinux(&entry->rule)) | 1787 | if (!security_audit_rule_known(&entry->rule)) |
| 1808 | continue; | 1788 | continue; |
| 1809 | 1789 | ||
| 1810 | watch = entry->rule.watch; | 1790 | watch = entry->rule.watch; |
| 1811 | tree = entry->rule.tree; | 1791 | tree = entry->rule.tree; |
| 1812 | nentry = audit_dupe_rule(&entry->rule, watch); | 1792 | nentry = audit_dupe_rule(&entry->rule, watch); |
| 1813 | if (unlikely(IS_ERR(nentry))) { | 1793 | if (IS_ERR(nentry)) { |
| 1814 | /* save the first error encountered for the | 1794 | /* save the first error encountered for the |
| 1815 | * return value */ | 1795 | * return value */ |
| 1816 | if (!err) | 1796 | if (!err) |
| 1817 | err = PTR_ERR(nentry); | 1797 | err = PTR_ERR(nentry); |
| 1818 | audit_panic("error updating selinux filters"); | 1798 | audit_panic("error updating LSM filters"); |
| 1819 | if (watch) | 1799 | if (watch) |
| 1820 | list_del(&entry->rule.rlist); | 1800 | list_del(&entry->rule.rlist); |
| 1821 | list_del_rcu(&entry->list); | 1801 | list_del_rcu(&entry->list); |
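The removed audit_rule_has_selinux() helper is not lost: the same field-type scan now sits behind security_audit_rule_known(), so whichever LSM is loaded decides which rule fields it owns. A sketch of what that check amounts to, reusing only the field types visible in the deleted code (the function name is illustrative; the real hook is implemented inside the security module):

        /* Illustrative only: the test the deleted helper performed, now
         * expected behind the security_audit_rule_known() hook. */
        #include <linux/audit.h>

        static int example_audit_rule_known(struct audit_krule *rule)
        {
                int i;

                for (i = 0; i < rule->field_count; i++) {
                        struct audit_field *f = &rule->fields[i];

                        switch (f->type) {
                        case AUDIT_SUBJ_USER:
                        case AUDIT_SUBJ_ROLE:
                        case AUDIT_SUBJ_TYPE:
                        case AUDIT_SUBJ_SEN:
                        case AUDIT_SUBJ_CLR:
                        case AUDIT_OBJ_USER:
                        case AUDIT_OBJ_ROLE:
                        case AUDIT_OBJ_TYPE:
                        case AUDIT_OBJ_LEV_LOW:
                        case AUDIT_OBJ_LEV_HIGH:
                                return 1;   /* rule carries LSM-specific fields */
                        }
                }
                return 0;
        }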
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 782262e4107d..c10e7aae04d7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
| @@ -61,7 +61,6 @@ | |||
| 61 | #include <linux/security.h> | 61 | #include <linux/security.h> |
| 62 | #include <linux/list.h> | 62 | #include <linux/list.h> |
| 63 | #include <linux/tty.h> | 63 | #include <linux/tty.h> |
| 64 | #include <linux/selinux.h> | ||
| 65 | #include <linux/binfmts.h> | 64 | #include <linux/binfmts.h> |
| 66 | #include <linux/highmem.h> | 65 | #include <linux/highmem.h> |
| 67 | #include <linux/syscalls.h> | 66 | #include <linux/syscalls.h> |
| @@ -69,9 +68,6 @@ | |||
| 69 | 68 | ||
| 70 | #include "audit.h" | 69 | #include "audit.h" |
| 71 | 70 | ||
| 72 | extern struct list_head audit_filter_list[]; | ||
| 73 | extern int audit_ever_enabled; | ||
| 74 | |||
| 75 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | 71 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context |
| 76 | * for saving names from getname(). */ | 72 | * for saving names from getname(). */ |
| 77 | #define AUDIT_NAMES 20 | 73 | #define AUDIT_NAMES 20 |
| @@ -284,6 +280,19 @@ static int audit_match_perm(struct audit_context *ctx, int mask) | |||
| 284 | } | 280 | } |
| 285 | } | 281 | } |
| 286 | 282 | ||
| 283 | static int audit_match_filetype(struct audit_context *ctx, int which) | ||
| 284 | { | ||
| 285 | unsigned index = which & ~S_IFMT; | ||
| 286 | mode_t mode = which & S_IFMT; | ||
| 287 | if (index >= ctx->name_count) | ||
| 288 | return 0; | ||
| 289 | if (ctx->names[index].ino == -1) | ||
| 290 | return 0; | ||
| 291 | if ((ctx->names[index].mode ^ mode) & S_IFMT) | ||
| 292 | return 0; | ||
| 293 | return 1; | ||
| 294 | } | ||
| 295 | |||
| 287 | /* | 296 | /* |
| 288 | * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; | 297 | * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; |
| 289 | * ->first_trees points to its beginning, ->trees - to the current end of data. | 298 | * ->first_trees points to its beginning, ->trees - to the current end of data. |
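The new audit_match_filetype() treats the rule value as a packed pair: the bits below S_IFMT select a slot in ctx->names[], and the S_IFMT bits carry the expected file type. A small self-contained demonstration of that encoding in plain userspace C (the check mirrors the kernel helper, but nothing here is kernel API):

        #include <stdio.h>
        #include <sys/types.h>
        #include <sys/stat.h>

        /* Does the recorded mode match the file type packed into the
         * rule value?  The low bits (index) pick the name slot; only the
         * S_IFMT bits are compared, exactly as in audit_match_filetype(). */
        static int filetype_matches(unsigned int which, mode_t recorded_mode)
        {
                unsigned int index = which & ~S_IFMT;   /* name slot, unused here */
                mode_t wanted = which & S_IFMT;         /* e.g. S_IFREG, S_IFCHR */

                (void)index;
                return (recorded_mode & S_IFMT) == wanted;
        }

        int main(void)
        {
                unsigned int rule_val = 0 | S_IFCHR;    /* "arg 0 must be a char device" */

                printf("char dev: %d\n", filetype_matches(rule_val, S_IFCHR | 0666)); /* 1 */
                printf("regular : %d\n", filetype_matches(rule_val, S_IFREG | 0644)); /* 0 */
                return 0;
        }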
| @@ -528,14 +537,14 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 528 | match for now to avoid losing information that | 537 | match for now to avoid losing information that |
| 529 | may be wanted. An error message will also be | 538 | may be wanted. An error message will also be |
| 530 | logged upon error */ | 539 | logged upon error */ |
| 531 | if (f->se_rule) { | 540 | if (f->lsm_rule) { |
| 532 | if (need_sid) { | 541 | if (need_sid) { |
| 533 | selinux_get_task_sid(tsk, &sid); | 542 | security_task_getsecid(tsk, &sid); |
| 534 | need_sid = 0; | 543 | need_sid = 0; |
| 535 | } | 544 | } |
| 536 | result = selinux_audit_rule_match(sid, f->type, | 545 | result = security_audit_rule_match(sid, f->type, |
| 537 | f->op, | 546 | f->op, |
| 538 | f->se_rule, | 547 | f->lsm_rule, |
| 539 | ctx); | 548 | ctx); |
| 540 | } | 549 | } |
| 541 | break; | 550 | break; |
| @@ -546,18 +555,18 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 546 | case AUDIT_OBJ_LEV_HIGH: | 555 | case AUDIT_OBJ_LEV_HIGH: |
| 547 | /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR | 556 | /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR |
| 548 | also applies here */ | 557 | also applies here */ |
| 549 | if (f->se_rule) { | 558 | if (f->lsm_rule) { |
| 550 | /* Find files that match */ | 559 | /* Find files that match */ |
| 551 | if (name) { | 560 | if (name) { |
| 552 | result = selinux_audit_rule_match( | 561 | result = security_audit_rule_match( |
| 553 | name->osid, f->type, f->op, | 562 | name->osid, f->type, f->op, |
| 554 | f->se_rule, ctx); | 563 | f->lsm_rule, ctx); |
| 555 | } else if (ctx) { | 564 | } else if (ctx) { |
| 556 | for (j = 0; j < ctx->name_count; j++) { | 565 | for (j = 0; j < ctx->name_count; j++) { |
| 557 | if (selinux_audit_rule_match( | 566 | if (security_audit_rule_match( |
| 558 | ctx->names[j].osid, | 567 | ctx->names[j].osid, |
| 559 | f->type, f->op, | 568 | f->type, f->op, |
| 560 | f->se_rule, ctx)) { | 569 | f->lsm_rule, ctx)) { |
| 561 | ++result; | 570 | ++result; |
| 562 | break; | 571 | break; |
| 563 | } | 572 | } |
| @@ -570,7 +579,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 570 | aux = aux->next) { | 579 | aux = aux->next) { |
| 571 | if (aux->type == AUDIT_IPC) { | 580 | if (aux->type == AUDIT_IPC) { |
| 572 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 581 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
| 573 | if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { | 582 | if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { |
| 574 | ++result; | 583 | ++result; |
| 575 | break; | 584 | break; |
| 576 | } | 585 | } |
| @@ -593,6 +602,9 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 593 | case AUDIT_PERM: | 602 | case AUDIT_PERM: |
| 594 | result = audit_match_perm(ctx, f->val); | 603 | result = audit_match_perm(ctx, f->val); |
| 595 | break; | 604 | break; |
| 605 | case AUDIT_FILETYPE: | ||
| 606 | result = audit_match_filetype(ctx, f->val); | ||
| 607 | break; | ||
| 596 | } | 608 | } |
| 597 | 609 | ||
| 598 | if (!result) | 610 | if (!result) |
| @@ -885,11 +897,11 @@ void audit_log_task_context(struct audit_buffer *ab) | |||
| 885 | int error; | 897 | int error; |
| 886 | u32 sid; | 898 | u32 sid; |
| 887 | 899 | ||
| 888 | selinux_get_task_sid(current, &sid); | 900 | security_task_getsecid(current, &sid); |
| 889 | if (!sid) | 901 | if (!sid) |
| 890 | return; | 902 | return; |
| 891 | 903 | ||
| 892 | error = selinux_sid_to_string(sid, &ctx, &len); | 904 | error = security_secid_to_secctx(sid, &ctx, &len); |
| 893 | if (error) { | 905 | if (error) { |
| 894 | if (error != -EINVAL) | 906 | if (error != -EINVAL) |
| 895 | goto error_path; | 907 | goto error_path; |
| @@ -897,7 +909,7 @@ void audit_log_task_context(struct audit_buffer *ab) | |||
| 897 | } | 909 | } |
| 898 | 910 | ||
| 899 | audit_log_format(ab, " subj=%s", ctx); | 911 | audit_log_format(ab, " subj=%s", ctx); |
| 900 | kfree(ctx); | 912 | security_release_secctx(ctx, len); |
| 901 | return; | 913 | return; |
| 902 | 914 | ||
| 903 | error_path: | 915 | error_path: |
| @@ -941,7 +953,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
| 941 | u32 sid, char *comm) | 953 | u32 sid, char *comm) |
| 942 | { | 954 | { |
| 943 | struct audit_buffer *ab; | 955 | struct audit_buffer *ab; |
| 944 | char *s = NULL; | 956 | char *ctx = NULL; |
| 945 | u32 len; | 957 | u32 len; |
| 946 | int rc = 0; | 958 | int rc = 0; |
| 947 | 959 | ||
| @@ -951,15 +963,16 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
| 951 | 963 | ||
| 952 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, | 964 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, |
| 953 | uid, sessionid); | 965 | uid, sessionid); |
| 954 | if (selinux_sid_to_string(sid, &s, &len)) { | 966 | if (security_secid_to_secctx(sid, &ctx, &len)) { |
| 955 | audit_log_format(ab, " obj=(none)"); | 967 | audit_log_format(ab, " obj=(none)"); |
| 956 | rc = 1; | 968 | rc = 1; |
| 957 | } else | 969 | } else { |
| 958 | audit_log_format(ab, " obj=%s", s); | 970 | audit_log_format(ab, " obj=%s", ctx); |
| 971 | security_release_secctx(ctx, len); | ||
| 972 | } | ||
| 959 | audit_log_format(ab, " ocomm="); | 973 | audit_log_format(ab, " ocomm="); |
| 960 | audit_log_untrustedstring(ab, comm); | 974 | audit_log_untrustedstring(ab, comm); |
| 961 | audit_log_end(ab); | 975 | audit_log_end(ab); |
| 962 | kfree(s); | ||
| 963 | 976 | ||
| 964 | return rc; | 977 | return rc; |
| 965 | } | 978 | } |
| @@ -1095,7 +1108,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, | |||
| 1095 | audit_log_format(*ab, "[%d]", i); | 1108 | audit_log_format(*ab, "[%d]", i); |
| 1096 | audit_log_format(*ab, "="); | 1109 | audit_log_format(*ab, "="); |
| 1097 | if (has_cntl) | 1110 | if (has_cntl) |
| 1098 | audit_log_hex(*ab, buf, to_send); | 1111 | audit_log_n_hex(*ab, buf, to_send); |
| 1099 | else | 1112 | else |
| 1100 | audit_log_format(*ab, "\"%s\"", buf); | 1113 | audit_log_format(*ab, "\"%s\"", buf); |
| 1101 | audit_log_format(*ab, "\n"); | 1114 | audit_log_format(*ab, "\n"); |
| @@ -1271,14 +1284,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1271 | if (axi->osid != 0) { | 1284 | if (axi->osid != 0) { |
| 1272 | char *ctx = NULL; | 1285 | char *ctx = NULL; |
| 1273 | u32 len; | 1286 | u32 len; |
| 1274 | if (selinux_sid_to_string( | 1287 | if (security_secid_to_secctx( |
| 1275 | axi->osid, &ctx, &len)) { | 1288 | axi->osid, &ctx, &len)) { |
| 1276 | audit_log_format(ab, " osid=%u", | 1289 | audit_log_format(ab, " osid=%u", |
| 1277 | axi->osid); | 1290 | axi->osid); |
| 1278 | call_panic = 1; | 1291 | call_panic = 1; |
| 1279 | } else | 1292 | } else { |
| 1280 | audit_log_format(ab, " obj=%s", ctx); | 1293 | audit_log_format(ab, " obj=%s", ctx); |
| 1281 | kfree(ctx); | 1294 | security_release_secctx(ctx, len); |
| 1295 | } | ||
| 1282 | } | 1296 | } |
| 1283 | break; } | 1297 | break; } |
| 1284 | 1298 | ||
| @@ -1295,7 +1309,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1295 | break; } | 1309 | break; } |
| 1296 | 1310 | ||
| 1297 | case AUDIT_SOCKETCALL: { | 1311 | case AUDIT_SOCKETCALL: { |
| 1298 | int i; | ||
| 1299 | struct audit_aux_data_socketcall *axs = (void *)aux; | 1312 | struct audit_aux_data_socketcall *axs = (void *)aux; |
| 1300 | audit_log_format(ab, "nargs=%d", axs->nargs); | 1313 | audit_log_format(ab, "nargs=%d", axs->nargs); |
| 1301 | for (i=0; i<axs->nargs; i++) | 1314 | for (i=0; i<axs->nargs; i++) |
| @@ -1306,7 +1319,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1306 | struct audit_aux_data_sockaddr *axs = (void *)aux; | 1319 | struct audit_aux_data_sockaddr *axs = (void *)aux; |
| 1307 | 1320 | ||
| 1308 | audit_log_format(ab, "saddr="); | 1321 | audit_log_format(ab, "saddr="); |
| 1309 | audit_log_hex(ab, axs->a, axs->len); | 1322 | audit_log_n_hex(ab, axs->a, axs->len); |
| 1310 | break; } | 1323 | break; } |
| 1311 | 1324 | ||
| 1312 | case AUDIT_FD_PAIR: { | 1325 | case AUDIT_FD_PAIR: { |
| @@ -1320,7 +1333,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1320 | 1333 | ||
| 1321 | for (aux = context->aux_pids; aux; aux = aux->next) { | 1334 | for (aux = context->aux_pids; aux; aux = aux->next) { |
| 1322 | struct audit_aux_data_pids *axs = (void *)aux; | 1335 | struct audit_aux_data_pids *axs = (void *)aux; |
| 1323 | int i; | ||
| 1324 | 1336 | ||
| 1325 | for (i = 0; i < axs->pid_count; i++) | 1337 | for (i = 0; i < axs->pid_count; i++) |
| 1326 | if (audit_log_pid_context(context, axs->target_pid[i], | 1338 | if (audit_log_pid_context(context, axs->target_pid[i], |
| @@ -1370,8 +1382,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1370 | default: | 1382 | default: |
| 1371 | /* log the name's directory component */ | 1383 | /* log the name's directory component */ |
| 1372 | audit_log_format(ab, " name="); | 1384 | audit_log_format(ab, " name="); |
| 1373 | audit_log_n_untrustedstring(ab, n->name_len, | 1385 | audit_log_n_untrustedstring(ab, n->name, |
| 1374 | n->name); | 1386 | n->name_len); |
| 1375 | } | 1387 | } |
| 1376 | } else | 1388 | } else |
| 1377 | audit_log_format(ab, " name=(null)"); | 1389 | audit_log_format(ab, " name=(null)"); |
| @@ -1392,13 +1404,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1392 | if (n->osid != 0) { | 1404 | if (n->osid != 0) { |
| 1393 | char *ctx = NULL; | 1405 | char *ctx = NULL; |
| 1394 | u32 len; | 1406 | u32 len; |
| 1395 | if (selinux_sid_to_string( | 1407 | if (security_secid_to_secctx( |
| 1396 | n->osid, &ctx, &len)) { | 1408 | n->osid, &ctx, &len)) { |
| 1397 | audit_log_format(ab, " osid=%u", n->osid); | 1409 | audit_log_format(ab, " osid=%u", n->osid); |
| 1398 | call_panic = 2; | 1410 | call_panic = 2; |
| 1399 | } else | 1411 | } else { |
| 1400 | audit_log_format(ab, " obj=%s", ctx); | 1412 | audit_log_format(ab, " obj=%s", ctx); |
| 1401 | kfree(ctx); | 1413 | security_release_secctx(ctx, len); |
| 1414 | } | ||
| 1402 | } | 1415 | } |
| 1403 | 1416 | ||
| 1404 | audit_log_end(ab); | 1417 | audit_log_end(ab); |
| @@ -1594,7 +1607,7 @@ static inline void handle_one(const struct inode *inode) | |||
| 1594 | if (likely(put_tree_ref(context, chunk))) | 1607 | if (likely(put_tree_ref(context, chunk))) |
| 1595 | return; | 1608 | return; |
| 1596 | if (unlikely(!grow_tree_refs(context))) { | 1609 | if (unlikely(!grow_tree_refs(context))) { |
| 1597 | printk(KERN_WARNING "out of memory, audit has lost a tree reference"); | 1610 | printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); |
| 1598 | audit_set_auditable(context); | 1611 | audit_set_auditable(context); |
| 1599 | audit_put_chunk(chunk); | 1612 | audit_put_chunk(chunk); |
| 1600 | unroll_tree_refs(context, p, count); | 1613 | unroll_tree_refs(context, p, count); |
| @@ -1654,7 +1667,7 @@ retry: | |||
| 1654 | } | 1667 | } |
| 1655 | /* too bad */ | 1668 | /* too bad */ |
| 1656 | printk(KERN_WARNING | 1669 | printk(KERN_WARNING |
| 1657 | "out of memory, audit has lost a tree reference"); | 1670 | "out of memory, audit has lost a tree reference\n"); |
| 1658 | unroll_tree_refs(context, p, count); | 1671 | unroll_tree_refs(context, p, count); |
| 1659 | audit_set_auditable(context); | 1672 | audit_set_auditable(context); |
| 1660 | return; | 1673 | return; |
| @@ -1750,13 +1763,13 @@ static int audit_inc_name_count(struct audit_context *context, | |||
| 1750 | if (context->name_count >= AUDIT_NAMES) { | 1763 | if (context->name_count >= AUDIT_NAMES) { |
| 1751 | if (inode) | 1764 | if (inode) |
| 1752 | printk(KERN_DEBUG "name_count maxed, losing inode data: " | 1765 | printk(KERN_DEBUG "name_count maxed, losing inode data: " |
| 1753 | "dev=%02x:%02x, inode=%lu", | 1766 | "dev=%02x:%02x, inode=%lu\n", |
| 1754 | MAJOR(inode->i_sb->s_dev), | 1767 | MAJOR(inode->i_sb->s_dev), |
| 1755 | MINOR(inode->i_sb->s_dev), | 1768 | MINOR(inode->i_sb->s_dev), |
| 1756 | inode->i_ino); | 1769 | inode->i_ino); |
| 1757 | 1770 | ||
| 1758 | else | 1771 | else |
| 1759 | printk(KERN_DEBUG "name_count maxed, losing inode data"); | 1772 | printk(KERN_DEBUG "name_count maxed, losing inode data\n"); |
| 1760 | return 1; | 1773 | return 1; |
| 1761 | } | 1774 | } |
| 1762 | context->name_count++; | 1775 | context->name_count++; |
| @@ -1775,7 +1788,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode | |||
| 1775 | name->uid = inode->i_uid; | 1788 | name->uid = inode->i_uid; |
| 1776 | name->gid = inode->i_gid; | 1789 | name->gid = inode->i_gid; |
| 1777 | name->rdev = inode->i_rdev; | 1790 | name->rdev = inode->i_rdev; |
| 1778 | selinux_get_inode_sid(inode, &name->osid); | 1791 | security_inode_getsecid(inode, &name->osid); |
| 1779 | } | 1792 | } |
| 1780 | 1793 | ||
| 1781 | /** | 1794 | /** |
| @@ -2190,8 +2203,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
| 2190 | ax->uid = ipcp->uid; | 2203 | ax->uid = ipcp->uid; |
| 2191 | ax->gid = ipcp->gid; | 2204 | ax->gid = ipcp->gid; |
| 2192 | ax->mode = ipcp->mode; | 2205 | ax->mode = ipcp->mode; |
| 2193 | selinux_get_ipc_sid(ipcp, &ax->osid); | 2206 | security_ipc_getsecid(ipcp, &ax->osid); |
| 2194 | |||
| 2195 | ax->d.type = AUDIT_IPC; | 2207 | ax->d.type = AUDIT_IPC; |
| 2196 | ax->d.next = context->aux; | 2208 | ax->d.next = context->aux; |
| 2197 | context->aux = (void *)ax; | 2209 | context->aux = (void *)ax; |
| @@ -2343,7 +2355,7 @@ void __audit_ptrace(struct task_struct *t) | |||
| 2343 | context->target_auid = audit_get_loginuid(t); | 2355 | context->target_auid = audit_get_loginuid(t); |
| 2344 | context->target_uid = t->uid; | 2356 | context->target_uid = t->uid; |
| 2345 | context->target_sessionid = audit_get_sessionid(t); | 2357 | context->target_sessionid = audit_get_sessionid(t); |
| 2346 | selinux_get_task_sid(t, &context->target_sid); | 2358 | security_task_getsecid(t, &context->target_sid); |
| 2347 | memcpy(context->target_comm, t->comm, TASK_COMM_LEN); | 2359 | memcpy(context->target_comm, t->comm, TASK_COMM_LEN); |
| 2348 | } | 2360 | } |
| 2349 | 2361 | ||
| @@ -2360,9 +2372,6 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
| 2360 | struct audit_aux_data_pids *axp; | 2372 | struct audit_aux_data_pids *axp; |
| 2361 | struct task_struct *tsk = current; | 2373 | struct task_struct *tsk = current; |
| 2362 | struct audit_context *ctx = tsk->audit_context; | 2374 | struct audit_context *ctx = tsk->audit_context; |
| 2363 | extern pid_t audit_sig_pid; | ||
| 2364 | extern uid_t audit_sig_uid; | ||
| 2365 | extern u32 audit_sig_sid; | ||
| 2366 | 2375 | ||
| 2367 | if (audit_pid && t->tgid == audit_pid) { | 2376 | if (audit_pid && t->tgid == audit_pid) { |
| 2368 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { | 2377 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { |
| @@ -2371,7 +2380,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
| 2371 | audit_sig_uid = tsk->loginuid; | 2380 | audit_sig_uid = tsk->loginuid; |
| 2372 | else | 2381 | else |
| 2373 | audit_sig_uid = tsk->uid; | 2382 | audit_sig_uid = tsk->uid; |
| 2374 | selinux_get_task_sid(tsk, &audit_sig_sid); | 2383 | security_task_getsecid(tsk, &audit_sig_sid); |
| 2375 | } | 2384 | } |
| 2376 | if (!audit_signals || audit_dummy_context()) | 2385 | if (!audit_signals || audit_dummy_context()) |
| 2377 | return 0; | 2386 | return 0; |
| @@ -2384,7 +2393,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
| 2384 | ctx->target_auid = audit_get_loginuid(t); | 2393 | ctx->target_auid = audit_get_loginuid(t); |
| 2385 | ctx->target_uid = t->uid; | 2394 | ctx->target_uid = t->uid; |
| 2386 | ctx->target_sessionid = audit_get_sessionid(t); | 2395 | ctx->target_sessionid = audit_get_sessionid(t); |
| 2387 | selinux_get_task_sid(t, &ctx->target_sid); | 2396 | security_task_getsecid(t, &ctx->target_sid); |
| 2388 | memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); | 2397 | memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); |
| 2389 | return 0; | 2398 | return 0; |
| 2390 | } | 2399 | } |
| @@ -2405,7 +2414,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
| 2405 | axp->target_auid[axp->pid_count] = audit_get_loginuid(t); | 2414 | axp->target_auid[axp->pid_count] = audit_get_loginuid(t); |
| 2406 | axp->target_uid[axp->pid_count] = t->uid; | 2415 | axp->target_uid[axp->pid_count] = t->uid; |
| 2407 | axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); | 2416 | axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); |
| 2408 | selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); | 2417 | security_task_getsecid(t, &axp->target_sid[axp->pid_count]); |
| 2409 | memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); | 2418 | memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); |
| 2410 | axp->pid_count++; | 2419 | axp->pid_count++; |
| 2411 | 2420 | ||
| @@ -2435,16 +2444,17 @@ void audit_core_dumps(long signr) | |||
| 2435 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2444 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
| 2436 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", | 2445 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", |
| 2437 | auid, current->uid, current->gid, sessionid); | 2446 | auid, current->uid, current->gid, sessionid); |
| 2438 | selinux_get_task_sid(current, &sid); | 2447 | security_task_getsecid(current, &sid); |
| 2439 | if (sid) { | 2448 | if (sid) { |
| 2440 | char *ctx = NULL; | 2449 | char *ctx = NULL; |
| 2441 | u32 len; | 2450 | u32 len; |
| 2442 | 2451 | ||
| 2443 | if (selinux_sid_to_string(sid, &ctx, &len)) | 2452 | if (security_secid_to_secctx(sid, &ctx, &len)) |
| 2444 | audit_log_format(ab, " ssid=%u", sid); | 2453 | audit_log_format(ab, " ssid=%u", sid); |
| 2445 | else | 2454 | else { |
| 2446 | audit_log_format(ab, " subj=%s", ctx); | 2455 | audit_log_format(ab, " subj=%s", ctx); |
| 2447 | kfree(ctx); | 2456 | security_release_secctx(ctx, len); |
| 2457 | } | ||
| 2448 | } | 2458 | } |
| 2449 | audit_log_format(ab, " pid=%d comm=", current->pid); | 2459 | audit_log_format(ab, " pid=%d comm=", current->pid); |
| 2450 | audit_log_untrustedstring(ab, current->comm); | 2460 | audit_log_untrustedstring(ab, current->comm); |
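Across auditsc.c every selinux_* call becomes a security_* call, which is what makes the audit code LSM-agnostic: the security_*() entry points are thin dispatchers to whichever module registered its operations. A rough sketch, not copied from the tree, of what those wrappers in security/security.c look like (signatures follow the call sites in this diff; the security_ops member names are assumptions):

        #include <linux/security.h>

        /* hand-waved: security_ops points at the registered LSM's hook table */

        void security_task_getsecid(struct task_struct *p, u32 *secid)
        {
                security_ops->task_getsecid(p, secid);
        }

        int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
        {
                return security_ops->secid_to_secctx(secid, secdata, seclen);
        }

        void security_release_secctx(char *secdata, u32 seclen)
        {
                security_ops->release_secctx(secdata, seclen);
        }

With these in place, the audit code above compiles and behaves the same whether SELinux, SMACK, or no LSM at all is active.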
diff --git a/kernel/bounds.c b/kernel/bounds.c new file mode 100644 index 000000000000..3c5301381837 --- /dev/null +++ b/kernel/bounds.c | |||
| @@ -0,0 +1,19 @@ | |||
| 1 | /* | ||
| 2 | * Generate definitions needed by the preprocessor. | ||
| 3 | * This code generates raw asm output which is post-processed | ||
| 4 | * to extract and format the required data. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #define __GENERATING_BOUNDS_H | ||
| 8 | /* Include headers that define the enum constants of interest */ | ||
| 9 | #include <linux/page-flags.h> | ||
| 10 | #include <linux/mmzone.h> | ||
| 11 | #include <linux/kbuild.h> | ||
| 12 | |||
| 13 | void foo(void) | ||
| 14 | { | ||
| 15 | /* The enum constants to put into include/linux/bounds.h */ | ||
| 16 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | ||
| 17 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | ||
| 18 | /* End of constants */ | ||
| 19 | } | ||
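kernel/bounds.c is never linked into the kernel; it is compiled to assembly only, and each DEFINE() emits a marker line that Kbuild post-processes into a #define in include/linux/bounds.h. A sketch of the macro it relies on, as conventionally defined in include/linux/kbuild.h (reproduced from memory, so treat the exact spelling as approximate):

        /* Emits a line like "->NR_PAGEFLAGS <value> __NR_PAGEFLAGS" into the
         * generated .s file; the build rule greps for the "->" markers and
         * rewrites them as "#define NR_PAGEFLAGS <value>" when producing
         * include/linux/bounds.h. */
        #define DEFINE(sym, val) \
                asm volatile("\n->" #sym " %0 " #val : : "i" (val))

        #define BLANK() asm volatile("\n->" : : )

The trick is that the values are compile-time constants ("i" constraint), so enum sizes like __NR_PAGEFLAGS become available to the preprocessor without hand-maintaining a header.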
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9c2fb01e89b..fbc6fc8949b4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -44,6 +44,7 @@ | |||
| 44 | #include <linux/kmod.h> | 44 | #include <linux/kmod.h> |
| 45 | #include <linux/delayacct.h> | 45 | #include <linux/delayacct.h> |
| 46 | #include <linux/cgroupstats.h> | 46 | #include <linux/cgroupstats.h> |
| 47 | #include <linux/hash.h> | ||
| 47 | 48 | ||
| 48 | #include <asm/atomic.h> | 49 | #include <asm/atomic.h> |
| 49 | 50 | ||
| @@ -118,17 +119,7 @@ static int root_count; | |||
| 118 | * be called. | 119 | * be called. |
| 119 | */ | 120 | */ |
| 120 | static int need_forkexit_callback; | 121 | static int need_forkexit_callback; |
| 121 | 122 | static int need_mm_owner_callback __read_mostly; | |
| 122 | /* bits in struct cgroup flags field */ | ||
| 123 | enum { | ||
| 124 | /* Control Group is dead */ | ||
| 125 | CGRP_REMOVED, | ||
| 126 | /* Control Group has previously had a child cgroup or a task, | ||
| 127 | * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ | ||
| 128 | CGRP_RELEASABLE, | ||
| 129 | /* Control Group requires release notifications to userspace */ | ||
| 130 | CGRP_NOTIFY_ON_RELEASE, | ||
| 131 | }; | ||
| 132 | 123 | ||
| 133 | /* convenient tests for these bits */ | 124 | /* convenient tests for these bits */ |
| 134 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 125 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
| @@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link; | |||
| 204 | static DEFINE_RWLOCK(css_set_lock); | 195 | static DEFINE_RWLOCK(css_set_lock); |
| 205 | static int css_set_count; | 196 | static int css_set_count; |
| 206 | 197 | ||
| 198 | /* hash table for cgroup groups. This speeds up the lookup of an | ||
| 199 | * existing css_set */ | ||
| 200 | #define CSS_SET_HASH_BITS 7 | ||
| 201 | #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) | ||
| 202 | static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; | ||
| 203 | |||
| 204 | static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | ||
| 205 | { | ||
| 206 | int i; | ||
| 207 | int index; | ||
| 208 | unsigned long tmp = 0UL; | ||
| 209 | |||
| 210 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) | ||
| 211 | tmp += (unsigned long)css[i]; | ||
| 212 | tmp = (tmp >> 16) ^ tmp; | ||
| 213 | |||
| 214 | index = hash_long(tmp, CSS_SET_HASH_BITS); | ||
| 215 | |||
| 216 | return &css_set_table[index]; | ||
| 217 | } | ||
| 218 | |||
| 207 | /* We don't maintain the lists running through each css_set to its | 219 | /* We don't maintain the lists running through each css_set to its |
| 208 | * task until after the first call to cgroup_iter_start(). This | 220 | * task until after the first call to cgroup_iter_start(). This |
| 209 | * reduces the fork()/exit() overhead for people who have cgroups | 221 | * reduces the fork()/exit() overhead for people who have cgroups |
| @@ -230,7 +242,7 @@ static int use_task_css_set_links; | |||
| 230 | static void unlink_css_set(struct css_set *cg) | 242 | static void unlink_css_set(struct css_set *cg) |
| 231 | { | 243 | { |
| 232 | write_lock(&css_set_lock); | 244 | write_lock(&css_set_lock); |
| 233 | list_del(&cg->list); | 245 | hlist_del(&cg->hlist); |
| 234 | css_set_count--; | 246 | css_set_count--; |
| 235 | while (!list_empty(&cg->cg_links)) { | 247 | while (!list_empty(&cg->cg_links)) { |
| 236 | struct cg_cgroup_link *link; | 248 | struct cg_cgroup_link *link; |
| @@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg) | |||
| 295 | /* | 307 | /* |
| 296 | * find_existing_css_set() is a helper for | 308 | * find_existing_css_set() is a helper for |
| 297 | * find_css_set(), and checks to see whether an existing | 309 | * find_css_set(), and checks to see whether an existing |
| 298 | * css_set is suitable. This currently walks a linked-list for | 310 | * css_set is suitable. |
| 299 | * simplicity; a later patch will use a hash table for better | ||
| 300 | * performance | ||
| 301 | * | 311 | * |
| 302 | * oldcg: the cgroup group that we're using before the cgroup | 312 | * oldcg: the cgroup group that we're using before the cgroup |
| 303 | * transition | 313 | * transition |
| @@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set( | |||
| 314 | { | 324 | { |
| 315 | int i; | 325 | int i; |
| 316 | struct cgroupfs_root *root = cgrp->root; | 326 | struct cgroupfs_root *root = cgrp->root; |
| 317 | struct list_head *l = &init_css_set.list; | 327 | struct hlist_head *hhead; |
| 328 | struct hlist_node *node; | ||
| 329 | struct css_set *cg; | ||
| 318 | 330 | ||
| 319 | /* Built the set of subsystem state objects that we want to | 331 | /* Built the set of subsystem state objects that we want to |
| 320 | * see in the new css_set */ | 332 | * see in the new css_set */ |
| @@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set( | |||
| 331 | } | 343 | } |
| 332 | } | 344 | } |
| 333 | 345 | ||
| 334 | /* Look through existing cgroup groups to find one to reuse */ | 346 | hhead = css_set_hash(template); |
| 335 | do { | 347 | hlist_for_each_entry(cg, node, hhead, hlist) { |
| 336 | struct css_set *cg = | ||
| 337 | list_entry(l, struct css_set, list); | ||
| 338 | |||
| 339 | if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { | 348 | if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { |
| 340 | /* All subsystems matched */ | 349 | /* All subsystems matched */ |
| 341 | return cg; | 350 | return cg; |
| 342 | } | 351 | } |
| 343 | /* Try the next cgroup group */ | 352 | } |
| 344 | l = l->next; | ||
| 345 | } while (l != &init_css_set.list); | ||
| 346 | 353 | ||
| 347 | /* No existing cgroup group matched */ | 354 | /* No existing cgroup group matched */ |
| 348 | return NULL; | 355 | return NULL; |
| @@ -404,6 +411,8 @@ static struct css_set *find_css_set( | |||
| 404 | struct list_head tmp_cg_links; | 411 | struct list_head tmp_cg_links; |
| 405 | struct cg_cgroup_link *link; | 412 | struct cg_cgroup_link *link; |
| 406 | 413 | ||
| 414 | struct hlist_head *hhead; | ||
| 415 | |||
| 407 | /* First see if we already have a cgroup group that matches | 416 | /* First see if we already have a cgroup group that matches |
| 408 | * the desired set */ | 417 | * the desired set */ |
| 409 | write_lock(&css_set_lock); | 418 | write_lock(&css_set_lock); |
| @@ -428,6 +437,7 @@ static struct css_set *find_css_set( | |||
| 428 | kref_init(&res->ref); | 437 | kref_init(&res->ref); |
| 429 | INIT_LIST_HEAD(&res->cg_links); | 438 | INIT_LIST_HEAD(&res->cg_links); |
| 430 | INIT_LIST_HEAD(&res->tasks); | 439 | INIT_LIST_HEAD(&res->tasks); |
| 440 | INIT_HLIST_NODE(&res->hlist); | ||
| 431 | 441 | ||
| 432 | /* Copy the set of subsystem state objects generated in | 442 | /* Copy the set of subsystem state objects generated in |
| 433 | * find_existing_css_set() */ | 443 | * find_existing_css_set() */ |
| @@ -467,9 +477,12 @@ static struct css_set *find_css_set( | |||
| 467 | 477 | ||
| 468 | BUG_ON(!list_empty(&tmp_cg_links)); | 478 | BUG_ON(!list_empty(&tmp_cg_links)); |
| 469 | 479 | ||
| 470 | /* Link this cgroup group into the list */ | ||
| 471 | list_add(&res->list, &init_css_set.list); | ||
| 472 | css_set_count++; | 480 | css_set_count++; |
| 481 | |||
| 482 | /* Add this cgroup group to the hash table */ | ||
| 483 | hhead = css_set_hash(res->subsys); | ||
| 484 | hlist_add_head(&res->hlist, hhead); | ||
| 485 | |||
| 473 | write_unlock(&css_set_lock); | 486 | write_unlock(&css_set_lock); |
| 474 | 487 | ||
| 475 | return res; | 488 | return res; |
| @@ -562,7 +575,7 @@ static struct inode_operations cgroup_dir_inode_operations; | |||
| 562 | static struct file_operations proc_cgroupstats_operations; | 575 | static struct file_operations proc_cgroupstats_operations; |
| 563 | 576 | ||
| 564 | static struct backing_dev_info cgroup_backing_dev_info = { | 577 | static struct backing_dev_info cgroup_backing_dev_info = { |
| 565 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, | 578 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
| 566 | }; | 579 | }; |
| 567 | 580 | ||
| 568 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | 581 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) |
| @@ -782,7 +795,14 @@ static int parse_cgroupfs_options(char *data, | |||
| 782 | if (!*token) | 795 | if (!*token) |
| 783 | return -EINVAL; | 796 | return -EINVAL; |
| 784 | if (!strcmp(token, "all")) { | 797 | if (!strcmp(token, "all")) { |
| 785 | opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1; | 798 | /* Add all non-disabled subsystems */ |
| 799 | int i; | ||
| 800 | opts->subsys_bits = 0; | ||
| 801 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
| 802 | struct cgroup_subsys *ss = subsys[i]; | ||
| 803 | if (!ss->disabled) | ||
| 804 | opts->subsys_bits |= 1ul << i; | ||
| 805 | } | ||
| 786 | } else if (!strcmp(token, "noprefix")) { | 806 | } else if (!strcmp(token, "noprefix")) { |
| 787 | set_bit(ROOT_NOPREFIX, &opts->flags); | 807 | set_bit(ROOT_NOPREFIX, &opts->flags); |
| 788 | } else if (!strncmp(token, "release_agent=", 14)) { | 808 | } else if (!strncmp(token, "release_agent=", 14)) { |
| @@ -800,7 +820,8 @@ static int parse_cgroupfs_options(char *data, | |||
| 800 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 820 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 801 | ss = subsys[i]; | 821 | ss = subsys[i]; |
| 802 | if (!strcmp(token, ss->name)) { | 822 | if (!strcmp(token, ss->name)) { |
| 803 | set_bit(i, &opts->subsys_bits); | 823 | if (!ss->disabled) |
| 824 | set_bit(i, &opts->subsys_bits); | ||
| 804 | break; | 825 | break; |
| 805 | } | 826 | } |
| 806 | } | 827 | } |
| @@ -940,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
| 940 | int ret = 0; | 961 | int ret = 0; |
| 941 | struct super_block *sb; | 962 | struct super_block *sb; |
| 942 | struct cgroupfs_root *root; | 963 | struct cgroupfs_root *root; |
| 943 | struct list_head tmp_cg_links, *l; | 964 | struct list_head tmp_cg_links; |
| 944 | INIT_LIST_HEAD(&tmp_cg_links); | 965 | INIT_LIST_HEAD(&tmp_cg_links); |
| 945 | 966 | ||
| 946 | /* First find the desired set of subsystems */ | 967 | /* First find the desired set of subsystems */ |
| @@ -982,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
| 982 | /* New superblock */ | 1003 | /* New superblock */ |
| 983 | struct cgroup *cgrp = &root->top_cgroup; | 1004 | struct cgroup *cgrp = &root->top_cgroup; |
| 984 | struct inode *inode; | 1005 | struct inode *inode; |
| 1006 | int i; | ||
| 985 | 1007 | ||
| 986 | BUG_ON(sb->s_root != NULL); | 1008 | BUG_ON(sb->s_root != NULL); |
| 987 | 1009 | ||
| @@ -1026,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
| 1026 | /* Link the top cgroup in this hierarchy into all | 1048 | /* Link the top cgroup in this hierarchy into all |
| 1027 | * the css_set objects */ | 1049 | * the css_set objects */ |
| 1028 | write_lock(&css_set_lock); | 1050 | write_lock(&css_set_lock); |
| 1029 | l = &init_css_set.list; | 1051 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { |
| 1030 | do { | 1052 | struct hlist_head *hhead = &css_set_table[i]; |
| 1053 | struct hlist_node *node; | ||
| 1031 | struct css_set *cg; | 1054 | struct css_set *cg; |
| 1032 | struct cg_cgroup_link *link; | 1055 | |
| 1033 | cg = list_entry(l, struct css_set, list); | 1056 | hlist_for_each_entry(cg, node, hhead, hlist) { |
| 1034 | BUG_ON(list_empty(&tmp_cg_links)); | 1057 | struct cg_cgroup_link *link; |
| 1035 | link = list_entry(tmp_cg_links.next, | 1058 | |
| 1036 | struct cg_cgroup_link, | 1059 | BUG_ON(list_empty(&tmp_cg_links)); |
| 1037 | cgrp_link_list); | 1060 | link = list_entry(tmp_cg_links.next, |
| 1038 | list_del(&link->cgrp_link_list); | 1061 | struct cg_cgroup_link, |
| 1039 | link->cg = cg; | 1062 | cgrp_link_list); |
| 1040 | list_add(&link->cgrp_link_list, | 1063 | list_del(&link->cgrp_link_list); |
| 1041 | &root->top_cgroup.css_sets); | 1064 | link->cg = cg; |
| 1042 | list_add(&link->cg_link_list, &cg->cg_links); | 1065 | list_add(&link->cgrp_link_list, |
| 1043 | l = l->next; | 1066 | &root->top_cgroup.css_sets); |
| 1044 | } while (l != &init_css_set.list); | 1067 | list_add(&link->cg_link_list, &cg->cg_links); |
| 1068 | } | ||
| 1069 | } | ||
| 1045 | write_unlock(&css_set_lock); | 1070 | write_unlock(&css_set_lock); |
| 1046 | 1071 | ||
| 1047 | free_cg_links(&tmp_cg_links); | 1072 | free_cg_links(&tmp_cg_links); |
| @@ -1299,18 +1324,16 @@ enum cgroup_filetype { | |||
| 1299 | FILE_DIR, | 1324 | FILE_DIR, |
| 1300 | FILE_TASKLIST, | 1325 | FILE_TASKLIST, |
| 1301 | FILE_NOTIFY_ON_RELEASE, | 1326 | FILE_NOTIFY_ON_RELEASE, |
| 1302 | FILE_RELEASABLE, | ||
| 1303 | FILE_RELEASE_AGENT, | 1327 | FILE_RELEASE_AGENT, |
| 1304 | }; | 1328 | }; |
| 1305 | 1329 | ||
| 1306 | static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, | 1330 | static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, |
| 1307 | struct file *file, | 1331 | struct file *file, |
| 1308 | const char __user *userbuf, | 1332 | const char __user *userbuf, |
| 1309 | size_t nbytes, loff_t *unused_ppos) | 1333 | size_t nbytes, loff_t *unused_ppos) |
| 1310 | { | 1334 | { |
| 1311 | char buffer[64]; | 1335 | char buffer[64]; |
| 1312 | int retval = 0; | 1336 | int retval = 0; |
| 1313 | u64 val; | ||
| 1314 | char *end; | 1337 | char *end; |
| 1315 | 1338 | ||
| 1316 | if (!nbytes) | 1339 | if (!nbytes) |
| @@ -1321,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, | |||
| 1321 | return -EFAULT; | 1344 | return -EFAULT; |
| 1322 | 1345 | ||
| 1323 | buffer[nbytes] = 0; /* nul-terminate */ | 1346 | buffer[nbytes] = 0; /* nul-terminate */ |
| 1324 | 1347 | strstrip(buffer); | |
| 1325 | /* strip newline if necessary */ | 1348 | if (cft->write_u64) { |
| 1326 | if (nbytes && (buffer[nbytes-1] == '\n')) | 1349 | u64 val = simple_strtoull(buffer, &end, 0); |
| 1327 | buffer[nbytes-1] = 0; | 1350 | if (*end) |
| 1328 | val = simple_strtoull(buffer, &end, 0); | 1351 | return -EINVAL; |
| 1329 | if (*end) | 1352 | retval = cft->write_u64(cgrp, cft, val); |
| 1330 | return -EINVAL; | 1353 | } else { |
| 1331 | 1354 | s64 val = simple_strtoll(buffer, &end, 0); | |
| 1332 | /* Pass to subsystem */ | 1355 | if (*end) |
| 1333 | retval = cft->write_uint(cgrp, cft, val); | 1356 | return -EINVAL; |
| 1357 | retval = cft->write_s64(cgrp, cft, val); | ||
| 1358 | } | ||
| 1334 | if (!retval) | 1359 | if (!retval) |
| 1335 | retval = nbytes; | 1360 | retval = nbytes; |
| 1336 | return retval; | 1361 | return retval; |
| @@ -1411,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | |||
| 1411 | return -ENODEV; | 1436 | return -ENODEV; |
| 1412 | if (cft->write) | 1437 | if (cft->write) |
| 1413 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 1438 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); |
| 1414 | if (cft->write_uint) | 1439 | if (cft->write_u64 || cft->write_s64) |
| 1415 | return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); | 1440 | return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); |
| 1441 | if (cft->trigger) { | ||
| 1442 | int ret = cft->trigger(cgrp, (unsigned int)cft->private); | ||
| 1443 | return ret ? ret : nbytes; | ||
| 1444 | } | ||
| 1416 | return -EINVAL; | 1445 | return -EINVAL; |
| 1417 | } | 1446 | } |
| 1418 | 1447 | ||
| 1419 | static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, | 1448 | static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, |
| 1420 | struct file *file, | 1449 | struct file *file, |
| 1421 | char __user *buf, size_t nbytes, | 1450 | char __user *buf, size_t nbytes, |
| 1422 | loff_t *ppos) | 1451 | loff_t *ppos) |
| 1423 | { | 1452 | { |
| 1424 | char tmp[64]; | 1453 | char tmp[64]; |
| 1425 | u64 val = cft->read_uint(cgrp, cft); | 1454 | u64 val = cft->read_u64(cgrp, cft); |
| 1426 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); | 1455 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); |
| 1427 | 1456 | ||
| 1428 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 1457 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
| 1429 | } | 1458 | } |
| 1430 | 1459 | ||
| 1460 | static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, | ||
| 1461 | struct file *file, | ||
| 1462 | char __user *buf, size_t nbytes, | ||
| 1463 | loff_t *ppos) | ||
| 1464 | { | ||
| 1465 | char tmp[64]; | ||
| 1466 | s64 val = cft->read_s64(cgrp, cft); | ||
| 1467 | int len = sprintf(tmp, "%lld\n", (long long) val); | ||
| 1468 | |||
| 1469 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | ||
| 1470 | } | ||
| 1471 | |||
| 1431 | static ssize_t cgroup_common_file_read(struct cgroup *cgrp, | 1472 | static ssize_t cgroup_common_file_read(struct cgroup *cgrp, |
| 1432 | struct cftype *cft, | 1473 | struct cftype *cft, |
| 1433 | struct file *file, | 1474 | struct file *file, |
| @@ -1482,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
| 1482 | 1523 | ||
| 1483 | if (cft->read) | 1524 | if (cft->read) |
| 1484 | return cft->read(cgrp, cft, file, buf, nbytes, ppos); | 1525 | return cft->read(cgrp, cft, file, buf, nbytes, ppos); |
| 1485 | if (cft->read_uint) | 1526 | if (cft->read_u64) |
| 1486 | return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); | 1527 | return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); |
| 1528 | if (cft->read_s64) | ||
| 1529 | return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); | ||
| 1487 | return -EINVAL; | 1530 | return -EINVAL; |
| 1488 | } | 1531 | } |
| 1489 | 1532 | ||
| 1533 | /* | ||
| 1534 | * seqfile ops/methods for returning structured data. Currently just | ||
| 1535 | * supports string->u64 maps, but can be extended in future. | ||
| 1536 | */ | ||
| 1537 | |||
| 1538 | struct cgroup_seqfile_state { | ||
| 1539 | struct cftype *cft; | ||
| 1540 | struct cgroup *cgroup; | ||
| 1541 | }; | ||
| 1542 | |||
| 1543 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | ||
| 1544 | { | ||
| 1545 | struct seq_file *sf = cb->state; | ||
| 1546 | return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); | ||
| 1547 | } | ||
| 1548 | |||
| 1549 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) | ||
| 1550 | { | ||
| 1551 | struct cgroup_seqfile_state *state = m->private; | ||
| 1552 | struct cftype *cft = state->cft; | ||
| 1553 | if (cft->read_map) { | ||
| 1554 | struct cgroup_map_cb cb = { | ||
| 1555 | .fill = cgroup_map_add, | ||
| 1556 | .state = m, | ||
| 1557 | }; | ||
| 1558 | return cft->read_map(state->cgroup, cft, &cb); | ||
| 1559 | } | ||
| 1560 | return cft->read_seq_string(state->cgroup, cft, m); | ||
| 1561 | } | ||
| 1562 | |||
| 1563 | int cgroup_seqfile_release(struct inode *inode, struct file *file) | ||
| 1564 | { | ||
| 1565 | struct seq_file *seq = file->private_data; | ||
| 1566 | kfree(seq->private); | ||
| 1567 | return single_release(inode, file); | ||
| 1568 | } | ||
| 1569 | |||
| 1570 | static struct file_operations cgroup_seqfile_operations = { | ||
| 1571 | .read = seq_read, | ||
| 1572 | .llseek = seq_lseek, | ||
| 1573 | .release = cgroup_seqfile_release, | ||
| 1574 | }; | ||
| 1575 | |||
| 1490 | static int cgroup_file_open(struct inode *inode, struct file *file) | 1576 | static int cgroup_file_open(struct inode *inode, struct file *file) |
| 1491 | { | 1577 | { |
| 1492 | int err; | 1578 | int err; |
| @@ -1499,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) | |||
| 1499 | cft = __d_cft(file->f_dentry); | 1585 | cft = __d_cft(file->f_dentry); |
| 1500 | if (!cft) | 1586 | if (!cft) |
| 1501 | return -ENODEV; | 1587 | return -ENODEV; |
| 1502 | if (cft->open) | 1588 | if (cft->read_map || cft->read_seq_string) { |
| 1589 | struct cgroup_seqfile_state *state = | ||
| 1590 | kzalloc(sizeof(*state), GFP_USER); | ||
| 1591 | if (!state) | ||
| 1592 | return -ENOMEM; | ||
| 1593 | state->cft = cft; | ||
| 1594 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); | ||
| 1595 | file->f_op = &cgroup_seqfile_operations; | ||
| 1596 | err = single_open(file, cgroup_seqfile_show, state); | ||
| 1597 | if (err < 0) | ||
| 1598 | kfree(state); | ||
| 1599 | } else if (cft->open) | ||
| 1503 | err = cft->open(inode, file); | 1600 | err = cft->open(inode, file); |
| 1504 | else | 1601 | else |
| 1505 | err = 0; | 1602 | err = 0; |
| @@ -1707,14 +1804,19 @@ static void cgroup_advance_iter(struct cgroup *cgrp, | |||
| 1707 | * The tasklist_lock is not held here, as do_each_thread() and | 1804 | * The tasklist_lock is not held here, as do_each_thread() and |
| 1708 | * while_each_thread() are protected by RCU. | 1805 | * while_each_thread() are protected by RCU. |
| 1709 | */ | 1806 | */ |
| 1710 | void cgroup_enable_task_cg_lists(void) | 1807 | static void cgroup_enable_task_cg_lists(void) |
| 1711 | { | 1808 | { |
| 1712 | struct task_struct *p, *g; | 1809 | struct task_struct *p, *g; |
| 1713 | write_lock(&css_set_lock); | 1810 | write_lock(&css_set_lock); |
| 1714 | use_task_css_set_links = 1; | 1811 | use_task_css_set_links = 1; |
| 1715 | do_each_thread(g, p) { | 1812 | do_each_thread(g, p) { |
| 1716 | task_lock(p); | 1813 | task_lock(p); |
| 1717 | if (list_empty(&p->cg_list)) | 1814 | /* |
| 1815 | * We should check if the process is exiting, otherwise | ||
| 1816 | * it will race with cgroup_exit() in that the list | ||
| 1817 | * entry won't be deleted though the process has exited. | ||
| 1818 | */ | ||
| 1819 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) | ||
| 1718 | list_add(&p->cg_list, &p->cgroups->tasks); | 1820 | list_add(&p->cg_list, &p->cgroups->tasks); |
| 1719 | task_unlock(p); | 1821 | task_unlock(p); |
| 1720 | } while_each_thread(g, p); | 1822 | } while_each_thread(g, p); |
| @@ -1900,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
| 1900 | 2002 | ||
| 1901 | if (heap->size) { | 2003 | if (heap->size) { |
| 1902 | for (i = 0; i < heap->size; i++) { | 2004 | for (i = 0; i < heap->size; i++) { |
| 1903 | struct task_struct *p = heap->ptrs[i]; | 2005 | struct task_struct *q = heap->ptrs[i]; |
| 1904 | if (i == 0) { | 2006 | if (i == 0) { |
| 1905 | latest_time = p->start_time; | 2007 | latest_time = q->start_time; |
| 1906 | latest_task = p; | 2008 | latest_task = q; |
| 1907 | } | 2009 | } |
| 1908 | /* Process the task per the caller's callback */ | 2010 | /* Process the task per the caller's callback */ |
| 1909 | scan->process_task(p, scan); | 2011 | scan->process_task(q, scan); |
| 1910 | put_task_struct(p); | 2012 | put_task_struct(q); |
| 1911 | } | 2013 | } |
| 1912 | /* | 2014 | /* |
| 1913 | * If we had to process any tasks at all, scan again | 2015 | * If we had to process any tasks at all, scan again |
| @@ -2082,7 +2184,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) | |||
| 2082 | 2184 | ||
| 2083 | kfree(pidarray); | 2185 | kfree(pidarray); |
| 2084 | } else { | 2186 | } else { |
| 2085 | ctr->buf = 0; | 2187 | ctr->buf = NULL; |
| 2086 | ctr->bufsz = 0; | 2188 | ctr->bufsz = 0; |
| 2087 | } | 2189 | } |
| 2088 | file->private_data = ctr; | 2190 | file->private_data = ctr; |
| @@ -2125,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, | |||
| 2125 | return notify_on_release(cgrp); | 2227 | return notify_on_release(cgrp); |
| 2126 | } | 2228 | } |
| 2127 | 2229 | ||
| 2128 | static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft) | ||
| 2129 | { | ||
| 2130 | return test_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
| 2131 | } | ||
| 2132 | |||
| 2133 | /* | 2230 | /* |
| 2134 | * for the common functions, 'private' gives the type of file | 2231 | * for the common functions, 'private' gives the type of file |
| 2135 | */ | 2232 | */ |
| @@ -2145,16 +2242,10 @@ static struct cftype files[] = { | |||
| 2145 | 2242 | ||
| 2146 | { | 2243 | { |
| 2147 | .name = "notify_on_release", | 2244 | .name = "notify_on_release", |
| 2148 | .read_uint = cgroup_read_notify_on_release, | 2245 | .read_u64 = cgroup_read_notify_on_release, |
| 2149 | .write = cgroup_common_file_write, | 2246 | .write = cgroup_common_file_write, |
| 2150 | .private = FILE_NOTIFY_ON_RELEASE, | 2247 | .private = FILE_NOTIFY_ON_RELEASE, |
| 2151 | }, | 2248 | }, |
| 2152 | |||
| 2153 | { | ||
| 2154 | .name = "releasable", | ||
| 2155 | .read_uint = cgroup_read_releasable, | ||
| 2156 | .private = FILE_RELEASABLE, | ||
| 2157 | } | ||
| 2158 | }; | 2249 | }; |
| 2159 | 2250 | ||
| 2160 | static struct cftype cft_release_agent = { | 2251 | static struct cftype cft_release_agent = { |
| @@ -2388,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 2388 | return 0; | 2479 | return 0; |
| 2389 | } | 2480 | } |
| 2390 | 2481 | ||
| 2391 | static void cgroup_init_subsys(struct cgroup_subsys *ss) | 2482 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) |
| 2392 | { | 2483 | { |
| 2393 | struct cgroup_subsys_state *css; | 2484 | struct cgroup_subsys_state *css; |
| 2394 | struct list_head *l; | ||
| 2395 | 2485 | ||
| 2396 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 2486 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
| 2397 | 2487 | ||
| @@ -2402,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) | |||
| 2402 | BUG_ON(IS_ERR(css)); | 2492 | BUG_ON(IS_ERR(css)); |
| 2403 | init_cgroup_css(css, ss, dummytop); | 2493 | init_cgroup_css(css, ss, dummytop); |
| 2404 | 2494 | ||
| 2405 | /* Update all cgroup groups to contain a subsys | 2495 | /* Update the init_css_set to contain a subsys |
| 2406 | * pointer to this state - since the subsystem is | 2496 | * pointer to this state - since the subsystem is |
| 2407 | * newly registered, all tasks and hence all cgroup | 2497 | * newly registered, all tasks and hence the |
| 2408 | * groups are in the subsystem's top cgroup. */ | 2498 | * init_css_set is in the subsystem's top cgroup. */ |
| 2409 | write_lock(&css_set_lock); | 2499 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; |
| 2410 | l = &init_css_set.list; | ||
| 2411 | do { | ||
| 2412 | struct css_set *cg = | ||
| 2413 | list_entry(l, struct css_set, list); | ||
| 2414 | cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; | ||
| 2415 | l = l->next; | ||
| 2416 | } while (l != &init_css_set.list); | ||
| 2417 | write_unlock(&css_set_lock); | ||
| 2418 | |||
| 2419 | /* If this subsystem requested that it be notified with fork | ||
| 2420 | * events, we should send it one now for every process in the | ||
| 2421 | * system */ | ||
| 2422 | if (ss->fork) { | ||
| 2423 | struct task_struct *g, *p; | ||
| 2424 | |||
| 2425 | read_lock(&tasklist_lock); | ||
| 2426 | do_each_thread(g, p) { | ||
| 2427 | ss->fork(ss, p); | ||
| 2428 | } while_each_thread(g, p); | ||
| 2429 | read_unlock(&tasklist_lock); | ||
| 2430 | } | ||
| 2431 | 2500 | ||
| 2432 | need_forkexit_callback |= ss->fork || ss->exit; | 2501 | need_forkexit_callback |= ss->fork || ss->exit; |
| 2502 | need_mm_owner_callback |= !!ss->mm_owner_changed; | ||
| 2503 | |||
| 2504 | /* At system boot, before all subsystems have been | ||
| 2505 | * registered, no tasks have been forked, so we don't | ||
| 2506 | * need to invoke fork callbacks here. */ | ||
| 2507 | BUG_ON(!list_empty(&init_task.tasks)); | ||
| 2433 | 2508 | ||
| 2434 | ss->active = 1; | 2509 | ss->active = 1; |
| 2435 | } | 2510 | } |
| @@ -2445,9 +2520,9 @@ int __init cgroup_init_early(void) | |||
| 2445 | int i; | 2520 | int i; |
| 2446 | kref_init(&init_css_set.ref); | 2521 | kref_init(&init_css_set.ref); |
| 2447 | kref_get(&init_css_set.ref); | 2522 | kref_get(&init_css_set.ref); |
| 2448 | INIT_LIST_HEAD(&init_css_set.list); | ||
| 2449 | INIT_LIST_HEAD(&init_css_set.cg_links); | 2523 | INIT_LIST_HEAD(&init_css_set.cg_links); |
| 2450 | INIT_LIST_HEAD(&init_css_set.tasks); | 2524 | INIT_LIST_HEAD(&init_css_set.tasks); |
| 2525 | INIT_HLIST_NODE(&init_css_set.hlist); | ||
| 2451 | css_set_count = 1; | 2526 | css_set_count = 1; |
| 2452 | init_cgroup_root(&rootnode); | 2527 | init_cgroup_root(&rootnode); |
| 2453 | list_add(&rootnode.root_list, &roots); | 2528 | list_add(&rootnode.root_list, &roots); |
| @@ -2460,6 +2535,9 @@ int __init cgroup_init_early(void) | |||
| 2460 | list_add(&init_css_set_link.cg_link_list, | 2535 | list_add(&init_css_set_link.cg_link_list, |
| 2461 | &init_css_set.cg_links); | 2536 | &init_css_set.cg_links); |
| 2462 | 2537 | ||
| 2538 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | ||
| 2539 | INIT_HLIST_HEAD(&css_set_table[i]); | ||
| 2540 | |||
| 2463 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 2541 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 2464 | struct cgroup_subsys *ss = subsys[i]; | 2542 | struct cgroup_subsys *ss = subsys[i]; |
| 2465 | 2543 | ||
| @@ -2489,7 +2567,7 @@ int __init cgroup_init(void) | |||
| 2489 | { | 2567 | { |
| 2490 | int err; | 2568 | int err; |
| 2491 | int i; | 2569 | int i; |
| 2492 | struct proc_dir_entry *entry; | 2570 | struct hlist_head *hhead; |
| 2493 | 2571 | ||
| 2494 | err = bdi_init(&cgroup_backing_dev_info); | 2572 | err = bdi_init(&cgroup_backing_dev_info); |
| 2495 | if (err) | 2573 | if (err) |
| @@ -2501,13 +2579,15 @@ int __init cgroup_init(void) | |||
| 2501 | cgroup_init_subsys(ss); | 2579 | cgroup_init_subsys(ss); |
| 2502 | } | 2580 | } |
| 2503 | 2581 | ||
| 2582 | /* Add init_css_set to the hash table */ | ||
| 2583 | hhead = css_set_hash(init_css_set.subsys); | ||
| 2584 | hlist_add_head(&init_css_set.hlist, hhead); | ||
| 2585 | |||
| 2504 | err = register_filesystem(&cgroup_fs_type); | 2586 | err = register_filesystem(&cgroup_fs_type); |
| 2505 | if (err < 0) | 2587 | if (err < 0) |
| 2506 | goto out; | 2588 | goto out; |
| 2507 | 2589 | ||
| 2508 | entry = create_proc_entry("cgroups", 0, NULL); | 2590 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); |
| 2509 | if (entry) | ||
| 2510 | entry->proc_fops = &proc_cgroupstats_operations; | ||
| 2511 | 2591 | ||
| 2512 | out: | 2592 | out: |
| 2513 | if (err) | 2593 | if (err) |
| @@ -2561,6 +2641,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v) | |||
| 2561 | /* Skip this hierarchy if it has no active subsystems */ | 2641 | /* Skip this hierarchy if it has no active subsystems */ |
| 2562 | if (!root->actual_subsys_bits) | 2642 | if (!root->actual_subsys_bits) |
| 2563 | continue; | 2643 | continue; |
| 2644 | seq_printf(m, "%lu:", root->subsys_bits); | ||
| 2564 | for_each_subsys(root, ss) | 2645 | for_each_subsys(root, ss) |
| 2565 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 2646 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
| 2566 | seq_putc(m, ':'); | 2647 | seq_putc(m, ':'); |
| @@ -2600,13 +2681,13 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
| 2600 | { | 2681 | { |
| 2601 | int i; | 2682 | int i; |
| 2602 | 2683 | ||
| 2603 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n"); | 2684 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); |
| 2604 | mutex_lock(&cgroup_mutex); | 2685 | mutex_lock(&cgroup_mutex); |
| 2605 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 2686 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 2606 | struct cgroup_subsys *ss = subsys[i]; | 2687 | struct cgroup_subsys *ss = subsys[i]; |
| 2607 | seq_printf(m, "%s\t%lu\t%d\n", | 2688 | seq_printf(m, "%s\t%lu\t%d\t%d\n", |
| 2608 | ss->name, ss->root->subsys_bits, | 2689 | ss->name, ss->root->subsys_bits, |
| 2609 | ss->root->number_of_cgroups); | 2690 | ss->root->number_of_cgroups, !ss->disabled); |
| 2610 | } | 2691 | } |
| 2611 | mutex_unlock(&cgroup_mutex); | 2692 | mutex_unlock(&cgroup_mutex); |
| 2612 | return 0; | 2693 | return 0; |
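A note on the hunk above: /proc/cgroups grows a fourth, tab-separated "enabled" column driven by the new ss->disabled flag, so a subsystem switched off on the command line is still listed but reported as 0. Illustrative output only - the hierarchy IDs and cgroup counts below are made up, not taken from the patch:

    #subsys_name    hierarchy   num_cgroups enabled
    cpuset          0           1           1
    debug           0           1           0
    ns              0           1           1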
| @@ -2614,7 +2695,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
| 2614 | 2695 | ||
| 2615 | static int cgroupstats_open(struct inode *inode, struct file *file) | 2696 | static int cgroupstats_open(struct inode *inode, struct file *file) |
| 2616 | { | 2697 | { |
| 2617 | return single_open(file, proc_cgroupstats_show, 0); | 2698 | return single_open(file, proc_cgroupstats_show, NULL); |
| 2618 | } | 2699 | } |
| 2619 | 2700 | ||
| 2620 | static struct file_operations proc_cgroupstats_operations = { | 2701 | static struct file_operations proc_cgroupstats_operations = { |
| @@ -2669,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child) | |||
| 2669 | } | 2750 | } |
| 2670 | } | 2751 | } |
| 2671 | 2752 | ||
| 2753 | #ifdef CONFIG_MM_OWNER | ||
| 2754 | /** | ||
| 2755 | * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes | ||
| 2756 | * @p: the new owner | ||
| 2757 | * | ||
| 2758 | * Called on every change to mm->owner. mm_init_owner() does not | ||
| 2759 | * invoke this routine, since it assigns the mm->owner the first time | ||
| 2760 | * and does not change it. | ||
| 2761 | */ | ||
| 2762 | void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) | ||
| 2763 | { | ||
| 2764 | struct cgroup *oldcgrp, *newcgrp; | ||
| 2765 | |||
| 2766 | if (need_mm_owner_callback) { | ||
| 2767 | int i; | ||
| 2768 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
| 2769 | struct cgroup_subsys *ss = subsys[i]; | ||
| 2770 | oldcgrp = task_cgroup(old, ss->subsys_id); | ||
| 2771 | newcgrp = task_cgroup(new, ss->subsys_id); | ||
| 2772 | if (oldcgrp == newcgrp) | ||
| 2773 | continue; | ||
| 2774 | if (ss->mm_owner_changed) | ||
| 2775 | ss->mm_owner_changed(ss, oldcgrp, newcgrp); | ||
| 2776 | } | ||
| 2777 | } | ||
| 2778 | } | ||
| 2779 | #endif /* CONFIG_MM_OWNER */ | ||
| 2780 | |||
| 2672 | /** | 2781 | /** |
| 2673 | * cgroup_post_fork - called on a new task after adding it to the task list | 2782 | * cgroup_post_fork - called on a new task after adding it to the task list |
| 2674 | * @child: the task in question | 2783 | * @child: the task in question |
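The new cgroup_mm_owner_callbacks() hook above only tells interested subsystems that mm->owner now points at a task in a different cgroup; what they do with that is up to them. A minimal sketch of a subsystem-side handler - the "foo" subsystem and its printk body are purely hypothetical, only the callback signature comes from the patch:

    static void foo_mm_owner_changed(struct cgroup_subsys *ss,
                                     struct cgroup *old, struct cgroup *new)
    {
        /* reached only when old != new; the equal case is filtered out
         * by cgroup_mm_owner_callbacks() before this is called */
        printk(KERN_DEBUG "foo: mm ownership moved to another cgroup\n");
    }

Such a handler is wired up through .mm_owner_changed in the subsystem's cgroup_subsys, which is what flips need_mm_owner_callback in cgroup_init_subsys().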
| @@ -3010,3 +3119,27 @@ static void cgroup_release_agent(struct work_struct *work) | |||
| 3010 | spin_unlock(&release_list_lock); | 3119 | spin_unlock(&release_list_lock); |
| 3011 | mutex_unlock(&cgroup_mutex); | 3120 | mutex_unlock(&cgroup_mutex); |
| 3012 | } | 3121 | } |
| 3122 | |||
| 3123 | static int __init cgroup_disable(char *str) | ||
| 3124 | { | ||
| 3125 | int i; | ||
| 3126 | char *token; | ||
| 3127 | |||
| 3128 | while ((token = strsep(&str, ",")) != NULL) { | ||
| 3129 | if (!*token) | ||
| 3130 | continue; | ||
| 3131 | |||
| 3132 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
| 3133 | struct cgroup_subsys *ss = subsys[i]; | ||
| 3134 | |||
| 3135 | if (!strcmp(token, ss->name)) { | ||
| 3136 | ss->disabled = 1; | ||
| 3137 | printk(KERN_INFO "Disabling %s control group" | ||
| 3138 | " subsystem\n", ss->name); | ||
| 3139 | break; | ||
| 3140 | } | ||
| 3141 | } | ||
| 3142 | } | ||
| 3143 | return 1; | ||
| 3144 | } | ||
| 3145 | __setup("cgroup_disable=", cgroup_disable); | ||
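The cgroup_disable= handler added above is plain strsep() tokenizing over a comma-separated list, matching each token against the compiled-in subsystem names and setting ss->disabled. A small userspace sketch of the same parsing logic - the subsystem table, the demo command line, and main() are illustrative stand-ins, not kernel code:

    #define _DEFAULT_SOURCE         /* for strsep() in glibc */
    #include <stdio.h>
    #include <string.h>

    static const char *names[] = { "cpuset", "debug", "ns" };
    static int disabled[3];

    static void parse_cgroup_disable(char *str)
    {
        char *token;
        int i;

        while ((token = strsep(&str, ",")) != NULL) {
            if (!*token)            /* skip empty fields, e.g. "a,,b" */
                continue;
            for (i = 0; i < 3; i++)
                if (!strcmp(token, names[i]))
                    disabled[i] = 1;
        }
    }

    int main(void)
    {
        char arg[] = "debug,ns";    /* as passed via cgroup_disable=debug,ns */
        int i;

        parse_cgroup_disable(arg);
        for (i = 0; i < 3; i++)
            printf("%s\t%s\n", names[i], disabled[i] ? "disabled" : "enabled");
        return 0;
    }

The in-kernel version only prints the "Disabling ..." message and sets the flag; whether a disabled subsystem is then skipped is decided where hierarchies are mounted, outside this hunk.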
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index 37301e877cb0..c3dc3aba4c02 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * kernel/ccontainer_debug.c - Example cgroup subsystem that | 2 | * kernel/cgroup_debug.c - Example cgroup subsystem that |
| 3 | * exposes debug info | 3 | * exposes debug info |
| 4 | * | 4 | * |
| 5 | * Copyright (C) Google Inc, 2007 | 5 | * Copyright (C) Google Inc, 2007 |
| @@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, | |||
| 62 | return count; | 62 | return count; |
| 63 | } | 63 | } |
| 64 | 64 | ||
| 65 | static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) | ||
| 66 | { | ||
| 67 | return test_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
| 68 | } | ||
| 69 | |||
| 65 | static struct cftype files[] = { | 70 | static struct cftype files[] = { |
| 66 | { | 71 | { |
| 67 | .name = "cgroup_refcount", | 72 | .name = "cgroup_refcount", |
| 68 | .read_uint = cgroup_refcount_read, | 73 | .read_u64 = cgroup_refcount_read, |
| 69 | }, | 74 | }, |
| 70 | { | 75 | { |
| 71 | .name = "taskcount", | 76 | .name = "taskcount", |
| 72 | .read_uint = taskcount_read, | 77 | .read_u64 = taskcount_read, |
| 73 | }, | 78 | }, |
| 74 | 79 | ||
| 75 | { | 80 | { |
| 76 | .name = "current_css_set", | 81 | .name = "current_css_set", |
| 77 | .read_uint = current_css_set_read, | 82 | .read_u64 = current_css_set_read, |
| 78 | }, | 83 | }, |
| 79 | 84 | ||
| 80 | { | 85 | { |
| 81 | .name = "current_css_set_refcount", | 86 | .name = "current_css_set_refcount", |
| 82 | .read_uint = current_css_set_refcount_read, | 87 | .read_u64 = current_css_set_refcount_read, |
| 83 | }, | 88 | }, |
| 89 | |||
| 90 | { | ||
| 91 | .name = "releasable", | ||
| 92 | .read_u64 = releasable_read, | ||
| 93 | } | ||
| 84 | }; | 94 | }; |
| 85 | 95 | ||
| 86 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 96 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
diff --git a/kernel/compat.c b/kernel/compat.c index 5f0e201bcfd3..32c254a8ab9a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -47,15 +47,14 @@ static long compat_nanosleep_restart(struct restart_block *restart) | |||
| 47 | mm_segment_t oldfs; | 47 | mm_segment_t oldfs; |
| 48 | long ret; | 48 | long ret; |
| 49 | 49 | ||
| 50 | rmtp = (struct compat_timespec __user *)(restart->arg1); | 50 | restart->nanosleep.rmtp = (struct timespec __user *) &rmt; |
| 51 | restart->arg1 = (unsigned long)&rmt; | ||
| 52 | oldfs = get_fs(); | 51 | oldfs = get_fs(); |
| 53 | set_fs(KERNEL_DS); | 52 | set_fs(KERNEL_DS); |
| 54 | ret = hrtimer_nanosleep_restart(restart); | 53 | ret = hrtimer_nanosleep_restart(restart); |
| 55 | set_fs(oldfs); | 54 | set_fs(oldfs); |
| 56 | 55 | ||
| 57 | if (ret) { | 56 | if (ret) { |
| 58 | restart->arg1 = (unsigned long)rmtp; | 57 | rmtp = restart->nanosleep.compat_rmtp; |
| 59 | 58 | ||
| 60 | if (rmtp && put_compat_timespec(&rmt, rmtp)) | 59 | if (rmtp && put_compat_timespec(&rmt, rmtp)) |
| 61 | return -EFAULT; | 60 | return -EFAULT; |
| @@ -89,7 +88,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, | |||
| 89 | = &current_thread_info()->restart_block; | 88 | = &current_thread_info()->restart_block; |
| 90 | 89 | ||
| 91 | restart->fn = compat_nanosleep_restart; | 90 | restart->fn = compat_nanosleep_restart; |
| 92 | restart->arg1 = (unsigned long)rmtp; | 91 | restart->nanosleep.compat_rmtp = rmtp; |
| 93 | 92 | ||
| 94 | if (rmtp && put_compat_timespec(&rmt, rmtp)) | 93 | if (rmtp && put_compat_timespec(&rmt, rmtp)) |
| 95 | return -EFAULT; | 94 | return -EFAULT; |
| @@ -446,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, | |||
| 446 | if (retval) | 445 | if (retval) |
| 447 | return retval; | 446 | return retval; |
| 448 | 447 | ||
| 449 | return sched_setaffinity(pid, new_mask); | 448 | return sched_setaffinity(pid, &new_mask); |
| 450 | } | 449 | } |
| 451 | 450 | ||
| 452 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, | 451 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, |
| @@ -607,9 +606,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) | |||
| 607 | long err; | 606 | long err; |
| 608 | mm_segment_t oldfs; | 607 | mm_segment_t oldfs; |
| 609 | struct timespec tu; | 608 | struct timespec tu; |
| 610 | struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); | 609 | struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; |
| 611 | 610 | ||
| 612 | restart->arg1 = (unsigned long) &tu; | 611 | restart->nanosleep.rmtp = (struct timespec __user *) &tu; |
| 613 | oldfs = get_fs(); | 612 | oldfs = get_fs(); |
| 614 | set_fs(KERNEL_DS); | 613 | set_fs(KERNEL_DS); |
| 615 | err = clock_nanosleep_restart(restart); | 614 | err = clock_nanosleep_restart(restart); |
| @@ -621,7 +620,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) | |||
| 621 | 620 | ||
| 622 | if (err == -ERESTART_RESTARTBLOCK) { | 621 | if (err == -ERESTART_RESTARTBLOCK) { |
| 623 | restart->fn = compat_clock_nanosleep_restart; | 622 | restart->fn = compat_clock_nanosleep_restart; |
| 624 | restart->arg1 = (unsigned long) rmtp; | 623 | restart->nanosleep.compat_rmtp = rmtp; |
| 625 | } | 624 | } |
| 626 | return err; | 625 | return err; |
| 627 | } | 626 | } |
| @@ -652,7 +651,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, | |||
| 652 | if (err == -ERESTART_RESTARTBLOCK) { | 651 | if (err == -ERESTART_RESTARTBLOCK) { |
| 653 | restart = &current_thread_info()->restart_block; | 652 | restart = &current_thread_info()->restart_block; |
| 654 | restart->fn = compat_clock_nanosleep_restart; | 653 | restart->fn = compat_clock_nanosleep_restart; |
| 655 | restart->arg1 = (unsigned long) rmtp; | 654 | restart->nanosleep.compat_rmtp = rmtp; |
| 656 | } | 655 | } |
| 657 | return err; | 656 | return err; |
| 658 | } | 657 | } |
| @@ -899,7 +898,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat | |||
| 899 | 898 | ||
| 900 | current->state = TASK_INTERRUPTIBLE; | 899 | current->state = TASK_INTERRUPTIBLE; |
| 901 | schedule(); | 900 | schedule(); |
| 902 | set_thread_flag(TIF_RESTORE_SIGMASK); | 901 | set_restore_sigmask(); |
| 903 | return -ERESTARTNOHAND; | 902 | return -ERESTARTNOHAND; |
| 904 | } | 903 | } |
| 905 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ | 904 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ |
| @@ -956,7 +955,8 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) | |||
| 956 | __put_user(txc.jitcnt, &utp->jitcnt) || | 955 | __put_user(txc.jitcnt, &utp->jitcnt) || |
| 957 | __put_user(txc.calcnt, &utp->calcnt) || | 956 | __put_user(txc.calcnt, &utp->calcnt) || |
| 958 | __put_user(txc.errcnt, &utp->errcnt) || | 957 | __put_user(txc.errcnt, &utp->errcnt) || |
| 959 | __put_user(txc.stbcnt, &utp->stbcnt)) | 958 | __put_user(txc.stbcnt, &utp->stbcnt) || |
| 959 | __put_user(txc.tai, &utp->tai)) | ||
| 960 | ret = -EFAULT; | 960 | ret = -EFAULT; |
| 961 | 961 | ||
| 962 | return ret; | 962 | return ret; |
| @@ -1081,4 +1081,3 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
| 1081 | 1081 | ||
| 1082 | return 0; | 1082 | return 0; |
| 1083 | } | 1083 | } |
| 1084 | |||
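Putting the compat_nanosleep_restart() pieces above back together (declarations and the final return are reconstructed from the surrounding context, so treat this as a sketch rather than the verbatim result): the restart block keeps the user pointer in restart->nanosleep.compat_rmtp, while restart->nanosleep.rmtp is temporarily pointed at a kernel-space struct timespec so the generic hrtimer path can fill it under KERNEL_DS, after which the remainder is converted back with put_compat_timespec():

    static long compat_nanosleep_restart(struct restart_block *restart)
    {
        struct compat_timespec __user *rmtp;
        struct timespec rmt;
        mm_segment_t oldfs;
        long ret;

        /* let the generic code write the remaining time into &rmt */
        restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
        oldfs = get_fs();
        set_fs(KERNEL_DS);
        ret = hrtimer_nanosleep_restart(restart);
        set_fs(oldfs);

        if (ret) {
            /* hand the remainder back in the 32-bit layout */
            rmtp = restart->nanosleep.compat_rmtp;
            if (rmtp && put_compat_timespec(&rmt, rmtp))
                return -EFAULT;
        }

        return ret;
    }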
diff --git a/kernel/configs.c b/kernel/configs.c index e84d3f9c6c7b..4c345210ed8c 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
| @@ -79,12 +79,11 @@ static int __init ikconfig_init(void) | |||
| 79 | struct proc_dir_entry *entry; | 79 | struct proc_dir_entry *entry; |
| 80 | 80 | ||
| 81 | /* create the current config file */ | 81 | /* create the current config file */ |
| 82 | entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, | 82 | entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, |
| 83 | &proc_root); | 83 | &ikconfig_file_ops); |
| 84 | if (!entry) | 84 | if (!entry) |
| 85 | return -ENOMEM; | 85 | return -ENOMEM; |
| 86 | 86 | ||
| 87 | entry->proc_fops = &ikconfig_file_ops; | ||
| 88 | entry->size = kernel_config_data_size; | 87 | entry->size = kernel_config_data_size; |
| 89 | 88 | ||
| 90 | return 0; | 89 | return 0; |
| @@ -95,7 +94,7 @@ static int __init ikconfig_init(void) | |||
| 95 | 94 | ||
| 96 | static void __exit ikconfig_cleanup(void) | 95 | static void __exit ikconfig_cleanup(void) |
| 97 | { | 96 | { |
| 98 | remove_proc_entry("config.gz", &proc_root); | 97 | remove_proc_entry("config.gz", NULL); |
| 99 | } | 98 | } |
| 100 | 99 | ||
| 101 | module_init(ikconfig_init); | 100 | module_init(ikconfig_init); |
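Both this file and cgroup_init() above drop the same two-step registration. The point of proc_create() is that the entry only becomes visible with its file_operations already in place, closing the window in which a reader could open a /proc file whose ->proc_fops was still NULL. A sketch of the idiom with an illustrative name and fops (a NULL parent means the /proc root, replacing the old &proc_root argument):

    /* old, racy pattern */
    entry = create_proc_entry("example", 0, NULL);
    if (entry)
        entry->proc_fops = &example_fops;

    /* replacement: one call, fops set before the entry is published */
    entry = proc_create("example", 0, NULL, &example_fops);
    if (!entry)
        return -ENOMEM;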
diff --git a/kernel/cpu.c b/kernel/cpu.c index 2eff3f63abed..c77bc3a1c722 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -33,17 +33,13 @@ static struct { | |||
| 33 | * an ongoing cpu hotplug operation. | 33 | * an ongoing cpu hotplug operation. |
| 34 | */ | 34 | */ |
| 35 | int refcount; | 35 | int refcount; |
| 36 | wait_queue_head_t writer_queue; | ||
| 37 | } cpu_hotplug; | 36 | } cpu_hotplug; |
| 38 | 37 | ||
| 39 | #define writer_exists() (cpu_hotplug.active_writer != NULL) | ||
| 40 | |||
| 41 | void __init cpu_hotplug_init(void) | 38 | void __init cpu_hotplug_init(void) |
| 42 | { | 39 | { |
| 43 | cpu_hotplug.active_writer = NULL; | 40 | cpu_hotplug.active_writer = NULL; |
| 44 | mutex_init(&cpu_hotplug.lock); | 41 | mutex_init(&cpu_hotplug.lock); |
| 45 | cpu_hotplug.refcount = 0; | 42 | cpu_hotplug.refcount = 0; |
| 46 | init_waitqueue_head(&cpu_hotplug.writer_queue); | ||
| 47 | } | 43 | } |
| 48 | 44 | ||
| 49 | #ifdef CONFIG_HOTPLUG_CPU | 45 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -65,11 +61,8 @@ void put_online_cpus(void) | |||
| 65 | if (cpu_hotplug.active_writer == current) | 61 | if (cpu_hotplug.active_writer == current) |
| 66 | return; | 62 | return; |
| 67 | mutex_lock(&cpu_hotplug.lock); | 63 | mutex_lock(&cpu_hotplug.lock); |
| 68 | cpu_hotplug.refcount--; | 64 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) |
| 69 | 65 | wake_up_process(cpu_hotplug.active_writer); | |
| 70 | if (unlikely(writer_exists()) && !cpu_hotplug.refcount) | ||
| 71 | wake_up(&cpu_hotplug.writer_queue); | ||
| 72 | |||
| 73 | mutex_unlock(&cpu_hotplug.lock); | 66 | mutex_unlock(&cpu_hotplug.lock); |
| 74 | 67 | ||
| 75 | } | 68 | } |
| @@ -98,8 +91,8 @@ void cpu_maps_update_done(void) | |||
| 98 | * Note that during a cpu-hotplug operation, the new readers, if any, | 91 | * Note that during a cpu-hotplug operation, the new readers, if any, |
| 99 | * will be blocked by the cpu_hotplug.lock | 92 | * will be blocked by the cpu_hotplug.lock |
| 100 | * | 93 | * |
| 101 | * Since cpu_maps_update_begin is always called after invoking | 94 | * Since cpu_hotplug_begin() is always called after invoking |
| 102 | * cpu_maps_update_begin, we can be sure that only one writer is active. | 95 | * cpu_maps_update_begin(), we can be sure that only one writer is active. |
| 103 | * | 96 | * |
| 104 | * Note that theoretically, there is a possibility of a livelock: | 97 | * Note that theoretically, there is a possibility of a livelock: |
| 105 | * - Refcount goes to zero, last reader wakes up the sleeping | 98 | * - Refcount goes to zero, last reader wakes up the sleeping |
| @@ -115,19 +108,16 @@ void cpu_maps_update_done(void) | |||
| 115 | */ | 108 | */ |
| 116 | static void cpu_hotplug_begin(void) | 109 | static void cpu_hotplug_begin(void) |
| 117 | { | 110 | { |
| 118 | DECLARE_WAITQUEUE(wait, current); | ||
| 119 | |||
| 120 | mutex_lock(&cpu_hotplug.lock); | ||
| 121 | |||
| 122 | cpu_hotplug.active_writer = current; | 111 | cpu_hotplug.active_writer = current; |
| 123 | add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); | 112 | |
| 124 | while (cpu_hotplug.refcount) { | 113 | for (;;) { |
| 125 | set_current_state(TASK_UNINTERRUPTIBLE); | 114 | mutex_lock(&cpu_hotplug.lock); |
| 115 | if (likely(!cpu_hotplug.refcount)) | ||
| 116 | break; | ||
| 117 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 126 | mutex_unlock(&cpu_hotplug.lock); | 118 | mutex_unlock(&cpu_hotplug.lock); |
| 127 | schedule(); | 119 | schedule(); |
| 128 | mutex_lock(&cpu_hotplug.lock); | ||
| 129 | } | 120 | } |
| 130 | remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); | ||
| 131 | } | 121 | } |
| 132 | 122 | ||
| 133 | static void cpu_hotplug_done(void) | 123 | static void cpu_hotplug_done(void) |
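Read without the diff markup, the rewritten writer-side wait is just a sleep loop keyed off cpu_hotplug.refcount: the would-be writer publishes itself in active_writer, and the last reader in put_online_cpus() wakes it directly with wake_up_process(), so the dedicated waitqueue can go away. A consolidated sketch of the result, taken from the added lines above:

    static void cpu_hotplug_begin(void)
    {
        cpu_hotplug.active_writer = current;

        for (;;) {
            mutex_lock(&cpu_hotplug.lock);
            if (likely(!cpu_hotplug.refcount))
                break;      /* exits with cpu_hotplug.lock held */
            __set_current_state(TASK_UNINTERRUPTIBLE);
            mutex_unlock(&cpu_hotplug.lock);
            schedule();
        }
    }

cpu_hotplug_done() later clears active_writer and drops the mutex, which is why the loop is allowed to break while still holding it.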
| @@ -136,7 +126,7 @@ static void cpu_hotplug_done(void) | |||
| 136 | mutex_unlock(&cpu_hotplug.lock); | 126 | mutex_unlock(&cpu_hotplug.lock); |
| 137 | } | 127 | } |
| 138 | /* Need to know about CPUs going up/down? */ | 128 | /* Need to know about CPUs going up/down? */ |
| 139 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) | 129 | int __ref register_cpu_notifier(struct notifier_block *nb) |
| 140 | { | 130 | { |
| 141 | int ret; | 131 | int ret; |
| 142 | cpu_maps_update_begin(); | 132 | cpu_maps_update_begin(); |
| @@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb) | |||
| 149 | 139 | ||
| 150 | EXPORT_SYMBOL(register_cpu_notifier); | 140 | EXPORT_SYMBOL(register_cpu_notifier); |
| 151 | 141 | ||
| 152 | void unregister_cpu_notifier(struct notifier_block *nb) | 142 | void __ref unregister_cpu_notifier(struct notifier_block *nb) |
| 153 | { | 143 | { |
| 154 | cpu_maps_update_begin(); | 144 | cpu_maps_update_begin(); |
| 155 | raw_notifier_chain_unregister(&cpu_chain, nb); | 145 | raw_notifier_chain_unregister(&cpu_chain, nb); |
| @@ -180,7 +170,7 @@ struct take_cpu_down_param { | |||
| 180 | }; | 170 | }; |
| 181 | 171 | ||
| 182 | /* Take this CPU down. */ | 172 | /* Take this CPU down. */ |
| 183 | static int take_cpu_down(void *_param) | 173 | static int __ref take_cpu_down(void *_param) |
| 184 | { | 174 | { |
| 185 | struct take_cpu_down_param *param = _param; | 175 | struct take_cpu_down_param *param = _param; |
| 186 | int err; | 176 | int err; |
| @@ -199,7 +189,7 @@ static int take_cpu_down(void *_param) | |||
| 199 | } | 189 | } |
| 200 | 190 | ||
| 201 | /* Requires cpu_add_remove_lock to be held */ | 191 | /* Requires cpu_add_remove_lock to be held */ |
| 202 | static int _cpu_down(unsigned int cpu, int tasks_frozen) | 192 | static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) |
| 203 | { | 193 | { |
| 204 | int err, nr_calls = 0; | 194 | int err, nr_calls = 0; |
| 205 | struct task_struct *p; | 195 | struct task_struct *p; |
| @@ -225,16 +215,16 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 225 | __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, | 215 | __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, |
| 226 | hcpu, nr_calls, NULL); | 216 | hcpu, nr_calls, NULL); |
| 227 | printk("%s: attempt to take down CPU %u failed\n", | 217 | printk("%s: attempt to take down CPU %u failed\n", |
| 228 | __FUNCTION__, cpu); | 218 | __func__, cpu); |
| 229 | err = -EINVAL; | 219 | err = -EINVAL; |
| 230 | goto out_release; | 220 | goto out_release; |
| 231 | } | 221 | } |
| 232 | 222 | ||
| 233 | /* Ensure that we are not runnable on dying cpu */ | 223 | /* Ensure that we are not runnable on dying cpu */ |
| 234 | old_allowed = current->cpus_allowed; | 224 | old_allowed = current->cpus_allowed; |
| 235 | tmp = CPU_MASK_ALL; | 225 | cpus_setall(tmp); |
| 236 | cpu_clear(cpu, tmp); | 226 | cpu_clear(cpu, tmp); |
| 237 | set_cpus_allowed(current, tmp); | 227 | set_cpus_allowed_ptr(current, &tmp); |
| 238 | 228 | ||
| 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 229 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
| 240 | 230 | ||
| @@ -268,13 +258,13 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 268 | out_thread: | 258 | out_thread: |
| 269 | err = kthread_stop(p); | 259 | err = kthread_stop(p); |
| 270 | out_allowed: | 260 | out_allowed: |
| 271 | set_cpus_allowed(current, old_allowed); | 261 | set_cpus_allowed_ptr(current, &old_allowed); |
| 272 | out_release: | 262 | out_release: |
| 273 | cpu_hotplug_done(); | 263 | cpu_hotplug_done(); |
| 274 | return err; | 264 | return err; |
| 275 | } | 265 | } |
| 276 | 266 | ||
| 277 | int cpu_down(unsigned int cpu) | 267 | int __ref cpu_down(unsigned int cpu) |
| 278 | { | 268 | { |
| 279 | int err = 0; | 269 | int err = 0; |
| 280 | 270 | ||
| @@ -305,7 +295,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 305 | if (ret == NOTIFY_BAD) { | 295 | if (ret == NOTIFY_BAD) { |
| 306 | nr_calls--; | 296 | nr_calls--; |
| 307 | printk("%s: attempt to bring up CPU %u failed\n", | 297 | printk("%s: attempt to bring up CPU %u failed\n", |
| 308 | __FUNCTION__, cpu); | 298 | __func__, cpu); |
| 309 | ret = -EINVAL; | 299 | ret = -EINVAL; |
| 310 | goto out_notify; | 300 | goto out_notify; |
| 311 | } | 301 | } |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a1b61f414228..8da627d33804 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -98,6 +98,9 @@ struct cpuset { | |||
| 98 | /* partition number for rebuild_sched_domains() */ | 98 | /* partition number for rebuild_sched_domains() */ |
| 99 | int pn; | 99 | int pn; |
| 100 | 100 | ||
| 101 | /* for custom sched domain */ | ||
| 102 | int relax_domain_level; | ||
| 103 | |||
| 101 | /* used for walking a cpuset heirarchy */ | 104 | /* used for walking a cpuset heirarchy */ |
| 102 | struct list_head stack_list; | 105 | struct list_head stack_list; |
| 103 | }; | 106 | }; |
| @@ -124,6 +127,7 @@ struct cpuset_hotplug_scanner { | |||
| 124 | typedef enum { | 127 | typedef enum { |
| 125 | CS_CPU_EXCLUSIVE, | 128 | CS_CPU_EXCLUSIVE, |
| 126 | CS_MEM_EXCLUSIVE, | 129 | CS_MEM_EXCLUSIVE, |
| 130 | CS_MEM_HARDWALL, | ||
| 127 | CS_MEMORY_MIGRATE, | 131 | CS_MEMORY_MIGRATE, |
| 128 | CS_SCHED_LOAD_BALANCE, | 132 | CS_SCHED_LOAD_BALANCE, |
| 129 | CS_SPREAD_PAGE, | 133 | CS_SPREAD_PAGE, |
| @@ -141,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs) | |||
| 141 | return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); | 145 | return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); |
| 142 | } | 146 | } |
| 143 | 147 | ||
| 148 | static inline int is_mem_hardwall(const struct cpuset *cs) | ||
| 149 | { | ||
| 150 | return test_bit(CS_MEM_HARDWALL, &cs->flags); | ||
| 151 | } | ||
| 152 | |||
| 144 | static inline int is_sched_load_balance(const struct cpuset *cs) | 153 | static inline int is_sched_load_balance(const struct cpuset *cs) |
| 145 | { | 154 | { |
| 146 | return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 155 | return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
| @@ -478,6 +487,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
| 478 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 487 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
| 479 | } | 488 | } |
| 480 | 489 | ||
| 490 | static void | ||
| 491 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | ||
| 492 | { | ||
| 493 | if (!dattr) | ||
| 494 | return; | ||
| 495 | if (dattr->relax_domain_level < c->relax_domain_level) | ||
| 496 | dattr->relax_domain_level = c->relax_domain_level; | ||
| 497 | return; | ||
| 498 | } | ||
| 499 | |||
| 481 | /* | 500 | /* |
| 482 | * rebuild_sched_domains() | 501 | * rebuild_sched_domains() |
| 483 | * | 502 | * |
| @@ -553,12 +572,14 @@ static void rebuild_sched_domains(void) | |||
| 553 | int csn; /* how many cpuset ptrs in csa so far */ | 572 | int csn; /* how many cpuset ptrs in csa so far */ |
| 554 | int i, j, k; /* indices for partition finding loops */ | 573 | int i, j, k; /* indices for partition finding loops */ |
| 555 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ | 574 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ |
| 575 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | ||
| 556 | int ndoms; /* number of sched domains in result */ | 576 | int ndoms; /* number of sched domains in result */ |
| 557 | int nslot; /* next empty doms[] cpumask_t slot */ | 577 | int nslot; /* next empty doms[] cpumask_t slot */ |
| 558 | 578 | ||
| 559 | q = NULL; | 579 | q = NULL; |
| 560 | csa = NULL; | 580 | csa = NULL; |
| 561 | doms = NULL; | 581 | doms = NULL; |
| 582 | dattr = NULL; | ||
| 562 | 583 | ||
| 563 | /* Special case for the 99% of systems with one, full, sched domain */ | 584 | /* Special case for the 99% of systems with one, full, sched domain */ |
| 564 | if (is_sched_load_balance(&top_cpuset)) { | 585 | if (is_sched_load_balance(&top_cpuset)) { |
| @@ -566,6 +587,11 @@ static void rebuild_sched_domains(void) | |||
| 566 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 587 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
| 567 | if (!doms) | 588 | if (!doms) |
| 568 | goto rebuild; | 589 | goto rebuild; |
| 590 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | ||
| 591 | if (dattr) { | ||
| 592 | *dattr = SD_ATTR_INIT; | ||
| 593 | update_domain_attr(dattr, &top_cpuset); | ||
| 594 | } | ||
| 569 | *doms = top_cpuset.cpus_allowed; | 595 | *doms = top_cpuset.cpus_allowed; |
| 570 | goto rebuild; | 596 | goto rebuild; |
| 571 | } | 597 | } |
| @@ -622,6 +648,7 @@ restart: | |||
| 622 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 648 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
| 623 | if (!doms) | 649 | if (!doms) |
| 624 | goto rebuild; | 650 | goto rebuild; |
| 651 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | ||
| 625 | 652 | ||
| 626 | for (nslot = 0, i = 0; i < csn; i++) { | 653 | for (nslot = 0, i = 0; i < csn; i++) { |
| 627 | struct cpuset *a = csa[i]; | 654 | struct cpuset *a = csa[i]; |
| @@ -644,12 +671,15 @@ restart: | |||
| 644 | } | 671 | } |
| 645 | 672 | ||
| 646 | cpus_clear(*dp); | 673 | cpus_clear(*dp); |
| 674 | if (dattr) | ||
| 675 | *(dattr + nslot) = SD_ATTR_INIT; | ||
| 647 | for (j = i; j < csn; j++) { | 676 | for (j = i; j < csn; j++) { |
| 648 | struct cpuset *b = csa[j]; | 677 | struct cpuset *b = csa[j]; |
| 649 | 678 | ||
| 650 | if (apn == b->pn) { | 679 | if (apn == b->pn) { |
| 651 | cpus_or(*dp, *dp, b->cpus_allowed); | 680 | cpus_or(*dp, *dp, b->cpus_allowed); |
| 652 | b->pn = -1; | 681 | b->pn = -1; |
| 682 | update_domain_attr(dattr, b); | ||
| 653 | } | 683 | } |
| 654 | } | 684 | } |
| 655 | nslot++; | 685 | nslot++; |
| @@ -660,7 +690,7 @@ restart: | |||
| 660 | rebuild: | 690 | rebuild: |
| 661 | /* Have scheduler rebuild sched domains */ | 691 | /* Have scheduler rebuild sched domains */ |
| 662 | get_online_cpus(); | 692 | get_online_cpus(); |
| 663 | partition_sched_domains(ndoms, doms); | 693 | partition_sched_domains(ndoms, doms, dattr); |
| 664 | put_online_cpus(); | 694 | put_online_cpus(); |
| 665 | 695 | ||
| 666 | done: | 696 | done: |
| @@ -668,6 +698,7 @@ done: | |||
| 668 | kfifo_free(q); | 698 | kfifo_free(q); |
| 669 | kfree(csa); | 699 | kfree(csa); |
| 670 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 700 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ |
| 701 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | ||
| 671 | } | 702 | } |
| 672 | 703 | ||
| 673 | static inline int started_after_time(struct task_struct *t1, | 704 | static inline int started_after_time(struct task_struct *t1, |
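The dattr array introduced above is allocated parallel to doms[], one struct sched_domain_attr per generated sched domain, initialized to SD_ATTR_INIT and folded together per partition before being handed to partition_sched_domains(ndoms, doms, dattr), which also takes over freeing it. The combine rule is simply "largest requested relax_domain_level wins", with -1 (the value cpusets start with) effectively requesting the scheduler's default; restated without the diff markup and with the intent spelled out:

    static void
    update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
    {
        if (!dattr)     /* allocation failed: keep the defaults */
            return;
        if (dattr->relax_domain_level < c->relax_domain_level)
            dattr->relax_domain_level = c->relax_domain_level;
    }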
| @@ -710,7 +741,8 @@ static inline int started_after(void *p1, void *p2) | |||
| 710 | * Return nonzero if this tasks's cpus_allowed mask should be changed (in other | 741 | * Return nonzero if this tasks's cpus_allowed mask should be changed (in other |
| 711 | * words, if its mask is not equal to its cpuset's mask). | 742 | * words, if its mask is not equal to its cpuset's mask). |
| 712 | */ | 743 | */ |
| 713 | int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | 744 | static int cpuset_test_cpumask(struct task_struct *tsk, |
| 745 | struct cgroup_scanner *scan) | ||
| 714 | { | 746 | { |
| 715 | return !cpus_equal(tsk->cpus_allowed, | 747 | return !cpus_equal(tsk->cpus_allowed, |
| 716 | (cgroup_cs(scan->cg))->cpus_allowed); | 748 | (cgroup_cs(scan->cg))->cpus_allowed); |
| @@ -727,9 +759,10 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | |||
| 727 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 759 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
| 728 | * holding cgroup_lock() at this point. | 760 | * holding cgroup_lock() at this point. |
| 729 | */ | 761 | */ |
| 730 | void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | 762 | static void cpuset_change_cpumask(struct task_struct *tsk, |
| 763 | struct cgroup_scanner *scan) | ||
| 731 | { | 764 | { |
| 732 | set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); | 765 | set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); |
| 733 | } | 766 | } |
| 734 | 767 | ||
| 735 | /** | 768 | /** |
| @@ -916,7 +949,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 916 | cs->mems_generation = cpuset_mems_generation++; | 949 | cs->mems_generation = cpuset_mems_generation++; |
| 917 | mutex_unlock(&callback_mutex); | 950 | mutex_unlock(&callback_mutex); |
| 918 | 951 | ||
| 919 | cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ | 952 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
| 920 | 953 | ||
| 921 | fudge = 10; /* spare mmarray[] slots */ | 954 | fudge = 10; /* spare mmarray[] slots */ |
| 922 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ | 955 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ |
| @@ -967,7 +1000,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 967 | * rebind the vma mempolicies of each mm in mmarray[] to their | 1000 | * rebind the vma mempolicies of each mm in mmarray[] to their |
| 968 | * new cpuset, and release that mm. The mpol_rebind_mm() | 1001 | * new cpuset, and release that mm. The mpol_rebind_mm() |
| 969 | * call takes mmap_sem, which we couldn't take while holding | 1002 | * call takes mmap_sem, which we couldn't take while holding |
| 970 | * tasklist_lock. Forks can happen again now - the mpol_copy() | 1003 | * tasklist_lock. Forks can happen again now - the mpol_dup() |
| 971 | * cpuset_being_rebound check will catch such forks, and rebind | 1004 | * cpuset_being_rebound check will catch such forks, and rebind |
| 972 | * their vma mempolicies too. Because we still hold the global | 1005 | * their vma mempolicies too. Because we still hold the global |
| 973 | * cgroup_mutex, we know that no other rebind effort will | 1006 | * cgroup_mutex, we know that no other rebind effort will |
| @@ -998,40 +1031,37 @@ int current_cpuset_is_being_rebound(void) | |||
| 998 | return task_cs(current) == cpuset_being_rebound; | 1031 | return task_cs(current) == cpuset_being_rebound; |
| 999 | } | 1032 | } |
| 1000 | 1033 | ||
| 1001 | /* | 1034 | static int update_relax_domain_level(struct cpuset *cs, char *buf) |
| 1002 | * Call with cgroup_mutex held. | ||
| 1003 | */ | ||
| 1004 | |||
| 1005 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | ||
| 1006 | { | 1035 | { |
| 1007 | if (simple_strtoul(buf, NULL, 10) != 0) | 1036 | int val = simple_strtol(buf, NULL, 10); |
| 1008 | cpuset_memory_pressure_enabled = 1; | 1037 | |
| 1009 | else | 1038 | if (val < 0) |
| 1010 | cpuset_memory_pressure_enabled = 0; | 1039 | val = -1; |
| 1040 | |||
| 1041 | if (val != cs->relax_domain_level) { | ||
| 1042 | cs->relax_domain_level = val; | ||
| 1043 | rebuild_sched_domains(); | ||
| 1044 | } | ||
| 1045 | |||
| 1011 | return 0; | 1046 | return 0; |
| 1012 | } | 1047 | } |
| 1013 | 1048 | ||
| 1014 | /* | 1049 | /* |
| 1015 | * update_flag - read a 0 or a 1 in a file and update associated flag | 1050 | * update_flag - read a 0 or a 1 in a file and update associated flag |
| 1016 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 1051 | * bit: the bit to update (see cpuset_flagbits_t) |
| 1017 | * CS_SCHED_LOAD_BALANCE, | 1052 | * cs: the cpuset to update |
| 1018 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, | 1053 | * turning_on: whether the flag is being set or cleared |
| 1019 | * CS_SPREAD_PAGE, CS_SPREAD_SLAB) | ||
| 1020 | * cs: the cpuset to update | ||
| 1021 | * buf: the buffer where we read the 0 or 1 | ||
| 1022 | * | 1054 | * |
| 1023 | * Call with cgroup_mutex held. | 1055 | * Call with cgroup_mutex held. |
| 1024 | */ | 1056 | */ |
| 1025 | 1057 | ||
| 1026 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 1058 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
| 1059 | int turning_on) | ||
| 1027 | { | 1060 | { |
| 1028 | int turning_on; | ||
| 1029 | struct cpuset trialcs; | 1061 | struct cpuset trialcs; |
| 1030 | int err; | 1062 | int err; |
| 1031 | int cpus_nonempty, balance_flag_changed; | 1063 | int cpus_nonempty, balance_flag_changed; |
| 1032 | 1064 | ||
| 1033 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); | ||
| 1034 | |||
| 1035 | trialcs = *cs; | 1065 | trialcs = *cs; |
| 1036 | if (turning_on) | 1066 | if (turning_on) |
| 1037 | set_bit(bit, &trialcs.flags); | 1067 | set_bit(bit, &trialcs.flags); |
| @@ -1178,7 +1208,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
| 1178 | 1208 | ||
| 1179 | mutex_lock(&callback_mutex); | 1209 | mutex_lock(&callback_mutex); |
| 1180 | guarantee_online_cpus(cs, &cpus); | 1210 | guarantee_online_cpus(cs, &cpus); |
| 1181 | set_cpus_allowed(tsk, cpus); | 1211 | set_cpus_allowed_ptr(tsk, &cpus); |
| 1182 | mutex_unlock(&callback_mutex); | 1212 | mutex_unlock(&callback_mutex); |
| 1183 | 1213 | ||
| 1184 | from = oldcs->mems_allowed; | 1214 | from = oldcs->mems_allowed; |
| @@ -1201,7 +1231,9 @@ typedef enum { | |||
| 1201 | FILE_MEMLIST, | 1231 | FILE_MEMLIST, |
| 1202 | FILE_CPU_EXCLUSIVE, | 1232 | FILE_CPU_EXCLUSIVE, |
| 1203 | FILE_MEM_EXCLUSIVE, | 1233 | FILE_MEM_EXCLUSIVE, |
| 1234 | FILE_MEM_HARDWALL, | ||
| 1204 | FILE_SCHED_LOAD_BALANCE, | 1235 | FILE_SCHED_LOAD_BALANCE, |
| 1236 | FILE_SCHED_RELAX_DOMAIN_LEVEL, | ||
| 1205 | FILE_MEMORY_PRESSURE_ENABLED, | 1237 | FILE_MEMORY_PRESSURE_ENABLED, |
| 1206 | FILE_MEMORY_PRESSURE, | 1238 | FILE_MEMORY_PRESSURE, |
| 1207 | FILE_SPREAD_PAGE, | 1239 | FILE_SPREAD_PAGE, |
| @@ -1224,7 +1256,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, | |||
| 1224 | return -E2BIG; | 1256 | return -E2BIG; |
| 1225 | 1257 | ||
| 1226 | /* +1 for nul-terminator */ | 1258 | /* +1 for nul-terminator */ |
| 1227 | if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) | 1259 | buffer = kmalloc(nbytes + 1, GFP_KERNEL); |
| 1260 | if (!buffer) | ||
| 1228 | return -ENOMEM; | 1261 | return -ENOMEM; |
| 1229 | 1262 | ||
| 1230 | if (copy_from_user(buffer, userbuf, nbytes)) { | 1263 | if (copy_from_user(buffer, userbuf, nbytes)) { |
| @@ -1247,43 +1280,71 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, | |||
| 1247 | case FILE_MEMLIST: | 1280 | case FILE_MEMLIST: |
| 1248 | retval = update_nodemask(cs, buffer); | 1281 | retval = update_nodemask(cs, buffer); |
| 1249 | break; | 1282 | break; |
| 1283 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | ||
| 1284 | retval = update_relax_domain_level(cs, buffer); | ||
| 1285 | break; | ||
| 1286 | default: | ||
| 1287 | retval = -EINVAL; | ||
| 1288 | goto out2; | ||
| 1289 | } | ||
| 1290 | |||
| 1291 | if (retval == 0) | ||
| 1292 | retval = nbytes; | ||
| 1293 | out2: | ||
| 1294 | cgroup_unlock(); | ||
| 1295 | out1: | ||
| 1296 | kfree(buffer); | ||
| 1297 | return retval; | ||
| 1298 | } | ||
| 1299 | |||
| 1300 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
| 1301 | { | ||
| 1302 | int retval = 0; | ||
| 1303 | struct cpuset *cs = cgroup_cs(cgrp); | ||
| 1304 | cpuset_filetype_t type = cft->private; | ||
| 1305 | |||
| 1306 | cgroup_lock(); | ||
| 1307 | |||
| 1308 | if (cgroup_is_removed(cgrp)) { | ||
| 1309 | cgroup_unlock(); | ||
| 1310 | return -ENODEV; | ||
| 1311 | } | ||
| 1312 | |||
| 1313 | switch (type) { | ||
| 1250 | case FILE_CPU_EXCLUSIVE: | 1314 | case FILE_CPU_EXCLUSIVE: |
| 1251 | retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); | 1315 | retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); |
| 1252 | break; | 1316 | break; |
| 1253 | case FILE_MEM_EXCLUSIVE: | 1317 | case FILE_MEM_EXCLUSIVE: |
| 1254 | retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); | 1318 | retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); |
| 1319 | break; | ||
| 1320 | case FILE_MEM_HARDWALL: | ||
| 1321 | retval = update_flag(CS_MEM_HARDWALL, cs, val); | ||
| 1255 | break; | 1322 | break; |
| 1256 | case FILE_SCHED_LOAD_BALANCE: | 1323 | case FILE_SCHED_LOAD_BALANCE: |
| 1257 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); | 1324 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); |
| 1258 | break; | 1325 | break; |
| 1259 | case FILE_MEMORY_MIGRATE: | 1326 | case FILE_MEMORY_MIGRATE: |
| 1260 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | 1327 | retval = update_flag(CS_MEMORY_MIGRATE, cs, val); |
| 1261 | break; | 1328 | break; |
| 1262 | case FILE_MEMORY_PRESSURE_ENABLED: | 1329 | case FILE_MEMORY_PRESSURE_ENABLED: |
| 1263 | retval = update_memory_pressure_enabled(cs, buffer); | 1330 | cpuset_memory_pressure_enabled = !!val; |
| 1264 | break; | 1331 | break; |
| 1265 | case FILE_MEMORY_PRESSURE: | 1332 | case FILE_MEMORY_PRESSURE: |
| 1266 | retval = -EACCES; | 1333 | retval = -EACCES; |
| 1267 | break; | 1334 | break; |
| 1268 | case FILE_SPREAD_PAGE: | 1335 | case FILE_SPREAD_PAGE: |
| 1269 | retval = update_flag(CS_SPREAD_PAGE, cs, buffer); | 1336 | retval = update_flag(CS_SPREAD_PAGE, cs, val); |
| 1270 | cs->mems_generation = cpuset_mems_generation++; | 1337 | cs->mems_generation = cpuset_mems_generation++; |
| 1271 | break; | 1338 | break; |
| 1272 | case FILE_SPREAD_SLAB: | 1339 | case FILE_SPREAD_SLAB: |
| 1273 | retval = update_flag(CS_SPREAD_SLAB, cs, buffer); | 1340 | retval = update_flag(CS_SPREAD_SLAB, cs, val); |
| 1274 | cs->mems_generation = cpuset_mems_generation++; | 1341 | cs->mems_generation = cpuset_mems_generation++; |
| 1275 | break; | 1342 | break; |
| 1276 | default: | 1343 | default: |
| 1277 | retval = -EINVAL; | 1344 | retval = -EINVAL; |
| 1278 | goto out2; | 1345 | break; |
| 1279 | } | 1346 | } |
| 1280 | |||
| 1281 | if (retval == 0) | ||
| 1282 | retval = nbytes; | ||
| 1283 | out2: | ||
| 1284 | cgroup_unlock(); | 1347 | cgroup_unlock(); |
| 1285 | out1: | ||
| 1286 | kfree(buffer); | ||
| 1287 | return retval; | 1348 | return retval; |
| 1288 | } | 1349 | } |
| 1289 | 1350 | ||
| @@ -1345,29 +1406,8 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, | |||
| 1345 | case FILE_MEMLIST: | 1406 | case FILE_MEMLIST: |
| 1346 | s += cpuset_sprintf_memlist(s, cs); | 1407 | s += cpuset_sprintf_memlist(s, cs); |
| 1347 | break; | 1408 | break; |
| 1348 | case FILE_CPU_EXCLUSIVE: | 1409 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
| 1349 | *s++ = is_cpu_exclusive(cs) ? '1' : '0'; | 1410 | s += sprintf(s, "%d", cs->relax_domain_level); |
| 1350 | break; | ||
| 1351 | case FILE_MEM_EXCLUSIVE: | ||
| 1352 | *s++ = is_mem_exclusive(cs) ? '1' : '0'; | ||
| 1353 | break; | ||
| 1354 | case FILE_SCHED_LOAD_BALANCE: | ||
| 1355 | *s++ = is_sched_load_balance(cs) ? '1' : '0'; | ||
| 1356 | break; | ||
| 1357 | case FILE_MEMORY_MIGRATE: | ||
| 1358 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | ||
| 1359 | break; | ||
| 1360 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
| 1361 | *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; | ||
| 1362 | break; | ||
| 1363 | case FILE_MEMORY_PRESSURE: | ||
| 1364 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); | ||
| 1365 | break; | ||
| 1366 | case FILE_SPREAD_PAGE: | ||
| 1367 | *s++ = is_spread_page(cs) ? '1' : '0'; | ||
| 1368 | break; | ||
| 1369 | case FILE_SPREAD_SLAB: | ||
| 1370 | *s++ = is_spread_slab(cs) ? '1' : '0'; | ||
| 1371 | break; | 1411 | break; |
| 1372 | default: | 1412 | default: |
| 1373 | retval = -EINVAL; | 1413 | retval = -EINVAL; |
| @@ -1381,111 +1421,137 @@ out: | |||
| 1381 | return retval; | 1421 | return retval; |
| 1382 | } | 1422 | } |
| 1383 | 1423 | ||
| 1384 | 1424 | static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |
| 1385 | 1425 | { | |
| 1426 | struct cpuset *cs = cgroup_cs(cont); | ||
| 1427 | cpuset_filetype_t type = cft->private; | ||
| 1428 | switch (type) { | ||
| 1429 | case FILE_CPU_EXCLUSIVE: | ||
| 1430 | return is_cpu_exclusive(cs); | ||
| 1431 | case FILE_MEM_EXCLUSIVE: | ||
| 1432 | return is_mem_exclusive(cs); | ||
| 1433 | case FILE_MEM_HARDWALL: | ||
| 1434 | return is_mem_hardwall(cs); | ||
| 1435 | case FILE_SCHED_LOAD_BALANCE: | ||
| 1436 | return is_sched_load_balance(cs); | ||
| 1437 | case FILE_MEMORY_MIGRATE: | ||
| 1438 | return is_memory_migrate(cs); | ||
| 1439 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
| 1440 | return cpuset_memory_pressure_enabled; | ||
| 1441 | case FILE_MEMORY_PRESSURE: | ||
| 1442 | return fmeter_getrate(&cs->fmeter); | ||
| 1443 | case FILE_SPREAD_PAGE: | ||
| 1444 | return is_spread_page(cs); | ||
| 1445 | case FILE_SPREAD_SLAB: | ||
| 1446 | return is_spread_slab(cs); | ||
| 1447 | default: | ||
| 1448 | BUG(); | ||
| 1449 | } | ||
| 1450 | } | ||
| 1386 | 1451 | ||
| 1387 | 1452 | ||
| 1388 | /* | 1453 | /* |
| 1389 | * for the common functions, 'private' gives the type of file | 1454 | * for the common functions, 'private' gives the type of file |
| 1390 | */ | 1455 | */ |
| 1391 | 1456 | ||
| 1392 | static struct cftype cft_cpus = { | 1457 | static struct cftype files[] = { |
| 1393 | .name = "cpus", | 1458 | { |
| 1394 | .read = cpuset_common_file_read, | 1459 | .name = "cpus", |
| 1395 | .write = cpuset_common_file_write, | 1460 | .read = cpuset_common_file_read, |
| 1396 | .private = FILE_CPULIST, | 1461 | .write = cpuset_common_file_write, |
| 1397 | }; | 1462 | .private = FILE_CPULIST, |
| 1398 | 1463 | }, | |
| 1399 | static struct cftype cft_mems = { | 1464 | |
| 1400 | .name = "mems", | 1465 | { |
| 1401 | .read = cpuset_common_file_read, | 1466 | .name = "mems", |
| 1402 | .write = cpuset_common_file_write, | 1467 | .read = cpuset_common_file_read, |
| 1403 | .private = FILE_MEMLIST, | 1468 | .write = cpuset_common_file_write, |
| 1404 | }; | 1469 | .private = FILE_MEMLIST, |
| 1405 | 1470 | }, | |
| 1406 | static struct cftype cft_cpu_exclusive = { | 1471 | |
| 1407 | .name = "cpu_exclusive", | 1472 | { |
| 1408 | .read = cpuset_common_file_read, | 1473 | .name = "cpu_exclusive", |
| 1409 | .write = cpuset_common_file_write, | 1474 | .read_u64 = cpuset_read_u64, |
| 1410 | .private = FILE_CPU_EXCLUSIVE, | 1475 | .write_u64 = cpuset_write_u64, |
| 1411 | }; | 1476 | .private = FILE_CPU_EXCLUSIVE, |
| 1412 | 1477 | }, | |
| 1413 | static struct cftype cft_mem_exclusive = { | 1478 | |
| 1414 | .name = "mem_exclusive", | 1479 | { |
| 1415 | .read = cpuset_common_file_read, | 1480 | .name = "mem_exclusive", |
| 1416 | .write = cpuset_common_file_write, | 1481 | .read_u64 = cpuset_read_u64, |
| 1417 | .private = FILE_MEM_EXCLUSIVE, | 1482 | .write_u64 = cpuset_write_u64, |
| 1418 | }; | 1483 | .private = FILE_MEM_EXCLUSIVE, |
| 1419 | 1484 | }, | |
| 1420 | static struct cftype cft_sched_load_balance = { | 1485 | |
| 1421 | .name = "sched_load_balance", | 1486 | { |
| 1422 | .read = cpuset_common_file_read, | 1487 | .name = "mem_hardwall", |
| 1423 | .write = cpuset_common_file_write, | 1488 | .read_u64 = cpuset_read_u64, |
| 1424 | .private = FILE_SCHED_LOAD_BALANCE, | 1489 | .write_u64 = cpuset_write_u64, |
| 1425 | }; | 1490 | .private = FILE_MEM_HARDWALL, |
| 1426 | 1491 | }, | |
| 1427 | static struct cftype cft_memory_migrate = { | 1492 | |
| 1428 | .name = "memory_migrate", | 1493 | { |
| 1429 | .read = cpuset_common_file_read, | 1494 | .name = "sched_load_balance", |
| 1430 | .write = cpuset_common_file_write, | 1495 | .read_u64 = cpuset_read_u64, |
| 1431 | .private = FILE_MEMORY_MIGRATE, | 1496 | .write_u64 = cpuset_write_u64, |
| 1497 | .private = FILE_SCHED_LOAD_BALANCE, | ||
| 1498 | }, | ||
| 1499 | |||
| 1500 | { | ||
| 1501 | .name = "sched_relax_domain_level", | ||
| 1502 | .read_u64 = cpuset_read_u64, | ||
| 1503 | .write_u64 = cpuset_write_u64, | ||
| 1504 | .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, | ||
| 1505 | }, | ||
| 1506 | |||
| 1507 | { | ||
| 1508 | .name = "memory_migrate", | ||
| 1509 | .read_u64 = cpuset_read_u64, | ||
| 1510 | .write_u64 = cpuset_write_u64, | ||
| 1511 | .private = FILE_MEMORY_MIGRATE, | ||
| 1512 | }, | ||
| 1513 | |||
| 1514 | { | ||
| 1515 | .name = "memory_pressure", | ||
| 1516 | .read_u64 = cpuset_read_u64, | ||
| 1517 | .write_u64 = cpuset_write_u64, | ||
| 1518 | .private = FILE_MEMORY_PRESSURE, | ||
| 1519 | }, | ||
| 1520 | |||
| 1521 | { | ||
| 1522 | .name = "memory_spread_page", | ||
| 1523 | .read_u64 = cpuset_read_u64, | ||
| 1524 | .write_u64 = cpuset_write_u64, | ||
| 1525 | .private = FILE_SPREAD_PAGE, | ||
| 1526 | }, | ||
| 1527 | |||
| 1528 | { | ||
| 1529 | .name = "memory_spread_slab", | ||
| 1530 | .read_u64 = cpuset_read_u64, | ||
| 1531 | .write_u64 = cpuset_write_u64, | ||
| 1532 | .private = FILE_SPREAD_SLAB, | ||
| 1533 | }, | ||
| 1432 | }; | 1534 | }; |
| 1433 | 1535 | ||
| 1434 | static struct cftype cft_memory_pressure_enabled = { | 1536 | static struct cftype cft_memory_pressure_enabled = { |
| 1435 | .name = "memory_pressure_enabled", | 1537 | .name = "memory_pressure_enabled", |
| 1436 | .read = cpuset_common_file_read, | 1538 | .read_u64 = cpuset_read_u64, |
| 1437 | .write = cpuset_common_file_write, | 1539 | .write_u64 = cpuset_write_u64, |
| 1438 | .private = FILE_MEMORY_PRESSURE_ENABLED, | 1540 | .private = FILE_MEMORY_PRESSURE_ENABLED, |
| 1439 | }; | 1541 | }; |
| 1440 | 1542 | ||
| 1441 | static struct cftype cft_memory_pressure = { | ||
| 1442 | .name = "memory_pressure", | ||
| 1443 | .read = cpuset_common_file_read, | ||
| 1444 | .write = cpuset_common_file_write, | ||
| 1445 | .private = FILE_MEMORY_PRESSURE, | ||
| 1446 | }; | ||
| 1447 | |||
| 1448 | static struct cftype cft_spread_page = { | ||
| 1449 | .name = "memory_spread_page", | ||
| 1450 | .read = cpuset_common_file_read, | ||
| 1451 | .write = cpuset_common_file_write, | ||
| 1452 | .private = FILE_SPREAD_PAGE, | ||
| 1453 | }; | ||
| 1454 | |||
| 1455 | static struct cftype cft_spread_slab = { | ||
| 1456 | .name = "memory_spread_slab", | ||
| 1457 | .read = cpuset_common_file_read, | ||
| 1458 | .write = cpuset_common_file_write, | ||
| 1459 | .private = FILE_SPREAD_SLAB, | ||
| 1460 | }; | ||
| 1461 | |||
| 1462 | static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 1543 | static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
| 1463 | { | 1544 | { |
| 1464 | int err; | 1545 | int err; |
| 1465 | 1546 | ||
| 1466 | if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0) | 1547 | err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); |
| 1467 | return err; | 1548 | if (err) |
| 1468 | if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0) | ||
| 1469 | return err; | ||
| 1470 | if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0) | ||
| 1471 | return err; | ||
| 1472 | if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0) | ||
| 1473 | return err; | ||
| 1474 | if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0) | ||
| 1475 | return err; | ||
| 1476 | if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) | ||
| 1477 | return err; | ||
| 1478 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) | ||
| 1479 | return err; | ||
| 1480 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) | ||
| 1481 | return err; | ||
| 1482 | if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0) | ||
| 1483 | return err; | 1549 | return err; |
| 1484 | /* memory_pressure_enabled is in root cpuset only */ | 1550 | /* memory_pressure_enabled is in root cpuset only */ |
| 1485 | if (err == 0 && !cont->parent) | 1551 | if (!cont->parent) |
| 1486 | err = cgroup_add_file(cont, ss, | 1552 | err = cgroup_add_file(cont, ss, |
| 1487 | &cft_memory_pressure_enabled); | 1553 | &cft_memory_pressure_enabled); |
| 1488 | return 0; | 1554 | return err; |
| 1489 | } | 1555 | } |
| 1490 | 1556 | ||
| 1491 | /* | 1557 | /* |
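With every boolean and counter file funneled through cpuset_read_u64()/cpuset_write_u64(), the cgroup core now does the integer parsing and formatting, and the whole table is registered in one cgroup_add_files() call instead of nine individual cgroup_add_file() calls. A minimal sketch of that shape for some other subsystem - the name "example_knob" and both handlers are hypothetical, only the cftype fields and helper signatures mirror the ones used above:

    static u64 example_read(struct cgroup *cgrp, struct cftype *cft)
    {
        return 0;       /* report the current value of the knob */
    }

    static int example_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
    {
        return 0;       /* accept and store the new value */
    }

    static struct cftype example_files[] = {
        {
            .name = "example_knob",
            .read_u64 = example_read,
            .write_u64 = example_write,
        },
    };

    static int example_populate(struct cgroup_subsys *ss, struct cgroup *cont)
    {
        return cgroup_add_files(cont, ss, example_files,
                                ARRAY_SIZE(example_files));
    }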
| @@ -1555,10 +1621,11 @@ static struct cgroup_subsys_state *cpuset_create( | |||
| 1555 | if (is_spread_slab(parent)) | 1621 | if (is_spread_slab(parent)) |
| 1556 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1622 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
| 1557 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1623 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
| 1558 | cs->cpus_allowed = CPU_MASK_NONE; | 1624 | cpus_clear(cs->cpus_allowed); |
| 1559 | cs->mems_allowed = NODE_MASK_NONE; | 1625 | nodes_clear(cs->mems_allowed); |
| 1560 | cs->mems_generation = cpuset_mems_generation++; | 1626 | cs->mems_generation = cpuset_mems_generation++; |
| 1561 | fmeter_init(&cs->fmeter); | 1627 | fmeter_init(&cs->fmeter); |
| 1628 | cs->relax_domain_level = -1; | ||
| 1562 | 1629 | ||
| 1563 | cs->parent = parent; | 1630 | cs->parent = parent; |
| 1564 | number_of_cpusets++; | 1631 | number_of_cpusets++; |
| @@ -1584,7 +1651,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 1584 | cpuset_update_task_memory_state(); | 1651 | cpuset_update_task_memory_state(); |
| 1585 | 1652 | ||
| 1586 | if (is_sched_load_balance(cs)) | 1653 | if (is_sched_load_balance(cs)) |
| 1587 | update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); | 1654 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
| 1588 | 1655 | ||
| 1589 | number_of_cpusets--; | 1656 | number_of_cpusets--; |
| 1590 | kfree(cs); | 1657 | kfree(cs); |
| @@ -1625,12 +1692,13 @@ int __init cpuset_init(void) | |||
| 1625 | { | 1692 | { |
| 1626 | int err = 0; | 1693 | int err = 0; |
| 1627 | 1694 | ||
| 1628 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | 1695 | cpus_setall(top_cpuset.cpus_allowed); |
| 1629 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1696 | nodes_setall(top_cpuset.mems_allowed); |
| 1630 | 1697 | ||
| 1631 | fmeter_init(&top_cpuset.fmeter); | 1698 | fmeter_init(&top_cpuset.fmeter); |
| 1632 | top_cpuset.mems_generation = cpuset_mems_generation++; | 1699 | top_cpuset.mems_generation = cpuset_mems_generation++; |
| 1633 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); | 1700 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); |
| 1701 | top_cpuset.relax_domain_level = -1; | ||
| 1634 | 1702 | ||
| 1635 | err = register_filesystem(&cpuset_fs_type); | 1703 | err = register_filesystem(&cpuset_fs_type); |
| 1636 | if (err < 0) | 1704 | if (err < 0) |
| @@ -1648,7 +1716,8 @@ int __init cpuset_init(void) | |||
| 1648 | * Called by cgroup_scan_tasks() for each task in a cgroup. | 1716 | * Called by cgroup_scan_tasks() for each task in a cgroup. |
| 1649 | * Return nonzero to stop the walk through the tasks. | 1717 | * Return nonzero to stop the walk through the tasks. |
| 1650 | */ | 1718 | */ |
| 1651 | void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) | 1719 | static void cpuset_do_move_task(struct task_struct *tsk, |
| 1720 | struct cgroup_scanner *scan) | ||
| 1652 | { | 1721 | { |
| 1653 | struct cpuset_hotplug_scanner *chsp; | 1722 | struct cpuset_hotplug_scanner *chsp; |
| 1654 | 1723 | ||
| @@ -1844,6 +1913,7 @@ void __init cpuset_init_smp(void) | |||
| 1844 | 1913 | ||
| 1845 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. | 1914 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. |
| 1846 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | 1915 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. |
| 1916 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. | ||
| 1847 | * | 1917 | * |
| 1848 | * Description: Returns the cpumask_t cpus_allowed of the cpuset | 1918 | * Description: Returns the cpumask_t cpus_allowed of the cpuset |
| 1849 | * attached to the specified @tsk. Guaranteed to return some non-empty | 1919 | * attached to the specified @tsk. Guaranteed to return some non-empty |
| @@ -1851,35 +1921,27 @@ void __init cpuset_init_smp(void) | |||
| 1851 | * tasks cpuset. | 1921 | * tasks cpuset. |
| 1852 | **/ | 1922 | **/ |
| 1853 | 1923 | ||
| 1854 | cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) | 1924 | void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) |
| 1855 | { | 1925 | { |
| 1856 | cpumask_t mask; | ||
| 1857 | |||
| 1858 | mutex_lock(&callback_mutex); | 1926 | mutex_lock(&callback_mutex); |
| 1859 | mask = cpuset_cpus_allowed_locked(tsk); | 1927 | cpuset_cpus_allowed_locked(tsk, pmask); |
| 1860 | mutex_unlock(&callback_mutex); | 1928 | mutex_unlock(&callback_mutex); |
| 1861 | |||
| 1862 | return mask; | ||
| 1863 | } | 1929 | } |
| 1864 | 1930 | ||
| 1865 | /** | 1931 | /** |
| 1866 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. | 1932 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. |
| 1867 | * Must be called with callback_mutex held. | 1933 | * Must be called with callback_mutex held. |
| 1868 | **/ | 1934 | **/ |
| 1869 | cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) | 1935 | void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) |
| 1870 | { | 1936 | { |
| 1871 | cpumask_t mask; | ||
| 1872 | |||
| 1873 | task_lock(tsk); | 1937 | task_lock(tsk); |
| 1874 | guarantee_online_cpus(task_cs(tsk), &mask); | 1938 | guarantee_online_cpus(task_cs(tsk), pmask); |
| 1875 | task_unlock(tsk); | 1939 | task_unlock(tsk); |
| 1876 | |||
| 1877 | return mask; | ||
| 1878 | } | 1940 | } |
| 1879 | 1941 | ||
| 1880 | void cpuset_init_current_mems_allowed(void) | 1942 | void cpuset_init_current_mems_allowed(void) |
| 1881 | { | 1943 | { |
| 1882 | current->mems_allowed = NODE_MASK_ALL; | 1944 | nodes_setall(current->mems_allowed); |
| 1883 | } | 1945 | } |
| 1884 | 1946 | ||
| 1885 | /** | 1947 | /** |
| @@ -1906,33 +1968,25 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | |||
| 1906 | } | 1968 | } |
| 1907 | 1969 | ||
| 1908 | /** | 1970 | /** |
| 1909 | * cpuset_zonelist_valid_mems_allowed - check zonelist vs. current mems_allowed | 1971 | * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed |
| 1910 | * @zl: the zonelist to be checked | 1972 | * @nodemask: the nodemask to be checked |
| 1911 | * | 1973 | * |
| 1912 | * Are any of the nodes on zonelist zl allowed in current->mems_allowed? | 1974 | * Are any of the nodes in the nodemask allowed in current->mems_allowed? |
| 1913 | */ | 1975 | */ |
| 1914 | int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | 1976 | int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) |
| 1915 | { | 1977 | { |
| 1916 | int i; | 1978 | return nodes_intersects(*nodemask, current->mems_allowed); |
| 1917 | |||
| 1918 | for (i = 0; zl->zones[i]; i++) { | ||
| 1919 | int nid = zone_to_nid(zl->zones[i]); | ||
| 1920 | |||
| 1921 | if (node_isset(nid, current->mems_allowed)) | ||
| 1922 | return 1; | ||
| 1923 | } | ||
| 1924 | return 0; | ||
| 1925 | } | 1979 | } |
| 1926 | 1980 | ||
| 1927 | /* | 1981 | /* |
| 1928 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 1982 | * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or |
| 1929 | * ancestor to the specified cpuset. Call holding callback_mutex. | 1983 | * mem_hardwall ancestor to the specified cpuset. Call holding |
| 1930 | * If no ancestor is mem_exclusive (an unusual configuration), then | 1984 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall |
| 1931 | * returns the root cpuset. | 1985 | * (an unusual configuration), then returns the root cpuset. |
| 1932 | */ | 1986 | */ |
| 1933 | static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | 1987 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) |
| 1934 | { | 1988 | { |
| 1935 | while (!is_mem_exclusive(cs) && cs->parent) | 1989 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) |
| 1936 | cs = cs->parent; | 1990 | cs = cs->parent; |
| 1937 | return cs; | 1991 | return cs; |
| 1938 | } | 1992 | } |
| @@ -1946,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 1946 | * __GFP_THISNODE is set, yes, we can always allocate. If zone | 2000 | * __GFP_THISNODE is set, yes, we can always allocate. If zone |
| 1947 | * z's node is in our tasks mems_allowed, yes. If it's not a | 2001 | * z's node is in our tasks mems_allowed, yes. If it's not a |
| 1948 | * __GFP_HARDWALL request and this zone's node is in the nearest | 2002 | * __GFP_HARDWALL request and this zone's node is in the nearest |
| 1949 | * mem_exclusive cpuset ancestor to this tasks cpuset, yes. | 2003 | * hardwalled cpuset ancestor to this tasks cpuset, yes. |
| 1950 | * If the task has been OOM killed and has access to memory reserves | 2004 | * If the task has been OOM killed and has access to memory reserves |
| 1951 | * as specified by the TIF_MEMDIE flag, yes. | 2005 | * as specified by the TIF_MEMDIE flag, yes. |
| 1952 | * Otherwise, no. | 2006 | * Otherwise, no. |
| @@ -1969,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 1969 | * and do not allow allocations outside the current tasks cpuset | 2023 | * and do not allow allocations outside the current tasks cpuset |
| 1970 | * unless the task has been OOM killed and is marked TIF_MEMDIE. | 2024 | * unless the task has been OOM killed and is marked TIF_MEMDIE. |
| 1971 | * GFP_KERNEL allocations are not so marked, so can escape to the | 2025 | * GFP_KERNEL allocations are not so marked, so can escape to the |
| 1972 | * nearest enclosing mem_exclusive ancestor cpuset. | 2026 | * nearest enclosing hardwalled ancestor cpuset. |
| 1973 | * | 2027 | * |
| 1974 | * Scanning up parent cpusets requires callback_mutex. The | 2028 | * Scanning up parent cpusets requires callback_mutex. The |
| 1975 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit | 2029 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit |
| @@ -1992,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 1992 | * in_interrupt - any node ok (current task context irrelevant) | 2046 | * in_interrupt - any node ok (current task context irrelevant) |
| 1993 | * GFP_ATOMIC - any node ok | 2047 | * GFP_ATOMIC - any node ok |
| 1994 | * TIF_MEMDIE - any node ok | 2048 | * TIF_MEMDIE - any node ok |
| 1995 | * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok | 2049 | * GFP_KERNEL - any node in enclosing hardwalled cpuset ok |
| 1996 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2050 | * GFP_USER - only nodes in current tasks mems allowed ok. |
| 1997 | * | 2051 | * |
| 1998 | * Rule: | 2052 | * Rule: |
| @@ -2029,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) | |||
| 2029 | mutex_lock(&callback_mutex); | 2083 | mutex_lock(&callback_mutex); |
| 2030 | 2084 | ||
| 2031 | task_lock(current); | 2085 | task_lock(current); |
| 2032 | cs = nearest_exclusive_ancestor(task_cs(current)); | 2086 | cs = nearest_hardwall_ancestor(task_cs(current)); |
| 2033 | task_unlock(current); | 2087 | task_unlock(current); |
| 2034 | 2088 | ||
| 2035 | allowed = node_isset(node, cs->mems_allowed); | 2089 | allowed = node_isset(node, cs->mems_allowed); |
| @@ -2261,8 +2315,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | |||
| 2261 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, | 2315 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, |
| 2262 | task->cpus_allowed); | 2316 | task->cpus_allowed); |
| 2263 | seq_printf(m, "\n"); | 2317 | seq_printf(m, "\n"); |
| 2318 | seq_printf(m, "Cpus_allowed_list:\t"); | ||
| 2319 | m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, | ||
| 2320 | task->cpus_allowed); | ||
| 2321 | seq_printf(m, "\n"); | ||
| 2264 | seq_printf(m, "Mems_allowed:\t"); | 2322 | seq_printf(m, "Mems_allowed:\t"); |
| 2265 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, | 2323 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, |
| 2266 | task->mems_allowed); | 2324 | task->mems_allowed); |
| 2267 | seq_printf(m, "\n"); | 2325 | seq_printf(m, "\n"); |
| 2326 | seq_printf(m, "Mems_allowed_list:\t"); | ||
| 2327 | m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, | ||
| 2328 | task->mems_allowed); | ||
| 2329 | seq_printf(m, "\n"); | ||
| 2268 | } | 2330 | } |
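The cpuset_cpus_allowed() rework above changes the calling convention from returning a cpumask_t by value to filling a caller-supplied mask, so NR_CPUS-sized masks are no longer copied through the return path. A minimal caller-side sketch of the two conventions (the call site is illustrative, not lifted from the tree):

        cpumask_t mask;

        /* old convention: whole mask copied back by value */
        mask = cpuset_cpus_allowed(tsk);

        /* new convention: caller provides the storage, callee fills it
         * in place under callback_mutex */
        cpuset_cpus_allowed(tsk, &mask);

The new Cpus_allowed_list and Mems_allowed_list fields added to /proc/<pid>/status print the same masks in list form (e.g. "0-3,6") via cpulist_scnprintf() and nodelist_scnprintf().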
diff --git a/kernel/dma.c b/kernel/dma.c index 6a82bb716dac..d2c60a822790 100644 --- a/kernel/dma.c +++ b/kernel/dma.c | |||
| @@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = { | |||
| 149 | 149 | ||
| 150 | static int __init proc_dma_init(void) | 150 | static int __init proc_dma_init(void) |
| 151 | { | 151 | { |
| 152 | struct proc_dir_entry *e; | 152 | proc_create("dma", 0, NULL, &proc_dma_operations); |
| 153 | |||
| 154 | e = create_proc_entry("dma", 0, NULL); | ||
| 155 | if (e) | ||
| 156 | e->proc_fops = &proc_dma_operations; | ||
| 157 | |||
| 158 | return 0; | 153 | return 0; |
| 159 | } | 154 | } |
| 160 | 155 | ||
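The proc_dma_init() hunk is one instance of the generic create_proc_entry() to proc_create() conversion: proc_create() publishes the entry with its file_operations already attached, closing the window in which the entry is visible with a NULL proc_fops. A before/after sketch with placeholder names ("foo", foo_proc_fops):

        /* before: entry is live before proc_fops is assigned */
        struct proc_dir_entry *e;

        e = create_proc_entry("foo", 0, NULL);
        if (e)
                e->proc_fops = &foo_proc_fops;

        /* after: creation and fops registration happen in one step */
        proc_create("foo", 0, NULL, &foo_proc_fops);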
diff --git a/kernel/exit.c b/kernel/exit.c index 53872bf993fa..1510f78a0ffa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/acct.h> | 19 | #include <linux/acct.h> |
| 20 | #include <linux/tsacct_kern.h> | 20 | #include <linux/tsacct_kern.h> |
| 21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
| 22 | #include <linux/fdtable.h> | ||
| 22 | #include <linux/binfmts.h> | 23 | #include <linux/binfmts.h> |
| 23 | #include <linux/nsproxy.h> | 24 | #include <linux/nsproxy.h> |
| 24 | #include <linux/pid_namespace.h> | 25 | #include <linux/pid_namespace.h> |
| @@ -52,6 +53,11 @@ | |||
| 52 | 53 | ||
| 53 | static void exit_mm(struct task_struct * tsk); | 54 | static void exit_mm(struct task_struct * tsk); |
| 54 | 55 | ||
| 56 | static inline int task_detached(struct task_struct *p) | ||
| 57 | { | ||
| 58 | return p->exit_signal == -1; | ||
| 59 | } | ||
| 60 | |||
| 55 | static void __unhash_process(struct task_struct *p) | 61 | static void __unhash_process(struct task_struct *p) |
| 56 | { | 62 | { |
| 57 | nr_threads--; | 63 | nr_threads--; |
| @@ -160,7 +166,7 @@ repeat: | |||
| 160 | zap_leader = 0; | 166 | zap_leader = 0; |
| 161 | leader = p->group_leader; | 167 | leader = p->group_leader; |
| 162 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { | 168 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { |
| 163 | BUG_ON(leader->exit_signal == -1); | 169 | BUG_ON(task_detached(leader)); |
| 164 | do_notify_parent(leader, leader->exit_signal); | 170 | do_notify_parent(leader, leader->exit_signal); |
| 165 | /* | 171 | /* |
| 166 | * If we were the last child thread and the leader has | 172 | * If we were the last child thread and the leader has |
| @@ -170,7 +176,7 @@ repeat: | |||
| 170 | * do_notify_parent() will have marked it self-reaping in | 176 | * do_notify_parent() will have marked it self-reaping in |
| 171 | * that case. | 177 | * that case. |
| 172 | */ | 178 | */ |
| 173 | zap_leader = (leader->exit_signal == -1); | 179 | zap_leader = task_detached(leader); |
| 174 | } | 180 | } |
| 175 | 181 | ||
| 176 | write_unlock_irq(&tasklist_lock); | 182 | write_unlock_irq(&tasklist_lock); |
| @@ -329,13 +335,11 @@ void __set_special_pids(struct pid *pid) | |||
| 329 | pid_t nr = pid_nr(pid); | 335 | pid_t nr = pid_nr(pid); |
| 330 | 336 | ||
| 331 | if (task_session(curr) != pid) { | 337 | if (task_session(curr) != pid) { |
| 332 | detach_pid(curr, PIDTYPE_SID); | 338 | change_pid(curr, PIDTYPE_SID, pid); |
| 333 | attach_pid(curr, PIDTYPE_SID, pid); | ||
| 334 | set_task_session(curr, nr); | 339 | set_task_session(curr, nr); |
| 335 | } | 340 | } |
| 336 | if (task_pgrp(curr) != pid) { | 341 | if (task_pgrp(curr) != pid) { |
| 337 | detach_pid(curr, PIDTYPE_PGID); | 342 | change_pid(curr, PIDTYPE_PGID, pid); |
| 338 | attach_pid(curr, PIDTYPE_PGID, pid); | ||
| 339 | set_task_pgrp(curr, nr); | 343 | set_task_pgrp(curr, nr); |
| 340 | } | 344 | } |
| 341 | } | 345 | } |
| @@ -507,10 +511,9 @@ void put_files_struct(struct files_struct *files) | |||
| 507 | } | 511 | } |
| 508 | } | 512 | } |
| 509 | 513 | ||
| 510 | EXPORT_SYMBOL(put_files_struct); | 514 | void reset_files_struct(struct files_struct *files) |
| 511 | |||
| 512 | void reset_files_struct(struct task_struct *tsk, struct files_struct *files) | ||
| 513 | { | 515 | { |
| 516 | struct task_struct *tsk = current; | ||
| 514 | struct files_struct *old; | 517 | struct files_struct *old; |
| 515 | 518 | ||
| 516 | old = tsk->files; | 519 | old = tsk->files; |
| @@ -519,9 +522,8 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) | |||
| 519 | task_unlock(tsk); | 522 | task_unlock(tsk); |
| 520 | put_files_struct(old); | 523 | put_files_struct(old); |
| 521 | } | 524 | } |
| 522 | EXPORT_SYMBOL(reset_files_struct); | ||
| 523 | 525 | ||
| 524 | static void __exit_files(struct task_struct *tsk) | 526 | void exit_files(struct task_struct *tsk) |
| 525 | { | 527 | { |
| 526 | struct files_struct * files = tsk->files; | 528 | struct files_struct * files = tsk->files; |
| 527 | 529 | ||
| @@ -533,12 +535,7 @@ static void __exit_files(struct task_struct *tsk) | |||
| 533 | } | 535 | } |
| 534 | } | 536 | } |
| 535 | 537 | ||
| 536 | void exit_files(struct task_struct *tsk) | 538 | void put_fs_struct(struct fs_struct *fs) |
| 537 | { | ||
| 538 | __exit_files(tsk); | ||
| 539 | } | ||
| 540 | |||
| 541 | static void __put_fs_struct(struct fs_struct *fs) | ||
| 542 | { | 539 | { |
| 543 | /* No need to hold fs->lock if we are killing it */ | 540 | /* No need to hold fs->lock if we are killing it */ |
| 544 | if (atomic_dec_and_test(&fs->count)) { | 541 | if (atomic_dec_and_test(&fs->count)) { |
| @@ -550,12 +547,7 @@ static void __put_fs_struct(struct fs_struct *fs) | |||
| 550 | } | 547 | } |
| 551 | } | 548 | } |
| 552 | 549 | ||
| 553 | void put_fs_struct(struct fs_struct *fs) | 550 | void exit_fs(struct task_struct *tsk) |
| 554 | { | ||
| 555 | __put_fs_struct(fs); | ||
| 556 | } | ||
| 557 | |||
| 558 | static void __exit_fs(struct task_struct *tsk) | ||
| 559 | { | 551 | { |
| 560 | struct fs_struct * fs = tsk->fs; | 552 | struct fs_struct * fs = tsk->fs; |
| 561 | 553 | ||
| @@ -563,16 +555,93 @@ static void __exit_fs(struct task_struct *tsk) | |||
| 563 | task_lock(tsk); | 555 | task_lock(tsk); |
| 564 | tsk->fs = NULL; | 556 | tsk->fs = NULL; |
| 565 | task_unlock(tsk); | 557 | task_unlock(tsk); |
| 566 | __put_fs_struct(fs); | 558 | put_fs_struct(fs); |
| 567 | } | 559 | } |
| 568 | } | 560 | } |
| 569 | 561 | ||
| 570 | void exit_fs(struct task_struct *tsk) | 562 | EXPORT_SYMBOL_GPL(exit_fs); |
| 563 | |||
| 564 | #ifdef CONFIG_MM_OWNER | ||
| 565 | /* | ||
| 566 | * Task p is exiting and it owned mm, lets find a new owner for it | ||
| 567 | */ | ||
| 568 | static inline int | ||
| 569 | mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) | ||
| 571 | { | 570 | { |
| 572 | __exit_fs(tsk); | 571 | /* |
| 572 | * If there are other users of the mm and the owner (us) is exiting | ||
| 573 | * we need to find a new owner to take on the responsibility. | ||
| 574 | */ | ||
| 575 | if (!mm) | ||
| 576 | return 0; | ||
| 577 | if (atomic_read(&mm->mm_users) <= 1) | ||
| 578 | return 0; | ||
| 579 | if (mm->owner != p) | ||
| 580 | return 0; | ||
| 581 | return 1; | ||
| 573 | } | 582 | } |
| 574 | 583 | ||
| 575 | EXPORT_SYMBOL_GPL(exit_fs); | 584 | void mm_update_next_owner(struct mm_struct *mm) |
| 585 | { | ||
| 586 | struct task_struct *c, *g, *p = current; | ||
| 587 | |||
| 588 | retry: | ||
| 589 | if (!mm_need_new_owner(mm, p)) | ||
| 590 | return; | ||
| 591 | |||
| 592 | read_lock(&tasklist_lock); | ||
| 593 | /* | ||
| 594 | * Search in the children | ||
| 595 | */ | ||
| 596 | list_for_each_entry(c, &p->children, sibling) { | ||
| 597 | if (c->mm == mm) | ||
| 598 | goto assign_new_owner; | ||
| 599 | } | ||
| 600 | |||
| 601 | /* | ||
| 602 | * Search in the siblings | ||
| 603 | */ | ||
| 604 | list_for_each_entry(c, &p->parent->children, sibling) { | ||
| 605 | if (c->mm == mm) | ||
| 606 | goto assign_new_owner; | ||
| 607 | } | ||
| 608 | |||
| 609 | /* | ||
| 610 | * Search through everything else. We should not get | ||
| 611 | * here often | ||
| 612 | */ | ||
| 613 | do_each_thread(g, c) { | ||
| 614 | if (c->mm == mm) | ||
| 615 | goto assign_new_owner; | ||
| 616 | } while_each_thread(g, c); | ||
| 617 | |||
| 618 | read_unlock(&tasklist_lock); | ||
| 619 | return; | ||
| 620 | |||
| 621 | assign_new_owner: | ||
| 622 | BUG_ON(c == p); | ||
| 623 | get_task_struct(c); | ||
| 624 | /* | ||
| 625 | * The task_lock protects c->mm from changing. | ||
| 626 | * We always want mm->owner->mm == mm | ||
| 627 | */ | ||
| 628 | task_lock(c); | ||
| 629 | /* | ||
| 630 | * Delay read_unlock() till we have the task_lock() | ||
| 631 | * to ensure that c does not slip away underneath us | ||
| 632 | */ | ||
| 633 | read_unlock(&tasklist_lock); | ||
| 634 | if (c->mm != mm) { | ||
| 635 | task_unlock(c); | ||
| 636 | put_task_struct(c); | ||
| 637 | goto retry; | ||
| 638 | } | ||
| 639 | cgroup_mm_owner_callbacks(mm->owner, c); | ||
| 640 | mm->owner = c; | ||
| 641 | task_unlock(c); | ||
| 642 | put_task_struct(c); | ||
| 643 | } | ||
| 644 | #endif /* CONFIG_MM_OWNER */ | ||
| 576 | 645 | ||
| 577 | /* | 646 | /* |
| 578 | * Turn us into a lazy TLB process if we | 647 | * Turn us into a lazy TLB process if we |
| @@ -613,6 +682,7 @@ static void exit_mm(struct task_struct * tsk) | |||
| 613 | /* We don't want this task to be frozen prematurely */ | 682 | /* We don't want this task to be frozen prematurely */ |
| 614 | clear_freeze_flag(tsk); | 683 | clear_freeze_flag(tsk); |
| 615 | task_unlock(tsk); | 684 | task_unlock(tsk); |
| 685 | mm_update_next_owner(mm); | ||
| 616 | mmput(mm); | 686 | mmput(mm); |
| 617 | } | 687 | } |
| 618 | 688 | ||
| @@ -627,7 +697,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
| 627 | if (unlikely(traced)) { | 697 | if (unlikely(traced)) { |
| 628 | /* Preserve ptrace links if someone else is tracing this child. */ | 698 | /* Preserve ptrace links if someone else is tracing this child. */ |
| 629 | list_del_init(&p->ptrace_list); | 699 | list_del_init(&p->ptrace_list); |
| 630 | if (p->parent != p->real_parent) | 700 | if (ptrace_reparented(p)) |
| 631 | list_add(&p->ptrace_list, &p->real_parent->ptrace_children); | 701 | list_add(&p->ptrace_list, &p->real_parent->ptrace_children); |
| 632 | } else { | 702 | } else { |
| 633 | /* If this child is being traced, then we're the one tracing it | 703 | /* If this child is being traced, then we're the one tracing it |
| @@ -651,18 +721,18 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
| 651 | /* If this is a threaded reparent there is no need to | 721 | /* If this is a threaded reparent there is no need to |
| 652 | * notify anyone anything has happened. | 722 | * notify anyone anything has happened. |
| 653 | */ | 723 | */ |
| 654 | if (p->real_parent->group_leader == father->group_leader) | 724 | if (same_thread_group(p->real_parent, father)) |
| 655 | return; | 725 | return; |
| 656 | 726 | ||
| 657 | /* We don't want people slaying init. */ | 727 | /* We don't want people slaying init. */ |
| 658 | if (p->exit_signal != -1) | 728 | if (!task_detached(p)) |
| 659 | p->exit_signal = SIGCHLD; | 729 | p->exit_signal = SIGCHLD; |
| 660 | 730 | ||
| 661 | /* If we'd notified the old parent about this child's death, | 731 | /* If we'd notified the old parent about this child's death, |
| 662 | * also notify the new parent. | 732 | * also notify the new parent. |
| 663 | */ | 733 | */ |
| 664 | if (!traced && p->exit_state == EXIT_ZOMBIE && | 734 | if (!traced && p->exit_state == EXIT_ZOMBIE && |
| 665 | p->exit_signal != -1 && thread_group_empty(p)) | 735 | !task_detached(p) && thread_group_empty(p)) |
| 666 | do_notify_parent(p, p->exit_signal); | 736 | do_notify_parent(p, p->exit_signal); |
| 667 | 737 | ||
| 668 | kill_orphaned_pgrp(p, father); | 738 | kill_orphaned_pgrp(p, father); |
| @@ -715,18 +785,18 @@ static void forget_original_parent(struct task_struct *father) | |||
| 715 | } else { | 785 | } else { |
| 716 | /* reparent ptraced task to its real parent */ | 786 | /* reparent ptraced task to its real parent */ |
| 717 | __ptrace_unlink (p); | 787 | __ptrace_unlink (p); |
| 718 | if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && | 788 | if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) && |
| 719 | thread_group_empty(p)) | 789 | thread_group_empty(p)) |
| 720 | do_notify_parent(p, p->exit_signal); | 790 | do_notify_parent(p, p->exit_signal); |
| 721 | } | 791 | } |
| 722 | 792 | ||
| 723 | /* | 793 | /* |
| 724 | * if the ptraced child is a zombie with exit_signal == -1 | 794 | * if the ptraced child is a detached zombie we must collect |
| 725 | * we must collect it before we exit, or it will remain | 795 | * it before we exit, or it will remain zombie forever since |
| 726 | * zombie forever since we prevented it from reaping itself | 796 | * we prevented it from reaping itself while it was being |
| 727 | * while it was being traced by us, to be able to see it in wait4. | 797 | * traced by us, to be able to see it in wait4. |
| 728 | */ | 798 | */ |
| 729 | if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) | 799 | if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p))) |
| 730 | list_add(&p->ptrace_list, &ptrace_dead); | 800 | list_add(&p->ptrace_list, &ptrace_dead); |
| 731 | } | 801 | } |
| 732 | 802 | ||
| @@ -783,29 +853,30 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
| 783 | * we have changed execution domain as these two values started | 853 | * we have changed execution domain as these two values started |
| 784 | * the same after a fork. | 854 | * the same after a fork. |
| 785 | */ | 855 | */ |
| 786 | if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && | 856 | if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && |
| 787 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || | 857 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || |
| 788 | tsk->self_exec_id != tsk->parent_exec_id) | 858 | tsk->self_exec_id != tsk->parent_exec_id) && |
| 789 | && !capable(CAP_KILL)) | 859 | !capable(CAP_KILL)) |
| 790 | tsk->exit_signal = SIGCHLD; | 860 | tsk->exit_signal = SIGCHLD; |
| 791 | 861 | ||
| 792 | |||
| 793 | /* If something other than our normal parent is ptracing us, then | 862 | /* If something other than our normal parent is ptracing us, then |
| 794 | * send it a SIGCHLD instead of honoring exit_signal. exit_signal | 863 | * send it a SIGCHLD instead of honoring exit_signal. exit_signal |
| 795 | * only has special meaning to our real parent. | 864 | * only has special meaning to our real parent. |
| 796 | */ | 865 | */ |
| 797 | if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { | 866 | if (!task_detached(tsk) && thread_group_empty(tsk)) { |
| 798 | int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; | 867 | int signal = ptrace_reparented(tsk) ? |
| 868 | SIGCHLD : tsk->exit_signal; | ||
| 799 | do_notify_parent(tsk, signal); | 869 | do_notify_parent(tsk, signal); |
| 800 | } else if (tsk->ptrace) { | 870 | } else if (tsk->ptrace) { |
| 801 | do_notify_parent(tsk, SIGCHLD); | 871 | do_notify_parent(tsk, SIGCHLD); |
| 802 | } | 872 | } |
| 803 | 873 | ||
| 804 | state = EXIT_ZOMBIE; | 874 | state = EXIT_ZOMBIE; |
| 805 | if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) | 875 | if (task_detached(tsk) && likely(!tsk->ptrace)) |
| 806 | state = EXIT_DEAD; | 876 | state = EXIT_DEAD; |
| 807 | tsk->exit_state = state; | 877 | tsk->exit_state = state; |
| 808 | 878 | ||
| 879 | /* mt-exec, de_thread() is waiting for us */ | ||
| 809 | if (thread_group_leader(tsk) && | 880 | if (thread_group_leader(tsk) && |
| 810 | tsk->signal->notify_count < 0 && | 881 | tsk->signal->notify_count < 0 && |
| 811 | tsk->signal->group_exit_task) | 882 | tsk->signal->group_exit_task) |
| @@ -967,8 +1038,8 @@ NORET_TYPE void do_exit(long code) | |||
| 967 | if (group_dead) | 1038 | if (group_dead) |
| 968 | acct_process(); | 1039 | acct_process(); |
| 969 | exit_sem(tsk); | 1040 | exit_sem(tsk); |
| 970 | __exit_files(tsk); | 1041 | exit_files(tsk); |
| 971 | __exit_fs(tsk); | 1042 | exit_fs(tsk); |
| 972 | check_stack_usage(); | 1043 | check_stack_usage(); |
| 973 | exit_thread(); | 1044 | exit_thread(); |
| 974 | cgroup_exit(tsk, 1); | 1045 | cgroup_exit(tsk, 1); |
| @@ -984,7 +1055,7 @@ NORET_TYPE void do_exit(long code) | |||
| 984 | proc_exit_connector(tsk); | 1055 | proc_exit_connector(tsk); |
| 985 | exit_notify(tsk, group_dead); | 1056 | exit_notify(tsk, group_dead); |
| 986 | #ifdef CONFIG_NUMA | 1057 | #ifdef CONFIG_NUMA |
| 987 | mpol_free(tsk->mempolicy); | 1058 | mpol_put(tsk->mempolicy); |
| 988 | tsk->mempolicy = NULL; | 1059 | tsk->mempolicy = NULL; |
| 989 | #endif | 1060 | #endif |
| 990 | #ifdef CONFIG_FUTEX | 1061 | #ifdef CONFIG_FUTEX |
| @@ -1049,12 +1120,13 @@ asmlinkage long sys_exit(int error_code) | |||
| 1049 | NORET_TYPE void | 1120 | NORET_TYPE void |
| 1050 | do_group_exit(int exit_code) | 1121 | do_group_exit(int exit_code) |
| 1051 | { | 1122 | { |
| 1123 | struct signal_struct *sig = current->signal; | ||
| 1124 | |||
| 1052 | BUG_ON(exit_code & 0x80); /* core dumps don't get here */ | 1125 | BUG_ON(exit_code & 0x80); /* core dumps don't get here */ |
| 1053 | 1126 | ||
| 1054 | if (current->signal->flags & SIGNAL_GROUP_EXIT) | 1127 | if (signal_group_exit(sig)) |
| 1055 | exit_code = current->signal->group_exit_code; | 1128 | exit_code = sig->group_exit_code; |
| 1056 | else if (!thread_group_empty(current)) { | 1129 | else if (!thread_group_empty(current)) { |
| 1057 | struct signal_struct *const sig = current->signal; | ||
| 1058 | struct sighand_struct *const sighand = current->sighand; | 1130 | struct sighand_struct *const sighand = current->sighand; |
| 1059 | spin_lock_irq(&sighand->siglock); | 1131 | spin_lock_irq(&sighand->siglock); |
| 1060 | if (signal_group_exit(sig)) | 1132 | if (signal_group_exit(sig)) |
| @@ -1106,7 +1178,7 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, | |||
| 1106 | * Do not consider detached threads that are | 1178 | * Do not consider detached threads that are |
| 1107 | * not ptraced: | 1179 | * not ptraced: |
| 1108 | */ | 1180 | */ |
| 1109 | if (p->exit_signal == -1 && !p->ptrace) | 1181 | if (task_detached(p) && !p->ptrace) |
| 1110 | return 0; | 1182 | return 0; |
| 1111 | 1183 | ||
| 1112 | /* Wait for all children (clone and not) if __WALL is set; | 1184 | /* Wait for all children (clone and not) if __WALL is set; |
| @@ -1196,8 +1268,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap, | |||
| 1196 | return 0; | 1268 | return 0; |
| 1197 | } | 1269 | } |
| 1198 | 1270 | ||
| 1199 | /* traced means p->ptrace, but not vice versa */ | 1271 | traced = ptrace_reparented(p); |
| 1200 | traced = (p->real_parent != p->parent); | ||
| 1201 | 1272 | ||
| 1202 | if (likely(!traced)) { | 1273 | if (likely(!traced)) { |
| 1203 | struct signal_struct *psig; | 1274 | struct signal_struct *psig; |
| @@ -1298,9 +1369,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap, | |||
| 1298 | * If it's still not detached after that, don't release | 1369 | * If it's still not detached after that, don't release |
| 1299 | * it now. | 1370 | * it now. |
| 1300 | */ | 1371 | */ |
| 1301 | if (p->exit_signal != -1) { | 1372 | if (!task_detached(p)) { |
| 1302 | do_notify_parent(p, p->exit_signal); | 1373 | do_notify_parent(p, p->exit_signal); |
| 1303 | if (p->exit_signal != -1) { | 1374 | if (!task_detached(p)) { |
| 1304 | p->exit_state = EXIT_ZOMBIE; | 1375 | p->exit_state = EXIT_ZOMBIE; |
| 1305 | p = NULL; | 1376 | p = NULL; |
| 1306 | } | 1377 | } |
| @@ -1608,7 +1679,7 @@ asmlinkage long sys_waitid(int which, pid_t upid, | |||
| 1608 | put_pid(pid); | 1679 | put_pid(pid); |
| 1609 | 1680 | ||
| 1610 | /* avoid REGPARM breakage on x86: */ | 1681 | /* avoid REGPARM breakage on x86: */ |
| 1611 | prevent_tail_call(ret); | 1682 | asmlinkage_protect(5, ret, which, upid, infop, options, ru); |
| 1612 | return ret; | 1683 | return ret; |
| 1613 | } | 1684 | } |
| 1614 | 1685 | ||
| @@ -1640,7 +1711,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, | |||
| 1640 | put_pid(pid); | 1711 | put_pid(pid); |
| 1641 | 1712 | ||
| 1642 | /* avoid REGPARM breakage on x86: */ | 1713 | /* avoid REGPARM breakage on x86: */ |
| 1643 | prevent_tail_call(ret); | 1714 | asmlinkage_protect(4, ret, upid, stat_addr, options, ru); |
| 1644 | return ret; | 1715 | return ret; |
| 1645 | } | 1716 | } |
| 1646 | 1717 | ||
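Most of the exit.c churn replaces open-coded tests with small predicates; the hunks themselves establish the equivalences, roughly:

        /* defined at the top of exit.c in this patch */
        static inline int task_detached(struct task_struct *p)
        {
                return p->exit_signal == -1;
        }

        /* assumed to live in <linux/ptrace.h>; implied by the hunks that
         * replace "p->real_parent != p->parent" */
        static inline int ptrace_reparented(struct task_struct *p)
        {
                return p->real_parent != p->parent;
        }

The prevent_tail_call() to asmlinkage_protect() switch in sys_waitid()/sys_wait4() serves the same purpose of keeping the on-stack syscall arguments live, but names the argument count explicitly as its first parameter.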
diff --git a/kernel/fork.c b/kernel/fork.c index dd249c37b3a3..933e60ebccae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/mempolicy.h> | 22 | #include <linux/mempolicy.h> |
| 23 | #include <linux/sem.h> | 23 | #include <linux/sem.h> |
| 24 | #include <linux/file.h> | 24 | #include <linux/file.h> |
| 25 | #include <linux/fdtable.h> | ||
| 25 | #include <linux/key.h> | 26 | #include <linux/key.h> |
| 26 | #include <linux/binfmts.h> | 27 | #include <linux/binfmts.h> |
| 27 | #include <linux/mman.h> | 28 | #include <linux/mman.h> |
| @@ -132,6 +133,14 @@ void __put_task_struct(struct task_struct *tsk) | |||
| 132 | free_task(tsk); | 133 | free_task(tsk); |
| 133 | } | 134 | } |
| 134 | 135 | ||
| 136 | /* | ||
| 137 | * macro override instead of weak attribute alias, to work around | ||
| 138 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
| 139 | */ | ||
| 140 | #ifndef arch_task_cache_init | ||
| 141 | #define arch_task_cache_init() | ||
| 142 | #endif | ||
| 143 | |||
| 135 | void __init fork_init(unsigned long mempages) | 144 | void __init fork_init(unsigned long mempages) |
| 136 | { | 145 | { |
| 137 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 146 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR |
| @@ -144,6 +153,9 @@ void __init fork_init(unsigned long mempages) | |||
| 144 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); | 153 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); |
| 145 | #endif | 154 | #endif |
| 146 | 155 | ||
| 156 | /* do the arch specific task caches init */ | ||
| 157 | arch_task_cache_init(); | ||
| 158 | |||
| 147 | /* | 159 | /* |
| 148 | * The default maximum number of threads is set to a safe | 160 | * The default maximum number of threads is set to a safe |
| 149 | * value: the thread structures can take up at most half | 161 | * value: the thread structures can take up at most half |
| @@ -163,6 +175,13 @@ void __init fork_init(unsigned long mempages) | |||
| 163 | init_task.signal->rlim[RLIMIT_NPROC]; | 175 | init_task.signal->rlim[RLIMIT_NPROC]; |
| 164 | } | 176 | } |
| 165 | 177 | ||
| 178 | int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, | ||
| 179 | struct task_struct *src) | ||
| 180 | { | ||
| 181 | *dst = *src; | ||
| 182 | return 0; | ||
| 183 | } | ||
| 184 | |||
| 166 | static struct task_struct *dup_task_struct(struct task_struct *orig) | 185 | static struct task_struct *dup_task_struct(struct task_struct *orig) |
| 167 | { | 186 | { |
| 168 | struct task_struct *tsk; | 187 | struct task_struct *tsk; |
| @@ -181,15 +200,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 181 | return NULL; | 200 | return NULL; |
| 182 | } | 201 | } |
| 183 | 202 | ||
| 184 | *tsk = *orig; | 203 | err = arch_dup_task_struct(tsk, orig); |
| 204 | if (err) | ||
| 205 | goto out; | ||
| 206 | |||
| 185 | tsk->stack = ti; | 207 | tsk->stack = ti; |
| 186 | 208 | ||
| 187 | err = prop_local_init_single(&tsk->dirties); | 209 | err = prop_local_init_single(&tsk->dirties); |
| 188 | if (err) { | 210 | if (err) |
| 189 | free_thread_info(ti); | 211 | goto out; |
| 190 | free_task_struct(tsk); | ||
| 191 | return NULL; | ||
| 192 | } | ||
| 193 | 212 | ||
| 194 | setup_thread_stack(tsk, orig); | 213 | setup_thread_stack(tsk, orig); |
| 195 | 214 | ||
| @@ -205,6 +224,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 205 | #endif | 224 | #endif |
| 206 | tsk->splice_pipe = NULL; | 225 | tsk->splice_pipe = NULL; |
| 207 | return tsk; | 226 | return tsk; |
| 227 | |||
| 228 | out: | ||
| 229 | free_thread_info(ti); | ||
| 230 | free_task_struct(tsk); | ||
| 231 | return NULL; | ||
| 208 | } | 232 | } |
| 209 | 233 | ||
| 210 | #ifdef CONFIG_MMU | 234 | #ifdef CONFIG_MMU |
| @@ -256,7 +280,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 256 | if (!tmp) | 280 | if (!tmp) |
| 257 | goto fail_nomem; | 281 | goto fail_nomem; |
| 258 | *tmp = *mpnt; | 282 | *tmp = *mpnt; |
| 259 | pol = mpol_copy(vma_policy(mpnt)); | 283 | pol = mpol_dup(vma_policy(mpnt)); |
| 260 | retval = PTR_ERR(pol); | 284 | retval = PTR_ERR(pol); |
| 261 | if (IS_ERR(pol)) | 285 | if (IS_ERR(pol)) |
| 262 | goto fail_nomem_policy; | 286 | goto fail_nomem_policy; |
| @@ -358,14 +382,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
| 358 | mm->ioctx_list = NULL; | 382 | mm->ioctx_list = NULL; |
| 359 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 383 | mm->free_area_cache = TASK_UNMAPPED_BASE; |
| 360 | mm->cached_hole_size = ~0UL; | 384 | mm->cached_hole_size = ~0UL; |
| 361 | mm_init_cgroup(mm, p); | 385 | mm_init_owner(mm, p); |
| 362 | 386 | ||
| 363 | if (likely(!mm_alloc_pgd(mm))) { | 387 | if (likely(!mm_alloc_pgd(mm))) { |
| 364 | mm->def_flags = 0; | 388 | mm->def_flags = 0; |
| 365 | return mm; | 389 | return mm; |
| 366 | } | 390 | } |
| 367 | 391 | ||
| 368 | mm_free_cgroup(mm); | ||
| 369 | free_mm(mm); | 392 | free_mm(mm); |
| 370 | return NULL; | 393 | return NULL; |
| 371 | } | 394 | } |
| @@ -394,7 +417,6 @@ void __mmdrop(struct mm_struct *mm) | |||
| 394 | { | 417 | { |
| 395 | BUG_ON(mm == &init_mm); | 418 | BUG_ON(mm == &init_mm); |
| 396 | mm_free_pgd(mm); | 419 | mm_free_pgd(mm); |
| 397 | mm_free_cgroup(mm); | ||
| 398 | destroy_context(mm); | 420 | destroy_context(mm); |
| 399 | free_mm(mm); | 421 | free_mm(mm); |
| 400 | } | 422 | } |
| @@ -410,6 +432,7 @@ void mmput(struct mm_struct *mm) | |||
| 410 | if (atomic_dec_and_test(&mm->mm_users)) { | 432 | if (atomic_dec_and_test(&mm->mm_users)) { |
| 411 | exit_aio(mm); | 433 | exit_aio(mm); |
| 412 | exit_mmap(mm); | 434 | exit_mmap(mm); |
| 435 | set_mm_exe_file(mm, NULL); | ||
| 413 | if (!list_empty(&mm->mmlist)) { | 436 | if (!list_empty(&mm->mmlist)) { |
| 414 | spin_lock(&mmlist_lock); | 437 | spin_lock(&mmlist_lock); |
| 415 | list_del(&mm->mmlist); | 438 | list_del(&mm->mmlist); |
| @@ -498,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
| 498 | * Allocate a new mm structure and copy contents from the | 521 | * Allocate a new mm structure and copy contents from the |
| 499 | * mm structure of the passed in task structure. | 522 | * mm structure of the passed in task structure. |
| 500 | */ | 523 | */ |
| 501 | static struct mm_struct *dup_mm(struct task_struct *tsk) | 524 | struct mm_struct *dup_mm(struct task_struct *tsk) |
| 502 | { | 525 | { |
| 503 | struct mm_struct *mm, *oldmm = current->mm; | 526 | struct mm_struct *mm, *oldmm = current->mm; |
| 504 | int err; | 527 | int err; |
| @@ -522,6 +545,8 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 522 | if (init_new_context(tsk, mm)) | 545 | if (init_new_context(tsk, mm)) |
| 523 | goto fail_nocontext; | 546 | goto fail_nocontext; |
| 524 | 547 | ||
| 548 | dup_mm_exe_file(oldmm, mm); | ||
| 549 | |||
| 525 | err = dup_mmap(mm, oldmm); | 550 | err = dup_mmap(mm, oldmm); |
| 526 | if (err) | 551 | if (err) |
| 527 | goto free_pt; | 552 | goto free_pt; |
| @@ -782,12 +807,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
| 782 | goto out; | 807 | goto out; |
| 783 | } | 808 | } |
| 784 | 809 | ||
| 785 | /* | ||
| 786 | * Note: we may be using current for both targets (See exec.c) | ||
| 787 | * This works because we cache current->files (old) as oldf. Don't | ||
| 788 | * break this. | ||
| 789 | */ | ||
| 790 | tsk->files = NULL; | ||
| 791 | newf = dup_fd(oldf, &error); | 810 | newf = dup_fd(oldf, &error); |
| 792 | if (!newf) | 811 | if (!newf) |
| 793 | goto out; | 812 | goto out; |
| @@ -823,34 +842,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | |||
| 823 | return 0; | 842 | return 0; |
| 824 | } | 843 | } |
| 825 | 844 | ||
| 826 | /* | ||
| 827 | * Helper to unshare the files of the current task. | ||
| 828 | * We don't want to expose copy_files internals to | ||
| 829 | * the exec layer of the kernel. | ||
| 830 | */ | ||
| 831 | |||
| 832 | int unshare_files(void) | ||
| 833 | { | ||
| 834 | struct files_struct *files = current->files; | ||
| 835 | int rc; | ||
| 836 | |||
| 837 | BUG_ON(!files); | ||
| 838 | |||
| 839 | /* This can race but the race causes us to copy when we don't | ||
| 840 | need to and drop the copy */ | ||
| 841 | if(atomic_read(&files->count) == 1) | ||
| 842 | { | ||
| 843 | atomic_inc(&files->count); | ||
| 844 | return 0; | ||
| 845 | } | ||
| 846 | rc = copy_files(0, current); | ||
| 847 | if(rc) | ||
| 848 | current->files = files; | ||
| 849 | return rc; | ||
| 850 | } | ||
| 851 | |||
| 852 | EXPORT_SYMBOL(unshare_files); | ||
| 853 | |||
| 854 | static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | 845 | static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) |
| 855 | { | 846 | { |
| 856 | struct sighand_struct *sig; | 847 | struct sighand_struct *sig; |
| @@ -902,7 +893,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 902 | sig->group_exit_code = 0; | 893 | sig->group_exit_code = 0; |
| 903 | sig->group_exit_task = NULL; | 894 | sig->group_exit_task = NULL; |
| 904 | sig->group_stop_count = 0; | 895 | sig->group_stop_count = 0; |
| 905 | sig->curr_target = NULL; | 896 | sig->curr_target = tsk; |
| 906 | init_sigpending(&sig->shared_pending); | 897 | init_sigpending(&sig->shared_pending); |
| 907 | INIT_LIST_HEAD(&sig->posix_timers); | 898 | INIT_LIST_HEAD(&sig->posix_timers); |
| 908 | 899 | ||
| @@ -993,6 +984,13 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
| 993 | #endif | 984 | #endif |
| 994 | } | 985 | } |
| 995 | 986 | ||
| 987 | #ifdef CONFIG_MM_OWNER | ||
| 988 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | ||
| 989 | { | ||
| 990 | mm->owner = p; | ||
| 991 | } | ||
| 992 | #endif /* CONFIG_MM_OWNER */ | ||
| 993 | |||
| 996 | /* | 994 | /* |
| 997 | * This creates a new process as a copy of the old one, | 995 | * This creates a new process as a copy of the old one, |
| 998 | * but does not actually start it yet. | 996 | * but does not actually start it yet. |
| @@ -1127,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1127 | p->audit_context = NULL; | 1125 | p->audit_context = NULL; |
| 1128 | cgroup_fork(p); | 1126 | cgroup_fork(p); |
| 1129 | #ifdef CONFIG_NUMA | 1127 | #ifdef CONFIG_NUMA |
| 1130 | p->mempolicy = mpol_copy(p->mempolicy); | 1128 | p->mempolicy = mpol_dup(p->mempolicy); |
| 1131 | if (IS_ERR(p->mempolicy)) { | 1129 | if (IS_ERR(p->mempolicy)) { |
| 1132 | retval = PTR_ERR(p->mempolicy); | 1130 | retval = PTR_ERR(p->mempolicy); |
| 1133 | p->mempolicy = NULL; | 1131 | p->mempolicy = NULL; |
| @@ -1385,7 +1383,7 @@ bad_fork_cleanup_security: | |||
| 1385 | security_task_free(p); | 1383 | security_task_free(p); |
| 1386 | bad_fork_cleanup_policy: | 1384 | bad_fork_cleanup_policy: |
| 1387 | #ifdef CONFIG_NUMA | 1385 | #ifdef CONFIG_NUMA |
| 1388 | mpol_free(p->mempolicy); | 1386 | mpol_put(p->mempolicy); |
| 1389 | bad_fork_cleanup_cgroup: | 1387 | bad_fork_cleanup_cgroup: |
| 1390 | #endif | 1388 | #endif |
| 1391 | cgroup_exit(p, cgroup_callbacks_done); | 1389 | cgroup_exit(p, cgroup_callbacks_done); |
| @@ -1675,18 +1673,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp | |||
| 1675 | } | 1673 | } |
| 1676 | 1674 | ||
| 1677 | /* | 1675 | /* |
| 1678 | * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not | ||
| 1679 | * supported yet | ||
| 1680 | */ | ||
| 1681 | static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp) | ||
| 1682 | { | ||
| 1683 | if (unshare_flags & CLONE_SYSVSEM) | ||
| 1684 | return -EINVAL; | ||
| 1685 | |||
| 1686 | return 0; | ||
| 1687 | } | ||
| 1688 | |||
| 1689 | /* | ||
| 1690 | * unshare allows a process to 'unshare' part of the process | 1676 | * unshare allows a process to 'unshare' part of the process |
| 1691 | * context which was originally shared using clone. copy_* | 1677 | * context which was originally shared using clone. copy_* |
| 1692 | * functions used by do_fork() cannot be used here directly | 1678 | * functions used by do_fork() cannot be used here directly |
| @@ -1701,8 +1687,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1701 | struct sighand_struct *new_sigh = NULL; | 1687 | struct sighand_struct *new_sigh = NULL; |
| 1702 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | 1688 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; |
| 1703 | struct files_struct *fd, *new_fd = NULL; | 1689 | struct files_struct *fd, *new_fd = NULL; |
| 1704 | struct sem_undo_list *new_ulist = NULL; | ||
| 1705 | struct nsproxy *new_nsproxy = NULL; | 1690 | struct nsproxy *new_nsproxy = NULL; |
| 1691 | int do_sysvsem = 0; | ||
| 1706 | 1692 | ||
| 1707 | check_unshare_flags(&unshare_flags); | 1693 | check_unshare_flags(&unshare_flags); |
| 1708 | 1694 | ||
| @@ -1714,6 +1700,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1714 | CLONE_NEWNET)) | 1700 | CLONE_NEWNET)) |
| 1715 | goto bad_unshare_out; | 1701 | goto bad_unshare_out; |
| 1716 | 1702 | ||
| 1703 | /* | ||
| 1704 | * CLONE_NEWIPC must also detach from the undolist: after switching | ||
| 1705 | * to a new ipc namespace, the semaphore arrays from the old | ||
| 1706 | * namespace are unreachable. | ||
| 1707 | */ | ||
| 1708 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) | ||
| 1709 | do_sysvsem = 1; | ||
| 1717 | if ((err = unshare_thread(unshare_flags))) | 1710 | if ((err = unshare_thread(unshare_flags))) |
| 1718 | goto bad_unshare_out; | 1711 | goto bad_unshare_out; |
| 1719 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1712 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
| @@ -1724,13 +1717,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1724 | goto bad_unshare_cleanup_sigh; | 1717 | goto bad_unshare_cleanup_sigh; |
| 1725 | if ((err = unshare_fd(unshare_flags, &new_fd))) | 1718 | if ((err = unshare_fd(unshare_flags, &new_fd))) |
| 1726 | goto bad_unshare_cleanup_vm; | 1719 | goto bad_unshare_cleanup_vm; |
| 1727 | if ((err = unshare_semundo(unshare_flags, &new_ulist))) | ||
| 1728 | goto bad_unshare_cleanup_fd; | ||
| 1729 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | 1720 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, |
| 1730 | new_fs))) | 1721 | new_fs))) |
| 1731 | goto bad_unshare_cleanup_semundo; | 1722 | goto bad_unshare_cleanup_fd; |
| 1732 | 1723 | ||
| 1733 | if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { | 1724 | if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { |
| 1725 | if (do_sysvsem) { | ||
| 1726 | /* | ||
| 1727 | * CLONE_SYSVSEM is equivalent to sys_exit(). | ||
| 1728 | */ | ||
| 1729 | exit_sem(current); | ||
| 1730 | } | ||
| 1734 | 1731 | ||
| 1735 | if (new_nsproxy) { | 1732 | if (new_nsproxy) { |
| 1736 | switch_task_namespaces(current, new_nsproxy); | 1733 | switch_task_namespaces(current, new_nsproxy); |
| @@ -1766,7 +1763,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1766 | if (new_nsproxy) | 1763 | if (new_nsproxy) |
| 1767 | put_nsproxy(new_nsproxy); | 1764 | put_nsproxy(new_nsproxy); |
| 1768 | 1765 | ||
| 1769 | bad_unshare_cleanup_semundo: | ||
| 1770 | bad_unshare_cleanup_fd: | 1766 | bad_unshare_cleanup_fd: |
| 1771 | if (new_fd) | 1767 | if (new_fd) |
| 1772 | put_files_struct(new_fd); | 1768 | put_files_struct(new_fd); |
| @@ -1788,3 +1784,27 @@ bad_unshare_cleanup_thread: | |||
| 1788 | bad_unshare_out: | 1784 | bad_unshare_out: |
| 1789 | return err; | 1785 | return err; |
| 1790 | } | 1786 | } |
| 1787 | |||
| 1788 | /* | ||
| 1789 | * Helper to unshare the files of the current task. | ||
| 1790 | * We don't want to expose copy_files internals to | ||
| 1791 | * the exec layer of the kernel. | ||
| 1792 | */ | ||
| 1793 | |||
| 1794 | int unshare_files(struct files_struct **displaced) | ||
| 1795 | { | ||
| 1796 | struct task_struct *task = current; | ||
| 1797 | struct files_struct *copy = NULL; | ||
| 1798 | int error; | ||
| 1799 | |||
| 1800 | error = unshare_fd(CLONE_FILES, ©); | ||
| 1801 | if (error || !copy) { | ||
| 1802 | *displaced = NULL; | ||
| 1803 | return error; | ||
| 1804 | } | ||
| 1805 | *displaced = task->files; | ||
| 1806 | task_lock(task); | ||
| 1807 | task->files = copy; | ||
| 1808 | task_unlock(task); | ||
| 1809 | return 0; | ||
| 1810 | } | ||
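unshare_files() now hands the displaced files_struct back to the caller instead of relying on reset_files_struct(); the caller (the exec path) is expected to drop the old reference itself when it is done. A rough caller-side sketch, with error handling trimmed:

        struct files_struct *displaced;
        int err;

        err = unshare_files(&displaced);
        if (err)
                return err;

        /* current->files is now private to this task ... */

        if (displaced)
                put_files_struct(displaced);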
diff --git a/kernel/futex.c b/kernel/futex.c index 06968cd79200..98092c9817f4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -281,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
| 281 | */ | 281 | */ |
| 282 | static void get_futex_key_refs(union futex_key *key) | 282 | static void get_futex_key_refs(union futex_key *key) |
| 283 | { | 283 | { |
| 284 | if (key->both.ptr == 0) | 284 | if (key->both.ptr == NULL) |
| 285 | return; | 285 | return; |
| 286 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 286 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
| 287 | case FUT_OFF_INODE: | 287 | case FUT_OFF_INODE: |
| @@ -1266,11 +1266,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
| 1266 | if (!abs_time) | 1266 | if (!abs_time) |
| 1267 | schedule(); | 1267 | schedule(); |
| 1268 | else { | 1268 | else { |
| 1269 | hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 1269 | hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, |
| 1270 | HRTIMER_MODE_ABS); | ||
| 1270 | hrtimer_init_sleeper(&t, current); | 1271 | hrtimer_init_sleeper(&t, current); |
| 1271 | t.timer.expires = *abs_time; | 1272 | t.timer.expires = *abs_time; |
| 1272 | 1273 | ||
| 1273 | hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); | 1274 | hrtimer_start(&t.timer, t.timer.expires, |
| 1275 | HRTIMER_MODE_ABS); | ||
| 1274 | if (!hrtimer_active(&t.timer)) | 1276 | if (!hrtimer_active(&t.timer)) |
| 1275 | t.task = NULL; | 1277 | t.task = NULL; |
| 1276 | 1278 | ||
| @@ -1286,6 +1288,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
| 1286 | 1288 | ||
| 1287 | /* Flag if a timeout occurred */ | 1289 | /* Flag if a timeout occurred */ |
| 1288 | rem = (t.task == NULL); | 1290 | rem = (t.task == NULL); |
| 1291 | |||
| 1292 | destroy_hrtimer_on_stack(&t.timer); | ||
| 1289 | } | 1293 | } |
| 1290 | } | 1294 | } |
| 1291 | __set_current_state(TASK_RUNNING); | 1295 | __set_current_state(TASK_RUNNING); |
| @@ -1367,7 +1371,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
| 1367 | 1371 | ||
| 1368 | if (time) { | 1372 | if (time) { |
| 1369 | to = &timeout; | 1373 | to = &timeout; |
| 1370 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); | 1374 | hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, |
| 1375 | HRTIMER_MODE_ABS); | ||
| 1371 | hrtimer_init_sleeper(to, current); | 1376 | hrtimer_init_sleeper(to, current); |
| 1372 | to->timer.expires = *time; | 1377 | to->timer.expires = *time; |
| 1373 | } | 1378 | } |
| @@ -1581,6 +1586,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
| 1581 | unqueue_me_pi(&q); | 1586 | unqueue_me_pi(&q); |
| 1582 | futex_unlock_mm(fshared); | 1587 | futex_unlock_mm(fshared); |
| 1583 | 1588 | ||
| 1589 | if (to) | ||
| 1590 | destroy_hrtimer_on_stack(&to->timer); | ||
| 1584 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | 1591 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
| 1585 | 1592 | ||
| 1586 | out_unlock_release_sem: | 1593 | out_unlock_release_sem: |
| @@ -1588,6 +1595,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
| 1588 | 1595 | ||
| 1589 | out_release_sem: | 1596 | out_release_sem: |
| 1590 | futex_unlock_mm(fshared); | 1597 | futex_unlock_mm(fshared); |
| 1598 | if (to) | ||
| 1599 | destroy_hrtimer_on_stack(&to->timer); | ||
| 1591 | return ret; | 1600 | return ret; |
| 1592 | 1601 | ||
| 1593 | uaddr_faulted: | 1602 | uaddr_faulted: |
| @@ -1615,6 +1624,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
| 1615 | if (!ret && (uval != -EFAULT)) | 1624 | if (!ret && (uval != -EFAULT)) |
| 1616 | goto retry; | 1625 | goto retry; |
| 1617 | 1626 | ||
| 1627 | if (to) | ||
| 1628 | destroy_hrtimer_on_stack(&to->timer); | ||
| 1618 | return ret; | 1629 | return ret; |
| 1619 | } | 1630 | } |
| 1620 | 1631 | ||
| @@ -2158,7 +2169,7 @@ static struct file_system_type futex_fs_type = { | |||
| 2158 | .kill_sb = kill_anon_super, | 2169 | .kill_sb = kill_anon_super, |
| 2159 | }; | 2170 | }; |
| 2160 | 2171 | ||
| 2161 | static int __init init(void) | 2172 | static int __init futex_init(void) |
| 2162 | { | 2173 | { |
| 2163 | u32 curval; | 2174 | u32 curval; |
| 2164 | int i; | 2175 | int i; |
| @@ -2194,4 +2205,4 @@ static int __init init(void) | |||
| 2194 | 2205 | ||
| 2195 | return 0; | 2206 | return 0; |
| 2196 | } | 2207 | } |
| 2197 | __initcall(init); | 2208 | __initcall(futex_init); |
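The futex timeout paths now follow the on-stack hrtimer rule introduced by the debugobjects work in hrtimer.c below: a timer set up with hrtimer_init_on_stack() must be torn down with destroy_hrtimer_on_stack() on every exit path before the stack frame disappears, which is why each error label gains a destroy call. Condensed from the hunks above:

        struct hrtimer_sleeper t;

        hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        hrtimer_init_sleeper(&t, current);
        t.timer.expires = *abs_time;
        hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);

        /* ... sleep, handle wakeup or timeout ... */

        destroy_hrtimer_on_stack(&t.timer);     /* required pair for the on-stack init */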
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index ff90f049f8f6..04ac3a9e42cf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
| @@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | |||
| 30 | return 0; | 30 | return 0; |
| 31 | } | 31 | } |
| 32 | 32 | ||
| 33 | static void __user *futex_uaddr(struct robust_list *entry, | 33 | static void __user *futex_uaddr(struct robust_list __user *entry, |
| 34 | compat_long_t futex_offset) | 34 | compat_long_t futex_offset) |
| 35 | { | 35 | { |
| 36 | compat_uptr_t base = ptr_to_compat(entry); | 36 | compat_uptr_t base = ptr_to_compat(entry); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 98bee013f71f..9af1d6a8095e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -43,6 +43,7 @@ | |||
| 43 | #include <linux/tick.h> | 43 | #include <linux/tick.h> |
| 44 | #include <linux/seq_file.h> | 44 | #include <linux/seq_file.h> |
| 45 | #include <linux/err.h> | 45 | #include <linux/err.h> |
| 46 | #include <linux/debugobjects.h> | ||
| 46 | 47 | ||
| 47 | #include <asm/uaccess.h> | 48 | #include <asm/uaccess.h> |
| 48 | 49 | ||
| @@ -342,6 +343,115 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) | |||
| 342 | return res; | 343 | return res; |
| 343 | } | 344 | } |
| 344 | 345 | ||
| 346 | #ifdef CONFIG_DEBUG_OBJECTS_TIMERS | ||
| 347 | |||
| 348 | static struct debug_obj_descr hrtimer_debug_descr; | ||
| 349 | |||
| 350 | /* | ||
| 351 | * fixup_init is called when: | ||
| 352 | * - an active object is initialized | ||
| 353 | */ | ||
| 354 | static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) | ||
| 355 | { | ||
| 356 | struct hrtimer *timer = addr; | ||
| 357 | |||
| 358 | switch (state) { | ||
| 359 | case ODEBUG_STATE_ACTIVE: | ||
| 360 | hrtimer_cancel(timer); | ||
| 361 | debug_object_init(timer, &hrtimer_debug_descr); | ||
| 362 | return 1; | ||
| 363 | default: | ||
| 364 | return 0; | ||
| 365 | } | ||
| 366 | } | ||
| 367 | |||
| 368 | /* | ||
| 369 | * fixup_activate is called when: | ||
| 370 | * - an active object is activated | ||
| 371 | * - an unknown object is activated (might be a statically initialized object) | ||
| 372 | */ | ||
| 373 | static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) | ||
| 374 | { | ||
| 375 | switch (state) { | ||
| 376 | |||
| 377 | case ODEBUG_STATE_NOTAVAILABLE: | ||
| 378 | WARN_ON_ONCE(1); | ||
| 379 | return 0; | ||
| 380 | |||
| 381 | case ODEBUG_STATE_ACTIVE: | ||
| 382 | WARN_ON(1); | ||
| 383 | |||
| 384 | default: | ||
| 385 | return 0; | ||
| 386 | } | ||
| 387 | } | ||
| 388 | |||
| 389 | /* | ||
| 390 | * fixup_free is called when: | ||
| 391 | * - an active object is freed | ||
| 392 | */ | ||
| 393 | static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) | ||
| 394 | { | ||
| 395 | struct hrtimer *timer = addr; | ||
| 396 | |||
| 397 | switch (state) { | ||
| 398 | case ODEBUG_STATE_ACTIVE: | ||
| 399 | hrtimer_cancel(timer); | ||
| 400 | debug_object_free(timer, &hrtimer_debug_descr); | ||
| 401 | return 1; | ||
| 402 | default: | ||
| 403 | return 0; | ||
| 404 | } | ||
| 405 | } | ||
| 406 | |||
| 407 | static struct debug_obj_descr hrtimer_debug_descr = { | ||
| 408 | .name = "hrtimer", | ||
| 409 | .fixup_init = hrtimer_fixup_init, | ||
| 410 | .fixup_activate = hrtimer_fixup_activate, | ||
| 411 | .fixup_free = hrtimer_fixup_free, | ||
| 412 | }; | ||
| 413 | |||
| 414 | static inline void debug_hrtimer_init(struct hrtimer *timer) | ||
| 415 | { | ||
| 416 | debug_object_init(timer, &hrtimer_debug_descr); | ||
| 417 | } | ||
| 418 | |||
| 419 | static inline void debug_hrtimer_activate(struct hrtimer *timer) | ||
| 420 | { | ||
| 421 | debug_object_activate(timer, &hrtimer_debug_descr); | ||
| 422 | } | ||
| 423 | |||
| 424 | static inline void debug_hrtimer_deactivate(struct hrtimer *timer) | ||
| 425 | { | ||
| 426 | debug_object_deactivate(timer, &hrtimer_debug_descr); | ||
| 427 | } | ||
| 428 | |||
| 429 | static inline void debug_hrtimer_free(struct hrtimer *timer) | ||
| 430 | { | ||
| 431 | debug_object_free(timer, &hrtimer_debug_descr); | ||
| 432 | } | ||
| 433 | |||
| 434 | static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | ||
| 435 | enum hrtimer_mode mode); | ||
| 436 | |||
| 437 | void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, | ||
| 438 | enum hrtimer_mode mode) | ||
| 439 | { | ||
| 440 | debug_object_init_on_stack(timer, &hrtimer_debug_descr); | ||
| 441 | __hrtimer_init(timer, clock_id, mode); | ||
| 442 | } | ||
| 443 | |||
| 444 | void destroy_hrtimer_on_stack(struct hrtimer *timer) | ||
| 445 | { | ||
| 446 | debug_object_free(timer, &hrtimer_debug_descr); | ||
| 447 | } | ||
| 448 | |||
| 449 | #else | ||
| 450 | static inline void debug_hrtimer_init(struct hrtimer *timer) { } | ||
| 451 | static inline void debug_hrtimer_activate(struct hrtimer *timer) { } | ||
| 452 | static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } | ||
| 453 | #endif | ||
| 454 | |||
| 345 | /* | 455 | /* |
| 346 | * Check, whether the timer is on the callback pending list | 456 | * Check, whether the timer is on the callback pending list |
| 347 | */ | 457 | */ |
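The fixup handlers above let CONFIG_DEBUG_OBJECTS_TIMERS recover from the classic misuse patterns instead of corrupting the timer rbtree. A minimal sketch of the bug class hrtimer_fixup_init() defends against (the callback and period are purely illustrative):

        #include <linux/hrtimer.h>
        #include <linux/ktime.h>

        static struct hrtimer demo_timer;

        static enum hrtimer_restart demo_fn(struct hrtimer *t)
        {
                return HRTIMER_NORESTART;
        }

        static void demo(void)
        {
                hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                demo_timer.function = demo_fn;
                hrtimer_start(&demo_timer, ktime_set(1, 0), HRTIMER_MODE_REL);

                /* bug: re-initializing a timer that may still be queued; with
                 * debugobjects enabled, hrtimer_fixup_init() cancels it and
                 * re-initializes the tracking state instead of letting the
                 * rbtree be corrupted */
                hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        }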
| @@ -567,6 +677,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 567 | /* Timer is expired, act upon the callback mode */ | 677 | /* Timer is expired, act upon the callback mode */ |
| 568 | switch(timer->cb_mode) { | 678 | switch(timer->cb_mode) { |
| 569 | case HRTIMER_CB_IRQSAFE_NO_RESTART: | 679 | case HRTIMER_CB_IRQSAFE_NO_RESTART: |
| 680 | debug_hrtimer_deactivate(timer); | ||
| 570 | /* | 681 | /* |
| 571 | * We can call the callback from here. No restart | 682 | * We can call the callback from here. No restart |
| 572 | * happens, so no danger of recursion | 683 | * happens, so no danger of recursion |
| @@ -581,6 +692,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 581 | * the tick timer in the softirq ! The calling site | 692 | * the tick timer in the softirq ! The calling site |
| 582 | * takes care of this. | 693 | * takes care of this. |
| 583 | */ | 694 | */ |
| 695 | debug_hrtimer_deactivate(timer); | ||
| 584 | return 1; | 696 | return 1; |
| 585 | case HRTIMER_CB_IRQSAFE: | 697 | case HRTIMER_CB_IRQSAFE: |
| 586 | case HRTIMER_CB_SOFTIRQ: | 698 | case HRTIMER_CB_SOFTIRQ: |
| @@ -590,7 +702,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 590 | list_add_tail(&timer->cb_entry, | 702 | list_add_tail(&timer->cb_entry, |
| 591 | &base->cpu_base->cb_pending); | 703 | &base->cpu_base->cb_pending); |
| 592 | timer->state = HRTIMER_STATE_PENDING; | 704 | timer->state = HRTIMER_STATE_PENDING; |
| 593 | raise_softirq(HRTIMER_SOFTIRQ); | ||
| 594 | return 1; | 705 | return 1; |
| 595 | default: | 706 | default: |
| 596 | BUG(); | 707 | BUG(); |
| @@ -633,6 +744,11 @@ static int hrtimer_switch_to_hres(void) | |||
| 633 | return 1; | 744 | return 1; |
| 634 | } | 745 | } |
| 635 | 746 | ||
| 747 | static inline void hrtimer_raise_softirq(void) | ||
| 748 | { | ||
| 749 | raise_softirq(HRTIMER_SOFTIRQ); | ||
| 750 | } | ||
| 751 | |||
| 636 | #else | 752 | #else |
| 637 | 753 | ||
| 638 | static inline int hrtimer_hres_active(void) { return 0; } | 754 | static inline int hrtimer_hres_active(void) { return 0; } |
| @@ -651,6 +767,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, | |||
| 651 | { | 767 | { |
| 652 | return 0; | 768 | return 0; |
| 653 | } | 769 | } |
| 770 | static inline void hrtimer_raise_softirq(void) { } | ||
| 654 | 771 | ||
| 655 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 772 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
| 656 | 773 | ||
| @@ -730,6 +847,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, | |||
| 730 | struct hrtimer *entry; | 847 | struct hrtimer *entry; |
| 731 | int leftmost = 1; | 848 | int leftmost = 1; |
| 732 | 849 | ||
| 850 | debug_hrtimer_activate(timer); | ||
| 851 | |||
| 733 | /* | 852 | /* |
| 734 | * Find the right place in the rbtree: | 853 | * Find the right place in the rbtree: |
| 735 | */ | 854 | */ |
| @@ -826,6 +945,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | |||
| 826 | * reprogramming happens in the interrupt handler. This is a | 945 | * reprogramming happens in the interrupt handler. This is a |
| 827 | * rare case and less expensive than a smp call. | 946 | * rare case and less expensive than a smp call. |
| 828 | */ | 947 | */ |
| 948 | debug_hrtimer_deactivate(timer); | ||
| 829 | timer_stats_hrtimer_clear_start_info(timer); | 949 | timer_stats_hrtimer_clear_start_info(timer); |
| 830 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); | 950 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); |
| 831 | __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, | 951 | __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, |
| @@ -850,7 +970,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
| 850 | { | 970 | { |
| 851 | struct hrtimer_clock_base *base, *new_base; | 971 | struct hrtimer_clock_base *base, *new_base; |
| 852 | unsigned long flags; | 972 | unsigned long flags; |
| 853 | int ret; | 973 | int ret, raise; |
| 854 | 974 | ||
| 855 | base = lock_hrtimer_base(timer, &flags); | 975 | base = lock_hrtimer_base(timer, &flags); |
| 856 | 976 | ||
| @@ -873,6 +993,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
| 873 | tim = ktime_add_safe(tim, base->resolution); | 993 | tim = ktime_add_safe(tim, base->resolution); |
| 874 | #endif | 994 | #endif |
| 875 | } | 995 | } |
| 996 | |||
| 876 | timer->expires = tim; | 997 | timer->expires = tim; |
| 877 | 998 | ||
| 878 | timer_stats_hrtimer_set_start_info(timer); | 999 | timer_stats_hrtimer_set_start_info(timer); |
| @@ -884,8 +1005,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
| 884 | enqueue_hrtimer(timer, new_base, | 1005 | enqueue_hrtimer(timer, new_base, |
| 885 | new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); | 1006 | new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); |
| 886 | 1007 | ||
| 1008 | /* | ||
| 1009 | * The timer may be expired and moved to the cb_pending | ||
| 1010 | * list. We can not raise the softirq with base lock held due | ||
| 1011 | * to a possible deadlock with runqueue lock. | ||
| 1012 | */ | ||
| 1013 | raise = timer->state == HRTIMER_STATE_PENDING; | ||
| 1014 | |||
| 887 | unlock_hrtimer_base(timer, &flags); | 1015 | unlock_hrtimer_base(timer, &flags); |
| 888 | 1016 | ||
| 1017 | if (raise) | ||
| 1018 | hrtimer_raise_softirq(); | ||
| 1019 | |||
| 889 | return ret; | 1020 | return ret; |
| 890 | } | 1021 | } |
| 891 | EXPORT_SYMBOL_GPL(hrtimer_start); | 1022 | EXPORT_SYMBOL_GPL(hrtimer_start); |
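Editor's note: the hunk above is why hrtimer_start() grew the new `raise` flag. With the base lock held, raise_softirq() could deadlock against the runqueue lock, so the decision is recorded under the lock and acted on only after unlock_hrtimer_base(). Below is a minimal userspace sketch of that "decide under the lock, act after dropping it" pattern; all names are hypothetical (a pthread mutex stands in for the base lock, wake_softirq_thread() for raise_softirq()).

#include <pthread.h>
#include <stdbool.h>

#define STATE_PENDING 2

static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
static int timer_state;                    /* stands in for timer->state */

static void wake_softirq_thread(void)      /* stands in for raise_softirq() */
{
        /* may itself take other locks, so it must run without base_lock */
}

static void start_timer(void)
{
        bool raise;

        pthread_mutex_lock(&base_lock);
        /* ... enqueue the timer; it may end up on the pending list ... */
        raise = (timer_state == STATE_PENDING);   /* decide under the lock */
        pthread_mutex_unlock(&base_lock);

        if (raise)                                /* act after dropping it */
                wake_softirq_thread();
}

int main(void)
{
        timer_state = STATE_PENDING;
        start_timer();
        return 0;
}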
| @@ -996,14 +1127,8 @@ ktime_t hrtimer_get_next_event(void) | |||
| 996 | } | 1127 | } |
| 997 | #endif | 1128 | #endif |
| 998 | 1129 | ||
| 999 | /** | 1130 | static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, |
| 1000 | * hrtimer_init - initialize a timer to the given clock | 1131 | enum hrtimer_mode mode) |
| 1001 | * @timer: the timer to be initialized | ||
| 1002 | * @clock_id: the clock to be used | ||
| 1003 | * @mode: timer mode abs/rel | ||
| 1004 | */ | ||
| 1005 | void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | ||
| 1006 | enum hrtimer_mode mode) | ||
| 1007 | { | 1132 | { |
| 1008 | struct hrtimer_cpu_base *cpu_base; | 1133 | struct hrtimer_cpu_base *cpu_base; |
| 1009 | 1134 | ||
| @@ -1024,6 +1149,19 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
| 1024 | memset(timer->start_comm, 0, TASK_COMM_LEN); | 1149 | memset(timer->start_comm, 0, TASK_COMM_LEN); |
| 1025 | #endif | 1150 | #endif |
| 1026 | } | 1151 | } |
| 1152 | |||
| 1153 | /** | ||
| 1154 | * hrtimer_init - initialize a timer to the given clock | ||
| 1155 | * @timer: the timer to be initialized | ||
| 1156 | * @clock_id: the clock to be used | ||
| 1157 | * @mode: timer mode abs/rel | ||
| 1158 | */ | ||
| 1159 | void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | ||
| 1160 | enum hrtimer_mode mode) | ||
| 1161 | { | ||
| 1162 | debug_hrtimer_init(timer); | ||
| 1163 | __hrtimer_init(timer, clock_id, mode); | ||
| 1164 | } | ||
| 1027 | EXPORT_SYMBOL_GPL(hrtimer_init); | 1165 | EXPORT_SYMBOL_GPL(hrtimer_init); |
| 1028 | 1166 | ||
| 1029 | /** | 1167 | /** |
| @@ -1057,6 +1195,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) | |||
| 1057 | timer = list_entry(cpu_base->cb_pending.next, | 1195 | timer = list_entry(cpu_base->cb_pending.next, |
| 1058 | struct hrtimer, cb_entry); | 1196 | struct hrtimer, cb_entry); |
| 1059 | 1197 | ||
| 1198 | debug_hrtimer_deactivate(timer); | ||
| 1060 | timer_stats_account_hrtimer(timer); | 1199 | timer_stats_account_hrtimer(timer); |
| 1061 | 1200 | ||
| 1062 | fn = timer->function; | 1201 | fn = timer->function; |
| @@ -1080,8 +1219,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) | |||
| 1080 | * If the timer was rearmed on another CPU, reprogram | 1219 | * If the timer was rearmed on another CPU, reprogram |
| 1081 | * the event device. | 1220 | * the event device. |
| 1082 | */ | 1221 | */ |
| 1083 | if (timer->base->first == &timer->node) | 1222 | struct hrtimer_clock_base *base = timer->base; |
| 1084 | hrtimer_reprogram(timer, timer->base); | 1223 | |
| 1224 | if (base->first == &timer->node && | ||
| 1225 | hrtimer_reprogram(timer, base)) { | ||
| 1226 | /* | ||
| 1227 | * Timer is expired. Thus move it from tree to | ||
| 1228 | * pending list again. | ||
| 1229 | */ | ||
| 1230 | __remove_hrtimer(timer, base, | ||
| 1231 | HRTIMER_STATE_PENDING, 0); | ||
| 1232 | list_add_tail(&timer->cb_entry, | ||
| 1233 | &base->cpu_base->cb_pending); | ||
| 1234 | } | ||
| 1085 | } | 1235 | } |
| 1086 | } | 1236 | } |
| 1087 | spin_unlock_irq(&cpu_base->lock); | 1237 | spin_unlock_irq(&cpu_base->lock); |
| @@ -1094,6 +1244,7 @@ static void __run_hrtimer(struct hrtimer *timer) | |||
| 1094 | enum hrtimer_restart (*fn)(struct hrtimer *); | 1244 | enum hrtimer_restart (*fn)(struct hrtimer *); |
| 1095 | int restart; | 1245 | int restart; |
| 1096 | 1246 | ||
| 1247 | debug_hrtimer_deactivate(timer); | ||
| 1097 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | 1248 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); |
| 1098 | timer_stats_account_hrtimer(timer); | 1249 | timer_stats_account_hrtimer(timer); |
| 1099 | 1250 | ||
| @@ -1238,51 +1389,50 @@ void hrtimer_run_pending(void) | |||
| 1238 | /* | 1389 | /* |
| 1239 | * Called from hardirq context every jiffy | 1390 | * Called from hardirq context every jiffy |
| 1240 | */ | 1391 | */ |
| 1241 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | 1392 | void hrtimer_run_queues(void) |
| 1242 | int index) | ||
| 1243 | { | 1393 | { |
| 1244 | struct rb_node *node; | 1394 | struct rb_node *node; |
| 1245 | struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; | 1395 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
| 1396 | struct hrtimer_clock_base *base; | ||
| 1397 | int index, gettime = 1; | ||
| 1246 | 1398 | ||
| 1247 | if (!base->first) | 1399 | if (hrtimer_hres_active()) |
| 1248 | return; | 1400 | return; |
| 1249 | 1401 | ||
| 1250 | if (base->get_softirq_time) | 1402 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { |
| 1251 | base->softirq_time = base->get_softirq_time(); | 1403 | base = &cpu_base->clock_base[index]; |
| 1252 | |||
| 1253 | spin_lock(&cpu_base->lock); | ||
| 1254 | |||
| 1255 | while ((node = base->first)) { | ||
| 1256 | struct hrtimer *timer; | ||
| 1257 | |||
| 1258 | timer = rb_entry(node, struct hrtimer, node); | ||
| 1259 | if (base->softirq_time.tv64 <= timer->expires.tv64) | ||
| 1260 | break; | ||
| 1261 | 1404 | ||
| 1262 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { | 1405 | if (!base->first) |
| 1263 | __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); | ||
| 1264 | list_add_tail(&timer->cb_entry, | ||
| 1265 | &base->cpu_base->cb_pending); | ||
| 1266 | continue; | 1406 | continue; |
| 1407 | |||
| 1408 | if (base->get_softirq_time) | ||
| 1409 | base->softirq_time = base->get_softirq_time(); | ||
| 1410 | else if (gettime) { | ||
| 1411 | hrtimer_get_softirq_time(cpu_base); | ||
| 1412 | gettime = 0; | ||
| 1267 | } | 1413 | } |
| 1268 | 1414 | ||
| 1269 | __run_hrtimer(timer); | 1415 | spin_lock(&cpu_base->lock); |
| 1270 | } | ||
| 1271 | spin_unlock(&cpu_base->lock); | ||
| 1272 | } | ||
| 1273 | 1416 | ||
| 1274 | void hrtimer_run_queues(void) | 1417 | while ((node = base->first)) { |
| 1275 | { | 1418 | struct hrtimer *timer; |
| 1276 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
| 1277 | int i; | ||
| 1278 | 1419 | ||
| 1279 | if (hrtimer_hres_active()) | 1420 | timer = rb_entry(node, struct hrtimer, node); |
| 1280 | return; | 1421 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
| 1422 | break; | ||
| 1281 | 1423 | ||
| 1282 | hrtimer_get_softirq_time(cpu_base); | 1424 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { |
| 1425 | __remove_hrtimer(timer, base, | ||
| 1426 | HRTIMER_STATE_PENDING, 0); | ||
| 1427 | list_add_tail(&timer->cb_entry, | ||
| 1428 | &base->cpu_base->cb_pending); | ||
| 1429 | continue; | ||
| 1430 | } | ||
| 1283 | 1431 | ||
| 1284 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1432 | __run_hrtimer(timer); |
| 1285 | run_hrtimer_queue(cpu_base, i); | 1433 | } |
| 1434 | spin_unlock(&cpu_base->lock); | ||
| 1435 | } | ||
| 1286 | } | 1436 | } |
| 1287 | 1437 | ||
| 1288 | /* | 1438 | /* |
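Editor's note: the rewritten hrtimer_run_queues() above folds the old per-base helper into a single loop and adds a `gettime` latch, so hrtimer_get_softirq_time() runs at most once per invocation and only when some base actually has queued timers and no private time source. The latch is just a fetch-once flag; a small userspace sketch with clock_gettime() standing in for the softirq time read (names hypothetical):

#include <stdio.h>
#include <time.h>

#define NUM_BASES 3

int main(void)
{
        struct timespec now = { 0, 0 };
        int gettime = 1;
        int i;

        for (i = 0; i < NUM_BASES; i++) {
                /* pretend this base has queued timers but no private clock */
                if (gettime) {
                        clock_gettime(CLOCK_MONOTONIC, &now);  /* done once */
                        gettime = 0;
                }
                printf("base %d uses %lld.%09ld\n",
                       i, (long long)now.tv_sec, now.tv_nsec);
        }
        return 0;
}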
| @@ -1353,22 +1503,27 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) | |||
| 1353 | { | 1503 | { |
| 1354 | struct hrtimer_sleeper t; | 1504 | struct hrtimer_sleeper t; |
| 1355 | struct timespec __user *rmtp; | 1505 | struct timespec __user *rmtp; |
| 1506 | int ret = 0; | ||
| 1356 | 1507 | ||
| 1357 | hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); | 1508 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, |
| 1358 | t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; | 1509 | HRTIMER_MODE_ABS); |
| 1510 | t.timer.expires.tv64 = restart->nanosleep.expires; | ||
| 1359 | 1511 | ||
| 1360 | if (do_nanosleep(&t, HRTIMER_MODE_ABS)) | 1512 | if (do_nanosleep(&t, HRTIMER_MODE_ABS)) |
| 1361 | return 0; | 1513 | goto out; |
| 1362 | 1514 | ||
| 1363 | rmtp = (struct timespec __user *)restart->arg1; | 1515 | rmtp = restart->nanosleep.rmtp; |
| 1364 | if (rmtp) { | 1516 | if (rmtp) { |
| 1365 | int ret = update_rmtp(&t.timer, rmtp); | 1517 | ret = update_rmtp(&t.timer, rmtp); |
| 1366 | if (ret <= 0) | 1518 | if (ret <= 0) |
| 1367 | return ret; | 1519 | goto out; |
| 1368 | } | 1520 | } |
| 1369 | 1521 | ||
| 1370 | /* The other values in restart are already filled in */ | 1522 | /* The other values in restart are already filled in */ |
| 1371 | return -ERESTART_RESTARTBLOCK; | 1523 | ret = -ERESTART_RESTARTBLOCK; |
| 1524 | out: | ||
| 1525 | destroy_hrtimer_on_stack(&t.timer); | ||
| 1526 | return ret; | ||
| 1372 | } | 1527 | } |
| 1373 | 1528 | ||
| 1374 | long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | 1529 | long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, |
| @@ -1376,30 +1531,35 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
| 1376 | { | 1531 | { |
| 1377 | struct restart_block *restart; | 1532 | struct restart_block *restart; |
| 1378 | struct hrtimer_sleeper t; | 1533 | struct hrtimer_sleeper t; |
| 1534 | int ret = 0; | ||
| 1379 | 1535 | ||
| 1380 | hrtimer_init(&t.timer, clockid, mode); | 1536 | hrtimer_init_on_stack(&t.timer, clockid, mode); |
| 1381 | t.timer.expires = timespec_to_ktime(*rqtp); | 1537 | t.timer.expires = timespec_to_ktime(*rqtp); |
| 1382 | if (do_nanosleep(&t, mode)) | 1538 | if (do_nanosleep(&t, mode)) |
| 1383 | return 0; | 1539 | goto out; |
| 1384 | 1540 | ||
| 1385 | /* Absolute timers do not update the rmtp value and restart: */ | 1541 | /* Absolute timers do not update the rmtp value and restart: */ |
| 1386 | if (mode == HRTIMER_MODE_ABS) | 1542 | if (mode == HRTIMER_MODE_ABS) { |
| 1387 | return -ERESTARTNOHAND; | 1543 | ret = -ERESTARTNOHAND; |
| 1544 | goto out; | ||
| 1545 | } | ||
| 1388 | 1546 | ||
| 1389 | if (rmtp) { | 1547 | if (rmtp) { |
| 1390 | int ret = update_rmtp(&t.timer, rmtp); | 1548 | ret = update_rmtp(&t.timer, rmtp); |
| 1391 | if (ret <= 0) | 1549 | if (ret <= 0) |
| 1392 | return ret; | 1550 | goto out; |
| 1393 | } | 1551 | } |
| 1394 | 1552 | ||
| 1395 | restart = ¤t_thread_info()->restart_block; | 1553 | restart = ¤t_thread_info()->restart_block; |
| 1396 | restart->fn = hrtimer_nanosleep_restart; | 1554 | restart->fn = hrtimer_nanosleep_restart; |
| 1397 | restart->arg0 = (unsigned long) t.timer.base->index; | 1555 | restart->nanosleep.index = t.timer.base->index; |
| 1398 | restart->arg1 = (unsigned long) rmtp; | 1556 | restart->nanosleep.rmtp = rmtp; |
| 1399 | restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; | 1557 | restart->nanosleep.expires = t.timer.expires.tv64; |
| 1400 | restart->arg3 = t.timer.expires.tv64 >> 32; | ||
| 1401 | 1558 | ||
| 1402 | return -ERESTART_RESTARTBLOCK; | 1559 | ret = -ERESTART_RESTARTBLOCK; |
| 1560 | out: | ||
| 1561 | destroy_hrtimer_on_stack(&t.timer); | ||
| 1562 | return ret; | ||
| 1403 | } | 1563 | } |
| 1404 | 1564 | ||
| 1405 | asmlinkage long | 1565 | asmlinkage long |
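Editor's note: both nanosleep hunks above also switch the restart block from the generic arg0..arg3 slots to dedicated nanosleep fields, so the 64-bit expiry no longer has to be split across two unsigned longs. The standalone snippet below simply re-does the old split-and-recombine arithmetic to show what the removed arg2/arg3 lines were encoding (the constant is arbitrary test data):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t expires = 0x123456789abcdef0ULL;    /* arbitrary tv64 value */

        /* old hrtimer_nanosleep(): pack into two 32-bit restart args */
        uint32_t arg2 = expires & 0xFFFFFFFF;        /* low half  */
        uint32_t arg3 = expires >> 32;               /* high half */

        /* old hrtimer_nanosleep_restart(): recombine on the way back in */
        uint64_t restored = ((uint64_t)arg3 << 32) | arg2;

        assert(restored == expires);
        return 0;
}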
| @@ -1425,7 +1585,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
| 1425 | int i; | 1585 | int i; |
| 1426 | 1586 | ||
| 1427 | spin_lock_init(&cpu_base->lock); | 1587 | spin_lock_init(&cpu_base->lock); |
| 1428 | lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); | ||
| 1429 | 1588 | ||
| 1430 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1589 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
| 1431 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1590 | cpu_base->clock_base[i].cpu_base = cpu_base; |
| @@ -1445,6 +1604,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, | |||
| 1445 | while ((node = rb_first(&old_base->active))) { | 1604 | while ((node = rb_first(&old_base->active))) { |
| 1446 | timer = rb_entry(node, struct hrtimer, node); | 1605 | timer = rb_entry(node, struct hrtimer, node); |
| 1447 | BUG_ON(hrtimer_callback_running(timer)); | 1606 | BUG_ON(hrtimer_callback_running(timer)); |
| 1607 | debug_hrtimer_deactivate(timer); | ||
| 1448 | __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); | 1608 | __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); |
| 1449 | timer->base = new_base; | 1609 | timer->base = new_base; |
| 1450 | /* | 1610 | /* |
| @@ -1466,16 +1626,16 @@ static void migrate_hrtimers(int cpu) | |||
| 1466 | tick_cancel_sched_timer(cpu); | 1626 | tick_cancel_sched_timer(cpu); |
| 1467 | 1627 | ||
| 1468 | local_irq_disable(); | 1628 | local_irq_disable(); |
| 1469 | double_spin_lock(&new_base->lock, &old_base->lock, | 1629 | spin_lock(&new_base->lock); |
| 1470 | smp_processor_id() < cpu); | 1630 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); |
| 1471 | 1631 | ||
| 1472 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1632 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
| 1473 | migrate_hrtimer_list(&old_base->clock_base[i], | 1633 | migrate_hrtimer_list(&old_base->clock_base[i], |
| 1474 | &new_base->clock_base[i]); | 1634 | &new_base->clock_base[i]); |
| 1475 | } | 1635 | } |
| 1476 | 1636 | ||
| 1477 | double_spin_unlock(&new_base->lock, &old_base->lock, | 1637 | spin_unlock(&old_base->lock); |
| 1478 | smp_processor_id() < cpu); | 1638 | spin_unlock(&new_base->lock); |
| 1479 | local_irq_enable(); | 1639 | local_irq_enable(); |
| 1480 | put_cpu_var(hrtimer_bases); | 1640 | put_cpu_var(hrtimer_bases); |
| 1481 | } | 1641 | } |
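Editor's note: the last hunk drops the double_spin_lock() helper (which picked its acquisition order from the CPU numbers) in favour of taking new_base->lock first and annotating the second acquisition with spin_lock_nested(), which keeps lockdep happy about two locks of the same class; the migration path itself is presumably serialized by CPU hotplug. For a general-purpose way of taking two same-class locks, ordering by address is a common alternative; the following is a userspace sketch of that technique, not the kernel's implementation:

#include <pthread.h>
#include <stdint.h>

struct cpu_base {
        pthread_mutex_t lock;
};

/* Acquire both locks in a globally consistent (address) order so that two
 * threads locking the same pair in opposite roles cannot deadlock. */
static void lock_pair(struct cpu_base *a, struct cpu_base *b)
{
        if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}

static void unlock_pair(struct cpu_base *a, struct cpu_base *b)
{
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        struct cpu_base x = { PTHREAD_MUTEX_INITIALIZER };
        struct cpu_base y = { PTHREAD_MUTEX_INITIALIZER };

        lock_pair(&x, &y);
        /* ... migrate timers from one base to the other ... */
        unlock_pair(&x, &y);
        return 0;
}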
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fdb3fbe2b0c4..964964baefa2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq) | |||
| 47 | desc->irq_count = 0; | 47 | desc->irq_count = 0; |
| 48 | desc->irqs_unhandled = 0; | 48 | desc->irqs_unhandled = 0; |
| 49 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
| 50 | desc->affinity = CPU_MASK_ALL; | 50 | cpus_setall(desc->affinity); |
| 51 | #endif | 51 | #endif |
| 52 | spin_unlock_irqrestore(&desc->lock, flags); | 52 | spin_unlock_irqrestore(&desc->lock, flags); |
| 53 | } | 53 | } |
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 6d9204f3a370..38a25b8d8bff 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | #include <linux/module.h> | 1 | #include <linux/module.h> |
| 2 | #include <linux/interrupt.h> | 2 | #include <linux/interrupt.h> |
| 3 | #include <linux/device.h> | 3 | #include <linux/device.h> |
| 4 | #include <linux/gfp.h> | ||
| 4 | 5 | ||
| 5 | /* | 6 | /* |
| 6 | * Device resource management aware IRQ request/free implementation. | 7 | * Device resource management aware IRQ request/free implementation. |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 438a01464287..46d6611a33bb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 12 | #include <linux/random.h> | 12 | #include <linux/random.h> |
| 13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
| 14 | #include <linux/slab.h> | ||
| 14 | 15 | ||
| 15 | #include "internals.h" | 16 | #include "internals.h" |
| 16 | 17 | ||
| @@ -149,6 +150,26 @@ void disable_irq(unsigned int irq) | |||
| 149 | } | 150 | } |
| 150 | EXPORT_SYMBOL(disable_irq); | 151 | EXPORT_SYMBOL(disable_irq); |
| 151 | 152 | ||
| 153 | static void __enable_irq(struct irq_desc *desc, unsigned int irq) | ||
| 154 | { | ||
| 155 | switch (desc->depth) { | ||
| 156 | case 0: | ||
| 157 | printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | ||
| 158 | WARN_ON(1); | ||
| 159 | break; | ||
| 160 | case 1: { | ||
| 161 | unsigned int status = desc->status & ~IRQ_DISABLED; | ||
| 162 | |||
| 163 | /* Prevent probing on this irq: */ | ||
| 164 | desc->status = status | IRQ_NOPROBE; | ||
| 165 | check_irq_resend(desc, irq); | ||
| 166 | /* fall-through */ | ||
| 167 | } | ||
| 168 | default: | ||
| 169 | desc->depth--; | ||
| 170 | } | ||
| 171 | } | ||
| 172 | |||
| 152 | /** | 173 | /** |
| 153 | * enable_irq - enable handling of an irq | 174 | * enable_irq - enable handling of an irq |
| 154 | * @irq: Interrupt to enable | 175 | * @irq: Interrupt to enable |
| @@ -168,22 +189,7 @@ void enable_irq(unsigned int irq) | |||
| 168 | return; | 189 | return; |
| 169 | 190 | ||
| 170 | spin_lock_irqsave(&desc->lock, flags); | 191 | spin_lock_irqsave(&desc->lock, flags); |
| 171 | switch (desc->depth) { | 192 | __enable_irq(desc, irq); |
| 172 | case 0: | ||
| 173 | printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | ||
| 174 | WARN_ON(1); | ||
| 175 | break; | ||
| 176 | case 1: { | ||
| 177 | unsigned int status = desc->status & ~IRQ_DISABLED; | ||
| 178 | |||
| 179 | /* Prevent probing on this irq: */ | ||
| 180 | desc->status = status | IRQ_NOPROBE; | ||
| 181 | check_irq_resend(desc, irq); | ||
| 182 | /* fall-through */ | ||
| 183 | } | ||
| 184 | default: | ||
| 185 | desc->depth--; | ||
| 186 | } | ||
| 187 | spin_unlock_irqrestore(&desc->lock, flags); | 193 | spin_unlock_irqrestore(&desc->lock, flags); |
| 188 | } | 194 | } |
| 189 | EXPORT_SYMBOL(enable_irq); | 195 | EXPORT_SYMBOL(enable_irq); |
| @@ -364,7 +370,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
| 364 | compat_irq_chip_set_default_handler(desc); | 370 | compat_irq_chip_set_default_handler(desc); |
| 365 | 371 | ||
| 366 | desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | | 372 | desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | |
| 367 | IRQ_INPROGRESS); | 373 | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); |
| 368 | 374 | ||
| 369 | if (!(desc->status & IRQ_NOAUTOEN)) { | 375 | if (!(desc->status & IRQ_NOAUTOEN)) { |
| 370 | desc->depth = 0; | 376 | desc->depth = 0; |
| @@ -380,6 +386,16 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
| 380 | /* Reset broken irq detection when installing new handler */ | 386 | /* Reset broken irq detection when installing new handler */ |
| 381 | desc->irq_count = 0; | 387 | desc->irq_count = 0; |
| 382 | desc->irqs_unhandled = 0; | 388 | desc->irqs_unhandled = 0; |
| 389 | |||
| 390 | /* | ||
| 391 | * Check whether we disabled the irq via the spurious handler | ||
| 392 | * before. Reenable it and give it another chance. | ||
| 393 | */ | ||
| 394 | if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { | ||
| 395 | desc->status &= ~IRQ_SPURIOUS_DISABLED; | ||
| 396 | __enable_irq(desc, irq); | ||
| 397 | } | ||
| 398 | |||
| 383 | spin_unlock_irqrestore(&desc->lock, flags); | 399 | spin_unlock_irqrestore(&desc->lock, flags); |
| 384 | 400 | ||
| 385 | new->irq = irq; | 401 | new->irq = irq; |
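Editor's note: the manage.c changes above (and the spurious.c hunk that follows) both treat desc->depth as a nesting count: each disable increments it, each enable decrements it, and the line is really masked or unmasked only on the 0<->1 transitions. setup_irq() can now also re-enable a line that the spurious detector shut down, via the new IRQ_SPURIOUS_DISABLED flag. A bare-bones sketch of the counting scheme itself (userspace, hypothetical struct, no hardware or locking):

#include <assert.h>
#include <stdio.h>

struct fake_desc {
        unsigned int depth;     /* how many outstanding disables */
        int masked;             /* would the hardware line be masked? */
};

static void fake_disable(struct fake_desc *d)
{
        if (d->depth++ == 0)
                d->masked = 1;          /* only the first disable masks */
}

static void fake_enable(struct fake_desc *d)
{
        if (d->depth == 0) {
                fprintf(stderr, "unbalanced enable\n");
                return;
        }
        if (--d->depth == 0)
                d->masked = 0;          /* only the last enable unmasks */
}

int main(void)
{
        struct fake_desc d = { 0, 0 };

        fake_disable(&d);
        fake_disable(&d);               /* nested disable, still masked */
        fake_enable(&d);
        assert(d.masked == 1);
        fake_enable(&d);
        assert(d.masked == 0);
        return 0;
}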
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 088dabbf2d6a..c66d3f10e853 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
| @@ -209,8 +209,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
| 209 | * Now kill the IRQ | 209 | * Now kill the IRQ |
| 210 | */ | 210 | */ |
| 211 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 211 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
| 212 | desc->status |= IRQ_DISABLED; | 212 | desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; |
| 213 | desc->depth = 1; | 213 | desc->depth++; |
| 214 | desc->chip->disable(irq); | 214 | desc->chip->disable(irq); |
| 215 | } | 215 | } |
| 216 | desc->irqs_unhandled = 0; | 216 | desc->irqs_unhandled = 0; |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f091d13def00..6fc0040f3e3a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = { | |||
| 472 | 472 | ||
| 473 | static int __init kallsyms_init(void) | 473 | static int __init kallsyms_init(void) |
| 474 | { | 474 | { |
| 475 | struct proc_dir_entry *entry; | 475 | proc_create("kallsyms", 0444, NULL, &kallsyms_operations); |
| 476 | |||
| 477 | entry = create_proc_entry("kallsyms", 0444, NULL); | ||
| 478 | if (entry) | ||
| 479 | entry->proc_fops = &kallsyms_operations; | ||
| 480 | return 0; | 476 | return 0; |
| 481 | } | 477 | } |
| 482 | __initcall(kallsyms_init); | 478 | __initcall(kallsyms_init); |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 06a0e2775651..1c5fcacbcf33 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -29,7 +29,6 @@ | |||
| 29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
| 30 | #include <asm/io.h> | 30 | #include <asm/io.h> |
| 31 | #include <asm/system.h> | 31 | #include <asm/system.h> |
| 32 | #include <asm/semaphore.h> | ||
| 33 | #include <asm/sections.h> | 32 | #include <asm/sections.h> |
| 34 | 33 | ||
| 35 | /* Per cpu memory for storing cpu states in case of system crash. */ | 34 | /* Per cpu memory for storing cpu states in case of system crash. */ |
| @@ -1218,7 +1217,7 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
| 1218 | } | 1217 | } |
| 1219 | 1218 | ||
| 1220 | /* match ? */ | 1219 | /* match ? */ |
| 1221 | if (system_ram >= start && system_ram <= end) { | 1220 | if (system_ram >= start && system_ram < end) { |
| 1222 | *crash_size = size; | 1221 | *crash_size = size; |
| 1223 | break; | 1222 | break; |
| 1224 | } | 1223 | } |
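Editor's note: the crashkernel hunk above turns the range match into a half-open interval, so the boundary value now belongs to the range that starts there rather than the one that ends there. The comparison in isolation:

#include <assert.h>

static int ram_matches(unsigned long long system_ram,
                       unsigned long long start, unsigned long long end)
{
        return system_ram >= start && system_ram < end;   /* [start, end) */
}

int main(void)
{
        /* e.g. crashkernel=0-512M:64M,512M-1G:128M style ranges */
        assert(ram_matches(256, 0, 512));
        assert(!ram_matches(512, 0, 512));   /* boundary goes to the next range */
        assert(ram_matches(512, 512, 1024));
        return 0;
}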
| @@ -1406,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
| 1406 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | 1405 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); |
| 1407 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | 1406 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); |
| 1408 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | 1407 | VMCOREINFO_NUMBER(NR_FREE_PAGES); |
| 1408 | VMCOREINFO_NUMBER(PG_lru); | ||
| 1409 | VMCOREINFO_NUMBER(PG_private); | ||
| 1410 | VMCOREINFO_NUMBER(PG_swapcache); | ||
| 1409 | 1411 | ||
| 1410 | arch_crash_save_vmcoreinfo(); | 1412 | arch_crash_save_vmcoreinfo(); |
| 1411 | 1413 | ||
diff --git a/kernel/kgdb.c b/kernel/kgdb.c new file mode 100644 index 000000000000..1bd0ec1c80b2 --- /dev/null +++ b/kernel/kgdb.c | |||
| @@ -0,0 +1,1700 @@ | |||
| 1 | /* | ||
| 2 | * KGDB stub. | ||
| 3 | * | ||
| 4 | * Maintainer: Jason Wessel <jason.wessel@windriver.com> | ||
| 5 | * | ||
| 6 | * Copyright (C) 2000-2001 VERITAS Software Corporation. | ||
| 7 | * Copyright (C) 2002-2004 Timesys Corporation | ||
| 8 | * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> | ||
| 9 | * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> | ||
| 10 | * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> | ||
| 11 | * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. | ||
| 12 | * Copyright (C) 2005-2008 Wind River Systems, Inc. | ||
| 13 | * Copyright (C) 2007 MontaVista Software, Inc. | ||
| 14 | * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 15 | * | ||
| 16 | * Contributors at various stages not listed above: | ||
| 17 | * Jason Wessel ( jason.wessel@windriver.com ) | ||
| 18 | * George Anzinger <george@mvista.com> | ||
| 19 | * Anurekh Saxena (anurekh.saxena@timesys.com) | ||
| 20 | * Lake Stevens Instrument Division (Glenn Engel) | ||
| 21 | * Jim Kingdon, Cygnus Support. | ||
| 22 | * | ||
| 23 | * Original KGDB stub: David Grothe <dave@gcom.com>, | ||
| 24 | * Tigran Aivazian <tigran@sco.com> | ||
| 25 | * | ||
| 26 | * This file is licensed under the terms of the GNU General Public License | ||
| 27 | * version 2. This program is licensed "as is" without any warranty of any | ||
| 28 | * kind, whether express or implied. | ||
| 29 | */ | ||
| 30 | #include <linux/pid_namespace.h> | ||
| 31 | #include <linux/clocksource.h> | ||
| 32 | #include <linux/interrupt.h> | ||
| 33 | #include <linux/spinlock.h> | ||
| 34 | #include <linux/console.h> | ||
| 35 | #include <linux/threads.h> | ||
| 36 | #include <linux/uaccess.h> | ||
| 37 | #include <linux/kernel.h> | ||
| 38 | #include <linux/module.h> | ||
| 39 | #include <linux/ptrace.h> | ||
| 40 | #include <linux/reboot.h> | ||
| 41 | #include <linux/string.h> | ||
| 42 | #include <linux/delay.h> | ||
| 43 | #include <linux/sched.h> | ||
| 44 | #include <linux/sysrq.h> | ||
| 45 | #include <linux/init.h> | ||
| 46 | #include <linux/kgdb.h> | ||
| 47 | #include <linux/pid.h> | ||
| 48 | #include <linux/smp.h> | ||
| 49 | #include <linux/mm.h> | ||
| 50 | |||
| 51 | #include <asm/cacheflush.h> | ||
| 52 | #include <asm/byteorder.h> | ||
| 53 | #include <asm/atomic.h> | ||
| 54 | #include <asm/system.h> | ||
| 55 | |||
| 56 | static int kgdb_break_asap; | ||
| 57 | |||
| 58 | struct kgdb_state { | ||
| 59 | int ex_vector; | ||
| 60 | int signo; | ||
| 61 | int err_code; | ||
| 62 | int cpu; | ||
| 63 | int pass_exception; | ||
| 64 | long threadid; | ||
| 65 | long kgdb_usethreadid; | ||
| 66 | struct pt_regs *linux_regs; | ||
| 67 | }; | ||
| 68 | |||
| 69 | static struct debuggerinfo_struct { | ||
| 70 | void *debuggerinfo; | ||
| 71 | struct task_struct *task; | ||
| 72 | } kgdb_info[NR_CPUS]; | ||
| 73 | |||
| 74 | /** | ||
| 75 | * kgdb_connected - Is a host GDB connected to us? | ||
| 76 | */ | ||
| 77 | int kgdb_connected; | ||
| 78 | EXPORT_SYMBOL_GPL(kgdb_connected); | ||
| 79 | |||
| 80 | /* All the KGDB handlers are installed */ | ||
| 81 | static int kgdb_io_module_registered; | ||
| 82 | |||
| 83 | /* Guard for recursive entry */ | ||
| 84 | static int exception_level; | ||
| 85 | |||
| 86 | static struct kgdb_io *kgdb_io_ops; | ||
| 87 | static DEFINE_SPINLOCK(kgdb_registration_lock); | ||
| 88 | |||
| 89 | /* kgdb console driver is loaded */ | ||
| 90 | static int kgdb_con_registered; | ||
| 91 | /* determine if kgdb console output should be used */ | ||
| 92 | static int kgdb_use_con; | ||
| 93 | |||
| 94 | static int __init opt_kgdb_con(char *str) | ||
| 95 | { | ||
| 96 | kgdb_use_con = 1; | ||
| 97 | return 0; | ||
| 98 | } | ||
| 99 | |||
| 100 | early_param("kgdbcon", opt_kgdb_con); | ||
| 101 | |||
| 102 | module_param(kgdb_use_con, int, 0644); | ||
| 103 | |||
| 104 | /* | ||
| 105 | * Holds information about breakpoints in a kernel. These breakpoints are | ||
| 106 | * added and removed by gdb. | ||
| 107 | */ | ||
| 108 | static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { | ||
| 109 | [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } | ||
| 110 | }; | ||
| 111 | |||
| 112 | /* | ||
| 113 | * The CPU# of the active CPU, or -1 if none: | ||
| 114 | */ | ||
| 115 | atomic_t kgdb_active = ATOMIC_INIT(-1); | ||
| 116 | |||
| 117 | /* | ||
| 118 | * We use NR_CPUS not PERCPU, in case kgdb is used to debug early | ||
| 119 | * bootup code (which might not have percpu set up yet): | ||
| 120 | */ | ||
| 121 | static atomic_t passive_cpu_wait[NR_CPUS]; | ||
| 122 | static atomic_t cpu_in_kgdb[NR_CPUS]; | ||
| 123 | atomic_t kgdb_setting_breakpoint; | ||
| 124 | |||
| 125 | struct task_struct *kgdb_usethread; | ||
| 126 | struct task_struct *kgdb_contthread; | ||
| 127 | |||
| 128 | int kgdb_single_step; | ||
| 129 | |||
| 130 | /* Our I/O buffers. */ | ||
| 131 | static char remcom_in_buffer[BUFMAX]; | ||
| 132 | static char remcom_out_buffer[BUFMAX]; | ||
| 133 | |||
| 134 | /* Storage for the registers, in GDB format. */ | ||
| 135 | static unsigned long gdb_regs[(NUMREGBYTES + | ||
| 136 | sizeof(unsigned long) - 1) / | ||
| 137 | sizeof(unsigned long)]; | ||
| 138 | |||
| 139 | /* to keep track of the CPU which is doing the single stepping*/ | ||
| 140 | atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); | ||
| 141 | |||
| 142 | /* | ||
| 143 | * If you are debugging a problem where roundup (the collection of | ||
| 144 | * all other CPUs) is a problem [this should be extremely rare], | ||
| 145 | * then use the nokgdbroundup option to avoid roundup. In that case | ||
| 146 | * the other CPUs might interfere with your debugging context, so | ||
| 147 | * use this with care: | ||
| 148 | */ | ||
| 149 | int kgdb_do_roundup = 1; | ||
| 150 | |||
| 151 | static int __init opt_nokgdbroundup(char *str) | ||
| 152 | { | ||
| 153 | kgdb_do_roundup = 0; | ||
| 154 | |||
| 155 | return 0; | ||
| 156 | } | ||
| 157 | |||
| 158 | early_param("nokgdbroundup", opt_nokgdbroundup); | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Finally, some KGDB code :-) | ||
| 162 | */ | ||
| 163 | |||
| 164 | /* | ||
| 165 | * Weak aliases for breakpoint management, | ||
| 166 | * can be overridden by architectures when needed: | ||
| 167 | */ | ||
| 168 | int __weak kgdb_validate_break_address(unsigned long addr) | ||
| 169 | { | ||
| 170 | char tmp_variable[BREAK_INSTR_SIZE]; | ||
| 171 | |||
| 172 | return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); | ||
| 173 | } | ||
| 174 | |||
| 175 | int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) | ||
| 176 | { | ||
| 177 | int err; | ||
| 178 | |||
| 179 | err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); | ||
| 180 | if (err) | ||
| 181 | return err; | ||
| 182 | |||
| 183 | return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, | ||
| 184 | BREAK_INSTR_SIZE); | ||
| 185 | } | ||
| 186 | |||
| 187 | int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) | ||
| 188 | { | ||
| 189 | return probe_kernel_write((char *)addr, | ||
| 190 | (char *)bundle, BREAK_INSTR_SIZE); | ||
| 191 | } | ||
| 192 | |||
| 193 | unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) | ||
| 194 | { | ||
| 195 | return instruction_pointer(regs); | ||
| 196 | } | ||
| 197 | |||
| 198 | int __weak kgdb_arch_init(void) | ||
| 199 | { | ||
| 200 | return 0; | ||
| 201 | } | ||
| 202 | |||
| 203 | int __weak kgdb_skipexception(int exception, struct pt_regs *regs) | ||
| 204 | { | ||
| 205 | return 0; | ||
| 206 | } | ||
| 207 | |||
| 208 | void __weak | ||
| 209 | kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) | ||
| 210 | { | ||
| 211 | return; | ||
| 212 | } | ||
| 213 | |||
| 214 | /** | ||
| 215 | * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb. | ||
| 216 | * @regs: Current &struct pt_regs. | ||
| 217 | * | ||
| 218 | * This function will be called if the particular architecture must | ||
| 219 | * disable hardware debugging while it is processing gdb packets or | ||
| 220 | * handling an exception. | ||
| 221 | */ | ||
| 222 | void __weak kgdb_disable_hw_debug(struct pt_regs *regs) | ||
| 223 | { | ||
| 224 | } | ||
| 225 | |||
| 226 | /* | ||
| 227 | * GDB remote protocol parser: | ||
| 228 | */ | ||
| 229 | |||
| 230 | static const char hexchars[] = "0123456789abcdef"; | ||
| 231 | |||
| 232 | static int hex(char ch) | ||
| 233 | { | ||
| 234 | if ((ch >= 'a') && (ch <= 'f')) | ||
| 235 | return ch - 'a' + 10; | ||
| 236 | if ((ch >= '0') && (ch <= '9')) | ||
| 237 | return ch - '0'; | ||
| 238 | if ((ch >= 'A') && (ch <= 'F')) | ||
| 239 | return ch - 'A' + 10; | ||
| 240 | return -1; | ||
| 241 | } | ||
| 242 | |||
| 243 | /* scan for the sequence $<data>#<checksum> */ | ||
| 244 | static void get_packet(char *buffer) | ||
| 245 | { | ||
| 246 | unsigned char checksum; | ||
| 247 | unsigned char xmitcsum; | ||
| 248 | int count; | ||
| 249 | char ch; | ||
| 250 | |||
| 251 | do { | ||
| 252 | /* | ||
| 253 | * Spin and wait around for the start character, ignore all | ||
| 254 | * other characters: | ||
| 255 | */ | ||
| 256 | while ((ch = (kgdb_io_ops->read_char())) != '$') | ||
| 257 | /* nothing */; | ||
| 258 | |||
| 259 | kgdb_connected = 1; | ||
| 260 | checksum = 0; | ||
| 261 | xmitcsum = -1; | ||
| 262 | |||
| 263 | count = 0; | ||
| 264 | |||
| 265 | /* | ||
| 266 | * now, read until a # or end of buffer is found: | ||
| 267 | */ | ||
| 268 | while (count < (BUFMAX - 1)) { | ||
| 269 | ch = kgdb_io_ops->read_char(); | ||
| 270 | if (ch == '#') | ||
| 271 | break; | ||
| 272 | checksum = checksum + ch; | ||
| 273 | buffer[count] = ch; | ||
| 274 | count = count + 1; | ||
| 275 | } | ||
| 276 | buffer[count] = 0; | ||
| 277 | |||
| 278 | if (ch == '#') { | ||
| 279 | xmitcsum = hex(kgdb_io_ops->read_char()) << 4; | ||
| 280 | xmitcsum += hex(kgdb_io_ops->read_char()); | ||
| 281 | |||
| 282 | if (checksum != xmitcsum) | ||
| 283 | /* failed checksum */ | ||
| 284 | kgdb_io_ops->write_char('-'); | ||
| 285 | else | ||
| 286 | /* successful transfer */ | ||
| 287 | kgdb_io_ops->write_char('+'); | ||
| 288 | if (kgdb_io_ops->flush) | ||
| 289 | kgdb_io_ops->flush(); | ||
| 290 | } | ||
| 291 | } while (checksum != xmitcsum); | ||
| 292 | } | ||
| 293 | |||
| 294 | /* | ||
| 295 | * Send the packet in buffer. | ||
| 296 | * Check for gdb connection if asked for. | ||
| 297 | */ | ||
| 298 | static void put_packet(char *buffer) | ||
| 299 | { | ||
| 300 | unsigned char checksum; | ||
| 301 | int count; | ||
| 302 | char ch; | ||
| 303 | |||
| 304 | /* | ||
| 305 | * $<packet info>#<checksum>. | ||
| 306 | */ | ||
| 307 | while (1) { | ||
| 308 | kgdb_io_ops->write_char('$'); | ||
| 309 | checksum = 0; | ||
| 310 | count = 0; | ||
| 311 | |||
| 312 | while ((ch = buffer[count])) { | ||
| 313 | kgdb_io_ops->write_char(ch); | ||
| 314 | checksum += ch; | ||
| 315 | count++; | ||
| 316 | } | ||
| 317 | |||
| 318 | kgdb_io_ops->write_char('#'); | ||
| 319 | kgdb_io_ops->write_char(hexchars[checksum >> 4]); | ||
| 320 | kgdb_io_ops->write_char(hexchars[checksum & 0xf]); | ||
| 321 | if (kgdb_io_ops->flush) | ||
| 322 | kgdb_io_ops->flush(); | ||
| 323 | |||
| 324 | /* Now see what we get in reply. */ | ||
| 325 | ch = kgdb_io_ops->read_char(); | ||
| 326 | |||
| 327 | if (ch == 3) | ||
| 328 | ch = kgdb_io_ops->read_char(); | ||
| 329 | |||
| 330 | /* If we get an ACK, we are done. */ | ||
| 331 | if (ch == '+') | ||
| 332 | return; | ||
| 333 | |||
| 334 | /* | ||
| 335 | * If we get the start of another packet, this means | ||
| 336 | * that GDB is attempting to reconnect. We will NAK | ||
| 337 | * the packet being sent, and stop trying to send this | ||
| 338 | * packet. | ||
| 339 | */ | ||
| 340 | if (ch == '$') { | ||
| 341 | kgdb_io_ops->write_char('-'); | ||
| 342 | if (kgdb_io_ops->flush) | ||
| 343 | kgdb_io_ops->flush(); | ||
| 344 | return; | ||
| 345 | } | ||
| 346 | } | ||
| 347 | } | ||
| 348 | |||
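Editor's note: get_packet() and put_packet() above implement the framing of the gdb remote serial protocol: a payload travels as $<data>#<checksum>, where the checksum is the sum of the payload bytes modulo 256 written as two lowercase hex digits, and the receiver answers '+' (ACK) or '-' (NAK). A tiny host-side illustration that only formats a packet instead of driving an I/O module:

#include <stdio.h>
#include <string.h>

static void print_gdb_packet(const char *payload)
{
        unsigned char checksum = 0;
        size_t i;

        for (i = 0; i < strlen(payload); i++)
                checksum += (unsigned char)payload[i];

        printf("$%s#%02x\n", payload, checksum);
}

int main(void)
{
        print_gdb_packet("OK");                 /* prints: $OK#9a */
        print_gdb_packet("S05");                /* a stop reply, signal 5 */
        return 0;
}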
| 349 | static char *pack_hex_byte(char *pkt, u8 byte) | ||
| 350 | { | ||
| 351 | *pkt++ = hexchars[byte >> 4]; | ||
| 352 | *pkt++ = hexchars[byte & 0xf]; | ||
| 353 | |||
| 354 | return pkt; | ||
| 355 | } | ||
| 356 | |||
| 357 | /* | ||
| 358 | * Convert the memory pointed to by mem into hex, placing result in buf. | ||
| 359 | * Return 0 on success, otherwise the error from probe_kernel_read(). | ||
| 360 | */ | ||
| 361 | int kgdb_mem2hex(char *mem, char *buf, int count) | ||
| 362 | { | ||
| 363 | char *tmp; | ||
| 364 | int err; | ||
| 365 | |||
| 366 | /* | ||
| 367 | * We use the upper half of buf as an intermediate buffer for the | ||
| 368 | * raw memory copy. Hex conversion will work against this one. | ||
| 369 | */ | ||
| 370 | tmp = buf + count; | ||
| 371 | |||
| 372 | err = probe_kernel_read(tmp, mem, count); | ||
| 373 | if (!err) { | ||
| 374 | while (count > 0) { | ||
| 375 | buf = pack_hex_byte(buf, *tmp); | ||
| 376 | tmp++; | ||
| 377 | count--; | ||
| 378 | } | ||
| 379 | |||
| 380 | *buf = 0; | ||
| 381 | } | ||
| 382 | |||
| 383 | return err; | ||
| 384 | } | ||
| 385 | |||
| 386 | /* | ||
| 387 | * Copy the binary array pointed to by buf into mem. Un-escape the $, #, | ||
| 388 | * and 0x7d characters that were escaped with a leading 0x7d. Return 0 on | ||
| 389 | * success, otherwise the error from probe_kernel_write(). | ||
| 390 | */ | ||
| 391 | static int kgdb_ebin2mem(char *buf, char *mem, int count) | ||
| 392 | { | ||
| 393 | int err = 0; | ||
| 394 | char c; | ||
| 395 | |||
| 396 | while (count-- > 0) { | ||
| 397 | c = *buf++; | ||
| 398 | if (c == 0x7d) | ||
| 399 | c = *buf++ ^ 0x20; | ||
| 400 | |||
| 401 | err = probe_kernel_write(mem, &c, 1); | ||
| 402 | if (err) | ||
| 403 | break; | ||
| 404 | |||
| 405 | mem++; | ||
| 406 | } | ||
| 407 | |||
| 408 | return err; | ||
| 409 | } | ||
| 410 | |||
| 411 | /* | ||
| 412 | * Convert the hex array pointed to by buf into binary to be placed in mem. | ||
| 413 | * Return 0 on success, otherwise the error code from | ||
| 414 | * probe_kernel_write(). | ||
| 415 | */ | ||
| 416 | int kgdb_hex2mem(char *buf, char *mem, int count) | ||
| 417 | { | ||
| 418 | char *tmp_raw; | ||
| 419 | char *tmp_hex; | ||
| 420 | |||
| 421 | /* | ||
| 422 | * We use the upper half of buf as an intermediate buffer for the | ||
| 423 | * raw memory that is converted from hex. | ||
| 424 | */ | ||
| 425 | tmp_raw = buf + count * 2; | ||
| 426 | |||
| 427 | tmp_hex = tmp_raw - 1; | ||
| 428 | while (tmp_hex >= buf) { | ||
| 429 | tmp_raw--; | ||
| 430 | *tmp_raw = hex(*tmp_hex--); | ||
| 431 | *tmp_raw |= hex(*tmp_hex--) << 4; | ||
| 432 | } | ||
| 433 | |||
| 434 | return probe_kernel_write(mem, tmp_raw, count); | ||
| 435 | } | ||
| 436 | |||
| 437 | /* | ||
| 438 | * While we find nice hex chars, build a long_val. | ||
| 439 | * Return number of chars processed. | ||
| 440 | */ | ||
| 441 | int kgdb_hex2long(char **ptr, long *long_val) | ||
| 442 | { | ||
| 443 | int hex_val; | ||
| 444 | int num = 0; | ||
| 445 | |||
| 446 | *long_val = 0; | ||
| 447 | |||
| 448 | while (**ptr) { | ||
| 449 | hex_val = hex(**ptr); | ||
| 450 | if (hex_val < 0) | ||
| 451 | break; | ||
| 452 | |||
| 453 | *long_val = (*long_val << 4) | hex_val; | ||
| 454 | num++; | ||
| 455 | (*ptr)++; | ||
| 456 | } | ||
| 457 | |||
| 458 | return num; | ||
| 459 | } | ||
| 460 | |||
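Editor's note: kgdb_mem2hex() and kgdb_hex2mem() above reuse the upper half of the packet buffer as scratch space and go through probe_kernel_read()/probe_kernel_write() so a bad address cannot crash the stub. Stripped of those concerns, the underlying encoding is ordinary two-digits-per-byte hex; a standalone version for reference:

#include <assert.h>
#include <string.h>

static const char hexchars[] = "0123456789abcdef";

static void mem2hex(const unsigned char *mem, char *buf, int count)
{
        while (count-- > 0) {
                *buf++ = hexchars[*mem >> 4];
                *buf++ = hexchars[*mem & 0xf];
                mem++;
        }
        *buf = '\0';
}

static int hexval(char ch)
{
        if (ch >= '0' && ch <= '9')
                return ch - '0';
        if (ch >= 'a' && ch <= 'f')
                return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F')
                return ch - 'A' + 10;
        return -1;
}

int main(void)
{
        char buf[16];

        mem2hex((const unsigned char *)"OK", buf, 2);
        assert(strcmp(buf, "4f4b") == 0);
        assert(hexval(buf[0]) == 4 && hexval(buf[1]) == 15);
        return 0;
}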
| 461 | /* Write memory due to an 'M' or 'X' packet. */ | ||
| 462 | static int write_mem_msg(int binary) | ||
| 463 | { | ||
| 464 | char *ptr = &remcom_in_buffer[1]; | ||
| 465 | unsigned long addr; | ||
| 466 | unsigned long length; | ||
| 467 | int err; | ||
| 468 | |||
| 469 | if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && | ||
| 470 | kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { | ||
| 471 | if (binary) | ||
| 472 | err = kgdb_ebin2mem(ptr, (char *)addr, length); | ||
| 473 | else | ||
| 474 | err = kgdb_hex2mem(ptr, (char *)addr, length); | ||
| 475 | if (err) | ||
| 476 | return err; | ||
| 477 | if (CACHE_FLUSH_IS_SAFE) | ||
| 478 | flush_icache_range(addr, addr + length + 1); | ||
| 479 | return 0; | ||
| 480 | } | ||
| 481 | |||
| 482 | return -EINVAL; | ||
| 483 | } | ||
| 484 | |||
| 485 | static void error_packet(char *pkt, int error) | ||
| 486 | { | ||
| 487 | error = -error; | ||
| 488 | pkt[0] = 'E'; | ||
| 489 | pkt[1] = hexchars[(error / 10)]; | ||
| 490 | pkt[2] = hexchars[(error % 10)]; | ||
| 491 | pkt[3] = '\0'; | ||
| 492 | } | ||
| 493 | |||
| 494 | /* | ||
| 495 | * Thread ID accessors. We represent a flat TID space to GDB, where | ||
| 496 | * the per CPU idle threads (which under Linux all have PID 0) are | ||
| 497 | * remapped to negative TIDs. | ||
| 498 | */ | ||
| 499 | |||
| 500 | #define BUF_THREAD_ID_SIZE 16 | ||
| 501 | |||
| 502 | static char *pack_threadid(char *pkt, unsigned char *id) | ||
| 503 | { | ||
| 504 | char *limit; | ||
| 505 | |||
| 506 | limit = pkt + BUF_THREAD_ID_SIZE; | ||
| 507 | while (pkt < limit) | ||
| 508 | pkt = pack_hex_byte(pkt, *id++); | ||
| 509 | |||
| 510 | return pkt; | ||
| 511 | } | ||
| 512 | |||
| 513 | static void int_to_threadref(unsigned char *id, int value) | ||
| 514 | { | ||
| 515 | unsigned char *scan; | ||
| 516 | int i = 4; | ||
| 517 | |||
| 518 | scan = (unsigned char *)id; | ||
| 519 | while (i--) | ||
| 520 | *scan++ = 0; | ||
| 521 | *scan++ = (value >> 24) & 0xff; | ||
| 522 | *scan++ = (value >> 16) & 0xff; | ||
| 523 | *scan++ = (value >> 8) & 0xff; | ||
| 524 | *scan++ = (value & 0xff); | ||
| 525 | } | ||
| 526 | |||
| 527 | static struct task_struct *getthread(struct pt_regs *regs, int tid) | ||
| 528 | { | ||
| 529 | /* | ||
| 530 | * Non-positive TIDs are remapped to idle tasks: | ||
| 531 | */ | ||
| 532 | if (tid <= 0) | ||
| 533 | return idle_task(-tid); | ||
| 534 | |||
| 535 | /* | ||
| 536 | * find_task_by_pid_ns() does not take the tasklist lock anymore | ||
| 537 | * but is nicely RCU locked - hence is a pretty resilient | ||
| 538 | * thing to use: | ||
| 539 | */ | ||
| 540 | return find_task_by_pid_ns(tid, &init_pid_ns); | ||
| 541 | } | ||
| 542 | |||
| 543 | /* | ||
| 544 | * CPU debug state control: | ||
| 545 | */ | ||
| 546 | |||
| 547 | #ifdef CONFIG_SMP | ||
| 548 | static void kgdb_wait(struct pt_regs *regs) | ||
| 549 | { | ||
| 550 | unsigned long flags; | ||
| 551 | int cpu; | ||
| 552 | |||
| 553 | local_irq_save(flags); | ||
| 554 | cpu = raw_smp_processor_id(); | ||
| 555 | kgdb_info[cpu].debuggerinfo = regs; | ||
| 556 | kgdb_info[cpu].task = current; | ||
| 557 | /* | ||
| 558 | * Make sure the above info reaches the primary CPU before | ||
| 559 | * our cpu_in_kgdb[] flag setting does: | ||
| 560 | */ | ||
| 561 | smp_wmb(); | ||
| 562 | atomic_set(&cpu_in_kgdb[cpu], 1); | ||
| 563 | |||
| 564 | /* Wait till primary CPU is done with debugging */ | ||
| 565 | while (atomic_read(&passive_cpu_wait[cpu])) | ||
| 566 | cpu_relax(); | ||
| 567 | |||
| 568 | kgdb_info[cpu].debuggerinfo = NULL; | ||
| 569 | kgdb_info[cpu].task = NULL; | ||
| 570 | |||
| 571 | /* fix up hardware debug registers on local cpu */ | ||
| 572 | if (arch_kgdb_ops.correct_hw_break) | ||
| 573 | arch_kgdb_ops.correct_hw_break(); | ||
| 574 | |||
| 575 | /* Signal the primary CPU that we are done: */ | ||
| 576 | atomic_set(&cpu_in_kgdb[cpu], 0); | ||
| 577 | clocksource_touch_watchdog(); | ||
| 578 | local_irq_restore(flags); | ||
| 579 | } | ||
| 580 | #endif | ||
| 581 | |||
| 582 | /* | ||
| 583 | * Some architectures need cache flushes when we set/clear a | ||
| 584 | * breakpoint: | ||
| 585 | */ | ||
| 586 | static void kgdb_flush_swbreak_addr(unsigned long addr) | ||
| 587 | { | ||
| 588 | if (!CACHE_FLUSH_IS_SAFE) | ||
| 589 | return; | ||
| 590 | |||
| 591 | if (current->mm && current->mm->mmap_cache) { | ||
| 592 | flush_cache_range(current->mm->mmap_cache, | ||
| 593 | addr, addr + BREAK_INSTR_SIZE); | ||
| 594 | } | ||
| 595 | /* Force flush instruction cache if it was outside the mm */ | ||
| 596 | flush_icache_range(addr, addr + BREAK_INSTR_SIZE); | ||
| 597 | } | ||
| 598 | |||
| 599 | /* | ||
| 600 | * SW breakpoint management: | ||
| 601 | */ | ||
| 602 | static int kgdb_activate_sw_breakpoints(void) | ||
| 603 | { | ||
| 604 | unsigned long addr; | ||
| 605 | int error = 0; | ||
| 606 | int i; | ||
| 607 | |||
| 608 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
| 609 | if (kgdb_break[i].state != BP_SET) | ||
| 610 | continue; | ||
| 611 | |||
| 612 | addr = kgdb_break[i].bpt_addr; | ||
| 613 | error = kgdb_arch_set_breakpoint(addr, | ||
| 614 | kgdb_break[i].saved_instr); | ||
| 615 | if (error) | ||
| 616 | return error; | ||
| 617 | |||
| 618 | kgdb_flush_swbreak_addr(addr); | ||
| 619 | kgdb_break[i].state = BP_ACTIVE; | ||
| 620 | } | ||
| 621 | return 0; | ||
| 622 | } | ||
| 623 | |||
| 624 | static int kgdb_set_sw_break(unsigned long addr) | ||
| 625 | { | ||
| 626 | int err = kgdb_validate_break_address(addr); | ||
| 627 | int breakno = -1; | ||
| 628 | int i; | ||
| 629 | |||
| 630 | if (err) | ||
| 631 | return err; | ||
| 632 | |||
| 633 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
| 634 | if ((kgdb_break[i].state == BP_SET) && | ||
| 635 | (kgdb_break[i].bpt_addr == addr)) | ||
| 636 | return -EEXIST; | ||
| 637 | } | ||
| 638 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
| 639 | if (kgdb_break[i].state == BP_REMOVED && | ||
| 640 | kgdb_break[i].bpt_addr == addr) { | ||
| 641 | breakno = i; | ||
| 642 | break; | ||
| 643 | } | ||
| 644 | } | ||
| 645 | |||
| 646 | if (breakno == -1) { | ||
| 647 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
| 648 | if (kgdb_break[i].state == BP_UNDEFINED) { | ||
| 649 | breakno = i; | ||
| 650 | break; | ||
| 651 | } | ||
| 652 | } | ||
| 653 | } | ||
| 654 | |||
| 655 | if (breakno == -1) | ||
| 656 | return -E2BIG; | ||
| 657 | |||
| 658 | kgdb_break[breakno].state = BP_SET; | ||
| 659 | kgdb_break[breakno].type = BP_BREAKPOINT; | ||
| 660 | kgdb_break[breakno].bpt_addr = addr; | ||
| 661 | |||
| 662 | return 0; | ||
| 663 | } | ||
| 664 | |||
| 665 | static int kgdb_deactivate_sw_breakpoints(void) | ||
| 666 | { | ||
| 667 | unsigned long addr; | ||
| 668 | int error = 0; | ||
| 669 | int i; | ||
| 670 | |||
| 671 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
| 672 | if (kgdb_break[i].state != BP_ACTIVE) | ||
| 673 | continue; | ||
| 674 | addr = kgdb_break[i].bpt_addr; | ||
| 675 | error = kgdb_arch_remove_breakpoint(addr, | ||
| 676 | kgdb_break[i].saved_instr); | ||
| 677 | if (error) | ||
| 678 | return error; | ||
| 679 | |||
| 680 | kgdb_flush_swbreak_addr(addr); | ||
| 681 | kgdb_break[i].state = BP_SET; | ||
| 682 | } | ||
| 683 | return 0; | ||
| 684 | } | ||
| 685 | |||
| 686 | static int kgdb_remove_sw_break(unsigned long addr) | ||
| 687 | { | ||
| 688 | int i; | ||
| 689 | |||
| 690 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
| 691 | if ((kgdb_break[i].state == BP_SET) && | ||
| 692 | (kgdb_break[i].bpt_addr == addr)) { | ||
| 693 | kgdb_break[i].state = BP_REMOVED; | ||
| 694 | return 0; | ||
| 695 | } | ||
| 696 | } | ||
| 697 | return -ENOENT; | ||
| 698 | } | ||
| 699 | |||
| 700 | int kgdb_isremovedbreak(unsigned long addr) | ||
| 701 | { | ||
| 702 | int i; | ||
| 703 | |||
| 704 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
| 705 | if ((kgdb_break[i].state == BP_REMOVED) && | ||
| 706 | (kgdb_break[i].bpt_addr == addr)) | ||
| 707 | return 1; | ||
| 708 | } | ||
| 709 | return 0; | ||
| 710 | } | ||
| 711 | |||
| 712 | int remove_all_break(void) | ||
| 713 | { | ||
| 714 | unsigned long addr; | ||
| 715 | int error; | ||
| 716 | int i; | ||
| 717 | |||
| 718 | /* Clear memory breakpoints. */ | ||
| 719 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
| 720 | if (kgdb_break[i].state != BP_ACTIVE) | ||
| 721 | goto setundefined; | ||
| 722 | addr = kgdb_break[i].bpt_addr; | ||
| 723 | error = kgdb_arch_remove_breakpoint(addr, | ||
| 724 | kgdb_break[i].saved_instr); | ||
| 725 | if (error) | ||
| 726 | printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", | ||
| 727 | addr); | ||
| 728 | setundefined: | ||
| 729 | kgdb_break[i].state = BP_UNDEFINED; | ||
| 730 | } | ||
| 731 | |||
| 732 | /* Clear hardware breakpoints. */ | ||
| 733 | if (arch_kgdb_ops.remove_all_hw_break) | ||
| 734 | arch_kgdb_ops.remove_all_hw_break(); | ||
| 735 | |||
| 736 | return 0; | ||
| 737 | } | ||
| 738 | |||
| 739 | /* | ||
| 740 | * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUS: | ||
| 741 | */ | ||
| 742 | static inline int shadow_pid(int realpid) | ||
| 743 | { | ||
| 744 | if (realpid) | ||
| 745 | return realpid; | ||
| 746 | |||
| 747 | return -1-raw_smp_processor_id(); | ||
| 748 | } | ||
| 749 | |||
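Editor's note: shadow_pid() above is the forward half of the flat TID space presented to gdb: ordinary tasks keep their PID, while the per-CPU idle tasks (which all have PID 0 under Linux) become -1, -2, ... so they stay distinguishable. A toy check of that mapping and its arithmetic inverse (illustration only, not the kernel lookup path):

#include <assert.h>

static int shadow_pid(int realpid, int cpu)
{
        return realpid ? realpid : -1 - cpu;
}

static int idle_cpu_from_tid(int tid)
{
        return -tid - 1;        /* inverse of -1 - cpu, for tid <= -1 only */
}

int main(void)
{
        assert(shadow_pid(1234, 3) == 1234);    /* normal task: unchanged */
        assert(shadow_pid(0, 0) == -1);         /* idle task of CPU 0 */
        assert(shadow_pid(0, 3) == -4);         /* idle task of CPU 3 */
        assert(idle_cpu_from_tid(-4) == 3);
        return 0;
}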
| 750 | static char gdbmsgbuf[BUFMAX + 1]; | ||
| 751 | |||
| 752 | static void kgdb_msg_write(const char *s, int len) | ||
| 753 | { | ||
| 754 | char *bufptr; | ||
| 755 | int wcount; | ||
| 756 | int i; | ||
| 757 | |||
| 758 | /* 'O'utput */ | ||
| 759 | gdbmsgbuf[0] = 'O'; | ||
| 760 | |||
| 761 | /* Fill and send buffers... */ | ||
| 762 | while (len > 0) { | ||
| 763 | bufptr = gdbmsgbuf + 1; | ||
| 764 | |||
| 765 | /* Calculate how many this time */ | ||
| 766 | if ((len << 1) > (BUFMAX - 2)) | ||
| 767 | wcount = (BUFMAX - 2) >> 1; | ||
| 768 | else | ||
| 769 | wcount = len; | ||
| 770 | |||
| 771 | /* Pack in hex chars */ | ||
| 772 | for (i = 0; i < wcount; i++) | ||
| 773 | bufptr = pack_hex_byte(bufptr, s[i]); | ||
| 774 | *bufptr = '\0'; | ||
| 775 | |||
| 776 | /* Move up */ | ||
| 777 | s += wcount; | ||
| 778 | len -= wcount; | ||
| 779 | |||
| 780 | /* Write packet */ | ||
| 781 | put_packet(gdbmsgbuf); | ||
| 782 | } | ||
| 783 | } | ||
| 784 | |||
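Editor's note: kgdb_msg_write() above streams console output to gdb as 'O' packets, hex-encoding the text, so each source byte costs two characters in gdbmsgbuf on top of the leading 'O' and the terminating NUL; that is why it caps each chunk at (BUFMAX - 2) / 2 bytes. The chunk-size computation on its own (the BUFMAX value below is assumed for the sketch; the real constant comes from <linux/kgdb.h>):

#include <assert.h>

#define BUFMAX 1024             /* assumed for this sketch */

static int chunk_bytes(int len)
{
        if ((len << 1) > (BUFMAX - 2))
                return (BUFMAX - 2) >> 1;       /* cap: hex doubles the size */
        return len;
}

int main(void)
{
        assert(chunk_bytes(10) == 10);                  /* short writes fit whole */
        assert(chunk_bytes(5000) == (BUFMAX - 2) / 2);  /* long ones are split */
        return 0;
}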
| 785 | /* | ||
| 786 | * Return true if there is a valid kgdb I/O module. Also if no | ||
| 787 | * debugger is attached a message can be printed to the console about | ||
| 788 | * waiting for the debugger to attach. | ||
| 789 | * | ||
| 790 | * The print_wait argument is only to be true when called from inside | ||
| 791 | * the core kgdb_handle_exception, because it will wait for the | ||
| 792 | * debugger to attach. | ||
| 793 | */ | ||
| 794 | static int kgdb_io_ready(int print_wait) | ||
| 795 | { | ||
| 796 | if (!kgdb_io_ops) | ||
| 797 | return 0; | ||
| 798 | if (kgdb_connected) | ||
| 799 | return 1; | ||
| 800 | if (atomic_read(&kgdb_setting_breakpoint)) | ||
| 801 | return 1; | ||
| 802 | if (print_wait) | ||
| 803 | printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); | ||
| 804 | return 1; | ||
| 805 | } | ||
| 806 | |||
| 807 | /* | ||
| 808 | * All the functions that start with gdb_cmd are the various | ||
| 809 | * operations to implement the handlers for the gdbserial protocol | ||
| 810 | * where KGDB is communicating with an external debugger | ||
| 811 | */ | ||
| 812 | |||
| 813 | /* Handle the '?' status packets */ | ||
| 814 | static void gdb_cmd_status(struct kgdb_state *ks) | ||
| 815 | { | ||
| 816 | /* | ||
| 817 | * We know that this packet is only sent | ||
| 818 | * during initial connect. So to be safe, | ||
| 819 | * we clear out our breakpoints now in case | ||
| 820 | * GDB is reconnecting. | ||
| 821 | */ | ||
| 822 | remove_all_break(); | ||
| 823 | |||
| 824 | remcom_out_buffer[0] = 'S'; | ||
| 825 | pack_hex_byte(&remcom_out_buffer[1], ks->signo); | ||
| 826 | } | ||
| 827 | |||
| 828 | /* Handle the 'g' get registers request */ | ||
| 829 | static void gdb_cmd_getregs(struct kgdb_state *ks) | ||
| 830 | { | ||
| 831 | struct task_struct *thread; | ||
| 832 | void *local_debuggerinfo; | ||
| 833 | int i; | ||
| 834 | |||
| 835 | thread = kgdb_usethread; | ||
| 836 | if (!thread) { | ||
| 837 | thread = kgdb_info[ks->cpu].task; | ||
| 838 | local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; | ||
| 839 | } else { | ||
| 840 | local_debuggerinfo = NULL; | ||
| 841 | for (i = 0; i < NR_CPUS; i++) { | ||
| 842 | /* | ||
| 843 | * Try to find the task on some other | ||
| 844 | * or possibly this node. If we do not | ||
| 845 | * find the matching task, then we try | ||
| 846 | * to approximate the results. | ||
| 847 | */ | ||
| 848 | if (thread == kgdb_info[i].task) | ||
| 849 | local_debuggerinfo = kgdb_info[i].debuggerinfo; | ||
| 850 | } | ||
| 851 | } | ||
| 852 | |||
| 853 | /* | ||
| 854 | * All threads that don't have debuggerinfo should be | ||
| 855 | * in __schedule() sleeping, since all other CPUs | ||
| 856 | * are in kgdb_wait, and thus have debuggerinfo. | ||
| 857 | */ | ||
| 858 | if (local_debuggerinfo) { | ||
| 859 | pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); | ||
| 860 | } else { | ||
| 861 | /* | ||
| 862 | * Pull stuff saved during switch_to; nothing | ||
| 863 | * else is accessible (or even particularly | ||
| 864 | * relevant). | ||
| 865 | * | ||
| 866 | * This should be enough for a stack trace. | ||
| 867 | */ | ||
| 868 | sleeping_thread_to_gdb_regs(gdb_regs, thread); | ||
| 869 | } | ||
| 870 | kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); | ||
| 871 | } | ||
| 872 | |||
| 873 | /* Handle the 'G' set registers request */ | ||
| 874 | static void gdb_cmd_setregs(struct kgdb_state *ks) | ||
| 875 | { | ||
| 876 | kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); | ||
| 877 | |||
| 878 | if (kgdb_usethread && kgdb_usethread != current) { | ||
| 879 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 880 | } else { | ||
| 881 | gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); | ||
| 882 | strcpy(remcom_out_buffer, "OK"); | ||
| 883 | } | ||
| 884 | } | ||
| 885 | |||
| 886 | /* Handle the 'm' memory read bytes */ | ||
| 887 | static void gdb_cmd_memread(struct kgdb_state *ks) | ||
| 888 | { | ||
| 889 | char *ptr = &remcom_in_buffer[1]; | ||
| 890 | unsigned long length; | ||
| 891 | unsigned long addr; | ||
| 892 | int err; | ||
| 893 | |||
| 894 | if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && | ||
| 895 | kgdb_hex2long(&ptr, &length) > 0) { | ||
| 896 | err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); | ||
| 897 | if (err) | ||
| 898 | error_packet(remcom_out_buffer, err); | ||
| 899 | } else { | ||
| 900 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 901 | } | ||
| 902 | } | ||
| 903 | |||
| 904 | /* Handle the 'M' memory write bytes */ | ||
| 905 | static void gdb_cmd_memwrite(struct kgdb_state *ks) | ||
| 906 | { | ||
| 907 | int err = write_mem_msg(0); | ||
| 908 | |||
| 909 | if (err) | ||
| 910 | error_packet(remcom_out_buffer, err); | ||
| 911 | else | ||
| 912 | strcpy(remcom_out_buffer, "OK"); | ||
| 913 | } | ||
| 914 | |||
| 915 | /* Handle the 'X' memory binary write bytes */ | ||
| 916 | static void gdb_cmd_binwrite(struct kgdb_state *ks) | ||
| 917 | { | ||
| 918 | int err = write_mem_msg(1); | ||
| 919 | |||
| 920 | if (err) | ||
| 921 | error_packet(remcom_out_buffer, err); | ||
| 922 | else | ||
| 923 | strcpy(remcom_out_buffer, "OK"); | ||
| 924 | } | ||
| 925 | |||
| 926 | /* Handle the 'D' or 'k', detach or kill packets */ | ||
| 927 | static void gdb_cmd_detachkill(struct kgdb_state *ks) | ||
| 928 | { | ||
| 929 | int error; | ||
| 930 | |||
| 931 | /* The detach case */ | ||
| 932 | if (remcom_in_buffer[0] == 'D') { | ||
| 933 | error = remove_all_break(); | ||
| 934 | if (error < 0) { | ||
| 935 | error_packet(remcom_out_buffer, error); | ||
| 936 | } else { | ||
| 937 | strcpy(remcom_out_buffer, "OK"); | ||
| 938 | kgdb_connected = 0; | ||
| 939 | } | ||
| 940 | put_packet(remcom_out_buffer); | ||
| 941 | } else { | ||
| 942 | /* | ||
| 943 | * Assume the kill case, with no exit code checking, | ||
| 944 | * trying to force detach the debugger: | ||
| 945 | */ | ||
| 946 | remove_all_break(); | ||
| 947 | kgdb_connected = 0; | ||
| 948 | } | ||
| 949 | } | ||
| 950 | |||
| 951 | /* Handle the 'R' reboot packets */ | ||
| 952 | static int gdb_cmd_reboot(struct kgdb_state *ks) | ||
| 953 | { | ||
| 954 | /* For now, only honor R0 */ | ||
| 955 | if (strcmp(remcom_in_buffer, "R0") == 0) { | ||
| 956 | printk(KERN_CRIT "Executing emergency reboot\n"); | ||
| 957 | strcpy(remcom_out_buffer, "OK"); | ||
| 958 | put_packet(remcom_out_buffer); | ||
| 959 | |||
| 960 | /* | ||
| 961 | * Execution should not return from | ||
| 962 | * machine_emergency_restart() | ||
| 963 | */ | ||
| 964 | machine_emergency_restart(); | ||
| 965 | kgdb_connected = 0; | ||
| 966 | |||
| 967 | return 1; | ||
| 968 | } | ||
| 969 | return 0; | ||
| 970 | } | ||
| 971 | |||
| 972 | /* Handle the 'q' query packets */ | ||
| 973 | static void gdb_cmd_query(struct kgdb_state *ks) | ||
| 974 | { | ||
| 975 | struct task_struct *thread; | ||
| 976 | unsigned char thref[8]; | ||
| 977 | char *ptr; | ||
| 978 | int i; | ||
| 979 | |||
| 980 | switch (remcom_in_buffer[1]) { | ||
| 981 | case 's': | ||
| 982 | case 'f': | ||
| 983 | if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { | ||
| 984 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 985 | break; | ||
| 986 | } | ||
| 987 | |||
| 988 | if (remcom_in_buffer[1] == 'f') | ||
| 989 | ks->threadid = 1; | ||
| 990 | |||
| 991 | remcom_out_buffer[0] = 'm'; | ||
| 992 | ptr = remcom_out_buffer + 1; | ||
| 993 | |||
| 994 | for (i = 0; i < 17; ks->threadid++) { | ||
| 995 | thread = getthread(ks->linux_regs, ks->threadid); | ||
| 996 | if (thread) { | ||
| 997 | int_to_threadref(thref, ks->threadid); | ||
| 998 | pack_threadid(ptr, thref); | ||
| 999 | ptr += BUF_THREAD_ID_SIZE; | ||
| 1000 | *(ptr++) = ','; | ||
| 1001 | i++; | ||
| 1002 | } | ||
| 1003 | } | ||
| 1004 | *(--ptr) = '\0'; | ||
| 1005 | break; | ||
| 1006 | |||
| 1007 | case 'C': | ||
| 1008 | /* Current thread id */ | ||
| 1009 | strcpy(remcom_out_buffer, "QC"); | ||
| 1010 | ks->threadid = shadow_pid(current->pid); | ||
| 1011 | int_to_threadref(thref, ks->threadid); | ||
| 1012 | pack_threadid(remcom_out_buffer + 2, thref); | ||
| 1013 | break; | ||
| 1014 | case 'T': | ||
| 1015 | if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { | ||
| 1016 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1017 | break; | ||
| 1018 | } | ||
| 1019 | ks->threadid = 0; | ||
| 1020 | ptr = remcom_in_buffer + 17; | ||
| 1021 | kgdb_hex2long(&ptr, &ks->threadid); | ||
| 1022 | if (!getthread(ks->linux_regs, ks->threadid)) { | ||
| 1023 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1024 | break; | ||
| 1025 | } | ||
| 1026 | if (ks->threadid > 0) { | ||
| 1027 | kgdb_mem2hex(getthread(ks->linux_regs, | ||
| 1028 | ks->threadid)->comm, | ||
| 1029 | remcom_out_buffer, 16); | ||
| 1030 | } else { | ||
| 1031 | static char tmpstr[23 + BUF_THREAD_ID_SIZE]; | ||
| 1032 | |||
| 1033 | sprintf(tmpstr, "Shadow task %d for pid 0", | ||
| 1034 | (int)(-ks->threadid-1)); | ||
| 1035 | kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); | ||
| 1036 | } | ||
| 1037 | break; | ||
| 1038 | } | ||
| 1039 | } | ||
| 1040 | |||
| 1041 | /* Handle the 'H' task query packets */ | ||
| 1042 | static void gdb_cmd_task(struct kgdb_state *ks) | ||
| 1043 | { | ||
| 1044 | struct task_struct *thread; | ||
| 1045 | char *ptr; | ||
| 1046 | |||
| 1047 | switch (remcom_in_buffer[1]) { | ||
| 1048 | case 'g': | ||
| 1049 | ptr = &remcom_in_buffer[2]; | ||
| 1050 | kgdb_hex2long(&ptr, &ks->threadid); | ||
| 1051 | thread = getthread(ks->linux_regs, ks->threadid); | ||
| 1052 | if (!thread && ks->threadid > 0) { | ||
| 1053 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1054 | break; | ||
| 1055 | } | ||
| 1056 | kgdb_usethread = thread; | ||
| 1057 | ks->kgdb_usethreadid = ks->threadid; | ||
| 1058 | strcpy(remcom_out_buffer, "OK"); | ||
| 1059 | break; | ||
| 1060 | case 'c': | ||
| 1061 | ptr = &remcom_in_buffer[2]; | ||
| 1062 | kgdb_hex2long(&ptr, &ks->threadid); | ||
| 1063 | if (!ks->threadid) { | ||
| 1064 | kgdb_contthread = NULL; | ||
| 1065 | } else { | ||
| 1066 | thread = getthread(ks->linux_regs, ks->threadid); | ||
| 1067 | if (!thread && ks->threadid > 0) { | ||
| 1068 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1069 | break; | ||
| 1070 | } | ||
| 1071 | kgdb_contthread = thread; | ||
| 1072 | } | ||
| 1073 | strcpy(remcom_out_buffer, "OK"); | ||
| 1074 | break; | ||
| 1075 | } | ||
| 1076 | } | ||
| 1077 | |||
| 1078 | /* Handle the 'T' thread query packets */ | ||
| 1079 | static void gdb_cmd_thread(struct kgdb_state *ks) | ||
| 1080 | { | ||
| 1081 | char *ptr = &remcom_in_buffer[1]; | ||
| 1082 | struct task_struct *thread; | ||
| 1083 | |||
| 1084 | kgdb_hex2long(&ptr, &ks->threadid); | ||
| 1085 | thread = getthread(ks->linux_regs, ks->threadid); | ||
| 1086 | if (thread) | ||
| 1087 | strcpy(remcom_out_buffer, "OK"); | ||
| 1088 | else | ||
| 1089 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1090 | } | ||
| 1091 | |||
| 1092 | /* Handle the 'z' or 'Z' breakpoint remove or set packets */ | ||
| 1093 | static void gdb_cmd_break(struct kgdb_state *ks) | ||
| 1094 | { | ||
| 1095 | /* | ||
| 1096 | * Since GDB-5.3, it's been drafted that '0' is a software | ||
| 1097 | * breakpoint, '1' is a hardware breakpoint, so let's do that. | ||
| 1098 | */ | ||
| 1099 | char *bpt_type = &remcom_in_buffer[1]; | ||
| 1100 | char *ptr = &remcom_in_buffer[2]; | ||
| 1101 | unsigned long addr; | ||
| 1102 | unsigned long length; | ||
| 1103 | int error = 0; | ||
| 1104 | |||
| 1105 | if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { | ||
| 1106 | /* Unsupported */ | ||
| 1107 | if (*bpt_type > '4') | ||
| 1108 | return; | ||
| 1109 | } else { | ||
| 1110 | if (*bpt_type != '0' && *bpt_type != '1') | ||
| 1111 | /* Unsupported. */ | ||
| 1112 | return; | ||
| 1113 | } | ||
| 1114 | |||
| 1115 | /* | ||
| 1116 | * Test if this is a hardware breakpoint, and | ||
| 1117 | * if we support it: | ||
| 1118 | */ | ||
| 1119 | if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) | ||
| 1120 | /* Unsupported. */ | ||
| 1121 | return; | ||
| 1122 | |||
| 1123 | if (*(ptr++) != ',') { | ||
| 1124 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1125 | return; | ||
| 1126 | } | ||
| 1127 | if (!kgdb_hex2long(&ptr, &addr)) { | ||
| 1128 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1129 | return; | ||
| 1130 | } | ||
| 1131 | if (*(ptr++) != ',' || | ||
| 1132 | !kgdb_hex2long(&ptr, &length)) { | ||
| 1133 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1134 | return; | ||
| 1135 | } | ||
| 1136 | |||
| 1137 | if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') | ||
| 1138 | error = kgdb_set_sw_break(addr); | ||
| 1139 | else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') | ||
| 1140 | error = kgdb_remove_sw_break(addr); | ||
| 1141 | else if (remcom_in_buffer[0] == 'Z') | ||
| 1142 | error = arch_kgdb_ops.set_hw_breakpoint(addr, | ||
| 1143 | (int)length, *bpt_type - '0'); | ||
| 1144 | else if (remcom_in_buffer[0] == 'z') | ||
| 1145 | error = arch_kgdb_ops.remove_hw_breakpoint(addr, | ||
| 1146 | (int) length, *bpt_type - '0'); | ||
| 1147 | |||
| 1148 | if (error == 0) | ||
| 1149 | strcpy(remcom_out_buffer, "OK"); | ||
| 1150 | else | ||
| 1151 | error_packet(remcom_out_buffer, error); | ||
| 1152 | } | ||
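For reference, the body parsed above is plain ASCII of the form '<type>,<addr>,<len>', with the address and length in hex. A minimal, debugger-side sketch of what a "set software breakpoint" body looks like; the address and length are made up, and the $...#checksum framing handled by get_packet()/put_packet() is omitted:

/* Illustrative only: the payload of a Z0 (set software breakpoint)
 * packet as gdb_cmd_break() expects to parse it. */
#include <stdio.h>

int main(void)
{
	char body[64];
	unsigned long addr = 0xc0123456UL;	/* hypothetical kernel text address */
	unsigned long len = 1;			/* arch-specific breakpoint length */

	snprintf(body, sizeof(body), "Z0,%lx,%lx", addr, len);
	puts(body);				/* prints: Z0,c0123456,1 */
	return 0;
}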
| 1153 | |||
| 1154 | /* Handle the 'C' signal / exception passing packets */ | ||
| 1155 | static int gdb_cmd_exception_pass(struct kgdb_state *ks) | ||
| 1156 | { | ||
| 1157 | /* C09 == pass exception | ||
| 1158 | * C15 == detach kgdb, pass exception | ||
| 1159 | */ | ||
| 1160 | if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { | ||
| 1161 | |||
| 1162 | ks->pass_exception = 1; | ||
| 1163 | remcom_in_buffer[0] = 'c'; | ||
| 1164 | |||
| 1165 | } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { | ||
| 1166 | |||
| 1167 | ks->pass_exception = 1; | ||
| 1168 | remcom_in_buffer[0] = 'D'; | ||
| 1169 | remove_all_break(); | ||
| 1170 | kgdb_connected = 0; | ||
| 1171 | return 1; | ||
| 1172 | |||
| 1173 | } else { | ||
| 1174 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1175 | return 0; | ||
| 1176 | } | ||
| 1177 | |||
| 1178 | /* Indicate fall through */ | ||
| 1179 | return -1; | ||
| 1180 | } | ||
| 1181 | |||
| 1182 | /* | ||
| 1183 | * This function performs all gdbserial command processing | ||
| 1184 | */ | ||
| 1185 | static int gdb_serial_stub(struct kgdb_state *ks) | ||
| 1186 | { | ||
| 1187 | int error = 0; | ||
| 1188 | int tmp; | ||
| 1189 | |||
| 1190 | /* Clear the out buffer. */ | ||
| 1191 | memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); | ||
| 1192 | |||
| 1193 | if (kgdb_connected) { | ||
| 1194 | unsigned char thref[8]; | ||
| 1195 | char *ptr; | ||
| 1196 | |||
| 1197 | /* Reply to host that an exception has occurred */ | ||
| 1198 | ptr = remcom_out_buffer; | ||
| 1199 | *ptr++ = 'T'; | ||
| 1200 | ptr = pack_hex_byte(ptr, ks->signo); | ||
| 1201 | ptr += strlen(strcpy(ptr, "thread:")); | ||
| 1202 | int_to_threadref(thref, shadow_pid(current->pid)); | ||
| 1203 | ptr = pack_threadid(ptr, thref); | ||
| 1204 | *ptr++ = ';'; | ||
| 1205 | put_packet(remcom_out_buffer); | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | kgdb_usethread = kgdb_info[ks->cpu].task; | ||
| 1209 | ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); | ||
| 1210 | ks->pass_exception = 0; | ||
| 1211 | |||
| 1212 | while (1) { | ||
| 1213 | error = 0; | ||
| 1214 | |||
| 1215 | /* Clear the out buffer. */ | ||
| 1216 | memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); | ||
| 1217 | |||
| 1218 | get_packet(remcom_in_buffer); | ||
| 1219 | |||
| 1220 | switch (remcom_in_buffer[0]) { | ||
| 1221 | case '?': /* gdbserial status */ | ||
| 1222 | gdb_cmd_status(ks); | ||
| 1223 | break; | ||
| 1224 | case 'g': /* return the value of the CPU registers */ | ||
| 1225 | gdb_cmd_getregs(ks); | ||
| 1226 | break; | ||
| 1227 | case 'G': /* set the value of the CPU registers - return OK */ | ||
| 1228 | gdb_cmd_setregs(ks); | ||
| 1229 | break; | ||
| 1230 | case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ | ||
| 1231 | gdb_cmd_memread(ks); | ||
| 1232 | break; | ||
| 1233 | case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ | ||
| 1234 | gdb_cmd_memwrite(ks); | ||
| 1235 | break; | ||
| 1236 | case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ | ||
| 1237 | gdb_cmd_binwrite(ks); | ||
| 1238 | break; | ||
| 1239 | /* kill or detach. KGDB should treat this like a | ||
| 1240 | * continue. | ||
| 1241 | */ | ||
| 1242 | case 'D': /* Debugger detach */ | ||
| 1243 | case 'k': /* Debugger detach via kill */ | ||
| 1244 | gdb_cmd_detachkill(ks); | ||
| 1245 | goto default_handle; | ||
| 1246 | case 'R': /* Reboot */ | ||
| 1247 | if (gdb_cmd_reboot(ks)) | ||
| 1248 | goto default_handle; | ||
| 1249 | break; | ||
| 1250 | case 'q': /* query command */ | ||
| 1251 | gdb_cmd_query(ks); | ||
| 1252 | break; | ||
| 1253 | case 'H': /* task related */ | ||
| 1254 | gdb_cmd_task(ks); | ||
| 1255 | break; | ||
| 1256 | case 'T': /* Query thread status */ | ||
| 1257 | gdb_cmd_thread(ks); | ||
| 1258 | break; | ||
| 1259 | case 'z': /* Break point remove */ | ||
| 1260 | case 'Z': /* Break point set */ | ||
| 1261 | gdb_cmd_break(ks); | ||
| 1262 | break; | ||
| 1263 | case 'C': /* Exception passing */ | ||
| 1264 | tmp = gdb_cmd_exception_pass(ks); | ||
| 1265 | if (tmp > 0) | ||
| 1266 | goto default_handle; | ||
| 1267 | if (tmp == 0) | ||
| 1268 | break; | ||
| 1269 | /* Fall through on tmp < 0 */ | ||
| 1270 | case 'c': /* Continue packet */ | ||
| 1271 | case 's': /* Single step packet */ | ||
| 1272 | if (kgdb_contthread && kgdb_contthread != current) { | ||
| 1273 | /* Can't switch threads in kgdb */ | ||
| 1274 | error_packet(remcom_out_buffer, -EINVAL); | ||
| 1275 | break; | ||
| 1276 | } | ||
| 1277 | kgdb_activate_sw_breakpoints(); | ||
| 1278 | /* Fall through to default processing */ | ||
| 1279 | default: | ||
| 1280 | default_handle: | ||
| 1281 | error = kgdb_arch_handle_exception(ks->ex_vector, | ||
| 1282 | ks->signo, | ||
| 1283 | ks->err_code, | ||
| 1284 | remcom_in_buffer, | ||
| 1285 | remcom_out_buffer, | ||
| 1286 | ks->linux_regs); | ||
| 1287 | /* | ||
| 1288 | * Leave cmd processing on error, detach, | ||
| 1289 | * kill, continue, or single step. | ||
| 1290 | */ | ||
| 1291 | if (error >= 0 || remcom_in_buffer[0] == 'D' || | ||
| 1292 | remcom_in_buffer[0] == 'k') { | ||
| 1293 | error = 0; | ||
| 1294 | goto kgdb_exit; | ||
| 1295 | } | ||
| 1296 | |||
| 1297 | } | ||
| 1298 | |||
| 1299 | /* reply to the request */ | ||
| 1300 | put_packet(remcom_out_buffer); | ||
| 1301 | } | ||
| 1302 | |||
| 1303 | kgdb_exit: | ||
| 1304 | if (ks->pass_exception) | ||
| 1305 | error = 1; | ||
| 1306 | return error; | ||
| 1307 | } | ||
| 1308 | |||
| 1309 | static int kgdb_reenter_check(struct kgdb_state *ks) | ||
| 1310 | { | ||
| 1311 | unsigned long addr; | ||
| 1312 | |||
| 1313 | if (atomic_read(&kgdb_active) != raw_smp_processor_id()) | ||
| 1314 | return 0; | ||
| 1315 | |||
| 1316 | /* Panic on recursive debugger calls: */ | ||
| 1317 | exception_level++; | ||
| 1318 | addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); | ||
| 1319 | kgdb_deactivate_sw_breakpoints(); | ||
| 1320 | |||
| 1321 | /* | ||
| 1322 | * If the breakpoint was removed successfully at the place the | ||
| 1323 | * exception occurred, try to recover and print a warning to the | ||
| 1324 | * end user, because the user planted a breakpoint in a place | ||
| 1325 | * that KGDB needs in order to function. | ||
| 1326 | */ | ||
| 1327 | if (kgdb_remove_sw_break(addr) == 0) { | ||
| 1328 | exception_level = 0; | ||
| 1329 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); | ||
| 1330 | kgdb_activate_sw_breakpoints(); | ||
| 1331 | printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", | ||
| 1332 | addr); | ||
| 1333 | WARN_ON_ONCE(1); | ||
| 1334 | |||
| 1335 | return 1; | ||
| 1336 | } | ||
| 1337 | remove_all_break(); | ||
| 1338 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); | ||
| 1339 | |||
| 1340 | if (exception_level > 1) { | ||
| 1341 | dump_stack(); | ||
| 1342 | panic("Recursive entry to debugger"); | ||
| 1343 | } | ||
| 1344 | |||
| 1345 | printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); | ||
| 1346 | dump_stack(); | ||
| 1347 | panic("Recursive entry to debugger"); | ||
| 1348 | |||
| 1349 | return 1; | ||
| 1350 | } | ||
| 1351 | |||
| 1352 | /* | ||
| 1353 | * kgdb_handle_exception() - main entry point from a kernel exception | ||
| 1354 | * | ||
| 1355 | * Locking hierarchy: | ||
| 1356 | * interface locks, if any (begin_session) | ||
| 1357 | * kgdb lock (kgdb_active) | ||
| 1358 | */ | ||
| 1359 | int | ||
| 1360 | kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | ||
| 1361 | { | ||
| 1362 | struct kgdb_state kgdb_var; | ||
| 1363 | struct kgdb_state *ks = &kgdb_var; | ||
| 1364 | unsigned long flags; | ||
| 1365 | int error = 0; | ||
| 1366 | int i, cpu; | ||
| 1367 | |||
| 1368 | ks->cpu = raw_smp_processor_id(); | ||
| 1369 | ks->ex_vector = evector; | ||
| 1370 | ks->signo = signo; | ||
| 1371 | ks->ex_vector = evector; | ||
| 1372 | ks->err_code = ecode; | ||
| 1373 | ks->kgdb_usethreadid = 0; | ||
| 1374 | ks->linux_regs = regs; | ||
| 1375 | |||
| 1376 | if (kgdb_reenter_check(ks)) | ||
| 1377 | return 0; /* Ouch, double exception ! */ | ||
| 1378 | |||
| 1379 | acquirelock: | ||
| 1380 | /* | ||
| 1381 | * Interrupts will be restored by the 'trap return' code, except when | ||
| 1382 | * single stepping. | ||
| 1383 | */ | ||
| 1384 | local_irq_save(flags); | ||
| 1385 | |||
| 1386 | cpu = raw_smp_processor_id(); | ||
| 1387 | |||
| 1388 | /* | ||
| 1389 | * Acquire the kgdb_active lock: | ||
| 1390 | */ | ||
| 1391 | while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) | ||
| 1392 | cpu_relax(); | ||
| 1393 | |||
| 1394 | /* | ||
| 1395 | * Do not start the debugger connection on this CPU if the last | ||
| 1396 | * instance of the exception handler wanted to come into the | ||
| 1397 | * debugger on a different CPU via a single step | ||
| 1398 | */ | ||
| 1399 | if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && | ||
| 1400 | atomic_read(&kgdb_cpu_doing_single_step) != cpu) { | ||
| 1401 | |||
| 1402 | atomic_set(&kgdb_active, -1); | ||
| 1403 | clocksource_touch_watchdog(); | ||
| 1404 | local_irq_restore(flags); | ||
| 1405 | |||
| 1406 | goto acquirelock; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | if (!kgdb_io_ready(1)) { | ||
| 1410 | error = 1; | ||
| 1411 | goto kgdb_restore; /* No I/O connection, so resume the system */ | ||
| 1412 | } | ||
| 1413 | |||
| 1414 | /* | ||
| 1415 | * Don't enter if we have hit a removed breakpoint. | ||
| 1416 | */ | ||
| 1417 | if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) | ||
| 1418 | goto kgdb_restore; | ||
| 1419 | |||
| 1420 | /* Call the I/O driver's pre_exception routine */ | ||
| 1421 | if (kgdb_io_ops->pre_exception) | ||
| 1422 | kgdb_io_ops->pre_exception(); | ||
| 1423 | |||
| 1424 | kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; | ||
| 1425 | kgdb_info[ks->cpu].task = current; | ||
| 1426 | |||
| 1427 | kgdb_disable_hw_debug(ks->linux_regs); | ||
| 1428 | |||
| 1429 | /* | ||
| 1430 | * Get the passive CPU lock which will hold all the non-primary | ||
| 1431 | * CPUs in a spin state while the debugger is active. | ||
| 1432 | */ | ||
| 1433 | if (!kgdb_single_step || !kgdb_contthread) { | ||
| 1434 | for (i = 0; i < NR_CPUS; i++) | ||
| 1435 | atomic_set(&passive_cpu_wait[i], 1); | ||
| 1436 | } | ||
| 1437 | |||
| 1438 | /* | ||
| 1439 | * spin_lock code is good enough as a barrier so we don't | ||
| 1440 | * need one here: | ||
| 1441 | */ | ||
| 1442 | atomic_set(&cpu_in_kgdb[ks->cpu], 1); | ||
| 1443 | |||
| 1444 | #ifdef CONFIG_SMP | ||
| 1445 | /* Signal the other CPUs to enter kgdb_wait() */ | ||
| 1446 | if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) | ||
| 1447 | kgdb_roundup_cpus(flags); | ||
| 1448 | #endif | ||
| 1449 | |||
| 1450 | /* | ||
| 1451 | * Wait for the other CPUs to be notified and be waiting for us: | ||
| 1452 | */ | ||
| 1453 | for_each_online_cpu(i) { | ||
| 1454 | while (!atomic_read(&cpu_in_kgdb[i])) | ||
| 1455 | cpu_relax(); | ||
| 1456 | } | ||
| 1457 | |||
| 1458 | /* | ||
| 1459 | * At this point the primary processor is completely | ||
| 1460 | * in the debugger and all secondary CPUs are quiescent | ||
| 1461 | */ | ||
| 1462 | kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); | ||
| 1463 | kgdb_deactivate_sw_breakpoints(); | ||
| 1464 | kgdb_single_step = 0; | ||
| 1465 | kgdb_contthread = NULL; | ||
| 1466 | exception_level = 0; | ||
| 1467 | |||
| 1468 | /* Talk to debugger with gdbserial protocol */ | ||
| 1469 | error = gdb_serial_stub(ks); | ||
| 1470 | |||
| 1471 | /* Call the I/O driver's post_exception routine */ | ||
| 1472 | if (kgdb_io_ops->post_exception) | ||
| 1473 | kgdb_io_ops->post_exception(); | ||
| 1474 | |||
| 1475 | kgdb_info[ks->cpu].debuggerinfo = NULL; | ||
| 1476 | kgdb_info[ks->cpu].task = NULL; | ||
| 1477 | atomic_set(&cpu_in_kgdb[ks->cpu], 0); | ||
| 1478 | |||
| 1479 | if (!kgdb_single_step || !kgdb_contthread) { | ||
| 1480 | for (i = NR_CPUS-1; i >= 0; i--) | ||
| 1481 | atomic_set(&passive_cpu_wait[i], 0); | ||
| 1482 | /* | ||
| 1483 | * Wait till all the CPUs have quit | ||
| 1484 | * from the debugger. | ||
| 1485 | */ | ||
| 1486 | for_each_online_cpu(i) { | ||
| 1487 | while (atomic_read(&cpu_in_kgdb[i])) | ||
| 1488 | cpu_relax(); | ||
| 1489 | } | ||
| 1490 | } | ||
| 1491 | |||
| 1492 | kgdb_restore: | ||
| 1493 | /* Free kgdb_active */ | ||
| 1494 | atomic_set(&kgdb_active, -1); | ||
| 1495 | clocksource_touch_watchdog(); | ||
| 1496 | local_irq_restore(flags); | ||
| 1497 | |||
| 1498 | return error; | ||
| 1499 | } | ||
| 1500 | |||
| 1501 | int kgdb_nmicallback(int cpu, void *regs) | ||
| 1502 | { | ||
| 1503 | #ifdef CONFIG_SMP | ||
| 1504 | if (!atomic_read(&cpu_in_kgdb[cpu]) && | ||
| 1505 | atomic_read(&kgdb_active) != cpu && | ||
| 1506 | atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { | ||
| 1507 | kgdb_wait((struct pt_regs *)regs); | ||
| 1508 | return 0; | ||
| 1509 | } | ||
| 1510 | #endif | ||
| 1511 | return 1; | ||
| 1512 | } | ||
| 1513 | |||
| 1514 | void kgdb_console_write(struct console *co, const char *s, unsigned count) | ||
| 1515 | { | ||
| 1516 | unsigned long flags; | ||
| 1517 | |||
| 1518 | /* If we're debugging, or KGDB has not connected, don't try | ||
| 1519 | * to print. */ | ||
| 1520 | if (!kgdb_connected || atomic_read(&kgdb_active) != -1) | ||
| 1521 | return; | ||
| 1522 | |||
| 1523 | local_irq_save(flags); | ||
| 1524 | kgdb_msg_write(s, count); | ||
| 1525 | local_irq_restore(flags); | ||
| 1526 | } | ||
| 1527 | |||
| 1528 | static struct console kgdbcons = { | ||
| 1529 | .name = "kgdb", | ||
| 1530 | .write = kgdb_console_write, | ||
| 1531 | .flags = CON_PRINTBUFFER | CON_ENABLED, | ||
| 1532 | .index = -1, | ||
| 1533 | }; | ||
| 1534 | |||
| 1535 | #ifdef CONFIG_MAGIC_SYSRQ | ||
| 1536 | static void sysrq_handle_gdb(int key, struct tty_struct *tty) | ||
| 1537 | { | ||
| 1538 | if (!kgdb_io_ops) { | ||
| 1539 | printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); | ||
| 1540 | return; | ||
| 1541 | } | ||
| 1542 | if (!kgdb_connected) | ||
| 1543 | printk(KERN_CRIT "Entering KGDB\n"); | ||
| 1544 | |||
| 1545 | kgdb_breakpoint(); | ||
| 1546 | } | ||
| 1547 | |||
| 1548 | static struct sysrq_key_op sysrq_gdb_op = { | ||
| 1549 | .handler = sysrq_handle_gdb, | ||
| 1550 | .help_msg = "Gdb", | ||
| 1551 | .action_msg = "GDB", | ||
| 1552 | }; | ||
| 1553 | #endif | ||
| 1554 | |||
| 1555 | static void kgdb_register_callbacks(void) | ||
| 1556 | { | ||
| 1557 | if (!kgdb_io_module_registered) { | ||
| 1558 | kgdb_io_module_registered = 1; | ||
| 1559 | kgdb_arch_init(); | ||
| 1560 | #ifdef CONFIG_MAGIC_SYSRQ | ||
| 1561 | register_sysrq_key('g', &sysrq_gdb_op); | ||
| 1562 | #endif | ||
| 1563 | if (kgdb_use_con && !kgdb_con_registered) { | ||
| 1564 | register_console(&kgdbcons); | ||
| 1565 | kgdb_con_registered = 1; | ||
| 1566 | } | ||
| 1567 | } | ||
| 1568 | } | ||
| 1569 | |||
| 1570 | static void kgdb_unregister_callbacks(void) | ||
| 1571 | { | ||
| 1572 | /* | ||
| 1573 | * When this routine is called KGDB should unregister from the | ||
| 1574 | * panic handler and clean up, making sure it is not handling any | ||
| 1575 | * break exceptions at the time. | ||
| 1576 | */ | ||
| 1577 | if (kgdb_io_module_registered) { | ||
| 1578 | kgdb_io_module_registered = 0; | ||
| 1579 | kgdb_arch_exit(); | ||
| 1580 | #ifdef CONFIG_MAGIC_SYSRQ | ||
| 1581 | unregister_sysrq_key('g', &sysrq_gdb_op); | ||
| 1582 | #endif | ||
| 1583 | if (kgdb_con_registered) { | ||
| 1584 | unregister_console(&kgdbcons); | ||
| 1585 | kgdb_con_registered = 0; | ||
| 1586 | } | ||
| 1587 | } | ||
| 1588 | } | ||
| 1589 | |||
| 1590 | static void kgdb_initial_breakpoint(void) | ||
| 1591 | { | ||
| 1592 | kgdb_break_asap = 0; | ||
| 1593 | |||
| 1594 | printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); | ||
| 1595 | kgdb_breakpoint(); | ||
| 1596 | } | ||
| 1597 | |||
| 1598 | /** | ||
| 1599 | * kgdb_register_io_module - register KGDB IO module | ||
| 1600 | * @new_kgdb_io_ops: the io ops vector | ||
| 1601 | * | ||
| 1602 | * Register it with the KGDB core. | ||
| 1603 | */ | ||
| 1604 | int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) | ||
| 1605 | { | ||
| 1606 | int err; | ||
| 1607 | |||
| 1608 | spin_lock(&kgdb_registration_lock); | ||
| 1609 | |||
| 1610 | if (kgdb_io_ops) { | ||
| 1611 | spin_unlock(&kgdb_registration_lock); | ||
| 1612 | |||
| 1613 | printk(KERN_ERR "kgdb: Another I/O driver is already " | ||
| 1614 | "registered with KGDB.\n"); | ||
| 1615 | return -EBUSY; | ||
| 1616 | } | ||
| 1617 | |||
| 1618 | if (new_kgdb_io_ops->init) { | ||
| 1619 | err = new_kgdb_io_ops->init(); | ||
| 1620 | if (err) { | ||
| 1621 | spin_unlock(&kgdb_registration_lock); | ||
| 1622 | return err; | ||
| 1623 | } | ||
| 1624 | } | ||
| 1625 | |||
| 1626 | kgdb_io_ops = new_kgdb_io_ops; | ||
| 1627 | |||
| 1628 | spin_unlock(&kgdb_registration_lock); | ||
| 1629 | |||
| 1630 | printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", | ||
| 1631 | new_kgdb_io_ops->name); | ||
| 1632 | |||
| 1633 | /* Arm KGDB now. */ | ||
| 1634 | kgdb_register_callbacks(); | ||
| 1635 | |||
| 1636 | if (kgdb_break_asap) | ||
| 1637 | kgdb_initial_breakpoint(); | ||
| 1638 | |||
| 1639 | return 0; | ||
| 1640 | } | ||
| 1641 | EXPORT_SYMBOL_GPL(kgdb_register_io_module); | ||
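A minimal sketch of an I/O driver hooking into this registration path, using only the kgdb_io members exercised above (name, init, pre_exception, post_exception). The callbacks are empty placeholders and the driver name is made up; a real driver must also supply the character read/write hooks, which this hunk does not show.

/* Illustrative only: a skeletal KGDB I/O driver registration. */
#include <linux/module.h>
#include <linux/kgdb.h>

static int my_kgdb_init(void)
{
	/* probe and claim the transport here; non-zero aborts registration */
	return 0;
}

static void my_kgdb_pre_exception(void)
{
	/* quiesce anything sharing the port before the stub talks to gdb */
}

static void my_kgdb_post_exception(void)
{
	/* undo whatever pre_exception did */
}

static struct kgdb_io my_kgdb_io_ops = {
	.name		= "my_kgdb_io",
	.init		= my_kgdb_init,
	.pre_exception	= my_kgdb_pre_exception,
	.post_exception	= my_kgdb_post_exception,
	/* character read/write callbacks omitted in this sketch */
};

static int __init my_kgdb_io_register(void)
{
	return kgdb_register_io_module(&my_kgdb_io_ops);
}
module_init(my_kgdb_io_register);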
| 1642 | |||
| 1643 | /** | ||
| 1644 | * kgdb_unregister_io_module - unregister KGDB IO module | ||
| 1645 | * @old_kgdb_io_ops: the io ops vector | ||
| 1646 | * | ||
| 1647 | * Unregister it with the KGDB core. | ||
| 1648 | */ | ||
| 1649 | void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) | ||
| 1650 | { | ||
| 1651 | BUG_ON(kgdb_connected); | ||
| 1652 | |||
| 1653 | /* | ||
| 1654 | * KGDB is no longer able to communicate out, so | ||
| 1655 | * unregister our callbacks and reset state. | ||
| 1656 | */ | ||
| 1657 | kgdb_unregister_callbacks(); | ||
| 1658 | |||
| 1659 | spin_lock(&kgdb_registration_lock); | ||
| 1660 | |||
| 1661 | WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); | ||
| 1662 | kgdb_io_ops = NULL; | ||
| 1663 | |||
| 1664 | spin_unlock(&kgdb_registration_lock); | ||
| 1665 | |||
| 1666 | printk(KERN_INFO | ||
| 1667 | "kgdb: Unregistered I/O driver %s, debugger disabled.\n", | ||
| 1668 | old_kgdb_io_ops->name); | ||
| 1669 | } | ||
| 1670 | EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); | ||
| 1671 | |||
| 1672 | /** | ||
| 1673 | * kgdb_breakpoint - generate breakpoint exception | ||
| 1674 | * | ||
| 1675 | * This function will generate a breakpoint exception. It is used at the | ||
| 1676 | * beginning of a program to sync up with a debugger and can be used | ||
| 1677 | * otherwise as a quick means to stop program execution and "break" into | ||
| 1678 | * the debugger. | ||
| 1679 | */ | ||
| 1680 | void kgdb_breakpoint(void) | ||
| 1681 | { | ||
| 1682 | atomic_set(&kgdb_setting_breakpoint, 1); | ||
| 1683 | wmb(); /* Sync point before breakpoint */ | ||
| 1684 | arch_kgdb_breakpoint(); | ||
| 1685 | wmb(); /* Sync point after breakpoint */ | ||
| 1686 | atomic_set(&kgdb_setting_breakpoint, 0); | ||
| 1687 | } | ||
| 1688 | EXPORT_SYMBOL_GPL(kgdb_breakpoint); | ||
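As the comment above says, a test module or driver can break into the attached debugger at any point by calling kgdb_breakpoint() directly; a minimal sketch (the module name and message are made up):

/* Illustrative only: trap into kgdb when this module is loaded. */
#include <linux/module.h>
#include <linux/kgdb.h>

static int __init kgdb_break_test_init(void)
{
	printk(KERN_INFO "kgdb_break_test: trapping into the debugger\n");
	kgdb_breakpoint();
	return 0;
}
module_init(kgdb_break_test_init);

MODULE_LICENSE("GPL");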
| 1689 | |||
| 1690 | static int __init opt_kgdb_wait(char *str) | ||
| 1691 | { | ||
| 1692 | kgdb_break_asap = 1; | ||
| 1693 | |||
| 1694 | if (kgdb_io_module_registered) | ||
| 1695 | kgdb_initial_breakpoint(); | ||
| 1696 | |||
| 1697 | return 0; | ||
| 1698 | } | ||
| 1699 | |||
| 1700 | early_param("kgdbwait", opt_kgdb_wait); | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index 22be3ff3f363..8df97d3dfda8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/mnt_namespace.h> | 27 | #include <linux/mnt_namespace.h> |
| 28 | #include <linux/completion.h> | 28 | #include <linux/completion.h> |
| 29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
| 30 | #include <linux/fdtable.h> | ||
| 30 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
| 31 | #include <linux/security.h> | 32 | #include <linux/security.h> |
| 32 | #include <linux/mount.h> | 33 | #include <linux/mount.h> |
| @@ -165,7 +166,7 @@ static int ____call_usermodehelper(void *data) | |||
| 165 | } | 166 | } |
| 166 | 167 | ||
| 167 | /* We can run anywhere, unlike our parent keventd(). */ | 168 | /* We can run anywhere, unlike our parent keventd(). */ |
| 168 | set_cpus_allowed(current, CPU_MASK_ALL); | 169 | set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); |
| 169 | 170 | ||
| 170 | /* | 171 | /* |
| 171 | * Our parent is keventd, which runs with elevated scheduling priority. | 172 | * Our parent is keventd, which runs with elevated scheduling priority. |
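The hunk above is part of the tree-wide switch from set_cpus_allowed() to the pointer-taking set_cpus_allowed_ptr(). A hedged sketch of the new calling convention for the more common case of restricting a task to a single CPU (the CPU number is arbitrary):

/* Illustrative only: pin the current task to CPU 0 with the new API. */
#include <linux/cpumask.h>
#include <linux/sched.h>

static int pin_self_to_cpu0(void)
{
	cpumask_t mask = cpumask_of_cpu(0);

	return set_cpus_allowed_ptr(current, &mask);
}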
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fcfb580c3afc..1e0250cb9486 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | |||
| 72 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 72 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
| 73 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 73 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
| 74 | 74 | ||
| 75 | /* | ||
| 76 | * Normally, functions that we'd want to prohibit kprobes in are marked | ||
| 77 | * __kprobes. But there are cases where such functions already belong to | ||
| 78 | * a different section (__sched for preempt_schedule). | ||
| 79 | * | ||
| 80 | * For such cases, we now have a blacklist. | ||
| 81 | */ | ||
| 82 | struct kprobe_blackpoint kprobe_blacklist[] = { | ||
| 83 | {"preempt_schedule",}, | ||
| 84 | {NULL} /* Terminator */ | ||
| 85 | }; | ||
| 86 | |||
| 75 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | 87 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT |
| 76 | /* | 88 | /* |
| 77 | * kprobe->ainsn.insn points to the copy of the instruction to be | 89 | * kprobe->ainsn.insn points to the copy of the instruction to be |
| @@ -417,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp) | |||
| 417 | } | 429 | } |
| 418 | } | 430 | } |
| 419 | 431 | ||
| 432 | static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | ||
| 433 | { | ||
| 434 | unsigned long flags; | ||
| 435 | struct kretprobe_instance *ri; | ||
| 436 | struct hlist_node *pos, *next; | ||
| 437 | /* No race here */ | ||
| 438 | spin_lock_irqsave(&kretprobe_lock, flags); | ||
| 439 | hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { | ||
| 440 | ri->rp = NULL; | ||
| 441 | hlist_del(&ri->uflist); | ||
| 442 | } | ||
| 443 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
| 444 | free_rp_inst(rp); | ||
| 445 | } | ||
| 446 | |||
| 420 | /* | 447 | /* |
| 421 | * Keep all fields in the kprobe consistent | 448 | * Keep all fields in the kprobe consistent |
| 422 | */ | 449 | */ |
| @@ -492,9 +519,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 492 | 519 | ||
| 493 | static int __kprobes in_kprobes_functions(unsigned long addr) | 520 | static int __kprobes in_kprobes_functions(unsigned long addr) |
| 494 | { | 521 | { |
| 522 | struct kprobe_blackpoint *kb; | ||
| 523 | |||
| 495 | if (addr >= (unsigned long)__kprobes_text_start && | 524 | if (addr >= (unsigned long)__kprobes_text_start && |
| 496 | addr < (unsigned long)__kprobes_text_end) | 525 | addr < (unsigned long)__kprobes_text_end) |
| 497 | return -EINVAL; | 526 | return -EINVAL; |
| 527 | /* | ||
| 528 | * If there exists a kprobe_blacklist, verify and | ||
| 529 | * fail any probe registration in the prohibited area | ||
| 530 | */ | ||
| 531 | for (kb = kprobe_blacklist; kb->name != NULL; kb++) { | ||
| 532 | if (kb->start_addr) { | ||
| 533 | if (addr >= kb->start_addr && | ||
| 534 | addr < (kb->start_addr + kb->range)) | ||
| 535 | return -EINVAL; | ||
| 536 | } | ||
| 537 | } | ||
| 498 | return 0; | 538 | return 0; |
| 499 | } | 539 | } |
| 500 | 540 | ||
| @@ -555,6 +595,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
| 555 | } | 595 | } |
| 556 | 596 | ||
| 557 | p->nmissed = 0; | 597 | p->nmissed = 0; |
| 598 | INIT_LIST_HEAD(&p->list); | ||
| 558 | mutex_lock(&kprobe_mutex); | 599 | mutex_lock(&kprobe_mutex); |
| 559 | old_p = get_kprobe(p->addr); | 600 | old_p = get_kprobe(p->addr); |
| 560 | if (old_p) { | 601 | if (old_p) { |
| @@ -581,35 +622,28 @@ out: | |||
| 581 | return ret; | 622 | return ret; |
| 582 | } | 623 | } |
| 583 | 624 | ||
| 584 | int __kprobes register_kprobe(struct kprobe *p) | 625 | /* |
| 585 | { | 626 | * Unregister a kprobe without a scheduler synchronization. |
| 586 | return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); | 627 | */ |
| 587 | } | 628 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) |
| 588 | |||
| 589 | void __kprobes unregister_kprobe(struct kprobe *p) | ||
| 590 | { | 629 | { |
| 591 | struct module *mod; | ||
| 592 | struct kprobe *old_p, *list_p; | 630 | struct kprobe *old_p, *list_p; |
| 593 | int cleanup_p; | ||
| 594 | 631 | ||
| 595 | mutex_lock(&kprobe_mutex); | ||
| 596 | old_p = get_kprobe(p->addr); | 632 | old_p = get_kprobe(p->addr); |
| 597 | if (unlikely(!old_p)) { | 633 | if (unlikely(!old_p)) |
| 598 | mutex_unlock(&kprobe_mutex); | 634 | return -EINVAL; |
| 599 | return; | 635 | |
| 600 | } | ||
| 601 | if (p != old_p) { | 636 | if (p != old_p) { |
| 602 | list_for_each_entry_rcu(list_p, &old_p->list, list) | 637 | list_for_each_entry_rcu(list_p, &old_p->list, list) |
| 603 | if (list_p == p) | 638 | if (list_p == p) |
| 604 | /* kprobe p is a valid probe */ | 639 | /* kprobe p is a valid probe */ |
| 605 | goto valid_p; | 640 | goto valid_p; |
| 606 | mutex_unlock(&kprobe_mutex); | 641 | return -EINVAL; |
| 607 | return; | ||
| 608 | } | 642 | } |
| 609 | valid_p: | 643 | valid_p: |
| 610 | if (old_p == p || | 644 | if (old_p == p || |
| 611 | (old_p->pre_handler == aggr_pre_handler && | 645 | (old_p->pre_handler == aggr_pre_handler && |
| 612 | p->list.next == &old_p->list && p->list.prev == &old_p->list)) { | 646 | list_is_singular(&old_p->list))) { |
| 613 | /* | 647 | /* |
| 614 | * Only probe on the hash list. Disarm only if kprobes are | 648 | * Only probe on the hash list. Disarm only if kprobes are |
| 615 | * enabled - otherwise, the breakpoint would already have | 649 | * enabled - otherwise, the breakpoint would already have |
| @@ -618,43 +652,97 @@ valid_p: | |||
| 618 | if (kprobe_enabled) | 652 | if (kprobe_enabled) |
| 619 | arch_disarm_kprobe(p); | 653 | arch_disarm_kprobe(p); |
| 620 | hlist_del_rcu(&old_p->hlist); | 654 | hlist_del_rcu(&old_p->hlist); |
| 621 | cleanup_p = 1; | ||
| 622 | } else { | 655 | } else { |
| 656 | if (p->break_handler) | ||
| 657 | old_p->break_handler = NULL; | ||
| 658 | if (p->post_handler) { | ||
| 659 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | ||
| 660 | if ((list_p != p) && (list_p->post_handler)) | ||
| 661 | goto noclean; | ||
| 662 | } | ||
| 663 | old_p->post_handler = NULL; | ||
| 664 | } | ||
| 665 | noclean: | ||
| 623 | list_del_rcu(&p->list); | 666 | list_del_rcu(&p->list); |
| 624 | cleanup_p = 0; | ||
| 625 | } | 667 | } |
| 668 | return 0; | ||
| 669 | } | ||
| 626 | 670 | ||
| 627 | mutex_unlock(&kprobe_mutex); | 671 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) |
| 672 | { | ||
| 673 | struct module *mod; | ||
| 674 | struct kprobe *old_p; | ||
| 628 | 675 | ||
| 629 | synchronize_sched(); | ||
| 630 | if (p->mod_refcounted) { | 676 | if (p->mod_refcounted) { |
| 631 | mod = module_text_address((unsigned long)p->addr); | 677 | mod = module_text_address((unsigned long)p->addr); |
| 632 | if (mod) | 678 | if (mod) |
| 633 | module_put(mod); | 679 | module_put(mod); |
| 634 | } | 680 | } |
| 635 | 681 | ||
| 636 | if (cleanup_p) { | 682 | if (list_empty(&p->list) || list_is_singular(&p->list)) { |
| 637 | if (p != old_p) { | 683 | if (!list_empty(&p->list)) { |
| 638 | list_del_rcu(&p->list); | 684 | /* "p" is the last child of an aggr_kprobe */ |
| 685 | old_p = list_entry(p->list.next, struct kprobe, list); | ||
| 686 | list_del(&p->list); | ||
| 639 | kfree(old_p); | 687 | kfree(old_p); |
| 640 | } | 688 | } |
| 641 | arch_remove_kprobe(p); | 689 | arch_remove_kprobe(p); |
| 642 | } else { | 690 | } |
| 643 | mutex_lock(&kprobe_mutex); | 691 | } |
| 644 | if (p->break_handler) | 692 | |
| 645 | old_p->break_handler = NULL; | 693 | static int __register_kprobes(struct kprobe **kps, int num, |
| 646 | if (p->post_handler){ | 694 | unsigned long called_from) |
| 647 | list_for_each_entry_rcu(list_p, &old_p->list, list){ | 695 | { |
| 648 | if (list_p->post_handler){ | 696 | int i, ret = 0; |
| 649 | cleanup_p = 2; | 697 | |
| 650 | break; | 698 | if (num <= 0) |
| 651 | } | 699 | return -EINVAL; |
| 652 | } | 700 | for (i = 0; i < num; i++) { |
| 653 | if (cleanup_p == 0) | 701 | ret = __register_kprobe(kps[i], called_from); |
| 654 | old_p->post_handler = NULL; | 702 | if (ret < 0 && i > 0) { |
| 703 | unregister_kprobes(kps, i); | ||
| 704 | break; | ||
| 655 | } | 705 | } |
| 656 | mutex_unlock(&kprobe_mutex); | ||
| 657 | } | 706 | } |
| 707 | return ret; | ||
| 708 | } | ||
| 709 | |||
| 710 | /* | ||
| 711 | * Registration and unregistration functions for kprobe. | ||
| 712 | */ | ||
| 713 | int __kprobes register_kprobe(struct kprobe *p) | ||
| 714 | { | ||
| 715 | return __register_kprobes(&p, 1, | ||
| 716 | (unsigned long)__builtin_return_address(0)); | ||
| 717 | } | ||
| 718 | |||
| 719 | void __kprobes unregister_kprobe(struct kprobe *p) | ||
| 720 | { | ||
| 721 | unregister_kprobes(&p, 1); | ||
| 722 | } | ||
| 723 | |||
| 724 | int __kprobes register_kprobes(struct kprobe **kps, int num) | ||
| 725 | { | ||
| 726 | return __register_kprobes(kps, num, | ||
| 727 | (unsigned long)__builtin_return_address(0)); | ||
| 728 | } | ||
| 729 | |||
| 730 | void __kprobes unregister_kprobes(struct kprobe **kps, int num) | ||
| 731 | { | ||
| 732 | int i; | ||
| 733 | |||
| 734 | if (num <= 0) | ||
| 735 | return; | ||
| 736 | mutex_lock(&kprobe_mutex); | ||
| 737 | for (i = 0; i < num; i++) | ||
| 738 | if (__unregister_kprobe_top(kps[i]) < 0) | ||
| 739 | kps[i]->addr = NULL; | ||
| 740 | mutex_unlock(&kprobe_mutex); | ||
| 741 | |||
| 742 | synchronize_sched(); | ||
| 743 | for (i = 0; i < num; i++) | ||
| 744 | if (kps[i]->addr) | ||
| 745 | __unregister_kprobe_bottom(kps[i]); | ||
| 658 | } | 746 | } |
| 659 | 747 | ||
| 660 | static struct notifier_block kprobe_exceptions_nb = { | 748 | static struct notifier_block kprobe_exceptions_nb = { |
| @@ -667,24 +755,69 @@ unsigned long __weak arch_deref_entry_point(void *entry) | |||
| 667 | return (unsigned long)entry; | 755 | return (unsigned long)entry; |
| 668 | } | 756 | } |
| 669 | 757 | ||
| 670 | int __kprobes register_jprobe(struct jprobe *jp) | 758 | static int __register_jprobes(struct jprobe **jps, int num, |
| 759 | unsigned long called_from) | ||
| 671 | { | 760 | { |
| 672 | unsigned long addr = arch_deref_entry_point(jp->entry); | 761 | struct jprobe *jp; |
| 762 | int ret = 0, i; | ||
| 673 | 763 | ||
| 674 | if (!kernel_text_address(addr)) | 764 | if (num <= 0) |
| 675 | return -EINVAL; | 765 | return -EINVAL; |
| 766 | for (i = 0; i < num; i++) { | ||
| 767 | unsigned long addr; | ||
| 768 | jp = jps[i]; | ||
| 769 | addr = arch_deref_entry_point(jp->entry); | ||
| 770 | |||
| 771 | if (!kernel_text_address(addr)) | ||
| 772 | ret = -EINVAL; | ||
| 773 | else { | ||
| 774 | /* Todo: Verify probepoint is a function entry point */ | ||
| 775 | jp->kp.pre_handler = setjmp_pre_handler; | ||
| 776 | jp->kp.break_handler = longjmp_break_handler; | ||
| 777 | ret = __register_kprobe(&jp->kp, called_from); | ||
| 778 | } | ||
| 779 | if (ret < 0 && i > 0) { | ||
| 780 | unregister_jprobes(jps, i); | ||
| 781 | break; | ||
| 782 | } | ||
| 783 | } | ||
| 784 | return ret; | ||
| 785 | } | ||
| 676 | 786 | ||
| 677 | /* Todo: Verify probepoint is a function entry point */ | 787 | int __kprobes register_jprobe(struct jprobe *jp) |
| 678 | jp->kp.pre_handler = setjmp_pre_handler; | 788 | { |
| 679 | jp->kp.break_handler = longjmp_break_handler; | 789 | return __register_jprobes(&jp, 1, |
| 680 | |||
| 681 | return __register_kprobe(&jp->kp, | ||
| 682 | (unsigned long)__builtin_return_address(0)); | 790 | (unsigned long)__builtin_return_address(0)); |
| 683 | } | 791 | } |
| 684 | 792 | ||
| 685 | void __kprobes unregister_jprobe(struct jprobe *jp) | 793 | void __kprobes unregister_jprobe(struct jprobe *jp) |
| 686 | { | 794 | { |
| 687 | unregister_kprobe(&jp->kp); | 795 | unregister_jprobes(&jp, 1); |
| 796 | } | ||
| 797 | |||
| 798 | int __kprobes register_jprobes(struct jprobe **jps, int num) | ||
| 799 | { | ||
| 800 | return __register_jprobes(jps, num, | ||
| 801 | (unsigned long)__builtin_return_address(0)); | ||
| 802 | } | ||
| 803 | |||
| 804 | void __kprobes unregister_jprobes(struct jprobe **jps, int num) | ||
| 805 | { | ||
| 806 | int i; | ||
| 807 | |||
| 808 | if (num <= 0) | ||
| 809 | return; | ||
| 810 | mutex_lock(&kprobe_mutex); | ||
| 811 | for (i = 0; i < num; i++) | ||
| 812 | if (__unregister_kprobe_top(&jps[i]->kp) < 0) | ||
| 813 | jps[i]->kp.addr = NULL; | ||
| 814 | mutex_unlock(&kprobe_mutex); | ||
| 815 | |||
| 816 | synchronize_sched(); | ||
| 817 | for (i = 0; i < num; i++) { | ||
| 818 | if (jps[i]->kp.addr) | ||
| 819 | __unregister_kprobe_bottom(&jps[i]->kp); | ||
| 820 | } | ||
| 688 | } | 821 | } |
| 689 | 822 | ||
| 690 | #ifdef CONFIG_KRETPROBES | 823 | #ifdef CONFIG_KRETPROBES |
| @@ -725,7 +858,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
| 725 | return 0; | 858 | return 0; |
| 726 | } | 859 | } |
| 727 | 860 | ||
| 728 | int __kprobes register_kretprobe(struct kretprobe *rp) | 861 | static int __kprobes __register_kretprobe(struct kretprobe *rp, |
| 862 | unsigned long called_from) | ||
| 729 | { | 863 | { |
| 730 | int ret = 0; | 864 | int ret = 0; |
| 731 | struct kretprobe_instance *inst; | 865 | struct kretprobe_instance *inst; |
| @@ -771,46 +905,101 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
| 771 | 905 | ||
| 772 | rp->nmissed = 0; | 906 | rp->nmissed = 0; |
| 773 | /* Establish function entry probe point */ | 907 | /* Establish function entry probe point */ |
| 774 | if ((ret = __register_kprobe(&rp->kp, | 908 | ret = __register_kprobe(&rp->kp, called_from); |
| 775 | (unsigned long)__builtin_return_address(0))) != 0) | 909 | if (ret != 0) |
| 776 | free_rp_inst(rp); | 910 | free_rp_inst(rp); |
| 777 | return ret; | 911 | return ret; |
| 778 | } | 912 | } |
| 779 | 913 | ||
| 914 | static int __register_kretprobes(struct kretprobe **rps, int num, | ||
| 915 | unsigned long called_from) | ||
| 916 | { | ||
| 917 | int ret = 0, i; | ||
| 918 | |||
| 919 | if (num <= 0) | ||
| 920 | return -EINVAL; | ||
| 921 | for (i = 0; i < num; i++) { | ||
| 922 | ret = __register_kretprobe(rps[i], called_from); | ||
| 923 | if (ret < 0 && i > 0) { | ||
| 924 | unregister_kretprobes(rps, i); | ||
| 925 | break; | ||
| 926 | } | ||
| 927 | } | ||
| 928 | return ret; | ||
| 929 | } | ||
| 930 | |||
| 931 | int __kprobes register_kretprobe(struct kretprobe *rp) | ||
| 932 | { | ||
| 933 | return __register_kretprobes(&rp, 1, | ||
| 934 | (unsigned long)__builtin_return_address(0)); | ||
| 935 | } | ||
| 936 | |||
| 937 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | ||
| 938 | { | ||
| 939 | unregister_kretprobes(&rp, 1); | ||
| 940 | } | ||
| 941 | |||
| 942 | int __kprobes register_kretprobes(struct kretprobe **rps, int num) | ||
| 943 | { | ||
| 944 | return __register_kretprobes(rps, num, | ||
| 945 | (unsigned long)__builtin_return_address(0)); | ||
| 946 | } | ||
| 947 | |||
| 948 | void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) | ||
| 949 | { | ||
| 950 | int i; | ||
| 951 | |||
| 952 | if (num <= 0) | ||
| 953 | return; | ||
| 954 | mutex_lock(&kprobe_mutex); | ||
| 955 | for (i = 0; i < num; i++) | ||
| 956 | if (__unregister_kprobe_top(&rps[i]->kp) < 0) | ||
| 957 | rps[i]->kp.addr = NULL; | ||
| 958 | mutex_unlock(&kprobe_mutex); | ||
| 959 | |||
| 960 | synchronize_sched(); | ||
| 961 | for (i = 0; i < num; i++) { | ||
| 962 | if (rps[i]->kp.addr) { | ||
| 963 | __unregister_kprobe_bottom(&rps[i]->kp); | ||
| 964 | cleanup_rp_inst(rps[i]); | ||
| 965 | } | ||
| 966 | } | ||
| 967 | } | ||
| 968 | |||
| 780 | #else /* CONFIG_KRETPROBES */ | 969 | #else /* CONFIG_KRETPROBES */ |
| 781 | int __kprobes register_kretprobe(struct kretprobe *rp) | 970 | int __kprobes register_kretprobe(struct kretprobe *rp) |
| 782 | { | 971 | { |
| 783 | return -ENOSYS; | 972 | return -ENOSYS; |
| 784 | } | 973 | } |
| 785 | 974 | ||
| 786 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, | 975 | int __kprobes register_kretprobes(struct kretprobe **rps, int num) |
| 787 | struct pt_regs *regs) | ||
| 788 | { | 976 | { |
| 789 | return 0; | 977 | return -ENOSYS; |
| 790 | } | 978 | } |
| 791 | #endif /* CONFIG_KRETPROBES */ | ||
| 792 | |||
| 793 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | 979 | void __kprobes unregister_kretprobe(struct kretprobe *rp) |
| 794 | { | 980 | { |
| 795 | unsigned long flags; | 981 | } |
| 796 | struct kretprobe_instance *ri; | ||
| 797 | struct hlist_node *pos, *next; | ||
| 798 | 982 | ||
| 799 | unregister_kprobe(&rp->kp); | 983 | void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) |
| 984 | { | ||
| 985 | } | ||
| 800 | 986 | ||
| 801 | /* No race here */ | 987 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, |
| 802 | spin_lock_irqsave(&kretprobe_lock, flags); | 988 | struct pt_regs *regs) |
| 803 | hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { | 989 | { |
| 804 | ri->rp = NULL; | 990 | return 0; |
| 805 | hlist_del(&ri->uflist); | ||
| 806 | } | ||
| 807 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
| 808 | free_rp_inst(rp); | ||
| 809 | } | 991 | } |
| 810 | 992 | ||
| 993 | #endif /* CONFIG_KRETPROBES */ | ||
| 994 | |||
| 811 | static int __init init_kprobes(void) | 995 | static int __init init_kprobes(void) |
| 812 | { | 996 | { |
| 813 | int i, err = 0; | 997 | int i, err = 0; |
| 998 | unsigned long offset = 0, size = 0; | ||
| 999 | char *modname, namebuf[128]; | ||
| 1000 | const char *symbol_name; | ||
| 1001 | void *addr; | ||
| 1002 | struct kprobe_blackpoint *kb; | ||
| 814 | 1003 | ||
| 815 | /* FIXME allocate the probe table, currently defined statically */ | 1004 | /* FIXME allocate the probe table, currently defined statically */ |
| 816 | /* initialize all list heads */ | 1005 | /* initialize all list heads */ |
| @@ -819,6 +1008,28 @@ static int __init init_kprobes(void) | |||
| 819 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 1008 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
| 820 | } | 1009 | } |
| 821 | 1010 | ||
| 1011 | /* | ||
| 1012 | * Lookup and populate the kprobe_blacklist. | ||
| 1013 | * | ||
| 1014 | * Unlike the kretprobe blacklist, we'll need to determine | ||
| 1015 | * the range of addresses that belong to the said functions, | ||
| 1016 | * since a kprobe need not necessarily be at the beginning | ||
| 1017 | * of a function. | ||
| 1018 | */ | ||
| 1019 | for (kb = kprobe_blacklist; kb->name != NULL; kb++) { | ||
| 1020 | kprobe_lookup_name(kb->name, addr); | ||
| 1021 | if (!addr) | ||
| 1022 | continue; | ||
| 1023 | |||
| 1024 | kb->start_addr = (unsigned long)addr; | ||
| 1025 | symbol_name = kallsyms_lookup(kb->start_addr, | ||
| 1026 | &size, &offset, &modname, namebuf); | ||
| 1027 | if (!symbol_name) | ||
| 1028 | kb->range = 0; | ||
| 1029 | else | ||
| 1030 | kb->range = size; | ||
| 1031 | } | ||
| 1032 | |||
| 822 | if (kretprobe_blacklist_size) { | 1033 | if (kretprobe_blacklist_size) { |
| 823 | /* lookup the function address from its name */ | 1034 | /* lookup the function address from its name */ |
| 824 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { | 1035 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { |
| @@ -1066,8 +1277,12 @@ module_init(init_kprobes); | |||
| 1066 | 1277 | ||
| 1067 | EXPORT_SYMBOL_GPL(register_kprobe); | 1278 | EXPORT_SYMBOL_GPL(register_kprobe); |
| 1068 | EXPORT_SYMBOL_GPL(unregister_kprobe); | 1279 | EXPORT_SYMBOL_GPL(unregister_kprobe); |
| 1280 | EXPORT_SYMBOL_GPL(register_kprobes); | ||
| 1281 | EXPORT_SYMBOL_GPL(unregister_kprobes); | ||
| 1069 | EXPORT_SYMBOL_GPL(register_jprobe); | 1282 | EXPORT_SYMBOL_GPL(register_jprobe); |
| 1070 | EXPORT_SYMBOL_GPL(unregister_jprobe); | 1283 | EXPORT_SYMBOL_GPL(unregister_jprobe); |
| 1284 | EXPORT_SYMBOL_GPL(register_jprobes); | ||
| 1285 | EXPORT_SYMBOL_GPL(unregister_jprobes); | ||
| 1071 | #ifdef CONFIG_KPROBES | 1286 | #ifdef CONFIG_KPROBES |
| 1072 | EXPORT_SYMBOL_GPL(jprobe_return); | 1287 | EXPORT_SYMBOL_GPL(jprobe_return); |
| 1073 | #endif | 1288 | #endif |
| @@ -1075,4 +1290,6 @@ EXPORT_SYMBOL_GPL(jprobe_return); | |||
| 1075 | #ifdef CONFIG_KPROBES | 1290 | #ifdef CONFIG_KPROBES |
| 1076 | EXPORT_SYMBOL_GPL(register_kretprobe); | 1291 | EXPORT_SYMBOL_GPL(register_kretprobe); |
| 1077 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | 1292 | EXPORT_SYMBOL_GPL(unregister_kretprobe); |
| 1293 | EXPORT_SYMBOL_GPL(register_kretprobes); | ||
| 1294 | EXPORT_SYMBOL_GPL(unregister_kretprobes); | ||
| 1078 | #endif | 1295 | #endif |
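The batch interfaces added above take an array of kprobe pointers, so a caller with many probes pays the synchronize_sched() in unregister_kprobes() once rather than once per probe. A minimal sketch of the intended usage; the probed symbols, handler body, and module boilerplate are illustrative, and name-based resolution via the kprobe .symbol_name field is assumed:

/* Illustrative only: register and unregister two probes as a batch. */
#include <linux/module.h>
#include <linux/kprobes.h>

static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	/* runs just before the probed instruction executes */
	return 0;
}

static struct kprobe kp1 = {
	.symbol_name	= "do_fork",		/* hypothetical probe point */
	.pre_handler	= my_pre_handler,
};

static struct kprobe kp2 = {
	.symbol_name	= "do_exit",		/* hypothetical probe point */
	.pre_handler	= my_pre_handler,
};

static struct kprobe *my_probes[] = { &kp1, &kp2 };

static int __init batch_probe_init(void)
{
	return register_kprobes(my_probes, ARRAY_SIZE(my_probes));
}

static void __exit batch_probe_exit(void)
{
	unregister_kprobes(my_probes, ARRAY_SIZE(my_probes));
}

module_init(batch_probe_init);
module_exit(batch_probe_exit);
MODULE_LICENSE("GPL");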
diff --git a/kernel/kthread.c b/kernel/kthread.c index 0ac887882f90..bd1b9ea024e1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -13,7 +13,6 @@ | |||
| 13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
| 16 | #include <asm/semaphore.h> | ||
| 17 | 16 | ||
| 18 | #define KTHREAD_NICE_LEVEL (-5) | 17 | #define KTHREAD_NICE_LEVEL (-5) |
| 19 | 18 | ||
| @@ -99,7 +98,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
| 99 | struct sched_param param = { .sched_priority = 0 }; | 98 | struct sched_param param = { .sched_priority = 0 }; |
| 100 | wait_for_completion(&create->started); | 99 | wait_for_completion(&create->started); |
| 101 | read_lock(&tasklist_lock); | 100 | read_lock(&tasklist_lock); |
| 102 | create->result = find_task_by_pid(pid); | 101 | create->result = find_task_by_pid_ns(pid, &init_pid_ns); |
| 103 | read_unlock(&tasklist_lock); | 102 | read_unlock(&tasklist_lock); |
| 104 | /* | 103 | /* |
| 105 | * root may have changed our (kthreadd's) priority or CPU mask. | 104 | * root may have changed our (kthreadd's) priority or CPU mask. |
| @@ -145,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
| 145 | 144 | ||
| 146 | spin_lock(&kthread_create_lock); | 145 | spin_lock(&kthread_create_lock); |
| 147 | list_add_tail(&create.list, &kthread_create_list); | 146 | list_add_tail(&create.list, &kthread_create_list); |
| 148 | wake_up_process(kthreadd_task); | ||
| 149 | spin_unlock(&kthread_create_lock); | 147 | spin_unlock(&kthread_create_lock); |
| 150 | 148 | ||
| 149 | wake_up_process(kthreadd_task); | ||
| 151 | wait_for_completion(&create.done); | 150 | wait_for_completion(&create.done); |
| 152 | 151 | ||
| 153 | if (!IS_ERR(create.result)) { | 152 | if (!IS_ERR(create.result)) { |
| @@ -180,6 +179,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
| 180 | wait_task_inactive(k); | 179 | wait_task_inactive(k); |
| 181 | set_task_cpu(k, cpu); | 180 | set_task_cpu(k, cpu); |
| 182 | k->cpus_allowed = cpumask_of_cpu(cpu); | 181 | k->cpus_allowed = cpumask_of_cpu(cpu); |
| 182 | k->rt.nr_cpus_allowed = 1; | ||
| 183 | } | 183 | } |
| 184 | EXPORT_SYMBOL(kthread_bind); | 184 | EXPORT_SYMBOL(kthread_bind); |
| 185 | 185 | ||
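kthread_bind() now also sets rt.nr_cpus_allowed so the RT scheduler sees the single-CPU affinity it just established; the usual create/bind/wake calling pattern is unchanged. A sketch with a made-up worker function:

/* Illustrative only: create a kthread, bind it to a CPU, then start it. */
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();		/* real work would go here */
	}
	return 0;
}

static struct task_struct *start_worker(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_worker_fn, NULL, "my_worker/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind(tsk, cpu);		/* must happen before the first wakeup */
	wake_up_process(tsk);
	return tsk;
}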
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b4e3c85abe74..5e7b45c56923 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
| @@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record | |||
| 64 | return; | 64 | return; |
| 65 | 65 | ||
| 66 | for (i = 0; i < MAXLR; i++) { | 66 | for (i = 0; i < MAXLR; i++) { |
| 67 | int q; | 67 | int q, same = 1; |
| 68 | int same = 1; | 68 | |
| 69 | /* Nothing stored: */ | 69 | /* Nothing stored: */ |
| 70 | if (!latency_record[i].backtrace[0]) { | 70 | if (!latency_record[i].backtrace[0]) { |
| 71 | if (firstnonnull > i) | 71 | if (firstnonnull > i) |
| @@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record | |||
| 73 | continue; | 73 | continue; |
| 74 | } | 74 | } |
| 75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | 75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { |
| 76 | if (latency_record[i].backtrace[q] != | 76 | unsigned long record = lat->backtrace[q]; |
| 77 | lat->backtrace[q]) | 77 | |
| 78 | if (latency_record[i].backtrace[q] != record) { | ||
| 78 | same = 0; | 79 | same = 0; |
| 79 | if (same && lat->backtrace[q] == 0) | ||
| 80 | break; | 80 | break; |
| 81 | if (same && lat->backtrace[q] == ULONG_MAX) | 81 | } |
| 82 | |||
| 83 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | ||
| 84 | if (record == 0 || record == ULONG_MAX) | ||
| 82 | break; | 85 | break; |
| 83 | } | 86 | } |
| 84 | if (same) { | 87 | if (same) { |
| @@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
| 143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | 146 | for (i = 0; i < LT_SAVECOUNT ; i++) { |
| 144 | struct latency_record *mylat; | 147 | struct latency_record *mylat; |
| 145 | int same = 1; | 148 | int same = 1; |
| 149 | |||
| 146 | mylat = &tsk->latency_record[i]; | 150 | mylat = &tsk->latency_record[i]; |
| 147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | 151 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { |
| 148 | if (mylat->backtrace[q] != | 152 | unsigned long record = lat.backtrace[q]; |
| 149 | lat.backtrace[q]) | 153 | |
| 154 | if (mylat->backtrace[q] != record) { | ||
| 150 | same = 0; | 155 | same = 0; |
| 151 | if (same && lat.backtrace[q] == 0) | ||
| 152 | break; | 156 | break; |
| 153 | if (same && lat.backtrace[q] == ULONG_MAX) | 157 | } |
| 158 | |||
| 159 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | ||
| 160 | if (record == 0 || record == ULONG_MAX) | ||
| 154 | break; | 161 | break; |
| 155 | } | 162 | } |
| 156 | if (same) { | 163 | if (same) { |
| @@ -226,14 +233,7 @@ static struct file_operations lstats_fops = { | |||
| 226 | 233 | ||
| 227 | static int __init init_lstats_procfs(void) | 234 | static int __init init_lstats_procfs(void) |
| 228 | { | 235 | { |
| 229 | struct proc_dir_entry *pe; | 236 | proc_create("latency_stats", 0644, NULL, &lstats_fops); |
| 230 | |||
| 231 | pe = create_proc_entry("latency_stats", 0644, NULL); | ||
| 232 | if (!pe) | ||
| 233 | return -ENOMEM; | ||
| 234 | |||
| 235 | pe->proc_fops = &lstats_fops; | ||
| 236 | |||
| 237 | return 0; | 237 | return 0; |
| 238 | } | 238 | } |
| 239 | __initcall(init_lstats_procfs); | 239 | __initcall(init_lstats_procfs); |
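This file and lockdep_proc.c below make the same conversion: the two-step create_proc_entry() plus proc_fops assignment becomes a single proc_create() call, which also closes the window where the entry exists without operations. A hedged sketch of the pattern with a hypothetical entry name:

/* Illustrative only: publishing a /proc file with proc_create(). */
#include <linux/fs.h>
#include <linux/proc_fs.h>

static int my_stats_open(struct inode *inode, struct file *file)
{
	return 0;	/* a real file would wire up seq_file here */
}

static const struct file_operations my_stats_fops = {
	.open = my_stats_open,
};

static int __init my_stats_init(void)
{
	/*
	 * Old style, now gone from latencytop.c and lockdep_proc.c:
	 *	pe = create_proc_entry("my_stats", 0444, NULL);
	 *	if (pe)
	 *		pe->proc_fops = &my_stats_fops;
	 */
	proc_create("my_stats", 0444, NULL, &my_stats_fops);
	return 0;
}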
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 8a135bd163c2..dc5d29648d85 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
| @@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = { | |||
| 660 | 660 | ||
| 661 | static int __init lockdep_proc_init(void) | 661 | static int __init lockdep_proc_init(void) |
| 662 | { | 662 | { |
| 663 | struct proc_dir_entry *entry; | 663 | proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); |
| 664 | 664 | proc_create("lockdep_stats", S_IRUSR, NULL, | |
| 665 | entry = create_proc_entry("lockdep", S_IRUSR, NULL); | 665 | &proc_lockdep_stats_operations); |
| 666 | if (entry) | ||
| 667 | entry->proc_fops = &proc_lockdep_operations; | ||
| 668 | |||
| 669 | entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL); | ||
| 670 | if (entry) | ||
| 671 | entry->proc_fops = &proc_lockdep_stats_operations; | ||
| 672 | 666 | ||
| 673 | #ifdef CONFIG_LOCK_STAT | 667 | #ifdef CONFIG_LOCK_STAT |
| 674 | entry = create_proc_entry("lock_stat", S_IRUSR, NULL); | 668 | proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); |
| 675 | if (entry) | ||
| 676 | entry->proc_fops = &proc_lock_stat_operations; | ||
| 677 | #endif | 669 | #endif |
| 678 | 670 | ||
| 679 | return 0; | 671 | return 0; |
diff --git a/kernel/marker.c b/kernel/marker.c index 041c33e3e95c..b5a9fe1d50d5 100644 --- a/kernel/marker.c +++ b/kernel/marker.c | |||
| @@ -23,12 +23,13 @@ | |||
| 23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
| 24 | #include <linux/marker.h> | 24 | #include <linux/marker.h> |
| 25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
| 26 | #include <linux/slab.h> | ||
| 26 | 27 | ||
| 27 | extern struct marker __start___markers[]; | 28 | extern struct marker __start___markers[]; |
| 28 | extern struct marker __stop___markers[]; | 29 | extern struct marker __stop___markers[]; |
| 29 | 30 | ||
| 30 | /* Set to 1 to enable marker debug output */ | 31 | /* Set to 1 to enable marker debug output */ |
| 31 | const int marker_debug; | 32 | static const int marker_debug; |
| 32 | 33 | ||
| 33 | /* | 34 | /* |
| 34 | * markers_mutex nests inside module_mutex. Markers mutex protects the builtin | 35 | * markers_mutex nests inside module_mutex. Markers mutex protects the builtin |
| @@ -671,6 +672,9 @@ int marker_probe_register(const char *name, const char *format, | |||
| 671 | entry->rcu_pending = 1; | 672 | entry->rcu_pending = 1; |
| 672 | /* write rcu_pending before calling the RCU callback */ | 673 | /* write rcu_pending before calling the RCU callback */ |
| 673 | smp_wmb(); | 674 | smp_wmb(); |
| 675 | #ifdef CONFIG_PREEMPT_RCU | ||
| 676 | synchronize_sched(); /* Until we have the call_rcu_sched() */ | ||
| 677 | #endif | ||
| 674 | call_rcu(&entry->rcu, free_old_closure); | 678 | call_rcu(&entry->rcu, free_old_closure); |
| 675 | end: | 679 | end: |
| 676 | mutex_unlock(&markers_mutex); | 680 | mutex_unlock(&markers_mutex); |
| @@ -714,6 +718,9 @@ int marker_probe_unregister(const char *name, | |||
| 714 | entry->rcu_pending = 1; | 718 | entry->rcu_pending = 1; |
| 715 | /* write rcu_pending before calling the RCU callback */ | 719 | /* write rcu_pending before calling the RCU callback */ |
| 716 | smp_wmb(); | 720 | smp_wmb(); |
| 721 | #ifdef CONFIG_PREEMPT_RCU | ||
| 722 | synchronize_sched(); /* Until we have the call_rcu_sched() */ | ||
| 723 | #endif | ||
| 717 | call_rcu(&entry->rcu, free_old_closure); | 724 | call_rcu(&entry->rcu, free_old_closure); |
| 718 | remove_marker(name); /* Ignore busy error message */ | 725 | remove_marker(name); /* Ignore busy error message */ |
| 719 | ret = 0; | 726 | ret = 0; |
| @@ -792,6 +799,9 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, | |||
| 792 | entry->rcu_pending = 1; | 799 | entry->rcu_pending = 1; |
| 793 | /* write rcu_pending before calling the RCU callback */ | 800 | /* write rcu_pending before calling the RCU callback */ |
| 794 | smp_wmb(); | 801 | smp_wmb(); |
| 802 | #ifdef CONFIG_PREEMPT_RCU | ||
| 803 | synchronize_sched(); /* Until we have the call_rcu_sched() */ | ||
| 804 | #endif | ||
| 795 | call_rcu(&entry->rcu, free_old_closure); | 805 | call_rcu(&entry->rcu, free_old_closure); |
| 796 | remove_marker(entry->name); /* Ignore busy error message */ | 806 | remove_marker(entry->name); /* Ignore busy error message */ |
| 797 | end: | 807 | end: |
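
[Editor's note] The three marker.c hunks insert synchronize_sched() ahead of call_rcu() when CONFIG_PREEMPT_RCU is set: the marker fast path runs its probes with preemption disabled, and preemptible RCU's call_rcu() does not wait for such sections, so an explicit sched grace period is needed until call_rcu_sched() exists. A hedged sketch of the same update-side pattern on an invented pointer swap (struct and function names are not from the patch):

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct closure {
		void (*fn)(void *data);
		void *data;
		struct rcu_head rcu;
	};

	static struct closure *active_closure;

	static void free_old_closure_cb(struct rcu_head *head)
	{
		kfree(container_of(head, struct closure, rcu));
	}

	static void swap_closure(struct closure *new)
	{
		struct closure *old = active_closure;

		rcu_assign_pointer(active_closure, new);
	#ifdef CONFIG_PREEMPT_RCU
		/*
		 * Readers run under preempt_disable(); with preemptible RCU,
		 * call_rcu() alone would not wait for them, so force a
		 * sched grace period first (until call_rcu_sched() exists).
		 */
		synchronize_sched();
	#endif
		if (old)
			call_rcu(&old->rcu, free_old_closure_cb);
	}
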
diff --git a/kernel/module.c b/kernel/module.c index 5d437bffd8dc..8674a390a2e8 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -43,7 +43,6 @@ | |||
| 43 | #include <linux/mutex.h> | 43 | #include <linux/mutex.h> |
| 44 | #include <linux/unwind.h> | 44 | #include <linux/unwind.h> |
| 45 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
| 46 | #include <asm/semaphore.h> | ||
| 47 | #include <asm/cacheflush.h> | 46 | #include <asm/cacheflush.h> |
| 48 | #include <linux/license.h> | 47 | #include <linux/license.h> |
| 49 | #include <asm/sections.h> | 48 | #include <asm/sections.h> |
| @@ -165,131 +164,140 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
| 165 | return NULL; | 164 | return NULL; |
| 166 | } | 165 | } |
| 167 | 166 | ||
| 168 | static void printk_unused_warning(const char *name) | 167 | static bool always_ok(bool gplok, bool warn, const char *name) |
| 169 | { | 168 | { |
| 170 | printk(KERN_WARNING "Symbol %s is marked as UNUSED, " | 169 | return true; |
| 171 | "however this module is using it.\n", name); | ||
| 172 | printk(KERN_WARNING "This symbol will go away in the future.\n"); | ||
| 173 | printk(KERN_WARNING "Please evalute if this is the right api to use, " | ||
| 174 | "and if it really is, submit a report the linux kernel " | ||
| 175 | "mailinglist together with submitting your code for " | ||
| 176 | "inclusion.\n"); | ||
| 177 | } | 170 | } |
| 178 | 171 | ||
| 179 | /* Find a symbol, return value, crc and module which owns it */ | 172 | static bool printk_unused_warning(bool gplok, bool warn, const char *name) |
| 180 | static unsigned long __find_symbol(const char *name, | ||
| 181 | struct module **owner, | ||
| 182 | const unsigned long **crc, | ||
| 183 | int gplok) | ||
| 184 | { | 173 | { |
| 185 | struct module *mod; | 174 | if (warn) { |
| 186 | const struct kernel_symbol *ks; | 175 | printk(KERN_WARNING "Symbol %s is marked as UNUSED, " |
| 187 | 176 | "however this module is using it.\n", name); | |
| 188 | /* Core kernel first. */ | 177 | printk(KERN_WARNING |
| 189 | *owner = NULL; | 178 | "This symbol will go away in the future.\n"); |
| 190 | ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); | 179 | printk(KERN_WARNING |
| 191 | if (ks) { | 180 | "Please evalute if this is the right api to use and if " |
| 192 | *crc = symversion(__start___kcrctab, (ks - __start___ksymtab)); | 181 | "it really is, submit a report the linux kernel " |
| 193 | return ks->value; | 182 | "mailinglist together with submitting your code for " |
| 183 | "inclusion.\n"); | ||
| 194 | } | 184 | } |
| 195 | if (gplok) { | 185 | return true; |
| 196 | ks = lookup_symbol(name, __start___ksymtab_gpl, | 186 | } |
| 197 | __stop___ksymtab_gpl); | 187 | |
| 198 | if (ks) { | 188 | static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name) |
| 199 | *crc = symversion(__start___kcrctab_gpl, | 189 | { |
| 200 | (ks - __start___ksymtab_gpl)); | 190 | if (!gplok) |
| 201 | return ks->value; | 191 | return false; |
| 202 | } | 192 | return printk_unused_warning(gplok, warn, name); |
| 203 | } | 193 | } |
| 204 | ks = lookup_symbol(name, __start___ksymtab_gpl_future, | 194 | |
| 205 | __stop___ksymtab_gpl_future); | 195 | static bool gpl_only(bool gplok, bool warn, const char *name) |
| 206 | if (ks) { | 196 | { |
| 207 | if (!gplok) { | 197 | return gplok; |
| 208 | printk(KERN_WARNING "Symbol %s is being used " | 198 | } |
| 209 | "by a non-GPL module, which will not " | 199 | |
| 210 | "be allowed in the future\n", name); | 200 | static bool warn_if_not_gpl(bool gplok, bool warn, const char *name) |
| 211 | printk(KERN_WARNING "Please see the file " | 201 | { |
| 212 | "Documentation/feature-removal-schedule.txt " | 202 | if (!gplok && warn) { |
| 213 | "in the kernel source tree for more " | 203 | printk(KERN_WARNING "Symbol %s is being used " |
| 214 | "details.\n"); | 204 | "by a non-GPL module, which will not " |
| 215 | } | 205 | "be allowed in the future\n", name); |
| 216 | *crc = symversion(__start___kcrctab_gpl_future, | 206 | printk(KERN_WARNING "Please see the file " |
| 217 | (ks - __start___ksymtab_gpl_future)); | 207 | "Documentation/feature-removal-schedule.txt " |
| 218 | return ks->value; | 208 | "in the kernel source tree for more details.\n"); |
| 219 | } | 209 | } |
| 210 | return true; | ||
| 211 | } | ||
| 220 | 212 | ||
| 221 | ks = lookup_symbol(name, __start___ksymtab_unused, | 213 | struct symsearch { |
| 222 | __stop___ksymtab_unused); | 214 | const struct kernel_symbol *start, *stop; |
| 223 | if (ks) { | 215 | const unsigned long *crcs; |
| 224 | printk_unused_warning(name); | 216 | bool (*check)(bool gplok, bool warn, const char *name); |
| 225 | *crc = symversion(__start___kcrctab_unused, | 217 | }; |
| 226 | (ks - __start___ksymtab_unused)); | 218 | |
| 227 | return ks->value; | 219 | /* Look through this array of symbol tables for a symbol match which |
| 220 | * passes the check function. */ | ||
| 221 | static const struct kernel_symbol *search_symarrays(const struct symsearch *arr, | ||
| 222 | unsigned int num, | ||
| 223 | const char *name, | ||
| 224 | bool gplok, | ||
| 225 | bool warn, | ||
| 226 | const unsigned long **crc) | ||
| 227 | { | ||
| 228 | unsigned int i; | ||
| 229 | const struct kernel_symbol *ks; | ||
| 230 | |||
| 231 | for (i = 0; i < num; i++) { | ||
| 232 | ks = lookup_symbol(name, arr[i].start, arr[i].stop); | ||
| 233 | if (!ks || !arr[i].check(gplok, warn, name)) | ||
| 234 | continue; | ||
| 235 | |||
| 236 | if (crc) | ||
| 237 | *crc = symversion(arr[i].crcs, ks - arr[i].start); | ||
| 238 | return ks; | ||
| 228 | } | 239 | } |
| 240 | return NULL; | ||
| 241 | } | ||
| 242 | |||
| 243 | /* Find a symbol, return value, (optional) crc and (optional) module | ||
| 244 | * which owns it */ | ||
| 245 | static unsigned long find_symbol(const char *name, | ||
| 246 | struct module **owner, | ||
| 247 | const unsigned long **crc, | ||
| 248 | bool gplok, | ||
| 249 | bool warn) | ||
| 250 | { | ||
| 251 | struct module *mod; | ||
| 252 | const struct kernel_symbol *ks; | ||
| 253 | const struct symsearch arr[] = { | ||
| 254 | { __start___ksymtab, __stop___ksymtab, __start___kcrctab, | ||
| 255 | always_ok }, | ||
| 256 | { __start___ksymtab_gpl, __stop___ksymtab_gpl, | ||
| 257 | __start___kcrctab_gpl, gpl_only }, | ||
| 258 | { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, | ||
| 259 | __start___kcrctab_gpl_future, warn_if_not_gpl }, | ||
| 260 | { __start___ksymtab_unused, __stop___ksymtab_unused, | ||
| 261 | __start___kcrctab_unused, printk_unused_warning }, | ||
| 262 | { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, | ||
| 263 | __start___kcrctab_unused_gpl, gpl_only_unused_warning }, | ||
| 264 | }; | ||
| 229 | 265 | ||
| 230 | if (gplok) | 266 | /* Core kernel first. */ |
| 231 | ks = lookup_symbol(name, __start___ksymtab_unused_gpl, | 267 | ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc); |
| 232 | __stop___ksymtab_unused_gpl); | ||
| 233 | if (ks) { | 268 | if (ks) { |
| 234 | printk_unused_warning(name); | 269 | if (owner) |
| 235 | *crc = symversion(__start___kcrctab_unused_gpl, | 270 | *owner = NULL; |
| 236 | (ks - __start___ksymtab_unused_gpl)); | ||
| 237 | return ks->value; | 271 | return ks->value; |
| 238 | } | 272 | } |
| 239 | 273 | ||
| 240 | /* Now try modules. */ | 274 | /* Now try modules. */ |
| 241 | list_for_each_entry(mod, &modules, list) { | 275 | list_for_each_entry(mod, &modules, list) { |
| 242 | *owner = mod; | 276 | struct symsearch arr[] = { |
| 243 | ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); | 277 | { mod->syms, mod->syms + mod->num_syms, mod->crcs, |
| 244 | if (ks) { | 278 | always_ok }, |
| 245 | *crc = symversion(mod->crcs, (ks - mod->syms)); | 279 | { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, |
| 246 | return ks->value; | 280 | mod->gpl_crcs, gpl_only }, |
| 247 | } | 281 | { mod->gpl_future_syms, |
| 248 | 282 | mod->gpl_future_syms + mod->num_gpl_future_syms, | |
| 249 | if (gplok) { | 283 | mod->gpl_future_crcs, warn_if_not_gpl }, |
| 250 | ks = lookup_symbol(name, mod->gpl_syms, | 284 | { mod->unused_syms, |
| 251 | mod->gpl_syms + mod->num_gpl_syms); | 285 | mod->unused_syms + mod->num_unused_syms, |
| 252 | if (ks) { | 286 | mod->unused_crcs, printk_unused_warning }, |
| 253 | *crc = symversion(mod->gpl_crcs, | 287 | { mod->unused_gpl_syms, |
| 254 | (ks - mod->gpl_syms)); | 288 | mod->unused_gpl_syms + mod->num_unused_gpl_syms, |
| 255 | return ks->value; | 289 | mod->unused_gpl_crcs, gpl_only_unused_warning }, |
| 256 | } | 290 | }; |
| 257 | } | 291 | |
| 258 | ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); | 292 | ks = search_symarrays(arr, ARRAY_SIZE(arr), |
| 293 | name, gplok, warn, crc); | ||
| 259 | if (ks) { | 294 | if (ks) { |
| 260 | printk_unused_warning(name); | 295 | if (owner) |
| 261 | *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); | 296 | *owner = mod; |
| 262 | return ks->value; | ||
| 263 | } | ||
| 264 | |||
| 265 | if (gplok) { | ||
| 266 | ks = lookup_symbol(name, mod->unused_gpl_syms, | ||
| 267 | mod->unused_gpl_syms + mod->num_unused_gpl_syms); | ||
| 268 | if (ks) { | ||
| 269 | printk_unused_warning(name); | ||
| 270 | *crc = symversion(mod->unused_gpl_crcs, | ||
| 271 | (ks - mod->unused_gpl_syms)); | ||
| 272 | return ks->value; | ||
| 273 | } | ||
| 274 | } | ||
| 275 | ks = lookup_symbol(name, mod->gpl_future_syms, | ||
| 276 | (mod->gpl_future_syms + | ||
| 277 | mod->num_gpl_future_syms)); | ||
| 278 | if (ks) { | ||
| 279 | if (!gplok) { | ||
| 280 | printk(KERN_WARNING "Symbol %s is being used " | ||
| 281 | "by a non-GPL module, which will not " | ||
| 282 | "be allowed in the future\n", name); | ||
| 283 | printk(KERN_WARNING "Please see the file " | ||
| 284 | "Documentation/feature-removal-schedule.txt " | ||
| 285 | "in the kernel source tree for more " | ||
| 286 | "details.\n"); | ||
| 287 | } | ||
| 288 | *crc = symversion(mod->gpl_future_crcs, | ||
| 289 | (ks - mod->gpl_future_syms)); | ||
| 290 | return ks->value; | 297 | return ks->value; |
| 291 | } | 298 | } |
| 292 | } | 299 | } |
| 300 | |||
| 293 | DEBUGP("Failed to find symbol %s\n", name); | 301 | DEBUGP("Failed to find symbol %s\n", name); |
| 294 | return -ENOENT; | 302 | return -ENOENT; |
| 295 | } | 303 | } |
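
[Editor's note] The rewrite above folds five near-identical export-table lookups into one loop over an array of struct symsearch entries, each pairing a symbol range with a check callback (always_ok, gpl_only, warn_if_not_gpl, ...). The same table-plus-predicate pattern, reduced to plain user-space C with made-up symbol tables, looks like this:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>
	#include <string.h>

	struct sym { const char *name; unsigned long value; };

	struct symsearch {
		const struct sym *start, *stop;
		bool (*check)(bool gplok, const char *name);
	};

	static bool always_ok(bool gplok, const char *name) { return true; }
	static bool gpl_only(bool gplok, const char *name) { return gplok; }

	/* Walk every table; a hit only counts if its check callback agrees. */
	static const struct sym *search(const struct symsearch *arr, size_t num,
					const char *name, bool gplok)
	{
		size_t i;
		const struct sym *s;

		for (i = 0; i < num; i++)
			for (s = arr[i].start; s < arr[i].stop; s++)
				if (strcmp(s->name, name) == 0 &&
				    arr[i].check(gplok, name))
					return s;
		return NULL;
	}

	int main(void)
	{
		static const struct sym plain[] = { { "foo", 1 } };
		static const struct sym gpl[]   = { { "bar", 2 } };
		const struct symsearch arr[] = {
			{ plain, plain + 1, always_ok },
			{ gpl,   gpl + 1,   gpl_only  },
		};
		const struct sym *s = search(arr, 2, "bar", false);

		printf("%s\n", s ? "found" : "rejected");  /* prints "rejected" */
		return 0;
	}

Adding a new export class then means adding one table entry, not another copy of the lookup code, which is the point of the kernel refactoring above.
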
| @@ -664,7 +672,7 @@ static void free_module(struct module *mod); | |||
| 664 | 672 | ||
| 665 | static void wait_for_zero_refcount(struct module *mod) | 673 | static void wait_for_zero_refcount(struct module *mod) |
| 666 | { | 674 | { |
| 667 | /* Since we might sleep for some time, drop the semaphore first */ | 675 | /* Since we might sleep for some time, release the mutex first */ |
| 668 | mutex_unlock(&module_mutex); | 676 | mutex_unlock(&module_mutex); |
| 669 | for (;;) { | 677 | for (;;) { |
| 670 | DEBUGP("Looking at refcount...\n"); | 678 | DEBUGP("Looking at refcount...\n"); |
| @@ -737,12 +745,13 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
| 737 | if (!forced && module_refcount(mod) != 0) | 745 | if (!forced && module_refcount(mod) != 0) |
| 738 | wait_for_zero_refcount(mod); | 746 | wait_for_zero_refcount(mod); |
| 739 | 747 | ||
| 748 | mutex_unlock(&module_mutex); | ||
| 740 | /* Final destruction now noone is using it. */ | 749 | /* Final destruction now noone is using it. */ |
| 741 | if (mod->exit != NULL) { | 750 | if (mod->exit != NULL) |
| 742 | mutex_unlock(&module_mutex); | ||
| 743 | mod->exit(); | 751 | mod->exit(); |
| 744 | mutex_lock(&module_mutex); | 752 | blocking_notifier_call_chain(&module_notify_list, |
| 745 | } | 753 | MODULE_STATE_GOING, mod); |
| 754 | mutex_lock(&module_mutex); | ||
| 746 | /* Store the name of the last unloaded module for diagnostic purposes */ | 755 | /* Store the name of the last unloaded module for diagnostic purposes */ |
| 747 | strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); | 756 | strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); |
| 748 | free_module(mod); | 757 | free_module(mod); |
| @@ -778,10 +787,9 @@ static void print_unload_info(struct seq_file *m, struct module *mod) | |||
| 778 | void __symbol_put(const char *symbol) | 787 | void __symbol_put(const char *symbol) |
| 779 | { | 788 | { |
| 780 | struct module *owner; | 789 | struct module *owner; |
| 781 | const unsigned long *crc; | ||
| 782 | 790 | ||
| 783 | preempt_disable(); | 791 | preempt_disable(); |
| 784 | if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1))) | 792 | if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false))) |
| 785 | BUG(); | 793 | BUG(); |
| 786 | module_put(owner); | 794 | module_put(owner); |
| 787 | preempt_enable(); | 795 | preempt_enable(); |
| @@ -925,13 +933,10 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, | |||
| 925 | struct module *mod) | 933 | struct module *mod) |
| 926 | { | 934 | { |
| 927 | const unsigned long *crc; | 935 | const unsigned long *crc; |
| 928 | struct module *owner; | ||
| 929 | 936 | ||
| 930 | if (IS_ERR_VALUE(__find_symbol("struct_module", | 937 | if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false))) |
| 931 | &owner, &crc, 1))) | ||
| 932 | BUG(); | 938 | BUG(); |
| 933 | return check_version(sechdrs, versindex, "struct_module", mod, | 939 | return check_version(sechdrs, versindex, "struct_module", mod, crc); |
| 934 | crc); | ||
| 935 | } | 940 | } |
| 936 | 941 | ||
| 937 | /* First part is kernel version, which we ignore. */ | 942 | /* First part is kernel version, which we ignore. */ |
| @@ -975,8 +980,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
| 975 | unsigned long ret; | 980 | unsigned long ret; |
| 976 | const unsigned long *crc; | 981 | const unsigned long *crc; |
| 977 | 982 | ||
| 978 | ret = __find_symbol(name, &owner, &crc, | 983 | ret = find_symbol(name, &owner, &crc, |
| 979 | !(mod->taints & TAINT_PROPRIETARY_MODULE)); | 984 | !(mod->taints & TAINT_PROPRIETARY_MODULE), true); |
| 980 | if (!IS_ERR_VALUE(ret)) { | 985 | if (!IS_ERR_VALUE(ret)) { |
| 981 | /* use_module can fail due to OOM, | 986 | /* use_module can fail due to OOM, |
| 982 | or module initialization or unloading */ | 987 | or module initialization or unloading */ |
| @@ -992,6 +997,20 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
| 992 | * J. Corbet <corbet@lwn.net> | 997 | * J. Corbet <corbet@lwn.net> |
| 993 | */ | 998 | */ |
| 994 | #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) | 999 | #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) |
| 1000 | struct module_sect_attr | ||
| 1001 | { | ||
| 1002 | struct module_attribute mattr; | ||
| 1003 | char *name; | ||
| 1004 | unsigned long address; | ||
| 1005 | }; | ||
| 1006 | |||
| 1007 | struct module_sect_attrs | ||
| 1008 | { | ||
| 1009 | struct attribute_group grp; | ||
| 1010 | unsigned int nsections; | ||
| 1011 | struct module_sect_attr attrs[0]; | ||
| 1012 | }; | ||
| 1013 | |||
| 995 | static ssize_t module_sect_show(struct module_attribute *mattr, | 1014 | static ssize_t module_sect_show(struct module_attribute *mattr, |
| 996 | struct module *mod, char *buf) | 1015 | struct module *mod, char *buf) |
| 997 | { | 1016 | { |
| @@ -1002,7 +1021,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, | |||
| 1002 | 1021 | ||
| 1003 | static void free_sect_attrs(struct module_sect_attrs *sect_attrs) | 1022 | static void free_sect_attrs(struct module_sect_attrs *sect_attrs) |
| 1004 | { | 1023 | { |
| 1005 | int section; | 1024 | unsigned int section; |
| 1006 | 1025 | ||
| 1007 | for (section = 0; section < sect_attrs->nsections; section++) | 1026 | for (section = 0; section < sect_attrs->nsections; section++) |
| 1008 | kfree(sect_attrs->attrs[section].name); | 1027 | kfree(sect_attrs->attrs[section].name); |
| @@ -1363,10 +1382,9 @@ void *__symbol_get(const char *symbol) | |||
| 1363 | { | 1382 | { |
| 1364 | struct module *owner; | 1383 | struct module *owner; |
| 1365 | unsigned long value; | 1384 | unsigned long value; |
| 1366 | const unsigned long *crc; | ||
| 1367 | 1385 | ||
| 1368 | preempt_disable(); | 1386 | preempt_disable(); |
| 1369 | value = __find_symbol(symbol, &owner, &crc, 1); | 1387 | value = find_symbol(symbol, &owner, NULL, true, true); |
| 1370 | if (IS_ERR_VALUE(value)) | 1388 | if (IS_ERR_VALUE(value)) |
| 1371 | value = 0; | 1389 | value = 0; |
| 1372 | else if (strong_try_module_get(owner)) | 1390 | else if (strong_try_module_get(owner)) |
| @@ -1383,33 +1401,33 @@ EXPORT_SYMBOL_GPL(__symbol_get); | |||
| 1383 | */ | 1401 | */ |
| 1384 | static int verify_export_symbols(struct module *mod) | 1402 | static int verify_export_symbols(struct module *mod) |
| 1385 | { | 1403 | { |
| 1386 | const char *name = NULL; | 1404 | unsigned int i; |
| 1387 | unsigned long i, ret = 0; | ||
| 1388 | struct module *owner; | 1405 | struct module *owner; |
| 1389 | const unsigned long *crc; | 1406 | const struct kernel_symbol *s; |
| 1390 | 1407 | struct { | |
| 1391 | for (i = 0; i < mod->num_syms; i++) | 1408 | const struct kernel_symbol *sym; |
| 1392 | if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name, | 1409 | unsigned int num; |
| 1393 | &owner, &crc, 1))) { | 1410 | } arr[] = { |
| 1394 | name = mod->syms[i].name; | 1411 | { mod->syms, mod->num_syms }, |
| 1395 | ret = -ENOEXEC; | 1412 | { mod->gpl_syms, mod->num_gpl_syms }, |
| 1396 | goto dup; | 1413 | { mod->gpl_future_syms, mod->num_gpl_future_syms }, |
| 1397 | } | 1414 | { mod->unused_syms, mod->num_unused_syms }, |
| 1415 | { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, | ||
| 1416 | }; | ||
| 1398 | 1417 | ||
| 1399 | for (i = 0; i < mod->num_gpl_syms; i++) | 1418 | for (i = 0; i < ARRAY_SIZE(arr); i++) { |
| 1400 | if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name, | 1419 | for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { |
| 1401 | &owner, &crc, 1))) { | 1420 | if (!IS_ERR_VALUE(find_symbol(s->name, &owner, |
| 1402 | name = mod->gpl_syms[i].name; | 1421 | NULL, true, false))) { |
| 1403 | ret = -ENOEXEC; | 1422 | printk(KERN_ERR |
| 1404 | goto dup; | 1423 | "%s: exports duplicate symbol %s" |
| 1424 | " (owned by %s)\n", | ||
| 1425 | mod->name, s->name, module_name(owner)); | ||
| 1426 | return -ENOEXEC; | ||
| 1427 | } | ||
| 1405 | } | 1428 | } |
| 1406 | 1429 | } | |
| 1407 | dup: | 1430 | return 0; |
| 1408 | if (ret) | ||
| 1409 | printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n", | ||
| 1410 | mod->name, name, module_name(owner)); | ||
| 1411 | |||
| 1412 | return ret; | ||
| 1413 | } | 1431 | } |
| 1414 | 1432 | ||
| 1415 | /* Change all symbols so that st_value encodes the pointer directly. */ | 1433 | /* Change all symbols so that st_value encodes the pointer directly. */ |
| @@ -1815,8 +1833,9 @@ static struct module *load_module(void __user *umod, | |||
| 1815 | unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); | 1833 | unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); |
| 1816 | #endif | 1834 | #endif |
| 1817 | 1835 | ||
| 1818 | /* Don't keep modinfo section */ | 1836 | /* Don't keep modinfo and version sections. */ |
| 1819 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 1837 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; |
| 1838 | sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
| 1820 | #ifdef CONFIG_KALLSYMS | 1839 | #ifdef CONFIG_KALLSYMS |
| 1821 | /* Keep symbol and string tables for decoding later. */ | 1840 | /* Keep symbol and string tables for decoding later. */ |
| 1822 | sechdrs[symindex].sh_flags |= SHF_ALLOC; | 1841 | sechdrs[symindex].sh_flags |= SHF_ALLOC; |
| @@ -1978,7 +1997,8 @@ static struct module *load_module(void __user *umod, | |||
| 1978 | mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; | 1997 | mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; |
| 1979 | mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; | 1998 | mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; |
| 1980 | if (unusedgplcrcindex) | 1999 | if (unusedgplcrcindex) |
| 1981 | mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; | 2000 | mod->unused_gpl_crcs |
| 2001 | = (void *)sechdrs[unusedgplcrcindex].sh_addr; | ||
| 1982 | 2002 | ||
| 1983 | #ifdef CONFIG_MODVERSIONS | 2003 | #ifdef CONFIG_MODVERSIONS |
| 1984 | if ((mod->num_syms && !crcindex) || | 2004 | if ((mod->num_syms && !crcindex) || |
| @@ -2172,6 +2192,8 @@ sys_init_module(void __user *umod, | |||
| 2172 | mod->state = MODULE_STATE_GOING; | 2192 | mod->state = MODULE_STATE_GOING; |
| 2173 | synchronize_sched(); | 2193 | synchronize_sched(); |
| 2174 | module_put(mod); | 2194 | module_put(mod); |
| 2195 | blocking_notifier_call_chain(&module_notify_list, | ||
| 2196 | MODULE_STATE_GOING, mod); | ||
| 2175 | mutex_lock(&module_mutex); | 2197 | mutex_lock(&module_mutex); |
| 2176 | free_module(mod); | 2198 | free_module(mod); |
| 2177 | mutex_unlock(&module_mutex); | 2199 | mutex_unlock(&module_mutex); |
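
[Editor's note] Both the delete-module path and the failed-init path in module.c now raise MODULE_STATE_GOING on module_notify_list, so subscribers see teardown as well as MODULE_STATE_COMING. A hedged sketch of a subscriber, using only the long-standing register_module_notifier() interface; the printout and names are illustrative:

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/notifier.h>

	static int example_module_event(struct notifier_block *nb,
					unsigned long action, void *data)
	{
		struct module *mod = data;

		if (action == MODULE_STATE_GOING)
			printk(KERN_INFO "example: %s is going away\n", mod->name);
		return NOTIFY_OK;
	}

	static struct notifier_block example_module_nb = {
		.notifier_call = example_module_event,
	};

	static int __init example_init(void)
	{
		return register_module_notifier(&example_module_nb);
	}

	static void __exit example_exit(void)
	{
		unregister_module_notifier(&example_module_nb);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");
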
diff --git a/kernel/notifier.c b/kernel/notifier.c index 643360d1bb14..823be11584ef 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
| @@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl, | |||
| 31 | return 0; | 31 | return 0; |
| 32 | } | 32 | } |
| 33 | 33 | ||
| 34 | static int notifier_chain_cond_register(struct notifier_block **nl, | ||
| 35 | struct notifier_block *n) | ||
| 36 | { | ||
| 37 | while ((*nl) != NULL) { | ||
| 38 | if ((*nl) == n) | ||
| 39 | return 0; | ||
| 40 | if (n->priority > (*nl)->priority) | ||
| 41 | break; | ||
| 42 | nl = &((*nl)->next); | ||
| 43 | } | ||
| 44 | n->next = *nl; | ||
| 45 | rcu_assign_pointer(*nl, n); | ||
| 46 | return 0; | ||
| 47 | } | ||
| 48 | |||
| 34 | static int notifier_chain_unregister(struct notifier_block **nl, | 49 | static int notifier_chain_unregister(struct notifier_block **nl, |
| 35 | struct notifier_block *n) | 50 | struct notifier_block *n) |
| 36 | { | 51 | { |
| @@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, | |||
| 205 | EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); | 220 | EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); |
| 206 | 221 | ||
| 207 | /** | 222 | /** |
| 223 | * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain | ||
| 224 | * @nh: Pointer to head of the blocking notifier chain | ||
| 225 | * @n: New entry in notifier chain | ||
| 226 | * | ||
| 227 | * Adds a notifier to a blocking notifier chain, only if not already | ||
| 228 | * present in the chain. | ||
| 229 | * Must be called in process context. | ||
| 230 | * | ||
| 231 | * Currently always returns zero. | ||
| 232 | */ | ||
| 233 | int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, | ||
| 234 | struct notifier_block *n) | ||
| 235 | { | ||
| 236 | int ret; | ||
| 237 | |||
| 238 | down_write(&nh->rwsem); | ||
| 239 | ret = notifier_chain_cond_register(&nh->head, n); | ||
| 240 | up_write(&nh->rwsem); | ||
| 241 | return ret; | ||
| 242 | } | ||
| 243 | EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); | ||
| 244 | |||
| 245 | /** | ||
| 208 | * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain | 246 | * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain |
| 209 | * @nh: Pointer to head of the blocking notifier chain | 247 | * @nh: Pointer to head of the blocking notifier chain |
| 210 | * @n: Entry to remove from notifier chain | 248 | * @n: Entry to remove from notifier chain |
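
[Editor's note] blocking_notifier_chain_cond_register() behaves like blocking_notifier_chain_register() except that registering the same block twice is a no-op, so callers whose init path may run more than once cannot corrupt the chain. A minimal sketch; the chain and callback names are invented:

	#include <linux/notifier.h>

	static BLOCKING_NOTIFIER_HEAD(example_chain);

	static int example_event(struct notifier_block *nb,
				 unsigned long action, void *data)
	{
		return NOTIFY_OK;
	}

	static struct notifier_block example_nb = {
		.notifier_call = example_event,
	};

	void example_setup(void)
	{
		/*
		 * Safe to call from several init paths: the second call finds
		 * the block already on the chain and simply returns 0.
		 */
		blocking_notifier_chain_cond_register(&example_chain, &example_nb);
		blocking_notifier_chain_cond_register(&example_chain, &example_nb);

		blocking_notifier_call_chain(&example_chain, 0, NULL);
	}
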
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index aead4d69f62b..48d7ed6fc3a4 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c | |||
| @@ -7,6 +7,8 @@ | |||
| 7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
| 8 | #include <linux/cgroup.h> | 8 | #include <linux/cgroup.h> |
| 9 | #include <linux/fs.h> | 9 | #include <linux/fs.h> |
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/nsproxy.h> | ||
| 10 | 12 | ||
| 11 | struct ns_cgroup { | 13 | struct ns_cgroup { |
| 12 | struct cgroup_subsys_state css; | 14 | struct cgroup_subsys_state css; |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f5d332cf8c63..adc785146a1c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
| 139 | goto out; | 139 | goto out; |
| 140 | } | 140 | } |
| 141 | 141 | ||
| 142 | /* | ||
| 143 | * CLONE_NEWIPC must detach from the undolist: after switching | ||
| 144 | * to a new ipc namespace, the semaphore arrays from the old | ||
| 145 | * namespace are unreachable. In clone parlance, CLONE_SYSVSEM | ||
| 146 | * means share undolist with parent, so we must forbid using | ||
| 147 | * it along with CLONE_NEWIPC. | ||
| 148 | */ | ||
| 149 | if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { | ||
| 150 | err = -EINVAL; | ||
| 151 | goto out; | ||
| 152 | } | ||
| 153 | |||
| 142 | new_ns = create_new_namespaces(flags, tsk, tsk->fs); | 154 | new_ns = create_new_namespaces(flags, tsk, tsk->fs); |
| 143 | if (IS_ERR(new_ns)) { | 155 | if (IS_ERR(new_ns)) { |
| 144 | err = PTR_ERR(new_ns); | 156 | err = PTR_ERR(new_ns); |
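
[Editor's note] The new check rejects CLONE_NEWIPC combined with CLONE_SYSVSEM: the semaphore undo list refers to arrays in the old IPC namespace, which become unreachable after the switch. A rough user-space sketch exercising the clone() path that this hunk guards; run it as root so the namespace permission check passes and the new EINVAL is what comes back (without CAP_SYS_ADMIN, or on kernels lacking CLONE_NEWIPC, the failure mode differs):

	#define _GNU_SOURCE
	#include <errno.h>
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <string.h>

	static char stack[64 * 1024];

	static int child(void *arg)
	{
		return 0;
	}

	int main(void)
	{
		/* CLONE_NEWIPC + CLONE_SYSVSEM is the forbidden combination. */
		int pid = clone(child, stack + sizeof(stack),
				CLONE_NEWIPC | CLONE_SYSVSEM | SIGCHLD, NULL);

		if (pid == -1)
			printf("clone failed: %s\n", strerror(errno));
		else
			printf("clone unexpectedly succeeded (pid %d)\n", pid);
		return 0;
	}
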
diff --git a/kernel/panic.c b/kernel/panic.c index 24af9f8bac99..425567f45b9f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic); | |||
| 153 | * 'M' - System experienced a machine check exception. | 153 | * 'M' - System experienced a machine check exception. |
| 154 | * 'B' - System has hit bad_page. | 154 | * 'B' - System has hit bad_page. |
| 155 | * 'U' - Userspace-defined naughtiness. | 155 | * 'U' - Userspace-defined naughtiness. |
| 156 | * 'A' - ACPI table overridden. | ||
| 157 | * 'W' - Taint on warning. | ||
| 156 | * | 158 | * |
| 157 | * The string is overwritten by the next call to print_taint(). | 159 | * The string is overwritten by the next call to print_taint(). |
| 158 | */ | 160 | */ |
| @@ -161,7 +163,7 @@ const char *print_tainted(void) | |||
| 161 | { | 163 | { |
| 162 | static char buf[20]; | 164 | static char buf[20]; |
| 163 | if (tainted) { | 165 | if (tainted) { |
| 164 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", | 166 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", |
| 165 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', | 167 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', |
| 166 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', | 168 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', |
| 167 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', | 169 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', |
| @@ -170,7 +172,8 @@ const char *print_tainted(void) | |||
| 170 | tainted & TAINT_BAD_PAGE ? 'B' : ' ', | 172 | tainted & TAINT_BAD_PAGE ? 'B' : ' ', |
| 171 | tainted & TAINT_USER ? 'U' : ' ', | 173 | tainted & TAINT_USER ? 'U' : ' ', |
| 172 | tainted & TAINT_DIE ? 'D' : ' ', | 174 | tainted & TAINT_DIE ? 'D' : ' ', |
| 173 | tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); | 175 | tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', |
| 176 | tainted & TAINT_WARN ? 'W' : ' '); | ||
| 174 | } | 177 | } |
| 175 | else | 178 | else |
| 176 | snprintf(buf, sizeof(buf), "Not tainted"); | 179 | snprintf(buf, sizeof(buf), "Not tainted"); |
| @@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line) | |||
| 312 | print_modules(); | 315 | print_modules(); |
| 313 | dump_stack(); | 316 | dump_stack(); |
| 314 | print_oops_end_marker(); | 317 | print_oops_end_marker(); |
| 318 | add_taint(TAINT_WARN); | ||
| 315 | } | 319 | } |
| 316 | EXPORT_SYMBOL(warn_on_slowpath); | 320 | EXPORT_SYMBOL(warn_on_slowpath); |
| 317 | #endif | 321 | #endif |
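
[Editor's note] print_tainted() grows a tenth flag character, 'W', set by add_taint(TAINT_WARN) whenever the WARN_ON slow path runs, and the format string is widened to match. The same bitmask is visible from user space in /proc/sys/kernel/tainted; a small decoder using the flag order shown in the hunk (the bit positions are assumed to follow that order, lowest bit first, and the real kernel prints 'G' rather than a blank when the 'P' bit is clear):

	#include <stdio.h>

	int main(void)
	{
		/* Letters in print_tainted() order: P F S R M B U D A W */
		static const char flags[] = "PFSRMBUDAW";
		unsigned long tainted = 0;
		FILE *f = fopen("/proc/sys/kernel/tainted", "r");
		int i;

		if (!f) {
			perror("tainted");
			return 1;
		}
		if (fscanf(f, "%lu", &tainted) != 1) {
			fclose(f);
			return 1;
		}
		fclose(f);

		printf("Tainted: ");
		for (i = 0; flags[i]; i++)
			putchar(tainted & (1UL << i) ? flags[i] : ' ');
		putchar('\n');
		return 0;
	}
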
diff --git a/kernel/pid.c b/kernel/pid.c index 477691576b33..20d59fa2d493 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -111,10 +111,11 @@ EXPORT_SYMBOL(is_container_init); | |||
| 111 | 111 | ||
| 112 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); | 112 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); |
| 113 | 113 | ||
| 114 | static void free_pidmap(struct pid_namespace *pid_ns, int pid) | 114 | static void free_pidmap(struct upid *upid) |
| 115 | { | 115 | { |
| 116 | struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; | 116 | int nr = upid->nr; |
| 117 | int offset = pid & BITS_PER_PAGE_MASK; | 117 | struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; |
| 118 | int offset = nr & BITS_PER_PAGE_MASK; | ||
| 118 | 119 | ||
| 119 | clear_bit(offset, map->page); | 120 | clear_bit(offset, map->page); |
| 120 | atomic_inc(&map->nr_free); | 121 | atomic_inc(&map->nr_free); |
| @@ -232,7 +233,7 @@ void free_pid(struct pid *pid) | |||
| 232 | spin_unlock_irqrestore(&pidmap_lock, flags); | 233 | spin_unlock_irqrestore(&pidmap_lock, flags); |
| 233 | 234 | ||
| 234 | for (i = 0; i <= pid->level; i++) | 235 | for (i = 0; i <= pid->level; i++) |
| 235 | free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); | 236 | free_pidmap(pid->numbers + i); |
| 236 | 237 | ||
| 237 | call_rcu(&pid->rcu, delayed_put_pid); | 238 | call_rcu(&pid->rcu, delayed_put_pid); |
| 238 | } | 239 | } |
| @@ -278,8 +279,8 @@ out: | |||
| 278 | return pid; | 279 | return pid; |
| 279 | 280 | ||
| 280 | out_free: | 281 | out_free: |
| 281 | for (i++; i <= ns->level; i++) | 282 | while (++i <= ns->level) |
| 282 | free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); | 283 | free_pidmap(pid->numbers + i); |
| 283 | 284 | ||
| 284 | kmem_cache_free(ns->pid_cachep, pid); | 285 | kmem_cache_free(ns->pid_cachep, pid); |
| 285 | pid = NULL; | 286 | pid = NULL; |
| @@ -316,7 +317,7 @@ EXPORT_SYMBOL_GPL(find_pid); | |||
| 316 | /* | 317 | /* |
| 317 | * attach_pid() must be called with the tasklist_lock write-held. | 318 | * attach_pid() must be called with the tasklist_lock write-held. |
| 318 | */ | 319 | */ |
| 319 | int attach_pid(struct task_struct *task, enum pid_type type, | 320 | void attach_pid(struct task_struct *task, enum pid_type type, |
| 320 | struct pid *pid) | 321 | struct pid *pid) |
| 321 | { | 322 | { |
| 322 | struct pid_link *link; | 323 | struct pid_link *link; |
| @@ -324,11 +325,10 @@ int attach_pid(struct task_struct *task, enum pid_type type, | |||
| 324 | link = &task->pids[type]; | 325 | link = &task->pids[type]; |
| 325 | link->pid = pid; | 326 | link->pid = pid; |
| 326 | hlist_add_head_rcu(&link->node, &pid->tasks[type]); | 327 | hlist_add_head_rcu(&link->node, &pid->tasks[type]); |
| 327 | |||
| 328 | return 0; | ||
| 329 | } | 328 | } |
| 330 | 329 | ||
| 331 | void detach_pid(struct task_struct *task, enum pid_type type) | 330 | static void __change_pid(struct task_struct *task, enum pid_type type, |
| 331 | struct pid *new) | ||
| 332 | { | 332 | { |
| 333 | struct pid_link *link; | 333 | struct pid_link *link; |
| 334 | struct pid *pid; | 334 | struct pid *pid; |
| @@ -338,7 +338,7 @@ void detach_pid(struct task_struct *task, enum pid_type type) | |||
| 338 | pid = link->pid; | 338 | pid = link->pid; |
| 339 | 339 | ||
| 340 | hlist_del_rcu(&link->node); | 340 | hlist_del_rcu(&link->node); |
| 341 | link->pid = NULL; | 341 | link->pid = new; |
| 342 | 342 | ||
| 343 | for (tmp = PIDTYPE_MAX; --tmp >= 0; ) | 343 | for (tmp = PIDTYPE_MAX; --tmp >= 0; ) |
| 344 | if (!hlist_empty(&pid->tasks[tmp])) | 344 | if (!hlist_empty(&pid->tasks[tmp])) |
| @@ -347,13 +347,24 @@ void detach_pid(struct task_struct *task, enum pid_type type) | |||
| 347 | free_pid(pid); | 347 | free_pid(pid); |
| 348 | } | 348 | } |
| 349 | 349 | ||
| 350 | void detach_pid(struct task_struct *task, enum pid_type type) | ||
| 351 | { | ||
| 352 | __change_pid(task, type, NULL); | ||
| 353 | } | ||
| 354 | |||
| 355 | void change_pid(struct task_struct *task, enum pid_type type, | ||
| 356 | struct pid *pid) | ||
| 357 | { | ||
| 358 | __change_pid(task, type, pid); | ||
| 359 | attach_pid(task, type, pid); | ||
| 360 | } | ||
| 361 | |||
| 350 | /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ | 362 | /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ |
| 351 | void transfer_pid(struct task_struct *old, struct task_struct *new, | 363 | void transfer_pid(struct task_struct *old, struct task_struct *new, |
| 352 | enum pid_type type) | 364 | enum pid_type type) |
| 353 | { | 365 | { |
| 354 | new->pids[type].pid = old->pids[type].pid; | 366 | new->pids[type].pid = old->pids[type].pid; |
| 355 | hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); | 367 | hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); |
| 356 | old->pids[type].pid = NULL; | ||
| 357 | } | 368 | } |
| 358 | 369 | ||
| 359 | struct task_struct *pid_task(struct pid *pid, enum pid_type type) | 370 | struct task_struct *pid_task(struct pid *pid, enum pid_type type) |
| @@ -380,12 +391,6 @@ struct task_struct *find_task_by_pid_type_ns(int type, int nr, | |||
| 380 | 391 | ||
| 381 | EXPORT_SYMBOL(find_task_by_pid_type_ns); | 392 | EXPORT_SYMBOL(find_task_by_pid_type_ns); |
| 382 | 393 | ||
| 383 | struct task_struct *find_task_by_pid(pid_t nr) | ||
| 384 | { | ||
| 385 | return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns); | ||
| 386 | } | ||
| 387 | EXPORT_SYMBOL(find_task_by_pid); | ||
| 388 | |||
| 389 | struct task_struct *find_task_by_vpid(pid_t vnr) | 394 | struct task_struct *find_task_by_vpid(pid_t vnr) |
| 390 | { | 395 | { |
| 391 | return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, | 396 | return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, |
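
[Editor's note] free_pidmap() now takes the struct upid itself, which already carries both the pid number and its namespace, instead of passing them separately; the bitmap arithmetic is unchanged. A deliberately simplified user-space model of one pidmap page (the kernel uses atomic bitops, multiple pages and per-namespace maps; the names below are not kernel definitions):

	#include <stdio.h>

	#define PAGE_SIZE          4096u
	#define BITS_PER_PAGE      (PAGE_SIZE * 8)
	#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)

	/* One page worth of pid bits plus a free counter, like struct pidmap. */
	static unsigned char page[PAGE_SIZE];
	static unsigned int nr_free = BITS_PER_PAGE;

	static void free_pid_bit(unsigned int nr)
	{
		unsigned int offset = nr & BITS_PER_PAGE_MASK;

		page[offset / 8] &= ~(1u << (offset % 8));
		nr_free++;
	}

	static int alloc_pid_bit(void)
	{
		unsigned int offset;

		for (offset = 0; offset < BITS_PER_PAGE; offset++) {
			if (!(page[offset / 8] & (1u << (offset % 8)))) {
				page[offset / 8] |= 1u << (offset % 8);
				nr_free--;
				return offset;
			}
		}
		return -1;
	}

	int main(void)
	{
		int nr = alloc_pid_bit();

		printf("allocated %d, %u free\n", nr, nr_free);
		free_pid_bit(nr);
		printf("freed %d, %u free\n", nr, nr_free);
		return 0;
	}
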
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d792b66d854..98702b4b8851 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -66,7 +66,7 @@ err_alloc: | |||
| 66 | return NULL; | 66 | return NULL; |
| 67 | } | 67 | } |
| 68 | 68 | ||
| 69 | static struct pid_namespace *create_pid_namespace(int level) | 69 | static struct pid_namespace *create_pid_namespace(unsigned int level) |
| 70 | { | 70 | { |
| 71 | struct pid_namespace *ns; | 71 | struct pid_namespace *ns; |
| 72 | int i; | 72 | int i; |
| @@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level) | |||
| 92 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); | 92 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); |
| 93 | 93 | ||
| 94 | for (i = 1; i < PIDMAP_ENTRIES; i++) { | 94 | for (i = 1; i < PIDMAP_ENTRIES; i++) { |
| 95 | ns->pidmap[i].page = 0; | 95 | ns->pidmap[i].page = NULL; |
| 96 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | 96 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); |
| 97 | } | 97 | } |
| 98 | 98 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2eae91f954ca..f1525ad06cb3 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -4,8 +4,9 @@ | |||
| 4 | 4 | ||
| 5 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
| 6 | #include <linux/posix-timers.h> | 6 | #include <linux/posix-timers.h> |
| 7 | #include <asm/uaccess.h> | ||
| 8 | #include <linux/errno.h> | 7 | #include <linux/errno.h> |
| 8 | #include <linux/math64.h> | ||
| 9 | #include <asm/uaccess.h> | ||
| 9 | 10 | ||
| 10 | static int check_clock(const clockid_t which_clock) | 11 | static int check_clock(const clockid_t which_clock) |
| 11 | { | 12 | { |
| @@ -47,12 +48,10 @@ static void sample_to_timespec(const clockid_t which_clock, | |||
| 47 | union cpu_time_count cpu, | 48 | union cpu_time_count cpu, |
| 48 | struct timespec *tp) | 49 | struct timespec *tp) |
| 49 | { | 50 | { |
| 50 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 51 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) |
| 51 | tp->tv_sec = div_long_long_rem(cpu.sched, | 52 | *tp = ns_to_timespec(cpu.sched); |
| 52 | NSEC_PER_SEC, &tp->tv_nsec); | 53 | else |
| 53 | } else { | ||
| 54 | cputime_to_timespec(cpu.cpu, tp); | 54 | cputime_to_timespec(cpu.cpu, tp); |
| 55 | } | ||
| 56 | } | 55 | } |
| 57 | 56 | ||
| 58 | static inline int cpu_time_before(const clockid_t which_clock, | 57 | static inline int cpu_time_before(const clockid_t which_clock, |
| @@ -1087,45 +1086,45 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1087 | maxfire = 20; | 1086 | maxfire = 20; |
| 1088 | prof_expires = cputime_zero; | 1087 | prof_expires = cputime_zero; |
| 1089 | while (!list_empty(timers)) { | 1088 | while (!list_empty(timers)) { |
| 1090 | struct cpu_timer_list *t = list_first_entry(timers, | 1089 | struct cpu_timer_list *tl = list_first_entry(timers, |
| 1091 | struct cpu_timer_list, | 1090 | struct cpu_timer_list, |
| 1092 | entry); | 1091 | entry); |
| 1093 | if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { | 1092 | if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { |
| 1094 | prof_expires = t->expires.cpu; | 1093 | prof_expires = tl->expires.cpu; |
| 1095 | break; | 1094 | break; |
| 1096 | } | 1095 | } |
| 1097 | t->firing = 1; | 1096 | tl->firing = 1; |
| 1098 | list_move_tail(&t->entry, firing); | 1097 | list_move_tail(&tl->entry, firing); |
| 1099 | } | 1098 | } |
| 1100 | 1099 | ||
| 1101 | ++timers; | 1100 | ++timers; |
| 1102 | maxfire = 20; | 1101 | maxfire = 20; |
| 1103 | virt_expires = cputime_zero; | 1102 | virt_expires = cputime_zero; |
| 1104 | while (!list_empty(timers)) { | 1103 | while (!list_empty(timers)) { |
| 1105 | struct cpu_timer_list *t = list_first_entry(timers, | 1104 | struct cpu_timer_list *tl = list_first_entry(timers, |
| 1106 | struct cpu_timer_list, | 1105 | struct cpu_timer_list, |
| 1107 | entry); | 1106 | entry); |
| 1108 | if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { | 1107 | if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { |
| 1109 | virt_expires = t->expires.cpu; | 1108 | virt_expires = tl->expires.cpu; |
| 1110 | break; | 1109 | break; |
| 1111 | } | 1110 | } |
| 1112 | t->firing = 1; | 1111 | tl->firing = 1; |
| 1113 | list_move_tail(&t->entry, firing); | 1112 | list_move_tail(&tl->entry, firing); |
| 1114 | } | 1113 | } |
| 1115 | 1114 | ||
| 1116 | ++timers; | 1115 | ++timers; |
| 1117 | maxfire = 20; | 1116 | maxfire = 20; |
| 1118 | sched_expires = 0; | 1117 | sched_expires = 0; |
| 1119 | while (!list_empty(timers)) { | 1118 | while (!list_empty(timers)) { |
| 1120 | struct cpu_timer_list *t = list_first_entry(timers, | 1119 | struct cpu_timer_list *tl = list_first_entry(timers, |
| 1121 | struct cpu_timer_list, | 1120 | struct cpu_timer_list, |
| 1122 | entry); | 1121 | entry); |
| 1123 | if (!--maxfire || sum_sched_runtime < t->expires.sched) { | 1122 | if (!--maxfire || sum_sched_runtime < tl->expires.sched) { |
| 1124 | sched_expires = t->expires.sched; | 1123 | sched_expires = tl->expires.sched; |
| 1125 | break; | 1124 | break; |
| 1126 | } | 1125 | } |
| 1127 | t->firing = 1; | 1126 | tl->firing = 1; |
| 1128 | list_move_tail(&t->entry, firing); | 1127 | list_move_tail(&tl->entry, firing); |
| 1129 | } | 1128 | } |
| 1130 | 1129 | ||
| 1131 | /* | 1130 | /* |
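
[Editor's note] sample_to_timespec() now hands sched-clock samples to ns_to_timespec() instead of open-coding div_long_long_rem(). The conversion itself is just a division by NSEC_PER_SEC with the remainder as nanoseconds; roughly, ignoring the negative-value handling ns_to_timespec() also performs:

	#include <stdio.h>
	#include <time.h>

	#define NSEC_PER_SEC 1000000000LL

	/* Rough user-space equivalent of ns_to_timespec() for non-negative input. */
	static struct timespec ns_to_ts(long long nsec)
	{
		struct timespec ts;

		ts.tv_sec = nsec / NSEC_PER_SEC;
		ts.tv_nsec = nsec % NSEC_PER_SEC;
		return ts;
	}

	int main(void)
	{
		struct timespec ts = ns_to_ts(3123456789LL);

		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);  /* 3.123456789 */
		return 0;
	}
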
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a9b04203a66d..dbd8398ddb0b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -37,7 +37,6 @@ | |||
| 37 | #include <linux/mutex.h> | 37 | #include <linux/mutex.h> |
| 38 | 38 | ||
| 39 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
| 40 | #include <asm/semaphore.h> | ||
| 41 | #include <linux/list.h> | 40 | #include <linux/list.h> |
| 42 | #include <linux/init.h> | 41 | #include <linux/init.h> |
| 43 | #include <linux/compiler.h> | 42 | #include <linux/compiler.h> |
| @@ -311,8 +310,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private) | |||
| 311 | 310 | ||
| 312 | if (timr->it_sigev_notify & SIGEV_THREAD_ID) { | 311 | if (timr->it_sigev_notify & SIGEV_THREAD_ID) { |
| 313 | struct task_struct *leader; | 312 | struct task_struct *leader; |
| 314 | int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, | 313 | int ret = send_sigqueue(timr->sigq, timr->it_process, 0); |
| 315 | timr->it_process); | ||
| 316 | 314 | ||
| 317 | if (likely(ret >= 0)) | 315 | if (likely(ret >= 0)) |
| 318 | return ret; | 316 | return ret; |
| @@ -323,8 +321,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private) | |||
| 323 | timr->it_process = leader; | 321 | timr->it_process = leader; |
| 324 | } | 322 | } |
| 325 | 323 | ||
| 326 | return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, | 324 | return send_sigqueue(timr->sigq, timr->it_process, 1); |
| 327 | timr->it_process); | ||
| 328 | } | 325 | } |
| 329 | EXPORT_SYMBOL_GPL(posix_timer_event); | 326 | EXPORT_SYMBOL_GPL(posix_timer_event); |
| 330 | 327 | ||
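
[Editor's note] With send_group_sigqueue() gone, posix_timer_event() calls a single send_sigqueue() whose last argument selects per-thread versus process-wide delivery. From user space nothing changes: a timer armed with SIGEV_SIGNAL still receives its queued signal through this path. A small refresher using the standard POSIX timer API (link with -lrt on older glibc); on the kernel side this exercises the code shown above:

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	static void handler(int sig, siginfo_t *si, void *uc)
	{
		write(1, "timer fired\n", 12);
	}

	int main(void)
	{
		struct sigaction sa;
		struct sigevent sev;
		struct itimerspec its;
		timer_t timerid;

		memset(&sa, 0, sizeof(sa));
		sa.sa_flags = SA_SIGINFO;
		sa.sa_sigaction = handler;
		sigaction(SIGRTMIN, &sa, NULL);

		memset(&sev, 0, sizeof(sev));
		sev.sigev_notify = SIGEV_SIGNAL;   /* delivered via the queued-signal path */
		sev.sigev_signo = SIGRTMIN;
		timer_create(CLOCK_REALTIME, &sev, &timerid);

		memset(&its, 0, sizeof(its));
		its.it_value.tv_sec = 1;           /* fire once, after one second */
		timer_settime(timerid, 0, &its, NULL);

		pause();                           /* wait for the queued signal */
		return 0;
	}
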
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6233f3b4ae66..b45da40e8d25 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -19,16 +19,6 @@ config PM | |||
| 19 | will issue the hlt instruction if nothing is to be done, thereby | 19 | will issue the hlt instruction if nothing is to be done, thereby |
| 20 | sending the processor to sleep and saving power. | 20 | sending the processor to sleep and saving power. |
| 21 | 21 | ||
| 22 | config PM_LEGACY | ||
| 23 | bool "Legacy Power Management API (DEPRECATED)" | ||
| 24 | depends on PM | ||
| 25 | default n | ||
| 26 | ---help--- | ||
| 27 | Support for pm_register() and friends. This old API is obsoleted | ||
| 28 | by the driver model. | ||
| 29 | |||
| 30 | If unsure, say N. | ||
| 31 | |||
| 32 | config PM_DEBUG | 22 | config PM_DEBUG |
| 33 | bool "Power Management Debug Support" | 23 | bool "Power Management Debug Support" |
| 34 | depends on PM | 24 | depends on PM |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f7dfff28ecdb..597823b5b700 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
| @@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG | |||
| 4 | endif | 4 | endif |
| 5 | 5 | ||
| 6 | obj-y := main.o | 6 | obj-y := main.o |
| 7 | obj-$(CONFIG_PM_LEGACY) += pm.o | ||
| 8 | obj-$(CONFIG_PM_SLEEP) += process.o console.o | 7 | obj-$(CONFIG_PM_SLEEP) += process.o console.o |
| 9 | obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o | 8 | obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o |
| 10 | 9 | ||
diff --git a/kernel/power/console.c b/kernel/power/console.c index 89bcf4973ee5..b8628be2a465 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
| @@ -7,17 +7,39 @@ | |||
| 7 | #include <linux/vt_kern.h> | 7 | #include <linux/vt_kern.h> |
| 8 | #include <linux/kbd_kern.h> | 8 | #include <linux/kbd_kern.h> |
| 9 | #include <linux/console.h> | 9 | #include <linux/console.h> |
| 10 | #include <linux/module.h> | ||
| 10 | #include "power.h" | 11 | #include "power.h" |
| 11 | 12 | ||
| 12 | #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) | 13 | #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) |
| 13 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 14 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
| 14 | 15 | ||
| 15 | static int orig_fgconsole, orig_kmsg; | 16 | static int orig_fgconsole, orig_kmsg; |
| 17 | static int disable_vt_switch; | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Normally during a suspend, we allocate a new console and switch to it. | ||
| 21 | * When we resume, we switch back to the original console. This switch | ||
| 22 | * can be slow, so on systems where the framebuffer can handle restoration | ||
| 23 | * of video registers anyways, there's little point in doing the console | ||
| 24 | * switch. This function allows you to disable it by passing it '0'. | ||
| 25 | */ | ||
| 26 | void pm_set_vt_switch(int do_switch) | ||
| 27 | { | ||
| 28 | acquire_console_sem(); | ||
| 29 | disable_vt_switch = !do_switch; | ||
| 30 | release_console_sem(); | ||
| 31 | } | ||
| 32 | EXPORT_SYMBOL(pm_set_vt_switch); | ||
| 16 | 33 | ||
| 17 | int pm_prepare_console(void) | 34 | int pm_prepare_console(void) |
| 18 | { | 35 | { |
| 19 | acquire_console_sem(); | 36 | acquire_console_sem(); |
| 20 | 37 | ||
| 38 | if (disable_vt_switch) { | ||
| 39 | release_console_sem(); | ||
| 40 | return 0; | ||
| 41 | } | ||
| 42 | |||
| 21 | orig_fgconsole = fg_console; | 43 | orig_fgconsole = fg_console; |
| 22 | 44 | ||
| 23 | if (vc_allocate(SUSPEND_CONSOLE)) { | 45 | if (vc_allocate(SUSPEND_CONSOLE)) { |
| @@ -50,9 +72,12 @@ int pm_prepare_console(void) | |||
| 50 | void pm_restore_console(void) | 72 | void pm_restore_console(void) |
| 51 | { | 73 | { |
| 52 | acquire_console_sem(); | 74 | acquire_console_sem(); |
| 75 | if (disable_vt_switch) { | ||
| 76 | release_console_sem(); | ||
| 77 | return; | ||
| 78 | } | ||
| 53 | set_console(orig_fgconsole); | 79 | set_console(orig_fgconsole); |
| 54 | release_console_sem(); | 80 | release_console_sem(); |
| 55 | kmsg_redirect = orig_kmsg; | 81 | kmsg_redirect = orig_kmsg; |
| 56 | return; | ||
| 57 | } | 82 | } |
| 58 | #endif | 83 | #endif |
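
[Editor's note] pm_set_vt_switch() lets a graphics driver that can restore its own video state skip the slow VT switch to SUSPEND_CONSOLE across suspend/resume. A hedged usage sketch for a hypothetical framebuffer driver; the init/exit names are illustrative, and the header carrying the prototype may differ by kernel version:

	#include <linux/module.h>
	#include <linux/vt.h>	/* pm_set_vt_switch(); header may vary by version */

	static int __init example_fb_init(void)
	{
		/*
		 * This hypothetical driver restores its own video registers on
		 * resume, so switching the console away for suspend is pure
		 * overhead: turn the VT switch off.
		 */
		pm_set_vt_switch(0);
		return 0;
	}

	static void __exit example_fb_exit(void)
	{
		/* Restore the default behaviour when the driver goes away. */
		pm_set_vt_switch(1);
	}

	module_init(example_fb_init);
	module_exit(example_fb_exit);
	MODULE_LICENSE("GPL");
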
diff --git a/kernel/power/pm.c b/kernel/power/pm.c deleted file mode 100644 index 60c73fa670d5..000000000000 --- a/kernel/power/pm.c +++ /dev/null | |||
| @@ -1,205 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * pm.c - Power management interface | ||
| 3 | * | ||
| 4 | * Copyright (C) 2000 Andrew Henroid | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License as published by | ||
| 8 | * the Free Software Foundation; either version 2 of the License, or | ||
| 9 | * (at your option) any later version. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; if not, write to the Free Software | ||
| 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 19 | */ | ||
| 20 | #include <linux/init.h> | ||
| 21 | #include <linux/module.h> | ||
| 22 | #include <linux/spinlock.h> | ||
| 23 | #include <linux/mm.h> | ||
| 24 | #include <linux/slab.h> | ||
| 25 | #include <linux/pm.h> | ||
| 26 | #include <linux/pm_legacy.h> | ||
| 27 | #include <linux/interrupt.h> | ||
| 28 | #include <linux/mutex.h> | ||
| 29 | |||
| 30 | /* | ||
| 31 | * Locking notes: | ||
| 32 | * pm_devs_lock can be a semaphore providing pm ops are not called | ||
| 33 | * from an interrupt handler (already a bad idea so no change here). Each | ||
| 34 | * change must be protected so that an unlink of an entry doesn't clash | ||
| 35 | * with a pm send - which is permitted to sleep in the current architecture | ||
| 36 | * | ||
| 37 | * Module unloads clashing with pm events now work out safely, the module | ||
| 38 | * unload path will block until the event has been sent. It may well block | ||
| 39 | * until a resume but that will be fine. | ||
| 40 | */ | ||
| 41 | |||
| 42 | static DEFINE_MUTEX(pm_devs_lock); | ||
| 43 | static LIST_HEAD(pm_devs); | ||
| 44 | |||
| 45 | /** | ||
| 46 | * pm_register - register a device with power management | ||
| 47 | * @type: device type | ||
| 48 | * @id: device ID | ||
| 49 | * @callback: callback function | ||
| 50 | * | ||
| 51 | * Add a device to the list of devices that wish to be notified about | ||
| 52 | * power management events. A &pm_dev structure is returned on success, | ||
| 53 | * on failure the return is %NULL. | ||
| 54 | * | ||
| 55 | * The callback function will be called in process context and | ||
| 56 | * it may sleep. | ||
| 57 | */ | ||
| 58 | |||
| 59 | struct pm_dev *pm_register(pm_dev_t type, | ||
| 60 | unsigned long id, | ||
| 61 | pm_callback callback) | ||
| 62 | { | ||
| 63 | struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); | ||
| 64 | if (dev) { | ||
| 65 | dev->type = type; | ||
| 66 | dev->id = id; | ||
| 67 | dev->callback = callback; | ||
| 68 | |||
| 69 | mutex_lock(&pm_devs_lock); | ||
| 70 | list_add(&dev->entry, &pm_devs); | ||
| 71 | mutex_unlock(&pm_devs_lock); | ||
| 72 | } | ||
| 73 | return dev; | ||
| 74 | } | ||
| 75 | |||
| 76 | /** | ||
| 77 | * pm_send - send request to a single device | ||
| 78 | * @dev: device to send to | ||
| 79 | * @rqst: power management request | ||
| 80 | * @data: data for the callback | ||
| 81 | * | ||
| 82 | * Issue a power management request to a given device. The | ||
| 83 | * %PM_SUSPEND and %PM_RESUME events are handled specially. The | ||
| 84 | * data field must hold the intended next state. No call is made | ||
| 85 | * if the state matches. | ||
| 86 | * | ||
| 87 | * BUGS: what stops two power management requests occurring in parallel | ||
| 88 | * and conflicting. | ||
| 89 | * | ||
| 90 | * WARNING: Calling pm_send directly is not generally recommended, in | ||
| 91 | * particular there is no locking against the pm_dev going away. The | ||
| 92 | * caller must maintain all needed locking or have 'inside knowledge' | ||
| 93 | * on the safety. Also remember that this function is not locked against | ||
| 94 | * pm_unregister. This means that you must handle SMP races on callback | ||
| 95 | * execution and unload yourself. | ||
| 96 | */ | ||
| 97 | |||
| 98 | static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) | ||
| 99 | { | ||
| 100 | int status = 0; | ||
| 101 | unsigned long prev_state, next_state; | ||
| 102 | |||
| 103 | if (in_interrupt()) | ||
| 104 | BUG(); | ||
| 105 | |||
| 106 | switch (rqst) { | ||
| 107 | case PM_SUSPEND: | ||
| 108 | case PM_RESUME: | ||
| 109 | prev_state = dev->state; | ||
| 110 | next_state = (unsigned long) data; | ||
| 111 | if (prev_state != next_state) { | ||
| 112 | if (dev->callback) | ||
| 113 | status = (*dev->callback)(dev, rqst, data); | ||
| 114 | if (!status) { | ||
| 115 | dev->state = next_state; | ||
| 116 | dev->prev_state = prev_state; | ||
| 117 | } | ||
| 118 | } | ||
| 119 | else { | ||
| 120 | dev->prev_state = prev_state; | ||
| 121 | } | ||
| 122 | break; | ||
| 123 | default: | ||
| 124 | if (dev->callback) | ||
| 125 | status = (*dev->callback)(dev, rqst, data); | ||
| 126 | break; | ||
| 127 | } | ||
| 128 | return status; | ||
| 129 | } | ||
| 130 | |||
| 131 | /* | ||
| 132 | * Undo incomplete request | ||
| 133 | */ | ||
| 134 | static void pm_undo_all(struct pm_dev *last) | ||
| 135 | { | ||
| 136 | struct list_head *entry = last->entry.prev; | ||
| 137 | while (entry != &pm_devs) { | ||
| 138 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
| 139 | if (dev->state != dev->prev_state) { | ||
| 140 | /* previous state was zero (running) resume or | ||
| 141 | * previous state was non-zero (suspended) suspend | ||
| 142 | */ | ||
| 143 | pm_request_t undo = (dev->prev_state | ||
| 144 | ? PM_SUSPEND:PM_RESUME); | ||
| 145 | pm_send(dev, undo, (void*) dev->prev_state); | ||
| 146 | } | ||
| 147 | entry = entry->prev; | ||
| 148 | } | ||
| 149 | } | ||
| 150 | |||
| 151 | /** | ||
| 152 | * pm_send_all - send request to all managed devices | ||
| 153 | * @rqst: power management request | ||
| 154 | * @data: data for the callback | ||
| 155 | * | ||
| 156 | * Issue a power management request to a all devices. The | ||
| 157 | * %PM_SUSPEND events are handled specially. Any device is | ||
| 158 | * permitted to fail a suspend by returning a non zero (error) | ||
| 159 | * value from its callback function. If any device vetoes a | ||
| 160 | * suspend request then all other devices that have suspended | ||
| 161 | * during the processing of this request are restored to their | ||
| 162 | * previous state. | ||
| 163 | * | ||
| 164 | * WARNING: This function takes the pm_devs_lock. The lock is not dropped until | ||
| 165 | * the callbacks have completed. This prevents races against pm locking | ||
| 166 | * functions, races against module unload pm_unregister code. It does | ||
| 167 | * mean however that you must not issue pm_ functions within the callback | ||
| 168 | * or you will deadlock and users will hate you. | ||
| 169 | * | ||
| 170 | * Zero is returned on success. If a suspend fails then the status | ||
| 171 | * from the device that vetoes the suspend is returned. | ||
| 172 | * | ||
| 173 | * BUGS: what stops two power management requests occurring in parallel | ||
| 174 | * and conflicting. | ||
| 175 | */ | ||
| 176 | |||
| 177 | int pm_send_all(pm_request_t rqst, void *data) | ||
| 178 | { | ||
| 179 | struct list_head *entry; | ||
| 180 | |||
| 181 | mutex_lock(&pm_devs_lock); | ||
| 182 | entry = pm_devs.next; | ||
| 183 | while (entry != &pm_devs) { | ||
| 184 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
| 185 | if (dev->callback) { | ||
| 186 | int status = pm_send(dev, rqst, data); | ||
| 187 | if (status) { | ||
| 188 | /* return devices to previous state on | ||
| 189 | * failed suspend request | ||
| 190 | */ | ||
| 191 | if (rqst == PM_SUSPEND) | ||
| 192 | pm_undo_all(dev); | ||
| 193 | mutex_unlock(&pm_devs_lock); | ||
| 194 | return status; | ||
| 195 | } | ||
| 196 | } | ||
| 197 | entry = entry->next; | ||
| 198 | } | ||
| 199 | mutex_unlock(&pm_devs_lock); | ||
| 200 | return 0; | ||
| 201 | } | ||
| 202 | |||
| 203 | EXPORT_SYMBOL(pm_register); | ||
| 204 | EXPORT_SYMBOL(pm_send_all); | ||
| 205 | |||
diff --git a/kernel/printk.c b/kernel/printk.c index c46a20a19a15..8fb01c32aa3b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -111,6 +111,9 @@ struct console_cmdline | |||
| 111 | char name[8]; /* Name of the driver */ | 111 | char name[8]; /* Name of the driver */ |
| 112 | int index; /* Minor dev. to use */ | 112 | int index; /* Minor dev. to use */ |
| 113 | char *options; /* Options for the driver */ | 113 | char *options; /* Options for the driver */ |
| 114 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | ||
| 115 | char *brl_options; /* Options for braille driver */ | ||
| 116 | #endif | ||
| 114 | }; | 117 | }; |
| 115 | 118 | ||
| 116 | #define MAX_CMDLINECONSOLES 8 | 119 | #define MAX_CMDLINECONSOLES 8 |
| @@ -643,8 +646,21 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) | |||
| 643 | { | 646 | { |
| 644 | int retval = 0; | 647 | int retval = 0; |
| 645 | 648 | ||
| 646 | if (can_use_console(cpu)) | 649 | if (!try_acquire_console_sem()) { |
| 647 | retval = !try_acquire_console_sem(); | 650 | retval = 1; |
| 651 | |||
| 652 | /* | ||
| 653 | * If we can't use the console, we need to release | ||
| 654 | * the console semaphore by hand to avoid flushing | ||
| 655 | * the buffer. We need to hold the console semaphore | ||
| 656 | * in order to do this test safely. | ||
| 657 | */ | ||
| 658 | if (!can_use_console(cpu)) { | ||
| 659 | console_locked = 0; | ||
| 660 | up(&console_sem); | ||
| 661 | retval = 0; | ||
| 662 | } | ||
| 663 | } | ||
| 648 | printk_cpu = UINT_MAX; | 664 | printk_cpu = UINT_MAX; |
| 649 | spin_unlock(&logbuf_lock); | 665 | spin_unlock(&logbuf_lock); |
| 650 | return retval; | 666 | return retval; |
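The restructured helper above now always tries for the console semaphore first and only then checks, under the lock, whether this CPU may use the console, releasing the semaphore by hand when it may not. A hedged userspace sketch of that try-then-validate-then-hand-release shape, with an ordinary pthread mutex and the usability test reduced to a flag (logbuf_lock, printk_cpu and CON_ANYTIME handling omitted):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t console_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for can_use_console(); the kernel checks that the CPU is
 * online or that a CON_ANYTIME console is registered. */
static int can_use_console(int cpu_usable)
{
	return cpu_usable;
}

/*
 * Returns 1 when the caller now owns the lock and may flush the log
 * buffer, 0 when it must not touch the console.  The ordering matters:
 * the usability test is only meaningful while the lock is held, so on
 * failure the lock is dropped by hand without flushing.
 */
static int acquire_console_lock_for_printk(int cpu_usable)
{
	int got_it = 0;

	if (pthread_mutex_trylock(&console_lock) == 0) {
		got_it = 1;
		if (!can_use_console(cpu_usable)) {
			pthread_mutex_unlock(&console_lock);
			got_it = 0;
		}
	}
	return got_it;
}

int main(void)
{
	printf("usable cpu: %d\n", acquire_console_lock_for_printk(1));
	pthread_mutex_unlock(&console_lock);	/* caller releases after flushing */
	printf("unusable cpu: %d\n", acquire_console_lock_for_printk(0));
	return 0;
}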
| @@ -795,15 +811,60 @@ static void call_console_drivers(unsigned start, unsigned end) | |||
| 795 | 811 | ||
| 796 | #endif | 812 | #endif |
| 797 | 813 | ||
| 814 | static int __add_preferred_console(char *name, int idx, char *options, | ||
| 815 | char *brl_options) | ||
| 816 | { | ||
| 817 | struct console_cmdline *c; | ||
| 818 | int i; | ||
| 819 | |||
| 820 | /* | ||
| 821 | * See if this tty is not yet registered, and | ||
| 822 | * if we have a slot free. | ||
| 823 | */ | ||
| 824 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) | ||
| 825 | if (strcmp(console_cmdline[i].name, name) == 0 && | ||
| 826 | console_cmdline[i].index == idx) { | ||
| 827 | if (!brl_options) | ||
| 828 | selected_console = i; | ||
| 829 | return 0; | ||
| 830 | } | ||
| 831 | if (i == MAX_CMDLINECONSOLES) | ||
| 832 | return -E2BIG; | ||
| 833 | if (!brl_options) | ||
| 834 | selected_console = i; | ||
| 835 | c = &console_cmdline[i]; | ||
| 836 | strlcpy(c->name, name, sizeof(c->name)); | ||
| 837 | c->options = options; | ||
| 838 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | ||
| 839 | c->brl_options = brl_options; | ||
| 840 | #endif | ||
| 841 | c->index = idx; | ||
| 842 | return 0; | ||
| 843 | } | ||
| 798 | /* | 844 | /* |
| 799 | * Set up a list of consoles. Called from init/main.c | 845 | * Set up a list of consoles. Called from init/main.c |
| 800 | */ | 846 | */ |
| 801 | static int __init console_setup(char *str) | 847 | static int __init console_setup(char *str) |
| 802 | { | 848 | { |
| 803 | char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ | 849 | char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ |
| 804 | char *s, *options; | 850 | char *s, *options, *brl_options = NULL; |
| 805 | int idx; | 851 | int idx; |
| 806 | 852 | ||
| 853 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | ||
| 854 | if (!memcmp(str, "brl,", 4)) { | ||
| 855 | brl_options = ""; | ||
| 856 | str += 4; | ||
| 857 | } else if (!memcmp(str, "brl=", 4)) { | ||
| 858 | brl_options = str + 4; | ||
| 859 | str = strchr(brl_options, ','); | ||
| 860 | if (!str) { | ||
| 861 | printk(KERN_ERR "need port name after brl=\n"); | ||
| 862 | return 1; | ||
| 863 | } | ||
| 864 | *(str++) = 0; | ||
| 865 | } | ||
| 866 | #endif | ||
| 867 | |||
| 807 | /* | 868 | /* |
| 808 | * Decode str into name, index, options. | 869 | * Decode str into name, index, options. |
| 809 | */ | 870 | */ |
| @@ -828,7 +889,7 @@ static int __init console_setup(char *str) | |||
| 828 | idx = simple_strtoul(s, NULL, 10); | 889 | idx = simple_strtoul(s, NULL, 10); |
| 829 | *s = 0; | 890 | *s = 0; |
| 830 | 891 | ||
| 831 | add_preferred_console(buf, idx, options); | 892 | __add_preferred_console(buf, idx, options, brl_options); |
| 832 | return 1; | 893 | return 1; |
| 833 | } | 894 | } |
| 834 | __setup("console=", console_setup); | 895 | __setup("console=", console_setup); |
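The brl,/brl= handling added to console_setup() is plain prefix parsing of the console= argument. The standalone sketch below reproduces only that parsing step; the surrounding console registration is not modelled, and the sample command-line string is made up:

#include <stdio.h>
#include <string.h>

/*
 * Parse an option string of the form "brl,ttyS0" or "brl=opts,ttyS0",
 * mirroring the prefix handling added to console_setup() above.
 * Returns the remaining console spec, or NULL if "brl=" lacks a port.
 */
static char *strip_braille_prefix(char *str, char **brl_options)
{
	*brl_options = NULL;

	if (!memcmp(str, "brl,", 4)) {
		*brl_options = "";
		return str + 4;
	}
	if (!memcmp(str, "brl=", 4)) {
		*brl_options = str + 4;
		str = strchr(*brl_options, ',');
		if (!str)
			return NULL;		/* need a port name after brl= */
		*str++ = 0;
		return str;
	}
	return str;
}

int main(void)
{
	char arg[] = "brl=us,ttyS0,9600";	/* invented example argument */
	char *brl, *rest = strip_braille_prefix(arg, &brl);

	printf("braille opts: %s, console spec: %s\n", brl, rest);
	return 0;
}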
| @@ -848,28 +909,7 @@ __setup("console=", console_setup); | |||
| 848 | */ | 909 | */ |
| 849 | int add_preferred_console(char *name, int idx, char *options) | 910 | int add_preferred_console(char *name, int idx, char *options) |
| 850 | { | 911 | { |
| 851 | struct console_cmdline *c; | 912 | return __add_preferred_console(name, idx, options, NULL); |
| 852 | int i; | ||
| 853 | |||
| 854 | /* | ||
| 855 | * See if this tty is not yet registered, and | ||
| 856 | * if we have a slot free. | ||
| 857 | */ | ||
| 858 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) | ||
| 859 | if (strcmp(console_cmdline[i].name, name) == 0 && | ||
| 860 | console_cmdline[i].index == idx) { | ||
| 861 | selected_console = i; | ||
| 862 | return 0; | ||
| 863 | } | ||
| 864 | if (i == MAX_CMDLINECONSOLES) | ||
| 865 | return -E2BIG; | ||
| 866 | selected_console = i; | ||
| 867 | c = &console_cmdline[i]; | ||
| 868 | memcpy(c->name, name, sizeof(c->name)); | ||
| 869 | c->name[sizeof(c->name) - 1] = 0; | ||
| 870 | c->options = options; | ||
| 871 | c->index = idx; | ||
| 872 | return 0; | ||
| 873 | } | 913 | } |
| 874 | 914 | ||
| 875 | int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) | 915 | int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) |
| @@ -881,7 +921,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha | |||
| 881 | if (strcmp(console_cmdline[i].name, name) == 0 && | 921 | if (strcmp(console_cmdline[i].name, name) == 0 && |
| 882 | console_cmdline[i].index == idx) { | 922 | console_cmdline[i].index == idx) { |
| 883 | c = &console_cmdline[i]; | 923 | c = &console_cmdline[i]; |
| 884 | memcpy(c->name, name_new, sizeof(c->name)); | 924 | strlcpy(c->name, name_new, sizeof(c->name)); |
| 885 | c->name[sizeof(c->name) - 1] = 0; | 925 | c->name[sizeof(c->name) - 1] = 0; |
| 886 | c->options = options; | 926 | c->options = options; |
| 887 | c->index = idx_new; | 927 | c->index = idx_new; |
| @@ -1150,6 +1190,16 @@ void register_console(struct console *console) | |||
| 1150 | continue; | 1190 | continue; |
| 1151 | if (console->index < 0) | 1191 | if (console->index < 0) |
| 1152 | console->index = console_cmdline[i].index; | 1192 | console->index = console_cmdline[i].index; |
| 1193 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | ||
| 1194 | if (console_cmdline[i].brl_options) { | ||
| 1195 | console->flags |= CON_BRL; | ||
| 1196 | braille_register_console(console, | ||
| 1197 | console_cmdline[i].index, | ||
| 1198 | console_cmdline[i].options, | ||
| 1199 | console_cmdline[i].brl_options); | ||
| 1200 | return; | ||
| 1201 | } | ||
| 1202 | #endif | ||
| 1153 | if (console->setup && | 1203 | if (console->setup && |
| 1154 | console->setup(console, console_cmdline[i].options) != 0) | 1204 | console->setup(console, console_cmdline[i].options) != 0) |
| 1155 | break; | 1205 | break; |
| @@ -1208,6 +1258,11 @@ int unregister_console(struct console *console) | |||
| 1208 | struct console *a, *b; | 1258 | struct console *a, *b; |
| 1209 | int res = 1; | 1259 | int res = 1; |
| 1210 | 1260 | ||
| 1261 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | ||
| 1262 | if (console->flags & CON_BRL) | ||
| 1263 | return braille_unregister_console(console); | ||
| 1264 | #endif | ||
| 1265 | |||
| 1211 | acquire_console_sem(); | 1266 | acquire_console_sem(); |
| 1212 | if (console_drivers == console) { | 1267 | if (console_drivers == console) { |
| 1213 | console_drivers=console->next; | 1268 | console_drivers=console->next; |
| @@ -1259,8 +1314,8 @@ late_initcall(disable_boot_consoles); | |||
| 1259 | */ | 1314 | */ |
| 1260 | void tty_write_message(struct tty_struct *tty, char *msg) | 1315 | void tty_write_message(struct tty_struct *tty, char *msg) |
| 1261 | { | 1316 | { |
| 1262 | if (tty && tty->driver->write) | 1317 | if (tty && tty->ops->write) |
| 1263 | tty->driver->write(tty, msg, strlen(msg)); | 1318 | tty->ops->write(tty, msg, strlen(msg)); |
| 1264 | return; | 1319 | return; |
| 1265 | } | 1320 | } |
| 1266 | 1321 | ||
| @@ -1274,31 +1329,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) | |||
| 1274 | */ | 1329 | */ |
| 1275 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | 1330 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) |
| 1276 | { | 1331 | { |
| 1277 | static DEFINE_SPINLOCK(ratelimit_lock); | 1332 | return __ratelimit(ratelimit_jiffies, ratelimit_burst); |
| 1278 | static unsigned toks = 10 * 5 * HZ; | ||
| 1279 | static unsigned long last_msg; | ||
| 1280 | static int missed; | ||
| 1281 | unsigned long flags; | ||
| 1282 | unsigned long now = jiffies; | ||
| 1283 | |||
| 1284 | spin_lock_irqsave(&ratelimit_lock, flags); | ||
| 1285 | toks += now - last_msg; | ||
| 1286 | last_msg = now; | ||
| 1287 | if (toks > (ratelimit_burst * ratelimit_jiffies)) | ||
| 1288 | toks = ratelimit_burst * ratelimit_jiffies; | ||
| 1289 | if (toks >= ratelimit_jiffies) { | ||
| 1290 | int lost = missed; | ||
| 1291 | |||
| 1292 | missed = 0; | ||
| 1293 | toks -= ratelimit_jiffies; | ||
| 1294 | spin_unlock_irqrestore(&ratelimit_lock, flags); | ||
| 1295 | if (lost) | ||
| 1296 | printk(KERN_WARNING "printk: %d messages suppressed.\n", lost); | ||
| 1297 | return 1; | ||
| 1298 | } | ||
| 1299 | missed++; | ||
| 1300 | spin_unlock_irqrestore(&ratelimit_lock, flags); | ||
| 1301 | return 0; | ||
| 1302 | } | 1333 | } |
| 1303 | EXPORT_SYMBOL(__printk_ratelimit); | 1334 | EXPORT_SYMBOL(__printk_ratelimit); |
| 1304 | 1335 | ||
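The body removed here is the classic token-bucket limiter; it now sits behind the shared __ratelimit() helper instead of being private to printk. As a reminder of what that helper is expected to do, a single-threaded userspace sketch of the same algorithm follows, with jiffies simulated by a loop counter and the spinlock left out:

#include <stdio.h>

#define HZ 1000

/*
 * Token-bucket rate limit, as in the body removed above: callers earn
 * one "token" per elapsed jiffy, capped at burst * interval, and each
 * allowed message spends one interval's worth of tokens.
 */
static int ratelimit(unsigned long now, int interval, int burst)
{
	static unsigned long toks = 10 * 5 * HZ;
	static unsigned long last;
	static int missed;

	toks += now - last;
	last = now;
	if (toks > (unsigned long)(burst * interval))
		toks = burst * interval;
	if (toks >= (unsigned long)interval) {
		toks -= interval;
		if (missed)
			printf("%d messages suppressed\n", missed);
		missed = 0;
		return 1;
	}
	missed++;
	return 0;
}

int main(void)
{
	unsigned long t;
	int printed = 0;

	for (t = 0; t < 100; t++)		/* 100 back-to-back attempts */
		printed += ratelimit(t, 5 * HZ, 10);
	printf("allowed %d of 100\n", printed);
	return 0;
}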
diff --git a/kernel/profile.c b/kernel/profile.c index 3b7a1b055122..ae7ead82cbc9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -23,7 +23,6 @@ | |||
| 23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
| 24 | #include <linux/mutex.h> | 24 | #include <linux/mutex.h> |
| 25 | #include <asm/sections.h> | 25 | #include <asm/sections.h> |
| 26 | #include <asm/semaphore.h> | ||
| 27 | #include <asm/irq_regs.h> | 26 | #include <asm/irq_regs.h> |
| 28 | #include <asm/ptrace.h> | 27 | #include <asm/ptrace.h> |
| 29 | 28 | ||
| @@ -588,10 +587,10 @@ static int __init create_proc_profile(void) | |||
| 588 | return 0; | 587 | return 0; |
| 589 | if (create_hash_tables()) | 588 | if (create_hash_tables()) |
| 590 | return -1; | 589 | return -1; |
| 591 | entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); | 590 | entry = proc_create("profile", S_IWUSR | S_IRUGO, |
| 591 | NULL, &proc_profile_operations); | ||
| 592 | if (!entry) | 592 | if (!entry) |
| 593 | return 0; | 593 | return 0; |
| 594 | entry->proc_fops = &proc_profile_operations; | ||
| 595 | entry->size = (1+prof_len) * sizeof(atomic_t); | 594 | entry->size = (1+prof_len) * sizeof(atomic_t); |
| 596 | hotcpu_notifier(profile_cpu_callback, 0); | 595 | hotcpu_notifier(profile_cpu_callback, 0); |
| 597 | return 0; | 596 | return 0; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index fdb34e86f923..6c19e94fd0a5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -73,7 +73,7 @@ void __ptrace_unlink(struct task_struct *child) | |||
| 73 | BUG_ON(!child->ptrace); | 73 | BUG_ON(!child->ptrace); |
| 74 | 74 | ||
| 75 | child->ptrace = 0; | 75 | child->ptrace = 0; |
| 76 | if (!list_empty(&child->ptrace_list)) { | 76 | if (ptrace_reparented(child)) { |
| 77 | list_del_init(&child->ptrace_list); | 77 | list_del_init(&child->ptrace_list); |
| 78 | remove_parent(child); | 78 | remove_parent(child); |
| 79 | child->parent = child->real_parent; | 79 | child->parent = child->real_parent; |
| @@ -168,8 +168,6 @@ int ptrace_attach(struct task_struct *task) | |||
| 168 | audit_ptrace(task); | 168 | audit_ptrace(task); |
| 169 | 169 | ||
| 170 | retval = -EPERM; | 170 | retval = -EPERM; |
| 171 | if (task->pid <= 1) | ||
| 172 | goto out; | ||
| 173 | if (same_thread_group(task, current)) | 171 | if (same_thread_group(task, current)) |
| 174 | goto out; | 172 | goto out; |
| 175 | 173 | ||
| @@ -208,8 +206,7 @@ repeat: | |||
| 208 | 206 | ||
| 209 | __ptrace_link(task, current); | 207 | __ptrace_link(task, current); |
| 210 | 208 | ||
| 211 | force_sig_specific(SIGSTOP, task); | 209 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); |
| 212 | |||
| 213 | bad: | 210 | bad: |
| 214 | write_unlock_irqrestore(&tasklist_lock, flags); | 211 | write_unlock_irqrestore(&tasklist_lock, flags); |
| 215 | task_unlock(task); | 212 | task_unlock(task); |
| @@ -323,9 +320,8 @@ static int ptrace_setoptions(struct task_struct *child, long data) | |||
| 323 | return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; | 320 | return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; |
| 324 | } | 321 | } |
| 325 | 322 | ||
| 326 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) | 323 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) |
| 327 | { | 324 | { |
| 328 | siginfo_t lastinfo; | ||
| 329 | int error = -ESRCH; | 325 | int error = -ESRCH; |
| 330 | 326 | ||
| 331 | read_lock(&tasklist_lock); | 327 | read_lock(&tasklist_lock); |
| @@ -333,31 +329,25 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) | |||
| 333 | error = -EINVAL; | 329 | error = -EINVAL; |
| 334 | spin_lock_irq(&child->sighand->siglock); | 330 | spin_lock_irq(&child->sighand->siglock); |
| 335 | if (likely(child->last_siginfo != NULL)) { | 331 | if (likely(child->last_siginfo != NULL)) { |
| 336 | lastinfo = *child->last_siginfo; | 332 | *info = *child->last_siginfo; |
| 337 | error = 0; | 333 | error = 0; |
| 338 | } | 334 | } |
| 339 | spin_unlock_irq(&child->sighand->siglock); | 335 | spin_unlock_irq(&child->sighand->siglock); |
| 340 | } | 336 | } |
| 341 | read_unlock(&tasklist_lock); | 337 | read_unlock(&tasklist_lock); |
| 342 | if (!error) | ||
| 343 | return copy_siginfo_to_user(data, &lastinfo); | ||
| 344 | return error; | 338 | return error; |
| 345 | } | 339 | } |
| 346 | 340 | ||
| 347 | static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) | 341 | static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) |
| 348 | { | 342 | { |
| 349 | siginfo_t newinfo; | ||
| 350 | int error = -ESRCH; | 343 | int error = -ESRCH; |
| 351 | 344 | ||
| 352 | if (copy_from_user(&newinfo, data, sizeof (siginfo_t))) | ||
| 353 | return -EFAULT; | ||
| 354 | |||
| 355 | read_lock(&tasklist_lock); | 345 | read_lock(&tasklist_lock); |
| 356 | if (likely(child->sighand != NULL)) { | 346 | if (likely(child->sighand != NULL)) { |
| 357 | error = -EINVAL; | 347 | error = -EINVAL; |
| 358 | spin_lock_irq(&child->sighand->siglock); | 348 | spin_lock_irq(&child->sighand->siglock); |
| 359 | if (likely(child->last_siginfo != NULL)) { | 349 | if (likely(child->last_siginfo != NULL)) { |
| 360 | *child->last_siginfo = newinfo; | 350 | *child->last_siginfo = *info; |
| 361 | error = 0; | 351 | error = 0; |
| 362 | } | 352 | } |
| 363 | spin_unlock_irq(&child->sighand->siglock); | 353 | spin_unlock_irq(&child->sighand->siglock); |
| @@ -424,6 +414,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 424 | long addr, long data) | 414 | long addr, long data) |
| 425 | { | 415 | { |
| 426 | int ret = -EIO; | 416 | int ret = -EIO; |
| 417 | siginfo_t siginfo; | ||
| 427 | 418 | ||
| 428 | switch (request) { | 419 | switch (request) { |
| 429 | case PTRACE_PEEKTEXT: | 420 | case PTRACE_PEEKTEXT: |
| @@ -442,12 +433,22 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 442 | case PTRACE_GETEVENTMSG: | 433 | case PTRACE_GETEVENTMSG: |
| 443 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); | 434 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); |
| 444 | break; | 435 | break; |
| 436 | |||
| 445 | case PTRACE_GETSIGINFO: | 437 | case PTRACE_GETSIGINFO: |
| 446 | ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); | 438 | ret = ptrace_getsiginfo(child, &siginfo); |
| 439 | if (!ret) | ||
| 440 | ret = copy_siginfo_to_user((siginfo_t __user *) data, | ||
| 441 | &siginfo); | ||
| 447 | break; | 442 | break; |
| 443 | |||
| 448 | case PTRACE_SETSIGINFO: | 444 | case PTRACE_SETSIGINFO: |
| 449 | ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); | 445 | if (copy_from_user(&siginfo, (siginfo_t __user *) data, |
| 446 | sizeof siginfo)) | ||
| 447 | ret = -EFAULT; | ||
| 448 | else | ||
| 449 | ret = ptrace_setsiginfo(child, &siginfo); | ||
| 450 | break; | 450 | break; |
| 451 | |||
| 451 | case PTRACE_DETACH: /* detach a process that was attached. */ | 452 | case PTRACE_DETACH: /* detach a process that was attached. */ |
| 452 | ret = ptrace_detach(child, data); | 453 | ret = ptrace_detach(child, data); |
| 453 | break; | 454 | break; |
| @@ -518,12 +519,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) | |||
| 518 | { | 519 | { |
| 519 | struct task_struct *child; | 520 | struct task_struct *child; |
| 520 | 521 | ||
| 521 | /* | ||
| 522 | * Tracing init is not allowed. | ||
| 523 | */ | ||
| 524 | if (pid == 1) | ||
| 525 | return ERR_PTR(-EPERM); | ||
| 526 | |||
| 527 | read_lock(&tasklist_lock); | 522 | read_lock(&tasklist_lock); |
| 528 | child = find_task_by_vpid(pid); | 523 | child = find_task_by_vpid(pid); |
| 529 | if (child) | 524 | if (child) |
| @@ -539,7 +534,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) | |||
| 539 | #define arch_ptrace_attach(child) do { } while (0) | 534 | #define arch_ptrace_attach(child) do { } while (0) |
| 540 | #endif | 535 | #endif |
| 541 | 536 | ||
| 542 | #ifndef __ARCH_SYS_PTRACE | ||
| 543 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | 537 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) |
| 544 | { | 538 | { |
| 545 | struct task_struct *child; | 539 | struct task_struct *child; |
| @@ -587,7 +581,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | |||
| 587 | unlock_kernel(); | 581 | unlock_kernel(); |
| 588 | return ret; | 582 | return ret; |
| 589 | } | 583 | } |
| 590 | #endif /* __ARCH_SYS_PTRACE */ | ||
| 591 | 584 | ||
| 592 | int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) | 585 | int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) |
| 593 | { | 586 | { |
| @@ -608,7 +601,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | |||
| 608 | return (copied == sizeof(data)) ? 0 : -EIO; | 601 | return (copied == sizeof(data)) ? 0 : -EIO; |
| 609 | } | 602 | } |
| 610 | 603 | ||
| 611 | #ifdef CONFIG_COMPAT | 604 | #if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE |
| 612 | #include <linux/compat.h> | 605 | #include <linux/compat.h> |
| 613 | 606 | ||
| 614 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, | 607 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, |
| @@ -616,6 +609,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 616 | { | 609 | { |
| 617 | compat_ulong_t __user *datap = compat_ptr(data); | 610 | compat_ulong_t __user *datap = compat_ptr(data); |
| 618 | compat_ulong_t word; | 611 | compat_ulong_t word; |
| 612 | siginfo_t siginfo; | ||
| 619 | int ret; | 613 | int ret; |
| 620 | 614 | ||
| 621 | switch (request) { | 615 | switch (request) { |
| @@ -638,6 +632,23 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 638 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); | 632 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); |
| 639 | break; | 633 | break; |
| 640 | 634 | ||
| 635 | case PTRACE_GETSIGINFO: | ||
| 636 | ret = ptrace_getsiginfo(child, &siginfo); | ||
| 637 | if (!ret) | ||
| 638 | ret = copy_siginfo_to_user32( | ||
| 639 | (struct compat_siginfo __user *) datap, | ||
| 640 | &siginfo); | ||
| 641 | break; | ||
| 642 | |||
| 643 | case PTRACE_SETSIGINFO: | ||
| 644 | memset(&siginfo, 0, sizeof siginfo); | ||
| 645 | if (copy_siginfo_from_user32( | ||
| 646 | &siginfo, (struct compat_siginfo __user *) datap)) | ||
| 647 | ret = -EFAULT; | ||
| 648 | else | ||
| 649 | ret = ptrace_setsiginfo(child, &siginfo); | ||
| 650 | break; | ||
| 651 | |||
| 641 | default: | 652 | default: |
| 642 | ret = ptrace_request(child, request, addr, data); | 653 | ret = ptrace_request(child, request, addr, data); |
| 643 | } | 654 | } |
| @@ -645,7 +656,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 645 | return ret; | 656 | return ret; |
| 646 | } | 657 | } |
| 647 | 658 | ||
| 648 | #ifdef __ARCH_WANT_COMPAT_SYS_PTRACE | ||
| 649 | asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | 659 | asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, |
| 650 | compat_long_t addr, compat_long_t data) | 660 | compat_long_t addr, compat_long_t data) |
| 651 | { | 661 | { |
| @@ -688,6 +698,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
| 688 | unlock_kernel(); | 698 | unlock_kernel(); |
| 689 | return ret; | 699 | return ret; |
| 690 | } | 700 | } |
| 691 | #endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ | 701 | #endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ |
| 692 | |||
| 693 | #endif /* CONFIG_COMPAT */ | ||
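The net effect of the ptrace changes above is a layering change: ptrace_getsiginfo()/ptrace_setsiginfo() now operate purely on kernel-side siginfo_t, and each caller does its own user copy, which is what lets the new compat cases reuse them with copy_siginfo_to_user32()/copy_siginfo_from_user32(). The miniature sketch below shows that layering; the struct and the copy step are illustrative stand-ins, not the kernel's siginfo_t or copy_*_user() helpers:

#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for siginfo_t. */
struct info { int signo; int code; };

static struct info last_info;

/* Core helpers work purely on kernel-side structures ... */
static int get_info(struct info *out)
{
	*out = last_info;
	return 0;
}

static int set_info(const struct info *in)
{
	last_info = *in;
	return 0;
}

/* ... and each caller does the user copy at the boundary, so a compat
 * caller could substitute a 32-bit conversion at this point instead. */
static int handle_get(void *user_buf)
{
	struct info tmp;
	int ret = get_info(&tmp);

	if (!ret)
		memcpy(user_buf, &tmp, sizeof(tmp)); /* think copy_siginfo_to_user() */
	return ret;
}

int main(void)
{
	struct info newv = { 11, 128 }, buf;

	set_info(&newv);
	handle_get(&buf);
	printf("signo %d code %d\n", buf.signo, buf.code);
	return 0;
}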
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e9517014b57c..e1cdf196a515 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c | |||
| @@ -1007,10 +1007,10 @@ void __synchronize_sched(void) | |||
| 1007 | if (sched_getaffinity(0, &oldmask) < 0) | 1007 | if (sched_getaffinity(0, &oldmask) < 0) |
| 1008 | oldmask = cpu_possible_map; | 1008 | oldmask = cpu_possible_map; |
| 1009 | for_each_online_cpu(cpu) { | 1009 | for_each_online_cpu(cpu) { |
| 1010 | sched_setaffinity(0, cpumask_of_cpu(cpu)); | 1010 | sched_setaffinity(0, &cpumask_of_cpu(cpu)); |
| 1011 | schedule(); | 1011 | schedule(); |
| 1012 | } | 1012 | } |
| 1013 | sched_setaffinity(0, oldmask); | 1013 | sched_setaffinity(0, &oldmask); |
| 1014 | } | 1014 | } |
| 1015 | EXPORT_SYMBOL_GPL(__synchronize_sched); | 1015 | EXPORT_SYMBOL_GPL(__synchronize_sched); |
| 1016 | 1016 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index fd599829e72a..33acc424667e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -45,6 +45,7 @@ | |||
| 45 | #include <linux/byteorder/swabb.h> | 45 | #include <linux/byteorder/swabb.h> |
| 46 | #include <linux/stat.h> | 46 | #include <linux/stat.h> |
| 47 | #include <linux/srcu.h> | 47 | #include <linux/srcu.h> |
| 48 | #include <linux/slab.h> | ||
| 48 | 49 | ||
| 49 | MODULE_LICENSE("GPL"); | 50 | MODULE_LICENSE("GPL"); |
| 50 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " | 51 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " |
| @@ -723,9 +724,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | |||
| 723 | */ | 724 | */ |
| 724 | static void rcu_torture_shuffle_tasks(void) | 725 | static void rcu_torture_shuffle_tasks(void) |
| 725 | { | 726 | { |
| 726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 727 | cpumask_t tmp_mask; |
| 727 | int i; | 728 | int i; |
| 728 | 729 | ||
| 730 | cpus_setall(tmp_mask); | ||
| 729 | get_online_cpus(); | 731 | get_online_cpus(); |
| 730 | 732 | ||
| 731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 733 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
| @@ -737,25 +739,27 @@ static void rcu_torture_shuffle_tasks(void) | |||
| 737 | if (rcu_idle_cpu != -1) | 739 | if (rcu_idle_cpu != -1) |
| 738 | cpu_clear(rcu_idle_cpu, tmp_mask); | 740 | cpu_clear(rcu_idle_cpu, tmp_mask); |
| 739 | 741 | ||
| 740 | set_cpus_allowed(current, tmp_mask); | 742 | set_cpus_allowed_ptr(current, &tmp_mask); |
| 741 | 743 | ||
| 742 | if (reader_tasks) { | 744 | if (reader_tasks) { |
| 743 | for (i = 0; i < nrealreaders; i++) | 745 | for (i = 0; i < nrealreaders; i++) |
| 744 | if (reader_tasks[i]) | 746 | if (reader_tasks[i]) |
| 745 | set_cpus_allowed(reader_tasks[i], tmp_mask); | 747 | set_cpus_allowed_ptr(reader_tasks[i], |
| 748 | &tmp_mask); | ||
| 746 | } | 749 | } |
| 747 | 750 | ||
| 748 | if (fakewriter_tasks) { | 751 | if (fakewriter_tasks) { |
| 749 | for (i = 0; i < nfakewriters; i++) | 752 | for (i = 0; i < nfakewriters; i++) |
| 750 | if (fakewriter_tasks[i]) | 753 | if (fakewriter_tasks[i]) |
| 751 | set_cpus_allowed(fakewriter_tasks[i], tmp_mask); | 754 | set_cpus_allowed_ptr(fakewriter_tasks[i], |
| 755 | &tmp_mask); | ||
| 752 | } | 756 | } |
| 753 | 757 | ||
| 754 | if (writer_task) | 758 | if (writer_task) |
| 755 | set_cpus_allowed(writer_task, tmp_mask); | 759 | set_cpus_allowed_ptr(writer_task, &tmp_mask); |
| 756 | 760 | ||
| 757 | if (stats_task) | 761 | if (stats_task) |
| 758 | set_cpus_allowed(stats_task, tmp_mask); | 762 | set_cpus_allowed_ptr(stats_task, &tmp_mask); |
| 759 | 763 | ||
| 760 | if (rcu_idle_cpu == -1) | 764 | if (rcu_idle_cpu == -1) |
| 761 | rcu_idle_cpu = num_online_cpus() - 1; | 765 | rcu_idle_cpu = num_online_cpus() - 1; |
diff --git a/kernel/relay.c b/kernel/relay.c index 4c035a8a248c..7de644cdec43 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = { | |||
| 65 | .close = relay_file_mmap_close, | 65 | .close = relay_file_mmap_close, |
| 66 | }; | 66 | }; |
| 67 | 67 | ||
| 68 | /* | ||
| 69 | * allocate an array of pointers to struct page | ||
| 70 | */ | ||
| 71 | static struct page **relay_alloc_page_array(unsigned int n_pages) | ||
| 72 | { | ||
| 73 | struct page **array; | ||
| 74 | size_t pa_size = n_pages * sizeof(struct page *); | ||
| 75 | |||
| 76 | if (pa_size > PAGE_SIZE) { | ||
| 77 | array = vmalloc(pa_size); | ||
| 78 | if (array) | ||
| 79 | memset(array, 0, pa_size); | ||
| 80 | } else { | ||
| 81 | array = kzalloc(pa_size, GFP_KERNEL); | ||
| 82 | } | ||
| 83 | return array; | ||
| 84 | } | ||
| 85 | |||
| 86 | /* | ||
| 87 | * free an array of pointers to struct page | ||
| 88 | */ | ||
| 89 | static void relay_free_page_array(struct page **array) | ||
| 90 | { | ||
| 91 | if (is_vmalloc_addr(array)) | ||
| 92 | vfree(array); | ||
| 93 | else | ||
| 94 | kfree(array); | ||
| 95 | } | ||
| 96 | |||
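The two helpers above pick vmalloc() once the pointer array no longer fits in a page and kzalloc() otherwise, and key the free path on is_vmalloc_addr(). A userspace stand-in for that split is sketched below; both branches collapse to calloc()/free(), with a flag standing in for the address-range test, so it only illustrates the size threshold, not the kernel allocators:

#include <stdlib.h>
#include <stdio.h>

#define PAGE_SIZE 4096

struct page_array {
	void **pages;
	int vmalloced;			/* records which allocator was chosen */
};

static int alloc_page_array(struct page_array *pa, unsigned int n_pages)
{
	size_t pa_size = n_pages * sizeof(void *);

	pa->vmalloced = pa_size > PAGE_SIZE;	/* kernel: take the vmalloc() path */
	pa->pages = calloc(n_pages, sizeof(void *));
	return pa->pages ? 0 : -1;
}

static void free_page_array(struct page_array *pa)
{
	/* kernel: is_vmalloc_addr(array) ? vfree(array) : kfree(array) */
	free(pa->pages);
}

int main(void)
{
	struct page_array small, big;

	alloc_page_array(&small, 8);
	alloc_page_array(&big, 4096);
	printf("small via vmalloc: %d, big via vmalloc: %d\n",
	       small.vmalloced, big.vmalloced);
	free_page_array(&small);
	free_page_array(&big);
	return 0;
}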
| 68 | /** | 97 | /** |
| 69 | * relay_mmap_buf: - mmap channel buffer to process address space | 98 | * relay_mmap_buf: - mmap channel buffer to process address space |
| 70 | * @buf: relay channel buffer | 99 | * @buf: relay channel buffer |
| @@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) | |||
| 109 | *size = PAGE_ALIGN(*size); | 138 | *size = PAGE_ALIGN(*size); |
| 110 | n_pages = *size >> PAGE_SHIFT; | 139 | n_pages = *size >> PAGE_SHIFT; |
| 111 | 140 | ||
| 112 | buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); | 141 | buf->page_array = relay_alloc_page_array(n_pages); |
| 113 | if (!buf->page_array) | 142 | if (!buf->page_array) |
| 114 | return NULL; | 143 | return NULL; |
| 115 | 144 | ||
| @@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) | |||
| 130 | depopulate: | 159 | depopulate: |
| 131 | for (j = 0; j < i; j++) | 160 | for (j = 0; j < i; j++) |
| 132 | __free_page(buf->page_array[j]); | 161 | __free_page(buf->page_array[j]); |
| 133 | kfree(buf->page_array); | 162 | relay_free_page_array(buf->page_array); |
| 134 | return NULL; | 163 | return NULL; |
| 135 | } | 164 | } |
| 136 | 165 | ||
| @@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) | |||
| 189 | vunmap(buf->start); | 218 | vunmap(buf->start); |
| 190 | for (i = 0; i < buf->page_count; i++) | 219 | for (i = 0; i < buf->page_count; i++) |
| 191 | __free_page(buf->page_array[i]); | 220 | __free_page(buf->page_array[i]); |
| 192 | kfree(buf->page_array); | 221 | relay_free_page_array(buf->page_array); |
| 193 | } | 222 | } |
| 194 | chan->buf[buf->cpu] = NULL; | 223 | chan->buf[buf->cpu] = NULL; |
| 195 | kfree(buf->padding); | 224 | kfree(buf->padding); |
| @@ -736,7 +765,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) | |||
| 736 | kref_get(&buf->kref); | 765 | kref_get(&buf->kref); |
| 737 | filp->private_data = buf; | 766 | filp->private_data = buf; |
| 738 | 767 | ||
| 739 | return 0; | 768 | return nonseekable_open(inode, filp); |
| 740 | } | 769 | } |
| 741 | 770 | ||
| 742 | /** | 771 | /** |
| @@ -1056,6 +1085,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { | |||
| 1056 | .get = generic_pipe_buf_get, | 1085 | .get = generic_pipe_buf_get, |
| 1057 | }; | 1086 | }; |
| 1058 | 1087 | ||
| 1088 | static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) | ||
| 1089 | { | ||
| 1090 | } | ||
| 1091 | |||
| 1059 | /* | 1092 | /* |
| 1060 | * subbuf_splice_actor - splice up to one subbuf's worth of data | 1093 | * subbuf_splice_actor - splice up to one subbuf's worth of data |
| 1061 | */ | 1094 | */ |
| @@ -1083,6 +1116,7 @@ static int subbuf_splice_actor(struct file *in, | |||
| 1083 | .partial = partial, | 1116 | .partial = partial, |
| 1084 | .flags = flags, | 1117 | .flags = flags, |
| 1085 | .ops = &relay_pipe_buf_ops, | 1118 | .ops = &relay_pipe_buf_ops, |
| 1119 | .spd_release = relay_page_release, | ||
| 1086 | }; | 1120 | }; |
| 1087 | 1121 | ||
| 1088 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) | 1122 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) |
| @@ -1157,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, | |||
| 1157 | ret = 0; | 1191 | ret = 0; |
| 1158 | spliced = 0; | 1192 | spliced = 0; |
| 1159 | 1193 | ||
| 1160 | while (len) { | 1194 | while (len && !spliced) { |
| 1161 | ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); | 1195 | ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); |
| 1162 | if (ret < 0) | 1196 | if (ret < 0) |
| 1163 | break; | 1197 | break; |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index efbfc0fc232f..d3c61b4ebef2 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
| @@ -10,6 +10,7 @@ | |||
| 10 | #include <linux/types.h> | 10 | #include <linux/types.h> |
| 11 | #include <linux/parser.h> | 11 | #include <linux/parser.h> |
| 12 | #include <linux/fs.h> | 12 | #include <linux/fs.h> |
| 13 | #include <linux/slab.h> | ||
| 13 | #include <linux/res_counter.h> | 14 | #include <linux/res_counter.h> |
| 14 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
| 15 | 16 | ||
| @@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | |||
| 27 | } | 28 | } |
| 28 | 29 | ||
| 29 | counter->usage += val; | 30 | counter->usage += val; |
| 31 | if (counter->usage > counter->max_usage) | ||
| 32 | counter->max_usage = counter->usage; | ||
| 30 | return 0; | 33 | return 0; |
| 31 | } | 34 | } |
| 32 | 35 | ||
| @@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member) | |||
| 65 | switch (member) { | 68 | switch (member) { |
| 66 | case RES_USAGE: | 69 | case RES_USAGE: |
| 67 | return &counter->usage; | 70 | return &counter->usage; |
| 71 | case RES_MAX_USAGE: | ||
| 72 | return &counter->max_usage; | ||
| 68 | case RES_LIMIT: | 73 | case RES_LIMIT: |
| 69 | return &counter->limit; | 74 | return &counter->limit; |
| 70 | case RES_FAILCNT: | 75 | case RES_FAILCNT: |
| @@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member, | |||
| 92 | pos, buf, s - buf); | 97 | pos, buf, s - buf); |
| 93 | } | 98 | } |
| 94 | 99 | ||
| 100 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
| 101 | { | ||
| 102 | return *res_counter_member(counter, member); | ||
| 103 | } | ||
| 104 | |||
| 95 | ssize_t res_counter_write(struct res_counter *counter, int member, | 105 | ssize_t res_counter_write(struct res_counter *counter, int member, |
| 96 | const char __user *userbuf, size_t nbytes, loff_t *pos, | 106 | const char __user *userbuf, size_t nbytes, loff_t *pos, |
| 97 | int (*write_strategy)(char *st_buf, unsigned long long *val)) | 107 | int (*write_strategy)(char *st_buf, unsigned long long *val)) |
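The res_counter additions are a max_usage high-water mark updated on every successful charge, plus a raw u64 read of any member. A compact userspace sketch of the charge/uncharge shape with the peak tracking, using an invented struct rather than the kernel's res_counter and with the spinlock omitted:

#include <stdio.h>

struct counter {
	unsigned long usage;
	unsigned long max_usage;	/* high-water mark, as added above */
	unsigned long limit;
	unsigned long failcnt;
};

static int charge(struct counter *c, unsigned long val)
{
	if (c->usage + val > c->limit) {
		c->failcnt++;
		return -1;
	}
	c->usage += val;
	if (c->usage > c->max_usage)	/* track the peak */
		c->max_usage = c->usage;
	return 0;
}

static void uncharge(struct counter *c, unsigned long val)
{
	c->usage -= val;		/* max_usage keeps the old peak */
}

int main(void)
{
	struct counter c = { 0, 0, 100, 0 };

	charge(&c, 80);
	uncharge(&c, 50);
	charge(&c, 30);
	printf("usage %lu, peak %lu, failed %lu\n",
	       c.usage, c.max_usage, c.failcnt);
	return 0;
}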
diff --git a/kernel/resource.c b/kernel/resource.c index 82aea814d409..74af2d7cb5a1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = { | |||
| 131 | 131 | ||
| 132 | static int __init ioresources_init(void) | 132 | static int __init ioresources_init(void) |
| 133 | { | 133 | { |
| 134 | struct proc_dir_entry *entry; | 134 | proc_create("ioports", 0, NULL, &proc_ioports_operations); |
| 135 | 135 | proc_create("iomem", 0, NULL, &proc_iomem_operations); | |
| 136 | entry = create_proc_entry("ioports", 0, NULL); | ||
| 137 | if (entry) | ||
| 138 | entry->proc_fops = &proc_ioports_operations; | ||
| 139 | entry = create_proc_entry("iomem", 0, NULL); | ||
| 140 | if (entry) | ||
| 141 | entry->proc_fops = &proc_iomem_operations; | ||
| 142 | return 0; | 136 | return 0; |
| 143 | } | 137 | } |
| 144 | __initcall(ioresources_init); | 138 | __initcall(ioresources_init); |
| @@ -486,6 +480,24 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t | |||
| 486 | 480 | ||
| 487 | EXPORT_SYMBOL(adjust_resource); | 481 | EXPORT_SYMBOL(adjust_resource); |
| 488 | 482 | ||
| 483 | /** | ||
| 484 | * resource_alignment - calculate resource's alignment | ||
| 485 | * @res: resource pointer | ||
| 486 | * | ||
| 487 | * Returns alignment on success, 0 (invalid alignment) on failure. | ||
| 488 | */ | ||
| 489 | resource_size_t resource_alignment(struct resource *res) | ||
| 490 | { | ||
| 491 | switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { | ||
| 492 | case IORESOURCE_SIZEALIGN: | ||
| 493 | return res->end - res->start + 1; | ||
| 494 | case IORESOURCE_STARTALIGN: | ||
| 495 | return res->start; | ||
| 496 | default: | ||
| 497 | return 0; | ||
| 498 | } | ||
| 499 | } | ||
| 500 | |||
| 489 | /* | 501 | /* |
| 490 | * This is compatibility stuff for IO resources. | 502 | * This is compatibility stuff for IO resources. |
| 491 | * | 503 | * |
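resource_alignment() derives an alignment from whichever of the two alignment flags is set, and returns 0 when neither applies. The standalone sketch below mirrors that switch with invented flag values (not the real IORESOURCE_* bits) so all three outcomes can be exercised:

#include <stdio.h>

/* Flag values are illustrative, not the kernel's IORESOURCE_* bits. */
#define SIZEALIGN	0x1	/* alignment follows from the size */
#define STARTALIGN	0x2	/* start address is the alignment */

struct res {
	unsigned long start, end, flags;
};

/* Mirrors resource_alignment() above: 0 means "no valid alignment". */
static unsigned long res_alignment(const struct res *r)
{
	switch (r->flags & (SIZEALIGN | STARTALIGN)) {
	case SIZEALIGN:
		return r->end - r->start + 1;
	case STARTALIGN:
		return r->start;
	default:
		return 0;
	}
}

int main(void)
{
	struct res by_size  = { 0xe0000000, 0xe0000fff, SIZEALIGN };
	struct res by_start = { 0x1000, 0x10ff, STARTALIGN };

	printf("size-aligned: %#lx, start-aligned: %#lx\n",
	       res_alignment(&by_size), res_alignment(&by_start));
	return 0;
}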
diff --git a/kernel/sched.c b/kernel/sched.c index 28c73f07efb2..34bcc5bc120e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -66,6 +66,10 @@ | |||
| 66 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
| 67 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
| 68 | #include <linux/hrtimer.h> | 68 | #include <linux/hrtimer.h> |
| 69 | #include <linux/tick.h> | ||
| 70 | #include <linux/bootmem.h> | ||
| 71 | #include <linux/debugfs.h> | ||
| 72 | #include <linux/ctype.h> | ||
| 69 | 73 | ||
| 70 | #include <asm/tlb.h> | 74 | #include <asm/tlb.h> |
| 71 | #include <asm/irq_regs.h> | 75 | #include <asm/irq_regs.h> |
| @@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 114 | */ | 118 | */ |
| 115 | #define DEF_TIMESLICE (100 * HZ / 1000) | 119 | #define DEF_TIMESLICE (100 * HZ / 1000) |
| 116 | 120 | ||
| 121 | /* | ||
| 122 | * single value that denotes runtime == period, ie unlimited time. | ||
| 123 | */ | ||
| 124 | #define RUNTIME_INF ((u64)~0ULL) | ||
| 125 | |||
| 117 | #ifdef CONFIG_SMP | 126 | #ifdef CONFIG_SMP |
| 118 | /* | 127 | /* |
| 119 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | 128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) |
| @@ -155,6 +164,84 @@ struct rt_prio_array { | |||
| 155 | struct list_head queue[MAX_RT_PRIO]; | 164 | struct list_head queue[MAX_RT_PRIO]; |
| 156 | }; | 165 | }; |
| 157 | 166 | ||
| 167 | struct rt_bandwidth { | ||
| 168 | /* nests inside the rq lock: */ | ||
| 169 | spinlock_t rt_runtime_lock; | ||
| 170 | ktime_t rt_period; | ||
| 171 | u64 rt_runtime; | ||
| 172 | struct hrtimer rt_period_timer; | ||
| 173 | }; | ||
| 174 | |||
| 175 | static struct rt_bandwidth def_rt_bandwidth; | ||
| 176 | |||
| 177 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
| 178 | |||
| 179 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
| 180 | { | ||
| 181 | struct rt_bandwidth *rt_b = | ||
| 182 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
| 183 | ktime_t now; | ||
| 184 | int overrun; | ||
| 185 | int idle = 0; | ||
| 186 | |||
| 187 | for (;;) { | ||
| 188 | now = hrtimer_cb_get_time(timer); | ||
| 189 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
| 190 | |||
| 191 | if (!overrun) | ||
| 192 | break; | ||
| 193 | |||
| 194 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
| 195 | } | ||
| 196 | |||
| 197 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
| 198 | } | ||
| 199 | |||
| 200 | static | ||
| 201 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
| 202 | { | ||
| 203 | rt_b->rt_period = ns_to_ktime(period); | ||
| 204 | rt_b->rt_runtime = runtime; | ||
| 205 | |||
| 206 | spin_lock_init(&rt_b->rt_runtime_lock); | ||
| 207 | |||
| 208 | hrtimer_init(&rt_b->rt_period_timer, | ||
| 209 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 210 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
| 211 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
| 212 | } | ||
| 213 | |||
| 214 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
| 215 | { | ||
| 216 | ktime_t now; | ||
| 217 | |||
| 218 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
| 219 | return; | ||
| 220 | |||
| 221 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
| 222 | return; | ||
| 223 | |||
| 224 | spin_lock(&rt_b->rt_runtime_lock); | ||
| 225 | for (;;) { | ||
| 226 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
| 227 | break; | ||
| 228 | |||
| 229 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
| 230 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
| 231 | hrtimer_start(&rt_b->rt_period_timer, | ||
| 232 | rt_b->rt_period_timer.expires, | ||
| 233 | HRTIMER_MODE_ABS); | ||
| 234 | } | ||
| 235 | spin_unlock(&rt_b->rt_runtime_lock); | ||
| 236 | } | ||
| 237 | |||
| 238 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 239 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
| 240 | { | ||
| 241 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
| 242 | } | ||
| 243 | #endif | ||
| 244 | |||
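sched_rt_period_timer() above leans on hrtimer_forward() to push the expiry ahead by whole periods and report how many were skipped; that overrun count is what do_sched_rt_period_timer() replenishes. A purely numeric sketch of the catch-up loop, with no real timers and an invented helper name:

#include <stdio.h>

/*
 * Numeric stand-in for hrtimer_forward(): the expiry is pushed forward
 * by whole periods until it lies in the future, and the number of
 * periods skipped (the overrun) is returned to the caller.
 */
static int timer_forward(unsigned long long *expires,
			 unsigned long long now, unsigned long long period)
{
	int overrun = 0;

	while (*expires <= now) {
		*expires += period;
		overrun++;
	}
	return overrun;
}

int main(void)
{
	unsigned long long expires = 1000, period = 1000;
	unsigned long long now = 3500;	/* woke up late, 2.5 periods gone */
	int overrun;

	while ((overrun = timer_forward(&expires, now, period)))
		printf("replenish runtime for %d period(s)\n", overrun);
	printf("next expiry at %llu\n", expires);
	return 0;
}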
| 158 | #ifdef CONFIG_GROUP_SCHED | 245 | #ifdef CONFIG_GROUP_SCHED |
| 159 | 246 | ||
| 160 | #include <linux/cgroup.h> | 247 | #include <linux/cgroup.h> |
| @@ -181,29 +268,39 @@ struct task_group { | |||
| 181 | struct sched_rt_entity **rt_se; | 268 | struct sched_rt_entity **rt_se; |
| 182 | struct rt_rq **rt_rq; | 269 | struct rt_rq **rt_rq; |
| 183 | 270 | ||
| 184 | u64 rt_runtime; | 271 | struct rt_bandwidth rt_bandwidth; |
| 185 | #endif | 272 | #endif |
| 186 | 273 | ||
| 187 | struct rcu_head rcu; | 274 | struct rcu_head rcu; |
| 188 | struct list_head list; | 275 | struct list_head list; |
| 276 | |||
| 277 | struct task_group *parent; | ||
| 278 | struct list_head siblings; | ||
| 279 | struct list_head children; | ||
| 189 | }; | 280 | }; |
| 190 | 281 | ||
| 282 | #ifdef CONFIG_USER_SCHED | ||
| 283 | |||
| 284 | /* | ||
| 285 | * Root task group. | ||
| 286 | * Every UID task group (including init_task_group aka UID-0) will | ||
| 287 | * be a child of this group. | ||
| 288 | */ | ||
| 289 | struct task_group root_task_group; | ||
| 290 | |||
| 191 | #ifdef CONFIG_FAIR_GROUP_SCHED | 291 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 192 | /* Default task group's sched entity on each cpu */ | 292 | /* Default task group's sched entity on each cpu */ |
| 193 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 293 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
| 194 | /* Default task group's cfs_rq on each cpu */ | 294 | /* Default task group's cfs_rq on each cpu */ |
| 195 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 295 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
| 196 | |||
| 197 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | ||
| 198 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | ||
| 199 | #endif | 296 | #endif |
| 200 | 297 | ||
| 201 | #ifdef CONFIG_RT_GROUP_SCHED | 298 | #ifdef CONFIG_RT_GROUP_SCHED |
| 202 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 299 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
| 203 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 300 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
| 204 | 301 | #endif | |
| 205 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | 302 | #else |
| 206 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | 303 | #define root_task_group init_task_group |
| 207 | #endif | 304 | #endif |
| 208 | 305 | ||
| 209 | /* task_group_lock serializes add/remove of task groups and also changes to | 306 | /* task_group_lock serializes add/remove of task groups and also changes to |
| @@ -221,23 +318,15 @@ static DEFINE_MUTEX(doms_cur_mutex); | |||
| 221 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 318 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
| 222 | #endif | 319 | #endif |
| 223 | 320 | ||
| 321 | #define MIN_SHARES 2 | ||
| 322 | |||
| 224 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 323 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; |
| 225 | #endif | 324 | #endif |
| 226 | 325 | ||
| 227 | /* Default task group. | 326 | /* Default task group. |
| 228 | * Every task in the system belongs to this group at bootup. | 327 | * Every task in the system belongs to this group at bootup. |
| 229 | */ | 328 | */ |
| 230 | struct task_group init_task_group = { | 329 | struct task_group init_task_group; |
| 231 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 232 | .se = init_sched_entity_p, | ||
| 233 | .cfs_rq = init_cfs_rq_p, | ||
| 234 | #endif | ||
| 235 | |||
| 236 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 237 | .rt_se = init_sched_rt_entity_p, | ||
| 238 | .rt_rq = init_rt_rq_p, | ||
| 239 | #endif | ||
| 240 | }; | ||
| 241 | 330 | ||
| 242 | /* return group to which a task belongs */ | 331 | /* return group to which a task belongs */ |
| 243 | static inline struct task_group *task_group(struct task_struct *p) | 332 | static inline struct task_group *task_group(struct task_struct *p) |
| @@ -297,8 +386,12 @@ struct cfs_rq { | |||
| 297 | 386 | ||
| 298 | struct rb_root tasks_timeline; | 387 | struct rb_root tasks_timeline; |
| 299 | struct rb_node *rb_leftmost; | 388 | struct rb_node *rb_leftmost; |
| 300 | struct rb_node *rb_load_balance_curr; | 389 | |
| 301 | /* 'curr' points to currently running entity on this cfs_rq. | 390 | struct list_head tasks; |
| 391 | struct list_head *balance_iterator; | ||
| 392 | |||
| 393 | /* | ||
| 394 | * 'curr' points to currently running entity on this cfs_rq. | ||
| 302 | * It is set to NULL otherwise (i.e when none are currently running). | 395 | * It is set to NULL otherwise (i.e when none are currently running). |
| 303 | */ | 396 | */ |
| 304 | struct sched_entity *curr, *next; | 397 | struct sched_entity *curr, *next; |
| @@ -318,6 +411,43 @@ struct cfs_rq { | |||
| 318 | */ | 411 | */ |
| 319 | struct list_head leaf_cfs_rq_list; | 412 | struct list_head leaf_cfs_rq_list; |
| 320 | struct task_group *tg; /* group that "owns" this runqueue */ | 413 | struct task_group *tg; /* group that "owns" this runqueue */ |
| 414 | |||
| 415 | #ifdef CONFIG_SMP | ||
| 416 | unsigned long task_weight; | ||
| 417 | unsigned long shares; | ||
| 418 | /* | ||
| 419 | * We need space to build a sched_domain-wide view of the full task | ||
| 420 | * group tree. To avoid depending on dynamic memory allocation | ||
| 421 | * during load balancing, we place this in the per-cpu task group | ||
| 422 | * hierarchy. This limits the load balancing to one instance per cpu, | ||
| 423 | * but more should not be needed anyway. | ||
| 424 | */ | ||
| 425 | struct aggregate_struct { | ||
| 426 | /* | ||
| 427 | * load = weight(cpus) * f(tg) | ||
| 428 | * | ||
| 429 | * Where f(tg) is the recursive weight fraction assigned to | ||
| 430 | * this group. | ||
| 431 | */ | ||
| 432 | unsigned long load; | ||
| 433 | |||
| 434 | /* | ||
| 435 | * part of the group weight distributed to this span. | ||
| 436 | */ | ||
| 437 | unsigned long shares; | ||
| 438 | |||
| 439 | /* | ||
| 440 | * The sum of all runqueue weights within this span. | ||
| 441 | */ | ||
| 442 | unsigned long rq_weight; | ||
| 443 | |||
| 444 | /* | ||
| 445 | * Weight contributed by tasks; this is the part we can | ||
| 446 | * influence by moving tasks around. | ||
| 447 | */ | ||
| 448 | unsigned long task_weight; | ||
| 449 | } aggregate; | ||
| 450 | #endif | ||
| 321 | #endif | 451 | #endif |
| 322 | }; | 452 | }; |
| 323 | 453 | ||
| @@ -334,6 +464,9 @@ struct rt_rq { | |||
| 334 | #endif | 464 | #endif |
| 335 | int rt_throttled; | 465 | int rt_throttled; |
| 336 | u64 rt_time; | 466 | u64 rt_time; |
| 467 | u64 rt_runtime; | ||
| 468 | /* Nests inside the rq lock: */ | ||
| 469 | spinlock_t rt_runtime_lock; | ||
| 337 | 470 | ||
| 338 | #ifdef CONFIG_RT_GROUP_SCHED | 471 | #ifdef CONFIG_RT_GROUP_SCHED |
| 339 | unsigned long rt_nr_boosted; | 472 | unsigned long rt_nr_boosted; |
| @@ -396,6 +529,7 @@ struct rq { | |||
| 396 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 529 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 397 | unsigned char idle_at_tick; | 530 | unsigned char idle_at_tick; |
| 398 | #ifdef CONFIG_NO_HZ | 531 | #ifdef CONFIG_NO_HZ |
| 532 | unsigned long last_tick_seen; | ||
| 399 | unsigned char in_nohz_recently; | 533 | unsigned char in_nohz_recently; |
| 400 | #endif | 534 | #endif |
| 401 | /* capture load from *all* tasks on this cpu: */ | 535 | /* capture load from *all* tasks on this cpu: */ |
| @@ -405,8 +539,6 @@ struct rq { | |||
| 405 | 539 | ||
| 406 | struct cfs_rq cfs; | 540 | struct cfs_rq cfs; |
| 407 | struct rt_rq rt; | 541 | struct rt_rq rt; |
| 408 | u64 rt_period_expire; | ||
| 409 | int rt_throttled; | ||
| 410 | 542 | ||
| 411 | #ifdef CONFIG_FAIR_GROUP_SCHED | 543 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 412 | /* list of leaf cfs_rq on this cpu: */ | 544 | /* list of leaf cfs_rq on this cpu: */ |
| @@ -499,6 +631,32 @@ static inline int cpu_of(struct rq *rq) | |||
| 499 | #endif | 631 | #endif |
| 500 | } | 632 | } |
| 501 | 633 | ||
| 634 | #ifdef CONFIG_NO_HZ | ||
| 635 | static inline bool nohz_on(int cpu) | ||
| 636 | { | ||
| 637 | return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; | ||
| 638 | } | ||
| 639 | |||
| 640 | static inline u64 max_skipped_ticks(struct rq *rq) | ||
| 641 | { | ||
| 642 | return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; | ||
| 643 | } | ||
| 644 | |||
| 645 | static inline void update_last_tick_seen(struct rq *rq) | ||
| 646 | { | ||
| 647 | rq->last_tick_seen = jiffies; | ||
| 648 | } | ||
| 649 | #else | ||
| 650 | static inline u64 max_skipped_ticks(struct rq *rq) | ||
| 651 | { | ||
| 652 | return 1; | ||
| 653 | } | ||
| 654 | |||
| 655 | static inline void update_last_tick_seen(struct rq *rq) | ||
| 656 | { | ||
| 657 | } | ||
| 658 | #endif | ||
| 659 | |||
| 502 | /* | 660 | /* |
| 503 | * Update the per-runqueue clock, as finegrained as the platform can give | 661 | * Update the per-runqueue clock, as finegrained as the platform can give |
| 504 | * us, but without assuming monotonicity, etc.: | 662 | * us, but without assuming monotonicity, etc.: |
| @@ -523,9 +681,12 @@ static void __update_rq_clock(struct rq *rq) | |||
| 523 | /* | 681 | /* |
| 524 | * Catch too large forward jumps too: | 682 | * Catch too large forward jumps too: |
| 525 | */ | 683 | */ |
| 526 | if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { | 684 | u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; |
| 527 | if (clock < rq->tick_timestamp + TICK_NSEC) | 685 | u64 max_time = rq->tick_timestamp + max_jump; |
| 528 | clock = rq->tick_timestamp + TICK_NSEC; | 686 | |
| 687 | if (unlikely(clock + delta > max_time)) { | ||
| 688 | if (clock < max_time) | ||
| 689 | clock = max_time; | ||
| 529 | else | 690 | else |
| 530 | clock++; | 691 | clock++; |
| 531 | rq->clock_overflows++; | 692 | rq->clock_overflows++; |
| @@ -561,23 +722,6 @@ static void update_rq_clock(struct rq *rq) | |||
| 561 | #define task_rq(p) cpu_rq(task_cpu(p)) | 722 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 562 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 723 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 563 | 724 | ||
| 564 | unsigned long rt_needs_cpu(int cpu) | ||
| 565 | { | ||
| 566 | struct rq *rq = cpu_rq(cpu); | ||
| 567 | u64 delta; | ||
| 568 | |||
| 569 | if (!rq->rt_throttled) | ||
| 570 | return 0; | ||
| 571 | |||
| 572 | if (rq->clock > rq->rt_period_expire) | ||
| 573 | return 1; | ||
| 574 | |||
| 575 | delta = rq->rt_period_expire - rq->clock; | ||
| 576 | do_div(delta, NSEC_PER_SEC / HZ); | ||
| 577 | |||
| 578 | return (unsigned long)delta; | ||
| 579 | } | ||
| 580 | |||
| 581 | /* | 725 | /* |
| 582 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 726 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
| 583 | */ | 727 | */ |
| @@ -590,22 +734,137 @@ unsigned long rt_needs_cpu(int cpu) | |||
| 590 | /* | 734 | /* |
| 591 | * Debugging: various feature bits | 735 | * Debugging: various feature bits |
| 592 | */ | 736 | */ |
| 737 | |||
| 738 | #define SCHED_FEAT(name, enabled) \ | ||
| 739 | __SCHED_FEAT_##name , | ||
| 740 | |||
| 593 | enum { | 741 | enum { |
| 594 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | 742 | #include "sched_features.h" |
| 595 | SCHED_FEAT_WAKEUP_PREEMPT = 2, | ||
| 596 | SCHED_FEAT_START_DEBIT = 4, | ||
| 597 | SCHED_FEAT_HRTICK = 8, | ||
| 598 | SCHED_FEAT_DOUBLE_TICK = 16, | ||
| 599 | }; | 743 | }; |
| 600 | 744 | ||
| 745 | #undef SCHED_FEAT | ||
| 746 | |||
| 747 | #define SCHED_FEAT(name, enabled) \ | ||
| 748 | (1UL << __SCHED_FEAT_##name) * enabled | | ||
| 749 | |||
| 601 | const_debug unsigned int sysctl_sched_features = | 750 | const_debug unsigned int sysctl_sched_features = |
| 602 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | | 751 | #include "sched_features.h" |
| 603 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 752 | 0; |
| 604 | SCHED_FEAT_START_DEBIT * 1 | | 753 | |
| 605 | SCHED_FEAT_HRTICK * 1 | | 754 | #undef SCHED_FEAT |
| 606 | SCHED_FEAT_DOUBLE_TICK * 0; | 755 | |
| 756 | #ifdef CONFIG_SCHED_DEBUG | ||
| 757 | #define SCHED_FEAT(name, enabled) \ | ||
| 758 | #name , | ||
| 759 | |||
| 760 | __read_mostly char *sched_feat_names[] = { | ||
| 761 | #include "sched_features.h" | ||
| 762 | NULL | ||
| 763 | }; | ||
| 764 | |||
| 765 | #undef SCHED_FEAT | ||
| 766 | |||
| 767 | int sched_feat_open(struct inode *inode, struct file *filp) | ||
| 768 | { | ||
| 769 | filp->private_data = inode->i_private; | ||
| 770 | return 0; | ||
| 771 | } | ||
| 772 | |||
| 773 | static ssize_t | ||
| 774 | sched_feat_read(struct file *filp, char __user *ubuf, | ||
| 775 | size_t cnt, loff_t *ppos) | ||
| 776 | { | ||
| 777 | char *buf; | ||
| 778 | int r = 0; | ||
| 779 | int len = 0; | ||
| 780 | int i; | ||
| 781 | |||
| 782 | for (i = 0; sched_feat_names[i]; i++) { | ||
| 783 | len += strlen(sched_feat_names[i]); | ||
| 784 | len += 4; | ||
| 785 | } | ||
| 786 | |||
| 787 | buf = kmalloc(len + 2, GFP_KERNEL); | ||
| 788 | if (!buf) | ||
| 789 | return -ENOMEM; | ||
| 790 | |||
| 791 | for (i = 0; sched_feat_names[i]; i++) { | ||
| 792 | if (sysctl_sched_features & (1UL << i)) | ||
| 793 | r += sprintf(buf + r, "%s ", sched_feat_names[i]); | ||
| 794 | else | ||
| 795 | r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); | ||
| 796 | } | ||
| 607 | 797 | ||
| 608 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 798 | r += sprintf(buf + r, "\n"); |
| 799 | WARN_ON(r >= len + 2); | ||
| 800 | |||
| 801 | r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
| 802 | |||
| 803 | kfree(buf); | ||
| 804 | |||
| 805 | return r; | ||
| 806 | } | ||
| 807 | |||
| 808 | static ssize_t | ||
| 809 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
| 810 | size_t cnt, loff_t *ppos) | ||
| 811 | { | ||
| 812 | char buf[64]; | ||
| 813 | char *cmp = buf; | ||
| 814 | int neg = 0; | ||
| 815 | int i; | ||
| 816 | |||
| 817 | if (cnt > 63) | ||
| 818 | cnt = 63; | ||
| 819 | |||
| 820 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 821 | return -EFAULT; | ||
| 822 | |||
| 823 | buf[cnt] = 0; | ||
| 824 | |||
| 825 | if (strncmp(buf, "NO_", 3) == 0) { | ||
| 826 | neg = 1; | ||
| 827 | cmp += 3; | ||
| 828 | } | ||
| 829 | |||
| 830 | for (i = 0; sched_feat_names[i]; i++) { | ||
| 831 | int len = strlen(sched_feat_names[i]); | ||
| 832 | |||
| 833 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
| 834 | if (neg) | ||
| 835 | sysctl_sched_features &= ~(1UL << i); | ||
| 836 | else | ||
| 837 | sysctl_sched_features |= (1UL << i); | ||
| 838 | break; | ||
| 839 | } | ||
| 840 | } | ||
| 841 | |||
| 842 | if (!sched_feat_names[i]) | ||
| 843 | return -EINVAL; | ||
| 844 | |||
| 845 | filp->f_pos += cnt; | ||
| 846 | |||
| 847 | return cnt; | ||
| 848 | } | ||
| 849 | |||
| 850 | static struct file_operations sched_feat_fops = { | ||
| 851 | .open = sched_feat_open, | ||
| 852 | .read = sched_feat_read, | ||
| 853 | .write = sched_feat_write, | ||
| 854 | }; | ||
| 855 | |||
| 856 | static __init int sched_init_debug(void) | ||
| 857 | { | ||
| 858 | debugfs_create_file("sched_features", 0644, NULL, NULL, | ||
| 859 | &sched_feat_fops); | ||
| 860 | |||
| 861 | return 0; | ||
| 862 | } | ||
| 863 | late_initcall(sched_init_debug); | ||
| 864 | |||
| 865 | #endif | ||
| 866 | |||
| 867 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
| 609 | 868 | ||
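The debugfs file above prints the feature mask as a list of names, prefixing disabled ones with NO_, and parses writes the same way. The userspace sketch below reproduces just the name-table/bitmask handling; the feature names are illustrative (the real list is generated from sched_features.h) and the debugfs plumbing is not modelled:

#include <stdio.h>
#include <string.h>

/* Illustrative feature names; the real list lives in sched_features.h. */
static const char *feat_names[] = {
	"NEW_FAIR_SLEEPERS", "WAKEUP_PREEMPT", "START_DEBIT", "HRTICK", NULL
};

static unsigned int features = 0x0f;	/* all of the above enabled */

/* Mirrors sched_feat_write(): "NAME" sets a bit, "NO_NAME" clears it. */
static int set_feature(const char *cmp)
{
	int neg = 0, i;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}
	for (i = 0; feat_names[i]; i++) {
		if (strcmp(cmp, feat_names[i]) == 0) {
			if (neg)
				features &= ~(1U << i);
			else
				features |= 1U << i;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	int i;

	set_feature("NO_HRTICK");
	for (i = 0; feat_names[i]; i++)
		printf("%s%s ", (features & (1U << i)) ? "" : "NO_",
		       feat_names[i]);
	printf("\n");
	return 0;
}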
| 610 | /* | 869 | /* |
| 611 | * Number of tasks to iterate in a single balance run. | 870 | * Number of tasks to iterate in a single balance run. |
| @@ -627,16 +886,52 @@ static __read_mostly int scheduler_running; | |||
| 627 | */ | 886 | */ |
| 628 | int sysctl_sched_rt_runtime = 950000; | 887 | int sysctl_sched_rt_runtime = 950000; |
| 629 | 888 | ||
| 630 | /* | 889 | static inline u64 global_rt_period(void) |
| 631 | * single value that denotes runtime == period, ie unlimited time. | 890 | { |
| 632 | */ | 891 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
| 633 | #define RUNTIME_INF ((u64)~0ULL) | 892 | } |
| 893 | |||
| 894 | static inline u64 global_rt_runtime(void) | ||
| 895 | { | ||
| 896 | if (sysctl_sched_rt_period < 0) | ||
| 897 | return RUNTIME_INF; | ||
| 898 | |||
| 899 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
| 900 | } | ||
| 901 | |||
| 902 | static const unsigned long long time_sync_thresh = 100000; | ||
| 903 | |||
| 904 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
| 905 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
| 634 | 906 | ||
| 635 | /* | 907 | /* |
| 636 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 908 | * Global lock which we take every now and then to synchronize |
| 637 | * clock constructed from sched_clock(): | 909 | * the CPUs' time. This method is not warp-safe, but it's good |
| 910 | * enough to synchronize slowly diverging time sources and thus | ||
| 911 | * it's good enough for tracing: | ||
| 638 | */ | 912 | */ |
| 639 | unsigned long long cpu_clock(int cpu) | 913 | static DEFINE_SPINLOCK(time_sync_lock); |
| 914 | static unsigned long long prev_global_time; | ||
| 915 | |||
| 916 | static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) | ||
| 917 | { | ||
| 918 | unsigned long flags; | ||
| 919 | |||
| 920 | spin_lock_irqsave(&time_sync_lock, flags); | ||
| 921 | |||
| 922 | if (time < prev_global_time) { | ||
| 923 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
| 924 | time = prev_global_time; | ||
| 925 | } else { | ||
| 926 | prev_global_time = time; | ||
| 927 | } | ||
| 928 | |||
| 929 | spin_unlock_irqrestore(&time_sync_lock, flags); | ||
| 930 | |||
| 931 | return time; | ||
| 932 | } | ||
| 933 | |||
| 934 | static unsigned long long __cpu_clock(int cpu) | ||
| 640 | { | 935 | { |
| 641 | unsigned long long now; | 936 | unsigned long long now; |
| 642 | unsigned long flags; | 937 | unsigned long flags; |
| @@ -657,6 +952,24 @@ unsigned long long cpu_clock(int cpu) | |||
| 657 | 952 | ||
| 658 | return now; | 953 | return now; |
| 659 | } | 954 | } |
| 955 | |||
| 956 | /* | ||
| 957 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
| 958 | * clock constructed from sched_clock(): | ||
| 959 | */ | ||
| 960 | unsigned long long cpu_clock(int cpu) | ||
| 961 | { | ||
| 962 | unsigned long long prev_cpu_time, time, delta_time; | ||
| 963 | |||
| 964 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
| 965 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
| 966 | delta_time = time - prev_cpu_time; | ||
| 967 | |||
| 968 | if (unlikely(delta_time > time_sync_thresh)) | ||
| 969 | time = __sync_cpu_clock(time, cpu); | ||
| 970 | |||
| 971 | return time; | ||
| 972 | } | ||
| 660 | EXPORT_SYMBOL_GPL(cpu_clock); | 973 | EXPORT_SYMBOL_GPL(cpu_clock); |
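The hunk above keeps a per-cpu offset and occasionally clamps each CPU's reading against a shared prev_global_time, so readings taken on different CPUs never fall behind the last globally observed value. A rough user-space sketch of the same clamp-under-a-lock idea, with a mutex standing in for the spinlock (all names here are illustrative):

    #include <stdio.h>
    #include <pthread.h>

    #define NCPU 4

    static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long long prev_global;    /* last globally seen time */
    static unsigned long long offset[NCPU];   /* per-"cpu" correction    */

    /* Clamp a per-cpu reading so it never falls behind the global maximum. */
    static unsigned long long sync_clock(unsigned long long t, int cpu)
    {
            pthread_mutex_lock(&sync_lock);
            if (t < prev_global) {
                    offset[cpu] += prev_global - t;  /* remember the correction */
                    t = prev_global;
            } else {
                    prev_global = t;
            }
            pthread_mutex_unlock(&sync_lock);
            return t;
    }

    int main(void)
    {
            /* CPU 1's raw clock lags CPU 0's: the second reading is pulled up. */
            printf("%llu\n", sync_clock(1000, 0));   /* 1000, becomes global max */
            printf("%llu\n", sync_clock(900, 1));    /* clamped up to 1000       */
            printf("offset[1] = %llu\n", offset[1]); /* 100                      */
            return 0;
    }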
| 661 | 974 | ||
| 662 | #ifndef prepare_arch_switch | 975 | #ifndef prepare_arch_switch |
| @@ -1052,6 +1365,49 @@ static void resched_cpu(int cpu) | |||
| 1052 | resched_task(cpu_curr(cpu)); | 1365 | resched_task(cpu_curr(cpu)); |
| 1053 | spin_unlock_irqrestore(&rq->lock, flags); | 1366 | spin_unlock_irqrestore(&rq->lock, flags); |
| 1054 | } | 1367 | } |
| 1368 | |||
| 1369 | #ifdef CONFIG_NO_HZ | ||
| 1370 | /* | ||
| 1371 | * When add_timer_on() enqueues a timer into the timer wheel of an | ||
| 1372 | * idle CPU then this timer might expire before the next timer event | ||
| 1373 | * which is scheduled to wake up that CPU. In case of a completely | ||
| 1374 | * idle system the next event might even be infinite time into the | ||
| 1375 | * future. wake_up_idle_cpu() ensures that the CPU is woken up and | ||
| 1376 | * leaves the inner idle loop so the newly added timer is taken into | ||
| 1377 | * account when the CPU goes back to idle and evaluates the timer | ||
| 1378 | * wheel for the next timer event. | ||
| 1379 | */ | ||
| 1380 | void wake_up_idle_cpu(int cpu) | ||
| 1381 | { | ||
| 1382 | struct rq *rq = cpu_rq(cpu); | ||
| 1383 | |||
| 1384 | if (cpu == smp_processor_id()) | ||
| 1385 | return; | ||
| 1386 | |||
| 1387 | /* | ||
| 1388 | * This is safe, as this function is called with the timer | ||
| 1389 | * wheel base lock of (cpu) held. When the CPU is on the way | ||
| 1390 | * to idle and has not yet set rq->curr to idle then it will | ||
| 1391 | * be serialized on the timer wheel base lock and take the new | ||
| 1392 | * timer into account automatically. | ||
| 1393 | */ | ||
| 1394 | if (rq->curr != rq->idle) | ||
| 1395 | return; | ||
| 1396 | |||
| 1397 | /* | ||
| 1398 | * We can set TIF_RESCHED on the idle task of the other CPU | ||
| 1399 | * lockless. The worst case is that the other CPU runs the | ||
| 1400 | * idle task through an additional NOOP schedule() | ||
| 1401 | */ | ||
| 1402 | set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); | ||
| 1403 | |||
| 1404 | /* NEED_RESCHED must be visible before we test polling */ | ||
| 1405 | smp_mb(); | ||
| 1406 | if (!tsk_is_polling(rq->idle)) | ||
| 1407 | smp_send_reschedule(cpu); | ||
| 1408 | } | ||
| 1409 | #endif | ||
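wake_up_idle_cpu() relies on the classic "store the flag, full barrier, then test whether the other side is polling" ordering to decide whether an IPI is needed at all. A hedged user-space approximation of that ordering with C11 atomics (the two flags stand in for TIF_NEED_RESCHED and the idle loop's polling flag; this sketches the memory-ordering pattern, not the kernel's thread_info handling):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool need_resched;   /* stands in for TIF_NEED_RESCHED   */
    static atomic_bool idle_polling;   /* stands in for TIF_POLLING_NRFLAG */

    /* The waker side: set the flag, then decide whether an IPI is needed. */
    static void wake_idle(void)
    {
            atomic_store_explicit(&need_resched, true, memory_order_relaxed);

            /* NEED_RESCHED must be globally visible before we test polling. */
            atomic_thread_fence(memory_order_seq_cst);

            if (!atomic_load_explicit(&idle_polling, memory_order_relaxed))
                    printf("would send reschedule IPI\n");
            else
                    printf("idle loop is polling, no IPI needed\n");
    }

    int main(void)
    {
            atomic_store(&idle_polling, true);
            wake_idle();                       /* polling: no IPI   */

            atomic_store(&idle_polling, false);
            wake_idle();                       /* not polling: IPI  */
            return 0;
    }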
| 1410 | |||
| 1055 | #else | 1411 | #else |
| 1056 | static void __resched_task(struct task_struct *p, int tif_bit) | 1412 | static void __resched_task(struct task_struct *p, int tif_bit) |
| 1057 | { | 1413 | { |
| @@ -1073,6 +1429,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) | |||
| 1073 | */ | 1429 | */ |
| 1074 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1430 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
| 1075 | 1431 | ||
| 1432 | /* | ||
| 1433 | * delta *= weight / lw | ||
| 1434 | */ | ||
| 1076 | static unsigned long | 1435 | static unsigned long |
| 1077 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1436 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
| 1078 | struct load_weight *lw) | 1437 | struct load_weight *lw) |
| @@ -1095,12 +1454,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
| 1095 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1454 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
| 1096 | } | 1455 | } |
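The new one-line comment documents calc_delta_mine() as computing delta *= weight / lw; the function body itself is unchanged and not part of this hunk. A simplified numeric sketch of that scaling, leaving out the fixed-point rounding the real helper performs:

    #include <stdio.h>

    /* Simplified: scale an execution delta by weight / total_queue_weight.
     * The kernel version works in fixed point and rounds; this just uses
     * 64-bit integer math to show the proportion. */
    static unsigned long long
    scale_delta(unsigned long long delta, unsigned long weight, unsigned long lw)
    {
            return delta * weight / lw;
    }

    int main(void)
    {
            /* A weight-1024 entity on a queue with total weight 3072
             * (three such entities) is credited one third of the delta. */
            printf("%llu\n", scale_delta(6000000, 1024, 3072)); /* 2000000 */

            /* A heavier entity on the same queue gets proportionally more. */
            printf("%llu\n", scale_delta(6000000, 2048, 3072)); /* 4000000 */
            return 0;
    }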
| 1097 | 1456 | ||
| 1098 | static inline unsigned long | ||
| 1099 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | ||
| 1100 | { | ||
| 1101 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | ||
| 1102 | } | ||
| 1103 | |||
| 1104 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1457 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
| 1105 | { | 1458 | { |
| 1106 | lw->weight += inc; | 1459 | lw->weight += inc; |
| @@ -1198,11 +1551,347 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
| 1198 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1551 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
| 1199 | #endif | 1552 | #endif |
| 1200 | 1553 | ||
| 1554 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
| 1555 | { | ||
| 1556 | update_load_add(&rq->load, load); | ||
| 1557 | } | ||
| 1558 | |||
| 1559 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
| 1560 | { | ||
| 1561 | update_load_sub(&rq->load, load); | ||
| 1562 | } | ||
| 1563 | |||
| 1201 | #ifdef CONFIG_SMP | 1564 | #ifdef CONFIG_SMP |
| 1202 | static unsigned long source_load(int cpu, int type); | 1565 | static unsigned long source_load(int cpu, int type); |
| 1203 | static unsigned long target_load(int cpu, int type); | 1566 | static unsigned long target_load(int cpu, int type); |
| 1204 | static unsigned long cpu_avg_load_per_task(int cpu); | 1567 | static unsigned long cpu_avg_load_per_task(int cpu); |
| 1205 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1568 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
| 1569 | |||
| 1570 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1571 | |||
| 1572 | /* | ||
| 1573 | * Group load balancing. | ||
| 1574 | * | ||
| 1575 | * We calculate a few balance domain wide aggregate numbers; load and weight. | ||
| 1576 | * Given the pictures below, and assuming each item has equal weight: | ||
| 1577 | * | ||
| 1578 | * root 1 - thread | ||
| 1579 | * / | \ A - group | ||
| 1580 | * A 1 B | ||
| 1581 | * /|\ / \ | ||
| 1582 | * C 2 D 3 4 | ||
| 1583 | * | | | ||
| 1584 | * 5 6 | ||
| 1585 | * | ||
| 1586 | * load: | ||
| 1587 | * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, | ||
| 1588 | * which equals 1/9-th of the total load. | ||
| 1589 | * | ||
| 1590 | * shares: | ||
| 1591 | * The weight of this group on the selected cpus. | ||
| 1592 | * | ||
| 1593 | * rq_weight: | ||
| 1594 | * Direct sum of all the cpus' rq weights, e.g. A would get 3 while | ||
| 1595 | * B would get 2. | ||
| 1596 | * | ||
| 1597 | * task_weight: | ||
| 1598 | * Part of the rq_weight contributed by tasks; all groups except B would | ||
| 1599 | * get 1, B gets 2. | ||
| 1600 | */ | ||
| 1601 | |||
| 1602 | static inline struct aggregate_struct * | ||
| 1603 | aggregate(struct task_group *tg, struct sched_domain *sd) | ||
| 1604 | { | ||
| 1605 | return &tg->cfs_rq[sd->first_cpu]->aggregate; | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); | ||
| 1609 | |||
| 1610 | /* | ||
| 1611 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
| 1612 | * leaving it for the final time. | ||
| 1613 | */ | ||
| 1614 | static | ||
| 1615 | void aggregate_walk_tree(aggregate_func down, aggregate_func up, | ||
| 1616 | struct sched_domain *sd) | ||
| 1617 | { | ||
| 1618 | struct task_group *parent, *child; | ||
| 1619 | |||
| 1620 | rcu_read_lock(); | ||
| 1621 | parent = &root_task_group; | ||
| 1622 | down: | ||
| 1623 | (*down)(parent, sd); | ||
| 1624 | list_for_each_entry_rcu(child, &parent->children, siblings) { | ||
| 1625 | parent = child; | ||
| 1626 | goto down; | ||
| 1627 | |||
| 1628 | up: | ||
| 1629 | continue; | ||
| 1630 | } | ||
| 1631 | (*up)(parent, sd); | ||
| 1632 | |||
| 1633 | child = parent; | ||
| 1634 | parent = parent->parent; | ||
| 1635 | if (parent) | ||
| 1636 | goto up; | ||
| 1637 | rcu_read_unlock(); | ||
| 1638 | } | ||
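aggregate_walk_tree() expresses a depth-first walk with gotos so it can run without recursion; functionally it calls @down on each group when first entered (pre-order) and @up when the group is left for the final time (post-order). The equivalent recursive shape, as a stand-alone sketch (struct group and its child array are invented for illustration):

    #include <stdio.h>

    struct group {
            const char   *name;
            struct group *children[4];   /* NULL-terminated, small for the demo */
    };

    typedef void (*visit_fn)(struct group *);

    /* Pre-order "down", post-order "up": the same visiting order the
     * goto-based aggregate_walk_tree() produces. */
    static void walk(struct group *g, visit_fn down, visit_fn up)
    {
            down(g);
            for (int i = 0; g->children[i]; i++)
                    walk(g->children[i], down, up);
            up(g);
    }

    static void pr_down(struct group *g) { printf("down %s\n", g->name); }
    static void pr_up(struct group *g)   { printf("up   %s\n", g->name); }

    int main(void)
    {
            struct group c = { "C", { 0 } }, d = { "D", { 0 } };
            struct group a = { "A", { &c, &d, 0 } };
            struct group root = { "root", { &a, 0 } };

            walk(&root, pr_down, pr_up);
            /* down root, down A, down C, up C, down D, up D, up A, up root */
            return 0;
    }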
| 1639 | |||
| 1640 | /* | ||
| 1641 | * Calculate the aggregate runqueue weight. | ||
| 1642 | */ | ||
| 1643 | static | ||
| 1644 | void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) | ||
| 1645 | { | ||
| 1646 | unsigned long rq_weight = 0; | ||
| 1647 | unsigned long task_weight = 0; | ||
| 1648 | int i; | ||
| 1649 | |||
| 1650 | for_each_cpu_mask(i, sd->span) { | ||
| 1651 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
| 1652 | task_weight += tg->cfs_rq[i]->task_weight; | ||
| 1653 | } | ||
| 1654 | |||
| 1655 | aggregate(tg, sd)->rq_weight = rq_weight; | ||
| 1656 | aggregate(tg, sd)->task_weight = task_weight; | ||
| 1657 | } | ||
| 1658 | |||
| 1659 | /* | ||
| 1660 | * Compute the weight of this group on the given cpus. | ||
| 1661 | */ | ||
| 1662 | static | ||
| 1663 | void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) | ||
| 1664 | { | ||
| 1665 | unsigned long shares = 0; | ||
| 1666 | int i; | ||
| 1667 | |||
| 1668 | for_each_cpu_mask(i, sd->span) | ||
| 1669 | shares += tg->cfs_rq[i]->shares; | ||
| 1670 | |||
| 1671 | if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) | ||
| 1672 | shares = tg->shares; | ||
| 1673 | |||
| 1674 | aggregate(tg, sd)->shares = shares; | ||
| 1675 | } | ||
| 1676 | |||
| 1677 | /* | ||
| 1678 | * Compute the load fraction assigned to this group, relies on the aggregate | ||
| 1679 | * weight and this group's parent's load, i.e. top-down. | ||
| 1680 | */ | ||
| 1681 | static | ||
| 1682 | void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) | ||
| 1683 | { | ||
| 1684 | unsigned long load; | ||
| 1685 | |||
| 1686 | if (!tg->parent) { | ||
| 1687 | int i; | ||
| 1688 | |||
| 1689 | load = 0; | ||
| 1690 | for_each_cpu_mask(i, sd->span) | ||
| 1691 | load += cpu_rq(i)->load.weight; | ||
| 1692 | |||
| 1693 | } else { | ||
| 1694 | load = aggregate(tg->parent, sd)->load; | ||
| 1695 | |||
| 1696 | /* | ||
| 1697 | * shares is our weight in the parent's rq so | ||
| 1698 | * shares/parent->rq_weight gives our fraction of the load | ||
| 1699 | */ | ||
| 1700 | load *= aggregate(tg, sd)->shares; | ||
| 1701 | load /= aggregate(tg->parent, sd)->rq_weight + 1; | ||
| 1702 | } | ||
| 1703 | |||
| 1704 | aggregate(tg, sd)->load = load; | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
| 1708 | |||
| 1709 | /* | ||
| 1710 | * Calculate and set the cpu's group shares. | ||
| 1711 | */ | ||
| 1712 | static void | ||
| 1713 | __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, | ||
| 1714 | int tcpu) | ||
| 1715 | { | ||
| 1716 | int boost = 0; | ||
| 1717 | unsigned long shares; | ||
| 1718 | unsigned long rq_weight; | ||
| 1719 | |||
| 1720 | if (!tg->se[tcpu]) | ||
| 1721 | return; | ||
| 1722 | |||
| 1723 | rq_weight = tg->cfs_rq[tcpu]->load.weight; | ||
| 1724 | |||
| 1725 | /* | ||
| 1726 | * If there are currently no tasks on the cpu pretend there is one of | ||
| 1727 | * average load so that when a new task gets to run here it will not | ||
| 1728 | * get delayed by group starvation. | ||
| 1729 | */ | ||
| 1730 | if (!rq_weight) { | ||
| 1731 | boost = 1; | ||
| 1732 | rq_weight = NICE_0_LOAD; | ||
| 1733 | } | ||
| 1734 | |||
| 1735 | /* | ||
| 1736 | * \Sum shares * rq_weight | ||
| 1737 | * shares = ----------------------- | ||
| 1738 | * \Sum rq_weight | ||
| 1739 | * | ||
| 1740 | */ | ||
| 1741 | shares = aggregate(tg, sd)->shares * rq_weight; | ||
| 1742 | shares /= aggregate(tg, sd)->rq_weight + 1; | ||
| 1743 | |||
| 1744 | /* | ||
| 1745 | * record the actual number of shares, not the boosted amount. | ||
| 1746 | */ | ||
| 1747 | tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; | ||
| 1748 | |||
| 1749 | if (shares < MIN_SHARES) | ||
| 1750 | shares = MIN_SHARES; | ||
| 1751 | |||
| 1752 | __set_se_shares(tg->se[tcpu], shares); | ||
| 1753 | } | ||
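The ASCII formula in __update_group_shares_cpu() splits the group's aggregate shares across CPUs in proportion to each CPU's rq weight, with a floor at MIN_SHARES and a "boost" to one nice-0 task's worth of weight when a CPU currently runs nothing from the group. A stand-alone sketch of just the proportional split (the MIN_SHARES and NICE_0_LOAD values are assumptions, they are not part of this hunk):

    #include <stdio.h>

    #define MIN_SHARES   2       /* assumed floor                       */
    #define NICE_0_LOAD  1024    /* assumed weight of one nice-0 task   */

    /*
     *              group_shares * rq_weight_cpu
     * shares_cpu = ----------------------------
     *                 sum of all rq_weights
     */
    static unsigned long
    cpu_shares(unsigned long group_shares, unsigned long rq_weight,
               unsigned long total_rq_weight)
    {
            unsigned long s;

            if (!rq_weight)               /* idle cpu: pretend one nice-0 task */
                    rq_weight = NICE_0_LOAD;

            s = group_shares * rq_weight / (total_rq_weight + 1);
            return s < MIN_SHARES ? MIN_SHARES : s;
    }

    int main(void)
    {
            /* A group with 1024 shares, two cpus carrying 2048 and 1024 weight:
             * the busier cpu gets roughly two thirds of the group's shares. */
            printf("%lu\n", cpu_shares(1024, 2048, 3072));  /* ~682 */
            printf("%lu\n", cpu_shares(1024, 1024, 3072));  /* ~341 */
            return 0;
    }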
| 1754 | |||
| 1755 | /* | ||
| 1756 | * Re-adjust the weights on the cpu the task came from and on the cpu the | ||
| 1757 | * task went to. | ||
| 1758 | */ | ||
| 1759 | static void | ||
| 1760 | __move_group_shares(struct task_group *tg, struct sched_domain *sd, | ||
| 1761 | int scpu, int dcpu) | ||
| 1762 | { | ||
| 1763 | unsigned long shares; | ||
| 1764 | |||
| 1765 | shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | ||
| 1766 | |||
| 1767 | __update_group_shares_cpu(tg, sd, scpu); | ||
| 1768 | __update_group_shares_cpu(tg, sd, dcpu); | ||
| 1769 | |||
| 1770 | /* | ||
| 1771 | * ensure we never lose shares due to rounding errors in the | ||
| 1772 | * above redistribution. | ||
| 1773 | */ | ||
| 1774 | shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | ||
| 1775 | if (shares) | ||
| 1776 | tg->cfs_rq[dcpu]->shares += shares; | ||
| 1777 | } | ||
| 1778 | |||
| 1779 | /* | ||
| 1780 | * Because changing a group's shares changes the weight of the super-group | ||
| 1781 | * we need to walk up the tree and change all shares until we hit the root. | ||
| 1782 | */ | ||
| 1783 | static void | ||
| 1784 | move_group_shares(struct task_group *tg, struct sched_domain *sd, | ||
| 1785 | int scpu, int dcpu) | ||
| 1786 | { | ||
| 1787 | while (tg) { | ||
| 1788 | __move_group_shares(tg, sd, scpu, dcpu); | ||
| 1789 | tg = tg->parent; | ||
| 1790 | } | ||
| 1791 | } | ||
| 1792 | |||
| 1793 | static | ||
| 1794 | void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) | ||
| 1795 | { | ||
| 1796 | unsigned long shares = aggregate(tg, sd)->shares; | ||
| 1797 | int i; | ||
| 1798 | |||
| 1799 | for_each_cpu_mask(i, sd->span) { | ||
| 1800 | struct rq *rq = cpu_rq(i); | ||
| 1801 | unsigned long flags; | ||
| 1802 | |||
| 1803 | spin_lock_irqsave(&rq->lock, flags); | ||
| 1804 | __update_group_shares_cpu(tg, sd, i); | ||
| 1805 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1806 | } | ||
| 1807 | |||
| 1808 | aggregate_group_shares(tg, sd); | ||
| 1809 | |||
| 1810 | /* | ||
| 1811 | * ensure we never lose shares due to rounding errors in the | ||
| 1812 | * above redistribution. | ||
| 1813 | */ | ||
| 1814 | shares -= aggregate(tg, sd)->shares; | ||
| 1815 | if (shares) { | ||
| 1816 | tg->cfs_rq[sd->first_cpu]->shares += shares; | ||
| 1817 | aggregate(tg, sd)->shares += shares; | ||
| 1818 | } | ||
| 1819 | } | ||
| 1820 | |||
| 1821 | /* | ||
| 1822 | * Calculate the cumulative weight and recursive load of each task group | ||
| 1823 | * while walking down the tree. | ||
| 1824 | */ | ||
| 1825 | static | ||
| 1826 | void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) | ||
| 1827 | { | ||
| 1828 | aggregate_group_weight(tg, sd); | ||
| 1829 | aggregate_group_shares(tg, sd); | ||
| 1830 | aggregate_group_load(tg, sd); | ||
| 1831 | } | ||
| 1832 | |||
| 1833 | /* | ||
| 1834 | * Rebalance the cpu shares while walking back up the tree. | ||
| 1835 | */ | ||
| 1836 | static | ||
| 1837 | void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) | ||
| 1838 | { | ||
| 1839 | aggregate_group_set_shares(tg, sd); | ||
| 1840 | } | ||
| 1841 | |||
| 1842 | static DEFINE_PER_CPU(spinlock_t, aggregate_lock); | ||
| 1843 | |||
| 1844 | static void __init init_aggregate(void) | ||
| 1845 | { | ||
| 1846 | int i; | ||
| 1847 | |||
| 1848 | for_each_possible_cpu(i) | ||
| 1849 | spin_lock_init(&per_cpu(aggregate_lock, i)); | ||
| 1850 | } | ||
| 1851 | |||
| 1852 | static int get_aggregate(struct sched_domain *sd) | ||
| 1853 | { | ||
| 1854 | if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) | ||
| 1855 | return 0; | ||
| 1856 | |||
| 1857 | aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); | ||
| 1858 | return 1; | ||
| 1859 | } | ||
| 1860 | |||
| 1861 | static void put_aggregate(struct sched_domain *sd) | ||
| 1862 | { | ||
| 1863 | spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); | ||
| 1864 | } | ||
| 1865 | |||
| 1866 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
| 1867 | { | ||
| 1868 | cfs_rq->shares = shares; | ||
| 1869 | } | ||
| 1870 | |||
| 1871 | #else | ||
| 1872 | |||
| 1873 | static inline void init_aggregate(void) | ||
| 1874 | { | ||
| 1875 | } | ||
| 1876 | |||
| 1877 | static inline int get_aggregate(struct sched_domain *sd) | ||
| 1878 | { | ||
| 1879 | return 0; | ||
| 1880 | } | ||
| 1881 | |||
| 1882 | static inline void put_aggregate(struct sched_domain *sd) | ||
| 1883 | { | ||
| 1884 | } | ||
| 1885 | #endif | ||
| 1886 | |||
| 1887 | #else /* CONFIG_SMP */ | ||
| 1888 | |||
| 1889 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1890 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
| 1891 | { | ||
| 1892 | } | ||
| 1893 | #endif | ||
| 1894 | |||
| 1206 | #endif /* CONFIG_SMP */ | 1895 | #endif /* CONFIG_SMP */ |
| 1207 | 1896 | ||
| 1208 | #include "sched_stats.h" | 1897 | #include "sched_stats.h" |
| @@ -1215,26 +1904,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | |||
| 1215 | 1904 | ||
| 1216 | #define sched_class_highest (&rt_sched_class) | 1905 | #define sched_class_highest (&rt_sched_class) |
| 1217 | 1906 | ||
| 1218 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 1907 | static void inc_nr_running(struct rq *rq) |
| 1219 | { | ||
| 1220 | update_load_add(&rq->load, p->se.load.weight); | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
| 1224 | { | ||
| 1225 | update_load_sub(&rq->load, p->se.load.weight); | ||
| 1226 | } | ||
| 1227 | |||
| 1228 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
| 1229 | { | 1908 | { |
| 1230 | rq->nr_running++; | 1909 | rq->nr_running++; |
| 1231 | inc_load(rq, p); | ||
| 1232 | } | 1910 | } |
| 1233 | 1911 | ||
| 1234 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1912 | static void dec_nr_running(struct rq *rq) |
| 1235 | { | 1913 | { |
| 1236 | rq->nr_running--; | 1914 | rq->nr_running--; |
| 1237 | dec_load(rq, p); | ||
| 1238 | } | 1915 | } |
| 1239 | 1916 | ||
| 1240 | static void set_load_weight(struct task_struct *p) | 1917 | static void set_load_weight(struct task_struct *p) |
| @@ -1326,7 +2003,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 1326 | rq->nr_uninterruptible--; | 2003 | rq->nr_uninterruptible--; |
| 1327 | 2004 | ||
| 1328 | enqueue_task(rq, p, wakeup); | 2005 | enqueue_task(rq, p, wakeup); |
| 1329 | inc_nr_running(p, rq); | 2006 | inc_nr_running(rq); |
| 1330 | } | 2007 | } |
| 1331 | 2008 | ||
| 1332 | /* | 2009 | /* |
| @@ -1338,7 +2015,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
| 1338 | rq->nr_uninterruptible++; | 2015 | rq->nr_uninterruptible++; |
| 1339 | 2016 | ||
| 1340 | dequeue_task(rq, p, sleep); | 2017 | dequeue_task(rq, p, sleep); |
| 1341 | dec_nr_running(p, rq); | 2018 | dec_nr_running(rq); |
| 1342 | } | 2019 | } |
| 1343 | 2020 | ||
| 1344 | /** | 2021 | /** |
| @@ -1395,7 +2072,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 1395 | /* | 2072 | /* |
| 1396 | * Buddy candidates are cache hot: | 2073 | * Buddy candidates are cache hot: |
| 1397 | */ | 2074 | */ |
| 1398 | if (&p->se == cfs_rq_of(&p->se)->next) | 2075 | if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) |
| 1399 | return 1; | 2076 | return 1; |
| 1400 | 2077 | ||
| 1401 | if (p->sched_class != &fair_sched_class) | 2078 | if (p->sched_class != &fair_sched_class) |
| @@ -1685,17 +2362,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 1685 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 2362 | * find_idlest_cpu - find the idlest cpu among the cpus in group. |
| 1686 | */ | 2363 | */ |
| 1687 | static int | 2364 | static int |
| 1688 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 2365 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, |
| 2366 | cpumask_t *tmp) | ||
| 1689 | { | 2367 | { |
| 1690 | cpumask_t tmp; | ||
| 1691 | unsigned long load, min_load = ULONG_MAX; | 2368 | unsigned long load, min_load = ULONG_MAX; |
| 1692 | int idlest = -1; | 2369 | int idlest = -1; |
| 1693 | int i; | 2370 | int i; |
| 1694 | 2371 | ||
| 1695 | /* Traverse only the allowed CPUs */ | 2372 | /* Traverse only the allowed CPUs */ |
| 1696 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 2373 | cpus_and(*tmp, group->cpumask, p->cpus_allowed); |
| 1697 | 2374 | ||
| 1698 | for_each_cpu_mask(i, tmp) { | 2375 | for_each_cpu_mask(i, *tmp) { |
| 1699 | load = weighted_cpuload(i); | 2376 | load = weighted_cpuload(i); |
| 1700 | 2377 | ||
| 1701 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2378 | if (load < min_load || (load == min_load && i == this_cpu)) { |
| @@ -1734,7 +2411,7 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1734 | } | 2411 | } |
| 1735 | 2412 | ||
| 1736 | while (sd) { | 2413 | while (sd) { |
| 1737 | cpumask_t span; | 2414 | cpumask_t span, tmpmask; |
| 1738 | struct sched_group *group; | 2415 | struct sched_group *group; |
| 1739 | int new_cpu, weight; | 2416 | int new_cpu, weight; |
| 1740 | 2417 | ||
| @@ -1750,7 +2427,7 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1750 | continue; | 2427 | continue; |
| 1751 | } | 2428 | } |
| 1752 | 2429 | ||
| 1753 | new_cpu = find_idlest_cpu(group, t, cpu); | 2430 | new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); |
| 1754 | if (new_cpu == -1 || new_cpu == cpu) { | 2431 | if (new_cpu == -1 || new_cpu == cpu) { |
| 1755 | /* Now try balancing at a lower domain level of cpu */ | 2432 | /* Now try balancing at a lower domain level of cpu */ |
| 1756 | sd = sd->child; | 2433 | sd = sd->child; |
| @@ -1796,6 +2473,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1796 | long old_state; | 2473 | long old_state; |
| 1797 | struct rq *rq; | 2474 | struct rq *rq; |
| 1798 | 2475 | ||
| 2476 | if (!sched_feat(SYNC_WAKEUPS)) | ||
| 2477 | sync = 0; | ||
| 2478 | |||
| 1799 | smp_wmb(); | 2479 | smp_wmb(); |
| 1800 | rq = task_rq_lock(p, &flags); | 2480 | rq = task_rq_lock(p, &flags); |
| 1801 | old_state = p->state; | 2481 | old_state = p->state; |
| @@ -1912,6 +2592,7 @@ static void __sched_fork(struct task_struct *p) | |||
| 1912 | 2592 | ||
| 1913 | INIT_LIST_HEAD(&p->rt.run_list); | 2593 | INIT_LIST_HEAD(&p->rt.run_list); |
| 1914 | p->se.on_rq = 0; | 2594 | p->se.on_rq = 0; |
| 2595 | INIT_LIST_HEAD(&p->se.group_node); | ||
| 1915 | 2596 | ||
| 1916 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2597 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| 1917 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2598 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
| @@ -1987,7 +2668,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 1987 | * management (if any): | 2668 | * management (if any): |
| 1988 | */ | 2669 | */ |
| 1989 | p->sched_class->task_new(rq, p); | 2670 | p->sched_class->task_new(rq, p); |
| 1990 | inc_nr_running(p, rq); | 2671 | inc_nr_running(rq); |
| 1991 | } | 2672 | } |
| 1992 | check_preempt_curr(rq, p); | 2673 | check_preempt_curr(rq, p); |
| 1993 | #ifdef CONFIG_SMP | 2674 | #ifdef CONFIG_SMP |
| @@ -2631,7 +3312,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2631 | static struct sched_group * | 3312 | static struct sched_group * |
| 2632 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3313 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 2633 | unsigned long *imbalance, enum cpu_idle_type idle, | 3314 | unsigned long *imbalance, enum cpu_idle_type idle, |
| 2634 | int *sd_idle, cpumask_t *cpus, int *balance) | 3315 | int *sd_idle, const cpumask_t *cpus, int *balance) |
| 2635 | { | 3316 | { |
| 2636 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 3317 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 2637 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 3318 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| @@ -2932,7 +3613,7 @@ ret: | |||
| 2932 | */ | 3613 | */ |
| 2933 | static struct rq * | 3614 | static struct rq * |
| 2934 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 3615 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, |
| 2935 | unsigned long imbalance, cpumask_t *cpus) | 3616 | unsigned long imbalance, const cpumask_t *cpus) |
| 2936 | { | 3617 | { |
| 2937 | struct rq *busiest = NULL, *rq; | 3618 | struct rq *busiest = NULL, *rq; |
| 2938 | unsigned long max_load = 0; | 3619 | unsigned long max_load = 0; |
| @@ -2971,14 +3652,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
| 2971 | */ | 3652 | */ |
| 2972 | static int load_balance(int this_cpu, struct rq *this_rq, | 3653 | static int load_balance(int this_cpu, struct rq *this_rq, |
| 2973 | struct sched_domain *sd, enum cpu_idle_type idle, | 3654 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 2974 | int *balance) | 3655 | int *balance, cpumask_t *cpus) |
| 2975 | { | 3656 | { |
| 2976 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3657 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
| 2977 | struct sched_group *group; | 3658 | struct sched_group *group; |
| 2978 | unsigned long imbalance; | 3659 | unsigned long imbalance; |
| 2979 | struct rq *busiest; | 3660 | struct rq *busiest; |
| 2980 | cpumask_t cpus = CPU_MASK_ALL; | ||
| 2981 | unsigned long flags; | 3661 | unsigned long flags; |
| 3662 | int unlock_aggregate; | ||
| 3663 | |||
| 3664 | cpus_setall(*cpus); | ||
| 3665 | |||
| 3666 | unlock_aggregate = get_aggregate(sd); | ||
| 2982 | 3667 | ||
| 2983 | /* | 3668 | /* |
| 2984 | * When power savings policy is enabled for the parent domain, idle | 3669 | * When power savings policy is enabled for the parent domain, idle |
| @@ -2994,7 +3679,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 2994 | 3679 | ||
| 2995 | redo: | 3680 | redo: |
| 2996 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3681 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
| 2997 | &cpus, balance); | 3682 | cpus, balance); |
| 2998 | 3683 | ||
| 2999 | if (*balance == 0) | 3684 | if (*balance == 0) |
| 3000 | goto out_balanced; | 3685 | goto out_balanced; |
| @@ -3004,7 +3689,7 @@ redo: | |||
| 3004 | goto out_balanced; | 3689 | goto out_balanced; |
| 3005 | } | 3690 | } |
| 3006 | 3691 | ||
| 3007 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); | 3692 | busiest = find_busiest_queue(group, idle, imbalance, cpus); |
| 3008 | if (!busiest) { | 3693 | if (!busiest) { |
| 3009 | schedstat_inc(sd, lb_nobusyq[idle]); | 3694 | schedstat_inc(sd, lb_nobusyq[idle]); |
| 3010 | goto out_balanced; | 3695 | goto out_balanced; |
| @@ -3037,8 +3722,8 @@ redo: | |||
| 3037 | 3722 | ||
| 3038 | /* All tasks on this runqueue were pinned by CPU affinity */ | 3723 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 3039 | if (unlikely(all_pinned)) { | 3724 | if (unlikely(all_pinned)) { |
| 3040 | cpu_clear(cpu_of(busiest), cpus); | 3725 | cpu_clear(cpu_of(busiest), *cpus); |
| 3041 | if (!cpus_empty(cpus)) | 3726 | if (!cpus_empty(*cpus)) |
| 3042 | goto redo; | 3727 | goto redo; |
| 3043 | goto out_balanced; | 3728 | goto out_balanced; |
| 3044 | } | 3729 | } |
| @@ -3095,8 +3780,9 @@ redo: | |||
| 3095 | 3780 | ||
| 3096 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3781 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 3097 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3782 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 3098 | return -1; | 3783 | ld_moved = -1; |
| 3099 | return ld_moved; | 3784 | |
| 3785 | goto out; | ||
| 3100 | 3786 | ||
| 3101 | out_balanced: | 3787 | out_balanced: |
| 3102 | schedstat_inc(sd, lb_balanced[idle]); | 3788 | schedstat_inc(sd, lb_balanced[idle]); |
| @@ -3111,8 +3797,13 @@ out_one_pinned: | |||
| 3111 | 3797 | ||
| 3112 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3798 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 3113 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3799 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 3114 | return -1; | 3800 | ld_moved = -1; |
| 3115 | return 0; | 3801 | else |
| 3802 | ld_moved = 0; | ||
| 3803 | out: | ||
| 3804 | if (unlock_aggregate) | ||
| 3805 | put_aggregate(sd); | ||
| 3806 | return ld_moved; | ||
| 3116 | } | 3807 | } |
| 3117 | 3808 | ||
| 3118 | /* | 3809 | /* |
| @@ -3123,7 +3814,8 @@ out_one_pinned: | |||
| 3123 | * this_rq is locked. | 3814 | * this_rq is locked. |
| 3124 | */ | 3815 | */ |
| 3125 | static int | 3816 | static int |
| 3126 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | 3817 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, |
| 3818 | cpumask_t *cpus) | ||
| 3127 | { | 3819 | { |
| 3128 | struct sched_group *group; | 3820 | struct sched_group *group; |
| 3129 | struct rq *busiest = NULL; | 3821 | struct rq *busiest = NULL; |
| @@ -3131,7 +3823,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 3131 | int ld_moved = 0; | 3823 | int ld_moved = 0; |
| 3132 | int sd_idle = 0; | 3824 | int sd_idle = 0; |
| 3133 | int all_pinned = 0; | 3825 | int all_pinned = 0; |
| 3134 | cpumask_t cpus = CPU_MASK_ALL; | 3826 | |
| 3827 | cpus_setall(*cpus); | ||
| 3135 | 3828 | ||
| 3136 | /* | 3829 | /* |
| 3137 | * When power savings policy is enabled for the parent domain, idle | 3830 | * When power savings policy is enabled for the parent domain, idle |
| @@ -3146,14 +3839,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 3146 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3839 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
| 3147 | redo: | 3840 | redo: |
| 3148 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3841 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
| 3149 | &sd_idle, &cpus, NULL); | 3842 | &sd_idle, cpus, NULL); |
| 3150 | if (!group) { | 3843 | if (!group) { |
| 3151 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | 3844 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); |
| 3152 | goto out_balanced; | 3845 | goto out_balanced; |
| 3153 | } | 3846 | } |
| 3154 | 3847 | ||
| 3155 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, | 3848 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); |
| 3156 | &cpus); | ||
| 3157 | if (!busiest) { | 3849 | if (!busiest) { |
| 3158 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | 3850 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); |
| 3159 | goto out_balanced; | 3851 | goto out_balanced; |
| @@ -3175,8 +3867,8 @@ redo: | |||
| 3175 | spin_unlock(&busiest->lock); | 3867 | spin_unlock(&busiest->lock); |
| 3176 | 3868 | ||
| 3177 | if (unlikely(all_pinned)) { | 3869 | if (unlikely(all_pinned)) { |
| 3178 | cpu_clear(cpu_of(busiest), cpus); | 3870 | cpu_clear(cpu_of(busiest), *cpus); |
| 3179 | if (!cpus_empty(cpus)) | 3871 | if (!cpus_empty(*cpus)) |
| 3180 | goto redo; | 3872 | goto redo; |
| 3181 | } | 3873 | } |
| 3182 | } | 3874 | } |
| @@ -3210,6 +3902,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3210 | struct sched_domain *sd; | 3902 | struct sched_domain *sd; |
| 3211 | int pulled_task = -1; | 3903 | int pulled_task = -1; |
| 3212 | unsigned long next_balance = jiffies + HZ; | 3904 | unsigned long next_balance = jiffies + HZ; |
| 3905 | cpumask_t tmpmask; | ||
| 3213 | 3906 | ||
| 3214 | for_each_domain(this_cpu, sd) { | 3907 | for_each_domain(this_cpu, sd) { |
| 3215 | unsigned long interval; | 3908 | unsigned long interval; |
| @@ -3219,8 +3912,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3219 | 3912 | ||
| 3220 | if (sd->flags & SD_BALANCE_NEWIDLE) | 3913 | if (sd->flags & SD_BALANCE_NEWIDLE) |
| 3221 | /* If we've pulled tasks over stop searching: */ | 3914 | /* If we've pulled tasks over stop searching: */ |
| 3222 | pulled_task = load_balance_newidle(this_cpu, | 3915 | pulled_task = load_balance_newidle(this_cpu, this_rq, |
| 3223 | this_rq, sd); | 3916 | sd, &tmpmask); |
| 3224 | 3917 | ||
| 3225 | interval = msecs_to_jiffies(sd->balance_interval); | 3918 | interval = msecs_to_jiffies(sd->balance_interval); |
| 3226 | if (time_after(next_balance, sd->last_balance + interval)) | 3919 | if (time_after(next_balance, sd->last_balance + interval)) |
| @@ -3379,6 +4072,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3379 | /* Earliest time when we have to do rebalance again */ | 4072 | /* Earliest time when we have to do rebalance again */ |
| 3380 | unsigned long next_balance = jiffies + 60*HZ; | 4073 | unsigned long next_balance = jiffies + 60*HZ; |
| 3381 | int update_next_balance = 0; | 4074 | int update_next_balance = 0; |
| 4075 | cpumask_t tmp; | ||
| 3382 | 4076 | ||
| 3383 | for_each_domain(cpu, sd) { | 4077 | for_each_domain(cpu, sd) { |
| 3384 | if (!(sd->flags & SD_LOAD_BALANCE)) | 4078 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| @@ -3402,7 +4096,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3402 | } | 4096 | } |
| 3403 | 4097 | ||
| 3404 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 4098 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
| 3405 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 4099 | if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { |
| 3406 | /* | 4100 | /* |
| 3407 | * We've pulled tasks over so either we're no | 4101 | * We've pulled tasks over so either we're no |
| 3408 | * longer idle, or one of our SMT siblings is | 4102 | * longer idle, or one of our SMT siblings is |
| @@ -3518,7 +4212,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) | |||
| 3518 | */ | 4212 | */ |
| 3519 | int ilb = first_cpu(nohz.cpu_mask); | 4213 | int ilb = first_cpu(nohz.cpu_mask); |
| 3520 | 4214 | ||
| 3521 | if (ilb != NR_CPUS) | 4215 | if (ilb < nr_cpu_ids) |
| 3522 | resched_cpu(ilb); | 4216 | resched_cpu(ilb); |
| 3523 | } | 4217 | } |
| 3524 | } | 4218 | } |
| @@ -3722,9 +4416,9 @@ void scheduler_tick(void) | |||
| 3722 | rq->clock_underflows++; | 4416 | rq->clock_underflows++; |
| 3723 | } | 4417 | } |
| 3724 | rq->tick_timestamp = rq->clock; | 4418 | rq->tick_timestamp = rq->clock; |
| 4419 | update_last_tick_seen(rq); | ||
| 3725 | update_cpu_load(rq); | 4420 | update_cpu_load(rq); |
| 3726 | curr->sched_class->task_tick(rq, curr, 0); | 4421 | curr->sched_class->task_tick(rq, curr, 0); |
| 3727 | update_sched_rt_period(rq); | ||
| 3728 | spin_unlock(&rq->lock); | 4422 | spin_unlock(&rq->lock); |
| 3729 | 4423 | ||
| 3730 | #ifdef CONFIG_SMP | 4424 | #ifdef CONFIG_SMP |
| @@ -4324,10 +5018,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4324 | goto out_unlock; | 5018 | goto out_unlock; |
| 4325 | } | 5019 | } |
| 4326 | on_rq = p->se.on_rq; | 5020 | on_rq = p->se.on_rq; |
| 4327 | if (on_rq) { | 5021 | if (on_rq) |
| 4328 | dequeue_task(rq, p, 0); | 5022 | dequeue_task(rq, p, 0); |
| 4329 | dec_load(rq, p); | ||
| 4330 | } | ||
| 4331 | 5023 | ||
| 4332 | p->static_prio = NICE_TO_PRIO(nice); | 5024 | p->static_prio = NICE_TO_PRIO(nice); |
| 4333 | set_load_weight(p); | 5025 | set_load_weight(p); |
| @@ -4337,7 +5029,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4337 | 5029 | ||
| 4338 | if (on_rq) { | 5030 | if (on_rq) { |
| 4339 | enqueue_task(rq, p, 0); | 5031 | enqueue_task(rq, p, 0); |
| 4340 | inc_load(rq, p); | ||
| 4341 | /* | 5032 | /* |
| 4342 | * If the task increased its priority or is running and | 5033 | * If the task increased its priority or is running and |
| 4343 | * lowered its priority, then reschedule its CPU: | 5034 | * lowered its priority, then reschedule its CPU: |
| @@ -4559,7 +5250,7 @@ recheck: | |||
| 4559 | * Do not allow realtime tasks into groups that have no runtime | 5250 | * Do not allow realtime tasks into groups that have no runtime |
| 4560 | * assigned. | 5251 | * assigned. |
| 4561 | */ | 5252 | */ |
| 4562 | if (rt_policy(policy) && task_group(p)->rt_runtime == 0) | 5253 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) |
| 4563 | return -EPERM; | 5254 | return -EPERM; |
| 4564 | #endif | 5255 | #endif |
| 4565 | 5256 | ||
| @@ -4721,9 +5412,10 @@ out_unlock: | |||
| 4721 | return retval; | 5412 | return retval; |
| 4722 | } | 5413 | } |
| 4723 | 5414 | ||
| 4724 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 5415 | long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) |
| 4725 | { | 5416 | { |
| 4726 | cpumask_t cpus_allowed; | 5417 | cpumask_t cpus_allowed; |
| 5418 | cpumask_t new_mask = *in_mask; | ||
| 4727 | struct task_struct *p; | 5419 | struct task_struct *p; |
| 4728 | int retval; | 5420 | int retval; |
| 4729 | 5421 | ||
| @@ -4754,13 +5446,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
| 4754 | if (retval) | 5446 | if (retval) |
| 4755 | goto out_unlock; | 5447 | goto out_unlock; |
| 4756 | 5448 | ||
| 4757 | cpus_allowed = cpuset_cpus_allowed(p); | 5449 | cpuset_cpus_allowed(p, &cpus_allowed); |
| 4758 | cpus_and(new_mask, new_mask, cpus_allowed); | 5450 | cpus_and(new_mask, new_mask, cpus_allowed); |
| 4759 | again: | 5451 | again: |
| 4760 | retval = set_cpus_allowed(p, new_mask); | 5452 | retval = set_cpus_allowed_ptr(p, &new_mask); |
| 4761 | 5453 | ||
| 4762 | if (!retval) { | 5454 | if (!retval) { |
| 4763 | cpus_allowed = cpuset_cpus_allowed(p); | 5455 | cpuset_cpus_allowed(p, &cpus_allowed); |
| 4764 | if (!cpus_subset(new_mask, cpus_allowed)) { | 5456 | if (!cpus_subset(new_mask, cpus_allowed)) { |
| 4765 | /* | 5457 | /* |
| 4766 | * We must have raced with a concurrent cpuset | 5458 | * We must have raced with a concurrent cpuset |
| @@ -4804,7 +5496,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
| 4804 | if (retval) | 5496 | if (retval) |
| 4805 | return retval; | 5497 | return retval; |
| 4806 | 5498 | ||
| 4807 | return sched_setaffinity(pid, new_mask); | 5499 | return sched_setaffinity(pid, &new_mask); |
| 4808 | } | 5500 | } |
| 4809 | 5501 | ||
| 4810 | /* | 5502 | /* |
| @@ -5266,7 +5958,6 @@ static inline void sched_init_granularity(void) | |||
| 5266 | sysctl_sched_latency = limit; | 5958 | sysctl_sched_latency = limit; |
| 5267 | 5959 | ||
| 5268 | sysctl_sched_wakeup_granularity *= factor; | 5960 | sysctl_sched_wakeup_granularity *= factor; |
| 5269 | sysctl_sched_batch_wakeup_granularity *= factor; | ||
| 5270 | } | 5961 | } |
| 5271 | 5962 | ||
| 5272 | #ifdef CONFIG_SMP | 5963 | #ifdef CONFIG_SMP |
| @@ -5295,7 +5986,7 @@ static inline void sched_init_granularity(void) | |||
| 5295 | * task must not exit() & deallocate itself prematurely. The | 5986 | * task must not exit() & deallocate itself prematurely. The |
| 5296 | * call is not atomic; no spinlocks may be held. | 5987 | * call is not atomic; no spinlocks may be held. |
| 5297 | */ | 5988 | */ |
| 5298 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 5989 | int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) |
| 5299 | { | 5990 | { |
| 5300 | struct migration_req req; | 5991 | struct migration_req req; |
| 5301 | unsigned long flags; | 5992 | unsigned long flags; |
| @@ -5303,23 +5994,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
| 5303 | int ret = 0; | 5994 | int ret = 0; |
| 5304 | 5995 | ||
| 5305 | rq = task_rq_lock(p, &flags); | 5996 | rq = task_rq_lock(p, &flags); |
| 5306 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 5997 | if (!cpus_intersects(*new_mask, cpu_online_map)) { |
| 5307 | ret = -EINVAL; | 5998 | ret = -EINVAL; |
| 5308 | goto out; | 5999 | goto out; |
| 5309 | } | 6000 | } |
| 5310 | 6001 | ||
| 5311 | if (p->sched_class->set_cpus_allowed) | 6002 | if (p->sched_class->set_cpus_allowed) |
| 5312 | p->sched_class->set_cpus_allowed(p, &new_mask); | 6003 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 5313 | else { | 6004 | else { |
| 5314 | p->cpus_allowed = new_mask; | 6005 | p->cpus_allowed = *new_mask; |
| 5315 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | 6006 | p->rt.nr_cpus_allowed = cpus_weight(*new_mask); |
| 5316 | } | 6007 | } |
| 5317 | 6008 | ||
| 5318 | /* Can the task run on the task's current CPU? If so, we're done */ | 6009 | /* Can the task run on the task's current CPU? If so, we're done */ |
| 5319 | if (cpu_isset(task_cpu(p), new_mask)) | 6010 | if (cpu_isset(task_cpu(p), *new_mask)) |
| 5320 | goto out; | 6011 | goto out; |
| 5321 | 6012 | ||
| 5322 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | 6013 | if (migrate_task(p, any_online_cpu(*new_mask), &req)) { |
| 5323 | /* Need help from migration thread: drop lock and wait. */ | 6014 | /* Need help from migration thread: drop lock and wait. */ |
| 5324 | task_rq_unlock(rq, &flags); | 6015 | task_rq_unlock(rq, &flags); |
| 5325 | wake_up_process(rq->migration_thread); | 6016 | wake_up_process(rq->migration_thread); |
| @@ -5332,7 +6023,7 @@ out: | |||
| 5332 | 6023 | ||
| 5333 | return ret; | 6024 | return ret; |
| 5334 | } | 6025 | } |
| 5335 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 6026 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); |
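The rename from set_cpus_allowed() to set_cpus_allowed_ptr(), like the many new cpumask_t * parameters elsewhere in this diff, exists to stop passing cpumask_t by value: with a large NR_CPUS the mask is hundreds of bytes, which is costly on the kernel stack. A tiny sketch of the size difference the by-pointer convention avoids (the 4096-CPU figure and the mock typedef are examples, not the kernel's definitions):

    #include <stdio.h>
    #include <limits.h>

    #define NR_CPUS 4096   /* example large-distro configuration */

    /* Roughly what cpumask_t looks like: one bit per possible cpu. */
    typedef struct {
            unsigned long bits[NR_CPUS / (sizeof(unsigned long) * CHAR_BIT)];
    } cpumask_t;

    /* By-value: the whole mask is copied into the callee's frame. */
    static unsigned long takes_mask_by_value(cpumask_t mask)
    {
            return sizeof mask;
    }

    /* By-pointer: only a pointer crosses the call. */
    static unsigned long takes_mask_by_ptr(const cpumask_t *mask)
    {
            return sizeof mask;
    }

    int main(void)
    {
            cpumask_t m = { { 0 } };

            printf("by value : %lu bytes copied\n", takes_mask_by_value(m));
            printf("by pointer: %lu bytes passed\n", takes_mask_by_ptr(&m));
            return 0;
    }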
| 5336 | 6027 | ||
| 5337 | /* | 6028 | /* |
| 5338 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 6029 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
| @@ -5470,12 +6161,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
| 5470 | dest_cpu = any_online_cpu(mask); | 6161 | dest_cpu = any_online_cpu(mask); |
| 5471 | 6162 | ||
| 5472 | /* On any allowed CPU? */ | 6163 | /* On any allowed CPU? */ |
| 5473 | if (dest_cpu == NR_CPUS) | 6164 | if (dest_cpu >= nr_cpu_ids) |
| 5474 | dest_cpu = any_online_cpu(p->cpus_allowed); | 6165 | dest_cpu = any_online_cpu(p->cpus_allowed); |
| 5475 | 6166 | ||
| 5476 | /* No more Mr. Nice Guy. */ | 6167 | /* No more Mr. Nice Guy. */ |
| 5477 | if (dest_cpu == NR_CPUS) { | 6168 | if (dest_cpu >= nr_cpu_ids) { |
| 5478 | cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); | 6169 | cpumask_t cpus_allowed; |
| 6170 | |||
| 6171 | cpuset_cpus_allowed_locked(p, &cpus_allowed); | ||
| 5479 | /* | 6172 | /* |
| 5480 | * Try to stay on the same cpuset, where the | 6173 | * Try to stay on the same cpuset, where the |
| 5481 | * current cpuset may be a subset of all cpus. | 6174 | * current cpuset may be a subset of all cpus. |
| @@ -5511,7 +6204,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
| 5511 | */ | 6204 | */ |
| 5512 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 6205 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
| 5513 | { | 6206 | { |
| 5514 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 6207 | struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); |
| 5515 | unsigned long flags; | 6208 | unsigned long flags; |
| 5516 | 6209 | ||
| 5517 | local_irq_save(flags); | 6210 | local_irq_save(flags); |
| @@ -5923,20 +6616,16 @@ void __init migration_init(void) | |||
| 5923 | 6616 | ||
| 5924 | #ifdef CONFIG_SMP | 6617 | #ifdef CONFIG_SMP |
| 5925 | 6618 | ||
| 5926 | /* Number of possible processor ids */ | ||
| 5927 | int nr_cpu_ids __read_mostly = NR_CPUS; | ||
| 5928 | EXPORT_SYMBOL(nr_cpu_ids); | ||
| 5929 | |||
| 5930 | #ifdef CONFIG_SCHED_DEBUG | 6619 | #ifdef CONFIG_SCHED_DEBUG |
| 5931 | 6620 | ||
| 5932 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | 6621 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
| 6622 | cpumask_t *groupmask) | ||
| 5933 | { | 6623 | { |
| 5934 | struct sched_group *group = sd->groups; | 6624 | struct sched_group *group = sd->groups; |
| 5935 | cpumask_t groupmask; | 6625 | char str[256]; |
| 5936 | char str[NR_CPUS]; | ||
| 5937 | 6626 | ||
| 5938 | cpumask_scnprintf(str, NR_CPUS, sd->span); | 6627 | cpulist_scnprintf(str, sizeof(str), sd->span); |
| 5939 | cpus_clear(groupmask); | 6628 | cpus_clear(*groupmask); |
| 5940 | 6629 | ||
| 5941 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | 6630 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); |
| 5942 | 6631 | ||
| @@ -5980,25 +6669,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | |||
| 5980 | break; | 6669 | break; |
| 5981 | } | 6670 | } |
| 5982 | 6671 | ||
| 5983 | if (cpus_intersects(groupmask, group->cpumask)) { | 6672 | if (cpus_intersects(*groupmask, group->cpumask)) { |
| 5984 | printk(KERN_CONT "\n"); | 6673 | printk(KERN_CONT "\n"); |
| 5985 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 6674 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
| 5986 | break; | 6675 | break; |
| 5987 | } | 6676 | } |
| 5988 | 6677 | ||
| 5989 | cpus_or(groupmask, groupmask, group->cpumask); | 6678 | cpus_or(*groupmask, *groupmask, group->cpumask); |
| 5990 | 6679 | ||
| 5991 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | 6680 | cpulist_scnprintf(str, sizeof(str), group->cpumask); |
| 5992 | printk(KERN_CONT " %s", str); | 6681 | printk(KERN_CONT " %s", str); |
| 5993 | 6682 | ||
| 5994 | group = group->next; | 6683 | group = group->next; |
| 5995 | } while (group != sd->groups); | 6684 | } while (group != sd->groups); |
| 5996 | printk(KERN_CONT "\n"); | 6685 | printk(KERN_CONT "\n"); |
| 5997 | 6686 | ||
| 5998 | if (!cpus_equal(sd->span, groupmask)) | 6687 | if (!cpus_equal(sd->span, *groupmask)) |
| 5999 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 6688 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); |
| 6000 | 6689 | ||
| 6001 | if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) | 6690 | if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) |
| 6002 | printk(KERN_ERR "ERROR: parent span is not a superset " | 6691 | printk(KERN_ERR "ERROR: parent span is not a superset " |
| 6003 | "of domain->span\n"); | 6692 | "of domain->span\n"); |
| 6004 | return 0; | 6693 | return 0; |
| @@ -6006,6 +6695,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | |||
| 6006 | 6695 | ||
| 6007 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6696 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
| 6008 | { | 6697 | { |
| 6698 | cpumask_t *groupmask; | ||
| 6009 | int level = 0; | 6699 | int level = 0; |
| 6010 | 6700 | ||
| 6011 | if (!sd) { | 6701 | if (!sd) { |
| @@ -6015,14 +6705,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 6015 | 6705 | ||
| 6016 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6706 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
| 6017 | 6707 | ||
| 6708 | groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | ||
| 6709 | if (!groupmask) { | ||
| 6710 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
| 6711 | return; | ||
| 6712 | } | ||
| 6713 | |||
| 6018 | for (;;) { | 6714 | for (;;) { |
| 6019 | if (sched_domain_debug_one(sd, cpu, level)) | 6715 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) |
| 6020 | break; | 6716 | break; |
| 6021 | level++; | 6717 | level++; |
| 6022 | sd = sd->parent; | 6718 | sd = sd->parent; |
| 6023 | if (!sd) | 6719 | if (!sd) |
| 6024 | break; | 6720 | break; |
| 6025 | } | 6721 | } |
| 6722 | kfree(groupmask); | ||
| 6026 | } | 6723 | } |
| 6027 | #else | 6724 | #else |
| 6028 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6725 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| @@ -6210,30 +6907,33 @@ __setup("isolcpus=", isolated_cpu_setup); | |||
| 6210 | * and ->cpu_power to 0. | 6907 | * and ->cpu_power to 0. |
| 6211 | */ | 6908 | */ |
| 6212 | static void | 6909 | static void |
| 6213 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | 6910 | init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, |
| 6214 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, | 6911 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
| 6215 | struct sched_group **sg)) | 6912 | struct sched_group **sg, |
| 6913 | cpumask_t *tmpmask), | ||
| 6914 | cpumask_t *covered, cpumask_t *tmpmask) | ||
| 6216 | { | 6915 | { |
| 6217 | struct sched_group *first = NULL, *last = NULL; | 6916 | struct sched_group *first = NULL, *last = NULL; |
| 6218 | cpumask_t covered = CPU_MASK_NONE; | ||
| 6219 | int i; | 6917 | int i; |
| 6220 | 6918 | ||
| 6221 | for_each_cpu_mask(i, span) { | 6919 | cpus_clear(*covered); |
| 6920 | |||
| 6921 | for_each_cpu_mask(i, *span) { | ||
| 6222 | struct sched_group *sg; | 6922 | struct sched_group *sg; |
| 6223 | int group = group_fn(i, cpu_map, &sg); | 6923 | int group = group_fn(i, cpu_map, &sg, tmpmask); |
| 6224 | int j; | 6924 | int j; |
| 6225 | 6925 | ||
| 6226 | if (cpu_isset(i, covered)) | 6926 | if (cpu_isset(i, *covered)) |
| 6227 | continue; | 6927 | continue; |
| 6228 | 6928 | ||
| 6229 | sg->cpumask = CPU_MASK_NONE; | 6929 | cpus_clear(sg->cpumask); |
| 6230 | sg->__cpu_power = 0; | 6930 | sg->__cpu_power = 0; |
| 6231 | 6931 | ||
| 6232 | for_each_cpu_mask(j, span) { | 6932 | for_each_cpu_mask(j, *span) { |
| 6233 | if (group_fn(j, cpu_map, NULL) != group) | 6933 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
| 6234 | continue; | 6934 | continue; |
| 6235 | 6935 | ||
| 6236 | cpu_set(j, covered); | 6936 | cpu_set(j, *covered); |
| 6237 | cpu_set(j, sg->cpumask); | 6937 | cpu_set(j, sg->cpumask); |
| 6238 | } | 6938 | } |
| 6239 | if (!first) | 6939 | if (!first) |
| @@ -6259,7 +6959,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | |||
| 6259 | * | 6959 | * |
| 6260 | * Should use nodemask_t. | 6960 | * Should use nodemask_t. |
| 6261 | */ | 6961 | */ |
| 6262 | static int find_next_best_node(int node, unsigned long *used_nodes) | 6962 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
| 6263 | { | 6963 | { |
| 6264 | int i, n, val, min_val, best_node = 0; | 6964 | int i, n, val, min_val, best_node = 0; |
| 6265 | 6965 | ||
| @@ -6273,7 +6973,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
| 6273 | continue; | 6973 | continue; |
| 6274 | 6974 | ||
| 6275 | /* Skip already used nodes */ | 6975 | /* Skip already used nodes */ |
| 6276 | if (test_bit(n, used_nodes)) | 6976 | if (node_isset(n, *used_nodes)) |
| 6277 | continue; | 6977 | continue; |
| 6278 | 6978 | ||
| 6279 | /* Simple min distance search */ | 6979 | /* Simple min distance search */ |
| @@ -6285,40 +6985,37 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
| 6285 | } | 6985 | } |
| 6286 | } | 6986 | } |
| 6287 | 6987 | ||
| 6288 | set_bit(best_node, used_nodes); | 6988 | node_set(best_node, *used_nodes); |
| 6289 | return best_node; | 6989 | return best_node; |
| 6290 | } | 6990 | } |
| 6291 | 6991 | ||
| 6292 | /** | 6992 | /** |
| 6293 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 6993 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
| 6294 | * @node: node whose cpumask we're constructing | 6994 | * @node: node whose cpumask we're constructing |
| 6295 | * @size: number of nodes to include in this span | 6995 | * @span: resulting cpumask |
| 6296 | * | 6996 | * |
| 6297 | * Given a node, construct a good cpumask for its sched_domain to span. It | 6997 | * Given a node, construct a good cpumask for its sched_domain to span. It |
| 6298 | * should be one that prevents unnecessary balancing, but also spreads tasks | 6998 | * should be one that prevents unnecessary balancing, but also spreads tasks |
| 6299 | * out optimally. | 6999 | * out optimally. |
| 6300 | */ | 7000 | */ |
| 6301 | static cpumask_t sched_domain_node_span(int node) | 7001 | static void sched_domain_node_span(int node, cpumask_t *span) |
| 6302 | { | 7002 | { |
| 6303 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 7003 | nodemask_t used_nodes; |
| 6304 | cpumask_t span, nodemask; | 7004 | node_to_cpumask_ptr(nodemask, node); |
| 6305 | int i; | 7005 | int i; |
| 6306 | 7006 | ||
| 6307 | cpus_clear(span); | 7007 | cpus_clear(*span); |
| 6308 | bitmap_zero(used_nodes, MAX_NUMNODES); | 7008 | nodes_clear(used_nodes); |
| 6309 | 7009 | ||
| 6310 | nodemask = node_to_cpumask(node); | 7010 | cpus_or(*span, *span, *nodemask); |
| 6311 | cpus_or(span, span, nodemask); | 7011 | node_set(node, used_nodes); |
| 6312 | set_bit(node, used_nodes); | ||
| 6313 | 7012 | ||
| 6314 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 7013 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
| 6315 | int next_node = find_next_best_node(node, used_nodes); | 7014 | int next_node = find_next_best_node(node, &used_nodes); |
| 6316 | 7015 | ||
| 6317 | nodemask = node_to_cpumask(next_node); | 7016 | node_to_cpumask_ptr_next(nodemask, next_node); |
| 6318 | cpus_or(span, span, nodemask); | 7017 | cpus_or(*span, *span, *nodemask); |
| 6319 | } | 7018 | } |
| 6320 | |||
| 6321 | return span; | ||
| 6322 | } | 7019 | } |
| 6323 | #endif | 7020 | #endif |
| 6324 | 7021 | ||
| @@ -6332,7 +7029,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | |||
| 6332 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); | 7029 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
| 6333 | 7030 | ||
| 6334 | static int | 7031 | static int |
| 6335 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7032 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
| 7033 | cpumask_t *unused) | ||
| 6336 | { | 7034 | { |
| 6337 | if (sg) | 7035 | if (sg) |
| 6338 | *sg = &per_cpu(sched_group_cpus, cpu); | 7036 | *sg = &per_cpu(sched_group_cpus, cpu); |
| @@ -6350,19 +7048,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); | |||
| 6350 | 7048 | ||
| 6351 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 7049 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
| 6352 | static int | 7050 | static int |
| 6353 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7051 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
| 7052 | cpumask_t *mask) | ||
| 6354 | { | 7053 | { |
| 6355 | int group; | 7054 | int group; |
| 6356 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 7055 | |
| 6357 | cpus_and(mask, mask, *cpu_map); | 7056 | *mask = per_cpu(cpu_sibling_map, cpu); |
| 6358 | group = first_cpu(mask); | 7057 | cpus_and(*mask, *mask, *cpu_map); |
| 7058 | group = first_cpu(*mask); | ||
| 6359 | if (sg) | 7059 | if (sg) |
| 6360 | *sg = &per_cpu(sched_group_core, group); | 7060 | *sg = &per_cpu(sched_group_core, group); |
| 6361 | return group; | 7061 | return group; |
| 6362 | } | 7062 | } |
| 6363 | #elif defined(CONFIG_SCHED_MC) | 7063 | #elif defined(CONFIG_SCHED_MC) |
| 6364 | static int | 7064 | static int |
| 6365 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7065 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
| 7066 | cpumask_t *unused) | ||
| 6366 | { | 7067 | { |
| 6367 | if (sg) | 7068 | if (sg) |
| 6368 | *sg = &per_cpu(sched_group_core, cpu); | 7069 | *sg = &per_cpu(sched_group_core, cpu); |
| @@ -6374,17 +7075,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); | |||
| 6374 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); | 7075 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
| 6375 | 7076 | ||
| 6376 | static int | 7077 | static int |
| 6377 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7078 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
| 7079 | cpumask_t *mask) | ||
| 6378 | { | 7080 | { |
| 6379 | int group; | 7081 | int group; |
| 6380 | #ifdef CONFIG_SCHED_MC | 7082 | #ifdef CONFIG_SCHED_MC |
| 6381 | cpumask_t mask = cpu_coregroup_map(cpu); | 7083 | *mask = cpu_coregroup_map(cpu); |
| 6382 | cpus_and(mask, mask, *cpu_map); | 7084 | cpus_and(*mask, *mask, *cpu_map); |
| 6383 | group = first_cpu(mask); | 7085 | group = first_cpu(*mask); |
| 6384 | #elif defined(CONFIG_SCHED_SMT) | 7086 | #elif defined(CONFIG_SCHED_SMT) |
| 6385 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 7087 | *mask = per_cpu(cpu_sibling_map, cpu); |
| 6386 | cpus_and(mask, mask, *cpu_map); | 7088 | cpus_and(*mask, *mask, *cpu_map); |
| 6387 | group = first_cpu(mask); | 7089 | group = first_cpu(*mask); |
| 6388 | #else | 7090 | #else |
| 6389 | group = cpu; | 7091 | group = cpu; |
| 6390 | #endif | 7092 | #endif |
| @@ -6400,19 +7102,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | |||
| 6400 | * gets dynamically allocated. | 7102 | * gets dynamically allocated. |
| 6401 | */ | 7103 | */ |
| 6402 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 7104 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
| 6403 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 7105 | static struct sched_group ***sched_group_nodes_bycpu; |
| 6404 | 7106 | ||
| 6405 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 7107 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
| 6406 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); | 7108 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
| 6407 | 7109 | ||
| 6408 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, | 7110 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
| 6409 | struct sched_group **sg) | 7111 | struct sched_group **sg, cpumask_t *nodemask) |
| 6410 | { | 7112 | { |
| 6411 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); | ||
| 6412 | int group; | 7113 | int group; |
| 6413 | 7114 | ||
| 6414 | cpus_and(nodemask, nodemask, *cpu_map); | 7115 | *nodemask = node_to_cpumask(cpu_to_node(cpu)); |
| 6415 | group = first_cpu(nodemask); | 7116 | cpus_and(*nodemask, *nodemask, *cpu_map); |
| 7117 | group = first_cpu(*nodemask); | ||
| 6416 | 7118 | ||
| 6417 | if (sg) | 7119 | if (sg) |
| 6418 | *sg = &per_cpu(sched_group_allnodes, group); | 7120 | *sg = &per_cpu(sched_group_allnodes, group); |
| @@ -6448,7 +7150,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 6448 | 7150 | ||
| 6449 | #ifdef CONFIG_NUMA | 7151 | #ifdef CONFIG_NUMA |
| 6450 | /* Free memory allocated for various sched_group structures */ | 7152 | /* Free memory allocated for various sched_group structures */ |
| 6451 | static void free_sched_groups(const cpumask_t *cpu_map) | 7153 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
| 6452 | { | 7154 | { |
| 6453 | int cpu, i; | 7155 | int cpu, i; |
| 6454 | 7156 | ||
| @@ -6460,11 +7162,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) | |||
| 6460 | continue; | 7162 | continue; |
| 6461 | 7163 | ||
| 6462 | for (i = 0; i < MAX_NUMNODES; i++) { | 7164 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6463 | cpumask_t nodemask = node_to_cpumask(i); | ||
| 6464 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7165 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
| 6465 | 7166 | ||
| 6466 | cpus_and(nodemask, nodemask, *cpu_map); | 7167 | *nodemask = node_to_cpumask(i); |
| 6467 | if (cpus_empty(nodemask)) | 7168 | cpus_and(*nodemask, *nodemask, *cpu_map); |
| 7169 | if (cpus_empty(*nodemask)) | ||
| 6468 | continue; | 7170 | continue; |
| 6469 | 7171 | ||
| 6470 | if (sg == NULL) | 7172 | if (sg == NULL) |
| @@ -6482,7 +7184,7 @@ next_sg: | |||
| 6482 | } | 7184 | } |
| 6483 | } | 7185 | } |
| 6484 | #else | 7186 | #else |
| 6485 | static void free_sched_groups(const cpumask_t *cpu_map) | 7187 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
| 6486 | { | 7188 | { |
| 6487 | } | 7189 | } |
| 6488 | #endif | 7190 | #endif |
| @@ -6540,13 +7242,106 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 6540 | } | 7242 | } |
| 6541 | 7243 | ||
| 6542 | /* | 7244 | /* |
| 7245 | * Initializers for schedule domains | ||
| 7246 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
| 7247 | */ | ||
| 7248 | |||
| 7249 | #define SD_INIT(sd, type) sd_init_##type(sd) | ||
| 7250 | #define SD_INIT_FUNC(type) \ | ||
| 7251 | static noinline void sd_init_##type(struct sched_domain *sd) \ | ||
| 7252 | { \ | ||
| 7253 | memset(sd, 0, sizeof(*sd)); \ | ||
| 7254 | *sd = SD_##type##_INIT; \ | ||
| 7255 | sd->level = SD_LV_##type; \ | ||
| 7256 | } | ||
| 7257 | |||
| 7258 | SD_INIT_FUNC(CPU) | ||
| 7259 | #ifdef CONFIG_NUMA | ||
| 7260 | SD_INIT_FUNC(ALLNODES) | ||
| 7261 | SD_INIT_FUNC(NODE) | ||
| 7262 | #endif | ||
| 7263 | #ifdef CONFIG_SCHED_SMT | ||
| 7264 | SD_INIT_FUNC(SIBLING) | ||
| 7265 | #endif | ||
| 7266 | #ifdef CONFIG_SCHED_MC | ||
| 7267 | SD_INIT_FUNC(MC) | ||
| 7268 | #endif | ||
| 7269 | |||
| 7270 | /* | ||
| 7271 | * To minimize stack usage kmalloc room for cpumasks and share the | ||
| 7272 | * space as the usage in build_sched_domains() dictates. Used only | ||
| 7273 | * if the amount of space is significant. | ||
| 7274 | */ | ||
| 7275 | struct allmasks { | ||
| 7276 | cpumask_t tmpmask; /* make this one first */ | ||
| 7277 | union { | ||
| 7278 | cpumask_t nodemask; | ||
| 7279 | cpumask_t this_sibling_map; | ||
| 7280 | cpumask_t this_core_map; | ||
| 7281 | }; | ||
| 7282 | cpumask_t send_covered; | ||
| 7283 | |||
| 7284 | #ifdef CONFIG_NUMA | ||
| 7285 | cpumask_t domainspan; | ||
| 7286 | cpumask_t covered; | ||
| 7287 | cpumask_t notcovered; | ||
| 7288 | #endif | ||
| 7289 | }; | ||
| 7290 | |||
| 7291 | #if NR_CPUS > 128 | ||
| 7292 | #define SCHED_CPUMASK_ALLOC 1 | ||
| 7293 | #define SCHED_CPUMASK_FREE(v) kfree(v) | ||
| 7294 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v | ||
| 7295 | #else | ||
| 7296 | #define SCHED_CPUMASK_ALLOC 0 | ||
| 7297 | #define SCHED_CPUMASK_FREE(v) | ||
| 7298 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v | ||
| 7299 | #endif | ||
| 7300 | |||
| 7301 | #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ | ||
| 7302 | ((unsigned long)(a) + offsetof(struct allmasks, v)) | ||
| 7303 | |||
| 7304 | static int default_relax_domain_level = -1; | ||
| 7305 | |||
| 7306 | static int __init setup_relax_domain_level(char *str) | ||
| 7307 | { | ||
| 7308 | default_relax_domain_level = simple_strtoul(str, NULL, 0); | ||
| 7309 | return 1; | ||
| 7310 | } | ||
| 7311 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
| 7312 | |||
| 7313 | static void set_domain_attribute(struct sched_domain *sd, | ||
| 7314 | struct sched_domain_attr *attr) | ||
| 7315 | { | ||
| 7316 | int request; | ||
| 7317 | |||
| 7318 | if (!attr || attr->relax_domain_level < 0) { | ||
| 7319 | if (default_relax_domain_level < 0) | ||
| 7320 | return; | ||
| 7321 | else | ||
| 7322 | request = default_relax_domain_level; | ||
| 7323 | } else | ||
| 7324 | request = attr->relax_domain_level; | ||
| 7325 | if (request < sd->level) { | ||
| 7326 | /* turn off idle balance on this domain */ | ||
| 7327 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | ||
| 7328 | } else { | ||
| 7329 | /* turn on idle balance on this domain */ | ||
| 7330 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | ||
| 7331 | } | ||
| 7332 | } | ||
| 7333 | |||
| 7334 | /* | ||
| 6543 | * Build sched domains for a given set of cpus and attach the sched domains | 7335 | * Build sched domains for a given set of cpus and attach the sched domains |
| 6544 | * to the individual cpus | 7336 | * to the individual cpus |
| 6545 | */ | 7337 | */ |
| 6546 | static int build_sched_domains(const cpumask_t *cpu_map) | 7338 | static int __build_sched_domains(const cpumask_t *cpu_map, |
| 7339 | struct sched_domain_attr *attr) | ||
| 6547 | { | 7340 | { |
| 6548 | int i; | 7341 | int i; |
| 6549 | struct root_domain *rd; | 7342 | struct root_domain *rd; |
| 7343 | SCHED_CPUMASK_DECLARE(allmasks); | ||
| 7344 | cpumask_t *tmpmask; | ||
| 6550 | #ifdef CONFIG_NUMA | 7345 | #ifdef CONFIG_NUMA |
| 6551 | struct sched_group **sched_group_nodes = NULL; | 7346 | struct sched_group **sched_group_nodes = NULL; |
| 6552 | int sd_allnodes = 0; | 7347 | int sd_allnodes = 0; |
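As a rough user-space sketch of the relax_domain_level logic introduced in the hunk above: set_domain_attribute() takes the requested level from the per-domain attribute when one is supplied, falls back to the relax_domain_level= boot default otherwise, and then strips or adds the newidle/wake-idle balancing flags by comparing the request against the domain's own level. The snippet below mirrors only that decision; the flag values, struct layouts and main() are invented for the example and are not the kernel definitions.

#include <stdio.h>

/* Invented stand-ins for the scheduler flags and structures. */
#define SD_WAKE_IDLE		(1 << 0)
#define SD_BALANCE_NEWIDLE	(1 << 1)
#define SD_WAKE_IDLE_FAR	(1 << 2)

struct sched_domain_attr { int relax_domain_level; };
struct sched_domain	 { int level; unsigned int flags; };

static int default_relax_domain_level = -1;	/* no relax_domain_level= given */

static void set_domain_attribute(struct sched_domain *sd,
				 struct sched_domain_attr *attr)
{
	int request;

	if (!attr || attr->relax_domain_level < 0) {
		if (default_relax_domain_level < 0)
			return;				/* nothing requested */
		request = default_relax_domain_level;
	} else
		request = attr->relax_domain_level;

	if (request < sd->level)			/* relax: no idle balancing here */
		sd->flags &= ~(SD_WAKE_IDLE | SD_BALANCE_NEWIDLE);
	else						/* keep idle balancing on */
		sd->flags |= (SD_WAKE_IDLE_FAR | SD_BALANCE_NEWIDLE);
}

int main(void)
{
	struct sched_domain sd = { .level = 2, .flags = SD_WAKE_IDLE | SD_BALANCE_NEWIDLE };
	struct sched_domain_attr attr = { .relax_domain_level = 1 };

	set_domain_attribute(&sd, &attr);
	printf("flags after request=1 on a level-2 domain: %#x\n", sd.flags);
	return 0;
}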
| @@ -6560,39 +7355,65 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6560 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 7355 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| 6561 | return -ENOMEM; | 7356 | return -ENOMEM; |
| 6562 | } | 7357 | } |
| 6563 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
| 6564 | #endif | 7358 | #endif |
| 6565 | 7359 | ||
| 6566 | rd = alloc_rootdomain(); | 7360 | rd = alloc_rootdomain(); |
| 6567 | if (!rd) { | 7361 | if (!rd) { |
| 6568 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7362 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
| 7363 | #ifdef CONFIG_NUMA | ||
| 7364 | kfree(sched_group_nodes); | ||
| 7365 | #endif | ||
| 7366 | return -ENOMEM; | ||
| 7367 | } | ||
| 7368 | |||
| 7369 | #if SCHED_CPUMASK_ALLOC | ||
| 7370 | /* get space for all scratch cpumask variables */ | ||
| 7371 | allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); | ||
| 7372 | if (!allmasks) { | ||
| 7373 | printk(KERN_WARNING "Cannot alloc cpumask array\n"); | ||
| 7374 | kfree(rd); | ||
| 7375 | #ifdef CONFIG_NUMA | ||
| 7376 | kfree(sched_group_nodes); | ||
| 7377 | #endif | ||
| 6569 | return -ENOMEM; | 7378 | return -ENOMEM; |
| 6570 | } | 7379 | } |
| 7380 | #endif | ||
| 7381 | tmpmask = (cpumask_t *)allmasks; | ||
| 7382 | |||
| 7383 | |||
| 7384 | #ifdef CONFIG_NUMA | ||
| 7385 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
| 7386 | #endif | ||
| 6571 | 7387 | ||
| 6572 | /* | 7388 | /* |
| 6573 | * Set up domains for cpus specified by the cpu_map. | 7389 | * Set up domains for cpus specified by the cpu_map. |
| 6574 | */ | 7390 | */ |
| 6575 | for_each_cpu_mask(i, *cpu_map) { | 7391 | for_each_cpu_mask(i, *cpu_map) { |
| 6576 | struct sched_domain *sd = NULL, *p; | 7392 | struct sched_domain *sd = NULL, *p; |
| 6577 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 7393 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
| 6578 | 7394 | ||
| 6579 | cpus_and(nodemask, nodemask, *cpu_map); | 7395 | *nodemask = node_to_cpumask(cpu_to_node(i)); |
| 7396 | cpus_and(*nodemask, *nodemask, *cpu_map); | ||
| 6580 | 7397 | ||
| 6581 | #ifdef CONFIG_NUMA | 7398 | #ifdef CONFIG_NUMA |
| 6582 | if (cpus_weight(*cpu_map) > | 7399 | if (cpus_weight(*cpu_map) > |
| 6583 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 7400 | SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { |
| 6584 | sd = &per_cpu(allnodes_domains, i); | 7401 | sd = &per_cpu(allnodes_domains, i); |
| 6585 | *sd = SD_ALLNODES_INIT; | 7402 | SD_INIT(sd, ALLNODES); |
| 7403 | set_domain_attribute(sd, attr); | ||
| 6586 | sd->span = *cpu_map; | 7404 | sd->span = *cpu_map; |
| 6587 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); | 7405 | sd->first_cpu = first_cpu(sd->span); |
| 7406 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 6588 | p = sd; | 7407 | p = sd; |
| 6589 | sd_allnodes = 1; | 7408 | sd_allnodes = 1; |
| 6590 | } else | 7409 | } else |
| 6591 | p = NULL; | 7410 | p = NULL; |
| 6592 | 7411 | ||
| 6593 | sd = &per_cpu(node_domains, i); | 7412 | sd = &per_cpu(node_domains, i); |
| 6594 | *sd = SD_NODE_INIT; | 7413 | SD_INIT(sd, NODE); |
| 6595 | sd->span = sched_domain_node_span(cpu_to_node(i)); | 7414 | set_domain_attribute(sd, attr); |
| 7415 | sched_domain_node_span(cpu_to_node(i), &sd->span); | ||
| 7416 | sd->first_cpu = first_cpu(sd->span); | ||
| 6596 | sd->parent = p; | 7417 | sd->parent = p; |
| 6597 | if (p) | 7418 | if (p) |
| 6598 | p->child = sd; | 7419 | p->child = sd; |
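The scratch-mask handling in this hunk rests on SCHED_CPUMASK_VAR(), which binds a local pointer to a named member of struct allmasks via offsetof(), so the same code works whether the block sits on the stack (NR_CPUS <= 128) or comes from the kmalloc() seen above. A stand-alone sketch of the same pattern, with a toy cpumask_t and an invented macro name, could look like this:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

typedef unsigned long cpumask_t;	/* toy stand-in for the kernel type */

struct allmasks {
	cpumask_t tmpmask;		/* shared scratch, kept first */
	cpumask_t nodemask;
	cpumask_t send_covered;
};

/* Same shape as SCHED_CPUMASK_VAR(v, a): bind pointer v to member v of *a. */
#define SCRATCH_VAR(v, a) cpumask_t *v = (cpumask_t *) \
		((unsigned long)(a) + offsetof(struct allmasks, v))

int main(void)
{
	struct allmasks *allmasks = malloc(sizeof(*allmasks));	/* NR_CPUS > 128 path */
	if (!allmasks)
		return 1;

	SCRATCH_VAR(nodemask, allmasks);
	SCRATCH_VAR(send_covered, allmasks);

	*nodemask = 0xf0;		/* writes land in the named members */
	*send_covered = 0x0f;
	printf("nodemask=%#lx send_covered=%#lx\n",
	       allmasks->nodemask, allmasks->send_covered);

	free(allmasks);
	return 0;
}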
| @@ -6601,94 +7422,120 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6601 | 7422 | ||
| 6602 | p = sd; | 7423 | p = sd; |
| 6603 | sd = &per_cpu(phys_domains, i); | 7424 | sd = &per_cpu(phys_domains, i); |
| 6604 | *sd = SD_CPU_INIT; | 7425 | SD_INIT(sd, CPU); |
| 6605 | sd->span = nodemask; | 7426 | set_domain_attribute(sd, attr); |
| 7427 | sd->span = *nodemask; | ||
| 7428 | sd->first_cpu = first_cpu(sd->span); | ||
| 6606 | sd->parent = p; | 7429 | sd->parent = p; |
| 6607 | if (p) | 7430 | if (p) |
| 6608 | p->child = sd; | 7431 | p->child = sd; |
| 6609 | cpu_to_phys_group(i, cpu_map, &sd->groups); | 7432 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); |
| 6610 | 7433 | ||
| 6611 | #ifdef CONFIG_SCHED_MC | 7434 | #ifdef CONFIG_SCHED_MC |
| 6612 | p = sd; | 7435 | p = sd; |
| 6613 | sd = &per_cpu(core_domains, i); | 7436 | sd = &per_cpu(core_domains, i); |
| 6614 | *sd = SD_MC_INIT; | 7437 | SD_INIT(sd, MC); |
| 7438 | set_domain_attribute(sd, attr); | ||
| 6615 | sd->span = cpu_coregroup_map(i); | 7439 | sd->span = cpu_coregroup_map(i); |
| 7440 | sd->first_cpu = first_cpu(sd->span); | ||
| 6616 | cpus_and(sd->span, sd->span, *cpu_map); | 7441 | cpus_and(sd->span, sd->span, *cpu_map); |
| 6617 | sd->parent = p; | 7442 | sd->parent = p; |
| 6618 | p->child = sd; | 7443 | p->child = sd; |
| 6619 | cpu_to_core_group(i, cpu_map, &sd->groups); | 7444 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); |
| 6620 | #endif | 7445 | #endif |
| 6621 | 7446 | ||
| 6622 | #ifdef CONFIG_SCHED_SMT | 7447 | #ifdef CONFIG_SCHED_SMT |
| 6623 | p = sd; | 7448 | p = sd; |
| 6624 | sd = &per_cpu(cpu_domains, i); | 7449 | sd = &per_cpu(cpu_domains, i); |
| 6625 | *sd = SD_SIBLING_INIT; | 7450 | SD_INIT(sd, SIBLING); |
| 7451 | set_domain_attribute(sd, attr); | ||
| 6626 | sd->span = per_cpu(cpu_sibling_map, i); | 7452 | sd->span = per_cpu(cpu_sibling_map, i); |
| 7453 | sd->first_cpu = first_cpu(sd->span); | ||
| 6627 | cpus_and(sd->span, sd->span, *cpu_map); | 7454 | cpus_and(sd->span, sd->span, *cpu_map); |
| 6628 | sd->parent = p; | 7455 | sd->parent = p; |
| 6629 | p->child = sd; | 7456 | p->child = sd; |
| 6630 | cpu_to_cpu_group(i, cpu_map, &sd->groups); | 7457 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); |
| 6631 | #endif | 7458 | #endif |
| 6632 | } | 7459 | } |
| 6633 | 7460 | ||
| 6634 | #ifdef CONFIG_SCHED_SMT | 7461 | #ifdef CONFIG_SCHED_SMT |
| 6635 | /* Set up CPU (sibling) groups */ | 7462 | /* Set up CPU (sibling) groups */ |
| 6636 | for_each_cpu_mask(i, *cpu_map) { | 7463 | for_each_cpu_mask(i, *cpu_map) { |
| 6637 | cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); | 7464 | SCHED_CPUMASK_VAR(this_sibling_map, allmasks); |
| 6638 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 7465 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
| 6639 | if (i != first_cpu(this_sibling_map)) | 7466 | |
| 7467 | *this_sibling_map = per_cpu(cpu_sibling_map, i); | ||
| 7468 | cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); | ||
| 7469 | if (i != first_cpu(*this_sibling_map)) | ||
| 6640 | continue; | 7470 | continue; |
| 6641 | 7471 | ||
| 6642 | init_sched_build_groups(this_sibling_map, cpu_map, | 7472 | init_sched_build_groups(this_sibling_map, cpu_map, |
| 6643 | &cpu_to_cpu_group); | 7473 | &cpu_to_cpu_group, |
| 7474 | send_covered, tmpmask); | ||
| 6644 | } | 7475 | } |
| 6645 | #endif | 7476 | #endif |
| 6646 | 7477 | ||
| 6647 | #ifdef CONFIG_SCHED_MC | 7478 | #ifdef CONFIG_SCHED_MC |
| 6648 | /* Set up multi-core groups */ | 7479 | /* Set up multi-core groups */ |
| 6649 | for_each_cpu_mask(i, *cpu_map) { | 7480 | for_each_cpu_mask(i, *cpu_map) { |
| 6650 | cpumask_t this_core_map = cpu_coregroup_map(i); | 7481 | SCHED_CPUMASK_VAR(this_core_map, allmasks); |
| 6651 | cpus_and(this_core_map, this_core_map, *cpu_map); | 7482 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
| 6652 | if (i != first_cpu(this_core_map)) | 7483 | |
| 7484 | *this_core_map = cpu_coregroup_map(i); | ||
| 7485 | cpus_and(*this_core_map, *this_core_map, *cpu_map); | ||
| 7486 | if (i != first_cpu(*this_core_map)) | ||
| 6653 | continue; | 7487 | continue; |
| 7488 | |||
| 6654 | init_sched_build_groups(this_core_map, cpu_map, | 7489 | init_sched_build_groups(this_core_map, cpu_map, |
| 6655 | &cpu_to_core_group); | 7490 | &cpu_to_core_group, |
| 7491 | send_covered, tmpmask); | ||
| 6656 | } | 7492 | } |
| 6657 | #endif | 7493 | #endif |
| 6658 | 7494 | ||
| 6659 | /* Set up physical groups */ | 7495 | /* Set up physical groups */ |
| 6660 | for (i = 0; i < MAX_NUMNODES; i++) { | 7496 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6661 | cpumask_t nodemask = node_to_cpumask(i); | 7497 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
| 7498 | SCHED_CPUMASK_VAR(send_covered, allmasks); | ||
| 6662 | 7499 | ||
| 6663 | cpus_and(nodemask, nodemask, *cpu_map); | 7500 | *nodemask = node_to_cpumask(i); |
| 6664 | if (cpus_empty(nodemask)) | 7501 | cpus_and(*nodemask, *nodemask, *cpu_map); |
| 7502 | if (cpus_empty(*nodemask)) | ||
| 6665 | continue; | 7503 | continue; |
| 6666 | 7504 | ||
| 6667 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); | 7505 | init_sched_build_groups(nodemask, cpu_map, |
| 7506 | &cpu_to_phys_group, | ||
| 7507 | send_covered, tmpmask); | ||
| 6668 | } | 7508 | } |
| 6669 | 7509 | ||
| 6670 | #ifdef CONFIG_NUMA | 7510 | #ifdef CONFIG_NUMA |
| 6671 | /* Set up node groups */ | 7511 | /* Set up node groups */ |
| 6672 | if (sd_allnodes) | 7512 | if (sd_allnodes) { |
| 6673 | init_sched_build_groups(*cpu_map, cpu_map, | 7513 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
| 6674 | &cpu_to_allnodes_group); | 7514 | |
| 7515 | init_sched_build_groups(cpu_map, cpu_map, | ||
| 7516 | &cpu_to_allnodes_group, | ||
| 7517 | send_covered, tmpmask); | ||
| 7518 | } | ||
| 6675 | 7519 | ||
| 6676 | for (i = 0; i < MAX_NUMNODES; i++) { | 7520 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6677 | /* Set up node groups */ | 7521 | /* Set up node groups */ |
| 6678 | struct sched_group *sg, *prev; | 7522 | struct sched_group *sg, *prev; |
| 6679 | cpumask_t nodemask = node_to_cpumask(i); | 7523 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
| 6680 | cpumask_t domainspan; | 7524 | SCHED_CPUMASK_VAR(domainspan, allmasks); |
| 6681 | cpumask_t covered = CPU_MASK_NONE; | 7525 | SCHED_CPUMASK_VAR(covered, allmasks); |
| 6682 | int j; | 7526 | int j; |
| 6683 | 7527 | ||
| 6684 | cpus_and(nodemask, nodemask, *cpu_map); | 7528 | *nodemask = node_to_cpumask(i); |
| 6685 | if (cpus_empty(nodemask)) { | 7529 | cpus_clear(*covered); |
| 7530 | |||
| 7531 | cpus_and(*nodemask, *nodemask, *cpu_map); | ||
| 7532 | if (cpus_empty(*nodemask)) { | ||
| 6686 | sched_group_nodes[i] = NULL; | 7533 | sched_group_nodes[i] = NULL; |
| 6687 | continue; | 7534 | continue; |
| 6688 | } | 7535 | } |
| 6689 | 7536 | ||
| 6690 | domainspan = sched_domain_node_span(i); | 7537 | sched_domain_node_span(i, domainspan); |
| 6691 | cpus_and(domainspan, domainspan, *cpu_map); | 7538 | cpus_and(*domainspan, *domainspan, *cpu_map); |
| 6692 | 7539 | ||
| 6693 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); | 7540 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
| 6694 | if (!sg) { | 7541 | if (!sg) { |
| @@ -6697,31 +7544,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6697 | goto error; | 7544 | goto error; |
| 6698 | } | 7545 | } |
| 6699 | sched_group_nodes[i] = sg; | 7546 | sched_group_nodes[i] = sg; |
| 6700 | for_each_cpu_mask(j, nodemask) { | 7547 | for_each_cpu_mask(j, *nodemask) { |
| 6701 | struct sched_domain *sd; | 7548 | struct sched_domain *sd; |
| 6702 | 7549 | ||
| 6703 | sd = &per_cpu(node_domains, j); | 7550 | sd = &per_cpu(node_domains, j); |
| 6704 | sd->groups = sg; | 7551 | sd->groups = sg; |
| 6705 | } | 7552 | } |
| 6706 | sg->__cpu_power = 0; | 7553 | sg->__cpu_power = 0; |
| 6707 | sg->cpumask = nodemask; | 7554 | sg->cpumask = *nodemask; |
| 6708 | sg->next = sg; | 7555 | sg->next = sg; |
| 6709 | cpus_or(covered, covered, nodemask); | 7556 | cpus_or(*covered, *covered, *nodemask); |
| 6710 | prev = sg; | 7557 | prev = sg; |
| 6711 | 7558 | ||
| 6712 | for (j = 0; j < MAX_NUMNODES; j++) { | 7559 | for (j = 0; j < MAX_NUMNODES; j++) { |
| 6713 | cpumask_t tmp, notcovered; | 7560 | SCHED_CPUMASK_VAR(notcovered, allmasks); |
| 6714 | int n = (i + j) % MAX_NUMNODES; | 7561 | int n = (i + j) % MAX_NUMNODES; |
| 7562 | node_to_cpumask_ptr(pnodemask, n); | ||
| 6715 | 7563 | ||
| 6716 | cpus_complement(notcovered, covered); | 7564 | cpus_complement(*notcovered, *covered); |
| 6717 | cpus_and(tmp, notcovered, *cpu_map); | 7565 | cpus_and(*tmpmask, *notcovered, *cpu_map); |
| 6718 | cpus_and(tmp, tmp, domainspan); | 7566 | cpus_and(*tmpmask, *tmpmask, *domainspan); |
| 6719 | if (cpus_empty(tmp)) | 7567 | if (cpus_empty(*tmpmask)) |
| 6720 | break; | 7568 | break; |
| 6721 | 7569 | ||
| 6722 | nodemask = node_to_cpumask(n); | 7570 | cpus_and(*tmpmask, *tmpmask, *pnodemask); |
| 6723 | cpus_and(tmp, tmp, nodemask); | 7571 | if (cpus_empty(*tmpmask)) |
| 6724 | if (cpus_empty(tmp)) | ||
| 6725 | continue; | 7572 | continue; |
| 6726 | 7573 | ||
| 6727 | sg = kmalloc_node(sizeof(struct sched_group), | 7574 | sg = kmalloc_node(sizeof(struct sched_group), |
| @@ -6732,9 +7579,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6732 | goto error; | 7579 | goto error; |
| 6733 | } | 7580 | } |
| 6734 | sg->__cpu_power = 0; | 7581 | sg->__cpu_power = 0; |
| 6735 | sg->cpumask = tmp; | 7582 | sg->cpumask = *tmpmask; |
| 6736 | sg->next = prev->next; | 7583 | sg->next = prev->next; |
| 6737 | cpus_or(covered, covered, tmp); | 7584 | cpus_or(*covered, *covered, *tmpmask); |
| 6738 | prev->next = sg; | 7585 | prev->next = sg; |
| 6739 | prev = sg; | 7586 | prev = sg; |
| 6740 | } | 7587 | } |
| @@ -6770,7 +7617,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6770 | if (sd_allnodes) { | 7617 | if (sd_allnodes) { |
| 6771 | struct sched_group *sg; | 7618 | struct sched_group *sg; |
| 6772 | 7619 | ||
| 6773 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | 7620 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, |
| 7621 | tmpmask); | ||
| 6774 | init_numa_sched_groups_power(sg); | 7622 | init_numa_sched_groups_power(sg); |
| 6775 | } | 7623 | } |
| 6776 | #endif | 7624 | #endif |
| @@ -6788,17 +7636,26 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6788 | cpu_attach_domain(sd, rd, i); | 7636 | cpu_attach_domain(sd, rd, i); |
| 6789 | } | 7637 | } |
| 6790 | 7638 | ||
| 7639 | SCHED_CPUMASK_FREE((void *)allmasks); | ||
| 6791 | return 0; | 7640 | return 0; |
| 6792 | 7641 | ||
| 6793 | #ifdef CONFIG_NUMA | 7642 | #ifdef CONFIG_NUMA |
| 6794 | error: | 7643 | error: |
| 6795 | free_sched_groups(cpu_map); | 7644 | free_sched_groups(cpu_map, tmpmask); |
| 7645 | SCHED_CPUMASK_FREE((void *)allmasks); | ||
| 6796 | return -ENOMEM; | 7646 | return -ENOMEM; |
| 6797 | #endif | 7647 | #endif |
| 6798 | } | 7648 | } |
| 6799 | 7649 | ||
| 7650 | static int build_sched_domains(const cpumask_t *cpu_map) | ||
| 7651 | { | ||
| 7652 | return __build_sched_domains(cpu_map, NULL); | ||
| 7653 | } | ||
| 7654 | |||
| 6800 | static cpumask_t *doms_cur; /* current sched domains */ | 7655 | static cpumask_t *doms_cur; /* current sched domains */ |
| 6801 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 7656 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
| 7657 | static struct sched_domain_attr *dattr_cur; /* attributes of custom domains | ||
| 7658 | in 'doms_cur' */ | ||
| 6802 | 7659 | ||
| 6803 | /* | 7660 | /* |
| 6804 | * Special case: If a kmalloc of a doms_cur partition (array of | 7661 | * Special case: If a kmalloc of a doms_cur partition (array of |
| @@ -6826,15 +7683,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
| 6826 | if (!doms_cur) | 7683 | if (!doms_cur) |
| 6827 | doms_cur = &fallback_doms; | 7684 | doms_cur = &fallback_doms; |
| 6828 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); | 7685 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); |
| 7686 | dattr_cur = NULL; | ||
| 6829 | err = build_sched_domains(doms_cur); | 7687 | err = build_sched_domains(doms_cur); |
| 6830 | register_sched_domain_sysctl(); | 7688 | register_sched_domain_sysctl(); |
| 6831 | 7689 | ||
| 6832 | return err; | 7690 | return err; |
| 6833 | } | 7691 | } |
| 6834 | 7692 | ||
| 6835 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 7693 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map, |
| 7694 | cpumask_t *tmpmask) | ||
| 6836 | { | 7695 | { |
| 6837 | free_sched_groups(cpu_map); | 7696 | free_sched_groups(cpu_map, tmpmask); |
| 6838 | } | 7697 | } |
| 6839 | 7698 | ||
| 6840 | /* | 7699 | /* |
| @@ -6843,6 +7702,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | |||
| 6843 | */ | 7702 | */ |
| 6844 | static void detach_destroy_domains(const cpumask_t *cpu_map) | 7703 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
| 6845 | { | 7704 | { |
| 7705 | cpumask_t tmpmask; | ||
| 6846 | int i; | 7706 | int i; |
| 6847 | 7707 | ||
| 6848 | unregister_sched_domain_sysctl(); | 7708 | unregister_sched_domain_sysctl(); |
| @@ -6850,7 +7710,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6850 | for_each_cpu_mask(i, *cpu_map) | 7710 | for_each_cpu_mask(i, *cpu_map) |
| 6851 | cpu_attach_domain(NULL, &def_root_domain, i); | 7711 | cpu_attach_domain(NULL, &def_root_domain, i); |
| 6852 | synchronize_sched(); | 7712 | synchronize_sched(); |
| 6853 | arch_destroy_sched_domains(cpu_map); | 7713 | arch_destroy_sched_domains(cpu_map, &tmpmask); |
| 7714 | } | ||
| 7715 | |||
| 7716 | /* handle null as "default" */ | ||
| 7717 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
| 7718 | struct sched_domain_attr *new, int idx_new) | ||
| 7719 | { | ||
| 7720 | struct sched_domain_attr tmp; | ||
| 7721 | |||
| 7722 | /* fast path */ | ||
| 7723 | if (!new && !cur) | ||
| 7724 | return 1; | ||
| 7725 | |||
| 7726 | tmp = SD_ATTR_INIT; | ||
| 7727 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
| 7728 | new ? (new + idx_new) : &tmp, | ||
| 7729 | sizeof(struct sched_domain_attr)); | ||
| 6854 | } | 7730 | } |
| 6855 | 7731 | ||
| 6856 | /* | 7732 | /* |
| @@ -6874,7 +7750,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6874 | * | 7750 | * |
| 6875 | * Call with hotplug lock held | 7751 | * Call with hotplug lock held |
| 6876 | */ | 7752 | */ |
| 6877 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | 7753 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
| 7754 | struct sched_domain_attr *dattr_new) | ||
| 6878 | { | 7755 | { |
| 6879 | int i, j; | 7756 | int i, j; |
| 6880 | 7757 | ||
| @@ -6887,12 +7764,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
| 6887 | ndoms_new = 1; | 7764 | ndoms_new = 1; |
| 6888 | doms_new = &fallback_doms; | 7765 | doms_new = &fallback_doms; |
| 6889 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7766 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
| 7767 | dattr_new = NULL; | ||
| 6890 | } | 7768 | } |
| 6891 | 7769 | ||
| 6892 | /* Destroy deleted domains */ | 7770 | /* Destroy deleted domains */ |
| 6893 | for (i = 0; i < ndoms_cur; i++) { | 7771 | for (i = 0; i < ndoms_cur; i++) { |
| 6894 | for (j = 0; j < ndoms_new; j++) { | 7772 | for (j = 0; j < ndoms_new; j++) { |
| 6895 | if (cpus_equal(doms_cur[i], doms_new[j])) | 7773 | if (cpus_equal(doms_cur[i], doms_new[j]) |
| 7774 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
| 6896 | goto match1; | 7775 | goto match1; |
| 6897 | } | 7776 | } |
| 6898 | /* no match - a current sched domain not in new doms_new[] */ | 7777 | /* no match - a current sched domain not in new doms_new[] */ |
| @@ -6904,11 +7783,13 @@ match1: | |||
| 6904 | /* Build new domains */ | 7783 | /* Build new domains */ |
| 6905 | for (i = 0; i < ndoms_new; i++) { | 7784 | for (i = 0; i < ndoms_new; i++) { |
| 6906 | for (j = 0; j < ndoms_cur; j++) { | 7785 | for (j = 0; j < ndoms_cur; j++) { |
| 6907 | if (cpus_equal(doms_new[i], doms_cur[j])) | 7786 | if (cpus_equal(doms_new[i], doms_cur[j]) |
| 7787 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
| 6908 | goto match2; | 7788 | goto match2; |
| 6909 | } | 7789 | } |
| 6910 | /* no match - add a new doms_new */ | 7790 | /* no match - add a new doms_new */ |
| 6911 | build_sched_domains(doms_new + i); | 7791 | __build_sched_domains(doms_new + i, |
| 7792 | dattr_new ? dattr_new + i : NULL); | ||
| 6912 | match2: | 7793 | match2: |
| 6913 | ; | 7794 | ; |
| 6914 | } | 7795 | } |
| @@ -6916,7 +7797,9 @@ match2: | |||
| 6916 | /* Remember the new sched domains */ | 7797 | /* Remember the new sched domains */ |
| 6917 | if (doms_cur != &fallback_doms) | 7798 | if (doms_cur != &fallback_doms) |
| 6918 | kfree(doms_cur); | 7799 | kfree(doms_cur); |
| 7800 | kfree(dattr_cur); /* kfree(NULL) is safe */ | ||
| 6919 | doms_cur = doms_new; | 7801 | doms_cur = doms_new; |
| 7802 | dattr_cur = dattr_new; | ||
| 6920 | ndoms_cur = ndoms_new; | 7803 | ndoms_cur = ndoms_new; |
| 6921 | 7804 | ||
| 6922 | register_sched_domain_sysctl(); | 7805 | register_sched_domain_sysctl(); |
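partition_sched_domains() now reuses an existing domain only when both the cpumask and the attributes match, and dattrs_equal() treats a NULL attribute array as "every entry is the default". A minimal user-space model of that comparison is sketched below; the single relax_domain_level field and the -1 default are assumptions made for the example, not the full kernel definition.

#include <stdio.h>
#include <string.h>

struct sched_domain_attr { int relax_domain_level; };
#define SD_ATTR_INIT (struct sched_domain_attr){ .relax_domain_level = -1 }

/* NULL array == "every entry is the default", as in dattrs_equal(). */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
			struct sched_domain_attr *new_attrs, int idx_new)
{
	struct sched_domain_attr tmp = SD_ATTR_INIT;

	if (!cur && !new_attrs)
		return 1;		/* fast path: both default */

	return !memcmp(cur ? cur + idx_cur : &tmp,
		       new_attrs ? new_attrs + idx_new : &tmp,
		       sizeof(struct sched_domain_attr));
}

int main(void)
{
	struct sched_domain_attr def[1]     = { { .relax_domain_level = -1 } };
	struct sched_domain_attr relaxed[1] = { { .relax_domain_level =  2 } };

	printf("default vs NULL: %d\n", dattrs_equal(def, 0, NULL, 0));	    /* 1 */
	printf("level 2 vs NULL: %d\n", dattrs_equal(relaxed, 0, NULL, 0)); /* 0 */
	return 0;
}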
| @@ -7043,6 +7926,11 @@ void __init sched_init_smp(void) | |||
| 7043 | { | 7926 | { |
| 7044 | cpumask_t non_isolated_cpus; | 7927 | cpumask_t non_isolated_cpus; |
| 7045 | 7928 | ||
| 7929 | #if defined(CONFIG_NUMA) | ||
| 7930 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
| 7931 | GFP_KERNEL); | ||
| 7932 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
| 7933 | #endif | ||
| 7046 | get_online_cpus(); | 7934 | get_online_cpus(); |
| 7047 | arch_init_sched_domains(&cpu_online_map); | 7935 | arch_init_sched_domains(&cpu_online_map); |
| 7048 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7936 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
| @@ -7053,7 +7941,7 @@ void __init sched_init_smp(void) | |||
| 7053 | hotcpu_notifier(update_sched_domains, 0); | 7941 | hotcpu_notifier(update_sched_domains, 0); |
| 7054 | 7942 | ||
| 7055 | /* Move init over to a non-isolated CPU */ | 7943 | /* Move init over to a non-isolated CPU */ |
| 7056 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7944 | if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) |
| 7057 | BUG(); | 7945 | BUG(); |
| 7058 | sched_init_granularity(); | 7946 | sched_init_granularity(); |
| 7059 | } | 7947 | } |
| @@ -7074,6 +7962,7 @@ int in_sched_functions(unsigned long addr) | |||
| 7074 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 7962 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
| 7075 | { | 7963 | { |
| 7076 | cfs_rq->tasks_timeline = RB_ROOT; | 7964 | cfs_rq->tasks_timeline = RB_ROOT; |
| 7965 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
| 7077 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7966 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7078 | cfs_rq->rq = rq; | 7967 | cfs_rq->rq = rq; |
| 7079 | #endif | 7968 | #endif |
| @@ -7103,6 +7992,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 7103 | 7992 | ||
| 7104 | rt_rq->rt_time = 0; | 7993 | rt_rq->rt_time = 0; |
| 7105 | rt_rq->rt_throttled = 0; | 7994 | rt_rq->rt_throttled = 0; |
| 7995 | rt_rq->rt_runtime = 0; | ||
| 7996 | spin_lock_init(&rt_rq->rt_runtime_lock); | ||
| 7106 | 7997 | ||
| 7107 | #ifdef CONFIG_RT_GROUP_SCHED | 7998 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7108 | rt_rq->rt_nr_boosted = 0; | 7999 | rt_rq->rt_nr_boosted = 0; |
| @@ -7111,10 +8002,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 7111 | } | 8002 | } |
| 7112 | 8003 | ||
| 7113 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8004 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7114 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | 8005 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
| 7115 | struct cfs_rq *cfs_rq, struct sched_entity *se, | 8006 | struct sched_entity *se, int cpu, int add, |
| 7116 | int cpu, int add) | 8007 | struct sched_entity *parent) |
| 7117 | { | 8008 | { |
| 8009 | struct rq *rq = cpu_rq(cpu); | ||
| 7118 | tg->cfs_rq[cpu] = cfs_rq; | 8010 | tg->cfs_rq[cpu] = cfs_rq; |
| 7119 | init_cfs_rq(cfs_rq, rq); | 8011 | init_cfs_rq(cfs_rq, rq); |
| 7120 | cfs_rq->tg = tg; | 8012 | cfs_rq->tg = tg; |
| @@ -7122,45 +8014,132 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | |||
| 7122 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 8014 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
| 7123 | 8015 | ||
| 7124 | tg->se[cpu] = se; | 8016 | tg->se[cpu] = se; |
| 7125 | se->cfs_rq = &rq->cfs; | 8017 | /* se could be NULL for init_task_group */ |
| 8018 | if (!se) | ||
| 8019 | return; | ||
| 8020 | |||
| 8021 | if (!parent) | ||
| 8022 | se->cfs_rq = &rq->cfs; | ||
| 8023 | else | ||
| 8024 | se->cfs_rq = parent->my_q; | ||
| 8025 | |||
| 7126 | se->my_q = cfs_rq; | 8026 | se->my_q = cfs_rq; |
| 7127 | se->load.weight = tg->shares; | 8027 | se->load.weight = tg->shares; |
| 7128 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | 8028 | se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight); |
| 7129 | se->parent = NULL; | 8029 | se->parent = parent; |
| 7130 | } | 8030 | } |
| 7131 | #endif | 8031 | #endif |
| 7132 | 8032 | ||
| 7133 | #ifdef CONFIG_RT_GROUP_SCHED | 8033 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7134 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | 8034 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
| 7135 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | 8035 | struct sched_rt_entity *rt_se, int cpu, int add, |
| 7136 | int cpu, int add) | 8036 | struct sched_rt_entity *parent) |
| 7137 | { | 8037 | { |
| 8038 | struct rq *rq = cpu_rq(cpu); | ||
| 8039 | |||
| 7138 | tg->rt_rq[cpu] = rt_rq; | 8040 | tg->rt_rq[cpu] = rt_rq; |
| 7139 | init_rt_rq(rt_rq, rq); | 8041 | init_rt_rq(rt_rq, rq); |
| 7140 | rt_rq->tg = tg; | 8042 | rt_rq->tg = tg; |
| 7141 | rt_rq->rt_se = rt_se; | 8043 | rt_rq->rt_se = rt_se; |
| 8044 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 7142 | if (add) | 8045 | if (add) |
| 7143 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 8046 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
| 7144 | 8047 | ||
| 7145 | tg->rt_se[cpu] = rt_se; | 8048 | tg->rt_se[cpu] = rt_se; |
| 8049 | if (!rt_se) | ||
| 8050 | return; | ||
| 8051 | |||
| 8052 | if (!parent) | ||
| 8053 | rt_se->rt_rq = &rq->rt; | ||
| 8054 | else | ||
| 8055 | rt_se->rt_rq = parent->my_q; | ||
| 8056 | |||
| 7146 | rt_se->rt_rq = &rq->rt; | 8057 | rt_se->rt_rq = &rq->rt; |
| 7147 | rt_se->my_q = rt_rq; | 8058 | rt_se->my_q = rt_rq; |
| 7148 | rt_se->parent = NULL; | 8059 | rt_se->parent = parent; |
| 7149 | INIT_LIST_HEAD(&rt_se->run_list); | 8060 | INIT_LIST_HEAD(&rt_se->run_list); |
| 7150 | } | 8061 | } |
| 7151 | #endif | 8062 | #endif |
| 7152 | 8063 | ||
| 7153 | void __init sched_init(void) | 8064 | void __init sched_init(void) |
| 7154 | { | 8065 | { |
| 7155 | int highest_cpu = 0; | ||
| 7156 | int i, j; | 8066 | int i, j; |
| 8067 | unsigned long alloc_size = 0, ptr; | ||
| 8068 | |||
| 8069 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 8070 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | ||
| 8071 | #endif | ||
| 8072 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 8073 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | ||
| 8074 | #endif | ||
| 8075 | #ifdef CONFIG_USER_SCHED | ||
| 8076 | alloc_size *= 2; | ||
| 8077 | #endif | ||
| 8078 | /* | ||
| 8079 | * As sched_init() is called before page_alloc is set up, | ||
| 8080 | * we use alloc_bootmem(). | ||
| 8081 | */ | ||
| 8082 | if (alloc_size) { | ||
| 8083 | ptr = (unsigned long)alloc_bootmem(alloc_size); | ||
| 8084 | |||
| 8085 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 8086 | init_task_group.se = (struct sched_entity **)ptr; | ||
| 8087 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8088 | |||
| 8089 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
| 8090 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8091 | |||
| 8092 | #ifdef CONFIG_USER_SCHED | ||
| 8093 | root_task_group.se = (struct sched_entity **)ptr; | ||
| 8094 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8095 | |||
| 8096 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
| 8097 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8098 | #endif | ||
| 8099 | #endif | ||
| 8100 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 8101 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
| 8102 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8103 | |||
| 8104 | init_task_group.rt_rq = (struct rt_rq **)ptr; | ||
| 8105 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8106 | |||
| 8107 | #ifdef CONFIG_USER_SCHED | ||
| 8108 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
| 8109 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8110 | |||
| 8111 | root_task_group.rt_rq = (struct rt_rq **)ptr; | ||
| 8112 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 8113 | #endif | ||
| 8114 | #endif | ||
| 8115 | } | ||
| 7157 | 8116 | ||
| 7158 | #ifdef CONFIG_SMP | 8117 | #ifdef CONFIG_SMP |
| 8118 | init_aggregate(); | ||
| 7159 | init_defrootdomain(); | 8119 | init_defrootdomain(); |
| 7160 | #endif | 8120 | #endif |
| 7161 | 8121 | ||
| 8122 | init_rt_bandwidth(&def_rt_bandwidth, | ||
| 8123 | global_rt_period(), global_rt_runtime()); | ||
| 8124 | |||
| 8125 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 8126 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | ||
| 8127 | global_rt_period(), global_rt_runtime()); | ||
| 8128 | #ifdef CONFIG_USER_SCHED | ||
| 8129 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | ||
| 8130 | global_rt_period(), RUNTIME_INF); | ||
| 8131 | #endif | ||
| 8132 | #endif | ||
| 8133 | |||
| 7162 | #ifdef CONFIG_GROUP_SCHED | 8134 | #ifdef CONFIG_GROUP_SCHED |
| 7163 | list_add(&init_task_group.list, &task_groups); | 8135 | list_add(&init_task_group.list, &task_groups); |
| 8136 | INIT_LIST_HEAD(&init_task_group.children); | ||
| 8137 | |||
| 8138 | #ifdef CONFIG_USER_SCHED | ||
| 8139 | INIT_LIST_HEAD(&root_task_group.children); | ||
| 8140 | init_task_group.parent = &root_task_group; | ||
| 8141 | list_add(&init_task_group.siblings, &root_task_group.children); | ||
| 8142 | #endif | ||
| 7164 | #endif | 8143 | #endif |
| 7165 | 8144 | ||
| 7166 | for_each_possible_cpu(i) { | 8145 | for_each_possible_cpu(i) { |
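The sched_init() changes above size one boot-time allocation for all of the per-cpu pointer arrays and then carve it up by repeatedly advancing ptr. The same idiom in ordinary user-space C, with malloc() standing in for alloc_bootmem() and only two of the arrays shown, is roughly:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int nr_cpu_ids = 4;		/* stand-in for the real per-boot value */
	size_t alloc_size = 0;
	unsigned long ptr;
	void **se, **cfs_rq;		/* two of the carved-out pointer arrays */

	alloc_size += 2 * nr_cpu_ids * sizeof(void **);

	/* One allocation up front (alloc_bootmem() in the kernel)... */
	ptr = (unsigned long)malloc(alloc_size);
	if (!ptr)
		return 1;

	/* ...then consecutive slices are handed out by bumping ptr. */
	se = (void **)ptr;
	ptr += nr_cpu_ids * sizeof(void **);

	cfs_rq = (void **)ptr;
	ptr += nr_cpu_ids * sizeof(void **);

	printf("se=%p cfs_rq=%p gap=%zu bytes\n",
	       (void *)se, (void *)cfs_rq, (size_t)((char *)cfs_rq - (char *)se));

	free(se);			/* se still points at the block start */
	return 0;
}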
| @@ -7171,26 +8150,68 @@ void __init sched_init(void) | |||
| 7171 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 8150 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
| 7172 | rq->nr_running = 0; | 8151 | rq->nr_running = 0; |
| 7173 | rq->clock = 1; | 8152 | rq->clock = 1; |
| 8153 | update_last_tick_seen(rq); | ||
| 7174 | init_cfs_rq(&rq->cfs, rq); | 8154 | init_cfs_rq(&rq->cfs, rq); |
| 7175 | init_rt_rq(&rq->rt, rq); | 8155 | init_rt_rq(&rq->rt, rq); |
| 7176 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8156 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7177 | init_task_group.shares = init_task_group_load; | 8157 | init_task_group.shares = init_task_group_load; |
| 7178 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 8158 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| 7179 | init_tg_cfs_entry(rq, &init_task_group, | 8159 | #ifdef CONFIG_CGROUP_SCHED |
| 8160 | /* | ||
| 8161 | * How much cpu bandwidth does init_task_group get? | ||
| 8162 | * | ||
| 8163 | * In case of task-groups formed through the cgroup filesystem, it | ||
| 8164 | * gets 100% of the cpu resources in the system. This overall | ||
| 8165 | * system cpu resource is divided among the tasks of | ||
| 8166 | * init_task_group and its child task-groups in a fair manner, | ||
| 8167 | * based on each entity's (task or task-group's) weight | ||
| 8168 | * (se->load.weight). | ||
| 8169 | * | ||
| 8170 | * In other words, if init_task_group has 10 tasks (of weight | ||
| 8171 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | ||
| 8172 | * then A0's share of the cpu resource is: | ||
| 8173 | * | ||
| 8174 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | ||
| 8175 | * | ||
| 8176 | * We achieve this by letting init_task_group's tasks sit | ||
| 8177 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | ||
| 8178 | */ | ||
| 8179 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | ||
| 8180 | #elif defined CONFIG_USER_SCHED | ||
| 8181 | root_task_group.shares = NICE_0_LOAD; | ||
| 8182 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); | ||
| 8183 | /* | ||
| 8184 | * In case of task-groups formed through the user id of tasks, | ||
| 8185 | * init_task_group represents tasks belonging to root user. | ||
| 8186 | * Hence it forms a sibling of all subsequent groups formed. | ||
| 8187 | * In this case, init_task_group gets only a fraction of overall | ||
| 8188 | * system cpu resource, based on the weight assigned to root | ||
| 8189 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
| 8190 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
| 8191 | * (init_cfs_rq) and having one entity represent this group of | ||
| 8192 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
| 8193 | */ | ||
| 8194 | init_tg_cfs_entry(&init_task_group, | ||
| 7180 | &per_cpu(init_cfs_rq, i), | 8195 | &per_cpu(init_cfs_rq, i), |
| 7181 | &per_cpu(init_sched_entity, i), i, 1); | 8196 | &per_cpu(init_sched_entity, i), i, 1, |
| 8197 | root_task_group.se[i]); | ||
| 7182 | 8198 | ||
| 7183 | #endif | 8199 | #endif |
| 8200 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 8201 | |||
| 8202 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
| 7184 | #ifdef CONFIG_RT_GROUP_SCHED | 8203 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7185 | init_task_group.rt_runtime = | ||
| 7186 | sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
| 7187 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 8204 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
| 7188 | init_tg_rt_entry(rq, &init_task_group, | 8205 | #ifdef CONFIG_CGROUP_SCHED |
| 8206 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
| 8207 | #elif defined CONFIG_USER_SCHED | ||
| 8208 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); | ||
| 8209 | init_tg_rt_entry(&init_task_group, | ||
| 7189 | &per_cpu(init_rt_rq, i), | 8210 | &per_cpu(init_rt_rq, i), |
| 7190 | &per_cpu(init_sched_rt_entity, i), i, 1); | 8211 | &per_cpu(init_sched_rt_entity, i), i, 1, |
| 8212 | root_task_group.rt_se[i]); | ||
| 8213 | #endif | ||
| 7191 | #endif | 8214 | #endif |
| 7192 | rq->rt_period_expire = 0; | ||
| 7193 | rq->rt_throttled = 0; | ||
| 7194 | 8215 | ||
| 7195 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8216 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
| 7196 | rq->cpu_load[j] = 0; | 8217 | rq->cpu_load[j] = 0; |
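The bandwidth comment in this hunk boils down to: an entity's share is its weight divided by the sum of all sibling weights at the same level. A few lines of arithmetic reproduce the 8.33% figure quoted there; the weights and counts are exactly the ones from the comment, nothing else is implied.

#include <stdio.h>

int main(void)
{
	double task_weight = 1024.0, group_weight = 1024.0;
	int ntasks = 10, ngroups = 2;

	/* A0's share = its weight over the sum of all sibling weights. */
	double total = ntasks * task_weight + ngroups * group_weight;
	printf("A0's bandwidth = %.2f%%\n", 100.0 * group_weight / total);
	return 0;
}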
| @@ -7207,7 +8228,6 @@ void __init sched_init(void) | |||
| 7207 | #endif | 8228 | #endif |
| 7208 | init_rq_hrtick(rq); | 8229 | init_rq_hrtick(rq); |
| 7209 | atomic_set(&rq->nr_iowait, 0); | 8230 | atomic_set(&rq->nr_iowait, 0); |
| 7210 | highest_cpu = i; | ||
| 7211 | } | 8231 | } |
| 7212 | 8232 | ||
| 7213 | set_load_weight(&init_task); | 8233 | set_load_weight(&init_task); |
| @@ -7217,7 +8237,6 @@ void __init sched_init(void) | |||
| 7217 | #endif | 8237 | #endif |
| 7218 | 8238 | ||
| 7219 | #ifdef CONFIG_SMP | 8239 | #ifdef CONFIG_SMP |
| 7220 | nr_cpu_ids = highest_cpu + 1; | ||
| 7221 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 8240 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); |
| 7222 | #endif | 8241 | #endif |
| 7223 | 8242 | ||
| @@ -7376,8 +8395,6 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
| 7376 | 8395 | ||
| 7377 | #endif | 8396 | #endif |
| 7378 | 8397 | ||
| 7379 | #ifdef CONFIG_GROUP_SCHED | ||
| 7380 | |||
| 7381 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8398 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7382 | static void free_fair_sched_group(struct task_group *tg) | 8399 | static void free_fair_sched_group(struct task_group *tg) |
| 7383 | { | 8400 | { |
| @@ -7394,17 +8411,18 @@ static void free_fair_sched_group(struct task_group *tg) | |||
| 7394 | kfree(tg->se); | 8411 | kfree(tg->se); |
| 7395 | } | 8412 | } |
| 7396 | 8413 | ||
| 7397 | static int alloc_fair_sched_group(struct task_group *tg) | 8414 | static |
| 8415 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 7398 | { | 8416 | { |
| 7399 | struct cfs_rq *cfs_rq; | 8417 | struct cfs_rq *cfs_rq; |
| 7400 | struct sched_entity *se; | 8418 | struct sched_entity *se, *parent_se; |
| 7401 | struct rq *rq; | 8419 | struct rq *rq; |
| 7402 | int i; | 8420 | int i; |
| 7403 | 8421 | ||
| 7404 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | 8422 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
| 7405 | if (!tg->cfs_rq) | 8423 | if (!tg->cfs_rq) |
| 7406 | goto err; | 8424 | goto err; |
| 7407 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 8425 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); |
| 7408 | if (!tg->se) | 8426 | if (!tg->se) |
| 7409 | goto err; | 8427 | goto err; |
| 7410 | 8428 | ||
| @@ -7423,7 +8441,8 @@ static int alloc_fair_sched_group(struct task_group *tg) | |||
| 7423 | if (!se) | 8441 | if (!se) |
| 7424 | goto err; | 8442 | goto err; |
| 7425 | 8443 | ||
| 7426 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); | 8444 | parent_se = parent ? parent->se[i] : NULL; |
| 8445 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); | ||
| 7427 | } | 8446 | } |
| 7428 | 8447 | ||
| 7429 | return 1; | 8448 | return 1; |
| @@ -7447,7 +8466,8 @@ static inline void free_fair_sched_group(struct task_group *tg) | |||
| 7447 | { | 8466 | { |
| 7448 | } | 8467 | } |
| 7449 | 8468 | ||
| 7450 | static inline int alloc_fair_sched_group(struct task_group *tg) | 8469 | static inline |
| 8470 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 7451 | { | 8471 | { |
| 7452 | return 1; | 8472 | return 1; |
| 7453 | } | 8473 | } |
| @@ -7466,6 +8486,8 @@ static void free_rt_sched_group(struct task_group *tg) | |||
| 7466 | { | 8486 | { |
| 7467 | int i; | 8487 | int i; |
| 7468 | 8488 | ||
| 8489 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
| 8490 | |||
| 7469 | for_each_possible_cpu(i) { | 8491 | for_each_possible_cpu(i) { |
| 7470 | if (tg->rt_rq) | 8492 | if (tg->rt_rq) |
| 7471 | kfree(tg->rt_rq[i]); | 8493 | kfree(tg->rt_rq[i]); |
| @@ -7477,21 +8499,23 @@ static void free_rt_sched_group(struct task_group *tg) | |||
| 7477 | kfree(tg->rt_se); | 8499 | kfree(tg->rt_se); |
| 7478 | } | 8500 | } |
| 7479 | 8501 | ||
| 7480 | static int alloc_rt_sched_group(struct task_group *tg) | 8502 | static |
| 8503 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 7481 | { | 8504 | { |
| 7482 | struct rt_rq *rt_rq; | 8505 | struct rt_rq *rt_rq; |
| 7483 | struct sched_rt_entity *rt_se; | 8506 | struct sched_rt_entity *rt_se, *parent_se; |
| 7484 | struct rq *rq; | 8507 | struct rq *rq; |
| 7485 | int i; | 8508 | int i; |
| 7486 | 8509 | ||
| 7487 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | 8510 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
| 7488 | if (!tg->rt_rq) | 8511 | if (!tg->rt_rq) |
| 7489 | goto err; | 8512 | goto err; |
| 7490 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | 8513 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); |
| 7491 | if (!tg->rt_se) | 8514 | if (!tg->rt_se) |
| 7492 | goto err; | 8515 | goto err; |
| 7493 | 8516 | ||
| 7494 | tg->rt_runtime = 0; | 8517 | init_rt_bandwidth(&tg->rt_bandwidth, |
| 8518 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
| 7495 | 8519 | ||
| 7496 | for_each_possible_cpu(i) { | 8520 | for_each_possible_cpu(i) { |
| 7497 | rq = cpu_rq(i); | 8521 | rq = cpu_rq(i); |
| @@ -7506,7 +8530,8 @@ static int alloc_rt_sched_group(struct task_group *tg) | |||
| 7506 | if (!rt_se) | 8530 | if (!rt_se) |
| 7507 | goto err; | 8531 | goto err; |
| 7508 | 8532 | ||
| 7509 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); | 8533 | parent_se = parent ? parent->rt_se[i] : NULL; |
| 8534 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); | ||
| 7510 | } | 8535 | } |
| 7511 | 8536 | ||
| 7512 | return 1; | 8537 | return 1; |
| @@ -7530,7 +8555,8 @@ static inline void free_rt_sched_group(struct task_group *tg) | |||
| 7530 | { | 8555 | { |
| 7531 | } | 8556 | } |
| 7532 | 8557 | ||
| 7533 | static inline int alloc_rt_sched_group(struct task_group *tg) | 8558 | static inline |
| 8559 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 7534 | { | 8560 | { |
| 7535 | return 1; | 8561 | return 1; |
| 7536 | } | 8562 | } |
| @@ -7544,6 +8570,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
| 7544 | } | 8570 | } |
| 7545 | #endif | 8571 | #endif |
| 7546 | 8572 | ||
| 8573 | #ifdef CONFIG_GROUP_SCHED | ||
| 7547 | static void free_sched_group(struct task_group *tg) | 8574 | static void free_sched_group(struct task_group *tg) |
| 7548 | { | 8575 | { |
| 7549 | free_fair_sched_group(tg); | 8576 | free_fair_sched_group(tg); |
| @@ -7552,7 +8579,7 @@ static void free_sched_group(struct task_group *tg) | |||
| 7552 | } | 8579 | } |
| 7553 | 8580 | ||
| 7554 | /* allocate runqueue etc for a new task group */ | 8581 | /* allocate runqueue etc for a new task group */ |
| 7555 | struct task_group *sched_create_group(void) | 8582 | struct task_group *sched_create_group(struct task_group *parent) |
| 7556 | { | 8583 | { |
| 7557 | struct task_group *tg; | 8584 | struct task_group *tg; |
| 7558 | unsigned long flags; | 8585 | unsigned long flags; |
| @@ -7562,10 +8589,10 @@ struct task_group *sched_create_group(void) | |||
| 7562 | if (!tg) | 8589 | if (!tg) |
| 7563 | return ERR_PTR(-ENOMEM); | 8590 | return ERR_PTR(-ENOMEM); |
| 7564 | 8591 | ||
| 7565 | if (!alloc_fair_sched_group(tg)) | 8592 | if (!alloc_fair_sched_group(tg, parent)) |
| 7566 | goto err; | 8593 | goto err; |
| 7567 | 8594 | ||
| 7568 | if (!alloc_rt_sched_group(tg)) | 8595 | if (!alloc_rt_sched_group(tg, parent)) |
| 7569 | goto err; | 8596 | goto err; |
| 7570 | 8597 | ||
| 7571 | spin_lock_irqsave(&task_group_lock, flags); | 8598 | spin_lock_irqsave(&task_group_lock, flags); |
| @@ -7574,6 +8601,12 @@ struct task_group *sched_create_group(void) | |||
| 7574 | register_rt_sched_group(tg, i); | 8601 | register_rt_sched_group(tg, i); |
| 7575 | } | 8602 | } |
| 7576 | list_add_rcu(&tg->list, &task_groups); | 8603 | list_add_rcu(&tg->list, &task_groups); |
| 8604 | |||
| 8605 | WARN_ON(!parent); /* root should already exist */ | ||
| 8606 | |||
| 8607 | tg->parent = parent; | ||
| 8608 | list_add_rcu(&tg->siblings, &parent->children); | ||
| 8609 | INIT_LIST_HEAD(&tg->children); | ||
| 7577 | spin_unlock_irqrestore(&task_group_lock, flags); | 8610 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7578 | 8611 | ||
| 7579 | return tg; | 8612 | return tg; |
| @@ -7602,6 +8635,7 @@ void sched_destroy_group(struct task_group *tg) | |||
| 7602 | unregister_rt_sched_group(tg, i); | 8635 | unregister_rt_sched_group(tg, i); |
| 7603 | } | 8636 | } |
| 7604 | list_del_rcu(&tg->list); | 8637 | list_del_rcu(&tg->list); |
| 8638 | list_del_rcu(&tg->siblings); | ||
| 7605 | spin_unlock_irqrestore(&task_group_lock, flags); | 8639 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7606 | 8640 | ||
| 7607 | /* wait for possible concurrent references to cfs_rqs complete */ | 8641 | /* wait for possible concurrent references to cfs_rqs complete */ |
| @@ -7645,27 +8679,34 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7645 | 8679 | ||
| 7646 | task_rq_unlock(rq, &flags); | 8680 | task_rq_unlock(rq, &flags); |
| 7647 | } | 8681 | } |
| 8682 | #endif | ||
| 7648 | 8683 | ||
| 7649 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8684 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7650 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 8685 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
| 7651 | { | 8686 | { |
| 7652 | struct cfs_rq *cfs_rq = se->cfs_rq; | 8687 | struct cfs_rq *cfs_rq = se->cfs_rq; |
| 7653 | struct rq *rq = cfs_rq->rq; | ||
| 7654 | int on_rq; | 8688 | int on_rq; |
| 7655 | 8689 | ||
| 7656 | spin_lock_irq(&rq->lock); | ||
| 7657 | |||
| 7658 | on_rq = se->on_rq; | 8690 | on_rq = se->on_rq; |
| 7659 | if (on_rq) | 8691 | if (on_rq) |
| 7660 | dequeue_entity(cfs_rq, se, 0); | 8692 | dequeue_entity(cfs_rq, se, 0); |
| 7661 | 8693 | ||
| 7662 | se->load.weight = shares; | 8694 | se->load.weight = shares; |
| 7663 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 8695 | se->load.inv_weight = div64_u64((1ULL<<32), shares); |
| 7664 | 8696 | ||
| 7665 | if (on_rq) | 8697 | if (on_rq) |
| 7666 | enqueue_entity(cfs_rq, se, 0); | 8698 | enqueue_entity(cfs_rq, se, 0); |
| 8699 | } | ||
| 7667 | 8700 | ||
| 7668 | spin_unlock_irq(&rq->lock); | 8701 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
| 8702 | { | ||
| 8703 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
| 8704 | struct rq *rq = cfs_rq->rq; | ||
| 8705 | unsigned long flags; | ||
| 8706 | |||
| 8707 | spin_lock_irqsave(&rq->lock, flags); | ||
| 8708 | __set_se_shares(se, shares); | ||
| 8709 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 7669 | } | 8710 | } |
| 7670 | 8711 | ||
| 7671 | static DEFINE_MUTEX(shares_mutex); | 8712 | static DEFINE_MUTEX(shares_mutex); |
| @@ -7676,12 +8717,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 7676 | unsigned long flags; | 8717 | unsigned long flags; |
| 7677 | 8718 | ||
| 7678 | /* | 8719 | /* |
| 8720 | * We can't change the weight of the root cgroup. | ||
| 8721 | */ | ||
| 8722 | if (!tg->se[0]) | ||
| 8723 | return -EINVAL; | ||
| 8724 | |||
| 8725 | /* | ||
| 7679 | * A weight of 0 or 1 can cause arithmetic problems. | 8726 | * A weight of 0 or 1 can cause arithmetic problems. |
| 7680 | * (The default weight is 1024 - so there's no practical | 8727 | * (The default weight is 1024 - so there's no practical |
| 7681 | * limitation from this.) | 8728 | * limitation from this.) |
| 7682 | */ | 8729 | */ |
| 7683 | if (shares < 2) | 8730 | if (shares < MIN_SHARES) |
| 7684 | shares = 2; | 8731 | shares = MIN_SHARES; |
| 7685 | 8732 | ||
| 7686 | mutex_lock(&shares_mutex); | 8733 | mutex_lock(&shares_mutex); |
| 7687 | if (tg->shares == shares) | 8734 | if (tg->shares == shares) |
| @@ -7690,6 +8737,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 7690 | spin_lock_irqsave(&task_group_lock, flags); | 8737 | spin_lock_irqsave(&task_group_lock, flags); |
| 7691 | for_each_possible_cpu(i) | 8738 | for_each_possible_cpu(i) |
| 7692 | unregister_fair_sched_group(tg, i); | 8739 | unregister_fair_sched_group(tg, i); |
| 8740 | list_del_rcu(&tg->siblings); | ||
| 7693 | spin_unlock_irqrestore(&task_group_lock, flags); | 8741 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7694 | 8742 | ||
| 7695 | /* wait for any ongoing reference to this group to finish */ | 8743 | /* wait for any ongoing reference to this group to finish */ |
| @@ -7700,8 +8748,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 7700 | * w/o tripping rebalance_share or load_balance_fair. | 8748 | * w/o tripping rebalance_share or load_balance_fair. |
| 7701 | */ | 8749 | */ |
| 7702 | tg->shares = shares; | 8750 | tg->shares = shares; |
| 7703 | for_each_possible_cpu(i) | 8751 | for_each_possible_cpu(i) { |
| 7704 | set_se_shares(tg->se[i], shares); | 8752 | /* |
| 8753 | * force a rebalance | ||
| 8754 | */ | ||
| 8755 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
| 8756 | set_se_shares(tg->se[i], shares/nr_cpu_ids); | ||
| 8757 | } | ||
| 7705 | 8758 | ||
| 7706 | /* | 8759 | /* |
| 7707 | * Enable load balance activity on this group, by inserting it back on | 8760 | * Enable load balance activity on this group, by inserting it back on |
| @@ -7710,6 +8763,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 7710 | spin_lock_irqsave(&task_group_lock, flags); | 8763 | spin_lock_irqsave(&task_group_lock, flags); |
| 7711 | for_each_possible_cpu(i) | 8764 | for_each_possible_cpu(i) |
| 7712 | register_fair_sched_group(tg, i); | 8765 | register_fair_sched_group(tg, i); |
| 8766 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
| 7713 | spin_unlock_irqrestore(&task_group_lock, flags); | 8767 | spin_unlock_irqrestore(&task_group_lock, flags); |
| 7714 | done: | 8768 | done: |
| 7715 | mutex_unlock(&shares_mutex); | 8769 | mutex_unlock(&shares_mutex); |
| @@ -7733,29 +8787,61 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
| 7733 | if (runtime == RUNTIME_INF) | 8787 | if (runtime == RUNTIME_INF) |
| 7734 | return 1ULL << 16; | 8788 | return 1ULL << 16; |
| 7735 | 8789 | ||
| 7736 | return div64_64(runtime << 16, period); | 8790 | return div64_u64(runtime << 16, period); |
| 7737 | } | 8791 | } |
| 7738 | 8792 | ||
| 8793 | #ifdef CONFIG_CGROUP_SCHED | ||
| 8794 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
| 8795 | { | ||
| 8796 | struct task_group *tgi, *parent = tg->parent; | ||
| 8797 | unsigned long total = 0; | ||
| 8798 | |||
| 8799 | if (!parent) { | ||
| 8800 | if (global_rt_period() < period) | ||
| 8801 | return 0; | ||
| 8802 | |||
| 8803 | return to_ratio(period, runtime) < | ||
| 8804 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
| 8805 | } | ||
| 8806 | |||
| 8807 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | ||
| 8808 | return 0; | ||
| 8809 | |||
| 8810 | rcu_read_lock(); | ||
| 8811 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | ||
| 8812 | if (tgi == tg) | ||
| 8813 | continue; | ||
| 8814 | |||
| 8815 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | ||
| 8816 | tgi->rt_bandwidth.rt_runtime); | ||
| 8817 | } | ||
| 8818 | rcu_read_unlock(); | ||
| 8819 | |||
| 8820 | return total + to_ratio(period, runtime) < | ||
| 8821 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | ||
| 8822 | parent->rt_bandwidth.rt_runtime); | ||
| 8823 | } | ||
| 8824 | #elif defined CONFIG_USER_SCHED | ||
| 7739 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8825 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
| 7740 | { | 8826 | { |
| 7741 | struct task_group *tgi; | 8827 | struct task_group *tgi; |
| 7742 | unsigned long total = 0; | 8828 | unsigned long total = 0; |
| 7743 | unsigned long global_ratio = | 8829 | unsigned long global_ratio = |
| 7744 | to_ratio(sysctl_sched_rt_period, | 8830 | to_ratio(global_rt_period(), global_rt_runtime()); |
| 7745 | sysctl_sched_rt_runtime < 0 ? | ||
| 7746 | RUNTIME_INF : sysctl_sched_rt_runtime); | ||
| 7747 | 8831 | ||
| 7748 | rcu_read_lock(); | 8832 | rcu_read_lock(); |
| 7749 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8833 | list_for_each_entry_rcu(tgi, &task_groups, list) { |
| 7750 | if (tgi == tg) | 8834 | if (tgi == tg) |
| 7751 | continue; | 8835 | continue; |
| 7752 | 8836 | ||
| 7753 | total += to_ratio(period, tgi->rt_runtime); | 8837 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), |
| 8838 | tgi->rt_bandwidth.rt_runtime); | ||
| 7754 | } | 8839 | } |
| 7755 | rcu_read_unlock(); | 8840 | rcu_read_unlock(); |
| 7756 | 8841 | ||
| 7757 | return total + to_ratio(period, runtime) < global_ratio; | 8842 | return total + to_ratio(period, runtime) < global_ratio; |
| 7758 | } | 8843 | } |
| 8844 | #endif | ||
| 7759 | 8845 | ||
| 7760 | /* Must be called with tasklist_lock held */ | 8846 | /* Must be called with tasklist_lock held */ |
| 7761 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8847 | static inline int tg_has_rt_tasks(struct task_group *tg) |
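
Both __rt_schedulable() variants above work on the same fixed-point representation: to_ratio() returns runtime/period scaled by 2^16, with RUNTIME_INF mapping to a full 1 << 16, and the cgroup variant requires the sum over the siblings plus the proposed setting to stay below the parent's own ratio. A small user-space sketch of that admission check, with made-up sibling numbers and RUNTIME_INF assumed to be the all-ones u64 value:

#include <stdio.h>
#include <stdint.h>

#define RUNTIME_INF ((uint64_t)~0ULL)

/* 16.16-style fixed-point utilisation ratio, as in to_ratio() */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
    if (runtime == RUNTIME_INF)
        return 1ULL << 16;
    return (runtime << 16) / period;
}

int main(void)
{
    /* parent allows 950ms of RT time per 1s period */
    uint64_t parent  = to_ratio(1000000000ULL, 950000000ULL);
    /* one existing sibling already uses 400ms/1s */
    uint64_t sibling = to_ratio(1000000000ULL, 400000000ULL);
    /* proposed child: 600ms/1s -- together they exceed the parent */
    uint64_t child   = to_ratio(1000000000ULL, 600000000ULL);

    printf("schedulable: %s\n",
           sibling + child < parent ? "yes" : "no (-EINVAL)");
    return 0;
}
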
| @@ -7768,19 +8854,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
| 7768 | return 0; | 8854 | return 0; |
| 7769 | } | 8855 | } |
| 7770 | 8856 | ||
| 7771 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | 8857 | static int tg_set_bandwidth(struct task_group *tg, |
| 8858 | u64 rt_period, u64 rt_runtime) | ||
| 7772 | { | 8859 | { |
| 7773 | u64 rt_runtime, rt_period; | 8860 | int i, err = 0; |
| 7774 | int err = 0; | ||
| 7775 | |||
| 7776 | rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
| 7777 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
| 7778 | if (rt_runtime_us == -1) | ||
| 7779 | rt_runtime = RUNTIME_INF; | ||
| 7780 | 8861 | ||
| 7781 | mutex_lock(&rt_constraints_mutex); | 8862 | mutex_lock(&rt_constraints_mutex); |
| 7782 | read_lock(&tasklist_lock); | 8863 | read_lock(&tasklist_lock); |
| 7783 | if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { | 8864 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { |
| 7784 | err = -EBUSY; | 8865 | err = -EBUSY; |
| 7785 | goto unlock; | 8866 | goto unlock; |
| 7786 | } | 8867 | } |
| @@ -7788,7 +8869,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
| 7788 | err = -EINVAL; | 8869 | err = -EINVAL; |
| 7789 | goto unlock; | 8870 | goto unlock; |
| 7790 | } | 8871 | } |
| 7791 | tg->rt_runtime = rt_runtime; | 8872 | |
| 8873 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
| 8874 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | ||
| 8875 | tg->rt_bandwidth.rt_runtime = rt_runtime; | ||
| 8876 | |||
| 8877 | for_each_possible_cpu(i) { | ||
| 8878 | struct rt_rq *rt_rq = tg->rt_rq[i]; | ||
| 8879 | |||
| 8880 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 8881 | rt_rq->rt_runtime = rt_runtime; | ||
| 8882 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 8883 | } | ||
| 8884 | spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
| 7792 | unlock: | 8885 | unlock: |
| 7793 | read_unlock(&tasklist_lock); | 8886 | read_unlock(&tasklist_lock); |
| 7794 | mutex_unlock(&rt_constraints_mutex); | 8887 | mutex_unlock(&rt_constraints_mutex); |
| @@ -7796,19 +8889,109 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
| 7796 | return err; | 8889 | return err; |
| 7797 | } | 8890 | } |
| 7798 | 8891 | ||
| 8892 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | ||
| 8893 | { | ||
| 8894 | u64 rt_runtime, rt_period; | ||
| 8895 | |||
| 8896 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 8897 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
| 8898 | if (rt_runtime_us < 0) | ||
| 8899 | rt_runtime = RUNTIME_INF; | ||
| 8900 | |||
| 8901 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | ||
| 8902 | } | ||
| 8903 | |||
| 7799 | long sched_group_rt_runtime(struct task_group *tg) | 8904 | long sched_group_rt_runtime(struct task_group *tg) |
| 7800 | { | 8905 | { |
| 7801 | u64 rt_runtime_us; | 8906 | u64 rt_runtime_us; |
| 7802 | 8907 | ||
| 7803 | if (tg->rt_runtime == RUNTIME_INF) | 8908 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) |
| 7804 | return -1; | 8909 | return -1; |
| 7805 | 8910 | ||
| 7806 | rt_runtime_us = tg->rt_runtime; | 8911 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; |
| 7807 | do_div(rt_runtime_us, NSEC_PER_USEC); | 8912 | do_div(rt_runtime_us, NSEC_PER_USEC); |
| 7808 | return rt_runtime_us; | 8913 | return rt_runtime_us; |
| 7809 | } | 8914 | } |
| 8915 | |||
| 8916 | int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | ||
| 8917 | { | ||
| 8918 | u64 rt_runtime, rt_period; | ||
| 8919 | |||
| 8920 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | ||
| 8921 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 8922 | |||
| 8923 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | ||
| 8924 | } | ||
| 8925 | |||
| 8926 | long sched_group_rt_period(struct task_group *tg) | ||
| 8927 | { | ||
| 8928 | u64 rt_period_us; | ||
| 8929 | |||
| 8930 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 8931 | do_div(rt_period_us, NSEC_PER_USEC); | ||
| 8932 | return rt_period_us; | ||
| 8933 | } | ||
| 8934 | |||
| 8935 | static int sched_rt_global_constraints(void) | ||
| 8936 | { | ||
| 8937 | int ret = 0; | ||
| 8938 | |||
| 8939 | mutex_lock(&rt_constraints_mutex); | ||
| 8940 | if (!__rt_schedulable(NULL, 1, 0)) | ||
| 8941 | ret = -EINVAL; | ||
| 8942 | mutex_unlock(&rt_constraints_mutex); | ||
| 8943 | |||
| 8944 | return ret; | ||
| 8945 | } | ||
| 8946 | #else | ||
| 8947 | static int sched_rt_global_constraints(void) | ||
| 8948 | { | ||
| 8949 | unsigned long flags; | ||
| 8950 | int i; | ||
| 8951 | |||
| 8952 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
| 8953 | for_each_possible_cpu(i) { | ||
| 8954 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | ||
| 8955 | |||
| 8956 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 8957 | rt_rq->rt_runtime = global_rt_runtime(); | ||
| 8958 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 8959 | } | ||
| 8960 | spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
| 8961 | |||
| 8962 | return 0; | ||
| 8963 | } | ||
| 7810 | #endif | 8964 | #endif |
| 7811 | #endif /* CONFIG_GROUP_SCHED */ | 8965 | |
| 8966 | int sched_rt_handler(struct ctl_table *table, int write, | ||
| 8967 | struct file *filp, void __user *buffer, size_t *lenp, | ||
| 8968 | loff_t *ppos) | ||
| 8969 | { | ||
| 8970 | int ret; | ||
| 8971 | int old_period, old_runtime; | ||
| 8972 | static DEFINE_MUTEX(mutex); | ||
| 8973 | |||
| 8974 | mutex_lock(&mutex); | ||
| 8975 | old_period = sysctl_sched_rt_period; | ||
| 8976 | old_runtime = sysctl_sched_rt_runtime; | ||
| 8977 | |||
| 8978 | ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
| 8979 | |||
| 8980 | if (!ret && write) { | ||
| 8981 | ret = sched_rt_global_constraints(); | ||
| 8982 | if (ret) { | ||
| 8983 | sysctl_sched_rt_period = old_period; | ||
| 8984 | sysctl_sched_rt_runtime = old_runtime; | ||
| 8985 | } else { | ||
| 8986 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
| 8987 | def_rt_bandwidth.rt_period = | ||
| 8988 | ns_to_ktime(global_rt_period()); | ||
| 8989 | } | ||
| 8990 | } | ||
| 8991 | mutex_unlock(&mutex); | ||
| 8992 | |||
| 8993 | return ret; | ||
| 8994 | } | ||
| 7812 | 8995 | ||
| 7813 | #ifdef CONFIG_CGROUP_SCHED | 8996 | #ifdef CONFIG_CGROUP_SCHED |
| 7814 | 8997 | ||
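
Two unit conventions meet in the code above: the group interfaces take microseconds and treat a negative rt_runtime_us as unlimited (stored as RUNTIME_INF), while struct rt_bandwidth keeps nanoseconds plus a ktime period, and sched_rt_handler() snapshots the old sysctl values so it can restore them when sched_rt_global_constraints() rejects the new ones. A user-space sketch of just the microsecond-to-nanosecond conversion, with RUNTIME_INF assumed here to be the all-ones u64 value:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC  1000ULL
#define RUNTIME_INF    ((uint64_t)~0ULL)

/* Mirror of the rt_runtime_us -> rt_runtime (ns) conversion above. */
static uint64_t rt_runtime_from_us(long rt_runtime_us)
{
    if (rt_runtime_us < 0)
        return RUNTIME_INF;
    return (uint64_t)rt_runtime_us * NSEC_PER_USEC;
}

int main(void)
{
    printf("950000us -> %llu ns\n",
           (unsigned long long)rt_runtime_from_us(950000));
    printf("-1       -> RUNTIME_INF? %s\n",
           rt_runtime_from_us(-1) == RUNTIME_INF ? "yes" : "no");
    return 0;
}
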
| @@ -7822,7 +9005,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
| 7822 | static struct cgroup_subsys_state * | 9005 | static struct cgroup_subsys_state * |
| 7823 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | 9006 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 7824 | { | 9007 | { |
| 7825 | struct task_group *tg; | 9008 | struct task_group *tg, *parent; |
| 7826 | 9009 | ||
| 7827 | if (!cgrp->parent) { | 9010 | if (!cgrp->parent) { |
| 7828 | /* This is early initialization for the top cgroup */ | 9011 | /* This is early initialization for the top cgroup */ |
| @@ -7830,11 +9013,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 7830 | return &init_task_group.css; | 9013 | return &init_task_group.css; |
| 7831 | } | 9014 | } |
| 7832 | 9015 | ||
| 7833 | /* we support only 1-level deep hierarchical scheduler atm */ | 9016 | parent = cgroup_tg(cgrp->parent); |
| 7834 | if (cgrp->parent->parent) | 9017 | tg = sched_create_group(parent); |
| 7835 | return ERR_PTR(-EINVAL); | ||
| 7836 | |||
| 7837 | tg = sched_create_group(); | ||
| 7838 | if (IS_ERR(tg)) | 9018 | if (IS_ERR(tg)) |
| 7839 | return ERR_PTR(-ENOMEM); | 9019 | return ERR_PTR(-ENOMEM); |
| 7840 | 9020 | ||
| @@ -7858,7 +9038,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 7858 | { | 9038 | { |
| 7859 | #ifdef CONFIG_RT_GROUP_SCHED | 9039 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7860 | /* Don't accept realtime tasks when there is no way for them to run */ | 9040 | /* Don't accept realtime tasks when there is no way for them to run */ |
| 7861 | if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) | 9041 | if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) |
| 7862 | return -EINVAL; | 9042 | return -EINVAL; |
| 7863 | #else | 9043 | #else |
| 7864 | /* We don't support RT-tasks being in separate groups */ | 9044 | /* We don't support RT-tasks being in separate groups */ |
| @@ -7877,13 +9057,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 7877 | } | 9057 | } |
| 7878 | 9058 | ||
| 7879 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9059 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7880 | static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, | 9060 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
| 7881 | u64 shareval) | 9061 | u64 shareval) |
| 7882 | { | 9062 | { |
| 7883 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); | 9063 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); |
| 7884 | } | 9064 | } |
| 7885 | 9065 | ||
| 7886 | static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | 9066 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) |
| 7887 | { | 9067 | { |
| 7888 | struct task_group *tg = cgroup_tg(cgrp); | 9068 | struct task_group *tg = cgroup_tg(cgrp); |
| 7889 | 9069 | ||
| @@ -7892,49 +9072,26 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
| 7892 | #endif | 9072 | #endif |
| 7893 | 9073 | ||
| 7894 | #ifdef CONFIG_RT_GROUP_SCHED | 9074 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7895 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 9075 | static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
| 7896 | struct file *file, | 9076 | s64 val) |
| 7897 | const char __user *userbuf, | ||
| 7898 | size_t nbytes, loff_t *unused_ppos) | ||
| 7899 | { | 9077 | { |
| 7900 | char buffer[64]; | 9078 | return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); |
| 7901 | int retval = 0; | 9079 | } |
| 7902 | s64 val; | ||
| 7903 | char *end; | ||
| 7904 | |||
| 7905 | if (!nbytes) | ||
| 7906 | return -EINVAL; | ||
| 7907 | if (nbytes >= sizeof(buffer)) | ||
| 7908 | return -E2BIG; | ||
| 7909 | if (copy_from_user(buffer, userbuf, nbytes)) | ||
| 7910 | return -EFAULT; | ||
| 7911 | |||
| 7912 | buffer[nbytes] = 0; /* nul-terminate */ | ||
| 7913 | |||
| 7914 | /* strip newline if necessary */ | ||
| 7915 | if (nbytes && (buffer[nbytes-1] == '\n')) | ||
| 7916 | buffer[nbytes-1] = 0; | ||
| 7917 | val = simple_strtoll(buffer, &end, 0); | ||
| 7918 | if (*end) | ||
| 7919 | return -EINVAL; | ||
| 7920 | 9080 | ||
| 7921 | /* Pass to subsystem */ | 9081 | static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) |
| 7922 | retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); | 9082 | { |
| 7923 | if (!retval) | 9083 | return sched_group_rt_runtime(cgroup_tg(cgrp)); |
| 7924 | retval = nbytes; | ||
| 7925 | return retval; | ||
| 7926 | } | 9084 | } |
| 7927 | 9085 | ||
| 7928 | static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, | 9086 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, |
| 7929 | struct file *file, | 9087 | u64 rt_period_us) |
| 7930 | char __user *buf, size_t nbytes, | ||
| 7931 | loff_t *ppos) | ||
| 7932 | { | 9088 | { |
| 7933 | char tmp[64]; | 9089 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); |
| 7934 | long val = sched_group_rt_runtime(cgroup_tg(cgrp)); | 9090 | } |
| 7935 | int len = sprintf(tmp, "%ld\n", val); | ||
| 7936 | 9091 | ||
| 7937 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 9092 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) |
| 9093 | { | ||
| 9094 | return sched_group_rt_period(cgroup_tg(cgrp)); | ||
| 7938 | } | 9095 | } |
| 7939 | #endif | 9096 | #endif |
| 7940 | 9097 | ||
| @@ -7942,15 +9099,20 @@ static struct cftype cpu_files[] = { | |||
| 7942 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9099 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7943 | { | 9100 | { |
| 7944 | .name = "shares", | 9101 | .name = "shares", |
| 7945 | .read_uint = cpu_shares_read_uint, | 9102 | .read_u64 = cpu_shares_read_u64, |
| 7946 | .write_uint = cpu_shares_write_uint, | 9103 | .write_u64 = cpu_shares_write_u64, |
| 7947 | }, | 9104 | }, |
| 7948 | #endif | 9105 | #endif |
| 7949 | #ifdef CONFIG_RT_GROUP_SCHED | 9106 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7950 | { | 9107 | { |
| 7951 | .name = "rt_runtime_us", | 9108 | .name = "rt_runtime_us", |
| 7952 | .read = cpu_rt_runtime_read, | 9109 | .read_s64 = cpu_rt_runtime_read, |
| 7953 | .write = cpu_rt_runtime_write, | 9110 | .write_s64 = cpu_rt_runtime_write, |
| 9111 | }, | ||
| 9112 | { | ||
| 9113 | .name = "rt_period_us", | ||
| 9114 | .read_u64 = cpu_rt_period_read_uint, | ||
| 9115 | .write_u64 = cpu_rt_period_write_uint, | ||
| 7954 | }, | 9116 | }, |
| 7955 | #endif | 9117 | #endif |
| 7956 | }; | 9118 | }; |
| @@ -7992,9 +9154,9 @@ struct cpuacct { | |||
| 7992 | struct cgroup_subsys cpuacct_subsys; | 9154 | struct cgroup_subsys cpuacct_subsys; |
| 7993 | 9155 | ||
| 7994 | /* return cpu accounting group corresponding to this container */ | 9156 | /* return cpu accounting group corresponding to this container */ |
| 7995 | static inline struct cpuacct *cgroup_ca(struct cgroup *cont) | 9157 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
| 7996 | { | 9158 | { |
| 7997 | return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), | 9159 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), |
| 7998 | struct cpuacct, css); | 9160 | struct cpuacct, css); |
| 7999 | } | 9161 | } |
| 8000 | 9162 | ||
| @@ -8007,7 +9169,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) | |||
| 8007 | 9169 | ||
| 8008 | /* create a new cpu accounting group */ | 9170 | /* create a new cpu accounting group */ |
| 8009 | static struct cgroup_subsys_state *cpuacct_create( | 9171 | static struct cgroup_subsys_state *cpuacct_create( |
| 8010 | struct cgroup_subsys *ss, struct cgroup *cont) | 9172 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 8011 | { | 9173 | { |
| 8012 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 9174 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
| 8013 | 9175 | ||
| @@ -8025,18 +9187,18 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
| 8025 | 9187 | ||
| 8026 | /* destroy an existing cpu accounting group */ | 9188 | /* destroy an existing cpu accounting group */ |
| 8027 | static void | 9189 | static void |
| 8028 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 9190 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 8029 | { | 9191 | { |
| 8030 | struct cpuacct *ca = cgroup_ca(cont); | 9192 | struct cpuacct *ca = cgroup_ca(cgrp); |
| 8031 | 9193 | ||
| 8032 | free_percpu(ca->cpuusage); | 9194 | free_percpu(ca->cpuusage); |
| 8033 | kfree(ca); | 9195 | kfree(ca); |
| 8034 | } | 9196 | } |
| 8035 | 9197 | ||
| 8036 | /* return total cpu usage (in nanoseconds) of a group */ | 9198 | /* return total cpu usage (in nanoseconds) of a group */ |
| 8037 | static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | 9199 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) |
| 8038 | { | 9200 | { |
| 8039 | struct cpuacct *ca = cgroup_ca(cont); | 9201 | struct cpuacct *ca = cgroup_ca(cgrp); |
| 8040 | u64 totalcpuusage = 0; | 9202 | u64 totalcpuusage = 0; |
| 8041 | int i; | 9203 | int i; |
| 8042 | 9204 | ||
| @@ -8055,16 +9217,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | |||
| 8055 | return totalcpuusage; | 9217 | return totalcpuusage; |
| 8056 | } | 9218 | } |
| 8057 | 9219 | ||
| 9220 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
| 9221 | u64 reset) | ||
| 9222 | { | ||
| 9223 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
| 9224 | int err = 0; | ||
| 9225 | int i; | ||
| 9226 | |||
| 9227 | if (reset) { | ||
| 9228 | err = -EINVAL; | ||
| 9229 | goto out; | ||
| 9230 | } | ||
| 9231 | |||
| 9232 | for_each_possible_cpu(i) { | ||
| 9233 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); | ||
| 9234 | |||
| 9235 | spin_lock_irq(&cpu_rq(i)->lock); | ||
| 9236 | *cpuusage = 0; | ||
| 9237 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
| 9238 | } | ||
| 9239 | out: | ||
| 9240 | return err; | ||
| 9241 | } | ||
| 9242 | |||
| 8058 | static struct cftype files[] = { | 9243 | static struct cftype files[] = { |
| 8059 | { | 9244 | { |
| 8060 | .name = "usage", | 9245 | .name = "usage", |
| 8061 | .read_uint = cpuusage_read, | 9246 | .read_u64 = cpuusage_read, |
| 9247 | .write_u64 = cpuusage_write, | ||
| 8062 | }, | 9248 | }, |
| 8063 | }; | 9249 | }; |
| 8064 | 9250 | ||
| 8065 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 9251 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 8066 | { | 9252 | { |
| 8067 | return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 9253 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); |
| 8068 | } | 9254 | } |
| 8069 | 9255 | ||
| 8070 | /* | 9256 | /* |
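
The cpuacct changes above make the usage file writable: cpuusage_read() sums a per-cpu u64 under each runqueue lock, and cpuusage_write() accepts only 0, so the counter can be reset but not set. From user space the file appears as cpuacct.usage inside a cpuacct cgroup directory; the mount point below is an assumption, adjust it to wherever the controller is mounted:

#include <stdio.h>

/* Hypothetical mount point; not part of this patch. */
#define USAGE_FILE "/cgroup/cpuacct.usage"

int main(void)
{
    unsigned long long usage;
    FILE *f = fopen(USAGE_FILE, "r");

    if (!f || fscanf(f, "%llu", &usage) != 1) {
        perror(USAGE_FILE);
        return 1;
    }
    fclose(f);
    printf("total cpu usage: %llu ns\n", usage);

    /* Writing 0 resets the counter; any other value should yield EINVAL. */
    f = fopen(USAGE_FILE, "w");
    if (f) {
        fputs("0\n", f);
        fclose(f);
    }
    return 0;
}
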
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ef358ba07683..6b4a12558e88 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 67 | (long long)(p->nvcsw + p->nivcsw), | 67 | (long long)(p->nvcsw + p->nivcsw), |
| 68 | p->prio); | 68 | p->prio); |
| 69 | #ifdef CONFIG_SCHEDSTATS | 69 | #ifdef CONFIG_SCHEDSTATS |
| 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", | 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
| 71 | SPLIT_NS(p->se.vruntime), | 71 | SPLIT_NS(p->se.vruntime), |
| 72 | SPLIT_NS(p->se.sum_exec_runtime), | 72 | SPLIT_NS(p->se.sum_exec_runtime), |
| 73 | SPLIT_NS(p->se.sum_sleep_runtime)); | 73 | SPLIT_NS(p->se.sum_sleep_runtime)); |
| 74 | #else | 74 | #else |
| 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", | 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
| 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
| 77 | #endif | 77 | #endif |
| 78 | |||
| 79 | #ifdef CONFIG_CGROUP_SCHED | ||
| 80 | { | ||
| 81 | char path[64]; | ||
| 82 | |||
| 83 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | ||
| 84 | SEQ_printf(m, " %s", path); | ||
| 85 | } | ||
| 86 | #endif | ||
| 87 | SEQ_printf(m, "\n"); | ||
| 78 | } | 88 | } |
| 79 | 89 | ||
| 80 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 90 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
| @@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 109 | struct sched_entity *last; | 119 | struct sched_entity *last; |
| 110 | unsigned long flags; | 120 | unsigned long flags; |
| 111 | 121 | ||
| 112 | SEQ_printf(m, "\ncfs_rq\n"); | 122 | #if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) |
| 123 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | ||
| 124 | #else | ||
| 125 | char path[128] = ""; | ||
| 126 | struct cgroup *cgroup = NULL; | ||
| 127 | struct task_group *tg = cfs_rq->tg; | ||
| 128 | |||
| 129 | if (tg) | ||
| 130 | cgroup = tg->css.cgroup; | ||
| 131 | |||
| 132 | if (cgroup) | ||
| 133 | cgroup_path(cgroup, path, sizeof(path)); | ||
| 134 | |||
| 135 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | ||
| 136 | #endif | ||
| 113 | 137 | ||
| 114 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 138 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
| 115 | SPLIT_NS(cfs_rq->exec_clock)); | 139 | SPLIT_NS(cfs_rq->exec_clock)); |
| @@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 143 | #endif | 167 | #endif |
| 144 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | 168 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", |
| 145 | cfs_rq->nr_spread_over); | 169 | cfs_rq->nr_spread_over); |
| 170 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 171 | #ifdef CONFIG_SMP | ||
| 172 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | ||
| 173 | #endif | ||
| 174 | #endif | ||
| 146 | } | 175 | } |
| 147 | 176 | ||
| 148 | static void print_cpu(struct seq_file *m, int cpu) | 177 | static void print_cpu(struct seq_file *m, int cpu) |
| @@ -214,7 +243,6 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
| 214 | PN(sysctl_sched_latency); | 243 | PN(sysctl_sched_latency); |
| 215 | PN(sysctl_sched_min_granularity); | 244 | PN(sysctl_sched_min_granularity); |
| 216 | PN(sysctl_sched_wakeup_granularity); | 245 | PN(sysctl_sched_wakeup_granularity); |
| 217 | PN(sysctl_sched_batch_wakeup_granularity); | ||
| 218 | PN(sysctl_sched_child_runs_first); | 246 | PN(sysctl_sched_child_runs_first); |
| 219 | P(sysctl_sched_features); | 247 | P(sysctl_sched_features); |
| 220 | #undef PN | 248 | #undef PN |
| @@ -249,12 +277,9 @@ static int __init init_sched_debug_procfs(void) | |||
| 249 | { | 277 | { |
| 250 | struct proc_dir_entry *pe; | 278 | struct proc_dir_entry *pe; |
| 251 | 279 | ||
| 252 | pe = create_proc_entry("sched_debug", 0644, NULL); | 280 | pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops); |
| 253 | if (!pe) | 281 | if (!pe) |
| 254 | return -ENOMEM; | 282 | return -ENOMEM; |
| 255 | |||
| 256 | pe->proc_fops = &sched_debug_fops; | ||
| 257 | |||
| 258 | return 0; | 283 | return 0; |
| 259 | } | 284 | } |
| 260 | 285 | ||
| @@ -332,8 +357,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 332 | 357 | ||
| 333 | avg_per_cpu = p->se.sum_exec_runtime; | 358 | avg_per_cpu = p->se.sum_exec_runtime; |
| 334 | if (p->se.nr_migrations) { | 359 | if (p->se.nr_migrations) { |
| 335 | avg_per_cpu = div64_64(avg_per_cpu, | 360 | avg_per_cpu = div64_u64(avg_per_cpu, |
| 336 | p->se.nr_migrations); | 361 | p->se.nr_migrations); |
| 337 | } else { | 362 | } else { |
| 338 | avg_per_cpu = -1LL; | 363 | avg_per_cpu = -1LL; |
| 339 | } | 364 | } |
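
Besides the cgroup-path and shares output, the sched_debug diff above drops the create_proc_entry() plus pe->proc_fops two-step in favour of proc_create(), which publishes the entry only after its file_operations are in place. A hedged kernel-side sketch of the same registration pattern for a hypothetical proc file (the name sched_demo and the module boilerplate are illustrative, not part of this patch):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
    seq_printf(m, "hello from proc_create()\n");
    return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
    return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
    .owner   = THIS_MODULE,
    .open    = demo_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = single_release,
};

static int __init demo_init(void)
{
    /* one call instead of create_proc_entry() + pe->proc_fops = ... */
    if (!proc_create("sched_demo", 0444, NULL, &demo_fops))
        return -ENOMEM;
    return 0;
}

static void __exit demo_exit(void)
{
    remove_proc_entry("sched_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
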
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 86a93376282c..89fa32b4edf2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; | |||
| 62 | unsigned int __read_mostly sysctl_sched_compat_yield; | 62 | unsigned int __read_mostly sysctl_sched_compat_yield; |
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * SCHED_BATCH wake-up granularity. | ||
| 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | ||
| 67 | * | ||
| 68 | * This option delays the preemption effects of decoupled workloads | ||
| 69 | * and reduces their over-scheduling. Synchronous workloads will still | ||
| 70 | * have immediate wakeup/sleep latencies. | ||
| 71 | */ | ||
| 72 | unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; | ||
| 73 | |||
| 74 | /* | ||
| 75 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
| 76 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 77 | * | 67 | * |
| 78 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
| 79 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
| 80 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
| 81 | */ | 71 | */ |
| 82 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
| 83 | 73 | ||
| 84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
| 85 | 75 | ||
| @@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
| 87 | * CFS operations on generic schedulable entities: | 77 | * CFS operations on generic schedulable entities: |
| 88 | */ | 78 | */ |
| 89 | 79 | ||
| 80 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
| 81 | { | ||
| 82 | return container_of(se, struct task_struct, se); | ||
| 83 | } | ||
| 84 | |||
| 90 | #ifdef CONFIG_FAIR_GROUP_SCHED | 85 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 91 | 86 | ||
| 92 | /* cpu runqueue to which this cfs_rq is attached */ | 87 | /* cpu runqueue to which this cfs_rq is attached */ |
| @@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 98 | /* An entity is a task if it doesn't "own" a runqueue */ | 93 | /* An entity is a task if it doesn't "own" a runqueue */ |
| 99 | #define entity_is_task(se) (!se->my_q) | 94 | #define entity_is_task(se) (!se->my_q) |
| 100 | 95 | ||
| 96 | /* Walk up scheduling entities hierarchy */ | ||
| 97 | #define for_each_sched_entity(se) \ | ||
| 98 | for (; se; se = se->parent) | ||
| 99 | |||
| 100 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
| 101 | { | ||
| 102 | return p->se.cfs_rq; | ||
| 103 | } | ||
| 104 | |||
| 105 | /* runqueue on which this entity is (to be) queued */ | ||
| 106 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
| 107 | { | ||
| 108 | return se->cfs_rq; | ||
| 109 | } | ||
| 110 | |||
| 111 | /* runqueue "owned" by this group */ | ||
| 112 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
| 113 | { | ||
| 114 | return grp->my_q; | ||
| 115 | } | ||
| 116 | |||
| 117 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
| 118 | * another cpu ('this_cpu') | ||
| 119 | */ | ||
| 120 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 121 | { | ||
| 122 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
| 123 | } | ||
| 124 | |||
| 125 | /* Iterate through all leaf cfs_rq's on a runqueue */ | ||
| 126 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
| 127 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
| 128 | |||
| 129 | /* Do the two (enqueued) entities belong to the same group ? */ | ||
| 130 | static inline int | ||
| 131 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 132 | { | ||
| 133 | if (se->cfs_rq == pse->cfs_rq) | ||
| 134 | return 1; | ||
| 135 | |||
| 136 | return 0; | ||
| 137 | } | ||
| 138 | |||
| 139 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 140 | { | ||
| 141 | return se->parent; | ||
| 142 | } | ||
| 143 | |||
| 101 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 144 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
| 102 | 145 | ||
| 103 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 146 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
| @@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 107 | 150 | ||
| 108 | #define entity_is_task(se) 1 | 151 | #define entity_is_task(se) 1 |
| 109 | 152 | ||
| 110 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 153 | #define for_each_sched_entity(se) \ |
| 154 | for (; se; se = NULL) | ||
| 111 | 155 | ||
| 112 | static inline struct task_struct *task_of(struct sched_entity *se) | 156 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) |
| 113 | { | 157 | { |
| 114 | return container_of(se, struct task_struct, se); | 158 | return &task_rq(p)->cfs; |
| 159 | } | ||
| 160 | |||
| 161 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
| 162 | { | ||
| 163 | struct task_struct *p = task_of(se); | ||
| 164 | struct rq *rq = task_rq(p); | ||
| 165 | |||
| 166 | return &rq->cfs; | ||
| 115 | } | 167 | } |
| 116 | 168 | ||
| 169 | /* runqueue "owned" by this group */ | ||
| 170 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
| 171 | { | ||
| 172 | return NULL; | ||
| 173 | } | ||
| 174 | |||
| 175 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 176 | { | ||
| 177 | return &cpu_rq(this_cpu)->cfs; | ||
| 178 | } | ||
| 179 | |||
| 180 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
| 181 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
| 182 | |||
| 183 | static inline int | ||
| 184 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 185 | { | ||
| 186 | return 1; | ||
| 187 | } | ||
| 188 | |||
| 189 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 190 | { | ||
| 191 | return NULL; | ||
| 192 | } | ||
| 193 | |||
| 194 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 195 | |||
| 117 | 196 | ||
| 118 | /************************************************************** | 197 | /************************************************************** |
| 119 | * Scheduling class tree data structure manipulation methods: | 198 | * Scheduling class tree data structure manipulation methods: |
| @@ -255,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
| 255 | #endif | 334 | #endif |
| 256 | 335 | ||
| 257 | /* | 336 | /* |
| 337 | * delta *= w / rw | ||
| 338 | */ | ||
| 339 | static inline unsigned long | ||
| 340 | calc_delta_weight(unsigned long delta, struct sched_entity *se) | ||
| 341 | { | ||
| 342 | for_each_sched_entity(se) { | ||
| 343 | delta = calc_delta_mine(delta, | ||
| 344 | se->load.weight, &cfs_rq_of(se)->load); | ||
| 345 | } | ||
| 346 | |||
| 347 | return delta; | ||
| 348 | } | ||
| 349 | |||
| 350 | /* | ||
| 351 | * delta *= rw / w | ||
| 352 | */ | ||
| 353 | static inline unsigned long | ||
| 354 | calc_delta_fair(unsigned long delta, struct sched_entity *se) | ||
| 355 | { | ||
| 356 | for_each_sched_entity(se) { | ||
| 357 | delta = calc_delta_mine(delta, | ||
| 358 | cfs_rq_of(se)->load.weight, &se->load); | ||
| 359 | } | ||
| 360 | |||
| 361 | return delta; | ||
| 362 | } | ||
| 363 | |||
| 364 | /* | ||
| 258 | * The idea is to set a period in which each task runs once. | 365 | * The idea is to set a period in which each task runs once. |
| 259 | * | 366 | * |
| 260 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 367 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
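
calc_delta_weight() and calc_delta_fair() above apply the scaling described in their comments at every level of the entity hierarchy: the former multiplies a delta by w/rw (entity weight over runqueue weight), the latter by rw/w, and sched_slice() and __update_curr() are their first users. A single-level user-space sketch of both directions, ignoring calc_delta_mine()'s inverse-weight fixed point and assuming the usual NICE_0_LOAD of 1024:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL   /* assumed default nice-0 weight */

/* delta *= w / rw : the share of a period this weight earns */
static uint64_t calc_delta_weight(uint64_t delta, uint64_t w, uint64_t rw)
{
    return delta * w / rw;
}

/* delta *= rw / w : the weighted form of a runtime delta */
static uint64_t calc_delta_fair(uint64_t delta, uint64_t w, uint64_t rw)
{
    return delta * rw / w;
}

int main(void)
{
    uint64_t period = 20000000ULL;      /* 20ms scheduling period */
    uint64_t w  = NICE_0_LOAD;          /* one nice-0 entity */
    uint64_t rw = 3 * NICE_0_LOAD;      /* runqueue holding three such entities */

    /* sched_slice(): the entity's share of the period, delta * w / rw */
    printf("slice         = %llu ns\n",
           (unsigned long long)calc_delta_weight(period, w, rw));
    /* __update_curr(): 10ms of runtime under the 'delta *= rw / w' rule */
    printf("weighted 10ms = %llu ns\n",
           (unsigned long long)calc_delta_fair(10000000ULL, w, rw));
    return 0;
}
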
| @@ -283,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running) | |||
| 283 | */ | 390 | */ |
| 284 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 391 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 285 | { | 392 | { |
| 286 | return calc_delta_mine(__sched_period(cfs_rq->nr_running), | 393 | return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); |
| 287 | se->load.weight, &cfs_rq->load); | ||
| 288 | } | 394 | } |
| 289 | 395 | ||
| 290 | /* | 396 | /* |
| 291 | * We calculate the vruntime slice. | 397 | * We calculate the vruntime slice of a to-be-inserted task |
| 292 | * | 398 | * |
| 293 | * vs = s/w = p/rw | 399 | * vs = s*rw/w = p |
| 294 | */ | 400 | */ |
| 295 | static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | 401 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 296 | { | 402 | { |
| 297 | u64 vslice = __sched_period(nr_running); | 403 | unsigned long nr_running = cfs_rq->nr_running; |
| 298 | 404 | ||
| 299 | vslice *= NICE_0_LOAD; | 405 | if (!se->on_rq) |
| 300 | do_div(vslice, rq_weight); | 406 | nr_running++; |
| 301 | 407 | ||
| 302 | return vslice; | 408 | return __sched_period(nr_running); |
| 303 | } | 409 | } |
| 304 | 410 | ||
| 305 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 411 | /* |
| 412 | * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in | ||
| 413 | * that it favours >=0 over <0. | ||
| 414 | * | ||
| 415 | * -20 | | ||
| 416 | * | | ||
| 417 | * 0 --------+------- | ||
| 418 | * .' | ||
| 419 | * 19 .' | ||
| 420 | * | ||
| 421 | */ | ||
| 422 | static unsigned long | ||
| 423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
| 306 | { | 424 | { |
| 307 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, | 425 | struct load_weight lw = { |
| 308 | cfs_rq->nr_running + 1); | 426 | .weight = NICE_0_LOAD, |
| 427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
| 428 | }; | ||
| 429 | |||
| 430 | for_each_sched_entity(se) { | ||
| 431 | struct load_weight *se_lw = &se->load; | ||
| 432 | |||
| 433 | if (se->load.weight < NICE_0_LOAD) | ||
| 434 | se_lw = &lw; | ||
| 435 | |||
| 436 | delta = calc_delta_mine(delta, | ||
| 437 | cfs_rq_of(se)->load.weight, se_lw); | ||
| 438 | } | ||
| 439 | |||
| 440 | return delta; | ||
| 309 | } | 441 | } |
| 310 | 442 | ||
| 311 | /* | 443 | /* |
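
calc_delta_asym() above is the calc_delta_fair() scaling (delta * rw / w per level) with one twist: an entity lighter than NICE_0_LOAD gets a NICE_0_LOAD divisor substituted, so positively niced entities are scaled as if they were nice 0 and the result cannot blow up for small weights. A rough single-level illustration of that clamp, again ignoring the inverse-weight detail and assuming NICE_0_LOAD is 1024:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL   /* assumed default weight */

/* One level of calc_delta_asym(): delta * rw / max(w, NICE_0_LOAD) */
static uint64_t delta_asym_level(uint64_t delta, uint64_t rq_weight, uint64_t se_weight)
{
    uint64_t divisor = se_weight < NICE_0_LOAD ? NICE_0_LOAD : se_weight;

    return delta * rq_weight / divisor;
}

int main(void)
{
    uint64_t gran = 10000000ULL;       /* 10ms wakeup granularity */
    uint64_t rw = 2 * NICE_0_LOAD;     /* two nice-0 entities' worth of load */

    /* a light entity (weight 110, roughly a +nice task) is treated as nice 0 */
    printf("clamped light: %llu\n", (unsigned long long)delta_asym_level(gran, rw, 110));
    printf("nice 0:        %llu\n", (unsigned long long)delta_asym_level(gran, rw, 1024));
    /* without the clamp the light entity's scaled delta would explode */
    printf("unclamped:     %llu\n", (unsigned long long)(gran * rw / 110));
    return 0;
}
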
| @@ -322,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 322 | 454 | ||
| 323 | curr->sum_exec_runtime += delta_exec; | 455 | curr->sum_exec_runtime += delta_exec; |
| 324 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 456 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
| 325 | delta_exec_weighted = delta_exec; | 457 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); |
| 326 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { | ||
| 327 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, | ||
| 328 | &curr->load); | ||
| 329 | } | ||
| 330 | curr->vruntime += delta_exec_weighted; | 458 | curr->vruntime += delta_exec_weighted; |
| 331 | } | 459 | } |
| 332 | 460 | ||
| @@ -413,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 413 | * Scheduling class queueing methods: | 541 | * Scheduling class queueing methods: |
| 414 | */ | 542 | */ |
| 415 | 543 | ||
| 544 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
| 545 | static void | ||
| 546 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 547 | { | ||
| 548 | cfs_rq->task_weight += weight; | ||
| 549 | } | ||
| 550 | #else | ||
| 551 | static inline void | ||
| 552 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 553 | { | ||
| 554 | } | ||
| 555 | #endif | ||
| 556 | |||
| 416 | static void | 557 | static void |
| 417 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 558 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 418 | { | 559 | { |
| 419 | update_load_add(&cfs_rq->load, se->load.weight); | 560 | update_load_add(&cfs_rq->load, se->load.weight); |
| 561 | if (!parent_entity(se)) | ||
| 562 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 563 | if (entity_is_task(se)) | ||
| 564 | add_cfs_task_weight(cfs_rq, se->load.weight); | ||
| 420 | cfs_rq->nr_running++; | 565 | cfs_rq->nr_running++; |
| 421 | se->on_rq = 1; | 566 | se->on_rq = 1; |
| 567 | list_add(&se->group_node, &cfs_rq->tasks); | ||
| 422 | } | 568 | } |
| 423 | 569 | ||
| 424 | static void | 570 | static void |
| 425 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 571 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 426 | { | 572 | { |
| 427 | update_load_sub(&cfs_rq->load, se->load.weight); | 573 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 574 | if (!parent_entity(se)) | ||
| 575 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 576 | if (entity_is_task(se)) | ||
| 577 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
| 428 | cfs_rq->nr_running--; | 578 | cfs_rq->nr_running--; |
| 429 | se->on_rq = 0; | 579 | se->on_rq = 0; |
| 580 | list_del_init(&se->group_node); | ||
| 430 | } | 581 | } |
| 431 | 582 | ||
| 432 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 583 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| @@ -511,8 +662,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 511 | if (!initial) { | 662 | if (!initial) { |
| 512 | /* sleeps up to a single latency don't count. */ | 663 | /* sleeps up to a single latency don't count. */ |
| 513 | if (sched_feat(NEW_FAIR_SLEEPERS)) { | 664 | if (sched_feat(NEW_FAIR_SLEEPERS)) { |
| 514 | vruntime -= calc_delta_fair(sysctl_sched_latency, | 665 | if (sched_feat(NORMALIZED_SLEEPER)) |
| 515 | &cfs_rq->load); | 666 | vruntime -= calc_delta_weight(sysctl_sched_latency, se); |
| 667 | else | ||
| 668 | vruntime -= sysctl_sched_latency; | ||
| 516 | } | 669 | } |
| 517 | 670 | ||
| 518 | /* ensure we never gain time by being placed backwards. */ | 671 | /* ensure we never gain time by being placed backwards. */ |
| @@ -629,20 +782,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 629 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 782 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
| 630 | } | 783 | } |
| 631 | 784 | ||
| 785 | static int | ||
| 786 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | ||
| 787 | |||
| 632 | static struct sched_entity * | 788 | static struct sched_entity * |
| 633 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) | 789 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 634 | { | 790 | { |
| 635 | s64 diff, gran; | ||
| 636 | |||
| 637 | if (!cfs_rq->next) | 791 | if (!cfs_rq->next) |
| 638 | return se; | 792 | return se; |
| 639 | 793 | ||
| 640 | diff = cfs_rq->next->vruntime - se->vruntime; | 794 | if (wakeup_preempt_entity(cfs_rq->next, se) != 0) |
| 641 | if (diff < 0) | ||
| 642 | return se; | ||
| 643 | |||
| 644 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); | ||
| 645 | if (diff > gran) | ||
| 646 | return se; | 795 | return se; |
| 647 | 796 | ||
| 648 | return cfs_rq->next; | 797 | return cfs_rq->next; |
| @@ -710,101 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
| 710 | * CFS operations on tasks: | 859 | * CFS operations on tasks: |
| 711 | */ | 860 | */ |
| 712 | 861 | ||
| 713 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 714 | |||
| 715 | /* Walk up scheduling entities hierarchy */ | ||
| 716 | #define for_each_sched_entity(se) \ | ||
| 717 | for (; se; se = se->parent) | ||
| 718 | |||
| 719 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
| 720 | { | ||
| 721 | return p->se.cfs_rq; | ||
| 722 | } | ||
| 723 | |||
| 724 | /* runqueue on which this entity is (to be) queued */ | ||
| 725 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
| 726 | { | ||
| 727 | return se->cfs_rq; | ||
| 728 | } | ||
| 729 | |||
| 730 | /* runqueue "owned" by this group */ | ||
| 731 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
| 732 | { | ||
| 733 | return grp->my_q; | ||
| 734 | } | ||
| 735 | |||
| 736 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
| 737 | * another cpu ('this_cpu') | ||
| 738 | */ | ||
| 739 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 740 | { | ||
| 741 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
| 742 | } | ||
| 743 | |||
| 744 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
| 745 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
| 746 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
| 747 | |||
| 748 | /* Do the two (enqueued) entities belong to the same group ? */ | ||
| 749 | static inline int | ||
| 750 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 751 | { | ||
| 752 | if (se->cfs_rq == pse->cfs_rq) | ||
| 753 | return 1; | ||
| 754 | |||
| 755 | return 0; | ||
| 756 | } | ||
| 757 | |||
| 758 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 759 | { | ||
| 760 | return se->parent; | ||
| 761 | } | ||
| 762 | |||
| 763 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 764 | |||
| 765 | #define for_each_sched_entity(se) \ | ||
| 766 | for (; se; se = NULL) | ||
| 767 | |||
| 768 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
| 769 | { | ||
| 770 | return &task_rq(p)->cfs; | ||
| 771 | } | ||
| 772 | |||
| 773 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
| 774 | { | ||
| 775 | struct task_struct *p = task_of(se); | ||
| 776 | struct rq *rq = task_rq(p); | ||
| 777 | |||
| 778 | return &rq->cfs; | ||
| 779 | } | ||
| 780 | |||
| 781 | /* runqueue "owned" by this group */ | ||
| 782 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
| 783 | { | ||
| 784 | return NULL; | ||
| 785 | } | ||
| 786 | |||
| 787 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 788 | { | ||
| 789 | return &cpu_rq(this_cpu)->cfs; | ||
| 790 | } | ||
| 791 | |||
| 792 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
| 793 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
| 794 | |||
| 795 | static inline int | ||
| 796 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
| 797 | { | ||
| 798 | return 1; | ||
| 799 | } | ||
| 800 | |||
| 801 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
| 802 | { | ||
| 803 | return NULL; | ||
| 804 | } | ||
| 805 | |||
| 806 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 807 | |||
| 808 | #ifdef CONFIG_SCHED_HRTICK | 862 | #ifdef CONFIG_SCHED_HRTICK |
| 809 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | 863 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) |
| 810 | { | 864 | { |
| @@ -918,7 +972,7 @@ static void yield_task_fair(struct rq *rq) | |||
| 918 | /* | 972 | /* |
| 919 | * Already in the rightmost position? | 973 | * Already in the rightmost position? |
| 920 | */ | 974 | */ |
| 921 | if (unlikely(rightmost->vruntime < se->vruntime)) | 975 | if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) |
| 922 | return; | 976 | return; |
| 923 | 977 | ||
| 924 | /* | 978 | /* |
| @@ -957,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
| 957 | return cpu; | 1011 | return cpu; |
| 958 | 1012 | ||
| 959 | for_each_domain(cpu, sd) { | 1013 | for_each_domain(cpu, sd) { |
| 960 | if (sd->flags & SD_WAKE_IDLE) { | 1014 | if ((sd->flags & SD_WAKE_IDLE) |
| 1015 | || ((sd->flags & SD_WAKE_IDLE_FAR) | ||
| 1016 | && !task_hot(p, task_rq(p)->clock, sd))) { | ||
| 961 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1017 | cpus_and(tmp, sd->span, p->cpus_allowed); |
| 962 | for_each_cpu_mask(i, tmp) { | 1018 | for_each_cpu_mask(i, tmp) { |
| 963 | if (idle_cpu(i)) { | 1019 | if (idle_cpu(i)) { |
| @@ -1101,6 +1157,58 @@ out: | |||
| 1101 | } | 1157 | } |
| 1102 | #endif /* CONFIG_SMP */ | 1158 | #endif /* CONFIG_SMP */ |
| 1103 | 1159 | ||
| 1160 | static unsigned long wakeup_gran(struct sched_entity *se) | ||
| 1161 | { | ||
| 1162 | unsigned long gran = sysctl_sched_wakeup_granularity; | ||
| 1163 | |||
| 1164 | /* | ||
| 1165 | * Preempt negatively niced (-nice) tasks more easily, while not making | ||
| 1166 | * it harder for positively niced (+nice) tasks. | ||
| 1167 | */ | ||
| 1168 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); | ||
| 1169 | |||
| 1170 | return gran; | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | /* | ||
| 1174 | * Should 'se' preempt 'curr'. | ||
| 1175 | * | ||
| 1176 | * |s1 | ||
| 1177 | * |s2 | ||
| 1178 | * |s3 | ||
| 1179 | * g | ||
| 1180 | * |<--->|c | ||
| 1181 | * | ||
| 1182 | * w(c, s1) = -1 | ||
| 1183 | * w(c, s2) = 0 | ||
| 1184 | * w(c, s3) = 1 | ||
| 1185 | * | ||
| 1186 | */ | ||
| 1187 | static int | ||
| 1188 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | ||
| 1189 | { | ||
| 1190 | s64 gran, vdiff = curr->vruntime - se->vruntime; | ||
| 1191 | |||
| 1192 | if (vdiff < 0) | ||
| 1193 | return -1; | ||
| 1194 | |||
| 1195 | gran = wakeup_gran(curr); | ||
| 1196 | if (vdiff > gran) | ||
| 1197 | return 1; | ||
| 1198 | |||
| 1199 | return 0; | ||
| 1200 | } | ||
| 1201 | |||
| 1202 | /* return depth at which a sched entity is present in the hierarchy */ | ||
| 1203 | static inline int depth_se(struct sched_entity *se) | ||
| 1204 | { | ||
| 1205 | int depth = 0; | ||
| 1206 | |||
| 1207 | for_each_sched_entity(se) | ||
| 1208 | depth++; | ||
| 1209 | |||
| 1210 | return depth; | ||
| 1211 | } | ||
| 1104 | 1212 | ||
| 1105 | /* | 1213 | /* |
| 1106 | * Preempt the current task with a newly woken task if needed: | 1214 | * Preempt the current task with a newly woken task if needed: |
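
wakeup_preempt_entity() above packages the old open-coded vruntime test into the three-way result sketched in the s1/s2/s3 diagram: -1 when the candidate's vruntime is already past the current entity's, 1 when it trails by more than the wakeup granularity, and 0 in between. A user-space sketch of that decision with a fixed granularity standing in for wakeup_gran(), which in the kernel is weight-scaled via calc_delta_asym():

#include <stdio.h>
#include <stdint.h>

/* fixed 10ms granularity, matching the new default; the real value is scaled */
#define WAKEUP_GRAN 10000000LL

/*
 * Returns 1 if 'se' should preempt 'curr', 0 if it is within the
 * granularity, -1 if its vruntime is already past curr's.
 */
static int wakeup_preempt_entity(int64_t curr_vruntime, int64_t se_vruntime)
{
    int64_t vdiff = curr_vruntime - se_vruntime;

    if (vdiff < 0)
        return -1;
    if (vdiff > WAKEUP_GRAN)
        return 1;
    return 0;
}

int main(void)
{
    printf("%d\n", wakeup_preempt_entity(100, 200));              /* -1 */
    printf("%d\n", wakeup_preempt_entity(100, 100 - 5000000));    /*  0 */
    printf("%d\n", wakeup_preempt_entity(100, 100 - 20000000));   /*  1 */
    return 0;
}
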
| @@ -1110,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1110 | struct task_struct *curr = rq->curr; | 1218 | struct task_struct *curr = rq->curr; |
| 1111 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1219 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
| 1112 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1220 | struct sched_entity *se = &curr->se, *pse = &p->se; |
| 1113 | unsigned long gran; | 1221 | int se_depth, pse_depth; |
| 1114 | 1222 | ||
| 1115 | if (unlikely(rt_prio(p->prio))) { | 1223 | if (unlikely(rt_prio(p->prio))) { |
| 1116 | update_rq_clock(rq); | 1224 | update_rq_clock(rq); |
| @@ -1135,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1135 | if (!sched_feat(WAKEUP_PREEMPT)) | 1243 | if (!sched_feat(WAKEUP_PREEMPT)) |
| 1136 | return; | 1244 | return; |
| 1137 | 1245 | ||
| 1138 | while (!is_same_group(se, pse)) { | 1246 | /* |
| 1247 | * The preemption test can be made between sibling entities that are in the | ||
| 1248 | * same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of | ||
| 1249 | * both tasks until we find their ancestors that are siblings of a common | ||
| 1250 | * parent. | ||
| 1251 | */ | ||
| 1252 | |||
| 1253 | /* First walk up until both entities are at same depth */ | ||
| 1254 | se_depth = depth_se(se); | ||
| 1255 | pse_depth = depth_se(pse); | ||
| 1256 | |||
| 1257 | while (se_depth > pse_depth) { | ||
| 1258 | se_depth--; | ||
| 1139 | se = parent_entity(se); | 1259 | se = parent_entity(se); |
| 1260 | } | ||
| 1261 | |||
| 1262 | while (pse_depth > se_depth) { | ||
| 1263 | pse_depth--; | ||
| 1140 | pse = parent_entity(pse); | 1264 | pse = parent_entity(pse); |
| 1141 | } | 1265 | } |
| 1142 | 1266 | ||
| 1143 | gran = sysctl_sched_wakeup_granularity; | 1267 | while (!is_same_group(se, pse)) { |
| 1144 | /* | 1268 | se = parent_entity(se); |
| 1145 | * More easily preempt - nice tasks, while not making | 1269 | pse = parent_entity(pse); |
| 1146 | * it harder for + nice tasks. | 1270 | } |
| 1147 | */ | ||
| 1148 | if (unlikely(se->load.weight > NICE_0_LOAD)) | ||
| 1149 | gran = calc_delta_fair(gran, &se->load); | ||
| 1150 | 1271 | ||
| 1151 | if (pse->vruntime + gran < se->vruntime) | 1272 | if (wakeup_preempt_entity(se, pse) == 1) |
| 1152 | resched_task(curr); | 1273 | resched_task(curr); |
| 1153 | } | 1274 | } |
| 1154 | 1275 | ||
| @@ -1199,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
| 1199 | * the current task: | 1320 | * the current task: |
| 1200 | */ | 1321 | */ |
| 1201 | static struct task_struct * | 1322 | static struct task_struct * |
| 1202 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 1323 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) |
| 1203 | { | 1324 | { |
| 1204 | struct task_struct *p; | 1325 | struct task_struct *p = NULL; |
| 1326 | struct sched_entity *se; | ||
| 1327 | |||
| 1328 | if (next == &cfs_rq->tasks) | ||
| 1329 | return NULL; | ||
| 1330 | |||
| 1331 | /* Skip over entities that are not tasks */ | ||
| 1332 | do { | ||
| 1333 | se = list_entry(next, struct sched_entity, group_node); | ||
| 1334 | next = next->next; | ||
| 1335 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
| 1205 | 1336 | ||
| 1206 | if (!curr) | 1337 | if (next == &cfs_rq->tasks) |
| 1207 | return NULL; | 1338 | return NULL; |
| 1208 | 1339 | ||
| 1209 | p = rb_entry(curr, struct task_struct, se.run_node); | 1340 | cfs_rq->balance_iterator = next; |
| 1210 | cfs_rq->rb_load_balance_curr = rb_next(curr); | 1341 | |
| 1342 | if (entity_is_task(se)) | ||
| 1343 | p = task_of(se); | ||
| 1211 | 1344 | ||
| 1212 | return p; | 1345 | return p; |
| 1213 | } | 1346 | } |
| @@ -1216,85 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg) | |||
| 1216 | { | 1349 | { |
| 1217 | struct cfs_rq *cfs_rq = arg; | 1350 | struct cfs_rq *cfs_rq = arg; |
| 1218 | 1351 | ||
| 1219 | return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); | 1352 | return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); |
| 1220 | } | 1353 | } |
| 1221 | 1354 | ||
| 1222 | static struct task_struct *load_balance_next_fair(void *arg) | 1355 | static struct task_struct *load_balance_next_fair(void *arg) |
| 1223 | { | 1356 | { |
| 1224 | struct cfs_rq *cfs_rq = arg; | 1357 | struct cfs_rq *cfs_rq = arg; |
| 1225 | 1358 | ||
| 1226 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1359 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); |
| 1227 | } | 1360 | } |
| 1228 | 1361 | ||
| 1229 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1362 | static unsigned long |
| 1230 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 1363 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1364 | unsigned long max_load_move, struct sched_domain *sd, | ||
| 1365 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | ||
| 1366 | struct cfs_rq *cfs_rq) | ||
| 1231 | { | 1367 | { |
| 1232 | struct sched_entity *curr; | 1368 | struct rq_iterator cfs_rq_iterator; |
| 1233 | struct task_struct *p; | ||
| 1234 | |||
| 1235 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) | ||
| 1236 | return MAX_PRIO; | ||
| 1237 | |||
| 1238 | curr = cfs_rq->curr; | ||
| 1239 | if (!curr) | ||
| 1240 | curr = __pick_next_entity(cfs_rq); | ||
| 1241 | 1369 | ||
| 1242 | p = task_of(curr); | 1370 | cfs_rq_iterator.start = load_balance_start_fair; |
| 1371 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1372 | cfs_rq_iterator.arg = cfs_rq; | ||
| 1243 | 1373 | ||
| 1244 | return p->prio; | 1374 | return balance_tasks(this_rq, this_cpu, busiest, |
| 1375 | max_load_move, sd, idle, all_pinned, | ||
| 1376 | this_best_prio, &cfs_rq_iterator); | ||
| 1245 | } | 1377 | } |
| 1246 | #endif | ||
| 1247 | 1378 | ||
| 1379 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1248 | static unsigned long | 1380 | static unsigned long |
| 1249 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1381 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1250 | unsigned long max_load_move, | 1382 | unsigned long max_load_move, |
| 1251 | struct sched_domain *sd, enum cpu_idle_type idle, | 1383 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 1252 | int *all_pinned, int *this_best_prio) | 1384 | int *all_pinned, int *this_best_prio) |
| 1253 | { | 1385 | { |
| 1254 | struct cfs_rq *busy_cfs_rq; | ||
| 1255 | long rem_load_move = max_load_move; | 1386 | long rem_load_move = max_load_move; |
| 1256 | struct rq_iterator cfs_rq_iterator; | 1387 | int busiest_cpu = cpu_of(busiest); |
| 1257 | 1388 | struct task_group *tg; | |
| 1258 | cfs_rq_iterator.start = load_balance_start_fair; | ||
| 1259 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1260 | 1389 | ||
| 1261 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1390 | rcu_read_lock(); |
| 1262 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1391 | list_for_each_entry(tg, &task_groups, list) { |
| 1263 | struct cfs_rq *this_cfs_rq; | ||
| 1264 | long imbalance; | 1392 | long imbalance; |
| 1265 | unsigned long maxload; | 1393 | unsigned long this_weight, busiest_weight; |
| 1394 | long rem_load, max_load, moved_load; | ||
| 1395 | |||
| 1396 | /* | ||
| 1397 | * empty group | ||
| 1398 | */ | ||
| 1399 | if (!aggregate(tg, sd)->task_weight) | ||
| 1400 | continue; | ||
| 1401 | |||
| 1402 | rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; | ||
| 1403 | rem_load /= aggregate(tg, sd)->load + 1; | ||
| 1404 | |||
| 1405 | this_weight = tg->cfs_rq[this_cpu]->task_weight; | ||
| 1406 | busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; | ||
| 1407 | |||
| 1408 | imbalance = (busiest_weight - this_weight) / 2; | ||
| 1266 | 1409 | ||
| 1267 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1410 | if (imbalance < 0) |
| 1411 | imbalance = busiest_weight; | ||
| 1268 | 1412 | ||
| 1269 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1413 | max_load = max(rem_load, imbalance); |
| 1270 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1414 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, |
| 1271 | if (imbalance <= 0) | 1415 | max_load, sd, idle, all_pinned, this_best_prio, |
| 1416 | tg->cfs_rq[busiest_cpu]); | ||
| 1417 | |||
| 1418 | if (!moved_load) | ||
| 1272 | continue; | 1419 | continue; |
| 1273 | 1420 | ||
| 1274 | /* Don't pull more than imbalance/2 */ | 1421 | move_group_shares(tg, sd, busiest_cpu, this_cpu); |
| 1275 | imbalance /= 2; | ||
| 1276 | maxload = min(rem_load_move, imbalance); | ||
| 1277 | 1422 | ||
| 1278 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1423 | moved_load *= aggregate(tg, sd)->load; |
| 1279 | #else | 1424 | moved_load /= aggregate(tg, sd)->rq_weight + 1; |
| 1280 | # define maxload rem_load_move | ||
| 1281 | #endif | ||
| 1282 | /* | ||
| 1283 | * pass busy_cfs_rq argument into | ||
| 1284 | * load_balance_[start|next]_fair iterators | ||
| 1285 | */ | ||
| 1286 | cfs_rq_iterator.arg = busy_cfs_rq; | ||
| 1287 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | ||
| 1288 | maxload, sd, idle, all_pinned, | ||
| 1289 | this_best_prio, | ||
| 1290 | &cfs_rq_iterator); | ||
| 1291 | 1425 | ||
| 1292 | if (rem_load_move <= 0) | 1426 | rem_load_move -= moved_load; |
| 1427 | if (rem_load_move < 0) | ||
| 1293 | break; | 1428 | break; |
| 1294 | } | 1429 | } |
| 1430 | rcu_read_unlock(); | ||
| 1295 | 1431 | ||
| 1296 | return max_load_move - rem_load_move; | 1432 | return max_load_move - rem_load_move; |
| 1297 | } | 1433 | } |
| 1434 | #else | ||
| 1435 | static unsigned long | ||
| 1436 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1437 | unsigned long max_load_move, | ||
| 1438 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 1439 | int *all_pinned, int *this_best_prio) | ||
| 1440 | { | ||
| 1441 | return __load_balance_fair(this_rq, this_cpu, busiest, | ||
| 1442 | max_load_move, sd, idle, all_pinned, | ||
| 1443 | this_best_prio, &busiest->cfs); | ||
| 1444 | } | ||
| 1445 | #endif | ||
| 1298 | 1446 | ||
| 1299 | static int | 1447 | static int |
| 1300 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1448 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| @@ -1463,16 +1611,40 @@ static const struct sched_class fair_sched_class = { | |||
| 1463 | }; | 1611 | }; |
| 1464 | 1612 | ||
| 1465 | #ifdef CONFIG_SCHED_DEBUG | 1613 | #ifdef CONFIG_SCHED_DEBUG |
| 1614 | static void | ||
| 1615 | print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth) | ||
| 1616 | { | ||
| 1617 | struct sched_entity *se; | ||
| 1618 | |||
| 1619 | if (!cfs_rq) | ||
| 1620 | return; | ||
| 1621 | |||
| 1622 | list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) { | ||
| 1623 | int i; | ||
| 1624 | |||
| 1625 | for (i = depth; i; i--) | ||
| 1626 | seq_puts(m, " "); | ||
| 1627 | |||
| 1628 | seq_printf(m, "%lu %s %lu\n", | ||
| 1629 | se->load.weight, | ||
| 1630 | entity_is_task(se) ? "T" : "G", | ||
| 1631 | calc_delta_weight(SCHED_LOAD_SCALE, se) | ||
| 1632 | ); | ||
| 1633 | if (!entity_is_task(se)) | ||
| 1634 | print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1); | ||
| 1635 | } | ||
| 1636 | } | ||
| 1637 | |||
| 1466 | static void print_cfs_stats(struct seq_file *m, int cpu) | 1638 | static void print_cfs_stats(struct seq_file *m, int cpu) |
| 1467 | { | 1639 | { |
| 1468 | struct cfs_rq *cfs_rq; | 1640 | struct cfs_rq *cfs_rq; |
| 1469 | 1641 | ||
| 1470 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1471 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
| 1472 | #endif | ||
| 1473 | rcu_read_lock(); | 1642 | rcu_read_lock(); |
| 1474 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1643 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
| 1475 | print_cfs_rq(m, cpu, cfs_rq); | 1644 | print_cfs_rq(m, cpu, cfs_rq); |
| 1645 | |||
| 1646 | seq_printf(m, "\nWeight tree:\n"); | ||
| 1647 | print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1); | ||
| 1476 | rcu_read_unlock(); | 1648 | rcu_read_unlock(); |
| 1477 | } | 1649 | } |
| 1478 | #endif | 1650 | #endif |
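A rough userspace rendering of the arithmetic in the new group-aware load_balance_fair(): it converts the remaining load to move into the group's own task-weight units, aims for the larger of that share and half the weight imbalance between the two CPUs, then scales whatever was moved back into global load units. All weights below are invented for illustration; this is a sketch, not the patch's code.

#include <stdio.h>

/* Mirrors the scaling steps in the new load_balance_fair(); values invented. */
int main(void)
{
	unsigned long max_load_move = 1024;	/* global load still to move          */
	unsigned long rq_weight  = 2048;	/* aggregate(tg, sd)->rq_weight       */
	unsigned long load       = 4096;	/* aggregate(tg, sd)->load            */
	long this_weight    = 512;		/* tg->cfs_rq[this_cpu]->task_weight    */
	long busiest_weight = 1536;		/* tg->cfs_rq[busiest_cpu]->task_weight */

	/* remaining global load expressed in this group's weight units */
	long rem_load = max_load_move * rq_weight / (load + 1);

	/* aim for the larger of the proportional share and half the imbalance */
	long imbalance = (busiest_weight - this_weight) / 2;
	long max_load  = rem_load > imbalance ? rem_load : imbalance;

	/* assume __load_balance_fair() managed to move all of max_load */
	long moved = max_load;

	/* scale the moved group weight back into global load units */
	long moved_load = moved * load / (rq_weight + 1);

	printf("rem_load=%ld imbalance=%ld moved_load=%ld\n",
	       rem_load, imbalance, moved_load);
	return 0;
}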
diff --git a/kernel/sched_features.h b/kernel/sched_features.h new file mode 100644 index 000000000000..1c7283cb9581 --- /dev/null +++ b/kernel/sched_features.h | |||
| @@ -0,0 +1,10 @@ | |||
| 1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) | ||
| 2 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
| 3 | SCHED_FEAT(START_DEBIT, 1) | ||
| 4 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | ||
| 5 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | ||
| 6 | SCHED_FEAT(SYNC_WAKEUPS, 1) | ||
| 7 | SCHED_FEAT(HRTICK, 1) | ||
| 8 | SCHED_FEAT(DOUBLE_TICK, 0) | ||
| 9 | SCHED_FEAT(NORMALIZED_SLEEPER, 1) | ||
| 10 | SCHED_FEAT(DEADLINE, 1) | ||
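The new sched_features.h carries no declarations of its own; it is meant to be included repeatedly with different SCHED_FEAT() definitions. Roughly how kernel/sched.c of this vintage consumes it — a sketch from memory, not part of this diff:

/* First pass: turn each entry into an enum constant (a feature bit index). */
#define SCHED_FEAT(name, enabled)	\
	__SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

/* Second pass: OR together the bits of every default-enabled feature. */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;

#undef SCHED_FEAT

/* Call sites then test bits with something along the lines of:
 *	#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 */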
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a6d2e516420..c2730a5a4f05 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | |||
| 62 | if (!rt_rq->tg) | 62 | if (!rt_rq->tg) |
| 63 | return RUNTIME_INF; | 63 | return RUNTIME_INF; |
| 64 | 64 | ||
| 65 | return rt_rq->tg->rt_runtime; | 65 | return rt_rq->rt_runtime; |
| 66 | } | ||
| 67 | |||
| 68 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) | ||
| 69 | { | ||
| 70 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | ||
| 66 | } | 71 | } |
| 67 | 72 | ||
| 68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 73 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
| @@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) | |||
| 127 | return p->prio != p->normal_prio; | 132 | return p->prio != p->normal_prio; |
| 128 | } | 133 | } |
| 129 | 134 | ||
| 135 | #ifdef CONFIG_SMP | ||
| 136 | static inline cpumask_t sched_rt_period_mask(void) | ||
| 137 | { | ||
| 138 | return cpu_rq(smp_processor_id())->rd->span; | ||
| 139 | } | ||
| 140 | #else | ||
| 141 | static inline cpumask_t sched_rt_period_mask(void) | ||
| 142 | { | ||
| 143 | return cpu_online_map; | ||
| 144 | } | ||
| 145 | #endif | ||
| 146 | |||
| 147 | static inline | ||
| 148 | struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | ||
| 149 | { | ||
| 150 | return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; | ||
| 151 | } | ||
| 152 | |||
| 153 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
| 154 | { | ||
| 155 | return &rt_rq->tg->rt_bandwidth; | ||
| 156 | } | ||
| 157 | |||
| 130 | #else | 158 | #else |
| 131 | 159 | ||
| 132 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | 160 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) |
| 133 | { | 161 | { |
| 134 | if (sysctl_sched_rt_runtime == -1) | 162 | return rt_rq->rt_runtime; |
| 135 | return RUNTIME_INF; | 163 | } |
| 136 | 164 | ||
| 137 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 165 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) |
| 166 | { | ||
| 167 | return ktime_to_ns(def_rt_bandwidth.rt_period); | ||
| 138 | } | 168 | } |
| 139 | 169 | ||
| 140 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 170 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
| @@ -173,6 +203,102 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) | |||
| 173 | { | 203 | { |
| 174 | return rt_rq->rt_throttled; | 204 | return rt_rq->rt_throttled; |
| 175 | } | 205 | } |
| 206 | |||
| 207 | static inline cpumask_t sched_rt_period_mask(void) | ||
| 208 | { | ||
| 209 | return cpu_online_map; | ||
| 210 | } | ||
| 211 | |||
| 212 | static inline | ||
| 213 | struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | ||
| 214 | { | ||
| 215 | return &cpu_rq(cpu)->rt; | ||
| 216 | } | ||
| 217 | |||
| 218 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
| 219 | { | ||
| 220 | return &def_rt_bandwidth; | ||
| 221 | } | ||
| 222 | |||
| 223 | #endif | ||
| 224 | |||
| 225 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | ||
| 226 | { | ||
| 227 | int i, idle = 1; | ||
| 228 | cpumask_t span; | ||
| 229 | |||
| 230 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
| 231 | return 1; | ||
| 232 | |||
| 233 | span = sched_rt_period_mask(); | ||
| 234 | for_each_cpu_mask(i, span) { | ||
| 235 | int enqueue = 0; | ||
| 236 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | ||
| 237 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 238 | |||
| 239 | spin_lock(&rq->lock); | ||
| 240 | if (rt_rq->rt_time) { | ||
| 241 | u64 runtime; | ||
| 242 | |||
| 243 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 244 | runtime = rt_rq->rt_runtime; | ||
| 245 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | ||
| 246 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
| 247 | rt_rq->rt_throttled = 0; | ||
| 248 | enqueue = 1; | ||
| 249 | } | ||
| 250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | ||
| 251 | idle = 0; | ||
| 252 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 253 | } | ||
| 254 | |||
| 255 | if (enqueue) | ||
| 256 | sched_rt_rq_enqueue(rt_rq); | ||
| 257 | spin_unlock(&rq->lock); | ||
| 258 | } | ||
| 259 | |||
| 260 | return idle; | ||
| 261 | } | ||
| 262 | |||
| 263 | #ifdef CONFIG_SMP | ||
| 264 | static int balance_runtime(struct rt_rq *rt_rq) | ||
| 265 | { | ||
| 266 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
| 267 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | ||
| 268 | int i, weight, more = 0; | ||
| 269 | u64 rt_period; | ||
| 270 | |||
| 271 | weight = cpus_weight(rd->span); | ||
| 272 | |||
| 273 | spin_lock(&rt_b->rt_runtime_lock); | ||
| 274 | rt_period = ktime_to_ns(rt_b->rt_period); | ||
| 275 | for_each_cpu_mask(i, rd->span) { | ||
| 276 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | ||
| 277 | s64 diff; | ||
| 278 | |||
| 279 | if (iter == rt_rq) | ||
| 280 | continue; | ||
| 281 | |||
| 282 | spin_lock(&iter->rt_runtime_lock); | ||
| 283 | diff = iter->rt_runtime - iter->rt_time; | ||
| 284 | if (diff > 0) { | ||
| 285 | do_div(diff, weight); | ||
| 286 | if (rt_rq->rt_runtime + diff > rt_period) | ||
| 287 | diff = rt_period - rt_rq->rt_runtime; | ||
| 288 | iter->rt_runtime -= diff; | ||
| 289 | rt_rq->rt_runtime += diff; | ||
| 290 | more = 1; | ||
| 291 | if (rt_rq->rt_runtime == rt_period) { | ||
| 292 | spin_unlock(&iter->rt_runtime_lock); | ||
| 293 | break; | ||
| 294 | } | ||
| 295 | } | ||
| 296 | spin_unlock(&iter->rt_runtime_lock); | ||
| 297 | } | ||
| 298 | spin_unlock(&rt_b->rt_runtime_lock); | ||
| 299 | |||
| 300 | return more; | ||
| 301 | } | ||
| 176 | #endif | 302 | #endif |
| 177 | 303 | ||
| 178 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | 304 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) |
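In the hunk above, do_sched_rt_period_timer() winds each rt_rq's consumed rt_time back by a period's worth of runtime and unthrottles it, while balance_runtime() lets a starved rt_rq borrow a 1/nr_cpus share of whatever budget its siblings in the root domain have left. A toy run of the borrowing step, with invented numbers:

#include <stdio.h>

/* Toy model of the diff/weight redistribution in balance_runtime();
 * budgets are invented, units are arbitrary. */
int main(void)
{
	long rt_period = 1000;			/* ktime_to_ns(rt_b->rt_period) */
	long weight = 4;			/* cpus_weight(rd->span)        */
	long borrower = 250;			/* this rt_rq's rt_runtime      */
	long lender_runtime[3] = { 250, 250, 250 };
	long lender_time[3]    = {  50, 100, 250 };	/* already consumed */

	for (int i = 0; i < 3; i++) {
		long diff = lender_runtime[i] - lender_time[i];	/* spare budget */

		if (diff <= 0)
			continue;
		diff /= weight;			/* only take a 1/weight share */
		if (borrower + diff > rt_period)
			diff = rt_period - borrower;
		lender_runtime[i] -= diff;
		borrower += diff;
	}
	printf("borrower now has %ld of a %ld period\n", borrower, rt_period);
	return 0;
}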
| @@ -197,12 +323,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
| 197 | if (rt_rq->rt_throttled) | 323 | if (rt_rq->rt_throttled) |
| 198 | return rt_rq_throttled(rt_rq); | 324 | return rt_rq_throttled(rt_rq); |
| 199 | 325 | ||
| 326 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | ||
| 327 | return 0; | ||
| 328 | |||
| 329 | #ifdef CONFIG_SMP | ||
| 200 | if (rt_rq->rt_time > runtime) { | 330 | if (rt_rq->rt_time > runtime) { |
| 201 | struct rq *rq = rq_of_rt_rq(rt_rq); | 331 | int more; |
| 202 | 332 | ||
| 203 | rq->rt_throttled = 1; | 333 | spin_unlock(&rt_rq->rt_runtime_lock); |
| 204 | rt_rq->rt_throttled = 1; | 334 | more = balance_runtime(rt_rq); |
| 335 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 205 | 336 | ||
| 337 | if (more) | ||
| 338 | runtime = sched_rt_runtime(rt_rq); | ||
| 339 | } | ||
| 340 | #endif | ||
| 341 | |||
| 342 | if (rt_rq->rt_time > runtime) { | ||
| 343 | rt_rq->rt_throttled = 1; | ||
| 206 | if (rt_rq_throttled(rt_rq)) { | 344 | if (rt_rq_throttled(rt_rq)) { |
| 207 | sched_rt_rq_dequeue(rt_rq); | 345 | sched_rt_rq_dequeue(rt_rq); |
| 208 | return 1; | 346 | return 1; |
| @@ -212,29 +350,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
| 212 | return 0; | 350 | return 0; |
| 213 | } | 351 | } |
| 214 | 352 | ||
| 215 | static void update_sched_rt_period(struct rq *rq) | ||
| 216 | { | ||
| 217 | struct rt_rq *rt_rq; | ||
| 218 | u64 period; | ||
| 219 | |||
| 220 | while (rq->clock > rq->rt_period_expire) { | ||
| 221 | period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
| 222 | rq->rt_period_expire += period; | ||
| 223 | |||
| 224 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
| 225 | u64 runtime = sched_rt_runtime(rt_rq); | ||
| 226 | |||
| 227 | rt_rq->rt_time -= min(rt_rq->rt_time, runtime); | ||
| 228 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
| 229 | rt_rq->rt_throttled = 0; | ||
| 230 | sched_rt_rq_enqueue(rt_rq); | ||
| 231 | } | ||
| 232 | } | ||
| 233 | |||
| 234 | rq->rt_throttled = 0; | ||
| 235 | } | ||
| 236 | } | ||
| 237 | |||
| 238 | /* | 353 | /* |
| 239 | * Update the current task's runtime statistics. Skip current tasks that | 354 | * Update the current task's runtime statistics. Skip current tasks that |
| 240 | * are not in our scheduling class. | 355 | * are not in our scheduling class. |
| @@ -259,9 +374,15 @@ static void update_curr_rt(struct rq *rq) | |||
| 259 | curr->se.exec_start = rq->clock; | 374 | curr->se.exec_start = rq->clock; |
| 260 | cpuacct_charge(curr, delta_exec); | 375 | cpuacct_charge(curr, delta_exec); |
| 261 | 376 | ||
| 262 | rt_rq->rt_time += delta_exec; | 377 | for_each_sched_rt_entity(rt_se) { |
| 263 | if (sched_rt_runtime_exceeded(rt_rq)) | 378 | rt_rq = rt_rq_of_se(rt_se); |
| 264 | resched_task(curr); | 379 | |
| 380 | spin_lock(&rt_rq->rt_runtime_lock); | ||
| 381 | rt_rq->rt_time += delta_exec; | ||
| 382 | if (sched_rt_runtime_exceeded(rt_rq)) | ||
| 383 | resched_task(curr); | ||
| 384 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 385 | } | ||
| 265 | } | 386 | } |
| 266 | 387 | ||
| 267 | static inline | 388 | static inline |
| @@ -284,6 +405,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 284 | #ifdef CONFIG_RT_GROUP_SCHED | 405 | #ifdef CONFIG_RT_GROUP_SCHED |
| 285 | if (rt_se_boosted(rt_se)) | 406 | if (rt_se_boosted(rt_se)) |
| 286 | rt_rq->rt_nr_boosted++; | 407 | rt_rq->rt_nr_boosted++; |
| 408 | |||
| 409 | if (rt_rq->tg) | ||
| 410 | start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); | ||
| 411 | #else | ||
| 412 | start_rt_bandwidth(&def_rt_bandwidth); | ||
| 287 | #endif | 413 | #endif |
| 288 | } | 414 | } |
| 289 | 415 | ||
| @@ -353,27 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 353 | /* | 479 | /* |
| 354 | * Because the prio of an upper entry depends on the lower | 480 | * Because the prio of an upper entry depends on the lower |
| 355 | * entries, we must remove entries top - down. | 481 | * entries, we must remove entries top - down. |
| 356 | * | ||
| 357 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
| 358 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
| 359 | */ | 482 | */ |
| 360 | static void dequeue_rt_stack(struct task_struct *p) | 483 | static void dequeue_rt_stack(struct task_struct *p) |
| 361 | { | 484 | { |
| 362 | struct sched_rt_entity *rt_se, *top_se; | 485 | struct sched_rt_entity *rt_se, *back = NULL; |
| 363 | 486 | ||
| 364 | /* | 487 | rt_se = &p->rt; |
| 365 | * dequeue all, top - down. | 488 | for_each_sched_rt_entity(rt_se) { |
| 366 | */ | 489 | rt_se->back = back; |
| 367 | do { | 490 | back = rt_se; |
| 368 | rt_se = &p->rt; | 491 | } |
| 369 | top_se = NULL; | 492 | |
| 370 | for_each_sched_rt_entity(rt_se) { | 493 | for (rt_se = back; rt_se; rt_se = rt_se->back) { |
| 371 | if (on_rt_rq(rt_se)) | 494 | if (on_rt_rq(rt_se)) |
| 372 | top_se = rt_se; | 495 | dequeue_rt_entity(rt_se); |
| 373 | } | 496 | } |
| 374 | if (top_se) | ||
| 375 | dequeue_rt_entity(top_se); | ||
| 376 | } while (top_se); | ||
| 377 | } | 497 | } |
| 378 | 498 | ||
| 379 | /* | 499 | /* |
| @@ -393,6 +513,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 393 | */ | 513 | */ |
| 394 | for_each_sched_rt_entity(rt_se) | 514 | for_each_sched_rt_entity(rt_se) |
| 395 | enqueue_rt_entity(rt_se); | 515 | enqueue_rt_entity(rt_se); |
| 516 | |||
| 517 | inc_cpu_load(rq, p->se.load.weight); | ||
| 396 | } | 518 | } |
| 397 | 519 | ||
| 398 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 520 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
| @@ -412,6 +534,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
| 412 | if (rt_rq && rt_rq->rt_nr_running) | 534 | if (rt_rq && rt_rq->rt_nr_running) |
| 413 | enqueue_rt_entity(rt_se); | 535 | enqueue_rt_entity(rt_se); |
| 414 | } | 536 | } |
| 537 | |||
| 538 | dec_cpu_load(rq, p->se.load.weight); | ||
| 415 | } | 539 | } |
| 416 | 540 | ||
| 417 | /* | 541 | /* |
| @@ -1001,7 +1125,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1001 | return 0; | 1125 | return 0; |
| 1002 | } | 1126 | } |
| 1003 | 1127 | ||
| 1004 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | 1128 | static void set_cpus_allowed_rt(struct task_struct *p, |
| 1129 | const cpumask_t *new_mask) | ||
| 1005 | { | 1130 | { |
| 1006 | int weight = cpus_weight(*new_mask); | 1131 | int weight = cpus_weight(*new_mask); |
| 1007 | 1132 | ||
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 5b32433e7ee5..5bae2e0c3ff2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
| @@ -9,6 +9,11 @@ | |||
| 9 | static int show_schedstat(struct seq_file *seq, void *v) | 9 | static int show_schedstat(struct seq_file *seq, void *v) |
| 10 | { | 10 | { |
| 11 | int cpu; | 11 | int cpu; |
| 12 | int mask_len = NR_CPUS/32 * 9; | ||
| 13 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
| 14 | |||
| 15 | if (mask_str == NULL) | ||
| 16 | return -ENOMEM; | ||
| 12 | 17 | ||
| 13 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 18 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
| 14 | seq_printf(seq, "timestamp %lu\n", jiffies); | 19 | seq_printf(seq, "timestamp %lu\n", jiffies); |
| @@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 36 | preempt_disable(); | 41 | preempt_disable(); |
| 37 | for_each_domain(cpu, sd) { | 42 | for_each_domain(cpu, sd) { |
| 38 | enum cpu_idle_type itype; | 43 | enum cpu_idle_type itype; |
| 39 | char mask_str[NR_CPUS]; | ||
| 40 | 44 | ||
| 41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 45 | cpumask_scnprintf(mask_str, mask_len, sd->span); |
| 42 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | 46 | seq_printf(seq, "domain%d %s", dcount++, mask_str); |
| 43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | 47 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; |
| 44 | itype++) { | 48 | itype++) { |
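The replacement buffer is sized as NR_CPUS/32 * 9 because cpumask_scnprintf() emits each 32-bit word of the mask as 8 hex digits plus a separator; the old on-stack char mask_str[NR_CPUS] was big enough, but keeps an NR_CPUS-byte array on the stack, which gets unpleasant once NR_CPUS reaches 4096. A quick size comparison (illustrative values only):

#include <stdio.h>

/* 8 hex digits + 1 separator per 32-bit cpumask word. Values invented. */
int main(void)
{
	int sizes[] = { 32, 128, 1024, 4096 };

	for (int i = 0; i < 4; i++)
		printf("NR_CPUS=%4d: kmalloc'ed buffer %4d bytes, old stack array %4d bytes\n",
		       sizes[i], sizes[i] / 32 * 9, sizes[i]);
	return 0;
}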
diff --git a/kernel/semaphore.c b/kernel/semaphore.c new file mode 100644 index 000000000000..5c2942e768cd --- /dev/null +++ b/kernel/semaphore.c | |||
| @@ -0,0 +1,264 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2008 Intel Corporation | ||
| 3 | * Author: Matthew Wilcox <willy@linux.intel.com> | ||
| 4 | * | ||
| 5 | * Distributed under the terms of the GNU GPL, version 2 | ||
| 6 | * | ||
| 7 | * This file implements counting semaphores. | ||
| 8 | * A counting semaphore may be acquired 'n' times before sleeping. | ||
| 9 | * See mutex.c for single-acquisition sleeping locks which enforce | ||
| 10 | * rules which allow code to be debugged more easily. | ||
| 11 | */ | ||
| 12 | |||
| 13 | /* | ||
| 14 | * Some notes on the implementation: | ||
| 15 | * | ||
| 16 | * The spinlock controls access to the other members of the semaphore. | ||
| 17 | * down_trylock() and up() can be called from interrupt context, so we | ||
| 18 | * have to disable interrupts when taking the lock. It turns out various | ||
| 19 | * parts of the kernel expect to be able to use down() on a semaphore in | ||
| 20 | * interrupt context when they know it will succeed, so we have to use | ||
| 21 | * irqsave variants for down(), down_interruptible() and down_killable() | ||
| 22 | * too. | ||
| 23 | * | ||
| 24 | * The ->count variable represents how many more tasks can acquire this | ||
| 25 | * semaphore. If it's zero, there may be tasks waiting on the wait_list. | ||
| 26 | */ | ||
| 27 | |||
| 28 | #include <linux/compiler.h> | ||
| 29 | #include <linux/kernel.h> | ||
| 30 | #include <linux/module.h> | ||
| 31 | #include <linux/sched.h> | ||
| 32 | #include <linux/semaphore.h> | ||
| 33 | #include <linux/spinlock.h> | ||
| 34 | |||
| 35 | static noinline void __down(struct semaphore *sem); | ||
| 36 | static noinline int __down_interruptible(struct semaphore *sem); | ||
| 37 | static noinline int __down_killable(struct semaphore *sem); | ||
| 38 | static noinline int __down_timeout(struct semaphore *sem, long jiffies); | ||
| 39 | static noinline void __up(struct semaphore *sem); | ||
| 40 | |||
| 41 | /** | ||
| 42 | * down - acquire the semaphore | ||
| 43 | * @sem: the semaphore to be acquired | ||
| 44 | * | ||
| 45 | * Acquires the semaphore. If no more tasks are allowed to acquire the | ||
| 46 | * semaphore, calling this function will put the task to sleep until the | ||
| 47 | * semaphore is released. | ||
| 48 | * | ||
| 49 | * Use of this function is deprecated; please use down_interruptible() or | ||
| 50 | * down_killable() instead. | ||
| 51 | */ | ||
| 52 | void down(struct semaphore *sem) | ||
| 53 | { | ||
| 54 | unsigned long flags; | ||
| 55 | |||
| 56 | spin_lock_irqsave(&sem->lock, flags); | ||
| 57 | if (likely(sem->count > 0)) | ||
| 58 | sem->count--; | ||
| 59 | else | ||
| 60 | __down(sem); | ||
| 61 | spin_unlock_irqrestore(&sem->lock, flags); | ||
| 62 | } | ||
| 63 | EXPORT_SYMBOL(down); | ||
| 64 | |||
| 65 | /** | ||
| 66 | * down_interruptible - acquire the semaphore unless interrupted | ||
| 67 | * @sem: the semaphore to be acquired | ||
| 68 | * | ||
| 69 | * Attempts to acquire the semaphore. If no more tasks are allowed to | ||
| 70 | * acquire the semaphore, calling this function will put the task to sleep. | ||
| 71 | * If the sleep is interrupted by a signal, this function will return -EINTR. | ||
| 72 | * If the semaphore is successfully acquired, this function returns 0. | ||
| 73 | */ | ||
| 74 | int down_interruptible(struct semaphore *sem) | ||
| 75 | { | ||
| 76 | unsigned long flags; | ||
| 77 | int result = 0; | ||
| 78 | |||
| 79 | spin_lock_irqsave(&sem->lock, flags); | ||
| 80 | if (likely(sem->count > 0)) | ||
| 81 | sem->count--; | ||
| 82 | else | ||
| 83 | result = __down_interruptible(sem); | ||
| 84 | spin_unlock_irqrestore(&sem->lock, flags); | ||
| 85 | |||
| 86 | return result; | ||
| 87 | } | ||
| 88 | EXPORT_SYMBOL(down_interruptible); | ||
| 89 | |||
| 90 | /** | ||
| 91 | * down_killable - acquire the semaphore unless killed | ||
| 92 | * @sem: the semaphore to be acquired | ||
| 93 | * | ||
| 94 | * Attempts to acquire the semaphore. If no more tasks are allowed to | ||
| 95 | * acquire the semaphore, calling this function will put the task to sleep. | ||
| 96 | * If the sleep is interrupted by a fatal signal, this function will return | ||
| 97 | * -EINTR. If the semaphore is successfully acquired, this function returns | ||
| 98 | * 0. | ||
| 99 | */ | ||
| 100 | int down_killable(struct semaphore *sem) | ||
| 101 | { | ||
| 102 | unsigned long flags; | ||
| 103 | int result = 0; | ||
| 104 | |||
| 105 | spin_lock_irqsave(&sem->lock, flags); | ||
| 106 | if (likely(sem->count > 0)) | ||
| 107 | sem->count--; | ||
| 108 | else | ||
| 109 | result = __down_killable(sem); | ||
| 110 | spin_unlock_irqrestore(&sem->lock, flags); | ||
| 111 | |||
| 112 | return result; | ||
| 113 | } | ||
| 114 | EXPORT_SYMBOL(down_killable); | ||
| 115 | |||
| 116 | /** | ||
| 117 | * down_trylock - try to acquire the semaphore, without waiting | ||
| 118 | * @sem: the semaphore to be acquired | ||
| 119 | * | ||
| 120 | * Try to acquire the semaphore atomically. Returns 0 if the semaphore has | ||
| 121 | * been acquired successfully or 1 if it cannot be acquired. | ||
| 122 | * | ||
| 123 | * NOTE: This return value is inverted from both spin_trylock and | ||
| 124 | * mutex_trylock! Be careful about this when converting code. | ||
| 125 | * | ||
| 126 | * Unlike mutex_trylock, this function can be used from interrupt context, | ||
| 127 | * and the semaphore can be released by any task or interrupt. | ||
| 128 | */ | ||
| 129 | int down_trylock(struct semaphore *sem) | ||
| 130 | { | ||
| 131 | unsigned long flags; | ||
| 132 | int count; | ||
| 133 | |||
| 134 | spin_lock_irqsave(&sem->lock, flags); | ||
| 135 | count = sem->count - 1; | ||
| 136 | if (likely(count >= 0)) | ||
| 137 | sem->count = count; | ||
| 138 | spin_unlock_irqrestore(&sem->lock, flags); | ||
| 139 | |||
| 140 | return (count < 0); | ||
| 141 | } | ||
| 142 | EXPORT_SYMBOL(down_trylock); | ||
| 143 | |||
| 144 | /** | ||
| 145 | * down_timeout - acquire the semaphore within a specified time | ||
| 146 | * @sem: the semaphore to be acquired | ||
| 147 | * @jiffies: how long to wait before failing | ||
| 148 | * | ||
| 149 | * Attempts to acquire the semaphore. If no more tasks are allowed to | ||
| 150 | * acquire the semaphore, calling this function will put the task to sleep. | ||
| 151 | * If the semaphore is not released within the specified number of jiffies, | ||
| 152 | * this function returns -ETIME. It returns 0 if the semaphore was acquired. | ||
| 153 | */ | ||
| 154 | int down_timeout(struct semaphore *sem, long jiffies) | ||
| 155 | { | ||
| 156 | unsigned long flags; | ||
| 157 | int result = 0; | ||
| 158 | |||
| 159 | spin_lock_irqsave(&sem->lock, flags); | ||
| 160 | if (likely(sem->count > 0)) | ||
| 161 | sem->count--; | ||
| 162 | else | ||
| 163 | result = __down_timeout(sem, jiffies); | ||
| 164 | spin_unlock_irqrestore(&sem->lock, flags); | ||
| 165 | |||
| 166 | return result; | ||
| 167 | } | ||
| 168 | EXPORT_SYMBOL(down_timeout); | ||
| 169 | |||
| 170 | /** | ||
| 171 | * up - release the semaphore | ||
| 172 | * @sem: the semaphore to release | ||
| 173 | * | ||
| 174 | * Release the semaphore. Unlike mutexes, up() may be called from any | ||
| 175 | * context and even by tasks which have never called down(). | ||
| 176 | */ | ||
| 177 | void up(struct semaphore *sem) | ||
| 178 | { | ||
| 179 | unsigned long flags; | ||
| 180 | |||
| 181 | spin_lock_irqsave(&sem->lock, flags); | ||
| 182 | if (likely(list_empty(&sem->wait_list))) | ||
| 183 | sem->count++; | ||
| 184 | else | ||
| 185 | __up(sem); | ||
| 186 | spin_unlock_irqrestore(&sem->lock, flags); | ||
| 187 | } | ||
| 188 | EXPORT_SYMBOL(up); | ||
| 189 | |||
| 190 | /* Functions for the contended case */ | ||
| 191 | |||
| 192 | struct semaphore_waiter { | ||
| 193 | struct list_head list; | ||
| 194 | struct task_struct *task; | ||
| 195 | int up; | ||
| 196 | }; | ||
| 197 | |||
| 198 | /* | ||
| 199 | * Because this function is inlined, the 'state' parameter will be | ||
| 200 | * constant, and thus optimised away by the compiler. Likewise the | ||
| 201 | * 'timeout' parameter for the cases without timeouts. | ||
| 202 | */ | ||
| 203 | static inline int __sched __down_common(struct semaphore *sem, long state, | ||
| 204 | long timeout) | ||
| 205 | { | ||
| 206 | struct task_struct *task = current; | ||
| 207 | struct semaphore_waiter waiter; | ||
| 208 | |||
| 209 | list_add_tail(&waiter.list, &sem->wait_list); | ||
| 210 | waiter.task = task; | ||
| 211 | waiter.up = 0; | ||
| 212 | |||
| 213 | for (;;) { | ||
| 214 | if (state == TASK_INTERRUPTIBLE && signal_pending(task)) | ||
| 215 | goto interrupted; | ||
| 216 | if (state == TASK_KILLABLE && fatal_signal_pending(task)) | ||
| 217 | goto interrupted; | ||
| 218 | if (timeout <= 0) | ||
| 219 | goto timed_out; | ||
| 220 | __set_task_state(task, state); | ||
| 221 | spin_unlock_irq(&sem->lock); | ||
| 222 | timeout = schedule_timeout(timeout); | ||
| 223 | spin_lock_irq(&sem->lock); | ||
| 224 | if (waiter.up) | ||
| 225 | return 0; | ||
| 226 | } | ||
| 227 | |||
| 228 | timed_out: | ||
| 229 | list_del(&waiter.list); | ||
| 230 | return -ETIME; | ||
| 231 | |||
| 232 | interrupted: | ||
| 233 | list_del(&waiter.list); | ||
| 234 | return -EINTR; | ||
| 235 | } | ||
| 236 | |||
| 237 | static noinline void __sched __down(struct semaphore *sem) | ||
| 238 | { | ||
| 239 | __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
| 240 | } | ||
| 241 | |||
| 242 | static noinline int __sched __down_interruptible(struct semaphore *sem) | ||
| 243 | { | ||
| 244 | return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
| 245 | } | ||
| 246 | |||
| 247 | static noinline int __sched __down_killable(struct semaphore *sem) | ||
| 248 | { | ||
| 249 | return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); | ||
| 250 | } | ||
| 251 | |||
| 252 | static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) | ||
| 253 | { | ||
| 254 | return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); | ||
| 255 | } | ||
| 256 | |||
| 257 | static noinline void __sched __up(struct semaphore *sem) | ||
| 258 | { | ||
| 259 | struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, | ||
| 260 | struct semaphore_waiter, list); | ||
| 261 | list_del(&waiter->list); | ||
| 262 | waiter->up = 1; | ||
| 263 | wake_up_process(waiter->task); | ||
| 264 | } | ||
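The new generic kernel/semaphore.c keeps the long-standing counting-semaphore API. A minimal usage sketch for a hypothetical driver — the foo_* names are invented; only the sema_init()/down_interruptible()/up() calls are the real interface from <linux/semaphore.h>:

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/semaphore.h>

static struct semaphore foo_sem;		/* allows up to 4 concurrent holders */

static int __init foo_init(void)
{
	sema_init(&foo_sem, 4);			/* count starts at 4 */
	return 0;
}

static int foo_use_resource(void)
{
	/* Sleeps while all four slots are taken; -EINTR if a signal arrives. */
	if (down_interruptible(&foo_sem))
		return -EINTR;

	/* ... touch the shared resource ... */

	up(&foo_sem);				/* free the slot, possibly waking a waiter */
	return 0;
}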
diff --git a/kernel/signal.c b/kernel/signal.c index 6af1210092c3..72bb4f51f963 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -39,11 +39,19 @@ | |||
| 39 | 39 | ||
| 40 | static struct kmem_cache *sigqueue_cachep; | 40 | static struct kmem_cache *sigqueue_cachep; |
| 41 | 41 | ||
| 42 | static int __sig_ignored(struct task_struct *t, int sig) | ||
| 43 | { | ||
| 44 | void __user *handler; | ||
| 45 | |||
| 46 | /* Is it explicitly or implicitly ignored? */ | ||
| 47 | |||
| 48 | handler = t->sighand->action[sig - 1].sa.sa_handler; | ||
| 49 | return handler == SIG_IGN || | ||
| 50 | (handler == SIG_DFL && sig_kernel_ignore(sig)); | ||
| 51 | } | ||
| 42 | 52 | ||
| 43 | static int sig_ignored(struct task_struct *t, int sig) | 53 | static int sig_ignored(struct task_struct *t, int sig) |
| 44 | { | 54 | { |
| 45 | void __user * handler; | ||
| 46 | |||
| 47 | /* | 55 | /* |
| 48 | * Tracers always want to know about signals.. | 56 | * Tracers always want to know about signals.. |
| 49 | */ | 57 | */ |
| @@ -58,10 +66,7 @@ static int sig_ignored(struct task_struct *t, int sig) | |||
| 58 | if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) | 66 | if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) |
| 59 | return 0; | 67 | return 0; |
| 60 | 68 | ||
| 61 | /* Is it explicitly or implicitly ignored? */ | 69 | return __sig_ignored(t, sig); |
| 62 | handler = t->sighand->action[sig-1].sa.sa_handler; | ||
| 63 | return handler == SIG_IGN || | ||
| 64 | (handler == SIG_DFL && sig_kernel_ignore(sig)); | ||
| 65 | } | 70 | } |
| 66 | 71 | ||
| 67 | /* | 72 | /* |
| @@ -220,7 +225,7 @@ void flush_signals(struct task_struct *t) | |||
| 220 | unsigned long flags; | 225 | unsigned long flags; |
| 221 | 226 | ||
| 222 | spin_lock_irqsave(&t->sighand->siglock, flags); | 227 | spin_lock_irqsave(&t->sighand->siglock, flags); |
| 223 | clear_tsk_thread_flag(t,TIF_SIGPENDING); | 228 | clear_tsk_thread_flag(t, TIF_SIGPENDING); |
| 224 | flush_sigqueue(&t->pending); | 229 | flush_sigqueue(&t->pending); |
| 225 | flush_sigqueue(&t->signal->shared_pending); | 230 | flush_sigqueue(&t->signal->shared_pending); |
| 226 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 231 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
| @@ -372,7 +377,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
| 372 | */ | 377 | */ |
| 373 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | 378 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |
| 374 | { | 379 | { |
| 375 | int signr = 0; | 380 | int signr; |
| 376 | 381 | ||
| 377 | /* We only dequeue private signals from ourselves, we don't let | 382 | /* We only dequeue private signals from ourselves, we don't let |
| 378 | * signalfd steal them | 383 | * signalfd steal them |
| @@ -405,8 +410,12 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 405 | } | 410 | } |
| 406 | } | 411 | } |
| 407 | } | 412 | } |
| 413 | |||
| 408 | recalc_sigpending(); | 414 | recalc_sigpending(); |
| 409 | if (signr && unlikely(sig_kernel_stop(signr))) { | 415 | if (!signr) |
| 416 | return 0; | ||
| 417 | |||
| 418 | if (unlikely(sig_kernel_stop(signr))) { | ||
| 410 | /* | 419 | /* |
| 411 | * Set a marker that we have dequeued a stop signal. Our | 420 | * Set a marker that we have dequeued a stop signal. Our |
| 412 | * caller might release the siglock and then the pending | 421 | * caller might release the siglock and then the pending |
| @@ -422,9 +431,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 422 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) | 431 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) |
| 423 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 432 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; |
| 424 | } | 433 | } |
| 425 | if (signr && | 434 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { |
| 426 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | ||
| 427 | info->si_sys_private){ | ||
| 428 | /* | 435 | /* |
| 429 | * Release the siglock to ensure proper locking order | 436 | * Release the siglock to ensure proper locking order |
| 430 | * of timer locks outside of siglocks. Note, we leave | 437 | * of timer locks outside of siglocks. Note, we leave |
| @@ -526,21 +533,34 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s) | |||
| 526 | static int check_kill_permission(int sig, struct siginfo *info, | 533 | static int check_kill_permission(int sig, struct siginfo *info, |
| 527 | struct task_struct *t) | 534 | struct task_struct *t) |
| 528 | { | 535 | { |
| 529 | int error = -EINVAL; | 536 | struct pid *sid; |
| 537 | int error; | ||
| 538 | |||
| 530 | if (!valid_signal(sig)) | 539 | if (!valid_signal(sig)) |
| 531 | return error; | 540 | return -EINVAL; |
| 532 | 541 | ||
| 533 | if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { | 542 | if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) |
| 534 | error = audit_signal_info(sig, t); /* Let audit system see the signal */ | 543 | return 0; |
| 535 | if (error) | 544 | |
| 536 | return error; | 545 | error = audit_signal_info(sig, t); /* Let audit system see the signal */ |
| 537 | error = -EPERM; | 546 | if (error) |
| 538 | if (((sig != SIGCONT) || | ||
| 539 | (task_session_nr(current) != task_session_nr(t))) | ||
| 540 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | ||
| 541 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | ||
| 542 | && !capable(CAP_KILL)) | ||
| 543 | return error; | 547 | return error; |
| 548 | |||
| 549 | if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && | ||
| 550 | (current->uid ^ t->suid) && (current->uid ^ t->uid) && | ||
| 551 | !capable(CAP_KILL)) { | ||
| 552 | switch (sig) { | ||
| 553 | case SIGCONT: | ||
| 554 | sid = task_session(t); | ||
| 555 | /* | ||
| 556 | * We don't return the error if sid == NULL. The | ||
| 557 | * task was unhashed; the caller must notice this. | ||
| 558 | */ | ||
| 559 | if (!sid || sid == task_session(current)) | ||
| 560 | break; | ||
| 561 | default: | ||
| 562 | return -EPERM; | ||
| 563 | } | ||
| 544 | } | 564 | } |
| 545 | 565 | ||
| 546 | return security_task_kill(t, info, sig, 0); | 566 | return security_task_kill(t, info, sig, 0); |
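The rewritten check_kill_permission() keeps the traditional XOR idiom: (a ^ b) is non-zero exactly when a != b, so the chain denies the signal only when the sender's euid and uid match neither the target's uid nor suid and CAP_KILL is absent, with SIGCONT to the same session exempted separately by the switch. Spelled out with plain comparisons (illustration only, not kernel code):

#include <sys/types.h>

/* Boolean-equivalent form of the XOR chain in check_kill_permission(). */
static int sender_may_signal(uid_t euid, uid_t uid,		/* sender */
			     uid_t t_uid, uid_t t_suid,		/* target */
			     int has_cap_kill)
{
	return euid == t_suid || euid == t_uid ||
	       uid  == t_suid || uid  == t_uid ||
	       has_cap_kill;
}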
| @@ -550,62 +570,44 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 550 | static void do_notify_parent_cldstop(struct task_struct *tsk, int why); | 570 | static void do_notify_parent_cldstop(struct task_struct *tsk, int why); |
| 551 | 571 | ||
| 552 | /* | 572 | /* |
| 553 | * Handle magic process-wide effects of stop/continue signals. | 573 | * Handle magic process-wide effects of stop/continue signals. Unlike |
| 554 | * Unlike the signal actions, these happen immediately at signal-generation | 574 | * the signal actions, these happen immediately at signal-generation |
| 555 | * time regardless of blocking, ignoring, or handling. This does the | 575 | * time regardless of blocking, ignoring, or handling. This does the |
| 556 | * actual continuing for SIGCONT, but not the actual stopping for stop | 576 | * actual continuing for SIGCONT, but not the actual stopping for stop |
| 557 | * signals. The process stop is done as a signal action for SIG_DFL. | 577 | * signals. The process stop is done as a signal action for SIG_DFL. |
| 578 | * | ||
| 579 | * Returns true if the signal should be actually delivered, otherwise | ||
| 580 | * it should be dropped. | ||
| 558 | */ | 581 | */ |
| 559 | static void handle_stop_signal(int sig, struct task_struct *p) | 582 | static int prepare_signal(int sig, struct task_struct *p) |
| 560 | { | 583 | { |
| 584 | struct signal_struct *signal = p->signal; | ||
| 561 | struct task_struct *t; | 585 | struct task_struct *t; |
| 562 | 586 | ||
| 563 | if (p->signal->flags & SIGNAL_GROUP_EXIT) | 587 | if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { |
| 564 | /* | 588 | /* |
| 565 | * The process is in the middle of dying already. | 589 | * The process is in the middle of dying, nothing to do. |
| 566 | */ | 590 | */ |
| 567 | return; | 591 | } else if (sig_kernel_stop(sig)) { |
| 568 | |||
| 569 | if (sig_kernel_stop(sig)) { | ||
| 570 | /* | 592 | /* |
| 571 | * This is a stop signal. Remove SIGCONT from all queues. | 593 | * This is a stop signal. Remove SIGCONT from all queues. |
| 572 | */ | 594 | */ |
| 573 | rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); | 595 | rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); |
| 574 | t = p; | 596 | t = p; |
| 575 | do { | 597 | do { |
| 576 | rm_from_queue(sigmask(SIGCONT), &t->pending); | 598 | rm_from_queue(sigmask(SIGCONT), &t->pending); |
| 577 | t = next_thread(t); | 599 | } while_each_thread(p, t); |
| 578 | } while (t != p); | ||
| 579 | } else if (sig == SIGCONT) { | 600 | } else if (sig == SIGCONT) { |
| 601 | unsigned int why; | ||
| 580 | /* | 602 | /* |
| 581 | * Remove all stop signals from all queues, | 603 | * Remove all stop signals from all queues, |
| 582 | * and wake all threads. | 604 | * and wake all threads. |
| 583 | */ | 605 | */ |
| 584 | if (unlikely(p->signal->group_stop_count > 0)) { | 606 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); |
| 585 | /* | ||
| 586 | * There was a group stop in progress. We'll | ||
| 587 | * pretend it finished before we got here. We are | ||
| 588 | * obliged to report it to the parent: if the | ||
| 589 | * SIGSTOP happened "after" this SIGCONT, then it | ||
| 590 | * would have cleared this pending SIGCONT. If it | ||
| 591 | * happened "before" this SIGCONT, then the parent | ||
| 592 | * got the SIGCHLD about the stop finishing before | ||
| 593 | * the continue happened. We do the notification | ||
| 594 | * now, and it's as if the stop had finished and | ||
| 595 | * the SIGCHLD was pending on entry to this kill. | ||
| 596 | */ | ||
| 597 | p->signal->group_stop_count = 0; | ||
| 598 | p->signal->flags = SIGNAL_STOP_CONTINUED; | ||
| 599 | spin_unlock(&p->sighand->siglock); | ||
| 600 | do_notify_parent_cldstop(p, CLD_STOPPED); | ||
| 601 | spin_lock(&p->sighand->siglock); | ||
| 602 | } | ||
| 603 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); | ||
| 604 | t = p; | 607 | t = p; |
| 605 | do { | 608 | do { |
| 606 | unsigned int state; | 609 | unsigned int state; |
| 607 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | 610 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); |
| 608 | |||
| 609 | /* | 611 | /* |
| 610 | * If there is a handler for SIGCONT, we must make | 612 | * If there is a handler for SIGCONT, we must make |
| 611 | * sure that no thread returns to user mode before | 613 | * sure that no thread returns to user mode before |
| @@ -615,7 +617,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
| 615 | * running the handler. With the TIF_SIGPENDING | 617 | * running the handler. With the TIF_SIGPENDING |
| 616 | * flag set, the thread will pause and acquire the | 618 | * flag set, the thread will pause and acquire the |
| 617 | * siglock that we hold now and until we've queued | 619 | * siglock that we hold now and until we've queued |
| 618 | * the pending signal. | 620 | * the pending signal. |
| 619 | * | 621 | * |
| 620 | * Wake up the stopped thread _after_ setting | 622 | * Wake up the stopped thread _after_ setting |
| 621 | * TIF_SIGPENDING | 623 | * TIF_SIGPENDING |
| @@ -626,49 +628,163 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
| 626 | state |= TASK_INTERRUPTIBLE; | 628 | state |= TASK_INTERRUPTIBLE; |
| 627 | } | 629 | } |
| 628 | wake_up_state(t, state); | 630 | wake_up_state(t, state); |
| 631 | } while_each_thread(p, t); | ||
| 629 | 632 | ||
| 630 | t = next_thread(t); | 633 | /* |
| 631 | } while (t != p); | 634 | * Notify the parent with CLD_CONTINUED if we were stopped. |
| 635 | * | ||
| 636 | * If we were in the middle of a group stop, we pretend it | ||
| 637 | * was already finished, and then continued. Since SIGCHLD | ||
| 638 | * doesn't queue we report only CLD_STOPPED, as if the next | ||
| 639 | * CLD_CONTINUED was dropped. | ||
| 640 | */ | ||
| 641 | why = 0; | ||
| 642 | if (signal->flags & SIGNAL_STOP_STOPPED) | ||
| 643 | why |= SIGNAL_CLD_CONTINUED; | ||
| 644 | else if (signal->group_stop_count) | ||
| 645 | why |= SIGNAL_CLD_STOPPED; | ||
| 632 | 646 | ||
| 633 | if (p->signal->flags & SIGNAL_STOP_STOPPED) { | 647 | if (why) { |
| 634 | /* | 648 | /* |
| 635 | * We were in fact stopped, and are now continued. | 649 | * The first thread which returns from finish_stop() |
| 636 | * Notify the parent with CLD_CONTINUED. | 650 | * will take ->siglock, notice SIGNAL_CLD_MASK, and |
| 651 | * notify its parent. See get_signal_to_deliver(). | ||
| 637 | */ | 652 | */ |
| 638 | p->signal->flags = SIGNAL_STOP_CONTINUED; | 653 | signal->flags = why | SIGNAL_STOP_CONTINUED; |
| 639 | p->signal->group_exit_code = 0; | 654 | signal->group_stop_count = 0; |
| 640 | spin_unlock(&p->sighand->siglock); | 655 | signal->group_exit_code = 0; |
| 641 | do_notify_parent_cldstop(p, CLD_CONTINUED); | ||
| 642 | spin_lock(&p->sighand->siglock); | ||
| 643 | } else { | 656 | } else { |
| 644 | /* | 657 | /* |
| 645 | * We are not stopped, but there could be a stop | 658 | * We are not stopped, but there could be a stop |
| 646 | * signal in the middle of being processed after | 659 | * signal in the middle of being processed after |
| 647 | * being removed from the queue. Clear that too. | 660 | * being removed from the queue. Clear that too. |
| 648 | */ | 661 | */ |
| 649 | p->signal->flags = 0; | 662 | signal->flags &= ~SIGNAL_STOP_DEQUEUED; |
| 663 | } | ||
| 664 | } | ||
| 665 | |||
| 666 | return !sig_ignored(p, sig); | ||
| 667 | } | ||
| 668 | |||
| 669 | /* | ||
| 670 | * Test if P wants to take SIG. After we've checked all threads with this, | ||
| 671 | * it's equivalent to finding no threads not blocking SIG. Any threads not | ||
| 672 | * blocking SIG were ruled out because they are not running and already | ||
| 673 | * have pending signals. Such threads will dequeue from the shared queue | ||
| 674 | * as soon as they're available, so putting the signal on the shared queue | ||
| 675 | * will be equivalent to sending it to one such thread. | ||
| 676 | */ | ||
| 677 | static inline int wants_signal(int sig, struct task_struct *p) | ||
| 678 | { | ||
| 679 | if (sigismember(&p->blocked, sig)) | ||
| 680 | return 0; | ||
| 681 | if (p->flags & PF_EXITING) | ||
| 682 | return 0; | ||
| 683 | if (sig == SIGKILL) | ||
| 684 | return 1; | ||
| 685 | if (task_is_stopped_or_traced(p)) | ||
| 686 | return 0; | ||
| 687 | return task_curr(p) || !signal_pending(p); | ||
| 688 | } | ||
| 689 | |||
| 690 | static void complete_signal(int sig, struct task_struct *p, int group) | ||
| 691 | { | ||
| 692 | struct signal_struct *signal = p->signal; | ||
| 693 | struct task_struct *t; | ||
| 694 | |||
| 695 | /* | ||
| 696 | * Now find a thread we can wake up to take the signal off the queue. | ||
| 697 | * | ||
| 698 | * If the main thread wants the signal, it gets first crack. | ||
| 699 | * Probably the least surprising to the average bear. | ||
| 700 | */ | ||
| 701 | if (wants_signal(sig, p)) | ||
| 702 | t = p; | ||
| 703 | else if (!group || thread_group_empty(p)) | ||
| 704 | /* | ||
| 705 | * There is just one thread and it does not need to be woken. | ||
| 706 | * It will dequeue unblocked signals before it runs again. | ||
| 707 | */ | ||
| 708 | return; | ||
| 709 | else { | ||
| 710 | /* | ||
| 711 | * Otherwise try to find a suitable thread. | ||
| 712 | */ | ||
| 713 | t = signal->curr_target; | ||
| 714 | while (!wants_signal(sig, t)) { | ||
| 715 | t = next_thread(t); | ||
| 716 | if (t == signal->curr_target) | ||
| 717 | /* | ||
| 718 | * No thread needs to be woken. | ||
| 719 | * Any eligible threads will see | ||
| 720 | * the signal in the queue soon. | ||
| 721 | */ | ||
| 722 | return; | ||
| 650 | } | 723 | } |
| 651 | } else if (sig == SIGKILL) { | 724 | signal->curr_target = t; |
| 725 | } | ||
| 726 | |||
| 727 | /* | ||
| 728 | * Found a killable thread. If the signal will be fatal, | ||
| 729 | * then start taking the whole group down immediately. | ||
| 730 | */ | ||
| 731 | if (sig_fatal(p, sig) && | ||
| 732 | !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && | ||
| 733 | !sigismember(&t->real_blocked, sig) && | ||
| 734 | (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { | ||
| 652 | /* | 735 | /* |
| 653 | * Make sure that any pending stop signal already dequeued | 736 | * This signal will be fatal to the whole group. |
| 654 | * is undone by the wakeup for SIGKILL. | ||
| 655 | */ | 737 | */ |
| 656 | p->signal->flags = 0; | 738 | if (!sig_kernel_coredump(sig)) { |
| 739 | /* | ||
| 740 | * Start a group exit and wake everybody up. | ||
| 741 | * This way we don't have other threads | ||
| 742 | * running and doing things after a slower | ||
| 743 | * thread has the fatal signal pending. | ||
| 744 | */ | ||
| 745 | signal->flags = SIGNAL_GROUP_EXIT; | ||
| 746 | signal->group_exit_code = sig; | ||
| 747 | signal->group_stop_count = 0; | ||
| 748 | t = p; | ||
| 749 | do { | ||
| 750 | sigaddset(&t->pending.signal, SIGKILL); | ||
| 751 | signal_wake_up(t, 1); | ||
| 752 | } while_each_thread(p, t); | ||
| 753 | return; | ||
| 754 | } | ||
| 657 | } | 755 | } |
| 756 | |||
| 757 | /* | ||
| 758 | * The signal is already in the shared-pending queue. | ||
| 759 | * Tell the chosen thread to wake up and dequeue it. | ||
| 760 | */ | ||
| 761 | signal_wake_up(t, sig == SIGKILL); | ||
| 762 | return; | ||
| 763 | } | ||
| 764 | |||
| 765 | static inline int legacy_queue(struct sigpending *signals, int sig) | ||
| 766 | { | ||
| 767 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); | ||
| 658 | } | 768 | } |
| 659 | 769 | ||
| 660 | static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | 770 | static int send_signal(int sig, struct siginfo *info, struct task_struct *t, |
| 661 | struct sigpending *signals) | 771 | int group) |
| 662 | { | 772 | { |
| 663 | struct sigqueue * q = NULL; | 773 | struct sigpending *pending; |
| 664 | int ret = 0; | 774 | struct sigqueue *q; |
| 775 | |||
| 776 | assert_spin_locked(&t->sighand->siglock); | ||
| 777 | if (!prepare_signal(sig, t)) | ||
| 778 | return 0; | ||
| 665 | 779 | ||
| 780 | pending = group ? &t->signal->shared_pending : &t->pending; | ||
| 666 | /* | 781 | /* |
| 667 | * Deliver the signal to listening signalfds. This must be called | 782 | * Short-circuit ignored signals and support queuing |
| 668 | * with the sighand lock held. | 783 | * exactly one non-rt signal, so that we can get more |
| 784 | * detailed information about the cause of the signal. | ||
| 669 | */ | 785 | */ |
| 670 | signalfd_notify(t, sig); | 786 | if (legacy_queue(pending, sig)) |
| 671 | 787 | return 0; | |
| 672 | /* | 788 | /* |
| 673 | * fast-pathed signals for kernel-internal things like SIGSTOP | 789 | * fast-pathed signals for kernel-internal things like SIGSTOP |
| 674 | * or SIGKILL. | 790 | * or SIGKILL. |
| @@ -688,7 +804,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 688 | (is_si_special(info) || | 804 | (is_si_special(info) || |
| 689 | info->si_code >= 0))); | 805 | info->si_code >= 0))); |
| 690 | if (q) { | 806 | if (q) { |
| 691 | list_add_tail(&q->list, &signals->list); | 807 | list_add_tail(&q->list, &pending->list); |
| 692 | switch ((unsigned long) info) { | 808 | switch ((unsigned long) info) { |
| 693 | case (unsigned long) SEND_SIG_NOINFO: | 809 | case (unsigned long) SEND_SIG_NOINFO: |
| 694 | q->info.si_signo = sig; | 810 | q->info.si_signo = sig; |
| @@ -718,13 +834,12 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 718 | } | 834 | } |
| 719 | 835 | ||
| 720 | out_set: | 836 | out_set: |
| 721 | sigaddset(&signals->signal, sig); | 837 | signalfd_notify(t, sig); |
| 722 | return ret; | 838 | sigaddset(&pending->signal, sig); |
| 839 | complete_signal(sig, t, group); | ||
| 840 | return 0; | ||
| 723 | } | 841 | } |
| 724 | 842 | ||
| 725 | #define LEGACY_QUEUE(sigptr, sig) \ | ||
| 726 | (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) | ||
| 727 | |||
| 728 | int print_fatal_signals; | 843 | int print_fatal_signals; |
| 729 | 844 | ||
| 730 | static void print_fatal_signal(struct pt_regs *regs, int signr) | 845 | static void print_fatal_signal(struct pt_regs *regs, int signr) |
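With wants_signal() and complete_signal() hoisted above it, send_signal() now serves both the per-thread and the group path. In outline — paraphrased from the hunk above, not a literal copy of the new code:

/* Paraphrase of the unified send_signal() flow; see the real code above. */
static int send_signal_outline(int sig, struct siginfo *info,
			       struct task_struct *t, int group)
{
	struct sigpending *pending;

	if (!prepare_signal(sig, t))	/* stop/cont side effects + sig_ignored() */
		return 0;

	pending = group ? &t->signal->shared_pending : &t->pending;
	if (legacy_queue(pending, sig))	/* non-RT signal already pending */
		return 0;

	/* allocate a struct sigqueue and fill in the siginfo, falling back
	 * to "signal set but no info" when allocation must not fail the caller */

	signalfd_notify(t, sig);
	sigaddset(&pending->signal, sig);
	complete_signal(sig, t, group);	/* choose a thread and wake it */
	return 0;
}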
| @@ -757,29 +872,16 @@ static int __init setup_print_fatal_signals(char *str) | |||
| 757 | 872 | ||
| 758 | __setup("print-fatal-signals=", setup_print_fatal_signals); | 873 | __setup("print-fatal-signals=", setup_print_fatal_signals); |
| 759 | 874 | ||
| 875 | int | ||
| 876 | __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | ||
| 877 | { | ||
| 878 | return send_signal(sig, info, p, 1); | ||
| 879 | } | ||
| 880 | |||
| 760 | static int | 881 | static int |
| 761 | specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) | 882 | specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) |
| 762 | { | 883 | { |
| 763 | int ret = 0; | 884 | return send_signal(sig, info, t, 0); |
| 764 | |||
| 765 | BUG_ON(!irqs_disabled()); | ||
| 766 | assert_spin_locked(&t->sighand->siglock); | ||
| 767 | |||
| 768 | /* Short-circuit ignored signals. */ | ||
| 769 | if (sig_ignored(t, sig)) | ||
| 770 | goto out; | ||
| 771 | |||
| 772 | /* Support queueing exactly one non-rt signal, so that we | ||
| 773 | can get more detailed information about the cause of | ||
| 774 | the signal. */ | ||
| 775 | if (LEGACY_QUEUE(&t->pending, sig)) | ||
| 776 | goto out; | ||
| 777 | |||
| 778 | ret = send_signal(sig, info, t, &t->pending); | ||
| 779 | if (!ret && !sigismember(&t->blocked, sig)) | ||
| 780 | signal_wake_up(t, sig == SIGKILL); | ||
| 781 | out: | ||
| 782 | return ret; | ||
| 783 | } | 885 | } |
| 784 | 886 | ||
| 785 | /* | 887 | /* |
| @@ -790,7 +892,8 @@ out: | |||
| 790 | * since we do not want to have a signal handler that was blocked | 892 | * since we do not want to have a signal handler that was blocked |
| 791 | * be invoked when user space had explicitly blocked it. | 893 | * be invoked when user space had explicitly blocked it. |
| 792 | * | 894 | * |
| 793 | * We don't want to have recursive SIGSEGV's etc, for example. | 895 | * We don't want to have recursive SIGSEGV's etc, for example, |
| 896 | * that is why we also clear SIGNAL_UNKILLABLE. | ||
| 794 | */ | 897 | */ |
| 795 | int | 898 | int |
| 796 | force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | 899 | force_sig_info(int sig, struct siginfo *info, struct task_struct *t) |
| @@ -810,6 +913,8 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
| 810 | recalc_sigpending_and_wake(t); | 913 | recalc_sigpending_and_wake(t); |
| 811 | } | 914 | } |
| 812 | } | 915 | } |
| 916 | if (action->sa.sa_handler == SIG_DFL) | ||
| 917 | t->signal->flags &= ~SIGNAL_UNKILLABLE; | ||
| 813 | ret = specific_send_sig_info(sig, info, t); | 918 | ret = specific_send_sig_info(sig, info, t); |
| 814 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 919 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
| 815 | 920 | ||
| @@ -823,134 +928,6 @@ force_sig_specific(int sig, struct task_struct *t) | |||
| 823 | } | 928 | } |
| 824 | 929 | ||
| 825 | /* | 930 | /* |
| 826 | * Test if P wants to take SIG. After we've checked all threads with this, | ||
| 827 | * it's equivalent to finding no threads not blocking SIG. Any threads not | ||
| 828 | * blocking SIG were ruled out because they are not running and already | ||
| 829 | * have pending signals. Such threads will dequeue from the shared queue | ||
| 830 | * as soon as they're available, so putting the signal on the shared queue | ||
| 831 | * will be equivalent to sending it to one such thread. | ||
| 832 | */ | ||
| 833 | static inline int wants_signal(int sig, struct task_struct *p) | ||
| 834 | { | ||
| 835 | if (sigismember(&p->blocked, sig)) | ||
| 836 | return 0; | ||
| 837 | if (p->flags & PF_EXITING) | ||
| 838 | return 0; | ||
| 839 | if (sig == SIGKILL) | ||
| 840 | return 1; | ||
| 841 | if (task_is_stopped_or_traced(p)) | ||
| 842 | return 0; | ||
| 843 | return task_curr(p) || !signal_pending(p); | ||
| 844 | } | ||
| 845 | |||
| 846 | static void | ||
| 847 | __group_complete_signal(int sig, struct task_struct *p) | ||
| 848 | { | ||
| 849 | struct task_struct *t; | ||
| 850 | |||
| 851 | /* | ||
| 852 | * Now find a thread we can wake up to take the signal off the queue. | ||
| 853 | * | ||
| 854 | * If the main thread wants the signal, it gets first crack. | ||
| 855 | * Probably the least surprising to the average bear. | ||
| 856 | */ | ||
| 857 | if (wants_signal(sig, p)) | ||
| 858 | t = p; | ||
| 859 | else if (thread_group_empty(p)) | ||
| 860 | /* | ||
| 861 | * There is just one thread and it does not need to be woken. | ||
| 862 | * It will dequeue unblocked signals before it runs again. | ||
| 863 | */ | ||
| 864 | return; | ||
| 865 | else { | ||
| 866 | /* | ||
| 867 | * Otherwise try to find a suitable thread. | ||
| 868 | */ | ||
| 869 | t = p->signal->curr_target; | ||
| 870 | if (t == NULL) | ||
| 871 | /* restart balancing at this thread */ | ||
| 872 | t = p->signal->curr_target = p; | ||
| 873 | |||
| 874 | while (!wants_signal(sig, t)) { | ||
| 875 | t = next_thread(t); | ||
| 876 | if (t == p->signal->curr_target) | ||
| 877 | /* | ||
| 878 | * No thread needs to be woken. | ||
| 879 | * Any eligible threads will see | ||
| 880 | * the signal in the queue soon. | ||
| 881 | */ | ||
| 882 | return; | ||
| 883 | } | ||
| 884 | p->signal->curr_target = t; | ||
| 885 | } | ||
| 886 | |||
| 887 | /* | ||
| 888 | * Found a killable thread. If the signal will be fatal, | ||
| 889 | * then start taking the whole group down immediately. | ||
| 890 | */ | ||
| 891 | if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) && | ||
| 892 | !sigismember(&t->real_blocked, sig) && | ||
| 893 | (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { | ||
| 894 | /* | ||
| 895 | * This signal will be fatal to the whole group. | ||
| 896 | */ | ||
| 897 | if (!sig_kernel_coredump(sig)) { | ||
| 898 | /* | ||
| 899 | * Start a group exit and wake everybody up. | ||
| 900 | * This way we don't have other threads | ||
| 901 | * running and doing things after a slower | ||
| 902 | * thread has the fatal signal pending. | ||
| 903 | */ | ||
| 904 | p->signal->flags = SIGNAL_GROUP_EXIT; | ||
| 905 | p->signal->group_exit_code = sig; | ||
| 906 | p->signal->group_stop_count = 0; | ||
| 907 | t = p; | ||
| 908 | do { | ||
| 909 | sigaddset(&t->pending.signal, SIGKILL); | ||
| 910 | signal_wake_up(t, 1); | ||
| 911 | } while_each_thread(p, t); | ||
| 912 | return; | ||
| 913 | } | ||
| 914 | } | ||
| 915 | |||
| 916 | /* | ||
| 917 | * The signal is already in the shared-pending queue. | ||
| 918 | * Tell the chosen thread to wake up and dequeue it. | ||
| 919 | */ | ||
| 920 | signal_wake_up(t, sig == SIGKILL); | ||
| 921 | return; | ||
| 922 | } | ||
| 923 | |||
| 924 | int | ||
| 925 | __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | ||
| 926 | { | ||
| 927 | int ret = 0; | ||
| 928 | |||
| 929 | assert_spin_locked(&p->sighand->siglock); | ||
| 930 | handle_stop_signal(sig, p); | ||
| 931 | |||
| 932 | /* Short-circuit ignored signals. */ | ||
| 933 | if (sig_ignored(p, sig)) | ||
| 934 | return ret; | ||
| 935 | |||
| 936 | if (LEGACY_QUEUE(&p->signal->shared_pending, sig)) | ||
| 937 | /* This is a non-RT signal and we already have one queued. */ | ||
| 938 | return ret; | ||
| 939 | |||
| 940 | /* | ||
| 941 | * Put this signal on the shared-pending queue, or fail with EAGAIN. | ||
| 942 | * We always use the shared queue for process-wide signals, | ||
| 943 | * to avoid several races. | ||
| 944 | */ | ||
| 945 | ret = send_signal(sig, info, p, &p->signal->shared_pending); | ||
| 946 | if (unlikely(ret)) | ||
| 947 | return ret; | ||
| 948 | |||
| 949 | __group_complete_signal(sig, p); | ||
| 950 | return 0; | ||
| 951 | } | ||
| 952 | |||
| 953 | /* | ||
| 954 | * Nuke all other threads in the group. | 931 | * Nuke all other threads in the group. |
| 955 | */ | 932 | */ |
| 956 | void zap_other_threads(struct task_struct *p) | 933 | void zap_other_threads(struct task_struct *p) |
| @@ -978,13 +955,11 @@ int __fatal_signal_pending(struct task_struct *tsk) | |||
| 978 | } | 955 | } |
| 979 | EXPORT_SYMBOL(__fatal_signal_pending); | 956 | EXPORT_SYMBOL(__fatal_signal_pending); |
| 980 | 957 | ||
| 981 | /* | ||
| 982 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | ||
| 983 | */ | ||
| 984 | struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) | 958 | struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) |
| 985 | { | 959 | { |
| 986 | struct sighand_struct *sighand; | 960 | struct sighand_struct *sighand; |
| 987 | 961 | ||
| 962 | rcu_read_lock(); | ||
| 988 | for (;;) { | 963 | for (;;) { |
| 989 | sighand = rcu_dereference(tsk->sighand); | 964 | sighand = rcu_dereference(tsk->sighand); |
| 990 | if (unlikely(sighand == NULL)) | 965 | if (unlikely(sighand == NULL)) |
| @@ -995,6 +970,7 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long | |||
| 995 | break; | 970 | break; |
| 996 | spin_unlock_irqrestore(&sighand->siglock, *flags); | 971 | spin_unlock_irqrestore(&sighand->siglock, *flags); |
| 997 | } | 972 | } |
| 973 | rcu_read_unlock(); | ||
| 998 | 974 | ||
| 999 | return sighand; | 975 | return sighand; |
| 1000 | } | 976 | } |
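With rcu_read_lock()/rcu_read_unlock() moved inside lock_task_sighand(), callers no longer need to open their own RCU read section just to stabilize tsk->sighand; they only need whatever keeps the task_struct itself alive (a reference or an enclosing RCU section around the lookup). A hedged sketch of the resulting caller pattern (kernel-style, illustrative; do_something_with_signals() is a hypothetical placeholder):

    unsigned long flags;

    if (lock_task_sighand(p, &flags)) {
            /* p->sighand is valid, siglock held, interrupts saved */
            do_something_with_signals(p);
            unlock_task_sighand(p, &flags);
    } else {
            /* the task has already exited and released its sighand */
    }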
| @@ -1043,9 +1019,6 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) | |||
| 1043 | struct task_struct *p; | 1019 | struct task_struct *p; |
| 1044 | 1020 | ||
| 1045 | rcu_read_lock(); | 1021 | rcu_read_lock(); |
| 1046 | if (unlikely(sig_needs_tasklist(sig))) | ||
| 1047 | read_lock(&tasklist_lock); | ||
| 1048 | |||
| 1049 | retry: | 1022 | retry: |
| 1050 | p = pid_task(pid, PIDTYPE_PID); | 1023 | p = pid_task(pid, PIDTYPE_PID); |
| 1051 | if (p) { | 1024 | if (p) { |
| @@ -1059,10 +1032,8 @@ retry: | |||
| 1059 | */ | 1032 | */ |
| 1060 | goto retry; | 1033 | goto retry; |
| 1061 | } | 1034 | } |
| 1062 | |||
| 1063 | if (unlikely(sig_needs_tasklist(sig))) | ||
| 1064 | read_unlock(&tasklist_lock); | ||
| 1065 | rcu_read_unlock(); | 1035 | rcu_read_unlock(); |
| 1036 | |||
| 1066 | return error; | 1037 | return error; |
| 1067 | } | 1038 | } |
| 1068 | 1039 | ||
| @@ -1159,8 +1130,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) | |||
| 1159 | */ | 1130 | */ |
| 1160 | 1131 | ||
| 1161 | /* | 1132 | /* |
| 1162 | * These two are the most common entry points. They send a signal | 1133 | * The caller must ensure the task can't exit. |
| 1163 | * just to the specific thread. | ||
| 1164 | */ | 1134 | */ |
| 1165 | int | 1135 | int |
| 1166 | send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | 1136 | send_sig_info(int sig, struct siginfo *info, struct task_struct *p) |
| @@ -1175,17 +1145,9 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
| 1175 | if (!valid_signal(sig)) | 1145 | if (!valid_signal(sig)) |
| 1176 | return -EINVAL; | 1146 | return -EINVAL; |
| 1177 | 1147 | ||
| 1178 | /* | ||
| 1179 | * We need the tasklist lock even for the specific | ||
| 1180 | * thread case (when we don't need to follow the group | ||
| 1181 | * lists) in order to avoid races with "p->sighand" | ||
| 1182 | * going away or changing from under us. | ||
| 1183 | */ | ||
| 1184 | read_lock(&tasklist_lock); | ||
| 1185 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1148 | spin_lock_irqsave(&p->sighand->siglock, flags); |
| 1186 | ret = specific_send_sig_info(sig, info, p); | 1149 | ret = specific_send_sig_info(sig, info, p); |
| 1187 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1150 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
| 1188 | read_unlock(&tasklist_lock); | ||
| 1189 | return ret; | 1151 | return ret; |
| 1190 | } | 1152 | } |
| 1191 | 1153 | ||
| @@ -1291,28 +1253,24 @@ void sigqueue_free(struct sigqueue *q) | |||
| 1291 | __sigqueue_free(q); | 1253 | __sigqueue_free(q); |
| 1292 | } | 1254 | } |
| 1293 | 1255 | ||
| 1294 | int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | 1256 | int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) |
| 1295 | { | 1257 | { |
| 1258 | int sig = q->info.si_signo; | ||
| 1259 | struct sigpending *pending; | ||
| 1296 | unsigned long flags; | 1260 | unsigned long flags; |
| 1297 | int ret = 0; | 1261 | int ret; |
| 1298 | 1262 | ||
| 1299 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1263 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
| 1300 | 1264 | ||
| 1301 | /* | 1265 | ret = -1; |
| 1302 | * The rcu based delayed sighand destroy makes it possible to | 1266 | if (!likely(lock_task_sighand(t, &flags))) |
| 1303 | * run this without tasklist lock held. The task struct itself | 1267 | goto ret; |
| 1304 | * cannot go away as create_timer did get_task_struct(). | ||
| 1305 | * | ||
| 1306 | * We return -1, when the task is marked exiting, so | ||
| 1307 | * posix_timer_event can redirect it to the group leader | ||
| 1308 | */ | ||
| 1309 | rcu_read_lock(); | ||
| 1310 | 1268 | ||
| 1311 | if (!likely(lock_task_sighand(p, &flags))) { | 1269 | ret = 1; /* the signal is ignored */ |
| 1312 | ret = -1; | 1270 | if (!prepare_signal(sig, t)) |
| 1313 | goto out_err; | 1271 | goto out; |
| 1314 | } | ||
| 1315 | 1272 | ||
| 1273 | ret = 0; | ||
| 1316 | if (unlikely(!list_empty(&q->list))) { | 1274 | if (unlikely(!list_empty(&q->list))) { |
| 1317 | /* | 1275 | /* |
| 1318 | * If an SI_TIMER entry is already queued, just increment | 1276 |
| @@ -1322,77 +1280,15 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1322 | q->info.si_overrun++; | 1280 | q->info.si_overrun++; |
| 1323 | goto out; | 1281 | goto out; |
| 1324 | } | 1282 | } |
| 1325 | /* Short-circuit ignored signals. */ | ||
| 1326 | if (sig_ignored(p, sig)) { | ||
| 1327 | ret = 1; | ||
| 1328 | goto out; | ||
| 1329 | } | ||
| 1330 | /* | ||
| 1331 | * Deliver the signal to listening signalfds. This must be called | ||
| 1332 | * with the sighand lock held. | ||
| 1333 | */ | ||
| 1334 | signalfd_notify(p, sig); | ||
| 1335 | |||
| 1336 | list_add_tail(&q->list, &p->pending.list); | ||
| 1337 | sigaddset(&p->pending.signal, sig); | ||
| 1338 | if (!sigismember(&p->blocked, sig)) | ||
| 1339 | signal_wake_up(p, sig == SIGKILL); | ||
| 1340 | |||
| 1341 | out: | ||
| 1342 | unlock_task_sighand(p, &flags); | ||
| 1343 | out_err: | ||
| 1344 | rcu_read_unlock(); | ||
| 1345 | |||
| 1346 | return ret; | ||
| 1347 | } | ||
| 1348 | |||
| 1349 | int | ||
| 1350 | send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | ||
| 1351 | { | ||
| 1352 | unsigned long flags; | ||
| 1353 | int ret = 0; | ||
| 1354 | |||
| 1355 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | ||
| 1356 | |||
| 1357 | read_lock(&tasklist_lock); | ||
| 1358 | /* Since it_lock is held, p->sighand cannot be NULL. */ | ||
| 1359 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 1360 | handle_stop_signal(sig, p); | ||
| 1361 | |||
| 1362 | /* Short-circuit ignored signals. */ | ||
| 1363 | if (sig_ignored(p, sig)) { | ||
| 1364 | ret = 1; | ||
| 1365 | goto out; | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | if (unlikely(!list_empty(&q->list))) { | ||
| 1369 | /* | ||
| 1370 | * If an SI_TIMER entry is already queued, just increment | ||
| 1371 | * the overrun count. Other uses should not try to | ||
| 1372 | * send the signal multiple times. | ||
| 1373 | */ | ||
| 1374 | BUG_ON(q->info.si_code != SI_TIMER); | ||
| 1375 | q->info.si_overrun++; | ||
| 1376 | goto out; | ||
| 1377 | } | ||
| 1378 | /* | ||
| 1379 | * Deliver the signal to listening signalfds. This must be called | ||
| 1380 | * with the sighand lock held. | ||
| 1381 | */ | ||
| 1382 | signalfd_notify(p, sig); | ||
| 1383 | 1283 | ||
| 1384 | /* | 1284 | signalfd_notify(t, sig); |
| 1385 | * Put this signal on the shared-pending queue. | 1285 | pending = group ? &t->signal->shared_pending : &t->pending; |
| 1386 | * We always use the shared queue for process-wide signals, | 1286 | list_add_tail(&q->list, &pending->list); |
| 1387 | * to avoid several races. | 1287 | sigaddset(&pending->signal, sig); |
| 1388 | */ | 1288 | complete_signal(sig, t, group); |
| 1389 | list_add_tail(&q->list, &p->signal->shared_pending.list); | ||
| 1390 | sigaddset(&p->signal->shared_pending.signal, sig); | ||
| 1391 | |||
| 1392 | __group_complete_signal(sig, p); | ||
| 1393 | out: | 1289 | out: |
| 1394 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1290 | unlock_task_sighand(t, &flags); |
| 1395 | read_unlock(&tasklist_lock); | 1291 | ret: |
| 1396 | return ret; | 1292 | return ret; |
| 1397 | } | 1293 | } |
| 1398 | 1294 | ||
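The two sigqueue senders collapse into a single send_sigqueue(q, t, group): the group flag selects between t->pending and t->signal->shared_pending, and the return value keeps its old meanings (0 queued, 1 ignored via prepare_signal(), -1 when the target's sighand is already gone). A hedged sketch of how a posix-timer style caller would use it (illustrative wrapper; the real call site lives in the posix timer code):

    /* Sketch: timer expiry path choosing thread- vs process-directed delivery. */
    int posix_timer_fire(struct sigqueue *q, struct task_struct *target,
                         int thread_directed)
    {
            /* group == 0: queue on target->pending (SIGEV_THREAD_ID style)
             * group != 0: queue on target->signal->shared_pending */
            int ret = send_sigqueue(q, target, !thread_directed);

            if (ret < 0) {
                    /* target is exiting; a real caller would redirect the
                     * event to the group leader or drop it */
            }
            return ret;
    }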
| @@ -1723,8 +1619,9 @@ static int do_signal_stop(int signr) | |||
| 1723 | } else { | 1619 | } else { |
| 1724 | struct task_struct *t; | 1620 | struct task_struct *t; |
| 1725 | 1621 | ||
| 1726 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || | 1622 | if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE)) |
| 1727 | unlikely(sig->group_exit_task)) | 1623 | != SIGNAL_STOP_DEQUEUED) || |
| 1624 | unlikely(signal_group_exit(sig))) | ||
| 1728 | return 0; | 1625 | return 0; |
| 1729 | /* | 1626 | /* |
| 1730 | * There is no group stop already in progress. | 1627 | * There is no group stop already in progress. |
| @@ -1757,11 +1654,51 @@ static int do_signal_stop(int signr) | |||
| 1757 | return 1; | 1654 | return 1; |
| 1758 | } | 1655 | } |
| 1759 | 1656 | ||
| 1657 | static int ptrace_signal(int signr, siginfo_t *info, | ||
| 1658 | struct pt_regs *regs, void *cookie) | ||
| 1659 | { | ||
| 1660 | if (!(current->ptrace & PT_PTRACED)) | ||
| 1661 | return signr; | ||
| 1662 | |||
| 1663 | ptrace_signal_deliver(regs, cookie); | ||
| 1664 | |||
| 1665 | /* Let the debugger run. */ | ||
| 1666 | ptrace_stop(signr, 0, info); | ||
| 1667 | |||
| 1668 | /* We're back. Did the debugger cancel the sig? */ | ||
| 1669 | signr = current->exit_code; | ||
| 1670 | if (signr == 0) | ||
| 1671 | return signr; | ||
| 1672 | |||
| 1673 | current->exit_code = 0; | ||
| 1674 | |||
| 1675 | /* Update the siginfo structure if the signal has | ||
| 1676 | changed. If the debugger wanted something | ||
| 1677 | specific in the siginfo structure then it should | ||
| 1678 | have updated *info via PTRACE_SETSIGINFO. */ | ||
| 1679 | if (signr != info->si_signo) { | ||
| 1680 | info->si_signo = signr; | ||
| 1681 | info->si_errno = 0; | ||
| 1682 | info->si_code = SI_USER; | ||
| 1683 | info->si_pid = task_pid_vnr(current->parent); | ||
| 1684 | info->si_uid = current->parent->uid; | ||
| 1685 | } | ||
| 1686 | |||
| 1687 | /* If the (new) signal is now blocked, requeue it. */ | ||
| 1688 | if (sigismember(¤t->blocked, signr)) { | ||
| 1689 | specific_send_sig_info(signr, info, current); | ||
| 1690 | signr = 0; | ||
| 1691 | } | ||
| 1692 | |||
| 1693 | return signr; | ||
| 1694 | } | ||
| 1695 | |||
| 1760 | int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | 1696 | int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, |
| 1761 | struct pt_regs *regs, void *cookie) | 1697 | struct pt_regs *regs, void *cookie) |
| 1762 | { | 1698 | { |
| 1763 | sigset_t *mask = ¤t->blocked; | 1699 | struct sighand_struct *sighand = current->sighand; |
| 1764 | int signr = 0; | 1700 | struct signal_struct *signal = current->signal; |
| 1701 | int signr; | ||
| 1765 | 1702 | ||
| 1766 | relock: | 1703 | relock: |
| 1767 | /* | 1704 | /* |
| @@ -1772,52 +1709,42 @@ relock: | |||
| 1772 | */ | 1709 | */ |
| 1773 | try_to_freeze(); | 1710 | try_to_freeze(); |
| 1774 | 1711 | ||
| 1775 | spin_lock_irq(¤t->sighand->siglock); | 1712 | spin_lock_irq(&sighand->siglock); |
| 1713 | /* | ||
| 1714 | * Every stopped thread goes here after wakeup. Check to see if | ||
| 1715 | * we should notify the parent; prepare_signal(SIGCONT) encodes | ||
| 1716 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. | ||
| 1717 | */ | ||
| 1718 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { | ||
| 1719 | int why = (signal->flags & SIGNAL_STOP_CONTINUED) | ||
| 1720 | ? CLD_CONTINUED : CLD_STOPPED; | ||
| 1721 | signal->flags &= ~SIGNAL_CLD_MASK; | ||
| 1722 | spin_unlock_irq(&sighand->siglock); | ||
| 1723 | |||
| 1724 | read_lock(&tasklist_lock); | ||
| 1725 | do_notify_parent_cldstop(current->group_leader, why); | ||
| 1726 | read_unlock(&tasklist_lock); | ||
| 1727 | goto relock; | ||
| 1728 | } | ||
| 1729 | |||
| 1776 | for (;;) { | 1730 | for (;;) { |
| 1777 | struct k_sigaction *ka; | 1731 | struct k_sigaction *ka; |
| 1778 | 1732 | ||
| 1779 | if (unlikely(current->signal->group_stop_count > 0) && | 1733 | if (unlikely(signal->group_stop_count > 0) && |
| 1780 | do_signal_stop(0)) | 1734 | do_signal_stop(0)) |
| 1781 | goto relock; | 1735 | goto relock; |
| 1782 | 1736 | ||
| 1783 | signr = dequeue_signal(current, mask, info); | 1737 | signr = dequeue_signal(current, ¤t->blocked, info); |
| 1784 | |||
| 1785 | if (!signr) | 1738 | if (!signr) |
| 1786 | break; /* will return 0 */ | 1739 | break; /* will return 0 */ |
| 1787 | 1740 | ||
| 1788 | if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { | 1741 | if (signr != SIGKILL) { |
| 1789 | ptrace_signal_deliver(regs, cookie); | 1742 | signr = ptrace_signal(signr, info, regs, cookie); |
| 1790 | 1743 | if (!signr) | |
| 1791 | /* Let the debugger run. */ | ||
| 1792 | ptrace_stop(signr, 0, info); | ||
| 1793 | |||
| 1794 | /* We're back. Did the debugger cancel the sig? */ | ||
| 1795 | signr = current->exit_code; | ||
| 1796 | if (signr == 0) | ||
| 1797 | continue; | ||
| 1798 | |||
| 1799 | current->exit_code = 0; | ||
| 1800 | |||
| 1801 | /* Update the siginfo structure if the signal has | ||
| 1802 | changed. If the debugger wanted something | ||
| 1803 | specific in the siginfo structure then it should | ||
| 1804 | have updated *info via PTRACE_SETSIGINFO. */ | ||
| 1805 | if (signr != info->si_signo) { | ||
| 1806 | info->si_signo = signr; | ||
| 1807 | info->si_errno = 0; | ||
| 1808 | info->si_code = SI_USER; | ||
| 1809 | info->si_pid = task_pid_vnr(current->parent); | ||
| 1810 | info->si_uid = current->parent->uid; | ||
| 1811 | } | ||
| 1812 | |||
| 1813 | /* If the (new) signal is now blocked, requeue it. */ | ||
| 1814 | if (sigismember(¤t->blocked, signr)) { | ||
| 1815 | specific_send_sig_info(signr, info, current); | ||
| 1816 | continue; | 1744 | continue; |
| 1817 | } | ||
| 1818 | } | 1745 | } |
| 1819 | 1746 | ||
| 1820 | ka = ¤t->sighand->action[signr-1]; | 1747 | ka = &sighand->action[signr-1]; |
| 1821 | if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ | 1748 | if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ |
| 1822 | continue; | 1749 | continue; |
| 1823 | if (ka->sa.sa_handler != SIG_DFL) { | 1750 | if (ka->sa.sa_handler != SIG_DFL) { |
| @@ -1839,7 +1766,8 @@ relock: | |||
| 1839 | /* | 1766 | /* |
| 1840 | * Global init gets no signals it doesn't want. | 1767 | * Global init gets no signals it doesn't want. |
| 1841 | */ | 1768 | */ |
| 1842 | if (is_global_init(current)) | 1769 | if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && |
| 1770 | !signal_group_exit(signal)) | ||
| 1843 | continue; | 1771 | continue; |
| 1844 | 1772 | ||
| 1845 | if (sig_kernel_stop(signr)) { | 1773 | if (sig_kernel_stop(signr)) { |
| @@ -1854,14 +1782,14 @@ relock: | |||
| 1854 | * We need to check for that and bail out if necessary. | 1782 | * We need to check for that and bail out if necessary. |
| 1855 | */ | 1783 | */ |
| 1856 | if (signr != SIGSTOP) { | 1784 | if (signr != SIGSTOP) { |
| 1857 | spin_unlock_irq(¤t->sighand->siglock); | 1785 | spin_unlock_irq(&sighand->siglock); |
| 1858 | 1786 | ||
| 1859 | /* signals can be posted during this window */ | 1787 | /* signals can be posted during this window */ |
| 1860 | 1788 | ||
| 1861 | if (is_current_pgrp_orphaned()) | 1789 | if (is_current_pgrp_orphaned()) |
| 1862 | goto relock; | 1790 | goto relock; |
| 1863 | 1791 | ||
| 1864 | spin_lock_irq(¤t->sighand->siglock); | 1792 | spin_lock_irq(&sighand->siglock); |
| 1865 | } | 1793 | } |
| 1866 | 1794 | ||
| 1867 | if (likely(do_signal_stop(signr))) { | 1795 | if (likely(do_signal_stop(signr))) { |
| @@ -1876,15 +1804,16 @@ relock: | |||
| 1876 | continue; | 1804 | continue; |
| 1877 | } | 1805 | } |
| 1878 | 1806 | ||
| 1879 | spin_unlock_irq(¤t->sighand->siglock); | 1807 | spin_unlock_irq(&sighand->siglock); |
| 1880 | 1808 | ||
| 1881 | /* | 1809 | /* |
| 1882 | * Anything else is fatal, maybe with a core dump. | 1810 | * Anything else is fatal, maybe with a core dump. |
| 1883 | */ | 1811 | */ |
| 1884 | current->flags |= PF_SIGNALED; | 1812 | current->flags |= PF_SIGNALED; |
| 1885 | if ((signr != SIGKILL) && print_fatal_signals) | 1813 | |
| 1886 | print_fatal_signal(regs, signr); | ||
| 1887 | if (sig_kernel_coredump(signr)) { | 1814 | if (sig_kernel_coredump(signr)) { |
| 1815 | if (print_fatal_signals) | ||
| 1816 | print_fatal_signal(regs, signr); | ||
| 1888 | /* | 1817 | /* |
| 1889 | * If it was able to dump core, this kills all | 1818 | * If it was able to dump core, this kills all |
| 1890 | * other threads in the group and synchronizes with | 1819 | * other threads in the group and synchronizes with |
| @@ -1902,7 +1831,7 @@ relock: | |||
| 1902 | do_group_exit(signr); | 1831 | do_group_exit(signr); |
| 1903 | /* NOTREACHED */ | 1832 | /* NOTREACHED */ |
| 1904 | } | 1833 | } |
| 1905 | spin_unlock_irq(¤t->sighand->siglock); | 1834 | spin_unlock_irq(&sighand->siglock); |
| 1906 | return signr; | 1835 | return signr; |
| 1907 | } | 1836 | } |
| 1908 | 1837 | ||
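get_signal_to_deliver() is the arch-independent half of signal delivery: the architecture's do_signal() loops on it and hands anything non-zero to its own frame-setup code. A hedged sketch of that caller shape (simplified; handle_signal() is the arch's own helper, and real arch code also deals with syscall restart and the saved sigmask):

    /* Sketch: shape of an arch do_signal() built on get_signal_to_deliver(). */
    static void do_signal(struct pt_regs *regs)
    {
            struct k_sigaction ka;
            siginfo_t info;
            int signr;

            signr = get_signal_to_deliver(&info, &ka, regs, NULL);
            if (signr > 0) {
                    /* build the user-space frame for ka.sa.sa_handler,
                     * adjust regs, apply sa_mask, etc. */
                    handle_signal(signr, &info, &ka, regs);
                    return;
            }

            /* no signal: fall through to syscall-restart handling and
             * restore the saved sigmask if it was stashed earlier */
    }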
| @@ -2246,6 +2175,7 @@ static int do_tkill(int tgid, int pid, int sig) | |||
| 2246 | int error; | 2175 | int error; |
| 2247 | struct siginfo info; | 2176 | struct siginfo info; |
| 2248 | struct task_struct *p; | 2177 | struct task_struct *p; |
| 2178 | unsigned long flags; | ||
| 2249 | 2179 | ||
| 2250 | error = -ESRCH; | 2180 | error = -ESRCH; |
| 2251 | info.si_signo = sig; | 2181 | info.si_signo = sig; |
| @@ -2254,22 +2184,24 @@ static int do_tkill(int tgid, int pid, int sig) | |||
| 2254 | info.si_pid = task_tgid_vnr(current); | 2184 | info.si_pid = task_tgid_vnr(current); |
| 2255 | info.si_uid = current->uid; | 2185 | info.si_uid = current->uid; |
| 2256 | 2186 | ||
| 2257 | read_lock(&tasklist_lock); | 2187 | rcu_read_lock(); |
| 2258 | p = find_task_by_vpid(pid); | 2188 | p = find_task_by_vpid(pid); |
| 2259 | if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { | 2189 | if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { |
| 2260 | error = check_kill_permission(sig, &info, p); | 2190 | error = check_kill_permission(sig, &info, p); |
| 2261 | /* | 2191 | /* |
| 2262 | * The null signal is a permissions and process existence | 2192 | * The null signal is a permissions and process existence |
| 2263 | * probe. No signal is actually delivered. | 2193 | * probe. No signal is actually delivered. |
| 2194 | * | ||
| 2195 | * If lock_task_sighand() fails we pretend the task dies | ||
| 2196 | * after receiving the signal. The window is tiny, and the | ||
| 2197 | * signal is private anyway. | ||
| 2264 | */ | 2198 | */ |
| 2265 | if (!error && sig && p->sighand) { | 2199 | if (!error && sig && lock_task_sighand(p, &flags)) { |
| 2266 | spin_lock_irq(&p->sighand->siglock); | ||
| 2267 | handle_stop_signal(sig, p); | ||
| 2268 | error = specific_send_sig_info(sig, &info, p); | 2200 | error = specific_send_sig_info(sig, &info, p); |
| 2269 | spin_unlock_irq(&p->sighand->siglock); | 2201 | unlock_task_sighand(p, &flags); |
| 2270 | } | 2202 | } |
| 2271 | } | 2203 | } |
| 2272 | read_unlock(&tasklist_lock); | 2204 | rcu_read_unlock(); |
| 2273 | 2205 | ||
| 2274 | return error; | 2206 | return error; |
| 2275 | } | 2207 | } |
| @@ -2326,13 +2258,14 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) | |||
| 2326 | 2258 | ||
| 2327 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | 2259 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
| 2328 | { | 2260 | { |
| 2261 | struct task_struct *t = current; | ||
| 2329 | struct k_sigaction *k; | 2262 | struct k_sigaction *k; |
| 2330 | sigset_t mask; | 2263 | sigset_t mask; |
| 2331 | 2264 | ||
| 2332 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) | 2265 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) |
| 2333 | return -EINVAL; | 2266 | return -EINVAL; |
| 2334 | 2267 | ||
| 2335 | k = ¤t->sighand->action[sig-1]; | 2268 | k = &t->sighand->action[sig-1]; |
| 2336 | 2269 | ||
| 2337 | spin_lock_irq(¤t->sighand->siglock); | 2270 | spin_lock_irq(¤t->sighand->siglock); |
| 2338 | if (oact) | 2271 | if (oact) |
| @@ -2353,9 +2286,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
| 2353 | * (for example, SIGCHLD), shall cause the pending signal to | 2286 | * (for example, SIGCHLD), shall cause the pending signal to |
| 2354 | * be discarded, whether or not it is blocked" | 2287 | * be discarded, whether or not it is blocked" |
| 2355 | */ | 2288 | */ |
| 2356 | if (act->sa.sa_handler == SIG_IGN || | 2289 | if (__sig_ignored(t, sig)) { |
| 2357 | (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) { | ||
| 2358 | struct task_struct *t = current; | ||
| 2359 | sigemptyset(&mask); | 2290 | sigemptyset(&mask); |
| 2360 | sigaddset(&mask, sig); | 2291 | sigaddset(&mask, sig); |
| 2361 | rm_from_queue_full(&mask, &t->signal->shared_pending); | 2292 | rm_from_queue_full(&mask, &t->signal->shared_pending); |
| @@ -2610,7 +2541,7 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) | |||
| 2610 | 2541 | ||
| 2611 | current->state = TASK_INTERRUPTIBLE; | 2542 | current->state = TASK_INTERRUPTIBLE; |
| 2612 | schedule(); | 2543 | schedule(); |
| 2613 | set_thread_flag(TIF_RESTORE_SIGMASK); | 2544 | set_restore_sigmask(); |
| 2614 | return -ERESTARTNOHAND; | 2545 | return -ERESTARTNOHAND; |
| 2615 | } | 2546 | } |
| 2616 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | 2547 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 31e9f2a47928..36e061740047 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) | |||
| 356 | /* Tasklets */ | 356 | /* Tasklets */ |
| 357 | struct tasklet_head | 357 | struct tasklet_head |
| 358 | { | 358 | { |
| 359 | struct tasklet_struct *list; | 359 | struct tasklet_struct *head; |
| 360 | struct tasklet_struct **tail; | ||
| 360 | }; | 361 | }; |
| 361 | 362 | ||
| 362 | /* Some compilers disobey section attribute on statics when not | 363 | /* Some compilers disobey section attribute on statics when not |
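The struct tasklet_head change replaces a plain singly linked list with a head pointer plus a tail pointer-to-pointer, so new tasklets are appended in FIFO order in O(1) instead of being pushed at the head (which reversed execution order). A small userspace illustration of the same technique (illustrative types; the kernel versions in the hunks below do this under local_irq_save()):

    #include <stddef.h>

    struct node {
            struct node *next;
    };

    struct list {
            struct node *head;
            struct node **tail;     /* points at head, or at the last node's ->next */
    };

    static void list_init(struct list *l)
    {
            l->head = NULL;
            l->tail = &l->head;     /* same as the per-cpu init in softirq_init() */
    }

    static void list_append(struct list *l, struct node *n)
    {
            n->next = NULL;
            *l->tail = n;           /* link after the current last element */
            l->tail = &n->next;     /* remember the new "last ->next" slot */
    }

    /* Detach the whole list for processing, leaving it empty. */
    static struct node *list_take(struct list *l)
    {
            struct node *n = l->head;

            l->head = NULL;
            l->tail = &l->head;
            return n;
    }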
| @@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
| 369 | unsigned long flags; | 370 | unsigned long flags; |
| 370 | 371 | ||
| 371 | local_irq_save(flags); | 372 | local_irq_save(flags); |
| 372 | t->next = __get_cpu_var(tasklet_vec).list; | 373 | t->next = NULL; |
| 373 | __get_cpu_var(tasklet_vec).list = t; | 374 | *__get_cpu_var(tasklet_vec).tail = t; |
| 375 | __get_cpu_var(tasklet_vec).tail = &(t->next); | ||
| 374 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 376 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 375 | local_irq_restore(flags); | 377 | local_irq_restore(flags); |
| 376 | } | 378 | } |
| @@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
| 382 | unsigned long flags; | 384 | unsigned long flags; |
| 383 | 385 | ||
| 384 | local_irq_save(flags); | 386 | local_irq_save(flags); |
| 385 | t->next = __get_cpu_var(tasklet_hi_vec).list; | 387 | t->next = NULL; |
| 386 | __get_cpu_var(tasklet_hi_vec).list = t; | 388 | *__get_cpu_var(tasklet_hi_vec).tail = t; |
| 389 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | ||
| 387 | raise_softirq_irqoff(HI_SOFTIRQ); | 390 | raise_softirq_irqoff(HI_SOFTIRQ); |
| 388 | local_irq_restore(flags); | 391 | local_irq_restore(flags); |
| 389 | } | 392 | } |
| @@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a) | |||
| 395 | struct tasklet_struct *list; | 398 | struct tasklet_struct *list; |
| 396 | 399 | ||
| 397 | local_irq_disable(); | 400 | local_irq_disable(); |
| 398 | list = __get_cpu_var(tasklet_vec).list; | 401 | list = __get_cpu_var(tasklet_vec).head; |
| 399 | __get_cpu_var(tasklet_vec).list = NULL; | 402 | __get_cpu_var(tasklet_vec).head = NULL; |
| 403 | __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; | ||
| 400 | local_irq_enable(); | 404 | local_irq_enable(); |
| 401 | 405 | ||
| 402 | while (list) { | 406 | while (list) { |
| @@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a) | |||
| 416 | } | 420 | } |
| 417 | 421 | ||
| 418 | local_irq_disable(); | 422 | local_irq_disable(); |
| 419 | t->next = __get_cpu_var(tasklet_vec).list; | 423 | t->next = NULL; |
| 420 | __get_cpu_var(tasklet_vec).list = t; | 424 | *__get_cpu_var(tasklet_vec).tail = t; |
| 425 | __get_cpu_var(tasklet_vec).tail = &(t->next); | ||
| 421 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 426 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 422 | local_irq_enable(); | 427 | local_irq_enable(); |
| 423 | } | 428 | } |
| @@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
| 428 | struct tasklet_struct *list; | 433 | struct tasklet_struct *list; |
| 429 | 434 | ||
| 430 | local_irq_disable(); | 435 | local_irq_disable(); |
| 431 | list = __get_cpu_var(tasklet_hi_vec).list; | 436 | list = __get_cpu_var(tasklet_hi_vec).head; |
| 432 | __get_cpu_var(tasklet_hi_vec).list = NULL; | 437 | __get_cpu_var(tasklet_hi_vec).head = NULL; |
| 438 | __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; | ||
| 433 | local_irq_enable(); | 439 | local_irq_enable(); |
| 434 | 440 | ||
| 435 | while (list) { | 441 | while (list) { |
| @@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
| 449 | } | 455 | } |
| 450 | 456 | ||
| 451 | local_irq_disable(); | 457 | local_irq_disable(); |
| 452 | t->next = __get_cpu_var(tasklet_hi_vec).list; | 458 | t->next = NULL; |
| 453 | __get_cpu_var(tasklet_hi_vec).list = t; | 459 | *__get_cpu_var(tasklet_hi_vec).tail = t; |
| 460 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | ||
| 454 | __raise_softirq_irqoff(HI_SOFTIRQ); | 461 | __raise_softirq_irqoff(HI_SOFTIRQ); |
| 455 | local_irq_enable(); | 462 | local_irq_enable(); |
| 456 | } | 463 | } |
| @@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill); | |||
| 487 | 494 | ||
| 488 | void __init softirq_init(void) | 495 | void __init softirq_init(void) |
| 489 | { | 496 | { |
| 497 | int cpu; | ||
| 498 | |||
| 499 | for_each_possible_cpu(cpu) { | ||
| 500 | per_cpu(tasklet_vec, cpu).tail = | ||
| 501 | &per_cpu(tasklet_vec, cpu).head; | ||
| 502 | per_cpu(tasklet_hi_vec, cpu).tail = | ||
| 503 | &per_cpu(tasklet_hi_vec, cpu).head; | ||
| 504 | } | ||
| 505 | |||
| 490 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); | 506 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); |
| 491 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); | 507 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); |
| 492 | } | 508 | } |
| @@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
| 555 | return; | 571 | return; |
| 556 | 572 | ||
| 557 | /* CPU is dead, so no lock needed. */ | 573 | /* CPU is dead, so no lock needed. */ |
| 558 | for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { | 574 | for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { |
| 559 | if (*i == t) { | 575 | if (*i == t) { |
| 560 | *i = t->next; | 576 | *i = t->next; |
| 577 | /* If this was the tail element, move the tail ptr */ | ||
| 578 | if (*i == NULL) | ||
| 579 | per_cpu(tasklet_vec, cpu).tail = i; | ||
| 561 | return; | 580 | return; |
| 562 | } | 581 | } |
| 563 | } | 582 | } |
| @@ -566,20 +585,24 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
| 566 | 585 | ||
| 567 | static void takeover_tasklets(unsigned int cpu) | 586 | static void takeover_tasklets(unsigned int cpu) |
| 568 | { | 587 | { |
| 569 | struct tasklet_struct **i; | ||
| 570 | |||
| 571 | /* CPU is dead, so no lock needed. */ | 588 | /* CPU is dead, so no lock needed. */ |
| 572 | local_irq_disable(); | 589 | local_irq_disable(); |
| 573 | 590 | ||
| 574 | /* Find end, append list for that CPU. */ | 591 | /* Find end, append list for that CPU. */ |
| 575 | for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); | 592 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { |
| 576 | *i = per_cpu(tasklet_vec, cpu).list; | 593 | *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; |
| 577 | per_cpu(tasklet_vec, cpu).list = NULL; | 594 | __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; |
| 595 | per_cpu(tasklet_vec, cpu).head = NULL; | ||
| 596 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | ||
| 597 | } | ||
| 578 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 598 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 579 | 599 | ||
| 580 | for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); | 600 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { |
| 581 | *i = per_cpu(tasklet_hi_vec, cpu).list; | 601 | *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; |
| 582 | per_cpu(tasklet_hi_vec, cpu).list = NULL; | 602 | __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; |
| 603 | per_cpu(tasklet_hi_vec, cpu).head = NULL; | ||
| 604 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | ||
| 605 | } | ||
| 583 | raise_softirq_irqoff(HI_SOFTIRQ); | 606 | raise_softirq_irqoff(HI_SOFTIRQ); |
| 584 | 607 | ||
| 585 | local_irq_enable(); | 608 | local_irq_enable(); |
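takeover_tasklets() now splices the dead CPU's list onto the local one by pointer surgery instead of walking to the end: write the donor's head into the local tail slot, adopt the donor's tail, then reset the donor to the empty state. A continuation of the sketch above (same illustrative struct list):

    /* Append all of 'from' onto 'to' in O(1), leaving 'from' empty. */
    static void list_splice_tail_take(struct list *to, struct list *from)
    {
            if (from->tail == &from->head)
                    return;                 /* donor is empty: tail still points at head */

            *to->tail = from->head;         /* hook donor list after the local tail */
            to->tail = from->tail;          /* local tail becomes the donor's tail */

            from->head = NULL;              /* reset donor to the empty state */
            from->tail = &from->head;
    }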
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70c..0101aeef7ed7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -11,7 +11,6 @@ | |||
| 11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
| 12 | 12 | ||
| 13 | #include <asm/atomic.h> | 13 | #include <asm/atomic.h> |
| 14 | #include <asm/semaphore.h> | ||
| 15 | #include <asm/uaccess.h> | 14 | #include <asm/uaccess.h> |
| 16 | 15 | ||
| 17 | /* Since we effect priority and affinity (both of which are visible | 16 | /* Since we effect priority and affinity (both of which are visible |
| @@ -35,7 +34,7 @@ static int stopmachine(void *cpu) | |||
| 35 | int irqs_disabled = 0; | 34 | int irqs_disabled = 0; |
| 36 | int prepared = 0; | 35 | int prepared = 0; |
| 37 | 36 | ||
| 38 | set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); | 37 | set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); |
| 39 | 38 | ||
| 40 | /* Ack: we are alive */ | 39 | /* Ack: we are alive */ |
| 41 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ | 40 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ |
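set_cpus_allowed_ptr() takes a pointer to the cpumask rather than passing the mask by value, which matters once large NR_CPUS makes cpumask_t big. A hedged sketch of the calling convention (illustrative helper name; the stop_machine thread above binds itself to one CPU in exactly this way):

    /* Sketch: pin the current kernel thread to a single CPU. */
    static int pin_to_cpu(int cpu)
    {
            cpumask_t mask = cpumask_of_cpu(cpu);   /* mask with exactly one bit set */

            return set_cpus_allowed_ptr(current, &mask);
    }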
| @@ -135,8 +134,7 @@ static void restart_machine(void) | |||
| 135 | preempt_enable_no_resched(); | 134 | preempt_enable_no_resched(); |
| 136 | } | 135 | } |
| 137 | 136 | ||
| 138 | struct stop_machine_data | 137 | struct stop_machine_data { |
| 139 | { | ||
| 140 | int (*fn)(void *); | 138 | int (*fn)(void *); |
| 141 | void *data; | 139 | void *data; |
| 142 | struct completion done; | 140 | struct completion done; |
diff --git a/kernel/sys.c b/kernel/sys.c index a626116af5db..895d2d4c9493 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -67,6 +67,12 @@ | |||
| 67 | #ifndef SET_ENDIAN | 67 | #ifndef SET_ENDIAN |
| 68 | # define SET_ENDIAN(a,b) (-EINVAL) | 68 | # define SET_ENDIAN(a,b) (-EINVAL) |
| 69 | #endif | 69 | #endif |
| 70 | #ifndef GET_TSC_CTL | ||
| 71 | # define GET_TSC_CTL(a) (-EINVAL) | ||
| 72 | #endif | ||
| 73 | #ifndef SET_TSC_CTL | ||
| 74 | # define SET_TSC_CTL(a) (-EINVAL) | ||
| 75 | #endif | ||
| 70 | 76 | ||
| 71 | /* | 77 | /* |
| 72 | * this is where the system-wide overflow UID and GID are defined, for | 78 | * this is where the system-wide overflow UID and GID are defined, for |
| @@ -972,8 +978,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
| 972 | goto out; | 978 | goto out; |
| 973 | 979 | ||
| 974 | if (task_pgrp(p) != pgrp) { | 980 | if (task_pgrp(p) != pgrp) { |
| 975 | detach_pid(p, PIDTYPE_PGID); | 981 | change_pid(p, PIDTYPE_PGID, pgrp); |
| 976 | attach_pid(p, PIDTYPE_PGID, pgrp); | ||
| 977 | set_task_pgrp(p, pid_nr(pgrp)); | 982 | set_task_pgrp(p, pid_nr(pgrp)); |
| 978 | } | 983 | } |
| 979 | 984 | ||
| @@ -986,54 +991,67 @@ out: | |||
| 986 | 991 | ||
| 987 | asmlinkage long sys_getpgid(pid_t pid) | 992 | asmlinkage long sys_getpgid(pid_t pid) |
| 988 | { | 993 | { |
| 994 | struct task_struct *p; | ||
| 995 | struct pid *grp; | ||
| 996 | int retval; | ||
| 997 | |||
| 998 | rcu_read_lock(); | ||
| 989 | if (!pid) | 999 | if (!pid) |
| 990 | return task_pgrp_vnr(current); | 1000 | grp = task_pgrp(current); |
| 991 | else { | 1001 | else { |
| 992 | int retval; | ||
| 993 | struct task_struct *p; | ||
| 994 | |||
| 995 | read_lock(&tasklist_lock); | ||
| 996 | p = find_task_by_vpid(pid); | ||
| 997 | retval = -ESRCH; | 1002 | retval = -ESRCH; |
| 998 | if (p) { | 1003 | p = find_task_by_vpid(pid); |
| 999 | retval = security_task_getpgid(p); | 1004 | if (!p) |
| 1000 | if (!retval) | 1005 | goto out; |
| 1001 | retval = task_pgrp_vnr(p); | 1006 | grp = task_pgrp(p); |
| 1002 | } | 1007 | if (!grp) |
| 1003 | read_unlock(&tasklist_lock); | 1008 | goto out; |
| 1004 | return retval; | 1009 | |
| 1010 | retval = security_task_getpgid(p); | ||
| 1011 | if (retval) | ||
| 1012 | goto out; | ||
| 1005 | } | 1013 | } |
| 1014 | retval = pid_vnr(grp); | ||
| 1015 | out: | ||
| 1016 | rcu_read_unlock(); | ||
| 1017 | return retval; | ||
| 1006 | } | 1018 | } |
| 1007 | 1019 | ||
| 1008 | #ifdef __ARCH_WANT_SYS_GETPGRP | 1020 | #ifdef __ARCH_WANT_SYS_GETPGRP |
| 1009 | 1021 | ||
| 1010 | asmlinkage long sys_getpgrp(void) | 1022 | asmlinkage long sys_getpgrp(void) |
| 1011 | { | 1023 | { |
| 1012 | /* SMP - assuming writes are word atomic this is fine */ | 1024 | return sys_getpgid(0); |
| 1013 | return task_pgrp_vnr(current); | ||
| 1014 | } | 1025 | } |
| 1015 | 1026 | ||
| 1016 | #endif | 1027 | #endif |
| 1017 | 1028 | ||
| 1018 | asmlinkage long sys_getsid(pid_t pid) | 1029 | asmlinkage long sys_getsid(pid_t pid) |
| 1019 | { | 1030 | { |
| 1031 | struct task_struct *p; | ||
| 1032 | struct pid *sid; | ||
| 1033 | int retval; | ||
| 1034 | |||
| 1035 | rcu_read_lock(); | ||
| 1020 | if (!pid) | 1036 | if (!pid) |
| 1021 | return task_session_vnr(current); | 1037 | sid = task_session(current); |
| 1022 | else { | 1038 | else { |
| 1023 | int retval; | ||
| 1024 | struct task_struct *p; | ||
| 1025 | |||
| 1026 | rcu_read_lock(); | ||
| 1027 | p = find_task_by_vpid(pid); | ||
| 1028 | retval = -ESRCH; | 1039 | retval = -ESRCH; |
| 1029 | if (p) { | 1040 | p = find_task_by_vpid(pid); |
| 1030 | retval = security_task_getsid(p); | 1041 | if (!p) |
| 1031 | if (!retval) | 1042 | goto out; |
| 1032 | retval = task_session_vnr(p); | 1043 | sid = task_session(p); |
| 1033 | } | 1044 | if (!sid) |
| 1034 | rcu_read_unlock(); | 1045 | goto out; |
| 1035 | return retval; | 1046 | |
| 1047 | retval = security_task_getsid(p); | ||
| 1048 | if (retval) | ||
| 1049 | goto out; | ||
| 1036 | } | 1050 | } |
| 1051 | retval = pid_vnr(sid); | ||
| 1052 | out: | ||
| 1053 | rcu_read_unlock(); | ||
| 1054 | return retval; | ||
| 1037 | } | 1055 | } |
| 1038 | 1056 | ||
| 1039 | asmlinkage long sys_setsid(void) | 1057 | asmlinkage long sys_setsid(void) |
| @@ -1539,6 +1557,19 @@ out: | |||
| 1539 | * | 1557 | * |
| 1540 | */ | 1558 | */ |
| 1541 | 1559 | ||
| 1560 | static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, | ||
| 1561 | cputime_t *utimep, cputime_t *stimep) | ||
| 1562 | { | ||
| 1563 | *utimep = cputime_add(*utimep, t->utime); | ||
| 1564 | *stimep = cputime_add(*stimep, t->stime); | ||
| 1565 | r->ru_nvcsw += t->nvcsw; | ||
| 1566 | r->ru_nivcsw += t->nivcsw; | ||
| 1567 | r->ru_minflt += t->min_flt; | ||
| 1568 | r->ru_majflt += t->maj_flt; | ||
| 1569 | r->ru_inblock += task_io_get_inblock(t); | ||
| 1570 | r->ru_oublock += task_io_get_oublock(t); | ||
| 1571 | } | ||
| 1572 | |||
| 1542 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | 1573 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) |
| 1543 | { | 1574 | { |
| 1544 | struct task_struct *t; | 1575 | struct task_struct *t; |
| @@ -1548,12 +1579,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1548 | memset((char *) r, 0, sizeof *r); | 1579 | memset((char *) r, 0, sizeof *r); |
| 1549 | utime = stime = cputime_zero; | 1580 | utime = stime = cputime_zero; |
| 1550 | 1581 | ||
| 1551 | rcu_read_lock(); | 1582 | if (who == RUSAGE_THREAD) { |
| 1552 | if (!lock_task_sighand(p, &flags)) { | 1583 | accumulate_thread_rusage(p, r, &utime, &stime); |
| 1553 | rcu_read_unlock(); | 1584 | goto out; |
| 1554 | return; | ||
| 1555 | } | 1585 | } |
| 1556 | 1586 | ||
| 1587 | if (!lock_task_sighand(p, &flags)) | ||
| 1588 | return; | ||
| 1589 | |||
| 1557 | switch (who) { | 1590 | switch (who) { |
| 1558 | case RUSAGE_BOTH: | 1591 | case RUSAGE_BOTH: |
| 1559 | case RUSAGE_CHILDREN: | 1592 | case RUSAGE_CHILDREN: |
| @@ -1580,14 +1613,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1580 | r->ru_oublock += p->signal->oublock; | 1613 | r->ru_oublock += p->signal->oublock; |
| 1581 | t = p; | 1614 | t = p; |
| 1582 | do { | 1615 | do { |
| 1583 | utime = cputime_add(utime, t->utime); | 1616 | accumulate_thread_rusage(t, r, &utime, &stime); |
| 1584 | stime = cputime_add(stime, t->stime); | ||
| 1585 | r->ru_nvcsw += t->nvcsw; | ||
| 1586 | r->ru_nivcsw += t->nivcsw; | ||
| 1587 | r->ru_minflt += t->min_flt; | ||
| 1588 | r->ru_majflt += t->maj_flt; | ||
| 1589 | r->ru_inblock += task_io_get_inblock(t); | ||
| 1590 | r->ru_oublock += task_io_get_oublock(t); | ||
| 1591 | t = next_thread(t); | 1617 | t = next_thread(t); |
| 1592 | } while (t != p); | 1618 | } while (t != p); |
| 1593 | break; | 1619 | break; |
| @@ -1595,10 +1621,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1595 | default: | 1621 | default: |
| 1596 | BUG(); | 1622 | BUG(); |
| 1597 | } | 1623 | } |
| 1598 | |||
| 1599 | unlock_task_sighand(p, &flags); | 1624 | unlock_task_sighand(p, &flags); |
| 1600 | rcu_read_unlock(); | ||
| 1601 | 1625 | ||
| 1626 | out: | ||
| 1602 | cputime_to_timeval(utime, &r->ru_utime); | 1627 | cputime_to_timeval(utime, &r->ru_utime); |
| 1603 | cputime_to_timeval(stime, &r->ru_stime); | 1628 | cputime_to_timeval(stime, &r->ru_stime); |
| 1604 | } | 1629 | } |
| @@ -1612,7 +1637,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | |||
| 1612 | 1637 | ||
| 1613 | asmlinkage long sys_getrusage(int who, struct rusage __user *ru) | 1638 | asmlinkage long sys_getrusage(int who, struct rusage __user *ru) |
| 1614 | { | 1639 | { |
| 1615 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) | 1640 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && |
| 1641 | who != RUSAGE_THREAD) | ||
| 1616 | return -EINVAL; | 1642 | return -EINVAL; |
| 1617 | return getrusage(current, who, ru); | 1643 | return getrusage(current, who, ru); |
| 1618 | } | 1644 | } |
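With RUSAGE_THREAD accepted by sys_getrusage(), a single thread can ask for its own counters via accumulate_thread_rusage() instead of summing the whole group. A hedged userspace example; RUSAGE_THREAD may need a local definition until the C library's headers catch up with the kernel (the value shown is the one used by the kernel header):

    #include <stdio.h>
    #include <sys/resource.h>

    #ifndef RUSAGE_THREAD
    #define RUSAGE_THREAD 1         /* only the calling thread */
    #endif

    int main(void)
    {
            struct rusage ru;

            if (getrusage(RUSAGE_THREAD, &ru) != 0) {
                    perror("getrusage(RUSAGE_THREAD)");
                    return 1;
            }
            printf("utime %ld.%06lds, minor faults %ld\n",
                   (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
                   ru.ru_minflt);
            return 0;
    }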
| @@ -1626,10 +1652,9 @@ asmlinkage long sys_umask(int mask) | |||
| 1626 | asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | 1652 | asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, |
| 1627 | unsigned long arg4, unsigned long arg5) | 1653 | unsigned long arg4, unsigned long arg5) |
| 1628 | { | 1654 | { |
| 1629 | long error; | 1655 | long uninitialized_var(error); |
| 1630 | 1656 | ||
| 1631 | error = security_task_prctl(option, arg2, arg3, arg4, arg5); | 1657 | if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) |
| 1632 | if (error) | ||
| 1633 | return error; | 1658 | return error; |
| 1634 | 1659 | ||
| 1635 | switch (option) { | 1660 | switch (option) { |
| @@ -1682,17 +1707,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
| 1682 | error = -EINVAL; | 1707 | error = -EINVAL; |
| 1683 | break; | 1708 | break; |
| 1684 | 1709 | ||
| 1685 | case PR_GET_KEEPCAPS: | ||
| 1686 | if (current->keep_capabilities) | ||
| 1687 | error = 1; | ||
| 1688 | break; | ||
| 1689 | case PR_SET_KEEPCAPS: | ||
| 1690 | if (arg2 != 0 && arg2 != 1) { | ||
| 1691 | error = -EINVAL; | ||
| 1692 | break; | ||
| 1693 | } | ||
| 1694 | current->keep_capabilities = arg2; | ||
| 1695 | break; | ||
| 1696 | case PR_SET_NAME: { | 1710 | case PR_SET_NAME: { |
| 1697 | struct task_struct *me = current; | 1711 | struct task_struct *me = current; |
| 1698 | unsigned char ncomm[sizeof(me->comm)]; | 1712 | unsigned char ncomm[sizeof(me->comm)]; |
| @@ -1726,18 +1740,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
| 1726 | case PR_SET_SECCOMP: | 1740 | case PR_SET_SECCOMP: |
| 1727 | error = prctl_set_seccomp(arg2); | 1741 | error = prctl_set_seccomp(arg2); |
| 1728 | break; | 1742 | break; |
| 1729 | 1743 | case PR_GET_TSC: | |
| 1730 | case PR_CAPBSET_READ: | 1744 | error = GET_TSC_CTL(arg2); |
| 1731 | if (!cap_valid(arg2)) | 1745 | break; |
| 1732 | return -EINVAL; | 1746 | case PR_SET_TSC: |
| 1733 | return !!cap_raised(current->cap_bset, arg2); | 1747 | error = SET_TSC_CTL(arg2); |
| 1734 | case PR_CAPBSET_DROP: | 1748 | break; |
| 1735 | #ifdef CONFIG_SECURITY_FILE_CAPABILITIES | ||
| 1736 | return cap_prctl_drop(arg2); | ||
| 1737 | #else | ||
| 1738 | return -EINVAL; | ||
| 1739 | #endif | ||
| 1740 | |||
| 1741 | default: | 1749 | default: |
| 1742 | error = -EINVAL; | 1750 | error = -EINVAL; |
| 1743 | break; | 1751 | break; |
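PR_GET_TSC/PR_SET_TSC let a task query or restrict its own access to the time-stamp counter on architectures that wire up GET_TSC_CTL/SET_TSC_CTL (everything else returns -EINVAL through the stub macros added above). A hedged userspace example; the PR_* constants are taken from the kernel header introduced with this interface and may need local definitions on older toolchains:

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_GET_TSC
    #define PR_GET_TSC 25
    #define PR_SET_TSC 26
    #define PR_TSC_ENABLE 1
    #define PR_TSC_SIGSEGV 2        /* RDTSC raises SIGSEGV once this is set */
    #endif

    int main(void)
    {
            int mode = 0;

            if (prctl(PR_GET_TSC, &mode) == 0)
                    printf("tsc mode: %s\n",
                           mode == PR_TSC_ENABLE ? "enabled" : "faulting");
            else
                    perror("PR_GET_TSC (likely unsupported on this architecture)");

            /* Example policy: make direct RDTSC fault in this task. */
            if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) != 0)
                    perror("PR_SET_TSC");
            return 0;
    }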
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2a2d6889bab..d7ffdc59816a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/writeback.h> | 38 | #include <linux/writeback.h> |
| 39 | #include <linux/hugetlb.h> | 39 | #include <linux/hugetlb.h> |
| 40 | #include <linux/initrd.h> | 40 | #include <linux/initrd.h> |
| 41 | #include <linux/key.h> | ||
| 41 | #include <linux/times.h> | 42 | #include <linux/times.h> |
| 42 | #include <linux/limits.h> | 43 | #include <linux/limits.h> |
| 43 | #include <linux/dcache.h> | 44 | #include <linux/dcache.h> |
| @@ -144,12 +145,6 @@ extern int no_unaligned_warning; | |||
| 144 | extern int max_lock_depth; | 145 | extern int max_lock_depth; |
| 145 | #endif | 146 | #endif |
| 146 | 147 | ||
| 147 | #ifdef CONFIG_SYSCTL_SYSCALL | ||
| 148 | static int parse_table(int __user *, int, void __user *, size_t __user *, | ||
| 149 | void __user *, size_t, struct ctl_table *); | ||
| 150 | #endif | ||
| 151 | |||
| 152 | |||
| 153 | #ifdef CONFIG_PROC_SYSCTL | 148 | #ifdef CONFIG_PROC_SYSCTL |
| 154 | static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, | 149 | static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, |
| 155 | void __user *buffer, size_t *lenp, loff_t *ppos); | 150 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| @@ -270,17 +265,6 @@ static struct ctl_table kern_table[] = { | |||
| 270 | }, | 265 | }, |
| 271 | { | 266 | { |
| 272 | .ctl_name = CTL_UNNUMBERED, | 267 | .ctl_name = CTL_UNNUMBERED, |
| 273 | .procname = "sched_batch_wakeup_granularity_ns", | ||
| 274 | .data = &sysctl_sched_batch_wakeup_granularity, | ||
| 275 | .maxlen = sizeof(unsigned int), | ||
| 276 | .mode = 0644, | ||
| 277 | .proc_handler = &proc_dointvec_minmax, | ||
| 278 | .strategy = &sysctl_intvec, | ||
| 279 | .extra1 = &min_wakeup_granularity_ns, | ||
| 280 | .extra2 = &max_wakeup_granularity_ns, | ||
| 281 | }, | ||
| 282 | { | ||
| 283 | .ctl_name = CTL_UNNUMBERED, | ||
| 284 | .procname = "sched_child_runs_first", | 268 | .procname = "sched_child_runs_first", |
| 285 | .data = &sysctl_sched_child_runs_first, | 269 | .data = &sysctl_sched_child_runs_first, |
| 286 | .maxlen = sizeof(unsigned int), | 270 | .maxlen = sizeof(unsigned int), |
| @@ -318,7 +302,7 @@ static struct ctl_table kern_table[] = { | |||
| 318 | .data = &sysctl_sched_rt_period, | 302 | .data = &sysctl_sched_rt_period, |
| 319 | .maxlen = sizeof(unsigned int), | 303 | .maxlen = sizeof(unsigned int), |
| 320 | .mode = 0644, | 304 | .mode = 0644, |
| 321 | .proc_handler = &proc_dointvec, | 305 | .proc_handler = &sched_rt_handler, |
| 322 | }, | 306 | }, |
| 323 | { | 307 | { |
| 324 | .ctl_name = CTL_UNNUMBERED, | 308 | .ctl_name = CTL_UNNUMBERED, |
| @@ -326,7 +310,7 @@ static struct ctl_table kern_table[] = { | |||
| 326 | .data = &sysctl_sched_rt_runtime, | 310 | .data = &sysctl_sched_rt_runtime, |
| 327 | .maxlen = sizeof(int), | 311 | .maxlen = sizeof(int), |
| 328 | .mode = 0644, | 312 | .mode = 0644, |
| 329 | .proc_handler = &proc_dointvec, | 313 | .proc_handler = &sched_rt_handler, |
| 330 | }, | 314 | }, |
| 331 | { | 315 | { |
| 332 | .ctl_name = CTL_UNNUMBERED, | 316 | .ctl_name = CTL_UNNUMBERED, |
| @@ -820,6 +804,14 @@ static struct ctl_table kern_table[] = { | |||
| 820 | .proc_handler = &proc_dostring, | 804 | .proc_handler = &proc_dostring, |
| 821 | .strategy = &sysctl_string, | 805 | .strategy = &sysctl_string, |
| 822 | }, | 806 | }, |
| 807 | #ifdef CONFIG_KEYS | ||
| 808 | { | ||
| 809 | .ctl_name = CTL_UNNUMBERED, | ||
| 810 | .procname = "keys", | ||
| 811 | .mode = 0555, | ||
| 812 | .child = key_sysctls, | ||
| 813 | }, | ||
| 814 | #endif | ||
| 823 | /* | 815 | /* |
| 824 | * NOTE: do not add new entries to this table unless you have read | 816 | * NOTE: do not add new entries to this table unless you have read |
| 825 | * Documentation/sysctl/ctl_unnumbered.txt | 817 | * Documentation/sysctl/ctl_unnumbered.txt |
| @@ -1441,6 +1433,76 @@ void register_sysctl_root(struct ctl_table_root *root) | |||
| 1441 | } | 1433 | } |
| 1442 | 1434 | ||
| 1443 | #ifdef CONFIG_SYSCTL_SYSCALL | 1435 | #ifdef CONFIG_SYSCTL_SYSCALL |
| 1436 | /* Perform the actual read/write of a sysctl table entry. */ | ||
| 1437 | static int do_sysctl_strategy(struct ctl_table_root *root, | ||
| 1438 | struct ctl_table *table, | ||
| 1439 | int __user *name, int nlen, | ||
| 1440 | void __user *oldval, size_t __user *oldlenp, | ||
| 1441 | void __user *newval, size_t newlen) | ||
| 1442 | { | ||
| 1443 | int op = 0, rc; | ||
| 1444 | |||
| 1445 | if (oldval) | ||
| 1446 | op |= 004; | ||
| 1447 | if (newval) | ||
| 1448 | op |= 002; | ||
| 1449 | if (sysctl_perm(root, table, op)) | ||
| 1450 | return -EPERM; | ||
| 1451 | |||
| 1452 | if (table->strategy) { | ||
| 1453 | rc = table->strategy(table, name, nlen, oldval, oldlenp, | ||
| 1454 | newval, newlen); | ||
| 1455 | if (rc < 0) | ||
| 1456 | return rc; | ||
| 1457 | if (rc > 0) | ||
| 1458 | return 0; | ||
| 1459 | } | ||
| 1460 | |||
| 1461 | /* If there is no strategy routine, or if the strategy returns | ||
| 1462 | * zero, proceed with automatic r/w */ | ||
| 1463 | if (table->data && table->maxlen) { | ||
| 1464 | rc = sysctl_data(table, name, nlen, oldval, oldlenp, | ||
| 1465 | newval, newlen); | ||
| 1466 | if (rc < 0) | ||
| 1467 | return rc; | ||
| 1468 | } | ||
| 1469 | return 0; | ||
| 1470 | } | ||
| 1471 | |||
| 1472 | static int parse_table(int __user *name, int nlen, | ||
| 1473 | void __user *oldval, size_t __user *oldlenp, | ||
| 1474 | void __user *newval, size_t newlen, | ||
| 1475 | struct ctl_table_root *root, | ||
| 1476 | struct ctl_table *table) | ||
| 1477 | { | ||
| 1478 | int n; | ||
| 1479 | repeat: | ||
| 1480 | if (!nlen) | ||
| 1481 | return -ENOTDIR; | ||
| 1482 | if (get_user(n, name)) | ||
| 1483 | return -EFAULT; | ||
| 1484 | for ( ; table->ctl_name || table->procname; table++) { | ||
| 1485 | if (!table->ctl_name) | ||
| 1486 | continue; | ||
| 1487 | if (n == table->ctl_name) { | ||
| 1488 | int error; | ||
| 1489 | if (table->child) { | ||
| 1490 | if (sysctl_perm(root, table, 001)) | ||
| 1491 | return -EPERM; | ||
| 1492 | name++; | ||
| 1493 | nlen--; | ||
| 1494 | table = table->child; | ||
| 1495 | goto repeat; | ||
| 1496 | } | ||
| 1497 | error = do_sysctl_strategy(root, table, name, nlen, | ||
| 1498 | oldval, oldlenp, | ||
| 1499 | newval, newlen); | ||
| 1500 | return error; | ||
| 1501 | } | ||
| 1502 | } | ||
| 1503 | return -ENOTDIR; | ||
| 1504 | } | ||
| 1505 | |||
| 1444 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | 1506 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, |
| 1445 | void __user *newval, size_t newlen) | 1507 | void __user *newval, size_t newlen) |
| 1446 | { | 1508 | { |
| @@ -1458,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
| 1458 | for (head = sysctl_head_next(NULL); head; | 1520 | for (head = sysctl_head_next(NULL); head; |
| 1459 | head = sysctl_head_next(head)) { | 1521 | head = sysctl_head_next(head)) { |
| 1460 | error = parse_table(name, nlen, oldval, oldlenp, | 1522 | error = parse_table(name, nlen, oldval, oldlenp, |
| 1461 | newval, newlen, head->ctl_table); | 1523 | newval, newlen, |
| 1524 | head->root, head->ctl_table); | ||
| 1462 | if (error != -ENOTDIR) { | 1525 | if (error != -ENOTDIR) { |
| 1463 | sysctl_head_finish(head); | 1526 | sysctl_head_finish(head); |
| 1464 | break; | 1527 | break; |
| @@ -1504,84 +1567,22 @@ static int test_perm(int mode, int op) | |||
| 1504 | return -EACCES; | 1567 | return -EACCES; |
| 1505 | } | 1568 | } |
| 1506 | 1569 | ||
| 1507 | int sysctl_perm(struct ctl_table *table, int op) | 1570 | int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) |
| 1508 | { | 1571 | { |
| 1509 | int error; | 1572 | int error; |
| 1573 | int mode; | ||
| 1574 | |||
| 1510 | error = security_sysctl(table, op); | 1575 | error = security_sysctl(table, op); |
| 1511 | if (error) | 1576 | if (error) |
| 1512 | return error; | 1577 | return error; |
| 1513 | return test_perm(table->mode, op); | ||
| 1514 | } | ||
| 1515 | 1578 | ||
| 1516 | #ifdef CONFIG_SYSCTL_SYSCALL | 1579 | if (root->permissions) |
| 1517 | static int parse_table(int __user *name, int nlen, | 1580 | mode = root->permissions(root, current->nsproxy, table); |
| 1518 | void __user *oldval, size_t __user *oldlenp, | 1581 | else |
| 1519 | void __user *newval, size_t newlen, | 1582 | mode = table->mode; |
| 1520 | struct ctl_table *table) | ||
| 1521 | { | ||
| 1522 | int n; | ||
| 1523 | repeat: | ||
| 1524 | if (!nlen) | ||
| 1525 | return -ENOTDIR; | ||
| 1526 | if (get_user(n, name)) | ||
| 1527 | return -EFAULT; | ||
| 1528 | for ( ; table->ctl_name || table->procname; table++) { | ||
| 1529 | if (!table->ctl_name) | ||
| 1530 | continue; | ||
| 1531 | if (n == table->ctl_name) { | ||
| 1532 | int error; | ||
| 1533 | if (table->child) { | ||
| 1534 | if (sysctl_perm(table, 001)) | ||
| 1535 | return -EPERM; | ||
| 1536 | name++; | ||
| 1537 | nlen--; | ||
| 1538 | table = table->child; | ||
| 1539 | goto repeat; | ||
| 1540 | } | ||
| 1541 | error = do_sysctl_strategy(table, name, nlen, | ||
| 1542 | oldval, oldlenp, | ||
| 1543 | newval, newlen); | ||
| 1544 | return error; | ||
| 1545 | } | ||
| 1546 | } | ||
| 1547 | return -ENOTDIR; | ||
| 1548 | } | ||
| 1549 | |||
| 1550 | /* Perform the actual read/write of a sysctl table entry. */ | ||
| 1551 | int do_sysctl_strategy (struct ctl_table *table, | ||
| 1552 | int __user *name, int nlen, | ||
| 1553 | void __user *oldval, size_t __user *oldlenp, | ||
| 1554 | void __user *newval, size_t newlen) | ||
| 1555 | { | ||
| 1556 | int op = 0, rc; | ||
| 1557 | 1583 | ||
| 1558 | if (oldval) | 1584 | return test_perm(mode, op); |
| 1559 | op |= 004; | ||
| 1560 | if (newval) | ||
| 1561 | op |= 002; | ||
| 1562 | if (sysctl_perm(table, op)) | ||
| 1563 | return -EPERM; | ||
| 1564 | |||
| 1565 | if (table->strategy) { | ||
| 1566 | rc = table->strategy(table, name, nlen, oldval, oldlenp, | ||
| 1567 | newval, newlen); | ||
| 1568 | if (rc < 0) | ||
| 1569 | return rc; | ||
| 1570 | if (rc > 0) | ||
| 1571 | return 0; | ||
| 1572 | } | ||
| 1573 | |||
| 1574 | /* If there is no strategy routine, or if the strategy returns | ||
| 1575 | * zero, proceed with automatic r/w */ | ||
| 1576 | if (table->data && table->maxlen) { | ||
| 1577 | rc = sysctl_data(table, name, nlen, oldval, oldlenp, | ||
| 1578 | newval, newlen); | ||
| 1579 | if (rc < 0) | ||
| 1580 | return rc; | ||
| 1581 | } | ||
| 1582 | return 0; | ||
| 1583 | } | 1585 | } |
| 1584 | #endif /* CONFIG_SYSCTL_SYSCALL */ | ||
| 1585 | 1586 | ||
| 1586 | static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) | 1587 | static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) |
| 1587 | { | 1588 | { |
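sysctl_perm() now consults an optional per-root ->permissions() hook before falling back to table->mode, which is what lets a sysctl root (for example a per-namespace one) adjust the effective mode of its entries at access time. A hedged sketch of such a hook, using the callback signature visible in the hunk above; the policy itself is purely illustrative:

    /* Sketch: a root whose entries are writable only by a privileged caller. */
    static int example_permissions(struct ctl_table_root *root,
                                   struct nsproxy *namespaces,
                                   struct ctl_table *table)
    {
            if (capable(CAP_SYS_ADMIN))
                    return table->mode;             /* full declared mode */

            return table->mode & ~0222;             /* read-only for everyone else */
    }

    static struct ctl_table_root example_root = {
            .permissions = example_permissions,
            /* remaining root fields set up as usual for registration */
    };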
| @@ -1594,9 +1595,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) | |||
| 1594 | 1595 | ||
| 1595 | static __init int sysctl_init(void) | 1596 | static __init int sysctl_init(void) |
| 1596 | { | 1597 | { |
| 1597 | int err; | ||
| 1598 | sysctl_set_parent(NULL, root_table); | 1598 | sysctl_set_parent(NULL, root_table); |
| 1599 | err = sysctl_check_table(current->nsproxy, root_table); | 1599 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK |
| 1600 | { | ||
| 1601 | int err; | ||
| 1602 | err = sysctl_check_table(current->nsproxy, root_table); | ||
| 1603 | } | ||
| 1604 | #endif | ||
| 1600 | return 0; | 1605 | return 0; |
| 1601 | } | 1606 | } |
| 1602 | 1607 | ||
| @@ -1723,10 +1728,12 @@ struct ctl_table_header *__register_sysctl_paths( | |||
| 1723 | header->unregistering = NULL; | 1728 | header->unregistering = NULL; |
| 1724 | header->root = root; | 1729 | header->root = root; |
| 1725 | sysctl_set_parent(NULL, header->ctl_table); | 1730 | sysctl_set_parent(NULL, header->ctl_table); |
| 1731 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK | ||
| 1726 | if (sysctl_check_table(namespaces, header->ctl_table)) { | 1732 | if (sysctl_check_table(namespaces, header->ctl_table)) { |
| 1727 | kfree(header); | 1733 | kfree(header); |
| 1728 | return NULL; | 1734 | return NULL; |
| 1729 | } | 1735 | } |
| 1736 | #endif | ||
| 1730 | spin_lock(&sysctl_lock); | 1737 | spin_lock(&sysctl_lock); |
| 1731 | header_list = lookup_header_list(root, namespaces); | 1738 | header_list = lookup_header_list(root, namespaces); |
| 1732 | list_add_tail(&header->ctl_entry, header_list); | 1739 | list_add_tail(&header->ctl_entry, header_list); |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 07e86a828073..4a23517169a6 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -183,7 +183,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, | |||
| 183 | 183 | ||
| 184 | if (!tsk) { | 184 | if (!tsk) { |
| 185 | rcu_read_lock(); | 185 | rcu_read_lock(); |
| 186 | tsk = find_task_by_pid(pid); | 186 | tsk = find_task_by_vpid(pid); |
| 187 | if (tsk) | 187 | if (tsk) |
| 188 | get_task_struct(tsk); | 188 | get_task_struct(tsk); |
| 189 | rcu_read_unlock(); | 189 | rcu_read_unlock(); |
| @@ -230,7 +230,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, | |||
| 230 | */ | 230 | */ |
| 231 | rcu_read_lock(); | 231 | rcu_read_lock(); |
| 232 | if (!first) | 232 | if (!first) |
| 233 | first = find_task_by_pid(tgid); | 233 | first = find_task_by_vpid(tgid); |
| 234 | 234 | ||
| 235 | if (!first || !lock_task_sighand(first, &flags)) | 235 | if (!first || !lock_task_sighand(first, &flags)) |
| 236 | goto out; | 236 | goto out; |
| @@ -547,7 +547,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
| 547 | if (!stats) | 547 | if (!stats) |
| 548 | goto err; | 548 | goto err; |
| 549 | 549 | ||
| 550 | rc = fill_pid(tsk->pid, tsk, stats); | 550 | rc = fill_pid(-1, tsk, stats); |
| 551 | if (rc < 0) | 551 | if (rc < 0) |
| 552 | goto err; | 552 | goto err; |
| 553 | 553 | ||
diff --git a/kernel/time.c b/kernel/time.c index a5ec013b6c80..6a08660b4fac 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -35,6 +35,8 @@ | |||
| 35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
| 36 | #include <linux/security.h> | 36 | #include <linux/security.h> |
| 37 | #include <linux/fs.h> | 37 | #include <linux/fs.h> |
| 38 | #include <linux/slab.h> | ||
| 39 | #include <linux/math64.h> | ||
| 38 | 40 | ||
| 39 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
| 40 | #include <asm/unistd.h> | 42 | #include <asm/unistd.h> |
| @@ -244,7 +246,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) | |||
| 244 | return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); | 246 | return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); |
| 245 | #else | 247 | #else |
| 246 | # if BITS_PER_LONG == 32 | 248 | # if BITS_PER_LONG == 32 |
| 247 | return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; | 249 | return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; |
| 248 | # else | 250 | # else |
| 249 | return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; | 251 | return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; |
| 250 | # endif | 252 | # endif |
| @@ -260,7 +262,7 @@ unsigned int inline jiffies_to_usecs(const unsigned long j) | |||
| 260 | return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); | 262 | return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); |
| 261 | #else | 263 | #else |
| 262 | # if BITS_PER_LONG == 32 | 264 | # if BITS_PER_LONG == 32 |
| 263 | return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; | 265 | return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; |
| 264 | # else | 266 | # else |
| 265 | return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; | 267 | return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; |
| 266 | # endif | 268 | # endif |
| @@ -379,6 +381,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) | |||
| 379 | ts->tv_sec = sec; | 381 | ts->tv_sec = sec; |
| 380 | ts->tv_nsec = nsec; | 382 | ts->tv_nsec = nsec; |
| 381 | } | 383 | } |
| 384 | EXPORT_SYMBOL(set_normalized_timespec); | ||
| 382 | 385 | ||
| 383 | /** | 386 | /** |
| 384 | * ns_to_timespec - Convert nanoseconds to timespec | 387 | * ns_to_timespec - Convert nanoseconds to timespec |
| @@ -389,13 +392,17 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) | |||
| 389 | struct timespec ns_to_timespec(const s64 nsec) | 392 | struct timespec ns_to_timespec(const s64 nsec) |
| 390 | { | 393 | { |
| 391 | struct timespec ts; | 394 | struct timespec ts; |
| 395 | s32 rem; | ||
| 392 | 396 | ||
| 393 | if (!nsec) | 397 | if (!nsec) |
| 394 | return (struct timespec) {0, 0}; | 398 | return (struct timespec) {0, 0}; |
| 395 | 399 | ||
| 396 | ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); | 400 | ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); |
| 397 | if (unlikely(nsec < 0)) | 401 | if (unlikely(rem < 0)) { |
| 398 | set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); | 402 | ts.tv_sec--; |
| 403 | rem += NSEC_PER_SEC; | ||
| 404 | } | ||
| 405 | ts.tv_nsec = rem; | ||
| 399 | 406 | ||
| 400 | return ts; | 407 | return ts; |
| 401 | } | 408 | } |
| @@ -469,7 +476,7 @@ unsigned long msecs_to_jiffies(const unsigned int m) | |||
| 469 | if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) | 476 | if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) |
| 470 | return MAX_JIFFY_OFFSET; | 477 | return MAX_JIFFY_OFFSET; |
| 471 | 478 | ||
| 472 | return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) | 479 | return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) |
| 473 | >> MSEC_TO_HZ_SHR32; | 480 | >> MSEC_TO_HZ_SHR32; |
| 474 | #endif | 481 | #endif |
| 475 | } | 482 | } |
| @@ -484,7 +491,7 @@ unsigned long usecs_to_jiffies(const unsigned int u) | |||
| 484 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) | 491 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) |
| 485 | return u * (HZ / USEC_PER_SEC); | 492 | return u * (HZ / USEC_PER_SEC); |
| 486 | #else | 493 | #else |
| 487 | return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) | 494 | return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) |
| 488 | >> USEC_TO_HZ_SHR32; | 495 | >> USEC_TO_HZ_SHR32; |
| 489 | #endif | 496 | #endif |
| 490 | } | 497 | } |
| @@ -525,8 +532,10 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) | |||
| 525 | * Convert jiffies to nanoseconds and separate with | 532 | * Convert jiffies to nanoseconds and separate with |
| 526 | * one divide. | 533 | * one divide. |
| 527 | */ | 534 | */ |
| 528 | u64 nsec = (u64)jiffies * TICK_NSEC; | 535 | u32 rem; |
| 529 | value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); | 536 | value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, |
| 537 | NSEC_PER_SEC, &rem); | ||
| 538 | value->tv_nsec = rem; | ||
| 530 | } | 539 | } |
| 531 | EXPORT_SYMBOL(jiffies_to_timespec); | 540 | EXPORT_SYMBOL(jiffies_to_timespec); |
| 532 | 541 | ||
| @@ -564,12 +573,11 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) | |||
| 564 | * Convert jiffies to nanoseconds and separate with | 573 | * Convert jiffies to nanoseconds and separate with |
| 565 | * one divide. | 574 | * one divide. |
| 566 | */ | 575 | */ |
| 567 | u64 nsec = (u64)jiffies * TICK_NSEC; | 576 | u32 rem; |
| 568 | long tv_usec; | ||
| 569 | 577 | ||
| 570 | value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); | 578 | value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, |
| 571 | tv_usec /= NSEC_PER_USEC; | 579 | NSEC_PER_SEC, &rem); |
| 572 | value->tv_usec = tv_usec; | 580 | value->tv_usec = rem / NSEC_PER_USEC; |
| 573 | } | 581 | } |
| 574 | EXPORT_SYMBOL(jiffies_to_timeval); | 582 | EXPORT_SYMBOL(jiffies_to_timeval); |
| 575 | 583 | ||
| @@ -585,9 +593,7 @@ clock_t jiffies_to_clock_t(long x) | |||
| 585 | return x / (HZ / USER_HZ); | 593 | return x / (HZ / USER_HZ); |
| 586 | # endif | 594 | # endif |
| 587 | #else | 595 | #else |
| 588 | u64 tmp = (u64)x * TICK_NSEC; | 596 | return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); |
| 589 | do_div(tmp, (NSEC_PER_SEC / USER_HZ)); | ||
| 590 | return (long)tmp; | ||
| 591 | #endif | 597 | #endif |
| 592 | } | 598 | } |
| 593 | EXPORT_SYMBOL(jiffies_to_clock_t); | 599 | EXPORT_SYMBOL(jiffies_to_clock_t); |
| @@ -599,16 +605,12 @@ unsigned long clock_t_to_jiffies(unsigned long x) | |||
| 599 | return ~0UL; | 605 | return ~0UL; |
| 600 | return x * (HZ / USER_HZ); | 606 | return x * (HZ / USER_HZ); |
| 601 | #else | 607 | #else |
| 602 | u64 jif; | ||
| 603 | |||
| 604 | /* Don't worry about loss of precision here .. */ | 608 | /* Don't worry about loss of precision here .. */ |
| 605 | if (x >= ~0UL / HZ * USER_HZ) | 609 | if (x >= ~0UL / HZ * USER_HZ) |
| 606 | return ~0UL; | 610 | return ~0UL; |
| 607 | 611 | ||
| 608 | /* .. but do try to contain it here */ | 612 | /* .. but do try to contain it here */ |
| 609 | jif = x * (u64) HZ; | 613 | return div_u64((u64)x * HZ, USER_HZ); |
| 610 | do_div(jif, USER_HZ); | ||
| 611 | return jif; | ||
| 612 | #endif | 614 | #endif |
| 613 | } | 615 | } |
| 614 | EXPORT_SYMBOL(clock_t_to_jiffies); | 616 | EXPORT_SYMBOL(clock_t_to_jiffies); |
| @@ -617,10 +619,9 @@ u64 jiffies_64_to_clock_t(u64 x) | |||
| 617 | { | 619 | { |
| 618 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 | 620 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 |
| 619 | # if HZ < USER_HZ | 621 | # if HZ < USER_HZ |
| 620 | x *= USER_HZ; | 622 | x = div_u64(x * USER_HZ, HZ); |
| 621 | do_div(x, HZ); | ||
| 622 | # elif HZ > USER_HZ | 623 | # elif HZ > USER_HZ |
| 623 | do_div(x, HZ / USER_HZ); | 624 | x = div_u64(x, HZ / USER_HZ); |
| 624 | # else | 625 | # else |
| 625 | /* Nothing to do */ | 626 | /* Nothing to do */ |
| 626 | # endif | 627 | # endif |
| @@ -630,8 +631,7 @@ u64 jiffies_64_to_clock_t(u64 x) | |||
| 630 | * but even this doesn't overflow in hundreds of years | 631 | * but even this doesn't overflow in hundreds of years |
| 631 | * in 64 bits, so.. | 632 | * in 64 bits, so.. |
| 632 | */ | 633 | */ |
| 633 | x *= TICK_NSEC; | 634 | x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); |
| 634 | do_div(x, (NSEC_PER_SEC / USER_HZ)); | ||
| 635 | #endif | 635 | #endif |
| 636 | return x; | 636 | return x; |
| 637 | } | 637 | } |
| @@ -640,21 +640,17 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t); | |||
| 640 | u64 nsec_to_clock_t(u64 x) | 640 | u64 nsec_to_clock_t(u64 x) |
| 641 | { | 641 | { |
| 642 | #if (NSEC_PER_SEC % USER_HZ) == 0 | 642 | #if (NSEC_PER_SEC % USER_HZ) == 0 |
| 643 | do_div(x, (NSEC_PER_SEC / USER_HZ)); | 643 | return div_u64(x, NSEC_PER_SEC / USER_HZ); |
| 644 | #elif (USER_HZ % 512) == 0 | 644 | #elif (USER_HZ % 512) == 0 |
| 645 | x *= USER_HZ/512; | 645 | return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); |
| 646 | do_div(x, (NSEC_PER_SEC / 512)); | ||
| 647 | #else | 646 | #else |
| 648 | /* | 647 | /* |
| 649 | * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, | 648 | * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, |
| 650 | * overflow after 64.99 years. | 649 | * overflow after 64.99 years. |
| 651 | * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... | 650 | * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... |
| 652 | */ | 651 | */ |
| 653 | x *= 9; | 652 | return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); |
| 654 | do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) / | ||
| 655 | USER_HZ)); | ||
| 656 | #endif | 653 | #endif |
| 657 | return x; | ||
| 658 | } | 654 | } |
| 659 | 655 | ||
| 660 | #if (BITS_PER_LONG < 64) | 656 | #if (BITS_PER_LONG < 64) |
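The kernel/time.c hunks above replace the open-coded do_div()/div_long_long_rem() sequences with the <linux/math64.h> helpers div_u64(), div_u64_rem() and div_s64_rem(), and drop the (u64) casts in the 32-bit HZ conversion paths (the matching 64-bit constants disappear from timeconst.pl further down). A minimal sketch of the quotient-plus-remainder pattern, assuming the usual kernel headers (illustrative, not part of the patch):

	#include <linux/time.h>
	#include <linux/math64.h>

	static void demo_ns_to_timespec(u64 nsec, struct timespec *ts)
	{
		u32 rem;

		/* one 64-by-32 divide yields the seconds and the ns remainder */
		ts->tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
		ts->tv_nsec = rem;
	}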
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 278534bbca95..73961f35fdc8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -141,8 +141,16 @@ static void clocksource_watchdog(unsigned long data) | |||
| 141 | } | 141 | } |
| 142 | 142 | ||
| 143 | if (!list_empty(&watchdog_list)) { | 143 | if (!list_empty(&watchdog_list)) { |
| 144 | __mod_timer(&watchdog_timer, | 144 | /* |
| 145 | watchdog_timer.expires + WATCHDOG_INTERVAL); | 145 | * Cycle through CPUs to check if the CPUs stay |
| 146 | * synchronized to each other. | ||
| 147 | */ | ||
| 148 | int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); | ||
| 149 | |||
| 150 | if (next_cpu >= NR_CPUS) | ||
| 151 | next_cpu = first_cpu(cpu_online_map); | ||
| 152 | watchdog_timer.expires += WATCHDOG_INTERVAL; | ||
| 153 | add_timer_on(&watchdog_timer, next_cpu); | ||
| 146 | } | 154 | } |
| 147 | spin_unlock(&watchdog_lock); | 155 | spin_unlock(&watchdog_lock); |
| 148 | } | 156 | } |
| @@ -164,7 +172,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
| 164 | if (!started && watchdog) { | 172 | if (!started && watchdog) { |
| 165 | watchdog_last = watchdog->read(); | 173 | watchdog_last = watchdog->read(); |
| 166 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 174 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
| 167 | add_timer(&watchdog_timer); | 175 | add_timer_on(&watchdog_timer, |
| 176 | first_cpu(cpu_online_map)); | ||
| 168 | } | 177 | } |
| 169 | } else { | 178 | } else { |
| 170 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) | 179 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) |
| @@ -174,7 +183,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
| 174 | if (watchdog) | 183 | if (watchdog) |
| 175 | del_timer(&watchdog_timer); | 184 | del_timer(&watchdog_timer); |
| 176 | watchdog = cs; | 185 | watchdog = cs; |
| 177 | init_timer_deferrable(&watchdog_timer); | 186 | init_timer(&watchdog_timer); |
| 178 | watchdog_timer.function = clocksource_watchdog; | 187 | watchdog_timer.function = clocksource_watchdog; |
| 179 | 188 | ||
| 180 | /* Reset watchdog cycles */ | 189 | /* Reset watchdog cycles */ |
| @@ -185,7 +194,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
| 185 | watchdog_last = watchdog->read(); | 194 | watchdog_last = watchdog->read(); |
| 186 | watchdog_timer.expires = | 195 | watchdog_timer.expires = |
| 187 | jiffies + WATCHDOG_INTERVAL; | 196 | jiffies + WATCHDOG_INTERVAL; |
| 188 | add_timer(&watchdog_timer); | 197 | add_timer_on(&watchdog_timer, |
| 198 | first_cpu(cpu_online_map)); | ||
| 189 | } | 199 | } |
| 190 | } | 200 | } |
| 191 | } | 201 | } |
| @@ -222,6 +232,18 @@ void clocksource_resume(void) | |||
| 222 | } | 232 | } |
| 223 | 233 | ||
| 224 | /** | 234 | /** |
| 235 | * clocksource_touch_watchdog - Update watchdog | ||
| 236 | * | ||
| 237 | * Update the watchdog after exception contexts such as kgdb so as not | ||
| 238 | * to incorrectly trip the watchdog. | ||
| 239 | * | ||
| 240 | */ | ||
| 241 | void clocksource_touch_watchdog(void) | ||
| 242 | { | ||
| 243 | clocksource_resume_watchdog(); | ||
| 244 | } | ||
| 245 | |||
| 246 | /** | ||
| 225 | * clocksource_get_next - Returns the selected clocksource | 247 | * clocksource_get_next - Returns the selected clocksource |
| 226 | * | 248 | * |
| 227 | */ | 249 | */ |
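The clocksource hunks above make the watchdog timer hop across the online CPUs with add_timer_on() instead of re-arming a deferrable timer wherever it last ran, so per-CPU drift of a clocksource such as the TSC can actually be observed, and clocksource_touch_watchdog() lets exception contexts like kgdb reset the interval so a long stop is not misread as instability. The round-robin CPU selection, restated as a stand-alone sketch with the cpumask helpers of that era (illustrative only; the function name is made up):

	/* Pick the online CPU after 'cpu', wrapping to the first one. */
	static int demo_next_watchdog_cpu(int cpu)
	{
		int next = next_cpu(cpu, cpu_online_map);

		if (next >= NR_CPUS)		/* ran off the end of the map */
			next = first_cpu(cpu_online_map);
		return next;
	}

Later kernels spell the same thing cpumask_next()/cpumask_first(); the hunk predates that API.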
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5fd9b9469770..5125ddd8196b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -15,7 +15,8 @@ | |||
| 15 | #include <linux/jiffies.h> | 15 | #include <linux/jiffies.h> |
| 16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
| 17 | #include <linux/capability.h> | 17 | #include <linux/capability.h> |
| 18 | #include <asm/div64.h> | 18 | #include <linux/math64.h> |
| 19 | #include <linux/clocksource.h> | ||
| 19 | #include <asm/timex.h> | 20 | #include <asm/timex.h> |
| 20 | 21 | ||
| 21 | /* | 22 | /* |
| @@ -23,11 +24,14 @@ | |||
| 23 | */ | 24 | */ |
| 24 | unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ | 25 | unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ |
| 25 | unsigned long tick_nsec; /* ACTHZ period (nsec) */ | 26 | unsigned long tick_nsec; /* ACTHZ period (nsec) */ |
| 26 | static u64 tick_length, tick_length_base; | 27 | u64 tick_length; |
| 28 | static u64 tick_length_base; | ||
| 29 | |||
| 30 | static struct hrtimer leap_timer; | ||
| 27 | 31 | ||
| 28 | #define MAX_TICKADJ 500 /* microsecs */ | 32 | #define MAX_TICKADJ 500 /* microsecs */ |
| 29 | #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ | 33 | #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ |
| 30 | TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) | 34 | NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) |
| 31 | 35 | ||
| 32 | /* | 36 | /* |
| 33 | * phase-lock loop variables | 37 | * phase-lock loop variables |
| @@ -35,11 +39,12 @@ static u64 tick_length, tick_length_base; | |||
| 35 | /* TIME_ERROR prevents overwriting the CMOS clock */ | 39 | /* TIME_ERROR prevents overwriting the CMOS clock */ |
| 36 | static int time_state = TIME_OK; /* clock synchronization status */ | 40 | static int time_state = TIME_OK; /* clock synchronization status */ |
| 37 | int time_status = STA_UNSYNC; /* clock status bits */ | 41 | int time_status = STA_UNSYNC; /* clock status bits */ |
| 38 | static s64 time_offset; /* time adjustment (ns) */ | 42 | static long time_tai; /* TAI offset (s) */ |
| 43 | static s64 time_offset; /* time adjustment (ns) */ | ||
| 39 | static long time_constant = 2; /* pll time constant */ | 44 | static long time_constant = 2; /* pll time constant */ |
| 40 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ | 45 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ |
| 41 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | 46 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ |
| 42 | long time_freq; /* frequency offset (scaled ppm)*/ | 47 | static s64 time_freq; /* frequency offset (scaled ns/s)*/ |
| 43 | static long time_reftime; /* time at last adjustment (s) */ | 48 | static long time_reftime; /* time at last adjustment (s) */ |
| 44 | long time_adjust; | 49 | long time_adjust; |
| 45 | static long ntp_tick_adj; | 50 | static long ntp_tick_adj; |
| @@ -47,16 +52,56 @@ static long ntp_tick_adj; | |||
| 47 | static void ntp_update_frequency(void) | 52 | static void ntp_update_frequency(void) |
| 48 | { | 53 | { |
| 49 | u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) | 54 | u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) |
| 50 | << TICK_LENGTH_SHIFT; | 55 | << NTP_SCALE_SHIFT; |
| 51 | second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; | 56 | second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; |
| 52 | second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); | 57 | second_length += time_freq; |
| 53 | 58 | ||
| 54 | tick_length_base = second_length; | 59 | tick_length_base = second_length; |
| 55 | 60 | ||
| 56 | do_div(second_length, HZ); | 61 | tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; |
| 57 | tick_nsec = second_length >> TICK_LENGTH_SHIFT; | 62 | tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); |
| 63 | } | ||
| 64 | |||
| 65 | static void ntp_update_offset(long offset) | ||
| 66 | { | ||
| 67 | long mtemp; | ||
| 68 | s64 freq_adj; | ||
| 69 | |||
| 70 | if (!(time_status & STA_PLL)) | ||
| 71 | return; | ||
| 58 | 72 | ||
| 59 | do_div(tick_length_base, NTP_INTERVAL_FREQ); | 73 | if (!(time_status & STA_NANO)) |
| 74 | offset *= NSEC_PER_USEC; | ||
| 75 | |||
| 76 | /* | ||
| 77 | * Scale the phase adjustment and | ||
| 78 | * clamp to the operating range. | ||
| 79 | */ | ||
| 80 | offset = min(offset, MAXPHASE); | ||
| 81 | offset = max(offset, -MAXPHASE); | ||
| 82 | |||
| 83 | /* | ||
| 84 | * Select how the frequency is to be controlled | ||
| 85 | * and in which mode (PLL or FLL). | ||
| 86 | */ | ||
| 87 | if (time_status & STA_FREQHOLD || time_reftime == 0) | ||
| 88 | time_reftime = xtime.tv_sec; | ||
| 89 | mtemp = xtime.tv_sec - time_reftime; | ||
| 90 | time_reftime = xtime.tv_sec; | ||
| 91 | |||
| 92 | freq_adj = (s64)offset * mtemp; | ||
| 93 | freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); | ||
| 94 | time_status &= ~STA_MODE; | ||
| 95 | if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { | ||
| 96 | freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL), | ||
| 97 | mtemp); | ||
| 98 | time_status |= STA_MODE; | ||
| 99 | } | ||
| 100 | freq_adj += time_freq; | ||
| 101 | freq_adj = min(freq_adj, MAXFREQ_SCALED); | ||
| 102 | time_freq = max(freq_adj, -MAXFREQ_SCALED); | ||
| 103 | |||
| 104 | time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); | ||
| 60 | } | 105 | } |
| 61 | 106 | ||
| 62 | /** | 107 | /** |
| @@ -78,62 +123,70 @@ void ntp_clear(void) | |||
| 78 | } | 123 | } |
| 79 | 124 | ||
| 80 | /* | 125 | /* |
| 81 | * this routine handles the overflow of the microsecond field | 126 | * Leap second processing. If in leap-insert state at the end of the |
| 82 | * | 127 | * day, the system clock is set back one second; if in leap-delete |
| 83 | * The tricky bits of code to handle the accurate clock support | 128 | * state, the system clock is set ahead one second. |
| 84 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
| 85 | * They were originally developed for SUN and DEC kernels. | ||
| 86 | * All the kudos should go to Dave for this stuff. | ||
| 87 | */ | 129 | */ |
| 88 | void second_overflow(void) | 130 | static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) |
| 89 | { | 131 | { |
| 90 | long time_adj; | 132 | enum hrtimer_restart res = HRTIMER_NORESTART; |
| 91 | 133 | ||
| 92 | /* Bump the maxerror field */ | 134 | write_seqlock_irq(&xtime_lock); |
| 93 | time_maxerror += MAXFREQ >> SHIFT_USEC; | ||
| 94 | if (time_maxerror > NTP_PHASE_LIMIT) { | ||
| 95 | time_maxerror = NTP_PHASE_LIMIT; | ||
| 96 | time_status |= STA_UNSYNC; | ||
| 97 | } | ||
| 98 | 135 | ||
| 99 | /* | ||
| 100 | * Leap second processing. If in leap-insert state at the end of the | ||
| 101 | * day, the system clock is set back one second; if in leap-delete | ||
| 102 | * state, the system clock is set ahead one second. The microtime() | ||
| 103 | * routine or external clock driver will insure that reported time is | ||
| 104 | * always monotonic. The ugly divides should be replaced. | ||
| 105 | */ | ||
| 106 | switch (time_state) { | 136 | switch (time_state) { |
| 107 | case TIME_OK: | 137 | case TIME_OK: |
| 108 | if (time_status & STA_INS) | ||
| 109 | time_state = TIME_INS; | ||
| 110 | else if (time_status & STA_DEL) | ||
| 111 | time_state = TIME_DEL; | ||
| 112 | break; | 138 | break; |
| 113 | case TIME_INS: | 139 | case TIME_INS: |
| 114 | if (xtime.tv_sec % 86400 == 0) { | 140 | xtime.tv_sec--; |
| 115 | xtime.tv_sec--; | 141 | wall_to_monotonic.tv_sec++; |
| 116 | wall_to_monotonic.tv_sec++; | 142 | time_state = TIME_OOP; |
| 117 | time_state = TIME_OOP; | 143 | printk(KERN_NOTICE "Clock: " |
| 118 | printk(KERN_NOTICE "Clock: inserting leap second " | 144 | "inserting leap second 23:59:60 UTC\n"); |
| 119 | "23:59:60 UTC\n"); | 145 | leap_timer.expires = ktime_add_ns(leap_timer.expires, |
| 120 | } | 146 | NSEC_PER_SEC); |
| 147 | res = HRTIMER_RESTART; | ||
| 121 | break; | 148 | break; |
| 122 | case TIME_DEL: | 149 | case TIME_DEL: |
| 123 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 150 | xtime.tv_sec++; |
| 124 | xtime.tv_sec++; | 151 | time_tai--; |
| 125 | wall_to_monotonic.tv_sec--; | 152 | wall_to_monotonic.tv_sec--; |
| 126 | time_state = TIME_WAIT; | 153 | time_state = TIME_WAIT; |
| 127 | printk(KERN_NOTICE "Clock: deleting leap second " | 154 | printk(KERN_NOTICE "Clock: " |
| 128 | "23:59:59 UTC\n"); | 155 | "deleting leap second 23:59:59 UTC\n"); |
| 129 | } | ||
| 130 | break; | 156 | break; |
| 131 | case TIME_OOP: | 157 | case TIME_OOP: |
| 158 | time_tai++; | ||
| 132 | time_state = TIME_WAIT; | 159 | time_state = TIME_WAIT; |
| 133 | break; | 160 | /* fall through */ |
| 134 | case TIME_WAIT: | 161 | case TIME_WAIT: |
| 135 | if (!(time_status & (STA_INS | STA_DEL))) | 162 | if (!(time_status & (STA_INS | STA_DEL))) |
| 136 | time_state = TIME_OK; | 163 | time_state = TIME_OK; |
| 164 | break; | ||
| 165 | } | ||
| 166 | update_vsyscall(&xtime, clock); | ||
| 167 | |||
| 168 | write_sequnlock_irq(&xtime_lock); | ||
| 169 | |||
| 170 | return res; | ||
| 171 | } | ||
| 172 | |||
| 173 | /* | ||
| 174 | * this routine handles the overflow of the microsecond field | ||
| 175 | * | ||
| 176 | * The tricky bits of code to handle the accurate clock support | ||
| 177 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
| 178 | * They were originally developed for SUN and DEC kernels. | ||
| 179 | * All the kudos should go to Dave for this stuff. | ||
| 180 | */ | ||
| 181 | void second_overflow(void) | ||
| 182 | { | ||
| 183 | s64 time_adj; | ||
| 184 | |||
| 185 | /* Bump the maxerror field */ | ||
| 186 | time_maxerror += MAXFREQ / NSEC_PER_USEC; | ||
| 187 | if (time_maxerror > NTP_PHASE_LIMIT) { | ||
| 188 | time_maxerror = NTP_PHASE_LIMIT; | ||
| 189 | time_status |= STA_UNSYNC; | ||
| 137 | } | 190 | } |
| 138 | 191 | ||
| 139 | /* | 192 | /* |
| @@ -143,7 +196,7 @@ void second_overflow(void) | |||
| 143 | tick_length = tick_length_base; | 196 | tick_length = tick_length_base; |
| 144 | time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); | 197 | time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); |
| 145 | time_offset -= time_adj; | 198 | time_offset -= time_adj; |
| 146 | tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); | 199 | tick_length += time_adj; |
| 147 | 200 | ||
| 148 | if (unlikely(time_adjust)) { | 201 | if (unlikely(time_adjust)) { |
| 149 | if (time_adjust > MAX_TICKADJ) { | 202 | if (time_adjust > MAX_TICKADJ) { |
| @@ -154,25 +207,12 @@ void second_overflow(void) | |||
| 154 | tick_length -= MAX_TICKADJ_SCALED; | 207 | tick_length -= MAX_TICKADJ_SCALED; |
| 155 | } else { | 208 | } else { |
| 156 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / | 209 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / |
| 157 | NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; | 210 | NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; |
| 158 | time_adjust = 0; | 211 | time_adjust = 0; |
| 159 | } | 212 | } |
| 160 | } | 213 | } |
| 161 | } | 214 | } |
| 162 | 215 | ||
| 163 | /* | ||
| 164 | * Return how long ticks are at the moment, that is, how much time | ||
| 165 | * update_wall_time_one_tick will add to xtime next time we call it | ||
| 166 | * (assuming no calls to do_adjtimex in the meantime). | ||
| 167 | * The return value is in fixed-point nanoseconds shifted by the | ||
| 168 | * specified number of bits to the right of the binary point. | ||
| 169 | * This function has no side-effects. | ||
| 170 | */ | ||
| 171 | u64 current_tick_length(void) | ||
| 172 | { | ||
| 173 | return tick_length; | ||
| 174 | } | ||
| 175 | |||
| 176 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 216 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
| 177 | 217 | ||
| 178 | /* Disable the cmos update - used by virtualization and embedded */ | 218 | /* Disable the cmos update - used by virtualization and embedded */ |
| @@ -236,8 +276,8 @@ static inline void notify_cmos_timer(void) { } | |||
| 236 | */ | 276 | */ |
| 237 | int do_adjtimex(struct timex *txc) | 277 | int do_adjtimex(struct timex *txc) |
| 238 | { | 278 | { |
| 239 | long mtemp, save_adjust, rem; | 279 | struct timespec ts; |
| 240 | s64 freq_adj, temp64; | 280 | long save_adjust, sec; |
| 241 | int result; | 281 | int result; |
| 242 | 282 | ||
| 243 | /* In order to modify anything, you gotta be super-user! */ | 283 | /* In order to modify anything, you gotta be super-user! */ |
| @@ -247,147 +287,132 @@ int do_adjtimex(struct timex *txc) | |||
| 247 | /* Now we validate the data before disabling interrupts */ | 287 | /* Now we validate the data before disabling interrupts */ |
| 248 | 288 | ||
| 249 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { | 289 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { |
| 250 | /* singleshot must not be used with any other mode bits */ | 290 | /* singleshot must not be used with any other mode bits */ |
| 251 | if (txc->modes != ADJ_OFFSET_SINGLESHOT && | 291 | if (txc->modes & ~ADJ_OFFSET_SS_READ) |
| 252 | txc->modes != ADJ_OFFSET_SS_READ) | ||
| 253 | return -EINVAL; | 292 | return -EINVAL; |
| 254 | } | 293 | } |
| 255 | 294 | ||
| 256 | if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) | ||
| 257 | /* adjustment Offset limited to +- .512 seconds */ | ||
| 258 | if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) | ||
| 259 | return -EINVAL; | ||
| 260 | |||
| 261 | /* if the quartz is off by more than 10% something is VERY wrong ! */ | 295 | /* if the quartz is off by more than 10% something is VERY wrong ! */ |
| 262 | if (txc->modes & ADJ_TICK) | 296 | if (txc->modes & ADJ_TICK) |
| 263 | if (txc->tick < 900000/USER_HZ || | 297 | if (txc->tick < 900000/USER_HZ || |
| 264 | txc->tick > 1100000/USER_HZ) | 298 | txc->tick > 1100000/USER_HZ) |
| 265 | return -EINVAL; | 299 | return -EINVAL; |
| 266 | 300 | ||
| 301 | if (time_state != TIME_OK && txc->modes & ADJ_STATUS) | ||
| 302 | hrtimer_cancel(&leap_timer); | ||
| 303 | getnstimeofday(&ts); | ||
| 304 | |||
| 267 | write_seqlock_irq(&xtime_lock); | 305 | write_seqlock_irq(&xtime_lock); |
| 268 | result = time_state; /* mostly `TIME_OK' */ | ||
| 269 | 306 | ||
| 270 | /* Save for later - semantics of adjtime is to return old value */ | 307 | /* Save for later - semantics of adjtime is to return old value */ |
| 271 | save_adjust = time_adjust; | 308 | save_adjust = time_adjust; |
| 272 | 309 | ||
| 273 | #if 0 /* STA_CLOCKERR is never set yet */ | ||
| 274 | time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ | ||
| 275 | #endif | ||
| 276 | /* If there are input parameters, then process them */ | 310 | /* If there are input parameters, then process them */ |
| 277 | if (txc->modes) | 311 | if (txc->modes) { |
| 278 | { | 312 | if (txc->modes & ADJ_STATUS) { |
| 279 | if (txc->modes & ADJ_STATUS) /* only set allowed bits */ | 313 | if ((time_status & STA_PLL) && |
| 280 | time_status = (txc->status & ~STA_RONLY) | | 314 | !(txc->status & STA_PLL)) { |
| 281 | (time_status & STA_RONLY); | 315 | time_state = TIME_OK; |
| 282 | 316 | time_status = STA_UNSYNC; | |
| 283 | if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ | 317 | } |
| 284 | if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { | 318 | /* only set allowed bits */ |
| 285 | result = -EINVAL; | 319 | time_status &= STA_RONLY; |
| 286 | goto leave; | 320 | time_status |= txc->status & ~STA_RONLY; |
| 287 | } | 321 | |
| 288 | time_freq = ((s64)txc->freq * NSEC_PER_USEC) | 322 | switch (time_state) { |
| 289 | >> (SHIFT_USEC - SHIFT_NSEC); | 323 | case TIME_OK: |
| 290 | } | 324 | start_timer: |
| 291 | 325 | sec = ts.tv_sec; | |
| 292 | if (txc->modes & ADJ_MAXERROR) { | 326 | if (time_status & STA_INS) { |
| 293 | if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { | 327 | time_state = TIME_INS; |
| 294 | result = -EINVAL; | 328 | sec += 86400 - sec % 86400; |
| 295 | goto leave; | 329 | hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); |
| 330 | } else if (time_status & STA_DEL) { | ||
| 331 | time_state = TIME_DEL; | ||
| 332 | sec += 86400 - (sec + 1) % 86400; | ||
| 333 | hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); | ||
| 334 | } | ||
| 335 | break; | ||
| 336 | case TIME_INS: | ||
| 337 | case TIME_DEL: | ||
| 338 | time_state = TIME_OK; | ||
| 339 | goto start_timer; | ||
| 340 | break; | ||
| 341 | case TIME_WAIT: | ||
| 342 | if (!(time_status & (STA_INS | STA_DEL))) | ||
| 343 | time_state = TIME_OK; | ||
| 344 | break; | ||
| 345 | case TIME_OOP: | ||
| 346 | hrtimer_restart(&leap_timer); | ||
| 347 | break; | ||
| 348 | } | ||
| 296 | } | 349 | } |
| 297 | time_maxerror = txc->maxerror; | ||
| 298 | } | ||
| 299 | 350 | ||
| 300 | if (txc->modes & ADJ_ESTERROR) { | 351 | if (txc->modes & ADJ_NANO) |
| 301 | if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { | 352 | time_status |= STA_NANO; |
| 302 | result = -EINVAL; | 353 | if (txc->modes & ADJ_MICRO) |
| 303 | goto leave; | 354 | time_status &= ~STA_NANO; |
| 355 | |||
| 356 | if (txc->modes & ADJ_FREQUENCY) { | ||
| 357 | time_freq = (s64)txc->freq * PPM_SCALE; | ||
| 358 | time_freq = min(time_freq, MAXFREQ_SCALED); | ||
| 359 | time_freq = max(time_freq, -MAXFREQ_SCALED); | ||
| 304 | } | 360 | } |
| 305 | time_esterror = txc->esterror; | ||
| 306 | } | ||
| 307 | 361 | ||
| 308 | if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ | 362 | if (txc->modes & ADJ_MAXERROR) |
| 309 | if (txc->constant < 0) { /* NTP v4 uses values > 6 */ | 363 | time_maxerror = txc->maxerror; |
| 310 | result = -EINVAL; | 364 | if (txc->modes & ADJ_ESTERROR) |
| 311 | goto leave; | 365 | time_esterror = txc->esterror; |
| 366 | |||
| 367 | if (txc->modes & ADJ_TIMECONST) { | ||
| 368 | time_constant = txc->constant; | ||
| 369 | if (!(time_status & STA_NANO)) | ||
| 370 | time_constant += 4; | ||
| 371 | time_constant = min(time_constant, (long)MAXTC); | ||
| 372 | time_constant = max(time_constant, 0l); | ||
| 312 | } | 373 | } |
| 313 | time_constant = min(txc->constant + 4, (long)MAXTC); | ||
| 314 | } | ||
| 315 | 374 | ||
| 316 | if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ | 375 | if (txc->modes & ADJ_TAI && txc->constant > 0) |
| 317 | if (txc->modes == ADJ_OFFSET_SINGLESHOT) { | 376 | time_tai = txc->constant; |
| 318 | /* adjtime() is independent from ntp_adjtime() */ | 377 | |
| 319 | time_adjust = txc->offset; | 378 | if (txc->modes & ADJ_OFFSET) { |
| 379 | if (txc->modes == ADJ_OFFSET_SINGLESHOT) | ||
| 380 | /* adjtime() is independent from ntp_adjtime() */ | ||
| 381 | time_adjust = txc->offset; | ||
| 382 | else | ||
| 383 | ntp_update_offset(txc->offset); | ||
| 320 | } | 384 | } |
| 321 | else if (time_status & STA_PLL) { | 385 | if (txc->modes & ADJ_TICK) |
| 322 | time_offset = txc->offset * NSEC_PER_USEC; | 386 | tick_usec = txc->tick; |
| 323 | 387 | ||
| 324 | /* | 388 | if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) |
| 325 | * Scale the phase adjustment and | 389 | ntp_update_frequency(); |
| 326 | * clamp to the operating range. | 390 | } |
| 327 | */ | 391 | |
| 328 | time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC); | 392 | result = time_state; /* mostly `TIME_OK' */ |
| 329 | time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC); | 393 | if (time_status & (STA_UNSYNC|STA_CLOCKERR)) |
| 330 | |||
| 331 | /* | ||
| 332 | * Select whether the frequency is to be controlled | ||
| 333 | * and in which mode (PLL or FLL). Clamp to the operating | ||
| 334 | * range. Ugly multiply/divide should be replaced someday. | ||
| 335 | */ | ||
| 336 | |||
| 337 | if (time_status & STA_FREQHOLD || time_reftime == 0) | ||
| 338 | time_reftime = xtime.tv_sec; | ||
| 339 | mtemp = xtime.tv_sec - time_reftime; | ||
| 340 | time_reftime = xtime.tv_sec; | ||
| 341 | |||
| 342 | freq_adj = time_offset * mtemp; | ||
| 343 | freq_adj = shift_right(freq_adj, time_constant * 2 + | ||
| 344 | (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); | ||
| 345 | if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { | ||
| 346 | u64 utemp64; | ||
| 347 | temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); | ||
| 348 | if (time_offset < 0) { | ||
| 349 | utemp64 = -temp64; | ||
| 350 | do_div(utemp64, mtemp); | ||
| 351 | freq_adj -= utemp64; | ||
| 352 | } else { | ||
| 353 | utemp64 = temp64; | ||
| 354 | do_div(utemp64, mtemp); | ||
| 355 | freq_adj += utemp64; | ||
| 356 | } | ||
| 357 | } | ||
| 358 | freq_adj += time_freq; | ||
| 359 | freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); | ||
| 360 | time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); | ||
| 361 | time_offset = div_long_long_rem_signed(time_offset, | ||
| 362 | NTP_INTERVAL_FREQ, | ||
| 363 | &rem); | ||
| 364 | time_offset <<= SHIFT_UPDATE; | ||
| 365 | } /* STA_PLL */ | ||
| 366 | } /* txc->modes & ADJ_OFFSET */ | ||
| 367 | if (txc->modes & ADJ_TICK) | ||
| 368 | tick_usec = txc->tick; | ||
| 369 | |||
| 370 | if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) | ||
| 371 | ntp_update_frequency(); | ||
| 372 | } /* txc->modes */ | ||
| 373 | leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) | ||
| 374 | result = TIME_ERROR; | 394 | result = TIME_ERROR; |
| 375 | 395 | ||
| 376 | if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || | 396 | if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || |
| 377 | (txc->modes == ADJ_OFFSET_SS_READ)) | 397 | (txc->modes == ADJ_OFFSET_SS_READ)) |
| 378 | txc->offset = save_adjust; | 398 | txc->offset = save_adjust; |
| 379 | else | 399 | else { |
| 380 | txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * | 400 | txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, |
| 381 | NTP_INTERVAL_FREQ / 1000; | 401 | NTP_SCALE_SHIFT); |
| 382 | txc->freq = (time_freq / NSEC_PER_USEC) << | 402 | if (!(time_status & STA_NANO)) |
| 383 | (SHIFT_USEC - SHIFT_NSEC); | 403 | txc->offset /= NSEC_PER_USEC; |
| 404 | } | ||
| 405 | txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * | ||
| 406 | (s64)PPM_SCALE_INV, | ||
| 407 | NTP_SCALE_SHIFT); | ||
| 384 | txc->maxerror = time_maxerror; | 408 | txc->maxerror = time_maxerror; |
| 385 | txc->esterror = time_esterror; | 409 | txc->esterror = time_esterror; |
| 386 | txc->status = time_status; | 410 | txc->status = time_status; |
| 387 | txc->constant = time_constant; | 411 | txc->constant = time_constant; |
| 388 | txc->precision = 1; | 412 | txc->precision = 1; |
| 389 | txc->tolerance = MAXFREQ; | 413 | txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; |
| 390 | txc->tick = tick_usec; | 414 | txc->tick = tick_usec; |
| 415 | txc->tai = time_tai; | ||
| 391 | 416 | ||
| 392 | /* PPS is not implemented, so these are zero */ | 417 | /* PPS is not implemented, so these are zero */ |
| 393 | txc->ppsfreq = 0; | 418 | txc->ppsfreq = 0; |
| @@ -399,9 +424,15 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) | |||
| 399 | txc->errcnt = 0; | 424 | txc->errcnt = 0; |
| 400 | txc->stbcnt = 0; | 425 | txc->stbcnt = 0; |
| 401 | write_sequnlock_irq(&xtime_lock); | 426 | write_sequnlock_irq(&xtime_lock); |
| 402 | do_gettimeofday(&txc->time); | 427 | |
| 428 | txc->time.tv_sec = ts.tv_sec; | ||
| 429 | txc->time.tv_usec = ts.tv_nsec; | ||
| 430 | if (!(time_status & STA_NANO)) | ||
| 431 | txc->time.tv_usec /= NSEC_PER_USEC; | ||
| 432 | |||
| 403 | notify_cmos_timer(); | 433 | notify_cmos_timer(); |
| 404 | return(result); | 434 | |
| 435 | return result; | ||
| 405 | } | 436 | } |
| 406 | 437 | ||
| 407 | static int __init ntp_tick_adj_setup(char *str) | 438 | static int __init ntp_tick_adj_setup(char *str) |
| @@ -411,3 +442,10 @@ static int __init ntp_tick_adj_setup(char *str) | |||
| 411 | } | 442 | } |
| 412 | 443 | ||
| 413 | __setup("ntp_tick_adj=", ntp_tick_adj_setup); | 444 | __setup("ntp_tick_adj=", ntp_tick_adj_setup); |
| 445 | |||
| 446 | void __init ntp_init(void) | ||
| 447 | { | ||
| 448 | ntp_clear(); | ||
| 449 | hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); | ||
| 450 | leap_timer.function = ntp_leap_second; | ||
| 451 | } | ||
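The ntp.c rework above moves the code to the NTP4-style nanosecond interface: time_freq is kept in scaled ns/s, TICK_LENGTH_SHIFT becomes NTP_SCALE_SHIFT, ADJ_NANO/ADJ_MICRO and ADJ_TAI are honoured, and leap-second insert/delete no longer piggybacks on second_overflow() but fires from a CLOCK_REALTIME hrtimer armed at the next UTC midnight (sec += 86400 - sec % 86400). A minimal sketch of that absolute-time hrtimer pattern (illustrative; the names are made up):

	static struct hrtimer demo_timer;

	static enum hrtimer_restart demo_fire(struct hrtimer *timer)
	{
		/* one-shot work here; return HRTIMER_RESTART to re-arm */
		return HRTIMER_NORESTART;
	}

	static void demo_arm_at(time_t wall_sec)
	{
		hrtimer_init(&demo_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
		demo_timer.function = demo_fire;
		/* expire exactly when the wall clock reaches wall_sec */
		hrtimer_start(&demo_timer, ktime_set(wall_sec, 0), HRTIMER_MODE_ABS);
	}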
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e1bd50cbbf5d..57a1f02e5ec0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -14,7 +14,7 @@ | |||
| 14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
| 15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
| 16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
| 17 | #include <linux/irq.h> | 17 | #include <linux/interrupt.h> |
| 18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
| 19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| @@ -262,7 +262,7 @@ out: | |||
| 262 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | 262 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) |
| 263 | { | 263 | { |
| 264 | if (!cpu_isset(*oncpu, cpu_online_map)) | 264 | if (!cpu_isset(*oncpu, cpu_online_map)) |
| 265 | printk(KERN_ERR "tick-braodcast: ignoring broadcast for " | 265 | printk(KERN_ERR "tick-broadcast: ignoring broadcast for " |
| 266 | "offline CPU #%d\n", *oncpu); | 266 | "offline CPU #%d\n", *oncpu); |
| 267 | else | 267 | else |
| 268 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, | 268 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 1bea399a9ef0..4f3886562b8c 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -14,12 +14,14 @@ | |||
| 14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
| 15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
| 16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
| 17 | #include <linux/irq.h> | 17 | #include <linux/interrupt.h> |
| 18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
| 19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 21 | #include <linux/tick.h> | 21 | #include <linux/tick.h> |
| 22 | 22 | ||
| 23 | #include <asm/irq_regs.h> | ||
| 24 | |||
| 23 | #include "tick-internal.h" | 25 | #include "tick-internal.h" |
| 24 | 26 | ||
| 25 | /* | 27 | /* |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0258d3115d54..450c04935b66 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
| @@ -14,7 +14,7 @@ | |||
| 14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
| 15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
| 16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
| 17 | #include <linux/irq.h> | 17 | #include <linux/interrupt.h> |
| 18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
| 19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 686da821d376..b854a895591e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -158,9 +158,8 @@ void tick_nohz_stop_idle(int cpu) | |||
| 158 | } | 158 | } |
| 159 | } | 159 | } |
| 160 | 160 | ||
| 161 | static ktime_t tick_nohz_start_idle(int cpu) | 161 | static ktime_t tick_nohz_start_idle(struct tick_sched *ts) |
| 162 | { | 162 | { |
| 163 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
| 164 | ktime_t now, delta; | 163 | ktime_t now, delta; |
| 165 | 164 | ||
| 166 | now = ktime_get(); | 165 | now = ktime_get(); |
| @@ -192,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | |||
| 192 | void tick_nohz_stop_sched_tick(void) | 191 | void tick_nohz_stop_sched_tick(void) |
| 193 | { | 192 | { |
| 194 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 193 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
| 195 | unsigned long rt_jiffies; | ||
| 196 | struct tick_sched *ts; | 194 | struct tick_sched *ts; |
| 197 | ktime_t last_update, expires, now; | 195 | ktime_t last_update, expires, now; |
| 198 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 196 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
| @@ -201,8 +199,8 @@ void tick_nohz_stop_sched_tick(void) | |||
| 201 | local_irq_save(flags); | 199 | local_irq_save(flags); |
| 202 | 200 | ||
| 203 | cpu = smp_processor_id(); | 201 | cpu = smp_processor_id(); |
| 204 | now = tick_nohz_start_idle(cpu); | ||
| 205 | ts = &per_cpu(tick_cpu_sched, cpu); | 202 | ts = &per_cpu(tick_cpu_sched, cpu); |
| 203 | now = tick_nohz_start_idle(ts); | ||
| 206 | 204 | ||
| 207 | /* | 205 | /* |
| 208 | * If this cpu is offline and it is the one which updates | 206 | * If this cpu is offline and it is the one which updates |
| @@ -222,7 +220,6 @@ void tick_nohz_stop_sched_tick(void) | |||
| 222 | if (need_resched()) | 220 | if (need_resched()) |
| 223 | goto end; | 221 | goto end; |
| 224 | 222 | ||
| 225 | cpu = smp_processor_id(); | ||
| 226 | if (unlikely(local_softirq_pending())) { | 223 | if (unlikely(local_softirq_pending())) { |
| 227 | static int ratelimit; | 224 | static int ratelimit; |
| 228 | 225 | ||
| @@ -245,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) | |||
| 245 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 242 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
| 246 | delta_jiffies = next_jiffies - last_jiffies; | 243 | delta_jiffies = next_jiffies - last_jiffies; |
| 247 | 244 | ||
| 248 | rt_jiffies = rt_needs_cpu(cpu); | ||
| 249 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
| 250 | delta_jiffies = rt_jiffies; | ||
| 251 | |||
| 252 | if (rcu_needs_cpu(cpu)) | 245 | if (rcu_needs_cpu(cpu)) |
| 253 | delta_jiffies = 1; | 246 | delta_jiffies = 1; |
| 254 | /* | 247 | /* |
| @@ -400,6 +393,7 @@ void tick_nohz_restart_sched_tick(void) | |||
| 400 | sub_preempt_count(HARDIRQ_OFFSET); | 393 | sub_preempt_count(HARDIRQ_OFFSET); |
| 401 | } | 394 | } |
| 402 | 395 | ||
| 396 | touch_softlockup_watchdog(); | ||
| 403 | /* | 397 | /* |
| 404 | * Cancel the scheduled timer and restore the tick | 398 | * Cancel the scheduled timer and restore the tick |
| 405 | */ | 399 | */ |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a3fa587c350c..e91c29f961c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -53,7 +53,7 @@ void update_xtime_cache(u64 nsec) | |||
| 53 | timespec_add_ns(&xtime_cache, nsec); | 53 | timespec_add_ns(&xtime_cache, nsec); |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | static struct clocksource *clock; /* pointer to current clocksource */ | 56 | struct clocksource *clock; |
| 57 | 57 | ||
| 58 | 58 | ||
| 59 | #ifdef CONFIG_GENERIC_TIME | 59 | #ifdef CONFIG_GENERIC_TIME |
| @@ -178,6 +178,7 @@ static void change_clocksource(void) | |||
| 178 | if (clock == new) | 178 | if (clock == new) |
| 179 | return; | 179 | return; |
| 180 | 180 | ||
| 181 | new->cycle_last = 0; | ||
| 181 | now = clocksource_read(new); | 182 | now = clocksource_read(new); |
| 182 | nsec = __get_nsec_offset(); | 183 | nsec = __get_nsec_offset(); |
| 183 | timespec_add_ns(&xtime, nsec); | 184 | timespec_add_ns(&xtime, nsec); |
| @@ -245,7 +246,7 @@ void __init timekeeping_init(void) | |||
| 245 | 246 | ||
| 246 | write_seqlock_irqsave(&xtime_lock, flags); | 247 | write_seqlock_irqsave(&xtime_lock, flags); |
| 247 | 248 | ||
| 248 | ntp_clear(); | 249 | ntp_init(); |
| 249 | 250 | ||
| 250 | clock = clocksource_get_next(); | 251 | clock = clocksource_get_next(); |
| 251 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | 252 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); |
| @@ -295,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev) | |||
| 295 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); | 296 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); |
| 296 | update_xtime_cache(0); | 297 | update_xtime_cache(0); |
| 297 | /* re-base the last cycle value */ | 298 | /* re-base the last cycle value */ |
| 299 | clock->cycle_last = 0; | ||
| 298 | clock->cycle_last = clocksource_read(clock); | 300 | clock->cycle_last = clocksource_read(clock); |
| 299 | clock->error = 0; | 301 | clock->error = 0; |
| 300 | timekeeping_suspended = 0; | 302 | timekeeping_suspended = 0; |
| @@ -369,7 +371,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, | |||
| 369 | * here. This is tuned so that an error of about 1 msec is adjusted | 371 | * here. This is tuned so that an error of about 1 msec is adjusted |
| 370 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). | 372 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). |
| 371 | */ | 373 | */ |
| 372 | error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); | 374 | error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); |
| 373 | error2 = abs(error2); | 375 | error2 = abs(error2); |
| 374 | for (look_ahead = 0; error2 > 0; look_ahead++) | 376 | for (look_ahead = 0; error2 > 0; look_ahead++) |
| 375 | error2 >>= 2; | 377 | error2 >>= 2; |
| @@ -378,8 +380,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, | |||
| 378 | * Now calculate the error in (1 << look_ahead) ticks, but first | 380 | * Now calculate the error in (1 << look_ahead) ticks, but first |
| 379 | * remove the single look ahead already included in the error. | 381 | * remove the single look ahead already included in the error. |
| 380 | */ | 382 | */ |
| 381 | tick_error = current_tick_length() >> | 383 | tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); |
| 382 | (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
| 383 | tick_error -= clock->xtime_interval >> 1; | 384 | tick_error -= clock->xtime_interval >> 1; |
| 384 | error = ((error - tick_error) >> look_ahead) + tick_error; | 385 | error = ((error - tick_error) >> look_ahead) + tick_error; |
| 385 | 386 | ||
| @@ -410,7 +411,7 @@ static void clocksource_adjust(s64 offset) | |||
| 410 | s64 error, interval = clock->cycle_interval; | 411 | s64 error, interval = clock->cycle_interval; |
| 411 | int adj; | 412 | int adj; |
| 412 | 413 | ||
| 413 | error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); | 414 | error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); |
| 414 | if (error > interval) { | 415 | if (error > interval) { |
| 415 | error >>= 2; | 416 | error >>= 2; |
| 416 | if (likely(error <= interval)) | 417 | if (likely(error <= interval)) |
| @@ -432,7 +433,7 @@ static void clocksource_adjust(s64 offset) | |||
| 432 | clock->xtime_interval += interval; | 433 | clock->xtime_interval += interval; |
| 433 | clock->xtime_nsec -= offset; | 434 | clock->xtime_nsec -= offset; |
| 434 | clock->error -= (interval - offset) << | 435 | clock->error -= (interval - offset) << |
| 435 | (TICK_LENGTH_SHIFT - clock->shift); | 436 | (NTP_SCALE_SHIFT - clock->shift); |
| 436 | } | 437 | } |
| 437 | 438 | ||
| 438 | /** | 439 | /** |
| @@ -471,8 +472,8 @@ void update_wall_time(void) | |||
| 471 | } | 472 | } |
| 472 | 473 | ||
| 473 | /* accumulate error between NTP and clock interval */ | 474 | /* accumulate error between NTP and clock interval */ |
| 474 | clock->error += current_tick_length(); | 475 | clock->error += tick_length; |
| 475 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | 476 | clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); |
| 476 | } | 477 | } |
| 477 | 478 | ||
| 478 | /* correct the clock when NTP error is too big */ | 479 | /* correct the clock when NTP error is too big */ |
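In timekeeping.c the current_tick_length() accessor goes away in favour of reading the now-exported tick_length directly, the fixed-point shifts switch to NTP_SCALE_SHIFT, timekeeping_init() calls ntp_init() so the leap hrtimer gets set up, and cycle_last is zeroed before re-reading a clocksource on switch or resume. The per-tick error bookkeeping visible in the last hunk, restated as a sketch (assuming the struct clocksource fields of this era; illustrative only, not kernel code):

	static void demo_accumulate_error(struct clocksource *cs, u64 tick_length)
	{
		/* what NTP wanted this tick, already shifted left by NTP_SCALE_SHIFT ... */
		cs->error += tick_length;
		/* ... minus what the clocksource actually advanced xtime by */
		cs->error -= cs->xtime_interval << (NTP_SCALE_SHIFT - cs->shift);
	}

clocksource_adjust() then nudges the multiplier whenever this accumulated error grows beyond one clock interval.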
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 67fe8fc21fb1..a40e20fd0001 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
| @@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void) | |||
| 278 | { | 278 | { |
| 279 | struct proc_dir_entry *pe; | 279 | struct proc_dir_entry *pe; |
| 280 | 280 | ||
| 281 | pe = create_proc_entry("timer_list", 0644, NULL); | 281 | pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); |
| 282 | if (!pe) | 282 | if (!pe) |
| 283 | return -ENOMEM; | 283 | return -ENOMEM; |
| 284 | |||
| 285 | pe->proc_fops = &timer_list_fops; | ||
| 286 | |||
| 287 | return 0; | 284 | return 0; |
| 288 | } | 285 | } |
| 289 | __initcall(init_timer_list_procfs); | 286 | __initcall(init_timer_list_procfs); |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 417da8c5bc72..c994530d166d 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
| @@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void) | |||
| 415 | { | 415 | { |
| 416 | struct proc_dir_entry *pe; | 416 | struct proc_dir_entry *pe; |
| 417 | 417 | ||
| 418 | pe = create_proc_entry("timer_stats", 0644, NULL); | 418 | pe = proc_create("timer_stats", 0644, NULL, &tstats_fops); |
| 419 | if (!pe) | 419 | if (!pe) |
| 420 | return -ENOMEM; | 420 | return -ENOMEM; |
| 421 | |||
| 422 | pe->proc_fops = &tstats_fops; | ||
| 423 | |||
| 424 | return 0; | 421 | return 0; |
| 425 | } | 422 | } |
| 426 | __initcall(init_tstats_procfs); | 423 | __initcall(init_tstats_procfs); |
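Both timer_list.c and timer_stats.c switch from create_proc_entry() plus a separate proc_fops assignment to proc_create(), which registers the entry together with its file_operations and so removes the window in which the /proc file exists without handlers. A sketch of the pattern (illustrative; demo_fops is assumed to be a file_operations table defined elsewhere):

	static int __init demo_proc_init(void)
	{
		struct proc_dir_entry *pe;

		pe = proc_create("demo_stats", 0444, NULL, &demo_fops);
		if (!pe)
			return -ENOMEM;
		return 0;
	}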
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl index 41468035473c..eb51d76e058a 100644 --- a/kernel/timeconst.pl +++ b/kernel/timeconst.pl | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | #!/usr/bin/perl | 1 | #!/usr/bin/perl |
| 2 | # ----------------------------------------------------------------------- | 2 | # ----------------------------------------------------------------------- |
| 3 | # | 3 | # |
| 4 | # Copyright 2007 rPath, Inc. - All Rights Reserved | 4 | # Copyright 2007-2008 rPath, Inc. - All Rights Reserved |
| 5 | # | 5 | # |
| 6 | # This file is part of the Linux kernel, and is made available under | 6 | # This file is part of the Linux kernel, and is made available under |
| 7 | # the terms of the GNU General Public License version 2 or (at your | 7 | # the terms of the GNU General Public License version 2 or (at your |
| @@ -20,198 +20,138 @@ | |||
| 20 | %canned_values = ( | 20 | %canned_values = ( |
| 21 | 24 => [ | 21 | 24 => [ |
| 22 | '0xa6aaaaab','0x2aaaaaa',26, | 22 | '0xa6aaaaab','0x2aaaaaa',26, |
| 23 | '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58, | ||
| 24 | 125,3, | 23 | 125,3, |
| 25 | '0xc49ba5e4','0x1fbe76c8b4',37, | 24 | '0xc49ba5e4','0x1fbe76c8b4',37, |
| 26 | '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69, | ||
| 27 | 3,125, | 25 | 3,125, |
| 28 | '0xa2c2aaab','0xaaaa',16, | 26 | '0xa2c2aaab','0xaaaa',16, |
| 29 | '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48, | ||
| 30 | 125000,3, | 27 | 125000,3, |
| 31 | '0xc9539b89','0x7fffbce4217d',47, | 28 | '0xc9539b89','0x7fffbce4217d',47, |
| 32 | '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79, | ||
| 33 | 3,125000, | 29 | 3,125000, |
| 34 | ], 32 => [ | 30 | ], 32 => [ |
| 35 | '0xfa000000','0x6000000',27, | 31 | '0xfa000000','0x6000000',27, |
| 36 | '0xfa00000000000000','0x600000000000000',59, | ||
| 37 | 125,4, | 32 | 125,4, |
| 38 | '0x83126e98','0xfdf3b645a',36, | 33 | '0x83126e98','0xfdf3b645a',36, |
| 39 | '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68, | ||
| 40 | 4,125, | 34 | 4,125, |
| 41 | '0xf4240000','0x0',17, | 35 | '0xf4240000','0x0',17, |
| 42 | '0xf424000000000000','0x0',49, | ||
| 43 | 31250,1, | 36 | 31250,1, |
| 44 | '0x8637bd06','0x3fff79c842fa',46, | 37 | '0x8637bd06','0x3fff79c842fa',46, |
| 45 | '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78, | ||
| 46 | 1,31250, | 38 | 1,31250, |
| 47 | ], 48 => [ | 39 | ], 48 => [ |
| 48 | '0xa6aaaaab','0x6aaaaaa',27, | 40 | '0xa6aaaaab','0x6aaaaaa',27, |
| 49 | '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59, | ||
| 50 | 125,6, | 41 | 125,6, |
| 51 | '0xc49ba5e4','0xfdf3b645a',36, | 42 | '0xc49ba5e4','0xfdf3b645a',36, |
| 52 | '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68, | ||
| 53 | 6,125, | 43 | 6,125, |
| 54 | '0xa2c2aaab','0x15555',17, | 44 | '0xa2c2aaab','0x15555',17, |
| 55 | '0xa2c2aaaaaaaaaaab','0x1555555555555',49, | ||
| 56 | 62500,3, | 45 | 62500,3, |
| 57 | '0xc9539b89','0x3fffbce4217d',46, | 46 | '0xc9539b89','0x3fffbce4217d',46, |
| 58 | '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78, | ||
| 59 | 3,62500, | 47 | 3,62500, |
| 60 | ], 64 => [ | 48 | ], 64 => [ |
| 61 | '0xfa000000','0xe000000',28, | 49 | '0xfa000000','0xe000000',28, |
| 62 | '0xfa00000000000000','0xe00000000000000',60, | ||
| 63 | 125,8, | 50 | 125,8, |
| 64 | '0x83126e98','0x7ef9db22d',35, | 51 | '0x83126e98','0x7ef9db22d',35, |
| 65 | '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67, | ||
| 66 | 8,125, | 52 | 8,125, |
| 67 | '0xf4240000','0x0',18, | 53 | '0xf4240000','0x0',18, |
| 68 | '0xf424000000000000','0x0',50, | ||
| 69 | 15625,1, | 54 | 15625,1, |
| 70 | '0x8637bd06','0x1fff79c842fa',45, | 55 | '0x8637bd06','0x1fff79c842fa',45, |
| 71 | '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77, | ||
| 72 | 1,15625, | 56 | 1,15625, |
| 73 | ], 100 => [ | 57 | ], 100 => [ |
| 74 | '0xa0000000','0x0',28, | 58 | '0xa0000000','0x0',28, |
| 75 | '0xa000000000000000','0x0',60, | ||
| 76 | 10,1, | 59 | 10,1, |
| 77 | '0xcccccccd','0x733333333',35, | 60 | '0xcccccccd','0x733333333',35, |
| 78 | '0xcccccccccccccccd','0x73333333333333333',67, | ||
| 79 | 1,10, | 61 | 1,10, |
| 80 | '0x9c400000','0x0',18, | 62 | '0x9c400000','0x0',18, |
| 81 | '0x9c40000000000000','0x0',50, | ||
| 82 | 10000,1, | 63 | 10000,1, |
| 83 | '0xd1b71759','0x1fff2e48e8a7',45, | 64 | '0xd1b71759','0x1fff2e48e8a7',45, |
| 84 | '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77, | ||
| 85 | 1,10000, | 65 | 1,10000, |
| 86 | ], 122 => [ | 66 | ], 122 => [ |
| 87 | '0x8325c53f','0xfbcda3a',28, | 67 | '0x8325c53f','0xfbcda3a',28, |
| 88 | '0x8325c53ef368eb05','0xfbcda3ac10c9714',60, | ||
| 89 | 500,61, | 68 | 500,61, |
| 90 | '0xf9db22d1','0x7fbe76c8b',35, | 69 | '0xf9db22d1','0x7fbe76c8b',35, |
| 91 | '0xf9db22d0e560418a','0x7fbe76c8b43958106',67, | ||
| 92 | 61,500, | 70 | 61,500, |
| 93 | '0x8012e2a0','0x3ef36',18, | 71 | '0x8012e2a0','0x3ef36',18, |
| 94 | '0x8012e29f79b47583','0x3ef368eb04325',50, | ||
| 95 | 500000,61, | 72 | 500000,61, |
| 96 | '0xffda4053','0x1ffffbce4217',45, | 73 | '0xffda4053','0x1ffffbce4217',45, |
| 97 | '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77, | ||
| 98 | 61,500000, | 74 | 61,500000, |
| 99 | ], 128 => [ | 75 | ], 128 => [ |
| 100 | '0xfa000000','0x1e000000',29, | 76 | '0xfa000000','0x1e000000',29, |
| 101 | '0xfa00000000000000','0x1e00000000000000',61, | ||
| 102 | 125,16, | 77 | 125,16, |
| 103 | '0x83126e98','0x3f7ced916',34, | 78 | '0x83126e98','0x3f7ced916',34, |
| 104 | '0x83126e978d4fdf3c','0x3f7ced916872b020c',66, | ||
| 105 | 16,125, | 79 | 16,125, |
| 106 | '0xf4240000','0x40000',19, | 80 | '0xf4240000','0x40000',19, |
| 107 | '0xf424000000000000','0x4000000000000',51, | ||
| 108 | 15625,2, | 81 | 15625,2, |
| 109 | '0x8637bd06','0xfffbce4217d',44, | 82 | '0x8637bd06','0xfffbce4217d',44, |
| 110 | '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76, | ||
| 111 | 2,15625, | 83 | 2,15625, |
| 112 | ], 200 => [ | 84 | ], 200 => [ |
| 113 | '0xa0000000','0x0',29, | 85 | '0xa0000000','0x0',29, |
| 114 | '0xa000000000000000','0x0',61, | ||
| 115 | 5,1, | 86 | 5,1, |
| 116 | '0xcccccccd','0x333333333',34, | 87 | '0xcccccccd','0x333333333',34, |
| 117 | '0xcccccccccccccccd','0x33333333333333333',66, | ||
| 118 | 1,5, | 88 | 1,5, |
| 119 | '0x9c400000','0x0',19, | 89 | '0x9c400000','0x0',19, |
| 120 | '0x9c40000000000000','0x0',51, | ||
| 121 | 5000,1, | 90 | 5000,1, |
| 122 | '0xd1b71759','0xfff2e48e8a7',44, | 91 | '0xd1b71759','0xfff2e48e8a7',44, |
| 123 | '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76, | ||
| 124 | 1,5000, | 92 | 1,5000, |
| 125 | ], 250 => [ | 93 | ], 250 => [ |
| 126 | '0x80000000','0x0',29, | 94 | '0x80000000','0x0',29, |
| 127 | '0x8000000000000000','0x0',61, | ||
| 128 | 4,1, | 95 | 4,1, |
| 129 | '0x80000000','0x180000000',33, | 96 | '0x80000000','0x180000000',33, |
| 130 | '0x8000000000000000','0x18000000000000000',65, | ||
| 131 | 1,4, | 97 | 1,4, |
| 132 | '0xfa000000','0x0',20, | 98 | '0xfa000000','0x0',20, |
| 133 | '0xfa00000000000000','0x0',52, | ||
| 134 | 4000,1, | 99 | 4000,1, |
| 135 | '0x83126e98','0x7ff7ced9168',43, | 100 | '0x83126e98','0x7ff7ced9168',43, |
| 136 | '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75, | ||
| 137 | 1,4000, | 101 | 1,4000, |
| 138 | ], 256 => [ | 102 | ], 256 => [ |
| 139 | '0xfa000000','0x3e000000',30, | 103 | '0xfa000000','0x3e000000',30, |
| 140 | '0xfa00000000000000','0x3e00000000000000',62, | ||
| 141 | 125,32, | 104 | 125,32, |
| 142 | '0x83126e98','0x1fbe76c8b',33, | 105 | '0x83126e98','0x1fbe76c8b',33, |
| 143 | '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65, | ||
| 144 | 32,125, | 106 | 32,125, |
| 145 | '0xf4240000','0xc0000',20, | 107 | '0xf4240000','0xc0000',20, |
| 146 | '0xf424000000000000','0xc000000000000',52, | ||
| 147 | 15625,4, | 108 | 15625,4, |
| 148 | '0x8637bd06','0x7ffde7210be',43, | 109 | '0x8637bd06','0x7ffde7210be',43, |
| 149 | '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75, | ||
| 150 | 4,15625, | 110 | 4,15625, |
| 151 | ], 300 => [ | 111 | ], 300 => [ |
| 152 | '0xd5555556','0x2aaaaaaa',30, | 112 | '0xd5555556','0x2aaaaaaa',30, |
| 153 | '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62, | ||
| 154 | 10,3, | 113 | 10,3, |
| 155 | '0x9999999a','0x1cccccccc',33, | 114 | '0x9999999a','0x1cccccccc',33, |
| 156 | '0x999999999999999a','0x1cccccccccccccccc',65, | ||
| 157 | 3,10, | 115 | 3,10, |
| 158 | '0xd0555556','0xaaaaa',20, | 116 | '0xd0555556','0xaaaaa',20, |
| 159 | '0xd055555555555556','0xaaaaaaaaaaaaa',52, | ||
| 160 | 10000,3, | 117 | 10000,3, |
| 161 | '0x9d495183','0x7ffcb923a29',43, | 118 | '0x9d495183','0x7ffcb923a29',43, |
| 162 | '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75, | ||
| 163 | 3,10000, | 119 | 3,10000, |
| 164 | ], 512 => [ | 120 | ], 512 => [ |
| 165 | '0xfa000000','0x7e000000',31, | 121 | '0xfa000000','0x7e000000',31, |
| 166 | '0xfa00000000000000','0x7e00000000000000',63, | ||
| 167 | 125,64, | 122 | 125,64, |
| 168 | '0x83126e98','0xfdf3b645',32, | 123 | '0x83126e98','0xfdf3b645',32, |
| 169 | '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64, | ||
| 170 | 64,125, | 124 | 64,125, |
| 171 | '0xf4240000','0x1c0000',21, | 125 | '0xf4240000','0x1c0000',21, |
| 172 | '0xf424000000000000','0x1c000000000000',53, | ||
| 173 | 15625,8, | 126 | 15625,8, |
| 174 | '0x8637bd06','0x3ffef39085f',42, | 127 | '0x8637bd06','0x3ffef39085f',42, |
| 175 | '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74, | ||
| 176 | 8,15625, | 128 | 8,15625, |
| 177 | ], 1000 => [ | 129 | ], 1000 => [ |
| 178 | '0x80000000','0x0',31, | 130 | '0x80000000','0x0',31, |
| 179 | '0x8000000000000000','0x0',63, | ||
| 180 | 1,1, | 131 | 1,1, |
| 181 | '0x80000000','0x0',31, | 132 | '0x80000000','0x0',31, |
| 182 | '0x8000000000000000','0x0',63, | ||
| 183 | 1,1, | 133 | 1,1, |
| 184 | '0xfa000000','0x0',22, | 134 | '0xfa000000','0x0',22, |
| 185 | '0xfa00000000000000','0x0',54, | ||
| 186 | 1000,1, | 135 | 1000,1, |
| 187 | '0x83126e98','0x1ff7ced9168',41, | 136 | '0x83126e98','0x1ff7ced9168',41, |
| 188 | '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73, | ||
| 189 | 1,1000, | 137 | 1,1000, |
| 190 | ], 1024 => [ | 138 | ], 1024 => [ |
| 191 | '0xfa000000','0xfe000000',32, | 139 | '0xfa000000','0xfe000000',32, |
| 192 | '0xfa00000000000000','0xfe00000000000000',64, | ||
| 193 | 125,128, | 140 | 125,128, |
| 194 | '0x83126e98','0x7ef9db22',31, | 141 | '0x83126e98','0x7ef9db22',31, |
| 195 | '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63, | ||
| 196 | 128,125, | 142 | 128,125, |
| 197 | '0xf4240000','0x3c0000',22, | 143 | '0xf4240000','0x3c0000',22, |
| 198 | '0xf424000000000000','0x3c000000000000',54, | ||
| 199 | 15625,16, | 144 | 15625,16, |
| 200 | '0x8637bd06','0x1fff79c842f',41, | 145 | '0x8637bd06','0x1fff79c842f',41, |
| 201 | '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73, | ||
| 202 | 16,15625, | 146 | 16,15625, |
| 203 | ], 1200 => [ | 147 | ], 1200 => [ |
| 204 | '0xd5555556','0xd5555555',32, | 148 | '0xd5555556','0xd5555555',32, |
| 205 | '0xd555555555555556','0xd555555555555555',64, | ||
| 206 | 5,6, | 149 | 5,6, |
| 207 | '0x9999999a','0x66666666',31, | 150 | '0x9999999a','0x66666666',31, |
| 208 | '0x999999999999999a','0x6666666666666666',63, | ||
| 209 | 6,5, | 151 | 6,5, |
| 210 | '0xd0555556','0x2aaaaa',22, | 152 | '0xd0555556','0x2aaaaa',22, |
| 211 | '0xd055555555555556','0x2aaaaaaaaaaaaa',54, | ||
| 212 | 2500,3, | 153 | 2500,3, |
| 213 | '0x9d495183','0x1ffcb923a29',41, | 154 | '0x9d495183','0x1ffcb923a29',41, |
| 214 | '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73, | ||
| 215 | 3,2500, | 155 | 3,2500, |
| 216 | ] | 156 | ] |
| 217 | ); | 157 | ); |
| @@ -264,6 +204,15 @@ sub fmuls($$$) { | |||
| 264 | return 0; | 204 | return 0; |
| 265 | } | 205 | } |
| 266 | 206 | ||
| 207 | # Generate a hex value if the result fits in 64 bits; | ||
| 208 | # otherwise skip. | ||
| 209 | sub bignum_hex($) { | ||
| 210 | my($x) = @_; | ||
| 211 | my $s = $x->as_hex(); | ||
| 212 | |||
| 213 | return (length($s) > 18) ? undef : $s; | ||
| 214 | } | ||
| 215 | |||
| 267 | # Provides mul, adj, and shr factors for a specific | 216 | # Provides mul, adj, and shr factors for a specific |
| 268 | # (bit, time, hz) combination | 217 | # (bit, time, hz) combination |
| 269 | sub muladj($$$) { | 218 | sub muladj($$$) { |
| @@ -271,7 +220,7 @@ sub muladj($$$) { | |||
| 271 | my $s = fmuls($b, $t, $hz); | 220 | my $s = fmuls($b, $t, $hz); |
| 272 | my $m = fmul($s, $t, $hz); | 221 | my $m = fmul($s, $t, $hz); |
| 273 | my $a = fadj($s, $t, $hz); | 222 | my $a = fadj($s, $t, $hz); |
| 274 | return ($m->as_hex(), $a->as_hex(), $s); | 223 | return (bignum_hex($m), bignum_hex($a), $s); |
| 275 | } | 224 | } |
| 276 | 225 | ||
| 277 | # Provides numerator, denominator values | 226 | # Provides numerator, denominator values |
| @@ -288,12 +237,10 @@ sub conversions($$) { | |||
| 288 | 237 | ||
| 289 | # HZ_TO_xx | 238 | # HZ_TO_xx |
| 290 | push(@val, muladj(32, $t, $hz)); | 239 | push(@val, muladj(32, $t, $hz)); |
| 291 | push(@val, muladj(64, $t, $hz)); | ||
| 292 | push(@val, numden($t, $hz)); | 240 | push(@val, numden($t, $hz)); |
| 293 | 241 | ||
| 294 | # xx_TO_HZ | 242 | # xx_TO_HZ |
| 295 | push(@val, muladj(32, $hz, $t)); | 243 | push(@val, muladj(32, $hz, $t)); |
| 296 | push(@val, muladj(64, $hz, $t)); | ||
| 297 | push(@val, numden($hz, $t)); | 244 | push(@val, numden($hz, $t)); |
| 298 | 245 | ||
| 299 | return @val; | 246 | return @val; |
| @@ -318,6 +265,19 @@ sub compute_values($) { | |||
| 318 | return @val; | 265 | return @val; |
| 319 | } | 266 | } |
| 320 | 267 | ||
| 268 | sub outputval($$) | ||
| 269 | { | ||
| 270 | my($name, $val) = @_; | ||
| 271 | my $csuf; | ||
| 272 | |||
| 273 | if (defined($val)) { | ||
| 274 | if ($name !~ /SHR/) { | ||
| 275 | $val = "U64_C($val)"; | ||
| 276 | } | ||
| 277 | printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; | ||
| 278 | } | ||
| 279 | } | ||
| 280 | |||
| 321 | sub output($@) | 281 | sub output($@) |
| 322 | { | 282 | { |
| 323 | my($hz, @val) = @_; | 283 | my($hz, @val) = @_; |
| @@ -331,6 +291,7 @@ sub output($@) | |||
| 331 | print "\n"; | 291 | print "\n"; |
| 332 | 292 | ||
| 333 | print "#include <linux/param.h>\n"; | 293 | print "#include <linux/param.h>\n"; |
| 294 | print "#include <linux/types.h>\n"; | ||
| 334 | 295 | ||
| 335 | print "\n"; | 296 | print "\n"; |
| 336 | print "#if HZ != $hz\n"; | 297 | print "#if HZ != $hz\n"; |
| @@ -340,15 +301,13 @@ sub output($@) | |||
| 340 | 301 | ||
| 341 | foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', | 302 | foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', |
| 342 | 'HZ_TO_USEC','USEC_TO_HZ') { | 303 | 'HZ_TO_USEC','USEC_TO_HZ') { |
| 343 | foreach $bit (32, 64) { | 304 | foreach $bit (32) { |
| 344 | foreach $suf ('MUL', 'ADJ', 'SHR') { | 305 | foreach $suf ('MUL', 'ADJ', 'SHR') { |
| 345 | printf "#define %-23s %s\n", | 306 | outputval("${pfx}_$suf$bit", shift(@val)); |
| 346 | "${pfx}_$suf$bit", shift(@val); | ||
| 347 | } | 307 | } |
| 348 | } | 308 | } |
| 349 | foreach $suf ('NUM', 'DEN') { | 309 | foreach $suf ('NUM', 'DEN') { |
| 350 | printf "#define %-23s %s\n", | 310 | outputval("${pfx}_$suf", shift(@val)); |
| 351 | "${pfx}_$suf", shift(@val); | ||
| 352 | } | 311 | } |
| 353 | } | 312 | } |
| 354 | 313 | ||
| @@ -356,6 +315,23 @@ sub output($@) | |||
| 356 | print "#endif /* KERNEL_TIMECONST_H */\n"; | 315 | print "#endif /* KERNEL_TIMECONST_H */\n"; |
| 357 | } | 316 | } |
| 358 | 317 | ||
| 318 | # Pretty-print Perl values | ||
| 319 | sub perlvals(@) { | ||
| 320 | my $v; | ||
| 321 | my @l = (); | ||
| 322 | |||
| 323 | foreach $v (@_) { | ||
| 324 | if (!defined($v)) { | ||
| 325 | push(@l, 'undef'); | ||
| 326 | } elsif ($v =~ /^0x/) { | ||
| 327 | push(@l, "\'".$v."\'"); | ||
| 328 | } else { | ||
| 329 | push(@l, $v.''); | ||
| 330 | } | ||
| 331 | } | ||
| 332 | return join(',', @l); | ||
| 333 | } | ||
| 334 | |||
| 359 | ($hz) = @ARGV; | 335 | ($hz) = @ARGV; |
| 360 | 336 | ||
| 361 | # Use this to generate the %canned_values structure | 337 | # Use this to generate the %canned_values structure |
| @@ -373,15 +349,15 @@ if ($hz eq '--can') { | |||
| 373 | print "$pf$hz => [\n"; | 349 | print "$pf$hz => [\n"; |
| 374 | while (scalar(@values)) { | 350 | while (scalar(@values)) { |
| 375 | my $bit; | 351 | my $bit; |
| 376 | foreach $bit (32, 64) { | 352 | foreach $bit (32) { |
| 377 | my $m = shift(@values); | 353 | my $m = shift(@values); |
| 378 | my $a = shift(@values); | 354 | my $a = shift(@values); |
| 379 | my $s = shift(@values); | 355 | my $s = shift(@values); |
| 380 | print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n"; | 356 | print "\t\t", perlvals($m,$a,$s), ",\n"; |
| 381 | } | 357 | } |
| 382 | my $n = shift(@values); | 358 | my $n = shift(@values); |
| 383 | my $d = shift(@values); | 359 | my $d = shift(@values); |
| 384 | print "\t\t",$n,',',$d,",\n"; | 360 | print "\t\t", perlvals($n,$d), ",\n"; |
| 385 | } | 361 | } |
| 386 | print "\t]"; | 362 | print "\t]"; |
| 387 | $pf = ', '; | 363 | $pf = ', '; |
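The timeconst.pl hunks above drop the 64-bit table entries entirely: bignum_hex() returns undef for any constant longer than "0x" plus 16 hex digits, and outputval() skips undefined values while wrapping every surviving non-SHR constant in U64_C(). As a sketch of what the generator now emits, the HZ=250 HZ_TO_MSEC entries in the canned table ('0x80000000', '0x0', 29 and 4, 1) correspond to defines along these lines (illustrative spacing derived from the %-23s format, not copied from a generated timeconst.h):

    #define HZ_TO_MSEC_MUL32        U64_C(0x80000000)
    #define HZ_TO_MSEC_ADJ32        U64_C(0x0)
    #define HZ_TO_MSEC_SHR32        29
    #define HZ_TO_MSEC_NUM          U64_C(4)
    #define HZ_TO_MSEC_DEN          U64_C(1)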
diff --git a/kernel/timer.c b/kernel/timer.c index 99b00a25f88b..ceacc6626572 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -320,14 +320,130 @@ static void timer_stats_account_timer(struct timer_list *timer) | |||
| 320 | static void timer_stats_account_timer(struct timer_list *timer) {} | 320 | static void timer_stats_account_timer(struct timer_list *timer) {} |
| 321 | #endif | 321 | #endif |
| 322 | 322 | ||
| 323 | /** | 323 | #ifdef CONFIG_DEBUG_OBJECTS_TIMERS |
| 324 | * init_timer - initialize a timer. | 324 | |
| 325 | * @timer: the timer to be initialized | 325 | static struct debug_obj_descr timer_debug_descr; |
| 326 | * | 326 | |
| 327 | * init_timer() must be done to a timer prior calling *any* of the | 327 | /* |
| 328 | * other timer functions. | 328 | * fixup_init is called when: |
| 329 | * - an active object is initialized | ||
| 329 | */ | 330 | */ |
| 330 | void init_timer(struct timer_list *timer) | 331 | static int timer_fixup_init(void *addr, enum debug_obj_state state) |
| 332 | { | ||
| 333 | struct timer_list *timer = addr; | ||
| 334 | |||
| 335 | switch (state) { | ||
| 336 | case ODEBUG_STATE_ACTIVE: | ||
| 337 | del_timer_sync(timer); | ||
| 338 | debug_object_init(timer, &timer_debug_descr); | ||
| 339 | return 1; | ||
| 340 | default: | ||
| 341 | return 0; | ||
| 342 | } | ||
| 343 | } | ||
| 344 | |||
| 345 | /* | ||
| 346 | * fixup_activate is called when: | ||
| 347 | * - an active object is activated | ||
| 348 | * - an unknown object is activated (might be a statically initialized object) | ||
| 349 | */ | ||
| 350 | static int timer_fixup_activate(void *addr, enum debug_obj_state state) | ||
| 351 | { | ||
| 352 | struct timer_list *timer = addr; | ||
| 353 | |||
| 354 | switch (state) { | ||
| 355 | |||
| 356 | case ODEBUG_STATE_NOTAVAILABLE: | ||
| 357 | /* | ||
| 358 | * This is not really a fixup. The timer was | ||
| 359 | * statically initialized. We just make sure that it | ||
| 360 | * is tracked in the object tracker. | ||
| 361 | */ | ||
| 362 | if (timer->entry.next == NULL && | ||
| 363 | timer->entry.prev == TIMER_ENTRY_STATIC) { | ||
| 364 | debug_object_init(timer, &timer_debug_descr); | ||
| 365 | debug_object_activate(timer, &timer_debug_descr); | ||
| 366 | return 0; | ||
| 367 | } else { | ||
| 368 | WARN_ON_ONCE(1); | ||
| 369 | } | ||
| 370 | return 0; | ||
| 371 | |||
| 372 | case ODEBUG_STATE_ACTIVE: | ||
| 373 | WARN_ON(1); | ||
| 374 | |||
| 375 | default: | ||
| 376 | return 0; | ||
| 377 | } | ||
| 378 | } | ||
| 379 | |||
| 380 | /* | ||
| 381 | * fixup_free is called when: | ||
| 382 | * - an active object is freed | ||
| 383 | */ | ||
| 384 | static int timer_fixup_free(void *addr, enum debug_obj_state state) | ||
| 385 | { | ||
| 386 | struct timer_list *timer = addr; | ||
| 387 | |||
| 388 | switch (state) { | ||
| 389 | case ODEBUG_STATE_ACTIVE: | ||
| 390 | del_timer_sync(timer); | ||
| 391 | debug_object_free(timer, &timer_debug_descr); | ||
| 392 | return 1; | ||
| 393 | default: | ||
| 394 | return 0; | ||
| 395 | } | ||
| 396 | } | ||
| 397 | |||
| 398 | static struct debug_obj_descr timer_debug_descr = { | ||
| 399 | .name = "timer_list", | ||
| 400 | .fixup_init = timer_fixup_init, | ||
| 401 | .fixup_activate = timer_fixup_activate, | ||
| 402 | .fixup_free = timer_fixup_free, | ||
| 403 | }; | ||
| 404 | |||
| 405 | static inline void debug_timer_init(struct timer_list *timer) | ||
| 406 | { | ||
| 407 | debug_object_init(timer, &timer_debug_descr); | ||
| 408 | } | ||
| 409 | |||
| 410 | static inline void debug_timer_activate(struct timer_list *timer) | ||
| 411 | { | ||
| 412 | debug_object_activate(timer, &timer_debug_descr); | ||
| 413 | } | ||
| 414 | |||
| 415 | static inline void debug_timer_deactivate(struct timer_list *timer) | ||
| 416 | { | ||
| 417 | debug_object_deactivate(timer, &timer_debug_descr); | ||
| 418 | } | ||
| 419 | |||
| 420 | static inline void debug_timer_free(struct timer_list *timer) | ||
| 421 | { | ||
| 422 | debug_object_free(timer, &timer_debug_descr); | ||
| 423 | } | ||
| 424 | |||
| 425 | static void __init_timer(struct timer_list *timer); | ||
| 426 | |||
| 427 | void init_timer_on_stack(struct timer_list *timer) | ||
| 428 | { | ||
| 429 | debug_object_init_on_stack(timer, &timer_debug_descr); | ||
| 430 | __init_timer(timer); | ||
| 431 | } | ||
| 432 | EXPORT_SYMBOL_GPL(init_timer_on_stack); | ||
| 433 | |||
| 434 | void destroy_timer_on_stack(struct timer_list *timer) | ||
| 435 | { | ||
| 436 | debug_object_free(timer, &timer_debug_descr); | ||
| 437 | } | ||
| 438 | EXPORT_SYMBOL_GPL(destroy_timer_on_stack); | ||
| 439 | |||
| 440 | #else | ||
| 441 | static inline void debug_timer_init(struct timer_list *timer) { } | ||
| 442 | static inline void debug_timer_activate(struct timer_list *timer) { } | ||
| 443 | static inline void debug_timer_deactivate(struct timer_list *timer) { } | ||
| 444 | #endif | ||
| 445 | |||
| 446 | static void __init_timer(struct timer_list *timer) | ||
| 331 | { | 447 | { |
| 332 | timer->entry.next = NULL; | 448 | timer->entry.next = NULL; |
| 333 | timer->base = __raw_get_cpu_var(tvec_bases); | 449 | timer->base = __raw_get_cpu_var(tvec_bases); |
| @@ -337,6 +453,19 @@ void init_timer(struct timer_list *timer) | |||
| 337 | memset(timer->start_comm, 0, TASK_COMM_LEN); | 453 | memset(timer->start_comm, 0, TASK_COMM_LEN); |
| 338 | #endif | 454 | #endif |
| 339 | } | 455 | } |
| 456 | |||
| 457 | /** | ||
| 458 | * init_timer - initialize a timer. | ||
| 459 | * @timer: the timer to be initialized | ||
| 460 | * | ||
| 461 | * init_timer() must be done to a timer prior calling *any* of the | ||
| 462 | * other timer functions. | ||
| 463 | */ | ||
| 464 | void init_timer(struct timer_list *timer) | ||
| 465 | { | ||
| 466 | debug_timer_init(timer); | ||
| 467 | __init_timer(timer); | ||
| 468 | } | ||
| 340 | EXPORT_SYMBOL(init_timer); | 469 | EXPORT_SYMBOL(init_timer); |
| 341 | 470 | ||
| 342 | void init_timer_deferrable(struct timer_list *timer) | 471 | void init_timer_deferrable(struct timer_list *timer) |
| @@ -351,6 +480,8 @@ static inline void detach_timer(struct timer_list *timer, | |||
| 351 | { | 480 | { |
| 352 | struct list_head *entry = &timer->entry; | 481 | struct list_head *entry = &timer->entry; |
| 353 | 482 | ||
| 483 | debug_timer_deactivate(timer); | ||
| 484 | |||
| 354 | __list_del(entry->prev, entry->next); | 485 | __list_del(entry->prev, entry->next); |
| 355 | if (clear_pending) | 486 | if (clear_pending) |
| 356 | entry->next = NULL; | 487 | entry->next = NULL; |
| @@ -405,6 +536,8 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
| 405 | ret = 1; | 536 | ret = 1; |
| 406 | } | 537 | } |
| 407 | 538 | ||
| 539 | debug_timer_activate(timer); | ||
| 540 | |||
| 408 | new_base = __get_cpu_var(tvec_bases); | 541 | new_base = __get_cpu_var(tvec_bases); |
| 409 | 542 | ||
| 410 | if (base != new_base) { | 543 | if (base != new_base) { |
| @@ -450,11 +583,20 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
| 450 | BUG_ON(timer_pending(timer) || !timer->function); | 583 | BUG_ON(timer_pending(timer) || !timer->function); |
| 451 | spin_lock_irqsave(&base->lock, flags); | 584 | spin_lock_irqsave(&base->lock, flags); |
| 452 | timer_set_base(timer, base); | 585 | timer_set_base(timer, base); |
| 586 | debug_timer_activate(timer); | ||
| 453 | internal_add_timer(base, timer); | 587 | internal_add_timer(base, timer); |
| 588 | /* | ||
| 589 | * Check whether the other CPU is idle and needs to be | ||
| 590 | * triggered to reevaluate the timer wheel when nohz is | ||
| 591 | * active. We are protected against the other CPU fiddling | ||
| 592 | * with the timer by holding the timer base lock. This also | ||
| 593 | * makes sure that a CPU on the way to idle can not evaluate | ||
| 594 | * the timer wheel. | ||
| 595 | */ | ||
| 596 | wake_up_idle_cpu(cpu); | ||
| 454 | spin_unlock_irqrestore(&base->lock, flags); | 597 | spin_unlock_irqrestore(&base->lock, flags); |
| 455 | } | 598 | } |
| 456 | 599 | ||
| 457 | |||
| 458 | /** | 600 | /** |
| 459 | * mod_timer - modify a timer's timeout | 601 | * mod_timer - modify a timer's timeout |
| 460 | * @timer: the timer to be modified | 602 | * @timer: the timer to be modified |
| @@ -1078,11 +1220,14 @@ signed long __sched schedule_timeout(signed long timeout) | |||
| 1078 | 1220 | ||
| 1079 | expire = timeout + jiffies; | 1221 | expire = timeout + jiffies; |
| 1080 | 1222 | ||
| 1081 | setup_timer(&timer, process_timeout, (unsigned long)current); | 1223 | setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); |
| 1082 | __mod_timer(&timer, expire); | 1224 | __mod_timer(&timer, expire); |
| 1083 | schedule(); | 1225 | schedule(); |
| 1084 | del_singleshot_timer_sync(&timer); | 1226 | del_singleshot_timer_sync(&timer); |
| 1085 | 1227 | ||
| 1228 | /* Remove the timer from the object tracker */ | ||
| 1229 | destroy_timer_on_stack(&timer); | ||
| 1230 | |||
| 1086 | timeout = expire - jiffies; | 1231 | timeout = expire - jiffies; |
| 1087 | 1232 | ||
| 1088 | out: | 1233 | out: |
| @@ -1220,13 +1365,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
| 1220 | return 0; | 1365 | return 0; |
| 1221 | } | 1366 | } |
| 1222 | 1367 | ||
| 1223 | /* | ||
| 1224 | * lockdep: we want to track each per-CPU base as a separate lock-class, | ||
| 1225 | * but timer-bases are kmalloc()-ed, so we need to attach separate | ||
| 1226 | * keys to them: | ||
| 1227 | */ | ||
| 1228 | static struct lock_class_key base_lock_keys[NR_CPUS]; | ||
| 1229 | |||
| 1230 | static int __cpuinit init_timers_cpu(int cpu) | 1368 | static int __cpuinit init_timers_cpu(int cpu) |
| 1231 | { | 1369 | { |
| 1232 | int j; | 1370 | int j; |
| @@ -1269,7 +1407,6 @@ static int __cpuinit init_timers_cpu(int cpu) | |||
| 1269 | } | 1407 | } |
| 1270 | 1408 | ||
| 1271 | spin_lock_init(&base->lock); | 1409 | spin_lock_init(&base->lock); |
| 1272 | lockdep_set_class(&base->lock, base_lock_keys + cpu); | ||
| 1273 | 1410 | ||
| 1274 | for (j = 0; j < TVN_SIZE; j++) { | 1411 | for (j = 0; j < TVN_SIZE; j++) { |
| 1275 | INIT_LIST_HEAD(base->tv5.vec + j); | 1412 | INIT_LIST_HEAD(base->tv5.vec + j); |
| @@ -1308,8 +1445,8 @@ static void __cpuinit migrate_timers(int cpu) | |||
| 1308 | new_base = get_cpu_var(tvec_bases); | 1445 | new_base = get_cpu_var(tvec_bases); |
| 1309 | 1446 | ||
| 1310 | local_irq_disable(); | 1447 | local_irq_disable(); |
| 1311 | double_spin_lock(&new_base->lock, &old_base->lock, | 1448 | spin_lock(&new_base->lock); |
| 1312 | smp_processor_id() < cpu); | 1449 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); |
| 1313 | 1450 | ||
| 1314 | BUG_ON(old_base->running_timer); | 1451 | BUG_ON(old_base->running_timer); |
| 1315 | 1452 | ||
| @@ -1322,8 +1459,8 @@ static void __cpuinit migrate_timers(int cpu) | |||
| 1322 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1459 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
| 1323 | } | 1460 | } |
| 1324 | 1461 | ||
| 1325 | double_spin_unlock(&new_base->lock, &old_base->lock, | 1462 | spin_unlock(&old_base->lock); |
| 1326 | smp_processor_id() < cpu); | 1463 | spin_unlock(&new_base->lock); |
| 1327 | local_irq_enable(); | 1464 | local_irq_enable(); |
| 1328 | put_cpu_var(tvec_bases); | 1465 | put_cpu_var(tvec_bases); |
| 1329 | } | 1466 | } |
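The timer.c hunks wire the new debugobjects hooks into init, activate, deactivate and free, and the schedule_timeout() change shows the rule for stack-allocated timers under CONFIG_DEBUG_OBJECTS_TIMERS: announce the timer to the tracker before arming it and remove it again before the stack frame goes away. A minimal sketch of the same pattern in other code (wait_for_something() and my_timeout_fn() are made-up names for illustration):

    static void my_timeout_fn(unsigned long data)
    {
            /* hypothetical expiry handler */
    }

    static void wait_for_something(void)
    {
            struct timer_list timer;

            setup_timer_on_stack(&timer, my_timeout_fn, 0);
            mod_timer(&timer, jiffies + HZ);        /* fire in ~1s */
            /* ... sleep or poll until the work is done ... */
            del_timer_sync(&timer);
            destroy_timer_on_stack(&timer);         /* drop the tracker entry */
    }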
diff --git a/kernel/uid16.c b/kernel/uid16.c index dd308ba4e03b..3e41c1673e2f 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi | |||
| 21 | { | 21 | { |
| 22 | long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); | 22 | long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); |
| 23 | /* avoid REGPARM breakage on x86: */ | 23 | /* avoid REGPARM breakage on x86: */ |
| 24 | prevent_tail_call(ret); | 24 | asmlinkage_protect(3, ret, filename, user, group); |
| 25 | return ret; | 25 | return ret; |
| 26 | } | 26 | } |
| 27 | 27 | ||
| @@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g | |||
| 29 | { | 29 | { |
| 30 | long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); | 30 | long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); |
| 31 | /* avoid REGPARM breakage on x86: */ | 31 | /* avoid REGPARM breakage on x86: */ |
| 32 | prevent_tail_call(ret); | 32 | asmlinkage_protect(3, ret, filename, user, group); |
| 33 | return ret; | 33 | return ret; |
| 34 | } | 34 | } |
| 35 | 35 | ||
| @@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) | |||
| 37 | { | 37 | { |
| 38 | long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); | 38 | long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); |
| 39 | /* avoid REGPARM breakage on x86: */ | 39 | /* avoid REGPARM breakage on x86: */ |
| 40 | prevent_tail_call(ret); | 40 | asmlinkage_protect(3, ret, fd, user, group); |
| 41 | return ret; | 41 | return ret; |
| 42 | } | 42 | } |
| 43 | 43 | ||
| @@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) | |||
| 45 | { | 45 | { |
| 46 | long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); | 46 | long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); |
| 47 | /* avoid REGPARM breakage on x86: */ | 47 | /* avoid REGPARM breakage on x86: */ |
| 48 | prevent_tail_call(ret); | 48 | asmlinkage_protect(2, ret, rgid, egid); |
| 49 | return ret; | 49 | return ret; |
| 50 | } | 50 | } |
| 51 | 51 | ||
| @@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid) | |||
| 53 | { | 53 | { |
| 54 | long ret = sys_setgid(low2highgid(gid)); | 54 | long ret = sys_setgid(low2highgid(gid)); |
| 55 | /* avoid REGPARM breakage on x86: */ | 55 | /* avoid REGPARM breakage on x86: */ |
| 56 | prevent_tail_call(ret); | 56 | asmlinkage_protect(1, ret, gid); |
| 57 | return ret; | 57 | return ret; |
| 58 | } | 58 | } |
| 59 | 59 | ||
| @@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) | |||
| 61 | { | 61 | { |
| 62 | long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); | 62 | long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); |
| 63 | /* avoid REGPARM breakage on x86: */ | 63 | /* avoid REGPARM breakage on x86: */ |
| 64 | prevent_tail_call(ret); | 64 | asmlinkage_protect(2, ret, ruid, euid); |
| 65 | return ret; | 65 | return ret; |
| 66 | } | 66 | } |
| 67 | 67 | ||
| @@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid) | |||
| 69 | { | 69 | { |
| 70 | long ret = sys_setuid(low2highuid(uid)); | 70 | long ret = sys_setuid(low2highuid(uid)); |
| 71 | /* avoid REGPARM breakage on x86: */ | 71 | /* avoid REGPARM breakage on x86: */ |
| 72 | prevent_tail_call(ret); | 72 | asmlinkage_protect(1, ret, uid); |
| 73 | return ret; | 73 | return ret; |
| 74 | } | 74 | } |
| 75 | 75 | ||
| @@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) | |||
| 78 | long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), | 78 | long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), |
| 79 | low2highuid(suid)); | 79 | low2highuid(suid)); |
| 80 | /* avoid REGPARM breakage on x86: */ | 80 | /* avoid REGPARM breakage on x86: */ |
| 81 | prevent_tail_call(ret); | 81 | asmlinkage_protect(3, ret, ruid, euid, suid); |
| 82 | return ret; | 82 | return ret; |
| 83 | } | 83 | } |
| 84 | 84 | ||
| @@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) | |||
| 98 | long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), | 98 | long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), |
| 99 | low2highgid(sgid)); | 99 | low2highgid(sgid)); |
| 100 | /* avoid REGPARM breakage on x86: */ | 100 | /* avoid REGPARM breakage on x86: */ |
| 101 | prevent_tail_call(ret); | 101 | asmlinkage_protect(3, ret, rgid, egid, sgid); |
| 102 | return ret; | 102 | return ret; |
| 103 | } | 103 | } |
| 104 | 104 | ||
| @@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid) | |||
| 117 | { | 117 | { |
| 118 | long ret = sys_setfsuid(low2highuid(uid)); | 118 | long ret = sys_setfsuid(low2highuid(uid)); |
| 119 | /* avoid REGPARM breakage on x86: */ | 119 | /* avoid REGPARM breakage on x86: */ |
| 120 | prevent_tail_call(ret); | 120 | asmlinkage_protect(1, ret, uid); |
| 121 | return ret; | 121 | return ret; |
| 122 | } | 122 | } |
| 123 | 123 | ||
| @@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid) | |||
| 125 | { | 125 | { |
| 126 | long ret = sys_setfsgid(low2highgid(gid)); | 126 | long ret = sys_setfsgid(low2highgid(gid)); |
| 127 | /* avoid REGPARM breakage on x86: */ | 127 | /* avoid REGPARM breakage on x86: */ |
| 128 | prevent_tail_call(ret); | 128 | asmlinkage_protect(1, ret, gid); |
| 129 | return ret; | 129 | return ret; |
| 130 | } | 130 | } |
| 131 | 131 | ||
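Every conversion in uid16.c follows the same rule: asmlinkage_protect() takes the number of syscall parameters first, then the local return value, then the parameters themselves, which is meant to keep the argument stack slots live so the call cannot be optimized into a tail call (the old prevent_tail_call() only took the return value). A sketch of the pattern for a hypothetical two-argument wrapper (sys_example16/sys_example are not real syscalls):

    asmlinkage long sys_example16(old_uid_t user, old_gid_t group)
    {
            long ret = sys_example(low2highuid(user), low2highgid(group));
            /* avoid REGPARM breakage on x86: */
            asmlinkage_protect(2, ret, user, group);
            return ret;
    }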
diff --git a/kernel/user.c b/kernel/user.c index 7132022a040c..865ecf57a096 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -53,10 +53,6 @@ struct user_struct root_user = { | |||
| 53 | .files = ATOMIC_INIT(0), | 53 | .files = ATOMIC_INIT(0), |
| 54 | .sigpending = ATOMIC_INIT(0), | 54 | .sigpending = ATOMIC_INIT(0), |
| 55 | .locked_shm = 0, | 55 | .locked_shm = 0, |
| 56 | #ifdef CONFIG_KEYS | ||
| 57 | .uid_keyring = &root_user_keyring, | ||
| 58 | .session_keyring = &root_session_keyring, | ||
| 59 | #endif | ||
| 60 | #ifdef CONFIG_USER_SCHED | 56 | #ifdef CONFIG_USER_SCHED |
| 61 | .tg = &init_task_group, | 57 | .tg = &init_task_group, |
| 62 | #endif | 58 | #endif |
| @@ -101,7 +97,7 @@ static int sched_create_user(struct user_struct *up) | |||
| 101 | { | 97 | { |
| 102 | int rc = 0; | 98 | int rc = 0; |
| 103 | 99 | ||
| 104 | up->tg = sched_create_group(); | 100 | up->tg = sched_create_group(&root_task_group); |
| 105 | if (IS_ERR(up->tg)) | 101 | if (IS_ERR(up->tg)) |
| 106 | rc = -ENOMEM; | 102 | rc = -ENOMEM; |
| 107 | 103 | ||
| @@ -193,6 +189,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | |||
| 193 | 189 | ||
| 194 | static struct kobj_attribute cpu_rt_runtime_attr = | 190 | static struct kobj_attribute cpu_rt_runtime_attr = |
| 195 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); | 191 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); |
| 192 | |||
| 193 | static ssize_t cpu_rt_period_show(struct kobject *kobj, | ||
| 194 | struct kobj_attribute *attr, | ||
| 195 | char *buf) | ||
| 196 | { | ||
| 197 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 198 | |||
| 199 | return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); | ||
| 200 | } | ||
| 201 | |||
| 202 | static ssize_t cpu_rt_period_store(struct kobject *kobj, | ||
| 203 | struct kobj_attribute *attr, | ||
| 204 | const char *buf, size_t size) | ||
| 205 | { | ||
| 206 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 207 | unsigned long rt_period; | ||
| 208 | int rc; | ||
| 209 | |||
| 210 | sscanf(buf, "%lu", &rt_period); | ||
| 211 | |||
| 212 | rc = sched_group_set_rt_period(up->tg, rt_period); | ||
| 213 | |||
| 214 | return (rc ? rc : size); | ||
| 215 | } | ||
| 216 | |||
| 217 | static struct kobj_attribute cpu_rt_period_attr = | ||
| 218 | __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); | ||
| 196 | #endif | 219 | #endif |
| 197 | 220 | ||
| 198 | /* default attributes per uid directory */ | 221 | /* default attributes per uid directory */ |
| @@ -202,6 +225,7 @@ static struct attribute *uids_attributes[] = { | |||
| 202 | #endif | 225 | #endif |
| 203 | #ifdef CONFIG_RT_GROUP_SCHED | 226 | #ifdef CONFIG_RT_GROUP_SCHED |
| 204 | &cpu_rt_runtime_attr.attr, | 227 | &cpu_rt_runtime_attr.attr, |
| 228 | &cpu_rt_period_attr.attr, | ||
| 205 | #endif | 229 | #endif |
| 206 | NULL | 230 | NULL |
| 207 | }; | 231 | }; |
| @@ -360,7 +384,7 @@ void free_uid(struct user_struct *up) | |||
| 360 | local_irq_restore(flags); | 384 | local_irq_restore(flags); |
| 361 | } | 385 | } |
| 362 | 386 | ||
| 363 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 387 | struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) |
| 364 | { | 388 | { |
| 365 | struct hlist_head *hashent = uidhashentry(ns, uid); | 389 | struct hlist_head *hashent = uidhashentry(ns, uid); |
| 366 | struct user_struct *up, *new; | 390 | struct user_struct *up, *new; |
| @@ -375,29 +399,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 375 | spin_unlock_irq(&uidhash_lock); | 399 | spin_unlock_irq(&uidhash_lock); |
| 376 | 400 | ||
| 377 | if (!up) { | 401 | if (!up) { |
| 378 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); | 402 | new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); |
| 379 | if (!new) | 403 | if (!new) |
| 380 | goto out_unlock; | 404 | goto out_unlock; |
| 381 | 405 | ||
| 382 | new->uid = uid; | 406 | new->uid = uid; |
| 383 | atomic_set(&new->__count, 1); | 407 | atomic_set(&new->__count, 1); |
| 384 | atomic_set(&new->processes, 0); | ||
| 385 | atomic_set(&new->files, 0); | ||
| 386 | atomic_set(&new->sigpending, 0); | ||
| 387 | #ifdef CONFIG_INOTIFY_USER | ||
| 388 | atomic_set(&new->inotify_watches, 0); | ||
| 389 | atomic_set(&new->inotify_devs, 0); | ||
| 390 | #endif | ||
| 391 | #ifdef CONFIG_POSIX_MQUEUE | ||
| 392 | new->mq_bytes = 0; | ||
| 393 | #endif | ||
| 394 | new->locked_shm = 0; | ||
| 395 | |||
| 396 | if (alloc_uid_keyring(new, current) < 0) | ||
| 397 | goto out_free_user; | ||
| 398 | 408 | ||
| 399 | if (sched_create_user(new) < 0) | 409 | if (sched_create_user(new) < 0) |
| 400 | goto out_put_keys; | 410 | goto out_free_user; |
| 401 | 411 | ||
| 402 | if (uids_user_create(new)) | 412 | if (uids_user_create(new)) |
| 403 | goto out_destoy_sched; | 413 | goto out_destoy_sched; |
| @@ -431,9 +441,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 431 | 441 | ||
| 432 | out_destoy_sched: | 442 | out_destoy_sched: |
| 433 | sched_destroy_user(new); | 443 | sched_destroy_user(new); |
| 434 | out_put_keys: | ||
| 435 | key_put(new->uid_keyring); | ||
| 436 | key_put(new->session_keyring); | ||
| 437 | out_free_user: | 444 | out_free_user: |
| 438 | kmem_cache_free(uid_cachep, new); | 445 | kmem_cache_free(uid_cachep, new); |
| 439 | out_unlock: | 446 | out_unlock: |
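In user.c, switching the allocation to kmem_cache_zalloc() is what makes all the explicit zero-initializers removable: a new user_struct now starts out all-zero, and only fields with non-zero initial values still need to be set by hand. A compressed sketch of the resulting allocation path (error handling elided):

    new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);  /* every field starts at 0 */
    if (!new)
            goto out_unlock;
    new->uid = uid;
    atomic_set(&new->__count, 1);                     /* the only non-zero counter */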
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4c9006275df7..a9ab0596de44 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
| 9 | #include <linux/version.h> | 9 | #include <linux/version.h> |
| 10 | #include <linux/nsproxy.h> | 10 | #include <linux/nsproxy.h> |
| 11 | #include <linux/slab.h> | ||
| 11 | #include <linux/user_namespace.h> | 12 | #include <linux/user_namespace.h> |
| 12 | 13 | ||
| 13 | /* | 14 | /* |
| @@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref) | |||
| 73 | release_uids(ns); | 74 | release_uids(ns); |
| 74 | kfree(ns); | 75 | kfree(ns); |
| 75 | } | 76 | } |
| 77 | EXPORT_SYMBOL(free_user_ns); | ||
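free_user_ns() gains an export because the reference drop happens in an inline helper in the header, so modules end up referencing the symbol directly; the helper looks roughly like this (a sketch of the header of that era, not quoted verbatim):

    static inline void put_user_ns(struct user_namespace *ns)
    {
            if (ns)
                    kref_put(&ns->kref, free_user_ns);
    }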
diff --git a/kernel/utsname.c b/kernel/utsname.c index 816d7b24fa03..64d398f12444 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/utsname.h> | 14 | #include <linux/utsname.h> |
| 15 | #include <linux/version.h> | 15 | #include <linux/version.h> |
| 16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
| 17 | #include <linux/slab.h> | ||
| 17 | 18 | ||
| 18 | /* | 19 | /* |
| 19 | * Clone a new ns copying an original utsname, setting refcount to 1 | 20 | * Clone a new ns copying an original utsname, setting refcount to 1 |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ff06611655af..29fc39f1029c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, | |||
| 158 | * | 158 | * |
| 159 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 159 | * Returns 0 if @work was already on a queue, non-zero otherwise. |
| 160 | * | 160 | * |
| 161 | * We queue the work to the CPU it was submitted, but there is no | 161 | * We queue the work to the CPU on which it was submitted, but if the CPU dies |
| 162 | * guarantee that it will be processed by that CPU. | 162 | * it can be processed by another CPU. |
| 163 | */ | 163 | */ |
| 164 | int queue_work(struct workqueue_struct *wq, struct work_struct *work) | 164 | int queue_work(struct workqueue_struct *wq, struct work_struct *work) |
| 165 | { | 165 | { |
| @@ -195,7 +195,6 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
| 195 | int queue_delayed_work(struct workqueue_struct *wq, | 195 | int queue_delayed_work(struct workqueue_struct *wq, |
| 196 | struct delayed_work *dwork, unsigned long delay) | 196 | struct delayed_work *dwork, unsigned long delay) |
| 197 | { | 197 | { |
| 198 | timer_stats_timer_set_start_info(&dwork->timer); | ||
| 199 | if (delay == 0) | 198 | if (delay == 0) |
| 200 | return queue_work(wq, &dwork->work); | 199 | return queue_work(wq, &dwork->work); |
| 201 | 200 | ||
| @@ -223,6 +222,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
| 223 | BUG_ON(timer_pending(timer)); | 222 | BUG_ON(timer_pending(timer)); |
| 224 | BUG_ON(!list_empty(&work->entry)); | 223 | BUG_ON(!list_empty(&work->entry)); |
| 225 | 224 | ||
| 225 | timer_stats_timer_set_start_info(&dwork->timer); | ||
| 226 | |||
| 226 | /* This stores cwq for the moment, for the timer_fn */ | 227 | /* This stores cwq for the moment, for the timer_fn */ |
| 227 | set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); | 228 | set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); |
| 228 | timer->expires = jiffies + delay; | 229 | timer->expires = jiffies + delay; |
| @@ -246,7 +247,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) | |||
| 246 | if (cwq->run_depth > 3) { | 247 | if (cwq->run_depth > 3) { |
| 247 | /* morton gets to eat his hat */ | 248 | /* morton gets to eat his hat */ |
| 248 | printk("%s: recursion depth exceeded: %d\n", | 249 | printk("%s: recursion depth exceeded: %d\n", |
| 249 | __FUNCTION__, cwq->run_depth); | 250 | __func__, cwq->run_depth); |
| 250 | dump_stack(); | 251 | dump_stack(); |
| 251 | } | 252 | } |
| 252 | while (!list_empty(&cwq->worklist)) { | 253 | while (!list_empty(&cwq->worklist)) { |
| @@ -563,7 +564,6 @@ EXPORT_SYMBOL(schedule_work); | |||
| 563 | int schedule_delayed_work(struct delayed_work *dwork, | 564 | int schedule_delayed_work(struct delayed_work *dwork, |
| 564 | unsigned long delay) | 565 | unsigned long delay) |
| 565 | { | 566 | { |
| 566 | timer_stats_timer_set_start_info(&dwork->timer); | ||
| 567 | return queue_delayed_work(keventd_wq, dwork, delay); | 567 | return queue_delayed_work(keventd_wq, dwork, delay); |
| 568 | } | 568 | } |
| 569 | EXPORT_SYMBOL(schedule_delayed_work); | 569 | EXPORT_SYMBOL(schedule_delayed_work); |
| @@ -770,7 +770,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
| 770 | } | 770 | } |
| 771 | EXPORT_SYMBOL_GPL(__create_workqueue_key); | 771 | EXPORT_SYMBOL_GPL(__create_workqueue_key); |
| 772 | 772 | ||
| 773 | static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | 773 | static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) |
| 774 | { | 774 | { |
| 775 | /* | 775 | /* |
| 776 | * Our caller is either destroy_workqueue() or CPU_DEAD, | 776 | * Our caller is either destroy_workqueue() or CPU_DEAD, |
| @@ -806,19 +806,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
| 806 | void destroy_workqueue(struct workqueue_struct *wq) | 806 | void destroy_workqueue(struct workqueue_struct *wq) |
| 807 | { | 807 | { |
| 808 | const cpumask_t *cpu_map = wq_cpu_map(wq); | 808 | const cpumask_t *cpu_map = wq_cpu_map(wq); |
| 809 | struct cpu_workqueue_struct *cwq; | ||
| 810 | int cpu; | 809 | int cpu; |
| 811 | 810 | ||
| 812 | get_online_cpus(); | 811 | get_online_cpus(); |
| 813 | spin_lock(&workqueue_lock); | 812 | spin_lock(&workqueue_lock); |
| 814 | list_del(&wq->list); | 813 | list_del(&wq->list); |
| 815 | spin_unlock(&workqueue_lock); | 814 | spin_unlock(&workqueue_lock); |
| 816 | put_online_cpus(); | ||
| 817 | 815 | ||
| 818 | for_each_cpu_mask(cpu, *cpu_map) { | 816 | for_each_cpu_mask(cpu, *cpu_map) |
| 819 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 817 | cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); |
| 820 | cleanup_workqueue_thread(cwq, cpu); | 818 | put_online_cpus(); |
| 821 | } | ||
| 822 | 819 | ||
| 823 | free_percpu(wq->cpu_wq); | 820 | free_percpu(wq->cpu_wq); |
| 824 | kfree(wq); | 821 | kfree(wq); |
| @@ -836,7 +833,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 836 | action &= ~CPU_TASKS_FROZEN; | 833 | action &= ~CPU_TASKS_FROZEN; |
| 837 | 834 | ||
| 838 | switch (action) { | 835 | switch (action) { |
| 839 | |||
| 840 | case CPU_UP_PREPARE: | 836 | case CPU_UP_PREPARE: |
| 841 | cpu_set(cpu, cpu_populated_map); | 837 | cpu_set(cpu, cpu_populated_map); |
| 842 | } | 838 | } |
| @@ -859,11 +855,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 859 | case CPU_UP_CANCELED: | 855 | case CPU_UP_CANCELED: |
| 860 | start_workqueue_thread(cwq, -1); | 856 | start_workqueue_thread(cwq, -1); |
| 861 | case CPU_DEAD: | 857 | case CPU_DEAD: |
| 862 | cleanup_workqueue_thread(cwq, cpu); | 858 | cleanup_workqueue_thread(cwq); |
| 863 | break; | 859 | break; |
| 864 | } | 860 | } |
| 865 | } | 861 | } |
| 866 | 862 | ||
| 863 | switch (action) { | ||
| 864 | case CPU_UP_CANCELED: | ||
| 865 | case CPU_DEAD: | ||
| 866 | cpu_clear(cpu, cpu_populated_map); | ||
| 867 | } | ||
| 868 | |||
| 867 | return NOTIFY_OK; | 869 | return NOTIFY_OK; |
| 868 | } | 870 | } |
| 869 | 871 | ||
