Diffstat (limited to 'kernel')
37 files changed, 1866 insertions, 680 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..ff4dc02ce170 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
| 12 | obj-$(CONFIG_FUTEX) += futex.o | 12 | obj-$(CONFIG_FUTEX) += futex.o |
| 13 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 13 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
| 14 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 14 | obj-$(CONFIG_SMP) += cpu.o spinlock.o |
| 15 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | ||
| 15 | obj-$(CONFIG_UID16) += uid16.o | 16 | obj-$(CONFIG_UID16) += uid16.o |
| 16 | obj-$(CONFIG_MODULES) += module.o | 17 | obj-$(CONFIG_MODULES) += module.o |
| 17 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 18 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
| @@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o | |||
| 27 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 28 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
| 28 | obj-$(CONFIG_KPROBES) += kprobes.o | 29 | obj-$(CONFIG_KPROBES) += kprobes.o |
| 29 | obj-$(CONFIG_SYSFS) += ksysfs.o | 30 | obj-$(CONFIG_SYSFS) += ksysfs.o |
| 31 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | ||
| 30 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 32 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
| 31 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 33 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
| 32 | obj-$(CONFIG_SECCOMP) += seccomp.o | 34 | obj-$(CONFIG_SECCOMP) += seccomp.o |
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..b756f527497e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -165,7 +165,7 @@ out: | |||
| 165 | } | 165 | } |
| 166 | 166 | ||
| 167 | /* | 167 | /* |
| 168 | * Close the old accouting file (if currently open) and then replace | 168 | * Close the old accounting file (if currently open) and then replace |
| 169 | * it with file (if non-NULL). | 169 | * it with file (if non-NULL). |
| 170 | * | 170 | * |
| 171 | * NOTE: acct_globals.lock MUST be held on entry and exit. | 171 | * NOTE: acct_globals.lock MUST be held on entry and exit. |
| @@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file) | |||
| 199 | } | 199 | } |
| 200 | } | 200 | } |
| 201 | 201 | ||
| 202 | /* | 202 | /** |
| 203 | * sys_acct() is the only system call needed to implement process | 203 | * sys_acct - enable/disable process accounting |
| 204 | * accounting. It takes the name of the file where accounting records | 204 | * @name: file name for accounting records or NULL to shutdown accounting |
| 205 | * should be written. If the filename is NULL, accounting will be | 205 | * |
| 206 | * shutdown. | 206 | * Returns 0 for success or negative errno values for failure. |
| 207 | * | ||
| 208 | * sys_acct() is the only system call needed to implement process | ||
| 209 | * accounting. It takes the name of the file where accounting records | ||
| 210 | * should be written. If the filename is NULL, accounting will be | ||
| 211 | * shutdown. | ||
| 207 | */ | 212 | */ |
| 208 | asmlinkage long sys_acct(const char __user *name) | 213 | asmlinkage long sys_acct(const char __user *name) |
| 209 | { | 214 | { |
| @@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name) | |||
| 220 | return (PTR_ERR(tmp)); | 225 | return (PTR_ERR(tmp)); |
| 221 | } | 226 | } |
| 222 | /* Difference from BSD - they don't do O_APPEND */ | 227 | /* Difference from BSD - they don't do O_APPEND */ |
| 223 | file = filp_open(tmp, O_WRONLY|O_APPEND, 0); | 228 | file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); |
| 224 | putname(tmp); | 229 | putname(tmp); |
| 225 | if (IS_ERR(file)) { | 230 | if (IS_ERR(file)) { |
| 226 | return (PTR_ERR(file)); | 231 | return (PTR_ERR(file)); |
| @@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name) | |||
| 250 | return (0); | 255 | return (0); |
| 251 | } | 256 | } |
| 252 | 257 | ||
| 253 | /* | 258 | /** |
| 254 | * If the accouting is turned on for a file in the filesystem pointed | 259 | * acct_auto_close - turn off a filesystem's accounting if it is on |
| 255 | * to by sb, turn accouting off. | 260 | * @sb: super block for the filesystem |
| 261 | * | ||
| 262 | * If the accounting is turned on for a file in the filesystem pointed | ||
| 263 | * to by sb, turn accounting off. | ||
| 256 | */ | 264 | */ |
| 257 | void acct_auto_close(struct super_block *sb) | 265 | void acct_auto_close(struct super_block *sb) |
| 258 | { | 266 | { |
| @@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file) | |||
| 503 | set_fs(fs); | 511 | set_fs(fs); |
| 504 | } | 512 | } |
| 505 | 513 | ||
| 506 | /* | 514 | /** |
| 507 | * acct_process - now just a wrapper around do_acct_process | 515 | * acct_process - now just a wrapper around do_acct_process |
| 516 | * @exitcode: task exit code | ||
| 517 | * | ||
| 518 | * handles process accounting for an exiting task | ||
| 508 | */ | 519 | */ |
| 509 | void acct_process(long exitcode) | 520 | void acct_process(long exitcode) |
| 510 | { | 521 | { |
| @@ -530,9 +541,9 @@ void acct_process(long exitcode) | |||
| 530 | } | 541 | } |
| 531 | 542 | ||
| 532 | 543 | ||
| 533 | /* | 544 | /** |
| 534 | * acct_update_integrals | 545 | * acct_update_integrals - update mm integral fields in task_struct |
| 535 | * - update mm integral fields in task_struct | 546 | * @tsk: task_struct for accounting |
| 536 | */ | 547 | */ |
| 537 | void acct_update_integrals(struct task_struct *tsk) | 548 | void acct_update_integrals(struct task_struct *tsk) |
| 538 | { | 549 | { |
| @@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk) | |||
| 547 | } | 558 | } |
| 548 | } | 559 | } |
| 549 | 560 | ||
| 550 | /* | 561 | /** |
| 551 | * acct_clear_integrals | 562 | * acct_clear_integrals - clear the mm integral fields in task_struct |
| 552 | * - clear the mm integral fields in task_struct | 563 | * @tsk: task_struct whose accounting fields are cleared |
| 553 | */ | 564 | */ |
| 554 | void acct_clear_integrals(struct task_struct *tsk) | 565 | void acct_clear_integrals(struct task_struct *tsk) |
| 555 | { | 566 | { |
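
Aside from the kerneldoc conversion, the one behavioural change in kernel/acct.c is the added O_LARGEFILE flag, which keeps accounting writes working once the file crosses 2 GB on a 32-bit kernel. A minimal sketch of the open-for-append step, assuming a kernel-space filename such as sys_acct() holds after getname(); the helper name is illustrative, not part of the patch:

    #include <linux/fcntl.h>
    #include <linux/fs.h>
    #include <linux/err.h>

    /* Open the accounting file for appending, as sys_acct() now does.
     * O_LARGEFILE keeps the append path working once the file crosses
     * 2 GB on a 32-bit kernel, instead of failing with EFBIG. */
    static long acct_open_sketch(const char *kname, struct file **filp)
    {
            struct file *file = filp_open(kname, O_WRONLY | O_APPEND | O_LARGEFILE, 0);

            if (IS_ERR(file))
                    return PTR_ERR(file);

            *filp = file;
            return 0;
    }
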
diff --git a/kernel/audit.c b/kernel/audit.c
index 7f0699790d46..aefa73a8a586 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -79,6 +79,8 @@ static int audit_rate_limit; | |||
| 79 | 79 | ||
| 80 | /* Number of outstanding audit_buffers allowed. */ | 80 | /* Number of outstanding audit_buffers allowed. */ |
| 81 | static int audit_backlog_limit = 64; | 81 | static int audit_backlog_limit = 64; |
| 82 | static int audit_backlog_wait_time = 60 * HZ; | ||
| 83 | static int audit_backlog_wait_overflow = 0; | ||
| 82 | 84 | ||
| 83 | /* The identity of the user shutting down the audit system. */ | 85 | /* The identity of the user shutting down the audit system. */ |
| 84 | uid_t audit_sig_uid = -1; | 86 | uid_t audit_sig_uid = -1; |
| @@ -106,18 +108,12 @@ static LIST_HEAD(audit_freelist); | |||
| 106 | static struct sk_buff_head audit_skb_queue; | 108 | static struct sk_buff_head audit_skb_queue; |
| 107 | static struct task_struct *kauditd_task; | 109 | static struct task_struct *kauditd_task; |
| 108 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); | 110 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); |
| 109 | 111 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); | |
| 110 | /* There are three lists of rules -- one to search at task creation | ||
| 111 | * time, one to search at syscall entry time, and another to search at | ||
| 112 | * syscall exit time. */ | ||
| 113 | static LIST_HEAD(audit_tsklist); | ||
| 114 | static LIST_HEAD(audit_entlist); | ||
| 115 | static LIST_HEAD(audit_extlist); | ||
| 116 | 112 | ||
| 117 | /* The netlink socket is only to be read by 1 CPU, which lets us assume | 113 | /* The netlink socket is only to be read by 1 CPU, which lets us assume |
| 118 | * that list additions and deletions never happen simultaneously in | 114 | * that list additions and deletions never happen simultaneously in |
| 119 | * auditsc.c */ | 115 | * auditsc.c */ |
| 120 | static DECLARE_MUTEX(audit_netlink_sem); | 116 | DECLARE_MUTEX(audit_netlink_sem); |
| 121 | 117 | ||
| 122 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting | 118 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting |
| 123 | * audit records. Since printk uses a 1024 byte buffer, this buffer | 119 | * audit records. Since printk uses a 1024 byte buffer, this buffer |
| @@ -137,6 +133,7 @@ struct audit_buffer { | |||
| 137 | struct list_head list; | 133 | struct list_head list; |
| 138 | struct sk_buff *skb; /* formatted skb ready to send */ | 134 | struct sk_buff *skb; /* formatted skb ready to send */ |
| 139 | struct audit_context *ctx; /* NULL or associated context */ | 135 | struct audit_context *ctx; /* NULL or associated context */ |
| 136 | int gfp_mask; | ||
| 140 | }; | 137 | }; |
| 141 | 138 | ||
| 142 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
| @@ -145,11 +142,6 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | |||
| 145 | nlh->nlmsg_pid = pid; | 142 | nlh->nlmsg_pid = pid; |
| 146 | } | 143 | } |
| 147 | 144 | ||
| 148 | struct audit_entry { | ||
| 149 | struct list_head list; | ||
| 150 | struct audit_rule rule; | ||
| 151 | }; | ||
| 152 | |||
| 153 | static void audit_panic(const char *message) | 145 | static void audit_panic(const char *message) |
| 154 | { | 146 | { |
| 155 | switch (audit_failure) | 147 | switch (audit_failure) |
| @@ -233,7 +225,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid) | |||
| 233 | { | 225 | { |
| 234 | int old = audit_rate_limit; | 226 | int old = audit_rate_limit; |
| 235 | audit_rate_limit = limit; | 227 | audit_rate_limit = limit; |
| 236 | audit_log(NULL, AUDIT_CONFIG_CHANGE, | 228 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| 237 | "audit_rate_limit=%d old=%d by auid=%u", | 229 | "audit_rate_limit=%d old=%d by auid=%u", |
| 238 | audit_rate_limit, old, loginuid); | 230 | audit_rate_limit, old, loginuid); |
| 239 | return old; | 231 | return old; |
| @@ -243,7 +235,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid) | |||
| 243 | { | 235 | { |
| 244 | int old = audit_backlog_limit; | 236 | int old = audit_backlog_limit; |
| 245 | audit_backlog_limit = limit; | 237 | audit_backlog_limit = limit; |
| 246 | audit_log(NULL, AUDIT_CONFIG_CHANGE, | 238 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| 247 | "audit_backlog_limit=%d old=%d by auid=%u", | 239 | "audit_backlog_limit=%d old=%d by auid=%u", |
| 248 | audit_backlog_limit, old, loginuid); | 240 | audit_backlog_limit, old, loginuid); |
| 249 | return old; | 241 | return old; |
| @@ -255,7 +247,7 @@ static int audit_set_enabled(int state, uid_t loginuid) | |||
| 255 | if (state != 0 && state != 1) | 247 | if (state != 0 && state != 1) |
| 256 | return -EINVAL; | 248 | return -EINVAL; |
| 257 | audit_enabled = state; | 249 | audit_enabled = state; |
| 258 | audit_log(NULL, AUDIT_CONFIG_CHANGE, | 250 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| 259 | "audit_enabled=%d old=%d by auid=%u", | 251 | "audit_enabled=%d old=%d by auid=%u", |
| 260 | audit_enabled, old, loginuid); | 252 | audit_enabled, old, loginuid); |
| 261 | return old; | 253 | return old; |
| @@ -269,7 +261,7 @@ static int audit_set_failure(int state, uid_t loginuid) | |||
| 269 | && state != AUDIT_FAIL_PANIC) | 261 | && state != AUDIT_FAIL_PANIC) |
| 270 | return -EINVAL; | 262 | return -EINVAL; |
| 271 | audit_failure = state; | 263 | audit_failure = state; |
| 272 | audit_log(NULL, AUDIT_CONFIG_CHANGE, | 264 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| 273 | "audit_failure=%d old=%d by auid=%u", | 265 | "audit_failure=%d old=%d by auid=%u", |
| 274 | audit_failure, old, loginuid); | 266 | audit_failure, old, loginuid); |
| 275 | return old; | 267 | return old; |
| @@ -281,6 +273,7 @@ int kauditd_thread(void *dummy) | |||
| 281 | 273 | ||
| 282 | while (1) { | 274 | while (1) { |
| 283 | skb = skb_dequeue(&audit_skb_queue); | 275 | skb = skb_dequeue(&audit_skb_queue); |
| 276 | wake_up(&audit_backlog_wait); | ||
| 284 | if (skb) { | 277 | if (skb) { |
| 285 | if (audit_pid) { | 278 | if (audit_pid) { |
| 286 | int err = netlink_unicast(audit_sock, skb, audit_pid, 0); | 279 | int err = netlink_unicast(audit_sock, skb, audit_pid, 0); |
| @@ -290,7 +283,7 @@ int kauditd_thread(void *dummy) | |||
| 290 | audit_pid = 0; | 283 | audit_pid = 0; |
| 291 | } | 284 | } |
| 292 | } else { | 285 | } else { |
| 293 | printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0)); | 286 | printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); |
| 294 | kfree_skb(skb); | 287 | kfree_skb(skb); |
| 295 | } | 288 | } |
| 296 | } else { | 289 | } else { |
| @@ -423,7 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 423 | if (status_get->mask & AUDIT_STATUS_PID) { | 416 | if (status_get->mask & AUDIT_STATUS_PID) { |
| 424 | int old = audit_pid; | 417 | int old = audit_pid; |
| 425 | audit_pid = status_get->pid; | 418 | audit_pid = status_get->pid; |
| 426 | audit_log(NULL, AUDIT_CONFIG_CHANGE, | 419 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| 427 | "audit_pid=%d old=%d by auid=%u", | 420 | "audit_pid=%d old=%d by auid=%u", |
| 428 | audit_pid, old, loginuid); | 421 | audit_pid, old, loginuid); |
| 429 | } | 422 | } |
| @@ -435,15 +428,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 435 | break; | 428 | break; |
| 436 | case AUDIT_USER: | 429 | case AUDIT_USER: |
| 437 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 430 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
| 438 | ab = audit_log_start(NULL, msg_type); | 431 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
| 439 | if (!ab) | 432 | return 0; |
| 440 | break; /* audit_panic has been called */ | 433 | |
| 441 | audit_log_format(ab, | 434 | err = audit_filter_user(&NETLINK_CB(skb), msg_type); |
| 442 | "user pid=%d uid=%u auid=%u" | 435 | if (err == 1) { |
| 443 | " msg='%.1024s'", | 436 | err = 0; |
| 444 | pid, uid, loginuid, (char *)data); | 437 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
| 445 | audit_set_pid(ab, pid); | 438 | if (ab) { |
| 446 | audit_log_end(ab); | 439 | audit_log_format(ab, |
| 440 | "user pid=%d uid=%u auid=%u msg='%.1024s'", | ||
| 441 | pid, uid, loginuid, (char *)data); | ||
| 442 | audit_set_pid(ab, pid); | ||
| 443 | audit_log_end(ab); | ||
| 444 | } | ||
| 445 | } | ||
| 447 | break; | 446 | break; |
| 448 | case AUDIT_ADD: | 447 | case AUDIT_ADD: |
| 449 | case AUDIT_DEL: | 448 | case AUDIT_DEL: |
| @@ -523,7 +522,7 @@ static int __init audit_init(void) | |||
| 523 | skb_queue_head_init(&audit_skb_queue); | 522 | skb_queue_head_init(&audit_skb_queue); |
| 524 | audit_initialized = 1; | 523 | audit_initialized = 1; |
| 525 | audit_enabled = audit_default; | 524 | audit_enabled = audit_default; |
| 526 | audit_log(NULL, AUDIT_KERNEL, "initialized"); | 525 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); |
| 527 | return 0; | 526 | return 0; |
| 528 | } | 527 | } |
| 529 | __initcall(audit_init); | 528 | __initcall(audit_init); |
| @@ -561,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab) | |||
| 561 | } | 560 | } |
| 562 | 561 | ||
| 563 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, | 562 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, |
| 564 | int gfp_mask, int type) | 563 | gfp_t gfp_mask, int type) |
| 565 | { | 564 | { |
| 566 | unsigned long flags; | 565 | unsigned long flags; |
| 567 | struct audit_buffer *ab = NULL; | 566 | struct audit_buffer *ab = NULL; |
| @@ -587,6 +586,7 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, | |||
| 587 | goto err; | 586 | goto err; |
| 588 | 587 | ||
| 589 | ab->ctx = ctx; | 588 | ab->ctx = ctx; |
| 589 | ab->gfp_mask = gfp_mask; | ||
| 590 | nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); | 590 | nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); |
| 591 | nlh->nlmsg_type = type; | 591 | nlh->nlmsg_type = type; |
| 592 | nlh->nlmsg_flags = 0; | 592 | nlh->nlmsg_flags = 0; |
| @@ -606,26 +606,27 @@ err: | |||
| 606 | * (timestamp,serial) tuple is unique for each syscall and is live from | 606 | * (timestamp,serial) tuple is unique for each syscall and is live from |
| 607 | * syscall entry to syscall exit. | 607 | * syscall entry to syscall exit. |
| 608 | * | 608 | * |
| 609 | * Atomic values are only guaranteed to be 24-bit, so we count down. | ||
| 610 | * | ||
| 611 | * NOTE: Another possibility is to store the formatted records off the | 609 | * NOTE: Another possibility is to store the formatted records off the |
| 612 | * audit context (for those records that have a context), and emit them | 610 | * audit context (for those records that have a context), and emit them |
| 613 | * all at syscall exit. However, this could delay the reporting of | 611 | * all at syscall exit. However, this could delay the reporting of |
| 614 | * significant errors until syscall exit (or never, if the system | 612 | * significant errors until syscall exit (or never, if the system |
| 615 | * halts). */ | 613 | * halts). */ |
| 614 | |||
| 616 | unsigned int audit_serial(void) | 615 | unsigned int audit_serial(void) |
| 617 | { | 616 | { |
| 618 | static atomic_t serial = ATOMIC_INIT(0xffffff); | 617 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; |
| 619 | unsigned int a, b; | 618 | static unsigned int serial = 0; |
| 619 | |||
| 620 | unsigned long flags; | ||
| 621 | unsigned int ret; | ||
| 620 | 622 | ||
| 623 | spin_lock_irqsave(&serial_lock, flags); | ||
| 621 | do { | 624 | do { |
| 622 | a = atomic_read(&serial); | 625 | ret = ++serial; |
| 623 | if (atomic_dec_and_test(&serial)) | 626 | } while (unlikely(!ret)); |
| 624 | atomic_set(&serial, 0xffffff); | 627 | spin_unlock_irqrestore(&serial_lock, flags); |
| 625 | b = atomic_read(&serial); | ||
| 626 | } while (b != a - 1); | ||
| 627 | 628 | ||
| 628 | return 0xffffff - b; | 629 | return ret; |
| 629 | } | 630 | } |
| 630 | 631 | ||
| 631 | static inline void audit_get_stamp(struct audit_context *ctx, | 632 | static inline void audit_get_stamp(struct audit_context *ctx, |
| @@ -645,17 +646,43 @@ static inline void audit_get_stamp(struct audit_context *ctx, | |||
| 645 | * syscall, then the syscall is marked as auditable and an audit record | 646 | * syscall, then the syscall is marked as auditable and an audit record |
| 646 | * will be written at syscall exit. If there is no associated task, tsk | 647 | * will be written at syscall exit. If there is no associated task, tsk |
| 647 | * should be NULL. */ | 648 | * should be NULL. */ |
| 648 | struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) | 649 | |
| 650 | struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, | ||
| 651 | int type) | ||
| 649 | { | 652 | { |
| 650 | struct audit_buffer *ab = NULL; | 653 | struct audit_buffer *ab = NULL; |
| 651 | struct timespec t; | 654 | struct timespec t; |
| 652 | unsigned int serial; | 655 | unsigned int serial; |
| 656 | int reserve; | ||
| 657 | unsigned long timeout_start = jiffies; | ||
| 653 | 658 | ||
| 654 | if (!audit_initialized) | 659 | if (!audit_initialized) |
| 655 | return NULL; | 660 | return NULL; |
| 656 | 661 | ||
| 657 | if (audit_backlog_limit | 662 | if (gfp_mask & __GFP_WAIT) |
| 658 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { | 663 | reserve = 0; |
| 664 | else | ||
| 665 | reserve = 5; /* Allow atomic callers to go up to five | ||
| 666 | entries over the normal backlog limit */ | ||
| 667 | |||
| 668 | while (audit_backlog_limit | ||
| 669 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { | ||
| 670 | if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time | ||
| 671 | && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { | ||
| 672 | |||
| 673 | /* Wait for auditd to drain the queue a little */ | ||
| 674 | DECLARE_WAITQUEUE(wait, current); | ||
| 675 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 676 | add_wait_queue(&audit_backlog_wait, &wait); | ||
| 677 | |||
| 678 | if (audit_backlog_limit && | ||
| 679 | skb_queue_len(&audit_skb_queue) > audit_backlog_limit) | ||
| 680 | schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); | ||
| 681 | |||
| 682 | __set_current_state(TASK_RUNNING); | ||
| 683 | remove_wait_queue(&audit_backlog_wait, &wait); | ||
| 684 | continue; | ||
| 685 | } | ||
| 659 | if (audit_rate_check()) | 686 | if (audit_rate_check()) |
| 660 | printk(KERN_WARNING | 687 | printk(KERN_WARNING |
| 661 | "audit: audit_backlog=%d > " | 688 | "audit: audit_backlog=%d > " |
| @@ -663,10 +690,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) | |||
| 663 | skb_queue_len(&audit_skb_queue), | 690 | skb_queue_len(&audit_skb_queue), |
| 664 | audit_backlog_limit); | 691 | audit_backlog_limit); |
| 665 | audit_log_lost("backlog limit exceeded"); | 692 | audit_log_lost("backlog limit exceeded"); |
| 693 | audit_backlog_wait_time = audit_backlog_wait_overflow; | ||
| 694 | wake_up(&audit_backlog_wait); | ||
| 666 | return NULL; | 695 | return NULL; |
| 667 | } | 696 | } |
| 668 | 697 | ||
| 669 | ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type); | 698 | ab = audit_buffer_alloc(ctx, gfp_mask, type); |
| 670 | if (!ab) { | 699 | if (!ab) { |
| 671 | audit_log_lost("out of memory in audit_log_start"); | 700 | audit_log_lost("out of memory in audit_log_start"); |
| 672 | return NULL; | 701 | return NULL; |
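
audit_log_start() now carries the caller's allocation mask. Callers that may sleep (__GFP_WAIT set) wait on audit_backlog_wait for kauditd to drain the queue before giving up, while atomic callers instead get a five-entry reserve above the backlog limit. A condensed sketch of that wait loop; the queue, limit and wait-queue parameters stand in for the audit globals and the helper itself is illustrative:

    #include <linux/gfp.h>
    #include <linux/jiffies.h>
    #include <linux/sched.h>
    #include <linux/skbuff.h>
    #include <linux/wait.h>

    /* Returns 0 once there is room on the queue, -1 if the caller should
     * drop the record (atomic context or the wait time has expired). */
    static int backlog_wait_sketch(struct sk_buff_head *q, int limit,
                                   long wait_jiffies,
                                   wait_queue_head_t *drained,
                                   gfp_t gfp_mask)
    {
            /* Atomic callers may overshoot the limit by a small reserve. */
            int reserve = (gfp_mask & __GFP_WAIT) ? 0 : 5;
            unsigned long start = jiffies;

            while (limit && skb_queue_len(q) > limit + reserve) {
                    DECLARE_WAITQUEUE(wait, current);

                    if (!(gfp_mask & __GFP_WAIT) ||
                        !time_before(jiffies, start + wait_jiffies))
                            return -1;

                    set_current_state(TASK_INTERRUPTIBLE);
                    add_wait_queue(drained, &wait);
                    /* Re-check under the waitqueue so a wakeup is not missed. */
                    if (skb_queue_len(q) > limit)
                            schedule_timeout(start + wait_jiffies - jiffies);
                    __set_current_state(TASK_RUNNING);
                    remove_wait_queue(drained, &wait);
            }
            return 0;
    }
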
| @@ -690,7 +719,7 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) | |||
| 690 | { | 719 | { |
| 691 | struct sk_buff *skb = ab->skb; | 720 | struct sk_buff *skb = ab->skb; |
| 692 | int ret = pskb_expand_head(skb, skb_headroom(skb), extra, | 721 | int ret = pskb_expand_head(skb, skb_headroom(skb), extra, |
| 693 | GFP_ATOMIC); | 722 | ab->gfp_mask); |
| 694 | if (ret < 0) { | 723 | if (ret < 0) { |
| 695 | audit_log_lost("out of memory in audit_expand"); | 724 | audit_log_lost("out of memory in audit_expand"); |
| 696 | return 0; | 725 | return 0; |
| @@ -809,7 +838,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | |||
| 809 | audit_log_format(ab, " %s", prefix); | 838 | audit_log_format(ab, " %s", prefix); |
| 810 | 839 | ||
| 811 | /* We will allow 11 spaces for ' (deleted)' to be appended */ | 840 | /* We will allow 11 spaces for ' (deleted)' to be appended */ |
| 812 | path = kmalloc(PATH_MAX+11, GFP_KERNEL); | 841 | path = kmalloc(PATH_MAX+11, ab->gfp_mask); |
| 813 | if (!path) { | 842 | if (!path) { |
| 814 | audit_log_format(ab, "<no memory>"); | 843 | audit_log_format(ab, "<no memory>"); |
| 815 | return; | 844 | return; |
| @@ -841,7 +870,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
| 841 | ab->skb = NULL; | 870 | ab->skb = NULL; |
| 842 | wake_up_interruptible(&kauditd_wait); | 871 | wake_up_interruptible(&kauditd_wait); |
| 843 | } else { | 872 | } else { |
| 844 | printk("%s\n", ab->skb->data + NLMSG_SPACE(0)); | 873 | printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); |
| 845 | } | 874 | } |
| 846 | } | 875 | } |
| 847 | audit_buffer_free(ab); | 876 | audit_buffer_free(ab); |
| @@ -850,12 +879,13 @@ void audit_log_end(struct audit_buffer *ab) | |||
| 850 | /* Log an audit record. This is a convenience function that calls | 879 | /* Log an audit record. This is a convenience function that calls |
| 851 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | 880 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be |
| 852 | * called in any context. */ | 881 | * called in any context. */ |
| 853 | void audit_log(struct audit_context *ctx, int type, const char *fmt, ...) | 882 | void audit_log(struct audit_context *ctx, int gfp_mask, int type, |
| 883 | const char *fmt, ...) | ||
| 854 | { | 884 | { |
| 855 | struct audit_buffer *ab; | 885 | struct audit_buffer *ab; |
| 856 | va_list args; | 886 | va_list args; |
| 857 | 887 | ||
| 858 | ab = audit_log_start(ctx, type); | 888 | ab = audit_log_start(ctx, gfp_mask, type); |
| 859 | if (ab) { | 889 | if (ab) { |
| 860 | va_start(args, fmt); | 890 | va_start(args, fmt); |
| 861 | audit_log_vformat(ab, fmt, args); | 891 | audit_log_vformat(ab, fmt, args); |
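
The visible API change in kernel/audit.c is one extra parameter: audit_log() and audit_log_start() both take a gfp mask as their second argument, so process-context callers can pass GFP_KERNEL while interrupt or lock-holding paths keep GFP_ATOMIC. The wrapper functions below are illustrative; the format strings mirror ones used in this patch:

    #include <linux/audit.h>
    #include <linux/gfp.h>
    #include <linux/types.h>

    /* Process context: a sleeping allocation is fine. */
    static void log_config_change(int new_limit, int old_limit, uid_t loginuid)
    {
            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
                      "audit_rate_limit=%d old=%d by auid=%u",
                      new_limit, old_limit, loginuid);
    }

    /* Interrupt or lock-held context: must not sleep, may dip into the
     * small backlog reserve described above. */
    static void log_from_atomic(struct audit_context *ctx, int arch, int major)
    {
            struct audit_buffer *ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_SYSCALL);

            if (ab) {
                    audit_log_format(ab, "arch=%x syscall=%d", arch, major);
                    audit_log_end(ab);
            }
    }
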
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e75f84e1a1a0..88696f639aab 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -39,6 +39,9 @@ | |||
| 39 | #include <linux/audit.h> | 39 | #include <linux/audit.h> |
| 40 | #include <linux/personality.h> | 40 | #include <linux/personality.h> |
| 41 | #include <linux/time.h> | 41 | #include <linux/time.h> |
| 42 | #include <linux/kthread.h> | ||
| 43 | #include <linux/netlink.h> | ||
| 44 | #include <linux/compiler.h> | ||
| 42 | #include <asm/unistd.h> | 45 | #include <asm/unistd.h> |
| 43 | 46 | ||
| 44 | /* 0 = no checking | 47 | /* 0 = no checking |
| @@ -95,6 +98,7 @@ struct audit_names { | |||
| 95 | uid_t uid; | 98 | uid_t uid; |
| 96 | gid_t gid; | 99 | gid_t gid; |
| 97 | dev_t rdev; | 100 | dev_t rdev; |
| 101 | unsigned flags; | ||
| 98 | }; | 102 | }; |
| 99 | 103 | ||
| 100 | struct audit_aux_data { | 104 | struct audit_aux_data { |
| @@ -167,9 +171,16 @@ struct audit_context { | |||
| 167 | /* There are three lists of rules -- one to search at task creation | 171 | /* There are three lists of rules -- one to search at task creation |
| 168 | * time, one to search at syscall entry time, and another to search at | 172 | * time, one to search at syscall entry time, and another to search at |
| 169 | * syscall exit time. */ | 173 | * syscall exit time. */ |
| 170 | static LIST_HEAD(audit_tsklist); | 174 | static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { |
| 171 | static LIST_HEAD(audit_entlist); | 175 | LIST_HEAD_INIT(audit_filter_list[0]), |
| 172 | static LIST_HEAD(audit_extlist); | 176 | LIST_HEAD_INIT(audit_filter_list[1]), |
| 177 | LIST_HEAD_INIT(audit_filter_list[2]), | ||
| 178 | LIST_HEAD_INIT(audit_filter_list[3]), | ||
| 179 | LIST_HEAD_INIT(audit_filter_list[4]), | ||
| 180 | #if AUDIT_NR_FILTERS != 5 | ||
| 181 | #error Fix audit_filter_list initialiser | ||
| 182 | #endif | ||
| 183 | }; | ||
| 173 | 184 | ||
| 174 | struct audit_entry { | 185 | struct audit_entry { |
| 175 | struct list_head list; | 186 | struct list_head list; |
| @@ -179,9 +190,36 @@ struct audit_entry { | |||
| 179 | 190 | ||
| 180 | extern int audit_pid; | 191 | extern int audit_pid; |
| 181 | 192 | ||
| 193 | /* Copy rule from user-space to kernel-space. Called from | ||
| 194 | * audit_add_rule during AUDIT_ADD. */ | ||
| 195 | static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) | ||
| 196 | { | ||
| 197 | int i; | ||
| 198 | |||
| 199 | if (s->action != AUDIT_NEVER | ||
| 200 | && s->action != AUDIT_POSSIBLE | ||
| 201 | && s->action != AUDIT_ALWAYS) | ||
| 202 | return -1; | ||
| 203 | if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) | ||
| 204 | return -1; | ||
| 205 | if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS) | ||
| 206 | return -1; | ||
| 207 | |||
| 208 | d->flags = s->flags; | ||
| 209 | d->action = s->action; | ||
| 210 | d->field_count = s->field_count; | ||
| 211 | for (i = 0; i < d->field_count; i++) { | ||
| 212 | d->fields[i] = s->fields[i]; | ||
| 213 | d->values[i] = s->values[i]; | ||
| 214 | } | ||
| 215 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; | ||
| 216 | return 0; | ||
| 217 | } | ||
| 218 | |||
| 182 | /* Check to see if two rules are identical. It is called from | 219 | /* Check to see if two rules are identical. It is called from |
| 220 | * audit_add_rule during AUDIT_ADD and | ||
| 183 | * audit_del_rule during AUDIT_DEL. */ | 221 | * audit_del_rule during AUDIT_DEL. */ |
| 184 | static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) | 222 | static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) |
| 185 | { | 223 | { |
| 186 | int i; | 224 | int i; |
| 187 | 225 | ||
| @@ -210,19 +248,37 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) | |||
| 210 | /* Note that audit_add_rule and audit_del_rule are called via | 248 | /* Note that audit_add_rule and audit_del_rule are called via |
| 211 | * audit_receive() in audit.c, and are protected by | 249 | * audit_receive() in audit.c, and are protected by |
| 212 | * audit_netlink_sem. */ | 250 | * audit_netlink_sem. */ |
| 213 | static inline int audit_add_rule(struct audit_entry *entry, | 251 | static inline int audit_add_rule(struct audit_rule *rule, |
| 214 | struct list_head *list) | 252 | struct list_head *list) |
| 215 | { | 253 | { |
| 216 | if (entry->rule.flags & AUDIT_PREPEND) { | 254 | struct audit_entry *entry; |
| 217 | entry->rule.flags &= ~AUDIT_PREPEND; | 255 | |
| 256 | /* Do not use the _rcu iterator here, since this is the only | ||
| 257 | * addition routine. */ | ||
| 258 | list_for_each_entry(entry, list, list) { | ||
| 259 | if (!audit_compare_rule(rule, &entry->rule)) { | ||
| 260 | return -EEXIST; | ||
| 261 | } | ||
| 262 | } | ||
| 263 | |||
| 264 | if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) | ||
| 265 | return -ENOMEM; | ||
| 266 | if (audit_copy_rule(&entry->rule, rule)) { | ||
| 267 | kfree(entry); | ||
| 268 | return -EINVAL; | ||
| 269 | } | ||
| 270 | |||
| 271 | if (entry->rule.flags & AUDIT_FILTER_PREPEND) { | ||
| 272 | entry->rule.flags &= ~AUDIT_FILTER_PREPEND; | ||
| 218 | list_add_rcu(&entry->list, list); | 273 | list_add_rcu(&entry->list, list); |
| 219 | } else { | 274 | } else { |
| 220 | list_add_tail_rcu(&entry->list, list); | 275 | list_add_tail_rcu(&entry->list, list); |
| 221 | } | 276 | } |
| 277 | |||
| 222 | return 0; | 278 | return 0; |
| 223 | } | 279 | } |
| 224 | 280 | ||
| 225 | static void audit_free_rule(struct rcu_head *head) | 281 | static inline void audit_free_rule(struct rcu_head *head) |
| 226 | { | 282 | { |
| 227 | struct audit_entry *e = container_of(head, struct audit_entry, rcu); | 283 | struct audit_entry *e = container_of(head, struct audit_entry, rcu); |
| 228 | kfree(e); | 284 | kfree(e); |
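
Rule updates keep the existing RCU discipline: additions and deletions are serialized by audit_netlink_sem, readers walk the filter lists under rcu_read_lock(), and a deleted entry is handed to audit_free_rule() above through an RCU callback so it is only freed after a grace period. The generic shape of that pattern, as a sketch with shortened, illustrative names:

    #include <linux/kernel.h>
    #include <linux/list.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct rule_entry {
            struct list_head list;
            struct rcu_head  rcu;
            /* ... rule payload ... */
    };

    static void rule_entry_free(struct rcu_head *head)
    {
            kfree(container_of(head, struct rule_entry, rcu));
    }

    /* Writer side; callers are serialized by the update semaphore. */
    static void rule_entry_del(struct rule_entry *e)
    {
            list_del_rcu(&e->list);                 /* unlink for new readers  */
            call_rcu(&e->rcu, rule_entry_free);     /* free after grace period */
    }
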
| @@ -245,82 +301,82 @@ static inline int audit_del_rule(struct audit_rule *rule, | |||
| 245 | return 0; | 301 | return 0; |
| 246 | } | 302 | } |
| 247 | } | 303 | } |
| 248 | return -EFAULT; /* No matching rule */ | 304 | return -ENOENT; /* No matching rule */ |
| 249 | } | 305 | } |
| 250 | 306 | ||
| 251 | /* Copy rule from user-space to kernel-space. Called during | 307 | static int audit_list_rules(void *_dest) |
| 252 | * AUDIT_ADD. */ | ||
| 253 | static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) | ||
| 254 | { | 308 | { |
| 309 | int pid, seq; | ||
| 310 | int *dest = _dest; | ||
| 311 | struct audit_entry *entry; | ||
| 255 | int i; | 312 | int i; |
| 256 | 313 | ||
| 257 | if (s->action != AUDIT_NEVER | 314 | pid = dest[0]; |
| 258 | && s->action != AUDIT_POSSIBLE | 315 | seq = dest[1]; |
| 259 | && s->action != AUDIT_ALWAYS) | 316 | kfree(dest); |
| 260 | return -1; | ||
| 261 | if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) | ||
| 262 | return -1; | ||
| 263 | 317 | ||
| 264 | d->flags = s->flags; | 318 | down(&audit_netlink_sem); |
| 265 | d->action = s->action; | 319 | |
| 266 | d->field_count = s->field_count; | 320 | /* The *_rcu iterators not needed here because we are |
| 267 | for (i = 0; i < d->field_count; i++) { | 321 | always called with audit_netlink_sem held. */ |
| 268 | d->fields[i] = s->fields[i]; | 322 | for (i=0; i<AUDIT_NR_FILTERS; i++) { |
| 269 | d->values[i] = s->values[i]; | 323 | list_for_each_entry(entry, &audit_filter_list[i], list) |
| 324 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
| 325 | &entry->rule, sizeof(entry->rule)); | ||
| 270 | } | 326 | } |
| 271 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; | 327 | audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); |
| 328 | |||
| 329 | up(&audit_netlink_sem); | ||
| 272 | return 0; | 330 | return 0; |
| 273 | } | 331 | } |
| 274 | 332 | ||
| 275 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | 333 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data, |
| 276 | uid_t loginuid) | 334 | uid_t loginuid) |
| 277 | { | 335 | { |
| 278 | u32 flags; | 336 | struct task_struct *tsk; |
| 279 | struct audit_entry *entry; | 337 | int *dest; |
| 280 | int err = 0; | 338 | int err = 0; |
| 339 | unsigned listnr; | ||
| 281 | 340 | ||
| 282 | switch (type) { | 341 | switch (type) { |
| 283 | case AUDIT_LIST: | 342 | case AUDIT_LIST: |
| 284 | /* The *_rcu iterators not needed here because we are | 343 | /* We can't just spew out the rules here because we might fill |
| 285 | always called with audit_netlink_sem held. */ | 344 | * the available socket buffer space and deadlock waiting for |
| 286 | list_for_each_entry(entry, &audit_tsklist, list) | 345 | * auditctl to read from it... which isn't ever going to |
| 287 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | 346 | * happen if we're actually running in the context of auditctl |
| 288 | &entry->rule, sizeof(entry->rule)); | 347 | * trying to _send_ the stuff */ |
| 289 | list_for_each_entry(entry, &audit_entlist, list) | 348 | |
| 290 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | 349 | dest = kmalloc(2 * sizeof(int), GFP_KERNEL); |
| 291 | &entry->rule, sizeof(entry->rule)); | 350 | if (!dest) |
| 292 | list_for_each_entry(entry, &audit_extlist, list) | 351 | return -ENOMEM; |
| 293 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | 352 | dest[0] = pid; |
| 294 | &entry->rule, sizeof(entry->rule)); | 353 | dest[1] = seq; |
| 295 | audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | 354 | |
| 355 | tsk = kthread_run(audit_list_rules, dest, "audit_list_rules"); | ||
| 356 | if (IS_ERR(tsk)) { | ||
| 357 | kfree(dest); | ||
| 358 | err = PTR_ERR(tsk); | ||
| 359 | } | ||
| 296 | break; | 360 | break; |
| 297 | case AUDIT_ADD: | 361 | case AUDIT_ADD: |
| 298 | if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) | 362 | listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; |
| 299 | return -ENOMEM; | 363 | if (listnr >= AUDIT_NR_FILTERS) |
| 300 | if (audit_copy_rule(&entry->rule, data)) { | ||
| 301 | kfree(entry); | ||
| 302 | return -EINVAL; | 364 | return -EINVAL; |
| 303 | } | 365 | |
| 304 | flags = entry->rule.flags; | 366 | err = audit_add_rule(data, &audit_filter_list[listnr]); |
| 305 | if (!err && (flags & AUDIT_PER_TASK)) | 367 | if (!err) |
| 306 | err = audit_add_rule(entry, &audit_tsklist); | 368 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| 307 | if (!err && (flags & AUDIT_AT_ENTRY)) | 369 | "auid=%u added an audit rule\n", loginuid); |
| 308 | err = audit_add_rule(entry, &audit_entlist); | ||
| 309 | if (!err && (flags & AUDIT_AT_EXIT)) | ||
| 310 | err = audit_add_rule(entry, &audit_extlist); | ||
| 311 | audit_log(NULL, AUDIT_CONFIG_CHANGE, | ||
| 312 | "auid=%u added an audit rule\n", loginuid); | ||
| 313 | break; | 370 | break; |
| 314 | case AUDIT_DEL: | 371 | case AUDIT_DEL: |
| 315 | flags =((struct audit_rule *)data)->flags; | 372 | listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; |
| 316 | if (!err && (flags & AUDIT_PER_TASK)) | 373 | if (listnr >= AUDIT_NR_FILTERS) |
| 317 | err = audit_del_rule(data, &audit_tsklist); | 374 | return -EINVAL; |
| 318 | if (!err && (flags & AUDIT_AT_ENTRY)) | 375 | |
| 319 | err = audit_del_rule(data, &audit_entlist); | 376 | err = audit_del_rule(data, &audit_filter_list[listnr]); |
| 320 | if (!err && (flags & AUDIT_AT_EXIT)) | 377 | if (!err) |
| 321 | err = audit_del_rule(data, &audit_extlist); | 378 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| 322 | audit_log(NULL, AUDIT_CONFIG_CHANGE, | 379 | "auid=%u removed an audit rule\n", loginuid); |
| 323 | "auid=%u removed an audit rule\n", loginuid); | ||
| 324 | break; | 380 | break; |
| 325 | default: | 381 | default: |
| 326 | return -EINVAL; | 382 | return -EINVAL; |
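
AUDIT_LIST replies are now produced by a short-lived kernel thread; if they were sent from the netlink receive path itself, a full socket buffer would deadlock against the very auditctl process that should be reading the replies. The hand-off pattern, reduced to an illustrative sketch:

    #include <linux/err.h>
    #include <linux/kthread.h>
    #include <linux/slab.h>

    static int list_worker(void *arg)
    {
            int *dest = arg;

            /* dest[0] = requesting pid, dest[1] = netlink sequence number;
             * the real worker takes audit_netlink_sem and sends the rule
             * list back to that (pid, seq) before returning. */
            kfree(dest);
            return 0;
    }

    static int start_list_worker(int pid, int seq)
    {
            struct task_struct *tsk;
            int *dest = kmalloc(2 * sizeof(int), GFP_KERNEL);

            if (!dest)
                    return -ENOMEM;
            dest[0] = pid;
            dest[1] = seq;

            tsk = kthread_run(list_worker, dest, "audit_list_rules");
            if (IS_ERR(tsk)) {
                    kfree(dest);
                    return PTR_ERR(tsk);
            }
            return 0;
    }
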
| @@ -384,8 +440,12 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 384 | result = (ctx->return_code == value); | 440 | result = (ctx->return_code == value); |
| 385 | break; | 441 | break; |
| 386 | case AUDIT_SUCCESS: | 442 | case AUDIT_SUCCESS: |
| 387 | if (ctx && ctx->return_valid) | 443 | if (ctx && ctx->return_valid) { |
| 388 | result = (ctx->return_valid == AUDITSC_SUCCESS); | 444 | if (value) |
| 445 | result = (ctx->return_valid == AUDITSC_SUCCESS); | ||
| 446 | else | ||
| 447 | result = (ctx->return_valid == AUDITSC_FAILURE); | ||
| 448 | } | ||
| 389 | break; | 449 | break; |
| 390 | case AUDIT_DEVMAJOR: | 450 | case AUDIT_DEVMAJOR: |
| 391 | if (ctx) { | 451 | if (ctx) { |
| @@ -454,7 +514,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) | |||
| 454 | enum audit_state state; | 514 | enum audit_state state; |
| 455 | 515 | ||
| 456 | rcu_read_lock(); | 516 | rcu_read_lock(); |
| 457 | list_for_each_entry_rcu(e, &audit_tsklist, list) { | 517 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { |
| 458 | if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { | 518 | if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { |
| 459 | rcu_read_unlock(); | 519 | rcu_read_unlock(); |
| 460 | return state; | 520 | return state; |
| @@ -474,20 +534,84 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
| 474 | struct list_head *list) | 534 | struct list_head *list) |
| 475 | { | 535 | { |
| 476 | struct audit_entry *e; | 536 | struct audit_entry *e; |
| 537 | enum audit_state state; | ||
| 538 | |||
| 539 | if (audit_pid && tsk->tgid == audit_pid) | ||
| 540 | return AUDIT_DISABLED; | ||
| 541 | |||
| 542 | rcu_read_lock(); | ||
| 543 | if (!list_empty(list)) { | ||
| 544 | int word = AUDIT_WORD(ctx->major); | ||
| 545 | int bit = AUDIT_BIT(ctx->major); | ||
| 546 | |||
| 547 | list_for_each_entry_rcu(e, list, list) { | ||
| 548 | if ((e->rule.mask[word] & bit) == bit | ||
| 549 | && audit_filter_rules(tsk, &e->rule, ctx, &state)) { | ||
| 550 | rcu_read_unlock(); | ||
| 551 | return state; | ||
| 552 | } | ||
| 553 | } | ||
| 554 | } | ||
| 555 | rcu_read_unlock(); | ||
| 556 | return AUDIT_BUILD_CONTEXT; | ||
| 557 | } | ||
| 558 | |||
| 559 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, | ||
| 560 | struct audit_rule *rule, | ||
| 561 | enum audit_state *state) | ||
| 562 | { | ||
| 563 | int i; | ||
| 564 | |||
| 565 | for (i = 0; i < rule->field_count; i++) { | ||
| 566 | u32 field = rule->fields[i] & ~AUDIT_NEGATE; | ||
| 567 | u32 value = rule->values[i]; | ||
| 568 | int result = 0; | ||
| 569 | |||
| 570 | switch (field) { | ||
| 571 | case AUDIT_PID: | ||
| 572 | result = (cb->creds.pid == value); | ||
| 573 | break; | ||
| 574 | case AUDIT_UID: | ||
| 575 | result = (cb->creds.uid == value); | ||
| 576 | break; | ||
| 577 | case AUDIT_GID: | ||
| 578 | result = (cb->creds.gid == value); | ||
| 579 | break; | ||
| 580 | case AUDIT_LOGINUID: | ||
| 581 | result = (cb->loginuid == value); | ||
| 582 | break; | ||
| 583 | } | ||
| 584 | |||
| 585 | if (rule->fields[i] & AUDIT_NEGATE) | ||
| 586 | result = !result; | ||
| 587 | if (!result) | ||
| 588 | return 0; | ||
| 589 | } | ||
| 590 | switch (rule->action) { | ||
| 591 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | ||
| 592 | case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; | ||
| 593 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | ||
| 594 | } | ||
| 595 | return 1; | ||
| 596 | } | ||
| 597 | |||
| 598 | int audit_filter_user(struct netlink_skb_parms *cb, int type) | ||
| 599 | { | ||
| 600 | struct audit_entry *e; | ||
| 477 | enum audit_state state; | 601 | enum audit_state state; |
| 478 | int word = AUDIT_WORD(ctx->major); | 602 | int ret = 1; |
| 479 | int bit = AUDIT_BIT(ctx->major); | ||
| 480 | 603 | ||
| 481 | rcu_read_lock(); | 604 | rcu_read_lock(); |
| 482 | list_for_each_entry_rcu(e, list, list) { | 605 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { |
| 483 | if ((e->rule.mask[word] & bit) == bit | 606 | if (audit_filter_user_rules(cb, &e->rule, &state)) { |
| 484 | && audit_filter_rules(tsk, &e->rule, ctx, &state)) { | 607 | if (state == AUDIT_DISABLED) |
| 485 | rcu_read_unlock(); | 608 | ret = 0; |
| 486 | return state; | 609 | break; |
| 487 | } | 610 | } |
| 488 | } | 611 | } |
| 489 | rcu_read_unlock(); | 612 | rcu_read_unlock(); |
| 490 | return AUDIT_BUILD_CONTEXT; | 613 | |
| 614 | return ret; /* Audit by default */ | ||
| 491 | } | 615 | } |
| 492 | 616 | ||
| 493 | /* This should be called with task_lock() held. */ | 617 | /* This should be called with task_lock() held. */ |
| @@ -504,7 +628,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
| 504 | 628 | ||
| 505 | if (context->in_syscall && !context->auditable) { | 629 | if (context->in_syscall && !context->auditable) { |
| 506 | enum audit_state state; | 630 | enum audit_state state; |
| 507 | state = audit_filter_syscall(tsk, context, &audit_extlist); | 631 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); |
| 508 | if (state == AUDIT_RECORD_CONTEXT) | 632 | if (state == AUDIT_RECORD_CONTEXT) |
| 509 | context->auditable = 1; | 633 | context->auditable = 1; |
| 510 | } | 634 | } |
| @@ -679,13 +803,13 @@ static void audit_log_task_info(struct audit_buffer *ab) | |||
| 679 | up_read(&mm->mmap_sem); | 803 | up_read(&mm->mmap_sem); |
| 680 | } | 804 | } |
| 681 | 805 | ||
| 682 | static void audit_log_exit(struct audit_context *context) | 806 | static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) |
| 683 | { | 807 | { |
| 684 | int i; | 808 | int i; |
| 685 | struct audit_buffer *ab; | 809 | struct audit_buffer *ab; |
| 686 | struct audit_aux_data *aux; | 810 | struct audit_aux_data *aux; |
| 687 | 811 | ||
| 688 | ab = audit_log_start(context, AUDIT_SYSCALL); | 812 | ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); |
| 689 | if (!ab) | 813 | if (!ab) |
| 690 | return; /* audit_panic has been called */ | 814 | return; /* audit_panic has been called */ |
| 691 | audit_log_format(ab, "arch=%x syscall=%d", | 815 | audit_log_format(ab, "arch=%x syscall=%d", |
| @@ -717,7 +841,7 @@ static void audit_log_exit(struct audit_context *context) | |||
| 717 | 841 | ||
| 718 | for (aux = context->aux; aux; aux = aux->next) { | 842 | for (aux = context->aux; aux; aux = aux->next) { |
| 719 | 843 | ||
| 720 | ab = audit_log_start(context, aux->type); | 844 | ab = audit_log_start(context, GFP_KERNEL, aux->type); |
| 721 | if (!ab) | 845 | if (!ab) |
| 722 | continue; /* audit_panic has been called */ | 846 | continue; /* audit_panic has been called */ |
| 723 | 847 | ||
| @@ -754,14 +878,14 @@ static void audit_log_exit(struct audit_context *context) | |||
| 754 | } | 878 | } |
| 755 | 879 | ||
| 756 | if (context->pwd && context->pwdmnt) { | 880 | if (context->pwd && context->pwdmnt) { |
| 757 | ab = audit_log_start(context, AUDIT_CWD); | 881 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); |
| 758 | if (ab) { | 882 | if (ab) { |
| 759 | audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); | 883 | audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); |
| 760 | audit_log_end(ab); | 884 | audit_log_end(ab); |
| 761 | } | 885 | } |
| 762 | } | 886 | } |
| 763 | for (i = 0; i < context->name_count; i++) { | 887 | for (i = 0; i < context->name_count; i++) { |
| 764 | ab = audit_log_start(context, AUDIT_PATH); | 888 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); |
| 765 | if (!ab) | 889 | if (!ab) |
| 766 | continue; /* audit_panic has been called */ | 890 | continue; /* audit_panic has been called */ |
| 767 | 891 | ||
| @@ -770,6 +894,8 @@ static void audit_log_exit(struct audit_context *context) | |||
| 770 | audit_log_format(ab, " name="); | 894 | audit_log_format(ab, " name="); |
| 771 | audit_log_untrustedstring(ab, context->names[i].name); | 895 | audit_log_untrustedstring(ab, context->names[i].name); |
| 772 | } | 896 | } |
| 897 | audit_log_format(ab, " flags=%x\n", context->names[i].flags); | ||
| 898 | |||
| 773 | if (context->names[i].ino != (unsigned long)-1) | 899 | if (context->names[i].ino != (unsigned long)-1) |
| 774 | audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" | 900 | audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" |
| 775 | " ouid=%u ogid=%u rdev=%02x:%02x", | 901 | " ouid=%u ogid=%u rdev=%02x:%02x", |
| @@ -799,9 +925,11 @@ void audit_free(struct task_struct *tsk) | |||
| 799 | return; | 925 | return; |
| 800 | 926 | ||
| 801 | /* Check for system calls that do not go through the exit | 927 | /* Check for system calls that do not go through the exit |
| 802 | * function (e.g., exit_group), then free context block. */ | 928 | * function (e.g., exit_group), then free context block. |
| 803 | if (context->in_syscall && context->auditable && context->pid != audit_pid) | 929 | * We use GFP_ATOMIC here because we might be doing this |
| 804 | audit_log_exit(context); | 930 | * in the context of the idle thread */ |
| 931 | if (context->in_syscall && context->auditable) | ||
| 932 | audit_log_exit(context, GFP_ATOMIC); | ||
| 805 | 933 | ||
| 806 | audit_free_context(context); | 934 | audit_free_context(context); |
| 807 | } | 935 | } |
| @@ -876,11 +1004,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, | |||
| 876 | 1004 | ||
| 877 | state = context->state; | 1005 | state = context->state; |
| 878 | if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) | 1006 | if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) |
| 879 | state = audit_filter_syscall(tsk, context, &audit_entlist); | 1007 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); |
| 880 | if (likely(state == AUDIT_DISABLED)) | 1008 | if (likely(state == AUDIT_DISABLED)) |
| 881 | return; | 1009 | return; |
| 882 | 1010 | ||
| 883 | context->serial = audit_serial(); | 1011 | context->serial = 0; |
| 884 | context->ctime = CURRENT_TIME; | 1012 | context->ctime = CURRENT_TIME; |
| 885 | context->in_syscall = 1; | 1013 | context->in_syscall = 1; |
| 886 | context->auditable = !!(state == AUDIT_RECORD_CONTEXT); | 1014 | context->auditable = !!(state == AUDIT_RECORD_CONTEXT); |
| @@ -903,10 +1031,10 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) | |||
| 903 | /* Not having a context here is ok, since the parent may have | 1031 | /* Not having a context here is ok, since the parent may have |
| 904 | * called __put_task_struct. */ | 1032 | * called __put_task_struct. */ |
| 905 | if (likely(!context)) | 1033 | if (likely(!context)) |
| 906 | return; | 1034 | goto out; |
| 907 | 1035 | ||
| 908 | if (context->in_syscall && context->auditable && context->pid != audit_pid) | 1036 | if (context->in_syscall && context->auditable) |
| 909 | audit_log_exit(context); | 1037 | audit_log_exit(context, GFP_KERNEL); |
| 910 | 1038 | ||
| 911 | context->in_syscall = 0; | 1039 | context->in_syscall = 0; |
| 912 | context->auditable = 0; | 1040 | context->auditable = 0; |
| @@ -919,9 +1047,9 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) | |||
| 919 | } else { | 1047 | } else { |
| 920 | audit_free_names(context); | 1048 | audit_free_names(context); |
| 921 | audit_free_aux(context); | 1049 | audit_free_aux(context); |
| 922 | audit_zero_context(context, context->state); | ||
| 923 | tsk->audit_context = context; | 1050 | tsk->audit_context = context; |
| 924 | } | 1051 | } |
| 1052 | out: | ||
| 925 | put_task_struct(tsk); | 1053 | put_task_struct(tsk); |
| 926 | } | 1054 | } |
| 927 | 1055 | ||
| @@ -996,7 +1124,7 @@ void audit_putname(const char *name) | |||
| 996 | 1124 | ||
| 997 | /* Store the inode and device from a lookup. Called from | 1125 | /* Store the inode and device from a lookup. Called from |
| 998 | * fs/namei.c:path_lookup(). */ | 1126 | * fs/namei.c:path_lookup(). */ |
| 999 | void audit_inode(const char *name, const struct inode *inode) | 1127 | void audit_inode(const char *name, const struct inode *inode, unsigned flags) |
| 1000 | { | 1128 | { |
| 1001 | int idx; | 1129 | int idx; |
| 1002 | struct audit_context *context = current->audit_context; | 1130 | struct audit_context *context = current->audit_context; |
| @@ -1022,17 +1150,20 @@ void audit_inode(const char *name, const struct inode *inode) | |||
| 1022 | ++context->ino_count; | 1150 | ++context->ino_count; |
| 1023 | #endif | 1151 | #endif |
| 1024 | } | 1152 | } |
| 1025 | context->names[idx].ino = inode->i_ino; | 1153 | context->names[idx].flags = flags; |
| 1026 | context->names[idx].dev = inode->i_sb->s_dev; | 1154 | context->names[idx].ino = inode->i_ino; |
| 1027 | context->names[idx].mode = inode->i_mode; | 1155 | context->names[idx].dev = inode->i_sb->s_dev; |
| 1028 | context->names[idx].uid = inode->i_uid; | 1156 | context->names[idx].mode = inode->i_mode; |
| 1029 | context->names[idx].gid = inode->i_gid; | 1157 | context->names[idx].uid = inode->i_uid; |
| 1030 | context->names[idx].rdev = inode->i_rdev; | 1158 | context->names[idx].gid = inode->i_gid; |
| 1159 | context->names[idx].rdev = inode->i_rdev; | ||
| 1031 | } | 1160 | } |
| 1032 | 1161 | ||
| 1033 | void auditsc_get_stamp(struct audit_context *ctx, | 1162 | void auditsc_get_stamp(struct audit_context *ctx, |
| 1034 | struct timespec *t, unsigned int *serial) | 1163 | struct timespec *t, unsigned int *serial) |
| 1035 | { | 1164 | { |
| 1165 | if (!ctx->serial) | ||
| 1166 | ctx->serial = audit_serial(); | ||
| 1036 | t->tv_sec = ctx->ctime.tv_sec; | 1167 | t->tv_sec = ctx->ctime.tv_sec; |
| 1037 | t->tv_nsec = ctx->ctime.tv_nsec; | 1168 | t->tv_nsec = ctx->ctime.tv_nsec; |
| 1038 | *serial = ctx->serial; | 1169 | *serial = ctx->serial; |
| @@ -1044,7 +1175,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | |||
| 1044 | if (task->audit_context) { | 1175 | if (task->audit_context) { |
| 1045 | struct audit_buffer *ab; | 1176 | struct audit_buffer *ab; |
| 1046 | 1177 | ||
| 1047 | ab = audit_log_start(NULL, AUDIT_LOGIN); | 1178 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); |
| 1048 | if (ab) { | 1179 | if (ab) { |
| 1049 | audit_log_format(ab, "login pid=%d uid=%u " | 1180 | audit_log_format(ab, "login pid=%d uid=%u " |
| 1050 | "old auid=%u new auid=%u", | 1181 | "old auid=%u new auid=%u", |
| @@ -1153,7 +1284,7 @@ void audit_signal_info(int sig, struct task_struct *t) | |||
| 1153 | extern pid_t audit_sig_pid; | 1284 | extern pid_t audit_sig_pid; |
| 1154 | extern uid_t audit_sig_uid; | 1285 | extern uid_t audit_sig_uid; |
| 1155 | 1286 | ||
| 1156 | if (unlikely(audit_pid && t->pid == audit_pid)) { | 1287 | if (unlikely(audit_pid && t->tgid == audit_pid)) { |
| 1157 | if (sig == SIGTERM || sig == SIGHUP) { | 1288 | if (sig == SIGTERM || sig == SIGHUP) { |
| 1158 | struct audit_context *ctx = current->audit_context; | 1289 | struct audit_context *ctx = current->audit_context; |
| 1159 | audit_sig_pid = current->pid; | 1290 | audit_sig_pid = current->pid; |
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa86623..102296e21ea8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
| @@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) | |||
| 48 | if (!time_after(expire, now)) | 48 | if (!time_after(expire, now)) |
| 49 | return 0; | 49 | return 0; |
| 50 | 50 | ||
| 51 | current->state = TASK_INTERRUPTIBLE; | 51 | expire = schedule_timeout_interruptible(expire - now); |
| 52 | expire = schedule_timeout(expire - now); | ||
| 53 | if (expire == 0) | 52 | if (expire == 0) |
| 54 | return 0; | 53 | return 0; |
| 55 | 54 | ||
| @@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, | |||
| 82 | return -EINVAL; | 81 | return -EINVAL; |
| 83 | 82 | ||
| 84 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | 83 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); |
| 85 | current->state = TASK_INTERRUPTIBLE; | 84 | expire = schedule_timeout_interruptible(expire); |
| 86 | expire = schedule_timeout(expire); | ||
| 87 | if (expire == 0) | 85 | if (expire == 0) |
| 88 | return 0; | 86 | return 0; |
| 89 | 87 | ||
| @@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
| 795 | recalc_sigpending(); | 793 | recalc_sigpending(); |
| 796 | spin_unlock_irq(¤t->sighand->siglock); | 794 | spin_unlock_irq(¤t->sighand->siglock); |
| 797 | 795 | ||
| 798 | current->state = TASK_INTERRUPTIBLE; | 796 | timeout = schedule_timeout_interruptible(timeout); |
| 799 | timeout = schedule_timeout(timeout); | ||
| 800 | 797 | ||
| 801 | spin_lock_irq(¤t->sighand->siglock); | 798 | spin_lock_irq(¤t->sighand->siglock); |
| 802 | sig = dequeue_signal(current, &s, &info); | 799 | sig = dequeue_signal(current, &s, &info); |
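
All three kernel/compat.c hunks are the same mechanical conversion: the open-coded pair of setting TASK_INTERRUPTIBLE and calling schedule_timeout() becomes a single call to the schedule_timeout_interruptible() helper. A small sketch of the before and after shape (helper name illustrative):

    #include <linux/jiffies.h>
    #include <linux/sched.h>

    /* Sleep for up to 'timeout' jiffies, waking early on a signal.
     * Returns the jiffies remaining if interrupted, 0 otherwise. */
    static signed long interruptible_sleep_sketch(signed long timeout)
    {
            /* Old open-coded form being replaced throughout this series:
             *      current->state = TASK_INTERRUPTIBLE;
             *      return schedule_timeout(timeout);
             */
            return schedule_timeout_interruptible(timeout);
    }
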
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..28176d083f7b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL; | |||
| 180 | */ | 180 | */ |
| 181 | 181 | ||
| 182 | static DECLARE_MUTEX(cpuset_sem); | 182 | static DECLARE_MUTEX(cpuset_sem); |
| 183 | static struct task_struct *cpuset_sem_owner; | ||
| 184 | static int cpuset_sem_depth; | ||
| 185 | |||
| 186 | /* | ||
| 187 | * The global cpuset semaphore cpuset_sem can be needed by the | ||
| 188 | * memory allocator to update a tasks mems_allowed (see the calls | ||
| 189 | * to cpuset_update_current_mems_allowed()) or to walk up the | ||
| 190 | * cpuset hierarchy to find a mem_exclusive cpuset see the calls | ||
| 191 | * to cpuset_excl_nodes_overlap()). | ||
| 192 | * | ||
| 193 | * But if the memory allocation is being done by cpuset.c code, it | ||
| 194 | * usually already holds cpuset_sem. Double tripping on a kernel | ||
| 195 | * semaphore deadlocks the current task, and any other task that | ||
| 196 | * subsequently tries to obtain the lock. | ||
| 197 | * | ||
| 198 | * Run all up's and down's on cpuset_sem through the following | ||
| 199 | * wrappers, which will detect this nested locking, and avoid | ||
| 200 | * deadlocking. | ||
| 201 | */ | ||
| 202 | |||
| 203 | static inline void cpuset_down(struct semaphore *psem) | ||
| 204 | { | ||
| 205 | if (cpuset_sem_owner != current) { | ||
| 206 | down(psem); | ||
| 207 | cpuset_sem_owner = current; | ||
| 208 | } | ||
| 209 | cpuset_sem_depth++; | ||
| 210 | } | ||
| 211 | |||
| 212 | static inline void cpuset_up(struct semaphore *psem) | ||
| 213 | { | ||
| 214 | if (--cpuset_sem_depth == 0) { | ||
| 215 | cpuset_sem_owner = NULL; | ||
| 216 | up(psem); | ||
| 217 | } | ||
| 218 | } | ||
| 183 | 219 | ||
| 184 | /* | 220 | /* |
| 185 | * A couple of forward declarations required, due to cyclic reference loop: | 221 | * A couple of forward declarations required, due to cyclic reference loop: |
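
cpuset_down()/cpuset_up() make cpuset_sem behave recursively for a single owning task: if the allocator is entered while the semaphore is held and it calls back into cpuset code (mems_allowed refresh, mem_exclusive overlap checks), the nested acquisition only bumps a depth counter instead of deadlocking. Usage is a drop-in replacement for down()/up() on this one semaphore; the sketch below assumes it sits in kernel/cpuset.c next to the wrappers, and both function names are illustrative:

    /* Inner path, e.g. reached from the page allocator while the
     * outer operation already holds cpuset_sem. */
    static void refresh_sketch(void)
    {
            cpuset_down(&cpuset_sem);       /* same owner: depth++ only    */
            /* ... re-read mems_allowed from the task's cpuset ... */
            cpuset_up(&cpuset_sem);         /* depth > 0: lock stays held  */
    }

    /* Outer cpuset operation that may allocate memory. */
    static void outer_op_sketch(void)
    {
            cpuset_down(&cpuset_sem);       /* real down(): becomes owner  */
            refresh_sketch();               /* nested use is now safe      */
            cpuset_up(&cpuset_sem);         /* depth hits 0: real up()     */
    }
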
| @@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 522 | * Refresh current tasks mems_allowed and mems_generation from | 558 | * Refresh current tasks mems_allowed and mems_generation from |
| 523 | * current tasks cpuset. Call with cpuset_sem held. | 559 | * current tasks cpuset. Call with cpuset_sem held. |
| 524 | * | 560 | * |
| 525 | * Be sure to call refresh_mems() on any cpuset operation which | 561 | * This routine is needed to update the per-task mems_allowed |
| 526 | * (1) holds cpuset_sem, and (2) might possibly alloc memory. | 562 | * data, within the tasks context, when it is trying to allocate |
| 527 | * Call after obtaining cpuset_sem lock, before any possible | 563 | * memory (in various mm/mempolicy.c routines) and notices |
| 528 | * allocation. Otherwise one risks trying to allocate memory | 564 | * that some other task has been modifying its cpuset. |
| 529 | * while the task cpuset_mems_generation is not the same as | ||
| 530 | * the mems_generation in its cpuset, which would deadlock on | ||
| 531 | * cpuset_sem in cpuset_update_current_mems_allowed(). | ||
| 532 | * | ||
| 533 | * Since we hold cpuset_sem, once refresh_mems() is called, the | ||
| 534 | * test (current->cpuset_mems_generation != cs->mems_generation) | ||
| 535 | * in cpuset_update_current_mems_allowed() will remain false, | ||
| 536 | * until we drop cpuset_sem. Anyone else who would change our | ||
| 537 | * cpusets mems_generation needs to lock cpuset_sem first. | ||
| 538 | */ | 565 | */ |
| 539 | 566 | ||
| 540 | static void refresh_mems(void) | 567 | static void refresh_mems(void) |
| @@ -628,13 +655,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 628 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 655 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
| 629 | */ | 656 | */ |
| 630 | 657 | ||
| 631 | /* | ||
| 632 | * Hack to avoid 2.6.13 partial node dynamic sched domain bug. | ||
| 633 | * Disable letting 'cpu_exclusive' cpusets define dynamic sched | ||
| 634 | * domains, until the sched domain can handle partial nodes. | ||
| 635 | * Remove this #if hackery when sched domains fixed. | ||
| 636 | */ | ||
| 637 | #if 0 | ||
| 638 | static void update_cpu_domains(struct cpuset *cur) | 658 | static void update_cpu_domains(struct cpuset *cur) |
| 639 | { | 659 | { |
| 640 | struct cpuset *c, *par = cur->parent; | 660 | struct cpuset *c, *par = cur->parent; |
| @@ -675,11 +695,6 @@ static void update_cpu_domains(struct cpuset *cur) | |||
| 675 | partition_sched_domains(&pspan, &cspan); | 695 | partition_sched_domains(&pspan, &cspan); |
| 676 | unlock_cpu_hotplug(); | 696 | unlock_cpu_hotplug(); |
| 677 | } | 697 | } |
| 678 | #else | ||
| 679 | static void update_cpu_domains(struct cpuset *cur) | ||
| 680 | { | ||
| 681 | } | ||
| 682 | #endif | ||
| 683 | 698 | ||
| 684 | static int update_cpumask(struct cpuset *cs, char *buf) | 699 | static int update_cpumask(struct cpuset *cs, char *buf) |
| 685 | { | 700 | { |
| @@ -852,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 852 | } | 867 | } |
| 853 | buffer[nbytes] = 0; /* nul-terminate */ | 868 | buffer[nbytes] = 0; /* nul-terminate */ |
| 854 | 869 | ||
| 855 | down(&cpuset_sem); | 870 | cpuset_down(&cpuset_sem); |
| 856 | 871 | ||
| 857 | if (is_removed(cs)) { | 872 | if (is_removed(cs)) { |
| 858 | retval = -ENODEV; | 873 | retval = -ENODEV; |
| @@ -886,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 886 | if (retval == 0) | 901 | if (retval == 0) |
| 887 | retval = nbytes; | 902 | retval = nbytes; |
| 888 | out2: | 903 | out2: |
| 889 | up(&cpuset_sem); | 904 | cpuset_up(&cpuset_sem); |
| 890 | cpuset_release_agent(pathbuf); | 905 | cpuset_release_agent(pathbuf); |
| 891 | out1: | 906 | out1: |
| 892 | kfree(buffer); | 907 | kfree(buffer); |
| @@ -926,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
| 926 | { | 941 | { |
| 927 | cpumask_t mask; | 942 | cpumask_t mask; |
| 928 | 943 | ||
| 929 | down(&cpuset_sem); | 944 | cpuset_down(&cpuset_sem); |
| 930 | mask = cs->cpus_allowed; | 945 | mask = cs->cpus_allowed; |
| 931 | up(&cpuset_sem); | 946 | cpuset_up(&cpuset_sem); |
| 932 | 947 | ||
| 933 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 948 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
| 934 | } | 949 | } |
| @@ -937,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
| 937 | { | 952 | { |
| 938 | nodemask_t mask; | 953 | nodemask_t mask; |
| 939 | 954 | ||
| 940 | down(&cpuset_sem); | 955 | cpuset_down(&cpuset_sem); |
| 941 | mask = cs->mems_allowed; | 956 | mask = cs->mems_allowed; |
| 942 | up(&cpuset_sem); | 957 | cpuset_up(&cpuset_sem); |
| 943 | 958 | ||
| 944 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 959 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
| 945 | } | 960 | } |
| @@ -953,8 +968,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
| 953 | char *page; | 968 | char *page; |
| 954 | ssize_t retval = 0; | 969 | ssize_t retval = 0; |
| 955 | char *s; | 970 | char *s; |
| 956 | char *start; | ||
| 957 | size_t n; | ||
| 958 | 971 | ||
| 959 | if (!(page = (char *)__get_free_page(GFP_KERNEL))) | 972 | if (!(page = (char *)__get_free_page(GFP_KERNEL))) |
| 960 | return -ENOMEM; | 973 | return -ENOMEM; |
| @@ -984,10 +997,7 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
| 984 | *s++ = '\n'; | 997 | *s++ = '\n'; |
| 985 | *s = '\0'; | 998 | *s = '\0'; |
| 986 | 999 | ||
| 987 | start = page + *ppos; | 1000 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); |
| 988 | n = s - start; | ||
| 989 | retval = n - copy_to_user(buf, start, min(n, nbytes)); | ||
| 990 | *ppos += retval; | ||
| 991 | out: | 1001 | out: |
| 992 | free_page((unsigned long)page); | 1002 | free_page((unsigned long)page); |
| 993 | return retval; | 1003 | return retval; |
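
The open-coded copy_to_user() arithmetic above is replaced by simple_read_from_buffer(), which clamps the read to what is left in the buffer and advances *ppos itself. A rough userspace restatement of that clamp-and-advance logic (memcpy stands in for copy_to_user; this is a sketch, not the kernel helper):

#include <stddef.h>
#include <string.h>

/* Copy at most 'count' bytes out of a buffer of size 'available',
 * starting at *ppos; advance *ppos and return the bytes copied. */
static size_t read_from_buffer(void *to, size_t count, long *ppos,
                               const void *from, size_t available)
{
        long pos = *ppos;

        if (pos < 0 || (size_t)pos >= available)
                return 0;                       /* nothing left (or bad offset) */
        if (count > available - (size_t)pos)
                count = available - (size_t)pos;
        memcpy(to, (const char *)from + pos, count);    /* copy_to_user() in-kernel */
        *ppos = pos + (long)count;
        return count;
}
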
| @@ -1342,8 +1352,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1342 | if (!cs) | 1352 | if (!cs) |
| 1343 | return -ENOMEM; | 1353 | return -ENOMEM; |
| 1344 | 1354 | ||
| 1345 | down(&cpuset_sem); | 1355 | cpuset_down(&cpuset_sem); |
| 1346 | refresh_mems(); | ||
| 1347 | cs->flags = 0; | 1356 | cs->flags = 0; |
| 1348 | if (notify_on_release(parent)) | 1357 | if (notify_on_release(parent)) |
| 1349 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1358 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
| @@ -1368,14 +1377,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1368 | * will down() this new directory's i_sem and if we race with | 1377 | * will down() this new directory's i_sem and if we race with |
| 1369 | * another mkdir, we might deadlock. | 1378 | * another mkdir, we might deadlock. |
| 1370 | */ | 1379 | */ |
| 1371 | up(&cpuset_sem); | 1380 | cpuset_up(&cpuset_sem); |
| 1372 | 1381 | ||
| 1373 | err = cpuset_populate_dir(cs->dentry); | 1382 | err = cpuset_populate_dir(cs->dentry); |
| 1374 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1383 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
| 1375 | return 0; | 1384 | return 0; |
| 1376 | err: | 1385 | err: |
| 1377 | list_del(&cs->sibling); | 1386 | list_del(&cs->sibling); |
| 1378 | up(&cpuset_sem); | 1387 | cpuset_up(&cpuset_sem); |
| 1379 | kfree(cs); | 1388 | kfree(cs); |
| 1380 | return err; | 1389 | return err; |
| 1381 | } | 1390 | } |
| @@ -1397,14 +1406,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1397 | 1406 | ||
| 1398 | /* the vfs holds both inode->i_sem already */ | 1407 | /* the vfs holds both inode->i_sem already */ |
| 1399 | 1408 | ||
| 1400 | down(&cpuset_sem); | 1409 | cpuset_down(&cpuset_sem); |
| 1401 | refresh_mems(); | ||
| 1402 | if (atomic_read(&cs->count) > 0) { | 1410 | if (atomic_read(&cs->count) > 0) { |
| 1403 | up(&cpuset_sem); | 1411 | cpuset_up(&cpuset_sem); |
| 1404 | return -EBUSY; | 1412 | return -EBUSY; |
| 1405 | } | 1413 | } |
| 1406 | if (!list_empty(&cs->children)) { | 1414 | if (!list_empty(&cs->children)) { |
| 1407 | up(&cpuset_sem); | 1415 | cpuset_up(&cpuset_sem); |
| 1408 | return -EBUSY; | 1416 | return -EBUSY; |
| 1409 | } | 1417 | } |
| 1410 | parent = cs->parent; | 1418 | parent = cs->parent; |
| @@ -1420,7 +1428,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1420 | spin_unlock(&d->d_lock); | 1428 | spin_unlock(&d->d_lock); |
| 1421 | cpuset_d_remove_dir(d); | 1429 | cpuset_d_remove_dir(d); |
| 1422 | dput(d); | 1430 | dput(d); |
| 1423 | up(&cpuset_sem); | 1431 | cpuset_up(&cpuset_sem); |
| 1424 | cpuset_release_agent(pathbuf); | 1432 | cpuset_release_agent(pathbuf); |
| 1425 | return 0; | 1433 | return 0; |
| 1426 | } | 1434 | } |
| @@ -1523,10 +1531,10 @@ void cpuset_exit(struct task_struct *tsk) | |||
| 1523 | if (notify_on_release(cs)) { | 1531 | if (notify_on_release(cs)) { |
| 1524 | char *pathbuf = NULL; | 1532 | char *pathbuf = NULL; |
| 1525 | 1533 | ||
| 1526 | down(&cpuset_sem); | 1534 | cpuset_down(&cpuset_sem); |
| 1527 | if (atomic_dec_and_test(&cs->count)) | 1535 | if (atomic_dec_and_test(&cs->count)) |
| 1528 | check_for_release(cs, &pathbuf); | 1536 | check_for_release(cs, &pathbuf); |
| 1529 | up(&cpuset_sem); | 1537 | cpuset_up(&cpuset_sem); |
| 1530 | cpuset_release_agent(pathbuf); | 1538 | cpuset_release_agent(pathbuf); |
| 1531 | } else { | 1539 | } else { |
| 1532 | atomic_dec(&cs->count); | 1540 | atomic_dec(&cs->count); |
| @@ -1547,11 +1555,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
| 1547 | { | 1555 | { |
| 1548 | cpumask_t mask; | 1556 | cpumask_t mask; |
| 1549 | 1557 | ||
| 1550 | down(&cpuset_sem); | 1558 | cpuset_down(&cpuset_sem); |
| 1551 | task_lock((struct task_struct *)tsk); | 1559 | task_lock((struct task_struct *)tsk); |
| 1552 | guarantee_online_cpus(tsk->cpuset, &mask); | 1560 | guarantee_online_cpus(tsk->cpuset, &mask); |
| 1553 | task_unlock((struct task_struct *)tsk); | 1561 | task_unlock((struct task_struct *)tsk); |
| 1554 | up(&cpuset_sem); | 1562 | cpuset_up(&cpuset_sem); |
| 1555 | 1563 | ||
| 1556 | return mask; | 1564 | return mask; |
| 1557 | } | 1565 | } |
| @@ -1576,9 +1584,9 @@ void cpuset_update_current_mems_allowed(void) | |||
| 1576 | if (!cs) | 1584 | if (!cs) |
| 1577 | return; /* task is exiting */ | 1585 | return; /* task is exiting */ |
| 1578 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1586 | if (current->cpuset_mems_generation != cs->mems_generation) { |
| 1579 | down(&cpuset_sem); | 1587 | cpuset_down(&cpuset_sem); |
| 1580 | refresh_mems(); | 1588 | refresh_mems(); |
| 1581 | up(&cpuset_sem); | 1589 | cpuset_up(&cpuset_sem); |
| 1582 | } | 1590 | } |
| 1583 | } | 1591 | } |
| 1584 | 1592 | ||
| @@ -1611,17 +1619,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
| 1611 | return 0; | 1619 | return 0; |
| 1612 | } | 1620 | } |
| 1613 | 1621 | ||
| 1622 | /* | ||
| 1623 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | ||
| 1624 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | ||
| 1625 | * If no ancestor is mem_exclusive (an unusual configuration), then | ||
| 1626 | * returns the root cpuset. | ||
| 1627 | */ | ||
| 1628 | static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | ||
| 1629 | { | ||
| 1630 | while (!is_mem_exclusive(cs) && cs->parent) | ||
| 1631 | cs = cs->parent; | ||
| 1632 | return cs; | ||
| 1633 | } | ||
| 1634 | |||
| 1614 | /** | 1635 | /** |
| 1615 | * cpuset_zone_allowed - is zone z allowed in current->mems_allowed | 1636 | * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? |
| 1616 | * @z: zone in question | 1637 | * @z: is this zone on an allowed node? |
| 1638 | * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) | ||
| 1617 | * | 1639 | * |
| 1618 | * Is zone z allowed in current->mems_allowed, or is | 1640 | * If we're in interrupt, yes, we can always allocate. If zone |
| 1619 | * the CPU in interrupt context? (zone is always allowed in this case) | 1641 | * z's node is in our task's mems_allowed, yes. If it's not a |
| 1620 | */ | 1642 | * __GFP_HARDWALL request and this zone's node is in the nearest |
| 1621 | int cpuset_zone_allowed(struct zone *z) | 1643 | * mem_exclusive cpuset ancestor to this task's cpuset, yes. |
| 1644 | * Otherwise, no. | ||
| 1645 | * | ||
| 1646 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | ||
| 1647 | * and do not allow allocations outside the current tasks cpuset. | ||
| 1648 | * GFP_KERNEL allocations are not so marked, so can escape to the | ||
| 1649 | * nearest mem_exclusive ancestor cpuset. | ||
| 1650 | * | ||
| 1651 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | ||
| 1652 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | ||
| 1653 | * it's a GFP_KERNEL allocation, and all nodes in the current task's | ||
| 1654 | * mems_allowed came up empty on the first pass over the zonelist. | ||
| 1655 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | ||
| 1656 | * short of memory, might require taking the cpuset_sem semaphore. | ||
| 1657 | * | ||
| 1658 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | ||
| 1659 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | ||
| 1660 | * hardwall cpusets - no allocation on a node outside the cpuset is | ||
| 1661 | * allowed (unless in interrupt, of course). | ||
| 1662 | * | ||
| 1663 | * The second loop doesn't even call here for GFP_ATOMIC requests | ||
| 1664 | * (if the __alloc_pages() local variable 'wait' is set). That check | ||
| 1665 | * and the checks below have the combined effect in the second loop of | ||
| 1666 | * the __alloc_pages() routine that: | ||
| 1667 | * in_interrupt - any node ok (current task context irrelevant) | ||
| 1668 | * GFP_ATOMIC - any node ok | ||
| 1669 | * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok | ||
| 1670 | * GFP_USER - only nodes in the current task's mems_allowed ok. | ||
| 1671 | **/ | ||
| 1672 | |||
| 1673 | int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | ||
| 1674 | { | ||
| 1675 | int node; /* node that zone z is on */ | ||
| 1676 | const struct cpuset *cs; /* current cpuset ancestors */ | ||
| 1677 | int allowed = 1; /* is allocation in zone z allowed? */ | ||
| 1678 | |||
| 1679 | if (in_interrupt()) | ||
| 1680 | return 1; | ||
| 1681 | node = z->zone_pgdat->node_id; | ||
| 1682 | if (node_isset(node, current->mems_allowed)) | ||
| 1683 | return 1; | ||
| 1684 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ | ||
| 1685 | return 0; | ||
| 1686 | |||
| 1687 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | ||
| 1688 | cpuset_down(&cpuset_sem); | ||
| 1689 | cs = current->cpuset; | ||
| 1690 | if (!cs) | ||
| 1691 | goto done; /* current task exiting */ | ||
| 1692 | cs = nearest_exclusive_ancestor(cs); | ||
| 1693 | allowed = node_isset(node, cs->mems_allowed); | ||
| 1694 | done: | ||
| 1695 | cpuset_up(&cpuset_sem); | ||
| 1696 | return allowed; | ||
| 1697 | } | ||
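
Stripped of the kernel types, cpuset_zone_allowed() is a four-step decision ladder. A schematic restatement in plain C, with stub predicates standing in for in_interrupt(), the mems_allowed test and the mem_exclusive ancestor walk (only the ordering is the point here):

/* Stubs: these stand in for the real kernel checks. */
extern int in_irq_context(void);
extern int node_in_task_mems(int node);
extern int node_in_exclusive_ancestor(int node);   /* slow path; takes the lock */

static int zone_node_allowed(int node, int hardwall_request)
{
        if (in_irq_context())
                return 1;               /* interrupts may allocate anywhere */
        if (node_in_task_mems(node))
                return 1;               /* fast path, no locking needed */
        if (hardwall_request)
                return 0;               /* GFP_USER-style: confined to the cpuset */
        /* GFP_KERNEL-style: may escape to the nearest mem_exclusive ancestor */
        return node_in_exclusive_ancestor(node);
}
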
| 1698 | |||
| 1699 | /** | ||
| 1700 | * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? | ||
| 1701 | * @p: pointer to task_struct of some other task. | ||
| 1702 | * | ||
| 1703 | * Description: Return true if the nearest mem_exclusive ancestor | ||
| 1704 | * cpusets of tasks @p and current overlap. Used by oom killer to | ||
| 1705 | * determine if task @p's memory usage might impact the memory | ||
| 1706 | * available to the current task. | ||
| 1707 | * | ||
| 1708 | * Acquires cpuset_sem - not suitable for calling from a fast path. | ||
| 1709 | **/ | ||
| 1710 | |||
| 1711 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | ||
| 1622 | { | 1712 | { |
| 1623 | return in_interrupt() || | 1713 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
| 1624 | node_isset(z->zone_pgdat->node_id, current->mems_allowed); | 1714 | int overlap = 0; /* do cpusets overlap? */ |
| 1715 | |||
| 1716 | cpuset_down(&cpuset_sem); | ||
| 1717 | cs1 = current->cpuset; | ||
| 1718 | if (!cs1) | ||
| 1719 | goto done; /* current task exiting */ | ||
| 1720 | cs2 = p->cpuset; | ||
| 1721 | if (!cs2) | ||
| 1722 | goto done; /* task p is exiting */ | ||
| 1723 | cs1 = nearest_exclusive_ancestor(cs1); | ||
| 1724 | cs2 = nearest_exclusive_ancestor(cs2); | ||
| 1725 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | ||
| 1726 | done: | ||
| 1727 | cpuset_up(&cpuset_sem); | ||
| 1728 | |||
| 1729 | return overlap; | ||
| 1625 | } | 1730 | } |
| 1626 | 1731 | ||
| 1627 | /* | 1732 | /* |
| @@ -1642,7 +1747,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
| 1642 | return -ENOMEM; | 1747 | return -ENOMEM; |
| 1643 | 1748 | ||
| 1644 | tsk = m->private; | 1749 | tsk = m->private; |
| 1645 | down(&cpuset_sem); | 1750 | cpuset_down(&cpuset_sem); |
| 1646 | task_lock(tsk); | 1751 | task_lock(tsk); |
| 1647 | cs = tsk->cpuset; | 1752 | cs = tsk->cpuset; |
| 1648 | task_unlock(tsk); | 1753 | task_unlock(tsk); |
| @@ -1657,7 +1762,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
| 1657 | seq_puts(m, buf); | 1762 | seq_puts(m, buf); |
| 1658 | seq_putc(m, '\n'); | 1763 | seq_putc(m, '\n'); |
| 1659 | out: | 1764 | out: |
| 1660 | up(&cpuset_sem); | 1765 | cpuset_up(&cpuset_sem); |
| 1661 | kfree(buf); | 1766 | kfree(buf); |
| 1662 | return retval; | 1767 | return retval; |
| 1663 | } | 1768 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index 5b0fb9f09f21..3b25b182d2be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -368,17 +368,25 @@ EXPORT_SYMBOL(daemonize); | |||
| 368 | static inline void close_files(struct files_struct * files) | 368 | static inline void close_files(struct files_struct * files) |
| 369 | { | 369 | { |
| 370 | int i, j; | 370 | int i, j; |
| 371 | struct fdtable *fdt; | ||
| 371 | 372 | ||
| 372 | j = 0; | 373 | j = 0; |
| 374 | |||
| 375 | /* | ||
| 376 | * It is safe to dereference the fd table without RCU or | ||
| 377 | * ->file_lock because this is the last reference to the | ||
| 378 | * files structure. | ||
| 379 | */ | ||
| 380 | fdt = files_fdtable(files); | ||
| 373 | for (;;) { | 381 | for (;;) { |
| 374 | unsigned long set; | 382 | unsigned long set; |
| 375 | i = j * __NFDBITS; | 383 | i = j * __NFDBITS; |
| 376 | if (i >= files->max_fdset || i >= files->max_fds) | 384 | if (i >= fdt->max_fdset || i >= fdt->max_fds) |
| 377 | break; | 385 | break; |
| 378 | set = files->open_fds->fds_bits[j++]; | 386 | set = fdt->open_fds->fds_bits[j++]; |
| 379 | while (set) { | 387 | while (set) { |
| 380 | if (set & 1) { | 388 | if (set & 1) { |
| 381 | struct file * file = xchg(&files->fd[i], NULL); | 389 | struct file * file = xchg(&fdt->fd[i], NULL); |
| 382 | if (file) | 390 | if (file) |
| 383 | filp_close(file, files); | 391 | filp_close(file, files); |
| 384 | } | 392 | } |
| @@ -403,18 +411,22 @@ struct files_struct *get_files_struct(struct task_struct *task) | |||
| 403 | 411 | ||
| 404 | void fastcall put_files_struct(struct files_struct *files) | 412 | void fastcall put_files_struct(struct files_struct *files) |
| 405 | { | 413 | { |
| 414 | struct fdtable *fdt; | ||
| 415 | |||
| 406 | if (atomic_dec_and_test(&files->count)) { | 416 | if (atomic_dec_and_test(&files->count)) { |
| 407 | close_files(files); | 417 | close_files(files); |
| 408 | /* | 418 | /* |
| 409 | * Free the fd and fdset arrays if we expanded them. | 419 | * Free the fd and fdset arrays if we expanded them. |
| 420 | * If the fdtable was embedded, pass files for freeing | ||
| 421 | * at the end of the RCU grace period. Otherwise, | ||
| 422 | * you can free files immediately. | ||
| 410 | */ | 423 | */ |
| 411 | if (files->fd != &files->fd_array[0]) | 424 | fdt = files_fdtable(files); |
| 412 | free_fd_array(files->fd, files->max_fds); | 425 | if (fdt == &files->fdtab) |
| 413 | if (files->max_fdset > __FD_SETSIZE) { | 426 | fdt->free_files = files; |
| 414 | free_fdset(files->open_fds, files->max_fdset); | 427 | else |
| 415 | free_fdset(files->close_on_exec, files->max_fdset); | 428 | kmem_cache_free(files_cachep, files); |
| 416 | } | 429 | free_fdtable(fdt); |
| 417 | kmem_cache_free(files_cachep, files); | ||
| 418 | } | 430 | } |
| 419 | } | 431 | } |
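
The new put_files_struct() logic hinges on whether the fdtable is the one embedded in the files_struct: if so, the whole files_struct must survive until the RCU grace period ends; if not, the files_struct can go at once and only the detached table is deferred. A small standalone sketch of that decision (defer_free() is a stand-in for an RCU-deferred free; all names here are invented):

#include <stdlib.h>

struct table {
        void **slots;
};

struct container {
        struct table embedded;          /* default table lives inside */
        struct table *tbl;              /* may point at a larger, separate table */
};

/* Stand-in: real code would wait out an RCU grace period (e.g. via call_rcu())
 * before actually freeing, so lock-free readers can finish first. */
static void defer_free(void *p)
{
        free(p);
}

static void drop_container(struct container *c)
{
        struct table *t = c->tbl;

        if (t == &c->embedded) {
                /* Readers still holding the table pointer keep the whole
                 * container alive, so the container itself is deferred. */
                defer_free(c);
        } else {
                free(c);                /* container is private now */
                defer_free(t);          /* only the detached table must wait */
        }
}
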
| 420 | 432 | ||
| @@ -831,6 +843,7 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 831 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 843 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
| 832 | if (group_dead) { | 844 | if (group_dead) { |
| 833 | del_timer_sync(&tsk->signal->real_timer); | 845 | del_timer_sync(&tsk->signal->real_timer); |
| 846 | exit_itimers(tsk->signal); | ||
| 834 | acct_process(code); | 847 | acct_process(code); |
| 835 | } | 848 | } |
| 836 | exit_mm(tsk); | 849 | exit_mm(tsk); |
| @@ -1191,7 +1204,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, | |||
| 1191 | 1204 | ||
| 1192 | exit_code = p->exit_code; | 1205 | exit_code = p->exit_code; |
| 1193 | if (unlikely(!exit_code) || | 1206 | if (unlikely(!exit_code) || |
| 1194 | unlikely(p->state > TASK_STOPPED)) | 1207 | unlikely(p->state & TASK_TRACED)) |
| 1195 | goto bail_ref; | 1208 | goto bail_ref; |
| 1196 | return wait_noreap_copyout(p, pid, uid, | 1209 | return wait_noreap_copyout(p, pid, uid, |
| 1197 | why, (exit_code << 8) | 0x7f, | 1210 | why, (exit_code << 8) | 0x7f, |
diff --git a/kernel/fork.c b/kernel/fork.c index 7e1ead9a6ba4..280bd44ac441 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
| 36 | #include <linux/jiffies.h> | 36 | #include <linux/jiffies.h> |
| 37 | #include <linux/futex.h> | 37 | #include <linux/futex.h> |
| 38 | #include <linux/rcupdate.h> | ||
| 38 | #include <linux/ptrace.h> | 39 | #include <linux/ptrace.h> |
| 39 | #include <linux/mount.h> | 40 | #include <linux/mount.h> |
| 40 | #include <linux/audit.h> | 41 | #include <linux/audit.h> |
| @@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 176 | 177 | ||
| 177 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | 178 | /* One for us, one for whoever does the "release_task()" (usually parent) */ |
| 178 | atomic_set(&tsk->usage,2); | 179 | atomic_set(&tsk->usage,2); |
| 180 | atomic_set(&tsk->fs_excl, 0); | ||
| 179 | return tsk; | 181 | return tsk; |
| 180 | } | 182 | } |
| 181 | 183 | ||
| @@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) | |||
| 564 | return 0; | 566 | return 0; |
| 565 | } | 567 | } |
| 566 | 568 | ||
| 567 | static int count_open_files(struct files_struct *files, int size) | 569 | static int count_open_files(struct fdtable *fdt) |
| 568 | { | 570 | { |
| 571 | int size = fdt->max_fdset; | ||
| 569 | int i; | 572 | int i; |
| 570 | 573 | ||
| 571 | /* Find the last open fd */ | 574 | /* Find the last open fd */ |
| 572 | for (i = size/(8*sizeof(long)); i > 0; ) { | 575 | for (i = size/(8*sizeof(long)); i > 0; ) { |
| 573 | if (files->open_fds->fds_bits[--i]) | 576 | if (fdt->open_fds->fds_bits[--i]) |
| 574 | break; | 577 | break; |
| 575 | } | 578 | } |
| 576 | i = (i+1) * 8 * sizeof(long); | 579 | i = (i+1) * 8 * sizeof(long); |
| 577 | return i; | 580 | return i; |
| 578 | } | 581 | } |
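
count_open_files() now takes the fdtable directly, but the scan itself is unchanged: walk the open-fds bitmap downwards to the last non-zero word and round the answer up to a whole word of bits. The same scan in standalone form (a plain unsigned long array instead of the kernel fd_set bits):

/* Return how many bit positions are worth copying: everything up to and
 * including the last non-zero word, rounded up to a full word. Assumes
 * nbits is a multiple of the word size, as max_fdset is in the kernel. */
static int last_used_bits(const unsigned long *bits, int nbits)
{
        int i;

        for (i = nbits / (8 * sizeof(long)); i > 0; ) {
                if (bits[--i])
                        break;
        }
        return (i + 1) * 8 * sizeof(long);
}
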
| 579 | 582 | ||
| 583 | static struct files_struct *alloc_files(void) | ||
| 584 | { | ||
| 585 | struct files_struct *newf; | ||
| 586 | struct fdtable *fdt; | ||
| 587 | |||
| 588 | newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | ||
| 589 | if (!newf) | ||
| 590 | goto out; | ||
| 591 | |||
| 592 | atomic_set(&newf->count, 1); | ||
| 593 | |||
| 594 | spin_lock_init(&newf->file_lock); | ||
| 595 | fdt = &newf->fdtab; | ||
| 596 | fdt->next_fd = 0; | ||
| 597 | fdt->max_fds = NR_OPEN_DEFAULT; | ||
| 598 | fdt->max_fdset = __FD_SETSIZE; | ||
| 599 | fdt->close_on_exec = &newf->close_on_exec_init; | ||
| 600 | fdt->open_fds = &newf->open_fds_init; | ||
| 601 | fdt->fd = &newf->fd_array[0]; | ||
| 602 | INIT_RCU_HEAD(&fdt->rcu); | ||
| 603 | fdt->free_files = NULL; | ||
| 604 | fdt->next = NULL; | ||
| 605 | rcu_assign_pointer(newf->fdt, fdt); | ||
| 606 | out: | ||
| 607 | return newf; | ||
| 608 | } | ||
| 609 | |||
| 580 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | 610 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) |
| 581 | { | 611 | { |
| 582 | struct files_struct *oldf, *newf; | 612 | struct files_struct *oldf, *newf; |
| 583 | struct file **old_fds, **new_fds; | 613 | struct file **old_fds, **new_fds; |
| 584 | int open_files, size, i, error = 0, expand; | 614 | int open_files, size, i, error = 0, expand; |
| 615 | struct fdtable *old_fdt, *new_fdt; | ||
| 585 | 616 | ||
| 586 | /* | 617 | /* |
| 587 | * A background process may not have any files ... | 618 | * A background process may not have any files ... |
| @@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
| 602 | */ | 633 | */ |
| 603 | tsk->files = NULL; | 634 | tsk->files = NULL; |
| 604 | error = -ENOMEM; | 635 | error = -ENOMEM; |
| 605 | newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | 636 | newf = alloc_files(); |
| 606 | if (!newf) | 637 | if (!newf) |
| 607 | goto out; | 638 | goto out; |
| 608 | 639 | ||
| 609 | atomic_set(&newf->count, 1); | ||
| 610 | |||
| 611 | spin_lock_init(&newf->file_lock); | ||
| 612 | newf->next_fd = 0; | ||
| 613 | newf->max_fds = NR_OPEN_DEFAULT; | ||
| 614 | newf->max_fdset = __FD_SETSIZE; | ||
| 615 | newf->close_on_exec = &newf->close_on_exec_init; | ||
| 616 | newf->open_fds = &newf->open_fds_init; | ||
| 617 | newf->fd = &newf->fd_array[0]; | ||
| 618 | |||
| 619 | spin_lock(&oldf->file_lock); | 640 | spin_lock(&oldf->file_lock); |
| 620 | 641 | old_fdt = files_fdtable(oldf); | |
| 621 | open_files = count_open_files(oldf, oldf->max_fdset); | 642 | new_fdt = files_fdtable(newf); |
| 643 | size = old_fdt->max_fdset; | ||
| 644 | open_files = count_open_files(old_fdt); | ||
| 622 | expand = 0; | 645 | expand = 0; |
| 623 | 646 | ||
| 624 | /* | 647 | /* |
| 625 | * Check whether we need to allocate a larger fd array or fd set. | 648 | * Check whether we need to allocate a larger fd array or fd set. |
| 626 | * Note: we're not a clone task, so the open count won't change. | 649 | * Note: we're not a clone task, so the open count won't change. |
| 627 | */ | 650 | */ |
| 628 | if (open_files > newf->max_fdset) { | 651 | if (open_files > new_fdt->max_fdset) { |
| 629 | newf->max_fdset = 0; | 652 | new_fdt->max_fdset = 0; |
| 630 | expand = 1; | 653 | expand = 1; |
| 631 | } | 654 | } |
| 632 | if (open_files > newf->max_fds) { | 655 | if (open_files > new_fdt->max_fds) { |
| 633 | newf->max_fds = 0; | 656 | new_fdt->max_fds = 0; |
| 634 | expand = 1; | 657 | expand = 1; |
| 635 | } | 658 | } |
| 636 | 659 | ||
| @@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
| 642 | spin_unlock(&newf->file_lock); | 665 | spin_unlock(&newf->file_lock); |
| 643 | if (error < 0) | 666 | if (error < 0) |
| 644 | goto out_release; | 667 | goto out_release; |
| 668 | new_fdt = files_fdtable(newf); | ||
| 669 | /* | ||
| 670 | * Reacquire the oldf lock and a pointer to its fd table; | ||
| 671 | * it may have been replaced by a new, bigger fd table, so | ||
| 672 | * we need the latest pointer. | ||
| 673 | */ | ||
| 645 | spin_lock(&oldf->file_lock); | 674 | spin_lock(&oldf->file_lock); |
| 675 | old_fdt = files_fdtable(oldf); | ||
| 646 | } | 676 | } |
| 647 | 677 | ||
| 648 | old_fds = oldf->fd; | 678 | old_fds = old_fdt->fd; |
| 649 | new_fds = newf->fd; | 679 | new_fds = new_fdt->fd; |
| 650 | 680 | ||
| 651 | memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); | 681 | memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); |
| 652 | memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); | 682 | memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); |
| 653 | 683 | ||
| 654 | for (i = open_files; i != 0; i--) { | 684 | for (i = open_files; i != 0; i--) { |
| 655 | struct file *f = *old_fds++; | 685 | struct file *f = *old_fds++; |
| @@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
| 662 | * is partway through open(). So make sure that this | 692 | * is partway through open(). So make sure that this |
| 663 | * fd is available to the new process. | 693 | * fd is available to the new process. |
| 664 | */ | 694 | */ |
| 665 | FD_CLR(open_files - i, newf->open_fds); | 695 | FD_CLR(open_files - i, new_fdt->open_fds); |
| 666 | } | 696 | } |
| 667 | *new_fds++ = f; | 697 | rcu_assign_pointer(*new_fds++, f); |
| 668 | } | 698 | } |
| 669 | spin_unlock(&oldf->file_lock); | 699 | spin_unlock(&oldf->file_lock); |
| 670 | 700 | ||
| 671 | /* compute the remainder to be cleared */ | 701 | /* compute the remainder to be cleared */ |
| 672 | size = (newf->max_fds - open_files) * sizeof(struct file *); | 702 | size = (new_fdt->max_fds - open_files) * sizeof(struct file *); |
| 673 | 703 | ||
| 675 | /* This is long word aligned thus could use an optimized version | 705 | /* This is long word aligned thus could use an optimized version |
| 675 | memset(new_fds, 0, size); | 705 | memset(new_fds, 0, size); |
| 676 | 706 | ||
| 677 | if (newf->max_fdset > open_files) { | 707 | if (new_fdt->max_fdset > open_files) { |
| 678 | int left = (newf->max_fdset-open_files)/8; | 708 | int left = (new_fdt->max_fdset-open_files)/8; |
| 679 | int start = open_files / (8 * sizeof(unsigned long)); | 709 | int start = open_files / (8 * sizeof(unsigned long)); |
| 680 | 710 | ||
| 681 | memset(&newf->open_fds->fds_bits[start], 0, left); | 711 | memset(&new_fdt->open_fds->fds_bits[start], 0, left); |
| 682 | memset(&newf->close_on_exec->fds_bits[start], 0, left); | 712 | memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); |
| 683 | } | 713 | } |
| 684 | 714 | ||
| 685 | tsk->files = newf; | 715 | tsk->files = newf; |
| @@ -688,9 +718,9 @@ out: | |||
| 688 | return error; | 718 | return error; |
| 689 | 719 | ||
| 690 | out_release: | 720 | out_release: |
| 691 | free_fdset (newf->close_on_exec, newf->max_fdset); | 721 | free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); |
| 692 | free_fdset (newf->open_fds, newf->max_fdset); | 722 | free_fdset (new_fdt->open_fds, new_fdt->max_fdset); |
| 693 | free_fd_array(newf->fd, newf->max_fds); | 723 | free_fd_array(new_fdt->fd, new_fdt->max_fds); |
| 694 | kmem_cache_free(files_cachep, newf); | 724 | kmem_cache_free(files_cachep, newf); |
| 695 | goto out; | 725 | goto out; |
| 696 | } | 726 | } |
| @@ -818,7 +848,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
| 818 | { | 848 | { |
| 819 | unsigned long new_flags = p->flags; | 849 | unsigned long new_flags = p->flags; |
| 820 | 850 | ||
| 821 | new_flags &= ~PF_SUPERPRIV; | 851 | new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); |
| 822 | new_flags |= PF_FORKNOEXEC; | 852 | new_flags |= PF_FORKNOEXEC; |
| 823 | if (!(clone_flags & CLONE_PTRACE)) | 853 | if (!(clone_flags & CLONE_PTRACE)) |
| 824 | p->ptrace = 0; | 854 | p->ptrace = 0; |
| @@ -1032,7 +1062,8 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1032 | * parent's CPU). This avoids a lot of nasty races. | 1062 | * parent's CPU). This avoids a lot of nasty races. |
| 1033 | */ | 1063 | */ |
| 1034 | p->cpus_allowed = current->cpus_allowed; | 1064 | p->cpus_allowed = current->cpus_allowed; |
| 1035 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) | 1065 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || |
| 1066 | !cpu_online(task_cpu(p)))) | ||
| 1036 | set_task_cpu(p, smp_processor_id()); | 1067 | set_task_cpu(p, smp_processor_id()); |
| 1037 | 1068 | ||
| 1038 | /* | 1069 | /* |
| @@ -1115,6 +1146,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1115 | __get_cpu_var(process_counts)++; | 1146 | __get_cpu_var(process_counts)++; |
| 1116 | } | 1147 | } |
| 1117 | 1148 | ||
| 1149 | if (!current->signal->tty && p->signal->tty) | ||
| 1150 | p->signal->tty = NULL; | ||
| 1151 | |||
| 1118 | nr_threads++; | 1152 | nr_threads++; |
| 1119 | total_forks++; | 1153 | total_forks++; |
| 1120 | write_unlock_irq(&tasklist_lock); | 1154 | write_unlock_irq(&tasklist_lock); |
diff --git a/kernel/futex.c b/kernel/futex.c index c7130f86106c..ca05fe6a70b2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -40,6 +40,7 @@ | |||
| 40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
| 41 | #include <linux/syscalls.h> | 41 | #include <linux/syscalls.h> |
| 42 | #include <linux/signal.h> | 42 | #include <linux/signal.h> |
| 43 | #include <asm/futex.h> | ||
| 43 | 44 | ||
| 44 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 45 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
| 45 | 46 | ||
| @@ -327,6 +328,118 @@ out: | |||
| 327 | } | 328 | } |
| 328 | 329 | ||
| 329 | /* | 330 | /* |
| 331 | * Wake up all waiters hashed on the physical page that is mapped | ||
| 332 | * to this virtual address: | ||
| 333 | */ | ||
| 334 | static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) | ||
| 335 | { | ||
| 336 | union futex_key key1, key2; | ||
| 337 | struct futex_hash_bucket *bh1, *bh2; | ||
| 338 | struct list_head *head; | ||
| 339 | struct futex_q *this, *next; | ||
| 340 | int ret, op_ret, attempt = 0; | ||
| 341 | |||
| 342 | retryfull: | ||
| 343 | down_read(¤t->mm->mmap_sem); | ||
| 344 | |||
| 345 | ret = get_futex_key(uaddr1, &key1); | ||
| 346 | if (unlikely(ret != 0)) | ||
| 347 | goto out; | ||
| 348 | ret = get_futex_key(uaddr2, &key2); | ||
| 349 | if (unlikely(ret != 0)) | ||
| 350 | goto out; | ||
| 351 | |||
| 352 | bh1 = hash_futex(&key1); | ||
| 353 | bh2 = hash_futex(&key2); | ||
| 354 | |||
| 355 | retry: | ||
| 356 | if (bh1 < bh2) | ||
| 357 | spin_lock(&bh1->lock); | ||
| 358 | spin_lock(&bh2->lock); | ||
| 359 | if (bh1 > bh2) | ||
| 360 | spin_lock(&bh1->lock); | ||
| 361 | |||
| 362 | op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); | ||
| 363 | if (unlikely(op_ret < 0)) { | ||
| 364 | int dummy; | ||
| 365 | |||
| 366 | spin_unlock(&bh1->lock); | ||
| 367 | if (bh1 != bh2) | ||
| 368 | spin_unlock(&bh2->lock); | ||
| 369 | |||
| 370 | /* futex_atomic_op_inuser needs to both read and write | ||
| 371 | * *(int __user *)uaddr2, but we can't modify it | ||
| 372 | * non-atomically. Therefore, if get_user below is not | ||
| 373 | * enough, we need to handle the fault ourselves, while | ||
| 374 | * still holding the mmap_sem. */ | ||
| 375 | if (attempt++) { | ||
| 376 | struct vm_area_struct * vma; | ||
| 377 | struct mm_struct *mm = current->mm; | ||
| 378 | |||
| 379 | ret = -EFAULT; | ||
| 380 | if (attempt >= 2 || | ||
| 381 | !(vma = find_vma(mm, uaddr2)) || | ||
| 382 | vma->vm_start > uaddr2 || | ||
| 383 | !(vma->vm_flags & VM_WRITE)) | ||
| 384 | goto out; | ||
| 385 | |||
| 386 | switch (handle_mm_fault(mm, vma, uaddr2, 1)) { | ||
| 387 | case VM_FAULT_MINOR: | ||
| 388 | current->min_flt++; | ||
| 389 | break; | ||
| 390 | case VM_FAULT_MAJOR: | ||
| 391 | current->maj_flt++; | ||
| 392 | break; | ||
| 393 | default: | ||
| 394 | goto out; | ||
| 395 | } | ||
| 396 | goto retry; | ||
| 397 | } | ||
| 398 | |||
| 399 | /* If we would have faulted, release mmap_sem, | ||
| 400 | * fault it in and start all over again. */ | ||
| 401 | up_read(¤t->mm->mmap_sem); | ||
| 402 | |||
| 403 | ret = get_user(dummy, (int __user *)uaddr2); | ||
| 404 | if (ret) | ||
| 405 | return ret; | ||
| 406 | |||
| 407 | goto retryfull; | ||
| 408 | } | ||
| 409 | |||
| 410 | head = &bh1->chain; | ||
| 411 | |||
| 412 | list_for_each_entry_safe(this, next, head, list) { | ||
| 413 | if (match_futex (&this->key, &key1)) { | ||
| 414 | wake_futex(this); | ||
| 415 | if (++ret >= nr_wake) | ||
| 416 | break; | ||
| 417 | } | ||
| 418 | } | ||
| 419 | |||
| 420 | if (op_ret > 0) { | ||
| 421 | head = &bh2->chain; | ||
| 422 | |||
| 423 | op_ret = 0; | ||
| 424 | list_for_each_entry_safe(this, next, head, list) { | ||
| 425 | if (match_futex (&this->key, &key2)) { | ||
| 426 | wake_futex(this); | ||
| 427 | if (++op_ret >= nr_wake2) | ||
| 428 | break; | ||
| 429 | } | ||
| 430 | } | ||
| 431 | ret += op_ret; | ||
| 432 | } | ||
| 433 | |||
| 434 | spin_unlock(&bh1->lock); | ||
| 435 | if (bh1 != bh2) | ||
| 436 | spin_unlock(&bh2->lock); | ||
| 437 | out: | ||
| 438 | up_read(¤t->mm->mmap_sem); | ||
| 439 | return ret; | ||
| 440 | } | ||
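
futex_wake_op() has to hold two hash-bucket locks at once, so it always takes them in a fixed (pointer) order and only once when both keys hash to the same bucket; that is what the bh1 < bh2 / bh1 > bh2 dance above is doing. The same idiom as a pthread sketch (illustrative, not the futex code):

#include <pthread.h>

/* Acquire two bucket locks in a globally consistent order so that two
 * callers locking the same pair can never deadlock; lock only once if
 * both pointers name the same bucket. */
static void lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a < b)
                pthread_mutex_lock(a);
        pthread_mutex_lock(b);
        if (a > b)
                pthread_mutex_lock(a);
}

static void unlock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}
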
| 441 | |||
| 442 | /* | ||
| 330 | * Requeue all waiters hashed on one physical page to another | 443 | * Requeue all waiters hashed on one physical page to another |
| 331 | * physical page. | 444 | * physical page. |
| 332 | */ | 445 | */ |
| @@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
| 673 | filp->f_mapping = filp->f_dentry->d_inode->i_mapping; | 786 | filp->f_mapping = filp->f_dentry->d_inode->i_mapping; |
| 674 | 787 | ||
| 675 | if (signal) { | 788 | if (signal) { |
| 676 | int err; | ||
| 677 | err = f_setown(filp, current->pid, 1); | 789 | err = f_setown(filp, current->pid, 1); |
| 678 | if (err < 0) { | 790 | if (err < 0) { |
| 679 | put_unused_fd(ret); | 791 | goto error; |
| 680 | put_filp(filp); | ||
| 681 | ret = err; | ||
| 682 | goto out; | ||
| 683 | } | 792 | } |
| 684 | filp->f_owner.signum = signal; | 793 | filp->f_owner.signum = signal; |
| 685 | } | 794 | } |
| 686 | 795 | ||
| 687 | q = kmalloc(sizeof(*q), GFP_KERNEL); | 796 | q = kmalloc(sizeof(*q), GFP_KERNEL); |
| 688 | if (!q) { | 797 | if (!q) { |
| 689 | put_unused_fd(ret); | 798 | err = -ENOMEM; |
| 690 | put_filp(filp); | 799 | goto error; |
| 691 | ret = -ENOMEM; | ||
| 692 | goto out; | ||
| 693 | } | 800 | } |
| 694 | 801 | ||
| 695 | down_read(¤t->mm->mmap_sem); | 802 | down_read(¤t->mm->mmap_sem); |
| @@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
| 697 | 804 | ||
| 698 | if (unlikely(err != 0)) { | 805 | if (unlikely(err != 0)) { |
| 699 | up_read(¤t->mm->mmap_sem); | 806 | up_read(¤t->mm->mmap_sem); |
| 700 | put_unused_fd(ret); | ||
| 701 | put_filp(filp); | ||
| 702 | kfree(q); | 807 | kfree(q); |
| 703 | return err; | 808 | goto error; |
| 704 | } | 809 | } |
| 705 | 810 | ||
| 706 | /* | 811 | /* |
| @@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
| 716 | fd_install(ret, filp); | 821 | fd_install(ret, filp); |
| 717 | out: | 822 | out: |
| 718 | return ret; | 823 | return ret; |
| 824 | error: | ||
| 825 | put_unused_fd(ret); | ||
| 826 | put_filp(filp); | ||
| 827 | ret = err; | ||
| 828 | goto out; | ||
| 719 | } | 829 | } |
| 720 | 830 | ||
| 721 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 831 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, |
| @@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
| 740 | case FUTEX_CMP_REQUEUE: | 850 | case FUTEX_CMP_REQUEUE: |
| 741 | ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); | 851 | ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); |
| 742 | break; | 852 | break; |
| 853 | case FUTEX_WAKE_OP: | ||
| 854 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | ||
| 855 | break; | ||
| 743 | default: | 856 | default: |
| 744 | ret = -ENOSYS; | 857 | ret = -ENOSYS; |
| 745 | } | 858 | } |
diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 388977f3e9b7..0cbe633420fb 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c | |||
| @@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void | |||
| 39 | struct list_head *tmp; | 39 | struct list_head *tmp; |
| 40 | struct inter_module_entry *ime, *ime_new; | 40 | struct inter_module_entry *ime, *ime_new; |
| 41 | 41 | ||
| 42 | if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { | 42 | if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { |
| 43 | /* Overloaded kernel, not fatal */ | 43 | /* Overloaded kernel, not fatal */ |
| 44 | printk(KERN_ERR | 44 | printk(KERN_ERR |
| 45 | "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", | 45 | "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", |
| @@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void | |||
| 47 | kmalloc_failed = 1; | 47 | kmalloc_failed = 1; |
| 48 | return; | 48 | return; |
| 49 | } | 49 | } |
| 50 | memset(ime_new, 0, sizeof(*ime_new)); | ||
| 51 | ime_new->im_name = im_name; | 50 | ime_new->im_name = im_name; |
| 52 | ime_new->owner = owner; | 51 | ime_new->owner = owner; |
| 53 | ime_new->userdata = userdata; | 52 | ime_new->userdata = userdata; |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c29f83c16497..3ff7b925c387 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
| 111 | unsigned int status; | 111 | unsigned int status; |
| 112 | 112 | ||
| 113 | kstat_this_cpu.irqs[irq]++; | 113 | kstat_this_cpu.irqs[irq]++; |
| 114 | if (desc->status & IRQ_PER_CPU) { | 114 | if (CHECK_IRQ_PER_CPU(desc->status)) { |
| 115 | irqreturn_t action_ret; | 115 | irqreturn_t action_ret; |
| 116 | 116 | ||
| 117 | /* | 117 | /* |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac6700985705..1cfdb08ddf20 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -18,6 +18,10 @@ | |||
| 18 | 18 | ||
| 19 | cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; | 19 | cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; |
| 20 | 20 | ||
| 21 | #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) | ||
| 22 | cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; | ||
| 23 | #endif | ||
| 24 | |||
| 21 | /** | 25 | /** |
| 22 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
| 23 | * | 27 | * |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 85d08daa6600..f26e534c6585 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | |||
| 19 | */ | 19 | */ |
| 20 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; | 20 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; |
| 21 | 21 | ||
| 22 | void __attribute__((weak)) | 22 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
| 23 | proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 23 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
| 24 | { | ||
| 25 | /* | ||
| 26 | * Save these away for later use. Re-program when the | ||
| 27 | * interrupt is pending. | ||
| 28 | */ | ||
| 29 | set_pending_irq(irq, mask_val); | ||
| 30 | } | ||
| 31 | #else | ||
| 32 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | ||
| 24 | { | 33 | { |
| 25 | irq_affinity[irq] = mask_val; | 34 | irq_affinity[irq] = mask_val; |
| 26 | irq_desc[irq].handler->set_affinity(irq, mask_val); | 35 | irq_desc[irq].handler->set_affinity(irq, mask_val); |
| 27 | } | 36 | } |
| 37 | #endif | ||
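
With CONFIG_GENERIC_PENDING_IRQ the write from /proc no longer reprograms the interrupt controller directly; it just records the requested mask, and the interrupt path applies it the next time that IRQ fires, which is a safe point to touch the hardware. A bare-bones sketch of that defer-and-apply pattern (names, sizes and the missing locking are all simplifications):

#define NR_LINES 16

static unsigned long requested_mask[NR_LINES];
static int mask_pending[NR_LINES];

/* Called from the slow path (e.g. a /proc write): only record the request. */
static void set_line_affinity(int line, unsigned long mask)
{
        requested_mask[line] = mask;
        mask_pending[line] = 1;
}

/* Called on interrupt entry: apply any pending request before handling. */
static void interrupt_entry(int line, void (*program_hw)(int, unsigned long))
{
        if (mask_pending[line]) {
                mask_pending[line] = 0;
                program_hw(line, requested_mask[line]);
        }
        /* ... normal handling ... */
}
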
| 28 | 38 | ||
| 29 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
| 30 | int count, int *eof, void *data) | 40 | int count, int *eof, void *data) |
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 179baafcdd96..64ab045c3d9d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
| @@ -36,7 +36,7 @@ | |||
| 36 | * struct kfifo with kfree(). | 36 | * struct kfifo with kfree(). |
| 37 | */ | 37 | */ |
| 38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, | 38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, |
| 39 | unsigned int __nocast gfp_mask, spinlock_t *lock) | 39 | gfp_t gfp_mask, spinlock_t *lock) |
| 40 | { | 40 | { |
| 41 | struct kfifo *fifo; | 41 | struct kfifo *fifo; |
| 42 | 42 | ||
| @@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init); | |||
| 64 | * | 64 | * |
| 65 | * The size will be rounded-up to a power of 2. | 65 | * The size will be rounded-up to a power of 2. |
| 66 | */ | 66 | */ |
| 67 | struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) | 67 | struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) |
| 68 | { | 68 | { |
| 69 | unsigned char *buffer; | 69 | unsigned char *buffer; |
| 70 | struct kfifo *ret; | 70 | struct kfifo *ret; |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b0237122b24e..f3ea492ab44d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -37,6 +37,7 @@ | |||
| 37 | #include <linux/init.h> | 37 | #include <linux/init.h> |
| 38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
| 39 | #include <linux/moduleloader.h> | 39 | #include <linux/moduleloader.h> |
| 40 | #include <asm-generic/sections.h> | ||
| 40 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
| 41 | #include <asm/errno.h> | 42 | #include <asm/errno.h> |
| 42 | #include <asm/kdebug.h> | 43 | #include <asm/kdebug.h> |
| @@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages; | |||
| 72 | * get_insn_slot() - Find a slot on an executable page for an instruction. | 73 | * get_insn_slot() - Find a slot on an executable page for an instruction. |
| 73 | * We allocate an executable page if there's no room on existing ones. | 74 | * We allocate an executable page if there's no room on existing ones. |
| 74 | */ | 75 | */ |
| 75 | kprobe_opcode_t *get_insn_slot(void) | 76 | kprobe_opcode_t __kprobes *get_insn_slot(void) |
| 76 | { | 77 | { |
| 77 | struct kprobe_insn_page *kip; | 78 | struct kprobe_insn_page *kip; |
| 78 | struct hlist_node *pos; | 79 | struct hlist_node *pos; |
| @@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void) | |||
| 117 | return kip->insns; | 118 | return kip->insns; |
| 118 | } | 119 | } |
| 119 | 120 | ||
| 120 | void free_insn_slot(kprobe_opcode_t *slot) | 121 | void __kprobes free_insn_slot(kprobe_opcode_t *slot) |
| 121 | { | 122 | { |
| 122 | struct kprobe_insn_page *kip; | 123 | struct kprobe_insn_page *kip; |
| 123 | struct hlist_node *pos; | 124 | struct hlist_node *pos; |
| @@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot) | |||
| 152 | } | 153 | } |
| 153 | 154 | ||
| 154 | /* Locks kprobe: irqs must be disabled */ | 155 | /* Locks kprobe: irqs must be disabled */ |
| 155 | void lock_kprobes(void) | 156 | void __kprobes lock_kprobes(void) |
| 156 | { | 157 | { |
| 158 | unsigned long flags = 0; | ||
| 159 | |||
| 160 | /* Keep local interrupts from happening right after we take kprobe_lock | ||
| 161 | * and before we get a chance to update kprobe_cpu; this prevents a | ||
| 162 | * deadlock when we have a kprobe on an ISR routine and a kprobe on a | ||
| 163 | * task routine. | ||
| 164 | */ | ||
| 165 | local_irq_save(flags); | ||
| 166 | |||
| 157 | spin_lock(&kprobe_lock); | 167 | spin_lock(&kprobe_lock); |
| 158 | kprobe_cpu = smp_processor_id(); | 168 | kprobe_cpu = smp_processor_id(); |
| 169 | |||
| 170 | local_irq_restore(flags); | ||
| 159 | } | 171 | } |
| 160 | 172 | ||
| 161 | void unlock_kprobes(void) | 173 | void __kprobes unlock_kprobes(void) |
| 162 | { | 174 | { |
| 175 | unsigned long flags = 0; | ||
| 176 | |||
| 177 | /* Keep local interrupts from happening right after we update | ||
| 178 | * kprobe_cpu and before we get a chance to release kprobe_lock; | ||
| 179 | * this prevents a deadlock when we have a kprobe on an ISR routine | ||
| 180 | * and a kprobe on a task routine. | ||
| 181 | */ | ||
| 182 | local_irq_save(flags); | ||
| 183 | |||
| 163 | kprobe_cpu = NR_CPUS; | 184 | kprobe_cpu = NR_CPUS; |
| 164 | spin_unlock(&kprobe_lock); | 185 | spin_unlock(&kprobe_lock); |
| 186 | |||
| 187 | local_irq_restore(flags); | ||
| 165 | } | 188 | } |
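
lock_kprobes()/unlock_kprobes() now disable local interrupts across the small window between taking (or releasing) kprobe_lock and updating kprobe_cpu, so a probe hit from an interrupt handler on the same CPU cannot observe a half-updated owner and deadlock. A loose userspace analogue of closing that window, with blocked signals standing in for disabled interrupts (a sketch only, not the kernel locking):

#include <pthread.h>
#include <signal.h>

static pthread_mutex_t probe_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t probe_owner;
static int probe_owned;

static void guarded_lock(void)
{
        sigset_t all, old;

        sigfillset(&all);
        pthread_sigmask(SIG_BLOCK, &all, &old);   /* "local_irq_save()" */
        pthread_mutex_lock(&probe_lock);
        probe_owner = pthread_self();
        probe_owned = 1;
        pthread_sigmask(SIG_SETMASK, &old, NULL); /* "local_irq_restore()" */
}

static void guarded_unlock(void)
{
        sigset_t all, old;

        sigfillset(&all);
        pthread_sigmask(SIG_BLOCK, &all, &old);
        probe_owned = 0;
        pthread_mutex_unlock(&probe_lock);
        pthread_sigmask(SIG_SETMASK, &old, NULL);
}
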
| 166 | 189 | ||
| 167 | /* You have to be holding the kprobe_lock */ | 190 | /* You have to be holding the kprobe_lock */ |
| 168 | struct kprobe *get_kprobe(void *addr) | 191 | struct kprobe __kprobes *get_kprobe(void *addr) |
| 169 | { | 192 | { |
| 170 | struct hlist_head *head; | 193 | struct hlist_head *head; |
| 171 | struct hlist_node *node; | 194 | struct hlist_node *node; |
| @@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr) | |||
| 183 | * Aggregate handlers for multiple kprobes support - these handlers | 206 | * Aggregate handlers for multiple kprobes support - these handlers |
| 184 | * take care of invoking the individual kprobe handlers on p->list | 207 | * take care of invoking the individual kprobe handlers on p->list |
| 185 | */ | 208 | */ |
| 186 | static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | 209 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) |
| 187 | { | 210 | { |
| 188 | struct kprobe *kp; | 211 | struct kprobe *kp; |
| 189 | 212 | ||
| @@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
| 198 | return 0; | 221 | return 0; |
| 199 | } | 222 | } |
| 200 | 223 | ||
| 201 | static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | 224 | static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, |
| 202 | unsigned long flags) | 225 | unsigned long flags) |
| 203 | { | 226 | { |
| 204 | struct kprobe *kp; | 227 | struct kprobe *kp; |
| 205 | 228 | ||
| @@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
| 213 | return; | 236 | return; |
| 214 | } | 237 | } |
| 215 | 238 | ||
| 216 | static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 239 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
| 217 | int trapnr) | 240 | int trapnr) |
| 218 | { | 241 | { |
| 219 | /* | 242 | /* |
| 220 | * if we faulted "during" the execution of a user specified | 243 | * if we faulted "during" the execution of a user specified |
| @@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
| 227 | return 0; | 250 | return 0; |
| 228 | } | 251 | } |
| 229 | 252 | ||
| 230 | static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 253 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
| 231 | { | 254 | { |
| 232 | struct kprobe *kp = curr_kprobe; | 255 | struct kprobe *kp = curr_kprobe; |
| 233 | if (curr_kprobe && kp->break_handler) { | 256 | if (curr_kprobe && kp->break_handler) { |
| @@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
| 240 | return 0; | 263 | return 0; |
| 241 | } | 264 | } |
| 242 | 265 | ||
| 243 | struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) | 266 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) |
| 244 | { | 267 | { |
| 245 | struct hlist_node *node; | 268 | struct hlist_node *node; |
| 246 | struct kretprobe_instance *ri; | 269 | struct kretprobe_instance *ri; |
| @@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) | |||
| 249 | return NULL; | 272 | return NULL; |
| 250 | } | 273 | } |
| 251 | 274 | ||
| 252 | static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) | 275 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe |
| 276 | *rp) | ||
| 253 | { | 277 | { |
| 254 | struct hlist_node *node; | 278 | struct hlist_node *node; |
| 255 | struct kretprobe_instance *ri; | 279 | struct kretprobe_instance *ri; |
| @@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) | |||
| 258 | return NULL; | 282 | return NULL; |
| 259 | } | 283 | } |
| 260 | 284 | ||
| 261 | void add_rp_inst(struct kretprobe_instance *ri) | 285 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) |
| 262 | { | 286 | { |
| 263 | /* | 287 | /* |
| 264 | * Remove rp inst off the free list - | 288 | * Remove rp inst off the free list - |
| @@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri) | |||
| 276 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | 300 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); |
| 277 | } | 301 | } |
| 278 | 302 | ||
| 279 | void recycle_rp_inst(struct kretprobe_instance *ri) | 303 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) |
| 280 | { | 304 | { |
| 281 | /* remove rp inst off the rprobe_inst_table */ | 305 | /* remove rp inst off the rprobe_inst_table */ |
| 282 | hlist_del(&ri->hlist); | 306 | hlist_del(&ri->hlist); |
| @@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri) | |||
| 291 | kfree(ri); | 315 | kfree(ri); |
| 292 | } | 316 | } |
| 293 | 317 | ||
| 294 | struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) | 318 | struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) |
| 295 | { | 319 | { |
| 296 | return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; | 320 | return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; |
| 297 | } | 321 | } |
| @@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) | |||
| 302 | * instances associated with this task. These left over instances represent | 326 | * instances associated with this task. These left over instances represent |
| 303 | * probed functions that have been called but will never return. | 327 | * probed functions that have been called but will never return. |
| 304 | */ | 328 | */ |
| 305 | void kprobe_flush_task(struct task_struct *tk) | 329 | void __kprobes kprobe_flush_task(struct task_struct *tk) |
| 306 | { | 330 | { |
| 307 | struct kretprobe_instance *ri; | 331 | struct kretprobe_instance *ri; |
| 308 | struct hlist_head *head; | 332 | struct hlist_head *head; |
| @@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk) | |||
| 322 | * This kprobe pre_handler is registered with every kretprobe. When probe | 346 | * This kprobe pre_handler is registered with every kretprobe. When probe |
| 323 | * hits it will set up the return probe. | 347 | * hits it will set up the return probe. |
| 324 | */ | 348 | */ |
| 325 | static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) | 349 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, |
| 350 | struct pt_regs *regs) | ||
| 326 | { | 351 | { |
| 327 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); | 352 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); |
| 328 | 353 | ||
| @@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
| 353 | * Add the new probe to old_p->list. Fail if this is the | 378 | * Add the new probe to old_p->list. Fail if this is the |
| 354 | * second jprobe at the address - two jprobes can't coexist | 379 | * second jprobe at the address - two jprobes can't coexist |
| 355 | */ | 380 | */ |
| 356 | static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | 381 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) |
| 357 | { | 382 | { |
| 358 | struct kprobe *kp; | 383 | struct kprobe *kp; |
| 359 | 384 | ||
| @@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 395 | * the intricacies | 420 | * the intricacies |
| 396 | * TODO: Move kcalloc outside the spinlock | 421 | * TODO: Move kcalloc outside the spinlock |
| 397 | */ | 422 | */ |
| 398 | static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) | 423 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, |
| 424 | struct kprobe *p) | ||
| 399 | { | 425 | { |
| 400 | int ret = 0; | 426 | int ret = 0; |
| 401 | struct kprobe *ap; | 427 | struct kprobe *ap; |
| @@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p, | |||
| 434 | spin_unlock_irqrestore(&kprobe_lock, flags); | 460 | spin_unlock_irqrestore(&kprobe_lock, flags); |
| 435 | } | 461 | } |
| 436 | 462 | ||
| 437 | int register_kprobe(struct kprobe *p) | 463 | static int __kprobes in_kprobes_functions(unsigned long addr) |
| 464 | { | ||
| 465 | if (addr >= (unsigned long)__kprobes_text_start | ||
| 466 | && addr < (unsigned long)__kprobes_text_end) | ||
| 467 | return -EINVAL; | ||
| 468 | return 0; | ||
| 469 | } | ||
| 470 | |||
| 471 | int __kprobes register_kprobe(struct kprobe *p) | ||
| 438 | { | 472 | { |
| 439 | int ret = 0; | 473 | int ret = 0; |
| 440 | unsigned long flags = 0; | 474 | unsigned long flags = 0; |
| 441 | struct kprobe *old_p; | 475 | struct kprobe *old_p; |
| 442 | 476 | ||
| 443 | if ((ret = arch_prepare_kprobe(p)) != 0) { | 477 | if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) |
| 478 | return ret; | ||
| 479 | if ((ret = arch_prepare_kprobe(p)) != 0) | ||
| 444 | goto rm_kprobe; | 480 | goto rm_kprobe; |
| 445 | } | 481 | |
| 446 | spin_lock_irqsave(&kprobe_lock, flags); | 482 | spin_lock_irqsave(&kprobe_lock, flags); |
| 447 | old_p = get_kprobe(p->addr); | 483 | old_p = get_kprobe(p->addr); |
| 448 | p->nmissed = 0; | 484 | p->nmissed = 0; |
| @@ -466,7 +502,7 @@ rm_kprobe: | |||
| 466 | return ret; | 502 | return ret; |
| 467 | } | 503 | } |
| 468 | 504 | ||
| 469 | void unregister_kprobe(struct kprobe *p) | 505 | void __kprobes unregister_kprobe(struct kprobe *p) |
| 470 | { | 506 | { |
| 471 | unsigned long flags; | 507 | unsigned long flags; |
| 472 | struct kprobe *old_p; | 508 | struct kprobe *old_p; |
| @@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = { | |||
| 487 | .priority = 0x7fffffff /* we need to be notified first */ | 523 | .priority = 0x7fffffff /* we need to be notified first */ |
| 488 | }; | 524 | }; |
| 489 | 525 | ||
| 490 | int register_jprobe(struct jprobe *jp) | 526 | int __kprobes register_jprobe(struct jprobe *jp) |
| 491 | { | 527 | { |
| 492 | /* Todo: Verify probepoint is a function entry point */ | 528 | /* Todo: Verify probepoint is a function entry point */ |
| 493 | jp->kp.pre_handler = setjmp_pre_handler; | 529 | jp->kp.pre_handler = setjmp_pre_handler; |
| @@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp) | |||
| 496 | return register_kprobe(&jp->kp); | 532 | return register_kprobe(&jp->kp); |
| 497 | } | 533 | } |
| 498 | 534 | ||
| 499 | void unregister_jprobe(struct jprobe *jp) | 535 | void __kprobes unregister_jprobe(struct jprobe *jp) |
| 500 | { | 536 | { |
| 501 | unregister_kprobe(&jp->kp); | 537 | unregister_kprobe(&jp->kp); |
| 502 | } | 538 | } |
| 503 | 539 | ||
| 504 | #ifdef ARCH_SUPPORTS_KRETPROBES | 540 | #ifdef ARCH_SUPPORTS_KRETPROBES |
| 505 | 541 | ||
| 506 | int register_kretprobe(struct kretprobe *rp) | 542 | int __kprobes register_kretprobe(struct kretprobe *rp) |
| 507 | { | 543 | { |
| 508 | int ret = 0; | 544 | int ret = 0; |
| 509 | struct kretprobe_instance *inst; | 545 | struct kretprobe_instance *inst; |
| @@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp) | |||
| 540 | 576 | ||
| 541 | #else /* ARCH_SUPPORTS_KRETPROBES */ | 577 | #else /* ARCH_SUPPORTS_KRETPROBES */ |
| 542 | 578 | ||
| 543 | int register_kretprobe(struct kretprobe *rp) | 579 | int __kprobes register_kretprobe(struct kretprobe *rp) |
| 544 | { | 580 | { |
| 545 | return -ENOSYS; | 581 | return -ENOSYS; |
| 546 | } | 582 | } |
| 547 | 583 | ||
| 548 | #endif /* ARCH_SUPPORTS_KRETPROBES */ | 584 | #endif /* ARCH_SUPPORTS_KRETPROBES */ |
| 549 | 585 | ||
| 550 | void unregister_kretprobe(struct kretprobe *rp) | 586 | void __kprobes unregister_kretprobe(struct kretprobe *rp) |
| 551 | { | 587 | { |
| 552 | unsigned long flags; | 588 | unsigned long flags; |
| 553 | struct kretprobe_instance *ri; | 589 | struct kretprobe_instance *ri; |
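The kprobes changes above do two related things: every helper on the probe registration and dispatch path is tagged __kprobes, and the new in_kprobes_functions() makes register_kprobe() refuse any address inside that region, since probing the probe machinery itself would recurse through the breakpoint handler. A minimal sketch of how the pieces fit, assuming kernel context; the macro body and extern declarations follow the usual definitions rather than anything quoted in this patch:

    /* places a function into a dedicated text section */
    #define __kprobes __attribute__((__section__(".kprobes.text")))

    /* section bounds emitted by the linker script; every function tagged
     * __kprobes links between them, which is exactly the address range
     * in_kprobes_functions() rejects above */
    extern char __kprobes_text_start[], __kprobes_text_end[];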
diff --git a/kernel/module.c b/kernel/module.c index c32995fbd8fd..ff5c500ab625 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
| 21 | #include <linux/moduleloader.h> | 21 | #include <linux/moduleloader.h> |
| 22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
| 23 | #include <linux/kernel.h> | ||
| 23 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
| 24 | #include <linux/vmalloc.h> | 25 | #include <linux/vmalloc.h> |
| 25 | #include <linux/elf.h> | 26 | #include <linux/elf.h> |
| @@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags) | |||
| 498 | { | 499 | { |
| 499 | int ret = (flags & O_TRUNC); | 500 | int ret = (flags & O_TRUNC); |
| 500 | if (ret) | 501 | if (ret) |
| 501 | tainted |= TAINT_FORCED_MODULE; | 502 | add_taint(TAINT_FORCED_MODULE); |
| 502 | return ret; | 503 | return ret; |
| 503 | } | 504 | } |
| 504 | #else | 505 | #else |
| @@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs, | |||
| 897 | if (!(tainted & TAINT_FORCED_MODULE)) { | 898 | if (!(tainted & TAINT_FORCED_MODULE)) { |
| 898 | printk("%s: no version for \"%s\" found: kernel tainted.\n", | 899 | printk("%s: no version for \"%s\" found: kernel tainted.\n", |
| 899 | mod->name, symname); | 900 | mod->name, symname); |
| 900 | tainted |= TAINT_FORCED_MODULE; | 901 | add_taint(TAINT_FORCED_MODULE); |
| 901 | } | 902 | } |
| 902 | return 1; | 903 | return 1; |
| 903 | } | 904 | } |
| @@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const char *license) | |||
| 1352 | if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { | 1353 | if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { |
| 1353 | printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", | 1354 | printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", |
| 1354 | mod->name, license); | 1355 | mod->name, license); |
| 1355 | tainted |= TAINT_PROPRIETARY_MODULE; | 1356 | add_taint(TAINT_PROPRIETARY_MODULE); |
| 1356 | } | 1357 | } |
| 1357 | } | 1358 | } |
| 1358 | 1359 | ||
| @@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod, | |||
| 1509 | long err = 0; | 1510 | long err = 0; |
| 1510 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | 1511 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ |
| 1511 | struct exception_table_entry *extable; | 1512 | struct exception_table_entry *extable; |
| 1513 | mm_segment_t old_fs; | ||
| 1512 | 1514 | ||
| 1513 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | 1515 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", |
| 1514 | umod, len, uargs); | 1516 | umod, len, uargs); |
| @@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod, | |||
| 1609 | modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); | 1611 | modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); |
| 1610 | /* This is allowed: modprobe --force will invalidate it. */ | 1612 | /* This is allowed: modprobe --force will invalidate it. */ |
| 1611 | if (!modmagic) { | 1613 | if (!modmagic) { |
| 1612 | tainted |= TAINT_FORCED_MODULE; | 1614 | add_taint(TAINT_FORCED_MODULE); |
| 1613 | printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", | 1615 | printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", |
| 1614 | mod->name); | 1616 | mod->name); |
| 1615 | } else if (!same_magic(modmagic, vermagic)) { | 1617 | } else if (!same_magic(modmagic, vermagic)) { |
| @@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod, | |||
| 1738 | (mod->num_gpl_syms && !gplcrcindex)) { | 1740 | (mod->num_gpl_syms && !gplcrcindex)) { |
| 1739 | printk(KERN_WARNING "%s: No versions for exported symbols." | 1741 | printk(KERN_WARNING "%s: No versions for exported symbols." |
| 1740 | " Tainting kernel.\n", mod->name); | 1742 | " Tainting kernel.\n", mod->name); |
| 1741 | tainted |= TAINT_FORCED_MODULE; | 1743 | add_taint(TAINT_FORCED_MODULE); |
| 1742 | } | 1744 | } |
| 1743 | #endif | 1745 | #endif |
| 1744 | 1746 | ||
| @@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod, | |||
| 1779 | if (err < 0) | 1781 | if (err < 0) |
| 1780 | goto cleanup; | 1782 | goto cleanup; |
| 1781 | 1783 | ||
| 1784 | /* flush the icache in correct context */ | ||
| 1785 | old_fs = get_fs(); | ||
| 1786 | set_fs(KERNEL_DS); | ||
| 1787 | |||
| 1788 | /* | ||
| 1789 | * Flush the instruction cache, since we've played with text. | ||
| 1790 | * Do it before processing of module parameters, so the module | ||
| 1791 | * can provide parameter accessor functions of its own. | ||
| 1792 | */ | ||
| 1793 | if (mod->module_init) | ||
| 1794 | flush_icache_range((unsigned long)mod->module_init, | ||
| 1795 | (unsigned long)mod->module_init | ||
| 1796 | + mod->init_size); | ||
| 1797 | flush_icache_range((unsigned long)mod->module_core, | ||
| 1798 | (unsigned long)mod->module_core + mod->core_size); | ||
| 1799 | |||
| 1800 | set_fs(old_fs); | ||
| 1801 | |||
| 1782 | mod->args = args; | 1802 | mod->args = args; |
| 1783 | if (obsparmindex) { | 1803 | if (obsparmindex) { |
| 1784 | err = obsolete_params(mod->name, mod->args, | 1804 | err = obsolete_params(mod->name, mod->args, |
| @@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod, | |||
| 1860 | const char __user *uargs) | 1880 | const char __user *uargs) |
| 1861 | { | 1881 | { |
| 1862 | struct module *mod; | 1882 | struct module *mod; |
| 1863 | mm_segment_t old_fs = get_fs(); | ||
| 1864 | int ret = 0; | 1883 | int ret = 0; |
| 1865 | 1884 | ||
| 1866 | /* Must have permission */ | 1885 | /* Must have permission */ |
| @@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod, | |||
| 1878 | return PTR_ERR(mod); | 1897 | return PTR_ERR(mod); |
| 1879 | } | 1898 | } |
| 1880 | 1899 | ||
| 1881 | /* flush the icache in correct context */ | ||
| 1882 | set_fs(KERNEL_DS); | ||
| 1883 | |||
| 1884 | /* Flush the instruction cache, since we've played with text */ | ||
| 1885 | if (mod->module_init) | ||
| 1886 | flush_icache_range((unsigned long)mod->module_init, | ||
| 1887 | (unsigned long)mod->module_init | ||
| 1888 | + mod->init_size); | ||
| 1889 | flush_icache_range((unsigned long)mod->module_core, | ||
| 1890 | (unsigned long)mod->module_core + mod->core_size); | ||
| 1891 | |||
| 1892 | set_fs(old_fs); | ||
| 1893 | |||
| 1894 | /* Now sew it into the lists. They won't access us, since | 1900 | /* Now sew it into the lists. They won't access us, since |
| 1895 | strong_try_module_get() will fail. */ | 1901 | strong_try_module_get() will fail. */ |
| 1896 | stop_machine_run(__link_module, mod, NR_CPUS); | 1902 | stop_machine_run(__link_module, mod, NR_CPUS); |
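In module.c the open-coded `tainted |= ...` updates become add_taint() calls, and the instruction-cache flush moves from sys_init_module() into load_module() so it runs before module parameters are parsed (a module may provide parameter accessors in the text that was just written). A sketch of what the helper amounts to; the body shown here is an assumption, the real definition lives in the panic/taint code:

    extern int tainted;                 /* global taint bitmask */

    void add_taint(unsigned flag)
    {
            tainted |= flag;            /* e.g. TAINT_FORCED_MODULE */
    }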
diff --git a/kernel/params.c b/kernel/params.c index d586c35ef8fc..1a8614bac5d5 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -80,8 +80,6 @@ static char *next_arg(char *args, char **param, char **val) | |||
| 80 | int in_quote = 0, quoted = 0; | 80 | int in_quote = 0, quoted = 0; |
| 81 | char *next; | 81 | char *next; |
| 82 | 82 | ||
| 83 | /* Chew any extra spaces */ | ||
| 84 | while (*args == ' ') args++; | ||
| 85 | if (*args == '"') { | 83 | if (*args == '"') { |
| 86 | args++; | 84 | args++; |
| 87 | in_quote = 1; | 85 | in_quote = 1; |
| @@ -121,6 +119,10 @@ static char *next_arg(char *args, char **param, char **val) | |||
| 121 | next = args + i + 1; | 119 | next = args + i + 1; |
| 122 | } else | 120 | } else |
| 123 | next = args + i; | 121 | next = args + i; |
| 122 | |||
| 123 | /* Chew up trailing spaces. */ | ||
| 124 | while (*next == ' ') | ||
| 125 | next++; | ||
| 124 | return next; | 126 | return next; |
| 125 | } | 127 | } |
| 126 | 128 | ||
| @@ -135,6 +137,10 @@ int parse_args(const char *name, | |||
| 135 | 137 | ||
| 136 | DEBUGP("Parsing ARGS: %s\n", args); | 138 | DEBUGP("Parsing ARGS: %s\n", args); |
| 137 | 139 | ||
| 140 | /* Chew leading spaces */ | ||
| 141 | while (*args == ' ') | ||
| 142 | args++; | ||
| 143 | |||
| 138 | while (*args) { | 144 | while (*args) { |
| 139 | int ret; | 145 | int ret; |
| 140 | 146 | ||
| @@ -542,8 +548,8 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
| 542 | { | 548 | { |
| 543 | struct module_kobject *mk; | 549 | struct module_kobject *mk; |
| 544 | 550 | ||
| 545 | mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); | 551 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); |
| 546 | memset(mk, 0, sizeof(struct module_kobject)); | 552 | BUG_ON(!mk); |
| 547 | 553 | ||
| 548 | mk->mod = THIS_MODULE; | 554 | mk->mod = THIS_MODULE; |
| 549 | kobj_set_kset_s(mk, module_subsys); | 555 | kobj_set_kset_s(mk, module_subsys); |
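The params.c hunks shuffle whitespace handling: parse_args() now strips leading spaces once, up front, and next_arg() eats the spaces trailing each token, so the parsing loop never starts an iteration on a blank. A standalone illustration of that split (deliberately ignoring the kernel parser's quoting and key=value handling):

    #include <stdio.h>
    #include <string.h>

    /* split on spaces; chew the spaces *after* each token */
    static char *next_arg(char *args, char **param)
    {
        char *next;

        *param = args;
        next = strchr(args, ' ');
        if (next) {
            *next++ = '\0';
            while (*next == ' ')        /* trailing spaces */
                next++;
        } else {
            next = args + strlen(args);
        }
        return next;
    }

    int main(void)
    {
        char cmdline[] = "  ro  root=/dev/sda1   quiet ";
        char *args = cmdline, *param;

        while (*args == ' ')            /* leading spaces, chewed once */
            args++;
        while (*args) {
            args = next_arg(args, &param);
            printf("param: '%s'\n", param);
        }
        return 0;                       /* prints ro, root=/dev/sda1, quiet */
    }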
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ad85d3f0dcc4..bf374fceb39c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, | |||
| 91 | * Update expiry time from increment, and increase overrun count, | 91 | * Update expiry time from increment, and increase overrun count, |
| 92 | * given the current clock sample. | 92 | * given the current clock sample. |
| 93 | */ | 93 | */ |
| 94 | static inline void bump_cpu_timer(struct k_itimer *timer, | 94 | static void bump_cpu_timer(struct k_itimer *timer, |
| 95 | union cpu_time_count now) | 95 | union cpu_time_count now) |
| 96 | { | 96 | { |
| 97 | int i; | 97 | int i; |
| @@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, | |||
| 110 | for (i = 0; incr < delta - incr; i++) | 110 | for (i = 0; incr < delta - incr; i++) |
| 111 | incr = incr << 1; | 111 | incr = incr << 1; |
| 112 | for (; i >= 0; incr >>= 1, i--) { | 112 | for (; i >= 0; incr >>= 1, i--) { |
| 113 | if (delta <= incr) | 113 | if (delta < incr) |
| 114 | continue; | 114 | continue; |
| 115 | timer->it.cpu.expires.sched += incr; | 115 | timer->it.cpu.expires.sched += incr; |
| 116 | timer->it_overrun += 1 << i; | 116 | timer->it_overrun += 1 << i; |
| @@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, | |||
| 128 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) | 128 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) |
| 129 | incr = cputime_add(incr, incr); | 129 | incr = cputime_add(incr, incr); |
| 130 | for (; i >= 0; incr = cputime_halve(incr), i--) { | 130 | for (; i >= 0; incr = cputime_halve(incr), i--) { |
| 131 | if (cputime_le(delta, incr)) | 131 | if (cputime_lt(delta, incr)) |
| 132 | continue; | 132 | continue; |
| 133 | timer->it.cpu.expires.cpu = | 133 | timer->it.cpu.expires.cpu = |
| 134 | cputime_add(timer->it.cpu.expires.cpu, incr); | 134 | cputime_add(timer->it.cpu.expires.cpu, incr); |
| @@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 380 | int posix_cpu_timer_del(struct k_itimer *timer) | 380 | int posix_cpu_timer_del(struct k_itimer *timer) |
| 381 | { | 381 | { |
| 382 | struct task_struct *p = timer->it.cpu.task; | 382 | struct task_struct *p = timer->it.cpu.task; |
| 383 | int ret = 0; | ||
| 383 | 384 | ||
| 384 | if (timer->it.cpu.firing) | 385 | if (likely(p != NULL)) { |
| 385 | return TIMER_RETRY; | ||
| 386 | |||
| 387 | if (unlikely(p == NULL)) | ||
| 388 | return 0; | ||
| 389 | |||
| 390 | if (!list_empty(&timer->it.cpu.entry)) { | ||
| 391 | read_lock(&tasklist_lock); | 386 | read_lock(&tasklist_lock); |
| 392 | if (unlikely(p->signal == NULL)) { | 387 | if (unlikely(p->signal == NULL)) { |
| 393 | /* | 388 | /* |
| @@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer) | |||
| 396 | */ | 391 | */ |
| 397 | BUG_ON(!list_empty(&timer->it.cpu.entry)); | 392 | BUG_ON(!list_empty(&timer->it.cpu.entry)); |
| 398 | } else { | 393 | } else { |
| 399 | /* | ||
| 400 | * Take us off the task's timer list. | ||
| 401 | */ | ||
| 402 | spin_lock(&p->sighand->siglock); | 394 | spin_lock(&p->sighand->siglock); |
| 403 | list_del(&timer->it.cpu.entry); | 395 | if (timer->it.cpu.firing) |
| 396 | ret = TIMER_RETRY; | ||
| 397 | else | ||
| 398 | list_del(&timer->it.cpu.entry); | ||
| 404 | spin_unlock(&p->sighand->siglock); | 399 | spin_unlock(&p->sighand->siglock); |
| 405 | } | 400 | } |
| 406 | read_unlock(&tasklist_lock); | 401 | read_unlock(&tasklist_lock); |
| 402 | |||
| 403 | if (!ret) | ||
| 404 | put_task_struct(p); | ||
| 407 | } | 405 | } |
| 408 | put_task_struct(p); | ||
| 409 | 406 | ||
| 410 | return 0; | 407 | return ret; |
| 411 | } | 408 | } |
| 412 | 409 | ||
| 413 | /* | 410 | /* |
| @@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head, | |||
| 424 | cputime_t ptime = cputime_add(utime, stime); | 421 | cputime_t ptime = cputime_add(utime, stime); |
| 425 | 422 | ||
| 426 | list_for_each_entry_safe(timer, next, head, entry) { | 423 | list_for_each_entry_safe(timer, next, head, entry) { |
| 427 | timer->task = NULL; | ||
| 428 | list_del_init(&timer->entry); | 424 | list_del_init(&timer->entry); |
| 429 | if (cputime_lt(timer->expires.cpu, ptime)) { | 425 | if (cputime_lt(timer->expires.cpu, ptime)) { |
| 430 | timer->expires.cpu = cputime_zero; | 426 | timer->expires.cpu = cputime_zero; |
| @@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head, | |||
| 436 | 432 | ||
| 437 | ++head; | 433 | ++head; |
| 438 | list_for_each_entry_safe(timer, next, head, entry) { | 434 | list_for_each_entry_safe(timer, next, head, entry) { |
| 439 | timer->task = NULL; | ||
| 440 | list_del_init(&timer->entry); | 435 | list_del_init(&timer->entry); |
| 441 | if (cputime_lt(timer->expires.cpu, utime)) { | 436 | if (cputime_lt(timer->expires.cpu, utime)) { |
| 442 | timer->expires.cpu = cputime_zero; | 437 | timer->expires.cpu = cputime_zero; |
| @@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head, | |||
| 448 | 443 | ||
| 449 | ++head; | 444 | ++head; |
| 450 | list_for_each_entry_safe(timer, next, head, entry) { | 445 | list_for_each_entry_safe(timer, next, head, entry) { |
| 451 | timer->task = NULL; | ||
| 452 | list_del_init(&timer->entry); | 446 | list_del_init(&timer->entry); |
| 453 | if (timer->expires.sched < sched_time) { | 447 | if (timer->expires.sched < sched_time) { |
| 454 | timer->expires.sched = 0; | 448 | timer->expires.sched = 0; |
| @@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p, | |||
| 492 | struct task_struct *t = p; | 486 | struct task_struct *t = p; |
| 493 | unsigned int nthreads = atomic_read(&p->signal->live); | 487 | unsigned int nthreads = atomic_read(&p->signal->live); |
| 494 | 488 | ||
| 489 | if (!nthreads) | ||
| 490 | return; | ||
| 491 | |||
| 495 | switch (clock_idx) { | 492 | switch (clock_idx) { |
| 496 | default: | 493 | default: |
| 497 | BUG(); | 494 | BUG(); |
| @@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
| 500 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | 497 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), |
| 501 | nthreads); | 498 | nthreads); |
| 502 | do { | 499 | do { |
| 503 | if (!unlikely(t->exit_state)) { | 500 | if (!unlikely(t->flags & PF_EXITING)) { |
| 504 | ticks = cputime_add(prof_ticks(t), left); | 501 | ticks = cputime_add(prof_ticks(t), left); |
| 505 | if (cputime_eq(t->it_prof_expires, | 502 | if (cputime_eq(t->it_prof_expires, |
| 506 | cputime_zero) || | 503 | cputime_zero) || |
| @@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
| 515 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | 512 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), |
| 516 | nthreads); | 513 | nthreads); |
| 517 | do { | 514 | do { |
| 518 | if (!unlikely(t->exit_state)) { | 515 | if (!unlikely(t->flags & PF_EXITING)) { |
| 519 | ticks = cputime_add(virt_ticks(t), left); | 516 | ticks = cputime_add(virt_ticks(t), left); |
| 520 | if (cputime_eq(t->it_virt_expires, | 517 | if (cputime_eq(t->it_virt_expires, |
| 521 | cputime_zero) || | 518 | cputime_zero) || |
| @@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
| 530 | nsleft = expires.sched - val.sched; | 527 | nsleft = expires.sched - val.sched; |
| 531 | do_div(nsleft, nthreads); | 528 | do_div(nsleft, nthreads); |
| 532 | do { | 529 | do { |
| 533 | if (!unlikely(t->exit_state)) { | 530 | if (!unlikely(t->flags & PF_EXITING)) { |
| 534 | ns = t->sched_time + nsleft; | 531 | ns = t->sched_time + nsleft; |
| 535 | if (t->it_sched_expires == 0 || | 532 | if (t->it_sched_expires == 0 || |
| 536 | t->it_sched_expires > ns) { | 533 | t->it_sched_expires > ns) { |
| @@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | |||
| 569 | struct cpu_timer_list *next; | 566 | struct cpu_timer_list *next; |
| 570 | unsigned long i; | 567 | unsigned long i; |
| 571 | 568 | ||
| 569 | if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) | ||
| 570 | return; | ||
| 571 | |||
| 572 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? | 572 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? |
| 573 | p->cpu_timers : p->signal->cpu_timers); | 573 | p->cpu_timers : p->signal->cpu_timers); |
| 574 | head += CPUCLOCK_WHICH(timer->it_clock); | 574 | head += CPUCLOCK_WHICH(timer->it_clock); |
| @@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | |||
| 579 | listpos = head; | 579 | listpos = head; |
| 580 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { | 580 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { |
| 581 | list_for_each_entry(next, head, entry) { | 581 | list_for_each_entry(next, head, entry) { |
| 582 | if (next->expires.sched > nt->expires.sched) { | 582 | if (next->expires.sched > nt->expires.sched) |
| 583 | listpos = &next->entry; | ||
| 584 | break; | 583 | break; |
| 585 | } | 584 | listpos = &next->entry; |
| 586 | } | 585 | } |
| 587 | } else { | 586 | } else { |
| 588 | list_for_each_entry(next, head, entry) { | 587 | list_for_each_entry(next, head, entry) { |
| 589 | if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { | 588 | if (cputime_gt(next->expires.cpu, nt->expires.cpu)) |
| 590 | listpos = &next->entry; | ||
| 591 | break; | 589 | break; |
| 592 | } | 590 | listpos = &next->entry; |
| 593 | } | 591 | } |
| 594 | } | 592 | } |
| 595 | list_add(&nt->entry, listpos); | 593 | list_add(&nt->entry, listpos); |
| @@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
| 733 | * Disarm any old timer after extracting its expiry time. | 731 | * Disarm any old timer after extracting its expiry time. |
| 734 | */ | 732 | */ |
| 735 | BUG_ON(!irqs_disabled()); | 733 | BUG_ON(!irqs_disabled()); |
| 734 | |||
| 735 | ret = 0; | ||
| 736 | spin_lock(&p->sighand->siglock); | 736 | spin_lock(&p->sighand->siglock); |
| 737 | old_expires = timer->it.cpu.expires; | 737 | old_expires = timer->it.cpu.expires; |
| 738 | list_del_init(&timer->it.cpu.entry); | 738 | if (unlikely(timer->it.cpu.firing)) { |
| 739 | timer->it.cpu.firing = -1; | ||
| 740 | ret = TIMER_RETRY; | ||
| 741 | } else | ||
| 742 | list_del_init(&timer->it.cpu.entry); | ||
| 739 | spin_unlock(&p->sighand->siglock); | 743 | spin_unlock(&p->sighand->siglock); |
| 740 | 744 | ||
| 741 | /* | 745 | /* |
| @@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
| 783 | } | 787 | } |
| 784 | } | 788 | } |
| 785 | 789 | ||
| 786 | if (unlikely(timer->it.cpu.firing)) { | 790 | if (unlikely(ret)) { |
| 787 | /* | 791 | /* |
| 788 | * We are colliding with the timer actually firing. | 792 | * We are colliding with the timer actually firing. |
| 789 | * Punt after filling in the timer's old value, and | 793 | * Punt after filling in the timer's old value, and |
| @@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
| 791 | * it as an overrun (thanks to bump_cpu_timer above). | 795 | * it as an overrun (thanks to bump_cpu_timer above). |
| 792 | */ | 796 | */ |
| 793 | read_unlock(&tasklist_lock); | 797 | read_unlock(&tasklist_lock); |
| 794 | timer->it.cpu.firing = -1; | ||
| 795 | ret = TIMER_RETRY; | ||
| 796 | goto out; | 798 | goto out; |
| 797 | } | 799 | } |
| 798 | 800 | ||
| @@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
| 958 | static void check_thread_timers(struct task_struct *tsk, | 960 | static void check_thread_timers(struct task_struct *tsk, |
| 959 | struct list_head *firing) | 961 | struct list_head *firing) |
| 960 | { | 962 | { |
| 963 | int maxfire; | ||
| 961 | struct list_head *timers = tsk->cpu_timers; | 964 | struct list_head *timers = tsk->cpu_timers; |
| 962 | 965 | ||
| 966 | maxfire = 20; | ||
| 963 | tsk->it_prof_expires = cputime_zero; | 967 | tsk->it_prof_expires = cputime_zero; |
| 964 | while (!list_empty(timers)) { | 968 | while (!list_empty(timers)) { |
| 965 | struct cpu_timer_list *t = list_entry(timers->next, | 969 | struct cpu_timer_list *t = list_entry(timers->next, |
| 966 | struct cpu_timer_list, | 970 | struct cpu_timer_list, |
| 967 | entry); | 971 | entry); |
| 968 | if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { | 972 | if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { |
| 969 | tsk->it_prof_expires = t->expires.cpu; | 973 | tsk->it_prof_expires = t->expires.cpu; |
| 970 | break; | 974 | break; |
| 971 | } | 975 | } |
| @@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 974 | } | 978 | } |
| 975 | 979 | ||
| 976 | ++timers; | 980 | ++timers; |
| 981 | maxfire = 20; | ||
| 977 | tsk->it_virt_expires = cputime_zero; | 982 | tsk->it_virt_expires = cputime_zero; |
| 978 | while (!list_empty(timers)) { | 983 | while (!list_empty(timers)) { |
| 979 | struct cpu_timer_list *t = list_entry(timers->next, | 984 | struct cpu_timer_list *t = list_entry(timers->next, |
| 980 | struct cpu_timer_list, | 985 | struct cpu_timer_list, |
| 981 | entry); | 986 | entry); |
| 982 | if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { | 987 | if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { |
| 983 | tsk->it_virt_expires = t->expires.cpu; | 988 | tsk->it_virt_expires = t->expires.cpu; |
| 984 | break; | 989 | break; |
| 985 | } | 990 | } |
| @@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 988 | } | 993 | } |
| 989 | 994 | ||
| 990 | ++timers; | 995 | ++timers; |
| 996 | maxfire = 20; | ||
| 991 | tsk->it_sched_expires = 0; | 997 | tsk->it_sched_expires = 0; |
| 992 | while (!list_empty(timers)) { | 998 | while (!list_empty(timers)) { |
| 993 | struct cpu_timer_list *t = list_entry(timers->next, | 999 | struct cpu_timer_list *t = list_entry(timers->next, |
| 994 | struct cpu_timer_list, | 1000 | struct cpu_timer_list, |
| 995 | entry); | 1001 | entry); |
| 996 | if (tsk->sched_time < t->expires.sched) { | 1002 | if (!--maxfire || tsk->sched_time < t->expires.sched) { |
| 997 | tsk->it_sched_expires = t->expires.sched; | 1003 | tsk->it_sched_expires = t->expires.sched; |
| 998 | break; | 1004 | break; |
| 999 | } | 1005 | } |
| @@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 1010 | static void check_process_timers(struct task_struct *tsk, | 1016 | static void check_process_timers(struct task_struct *tsk, |
| 1011 | struct list_head *firing) | 1017 | struct list_head *firing) |
| 1012 | { | 1018 | { |
| 1019 | int maxfire; | ||
| 1013 | struct signal_struct *const sig = tsk->signal; | 1020 | struct signal_struct *const sig = tsk->signal; |
| 1014 | cputime_t utime, stime, ptime, virt_expires, prof_expires; | 1021 | cputime_t utime, stime, ptime, virt_expires, prof_expires; |
| 1015 | unsigned long long sched_time, sched_expires; | 1022 | unsigned long long sched_time, sched_expires; |
| @@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1042 | } while (t != tsk); | 1049 | } while (t != tsk); |
| 1043 | ptime = cputime_add(utime, stime); | 1050 | ptime = cputime_add(utime, stime); |
| 1044 | 1051 | ||
| 1052 | maxfire = 20; | ||
| 1045 | prof_expires = cputime_zero; | 1053 | prof_expires = cputime_zero; |
| 1046 | while (!list_empty(timers)) { | 1054 | while (!list_empty(timers)) { |
| 1047 | struct cpu_timer_list *t = list_entry(timers->next, | 1055 | struct cpu_timer_list *t = list_entry(timers->next, |
| 1048 | struct cpu_timer_list, | 1056 | struct cpu_timer_list, |
| 1049 | entry); | 1057 | entry); |
| 1050 | if (cputime_lt(ptime, t->expires.cpu)) { | 1058 | if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { |
| 1051 | prof_expires = t->expires.cpu; | 1059 | prof_expires = t->expires.cpu; |
| 1052 | break; | 1060 | break; |
| 1053 | } | 1061 | } |
| @@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1056 | } | 1064 | } |
| 1057 | 1065 | ||
| 1058 | ++timers; | 1066 | ++timers; |
| 1067 | maxfire = 20; | ||
| 1059 | virt_expires = cputime_zero; | 1068 | virt_expires = cputime_zero; |
| 1060 | while (!list_empty(timers)) { | 1069 | while (!list_empty(timers)) { |
| 1061 | struct cpu_timer_list *t = list_entry(timers->next, | 1070 | struct cpu_timer_list *t = list_entry(timers->next, |
| 1062 | struct cpu_timer_list, | 1071 | struct cpu_timer_list, |
| 1063 | entry); | 1072 | entry); |
| 1064 | if (cputime_lt(utime, t->expires.cpu)) { | 1073 | if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { |
| 1065 | virt_expires = t->expires.cpu; | 1074 | virt_expires = t->expires.cpu; |
| 1066 | break; | 1075 | break; |
| 1067 | } | 1076 | } |
| @@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1070 | } | 1079 | } |
| 1071 | 1080 | ||
| 1072 | ++timers; | 1081 | ++timers; |
| 1082 | maxfire = 20; | ||
| 1073 | sched_expires = 0; | 1083 | sched_expires = 0; |
| 1074 | while (!list_empty(timers)) { | 1084 | while (!list_empty(timers)) { |
| 1075 | struct cpu_timer_list *t = list_entry(timers->next, | 1085 | struct cpu_timer_list *t = list_entry(timers->next, |
| 1076 | struct cpu_timer_list, | 1086 | struct cpu_timer_list, |
| 1077 | entry); | 1087 | entry); |
| 1078 | if (sched_time < t->expires.sched) { | 1088 | if (!--maxfire || sched_time < t->expires.sched) { |
| 1079 | sched_expires = t->expires.sched; | 1089 | sched_expires = t->expires.sched; |
| 1080 | break; | 1090 | break; |
| 1081 | } | 1091 | } |
| @@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1158 | unsigned long long sched_left, sched; | 1168 | unsigned long long sched_left, sched; |
| 1159 | const unsigned int nthreads = atomic_read(&sig->live); | 1169 | const unsigned int nthreads = atomic_read(&sig->live); |
| 1160 | 1170 | ||
| 1171 | if (!nthreads) | ||
| 1172 | return; | ||
| 1173 | |||
| 1161 | prof_left = cputime_sub(prof_expires, utime); | 1174 | prof_left = cputime_sub(prof_expires, utime); |
| 1162 | prof_left = cputime_sub(prof_left, stime); | 1175 | prof_left = cputime_sub(prof_left, stime); |
| 1163 | prof_left = cputime_div(prof_left, nthreads); | 1176 | prof_left = cputime_div(prof_left, nthreads); |
| @@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1194 | 1207 | ||
| 1195 | do { | 1208 | do { |
| 1196 | t = next_thread(t); | 1209 | t = next_thread(t); |
| 1197 | } while (unlikely(t->exit_state)); | 1210 | } while (unlikely(t->flags & PF_EXITING)); |
| 1198 | } while (t != tsk); | 1211 | } while (t != tsk); |
| 1199 | } | 1212 | } |
| 1200 | } | 1213 | } |
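The repeated `maxfire = 20` additions in posix-cpu-timers.c bound how many expired timers one pass of check_thread_timers()/check_process_timers() will walk before it records the next expiry and bails out, and the firing/TIMER_RETRY handling now happens under siglock so deletion cannot race with a timer that is in the middle of firing. A standalone analogue of the bounded-drain loop, not the kernel code; only the budget of 20 is taken from the patch:

    #include <stdio.h>

    struct entry { unsigned long expires; struct entry *next; };

    /* drain at most 20 due entries; return the earliest remaining expiry
     * (0 if nothing is left pending) so the caller can re-arm */
    static unsigned long drain(struct entry **head, unsigned long now)
    {
        int maxfire = 20;
        unsigned long next_expiry = 0;

        while (*head) {
            struct entry *t = *head;
            if (!--maxfire || now < t->expires) {
                next_expiry = t->expires;   /* budget spent or not yet due */
                break;
            }
            *head = t->next;                /* "fire" and unlink */
        }
        return next_expiry;
    }

    int main(void)
    {
        struct entry b = { 100, NULL }, a = { 5, &b };
        struct entry *head = &a;
        printf("next expiry: %lu\n", drain(&head, 10));  /* 100 */
        return 0;
    }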
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 38798a2ff994..dda3cda73c77 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private) | |||
| 427 | timr->sigq->info.si_code = SI_TIMER; | 427 | timr->sigq->info.si_code = SI_TIMER; |
| 428 | timr->sigq->info.si_tid = timr->it_id; | 428 | timr->sigq->info.si_tid = timr->it_id; |
| 429 | timr->sigq->info.si_value = timr->it_sigev_value; | 429 | timr->sigq->info.si_value = timr->it_sigev_value; |
| 430 | |||
| 430 | if (timr->it_sigev_notify & SIGEV_THREAD_ID) { | 431 | if (timr->it_sigev_notify & SIGEV_THREAD_ID) { |
| 431 | if (unlikely(timr->it_process->flags & PF_EXITING)) { | 432 | struct task_struct *leader; |
| 432 | timr->it_sigev_notify = SIGEV_SIGNAL; | 433 | int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, |
| 433 | put_task_struct(timr->it_process); | 434 | timr->it_process); |
| 434 | timr->it_process = timr->it_process->group_leader; | 435 | |
| 435 | goto group; | 436 | if (likely(ret >= 0)) |
| 436 | } | 437 | return ret; |
| 437 | return send_sigqueue(timr->it_sigev_signo, timr->sigq, | 438 | |
| 438 | timr->it_process); | 439 | timr->it_sigev_notify = SIGEV_SIGNAL; |
| 439 | } | 440 | leader = timr->it_process->group_leader; |
| 440 | else { | 441 | put_task_struct(timr->it_process); |
| 441 | group: | 442 | timr->it_process = leader; |
| 442 | return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, | ||
| 443 | timr->it_process); | ||
| 444 | } | 443 | } |
| 444 | |||
| 445 | return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, | ||
| 446 | timr->it_process); | ||
| 445 | } | 447 | } |
| 446 | EXPORT_SYMBOL_GPL(posix_timer_event); | 448 | EXPORT_SYMBOL_GPL(posix_timer_event); |
| 447 | 449 | ||
| @@ -1155,7 +1157,7 @@ retry_delete: | |||
| 1155 | } | 1157 | } |
| 1156 | 1158 | ||
| 1157 | /* | 1159 | /* |
| 1158 | * This is called by __exit_signal, only when there are no more | 1160 | * This is called by do_exit or de_thread, only when there are no more |
| 1159 | * references to the shared signal_struct. | 1161 | * references to the shared signal_struct. |
| 1160 | */ | 1162 | */ |
| 1161 | void exit_itimers(struct signal_struct *sig) | 1163 | void exit_itimers(struct signal_struct *sig) |
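posix_timer_event() no longer peeks at PF_EXITING before deciding how to deliver: it simply attempts the thread-directed send_sigqueue() and, if that fails, permanently switches the timer to SIGEV_SIGNAL aimed at the group leader so later expirations go straight to group delivery. A userspace analogue of that latch-the-fallback shape, with invented names:

    #include <stdio.h>

    struct timer_state { int thread_alive; int use_thread; };

    static int send_to_thread(struct timer_state *t) { return t->thread_alive ? 0 : -1; }
    static int send_to_group(struct timer_state *t)  { (void)t; return 0; }

    static int fire(struct timer_state *t)
    {
        if (t->use_thread) {
            if (send_to_thread(t) >= 0)
                return 0;
            t->use_thread = 0;          /* latch: future firings skip the dead thread */
        }
        return send_to_group(t);
    }

    int main(void)
    {
        struct timer_state t = { .thread_alive = 0, .use_thread = 1 };
        fire(&t);
        printf("use_thread after fallback: %d\n", t.use_thread);   /* 0 */
        return 0;
    }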
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 917066a5767c..46a5e5acff97 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | config PM | 1 | config PM |
| 2 | bool "Power Management support" | 2 | bool "Power Management support" |
| 3 | depends on !IA64_HP_SIM | ||
| 3 | ---help--- | 4 | ---help--- |
| 4 | "Power Management" means that parts of your computer are shut | 5 | "Power Management" means that parts of your computer are shut |
| 5 | off or put into a power conserving "sleep" mode if they are not | 6 | off or put into a power conserving "sleep" mode if they are not |
| @@ -28,7 +29,7 @@ config PM_DEBUG | |||
| 28 | 29 | ||
| 29 | config SOFTWARE_SUSPEND | 30 | config SOFTWARE_SUSPEND |
| 30 | bool "Software Suspend" | 31 | bool "Software Suspend" |
| 31 | depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) | 32 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP) |
| 32 | ---help--- | 33 | ---help--- |
| 33 | Enable the possibility of suspending the machine. | 34 | Enable the possibility of suspending the machine. |
| 34 | It doesn't need APM. | 35 | It doesn't need APM. |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 2d8bf054d036..761956e813f5 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
| @@ -17,12 +17,12 @@ | |||
| 17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
| 18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
| 19 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
| 20 | #include <linux/pm.h> | ||
| 20 | 21 | ||
| 21 | #include "power.h" | 22 | #include "power.h" |
| 22 | 23 | ||
| 23 | 24 | ||
| 24 | extern suspend_disk_method_t pm_disk_mode; | 25 | extern suspend_disk_method_t pm_disk_mode; |
| 25 | extern struct pm_ops * pm_ops; | ||
| 26 | 26 | ||
| 27 | extern int swsusp_suspend(void); | 27 | extern int swsusp_suspend(void); |
| 28 | extern int swsusp_write(void); | 28 | extern int swsusp_write(void); |
| @@ -49,13 +49,11 @@ dev_t swsusp_resume_device; | |||
| 49 | 49 | ||
| 50 | static void power_down(suspend_disk_method_t mode) | 50 | static void power_down(suspend_disk_method_t mode) |
| 51 | { | 51 | { |
| 52 | unsigned long flags; | ||
| 53 | int error = 0; | 52 | int error = 0; |
| 54 | 53 | ||
| 55 | local_irq_save(flags); | ||
| 56 | switch(mode) { | 54 | switch(mode) { |
| 57 | case PM_DISK_PLATFORM: | 55 | case PM_DISK_PLATFORM: |
| 58 | device_shutdown(); | 56 | kernel_power_off_prepare(); |
| 59 | error = pm_ops->enter(PM_SUSPEND_DISK); | 57 | error = pm_ops->enter(PM_SUSPEND_DISK); |
| 60 | break; | 58 | break; |
| 61 | case PM_DISK_SHUTDOWN: | 59 | case PM_DISK_SHUTDOWN: |
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 61deda04e39e..159149321b3c 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
| @@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type, | |||
| 60 | unsigned long id, | 60 | unsigned long id, |
| 61 | pm_callback callback) | 61 | pm_callback callback) |
| 62 | { | 62 | { |
| 63 | struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); | 63 | struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); |
| 64 | if (dev) { | 64 | if (dev) { |
| 65 | memset(dev, 0, sizeof(*dev)); | ||
| 66 | dev->type = type; | 65 | dev->type = type; |
| 67 | dev->id = id; | 66 | dev->id = id; |
| 68 | dev->callback = callback; | 67 | dev->callback = callback; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index cd6a3493cc0d..6748de23e83c 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | #include <linux/suspend.h> | 1 | #include <linux/suspend.h> |
| 2 | #include <linux/utsname.h> | 2 | #include <linux/utsname.h> |
| 3 | 3 | ||
| 4 | /* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but | 4 | /* With SUSPEND_CONSOLE defined suspend looks *really* cool, but |
| 5 | we probably do not take enough locks for switching consoles, etc, | 5 | we probably do not take enough locks for switching consoles, etc, |
| 6 | so bad things might happen. | 6 | so bad things might happen. |
| 7 | */ | 7 | */ |
| @@ -9,6 +9,9 @@ | |||
| 9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
| 10 | #endif | 10 | #endif |
| 11 | 11 | ||
| 12 | #define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ | ||
| 13 | - 4 - 3*sizeof(unsigned long) - sizeof(int) \ | ||
| 14 | - sizeof(void *)) / sizeof(swp_entry_t)) | ||
| 12 | 15 | ||
| 13 | struct swsusp_info { | 16 | struct swsusp_info { |
| 14 | struct new_utsname uts; | 17 | struct new_utsname uts; |
| @@ -18,7 +21,7 @@ struct swsusp_info { | |||
| 18 | unsigned long image_pages; | 21 | unsigned long image_pages; |
| 19 | unsigned long pagedir_pages; | 22 | unsigned long pagedir_pages; |
| 20 | suspend_pagedir_t * suspend_pagedir; | 23 | suspend_pagedir_t * suspend_pagedir; |
| 21 | swp_entry_t pagedir[768]; | 24 | swp_entry_t pagedir[MAX_PBES]; |
| 22 | } __attribute__((aligned(PAGE_SIZE))); | 25 | } __attribute__((aligned(PAGE_SIZE))); |
| 23 | 26 | ||
| 24 | 27 | ||
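MAX_PBES replaces the hard-coded 768-entry pagedir[] with however many swap entries still fit in the page-aligned struct swsusp_info after its fixed fields. The same arithmetic as a standalone program, with stand-in sizes (the real values depend on the architecture and on struct new_utsname):

    #include <stdio.h>

    #define PAGE_SIZE      4096UL
    #define UTSNAME_SIZE   (6 * 65)             /* assumed layout of new_utsname */
    #define SWP_ENTRY_SIZE sizeof(unsigned long)

    #define MAX_PBES ((PAGE_SIZE - UTSNAME_SIZE \
                      - 4 - 3 * sizeof(unsigned long) - sizeof(int) \
                      - sizeof(void *)) / SWP_ENTRY_SIZE)

    int main(void)
    {
        printf("MAX_PBES = %zu (the old array was a fixed 768)\n",
               (size_t)MAX_PBES);
        return 0;
    }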
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index eaacd5cb5889..2d5c45676442 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
| @@ -363,7 +363,7 @@ static void lock_swapdevices(void) | |||
| 363 | } | 363 | } |
| 364 | 364 | ||
| 365 | /** | 365 | /** |
| 366 | * write_swap_page - Write one page to a fresh swap location. | 366 | * write_page - Write one page to a fresh swap location. |
| 367 | * @addr: Address we're writing. | 367 | * @addr: Address we're writing. |
| 368 | * @loc: Place to store the entry we used. | 368 | * @loc: Place to store the entry we used. |
| 369 | * | 369 | * |
| @@ -402,15 +402,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc) | |||
| 402 | static void data_free(void) | 402 | static void data_free(void) |
| 403 | { | 403 | { |
| 404 | swp_entry_t entry; | 404 | swp_entry_t entry; |
| 405 | int i; | 405 | struct pbe * p; |
| 406 | 406 | ||
| 407 | for (i = 0; i < nr_copy_pages; i++) { | 407 | for_each_pbe(p, pagedir_nosave) { |
| 408 | entry = (pagedir_nosave + i)->swap_address; | 408 | entry = p->swap_address; |
| 409 | if (entry.val) | 409 | if (entry.val) |
| 410 | swap_free(entry); | 410 | swap_free(entry); |
| 411 | else | 411 | else |
| 412 | break; | 412 | break; |
| 413 | (pagedir_nosave + i)->swap_address = (swp_entry_t){0}; | ||
| 414 | } | 413 | } |
| 415 | } | 414 | } |
| 416 | 415 | ||
| @@ -863,6 +862,9 @@ static int alloc_image_pages(void) | |||
| 863 | return 0; | 862 | return 0; |
| 864 | } | 863 | } |
| 865 | 864 | ||
| 865 | /* Free pages we allocated for suspend. Suspend pages are allocated | ||
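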
| 866 | * before atomic copy, so we need to free them after resume. | ||
| 867 | */ | ||
| 866 | void swsusp_free(void) | 868 | void swsusp_free(void) |
| 867 | { | 869 | { |
| 868 | BUG_ON(PageNosave(virt_to_page(pagedir_save))); | 870 | BUG_ON(PageNosave(virt_to_page(pagedir_save))); |
| @@ -918,6 +920,7 @@ static int swsusp_alloc(void) | |||
| 918 | 920 | ||
| 919 | pagedir_nosave = NULL; | 921 | pagedir_nosave = NULL; |
| 920 | nr_copy_pages = calc_nr(nr_copy_pages); | 922 | nr_copy_pages = calc_nr(nr_copy_pages); |
| 923 | nr_copy_pages_check = nr_copy_pages; | ||
| 921 | 924 | ||
| 922 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", | 925 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", |
| 923 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); | 926 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); |
| @@ -928,6 +931,10 @@ static int swsusp_alloc(void) | |||
| 928 | if (!enough_swap()) | 931 | if (!enough_swap()) |
| 929 | return -ENOSPC; | 932 | return -ENOSPC; |
| 930 | 933 | ||
| 934 | if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + | ||
| 935 | !!(nr_copy_pages % PBES_PER_PAGE)) | ||
| 936 | return -ENOSPC; | ||
| 937 | |||
| 931 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { | 938 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { |
| 932 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | 939 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); |
| 933 | return -ENOMEM; | 940 | return -ENOMEM; |
| @@ -940,7 +947,6 @@ static int swsusp_alloc(void) | |||
| 940 | return error; | 947 | return error; |
| 941 | } | 948 | } |
| 942 | 949 | ||
| 943 | nr_copy_pages_check = nr_copy_pages; | ||
| 944 | return 0; | 950 | return 0; |
| 945 | } | 951 | } |
| 946 | 952 | ||
| @@ -1059,6 +1065,7 @@ int swsusp_resume(void) | |||
| 1059 | BUG_ON(!error); | 1065 | BUG_ON(!error); |
| 1060 | restore_processor_state(); | 1066 | restore_processor_state(); |
| 1061 | restore_highmem(); | 1067 | restore_highmem(); |
| 1068 | touch_softlockup_watchdog(); | ||
| 1062 | device_power_up(); | 1069 | device_power_up(); |
| 1063 | local_irq_enable(); | 1070 | local_irq_enable(); |
| 1064 | return error; | 1071 | return error; |
| @@ -1088,7 +1095,7 @@ static inline void eat_page(void *page) | |||
| 1088 | *eaten_memory = c; | 1095 | *eaten_memory = c; |
| 1089 | } | 1096 | } |
| 1090 | 1097 | ||
| 1091 | static unsigned long get_usable_page(unsigned gfp_mask) | 1098 | unsigned long get_usable_page(unsigned gfp_mask) |
| 1092 | { | 1099 | { |
| 1093 | unsigned long m; | 1100 | unsigned long m; |
| 1094 | 1101 | ||
| @@ -1102,7 +1109,7 @@ static unsigned long get_usable_page(unsigned gfp_mask) | |||
| 1102 | return m; | 1109 | return m; |
| 1103 | } | 1110 | } |
| 1104 | 1111 | ||
| 1105 | static void free_eaten_memory(void) | 1112 | void free_eaten_memory(void) |
| 1106 | { | 1113 | { |
| 1107 | unsigned long m; | 1114 | unsigned long m; |
| 1108 | void **c; | 1115 | void **c; |
| @@ -1212,8 +1219,9 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
| 1212 | free_pagedir(pblist); | 1219 | free_pagedir(pblist); |
| 1213 | free_eaten_memory(); | 1220 | free_eaten_memory(); |
| 1214 | pblist = NULL; | 1221 | pblist = NULL; |
| 1215 | } | 1222 | /* Is this even worth handling? It should never ever happen, and we |
| 1216 | else | 1223 | have just lost user's state, anyway... */ |
| 1224 | } else | ||
| 1217 | printk("swsusp: Relocated %d pages\n", rel); | 1225 | printk("swsusp: Relocated %d pages\n", rel); |
| 1218 | 1226 | ||
| 1219 | return pblist; | 1227 | return pblist; |
| @@ -1433,9 +1441,9 @@ static int read_pagedir(struct pbe *pblist) | |||
| 1433 | } | 1441 | } |
| 1434 | 1442 | ||
| 1435 | if (error) | 1443 | if (error) |
| 1436 | free_page((unsigned long)pblist); | 1444 | free_pagedir(pblist); |
| 1437 | 1445 | else | |
| 1438 | BUG_ON(i != swsusp_info.pagedir_pages); | 1446 | BUG_ON(i != swsusp_info.pagedir_pages); |
| 1439 | 1447 | ||
| 1440 | return error; | 1448 | return error; |
| 1441 | } | 1449 | } |
| @@ -1473,11 +1481,12 @@ static int read_suspend_image(void) | |||
| 1473 | /* Allocate memory for the image and read the data from swap */ | 1481 | /* Allocate memory for the image and read the data from swap */ |
| 1474 | 1482 | ||
| 1475 | error = check_pagedir(pagedir_nosave); | 1483 | error = check_pagedir(pagedir_nosave); |
| 1476 | free_eaten_memory(); | 1484 | |
| 1477 | if (!error) | 1485 | if (!error) |
| 1478 | error = data_read(pagedir_nosave); | 1486 | error = data_read(pagedir_nosave); |
| 1479 | 1487 | ||
| 1480 | if (error) { /* We fail cleanly */ | 1488 | if (error) { /* We fail cleanly */ |
| 1489 | free_eaten_memory(); | ||
| 1481 | for_each_pbe (p, pagedir_nosave) | 1490 | for_each_pbe (p, pagedir_nosave) |
| 1482 | if (p->address) { | 1491 | if (p->address) { |
| 1483 | free_page(p->address); | 1492 | free_page(p->address); |
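One small idiom from the swsusp_alloc() hunk above: the new bound compares MAX_PBES against `nr_copy_pages / PBES_PER_PAGE + !!(nr_copy_pages % PBES_PER_PAGE)`, which is a ceiling division spelled with double negation. A quick standalone check:

    #include <stdio.h>

    static unsigned long ceil_div(unsigned long n, unsigned long d)
    {
        return n / d + !!(n % d);       /* round up without floating point */
    }

    int main(void)
    {
        printf("%lu %lu %lu\n", ceil_div(10, 4), ceil_div(8, 4), ceil_div(0, 4));
        return 0;                       /* prints: 3 2 0 */
    }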
diff --git a/kernel/printk.c b/kernel/printk.c index 5092397fac29..4b8f0f9230a4 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -488,6 +488,11 @@ static int __init printk_time_setup(char *str) | |||
| 488 | 488 | ||
| 489 | __setup("time", printk_time_setup); | 489 | __setup("time", printk_time_setup); |
| 490 | 490 | ||
| 491 | __attribute__((weak)) unsigned long long printk_clock(void) | ||
| 492 | { | ||
| 493 | return sched_clock(); | ||
| 494 | } | ||
| 495 | |||
| 491 | /* | 496 | /* |
| 492 | * This is printk. It can be called from any context. We want it to work. | 497 | * This is printk. It can be called from any context. We want it to work. |
| 493 | * | 498 | * |
| @@ -514,6 +519,9 @@ asmlinkage int printk(const char *fmt, ...) | |||
| 514 | return r; | 519 | return r; |
| 515 | } | 520 | } |
| 516 | 521 | ||
| 522 | /* cpu currently holding logbuf_lock */ | ||
| 523 | static volatile unsigned int printk_cpu = UINT_MAX; | ||
| 524 | |||
| 517 | asmlinkage int vprintk(const char *fmt, va_list args) | 525 | asmlinkage int vprintk(const char *fmt, va_list args) |
| 518 | { | 526 | { |
| 519 | unsigned long flags; | 527 | unsigned long flags; |
| @@ -522,11 +530,15 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 522 | static char printk_buf[1024]; | 530 | static char printk_buf[1024]; |
| 523 | static int log_level_unknown = 1; | 531 | static int log_level_unknown = 1; |
| 524 | 532 | ||
| 525 | if (unlikely(oops_in_progress)) | 533 | preempt_disable(); |
| 534 | if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) | ||
| 535 | /* If a crash is occurring during printk() on this CPU, | ||
| 536 | * make sure we can't deadlock */ | ||
| 526 | zap_locks(); | 537 | zap_locks(); |
| 527 | 538 | ||
| 528 | /* This stops the holder of console_sem just where we want him */ | 539 | /* This stops the holder of console_sem just where we want him */ |
| 529 | spin_lock_irqsave(&logbuf_lock, flags); | 540 | spin_lock_irqsave(&logbuf_lock, flags); |
| 541 | printk_cpu = smp_processor_id(); | ||
| 530 | 542 | ||
| 531 | /* Emit the output into the temporary buffer */ | 543 | /* Emit the output into the temporary buffer */ |
| 532 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); | 544 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); |
| @@ -558,7 +570,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 558 | loglev_char = default_message_loglevel | 570 | loglev_char = default_message_loglevel |
| 559 | + '0'; | 571 | + '0'; |
| 560 | } | 572 | } |
| 561 | t = sched_clock(); | 573 | t = printk_clock(); |
| 562 | nanosec_rem = do_div(t, 1000000000); | 574 | nanosec_rem = do_div(t, 1000000000); |
| 563 | tlen = sprintf(tbuf, | 575 | tlen = sprintf(tbuf, |
| 564 | "<%c>[%5lu.%06lu] ", | 576 | "<%c>[%5lu.%06lu] ", |
| @@ -595,6 +607,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 595 | * CPU until it is officially up. We shouldn't be calling into | 607 | * CPU until it is officially up. We shouldn't be calling into |
| 596 | * random console drivers on a CPU which doesn't exist yet.. | 608 | * random console drivers on a CPU which doesn't exist yet.. |
| 597 | */ | 609 | */ |
| 610 | printk_cpu = UINT_MAX; | ||
| 598 | spin_unlock_irqrestore(&logbuf_lock, flags); | 611 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 599 | goto out; | 612 | goto out; |
| 600 | } | 613 | } |
| @@ -604,6 +617,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 604 | * We own the drivers. We can drop the spinlock and let | 617 | * We own the drivers. We can drop the spinlock and let |
| 605 | * release_console_sem() print the text | 618 | * release_console_sem() print the text |
| 606 | */ | 619 | */ |
| 620 | printk_cpu = UINT_MAX; | ||
| 607 | spin_unlock_irqrestore(&logbuf_lock, flags); | 621 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 608 | console_may_schedule = 0; | 622 | console_may_schedule = 0; |
| 609 | release_console_sem(); | 623 | release_console_sem(); |
| @@ -613,9 +627,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 613 | * allows the semaphore holder to proceed and to call the | 627 | * allows the semaphore holder to proceed and to call the |
| 614 | * console drivers with the output which we just produced. | 628 | * console drivers with the output which we just produced. |
| 615 | */ | 629 | */ |
| 630 | printk_cpu = UINT_MAX; | ||
| 616 | spin_unlock_irqrestore(&logbuf_lock, flags); | 631 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 617 | } | 632 | } |
| 618 | out: | 633 | out: |
| 634 | preempt_enable(); | ||
| 619 | return printed_len; | 635 | return printed_len; |
| 620 | } | 636 | } |
| 621 | EXPORT_SYMBOL(printk); | 637 | EXPORT_SYMBOL(printk); |
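Two things change in printk.c: timestamps now come from a weak printk_clock() so an architecture can substitute a clock that is safe to read from printk context (the default simply forwards to sched_clock()), and printk_cpu records which CPU holds logbuf_lock so zap_locks() is only invoked when an oops happens on that same CPU. The weak-symbol part in isolation, as a standalone program with a stand-in sched_clock():

    #include <stdio.h>

    static unsigned long long sched_clock(void) { return 123456789ULL; }

    /* default, overridable definition; a stronger printk_clock() in
     * another object file would win at link time */
    __attribute__((weak)) unsigned long long printk_clock(void)
    {
        return sched_clock();
    }

    int main(void)
    {
        printf("timestamp: %llu\n", printk_clock());
        return 0;
    }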
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8dcb8f6288bc..019e04ec065a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
| 118 | return ret; | 118 | return ret; |
| 119 | } | 119 | } |
| 120 | 120 | ||
| 121 | static int may_attach(struct task_struct *task) | ||
| 122 | { | ||
| 123 | if (!task->mm) | ||
| 124 | return -EPERM; | ||
| 125 | if (((current->uid != task->euid) || | ||
| 126 | (current->uid != task->suid) || | ||
| 127 | (current->uid != task->uid) || | ||
| 128 | (current->gid != task->egid) || | ||
| 129 | (current->gid != task->sgid) || | ||
| 130 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | ||
| 131 | return -EPERM; | ||
| 132 | smp_rmb(); | ||
| 133 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | ||
| 134 | return -EPERM; | ||
| 135 | |||
| 136 | return security_ptrace(current, task); | ||
| 137 | } | ||
| 138 | |||
| 139 | int ptrace_may_attach(struct task_struct *task) | ||
| 140 | { | ||
| 141 | int err; | ||
| 142 | task_lock(task); | ||
| 143 | err = may_attach(task); | ||
| 144 | task_unlock(task); | ||
| 145 | return !err; | ||
| 146 | } | ||
| 147 | |||
| 121 | int ptrace_attach(struct task_struct *task) | 148 | int ptrace_attach(struct task_struct *task) |
| 122 | { | 149 | { |
| 123 | int retval; | 150 | int retval; |
| @@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task) | |||
| 127 | goto bad; | 154 | goto bad; |
| 128 | if (task == current) | 155 | if (task == current) |
| 129 | goto bad; | 156 | goto bad; |
| 130 | if (!task->mm) | ||
| 131 | goto bad; | ||
| 132 | if(((current->uid != task->euid) || | ||
| 133 | (current->uid != task->suid) || | ||
| 134 | (current->uid != task->uid) || | ||
| 135 | (current->gid != task->egid) || | ||
| 136 | (current->gid != task->sgid) || | ||
| 137 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | ||
| 138 | goto bad; | ||
| 139 | smp_rmb(); | ||
| 140 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | ||
| 141 | goto bad; | ||
| 142 | /* the same process cannot be attached many times */ | 157 | /* the same process cannot be attached many times */ |
| 143 | if (task->ptrace & PT_PTRACED) | 158 | if (task->ptrace & PT_PTRACED) |
| 144 | goto bad; | 159 | goto bad; |
| 145 | retval = security_ptrace(current, task); | 160 | retval = may_attach(task); |
| 146 | if (retval) | 161 | if (retval) |
| 147 | goto bad; | 162 | goto bad; |
| 148 | 163 | ||
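The ptrace permission tests move out of ptrace_attach() into may_attach(), and the new ptrace_may_attach() wraps them in task_lock() and turns the 0/-EPERM convention into a boolean, presumably so other code paths can reuse the identical policy. A hypothetical caller, sketched with invented names to show the intended usage:

    /* hypothetical: gate access to sensitive per-task data on the very
     * same uid/gid/CAP_SYS_PTRACE policy instead of duplicating it */
    static int show_sensitive_state(struct seq_file *m, struct task_struct *task)
    {
            if (!ptrace_may_attach(task))       /* nonzero == allowed */
                    return -EPERM;
            /* ... emit the data into m ... */
            return 0;
    }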
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f436993bd590..2559d4b8f23f 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -45,6 +45,7 @@ | |||
| 45 | #include <linux/percpu.h> | 45 | #include <linux/percpu.h> |
| 46 | #include <linux/notifier.h> | 46 | #include <linux/notifier.h> |
| 47 | #include <linux/rcupdate.h> | 47 | #include <linux/rcupdate.h> |
| 48 | #include <linux/rcuref.h> | ||
| 48 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
| 49 | 50 | ||
| 50 | /* Definition for rcupdate control block. */ | 51 | /* Definition for rcupdate control block. */ |
| @@ -70,7 +71,20 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | |||
| 70 | 71 | ||
| 71 | /* Fake initialization required by compiler */ | 72 | /* Fake initialization required by compiler */ |
| 72 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | 73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; |
| 73 | static int maxbatch = 10; | 74 | static int maxbatch = 10000; |
| 75 | |||
| 76 | #ifndef __HAVE_ARCH_CMPXCHG | ||
| 77 | /* | ||
| 78 | * We use an array of spinlocks for the rcurefs -- similar to ones in sparc | ||
| 79 | * 32 bit atomic_t implementations, and a hash function similar to that | ||
| 80 | * for our refcounting needs. | ||
| 81 | * Can't help multiprocessors which do not have cmpxchg :( | ||
| 82 | */ | ||
| 83 | |||
| 84 | spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { | ||
| 85 | [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED | ||
| 86 | }; | ||
| 87 | #endif | ||
| 74 | 88 | ||
| 75 | /** | 89 | /** |
| 76 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 90 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
| @@ -95,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head, | |||
| 95 | rdp = &__get_cpu_var(rcu_data); | 109 | rdp = &__get_cpu_var(rcu_data); |
| 96 | *rdp->nxttail = head; | 110 | *rdp->nxttail = head; |
| 97 | rdp->nxttail = &head->next; | 111 | rdp->nxttail = &head->next; |
| 112 | |||
| 113 | if (unlikely(++rdp->count > 10000)) | ||
| 114 | set_need_resched(); | ||
| 115 | |||
| 98 | local_irq_restore(flags); | 116 | local_irq_restore(flags); |
| 99 | } | 117 | } |
| 100 | 118 | ||
| @@ -126,6 +144,12 @@ void fastcall call_rcu_bh(struct rcu_head *head, | |||
| 126 | rdp = &__get_cpu_var(rcu_bh_data); | 144 | rdp = &__get_cpu_var(rcu_bh_data); |
| 127 | *rdp->nxttail = head; | 145 | *rdp->nxttail = head; |
| 128 | rdp->nxttail = &head->next; | 146 | rdp->nxttail = &head->next; |
| 147 | rdp->count++; | ||
| 148 | /* | ||
| 149 | * Should we directly call rcu_do_batch() here ? | ||
| 150 | * if (unlikely(rdp->count > 10000)) | ||
| 151 | * rcu_do_batch(rdp); | ||
| 152 | */ | ||
| 129 | local_irq_restore(flags); | 153 | local_irq_restore(flags); |
| 130 | } | 154 | } |
| 131 | 155 | ||
| @@ -143,6 +167,7 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
| 143 | next = rdp->donelist = list->next; | 167 | next = rdp->donelist = list->next; |
| 144 | list->func(list); | 168 | list->func(list); |
| 145 | list = next; | 169 | list = next; |
| 170 | rdp->count--; | ||
| 146 | if (++count >= maxbatch) | 171 | if (++count >= maxbatch) |
| 147 | break; | 172 | break; |
| 148 | } | 173 | } |
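The rcupdate.c changes add per-CPU bookkeeping of pending callbacks: call_rcu() bumps rdp->count and pokes the scheduler once more than 10000 are queued, rcu_do_batch() decrements it as callbacks run, and maxbatch itself is raised from 10 to 10000. A userspace analogue of that count-and-apply-backpressure pattern, not RCU itself; only the thresholds are taken from the patch:

    #include <stdio.h>

    #define PRESSURE_THRESHOLD 10000

    struct cb_queue { long count; };

    static void enqueue(struct cb_queue *q)
    {
        if (++q->count > PRESSURE_THRESHOLD)        /* kernel: set_need_resched() */
            printf("backpressure: %ld pending\n", q->count);
    }

    static void process_batch(struct cb_queue *q, int maxbatch)
    {
        while (q->count > 0 && maxbatch-- > 0)
            q->count--;                             /* one callback invoked */
    }

    int main(void)
    {
        struct cb_queue q = { 0 };
        for (int i = 0; i < 10002; i++)
            enqueue(&q);
        process_batch(&q, 10000);
        printf("remaining: %ld\n", q.count);        /* 2 */
        return 0;
    }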
diff --git a/kernel/resource.c b/kernel/resource.c index 26967e042201..92285d822de6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource); | |||
| 430 | */ | 430 | */ |
| 431 | struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) | 431 | struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) |
| 432 | { | 432 | { |
| 433 | struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); | 433 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); |
| 434 | 434 | ||
| 435 | if (res) { | 435 | if (res) { |
| 436 | memset(res, 0, sizeof(*res)); | ||
| 437 | res->name = name; | 436 | res->name = name; |
| 438 | res->start = start; | 437 | res->start = start; |
| 439 | res->end = start + n - 1; | 438 | res->end = start + n - 1; |
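__request_region() gets the same conversion as kernel/params.c and kernel/power/pm.c above: kmalloc() followed by memset() collapses into kzalloc(). Assuming the usual slab semantics, the two forms are equivalent whenever the allocation succeeds:

    struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);  /* zeroed on success */

    /* ...replaces...
     * struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
     * if (res)
     *         memset(res, 0, sizeof(*res));
     */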
diff --git a/kernel/sched.c b/kernel/sched.c index 5f889d0cbfcc..1e5cafdf4e27 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -294,6 +294,10 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | |||
| 294 | 294 | ||
| 295 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 295 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) |
| 296 | { | 296 | { |
| 297 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
| 298 | /* this is a valid case when another task releases the spinlock */ | ||
| 299 | rq->lock.owner = current; | ||
| 300 | #endif | ||
| 297 | spin_unlock_irq(&rq->lock); | 301 | spin_unlock_irq(&rq->lock); |
| 298 | } | 302 | } |
| 299 | 303 | ||
| @@ -875,7 +879,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
| 875 | * smp_call_function() if an IPI is sent by the same process we are | 879 | * smp_call_function() if an IPI is sent by the same process we are |
| 876 | * waiting to become inactive. | 880 | * waiting to become inactive. |
| 877 | */ | 881 | */ |
| 878 | void wait_task_inactive(task_t * p) | 882 | void wait_task_inactive(task_t *p) |
| 879 | { | 883 | { |
| 880 | unsigned long flags; | 884 | unsigned long flags; |
| 881 | runqueue_t *rq; | 885 | runqueue_t *rq; |
| @@ -966,8 +970,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 966 | int local_group; | 970 | int local_group; |
| 967 | int i; | 971 | int i; |
| 968 | 972 | ||
| 973 | /* Skip over this group if it has no CPUs allowed */ | ||
| 974 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | ||
| 975 | goto nextgroup; | ||
| 976 | |||
| 969 | local_group = cpu_isset(this_cpu, group->cpumask); | 977 | local_group = cpu_isset(this_cpu, group->cpumask); |
| 970 | /* XXX: put a cpus allowed check */ | ||
| 971 | 978 | ||
| 972 | /* Tally up the load of all CPUs in the group */ | 979 | /* Tally up the load of all CPUs in the group */ |
| 973 | avg_load = 0; | 980 | avg_load = 0; |
| @@ -992,6 +999,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 992 | min_load = avg_load; | 999 | min_load = avg_load; |
| 993 | idlest = group; | 1000 | idlest = group; |
| 994 | } | 1001 | } |
| 1002 | nextgroup: | ||
| 995 | group = group->next; | 1003 | group = group->next; |
| 996 | } while (group != sd->groups); | 1004 | } while (group != sd->groups); |
| 997 | 1005 | ||
| @@ -1003,13 +1011,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 1003 | /* | 1011 | /* |
| 1004 | * find_idlest_queue - find the idlest runqueue among the cpus in group. | 1012 | * find_idlest_queue - find the idlest runqueue among the cpus in group. |
| 1005 | */ | 1013 | */ |
| 1006 | static int find_idlest_cpu(struct sched_group *group, int this_cpu) | 1014 | static int |
| 1015 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | ||
| 1007 | { | 1016 | { |
| 1017 | cpumask_t tmp; | ||
| 1008 | unsigned long load, min_load = ULONG_MAX; | 1018 | unsigned long load, min_load = ULONG_MAX; |
| 1009 | int idlest = -1; | 1019 | int idlest = -1; |
| 1010 | int i; | 1020 | int i; |
| 1011 | 1021 | ||
| 1012 | for_each_cpu_mask(i, group->cpumask) { | 1022 | /* Traverse only the allowed CPUs */ |
| 1023 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | ||
| 1024 | |||
| 1025 | for_each_cpu_mask(i, tmp) { | ||
| 1013 | load = source_load(i, 0); | 1026 | load = source_load(i, 0); |
| 1014 | 1027 | ||
| 1015 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1028 | if (load < min_load || (load == min_load && i == this_cpu)) { |
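Both hunks above add p->cpus_allowed filtering to the wake-balance path: find_idlest_group() now skips groups that contain no allowed CPUs, and find_idlest_cpu() intersects the group mask with the task's affinity before scanning loads. A hedged user-space sketch of that intersect-then-scan pattern, with plain bitmasks standing in for cpumask_t and made-up load figures:

#include <limits.h>
#include <stdio.h>

#define NCPU 8

/* per-CPU load samples; index = CPU id (assumed numbers) */
static unsigned long load[NCPU] = { 90, 10, 70, 25, 40, 5, 80, 60 };

/* Return the least-loaded CPU in 'group_mask' that is also in 'allowed'. */
static int find_idlest_cpu_like(unsigned group_mask, unsigned allowed, int this_cpu)
{
	unsigned tmp = group_mask & allowed;	/* cpus_and(tmp, group->cpumask, p->cpus_allowed) */
	unsigned long min_load = ULONG_MAX;
	int idlest = -1, i;

	for (i = 0; i < NCPU; i++) {
		if (!(tmp & (1u << i)))
			continue;		/* traverse only the allowed CPUs */
		if (load[i] < min_load || (load[i] == min_load && i == this_cpu)) {
			min_load = load[i];
			idlest = i;
		}
	}
	return idlest;				/* -1 means no allowed CPU in this group */
}

int main(void)
{
	/* group covers CPUs 0-3, task is allowed on CPUs 2, 3 and 5 only */
	int cpu = find_idlest_cpu_like(0x0f, 0x2c, 0);

	printf("idlest allowed cpu: %d\n", cpu);	/* 3: load 25 beats CPU 2's 70 */
	return 0;
}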
| @@ -1052,7 +1065,7 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1052 | if (!group) | 1065 | if (!group) |
| 1053 | goto nextlevel; | 1066 | goto nextlevel; |
| 1054 | 1067 | ||
| 1055 | new_cpu = find_idlest_cpu(group, cpu); | 1068 | new_cpu = find_idlest_cpu(group, t, cpu); |
| 1056 | if (new_cpu == -1 || new_cpu == cpu) | 1069 | if (new_cpu == -1 || new_cpu == cpu) |
| 1057 | goto nextlevel; | 1070 | goto nextlevel; |
| 1058 | 1071 | ||
| @@ -1127,7 +1140,7 @@ static inline int wake_idle(int cpu, task_t *p) | |||
| 1127 | * | 1140 | * |
| 1128 | * returns failure only if the task is already active. | 1141 | * returns failure only if the task is already active. |
| 1129 | */ | 1142 | */ |
| 1130 | static int try_to_wake_up(task_t * p, unsigned int state, int sync) | 1143 | static int try_to_wake_up(task_t *p, unsigned int state, int sync) |
| 1131 | { | 1144 | { |
| 1132 | int cpu, this_cpu, success = 0; | 1145 | int cpu, this_cpu, success = 0; |
| 1133 | unsigned long flags; | 1146 | unsigned long flags; |
| @@ -1252,6 +1265,16 @@ out_activate: | |||
| 1252 | } | 1265 | } |
| 1253 | 1266 | ||
| 1254 | /* | 1267 | /* |
| 1268 | * Tasks that have marked their sleep as noninteractive get | ||
| 1269 | * woken up without updating their sleep average. (i.e. their | ||
| 1270 | * sleep is handled in a priority-neutral manner, no priority | ||
| 1271 | * boost and no penalty.) | ||
| 1272 | */ | ||
| 1273 | if (old_state & TASK_NONINTERACTIVE) | ||
| 1274 | __activate_task(p, rq); | ||
| 1275 | else | ||
| 1276 | activate_task(p, rq, cpu == this_cpu); | ||
| 1277 | /* | ||
| 1255 | * Sync wakeups (i.e. those types of wakeups where the waker | 1278 | * Sync wakeups (i.e. those types of wakeups where the waker |
| 1256 | * has indicated that it will leave the CPU in short order) | 1279 | * has indicated that it will leave the CPU in short order) |
| 1257 | * don't trigger a preemption, if the woken up task will run on | 1280 | * don't trigger a preemption, if the woken up task will run on |
| @@ -1259,7 +1282,6 @@ out_activate: | |||
| 1259 | * the waker guarantees that the freshly woken up task is going | 1282 | * the waker guarantees that the freshly woken up task is going |
| 1260 | * to be considered on this CPU.) | 1283 | * to be considered on this CPU.) |
| 1261 | */ | 1284 | */ |
| 1262 | activate_task(p, rq, cpu == this_cpu); | ||
| 1263 | if (!sync || cpu != this_cpu) { | 1285 | if (!sync || cpu != this_cpu) { |
| 1264 | if (TASK_PREEMPTS_CURR(p, rq)) | 1286 | if (TASK_PREEMPTS_CURR(p, rq)) |
| 1265 | resched_task(rq->curr); | 1287 | resched_task(rq->curr); |
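The try_to_wake_up() hunk moves the activation earlier and splits it on a new state bit: a task that flagged its sleep as TASK_NONINTERACTIVE is re-queued via __activate_task(), so the sleep neither boosts nor penalises its priority, while everything else still goes through activate_task() and earns the usual sleep-average credit. A hedged sketch of that branch in isolation; the flag value and helpers below are stand-ins, not the kernel's:

#include <stdio.h>

#define TASK_INTERRUPTIBLE	0x01
#define TASK_NONINTERACTIVE	0x40	/* illustrative bit, not the kernel's value */

struct task {
	unsigned old_state;
	long sleep_avg;		/* interactivity credit, in made-up units */
};

/* priority-neutral requeue: no sleep-average update */
static void activate_plain(struct task *p)
{
	(void)p;
}

/* normal wakeup path: credit the time slept */
static void activate_with_bonus(struct task *p, long slept)
{
	p->sleep_avg += slept;
}

static void wake_up_like(struct task *p, long slept)
{
	if (p->old_state & TASK_NONINTERACTIVE)
		activate_plain(p);		/* mirrors __activate_task(p, rq) */
	else
		activate_with_bonus(p, slept);	/* mirrors activate_task(p, rq, ...) */
}

int main(void)
{
	struct task a = { TASK_INTERRUPTIBLE, 0 };
	struct task b = { TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE, 0 };

	wake_up_like(&a, 100);
	wake_up_like(&b, 100);
	printf("interactive sleeper credit: %ld, noninteractive: %ld\n",
	       a.sleep_avg, b.sleep_avg);	/* 100 vs 0 */
	return 0;
}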
| @@ -1274,7 +1296,7 @@ out: | |||
| 1274 | return success; | 1296 | return success; |
| 1275 | } | 1297 | } |
| 1276 | 1298 | ||
| 1277 | int fastcall wake_up_process(task_t * p) | 1299 | int fastcall wake_up_process(task_t *p) |
| 1278 | { | 1300 | { |
| 1279 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1301 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
| 1280 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1302 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
| @@ -1353,7 +1375,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
| 1353 | * that must be done for every newly created context, then puts the task | 1375 | * that must be done for every newly created context, then puts the task |
| 1354 | * on the runqueue and wakes it. | 1376 | * on the runqueue and wakes it. |
| 1355 | */ | 1377 | */ |
| 1356 | void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) | 1378 | void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) |
| 1357 | { | 1379 | { |
| 1358 | unsigned long flags; | 1380 | unsigned long flags; |
| 1359 | int this_cpu, cpu; | 1381 | int this_cpu, cpu; |
| @@ -1436,7 +1458,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) | |||
| 1436 | * artificially, because any timeslice recovered here | 1458 | * artificially, because any timeslice recovered here |
| 1437 | * was given away by the parent in the first place.) | 1459 | * was given away by the parent in the first place.) |
| 1438 | */ | 1460 | */ |
| 1439 | void fastcall sched_exit(task_t * p) | 1461 | void fastcall sched_exit(task_t *p) |
| 1440 | { | 1462 | { |
| 1441 | unsigned long flags; | 1463 | unsigned long flags; |
| 1442 | runqueue_t *rq; | 1464 | runqueue_t *rq; |
| @@ -1478,6 +1500,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | |||
| 1478 | 1500 | ||
| 1479 | /** | 1501 | /** |
| 1480 | * finish_task_switch - clean up after a task-switch | 1502 | * finish_task_switch - clean up after a task-switch |
| 1503 | * @rq: runqueue associated with task-switch | ||
| 1481 | * @prev: the thread we just switched away from. | 1504 | * @prev: the thread we just switched away from. |
| 1482 | * | 1505 | * |
| 1483 | * finish_task_switch must be called after the context switch, paired | 1506 | * finish_task_switch must be called after the context switch, paired |
| @@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
| 1752 | */ | 1775 | */ |
| 1753 | static inline | 1776 | static inline |
| 1754 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 1777 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, |
| 1755 | struct sched_domain *sd, enum idle_type idle, int *all_pinned) | 1778 | struct sched_domain *sd, enum idle_type idle, |
| 1779 | int *all_pinned) | ||
| 1756 | { | 1780 | { |
| 1757 | /* | 1781 | /* |
| 1758 | * We do not migrate tasks that are: | 1782 | * We do not migrate tasks that are: |
| @@ -1882,10 +1906,11 @@ out: | |||
| 1882 | */ | 1906 | */ |
| 1883 | static struct sched_group * | 1907 | static struct sched_group * |
| 1884 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 1908 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 1885 | unsigned long *imbalance, enum idle_type idle) | 1909 | unsigned long *imbalance, enum idle_type idle, int *sd_idle) |
| 1886 | { | 1910 | { |
| 1887 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 1911 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 1888 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 1912 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| 1913 | unsigned long max_pull; | ||
| 1889 | int load_idx; | 1914 | int load_idx; |
| 1890 | 1915 | ||
| 1891 | max_load = this_load = total_load = total_pwr = 0; | 1916 | max_load = this_load = total_load = total_pwr = 0; |
| @@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1907 | avg_load = 0; | 1932 | avg_load = 0; |
| 1908 | 1933 | ||
| 1909 | for_each_cpu_mask(i, group->cpumask) { | 1934 | for_each_cpu_mask(i, group->cpumask) { |
| 1935 | if (*sd_idle && !idle_cpu(i)) | ||
| 1936 | *sd_idle = 0; | ||
| 1937 | |||
| 1910 | /* Bias balancing toward cpus of our domain */ | 1938 | /* Bias balancing toward cpus of our domain */ |
| 1911 | if (local_group) | 1939 | if (local_group) |
| 1912 | load = target_load(i, load_idx); | 1940 | load = target_load(i, load_idx); |
| @@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1932 | group = group->next; | 1960 | group = group->next; |
| 1933 | } while (group != sd->groups); | 1961 | } while (group != sd->groups); |
| 1934 | 1962 | ||
| 1935 | if (!busiest || this_load >= max_load) | 1963 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) |
| 1936 | goto out_balanced; | 1964 | goto out_balanced; |
| 1937 | 1965 | ||
| 1938 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 1966 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
| @@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1952 | * by pulling tasks to us. Be careful of negative numbers as they'll | 1980 | * by pulling tasks to us. Be careful of negative numbers as they'll |
| 1953 | * appear as very large values with unsigned longs. | 1981 | * appear as very large values with unsigned longs. |
| 1954 | */ | 1982 | */ |
| 1983 | |||
| 1984 | /* Don't want to pull so many tasks that a group would go idle */ | ||
| 1985 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | ||
| 1986 | |||
| 1955 | /* How much load to actually move to equalise the imbalance */ | 1987 | /* How much load to actually move to equalise the imbalance */ |
| 1956 | *imbalance = min((max_load - avg_load) * busiest->cpu_power, | 1988 | *imbalance = min(max_pull * busiest->cpu_power, |
| 1957 | (avg_load - this_load) * this->cpu_power) | 1989 | (avg_load - this_load) * this->cpu_power) |
| 1958 | / SCHED_LOAD_SCALE; | 1990 | / SCHED_LOAD_SCALE; |
| 1959 | 1991 | ||
| @@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 2050 | unsigned long imbalance; | 2082 | unsigned long imbalance; |
| 2051 | int nr_moved, all_pinned = 0; | 2083 | int nr_moved, all_pinned = 0; |
| 2052 | int active_balance = 0; | 2084 | int active_balance = 0; |
| 2085 | int sd_idle = 0; | ||
| 2086 | |||
| 2087 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | ||
| 2088 | sd_idle = 1; | ||
| 2053 | 2089 | ||
| 2054 | spin_lock(&this_rq->lock); | ||
| 2055 | schedstat_inc(sd, lb_cnt[idle]); | 2090 | schedstat_inc(sd, lb_cnt[idle]); |
| 2056 | 2091 | ||
| 2057 | group = find_busiest_group(sd, this_cpu, &imbalance, idle); | 2092 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); |
| 2058 | if (!group) { | 2093 | if (!group) { |
| 2059 | schedstat_inc(sd, lb_nobusyg[idle]); | 2094 | schedstat_inc(sd, lb_nobusyg[idle]); |
| 2060 | goto out_balanced; | 2095 | goto out_balanced; |
| @@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 2078 | * still unbalanced. nr_moved simply stays zero, so it is | 2113 | * still unbalanced. nr_moved simply stays zero, so it is |
| 2079 | * correctly treated as an imbalance. | 2114 | * correctly treated as an imbalance. |
| 2080 | */ | 2115 | */ |
| 2081 | double_lock_balance(this_rq, busiest); | 2116 | double_rq_lock(this_rq, busiest); |
| 2082 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2117 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 2083 | imbalance, sd, idle, | 2118 | imbalance, sd, idle, &all_pinned); |
| 2084 | &all_pinned); | 2119 | double_rq_unlock(this_rq, busiest); |
| 2085 | spin_unlock(&busiest->lock); | ||
| 2086 | 2120 | ||
| 2087 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2121 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 2088 | if (unlikely(all_pinned)) | 2122 | if (unlikely(all_pinned)) |
| 2089 | goto out_balanced; | 2123 | goto out_balanced; |
| 2090 | } | 2124 | } |
| 2091 | 2125 | ||
| 2092 | spin_unlock(&this_rq->lock); | ||
| 2093 | |||
| 2094 | if (!nr_moved) { | 2126 | if (!nr_moved) { |
| 2095 | schedstat_inc(sd, lb_failed[idle]); | 2127 | schedstat_inc(sd, lb_failed[idle]); |
| 2096 | sd->nr_balance_failed++; | 2128 | sd->nr_balance_failed++; |
| @@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 2098 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2130 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
| 2099 | 2131 | ||
| 2100 | spin_lock(&busiest->lock); | 2132 | spin_lock(&busiest->lock); |
| 2133 | |||
| 2134 | /* don't kick the migration_thread, if the curr | ||
| 2135 | * task on busiest cpu can't be moved to this_cpu | ||
| 2136 | */ | ||
| 2137 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | ||
| 2138 | spin_unlock(&busiest->lock); | ||
| 2139 | all_pinned = 1; | ||
| 2140 | goto out_one_pinned; | ||
| 2141 | } | ||
| 2142 | |||
| 2101 | if (!busiest->active_balance) { | 2143 | if (!busiest->active_balance) { |
| 2102 | busiest->active_balance = 1; | 2144 | busiest->active_balance = 1; |
| 2103 | busiest->push_cpu = this_cpu; | 2145 | busiest->push_cpu = this_cpu; |
| @@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 2130 | sd->balance_interval *= 2; | 2172 | sd->balance_interval *= 2; |
| 2131 | } | 2173 | } |
| 2132 | 2174 | ||
| 2175 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | ||
| 2176 | return -1; | ||
| 2133 | return nr_moved; | 2177 | return nr_moved; |
| 2134 | 2178 | ||
| 2135 | out_balanced: | 2179 | out_balanced: |
| 2136 | spin_unlock(&this_rq->lock); | ||
| 2137 | |||
| 2138 | schedstat_inc(sd, lb_balanced[idle]); | 2180 | schedstat_inc(sd, lb_balanced[idle]); |
| 2139 | 2181 | ||
| 2140 | sd->nr_balance_failed = 0; | 2182 | sd->nr_balance_failed = 0; |
| 2183 | |||
| 2184 | out_one_pinned: | ||
| 2141 | /* tune up the balancing interval */ | 2185 | /* tune up the balancing interval */ |
| 2142 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 2186 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
| 2143 | (sd->balance_interval < sd->max_interval)) | 2187 | (sd->balance_interval < sd->max_interval)) |
| 2144 | sd->balance_interval *= 2; | 2188 | sd->balance_interval *= 2; |
| 2145 | 2189 | ||
| 2190 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | ||
| 2191 | return -1; | ||
| 2146 | return 0; | 2192 | return 0; |
| 2147 | } | 2193 | } |
| 2148 | 2194 | ||
| @@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
| 2160 | runqueue_t *busiest = NULL; | 2206 | runqueue_t *busiest = NULL; |
| 2161 | unsigned long imbalance; | 2207 | unsigned long imbalance; |
| 2162 | int nr_moved = 0; | 2208 | int nr_moved = 0; |
| 2209 | int sd_idle = 0; | ||
| 2210 | |||
| 2211 | if (sd->flags & SD_SHARE_CPUPOWER) | ||
| 2212 | sd_idle = 1; | ||
| 2163 | 2213 | ||
| 2164 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2214 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
| 2165 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); | 2215 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); |
| 2166 | if (!group) { | 2216 | if (!group) { |
| 2167 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2217 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
| 2168 | goto out_balanced; | 2218 | goto out_balanced; |
| @@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
| 2176 | 2226 | ||
| 2177 | BUG_ON(busiest == this_rq); | 2227 | BUG_ON(busiest == this_rq); |
| 2178 | 2228 | ||
| 2179 | /* Attempt to move tasks */ | ||
| 2180 | double_lock_balance(this_rq, busiest); | ||
| 2181 | |||
| 2182 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | 2229 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); |
| 2183 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2230 | |
| 2231 | nr_moved = 0; | ||
| 2232 | if (busiest->nr_running > 1) { | ||
| 2233 | /* Attempt to move tasks */ | ||
| 2234 | double_lock_balance(this_rq, busiest); | ||
| 2235 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | ||
| 2184 | imbalance, sd, NEWLY_IDLE, NULL); | 2236 | imbalance, sd, NEWLY_IDLE, NULL); |
| 2185 | if (!nr_moved) | 2237 | spin_unlock(&busiest->lock); |
| 2238 | } | ||
| 2239 | |||
| 2240 | if (!nr_moved) { | ||
| 2186 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | 2241 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); |
| 2187 | else | 2242 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) |
| 2243 | return -1; | ||
| 2244 | } else | ||
| 2188 | sd->nr_balance_failed = 0; | 2245 | sd->nr_balance_failed = 0; |
| 2189 | 2246 | ||
| 2190 | spin_unlock(&busiest->lock); | ||
| 2191 | return nr_moved; | 2247 | return nr_moved; |
| 2192 | 2248 | ||
| 2193 | out_balanced: | 2249 | out_balanced: |
| 2194 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2250 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
| 2251 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | ||
| 2252 | return -1; | ||
| 2195 | sd->nr_balance_failed = 0; | 2253 | sd->nr_balance_failed = 0; |
| 2196 | return 0; | 2254 | return 0; |
| 2197 | } | 2255 | } |
| @@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
| 2316 | 2374 | ||
| 2317 | if (j - sd->last_balance >= interval) { | 2375 | if (j - sd->last_balance >= interval) { |
| 2318 | if (load_balance(this_cpu, this_rq, sd, idle)) { | 2376 | if (load_balance(this_cpu, this_rq, sd, idle)) { |
| 2319 | /* We've pulled tasks over so no longer idle */ | 2377 | /* |
| 2378 | * We've pulled tasks over so either we're no | ||
| 2379 | * longer idle, or one of our SMT siblings is | ||
| 2380 | * not idle. | ||
| 2381 | */ | ||
| 2320 | idle = NOT_IDLE; | 2382 | idle = NOT_IDLE; |
| 2321 | } | 2383 | } |
| 2322 | sd->last_balance += interval; | 2384 | sd->last_balance += interval; |
| @@ -2575,6 +2637,13 @@ out: | |||
| 2575 | } | 2637 | } |
| 2576 | 2638 | ||
| 2577 | #ifdef CONFIG_SCHED_SMT | 2639 | #ifdef CONFIG_SCHED_SMT |
| 2640 | static inline void wakeup_busy_runqueue(runqueue_t *rq) | ||
| 2641 | { | ||
| 2642 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ | ||
| 2643 | if (rq->curr == rq->idle && rq->nr_running) | ||
| 2644 | resched_task(rq->idle); | ||
| 2645 | } | ||
| 2646 | |||
| 2578 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2647 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) |
| 2579 | { | 2648 | { |
| 2580 | struct sched_domain *tmp, *sd = NULL; | 2649 | struct sched_domain *tmp, *sd = NULL; |
| @@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
| 2608 | for_each_cpu_mask(i, sibling_map) { | 2677 | for_each_cpu_mask(i, sibling_map) { |
| 2609 | runqueue_t *smt_rq = cpu_rq(i); | 2678 | runqueue_t *smt_rq = cpu_rq(i); |
| 2610 | 2679 | ||
| 2611 | /* | 2680 | wakeup_busy_runqueue(smt_rq); |
| 2612 | * If an SMT sibling task is sleeping due to priority | ||
| 2613 | * reasons wake it up now. | ||
| 2614 | */ | ||
| 2615 | if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) | ||
| 2616 | resched_task(smt_rq->idle); | ||
| 2617 | } | 2681 | } |
| 2618 | 2682 | ||
| 2619 | for_each_cpu_mask(i, sibling_map) | 2683 | for_each_cpu_mask(i, sibling_map) |
| @@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
| 2624 | */ | 2688 | */ |
| 2625 | } | 2689 | } |
| 2626 | 2690 | ||
| 2691 | /* | ||
| 2692 | * number of 'lost' timeslices this task won't be able to fully | ||
| 2693 | * utilize, if another task runs on a sibling. This models the | ||
| 2694 | * slowdown effect of other tasks running on siblings: | ||
| 2695 | */ | ||
| 2696 | static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | ||
| 2697 | { | ||
| 2698 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | ||
| 2699 | } | ||
| 2700 | |||
| 2627 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2701 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
| 2628 | { | 2702 | { |
| 2629 | struct sched_domain *tmp, *sd = NULL; | 2703 | struct sched_domain *tmp, *sd = NULL; |
| @@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
| 2667 | runqueue_t *smt_rq = cpu_rq(i); | 2741 | runqueue_t *smt_rq = cpu_rq(i); |
| 2668 | task_t *smt_curr = smt_rq->curr; | 2742 | task_t *smt_curr = smt_rq->curr; |
| 2669 | 2743 | ||
| 2744 | /* Kernel threads do not participate in dependent sleeping */ | ||
| 2745 | if (!p->mm || !smt_curr->mm || rt_task(p)) | ||
| 2746 | goto check_smt_task; | ||
| 2747 | |||
| 2670 | /* | 2748 | /* |
| 2671 | * If a user task with lower static priority than the | 2749 | * If a user task with lower static priority than the |
| 2672 | * running task on the SMT sibling is trying to schedule, | 2750 | * running task on the SMT sibling is trying to schedule, |
| @@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
| 2675 | * task from using an unfair proportion of the | 2753 | * task from using an unfair proportion of the |
| 2676 | * physical cpu's resources. -ck | 2754 | * physical cpu's resources. -ck |
| 2677 | */ | 2755 | */ |
| 2678 | if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > | 2756 | if (rt_task(smt_curr)) { |
| 2679 | task_timeslice(p) || rt_task(smt_curr)) && | 2757 | /* |
| 2680 | p->mm && smt_curr->mm && !rt_task(p)) | 2758 | * With real time tasks we run non-rt tasks only |
| 2681 | ret = 1; | 2759 | * per_cpu_gain% of the time. |
| 2760 | */ | ||
| 2761 | if ((jiffies % DEF_TIMESLICE) > | ||
| 2762 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
| 2763 | ret = 1; | ||
| 2764 | } else | ||
| 2765 | if (smt_curr->static_prio < p->static_prio && | ||
| 2766 | !TASK_PREEMPTS_CURR(p, smt_rq) && | ||
| 2767 | smt_slice(smt_curr, sd) > task_timeslice(p)) | ||
| 2768 | ret = 1; | ||
| 2769 | |||
| 2770 | check_smt_task: | ||
| 2771 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
| 2772 | rt_task(smt_curr)) | ||
| 2773 | continue; | ||
| 2774 | if (!p->mm) { | ||
| 2775 | wakeup_busy_runqueue(smt_rq); | ||
| 2776 | continue; | ||
| 2777 | } | ||
| 2682 | 2778 | ||
| 2683 | /* | 2779 | /* |
| 2684 | * Reschedule a lower priority task on the SMT sibling, | 2780 | * Reschedule a lower priority task on the SMT sibling for |
| 2685 | * or wake it up if it has been put to sleep for priority | 2781 | * it to be put to sleep, or wake it up if it has been put to |
| 2686 | * reasons. | 2782 | * sleep for priority reasons to see if it should run now. |
| 2687 | */ | 2783 | */ |
| 2688 | if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > | 2784 | if (rt_task(p)) { |
| 2689 | task_timeslice(smt_curr) || rt_task(p)) && | 2785 | if ((jiffies % DEF_TIMESLICE) > |
| 2690 | smt_curr->mm && p->mm && !rt_task(smt_curr)) || | 2786 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
| 2691 | (smt_curr == smt_rq->idle && smt_rq->nr_running)) | 2787 | resched_task(smt_curr); |
| 2692 | resched_task(smt_curr); | 2788 | } else { |
| 2789 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
| 2790 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
| 2791 | resched_task(smt_curr); | ||
| 2792 | else | ||
| 2793 | wakeup_busy_runqueue(smt_rq); | ||
| 2794 | } | ||
| 2693 | } | 2795 | } |
| 2694 | out_unlock: | 2796 | out_unlock: |
| 2695 | for_each_cpu_mask(i, sibling_map) | 2797 | for_each_cpu_mask(i, sibling_map) |
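The reworked dependent_sleeper() logic above separates two cases: next to a real-time task on the sibling, a normal task only runs for per_cpu_gain% of each DEF_TIMESLICE window (the jiffies modulo check), and between two normal tasks the decision compares smt_slice() — the timeslice discounted by (100 - per_cpu_gain)% — against the other task's full timeslice. A hedged sketch of those two checks with assumed values:

#include <stdio.h>

#define DEF_TIMESLICE 100	/* window length in ticks, assumed for illustration */

/* portion of a timeslice left after the SMT-sibling slowdown, as in smt_slice() */
static unsigned long smt_slice_like(unsigned long time_slice, int per_cpu_gain)
{
	return time_slice * (100 - per_cpu_gain) / 100;
}

/* should a normal task yield to an RT task on the sibling at this tick? */
static int throttled_vs_rt(unsigned long jiffies, int per_cpu_gain)
{
	return (jiffies % DEF_TIMESLICE) > (per_cpu_gain * DEF_TIMESLICE / 100);
}

int main(void)
{
	int per_cpu_gain = 25;	/* assumed gain of 25% */

	/* with a 25% gain, an 80-tick slice is worth 60 ticks beside a busy sibling */
	printf("smt_slice(80) = %lu\n", smt_slice_like(80, per_cpu_gain));

	/* ticks 0..25 of each 100-tick window run, the rest are throttled vs RT */
	printf("tick 10 throttled: %d, tick 70 throttled: %d\n",
	       throttled_vs_rt(10, per_cpu_gain), throttled_vs_rt(70, per_cpu_gain));
	return 0;
}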
| @@ -2887,6 +2989,7 @@ switch_tasks: | |||
| 2887 | if (next == rq->idle) | 2989 | if (next == rq->idle) |
| 2888 | schedstat_inc(rq, sched_goidle); | 2990 | schedstat_inc(rq, sched_goidle); |
| 2889 | prefetch(next); | 2991 | prefetch(next); |
| 2992 | prefetch_stack(next); | ||
| 2890 | clear_tsk_need_resched(prev); | 2993 | clear_tsk_need_resched(prev); |
| 2891 | rcu_qsctr_inc(task_cpu(prev)); | 2994 | rcu_qsctr_inc(task_cpu(prev)); |
| 2892 | 2995 | ||
| @@ -3014,7 +3117,8 @@ need_resched: | |||
| 3014 | 3117 | ||
| 3015 | #endif /* CONFIG_PREEMPT */ | 3118 | #endif /* CONFIG_PREEMPT */ |
| 3016 | 3119 | ||
| 3017 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) | 3120 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
| 3121 | void *key) | ||
| 3018 | { | 3122 | { |
| 3019 | task_t *p = curr->private; | 3123 | task_t *p = curr->private; |
| 3020 | return try_to_wake_up(p, mode, sync); | 3124 | return try_to_wake_up(p, mode, sync); |
| @@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
| 3056 | * @key: is directly passed to the wakeup function | 3160 | * @key: is directly passed to the wakeup function |
| 3057 | */ | 3161 | */ |
| 3058 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | 3162 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, |
| 3059 | int nr_exclusive, void *key) | 3163 | int nr_exclusive, void *key) |
| 3060 | { | 3164 | { |
| 3061 | unsigned long flags; | 3165 | unsigned long flags; |
| 3062 | 3166 | ||
| @@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | |||
| 3088 | * | 3192 | * |
| 3089 | * On UP it can prevent extra preemption. | 3193 | * On UP it can prevent extra preemption. |
| 3090 | */ | 3194 | */ |
| 3091 | void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | 3195 | void fastcall |
| 3196 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
| 3092 | { | 3197 | { |
| 3093 | unsigned long flags; | 3198 | unsigned long flags; |
| 3094 | int sync = 1; | 3199 | int sync = 1; |
| @@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | |||
| 3279 | 3384 | ||
| 3280 | EXPORT_SYMBOL(interruptible_sleep_on); | 3385 | EXPORT_SYMBOL(interruptible_sleep_on); |
| 3281 | 3386 | ||
| 3282 | long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3387 | long fastcall __sched |
| 3388 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | ||
| 3283 | { | 3389 | { |
| 3284 | SLEEP_ON_VAR | 3390 | SLEEP_ON_VAR |
| 3285 | 3391 | ||
| @@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
| 3498 | * @policy: new policy. | 3604 | * @policy: new policy. |
| 3499 | * @param: structure containing the new RT priority. | 3605 | * @param: structure containing the new RT priority. |
| 3500 | */ | 3606 | */ |
| 3501 | int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) | 3607 | int sched_setscheduler(struct task_struct *p, int policy, |
| 3608 | struct sched_param *param) | ||
| 3502 | { | 3609 | { |
| 3503 | int retval; | 3610 | int retval; |
| 3504 | int oldprio, oldpolicy = -1; | 3611 | int oldprio, oldpolicy = -1; |
| @@ -3518,7 +3625,7 @@ recheck: | |||
| 3518 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. | 3625 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. |
| 3519 | */ | 3626 | */ |
| 3520 | if (param->sched_priority < 0 || | 3627 | if (param->sched_priority < 0 || |
| 3521 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 3628 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
| 3522 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 3629 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
| 3523 | return -EINVAL; | 3630 | return -EINVAL; |
| 3524 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) | 3631 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) |
| @@ -3581,7 +3688,8 @@ recheck: | |||
| 3581 | } | 3688 | } |
| 3582 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 3689 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
| 3583 | 3690 | ||
| 3584 | static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 3691 | static int |
| 3692 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | ||
| 3585 | { | 3693 | { |
| 3586 | int retval; | 3694 | int retval; |
| 3587 | struct sched_param lparam; | 3695 | struct sched_param lparam; |
| @@ -3771,6 +3879,7 @@ EXPORT_SYMBOL(cpu_present_map); | |||
| 3771 | 3879 | ||
| 3772 | #ifndef CONFIG_SMP | 3880 | #ifndef CONFIG_SMP |
| 3773 | cpumask_t cpu_online_map = CPU_MASK_ALL; | 3881 | cpumask_t cpu_online_map = CPU_MASK_ALL; |
| 3882 | EXPORT_SYMBOL_GPL(cpu_online_map); | ||
| 3774 | cpumask_t cpu_possible_map = CPU_MASK_ALL; | 3883 | cpumask_t cpu_possible_map = CPU_MASK_ALL; |
| 3775 | #endif | 3884 | #endif |
| 3776 | 3885 | ||
| @@ -3848,7 +3957,7 @@ asmlinkage long sys_sched_yield(void) | |||
| 3848 | if (rt_task(current)) | 3957 | if (rt_task(current)) |
| 3849 | target = rq->active; | 3958 | target = rq->active; |
| 3850 | 3959 | ||
| 3851 | if (current->array->nr_active == 1) { | 3960 | if (array->nr_active == 1) { |
| 3852 | schedstat_inc(rq, yld_act_empty); | 3961 | schedstat_inc(rq, yld_act_empty); |
| 3853 | if (!rq->expired->nr_active) | 3962 | if (!rq->expired->nr_active) |
| 3854 | schedstat_inc(rq, yld_both_empty); | 3963 | schedstat_inc(rq, yld_both_empty); |
| @@ -3912,7 +4021,7 @@ EXPORT_SYMBOL(cond_resched); | |||
| 3912 | * operations here to prevent schedule() from being called twice (once via | 4021 | * operations here to prevent schedule() from being called twice (once via |
| 3913 | * spin_unlock(), once by hand). | 4022 | * spin_unlock(), once by hand). |
| 3914 | */ | 4023 | */ |
| 3915 | int cond_resched_lock(spinlock_t * lock) | 4024 | int cond_resched_lock(spinlock_t *lock) |
| 3916 | { | 4025 | { |
| 3917 | int ret = 0; | 4026 | int ret = 0; |
| 3918 | 4027 | ||
| @@ -4095,7 +4204,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) | |||
| 4095 | return list_entry(p->sibling.next,struct task_struct,sibling); | 4204 | return list_entry(p->sibling.next,struct task_struct,sibling); |
| 4096 | } | 4205 | } |
| 4097 | 4206 | ||
| 4098 | static void show_task(task_t * p) | 4207 | static void show_task(task_t *p) |
| 4099 | { | 4208 | { |
| 4100 | task_t *relative; | 4209 | task_t *relative; |
| 4101 | unsigned state; | 4210 | unsigned state; |
| @@ -4121,7 +4230,7 @@ static void show_task(task_t * p) | |||
| 4121 | #endif | 4230 | #endif |
| 4122 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4231 | #ifdef CONFIG_DEBUG_STACK_USAGE |
| 4123 | { | 4232 | { |
| 4124 | unsigned long * n = (unsigned long *) (p->thread_info+1); | 4233 | unsigned long *n = (unsigned long *) (p->thread_info+1); |
| 4125 | while (!*n) | 4234 | while (!*n) |
| 4126 | n++; | 4235 | n++; |
| 4127 | free = (unsigned long) n - (unsigned long)(p->thread_info+1); | 4236 | free = (unsigned long) n - (unsigned long)(p->thread_info+1); |
| @@ -4330,7 +4439,7 @@ out: | |||
| 4330 | * thread migration by bumping thread off CPU then 'pushing' onto | 4439 | * thread migration by bumping thread off CPU then 'pushing' onto |
| 4331 | * another runqueue. | 4440 | * another runqueue. |
| 4332 | */ | 4441 | */ |
| 4333 | static int migration_thread(void * data) | 4442 | static int migration_thread(void *data) |
| 4334 | { | 4443 | { |
| 4335 | runqueue_t *rq; | 4444 | runqueue_t *rq; |
| 4336 | int cpu = (long)data; | 4445 | int cpu = (long)data; |
| @@ -4779,7 +4888,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, | |||
| 4779 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 4888 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
| 4780 | * hold the hotplug lock. | 4889 | * hold the hotplug lock. |
| 4781 | */ | 4890 | */ |
| 4782 | void cpu_attach_domain(struct sched_domain *sd, int cpu) | 4891 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
| 4783 | { | 4892 | { |
| 4784 | runqueue_t *rq = cpu_rq(cpu); | 4893 | runqueue_t *rq = cpu_rq(cpu); |
| 4785 | struct sched_domain *tmp; | 4894 | struct sched_domain *tmp; |
| @@ -4802,7 +4911,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
| 4802 | } | 4911 | } |
| 4803 | 4912 | ||
| 4804 | /* cpus with isolated domains */ | 4913 | /* cpus with isolated domains */ |
| 4805 | cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; | 4914 | static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; |
| 4806 | 4915 | ||
| 4807 | /* Setup the mask of cpus configured for isolated domains */ | 4916 | /* Setup the mask of cpus configured for isolated domains */ |
| 4808 | static int __init isolated_cpu_setup(char *str) | 4917 | static int __init isolated_cpu_setup(char *str) |
| @@ -4830,8 +4939,8 @@ __setup ("isolcpus=", isolated_cpu_setup); | |||
| 4830 | * covered by the given span, and will set each group's ->cpumask correctly, | 4939 | * covered by the given span, and will set each group's ->cpumask correctly, |
| 4831 | * and ->cpu_power to 0. | 4940 | * and ->cpu_power to 0. |
| 4832 | */ | 4941 | */ |
| 4833 | void init_sched_build_groups(struct sched_group groups[], | 4942 | static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, |
| 4834 | cpumask_t span, int (*group_fn)(int cpu)) | 4943 | int (*group_fn)(int cpu)) |
| 4835 | { | 4944 | { |
| 4836 | struct sched_group *first = NULL, *last = NULL; | 4945 | struct sched_group *first = NULL, *last = NULL; |
| 4837 | cpumask_t covered = CPU_MASK_NONE; | 4946 | cpumask_t covered = CPU_MASK_NONE; |
| @@ -4864,12 +4973,85 @@ void init_sched_build_groups(struct sched_group groups[], | |||
| 4864 | last->next = first; | 4973 | last->next = first; |
| 4865 | } | 4974 | } |
| 4866 | 4975 | ||
| 4976 | #define SD_NODES_PER_DOMAIN 16 | ||
| 4867 | 4977 | ||
| 4868 | #ifdef ARCH_HAS_SCHED_DOMAIN | 4978 | #ifdef CONFIG_NUMA |
| 4869 | extern void build_sched_domains(const cpumask_t *cpu_map); | 4979 | /** |
| 4870 | extern void arch_init_sched_domains(const cpumask_t *cpu_map); | 4980 | * find_next_best_node - find the next node to include in a sched_domain |
| 4871 | extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); | 4981 | * @node: node whose sched_domain we're building |
| 4872 | #else | 4982 | * @used_nodes: nodes already in the sched_domain |
| 4983 | * | ||
| 4984 | * Find the next node to include in a given scheduling domain. Simply | ||
| 4985 | * finds the closest node not already in the @used_nodes map. | ||
| 4986 | * | ||
| 4987 | * Should use nodemask_t. | ||
| 4988 | */ | ||
| 4989 | static int find_next_best_node(int node, unsigned long *used_nodes) | ||
| 4990 | { | ||
| 4991 | int i, n, val, min_val, best_node = 0; | ||
| 4992 | |||
| 4993 | min_val = INT_MAX; | ||
| 4994 | |||
| 4995 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
| 4996 | /* Start at @node */ | ||
| 4997 | n = (node + i) % MAX_NUMNODES; | ||
| 4998 | |||
| 4999 | if (!nr_cpus_node(n)) | ||
| 5000 | continue; | ||
| 5001 | |||
| 5002 | /* Skip already used nodes */ | ||
| 5003 | if (test_bit(n, used_nodes)) | ||
| 5004 | continue; | ||
| 5005 | |||
| 5006 | /* Simple min distance search */ | ||
| 5007 | val = node_distance(node, n); | ||
| 5008 | |||
| 5009 | if (val < min_val) { | ||
| 5010 | min_val = val; | ||
| 5011 | best_node = n; | ||
| 5012 | } | ||
| 5013 | } | ||
| 5014 | |||
| 5015 | set_bit(best_node, used_nodes); | ||
| 5016 | return best_node; | ||
| 5017 | } | ||
| 5018 | |||
| 5019 | /** | ||
| 5020 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
| 5021 | * @node: node whose cpumask we're constructing | ||
| 5022 | * @size: number of nodes to include in this span | ||
| 5023 | * | ||
| 5024 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
| 5025 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
| 5026 | * out optimally. | ||
| 5027 | */ | ||
| 5028 | static cpumask_t sched_domain_node_span(int node) | ||
| 5029 | { | ||
| 5030 | int i; | ||
| 5031 | cpumask_t span, nodemask; | ||
| 5032 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | ||
| 5033 | |||
| 5034 | cpus_clear(span); | ||
| 5035 | bitmap_zero(used_nodes, MAX_NUMNODES); | ||
| 5036 | |||
| 5037 | nodemask = node_to_cpumask(node); | ||
| 5038 | cpus_or(span, span, nodemask); | ||
| 5039 | set_bit(node, used_nodes); | ||
| 5040 | |||
| 5041 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
| 5042 | int next_node = find_next_best_node(node, used_nodes); | ||
| 5043 | nodemask = node_to_cpumask(next_node); | ||
| 5044 | cpus_or(span, span, nodemask); | ||
| 5045 | } | ||
| 5046 | |||
| 5047 | return span; | ||
| 5048 | } | ||
| 5049 | #endif | ||
| 5050 | |||
| 5051 | /* | ||
| 5052 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | ||
| 5053 | * can switch it on easily if needed. | ||
| 5054 | */ | ||
| 4873 | #ifdef CONFIG_SCHED_SMT | 5055 | #ifdef CONFIG_SCHED_SMT |
| 4874 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 5056 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
| 4875 | static struct sched_group sched_group_cpus[NR_CPUS]; | 5057 | static struct sched_group sched_group_cpus[NR_CPUS]; |
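find_next_best_node() above is a simple greedy search: starting from a node, repeatedly pick the closest not-yet-used node by node_distance(), and sched_domain_node_span() ORs up to SD_NODES_PER_DOMAIN of those node masks into one span. A hedged user-space model of the greedy selection over an assumed small distance matrix (it omits the nr_cpus_node() check for nodes without CPUs):

#include <limits.h>
#include <stdio.h>

#define NNODES 4

/* assumed symmetric node distances; the diagonal is local access */
static const int dist[NNODES][NNODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* pick the closest not-yet-used node to 'node', marking it used */
static int find_next_best_node_like(int node, int *used)
{
	int i, best = 0, min_val = INT_MAX;

	for (i = 0; i < NNODES; i++) {
		int n = (node + i) % NNODES;	/* start the scan at 'node' itself */

		if (used[n])
			continue;		/* skip already used nodes */
		if (dist[node][n] < min_val) {
			min_val = dist[node][n];
			best = n;
		}
	}
	used[best] = 1;
	return best;
}

int main(void)
{
	int used[NNODES] = { 0 };
	int i;

	used[0] = 1;			/* the span always begins with the node itself */
	printf("span order for node 0: 0");
	for (i = 1; i < NNODES; i++)
		printf(" %d", find_next_best_node_like(0, used));
	printf("\n");			/* prints: 0 1 2 3 (1 is closest, then the ties) */
	return 0;
}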
| @@ -4891,36 +5073,20 @@ static int cpu_to_phys_group(int cpu) | |||
| 4891 | } | 5073 | } |
| 4892 | 5074 | ||
| 4893 | #ifdef CONFIG_NUMA | 5075 | #ifdef CONFIG_NUMA |
| 4894 | |||
| 4895 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | ||
| 4896 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; | ||
| 4897 | static int cpu_to_node_group(int cpu) | ||
| 4898 | { | ||
| 4899 | return cpu_to_node(cpu); | ||
| 4900 | } | ||
| 4901 | #endif | ||
| 4902 | |||
| 4903 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
| 4904 | /* | 5076 | /* |
| 4905 | * The domains setup code relies on siblings not spanning | 5077 | * The init_sched_build_groups can't handle what we want to do with node |
| 4906 | * multiple nodes. Make sure the architecture has a proper | 5078 | * groups, so roll our own. Now each node has its own list of groups which |
| 4907 | * siblings map: | 5079 | * gets dynamically allocated. |
| 4908 | */ | 5080 | */ |
| 4909 | static void check_sibling_maps(void) | 5081 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
| 4910 | { | 5082 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
| 4911 | int i, j; | ||
| 4912 | 5083 | ||
| 4913 | for_each_online_cpu(i) { | 5084 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
| 4914 | for_each_cpu_mask(j, cpu_sibling_map[i]) { | 5085 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; |
| 4915 | if (cpu_to_node(i) != cpu_to_node(j)) { | 5086 | |
| 4916 | printk(KERN_INFO "warning: CPU %d siblings map " | 5087 | static int cpu_to_allnodes_group(int cpu) |
| 4917 | "to different node - isolating " | 5088 | { |
| 4918 | "them.\n", i); | 5089 | return cpu_to_node(cpu); |
| 4919 | cpu_sibling_map[i] = cpumask_of_cpu(i); | ||
| 4920 | break; | ||
| 4921 | } | ||
| 4922 | } | ||
| 4923 | } | ||
| 4924 | } | 5090 | } |
| 4925 | #endif | 5091 | #endif |
| 4926 | 5092 | ||
| @@ -4928,9 +5094,24 @@ static void check_sibling_maps(void) | |||
| 4928 | * Build sched domains for a given set of cpus and attach the sched domains | 5094 | * Build sched domains for a given set of cpus and attach the sched domains |
| 4929 | * to the individual cpus | 5095 | * to the individual cpus |
| 4930 | */ | 5096 | */ |
| 4931 | static void build_sched_domains(const cpumask_t *cpu_map) | 5097 | void build_sched_domains(const cpumask_t *cpu_map) |
| 4932 | { | 5098 | { |
| 4933 | int i; | 5099 | int i; |
| 5100 | #ifdef CONFIG_NUMA | ||
| 5101 | struct sched_group **sched_group_nodes = NULL; | ||
| 5102 | struct sched_group *sched_group_allnodes = NULL; | ||
| 5103 | |||
| 5104 | /* | ||
| 5105 | * Allocate the per-node list of sched groups | ||
| 5106 | */ | ||
| 5107 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | ||
| 5108 | GFP_ATOMIC); | ||
| 5109 | if (!sched_group_nodes) { | ||
| 5110 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
| 5111 | return; | ||
| 5112 | } | ||
| 5113 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
| 5114 | #endif | ||
| 4934 | 5115 | ||
| 4935 | /* | 5116 | /* |
| 4936 | * Set up domains for cpus specified by the cpu_map. | 5117 | * Set up domains for cpus specified by the cpu_map. |
| @@ -4943,11 +5124,35 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
| 4943 | cpus_and(nodemask, nodemask, *cpu_map); | 5124 | cpus_and(nodemask, nodemask, *cpu_map); |
| 4944 | 5125 | ||
| 4945 | #ifdef CONFIG_NUMA | 5126 | #ifdef CONFIG_NUMA |
| 5127 | if (cpus_weight(*cpu_map) | ||
| 5128 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | ||
| 5129 | if (!sched_group_allnodes) { | ||
| 5130 | sched_group_allnodes | ||
| 5131 | = kmalloc(sizeof(struct sched_group) | ||
| 5132 | * MAX_NUMNODES, | ||
| 5133 | GFP_KERNEL); | ||
| 5134 | if (!sched_group_allnodes) { | ||
| 5135 | printk(KERN_WARNING | ||
| 5136 | "Can not alloc allnodes sched group\n"); | ||
| 5137 | break; | ||
| 5138 | } | ||
| 5139 | sched_group_allnodes_bycpu[i] | ||
| 5140 | = sched_group_allnodes; | ||
| 5141 | } | ||
| 5142 | sd = &per_cpu(allnodes_domains, i); | ||
| 5143 | *sd = SD_ALLNODES_INIT; | ||
| 5144 | sd->span = *cpu_map; | ||
| 5145 | group = cpu_to_allnodes_group(i); | ||
| 5146 | sd->groups = &sched_group_allnodes[group]; | ||
| 5147 | p = sd; | ||
| 5148 | } else | ||
| 5149 | p = NULL; | ||
| 5150 | |||
| 4946 | sd = &per_cpu(node_domains, i); | 5151 | sd = &per_cpu(node_domains, i); |
| 4947 | group = cpu_to_node_group(i); | ||
| 4948 | *sd = SD_NODE_INIT; | 5152 | *sd = SD_NODE_INIT; |
| 4949 | sd->span = *cpu_map; | 5153 | sd->span = sched_domain_node_span(cpu_to_node(i)); |
| 4950 | sd->groups = &sched_group_nodes[group]; | 5154 | sd->parent = p; |
| 5155 | cpus_and(sd->span, sd->span, *cpu_map); | ||
| 4951 | #endif | 5156 | #endif |
| 4952 | 5157 | ||
| 4953 | p = sd; | 5158 | p = sd; |
| @@ -4972,7 +5177,7 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
| 4972 | 5177 | ||
| 4973 | #ifdef CONFIG_SCHED_SMT | 5178 | #ifdef CONFIG_SCHED_SMT |
| 4974 | /* Set up CPU (sibling) groups */ | 5179 | /* Set up CPU (sibling) groups */ |
| 4975 | for_each_online_cpu(i) { | 5180 | for_each_cpu_mask(i, *cpu_map) { |
| 4976 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 5181 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
| 4977 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 5182 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
| 4978 | if (i != first_cpu(this_sibling_map)) | 5183 | if (i != first_cpu(this_sibling_map)) |
| @@ -4997,8 +5202,77 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
| 4997 | 5202 | ||
| 4998 | #ifdef CONFIG_NUMA | 5203 | #ifdef CONFIG_NUMA |
| 4999 | /* Set up node groups */ | 5204 | /* Set up node groups */ |
| 5000 | init_sched_build_groups(sched_group_nodes, *cpu_map, | 5205 | if (sched_group_allnodes) |
| 5001 | &cpu_to_node_group); | 5206 | init_sched_build_groups(sched_group_allnodes, *cpu_map, |
| 5207 | &cpu_to_allnodes_group); | ||
| 5208 | |||
| 5209 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
| 5210 | /* Set up node groups */ | ||
| 5211 | struct sched_group *sg, *prev; | ||
| 5212 | cpumask_t nodemask = node_to_cpumask(i); | ||
| 5213 | cpumask_t domainspan; | ||
| 5214 | cpumask_t covered = CPU_MASK_NONE; | ||
| 5215 | int j; | ||
| 5216 | |||
| 5217 | cpus_and(nodemask, nodemask, *cpu_map); | ||
| 5218 | if (cpus_empty(nodemask)) { | ||
| 5219 | sched_group_nodes[i] = NULL; | ||
| 5220 | continue; | ||
| 5221 | } | ||
| 5222 | |||
| 5223 | domainspan = sched_domain_node_span(i); | ||
| 5224 | cpus_and(domainspan, domainspan, *cpu_map); | ||
| 5225 | |||
| 5226 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | ||
| 5227 | sched_group_nodes[i] = sg; | ||
| 5228 | for_each_cpu_mask(j, nodemask) { | ||
| 5229 | struct sched_domain *sd; | ||
| 5230 | sd = &per_cpu(node_domains, j); | ||
| 5231 | sd->groups = sg; | ||
| 5232 | if (sd->groups == NULL) { | ||
| 5233 | /* Turn off balancing if we have no groups */ | ||
| 5234 | sd->flags = 0; | ||
| 5235 | } | ||
| 5236 | } | ||
| 5237 | if (!sg) { | ||
| 5238 | printk(KERN_WARNING | ||
| 5239 | "Can not alloc domain group for node %d\n", i); | ||
| 5240 | continue; | ||
| 5241 | } | ||
| 5242 | sg->cpu_power = 0; | ||
| 5243 | sg->cpumask = nodemask; | ||
| 5244 | cpus_or(covered, covered, nodemask); | ||
| 5245 | prev = sg; | ||
| 5246 | |||
| 5247 | for (j = 0; j < MAX_NUMNODES; j++) { | ||
| 5248 | cpumask_t tmp, notcovered; | ||
| 5249 | int n = (i + j) % MAX_NUMNODES; | ||
| 5250 | |||
| 5251 | cpus_complement(notcovered, covered); | ||
| 5252 | cpus_and(tmp, notcovered, *cpu_map); | ||
| 5253 | cpus_and(tmp, tmp, domainspan); | ||
| 5254 | if (cpus_empty(tmp)) | ||
| 5255 | break; | ||
| 5256 | |||
| 5257 | nodemask = node_to_cpumask(n); | ||
| 5258 | cpus_and(tmp, tmp, nodemask); | ||
| 5259 | if (cpus_empty(tmp)) | ||
| 5260 | continue; | ||
| 5261 | |||
| 5262 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | ||
| 5263 | if (!sg) { | ||
| 5264 | printk(KERN_WARNING | ||
| 5265 | "Can not alloc domain group for node %d\n", j); | ||
| 5266 | break; | ||
| 5267 | } | ||
| 5268 | sg->cpu_power = 0; | ||
| 5269 | sg->cpumask = tmp; | ||
| 5270 | cpus_or(covered, covered, tmp); | ||
| 5271 | prev->next = sg; | ||
| 5272 | prev = sg; | ||
| 5273 | } | ||
| 5274 | prev->next = sched_group_nodes[i]; | ||
| 5275 | } | ||
| 5002 | #endif | 5276 | #endif |
| 5003 | 5277 | ||
| 5004 | /* Calculate CPU power for physical packages and nodes */ | 5278 | /* Calculate CPU power for physical packages and nodes */ |
| @@ -5017,14 +5291,46 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5017 | sd->groups->cpu_power = power; | 5291 | sd->groups->cpu_power = power; |
| 5018 | 5292 | ||
| 5019 | #ifdef CONFIG_NUMA | 5293 | #ifdef CONFIG_NUMA |
| 5020 | if (i == first_cpu(sd->groups->cpumask)) { | 5294 | sd = &per_cpu(allnodes_domains, i); |
| 5021 | /* Only add "power" once for each physical package. */ | 5295 | if (sd->groups) { |
| 5022 | sd = &per_cpu(node_domains, i); | 5296 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
| 5023 | sd->groups->cpu_power += power; | 5297 | (cpus_weight(sd->groups->cpumask)-1) / 10; |
| 5298 | sd->groups->cpu_power = power; | ||
| 5024 | } | 5299 | } |
| 5025 | #endif | 5300 | #endif |
| 5026 | } | 5301 | } |
| 5027 | 5302 | ||
| 5303 | #ifdef CONFIG_NUMA | ||
| 5304 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
| 5305 | struct sched_group *sg = sched_group_nodes[i]; | ||
| 5306 | int j; | ||
| 5307 | |||
| 5308 | if (sg == NULL) | ||
| 5309 | continue; | ||
| 5310 | next_sg: | ||
| 5311 | for_each_cpu_mask(j, sg->cpumask) { | ||
| 5312 | struct sched_domain *sd; | ||
| 5313 | int power; | ||
| 5314 | |||
| 5315 | sd = &per_cpu(phys_domains, j); | ||
| 5316 | if (j != first_cpu(sd->groups->cpumask)) { | ||
| 5317 | /* | ||
| 5318 | * Only add "power" once for each | ||
| 5319 | * physical package. | ||
| 5320 | */ | ||
| 5321 | continue; | ||
| 5322 | } | ||
| 5323 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
| 5324 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
| 5325 | |||
| 5326 | sg->cpu_power += power; | ||
| 5327 | } | ||
| 5328 | sg = sg->next; | ||
| 5329 | if (sg != sched_group_nodes[i]) | ||
| 5330 | goto next_sg; | ||
| 5331 | } | ||
| 5332 | #endif | ||
| 5333 | |||
| 5028 | /* Attach the domains */ | 5334 | /* Attach the domains */ |
| 5029 | for_each_cpu_mask(i, *cpu_map) { | 5335 | for_each_cpu_mask(i, *cpu_map) { |
| 5030 | struct sched_domain *sd; | 5336 | struct sched_domain *sd; |
| @@ -5039,13 +5345,10 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5039 | /* | 5345 | /* |
| 5040 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 5346 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
| 5041 | */ | 5347 | */ |
| 5042 | static void arch_init_sched_domains(cpumask_t *cpu_map) | 5348 | static void arch_init_sched_domains(const cpumask_t *cpu_map) |
| 5043 | { | 5349 | { |
| 5044 | cpumask_t cpu_default_map; | 5350 | cpumask_t cpu_default_map; |
| 5045 | 5351 | ||
| 5046 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
| 5047 | check_sibling_maps(); | ||
| 5048 | #endif | ||
| 5049 | /* | 5352 | /* |
| 5050 | * Setup mask for cpus without special case scheduling requirements. | 5353 | * Setup mask for cpus without special case scheduling requirements. |
| 5051 | * For now this just excludes isolated cpus, but could be used to | 5354 | * For now this just excludes isolated cpus, but could be used to |
| @@ -5058,10 +5361,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map) | |||
| 5058 | 5361 | ||
| 5059 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 5362 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
| 5060 | { | 5363 | { |
| 5061 | /* Do nothing: everything is statically allocated. */ | 5364 | #ifdef CONFIG_NUMA |
| 5062 | } | 5365 | int i; |
| 5366 | int cpu; | ||
| 5063 | 5367 | ||
| 5064 | #endif /* ARCH_HAS_SCHED_DOMAIN */ | 5368 | for_each_cpu_mask(cpu, *cpu_map) { |
| 5369 | struct sched_group *sched_group_allnodes | ||
| 5370 | = sched_group_allnodes_bycpu[cpu]; | ||
| 5371 | struct sched_group **sched_group_nodes | ||
| 5372 | = sched_group_nodes_bycpu[cpu]; | ||
| 5373 | |||
| 5374 | if (sched_group_allnodes) { | ||
| 5375 | kfree(sched_group_allnodes); | ||
| 5376 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
| 5377 | } | ||
| 5378 | |||
| 5379 | if (!sched_group_nodes) | ||
| 5380 | continue; | ||
| 5381 | |||
| 5382 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
| 5383 | cpumask_t nodemask = node_to_cpumask(i); | ||
| 5384 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
| 5385 | |||
| 5386 | cpus_and(nodemask, nodemask, *cpu_map); | ||
| 5387 | if (cpus_empty(nodemask)) | ||
| 5388 | continue; | ||
| 5389 | |||
| 5390 | if (sg == NULL) | ||
| 5391 | continue; | ||
| 5392 | sg = sg->next; | ||
| 5393 | next_sg: | ||
| 5394 | oldsg = sg; | ||
| 5395 | sg = sg->next; | ||
| 5396 | kfree(oldsg); | ||
| 5397 | if (oldsg != sched_group_nodes[i]) | ||
| 5398 | goto next_sg; | ||
| 5399 | } | ||
| 5400 | kfree(sched_group_nodes); | ||
| 5401 | sched_group_nodes_bycpu[cpu] = NULL; | ||
| 5402 | } | ||
| 5403 | #endif | ||
| 5404 | } | ||
| 5065 | 5405 | ||
| 5066 | /* | 5406 | /* |
| 5067 | * Detach sched domains from a group of cpus specified in cpu_map | 5407 | * Detach sched domains from a group of cpus specified in cpu_map |
| @@ -5263,3 +5603,47 @@ void normalize_rt_tasks(void) | |||
| 5263 | } | 5603 | } |
| 5264 | 5604 | ||
| 5265 | #endif /* CONFIG_MAGIC_SYSRQ */ | 5605 | #endif /* CONFIG_MAGIC_SYSRQ */ |
| 5606 | |||
| 5607 | #ifdef CONFIG_IA64 | ||
| 5608 | /* | ||
| 5609 | * These functions are only useful for the IA64 MCA handling. | ||
| 5610 | * | ||
| 5611 | * They can only be called when the whole system has been | ||
| 5612 | * stopped - every CPU needs to be quiescent, and no scheduling | ||
| 5613 | * activity can take place. Using them for anything else would | ||
| 5614 | * be a serious bug, and as a result, they aren't even visible | ||
| 5615 | * under any other configuration. | ||
| 5616 | */ | ||
| 5617 | |||
| 5618 | /** | ||
| 5619 | * curr_task - return the current task for a given cpu. | ||
| 5620 | * @cpu: the processor in question. | ||
| 5621 | * | ||
| 5622 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | ||
| 5623 | */ | ||
| 5624 | task_t *curr_task(int cpu) | ||
| 5625 | { | ||
| 5626 | return cpu_curr(cpu); | ||
| 5627 | } | ||
| 5628 | |||
| 5629 | /** | ||
| 5630 | * set_curr_task - set the current task for a given cpu. | ||
| 5631 | * @cpu: the processor in question. | ||
| 5632 | * @p: the task pointer to set. | ||
| 5633 | * | ||
| 5634 | * Description: This function must only be used when non-maskable interrupts | ||
| 5635 | * are serviced on a separate stack. It allows the architecture to switch the | ||
| 5636 | * notion of the current task on a cpu in a non-blocking manner. This function | ||
| 5637 | * must be called with all CPUs synchronized and interrupts disabled; the | ||
| 5638 | * caller must save the original value of the current task (see | ||
| 5639 | * curr_task() above) and restore that value before reenabling interrupts and | ||
| 5640 | * re-starting the system. | ||
| 5641 | * | ||
| 5642 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | ||
| 5643 | */ | ||
| 5644 | void set_curr_task(int cpu, task_t *p) | ||
| 5645 | { | ||
| 5646 | cpu_curr(cpu) = p; | ||
| 5647 | } | ||
| 5648 | |||
| 5649 | #endif | ||
diff --git a/kernel/signal.c b/kernel/signal.c index d282fea81138..f2b96b08fb44 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask) | |||
| 262 | return sig; | 262 | return sig; |
| 263 | } | 263 | } |
| 264 | 264 | ||
| 265 | static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, | 265 | static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, |
| 266 | int override_rlimit) | 266 | int override_rlimit) |
| 267 | { | 267 | { |
| 268 | struct sigqueue *q = NULL; | 268 | struct sigqueue *q = NULL; |
| @@ -397,20 +397,8 @@ void __exit_signal(struct task_struct *tsk) | |||
| 397 | flush_sigqueue(&tsk->pending); | 397 | flush_sigqueue(&tsk->pending); |
| 398 | if (sig) { | 398 | if (sig) { |
| 399 | /* | 399 | /* |
| 400 | * We are cleaning up the signal_struct here. We delayed | 400 | * We are cleaning up the signal_struct here. |
| 401 | * calling exit_itimers until after flush_sigqueue, just in | ||
| 402 | * case our thread-local pending queue contained a queued | ||
| 403 | * timer signal that would have been cleared in | ||
| 404 | * exit_itimers. When that called sigqueue_free, it would | ||
| 405 | * attempt to re-take the tasklist_lock and deadlock. This | ||
| 406 | * can never happen if we ensure that all queues the | ||
| 407 | * timer's signal might be queued on have been flushed | ||
| 408 | * first. The shared_pending queue, and our own pending | ||
| 409 | * queue are the only queues the timer could be on, since | ||
| 410 | * there are no other threads left in the group and timer | ||
| 411 | * signals are constrained to threads inside the group. | ||
| 412 | */ | 401 | */ |
| 413 | exit_itimers(sig); | ||
| 414 | exit_thread_group_keys(sig); | 402 | exit_thread_group_keys(sig); |
| 415 | kmem_cache_free(signal_cachep, sig); | 403 | kmem_cache_free(signal_cachep, sig); |
| 416 | } | 404 | } |
| @@ -578,7 +566,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 578 | * is to alert stop-signal processing code when another | 566 | * is to alert stop-signal processing code when another |
| 579 | * processor has come along and cleared the flag. | 567 | * processor has come along and cleared the flag. |
| 580 | */ | 568 | */ |
| 581 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 569 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) |
| 570 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | ||
| 582 | } | 571 | } |
| 583 | if ( signr && | 572 | if ( signr && |
| 584 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 573 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
| @@ -678,7 +667,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 678 | 667 | ||
| 679 | /* forward decl */ | 668 | /* forward decl */ |
| 680 | static void do_notify_parent_cldstop(struct task_struct *tsk, | 669 | static void do_notify_parent_cldstop(struct task_struct *tsk, |
| 681 | struct task_struct *parent, | 670 | int to_self, |
| 682 | int why); | 671 | int why); |
| 683 | 672 | ||
| 684 | /* | 673 | /* |
| @@ -729,14 +718,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
| 729 | p->signal->group_stop_count = 0; | 718 | p->signal->group_stop_count = 0; |
| 730 | p->signal->flags = SIGNAL_STOP_CONTINUED; | 719 | p->signal->flags = SIGNAL_STOP_CONTINUED; |
| 731 | spin_unlock(&p->sighand->siglock); | 720 | spin_unlock(&p->sighand->siglock); |
| 732 | if (p->ptrace & PT_PTRACED) | 721 | do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); |
| 733 | do_notify_parent_cldstop(p, p->parent, | ||
| 734 | CLD_STOPPED); | ||
| 735 | else | ||
| 736 | do_notify_parent_cldstop( | ||
| 737 | p->group_leader, | ||
| 738 | p->group_leader->real_parent, | ||
| 739 | CLD_STOPPED); | ||
| 740 | spin_lock(&p->sighand->siglock); | 722 | spin_lock(&p->sighand->siglock); |
| 741 | } | 723 | } |
| 742 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); | 724 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); |
| @@ -777,14 +759,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
| 777 | p->signal->flags = SIGNAL_STOP_CONTINUED; | 759 | p->signal->flags = SIGNAL_STOP_CONTINUED; |
| 778 | p->signal->group_exit_code = 0; | 760 | p->signal->group_exit_code = 0; |
| 779 | spin_unlock(&p->sighand->siglock); | 761 | spin_unlock(&p->sighand->siglock); |
| 780 | if (p->ptrace & PT_PTRACED) | 762 | do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); |
| 781 | do_notify_parent_cldstop(p, p->parent, | ||
| 782 | CLD_CONTINUED); | ||
| 783 | else | ||
| 784 | do_notify_parent_cldstop( | ||
| 785 | p->group_leader, | ||
| 786 | p->group_leader->real_parent, | ||
| 787 | CLD_CONTINUED); | ||
| 788 | spin_lock(&p->sighand->siglock); | 763 | spin_lock(&p->sighand->siglock); |
| 789 | } else { | 764 | } else { |
| 790 | /* | 765 | /* |
| @@ -950,34 +925,31 @@ force_sig_specific(int sig, struct task_struct *t) | |||
| 950 | * as soon as they're available, so putting the signal on the shared queue | 925 | * as soon as they're available, so putting the signal on the shared queue |
| 951 | * will be equivalent to sending it to one such thread. | 926 | * will be equivalent to sending it to one such thread. |
| 952 | */ | 927 | */ |
| 953 | #define wants_signal(sig, p, mask) \ | 928 | static inline int wants_signal(int sig, struct task_struct *p) |
| 954 | (!sigismember(&(p)->blocked, sig) \ | 929 | { |
| 955 | && !((p)->state & mask) \ | 930 | if (sigismember(&p->blocked, sig)) |
| 956 | && !((p)->flags & PF_EXITING) \ | 931 | return 0; |
| 957 | && (task_curr(p) || !signal_pending(p))) | 932 | if (p->flags & PF_EXITING) |
| 958 | 933 | return 0; | |
| 934 | if (sig == SIGKILL) | ||
| 935 | return 1; | ||
| 936 | if (p->state & (TASK_STOPPED | TASK_TRACED)) | ||
| 937 | return 0; | ||
| 938 | return task_curr(p) || !signal_pending(p); | ||
| 939 | } | ||
| 959 | 940 | ||
| 960 | static void | 941 | static void |
| 961 | __group_complete_signal(int sig, struct task_struct *p) | 942 | __group_complete_signal(int sig, struct task_struct *p) |
| 962 | { | 943 | { |
| 963 | unsigned int mask; | ||
| 964 | struct task_struct *t; | 944 | struct task_struct *t; |
| 965 | 945 | ||
| 966 | /* | 946 | /* |
| 967 | * Don't bother traced and stopped tasks (but | ||
| 968 | * SIGKILL will punch through that). | ||
| 969 | */ | ||
| 970 | mask = TASK_STOPPED | TASK_TRACED; | ||
| 971 | if (sig == SIGKILL) | ||
| 972 | mask = 0; | ||
| 973 | |||
| 974 | /* | ||
| 975 | * Now find a thread we can wake up to take the signal off the queue. | 947 | * Now find a thread we can wake up to take the signal off the queue. |
| 976 | * | 948 | * |
| 977 | * If the main thread wants the signal, it gets first crack. | 949 | * If the main thread wants the signal, it gets first crack. |
| 978 | * Probably the least surprising to the average bear. | 950 | * Probably the least surprising to the average bear. |
| 979 | */ | 951 | */ |
| 980 | if (wants_signal(sig, p, mask)) | 952 | if (wants_signal(sig, p)) |
| 981 | t = p; | 953 | t = p; |
| 982 | else if (thread_group_empty(p)) | 954 | else if (thread_group_empty(p)) |
| 983 | /* | 955 | /* |
| @@ -995,7 +967,7 @@ __group_complete_signal(int sig, struct task_struct *p) | |||
| 995 | t = p->signal->curr_target = p; | 967 | t = p->signal->curr_target = p; |
| 996 | BUG_ON(t->tgid != p->tgid); | 968 | BUG_ON(t->tgid != p->tgid); |
| 997 | 969 | ||
| 998 | while (!wants_signal(sig, t, mask)) { | 970 | while (!wants_signal(sig, t)) { |
| 999 | t = next_thread(t); | 971 | t = next_thread(t); |
| 1000 | if (t == p->signal->curr_target) | 972 | if (t == p->signal->curr_target) |
| 1001 | /* | 973 | /* |
| @@ -1209,6 +1181,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
| 1209 | return error; | 1181 | return error; |
| 1210 | } | 1182 | } |
| 1211 | 1183 | ||
| 1184 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ | ||
| 1185 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | ||
| 1186 | uid_t uid, uid_t euid) | ||
| 1187 | { | ||
| 1188 | int ret = -EINVAL; | ||
| 1189 | struct task_struct *p; | ||
| 1190 | |||
| 1191 | if (!valid_signal(sig)) | ||
| 1192 | return ret; | ||
| 1193 | |||
| 1194 | read_lock(&tasklist_lock); | ||
| 1195 | p = find_task_by_pid(pid); | ||
| 1196 | if (!p) { | ||
| 1197 | ret = -ESRCH; | ||
| 1198 | goto out_unlock; | ||
| 1199 | } | ||
| 1200 | if ((!info || ((unsigned long)info != 1 && | ||
| 1201 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
| 1202 | && (euid != p->suid) && (euid != p->uid) | ||
| 1203 | && (uid != p->suid) && (uid != p->uid)) { | ||
| 1204 | ret = -EPERM; | ||
| 1205 | goto out_unlock; | ||
| 1206 | } | ||
| 1207 | if (sig && p->sighand) { | ||
| 1208 | unsigned long flags; | ||
| 1209 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 1210 | ret = __group_send_sig_info(sig, info, p); | ||
| 1211 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
| 1212 | } | ||
| 1213 | out_unlock: | ||
| 1214 | read_unlock(&tasklist_lock); | ||
| 1215 | return ret; | ||
| 1216 | } | ||
| 1217 | EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); | ||
| 1212 | 1218 | ||
| 1213 | /* | 1219 | /* |
| 1214 | * kill_something_info() interprets pid in interesting ways just like kill(2). | 1220 | * kill_something_info() interprets pid in interesting ways just like kill(2). |
| @@ -1380,16 +1386,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1380 | unsigned long flags; | 1386 | unsigned long flags; |
| 1381 | int ret = 0; | 1387 | int ret = 0; |
| 1382 | 1388 | ||
| 1383 | /* | ||
| 1384 | * We need the tasklist lock even for the specific | ||
| 1385 | * thread case (when we don't need to follow the group | ||
| 1386 | * lists) in order to avoid races with "p->sighand" | ||
| 1387 | * going away or changing from under us. | ||
| 1388 | */ | ||
| 1389 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1389 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
| 1390 | read_lock(&tasklist_lock); | 1390 | read_lock(&tasklist_lock); |
| 1391 | |||
| 1392 | if (unlikely(p->flags & PF_EXITING)) { | ||
| 1393 | ret = -1; | ||
| 1394 | goto out_err; | ||
| 1395 | } | ||
| 1396 | |||
| 1391 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1397 | spin_lock_irqsave(&p->sighand->siglock, flags); |
| 1392 | 1398 | ||
| 1393 | if (unlikely(!list_empty(&q->list))) { | 1399 | if (unlikely(!list_empty(&q->list))) { |
| 1394 | /* | 1400 | /* |
| 1395 | * If an SI_TIMER entry is already queued just increment | 1401 | * If an SI_TIMER entry is already queued just increment |
| @@ -1399,7 +1405,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1399 | BUG(); | 1405 | BUG(); |
| 1400 | q->info.si_overrun++; | 1406 | q->info.si_overrun++; |
| 1401 | goto out; | 1407 | goto out; |
| 1402 | } | 1408 | } |
| 1403 | /* Short-circuit ignored signals. */ | 1409 | /* Short-circuit ignored signals. */ |
| 1404 | if (sig_ignored(p, sig)) { | 1410 | if (sig_ignored(p, sig)) { |
| 1405 | ret = 1; | 1411 | ret = 1; |
| @@ -1414,8 +1420,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1414 | 1420 | ||
| 1415 | out: | 1421 | out: |
| 1416 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1422 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
| 1423 | out_err: | ||
| 1417 | read_unlock(&tasklist_lock); | 1424 | read_unlock(&tasklist_lock); |
| 1418 | return(ret); | 1425 | |
| 1426 | return ret; | ||
| 1419 | } | 1427 | } |
| 1420 | 1428 | ||
| 1421 | int | 1429 | int |
| @@ -1542,14 +1550,20 @@ void do_notify_parent(struct task_struct *tsk, int sig) | |||
| 1542 | spin_unlock_irqrestore(&psig->siglock, flags); | 1550 | spin_unlock_irqrestore(&psig->siglock, flags); |
| 1543 | } | 1551 | } |
| 1544 | 1552 | ||
| 1545 | static void | 1553 | static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) |
| 1546 | do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, | ||
| 1547 | int why) | ||
| 1548 | { | 1554 | { |
| 1549 | struct siginfo info; | 1555 | struct siginfo info; |
| 1550 | unsigned long flags; | 1556 | unsigned long flags; |
| 1557 | struct task_struct *parent; | ||
| 1551 | struct sighand_struct *sighand; | 1558 | struct sighand_struct *sighand; |
| 1552 | 1559 | ||
| 1560 | if (to_self) | ||
| 1561 | parent = tsk->parent; | ||
| 1562 | else { | ||
| 1563 | tsk = tsk->group_leader; | ||
| 1564 | parent = tsk->real_parent; | ||
| 1565 | } | ||
| 1566 | |||
| 1553 | info.si_signo = SIGCHLD; | 1567 | info.si_signo = SIGCHLD; |
| 1554 | info.si_errno = 0; | 1568 | info.si_errno = 0; |
| 1555 | info.si_pid = tsk->pid; | 1569 | info.si_pid = tsk->pid; |
| @@ -1618,8 +1632,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) | |||
| 1618 | !(current->ptrace & PT_ATTACHED)) && | 1632 | !(current->ptrace & PT_ATTACHED)) && |
| 1619 | (likely(current->parent->signal != current->signal) || | 1633 | (likely(current->parent->signal != current->signal) || |
| 1620 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { | 1634 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { |
| 1621 | do_notify_parent_cldstop(current, current->parent, | 1635 | do_notify_parent_cldstop(current, 1, CLD_TRAPPED); |
| 1622 | CLD_TRAPPED); | ||
| 1623 | read_unlock(&tasklist_lock); | 1636 | read_unlock(&tasklist_lock); |
| 1624 | schedule(); | 1637 | schedule(); |
| 1625 | } else { | 1638 | } else { |
| @@ -1668,25 +1681,25 @@ void ptrace_notify(int exit_code) | |||
| 1668 | static void | 1681 | static void |
| 1669 | finish_stop(int stop_count) | 1682 | finish_stop(int stop_count) |
| 1670 | { | 1683 | { |
| 1684 | int to_self; | ||
| 1685 | |||
| 1671 | /* | 1686 | /* |
| 1672 | * If there are no other threads in the group, or if there is | 1687 | * If there are no other threads in the group, or if there is |
| 1673 | * a group stop in progress and we are the last to stop, | 1688 | * a group stop in progress and we are the last to stop, |
| 1674 | * report to the parent. When ptraced, every thread reports itself. | 1689 | * report to the parent. When ptraced, every thread reports itself. |
| 1675 | */ | 1690 | */ |
| 1676 | if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { | 1691 | if (stop_count < 0 || (current->ptrace & PT_PTRACED)) |
| 1677 | read_lock(&tasklist_lock); | 1692 | to_self = 1; |
| 1678 | do_notify_parent_cldstop(current, current->parent, | 1693 | else if (stop_count == 0) |
| 1679 | CLD_STOPPED); | 1694 | to_self = 0; |
| 1680 | read_unlock(&tasklist_lock); | 1695 | else |
| 1681 | } | 1696 | goto out; |
| 1682 | else if (stop_count == 0) { | ||
| 1683 | read_lock(&tasklist_lock); | ||
| 1684 | do_notify_parent_cldstop(current->group_leader, | ||
| 1685 | current->group_leader->real_parent, | ||
| 1686 | CLD_STOPPED); | ||
| 1687 | read_unlock(&tasklist_lock); | ||
| 1688 | } | ||
| 1689 | 1697 | ||
| 1698 | read_lock(&tasklist_lock); | ||
| 1699 | do_notify_parent_cldstop(current, to_self, CLD_STOPPED); | ||
| 1700 | read_unlock(&tasklist_lock); | ||
| 1701 | |||
| 1702 | out: | ||
| 1690 | schedule(); | 1703 | schedule(); |
| 1691 | /* | 1704 | /* |
| 1692 | * Now we don't run again until continued. | 1705 | * Now we don't run again until continued. |
| @@ -1773,7 +1786,8 @@ do_signal_stop(int signr) | |||
| 1773 | * stop is always done with the siglock held, | 1786 | * stop is always done with the siglock held, |
| 1774 | * so this check has no races. | 1787 | * so this check has no races. |
| 1775 | */ | 1788 | */ |
| 1776 | if (t->state < TASK_STOPPED) { | 1789 | if (!t->exit_state && |
| 1790 | !(t->state & (TASK_STOPPED|TASK_TRACED))) { | ||
| 1777 | stop_count++; | 1791 | stop_count++; |
| 1778 | signal_wake_up(t, 0); | 1792 | signal_wake_up(t, 0); |
| 1779 | } | 1793 | } |
| @@ -2228,8 +2242,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, | |||
| 2228 | recalc_sigpending(); | 2242 | recalc_sigpending(); |
| 2229 | spin_unlock_irq(¤t->sighand->siglock); | 2243 | spin_unlock_irq(¤t->sighand->siglock); |
| 2230 | 2244 | ||
| 2231 | current->state = TASK_INTERRUPTIBLE; | 2245 | timeout = schedule_timeout_interruptible(timeout); |
| 2232 | timeout = schedule_timeout(timeout); | ||
| 2233 | 2246 | ||
| 2234 | try_to_freeze(); | 2247 | try_to_freeze(); |
| 2235 | spin_lock_irq(¤t->sighand->siglock); | 2248 | spin_lock_irq(¤t->sighand->siglock); |
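
The new kill_proc_info_as_uid() helper above lets a caller deliver a signal using an explicitly supplied uid/euid pair instead of the credentials of "current", which matters when a driver completes asynchronous I/O on behalf of the process that opened it. A minimal sketch of such a caller, assuming the declaration is visible via <linux/sched.h>; the my_async_dev structure, my_complete() and their fields are invented for illustration:

        #include <linux/sched.h>
        #include <linux/signal.h>
        #include <linux/string.h>

        struct my_async_dev {           /* hypothetical per-open state */
                pid_t pid;              /* opener's pid, saved at open() */
                uid_t uid, euid;        /* opener's credentials, saved at open() */
                int   sig;              /* signal requested by userspace */
        };

        static void my_complete(struct my_async_dev *dev)
        {
                struct siginfo info;

                memset(&info, 0, sizeof(info));
                info.si_signo = dev->sig;
                info.si_code  = SI_ASYNCIO;

                /* deliver with the opener's identity, not current's */
                kill_proc_info_as_uid(dev->sig, &info, dev->pid,
                                      dev->uid, dev->euid);
        }
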
diff --git a/kernel/softirq.c b/kernel/softirq.c index b4ab6af1dea8..f766b2fc48be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void) | |||
| 84 | cpu = smp_processor_id(); | 84 | cpu = smp_processor_id(); |
| 85 | restart: | 85 | restart: |
| 86 | /* Reset the pending bitmask before enabling irqs */ | 86 | /* Reset the pending bitmask before enabling irqs */ |
| 87 | local_softirq_pending() = 0; | 87 | set_softirq_pending(0); |
| 88 | 88 | ||
| 89 | local_irq_enable(); | 89 | local_irq_enable(); |
| 90 | 90 | ||
diff --git a/kernel/softlockup.c b/kernel/softlockup.c new file mode 100644 index 000000000000..75976209cea7 --- /dev/null +++ b/kernel/softlockup.c | |||
| @@ -0,0 +1,151 @@ | |||
| 1 | /* | ||
| 2 | * Detect Soft Lockups | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar, (C) 2005, Red Hat | ||
| 5 | * | ||
| 6 | * this code detects soft lockups: incidents where the kernel | ||
| 7 | * does not reschedule on a CPU for 10 seconds or more. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/cpu.h> | ||
| 12 | #include <linux/init.h> | ||
| 13 | #include <linux/delay.h> | ||
| 14 | #include <linux/kthread.h> | ||
| 15 | #include <linux/notifier.h> | ||
| 16 | #include <linux/module.h> | ||
| 17 | |||
| 18 | static DEFINE_SPINLOCK(print_lock); | ||
| 19 | |||
| 20 | static DEFINE_PER_CPU(unsigned long, timestamp) = 0; | ||
| 21 | static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; | ||
| 22 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | ||
| 23 | |||
| 24 | static int did_panic = 0; | ||
| 25 | static int softlock_panic(struct notifier_block *this, unsigned long event, | ||
| 26 | void *ptr) | ||
| 27 | { | ||
| 28 | did_panic = 1; | ||
| 29 | |||
| 30 | return NOTIFY_DONE; | ||
| 31 | } | ||
| 32 | |||
| 33 | static struct notifier_block panic_block = { | ||
| 34 | .notifier_call = softlock_panic, | ||
| 35 | }; | ||
| 36 | |||
| 37 | void touch_softlockup_watchdog(void) | ||
| 38 | { | ||
| 39 | per_cpu(timestamp, raw_smp_processor_id()) = jiffies; | ||
| 40 | } | ||
| 41 | EXPORT_SYMBOL(touch_softlockup_watchdog); | ||
| 42 | |||
| 43 | /* | ||
| 44 | * This callback runs from the timer interrupt, and checks | ||
| 45 | * whether the watchdog thread has hung or not: | ||
| 46 | */ | ||
| 47 | void softlockup_tick(struct pt_regs *regs) | ||
| 48 | { | ||
| 49 | int this_cpu = smp_processor_id(); | ||
| 50 | unsigned long timestamp = per_cpu(timestamp, this_cpu); | ||
| 51 | |||
| 52 | if (per_cpu(print_timestamp, this_cpu) == timestamp) | ||
| 53 | return; | ||
| 54 | |||
| 55 | /* Do not cause a second panic when there already was one */ | ||
| 56 | if (did_panic) | ||
| 57 | return; | ||
| 58 | |||
| 59 | if (time_after(jiffies, timestamp + 10*HZ)) { | ||
| 60 | per_cpu(print_timestamp, this_cpu) = timestamp; | ||
| 61 | |||
| 62 | spin_lock(&print_lock); | ||
| 63 | printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", | ||
| 64 | this_cpu); | ||
| 65 | show_regs(regs); | ||
| 66 | spin_unlock(&print_lock); | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 70 | /* | ||
| 71 | * The watchdog thread - runs every second and touches the timestamp. | ||
| 72 | */ | ||
| 73 | static int watchdog(void * __bind_cpu) | ||
| 74 | { | ||
| 75 | struct sched_param param = { .sched_priority = 99 }; | ||
| 76 | int this_cpu = (long) __bind_cpu; | ||
| 77 | |||
| 78 | printk("softlockup thread %d started up.\n", this_cpu); | ||
| 79 | |||
| 80 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
| 81 | current->flags |= PF_NOFREEZE; | ||
| 82 | |||
| 83 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Run briefly once per second - if this gets delayed for | ||
| 87 | * more than 10 seconds then the debug-printout triggers | ||
| 88 | * in softlockup_tick(): | ||
| 89 | */ | ||
| 90 | while (!kthread_should_stop()) { | ||
| 91 | msleep_interruptible(1000); | ||
| 92 | touch_softlockup_watchdog(); | ||
| 93 | } | ||
| 94 | __set_current_state(TASK_RUNNING); | ||
| 95 | |||
| 96 | return 0; | ||
| 97 | } | ||
| 98 | |||
| 99 | /* | ||
| 100 | * Create/destroy watchdog threads as CPUs come and go: | ||
| 101 | */ | ||
| 102 | static int __devinit | ||
| 103 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
| 104 | { | ||
| 105 | int hotcpu = (unsigned long)hcpu; | ||
| 106 | struct task_struct *p; | ||
| 107 | |||
| 108 | switch (action) { | ||
| 109 | case CPU_UP_PREPARE: | ||
| 110 | BUG_ON(per_cpu(watchdog_task, hotcpu)); | ||
| 111 | p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); | ||
| 112 | if (IS_ERR(p)) { | ||
| 113 | printk("watchdog for %i failed\n", hotcpu); | ||
| 114 | return NOTIFY_BAD; | ||
| 115 | } | ||
| 116 | per_cpu(watchdog_task, hotcpu) = p; | ||
| 117 | kthread_bind(p, hotcpu); | ||
| 118 | break; | ||
| 119 | case CPU_ONLINE: | ||
| 120 | |||
| 121 | wake_up_process(per_cpu(watchdog_task, hotcpu)); | ||
| 122 | break; | ||
| 123 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 124 | case CPU_UP_CANCELED: | ||
| 125 | /* Unbind so it can run. Fall thru. */ | ||
| 126 | kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); | ||
| 127 | case CPU_DEAD: | ||
| 128 | p = per_cpu(watchdog_task, hotcpu); | ||
| 129 | per_cpu(watchdog_task, hotcpu) = NULL; | ||
| 130 | kthread_stop(p); | ||
| 131 | break; | ||
| 132 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 133 | } | ||
| 134 | return NOTIFY_OK; | ||
| 135 | } | ||
| 136 | |||
| 137 | static struct notifier_block __devinitdata cpu_nfb = { | ||
| 138 | .notifier_call = cpu_callback | ||
| 139 | }; | ||
| 140 | |||
| 141 | __init void spawn_softlockup_task(void) | ||
| 142 | { | ||
| 143 | void *cpu = (void *)(long)smp_processor_id(); | ||
| 144 | |||
| 145 | cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
| 146 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
| 147 | register_cpu_notifier(&cpu_nfb); | ||
| 148 | |||
| 149 | notifier_chain_register(&panic_notifier_list, &panic_block); | ||
| 150 | } | ||
| 151 | |||
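
The detector above stays quiet only if something updates the per-CPU timestamp at least once every 10 seconds: normally the per-CPU watchdog thread, otherwise an explicit touch_softlockup_watchdog() call. A minimal sketch of how a long-running polling loop that deliberately does not schedule might keep the detector from firing, assuming the declaration is picked up via <linux/sched.h>; the register, bit and function names are invented, only touch_softlockup_watchdog(), cpu_relax() and readl() come from the kernel:

        #include <linux/sched.h>        /* touch_softlockup_watchdog() */
        #include <asm/io.h>             /* readl() */

        #define MY_FW_READY 0x1         /* hypothetical status bit */

        static void my_slow_firmware_poll(void __iomem *status_reg)
        {
                /* Busy-wait without scheduling; feed the detector so that
                 * softlockup_tick() does not report a false lockup. */
                while (!(readl(status_reg) & MY_FW_READY)) {
                        cpu_relax();
                        touch_softlockup_watchdog();
                }
        }
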
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0c3f9d8bbe17..0375fcd5921d 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
| @@ -3,7 +3,10 @@ | |||
| 3 | * | 3 | * |
| 4 | * Author: Zwane Mwaikambo <zwane@fsmlabs.com> | 4 | * Author: Zwane Mwaikambo <zwane@fsmlabs.com> |
| 5 | * | 5 | * |
| 6 | * Copyright (2004) Ingo Molnar | 6 | * Copyright (2004, 2005) Ingo Molnar |
| 7 | * | ||
| 8 | * This file contains the spinlock/rwlock implementations for the | ||
| 9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) | ||
| 7 | */ | 10 | */ |
| 8 | 11 | ||
| 9 | #include <linux/config.h> | 12 | #include <linux/config.h> |
| @@ -17,12 +20,12 @@ | |||
| 17 | * Generic declaration of the raw read_trylock() function, | 20 | * Generic declaration of the raw read_trylock() function, |
| 18 | * architectures are supposed to optimize this: | 21 | * architectures are supposed to optimize this: |
| 19 | */ | 22 | */ |
| 20 | int __lockfunc generic_raw_read_trylock(rwlock_t *lock) | 23 | int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) |
| 21 | { | 24 | { |
| 22 | _raw_read_lock(lock); | 25 | __raw_read_lock(lock); |
| 23 | return 1; | 26 | return 1; |
| 24 | } | 27 | } |
| 25 | EXPORT_SYMBOL(generic_raw_read_trylock); | 28 | EXPORT_SYMBOL(generic__raw_read_trylock); |
| 26 | 29 | ||
| 27 | int __lockfunc _spin_trylock(spinlock_t *lock) | 30 | int __lockfunc _spin_trylock(spinlock_t *lock) |
| 28 | { | 31 | { |
| @@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock) | |||
| 57 | } | 60 | } |
| 58 | EXPORT_SYMBOL(_write_trylock); | 61 | EXPORT_SYMBOL(_write_trylock); |
| 59 | 62 | ||
| 60 | #ifndef CONFIG_PREEMPT | 63 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) |
| 61 | 64 | ||
| 62 | void __lockfunc _read_lock(rwlock_t *lock) | 65 | void __lockfunc _read_lock(rwlock_t *lock) |
| 63 | { | 66 | { |
| @@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) | |||
| 72 | 75 | ||
| 73 | local_irq_save(flags); | 76 | local_irq_save(flags); |
| 74 | preempt_disable(); | 77 | preempt_disable(); |
| 75 | _raw_spin_lock_flags(lock, flags); | 78 | _raw_spin_lock_flags(lock, &flags); |
| 76 | return flags; | 79 | return flags; |
| 77 | } | 80 | } |
| 78 | EXPORT_SYMBOL(_spin_lock_irqsave); | 81 | EXPORT_SYMBOL(_spin_lock_irqsave); |
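
These out-of-line lock functions back the usual spin_lock_irqsave()/spin_unlock_irqrestore() pattern; passing &flags into _raw_spin_lock_flags() changes nothing for callers. For reference, a minimal sketch of the calling convention (my_lock, my_counter and my_increment() are invented for illustration):

        #include <linux/spinlock.h>

        static DEFINE_SPINLOCK(my_lock);
        static int my_counter;

        static void my_increment(void)
        {
                unsigned long flags;

                /* disables local interrupts and saves their previous state */
                spin_lock_irqsave(&my_lock, flags);
                my_counter++;
                spin_unlock_irqrestore(&my_lock, flags);
        }
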
diff --git a/kernel/sys.c b/kernel/sys.c index 0bcaed6560ac..2fa1ed18123c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -361,17 +361,35 @@ out_unlock: | |||
| 361 | return retval; | 361 | return retval; |
| 362 | } | 362 | } |
| 363 | 363 | ||
| 364 | /** | ||
| 365 | * emergency_restart - reboot the system | ||
| 366 | * | ||
| 367 | * Without shutting down any hardware or taking any locks, | ||
| 368 | * reboot the system. This is called when we know we are in | ||
| 369 | * trouble, so this is our best effort to reboot. This is | ||
| 370 | * safe to call in interrupt context. | ||
| 371 | */ | ||
| 364 | void emergency_restart(void) | 372 | void emergency_restart(void) |
| 365 | { | 373 | { |
| 366 | machine_emergency_restart(); | 374 | machine_emergency_restart(); |
| 367 | } | 375 | } |
| 368 | EXPORT_SYMBOL_GPL(emergency_restart); | 376 | EXPORT_SYMBOL_GPL(emergency_restart); |
| 369 | 377 | ||
| 370 | void kernel_restart(char *cmd) | 378 | /** |
| 379 | * kernel_restart - reboot the system | ||
| 380 | * | ||
| 381 | * Shutdown everything and perform a clean reboot. | ||
| 382 | * This is not safe to call in interrupt context. | ||
| 383 | */ | ||
| 384 | void kernel_restart_prepare(char *cmd) | ||
| 371 | { | 385 | { |
| 372 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 386 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
| 373 | system_state = SYSTEM_RESTART; | 387 | system_state = SYSTEM_RESTART; |
| 374 | device_shutdown(); | 388 | device_shutdown(); |
| 389 | } | ||
| 390 | void kernel_restart(char *cmd) | ||
| 391 | { | ||
| 392 | kernel_restart_prepare(cmd); | ||
| 375 | if (!cmd) { | 393 | if (!cmd) { |
| 376 | printk(KERN_EMERG "Restarting system.\n"); | 394 | printk(KERN_EMERG "Restarting system.\n"); |
| 377 | } else { | 395 | } else { |
| @@ -382,6 +400,12 @@ void kernel_restart(char *cmd) | |||
| 382 | } | 400 | } |
| 383 | EXPORT_SYMBOL_GPL(kernel_restart); | 401 | EXPORT_SYMBOL_GPL(kernel_restart); |
| 384 | 402 | ||
| 403 | /** | ||
| 404 | * kernel_kexec - reboot the system | ||
| 405 | * | ||
| 406 | * Move into place and start executing a preloaded standalone | ||
| 407 | * executable. If nothing was preloaded return an error. | ||
| 408 | */ | ||
| 385 | void kernel_kexec(void) | 409 | void kernel_kexec(void) |
| 386 | { | 410 | { |
| 387 | #ifdef CONFIG_KEXEC | 411 | #ifdef CONFIG_KEXEC |
| @@ -390,9 +414,7 @@ void kernel_kexec(void) | |||
| 390 | if (!image) { | 414 | if (!image) { |
| 391 | return; | 415 | return; |
| 392 | } | 416 | } |
| 393 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | 417 | kernel_restart_prepare(NULL); |
| 394 | system_state = SYSTEM_RESTART; | ||
| 395 | device_shutdown(); | ||
| 396 | printk(KERN_EMERG "Starting new kernel\n"); | 418 | printk(KERN_EMERG "Starting new kernel\n"); |
| 397 | machine_shutdown(); | 419 | machine_shutdown(); |
| 398 | machine_kexec(image); | 420 | machine_kexec(image); |
| @@ -400,21 +422,39 @@ void kernel_kexec(void) | |||
| 400 | } | 422 | } |
| 401 | EXPORT_SYMBOL_GPL(kernel_kexec); | 423 | EXPORT_SYMBOL_GPL(kernel_kexec); |
| 402 | 424 | ||
| 403 | void kernel_halt(void) | 425 | /** |
| 426 | * kernel_halt - halt the system | ||
| 427 | * | ||
| 428 | * Shutdown everything and perform a clean system halt. | ||
| 429 | */ | ||
| 430 | void kernel_halt_prepare(void) | ||
| 404 | { | 431 | { |
| 405 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); | 432 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); |
| 406 | system_state = SYSTEM_HALT; | 433 | system_state = SYSTEM_HALT; |
| 407 | device_shutdown(); | 434 | device_shutdown(); |
| 435 | } | ||
| 436 | void kernel_halt(void) | ||
| 437 | { | ||
| 438 | kernel_halt_prepare(); | ||
| 408 | printk(KERN_EMERG "System halted.\n"); | 439 | printk(KERN_EMERG "System halted.\n"); |
| 409 | machine_halt(); | 440 | machine_halt(); |
| 410 | } | 441 | } |
| 411 | EXPORT_SYMBOL_GPL(kernel_halt); | 442 | EXPORT_SYMBOL_GPL(kernel_halt); |
| 412 | 443 | ||
| 413 | void kernel_power_off(void) | 444 | /** |
| 445 | * kernel_power_off - power_off the system | ||
| 446 | * | ||
| 447 | * Shutdown everything and perform a clean system power_off. | ||
| 448 | */ | ||
| 449 | void kernel_power_off_prepare(void) | ||
| 414 | { | 450 | { |
| 415 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); | 451 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); |
| 416 | system_state = SYSTEM_POWER_OFF; | 452 | system_state = SYSTEM_POWER_OFF; |
| 417 | device_shutdown(); | 453 | device_shutdown(); |
| 454 | } | ||
| 455 | void kernel_power_off(void) | ||
| 456 | { | ||
| 457 | kernel_power_off_prepare(); | ||
| 418 | printk(KERN_EMERG "Power down.\n"); | 458 | printk(KERN_EMERG "Power down.\n"); |
| 419 | machine_power_off(); | 459 | machine_power_off(); |
| 420 | } | 460 | } |
| @@ -1711,7 +1751,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
| 1711 | unsigned long arg4, unsigned long arg5) | 1751 | unsigned long arg4, unsigned long arg5) |
| 1712 | { | 1752 | { |
| 1713 | long error; | 1753 | long error; |
| 1714 | int sig; | ||
| 1715 | 1754 | ||
| 1716 | error = security_task_prctl(option, arg2, arg3, arg4, arg5); | 1755 | error = security_task_prctl(option, arg2, arg3, arg4, arg5); |
| 1717 | if (error) | 1756 | if (error) |
| @@ -1719,19 +1758,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
| 1719 | 1758 | ||
| 1720 | switch (option) { | 1759 | switch (option) { |
| 1721 | case PR_SET_PDEATHSIG: | 1760 | case PR_SET_PDEATHSIG: |
| 1722 | sig = arg2; | 1761 | if (!valid_signal(arg2)) { |
| 1723 | if (!valid_signal(sig)) { | ||
| 1724 | error = -EINVAL; | 1762 | error = -EINVAL; |
| 1725 | break; | 1763 | break; |
| 1726 | } | 1764 | } |
| 1727 | current->pdeath_signal = sig; | 1765 | current->pdeath_signal = arg2; |
| 1728 | break; | 1766 | break; |
| 1729 | case PR_GET_PDEATHSIG: | 1767 | case PR_GET_PDEATHSIG: |
| 1730 | error = put_user(current->pdeath_signal, (int __user *)arg2); | 1768 | error = put_user(current->pdeath_signal, (int __user *)arg2); |
| 1731 | break; | 1769 | break; |
| 1732 | case PR_GET_DUMPABLE: | 1770 | case PR_GET_DUMPABLE: |
| 1733 | if (current->mm->dumpable) | 1771 | error = current->mm->dumpable; |
| 1734 | error = 1; | ||
| 1735 | break; | 1772 | break; |
| 1736 | case PR_SET_DUMPABLE: | 1773 | case PR_SET_DUMPABLE: |
| 1737 | if (arg2 < 0 || arg2 > 2) { | 1774 | if (arg2 < 0 || arg2 > 2) { |
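
Splitting kernel_restart(), kernel_halt() and kernel_power_off() into *_prepare() helpers lets other shutdown paths (kernel_kexec() above is the in-tree example) reuse the reboot-notifier, system_state and device_shutdown() sequence without duplicating it. A hedged sketch of how a platform- or firmware-specific restart path might use the new helper; my_platform_restart() and my_firmware_reboot() are invented, and <linux/reboot.h> is assumed to carry the declaration:

        #include <linux/reboot.h>

        static void my_platform_restart(char *cmd)
        {
                /* run reboot notifiers, set SYSTEM_RESTART, shut devices down */
                kernel_restart_prepare(cmd);

                my_firmware_reboot();   /* hypothetical firmware call, never returns */
        }
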
diff --git a/kernel/time.c b/kernel/time.c index dd5ae1162a8f..40c2410ac99a 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -570,6 +570,7 @@ void getnstimeofday(struct timespec *tv) | |||
| 570 | tv->tv_sec = x.tv_sec; | 570 | tv->tv_sec = x.tv_sec; |
| 571 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; | 571 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; |
| 572 | } | 572 | } |
| 573 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
| 573 | #endif | 574 | #endif |
| 574 | 575 | ||
| 575 | #if (BITS_PER_LONG < 64) | 576 | #if (BITS_PER_LONG < 64) |
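
Exporting getnstimeofday() (GPL-only) makes nanosecond-resolution wall-clock reads available to modules. A minimal sketch of a module-side helper that measures an interval with it; my_measure_ns() and the callback are invented for illustration:

        #include <linux/time.h>

        static long long my_measure_ns(void (*fn)(void))
        {
                struct timespec before, after;

                getnstimeofday(&before);
                fn();
                getnstimeofday(&after);

                /* wall-clock delta in nanoseconds; can jump if the clock is set */
                return (long long)(after.tv_sec - before.tv_sec) * NSEC_PER_SEC
                       + (after.tv_nsec - before.tv_nsec);
        }
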
diff --git a/kernel/timer.c b/kernel/timer.c index 5377f40723ff..3ba10fa35b60 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs) | |||
| 950 | { | 950 | { |
| 951 | jiffies_64++; | 951 | jiffies_64++; |
| 952 | update_times(); | 952 | update_times(); |
| 953 | softlockup_tick(regs); | ||
| 953 | } | 954 | } |
| 954 | 955 | ||
| 955 | #ifdef __ARCH_WANT_SYS_ALARM | 956 | #ifdef __ARCH_WANT_SYS_ALARM |
| @@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
| 1150 | out: | 1151 | out: |
| 1151 | return timeout < 0 ? 0 : timeout; | 1152 | return timeout < 0 ? 0 : timeout; |
| 1152 | } | 1153 | } |
| 1153 | |||
| 1154 | EXPORT_SYMBOL(schedule_timeout); | 1154 | EXPORT_SYMBOL(schedule_timeout); |
| 1155 | 1155 | ||
| 1156 | /* | ||
| 1157 | * We can use __set_current_state() here because schedule_timeout() calls | ||
| 1158 | * schedule() unconditionally. | ||
| 1159 | */ | ||
| 1160 | signed long __sched schedule_timeout_interruptible(signed long timeout) | ||
| 1161 | { | ||
| 1162 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 1163 | return schedule_timeout(timeout); | ||
| 1164 | } | ||
| 1165 | EXPORT_SYMBOL(schedule_timeout_interruptible); | ||
| 1166 | |||
| 1167 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | ||
| 1168 | { | ||
| 1169 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 1170 | return schedule_timeout(timeout); | ||
| 1171 | } | ||
| 1172 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | ||
| 1173 | |||
| 1156 | /* Thread ID - the internal kernel "pid" */ | 1174 | /* Thread ID - the internal kernel "pid" */ |
| 1157 | asmlinkage long sys_gettid(void) | 1175 | asmlinkage long sys_gettid(void) |
| 1158 | { | 1176 | { |
| @@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) | |||
| 1169 | if (!time_after(expire, now)) | 1187 | if (!time_after(expire, now)) |
| 1170 | return 0; | 1188 | return 0; |
| 1171 | 1189 | ||
| 1172 | current->state = TASK_INTERRUPTIBLE; | 1190 | expire = schedule_timeout_interruptible(expire - now); |
| 1173 | expire = schedule_timeout(expire - now); | ||
| 1174 | 1191 | ||
| 1175 | ret = 0; | 1192 | ret = 0; |
| 1176 | if (expire) { | 1193 | if (expire) { |
| @@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us | |||
| 1198 | return -EINVAL; | 1215 | return -EINVAL; |
| 1199 | 1216 | ||
| 1200 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | 1217 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); |
| 1201 | current->state = TASK_INTERRUPTIBLE; | 1218 | expire = schedule_timeout_interruptible(expire); |
| 1202 | expire = schedule_timeout(expire); | ||
| 1203 | 1219 | ||
| 1204 | ret = 0; | 1220 | ret = 0; |
| 1205 | if (expire) { | 1221 | if (expire) { |
| @@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src) | |||
| 1428 | } | 1444 | } |
| 1429 | } | 1445 | } |
| 1430 | 1446 | ||
| 1431 | static inline u64 time_interpolator_get_counter(void) | 1447 | static inline u64 time_interpolator_get_counter(int writelock) |
| 1432 | { | 1448 | { |
| 1433 | unsigned int src = time_interpolator->source; | 1449 | unsigned int src = time_interpolator->source; |
| 1434 | 1450 | ||
| @@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void) | |||
| 1442 | now = time_interpolator_get_cycles(src); | 1458 | now = time_interpolator_get_cycles(src); |
| 1443 | if (lcycle && time_after(lcycle, now)) | 1459 | if (lcycle && time_after(lcycle, now)) |
| 1444 | return lcycle; | 1460 | return lcycle; |
| 1461 | |||
| 1462 | /* When holding the xtime write lock, there's no need | ||
| 1463 | * to add the overhead of the cmpxchg. Readers are | ||
| 1464 | * forced to retry until the write lock is released. | ||
| 1465 | */ | ||
| 1466 | if (writelock) { | ||
| 1467 | time_interpolator->last_cycle = now; | ||
| 1468 | return now; | ||
| 1469 | } | ||
| 1445 | /* Keep track of the last timer value returned. The use of cmpxchg here | 1470 | /* Keep track of the last timer value returned. The use of cmpxchg here |
| 1446 | * will cause contention in an SMP environment. | 1471 | * will cause contention in an SMP environment. |
| 1447 | */ | 1472 | */ |
| @@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void) | |||
| 1455 | void time_interpolator_reset(void) | 1480 | void time_interpolator_reset(void) |
| 1456 | { | 1481 | { |
| 1457 | time_interpolator->offset = 0; | 1482 | time_interpolator->offset = 0; |
| 1458 | time_interpolator->last_counter = time_interpolator_get_counter(); | 1483 | time_interpolator->last_counter = time_interpolator_get_counter(1); |
| 1459 | } | 1484 | } |
| 1460 | 1485 | ||
| 1461 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) | 1486 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) |
| @@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void) | |||
| 1467 | return 0; | 1492 | return 0; |
| 1468 | 1493 | ||
| 1469 | return time_interpolator->offset + | 1494 | return time_interpolator->offset + |
| 1470 | GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); | 1495 | GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); |
| 1471 | } | 1496 | } |
| 1472 | 1497 | ||
| 1473 | #define INTERPOLATOR_ADJUST 65536 | 1498 | #define INTERPOLATOR_ADJUST 65536 |
| @@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec) | |||
| 1490 | * and the tuning logic ensures that. | 1515 | * and the tuning logic ensures that. |
| 1491 | */ | 1516 | */ |
| 1492 | 1517 | ||
| 1493 | counter = time_interpolator_get_counter(); | 1518 | counter = time_interpolator_get_counter(1); |
| 1494 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); | 1519 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); |
| 1495 | 1520 | ||
| 1496 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | 1521 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) |
| @@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs) | |||
| 1588 | { | 1613 | { |
| 1589 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | 1614 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
| 1590 | 1615 | ||
| 1591 | while (timeout) { | 1616 | while (timeout) |
| 1592 | set_current_state(TASK_UNINTERRUPTIBLE); | 1617 | timeout = schedule_timeout_uninterruptible(timeout); |
| 1593 | timeout = schedule_timeout(timeout); | ||
| 1594 | } | ||
| 1595 | } | 1618 | } |
| 1596 | 1619 | ||
| 1597 | EXPORT_SYMBOL(msleep); | 1620 | EXPORT_SYMBOL(msleep); |
| @@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs) | |||
| 1604 | { | 1627 | { |
| 1605 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | 1628 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
| 1606 | 1629 | ||
| 1607 | while (timeout && !signal_pending(current)) { | 1630 | while (timeout && !signal_pending(current)) |
| 1608 | set_current_state(TASK_INTERRUPTIBLE); | 1631 | timeout = schedule_timeout_interruptible(timeout); |
| 1609 | timeout = schedule_timeout(timeout); | ||
| 1610 | } | ||
| 1611 | return jiffies_to_msecs(timeout); | 1632 | return jiffies_to_msecs(timeout); |
| 1612 | } | 1633 | } |
| 1613 | 1634 | ||
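
schedule_timeout_interruptible() and schedule_timeout_uninterruptible() simply fold the set_current_state() call into schedule_timeout(), which is what the conversions in this patch (msleep(), msleep_interruptible(), sys_nanosleep(), nanosleep_restart(), sys_rt_sigtimedwait()) rely on. A minimal sketch of the pattern, assuming a caller that polls a flag for roughly 100ms; my_wait_for_flag() and the flag are invented, and the helper declarations are assumed to live next to schedule_timeout() in <linux/sched.h>:

        #include <linux/sched.h>
        #include <linux/errno.h>

        static int my_wait_for_flag(volatile int *flag)
        {
                signed long timeout = HZ / 10;  /* roughly 100ms */

                /* Old style was:
                 *     set_current_state(TASK_INTERRUPTIBLE);
                 *     timeout = schedule_timeout(timeout);
                 * the helper does both in one call. */
                while (!*flag && timeout && !signal_pending(current))
                        timeout = schedule_timeout_interruptible(timeout);

                if (*flag)
                        return 0;
                return signal_pending(current) ? -EINTR : -ETIMEDOUT;
        }
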
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c7e36d4a70ca..91bacb13a7e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 308 | struct workqueue_struct *wq; | 308 | struct workqueue_struct *wq; |
| 309 | struct task_struct *p; | 309 | struct task_struct *p; |
| 310 | 310 | ||
| 311 | wq = kmalloc(sizeof(*wq), GFP_KERNEL); | 311 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); |
| 312 | if (!wq) | 312 | if (!wq) |
| 313 | return NULL; | 313 | return NULL; |
| 314 | memset(wq, 0, sizeof(*wq)); | ||
| 315 | 314 | ||
| 316 | wq->name = name; | 315 | wq->name = name; |
| 317 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 316 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
| @@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 499 | case CPU_UP_PREPARE: | 498 | case CPU_UP_PREPARE: |
| 500 | /* Create a new workqueue thread for it. */ | 499 | /* Create a new workqueue thread for it. */ |
| 501 | list_for_each_entry(wq, &workqueues, list) { | 500 | list_for_each_entry(wq, &workqueues, list) { |
| 502 | if (create_workqueue_thread(wq, hotcpu) < 0) { | 501 | if (!create_workqueue_thread(wq, hotcpu)) { |
| 503 | printk("workqueue for %i failed\n", hotcpu); | 502 | printk("workqueue for %i failed\n", hotcpu); |
| 504 | return NOTIFY_BAD; | 503 | return NOTIFY_BAD; |
| 505 | } | 504 | } |
