| author | Jeff Garzik <jgarzik@pretzel.yyz.us> | 2005-06-26 23:38:58 -0400 |
|---|---|---|
| committer | Jeff Garzik <jgarzik@pobox.com> | 2005-06-26 23:38:58 -0400 |
| commit | 5696c1944a33b4434a9a1ebb6383b906afd43a10 (patch) | |
| tree | 16fbe6ba431bcf949ee8645510b0c2fd39b5810f /kernel | |
| parent | 66b04a80eea60cabf9d89fd34deb3234a740052f (diff) | |
| parent | 020f46a39eb7b99a575b9f4d105fce2b142acdf1 (diff) | |
Merge /spare/repo/linux-2.6/
Diffstat (limited to 'kernel')
37 files changed, 3338 insertions, 1263 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 000000000000..248e1c396f8b
--- /dev/null
+++ b/kernel/Kconfig.hz
| @@ -0,0 +1,46 @@ | |||
| 1 | # | ||
| 2 | # Timer Interrupt Frequency Configuration | ||
| 3 | # | ||
| 4 | |||
| 5 | choice | ||
| 6 | prompt "Timer frequency" | ||
| 7 | default HZ_250 | ||
| 8 | help | ||
| 9 | Allows the configuration of the timer frequency. It is customary | ||
| 10 | to have the timer interrupt run at 1000 HZ but 100 HZ may be more | ||
| 11 | beneficial for servers and NUMA systems that do not need to have | ||
| 12 | a fast response for user interaction and that may experience bus | ||
| 13 | contention and cacheline bounces as a result of timer interrupts. | ||
| 14 | Note that the timer interrupt occurs on each processor in an SMP | ||
| 15 | environment leading to NR_CPUS * HZ number of timer interrupts | ||
| 16 | per second. | ||
| 17 | |||
| 18 | |||
| 19 | config HZ_100 | ||
| 20 | bool "100 HZ" | ||
| 21 | help | ||
| 22 | 100 HZ is a typical choice for servers, SMP and NUMA systems | ||
| 23 | with lots of processors that may show reduced performance if | ||
| 24 | too many timer interrupts are occurring. | ||
| 25 | |||
| 26 | config HZ_250 | ||
| 27 | bool "250 HZ" | ||
| 28 | help | ||
| 29 | 250 HZ is a good compromise choice allowing server performance | ||
| 30 | while also showing good interactive responsiveness even | ||
| 31 | on SMP and NUMA systems. | ||
| 32 | |||
| 33 | config HZ_1000 | ||
| 34 | bool "1000 HZ" | ||
| 35 | help | ||
| 36 | 1000 HZ is the preferred choice for desktop systems and other | ||
| 37 | systems requiring fast interactive responses to events. | ||
| 38 | |||
| 39 | endchoice | ||
| 40 | |||
| 41 | config HZ | ||
| 42 | int | ||
| 43 | default 100 if HZ_100 | ||
| 44 | default 250 if HZ_250 | ||
| 45 | default 1000 if HZ_1000 | ||
| 46 | |||
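
The help text above is the whole contract: kernel code never hard-codes a tick length, it scales intervals by HZ, so the CONFIG_HZ choice changes how many ticks an interval is, not how long it lasts. A minimal sketch (not part of this commit; the helper names are illustrative) of how the chosen frequency shows up in practice, including the NR_CPUS * HZ interrupt-rate bound the help text cites:

```c
/* Illustrative only -- not part of this diff.  HZ comes from the
 * CONFIG_HZ_* choice above; jiffies and NR_CPUS are standard kernel
 * symbols. */
#include <linux/jiffies.h>	/* jiffies, HZ */
#include <linux/threads.h>	/* NR_CPUS */

/* A two-second deadline is 200 ticks at HZ=100 and 2000 ticks at
 * HZ=1000, but the same wall-clock interval either way. */
static inline unsigned long example_two_second_deadline(void)
{
	return jiffies + 2 * HZ;
}

/* Upper bound on timer interrupts per second from the help text:
 * one local timer tick per CPU. */
static inline unsigned long example_max_timer_irqs_per_sec(void)
{
	return (unsigned long)NR_CPUS * HZ;
}
```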
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000000..0b46a5dff4c0
--- /dev/null
+++ b/kernel/Kconfig.preempt
| @@ -0,0 +1,65 @@ | |||
| 1 | |||
| 2 | choice | ||
| 3 | prompt "Preemption Model" | ||
| 4 | default PREEMPT_NONE | ||
| 5 | |||
| 6 | config PREEMPT_NONE | ||
| 7 | bool "No Forced Preemption (Server)" | ||
| 8 | help | ||
| 9 | This is the traditional Linux preemption model, geared towards | ||
| 10 | throughput. It will still provide good latencies most of the | ||
| 11 | time, but there are no guarantees and occasional longer delays | ||
| 12 | are possible. | ||
| 13 | |||
| 14 | Select this option if you are building a kernel for a server or | ||
| 15 | scientific/computation system, or if you want to maximize the | ||
| 16 | raw processing power of the kernel, irrespective of scheduling | ||
| 17 | latencies. | ||
| 18 | |||
| 19 | config PREEMPT_VOLUNTARY | ||
| 20 | bool "Voluntary Kernel Preemption (Desktop)" | ||
| 21 | help | ||
| 22 | This option reduces the latency of the kernel by adding more | ||
| 23 | "explicit preemption points" to the kernel code. These new | ||
| 24 | preemption points have been selected to reduce the maximum | ||
| 25 | latency of rescheduling, providing faster application reactions, | ||
| 26 | at the cost of slightly lower throughput. | ||
| 27 | |||
| 28 | This allows reaction to interactive events by allowing a | ||
| 29 | low priority process to voluntarily preempt itself even if it | ||
| 30 | is in kernel mode executing a system call. This allows | ||
| 31 | applications to run more 'smoothly' even when the system is | ||
| 32 | under load. | ||
| 33 | |||
| 34 | Select this if you are building a kernel for a desktop system. | ||
| 35 | |||
| 36 | config PREEMPT | ||
| 37 | bool "Preemptible Kernel (Low-Latency Desktop)" | ||
| 38 | help | ||
| 39 | This option reduces the latency of the kernel by making | ||
| 40 | all kernel code (that is not executing in a critical section) | ||
| 41 | preemptible. This allows reaction to interactive events by | ||
| 42 | permitting a low priority process to be preempted involuntarily | ||
| 43 | even if it is in kernel mode executing a system call and would | ||
| 44 | otherwise not be about to reach a natural preemption point. | ||
| 45 | This allows applications to run more 'smoothly' even when the | ||
| 46 | system is under load, at the cost of slightly lower throughput | ||
| 47 | and a slight runtime overhead to kernel code. | ||
| 48 | |||
| 49 | Select this if you are building a kernel for a desktop or | ||
| 50 | embedded system with latency requirements in the milliseconds | ||
| 51 | range. | ||
| 52 | |||
| 53 | endchoice | ||
| 54 | |||
| 55 | config PREEMPT_BKL | ||
| 56 | bool "Preempt The Big Kernel Lock" | ||
| 57 | depends on SMP || PREEMPT | ||
| 58 | default y | ||
| 59 | help | ||
| 60 | This option reduces the latency of the kernel by making the | ||
| 61 | big kernel lock preemptible. | ||
| 62 | |||
| 63 | Say Y here if you are building a kernel for a desktop system. | ||
| 64 | Say N if you are unsure. | ||
| 65 | |||
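
For context, a sketch of the mechanism the PREEMPT_VOLUNTARY help text alludes to: the "explicit preemption points" are typically the existing might_sleep()-style annotations, compiled into cond_resched() calls when the option is set. This is modeled on include/linux/kernel.h of this era; exact macro names and definitions may differ, and example_copy_loop() is purely illustrative.

```c
#include <linux/sched.h>	/* cond_resched() */

#ifdef CONFIG_PREEMPT_VOLUNTARY
# define example_might_resched() cond_resched()    /* yield if a runnable task waits */
#else
# define example_might_resched() do { } while (0)  /* no extra preemption points */
#endif

/* A long-running kernel loop then needs only one line to become
 * latency-friendly under the voluntary model: */
static void example_copy_loop(long nr_items)
{
	long i;

	for (i = 0; i < nr_items; i++) {
		/* ... process one item ... */
		example_might_resched();	/* voluntary preemption point */
	}
}
```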
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o | |||
| 17 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 17 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
| 18 | obj-$(CONFIG_PM) += power/ | 18 | obj-$(CONFIG_PM) += power/ |
| 19 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 19 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
| 20 | obj-$(CONFIG_KEXEC) += kexec.o | ||
| 20 | obj-$(CONFIG_COMPAT) += compat.o | 21 | obj-$(CONFIG_COMPAT) += compat.o |
| 21 | obj-$(CONFIG_CPUSETS) += cpuset.o | 22 | obj-$(CONFIG_CPUSETS) += cpuset.o |
| 22 | obj-$(CONFIG_IKCONFIG) += configs.o | 23 | obj-$(CONFIG_IKCONFIG) += configs.o |
| @@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | |||
| 27 | obj-$(CONFIG_KPROBES) += kprobes.o | 28 | obj-$(CONFIG_KPROBES) += kprobes.o |
| 28 | obj-$(CONFIG_SYSFS) += ksysfs.o | 29 | obj-$(CONFIG_SYSFS) += ksysfs.o |
| 29 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 30 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
| 31 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | ||
| 30 | obj-$(CONFIG_SECCOMP) += seccomp.o | 32 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 31 | 33 | ||
| 32 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 34 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
diff --git a/kernel/audit.c b/kernel/audit.c
index 9c4f1af0c794..ef35166fdc29 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -46,6 +46,8 @@ | |||
| 46 | #include <asm/types.h> | 46 | #include <asm/types.h> |
| 47 | #include <linux/mm.h> | 47 | #include <linux/mm.h> |
| 48 | #include <linux/module.h> | 48 | #include <linux/module.h> |
| 49 | #include <linux/err.h> | ||
| 50 | #include <linux/kthread.h> | ||
| 49 | 51 | ||
| 50 | #include <linux/audit.h> | 52 | #include <linux/audit.h> |
| 51 | 53 | ||
| @@ -68,7 +70,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK; | |||
| 68 | 70 | ||
| 69 | /* If audit records are to be written to the netlink socket, audit_pid | 71 | /* If audit records are to be written to the netlink socket, audit_pid |
| 70 | * contains the (non-zero) pid. */ | 72 | * contains the (non-zero) pid. */ |
| 71 | static int audit_pid; | 73 | int audit_pid; |
| 72 | 74 | ||
| 73 | /* If audit_limit is non-zero, limit the rate of sending audit records | 75 | /* If audit_limit is non-zero, limit the rate of sending audit records |
| 74 | * to that number per second. This prevents DoS attacks, but results in | 76 | * to that number per second. This prevents DoS attacks, but results in |
| @@ -77,7 +79,10 @@ static int audit_rate_limit; | |||
| 77 | 79 | ||
| 78 | /* Number of outstanding audit_buffers allowed. */ | 80 | /* Number of outstanding audit_buffers allowed. */ |
| 79 | static int audit_backlog_limit = 64; | 81 | static int audit_backlog_limit = 64; |
| 80 | static atomic_t audit_backlog = ATOMIC_INIT(0); | 82 | |
| 83 | /* The identity of the user shutting down the audit system. */ | ||
| 84 | uid_t audit_sig_uid = -1; | ||
| 85 | pid_t audit_sig_pid = -1; | ||
| 81 | 86 | ||
| 82 | /* Records can be lost in several ways: | 87 | /* Records can be lost in several ways: |
| 83 | 0) [suppressed in audit_alloc] | 88 | 0) [suppressed in audit_alloc] |
| @@ -91,19 +96,17 @@ static atomic_t audit_lost = ATOMIC_INIT(0); | |||
| 91 | /* The netlink socket. */ | 96 | /* The netlink socket. */ |
| 92 | static struct sock *audit_sock; | 97 | static struct sock *audit_sock; |
| 93 | 98 | ||
| 94 | /* There are two lists of audit buffers. The txlist contains audit | 99 | /* The audit_freelist is a list of pre-allocated audit buffers (if more |
| 95 | * buffers that cannot be sent immediately to the netlink device because | ||
| 96 | * we are in an irq context (these are sent later in a tasklet). | ||
| 97 | * | ||
| 98 | * The second list is a list of pre-allocated audit buffers (if more | ||
| 99 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of | 100 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of |
| 100 | * being placed on the freelist). */ | 101 | * being placed on the freelist). */ |
| 101 | static DEFINE_SPINLOCK(audit_txlist_lock); | ||
| 102 | static DEFINE_SPINLOCK(audit_freelist_lock); | 102 | static DEFINE_SPINLOCK(audit_freelist_lock); |
| 103 | static int audit_freelist_count = 0; | 103 | static int audit_freelist_count = 0; |
| 104 | static LIST_HEAD(audit_txlist); | ||
| 105 | static LIST_HEAD(audit_freelist); | 104 | static LIST_HEAD(audit_freelist); |
| 106 | 105 | ||
| 106 | static struct sk_buff_head audit_skb_queue; | ||
| 107 | static struct task_struct *kauditd_task; | ||
| 108 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); | ||
| 109 | |||
| 107 | /* There are three lists of rules -- one to search at task creation | 110 | /* There are three lists of rules -- one to search at task creation |
| 108 | * time, one to search at syscall entry time, and another to search at | 111 | * time, one to search at syscall entry time, and another to search at |
| 109 | * syscall exit time. */ | 112 | * syscall exit time. */ |
| @@ -112,7 +115,7 @@ static LIST_HEAD(audit_entlist); | |||
| 112 | static LIST_HEAD(audit_extlist); | 115 | static LIST_HEAD(audit_extlist); |
| 113 | 116 | ||
| 114 | /* The netlink socket is only to be read by 1 CPU, which lets us assume | 117 | /* The netlink socket is only to be read by 1 CPU, which lets us assume |
| 115 | * that list additions and deletions never happen simultaneiously in | 118 | * that list additions and deletions never happen simultaneously in |
| 116 | * auditsc.c */ | 119 | * auditsc.c */ |
| 117 | static DECLARE_MUTEX(audit_netlink_sem); | 120 | static DECLARE_MUTEX(audit_netlink_sem); |
| 118 | 121 | ||
| @@ -132,21 +135,14 @@ static DECLARE_MUTEX(audit_netlink_sem); | |||
| 132 | * use simultaneously. */ | 135 | * use simultaneously. */ |
| 133 | struct audit_buffer { | 136 | struct audit_buffer { |
| 134 | struct list_head list; | 137 | struct list_head list; |
| 135 | struct sk_buff_head sklist; /* formatted skbs ready to send */ | 138 | struct sk_buff *skb; /* formatted skb ready to send */ |
| 136 | struct audit_context *ctx; /* NULL or associated context */ | 139 | struct audit_context *ctx; /* NULL or associated context */ |
| 137 | int len; /* used area of tmp */ | ||
| 138 | char tmp[AUDIT_BUFSIZ]; | ||
| 139 | |||
| 140 | /* Pointer to header and contents */ | ||
| 141 | struct nlmsghdr *nlh; | ||
| 142 | int total; | ||
| 143 | int type; | ||
| 144 | int pid; | ||
| 145 | }; | 140 | }; |
| 146 | 141 | ||
| 147 | void audit_set_type(struct audit_buffer *ab, int type) | 142 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
| 148 | { | 143 | { |
| 149 | ab->type = type; | 144 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; |
| 145 | nlh->nlmsg_pid = pid; | ||
| 150 | } | 146 | } |
| 151 | 147 | ||
| 152 | struct audit_entry { | 148 | struct audit_entry { |
| @@ -154,9 +150,6 @@ struct audit_entry { | |||
| 154 | struct audit_rule rule; | 150 | struct audit_rule rule; |
| 155 | }; | 151 | }; |
| 156 | 152 | ||
| 157 | static void audit_log_end_irq(struct audit_buffer *ab); | ||
| 158 | static void audit_log_end_fast(struct audit_buffer *ab); | ||
| 159 | |||
| 160 | static void audit_panic(const char *message) | 153 | static void audit_panic(const char *message) |
| 161 | { | 154 | { |
| 162 | switch (audit_failure) | 155 | switch (audit_failure) |
| @@ -227,10 +220,8 @@ void audit_log_lost(const char *message) | |||
| 227 | 220 | ||
| 228 | if (print) { | 221 | if (print) { |
| 229 | printk(KERN_WARNING | 222 | printk(KERN_WARNING |
| 230 | "audit: audit_lost=%d audit_backlog=%d" | 223 | "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", |
| 231 | " audit_rate_limit=%d audit_backlog_limit=%d\n", | ||
| 232 | atomic_read(&audit_lost), | 224 | atomic_read(&audit_lost), |
| 233 | atomic_read(&audit_backlog), | ||
| 234 | audit_rate_limit, | 225 | audit_rate_limit, |
| 235 | audit_backlog_limit); | 226 | audit_backlog_limit); |
| 236 | audit_panic(message); | 227 | audit_panic(message); |
| @@ -242,7 +233,8 @@ static int audit_set_rate_limit(int limit, uid_t loginuid) | |||
| 242 | { | 233 | { |
| 243 | int old = audit_rate_limit; | 234 | int old = audit_rate_limit; |
| 244 | audit_rate_limit = limit; | 235 | audit_rate_limit = limit; |
| 245 | audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u", | 236 | audit_log(NULL, AUDIT_CONFIG_CHANGE, |
| 237 | "audit_rate_limit=%d old=%d by auid=%u", | ||
| 246 | audit_rate_limit, old, loginuid); | 238 | audit_rate_limit, old, loginuid); |
| 247 | return old; | 239 | return old; |
| 248 | } | 240 | } |
| @@ -251,7 +243,8 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid) | |||
| 251 | { | 243 | { |
| 252 | int old = audit_backlog_limit; | 244 | int old = audit_backlog_limit; |
| 253 | audit_backlog_limit = limit; | 245 | audit_backlog_limit = limit; |
| 254 | audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u", | 246 | audit_log(NULL, AUDIT_CONFIG_CHANGE, |
| 247 | "audit_backlog_limit=%d old=%d by auid=%u", | ||
| 255 | audit_backlog_limit, old, loginuid); | 248 | audit_backlog_limit, old, loginuid); |
| 256 | return old; | 249 | return old; |
| 257 | } | 250 | } |
| @@ -262,8 +255,9 @@ static int audit_set_enabled(int state, uid_t loginuid) | |||
| 262 | if (state != 0 && state != 1) | 255 | if (state != 0 && state != 1) |
| 263 | return -EINVAL; | 256 | return -EINVAL; |
| 264 | audit_enabled = state; | 257 | audit_enabled = state; |
| 265 | audit_log(NULL, "audit_enabled=%d old=%d by auid %u", | 258 | audit_log(NULL, AUDIT_CONFIG_CHANGE, |
| 266 | audit_enabled, old, loginuid); | 259 | "audit_enabled=%d old=%d by auid=%u", |
| 260 | audit_enabled, old, loginuid); | ||
| 267 | return old; | 261 | return old; |
| 268 | } | 262 | } |
| 269 | 263 | ||
| @@ -275,12 +269,44 @@ static int audit_set_failure(int state, uid_t loginuid) | |||
| 275 | && state != AUDIT_FAIL_PANIC) | 269 | && state != AUDIT_FAIL_PANIC) |
| 276 | return -EINVAL; | 270 | return -EINVAL; |
| 277 | audit_failure = state; | 271 | audit_failure = state; |
| 278 | audit_log(NULL, "audit_failure=%d old=%d by auid %u", | 272 | audit_log(NULL, AUDIT_CONFIG_CHANGE, |
| 279 | audit_failure, old, loginuid); | 273 | "audit_failure=%d old=%d by auid=%u", |
| 274 | audit_failure, old, loginuid); | ||
| 280 | return old; | 275 | return old; |
| 281 | } | 276 | } |
| 282 | 277 | ||
| 283 | #ifdef CONFIG_NET | 278 | int kauditd_thread(void *dummy) |
| 279 | { | ||
| 280 | struct sk_buff *skb; | ||
| 281 | |||
| 282 | while (1) { | ||
| 283 | skb = skb_dequeue(&audit_skb_queue); | ||
| 284 | if (skb) { | ||
| 285 | if (audit_pid) { | ||
| 286 | int err = netlink_unicast(audit_sock, skb, audit_pid, 0); | ||
| 287 | if (err < 0) { | ||
| 288 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ | ||
| 289 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | ||
| 290 | audit_pid = 0; | ||
| 291 | } | ||
| 292 | } else { | ||
| 293 | printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0)); | ||
| 294 | kfree_skb(skb); | ||
| 295 | } | ||
| 296 | } else { | ||
| 297 | DECLARE_WAITQUEUE(wait, current); | ||
| 298 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 299 | add_wait_queue(&kauditd_wait, &wait); | ||
| 300 | |||
| 301 | if (!skb_queue_len(&audit_skb_queue)) | ||
| 302 | schedule(); | ||
| 303 | |||
| 304 | __set_current_state(TASK_RUNNING); | ||
| 305 | remove_wait_queue(&kauditd_wait, &wait); | ||
| 306 | } | ||
| 307 | } | ||
| 308 | } | ||
| 309 | |||
| 284 | void audit_send_reply(int pid, int seq, int type, int done, int multi, | 310 | void audit_send_reply(int pid, int seq, int type, int done, int multi, |
| 285 | void *payload, int size) | 311 | void *payload, int size) |
| 286 | { | 312 | { |
| @@ -293,13 +319,16 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, | |||
| 293 | 319 | ||
| 294 | skb = alloc_skb(len, GFP_KERNEL); | 320 | skb = alloc_skb(len, GFP_KERNEL); |
| 295 | if (!skb) | 321 | if (!skb) |
| 296 | goto nlmsg_failure; | 322 | return; |
| 297 | 323 | ||
| 298 | nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh)); | 324 | nlh = NLMSG_PUT(skb, pid, seq, t, size); |
| 299 | nlh->nlmsg_flags = flags; | 325 | nlh->nlmsg_flags = flags; |
| 300 | data = NLMSG_DATA(nlh); | 326 | data = NLMSG_DATA(nlh); |
| 301 | memcpy(data, payload, size); | 327 | memcpy(data, payload, size); |
| 302 | netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT); | 328 | |
| 329 | /* Ignore failure. It'll only happen if the sender goes away, | ||
| 330 | because our timeout is set to infinite. */ | ||
| 331 | netlink_unicast(audit_sock, skb, pid, 0); | ||
| 303 | return; | 332 | return; |
| 304 | 333 | ||
| 305 | nlmsg_failure: /* Used by NLMSG_PUT */ | 334 | nlmsg_failure: /* Used by NLMSG_PUT */ |
| @@ -321,10 +350,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | |||
| 321 | case AUDIT_SET: | 350 | case AUDIT_SET: |
| 322 | case AUDIT_ADD: | 351 | case AUDIT_ADD: |
| 323 | case AUDIT_DEL: | 352 | case AUDIT_DEL: |
| 353 | case AUDIT_SIGNAL_INFO: | ||
| 324 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) | 354 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) |
| 325 | err = -EPERM; | 355 | err = -EPERM; |
| 326 | break; | 356 | break; |
| 327 | case AUDIT_USER: | 357 | case AUDIT_USER: |
| 358 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | ||
| 328 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) | 359 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) |
| 329 | err = -EPERM; | 360 | err = -EPERM; |
| 330 | break; | 361 | break; |
| @@ -344,11 +375,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 344 | struct audit_buffer *ab; | 375 | struct audit_buffer *ab; |
| 345 | u16 msg_type = nlh->nlmsg_type; | 376 | u16 msg_type = nlh->nlmsg_type; |
| 346 | uid_t loginuid; /* loginuid of sender */ | 377 | uid_t loginuid; /* loginuid of sender */ |
| 378 | struct audit_sig_info sig_data; | ||
| 347 | 379 | ||
| 348 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); | 380 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); |
| 349 | if (err) | 381 | if (err) |
| 350 | return err; | 382 | return err; |
| 351 | 383 | ||
| 384 | /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ | ||
| 385 | if (!kauditd_task) | ||
| 386 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); | ||
| 387 | if (IS_ERR(kauditd_task)) { | ||
| 388 | err = PTR_ERR(kauditd_task); | ||
| 389 | kauditd_task = NULL; | ||
| 390 | return err; | ||
| 391 | } | ||
| 392 | |||
| 352 | pid = NETLINK_CREDS(skb)->pid; | 393 | pid = NETLINK_CREDS(skb)->pid; |
| 353 | uid = NETLINK_CREDS(skb)->uid; | 394 | uid = NETLINK_CREDS(skb)->uid; |
| 354 | loginuid = NETLINK_CB(skb).loginuid; | 395 | loginuid = NETLINK_CB(skb).loginuid; |
| @@ -363,7 +404,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 363 | status_set.rate_limit = audit_rate_limit; | 404 | status_set.rate_limit = audit_rate_limit; |
| 364 | status_set.backlog_limit = audit_backlog_limit; | 405 | status_set.backlog_limit = audit_backlog_limit; |
| 365 | status_set.lost = atomic_read(&audit_lost); | 406 | status_set.lost = atomic_read(&audit_lost); |
| 366 | status_set.backlog = atomic_read(&audit_backlog); | 407 | status_set.backlog = skb_queue_len(&audit_skb_queue); |
| 367 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, | 408 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, |
| 368 | &status_set, sizeof(status_set)); | 409 | &status_set, sizeof(status_set)); |
| 369 | break; | 410 | break; |
| @@ -382,7 +423,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 382 | if (status_get->mask & AUDIT_STATUS_PID) { | 423 | if (status_get->mask & AUDIT_STATUS_PID) { |
| 383 | int old = audit_pid; | 424 | int old = audit_pid; |
| 384 | audit_pid = status_get->pid; | 425 | audit_pid = status_get->pid; |
| 385 | audit_log(NULL, "audit_pid=%d old=%d by auid %u", | 426 | audit_log(NULL, AUDIT_CONFIG_CHANGE, |
| 427 | "audit_pid=%d old=%d by auid=%u", | ||
| 386 | audit_pid, old, loginuid); | 428 | audit_pid, old, loginuid); |
| 387 | } | 429 | } |
| 388 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 430 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) |
| @@ -392,18 +434,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 392 | loginuid); | 434 | loginuid); |
| 393 | break; | 435 | break; |
| 394 | case AUDIT_USER: | 436 | case AUDIT_USER: |
| 395 | ab = audit_log_start(NULL); | 437 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
| 438 | ab = audit_log_start(NULL, msg_type); | ||
| 396 | if (!ab) | 439 | if (!ab) |
| 397 | break; /* audit_panic has been called */ | 440 | break; /* audit_panic has been called */ |
| 398 | audit_log_format(ab, | 441 | audit_log_format(ab, |
| 399 | "user pid=%d uid=%d length=%d loginuid=%u" | 442 | "user pid=%d uid=%u auid=%u" |
| 400 | " msg='%.1024s'", | 443 | " msg='%.1024s'", |
| 401 | pid, uid, | 444 | pid, uid, loginuid, (char *)data); |
| 402 | (int)(nlh->nlmsg_len | 445 | audit_set_pid(ab, pid); |
| 403 | - ((char *)data - (char *)nlh)), | ||
| 404 | loginuid, (char *)data); | ||
| 405 | ab->type = AUDIT_USER; | ||
| 406 | ab->pid = pid; | ||
| 407 | audit_log_end(ab); | 446 | audit_log_end(ab); |
| 408 | break; | 447 | break; |
| 409 | case AUDIT_ADD: | 448 | case AUDIT_ADD: |
| @@ -412,12 +451,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 412 | return -EINVAL; | 451 | return -EINVAL; |
| 413 | /* fallthrough */ | 452 | /* fallthrough */ |
| 414 | case AUDIT_LIST: | 453 | case AUDIT_LIST: |
| 415 | #ifdef CONFIG_AUDITSYSCALL | ||
| 416 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, | 454 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, |
| 417 | uid, seq, data, loginuid); | 455 | uid, seq, data, loginuid); |
| 418 | #else | 456 | break; |
| 419 | err = -EOPNOTSUPP; | 457 | case AUDIT_SIGNAL_INFO: |
| 420 | #endif | 458 | sig_data.uid = audit_sig_uid; |
| 459 | sig_data.pid = audit_sig_pid; | ||
| 460 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, | ||
| 461 | 0, 0, &sig_data, sizeof(sig_data)); | ||
| 421 | break; | 462 | break; |
| 422 | default: | 463 | default: |
| 423 | err = -EINVAL; | 464 | err = -EINVAL; |
| @@ -467,87 +508,6 @@ static void audit_receive(struct sock *sk, int length) | |||
| 467 | up(&audit_netlink_sem); | 508 | up(&audit_netlink_sem); |
| 468 | } | 509 | } |
| 469 | 510 | ||
| 470 | /* Move data from tmp buffer into an skb. This is an extra copy, and | ||
| 471 | * that is unfortunate. However, the copy will only occur when a record | ||
| 472 | * is being written to user space, which is already a high-overhead | ||
| 473 | * operation. (Elimination of the copy is possible, for example, by | ||
| 474 | * writing directly into a pre-allocated skb, at the cost of wasting | ||
| 475 | * memory. */ | ||
| 476 | static void audit_log_move(struct audit_buffer *ab) | ||
| 477 | { | ||
| 478 | struct sk_buff *skb; | ||
| 479 | char *start; | ||
| 480 | int extra = ab->nlh ? 0 : NLMSG_SPACE(0); | ||
| 481 | |||
| 482 | /* possible resubmission */ | ||
| 483 | if (ab->len == 0) | ||
| 484 | return; | ||
| 485 | |||
| 486 | skb = skb_peek_tail(&ab->sklist); | ||
| 487 | if (!skb || skb_tailroom(skb) <= ab->len + extra) { | ||
| 488 | skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); | ||
| 489 | if (!skb) { | ||
| 490 | ab->len = 0; /* Lose information in ab->tmp */ | ||
| 491 | audit_log_lost("out of memory in audit_log_move"); | ||
| 492 | return; | ||
| 493 | } | ||
| 494 | __skb_queue_tail(&ab->sklist, skb); | ||
| 495 | if (!ab->nlh) | ||
| 496 | ab->nlh = (struct nlmsghdr *)skb_put(skb, | ||
| 497 | NLMSG_SPACE(0)); | ||
| 498 | } | ||
| 499 | start = skb_put(skb, ab->len); | ||
| 500 | memcpy(start, ab->tmp, ab->len); | ||
| 501 | ab->len = 0; | ||
| 502 | } | ||
| 503 | |||
| 504 | /* Iterate over the skbuff in the audit_buffer, sending their contents | ||
| 505 | * to user space. */ | ||
| 506 | static inline int audit_log_drain(struct audit_buffer *ab) | ||
| 507 | { | ||
| 508 | struct sk_buff *skb; | ||
| 509 | |||
| 510 | while ((skb = skb_dequeue(&ab->sklist))) { | ||
| 511 | int retval = 0; | ||
| 512 | |||
| 513 | if (audit_pid) { | ||
| 514 | if (ab->nlh) { | ||
| 515 | ab->nlh->nlmsg_len = ab->total; | ||
| 516 | ab->nlh->nlmsg_type = ab->type; | ||
| 517 | ab->nlh->nlmsg_flags = 0; | ||
| 518 | ab->nlh->nlmsg_seq = 0; | ||
| 519 | ab->nlh->nlmsg_pid = ab->pid; | ||
| 520 | } | ||
| 521 | skb_get(skb); /* because netlink_* frees */ | ||
| 522 | retval = netlink_unicast(audit_sock, skb, audit_pid, | ||
| 523 | MSG_DONTWAIT); | ||
| 524 | } | ||
| 525 | if (retval == -EAGAIN && | ||
| 526 | (atomic_read(&audit_backlog)) < audit_backlog_limit) { | ||
| 527 | skb_queue_head(&ab->sklist, skb); | ||
| 528 | audit_log_end_irq(ab); | ||
| 529 | return 1; | ||
| 530 | } | ||
| 531 | if (retval < 0) { | ||
| 532 | if (retval == -ECONNREFUSED) { | ||
| 533 | printk(KERN_ERR | ||
| 534 | "audit: *NO* daemon at audit_pid=%d\n", | ||
| 535 | audit_pid); | ||
| 536 | audit_pid = 0; | ||
| 537 | } else | ||
| 538 | audit_log_lost("netlink socket too busy"); | ||
| 539 | } | ||
| 540 | if (!audit_pid) { /* No daemon */ | ||
| 541 | int offset = ab->nlh ? NLMSG_SPACE(0) : 0; | ||
| 542 | int len = skb->len - offset; | ||
| 543 | skb->data[offset + len] = '\0'; | ||
| 544 | printk(KERN_ERR "%s\n", skb->data + offset); | ||
| 545 | } | ||
| 546 | kfree_skb(skb); | ||
| 547 | ab->nlh = NULL; | ||
| 548 | } | ||
| 549 | return 0; | ||
| 550 | } | ||
| 551 | 511 | ||
| 552 | /* Initialize audit support at boot time. */ | 512 | /* Initialize audit support at boot time. */ |
| 553 | static int __init audit_init(void) | 513 | static int __init audit_init(void) |
| @@ -558,40 +518,13 @@ static int __init audit_init(void) | |||
| 558 | if (!audit_sock) | 518 | if (!audit_sock) |
| 559 | audit_panic("cannot initialize netlink socket"); | 519 | audit_panic("cannot initialize netlink socket"); |
| 560 | 520 | ||
| 521 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; | ||
| 522 | skb_queue_head_init(&audit_skb_queue); | ||
| 561 | audit_initialized = 1; | 523 | audit_initialized = 1; |
| 562 | audit_enabled = audit_default; | 524 | audit_enabled = audit_default; |
| 563 | audit_log(NULL, "initialized"); | 525 | audit_log(NULL, AUDIT_KERNEL, "initialized"); |
| 564 | return 0; | ||
| 565 | } | ||
| 566 | |||
| 567 | #else | ||
| 568 | /* Without CONFIG_NET, we have no skbuffs. For now, print what we have | ||
| 569 | * in the buffer. */ | ||
| 570 | static void audit_log_move(struct audit_buffer *ab) | ||
| 571 | { | ||
| 572 | printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp); | ||
| 573 | ab->len = 0; | ||
| 574 | } | ||
| 575 | |||
| 576 | static inline int audit_log_drain(struct audit_buffer *ab) | ||
| 577 | { | ||
| 578 | return 0; | ||
| 579 | } | ||
| 580 | |||
| 581 | /* Initialize audit support at boot time. */ | ||
| 582 | int __init audit_init(void) | ||
| 583 | { | ||
| 584 | printk(KERN_INFO "audit: initializing WITHOUT netlink support\n"); | ||
| 585 | audit_sock = NULL; | ||
| 586 | audit_pid = 0; | ||
| 587 | |||
| 588 | audit_initialized = 1; | ||
| 589 | audit_enabled = audit_default; | ||
| 590 | audit_log(NULL, "initialized"); | ||
| 591 | return 0; | 526 | return 0; |
| 592 | } | 527 | } |
| 593 | #endif | ||
| 594 | |||
| 595 | __initcall(audit_init); | 528 | __initcall(audit_init); |
| 596 | 529 | ||
| 597 | /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ | 530 | /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ |
| @@ -608,6 +541,102 @@ static int __init audit_enable(char *str) | |||
| 608 | 541 | ||
| 609 | __setup("audit=", audit_enable); | 542 | __setup("audit=", audit_enable); |
| 610 | 543 | ||
| 544 | static void audit_buffer_free(struct audit_buffer *ab) | ||
| 545 | { | ||
| 546 | unsigned long flags; | ||
| 547 | |||
| 548 | if (!ab) | ||
| 549 | return; | ||
| 550 | |||
| 551 | if (ab->skb) | ||
| 552 | kfree_skb(ab->skb); | ||
| 553 | |||
| 554 | spin_lock_irqsave(&audit_freelist_lock, flags); | ||
| 555 | if (++audit_freelist_count > AUDIT_MAXFREE) | ||
| 556 | kfree(ab); | ||
| 557 | else | ||
| 558 | list_add(&ab->list, &audit_freelist); | ||
| 559 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | ||
| 560 | } | ||
| 561 | |||
| 562 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, | ||
| 563 | int gfp_mask, int type) | ||
| 564 | { | ||
| 565 | unsigned long flags; | ||
| 566 | struct audit_buffer *ab = NULL; | ||
| 567 | struct nlmsghdr *nlh; | ||
| 568 | |||
| 569 | spin_lock_irqsave(&audit_freelist_lock, flags); | ||
| 570 | if (!list_empty(&audit_freelist)) { | ||
| 571 | ab = list_entry(audit_freelist.next, | ||
| 572 | struct audit_buffer, list); | ||
| 573 | list_del(&ab->list); | ||
| 574 | --audit_freelist_count; | ||
| 575 | } | ||
| 576 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | ||
| 577 | |||
| 578 | if (!ab) { | ||
| 579 | ab = kmalloc(sizeof(*ab), gfp_mask); | ||
| 580 | if (!ab) | ||
| 581 | goto err; | ||
| 582 | } | ||
| 583 | |||
| 584 | ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); | ||
| 585 | if (!ab->skb) | ||
| 586 | goto err; | ||
| 587 | |||
| 588 | ab->ctx = ctx; | ||
| 589 | nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); | ||
| 590 | nlh->nlmsg_type = type; | ||
| 591 | nlh->nlmsg_flags = 0; | ||
| 592 | nlh->nlmsg_pid = 0; | ||
| 593 | nlh->nlmsg_seq = 0; | ||
| 594 | return ab; | ||
| 595 | err: | ||
| 596 | audit_buffer_free(ab); | ||
| 597 | return NULL; | ||
| 598 | } | ||
| 599 | |||
| 600 | /* Compute a serial number for the audit record. Audit records are | ||
| 601 | * written to user-space as soon as they are generated, so a complete | ||
| 602 | * audit record may be written in several pieces. The timestamp of the | ||
| 603 | * record and this serial number are used by the user-space tools to | ||
| 604 | * determine which pieces belong to the same audit record. The | ||
| 605 | * (timestamp,serial) tuple is unique for each syscall and is live from | ||
| 606 | * syscall entry to syscall exit. | ||
| 607 | * | ||
| 608 | * Atomic values are only guaranteed to be 24-bit, so we count down. | ||
| 609 | * | ||
| 610 | * NOTE: Another possibility is to store the formatted records off the | ||
| 611 | * audit context (for those records that have a context), and emit them | ||
| 612 | * all at syscall exit. However, this could delay the reporting of | ||
| 613 | * significant errors until syscall exit (or never, if the system | ||
| 614 | * halts). */ | ||
| 615 | unsigned int audit_serial(void) | ||
| 616 | { | ||
| 617 | static atomic_t serial = ATOMIC_INIT(0xffffff); | ||
| 618 | unsigned int a, b; | ||
| 619 | |||
| 620 | do { | ||
| 621 | a = atomic_read(&serial); | ||
| 622 | if (atomic_dec_and_test(&serial)) | ||
| 623 | atomic_set(&serial, 0xffffff); | ||
| 624 | b = atomic_read(&serial); | ||
| 625 | } while (b != a - 1); | ||
| 626 | |||
| 627 | return 0xffffff - b; | ||
| 628 | } | ||
| 629 | |||
| 630 | static inline void audit_get_stamp(struct audit_context *ctx, | ||
| 631 | struct timespec *t, unsigned int *serial) | ||
| 632 | { | ||
| 633 | if (ctx) | ||
| 634 | auditsc_get_stamp(ctx, t, serial); | ||
| 635 | else { | ||
| 636 | *t = CURRENT_TIME; | ||
| 637 | *serial = audit_serial(); | ||
| 638 | } | ||
| 639 | } | ||
| 611 | 640 | ||
| 612 | /* Obtain an audit buffer. This routine does locking to obtain the | 641 | /* Obtain an audit buffer. This routine does locking to obtain the |
| 613 | * audit buffer, but then no locking is required for calls to | 642 | * audit buffer, but then no locking is required for calls to |
| @@ -615,10 +644,9 @@ __setup("audit=", audit_enable); | |||
| 615 | * syscall, then the syscall is marked as auditable and an audit record | 644 | * syscall, then the syscall is marked as auditable and an audit record |
| 616 | * will be written at syscall exit. If there is no associated task, tsk | 645 | * will be written at syscall exit. If there is no associated task, tsk |
| 617 | * should be NULL. */ | 646 | * should be NULL. */ |
| 618 | struct audit_buffer *audit_log_start(struct audit_context *ctx) | 647 | struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) |
| 619 | { | 648 | { |
| 620 | struct audit_buffer *ab = NULL; | 649 | struct audit_buffer *ab = NULL; |
| 621 | unsigned long flags; | ||
| 622 | struct timespec t; | 650 | struct timespec t; |
| 623 | unsigned int serial; | 651 | unsigned int serial; |
| 624 | 652 | ||
| @@ -626,57 +654,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) | |||
| 626 | return NULL; | 654 | return NULL; |
| 627 | 655 | ||
| 628 | if (audit_backlog_limit | 656 | if (audit_backlog_limit |
| 629 | && atomic_read(&audit_backlog) > audit_backlog_limit) { | 657 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { |
| 630 | if (audit_rate_check()) | 658 | if (audit_rate_check()) |
| 631 | printk(KERN_WARNING | 659 | printk(KERN_WARNING |
| 632 | "audit: audit_backlog=%d > " | 660 | "audit: audit_backlog=%d > " |
| 633 | "audit_backlog_limit=%d\n", | 661 | "audit_backlog_limit=%d\n", |
| 634 | atomic_read(&audit_backlog), | 662 | skb_queue_len(&audit_skb_queue), |
| 635 | audit_backlog_limit); | 663 | audit_backlog_limit); |
| 636 | audit_log_lost("backlog limit exceeded"); | 664 | audit_log_lost("backlog limit exceeded"); |
| 637 | return NULL; | 665 | return NULL; |
| 638 | } | 666 | } |
| 639 | 667 | ||
| 640 | spin_lock_irqsave(&audit_freelist_lock, flags); | 668 | ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type); |
| 641 | if (!list_empty(&audit_freelist)) { | ||
| 642 | ab = list_entry(audit_freelist.next, | ||
| 643 | struct audit_buffer, list); | ||
| 644 | list_del(&ab->list); | ||
| 645 | --audit_freelist_count; | ||
| 646 | } | ||
| 647 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | ||
| 648 | |||
| 649 | if (!ab) | ||
| 650 | ab = kmalloc(sizeof(*ab), GFP_ATOMIC); | ||
| 651 | if (!ab) { | 669 | if (!ab) { |
| 652 | audit_log_lost("out of memory in audit_log_start"); | 670 | audit_log_lost("out of memory in audit_log_start"); |
| 653 | return NULL; | 671 | return NULL; |
| 654 | } | 672 | } |
| 655 | 673 | ||
| 656 | atomic_inc(&audit_backlog); | 674 | audit_get_stamp(ab->ctx, &t, &serial); |
| 657 | skb_queue_head_init(&ab->sklist); | ||
| 658 | |||
| 659 | ab->ctx = ctx; | ||
| 660 | ab->len = 0; | ||
| 661 | ab->nlh = NULL; | ||
| 662 | ab->total = 0; | ||
| 663 | ab->type = AUDIT_KERNEL; | ||
| 664 | ab->pid = 0; | ||
| 665 | 675 | ||
| 666 | #ifdef CONFIG_AUDITSYSCALL | ||
| 667 | if (ab->ctx) | ||
| 668 | audit_get_stamp(ab->ctx, &t, &serial); | ||
| 669 | else | ||
| 670 | #endif | ||
| 671 | { | ||
| 672 | t = CURRENT_TIME; | ||
| 673 | serial = 0; | ||
| 674 | } | ||
| 675 | audit_log_format(ab, "audit(%lu.%03lu:%u): ", | 676 | audit_log_format(ab, "audit(%lu.%03lu:%u): ", |
| 676 | t.tv_sec, t.tv_nsec/1000000, serial); | 677 | t.tv_sec, t.tv_nsec/1000000, serial); |
| 677 | return ab; | 678 | return ab; |
| 678 | } | 679 | } |
| 679 | 680 | ||
| 681 | /** | ||
| 682 | * audit_expand - expand skb in the audit buffer | ||
| 683 | * @ab: audit_buffer | ||
| 684 | * | ||
| 685 | * Returns 0 (no space) on failed expansion, or available space if | ||
| 686 | * successful. | ||
| 687 | */ | ||
| 688 | static inline int audit_expand(struct audit_buffer *ab, int extra) | ||
| 689 | { | ||
| 690 | struct sk_buff *skb = ab->skb; | ||
| 691 | int ret = pskb_expand_head(skb, skb_headroom(skb), extra, | ||
| 692 | GFP_ATOMIC); | ||
| 693 | if (ret < 0) { | ||
| 694 | audit_log_lost("out of memory in audit_expand"); | ||
| 695 | return 0; | ||
| 696 | } | ||
| 697 | return skb_tailroom(skb); | ||
| 698 | } | ||
| 680 | 699 | ||
| 681 | /* Format an audit message into the audit buffer. If there isn't enough | 700 | /* Format an audit message into the audit buffer. If there isn't enough |
| 682 | * room in the audit buffer, more room will be allocated and vsnprint | 701 | * room in the audit buffer, more room will be allocated and vsnprint |
| @@ -686,26 +705,35 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, | |||
| 686 | va_list args) | 705 | va_list args) |
| 687 | { | 706 | { |
| 688 | int len, avail; | 707 | int len, avail; |
| 708 | struct sk_buff *skb; | ||
| 709 | va_list args2; | ||
| 689 | 710 | ||
| 690 | if (!ab) | 711 | if (!ab) |
| 691 | return; | 712 | return; |
| 692 | 713 | ||
| 693 | avail = sizeof(ab->tmp) - ab->len; | 714 | BUG_ON(!ab->skb); |
| 694 | if (avail <= 0) { | 715 | skb = ab->skb; |
| 695 | audit_log_move(ab); | 716 | avail = skb_tailroom(skb); |
| 696 | avail = sizeof(ab->tmp) - ab->len; | 717 | if (avail == 0) { |
| 718 | avail = audit_expand(ab, AUDIT_BUFSIZ); | ||
| 719 | if (!avail) | ||
| 720 | goto out; | ||
| 697 | } | 721 | } |
| 698 | len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); | 722 | va_copy(args2, args); |
| 723 | len = vsnprintf(skb->tail, avail, fmt, args); | ||
| 699 | if (len >= avail) { | 724 | if (len >= avail) { |
| 700 | /* The printk buffer is 1024 bytes long, so if we get | 725 | /* The printk buffer is 1024 bytes long, so if we get |
| 701 | * here and AUDIT_BUFSIZ is at least 1024, then we can | 726 | * here and AUDIT_BUFSIZ is at least 1024, then we can |
| 702 | * log everything that printk could have logged. */ | 727 | * log everything that printk could have logged. */ |
| 703 | audit_log_move(ab); | 728 | avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); |
| 704 | avail = sizeof(ab->tmp) - ab->len; | 729 | if (!avail) |
| 705 | len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); | 730 | goto out; |
| 731 | len = vsnprintf(skb->tail, avail, fmt, args2); | ||
| 706 | } | 732 | } |
| 707 | ab->len += (len < avail) ? len : avail; | 733 | if (len > 0) |
| 708 | ab->total += (len < avail) ? len : avail; | 734 | skb_put(skb, len); |
| 735 | out: | ||
| 736 | return; | ||
| 709 | } | 737 | } |
| 710 | 738 | ||
| 711 | /* Format a message into the audit buffer. All the work is done in | 739 | /* Format a message into the audit buffer. All the work is done in |
| @@ -721,20 +749,47 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) | |||
| 721 | va_end(args); | 749 | va_end(args); |
| 722 | } | 750 | } |
| 723 | 751 | ||
| 724 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) | 752 | /* This function will take the passed buf and convert it into a string of |
| 753 | * ascii hex digits. The new string is placed onto the skb. */ | ||
| 754 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | ||
| 755 | size_t len) | ||
| 725 | { | 756 | { |
| 726 | int i; | 757 | int i, avail, new_len; |
| 758 | unsigned char *ptr; | ||
| 759 | struct sk_buff *skb; | ||
| 760 | static const unsigned char *hex = "0123456789ABCDEF"; | ||
| 761 | |||
| 762 | BUG_ON(!ab->skb); | ||
| 763 | skb = ab->skb; | ||
| 764 | avail = skb_tailroom(skb); | ||
| 765 | new_len = len<<1; | ||
| 766 | if (new_len >= avail) { | ||
| 767 | /* Round the buffer request up to the next multiple */ | ||
| 768 | new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); | ||
| 769 | avail = audit_expand(ab, new_len); | ||
| 770 | if (!avail) | ||
| 771 | return; | ||
| 772 | } | ||
| 727 | 773 | ||
| 728 | for (i=0; i<len; i++) | 774 | ptr = skb->tail; |
| 729 | audit_log_format(ab, "%02x", buf[i]); | 775 | for (i=0; i<len; i++) { |
| 776 | *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ | ||
| 777 | *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ | ||
| 778 | } | ||
| 779 | *ptr = 0; | ||
| 780 | skb_put(skb, len << 1); /* new string is twice the old string */ | ||
| 730 | } | 781 | } |
| 731 | 782 | ||
| 783 | /* This code will escape a string that is passed to it if the string | ||
| 784 | * contains a control character, unprintable character, double quote mark, | ||
| 785 | * or a space. Unescaped strings will start and end with a double quote mark. | ||
| 786 | * Strings that are escaped are printed in hex (2 digits per char). */ | ||
| 732 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | 787 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) |
| 733 | { | 788 | { |
| 734 | const unsigned char *p = string; | 789 | const unsigned char *p = string; |
| 735 | 790 | ||
| 736 | while (*p) { | 791 | while (*p) { |
| 737 | if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { | 792 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { |
| 738 | audit_log_hex(ab, string, strlen(string)); | 793 | audit_log_hex(ab, string, strlen(string)); |
| 739 | return; | 794 | return; |
| 740 | } | 795 | } |
| @@ -743,117 +798,63 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | |||
| 743 | audit_log_format(ab, "\"%s\"", string); | 798 | audit_log_format(ab, "\"%s\"", string); |
| 744 | } | 799 | } |
| 745 | 800 | ||
| 746 | 801 | /* This is a helper-function to print the escaped d_path */ | |
| 747 | /* This is a helper-function to print the d_path without using a static | ||
| 748 | * buffer or allocating another buffer in addition to the one in | ||
| 749 | * audit_buffer. */ | ||
| 750 | void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | 802 | void audit_log_d_path(struct audit_buffer *ab, const char *prefix, |
| 751 | struct dentry *dentry, struct vfsmount *vfsmnt) | 803 | struct dentry *dentry, struct vfsmount *vfsmnt) |
| 752 | { | 804 | { |
| 753 | char *p; | 805 | char *p, *path; |
| 754 | int len, avail; | ||
| 755 | 806 | ||
| 756 | if (prefix) audit_log_format(ab, " %s", prefix); | 807 | if (prefix) |
| 757 | 808 | audit_log_format(ab, " %s", prefix); | |
| 758 | if (ab->len > 128) | ||
| 759 | audit_log_move(ab); | ||
| 760 | avail = sizeof(ab->tmp) - ab->len; | ||
| 761 | p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail); | ||
| 762 | if (IS_ERR(p)) { | ||
| 763 | /* FIXME: can we save some information here? */ | ||
| 764 | audit_log_format(ab, "<toolong>"); | ||
| 765 | } else { | ||
| 766 | /* path isn't at start of buffer */ | ||
| 767 | len = (ab->tmp + sizeof(ab->tmp) - 1) - p; | ||
| 768 | memmove(ab->tmp + ab->len, p, len); | ||
| 769 | ab->len += len; | ||
| 770 | ab->total += len; | ||
| 771 | } | ||
| 772 | } | ||
| 773 | |||
| 774 | /* Remove queued messages from the audit_txlist and send them to userspace. */ | ||
| 775 | static void audit_tasklet_handler(unsigned long arg) | ||
| 776 | { | ||
| 777 | LIST_HEAD(list); | ||
| 778 | struct audit_buffer *ab; | ||
| 779 | unsigned long flags; | ||
| 780 | 809 | ||
| 781 | spin_lock_irqsave(&audit_txlist_lock, flags); | 810 | /* We will allow 11 spaces for ' (deleted)' to be appended */ |
| 782 | list_splice_init(&audit_txlist, &list); | 811 | path = kmalloc(PATH_MAX+11, GFP_KERNEL); |
| 783 | spin_unlock_irqrestore(&audit_txlist_lock, flags); | 812 | if (!path) { |
| 784 | 813 | audit_log_format(ab, "<no memory>"); | |
| 785 | while (!list_empty(&list)) { | 814 | return; |
| 786 | ab = list_entry(list.next, struct audit_buffer, list); | ||
| 787 | list_del(&ab->list); | ||
| 788 | audit_log_end_fast(ab); | ||
| 789 | } | 815 | } |
| 816 | p = d_path(dentry, vfsmnt, path, PATH_MAX+11); | ||
| 817 | if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ | ||
| 818 | /* FIXME: can we save some information here? */ | ||
| 819 | audit_log_format(ab, "<too long>"); | ||
| 820 | } else | ||
| 821 | audit_log_untrustedstring(ab, p); | ||
| 822 | kfree(path); | ||
| 790 | } | 823 | } |
| 791 | 824 | ||
| 792 | static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0); | ||
| 793 | |||
| 794 | /* The netlink_* functions cannot be called inside an irq context, so | 825 | /* The netlink_* functions cannot be called inside an irq context, so |
| 795 | * the audit buffer is placed on a queue and a tasklet is scheduled to | 826 | * the audit buffer is placed on a queue and a tasklet is scheduled to |
| 796 | * remove them from the queue outside the irq context. May be called in | 827 | * remove them from the queue outside the irq context. May be called in |
| 797 | * any context. */ | 828 | * any context. */ |
| 798 | static void audit_log_end_irq(struct audit_buffer *ab) | 829 | void audit_log_end(struct audit_buffer *ab) |
| 799 | { | ||
| 800 | unsigned long flags; | ||
| 801 | |||
| 802 | if (!ab) | ||
| 803 | return; | ||
| 804 | spin_lock_irqsave(&audit_txlist_lock, flags); | ||
| 805 | list_add_tail(&ab->list, &audit_txlist); | ||
| 806 | spin_unlock_irqrestore(&audit_txlist_lock, flags); | ||
| 807 | |||
| 808 | tasklet_schedule(&audit_tasklet); | ||
| 809 | } | ||
| 810 | |||
| 811 | /* Send the message in the audit buffer directly to user space. May not | ||
| 812 | * be called in an irq context. */ | ||
| 813 | static void audit_log_end_fast(struct audit_buffer *ab) | ||
| 814 | { | 830 | { |
| 815 | unsigned long flags; | ||
| 816 | |||
| 817 | BUG_ON(in_irq()); | ||
| 818 | if (!ab) | 831 | if (!ab) |
| 819 | return; | 832 | return; |
| 820 | if (!audit_rate_check()) { | 833 | if (!audit_rate_check()) { |
| 821 | audit_log_lost("rate limit exceeded"); | 834 | audit_log_lost("rate limit exceeded"); |
| 822 | } else { | 835 | } else { |
| 823 | audit_log_move(ab); | 836 | if (audit_pid) { |
| 824 | if (audit_log_drain(ab)) | 837 | struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; |
| 825 | return; | 838 | nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); |
| 839 | skb_queue_tail(&audit_skb_queue, ab->skb); | ||
| 840 | ab->skb = NULL; | ||
| 841 | wake_up_interruptible(&kauditd_wait); | ||
| 842 | } else { | ||
| 843 | printk("%s\n", ab->skb->data + NLMSG_SPACE(0)); | ||
| 844 | } | ||
| 826 | } | 845 | } |
| 827 | 846 | audit_buffer_free(ab); | |
| 828 | atomic_dec(&audit_backlog); | ||
| 829 | spin_lock_irqsave(&audit_freelist_lock, flags); | ||
| 830 | if (++audit_freelist_count > AUDIT_MAXFREE) | ||
| 831 | kfree(ab); | ||
| 832 | else | ||
| 833 | list_add(&ab->list, &audit_freelist); | ||
| 834 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | ||
| 835 | } | ||
| 836 | |||
| 837 | /* Send or queue the message in the audit buffer, depending on the | ||
| 838 | * current context. (A convenience function that may be called in any | ||
| 839 | * context.) */ | ||
| 840 | void audit_log_end(struct audit_buffer *ab) | ||
| 841 | { | ||
| 842 | if (in_irq()) | ||
| 843 | audit_log_end_irq(ab); | ||
| 844 | else | ||
| 845 | audit_log_end_fast(ab); | ||
| 846 | } | 847 | } |
| 847 | 848 | ||
| 848 | /* Log an audit record. This is a convenience function that calls | 849 | /* Log an audit record. This is a convenience function that calls |
| 849 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | 850 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be |
| 850 | * called in any context. */ | 851 | * called in any context. */ |
| 851 | void audit_log(struct audit_context *ctx, const char *fmt, ...) | 852 | void audit_log(struct audit_context *ctx, int type, const char *fmt, ...) |
| 852 | { | 853 | { |
| 853 | struct audit_buffer *ab; | 854 | struct audit_buffer *ab; |
| 854 | va_list args; | 855 | va_list args; |
| 855 | 856 | ||
| 856 | ab = audit_log_start(ctx); | 857 | ab = audit_log_start(ctx, type); |
| 857 | if (ab) { | 858 | if (ab) { |
| 858 | va_start(args, fmt); | 859 | va_start(args, fmt); |
| 859 | audit_log_vformat(ab, fmt, args); | 860 | audit_log_vformat(ab, fmt, args); |
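
The common thread of the audit.c changes above: every record now carries its netlink message type from the start (audit_log_start(ctx, type) and audit_log(ctx, type, fmt, ...)), and finished records are queued as skbs for the new kauditd thread instead of being drained through a tasklet. A minimal sketch of a caller using the reworked API, mirroring the audit_set_*() call sites in the hunks above (the function name and "example_limit" field are hypothetical):

```c
#include <linux/audit.h>	/* audit_log(), audit_log_start(), audit_log_end() */

static void example_log_limit_change(int new_limit, int old_limit, uid_t loginuid)
{
	struct audit_buffer *ab;

	/* One-shot form: start + format + end in a single call. */
	audit_log(NULL, AUDIT_CONFIG_CHANGE,
		  "example_limit=%d old=%d by auid=%u",
		  new_limit, old_limit, loginuid);

	/* Equivalent long form, useful when fields are appended piecemeal. */
	ab = audit_log_start(NULL, AUDIT_CONFIG_CHANGE);
	if (!ab)
		return;		/* backlog limit hit or out of memory */
	audit_log_format(ab, "example_limit=%d old=%d by auid=%u",
			 new_limit, old_limit, loginuid);
	audit_log_end(ab);	/* queue for kauditd, or printk() if no daemon */
}
```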
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 37b3ac94bc47..e75f84e1a1a0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -34,7 +34,8 @@ | |||
| 34 | #include <asm/types.h> | 34 | #include <asm/types.h> |
| 35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
| 36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
| 37 | 37 | #include <linux/mount.h> | |
| 38 | #include <linux/socket.h> | ||
| 38 | #include <linux/audit.h> | 39 | #include <linux/audit.h> |
| 39 | #include <linux/personality.h> | 40 | #include <linux/personality.h> |
| 40 | #include <linux/time.h> | 41 | #include <linux/time.h> |
| @@ -112,6 +113,23 @@ struct audit_aux_data_ipcctl { | |||
| 112 | mode_t mode; | 113 | mode_t mode; |
| 113 | }; | 114 | }; |
| 114 | 115 | ||
| 116 | struct audit_aux_data_socketcall { | ||
| 117 | struct audit_aux_data d; | ||
| 118 | int nargs; | ||
| 119 | unsigned long args[0]; | ||
| 120 | }; | ||
| 121 | |||
| 122 | struct audit_aux_data_sockaddr { | ||
| 123 | struct audit_aux_data d; | ||
| 124 | int len; | ||
| 125 | char a[0]; | ||
| 126 | }; | ||
| 127 | |||
| 128 | struct audit_aux_data_path { | ||
| 129 | struct audit_aux_data d; | ||
| 130 | struct dentry *dentry; | ||
| 131 | struct vfsmount *mnt; | ||
| 132 | }; | ||
| 115 | 133 | ||
| 116 | /* The per-task audit context. */ | 134 | /* The per-task audit context. */ |
| 117 | struct audit_context { | 135 | struct audit_context { |
| @@ -127,6 +145,8 @@ struct audit_context { | |||
| 127 | int auditable; /* 1 if record should be written */ | 145 | int auditable; /* 1 if record should be written */ |
| 128 | int name_count; | 146 | int name_count; |
| 129 | struct audit_names names[AUDIT_NAMES]; | 147 | struct audit_names names[AUDIT_NAMES]; |
| 148 | struct dentry * pwd; | ||
| 149 | struct vfsmount * pwdmnt; | ||
| 130 | struct audit_context *previous; /* For nested syscalls */ | 150 | struct audit_context *previous; /* For nested syscalls */ |
| 131 | struct audit_aux_data *aux; | 151 | struct audit_aux_data *aux; |
| 132 | 152 | ||
| @@ -157,6 +177,8 @@ struct audit_entry { | |||
| 157 | struct audit_rule rule; | 177 | struct audit_rule rule; |
| 158 | }; | 178 | }; |
| 159 | 179 | ||
| 180 | extern int audit_pid; | ||
| 181 | |||
| 160 | /* Check to see if two rules are identical. It is called from | 182 | /* Check to see if two rules are identical. It is called from |
| 161 | * audit_del_rule during AUDIT_DEL. */ | 183 | * audit_del_rule during AUDIT_DEL. */ |
| 162 | static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) | 184 | static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) |
| @@ -226,7 +248,6 @@ static inline int audit_del_rule(struct audit_rule *rule, | |||
| 226 | return -EFAULT; /* No matching rule */ | 248 | return -EFAULT; /* No matching rule */ |
| 227 | } | 249 | } |
| 228 | 250 | ||
| 229 | #ifdef CONFIG_NET | ||
| 230 | /* Copy rule from user-space to kernel-space. Called during | 251 | /* Copy rule from user-space to kernel-space. Called during |
| 231 | * AUDIT_ADD. */ | 252 | * AUDIT_ADD. */ |
| 232 | static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) | 253 | static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) |
| @@ -287,7 +308,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
| 287 | err = audit_add_rule(entry, &audit_entlist); | 308 | err = audit_add_rule(entry, &audit_entlist); |
| 288 | if (!err && (flags & AUDIT_AT_EXIT)) | 309 | if (!err && (flags & AUDIT_AT_EXIT)) |
| 289 | err = audit_add_rule(entry, &audit_extlist); | 310 | err = audit_add_rule(entry, &audit_extlist); |
| 290 | audit_log(NULL, "auid %u added an audit rule\n", loginuid); | 311 | audit_log(NULL, AUDIT_CONFIG_CHANGE, |
| 312 | "auid=%u added an audit rule\n", loginuid); | ||
| 291 | break; | 313 | break; |
| 292 | case AUDIT_DEL: | 314 | case AUDIT_DEL: |
| 293 | flags =((struct audit_rule *)data)->flags; | 315 | flags =((struct audit_rule *)data)->flags; |
| @@ -297,7 +319,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
| 297 | err = audit_del_rule(data, &audit_entlist); | 319 | err = audit_del_rule(data, &audit_entlist); |
| 298 | if (!err && (flags & AUDIT_AT_EXIT)) | 320 | if (!err && (flags & AUDIT_AT_EXIT)) |
| 299 | err = audit_del_rule(data, &audit_extlist); | 321 | err = audit_del_rule(data, &audit_extlist); |
| 300 | audit_log(NULL, "auid %u removed an audit rule\n", loginuid); | 322 | audit_log(NULL, AUDIT_CONFIG_CHANGE, |
| 323 | "auid=%u removed an audit rule\n", loginuid); | ||
| 301 | break; | 324 | break; |
| 302 | default: | 325 | default: |
| 303 | return -EINVAL; | 326 | return -EINVAL; |
| @@ -305,7 +328,6 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
| 305 | 328 | ||
| 306 | return err; | 329 | return err; |
| 307 | } | 330 | } |
| 308 | #endif | ||
| 309 | 331 | ||
| 310 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | 332 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 |
| 311 | * otherwise. */ | 333 | * otherwise. */ |
| @@ -444,7 +466,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) | |||
| 444 | 466 | ||
| 445 | /* At syscall entry and exit time, this filter is called if the | 467 | /* At syscall entry and exit time, this filter is called if the |
| 446 | * audit_state is not low enough that auditing cannot take place, but is | 468 | * audit_state is not low enough that auditing cannot take place, but is |
| 447 | * also not high enough that we already know we have to write and audit | 469 | * also not high enough that we already know we have to write an audit |
| 448 | * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). | 470 | * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). |
| 449 | */ | 471 | */ |
| 450 | static enum audit_state audit_filter_syscall(struct task_struct *tsk, | 472 | static enum audit_state audit_filter_syscall(struct task_struct *tsk, |
| @@ -532,6 +554,12 @@ static inline void audit_free_names(struct audit_context *context) | |||
| 532 | if (context->names[i].name) | 554 | if (context->names[i].name) |
| 533 | __putname(context->names[i].name); | 555 | __putname(context->names[i].name); |
| 534 | context->name_count = 0; | 556 | context->name_count = 0; |
| 557 | if (context->pwd) | ||
| 558 | dput(context->pwd); | ||
| 559 | if (context->pwdmnt) | ||
| 560 | mntput(context->pwdmnt); | ||
| 561 | context->pwd = NULL; | ||
| 562 | context->pwdmnt = NULL; | ||
| 535 | } | 563 | } |
| 536 | 564 | ||
| 537 | static inline void audit_free_aux(struct audit_context *context) | 565 | static inline void audit_free_aux(struct audit_context *context) |
| @@ -539,6 +567,11 @@ static inline void audit_free_aux(struct audit_context *context) | |||
| 539 | struct audit_aux_data *aux; | 567 | struct audit_aux_data *aux; |
| 540 | 568 | ||
| 541 | while ((aux = context->aux)) { | 569 | while ((aux = context->aux)) { |
| 570 | if (aux->type == AUDIT_AVC_PATH) { | ||
| 571 | struct audit_aux_data_path *axi = (void *)aux; | ||
| 572 | dput(axi->dentry); | ||
| 573 | mntput(axi->mnt); | ||
| 574 | } | ||
| 542 | context->aux = aux->next; | 575 | context->aux = aux->next; |
| 543 | kfree(aux); | 576 | kfree(aux); |
| 544 | } | 577 | } |
| @@ -625,7 +658,8 @@ static void audit_log_task_info(struct audit_buffer *ab) | |||
| 625 | struct vm_area_struct *vma; | 658 | struct vm_area_struct *vma; |
| 626 | 659 | ||
| 627 | get_task_comm(name, current); | 660 | get_task_comm(name, current); |
| 628 | audit_log_format(ab, " comm=%s", name); | 661 | audit_log_format(ab, " comm="); |
| 662 | audit_log_untrustedstring(ab, name); | ||
| 629 | 663 | ||
| 630 | if (!mm) | 664 | if (!mm) |
| 631 | return; | 665 | return; |
| @@ -649,23 +683,24 @@ static void audit_log_exit(struct audit_context *context) | |||
| 649 | { | 683 | { |
| 650 | int i; | 684 | int i; |
| 651 | struct audit_buffer *ab; | 685 | struct audit_buffer *ab; |
| 686 | struct audit_aux_data *aux; | ||
| 652 | 687 | ||
| 653 | ab = audit_log_start(context); | 688 | ab = audit_log_start(context, AUDIT_SYSCALL); |
| 654 | if (!ab) | 689 | if (!ab) |
| 655 | return; /* audit_panic has been called */ | 690 | return; /* audit_panic has been called */ |
| 656 | audit_log_format(ab, "syscall=%d", context->major); | 691 | audit_log_format(ab, "arch=%x syscall=%d", |
| 692 | context->arch, context->major); | ||
| 657 | if (context->personality != PER_LINUX) | 693 | if (context->personality != PER_LINUX) |
| 658 | audit_log_format(ab, " per=%lx", context->personality); | 694 | audit_log_format(ab, " per=%lx", context->personality); |
| 659 | audit_log_format(ab, " arch=%x", context->arch); | ||
| 660 | if (context->return_valid) | 695 | if (context->return_valid) |
| 661 | audit_log_format(ab, " success=%s exit=%ld", | 696 | audit_log_format(ab, " success=%s exit=%ld", |
| 662 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", | 697 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", |
| 663 | context->return_code); | 698 | context->return_code); |
| 664 | audit_log_format(ab, | 699 | audit_log_format(ab, |
| 665 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 700 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" |
| 666 | " pid=%d loginuid=%d uid=%d gid=%d" | 701 | " pid=%d auid=%u uid=%u gid=%u" |
| 667 | " euid=%d suid=%d fsuid=%d" | 702 | " euid=%u suid=%u fsuid=%u" |
| 668 | " egid=%d sgid=%d fsgid=%d", | 703 | " egid=%u sgid=%u fsgid=%u", |
| 669 | context->argv[0], | 704 | context->argv[0], |
| 670 | context->argv[1], | 705 | context->argv[1], |
| 671 | context->argv[2], | 706 | context->argv[2], |
| @@ -679,33 +714,57 @@ static void audit_log_exit(struct audit_context *context) | |||
| 679 | context->egid, context->sgid, context->fsgid); | 714 | context->egid, context->sgid, context->fsgid); |
| 680 | audit_log_task_info(ab); | 715 | audit_log_task_info(ab); |
| 681 | audit_log_end(ab); | 716 | audit_log_end(ab); |
| 682 | while (context->aux) { | ||
| 683 | struct audit_aux_data *aux; | ||
| 684 | 717 | ||
| 685 | ab = audit_log_start(context); | 718 | for (aux = context->aux; aux; aux = aux->next) { |
| 719 | |||
| 720 | ab = audit_log_start(context, aux->type); | ||
| 686 | if (!ab) | 721 | if (!ab) |
| 687 | continue; /* audit_panic has been called */ | 722 | continue; /* audit_panic has been called */ |
| 688 | 723 | ||
| 689 | aux = context->aux; | ||
| 690 | context->aux = aux->next; | ||
| 691 | |||
| 692 | audit_log_format(ab, "auxitem=%d", aux->type); | ||
| 693 | switch (aux->type) { | 724 | switch (aux->type) { |
| 694 | case AUDIT_AUX_IPCPERM: { | 725 | case AUDIT_IPC: { |
| 695 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 726 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
| 696 | audit_log_format(ab, | 727 | audit_log_format(ab, |
| 697 | " qbytes=%lx uid=%d gid=%d mode=%x", | 728 | " qbytes=%lx iuid=%u igid=%u mode=%x", |
| 698 | axi->qbytes, axi->uid, axi->gid, axi->mode); | 729 | axi->qbytes, axi->uid, axi->gid, axi->mode); |
| 699 | } | 730 | break; } |
| 731 | |||
| 732 | case AUDIT_SOCKETCALL: { | ||
| 733 | int i; | ||
| 734 | struct audit_aux_data_socketcall *axs = (void *)aux; | ||
| 735 | audit_log_format(ab, "nargs=%d", axs->nargs); | ||
| 736 | for (i=0; i<axs->nargs; i++) | ||
| 737 | audit_log_format(ab, " a%d=%lx", i, axs->args[i]); | ||
| 738 | break; } | ||
| 739 | |||
| 740 | case AUDIT_SOCKADDR: { | ||
| 741 | struct audit_aux_data_sockaddr *axs = (void *)aux; | ||
| 742 | |||
| 743 | audit_log_format(ab, "saddr="); | ||
| 744 | audit_log_hex(ab, axs->a, axs->len); | ||
| 745 | break; } | ||
| 746 | |||
| 747 | case AUDIT_AVC_PATH: { | ||
| 748 | struct audit_aux_data_path *axi = (void *)aux; | ||
| 749 | audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); | ||
| 750 | break; } | ||
| 751 | |||
| 700 | } | 752 | } |
| 701 | audit_log_end(ab); | 753 | audit_log_end(ab); |
| 702 | kfree(aux); | ||
| 703 | } | 754 | } |
| 704 | 755 | ||
| 756 | if (context->pwd && context->pwdmnt) { | ||
| 757 | ab = audit_log_start(context, AUDIT_CWD); | ||
| 758 | if (ab) { | ||
| 759 | audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); | ||
| 760 | audit_log_end(ab); | ||
| 761 | } | ||
| 762 | } | ||
| 705 | for (i = 0; i < context->name_count; i++) { | 763 | for (i = 0; i < context->name_count; i++) { |
| 706 | ab = audit_log_start(context); | 764 | ab = audit_log_start(context, AUDIT_PATH); |
| 707 | if (!ab) | 765 | if (!ab) |
| 708 | continue; /* audit_panic has been called */ | 766 | continue; /* audit_panic has been called */ |
| 767 | |||
| 709 | audit_log_format(ab, "item=%d", i); | 768 | audit_log_format(ab, "item=%d", i); |
| 710 | if (context->names[i].name) { | 769 | if (context->names[i].name) { |
| 711 | audit_log_format(ab, " name="); | 770 | audit_log_format(ab, " name="); |
| @@ -713,7 +772,7 @@ static void audit_log_exit(struct audit_context *context) | |||
| 713 | } | 772 | } |
| 714 | if (context->names[i].ino != (unsigned long)-1) | 773 | if (context->names[i].ino != (unsigned long)-1) |
| 715 | audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" | 774 | audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" |
| 716 | " uid=%d gid=%d rdev=%02x:%02x", | 775 | " ouid=%u ogid=%u rdev=%02x:%02x", |
| 717 | context->names[i].ino, | 776 | context->names[i].ino, |
| 718 | MAJOR(context->names[i].dev), | 777 | MAJOR(context->names[i].dev), |
| 719 | MINOR(context->names[i].dev), | 778 | MINOR(context->names[i].dev), |
| @@ -741,42 +800,12 @@ void audit_free(struct task_struct *tsk) | |||
| 741 | 800 | ||
| 742 | /* Check for system calls that do not go through the exit | 801 | /* Check for system calls that do not go through the exit |
| 743 | * function (e.g., exit_group), then free context block. */ | 802 | * function (e.g., exit_group), then free context block. */ |
| 744 | if (context->in_syscall && context->auditable) | 803 | if (context->in_syscall && context->auditable && context->pid != audit_pid) |
| 745 | audit_log_exit(context); | 804 | audit_log_exit(context); |
| 746 | 805 | ||
| 747 | audit_free_context(context); | 806 | audit_free_context(context); |
| 748 | } | 807 | } |
| 749 | 808 | ||
| 750 | /* Compute a serial number for the audit record. Audit records are | ||
| 751 | * written to user-space as soon as they are generated, so a complete | ||
| 752 | * audit record may be written in several pieces. The timestamp of the | ||
| 753 | * record and this serial number are used by the user-space daemon to | ||
| 754 | * determine which pieces belong to the same audit record. The | ||
| 755 | * (timestamp,serial) tuple is unique for each syscall and is live from | ||
| 756 | * syscall entry to syscall exit. | ||
| 757 | * | ||
| 758 | * Atomic values are only guaranteed to be 24-bit, so we count down. | ||
| 759 | * | ||
| 760 | * NOTE: Another possibility is to store the formatted records off the | ||
| 761 | * audit context (for those records that have a context), and emit them | ||
| 762 | * all at syscall exit. However, this could delay the reporting of | ||
| 763 | * significant errors until syscall exit (or never, if the system | ||
| 764 | * halts). */ | ||
| 765 | static inline unsigned int audit_serial(void) | ||
| 766 | { | ||
| 767 | static atomic_t serial = ATOMIC_INIT(0xffffff); | ||
| 768 | unsigned int a, b; | ||
| 769 | |||
| 770 | do { | ||
| 771 | a = atomic_read(&serial); | ||
| 772 | if (atomic_dec_and_test(&serial)) | ||
| 773 | atomic_set(&serial, 0xffffff); | ||
| 774 | b = atomic_read(&serial); | ||
| 775 | } while (b != a - 1); | ||
| 776 | |||
| 777 | return 0xffffff - b; | ||
| 778 | } | ||
| 779 | |||
| 780 | /* Fill in audit context at syscall entry. This only happens if the | 809 | /* Fill in audit context at syscall entry. This only happens if the |
| 781 | * audit context was created when the task was created and the state or | 810 | * audit context was created when the task was created and the state or |
| 782 | * filters demand the audit context be built. If the state from the | 811 | * filters demand the audit context be built. If the state from the |
| @@ -876,7 +905,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) | |||
| 876 | if (likely(!context)) | 905 | if (likely(!context)) |
| 877 | return; | 906 | return; |
| 878 | 907 | ||
| 879 | if (context->in_syscall && context->auditable) | 908 | if (context->in_syscall && context->auditable && context->pid != audit_pid) |
| 880 | audit_log_exit(context); | 909 | audit_log_exit(context); |
| 881 | 910 | ||
| 882 | context->in_syscall = 0; | 911 | context->in_syscall = 0; |
| @@ -916,6 +945,13 @@ void audit_getname(const char *name) | |||
| 916 | context->names[context->name_count].name = name; | 945 | context->names[context->name_count].name = name; |
| 917 | context->names[context->name_count].ino = (unsigned long)-1; | 946 | context->names[context->name_count].ino = (unsigned long)-1; |
| 918 | ++context->name_count; | 947 | ++context->name_count; |
| 948 | if (!context->pwd) { | ||
| 949 | read_lock(¤t->fs->lock); | ||
| 950 | context->pwd = dget(current->fs->pwd); | ||
| 951 | context->pwdmnt = mntget(current->fs->pwdmnt); | ||
| 952 | read_unlock(¤t->fs->lock); | ||
| 953 | } | ||
| 954 | |||
| 919 | } | 955 | } |
| 920 | 956 | ||
| 921 | /* Intercept a putname request. Called from | 957 | /* Intercept a putname request. Called from |
| @@ -994,34 +1030,26 @@ void audit_inode(const char *name, const struct inode *inode) | |||
| 994 | context->names[idx].rdev = inode->i_rdev; | 1030 | context->names[idx].rdev = inode->i_rdev; |
| 995 | } | 1031 | } |
| 996 | 1032 | ||
| 997 | void audit_get_stamp(struct audit_context *ctx, | 1033 | void auditsc_get_stamp(struct audit_context *ctx, |
| 998 | struct timespec *t, unsigned int *serial) | 1034 | struct timespec *t, unsigned int *serial) |
| 999 | { | 1035 | { |
| 1000 | if (ctx) { | 1036 | t->tv_sec = ctx->ctime.tv_sec; |
| 1001 | t->tv_sec = ctx->ctime.tv_sec; | 1037 | t->tv_nsec = ctx->ctime.tv_nsec; |
| 1002 | t->tv_nsec = ctx->ctime.tv_nsec; | 1038 | *serial = ctx->serial; |
| 1003 | *serial = ctx->serial; | 1039 | ctx->auditable = 1; |
| 1004 | ctx->auditable = 1; | ||
| 1005 | } else { | ||
| 1006 | *t = CURRENT_TIME; | ||
| 1007 | *serial = 0; | ||
| 1008 | } | ||
| 1009 | } | 1040 | } |
| 1010 | 1041 | ||
| 1011 | extern int audit_set_type(struct audit_buffer *ab, int type); | ||
| 1012 | |||
| 1013 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | 1042 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) |
| 1014 | { | 1043 | { |
| 1015 | if (task->audit_context) { | 1044 | if (task->audit_context) { |
| 1016 | struct audit_buffer *ab; | 1045 | struct audit_buffer *ab; |
| 1017 | 1046 | ||
| 1018 | ab = audit_log_start(NULL); | 1047 | ab = audit_log_start(NULL, AUDIT_LOGIN); |
| 1019 | if (ab) { | 1048 | if (ab) { |
| 1020 | audit_log_format(ab, "login pid=%d uid=%u " | 1049 | audit_log_format(ab, "login pid=%d uid=%u " |
| 1021 | "old loginuid=%u new loginuid=%u", | 1050 | "old auid=%u new auid=%u", |
| 1022 | task->pid, task->uid, | 1051 | task->pid, task->uid, |
| 1023 | task->audit_context->loginuid, loginuid); | 1052 | task->audit_context->loginuid, loginuid); |
| 1024 | audit_set_type(ab, AUDIT_LOGIN); | ||
| 1025 | audit_log_end(ab); | 1053 | audit_log_end(ab); |
| 1026 | } | 1054 | } |
| 1027 | task->audit_context->loginuid = loginuid; | 1055 | task->audit_context->loginuid = loginuid; |
| @@ -1051,8 +1079,89 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) | |||
| 1051 | ax->gid = gid; | 1079 | ax->gid = gid; |
| 1052 | ax->mode = mode; | 1080 | ax->mode = mode; |
| 1053 | 1081 | ||
| 1054 | ax->d.type = AUDIT_AUX_IPCPERM; | 1082 | ax->d.type = AUDIT_IPC; |
| 1083 | ax->d.next = context->aux; | ||
| 1084 | context->aux = (void *)ax; | ||
| 1085 | return 0; | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | int audit_socketcall(int nargs, unsigned long *args) | ||
| 1089 | { | ||
| 1090 | struct audit_aux_data_socketcall *ax; | ||
| 1091 | struct audit_context *context = current->audit_context; | ||
| 1092 | |||
| 1093 | if (likely(!context)) | ||
| 1094 | return 0; | ||
| 1095 | |||
| 1096 | ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); | ||
| 1097 | if (!ax) | ||
| 1098 | return -ENOMEM; | ||
| 1099 | |||
| 1100 | ax->nargs = nargs; | ||
| 1101 | memcpy(ax->args, args, nargs * sizeof(unsigned long)); | ||
| 1102 | |||
| 1103 | ax->d.type = AUDIT_SOCKETCALL; | ||
| 1104 | ax->d.next = context->aux; | ||
| 1105 | context->aux = (void *)ax; | ||
| 1106 | return 0; | ||
| 1107 | } | ||
| 1108 | |||
| 1109 | int audit_sockaddr(int len, void *a) | ||
| 1110 | { | ||
| 1111 | struct audit_aux_data_sockaddr *ax; | ||
| 1112 | struct audit_context *context = current->audit_context; | ||
| 1113 | |||
| 1114 | if (likely(!context)) | ||
| 1115 | return 0; | ||
| 1116 | |||
| 1117 | ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); | ||
| 1118 | if (!ax) | ||
| 1119 | return -ENOMEM; | ||
| 1120 | |||
| 1121 | ax->len = len; | ||
| 1122 | memcpy(ax->a, a, len); | ||
| 1123 | |||
| 1124 | ax->d.type = AUDIT_SOCKADDR; | ||
| 1055 | ax->d.next = context->aux; | 1125 | ax->d.next = context->aux; |
| 1056 | context->aux = (void *)ax; | 1126 | context->aux = (void *)ax; |
| 1057 | return 0; | 1127 | return 0; |
| 1058 | } | 1128 | } |
| 1129 | |||
| 1130 | int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) | ||
| 1131 | { | ||
| 1132 | struct audit_aux_data_path *ax; | ||
| 1133 | struct audit_context *context = current->audit_context; | ||
| 1134 | |||
| 1135 | if (likely(!context)) | ||
| 1136 | return 0; | ||
| 1137 | |||
| 1138 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
| 1139 | if (!ax) | ||
| 1140 | return -ENOMEM; | ||
| 1141 | |||
| 1142 | ax->dentry = dget(dentry); | ||
| 1143 | ax->mnt = mntget(mnt); | ||
| 1144 | |||
| 1145 | ax->d.type = AUDIT_AVC_PATH; | ||
| 1146 | ax->d.next = context->aux; | ||
| 1147 | context->aux = (void *)ax; | ||
| 1148 | return 0; | ||
| 1149 | } | ||
| 1150 | |||
| 1151 | void audit_signal_info(int sig, struct task_struct *t) | ||
| 1152 | { | ||
| 1153 | extern pid_t audit_sig_pid; | ||
| 1154 | extern uid_t audit_sig_uid; | ||
| 1155 | |||
| 1156 | if (unlikely(audit_pid && t->pid == audit_pid)) { | ||
| 1157 | if (sig == SIGTERM || sig == SIGHUP) { | ||
| 1158 | struct audit_context *ctx = current->audit_context; | ||
| 1159 | audit_sig_pid = current->pid; | ||
| 1160 | if (ctx) | ||
| 1161 | audit_sig_uid = ctx->loginuid; | ||
| 1162 | else | ||
| 1163 | audit_sig_uid = current->uid; | ||
| 1164 | } | ||
| 1165 | } | ||
| 1166 | } | ||
| 1167 | |||
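The auditsc.c hunks above all follow one pattern: helpers such as audit_socketcall(), audit_sockaddr() and audit_avc_path() allocate a type-tagged record and push it onto context->aux; audit_log_exit() later walks that list once, emitting one record per entry, and audit_free_aux() releases whatever is left. The following is a minimal, self-contained user-space sketch of that linked, type-tagged aux list — a model of the shape of the code, not kernel source; the type names, field sizes and printed format are invented for the illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum aux_type { AUX_SOCKETCALL = 1, AUX_SOCKADDR = 2 };

struct aux_data {                      /* common header, like struct audit_aux_data */
        struct aux_data *next;
        int type;
};

struct aux_socketcall {                /* like struct audit_aux_data_socketcall */
        struct aux_data d;
        int nargs;
        unsigned long args[4];
};

struct context { struct aux_data *aux; };

static int add_socketcall(struct context *ctx, int nargs, const unsigned long *args)
{
        struct aux_socketcall *ax = malloc(sizeof(*ax));

        if (!ax)
                return -1;
        ax->nargs = nargs;
        memcpy(ax->args, args, nargs * sizeof(args[0]));
        ax->d.type = AUX_SOCKETCALL;
        ax->d.next = ctx->aux;         /* push onto the per-syscall list */
        ctx->aux = &ax->d;
        return 0;
}

static void log_exit(const struct context *ctx)
{
        const struct aux_data *aux;

        for (aux = ctx->aux; aux; aux = aux->next) {   /* one record per aux item */
                if (aux->type == AUX_SOCKETCALL) {
                        const struct aux_socketcall *ax = (const void *)aux;
                        int i;

                        printf("type=SOCKETCALL nargs=%d", ax->nargs);
                        for (i = 0; i < ax->nargs; i++)
                                printf(" a%d=%lx", i, ax->args[i]);
                        printf("\n");
                }
        }
}

static void free_aux(struct context *ctx)
{
        struct aux_data *aux;

        while ((aux = ctx->aux)) {     /* mirrors the shape of audit_free_aux() */
                ctx->aux = aux->next;
                free(aux);
        }
}

int main(void)
{
        struct context ctx = { .aux = NULL };
        unsigned long args[2] = { 1, 0x10 };

        add_socketcall(&ctx, 2, args);
        log_exit(&ctx);
        free_aux(&ctx);
        return 0;
}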
diff --git a/kernel/cpu.c b/kernel/cpu.c index 628f4ccda127..53d8263ae12e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -63,19 +63,15 @@ static int take_cpu_down(void *unused) | |||
| 63 | { | 63 | { |
| 64 | int err; | 64 | int err; |
| 65 | 65 | ||
| 66 | /* Take offline: makes arch_cpu_down somewhat easier. */ | ||
| 67 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
| 68 | |||
| 69 | /* Ensure this CPU doesn't handle any more interrupts. */ | 66 | /* Ensure this CPU doesn't handle any more interrupts. */ |
| 70 | err = __cpu_disable(); | 67 | err = __cpu_disable(); |
| 71 | if (err < 0) | 68 | if (err < 0) |
| 72 | cpu_set(smp_processor_id(), cpu_online_map); | 69 | return err; |
| 73 | else | ||
| 74 | /* Force idle task to run as soon as we yield: it should | ||
| 75 | immediately notice cpu is offline and die quickly. */ | ||
| 76 | sched_idle_next(); | ||
| 77 | 70 | ||
| 78 | return err; | 71 | /* Force idle task to run as soon as we yield: it should |
| 72 | immediately notice cpu is offline and die quickly. */ | ||
| 73 | sched_idle_next(); | ||
| 74 | return 0; | ||
| 79 | } | 75 | } |
| 80 | 76 | ||
| 81 | int cpu_down(unsigned int cpu) | 77 | int cpu_down(unsigned int cpu) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 00e8f2575512..984c0bf3807f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = { | |||
| 228 | 228 | ||
| 229 | static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) | 229 | static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) |
| 230 | { | 230 | { |
| 231 | struct qstr qstr; | 231 | struct dentry *d = lookup_one_len(name, parent, strlen(name)); |
| 232 | struct dentry *d; | ||
| 233 | |||
| 234 | qstr.name = name; | ||
| 235 | qstr.len = strlen(name); | ||
| 236 | qstr.hash = full_name_hash(name, qstr.len); | ||
| 237 | d = lookup_hash(&qstr, parent); | ||
| 238 | if (!IS_ERR(d)) | 232 | if (!IS_ERR(d)) |
| 239 | d->d_op = &cpuset_dops; | 233 | d->d_op = &cpuset_dops; |
| 240 | return d; | 234 | return d; |
| @@ -601,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 601 | return 0; | 595 | return 0; |
| 602 | } | 596 | } |
| 603 | 597 | ||
| 598 | /* | ||
| 599 | * For a given cpuset cur, partition the system as follows | ||
| 600 | * a. All cpus in the parent cpuset's cpus_allowed that are not part of any | ||
| 601 | * exclusive child cpusets | ||
| 602 | * b. All cpus in the current cpuset's cpus_allowed that are not part of any | ||
| 603 | * exclusive child cpusets | ||
| 604 | * Build these two partitions by calling partition_sched_domains | ||
| 605 | * | ||
| 606 | * Call with cpuset_sem held. May nest a call to the | ||
| 607 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | ||
| 608 | */ | ||
| 609 | static void update_cpu_domains(struct cpuset *cur) | ||
| 610 | { | ||
| 611 | struct cpuset *c, *par = cur->parent; | ||
| 612 | cpumask_t pspan, cspan; | ||
| 613 | |||
| 614 | if (par == NULL || cpus_empty(cur->cpus_allowed)) | ||
| 615 | return; | ||
| 616 | |||
| 617 | /* | ||
| 618 | * Get all cpus from parent's cpus_allowed not part of exclusive | ||
| 619 | * children | ||
| 620 | */ | ||
| 621 | pspan = par->cpus_allowed; | ||
| 622 | list_for_each_entry(c, &par->children, sibling) { | ||
| 623 | if (is_cpu_exclusive(c)) | ||
| 624 | cpus_andnot(pspan, pspan, c->cpus_allowed); | ||
| 625 | } | ||
| 626 | if (is_removed(cur) || !is_cpu_exclusive(cur)) { | ||
| 627 | cpus_or(pspan, pspan, cur->cpus_allowed); | ||
| 628 | if (cpus_equal(pspan, cur->cpus_allowed)) | ||
| 629 | return; | ||
| 630 | cspan = CPU_MASK_NONE; | ||
| 631 | } else { | ||
| 632 | if (cpus_empty(pspan)) | ||
| 633 | return; | ||
| 634 | cspan = cur->cpus_allowed; | ||
| 635 | /* | ||
| 636 | * Get all cpus from current cpuset's cpus_allowed not part | ||
| 637 | * of exclusive children | ||
| 638 | */ | ||
| 639 | list_for_each_entry(c, &cur->children, sibling) { | ||
| 640 | if (is_cpu_exclusive(c)) | ||
| 641 | cpus_andnot(cspan, cspan, c->cpus_allowed); | ||
| 642 | } | ||
| 643 | } | ||
| 644 | |||
| 645 | lock_cpu_hotplug(); | ||
| 646 | partition_sched_domains(&pspan, &cspan); | ||
| 647 | unlock_cpu_hotplug(); | ||
| 648 | } | ||
| 649 | |||
| 604 | static int update_cpumask(struct cpuset *cs, char *buf) | 650 | static int update_cpumask(struct cpuset *cs, char *buf) |
| 605 | { | 651 | { |
| 606 | struct cpuset trialcs; | 652 | struct cpuset trialcs; |
| 607 | int retval; | 653 | int retval, cpus_unchanged; |
| 608 | 654 | ||
| 609 | trialcs = *cs; | 655 | trialcs = *cs; |
| 610 | retval = cpulist_parse(buf, trialcs.cpus_allowed); | 656 | retval = cpulist_parse(buf, trialcs.cpus_allowed); |
| @@ -614,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 614 | if (cpus_empty(trialcs.cpus_allowed)) | 660 | if (cpus_empty(trialcs.cpus_allowed)) |
| 615 | return -ENOSPC; | 661 | return -ENOSPC; |
| 616 | retval = validate_change(cs, &trialcs); | 662 | retval = validate_change(cs, &trialcs); |
| 617 | if (retval == 0) | 663 | if (retval < 0) |
| 618 | cs->cpus_allowed = trialcs.cpus_allowed; | 664 | return retval; |
| 619 | return retval; | 665 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
| 666 | cs->cpus_allowed = trialcs.cpus_allowed; | ||
| 667 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | ||
| 668 | update_cpu_domains(cs); | ||
| 669 | return 0; | ||
| 620 | } | 670 | } |
| 621 | 671 | ||
| 622 | static int update_nodemask(struct cpuset *cs, char *buf) | 672 | static int update_nodemask(struct cpuset *cs, char *buf) |
| @@ -652,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
| 652 | { | 702 | { |
| 653 | int turning_on; | 703 | int turning_on; |
| 654 | struct cpuset trialcs; | 704 | struct cpuset trialcs; |
| 655 | int err; | 705 | int err, cpu_exclusive_changed; |
| 656 | 706 | ||
| 657 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); | 707 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); |
| 658 | 708 | ||
| @@ -663,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
| 663 | clear_bit(bit, &trialcs.flags); | 713 | clear_bit(bit, &trialcs.flags); |
| 664 | 714 | ||
| 665 | err = validate_change(cs, &trialcs); | 715 | err = validate_change(cs, &trialcs); |
| 666 | if (err == 0) { | 716 | if (err < 0) |
| 667 | if (turning_on) | 717 | return err; |
| 668 | set_bit(bit, &cs->flags); | 718 | cpu_exclusive_changed = |
| 669 | else | 719 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
| 670 | clear_bit(bit, &cs->flags); | 720 | if (turning_on) |
| 671 | } | 721 | set_bit(bit, &cs->flags); |
| 672 | return err; | 722 | else |
| 723 | clear_bit(bit, &cs->flags); | ||
| 724 | |||
| 725 | if (cpu_exclusive_changed) | ||
| 726 | update_cpu_domains(cs); | ||
| 727 | return 0; | ||
| 673 | } | 728 | } |
| 674 | 729 | ||
| 675 | static int attach_task(struct cpuset *cs, char *buf) | 730 | static int attach_task(struct cpuset *cs, char *buf) |
| @@ -1315,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1315 | up(&cpuset_sem); | 1370 | up(&cpuset_sem); |
| 1316 | return -EBUSY; | 1371 | return -EBUSY; |
| 1317 | } | 1372 | } |
| 1318 | spin_lock(&cs->dentry->d_lock); | ||
| 1319 | parent = cs->parent; | 1373 | parent = cs->parent; |
| 1320 | set_bit(CS_REMOVED, &cs->flags); | 1374 | set_bit(CS_REMOVED, &cs->flags); |
| 1375 | if (is_cpu_exclusive(cs)) | ||
| 1376 | update_cpu_domains(cs); | ||
| 1321 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1377 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
| 1322 | if (list_empty(&parent->children)) | 1378 | if (list_empty(&parent->children)) |
| 1323 | check_for_release(parent); | 1379 | check_for_release(parent); |
| 1380 | spin_lock(&cs->dentry->d_lock); | ||
| 1324 | d = dget(cs->dentry); | 1381 | d = dget(cs->dentry); |
| 1325 | cs->dentry = NULL; | 1382 | cs->dentry = NULL; |
| 1326 | spin_unlock(&d->d_lock); | 1383 | spin_unlock(&d->d_lock); |
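The update_cpu_domains() comment above describes the two CPU sets that get handed to partition_sched_domains(): pspan, the parent's cpus_allowed with every exclusive child's CPUs removed, and cspan, the current cpuset's cpus_allowed with its own exclusive children removed. The sketch below models just those two set computations in user-space C, with a plain bitmask standing in for cpumask_t and a made-up cpuset layout.

#include <stdio.h>

struct cs {
        unsigned long cpus_allowed;    /* bit i set => cpu i allowed */
        int cpu_exclusive;
};

int main(void)
{
        struct cs parent  = { .cpus_allowed = 0xff, .cpu_exclusive = 1 }; /* cpus 0-7 */
        struct cs cur     = { .cpus_allowed = 0x0f, .cpu_exclusive = 1 }; /* cpus 0-3 */
        struct cs sibling = { .cpus_allowed = 0x30, .cpu_exclusive = 1 }; /* cpus 4-5 */
        struct cs child   = { .cpus_allowed = 0x03, .cpu_exclusive = 1 }; /* cpus 0-1 */

        /* pspan: parent's cpus not owned by any exclusive child (cur, sibling). */
        unsigned long pspan = parent.cpus_allowed;
        pspan &= ~cur.cpus_allowed;
        pspan &= ~sibling.cpus_allowed;

        /* cspan: cur's cpus not owned by its own exclusive children. */
        unsigned long cspan = cur.cpus_allowed;
        cspan &= ~child.cpus_allowed;

        /* These two masks are what the kernel passes to partition_sched_domains(). */
        printf("pspan=%#lx cspan=%#lx\n", pspan, cspan);   /* prints pspan=0xc0 cspan=0xc */
        return 0;
}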
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 000000000000..459ba49e376a --- /dev/null +++ b/kernel/crash_dump.c | |||
| @@ -0,0 +1,52 @@ | |||
| 1 | /* | ||
| 2 | * kernel/crash_dump.c - Memory preserving reboot related code. | ||
| 3 | * | ||
| 4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
| 5 | * Copyright (C) IBM Corporation, 2004. All rights reserved | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/smp_lock.h> | ||
| 9 | #include <linux/errno.h> | ||
| 10 | #include <linux/proc_fs.h> | ||
| 11 | #include <linux/bootmem.h> | ||
| 12 | #include <linux/highmem.h> | ||
| 13 | #include <linux/crash_dump.h> | ||
| 14 | |||
| 15 | #include <asm/io.h> | ||
| 16 | #include <asm/uaccess.h> | ||
| 17 | |||
| 18 | /* Stores the physical address of elf header of crash image. */ | ||
| 19 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | ||
| 20 | |||
| 21 | /* | ||
| 22 | * Copy a page from "oldmem". For this page, there is no pte mapped | ||
| 23 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. | ||
| 24 | */ | ||
| 25 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | ||
| 26 | size_t csize, unsigned long offset, int userbuf) | ||
| 27 | { | ||
| 28 | void *page, *vaddr; | ||
| 29 | |||
| 30 | if (!csize) | ||
| 31 | return 0; | ||
| 32 | |||
| 33 | page = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 34 | if (!page) | ||
| 35 | return -ENOMEM; | ||
| 36 | |||
| 37 | vaddr = kmap_atomic_pfn(pfn, KM_PTE0); | ||
| 38 | copy_page(page, vaddr); | ||
| 39 | kunmap_atomic(vaddr, KM_PTE0); | ||
| 40 | |||
| 41 | if (userbuf) { | ||
| 42 | if (copy_to_user(buf, (page + offset), csize)) { | ||
| 43 | kfree(page); | ||
| 44 | return -EFAULT; | ||
| 45 | } | ||
| 46 | } else { | ||
| 47 | memcpy(buf, (page + offset), csize); | ||
| 48 | } | ||
| 49 | |||
| 50 | kfree(page); | ||
| 51 | return csize; | ||
| 52 | } | ||
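copy_oldmem_page() above works strictly one page at a time: the caller supplies a pfn, an offset into that page and a byte count no larger than the rest of the page. Reading an arbitrary byte range therefore means slicing it at page boundaries. The sketch below is a self-contained user-space model of that slicing; the fake backing array, the simplified signature and the read_oldmem() name are assumptions made for the example, not part of the kernel interface.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL

static char oldmem[4 * PAGE_SIZE];             /* stands in for the old kernel's RAM */

/* Same shape as copy_oldmem_page(): copy csize bytes out of a single page. */
static long copy_oldmem_page(unsigned long pfn, char *buf,
                             size_t csize, unsigned long offset)
{
        if (!csize)
                return 0;
        memcpy(buf, oldmem + pfn * PAGE_SIZE + offset, csize);
        return (long)csize;
}

/* Read `count` bytes starting at "physical address" paddr, page by page. */
static long read_oldmem(char *buf, size_t count, unsigned long paddr)
{
        long total = 0;

        while (count) {
                unsigned long pfn = paddr / PAGE_SIZE;
                unsigned long offset = paddr % PAGE_SIZE;
                size_t csize = count < PAGE_SIZE - offset ? count : PAGE_SIZE - offset;
                long rc = copy_oldmem_page(pfn, buf, csize, offset);

                if (rc < 0)
                        return rc;
                buf += rc;
                paddr += rc;
                count -= rc;
                total += rc;
        }
        return total;
}

int main(void)
{
        char out[32];

        strcpy(oldmem + PAGE_SIZE - 4, "crash-dump");  /* spans a page boundary */
        read_oldmem(out, sizeof("crash-dump"), PAGE_SIZE - 4);
        printf("%s\n", out);                           /* prints "crash-dump" */
        return 0;
}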
diff --git a/kernel/exit.c b/kernel/exit.c index edaa50b5bbfa..3ebcd60a19c6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -72,6 +72,11 @@ repeat: | |||
| 72 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 72 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
| 73 | __exit_signal(p); | 73 | __exit_signal(p); |
| 74 | __exit_sighand(p); | 74 | __exit_sighand(p); |
| 75 | /* | ||
| 76 | * Note that the fastpath in sys_times depends on __exit_signal having | ||
| 77 | * updated the counters before a task is removed from the tasklist of | ||
| 78 | * the process by __unhash_process. | ||
| 79 | */ | ||
| 75 | __unhash_process(p); | 80 | __unhash_process(p); |
| 76 | 81 | ||
| 77 | /* | 82 | /* |
| @@ -793,6 +798,17 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 793 | ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); | 798 | ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); |
| 794 | } | 799 | } |
| 795 | 800 | ||
| 801 | /* | ||
| 802 | * We're taking recursive faults here in do_exit. Safest is to just | ||
| 803 | * leave this task alone and wait for reboot. | ||
| 804 | */ | ||
| 805 | if (unlikely(tsk->flags & PF_EXITING)) { | ||
| 806 | printk(KERN_ALERT | ||
| 807 | "Fixing recursive fault but reboot is needed!\n"); | ||
| 808 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 809 | schedule(); | ||
| 810 | } | ||
| 811 | |||
| 796 | tsk->flags |= PF_EXITING; | 812 | tsk->flags |= PF_EXITING; |
| 797 | 813 | ||
| 798 | /* | 814 | /* |
| @@ -811,10 +827,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 811 | acct_update_integrals(tsk); | 827 | acct_update_integrals(tsk); |
| 812 | update_mem_hiwater(tsk); | 828 | update_mem_hiwater(tsk); |
| 813 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 829 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
| 814 | if (group_dead) { | 830 | if (group_dead) |
| 815 | del_timer_sync(&tsk->signal->real_timer); | ||
| 816 | acct_process(code); | 831 | acct_process(code); |
| 817 | } | ||
| 818 | exit_mm(tsk); | 832 | exit_mm(tsk); |
| 819 | 833 | ||
| 820 | exit_sem(tsk); | 834 | exit_sem(tsk); |
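The do_exit() hunk above adds a guard for recursive faults: if a task re-enters the exit path while its exiting flag is already set, it is parked instead of running the teardown a second time. Below is a minimal user-space model of that run-once guard; the flag value and the park action are stand-ins for the kernel's PF_EXITING and set_current_state(TASK_UNINTERRUPTIBLE)/schedule().

#include <stdio.h>

#define PF_EXITING 0x1UL

struct task { unsigned long flags; };

static void park_forever(struct task *t)
{
        (void)t;                               /* nothing to do in the model */
        printf("recursive exit detected, parking task\n");
}

static void do_exit_model(struct task *t)
{
        if (t->flags & PF_EXITING) {           /* second entry => fault during exit */
                park_forever(t);
                return;
        }
        t->flags |= PF_EXITING;                /* first (and only) teardown pass */
        printf("tearing down task state\n");
}

int main(void)
{
        struct task t = { .flags = 0 };

        do_exit_model(&t);     /* normal exit path */
        do_exit_model(&t);     /* a fault during exit re-enters: guard trips */
        return 0;
}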
diff --git a/kernel/fork.c b/kernel/fork.c index f42a17f88699..2c7806873bfd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
| 194 | mm->mmap = NULL; | 194 | mm->mmap = NULL; |
| 195 | mm->mmap_cache = NULL; | 195 | mm->mmap_cache = NULL; |
| 196 | mm->free_area_cache = oldmm->mmap_base; | 196 | mm->free_area_cache = oldmm->mmap_base; |
| 197 | mm->cached_hole_size = ~0UL; | ||
| 197 | mm->map_count = 0; | 198 | mm->map_count = 0; |
| 198 | set_mm_counter(mm, rss, 0); | 199 | set_mm_counter(mm, rss, 0); |
| 199 | set_mm_counter(mm, anon_rss, 0); | 200 | set_mm_counter(mm, anon_rss, 0); |
| @@ -249,8 +250,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
| 249 | 250 | ||
| 250 | /* | 251 | /* |
| 251 | * Link in the new vma and copy the page table entries: | 252 | * Link in the new vma and copy the page table entries: |
| 252 | * link in first so that swapoff can see swap entries, | 253 | * link in first so that swapoff can see swap entries. |
| 253 | * and try_to_unmap_one's find_vma find the new vma. | 254 | * Note that, exceptionally, here the vma is inserted |
| 255 | * without holding mm->mmap_sem. | ||
| 254 | */ | 256 | */ |
| 255 | spin_lock(&mm->page_table_lock); | 257 | spin_lock(&mm->page_table_lock); |
| 256 | *pprev = tmp; | 258 | *pprev = tmp; |
| @@ -322,6 +324,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) | |||
| 322 | mm->ioctx_list = NULL; | 324 | mm->ioctx_list = NULL; |
| 323 | mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); | 325 | mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); |
| 324 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 326 | mm->free_area_cache = TASK_UNMAPPED_BASE; |
| 327 | mm->cached_hole_size = ~0UL; | ||
| 325 | 328 | ||
| 326 | if (likely(!mm_alloc_pgd(mm))) { | 329 | if (likely(!mm_alloc_pgd(mm))) { |
| 327 | mm->def_flags = 0; | 330 | mm->def_flags = 0; |
| @@ -1000,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1000 | p->pdeath_signal = 0; | 1003 | p->pdeath_signal = 0; |
| 1001 | p->exit_state = 0; | 1004 | p->exit_state = 0; |
| 1002 | 1005 | ||
| 1003 | /* Perform scheduler related setup */ | ||
| 1004 | sched_fork(p); | ||
| 1005 | |||
| 1006 | /* | 1006 | /* |
| 1007 | * Ok, make it visible to the rest of the system. | 1007 | * Ok, make it visible to the rest of the system. |
| 1008 | * We dont wake it up yet. | 1008 | * We dont wake it up yet. |
| @@ -1011,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1011 | INIT_LIST_HEAD(&p->ptrace_children); | 1011 | INIT_LIST_HEAD(&p->ptrace_children); |
| 1012 | INIT_LIST_HEAD(&p->ptrace_list); | 1012 | INIT_LIST_HEAD(&p->ptrace_list); |
| 1013 | 1013 | ||
| 1014 | /* Perform scheduler related setup. Assign this task to a CPU. */ | ||
| 1015 | sched_fork(p, clone_flags); | ||
| 1016 | |||
| 1014 | /* Need tasklist lock for parent etc handling! */ | 1017 | /* Need tasklist lock for parent etc handling! */ |
| 1015 | write_lock_irq(&tasklist_lock); | 1018 | write_lock_irq(&tasklist_lock); |
| 1016 | 1019 | ||
| 1017 | /* | 1020 | /* |
| 1018 | * The task hasn't been attached yet, so cpus_allowed mask cannot | 1021 | * The task hasn't been attached yet, so its cpus_allowed mask will |
| 1019 | * have changed. The cpus_allowed mask of the parent may have | 1022 | * not be changed, nor will its assigned CPU. |
| 1020 | * changed after it was copied first time, and it may then move to | 1023 | * |
| 1021 | * another CPU - so we re-copy it here and set the child's CPU to | 1024 | * The cpus_allowed mask of the parent may have changed after it was |
| 1022 | * the parent's CPU. This avoids alot of nasty races. | 1025 | * copied first time - so re-copy it here, then check the child's CPU |
| 1026 | * to ensure it is on a valid CPU (and if not, just force it back to | ||
| 1027 | * parent's CPU). This avoids a lot of nasty races. | ||
| 1023 | */ | 1028 | */ |
| 1024 | p->cpus_allowed = current->cpus_allowed; | 1029 | p->cpus_allowed = current->cpus_allowed; |
| 1025 | set_task_cpu(p, smp_processor_id()); | 1030 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) |
| 1031 | set_task_cpu(p, smp_processor_id()); | ||
| 1026 | 1032 | ||
| 1027 | /* | 1033 | /* |
| 1028 | * Check for pending SIGKILL! The new thread should not be allowed | 1034 | * Check for pending SIGKILL! The new thread should not be allowed |
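The copy_process() hunk above changes the CPU fixup after re-copying cpus_allowed: the child keeps the CPU assigned by sched_fork() unless that CPU is no longer in the mask, in which case it falls back to the parent's current CPU. A tiny user-space sketch of just that check, with a bitmask in place of cpumask_t and invented CPU numbers:

#include <stdio.h>

int main(void)
{
        unsigned long cpus_allowed = 0x6UL;  /* parent's mask now allows cpus 1 and 2 */
        int child_cpu = 0;                   /* cpu picked for the child at sched_fork() time */
        int parent_cpu = 2;                  /* cpu the parent is forking on */

        /* like: if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) */
        if (!((cpus_allowed >> child_cpu) & 1UL))
                child_cpu = parent_cpu;      /* fall back to the parent's CPU */

        printf("child will run on cpu %d\n", child_cpu);   /* prints 2 */
        return 0;
}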
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 06b5a6323998..436c7d93c00a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -119,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
| 119 | */ | 119 | */ |
| 120 | desc->handler->ack(irq); | 120 | desc->handler->ack(irq); |
| 121 | action_ret = handle_IRQ_event(irq, regs, desc->action); | 121 | action_ret = handle_IRQ_event(irq, regs, desc->action); |
| 122 | if (!noirqdebug) | ||
| 123 | note_interrupt(irq, desc, action_ret); | ||
| 124 | desc->handler->end(irq); | 122 | desc->handler->end(irq); |
| 125 | return 1; | 123 | return 1; |
| 126 | } | 124 | } |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5202e4c4a5b6..ac6700985705 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | * This file contains driver APIs to the irq subsystem. | 6 | * This file contains driver APIs to the irq subsystem. |
| 7 | */ | 7 | */ |
| 8 | 8 | ||
| 9 | #include <linux/config.h> | ||
| 9 | #include <linux/irq.h> | 10 | #include <linux/irq.h> |
| 10 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 11 | #include <linux/random.h> | 12 | #include <linux/random.h> |
| @@ -255,6 +256,13 @@ void free_irq(unsigned int irq, void *dev_id) | |||
| 255 | 256 | ||
| 256 | /* Found it - now remove it from the list of entries */ | 257 | /* Found it - now remove it from the list of entries */ |
| 257 | *pp = action->next; | 258 | *pp = action->next; |
| 259 | |||
| 260 | /* Currently used only by UML, might disappear one day.*/ | ||
| 261 | #ifdef CONFIG_IRQ_RELEASE_METHOD | ||
| 262 | if (desc->handler->release) | ||
| 263 | desc->handler->release(irq, dev_id); | ||
| 264 | #endif | ||
| 265 | |||
| 258 | if (!desc->action) { | 266 | if (!desc->action) { |
| 259 | desc->status |= IRQ_DISABLED; | 267 | desc->status |= IRQ_DISABLED; |
| 260 | if (desc->handler->shutdown) | 268 | if (desc->handler->shutdown) |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index f6297c306905..ba039e827d58 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
| @@ -45,7 +45,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
| 45 | } | 45 | } |
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 48 | static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) |
| 49 | { | 49 | { |
| 50 | static int count = 100; | 50 | static int count = 100; |
| 51 | 51 | ||
diff --git a/kernel/kexec.c b/kernel/kexec.c new file mode 100644 index 000000000000..7843548cf2d9 --- /dev/null +++ b/kernel/kexec.c | |||
| @@ -0,0 +1,1063 @@ | |||
| 1 | /* | ||
| 2 | * kexec.c - kexec system call | ||
| 3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | ||
| 4 | * | ||
| 5 | * This source code is licensed under the GNU General Public License, | ||
| 6 | * Version 2. See the file COPYING for more details. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/mm.h> | ||
| 10 | #include <linux/file.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | #include <linux/fs.h> | ||
| 13 | #include <linux/kexec.h> | ||
| 14 | #include <linux/spinlock.h> | ||
| 15 | #include <linux/list.h> | ||
| 16 | #include <linux/highmem.h> | ||
| 17 | #include <linux/syscalls.h> | ||
| 18 | #include <linux/reboot.h> | ||
| 19 | #include <linux/syscalls.h> | ||
| 20 | #include <linux/ioport.h> | ||
| 21 | #include <linux/hardirq.h> | ||
| 22 | |||
| 23 | #include <asm/page.h> | ||
| 24 | #include <asm/uaccess.h> | ||
| 25 | #include <asm/io.h> | ||
| 26 | #include <asm/system.h> | ||
| 27 | #include <asm/semaphore.h> | ||
| 28 | |||
| 29 | /* Location of the reserved area for the crash kernel */ | ||
| 30 | struct resource crashk_res = { | ||
| 31 | .name = "Crash kernel", | ||
| 32 | .start = 0, | ||
| 33 | .end = 0, | ||
| 34 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
| 35 | }; | ||
| 36 | |||
| 37 | int kexec_should_crash(struct task_struct *p) | ||
| 38 | { | ||
| 39 | if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) | ||
| 40 | return 1; | ||
| 41 | return 0; | ||
| 42 | } | ||
| 43 | |||
| 44 | /* | ||
| 45 | * When kexec transitions to the new kernel there is a one-to-one | ||
| 46 | * mapping between physical and virtual addresses. On processors | ||
| 47 | * where you can disable the MMU this is trivial, and easy. For | ||
| 48 | * others it is still a simple predictable page table to setup. | ||
| 49 | * | ||
| 50 | * In that environment kexec copies the new kernel to its final | ||
| 51 | * resting place. This means I can only support memory whose | ||
| 52 | * physical address can fit in an unsigned long. In particular | ||
| 53 | * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. | ||
| 54 | * If the assembly stub has more restrictive requirements | ||
| 55 | * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be | ||
| 56 | * defined more restrictively in <asm/kexec.h>. | ||
| 57 | * | ||
| 58 | * The code for the transition from the current kernel to | ||
| 59 | * the new kernel is placed in the control_code_buffer, whose size | ||
| 60 | * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single | ||
| 61 | * page of memory is necessary, but some architectures require more. | ||
| 62 | * Because this memory must be identity mapped in the transition from | ||
| 63 | * virtual to physical addresses it must live in the range | ||
| 64 | * 0 - TASK_SIZE, as only the user space mappings are arbitrarily | ||
| 65 | * modifiable. | ||
| 66 | * | ||
| 67 | * The assembly stub in the control code buffer is passed a linked list | ||
| 68 | * of descriptor pages detailing the source pages of the new kernel, | ||
| 69 | * and the destination addresses of those source pages. As this data | ||
| 70 | * structure is not used in the context of the current OS, it must | ||
| 71 | * be self-contained. | ||
| 72 | * | ||
| 73 | * The code has been made to work with highmem pages and will use a | ||
| 74 | * destination page in its final resting place (if it happens | ||
| 75 | * to allocate it). The end product of this is that most of the | ||
| 76 | * physical address space, and most of RAM can be used. | ||
| 77 | * | ||
| 78 | * Future directions include: | ||
| 79 | * - allocating a page table with the control code buffer identity | ||
| 80 | * mapped, to simplify machine_kexec and make kexec_on_panic more | ||
| 81 | * reliable. | ||
| 82 | */ | ||
| 83 | |||
| 84 | /* | ||
| 85 | * KIMAGE_NO_DEST is an impossible destination address..., for | ||
| 86 | * allocating pages whose destination address we do not care about. | ||
| 87 | */ | ||
| 88 | #define KIMAGE_NO_DEST (-1UL) | ||
| 89 | |||
| 90 | static int kimage_is_destination_range(struct kimage *image, | ||
| 91 | unsigned long start, unsigned long end); | ||
| 92 | static struct page *kimage_alloc_page(struct kimage *image, | ||
| 93 | unsigned int gfp_mask, | ||
| 94 | unsigned long dest); | ||
| 95 | |||
| 96 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | ||
| 97 | unsigned long nr_segments, | ||
| 98 | struct kexec_segment __user *segments) | ||
| 99 | { | ||
| 100 | size_t segment_bytes; | ||
| 101 | struct kimage *image; | ||
| 102 | unsigned long i; | ||
| 103 | int result; | ||
| 104 | |||
| 105 | /* Allocate a controlling structure */ | ||
| 106 | result = -ENOMEM; | ||
| 107 | image = kmalloc(sizeof(*image), GFP_KERNEL); | ||
| 108 | if (!image) | ||
| 109 | goto out; | ||
| 110 | |||
| 111 | memset(image, 0, sizeof(*image)); | ||
| 112 | image->head = 0; | ||
| 113 | image->entry = &image->head; | ||
| 114 | image->last_entry = &image->head; | ||
| 115 | image->control_page = ~0; /* By default this does not apply */ | ||
| 116 | image->start = entry; | ||
| 117 | image->type = KEXEC_TYPE_DEFAULT; | ||
| 118 | |||
| 119 | /* Initialize the list of control pages */ | ||
| 120 | INIT_LIST_HEAD(&image->control_pages); | ||
| 121 | |||
| 122 | /* Initialize the list of destination pages */ | ||
| 123 | INIT_LIST_HEAD(&image->dest_pages); | ||
| 124 | |||
| 125 | /* Initialize the list of unuseable pages */ | ||
| 126 | INIT_LIST_HEAD(&image->unuseable_pages); | ||
| 127 | |||
| 128 | /* Read in the segments */ | ||
| 129 | image->nr_segments = nr_segments; | ||
| 130 | segment_bytes = nr_segments * sizeof(*segments); | ||
| 131 | result = copy_from_user(image->segment, segments, segment_bytes); | ||
| 132 | if (result) | ||
| 133 | goto out; | ||
| 134 | |||
| 135 | /* | ||
| 136 | * Verify we have good destination addresses. The caller is | ||
| 137 | * responsible for making certain we don't attempt to load | ||
| 138 | * the new image into invalid or reserved areas of RAM. This | ||
| 139 | * just verifies it is an address we can use. | ||
| 140 | * | ||
| 141 | * Since the kernel does everything in page size chunks, ensure | ||
| 142 | * the destination addresses are page aligned. Too many | ||
| 143 | * special cases crop up when we don't do this. The most | ||
| 144 | * insidious is getting overlapping destination addresses | ||
| 145 | * simply because addresses are changed to page size | ||
| 146 | * granularity. | ||
| 147 | */ | ||
| 148 | result = -EADDRNOTAVAIL; | ||
| 149 | for (i = 0; i < nr_segments; i++) { | ||
| 150 | unsigned long mstart, mend; | ||
| 151 | |||
| 152 | mstart = image->segment[i].mem; | ||
| 153 | mend = mstart + image->segment[i].memsz; | ||
| 154 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) | ||
| 155 | goto out; | ||
| 156 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) | ||
| 157 | goto out; | ||
| 158 | } | ||
| 159 | |||
| 160 | /* Verify our destination addresses do not overlap. | ||
| 161 | * If we allowed overlapping destination addresses | ||
| 162 | * through, very weird things can happen with no | ||
| 163 | * easy explanation as one segment stomps on another. | ||
| 164 | */ | ||
| 165 | result = -EINVAL; | ||
| 166 | for (i = 0; i < nr_segments; i++) { | ||
| 167 | unsigned long mstart, mend; | ||
| 168 | unsigned long j; | ||
| 169 | |||
| 170 | mstart = image->segment[i].mem; | ||
| 171 | mend = mstart + image->segment[i].memsz; | ||
| 172 | for (j = 0; j < i; j++) { | ||
| 173 | unsigned long pstart, pend; | ||
| 174 | pstart = image->segment[j].mem; | ||
| 175 | pend = pstart + image->segment[j].memsz; | ||
| 176 | /* Do the segments overlap ? */ | ||
| 177 | if ((mend > pstart) && (mstart < pend)) | ||
| 178 | goto out; | ||
| 179 | } | ||
| 180 | } | ||
| 181 | |||
| 182 | /* Ensure our buffer sizes do not exceed | ||
| 183 | * our memory sizes. This should always be the case, | ||
| 184 | * and it is easier to check up front than to be surprised | ||
| 185 | * later on. | ||
| 186 | */ | ||
| 187 | result = -EINVAL; | ||
| 188 | for (i = 0; i < nr_segments; i++) { | ||
| 189 | if (image->segment[i].bufsz > image->segment[i].memsz) | ||
| 190 | goto out; | ||
| 191 | } | ||
| 192 | |||
| 193 | result = 0; | ||
| 194 | out: | ||
| 195 | if (result == 0) | ||
| 196 | *rimage = image; | ||
| 197 | else | ||
| 198 | kfree(image); | ||
| 199 | |||
| 200 | return result; | ||
| 201 | |||
| 202 | } | ||
| 203 | |||
| 204 | static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | ||
| 205 | unsigned long nr_segments, | ||
| 206 | struct kexec_segment __user *segments) | ||
| 207 | { | ||
| 208 | int result; | ||
| 209 | struct kimage *image; | ||
| 210 | |||
| 211 | /* Allocate and initialize a controlling structure */ | ||
| 212 | image = NULL; | ||
| 213 | result = do_kimage_alloc(&image, entry, nr_segments, segments); | ||
| 214 | if (result) | ||
| 215 | goto out; | ||
| 216 | |||
| 217 | *rimage = image; | ||
| 218 | |||
| 219 | /* | ||
| 220 | * Find a location for the control code buffer, and add it to | ||
| 221 | * the vector of segments so that its pages will also be | ||
| 222 | * counted as destination pages. | ||
| 223 | */ | ||
| 224 | result = -ENOMEM; | ||
| 225 | image->control_code_page = kimage_alloc_control_pages(image, | ||
| 226 | get_order(KEXEC_CONTROL_CODE_SIZE)); | ||
| 227 | if (!image->control_code_page) { | ||
| 228 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | ||
| 229 | goto out; | ||
| 230 | } | ||
| 231 | |||
| 232 | result = 0; | ||
| 233 | out: | ||
| 234 | if (result == 0) | ||
| 235 | *rimage = image; | ||
| 236 | else | ||
| 237 | kfree(image); | ||
| 238 | |||
| 239 | return result; | ||
| 240 | } | ||
| 241 | |||
| 242 | static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | ||
| 243 | unsigned long nr_segments, | ||
| 244 | struct kexec_segment *segments) | ||
| 245 | { | ||
| 246 | int result; | ||
| 247 | struct kimage *image; | ||
| 248 | unsigned long i; | ||
| 249 | |||
| 250 | image = NULL; | ||
| 251 | /* Verify we have a valid entry point */ | ||
| 252 | if ((entry < crashk_res.start) || (entry > crashk_res.end)) { | ||
| 253 | result = -EADDRNOTAVAIL; | ||
| 254 | goto out; | ||
| 255 | } | ||
| 256 | |||
| 257 | /* Allocate and initialize a controlling structure */ | ||
| 258 | result = do_kimage_alloc(&image, entry, nr_segments, segments); | ||
| 259 | if (result) | ||
| 260 | goto out; | ||
| 261 | |||
| 262 | /* Enable the special crash kernel control page | ||
| 263 | * allocation policy. | ||
| 264 | */ | ||
| 265 | image->control_page = crashk_res.start; | ||
| 266 | image->type = KEXEC_TYPE_CRASH; | ||
| 267 | |||
| 268 | /* | ||
| 269 | * Verify we have good destination addresses. Normally | ||
| 270 | * the caller is responsible for making certain we don't | ||
| 271 | * attempt to load the new image into invalid or reserved | ||
| 272 | * areas of RAM. But crash kernels are preloaded into a | ||
| 273 | * reserved area of ram. We must ensure the addresses | ||
| 274 | * are in the reserved area otherwise preloading the | ||
| 275 | * kernel could corrupt things. | ||
| 276 | */ | ||
| 277 | result = -EADDRNOTAVAIL; | ||
| 278 | for (i = 0; i < nr_segments; i++) { | ||
| 279 | unsigned long mstart, mend; | ||
| 280 | |||
| 281 | mstart = image->segment[i].mem; | ||
| 282 | mend = mstart + image->segment[i].memsz - 1; | ||
| 283 | /* Ensure we are within the crash kernel limits */ | ||
| 284 | if ((mstart < crashk_res.start) || (mend > crashk_res.end)) | ||
| 285 | goto out; | ||
| 286 | } | ||
| 287 | |||
| 288 | /* | ||
| 289 | * Find a location for the control code buffer, and add it to | ||
| 290 | * the vector of segments so that its pages will also be | ||
| 291 | * counted as destination pages. | ||
| 292 | */ | ||
| 293 | result = -ENOMEM; | ||
| 294 | image->control_code_page = kimage_alloc_control_pages(image, | ||
| 295 | get_order(KEXEC_CONTROL_CODE_SIZE)); | ||
| 296 | if (!image->control_code_page) { | ||
| 297 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | ||
| 298 | goto out; | ||
| 299 | } | ||
| 300 | |||
| 301 | result = 0; | ||
| 302 | out: | ||
| 303 | if (result == 0) | ||
| 304 | *rimage = image; | ||
| 305 | else | ||
| 306 | kfree(image); | ||
| 307 | |||
| 308 | return result; | ||
| 309 | } | ||
| 310 | |||
| 311 | static int kimage_is_destination_range(struct kimage *image, | ||
| 312 | unsigned long start, | ||
| 313 | unsigned long end) | ||
| 314 | { | ||
| 315 | unsigned long i; | ||
| 316 | |||
| 317 | for (i = 0; i < image->nr_segments; i++) { | ||
| 318 | unsigned long mstart, mend; | ||
| 319 | |||
| 320 | mstart = image->segment[i].mem; | ||
| 321 | mend = mstart + image->segment[i].memsz; | ||
| 322 | if ((end > mstart) && (start < mend)) | ||
| 323 | return 1; | ||
| 324 | } | ||
| 325 | |||
| 326 | return 0; | ||
| 327 | } | ||
| 328 | |||
| 329 | static struct page *kimage_alloc_pages(unsigned int gfp_mask, | ||
| 330 | unsigned int order) | ||
| 331 | { | ||
| 332 | struct page *pages; | ||
| 333 | |||
| 334 | pages = alloc_pages(gfp_mask, order); | ||
| 335 | if (pages) { | ||
| 336 | unsigned int count, i; | ||
| 337 | pages->mapping = NULL; | ||
| 338 | pages->private = order; | ||
| 339 | count = 1 << order; | ||
| 340 | for (i = 0; i < count; i++) | ||
| 341 | SetPageReserved(pages + i); | ||
| 342 | } | ||
| 343 | |||
| 344 | return pages; | ||
| 345 | } | ||
| 346 | |||
| 347 | static void kimage_free_pages(struct page *page) | ||
| 348 | { | ||
| 349 | unsigned int order, count, i; | ||
| 350 | |||
| 351 | order = page->private; | ||
| 352 | count = 1 << order; | ||
| 353 | for (i = 0; i < count; i++) | ||
| 354 | ClearPageReserved(page + i); | ||
| 355 | __free_pages(page, order); | ||
| 356 | } | ||
| 357 | |||
| 358 | static void kimage_free_page_list(struct list_head *list) | ||
| 359 | { | ||
| 360 | struct list_head *pos, *next; | ||
| 361 | |||
| 362 | list_for_each_safe(pos, next, list) { | ||
| 363 | struct page *page; | ||
| 364 | |||
| 365 | page = list_entry(pos, struct page, lru); | ||
| 366 | list_del(&page->lru); | ||
| 367 | kimage_free_pages(page); | ||
| 368 | } | ||
| 369 | } | ||
| 370 | |||
| 371 | static struct page *kimage_alloc_normal_control_pages(struct kimage *image, | ||
| 372 | unsigned int order) | ||
| 373 | { | ||
| 374 | /* Control pages are special, they are the intermediaries | ||
| 375 | * that are needed while we copy the rest of the pages | ||
| 376 | * to their final resting place. As such they must | ||
| 377 | * not conflict with either the destination addresses | ||
| 378 | * or memory the kernel is already using. | ||
| 379 | * | ||
| 380 | * The only case where we really need more than one of | ||
| 381 | * these is for architectures where we cannot disable | ||
| 382 | * the MMU and must instead generate an identity mapped | ||
| 383 | * page table for all of the memory. | ||
| 384 | * | ||
| 385 | * At worst this runs in O(N) of the image size. | ||
| 386 | */ | ||
| 387 | struct list_head extra_pages; | ||
| 388 | struct page *pages; | ||
| 389 | unsigned int count; | ||
| 390 | |||
| 391 | count = 1 << order; | ||
| 392 | INIT_LIST_HEAD(&extra_pages); | ||
| 393 | |||
| 394 | /* Loop while I can allocate a page and the page allocated | ||
| 395 | * is a destination page. | ||
| 396 | */ | ||
| 397 | do { | ||
| 398 | unsigned long pfn, epfn, addr, eaddr; | ||
| 399 | |||
| 400 | pages = kimage_alloc_pages(GFP_KERNEL, order); | ||
| 401 | if (!pages) | ||
| 402 | break; | ||
| 403 | pfn = page_to_pfn(pages); | ||
| 404 | epfn = pfn + count; | ||
| 405 | addr = pfn << PAGE_SHIFT; | ||
| 406 | eaddr = epfn << PAGE_SHIFT; | ||
| 407 | if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || | ||
| 408 | kimage_is_destination_range(image, addr, eaddr)) { | ||
| 409 | list_add(&pages->lru, &extra_pages); | ||
| 410 | pages = NULL; | ||
| 411 | } | ||
| 412 | } while (!pages); | ||
| 413 | |||
| 414 | if (pages) { | ||
| 415 | /* Remember the allocated page... */ | ||
| 416 | list_add(&pages->lru, &image->control_pages); | ||
| 417 | |||
| 418 | /* Because the page is already in its destination | ||
| 419 | * location we will never allocate another page at | ||
| 420 | * that address. Therefore kimage_alloc_pages | ||
| 421 | * will not return it (again) and we don't need | ||
| 422 | * to give it an entry in image->segment[]. | ||
| 423 | */ | ||
| 424 | } | ||
| 425 | /* Deal with the destination pages I have inadvertently allocated. | ||
| 426 | * | ||
| 427 | * Ideally I would convert multi-page allocations into single | ||
| 428 | * page allocations, and add everything to image->dest_pages. | ||
| 429 | * | ||
| 430 | * For now it is simpler to just free the pages. | ||
| 431 | */ | ||
| 432 | kimage_free_page_list(&extra_pages); | ||
| 433 | |||
| 434 | return pages; | ||
| 435 | } | ||
| 436 | |||
| 437 | static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | ||
| 438 | unsigned int order) | ||
| 439 | { | ||
| 440 | /* Control pages are special, they are the intermediaries | ||
| 441 | * that are needed while we copy the rest of the pages | ||
| 442 | * to their final resting place. As such they must | ||
| 443 | * not conflict with either the destination addresses | ||
| 444 | * or memory the kernel is already using. | ||
| 445 | * | ||
| 446 | * Control pages are also the only pages we must allocate | ||
| 447 | * when loading a crash kernel. All of the other pages | ||
| 448 | * are specified by the segments and we just memcpy | ||
| 449 | * into them directly. | ||
| 450 | * | ||
| 451 | * The only case where we really need more than one of | ||
| 452 | * these is for architectures where we cannot disable | ||
| 453 | * the MMU and must instead generate an identity mapped | ||
| 454 | * page table for all of the memory. | ||
| 455 | * | ||
| 456 | * Given the low demand this implements a very simple | ||
| 457 | * allocator that finds the first hole of the appropriate | ||
| 458 | * size in the reserved memory region, and allocates all | ||
| 459 | * of the memory up to and including the hole. | ||
| 460 | */ | ||
| 461 | unsigned long hole_start, hole_end, size; | ||
| 462 | struct page *pages; | ||
| 463 | |||
| 464 | pages = NULL; | ||
| 465 | size = (1 << order) << PAGE_SHIFT; | ||
| 466 | hole_start = (image->control_page + (size - 1)) & ~(size - 1); | ||
| 467 | hole_end = hole_start + size - 1; | ||
| 468 | while (hole_end <= crashk_res.end) { | ||
| 469 | unsigned long i; | ||
| 470 | |||
| 471 | if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) | ||
| 472 | break; | ||
| 473 | if (hole_end > crashk_res.end) | ||
| 474 | break; | ||
| 475 | /* See if I overlap any of the segments */ | ||
| 476 | for (i = 0; i < image->nr_segments; i++) { | ||
| 477 | unsigned long mstart, mend; | ||
| 478 | |||
| 479 | mstart = image->segment[i].mem; | ||
| 480 | mend = mstart + image->segment[i].memsz - 1; | ||
| 481 | if ((hole_end >= mstart) && (hole_start <= mend)) { | ||
| 482 | /* Advance the hole to the end of the segment */ | ||
| 483 | hole_start = (mend + (size - 1)) & ~(size - 1); | ||
| 484 | hole_end = hole_start + size - 1; | ||
| 485 | break; | ||
| 486 | } | ||
| 487 | } | ||
| 488 | /* If I don't overlap any segments I have found my hole! */ | ||
| 489 | if (i == image->nr_segments) { | ||
| 490 | pages = pfn_to_page(hole_start >> PAGE_SHIFT); | ||
| 491 | break; | ||
| 492 | } | ||
| 493 | } | ||
| 494 | if (pages) | ||
| 495 | image->control_page = hole_end; | ||
| 496 | |||
| 497 | return pages; | ||
| 498 | } | ||
| 499 | |||
| 500 | |||
| 501 | struct page *kimage_alloc_control_pages(struct kimage *image, | ||
| 502 | unsigned int order) | ||
| 503 | { | ||
| 504 | struct page *pages = NULL; | ||
| 505 | |||
| 506 | switch (image->type) { | ||
| 507 | case KEXEC_TYPE_DEFAULT: | ||
| 508 | pages = kimage_alloc_normal_control_pages(image, order); | ||
| 509 | break; | ||
| 510 | case KEXEC_TYPE_CRASH: | ||
| 511 | pages = kimage_alloc_crash_control_pages(image, order); | ||
| 512 | break; | ||
| 513 | } | ||
| 514 | |||
| 515 | return pages; | ||
| 516 | } | ||
| 517 | |||
| 518 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) | ||
| 519 | { | ||
| 520 | if (*image->entry != 0) | ||
| 521 | image->entry++; | ||
| 522 | |||
| 523 | if (image->entry == image->last_entry) { | ||
| 524 | kimage_entry_t *ind_page; | ||
| 525 | struct page *page; | ||
| 526 | |||
| 527 | page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); | ||
| 528 | if (!page) | ||
| 529 | return -ENOMEM; | ||
| 530 | |||
| 531 | ind_page = page_address(page); | ||
| 532 | *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; | ||
| 533 | image->entry = ind_page; | ||
| 534 | image->last_entry = ind_page + | ||
| 535 | ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); | ||
| 536 | } | ||
| 537 | *image->entry = entry; | ||
| 538 | image->entry++; | ||
| 539 | *image->entry = 0; | ||
| 540 | |||
| 541 | return 0; | ||
| 542 | } | ||
| 543 | |||
| 544 | static int kimage_set_destination(struct kimage *image, | ||
| 545 | unsigned long destination) | ||
| 546 | { | ||
| 547 | int result; | ||
| 548 | |||
| 549 | destination &= PAGE_MASK; | ||
| 550 | result = kimage_add_entry(image, destination | IND_DESTINATION); | ||
| 551 | if (result == 0) | ||
| 552 | image->destination = destination; | ||
| 553 | |||
| 554 | return result; | ||
| 555 | } | ||
| 556 | |||
| 557 | |||
| 558 | static int kimage_add_page(struct kimage *image, unsigned long page) | ||
| 559 | { | ||
| 560 | int result; | ||
| 561 | |||
| 562 | page &= PAGE_MASK; | ||
| 563 | result = kimage_add_entry(image, page | IND_SOURCE); | ||
| 564 | if (result == 0) | ||
| 565 | image->destination += PAGE_SIZE; | ||
| 566 | |||
| 567 | return result; | ||
| 568 | } | ||
| 569 | |||
| 570 | |||
| 571 | static void kimage_free_extra_pages(struct kimage *image) | ||
| 572 | { | ||
| 573 | /* Walk through and free any extra destination pages I may have */ | ||
| 574 | kimage_free_page_list(&image->dest_pages); | ||
| 575 | |||
| 576 | /* Walk through and free any unuseable pages I have cached */ | ||
| 577 | kimage_free_page_list(&image->unuseable_pages); | ||
| 578 | |||
| 579 | } | ||
| 580 | static int kimage_terminate(struct kimage *image) | ||
| 581 | { | ||
| 582 | if (*image->entry != 0) | ||
| 583 | image->entry++; | ||
| 584 | |||
| 585 | *image->entry = IND_DONE; | ||
| 586 | |||
| 587 | return 0; | ||
| 588 | } | ||
| 589 | |||
| 590 | #define for_each_kimage_entry(image, ptr, entry) \ | ||
| 591 | for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ | ||
| 592 | ptr = (entry & IND_INDIRECTION)? \ | ||
| 593 | phys_to_virt((entry & PAGE_MASK)): ptr +1) | ||
| 594 | |||
| 595 | static void kimage_free_entry(kimage_entry_t entry) | ||
| 596 | { | ||
| 597 | struct page *page; | ||
| 598 | |||
| 599 | page = pfn_to_page(entry >> PAGE_SHIFT); | ||
| 600 | kimage_free_pages(page); | ||
| 601 | } | ||
| 602 | |||
| 603 | static void kimage_free(struct kimage *image) | ||
| 604 | { | ||
| 605 | kimage_entry_t *ptr, entry; | ||
| 606 | kimage_entry_t ind = 0; | ||
| 607 | |||
| 608 | if (!image) | ||
| 609 | return; | ||
| 610 | |||
| 611 | kimage_free_extra_pages(image); | ||
| 612 | for_each_kimage_entry(image, ptr, entry) { | ||
| 613 | if (entry & IND_INDIRECTION) { | ||
| 614 | /* Free the previous indirection page */ | ||
| 615 | if (ind & IND_INDIRECTION) | ||
| 616 | kimage_free_entry(ind); | ||
| 617 | /* Save this indirection page until we are | ||
| 618 | * done with it. | ||
| 619 | */ | ||
| 620 | ind = entry; | ||
| 621 | } | ||
| 622 | else if (entry & IND_SOURCE) | ||
| 623 | kimage_free_entry(entry); | ||
| 624 | } | ||
| 625 | /* Free the final indirection page */ | ||
| 626 | if (ind & IND_INDIRECTION) | ||
| 627 | kimage_free_entry(ind); | ||
| 628 | |||
| 629 | /* Handle any machine specific cleanup */ | ||
| 630 | machine_kexec_cleanup(image); | ||
| 631 | |||
| 632 | /* Free the kexec control pages... */ | ||
| 633 | kimage_free_page_list(&image->control_pages); | ||
| 634 | kfree(image); | ||
| 635 | } | ||
| 636 | |||
| 637 | static kimage_entry_t *kimage_dst_used(struct kimage *image, | ||
| 638 | unsigned long page) | ||
| 639 | { | ||
| 640 | kimage_entry_t *ptr, entry; | ||
| 641 | unsigned long destination = 0; | ||
| 642 | |||
| 643 | for_each_kimage_entry(image, ptr, entry) { | ||
| 644 | if (entry & IND_DESTINATION) | ||
| 645 | destination = entry & PAGE_MASK; | ||
| 646 | else if (entry & IND_SOURCE) { | ||
| 647 | if (page == destination) | ||
| 648 | return ptr; | ||
| 649 | destination += PAGE_SIZE; | ||
| 650 | } | ||
| 651 | } | ||
| 652 | |||
| 653 | return 0; | ||
| 654 | } | ||
| 655 | |||
| 656 | static struct page *kimage_alloc_page(struct kimage *image, | ||
| 657 | unsigned int gfp_mask, | ||
| 658 | unsigned long destination) | ||
| 659 | { | ||
| 660 | /* | ||
| 661 | * Here we implement safeguards to ensure that a source page | ||
| 662 | * is not copied to its destination page before the data on | ||
| 663 | * the destination page is no longer useful. | ||
| 664 | * | ||
| 665 | * To do this we maintain the invariant that a source page is | ||
| 666 | * either its own destination page, or it is not a | ||
| 667 | * destination page at all. | ||
| 668 | * | ||
| 669 | * That is slightly stronger than required, but the proof | ||
| 670 | * that no problems will occur is trivial, and the | ||
| 671 | * implementation is simple to verify. | ||
| 672 | * | ||
| 673 | * When allocating all pages normally this algorithm will run | ||
| 674 | * in O(N) time, but in the worst case it will run in O(N^2) | ||
| 675 | * time. If the runtime is a problem the data structures can | ||
| 676 | * be fixed. | ||
| 677 | */ | ||
| 678 | struct page *page; | ||
| 679 | unsigned long addr; | ||
| 680 | |||
| 681 | /* | ||
| 682 | * Walk through the list of destination pages, and see if I | ||
| 683 | * have a match. | ||
| 684 | */ | ||
| 685 | list_for_each_entry(page, &image->dest_pages, lru) { | ||
| 686 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
| 687 | if (addr == destination) { | ||
| 688 | list_del(&page->lru); | ||
| 689 | return page; | ||
| 690 | } | ||
| 691 | } | ||
| 692 | page = NULL; | ||
| 693 | while (1) { | ||
| 694 | kimage_entry_t *old; | ||
| 695 | |||
| 696 | /* Allocate a page, if we run out of memory give up */ | ||
| 697 | page = kimage_alloc_pages(gfp_mask, 0); | ||
| 698 | if (!page) | ||
| 699 | return 0; | ||
| 700 | /* If the page cannot be used, file it away */ | ||
| 701 | if (page_to_pfn(page) > | ||
| 702 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { | ||
| 703 | list_add(&page->lru, &image->unuseable_pages); | ||
| 704 | continue; | ||
| 705 | } | ||
| 706 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
| 707 | |||
| 708 | /* If it is the destination page we want, use it */ | ||
| 709 | if (addr == destination) | ||
| 710 | break; | ||
| 711 | |||
| 712 | /* If the page is not a destination page use it */ | ||
| 713 | if (!kimage_is_destination_range(image, addr, | ||
| 714 | addr + PAGE_SIZE)) | ||
| 715 | break; | ||
| 716 | |||
| 717 | /* | ||
| 718 | * I know that the page is someone's destination page. | ||
| 719 | * See if there is already a source page for this | ||
| 720 | * destination page, and if so swap the source pages. | ||
| 721 | */ | ||
| 722 | old = kimage_dst_used(image, addr); | ||
| 723 | if (old) { | ||
| 724 | /* If so move it */ | ||
| 725 | unsigned long old_addr; | ||
| 726 | struct page *old_page; | ||
| 727 | |||
| 728 | old_addr = *old & PAGE_MASK; | ||
| 729 | old_page = pfn_to_page(old_addr >> PAGE_SHIFT); | ||
| 730 | copy_highpage(page, old_page); | ||
| 731 | *old = addr | (*old & ~PAGE_MASK); | ||
| 732 | |||
| 733 | /* The old page I have found cannot be a | ||
| 734 | * destination page, so return it. | ||
| 735 | */ | ||
| 736 | addr = old_addr; | ||
| 737 | page = old_page; | ||
| 738 | break; | ||
| 739 | } | ||
| 740 | else { | ||
| 741 | /* Place the page on the destination list; I | ||
| 742 | * will use it later. | ||
| 743 | */ | ||
| 744 | list_add(&page->lru, &image->dest_pages); | ||
| 745 | } | ||
| 746 | } | ||
| 747 | |||
| 748 | return page; | ||
| 749 | } | ||
| 750 | |||
| 751 | static int kimage_load_normal_segment(struct kimage *image, | ||
| 752 | struct kexec_segment *segment) | ||
| 753 | { | ||
| 754 | unsigned long maddr; | ||
| 755 | unsigned long ubytes, mbytes; | ||
| 756 | int result; | ||
| 757 | unsigned char *buf; | ||
| 758 | |||
| 759 | result = 0; | ||
| 760 | buf = segment->buf; | ||
| 761 | ubytes = segment->bufsz; | ||
| 762 | mbytes = segment->memsz; | ||
| 763 | maddr = segment->mem; | ||
| 764 | |||
| 765 | result = kimage_set_destination(image, maddr); | ||
| 766 | if (result < 0) | ||
| 767 | goto out; | ||
| 768 | |||
| 769 | while (mbytes) { | ||
| 770 | struct page *page; | ||
| 771 | char *ptr; | ||
| 772 | size_t uchunk, mchunk; | ||
| 773 | |||
| 774 | page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); | ||
| 775 | if (page == 0) { | ||
| 776 | result = -ENOMEM; | ||
| 777 | goto out; | ||
| 778 | } | ||
| 779 | result = kimage_add_page(image, page_to_pfn(page) | ||
| 780 | << PAGE_SHIFT); | ||
| 781 | if (result < 0) | ||
| 782 | goto out; | ||
| 783 | |||
| 784 | ptr = kmap(page); | ||
| 785 | /* Start with a clear page */ | ||
| 786 | memset(ptr, 0, PAGE_SIZE); | ||
| 787 | ptr += maddr & ~PAGE_MASK; | ||
| 788 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | ||
| 789 | if (mchunk > mbytes) | ||
| 790 | mchunk = mbytes; | ||
| 791 | |||
| 792 | uchunk = mchunk; | ||
| 793 | if (uchunk > ubytes) | ||
| 794 | uchunk = ubytes; | ||
| 795 | |||
| 796 | result = copy_from_user(ptr, buf, uchunk); | ||
| 797 | kunmap(page); | ||
| 798 | if (result) { | ||
| 799 | result = (result < 0) ? result : -EIO; | ||
| 800 | goto out; | ||
| 801 | } | ||
| 802 | ubytes -= uchunk; | ||
| 803 | maddr += mchunk; | ||
| 804 | buf += mchunk; | ||
| 805 | mbytes -= mchunk; | ||
| 806 | } | ||
| 807 | out: | ||
| 808 | return result; | ||
| 809 | } | ||
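
Both segment loaders split the copy into page-bounded chunks: `mchunk` is how much of the current page still lies inside the segment's memory range, and `uchunk` is how much of that can still come from the user buffer; anything beyond `bufsz` is left (or explicitly set) zero so the trailing part of `memsz` is zero-filled. A compilable userspace sketch of just that arithmetic (not from the patch; the sizes are made up) is:

```c
/*
 * Userspace sketch of the per-page chunking used by the segment loaders:
 * each iteration handles at most one page, copies
 * min(page remainder, bufsz remainder) bytes and leaves the rest of the
 * memsz range zero-filled.  PAGE_SIZE and the sizes are illustrative.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static void load_segment(unsigned long mem, unsigned long memsz,
			 unsigned long bufsz)
{
	unsigned long maddr = mem, mbytes = memsz, ubytes = bufsz;

	while (mbytes) {
		unsigned long off = maddr & (PAGE_SIZE - 1);
		unsigned long mchunk = PAGE_SIZE - off;
		unsigned long uchunk;

		if (mchunk > mbytes)
			mchunk = mbytes;
		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;	/* tail of the page stays zero */

		printf("page %#lx+%#lx: copy %lu bytes, zero %lu bytes\n",
		       maddr & ~(PAGE_SIZE - 1), off, uchunk, mchunk - uchunk);

		ubytes -= uchunk;
		maddr += mchunk;
		mbytes -= mchunk;
	}
}

int main(void)
{
	/* 10000-byte destination, only 6000 bytes supplied from user space. */
	load_segment(0x100000, 10000, 6000);
	return 0;
}
```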
| 810 | |||
| 811 | static int kimage_load_crash_segment(struct kimage *image, | ||
| 812 | struct kexec_segment *segment) | ||
| 813 | { | ||
| 814 | /* For crash dump kernels we simply copy the data from | ||
| 815 | * user space to its destination. | ||
| 816 | * We do things a page at a time for the sake of kmap. | ||
| 817 | */ | ||
| 818 | unsigned long maddr; | ||
| 819 | unsigned long ubytes, mbytes; | ||
| 820 | int result; | ||
| 821 | unsigned char *buf; | ||
| 822 | |||
| 823 | result = 0; | ||
| 824 | buf = segment->buf; | ||
| 825 | ubytes = segment->bufsz; | ||
| 826 | mbytes = segment->memsz; | ||
| 827 | maddr = segment->mem; | ||
| 828 | while (mbytes) { | ||
| 829 | struct page *page; | ||
| 830 | char *ptr; | ||
| 831 | size_t uchunk, mchunk; | ||
| 832 | |||
| 833 | page = pfn_to_page(maddr >> PAGE_SHIFT); | ||
| 834 | if (page == 0) { | ||
| 835 | result = -ENOMEM; | ||
| 836 | goto out; | ||
| 837 | } | ||
| 838 | ptr = kmap(page); | ||
| 839 | ptr += maddr & ~PAGE_MASK; | ||
| 840 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | ||
| 841 | if (mchunk > mbytes) | ||
| 842 | mchunk = mbytes; | ||
| 843 | |||
| 844 | uchunk = mchunk; | ||
| 845 | if (uchunk > ubytes) { | ||
| 846 | uchunk = ubytes; | ||
| 847 | /* Zero the trailing part of the page */ | ||
| 848 | memset(ptr + uchunk, 0, mchunk - uchunk); | ||
| 849 | } | ||
| 850 | result = copy_from_user(ptr, buf, uchunk); | ||
| 851 | kunmap(page); | ||
| 852 | if (result) { | ||
| 853 | result = (result < 0) ? result : -EIO; | ||
| 854 | goto out; | ||
| 855 | } | ||
| 856 | ubytes -= uchunk; | ||
| 857 | maddr += mchunk; | ||
| 858 | buf += mchunk; | ||
| 859 | mbytes -= mchunk; | ||
| 860 | } | ||
| 861 | out: | ||
| 862 | return result; | ||
| 863 | } | ||
| 864 | |||
| 865 | static int kimage_load_segment(struct kimage *image, | ||
| 866 | struct kexec_segment *segment) | ||
| 867 | { | ||
| 868 | int result = -ENOMEM; | ||
| 869 | |||
| 870 | switch (image->type) { | ||
| 871 | case KEXEC_TYPE_DEFAULT: | ||
| 872 | result = kimage_load_normal_segment(image, segment); | ||
| 873 | break; | ||
| 874 | case KEXEC_TYPE_CRASH: | ||
| 875 | result = kimage_load_crash_segment(image, segment); | ||
| 876 | break; | ||
| 877 | } | ||
| 878 | |||
| 879 | return result; | ||
| 880 | } | ||
| 881 | |||
| 882 | /* | ||
| 883 | * Exec Kernel system call: for obvious reasons only root may call it. | ||
| 884 | * | ||
| 885 | * This call breaks up into three pieces. | ||
| 886 | * - A generic part which loads the new kernel from the current | ||
| 887 | * address space, and very carefully places the data in the | ||
| 888 | * allocated pages. | ||
| 889 | * | ||
| 890 | * - A generic part that interacts with the kernel and tells all of | ||
| 891 | * the devices to shut down, preventing on-going DMAs and placing | ||
| 892 | * the devices in a consistent state so a later kernel can | ||
| 893 | * reinitialize them. | ||
| 894 | * | ||
| 895 | * - A machine specific part that includes the syscall number | ||
| 896 | * and then copies the image to its final destination, and | ||
| 897 | * jumps into the image at entry. | ||
| 898 | * | ||
| 899 | * kexec does not sync or unmount filesystems, so if you need | ||
| 900 | * that to happen you need to do it yourself. | ||
| 901 | */ | ||
| 902 | struct kimage *kexec_image = NULL; | ||
| 903 | static struct kimage *kexec_crash_image = NULL; | ||
| 904 | /* | ||
| 905 | * A home-grown binary mutex. | ||
| 906 | * Nothing can wait on it, so this mutex is safe to use | ||
| 907 | * in interrupt context :) | ||
| 908 | */ | ||
| 909 | static int kexec_lock = 0; | ||
| 910 | |||
| 911 | asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, | ||
| 912 | struct kexec_segment __user *segments, | ||
| 913 | unsigned long flags) | ||
| 914 | { | ||
| 915 | struct kimage **dest_image, *image; | ||
| 916 | int locked; | ||
| 917 | int result; | ||
| 918 | |||
| 919 | /* We only trust the superuser with rebooting the system. */ | ||
| 920 | if (!capable(CAP_SYS_BOOT)) | ||
| 921 | return -EPERM; | ||
| 922 | |||
| 923 | /* | ||
| 924 | * Verify we have a legal set of flags. | ||
| 925 | * This leaves us room for future extensions. | ||
| 926 | */ | ||
| 927 | if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) | ||
| 928 | return -EINVAL; | ||
| 929 | |||
| 930 | /* Verify we are on the appropriate architecture */ | ||
| 931 | if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && | ||
| 932 | ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) | ||
| 933 | return -EINVAL; | ||
| 934 | |||
| 935 | /* Put an artificial cap on the number | ||
| 936 | * of segments passed to kexec_load. | ||
| 937 | */ | ||
| 938 | if (nr_segments > KEXEC_SEGMENT_MAX) | ||
| 939 | return -EINVAL; | ||
| 940 | |||
| 941 | image = NULL; | ||
| 942 | result = 0; | ||
| 943 | |||
| 944 | /* Because we write directly to the reserved memory | ||
| 945 | * region when loading crash kernels we need a mutex here to | ||
| 946 | * prevent multiple crash kernels from attempting to load | ||
| 947 | * simultaneously, and to prevent a crash kernel from loading | ||
| 948 | * over the top of an in-use crash kernel. | ||
| 949 | * | ||
| 950 | * KISS: always take the mutex. | ||
| 951 | */ | ||
| 952 | locked = xchg(&kexec_lock, 1); | ||
| 953 | if (locked) | ||
| 954 | return -EBUSY; | ||
| 955 | |||
| 956 | dest_image = &kexec_image; | ||
| 957 | if (flags & KEXEC_ON_CRASH) | ||
| 958 | dest_image = &kexec_crash_image; | ||
| 959 | if (nr_segments > 0) { | ||
| 960 | unsigned long i; | ||
| 961 | |||
| 962 | /* Loading another kernel to reboot into */ | ||
| 963 | if ((flags & KEXEC_ON_CRASH) == 0) | ||
| 964 | result = kimage_normal_alloc(&image, entry, | ||
| 965 | nr_segments, segments); | ||
| 966 | /* Loading another kernel to switch to if this one crashes */ | ||
| 967 | else if (flags & KEXEC_ON_CRASH) { | ||
| 968 | /* Free any current crash dump kernel before | ||
| 969 | * we corrupt it. | ||
| 970 | */ | ||
| 971 | kimage_free(xchg(&kexec_crash_image, NULL)); | ||
| 972 | result = kimage_crash_alloc(&image, entry, | ||
| 973 | nr_segments, segments); | ||
| 974 | } | ||
| 975 | if (result) | ||
| 976 | goto out; | ||
| 977 | |||
| 978 | result = machine_kexec_prepare(image); | ||
| 979 | if (result) | ||
| 980 | goto out; | ||
| 981 | |||
| 982 | for (i = 0; i < nr_segments; i++) { | ||
| 983 | result = kimage_load_segment(image, &image->segment[i]); | ||
| 984 | if (result) | ||
| 985 | goto out; | ||
| 986 | } | ||
| 987 | result = kimage_terminate(image); | ||
| 988 | if (result) | ||
| 989 | goto out; | ||
| 990 | } | ||
| 991 | /* Install the new kernel and uninstall the old */ | ||
| 992 | image = xchg(dest_image, image); | ||
| 993 | |||
| 994 | out: | ||
| 995 | xchg(&kexec_lock, 0); /* Release the mutex */ | ||
| 996 | kimage_free(image); | ||
| 997 | |||
| 998 | return result; | ||
| 999 | } | ||
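
The `xchg()`-based `kexec_lock` above is a pure try-lock: a contender that loses simply returns -EBUSY instead of sleeping, which is also why crash_kexec() can take the same lock from panic context. A rough userspace analogue using C11 atomics (not kernel code) looks like this:

```c
/*
 * Userspace analogue of the xchg()-style binary mutex used by
 * sys_kexec_load() and crash_kexec(): a try-lock that never sleeps.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int kexec_lock;

static int do_load(void)
{
	/* xchg(&kexec_lock, 1): returns the previous value. */
	if (atomic_exchange(&kexec_lock, 1))
		return -1;		/* -EBUSY: someone else holds it */

	/* ... critical section: load or launch an image ... */

	atomic_exchange(&kexec_lock, 0);	/* release */
	return 0;
}

int main(void)
{
	printf("first caller:  %d\n", do_load());
	atomic_exchange(&kexec_lock, 1);	/* simulate a concurrent holder */
	printf("second caller: %d\n", do_load());
	return 0;
}
```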
| 1000 | |||
| 1001 | #ifdef CONFIG_COMPAT | ||
| 1002 | asmlinkage long compat_sys_kexec_load(unsigned long entry, | ||
| 1003 | unsigned long nr_segments, | ||
| 1004 | struct compat_kexec_segment __user *segments, | ||
| 1005 | unsigned long flags) | ||
| 1006 | { | ||
| 1007 | struct compat_kexec_segment in; | ||
| 1008 | struct kexec_segment out, __user *ksegments; | ||
| 1009 | unsigned long i, result; | ||
| 1010 | |||
| 1011 | /* Don't allow clients that don't understand the native | ||
| 1012 | * architecture to do anything. | ||
| 1013 | */ | ||
| 1014 | if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) | ||
| 1015 | return -EINVAL; | ||
| 1016 | |||
| 1017 | if (nr_segments > KEXEC_SEGMENT_MAX) | ||
| 1018 | return -EINVAL; | ||
| 1019 | |||
| 1020 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); | ||
| 1021 | for (i=0; i < nr_segments; i++) { | ||
| 1022 | result = copy_from_user(&in, &segments[i], sizeof(in)); | ||
| 1023 | if (result) | ||
| 1024 | return -EFAULT; | ||
| 1025 | |||
| 1026 | out.buf = compat_ptr(in.buf); | ||
| 1027 | out.bufsz = in.bufsz; | ||
| 1028 | out.mem = in.mem; | ||
| 1029 | out.memsz = in.memsz; | ||
| 1030 | |||
| 1031 | result = copy_to_user(&ksegments[i], &out, sizeof(out)); | ||
| 1032 | if (result) | ||
| 1033 | return -EFAULT; | ||
| 1034 | } | ||
| 1035 | |||
| 1036 | return sys_kexec_load(entry, nr_segments, ksegments, flags); | ||
| 1037 | } | ||
| 1038 | #endif | ||
| 1039 | |||
| 1040 | void crash_kexec(struct pt_regs *regs) | ||
| 1041 | { | ||
| 1042 | struct kimage *image; | ||
| 1043 | int locked; | ||
| 1044 | |||
| 1045 | |||
| 1046 | /* Take the kexec_lock here to prevent sys_kexec_load | ||
| 1047 | * running on one cpu from replacing the crash kernel | ||
| 1048 | * we are using after a panic on a different cpu. | ||
| 1049 | * | ||
| 1050 | * If the crash kernel was not located in a fixed area | ||
| 1051 | * of memory, the xchg(&kexec_crash_image) would be | ||
| 1052 | * sufficient. But since I reuse the memory... | ||
| 1053 | */ | ||
| 1054 | locked = xchg(&kexec_lock, 1); | ||
| 1055 | if (!locked) { | ||
| 1056 | image = xchg(&kexec_crash_image, NULL); | ||
| 1057 | if (image) { | ||
| 1058 | machine_crash_shutdown(regs); | ||
| 1059 | machine_kexec(image); | ||
| 1060 | } | ||
| 1061 | xchg(&kexec_lock, 0); | ||
| 1062 | } | ||
| 1063 | } | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index eed53d4f5230..44166e3bb8af 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -120,6 +120,7 @@ struct subprocess_info { | |||
| 120 | char *path; | 120 | char *path; |
| 121 | char **argv; | 121 | char **argv; |
| 122 | char **envp; | 122 | char **envp; |
| 123 | struct key *ring; | ||
| 123 | int wait; | 124 | int wait; |
| 124 | int retval; | 125 | int retval; |
| 125 | }; | 126 | }; |
| @@ -130,16 +131,21 @@ struct subprocess_info { | |||
| 130 | static int ____call_usermodehelper(void *data) | 131 | static int ____call_usermodehelper(void *data) |
| 131 | { | 132 | { |
| 132 | struct subprocess_info *sub_info = data; | 133 | struct subprocess_info *sub_info = data; |
| 134 | struct key *old_session; | ||
| 133 | int retval; | 135 | int retval; |
| 134 | 136 | ||
| 135 | /* Unblock all signals. */ | 137 | /* Unblock all signals and set the session keyring. */ |
| 138 | key_get(sub_info->ring); | ||
| 136 | flush_signals(current); | 139 | flush_signals(current); |
| 137 | spin_lock_irq(¤t->sighand->siglock); | 140 | spin_lock_irq(¤t->sighand->siglock); |
| 141 | old_session = __install_session_keyring(current, sub_info->ring); | ||
| 138 | flush_signal_handlers(current, 1); | 142 | flush_signal_handlers(current, 1); |
| 139 | sigemptyset(¤t->blocked); | 143 | sigemptyset(¤t->blocked); |
| 140 | recalc_sigpending(); | 144 | recalc_sigpending(); |
| 141 | spin_unlock_irq(¤t->sighand->siglock); | 145 | spin_unlock_irq(¤t->sighand->siglock); |
| 142 | 146 | ||
| 147 | key_put(old_session); | ||
| 148 | |||
| 143 | /* We can run anywhere, unlike our parent keventd(). */ | 149 | /* We can run anywhere, unlike our parent keventd(). */ |
| 144 | set_cpus_allowed(current, CPU_MASK_ALL); | 150 | set_cpus_allowed(current, CPU_MASK_ALL); |
| 145 | 151 | ||
| @@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data) | |||
| 211 | } | 217 | } |
| 212 | 218 | ||
| 213 | /** | 219 | /** |
| 214 | * call_usermodehelper - start a usermode application | 220 | * call_usermodehelper_keys - start a usermode application |
| 215 | * @path: pathname for the application | 221 | * @path: pathname for the application |
| 216 | * @argv: null-terminated argument list | 222 | * @argv: null-terminated argument list |
| 217 | * @envp: null-terminated environment list | 223 | * @envp: null-terminated environment list |
| 224 | * @session_keyring: session keyring for process (NULL for an empty keyring) | ||
| 218 | * @wait: wait for the application to finish and return status. | 225 | * @wait: wait for the application to finish and return status. |
| 219 | * | 226 | * |
| 220 | * Runs a user-space application. The application is started | 227 | * Runs a user-space application. The application is started |
| @@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data) | |||
| 224 | * Must be called from process context. Returns a negative error code | 231 | * Must be called from process context. Returns a negative error code |
| 225 | * if program was not execed successfully, or 0. | 232 | * if program was not execed successfully, or 0. |
| 226 | */ | 233 | */ |
| 227 | int call_usermodehelper(char *path, char **argv, char **envp, int wait) | 234 | int call_usermodehelper_keys(char *path, char **argv, char **envp, |
| 235 | struct key *session_keyring, int wait) | ||
| 228 | { | 236 | { |
| 229 | DECLARE_COMPLETION(done); | 237 | DECLARE_COMPLETION(done); |
| 230 | struct subprocess_info sub_info = { | 238 | struct subprocess_info sub_info = { |
| @@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) | |||
| 232 | .path = path, | 240 | .path = path, |
| 233 | .argv = argv, | 241 | .argv = argv, |
| 234 | .envp = envp, | 242 | .envp = envp, |
| 243 | .ring = session_keyring, | ||
| 235 | .wait = wait, | 244 | .wait = wait, |
| 236 | .retval = 0, | 245 | .retval = 0, |
| 237 | }; | 246 | }; |
| @@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) | |||
| 247 | wait_for_completion(&done); | 256 | wait_for_completion(&done); |
| 248 | return sub_info.retval; | 257 | return sub_info.retval; |
| 249 | } | 258 | } |
| 250 | EXPORT_SYMBOL(call_usermodehelper); | 259 | EXPORT_SYMBOL(call_usermodehelper_keys); |
| 251 | 260 | ||
| 252 | void __init usermodehelper_init(void) | 261 | void __init usermodehelper_init(void) |
| 253 | { | 262 | { |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 037142b72a49..334f37472c56 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -27,6 +27,9 @@ | |||
| 27 | * interface to access function arguments. | 27 | * interface to access function arguments. |
| 28 | * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes | 28 | * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes |
| 29 | * exceptions notifier to be first on the priority list. | 29 | * exceptions notifier to be first on the priority list. |
| 30 | * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston | ||
| 31 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi | ||
| 32 | * <prasanna@in.ibm.com> added function-return probes. | ||
| 30 | */ | 33 | */ |
| 31 | #include <linux/kprobes.h> | 34 | #include <linux/kprobes.h> |
| 32 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
| @@ -41,6 +44,7 @@ | |||
| 41 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) | 44 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) |
| 42 | 45 | ||
| 43 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 46 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
| 47 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | ||
| 44 | 48 | ||
| 45 | unsigned int kprobe_cpu = NR_CPUS; | 49 | unsigned int kprobe_cpu = NR_CPUS; |
| 46 | static DEFINE_SPINLOCK(kprobe_lock); | 50 | static DEFINE_SPINLOCK(kprobe_lock); |
| @@ -78,22 +82,23 @@ struct kprobe *get_kprobe(void *addr) | |||
| 78 | * Aggregate handlers for multiple kprobes support - these handlers | 82 | * Aggregate handlers for multiple kprobes support - these handlers |
| 79 | * take care of invoking the individual kprobe handlers on p->list | 83 | * take care of invoking the individual kprobe handlers on p->list |
| 80 | */ | 84 | */ |
| 81 | int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | 85 | static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) |
| 82 | { | 86 | { |
| 83 | struct kprobe *kp; | 87 | struct kprobe *kp; |
| 84 | 88 | ||
| 85 | list_for_each_entry(kp, &p->list, list) { | 89 | list_for_each_entry(kp, &p->list, list) { |
| 86 | if (kp->pre_handler) { | 90 | if (kp->pre_handler) { |
| 87 | curr_kprobe = kp; | 91 | curr_kprobe = kp; |
| 88 | kp->pre_handler(kp, regs); | 92 | if (kp->pre_handler(kp, regs)) |
| 89 | curr_kprobe = NULL; | 93 | return 1; |
| 90 | } | 94 | } |
| 95 | curr_kprobe = NULL; | ||
| 91 | } | 96 | } |
| 92 | return 0; | 97 | return 0; |
| 93 | } | 98 | } |
| 94 | 99 | ||
| 95 | void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | 100 | static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, |
| 96 | unsigned long flags) | 101 | unsigned long flags) |
| 97 | { | 102 | { |
| 98 | struct kprobe *kp; | 103 | struct kprobe *kp; |
| 99 | 104 | ||
| @@ -107,7 +112,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
| 107 | return; | 112 | return; |
| 108 | } | 113 | } |
| 109 | 114 | ||
| 110 | int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) | 115 | static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
| 116 | int trapnr) | ||
| 111 | { | 117 | { |
| 112 | /* | 118 | /* |
| 113 | * if we faulted "during" the execution of a user specified | 119 | * if we faulted "during" the execution of a user specified |
| @@ -120,19 +126,191 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) | |||
| 120 | return 0; | 126 | return 0; |
| 121 | } | 127 | } |
| 122 | 128 | ||
| 129 | static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | ||
| 130 | { | ||
| 131 | struct kprobe *kp = curr_kprobe; | ||
| 132 | if (curr_kprobe && kp->break_handler) { | ||
| 133 | if (kp->break_handler(kp, regs)) { | ||
| 134 | curr_kprobe = NULL; | ||
| 135 | return 1; | ||
| 136 | } | ||
| 137 | } | ||
| 138 | curr_kprobe = NULL; | ||
| 139 | return 0; | ||
| 140 | } | ||
| 141 | |||
| 142 | struct kprobe trampoline_p = { | ||
| 143 | .addr = (kprobe_opcode_t *) &kretprobe_trampoline, | ||
| 144 | .pre_handler = trampoline_probe_handler, | ||
| 145 | .post_handler = trampoline_post_handler | ||
| 146 | }; | ||
| 147 | |||
| 148 | struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) | ||
| 149 | { | ||
| 150 | struct hlist_node *node; | ||
| 151 | struct kretprobe_instance *ri; | ||
| 152 | hlist_for_each_entry(ri, node, &rp->free_instances, uflist) | ||
| 153 | return ri; | ||
| 154 | return NULL; | ||
| 155 | } | ||
| 156 | |||
| 157 | static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) | ||
| 158 | { | ||
| 159 | struct hlist_node *node; | ||
| 160 | struct kretprobe_instance *ri; | ||
| 161 | hlist_for_each_entry(ri, node, &rp->used_instances, uflist) | ||
| 162 | return ri; | ||
| 163 | return NULL; | ||
| 164 | } | ||
| 165 | |||
| 166 | struct kretprobe_instance *get_rp_inst(void *sara) | ||
| 167 | { | ||
| 168 | struct hlist_head *head; | ||
| 169 | struct hlist_node *node; | ||
| 170 | struct task_struct *tsk; | ||
| 171 | struct kretprobe_instance *ri; | ||
| 172 | |||
| 173 | tsk = arch_get_kprobe_task(sara); | ||
| 174 | head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; | ||
| 175 | hlist_for_each_entry(ri, node, head, hlist) { | ||
| 176 | if (ri->stack_addr == sara) | ||
| 177 | return ri; | ||
| 178 | } | ||
| 179 | return NULL; | ||
| 180 | } | ||
| 181 | |||
| 182 | void add_rp_inst(struct kretprobe_instance *ri) | ||
| 183 | { | ||
| 184 | struct task_struct *tsk; | ||
| 185 | /* | ||
| 186 | * Remove the rp inst from the free list - | ||
| 187 | * add it back when the probed function returns | ||
| 188 | */ | ||
| 189 | hlist_del(&ri->uflist); | ||
| 190 | tsk = arch_get_kprobe_task(ri->stack_addr); | ||
| 191 | /* Add rp inst onto table */ | ||
| 192 | INIT_HLIST_NODE(&ri->hlist); | ||
| 193 | hlist_add_head(&ri->hlist, | ||
| 194 | &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]); | ||
| 195 | |||
| 196 | /* Also add this rp inst to the used list. */ | ||
| 197 | INIT_HLIST_NODE(&ri->uflist); | ||
| 198 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | ||
| 199 | } | ||
| 200 | |||
| 201 | void recycle_rp_inst(struct kretprobe_instance *ri) | ||
| 202 | { | ||
| 203 | /* remove rp inst from the kretprobe_inst_table */ | ||
| 204 | hlist_del(&ri->hlist); | ||
| 205 | if (ri->rp) { | ||
| 206 | /* remove rp inst off the used list */ | ||
| 207 | hlist_del(&ri->uflist); | ||
| 208 | /* put rp inst back onto the free list */ | ||
| 209 | INIT_HLIST_NODE(&ri->uflist); | ||
| 210 | hlist_add_head(&ri->uflist, &ri->rp->free_instances); | ||
| 211 | } else | ||
| 212 | /* Unregistering */ | ||
| 213 | kfree(ri); | ||
| 214 | } | ||
| 215 | |||
| 216 | struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) | ||
| 217 | { | ||
| 218 | return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; | ||
| 219 | } | ||
| 220 | |||
| 221 | struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk) | ||
| 222 | { | ||
| 223 | struct task_struct *tsk; | ||
| 224 | struct hlist_head *head; | ||
| 225 | struct hlist_node *node; | ||
| 226 | struct kretprobe_instance *ri; | ||
| 227 | |||
| 228 | head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)]; | ||
| 229 | |||
| 230 | hlist_for_each_entry(ri, node, head, hlist) { | ||
| 231 | tsk = arch_get_kprobe_task(ri->stack_addr); | ||
| 232 | if (tsk == tk) | ||
| 233 | return ri; | ||
| 234 | } | ||
| 235 | return NULL; | ||
| 236 | } | ||
| 237 | |||
| 238 | /* | ||
| 239 | * This function is called from do_exit or do_execv when task tk's stack is | ||
| 240 | * about to be recycled. Recycle any function-return probe instances | ||
| 241 | * associated with this task. These represent probed functions that have | ||
| 242 | * been called but may never return. | ||
| 243 | */ | ||
| 244 | void kprobe_flush_task(struct task_struct *tk) | ||
| 245 | { | ||
| 246 | unsigned long flags = 0; | ||
| 247 | spin_lock_irqsave(&kprobe_lock, flags); | ||
| 248 | arch_kprobe_flush_task(tk); | ||
| 249 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
| 250 | } | ||
| 251 | |||
| 252 | /* | ||
| 253 | * This kprobe pre_handler is registered with every kretprobe. When the probe | ||
| 254 | * hits, it sets up the return probe. | ||
| 255 | */ | ||
| 256 | static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) | ||
| 257 | { | ||
| 258 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); | ||
| 259 | |||
| 260 | /* TODO: consider swapping the RA only after the last pre_handler has fired */ | ||
| 261 | arch_prepare_kretprobe(rp, regs); | ||
| 262 | return 0; | ||
| 263 | } | ||
| 264 | |||
| 265 | static inline void free_rp_inst(struct kretprobe *rp) | ||
| 266 | { | ||
| 267 | struct kretprobe_instance *ri; | ||
| 268 | while ((ri = get_free_rp_inst(rp)) != NULL) { | ||
| 269 | hlist_del(&ri->uflist); | ||
| 270 | kfree(ri); | ||
| 271 | } | ||
| 272 | } | ||
| 273 | |||
| 274 | /* | ||
| 275 | * Keep all fields in the kprobe consistent | ||
| 276 | */ | ||
| 277 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
| 278 | { | ||
| 279 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
| 280 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
| 281 | } | ||
| 282 | |||
| 283 | /* | ||
| 284 | * Add the new probe to old_p->list. Fail if this is the | ||
| 285 | * second jprobe at the address - two jprobes can't coexist | ||
| 286 | */ | ||
| 287 | static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
| 288 | { | ||
| 289 | struct kprobe *kp; | ||
| 290 | |||
| 291 | if (p->break_handler) { | ||
| 292 | list_for_each_entry(kp, &old_p->list, list) { | ||
| 293 | if (kp->break_handler) | ||
| 294 | return -EEXIST; | ||
| 295 | } | ||
| 296 | list_add_tail(&p->list, &old_p->list); | ||
| 297 | } else | ||
| 298 | list_add(&p->list, &old_p->list); | ||
| 299 | return 0; | ||
| 300 | } | ||
| 301 | |||
| 123 | /* | 302 | /* |
| 124 | * Fill in the required fields of the "manager kprobe". Replace the | 303 | * Fill in the required fields of the "manager kprobe". Replace the |
| 125 | * earlier kprobe in the hlist with the manager kprobe | 304 | * earlier kprobe in the hlist with the manager kprobe |
| 126 | */ | 305 | */ |
| 127 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 306 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
| 128 | { | 307 | { |
| 308 | copy_kprobe(p, ap); | ||
| 129 | ap->addr = p->addr; | 309 | ap->addr = p->addr; |
| 130 | ap->opcode = p->opcode; | ||
| 131 | memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn)); | ||
| 132 | |||
| 133 | ap->pre_handler = aggr_pre_handler; | 310 | ap->pre_handler = aggr_pre_handler; |
| 134 | ap->post_handler = aggr_post_handler; | 311 | ap->post_handler = aggr_post_handler; |
| 135 | ap->fault_handler = aggr_fault_handler; | 312 | ap->fault_handler = aggr_fault_handler; |
| 313 | ap->break_handler = aggr_break_handler; | ||
| 136 | 314 | ||
| 137 | INIT_LIST_HEAD(&ap->list); | 315 | INIT_LIST_HEAD(&ap->list); |
| 138 | list_add(&p->list, &ap->list); | 316 | list_add(&p->list, &ap->list); |
| @@ -153,16 +331,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
| 153 | int ret = 0; | 331 | int ret = 0; |
| 154 | struct kprobe *ap; | 332 | struct kprobe *ap; |
| 155 | 333 | ||
| 156 | if (old_p->break_handler || p->break_handler) { | 334 | if (old_p->pre_handler == aggr_pre_handler) { |
| 157 | ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ | 335 | copy_kprobe(old_p, p); |
| 158 | } else if (old_p->pre_handler == aggr_pre_handler) { | 336 | ret = add_new_kprobe(old_p, p); |
| 159 | list_add(&p->list, &old_p->list); | ||
| 160 | } else { | 337 | } else { |
| 161 | ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); | 338 | ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); |
| 162 | if (!ap) | 339 | if (!ap) |
| 163 | return -ENOMEM; | 340 | return -ENOMEM; |
| 164 | add_aggr_kprobe(ap, old_p); | 341 | add_aggr_kprobe(ap, old_p); |
| 165 | list_add(&p->list, &ap->list); | 342 | copy_kprobe(ap, p); |
| 343 | ret = add_new_kprobe(ap, p); | ||
| 166 | } | 344 | } |
| 167 | return ret; | 345 | return ret; |
| 168 | } | 346 | } |
| @@ -170,10 +348,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
| 170 | /* kprobe removal house-keeping routines */ | 348 | /* kprobe removal house-keeping routines */ |
| 171 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) | 349 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) |
| 172 | { | 350 | { |
| 173 | *p->addr = p->opcode; | 351 | arch_disarm_kprobe(p); |
| 174 | hlist_del(&p->hlist); | 352 | hlist_del(&p->hlist); |
| 175 | flush_icache_range((unsigned long) p->addr, | ||
| 176 | (unsigned long) p->addr + sizeof(kprobe_opcode_t)); | ||
| 177 | spin_unlock_irqrestore(&kprobe_lock, flags); | 353 | spin_unlock_irqrestore(&kprobe_lock, flags); |
| 178 | arch_remove_kprobe(p); | 354 | arch_remove_kprobe(p); |
| 179 | } | 355 | } |
| @@ -200,6 +376,7 @@ int register_kprobe(struct kprobe *p) | |||
| 200 | } | 376 | } |
| 201 | spin_lock_irqsave(&kprobe_lock, flags); | 377 | spin_lock_irqsave(&kprobe_lock, flags); |
| 202 | old_p = get_kprobe(p->addr); | 378 | old_p = get_kprobe(p->addr); |
| 379 | p->nmissed = 0; | ||
| 203 | if (old_p) { | 380 | if (old_p) { |
| 204 | ret = register_aggr_kprobe(old_p, p); | 381 | ret = register_aggr_kprobe(old_p, p); |
| 205 | goto out; | 382 | goto out; |
| @@ -210,10 +387,8 @@ int register_kprobe(struct kprobe *p) | |||
| 210 | hlist_add_head(&p->hlist, | 387 | hlist_add_head(&p->hlist, |
| 211 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 388 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
| 212 | 389 | ||
| 213 | p->opcode = *p->addr; | 390 | arch_arm_kprobe(p); |
| 214 | *p->addr = BREAKPOINT_INSTRUCTION; | 391 | |
| 215 | flush_icache_range((unsigned long) p->addr, | ||
| 216 | (unsigned long) p->addr + sizeof(kprobe_opcode_t)); | ||
| 217 | out: | 392 | out: |
| 218 | spin_unlock_irqrestore(&kprobe_lock, flags); | 393 | spin_unlock_irqrestore(&kprobe_lock, flags); |
| 219 | rm_kprobe: | 394 | rm_kprobe: |
| @@ -257,16 +432,82 @@ void unregister_jprobe(struct jprobe *jp) | |||
| 257 | unregister_kprobe(&jp->kp); | 432 | unregister_kprobe(&jp->kp); |
| 258 | } | 433 | } |
| 259 | 434 | ||
| 435 | #ifdef ARCH_SUPPORTS_KRETPROBES | ||
| 436 | |||
| 437 | int register_kretprobe(struct kretprobe *rp) | ||
| 438 | { | ||
| 439 | int ret = 0; | ||
| 440 | struct kretprobe_instance *inst; | ||
| 441 | int i; | ||
| 442 | |||
| 443 | rp->kp.pre_handler = pre_handler_kretprobe; | ||
| 444 | |||
| 445 | /* Pre-allocate memory for max kretprobe instances */ | ||
| 446 | if (rp->maxactive <= 0) { | ||
| 447 | #ifdef CONFIG_PREEMPT | ||
| 448 | rp->maxactive = max(10, 2 * NR_CPUS); | ||
| 449 | #else | ||
| 450 | rp->maxactive = NR_CPUS; | ||
| 451 | #endif | ||
| 452 | } | ||
| 453 | INIT_HLIST_HEAD(&rp->used_instances); | ||
| 454 | INIT_HLIST_HEAD(&rp->free_instances); | ||
| 455 | for (i = 0; i < rp->maxactive; i++) { | ||
| 456 | inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); | ||
| 457 | if (inst == NULL) { | ||
| 458 | free_rp_inst(rp); | ||
| 459 | return -ENOMEM; | ||
| 460 | } | ||
| 461 | INIT_HLIST_NODE(&inst->uflist); | ||
| 462 | hlist_add_head(&inst->uflist, &rp->free_instances); | ||
| 463 | } | ||
| 464 | |||
| 465 | rp->nmissed = 0; | ||
| 466 | /* Establish function entry probe point */ | ||
| 467 | if ((ret = register_kprobe(&rp->kp)) != 0) | ||
| 468 | free_rp_inst(rp); | ||
| 469 | return ret; | ||
| 470 | } | ||
| 471 | |||
| 472 | #else /* ARCH_SUPPORTS_KRETPROBES */ | ||
| 473 | |||
| 474 | int register_kretprobe(struct kretprobe *rp) | ||
| 475 | { | ||
| 476 | return -ENOSYS; | ||
| 477 | } | ||
| 478 | |||
| 479 | #endif /* ARCH_SUPPORTS_KRETPROBES */ | ||
| 480 | |||
| 481 | void unregister_kretprobe(struct kretprobe *rp) | ||
| 482 | { | ||
| 483 | unsigned long flags; | ||
| 484 | struct kretprobe_instance *ri; | ||
| 485 | |||
| 486 | unregister_kprobe(&rp->kp); | ||
| 487 | /* No race here */ | ||
| 488 | spin_lock_irqsave(&kprobe_lock, flags); | ||
| 489 | free_rp_inst(rp); | ||
| 490 | while ((ri = get_used_rp_inst(rp)) != NULL) { | ||
| 491 | ri->rp = NULL; | ||
| 492 | hlist_del(&ri->uflist); | ||
| 493 | } | ||
| 494 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
| 495 | } | ||
| 496 | |||
| 260 | static int __init init_kprobes(void) | 497 | static int __init init_kprobes(void) |
| 261 | { | 498 | { |
| 262 | int i, err = 0; | 499 | int i, err = 0; |
| 263 | 500 | ||
| 264 | /* FIXME allocate the probe table, currently defined statically */ | 501 | /* FIXME allocate the probe table, currently defined statically */ |
| 265 | /* initialize all list heads */ | 502 | /* initialize all list heads */ |
| 266 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) | 503 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 267 | INIT_HLIST_HEAD(&kprobe_table[i]); | 504 | INIT_HLIST_HEAD(&kprobe_table[i]); |
| 505 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | ||
| 506 | } | ||
| 268 | 507 | ||
| 269 | err = register_die_notifier(&kprobe_exceptions_nb); | 508 | err = register_die_notifier(&kprobe_exceptions_nb); |
| 509 | /* Register the trampoline probe for return probe */ | ||
| 510 | register_kprobe(&trampoline_p); | ||
| 270 | return err; | 511 | return err; |
| 271 | } | 512 | } |
| 272 | 513 | ||
| @@ -277,3 +518,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe); | |||
| 277 | EXPORT_SYMBOL_GPL(register_jprobe); | 518 | EXPORT_SYMBOL_GPL(register_jprobe); |
| 278 | EXPORT_SYMBOL_GPL(unregister_jprobe); | 519 | EXPORT_SYMBOL_GPL(unregister_jprobe); |
| 279 | EXPORT_SYMBOL_GPL(jprobe_return); | 520 | EXPORT_SYMBOL_GPL(jprobe_return); |
| 521 | EXPORT_SYMBOL_GPL(register_kretprobe); | ||
| 522 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | ||
| 523 | |||
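
For context, here is a minimal sketch of how a module might use the return-probe interface exported above. It assumes the `handler` member of `struct kretprobe` and its `(struct kretprobe_instance *, struct pt_regs *)` signature from the contemporary API, neither of which appears in this hunk, and `target_function` is a hypothetical exported symbol standing in for whatever you actually want to probe.

```c
/*
 * Sketch only: the .handler field and its signature are assumed from the
 * contemporary kretprobe API, and target_function is a hypothetical
 * exported symbol used as a placeholder.
 */
#include <linux/module.h>
#include <linux/kprobes.h>

extern void target_function(void);	/* hypothetical symbol to probe */

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	printk(KERN_INFO "probed function returned\n");
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler   = my_ret_handler,	/* assumed field, see lead-in */
	.maxactive = 20,		/* instances to pre-allocate */
};

static int __init my_init(void)
{
	my_kretprobe.kp.addr = (kprobe_opcode_t *)target_function;
	return register_kretprobe(&my_kretprobe);
}

static void __exit my_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
```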
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1f064a63f8cf..015fb69ad94d 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) | |||
| 30 | KERNEL_ATTR_RO(hotplug_seqnum); | 30 | KERNEL_ATTR_RO(hotplug_seqnum); |
| 31 | #endif | 31 | #endif |
| 32 | 32 | ||
| 33 | #ifdef CONFIG_KEXEC | ||
| 34 | #include <asm/kexec.h> | ||
| 35 | |||
| 36 | static ssize_t crash_notes_show(struct subsystem *subsys, char *page) | ||
| 37 | { | ||
| 38 | return sprintf(page, "%p\n", (void *)crash_notes); | ||
| 39 | } | ||
| 40 | KERNEL_ATTR_RO(crash_notes); | ||
| 41 | #endif | ||
| 42 | |||
| 33 | decl_subsys(kernel, NULL, NULL); | 43 | decl_subsys(kernel, NULL, NULL); |
| 34 | EXPORT_SYMBOL_GPL(kernel_subsys); | 44 | EXPORT_SYMBOL_GPL(kernel_subsys); |
| 35 | 45 | ||
| @@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = { | |||
| 37 | #ifdef CONFIG_HOTPLUG | 47 | #ifdef CONFIG_HOTPLUG |
| 38 | &hotplug_seqnum_attr.attr, | 48 | &hotplug_seqnum_attr.attr, |
| 39 | #endif | 49 | #endif |
| 50 | #ifdef CONFIG_KEXEC | ||
| 51 | &crash_notes_attr.attr, | ||
| 52 | #endif | ||
| 40 | NULL | 53 | NULL |
| 41 | }; | 54 | }; |
| 42 | 55 | ||
diff --git a/kernel/module.c b/kernel/module.c index 5734ab09d3f9..068e271ab3a5 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
| 36 | #include <linux/stop_machine.h> | 36 | #include <linux/stop_machine.h> |
| 37 | #include <linux/device.h> | 37 | #include <linux/device.h> |
| 38 | #include <linux/string.h> | ||
| 38 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
| 39 | #include <asm/semaphore.h> | 40 | #include <asm/semaphore.h> |
| 40 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
| @@ -370,6 +371,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, | |||
| 370 | #endif /* CONFIG_SMP */ | 371 | #endif /* CONFIG_SMP */ |
| 371 | 372 | ||
| 372 | #ifdef CONFIG_MODULE_UNLOAD | 373 | #ifdef CONFIG_MODULE_UNLOAD |
| 374 | #define MODINFO_ATTR(field) \ | ||
| 375 | static void setup_modinfo_##field(struct module *mod, const char *s) \ | ||
| 376 | { \ | ||
| 377 | mod->field = kstrdup(s, GFP_KERNEL); \ | ||
| 378 | } \ | ||
| 379 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ | ||
| 380 | struct module *mod, char *buffer) \ | ||
| 381 | { \ | ||
| 382 | return sprintf(buffer, "%s\n", mod->field); \ | ||
| 383 | } \ | ||
| 384 | static int modinfo_##field##_exists(struct module *mod) \ | ||
| 385 | { \ | ||
| 386 | return mod->field != NULL; \ | ||
| 387 | } \ | ||
| 388 | static void free_modinfo_##field(struct module *mod) \ | ||
| 389 | { \ | ||
| 390 | kfree(mod->field); \ | ||
| 391 | mod->field = NULL; \ | ||
| 392 | } \ | ||
| 393 | static struct module_attribute modinfo_##field = { \ | ||
| 394 | .attr = { .name = __stringify(field), .mode = 0444, \ | ||
| 395 | .owner = THIS_MODULE }, \ | ||
| 396 | .show = show_modinfo_##field, \ | ||
| 397 | .setup = setup_modinfo_##field, \ | ||
| 398 | .test = modinfo_##field##_exists, \ | ||
| 399 | .free = free_modinfo_##field, \ | ||
| 400 | }; | ||
| 401 | |||
| 402 | MODINFO_ATTR(version); | ||
| 403 | MODINFO_ATTR(srcversion); | ||
| 404 | |||
| 405 | static struct module_attribute *modinfo_attrs[] = { | ||
| 406 | &modinfo_version, | ||
| 407 | &modinfo_srcversion, | ||
| 408 | NULL, | ||
| 409 | }; | ||
| 410 | |||
| 373 | /* Init the unload section of the module. */ | 411 | /* Init the unload section of the module. */ |
| 374 | static void module_unload_init(struct module *mod) | 412 | static void module_unload_init(struct module *mod) |
| 375 | { | 413 | { |
| @@ -379,7 +417,7 @@ static void module_unload_init(struct module *mod) | |||
| 379 | for (i = 0; i < NR_CPUS; i++) | 417 | for (i = 0; i < NR_CPUS; i++) |
| 380 | local_set(&mod->ref[i].count, 0); | 418 | local_set(&mod->ref[i].count, 0); |
| 381 | /* Hold reference count during initialization. */ | 419 | /* Hold reference count during initialization. */ |
| 382 | local_set(&mod->ref[_smp_processor_id()].count, 1); | 420 | local_set(&mod->ref[raw_smp_processor_id()].count, 1); |
| 383 | /* Backwards compatibility macros put refcount during init. */ | 421 | /* Backwards compatibility macros put refcount during init. */ |
| 384 | mod->waiter = current; | 422 | mod->waiter = current; |
| 385 | } | 423 | } |
| @@ -692,7 +730,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp) | |||
| 692 | return 0; | 730 | return 0; |
| 693 | } | 731 | } |
| 694 | 732 | ||
| 695 | int set_obsolete(const char *val, struct kernel_param *kp) | 733 | static int set_obsolete(const char *val, struct kernel_param *kp) |
| 696 | { | 734 | { |
| 697 | unsigned int min, max; | 735 | unsigned int min, max; |
| 698 | unsigned int size, maxsize; | 736 | unsigned int size, maxsize; |
| @@ -1031,6 +1069,32 @@ static void module_remove_refcnt_attr(struct module *mod) | |||
| 1031 | } | 1069 | } |
| 1032 | #endif | 1070 | #endif |
| 1033 | 1071 | ||
| 1072 | #ifdef CONFIG_MODULE_UNLOAD | ||
| 1073 | static int module_add_modinfo_attrs(struct module *mod) | ||
| 1074 | { | ||
| 1075 | struct module_attribute *attr; | ||
| 1076 | int error = 0; | ||
| 1077 | int i; | ||
| 1078 | |||
| 1079 | for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { | ||
| 1080 | if (!attr->test || | ||
| 1081 | (attr->test && attr->test(mod))) | ||
| 1082 | error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr); | ||
| 1083 | } | ||
| 1084 | return error; | ||
| 1085 | } | ||
| 1086 | |||
| 1087 | static void module_remove_modinfo_attrs(struct module *mod) | ||
| 1088 | { | ||
| 1089 | struct module_attribute *attr; | ||
| 1090 | int i; | ||
| 1091 | |||
| 1092 | for (i = 0; (attr = modinfo_attrs[i]); i++) { | ||
| 1093 | sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); | ||
| 1094 | attr->free(mod); | ||
| 1095 | } | ||
| 1096 | } | ||
| 1097 | #endif | ||
| 1034 | 1098 | ||
| 1035 | static int mod_sysfs_setup(struct module *mod, | 1099 | static int mod_sysfs_setup(struct module *mod, |
| 1036 | struct kernel_param *kparam, | 1100 | struct kernel_param *kparam, |
| @@ -1056,6 +1120,12 @@ static int mod_sysfs_setup(struct module *mod, | |||
| 1056 | if (err) | 1120 | if (err) |
| 1057 | goto out_unreg; | 1121 | goto out_unreg; |
| 1058 | 1122 | ||
| 1123 | #ifdef CONFIG_MODULE_UNLOAD | ||
| 1124 | err = module_add_modinfo_attrs(mod); | ||
| 1125 | if (err) | ||
| 1126 | goto out_unreg; | ||
| 1127 | #endif | ||
| 1128 | |||
| 1059 | return 0; | 1129 | return 0; |
| 1060 | 1130 | ||
| 1061 | out_unreg: | 1131 | out_unreg: |
| @@ -1066,6 +1136,9 @@ out: | |||
| 1066 | 1136 | ||
| 1067 | static void mod_kobject_remove(struct module *mod) | 1137 | static void mod_kobject_remove(struct module *mod) |
| 1068 | { | 1138 | { |
| 1139 | #ifdef CONFIG_MODULE_UNLOAD | ||
| 1140 | module_remove_modinfo_attrs(mod); | ||
| 1141 | #endif | ||
| 1069 | module_remove_refcnt_attr(mod); | 1142 | module_remove_refcnt_attr(mod); |
| 1070 | module_param_sysfs_remove(mod); | 1143 | module_param_sysfs_remove(mod); |
| 1071 | 1144 | ||
| @@ -1311,6 +1384,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs, | |||
| 1311 | return NULL; | 1384 | return NULL; |
| 1312 | } | 1385 | } |
| 1313 | 1386 | ||
| 1387 | #ifdef CONFIG_MODULE_UNLOAD | ||
| 1388 | static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, | ||
| 1389 | unsigned int infoindex) | ||
| 1390 | { | ||
| 1391 | struct module_attribute *attr; | ||
| 1392 | int i; | ||
| 1393 | |||
| 1394 | for (i = 0; (attr = modinfo_attrs[i]); i++) { | ||
| 1395 | if (attr->setup) | ||
| 1396 | attr->setup(mod, | ||
| 1397 | get_modinfo(sechdrs, | ||
| 1398 | infoindex, | ||
| 1399 | attr->attr.name)); | ||
| 1400 | } | ||
| 1401 | } | ||
| 1402 | #endif | ||
| 1403 | |||
| 1314 | #ifdef CONFIG_KALLSYMS | 1404 | #ifdef CONFIG_KALLSYMS |
| 1315 | int is_exported(const char *name, const struct module *mod) | 1405 | int is_exported(const char *name, const struct module *mod) |
| 1316 | { | 1406 | { |
| @@ -1615,6 +1705,11 @@ static struct module *load_module(void __user *umod, | |||
| 1615 | /* Set up license info based on the info section */ | 1705 | /* Set up license info based on the info section */ |
| 1616 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1706 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
| 1617 | 1707 | ||
| 1708 | #ifdef CONFIG_MODULE_UNLOAD | ||
| 1709 | /* Set up MODINFO_ATTR fields */ | ||
| 1710 | setup_modinfo(mod, sechdrs, infoindex); | ||
| 1711 | #endif | ||
| 1712 | |||
| 1618 | /* Fix up syms, so that st_value is a pointer to location. */ | 1713 | /* Fix up syms, so that st_value is a pointer to location. */ |
| 1619 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, | 1714 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, |
| 1620 | mod); | 1715 | mod); |
| @@ -1758,6 +1853,7 @@ sys_init_module(void __user *umod, | |||
| 1758 | const char __user *uargs) | 1853 | const char __user *uargs) |
| 1759 | { | 1854 | { |
| 1760 | struct module *mod; | 1855 | struct module *mod; |
| 1856 | mm_segment_t old_fs = get_fs(); | ||
| 1761 | int ret = 0; | 1857 | int ret = 0; |
| 1762 | 1858 | ||
| 1763 | /* Must have permission */ | 1859 | /* Must have permission */ |
| @@ -1775,6 +1871,9 @@ sys_init_module(void __user *umod, | |||
| 1775 | return PTR_ERR(mod); | 1871 | return PTR_ERR(mod); |
| 1776 | } | 1872 | } |
| 1777 | 1873 | ||
| 1874 | /* flush the icache in the correct context */ | ||
| 1875 | set_fs(KERNEL_DS); | ||
| 1876 | |||
| 1778 | /* Flush the instruction cache, since we've played with text */ | 1877 | /* Flush the instruction cache, since we've played with text */ |
| 1779 | if (mod->module_init) | 1878 | if (mod->module_init) |
| 1780 | flush_icache_range((unsigned long)mod->module_init, | 1879 | flush_icache_range((unsigned long)mod->module_init, |
| @@ -1783,6 +1882,8 @@ sys_init_module(void __user *umod, | |||
| 1783 | flush_icache_range((unsigned long)mod->module_core, | 1882 | flush_icache_range((unsigned long)mod->module_core, |
| 1784 | (unsigned long)mod->module_core + mod->core_size); | 1883 | (unsigned long)mod->module_core + mod->core_size); |
| 1785 | 1884 | ||
| 1885 | set_fs(old_fs); | ||
| 1886 | |||
| 1786 | /* Now sew it into the lists. They won't access us, since | 1887 | /* Now sew it into the lists. They won't access us, since |
| 1787 | strong_try_module_get() will fail. */ | 1888 | strong_try_module_get() will fail. */ |
| 1788 | stop_machine_run(__link_module, mod, NR_CPUS); | 1889 | stop_machine_run(__link_module, mod, NR_CPUS); |
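
The MODINFO_ATTR() block earlier in this file's diff generates read-only sysfs attributes, so with CONFIG_MODULE_UNLOAD enabled a module built with MODULE_VERSION() (and a srcversion where modpost generates one) should now show up as /sys/module/&lt;name&gt;/version and /sys/module/&lt;name&gt;/srcversion. A throwaway userspace reader, not part of the patch, might look like this; the module name is supplied on the command line rather than assumed:

```c
/*
 * Userspace sketch: read the "version" attribute that the MODINFO_ATTR()
 * machinery creates for a loaded module.  The module must have been built
 * with MODULE_VERSION() for the file to exist.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[256], line[128];
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <module>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/sys/module/%s/version", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("%s: %s", argv[1], line);
	fclose(f);
	return 0;
}
```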
diff --git a/kernel/panic.c b/kernel/panic.c index 081f7465fc8d..74ba5f3e46c7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/sysrq.h> | 18 | #include <linux/sysrq.h> |
| 19 | #include <linux/interrupt.h> | 19 | #include <linux/interrupt.h> |
| 20 | #include <linux/nmi.h> | 20 | #include <linux/nmi.h> |
| 21 | #include <linux/kexec.h> | ||
| 21 | 22 | ||
| 22 | int panic_timeout; | 23 | int panic_timeout; |
| 23 | int panic_on_oops; | 24 | int panic_on_oops; |
| @@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 63 | unsigned long caller = (unsigned long) __builtin_return_address(0); | 64 | unsigned long caller = (unsigned long) __builtin_return_address(0); |
| 64 | #endif | 65 | #endif |
| 65 | 66 | ||
| 67 | /* | ||
| 68 | * It's possible to come here directly from a panic-assertion and not | ||
| 69 | * have preempt disabled. Some functions called from here want | ||
| 70 | * preempt to be disabled. No point enabling it later though... | ||
| 71 | */ | ||
| 72 | preempt_disable(); | ||
| 73 | |||
| 66 | bust_spinlocks(1); | 74 | bust_spinlocks(1); |
| 67 | va_start(args, fmt); | 75 | va_start(args, fmt); |
| 68 | vsnprintf(buf, sizeof(buf), fmt, args); | 76 | vsnprintf(buf, sizeof(buf), fmt, args); |
| @@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 70 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); | 78 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); |
| 71 | bust_spinlocks(0); | 79 | bust_spinlocks(0); |
| 72 | 80 | ||
| 81 | /* | ||
| 82 | * If we have crashed and we have a crash kernel loaded let it handle | ||
| 83 | * everything else. | ||
| 84 | * Do we want to call this before we try to display a message? | ||
| 85 | */ | ||
| 86 | crash_kexec(NULL); | ||
| 87 | |||
| 73 | #ifdef CONFIG_SMP | 88 | #ifdef CONFIG_SMP |
| 89 | /* | ||
| 90 | * Note smp_send_stop is the usual smp shutdown function, which | ||
| 91 | * unfortunately means it may not be hardened to work in a panic | ||
| 92 | * situation. | ||
| 93 | */ | ||
| 74 | smp_send_stop(); | 94 | smp_send_stop(); |
| 75 | #endif | 95 | #endif |
| 76 | 96 | ||
| @@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 79 | if (!panic_blink) | 99 | if (!panic_blink) |
| 80 | panic_blink = no_blink; | 100 | panic_blink = no_blink; |
| 81 | 101 | ||
| 82 | if (panic_timeout > 0) | 102 | if (panic_timeout > 0) { |
| 83 | { | ||
| 84 | /* | 103 | /* |
| 85 | * Delay timeout seconds before rebooting the machine. | 104 | * Delay timeout seconds before rebooting the machine. |
| 86 | * We can't use the "normal" timers since we just panicked.. | 105 | * We can't use the "normal" timers since we just panicked.. |
diff --git a/kernel/params.c b/kernel/params.c index 5513844bec13..d586c35ef8fc 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -629,7 +629,7 @@ static ssize_t module_attr_show(struct kobject *kobj, | |||
| 629 | mk = to_module_kobject(kobj); | 629 | mk = to_module_kobject(kobj); |
| 630 | 630 | ||
| 631 | if (!attribute->show) | 631 | if (!attribute->show) |
| 632 | return -EPERM; | 632 | return -EIO; |
| 633 | 633 | ||
| 634 | if (!try_module_get(mk->mod)) | 634 | if (!try_module_get(mk->mod)) |
| 635 | return -ENODEV; | 635 | return -ENODEV; |
| @@ -653,7 +653,7 @@ static ssize_t module_attr_store(struct kobject *kobj, | |||
| 653 | mk = to_module_kobject(kobj); | 653 | mk = to_module_kobject(kobj); |
| 654 | 654 | ||
| 655 | if (!attribute->store) | 655 | if (!attribute->store) |
| 656 | return -EPERM; | 656 | return -EIO; |
| 657 | 657 | ||
| 658 | if (!try_module_get(mk->mod)) | 658 | if (!try_module_get(mk->mod)) |
| 659 | return -ENODEV; | 659 | return -ENODEV; |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fd316c272260..5b7b4736d82b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -89,23 +89,6 @@ static struct idr posix_timers_id; | |||
| 89 | static DEFINE_SPINLOCK(idr_lock); | 89 | static DEFINE_SPINLOCK(idr_lock); |
| 90 | 90 | ||
| 91 | /* | 91 | /* |
| 92 | * Just because the timer is not in the timer list does NOT mean it is | ||
| 93 | * inactive. It could be in the "fire" routine getting a new expire time. | ||
| 94 | */ | ||
| 95 | #define TIMER_INACTIVE 1 | ||
| 96 | |||
| 97 | #ifdef CONFIG_SMP | ||
| 98 | # define timer_active(tmr) \ | ||
| 99 | ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) | ||
| 100 | # define set_timer_inactive(tmr) \ | ||
| 101 | do { \ | ||
| 102 | (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ | ||
| 103 | } while (0) | ||
| 104 | #else | ||
| 105 | # define timer_active(tmr) BARFY // error to use outside of SMP | ||
| 106 | # define set_timer_inactive(tmr) do { } while (0) | ||
| 107 | #endif | ||
| 108 | /* | ||
| 109 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other | 92 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other |
| 110 | * SIGEV values. Here we put out an error if this assumption fails. | 93 | * SIGEV values. Here we put out an error if this assumption fails. |
| 111 | */ | 94 | */ |
| @@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer) | |||
| 226 | init_timer(&new_timer->it.real.timer); | 209 | init_timer(&new_timer->it.real.timer); |
| 227 | new_timer->it.real.timer.data = (unsigned long) new_timer; | 210 | new_timer->it.real.timer.data = (unsigned long) new_timer; |
| 228 | new_timer->it.real.timer.function = posix_timer_fn; | 211 | new_timer->it.real.timer.function = posix_timer_fn; |
| 229 | set_timer_inactive(new_timer); | ||
| 230 | return 0; | 212 | return 0; |
| 231 | } | 213 | } |
| 232 | 214 | ||
| @@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data) | |||
| 480 | int do_notify = 1; | 462 | int do_notify = 1; |
| 481 | 463 | ||
| 482 | spin_lock_irqsave(&timr->it_lock, flags); | 464 | spin_lock_irqsave(&timr->it_lock, flags); |
| 483 | set_timer_inactive(timr); | ||
| 484 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | 465 | if (!list_empty(&timr->it.real.abs_timer_entry)) { |
| 485 | spin_lock(&abs_list.lock); | 466 | spin_lock(&abs_list.lock); |
| 486 | do { | 467 | do { |
| @@ -983,8 +964,8 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
| 983 | * careful here. If smp we could be in the "fire" routine which will | 964 | * careful here. If smp we could be in the "fire" routine which will |
| 984 | * be spinning as we hold the lock. But this is ONLY an SMP issue. | 965 | * be spinning as we hold the lock. But this is ONLY an SMP issue. |
| 985 | */ | 966 | */ |
| 967 | if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { | ||
| 986 | #ifdef CONFIG_SMP | 968 | #ifdef CONFIG_SMP |
| 987 | if (timer_active(timr) && !del_timer(&timr->it.real.timer)) | ||
| 988 | /* | 969 | /* |
| 989 | * It can only be active if on an other cpu. Since | 970 | * It can only be active if on an other cpu. Since |
| 990 | * we have cleared the interval stuff above, it should | 971 | * we have cleared the interval stuff above, it should |
| @@ -994,11 +975,9 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
| 994 | * a "retry" exit status. | 975 | * a "retry" exit status. |
| 995 | */ | 976 | */ |
| 996 | return TIMER_RETRY; | 977 | return TIMER_RETRY; |
| 997 | |||
| 998 | set_timer_inactive(timr); | ||
| 999 | #else | ||
| 1000 | del_timer(&timr->it.real.timer); | ||
| 1001 | #endif | 978 | #endif |
| 979 | } | ||
| 980 | |||
| 1002 | remove_from_abslist(timr); | 981 | remove_from_abslist(timr); |
| 1003 | 982 | ||
| 1004 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & | 983 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & |
| @@ -1083,8 +1062,9 @@ retry: | |||
| 1083 | static inline int common_timer_del(struct k_itimer *timer) | 1062 | static inline int common_timer_del(struct k_itimer *timer) |
| 1084 | { | 1063 | { |
| 1085 | timer->it.real.incr = 0; | 1064 | timer->it.real.incr = 0; |
| 1065 | |||
| 1066 | if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { | ||
| 1086 | #ifdef CONFIG_SMP | 1067 | #ifdef CONFIG_SMP |
| 1087 | if (timer_active(timer) && !del_timer(&timer->it.real.timer)) | ||
| 1088 | /* | 1068 | /* |
| 1089 | * It can only be active if on an other cpu. Since | 1069 | * It can only be active if on an other cpu. Since |
| 1090 | * we have cleared the interval stuff above, it should | 1070 | * we have cleared the interval stuff above, it should |
| @@ -1094,9 +1074,9 @@ static inline int common_timer_del(struct k_itimer *timer) | |||
| 1094 | * a "retry" exit status. | 1074 | * a "retry" exit status. |
| 1095 | */ | 1075 | */ |
| 1096 | return TIMER_RETRY; | 1076 | return TIMER_RETRY; |
| 1097 | #else | ||
| 1098 | del_timer(&timer->it.real.timer); | ||
| 1099 | #endif | 1077 | #endif |
| 1078 | } | ||
| 1079 | |||
| 1100 | remove_from_abslist(timer); | 1080 | remove_from_abslist(timer); |
| 1101 | 1081 | ||
| 1102 | return 0; | 1082 | return 0; |
| @@ -1197,6 +1177,7 @@ void exit_itimers(struct signal_struct *sig) | |||
| 1197 | tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); | 1177 | tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); |
| 1198 | itimer_delete(tmr); | 1178 | itimer_delete(tmr); |
| 1199 | } | 1179 | } |
| 1180 | del_timer_sync(&sig->real_timer); | ||
| 1200 | } | 1181 | } |
| 1201 | 1182 | ||
| 1202 | /* | 1183 | /* |
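
The posix-timers.c hunks above drop the hand-rolled TIMER_INACTIVE / timer_active() bookkeeping and rely on try_to_del_timer_sync(), which reports through a negative return value that the timer callback is currently running on another CPU. Because posix_timer_fn() takes it_lock, the code that disarms the timer cannot wait for the callback while holding that same lock, so it returns TIMER_RETRY and the caller unlocks and retries. A minimal sketch of that idiom, assuming only the return convention visible in the hunks:

```c
#include <linux/errno.h>
#include <linux/timer.h>

/*
 * Sketch only: disarm a timer whose callback takes a lock we hold.
 * A negative return from try_to_del_timer_sync() means the callback
 * is live on another CPU, so spinning in del_timer_sync() would
 * deadlock; report "retry" and let the caller drop the lock first.
 */
static int demo_disarm(struct timer_list *timer)
{
	if (try_to_del_timer_sync(timer) < 0)
		return -EAGAIN;		/* stands in for TIMER_RETRY */
	return 0;			/* timer no longer pending anywhere */
}
```

On uniprocessor builds the callback can never be running concurrently, which is why the remaining #ifdef CONFIG_SMP in the hunks now only guards the retry comment and the TIMER_RETRY return.
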
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 696387ffe49c..2c7121d9bff1 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -27,8 +27,8 @@ config PM_DEBUG | |||
| 27 | like suspend support. | 27 | like suspend support. |
| 28 | 28 | ||
| 29 | config SOFTWARE_SUSPEND | 29 | config SOFTWARE_SUSPEND |
| 30 | bool "Software Suspend (EXPERIMENTAL)" | 30 | bool "Software Suspend" |
| 31 | depends on EXPERIMENTAL && PM && SWAP | 31 | depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) |
| 32 | ---help--- | 32 | ---help--- |
| 33 | Enable the possibility of suspending the machine. | 33 | Enable the possibility of suspending the machine. |
| 34 | It doesn't need APM. | 34 | It doesn't need APM. |
| @@ -72,3 +72,7 @@ config PM_STD_PARTITION | |||
| 72 | suspended image to. It will simply pick the first available swap | 72 | suspended image to. It will simply pick the first available swap |
| 73 | device. | 73 | device. |
| 74 | 74 | ||
| 75 | config SUSPEND_SMP | ||
| 76 | bool | ||
| 77 | depends on HOTPLUG_CPU && X86 && PM | ||
| 78 | default y | ||
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index fbdc634135a7..2f438d0eaa13 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
| @@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) | |||
| 3 | EXTRA_CFLAGS += -DDEBUG | 3 | EXTRA_CFLAGS += -DDEBUG |
| 4 | endif | 4 | endif |
| 5 | 5 | ||
| 6 | swsusp-smp-$(CONFIG_SMP) += smp.o | ||
| 7 | |||
| 8 | obj-y := main.o process.o console.o pm.o | 6 | obj-y := main.o process.o console.o pm.o |
| 9 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o | 7 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o |
| 8 | |||
| 9 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | ||
| 10 | 10 | ||
| 11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
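
With the two build-system hunks above, kernel/power/smp.o is no longer tied to CONFIG_SMP plus CONFIG_SOFTWARE_SUSPEND but to the new CONFIG_SUSPEND_SMP, which is only enabled where CPU hotplug exists (HOTPLUG_CPU && X86 && PM). Callers elsewhere in kernel/power/ usually keep compiling either way through the stub pattern sketched below; the real header is outside this diff, so this is an assumption about its shape, not part of the patch.

```c
/*
 * Assumed companion declarations (not shown in this diff): when
 * CONFIG_SUSPEND_SMP is off, the suspend code links against empty
 * inline stubs instead of the implementations in kernel/power/smp.c.
 */
#ifdef CONFIG_SUSPEND_SMP
extern void disable_nonboot_cpus(void);
extern void enable_nonboot_cpus(void);
#else
static inline void disable_nonboot_cpus(void) {}
static inline void enable_nonboot_cpus(void) {}
#endif
```
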
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 02b6764034dc..fb8de63c2919 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
| @@ -117,8 +117,8 @@ static void finish(void) | |||
| 117 | { | 117 | { |
| 118 | device_resume(); | 118 | device_resume(); |
| 119 | platform_finish(); | 119 | platform_finish(); |
| 120 | enable_nonboot_cpus(); | ||
| 121 | thaw_processes(); | 120 | thaw_processes(); |
| 121 | enable_nonboot_cpus(); | ||
| 122 | pm_restore_console(); | 122 | pm_restore_console(); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| @@ -131,28 +131,35 @@ static int prepare_processes(void) | |||
| 131 | 131 | ||
| 132 | sys_sync(); | 132 | sys_sync(); |
| 133 | 133 | ||
| 134 | disable_nonboot_cpus(); | ||
| 135 | |||
| 134 | if (freeze_processes()) { | 136 | if (freeze_processes()) { |
| 135 | error = -EBUSY; | 137 | error = -EBUSY; |
| 136 | return error; | 138 | goto thaw; |
| 137 | } | 139 | } |
| 138 | 140 | ||
| 139 | if (pm_disk_mode == PM_DISK_PLATFORM) { | 141 | if (pm_disk_mode == PM_DISK_PLATFORM) { |
| 140 | if (pm_ops && pm_ops->prepare) { | 142 | if (pm_ops && pm_ops->prepare) { |
| 141 | if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) | 143 | if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) |
| 142 | return error; | 144 | goto thaw; |
| 143 | } | 145 | } |
| 144 | } | 146 | } |
| 145 | 147 | ||
| 146 | /* Free memory before shutting down devices. */ | 148 | /* Free memory before shutting down devices. */ |
| 147 | free_some_memory(); | 149 | free_some_memory(); |
| 148 | |||
| 149 | return 0; | 150 | return 0; |
| 151 | thaw: | ||
| 152 | thaw_processes(); | ||
| 153 | enable_nonboot_cpus(); | ||
| 154 | pm_restore_console(); | ||
| 155 | return error; | ||
| 150 | } | 156 | } |
| 151 | 157 | ||
| 152 | static void unprepare_processes(void) | 158 | static void unprepare_processes(void) |
| 153 | { | 159 | { |
| 154 | enable_nonboot_cpus(); | 160 | platform_finish(); |
| 155 | thaw_processes(); | 161 | thaw_processes(); |
| 162 | enable_nonboot_cpus(); | ||
| 156 | pm_restore_console(); | 163 | pm_restore_console(); |
| 157 | } | 164 | } |
| 158 | 165 | ||
| @@ -160,15 +167,9 @@ static int prepare_devices(void) | |||
| 160 | { | 167 | { |
| 161 | int error; | 168 | int error; |
| 162 | 169 | ||
| 163 | disable_nonboot_cpus(); | 170 | if ((error = device_suspend(PMSG_FREEZE))) |
| 164 | if ((error = device_suspend(PMSG_FREEZE))) { | ||
| 165 | printk("Some devices failed to suspend\n"); | 171 | printk("Some devices failed to suspend\n"); |
| 166 | platform_finish(); | 172 | return error; |
| 167 | enable_nonboot_cpus(); | ||
| 168 | return error; | ||
| 169 | } | ||
| 170 | |||
| 171 | return 0; | ||
| 172 | } | 173 | } |
| 173 | 174 | ||
| 174 | /** | 175 | /** |
| @@ -185,9 +186,9 @@ int pm_suspend_disk(void) | |||
| 185 | int error; | 186 | int error; |
| 186 | 187 | ||
| 187 | error = prepare_processes(); | 188 | error = prepare_processes(); |
| 188 | if (!error) { | 189 | if (error) |
| 189 | error = prepare_devices(); | 190 | return error; |
| 190 | } | 191 | error = prepare_devices(); |
| 191 | 192 | ||
| 192 | if (error) { | 193 | if (error) { |
| 193 | unprepare_processes(); | 194 | unprepare_processes(); |
| @@ -250,7 +251,7 @@ static int software_resume(void) | |||
| 250 | 251 | ||
| 251 | if ((error = prepare_processes())) { | 252 | if ((error = prepare_processes())) { |
| 252 | swsusp_close(); | 253 | swsusp_close(); |
| 253 | goto Cleanup; | 254 | goto Done; |
| 254 | } | 255 | } |
| 255 | 256 | ||
| 256 | pr_debug("PM: Reading swsusp image.\n"); | 257 | pr_debug("PM: Reading swsusp image.\n"); |
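
The disk.c hunks above reorder suspend preparation so that non-boot CPUs are unplugged before processes are frozen, and they replace the early `return error` exits with a single unwind path (`goto thaw`) that undoes the completed steps in reverse order; unprepare_processes() and finish() mirror the same ordering, and the main.c hunks below apply the same pattern to suspend-to-RAM. A stripped-down sketch of the idiom, using the function names from the hunks (prototypes repeated only to keep the sketch self-contained):

```c
#include <linux/errno.h>

/* Prototypes as used by the hunks above; normally pulled in from a
 * kernel header. */
extern void disable_nonboot_cpus(void);
extern void enable_nonboot_cpus(void);
extern int freeze_processes(void);
extern void thaw_processes(void);

/* Sketch of the unwind idiom: a failing step jumps to a label that
 * reverses, in opposite order, everything that already succeeded. */
static int demo_prepare(void)
{
	int error = 0;

	disable_nonboot_cpus();		/* step 1: down to one CPU */

	if (freeze_processes()) {	/* step 2: freeze userspace */
		error = -EBUSY;
		goto thaw;
	}
	return 0;

thaw:
	thaw_processes();		/* undo step 2 (safe if partial) */
	enable_nonboot_cpus();		/* undo step 1 */
	return error;
}
```
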
diff --git a/kernel/power/main.c b/kernel/power/main.c index 4cdebc972ff2..c94cb9e95090 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state) | |||
| 55 | 55 | ||
| 56 | pm_prepare_console(); | 56 | pm_prepare_console(); |
| 57 | 57 | ||
| 58 | disable_nonboot_cpus(); | ||
| 59 | |||
| 60 | if (num_online_cpus() != 1) { | ||
| 61 | error = -EPERM; | ||
| 62 | goto Enable_cpu; | ||
| 63 | } | ||
| 64 | |||
| 58 | if (freeze_processes()) { | 65 | if (freeze_processes()) { |
| 59 | error = -EAGAIN; | 66 | error = -EAGAIN; |
| 60 | goto Thaw; | 67 | goto Thaw; |
| @@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state) | |||
| 75 | pm_ops->finish(state); | 82 | pm_ops->finish(state); |
| 76 | Thaw: | 83 | Thaw: |
| 77 | thaw_processes(); | 84 | thaw_processes(); |
| 85 | Enable_cpu: | ||
| 86 | enable_nonboot_cpus(); | ||
| 78 | pm_restore_console(); | 87 | pm_restore_console(); |
| 79 | return error; | 88 | return error; |
| 80 | } | 89 | } |
| @@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state) | |||
| 113 | if (pm_ops && pm_ops->finish) | 122 | if (pm_ops && pm_ops->finish) |
| 114 | pm_ops->finish(state); | 123 | pm_ops->finish(state); |
| 115 | thaw_processes(); | 124 | thaw_processes(); |
| 125 | enable_nonboot_cpus(); | ||
| 116 | pm_restore_console(); | 126 | pm_restore_console(); |
| 117 | } | 127 | } |
| 118 | 128 | ||
| @@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state) | |||
| 150 | goto Unlock; | 160 | goto Unlock; |
| 151 | } | 161 | } |
| 152 | 162 | ||
| 153 | /* Suspend is hard to get right on SMP. */ | ||
| 154 | if (num_online_cpus() != 1) { | ||
| 155 | error = -EPERM; | ||
| 156 | goto Unlock; | ||
| 157 | } | ||
| 158 | |||
| 159 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 163 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
| 160 | if ((error = suspend_prepare(state))) | 164 | if ((error = suspend_prepare(state))) |
| 161 | goto Unlock; | 165 | goto Unlock; |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 78d92dc6a1ed..0a086640bcfc 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p) | |||
| 32 | } | 32 | } |
| 33 | 33 | ||
| 34 | /* Refrigerator is place where frozen processes are stored :-). */ | 34 | /* Refrigerator is place where frozen processes are stored :-). */ |
| 35 | void refrigerator(unsigned long flag) | 35 | void refrigerator(void) |
| 36 | { | 36 | { |
| 37 | /* Hmm, should we be allowed to suspend when there are realtime | 37 | /* Hmm, should we be allowed to suspend when there are realtime |
| 38 | processes around? */ | 38 | processes around? */ |
| @@ -41,14 +41,13 @@ void refrigerator(unsigned long flag) | |||
| 41 | current->state = TASK_UNINTERRUPTIBLE; | 41 | current->state = TASK_UNINTERRUPTIBLE; |
| 42 | pr_debug("%s entered refrigerator\n", current->comm); | 42 | pr_debug("%s entered refrigerator\n", current->comm); |
| 43 | printk("="); | 43 | printk("="); |
| 44 | current->flags &= ~PF_FREEZE; | ||
| 45 | 44 | ||
| 45 | frozen_process(current); | ||
| 46 | spin_lock_irq(¤t->sighand->siglock); | 46 | spin_lock_irq(¤t->sighand->siglock); |
| 47 | recalc_sigpending(); /* We sent fake signal, clean it up */ | 47 | recalc_sigpending(); /* We sent fake signal, clean it up */ |
| 48 | spin_unlock_irq(¤t->sighand->siglock); | 48 | spin_unlock_irq(¤t->sighand->siglock); |
| 49 | 49 | ||
| 50 | current->flags |= PF_FROZEN; | 50 | while (frozen(current)) |
| 51 | while (current->flags & PF_FROZEN) | ||
| 52 | schedule(); | 51 | schedule(); |
| 53 | pr_debug("%s left refrigerator\n", current->comm); | 52 | pr_debug("%s left refrigerator\n", current->comm); |
| 54 | current->state = save; | 53 | current->state = save; |
| @@ -57,10 +56,10 @@ void refrigerator(unsigned long flag) | |||
| 57 | /* 0 = success, else # of processes that we failed to stop */ | 56 | /* 0 = success, else # of processes that we failed to stop */ |
| 58 | int freeze_processes(void) | 57 | int freeze_processes(void) |
| 59 | { | 58 | { |
| 60 | int todo; | 59 | int todo; |
| 61 | unsigned long start_time; | 60 | unsigned long start_time; |
| 62 | struct task_struct *g, *p; | 61 | struct task_struct *g, *p; |
| 63 | 62 | ||
| 64 | printk( "Stopping tasks: " ); | 63 | printk( "Stopping tasks: " ); |
| 65 | start_time = jiffies; | 64 | start_time = jiffies; |
| 66 | do { | 65 | do { |
| @@ -70,14 +69,12 @@ int freeze_processes(void) | |||
| 70 | unsigned long flags; | 69 | unsigned long flags; |
| 71 | if (!freezeable(p)) | 70 | if (!freezeable(p)) |
| 72 | continue; | 71 | continue; |
| 73 | if ((p->flags & PF_FROZEN) || | 72 | if ((frozen(p)) || |
| 74 | (p->state == TASK_TRACED) || | 73 | (p->state == TASK_TRACED) || |
| 75 | (p->state == TASK_STOPPED)) | 74 | (p->state == TASK_STOPPED)) |
| 76 | continue; | 75 | continue; |
| 77 | 76 | ||
| 78 | /* FIXME: smp problem here: we may not access other process' flags | 77 | freeze(p); |
| 79 | without locking */ | ||
| 80 | p->flags |= PF_FREEZE; | ||
| 81 | spin_lock_irqsave(&p->sighand->siglock, flags); | 78 | spin_lock_irqsave(&p->sighand->siglock, flags); |
| 82 | signal_wake_up(p, 0); | 79 | signal_wake_up(p, 0); |
| 83 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 80 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
| @@ -91,7 +88,7 @@ int freeze_processes(void) | |||
| 91 | return todo; | 88 | return todo; |
| 92 | } | 89 | } |
| 93 | } while(todo); | 90 | } while(todo); |
| 94 | 91 | ||
| 95 | printk( "|\n" ); | 92 | printk( "|\n" ); |
| 96 | BUG_ON(in_atomic()); | 93 | BUG_ON(in_atomic()); |
| 97 | return 0; | 94 | return 0; |
| @@ -106,10 +103,7 @@ void thaw_processes(void) | |||
| 106 | do_each_thread(g, p) { | 103 | do_each_thread(g, p) { |
| 107 | if (!freezeable(p)) | 104 | if (!freezeable(p)) |
| 108 | continue; | 105 | continue; |
| 109 | if (p->flags & PF_FROZEN) { | 106 | if (!thaw_process(p)) |
| 110 | p->flags &= ~PF_FROZEN; | ||
| 111 | wake_up_process(p); | ||
| 112 | } else | ||
| 113 | printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); | 107 | printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); |
| 114 | } while_each_thread(g, p); | 108 | } while_each_thread(g, p); |
| 115 | 109 | ||
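
The process.c hunks above stop manipulating PF_FREEZE and PF_FROZEN by hand and route everything through freeze(), frozen(), frozen_process() and thaw_process(). Those helpers are defined in a shared header that is not part of this diff; the definitions below are a plausible reconstruction from how the hunks use them, not the patch's own text.

```c
#include <linux/sched.h>

/*
 * Plausible shape of the freezer helpers, inferred from the call
 * sites above.  PF_FREEZE asks a task to enter the refrigerator;
 * PF_FROZEN marks that it has done so.
 */
#define frozen(p)	((p)->flags & PF_FROZEN)
#define freeze(p)	do { (p)->flags |= PF_FREEZE; } while (0)
#define frozen_process(p) \
	do { (p)->flags = ((p)->flags & ~PF_FREEZE) | PF_FROZEN; } while (0)

static inline int thaw_process(struct task_struct *p)
{
	if (frozen(p)) {
		p->flags &= ~PF_FROZEN;
		wake_up_process(p);
		return 1;	/* we woke it */
	}
	return 0;		/* not frozen: caller prints the warning */
}
```

Centralising the flag handling is what lets thaw_processes() collapse to a single `if (!thaw_process(p))` test in the hunk above.
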
diff --git a/kernel/power/smp.c b/kernel/power/smp.c index cba3584b80fe..bbe23079c62c 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c | |||
| @@ -13,73 +13,52 @@ | |||
| 13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
| 14 | #include <linux/suspend.h> | 14 | #include <linux/suspend.h> |
| 15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 16 | #include <linux/cpu.h> | ||
| 16 | #include <asm/atomic.h> | 17 | #include <asm/atomic.h> |
| 17 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
| 18 | 19 | ||
| 19 | static atomic_t cpu_counter, freeze; | 20 | /* This is protected by pm_sem semaphore */ |
| 20 | 21 | static cpumask_t frozen_cpus; | |
| 21 | |||
| 22 | static void smp_pause(void * data) | ||
| 23 | { | ||
| 24 | struct saved_context ctxt; | ||
| 25 | __save_processor_state(&ctxt); | ||
| 26 | printk("Sleeping in:\n"); | ||
| 27 | dump_stack(); | ||
| 28 | atomic_inc(&cpu_counter); | ||
| 29 | while (atomic_read(&freeze)) { | ||
| 30 | /* FIXME: restore takes place at random piece inside this. | ||
| 31 | This should probably be written in assembly, and | ||
| 32 | preserve general-purpose registers, too | ||
| 33 | |||
| 34 | What about stack? We may need to move to new stack here. | ||
| 35 | |||
| 36 | This should better be ran with interrupts disabled. | ||
| 37 | */ | ||
| 38 | cpu_relax(); | ||
| 39 | barrier(); | ||
| 40 | } | ||
| 41 | atomic_dec(&cpu_counter); | ||
| 42 | __restore_processor_state(&ctxt); | ||
| 43 | } | ||
| 44 | |||
| 45 | static cpumask_t oldmask; | ||
| 46 | 22 | ||
| 47 | void disable_nonboot_cpus(void) | 23 | void disable_nonboot_cpus(void) |
| 48 | { | 24 | { |
| 49 | oldmask = current->cpus_allowed; | 25 | int cpu, error; |
| 50 | set_cpus_allowed(current, cpumask_of_cpu(0)); | ||
| 51 | printk("Freezing CPUs (at %d)", _smp_processor_id()); | ||
| 52 | current->state = TASK_INTERRUPTIBLE; | ||
| 53 | schedule_timeout(HZ); | ||
| 54 | printk("..."); | ||
| 55 | BUG_ON(_smp_processor_id() != 0); | ||
| 56 | |||
| 57 | /* FIXME: for this to work, all the CPUs must be running | ||
| 58 | * "idle" thread (or we deadlock). Is that guaranteed? */ | ||
| 59 | 26 | ||
| 60 | atomic_set(&cpu_counter, 0); | 27 | error = 0; |
| 61 | atomic_set(&freeze, 1); | 28 | cpus_clear(frozen_cpus); |
| 62 | smp_call_function(smp_pause, NULL, 0, 0); | 29 | printk("Freezing cpus ...\n"); |
| 63 | while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { | 30 | for_each_online_cpu(cpu) { |
| 64 | cpu_relax(); | 31 | if (cpu == 0) |
| 65 | barrier(); | 32 | continue; |
| 33 | error = cpu_down(cpu); | ||
| 34 | if (!error) { | ||
| 35 | cpu_set(cpu, frozen_cpus); | ||
| 36 | printk("CPU%d is down\n", cpu); | ||
| 37 | continue; | ||
| 38 | } | ||
| 39 | printk("Error taking cpu %d down: %d\n", cpu, error); | ||
| 66 | } | 40 | } |
| 67 | printk("ok\n"); | 41 | BUG_ON(smp_processor_id() != 0); |
| 42 | if (error) | ||
| 43 | panic("cpus not sleeping"); | ||
| 68 | } | 44 | } |
| 69 | 45 | ||
| 70 | void enable_nonboot_cpus(void) | 46 | void enable_nonboot_cpus(void) |
| 71 | { | 47 | { |
| 72 | printk("Restarting CPUs"); | 48 | int cpu, error; |
| 73 | atomic_set(&freeze, 0); | ||
| 74 | while (atomic_read(&cpu_counter)) { | ||
| 75 | cpu_relax(); | ||
| 76 | barrier(); | ||
| 77 | } | ||
| 78 | printk("..."); | ||
| 79 | set_cpus_allowed(current, oldmask); | ||
| 80 | schedule(); | ||
| 81 | printk("ok\n"); | ||
| 82 | 49 | ||
| 50 | printk("Thawing cpus ...\n"); | ||
| 51 | for_each_cpu_mask(cpu, frozen_cpus) { | ||
| 52 | error = smp_prepare_cpu(cpu); | ||
| 53 | if (!error) | ||
| 54 | error = cpu_up(cpu); | ||
| 55 | if (!error) { | ||
| 56 | printk("CPU%d is up\n", cpu); | ||
| 57 | continue; | ||
| 58 | } | ||
| 59 | printk("Error taking cpu %d up: %d\n", cpu, error); | ||
| 60 | panic("Not enough cpus"); | ||
| 61 | } | ||
| 62 | cpus_clear(frozen_cpus); | ||
| 83 | } | 63 | } |
| 84 | 64 | ||
| 85 | |||
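
The rewritten kernel/power/smp.c above no longer parks secondary CPUs in a busy loop (the old smp_pause()); it genuinely offlines them with cpu_down() and later brings them back with smp_prepare_cpu() and cpu_up(). One practical consequence for code outside this diff is that subsystems now see ordinary CPU-hotplug notifier events across a suspend/resume cycle. A hedged sketch of such a subscriber follows; the callback body is hypothetical.

```c
#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

/* Hypothetical subscriber: reacts when suspend offlines or onlines a CPU. */
static int demo_cpu_callback(struct notifier_block *nb,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_DEAD:		/* cpu_down() finished taking 'cpu' away */
		printk(KERN_INFO "demo: cpu %u went down\n", cpu);
		break;
	case CPU_ONLINE:	/* cpu_up() brought 'cpu' back */
		printk(KERN_INFO "demo: cpu %u came back\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_cpu_notifier = {
	.notifier_call = demo_cpu_callback,
};

/* register_cpu_notifier(&demo_cpu_notifier) would be called at init. */
```
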
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 90b3b68dee3f..c285fc5a2320 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
| @@ -10,12 +10,12 @@ | |||
| 10 | * This file is released under the GPLv2. | 10 | * This file is released under the GPLv2. |
| 11 | * | 11 | * |
| 12 | * I'd like to thank the following people for their work: | 12 | * I'd like to thank the following people for their work: |
| 13 | * | 13 | * |
| 14 | * Pavel Machek <pavel@ucw.cz>: | 14 | * Pavel Machek <pavel@ucw.cz>: |
| 15 | * Modifications, defectiveness pointing, being with me at the very beginning, | 15 | * Modifications, defectiveness pointing, being with me at the very beginning, |
| 16 | * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. | 16 | * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. |
| 17 | * | 17 | * |
| 18 | * Steve Doddi <dirk@loth.demon.co.uk>: | 18 | * Steve Doddi <dirk@loth.demon.co.uk>: |
| 19 | * Support the possibility of hardware state restoring. | 19 | * Support the possibility of hardware state restoring. |
| 20 | * | 20 | * |
| 21 | * Raph <grey.havens@earthling.net>: | 21 | * Raph <grey.havens@earthling.net>: |
| @@ -81,14 +81,14 @@ static int nr_copy_pages_check; | |||
| 81 | extern char resume_file[]; | 81 | extern char resume_file[]; |
| 82 | 82 | ||
| 83 | /* Local variables that should not be affected by save */ | 83 | /* Local variables that should not be affected by save */ |
| 84 | unsigned int nr_copy_pages __nosavedata = 0; | 84 | static unsigned int nr_copy_pages __nosavedata = 0; |
| 85 | 85 | ||
| 86 | /* Suspend pagedir is allocated before final copy, therefore it | 86 | /* Suspend pagedir is allocated before final copy, therefore it |
| 87 | must be freed after resume | 87 | must be freed after resume |
| 88 | 88 | ||
| 89 | Warning: this is evil. There are actually two pagedirs at time of | 89 | Warning: this is evil. There are actually two pagedirs at time of |
| 90 | resume. One is "pagedir_save", which is empty frame allocated at | 90 | resume. One is "pagedir_save", which is empty frame allocated at |
| 91 | time of suspend, that must be freed. Second is "pagedir_nosave", | 91 | time of suspend, that must be freed. Second is "pagedir_nosave", |
| 92 | allocated at time of resume, that travels through memory not to | 92 | allocated at time of resume, that travels through memory not to |
| 93 | collide with anything. | 93 | collide with anything. |
| 94 | 94 | ||
| @@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
| 132 | { | 132 | { |
| 133 | int error; | 133 | int error; |
| 134 | 134 | ||
| 135 | rw_swap_page_sync(READ, | 135 | rw_swap_page_sync(READ, |
| 136 | swp_entry(root_swap, 0), | 136 | swp_entry(root_swap, 0), |
| 137 | virt_to_page((unsigned long)&swsusp_header)); | 137 | virt_to_page((unsigned long)&swsusp_header)); |
| 138 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | 138 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || |
| @@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
| 140 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | 140 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); |
| 141 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | 141 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); |
| 142 | swsusp_header.swsusp_info = prev; | 142 | swsusp_header.swsusp_info = prev; |
| 143 | error = rw_swap_page_sync(WRITE, | 143 | error = rw_swap_page_sync(WRITE, |
| 144 | swp_entry(root_swap, 0), | 144 | swp_entry(root_swap, 0), |
| 145 | virt_to_page((unsigned long) | 145 | virt_to_page((unsigned long) |
| 146 | &swsusp_header)); | 146 | &swsusp_header)); |
| @@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info) | |||
| 174 | static int swsusp_swap_check(void) /* This is called before saving image */ | 174 | static int swsusp_swap_check(void) /* This is called before saving image */ |
| 175 | { | 175 | { |
| 176 | int i, len; | 176 | int i, len; |
| 177 | 177 | ||
| 178 | len=strlen(resume_file); | 178 | len=strlen(resume_file); |
| 179 | root_swap = 0xFFFF; | 179 | root_swap = 0xFFFF; |
| 180 | 180 | ||
| 181 | swap_list_lock(); | 181 | swap_list_lock(); |
| 182 | for(i=0; i<MAX_SWAPFILES; i++) { | 182 | for (i=0; i<MAX_SWAPFILES; i++) { |
| 183 | if (swap_info[i].flags == 0) { | 183 | if (swap_info[i].flags == 0) { |
| 184 | swapfile_used[i]=SWAPFILE_UNUSED; | 184 | swapfile_used[i]=SWAPFILE_UNUSED; |
| 185 | } else { | 185 | } else { |
| 186 | if(!len) { | 186 | if (!len) { |
| 187 | printk(KERN_WARNING "resume= option should be used to set suspend device" ); | 187 | printk(KERN_WARNING "resume= option should be used to set suspend device" ); |
| 188 | if(root_swap == 0xFFFF) { | 188 | if (root_swap == 0xFFFF) { |
| 189 | swapfile_used[i] = SWAPFILE_SUSPEND; | 189 | swapfile_used[i] = SWAPFILE_SUSPEND; |
| 190 | root_swap = i; | 190 | root_swap = i; |
| 191 | } else | 191 | } else |
| 192 | swapfile_used[i] = SWAPFILE_IGNORED; | 192 | swapfile_used[i] = SWAPFILE_IGNORED; |
| 193 | } else { | 193 | } else { |
| 194 | /* we ignore all swap devices that are not the resume_file */ | 194 | /* we ignore all swap devices that are not the resume_file */ |
| 195 | if (is_resume_device(&swap_info[i])) { | 195 | if (is_resume_device(&swap_info[i])) { |
| @@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */ | |||
| 209 | * This is called after saving image so modification | 209 | * This is called after saving image so modification |
| 210 | * will be lost after resume... and that's what we want. | 210 | * will be lost after resume... and that's what we want. |
| 211 | * we make the device unusable. A new call to | 211 | * we make the device unusable. A new call to |
| 212 | * lock_swapdevices can unlock the devices. | 212 | * lock_swapdevices can unlock the devices. |
| 213 | */ | 213 | */ |
| 214 | static void lock_swapdevices(void) | 214 | static void lock_swapdevices(void) |
| 215 | { | 215 | { |
| 216 | int i; | 216 | int i; |
| 217 | 217 | ||
| 218 | swap_list_lock(); | 218 | swap_list_lock(); |
| 219 | for(i = 0; i< MAX_SWAPFILES; i++) | 219 | for (i = 0; i< MAX_SWAPFILES; i++) |
| 220 | if(swapfile_used[i] == SWAPFILE_IGNORED) { | 220 | if (swapfile_used[i] == SWAPFILE_IGNORED) { |
| 221 | swap_info[i].flags ^= 0xFF; | 221 | swap_info[i].flags ^= 0xFF; |
| 222 | } | 222 | } |
| 223 | swap_list_unlock(); | 223 | swap_list_unlock(); |
| @@ -229,7 +229,7 @@ static void lock_swapdevices(void) | |||
| 229 | * @loc: Place to store the entry we used. | 229 | * @loc: Place to store the entry we used. |
| 230 | * | 230 | * |
| 231 | * Allocate a new swap entry and 'sync' it. Note we discard -EIO | 231 | * Allocate a new swap entry and 'sync' it. Note we discard -EIO |
| 232 | * errors. That is an artifact left over from swsusp. It did not | 232 | * errors. That is an artifact left over from swsusp. It did not |
| 233 | * check the return of rw_swap_page_sync() at all, since most pages | 233 | * check the return of rw_swap_page_sync() at all, since most pages |
| 234 | * written back to swap would return -EIO. | 234 | * written back to swap would return -EIO. |
| 235 | * This is a partial improvement, since we will at least return other | 235 | * This is a partial improvement, since we will at least return other |
| @@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc) | |||
| 241 | int error = 0; | 241 | int error = 0; |
| 242 | 242 | ||
| 243 | entry = get_swap_page(); | 243 | entry = get_swap_page(); |
| 244 | if (swp_offset(entry) && | 244 | if (swp_offset(entry) && |
| 245 | swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { | 245 | swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { |
| 246 | error = rw_swap_page_sync(WRITE, entry, | 246 | error = rw_swap_page_sync(WRITE, entry, |
| 247 | virt_to_page(addr)); | 247 | virt_to_page(addr)); |
| @@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc) | |||
| 257 | /** | 257 | /** |
| 258 | * data_free - Free the swap entries used by the saved image. | 258 | * data_free - Free the swap entries used by the saved image. |
| 259 | * | 259 | * |
| 260 | * Walk the list of used swap entries and free each one. | 260 | * Walk the list of used swap entries and free each one. |
| 261 | * This is only used for cleanup when suspend fails. | 261 | * This is only used for cleanup when suspend fails. |
| 262 | */ | 262 | */ |
| 263 | static void data_free(void) | 263 | static void data_free(void) |
| @@ -290,7 +290,7 @@ static int data_write(void) | |||
| 290 | mod = 1; | 290 | mod = 1; |
| 291 | 291 | ||
| 292 | printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); | 292 | printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); |
| 293 | for_each_pbe(p, pagedir_nosave) { | 293 | for_each_pbe (p, pagedir_nosave) { |
| 294 | if (!(i%mod)) | 294 | if (!(i%mod)) |
| 295 | printk( "\b\b\b\b%3d%%", i / mod ); | 295 | printk( "\b\b\b\b%3d%%", i / mod ); |
| 296 | if ((error = write_page(p->address, &(p->swap_address)))) | 296 | if ((error = write_page(p->address, &(p->swap_address)))) |
| @@ -335,7 +335,7 @@ static int close_swap(void) | |||
| 335 | 335 | ||
| 336 | dump_info(); | 336 | dump_info(); |
| 337 | error = write_page((unsigned long)&swsusp_info, &entry); | 337 | error = write_page((unsigned long)&swsusp_info, &entry); |
| 338 | if (!error) { | 338 | if (!error) { |
| 339 | printk( "S" ); | 339 | printk( "S" ); |
| 340 | error = mark_swapfiles(entry); | 340 | error = mark_swapfiles(entry); |
| 341 | printk( "|\n" ); | 341 | printk( "|\n" ); |
| @@ -370,7 +370,7 @@ static int write_pagedir(void) | |||
| 370 | struct pbe * pbe; | 370 | struct pbe * pbe; |
| 371 | 371 | ||
| 372 | printk( "Writing pagedir..."); | 372 | printk( "Writing pagedir..."); |
| 373 | for_each_pb_page(pbe, pagedir_nosave) { | 373 | for_each_pb_page (pbe, pagedir_nosave) { |
| 374 | if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) | 374 | if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) |
| 375 | return error; | 375 | return error; |
| 376 | } | 376 | } |
| @@ -472,7 +472,7 @@ static int save_highmem(void) | |||
| 472 | int res = 0; | 472 | int res = 0; |
| 473 | 473 | ||
| 474 | pr_debug("swsusp: Saving Highmem\n"); | 474 | pr_debug("swsusp: Saving Highmem\n"); |
| 475 | for_each_zone(zone) { | 475 | for_each_zone (zone) { |
| 476 | if (is_highmem(zone)) | 476 | if (is_highmem(zone)) |
| 477 | res = save_highmem_zone(zone); | 477 | res = save_highmem_zone(zone); |
| 478 | if (res) | 478 | if (res) |
| @@ -547,7 +547,7 @@ static void count_data_pages(void) | |||
| 547 | 547 | ||
| 548 | nr_copy_pages = 0; | 548 | nr_copy_pages = 0; |
| 549 | 549 | ||
| 550 | for_each_zone(zone) { | 550 | for_each_zone (zone) { |
| 551 | if (is_highmem(zone)) | 551 | if (is_highmem(zone)) |
| 552 | continue; | 552 | continue; |
| 553 | mark_free_pages(zone); | 553 | mark_free_pages(zone); |
| @@ -562,9 +562,9 @@ static void copy_data_pages(void) | |||
| 562 | struct zone *zone; | 562 | struct zone *zone; |
| 563 | unsigned long zone_pfn; | 563 | unsigned long zone_pfn; |
| 564 | struct pbe * pbe = pagedir_nosave; | 564 | struct pbe * pbe = pagedir_nosave; |
| 565 | 565 | ||
| 566 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); | 566 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); |
| 567 | for_each_zone(zone) { | 567 | for_each_zone (zone) { |
| 568 | if (is_highmem(zone)) | 568 | if (is_highmem(zone)) |
| 569 | continue; | 569 | continue; |
| 570 | mark_free_pages(zone); | 570 | mark_free_pages(zone); |
| @@ -702,7 +702,7 @@ static void free_image_pages(void) | |||
| 702 | { | 702 | { |
| 703 | struct pbe * p; | 703 | struct pbe * p; |
| 704 | 704 | ||
| 705 | for_each_pbe(p, pagedir_save) { | 705 | for_each_pbe (p, pagedir_save) { |
| 706 | if (p->address) { | 706 | if (p->address) { |
| 707 | ClearPageNosave(virt_to_page(p->address)); | 707 | ClearPageNosave(virt_to_page(p->address)); |
| 708 | free_page(p->address); | 708 | free_page(p->address); |
| @@ -719,7 +719,7 @@ static int alloc_image_pages(void) | |||
| 719 | { | 719 | { |
| 720 | struct pbe * p; | 720 | struct pbe * p; |
| 721 | 721 | ||
| 722 | for_each_pbe(p, pagedir_save) { | 722 | for_each_pbe (p, pagedir_save) { |
| 723 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | 723 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); |
| 724 | if (!p->address) | 724 | if (!p->address) |
| 725 | return -ENOMEM; | 725 | return -ENOMEM; |
| @@ -740,7 +740,7 @@ void swsusp_free(void) | |||
| 740 | /** | 740 | /** |
| 741 | * enough_free_mem - Make sure we enough free memory to snapshot. | 741 | * enough_free_mem - Make sure we enough free memory to snapshot. |
| 742 | * | 742 | * |
| 743 | * Returns TRUE or FALSE after checking the number of available | 743 | * Returns TRUE or FALSE after checking the number of available |
| 744 | * free pages. | 744 | * free pages. |
| 745 | */ | 745 | */ |
| 746 | 746 | ||
| @@ -758,11 +758,11 @@ static int enough_free_mem(void) | |||
| 758 | /** | 758 | /** |
| 759 | * enough_swap - Make sure we have enough swap to save the image. | 759 | * enough_swap - Make sure we have enough swap to save the image. |
| 760 | * | 760 | * |
| 761 | * Returns TRUE or FALSE after checking the total amount of swap | 761 | * Returns TRUE or FALSE after checking the total amount of swap |
| 762 | * space avaiable. | 762 | * space avaiable. |
| 763 | * | 763 | * |
| 764 | * FIXME: si_swapinfo(&i) returns all swap devices information. | 764 | * FIXME: si_swapinfo(&i) returns all swap devices information. |
| 765 | * We should only consider resume_device. | 765 | * We should only consider resume_device. |
| 766 | */ | 766 | */ |
| 767 | 767 | ||
| 768 | static int enough_swap(void) | 768 | static int enough_swap(void) |
| @@ -781,18 +781,18 @@ static int swsusp_alloc(void) | |||
| 781 | { | 781 | { |
| 782 | int error; | 782 | int error; |
| 783 | 783 | ||
| 784 | pagedir_nosave = NULL; | ||
| 785 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
| 786 | |||
| 784 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", | 787 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", |
| 785 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); | 788 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); |
| 786 | 789 | ||
| 787 | pagedir_nosave = NULL; | ||
| 788 | if (!enough_free_mem()) | 790 | if (!enough_free_mem()) |
| 789 | return -ENOMEM; | 791 | return -ENOMEM; |
| 790 | 792 | ||
| 791 | if (!enough_swap()) | 793 | if (!enough_swap()) |
| 792 | return -ENOSPC; | 794 | return -ENOSPC; |
| 793 | 795 | ||
| 794 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
| 795 | |||
| 796 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { | 796 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { |
| 797 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | 797 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); |
| 798 | return -ENOMEM; | 798 | return -ENOMEM; |
| @@ -827,8 +827,8 @@ static int suspend_prepare_image(void) | |||
| 827 | error = swsusp_alloc(); | 827 | error = swsusp_alloc(); |
| 828 | if (error) | 828 | if (error) |
| 829 | return error; | 829 | return error; |
| 830 | 830 | ||
| 831 | /* During allocating of suspend pagedir, new cold pages may appear. | 831 | /* During allocating of suspend pagedir, new cold pages may appear. |
| 832 | * Kill them. | 832 | * Kill them. |
| 833 | */ | 833 | */ |
| 834 | drain_local_pages(); | 834 | drain_local_pages(); |
| @@ -929,21 +929,6 @@ int swsusp_resume(void) | |||
| 929 | return error; | 929 | return error; |
| 930 | } | 930 | } |
| 931 | 931 | ||
| 932 | /* More restore stuff */ | ||
| 933 | |||
| 934 | /* | ||
| 935 | * Returns true if given address/order collides with any orig_address | ||
| 936 | */ | ||
| 937 | static int does_collide_order(unsigned long addr, int order) | ||
| 938 | { | ||
| 939 | int i; | ||
| 940 | |||
| 941 | for (i=0; i < (1<<order); i++) | ||
| 942 | if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE))) | ||
| 943 | return 1; | ||
| 944 | return 0; | ||
| 945 | } | ||
| 946 | |||
| 947 | /** | 932 | /** |
| 948 | * On resume, for storing the PBE list and the image, | 933 | * On resume, for storing the PBE list and the image, |
| 949 | * we can only use memory pages that do not conflict with the pages | 934 | * we can only use memory pages that do not conflict with the pages |
| @@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask) | |||
| 973 | unsigned long m; | 958 | unsigned long m; |
| 974 | 959 | ||
| 975 | m = get_zeroed_page(gfp_mask); | 960 | m = get_zeroed_page(gfp_mask); |
| 976 | while (does_collide_order(m, 0)) { | 961 | while (!PageNosaveFree(virt_to_page(m))) { |
| 977 | eat_page((void *)m); | 962 | eat_page((void *)m); |
| 978 | m = get_zeroed_page(gfp_mask); | 963 | m = get_zeroed_page(gfp_mask); |
| 979 | if (!m) | 964 | if (!m) |
| @@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
| 1045 | 1030 | ||
| 1046 | /* Set page flags */ | 1031 | /* Set page flags */ |
| 1047 | 1032 | ||
| 1048 | for_each_zone(zone) { | 1033 | for_each_zone (zone) { |
| 1049 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 1034 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) |
| 1050 | SetPageNosaveFree(pfn_to_page(zone_pfn + | 1035 | SetPageNosaveFree(pfn_to_page(zone_pfn + |
| 1051 | zone->zone_start_pfn)); | 1036 | zone->zone_start_pfn)); |
| @@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
| 1061 | /* Relocate colliding pages */ | 1046 | /* Relocate colliding pages */ |
| 1062 | 1047 | ||
| 1063 | for_each_pb_page (pbpage, pblist) { | 1048 | for_each_pb_page (pbpage, pblist) { |
| 1064 | if (does_collide_order((unsigned long)pbpage, 0)) { | 1049 | if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { |
| 1065 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); | 1050 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); |
| 1066 | if (!m) { | 1051 | if (!m) { |
| 1067 | error = -ENOMEM; | 1052 | error = -ENOMEM; |
| @@ -1193,8 +1178,10 @@ static const char * sanity_check(void) | |||
| 1193 | return "version"; | 1178 | return "version"; |
| 1194 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | 1179 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) |
| 1195 | return "machine"; | 1180 | return "machine"; |
| 1181 | #if 0 | ||
| 1196 | if(swsusp_info.cpus != num_online_cpus()) | 1182 | if(swsusp_info.cpus != num_online_cpus()) |
| 1197 | return "number of cpus"; | 1183 | return "number of cpus"; |
| 1184 | #endif | ||
| 1198 | return NULL; | 1185 | return NULL; |
| 1199 | } | 1186 | } |
| 1200 | 1187 | ||
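
Among the swsusp.c changes above, does_collide_order() disappears because both remaining callers passed order 0, where the loop examines exactly one page; the callers now test PageNosaveFree() directly (inverted, since the old helper answered "collides"). A small equivalence sketch for that order-0 case, not taken from the patch itself:

```c
#include <linux/mm.h>
#include <linux/page-flags.h>

/*
 * Sketch: for order 0 the removed does_collide_order(addr, 0) looped
 * over 1 << 0 == 1 page, so it reduced to this direct test, which is
 * what get_usable_page() and the relocation loop now use inline.
 */
static int demo_collides(unsigned long addr)
{
	return !PageNosaveFree(virt_to_page(addr));
}
```

The remaining swsusp.c hunks are visible above: swsusp_alloc() now resets pagedir_nosave and computes nr_copy_pages before the free-memory and free-swap checks, and the CPU-count sanity test is disabled with #if 0.
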
diff --git a/kernel/printk.c b/kernel/printk.c index 01b58d7d17ff..5092397fac29 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 588 | log_level_unknown = 1; | 588 | log_level_unknown = 1; |
| 589 | } | 589 | } |
| 590 | 590 | ||
| 591 | if (!cpu_online(smp_processor_id()) && | 591 | if (!cpu_online(smp_processor_id())) { |
| 592 | system_state != SYSTEM_RUNNING) { | ||
| 593 | /* | 592 | /* |
| 594 | * Some console drivers may assume that per-cpu resources have | 593 | * Some console drivers may assume that per-cpu resources have |
| 595 | * been allocated. So don't allow them to be called by this | 594 | * been allocated. So don't allow them to be called by this |
| @@ -876,8 +875,10 @@ void register_console(struct console * console) | |||
| 876 | break; | 875 | break; |
| 877 | console->flags |= CON_ENABLED; | 876 | console->flags |= CON_ENABLED; |
| 878 | console->index = console_cmdline[i].index; | 877 | console->index = console_cmdline[i].index; |
| 879 | if (i == preferred_console) | 878 | if (i == selected_console) { |
| 880 | console->flags |= CON_CONSDEV; | 879 | console->flags |= CON_CONSDEV; |
| 880 | preferred_console = selected_console; | ||
| 881 | } | ||
| 881 | break; | 882 | break; |
| 882 | } | 883 | } |
| 883 | 884 | ||
| @@ -897,6 +898,8 @@ void register_console(struct console * console) | |||
| 897 | if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { | 898 | if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { |
| 898 | console->next = console_drivers; | 899 | console->next = console_drivers; |
| 899 | console_drivers = console; | 900 | console_drivers = console; |
| 901 | if (console->next) | ||
| 902 | console->next->flags &= ~CON_CONSDEV; | ||
| 900 | } else { | 903 | } else { |
| 901 | console->next = console_drivers->next; | 904 | console->next = console_drivers->next; |
| 902 | console_drivers->next = console; | 905 | console_drivers->next = console; |
| @@ -937,10 +940,14 @@ int unregister_console(struct console * console) | |||
| 937 | /* If last console is removed, we re-enable picking the first | 940 | /* If last console is removed, we re-enable picking the first |
| 938 | * one that gets registered. Without that, pmac early boot console | 941 | * one that gets registered. Without that, pmac early boot console |
| 939 | * would prevent fbcon from taking over. | 942 | * would prevent fbcon from taking over. |
| 943 | * | ||
| 944 | * If this isn't the last console and it has CON_CONSDEV set, we | ||
| 945 | * need to set it on the next preferred console. | ||
| 940 | */ | 946 | */ |
| 941 | if (console_drivers == NULL) | 947 | if (console_drivers == NULL) |
| 942 | preferred_console = selected_console; | 948 | preferred_console = selected_console; |
| 943 | 949 | else if (console->flags & CON_CONSDEV) | |
| 950 | console_drivers->flags |= CON_CONSDEV; | ||
| 944 | 951 | ||
| 945 | release_console_sem(); | 952 | release_console_sem(); |
| 946 | return res; | 953 | return res; |
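
The printk.c hunks above tighten two things: vprintk() now refuses to call console drivers from a CPU that is not yet online regardless of system_state, and CON_CONSDEV is kept on the right console. register_console() clears the flag on the old list head when a new preferred console is prepended, and unregister_console() hands it to the new head when a CON_CONSDEV console goes away, as the added comment explains. The helper below is purely illustrative and not in the patch; it only states the invariant those hunks work toward, namely that at most one registered console carries CON_CONSDEV and it sits at the head of the list.

```c
#include <linux/console.h>

/* Illustrative check only: CON_CONSDEV marks the console that
 * /dev/console resolves to, so it should be unique and at the head. */
static int demo_consdev_ok(struct console *head)
{
	struct console *c;
	int flagged = 0;

	for (c = head; c; c = c->next)
		if (c->flags & CON_CONSDEV)
			flagged++;

	return !head || (flagged == 1 && (head->flags & CON_CONSDEV));
}
```
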
diff --git a/kernel/resource.c b/kernel/resource.c index 52f696f11adf..26967e042201 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 263 | new->start = min; | 263 | new->start = min; |
| 264 | if (new->end > max) | 264 | if (new->end > max) |
| 265 | new->end = max; | 265 | new->end = max; |
| 266 | new->start = (new->start + align - 1) & ~(align - 1); | 266 | new->start = ALIGN(new->start, align); |
| 267 | if (alignf) | 267 | if (alignf) |
| 268 | alignf(alignf_data, new, size, align); | 268 | alignf(alignf_data, new, size, align); |
| 269 | if (new->start < new->end && new->end - new->start >= size - 1) { | 269 | if (new->start < new->end && new->end - new->start >= size - 1) { |
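
The one-line resource.c change swaps the open-coded round-up for the ALIGN() macro; for power-of-two alignments, the usual case for resource allocation, the two expressions are identical. A tiny self-contained check, using the customary definition of ALIGN():

```c
#include <assert.h>

/* The customary kernel-style definition, valid for power-of-two 'a'. */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long start = 0x1001, align = 0x1000;

	/* New form ... */
	unsigned long a = ALIGN(start, align);
	/* ... matches the open-coded form it replaces. */
	unsigned long b = (start + align - 1) & ~(align - 1);

	assert(a == b && a == 0x2000);
	return 0;
}
```
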
diff --git a/kernel/sched.c b/kernel/sched.c index 66b2ed784822..a07cff90d849 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -166,7 +166,7 @@ | |||
| 166 | #define SCALE_PRIO(x, prio) \ | 166 | #define SCALE_PRIO(x, prio) \ |
| 167 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 167 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) |
| 168 | 168 | ||
| 169 | static inline unsigned int task_timeslice(task_t *p) | 169 | static unsigned int task_timeslice(task_t *p) |
| 170 | { | 170 | { |
| 171 | if (p->static_prio < NICE_TO_PRIO(0)) | 171 | if (p->static_prio < NICE_TO_PRIO(0)) |
| 172 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 172 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); |
| @@ -206,7 +206,7 @@ struct runqueue { | |||
| 206 | */ | 206 | */ |
| 207 | unsigned long nr_running; | 207 | unsigned long nr_running; |
| 208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
| 209 | unsigned long cpu_load; | 209 | unsigned long cpu_load[3]; |
| 210 | #endif | 210 | #endif |
| 211 | unsigned long long nr_switches; | 211 | unsigned long long nr_switches; |
| 212 | 212 | ||
| @@ -260,23 +260,87 @@ struct runqueue { | |||
| 260 | 260 | ||
| 261 | static DEFINE_PER_CPU(struct runqueue, runqueues); | 261 | static DEFINE_PER_CPU(struct runqueue, runqueues); |
| 262 | 262 | ||
| 263 | /* | ||
| 264 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
| 265 | * See detach_destroy_domains: synchronize_sched for details. | ||
| 266 | * | ||
| 267 | * The domain tree of any CPU may only be accessed from within | ||
| 268 | * preempt-disabled sections. | ||
| 269 | */ | ||
| 263 | #define for_each_domain(cpu, domain) \ | 270 | #define for_each_domain(cpu, domain) \ |
| 264 | for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) | 271 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) |
| 265 | 272 | ||
| 266 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 273 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| 267 | #define this_rq() (&__get_cpu_var(runqueues)) | 274 | #define this_rq() (&__get_cpu_var(runqueues)) |
| 268 | #define task_rq(p) cpu_rq(task_cpu(p)) | 275 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 269 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 276 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 270 | 277 | ||
| 271 | /* | ||
| 272 | * Default context-switch locking: | ||
| 273 | */ | ||
| 274 | #ifndef prepare_arch_switch | 278 | #ifndef prepare_arch_switch |
| 275 | # define prepare_arch_switch(rq, next) do { } while (0) | 279 | # define prepare_arch_switch(next) do { } while (0) |
| 276 | # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) | 280 | #endif |
| 277 | # define task_running(rq, p) ((rq)->curr == (p)) | 281 | #ifndef finish_arch_switch |
| 282 | # define finish_arch_switch(prev) do { } while (0) | ||
| 278 | #endif | 283 | #endif |
| 279 | 284 | ||
| 285 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 286 | static inline int task_running(runqueue_t *rq, task_t *p) | ||
| 287 | { | ||
| 288 | return rq->curr == p; | ||
| 289 | } | ||
| 290 | |||
| 291 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | ||
| 292 | { | ||
| 293 | } | ||
| 294 | |||
| 295 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | ||
| 296 | { | ||
| 297 | spin_unlock_irq(&rq->lock); | ||
| 298 | } | ||
| 299 | |||
| 300 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 301 | static inline int task_running(runqueue_t *rq, task_t *p) | ||
| 302 | { | ||
| 303 | #ifdef CONFIG_SMP | ||
| 304 | return p->oncpu; | ||
| 305 | #else | ||
| 306 | return rq->curr == p; | ||
| 307 | #endif | ||
| 308 | } | ||
| 309 | |||
| 310 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | ||
| 311 | { | ||
| 312 | #ifdef CONFIG_SMP | ||
| 313 | /* | ||
| 314 | * We can optimise this out completely for !SMP, because the | ||
| 315 | * SMP rebalancing from interrupt is the only thing that cares | ||
| 316 | * here. | ||
| 317 | */ | ||
| 318 | next->oncpu = 1; | ||
| 319 | #endif | ||
| 320 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 321 | spin_unlock_irq(&rq->lock); | ||
| 322 | #else | ||
| 323 | spin_unlock(&rq->lock); | ||
| 324 | #endif | ||
| 325 | } | ||
| 326 | |||
| 327 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | ||
| 328 | { | ||
| 329 | #ifdef CONFIG_SMP | ||
| 330 | /* | ||
| 331 | * After ->oncpu is cleared, the task can be moved to a different CPU. | ||
| 332 | * We must ensure this doesn't happen until the switch is completely | ||
| 333 | * finished. | ||
| 334 | */ | ||
| 335 | smp_wmb(); | ||
| 336 | prev->oncpu = 0; | ||
| 337 | #endif | ||
| 338 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 339 | local_irq_enable(); | ||
| 340 | #endif | ||
| 341 | } | ||
| 342 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 343 | |||
| 280 | /* | 344 | /* |
| 281 | * task_rq_lock - lock the runqueue a given task resides on and disable | 345 | * task_rq_lock - lock the runqueue a given task resides on and disable |
| 282 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 346 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
| @@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | |||
| 309 | * bump this up when changing the output format or the meaning of an existing | 373 | * bump this up when changing the output format or the meaning of an existing |
| 310 | * format, so that tools can adapt (or abort) | 374 | * format, so that tools can adapt (or abort) |
| 311 | */ | 375 | */ |
| 312 | #define SCHEDSTAT_VERSION 11 | 376 | #define SCHEDSTAT_VERSION 12 |
| 313 | 377 | ||
| 314 | static int show_schedstat(struct seq_file *seq, void *v) | 378 | static int show_schedstat(struct seq_file *seq, void *v) |
| 315 | { | 379 | { |
| @@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 338 | 402 | ||
| 339 | #ifdef CONFIG_SMP | 403 | #ifdef CONFIG_SMP |
| 340 | /* domain-specific stats */ | 404 | /* domain-specific stats */ |
| 405 | preempt_disable(); | ||
| 341 | for_each_domain(cpu, sd) { | 406 | for_each_domain(cpu, sd) { |
| 342 | enum idle_type itype; | 407 | enum idle_type itype; |
| 343 | char mask_str[NR_CPUS]; | 408 | char mask_str[NR_CPUS]; |
| @@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 356 | sd->lb_nobusyq[itype], | 421 | sd->lb_nobusyq[itype], |
| 357 | sd->lb_nobusyg[itype]); | 422 | sd->lb_nobusyg[itype]); |
| 358 | } | 423 | } |
| 359 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", | 424 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", |
| 360 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 425 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
| 361 | sd->sbe_pushed, sd->sbe_attempts, | 426 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
| 427 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | ||
| 362 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | 428 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); |
| 363 | } | 429 | } |
| 430 | preempt_enable(); | ||
| 364 | #endif | 431 | #endif |
| 365 | } | 432 | } |
| 366 | return 0; | 433 | return 0; |
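
Two related sched.c changes meet in the hunks above: the domain tree hanging off each runqueue is now protected by RCU quiescent-state rules (see the comment added next to for_each_domain(), which points at detach_destroy_domains and synchronize_sched), so show_schedstat() must bracket its domain walk with preempt_disable()/preempt_enable(); and SCHEDSTAT_VERSION moves from 11 to 12 because the per-domain output line grows from eight to twelve trailing counters (the sbe_* and sbf_* fields). A minimal sketch of the access rule for any other reader of rq->sd, as it would look inside kernel/sched.c; the function name is hypothetical.

```c
#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Sketch: walking a CPU's sched-domain tree is only safe inside a
 * preempt-disabled section, because domain teardown waits for a
 * synchronize_sched() grace period before freeing the domains.
 */
static void demo_walk_domains(int cpu)
{
	struct sched_domain *sd;

	preempt_disable();
	for_each_domain(cpu, sd) {
		/* read-only inspection of sd->flags, sd->span, ... */
		;
	}
	preempt_enable();
}
```
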
| @@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void) | |||
| 414 | return rq; | 481 | return rq; |
| 415 | } | 482 | } |
| 416 | 483 | ||
| 417 | #ifdef CONFIG_SCHED_SMT | ||
| 418 | static int cpu_and_siblings_are_idle(int cpu) | ||
| 419 | { | ||
| 420 | int sib; | ||
| 421 | for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { | ||
| 422 | if (idle_cpu(sib)) | ||
| 423 | continue; | ||
| 424 | return 0; | ||
| 425 | } | ||
| 426 | |||
| 427 | return 1; | ||
| 428 | } | ||
| 429 | #else | ||
| 430 | #define cpu_and_siblings_are_idle(A) idle_cpu(A) | ||
| 431 | #endif | ||
| 432 | |||
| 433 | #ifdef CONFIG_SCHEDSTATS | 484 | #ifdef CONFIG_SCHEDSTATS |
| 434 | /* | 485 | /* |
| 435 | * Called when a process is dequeued from the active array and given | 486 | * Called when a process is dequeued from the active array and given |
| @@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | |||
| 622 | rq->nr_running++; | 673 | rq->nr_running++; |
| 623 | } | 674 | } |
| 624 | 675 | ||
| 625 | static void recalc_task_prio(task_t *p, unsigned long long now) | 676 | static int recalc_task_prio(task_t *p, unsigned long long now) |
| 626 | { | 677 | { |
| 627 | /* Caller must always ensure 'now >= p->timestamp' */ | 678 | /* Caller must always ensure 'now >= p->timestamp' */ |
| 628 | unsigned long long __sleep_time = now - p->timestamp; | 679 | unsigned long long __sleep_time = now - p->timestamp; |
| @@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) | |||
| 681 | } | 732 | } |
| 682 | } | 733 | } |
| 683 | 734 | ||
| 684 | p->prio = effective_prio(p); | 735 | return effective_prio(p); |
| 685 | } | 736 | } |
| 686 | 737 | ||
| 687 | /* | 738 | /* |
| @@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
| 704 | } | 755 | } |
| 705 | #endif | 756 | #endif |
| 706 | 757 | ||
| 707 | recalc_task_prio(p, now); | 758 | p->prio = recalc_task_prio(p, now); |
| 708 | 759 | ||
| 709 | /* | 760 | /* |
| 710 | * This checks to make sure it's not an uninterruptible task | 761 | * This checks to make sure it's not an uninterruptible task |
| @@ -782,22 +833,12 @@ inline int task_curr(const task_t *p) | |||
| 782 | } | 833 | } |
| 783 | 834 | ||
| 784 | #ifdef CONFIG_SMP | 835 | #ifdef CONFIG_SMP |
| 785 | enum request_type { | ||
| 786 | REQ_MOVE_TASK, | ||
| 787 | REQ_SET_DOMAIN, | ||
| 788 | }; | ||
| 789 | |||
| 790 | typedef struct { | 836 | typedef struct { |
| 791 | struct list_head list; | 837 | struct list_head list; |
| 792 | enum request_type type; | ||
| 793 | 838 | ||
| 794 | /* For REQ_MOVE_TASK */ | ||
| 795 | task_t *task; | 839 | task_t *task; |
| 796 | int dest_cpu; | 840 | int dest_cpu; |
| 797 | 841 | ||
| 798 | /* For REQ_SET_DOMAIN */ | ||
| 799 | struct sched_domain *sd; | ||
| 800 | |||
| 801 | struct completion done; | 842 | struct completion done; |
| 802 | } migration_req_t; | 843 | } migration_req_t; |
| 803 | 844 | ||
| @@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
| 819 | } | 860 | } |
| 820 | 861 | ||
| 821 | init_completion(&req->done); | 862 | init_completion(&req->done); |
| 822 | req->type = REQ_MOVE_TASK; | ||
| 823 | req->task = p; | 863 | req->task = p; |
| 824 | req->dest_cpu = dest_cpu; | 864 | req->dest_cpu = dest_cpu; |
| 825 | list_add(&req->list, &rq->migration_queue); | 865 | list_add(&req->list, &rq->migration_queue); |
| @@ -886,26 +926,154 @@ void kick_process(task_t *p) | |||
| 886 | * We want to under-estimate the load of migration sources, to | 926 | * We want to under-estimate the load of migration sources, to |
| 887 | * balance conservatively. | 927 | * balance conservatively. |
| 888 | */ | 928 | */ |
| 889 | static inline unsigned long source_load(int cpu) | 929 | static inline unsigned long source_load(int cpu, int type) |
| 890 | { | 930 | { |
| 891 | runqueue_t *rq = cpu_rq(cpu); | 931 | runqueue_t *rq = cpu_rq(cpu); |
| 892 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 932 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
| 933 | if (type == 0) | ||
| 934 | return load_now; | ||
| 893 | 935 | ||
| 894 | return min(rq->cpu_load, load_now); | 936 | return min(rq->cpu_load[type-1], load_now); |
| 895 | } | 937 | } |
| 896 | 938 | ||
| 897 | /* | 939 | /* |
| 898 | * Return a high guess at the load of a migration-target cpu | 940 | * Return a high guess at the load of a migration-target cpu |
| 899 | */ | 941 | */ |
| 900 | static inline unsigned long target_load(int cpu) | 942 | static inline unsigned long target_load(int cpu, int type) |
| 901 | { | 943 | { |
| 902 | runqueue_t *rq = cpu_rq(cpu); | 944 | runqueue_t *rq = cpu_rq(cpu); |
| 903 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 945 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
| 946 | if (type == 0) | ||
| 947 | return load_now; | ||
| 904 | 948 | ||
| 905 | return max(rq->cpu_load, load_now); | 949 | return max(rq->cpu_load[type-1], load_now); |
| 906 | } | 950 | } |
| 907 | 951 | ||
| 908 | #endif | 952 | /* |
| 953 | * find_idlest_group finds and returns the least busy CPU group within the | ||
| 954 | * domain. | ||
| 955 | */ | ||
| 956 | static struct sched_group * | ||
| 957 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||
| 958 | { | ||
| 959 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | ||
| 960 | unsigned long min_load = ULONG_MAX, this_load = 0; | ||
| 961 | int load_idx = sd->forkexec_idx; | ||
| 962 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | ||
| 963 | |||
| 964 | do { | ||
| 965 | unsigned long load, avg_load; | ||
| 966 | int local_group; | ||
| 967 | int i; | ||
| 968 | |||
| 969 | local_group = cpu_isset(this_cpu, group->cpumask); | ||
| 970 | /* XXX: put a cpus allowed check */ | ||
| 971 | |||
| 972 | /* Tally up the load of all CPUs in the group */ | ||
| 973 | avg_load = 0; | ||
| 974 | |||
| 975 | for_each_cpu_mask(i, group->cpumask) { | ||
| 976 | /* Bias balancing toward cpus of our domain */ | ||
| 977 | if (local_group) | ||
| 978 | load = source_load(i, load_idx); | ||
| 979 | else | ||
| 980 | load = target_load(i, load_idx); | ||
| 981 | |||
| 982 | avg_load += load; | ||
| 983 | } | ||
| 984 | |||
| 985 | /* Adjust by relative CPU power of the group */ | ||
| 986 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
| 987 | |||
| 988 | if (local_group) { | ||
| 989 | this_load = avg_load; | ||
| 990 | this = group; | ||
| 991 | } else if (avg_load < min_load) { | ||
| 992 | min_load = avg_load; | ||
| 993 | idlest = group; | ||
| 994 | } | ||
| 995 | group = group->next; | ||
| 996 | } while (group != sd->groups); | ||
| 997 | |||
| 998 | if (!idlest || 100*this_load < imbalance*min_load) | ||
| 999 | return NULL; | ||
| 1000 | return idlest; | ||
| 1001 | } | ||
| 1002 | |||
| 1003 | /* | ||
| 1004 | * find_idlest_queue - find the idlest runqueue among the cpus in group. | ||
| 1005 | */ | ||
| 1006 | static int find_idlest_cpu(struct sched_group *group, int this_cpu) | ||
| 1007 | { | ||
| 1008 | unsigned long load, min_load = ULONG_MAX; | ||
| 1009 | int idlest = -1; | ||
| 1010 | int i; | ||
| 1011 | |||
| 1012 | for_each_cpu_mask(i, group->cpumask) { | ||
| 1013 | load = source_load(i, 0); | ||
| 1014 | |||
| 1015 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
| 1016 | min_load = load; | ||
| 1017 | idlest = i; | ||
| 1018 | } | ||
| 1019 | } | ||
| 1020 | |||
| 1021 | return idlest; | ||
| 1022 | } | ||
| 1023 | |||
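A small model of the tie-break in the new find_idlest_cpu(): when several CPUs in the chosen group report the same minimum load, the caller's own CPU wins so the task stays local. The load snapshot below is invented for the example.

```c
#include <stdio.h>
#include <limits.h>

static int find_idlest_cpu_model(const unsigned long *load, int ncpus, int this_cpu)
{
	unsigned long min_load = ULONG_MAX;
	int idlest = -1, i;

	for (i = 0; i < ncpus; i++) {
		/* Prefer this_cpu on a tie, otherwise strictly lower load. */
		if (load[i] < min_load ||
		    (load[i] == min_load && i == this_cpu)) {
			min_load = load[i];
			idlest = i;
		}
	}
	return idlest;
}

int main(void)
{
	unsigned long load[] = { 128, 0, 0, 256 };

	/* CPUs 1 and 2 tie at zero load; calling from CPU 2 keeps us there. */
	printf("%d\n", find_idlest_cpu_model(load, 4, 2));	/* 2 */
	printf("%d\n", find_idlest_cpu_model(load, 4, 3));	/* 1 */
	return 0;
}
```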
| 1024 | /* | ||
| 1025 | * sched_balance_self: balance the current task (running on cpu) in domains | ||
| 1026 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | ||
| 1027 | * SD_BALANCE_EXEC. | ||
| 1028 | * | ||
| 1029 | * Balance, ie. select the least loaded group. | ||
| 1030 | * | ||
| 1031 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
| 1032 | * | ||
| 1033 | * preempt must be disabled. | ||
| 1034 | */ | ||
| 1035 | static int sched_balance_self(int cpu, int flag) | ||
| 1036 | { | ||
| 1037 | struct task_struct *t = current; | ||
| 1038 | struct sched_domain *tmp, *sd = NULL; | ||
| 1039 | |||
| 1040 | for_each_domain(cpu, tmp) | ||
| 1041 | if (tmp->flags & flag) | ||
| 1042 | sd = tmp; | ||
| 1043 | |||
| 1044 | while (sd) { | ||
| 1045 | cpumask_t span; | ||
| 1046 | struct sched_group *group; | ||
| 1047 | int new_cpu; | ||
| 1048 | int weight; | ||
| 1049 | |||
| 1050 | span = sd->span; | ||
| 1051 | group = find_idlest_group(sd, t, cpu); | ||
| 1052 | if (!group) | ||
| 1053 | goto nextlevel; | ||
| 1054 | |||
| 1055 | new_cpu = find_idlest_cpu(group, cpu); | ||
| 1056 | if (new_cpu == -1 || new_cpu == cpu) | ||
| 1057 | goto nextlevel; | ||
| 1058 | |||
| 1059 | /* Now try balancing at a lower domain level */ | ||
| 1060 | cpu = new_cpu; | ||
| 1061 | nextlevel: | ||
| 1062 | sd = NULL; | ||
| 1063 | weight = cpus_weight(span); | ||
| 1064 | for_each_domain(cpu, tmp) { | ||
| 1065 | if (weight <= cpus_weight(tmp->span)) | ||
| 1066 | break; | ||
| 1067 | if (tmp->flags & flag) | ||
| 1068 | sd = tmp; | ||
| 1069 | } | ||
| 1070 | /* while loop will break here if sd == NULL */ | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | return cpu; | ||
| 1074 | } | ||
| 1075 | |||
| 1076 | #endif /* CONFIG_SMP */ | ||
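A toy model (not kernel code) of the top-down walk sched_balance_self() performs for fork/exec balancing: start at the widest domain carrying the requested flag, pick the least loaded CPU it covers, then repeat in progressively narrower domains around the chosen CPU. The two-level topology, flag value and load snapshot below are all invented; the real code selects an idlest group before an idlest CPU.

```c
#include <stdio.h>

#define NCPUS 4
#define FLAG_BALANCE 0x1

struct domain {
	unsigned int span;	/* bitmask of CPUs covered */
	int flags;
};

/* Per-CPU domain stacks, index 0 = smallest span (e.g. node), 1 = whole box. */
static const struct domain domains[NCPUS][2] = {
	[0] = { { 0x3, FLAG_BALANCE }, { 0xf, FLAG_BALANCE } },
	[1] = { { 0x3, FLAG_BALANCE }, { 0xf, FLAG_BALANCE } },
	[2] = { { 0xc, FLAG_BALANCE }, { 0xf, FLAG_BALANCE } },
	[3] = { { 0xc, FLAG_BALANCE }, { 0xf, FLAG_BALANCE } },
};

/* Pretend per-CPU load; CPU 3 is the idlest in this made-up snapshot. */
static const int load[NCPUS] = { 5, 4, 2, 1 };

static int idlest_in(unsigned int span, int cur)
{
	int i, best = cur;

	for (i = 0; i < NCPUS; i++)
		if ((span & (1u << i)) && load[i] < load[best])
			best = i;
	return best;
}

int main(void)
{
	int cpu = 0, level;

	/* Walk from the widest flagged domain down to the narrowest. */
	for (level = 1; level >= 0; level--) {
		if (!(domains[cpu][level].flags & FLAG_BALANCE))
			continue;
		cpu = idlest_in(domains[cpu][level].span, cpu);
	}
	printf("balanced onto CPU %d\n", cpu);	/* 3 with the sample loads */
	return 0;
}
```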
| 909 | 1077 | ||
| 910 | /* | 1078 | /* |
| 911 | * wake_idle() will wake a task on an idle cpu if task->cpu is | 1079 | * wake_idle() will wake a task on an idle cpu if task->cpu is |
| @@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p) | |||
| 927 | 1095 | ||
| 928 | for_each_domain(cpu, sd) { | 1096 | for_each_domain(cpu, sd) { |
| 929 | if (sd->flags & SD_WAKE_IDLE) { | 1097 | if (sd->flags & SD_WAKE_IDLE) { |
| 930 | cpus_and(tmp, sd->span, cpu_online_map); | 1098 | cpus_and(tmp, sd->span, p->cpus_allowed); |
| 931 | cpus_and(tmp, tmp, p->cpus_allowed); | ||
| 932 | for_each_cpu_mask(i, tmp) { | 1099 | for_each_cpu_mask(i, tmp) { |
| 933 | if (idle_cpu(i)) | 1100 | if (idle_cpu(i)) |
| 934 | return i; | 1101 | return i; |
| 935 | } | 1102 | } |
| 936 | } | 1103 | } |
| 937 | else break; | 1104 | else |
| 1105 | break; | ||
| 938 | } | 1106 | } |
| 939 | return cpu; | 1107 | return cpu; |
| 940 | } | 1108 | } |
| @@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) | |||
| 967 | runqueue_t *rq; | 1135 | runqueue_t *rq; |
| 968 | #ifdef CONFIG_SMP | 1136 | #ifdef CONFIG_SMP |
| 969 | unsigned long load, this_load; | 1137 | unsigned long load, this_load; |
| 970 | struct sched_domain *sd; | 1138 | struct sched_domain *sd, *this_sd = NULL; |
| 971 | int new_cpu; | 1139 | int new_cpu; |
| 972 | #endif | 1140 | #endif |
| 973 | 1141 | ||
| @@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) | |||
| 986 | if (unlikely(task_running(rq, p))) | 1154 | if (unlikely(task_running(rq, p))) |
| 987 | goto out_activate; | 1155 | goto out_activate; |
| 988 | 1156 | ||
| 989 | #ifdef CONFIG_SCHEDSTATS | 1157 | new_cpu = cpu; |
| 1158 | |||
| 990 | schedstat_inc(rq, ttwu_cnt); | 1159 | schedstat_inc(rq, ttwu_cnt); |
| 991 | if (cpu == this_cpu) { | 1160 | if (cpu == this_cpu) { |
| 992 | schedstat_inc(rq, ttwu_local); | 1161 | schedstat_inc(rq, ttwu_local); |
| 993 | } else { | 1162 | goto out_set_cpu; |
| 994 | for_each_domain(this_cpu, sd) { | 1163 | } |
| 995 | if (cpu_isset(cpu, sd->span)) { | 1164 | |
| 996 | schedstat_inc(sd, ttwu_wake_remote); | 1165 | for_each_domain(this_cpu, sd) { |
| 997 | break; | 1166 | if (cpu_isset(cpu, sd->span)) { |
| 998 | } | 1167 | schedstat_inc(sd, ttwu_wake_remote); |
| 1168 | this_sd = sd; | ||
| 1169 | break; | ||
| 999 | } | 1170 | } |
| 1000 | } | 1171 | } |
| 1001 | #endif | ||
| 1002 | 1172 | ||
| 1003 | new_cpu = cpu; | 1173 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
| 1004 | if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
| 1005 | goto out_set_cpu; | 1174 | goto out_set_cpu; |
| 1006 | 1175 | ||
| 1007 | load = source_load(cpu); | ||
| 1008 | this_load = target_load(this_cpu); | ||
| 1009 | |||
| 1010 | /* | 1176 | /* |
| 1011 | * If sync wakeup then subtract the (maximum possible) effect of | 1177 | * Check for affine wakeup and passive balancing possibilities. |
| 1012 | * the currently running task from the load of the current CPU: | ||
| 1013 | */ | 1178 | */ |
| 1014 | if (sync) | 1179 | if (this_sd) { |
| 1015 | this_load -= SCHED_LOAD_SCALE; | 1180 | int idx = this_sd->wake_idx; |
| 1181 | unsigned int imbalance; | ||
| 1016 | 1182 | ||
| 1017 | /* Don't pull the task off an idle CPU to a busy one */ | 1183 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; |
| 1018 | if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) | ||
| 1019 | goto out_set_cpu; | ||
| 1020 | 1184 | ||
| 1021 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | 1185 | load = source_load(cpu, idx); |
| 1186 | this_load = target_load(this_cpu, idx); | ||
| 1022 | 1187 | ||
| 1023 | /* | 1188 | new_cpu = this_cpu; /* Wake to this CPU if we can */ |
| 1024 | * Scan domains for affine wakeup and passive balancing | ||
| 1025 | * possibilities. | ||
| 1026 | */ | ||
| 1027 | for_each_domain(this_cpu, sd) { | ||
| 1028 | unsigned int imbalance; | ||
| 1029 | /* | ||
| 1030 | * Start passive balancing when half the imbalance_pct | ||
| 1031 | * limit is reached. | ||
| 1032 | */ | ||
| 1033 | imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; | ||
| 1034 | 1189 | ||
| 1035 | if ((sd->flags & SD_WAKE_AFFINE) && | 1190 | if (this_sd->flags & SD_WAKE_AFFINE) { |
| 1036 | !task_hot(p, rq->timestamp_last_tick, sd)) { | 1191 | unsigned long tl = this_load; |
| 1037 | /* | 1192 | /* |
| 1038 | * This domain has SD_WAKE_AFFINE and p is cache cold | 1193 | * If sync wakeup then subtract the (maximum possible) |
| 1039 | * in this domain. | 1194 | * effect of the currently running task from the load |
| 1195 | * of the current CPU: | ||
| 1040 | */ | 1196 | */ |
| 1041 | if (cpu_isset(cpu, sd->span)) { | 1197 | if (sync) |
| 1042 | schedstat_inc(sd, ttwu_move_affine); | 1198 | tl -= SCHED_LOAD_SCALE; |
| 1199 | |||
| 1200 | if ((tl <= load && | ||
| 1201 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | ||
| 1202 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | ||
| 1203 | /* | ||
| 1204 | * This domain has SD_WAKE_AFFINE and | ||
| 1205 | * p is cache cold in this domain, and | ||
| 1206 | * there is no bad imbalance. | ||
| 1207 | */ | ||
| 1208 | schedstat_inc(this_sd, ttwu_move_affine); | ||
| 1043 | goto out_set_cpu; | 1209 | goto out_set_cpu; |
| 1044 | } | 1210 | } |
| 1045 | } else if ((sd->flags & SD_WAKE_BALANCE) && | 1211 | } |
| 1046 | imbalance*this_load <= 100*load) { | 1212 | |
| 1047 | /* | 1213 | /* |
| 1048 | * This domain has SD_WAKE_BALANCE and there is | 1214 | * Start passive balancing when half the imbalance_pct |
| 1049 | * an imbalance. | 1215 | * limit is reached. |
| 1050 | */ | 1216 | */ |
| 1051 | if (cpu_isset(cpu, sd->span)) { | 1217 | if (this_sd->flags & SD_WAKE_BALANCE) { |
| 1052 | schedstat_inc(sd, ttwu_move_balance); | 1218 | if (imbalance*this_load <= 100*load) { |
| 1219 | schedstat_inc(this_sd, ttwu_move_balance); | ||
| 1053 | goto out_set_cpu; | 1220 | goto out_set_cpu; |
| 1054 | } | 1221 | } |
| 1055 | } | 1222 | } |
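The arithmetic of the reworked affine-wakeup test is easy to miss in the diff, so here is a numeric illustration (sample values only): wake on this CPU when doing so leaves no bad imbalance, either because both CPUs are nearly idle or because the waking CPU is much lighter than the task's previous CPU.

```c
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static int wake_affine_ok(unsigned long tl, unsigned long load,
			  unsigned long remote_target, unsigned int imbalance)
{
	/* tl is this CPU's load, already reduced by SCHED_LOAD_SCALE if sync. */
	return (tl <= load && tl + remote_target <= SCHED_LOAD_SCALE) ||
	       100 * (tl + SCHED_LOAD_SCALE) <= imbalance * load;
}

int main(void)
{
	unsigned int imbalance = 100 + (125 - 100) / 2;	/* 112 with pct = 125 */

	/* Both CPUs nearly idle: pull the task over. */
	printf("%d\n", wake_affine_ok(0, 64, 64, imbalance));		/* 1 */
	/* This CPU busy and the previous CPU not much busier: leave it. */
	printf("%d\n", wake_affine_ok(256, 320, 320, imbalance));	/* 0 */
	return 0;
}
```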
| @@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state) | |||
| 1120 | return try_to_wake_up(p, state, 0); | 1287 | return try_to_wake_up(p, state, 0); |
| 1121 | } | 1288 | } |
| 1122 | 1289 | ||
| 1123 | #ifdef CONFIG_SMP | ||
| 1124 | static int find_idlest_cpu(struct task_struct *p, int this_cpu, | ||
| 1125 | struct sched_domain *sd); | ||
| 1126 | #endif | ||
| 1127 | |||
| 1128 | /* | 1290 | /* |
| 1129 | * Perform scheduler related setup for a newly forked process p. | 1291 | * Perform scheduler related setup for a newly forked process p. |
| 1130 | * p is forked by current. | 1292 | * p is forked by current. |
| 1131 | */ | 1293 | */ |
| 1132 | void fastcall sched_fork(task_t *p) | 1294 | void fastcall sched_fork(task_t *p, int clone_flags) |
| 1133 | { | 1295 | { |
| 1296 | int cpu = get_cpu(); | ||
| 1297 | |||
| 1298 | #ifdef CONFIG_SMP | ||
| 1299 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
| 1300 | #endif | ||
| 1301 | set_task_cpu(p, cpu); | ||
| 1302 | |||
| 1134 | /* | 1303 | /* |
| 1135 | * We mark the process as running here, but have not actually | 1304 | * We mark the process as running here, but have not actually |
| 1136 | * inserted it onto the runqueue yet. This guarantees that | 1305 | * inserted it onto the runqueue yet. This guarantees that |
| @@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p) | |||
| 1140 | p->state = TASK_RUNNING; | 1309 | p->state = TASK_RUNNING; |
| 1141 | INIT_LIST_HEAD(&p->run_list); | 1310 | INIT_LIST_HEAD(&p->run_list); |
| 1142 | p->array = NULL; | 1311 | p->array = NULL; |
| 1143 | spin_lock_init(&p->switch_lock); | ||
| 1144 | #ifdef CONFIG_SCHEDSTATS | 1312 | #ifdef CONFIG_SCHEDSTATS |
| 1145 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1313 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
| 1146 | #endif | 1314 | #endif |
| 1315 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | ||
| 1316 | p->oncpu = 0; | ||
| 1317 | #endif | ||
| 1147 | #ifdef CONFIG_PREEMPT | 1318 | #ifdef CONFIG_PREEMPT |
| 1148 | /* | 1319 | /* Want to start with kernel preemption disabled. */ |
| 1149 | * During context-switch we hold precisely one spinlock, which | ||
| 1150 | * schedule_tail drops. (in the common case it's this_rq()->lock, | ||
| 1151 | * but it also can be p->switch_lock.) So we compensate with a count | ||
| 1152 | * of 1. Also, we want to start with kernel preemption disabled. | ||
| 1153 | */ | ||
| 1154 | p->thread_info->preempt_count = 1; | 1320 | p->thread_info->preempt_count = 1; |
| 1155 | #endif | 1321 | #endif |
| 1156 | /* | 1322 | /* |
| @@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p) | |||
| 1174 | * runqueue lock is not a problem. | 1340 | * runqueue lock is not a problem. |
| 1175 | */ | 1341 | */ |
| 1176 | current->time_slice = 1; | 1342 | current->time_slice = 1; |
| 1177 | preempt_disable(); | ||
| 1178 | scheduler_tick(); | 1343 | scheduler_tick(); |
| 1179 | local_irq_enable(); | 1344 | } |
| 1180 | preempt_enable(); | 1345 | local_irq_enable(); |
| 1181 | } else | 1346 | put_cpu(); |
| 1182 | local_irq_enable(); | ||
| 1183 | } | 1347 | } |
| 1184 | 1348 | ||
| 1185 | /* | 1349 | /* |
| @@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) | |||
| 1196 | runqueue_t *rq, *this_rq; | 1360 | runqueue_t *rq, *this_rq; |
| 1197 | 1361 | ||
| 1198 | rq = task_rq_lock(p, &flags); | 1362 | rq = task_rq_lock(p, &flags); |
| 1199 | cpu = task_cpu(p); | ||
| 1200 | this_cpu = smp_processor_id(); | ||
| 1201 | |||
| 1202 | BUG_ON(p->state != TASK_RUNNING); | 1363 | BUG_ON(p->state != TASK_RUNNING); |
| 1364 | this_cpu = smp_processor_id(); | ||
| 1365 | cpu = task_cpu(p); | ||
| 1203 | 1366 | ||
| 1204 | /* | 1367 | /* |
| 1205 | * We decrease the sleep average of forking parents | 1368 | * We decrease the sleep average of forking parents |
| @@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p) | |||
| 1296 | } | 1459 | } |
| 1297 | 1460 | ||
| 1298 | /** | 1461 | /** |
| 1462 | * prepare_task_switch - prepare to switch tasks | ||
| 1463 | * @rq: the runqueue preparing to switch | ||
| 1464 | * @next: the task we are going to switch to. | ||
| 1465 | * | ||
| 1466 | * This is called with the rq lock held and interrupts off. It must | ||
| 1467 | * be paired with a subsequent finish_task_switch after the context | ||
| 1468 | * switch. | ||
| 1469 | * | ||
| 1470 | * prepare_task_switch sets up locking and calls architecture specific | ||
| 1471 | * hooks. | ||
| 1472 | */ | ||
| 1473 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | ||
| 1474 | { | ||
| 1475 | prepare_lock_switch(rq, next); | ||
| 1476 | prepare_arch_switch(next); | ||
| 1477 | } | ||
| 1478 | |||
| 1479 | /** | ||
| 1299 | * finish_task_switch - clean up after a task-switch | 1480 | * finish_task_switch - clean up after a task-switch |
| 1300 | * @prev: the thread we just switched away from. | 1481 | * @prev: the thread we just switched away from. |
| 1301 | * | 1482 | * |
| 1302 | * We enter this with the runqueue still locked, and finish_arch_switch() | 1483 | * finish_task_switch must be called after the context switch, paired |
| 1303 | * will unlock it along with doing any other architecture-specific cleanup | 1484 | * with a prepare_task_switch call before the context switch. |
| 1304 | * actions. | 1485 | * finish_task_switch will reconcile locking set up by prepare_task_switch, |
| 1486 | * and do any other architecture-specific cleanup actions. | ||
| 1305 | * | 1487 | * |
| 1306 | * Note that we may have delayed dropping an mm in context_switch(). If | 1488 | * Note that we may have delayed dropping an mm in context_switch(). If |
| 1307 | * so, we finish that here outside of the runqueue lock. (Doing it | 1489 | * so, we finish that here outside of the runqueue lock. (Doing it |
| 1308 | * with the lock held can cause deadlocks; see schedule() for | 1490 | * with the lock held can cause deadlocks; see schedule() for |
| 1309 | * details.) | 1491 | * details.) |
| 1310 | */ | 1492 | */ |
| 1311 | static inline void finish_task_switch(task_t *prev) | 1493 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) |
| 1312 | __releases(rq->lock) | 1494 | __releases(rq->lock) |
| 1313 | { | 1495 | { |
| 1314 | runqueue_t *rq = this_rq(); | ||
| 1315 | struct mm_struct *mm = rq->prev_mm; | 1496 | struct mm_struct *mm = rq->prev_mm; |
| 1316 | unsigned long prev_task_flags; | 1497 | unsigned long prev_task_flags; |
| 1317 | 1498 | ||
| @@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev) | |||
| 1329 | * Manfred Spraul <manfred@colorfullife.com> | 1510 | * Manfred Spraul <manfred@colorfullife.com> |
| 1330 | */ | 1511 | */ |
| 1331 | prev_task_flags = prev->flags; | 1512 | prev_task_flags = prev->flags; |
| 1332 | finish_arch_switch(rq, prev); | 1513 | finish_arch_switch(prev); |
| 1514 | finish_lock_switch(rq, prev); | ||
| 1333 | if (mm) | 1515 | if (mm) |
| 1334 | mmdrop(mm); | 1516 | mmdrop(mm); |
| 1335 | if (unlikely(prev_task_flags & PF_DEAD)) | 1517 | if (unlikely(prev_task_flags & PF_DEAD)) |
| @@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev) | |||
| 1343 | asmlinkage void schedule_tail(task_t *prev) | 1525 | asmlinkage void schedule_tail(task_t *prev) |
| 1344 | __releases(rq->lock) | 1526 | __releases(rq->lock) |
| 1345 | { | 1527 | { |
| 1346 | finish_task_switch(prev); | 1528 | runqueue_t *rq = this_rq(); |
| 1347 | 1529 | finish_task_switch(rq, prev); | |
| 1530 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 1531 | /* In this case, finish_task_switch does not reenable preemption */ | ||
| 1532 | preempt_enable(); | ||
| 1533 | #endif | ||
| 1348 | if (current->set_child_tid) | 1534 | if (current->set_child_tid) |
| 1349 | put_user(current->pid, current->set_child_tid); | 1535 | put_user(current->pid, current->set_child_tid); |
| 1350 | } | 1536 | } |
| @@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
| 1494 | } | 1680 | } |
| 1495 | 1681 | ||
| 1496 | /* | 1682 | /* |
| 1497 | * find_idlest_cpu - find the least busy runqueue. | ||
| 1498 | */ | ||
| 1499 | static int find_idlest_cpu(struct task_struct *p, int this_cpu, | ||
| 1500 | struct sched_domain *sd) | ||
| 1501 | { | ||
| 1502 | unsigned long load, min_load, this_load; | ||
| 1503 | int i, min_cpu; | ||
| 1504 | cpumask_t mask; | ||
| 1505 | |||
| 1506 | min_cpu = UINT_MAX; | ||
| 1507 | min_load = ULONG_MAX; | ||
| 1508 | |||
| 1509 | cpus_and(mask, sd->span, p->cpus_allowed); | ||
| 1510 | |||
| 1511 | for_each_cpu_mask(i, mask) { | ||
| 1512 | load = target_load(i); | ||
| 1513 | |||
| 1514 | if (load < min_load) { | ||
| 1515 | min_cpu = i; | ||
| 1516 | min_load = load; | ||
| 1517 | |||
| 1518 | /* break out early on an idle CPU: */ | ||
| 1519 | if (!min_load) | ||
| 1520 | break; | ||
| 1521 | } | ||
| 1522 | } | ||
| 1523 | |||
| 1524 | /* add +1 to account for the new task */ | ||
| 1525 | this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; | ||
| 1526 | |||
| 1527 | /* | ||
| 1528 | * Would with the addition of the new task to the | ||
| 1529 | * current CPU there be an imbalance between this | ||
| 1530 | * CPU and the idlest CPU? | ||
| 1531 | * | ||
| 1532 | * Use half of the balancing threshold - new-context is | ||
| 1533 | * a good opportunity to balance. | ||
| 1534 | */ | ||
| 1535 | if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) | ||
| 1536 | return min_cpu; | ||
| 1537 | |||
| 1538 | return this_cpu; | ||
| 1539 | } | ||
| 1540 | |||
| 1541 | /* | ||
| 1542 | * If dest_cpu is allowed for this process, migrate the task to it. | 1683 | * If dest_cpu is allowed for this process, migrate the task to it. |
| 1543 | * This is accomplished by forcing the cpu_allowed mask to only | 1684 | * This is accomplished by forcing the cpu_allowed mask to only |
| 1544 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 1685 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
| @@ -1571,37 +1712,16 @@ out: | |||
| 1571 | } | 1712 | } |
| 1572 | 1713 | ||
| 1573 | /* | 1714 | /* |
| 1574 | * sched_exec(): find the highest-level, exec-balance-capable | 1715 | * sched_exec - execve() is a valuable balancing opportunity, because at |
| 1575 | * domain and try to migrate the task to the least loaded CPU. | 1716 | * this point the task has the smallest effective memory and cache footprint. |
| 1576 | * | ||
| 1577 | * execve() is a valuable balancing opportunity, because at this point | ||
| 1578 | * the task has the smallest effective memory and cache footprint. | ||
| 1579 | */ | 1717 | */ |
| 1580 | void sched_exec(void) | 1718 | void sched_exec(void) |
| 1581 | { | 1719 | { |
| 1582 | struct sched_domain *tmp, *sd = NULL; | ||
| 1583 | int new_cpu, this_cpu = get_cpu(); | 1720 | int new_cpu, this_cpu = get_cpu(); |
| 1584 | 1721 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | |
| 1585 | /* Prefer the current CPU if there's only this task running */ | ||
| 1586 | if (this_rq()->nr_running <= 1) | ||
| 1587 | goto out; | ||
| 1588 | |||
| 1589 | for_each_domain(this_cpu, tmp) | ||
| 1590 | if (tmp->flags & SD_BALANCE_EXEC) | ||
| 1591 | sd = tmp; | ||
| 1592 | |||
| 1593 | if (sd) { | ||
| 1594 | schedstat_inc(sd, sbe_attempts); | ||
| 1595 | new_cpu = find_idlest_cpu(current, this_cpu, sd); | ||
| 1596 | if (new_cpu != this_cpu) { | ||
| 1597 | schedstat_inc(sd, sbe_pushed); | ||
| 1598 | put_cpu(); | ||
| 1599 | sched_migrate_task(current, new_cpu); | ||
| 1600 | return; | ||
| 1601 | } | ||
| 1602 | } | ||
| 1603 | out: | ||
| 1604 | put_cpu(); | 1722 | put_cpu(); |
| 1723 | if (new_cpu != this_cpu) | ||
| 1724 | sched_migrate_task(current, new_cpu); | ||
| 1605 | } | 1725 | } |
| 1606 | 1726 | ||
| 1607 | /* | 1727 | /* |
| @@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
| 1632 | */ | 1752 | */ |
| 1633 | static inline | 1753 | static inline |
| 1634 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 1754 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, |
| 1635 | struct sched_domain *sd, enum idle_type idle) | 1755 | struct sched_domain *sd, enum idle_type idle, int *all_pinned) |
| 1636 | { | 1756 | { |
| 1637 | /* | 1757 | /* |
| 1638 | * We do not migrate tasks that are: | 1758 | * We do not migrate tasks that are: |
| @@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
| 1640 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 1760 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
| 1641 | * 3) are cache-hot on their current CPU. | 1761 | * 3) are cache-hot on their current CPU. |
| 1642 | */ | 1762 | */ |
| 1643 | if (task_running(rq, p)) | ||
| 1644 | return 0; | ||
| 1645 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 1763 | if (!cpu_isset(this_cpu, p->cpus_allowed)) |
| 1646 | return 0; | 1764 | return 0; |
| 1765 | *all_pinned = 0; | ||
| 1766 | |||
| 1767 | if (task_running(rq, p)) | ||
| 1768 | return 0; | ||
| 1647 | 1769 | ||
| 1648 | /* | 1770 | /* |
| 1649 | * Aggressive migration if: | 1771 | * Aggressive migration if: |
| 1650 | * 1) the [whole] cpu is idle, or | 1772 | * 1) task is cache cold, or |
| 1651 | * 2) too many balance attempts have failed. | 1773 | * 2) too many balance attempts have failed. |
| 1652 | */ | 1774 | */ |
| 1653 | 1775 | ||
| 1654 | if (cpu_and_siblings_are_idle(this_cpu) || \ | 1776 | if (sd->nr_balance_failed > sd->cache_nice_tries) |
| 1655 | sd->nr_balance_failed > sd->cache_nice_tries) | ||
| 1656 | return 1; | 1777 | return 1; |
| 1657 | 1778 | ||
| 1658 | if (task_hot(p, rq->timestamp_last_tick, sd)) | 1779 | if (task_hot(p, rq->timestamp_last_tick, sd)) |
| 1659 | return 0; | 1780 | return 0; |
| 1660 | return 1; | 1781 | return 1; |
| 1661 | } | 1782 | } |
| 1662 | 1783 | ||
| @@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
| 1669 | */ | 1790 | */ |
| 1670 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 1791 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, |
| 1671 | unsigned long max_nr_move, struct sched_domain *sd, | 1792 | unsigned long max_nr_move, struct sched_domain *sd, |
| 1672 | enum idle_type idle) | 1793 | enum idle_type idle, int *all_pinned) |
| 1673 | { | 1794 | { |
| 1674 | prio_array_t *array, *dst_array; | 1795 | prio_array_t *array, *dst_array; |
| 1675 | struct list_head *head, *curr; | 1796 | struct list_head *head, *curr; |
| 1676 | int idx, pulled = 0; | 1797 | int idx, pulled = 0, pinned = 0; |
| 1677 | task_t *tmp; | 1798 | task_t *tmp; |
| 1678 | 1799 | ||
| 1679 | if (max_nr_move <= 0 || busiest->nr_running <= 1) | 1800 | if (max_nr_move == 0) |
| 1680 | goto out; | 1801 | goto out; |
| 1681 | 1802 | ||
| 1803 | pinned = 1; | ||
| 1804 | |||
| 1682 | /* | 1805 | /* |
| 1683 | * We first consider expired tasks. Those will likely not be | 1806 | * We first consider expired tasks. Those will likely not be |
| 1684 | * executed in the near future, and they are most likely to | 1807 | * executed in the near future, and they are most likely to |
| @@ -1717,7 +1840,7 @@ skip_queue: | |||
| 1717 | 1840 | ||
| 1718 | curr = curr->prev; | 1841 | curr = curr->prev; |
| 1719 | 1842 | ||
| 1720 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { | 1843 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { |
| 1721 | if (curr != head) | 1844 | if (curr != head) |
| 1722 | goto skip_queue; | 1845 | goto skip_queue; |
| 1723 | idx++; | 1846 | idx++; |
| @@ -1746,6 +1869,9 @@ out: | |||
| 1746 | * inside pull_task(). | 1869 | * inside pull_task(). |
| 1747 | */ | 1870 | */ |
| 1748 | schedstat_add(sd, lb_gained[idle], pulled); | 1871 | schedstat_add(sd, lb_gained[idle], pulled); |
| 1872 | |||
| 1873 | if (all_pinned) | ||
| 1874 | *all_pinned = pinned; | ||
| 1749 | return pulled; | 1875 | return pulled; |
| 1750 | } | 1876 | } |
| 1751 | 1877 | ||
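A toy model of the new "all pinned" bookkeeping: move_tasks() starts with pinned = 1 and can_migrate_task() clears it as soon as any candidate is even allowed on the destination CPU, so the balancer only backs off hard when affinity, not luck, blocked the move. The task list below is fabricated.

```c
#include <stdio.h>

struct task {
	int allowed_on_dst;	/* passes the cpus_allowed check */
	int running;		/* currently on a CPU, cannot be pulled */
};

static int can_migrate(const struct task *t, int *all_pinned)
{
	if (!t->allowed_on_dst)
		return 0;
	*all_pinned = 0;	/* at least one task could legally move */
	if (t->running)
		return 0;
	return 1;
}

int main(void)
{
	struct task tasks[] = { { 0, 0 }, { 1, 1 }, { 1, 0 } };
	int pinned = 1, moved = 0, i;

	for (i = 0; i < 3; i++)
		moved += can_migrate(&tasks[i], &pinned);

	/* moved = 1, all_pinned = 0: no need for the pinned backoff path. */
	printf("moved=%d all_pinned=%d\n", moved, pinned);
	return 0;
}
```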
| @@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1760 | { | 1886 | { |
| 1761 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 1887 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 1762 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 1888 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| 1889 | int load_idx; | ||
| 1763 | 1890 | ||
| 1764 | max_load = this_load = total_load = total_pwr = 0; | 1891 | max_load = this_load = total_load = total_pwr = 0; |
| 1892 | if (idle == NOT_IDLE) | ||
| 1893 | load_idx = sd->busy_idx; | ||
| 1894 | else if (idle == NEWLY_IDLE) | ||
| 1895 | load_idx = sd->newidle_idx; | ||
| 1896 | else | ||
| 1897 | load_idx = sd->idle_idx; | ||
| 1765 | 1898 | ||
| 1766 | do { | 1899 | do { |
| 1767 | unsigned long load; | 1900 | unsigned long load; |
| @@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1776 | for_each_cpu_mask(i, group->cpumask) { | 1909 | for_each_cpu_mask(i, group->cpumask) { |
| 1777 | /* Bias balancing toward cpus of our domain */ | 1910 | /* Bias balancing toward cpus of our domain */ |
| 1778 | if (local_group) | 1911 | if (local_group) |
| 1779 | load = target_load(i); | 1912 | load = target_load(i, load_idx); |
| 1780 | else | 1913 | else |
| 1781 | load = source_load(i); | 1914 | load = source_load(i, load_idx); |
| 1782 | 1915 | ||
| 1783 | avg_load += load; | 1916 | avg_load += load; |
| 1784 | } | 1917 | } |
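A plain helper-style sketch of the new load_idx selection feeding those source_load()/target_load() calls, with the sched_domain fields replaced by ordinary parameters for the illustration: the balancing reason decides how much history to trust.

```c
#include <stdio.h>

enum balance_reason { NOT_IDLE, NEWLY_IDLE, FULLY_IDLE };

static int pick_load_idx(enum balance_reason why, int busy_idx,
			 int newidle_idx, int idle_idx)
{
	if (why == NOT_IDLE)
		return busy_idx;	/* busy CPU: smooth over longer history */
	if (why == NEWLY_IDLE)
		return newidle_idx;	/* just went idle: react quickly */
	return idle_idx;		/* periodic idle balancing */
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_load_idx(NOT_IDLE, 2, 0, 1),
	       pick_load_idx(NEWLY_IDLE, 2, 0, 1),
	       pick_load_idx(FULLY_IDLE, 2, 0, 1));	/* 2 0 1 */
	return 0;
}
```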
| @@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1792 | if (local_group) { | 1925 | if (local_group) { |
| 1793 | this_load = avg_load; | 1926 | this_load = avg_load; |
| 1794 | this = group; | 1927 | this = group; |
| 1795 | goto nextgroup; | ||
| 1796 | } else if (avg_load > max_load) { | 1928 | } else if (avg_load > max_load) { |
| 1797 | max_load = avg_load; | 1929 | max_load = avg_load; |
| 1798 | busiest = group; | 1930 | busiest = group; |
| 1799 | } | 1931 | } |
| 1800 | nextgroup: | ||
| 1801 | group = group->next; | 1932 | group = group->next; |
| 1802 | } while (group != sd->groups); | 1933 | } while (group != sd->groups); |
| 1803 | 1934 | ||
| @@ -1870,15 +2001,9 @@ nextgroup: | |||
| 1870 | 2001 | ||
| 1871 | /* Get rid of the scaling factor, rounding down as we divide */ | 2002 | /* Get rid of the scaling factor, rounding down as we divide */ |
| 1872 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | 2003 | *imbalance = *imbalance / SCHED_LOAD_SCALE; |
| 1873 | |||
| 1874 | return busiest; | 2004 | return busiest; |
| 1875 | 2005 | ||
| 1876 | out_balanced: | 2006 | out_balanced: |
| 1877 | if (busiest && (idle == NEWLY_IDLE || | ||
| 1878 | (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { | ||
| 1879 | *imbalance = 1; | ||
| 1880 | return busiest; | ||
| 1881 | } | ||
| 1882 | 2007 | ||
| 1883 | *imbalance = 0; | 2008 | *imbalance = 0; |
| 1884 | return NULL; | 2009 | return NULL; |
| @@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) | |||
| 1894 | int i; | 2019 | int i; |
| 1895 | 2020 | ||
| 1896 | for_each_cpu_mask(i, group->cpumask) { | 2021 | for_each_cpu_mask(i, group->cpumask) { |
| 1897 | load = source_load(i); | 2022 | load = source_load(i, 0); |
| 1898 | 2023 | ||
| 1899 | if (load > max_load) { | 2024 | if (load > max_load) { |
| 1900 | max_load = load; | 2025 | max_load = load; |
| @@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) | |||
| 1906 | } | 2031 | } |
| 1907 | 2032 | ||
| 1908 | /* | 2033 | /* |
| 2034 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | ||
| 2035 | * so long as it is large enough. | ||
| 2036 | */ | ||
| 2037 | #define MAX_PINNED_INTERVAL 512 | ||
| 2038 | |||
| 2039 | /* | ||
| 1909 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2040 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
| 1910 | * tasks if there is an imbalance. | 2041 | * tasks if there is an imbalance. |
| 1911 | * | 2042 | * |
| @@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 1917 | struct sched_group *group; | 2048 | struct sched_group *group; |
| 1918 | runqueue_t *busiest; | 2049 | runqueue_t *busiest; |
| 1919 | unsigned long imbalance; | 2050 | unsigned long imbalance; |
| 1920 | int nr_moved; | 2051 | int nr_moved, all_pinned = 0; |
| 2052 | int active_balance = 0; | ||
| 1921 | 2053 | ||
| 1922 | spin_lock(&this_rq->lock); | 2054 | spin_lock(&this_rq->lock); |
| 1923 | schedstat_inc(sd, lb_cnt[idle]); | 2055 | schedstat_inc(sd, lb_cnt[idle]); |
| @@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 1934 | goto out_balanced; | 2066 | goto out_balanced; |
| 1935 | } | 2067 | } |
| 1936 | 2068 | ||
| 1937 | /* | 2069 | BUG_ON(busiest == this_rq); |
| 1938 | * This should be "impossible", but since load | ||
| 1939 | * balancing is inherently racy and statistical, | ||
| 1940 | * it could happen in theory. | ||
| 1941 | */ | ||
| 1942 | if (unlikely(busiest == this_rq)) { | ||
| 1943 | WARN_ON(1); | ||
| 1944 | goto out_balanced; | ||
| 1945 | } | ||
| 1946 | 2070 | ||
| 1947 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 2071 | schedstat_add(sd, lb_imbalance[idle], imbalance); |
| 1948 | 2072 | ||
| @@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 1956 | */ | 2080 | */ |
| 1957 | double_lock_balance(this_rq, busiest); | 2081 | double_lock_balance(this_rq, busiest); |
| 1958 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2082 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 1959 | imbalance, sd, idle); | 2083 | imbalance, sd, idle, |
| 2084 | &all_pinned); | ||
| 1960 | spin_unlock(&busiest->lock); | 2085 | spin_unlock(&busiest->lock); |
| 2086 | |||
| 2087 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
| 2088 | if (unlikely(all_pinned)) | ||
| 2089 | goto out_balanced; | ||
| 1961 | } | 2090 | } |
| 2091 | |||
| 1962 | spin_unlock(&this_rq->lock); | 2092 | spin_unlock(&this_rq->lock); |
| 1963 | 2093 | ||
| 1964 | if (!nr_moved) { | 2094 | if (!nr_moved) { |
| @@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 1966 | sd->nr_balance_failed++; | 2096 | sd->nr_balance_failed++; |
| 1967 | 2097 | ||
| 1968 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2098 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
| 1969 | int wake = 0; | ||
| 1970 | 2099 | ||
| 1971 | spin_lock(&busiest->lock); | 2100 | spin_lock(&busiest->lock); |
| 1972 | if (!busiest->active_balance) { | 2101 | if (!busiest->active_balance) { |
| 1973 | busiest->active_balance = 1; | 2102 | busiest->active_balance = 1; |
| 1974 | busiest->push_cpu = this_cpu; | 2103 | busiest->push_cpu = this_cpu; |
| 1975 | wake = 1; | 2104 | active_balance = 1; |
| 1976 | } | 2105 | } |
| 1977 | spin_unlock(&busiest->lock); | 2106 | spin_unlock(&busiest->lock); |
| 1978 | if (wake) | 2107 | if (active_balance) |
| 1979 | wake_up_process(busiest->migration_thread); | 2108 | wake_up_process(busiest->migration_thread); |
| 1980 | 2109 | ||
| 1981 | /* | 2110 | /* |
| 1982 | * We've kicked active balancing, reset the failure | 2111 | * We've kicked active balancing, reset the failure |
| 1983 | * counter. | 2112 | * counter. |
| 1984 | */ | 2113 | */ |
| 1985 | sd->nr_balance_failed = sd->cache_nice_tries; | 2114 | sd->nr_balance_failed = sd->cache_nice_tries+1; |
| 1986 | } | 2115 | } |
| 1987 | 2116 | } else | |
| 1988 | /* | ||
| 1989 | * We were unbalanced, but unsuccessful in move_tasks(), | ||
| 1990 | * so bump the balance_interval to lessen the lock contention. | ||
| 1991 | */ | ||
| 1992 | if (sd->balance_interval < sd->max_interval) | ||
| 1993 | sd->balance_interval++; | ||
| 1994 | } else { | ||
| 1995 | sd->nr_balance_failed = 0; | 2117 | sd->nr_balance_failed = 0; |
| 1996 | 2118 | ||
| 2119 | if (likely(!active_balance)) { | ||
| 1997 | /* We were unbalanced, so reset the balancing interval */ | 2120 | /* We were unbalanced, so reset the balancing interval */ |
| 1998 | sd->balance_interval = sd->min_interval; | 2121 | sd->balance_interval = sd->min_interval; |
| 2122 | } else { | ||
| 2123 | /* | ||
| 2124 | * If we've begun active balancing, start to back off. This | ||
| 2125 | * case may not be covered by the all_pinned logic if there | ||
| 2126 | * is only 1 task on the busy runqueue (because we don't call | ||
| 2127 | * move_tasks). | ||
| 2128 | */ | ||
| 2129 | if (sd->balance_interval < sd->max_interval) | ||
| 2130 | sd->balance_interval *= 2; | ||
| 1999 | } | 2131 | } |
| 2000 | 2132 | ||
| 2001 | return nr_moved; | 2133 | return nr_moved; |
| @@ -2005,8 +2137,10 @@ out_balanced: | |||
| 2005 | 2137 | ||
| 2006 | schedstat_inc(sd, lb_balanced[idle]); | 2138 | schedstat_inc(sd, lb_balanced[idle]); |
| 2007 | 2139 | ||
| 2140 | sd->nr_balance_failed = 0; | ||
| 2008 | /* tune up the balancing interval */ | 2141 | /* tune up the balancing interval */ |
| 2009 | if (sd->balance_interval < sd->max_interval) | 2142 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
| 2143 | (sd->balance_interval < sd->max_interval)) | ||
| 2010 | sd->balance_interval *= 2; | 2144 | sd->balance_interval *= 2; |
| 2011 | 2145 | ||
| 2012 | return 0; | 2146 | return 0; |
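A sketch of the interval backoff taken on this "nothing to move" path: when every candidate task was pinned, the interval may keep doubling up to MAX_PINNED_INTERVAL even past the domain's usual max_interval, so repeatedly futile balancing gets progressively cheaper. The starting interval of 8 and max_interval of 64 are sample values.

```c
#include <stdio.h>

#define MAX_PINNED_INTERVAL 512UL

static unsigned long backoff(unsigned long interval, int all_pinned,
			     unsigned long max_interval)
{
	if ((all_pinned && interval < MAX_PINNED_INTERVAL) ||
	    interval < max_interval)
		interval *= 2;
	return interval;
}

int main(void)
{
	unsigned long interval = 8;
	int i;

	for (i = 0; i < 8; i++) {
		interval = backoff(interval, 1, 64);
		printf("%lu ", interval);	/* 16 32 64 128 256 512 512 512 */
	}
	printf("\n");
	return 0;
}
```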
| @@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
| 2030 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2164 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
| 2031 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); | 2165 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); |
| 2032 | if (!group) { | 2166 | if (!group) { |
| 2033 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
| 2034 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2167 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
| 2035 | goto out; | 2168 | goto out_balanced; |
| 2036 | } | 2169 | } |
| 2037 | 2170 | ||
| 2038 | busiest = find_busiest_queue(group); | 2171 | busiest = find_busiest_queue(group); |
| 2039 | if (!busiest || busiest == this_rq) { | 2172 | if (!busiest) { |
| 2040 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
| 2041 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2173 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
| 2042 | goto out; | 2174 | goto out_balanced; |
| 2043 | } | 2175 | } |
| 2044 | 2176 | ||
| 2177 | BUG_ON(busiest == this_rq); | ||
| 2178 | |||
| 2045 | /* Attempt to move tasks */ | 2179 | /* Attempt to move tasks */ |
| 2046 | double_lock_balance(this_rq, busiest); | 2180 | double_lock_balance(this_rq, busiest); |
| 2047 | 2181 | ||
| 2048 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | 2182 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); |
| 2049 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2183 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 2050 | imbalance, sd, NEWLY_IDLE); | 2184 | imbalance, sd, NEWLY_IDLE, NULL); |
| 2051 | if (!nr_moved) | 2185 | if (!nr_moved) |
| 2052 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | 2186 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); |
| 2187 | else | ||
| 2188 | sd->nr_balance_failed = 0; | ||
| 2053 | 2189 | ||
| 2054 | spin_unlock(&busiest->lock); | 2190 | spin_unlock(&busiest->lock); |
| 2055 | |||
| 2056 | out: | ||
| 2057 | return nr_moved; | 2191 | return nr_moved; |
| 2192 | |||
| 2193 | out_balanced: | ||
| 2194 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
| 2195 | sd->nr_balance_failed = 0; | ||
| 2196 | return 0; | ||
| 2058 | } | 2197 | } |
| 2059 | 2198 | ||
| 2060 | /* | 2199 | /* |
| @@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) | |||
| 2086 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | 2225 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) |
| 2087 | { | 2226 | { |
| 2088 | struct sched_domain *sd; | 2227 | struct sched_domain *sd; |
| 2089 | struct sched_group *cpu_group; | ||
| 2090 | runqueue_t *target_rq; | 2228 | runqueue_t *target_rq; |
| 2091 | cpumask_t visited_cpus; | 2229 | int target_cpu = busiest_rq->push_cpu; |
| 2092 | int cpu; | 2230 | |
| 2231 | if (busiest_rq->nr_running <= 1) | ||
| 2232 | /* no task to move */ | ||
| 2233 | return; | ||
| 2234 | |||
| 2235 | target_rq = cpu_rq(target_cpu); | ||
| 2093 | 2236 | ||
| 2094 | /* | 2237 | /* |
| 2095 | * Search for suitable CPUs to push tasks to in successively higher | 2238 | * This condition is "impossible", if it occurs |
| 2096 | * domains with SD_LOAD_BALANCE set. | 2239 | * we need to fix it. Originally reported by |
| 2240 | * Bjorn Helgaas on a 128-cpu setup. | ||
| 2097 | */ | 2241 | */ |
| 2098 | visited_cpus = CPU_MASK_NONE; | 2242 | BUG_ON(busiest_rq == target_rq); |
| 2099 | for_each_domain(busiest_cpu, sd) { | ||
| 2100 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
| 2101 | /* no more domains to search */ | ||
| 2102 | break; | ||
| 2103 | 2243 | ||
| 2104 | schedstat_inc(sd, alb_cnt); | 2244 | /* move a task from busiest_rq to target_rq */ |
| 2245 | double_lock_balance(busiest_rq, target_rq); | ||
| 2105 | 2246 | ||
| 2106 | cpu_group = sd->groups; | 2247 | /* Search for an sd spanning us and the target CPU. */ |
| 2107 | do { | 2248 | for_each_domain(target_cpu, sd) |
| 2108 | for_each_cpu_mask(cpu, cpu_group->cpumask) { | 2249 | if ((sd->flags & SD_LOAD_BALANCE) && |
| 2109 | if (busiest_rq->nr_running <= 1) | 2250 | cpu_isset(busiest_cpu, sd->span)) |
| 2110 | /* no more tasks left to move */ | 2251 | break; |
| 2111 | return; | 2252 | |
| 2112 | if (cpu_isset(cpu, visited_cpus)) | 2253 | if (unlikely(sd == NULL)) |
| 2113 | continue; | 2254 | goto out; |
| 2114 | cpu_set(cpu, visited_cpus); | 2255 | |
| 2115 | if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) | 2256 | schedstat_inc(sd, alb_cnt); |
| 2116 | continue; | 2257 | |
| 2117 | 2258 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | |
| 2118 | target_rq = cpu_rq(cpu); | 2259 | schedstat_inc(sd, alb_pushed); |
| 2119 | /* | 2260 | else |
| 2120 | * This condition is "impossible", if it occurs | 2261 | schedstat_inc(sd, alb_failed); |
| 2121 | * we need to fix it. Originally reported by | 2262 | out: |
| 2122 | * Bjorn Helgaas on a 128-cpu setup. | 2263 | spin_unlock(&target_rq->lock); |
| 2123 | */ | ||
| 2124 | BUG_ON(busiest_rq == target_rq); | ||
| 2125 | |||
| 2126 | /* move a task from busiest_rq to target_rq */ | ||
| 2127 | double_lock_balance(busiest_rq, target_rq); | ||
| 2128 | if (move_tasks(target_rq, cpu, busiest_rq, | ||
| 2129 | 1, sd, SCHED_IDLE)) { | ||
| 2130 | schedstat_inc(sd, alb_pushed); | ||
| 2131 | } else { | ||
| 2132 | schedstat_inc(sd, alb_failed); | ||
| 2133 | } | ||
| 2134 | spin_unlock(&target_rq->lock); | ||
| 2135 | } | ||
| 2136 | cpu_group = cpu_group->next; | ||
| 2137 | } while (cpu_group != sd->groups); | ||
| 2138 | } | ||
| 2139 | } | 2264 | } |
| 2140 | 2265 | ||
| 2141 | /* | 2266 | /* |
| @@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
| 2156 | unsigned long old_load, this_load; | 2281 | unsigned long old_load, this_load; |
| 2157 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | 2282 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); |
| 2158 | struct sched_domain *sd; | 2283 | struct sched_domain *sd; |
| 2284 | int i; | ||
| 2159 | 2285 | ||
| 2160 | /* Update our load */ | ||
| 2161 | old_load = this_rq->cpu_load; | ||
| 2162 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | 2286 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; |
| 2163 | /* | 2287 | /* Update our load */ |
| 2164 | * Round up the averaging division if load is increasing. This | 2288 | for (i = 0; i < 3; i++) { |
| 2165 | * prevents us from getting stuck on 9 if the load is 10, for | 2289 | unsigned long new_load = this_load; |
| 2166 | * example. | 2290 | int scale = 1 << i; |
| 2167 | */ | 2291 | old_load = this_rq->cpu_load[i]; |
| 2168 | if (this_load > old_load) | 2292 | /* |
| 2169 | old_load++; | 2293 | * Round up the averaging division if load is increasing. This |
| 2170 | this_rq->cpu_load = (old_load + this_load) / 2; | 2294 | * prevents us from getting stuck on 9 if the load is 10, for |
| 2295 | * example. | ||
| 2296 | */ | ||
| 2297 | if (new_load > old_load) | ||
| 2298 | new_load += scale-1; | ||
| 2299 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | ||
| 2300 | } | ||
| 2171 | 2301 | ||
| 2172 | for_each_domain(this_cpu, sd) { | 2302 | for_each_domain(this_cpu, sd) { |
| 2173 | unsigned long interval; | 2303 | unsigned long interval; |
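The new cpu_load[] update above is the source of the history that source_load()/target_load() index: slot i decays with scale 2^i, and the division is rounded up while load is rising so the average can actually reach the new value. A stand-alone model (not kernel code, with a made-up tick sequence) shows the slower slots lagging behind:

```c
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static void update_cpu_load(unsigned long cpu_load[3], unsigned long this_load)
{
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long scale = 1UL << i;
		unsigned long new_load = this_load;

		if (new_load > cpu_load[i])
			new_load += scale - 1;	/* round up when increasing */
		cpu_load[i] = (cpu_load[i] * (scale - 1) + new_load) / scale;
	}
}

int main(void)
{
	unsigned long cpu_load[3] = { 0, 0, 0 };
	int tick;

	/* Three ticks with 2 runnable tasks: slot 0 tracks instantly,
	 * slots 1 and 2 converge more slowly. */
	for (tick = 0; tick < 3; tick++) {
		update_cpu_load(cpu_load, 2 * SCHED_LOAD_SCALE);
		printf("tick %d: %lu %lu %lu\n", tick,
		       cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}
```

Keeping several decay rates lets each balancing path pick its own trade-off between reacting to spikes and ignoring them, via the per-domain *_idx fields used earlier in the patch.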
| @@ -2447,11 +2577,15 @@ out: | |||
| 2447 | #ifdef CONFIG_SCHED_SMT | 2577 | #ifdef CONFIG_SCHED_SMT |
| 2448 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2578 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) |
| 2449 | { | 2579 | { |
| 2450 | struct sched_domain *sd = this_rq->sd; | 2580 | struct sched_domain *tmp, *sd = NULL; |
| 2451 | cpumask_t sibling_map; | 2581 | cpumask_t sibling_map; |
| 2452 | int i; | 2582 | int i; |
| 2453 | 2583 | ||
| 2454 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 2584 | for_each_domain(this_cpu, tmp) |
| 2585 | if (tmp->flags & SD_SHARE_CPUPOWER) | ||
| 2586 | sd = tmp; | ||
| 2587 | |||
| 2588 | if (!sd) | ||
| 2455 | return; | 2589 | return; |
| 2456 | 2590 | ||
| 2457 | /* | 2591 | /* |
| @@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
| 2492 | 2626 | ||
| 2493 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2627 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
| 2494 | { | 2628 | { |
| 2495 | struct sched_domain *sd = this_rq->sd; | 2629 | struct sched_domain *tmp, *sd = NULL; |
| 2496 | cpumask_t sibling_map; | 2630 | cpumask_t sibling_map; |
| 2497 | prio_array_t *array; | 2631 | prio_array_t *array; |
| 2498 | int ret = 0, i; | 2632 | int ret = 0, i; |
| 2499 | task_t *p; | 2633 | task_t *p; |
| 2500 | 2634 | ||
| 2501 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 2635 | for_each_domain(this_cpu, tmp) |
| 2636 | if (tmp->flags & SD_SHARE_CPUPOWER) | ||
| 2637 | sd = tmp; | ||
| 2638 | |||
| 2639 | if (!sd) | ||
| 2502 | return 0; | 2640 | return 0; |
| 2503 | 2641 | ||
| 2504 | /* | 2642 | /* |
| @@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val) | |||
| 2576 | /* | 2714 | /* |
| 2577 | * Underflow? | 2715 | * Underflow? |
| 2578 | */ | 2716 | */ |
| 2579 | BUG_ON(((int)preempt_count() < 0)); | 2717 | BUG_ON((preempt_count() < 0)); |
| 2580 | preempt_count() += val; | 2718 | preempt_count() += val; |
| 2581 | /* | 2719 | /* |
| 2582 | * Spinlock count overflowing soon? | 2720 | * Spinlock count overflowing soon? |
| @@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void) | |||
| 2613 | struct list_head *queue; | 2751 | struct list_head *queue; |
| 2614 | unsigned long long now; | 2752 | unsigned long long now; |
| 2615 | unsigned long run_time; | 2753 | unsigned long run_time; |
| 2616 | int cpu, idx; | 2754 | int cpu, idx, new_prio; |
| 2617 | 2755 | ||
| 2618 | /* | 2756 | /* |
| 2619 | * Test if we are atomic. Since do_exit() needs to call into | 2757 | * Test if we are atomic. Since do_exit() needs to call into |
| @@ -2735,9 +2873,14 @@ go_idle: | |||
| 2735 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | 2873 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
| 2736 | 2874 | ||
| 2737 | array = next->array; | 2875 | array = next->array; |
| 2738 | dequeue_task(next, array); | 2876 | new_prio = recalc_task_prio(next, next->timestamp + delta); |
| 2739 | recalc_task_prio(next, next->timestamp + delta); | 2877 | |
| 2740 | enqueue_task(next, array); | 2878 | if (unlikely(next->prio != new_prio)) { |
| 2879 | dequeue_task(next, array); | ||
| 2880 | next->prio = new_prio; | ||
| 2881 | enqueue_task(next, array); | ||
| 2882 | } else | ||
| 2883 | requeue_task(next, array); | ||
| 2741 | } | 2884 | } |
| 2742 | next->activated = 0; | 2885 | next->activated = 0; |
| 2743 | switch_tasks: | 2886 | switch_tasks: |
| @@ -2761,11 +2904,15 @@ switch_tasks: | |||
| 2761 | rq->curr = next; | 2904 | rq->curr = next; |
| 2762 | ++*switch_count; | 2905 | ++*switch_count; |
| 2763 | 2906 | ||
| 2764 | prepare_arch_switch(rq, next); | 2907 | prepare_task_switch(rq, next); |
| 2765 | prev = context_switch(rq, prev, next); | 2908 | prev = context_switch(rq, prev, next); |
| 2766 | barrier(); | 2909 | barrier(); |
| 2767 | 2910 | /* | |
| 2768 | finish_task_switch(prev); | 2911 | * this_rq must be evaluated again because prev may have moved |
| 2912 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
| 2913 | * frame will be invalid. | ||
| 2914 | */ | ||
| 2915 | finish_task_switch(this_rq(), prev); | ||
| 2769 | } else | 2916 | } else |
| 2770 | spin_unlock_irq(&rq->lock); | 2917 | spin_unlock_irq(&rq->lock); |
| 2771 | 2918 | ||
| @@ -2869,7 +3016,7 @@ need_resched: | |||
| 2869 | 3016 | ||
| 2870 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) | 3017 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) |
| 2871 | { | 3018 | { |
| 2872 | task_t *p = curr->task; | 3019 | task_t *p = curr->private; |
| 2873 | return try_to_wake_up(p, mode, sync); | 3020 | return try_to_wake_up(p, mode, sync); |
| 2874 | } | 3021 | } |
| 2875 | 3022 | ||
| @@ -3384,13 +3531,24 @@ recheck: | |||
| 3384 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) | 3531 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) |
| 3385 | return -EINVAL; | 3532 | return -EINVAL; |
| 3386 | 3533 | ||
| 3387 | if ((policy == SCHED_FIFO || policy == SCHED_RR) && | 3534 | /* |
| 3388 | param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && | 3535 | * Allow unprivileged RT tasks to decrease priority: |
| 3389 | !capable(CAP_SYS_NICE)) | 3536 | */ |
| 3390 | return -EPERM; | 3537 | if (!capable(CAP_SYS_NICE)) { |
| 3391 | if ((current->euid != p->euid) && (current->euid != p->uid) && | 3538 | /* can't change policy */ |
| 3392 | !capable(CAP_SYS_NICE)) | 3539 | if (policy != p->policy) |
| 3393 | return -EPERM; | 3540 | return -EPERM; |
| 3541 | /* can't increase priority */ | ||
| 3542 | if (policy != SCHED_NORMAL && | ||
| 3543 | param->sched_priority > p->rt_priority && | ||
| 3544 | param->sched_priority > | ||
| 3545 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | ||
| 3546 | return -EPERM; | ||
| 3547 | /* can't change other user's priorities */ | ||
| 3548 | if ((current->euid != p->euid) && | ||
| 3549 | (current->euid != p->uid)) | ||
| 3550 | return -EPERM; | ||
| 3551 | } | ||
| 3394 | 3552 | ||
| 3395 | retval = security_task_setscheduler(p, policy, param); | 3553 | retval = security_task_setscheduler(p, policy, param); |
| 3396 | if (retval) | 3554 | if (retval) |
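From userspace, the relaxed permission check means an unprivileged realtime task may lower its own priority without CAP_SYS_NICE, while policy changes and raises beyond RLIMIT_RTPRIO still fail. A minimal sketch; whether the initial SCHED_RR setup succeeds depends on the privileges and rlimits of the machine it runs on.

```c
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_RR, &sp) != 0) {
		perror("initial SCHED_RR setup needs privilege or RLIMIT_RTPRIO");
		return 1;
	}

	sp.sched_priority = 5;	/* decrease: allowed without CAP_SYS_NICE */
	printf("lower: %d\n", sched_setscheduler(0, SCHED_RR, &sp));

	sp.sched_priority = 99;	/* above RLIMIT_RTPRIO: expect -1/EPERM */
	printf("raise: %d\n", sched_setscheduler(0, SCHED_RR, &sp));
	return 0;
}
```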
| @@ -3755,19 +3913,22 @@ EXPORT_SYMBOL(cond_resched); | |||
| 3755 | */ | 3913 | */ |
| 3756 | int cond_resched_lock(spinlock_t * lock) | 3914 | int cond_resched_lock(spinlock_t * lock) |
| 3757 | { | 3915 | { |
| 3916 | int ret = 0; | ||
| 3917 | |||
| 3758 | if (need_lockbreak(lock)) { | 3918 | if (need_lockbreak(lock)) { |
| 3759 | spin_unlock(lock); | 3919 | spin_unlock(lock); |
| 3760 | cpu_relax(); | 3920 | cpu_relax(); |
| 3921 | ret = 1; | ||
| 3761 | spin_lock(lock); | 3922 | spin_lock(lock); |
| 3762 | } | 3923 | } |
| 3763 | if (need_resched()) { | 3924 | if (need_resched()) { |
| 3764 | _raw_spin_unlock(lock); | 3925 | _raw_spin_unlock(lock); |
| 3765 | preempt_enable_no_resched(); | 3926 | preempt_enable_no_resched(); |
| 3766 | __cond_resched(); | 3927 | __cond_resched(); |
| 3928 | ret = 1; | ||
| 3767 | spin_lock(lock); | 3929 | spin_lock(lock); |
| 3768 | return 1; | ||
| 3769 | } | 3930 | } |
| 3770 | return 0; | 3931 | return ret; |
| 3771 | } | 3932 | } |
| 3772 | 3933 | ||
| 3773 | EXPORT_SYMBOL(cond_resched_lock); | 3934 | EXPORT_SYMBOL(cond_resched_lock); |
| @@ -3811,7 +3972,7 @@ EXPORT_SYMBOL(yield); | |||
| 3811 | */ | 3972 | */ |
| 3812 | void __sched io_schedule(void) | 3973 | void __sched io_schedule(void) |
| 3813 | { | 3974 | { |
| 3814 | struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); | 3975 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); |
| 3815 | 3976 | ||
| 3816 | atomic_inc(&rq->nr_iowait); | 3977 | atomic_inc(&rq->nr_iowait); |
| 3817 | schedule(); | 3978 | schedule(); |
| @@ -3822,7 +3983,7 @@ EXPORT_SYMBOL(io_schedule); | |||
| 3822 | 3983 | ||
| 3823 | long __sched io_schedule_timeout(long timeout) | 3984 | long __sched io_schedule_timeout(long timeout) |
| 3824 | { | 3985 | { |
| 3825 | struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); | 3986 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); |
| 3826 | long ret; | 3987 | long ret; |
| 3827 | 3988 | ||
| 3828 | atomic_inc(&rq->nr_iowait); | 3989 | atomic_inc(&rq->nr_iowait); |
| @@ -4027,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
| 4027 | 4188 | ||
| 4028 | spin_lock_irqsave(&rq->lock, flags); | 4189 | spin_lock_irqsave(&rq->lock, flags); |
| 4029 | rq->curr = rq->idle = idle; | 4190 | rq->curr = rq->idle = idle; |
| 4191 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | ||
| 4192 | idle->oncpu = 1; | ||
| 4193 | #endif | ||
| 4030 | set_tsk_need_resched(idle); | 4194 | set_tsk_need_resched(idle); |
| 4031 | spin_unlock_irqrestore(&rq->lock, flags); | 4195 | spin_unlock_irqrestore(&rq->lock, flags); |
| 4032 | 4196 | ||
| @@ -4171,8 +4335,7 @@ static int migration_thread(void * data) | |||
| 4171 | struct list_head *head; | 4335 | struct list_head *head; |
| 4172 | migration_req_t *req; | 4336 | migration_req_t *req; |
| 4173 | 4337 | ||
| 4174 | if (current->flags & PF_FREEZE) | 4338 | try_to_freeze(); |
| 4175 | refrigerator(PF_FREEZE); | ||
| 4176 | 4339 | ||
| 4177 | spin_lock_irq(&rq->lock); | 4340 | spin_lock_irq(&rq->lock); |
| 4178 | 4341 | ||
| @@ -4197,17 +4360,9 @@ static int migration_thread(void * data) | |||
| 4197 | req = list_entry(head->next, migration_req_t, list); | 4360 | req = list_entry(head->next, migration_req_t, list); |
| 4198 | list_del_init(head->next); | 4361 | list_del_init(head->next); |
| 4199 | 4362 | ||
| 4200 | if (req->type == REQ_MOVE_TASK) { | 4363 | spin_unlock(&rq->lock); |
| 4201 | spin_unlock(&rq->lock); | 4364 | __migrate_task(req->task, cpu, req->dest_cpu); |
| 4202 | __migrate_task(req->task, cpu, req->dest_cpu); | 4365 | local_irq_enable(); |
| 4203 | local_irq_enable(); | ||
| 4204 | } else if (req->type == REQ_SET_DOMAIN) { | ||
| 4205 | rq->sd = req->sd; | ||
| 4206 | spin_unlock_irq(&rq->lock); | ||
| 4207 | } else { | ||
| 4208 | spin_unlock_irq(&rq->lock); | ||
| 4209 | WARN_ON(1); | ||
| 4210 | } | ||
| 4211 | 4366 | ||
| 4212 | complete(&req->done); | 4367 | complete(&req->done); |
| 4213 | } | 4368 | } |
| @@ -4438,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
| 4438 | migration_req_t *req; | 4593 | migration_req_t *req; |
| 4439 | req = list_entry(rq->migration_queue.next, | 4594 | req = list_entry(rq->migration_queue.next, |
| 4440 | migration_req_t, list); | 4595 | migration_req_t, list); |
| 4441 | BUG_ON(req->type != REQ_MOVE_TASK); | ||
| 4442 | list_del_init(&req->list); | 4596 | list_del_init(&req->list); |
| 4443 | complete(&req->done); | 4597 | complete(&req->done); |
| 4444 | } | 4598 | } |
| @@ -4469,12 +4623,17 @@ int __init migration_init(void) | |||
| 4469 | #endif | 4623 | #endif |
| 4470 | 4624 | ||
| 4471 | #ifdef CONFIG_SMP | 4625 | #ifdef CONFIG_SMP |
| 4472 | #define SCHED_DOMAIN_DEBUG | 4626 | #undef SCHED_DOMAIN_DEBUG |
| 4473 | #ifdef SCHED_DOMAIN_DEBUG | 4627 | #ifdef SCHED_DOMAIN_DEBUG |
| 4474 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 4628 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
| 4475 | { | 4629 | { |
| 4476 | int level = 0; | 4630 | int level = 0; |
| 4477 | 4631 | ||
| 4632 | if (!sd) { | ||
| 4633 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
| 4634 | return; | ||
| 4635 | } | ||
| 4636 | |||
| 4478 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 4637 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
| 4479 | 4638 | ||
| 4480 | do { | 4639 | do { |
| @@ -4557,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 4557 | #define sched_domain_debug(sd, cpu) {} | 4716 | #define sched_domain_debug(sd, cpu) {} |
| 4558 | #endif | 4717 | #endif |
| 4559 | 4718 | ||
| 4719 | static int sd_degenerate(struct sched_domain *sd) | ||
| 4720 | { | ||
| 4721 | if (cpus_weight(sd->span) == 1) | ||
| 4722 | return 1; | ||
| 4723 | |||
| 4724 | /* Following flags need at least 2 groups */ | ||
| 4725 | if (sd->flags & (SD_LOAD_BALANCE | | ||
| 4726 | SD_BALANCE_NEWIDLE | | ||
| 4727 | SD_BALANCE_FORK | | ||
| 4728 | SD_BALANCE_EXEC)) { | ||
| 4729 | if (sd->groups != sd->groups->next) | ||
| 4730 | return 0; | ||
| 4731 | } | ||
| 4732 | |||
| 4733 | /* Following flags don't use groups */ | ||
| 4734 | if (sd->flags & (SD_WAKE_IDLE | | ||
| 4735 | SD_WAKE_AFFINE | | ||
| 4736 | SD_WAKE_BALANCE)) | ||
| 4737 | return 0; | ||
| 4738 | |||
| 4739 | return 1; | ||
| 4740 | } | ||
| 4741 | |||
| 4742 | static int sd_parent_degenerate(struct sched_domain *sd, | ||
| 4743 | struct sched_domain *parent) | ||
| 4744 | { | ||
| 4745 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
| 4746 | |||
| 4747 | if (sd_degenerate(parent)) | ||
| 4748 | return 1; | ||
| 4749 | |||
| 4750 | if (!cpus_equal(sd->span, parent->span)) | ||
| 4751 | return 0; | ||
| 4752 | |||
| 4753 | /* Does parent contain flags not in child? */ | ||
| 4754 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | ||
| 4755 | if (cflags & SD_WAKE_AFFINE) | ||
| 4756 | pflags &= ~SD_WAKE_BALANCE; | ||
| 4757 | /* Flags needing groups don't count if only 1 group in parent */ | ||
| 4758 | if (parent->groups == parent->groups->next) { | ||
| 4759 | pflags &= ~(SD_LOAD_BALANCE | | ||
| 4760 | SD_BALANCE_NEWIDLE | | ||
| 4761 | SD_BALANCE_FORK | | ||
| 4762 | SD_BALANCE_EXEC); | ||
| 4763 | } | ||
| 4764 | if (~cflags & pflags) | ||
| 4765 | return 0; | ||
| 4766 | |||
| 4767 | return 1; | ||
| 4768 | } | ||
| 4769 | |||
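A minimal model of the parent-degeneration test introduced here: a parent domain is redundant when it spans the same CPUs and offers no balancing capability beyond its child's. The flag values are invented for the example; only the bit logic mirrors the patch.

```c
#include <stdio.h>

#define SD_LOAD_BALANCE	0x01
#define SD_BALANCE_EXEC	0x02
#define SD_WAKE_AFFINE	0x04
#define SD_WAKE_BALANCE	0x08

static int parent_redundant(unsigned long cflags, unsigned long pflags,
			    int same_span, int parent_single_group)
{
	if (!same_span)
		return 0;
	/* WAKE_BALANCE adds nothing if the child already wakes affine. */
	if (cflags & SD_WAKE_AFFINE)
		pflags &= ~SD_WAKE_BALANCE;
	/* Group-based flags are moot if the parent has a single group. */
	if (parent_single_group)
		pflags &= ~(SD_LOAD_BALANCE | SD_BALANCE_EXEC);
	/* Any flag present in the parent but not the child keeps the parent. */
	return (~cflags & pflags) == 0;
}

int main(void)
{
	unsigned long child = SD_LOAD_BALANCE | SD_WAKE_AFFINE;

	printf("%d\n", parent_redundant(child, child | SD_WAKE_BALANCE, 1, 0)); /* 1 */
	printf("%d\n", parent_redundant(child, child | SD_BALANCE_EXEC, 1, 0)); /* 0 */
	return 0;
}
```

Collapsing such parents in cpu_attach_domain() shortens every for_each_domain() walk without changing the balancing behaviour the flags describe.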
| 4560 | /* | 4770 | /* |
| 4561 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 4771 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
| 4562 | * hold the hotplug lock. | 4772 | * hold the hotplug lock. |
| 4563 | */ | 4773 | */ |
| 4564 | void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) | 4774 | void cpu_attach_domain(struct sched_domain *sd, int cpu) |
| 4565 | { | 4775 | { |
| 4566 | migration_req_t req; | ||
| 4567 | unsigned long flags; | ||
| 4568 | runqueue_t *rq = cpu_rq(cpu); | 4776 | runqueue_t *rq = cpu_rq(cpu); |
| 4569 | int local = 1; | 4777 | struct sched_domain *tmp; |
| 4570 | |||
| 4571 | sched_domain_debug(sd, cpu); | ||
| 4572 | 4778 | ||
| 4573 | spin_lock_irqsave(&rq->lock, flags); | 4779 | /* Remove the sched domains which do not contribute to scheduling. */ |
| 4574 | 4780 | for (tmp = sd; tmp; tmp = tmp->parent) { | |
| 4575 | if (cpu == smp_processor_id() || !cpu_online(cpu)) { | 4781 | struct sched_domain *parent = tmp->parent; |
| 4576 | rq->sd = sd; | 4782 | if (!parent) |
| 4577 | } else { | 4783 | break; |
| 4578 | init_completion(&req.done); | 4784 | if (sd_parent_degenerate(tmp, parent)) |
| 4579 | req.type = REQ_SET_DOMAIN; | 4785 | tmp->parent = parent->parent; |
| 4580 | req.sd = sd; | ||
| 4581 | list_add(&req.list, &rq->migration_queue); | ||
| 4582 | local = 0; | ||
| 4583 | } | 4786 | } |
| 4584 | 4787 | ||
| 4585 | spin_unlock_irqrestore(&rq->lock, flags); | 4788 | if (sd && sd_degenerate(sd)) |
| 4789 | sd = sd->parent; | ||
| 4586 | 4790 | ||
| 4587 | if (!local) { | 4791 | sched_domain_debug(sd, cpu); |
| 4588 | wake_up_process(rq->migration_thread); | 4792 | |
| 4589 | wait_for_completion(&req.done); | 4793 | rcu_assign_pointer(rq->sd, sd); |
| 4590 | } | ||
| 4591 | } | 4794 | } |
| 4592 | 4795 | ||
| 4593 | /* cpus with isolated domains */ | 4796 | /* cpus with isolated domains */ |
| @@ -4619,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup); | |||
| 4619 | * covered by the given span, and will set each group's ->cpumask correctly, | 4822 | * covered by the given span, and will set each group's ->cpumask correctly, |
| 4620 | * and ->cpu_power to 0. | 4823 | * and ->cpu_power to 0. |
| 4621 | */ | 4824 | */ |
| 4622 | void __devinit init_sched_build_groups(struct sched_group groups[], | 4825 | void init_sched_build_groups(struct sched_group groups[], |
| 4623 | cpumask_t span, int (*group_fn)(int cpu)) | 4826 | cpumask_t span, int (*group_fn)(int cpu)) |
| 4624 | { | 4827 | { |
| 4625 | struct sched_group *first = NULL, *last = NULL; | 4828 | struct sched_group *first = NULL, *last = NULL; |
| @@ -4655,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[], | |||
| 4655 | 4858 | ||
| 4656 | 4859 | ||
| 4657 | #ifdef ARCH_HAS_SCHED_DOMAIN | 4860 | #ifdef ARCH_HAS_SCHED_DOMAIN |
| 4658 | extern void __devinit arch_init_sched_domains(void); | 4861 | extern void build_sched_domains(const cpumask_t *cpu_map); |
| 4659 | extern void __devinit arch_destroy_sched_domains(void); | 4862 | extern void arch_init_sched_domains(const cpumask_t *cpu_map); |
| 4863 | extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); | ||
| 4660 | #else | 4864 | #else |
| 4661 | #ifdef CONFIG_SCHED_SMT | 4865 | #ifdef CONFIG_SCHED_SMT |
| 4662 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 4866 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
| 4663 | static struct sched_group sched_group_cpus[NR_CPUS]; | 4867 | static struct sched_group sched_group_cpus[NR_CPUS]; |
| 4664 | static int __devinit cpu_to_cpu_group(int cpu) | 4868 | static int cpu_to_cpu_group(int cpu) |
| 4665 | { | 4869 | { |
| 4666 | return cpu; | 4870 | return cpu; |
| 4667 | } | 4871 | } |
| @@ -4669,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu) | |||
| 4669 | 4873 | ||
| 4670 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 4874 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
| 4671 | static struct sched_group sched_group_phys[NR_CPUS]; | 4875 | static struct sched_group sched_group_phys[NR_CPUS]; |
| 4672 | static int __devinit cpu_to_phys_group(int cpu) | 4876 | static int cpu_to_phys_group(int cpu) |
| 4673 | { | 4877 | { |
| 4674 | #ifdef CONFIG_SCHED_SMT | 4878 | #ifdef CONFIG_SCHED_SMT |
| 4675 | return first_cpu(cpu_sibling_map[cpu]); | 4879 | return first_cpu(cpu_sibling_map[cpu]); |
| @@ -4682,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu) | |||
| 4682 | 4886 | ||
| 4683 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 4887 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
| 4684 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; | 4888 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; |
| 4685 | static int __devinit cpu_to_node_group(int cpu) | 4889 | static int cpu_to_node_group(int cpu) |
| 4686 | { | 4890 | { |
| 4687 | return cpu_to_node(cpu); | 4891 | return cpu_to_node(cpu); |
| 4688 | } | 4892 | } |
| @@ -4713,39 +4917,28 @@ static void check_sibling_maps(void) | |||
| 4713 | #endif | 4917 | #endif |
| 4714 | 4918 | ||
| 4715 | /* | 4919 | /* |
| 4716 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 4920 | * Build sched domains for a given set of cpus and attach the sched domains |
| 4921 | * to the individual cpus | ||
| 4717 | */ | 4922 | */ |
| 4718 | static void __devinit arch_init_sched_domains(void) | 4923 | static void build_sched_domains(const cpumask_t *cpu_map) |
| 4719 | { | 4924 | { |
| 4720 | int i; | 4925 | int i; |
| 4721 | cpumask_t cpu_default_map; | ||
| 4722 | |||
| 4723 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
| 4724 | check_sibling_maps(); | ||
| 4725 | #endif | ||
| 4726 | /* | ||
| 4727 | * Setup mask for cpus without special case scheduling requirements. | ||
| 4728 | * For now this just excludes isolated cpus, but could be used to | ||
| 4729 | * exclude other special cases in the future. | ||
| 4730 | */ | ||
| 4731 | cpus_complement(cpu_default_map, cpu_isolated_map); | ||
| 4732 | cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); | ||
| 4733 | 4926 | ||
| 4734 | /* | 4927 | /* |
| 4735 | * Set up domains. Isolated domains just stay on the dummy domain. | 4928 | * Set up domains for cpus specified by the cpu_map. |
| 4736 | */ | 4929 | */ |
| 4737 | for_each_cpu_mask(i, cpu_default_map) { | 4930 | for_each_cpu_mask(i, *cpu_map) { |
| 4738 | int group; | 4931 | int group; |
| 4739 | struct sched_domain *sd = NULL, *p; | 4932 | struct sched_domain *sd = NULL, *p; |
| 4740 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 4933 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
| 4741 | 4934 | ||
| 4742 | cpus_and(nodemask, nodemask, cpu_default_map); | 4935 | cpus_and(nodemask, nodemask, *cpu_map); |
| 4743 | 4936 | ||
| 4744 | #ifdef CONFIG_NUMA | 4937 | #ifdef CONFIG_NUMA |
| 4745 | sd = &per_cpu(node_domains, i); | 4938 | sd = &per_cpu(node_domains, i); |
| 4746 | group = cpu_to_node_group(i); | 4939 | group = cpu_to_node_group(i); |
| 4747 | *sd = SD_NODE_INIT; | 4940 | *sd = SD_NODE_INIT; |
| 4748 | sd->span = cpu_default_map; | 4941 | sd->span = *cpu_map; |
| 4749 | sd->groups = &sched_group_nodes[group]; | 4942 | sd->groups = &sched_group_nodes[group]; |
| 4750 | #endif | 4943 | #endif |
| 4751 | 4944 | ||
| @@ -4763,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void) | |||
| 4763 | group = cpu_to_cpu_group(i); | 4956 | group = cpu_to_cpu_group(i); |
| 4764 | *sd = SD_SIBLING_INIT; | 4957 | *sd = SD_SIBLING_INIT; |
| 4765 | sd->span = cpu_sibling_map[i]; | 4958 | sd->span = cpu_sibling_map[i]; |
| 4766 | cpus_and(sd->span, sd->span, cpu_default_map); | 4959 | cpus_and(sd->span, sd->span, *cpu_map); |
| 4767 | sd->parent = p; | 4960 | sd->parent = p; |
| 4768 | sd->groups = &sched_group_cpus[group]; | 4961 | sd->groups = &sched_group_cpus[group]; |
| 4769 | #endif | 4962 | #endif |
| @@ -4773,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void) | |||
| 4773 | /* Set up CPU (sibling) groups */ | 4966 | /* Set up CPU (sibling) groups */ |
| 4774 | for_each_online_cpu(i) { | 4967 | for_each_online_cpu(i) { |
| 4775 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 4968 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
| 4776 | cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); | 4969 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
| 4777 | if (i != first_cpu(this_sibling_map)) | 4970 | if (i != first_cpu(this_sibling_map)) |
| 4778 | continue; | 4971 | continue; |
| 4779 | 4972 | ||
| @@ -4786,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void) | |||
| 4786 | for (i = 0; i < MAX_NUMNODES; i++) { | 4979 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 4787 | cpumask_t nodemask = node_to_cpumask(i); | 4980 | cpumask_t nodemask = node_to_cpumask(i); |
| 4788 | 4981 | ||
| 4789 | cpus_and(nodemask, nodemask, cpu_default_map); | 4982 | cpus_and(nodemask, nodemask, *cpu_map); |
| 4790 | if (cpus_empty(nodemask)) | 4983 | if (cpus_empty(nodemask)) |
| 4791 | continue; | 4984 | continue; |
| 4792 | 4985 | ||
| @@ -4796,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void) | |||
| 4796 | 4989 | ||
| 4797 | #ifdef CONFIG_NUMA | 4990 | #ifdef CONFIG_NUMA |
| 4798 | /* Set up node groups */ | 4991 | /* Set up node groups */ |
| 4799 | init_sched_build_groups(sched_group_nodes, cpu_default_map, | 4992 | init_sched_build_groups(sched_group_nodes, *cpu_map, |
| 4800 | &cpu_to_node_group); | 4993 | &cpu_to_node_group); |
| 4801 | #endif | 4994 | #endif |
| 4802 | 4995 | ||
| 4803 | /* Calculate CPU power for physical packages and nodes */ | 4996 | /* Calculate CPU power for physical packages and nodes */ |
| 4804 | for_each_cpu_mask(i, cpu_default_map) { | 4997 | for_each_cpu_mask(i, *cpu_map) { |
| 4805 | int power; | 4998 | int power; |
| 4806 | struct sched_domain *sd; | 4999 | struct sched_domain *sd; |
| 4807 | #ifdef CONFIG_SCHED_SMT | 5000 | #ifdef CONFIG_SCHED_SMT |
| @@ -4825,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void) | |||
| 4825 | } | 5018 | } |
| 4826 | 5019 | ||
| 4827 | /* Attach the domains */ | 5020 | /* Attach the domains */ |
| 4828 | for_each_online_cpu(i) { | 5021 | for_each_cpu_mask(i, *cpu_map) { |
| 4829 | struct sched_domain *sd; | 5022 | struct sched_domain *sd; |
| 4830 | #ifdef CONFIG_SCHED_SMT | 5023 | #ifdef CONFIG_SCHED_SMT |
| 4831 | sd = &per_cpu(cpu_domains, i); | 5024 | sd = &per_cpu(cpu_domains, i); |
| @@ -4835,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void) | |||
| 4835 | cpu_attach_domain(sd, i); | 5028 | cpu_attach_domain(sd, i); |
| 4836 | } | 5029 | } |
| 4837 | } | 5030 | } |
| 5031 | /* | ||
| 5032 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
| 5033 | */ | ||
| 5034 | static void arch_init_sched_domains(cpumask_t *cpu_map) | ||
| 5035 | { | ||
| 5036 | cpumask_t cpu_default_map; | ||
| 5037 | |||
| 5038 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
| 5039 | check_sibling_maps(); | ||
| 5040 | #endif | ||
| 5041 | /* | ||
| 5042 | * Setup mask for cpus without special case scheduling requirements. | ||
| 5043 | * For now this just excludes isolated cpus, but could be used to | ||
| 5044 | * exclude other special cases in the future. | ||
| 5045 | */ | ||
| 5046 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | ||
| 5047 | |||
| 5048 | build_sched_domains(&cpu_default_map); | ||
| 5049 | } | ||
| 4838 | 5050 | ||
| 4839 | #ifdef CONFIG_HOTPLUG_CPU | 5051 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
| 4840 | static void __devinit arch_destroy_sched_domains(void) | ||
| 4841 | { | 5052 | { |
| 4842 | /* Do nothing: everything is statically allocated. */ | 5053 | /* Do nothing: everything is statically allocated. */ |
| 4843 | } | 5054 | } |
| 4844 | #endif | ||
| 4845 | 5055 | ||
| 4846 | #endif /* ARCH_HAS_SCHED_DOMAIN */ | 5056 | #endif /* ARCH_HAS_SCHED_DOMAIN */ |
| 4847 | 5057 | ||
| 4848 | /* | 5058 | /* |
| 4849 | * Initial dummy domain for early boot and for hotplug cpu. Being static, | 5059 | * Detach sched domains from a group of cpus specified in cpu_map |
| 4850 | * it is initialized to zero, so all balancing flags are cleared which is | 5060 | * These cpus will now be attached to the NULL domain |
| 4851 | * what we want. | ||
| 4852 | */ | 5061 | */ |
| 4853 | static struct sched_domain sched_domain_dummy; | 5062 | static inline void detach_destroy_domains(const cpumask_t *cpu_map) |
| 5063 | { | ||
| 5064 | int i; | ||
| 5065 | |||
| 5066 | for_each_cpu_mask(i, *cpu_map) | ||
| 5067 | cpu_attach_domain(NULL, i); | ||
| 5068 | synchronize_sched(); | ||
| 5069 | arch_destroy_sched_domains(cpu_map); | ||
| 5070 | } | ||
| 5071 | |||
| 5072 | /* | ||
| 5073 | * Partition sched domains as specified by the cpumasks below. | ||
| 5074 | * This attaches all cpus from the cpumasks to the NULL domain, | ||
| 5075 | * waits for a RCU quiescent period, recalculates sched | ||
| 5076 | * domain information and then attaches them back to the | ||
| 5077 | * correct sched domains | ||
| 5078 | * Call with hotplug lock held | ||
| 5079 | */ | ||
| 5080 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | ||
| 5081 | { | ||
| 5082 | cpumask_t change_map; | ||
| 5083 | |||
| 5084 | cpus_and(*partition1, *partition1, cpu_online_map); | ||
| 5085 | cpus_and(*partition2, *partition2, cpu_online_map); | ||
| 5086 | cpus_or(change_map, *partition1, *partition2); | ||
| 5087 | |||
| 5088 | /* Detach sched domains from all of the affected cpus */ | ||
| 5089 | detach_destroy_domains(&change_map); | ||
| 5090 | if (!cpus_empty(*partition1)) | ||
| 5091 | build_sched_domains(partition1); | ||
| 5092 | if (!cpus_empty(*partition2)) | ||
| 5093 | build_sched_domains(partition2); | ||
| 5094 | } | ||
| 4854 | 5095 | ||
| 4855 | #ifdef CONFIG_HOTPLUG_CPU | 5096 | #ifdef CONFIG_HOTPLUG_CPU |
| 4856 | /* | 5097 | /* |
| 4857 | * Force a reinitialization of the sched domains hierarchy. The domains | 5098 | * Force a reinitialization of the sched domains hierarchy. The domains |
| 4858 | * and groups cannot be updated in place without racing with the balancing | 5099 | * and groups cannot be updated in place without racing with the balancing |
| 4859 | * code, so we temporarily attach all running cpus to a "dummy" domain | 5100 | * code, so we temporarily attach all running cpus to the NULL domain |
| 4860 | * which will prevent rebalancing while the sched domains are recalculated. | 5101 | * which will prevent rebalancing while the sched domains are recalculated. |
| 4861 | */ | 5102 | */ |
| 4862 | static int update_sched_domains(struct notifier_block *nfb, | 5103 | static int update_sched_domains(struct notifier_block *nfb, |
| 4863 | unsigned long action, void *hcpu) | 5104 | unsigned long action, void *hcpu) |
| 4864 | { | 5105 | { |
| 4865 | int i; | ||
| 4866 | |||
| 4867 | switch (action) { | 5106 | switch (action) { |
| 4868 | case CPU_UP_PREPARE: | 5107 | case CPU_UP_PREPARE: |
| 4869 | case CPU_DOWN_PREPARE: | 5108 | case CPU_DOWN_PREPARE: |
| 4870 | for_each_online_cpu(i) | 5109 | detach_destroy_domains(&cpu_online_map); |
| 4871 | cpu_attach_domain(&sched_domain_dummy, i); | ||
| 4872 | arch_destroy_sched_domains(); | ||
| 4873 | return NOTIFY_OK; | 5110 | return NOTIFY_OK; |
| 4874 | 5111 | ||
| 4875 | case CPU_UP_CANCELED: | 5112 | case CPU_UP_CANCELED: |
| @@ -4885,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
| 4885 | } | 5122 | } |
| 4886 | 5123 | ||
| 4887 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 5124 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
| 4888 | arch_init_sched_domains(); | 5125 | arch_init_sched_domains(&cpu_online_map); |
| 4889 | 5126 | ||
| 4890 | return NOTIFY_OK; | 5127 | return NOTIFY_OK; |
| 4891 | } | 5128 | } |
| @@ -4894,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
| 4894 | void __init sched_init_smp(void) | 5131 | void __init sched_init_smp(void) |
| 4895 | { | 5132 | { |
| 4896 | lock_cpu_hotplug(); | 5133 | lock_cpu_hotplug(); |
| 4897 | arch_init_sched_domains(); | 5134 | arch_init_sched_domains(&cpu_online_map); |
| 4898 | unlock_cpu_hotplug(); | 5135 | unlock_cpu_hotplug(); |
| 4899 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 5136 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
| 4900 | hotcpu_notifier(update_sched_domains, 0); | 5137 | hotcpu_notifier(update_sched_domains, 0); |
| @@ -4924,13 +5161,15 @@ void __init sched_init(void) | |||
| 4924 | 5161 | ||
| 4925 | rq = cpu_rq(i); | 5162 | rq = cpu_rq(i); |
| 4926 | spin_lock_init(&rq->lock); | 5163 | spin_lock_init(&rq->lock); |
| 5164 | rq->nr_running = 0; | ||
| 4927 | rq->active = rq->arrays; | 5165 | rq->active = rq->arrays; |
| 4928 | rq->expired = rq->arrays + 1; | 5166 | rq->expired = rq->arrays + 1; |
| 4929 | rq->best_expired_prio = MAX_PRIO; | 5167 | rq->best_expired_prio = MAX_PRIO; |
| 4930 | 5168 | ||
| 4931 | #ifdef CONFIG_SMP | 5169 | #ifdef CONFIG_SMP |
| 4932 | rq->sd = &sched_domain_dummy; | 5170 | rq->sd = NULL; |
| 4933 | rq->cpu_load = 0; | 5171 | for (j = 1; j < 3; j++) |
| 5172 | rq->cpu_load[j] = 0; | ||
| 4934 | rq->active_balance = 0; | 5173 | rq->active_balance = 0; |
| 4935 | rq->push_cpu = 0; | 5174 | rq->push_cpu = 0; |
| 4936 | rq->migration_thread = NULL; | 5175 | rq->migration_thread = NULL; |
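
Taken together, the sched.c changes replace the static dummy domain with a NULL domain and expose partition_sched_domains() for callers that want to split the machine into independently balanced sets of cpus. A hedged sketch of such a caller follows; the mask name my_exclusive_cpus and the helper rebuild_domains are invented for illustration, and the real users of this interface are outside this diff:

    #include <linux/cpumask.h>
    #include <linux/cpu.h>
    #include <linux/sched.h>

    /* Hypothetical caller: give 'my_exclusive_cpus' its own domain tree,
     * isolated from the rest of the online cpus. */
    static void rebuild_domains(cpumask_t my_exclusive_cpus)
    {
        cpumask_t rest;

        lock_cpu_hotplug();     /* partition_sched_domains() requires it */
        cpus_andnot(rest, cpu_online_map, my_exclusive_cpus);
        /* Both masks are ANDed with cpu_online_map internally, and every
         * cpu in their union is first detached to the NULL domain. */
        partition_sched_domains(&my_exclusive_cpus, &rest);
        unlock_cpu_hotplug();
    }
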
diff --git a/kernel/signal.c b/kernel/signal.c index b3c24c732c5a..ca1186eef938 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <linux/ptrace.h> | 24 | #include <linux/ptrace.h> |
| 25 | #include <linux/posix-timers.h> | 25 | #include <linux/posix-timers.h> |
| 26 | #include <linux/signal.h> | 26 | #include <linux/signal.h> |
| 27 | #include <linux/audit.h> | ||
| 27 | #include <asm/param.h> | 28 | #include <asm/param.h> |
| 28 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
| 29 | #include <asm/unistd.h> | 30 | #include <asm/unistd.h> |
| @@ -212,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | |||
| 212 | fastcall void recalc_sigpending_tsk(struct task_struct *t) | 213 | fastcall void recalc_sigpending_tsk(struct task_struct *t) |
| 213 | { | 214 | { |
| 214 | if (t->signal->group_stop_count > 0 || | 215 | if (t->signal->group_stop_count > 0 || |
| 216 | (freezing(t)) || | ||
| 215 | PENDING(&t->pending, &t->blocked) || | 217 | PENDING(&t->pending, &t->blocked) || |
| 216 | PENDING(&t->signal->shared_pending, &t->blocked)) | 218 | PENDING(&t->signal->shared_pending, &t->blocked)) |
| 217 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 219 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
| @@ -667,7 +669,11 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 667 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | 669 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) |
| 668 | && !capable(CAP_KILL)) | 670 | && !capable(CAP_KILL)) |
| 669 | return error; | 671 | return error; |
| 670 | return security_task_kill(t, info, sig); | 672 | |
| 673 | error = security_task_kill(t, info, sig); | ||
| 674 | if (!error) | ||
| 675 | audit_signal_info(sig, t); /* Let audit system see the signal */ | ||
| 676 | return error; | ||
| 671 | } | 677 | } |
| 672 | 678 | ||
| 673 | /* forward decl */ | 679 | /* forward decl */ |
| @@ -2225,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, | |||
| 2225 | current->state = TASK_INTERRUPTIBLE; | 2231 | current->state = TASK_INTERRUPTIBLE; |
| 2226 | timeout = schedule_timeout(timeout); | 2232 | timeout = schedule_timeout(timeout); |
| 2227 | 2233 | ||
| 2228 | if (current->flags & PF_FREEZE) | 2234 | try_to_freeze(); |
| 2229 | refrigerator(PF_FREEZE); | ||
| 2230 | spin_lock_irq(¤t->sighand->siglock); | 2235 | spin_lock_irq(¤t->sighand->siglock); |
| 2231 | sig = dequeue_signal(current, &these, &info); | 2236 | sig = dequeue_signal(current, &these, &info); |
| 2232 | current->blocked = current->real_blocked; | 2237 | current->blocked = current->real_blocked; |
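
Two freezer-related tweaks show up in signal.c: recalc_sigpending_tsk() now raises TIF_SIGPENDING for tasks that are freezing, and the open-coded PF_FREEZE check in sys_rt_sigtimedwait() becomes try_to_freeze(). The usual shape of that call in a sleeping kernel thread looks roughly like the sketch below; my_worker is an invented name, and in this era try_to_freeze() is assumed to come in via linux/sched.h:

    #include <linux/kthread.h>
    #include <linux/sched.h>    /* try_to_freeze() in this era (assumption) */

    static int my_worker(void *unused)
    {
        while (!kthread_should_stop()) {
            set_current_state(TASK_INTERRUPTIBLE);
            schedule_timeout(HZ);

            /* If the freezer marked this task as freezing, enter the
             * refrigerator here and sleep until resume. */
            try_to_freeze();

            /* ... do the periodic work ... */
        }
        return 0;
    }
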
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6116b25aa7cf..84a9d18aa8da 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -100,7 +100,7 @@ static int stop_machine(void) | |||
| 100 | stopmachine_state = STOPMACHINE_WAIT; | 100 | stopmachine_state = STOPMACHINE_WAIT; |
| 101 | 101 | ||
| 102 | for_each_online_cpu(i) { | 102 | for_each_online_cpu(i) { |
| 103 | if (i == _smp_processor_id()) | 103 | if (i == raw_smp_processor_id()) |
| 104 | continue; | 104 | continue; |
| 105 | ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); | 105 | ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); |
| 106 | if (ret < 0) | 106 | if (ret < 0) |
| @@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, | |||
| 182 | 182 | ||
| 183 | /* If they don't care which CPU fn runs on, bind to any online one. */ | 183 | /* If they don't care which CPU fn runs on, bind to any online one. */ |
| 184 | if (cpu == NR_CPUS) | 184 | if (cpu == NR_CPUS) |
| 185 | cpu = _smp_processor_id(); | 185 | cpu = raw_smp_processor_id(); |
| 186 | 186 | ||
| 187 | p = kthread_create(do_stop, &smdata, "kstopmachine"); | 187 | p = kthread_create(do_stop, &smdata, "kstopmachine"); |
| 188 | if (!IS_ERR(p)) { | 188 | if (!IS_ERR(p)) { |
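
_smp_processor_id() is renamed to raw_smp_processor_id() here; the raw variant skips the "called from preemptible context" debugging check and is meant for places like the two above, where the returned CPU number is only advisory. A hedged sketch of when each form is appropriate (report_cpu is an invented helper):

    #include <linux/smp.h>
    #include <linux/preempt.h>
    #include <linux/kernel.h>

    static void report_cpu(void)
    {
        int cpu;

        /* Correct use: preemption disabled, the value cannot go stale. */
        preempt_disable();
        cpu = smp_processor_id();
        printk(KERN_DEBUG "running on cpu %d\n", cpu);
        preempt_enable();

        /* Advisory use: any plausible CPU will do, so the preemption
         * check behind smp_processor_id() would only get in the way. */
        cpu = raw_smp_processor_id();
        printk(KERN_DEBUG "roughly on cpu %d\n", cpu);
    }
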
diff --git a/kernel/sys.c b/kernel/sys.c index f006632c2ba7..9a24374c23bc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -16,6 +16,8 @@ | |||
| 16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
| 17 | #include <linux/highuid.h> | 17 | #include <linux/highuid.h> |
| 18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
| 19 | #include <linux/kernel.h> | ||
| 20 | #include <linux/kexec.h> | ||
| 19 | #include <linux/workqueue.h> | 21 | #include <linux/workqueue.h> |
| 20 | #include <linux/device.h> | 22 | #include <linux/device.h> |
| 21 | #include <linux/key.h> | 23 | #include <linux/key.h> |
| @@ -405,6 +407,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
| 405 | case LINUX_REBOOT_CMD_HALT: | 407 | case LINUX_REBOOT_CMD_HALT: |
| 406 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); | 408 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); |
| 407 | system_state = SYSTEM_HALT; | 409 | system_state = SYSTEM_HALT; |
| 410 | device_suspend(PMSG_SUSPEND); | ||
| 408 | device_shutdown(); | 411 | device_shutdown(); |
| 409 | printk(KERN_EMERG "System halted.\n"); | 412 | printk(KERN_EMERG "System halted.\n"); |
| 410 | machine_halt(); | 413 | machine_halt(); |
| @@ -415,6 +418,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
| 415 | case LINUX_REBOOT_CMD_POWER_OFF: | 418 | case LINUX_REBOOT_CMD_POWER_OFF: |
| 416 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); | 419 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); |
| 417 | system_state = SYSTEM_POWER_OFF; | 420 | system_state = SYSTEM_POWER_OFF; |
| 421 | device_suspend(PMSG_SUSPEND); | ||
| 418 | device_shutdown(); | 422 | device_shutdown(); |
| 419 | printk(KERN_EMERG "Power down.\n"); | 423 | printk(KERN_EMERG "Power down.\n"); |
| 420 | machine_power_off(); | 424 | machine_power_off(); |
| @@ -431,11 +435,30 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
| 431 | 435 | ||
| 432 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); | 436 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); |
| 433 | system_state = SYSTEM_RESTART; | 437 | system_state = SYSTEM_RESTART; |
| 438 | device_suspend(PMSG_FREEZE); | ||
| 434 | device_shutdown(); | 439 | device_shutdown(); |
| 435 | printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); | 440 | printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); |
| 436 | machine_restart(buffer); | 441 | machine_restart(buffer); |
| 437 | break; | 442 | break; |
| 438 | 443 | ||
| 444 | #ifdef CONFIG_KEXEC | ||
| 445 | case LINUX_REBOOT_CMD_KEXEC: | ||
| 446 | { | ||
| 447 | struct kimage *image; | ||
| 448 | image = xchg(&kexec_image, 0); | ||
| 449 | if (!image) { | ||
| 450 | unlock_kernel(); | ||
| 451 | return -EINVAL; | ||
| 452 | } | ||
| 453 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | ||
| 454 | system_state = SYSTEM_RESTART; | ||
| 455 | device_shutdown(); | ||
| 456 | printk(KERN_EMERG "Starting new kernel\n"); | ||
| 457 | machine_shutdown(); | ||
| 458 | machine_kexec(image); | ||
| 459 | break; | ||
| 460 | } | ||
| 461 | #endif | ||
| 439 | #ifdef CONFIG_SOFTWARE_SUSPEND | 462 | #ifdef CONFIG_SOFTWARE_SUSPEND |
| 440 | case LINUX_REBOOT_CMD_SW_SUSPEND: | 463 | case LINUX_REBOOT_CMD_SW_SUSPEND: |
| 441 | { | 464 | { |
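
The new LINUX_REBOOT_CMD_KEXEC branch fires only if an image was staged beforehand through sys_kexec_load(); otherwise it returns -EINVAL. From userspace the jump is triggered with the reboot() syscall, which is essentially what kexec-tools does for "kexec -e" after "kexec -l"; a hedged sketch:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/reboot.h>

    int main(void)
    {
        /* Assumes an image was already staged with kexec_load()
         * (what "kexec -l <kernel>" does). */
        long ret = syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
                           LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_KEXEC, NULL);
        if (ret < 0)
            perror("reboot(LINUX_REBOOT_CMD_KEXEC)"); /* EINVAL if no image */
        return 1;   /* reached only on failure */
    }
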
| @@ -525,7 +548,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) | |||
| 525 | } | 548 | } |
| 526 | if (new_egid != old_egid) | 549 | if (new_egid != old_egid) |
| 527 | { | 550 | { |
| 528 | current->mm->dumpable = 0; | 551 | current->mm->dumpable = suid_dumpable; |
| 529 | smp_wmb(); | 552 | smp_wmb(); |
| 530 | } | 553 | } |
| 531 | if (rgid != (gid_t) -1 || | 554 | if (rgid != (gid_t) -1 || |
| @@ -556,7 +579,7 @@ asmlinkage long sys_setgid(gid_t gid) | |||
| 556 | { | 579 | { |
| 557 | if(old_egid != gid) | 580 | if(old_egid != gid) |
| 558 | { | 581 | { |
| 559 | current->mm->dumpable=0; | 582 | current->mm->dumpable = suid_dumpable; |
| 560 | smp_wmb(); | 583 | smp_wmb(); |
| 561 | } | 584 | } |
| 562 | current->gid = current->egid = current->sgid = current->fsgid = gid; | 585 | current->gid = current->egid = current->sgid = current->fsgid = gid; |
| @@ -565,7 +588,7 @@ asmlinkage long sys_setgid(gid_t gid) | |||
| 565 | { | 588 | { |
| 566 | if(old_egid != gid) | 589 | if(old_egid != gid) |
| 567 | { | 590 | { |
| 568 | current->mm->dumpable=0; | 591 | current->mm->dumpable = suid_dumpable; |
| 569 | smp_wmb(); | 592 | smp_wmb(); |
| 570 | } | 593 | } |
| 571 | current->egid = current->fsgid = gid; | 594 | current->egid = current->fsgid = gid; |
| @@ -596,7 +619,7 @@ static int set_user(uid_t new_ruid, int dumpclear) | |||
| 596 | 619 | ||
| 597 | if(dumpclear) | 620 | if(dumpclear) |
| 598 | { | 621 | { |
| 599 | current->mm->dumpable = 0; | 622 | current->mm->dumpable = suid_dumpable; |
| 600 | smp_wmb(); | 623 | smp_wmb(); |
| 601 | } | 624 | } |
| 602 | current->uid = new_ruid; | 625 | current->uid = new_ruid; |
| @@ -653,7 +676,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | |||
| 653 | 676 | ||
| 654 | if (new_euid != old_euid) | 677 | if (new_euid != old_euid) |
| 655 | { | 678 | { |
| 656 | current->mm->dumpable=0; | 679 | current->mm->dumpable = suid_dumpable; |
| 657 | smp_wmb(); | 680 | smp_wmb(); |
| 658 | } | 681 | } |
| 659 | current->fsuid = current->euid = new_euid; | 682 | current->fsuid = current->euid = new_euid; |
| @@ -703,7 +726,7 @@ asmlinkage long sys_setuid(uid_t uid) | |||
| 703 | 726 | ||
| 704 | if (old_euid != uid) | 727 | if (old_euid != uid) |
| 705 | { | 728 | { |
| 706 | current->mm->dumpable = 0; | 729 | current->mm->dumpable = suid_dumpable; |
| 707 | smp_wmb(); | 730 | smp_wmb(); |
| 708 | } | 731 | } |
| 709 | current->fsuid = current->euid = uid; | 732 | current->fsuid = current->euid = uid; |
| @@ -748,7 +771,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) | |||
| 748 | if (euid != (uid_t) -1) { | 771 | if (euid != (uid_t) -1) { |
| 749 | if (euid != current->euid) | 772 | if (euid != current->euid) |
| 750 | { | 773 | { |
| 751 | current->mm->dumpable = 0; | 774 | current->mm->dumpable = suid_dumpable; |
| 752 | smp_wmb(); | 775 | smp_wmb(); |
| 753 | } | 776 | } |
| 754 | current->euid = euid; | 777 | current->euid = euid; |
| @@ -798,7 +821,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) | |||
| 798 | if (egid != (gid_t) -1) { | 821 | if (egid != (gid_t) -1) { |
| 799 | if (egid != current->egid) | 822 | if (egid != current->egid) |
| 800 | { | 823 | { |
| 801 | current->mm->dumpable = 0; | 824 | current->mm->dumpable = suid_dumpable; |
| 802 | smp_wmb(); | 825 | smp_wmb(); |
| 803 | } | 826 | } |
| 804 | current->egid = egid; | 827 | current->egid = egid; |
| @@ -845,7 +868,7 @@ asmlinkage long sys_setfsuid(uid_t uid) | |||
| 845 | { | 868 | { |
| 846 | if (uid != old_fsuid) | 869 | if (uid != old_fsuid) |
| 847 | { | 870 | { |
| 848 | current->mm->dumpable = 0; | 871 | current->mm->dumpable = suid_dumpable; |
| 849 | smp_wmb(); | 872 | smp_wmb(); |
| 850 | } | 873 | } |
| 851 | current->fsuid = uid; | 874 | current->fsuid = uid; |
| @@ -875,7 +898,7 @@ asmlinkage long sys_setfsgid(gid_t gid) | |||
| 875 | { | 898 | { |
| 876 | if (gid != old_fsgid) | 899 | if (gid != old_fsgid) |
| 877 | { | 900 | { |
| 878 | current->mm->dumpable = 0; | 901 | current->mm->dumpable = suid_dumpable; |
| 879 | smp_wmb(); | 902 | smp_wmb(); |
| 880 | } | 903 | } |
| 881 | current->fsgid = gid; | 904 | current->fsgid = gid; |
| @@ -894,35 +917,69 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
| 894 | */ | 917 | */ |
| 895 | if (tbuf) { | 918 | if (tbuf) { |
| 896 | struct tms tmp; | 919 | struct tms tmp; |
| 897 | struct task_struct *tsk = current; | ||
| 898 | struct task_struct *t; | ||
| 899 | cputime_t utime, stime, cutime, cstime; | 920 | cputime_t utime, stime, cutime, cstime; |
| 900 | 921 | ||
| 901 | read_lock(&tasklist_lock); | 922 | #ifdef CONFIG_SMP |
| 902 | utime = tsk->signal->utime; | 923 | if (thread_group_empty(current)) { |
| 903 | stime = tsk->signal->stime; | 924 | /* |
| 904 | t = tsk; | 925 | * Single thread case without the use of any locks. |
| 905 | do { | 926 | * |
| 906 | utime = cputime_add(utime, t->utime); | 928 | * executing. However, release_task first adds up the |
| 907 | stime = cputime_add(stime, t->stime); | 928 | * executing. However, release task first adds up the |
| 908 | t = next_thread(t); | 929 | * counters (__exit_signal) before removing the task |
| 909 | } while (t != tsk); | 930 | * from the process tasklist (__unhash_process). |
| 910 | 931 | * __exit_signal also acquires and releases the | |
| 911 | /* | 932 | * siglock which results in the proper memory ordering |
| 912 | * While we have tasklist_lock read-locked, no dying thread | 933 | * so that the list modifications are always visible |
| 913 | * can be updating current->signal->[us]time. Instead, | 934 | * after the counters have been updated. |
| 914 | * we got their counts included in the live thread loop. | 935 | * |
| 915 | * However, another thread can come in right now and | 936 | * If the counters have been updated by the second thread |
| 916 | * do a wait call that updates current->signal->c[us]time. | 937 | * but the thread has not yet been removed from the list |
| 917 | * To make sure we always see that pair updated atomically, | 938 | * then the other branch will be executing which will |
| 918 | * we take the siglock around fetching them. | 939 | * block on tasklist_lock until the exit handling of the |
| 919 | */ | 940 | * other task is finished. |
| 920 | spin_lock_irq(&tsk->sighand->siglock); | 941 | * |
| 921 | cutime = tsk->signal->cutime; | 942 | * This also implies that the sighand->siglock cannot |
| 922 | cstime = tsk->signal->cstime; | 943 | * be held by another processor. So we can also |
| 923 | spin_unlock_irq(&tsk->sighand->siglock); | 944 | * skip acquiring that lock. |
| 924 | read_unlock(&tasklist_lock); | 945 | */ |
| 946 | utime = cputime_add(current->signal->utime, current->utime); | ||
| 947 | stime = cputime_add(current->signal->stime, current->stime); | ||
| 948 | cutime = current->signal->cutime; | ||
| 949 | cstime = current->signal->cstime; | ||
| 950 | } else | ||
| 951 | #endif | ||
| 952 | { | ||
| 925 | 953 | ||
| 954 | /* Process with multiple threads */ | ||
| 955 | struct task_struct *tsk = current; | ||
| 956 | struct task_struct *t; | ||
| 957 | |||
| 958 | read_lock(&tasklist_lock); | ||
| 959 | utime = tsk->signal->utime; | ||
| 960 | stime = tsk->signal->stime; | ||
| 961 | t = tsk; | ||
| 962 | do { | ||
| 963 | utime = cputime_add(utime, t->utime); | ||
| 964 | stime = cputime_add(stime, t->stime); | ||
| 965 | t = next_thread(t); | ||
| 966 | } while (t != tsk); | ||
| 967 | |||
| 968 | /* | ||
| 969 | * While we have tasklist_lock read-locked, no dying thread | ||
| 970 | * can be updating current->signal->[us]time. Instead, | ||
| 971 | * we got their counts included in the live thread loop. | ||
| 972 | * However, another thread can come in right now and | ||
| 973 | * do a wait call that updates current->signal->c[us]time. | ||
| 974 | * To make sure we always see that pair updated atomically, | ||
| 975 | * we take the siglock around fetching them. | ||
| 976 | */ | ||
| 977 | spin_lock_irq(&tsk->sighand->siglock); | ||
| 978 | cutime = tsk->signal->cutime; | ||
| 979 | cstime = tsk->signal->cstime; | ||
| 980 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 981 | read_unlock(&tasklist_lock); | ||
| 982 | } | ||
| 926 | tmp.tms_utime = cputime_to_clock_t(utime); | 983 | tmp.tms_utime = cputime_to_clock_t(utime); |
| 927 | tmp.tms_stime = cputime_to_clock_t(stime); | 984 | tmp.tms_stime = cputime_to_clock_t(stime); |
| 928 | tmp.tms_cutime = cputime_to_clock_t(cutime); | 985 | tmp.tms_cutime = cputime_to_clock_t(cutime); |
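
The sys_times() rework changes only the locking: a single-threaded caller reads its counters without tasklist_lock or siglock, while the multi-threaded path keeps the original locked summation. Either way the same four fields reach userspace through times(2), as in this small reference program:

    #include <sys/times.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
        struct tms t;
        long tck = sysconf(_SC_CLK_TCK);

        if (times(&t) == (clock_t)-1)
            return 1;

        /* utime/stime come from the calling thread group,
         * cutime/cstime from already-waited-for children. */
        printf("user         %.2fs\n", (double)t.tms_utime / tck);
        printf("system       %.2fs\n", (double)t.tms_stime / tck);
        printf("child user   %.2fs\n", (double)t.tms_cutime / tck);
        printf("child system %.2fs\n", (double)t.tms_cstime / tck);
        return 0;
    }
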
| @@ -1225,7 +1282,7 @@ static void groups_sort(struct group_info *group_info) | |||
| 1225 | } | 1282 | } |
| 1226 | 1283 | ||
| 1227 | /* a simple bsearch */ | 1284 | /* a simple bsearch */ |
| 1228 | static int groups_search(struct group_info *group_info, gid_t grp) | 1285 | int groups_search(struct group_info *group_info, gid_t grp) |
| 1229 | { | 1286 | { |
| 1230 | int left, right; | 1287 | int left, right; |
| 1231 | 1288 | ||
| @@ -1652,7 +1709,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
| 1652 | error = 1; | 1709 | error = 1; |
| 1653 | break; | 1710 | break; |
| 1654 | case PR_SET_DUMPABLE: | 1711 | case PR_SET_DUMPABLE: |
| 1655 | if (arg2 != 0 && arg2 != 1) { | 1712 | if (arg2 < 0 || arg2 > 2) { |
| 1656 | error = -EINVAL; | 1713 | error = -EINVAL; |
| 1657 | break; | 1714 | break; |
| 1658 | } | 1715 | } |
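
The dumpable flag becomes tri-state to support the new suid_dumpable sysctl (roughly: 0 keeps the traditional "no core after a credential change" behaviour, 1 always dumps, 2 produces root-only "suidsafe" dumps), which is why PR_SET_DUMPABLE now accepts 0-2 in this version of the code. A hedged userspace sketch of a process re-enabling dumps after a setuid transition:

    #include <sys/prctl.h>
    #include <stdio.h>

    int main(void)
    {
        /* Re-enable core dumps after a setuid()/setgid() transition
         * reset the per-process flag to the suid_dumpable default. */
        if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) != 0)
            perror("prctl(PR_SET_DUMPABLE)");

        printf("dumpable = %d\n", prctl(PR_GET_DUMPABLE, 0, 0, 0, 0));
        return 0;
    }
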
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0dda70ed1f98..29196ce9b40f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -18,6 +18,8 @@ cond_syscall(sys_acct); | |||
| 18 | cond_syscall(sys_lookup_dcookie); | 18 | cond_syscall(sys_lookup_dcookie); |
| 19 | cond_syscall(sys_swapon); | 19 | cond_syscall(sys_swapon); |
| 20 | cond_syscall(sys_swapoff); | 20 | cond_syscall(sys_swapoff); |
| 21 | cond_syscall(sys_kexec_load); | ||
| 22 | cond_syscall(compat_sys_kexec_load); | ||
| 21 | cond_syscall(sys_init_module); | 23 | cond_syscall(sys_init_module); |
| 22 | cond_syscall(sys_delete_module); | 24 | cond_syscall(sys_delete_module); |
| 23 | cond_syscall(sys_socketpair); | 25 | cond_syscall(sys_socketpair); |
| @@ -77,6 +79,7 @@ cond_syscall(sys_request_key); | |||
| 77 | cond_syscall(sys_keyctl); | 79 | cond_syscall(sys_keyctl); |
| 78 | cond_syscall(compat_sys_keyctl); | 80 | cond_syscall(compat_sys_keyctl); |
| 79 | cond_syscall(compat_sys_socketcall); | 81 | cond_syscall(compat_sys_socketcall); |
| 82 | cond_syscall(sys_set_zone_reclaim); | ||
| 80 | 83 | ||
| 81 | /* arch-specific weak syscall entries */ | 84 | /* arch-specific weak syscall entries */ |
| 82 | cond_syscall(sys_pciconfig_read); | 85 | cond_syscall(sys_pciconfig_read); |
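
cond_syscall() essentially turns the named entry point into a weak alias of sys_ni_syscall(), so the syscall table still links when, say, CONFIG_KEXEC is disabled, and sys_kexec_load() then simply returns -ENOSYS. A hedged sketch of the mechanism, not the exact kernel macro (my_optional_syscall is an invented name):

    #include <errno.h>

    /* Generic "not implemented" handler. */
    long sys_ni_syscall(void)
    {
        return -ENOSYS;
    }

    /* Weak alias: if nothing else in the build defines my_optional_syscall
     * (the feature is configured out), callers get sys_ni_syscall(). */
    long my_optional_syscall(void)
        __attribute__((weak, alias("sys_ni_syscall")));
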
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 701d12c63068..270ee7fadbd8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio; | |||
| 58 | extern int max_threads; | 58 | extern int max_threads; |
| 59 | extern int sysrq_enabled; | 59 | extern int sysrq_enabled; |
| 60 | extern int core_uses_pid; | 60 | extern int core_uses_pid; |
| 61 | extern int suid_dumpable; | ||
| 61 | extern char core_pattern[]; | 62 | extern char core_pattern[]; |
| 62 | extern int cad_pid; | 63 | extern int cad_pid; |
| 63 | extern int pid_max; | 64 | extern int pid_max; |
| @@ -950,6 +951,14 @@ static ctl_table fs_table[] = { | |||
| 950 | .proc_handler = &proc_dointvec, | 951 | .proc_handler = &proc_dointvec, |
| 951 | }, | 952 | }, |
| 952 | #endif | 953 | #endif |
| 954 | { | ||
| 955 | .ctl_name = KERN_SETUID_DUMPABLE, | ||
| 956 | .procname = "suid_dumpable", | ||
| 957 | .data = &suid_dumpable, | ||
| 958 | .maxlen = sizeof(int), | ||
| 959 | .mode = 0644, | ||
| 960 | .proc_handler = &proc_dointvec, | ||
| 961 | }, | ||
| 953 | { .ctl_name = 0 } | 962 | { .ctl_name = 0 } |
| 954 | }; | 963 | }; |
| 955 | 964 | ||
| @@ -991,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
| 991 | int error = parse_table(name, nlen, oldval, oldlenp, | 1000 | int error = parse_table(name, nlen, oldval, oldlenp, |
| 992 | newval, newlen, head->ctl_table, | 1001 | newval, newlen, head->ctl_table, |
| 993 | &context); | 1002 | &context); |
| 994 | if (context) | 1003 | kfree(context); |
| 995 | kfree(context); | ||
| 996 | if (error != -ENOTDIR) | 1004 | if (error != -ENOTDIR) |
| 997 | return error; | 1005 | return error; |
| 998 | tmp = tmp->next; | 1006 | tmp = tmp->next; |
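
The do_sysctl() hunk leans on the fact that kfree(NULL) is defined to be a no-op, so the `if (context)` guard was redundant. The same idiom keeps error and cleanup paths short elsewhere; a minimal sketch with an invented helper:

    #include <linux/slab.h>

    static void demo(int need_buffer)
    {
        char *buf = NULL;

        if (need_buffer)
            buf = kmalloc(64, GFP_KERNEL);

        /* ... use buf only when it is non-NULL ... */

        kfree(buf); /* kfree(NULL) is a no-op, no guard needed */
    }
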
diff --git a/kernel/timer.c b/kernel/timer.c index 207aa4f0aa10..f2a11887a726 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec); | |||
| 57 | #define TVN_MASK (TVN_SIZE - 1) | 57 | #define TVN_MASK (TVN_SIZE - 1) |
| 58 | #define TVR_MASK (TVR_SIZE - 1) | 58 | #define TVR_MASK (TVR_SIZE - 1) |
| 59 | 59 | ||
| 60 | struct timer_base_s { | ||
| 61 | spinlock_t lock; | ||
| 62 | struct timer_list *running_timer; | ||
| 63 | }; | ||
| 64 | |||
| 60 | typedef struct tvec_s { | 65 | typedef struct tvec_s { |
| 61 | struct list_head vec[TVN_SIZE]; | 66 | struct list_head vec[TVN_SIZE]; |
| 62 | } tvec_t; | 67 | } tvec_t; |
| @@ -66,9 +71,8 @@ typedef struct tvec_root_s { | |||
| 66 | } tvec_root_t; | 71 | } tvec_root_t; |
| 67 | 72 | ||
| 68 | struct tvec_t_base_s { | 73 | struct tvec_t_base_s { |
| 69 | spinlock_t lock; | 74 | struct timer_base_s t_base; |
| 70 | unsigned long timer_jiffies; | 75 | unsigned long timer_jiffies; |
| 71 | struct timer_list *running_timer; | ||
| 72 | tvec_root_t tv1; | 76 | tvec_root_t tv1; |
| 73 | tvec_t tv2; | 77 | tvec_t tv2; |
| 74 | tvec_t tv3; | 78 | tvec_t tv3; |
| @@ -77,18 +81,16 @@ struct tvec_t_base_s { | |||
| 77 | } ____cacheline_aligned_in_smp; | 81 | } ____cacheline_aligned_in_smp; |
| 78 | 82 | ||
| 79 | typedef struct tvec_t_base_s tvec_base_t; | 83 | typedef struct tvec_t_base_s tvec_base_t; |
| 84 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases); | ||
| 80 | 85 | ||
| 81 | static inline void set_running_timer(tvec_base_t *base, | 86 | static inline void set_running_timer(tvec_base_t *base, |
| 82 | struct timer_list *timer) | 87 | struct timer_list *timer) |
| 83 | { | 88 | { |
| 84 | #ifdef CONFIG_SMP | 89 | #ifdef CONFIG_SMP |
| 85 | base->running_timer = timer; | 90 | base->t_base.running_timer = timer; |
| 86 | #endif | 91 | #endif |
| 87 | } | 92 | } |
| 88 | 93 | ||
| 89 | /* Fake initialization */ | ||
| 90 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; | ||
| 91 | |||
| 92 | static void check_timer_failed(struct timer_list *timer) | 94 | static void check_timer_failed(struct timer_list *timer) |
| 93 | { | 95 | { |
| 94 | static int whine_count; | 96 | static int whine_count; |
| @@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer) | |||
| 103 | /* | 105 | /* |
| 104 | * Now fix it up | 106 | * Now fix it up |
| 105 | */ | 107 | */ |
| 106 | spin_lock_init(&timer->lock); | ||
| 107 | timer->magic = TIMER_MAGIC; | 108 | timer->magic = TIMER_MAGIC; |
| 108 | } | 109 | } |
| 109 | 110 | ||
| @@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
| 156 | list_add_tail(&timer->entry, vec); | 157 | list_add_tail(&timer->entry, vec); |
| 157 | } | 158 | } |
| 158 | 159 | ||
| 160 | typedef struct timer_base_s timer_base_t; | ||
| 161 | /* | ||
| 162 | * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) | ||
| 163 | * at compile time, and we need timer->base to lock the timer. | ||
| 164 | */ | ||
| 165 | timer_base_t __init_timer_base | ||
| 166 | ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; | ||
| 167 | EXPORT_SYMBOL(__init_timer_base); | ||
| 168 | |||
| 169 | /*** | ||
| 170 | * init_timer - initialize a timer. | ||
| 171 | * @timer: the timer to be initialized | ||
| 172 | * | ||
| 173 | * init_timer() must be done to a timer prior to calling *any* of the | ||
| 174 | * other timer functions. | ||
| 175 | */ | ||
| 176 | void fastcall init_timer(struct timer_list *timer) | ||
| 177 | { | ||
| 178 | timer->entry.next = NULL; | ||
| 179 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | ||
| 180 | timer->magic = TIMER_MAGIC; | ||
| 181 | } | ||
| 182 | EXPORT_SYMBOL(init_timer); | ||
| 183 | |||
| 184 | static inline void detach_timer(struct timer_list *timer, | ||
| 185 | int clear_pending) | ||
| 186 | { | ||
| 187 | struct list_head *entry = &timer->entry; | ||
| 188 | |||
| 189 | __list_del(entry->prev, entry->next); | ||
| 190 | if (clear_pending) | ||
| 191 | entry->next = NULL; | ||
| 192 | entry->prev = LIST_POISON2; | ||
| 193 | } | ||
| 194 | |||
| 195 | /* | ||
| 196 | * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock | ||
| 197 | * means that all timers which are tied to this base via timer->base are | ||
| 198 | * locked, and the base itself is locked too. | ||
| 199 | * | ||
| 200 | * So __run_timers/migrate_timers can safely modify all timers which could | ||
| 201 | * be found on ->tvX lists. | ||
| 202 | * | ||
| 203 | * When the timer's base is locked, and the timer removed from list, it is | ||
| 204 | * possible to set timer->base = NULL and drop the lock: the timer remains | ||
| 205 | * locked. | ||
| 206 | */ | ||
| 207 | static timer_base_t *lock_timer_base(struct timer_list *timer, | ||
| 208 | unsigned long *flags) | ||
| 209 | { | ||
| 210 | timer_base_t *base; | ||
| 211 | |||
| 212 | for (;;) { | ||
| 213 | base = timer->base; | ||
| 214 | if (likely(base != NULL)) { | ||
| 215 | spin_lock_irqsave(&base->lock, *flags); | ||
| 216 | if (likely(base == timer->base)) | ||
| 217 | return base; | ||
| 218 | /* The timer has migrated to another CPU */ | ||
| 219 | spin_unlock_irqrestore(&base->lock, *flags); | ||
| 220 | } | ||
| 221 | cpu_relax(); | ||
| 222 | } | ||
| 223 | } | ||
| 224 | |||
| 159 | int __mod_timer(struct timer_list *timer, unsigned long expires) | 225 | int __mod_timer(struct timer_list *timer, unsigned long expires) |
| 160 | { | 226 | { |
| 161 | tvec_base_t *old_base, *new_base; | 227 | timer_base_t *base; |
| 228 | tvec_base_t *new_base; | ||
| 162 | unsigned long flags; | 229 | unsigned long flags; |
| 163 | int ret = 0; | 230 | int ret = 0; |
| 164 | 231 | ||
| 165 | BUG_ON(!timer->function); | 232 | BUG_ON(!timer->function); |
| 166 | |||
| 167 | check_timer(timer); | 233 | check_timer(timer); |
| 168 | 234 | ||
| 169 | spin_lock_irqsave(&timer->lock, flags); | 235 | base = lock_timer_base(timer, &flags); |
| 236 | |||
| 237 | if (timer_pending(timer)) { | ||
| 238 | detach_timer(timer, 0); | ||
| 239 | ret = 1; | ||
| 240 | } | ||
| 241 | |||
| 170 | new_base = &__get_cpu_var(tvec_bases); | 242 | new_base = &__get_cpu_var(tvec_bases); |
| 171 | repeat: | ||
| 172 | old_base = timer->base; | ||
| 173 | 243 | ||
| 174 | /* | 244 | if (base != &new_base->t_base) { |
| 175 | * Prevent deadlocks via ordering by old_base < new_base. | ||
| 176 | */ | ||
| 177 | if (old_base && (new_base != old_base)) { | ||
| 178 | if (old_base < new_base) { | ||
| 179 | spin_lock(&new_base->lock); | ||
| 180 | spin_lock(&old_base->lock); | ||
| 181 | } else { | ||
| 182 | spin_lock(&old_base->lock); | ||
| 183 | spin_lock(&new_base->lock); | ||
| 184 | } | ||
| 185 | /* | 245 | /* |
| 186 | * The timer base might have been cancelled while we were | 246 | * We are trying to schedule the timer on the local CPU. |
| 187 | * trying to take the lock(s): | 247 | * However we can't change timer's base while it is running, |
| 248 | * otherwise del_timer_sync() can't detect that the timer's | ||
| 249 | * handler yet has not finished. This also guarantees that | ||
| 250 | * the timer is serialized wrt itself. | ||
| 188 | */ | 251 | */ |
| 189 | if (timer->base != old_base) { | 252 | if (unlikely(base->running_timer == timer)) { |
| 190 | spin_unlock(&new_base->lock); | 253 | /* The timer remains on a former base */ |
| 191 | spin_unlock(&old_base->lock); | 254 | new_base = container_of(base, tvec_base_t, t_base); |
| 192 | goto repeat; | 255 | } else { |
| 193 | } | 256 | /* See the comment in lock_timer_base() */ |
| 194 | } else { | 257 | timer->base = NULL; |
| 195 | spin_lock(&new_base->lock); | 258 | spin_unlock(&base->lock); |
| 196 | if (timer->base != old_base) { | 259 | spin_lock(&new_base->t_base.lock); |
| 197 | spin_unlock(&new_base->lock); | 260 | timer->base = &new_base->t_base; |
| 198 | goto repeat; | ||
| 199 | } | 261 | } |
| 200 | } | 262 | } |
| 201 | 263 | ||
| 202 | /* | ||
| 203 | * Delete the previous timeout (if there was any), and install | ||
| 204 | * the new one: | ||
| 205 | */ | ||
| 206 | if (old_base) { | ||
| 207 | list_del(&timer->entry); | ||
| 208 | ret = 1; | ||
| 209 | } | ||
| 210 | timer->expires = expires; | 264 | timer->expires = expires; |
| 211 | internal_add_timer(new_base, timer); | 265 | internal_add_timer(new_base, timer); |
| 212 | timer->base = new_base; | 266 | spin_unlock_irqrestore(&new_base->t_base.lock, flags); |
| 213 | |||
| 214 | if (old_base && (new_base != old_base)) | ||
| 215 | spin_unlock(&old_base->lock); | ||
| 216 | spin_unlock(&new_base->lock); | ||
| 217 | spin_unlock_irqrestore(&timer->lock, flags); | ||
| 218 | 267 | ||
| 219 | return ret; | 268 | return ret; |
| 220 | } | 269 | } |
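
lock_timer_base() is an instance of a more general pattern: read the pointer, lock the object it currently refers to, then re-check that the pointer still points there, retrying if the object migrated in between; once the check passes the pointer is stable, because only the lock holder is allowed to change it. A hedged, generic sketch of the same idea with invented names (struct bucket, struct item, lock_item_bucket):

    #include <linux/spinlock.h>
    #include <linux/sched.h>    /* cpu_relax() */

    struct bucket {
        spinlock_t lock;
        /* ... per-bucket state ... */
    };

    struct item {
        struct bucket *home;    /* changed only while home->lock is held */
    };

    /* Returns with the item's current bucket locked. */
    static struct bucket *lock_item_bucket(struct item *it, unsigned long *flags)
    {
        struct bucket *b;

        for (;;) {
            b = it->home;
            if (likely(b != NULL)) {
                spin_lock_irqsave(&b->lock, *flags);
                if (likely(b == it->home))
                    return b;   /* still ours: stable while locked */
                /* The item moved to another bucket; try again. */
                spin_unlock_irqrestore(&b->lock, *flags);
            }
            cpu_relax();
        }
    }
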
| @@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
| 232 | { | 281 | { |
| 233 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); | 282 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); |
| 234 | unsigned long flags; | 283 | unsigned long flags; |
| 235 | 284 | ||
| 236 | BUG_ON(timer_pending(timer) || !timer->function); | 285 | BUG_ON(timer_pending(timer) || !timer->function); |
| 237 | 286 | ||
| 238 | check_timer(timer); | 287 | check_timer(timer); |
| 239 | 288 | ||
| 240 | spin_lock_irqsave(&base->lock, flags); | 289 | spin_lock_irqsave(&base->t_base.lock, flags); |
| 290 | timer->base = &base->t_base; | ||
| 241 | internal_add_timer(base, timer); | 291 | internal_add_timer(base, timer); |
| 242 | timer->base = base; | 292 | spin_unlock_irqrestore(&base->t_base.lock, flags); |
| 243 | spin_unlock_irqrestore(&base->lock, flags); | ||
| 244 | } | 293 | } |
| 245 | 294 | ||
| 246 | 295 | ||
| @@ -295,109 +344,84 @@ EXPORT_SYMBOL(mod_timer); | |||
| 295 | */ | 344 | */ |
| 296 | int del_timer(struct timer_list *timer) | 345 | int del_timer(struct timer_list *timer) |
| 297 | { | 346 | { |
| 347 | timer_base_t *base; | ||
| 298 | unsigned long flags; | 348 | unsigned long flags; |
| 299 | tvec_base_t *base; | 349 | int ret = 0; |
| 300 | 350 | ||
| 301 | check_timer(timer); | 351 | check_timer(timer); |
| 302 | 352 | ||
| 303 | repeat: | 353 | if (timer_pending(timer)) { |
| 304 | base = timer->base; | 354 | base = lock_timer_base(timer, &flags); |
| 305 | if (!base) | 355 | if (timer_pending(timer)) { |
| 306 | return 0; | 356 | detach_timer(timer, 1); |
| 307 | spin_lock_irqsave(&base->lock, flags); | 357 | ret = 1; |
| 308 | if (base != timer->base) { | 358 | } |
| 309 | spin_unlock_irqrestore(&base->lock, flags); | 359 | spin_unlock_irqrestore(&base->lock, flags); |
| 310 | goto repeat; | ||
| 311 | } | 360 | } |
| 312 | list_del(&timer->entry); | ||
| 313 | /* Need to make sure that anybody who sees a NULL base also sees the list ops */ | ||
| 314 | smp_wmb(); | ||
| 315 | timer->base = NULL; | ||
| 316 | spin_unlock_irqrestore(&base->lock, flags); | ||
| 317 | 361 | ||
| 318 | return 1; | 362 | return ret; |
| 319 | } | 363 | } |
| 320 | 364 | ||
| 321 | EXPORT_SYMBOL(del_timer); | 365 | EXPORT_SYMBOL(del_timer); |
| 322 | 366 | ||
| 323 | #ifdef CONFIG_SMP | 367 | #ifdef CONFIG_SMP |
| 324 | /*** | 368 | /* |
| 325 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 369 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
| 326 | * @timer: the timer to be deactivated | 370 | * exit the timer is not queued and the handler is not running on any CPU. |
| 327 | * | ||
| 328 | * This function only differs from del_timer() on SMP: besides deactivating | ||
| 329 | * the timer it also makes sure the handler has finished executing on other | ||
| 330 | * CPUs. | ||
| 331 | * | ||
| 332 | * Synchronization rules: callers must prevent restarting of the timer, | ||
| 333 | * otherwise this function is meaningless. It must not be called from | ||
| 334 | * interrupt contexts. The caller must not hold locks which would prevent | ||
| 335 | * completion of the timer's handler. Upon exit the timer is not queued and | ||
| 336 | * the handler is not running on any CPU. | ||
| 337 | * | ||
| 338 | * The function returns whether it has deactivated a pending timer or not. | ||
| 339 | * | 371 | * |
| 340 | * del_timer_sync() is slow and complicated because it copes with timer | 372 | * It must not be called from interrupt contexts. |
| 341 | * handlers which re-arm the timer (periodic timers). If the timer handler | ||
| 342 | * is known to not do this (a single shot timer) then use | ||
| 343 | * del_singleshot_timer_sync() instead. | ||
| 344 | */ | 373 | */ |
| 345 | int del_timer_sync(struct timer_list *timer) | 374 | int try_to_del_timer_sync(struct timer_list *timer) |
| 346 | { | 375 | { |
| 347 | tvec_base_t *base; | 376 | timer_base_t *base; |
| 348 | int i, ret = 0; | 377 | unsigned long flags; |
| 378 | int ret = -1; | ||
| 349 | 379 | ||
| 350 | check_timer(timer); | 380 | base = lock_timer_base(timer, &flags); |
| 351 | 381 | ||
| 352 | del_again: | 382 | if (base->running_timer == timer) |
| 353 | ret += del_timer(timer); | 383 | goto out; |
| 354 | 384 | ||
| 355 | for_each_online_cpu(i) { | 385 | ret = 0; |
| 356 | base = &per_cpu(tvec_bases, i); | 386 | if (timer_pending(timer)) { |
| 357 | if (base->running_timer == timer) { | 387 | detach_timer(timer, 1); |
| 358 | while (base->running_timer == timer) { | 388 | ret = 1; |
| 359 | cpu_relax(); | ||
| 360 | preempt_check_resched(); | ||
| 361 | } | ||
| 362 | break; | ||
| 363 | } | ||
| 364 | } | 389 | } |
| 365 | smp_rmb(); | 390 | out: |
| 366 | if (timer_pending(timer)) | 391 | spin_unlock_irqrestore(&base->lock, flags); |
| 367 | goto del_again; | ||
| 368 | 392 | ||
| 369 | return ret; | 393 | return ret; |
| 370 | } | 394 | } |
| 371 | EXPORT_SYMBOL(del_timer_sync); | ||
| 372 | 395 | ||
| 373 | /*** | 396 | /*** |
| 374 | * del_singleshot_timer_sync - deactivate a non-recursive timer | 397 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
| 375 | * @timer: the timer to be deactivated | 398 | * @timer: the timer to be deactivated |
| 376 | * | 399 | * |
| 377 | * This function is an optimization of del_timer_sync for the case where the | 400 | * This function only differs from del_timer() on SMP: besides deactivating |
| 378 | * caller can guarantee the timer does not reschedule itself in its timer | 401 | * the timer it also makes sure the handler has finished executing on other |
| 379 | * function. | 402 | * CPUs. |
| 380 | * | 403 | * |
| 381 | * Synchronization rules: callers must prevent restarting of the timer, | 404 | * Synchronization rules: callers must prevent restarting of the timer, |
| 382 | * otherwise this function is meaningless. It must not be called from | 405 | * otherwise this function is meaningless. It must not be called from |
| 383 | * interrupt contexts. The caller must not hold locks which wold prevent | 406 | * interrupt contexts. The caller must not hold locks which would prevent |
| 384 | * completion of the timer's handler. Upon exit the timer is not queued and | 407 | * completion of the timer's handler. The timer's handler must not call |
| 385 | * the handler is not running on any CPU. | 408 | * add_timer_on(). Upon exit the timer is not queued and the handler is |
| 409 | * not running on any CPU. | ||
| 386 | * | 410 | * |
| 387 | * The function returns whether it has deactivated a pending timer or not. | 411 | * The function returns whether it has deactivated a pending timer or not. |
| 388 | */ | 412 | */ |
| 389 | int del_singleshot_timer_sync(struct timer_list *timer) | 413 | int del_timer_sync(struct timer_list *timer) |
| 390 | { | 414 | { |
| 391 | int ret = del_timer(timer); | 415 | check_timer(timer); |
| 392 | 416 | ||
| 393 | if (!ret) { | 417 | for (;;) { |
| 394 | ret = del_timer_sync(timer); | 418 | int ret = try_to_del_timer_sync(timer); |
| 395 | BUG_ON(ret); | 419 | if (ret >= 0) |
| 420 | return ret; | ||
| 396 | } | 421 | } |
| 397 | |||
| 398 | return ret; | ||
| 399 | } | 422 | } |
| 400 | EXPORT_SYMBOL(del_singleshot_timer_sync); | 423 | |
| 424 | EXPORT_SYMBOL(del_timer_sync); | ||
| 401 | #endif | 425 | #endif |
| 402 | 426 | ||
| 403 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 427 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) |
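
With this rework del_timer_sync() is just a loop around try_to_del_timer_sync(), spinning until the timer is neither pending nor running, which is why the per-CPU scan and the separate del_singleshot_timer_sync() implementation above could be dropped. A hedged sketch of the usual caller, a driver stopping a self-rearming timer at teardown (all names invented, era-style timer callback signature):

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    /* Hypothetical driver state with a timer that re-arms itself. */
    struct mydev {
        struct timer_list poll_timer;
        int stopping;
    };

    static void mydev_poll(unsigned long data)
    {
        struct mydev *dev = (struct mydev *)data;

        /* ... poll the hardware ... */

        if (!dev->stopping)
            mod_timer(&dev->poll_timer, jiffies + HZ);
    }

    static void mydev_start(struct mydev *dev)
    {
        init_timer(&dev->poll_timer);
        dev->poll_timer.function = mydev_poll;
        dev->poll_timer.data = (unsigned long)dev;
        dev->stopping = 0;
        mod_timer(&dev->poll_timer, jiffies + HZ);
    }

    static void mydev_stop(struct mydev *dev)
    {
        dev->stopping = 1;                /* prevent re-arming ...          */
        del_timer_sync(&dev->poll_timer); /* ... then wait for the handler  */
    }
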
| @@ -415,7 +439,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) | |||
| 415 | struct timer_list *tmp; | 439 | struct timer_list *tmp; |
| 416 | 440 | ||
| 417 | tmp = list_entry(curr, struct timer_list, entry); | 441 | tmp = list_entry(curr, struct timer_list, entry); |
| 418 | BUG_ON(tmp->base != base); | 442 | BUG_ON(tmp->base != &base->t_base); |
| 419 | curr = curr->next; | 443 | curr = curr->next; |
| 420 | internal_add_timer(base, tmp); | 444 | internal_add_timer(base, tmp); |
| 421 | } | 445 | } |
| @@ -437,7 +461,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
| 437 | { | 461 | { |
| 438 | struct timer_list *timer; | 462 | struct timer_list *timer; |
| 439 | 463 | ||
| 440 | spin_lock_irq(&base->lock); | 464 | spin_lock_irq(&base->t_base.lock); |
| 441 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 465 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
| 442 | struct list_head work_list = LIST_HEAD_INIT(work_list); | 466 | struct list_head work_list = LIST_HEAD_INIT(work_list); |
| 443 | struct list_head *head = &work_list; | 467 | struct list_head *head = &work_list; |
| @@ -453,8 +477,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
| 453 | cascade(base, &base->tv5, INDEX(3)); | 477 | cascade(base, &base->tv5, INDEX(3)); |
| 454 | ++base->timer_jiffies; | 478 | ++base->timer_jiffies; |
| 455 | list_splice_init(base->tv1.vec + index, &work_list); | 479 | list_splice_init(base->tv1.vec + index, &work_list); |
| 456 | repeat: | 480 | while (!list_empty(head)) { |
| 457 | if (!list_empty(head)) { | ||
| 458 | void (*fn)(unsigned long); | 481 | void (*fn)(unsigned long); |
| 459 | unsigned long data; | 482 | unsigned long data; |
| 460 | 483 | ||
| @@ -462,25 +485,26 @@ repeat: | |||
| 462 | fn = timer->function; | 485 | fn = timer->function; |
| 463 | data = timer->data; | 486 | data = timer->data; |
| 464 | 487 | ||
| 465 | list_del(&timer->entry); | ||
| 466 | set_running_timer(base, timer); | 488 | set_running_timer(base, timer); |
| 467 | smp_wmb(); | 489 | detach_timer(timer, 1); |
| 468 | timer->base = NULL; | 490 | spin_unlock_irq(&base->t_base.lock); |
| 469 | spin_unlock_irq(&base->lock); | ||
| 470 | { | 491 | { |
| 471 | u32 preempt_count = preempt_count(); | 492 | int preempt_count = preempt_count(); |
| 472 | fn(data); | 493 | fn(data); |
| 473 | if (preempt_count != preempt_count()) { | 494 | if (preempt_count != preempt_count()) { |
| 474 | printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); | 495 | printk(KERN_WARNING "huh, entered %p " |
| 496 | "with preempt_count %08x, exited" | ||
| 497 | " with %08x?\n", | ||
| 498 | fn, preempt_count, | ||
| 499 | preempt_count()); | ||
| 475 | BUG(); | 500 | BUG(); |
| 476 | } | 501 | } |
| 477 | } | 502 | } |
| 478 | spin_lock_irq(&base->lock); | 503 | spin_lock_irq(&base->t_base.lock); |
| 479 | goto repeat; | ||
| 480 | } | 504 | } |
| 481 | } | 505 | } |
| 482 | set_running_timer(base, NULL); | 506 | set_running_timer(base, NULL); |
| 483 | spin_unlock_irq(&base->lock); | 507 | spin_unlock_irq(&base->t_base.lock); |
| 484 | } | 508 | } |
| 485 | 509 | ||
| 486 | #ifdef CONFIG_NO_IDLE_HZ | 510 | #ifdef CONFIG_NO_IDLE_HZ |
| @@ -499,7 +523,7 @@ unsigned long next_timer_interrupt(void) | |||
| 499 | int i, j; | 523 | int i, j; |
| 500 | 524 | ||
| 501 | base = &__get_cpu_var(tvec_bases); | 525 | base = &__get_cpu_var(tvec_bases); |
| 502 | spin_lock(&base->lock); | 526 | spin_lock(&base->t_base.lock); |
| 503 | expires = base->timer_jiffies + (LONG_MAX >> 1); | 527 | expires = base->timer_jiffies + (LONG_MAX >> 1); |
| 504 | list = 0; | 528 | list = 0; |
| 505 | 529 | ||
| @@ -547,7 +571,7 @@ found: | |||
| 547 | expires = nte->expires; | 571 | expires = nte->expires; |
| 548 | } | 572 | } |
| 549 | } | 573 | } |
| 550 | spin_unlock(&base->lock); | 574 | spin_unlock(&base->t_base.lock); |
| 551 | return expires; | 575 | return expires; |
| 552 | } | 576 | } |
| 553 | #endif | 577 | #endif |
| @@ -1286,9 +1310,9 @@ static void __devinit init_timers_cpu(int cpu) | |||
| 1286 | { | 1310 | { |
| 1287 | int j; | 1311 | int j; |
| 1288 | tvec_base_t *base; | 1312 | tvec_base_t *base; |
| 1289 | 1313 | ||
| 1290 | base = &per_cpu(tvec_bases, cpu); | 1314 | base = &per_cpu(tvec_bases, cpu); |
| 1291 | spin_lock_init(&base->lock); | 1315 | spin_lock_init(&base->t_base.lock); |
| 1292 | for (j = 0; j < TVN_SIZE; j++) { | 1316 | for (j = 0; j < TVN_SIZE; j++) { |
| 1293 | INIT_LIST_HEAD(base->tv5.vec + j); | 1317 | INIT_LIST_HEAD(base->tv5.vec + j); |
| 1294 | INIT_LIST_HEAD(base->tv4.vec + j); | 1318 | INIT_LIST_HEAD(base->tv4.vec + j); |
| @@ -1302,22 +1326,16 @@ static void __devinit init_timers_cpu(int cpu) | |||
| 1302 | } | 1326 | } |
| 1303 | 1327 | ||
| 1304 | #ifdef CONFIG_HOTPLUG_CPU | 1328 | #ifdef CONFIG_HOTPLUG_CPU |
| 1305 | static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | 1329 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) |
| 1306 | { | 1330 | { |
| 1307 | struct timer_list *timer; | 1331 | struct timer_list *timer; |
| 1308 | 1332 | ||
| 1309 | while (!list_empty(head)) { | 1333 | while (!list_empty(head)) { |
| 1310 | timer = list_entry(head->next, struct timer_list, entry); | 1334 | timer = list_entry(head->next, struct timer_list, entry); |
| 1311 | /* We're locking backwards from __mod_timer order here, | 1335 | detach_timer(timer, 0); |
| 1312 | beware deadlock. */ | 1336 | timer->base = &new_base->t_base; |
| 1313 | if (!spin_trylock(&timer->lock)) | ||
| 1314 | return 0; | ||
| 1315 | list_del(&timer->entry); | ||
| 1316 | internal_add_timer(new_base, timer); | 1337 | internal_add_timer(new_base, timer); |
| 1317 | timer->base = new_base; | ||
| 1318 | spin_unlock(&timer->lock); | ||
| 1319 | } | 1338 | } |
| 1320 | return 1; | ||
| 1321 | } | 1339 | } |
| 1322 | 1340 | ||
| 1323 | static void __devinit migrate_timers(int cpu) | 1341 | static void __devinit migrate_timers(int cpu) |
| @@ -1331,39 +1349,24 @@ static void __devinit migrate_timers(int cpu) | |||
| 1331 | new_base = &get_cpu_var(tvec_bases); | 1349 | new_base = &get_cpu_var(tvec_bases); |
| 1332 | 1350 | ||
| 1333 | local_irq_disable(); | 1351 | local_irq_disable(); |
| 1334 | again: | 1352 | spin_lock(&new_base->t_base.lock); |
| 1335 | /* Prevent deadlocks via ordering by old_base < new_base. */ | 1353 | spin_lock(&old_base->t_base.lock); |
| 1336 | if (old_base < new_base) { | ||
| 1337 | spin_lock(&new_base->lock); | ||
| 1338 | spin_lock(&old_base->lock); | ||
| 1339 | } else { | ||
| 1340 | spin_lock(&old_base->lock); | ||
| 1341 | spin_lock(&new_base->lock); | ||
| 1342 | } | ||
| 1343 | 1354 | ||
| 1344 | if (old_base->running_timer) | 1355 | if (old_base->t_base.running_timer) |
| 1345 | BUG(); | 1356 | BUG(); |
| 1346 | for (i = 0; i < TVR_SIZE; i++) | 1357 | for (i = 0; i < TVR_SIZE; i++) |
| 1347 | if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) | 1358 | migrate_timer_list(new_base, old_base->tv1.vec + i); |
| 1348 | goto unlock_again; | 1359 | for (i = 0; i < TVN_SIZE; i++) { |
| 1349 | for (i = 0; i < TVN_SIZE; i++) | 1360 | migrate_timer_list(new_base, old_base->tv2.vec + i); |
| 1350 | if (!migrate_timer_list(new_base, old_base->tv2.vec + i) | 1361 | migrate_timer_list(new_base, old_base->tv3.vec + i); |
| 1351 | || !migrate_timer_list(new_base, old_base->tv3.vec + i) | 1362 | migrate_timer_list(new_base, old_base->tv4.vec + i); |
| 1352 | || !migrate_timer_list(new_base, old_base->tv4.vec + i) | 1363 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
| 1353 | || !migrate_timer_list(new_base, old_base->tv5.vec + i)) | 1364 | } |
| 1354 | goto unlock_again; | 1365 | |
| 1355 | spin_unlock(&old_base->lock); | 1366 | spin_unlock(&old_base->t_base.lock); |
| 1356 | spin_unlock(&new_base->lock); | 1367 | spin_unlock(&new_base->t_base.lock); |
| 1357 | local_irq_enable(); | 1368 | local_irq_enable(); |
| 1358 | put_cpu_var(tvec_bases); | 1369 | put_cpu_var(tvec_bases); |
| 1359 | return; | ||
| 1360 | |||
| 1361 | unlock_again: | ||
| 1362 | /* Avoid deadlock with __mod_timer, by backing off. */ | ||
| 1363 | spin_unlock(&old_base->lock); | ||
| 1364 | spin_unlock(&new_base->lock); | ||
| 1365 | cpu_relax(); | ||
| 1366 | goto again; | ||
| 1367 | } | 1370 | } |
| 1368 | #endif /* CONFIG_HOTPLUG_CPU */ | 1371 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 1369 | 1372 | ||
| @@ -1594,7 +1597,7 @@ void msleep(unsigned int msecs) | |||
| 1594 | EXPORT_SYMBOL(msleep); | 1597 | EXPORT_SYMBOL(msleep); |
| 1595 | 1598 | ||
| 1596 | /** | 1599 | /** |
| 1597 | * msleep_interruptible - sleep waiting for waitqueue interruptions | 1600 | * msleep_interruptible - sleep waiting for signals |
| 1598 | * @msecs: Time in milliseconds to sleep for | 1601 | * @msecs: Time in milliseconds to sleep for |
| 1599 | */ | 1602 | */ |
| 1600 | unsigned long msleep_interruptible(unsigned int msecs) | 1603 | unsigned long msleep_interruptible(unsigned int msecs) |
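The kernel-doc fix above matches what msleep_interruptible() actually does: it sleeps in TASK_INTERRUPTIBLE state and wakes early when a signal is delivered, returning the time left to sleep in milliseconds (0 if the full period elapsed). A small, hypothetical usage sketch; throttle_retry() is an invented caller, not kernel code:

```c
#include <linux/delay.h>
#include <linux/errno.h>

/* Hypothetical caller: back off for half a second, but let signals
 * (e.g. the task being killed) cut the wait short. */
static int throttle_retry(void)
{
	unsigned long remaining = msleep_interruptible(500);

	if (remaining)			/* a signal arrived before 500 ms passed */
		return -ERESTARTSYS;	/* let the caller handle the signal      */
	return 0;			/* slept the full period                 */
}
```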
