Diffstat (limited to 'kernel')
61 files changed, 7186 insertions, 3413 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde81..58908f9d15 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
+ifeq ($(CONFIG_COMPAT),y)
+obj-$(CONFIG_FUTEX) += futex_compat.o
+endif
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
@@ -26,7 +29,7 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
-obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
@@ -34,6 +37,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RELAY) += relay.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51..b327f4d201 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
 	/* calculate run_time in nsec*/
 	do_posix_clock_monotonic_gettime(&uptime);
 	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
-	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
-		       + current->start_time.tv_nsec;
+	run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
+		       + current->group_leader->start_time.tv_nsec;
 	/* convert nsec -> AHZ */
 	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+	jiffies = cputime_to_jiffies(cputime_add(current->utime,
 					 current->signal->utime));
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+	jiffies = cputime_to_jiffies(cputime_add(current->stime,
 					 current->signal->stime));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->group_leader->min_flt);
+				     current->min_flt);
 	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->group_leader->maj_flt);
+				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
 	ac.ac_exitcode = exitcode;
 
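The do_acct_process() hunks above all funnel time values through the same pipeline: nanoseconds are scaled to AHZ ticks (AHZ is 100, so one tick per 10^7 ns) and then packed into the 16-bit comp_t format (3-bit base-8 exponent, 13-bit mantissa) by encode_comp_t(). A minimal user-space sketch of that arithmetic follows; nsec_to_ahz() and decode_comp_t() here are illustrative stand-ins for the kernel helpers, not copies of them.

/* Illustrative user-space sketch of the nsec -> AHZ -> comp_t conversions
 * used in do_acct_process(); assumes the standard acct(5) comp_t encoding. */
#include <stdio.h>
#include <stdint.h>

#define AHZ 100

/* nanoseconds -> AHZ ticks (divide by 10^9 / AHZ == 10^7) */
static uint64_t nsec_to_ahz(uint64_t nsec)
{
	return nsec / (1000000000ULL / AHZ);
}

/* comp_t: low 13 bits are the mantissa, high 3 bits a base-8 exponent,
 * so the encoded value is mantissa << (3 * exponent). */
static uint64_t decode_comp_t(uint16_t c)
{
	uint64_t mantissa = c & 0x1fff;
	unsigned exponent = (c >> 13) & 0x7;

	return mantissa << (3 * exponent);
}

int main(void)
{
	uint64_t run_time_ns = 12345678901ULL;	/* ~12.3 s of elapsed time */

	printf("elapsed = %llu AHZ ticks\n",
	       (unsigned long long)nsec_to_ahz(run_time_ns));
	printf("comp_t 0x2345 decodes to %llu\n",
	       (unsigned long long)decode_comp_t(0x2345));
	return 0;
}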
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a813d2883..df57b493e1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -52,8 +52,12 @@ | |||
52 | #include <linux/audit.h> | 52 | #include <linux/audit.h> |
53 | 53 | ||
54 | #include <net/sock.h> | 54 | #include <net/sock.h> |
55 | #include <net/netlink.h> | ||
55 | #include <linux/skbuff.h> | 56 | #include <linux/skbuff.h> |
56 | #include <linux/netlink.h> | 57 | #include <linux/netlink.h> |
58 | #include <linux/selinux.h> | ||
59 | |||
60 | #include "audit.h" | ||
57 | 61 | ||
58 | /* No auditing will take place until audit_initialized != 0. | 62 | /* No auditing will take place until audit_initialized != 0. |
59 | * (Initialization happens after skb_init is called.) */ | 63 | * (Initialization happens after skb_init is called.) */ |
@@ -72,7 +76,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK; | |||
72 | * contains the (non-zero) pid. */ | 76 | * contains the (non-zero) pid. */ |
73 | int audit_pid; | 77 | int audit_pid; |
74 | 78 | ||
75 | /* If audit_limit is non-zero, limit the rate of sending audit records | 79 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records |
76 | * to that number per second. This prevents DoS attacks, but results in | 80 | * to that number per second. This prevents DoS attacks, but results in |
77 | * audit records being dropped. */ | 81 | * audit records being dropped. */ |
78 | static int audit_rate_limit; | 82 | static int audit_rate_limit; |
@@ -102,7 +106,7 @@ static struct sock *audit_sock; | |||
102 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of | 106 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of |
103 | * being placed on the freelist). */ | 107 | * being placed on the freelist). */ |
104 | static DEFINE_SPINLOCK(audit_freelist_lock); | 108 | static DEFINE_SPINLOCK(audit_freelist_lock); |
105 | static int audit_freelist_count = 0; | 109 | static int audit_freelist_count; |
106 | static LIST_HEAD(audit_freelist); | 110 | static LIST_HEAD(audit_freelist); |
107 | 111 | ||
108 | static struct sk_buff_head audit_skb_queue; | 112 | static struct sk_buff_head audit_skb_queue; |
@@ -113,7 +117,7 @@ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); | |||
113 | /* The netlink socket is only to be read by 1 CPU, which lets us assume | 117 | /* The netlink socket is only to be read by 1 CPU, which lets us assume |
114 | * that list additions and deletions never happen simultaneously in | 118 | * that list additions and deletions never happen simultaneously in |
115 | * auditsc.c */ | 119 | * auditsc.c */ |
116 | DECLARE_MUTEX(audit_netlink_sem); | 120 | DEFINE_MUTEX(audit_netlink_mutex); |
117 | 121 | ||
118 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting | 122 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting |
119 | * audit records. Since printk uses a 1024 byte buffer, this buffer | 123 | * audit records. Since printk uses a 1024 byte buffer, this buffer |
@@ -142,7 +146,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | |||
142 | nlh->nlmsg_pid = pid; | 146 | nlh->nlmsg_pid = pid; |
143 | } | 147 | } |
144 | 148 | ||
145 | static void audit_panic(const char *message) | 149 | void audit_panic(const char *message) |
146 | { | 150 | { |
147 | switch (audit_failure) | 151 | switch (audit_failure) |
148 | { | 152 | { |
@@ -186,8 +190,14 @@ static inline int audit_rate_check(void) | |||
186 | return retval; | 190 | return retval; |
187 | } | 191 | } |
188 | 192 | ||
189 | /* Emit at least 1 message per second, even if audit_rate_check is | 193 | /** |
190 | * throttling. */ | 194 | * audit_log_lost - conditionally log lost audit message event |
195 | * @message: the message stating reason for lost audit message | ||
196 | * | ||
197 | * Emit at least 1 message per second, even if audit_rate_check is | ||
198 | * throttling. | ||
199 | * Always increment the lost messages counter. | ||
200 | */ | ||
191 | void audit_log_lost(const char *message) | 201 | void audit_log_lost(const char *message) |
192 | { | 202 | { |
193 | static unsigned long last_msg = 0; | 203 | static unsigned long last_msg = 0; |
@@ -218,52 +228,105 @@ void audit_log_lost(const char *message) | |||
218 | audit_backlog_limit); | 228 | audit_backlog_limit); |
219 | audit_panic(message); | 229 | audit_panic(message); |
220 | } | 230 | } |
221 | |||
222 | } | 231 | } |
223 | 232 | ||
224 | static int audit_set_rate_limit(int limit, uid_t loginuid) | 233 | static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) |
225 | { | 234 | { |
226 | int old = audit_rate_limit; | 235 | int old = audit_rate_limit; |
227 | audit_rate_limit = limit; | 236 | |
228 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 237 | if (sid) { |
238 | char *ctx = NULL; | ||
239 | u32 len; | ||
240 | int rc; | ||
241 | if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) | ||
242 | return rc; | ||
243 | else | ||
244 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
245 | "audit_rate_limit=%d old=%d by auid=%u subj=%s", | ||
246 | limit, old, loginuid, ctx); | ||
247 | kfree(ctx); | ||
248 | } else | ||
249 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
229 | "audit_rate_limit=%d old=%d by auid=%u", | 250 | "audit_rate_limit=%d old=%d by auid=%u", |
230 | audit_rate_limit, old, loginuid); | 251 | limit, old, loginuid); |
252 | audit_rate_limit = limit; | ||
231 | return old; | 253 | return old; |
232 | } | 254 | } |
233 | 255 | ||
234 | static int audit_set_backlog_limit(int limit, uid_t loginuid) | 256 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) |
235 | { | 257 | { |
236 | int old = audit_backlog_limit; | 258 | int old = audit_backlog_limit; |
237 | audit_backlog_limit = limit; | 259 | |
238 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 260 | if (sid) { |
261 | char *ctx = NULL; | ||
262 | u32 len; | ||
263 | int rc; | ||
264 | if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) | ||
265 | return rc; | ||
266 | else | ||
267 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
268 | "audit_backlog_limit=%d old=%d by auid=%u subj=%s", | ||
269 | limit, old, loginuid, ctx); | ||
270 | kfree(ctx); | ||
271 | } else | ||
272 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
239 | "audit_backlog_limit=%d old=%d by auid=%u", | 273 | "audit_backlog_limit=%d old=%d by auid=%u", |
240 | audit_backlog_limit, old, loginuid); | 274 | limit, old, loginuid); |
275 | audit_backlog_limit = limit; | ||
241 | return old; | 276 | return old; |
242 | } | 277 | } |
243 | 278 | ||
244 | static int audit_set_enabled(int state, uid_t loginuid) | 279 | static int audit_set_enabled(int state, uid_t loginuid, u32 sid) |
245 | { | 280 | { |
246 | int old = audit_enabled; | 281 | int old = audit_enabled; |
282 | |||
247 | if (state != 0 && state != 1) | 283 | if (state != 0 && state != 1) |
248 | return -EINVAL; | 284 | return -EINVAL; |
249 | audit_enabled = state; | 285 | |
250 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 286 | if (sid) { |
287 | char *ctx = NULL; | ||
288 | u32 len; | ||
289 | int rc; | ||
290 | if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) | ||
291 | return rc; | ||
292 | else | ||
293 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
294 | "audit_enabled=%d old=%d by auid=%u subj=%s", | ||
295 | state, old, loginuid, ctx); | ||
296 | kfree(ctx); | ||
297 | } else | ||
298 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
251 | "audit_enabled=%d old=%d by auid=%u", | 299 | "audit_enabled=%d old=%d by auid=%u", |
252 | audit_enabled, old, loginuid); | 300 | state, old, loginuid); |
301 | audit_enabled = state; | ||
253 | return old; | 302 | return old; |
254 | } | 303 | } |
255 | 304 | ||
256 | static int audit_set_failure(int state, uid_t loginuid) | 305 | static int audit_set_failure(int state, uid_t loginuid, u32 sid) |
257 | { | 306 | { |
258 | int old = audit_failure; | 307 | int old = audit_failure; |
308 | |||
259 | if (state != AUDIT_FAIL_SILENT | 309 | if (state != AUDIT_FAIL_SILENT |
260 | && state != AUDIT_FAIL_PRINTK | 310 | && state != AUDIT_FAIL_PRINTK |
261 | && state != AUDIT_FAIL_PANIC) | 311 | && state != AUDIT_FAIL_PANIC) |
262 | return -EINVAL; | 312 | return -EINVAL; |
263 | audit_failure = state; | 313 | |
264 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 314 | if (sid) { |
315 | char *ctx = NULL; | ||
316 | u32 len; | ||
317 | int rc; | ||
318 | if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) | ||
319 | return rc; | ||
320 | else | ||
321 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
322 | "audit_failure=%d old=%d by auid=%u subj=%s", | ||
323 | state, old, loginuid, ctx); | ||
324 | kfree(ctx); | ||
325 | } else | ||
326 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
265 | "audit_failure=%d old=%d by auid=%u", | 327 | "audit_failure=%d old=%d by auid=%u", |
266 | audit_failure, old, loginuid); | 328 | state, old, loginuid); |
329 | audit_failure = state; | ||
267 | return old; | 330 | return old; |
268 | } | 331 | } |
269 | 332 | ||
@@ -300,8 +363,22 @@ static int kauditd_thread(void *dummy) | |||
300 | remove_wait_queue(&kauditd_wait, &wait); | 363 | remove_wait_queue(&kauditd_wait, &wait); |
301 | } | 364 | } |
302 | } | 365 | } |
366 | return 0; | ||
303 | } | 367 | } |
304 | 368 | ||
369 | /** | ||
370 | * audit_send_reply - send an audit reply message via netlink | ||
371 | * @pid: process id to send reply to | ||
372 | * @seq: sequence number | ||
373 | * @type: audit message type | ||
374 | * @done: done (last) flag | ||
375 | * @multi: multi-part message flag | ||
376 | * @payload: payload data | ||
377 | * @size: payload size | ||
378 | * | ||
379 | * Allocates an skb, builds the netlink message, and sends it to the pid. | ||
380 | * No failure notifications. | ||
381 | */ | ||
305 | void audit_send_reply(int pid, int seq, int type, int done, int multi, | 382 | void audit_send_reply(int pid, int seq, int type, int done, int multi, |
306 | void *payload, int size) | 383 | void *payload, int size) |
307 | { | 384 | { |
@@ -342,15 +419,19 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | |||
342 | switch (msg_type) { | 419 | switch (msg_type) { |
343 | case AUDIT_GET: | 420 | case AUDIT_GET: |
344 | case AUDIT_LIST: | 421 | case AUDIT_LIST: |
422 | case AUDIT_LIST_RULES: | ||
345 | case AUDIT_SET: | 423 | case AUDIT_SET: |
346 | case AUDIT_ADD: | 424 | case AUDIT_ADD: |
425 | case AUDIT_ADD_RULE: | ||
347 | case AUDIT_DEL: | 426 | case AUDIT_DEL: |
427 | case AUDIT_DEL_RULE: | ||
348 | case AUDIT_SIGNAL_INFO: | 428 | case AUDIT_SIGNAL_INFO: |
349 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) | 429 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) |
350 | err = -EPERM; | 430 | err = -EPERM; |
351 | break; | 431 | break; |
352 | case AUDIT_USER: | 432 | case AUDIT_USER: |
353 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 433 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
434 | case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: | ||
354 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) | 435 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) |
355 | err = -EPERM; | 436 | err = -EPERM; |
356 | break; | 437 | break; |
@@ -363,7 +444,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | |||
363 | 444 | ||
364 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | 445 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
365 | { | 446 | { |
366 | u32 uid, pid, seq; | 447 | u32 uid, pid, seq, sid; |
367 | void *data; | 448 | void *data; |
368 | struct audit_status *status_get, status_set; | 449 | struct audit_status *status_get, status_set; |
369 | int err; | 450 | int err; |
@@ -376,7 +457,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
376 | if (err) | 457 | if (err) |
377 | return err; | 458 | return err; |
378 | 459 | ||
379 | /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ | 460 | /* As soon as there's any sign of userspace auditd, |
461 | * start kauditd to talk to it */ | ||
380 | if (!kauditd_task) | 462 | if (!kauditd_task) |
381 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); | 463 | kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); |
382 | if (IS_ERR(kauditd_task)) { | 464 | if (IS_ERR(kauditd_task)) { |
@@ -388,6 +470,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
388 | pid = NETLINK_CREDS(skb)->pid; | 470 | pid = NETLINK_CREDS(skb)->pid; |
389 | uid = NETLINK_CREDS(skb)->uid; | 471 | uid = NETLINK_CREDS(skb)->uid; |
390 | loginuid = NETLINK_CB(skb).loginuid; | 472 | loginuid = NETLINK_CB(skb).loginuid; |
473 | sid = NETLINK_CB(skb).sid; | ||
391 | seq = nlh->nlmsg_seq; | 474 | seq = nlh->nlmsg_seq; |
392 | data = NLMSG_DATA(nlh); | 475 | data = NLMSG_DATA(nlh); |
393 | 476 | ||
@@ -408,28 +491,47 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
408 | return -EINVAL; | 491 | return -EINVAL; |
409 | status_get = (struct audit_status *)data; | 492 | status_get = (struct audit_status *)data; |
410 | if (status_get->mask & AUDIT_STATUS_ENABLED) { | 493 | if (status_get->mask & AUDIT_STATUS_ENABLED) { |
411 | err = audit_set_enabled(status_get->enabled, loginuid); | 494 | err = audit_set_enabled(status_get->enabled, |
495 | loginuid, sid); | ||
412 | if (err < 0) return err; | 496 | if (err < 0) return err; |
413 | } | 497 | } |
414 | if (status_get->mask & AUDIT_STATUS_FAILURE) { | 498 | if (status_get->mask & AUDIT_STATUS_FAILURE) { |
415 | err = audit_set_failure(status_get->failure, loginuid); | 499 | err = audit_set_failure(status_get->failure, |
500 | loginuid, sid); | ||
416 | if (err < 0) return err; | 501 | if (err < 0) return err; |
417 | } | 502 | } |
418 | if (status_get->mask & AUDIT_STATUS_PID) { | 503 | if (status_get->mask & AUDIT_STATUS_PID) { |
419 | int old = audit_pid; | 504 | int old = audit_pid; |
505 | if (sid) { | ||
506 | char *ctx = NULL; | ||
507 | u32 len; | ||
508 | int rc; | ||
509 | if ((rc = selinux_ctxid_to_string( | ||
510 | sid, &ctx, &len))) | ||
511 | return rc; | ||
512 | else | ||
513 | audit_log(NULL, GFP_KERNEL, | ||
514 | AUDIT_CONFIG_CHANGE, | ||
515 | "audit_pid=%d old=%d by auid=%u subj=%s", | ||
516 | status_get->pid, old, | ||
517 | loginuid, ctx); | ||
518 | kfree(ctx); | ||
519 | } else | ||
520 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
521 | "audit_pid=%d old=%d by auid=%u", | ||
522 | status_get->pid, old, loginuid); | ||
420 | audit_pid = status_get->pid; | 523 | audit_pid = status_get->pid; |
421 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
422 | "audit_pid=%d old=%d by auid=%u", | ||
423 | audit_pid, old, loginuid); | ||
424 | } | 524 | } |
425 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 525 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) |
426 | audit_set_rate_limit(status_get->rate_limit, loginuid); | 526 | audit_set_rate_limit(status_get->rate_limit, |
527 | loginuid, sid); | ||
427 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | 528 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) |
428 | audit_set_backlog_limit(status_get->backlog_limit, | 529 | audit_set_backlog_limit(status_get->backlog_limit, |
429 | loginuid); | 530 | loginuid, sid); |
430 | break; | 531 | break; |
431 | case AUDIT_USER: | 532 | case AUDIT_USER: |
432 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 533 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
534 | case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: | ||
433 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) | 535 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
434 | return 0; | 536 | return 0; |
435 | 537 | ||
@@ -439,8 +541,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
439 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 541 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
440 | if (ab) { | 542 | if (ab) { |
441 | audit_log_format(ab, | 543 | audit_log_format(ab, |
442 | "user pid=%d uid=%u auid=%u msg='%.1024s'", | 544 | "user pid=%d uid=%u auid=%u", |
443 | pid, uid, loginuid, (char *)data); | 545 | pid, uid, loginuid); |
546 | if (sid) { | ||
547 | char *ctx = NULL; | ||
548 | u32 len; | ||
549 | if (selinux_ctxid_to_string( | ||
550 | sid, &ctx, &len)) { | ||
551 | audit_log_format(ab, | ||
552 | " ssid=%u", sid); | ||
553 | /* Maybe call audit_panic? */ | ||
554 | } else | ||
555 | audit_log_format(ab, | ||
556 | " subj=%s", ctx); | ||
557 | kfree(ctx); | ||
558 | } | ||
559 | audit_log_format(ab, " msg='%.1024s'", | ||
560 | (char *)data); | ||
444 | audit_set_pid(ab, pid); | 561 | audit_set_pid(ab, pid); |
445 | audit_log_end(ab); | 562 | audit_log_end(ab); |
446 | } | 563 | } |
@@ -448,12 +565,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
448 | break; | 565 | break; |
449 | case AUDIT_ADD: | 566 | case AUDIT_ADD: |
450 | case AUDIT_DEL: | 567 | case AUDIT_DEL: |
451 | if (nlh->nlmsg_len < sizeof(struct audit_rule)) | 568 | if (nlmsg_len(nlh) < sizeof(struct audit_rule)) |
452 | return -EINVAL; | 569 | return -EINVAL; |
453 | /* fallthrough */ | 570 | /* fallthrough */ |
454 | case AUDIT_LIST: | 571 | case AUDIT_LIST: |
455 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, | 572 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, |
456 | uid, seq, data, loginuid); | 573 | uid, seq, data, nlmsg_len(nlh), |
574 | loginuid, sid); | ||
575 | break; | ||
576 | case AUDIT_ADD_RULE: | ||
577 | case AUDIT_DEL_RULE: | ||
578 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) | ||
579 | return -EINVAL; | ||
580 | /* fallthrough */ | ||
581 | case AUDIT_LIST_RULES: | ||
582 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, | ||
583 | uid, seq, data, nlmsg_len(nlh), | ||
584 | loginuid, sid); | ||
457 | break; | 585 | break; |
458 | case AUDIT_SIGNAL_INFO: | 586 | case AUDIT_SIGNAL_INFO: |
459 | sig_data.uid = audit_sig_uid; | 587 | sig_data.uid = audit_sig_uid; |
@@ -469,9 +597,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
469 | return err < 0 ? err : 0; | 597 | return err < 0 ? err : 0; |
470 | } | 598 | } |
471 | 599 | ||
472 | /* Get message from skb (based on rtnetlink_rcv_skb). Each message is | 600 | /* |
601 | * Get message from skb (based on rtnetlink_rcv_skb). Each message is | ||
473 | * processed by audit_receive_msg. Malformed skbs with wrong length are | 602 | * processed by audit_receive_msg. Malformed skbs with wrong length are |
474 | * discarded silently. */ | 603 | * discarded silently. |
604 | */ | ||
475 | static void audit_receive_skb(struct sk_buff *skb) | 605 | static void audit_receive_skb(struct sk_buff *skb) |
476 | { | 606 | { |
477 | int err; | 607 | int err; |
@@ -499,14 +629,14 @@ static void audit_receive(struct sock *sk, int length) | |||
499 | struct sk_buff *skb; | 629 | struct sk_buff *skb; |
500 | unsigned int qlen; | 630 | unsigned int qlen; |
501 | 631 | ||
502 | down(&audit_netlink_sem); | 632 | mutex_lock(&audit_netlink_mutex); |
503 | 633 | ||
504 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { | 634 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { |
505 | skb = skb_dequeue(&sk->sk_receive_queue); | 635 | skb = skb_dequeue(&sk->sk_receive_queue); |
506 | audit_receive_skb(skb); | 636 | audit_receive_skb(skb); |
507 | kfree_skb(skb); | 637 | kfree_skb(skb); |
508 | } | 638 | } |
509 | up(&audit_netlink_sem); | 639 | mutex_unlock(&audit_netlink_mutex); |
510 | } | 640 | } |
511 | 641 | ||
512 | 642 | ||
@@ -519,11 +649,17 @@ static int __init audit_init(void) | |||
519 | THIS_MODULE); | 649 | THIS_MODULE); |
520 | if (!audit_sock) | 650 | if (!audit_sock) |
521 | audit_panic("cannot initialize netlink socket"); | 651 | audit_panic("cannot initialize netlink socket"); |
652 | else | ||
653 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; | ||
522 | 654 | ||
523 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; | ||
524 | skb_queue_head_init(&audit_skb_queue); | 655 | skb_queue_head_init(&audit_skb_queue); |
525 | audit_initialized = 1; | 656 | audit_initialized = 1; |
526 | audit_enabled = audit_default; | 657 | audit_enabled = audit_default; |
658 | |||
659 | /* Register the callback with selinux. This callback will be invoked | ||
660 | * when a new policy is loaded. */ | ||
661 | selinux_audit_set_callback(&selinux_audit_rule_update); | ||
662 | |||
527 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); | 663 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); |
528 | return 0; | 664 | return 0; |
529 | } | 665 | } |
@@ -538,7 +674,7 @@ static int __init audit_enable(char *str) | |||
538 | audit_initialized ? "" : " (after initialization)"); | 674 | audit_initialized ? "" : " (after initialization)"); |
539 | if (audit_initialized) | 675 | if (audit_initialized) |
540 | audit_enabled = audit_default; | 676 | audit_enabled = audit_default; |
541 | return 0; | 677 | return 1; |
542 | } | 678 | } |
543 | 679 | ||
544 | __setup("audit=", audit_enable); | 680 | __setup("audit=", audit_enable); |
@@ -600,7 +736,10 @@ err: | |||
600 | return NULL; | 736 | return NULL; |
601 | } | 737 | } |
602 | 738 | ||
603 | /* Compute a serial number for the audit record. Audit records are | 739 | /** |
740 | * audit_serial - compute a serial number for the audit record | ||
741 | * | ||
742 | * Compute a serial number for the audit record. Audit records are | ||
604 | * written to user-space as soon as they are generated, so a complete | 743 | * written to user-space as soon as they are generated, so a complete |
605 | * audit record may be written in several pieces. The timestamp of the | 744 | * audit record may be written in several pieces. The timestamp of the |
606 | * record and this serial number are used by the user-space tools to | 745 | * record and this serial number are used by the user-space tools to |
@@ -612,8 +751,8 @@ err: | |||
612 | * audit context (for those records that have a context), and emit them | 751 | * audit context (for those records that have a context), and emit them |
613 | * all at syscall exit. However, this could delay the reporting of | 752 | * all at syscall exit. However, this could delay the reporting of |
614 | * significant errors until syscall exit (or never, if the system | 753 | * significant errors until syscall exit (or never, if the system |
615 | * halts). */ | 754 | * halts). |
616 | 755 | */ | |
617 | unsigned int audit_serial(void) | 756 | unsigned int audit_serial(void) |
618 | { | 757 | { |
619 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; | 758 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; |
@@ -649,6 +788,21 @@ static inline void audit_get_stamp(struct audit_context *ctx, | |||
649 | * will be written at syscall exit. If there is no associated task, tsk | 788 | * will be written at syscall exit. If there is no associated task, tsk |
650 | * should be NULL. */ | 789 | * should be NULL. */ |
651 | 790 | ||
791 | /** | ||
792 | * audit_log_start - obtain an audit buffer | ||
793 | * @ctx: audit_context (may be NULL) | ||
794 | * @gfp_mask: type of allocation | ||
795 | * @type: audit message type | ||
796 | * | ||
797 | * Returns audit_buffer pointer on success or NULL on error. | ||
798 | * | ||
799 | * Obtain an audit buffer. This routine does locking to obtain the | ||
800 | * audit buffer, but then no locking is required for calls to | ||
801 | * audit_log_*format. If the task (ctx) is a task that is currently in a | ||
802 | * syscall, then the syscall is marked as auditable and an audit record | ||
803 | * will be written at syscall exit. If there is no associated task, then | ||
804 | * task context (ctx) should be NULL. | ||
805 | */ | ||
652 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | 806 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, |
653 | int type) | 807 | int type) |
654 | { | 808 | { |
@@ -661,6 +815,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | |||
661 | if (!audit_initialized) | 815 | if (!audit_initialized) |
662 | return NULL; | 816 | return NULL; |
663 | 817 | ||
818 | if (unlikely(audit_filter_type(type))) | ||
819 | return NULL; | ||
820 | |||
664 | if (gfp_mask & __GFP_WAIT) | 821 | if (gfp_mask & __GFP_WAIT) |
665 | reserve = 0; | 822 | reserve = 0; |
666 | else | 823 | else |
@@ -713,6 +870,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | |||
713 | /** | 870 | /** |
714 | * audit_expand - expand skb in the audit buffer | 871 | * audit_expand - expand skb in the audit buffer |
715 | * @ab: audit_buffer | 872 | * @ab: audit_buffer |
873 | * @extra: space to add at tail of the skb | ||
716 | * | 874 | * |
717 | * Returns 0 (no space) on failed expansion, or available space if | 875 | * Returns 0 (no space) on failed expansion, or available space if |
718 | * successful. | 876 | * successful. |
@@ -729,10 +887,12 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) | |||
729 | return skb_tailroom(skb); | 887 | return skb_tailroom(skb); |
730 | } | 888 | } |
731 | 889 | ||
732 | /* Format an audit message into the audit buffer. If there isn't enough | 890 | /* |
891 | * Format an audit message into the audit buffer. If there isn't enough | ||
733 | * room in the audit buffer, more room will be allocated and vsnprint | 892 | * room in the audit buffer, more room will be allocated and vsnprint |
734 | * will be called a second time. Currently, we assume that a printk | 893 | * will be called a second time. Currently, we assume that a printk |
735 | * can't format message larger than 1024 bytes, so we don't either. */ | 894 | * can't format message larger than 1024 bytes, so we don't either. |
895 | */ | ||
736 | static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, | 896 | static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, |
737 | va_list args) | 897 | va_list args) |
738 | { | 898 | { |
@@ -757,7 +917,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, | |||
757 | /* The printk buffer is 1024 bytes long, so if we get | 917 | /* The printk buffer is 1024 bytes long, so if we get |
758 | * here and AUDIT_BUFSIZ is at least 1024, then we can | 918 | * here and AUDIT_BUFSIZ is at least 1024, then we can |
759 | * log everything that printk could have logged. */ | 919 | * log everything that printk could have logged. */ |
760 | avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); | 920 | avail = audit_expand(ab, |
921 | max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); | ||
761 | if (!avail) | 922 | if (!avail) |
762 | goto out; | 923 | goto out; |
763 | len = vsnprintf(skb->tail, avail, fmt, args2); | 924 | len = vsnprintf(skb->tail, avail, fmt, args2); |
@@ -768,8 +929,14 @@ out: | |||
768 | return; | 929 | return; |
769 | } | 930 | } |
770 | 931 | ||
771 | /* Format a message into the audit buffer. All the work is done in | 932 | /** |
772 | * audit_log_vformat. */ | 933 | * audit_log_format - format a message into the audit buffer. |
934 | * @ab: audit_buffer | ||
935 | * @fmt: format string | ||
936 | * @...: optional parameters matching @fmt string | ||
937 | * | ||
938 | * All the work is done in audit_log_vformat. | ||
939 | */ | ||
773 | void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) | 940 | void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) |
774 | { | 941 | { |
775 | va_list args; | 942 | va_list args; |
@@ -781,9 +948,18 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) | |||
781 | va_end(args); | 948 | va_end(args); |
782 | } | 949 | } |
783 | 950 | ||
784 | /* This function will take the passed buf and convert it into a string of | 951 | /** |
785 | * ascii hex digits. The new string is placed onto the skb. */ | 952 | * audit_log_hex - convert a buffer to hex and append it to the audit skb |
786 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | 953 | * @ab: the audit_buffer |
954 | * @buf: buffer to convert to hex | ||
955 | * @len: length of @buf to be converted | ||
956 | * | ||
957 | * No return value; failure to expand is silently ignored. | ||
958 | * | ||
959 | * This function will take the passed buf and convert it into a string of | ||
960 | * ascii hex digits. The new string is placed onto the skb. | ||
961 | */ | ||
962 | void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | ||
787 | size_t len) | 963 | size_t len) |
788 | { | 964 | { |
789 | int i, avail, new_len; | 965 | int i, avail, new_len; |
@@ -812,10 +988,16 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | |||
812 | skb_put(skb, len << 1); /* new string is twice the old string */ | 988 | skb_put(skb, len << 1); /* new string is twice the old string */ |
813 | } | 989 | } |
814 | 990 | ||
815 | /* This code will escape a string that is passed to it if the string | 991 | /** |
816 | * contains a control character, unprintable character, double quote mark, | 992 | * audit_log_unstrustedstring - log a string that may contain random characters |
993 | * @ab: audit_buffer | ||
994 | * @string: string to be logged | ||
995 | * | ||
996 | * This code will escape a string that is passed to it if the string | ||
997 | * contains a control character, unprintable character, double quote mark, | ||
817 | * or a space. Unescaped strings will start and end with a double quote mark. | 998 | * or a space. Unescaped strings will start and end with a double quote mark. |
818 | * Strings that are escaped are printed in hex (2 digits per char). */ | 999 | * Strings that are escaped are printed in hex (2 digits per char). |
1000 | */ | ||
819 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | 1001 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) |
820 | { | 1002 | { |
821 | const unsigned char *p = string; | 1003 | const unsigned char *p = string; |
@@ -854,10 +1036,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | |||
854 | kfree(path); | 1036 | kfree(path); |
855 | } | 1037 | } |
856 | 1038 | ||
857 | /* The netlink_* functions cannot be called inside an irq context, so | 1039 | /** |
858 | * the audit buffer is places on a queue and a tasklet is scheduled to | 1040 | * audit_log_end - end one audit record |
1041 | * @ab: the audit_buffer | ||
1042 | * | ||
1043 | * The netlink_* functions cannot be called inside an irq context, so | ||
1044 | * the audit buffer is placed on a queue and a tasklet is scheduled to | ||
859 | * remove them from the queue outside the irq context. May be called in | 1045 | * remove them from the queue outside the irq context. May be called in |
860 | * any context. */ | 1046 | * any context. |
1047 | */ | ||
861 | void audit_log_end(struct audit_buffer *ab) | 1048 | void audit_log_end(struct audit_buffer *ab) |
862 | { | 1049 | { |
863 | if (!ab) | 1050 | if (!ab) |
@@ -878,9 +1065,18 @@ void audit_log_end(struct audit_buffer *ab) | |||
878 | audit_buffer_free(ab); | 1065 | audit_buffer_free(ab); |
879 | } | 1066 | } |
880 | 1067 | ||
881 | /* Log an audit record. This is a convenience function that calls | 1068 | /** |
882 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | 1069 | * audit_log - Log an audit record |
883 | * called in any context. */ | 1070 | * @ctx: audit context |
1071 | * @gfp_mask: type of allocation | ||
1072 | * @type: audit message type | ||
1073 | * @fmt: format string to use | ||
1074 | * @...: variable parameters matching the format string | ||
1075 | * | ||
1076 | * This is a convenience function that calls audit_log_start, | ||
1077 | * audit_log_vformat, and audit_log_end. It may be called | ||
1078 | * in any context. | ||
1079 | */ | ||
884 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, | 1080 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, |
885 | const char *fmt, ...) | 1081 | const char *fmt, ...) |
886 | { | 1082 | { |
@@ -895,3 +1091,8 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, | |||
895 | audit_log_end(ab); | 1091 | audit_log_end(ab); |
896 | } | 1092 | } |
897 | } | 1093 | } |
1094 | |||
1095 | EXPORT_SYMBOL(audit_log_start); | ||
1096 | EXPORT_SYMBOL(audit_log_end); | ||
1097 | EXPORT_SYMBOL(audit_log_format); | ||
1098 | EXPORT_SYMBOL(audit_log); | ||
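With audit_log_start(), audit_log_format(), audit_log_end() and audit_log() now documented and exported above, other kernel code can emit records either in one shot or piecewise. The sketch below shows a typical caller; example_report_event() is a hypothetical function for illustration, not part of this patch, and it reuses the AUDIT_KERNEL message type seen elsewhere in this file.

/* Minimal sketch of a caller of the exported audit logging interfaces. */
#include <linux/audit.h>
#include <linux/gfp.h>

static void example_report_event(int err, const char *name)
{
	struct audit_buffer *ab;

	/* One-shot convenience wrapper: start, format and end in one call. */
	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "example res=%d", err);

	/* Or build the record piecewise when fields are added conditionally. */
	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_KERNEL);
	if (!ab)
		return;		/* auditing disabled, filtered, or no memory */
	audit_log_format(ab, "example res=%d", err);
	if (name) {
		audit_log_format(ab, " name=");
		/* escapes/hex-encodes untrusted bytes, as documented above */
		audit_log_untrustedstring(ab, name);
	}
	audit_log_end(ab);
}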
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 0000000000..6f733920fd
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,92 @@
+/* audit -- definition of audit_context structure and supporting types
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/audit.h>
+
+/* 0 = no checking
+   1 = put_count checking
+   2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* At task start time, the audit_state is set in the audit_context using
+   a per-task filter.  At syscall entry, the audit_state is augmented by
+   the syscall filter. */
+enum audit_state {
+	AUDIT_DISABLED,		/* Do not create per-task audit_context.
+				 * No syscall-specific audit records can
+				 * be generated. */
+	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
+				 * but don't necessarily fill it in at
+				 * syscall entry time (i.e., filter
+				 * instead). */
+	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
+				 * and always fill it in at syscall
+				 * entry time.  This makes a full
+				 * syscall record available if some
+				 * other part of the kernel decides it
+				 * should be recorded. */
+	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
+				 * always fill it in at syscall entry
+				 * time, and always write out the audit
+				 * record at syscall exit time. */
+};
+
+/* Rule lists */
+struct audit_field {
+	u32				type;
+	u32				val;
+	u32				op;
+	char				*se_str;
+	struct selinux_audit_rule	*se_rule;
+};
+
+struct audit_krule {
+	int			vers_ops;
+	u32			flags;
+	u32			listnr;
+	u32			action;
+	u32			mask[AUDIT_BITMASK_SIZE];
+	u32			buflen;	/* for data alloc on list rules */
+	u32			field_count;
+	struct audit_field	*fields;
+};
+
+struct audit_entry {
+	struct list_head	list;
+	struct rcu_head		rcu;
+	struct audit_krule	rule;
+};
+
+
+extern int audit_pid;
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+
+extern void		audit_send_reply(int pid, int seq, int type,
+					 int done, int multi,
+					 void *payload, int size);
+extern void		audit_log_lost(const char *message);
+extern void		audit_panic(const char *message);
+extern struct mutex audit_netlink_mutex;
+
+extern int selinux_audit_rule_update(void);
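The audit_field/audit_krule structures above are populated from the struct audit_rule_data blob that user space ships with AUDIT_ADD_RULE (see audit_data_to_entry() and audit_unpack_string() in auditfilter.c below): each SELinux string field stores the string length in values[i] and appends the string itself to buf[]. Below is a user-space sketch of building such a blob for a single AUDIT_SE_USER field; it assumes the struct audit_rule_data layout exposed by this era's <linux/audit.h> and leaves out the netlink send and error handling.

/* Sketch of the wire format consumed by audit_data_to_entry(). */
#include <linux/audit.h>
#include <string.h>
#include <stdlib.h>

static struct audit_rule_data *build_se_user_rule(const char *se_user,
						  size_t *out_len)
{
	size_t slen = strlen(se_user);
	struct audit_rule_data *d = calloc(1, sizeof(*d) + slen);

	if (!d)
		return NULL;
	d->flags = AUDIT_FILTER_EXIT;	/* which filter list the rule joins */
	d->action = AUDIT_ALWAYS;
	d->field_count = 1;
	d->fields[0] = AUDIT_SE_USER;	/* field type */
	d->fieldflags[0] = AUDIT_EQUAL;	/* operator */
	d->values[0] = slen;		/* string length, read by audit_unpack_string() */
	d->buflen = slen;
	memcpy(d->buf, se_user, slen);	/* string bytes, not NUL-terminated */

	*out_len = sizeof(*d) + slen;	/* netlink payload size for AUDIT_ADD_RULE */
	return d;
}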
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 0000000000..7c134906d6
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,857 @@ | |||
1 | /* auditfilter.c -- filtering of audit events | ||
2 | * | ||
3 | * Copyright 2003-2004 Red Hat, Inc. | ||
4 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | ||
5 | * Copyright 2005 IBM Corporation | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or | ||
10 | * (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/audit.h> | ||
24 | #include <linux/kthread.h> | ||
25 | #include <linux/netlink.h> | ||
26 | #include <linux/selinux.h> | ||
27 | #include "audit.h" | ||
28 | |||
29 | /* There are three lists of rules -- one to search at task creation | ||
30 | * time, one to search at syscall entry time, and another to search at | ||
31 | * syscall exit time. */ | ||
32 | struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { | ||
33 | LIST_HEAD_INIT(audit_filter_list[0]), | ||
34 | LIST_HEAD_INIT(audit_filter_list[1]), | ||
35 | LIST_HEAD_INIT(audit_filter_list[2]), | ||
36 | LIST_HEAD_INIT(audit_filter_list[3]), | ||
37 | LIST_HEAD_INIT(audit_filter_list[4]), | ||
38 | LIST_HEAD_INIT(audit_filter_list[5]), | ||
39 | #if AUDIT_NR_FILTERS != 6 | ||
40 | #error Fix audit_filter_list initialiser | ||
41 | #endif | ||
42 | }; | ||
43 | |||
44 | static inline void audit_free_rule(struct audit_entry *e) | ||
45 | { | ||
46 | int i; | ||
47 | if (e->rule.fields) | ||
48 | for (i = 0; i < e->rule.field_count; i++) { | ||
49 | struct audit_field *f = &e->rule.fields[i]; | ||
50 | kfree(f->se_str); | ||
51 | selinux_audit_rule_free(f->se_rule); | ||
52 | } | ||
53 | kfree(e->rule.fields); | ||
54 | kfree(e); | ||
55 | } | ||
56 | |||
57 | static inline void audit_free_rule_rcu(struct rcu_head *head) | ||
58 | { | ||
59 | struct audit_entry *e = container_of(head, struct audit_entry, rcu); | ||
60 | audit_free_rule(e); | ||
61 | } | ||
62 | |||
63 | /* Initialize an audit filterlist entry. */ | ||
64 | static inline struct audit_entry *audit_init_entry(u32 field_count) | ||
65 | { | ||
66 | struct audit_entry *entry; | ||
67 | struct audit_field *fields; | ||
68 | |||
69 | entry = kzalloc(sizeof(*entry), GFP_KERNEL); | ||
70 | if (unlikely(!entry)) | ||
71 | return NULL; | ||
72 | |||
73 | fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); | ||
74 | if (unlikely(!fields)) { | ||
75 | kfree(entry); | ||
76 | return NULL; | ||
77 | } | ||
78 | entry->rule.fields = fields; | ||
79 | |||
80 | return entry; | ||
81 | } | ||
82 | |||
83 | /* Unpack a filter field's string representation from user-space | ||
84 | * buffer. */ | ||
85 | static char *audit_unpack_string(void **bufp, size_t *remain, size_t len) | ||
86 | { | ||
87 | char *str; | ||
88 | |||
89 | if (!*bufp || (len == 0) || (len > *remain)) | ||
90 | return ERR_PTR(-EINVAL); | ||
91 | |||
92 | /* Of the currently implemented string fields, PATH_MAX | ||
93 | * defines the longest valid length. | ||
94 | */ | ||
95 | if (len > PATH_MAX) | ||
96 | return ERR_PTR(-ENAMETOOLONG); | ||
97 | |||
98 | str = kmalloc(len + 1, GFP_KERNEL); | ||
99 | if (unlikely(!str)) | ||
100 | return ERR_PTR(-ENOMEM); | ||
101 | |||
102 | memcpy(str, *bufp, len); | ||
103 | str[len] = 0; | ||
104 | *bufp += len; | ||
105 | *remain -= len; | ||
106 | |||
107 | return str; | ||
108 | } | ||
109 | |||
110 | /* Common user-space to kernel rule translation. */ | ||
111 | static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) | ||
112 | { | ||
113 | unsigned listnr; | ||
114 | struct audit_entry *entry; | ||
115 | int i, err; | ||
116 | |||
117 | err = -EINVAL; | ||
118 | listnr = rule->flags & ~AUDIT_FILTER_PREPEND; | ||
119 | switch(listnr) { | ||
120 | default: | ||
121 | goto exit_err; | ||
122 | case AUDIT_FILTER_USER: | ||
123 | case AUDIT_FILTER_TYPE: | ||
124 | #ifdef CONFIG_AUDITSYSCALL | ||
125 | case AUDIT_FILTER_ENTRY: | ||
126 | case AUDIT_FILTER_EXIT: | ||
127 | case AUDIT_FILTER_TASK: | ||
128 | #endif | ||
129 | ; | ||
130 | } | ||
131 | if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && | ||
132 | rule->action != AUDIT_ALWAYS) | ||
133 | goto exit_err; | ||
134 | if (rule->field_count > AUDIT_MAX_FIELDS) | ||
135 | goto exit_err; | ||
136 | |||
137 | err = -ENOMEM; | ||
138 | entry = audit_init_entry(rule->field_count); | ||
139 | if (!entry) | ||
140 | goto exit_err; | ||
141 | |||
142 | entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; | ||
143 | entry->rule.listnr = listnr; | ||
144 | entry->rule.action = rule->action; | ||
145 | entry->rule.field_count = rule->field_count; | ||
146 | |||
147 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | ||
148 | entry->rule.mask[i] = rule->mask[i]; | ||
149 | |||
150 | return entry; | ||
151 | |||
152 | exit_err: | ||
153 | return ERR_PTR(err); | ||
154 | } | ||
155 | |||
156 | /* Translate struct audit_rule to kernel's rule respresentation. | ||
157 | * Exists for backward compatibility with userspace. */ | ||
158 | static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | ||
159 | { | ||
160 | struct audit_entry *entry; | ||
161 | int err = 0; | ||
162 | int i; | ||
163 | |||
164 | entry = audit_to_entry_common(rule); | ||
165 | if (IS_ERR(entry)) | ||
166 | goto exit_nofree; | ||
167 | |||
168 | for (i = 0; i < rule->field_count; i++) { | ||
169 | struct audit_field *f = &entry->rule.fields[i]; | ||
170 | |||
171 | f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); | ||
172 | f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); | ||
173 | f->val = rule->values[i]; | ||
174 | |||
175 | if (f->type & AUDIT_UNUSED_BITS || | ||
176 | f->type == AUDIT_SE_USER || | ||
177 | f->type == AUDIT_SE_ROLE || | ||
178 | f->type == AUDIT_SE_TYPE || | ||
179 | f->type == AUDIT_SE_SEN || | ||
180 | f->type == AUDIT_SE_CLR) { | ||
181 | err = -EINVAL; | ||
182 | goto exit_free; | ||
183 | } | ||
184 | |||
185 | entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; | ||
186 | |||
187 | /* Support for legacy operators where | ||
188 | * AUDIT_NEGATE bit signifies != and otherwise assumes == */ | ||
189 | if (f->op & AUDIT_NEGATE) | ||
190 | f->op = AUDIT_NOT_EQUAL; | ||
191 | else if (!f->op) | ||
192 | f->op = AUDIT_EQUAL; | ||
193 | else if (f->op == AUDIT_OPERATORS) { | ||
194 | err = -EINVAL; | ||
195 | goto exit_free; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | exit_nofree: | ||
200 | return entry; | ||
201 | |||
202 | exit_free: | ||
203 | audit_free_rule(entry); | ||
204 | return ERR_PTR(err); | ||
205 | } | ||
206 | |||
207 | /* Translate struct audit_rule_data to kernel's rule respresentation. */ | ||
208 | static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | ||
209 | size_t datasz) | ||
210 | { | ||
211 | int err = 0; | ||
212 | struct audit_entry *entry; | ||
213 | void *bufp; | ||
214 | size_t remain = datasz - sizeof(struct audit_rule_data); | ||
215 | int i; | ||
216 | char *str; | ||
217 | |||
218 | entry = audit_to_entry_common((struct audit_rule *)data); | ||
219 | if (IS_ERR(entry)) | ||
220 | goto exit_nofree; | ||
221 | |||
222 | bufp = data->buf; | ||
223 | entry->rule.vers_ops = 2; | ||
224 | for (i = 0; i < data->field_count; i++) { | ||
225 | struct audit_field *f = &entry->rule.fields[i]; | ||
226 | |||
227 | err = -EINVAL; | ||
228 | if (!(data->fieldflags[i] & AUDIT_OPERATORS) || | ||
229 | data->fieldflags[i] & ~AUDIT_OPERATORS) | ||
230 | goto exit_free; | ||
231 | |||
232 | f->op = data->fieldflags[i] & AUDIT_OPERATORS; | ||
233 | f->type = data->fields[i]; | ||
234 | f->val = data->values[i]; | ||
235 | f->se_str = NULL; | ||
236 | f->se_rule = NULL; | ||
237 | switch(f->type) { | ||
238 | case AUDIT_SE_USER: | ||
239 | case AUDIT_SE_ROLE: | ||
240 | case AUDIT_SE_TYPE: | ||
241 | case AUDIT_SE_SEN: | ||
242 | case AUDIT_SE_CLR: | ||
243 | str = audit_unpack_string(&bufp, &remain, f->val); | ||
244 | if (IS_ERR(str)) | ||
245 | goto exit_free; | ||
246 | entry->rule.buflen += f->val; | ||
247 | |||
248 | err = selinux_audit_rule_init(f->type, f->op, str, | ||
249 | &f->se_rule); | ||
250 | /* Keep currently invalid fields around in case they | ||
251 | * become valid after a policy reload. */ | ||
252 | if (err == -EINVAL) { | ||
253 | printk(KERN_WARNING "audit rule for selinux " | ||
254 | "\'%s\' is invalid\n", str); | ||
255 | err = 0; | ||
256 | } | ||
257 | if (err) { | ||
258 | kfree(str); | ||
259 | goto exit_free; | ||
260 | } else | ||
261 | f->se_str = str; | ||
262 | break; | ||
263 | } | ||
264 | } | ||
265 | |||
266 | exit_nofree: | ||
267 | return entry; | ||
268 | |||
269 | exit_free: | ||
270 | audit_free_rule(entry); | ||
271 | return ERR_PTR(err); | ||
272 | } | ||
273 | |||
274 | /* Pack a filter field's string representation into data block. */ | ||
275 | static inline size_t audit_pack_string(void **bufp, char *str) | ||
276 | { | ||
277 | size_t len = strlen(str); | ||
278 | |||
279 | memcpy(*bufp, str, len); | ||
280 | *bufp += len; | ||
281 | |||
282 | return len; | ||
283 | } | ||
284 | |||
285 | /* Translate kernel rule respresentation to struct audit_rule. | ||
286 | * Exists for backward compatibility with userspace. */ | ||
287 | static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) | ||
288 | { | ||
289 | struct audit_rule *rule; | ||
290 | int i; | ||
291 | |||
292 | rule = kmalloc(sizeof(*rule), GFP_KERNEL); | ||
293 | if (unlikely(!rule)) | ||
294 | return ERR_PTR(-ENOMEM); | ||
295 | memset(rule, 0, sizeof(*rule)); | ||
296 | |||
297 | rule->flags = krule->flags | krule->listnr; | ||
298 | rule->action = krule->action; | ||
299 | rule->field_count = krule->field_count; | ||
300 | for (i = 0; i < rule->field_count; i++) { | ||
301 | rule->values[i] = krule->fields[i].val; | ||
302 | rule->fields[i] = krule->fields[i].type; | ||
303 | |||
304 | if (krule->vers_ops == 1) { | ||
305 | if (krule->fields[i].op & AUDIT_NOT_EQUAL) | ||
306 | rule->fields[i] |= AUDIT_NEGATE; | ||
307 | } else { | ||
308 | rule->fields[i] |= krule->fields[i].op; | ||
309 | } | ||
310 | } | ||
311 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; | ||
312 | |||
313 | return rule; | ||
314 | } | ||
315 | |||
316 | /* Translate kernel rule respresentation to struct audit_rule_data. */ | ||
317 | static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | ||
318 | { | ||
319 | struct audit_rule_data *data; | ||
320 | void *bufp; | ||
321 | int i; | ||
322 | |||
323 | data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); | ||
324 | if (unlikely(!data)) | ||
325 | return ERR_PTR(-ENOMEM); | ||
326 | memset(data, 0, sizeof(*data)); | ||
327 | |||
328 | data->flags = krule->flags | krule->listnr; | ||
329 | data->action = krule->action; | ||
330 | data->field_count = krule->field_count; | ||
331 | bufp = data->buf; | ||
332 | for (i = 0; i < data->field_count; i++) { | ||
333 | struct audit_field *f = &krule->fields[i]; | ||
334 | |||
335 | data->fields[i] = f->type; | ||
336 | data->fieldflags[i] = f->op; | ||
337 | switch(f->type) { | ||
338 | case AUDIT_SE_USER: | ||
339 | case AUDIT_SE_ROLE: | ||
340 | case AUDIT_SE_TYPE: | ||
341 | case AUDIT_SE_SEN: | ||
342 | case AUDIT_SE_CLR: | ||
343 | data->buflen += data->values[i] = | ||
344 | audit_pack_string(&bufp, f->se_str); | ||
345 | break; | ||
346 | default: | ||
347 | data->values[i] = f->val; | ||
348 | } | ||
349 | } | ||
350 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; | ||
351 | |||
352 | return data; | ||
353 | } | ||
354 | |||
355 | /* Compare two rules in kernel format. Considered success if rules | ||
356 | * don't match. */ | ||
357 | static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | ||
358 | { | ||
359 | int i; | ||
360 | |||
361 | if (a->flags != b->flags || | ||
362 | a->listnr != b->listnr || | ||
363 | a->action != b->action || | ||
364 | a->field_count != b->field_count) | ||
365 | return 1; | ||
366 | |||
367 | for (i = 0; i < a->field_count; i++) { | ||
368 | if (a->fields[i].type != b->fields[i].type || | ||
369 | a->fields[i].op != b->fields[i].op) | ||
370 | return 1; | ||
371 | |||
372 | switch(a->fields[i].type) { | ||
373 | case AUDIT_SE_USER: | ||
374 | case AUDIT_SE_ROLE: | ||
375 | case AUDIT_SE_TYPE: | ||
376 | case AUDIT_SE_SEN: | ||
377 | case AUDIT_SE_CLR: | ||
378 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) | ||
379 | return 1; | ||
380 | break; | ||
381 | default: | ||
382 | if (a->fields[i].val != b->fields[i].val) | ||
383 | return 1; | ||
384 | } | ||
385 | } | ||
386 | |||
387 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | ||
388 | if (a->mask[i] != b->mask[i]) | ||
389 | return 1; | ||
390 | |||
391 | return 0; | ||
392 | } | ||
393 | |||
394 | /* Duplicate selinux field information. The se_rule is opaque, so must be | ||
395 | * re-initialized. */ | ||
396 | static inline int audit_dupe_selinux_field(struct audit_field *df, | ||
397 | struct audit_field *sf) | ||
398 | { | ||
399 | int ret = 0; | ||
400 | char *se_str; | ||
401 | |||
402 | /* our own copy of se_str */ | ||
403 | se_str = kstrdup(sf->se_str, GFP_KERNEL); | ||
404 | if (unlikely(IS_ERR(se_str))) | ||
405 | return -ENOMEM; | ||
406 | df->se_str = se_str; | ||
407 | |||
408 | /* our own (refreshed) copy of se_rule */ | ||
409 | ret = selinux_audit_rule_init(df->type, df->op, df->se_str, | ||
410 | &df->se_rule); | ||
411 | /* Keep currently invalid fields around in case they | ||
412 | * become valid after a policy reload. */ | ||
413 | if (ret == -EINVAL) { | ||
414 | printk(KERN_WARNING "audit rule for selinux \'%s\' is " | ||
415 | "invalid\n", df->se_str); | ||
416 | ret = 0; | ||
417 | } | ||
418 | |||
419 | return ret; | ||
420 | } | ||
421 | |||
422 | /* Duplicate an audit rule. This will be a deep copy with the exception | ||
423 | * of the watch - that pointer is carried over. The selinux specific fields | ||
424 | * will be updated in the copy. The point is to be able to replace the old | ||
425 | * rule with the new rule in the filterlist, then free the old rule. */ | ||
426 | static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | ||
427 | { | ||
428 | u32 fcount = old->field_count; | ||
429 | struct audit_entry *entry; | ||
430 | struct audit_krule *new; | ||
431 | int i, err = 0; | ||
432 | |||
433 | entry = audit_init_entry(fcount); | ||
434 | if (unlikely(!entry)) | ||
435 | return ERR_PTR(-ENOMEM); | ||
436 | |||
437 | new = &entry->rule; | ||
438 | new->vers_ops = old->vers_ops; | ||
439 | new->flags = old->flags; | ||
440 | new->listnr = old->listnr; | ||
441 | new->action = old->action; | ||
442 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | ||
443 | new->mask[i] = old->mask[i]; | ||
444 | new->buflen = old->buflen; | ||
445 | new->field_count = old->field_count; | ||
446 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); | ||
447 | |||
448 | /* deep copy this information, updating the se_rule fields, because | ||
449 | * the originals will all be freed when the old rule is freed. */ | ||
450 | for (i = 0; i < fcount; i++) { | ||
451 | switch (new->fields[i].type) { | ||
452 | case AUDIT_SE_USER: | ||
453 | case AUDIT_SE_ROLE: | ||
454 | case AUDIT_SE_TYPE: | ||
455 | case AUDIT_SE_SEN: | ||
456 | case AUDIT_SE_CLR: | ||
457 | err = audit_dupe_selinux_field(&new->fields[i], | ||
458 | &old->fields[i]); | ||
459 | } | ||
460 | if (err) { | ||
461 | audit_free_rule(entry); | ||
462 | return ERR_PTR(err); | ||
463 | } | ||
464 | } | ||
465 | |||
466 | return entry; | ||
467 | } | ||
468 | |||
469 | /* Add rule to given filterlist if not a duplicate. Protected by | ||
470 | * audit_netlink_mutex. */ | ||
471 | static inline int audit_add_rule(struct audit_entry *entry, | ||
472 | struct list_head *list) | ||
473 | { | ||
474 | struct audit_entry *e; | ||
475 | |||
476 | /* Do not use the _rcu iterator here, since this is the only | ||
477 | * addition routine. */ | ||
478 | list_for_each_entry(e, list, list) { | ||
479 | if (!audit_compare_rule(&entry->rule, &e->rule)) | ||
480 | return -EEXIST; | ||
481 | } | ||
482 | |||
483 | if (entry->rule.flags & AUDIT_FILTER_PREPEND) { | ||
484 | list_add_rcu(&entry->list, list); | ||
485 | } else { | ||
486 | list_add_tail_rcu(&entry->list, list); | ||
487 | } | ||
488 | |||
489 | return 0; | ||
490 | } | ||
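The AUDIT_FILTER_PREPEND flag decides whether a new rule goes to the head or the tail of the filter list, and list order is the order in which rules are evaluated. A user-space sketch of that placement choice, using plain pointers instead of the kernel's RCU list primitives (names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static struct node *add(struct node *head, int id, int prepend)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		exit(1);
	n->id = id;
	if (prepend) {			/* AUDIT_FILTER_PREPEND analogue */
		n->next = head;
		return n;
	}
	n->next = NULL;			/* default: append, earlier rules keep precedence */
	if (!head)
		return n;
	struct node *t = head;
	while (t->next)
		t = t->next;
	t->next = n;
	return head;
}

int main(void)
{
	struct node *list = NULL;

	list = add(list, 1, 0);
	list = add(list, 2, 0);
	list = add(list, 3, 1);		/* prepended rule is evaluated first */
	for (struct node *n = list; n; n = n->next)
		printf("%d ", n->id);	/* prints: 3 1 2 */
	printf("\n");
	return 0;
}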
491 | |||
492 | /* Remove an existing rule from filterlist. Protected by | ||
493 | * audit_netlink_mutex. */ | ||
494 | static inline int audit_del_rule(struct audit_entry *entry, | ||
495 | struct list_head *list) | ||
496 | { | ||
497 | struct audit_entry *e; | ||
498 | |||
499 | /* Do not use the _rcu iterator here, since this is the only | ||
500 | * deletion routine. */ | ||
501 | list_for_each_entry(e, list, list) { | ||
502 | if (!audit_compare_rule(&entry->rule, &e->rule)) { | ||
503 | list_del_rcu(&e->list); | ||
504 | call_rcu(&e->rcu, audit_free_rule_rcu); | ||
505 | return 0; | ||
506 | } | ||
507 | } | ||
508 | return -ENOENT; /* No matching rule */ | ||
509 | } | ||
510 | |||
511 | /* List rules using struct audit_rule. Exists for backward | ||
512 | * compatibility with userspace. */ | ||
513 | static int audit_list(void *_dest) | ||
514 | { | ||
515 | int pid, seq; | ||
516 | int *dest = _dest; | ||
517 | struct audit_entry *entry; | ||
518 | int i; | ||
519 | |||
520 | pid = dest[0]; | ||
521 | seq = dest[1]; | ||
522 | kfree(dest); | ||
523 | |||
524 | mutex_lock(&audit_netlink_mutex); | ||
525 | |||
526 | 	/* The *_rcu iterators are not needed here because we are | ||
527 | always called with audit_netlink_mutex held. */ | ||
528 | for (i=0; i<AUDIT_NR_FILTERS; i++) { | ||
529 | list_for_each_entry(entry, &audit_filter_list[i], list) { | ||
530 | struct audit_rule *rule; | ||
531 | |||
532 | rule = audit_krule_to_rule(&entry->rule); | ||
533 | if (unlikely(!rule)) | ||
534 | break; | ||
535 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
536 | rule, sizeof(*rule)); | ||
537 | kfree(rule); | ||
538 | } | ||
539 | } | ||
540 | audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | ||
541 | |||
542 | mutex_unlock(&audit_netlink_mutex); | ||
543 | return 0; | ||
544 | } | ||
545 | |||
546 | /* List rules using struct audit_rule_data. */ | ||
547 | static int audit_list_rules(void *_dest) | ||
548 | { | ||
549 | int pid, seq; | ||
550 | int *dest = _dest; | ||
551 | struct audit_entry *e; | ||
552 | int i; | ||
553 | |||
554 | pid = dest[0]; | ||
555 | seq = dest[1]; | ||
556 | kfree(dest); | ||
557 | |||
558 | mutex_lock(&audit_netlink_mutex); | ||
559 | |||
560 | 	/* The *_rcu iterators are not needed here because we are | ||
561 | always called with audit_netlink_mutex held. */ | ||
562 | for (i=0; i<AUDIT_NR_FILTERS; i++) { | ||
563 | list_for_each_entry(e, &audit_filter_list[i], list) { | ||
564 | struct audit_rule_data *data; | ||
565 | |||
566 | data = audit_krule_to_data(&e->rule); | ||
567 | if (unlikely(!data)) | ||
568 | break; | ||
569 | audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, | ||
570 | data, sizeof(*data)); | ||
571 | kfree(data); | ||
572 | } | ||
573 | } | ||
574 | audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); | ||
575 | |||
576 | mutex_unlock(&audit_netlink_mutex); | ||
577 | return 0; | ||
578 | } | ||
579 | |||
580 | /** | ||
581 | * audit_receive_filter - apply all rules to the specified message type | ||
582 | * @type: audit message type | ||
583 | * @pid: target pid for netlink audit messages | ||
584 | * @uid: target uid for netlink audit messages | ||
585 | * @seq: netlink audit message sequence (serial) number | ||
586 | * @data: payload data | ||
587 | * @datasz: size of payload data | ||
588 | * @loginuid: loginuid of sender | ||
589 | * @sid: SE Linux Security ID of sender | ||
590 | */ | ||
591 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | ||
592 | size_t datasz, uid_t loginuid, u32 sid) | ||
593 | { | ||
594 | struct task_struct *tsk; | ||
595 | int *dest; | ||
596 | int err = 0; | ||
597 | struct audit_entry *entry; | ||
598 | |||
599 | switch (type) { | ||
600 | case AUDIT_LIST: | ||
601 | case AUDIT_LIST_RULES: | ||
602 | /* We can't just spew out the rules here because we might fill | ||
603 | * the available socket buffer space and deadlock waiting for | ||
604 | * auditctl to read from it... which isn't ever going to | ||
605 | * happen if we're actually running in the context of auditctl | ||
606 | * trying to _send_ the stuff */ | ||
607 | |||
608 | dest = kmalloc(2 * sizeof(int), GFP_KERNEL); | ||
609 | if (!dest) | ||
610 | return -ENOMEM; | ||
611 | dest[0] = pid; | ||
612 | dest[1] = seq; | ||
613 | |||
614 | if (type == AUDIT_LIST) | ||
615 | tsk = kthread_run(audit_list, dest, "audit_list"); | ||
616 | else | ||
617 | tsk = kthread_run(audit_list_rules, dest, | ||
618 | "audit_list_rules"); | ||
619 | if (IS_ERR(tsk)) { | ||
620 | kfree(dest); | ||
621 | err = PTR_ERR(tsk); | ||
622 | } | ||
623 | break; | ||
624 | case AUDIT_ADD: | ||
625 | case AUDIT_ADD_RULE: | ||
626 | if (type == AUDIT_ADD) | ||
627 | entry = audit_rule_to_entry(data); | ||
628 | else | ||
629 | entry = audit_data_to_entry(data, datasz); | ||
630 | if (IS_ERR(entry)) | ||
631 | return PTR_ERR(entry); | ||
632 | |||
633 | err = audit_add_rule(entry, | ||
634 | &audit_filter_list[entry->rule.listnr]); | ||
635 | if (sid) { | ||
636 | char *ctx = NULL; | ||
637 | u32 len; | ||
638 | if (selinux_ctxid_to_string(sid, &ctx, &len)) { | ||
639 | /* Maybe call audit_panic? */ | ||
640 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
641 | "auid=%u ssid=%u add rule to list=%d res=%d", | ||
642 | loginuid, sid, entry->rule.listnr, !err); | ||
643 | } else | ||
644 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
645 | "auid=%u subj=%s add rule to list=%d res=%d", | ||
646 | loginuid, ctx, entry->rule.listnr, !err); | ||
647 | kfree(ctx); | ||
648 | } else | ||
649 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
650 | "auid=%u add rule to list=%d res=%d", | ||
651 | loginuid, entry->rule.listnr, !err); | ||
652 | |||
653 | if (err) | ||
654 | audit_free_rule(entry); | ||
655 | break; | ||
656 | case AUDIT_DEL: | ||
657 | case AUDIT_DEL_RULE: | ||
658 | if (type == AUDIT_DEL) | ||
659 | entry = audit_rule_to_entry(data); | ||
660 | else | ||
661 | entry = audit_data_to_entry(data, datasz); | ||
662 | if (IS_ERR(entry)) | ||
663 | return PTR_ERR(entry); | ||
664 | |||
665 | err = audit_del_rule(entry, | ||
666 | &audit_filter_list[entry->rule.listnr]); | ||
667 | |||
668 | if (sid) { | ||
669 | char *ctx = NULL; | ||
670 | u32 len; | ||
671 | if (selinux_ctxid_to_string(sid, &ctx, &len)) { | ||
672 | /* Maybe call audit_panic? */ | ||
673 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
674 | "auid=%u ssid=%u remove rule from list=%d res=%d", | ||
675 | loginuid, sid, entry->rule.listnr, !err); | ||
676 | } else | ||
677 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
678 | "auid=%u subj=%s remove rule from list=%d res=%d", | ||
679 | loginuid, ctx, entry->rule.listnr, !err); | ||
680 | kfree(ctx); | ||
681 | } else | ||
682 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
683 | "auid=%u remove rule from list=%d res=%d", | ||
684 | loginuid, entry->rule.listnr, !err); | ||
685 | |||
686 | audit_free_rule(entry); | ||
687 | break; | ||
688 | default: | ||
689 | return -EINVAL; | ||
690 | } | ||
691 | |||
692 | return err; | ||
693 | } | ||
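A subtlety in the switch above is who owns the parsed entry: on AUDIT_ADD/AUDIT_ADD_RULE it is freed only when audit_add_rule() fails (on success it now lives on the filter list), while on AUDIT_DEL/AUDIT_DEL_RULE it is only a match template and is freed unconditionally after the lookup. A minimal sketch of that ownership split (names and the error code are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct entry { int rule; };

/* ADD path: ownership transfers to the list slot on success. */
static int add_rule(struct entry *e, struct entry **slot)
{
	if (*slot)
		return -17;	/* stand-in for -EEXIST: caller must free e */
	*slot = e;
	return 0;
}

int main(void)
{
	struct entry *slot = NULL;
	struct entry *e = calloc(1, sizeof(*e));
	struct entry *tmpl = calloc(1, sizeof(*tmpl));

	if (add_rule(e, &slot))		/* only the error path frees, as above */
		free(e);
	printf("added=%d\n", slot == e);

	/* DEL path: tmpl is never inserted, only compared, so it is always freed. */
	free(tmpl);
	free(slot);
	return 0;
}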
694 | |||
695 | int audit_comparator(const u32 left, const u32 op, const u32 right) | ||
696 | { | ||
697 | switch (op) { | ||
698 | case AUDIT_EQUAL: | ||
699 | return (left == right); | ||
700 | case AUDIT_NOT_EQUAL: | ||
701 | return (left != right); | ||
702 | case AUDIT_LESS_THAN: | ||
703 | return (left < right); | ||
704 | case AUDIT_LESS_THAN_OR_EQUAL: | ||
705 | return (left <= right); | ||
706 | case AUDIT_GREATER_THAN: | ||
707 | return (left > right); | ||
708 | case AUDIT_GREATER_THAN_OR_EQUAL: | ||
709 | return (left >= right); | ||
710 | } | ||
711 | BUG(); | ||
712 | return 0; | ||
713 | } | ||
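audit_comparator() reduces every packed operator to a plain C comparison; an unknown operator hits BUG(). A standalone user-space model (the enum stand-ins below are illustrative, not the kernel's AUDIT_* header values):

#include <stdio.h>

enum { EQ, NE, LT, LE, GT, GE };	/* stand-ins for the AUDIT_* operator constants */

static int compare(unsigned left, unsigned op, unsigned right)
{
	switch (op) {
	case EQ: return left == right;
	case NE: return left != right;
	case LT: return left <  right;
	case LE: return left <= right;
	case GT: return left >  right;
	case GE: return left >= right;
	}
	return 0;	/* unknown operator: the kernel calls BUG() here instead */
}

int main(void)
{
	/* e.g. a rule "uid>=500" applied to uid 1000 */
	printf("%d\n", compare(1000, GE, 500));	/* prints 1 */
	return 0;
}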
714 | |||
715 | |||
716 | |||
717 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, | ||
718 | struct audit_krule *rule, | ||
719 | enum audit_state *state) | ||
720 | { | ||
721 | int i; | ||
722 | |||
723 | for (i = 0; i < rule->field_count; i++) { | ||
724 | struct audit_field *f = &rule->fields[i]; | ||
725 | int result = 0; | ||
726 | |||
727 | switch (f->type) { | ||
728 | case AUDIT_PID: | ||
729 | result = audit_comparator(cb->creds.pid, f->op, f->val); | ||
730 | break; | ||
731 | case AUDIT_UID: | ||
732 | result = audit_comparator(cb->creds.uid, f->op, f->val); | ||
733 | break; | ||
734 | case AUDIT_GID: | ||
735 | result = audit_comparator(cb->creds.gid, f->op, f->val); | ||
736 | break; | ||
737 | case AUDIT_LOGINUID: | ||
738 | result = audit_comparator(cb->loginuid, f->op, f->val); | ||
739 | break; | ||
740 | } | ||
741 | |||
742 | if (!result) | ||
743 | return 0; | ||
744 | } | ||
745 | switch (rule->action) { | ||
746 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | ||
747 | case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; | ||
748 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | ||
749 | } | ||
750 | return 1; | ||
751 | } | ||
752 | |||
753 | int audit_filter_user(struct netlink_skb_parms *cb, int type) | ||
754 | { | ||
755 | struct audit_entry *e; | ||
756 | enum audit_state state; | ||
757 | int ret = 1; | ||
758 | |||
759 | rcu_read_lock(); | ||
760 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { | ||
761 | if (audit_filter_user_rules(cb, &e->rule, &state)) { | ||
762 | if (state == AUDIT_DISABLED) | ||
763 | ret = 0; | ||
764 | break; | ||
765 | } | ||
766 | } | ||
767 | rcu_read_unlock(); | ||
768 | |||
769 | return ret; /* Audit by default */ | ||
770 | } | ||
771 | |||
772 | int audit_filter_type(int type) | ||
773 | { | ||
774 | struct audit_entry *e; | ||
775 | int result = 0; | ||
776 | |||
777 | rcu_read_lock(); | ||
778 | if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) | ||
779 | goto unlock_and_return; | ||
780 | |||
781 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], | ||
782 | list) { | ||
783 | int i; | ||
784 | for (i = 0; i < e->rule.field_count; i++) { | ||
785 | struct audit_field *f = &e->rule.fields[i]; | ||
786 | if (f->type == AUDIT_MSGTYPE) { | ||
787 | result = audit_comparator(type, f->op, f->val); | ||
788 | if (!result) | ||
789 | break; | ||
790 | } | ||
791 | } | ||
792 | if (result) | ||
793 | goto unlock_and_return; | ||
794 | } | ||
795 | unlock_and_return: | ||
796 | rcu_read_unlock(); | ||
797 | return result; | ||
798 | } | ||
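audit_filter_type() walks the AUDIT_FILTER_TYPE list and reports whether some rule's AUDIT_MSGTYPE fields all match the record type; the caller uses the result to decide whether to emit that record. A user-space model of the matching loop (struct layout and values are illustrative):

#include <stdio.h>

struct rule { int nvals; int vals[4]; };	/* each val models "type == vals[k]" */

static int filter_type(int type, const struct rule *rules, int nrules)
{
	for (int i = 0; i < nrules; i++) {
		int result = 0;
		for (int k = 0; k < rules[i].nvals; k++) {
			result = (type == rules[i].vals[k]);
			if (!result)
				break;		/* every field of a rule must match */
		}
		if (result)
			return 1;		/* some rule matched this record type */
	}
	return 0;
}

int main(void)
{
	struct rule rules[1] = { { 1, { 1100 } } };

	printf("%d %d\n", filter_type(1100, rules, 1), filter_type(1300, rules, 1));
	return 0;
}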
799 | |||
800 | /* Check to see if the rule contains any selinux fields. Returns 1 if there | ||
801 | are selinux fields specified in the rule, 0 otherwise. */ | ||
802 | static inline int audit_rule_has_selinux(struct audit_krule *rule) | ||
803 | { | ||
804 | int i; | ||
805 | |||
806 | for (i = 0; i < rule->field_count; i++) { | ||
807 | struct audit_field *f = &rule->fields[i]; | ||
808 | switch (f->type) { | ||
809 | case AUDIT_SE_USER: | ||
810 | case AUDIT_SE_ROLE: | ||
811 | case AUDIT_SE_TYPE: | ||
812 | case AUDIT_SE_SEN: | ||
813 | case AUDIT_SE_CLR: | ||
814 | return 1; | ||
815 | } | ||
816 | } | ||
817 | |||
818 | return 0; | ||
819 | } | ||
820 | |||
821 | /* This function will re-initialize the se_rule field of all applicable rules. | ||
822 |  * It will traverse the filter lists searching for rules that contain selinux | ||
823 | * specific filter fields. When such a rule is found, it is copied, the | ||
824 | * selinux field is re-initialized, and the old rule is replaced with the | ||
825 | * updated rule. */ | ||
826 | int selinux_audit_rule_update(void) | ||
827 | { | ||
828 | struct audit_entry *entry, *n, *nentry; | ||
829 | int i, err = 0; | ||
830 | |||
831 | /* audit_netlink_mutex synchronizes the writers */ | ||
832 | mutex_lock(&audit_netlink_mutex); | ||
833 | |||
834 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { | ||
835 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { | ||
836 | if (!audit_rule_has_selinux(&entry->rule)) | ||
837 | continue; | ||
838 | |||
839 | nentry = audit_dupe_rule(&entry->rule); | ||
840 | if (unlikely(IS_ERR(nentry))) { | ||
841 | /* save the first error encountered for the | ||
842 | * return value */ | ||
843 | if (!err) | ||
844 | err = PTR_ERR(nentry); | ||
845 | audit_panic("error updating selinux filters"); | ||
846 | list_del_rcu(&entry->list); | ||
847 | } else { | ||
848 | list_replace_rcu(&entry->list, &nentry->list); | ||
849 | } | ||
850 | call_rcu(&entry->rcu, audit_free_rule_rcu); | ||
851 | } | ||
852 | } | ||
853 | |||
854 | mutex_unlock(&audit_netlink_mutex); | ||
855 | |||
856 | return err; | ||
857 | } | ||
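The update loop above never patches a live rule in place: it duplicates the entry, swaps the copy in with list_replace_rcu(), and defers freeing the original through call_rcu(), so readers traversing the list under rcu_read_lock() keep seeing a consistent entry. A single-threaded user-space model of the copy-and-swap step (the deferred free is only noted in a comment; names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rule { char label[32]; };

static struct rule *dupe_with_refresh(const struct rule *old)
{
	struct rule *n = malloc(sizeof(*n));

	if (!n)
		return NULL;
	memcpy(n, old, sizeof(*n));
	/* the kernel re-runs selinux_audit_rule_init() on the copy at this point */
	return n;
}

int main(void)
{
	struct rule *slot = malloc(sizeof(*slot));	/* stands in for one list entry */
	struct rule *old, *new;

	if (!slot)
		return 1;
	snprintf(slot->label, sizeof(slot->label), "staff_u");

	old = slot;
	new = dupe_with_refresh(old);
	if (new) {
		slot = new;	/* list_replace_rcu() analogue: readers now see the copy */
		free(old);	/* the kernel defers this via call_rcu() instead */
	}
	printf("%s\n", slot->label);
	free(slot);
	return 0;
}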
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 685c25175d..1c03a4ed1b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -2,6 +2,8 @@ | |||
2 | * Handles all system-call specific auditing features. | 2 | * Handles all system-call specific auditing features. |
3 | * | 3 | * |
4 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | 4 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. |
5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | ||
6 | * Copyright (C) 2005 IBM Corporation | ||
5 | * All Rights Reserved. | 7 | * All Rights Reserved. |
6 | * | 8 | * |
7 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
@@ -27,11 +29,22 @@ | |||
27 | * this file -- see entry.S) is based on a GPL'd patch written by | 29 | * this file -- see entry.S) is based on a GPL'd patch written by |
28 | * okir@suse.de and Copyright 2003 SuSE Linux AG. | 30 | * okir@suse.de and Copyright 2003 SuSE Linux AG. |
29 | * | 31 | * |
32 | * The support of additional filter rules compares (>, <, >=, <=) was | ||
33 | * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. | ||
34 | * | ||
35 | * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional | ||
36 | * filesystem information. | ||
37 | * | ||
38 | * Subject and object context labeling support added by <danjones@us.ibm.com> | ||
39 | * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance. | ||
30 | */ | 40 | */ |
31 | 41 | ||
32 | #include <linux/init.h> | 42 | #include <linux/init.h> |
33 | #include <asm/types.h> | 43 | #include <asm/types.h> |
34 | #include <asm/atomic.h> | 44 | #include <asm/atomic.h> |
45 | #include <asm/types.h> | ||
46 | #include <linux/fs.h> | ||
47 | #include <linux/namei.h> | ||
35 | #include <linux/mm.h> | 48 | #include <linux/mm.h> |
36 | #include <linux/module.h> | 49 | #include <linux/module.h> |
37 | #include <linux/mount.h> | 50 | #include <linux/mount.h> |
@@ -39,16 +52,17 @@ | |||
39 | #include <linux/audit.h> | 52 | #include <linux/audit.h> |
40 | #include <linux/personality.h> | 53 | #include <linux/personality.h> |
41 | #include <linux/time.h> | 54 | #include <linux/time.h> |
42 | #include <linux/kthread.h> | ||
43 | #include <linux/netlink.h> | 55 | #include <linux/netlink.h> |
44 | #include <linux/compiler.h> | 56 | #include <linux/compiler.h> |
45 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
58 | #include <linux/security.h> | ||
59 | #include <linux/list.h> | ||
60 | #include <linux/tty.h> | ||
61 | #include <linux/selinux.h> | ||
62 | |||
63 | #include "audit.h" | ||
46 | 64 | ||
47 | /* 0 = no checking | 65 | extern struct list_head audit_filter_list[]; |
48 | 1 = put_count checking | ||
49 | 2 = verbose put_count checking | ||
50 | */ | ||
51 | #define AUDIT_DEBUG 0 | ||
52 | 66 | ||
53 | /* No syscall auditing will take place unless audit_enabled != 0. */ | 67 | /* No syscall auditing will take place unless audit_enabled != 0. */ |
54 | extern int audit_enabled; | 68 | extern int audit_enabled; |
@@ -62,29 +76,6 @@ extern int audit_enabled; | |||
62 | * path_lookup. */ | 76 | * path_lookup. */ |
63 | #define AUDIT_NAMES_RESERVED 7 | 77 | #define AUDIT_NAMES_RESERVED 7 |
64 | 78 | ||
65 | /* At task start time, the audit_state is set in the audit_context using | ||
66 | a per-task filter. At syscall entry, the audit_state is augmented by | ||
67 | the syscall filter. */ | ||
68 | enum audit_state { | ||
69 | AUDIT_DISABLED, /* Do not create per-task audit_context. | ||
70 | * No syscall-specific audit records can | ||
71 | * be generated. */ | ||
72 | AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, | ||
73 | * but don't necessarily fill it in at | ||
74 | * syscall entry time (i.e., filter | ||
75 | * instead). */ | ||
76 | AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, | ||
77 | * and always fill it in at syscall | ||
78 | * entry time. This makes a full | ||
79 | * syscall record available if some | ||
80 | * other part of the kernel decides it | ||
81 | * should be recorded. */ | ||
82 | AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, | ||
83 | * always fill it in at syscall entry | ||
84 | * time, and always write out the audit | ||
85 | * record at syscall exit time. */ | ||
86 | }; | ||
87 | |||
88 | /* When fs/namei.c:getname() is called, we store the pointer in name and | 79 | /* When fs/namei.c:getname() is called, we store the pointer in name and |
89 | * we don't let putname() free it (instead we free all of the saved | 80 | * we don't let putname() free it (instead we free all of the saved |
90 | * pointers at syscall exit time). | 81 | * pointers at syscall exit time). |
@@ -93,12 +84,13 @@ enum audit_state { | |||
93 | struct audit_names { | 84 | struct audit_names { |
94 | const char *name; | 85 | const char *name; |
95 | unsigned long ino; | 86 | unsigned long ino; |
87 | unsigned long pino; | ||
96 | dev_t dev; | 88 | dev_t dev; |
97 | umode_t mode; | 89 | umode_t mode; |
98 | uid_t uid; | 90 | uid_t uid; |
99 | gid_t gid; | 91 | gid_t gid; |
100 | dev_t rdev; | 92 | dev_t rdev; |
101 | unsigned flags; | 93 | u32 osid; |
102 | }; | 94 | }; |
103 | 95 | ||
104 | struct audit_aux_data { | 96 | struct audit_aux_data { |
@@ -115,6 +107,7 @@ struct audit_aux_data_ipcctl { | |||
115 | uid_t uid; | 107 | uid_t uid; |
116 | gid_t gid; | 108 | gid_t gid; |
117 | mode_t mode; | 109 | mode_t mode; |
110 | u32 osid; | ||
118 | }; | 111 | }; |
119 | 112 | ||
120 | struct audit_aux_data_socketcall { | 113 | struct audit_aux_data_socketcall { |
@@ -167,290 +160,73 @@ struct audit_context { | |||
167 | #endif | 160 | #endif |
168 | }; | 161 | }; |
169 | 162 | ||
170 | /* Public API */ | ||
171 | /* There are three lists of rules -- one to search at task creation | ||
172 | * time, one to search at syscall entry time, and another to search at | ||
173 | * syscall exit time. */ | ||
174 | static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { | ||
175 | LIST_HEAD_INIT(audit_filter_list[0]), | ||
176 | LIST_HEAD_INIT(audit_filter_list[1]), | ||
177 | LIST_HEAD_INIT(audit_filter_list[2]), | ||
178 | LIST_HEAD_INIT(audit_filter_list[3]), | ||
179 | LIST_HEAD_INIT(audit_filter_list[4]), | ||
180 | #if AUDIT_NR_FILTERS != 5 | ||
181 | #error Fix audit_filter_list initialiser | ||
182 | #endif | ||
183 | }; | ||
184 | |||
185 | struct audit_entry { | ||
186 | struct list_head list; | ||
187 | struct rcu_head rcu; | ||
188 | struct audit_rule rule; | ||
189 | }; | ||
190 | |||
191 | extern int audit_pid; | ||
192 | |||
193 | /* Copy rule from user-space to kernel-space. Called from | ||
194 | * audit_add_rule during AUDIT_ADD. */ | ||
195 | static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) | ||
196 | { | ||
197 | int i; | ||
198 | |||
199 | if (s->action != AUDIT_NEVER | ||
200 | && s->action != AUDIT_POSSIBLE | ||
201 | && s->action != AUDIT_ALWAYS) | ||
202 | return -1; | ||
203 | if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) | ||
204 | return -1; | ||
205 | if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS) | ||
206 | return -1; | ||
207 | |||
208 | d->flags = s->flags; | ||
209 | d->action = s->action; | ||
210 | d->field_count = s->field_count; | ||
211 | for (i = 0; i < d->field_count; i++) { | ||
212 | d->fields[i] = s->fields[i]; | ||
213 | d->values[i] = s->values[i]; | ||
214 | } | ||
215 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; | ||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | /* Check to see if two rules are identical. It is called from | ||
220 | * audit_add_rule during AUDIT_ADD and | ||
221 | * audit_del_rule during AUDIT_DEL. */ | ||
222 | static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) | ||
223 | { | ||
224 | int i; | ||
225 | |||
226 | if (a->flags != b->flags) | ||
227 | return 1; | ||
228 | |||
229 | if (a->action != b->action) | ||
230 | return 1; | ||
231 | |||
232 | if (a->field_count != b->field_count) | ||
233 | return 1; | ||
234 | |||
235 | for (i = 0; i < a->field_count; i++) { | ||
236 | if (a->fields[i] != b->fields[i] | ||
237 | || a->values[i] != b->values[i]) | ||
238 | return 1; | ||
239 | } | ||
240 | |||
241 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | ||
242 | if (a->mask[i] != b->mask[i]) | ||
243 | return 1; | ||
244 | |||
245 | return 0; | ||
246 | } | ||
247 | |||
248 | /* Note that audit_add_rule and audit_del_rule are called via | ||
249 | * audit_receive() in audit.c, and are protected by | ||
250 | * audit_netlink_sem. */ | ||
251 | static inline int audit_add_rule(struct audit_rule *rule, | ||
252 | struct list_head *list) | ||
253 | { | ||
254 | struct audit_entry *entry; | ||
255 | |||
256 | /* Do not use the _rcu iterator here, since this is the only | ||
257 | * addition routine. */ | ||
258 | list_for_each_entry(entry, list, list) { | ||
259 | if (!audit_compare_rule(rule, &entry->rule)) { | ||
260 | return -EEXIST; | ||
261 | } | ||
262 | } | ||
263 | |||
264 | if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) | ||
265 | return -ENOMEM; | ||
266 | if (audit_copy_rule(&entry->rule, rule)) { | ||
267 | kfree(entry); | ||
268 | return -EINVAL; | ||
269 | } | ||
270 | |||
271 | if (entry->rule.flags & AUDIT_FILTER_PREPEND) { | ||
272 | entry->rule.flags &= ~AUDIT_FILTER_PREPEND; | ||
273 | list_add_rcu(&entry->list, list); | ||
274 | } else { | ||
275 | list_add_tail_rcu(&entry->list, list); | ||
276 | } | ||
277 | |||
278 | return 0; | ||
279 | } | ||
280 | |||
281 | static inline void audit_free_rule(struct rcu_head *head) | ||
282 | { | ||
283 | struct audit_entry *e = container_of(head, struct audit_entry, rcu); | ||
284 | kfree(e); | ||
285 | } | ||
286 | |||
287 | /* Note that audit_add_rule and audit_del_rule are called via | ||
288 | * audit_receive() in audit.c, and are protected by | ||
289 | * audit_netlink_sem. */ | ||
290 | static inline int audit_del_rule(struct audit_rule *rule, | ||
291 | struct list_head *list) | ||
292 | { | ||
293 | struct audit_entry *e; | ||
294 | |||
295 | /* Do not use the _rcu iterator here, since this is the only | ||
296 | * deletion routine. */ | ||
297 | list_for_each_entry(e, list, list) { | ||
298 | if (!audit_compare_rule(rule, &e->rule)) { | ||
299 | list_del_rcu(&e->list); | ||
300 | call_rcu(&e->rcu, audit_free_rule); | ||
301 | return 0; | ||
302 | } | ||
303 | } | ||
304 | return -ENOENT; /* No matching rule */ | ||
305 | } | ||
306 | |||
307 | static int audit_list_rules(void *_dest) | ||
308 | { | ||
309 | int pid, seq; | ||
310 | int *dest = _dest; | ||
311 | struct audit_entry *entry; | ||
312 | int i; | ||
313 | |||
314 | pid = dest[0]; | ||
315 | seq = dest[1]; | ||
316 | kfree(dest); | ||
317 | |||
318 | down(&audit_netlink_sem); | ||
319 | |||
320 | /* The *_rcu iterators not needed here because we are | ||
321 | always called with audit_netlink_sem held. */ | ||
322 | for (i=0; i<AUDIT_NR_FILTERS; i++) { | ||
323 | list_for_each_entry(entry, &audit_filter_list[i], list) | ||
324 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
325 | &entry->rule, sizeof(entry->rule)); | ||
326 | } | ||
327 | audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | ||
328 | |||
329 | up(&audit_netlink_sem); | ||
330 | return 0; | ||
331 | } | ||
332 | |||
333 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | ||
334 | uid_t loginuid) | ||
335 | { | ||
336 | struct task_struct *tsk; | ||
337 | int *dest; | ||
338 | int err = 0; | ||
339 | unsigned listnr; | ||
340 | |||
341 | switch (type) { | ||
342 | case AUDIT_LIST: | ||
343 | /* We can't just spew out the rules here because we might fill | ||
344 | * the available socket buffer space and deadlock waiting for | ||
345 | * auditctl to read from it... which isn't ever going to | ||
346 | * happen if we're actually running in the context of auditctl | ||
347 | * trying to _send_ the stuff */ | ||
348 | |||
349 | dest = kmalloc(2 * sizeof(int), GFP_KERNEL); | ||
350 | if (!dest) | ||
351 | return -ENOMEM; | ||
352 | dest[0] = pid; | ||
353 | dest[1] = seq; | ||
354 | |||
355 | tsk = kthread_run(audit_list_rules, dest, "audit_list_rules"); | ||
356 | if (IS_ERR(tsk)) { | ||
357 | kfree(dest); | ||
358 | err = PTR_ERR(tsk); | ||
359 | } | ||
360 | break; | ||
361 | case AUDIT_ADD: | ||
362 | listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; | ||
363 | if (listnr >= AUDIT_NR_FILTERS) | ||
364 | return -EINVAL; | ||
365 | |||
366 | err = audit_add_rule(data, &audit_filter_list[listnr]); | ||
367 | if (!err) | ||
368 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
369 | "auid=%u added an audit rule\n", loginuid); | ||
370 | break; | ||
371 | case AUDIT_DEL: | ||
372 | listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; | ||
373 | if (listnr >= AUDIT_NR_FILTERS) | ||
374 | return -EINVAL; | ||
375 | |||
376 | err = audit_del_rule(data, &audit_filter_list[listnr]); | ||
377 | if (!err) | ||
378 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
379 | "auid=%u removed an audit rule\n", loginuid); | ||
380 | break; | ||
381 | default: | ||
382 | return -EINVAL; | ||
383 | } | ||
384 | |||
385 | return err; | ||
386 | } | ||
387 | 163 | ||
388 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | 164 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 |
389 | * otherwise. */ | 165 | * otherwise. */ |
390 | static int audit_filter_rules(struct task_struct *tsk, | 166 | static int audit_filter_rules(struct task_struct *tsk, |
391 | struct audit_rule *rule, | 167 | struct audit_krule *rule, |
392 | struct audit_context *ctx, | 168 | struct audit_context *ctx, |
393 | enum audit_state *state) | 169 | enum audit_state *state) |
394 | { | 170 | { |
395 | int i, j; | 171 | int i, j, need_sid = 1; |
172 | u32 sid; | ||
396 | 173 | ||
397 | for (i = 0; i < rule->field_count; i++) { | 174 | for (i = 0; i < rule->field_count; i++) { |
398 | u32 field = rule->fields[i] & ~AUDIT_NEGATE; | 175 | struct audit_field *f = &rule->fields[i]; |
399 | u32 value = rule->values[i]; | ||
400 | int result = 0; | 176 | int result = 0; |
401 | 177 | ||
402 | switch (field) { | 178 | switch (f->type) { |
403 | case AUDIT_PID: | 179 | case AUDIT_PID: |
404 | result = (tsk->pid == value); | 180 | result = audit_comparator(tsk->pid, f->op, f->val); |
405 | break; | 181 | break; |
406 | case AUDIT_UID: | 182 | case AUDIT_UID: |
407 | result = (tsk->uid == value); | 183 | result = audit_comparator(tsk->uid, f->op, f->val); |
408 | break; | 184 | break; |
409 | case AUDIT_EUID: | 185 | case AUDIT_EUID: |
410 | result = (tsk->euid == value); | 186 | result = audit_comparator(tsk->euid, f->op, f->val); |
411 | break; | 187 | break; |
412 | case AUDIT_SUID: | 188 | case AUDIT_SUID: |
413 | result = (tsk->suid == value); | 189 | result = audit_comparator(tsk->suid, f->op, f->val); |
414 | break; | 190 | break; |
415 | case AUDIT_FSUID: | 191 | case AUDIT_FSUID: |
416 | result = (tsk->fsuid == value); | 192 | result = audit_comparator(tsk->fsuid, f->op, f->val); |
417 | break; | 193 | break; |
418 | case AUDIT_GID: | 194 | case AUDIT_GID: |
419 | result = (tsk->gid == value); | 195 | result = audit_comparator(tsk->gid, f->op, f->val); |
420 | break; | 196 | break; |
421 | case AUDIT_EGID: | 197 | case AUDIT_EGID: |
422 | result = (tsk->egid == value); | 198 | result = audit_comparator(tsk->egid, f->op, f->val); |
423 | break; | 199 | break; |
424 | case AUDIT_SGID: | 200 | case AUDIT_SGID: |
425 | result = (tsk->sgid == value); | 201 | result = audit_comparator(tsk->sgid, f->op, f->val); |
426 | break; | 202 | break; |
427 | case AUDIT_FSGID: | 203 | case AUDIT_FSGID: |
428 | result = (tsk->fsgid == value); | 204 | result = audit_comparator(tsk->fsgid, f->op, f->val); |
429 | break; | 205 | break; |
430 | case AUDIT_PERS: | 206 | case AUDIT_PERS: |
431 | result = (tsk->personality == value); | 207 | result = audit_comparator(tsk->personality, f->op, f->val); |
432 | break; | 208 | break; |
433 | case AUDIT_ARCH: | 209 | case AUDIT_ARCH: |
434 | if (ctx) | 210 | if (ctx) |
435 | result = (ctx->arch == value); | 211 | result = audit_comparator(ctx->arch, f->op, f->val); |
436 | break; | 212 | break; |
437 | 213 | ||
438 | case AUDIT_EXIT: | 214 | case AUDIT_EXIT: |
439 | if (ctx && ctx->return_valid) | 215 | if (ctx && ctx->return_valid) |
440 | result = (ctx->return_code == value); | 216 | result = audit_comparator(ctx->return_code, f->op, f->val); |
441 | break; | 217 | break; |
442 | case AUDIT_SUCCESS: | 218 | case AUDIT_SUCCESS: |
443 | if (ctx && ctx->return_valid) { | 219 | if (ctx && ctx->return_valid) { |
444 | if (value) | 220 | if (f->val) |
445 | result = (ctx->return_valid == AUDITSC_SUCCESS); | 221 | result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); |
446 | else | 222 | else |
447 | result = (ctx->return_valid == AUDITSC_FAILURE); | 223 | result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); |
448 | } | 224 | } |
449 | break; | 225 | break; |
450 | case AUDIT_DEVMAJOR: | 226 | case AUDIT_DEVMAJOR: |
451 | if (ctx) { | 227 | if (ctx) { |
452 | for (j = 0; j < ctx->name_count; j++) { | 228 | for (j = 0; j < ctx->name_count; j++) { |
453 | if (MAJOR(ctx->names[j].dev)==value) { | 229 | if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { |
454 | ++result; | 230 | ++result; |
455 | break; | 231 | break; |
456 | } | 232 | } |
@@ -460,7 +236,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
460 | case AUDIT_DEVMINOR: | 236 | case AUDIT_DEVMINOR: |
461 | if (ctx) { | 237 | if (ctx) { |
462 | for (j = 0; j < ctx->name_count; j++) { | 238 | for (j = 0; j < ctx->name_count; j++) { |
463 | if (MINOR(ctx->names[j].dev)==value) { | 239 | if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { |
464 | ++result; | 240 | ++result; |
465 | break; | 241 | break; |
466 | } | 242 | } |
@@ -470,7 +246,8 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
470 | case AUDIT_INODE: | 246 | case AUDIT_INODE: |
471 | if (ctx) { | 247 | if (ctx) { |
472 | for (j = 0; j < ctx->name_count; j++) { | 248 | for (j = 0; j < ctx->name_count; j++) { |
473 | if (ctx->names[j].ino == value) { | 249 | if (audit_comparator(ctx->names[j].ino, f->op, f->val) || |
250 | audit_comparator(ctx->names[j].pino, f->op, f->val)) { | ||
474 | ++result; | 251 | ++result; |
475 | break; | 252 | break; |
476 | } | 253 | } |
@@ -480,19 +257,38 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
480 | case AUDIT_LOGINUID: | 257 | case AUDIT_LOGINUID: |
481 | result = 0; | 258 | result = 0; |
482 | if (ctx) | 259 | if (ctx) |
483 | result = (ctx->loginuid == value); | 260 | result = audit_comparator(ctx->loginuid, f->op, f->val); |
261 | break; | ||
262 | case AUDIT_SE_USER: | ||
263 | case AUDIT_SE_ROLE: | ||
264 | case AUDIT_SE_TYPE: | ||
265 | case AUDIT_SE_SEN: | ||
266 | case AUDIT_SE_CLR: | ||
267 | /* NOTE: this may return negative values indicating | ||
268 | a temporary error. We simply treat this as a | ||
269 | match for now to avoid losing information that | ||
270 | may be wanted. An error message will also be | ||
271 | logged upon error */ | ||
272 | if (f->se_rule) { | ||
273 | if (need_sid) { | ||
274 | selinux_task_ctxid(tsk, &sid); | ||
275 | need_sid = 0; | ||
276 | } | ||
277 | result = selinux_audit_rule_match(sid, f->type, | ||
278 | f->op, | ||
279 | f->se_rule, | ||
280 | ctx); | ||
281 | } | ||
484 | break; | 282 | break; |
485 | case AUDIT_ARG0: | 283 | case AUDIT_ARG0: |
486 | case AUDIT_ARG1: | 284 | case AUDIT_ARG1: |
487 | case AUDIT_ARG2: | 285 | case AUDIT_ARG2: |
488 | case AUDIT_ARG3: | 286 | case AUDIT_ARG3: |
489 | if (ctx) | 287 | if (ctx) |
490 | result = (ctx->argv[field-AUDIT_ARG0]==value); | 288 | result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); |
491 | break; | 289 | break; |
492 | } | 290 | } |
493 | 291 | ||
494 | if (rule->fields[i] & AUDIT_NEGATE) | ||
495 | result = !result; | ||
496 | if (!result) | 292 | if (!result) |
497 | return 0; | 293 | return 0; |
498 | } | 294 | } |
@@ -527,7 +323,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) | |||
527 | /* At syscall entry and exit time, this filter is called if the | 323 | /* At syscall entry and exit time, this filter is called if the |
528 | * audit_state is not low enough that auditing cannot take place, but is | 324 | * audit_state is not low enough that auditing cannot take place, but is |
529 | * also not high enough that we already know we have to write an audit | 325 | * also not high enough that we already know we have to write an audit |
530 | * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). | 326 | * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). |
531 | */ | 327 | */ |
532 | static enum audit_state audit_filter_syscall(struct task_struct *tsk, | 328 | static enum audit_state audit_filter_syscall(struct task_struct *tsk, |
533 | struct audit_context *ctx, | 329 | struct audit_context *ctx, |
@@ -541,80 +337,21 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
541 | 337 | ||
542 | rcu_read_lock(); | 338 | rcu_read_lock(); |
543 | if (!list_empty(list)) { | 339 | if (!list_empty(list)) { |
544 | int word = AUDIT_WORD(ctx->major); | 340 | int word = AUDIT_WORD(ctx->major); |
545 | int bit = AUDIT_BIT(ctx->major); | 341 | int bit = AUDIT_BIT(ctx->major); |
546 | 342 | ||
547 | list_for_each_entry_rcu(e, list, list) { | 343 | list_for_each_entry_rcu(e, list, list) { |
548 | if ((e->rule.mask[word] & bit) == bit | 344 | if ((e->rule.mask[word] & bit) == bit |
549 | && audit_filter_rules(tsk, &e->rule, ctx, &state)) { | 345 | && audit_filter_rules(tsk, &e->rule, ctx, &state)) { |
550 | rcu_read_unlock(); | 346 | rcu_read_unlock(); |
551 | return state; | 347 | return state; |
552 | } | 348 | } |
553 | } | ||
554 | } | ||
555 | rcu_read_unlock(); | ||
556 | return AUDIT_BUILD_CONTEXT; | ||
557 | } | ||
558 | |||
559 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, | ||
560 | struct audit_rule *rule, | ||
561 | enum audit_state *state) | ||
562 | { | ||
563 | int i; | ||
564 | |||
565 | for (i = 0; i < rule->field_count; i++) { | ||
566 | u32 field = rule->fields[i] & ~AUDIT_NEGATE; | ||
567 | u32 value = rule->values[i]; | ||
568 | int result = 0; | ||
569 | |||
570 | switch (field) { | ||
571 | case AUDIT_PID: | ||
572 | result = (cb->creds.pid == value); | ||
573 | break; | ||
574 | case AUDIT_UID: | ||
575 | result = (cb->creds.uid == value); | ||
576 | break; | ||
577 | case AUDIT_GID: | ||
578 | result = (cb->creds.gid == value); | ||
579 | break; | ||
580 | case AUDIT_LOGINUID: | ||
581 | result = (cb->loginuid == value); | ||
582 | break; | ||
583 | } | ||
584 | |||
585 | if (rule->fields[i] & AUDIT_NEGATE) | ||
586 | result = !result; | ||
587 | if (!result) | ||
588 | return 0; | ||
589 | } | ||
590 | switch (rule->action) { | ||
591 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | ||
592 | case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; | ||
593 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | ||
594 | } | ||
595 | return 1; | ||
596 | } | ||
597 | |||
598 | int audit_filter_user(struct netlink_skb_parms *cb, int type) | ||
599 | { | ||
600 | struct audit_entry *e; | ||
601 | enum audit_state state; | ||
602 | int ret = 1; | ||
603 | |||
604 | rcu_read_lock(); | ||
605 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { | ||
606 | if (audit_filter_user_rules(cb, &e->rule, &state)) { | ||
607 | if (state == AUDIT_DISABLED) | ||
608 | ret = 0; | ||
609 | break; | ||
610 | } | 349 | } |
611 | } | 350 | } |
612 | rcu_read_unlock(); | 351 | rcu_read_unlock(); |
613 | 352 | return AUDIT_BUILD_CONTEXT; | |
614 | return ret; /* Audit by default */ | ||
615 | } | 353 | } |
616 | 354 | ||
617 | /* This should be called with task_lock() held. */ | ||
618 | static inline struct audit_context *audit_get_context(struct task_struct *tsk, | 355 | static inline struct audit_context *audit_get_context(struct task_struct *tsk, |
619 | int return_valid, | 356 | int return_valid, |
620 | int return_code) | 357 | int return_code) |
@@ -654,17 +391,18 @@ static inline void audit_free_names(struct audit_context *context) | |||
654 | #if AUDIT_DEBUG == 2 | 391 | #if AUDIT_DEBUG == 2 |
655 | if (context->auditable | 392 | if (context->auditable |
656 | ||context->put_count + context->ino_count != context->name_count) { | 393 | ||context->put_count + context->ino_count != context->name_count) { |
657 | printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" | 394 | printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" |
658 | " name_count=%d put_count=%d" | 395 | " name_count=%d put_count=%d" |
659 | " ino_count=%d [NOT freeing]\n", | 396 | " ino_count=%d [NOT freeing]\n", |
660 | __LINE__, | 397 | __FILE__, __LINE__, |
661 | context->serial, context->major, context->in_syscall, | 398 | context->serial, context->major, context->in_syscall, |
662 | context->name_count, context->put_count, | 399 | context->name_count, context->put_count, |
663 | context->ino_count); | 400 | context->ino_count); |
664 | for (i = 0; i < context->name_count; i++) | 401 | for (i = 0; i < context->name_count; i++) { |
665 | printk(KERN_ERR "names[%d] = %p = %s\n", i, | 402 | printk(KERN_ERR "names[%d] = %p = %s\n", i, |
666 | context->names[i].name, | 403 | context->names[i].name, |
667 | context->names[i].name); | 404 | context->names[i].name ?: "(null)"); |
405 | } | ||
668 | dump_stack(); | 406 | dump_stack(); |
669 | return; | 407 | return; |
670 | } | 408 | } |
@@ -674,9 +412,10 @@ static inline void audit_free_names(struct audit_context *context) | |||
674 | context->ino_count = 0; | 412 | context->ino_count = 0; |
675 | #endif | 413 | #endif |
676 | 414 | ||
677 | for (i = 0; i < context->name_count; i++) | 415 | for (i = 0; i < context->name_count; i++) { |
678 | if (context->names[i].name) | 416 | if (context->names[i].name) |
679 | __putname(context->names[i].name); | 417 | __putname(context->names[i].name); |
418 | } | ||
680 | context->name_count = 0; | 419 | context->name_count = 0; |
681 | if (context->pwd) | 420 | if (context->pwd) |
682 | dput(context->pwd); | 421 | dput(context->pwd); |
@@ -696,6 +435,7 @@ static inline void audit_free_aux(struct audit_context *context) | |||
696 | dput(axi->dentry); | 435 | dput(axi->dentry); |
697 | mntput(axi->mnt); | 436 | mntput(axi->mnt); |
698 | } | 437 | } |
438 | |||
699 | context->aux = aux->next; | 439 | context->aux = aux->next; |
700 | kfree(aux); | 440 | kfree(aux); |
701 | } | 441 | } |
@@ -721,10 +461,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) | |||
721 | return context; | 461 | return context; |
722 | } | 462 | } |
723 | 463 | ||
724 | /* Filter on the task information and allocate a per-task audit context | 464 | /** |
465 | * audit_alloc - allocate an audit context block for a task | ||
466 | * @tsk: task | ||
467 | * | ||
468 | * Filter on the task information and allocate a per-task audit context | ||
725 | * if necessary. Doing so turns on system call auditing for the | 469 | * if necessary. Doing so turns on system call auditing for the |
726 | * specified task. This is called from copy_process, so no lock is | 470 | * specified task. This is called from copy_process, so no lock is |
727 | * needed. */ | 471 | * needed. |
472 | */ | ||
728 | int audit_alloc(struct task_struct *tsk) | 473 | int audit_alloc(struct task_struct *tsk) |
729 | { | 474 | { |
730 | struct audit_context *context; | 475 | struct audit_context *context; |
@@ -775,41 +520,76 @@ static inline void audit_free_context(struct audit_context *context) | |||
775 | printk(KERN_ERR "audit: freed %d contexts\n", count); | 520 | printk(KERN_ERR "audit: freed %d contexts\n", count); |
776 | } | 521 | } |
777 | 522 | ||
778 | static void audit_log_task_info(struct audit_buffer *ab) | 523 | static void audit_log_task_context(struct audit_buffer *ab) |
779 | { | 524 | { |
780 | char name[sizeof(current->comm)]; | 525 | char *ctx = NULL; |
781 | struct mm_struct *mm = current->mm; | 526 | ssize_t len = 0; |
527 | |||
528 | len = security_getprocattr(current, "current", NULL, 0); | ||
529 | if (len < 0) { | ||
530 | if (len != -EINVAL) | ||
531 | goto error_path; | ||
532 | return; | ||
533 | } | ||
534 | |||
535 | ctx = kmalloc(len, GFP_KERNEL); | ||
536 | if (!ctx) | ||
537 | goto error_path; | ||
538 | |||
539 | len = security_getprocattr(current, "current", ctx, len); | ||
540 | if (len < 0 ) | ||
541 | goto error_path; | ||
542 | |||
543 | audit_log_format(ab, " subj=%s", ctx); | ||
544 | return; | ||
545 | |||
546 | error_path: | ||
547 | if (ctx) | ||
548 | kfree(ctx); | ||
549 | audit_panic("error in audit_log_task_context"); | ||
550 | return; | ||
551 | } | ||
552 | |||
553 | static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | ||
554 | { | ||
555 | char name[sizeof(tsk->comm)]; | ||
556 | struct mm_struct *mm = tsk->mm; | ||
782 | struct vm_area_struct *vma; | 557 | struct vm_area_struct *vma; |
783 | 558 | ||
784 | get_task_comm(name, current); | 559 | /* tsk == current */ |
560 | |||
561 | get_task_comm(name, tsk); | ||
785 | audit_log_format(ab, " comm="); | 562 | audit_log_format(ab, " comm="); |
786 | audit_log_untrustedstring(ab, name); | 563 | audit_log_untrustedstring(ab, name); |
787 | 564 | ||
788 | if (!mm) | 565 | if (mm) { |
789 | return; | 566 | down_read(&mm->mmap_sem); |
790 | 567 | vma = mm->mmap; | |
791 | down_read(&mm->mmap_sem); | 568 | while (vma) { |
792 | vma = mm->mmap; | 569 | if ((vma->vm_flags & VM_EXECUTABLE) && |
793 | while (vma) { | 570 | vma->vm_file) { |
794 | if ((vma->vm_flags & VM_EXECUTABLE) && | 571 | audit_log_d_path(ab, "exe=", |
795 | vma->vm_file) { | 572 | vma->vm_file->f_dentry, |
796 | audit_log_d_path(ab, "exe=", | 573 | vma->vm_file->f_vfsmnt); |
797 | vma->vm_file->f_dentry, | 574 | break; |
798 | vma->vm_file->f_vfsmnt); | 575 | } |
799 | break; | 576 | vma = vma->vm_next; |
800 | } | 577 | } |
801 | vma = vma->vm_next; | 578 | up_read(&mm->mmap_sem); |
802 | } | 579 | } |
803 | up_read(&mm->mmap_sem); | 580 | audit_log_task_context(ab); |
804 | } | 581 | } |
805 | 582 | ||
806 | static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) | 583 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
807 | { | 584 | { |
808 | int i; | 585 | int i, call_panic = 0; |
809 | struct audit_buffer *ab; | 586 | struct audit_buffer *ab; |
810 | struct audit_aux_data *aux; | 587 | struct audit_aux_data *aux; |
588 | const char *tty; | ||
811 | 589 | ||
812 | ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); | 590 | /* tsk == current */ |
591 | |||
592 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); | ||
813 | if (!ab) | 593 | if (!ab) |
814 | return; /* audit_panic has been called */ | 594 | return; /* audit_panic has been called */ |
815 | audit_log_format(ab, "arch=%x syscall=%d", | 595 | audit_log_format(ab, "arch=%x syscall=%d", |
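The audit_log_task_context() helper added in the hunk above fetches the task's security label through security_getprocattr() and appends it as a subj= token. The same attribute is exposed to user space as /proc/<pid>/attr/current on SELinux-enabled systems; a small sketch that reads it for the current process, assuming that interface is present (prints "(none)" where no label is available):

#include <stdio.h>

int main(void)
{
	char ctx[256] = "";
	FILE *f = fopen("/proc/self/attr/current", "r");

	if (f) {
		if (!fgets(ctx, sizeof(ctx), f))
			ctx[0] = '\0';
		fclose(f);
	}
	/* mirrors the " subj=%s" token appended by audit_log_task_context() */
	printf("subj=%s\n", ctx[0] ? ctx : "(none)");
	return 0;
}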
@@ -820,11 +600,15 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) | |||
820 | audit_log_format(ab, " success=%s exit=%ld", | 600 | audit_log_format(ab, " success=%s exit=%ld", |
821 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", | 601 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", |
822 | context->return_code); | 602 | context->return_code); |
603 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | ||
604 | tty = tsk->signal->tty->name; | ||
605 | else | ||
606 | tty = "(none)"; | ||
823 | audit_log_format(ab, | 607 | audit_log_format(ab, |
824 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 608 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" |
825 | " pid=%d auid=%u uid=%u gid=%u" | 609 | " pid=%d auid=%u uid=%u gid=%u" |
826 | " euid=%u suid=%u fsuid=%u" | 610 | " euid=%u suid=%u fsuid=%u" |
827 | " egid=%u sgid=%u fsgid=%u", | 611 | " egid=%u sgid=%u fsgid=%u tty=%s", |
828 | context->argv[0], | 612 | context->argv[0], |
829 | context->argv[1], | 613 | context->argv[1], |
830 | context->argv[2], | 614 | context->argv[2], |
@@ -835,8 +619,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) | |||
835 | context->uid, | 619 | context->uid, |
836 | context->gid, | 620 | context->gid, |
837 | context->euid, context->suid, context->fsuid, | 621 | context->euid, context->suid, context->fsuid, |
838 | context->egid, context->sgid, context->fsgid); | 622 | context->egid, context->sgid, context->fsgid, tty); |
839 | audit_log_task_info(ab); | 623 | audit_log_task_info(ab, tsk); |
840 | audit_log_end(ab); | 624 | audit_log_end(ab); |
841 | 625 | ||
842 | for (aux = context->aux; aux; aux = aux->next) { | 626 | for (aux = context->aux; aux; aux = aux->next) { |
@@ -849,8 +633,39 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) | |||
849 | case AUDIT_IPC: { | 633 | case AUDIT_IPC: { |
850 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 634 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
851 | audit_log_format(ab, | 635 | audit_log_format(ab, |
852 | " qbytes=%lx iuid=%u igid=%u mode=%x", | 636 | " qbytes=%lx iuid=%u igid=%u mode=%x", |
853 | axi->qbytes, axi->uid, axi->gid, axi->mode); | 637 | axi->qbytes, axi->uid, axi->gid, axi->mode); |
638 | if (axi->osid != 0) { | ||
639 | char *ctx = NULL; | ||
640 | u32 len; | ||
641 | if (selinux_ctxid_to_string( | ||
642 | axi->osid, &ctx, &len)) { | ||
643 | audit_log_format(ab, " osid=%u", | ||
644 | axi->osid); | ||
645 | call_panic = 1; | ||
646 | } else | ||
647 | audit_log_format(ab, " obj=%s", ctx); | ||
648 | kfree(ctx); | ||
649 | } | ||
650 | break; } | ||
651 | |||
652 | case AUDIT_IPC_SET_PERM: { | ||
653 | struct audit_aux_data_ipcctl *axi = (void *)aux; | ||
654 | audit_log_format(ab, | ||
655 | " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", | ||
656 | axi->qbytes, axi->uid, axi->gid, axi->mode); | ||
657 | if (axi->osid != 0) { | ||
658 | char *ctx = NULL; | ||
659 | u32 len; | ||
660 | if (selinux_ctxid_to_string( | ||
661 | axi->osid, &ctx, &len)) { | ||
662 | audit_log_format(ab, " osid=%u", | ||
663 | axi->osid); | ||
664 | call_panic = 1; | ||
665 | } else | ||
666 | audit_log_format(ab, " obj=%s", ctx); | ||
667 | kfree(ctx); | ||
668 | } | ||
854 | break; } | 669 | break; } |
855 | 670 | ||
856 | case AUDIT_SOCKETCALL: { | 671 | case AUDIT_SOCKETCALL: { |
@@ -885,42 +700,65 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) | |||
885 | } | 700 | } |
886 | } | 701 | } |
887 | for (i = 0; i < context->name_count; i++) { | 702 | for (i = 0; i < context->name_count; i++) { |
703 | unsigned long ino = context->names[i].ino; | ||
704 | unsigned long pino = context->names[i].pino; | ||
705 | |||
888 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | 706 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); |
889 | if (!ab) | 707 | if (!ab) |
890 | continue; /* audit_panic has been called */ | 708 | continue; /* audit_panic has been called */ |
891 | 709 | ||
892 | audit_log_format(ab, "item=%d", i); | 710 | audit_log_format(ab, "item=%d", i); |
893 | if (context->names[i].name) { | 711 | |
894 | audit_log_format(ab, " name="); | 712 | audit_log_format(ab, " name="); |
713 | if (context->names[i].name) | ||
895 | audit_log_untrustedstring(ab, context->names[i].name); | 714 | audit_log_untrustedstring(ab, context->names[i].name); |
896 | } | 715 | else |
897 | audit_log_format(ab, " flags=%x\n", context->names[i].flags); | 716 | audit_log_format(ab, "(null)"); |
898 | 717 | ||
899 | if (context->names[i].ino != (unsigned long)-1) | 718 | if (pino != (unsigned long)-1) |
900 | audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" | 719 | audit_log_format(ab, " parent=%lu", pino); |
901 | " ouid=%u ogid=%u rdev=%02x:%02x", | 720 | if (ino != (unsigned long)-1) |
902 | context->names[i].ino, | 721 | audit_log_format(ab, " inode=%lu", ino); |
903 | MAJOR(context->names[i].dev), | 722 | if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) |
904 | MINOR(context->names[i].dev), | 723 | audit_log_format(ab, " dev=%02x:%02x mode=%#o" |
905 | context->names[i].mode, | 724 | " ouid=%u ogid=%u rdev=%02x:%02x", |
906 | context->names[i].uid, | 725 | MAJOR(context->names[i].dev), |
907 | context->names[i].gid, | 726 | MINOR(context->names[i].dev), |
908 | MAJOR(context->names[i].rdev), | 727 | context->names[i].mode, |
728 | context->names[i].uid, | ||
729 | context->names[i].gid, | ||
730 | MAJOR(context->names[i].rdev), | ||
909 | MINOR(context->names[i].rdev)); | 731 | MINOR(context->names[i].rdev)); |
732 | if (context->names[i].osid != 0) { | ||
733 | char *ctx = NULL; | ||
734 | u32 len; | ||
735 | if (selinux_ctxid_to_string( | ||
736 | context->names[i].osid, &ctx, &len)) { | ||
737 | audit_log_format(ab, " osid=%u", | ||
738 | context->names[i].osid); | ||
739 | call_panic = 2; | ||
740 | } else | ||
741 | audit_log_format(ab, " obj=%s", ctx); | ||
742 | kfree(ctx); | ||
743 | } | ||
744 | |||
910 | audit_log_end(ab); | 745 | audit_log_end(ab); |
911 | } | 746 | } |
747 | if (call_panic) | ||
748 | audit_panic("error converting sid to string"); | ||
912 | } | 749 | } |
913 | 750 | ||
914 | /* Free a per-task audit context. Called from copy_process and | 751 | /** |
915 | * __put_task_struct. */ | 752 | * audit_free - free a per-task audit context |
753 | * @tsk: task whose audit context block to free | ||
754 | * | ||
755 | * Called from copy_process and do_exit | ||
756 | */ | ||
916 | void audit_free(struct task_struct *tsk) | 757 | void audit_free(struct task_struct *tsk) |
917 | { | 758 | { |
918 | struct audit_context *context; | 759 | struct audit_context *context; |
919 | 760 | ||
920 | task_lock(tsk); | ||
921 | context = audit_get_context(tsk, 0, 0); | 761 | context = audit_get_context(tsk, 0, 0); |
922 | task_unlock(tsk); | ||
923 | |||
924 | if (likely(!context)) | 762 | if (likely(!context)) |
925 | return; | 763 | return; |
926 | 764 | ||
@@ -928,29 +766,43 @@ void audit_free(struct task_struct *tsk) | |||
928 | * function (e.g., exit_group), then free context block. | 766 | * function (e.g., exit_group), then free context block. |
929 | * We use GFP_ATOMIC here because we might be doing this | 767 | * We use GFP_ATOMIC here because we might be doing this |
930 | * in the context of the idle thread */ | 768 | * in the context of the idle thread */ |
769 | /* that can happen only if we are called from do_exit() */ | ||
931 | if (context->in_syscall && context->auditable) | 770 | if (context->in_syscall && context->auditable) |
932 | audit_log_exit(context, GFP_ATOMIC); | 771 | audit_log_exit(context, tsk); |
933 | 772 | ||
934 | audit_free_context(context); | 773 | audit_free_context(context); |
935 | } | 774 | } |
936 | 775 | ||
937 | /* Fill in audit context at syscall entry. This only happens if the | 776 | /** |
777 | * audit_syscall_entry - fill in an audit record at syscall entry | ||
778 | * @tsk: task being audited | ||
779 | * @arch: architecture type | ||
780 | * @major: major syscall type (function) | ||
781 | * @a1: additional syscall register 1 | ||
782 | * @a2: additional syscall register 2 | ||
783 | * @a3: additional syscall register 3 | ||
784 | * @a4: additional syscall register 4 | ||
785 | * | ||
786 | * Fill in audit context at syscall entry. This only happens if the | ||
938 | * audit context was created when the task was created and the state or | 787 | * audit context was created when the task was created and the state or |
939 | * filters demand the audit context be built. If the state from the | 788 | * filters demand the audit context be built. If the state from the |
940 | * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, | 789 | * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, |
941 | * then the record will be written at syscall exit time (otherwise, it | 790 | * then the record will be written at syscall exit time (otherwise, it |
942 | * will only be written if another part of the kernel requests that it | 791 | * will only be written if another part of the kernel requests that it |
943 | * be written). */ | 792 | * be written). |
944 | void audit_syscall_entry(struct task_struct *tsk, int arch, int major, | 793 | */ |
794 | void audit_syscall_entry(int arch, int major, | ||
945 | unsigned long a1, unsigned long a2, | 795 | unsigned long a1, unsigned long a2, |
946 | unsigned long a3, unsigned long a4) | 796 | unsigned long a3, unsigned long a4) |
947 | { | 797 | { |
798 | struct task_struct *tsk = current; | ||
948 | struct audit_context *context = tsk->audit_context; | 799 | struct audit_context *context = tsk->audit_context; |
949 | enum audit_state state; | 800 | enum audit_state state; |
950 | 801 | ||
951 | BUG_ON(!context); | 802 | BUG_ON(!context); |
952 | 803 | ||
953 | /* This happens only on certain architectures that make system | 804 | /* |
805 | * This happens only on certain architectures that make system | ||
954 | * calls in kernel_thread via the entry.S interface, instead of | 806 | * calls in kernel_thread via the entry.S interface, instead of |
955 | * with direct calls. (If you are porting to a new | 807 | * with direct calls. (If you are porting to a new |
956 | * architecture, hitting this condition can indicate that you | 808 | * architecture, hitting this condition can indicate that you |
@@ -958,7 +810,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, | |||
958 | * | 810 | * |
959 | * i386 no | 811 | * i386 no |
960 | * x86_64 no | 812 | * x86_64 no |
961 | * ppc64 yes (see arch/ppc64/kernel/misc.S) | 813 | * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) |
962 | * | 814 | * |
963 | * This also happens with vm86 emulation in a non-nested manner | 815 | * This also happens with vm86 emulation in a non-nested manner |
964 | * (entries without exits), so this case must be caught. | 816 | * (entries without exits), so this case must be caught. |
@@ -966,11 +818,6 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, | |||
966 | if (context->in_syscall) { | 818 | if (context->in_syscall) { |
967 | struct audit_context *newctx; | 819 | struct audit_context *newctx; |
968 | 820 | ||
969 | #if defined(__NR_vm86) && defined(__NR_vm86old) | ||
970 | /* vm86 mode should only be entered once */ | ||
971 | if (major == __NR_vm86 || major == __NR_vm86old) | ||
972 | return; | ||
973 | #endif | ||
974 | #if AUDIT_DEBUG | 821 | #if AUDIT_DEBUG |
975 | printk(KERN_ERR | 822 | printk(KERN_ERR |
976 | "audit(:%d) pid=%d in syscall=%d;" | 823 | "audit(:%d) pid=%d in syscall=%d;" |
@@ -1014,27 +861,30 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, | |||
1014 | context->auditable = !!(state == AUDIT_RECORD_CONTEXT); | 861 | context->auditable = !!(state == AUDIT_RECORD_CONTEXT); |
1015 | } | 862 | } |
1016 | 863 | ||
1017 | /* Tear down after system call. If the audit context has been marked as | 864 | /** |
865 | * audit_syscall_exit - deallocate audit context after a system call | ||
866 | * @tsk: task being audited | ||
867 | * @valid: success/failure flag | ||
868 | * @return_code: syscall return value | ||
869 | * | ||
870 | * Tear down after system call. If the audit context has been marked as | ||
1018 | * auditable (either because of the AUDIT_RECORD_CONTEXT state from | 871 | * auditable (either because of the AUDIT_RECORD_CONTEXT state from |
1019 | * filtering, or because some other part of the kernel writes an audit | 872 | * filtering, or because some other part of the kernel writes an audit |
1020 | * message), then write out the syscall information. In all cases, | 873 | * message), then write out the syscall information. In all cases, |
1021 | * free the names stored from getname(). */ | 874 | * free the names stored from getname(). |
1022 | void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) | 875 | */ |
876 | void audit_syscall_exit(int valid, long return_code) | ||
1023 | { | 877 | { |
878 | struct task_struct *tsk = current; | ||
1024 | struct audit_context *context; | 879 | struct audit_context *context; |
1025 | 880 | ||
1026 | get_task_struct(tsk); | ||
1027 | task_lock(tsk); | ||
1028 | context = audit_get_context(tsk, valid, return_code); | 881 | context = audit_get_context(tsk, valid, return_code); |
1029 | task_unlock(tsk); | ||
1030 | 882 | ||
1031 | /* Not having a context here is ok, since the parent may have | ||
1032 | * called __put_task_struct. */ | ||
1033 | if (likely(!context)) | 883 | if (likely(!context)) |
1034 | goto out; | 884 | return; |
1035 | 885 | ||
1036 | if (context->in_syscall && context->auditable) | 886 | if (context->in_syscall && context->auditable) |
1037 | audit_log_exit(context, GFP_KERNEL); | 887 | audit_log_exit(context, tsk); |
1038 | 888 | ||
1039 | context->in_syscall = 0; | 889 | context->in_syscall = 0; |
1040 | context->auditable = 0; | 890 | context->auditable = 0; |
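With the task argument gone, both hooks now derive the audited task from current. A minimal sketch of how an architecture's syscall-trace path might call the reworked prototypes; the wrapper functions are hypothetical, the pt_regs field names follow the i386 layout of this era, and AUDITSC_RESULT() is assumed to come from the audit header:

    #include <linux/audit.h>
    #include <linux/sched.h>
    #include <asm/ptrace.h>

    /* Hypothetical arch hooks, written against the prototypes shown above. */
    static void example_trace_enter(struct pt_regs *regs)
    {
    	if (unlikely(current->audit_context))
    		audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
    				    regs->ebx, regs->ecx,
    				    regs->edx, regs->esi);
    }

    static void example_trace_leave(struct pt_regs *regs)
    {
    	if (unlikely(current->audit_context))
    		/* valid: success/failure flag, return_code: syscall result */
    		audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
    }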
@@ -1049,11 +899,15 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) | |||
1049 | audit_free_aux(context); | 899 | audit_free_aux(context); |
1050 | tsk->audit_context = context; | 900 | tsk->audit_context = context; |
1051 | } | 901 | } |
1052 | out: | ||
1053 | put_task_struct(tsk); | ||
1054 | } | 902 | } |
1055 | 903 | ||
1056 | /* Add a name to the list. Called from fs/namei.c:getname(). */ | 904 | /** |
905 | * audit_getname - add a name to the list | ||
906 | * @name: name to add | ||
907 | * | ||
908 | * Add a name to the list of audit names for this context. | ||
909 | * Called from fs/namei.c:getname(). | ||
910 | */ | ||
1057 | void audit_getname(const char *name) | 911 | void audit_getname(const char *name) |
1058 | { | 912 | { |
1059 | struct audit_context *context = current->audit_context; | 913 | struct audit_context *context = current->audit_context; |
@@ -1082,10 +936,13 @@ void audit_getname(const char *name) | |||
1082 | 936 | ||
1083 | } | 937 | } |
1084 | 938 | ||
1085 | /* Intercept a putname request. Called from | 939 | /* audit_putname - intercept a putname request |
1086 | * include/linux/fs.h:putname(). If we have stored the name from | 940 | * @name: name to intercept and delay for putname |
1087 | * getname in the audit context, then we delay the putname until syscall | 941 | * |
1088 | * exit. */ | 942 | * If we have stored the name from getname in the audit context, |
943 | * then we delay the putname until syscall exit. | ||
944 | * Called from include/linux/fs.h:putname(). | ||
945 | */ | ||
1089 | void audit_putname(const char *name) | 946 | void audit_putname(const char *name) |
1090 | { | 947 | { |
1091 | struct audit_context *context = current->audit_context; | 948 | struct audit_context *context = current->audit_context; |
@@ -1100,7 +957,7 @@ void audit_putname(const char *name) | |||
1100 | for (i = 0; i < context->name_count; i++) | 957 | for (i = 0; i < context->name_count; i++) |
1101 | printk(KERN_ERR "name[%d] = %p = %s\n", i, | 958 | printk(KERN_ERR "name[%d] = %p = %s\n", i, |
1102 | context->names[i].name, | 959 | context->names[i].name, |
1103 | context->names[i].name); | 960 | context->names[i].name ?: "(null)"); |
1104 | } | 961 | } |
1105 | #endif | 962 | #endif |
1106 | __putname(name); | 963 | __putname(name); |
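For orientation, audit_getname() and audit_putname() above hook the VFS name helpers; a hedged sketch of the getname()/putname() pairing a path-taking syscall normally performs (the syscall body is illustrative):

    #include <linux/fs.h>
    #include <linux/err.h>

    /* Illustrative shape of any path-taking syscall. */
    static long example_path_syscall(const char __user *pathname)
    {
    	char *name = getname(pathname);	/* records the string via audit_getname() */

    	if (IS_ERR(name))
    		return PTR_ERR(name);

    	/* ... act on the path ... */

    	putname(name);	/* audit may defer the actual free until syscall exit */
    	return 0;
    }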
@@ -1122,9 +979,23 @@ void audit_putname(const char *name) | |||
1122 | #endif | 979 | #endif |
1123 | } | 980 | } |
1124 | 981 | ||
1125 | /* Store the inode and device from a lookup. Called from | 982 | static void audit_inode_context(int idx, const struct inode *inode) |
1126 | * fs/namei.c:path_lookup(). */ | 983 | { |
1127 | void audit_inode(const char *name, const struct inode *inode, unsigned flags) | 984 | struct audit_context *context = current->audit_context; |
985 | |||
986 | selinux_get_inode_sid(inode, &context->names[idx].osid); | ||
987 | } | ||
988 | |||
989 | |||
990 | /** | ||
991 | * audit_inode - store the inode and device from a lookup | ||
992 | * @name: name being audited | ||
993 | * @inode: inode being audited | ||
994 | * @flags: lookup flags (as used in path_lookup()) | ||
995 | * | ||
996 | * Called from fs/namei.c:path_lookup(). | ||
997 | */ | ||
998 | void __audit_inode(const char *name, const struct inode *inode, unsigned flags) | ||
1128 | { | 999 | { |
1129 | int idx; | 1000 | int idx; |
1130 | struct audit_context *context = current->audit_context; | 1001 | struct audit_context *context = current->audit_context; |
@@ -1150,15 +1021,105 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags) | |||
1150 | ++context->ino_count; | 1021 | ++context->ino_count; |
1151 | #endif | 1022 | #endif |
1152 | } | 1023 | } |
1153 | context->names[idx].flags = flags; | ||
1154 | context->names[idx].ino = inode->i_ino; | ||
1155 | context->names[idx].dev = inode->i_sb->s_dev; | 1024 | context->names[idx].dev = inode->i_sb->s_dev; |
1156 | context->names[idx].mode = inode->i_mode; | 1025 | context->names[idx].mode = inode->i_mode; |
1157 | context->names[idx].uid = inode->i_uid; | 1026 | context->names[idx].uid = inode->i_uid; |
1158 | context->names[idx].gid = inode->i_gid; | 1027 | context->names[idx].gid = inode->i_gid; |
1159 | context->names[idx].rdev = inode->i_rdev; | 1028 | context->names[idx].rdev = inode->i_rdev; |
1029 | audit_inode_context(idx, inode); | ||
1030 | if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && | ||
1031 | (strcmp(name, ".") != 0)) { | ||
1032 | context->names[idx].ino = (unsigned long)-1; | ||
1033 | context->names[idx].pino = inode->i_ino; | ||
1034 | } else { | ||
1035 | context->names[idx].ino = inode->i_ino; | ||
1036 | context->names[idx].pino = (unsigned long)-1; | ||
1037 | } | ||
1160 | } | 1038 | } |
1161 | 1039 | ||
1040 | /** | ||
1041 | * audit_inode_child - collect inode info for created/removed objects | ||
1042 | * @dname: inode's dentry name | ||
1043 | * @inode: inode being audited | ||
1044 | * @pino: inode number of dentry parent | ||
1045 | * | ||
1046 | * For syscalls that create or remove filesystem objects, audit_inode | ||
1047 | * can only collect information for the filesystem object's parent. | ||
1048 | * This call updates the audit context with the child's information. | ||
1049 | * Syscalls that create a new filesystem object must be hooked after | ||
1050 | * the object is created. Syscalls that remove a filesystem object | ||
1051 | * must be hooked prior, in order to capture the target inode during | ||
1052 | * unsuccessful attempts. | ||
1053 | */ | ||
1054 | void __audit_inode_child(const char *dname, const struct inode *inode, | ||
1055 | unsigned long pino) | ||
1056 | { | ||
1057 | int idx; | ||
1058 | struct audit_context *context = current->audit_context; | ||
1059 | |||
1060 | if (!context->in_syscall) | ||
1061 | return; | ||
1062 | |||
1063 | /* determine matching parent */ | ||
1064 | if (dname) | ||
1065 | for (idx = 0; idx < context->name_count; idx++) | ||
1066 | if (context->names[idx].pino == pino) { | ||
1067 | const char *n; | ||
1068 | const char *name = context->names[idx].name; | ||
1069 | int dlen = strlen(dname); | ||
1070 | int nlen = name ? strlen(name) : 0; | ||
1071 | |||
1072 | if (nlen < dlen) | ||
1073 | continue; | ||
1074 | |||
1075 | /* disregard trailing slashes */ | ||
1076 | n = name + nlen - 1; | ||
1077 | while ((*n == '/') && (n > name)) | ||
1078 | n--; | ||
1079 | |||
1080 | /* find last path component */ | ||
1081 | n = n - dlen + 1; | ||
1082 | if (n < name) | ||
1083 | continue; | ||
1084 | else if (n > name) { | ||
1085 | if (*--n != '/') | ||
1086 | continue; | ||
1087 | else | ||
1088 | n++; | ||
1089 | } | ||
1090 | |||
1091 | if (strncmp(n, dname, dlen) == 0) | ||
1092 | goto update_context; | ||
1093 | } | ||
1094 | |||
1095 | /* catch-all in case match not found */ | ||
1096 | idx = context->name_count++; | ||
1097 | context->names[idx].name = NULL; | ||
1098 | context->names[idx].pino = pino; | ||
1099 | #if AUDIT_DEBUG | ||
1100 | context->ino_count++; | ||
1101 | #endif | ||
1102 | |||
1103 | update_context: | ||
1104 | if (inode) { | ||
1105 | context->names[idx].ino = inode->i_ino; | ||
1106 | context->names[idx].dev = inode->i_sb->s_dev; | ||
1107 | context->names[idx].mode = inode->i_mode; | ||
1108 | context->names[idx].uid = inode->i_uid; | ||
1109 | context->names[idx].gid = inode->i_gid; | ||
1110 | context->names[idx].rdev = inode->i_rdev; | ||
1111 | audit_inode_context(idx, inode); | ||
1112 | } | ||
1113 | } | ||
1114 | |||
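A hedged sketch of where the child hook is meant to sit: a create-side helper can pass the new dentry and the parent directory's inode number once the object exists. The helper is illustrative, and an audit_inode_child() wrapper that tests current->audit_context is assumed to live in the audit header:

    #include <linux/fs.h>
    #include <linux/audit.h>

    /* Illustrative create-side hook: parent directory plus the freshly created dentry. */
    static inline void example_notify_create(struct inode *dir, struct dentry *dentry)
    {
    	/* Matched back to the parent's record via dir->i_ino (the pino argument). */
    	audit_inode_child(dentry->d_name.name, dentry->d_inode, dir->i_ino);
    }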
1115 | /** | ||
1116 | * auditsc_get_stamp - get local copies of audit_context values | ||
1117 | * @ctx: audit_context for the task | ||
1118 | * @t: timespec to store time recorded in the audit_context | ||
1119 | * @serial: serial value that is recorded in the audit_context | ||
1120 | * | ||
1121 | * Also sets the context as auditable. | ||
1122 | */ | ||
1162 | void auditsc_get_stamp(struct audit_context *ctx, | 1123 | void auditsc_get_stamp(struct audit_context *ctx, |
1163 | struct timespec *t, unsigned int *serial) | 1124 | struct timespec *t, unsigned int *serial) |
1164 | { | 1125 | { |
@@ -1170,6 +1131,15 @@ void auditsc_get_stamp(struct audit_context *ctx, | |||
1170 | ctx->auditable = 1; | 1131 | ctx->auditable = 1; |
1171 | } | 1132 | } |
1172 | 1133 | ||
1134 | /** | ||
1135 | * audit_set_loginuid - set a task's audit_context loginuid | ||
1136 | * @task: task whose audit context is being modified | ||
1137 | * @loginuid: loginuid value | ||
1138 | * | ||
1139 | * Returns 0. | ||
1140 | * | ||
1141 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). | ||
1142 | */ | ||
1173 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | 1143 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) |
1174 | { | 1144 | { |
1175 | if (task->audit_context) { | 1145 | if (task->audit_context) { |
@@ -1188,12 +1158,24 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | |||
1188 | return 0; | 1158 | return 0; |
1189 | } | 1159 | } |
1190 | 1160 | ||
1161 | /** | ||
1162 | * audit_get_loginuid - get the loginuid for an audit_context | ||
1163 | * @ctx: the audit_context | ||
1164 | * | ||
1165 | * Returns the context's loginuid or -1 if @ctx is NULL. | ||
1166 | */ | ||
1191 | uid_t audit_get_loginuid(struct audit_context *ctx) | 1167 | uid_t audit_get_loginuid(struct audit_context *ctx) |
1192 | { | 1168 | { |
1193 | return ctx ? ctx->loginuid : -1; | 1169 | return ctx ? ctx->loginuid : -1; |
1194 | } | 1170 | } |
1195 | 1171 | ||
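As the comments note, the loginuid setter is driven from procfs; a login manager typically stamps the session by writing the uid to the per-process proc file. A hedged userspace-style sketch; the /proc/self/loginuid path is an assumption based on the proc_loginuid_write() reference above:

    #include <stdio.h>

    /* Userspace illustration: pam-style helper recording the session's loginuid. */
    static int example_set_loginuid(unsigned int uid)
    {
    	FILE *f = fopen("/proc/self/loginuid", "w");

    	if (!f)
    		return -1;
    	fprintf(f, "%u", uid);
    	return fclose(f);
    }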
1196 | int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) | 1172 | /** |
1173 | * audit_ipc_obj - record audit data for ipc object | ||
1174 | * @ipcp: ipc permissions | ||
1175 | * | ||
1176 | * Returns 0 for success or NULL context or < 0 on error. | ||
1177 | */ | ||
1178 | int audit_ipc_obj(struct kern_ipc_perm *ipcp) | ||
1197 | { | 1179 | { |
1198 | struct audit_aux_data_ipcctl *ax; | 1180 | struct audit_aux_data_ipcctl *ax; |
1199 | struct audit_context *context = current->audit_context; | 1181 | struct audit_context *context = current->audit_context; |
@@ -1201,7 +1183,39 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) | |||
1201 | if (likely(!context)) | 1183 | if (likely(!context)) |
1202 | return 0; | 1184 | return 0; |
1203 | 1185 | ||
1204 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); | 1186 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); |
1187 | if (!ax) | ||
1188 | return -ENOMEM; | ||
1189 | |||
1190 | ax->uid = ipcp->uid; | ||
1191 | ax->gid = ipcp->gid; | ||
1192 | ax->mode = ipcp->mode; | ||
1193 | selinux_get_ipc_sid(ipcp, &ax->osid); | ||
1194 | |||
1195 | ax->d.type = AUDIT_IPC; | ||
1196 | ax->d.next = context->aux; | ||
1197 | context->aux = (void *)ax; | ||
1198 | return 0; | ||
1199 | } | ||
1200 | |||
1201 | /** | ||
1202 | * audit_ipc_set_perm - record audit data for new ipc permissions | ||
1203 | * @qbytes: msgq bytes | ||
1204 | * @uid: msgq user id | ||
1205 | * @gid: msgq group id | ||
1206 | * @mode: msgq mode (permissions) | ||
1207 | * | ||
1208 | * Returns 0 for success or NULL context or < 0 on error. | ||
1209 | */ | ||
1210 | int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) | ||
1211 | { | ||
1212 | struct audit_aux_data_ipcctl *ax; | ||
1213 | struct audit_context *context = current->audit_context; | ||
1214 | |||
1215 | if (likely(!context)) | ||
1216 | return 0; | ||
1217 | |||
1218 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1205 | if (!ax) | 1219 | if (!ax) |
1206 | return -ENOMEM; | 1220 | return -ENOMEM; |
1207 | 1221 | ||
@@ -1209,13 +1223,21 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) | |||
1209 | ax->uid = uid; | 1223 | ax->uid = uid; |
1210 | ax->gid = gid; | 1224 | ax->gid = gid; |
1211 | ax->mode = mode; | 1225 | ax->mode = mode; |
1226 | selinux_get_ipc_sid(ipcp, &ax->osid); | ||
1212 | 1227 | ||
1213 | ax->d.type = AUDIT_IPC; | 1228 | ax->d.type = AUDIT_IPC_SET_PERM; |
1214 | ax->d.next = context->aux; | 1229 | ax->d.next = context->aux; |
1215 | context->aux = (void *)ax; | 1230 | context->aux = (void *)ax; |
1216 | return 0; | 1231 | return 0; |
1217 | } | 1232 | } |
1218 | 1233 | ||
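A minimal sketch of how the IPC paths might feed these two records: the object record before a permission check, and the new-permissions record on an IPC_SET style update. The surrounding functions and call sites are assumptions, not taken from this patch:

    #include <linux/ipc.h>
    #include <linux/audit.h>

    /* Illustrative: log the object being checked, then the perms being installed. */
    static int example_ipc_permission_check(struct kern_ipc_perm *ipcp)
    {
    	int err = audit_ipc_obj(ipcp);	/* current uid/gid/mode plus SELinux sid */

    	if (err)
    		return err;
    	/* ... usual DAC/MAC permission check ... */
    	return 0;
    }

    static int example_ipc_setperm(struct kern_ipc_perm *ipcp,
    			       uid_t uid, gid_t gid, mode_t mode)
    {
    	/* qbytes is only meaningful for message queues; pass 0 elsewhere. */
    	return audit_ipc_set_perm(0, uid, gid, mode, ipcp);
    }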
1234 | /** | ||
1235 | * audit_socketcall - record audit data for sys_socketcall | ||
1236 | * @nargs: number of args | ||
1237 | * @args: args array | ||
1238 | * | ||
1239 | * Returns 0 for success or NULL context or < 0 on error. | ||
1240 | */ | ||
1219 | int audit_socketcall(int nargs, unsigned long *args) | 1241 | int audit_socketcall(int nargs, unsigned long *args) |
1220 | { | 1242 | { |
1221 | struct audit_aux_data_socketcall *ax; | 1243 | struct audit_aux_data_socketcall *ax; |
@@ -1237,6 +1259,13 @@ int audit_socketcall(int nargs, unsigned long *args) | |||
1237 | return 0; | 1259 | return 0; |
1238 | } | 1260 | } |
1239 | 1261 | ||
1262 | /** | ||
1263 | * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto | ||
1264 | * @len: data length in user space | ||
1265 | * @a: data address in kernel space | ||
1266 | * | ||
1267 | * Returns 0 for success or NULL context or < 0 on error. | ||
1268 | */ | ||
1240 | int audit_sockaddr(int len, void *a) | 1269 | int audit_sockaddr(int len, void *a) |
1241 | { | 1270 | { |
1242 | struct audit_aux_data_sockaddr *ax; | 1271 | struct audit_aux_data_sockaddr *ax; |
@@ -1258,6 +1287,15 @@ int audit_sockaddr(int len, void *a) | |||
1258 | return 0; | 1287 | return 0; |
1259 | } | 1288 | } |
1260 | 1289 | ||
1290 | /** | ||
1291 | * audit_avc_path - record the granting or denial of permissions | ||
1292 | * @dentry: dentry to record | ||
1293 | * @mnt: mnt to record | ||
1294 | * | ||
1295 | * Returns 0 for success or NULL context or < 0 on error. | ||
1296 | * | ||
1297 | * Called from security/selinux/avc.c::avc_audit() | ||
1298 | */ | ||
1261 | int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) | 1299 | int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) |
1262 | { | 1300 | { |
1263 | struct audit_aux_data_path *ax; | 1301 | struct audit_aux_data_path *ax; |
@@ -1279,6 +1317,14 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) | |||
1279 | return 0; | 1317 | return 0; |
1280 | } | 1318 | } |
1281 | 1319 | ||
1320 | /** | ||
1321 | * audit_signal_info - record signal info for shutting down audit subsystem | ||
1322 | * @sig: signal value | ||
1323 | * @t: task being signaled | ||
1324 | * | ||
1325 | * If the audit subsystem is being terminated, record the task (pid) | ||
1326 | * and uid that is doing that. | ||
1327 | */ | ||
1282 | void audit_signal_info(int sig, struct task_struct *t) | 1328 | void audit_signal_info(int sig, struct task_struct *t) |
1283 | { | 1329 | { |
1284 | extern pid_t audit_sig_pid; | 1330 | extern pid_t audit_sig_pid; |
@@ -1295,4 +1341,3 @@ void audit_signal_info(int sig, struct task_struct *t) | |||
1295 | } | 1341 | } |
1296 | } | 1342 | } |
1297 | } | 1343 | } |
1298 | |||
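Rounding out the auxiliary records above, a hedged sketch of the networking-side callers: the socketcall multiplexer can log its argument block up front, and the address-taking calls can stash the sockaddr once it has been copied into kernel space (both functions are illustrative):

    #include <linux/audit.h>

    /* Illustrative: args[] has already been copied in from user space. */
    static long example_socketcall_dispatch(int nargs, unsigned long args[])
    {
    	int err = audit_socketcall(nargs, args);

    	if (err)
    		return err;
    	/* ... dispatch to the real sys_bind()/sys_connect()/... handlers ... */
    	return 0;
    }

    static int example_record_sockaddr(void *kaddr, int len)
    {
    	/* kaddr: sockaddr already moved into kernel space, len from the caller */
    	return audit_sockaddr(len, kaddr);
    }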
diff --git a/kernel/capability.c b/kernel/capability.c index bfa3c92e16..1a4d8a40d3 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -233,3 +233,19 @@ out: | |||
233 | 233 | ||
234 | return ret; | 234 | return ret; |
235 | } | 235 | } |
236 | |||
237 | int __capable(struct task_struct *t, int cap) | ||
238 | { | ||
239 | if (security_capable(t, cap) == 0) { | ||
240 | t->flags |= PF_SUPERPRIV; | ||
241 | return 1; | ||
242 | } | ||
243 | return 0; | ||
244 | } | ||
245 | EXPORT_SYMBOL(__capable); | ||
246 | |||
247 | int capable(int cap) | ||
248 | { | ||
249 | return __capable(current, cap); | ||
250 | } | ||
251 | EXPORT_SYMBOL(capable); | ||
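The new helpers centralise the PF_SUPERPRIV bookkeeping, so a caller stays a one-liner. A small sketch, assuming a driver-style privileged operation (the wrapper is hypothetical):

    #include <linux/capability.h>
    #include <linux/sched.h>
    #include <linux/errno.h>

    /* Hypothetical privileged operation: refuse anyone without CAP_SYS_ADMIN. */
    static int example_privileged_op(void)
    {
    	if (!capable(CAP_SYS_ADMIN))
    		return -EPERM;
    	/* __capable(tsk, CAP_SYS_ADMIN) performs the same check for another task. */
    	return 0;
    }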
diff --git a/kernel/compat.c b/kernel/compat.c index 8c9cd88b67..c1601a84f8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -17,10 +17,10 @@ | |||
17 | #include <linux/time.h> | 17 | #include <linux/time.h> |
18 | #include <linux/signal.h> | 18 | #include <linux/signal.h> |
19 | #include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ | 19 | #include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ |
20 | #include <linux/futex.h> /* for FUTEX_WAIT */ | ||
21 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
22 | #include <linux/unistd.h> | 21 | #include <linux/unistd.h> |
23 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/timex.h> | ||
24 | 24 | ||
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | 26 | ||
@@ -238,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | |||
238 | return ret; | 238 | return ret; |
239 | } | 239 | } |
240 | 240 | ||
241 | #ifdef CONFIG_FUTEX | ||
242 | asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val, | ||
243 | struct compat_timespec __user *utime, u32 __user *uaddr2, | ||
244 | int val3) | ||
245 | { | ||
246 | struct timespec t; | ||
247 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | ||
248 | int val2 = 0; | ||
249 | |||
250 | if ((op == FUTEX_WAIT) && utime) { | ||
251 | if (get_compat_timespec(&t, utime)) | ||
252 | return -EFAULT; | ||
253 | timeout = timespec_to_jiffies(&t) + 1; | ||
254 | } | ||
255 | if (op >= FUTEX_REQUEUE) | ||
256 | val2 = (int) (unsigned long) utime; | ||
257 | |||
258 | return do_futex((unsigned long)uaddr, op, val, timeout, | ||
259 | (unsigned long)uaddr2, val2, val3); | ||
260 | } | ||
261 | #endif | ||
262 | |||
263 | asmlinkage long compat_sys_setrlimit(unsigned int resource, | 241 | asmlinkage long compat_sys_setrlimit(unsigned int resource, |
264 | struct compat_rlimit __user *rlim) | 242 | struct compat_rlimit __user *rlim) |
265 | { | 243 | { |
@@ -898,3 +876,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat | |||
898 | return -ERESTARTNOHAND; | 876 | return -ERESTARTNOHAND; |
899 | } | 877 | } |
900 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ | 878 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ |
879 | |||
880 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) | ||
881 | { | ||
882 | struct timex txc; | ||
883 | int ret; | ||
884 | |||
885 | memset(&txc, 0, sizeof(struct timex)); | ||
886 | |||
887 | if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || | ||
888 | __get_user(txc.modes, &utp->modes) || | ||
889 | __get_user(txc.offset, &utp->offset) || | ||
890 | __get_user(txc.freq, &utp->freq) || | ||
891 | __get_user(txc.maxerror, &utp->maxerror) || | ||
892 | __get_user(txc.esterror, &utp->esterror) || | ||
893 | __get_user(txc.status, &utp->status) || | ||
894 | __get_user(txc.constant, &utp->constant) || | ||
895 | __get_user(txc.precision, &utp->precision) || | ||
896 | __get_user(txc.tolerance, &utp->tolerance) || | ||
897 | __get_user(txc.time.tv_sec, &utp->time.tv_sec) || | ||
898 | __get_user(txc.time.tv_usec, &utp->time.tv_usec) || | ||
899 | __get_user(txc.tick, &utp->tick) || | ||
900 | __get_user(txc.ppsfreq, &utp->ppsfreq) || | ||
901 | __get_user(txc.jitter, &utp->jitter) || | ||
902 | __get_user(txc.shift, &utp->shift) || | ||
903 | __get_user(txc.stabil, &utp->stabil) || | ||
904 | __get_user(txc.jitcnt, &utp->jitcnt) || | ||
905 | __get_user(txc.calcnt, &utp->calcnt) || | ||
906 | __get_user(txc.errcnt, &utp->errcnt) || | ||
907 | __get_user(txc.stbcnt, &utp->stbcnt)) | ||
908 | return -EFAULT; | ||
909 | |||
910 | ret = do_adjtimex(&txc); | ||
911 | |||
912 | if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || | ||
913 | __put_user(txc.modes, &utp->modes) || | ||
914 | __put_user(txc.offset, &utp->offset) || | ||
915 | __put_user(txc.freq, &utp->freq) || | ||
916 | __put_user(txc.maxerror, &utp->maxerror) || | ||
917 | __put_user(txc.esterror, &utp->esterror) || | ||
918 | __put_user(txc.status, &utp->status) || | ||
919 | __put_user(txc.constant, &utp->constant) || | ||
920 | __put_user(txc.precision, &utp->precision) || | ||
921 | __put_user(txc.tolerance, &utp->tolerance) || | ||
922 | __put_user(txc.time.tv_sec, &utp->time.tv_sec) || | ||
923 | __put_user(txc.time.tv_usec, &utp->time.tv_usec) || | ||
924 | __put_user(txc.tick, &utp->tick) || | ||
925 | __put_user(txc.ppsfreq, &utp->ppsfreq) || | ||
926 | __put_user(txc.jitter, &utp->jitter) || | ||
927 | __put_user(txc.shift, &utp->shift) || | ||
928 | __put_user(txc.stabil, &utp->stabil) || | ||
929 | __put_user(txc.jitcnt, &utp->jitcnt) || | ||
930 | __put_user(txc.calcnt, &utp->calcnt) || | ||
931 | __put_user(txc.errcnt, &utp->errcnt) || | ||
932 | __put_user(txc.stbcnt, &utp->stbcnt)) | ||
933 | ret = -EFAULT; | ||
934 | |||
935 | return ret; | ||
936 | } | ||
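The handler above follows the usual compat pattern: one access_ok() probe per direction, then unchecked __get_user()/__put_user() calls so any fault aborts the whole transfer. A condensed sketch of that pattern for a hypothetical two-field structure:

    #include <linux/compat.h>
    #include <linux/errno.h>
    #include <asm/uaccess.h>

    /* Hypothetical layout: compat fields that map 1:1 onto the native struct. */
    struct example_compat { compat_long_t a; compat_int_t b; };
    struct example_native { long a; int b; };

    static int example_get_compat(struct example_native *k,
    			      struct example_compat __user *u)
    {
    	if (!access_ok(VERIFY_READ, u, sizeof(*u)) ||
    	    __get_user(k->a, &u->a) ||
    	    __get_user(k->b, &u->b))
    		return -EFAULT;
    	return 0;
    }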
diff --git a/kernel/cpu.c b/kernel/cpu.c index e882c6babf..fe2b8d0bfe 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -18,7 +18,7 @@ | |||
18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
19 | static DECLARE_MUTEX(cpucontrol); | 19 | static DECLARE_MUTEX(cpucontrol); |
20 | 20 | ||
21 | static struct notifier_block *cpu_chain; | 21 | static BLOCKING_NOTIFIER_HEAD(cpu_chain); |
22 | 22 | ||
23 | #ifdef CONFIG_HOTPLUG_CPU | 23 | #ifdef CONFIG_HOTPLUG_CPU |
24 | static struct task_struct *lock_cpu_hotplug_owner; | 24 | static struct task_struct *lock_cpu_hotplug_owner; |
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); | |||
71 | /* Need to know about CPUs going up/down? */ | 71 | /* Need to know about CPUs going up/down? */ |
72 | int register_cpu_notifier(struct notifier_block *nb) | 72 | int register_cpu_notifier(struct notifier_block *nb) |
73 | { | 73 | { |
74 | int ret; | 74 | return blocking_notifier_chain_register(&cpu_chain, nb); |
75 | |||
76 | if ((ret = lock_cpu_hotplug_interruptible()) != 0) | ||
77 | return ret; | ||
78 | ret = notifier_chain_register(&cpu_chain, nb); | ||
79 | unlock_cpu_hotplug(); | ||
80 | return ret; | ||
81 | } | 75 | } |
82 | EXPORT_SYMBOL(register_cpu_notifier); | 76 | EXPORT_SYMBOL(register_cpu_notifier); |
83 | 77 | ||
84 | void unregister_cpu_notifier(struct notifier_block *nb) | 78 | void unregister_cpu_notifier(struct notifier_block *nb) |
85 | { | 79 | { |
86 | lock_cpu_hotplug(); | 80 | blocking_notifier_chain_unregister(&cpu_chain, nb); |
87 | notifier_chain_unregister(&cpu_chain, nb); | ||
88 | unlock_cpu_hotplug(); | ||
89 | } | 81 | } |
90 | EXPORT_SYMBOL(unregister_cpu_notifier); | 82 | EXPORT_SYMBOL(unregister_cpu_notifier); |
91 | 83 | ||
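With the chain converted to a blocking notifier, registration no longer has to take the hotplug lock itself. A minimal sketch of a subsystem watching CPU events, assuming the usual notifier callback convention (the callback body is illustrative):

    #include <linux/cpu.h>
    #include <linux/notifier.h>

    static int example_cpu_callback(struct notifier_block *nb,
    				unsigned long action, void *hcpu)
    {
    	/* (unsigned long)hcpu is the cpu number being brought up or down */
    	switch (action) {
    	case CPU_ONLINE:
    		/* set up per-cpu state */
    		break;
    	case CPU_DEAD:
    		/* tear it down again */
    		break;
    	}
    	return NOTIFY_OK;
    }

    static struct notifier_block example_cpu_nb = {
    	.notifier_call = example_cpu_callback,
    };

    /* somewhere in subsystem init: register_cpu_notifier(&example_cpu_nb); */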
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu) | |||
141 | goto out; | 133 | goto out; |
142 | } | 134 | } |
143 | 135 | ||
144 | err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, | 136 | err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, |
145 | (void *)(long)cpu); | 137 | (void *)(long)cpu); |
146 | if (err == NOTIFY_BAD) { | 138 | if (err == NOTIFY_BAD) { |
147 | printk("%s: attempt to take down CPU %u failed\n", | 139 | printk("%s: attempt to take down CPU %u failed\n", |
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu) | |||
159 | p = __stop_machine_run(take_cpu_down, NULL, cpu); | 151 | p = __stop_machine_run(take_cpu_down, NULL, cpu); |
160 | if (IS_ERR(p)) { | 152 | if (IS_ERR(p)) { |
161 | /* CPU didn't die: tell everyone. Can't complain. */ | 153 | /* CPU didn't die: tell everyone. Can't complain. */ |
162 | if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, | 154 | if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, |
163 | (void *)(long)cpu) == NOTIFY_BAD) | 155 | (void *)(long)cpu) == NOTIFY_BAD) |
164 | BUG(); | 156 | BUG(); |
165 | 157 | ||
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu) | |||
182 | put_cpu(); | 174 | put_cpu(); |
183 | 175 | ||
184 | /* CPU is completely dead: tell everyone. Too late to complain. */ | 176 | /* CPU is completely dead: tell everyone. Too late to complain. */ |
185 | if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) | 177 | if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, |
186 | == NOTIFY_BAD) | 178 | (void *)(long)cpu) == NOTIFY_BAD) |
187 | BUG(); | 179 | BUG(); |
188 | 180 | ||
189 | check_for_tasks(cpu); | 181 | check_for_tasks(cpu); |
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu) | |||
211 | goto out; | 203 | goto out; |
212 | } | 204 | } |
213 | 205 | ||
214 | ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); | 206 | ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); |
215 | if (ret == NOTIFY_BAD) { | 207 | if (ret == NOTIFY_BAD) { |
216 | printk("%s: attempt to bring up CPU %u failed\n", | 208 | printk("%s: attempt to bring up CPU %u failed\n", |
217 | __FUNCTION__, cpu); | 209 | __FUNCTION__, cpu); |
@@ -223,15 +215,15 @@ int __devinit cpu_up(unsigned int cpu) | |||
223 | ret = __cpu_up(cpu); | 215 | ret = __cpu_up(cpu); |
224 | if (ret != 0) | 216 | if (ret != 0) |
225 | goto out_notify; | 217 | goto out_notify; |
226 | if (!cpu_online(cpu)) | 218 | BUG_ON(!cpu_online(cpu)); |
227 | BUG(); | ||
228 | 219 | ||
229 | /* Now call notifier in preparation. */ | 220 | /* Now call notifier in preparation. */ |
230 | notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); | 221 | blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); |
231 | 222 | ||
232 | out_notify: | 223 | out_notify: |
233 | if (ret != 0) | 224 | if (ret != 0) |
234 | notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); | 225 | blocking_notifier_call_chain(&cpu_chain, |
226 | CPU_UP_CANCELED, hcpu); | ||
235 | out: | 227 | out: |
236 | unlock_cpu_hotplug(); | 228 | unlock_cpu_hotplug(); |
237 | return ret; | 229 | return ret; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba42b0a769..ab81fdd457 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -4,15 +4,14 @@ | |||
4 | * Processor and Memory placement constraints for sets of tasks. | 4 | * Processor and Memory placement constraints for sets of tasks. |
5 | * | 5 | * |
6 | * Copyright (C) 2003 BULL SA. | 6 | * Copyright (C) 2003 BULL SA. |
7 | * Copyright (C) 2004 Silicon Graphics, Inc. | 7 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. |
8 | * | 8 | * |
9 | * Portions derived from Patrick Mochel's sysfs code. | 9 | * Portions derived from Patrick Mochel's sysfs code. |
10 | * sysfs is Copyright (c) 2001-3 Patrick Mochel | 10 | * sysfs is Copyright (c) 2001-3 Patrick Mochel |
11 | * Portions Copyright (c) 2004 Silicon Graphics, Inc. | ||
12 | * | 11 | * |
13 | * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> | 12 | * 2003-10-10 Written by Simon Derr. |
14 | * 2003-10-22 Updates by Stephen Hemminger. | 13 | * 2003-10-22 Updates by Stephen Hemminger. |
15 | * 2004 May-July Rework by Paul Jackson <pj@sgi.com> | 14 | * 2004 May-July Rework by Paul Jackson. |
16 | * | 15 | * |
17 | * This file is subject to the terms and conditions of the GNU General Public | 16 | * This file is subject to the terms and conditions of the GNU General Public |
18 | * License. See the file COPYING in the main directory of the Linux | 17 | * License. See the file COPYING in the main directory of the Linux |
@@ -53,7 +52,7 @@ | |||
53 | 52 | ||
54 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
55 | #include <asm/atomic.h> | 54 | #include <asm/atomic.h> |
56 | #include <asm/semaphore.h> | 55 | #include <linux/mutex.h> |
57 | 56 | ||
58 | #define CPUSET_SUPER_MAGIC 0x27e0eb | 57 | #define CPUSET_SUPER_MAGIC 0x27e0eb |
59 | 58 | ||
@@ -108,37 +107,49 @@ typedef enum { | |||
108 | CS_MEM_EXCLUSIVE, | 107 | CS_MEM_EXCLUSIVE, |
109 | CS_MEMORY_MIGRATE, | 108 | CS_MEMORY_MIGRATE, |
110 | CS_REMOVED, | 109 | CS_REMOVED, |
111 | CS_NOTIFY_ON_RELEASE | 110 | CS_NOTIFY_ON_RELEASE, |
111 | CS_SPREAD_PAGE, | ||
112 | CS_SPREAD_SLAB, | ||
112 | } cpuset_flagbits_t; | 113 | } cpuset_flagbits_t; |
113 | 114 | ||
114 | /* convenient tests for these bits */ | 115 | /* convenient tests for these bits */ |
115 | static inline int is_cpu_exclusive(const struct cpuset *cs) | 116 | static inline int is_cpu_exclusive(const struct cpuset *cs) |
116 | { | 117 | { |
117 | return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags); | 118 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); |
118 | } | 119 | } |
119 | 120 | ||
120 | static inline int is_mem_exclusive(const struct cpuset *cs) | 121 | static inline int is_mem_exclusive(const struct cpuset *cs) |
121 | { | 122 | { |
122 | return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags); | 123 | return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); |
123 | } | 124 | } |
124 | 125 | ||
125 | static inline int is_removed(const struct cpuset *cs) | 126 | static inline int is_removed(const struct cpuset *cs) |
126 | { | 127 | { |
127 | return !!test_bit(CS_REMOVED, &cs->flags); | 128 | return test_bit(CS_REMOVED, &cs->flags); |
128 | } | 129 | } |
129 | 130 | ||
130 | static inline int notify_on_release(const struct cpuset *cs) | 131 | static inline int notify_on_release(const struct cpuset *cs) |
131 | { | 132 | { |
132 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 133 | return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
133 | } | 134 | } |
134 | 135 | ||
135 | static inline int is_memory_migrate(const struct cpuset *cs) | 136 | static inline int is_memory_migrate(const struct cpuset *cs) |
136 | { | 137 | { |
137 | return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); | 138 | return test_bit(CS_MEMORY_MIGRATE, &cs->flags); |
139 | } | ||
140 | |||
141 | static inline int is_spread_page(const struct cpuset *cs) | ||
142 | { | ||
143 | return test_bit(CS_SPREAD_PAGE, &cs->flags); | ||
144 | } | ||
145 | |||
146 | static inline int is_spread_slab(const struct cpuset *cs) | ||
147 | { | ||
148 | return test_bit(CS_SPREAD_SLAB, &cs->flags); | ||
138 | } | 149 | } |
139 | 150 | ||
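The two new flag tests follow the existing test_bit() helpers; the per-task copies land in PF_SPREAD_PAGE/PF_SPREAD_SLAB further down. A hedged sketch of how an allocator-side check might consult the page bit (the helper name is an assumption):

    #include <linux/sched.h>

    /* Hypothetical fast-path test: should this task spread its page cache? */
    static inline int example_do_page_spread(void)
    {
    	return !!(current->flags & PF_SPREAD_PAGE);
    }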
140 | /* | 151 | /* |
141 | * Increment this atomic integer every time any cpuset changes its | 152 | * Increment this integer every time any cpuset changes its |
142 | * mems_allowed value. Users of cpusets can track this generation | 153 | * mems_allowed value. Users of cpusets can track this generation |
143 | * number, and avoid having to lock and reload mems_allowed unless | 154 | * number, and avoid having to lock and reload mems_allowed unless |
144 | * the cpuset they're using changes generation. | 155 | * the cpuset they're using changes generation. |
@@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs) | |||
152 | * on every visit to __alloc_pages(), to efficiently check whether | 163 | * on every visit to __alloc_pages(), to efficiently check whether |
153 | * its current->cpuset->mems_allowed has changed, requiring an update | 164 | * its current->cpuset->mems_allowed has changed, requiring an update |
154 | * of its current->mems_allowed. | 165 | * of its current->mems_allowed. |
166 | * | ||
167 | * Since cpuset_mems_generation is guarded by manage_mutex, | ||
168 | * there is no need to mark it atomic. | ||
155 | */ | 169 | */ |
156 | static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); | 170 | static int cpuset_mems_generation; |
157 | 171 | ||
158 | static struct cpuset top_cpuset = { | 172 | static struct cpuset top_cpuset = { |
159 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), | 173 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), |
@@ -168,63 +182,57 @@ static struct vfsmount *cpuset_mount; | |||
168 | static struct super_block *cpuset_sb; | 182 | static struct super_block *cpuset_sb; |
169 | 183 | ||
170 | /* | 184 | /* |
171 | * We have two global cpuset semaphores below. They can nest. | 185 | * We have two global cpuset mutexes below. They can nest. |
172 | * It is ok to first take manage_sem, then nest callback_sem. We also | 186 | * It is ok to first take manage_mutex, then nest callback_mutex. We also |
173 | * require taking task_lock() when dereferencing a tasks cpuset pointer. | 187 | * require taking task_lock() when dereferencing a tasks cpuset pointer. |
174 | * See "The task_lock() exception", at the end of this comment. | 188 | * See "The task_lock() exception", at the end of this comment. |
175 | * | 189 | * |
176 | * A task must hold both semaphores to modify cpusets. If a task | 190 | * A task must hold both mutexes to modify cpusets. If a task |
177 | * holds manage_sem, then it blocks others wanting that semaphore, | 191 | * holds manage_mutex, then it blocks others wanting that mutex, |
178 | * ensuring that it is the only task able to also acquire callback_sem | 192 | * ensuring that it is the only task able to also acquire callback_mutex |
179 | * and be able to modify cpusets. It can perform various checks on | 193 | * and be able to modify cpusets. It can perform various checks on |
180 | * the cpuset structure first, knowing nothing will change. It can | 194 | * the cpuset structure first, knowing nothing will change. It can |
181 | * also allocate memory while just holding manage_sem. While it is | 195 | * also allocate memory while just holding manage_mutex. While it is |
182 | * performing these checks, various callback routines can briefly | 196 | * performing these checks, various callback routines can briefly |
183 | * acquire callback_sem to query cpusets. Once it is ready to make | 197 | * acquire callback_mutex to query cpusets. Once it is ready to make |
184 | * the changes, it takes callback_sem, blocking everyone else. | 198 | * the changes, it takes callback_mutex, blocking everyone else. |
185 | * | 199 | * |
186 | * Calls to the kernel memory allocator can not be made while holding | 200 | * Calls to the kernel memory allocator can not be made while holding |
187 | * callback_sem, as that would risk double tripping on callback_sem | 201 | * callback_mutex, as that would risk double tripping on callback_mutex |
188 | * from one of the callbacks into the cpuset code from within | 202 | * from one of the callbacks into the cpuset code from within |
189 | * __alloc_pages(). | 203 | * __alloc_pages(). |
190 | * | 204 | * |
191 | * If a task is only holding callback_sem, then it has read-only | 205 | * If a task is only holding callback_mutex, then it has read-only |
192 | * access to cpusets. | 206 | * access to cpusets. |
193 | * | 207 | * |
194 | * The task_struct fields mems_allowed and mems_generation may only | 208 | * The task_struct fields mems_allowed and mems_generation may only |
195 | * be accessed in the context of that task, so require no locks. | 209 | * be accessed in the context of that task, so require no locks. |
196 | * | 210 | * |
197 | * Any task can increment and decrement the count field without lock. | 211 | * Any task can increment and decrement the count field without lock. |
198 | * So in general, code holding manage_sem or callback_sem can't rely | 212 | * So in general, code holding manage_mutex or callback_mutex can't rely |
199 | * on the count field not changing. However, if the count goes to | 213 | * on the count field not changing. However, if the count goes to |
200 | * zero, then only attach_task(), which holds both semaphores, can | 214 | * zero, then only attach_task(), which holds both mutexes, can |
201 | * increment it again. Because a count of zero means that no tasks | 215 | * increment it again. Because a count of zero means that no tasks |
202 | * are currently attached, therefore there is no way a task attached | 216 | * are currently attached, therefore there is no way a task attached |
203 | * to that cpuset can fork (the other way to increment the count). | 217 | * to that cpuset can fork (the other way to increment the count). |
204 | * So code holding manage_sem or callback_sem can safely assume that | 218 | * So code holding manage_mutex or callback_mutex can safely assume that |
205 | * if the count is zero, it will stay zero. Similarly, if a task | 219 | * if the count is zero, it will stay zero. Similarly, if a task |
206 | * holds manage_sem or callback_sem on a cpuset with zero count, it | 220 | * holds manage_mutex or callback_mutex on a cpuset with zero count, it |
207 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | 221 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs |
208 | * both of those semaphores. | 222 | * both of those mutexes. |
209 | * | ||
210 | * A possible optimization to improve parallelism would be to make | ||
211 | * callback_sem a R/W semaphore (rwsem), allowing the callback routines | ||
212 | * to proceed in parallel, with read access, until the holder of | ||
213 | * manage_sem needed to take this rwsem for exclusive write access | ||
214 | * and modify some cpusets. | ||
215 | * | 223 | * |
216 | * The cpuset_common_file_write handler for operations that modify | 224 | * The cpuset_common_file_write handler for operations that modify |
217 | * the cpuset hierarchy holds manage_sem across the entire operation, | 225 | * the cpuset hierarchy holds manage_mutex across the entire operation, |
218 | * single threading all such cpuset modifications across the system. | 226 | * single threading all such cpuset modifications across the system. |
219 | * | 227 | * |
220 | * The cpuset_common_file_read() handlers only hold callback_sem across | 228 | * The cpuset_common_file_read() handlers only hold callback_mutex across |
221 | * small pieces of code, such as when reading out possibly multi-word | 229 | * small pieces of code, such as when reading out possibly multi-word |
222 | * cpumasks and nodemasks. | 230 | * cpumasks and nodemasks. |
223 | * | 231 | * |
224 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | 232 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't |
225 | * (usually) take either semaphore. These are the two most performance | 233 | * (usually) take either mutex. These are the two most performance |
226 | * critical pieces of code here. The exception occurs on cpuset_exit(), | 234 | * critical pieces of code here. The exception occurs on cpuset_exit(), |
227 | * when a task in a notify_on_release cpuset exits. Then manage_sem | 235 | * when a task in a notify_on_release cpuset exits. Then manage_mutex |
228 | * is taken, and if the cpuset count is zero, a usermode call made | 236 | * is taken, and if the cpuset count is zero, a usermode call made |
229 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 237 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
230 | * relative to the root of cpuset file system) as the argument. | 238 | * relative to the root of cpuset file system) as the argument. |
@@ -242,9 +250,9 @@ static struct super_block *cpuset_sb; | |||
242 | * | 250 | * |
243 | * The need for this exception arises from the action of attach_task(), | 251 | * The need for this exception arises from the action of attach_task(), |
244 | * which overwrites one tasks cpuset pointer with another. It does | 252 | * which overwrites one tasks cpuset pointer with another. It does |
245 | * so using both semaphores, however there are several performance | 253 | * so using both mutexes, however there are several performance |
246 | * critical places that need to reference task->cpuset without the | 254 | * critical places that need to reference task->cpuset without the |
247 | * expense of grabbing a system global semaphore. Therefore except as | 255 | * expense of grabbing a system global mutex. Therefore except as |
248 | * noted below, when dereferencing or, as in attach_task(), modifying | 256 | * noted below, when dereferencing or, as in attach_task(), modifying |
249 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | 257 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock |
250 | * (task->alloc_lock) already in the task_struct routinely used for | 258 | * (task->alloc_lock) already in the task_struct routinely used for |
@@ -256,8 +264,8 @@ static struct super_block *cpuset_sb; | |||
256 | * the routine cpuset_update_task_memory_state(). | 264 | * the routine cpuset_update_task_memory_state(). |
257 | */ | 265 | */ |
258 | 266 | ||
259 | static DECLARE_MUTEX(manage_sem); | 267 | static DEFINE_MUTEX(manage_mutex); |
260 | static DECLARE_MUTEX(callback_sem); | 268 | static DEFINE_MUTEX(callback_mutex); |
261 | 269 | ||
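With the two mutexes defined, the nesting rule spelled out in the comment above becomes an ordinary mutex_lock()/mutex_unlock() pairing. A small sketch of the documented order, manage_mutex outermost and callback_mutex nested, for a hypothetical modifier inside this file:

    #include <linux/mutex.h>

    /* Hypothetical cpuset-modifying path, honouring the documented lock order. */
    static void example_modify_cpuset(void)
    {
    	mutex_lock(&manage_mutex);	/* serialise all cpuset modifications */
    	/* ... validate the request, allocate while only manage_mutex is held ... */
    	mutex_lock(&callback_mutex);	/* block readers while values change */
    	/* ... write the new masks or flags ... */
    	mutex_unlock(&callback_mutex);
    	mutex_unlock(&manage_mutex);
    }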
262 | /* | 270 | /* |
263 | * A couple of forward declarations required, due to cyclic reference loop: | 271 | * A couple of forward declarations required, due to cyclic reference loop: |
@@ -432,7 +440,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
432 | } | 440 | } |
433 | 441 | ||
434 | /* | 442 | /* |
435 | * Call with manage_sem held. Writes path of cpuset into buf. | 443 | * Call with manage_mutex held. Writes path of cpuset into buf. |
436 | * Returns 0 on success, -errno on error. | 444 | * Returns 0 on success, -errno on error. |
437 | */ | 445 | */ |
438 | 446 | ||
@@ -484,11 +492,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
484 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 492 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
485 | * our caller up for that. | 493 | * our caller up for that. |
486 | * | 494 | * |
487 | * When we had only one cpuset semaphore, we had to call this | 495 | * When we had only one cpuset mutex, we had to call this |
488 | * without holding it, to avoid deadlock when call_usermodehelper() | 496 | * without holding it, to avoid deadlock when call_usermodehelper() |
489 | * allocated memory. With two locks, we could now call this while | 497 | * allocated memory. With two locks, we could now call this while |
490 | * holding manage_sem, but we still don't, so as to minimize | 498 | * holding manage_mutex, but we still don't, so as to minimize |
491 | * the time manage_sem is held. | 499 | * the time manage_mutex is held. |
492 | */ | 500 | */ |
493 | 501 | ||
494 | static void cpuset_release_agent(const char *pathbuf) | 502 | static void cpuset_release_agent(const char *pathbuf) |
@@ -520,15 +528,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
520 | * cs is notify_on_release() and now both the user count is zero and | 528 | * cs is notify_on_release() and now both the user count is zero and |
521 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 529 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
522 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 530 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
523 | * cpuset_release_agent() with it later on, once manage_sem is dropped. | 531 | * cpuset_release_agent() with it later on, once manage_mutex is dropped. |
524 | * Call here with manage_sem held. | 532 | * Call here with manage_mutex held. |
525 | * | 533 | * |
526 | * This check_for_release() routine is responsible for kmalloc'ing | 534 | * This check_for_release() routine is responsible for kmalloc'ing |
527 | * pathbuf. The above cpuset_release_agent() is responsible for | 535 | * pathbuf. The above cpuset_release_agent() is responsible for |
528 | * kfree'ing pathbuf. The caller of these routines is responsible | 536 | * kfree'ing pathbuf. The caller of these routines is responsible |
529 | * for providing a pathbuf pointer, initialized to NULL, then | 537 | * for providing a pathbuf pointer, initialized to NULL, then |
530 | * calling check_for_release() with manage_sem held and the address | 538 | * calling check_for_release() with manage_mutex held and the address |
531 | * of the pathbuf pointer, then dropping manage_sem, then calling | 539 | * of the pathbuf pointer, then dropping manage_mutex, then calling |
532 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 540 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
533 | */ | 541 | */ |
534 | 542 | ||
@@ -559,7 +567,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
559 | * One way or another, we guarantee to return some non-empty subset | 567 | * One way or another, we guarantee to return some non-empty subset |
560 | * of cpu_online_map. | 568 | * of cpu_online_map. |
561 | * | 569 | * |
562 | * Call with callback_sem held. | 570 | * Call with callback_mutex held. |
563 | */ | 571 | */ |
564 | 572 | ||
565 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 573 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
@@ -583,7 +591,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
583 | * One way or another, we guarantee to return some non-empty subset | 591 | * One way or another, we guarantee to return some non-empty subset |
584 | * of node_online_map. | 592 | * of node_online_map. |
585 | * | 593 | * |
586 | * Call with callback_sem held. | 594 | * Call with callback_mutex held. |
587 | */ | 595 | */ |
588 | 596 | ||
589 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 597 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
@@ -608,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
608 | * current->cpuset if a task has its memory placement changed. | 616 | * current->cpuset if a task has its memory placement changed. |
609 | * Do not call this routine if in_interrupt(). | 617 | * Do not call this routine if in_interrupt(). |
610 | * | 618 | * |
611 | * Call without callback_sem or task_lock() held. May be called | 619 | * Call without callback_mutex or task_lock() held. May be |
612 | * with or without manage_sem held. Doesn't need task_lock to guard | 620 | * called with or without manage_mutex held. Thanks in part to |
613 | * against another task changing a non-NULL cpuset pointer to NULL, | 621 | * 'the_top_cpuset_hack', the tasks cpuset pointer will never |
614 | * as that is only done by a task on itself, and if the current task | 622 | * be NULL. This routine also might acquire callback_mutex and |
615 | * is here, it is not simultaneously in the exit code NULL'ing its | ||
616 | * cpuset pointer. This routine also might acquire callback_sem and | ||
617 | * current->mm->mmap_sem during call. | 623 | * current->mm->mmap_sem during call. |
618 | * | 624 | * |
619 | * Reading current->cpuset->mems_generation doesn't need task_lock | 625 | * Reading current->cpuset->mems_generation doesn't need task_lock |
@@ -658,13 +664,21 @@ void cpuset_update_task_memory_state(void) | |||
658 | } | 664 | } |
659 | 665 | ||
660 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { | 666 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { |
661 | down(&callback_sem); | 667 | mutex_lock(&callback_mutex); |
662 | task_lock(tsk); | 668 | task_lock(tsk); |
663 | cs = tsk->cpuset; /* Maybe changed when task not locked */ | 669 | cs = tsk->cpuset; /* Maybe changed when task not locked */ |
664 | guarantee_online_mems(cs, &tsk->mems_allowed); | 670 | guarantee_online_mems(cs, &tsk->mems_allowed); |
665 | tsk->cpuset_mems_generation = cs->mems_generation; | 671 | tsk->cpuset_mems_generation = cs->mems_generation; |
672 | if (is_spread_page(cs)) | ||
673 | tsk->flags |= PF_SPREAD_PAGE; | ||
674 | else | ||
675 | tsk->flags &= ~PF_SPREAD_PAGE; | ||
676 | if (is_spread_slab(cs)) | ||
677 | tsk->flags |= PF_SPREAD_SLAB; | ||
678 | else | ||
679 | tsk->flags &= ~PF_SPREAD_SLAB; | ||
666 | task_unlock(tsk); | 680 | task_unlock(tsk); |
667 | up(&callback_sem); | 681 | mutex_unlock(&callback_mutex); |
668 | mpol_rebind_task(tsk, &tsk->mems_allowed); | 682 | mpol_rebind_task(tsk, &tsk->mems_allowed); |
669 | } | 683 | } |
670 | } | 684 | } |
@@ -674,7 +688,7 @@ void cpuset_update_task_memory_state(void) | |||
674 | * | 688 | * |
675 | * One cpuset is a subset of another if all its allowed CPUs and | 689 | * One cpuset is a subset of another if all its allowed CPUs and |
676 | * Memory Nodes are a subset of the other, and its exclusive flags | 690 | * Memory Nodes are a subset of the other, and its exclusive flags |
677 | * are only set if the other's are set. Call holding manage_sem. | 691 | * are only set if the other's are set. Call holding manage_mutex. |
678 | */ | 692 | */ |
679 | 693 | ||
680 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 694 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -692,7 +706,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
692 | * If we replaced the flag and mask values of the current cpuset | 706 | * If we replaced the flag and mask values of the current cpuset |
693 | * (cur) with those values in the trial cpuset (trial), would | 707 | * (cur) with those values in the trial cpuset (trial), would |
694 | * our various subset and exclusive rules still be valid? Presumes | 708 | * our various subset and exclusive rules still be valid? Presumes |
695 | * manage_sem held. | 709 | * manage_mutex held. |
696 | * | 710 | * |
697 | * 'cur' is the address of an actual, in-use cpuset. Operations | 711 | * 'cur' is the address of an actual, in-use cpuset. Operations |
698 | * such as list traversal that depend on the actual address of the | 712 | * such as list traversal that depend on the actual address of the |
@@ -746,7 +760,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
746 | * exclusive child cpusets | 760 | * exclusive child cpusets |
747 | * Build these two partitions by calling partition_sched_domains | 761 | * Build these two partitions by calling partition_sched_domains |
748 | * | 762 | * |
749 | * Call with manage_sem held. May nest a call to the | 763 | * Call with manage_mutex held. May nest a call to the |
750 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 764 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
751 | */ | 765 | */ |
752 | 766 | ||
@@ -792,7 +806,7 @@ static void update_cpu_domains(struct cpuset *cur) | |||
792 | } | 806 | } |
793 | 807 | ||
794 | /* | 808 | /* |
795 | * Call with manage_sem held. May take callback_sem during call. | 809 | * Call with manage_mutex held. May take callback_mutex during call. |
796 | */ | 810 | */ |
797 | 811 | ||
798 | static int update_cpumask(struct cpuset *cs, char *buf) | 812 | static int update_cpumask(struct cpuset *cs, char *buf) |
@@ -811,15 +825,64 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
811 | if (retval < 0) | 825 | if (retval < 0) |
812 | return retval; | 826 | return retval; |
813 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 827 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
814 | down(&callback_sem); | 828 | mutex_lock(&callback_mutex); |
815 | cs->cpus_allowed = trialcs.cpus_allowed; | 829 | cs->cpus_allowed = trialcs.cpus_allowed; |
816 | up(&callback_sem); | 830 | mutex_unlock(&callback_mutex); |
817 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 831 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
818 | update_cpu_domains(cs); | 832 | update_cpu_domains(cs); |
819 | return 0; | 833 | return 0; |
820 | } | 834 | } |
821 | 835 | ||
822 | /* | 836 | /* |
837 | * cpuset_migrate_mm | ||
838 | * | ||
839 | * Migrate memory region from one set of nodes to another. | ||
840 | * | ||
841 | * Temporarily set tasks mems_allowed to target nodes of migration, | ||
842 | * so that the migration code can allocate pages on these nodes. | ||
843 | * | ||
844 | * Call holding manage_mutex, so our current->cpuset won't change | ||
845 | * during this call, as manage_mutex holds off any attach_task() | ||
846 | * calls. Therefore we don't need to take task_lock around the | ||
847 | * call to guarantee_online_mems(), as we know no one is changing | ||
848 | * our tasks cpuset. | ||
849 | * | ||
850 | * Hold callback_mutex around the two modifications of our tasks | ||
851 | * mems_allowed to synchronize with cpuset_mems_allowed(). | ||
852 | * | ||
853 | * While the mm_struct we are migrating is typically from some | ||
854 | * other task, the task_struct mems_allowed that we are hacking | ||
855 | * is for our current task, which must allocate new pages for that | ||
856 | * migrating memory region. | ||
857 | * | ||
858 | * We call cpuset_update_task_memory_state() before hacking | ||
859 | * our tasks mems_allowed, so that we are assured of being in | ||
860 | * sync with our tasks cpuset, and in particular, callbacks to | ||
861 | * cpuset_update_task_memory_state() from nested page allocations | ||
862 | * won't see any mismatch of our cpuset and task mems_generation | ||
863 | * values, so won't overwrite our hacked tasks mems_allowed | ||
864 | * nodemask. | ||
865 | */ | ||
866 | |||
867 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | ||
868 | const nodemask_t *to) | ||
869 | { | ||
870 | struct task_struct *tsk = current; | ||
871 | |||
872 | cpuset_update_task_memory_state(); | ||
873 | |||
874 | mutex_lock(&callback_mutex); | ||
875 | tsk->mems_allowed = *to; | ||
876 | mutex_unlock(&callback_mutex); | ||
877 | |||
878 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | ||
879 | |||
880 | mutex_lock(&callback_mutex); | ||
881 | guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed); | ||
882 | mutex_unlock(&callback_mutex); | ||
883 | } | ||
884 | |||
885 | /* | ||
823 | * Handle user request to change the 'mems' memory placement | 886 | * Handle user request to change the 'mems' memory placement |
824 | * of a cpuset. Needs to validate the request, update the | 887 | * of a cpuset. Needs to validate the request, update the |
825 | * cpusets mems_allowed and mems_generation, and for each | 888 | * cpusets mems_allowed and mems_generation, and for each |
@@ -827,7 +890,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
827 | * the cpuset is marked 'memory_migrate', migrate the tasks | 890 | * the cpuset is marked 'memory_migrate', migrate the tasks |
828 | * pages to the new memory. | 891 | * pages to the new memory. |
829 | * | 892 | * |
830 | * Call with manage_sem held. May take callback_sem during call. | 893 | * Call with manage_mutex held. May take callback_mutex during call. |
831 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 894 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
832 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 895 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
833 | * their mempolicies to the cpusets new mems_allowed. | 896 | * their mempolicies to the cpusets new mems_allowed. |
@@ -862,11 +925,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
862 | if (retval < 0) | 925 | if (retval < 0) |
863 | goto done; | 926 | goto done; |
864 | 927 | ||
865 | down(&callback_sem); | 928 | mutex_lock(&callback_mutex); |
866 | cs->mems_allowed = trialcs.mems_allowed; | 929 | cs->mems_allowed = trialcs.mems_allowed; |
867 | atomic_inc(&cpuset_mems_generation); | 930 | cs->mems_generation = cpuset_mems_generation++; |
868 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 931 | mutex_unlock(&callback_mutex); |
869 | up(&callback_sem); | ||
870 | 932 | ||
871 | set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ | 933 | set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ |
872 | 934 | ||
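The generation counter can now be a plain integer because every writer bumps it while holding callback_mutex (or manage_mutex). The reader is cpuset_update_task_memory_state(): each task caches the generation of the cpuset whose mems_allowed it last copied and refreshes when the cached value no longer matches. The following is a simplified sketch of that check, not the full routine (which also handles the spread flags and mempolicy rebinding), and the per-task field name is an assumption.

/* simplified sketch of the consumer of cs->mems_generation */
void cpuset_update_task_memory_state_sketch(void)
{
	struct task_struct *tsk = current;
	struct cpuset *cs = tsk->cpuset;

	if (tsk->cpuset_mems_generation != cs->mems_generation) {
		mutex_lock(&callback_mutex);
		task_lock(tsk);
		guarantee_online_mems(cs, &tsk->mems_allowed);
		tsk->cpuset_mems_generation = cs->mems_generation;
		task_unlock(tsk);
		mutex_unlock(&callback_mutex);
	}
}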
@@ -922,7 +984,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
922 | * tasklist_lock. Forks can happen again now - the mpol_copy() | 984 | * tasklist_lock. Forks can happen again now - the mpol_copy() |
923 | * cpuset_being_rebound check will catch such forks, and rebind | 985 | * cpuset_being_rebound check will catch such forks, and rebind |
924 | * their vma mempolicies too. Because we still hold the global | 986 | * their vma mempolicies too. Because we still hold the global |
925 | * cpuset manage_sem, we know that no other rebind effort will | 987 | * cpuset manage_mutex, we know that no other rebind effort will |
926 | * be contending for the global variable cpuset_being_rebound. | 988 | * be contending for the global variable cpuset_being_rebound. |
927 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 989 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
928 | * is idempotent. Also migrate pages in each mm to new nodes. | 990 | * is idempotent. Also migrate pages in each mm to new nodes. |
@@ -932,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
932 | struct mm_struct *mm = mmarray[i]; | 994 | struct mm_struct *mm = mmarray[i]; |
933 | 995 | ||
934 | mpol_rebind_mm(mm, &cs->mems_allowed); | 996 | mpol_rebind_mm(mm, &cs->mems_allowed); |
935 | if (migrate) { | 997 | if (migrate) |
936 | do_migrate_pages(mm, &oldmem, &cs->mems_allowed, | 998 | cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); |
937 | MPOL_MF_MOVE_ALL); | ||
938 | } | ||
939 | mmput(mm); | 999 | mmput(mm); |
940 | } | 1000 | } |
941 | 1001 | ||
@@ -948,7 +1008,7 @@ done: | |||
948 | } | 1008 | } |
949 | 1009 | ||
950 | /* | 1010 | /* |
951 | * Call with manage_sem held. | 1011 | * Call with manage_mutex held. |
952 | */ | 1012 | */ |
953 | 1013 | ||
954 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | 1014 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) |
@@ -963,11 +1023,12 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
963 | /* | 1023 | /* |
964 | * update_flag - read a 0 or a 1 in a file and update associated flag | 1024 | * update_flag - read a 0 or a 1 in a file and update associated flag |
965 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 1025 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
966 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) | 1026 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, |
1027 | * CS_SPREAD_PAGE, CS_SPREAD_SLAB) | ||
967 | * cs: the cpuset to update | 1028 | * cs: the cpuset to update |
968 | * buf: the buffer where we read the 0 or 1 | 1029 | * buf: the buffer where we read the 0 or 1 |
969 | * | 1030 | * |
970 | * Call with manage_sem held. | 1031 | * Call with manage_mutex held. |
971 | */ | 1032 | */ |
972 | 1033 | ||
973 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 1034 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
@@ -989,12 +1050,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
989 | return err; | 1050 | return err; |
990 | cpu_exclusive_changed = | 1051 | cpu_exclusive_changed = |
991 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 1052 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
992 | down(&callback_sem); | 1053 | mutex_lock(&callback_mutex); |
993 | if (turning_on) | 1054 | if (turning_on) |
994 | set_bit(bit, &cs->flags); | 1055 | set_bit(bit, &cs->flags); |
995 | else | 1056 | else |
996 | clear_bit(bit, &cs->flags); | 1057 | clear_bit(bit, &cs->flags); |
997 | up(&callback_sem); | 1058 | mutex_unlock(&callback_mutex); |
998 | 1059 | ||
999 | if (cpu_exclusive_changed) | 1060 | if (cpu_exclusive_changed) |
1000 | update_cpu_domains(cs); | 1061 | update_cpu_domains(cs); |
@@ -1104,7 +1165,7 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1104 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | 1165 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be |
1105 | * notified on release. | 1166 | * notified on release. |
1106 | * | 1167 | * |
1107 | * Call holding manage_sem. May take callback_sem and task_lock of | 1168 | * Call holding manage_mutex. May take callback_mutex and task_lock of |
1108 | * the task 'pid' during call. | 1169 | * the task 'pid' during call. |
1109 | */ | 1170 | */ |
1110 | 1171 | ||
@@ -1144,13 +1205,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
1144 | get_task_struct(tsk); | 1205 | get_task_struct(tsk); |
1145 | } | 1206 | } |
1146 | 1207 | ||
1147 | down(&callback_sem); | 1208 | mutex_lock(&callback_mutex); |
1148 | 1209 | ||
1149 | task_lock(tsk); | 1210 | task_lock(tsk); |
1150 | oldcs = tsk->cpuset; | 1211 | oldcs = tsk->cpuset; |
1151 | if (!oldcs) { | 1212 | if (!oldcs) { |
1152 | task_unlock(tsk); | 1213 | task_unlock(tsk); |
1153 | up(&callback_sem); | 1214 | mutex_unlock(&callback_mutex); |
1154 | put_task_struct(tsk); | 1215 | put_task_struct(tsk); |
1155 | return -ESRCH; | 1216 | return -ESRCH; |
1156 | } | 1217 | } |
@@ -1164,16 +1225,16 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
1164 | from = oldcs->mems_allowed; | 1225 | from = oldcs->mems_allowed; |
1165 | to = cs->mems_allowed; | 1226 | to = cs->mems_allowed; |
1166 | 1227 | ||
1167 | up(&callback_sem); | 1228 | mutex_unlock(&callback_mutex); |
1168 | 1229 | ||
1169 | mm = get_task_mm(tsk); | 1230 | mm = get_task_mm(tsk); |
1170 | if (mm) { | 1231 | if (mm) { |
1171 | mpol_rebind_mm(mm, &to); | 1232 | mpol_rebind_mm(mm, &to); |
1233 | if (is_memory_migrate(cs)) | ||
1234 | cpuset_migrate_mm(mm, &from, &to); | ||
1172 | mmput(mm); | 1235 | mmput(mm); |
1173 | } | 1236 | } |
1174 | 1237 | ||
1175 | if (is_memory_migrate(cs)) | ||
1176 | do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); | ||
1177 | put_task_struct(tsk); | 1238 | put_task_struct(tsk); |
1178 | synchronize_rcu(); | 1239 | synchronize_rcu(); |
1179 | if (atomic_dec_and_test(&oldcs->count)) | 1240 | if (atomic_dec_and_test(&oldcs->count)) |
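Moving the migration inside the mm block also fixes a lifetime problem in attach_task(): the old code handed tsk->mm straight to do_migrate_pages() after the reference taken by get_task_mm() had already been dropped, so nothing kept that mm alive. The pattern the fix restores, as a sketch (the helper name is hypothetical):

/* sketch: safely operating on another task's mm_struct */
static void touch_task_mm(struct task_struct *tsk)
{
	struct mm_struct *mm = get_task_mm(tsk);	/* NULL for kernel threads */

	if (!mm)
		return;
	/* ... any use of 'mm' goes here, while we hold the reference ... */
	mmput(mm);					/* drop the reference */
}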
@@ -1194,6 +1255,8 @@ typedef enum { | |||
1194 | FILE_NOTIFY_ON_RELEASE, | 1255 | FILE_NOTIFY_ON_RELEASE, |
1195 | FILE_MEMORY_PRESSURE_ENABLED, | 1256 | FILE_MEMORY_PRESSURE_ENABLED, |
1196 | FILE_MEMORY_PRESSURE, | 1257 | FILE_MEMORY_PRESSURE, |
1258 | FILE_SPREAD_PAGE, | ||
1259 | FILE_SPREAD_SLAB, | ||
1197 | FILE_TASKLIST, | 1260 | FILE_TASKLIST, |
1198 | } cpuset_filetype_t; | 1261 | } cpuset_filetype_t; |
1199 | 1262 | ||
@@ -1221,7 +1284,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
1221 | } | 1284 | } |
1222 | buffer[nbytes] = 0; /* nul-terminate */ | 1285 | buffer[nbytes] = 0; /* nul-terminate */ |
1223 | 1286 | ||
1224 | down(&manage_sem); | 1287 | mutex_lock(&manage_mutex); |
1225 | 1288 | ||
1226 | if (is_removed(cs)) { | 1289 | if (is_removed(cs)) { |
1227 | retval = -ENODEV; | 1290 | retval = -ENODEV; |
@@ -1253,6 +1316,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
1253 | case FILE_MEMORY_PRESSURE: | 1316 | case FILE_MEMORY_PRESSURE: |
1254 | retval = -EACCES; | 1317 | retval = -EACCES; |
1255 | break; | 1318 | break; |
1319 | case FILE_SPREAD_PAGE: | ||
1320 | retval = update_flag(CS_SPREAD_PAGE, cs, buffer); | ||
1321 | cs->mems_generation = cpuset_mems_generation++; | ||
1322 | break; | ||
1323 | case FILE_SPREAD_SLAB: | ||
1324 | retval = update_flag(CS_SPREAD_SLAB, cs, buffer); | ||
1325 | cs->mems_generation = cpuset_mems_generation++; | ||
1326 | break; | ||
1256 | case FILE_TASKLIST: | 1327 | case FILE_TASKLIST: |
1257 | retval = attach_task(cs, buffer, &pathbuf); | 1328 | retval = attach_task(cs, buffer, &pathbuf); |
1258 | break; | 1329 | break; |
@@ -1264,7 +1335,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
1264 | if (retval == 0) | 1335 | if (retval == 0) |
1265 | retval = nbytes; | 1336 | retval = nbytes; |
1266 | out2: | 1337 | out2: |
1267 | up(&manage_sem); | 1338 | mutex_unlock(&manage_mutex); |
1268 | cpuset_release_agent(pathbuf); | 1339 | cpuset_release_agent(pathbuf); |
1269 | out1: | 1340 | out1: |
1270 | kfree(buffer); | 1341 | kfree(buffer); |
@@ -1304,9 +1375,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
1304 | { | 1375 | { |
1305 | cpumask_t mask; | 1376 | cpumask_t mask; |
1306 | 1377 | ||
1307 | down(&callback_sem); | 1378 | mutex_lock(&callback_mutex); |
1308 | mask = cs->cpus_allowed; | 1379 | mask = cs->cpus_allowed; |
1309 | up(&callback_sem); | 1380 | mutex_unlock(&callback_mutex); |
1310 | 1381 | ||
1311 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1382 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
1312 | } | 1383 | } |
@@ -1315,9 +1386,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
1315 | { | 1386 | { |
1316 | nodemask_t mask; | 1387 | nodemask_t mask; |
1317 | 1388 | ||
1318 | down(&callback_sem); | 1389 | mutex_lock(&callback_mutex); |
1319 | mask = cs->mems_allowed; | 1390 | mask = cs->mems_allowed; |
1320 | up(&callback_sem); | 1391 | mutex_unlock(&callback_mutex); |
1321 | 1392 | ||
1322 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1393 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
1323 | } | 1394 | } |
@@ -1362,6 +1433,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
1362 | case FILE_MEMORY_PRESSURE: | 1433 | case FILE_MEMORY_PRESSURE: |
1363 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); | 1434 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); |
1364 | break; | 1435 | break; |
1436 | case FILE_SPREAD_PAGE: | ||
1437 | *s++ = is_spread_page(cs) ? '1' : '0'; | ||
1438 | break; | ||
1439 | case FILE_SPREAD_SLAB: | ||
1440 | *s++ = is_spread_slab(cs) ? '1' : '0'; | ||
1441 | break; | ||
1365 | default: | 1442 | default: |
1366 | retval = -EINVAL; | 1443 | retval = -EINVAL; |
1367 | goto out; | 1444 | goto out; |
@@ -1598,7 +1675,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
1598 | * Handle an open on 'tasks' file. Prepare a buffer listing the | 1675 | * Handle an open on 'tasks' file. Prepare a buffer listing the |
1599 | * process id's of tasks currently attached to the cpuset being opened. | 1676 | * process id's of tasks currently attached to the cpuset being opened. |
1600 | * | 1677 | * |
1601 | * Does not require any specific cpuset semaphores, and does not take any. | 1678 | * Does not require any specific cpuset mutexes, and does not take any. |
1602 | */ | 1679 | */ |
1603 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1680 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
1604 | { | 1681 | { |
@@ -1725,6 +1802,16 @@ static struct cftype cft_memory_pressure = { | |||
1725 | .private = FILE_MEMORY_PRESSURE, | 1802 | .private = FILE_MEMORY_PRESSURE, |
1726 | }; | 1803 | }; |
1727 | 1804 | ||
1805 | static struct cftype cft_spread_page = { | ||
1806 | .name = "memory_spread_page", | ||
1807 | .private = FILE_SPREAD_PAGE, | ||
1808 | }; | ||
1809 | |||
1810 | static struct cftype cft_spread_slab = { | ||
1811 | .name = "memory_spread_slab", | ||
1812 | .private = FILE_SPREAD_SLAB, | ||
1813 | }; | ||
1814 | |||
1728 | static int cpuset_populate_dir(struct dentry *cs_dentry) | 1815 | static int cpuset_populate_dir(struct dentry *cs_dentry) |
1729 | { | 1816 | { |
1730 | int err; | 1817 | int err; |
@@ -1743,6 +1830,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
1743 | return err; | 1830 | return err; |
1744 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) | 1831 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) |
1745 | return err; | 1832 | return err; |
1833 | if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) | ||
1834 | return err; | ||
1835 | if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0) | ||
1836 | return err; | ||
1746 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) | 1837 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) |
1747 | return err; | 1838 | return err; |
1748 | return 0; | 1839 | return 0; |
@@ -1754,7 +1845,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
1754 | * name: name of the new cpuset. Will be strcpy'ed. | 1845 | * name: name of the new cpuset. Will be strcpy'ed. |
1755 | * mode: mode to set on new inode | 1846 | * mode: mode to set on new inode |
1756 | * | 1847 | * |
1757 | * Must be called with the semaphore on the parent inode held | 1848 | * Must be called with the mutex on the parent inode held |
1758 | */ | 1849 | */ |
1759 | 1850 | ||
1760 | static long cpuset_create(struct cpuset *parent, const char *name, int mode) | 1851 | static long cpuset_create(struct cpuset *parent, const char *name, int mode) |
@@ -1766,44 +1857,47 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1766 | if (!cs) | 1857 | if (!cs) |
1767 | return -ENOMEM; | 1858 | return -ENOMEM; |
1768 | 1859 | ||
1769 | down(&manage_sem); | 1860 | mutex_lock(&manage_mutex); |
1770 | cpuset_update_task_memory_state(); | 1861 | cpuset_update_task_memory_state(); |
1771 | cs->flags = 0; | 1862 | cs->flags = 0; |
1772 | if (notify_on_release(parent)) | 1863 | if (notify_on_release(parent)) |
1773 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1864 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
1865 | if (is_spread_page(parent)) | ||
1866 | set_bit(CS_SPREAD_PAGE, &cs->flags); | ||
1867 | if (is_spread_slab(parent)) | ||
1868 | set_bit(CS_SPREAD_SLAB, &cs->flags); | ||
1774 | cs->cpus_allowed = CPU_MASK_NONE; | 1869 | cs->cpus_allowed = CPU_MASK_NONE; |
1775 | cs->mems_allowed = NODE_MASK_NONE; | 1870 | cs->mems_allowed = NODE_MASK_NONE; |
1776 | atomic_set(&cs->count, 0); | 1871 | atomic_set(&cs->count, 0); |
1777 | INIT_LIST_HEAD(&cs->sibling); | 1872 | INIT_LIST_HEAD(&cs->sibling); |
1778 | INIT_LIST_HEAD(&cs->children); | 1873 | INIT_LIST_HEAD(&cs->children); |
1779 | atomic_inc(&cpuset_mems_generation); | 1874 | cs->mems_generation = cpuset_mems_generation++; |
1780 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | ||
1781 | fmeter_init(&cs->fmeter); | 1875 | fmeter_init(&cs->fmeter); |
1782 | 1876 | ||
1783 | cs->parent = parent; | 1877 | cs->parent = parent; |
1784 | 1878 | ||
1785 | down(&callback_sem); | 1879 | mutex_lock(&callback_mutex); |
1786 | list_add(&cs->sibling, &cs->parent->children); | 1880 | list_add(&cs->sibling, &cs->parent->children); |
1787 | number_of_cpusets++; | 1881 | number_of_cpusets++; |
1788 | up(&callback_sem); | 1882 | mutex_unlock(&callback_mutex); |
1789 | 1883 | ||
1790 | err = cpuset_create_dir(cs, name, mode); | 1884 | err = cpuset_create_dir(cs, name, mode); |
1791 | if (err < 0) | 1885 | if (err < 0) |
1792 | goto err; | 1886 | goto err; |
1793 | 1887 | ||
1794 | /* | 1888 | /* |
1795 | * Release manage_sem before cpuset_populate_dir() because it | 1889 | * Release manage_mutex before cpuset_populate_dir() because it |
1796 | * will down() this new directory's i_mutex and if we race with | 1890 | * will down() this new directory's i_mutex and if we race with |
1797 | * another mkdir, we might deadlock. | 1891 | * another mkdir, we might deadlock. |
1798 | */ | 1892 | */ |
1799 | up(&manage_sem); | 1893 | mutex_unlock(&manage_mutex); |
1800 | 1894 | ||
1801 | err = cpuset_populate_dir(cs->dentry); | 1895 | err = cpuset_populate_dir(cs->dentry); |
1802 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1896 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
1803 | return 0; | 1897 | return 0; |
1804 | err: | 1898 | err: |
1805 | list_del(&cs->sibling); | 1899 | list_del(&cs->sibling); |
1806 | up(&manage_sem); | 1900 | mutex_unlock(&manage_mutex); |
1807 | kfree(cs); | 1901 | kfree(cs); |
1808 | return err; | 1902 | return err; |
1809 | } | 1903 | } |
@@ -1825,18 +1919,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1825 | 1919 | ||
1826 | /* the vfs holds both inode->i_mutex already */ | 1920 | /* the vfs holds both inode->i_mutex already */ |
1827 | 1921 | ||
1828 | down(&manage_sem); | 1922 | mutex_lock(&manage_mutex); |
1829 | cpuset_update_task_memory_state(); | 1923 | cpuset_update_task_memory_state(); |
1830 | if (atomic_read(&cs->count) > 0) { | 1924 | if (atomic_read(&cs->count) > 0) { |
1831 | up(&manage_sem); | 1925 | mutex_unlock(&manage_mutex); |
1832 | return -EBUSY; | 1926 | return -EBUSY; |
1833 | } | 1927 | } |
1834 | if (!list_empty(&cs->children)) { | 1928 | if (!list_empty(&cs->children)) { |
1835 | up(&manage_sem); | 1929 | mutex_unlock(&manage_mutex); |
1836 | return -EBUSY; | 1930 | return -EBUSY; |
1837 | } | 1931 | } |
1838 | parent = cs->parent; | 1932 | parent = cs->parent; |
1839 | down(&callback_sem); | 1933 | mutex_lock(&callback_mutex); |
1840 | set_bit(CS_REMOVED, &cs->flags); | 1934 | set_bit(CS_REMOVED, &cs->flags); |
1841 | if (is_cpu_exclusive(cs)) | 1935 | if (is_cpu_exclusive(cs)) |
1842 | update_cpu_domains(cs); | 1936 | update_cpu_domains(cs); |
@@ -1848,10 +1942,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1848 | cpuset_d_remove_dir(d); | 1942 | cpuset_d_remove_dir(d); |
1849 | dput(d); | 1943 | dput(d); |
1850 | number_of_cpusets--; | 1944 | number_of_cpusets--; |
1851 | up(&callback_sem); | 1945 | mutex_unlock(&callback_mutex); |
1852 | if (list_empty(&parent->children)) | 1946 | if (list_empty(&parent->children)) |
1853 | check_for_release(parent, &pathbuf); | 1947 | check_for_release(parent, &pathbuf); |
1854 | up(&manage_sem); | 1948 | mutex_unlock(&manage_mutex); |
1855 | cpuset_release_agent(pathbuf); | 1949 | cpuset_release_agent(pathbuf); |
1856 | return 0; | 1950 | return 0; |
1857 | } | 1951 | } |
@@ -1867,7 +1961,7 @@ int __init cpuset_init_early(void) | |||
1867 | struct task_struct *tsk = current; | 1961 | struct task_struct *tsk = current; |
1868 | 1962 | ||
1869 | tsk->cpuset = &top_cpuset; | 1963 | tsk->cpuset = &top_cpuset; |
1870 | tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); | 1964 | tsk->cpuset->mems_generation = cpuset_mems_generation++; |
1871 | return 0; | 1965 | return 0; |
1872 | } | 1966 | } |
1873 | 1967 | ||
@@ -1886,8 +1980,7 @@ int __init cpuset_init(void) | |||
1886 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1980 | top_cpuset.mems_allowed = NODE_MASK_ALL; |
1887 | 1981 | ||
1888 | fmeter_init(&top_cpuset.fmeter); | 1982 | fmeter_init(&top_cpuset.fmeter); |
1889 | atomic_inc(&cpuset_mems_generation); | 1983 | top_cpuset.mems_generation = cpuset_mems_generation++; |
1890 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); | ||
1891 | 1984 | ||
1892 | init_task.cpuset = &top_cpuset; | 1985 | init_task.cpuset = &top_cpuset; |
1893 | 1986 | ||
@@ -1960,23 +2053,56 @@ void cpuset_fork(struct task_struct *child) | |||
1960 | * Description: Detach cpuset from @tsk and release it. | 2053 | * Description: Detach cpuset from @tsk and release it. |
1961 | * | 2054 | * |
1962 | * Note that cpusets marked notify_on_release force every task in | 2055 | * Note that cpusets marked notify_on_release force every task in |
1963 | * them to take the global manage_sem semaphore when exiting. | 2056 | * them to take the global manage_mutex mutex when exiting. |
1964 | * This could impact scaling on very large systems. Be reluctant to | 2057 | * This could impact scaling on very large systems. Be reluctant to |
1965 | * use notify_on_release cpusets where very high task exit scaling | 2058 | * use notify_on_release cpusets where very high task exit scaling |
1966 | * is required on large systems. | 2059 | * is required on large systems. |
1967 | * | 2060 | * |
1968 | * Don't even think about dereferencing 'cs' after the cpuset use count | 2061 | * Don't even think about dereferencing 'cs' after the cpuset use count |
1969 | * goes to zero, except inside a critical section guarded by manage_sem | 2062 | * goes to zero, except inside a critical section guarded by manage_mutex |
1970 | * or callback_sem. Otherwise a zero cpuset use count is a license to | 2063 | * or callback_mutex. Otherwise a zero cpuset use count is a license to |
1971 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). | 2064 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
1972 | * | 2065 | * |
1973 | * This routine has to take manage_sem, not callback_sem, because | 2066 | * This routine has to take manage_mutex, not callback_mutex, because |
1974 | * it is holding that semaphore while calling check_for_release(), | 2067 | * it is holding that mutex while calling check_for_release(), |
1975 | * which calls kmalloc(), so can't be called holding callback__sem(). | 2068 | * which calls kmalloc(), so can't be called holding callback_mutex(). |
1976 | * | 2069 | * |
1977 | * We don't need to task_lock() this reference to tsk->cpuset, | 2070 | * We don't need to task_lock() this reference to tsk->cpuset, |
1978 | * because tsk is already marked PF_EXITING, so attach_task() won't | 2071 | * because tsk is already marked PF_EXITING, so attach_task() won't |
1979 | * mess with it, or task is a failed fork, never visible to attach_task. | 2072 | * mess with it, or task is a failed fork, never visible to attach_task. |
2073 | * | ||
2074 | * the_top_cpuset_hack: | ||
2075 | * | ||
2076 | * Set the exiting tasks cpuset to the root cpuset (top_cpuset). | ||
2077 | * | ||
2078 | * Don't leave a task unable to allocate memory, as that is an | ||
2079 | * accident waiting to happen should someone add a callout in | ||
2080 | * do_exit() after the cpuset_exit() call that might allocate. | ||
2081 | * If a task tries to allocate memory with an invalid cpuset, | ||
2082 | * it will oops in cpuset_update_task_memory_state(). | ||
2083 | * | ||
2084 | * We call cpuset_exit() while the task is still competent to | ||
2085 | * handle notify_on_release(), then leave the task attached to | ||
2086 | * the root cpuset (top_cpuset) for the remainder of its exit. | ||
2087 | * | ||
2088 | * To do this properly, we would increment the reference count on | ||
2089 | * top_cpuset, and near the very end of the kernel/exit.c do_exit() | ||
2090 | * code we would add a second cpuset function call, to drop that | ||
2091 | * reference. This would just create an unnecessary hot spot on | ||
2092 | * the top_cpuset reference count, to no avail. | ||
2093 | * | ||
2094 | * Normally, holding a reference to a cpuset without bumping its | ||
2095 | * count is unsafe. The cpuset could go away, or someone could | ||
2096 | * attach us to a different cpuset, decrementing the count on | ||
2097 | * the first cpuset that we never incremented. But in this case, | ||
2098 | * top_cpuset isn't going away, and either task has PF_EXITING set, | ||
2099 | * which wards off any attach_task() attempts, or task is a failed | ||
2100 | * fork, never visible to attach_task. | ||
2101 | * | ||
2102 | * Another way to do this would be to set the cpuset pointer | ||
2103 | * to NULL here, and check in cpuset_update_task_memory_state() | ||
2104 | * for a NULL pointer. This hack avoids that NULL check, for no | ||
2105 | * cost (other than this way too long comment ;). | ||
1980 | **/ | 2106 | **/ |
1981 | 2107 | ||
1982 | void cpuset_exit(struct task_struct *tsk) | 2108 | void cpuset_exit(struct task_struct *tsk) |
@@ -1984,15 +2110,15 @@ void cpuset_exit(struct task_struct *tsk) | |||
1984 | struct cpuset *cs; | 2110 | struct cpuset *cs; |
1985 | 2111 | ||
1986 | cs = tsk->cpuset; | 2112 | cs = tsk->cpuset; |
1987 | tsk->cpuset = NULL; | 2113 | tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ |
1988 | 2114 | ||
1989 | if (notify_on_release(cs)) { | 2115 | if (notify_on_release(cs)) { |
1990 | char *pathbuf = NULL; | 2116 | char *pathbuf = NULL; |
1991 | 2117 | ||
1992 | down(&manage_sem); | 2118 | mutex_lock(&manage_mutex); |
1993 | if (atomic_dec_and_test(&cs->count)) | 2119 | if (atomic_dec_and_test(&cs->count)) |
1994 | check_for_release(cs, &pathbuf); | 2120 | check_for_release(cs, &pathbuf); |
1995 | up(&manage_sem); | 2121 | mutex_unlock(&manage_mutex); |
1996 | cpuset_release_agent(pathbuf); | 2122 | cpuset_release_agent(pathbuf); |
1997 | } else { | 2123 | } else { |
1998 | atomic_dec(&cs->count); | 2124 | atomic_dec(&cs->count); |
@@ -2013,11 +2139,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) | |||
2013 | { | 2139 | { |
2014 | cpumask_t mask; | 2140 | cpumask_t mask; |
2015 | 2141 | ||
2016 | down(&callback_sem); | 2142 | mutex_lock(&callback_mutex); |
2017 | task_lock(tsk); | 2143 | task_lock(tsk); |
2018 | guarantee_online_cpus(tsk->cpuset, &mask); | 2144 | guarantee_online_cpus(tsk->cpuset, &mask); |
2019 | task_unlock(tsk); | 2145 | task_unlock(tsk); |
2020 | up(&callback_sem); | 2146 | mutex_unlock(&callback_mutex); |
2021 | 2147 | ||
2022 | return mask; | 2148 | return mask; |
2023 | } | 2149 | } |
@@ -2041,11 +2167,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | |||
2041 | { | 2167 | { |
2042 | nodemask_t mask; | 2168 | nodemask_t mask; |
2043 | 2169 | ||
2044 | down(&callback_sem); | 2170 | mutex_lock(&callback_mutex); |
2045 | task_lock(tsk); | 2171 | task_lock(tsk); |
2046 | guarantee_online_mems(tsk->cpuset, &mask); | 2172 | guarantee_online_mems(tsk->cpuset, &mask); |
2047 | task_unlock(tsk); | 2173 | task_unlock(tsk); |
2048 | up(&callback_sem); | 2174 | mutex_unlock(&callback_mutex); |
2049 | 2175 | ||
2050 | return mask; | 2176 | return mask; |
2051 | } | 2177 | } |
@@ -2071,7 +2197,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
2071 | 2197 | ||
2072 | /* | 2198 | /* |
2073 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 2199 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
2074 | * ancestor to the specified cpuset. Call holding callback_sem. | 2200 | * ancestor to the specified cpuset. Call holding callback_mutex. |
2075 | * If no ancestor is mem_exclusive (an unusual configuration), then | 2201 | * If no ancestor is mem_exclusive (an unusual configuration), then |
2076 | * returns the root cpuset. | 2202 | * returns the root cpuset. |
2077 | */ | 2203 | */ |
@@ -2098,37 +2224,44 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
2098 | * GFP_KERNEL allocations are not so marked, so can escape to the | 2224 | * GFP_KERNEL allocations are not so marked, so can escape to the |
2099 | * nearest mem_exclusive ancestor cpuset. | 2225 | * nearest mem_exclusive ancestor cpuset. |
2100 | * | 2226 | * |
2101 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() | 2227 | * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() |
2102 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 2228 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
2103 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 2229 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks |
2104 | * mems_allowed came up empty on the first pass over the zonelist. | 2230 | * mems_allowed came up empty on the first pass over the zonelist. |
2105 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 2231 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
2106 | * short of memory, might require taking the callback_sem semaphore. | 2232 | * short of memory, might require taking the callback_mutex mutex. |
2107 | * | 2233 | * |
2108 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 2234 | * The first call here from mm/page_alloc:get_page_from_freelist() |
2109 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 2235 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so |
2110 | * hardwall cpusets - no allocation on a node outside the cpuset is | 2236 | * no allocation on a node outside the cpuset is allowed (unless in |
2111 | * allowed (unless in interrupt, of course). | 2237 | * interrupt, of course). |
2112 | * | 2238 | * |
2113 | * The second loop doesn't even call here for GFP_ATOMIC requests | 2239 | * The second pass through get_page_from_freelist() doesn't even call |
2114 | * (if the __alloc_pages() local variable 'wait' is set). That check | 2240 | * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() |
2115 | * and the checks below have the combined affect in the second loop of | 2241 | * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set |
2116 | * the __alloc_pages() routine that: | 2242 | * in alloc_flags. That logic and the checks below have the combined |
2243 | * affect that: | ||
2117 | * in_interrupt - any node ok (current task context irrelevant) | 2244 | * in_interrupt - any node ok (current task context irrelevant) |
2118 | * GFP_ATOMIC - any node ok | 2245 | * GFP_ATOMIC - any node ok |
2119 | * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok | 2246 | * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok |
2120 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2247 | * GFP_USER - only nodes in current tasks mems allowed ok. |
2248 | * | ||
2249 | * Rule: | ||
2250 | * Don't call cpuset_zone_allowed() if you can't sleep, unless you | ||
2251 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables | ||
2252 | * the code that might scan up ancestor cpusets and sleep. | ||
2121 | **/ | 2253 | **/ |
2122 | 2254 | ||
2123 | int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | 2255 | int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) |
2124 | { | 2256 | { |
2125 | int node; /* node that zone z is on */ | 2257 | int node; /* node that zone z is on */ |
2126 | const struct cpuset *cs; /* current cpuset ancestors */ | 2258 | const struct cpuset *cs; /* current cpuset ancestors */ |
2127 | int allowed = 1; /* is allocation in zone z allowed? */ | 2259 | int allowed; /* is allocation in zone z allowed? */ |
2128 | 2260 | ||
2129 | if (in_interrupt()) | 2261 | if (in_interrupt()) |
2130 | return 1; | 2262 | return 1; |
2131 | node = z->zone_pgdat->node_id; | 2263 | node = z->zone_pgdat->node_id; |
2264 | might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); | ||
2132 | if (node_isset(node, current->mems_allowed)) | 2265 | if (node_isset(node, current->mems_allowed)) |
2133 | return 1; | 2266 | return 1; |
2134 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ | 2267 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ |
@@ -2138,31 +2271,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
2138 | return 1; | 2271 | return 1; |
2139 | 2272 | ||
2140 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 2273 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
2141 | down(&callback_sem); | 2274 | mutex_lock(&callback_mutex); |
2142 | 2275 | ||
2143 | task_lock(current); | 2276 | task_lock(current); |
2144 | cs = nearest_exclusive_ancestor(current->cpuset); | 2277 | cs = nearest_exclusive_ancestor(current->cpuset); |
2145 | task_unlock(current); | 2278 | task_unlock(current); |
2146 | 2279 | ||
2147 | allowed = node_isset(node, cs->mems_allowed); | 2280 | allowed = node_isset(node, cs->mems_allowed); |
2148 | up(&callback_sem); | 2281 | mutex_unlock(&callback_mutex); |
2149 | return allowed; | 2282 | return allowed; |
2150 | } | 2283 | } |
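The new might_sleep_if() annotation enforces the rule stated in the comment above: a caller that cannot sleep must pass __GFP_HARDWALL so the slow path that takes callback_mutex is never reached. A hedged sketch of how a caller might honour that rule follows; the wrapper name and the atomic_ctx flag are illustrative, not part of this patch.

/* sketch: respecting the "don't sleep without __GFP_HARDWALL" rule */
static int zone_ok_for_alloc(struct zone *z, gfp_t gfp_mask, int atomic_ctx)
{
	if (atomic_ctx)
		/* cannot sleep: restrict the check to the hardwall path */
		return cpuset_zone_allowed(z, gfp_mask | __GFP_HARDWALL);

	/* process context: may scan up mem_exclusive ancestors and block */
	return cpuset_zone_allowed(z, gfp_mask);
}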
2151 | 2284 | ||
2152 | /** | 2285 | /** |
2153 | * cpuset_lock - lock out any changes to cpuset structures | 2286 | * cpuset_lock - lock out any changes to cpuset structures |
2154 | * | 2287 | * |
2155 | * The out of memory (oom) code needs to lock down cpusets | 2288 | * The out of memory (oom) code needs to mutex_lock cpusets |
2156 | * from being changed while it scans the tasklist looking for a | 2289 | * from being changed while it scans the tasklist looking for a |
2157 | * task in an overlapping cpuset. Expose callback_sem via this | 2290 | * task in an overlapping cpuset. Expose callback_mutex via this |
2158 | * cpuset_lock() routine, so the oom code can lock it, before | 2291 | * cpuset_lock() routine, so the oom code can lock it, before |
2159 | * locking the task list. The tasklist_lock is a spinlock, so | 2292 | * locking the task list. The tasklist_lock is a spinlock, so |
2160 | * must be taken inside callback_sem. | 2293 | * must be taken inside callback_mutex. |
2161 | */ | 2294 | */ |
2162 | 2295 | ||
2163 | void cpuset_lock(void) | 2296 | void cpuset_lock(void) |
2164 | { | 2297 | { |
2165 | down(&callback_sem); | 2298 | mutex_lock(&callback_mutex); |
2166 | } | 2299 | } |
2167 | 2300 | ||
2168 | /** | 2301 | /** |
@@ -2173,10 +2306,48 @@ void cpuset_lock(void) | |||
2173 | 2306 | ||
2174 | void cpuset_unlock(void) | 2307 | void cpuset_unlock(void) |
2175 | { | 2308 | { |
2176 | up(&callback_sem); | 2309 | mutex_unlock(&callback_mutex); |
2177 | } | 2310 | } |
2178 | 2311 | ||
2179 | /** | 2312 | /** |
2313 | * cpuset_mem_spread_node() - On which node to begin search for a page | ||
2314 | * | ||
2315 | * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for | ||
2316 | * tasks in a cpuset with is_spread_page or is_spread_slab set), | ||
2317 | * and if the memory allocation used cpuset_mem_spread_node() | ||
2318 | * to determine on which node to start looking, as it will for | ||
2319 | * certain page cache or slab cache pages such as used for file | ||
2320 | * system buffers and inode caches, then instead of starting on the | ||
2321 | * local node to look for a free page, rather spread the starting | ||
2322 | * node around the tasks mems_allowed nodes. | ||
2323 | * | ||
2324 | * We don't have to worry about the returned node being offline | ||
2325 | * because "it can't happen", and even if it did, it would be ok. | ||
2326 | * | ||
2327 | * The routines calling guarantee_online_mems() are careful to | ||
2328 | * only set nodes in task->mems_allowed that are online. So it | ||
2329 | * should not be possible for the following code to return an | ||
2330 | * offline node. But if it did, that would be ok, as this routine | ||
2331 | * is not returning the node where the allocation must be, only | ||
2332 | * the node where the search should start. The zonelist passed to | ||
2333 | * __alloc_pages() will include all nodes. If the slab allocator | ||
2334 | * is passed an offline node, it will fall back to the local node. | ||
2335 | * See kmem_cache_alloc_node(). | ||
2336 | */ | ||
2337 | |||
2338 | int cpuset_mem_spread_node(void) | ||
2339 | { | ||
2340 | int node; | ||
2341 | |||
2342 | node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); | ||
2343 | if (node == MAX_NUMNODES) | ||
2344 | node = first_node(current->mems_allowed); | ||
2345 | current->cpuset_mem_spread_rotor = node; | ||
2346 | return node; | ||
2347 | } | ||
2348 | EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); | ||
2349 | |||
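cpuset_mem_spread_node() is only half of the feature; the other half is allocation sites that consult it when the task carries PF_SPREAD_PAGE or PF_SPREAD_SLAB. The real call sites live in mm/ and the slab allocator, outside this diff, so the following is only a hedged sketch with a hypothetical function name.

/* sketch: spread a page-cache allocation across mems_allowed */
static struct page *page_cache_alloc_sketch(gfp_t gfp_mask)
{
	if (current->flags & PF_SPREAD_PAGE) {
		int nid = cpuset_mem_spread_node();	/* rotor above */

		return alloc_pages_node(nid, gfp_mask, 0);
	}
	return alloc_pages(gfp_mask, 0);
}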
2350 | /** | ||
2180 | * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? | 2351 | * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? |
2181 | * @p: pointer to task_struct of some other task. | 2352 | * @p: pointer to task_struct of some other task. |
2182 | * | 2353 | * |
@@ -2185,7 +2356,7 @@ void cpuset_unlock(void) | |||
2185 | * determine if task @p's memory usage might impact the memory | 2356 | * determine if task @p's memory usage might impact the memory |
2186 | * available to the current task. | 2357 | * available to the current task. |
2187 | * | 2358 | * |
2188 | * Call while holding callback_sem. | 2359 | * Call while holding callback_mutex. |
2189 | **/ | 2360 | **/ |
2190 | 2361 | ||
2191 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 2362 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
@@ -2256,13 +2427,13 @@ void __cpuset_memory_pressure_bump(void) | |||
2256 | * - Used for /proc/<pid>/cpuset. | 2427 | * - Used for /proc/<pid>/cpuset. |
2257 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | 2428 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it |
2258 | * doesn't really matter if tsk->cpuset changes after we read it, | 2429 | * doesn't really matter if tsk->cpuset changes after we read it, |
2259 | * and we take manage_sem, keeping attach_task() from changing it | 2430 | * and we take manage_mutex, keeping attach_task() from changing it |
2260 | * anyway. | 2431 | * anyway. No need to check that tsk->cpuset != NULL, thanks to |
2432 | * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks | ||
2433 | * cpuset to top_cpuset. | ||
2261 | */ | 2434 | */ |
2262 | |||
2263 | static int proc_cpuset_show(struct seq_file *m, void *v) | 2435 | static int proc_cpuset_show(struct seq_file *m, void *v) |
2264 | { | 2436 | { |
2265 | struct cpuset *cs; | ||
2266 | struct task_struct *tsk; | 2437 | struct task_struct *tsk; |
2267 | char *buf; | 2438 | char *buf; |
2268 | int retval = 0; | 2439 | int retval = 0; |
@@ -2272,20 +2443,14 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
2272 | return -ENOMEM; | 2443 | return -ENOMEM; |
2273 | 2444 | ||
2274 | tsk = m->private; | 2445 | tsk = m->private; |
2275 | down(&manage_sem); | 2446 | mutex_lock(&manage_mutex); |
2276 | cs = tsk->cpuset; | 2447 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); |
2277 | if (!cs) { | ||
2278 | retval = -EINVAL; | ||
2279 | goto out; | ||
2280 | } | ||
2281 | |||
2282 | retval = cpuset_path(cs, buf, PAGE_SIZE); | ||
2283 | if (retval < 0) | 2448 | if (retval < 0) |
2284 | goto out; | 2449 | goto out; |
2285 | seq_puts(m, buf); | 2450 | seq_puts(m, buf); |
2286 | seq_putc(m, '\n'); | 2451 | seq_putc(m, '\n'); |
2287 | out: | 2452 | out: |
2288 | up(&manage_sem); | 2453 | mutex_unlock(&manage_mutex); |
2289 | kfree(buf); | 2454 | kfree(buf); |
2290 | return retval; | 2455 | return retval; |
2291 | } | 2456 | } |
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 867d6dbeb5..c01cead2cf 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
@@ -140,6 +140,7 @@ __set_personality(u_long personality) | |||
140 | ep = lookup_exec_domain(personality); | 140 | ep = lookup_exec_domain(personality); |
141 | if (ep == current_thread_info()->exec_domain) { | 141 | if (ep == current_thread_info()->exec_domain) { |
142 | current->personality = personality; | 142 | current->personality = personality; |
143 | module_put(ep->module); | ||
143 | return 0; | 144 | return 0; |
144 | } | 145 | } |
145 | 146 | ||
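The added module_put() fixes a module reference leak: lookup_exec_domain() returns with a reference held on the exec domain's module, and the early return for an unchanged personality used to keep that reference forever, pinning the module. The balance rule it restores, as a generic sketch (the handler structure and its fields are hypothetical):

#include <linux/module.h>

struct my_handler {
	struct module *owner;
};

/* sketch: balance try_module_get()/module_put() on every path */
static int use_handler(struct my_handler *h, int nothing_to_do)
{
	if (!try_module_get(h->owner))
		return -ENODEV;

	if (nothing_to_do) {
		module_put(h->owner);	/* the path that used to leak */
		return 0;
	}

	/* ... real work with the handler goes here ... */

	module_put(h->owner);
	return 0;
}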
diff --git a/kernel/exit.c b/kernel/exit.c index 93cee36713..e95b932822 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -29,8 +29,13 @@ | |||
29 | #include <linux/cpuset.h> | 29 | #include <linux/cpuset.h> |
30 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
31 | #include <linux/signal.h> | 31 | #include <linux/signal.h> |
32 | #include <linux/posix-timers.h> | ||
32 | #include <linux/cn_proc.h> | 33 | #include <linux/cn_proc.h> |
33 | #include <linux/mutex.h> | 34 | #include <linux/mutex.h> |
35 | #include <linux/futex.h> | ||
36 | #include <linux/compat.h> | ||
37 | #include <linux/pipe_fs_i.h> | ||
38 | #include <linux/audit.h> /* for audit_free() */ | ||
34 | 39 | ||
35 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
36 | #include <asm/unistd.h> | 41 | #include <asm/unistd.h> |
@@ -48,15 +53,85 @@ static void __unhash_process(struct task_struct *p) | |||
48 | { | 53 | { |
49 | nr_threads--; | 54 | nr_threads--; |
50 | detach_pid(p, PIDTYPE_PID); | 55 | detach_pid(p, PIDTYPE_PID); |
51 | detach_pid(p, PIDTYPE_TGID); | ||
52 | if (thread_group_leader(p)) { | 56 | if (thread_group_leader(p)) { |
53 | detach_pid(p, PIDTYPE_PGID); | 57 | detach_pid(p, PIDTYPE_PGID); |
54 | detach_pid(p, PIDTYPE_SID); | 58 | detach_pid(p, PIDTYPE_SID); |
55 | if (p->pid) | 59 | |
56 | __get_cpu_var(process_counts)--; | 60 | list_del_rcu(&p->tasks); |
61 | __get_cpu_var(process_counts)--; | ||
62 | } | ||
63 | list_del_rcu(&p->thread_group); | ||
64 | remove_parent(p); | ||
65 | } | ||
66 | |||
67 | /* | ||
68 | * This function expects the tasklist_lock write-locked. | ||
69 | */ | ||
70 | static void __exit_signal(struct task_struct *tsk) | ||
71 | { | ||
72 | struct signal_struct *sig = tsk->signal; | ||
73 | struct sighand_struct *sighand; | ||
74 | |||
75 | BUG_ON(!sig); | ||
76 | BUG_ON(!atomic_read(&sig->count)); | ||
77 | |||
78 | rcu_read_lock(); | ||
79 | sighand = rcu_dereference(tsk->sighand); | ||
80 | spin_lock(&sighand->siglock); | ||
81 | |||
82 | posix_cpu_timers_exit(tsk); | ||
83 | if (atomic_dec_and_test(&sig->count)) | ||
84 | posix_cpu_timers_exit_group(tsk); | ||
85 | else { | ||
86 | /* | ||
87 | * If there is any task waiting for the group exit | ||
88 | * then notify it: | ||
89 | */ | ||
90 | if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { | ||
91 | wake_up_process(sig->group_exit_task); | ||
92 | sig->group_exit_task = NULL; | ||
93 | } | ||
94 | if (tsk == sig->curr_target) | ||
95 | sig->curr_target = next_thread(tsk); | ||
96 | /* | ||
97 | * Accumulate here the counters for all threads but the | ||
98 | * group leader as they die, so they can be added into | ||
99 | * the process-wide totals when those are taken. | ||
100 | * The group leader stays around as a zombie as long | ||
101 | * as there are other threads. When it gets reaped, | ||
102 | * the exit.c code will add its counts into these totals. | ||
103 | * We won't ever get here for the group leader, since it | ||
104 | * will have been the last reference on the signal_struct. | ||
105 | */ | ||
106 | sig->utime = cputime_add(sig->utime, tsk->utime); | ||
107 | sig->stime = cputime_add(sig->stime, tsk->stime); | ||
108 | sig->min_flt += tsk->min_flt; | ||
109 | sig->maj_flt += tsk->maj_flt; | ||
110 | sig->nvcsw += tsk->nvcsw; | ||
111 | sig->nivcsw += tsk->nivcsw; | ||
112 | sig->sched_time += tsk->sched_time; | ||
113 | sig = NULL; /* Marker for below. */ | ||
57 | } | 114 | } |
58 | 115 | ||
59 | REMOVE_LINKS(p); | 116 | __unhash_process(tsk); |
117 | |||
118 | tsk->signal = NULL; | ||
119 | tsk->sighand = NULL; | ||
120 | spin_unlock(&sighand->siglock); | ||
121 | rcu_read_unlock(); | ||
122 | |||
123 | __cleanup_sighand(sighand); | ||
124 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | ||
125 | flush_sigqueue(&tsk->pending); | ||
126 | if (sig) { | ||
127 | flush_sigqueue(&sig->shared_pending); | ||
128 | __cleanup_signal(sig); | ||
129 | } | ||
130 | } | ||
131 | |||
132 | static void delayed_put_task_struct(struct rcu_head *rhp) | ||
133 | { | ||
134 | put_task_struct(container_of(rhp, struct task_struct, rcu)); | ||
60 | } | 135 | } |
61 | 136 | ||
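Because tasks are now linked on RCU-protected lists (note the list_del_rcu() calls above), release_task() further down in this hunk stops dropping its reference immediately and instead defers it through call_rcu() to delayed_put_task_struct(), so lock-free walkers of those lists never chase freed memory. The generic shape of that RCU deferred-free pattern, as a standalone sketch:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	int data;
	struct rcu_head rcu;
};

static void obj_free_rcu(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct obj, rcu));
}

static void obj_release(struct obj *o)
{
	/* readers under rcu_read_lock() may still see 'o'; defer the
	 * actual kfree() until a grace period has elapsed. */
	call_rcu(&o->rcu, obj_free_rcu);
}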
62 | void release_task(struct task_struct * p) | 137 | void release_task(struct task_struct * p) |
@@ -65,21 +140,14 @@ void release_task(struct task_struct * p) | |||
65 | task_t *leader; | 140 | task_t *leader; |
66 | struct dentry *proc_dentry; | 141 | struct dentry *proc_dentry; |
67 | 142 | ||
68 | repeat: | 143 | repeat: |
69 | atomic_dec(&p->user->processes); | 144 | atomic_dec(&p->user->processes); |
70 | spin_lock(&p->proc_lock); | 145 | spin_lock(&p->proc_lock); |
71 | proc_dentry = proc_pid_unhash(p); | 146 | proc_dentry = proc_pid_unhash(p); |
72 | write_lock_irq(&tasklist_lock); | 147 | write_lock_irq(&tasklist_lock); |
73 | if (unlikely(p->ptrace)) | 148 | ptrace_unlink(p); |
74 | __ptrace_unlink(p); | ||
75 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 149 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
76 | __exit_signal(p); | 150 | __exit_signal(p); |
77 | /* | ||
78 | * Note that the fastpath in sys_times depends on __exit_signal having | ||
79 | * updated the counters before a task is removed from the tasklist of | ||
80 | * the process by __unhash_process. | ||
81 | */ | ||
82 | __unhash_process(p); | ||
83 | 151 | ||
84 | /* | 152 | /* |
85 | * If we are the last non-leader member of the thread | 153 | * If we are the last non-leader member of the thread |
@@ -107,28 +175,13 @@ repeat: | |||
107 | spin_unlock(&p->proc_lock); | 175 | spin_unlock(&p->proc_lock); |
108 | proc_pid_flush(proc_dentry); | 176 | proc_pid_flush(proc_dentry); |
109 | release_thread(p); | 177 | release_thread(p); |
110 | put_task_struct(p); | 178 | call_rcu(&p->rcu, delayed_put_task_struct); |
111 | 179 | ||
112 | p = leader; | 180 | p = leader; |
113 | if (unlikely(zap_leader)) | 181 | if (unlikely(zap_leader)) |
114 | goto repeat; | 182 | goto repeat; |
115 | } | 183 | } |
116 | 184 | ||
117 | /* we are using it only for SMP init */ | ||
118 | |||
119 | void unhash_process(struct task_struct *p) | ||
120 | { | ||
121 | struct dentry *proc_dentry; | ||
122 | |||
123 | spin_lock(&p->proc_lock); | ||
124 | proc_dentry = proc_pid_unhash(p); | ||
125 | write_lock_irq(&tasklist_lock); | ||
126 | __unhash_process(p); | ||
127 | write_unlock_irq(&tasklist_lock); | ||
128 | spin_unlock(&p->proc_lock); | ||
129 | proc_pid_flush(proc_dentry); | ||
130 | } | ||
131 | |||
132 | /* | 185 | /* |
133 | * This checks not only the pgrp, but falls back on the pid if no | 186 | * This checks not only the pgrp, but falls back on the pid if no |
134 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly | 187 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly |
@@ -236,10 +289,10 @@ static void reparent_to_init(void) | |||
236 | 289 | ||
237 | ptrace_unlink(current); | 290 | ptrace_unlink(current); |
238 | /* Reparent to init */ | 291 | /* Reparent to init */ |
239 | REMOVE_LINKS(current); | 292 | remove_parent(current); |
240 | current->parent = child_reaper; | 293 | current->parent = child_reaper; |
241 | current->real_parent = child_reaper; | 294 | current->real_parent = child_reaper; |
242 | SET_LINKS(current); | 295 | add_parent(current); |
243 | 296 | ||
244 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 297 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
245 | current->exit_signal = SIGCHLD; | 298 | current->exit_signal = SIGCHLD; |
@@ -345,9 +398,9 @@ void daemonize(const char *name, ...) | |||
345 | exit_mm(current); | 398 | exit_mm(current); |
346 | 399 | ||
347 | set_special_pids(1, 1); | 400 | set_special_pids(1, 1); |
348 | down(&tty_sem); | 401 | mutex_lock(&tty_mutex); |
349 | current->signal->tty = NULL; | 402 | current->signal->tty = NULL; |
350 | up(&tty_sem); | 403 | mutex_unlock(&tty_mutex); |
351 | 404 | ||
352 | /* Block and flush all signals */ | 405 | /* Block and flush all signals */ |
353 | sigfillset(&blocked); | 406 | sigfillset(&blocked); |
@@ -360,6 +413,9 @@ void daemonize(const char *name, ...) | |||
360 | fs = init_task.fs; | 413 | fs = init_task.fs; |
361 | current->fs = fs; | 414 | current->fs = fs; |
362 | atomic_inc(&fs->count); | 415 | atomic_inc(&fs->count); |
416 | exit_namespace(current); | ||
417 | current->namespace = init_task.namespace; | ||
418 | get_namespace(current->namespace); | ||
363 | exit_files(current); | 419 | exit_files(current); |
364 | current->files = init_task.files; | 420 | current->files = init_task.files; |
365 | atomic_inc(¤t->files->count); | 421 | atomic_inc(¤t->files->count); |
@@ -533,13 +589,13 @@ static void exit_mm(struct task_struct * tsk) | |||
533 | mmput(mm); | 589 | mmput(mm); |
534 | } | 590 | } |
535 | 591 | ||
536 | static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) | 592 | static inline void choose_new_parent(task_t *p, task_t *reaper) |
537 | { | 593 | { |
538 | /* | 594 | /* |
539 | * Make sure we're not reparenting to ourselves and that | 595 | * Make sure we're not reparenting to ourselves and that |
540 | * the parent is not a zombie. | 596 | * the parent is not a zombie. |
541 | */ | 597 | */ |
542 | BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); | 598 | BUG_ON(p == reaper || reaper->exit_state); |
543 | p->real_parent = reaper; | 599 | p->real_parent = reaper; |
544 | } | 600 | } |
545 | 601 | ||
@@ -564,9 +620,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced) | |||
564 | * anyway, so let go of it. | 620 | * anyway, so let go of it. |
565 | */ | 621 | */ |
566 | p->ptrace = 0; | 622 | p->ptrace = 0; |
567 | list_del_init(&p->sibling); | 623 | remove_parent(p); |
568 | p->parent = p->real_parent; | 624 | p->parent = p->real_parent; |
569 | list_add_tail(&p->sibling, &p->parent->children); | 625 | add_parent(p); |
570 | 626 | ||
571 | /* If we'd notified the old parent about this child's death, | 627 | /* If we'd notified the old parent about this child's death, |
572 | * also notify the new parent. | 628 | * also notify the new parent. |
@@ -640,7 +696,7 @@ static void forget_original_parent(struct task_struct * father, | |||
640 | 696 | ||
641 | if (father == p->real_parent) { | 697 | if (father == p->real_parent) { |
642 | /* reparent with a reaper, real father it's us */ | 698 | /* reparent with a reaper, real father it's us */ |
643 | choose_new_parent(p, reaper, child_reaper); | 699 | choose_new_parent(p, reaper); |
644 | reparent_thread(p, father, 0); | 700 | reparent_thread(p, father, 0); |
645 | } else { | 701 | } else { |
646 | /* reparent ptraced task to its real parent */ | 702 | /* reparent ptraced task to its real parent */ |
@@ -661,7 +717,7 @@ static void forget_original_parent(struct task_struct * father, | |||
661 | } | 717 | } |
662 | list_for_each_safe(_p, _n, &father->ptrace_children) { | 718 | list_for_each_safe(_p, _n, &father->ptrace_children) { |
663 | p = list_entry(_p,struct task_struct,ptrace_list); | 719 | p = list_entry(_p,struct task_struct,ptrace_list); |
664 | choose_new_parent(p, reaper, child_reaper); | 720 | choose_new_parent(p, reaper); |
665 | reparent_thread(p, father, 1); | 721 | reparent_thread(p, father, 1); |
666 | } | 722 | } |
667 | } | 723 | } |
@@ -802,10 +858,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
802 | panic("Aiee, killing interrupt handler!"); | 858 | panic("Aiee, killing interrupt handler!"); |
803 | if (unlikely(!tsk->pid)) | 859 | if (unlikely(!tsk->pid)) |
804 | panic("Attempted to kill the idle task!"); | 860 | panic("Attempted to kill the idle task!"); |
805 | if (unlikely(tsk->pid == 1)) | 861 | if (unlikely(tsk == child_reaper)) |
806 | panic("Attempted to kill init!"); | 862 | panic("Attempted to kill init!"); |
807 | if (tsk->io_context) | ||
808 | exit_io_context(); | ||
809 | 863 | ||
810 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { | 864 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { |
811 | current->ptrace_message = code; | 865 | current->ptrace_message = code; |
@@ -819,6 +873,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
819 | if (unlikely(tsk->flags & PF_EXITING)) { | 873 | if (unlikely(tsk->flags & PF_EXITING)) { |
820 | printk(KERN_ALERT | 874 | printk(KERN_ALERT |
821 | "Fixing recursive fault but reboot is needed!\n"); | 875 | "Fixing recursive fault but reboot is needed!\n"); |
876 | if (tsk->io_context) | ||
877 | exit_io_context(); | ||
822 | set_current_state(TASK_UNINTERRUPTIBLE); | 878 | set_current_state(TASK_UNINTERRUPTIBLE); |
823 | schedule(); | 879 | schedule(); |
824 | } | 880 | } |
@@ -849,6 +905,14 @@ fastcall NORET_TYPE void do_exit(long code) | |||
849 | exit_itimers(tsk->signal); | 905 | exit_itimers(tsk->signal); |
850 | acct_process(code); | 906 | acct_process(code); |
851 | } | 907 | } |
908 | if (unlikely(tsk->robust_list)) | ||
909 | exit_robust_list(tsk); | ||
910 | #ifdef CONFIG_COMPAT | ||
911 | if (unlikely(tsk->compat_robust_list)) | ||
912 | compat_exit_robust_list(tsk); | ||
913 | #endif | ||
914 | if (unlikely(tsk->audit_context)) | ||
915 | audit_free(tsk); | ||
852 | exit_mm(tsk); | 916 | exit_mm(tsk); |
853 | 917 | ||
854 | exit_sem(tsk); | 918 | exit_sem(tsk); |
@@ -878,6 +942,12 @@ fastcall NORET_TYPE void do_exit(long code) | |||
878 | */ | 942 | */ |
879 | mutex_debug_check_no_locks_held(tsk); | 943 | mutex_debug_check_no_locks_held(tsk); |
880 | 944 | ||
945 | if (tsk->io_context) | ||
946 | exit_io_context(); | ||
947 | |||
948 | if (tsk->splice_pipe) | ||
949 | __free_pipe_info(tsk->splice_pipe); | ||
950 | |||
881 | /* PF_DEAD causes final put_task_struct after we schedule. */ | 951 | /* PF_DEAD causes final put_task_struct after we schedule. */ |
882 | preempt_disable(); | 952 | preempt_disable(); |
883 | BUG_ON(tsk->flags & PF_DEAD); | 953 | BUG_ON(tsk->flags & PF_DEAD); |
@@ -906,13 +976,6 @@ asmlinkage long sys_exit(int error_code) | |||
906 | do_exit((error_code&0xff)<<8); | 976 | do_exit((error_code&0xff)<<8); |
907 | } | 977 | } |
908 | 978 | ||
909 | task_t fastcall *next_thread(const task_t *p) | ||
910 | { | ||
911 | return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); | ||
912 | } | ||
913 | |||
914 | EXPORT_SYMBOL(next_thread); | ||
915 | |||
916 | /* | 979 | /* |
917 | * Take down every thread in the group. This is called by fatal signals | 980 | * Take down every thread in the group. This is called by fatal signals |
918 | * as well as by sys_exit_group (below). | 981 | * as well as by sys_exit_group (below). |
@@ -927,7 +990,6 @@ do_group_exit(int exit_code) | |||
927 | else if (!thread_group_empty(current)) { | 990 | else if (!thread_group_empty(current)) { |
928 | struct signal_struct *const sig = current->signal; | 991 | struct signal_struct *const sig = current->signal; |
929 | struct sighand_struct *const sighand = current->sighand; | 992 | struct sighand_struct *const sighand = current->sighand; |
930 | read_lock(&tasklist_lock); | ||
931 | spin_lock_irq(&sighand->siglock); | 993 | spin_lock_irq(&sighand->siglock); |
932 | if (sig->flags & SIGNAL_GROUP_EXIT) | 994 | if (sig->flags & SIGNAL_GROUP_EXIT) |
933 | /* Another thread got here before we took the lock. */ | 995 | /* Another thread got here before we took the lock. */ |
@@ -937,7 +999,6 @@ do_group_exit(int exit_code) | |||
937 | zap_other_threads(current); | 999 | zap_other_threads(current); |
938 | } | 1000 | } |
939 | spin_unlock_irq(&sighand->siglock); | 1001 | spin_unlock_irq(&sighand->siglock); |
940 | read_unlock(&tasklist_lock); | ||
941 | } | 1002 | } |
942 | 1003 | ||
943 | do_exit(exit_code); | 1004 | do_exit(exit_code); |
@@ -1267,7 +1328,7 @@ bail_ref: | |||
1267 | 1328 | ||
1268 | /* move to end of parent's list to avoid starvation */ | 1329 | /* move to end of parent's list to avoid starvation */ |
1269 | remove_parent(p); | 1330 | remove_parent(p); |
1270 | add_parent(p, p->parent); | 1331 | add_parent(p); |
1271 | 1332 | ||
1272 | write_unlock_irq(&tasklist_lock); | 1333 | write_unlock_irq(&tasklist_lock); |
1273 | 1334 | ||
diff --git a/kernel/extable.c b/kernel/extable.c index 7501b531ce..7fe2628553 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -40,7 +40,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) | |||
40 | return e; | 40 | return e; |
41 | } | 41 | } |
42 | 42 | ||
43 | static int core_kernel_text(unsigned long addr) | 43 | int core_kernel_text(unsigned long addr) |
44 | { | 44 | { |
45 | if (addr >= (unsigned long)_stext && | 45 | if (addr >= (unsigned long)_stext && |
46 | addr <= (unsigned long)_etext) | 46 | addr <= (unsigned long)_etext) |
diff --git a/kernel/fork.c b/kernel/fork.c index 8e88b374ce..ac8100e308 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep; | |||
84 | #endif | 84 | #endif |
85 | 85 | ||
86 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 86 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
87 | kmem_cache_t *signal_cachep; | 87 | static kmem_cache_t *signal_cachep; |
88 | 88 | ||
89 | /* SLAB cache for sighand_struct structures (tsk->sighand) */ | 89 | /* SLAB cache for sighand_struct structures (tsk->sighand) */ |
90 | kmem_cache_t *sighand_cachep; | 90 | kmem_cache_t *sighand_cachep; |
@@ -114,8 +114,6 @@ void __put_task_struct(struct task_struct *tsk) | |||
114 | WARN_ON(atomic_read(&tsk->usage)); | 114 | WARN_ON(atomic_read(&tsk->usage)); |
115 | WARN_ON(tsk == current); | 115 | WARN_ON(tsk == current); |
116 | 116 | ||
117 | if (unlikely(tsk->audit_context)) | ||
118 | audit_free(tsk); | ||
119 | security_task_free(tsk); | 117 | security_task_free(tsk); |
120 | free_uid(tsk->user); | 118 | free_uid(tsk->user); |
121 | put_group_info(tsk->group_info); | 119 | put_group_info(tsk->group_info); |
@@ -179,6 +177,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
179 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | 177 | /* One for us, one for whoever does the "release_task()" (usually parent) */ |
180 | atomic_set(&tsk->usage,2); | 178 | atomic_set(&tsk->usage,2); |
181 | atomic_set(&tsk->fs_excl, 0); | 179 | atomic_set(&tsk->fs_excl, 0); |
180 | tsk->btrace_seq = 0; | ||
181 | tsk->splice_pipe = NULL; | ||
182 | return tsk; | 182 | return tsk; |
183 | } | 183 | } |
184 | 184 | ||
@@ -605,12 +605,12 @@ static struct files_struct *alloc_files(void) | |||
605 | atomic_set(&newf->count, 1); | 605 | atomic_set(&newf->count, 1); |
606 | 606 | ||
607 | spin_lock_init(&newf->file_lock); | 607 | spin_lock_init(&newf->file_lock); |
608 | newf->next_fd = 0; | ||
608 | fdt = &newf->fdtab; | 609 | fdt = &newf->fdtab; |
609 | fdt->next_fd = 0; | ||
610 | fdt->max_fds = NR_OPEN_DEFAULT; | 610 | fdt->max_fds = NR_OPEN_DEFAULT; |
611 | fdt->max_fdset = __FD_SETSIZE; | 611 | fdt->max_fdset = EMBEDDED_FD_SET_SIZE; |
612 | fdt->close_on_exec = &newf->close_on_exec_init; | 612 | fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; |
613 | fdt->open_fds = &newf->open_fds_init; | 613 | fdt->open_fds = (fd_set *)&newf->open_fds_init; |
614 | fdt->fd = &newf->fd_array[0]; | 614 | fdt->fd = &newf->fd_array[0]; |
615 | INIT_RCU_HEAD(&fdt->rcu); | 615 | INIT_RCU_HEAD(&fdt->rcu); |
616 | fdt->free_files = NULL; | 616 | fdt->free_files = NULL; |
@@ -718,7 +718,7 @@ out_release: | |||
718 | free_fdset (new_fdt->open_fds, new_fdt->max_fdset); | 718 | free_fdset (new_fdt->open_fds, new_fdt->max_fdset); |
719 | free_fd_array(new_fdt->fd, new_fdt->max_fds); | 719 | free_fd_array(new_fdt->fd, new_fdt->max_fds); |
720 | kmem_cache_free(files_cachep, newf); | 720 | kmem_cache_free(files_cachep, newf); |
721 | goto out; | 721 | return NULL; |
722 | } | 722 | } |
723 | 723 | ||
724 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | 724 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) |
@@ -766,8 +766,7 @@ int unshare_files(void) | |||
766 | struct files_struct *files = current->files; | 766 | struct files_struct *files = current->files; |
767 | int rc; | 767 | int rc; |
768 | 768 | ||
769 | if(!files) | 769 | BUG_ON(!files); |
770 | BUG(); | ||
771 | 770 | ||
772 | /* This can race but the race causes us to copy when we don't | 771 | /* This can race but the race causes us to copy when we don't |
773 | need to and drop the copy */ | 772 | need to and drop the copy */ |
@@ -784,14 +783,6 @@ int unshare_files(void) | |||
784 | 783 | ||
785 | EXPORT_SYMBOL(unshare_files); | 784 | EXPORT_SYMBOL(unshare_files); |
786 | 785 | ||
787 | void sighand_free_cb(struct rcu_head *rhp) | ||
788 | { | ||
789 | struct sighand_struct *sp; | ||
790 | |||
791 | sp = container_of(rhp, struct sighand_struct, rcu); | ||
792 | kmem_cache_free(sighand_cachep, sp); | ||
793 | } | ||
794 | |||
795 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) | 786 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) |
796 | { | 787 | { |
797 | struct sighand_struct *sig; | 788 | struct sighand_struct *sig; |
@@ -804,12 +795,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t | |||
804 | rcu_assign_pointer(tsk->sighand, sig); | 795 | rcu_assign_pointer(tsk->sighand, sig); |
805 | if (!sig) | 796 | if (!sig) |
806 | return -ENOMEM; | 797 | return -ENOMEM; |
807 | spin_lock_init(&sig->siglock); | ||
808 | atomic_set(&sig->count, 1); | 798 | atomic_set(&sig->count, 1); |
809 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); | 799 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); |
810 | return 0; | 800 | return 0; |
811 | } | 801 | } |
812 | 802 | ||
803 | void __cleanup_sighand(struct sighand_struct *sighand) | ||
804 | { | ||
805 | if (atomic_dec_and_test(&sighand->count)) | ||
806 | kmem_cache_free(sighand_cachep, sighand); | ||
807 | } | ||
808 | |||
813 | static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) | 809 | static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) |
814 | { | 810 | { |
815 | struct signal_struct *sig; | 811 | struct signal_struct *sig; |
@@ -845,7 +841,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
845 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); | 841 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); |
846 | sig->it_real_incr.tv64 = 0; | 842 | sig->it_real_incr.tv64 = 0; |
847 | sig->real_timer.function = it_real_fn; | 843 | sig->real_timer.function = it_real_fn; |
848 | sig->real_timer.data = tsk; | 844 | sig->tsk = tsk; |
849 | 845 | ||
850 | sig->it_virt_expires = cputime_zero; | 846 | sig->it_virt_expires = cputime_zero; |
851 | sig->it_virt_incr = cputime_zero; | 847 | sig->it_virt_incr = cputime_zero; |
@@ -879,6 +875,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
879 | return 0; | 875 | return 0; |
880 | } | 876 | } |
881 | 877 | ||
878 | void __cleanup_signal(struct signal_struct *sig) | ||
879 | { | ||
880 | exit_thread_group_keys(sig); | ||
881 | kmem_cache_free(signal_cachep, sig); | ||
882 | } | ||
883 | |||
884 | static inline void cleanup_signal(struct task_struct *tsk) | ||
885 | { | ||
886 | struct signal_struct *sig = tsk->signal; | ||
887 | |||
888 | atomic_dec(&sig->live); | ||
889 | |||
890 | if (atomic_dec_and_test(&sig->count)) | ||
891 | __cleanup_signal(sig); | ||
892 | } | ||
893 | |||
882 | static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) | 894 | static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) |
883 | { | 895 | { |
884 | unsigned long new_flags = p->flags; | 896 | unsigned long new_flags = p->flags; |
@@ -1018,6 +1030,7 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1018 | p->mempolicy = NULL; | 1030 | p->mempolicy = NULL; |
1019 | goto bad_fork_cleanup_cpuset; | 1031 | goto bad_fork_cleanup_cpuset; |
1020 | } | 1032 | } |
1033 | mpol_fix_fork_child_flag(p); | ||
1021 | #endif | 1034 | #endif |
1022 | 1035 | ||
1023 | #ifdef CONFIG_DEBUG_MUTEXES | 1036 | #ifdef CONFIG_DEBUG_MUTEXES |
@@ -1058,6 +1071,15 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1058 | * Clear TID on mm_release()? | 1071 | * Clear TID on mm_release()? |
1059 | */ | 1072 | */ |
1060 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; | 1073 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; |
1074 | p->robust_list = NULL; | ||
1075 | #ifdef CONFIG_COMPAT | ||
1076 | p->compat_robust_list = NULL; | ||
1077 | #endif | ||
1078 | /* | ||
1079 | * sigaltstack should be cleared when sharing the same VM | ||
1080 | */ | ||
1081 | if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) | ||
1082 | p->sas_ss_sp = p->sas_ss_size = 0; | ||
1061 | 1083 | ||
1062 | /* | 1084 | /* |
1063 | * Syscall tracing should be turned off in the child regardless | 1085 | * Syscall tracing should be turned off in the child regardless |
@@ -1083,6 +1105,7 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1083 | * We don't wake it up yet. | 1105 | * We don't wake it up yet. |
1084 | */ | 1106 | */ |
1085 | p->group_leader = p; | 1107 | p->group_leader = p; |
1108 | INIT_LIST_HEAD(&p->thread_group); | ||
1086 | INIT_LIST_HEAD(&p->ptrace_children); | 1109 | INIT_LIST_HEAD(&p->ptrace_children); |
1087 | INIT_LIST_HEAD(&p->ptrace_list); | 1110 | INIT_LIST_HEAD(&p->ptrace_list); |
1088 | 1111 | ||
@@ -1106,16 +1129,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1106 | !cpu_online(task_cpu(p)))) | 1129 | !cpu_online(task_cpu(p)))) |
1107 | set_task_cpu(p, smp_processor_id()); | 1130 | set_task_cpu(p, smp_processor_id()); |
1108 | 1131 | ||
1109 | /* | ||
1110 | * Check for pending SIGKILL! The new thread should not be allowed | ||
1111 | * to slip out of an OOM kill. (or normal SIGKILL.) | ||
1112 | */ | ||
1113 | if (sigismember(¤t->pending.signal, SIGKILL)) { | ||
1114 | write_unlock_irq(&tasklist_lock); | ||
1115 | retval = -EINTR; | ||
1116 | goto bad_fork_cleanup_namespace; | ||
1117 | } | ||
1118 | |||
1119 | /* CLONE_PARENT re-uses the old parent */ | 1132 | /* CLONE_PARENT re-uses the old parent */ |
1120 | if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) | 1133 | if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) |
1121 | p->real_parent = current->real_parent; | 1134 | p->real_parent = current->real_parent; |
@@ -1123,8 +1136,25 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1123 | p->real_parent = current; | 1136 | p->real_parent = current; |
1124 | p->parent = p->real_parent; | 1137 | p->parent = p->real_parent; |
1125 | 1138 | ||
1139 | spin_lock(¤t->sighand->siglock); | ||
1140 | |||
1141 | /* | ||
1142 | * Process group and session signals need to be delivered to just the | ||
1143 | * parent before the fork or both the parent and the child after the | ||
1144 | * fork. Restart if a signal comes in before we add the new process to | ||
1145 | * its process group. | ||
1146 | * A fatal signal pending means that current will exit, so the new | ||
1147 | * thread can't slip out of an OOM kill (or normal SIGKILL). | ||
1148 | */ | ||
1149 | recalc_sigpending(); | ||
1150 | if (signal_pending(current)) { | ||
1151 | spin_unlock(¤t->sighand->siglock); | ||
1152 | write_unlock_irq(&tasklist_lock); | ||
1153 | retval = -ERESTARTNOINTR; | ||
1154 | goto bad_fork_cleanup_namespace; | ||
1155 | } | ||
1156 | |||
1126 | if (clone_flags & CLONE_THREAD) { | 1157 | if (clone_flags & CLONE_THREAD) { |
1127 | spin_lock(¤t->sighand->siglock); | ||
1128 | /* | 1158 | /* |
1129 | * Important: if an exit-all has been started then | 1159 | * Important: if an exit-all has been started then |
1130 | * do not create this new thread - the whole thread | 1160 | * do not create this new thread - the whole thread |
@@ -1136,17 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1136 | retval = -EAGAIN; | 1166 | retval = -EAGAIN; |
1137 | goto bad_fork_cleanup_namespace; | 1167 | goto bad_fork_cleanup_namespace; |
1138 | } | 1168 | } |
1139 | p->group_leader = current->group_leader; | ||
1140 | 1169 | ||
1141 | if (current->signal->group_stop_count > 0) { | 1170 | p->group_leader = current->group_leader; |
1142 | /* | 1171 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); |
1143 | * There is an all-stop in progress for the group. | ||
1144 | * We ourselves will stop as soon as we check signals. | ||
1145 | * Make the new thread part of that group stop too. | ||
1146 | */ | ||
1147 | current->signal->group_stop_count++; | ||
1148 | set_tsk_thread_flag(p, TIF_SIGPENDING); | ||
1149 | } | ||
1150 | 1172 | ||
1151 | if (!cputime_eq(current->signal->it_virt_expires, | 1173 | if (!cputime_eq(current->signal->it_virt_expires, |
1152 | cputime_zero) || | 1174 | cputime_zero) || |
@@ -1162,8 +1184,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1162 | */ | 1184 | */ |
1163 | p->it_prof_expires = jiffies_to_cputime(1); | 1185 | p->it_prof_expires = jiffies_to_cputime(1); |
1164 | } | 1186 | } |
1165 | |||
1166 | spin_unlock(¤t->sighand->siglock); | ||
1167 | } | 1187 | } |
1168 | 1188 | ||
1169 | /* | 1189 | /* |
@@ -1171,24 +1191,27 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1171 | */ | 1191 | */ |
1172 | p->ioprio = current->ioprio; | 1192 | p->ioprio = current->ioprio; |
1173 | 1193 | ||
1174 | SET_LINKS(p); | 1194 | if (likely(p->pid)) { |
1175 | if (unlikely(p->ptrace & PT_PTRACED)) | 1195 | add_parent(p); |
1176 | __ptrace_link(p, current->parent); | 1196 | if (unlikely(p->ptrace & PT_PTRACED)) |
1177 | 1197 | __ptrace_link(p, current->parent); | |
1178 | attach_pid(p, PIDTYPE_PID, p->pid); | 1198 | |
1179 | attach_pid(p, PIDTYPE_TGID, p->tgid); | 1199 | if (thread_group_leader(p)) { |
1180 | if (thread_group_leader(p)) { | 1200 | p->signal->tty = current->signal->tty; |
1181 | p->signal->tty = current->signal->tty; | 1201 | p->signal->pgrp = process_group(current); |
1182 | p->signal->pgrp = process_group(current); | 1202 | p->signal->session = current->signal->session; |
1183 | p->signal->session = current->signal->session; | 1203 | attach_pid(p, PIDTYPE_PGID, process_group(p)); |
1184 | attach_pid(p, PIDTYPE_PGID, process_group(p)); | 1204 | attach_pid(p, PIDTYPE_SID, p->signal->session); |
1185 | attach_pid(p, PIDTYPE_SID, p->signal->session); | 1205 | |
1186 | if (p->pid) | 1206 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
1187 | __get_cpu_var(process_counts)++; | 1207 | __get_cpu_var(process_counts)++; |
1208 | } | ||
1209 | attach_pid(p, PIDTYPE_PID, p->pid); | ||
1210 | nr_threads++; | ||
1188 | } | 1211 | } |
1189 | 1212 | ||
1190 | nr_threads++; | ||
1191 | total_forks++; | 1213 | total_forks++; |
1214 | spin_unlock(¤t->sighand->siglock); | ||
1192 | write_unlock_irq(&tasklist_lock); | 1215 | write_unlock_irq(&tasklist_lock); |
1193 | proc_fork_connector(p); | 1216 | proc_fork_connector(p); |
1194 | return p; | 1217 | return p; |
@@ -1201,9 +1224,9 @@ bad_fork_cleanup_mm: | |||
1201 | if (p->mm) | 1224 | if (p->mm) |
1202 | mmput(p->mm); | 1225 | mmput(p->mm); |
1203 | bad_fork_cleanup_signal: | 1226 | bad_fork_cleanup_signal: |
1204 | exit_signal(p); | 1227 | cleanup_signal(p); |
1205 | bad_fork_cleanup_sighand: | 1228 | bad_fork_cleanup_sighand: |
1206 | exit_sighand(p); | 1229 | __cleanup_sighand(p->sighand); |
1207 | bad_fork_cleanup_fs: | 1230 | bad_fork_cleanup_fs: |
1208 | exit_fs(p); /* blocking */ | 1231 | exit_fs(p); /* blocking */ |
1209 | bad_fork_cleanup_files: | 1232 | bad_fork_cleanup_files: |
@@ -1250,7 +1273,7 @@ task_t * __devinit fork_idle(int cpu) | |||
1250 | if (!task) | 1273 | if (!task) |
1251 | return ERR_PTR(-ENOMEM); | 1274 | return ERR_PTR(-ENOMEM); |
1252 | init_idle(task, cpu); | 1275 | init_idle(task, cpu); |
1253 | unhash_process(task); | 1276 | |
1254 | return task; | 1277 | return task; |
1255 | } | 1278 | } |
1256 | 1279 | ||
@@ -1285,17 +1308,19 @@ long do_fork(unsigned long clone_flags, | |||
1285 | { | 1308 | { |
1286 | struct task_struct *p; | 1309 | struct task_struct *p; |
1287 | int trace = 0; | 1310 | int trace = 0; |
1288 | long pid = alloc_pidmap(); | 1311 | struct pid *pid = alloc_pid(); |
1312 | long nr; | ||
1289 | 1313 | ||
1290 | if (pid < 0) | 1314 | if (!pid) |
1291 | return -EAGAIN; | 1315 | return -EAGAIN; |
1316 | nr = pid->nr; | ||
1292 | if (unlikely(current->ptrace)) { | 1317 | if (unlikely(current->ptrace)) { |
1293 | trace = fork_traceflag (clone_flags); | 1318 | trace = fork_traceflag (clone_flags); |
1294 | if (trace) | 1319 | if (trace) |
1295 | clone_flags |= CLONE_PTRACE; | 1320 | clone_flags |= CLONE_PTRACE; |
1296 | } | 1321 | } |
1297 | 1322 | ||
1298 | p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); | 1323 | p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr); |
1299 | /* | 1324 | /* |
1300 | * Do this prior to waking up the new thread - the thread pointer | 1325 | * Do this prior to waking up the new thread - the thread pointer |
1301 | * might get invalid after that point, if the thread exits quickly. | 1326 | * might get invalid after that point, if the thread exits quickly. |
@@ -1322,7 +1347,7 @@ long do_fork(unsigned long clone_flags, | |||
1322 | p->state = TASK_STOPPED; | 1347 | p->state = TASK_STOPPED; |
1323 | 1348 | ||
1324 | if (unlikely (trace)) { | 1349 | if (unlikely (trace)) { |
1325 | current->ptrace_message = pid; | 1350 | current->ptrace_message = nr; |
1326 | ptrace_notify ((trace << 8) | SIGTRAP); | 1351 | ptrace_notify ((trace << 8) | SIGTRAP); |
1327 | } | 1352 | } |
1328 | 1353 | ||
@@ -1332,21 +1357,31 @@ long do_fork(unsigned long clone_flags, | |||
1332 | ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); | 1357 | ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); |
1333 | } | 1358 | } |
1334 | } else { | 1359 | } else { |
1335 | free_pidmap(pid); | 1360 | free_pid(pid); |
1336 | pid = PTR_ERR(p); | 1361 | nr = PTR_ERR(p); |
1337 | } | 1362 | } |
1338 | return pid; | 1363 | return nr; |
1339 | } | 1364 | } |
1340 | 1365 | ||
1341 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN | 1366 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN |
1342 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 | 1367 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 |
1343 | #endif | 1368 | #endif |
1344 | 1369 | ||
1370 | static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) | ||
1371 | { | ||
1372 | struct sighand_struct *sighand = data; | ||
1373 | |||
1374 | if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == | ||
1375 | SLAB_CTOR_CONSTRUCTOR) | ||
1376 | spin_lock_init(&sighand->siglock); | ||
1377 | } | ||
1378 | |||
1345 | void __init proc_caches_init(void) | 1379 | void __init proc_caches_init(void) |
1346 | { | 1380 | { |
1347 | sighand_cachep = kmem_cache_create("sighand_cache", | 1381 | sighand_cachep = kmem_cache_create("sighand_cache", |
1348 | sizeof(struct sighand_struct), 0, | 1382 | sizeof(struct sighand_struct), 0, |
1349 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1383 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, |
1384 | sighand_ctor, NULL); | ||
1350 | signal_cachep = kmem_cache_create("signal_cache", | 1385 | signal_cachep = kmem_cache_create("signal_cache", |
1351 | sizeof(struct signal_struct), 0, | 1386 | sizeof(struct signal_struct), 0, |
1352 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1387 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); |
@@ -1471,9 +1506,7 @@ static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) | |||
1471 | 1506 | ||
1472 | if ((unshare_flags & CLONE_VM) && | 1507 | if ((unshare_flags & CLONE_VM) && |
1473 | (mm && atomic_read(&mm->mm_users) > 1)) { | 1508 | (mm && atomic_read(&mm->mm_users) > 1)) { |
1474 | *new_mmp = dup_mm(current); | 1509 | return -EINVAL; |
1475 | if (!*new_mmp) | ||
1476 | return -ENOMEM; | ||
1477 | } | 1510 | } |
1478 | 1511 | ||
1479 | return 0; | 1512 | return 0; |
@@ -1529,6 +1562,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1529 | 1562 | ||
1530 | check_unshare_flags(&unshare_flags); | 1563 | check_unshare_flags(&unshare_flags); |
1531 | 1564 | ||
1565 | /* Return -EINVAL for all unsupported flags */ | ||
1566 | err = -EINVAL; | ||
1567 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | ||
1568 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) | ||
1569 | goto bad_unshare_out; | ||
1570 | |||
1532 | if ((err = unshare_thread(unshare_flags))) | 1571 | if ((err = unshare_thread(unshare_flags))) |
1533 | goto bad_unshare_out; | 1572 | goto bad_unshare_out; |
1534 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1573 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
@@ -1562,7 +1601,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1562 | 1601 | ||
1563 | if (new_sigh) { | 1602 | if (new_sigh) { |
1564 | sigh = current->sighand; | 1603 | sigh = current->sighand; |
1565 | current->sighand = new_sigh; | 1604 | rcu_assign_pointer(current->sighand, new_sigh); |
1566 | new_sigh = sigh; | 1605 | new_sigh = sigh; |
1567 | } | 1606 | } |
1568 | 1607 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index 5efa2f9780..5699c51205 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -8,6 +8,10 @@ | |||
8 | * Removed page pinning, fix privately mapped COW pages and other cleanups | 8 | * Removed page pinning, fix privately mapped COW pages and other cleanups |
9 | * (C) Copyright 2003, 2004 Jamie Lokier | 9 | * (C) Copyright 2003, 2004 Jamie Lokier |
10 | * | 10 | * |
11 | * Robust futex support started by Ingo Molnar | ||
12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | ||
13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | ||
14 | * | ||
11 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 15 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
12 | * enough at me, Linus for the original (flawed) idea, Matthew | 16 | * enough at me, Linus for the original (flawed) idea, Matthew |
13 | * Kirkwood for proof-of-concept implementation. | 17 | * Kirkwood for proof-of-concept implementation. |
@@ -829,6 +833,172 @@ error: | |||
829 | goto out; | 833 | goto out; |
830 | } | 834 | } |
831 | 835 | ||
836 | /* | ||
837 | * Support for robust futexes: the kernel cleans up held futexes at | ||
838 | * thread exit time. | ||
839 | * | ||
840 | * Implementation: user-space maintains a per-thread list of locks it | ||
841 | * is holding. Upon do_exit(), the kernel carefully walks this list, | ||
842 | * and marks all locks that are owned by this thread with the | ||
843 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is | ||
844 | * always manipulated with the lock held, so the list is private and | ||
845 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | ||
846 | * field, to allow the kernel to clean up if the thread dies after | ||
847 | * acquiring the lock, but just before it could have added itself to | ||
848 | * the list. There can only be one such pending lock. | ||
849 | */ | ||
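For orientation, the userspace side that this kernel-side walk relies on would be set up roughly as in the sketch below. This is a minimal illustration only, not part of the patch: the my_robust_lock layout, the register_robust_list() helper and the reliance on a SYS_set_robust_list syscall number are assumptions; the real glibc/NPTL integration lays out its lock records differently.

#include <linux/futex.h>	/* struct robust_list, struct robust_list_head */
#include <sys/syscall.h>
#include <unistd.h>
#include <stddef.h>

/* Hypothetical per-thread lock record: a list node plus the futex word. */
struct my_robust_lock {
	struct robust_list list;	/* linked into the per-thread robust list */
	int futex;			/* lock word the kernel inspects at exit */
};

static __thread struct robust_list_head robust_head;

/* Register an (initially empty, circular) robust list for this thread. */
static int register_robust_list(void)
{
	robust_head.list.next = &robust_head.list;
	robust_head.futex_offset = offsetof(struct my_robust_lock, futex)
				 - offsetof(struct my_robust_lock, list);
	robust_head.list_op_pending = NULL;

	/* SYS_set_robust_list is assumed to be defined for this architecture. */
	return syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
}

On every lock acquisition userspace would point list_op_pending at the lock's list node, splice the node into the list once it owns the futex word, and then clear list_op_pending again; that window is exactly what the list_op_pending handling in exit_robust_list() below accounts for.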
850 | |||
851 | /** | ||
852 | * sys_set_robust_list - set the robust-futex list head of a task | ||
853 | * @head: pointer to the list-head | ||
854 | * @len: length of the list-head, as userspace expects | ||
855 | */ | ||
856 | asmlinkage long | ||
857 | sys_set_robust_list(struct robust_list_head __user *head, | ||
858 | size_t len) | ||
859 | { | ||
860 | /* | ||
861 | * The kernel knows only one size for now: | ||
862 | */ | ||
863 | if (unlikely(len != sizeof(*head))) | ||
864 | return -EINVAL; | ||
865 | |||
866 | current->robust_list = head; | ||
867 | |||
868 | return 0; | ||
869 | } | ||
870 | |||
871 | /** | ||
872 | * sys_get_robust_list - get the robust-futex list head of a task | ||
873 | * @pid: pid of the process [zero for current task] | ||
874 | * @head_ptr: pointer to a list-head pointer, the kernel fills it in | ||
875 | * @len_ptr: pointer to a length field, the kernel fills in the header size | ||
876 | */ | ||
877 | asmlinkage long | ||
878 | sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, | ||
879 | size_t __user *len_ptr) | ||
880 | { | ||
881 | struct robust_list_head *head; | ||
882 | unsigned long ret; | ||
883 | |||
884 | if (!pid) | ||
885 | head = current->robust_list; | ||
886 | else { | ||
887 | struct task_struct *p; | ||
888 | |||
889 | ret = -ESRCH; | ||
890 | read_lock(&tasklist_lock); | ||
891 | p = find_task_by_pid(pid); | ||
892 | if (!p) | ||
893 | goto err_unlock; | ||
894 | ret = -EPERM; | ||
895 | if ((current->euid != p->euid) && (current->euid != p->uid) && | ||
896 | !capable(CAP_SYS_PTRACE)) | ||
897 | goto err_unlock; | ||
898 | head = p->robust_list; | ||
899 | read_unlock(&tasklist_lock); | ||
900 | } | ||
901 | |||
902 | if (put_user(sizeof(*head), len_ptr)) | ||
903 | return -EFAULT; | ||
904 | return put_user(head, head_ptr); | ||
905 | |||
906 | err_unlock: | ||
907 | read_unlock(&tasklist_lock); | ||
908 | |||
909 | return ret; | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Process a futex-list entry, check whether it's owned by the | ||
914 | * dying task, and do notification if so: | ||
915 | */ | ||
916 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) | ||
917 | { | ||
918 | u32 uval; | ||
919 | |||
920 | retry: | ||
921 | if (get_user(uval, uaddr)) | ||
922 | return -1; | ||
923 | |||
924 | if ((uval & FUTEX_TID_MASK) == curr->pid) { | ||
925 | /* | ||
926 | * Ok, this dying thread is truly holding a futex | ||
927 | * of interest. Set the OWNER_DIED bit atomically | ||
928 | * via cmpxchg, and if the value had FUTEX_WAITERS | ||
929 | * set, wake up a waiter (if any). (We have to do a | ||
930 | * futex_wake() even if OWNER_DIED is already set - | ||
931 | * to handle the rare but possible case of recursive | ||
932 | * thread-death.) The rest of the cleanup is done in | ||
933 | * userspace. | ||
934 | */ | ||
935 | if (futex_atomic_cmpxchg_inatomic(uaddr, uval, | ||
936 | uval | FUTEX_OWNER_DIED) != uval) | ||
937 | goto retry; | ||
938 | |||
939 | if (uval & FUTEX_WAITERS) | ||
940 | futex_wake((unsigned long)uaddr, 1); | ||
941 | } | ||
942 | return 0; | ||
943 | } | ||
944 | |||
945 | /* | ||
946 | * Walk curr->robust_list (very carefully, it's a userspace list!) | ||
947 | * and mark any locks found there dead, and notify any waiters. | ||
948 | * | ||
949 | * We silently return on any sign of list-walking problem. | ||
950 | */ | ||
951 | void exit_robust_list(struct task_struct *curr) | ||
952 | { | ||
953 | struct robust_list_head __user *head = curr->robust_list; | ||
954 | struct robust_list __user *entry, *pending; | ||
955 | unsigned int limit = ROBUST_LIST_LIMIT; | ||
956 | unsigned long futex_offset; | ||
957 | |||
958 | /* | ||
959 | * Fetch the list head (which was registered earlier, via | ||
960 | * sys_set_robust_list()): | ||
961 | */ | ||
962 | if (get_user(entry, &head->list.next)) | ||
963 | return; | ||
964 | /* | ||
965 | * Fetch the relative futex offset: | ||
966 | */ | ||
967 | if (get_user(futex_offset, &head->futex_offset)) | ||
968 | return; | ||
969 | /* | ||
970 | * Fetch any possibly pending lock-add first, and handle it | ||
971 | * if it exists: | ||
972 | */ | ||
973 | if (get_user(pending, &head->list_op_pending)) | ||
974 | return; | ||
975 | if (pending) | ||
976 | handle_futex_death((void *)pending + futex_offset, curr); | ||
977 | |||
978 | while (entry != &head->list) { | ||
979 | /* | ||
980 | * A pending lock might already be on the list, so | ||
981 | * dont process it twice: | ||
982 | */ | ||
983 | if (entry != pending) | ||
984 | if (handle_futex_death((void *)entry + futex_offset, | ||
985 | curr)) | ||
986 | return; | ||
987 | /* | ||
988 | * Fetch the next entry in the list: | ||
989 | */ | ||
990 | if (get_user(entry, &entry->next)) | ||
991 | return; | ||
992 | /* | ||
993 | * Avoid excessively long or circular lists: | ||
994 | */ | ||
995 | if (!--limit) | ||
996 | break; | ||
997 | |||
998 | cond_resched(); | ||
999 | } | ||
1000 | } | ||
1001 | |||
832 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 1002 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, |
833 | unsigned long uaddr2, int val2, int val3) | 1003 | unsigned long uaddr2, int val2, int val3) |
834 | { | 1004 | { |
@@ -869,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, | |||
869 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 1039 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
870 | int val2 = 0; | 1040 | int val2 = 0; |
871 | 1041 | ||
872 | if ((op == FUTEX_WAIT) && utime) { | 1042 | if (utime && (op == FUTEX_WAIT)) { |
873 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | 1043 | if (copy_from_user(&t, utime, sizeof(t)) != 0) |
874 | return -EFAULT; | 1044 | return -EFAULT; |
1045 | if (!timespec_valid(&t)) | ||
1046 | return -EINVAL; | ||
875 | timeout = timespec_to_jiffies(&t) + 1; | 1047 | timeout = timespec_to_jiffies(&t) + 1; |
876 | } | 1048 | } |
877 | /* | 1049 | /* |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c new file mode 100644 index 0000000000..1ab6a0ea3d --- /dev/null +++ b/kernel/futex_compat.c | |||
@@ -0,0 +1,144 @@ | |||
1 | /* | ||
2 | * linux/kernel/futex_compat.c | ||
3 | * | ||
4 | * Futex compatibility routines. | ||
5 | * | ||
6 | * Copyright 2006, Red Hat, Inc., Ingo Molnar | ||
7 | */ | ||
8 | |||
9 | #include <linux/linkage.h> | ||
10 | #include <linux/compat.h> | ||
11 | #include <linux/futex.h> | ||
12 | |||
13 | #include <asm/uaccess.h> | ||
14 | |||
15 | /* | ||
16 | * Walk curr->robust_list (very carefully, it's a userspace list!) | ||
17 | * and mark any locks found there dead, and notify any waiters. | ||
18 | * | ||
19 | * We silently return on any sign of list-walking problem. | ||
20 | */ | ||
21 | void compat_exit_robust_list(struct task_struct *curr) | ||
22 | { | ||
23 | struct compat_robust_list_head __user *head = curr->compat_robust_list; | ||
24 | struct robust_list __user *entry, *pending; | ||
25 | compat_uptr_t uentry, upending; | ||
26 | unsigned int limit = ROBUST_LIST_LIMIT; | ||
27 | compat_long_t futex_offset; | ||
28 | |||
29 | /* | ||
30 | * Fetch the list head (which was registered earlier, via | ||
31 | * sys_set_robust_list()): | ||
32 | */ | ||
33 | if (get_user(uentry, &head->list.next)) | ||
34 | return; | ||
35 | entry = compat_ptr(uentry); | ||
36 | /* | ||
37 | * Fetch the relative futex offset: | ||
38 | */ | ||
39 | if (get_user(futex_offset, &head->futex_offset)) | ||
40 | return; | ||
41 | /* | ||
42 | * Fetch any possibly pending lock-add first, and handle it | ||
43 | * if it exists: | ||
44 | */ | ||
45 | if (get_user(upending, &head->list_op_pending)) | ||
46 | return; | ||
47 | pending = compat_ptr(upending); | ||
48 | if (upending) | ||
49 | handle_futex_death((void *)pending + futex_offset, curr); | ||
50 | |||
51 | while (compat_ptr(uentry) != &head->list) { | ||
52 | /* | ||
53 | * A pending lock might already be on the list, so | ||
54 | * don't process it twice: | ||
55 | */ | ||
56 | if (entry != pending) | ||
57 | if (handle_futex_death((void *)entry + futex_offset, | ||
58 | curr)) | ||
59 | return; | ||
60 | |||
61 | /* | ||
62 | * Fetch the next entry in the list: | ||
63 | */ | ||
64 | if (get_user(uentry, (compat_uptr_t *)&entry->next)) | ||
65 | return; | ||
66 | entry = compat_ptr(uentry); | ||
67 | /* | ||
68 | * Avoid excessively long or circular lists: | ||
69 | */ | ||
70 | if (!--limit) | ||
71 | break; | ||
72 | |||
73 | cond_resched(); | ||
74 | } | ||
75 | } | ||
76 | |||
77 | asmlinkage long | ||
78 | compat_sys_set_robust_list(struct compat_robust_list_head __user *head, | ||
79 | compat_size_t len) | ||
80 | { | ||
81 | if (unlikely(len != sizeof(*head))) | ||
82 | return -EINVAL; | ||
83 | |||
84 | current->compat_robust_list = head; | ||
85 | |||
86 | return 0; | ||
87 | } | ||
88 | |||
89 | asmlinkage long | ||
90 | compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr, | ||
91 | compat_size_t __user *len_ptr) | ||
92 | { | ||
93 | struct compat_robust_list_head *head; | ||
94 | unsigned long ret; | ||
95 | |||
96 | if (!pid) | ||
97 | head = current->compat_robust_list; | ||
98 | else { | ||
99 | struct task_struct *p; | ||
100 | |||
101 | ret = -ESRCH; | ||
102 | read_lock(&tasklist_lock); | ||
103 | p = find_task_by_pid(pid); | ||
104 | if (!p) | ||
105 | goto err_unlock; | ||
106 | ret = -EPERM; | ||
107 | if ((current->euid != p->euid) && (current->euid != p->uid) && | ||
108 | !capable(CAP_SYS_PTRACE)) | ||
109 | goto err_unlock; | ||
110 | head = p->compat_robust_list; | ||
111 | read_unlock(&tasklist_lock); | ||
112 | } | ||
113 | |||
114 | if (put_user(sizeof(*head), len_ptr)) | ||
115 | return -EFAULT; | ||
116 | return put_user(ptr_to_compat(head), head_ptr); | ||
117 | |||
118 | err_unlock: | ||
119 | read_unlock(&tasklist_lock); | ||
120 | |||
121 | return ret; | ||
122 | } | ||
123 | |||
124 | asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | ||
125 | struct compat_timespec __user *utime, u32 __user *uaddr2, | ||
126 | u32 val3) | ||
127 | { | ||
128 | struct timespec t; | ||
129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | ||
130 | int val2 = 0; | ||
131 | |||
132 | if (utime && (op == FUTEX_WAIT)) { | ||
133 | if (get_compat_timespec(&t, utime)) | ||
134 | return -EFAULT; | ||
135 | if (!timespec_valid(&t)) | ||
136 | return -EINVAL; | ||
137 | timeout = timespec_to_jiffies(&t) + 1; | ||
138 | } | ||
139 | if (op >= FUTEX_REQUEUE) | ||
140 | val2 = (int) (unsigned long) utime; | ||
141 | |||
142 | return do_futex((unsigned long)uaddr, op, val, timeout, | ||
143 | (unsigned long)uaddr2, val2, val3); | ||
144 | } | ||
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2b6e1757ae..01fa2ae98a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -123,6 +123,26 @@ void ktime_get_ts(struct timespec *ts) | |||
123 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 123 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
124 | 124 | ||
125 | /* | 125 | /* |
126 | * Get the coarse grained time at the softirq based on xtime and | ||
127 | * wall_to_monotonic. | ||
128 | */ | ||
129 | static void hrtimer_get_softirq_time(struct hrtimer_base *base) | ||
130 | { | ||
131 | ktime_t xtim, tomono; | ||
132 | unsigned long seq; | ||
133 | |||
134 | do { | ||
135 | seq = read_seqbegin(&xtime_lock); | ||
136 | xtim = timespec_to_ktime(xtime); | ||
137 | tomono = timespec_to_ktime(wall_to_monotonic); | ||
138 | |||
139 | } while (read_seqretry(&xtime_lock, seq)); | ||
140 | |||
141 | base[CLOCK_REALTIME].softirq_time = xtim; | ||
142 | base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); | ||
143 | } | ||
144 | |||
145 | /* | ||
126 | * Functions and macros which are different for UP/SMP systems are kept in a | 146 | * Functions and macros which are different for UP/SMP systems are kept in a |
127 | * single place | 147 | * single place |
128 | */ | 148 | */ |
@@ -246,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) | |||
246 | /* | 266 | /* |
247 | * Divide a ktime value by a nanosecond value | 267 | * Divide a ktime value by a nanosecond value |
248 | */ | 268 | */ |
249 | static unsigned long ktime_divns(const ktime_t kt, nsec_t div) | 269 | static unsigned long ktime_divns(const ktime_t kt, s64 div) |
250 | { | 270 | { |
251 | u64 dclc, inc, dns; | 271 | u64 dclc, inc, dns; |
252 | int sft = 0; | 272 | int sft = 0; |
@@ -281,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
281 | * hrtimer_forward - forward the timer expiry | 301 | * hrtimer_forward - forward the timer expiry |
282 | * | 302 | * |
283 | * @timer: hrtimer to forward | 303 | * @timer: hrtimer to forward |
304 | * @now: forward past this time | ||
284 | * @interval: the interval to forward | 305 | * @interval: the interval to forward |
285 | * | 306 | * |
286 | * Forward the timer expiry so it will expire in the future. | 307 | * Forward the timer expiry so it will expire in the future. |
287 | * Returns the number of overruns. | 308 | * Returns the number of overruns. |
288 | */ | 309 | */ |
289 | unsigned long | 310 | unsigned long |
290 | hrtimer_forward(struct hrtimer *timer, ktime_t interval) | 311 | hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) |
291 | { | 312 | { |
292 | unsigned long orun = 1; | 313 | unsigned long orun = 1; |
293 | ktime_t delta, now; | 314 | ktime_t delta; |
294 | |||
295 | now = timer->base->get_time(); | ||
296 | 315 | ||
297 | delta = ktime_sub(now, timer->expires); | 316 | delta = ktime_sub(now, timer->expires); |
298 | 317 | ||
@@ -303,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t interval) | |||
303 | interval.tv64 = timer->base->resolution.tv64; | 322 | interval.tv64 = timer->base->resolution.tv64; |
304 | 323 | ||
305 | if (unlikely(delta.tv64 >= interval.tv64)) { | 324 | if (unlikely(delta.tv64 >= interval.tv64)) { |
306 | nsec_t incr = ktime_to_ns(interval); | 325 | s64 incr = ktime_to_ns(interval); |
307 | 326 | ||
308 | orun = ktime_divns(delta, incr); | 327 | orun = ktime_divns(delta, incr); |
309 | timer->expires = ktime_add_ns(timer->expires, incr * orun); | 328 | timer->expires = ktime_add_ns(timer->expires, incr * orun); |
@@ -355,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
355 | rb_link_node(&timer->node, parent, link); | 374 | rb_link_node(&timer->node, parent, link); |
356 | rb_insert_color(&timer->node, &base->active); | 375 | rb_insert_color(&timer->node, &base->active); |
357 | 376 | ||
358 | timer->state = HRTIMER_PENDING; | ||
359 | |||
360 | if (!base->first || timer->expires.tv64 < | 377 | if (!base->first || timer->expires.tv64 < |
361 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) | 378 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) |
362 | base->first = &timer->node; | 379 | base->first = &timer->node; |
@@ -376,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
376 | if (base->first == &timer->node) | 393 | if (base->first == &timer->node) |
377 | base->first = rb_next(&timer->node); | 394 | base->first = rb_next(&timer->node); |
378 | rb_erase(&timer->node, &base->active); | 395 | rb_erase(&timer->node, &base->active); |
396 | timer->node.rb_parent = HRTIMER_INACTIVE; | ||
379 | } | 397 | } |
380 | 398 | ||
381 | /* | 399 | /* |
@@ -386,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
386 | { | 404 | { |
387 | if (hrtimer_active(timer)) { | 405 | if (hrtimer_active(timer)) { |
388 | __remove_hrtimer(timer, base); | 406 | __remove_hrtimer(timer, base); |
389 | timer->state = HRTIMER_INACTIVE; | ||
390 | return 1; | 407 | return 1; |
391 | } | 408 | } |
392 | return 0; | 409 | return 0; |
@@ -418,8 +435,19 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
418 | /* Switch the timer base, if necessary: */ | 435 | /* Switch the timer base, if necessary: */ |
419 | new_base = switch_hrtimer_base(timer, base); | 436 | new_base = switch_hrtimer_base(timer, base); |
420 | 437 | ||
421 | if (mode == HRTIMER_REL) | 438 | if (mode == HRTIMER_REL) { |
422 | tim = ktime_add(tim, new_base->get_time()); | 439 | tim = ktime_add(tim, new_base->get_time()); |
440 | /* | ||
441 | * CONFIG_TIME_LOW_RES is a temporary way for architectures | ||
442 | * to signal that they simply return xtime in | ||
443 | * do_gettimeoffset(). In this case we want to round up by | ||
444 | * resolution when starting a relative timer, to avoid short | ||
445 | * timeouts. This will go away with the GTOD framework. | ||
446 | */ | ||
447 | #ifdef CONFIG_TIME_LOW_RES | ||
448 | tim = ktime_add(tim, base->resolution); | ||
449 | #endif | ||
450 | } | ||
423 | timer->expires = tim; | 451 | timer->expires = tim; |
424 | 452 | ||
425 | enqueue_hrtimer(timer, new_base); | 453 | enqueue_hrtimer(timer, new_base); |
@@ -428,6 +456,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
428 | 456 | ||
429 | return ret; | 457 | return ret; |
430 | } | 458 | } |
459 | EXPORT_SYMBOL_GPL(hrtimer_start); | ||
431 | 460 | ||
432 | /** | 461 | /** |
433 | * hrtimer_try_to_cancel - try to deactivate a timer | 462 | * hrtimer_try_to_cancel - try to deactivate a timer |
@@ -456,6 +485,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) | |||
456 | return ret; | 485 | return ret; |
457 | 486 | ||
458 | } | 487 | } |
488 | EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); | ||
459 | 489 | ||
460 | /** | 490 | /** |
461 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. | 491 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. |
@@ -473,8 +503,10 @@ int hrtimer_cancel(struct hrtimer *timer) | |||
473 | 503 | ||
474 | if (ret >= 0) | 504 | if (ret >= 0) |
475 | return ret; | 505 | return ret; |
506 | cpu_relax(); | ||
476 | } | 507 | } |
477 | } | 508 | } |
509 | EXPORT_SYMBOL_GPL(hrtimer_cancel); | ||
478 | 510 | ||
479 | /** | 511 | /** |
480 | * hrtimer_get_remaining - get remaining time for the timer | 512 | * hrtimer_get_remaining - get remaining time for the timer |
@@ -493,6 +525,42 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | |||
493 | 525 | ||
494 | return rem; | 526 | return rem; |
495 | } | 527 | } |
528 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | ||
529 | |||
530 | #ifdef CONFIG_NO_IDLE_HZ | ||
531 | /** | ||
532 | * hrtimer_get_next_event - get the time until next expiry event | ||
533 | * | ||
534 | * Returns the delta to the next expiry event or KTIME_MAX if no timer | ||
535 | * is pending. | ||
536 | */ | ||
537 | ktime_t hrtimer_get_next_event(void) | ||
538 | { | ||
539 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); | ||
540 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; | ||
541 | unsigned long flags; | ||
542 | int i; | ||
543 | |||
544 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { | ||
545 | struct hrtimer *timer; | ||
546 | |||
547 | spin_lock_irqsave(&base->lock, flags); | ||
548 | if (!base->first) { | ||
549 | spin_unlock_irqrestore(&base->lock, flags); | ||
550 | continue; | ||
551 | } | ||
552 | timer = rb_entry(base->first, struct hrtimer, node); | ||
553 | delta.tv64 = timer->expires.tv64; | ||
554 | spin_unlock_irqrestore(&base->lock, flags); | ||
555 | delta = ktime_sub(delta, base->get_time()); | ||
556 | if (delta.tv64 < mindelta.tv64) | ||
557 | mindelta.tv64 = delta.tv64; | ||
558 | } | ||
559 | if (mindelta.tv64 < 0) | ||
560 | mindelta.tv64 = 0; | ||
561 | return mindelta; | ||
562 | } | ||
563 | #endif | ||
496 | 564 | ||
497 | /** | 565 | /** |
498 | * hrtimer_init - initialize a timer to the given clock | 566 | * hrtimer_init - initialize a timer to the given clock |
@@ -514,7 +582,9 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
514 | clock_id = CLOCK_MONOTONIC; | 582 | clock_id = CLOCK_MONOTONIC; |
515 | 583 | ||
516 | timer->base = &bases[clock_id]; | 584 | timer->base = &bases[clock_id]; |
585 | timer->node.rb_parent = HRTIMER_INACTIVE; | ||
517 | } | 586 | } |
587 | EXPORT_SYMBOL_GPL(hrtimer_init); | ||
518 | 588 | ||
519 | /** | 589 | /** |
520 | * hrtimer_get_res - get the timer resolution for a clock | 590 | * hrtimer_get_res - get the timer resolution for a clock |
@@ -534,54 +604,45 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
534 | 604 | ||
535 | return 0; | 605 | return 0; |
536 | } | 606 | } |
607 | EXPORT_SYMBOL_GPL(hrtimer_get_res); | ||
537 | 608 | ||
538 | /* | 609 | /* |
539 | * Expire the per base hrtimer-queue: | 610 | * Expire the per base hrtimer-queue: |
540 | */ | 611 | */ |
541 | static inline void run_hrtimer_queue(struct hrtimer_base *base) | 612 | static inline void run_hrtimer_queue(struct hrtimer_base *base) |
542 | { | 613 | { |
543 | ktime_t now = base->get_time(); | ||
544 | struct rb_node *node; | 614 | struct rb_node *node; |
545 | 615 | ||
616 | if (!base->first) | ||
617 | return; | ||
618 | |||
619 | if (base->get_softirq_time) | ||
620 | base->softirq_time = base->get_softirq_time(); | ||
621 | |||
546 | spin_lock_irq(&base->lock); | 622 | spin_lock_irq(&base->lock); |
547 | 623 | ||
548 | while ((node = base->first)) { | 624 | while ((node = base->first)) { |
549 | struct hrtimer *timer; | 625 | struct hrtimer *timer; |
550 | int (*fn)(void *); | 626 | int (*fn)(struct hrtimer *); |
551 | int restart; | 627 | int restart; |
552 | void *data; | ||
553 | 628 | ||
554 | timer = rb_entry(node, struct hrtimer, node); | 629 | timer = rb_entry(node, struct hrtimer, node); |
555 | if (now.tv64 <= timer->expires.tv64) | 630 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
556 | break; | 631 | break; |
557 | 632 | ||
558 | fn = timer->function; | 633 | fn = timer->function; |
559 | data = timer->data; | ||
560 | set_curr_timer(base, timer); | 634 | set_curr_timer(base, timer); |
561 | timer->state = HRTIMER_RUNNING; | ||
562 | __remove_hrtimer(timer, base); | 635 | __remove_hrtimer(timer, base); |
563 | spin_unlock_irq(&base->lock); | 636 | spin_unlock_irq(&base->lock); |
564 | 637 | ||
565 | /* | 638 | restart = fn(timer); |
566 | * fn == NULL is special case for the simplest timer | ||
567 | * variant - wake up process and do not restart: | ||
568 | */ | ||
569 | if (!fn) { | ||
570 | wake_up_process(data); | ||
571 | restart = HRTIMER_NORESTART; | ||
572 | } else | ||
573 | restart = fn(data); | ||
574 | 639 | ||
575 | spin_lock_irq(&base->lock); | 640 | spin_lock_irq(&base->lock); |
576 | 641 | ||
577 | /* Another CPU has added back the timer */ | 642 | if (restart != HRTIMER_NORESTART) { |
578 | if (timer->state != HRTIMER_RUNNING) | 643 | BUG_ON(hrtimer_active(timer)); |
579 | continue; | ||
580 | |||
581 | if (restart == HRTIMER_RESTART) | ||
582 | enqueue_hrtimer(timer, base); | 644 | enqueue_hrtimer(timer, base); |
583 | else | 645 | } |
584 | timer->state = HRTIMER_EXPIRED; | ||
585 | } | 646 | } |
586 | set_curr_timer(base, NULL); | 647 | set_curr_timer(base, NULL); |
587 | spin_unlock_irq(&base->lock); | 648 | spin_unlock_irq(&base->lock); |
@@ -595,6 +656,8 @@ void hrtimer_run_queues(void) | |||
595 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); | 656 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); |
596 | int i; | 657 | int i; |
597 | 658 | ||
659 | hrtimer_get_softirq_time(base); | ||
660 | |||
598 | for (i = 0; i < MAX_HRTIMER_BASES; i++) | 661 | for (i = 0; i < MAX_HRTIMER_BASES; i++) |
599 | run_hrtimer_queue(&base[i]); | 662 | run_hrtimer_queue(&base[i]); |
600 | } | 663 | } |
@@ -602,80 +665,69 @@ void hrtimer_run_queues(void) | |||
602 | /* | 665 | /* |
603 | * Sleep related functions: | 666 | * Sleep related functions: |
604 | */ | 667 | */ |
605 | 668 | static int hrtimer_wakeup(struct hrtimer *timer) | |
606 | /** | ||
607 | * schedule_hrtimer - sleep until timeout | ||
608 | * | ||
609 | * @timer: hrtimer variable initialized with the correct clock base | ||
610 | * @mode: timeout value is abs/rel | ||
611 | * | ||
612 | * Make the current task sleep until @timeout is | ||
613 | * elapsed. | ||
614 | * | ||
615 | * You can set the task state as follows - | ||
616 | * | ||
617 | * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to | ||
618 | * pass before the routine returns. The routine will return 0 | ||
619 | * | ||
620 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | ||
621 | * delivered to the current task. In this case the remaining time | ||
622 | * will be returned | ||
623 | * | ||
624 | * The current task state is guaranteed to be TASK_RUNNING when this | ||
625 | * routine returns. | ||
626 | */ | ||
627 | static ktime_t __sched | ||
628 | schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) | ||
629 | { | 669 | { |
630 | /* fn stays NULL, meaning single-shot wakeup: */ | 670 | struct hrtimer_sleeper *t = |
631 | timer->data = current; | 671 | container_of(timer, struct hrtimer_sleeper, timer); |
672 | struct task_struct *task = t->task; | ||
632 | 673 | ||
633 | hrtimer_start(timer, timer->expires, mode); | 674 | t->task = NULL; |
675 | if (task) | ||
676 | wake_up_process(task); | ||
634 | 677 | ||
635 | schedule(); | 678 | return HRTIMER_NORESTART; |
636 | hrtimer_cancel(timer); | 679 | } |
637 | 680 | ||
638 | /* Return the remaining time: */ | 681 | void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task) |
639 | if (timer->state != HRTIMER_EXPIRED) | 682 | { |
640 | return ktime_sub(timer->expires, timer->base->get_time()); | 683 | sl->timer.function = hrtimer_wakeup; |
641 | else | 684 | sl->task = task; |
642 | return (ktime_t) {.tv64 = 0 }; | ||
643 | } | 685 | } |
644 | 686 | ||
645 | static inline ktime_t __sched | 687 | static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) |
646 | schedule_hrtimer_interruptible(struct hrtimer *timer, | ||
647 | const enum hrtimer_mode mode) | ||
648 | { | 688 | { |
649 | set_current_state(TASK_INTERRUPTIBLE); | 689 | hrtimer_init_sleeper(t, current); |
690 | |||
691 | do { | ||
692 | set_current_state(TASK_INTERRUPTIBLE); | ||
693 | hrtimer_start(&t->timer, t->timer.expires, mode); | ||
694 | |||
695 | schedule(); | ||
696 | |||
697 | hrtimer_cancel(&t->timer); | ||
698 | mode = HRTIMER_ABS; | ||
650 | 699 | ||
651 | return schedule_hrtimer(timer, mode); | 700 | } while (t->task && !signal_pending(current)); |
701 | |||
702 | return t->task == NULL; | ||
652 | } | 703 | } |
653 | 704 | ||
654 | static long __sched nanosleep_restart(struct restart_block *restart) | 705 | static long __sched nanosleep_restart(struct restart_block *restart) |
655 | { | 706 | { |
707 | struct hrtimer_sleeper t; | ||
656 | struct timespec __user *rmtp; | 708 | struct timespec __user *rmtp; |
657 | struct timespec tu; | 709 | struct timespec tu; |
658 | void *rfn_save = restart->fn; | 710 | ktime_t time; |
659 | struct hrtimer timer; | ||
660 | ktime_t rem; | ||
661 | 711 | ||
662 | restart->fn = do_no_restart_syscall; | 712 | restart->fn = do_no_restart_syscall; |
663 | 713 | ||
664 | hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); | 714 | hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS); |
665 | 715 | t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; | |
666 | timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; | ||
667 | |||
668 | rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS); | ||
669 | 716 | ||
670 | if (rem.tv64 <= 0) | 717 | if (do_nanosleep(&t, HRTIMER_ABS)) |
671 | return 0; | 718 | return 0; |
672 | 719 | ||
673 | rmtp = (struct timespec __user *) restart->arg2; | 720 | rmtp = (struct timespec __user *) restart->arg2; |
674 | tu = ktime_to_timespec(rem); | 721 | if (rmtp) { |
675 | if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) | 722 | time = ktime_sub(t.timer.expires, t.timer.base->get_time()); |
676 | return -EFAULT; | 723 | if (time.tv64 <= 0) |
724 | return 0; | ||
725 | tu = ktime_to_timespec(time); | ||
726 | if (copy_to_user(rmtp, &tu, sizeof(tu))) | ||
727 | return -EFAULT; | ||
728 | } | ||
677 | 729 | ||
678 | restart->fn = rfn_save; | 730 | restart->fn = nanosleep_restart; |
679 | 731 | ||
680 | /* The other values in restart are already filled in */ | 732 | /* The other values in restart are already filled in */ |
681 | return -ERESTART_RESTARTBLOCK; | 733 | return -ERESTART_RESTARTBLOCK; |
@@ -685,33 +737,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
685 | const enum hrtimer_mode mode, const clockid_t clockid) | 737 | const enum hrtimer_mode mode, const clockid_t clockid) |
686 | { | 738 | { |
687 | struct restart_block *restart; | 739 | struct restart_block *restart; |
688 | struct hrtimer timer; | 740 | struct hrtimer_sleeper t; |
689 | struct timespec tu; | 741 | struct timespec tu; |
690 | ktime_t rem; | 742 | ktime_t rem; |
691 | 743 | ||
692 | hrtimer_init(&timer, clockid, mode); | 744 | hrtimer_init(&t.timer, clockid, mode); |
693 | 745 | t.timer.expires = timespec_to_ktime(*rqtp); | |
694 | timer.expires = timespec_to_ktime(*rqtp); | 746 | if (do_nanosleep(&t, mode)) |
695 | |||
696 | rem = schedule_hrtimer_interruptible(&timer, mode); | ||
697 | if (rem.tv64 <= 0) | ||
698 | return 0; | 747 | return 0; |
699 | 748 | ||
700 | /* Absolute timers do not update the rmtp value and restart: */ | 749 | /* Absolute timers do not update the rmtp value and restart: */ |
701 | if (mode == HRTIMER_ABS) | 750 | if (mode == HRTIMER_ABS) |
702 | return -ERESTARTNOHAND; | 751 | return -ERESTARTNOHAND; |
703 | 752 | ||
704 | tu = ktime_to_timespec(rem); | 753 | if (rmtp) { |
705 | 754 | rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); | |
706 | if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) | 755 | if (rem.tv64 <= 0) |
707 | return -EFAULT; | 756 | return 0; |
757 | tu = ktime_to_timespec(rem); | ||
758 | if (copy_to_user(rmtp, &tu, sizeof(tu))) | ||
759 | return -EFAULT; | ||
760 | } | ||
708 | 761 | ||
709 | restart = ¤t_thread_info()->restart_block; | 762 | restart = ¤t_thread_info()->restart_block; |
710 | restart->fn = nanosleep_restart; | 763 | restart->fn = nanosleep_restart; |
711 | restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; | 764 | restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF; |
712 | restart->arg1 = timer.expires.tv64 >> 32; | 765 | restart->arg1 = t.timer.expires.tv64 >> 32; |
713 | restart->arg2 = (unsigned long) rmtp; | 766 | restart->arg2 = (unsigned long) rmtp; |
714 | restart->arg3 = (unsigned long) timer.base->index; | 767 | restart->arg3 = (unsigned long) t.timer.base->index; |
715 | 768 | ||
716 | return -ERESTART_RESTARTBLOCK; | 769 | return -ERESTART_RESTARTBLOCK; |
717 | } | 770 | } |
@@ -789,7 +842,7 @@ static void migrate_hrtimers(int cpu) | |||
789 | } | 842 | } |
790 | #endif /* CONFIG_HOTPLUG_CPU */ | 843 | #endif /* CONFIG_HOTPLUG_CPU */ |
791 | 844 | ||
792 | static int __devinit hrtimer_cpu_notify(struct notifier_block *self, | 845 | static int hrtimer_cpu_notify(struct notifier_block *self, |
793 | unsigned long action, void *hcpu) | 846 | unsigned long action, void *hcpu) |
794 | { | 847 | { |
795 | long cpu = (long)hcpu; | 848 | long cpu = (long)hcpu; |
@@ -813,7 +866,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self, | |||
813 | return NOTIFY_OK; | 866 | return NOTIFY_OK; |
814 | } | 867 | } |
815 | 868 | ||
816 | static struct notifier_block __devinitdata hrtimers_nb = { | 869 | static struct notifier_block hrtimers_nb = { |
817 | .notifier_call = hrtimer_cpu_notify, | 870 | .notifier_call = hrtimer_cpu_notify, |
818 | }; | 871 | }; |
819 | 872 | ||
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 49378738ff..9f77f50d81 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -2,4 +2,4 @@ | |||
2 | obj-y := handle.o manage.o spurious.o | 2 | obj-y := handle.o manage.o spurious.o |
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 4 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 97d5559997..1279e34995 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -204,10 +204,14 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
204 | p = &desc->action; | 204 | p = &desc->action; |
205 | if ((old = *p) != NULL) { | 205 | if ((old = *p) != NULL) { |
206 | /* Can't share interrupts unless both agree to */ | 206 | /* Can't share interrupts unless both agree to */ |
207 | if (!(old->flags & new->flags & SA_SHIRQ)) { | 207 | if (!(old->flags & new->flags & SA_SHIRQ)) |
208 | spin_unlock_irqrestore(&desc->lock,flags); | 208 | goto mismatch; |
209 | return -EBUSY; | 209 | |
210 | } | 210 | #if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) |
211 | /* All handlers must agree on per-cpuness */ | ||
212 | if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) | ||
213 | goto mismatch; | ||
214 | #endif | ||
211 | 215 | ||
212 | /* add new interrupt at end of irq queue */ | 216 | /* add new interrupt at end of irq queue */ |
213 | do { | 217 | do { |
@@ -218,7 +222,10 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
218 | } | 222 | } |
219 | 223 | ||
220 | *p = new; | 224 | *p = new; |
221 | 225 | #if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) | |
226 | if (new->flags & SA_PERCPU_IRQ) | ||
227 | desc->status |= IRQ_PER_CPU; | ||
228 | #endif | ||
222 | if (!shared) { | 229 | if (!shared) { |
223 | desc->depth = 0; | 230 | desc->depth = 0; |
224 | desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | | 231 | desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | |
@@ -236,6 +243,14 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
236 | register_handler_proc(irq, new); | 243 | register_handler_proc(irq, new); |
237 | 244 | ||
238 | return 0; | 245 | return 0; |
246 | |||
247 | mismatch: | ||
248 | spin_unlock_irqrestore(&desc->lock, flags); | ||
249 | if (!(new->flags & SA_PROBEIRQ)) { | ||
250 | printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); | ||
251 | dump_stack(); | ||
252 | } | ||
253 | return -EBUSY; | ||
239 | } | 254 | } |
240 | 255 | ||
241 | /** | 256 | /** |
@@ -258,6 +273,7 @@ void free_irq(unsigned int irq, void *dev_id) | |||
258 | struct irqaction **p; | 273 | struct irqaction **p; |
259 | unsigned long flags; | 274 | unsigned long flags; |
260 | 275 | ||
276 | WARN_ON(in_interrupt()); | ||
261 | if (irq >= NR_IRQS) | 277 | if (irq >= NR_IRQS) |
262 | return; | 278 | return; |
263 | 279 | ||
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c new file mode 100644 index 0000000000..134f9f2e0e --- /dev/null +++ b/kernel/irq/migration.c | |||
@@ -0,0 +1,62 @@ | |||
1 | |||
2 | #include <linux/irq.h> | ||
3 | |||
4 | void set_pending_irq(unsigned int irq, cpumask_t mask) | ||
5 | { | ||
6 | irq_desc_t *desc = irq_desc + irq; | ||
7 | unsigned long flags; | ||
8 | |||
9 | spin_lock_irqsave(&desc->lock, flags); | ||
10 | desc->move_irq = 1; | ||
11 | pending_irq_cpumask[irq] = mask; | ||
12 | spin_unlock_irqrestore(&desc->lock, flags); | ||
13 | } | ||
14 | |||
15 | void move_native_irq(int irq) | ||
16 | { | ||
17 | cpumask_t tmp; | ||
18 | irq_desc_t *desc = irq_descp(irq); | ||
19 | |||
20 | if (likely(!desc->move_irq)) | ||
21 | return; | ||
22 | |||
23 | /* | ||
24 | * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. | ||
25 | */ | ||
26 | if (CHECK_IRQ_PER_CPU(desc->status)) { | ||
27 | WARN_ON(1); | ||
28 | return; | ||
29 | } | ||
30 | |||
31 | desc->move_irq = 0; | ||
32 | |||
33 | if (likely(cpus_empty(pending_irq_cpumask[irq]))) | ||
34 | return; | ||
35 | |||
36 | if (!desc->handler->set_affinity) | ||
37 | return; | ||
38 | |||
39 | assert_spin_locked(&desc->lock); | ||
40 | |||
41 | cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); | ||
42 | |||
43 | /* | ||
44 | * If there is a valid mask to work with, do the | ||
45 | * disable, re-program, enable sequence. | ||
46 | * This is *not* particularly important for level-triggered | ||
47 | * interrupts, but in an edge-triggered case we might be | ||
48 | * setting the RTE while an active trigger is coming in, | ||
49 | * which could cause some IO-APICs to malfunction. | ||
50 | * Being paranoid, I guess! | ||
51 | */ | ||
52 | if (unlikely(!cpus_empty(tmp))) { | ||
53 | if (likely(!(desc->status & IRQ_DISABLED))) | ||
54 | desc->handler->disable(irq); | ||
55 | |||
56 | desc->handler->set_affinity(irq,tmp); | ||
57 | |||
58 | if (likely(!(desc->status & IRQ_DISABLED))) | ||
59 | desc->handler->enable(irq); | ||
60 | } | ||
61 | cpus_clear(pending_irq_cpumask[irq]); | ||
62 | } | ||
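The two helpers above split an affinity change into a request half and an apply half. A hypothetical sketch (not part of this patch) of how an architecture selecting CONFIG_GENERIC_PENDING_IRQ might wire them up:

#include <linux/irq.h>

/*
 * Hypothetical arch glue, assuming CONFIG_GENERIC_PENDING_IRQ: the
 * /proc/irq/<n>/smp_affinity write path only records the request, and
 * the next interrupt applies it from the ack path, where desc->lock is
 * held and the line can be disabled, retargeted and re-enabled safely.
 */
static void example_set_affinity(unsigned int irq, cpumask_t mask)
{
	set_pending_irq(irq, mask);
}

static void example_ack_irq(unsigned int irq)
{
	move_native_irq(irq);

	/* ... then the real ack/EOI to the interrupt controller ... */
}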
diff --git a/kernel/itimer.c b/kernel/itimer.c index 379be2f8c8..204ed7939e 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -128,21 +128,75 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) | |||
128 | /* | 128 | /* |
129 | * The timer is automagically restarted, when interval != 0 | 129 | * The timer is automagically restarted, when interval != 0 |
130 | */ | 130 | */ |
131 | int it_real_fn(void *data) | 131 | int it_real_fn(struct hrtimer *timer) |
132 | { | 132 | { |
133 | struct task_struct *tsk = (struct task_struct *) data; | 133 | struct signal_struct *sig = |
134 | container_of(timer, struct signal_struct, real_timer); | ||
134 | 135 | ||
135 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); | 136 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); |
136 | |||
137 | if (tsk->signal->it_real_incr.tv64 != 0) { | ||
138 | hrtimer_forward(&tsk->signal->real_timer, | ||
139 | tsk->signal->it_real_incr); | ||
140 | 137 | ||
138 | if (sig->it_real_incr.tv64 != 0) { | ||
139 | hrtimer_forward(timer, timer->base->softirq_time, | ||
140 | sig->it_real_incr); | ||
141 | return HRTIMER_RESTART; | 141 | return HRTIMER_RESTART; |
142 | } | 142 | } |
143 | return HRTIMER_NORESTART; | 143 | return HRTIMER_NORESTART; |
144 | } | 144 | } |
145 | 145 | ||
146 | /* | ||
147 | * We do not care about correctness. We just sanitize the values so | ||
148 | * the ktime_t operations which expect normalized values do not | ||
149 | * break. This converts negative values to long timeouts similar to | ||
150 | * the code in kernel versions < 2.6.16 | ||
151 | * | ||
152 | * Print a limited number of warning messages when an invalid timeval | ||
153 | * is detected. | ||
154 | */ | ||
155 | static void fixup_timeval(struct timeval *tv, int interval) | ||
156 | { | ||
157 | static int warnlimit = 10; | ||
158 | unsigned long tmp; | ||
159 | |||
160 | if (warnlimit > 0) { | ||
161 | warnlimit--; | ||
162 | printk(KERN_WARNING | ||
163 | "setitimer: %s (pid = %d) provided " | ||
164 | "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n", | ||
165 | current->comm, current->pid, | ||
166 | interval ? "it_interval" : "it_value", | ||
167 | tv->tv_sec, (long) tv->tv_usec); | ||
168 | } | ||
169 | |||
170 | tmp = tv->tv_usec; | ||
171 | if (tmp >= USEC_PER_SEC) { | ||
172 | tv->tv_usec = tmp % USEC_PER_SEC; | ||
173 | tv->tv_sec += tmp / USEC_PER_SEC; | ||
174 | } | ||
175 | |||
176 | tmp = tv->tv_sec; | ||
177 | if (tmp > LONG_MAX) | ||
178 | tv->tv_sec = LONG_MAX; | ||
179 | } | ||
180 | |||
181 | /* | ||
182 | * Returns true if the timeval is in canonical form | ||
183 | */ | ||
184 | #define timeval_valid(t) \ | ||
185 | (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) | ||
186 | |||
187 | /* | ||
188 | * Check for invalid timevals, sanitize them and print a limited | ||
189 | * number of warnings. | ||
190 | */ | ||
191 | static void check_itimerval(struct itimerval *value) { | ||
192 | |||
193 | if (unlikely(!timeval_valid(&value->it_value))) | ||
194 | fixup_timeval(&value->it_value, 0); | ||
195 | |||
196 | if (unlikely(!timeval_valid(&value->it_interval))) | ||
197 | fixup_timeval(&value->it_interval, 1); | ||
198 | } | ||
199 | |||
146 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | 200 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) |
147 | { | 201 | { |
148 | struct task_struct *tsk = current; | 202 | struct task_struct *tsk = current; |
@@ -150,6 +204,18 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | |||
150 | ktime_t expires; | 204 | ktime_t expires; |
151 | cputime_t cval, cinterval, nval, ninterval; | 205 | cputime_t cval, cinterval, nval, ninterval; |
152 | 206 | ||
207 | /* | ||
208 | * Validate the timevals in value. | ||
209 | * | ||
210 | * Note: Although the spec requires that invalid values shall | ||
211 | * return -EINVAL, we just fixup the value and print a limited | ||
212 | * number of warnings in order not to break users of this | ||
213 | * historical misfeature. | ||
214 | * | ||
215 | * Scheduled for replacement in March 2007 | ||
216 | */ | ||
217 | check_itimerval(value); | ||
218 | |||
153 | switch (which) { | 219 | switch (which) { |
154 | case ITIMER_REAL: | 220 | case ITIMER_REAL: |
155 | again: | 221 | again: |
@@ -226,6 +292,43 @@ again: | |||
226 | return 0; | 292 | return 0; |
227 | } | 293 | } |
228 | 294 | ||
295 | /** | ||
296 | * alarm_setitimer - set alarm in seconds | ||
297 | * | ||
298 | * @seconds: number of seconds until alarm | ||
299 | * 0 disables the alarm | ||
300 | * | ||
301 | * Returns the remaining time in seconds of a pending timer or 0 when | ||
302 | * the timer is not active. | ||
303 | * | ||
304 | * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid | ||
305 | * negative timeval settings which would cause immediate expiry. | ||
306 | */ | ||
307 | unsigned int alarm_setitimer(unsigned int seconds) | ||
308 | { | ||
309 | struct itimerval it_new, it_old; | ||
310 | |||
311 | #if BITS_PER_LONG < 64 | ||
312 | if (seconds > INT_MAX) | ||
313 | seconds = INT_MAX; | ||
314 | #endif | ||
315 | it_new.it_value.tv_sec = seconds; | ||
316 | it_new.it_value.tv_usec = 0; | ||
317 | it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; | ||
318 | |||
319 | do_setitimer(ITIMER_REAL, &it_new, &it_old); | ||
320 | |||
321 | /* | ||
322 | * We can't return 0 if we have an alarm pending ... and it's | ||
323 | * better to return too much than too little anyway. | ||
324 | */ | ||
325 | if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || | ||
326 | it_old.it_value.tv_usec >= 500000) | ||
327 | it_old.it_value.tv_sec++; | ||
328 | |||
329 | return it_old.it_value.tv_sec; | ||
330 | } | ||
331 | |||
229 | asmlinkage long sys_setitimer(int which, | 332 | asmlinkage long sys_setitimer(int which, |
230 | struct itimerval __user *value, | 333 | struct itimerval __user *value, |
231 | struct itimerval __user *ovalue) | 334 | struct itimerval __user *ovalue) |
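A small userspace illustration of the sanitizing behaviour added above (assumes a kernel with this patch applied; per the note in the hunk, later kernels were expected to switch back to returning -EINVAL):

#include <stdio.h>
#include <sys/time.h>

/*
 * A non-canonical it_value (tv_usec >= 1000000) is not rejected here;
 * check_itimerval() folds the excess microseconds into tv_sec and the
 * kernel prints a limited number of warnings, so this behaves like a
 * 3.5 second timer.
 */
int main(void)
{
	struct itimerval it = {
		.it_interval = { 0, 0 },
		.it_value    = { 1, 2500000 },
	};

	if (setitimer(ITIMER_REAL, &it, NULL) != 0)
		perror("setitimer");

	if (getitimer(ITIMER_REAL, &it) == 0)
		printf("remaining: %ld.%06ld s\n",
		       (long)it.it_value.tv_sec, (long)it.it_value.tv_usec);
	return 0;
}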
diff --git a/kernel/kmod.c b/kernel/kmod.c index 51a892063a..20a997c73c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data) | |||
170 | sa.sa.sa_handler = SIG_IGN; | 170 | sa.sa.sa_handler = SIG_IGN; |
171 | sa.sa.sa_flags = 0; | 171 | sa.sa.sa_flags = 0; |
172 | siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); | 172 | siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); |
173 | do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); | 173 | do_sigaction(SIGCHLD, &sa, NULL); |
174 | allow_signal(SIGCHLD); | 174 | allow_signal(SIGCHLD); |
175 | 175 | ||
176 | pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); | 176 | pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fef1af8a73..1fbf466a29 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -48,7 +48,7 @@ | |||
48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
50 | 50 | ||
51 | DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 51 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
54 | 54 | ||
@@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) | |||
323 | } | 323 | } |
324 | 324 | ||
325 | /* | 325 | /* |
326 | * This function is called from exit_thread or flush_thread when task tk's | 326 | * This function is called from finish_task_switch when task tk becomes dead, |
327 | * stack is being recycled so that we can recycle any function-return probe | 327 | * so that we can recycle any function-return probe instances associated |
328 | * instances associated with this task. These left over instances represent | 328 | * with this task. These left over instances represent probed functions |
329 | * probed functions that have been called but will never return. | 329 | * that have been called but will never return. |
330 | */ | 330 | */ |
331 | void __kprobes kprobe_flush_task(struct task_struct *tk) | 331 | void __kprobes kprobe_flush_task(struct task_struct *tk) |
332 | { | 332 | { |
@@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
336 | unsigned long flags = 0; | 336 | unsigned long flags = 0; |
337 | 337 | ||
338 | spin_lock_irqsave(&kretprobe_lock, flags); | 338 | spin_lock_irqsave(&kretprobe_lock, flags); |
339 | head = kretprobe_inst_table_head(current); | 339 | head = kretprobe_inst_table_head(tk); |
340 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | 340 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { |
341 | if (ri->task == tk) | 341 | if (ri->task == tk) |
342 | recycle_rp_inst(ri); | 342 | recycle_rp_inst(ri); |
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
460 | } | 460 | } |
461 | 461 | ||
462 | p->nmissed = 0; | 462 | p->nmissed = 0; |
463 | down(&kprobe_mutex); | 463 | mutex_lock(&kprobe_mutex); |
464 | old_p = get_kprobe(p->addr); | 464 | old_p = get_kprobe(p->addr); |
465 | if (old_p) { | 465 | if (old_p) { |
466 | ret = register_aggr_kprobe(old_p, p); | 466 | ret = register_aggr_kprobe(old_p, p); |
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
477 | arch_arm_kprobe(p); | 477 | arch_arm_kprobe(p); |
478 | 478 | ||
479 | out: | 479 | out: |
480 | up(&kprobe_mutex); | 480 | mutex_unlock(&kprobe_mutex); |
481 | 481 | ||
482 | if (ret && probed_mod) | 482 | if (ret && probed_mod) |
483 | module_put(probed_mod); | 483 | module_put(probed_mod); |
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p) | |||
496 | struct kprobe *old_p, *list_p; | 496 | struct kprobe *old_p, *list_p; |
497 | int cleanup_p; | 497 | int cleanup_p; |
498 | 498 | ||
499 | down(&kprobe_mutex); | 499 | mutex_lock(&kprobe_mutex); |
500 | old_p = get_kprobe(p->addr); | 500 | old_p = get_kprobe(p->addr); |
501 | if (unlikely(!old_p)) { | 501 | if (unlikely(!old_p)) { |
502 | up(&kprobe_mutex); | 502 | mutex_unlock(&kprobe_mutex); |
503 | return; | 503 | return; |
504 | } | 504 | } |
505 | if (p != old_p) { | 505 | if (p != old_p) { |
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p) | |||
507 | if (list_p == p) | 507 | if (list_p == p) |
508 | /* kprobe p is a valid probe */ | 508 | /* kprobe p is a valid probe */ |
509 | goto valid_p; | 509 | goto valid_p; |
510 | up(&kprobe_mutex); | 510 | mutex_unlock(&kprobe_mutex); |
511 | return; | 511 | return; |
512 | } | 512 | } |
513 | valid_p: | 513 | valid_p: |
@@ -523,7 +523,7 @@ valid_p: | |||
523 | cleanup_p = 0; | 523 | cleanup_p = 0; |
524 | } | 524 | } |
525 | 525 | ||
526 | up(&kprobe_mutex); | 526 | mutex_unlock(&kprobe_mutex); |
527 | 527 | ||
528 | synchronize_sched(); | 528 | synchronize_sched(); |
529 | if (p->mod_refcounted && | 529 | if (p->mod_refcounted && |
@@ -585,6 +585,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
585 | int i; | 585 | int i; |
586 | 586 | ||
587 | rp->kp.pre_handler = pre_handler_kretprobe; | 587 | rp->kp.pre_handler = pre_handler_kretprobe; |
588 | rp->kp.post_handler = NULL; | ||
589 | rp->kp.fault_handler = NULL; | ||
590 | rp->kp.break_handler = NULL; | ||
588 | 591 | ||
589 | /* Pre-allocate memory for max kretprobe instances */ | 592 | /* Pre-allocate memory for max kretprobe instances */ |
590 | if (rp->maxactive <= 0) { | 593 | if (rp->maxactive <= 0) { |
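The kprobe_mutex change is one instance of a conversion repeated in kthread.c and module.c below: a semaphore used purely as a lock becomes a struct mutex. A generic before/after sketch with illustrative names:

#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);	/* was: static DECLARE_MUTEX(example_sem); */

static int example_update(void)
{
	/* was: down(&example_sem) ... up(&example_sem) */
	if (mutex_lock_interruptible(&example_mutex))
		return -EINTR;

	/* ... modify the data the lock protects ... */

	mutex_unlock(&example_mutex);
	return 0;
}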
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d5eeae0fa5..f119e098e6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -15,9 +15,6 @@ | |||
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | 17 | ||
18 | u64 uevent_seqnum; | ||
19 | char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; | ||
20 | |||
21 | #define KERNEL_ATTR_RO(_name) \ | 18 | #define KERNEL_ATTR_RO(_name) \ |
22 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) |
23 | 20 | ||
@@ -25,7 +22,7 @@ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | |||
25 | static struct subsys_attribute _name##_attr = \ | 22 | static struct subsys_attribute _name##_attr = \ |
26 | __ATTR(_name, 0644, _name##_show, _name##_store) | 23 | __ATTR(_name, 0644, _name##_show, _name##_store) |
27 | 24 | ||
28 | #ifdef CONFIG_HOTPLUG | 25 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
29 | /* current uevent sequence number */ | 26 | /* current uevent sequence number */ |
30 | static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) | 27 | static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) |
31 | { | 28 | { |
@@ -55,7 +52,7 @@ decl_subsys(kernel, NULL, NULL); | |||
55 | EXPORT_SYMBOL_GPL(kernel_subsys); | 52 | EXPORT_SYMBOL_GPL(kernel_subsys); |
56 | 53 | ||
57 | static struct attribute * kernel_attrs[] = { | 54 | static struct attribute * kernel_attrs[] = { |
58 | #ifdef CONFIG_HOTPLUG | 55 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
59 | &uevent_seqnum_attr.attr, | 56 | &uevent_seqnum_attr.attr, |
60 | &uevent_helper_attr.attr, | 57 | &uevent_helper_attr.attr, |
61 | #endif | 58 | #endif |
diff --git a/kernel/kthread.c b/kernel/kthread.c index e75950a109..c5f3c6613b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/unistd.h> | 12 | #include <linux/unistd.h> |
13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/mutex.h> | ||
15 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
16 | 17 | ||
17 | /* | 18 | /* |
@@ -41,7 +42,7 @@ struct kthread_stop_info | |||
41 | 42 | ||
42 | /* Thread stopping is done by setthing this var: lock serializes | 43 | /* Thread stopping is done by setthing this var: lock serializes |
43 | * multiple kthread_stop calls. */ | 44 | * multiple kthread_stop calls. */ |
44 | static DECLARE_MUTEX(kthread_stop_lock); | 45 | static DEFINE_MUTEX(kthread_stop_lock); |
45 | static struct kthread_stop_info kthread_stop_info; | 46 | static struct kthread_stop_info kthread_stop_info; |
46 | 47 | ||
47 | int kthread_should_stop(void) | 48 | int kthread_should_stop(void) |
@@ -114,7 +115,9 @@ static void keventd_create_kthread(void *_create) | |||
114 | create->result = ERR_PTR(pid); | 115 | create->result = ERR_PTR(pid); |
115 | } else { | 116 | } else { |
116 | wait_for_completion(&create->started); | 117 | wait_for_completion(&create->started); |
118 | read_lock(&tasklist_lock); | ||
117 | create->result = find_task_by_pid(pid); | 119 | create->result = find_task_by_pid(pid); |
120 | read_unlock(&tasklist_lock); | ||
118 | } | 121 | } |
119 | complete(&create->done); | 122 | complete(&create->done); |
120 | } | 123 | } |
@@ -173,7 +176,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | |||
173 | { | 176 | { |
174 | int ret; | 177 | int ret; |
175 | 178 | ||
176 | down(&kthread_stop_lock); | 179 | mutex_lock(&kthread_stop_lock); |
177 | 180 | ||
178 | /* It could exit after stop_info.k set, but before wake_up_process. */ | 181 | /* It could exit after stop_info.k set, but before wake_up_process. */ |
179 | get_task_struct(k); | 182 | get_task_struct(k); |
@@ -194,7 +197,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | |||
194 | wait_for_completion(&kthread_stop_info.done); | 197 | wait_for_completion(&kthread_stop_info.done); |
195 | kthread_stop_info.k = NULL; | 198 | kthread_stop_info.k = NULL; |
196 | ret = kthread_stop_info.err; | 199 | ret = kthread_stop_info.err; |
197 | up(&kthread_stop_lock); | 200 | mutex_unlock(&kthread_stop_lock); |
198 | 201 | ||
199 | return ret; | 202 | return ret; |
200 | } | 203 | } |
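The keventd_create_kthread() fix holds tasklist_lock across find_task_by_pid() so the looked-up task cannot disappear mid-lookup. Callers that keep the pointer after dropping the lock also need a reference; a sketch of that broader pattern (hypothetical helper):

#include <linux/sched.h>

/*
 * Hypothetical helper showing the era's locking rule: look the task up
 * under tasklist_lock, pin it with get_task_struct(), then drop the
 * lock.  The caller releases it with put_task_struct() when done.
 */
static struct task_struct *example_pin_task(pid_t pid)
{
	struct task_struct *tsk;

	read_lock(&tasklist_lock);
	tsk = find_task_by_pid(pid);
	if (tsk)
		get_task_struct(tsk);
	read_unlock(&tasklist_lock);

	return tsk;
}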
diff --git a/kernel/module.c b/kernel/module.c index 5aad477ddc..bbe04862e1 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/device.h> | 39 | #include <linux/device.h> |
40 | #include <linux/string.h> | 40 | #include <linux/string.h> |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/mutex.h> | ||
42 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
43 | #include <asm/semaphore.h> | 44 | #include <asm/semaphore.h> |
44 | #include <asm/cacheflush.h> | 45 | #include <asm/cacheflush.h> |
@@ -60,29 +61,20 @@ | |||
60 | static DEFINE_SPINLOCK(modlist_lock); | 61 | static DEFINE_SPINLOCK(modlist_lock); |
61 | 62 | ||
62 | /* List of modules, protected by module_mutex AND modlist_lock */ | 63 | /* List of modules, protected by module_mutex AND modlist_lock */ |
63 | static DECLARE_MUTEX(module_mutex); | 64 | static DEFINE_MUTEX(module_mutex); |
64 | static LIST_HEAD(modules); | 65 | static LIST_HEAD(modules); |
65 | 66 | ||
66 | static DECLARE_MUTEX(notify_mutex); | 67 | static BLOCKING_NOTIFIER_HEAD(module_notify_list); |
67 | static struct notifier_block * module_notify_list; | ||
68 | 68 | ||
69 | int register_module_notifier(struct notifier_block * nb) | 69 | int register_module_notifier(struct notifier_block * nb) |
70 | { | 70 | { |
71 | int err; | 71 | return blocking_notifier_chain_register(&module_notify_list, nb); |
72 | down(¬ify_mutex); | ||
73 | err = notifier_chain_register(&module_notify_list, nb); | ||
74 | up(¬ify_mutex); | ||
75 | return err; | ||
76 | } | 72 | } |
77 | EXPORT_SYMBOL(register_module_notifier); | 73 | EXPORT_SYMBOL(register_module_notifier); |
78 | 74 | ||
79 | int unregister_module_notifier(struct notifier_block * nb) | 75 | int unregister_module_notifier(struct notifier_block * nb) |
80 | { | 76 | { |
81 | int err; | 77 | return blocking_notifier_chain_unregister(&module_notify_list, nb); |
82 | down(¬ify_mutex); | ||
83 | err = notifier_chain_unregister(&module_notify_list, nb); | ||
84 | up(¬ify_mutex); | ||
85 | return err; | ||
86 | } | 78 | } |
87 | EXPORT_SYMBOL(unregister_module_notifier); | 79 | EXPORT_SYMBOL(unregister_module_notifier); |
88 | 80 | ||
@@ -126,15 +118,30 @@ extern const struct kernel_symbol __start___ksymtab[]; | |||
126 | extern const struct kernel_symbol __stop___ksymtab[]; | 118 | extern const struct kernel_symbol __stop___ksymtab[]; |
127 | extern const struct kernel_symbol __start___ksymtab_gpl[]; | 119 | extern const struct kernel_symbol __start___ksymtab_gpl[]; |
128 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; | 120 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; |
121 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; | ||
122 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; | ||
129 | extern const unsigned long __start___kcrctab[]; | 123 | extern const unsigned long __start___kcrctab[]; |
130 | extern const unsigned long __start___kcrctab_gpl[]; | 124 | extern const unsigned long __start___kcrctab_gpl[]; |
125 | extern const unsigned long __start___kcrctab_gpl_future[]; | ||
131 | 126 | ||
132 | #ifndef CONFIG_MODVERSIONS | 127 | #ifndef CONFIG_MODVERSIONS |
133 | #define symversion(base, idx) NULL | 128 | #define symversion(base, idx) NULL |
134 | #else | 129 | #else |
135 | #define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) | 130 | #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) |
136 | #endif | 131 | #endif |
137 | 132 | ||
133 | /* lookup symbol in given range of kernel_symbols */ | ||
134 | static const struct kernel_symbol *lookup_symbol(const char *name, | ||
135 | const struct kernel_symbol *start, | ||
136 | const struct kernel_symbol *stop) | ||
137 | { | ||
138 | const struct kernel_symbol *ks = start; | ||
139 | for (; ks < stop; ks++) | ||
140 | if (strcmp(ks->name, name) == 0) | ||
141 | return ks; | ||
142 | return NULL; | ||
143 | } | ||
144 | |||
138 | /* Find a symbol, return value, crc and module which owns it */ | 145 | /* Find a symbol, return value, crc and module which owns it */ |
139 | static unsigned long __find_symbol(const char *name, | 146 | static unsigned long __find_symbol(const char *name, |
140 | struct module **owner, | 147 | struct module **owner, |
@@ -142,64 +149,81 @@ static unsigned long __find_symbol(const char *name, | |||
142 | int gplok) | 149 | int gplok) |
143 | { | 150 | { |
144 | struct module *mod; | 151 | struct module *mod; |
145 | unsigned int i; | 152 | const struct kernel_symbol *ks; |
146 | 153 | ||
147 | /* Core kernel first. */ | 154 | /* Core kernel first. */ |
148 | *owner = NULL; | 155 | *owner = NULL; |
149 | for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) { | 156 | ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); |
150 | if (strcmp(__start___ksymtab[i].name, name) == 0) { | 157 | if (ks) { |
151 | *crc = symversion(__start___kcrctab, i); | 158 | *crc = symversion(__start___kcrctab, (ks - __start___ksymtab)); |
152 | return __start___ksymtab[i].value; | 159 | return ks->value; |
153 | } | ||
154 | } | 160 | } |
155 | if (gplok) { | 161 | if (gplok) { |
156 | for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++) | 162 | ks = lookup_symbol(name, __start___ksymtab_gpl, |
157 | if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) { | 163 | __stop___ksymtab_gpl); |
158 | *crc = symversion(__start___kcrctab_gpl, i); | 164 | if (ks) { |
159 | return __start___ksymtab_gpl[i].value; | 165 | *crc = symversion(__start___kcrctab_gpl, |
160 | } | 166 | (ks - __start___ksymtab_gpl)); |
167 | return ks->value; | ||
168 | } | ||
169 | } | ||
170 | ks = lookup_symbol(name, __start___ksymtab_gpl_future, | ||
171 | __stop___ksymtab_gpl_future); | ||
172 | if (ks) { | ||
173 | if (!gplok) { | ||
174 | printk(KERN_WARNING "Symbol %s is being used " | ||
175 | "by a non-GPL module, which will not " | ||
176 | "be allowed in the future\n", name); | ||
177 | printk(KERN_WARNING "Please see the file " | ||
178 | "Documentation/feature-removal-schedule.txt " | ||
179 | "in the kernel source tree for more " | ||
180 | "details.\n"); | ||
181 | } | ||
182 | *crc = symversion(__start___kcrctab_gpl_future, | ||
183 | (ks - __start___ksymtab_gpl_future)); | ||
184 | return ks->value; | ||
161 | } | 185 | } |
162 | 186 | ||
163 | /* Now try modules. */ | 187 | /* Now try modules. */ |
164 | list_for_each_entry(mod, &modules, list) { | 188 | list_for_each_entry(mod, &modules, list) { |
165 | *owner = mod; | 189 | *owner = mod; |
166 | for (i = 0; i < mod->num_syms; i++) | 190 | ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); |
167 | if (strcmp(mod->syms[i].name, name) == 0) { | 191 | if (ks) { |
168 | *crc = symversion(mod->crcs, i); | 192 | *crc = symversion(mod->crcs, (ks - mod->syms)); |
169 | return mod->syms[i].value; | 193 | return ks->value; |
170 | } | 194 | } |
171 | 195 | ||
172 | if (gplok) { | 196 | if (gplok) { |
173 | for (i = 0; i < mod->num_gpl_syms; i++) { | 197 | ks = lookup_symbol(name, mod->gpl_syms, |
174 | if (strcmp(mod->gpl_syms[i].name, name) == 0) { | 198 | mod->gpl_syms + mod->num_gpl_syms); |
175 | *crc = symversion(mod->gpl_crcs, i); | 199 | if (ks) { |
176 | return mod->gpl_syms[i].value; | 200 | *crc = symversion(mod->gpl_crcs, |
177 | } | 201 | (ks - mod->gpl_syms)); |
202 | return ks->value; | ||
178 | } | 203 | } |
179 | } | 204 | } |
205 | ks = lookup_symbol(name, mod->gpl_future_syms, | ||
206 | (mod->gpl_future_syms + | ||
207 | mod->num_gpl_future_syms)); | ||
208 | if (ks) { | ||
209 | if (!gplok) { | ||
210 | printk(KERN_WARNING "Symbol %s is being used " | ||
211 | "by a non-GPL module, which will not " | ||
212 | "be allowed in the future\n", name); | ||
213 | printk(KERN_WARNING "Please see the file " | ||
214 | "Documentation/feature-removal-schedule.txt " | ||
215 | "in the kernel source tree for more " | ||
216 | "details.\n"); | ||
217 | } | ||
218 | *crc = symversion(mod->gpl_future_crcs, | ||
219 | (ks - mod->gpl_future_syms)); | ||
220 | return ks->value; | ||
221 | } | ||
180 | } | 222 | } |
181 | DEBUGP("Failed to find symbol %s\n", name); | 223 | DEBUGP("Failed to find symbol %s\n", name); |
182 | return 0; | 224 | return 0; |
183 | } | 225 | } |
184 | 226 | ||
185 | /* Find a symbol in this elf symbol table */ | ||
186 | static unsigned long find_local_symbol(Elf_Shdr *sechdrs, | ||
187 | unsigned int symindex, | ||
188 | const char *strtab, | ||
189 | const char *name) | ||
190 | { | ||
191 | unsigned int i; | ||
192 | Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; | ||
193 | |||
194 | /* Search (defined) internal symbols first. */ | ||
195 | for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) { | ||
196 | if (sym[i].st_shndx != SHN_UNDEF | ||
197 | && strcmp(name, strtab + sym[i].st_name) == 0) | ||
198 | return sym[i].st_value; | ||
199 | } | ||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | /* Search for module by name: must hold module_mutex. */ | 227 | /* Search for module by name: must hold module_mutex. */ |
204 | static struct module *find_module(const char *name) | 228 | static struct module *find_module(const char *name) |
205 | { | 229 | { |
@@ -379,7 +403,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, | |||
379 | } | 403 | } |
380 | #endif /* CONFIG_SMP */ | 404 | #endif /* CONFIG_SMP */ |
381 | 405 | ||
382 | #ifdef CONFIG_MODULE_UNLOAD | ||
383 | #define MODINFO_ATTR(field) \ | 406 | #define MODINFO_ATTR(field) \ |
384 | static void setup_modinfo_##field(struct module *mod, const char *s) \ | 407 | static void setup_modinfo_##field(struct module *mod, const char *s) \ |
385 | { \ | 408 | { \ |
@@ -411,12 +434,7 @@ static struct module_attribute modinfo_##field = { \ | |||
411 | MODINFO_ATTR(version); | 434 | MODINFO_ATTR(version); |
412 | MODINFO_ATTR(srcversion); | 435 | MODINFO_ATTR(srcversion); |
413 | 436 | ||
414 | static struct module_attribute *modinfo_attrs[] = { | 437 | #ifdef CONFIG_MODULE_UNLOAD |
415 | &modinfo_version, | ||
416 | &modinfo_srcversion, | ||
417 | NULL, | ||
418 | }; | ||
419 | |||
420 | /* Init the unload section of the module. */ | 438 | /* Init the unload section of the module. */ |
421 | static void module_unload_init(struct module *mod) | 439 | static void module_unload_init(struct module *mod) |
422 | { | 440 | { |
@@ -557,7 +575,7 @@ static void free_module(struct module *mod); | |||
557 | static void wait_for_zero_refcount(struct module *mod) | 575 | static void wait_for_zero_refcount(struct module *mod) |
558 | { | 576 | { |
559 | /* Since we might sleep for some time, drop the semaphore first */ | 577 | /* Since we might sleep for some time, drop the semaphore first */ |
560 | up(&module_mutex); | 578 | mutex_unlock(&module_mutex); |
561 | for (;;) { | 579 | for (;;) { |
562 | DEBUGP("Looking at refcount...\n"); | 580 | DEBUGP("Looking at refcount...\n"); |
563 | set_current_state(TASK_UNINTERRUPTIBLE); | 581 | set_current_state(TASK_UNINTERRUPTIBLE); |
@@ -566,7 +584,7 @@ static void wait_for_zero_refcount(struct module *mod) | |||
566 | schedule(); | 584 | schedule(); |
567 | } | 585 | } |
568 | current->state = TASK_RUNNING; | 586 | current->state = TASK_RUNNING; |
569 | down(&module_mutex); | 587 | mutex_lock(&module_mutex); |
570 | } | 588 | } |
571 | 589 | ||
572 | asmlinkage long | 590 | asmlinkage long |
@@ -583,7 +601,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
583 | return -EFAULT; | 601 | return -EFAULT; |
584 | name[MODULE_NAME_LEN-1] = '\0'; | 602 | name[MODULE_NAME_LEN-1] = '\0'; |
585 | 603 | ||
586 | if (down_interruptible(&module_mutex) != 0) | 604 | if (mutex_lock_interruptible(&module_mutex) != 0) |
587 | return -EINTR; | 605 | return -EINTR; |
588 | 606 | ||
589 | mod = find_module(name); | 607 | mod = find_module(name); |
@@ -632,14 +650,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
632 | 650 | ||
633 | /* Final destruction now that no one is using it. */ | 651 | /* Final destruction now that no one is using it. */ |
634 | if (mod->exit != NULL) { | 652 | if (mod->exit != NULL) { |
635 | up(&module_mutex); | 653 | mutex_unlock(&module_mutex); |
636 | mod->exit(); | 654 | mod->exit(); |
637 | down(&module_mutex); | 655 | mutex_lock(&module_mutex); |
638 | } | 656 | } |
639 | free_module(mod); | 657 | free_module(mod); |
640 | 658 | ||
641 | out: | 659 | out: |
642 | up(&module_mutex); | 660 | mutex_unlock(&module_mutex); |
643 | return ret; | 661 | return ret; |
644 | } | 662 | } |
645 | 663 | ||
@@ -687,14 +705,14 @@ EXPORT_SYMBOL(__symbol_put); | |||
687 | 705 | ||
688 | void symbol_put_addr(void *addr) | 706 | void symbol_put_addr(void *addr) |
689 | { | 707 | { |
690 | unsigned long flags; | 708 | struct module *modaddr; |
691 | 709 | ||
692 | spin_lock_irqsave(&modlist_lock, flags); | 710 | if (core_kernel_text((unsigned long)addr)) |
693 | if (!kernel_text_address((unsigned long)addr)) | 711 | return; |
694 | BUG(); | ||
695 | 712 | ||
696 | module_put(module_text_address((unsigned long)addr)); | 713 | if (!(modaddr = module_text_address((unsigned long)addr))) |
697 | spin_unlock_irqrestore(&modlist_lock, flags); | 714 | BUG(); |
715 | module_put(modaddr); | ||
698 | } | 716 | } |
699 | EXPORT_SYMBOL_GPL(symbol_put_addr); | 717 | EXPORT_SYMBOL_GPL(symbol_put_addr); |
700 | 718 | ||
@@ -731,138 +749,14 @@ static inline void module_unload_init(struct module *mod) | |||
731 | } | 749 | } |
732 | #endif /* CONFIG_MODULE_UNLOAD */ | 750 | #endif /* CONFIG_MODULE_UNLOAD */ |
733 | 751 | ||
734 | #ifdef CONFIG_OBSOLETE_MODPARM | 752 | static struct module_attribute *modinfo_attrs[] = { |
735 | /* Bounds checking done below */ | 753 | &modinfo_version, |
736 | static int obsparm_copy_string(const char *val, struct kernel_param *kp) | 754 | &modinfo_srcversion, |
737 | { | 755 | #ifdef CONFIG_MODULE_UNLOAD |
738 | strcpy(kp->arg, val); | 756 | &refcnt, |
739 | return 0; | 757 | #endif |
740 | } | 758 | NULL, |
741 | 759 | }; | |
742 | static int set_obsolete(const char *val, struct kernel_param *kp) | ||
743 | { | ||
744 | unsigned int min, max; | ||
745 | unsigned int size, maxsize; | ||
746 | int dummy; | ||
747 | char *endp; | ||
748 | const char *p; | ||
749 | struct obsolete_modparm *obsparm = kp->arg; | ||
750 | |||
751 | if (!val) { | ||
752 | printk(KERN_ERR "Parameter %s needs an argument\n", kp->name); | ||
753 | return -EINVAL; | ||
754 | } | ||
755 | |||
756 | /* type is: [min[-max]]{b,h,i,l,s} */ | ||
757 | p = obsparm->type; | ||
758 | min = simple_strtol(p, &endp, 10); | ||
759 | if (endp == obsparm->type) | ||
760 | min = max = 1; | ||
761 | else if (*endp == '-') { | ||
762 | p = endp+1; | ||
763 | max = simple_strtol(p, &endp, 10); | ||
764 | } else | ||
765 | max = min; | ||
766 | switch (*endp) { | ||
767 | case 'b': | ||
768 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
769 | 1, param_set_byte, &dummy); | ||
770 | case 'h': | ||
771 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
772 | sizeof(short), param_set_short, &dummy); | ||
773 | case 'i': | ||
774 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
775 | sizeof(int), param_set_int, &dummy); | ||
776 | case 'l': | ||
777 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
778 | sizeof(long), param_set_long, &dummy); | ||
779 | case 's': | ||
780 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
781 | sizeof(char *), param_set_charp, &dummy); | ||
782 | |||
783 | case 'c': | ||
784 | /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars, | ||
785 | and the decl is "char xxx[5][50];" */ | ||
786 | p = endp+1; | ||
787 | maxsize = simple_strtol(p, &endp, 10); | ||
788 | /* We check lengths here (yes, this is a hack). */ | ||
789 | p = val; | ||
790 | while (p[size = strcspn(p, ",")]) { | ||
791 | if (size >= maxsize) | ||
792 | goto oversize; | ||
793 | p += size+1; | ||
794 | } | ||
795 | if (size >= maxsize) | ||
796 | goto oversize; | ||
797 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
798 | maxsize, obsparm_copy_string, &dummy); | ||
799 | } | ||
800 | printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type); | ||
801 | return -EINVAL; | ||
802 | oversize: | ||
803 | printk(KERN_ERR | ||
804 | "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize); | ||
805 | return -EINVAL; | ||
806 | } | ||
807 | |||
808 | static int obsolete_params(const char *name, | ||
809 | char *args, | ||
810 | struct obsolete_modparm obsparm[], | ||
811 | unsigned int num, | ||
812 | Elf_Shdr *sechdrs, | ||
813 | unsigned int symindex, | ||
814 | const char *strtab) | ||
815 | { | ||
816 | struct kernel_param *kp; | ||
817 | unsigned int i; | ||
818 | int ret; | ||
819 | |||
820 | kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL); | ||
821 | if (!kp) | ||
822 | return -ENOMEM; | ||
823 | |||
824 | for (i = 0; i < num; i++) { | ||
825 | char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)]; | ||
826 | |||
827 | snprintf(sym_name, sizeof(sym_name), "%s%s", | ||
828 | MODULE_SYMBOL_PREFIX, obsparm[i].name); | ||
829 | |||
830 | kp[i].name = obsparm[i].name; | ||
831 | kp[i].perm = 000; | ||
832 | kp[i].set = set_obsolete; | ||
833 | kp[i].get = NULL; | ||
834 | obsparm[i].addr | ||
835 | = (void *)find_local_symbol(sechdrs, symindex, strtab, | ||
836 | sym_name); | ||
837 | if (!obsparm[i].addr) { | ||
838 | printk("%s: falsely claims to have parameter %s\n", | ||
839 | name, obsparm[i].name); | ||
840 | ret = -EINVAL; | ||
841 | goto out; | ||
842 | } | ||
843 | kp[i].arg = &obsparm[i]; | ||
844 | } | ||
845 | |||
846 | ret = parse_args(name, args, kp, num, NULL); | ||
847 | out: | ||
848 | kfree(kp); | ||
849 | return ret; | ||
850 | } | ||
851 | #else | ||
852 | static int obsolete_params(const char *name, | ||
853 | char *args, | ||
854 | struct obsolete_modparm obsparm[], | ||
855 | unsigned int num, | ||
856 | Elf_Shdr *sechdrs, | ||
857 | unsigned int symindex, | ||
858 | const char *strtab) | ||
859 | { | ||
860 | if (num != 0) | ||
861 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", | ||
862 | name); | ||
863 | return 0; | ||
864 | } | ||
865 | #endif /* CONFIG_OBSOLETE_MODPARM */ | ||
866 | 760 | ||
867 | static const char vermagic[] = VERMAGIC_STRING; | 761 | static const char vermagic[] = VERMAGIC_STRING; |
868 | 762 | ||
@@ -1056,37 +950,28 @@ static inline void remove_sect_attrs(struct module *mod) | |||
1056 | } | 950 | } |
1057 | #endif /* CONFIG_KALLSYMS */ | 951 | #endif /* CONFIG_KALLSYMS */ |
1058 | 952 | ||
1059 | |||
1060 | #ifdef CONFIG_MODULE_UNLOAD | ||
1061 | static inline int module_add_refcnt_attr(struct module *mod) | ||
1062 | { | ||
1063 | return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr); | ||
1064 | } | ||
1065 | static void module_remove_refcnt_attr(struct module *mod) | ||
1066 | { | ||
1067 | return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr); | ||
1068 | } | ||
1069 | #else | ||
1070 | static inline int module_add_refcnt_attr(struct module *mod) | ||
1071 | { | ||
1072 | return 0; | ||
1073 | } | ||
1074 | static void module_remove_refcnt_attr(struct module *mod) | ||
1075 | { | ||
1076 | } | ||
1077 | #endif | ||
1078 | |||
1079 | #ifdef CONFIG_MODULE_UNLOAD | ||
1080 | static int module_add_modinfo_attrs(struct module *mod) | 953 | static int module_add_modinfo_attrs(struct module *mod) |
1081 | { | 954 | { |
1082 | struct module_attribute *attr; | 955 | struct module_attribute *attr; |
956 | struct module_attribute *temp_attr; | ||
1083 | int error = 0; | 957 | int error = 0; |
1084 | int i; | 958 | int i; |
1085 | 959 | ||
960 | mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * | ||
961 | (ARRAY_SIZE(modinfo_attrs) + 1)), | ||
962 | GFP_KERNEL); | ||
963 | if (!mod->modinfo_attrs) | ||
964 | return -ENOMEM; | ||
965 | |||
966 | temp_attr = mod->modinfo_attrs; | ||
1086 | for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { | 967 | for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { |
1087 | if (!attr->test || | 968 | if (!attr->test || |
1088 | (attr->test && attr->test(mod))) | 969 | (attr->test && attr->test(mod))) { |
1089 | error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr); | 970 | memcpy(temp_attr, attr, sizeof(*temp_attr)); |
971 | temp_attr->attr.owner = mod; | ||
972 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); | ||
973 | ++temp_attr; | ||
974 | } | ||
1090 | } | 975 | } |
1091 | return error; | 976 | return error; |
1092 | } | 977 | } |
@@ -1096,12 +981,16 @@ static void module_remove_modinfo_attrs(struct module *mod) | |||
1096 | struct module_attribute *attr; | 981 | struct module_attribute *attr; |
1097 | int i; | 982 | int i; |
1098 | 983 | ||
1099 | for (i = 0; (attr = modinfo_attrs[i]); i++) { | 984 | for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { |
985 | /* pick a field to test for end of list */ | ||
986 | if (!attr->attr.name) | ||
987 | break; | ||
1100 | sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); | 988 | sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); |
1101 | attr->free(mod); | 989 | if (attr->free) |
990 | attr->free(mod); | ||
1102 | } | 991 | } |
992 | kfree(mod->modinfo_attrs); | ||
1103 | } | 993 | } |
1104 | #endif | ||
1105 | 994 | ||
1106 | static int mod_sysfs_setup(struct module *mod, | 995 | static int mod_sysfs_setup(struct module *mod, |
1107 | struct kernel_param *kparam, | 996 | struct kernel_param *kparam, |
@@ -1119,19 +1008,13 @@ static int mod_sysfs_setup(struct module *mod, | |||
1119 | if (err) | 1008 | if (err) |
1120 | goto out; | 1009 | goto out; |
1121 | 1010 | ||
1122 | err = module_add_refcnt_attr(mod); | ||
1123 | if (err) | ||
1124 | goto out_unreg; | ||
1125 | |||
1126 | err = module_param_sysfs_setup(mod, kparam, num_params); | 1011 | err = module_param_sysfs_setup(mod, kparam, num_params); |
1127 | if (err) | 1012 | if (err) |
1128 | goto out_unreg; | 1013 | goto out_unreg; |
1129 | 1014 | ||
1130 | #ifdef CONFIG_MODULE_UNLOAD | ||
1131 | err = module_add_modinfo_attrs(mod); | 1015 | err = module_add_modinfo_attrs(mod); |
1132 | if (err) | 1016 | if (err) |
1133 | goto out_unreg; | 1017 | goto out_unreg; |
1134 | #endif | ||
1135 | 1018 | ||
1136 | return 0; | 1019 | return 0; |
1137 | 1020 | ||
@@ -1143,10 +1026,7 @@ out: | |||
1143 | 1026 | ||
1144 | static void mod_kobject_remove(struct module *mod) | 1027 | static void mod_kobject_remove(struct module *mod) |
1145 | { | 1028 | { |
1146 | #ifdef CONFIG_MODULE_UNLOAD | ||
1147 | module_remove_modinfo_attrs(mod); | 1029 | module_remove_modinfo_attrs(mod); |
1148 | #endif | ||
1149 | module_remove_refcnt_attr(mod); | ||
1150 | module_param_sysfs_remove(mod); | 1030 | module_param_sysfs_remove(mod); |
1151 | 1031 | ||
1152 | kobject_unregister(&mod->mkobj.kobj); | 1032 | kobject_unregister(&mod->mkobj.kobj); |
@@ -1374,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license) | |||
1374 | || strcmp(license, "GPL v2") == 0 | 1254 | || strcmp(license, "GPL v2") == 0 |
1375 | || strcmp(license, "GPL and additional rights") == 0 | 1255 | || strcmp(license, "GPL and additional rights") == 0 |
1376 | || strcmp(license, "Dual BSD/GPL") == 0 | 1256 | || strcmp(license, "Dual BSD/GPL") == 0 |
1257 | || strcmp(license, "Dual MIT/GPL") == 0 | ||
1377 | || strcmp(license, "Dual MPL/GPL") == 0); | 1258 | || strcmp(license, "Dual MPL/GPL") == 0); |
1378 | } | 1259 | } |
1379 | 1260 | ||
@@ -1424,7 +1305,6 @@ static char *get_modinfo(Elf_Shdr *sechdrs, | |||
1424 | return NULL; | 1305 | return NULL; |
1425 | } | 1306 | } |
1426 | 1307 | ||
1427 | #ifdef CONFIG_MODULE_UNLOAD | ||
1428 | static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, | 1308 | static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, |
1429 | unsigned int infoindex) | 1309 | unsigned int infoindex) |
1430 | { | 1310 | { |
@@ -1439,23 +1319,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, | |||
1439 | attr->attr.name)); | 1319 | attr->attr.name)); |
1440 | } | 1320 | } |
1441 | } | 1321 | } |
1442 | #endif | ||
1443 | 1322 | ||
1444 | #ifdef CONFIG_KALLSYMS | 1323 | #ifdef CONFIG_KALLSYMS |
1445 | int is_exported(const char *name, const struct module *mod) | 1324 | int is_exported(const char *name, const struct module *mod) |
1446 | { | 1325 | { |
1447 | unsigned int i; | 1326 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) |
1448 | 1327 | return 1; | |
1449 | if (!mod) { | 1328 | else |
1450 | for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) | 1329 | if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) |
1451 | if (strcmp(__start___ksymtab[i].name, name) == 0) | ||
1452 | return 1; | ||
1453 | return 0; | ||
1454 | } | ||
1455 | for (i = 0; i < mod->num_syms; i++) | ||
1456 | if (strcmp(mod->syms[i].name, name) == 0) | ||
1457 | return 1; | 1330 | return 1; |
1458 | return 0; | 1331 | else |
1332 | return 0; | ||
1459 | } | 1333 | } |
1460 | 1334 | ||
1461 | /* As per nm */ | 1335 | /* As per nm */ |
@@ -1537,8 +1411,8 @@ static struct module *load_module(void __user *umod, | |||
1537 | char *secstrings, *args, *modmagic, *strtab = NULL; | 1411 | char *secstrings, *args, *modmagic, *strtab = NULL; |
1538 | unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, | 1412 | unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, |
1539 | exportindex, modindex, obsparmindex, infoindex, gplindex, | 1413 | exportindex, modindex, obsparmindex, infoindex, gplindex, |
1540 | crcindex, gplcrcindex, versindex, pcpuindex; | 1414 | crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, |
1541 | long arglen; | 1415 | gplfuturecrcindex; |
1542 | struct module *mod; | 1416 | struct module *mod; |
1543 | long err = 0; | 1417 | long err = 0; |
1544 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | 1418 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ |
@@ -1618,8 +1492,10 @@ static struct module *load_module(void __user *umod, | |||
1618 | /* Optional sections */ | 1492 | /* Optional sections */ |
1619 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); | 1493 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); |
1620 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); | 1494 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); |
1495 | gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); | ||
1621 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); | 1496 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); |
1622 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); | 1497 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); |
1498 | gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); | ||
1623 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); | 1499 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); |
1624 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); | 1500 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); |
1625 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); | 1501 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); |
@@ -1655,23 +1531,11 @@ static struct module *load_module(void __user *umod, | |||
1655 | } | 1531 | } |
1656 | 1532 | ||
1657 | /* Now copy in args */ | 1533 | /* Now copy in args */ |
1658 | arglen = strlen_user(uargs); | 1534 | args = strndup_user(uargs, ~0UL >> 1); |
1659 | if (!arglen) { | 1535 | if (IS_ERR(args)) { |
1660 | err = -EFAULT; | 1536 | err = PTR_ERR(args); |
1661 | goto free_hdr; | 1537 | goto free_hdr; |
1662 | } | 1538 | } |
1663 | args = kmalloc(arglen, GFP_KERNEL); | ||
1664 | if (!args) { | ||
1665 | err = -ENOMEM; | ||
1666 | goto free_hdr; | ||
1667 | } | ||
1668 | if (copy_from_user(args, uargs, arglen) != 0) { | ||
1669 | err = -EFAULT; | ||
1670 | goto free_mod; | ||
1671 | } | ||
1672 | |||
1673 | /* Userspace could have altered the string after the strlen_user() */ | ||
1674 | args[arglen - 1] = '\0'; | ||
1675 | 1539 | ||
1676 | if (find_module(mod->name)) { | 1540 | if (find_module(mod->name)) { |
1677 | err = -EEXIST; | 1541 | err = -EEXIST; |
@@ -1755,10 +1619,8 @@ static struct module *load_module(void __user *umod, | |||
1755 | if (strcmp(mod->name, "driverloader") == 0) | 1619 | if (strcmp(mod->name, "driverloader") == 0) |
1756 | add_taint(TAINT_PROPRIETARY_MODULE); | 1620 | add_taint(TAINT_PROPRIETARY_MODULE); |
1757 | 1621 | ||
1758 | #ifdef CONFIG_MODULE_UNLOAD | ||
1759 | /* Set up MODINFO_ATTR fields */ | 1622 | /* Set up MODINFO_ATTR fields */ |
1760 | setup_modinfo(mod, sechdrs, infoindex); | 1623 | setup_modinfo(mod, sechdrs, infoindex); |
1761 | #endif | ||
1762 | 1624 | ||
1763 | /* Fix up syms, so that st_value is a pointer to location. */ | 1625 | /* Fix up syms, so that st_value is a pointer to location. */ |
1764 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, | 1626 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, |
@@ -1775,10 +1637,16 @@ static struct module *load_module(void __user *umod, | |||
1775 | mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; | 1637 | mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; |
1776 | if (gplcrcindex) | 1638 | if (gplcrcindex) |
1777 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; | 1639 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; |
1640 | mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / | ||
1641 | sizeof(*mod->gpl_future_syms); | ||
1642 | mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; | ||
1643 | if (gplfuturecrcindex) | ||
1644 | mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; | ||
1778 | 1645 | ||
1779 | #ifdef CONFIG_MODVERSIONS | 1646 | #ifdef CONFIG_MODVERSIONS |
1780 | if ((mod->num_syms && !crcindex) || | 1647 | if ((mod->num_syms && !crcindex) || |
1781 | (mod->num_gpl_syms && !gplcrcindex)) { | 1648 | (mod->num_gpl_syms && !gplcrcindex) || |
1649 | (mod->num_gpl_future_syms && !gplfuturecrcindex)) { | ||
1782 | printk(KERN_WARNING "%s: No versions for exported symbols." | 1650 | printk(KERN_WARNING "%s: No versions for exported symbols." |
1783 | " Tainting kernel.\n", mod->name); | 1651 | " Tainting kernel.\n", mod->name); |
1784 | add_taint(TAINT_FORCED_MODULE); | 1652 | add_taint(TAINT_FORCED_MODULE); |
@@ -1847,27 +1715,17 @@ static struct module *load_module(void __user *umod, | |||
1847 | set_fs(old_fs); | 1715 | set_fs(old_fs); |
1848 | 1716 | ||
1849 | mod->args = args; | 1717 | mod->args = args; |
1850 | if (obsparmindex) { | 1718 | if (obsparmindex) |
1851 | err = obsolete_params(mod->name, mod->args, | 1719 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", |
1852 | (struct obsolete_modparm *) | 1720 | mod->name); |
1853 | sechdrs[obsparmindex].sh_addr, | 1721 | |
1854 | sechdrs[obsparmindex].sh_size | 1722 | /* Size of section 0 is 0, so this works well if no params */ |
1855 | / sizeof(struct obsolete_modparm), | 1723 | err = parse_args(mod->name, mod->args, |
1856 | sechdrs, symindex, | 1724 | (struct kernel_param *) |
1857 | (char *)sechdrs[strindex].sh_addr); | 1725 | sechdrs[setupindex].sh_addr, |
1858 | if (setupindex) | 1726 | sechdrs[setupindex].sh_size |
1859 | printk(KERN_WARNING "%s: Ignoring new-style " | 1727 | / sizeof(struct kernel_param), |
1860 | "parameters in presence of obsolete ones\n", | 1728 | NULL); |
1861 | mod->name); | ||
1862 | } else { | ||
1863 | /* Size of section 0 is 0, so this works well if no params */ | ||
1864 | err = parse_args(mod->name, mod->args, | ||
1865 | (struct kernel_param *) | ||
1866 | sechdrs[setupindex].sh_addr, | ||
1867 | sechdrs[setupindex].sh_size | ||
1868 | / sizeof(struct kernel_param), | ||
1869 | NULL); | ||
1870 | } | ||
1871 | if (err < 0) | 1729 | if (err < 0) |
1872 | goto arch_cleanup; | 1730 | goto arch_cleanup; |
1873 | 1731 | ||
@@ -1933,13 +1791,13 @@ sys_init_module(void __user *umod, | |||
1933 | return -EPERM; | 1791 | return -EPERM; |
1934 | 1792 | ||
1935 | /* Only one module load at a time, please */ | 1793 | /* Only one module load at a time, please */ |
1936 | if (down_interruptible(&module_mutex) != 0) | 1794 | if (mutex_lock_interruptible(&module_mutex) != 0) |
1937 | return -EINTR; | 1795 | return -EINTR; |
1938 | 1796 | ||
1939 | /* Do all the hard work */ | 1797 | /* Do all the hard work */ |
1940 | mod = load_module(umod, len, uargs); | 1798 | mod = load_module(umod, len, uargs); |
1941 | if (IS_ERR(mod)) { | 1799 | if (IS_ERR(mod)) { |
1942 | up(&module_mutex); | 1800 | mutex_unlock(&module_mutex); |
1943 | return PTR_ERR(mod); | 1801 | return PTR_ERR(mod); |
1944 | } | 1802 | } |
1945 | 1803 | ||
@@ -1948,11 +1806,10 @@ sys_init_module(void __user *umod, | |||
1948 | stop_machine_run(__link_module, mod, NR_CPUS); | 1806 | stop_machine_run(__link_module, mod, NR_CPUS); |
1949 | 1807 | ||
1950 | /* Drop lock so they can recurse */ | 1808 | /* Drop lock so they can recurse */ |
1951 | up(&module_mutex); | 1809 | mutex_unlock(&module_mutex); |
1952 | 1810 | ||
1953 | down(¬ify_mutex); | 1811 | blocking_notifier_call_chain(&module_notify_list, |
1954 | notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); | 1812 | MODULE_STATE_COMING, mod); |
1955 | up(¬ify_mutex); | ||
1956 | 1813 | ||
1957 | /* Start the module */ | 1814 | /* Start the module */ |
1958 | if (mod->init != NULL) | 1815 | if (mod->init != NULL) |
@@ -1967,15 +1824,15 @@ sys_init_module(void __user *umod, | |||
1967 | mod->name); | 1824 | mod->name); |
1968 | else { | 1825 | else { |
1969 | module_put(mod); | 1826 | module_put(mod); |
1970 | down(&module_mutex); | 1827 | mutex_lock(&module_mutex); |
1971 | free_module(mod); | 1828 | free_module(mod); |
1972 | up(&module_mutex); | 1829 | mutex_unlock(&module_mutex); |
1973 | } | 1830 | } |
1974 | return ret; | 1831 | return ret; |
1975 | } | 1832 | } |
1976 | 1833 | ||
1977 | /* Now it's a first class citizen! */ | 1834 | /* Now it's a first class citizen! */ |
1978 | down(&module_mutex); | 1835 | mutex_lock(&module_mutex); |
1979 | mod->state = MODULE_STATE_LIVE; | 1836 | mod->state = MODULE_STATE_LIVE; |
1980 | /* Drop initial reference. */ | 1837 | /* Drop initial reference. */ |
1981 | module_put(mod); | 1838 | module_put(mod); |
@@ -1983,7 +1840,7 @@ sys_init_module(void __user *umod, | |||
1983 | mod->module_init = NULL; | 1840 | mod->module_init = NULL; |
1984 | mod->init_size = 0; | 1841 | mod->init_size = 0; |
1985 | mod->init_text_size = 0; | 1842 | mod->init_text_size = 0; |
1986 | up(&module_mutex); | 1843 | mutex_unlock(&module_mutex); |
1987 | 1844 | ||
1988 | return 0; | 1845 | return 0; |
1989 | } | 1846 | } |
@@ -2073,7 +1930,7 @@ struct module *module_get_kallsym(unsigned int symnum, | |||
2073 | { | 1930 | { |
2074 | struct module *mod; | 1931 | struct module *mod; |
2075 | 1932 | ||
2076 | down(&module_mutex); | 1933 | mutex_lock(&module_mutex); |
2077 | list_for_each_entry(mod, &modules, list) { | 1934 | list_for_each_entry(mod, &modules, list) { |
2078 | if (symnum < mod->num_symtab) { | 1935 | if (symnum < mod->num_symtab) { |
2079 | *value = mod->symtab[symnum].st_value; | 1936 | *value = mod->symtab[symnum].st_value; |
@@ -2081,12 +1938,12 @@ struct module *module_get_kallsym(unsigned int symnum, | |||
2081 | strncpy(namebuf, | 1938 | strncpy(namebuf, |
2082 | mod->strtab + mod->symtab[symnum].st_name, | 1939 | mod->strtab + mod->symtab[symnum].st_name, |
2083 | 127); | 1940 | 127); |
2084 | up(&module_mutex); | 1941 | mutex_unlock(&module_mutex); |
2085 | return mod; | 1942 | return mod; |
2086 | } | 1943 | } |
2087 | symnum -= mod->num_symtab; | 1944 | symnum -= mod->num_symtab; |
2088 | } | 1945 | } |
2089 | up(&module_mutex); | 1946 | mutex_unlock(&module_mutex); |
2090 | return NULL; | 1947 | return NULL; |
2091 | } | 1948 | } |
2092 | 1949 | ||
@@ -2129,7 +1986,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) | |||
2129 | struct list_head *i; | 1986 | struct list_head *i; |
2130 | loff_t n = 0; | 1987 | loff_t n = 0; |
2131 | 1988 | ||
2132 | down(&module_mutex); | 1989 | mutex_lock(&module_mutex); |
2133 | list_for_each(i, &modules) { | 1990 | list_for_each(i, &modules) { |
2134 | if (n++ == *pos) | 1991 | if (n++ == *pos) |
2135 | break; | 1992 | break; |
@@ -2150,7 +2007,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos) | |||
2150 | 2007 | ||
2151 | static void m_stop(struct seq_file *m, void *p) | 2008 | static void m_stop(struct seq_file *m, void *p) |
2152 | { | 2009 | { |
2153 | up(&module_mutex); | 2010 | mutex_unlock(&module_mutex); |
2154 | } | 2011 | } |
2155 | 2012 | ||
2156 | static int m_show(struct seq_file *m, void *p) | 2013 | static int m_show(struct seq_file *m, void *p) |
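The new __ksymtab_gpl_future handling above pairs with an EXPORT_SYMBOL_GPL_FUTURE() macro on the exporting side (assumed to land with this series). A sketch of a module using it, with a placeholder symbol:

#include <linux/module.h>

/*
 * Placeholder export: the symbol lands in __ksymtab_gpl_future, so GPL
 * modules resolve it silently while non-GPL modules still link today
 * but trigger the "will not be allowed in the future" warning that
 * __find_symbol() now prints.
 */
int example_do_work(int arg)
{
	return arg * 2;
}
EXPORT_SYMBOL_GPL_FUTURE(example_do_work);

MODULE_LICENSE("GPL");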
diff --git a/kernel/panic.c b/kernel/panic.c index c5c4ab2558..cc2a4c9c36 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -20,13 +20,15 @@ | |||
20 | #include <linux/nmi.h> | 20 | #include <linux/nmi.h> |
21 | #include <linux/kexec.h> | 21 | #include <linux/kexec.h> |
22 | 22 | ||
23 | int panic_timeout; | ||
24 | int panic_on_oops; | 23 | int panic_on_oops; |
25 | int tainted; | 24 | int tainted; |
25 | static int pause_on_oops; | ||
26 | static int pause_on_oops_flag; | ||
27 | static DEFINE_SPINLOCK(pause_on_oops_lock); | ||
26 | 28 | ||
27 | EXPORT_SYMBOL(panic_timeout); | 29 | int panic_timeout; |
28 | 30 | ||
29 | struct notifier_block *panic_notifier_list; | 31 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); |
30 | 32 | ||
31 | EXPORT_SYMBOL(panic_notifier_list); | 33 | EXPORT_SYMBOL(panic_notifier_list); |
32 | 34 | ||
@@ -94,7 +96,7 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
94 | smp_send_stop(); | 96 | smp_send_stop(); |
95 | #endif | 97 | #endif |
96 | 98 | ||
97 | notifier_call_chain(&panic_notifier_list, 0, buf); | 99 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); |
98 | 100 | ||
99 | if (!panic_blink) | 101 | if (!panic_blink) |
100 | panic_blink = no_blink; | 102 | panic_blink = no_blink; |
@@ -130,6 +132,7 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
130 | #endif | 132 | #endif |
131 | local_irq_enable(); | 133 | local_irq_enable(); |
132 | for (i = 0;;) { | 134 | for (i = 0;;) { |
135 | touch_softlockup_watchdog(); | ||
133 | i += panic_blink(i); | 136 | i += panic_blink(i); |
134 | mdelay(1); | 137 | mdelay(1); |
135 | i++; | 138 | i++; |
@@ -173,3 +176,95 @@ void add_taint(unsigned flag) | |||
173 | tainted |= flag; | 176 | tainted |= flag; |
174 | } | 177 | } |
175 | EXPORT_SYMBOL(add_taint); | 178 | EXPORT_SYMBOL(add_taint); |
179 | |||
180 | static int __init pause_on_oops_setup(char *str) | ||
181 | { | ||
182 | pause_on_oops = simple_strtoul(str, NULL, 0); | ||
183 | return 1; | ||
184 | } | ||
185 | __setup("pause_on_oops=", pause_on_oops_setup); | ||
186 | |||
187 | static void spin_msec(int msecs) | ||
188 | { | ||
189 | int i; | ||
190 | |||
191 | for (i = 0; i < msecs; i++) { | ||
192 | touch_nmi_watchdog(); | ||
193 | mdelay(1); | ||
194 | } | ||
195 | } | ||
196 | |||
197 | /* | ||
198 | * It just happens that oops_enter() and oops_exit() are identically | ||
199 | * implemented... | ||
200 | */ | ||
201 | static void do_oops_enter_exit(void) | ||
202 | { | ||
203 | unsigned long flags; | ||
204 | static int spin_counter; | ||
205 | |||
206 | if (!pause_on_oops) | ||
207 | return; | ||
208 | |||
209 | spin_lock_irqsave(&pause_on_oops_lock, flags); | ||
210 | if (pause_on_oops_flag == 0) { | ||
211 | /* This CPU may now print the oops message */ | ||
212 | pause_on_oops_flag = 1; | ||
213 | } else { | ||
214 | /* We need to stall this CPU */ | ||
215 | if (!spin_counter) { | ||
216 | /* This CPU gets to do the counting */ | ||
217 | spin_counter = pause_on_oops; | ||
218 | do { | ||
219 | spin_unlock(&pause_on_oops_lock); | ||
220 | spin_msec(MSEC_PER_SEC); | ||
221 | spin_lock(&pause_on_oops_lock); | ||
222 | } while (--spin_counter); | ||
223 | pause_on_oops_flag = 0; | ||
224 | } else { | ||
225 | /* This CPU waits for a different one */ | ||
226 | while (spin_counter) { | ||
227 | spin_unlock(&pause_on_oops_lock); | ||
228 | spin_msec(1); | ||
229 | spin_lock(&pause_on_oops_lock); | ||
230 | } | ||
231 | } | ||
232 | } | ||
233 | spin_unlock_irqrestore(&pause_on_oops_lock, flags); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Return true if the calling CPU is allowed to print oops-related info. This | ||
238 | * is a bit racy.. | ||
239 | */ | ||
240 | int oops_may_print(void) | ||
241 | { | ||
242 | return pause_on_oops_flag == 0; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Called when the architecture enters its oops handler, before it prints | ||
247 | * anything. If this is the first CPU to oops, and it's oopsing the first time | ||
248 | * then let it proceed. | ||
249 | * | ||
250 | * This is all enabled by the pause_on_oops kernel boot option. We do all this | ||
251 | * to ensure that oopses don't scroll off the screen. It has the side-effect | ||
252 | * of preventing later-oopsing CPUs from mucking up the display, too. | ||
253 | * | ||
254 | * It turns out that the CPU which is allowed to print ends up pausing for the | ||
255 | * right duration, whereas all the other CPUs pause for twice as long: once in | ||
256 | * oops_enter(), once in oops_exit(). | ||
257 | */ | ||
258 | void oops_enter(void) | ||
259 | { | ||
260 | do_oops_enter_exit(); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * Called when the architecture exits its oops handler, after printing | ||
265 | * everything. | ||
266 | */ | ||
267 | void oops_exit(void) | ||
268 | { | ||
269 | do_oops_enter_exit(); | ||
270 | } | ||
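
Besides the pause_on_oops machinery, the panic.c hunk turns panic_notifier_list into an ATOMIC_NOTIFIER_HEAD and invokes it with atomic_notifier_call_chain(). A hedged sketch of how a driver might hook the chain after this change; the example_* names and message are invented:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>

/* Runs in the atomic panic path with the formatted message in 'buf'; must not sleep. */
static int example_panic_event(struct notifier_block *nb,
			       unsigned long event, void *buf)
{
	printk(KERN_EMERG "example: panic: %s\n", (char *)buf);
	return NOTIFY_DONE;
}

static struct notifier_block example_panic_nb = {
	.notifier_call = example_panic_event,
};

static int __init example_panic_init(void)
{
	/* panic_notifier_list is now an atomic notifier head, so use the atomic API */
	atomic_notifier_chain_register(&panic_notifier_list, &example_panic_nb);
	return 0;
}
late_initcall(example_panic_init);
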
diff --git a/kernel/params.c b/kernel/params.c index c76ad25e6a..af43ecdc8d 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -31,7 +31,7 @@ | |||
31 | #define DEBUGP(fmt, a...) | 31 | #define DEBUGP(fmt, a...) |
32 | #endif | 32 | #endif |
33 | 33 | ||
34 | static inline int dash2underscore(char c) | 34 | static inline char dash2underscore(char c) |
35 | { | 35 | { |
36 | if (c == '-') | 36 | if (c == '-') |
37 | return '_'; | 37 | return '_'; |
@@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp) | |||
265 | } | 265 | } |
266 | 266 | ||
267 | /* We cheat here and temporarily mangle the string. */ | 267 | /* We cheat here and temporarily mangle the string. */ |
268 | int param_array(const char *name, | 268 | static int param_array(const char *name, |
269 | const char *val, | 269 | const char *val, |
270 | unsigned int min, unsigned int max, | 270 | unsigned int min, unsigned int max, |
271 | void *elem, int elemsize, | 271 | void *elem, int elemsize, |
272 | int (*set)(const char *, struct kernel_param *kp), | 272 | int (*set)(const char *, struct kernel_param *kp), |
273 | int *num) | 273 | int *num) |
274 | { | 274 | { |
275 | int ret; | 275 | int ret; |
276 | struct kernel_param kp; | 276 | struct kernel_param kp; |
@@ -638,13 +638,8 @@ static ssize_t module_attr_show(struct kobject *kobj, | |||
638 | if (!attribute->show) | 638 | if (!attribute->show) |
639 | return -EIO; | 639 | return -EIO; |
640 | 640 | ||
641 | if (!try_module_get(mk->mod)) | ||
642 | return -ENODEV; | ||
643 | |||
644 | ret = attribute->show(attribute, mk->mod, buf); | 641 | ret = attribute->show(attribute, mk->mod, buf); |
645 | 642 | ||
646 | module_put(mk->mod); | ||
647 | |||
648 | return ret; | 643 | return ret; |
649 | } | 644 | } |
650 | 645 | ||
@@ -662,13 +657,8 @@ static ssize_t module_attr_store(struct kobject *kobj, | |||
662 | if (!attribute->store) | 657 | if (!attribute->store) |
663 | return -EIO; | 658 | return -EIO; |
664 | 659 | ||
665 | if (!try_module_get(mk->mod)) | ||
666 | return -ENODEV; | ||
667 | |||
668 | ret = attribute->store(attribute, mk->mod, buf, len); | 660 | ret = attribute->store(attribute, mk->mod, buf, len); |
669 | 661 | ||
670 | module_put(mk->mod); | ||
671 | |||
672 | return ret; | 662 | return ret; |
673 | } | 663 | } |
674 | 664 | ||
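
The params.c hunks make param_array() static and give dash2underscore() the correct return type; the public entry point for array parameters remains module_param_array(), which ends up in param_array() for the comma-separated parsing. A small hedged sketch of a module using it (the "example" module and its parameter names are invented; check <linux/moduleparam.h> in the target tree for the exact form of the count argument):

#include <linux/module.h>
#include <linux/moduleparam.h>

static int channels[4];
static int nr_channels;
/* e.g. "modprobe example channels=3,5,7" fills channels[] = {3,5,7}, nr_channels = 3 */
module_param_array(channels, int, &nr_channels, 0444);
MODULE_PARM_DESC(channels, "up to four channel numbers (illustrative)");

static int __init example_init(void)
{
	int i;

	for (i = 0; i < nr_channels; i++)
		printk(KERN_INFO "example: channel %d\n", channels[i]);
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");
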
diff --git a/kernel/pid.c b/kernel/pid.c index 1acc072469..eeb836b65c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -28,8 +28,9 @@ | |||
28 | #include <linux/hash.h> | 28 | #include <linux/hash.h> |
29 | 29 | ||
30 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) | 30 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) |
31 | static struct hlist_head *pid_hash[PIDTYPE_MAX]; | 31 | static struct hlist_head *pid_hash; |
32 | static int pidhash_shift; | 32 | static int pidhash_shift; |
33 | static kmem_cache_t *pid_cachep; | ||
33 | 34 | ||
34 | int pid_max = PID_MAX_DEFAULT; | 35 | int pid_max = PID_MAX_DEFAULT; |
35 | int last_pid; | 36 | int last_pid; |
@@ -60,9 +61,22 @@ typedef struct pidmap { | |||
60 | static pidmap_t pidmap_array[PIDMAP_ENTRIES] = | 61 | static pidmap_t pidmap_array[PIDMAP_ENTRIES] = |
61 | { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; | 62 | { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; |
62 | 63 | ||
64 | /* | ||
65 | * Note: disable interrupts while the pidmap_lock is held as an | ||
66 | * interrupt might come in and do read_lock(&tasklist_lock). | ||
67 | * | ||
68 | * If we don't disable interrupts there is a nasty deadlock between | ||
69 | * detach_pid()->free_pid() and another cpu that does | ||
70 | * spin_lock(&pidmap_lock) followed by an interrupt routine that does | ||
71 | * read_lock(&tasklist_lock); | ||
72 | * | ||
73 | * After we clean up the tasklist_lock and know there are no | ||
74 | * irq handlers that take it we can leave the interrupts enabled. | ||
75 | * For now it is easier to be safe than to prove it can't happen. | ||
76 | */ | ||
63 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); | 77 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); |
64 | 78 | ||
65 | fastcall void free_pidmap(int pid) | 79 | static fastcall void free_pidmap(int pid) |
66 | { | 80 | { |
67 | pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; | 81 | pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; |
68 | int offset = pid & BITS_PER_PAGE_MASK; | 82 | int offset = pid & BITS_PER_PAGE_MASK; |
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid) | |||
71 | atomic_inc(&map->nr_free); | 85 | atomic_inc(&map->nr_free); |
72 | } | 86 | } |
73 | 87 | ||
74 | int alloc_pidmap(void) | 88 | static int alloc_pidmap(void) |
75 | { | 89 | { |
76 | int i, offset, max_scan, pid, last = last_pid; | 90 | int i, offset, max_scan, pid, last = last_pid; |
77 | pidmap_t *map; | 91 | pidmap_t *map; |
@@ -89,12 +103,12 @@ int alloc_pidmap(void) | |||
89 | * Free the page if someone raced with us | 103 | * Free the page if someone raced with us |
90 | * installing it: | 104 | * installing it: |
91 | */ | 105 | */ |
92 | spin_lock(&pidmap_lock); | 106 | spin_lock_irq(&pidmap_lock); |
93 | if (map->page) | 107 | if (map->page) |
94 | free_page(page); | 108 | free_page(page); |
95 | else | 109 | else |
96 | map->page = (void *)page; | 110 | map->page = (void *)page; |
97 | spin_unlock(&pidmap_lock); | 111 | spin_unlock_irq(&pidmap_lock); |
98 | if (unlikely(!map->page)) | 112 | if (unlikely(!map->page)) |
99 | break; | 113 | break; |
100 | } | 114 | } |
@@ -131,13 +145,73 @@ int alloc_pidmap(void) | |||
131 | return -1; | 145 | return -1; |
132 | } | 146 | } |
133 | 147 | ||
134 | struct pid * fastcall find_pid(enum pid_type type, int nr) | 148 | fastcall void put_pid(struct pid *pid) |
149 | { | ||
150 | if (!pid) | ||
151 | return; | ||
152 | if ((atomic_read(&pid->count) == 1) || | ||
153 | atomic_dec_and_test(&pid->count)) | ||
154 | kmem_cache_free(pid_cachep, pid); | ||
155 | } | ||
156 | |||
157 | static void delayed_put_pid(struct rcu_head *rhp) | ||
158 | { | ||
159 | struct pid *pid = container_of(rhp, struct pid, rcu); | ||
160 | put_pid(pid); | ||
161 | } | ||
162 | |||
163 | fastcall void free_pid(struct pid *pid) | ||
164 | { | ||
165 | /* We can be called with write_lock_irq(&tasklist_lock) held */ | ||
166 | unsigned long flags; | ||
167 | |||
168 | spin_lock_irqsave(&pidmap_lock, flags); | ||
169 | hlist_del_rcu(&pid->pid_chain); | ||
170 | spin_unlock_irqrestore(&pidmap_lock, flags); | ||
171 | |||
172 | free_pidmap(pid->nr); | ||
173 | call_rcu(&pid->rcu, delayed_put_pid); | ||
174 | } | ||
175 | |||
176 | struct pid *alloc_pid(void) | ||
177 | { | ||
178 | struct pid *pid; | ||
179 | enum pid_type type; | ||
180 | int nr = -1; | ||
181 | |||
182 | pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL); | ||
183 | if (!pid) | ||
184 | goto out; | ||
185 | |||
186 | nr = alloc_pidmap(); | ||
187 | if (nr < 0) | ||
188 | goto out_free; | ||
189 | |||
190 | atomic_set(&pid->count, 1); | ||
191 | pid->nr = nr; | ||
192 | for (type = 0; type < PIDTYPE_MAX; ++type) | ||
193 | INIT_HLIST_HEAD(&pid->tasks[type]); | ||
194 | |||
195 | spin_lock_irq(&pidmap_lock); | ||
196 | hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]); | ||
197 | spin_unlock_irq(&pidmap_lock); | ||
198 | |||
199 | out: | ||
200 | return pid; | ||
201 | |||
202 | out_free: | ||
203 | kmem_cache_free(pid_cachep, pid); | ||
204 | pid = NULL; | ||
205 | goto out; | ||
206 | } | ||
207 | |||
208 | struct pid * fastcall find_pid(int nr) | ||
135 | { | 209 | { |
136 | struct hlist_node *elem; | 210 | struct hlist_node *elem; |
137 | struct pid *pid; | 211 | struct pid *pid; |
138 | 212 | ||
139 | hlist_for_each_entry_rcu(pid, elem, | 213 | hlist_for_each_entry_rcu(pid, elem, |
140 | &pid_hash[type][pid_hashfn(nr)], pid_chain) { | 214 | &pid_hash[pid_hashfn(nr)], pid_chain) { |
141 | if (pid->nr == nr) | 215 | if (pid->nr == nr) |
142 | return pid; | 216 | return pid; |
143 | } | 217 | } |
@@ -146,105 +220,80 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) | |||
146 | 220 | ||
147 | int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | 221 | int fastcall attach_pid(task_t *task, enum pid_type type, int nr) |
148 | { | 222 | { |
149 | struct pid *pid, *task_pid; | 223 | struct pid_link *link; |
150 | 224 | struct pid *pid; | |
151 | task_pid = &task->pids[type]; | ||
152 | pid = find_pid(type, nr); | ||
153 | task_pid->nr = nr; | ||
154 | if (pid == NULL) { | ||
155 | INIT_LIST_HEAD(&task_pid->pid_list); | ||
156 | hlist_add_head_rcu(&task_pid->pid_chain, | ||
157 | &pid_hash[type][pid_hashfn(nr)]); | ||
158 | } else { | ||
159 | INIT_HLIST_NODE(&task_pid->pid_chain); | ||
160 | list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); | ||
161 | } | ||
162 | |||
163 | return 0; | ||
164 | } | ||
165 | |||
166 | static fastcall int __detach_pid(task_t *task, enum pid_type type) | ||
167 | { | ||
168 | struct pid *pid, *pid_next; | ||
169 | int nr = 0; | ||
170 | |||
171 | pid = &task->pids[type]; | ||
172 | if (!hlist_unhashed(&pid->pid_chain)) { | ||
173 | 225 | ||
174 | if (list_empty(&pid->pid_list)) { | 226 | WARN_ON(!task->pid); /* to be removed soon */ |
175 | nr = pid->nr; | 227 | WARN_ON(!nr); /* to be removed soon */ |
176 | hlist_del_rcu(&pid->pid_chain); | ||
177 | } else { | ||
178 | pid_next = list_entry(pid->pid_list.next, | ||
179 | struct pid, pid_list); | ||
180 | /* insert next pid from pid_list to hash */ | ||
181 | hlist_replace_rcu(&pid->pid_chain, | ||
182 | &pid_next->pid_chain); | ||
183 | } | ||
184 | } | ||
185 | 228 | ||
186 | list_del_rcu(&pid->pid_list); | 229 | link = &task->pids[type]; |
187 | pid->nr = 0; | 230 | link->pid = pid = find_pid(nr); |
231 | hlist_add_head_rcu(&link->node, &pid->tasks[type]); | ||
188 | 232 | ||
189 | return nr; | 233 | return 0; |
190 | } | 234 | } |
191 | 235 | ||
192 | void fastcall detach_pid(task_t *task, enum pid_type type) | 236 | void fastcall detach_pid(task_t *task, enum pid_type type) |
193 | { | 237 | { |
194 | int tmp, nr; | 238 | struct pid_link *link; |
239 | struct pid *pid; | ||
240 | int tmp; | ||
195 | 241 | ||
196 | nr = __detach_pid(task, type); | 242 | link = &task->pids[type]; |
197 | if (!nr) | 243 | pid = link->pid; |
198 | return; | 244 | |
245 | hlist_del_rcu(&link->node); | ||
246 | link->pid = NULL; | ||
199 | 247 | ||
200 | for (tmp = PIDTYPE_MAX; --tmp >= 0; ) | 248 | for (tmp = PIDTYPE_MAX; --tmp >= 0; ) |
201 | if (tmp != type && find_pid(tmp, nr)) | 249 | if (!hlist_empty(&pid->tasks[tmp])) |
202 | return; | 250 | return; |
203 | 251 | ||
204 | free_pidmap(nr); | 252 | free_pid(pid); |
205 | } | 253 | } |
206 | 254 | ||
207 | task_t *find_task_by_pid_type(int type, int nr) | 255 | struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) |
208 | { | 256 | { |
209 | struct pid *pid; | 257 | struct task_struct *result = NULL; |
210 | 258 | if (pid) { | |
211 | pid = find_pid(type, nr); | 259 | struct hlist_node *first; |
212 | if (!pid) | 260 | first = rcu_dereference(pid->tasks[type].first); |
213 | return NULL; | 261 | if (first) |
262 | result = hlist_entry(first, struct task_struct, pids[(type)].node); | ||
263 | } | ||
264 | return result; | ||
265 | } | ||
214 | 266 | ||
215 | return pid_task(&pid->pid_list, type); | 267 | /* |
268 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | ||
269 | */ | ||
270 | task_t *find_task_by_pid_type(int type, int nr) | ||
271 | { | ||
272 | return pid_task(find_pid(nr), type); | ||
216 | } | 273 | } |
217 | 274 | ||
218 | EXPORT_SYMBOL(find_task_by_pid_type); | 275 | EXPORT_SYMBOL(find_task_by_pid_type); |
219 | 276 | ||
220 | /* | 277 | struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) |
221 | * This function switches the PIDs if a non-leader thread calls | 278 | { |
222 | * sys_execve() - this must be done without releasing the PID. | 279 | struct task_struct *result; |
223 | * (which a detach_pid() would eventually do.) | 280 | rcu_read_lock(); |
224 | */ | 281 | result = pid_task(pid, type); |
225 | void switch_exec_pids(task_t *leader, task_t *thread) | 282 | if (result) |
283 | get_task_struct(result); | ||
284 | rcu_read_unlock(); | ||
285 | return result; | ||
286 | } | ||
287 | |||
288 | struct pid *find_get_pid(pid_t nr) | ||
226 | { | 289 | { |
227 | __detach_pid(leader, PIDTYPE_PID); | 290 | struct pid *pid; |
228 | __detach_pid(leader, PIDTYPE_TGID); | 291 | |
229 | __detach_pid(leader, PIDTYPE_PGID); | 292 | rcu_read_lock(); |
230 | __detach_pid(leader, PIDTYPE_SID); | 293 | pid = get_pid(find_pid(nr)); |
231 | 294 | rcu_read_unlock(); | |
232 | __detach_pid(thread, PIDTYPE_PID); | 295 | |
233 | __detach_pid(thread, PIDTYPE_TGID); | 296 | return pid; |
234 | |||
235 | leader->pid = leader->tgid = thread->pid; | ||
236 | thread->pid = thread->tgid; | ||
237 | |||
238 | attach_pid(thread, PIDTYPE_PID, thread->pid); | ||
239 | attach_pid(thread, PIDTYPE_TGID, thread->tgid); | ||
240 | attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); | ||
241 | attach_pid(thread, PIDTYPE_SID, thread->signal->session); | ||
242 | list_add_tail(&thread->tasks, &init_task.tasks); | ||
243 | |||
244 | attach_pid(leader, PIDTYPE_PID, leader->pid); | ||
245 | attach_pid(leader, PIDTYPE_TGID, leader->tgid); | ||
246 | attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp); | ||
247 | attach_pid(leader, PIDTYPE_SID, leader->signal->session); | ||
248 | } | 297 | } |
249 | 298 | ||
250 | /* | 299 | /* |
@@ -254,7 +303,7 @@ void switch_exec_pids(task_t *leader, task_t *thread) | |||
254 | */ | 303 | */ |
255 | void __init pidhash_init(void) | 304 | void __init pidhash_init(void) |
256 | { | 305 | { |
257 | int i, j, pidhash_size; | 306 | int i, pidhash_size; |
258 | unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); | 307 | unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); |
259 | 308 | ||
260 | pidhash_shift = max(4, fls(megabytes * 4)); | 309 | pidhash_shift = max(4, fls(megabytes * 4)); |
@@ -263,30 +312,23 @@ void __init pidhash_init(void) | |||
263 | 312 | ||
264 | printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", | 313 | printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", |
265 | pidhash_size, pidhash_shift, | 314 | pidhash_size, pidhash_shift, |
266 | PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); | 315 | pidhash_size * sizeof(struct hlist_head)); |
267 | 316 | ||
268 | for (i = 0; i < PIDTYPE_MAX; i++) { | 317 | pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); |
269 | pid_hash[i] = alloc_bootmem(pidhash_size * | 318 | if (!pid_hash) |
270 | sizeof(*(pid_hash[i]))); | 319 | panic("Could not alloc pidhash!\n"); |
271 | if (!pid_hash[i]) | 320 | for (i = 0; i < pidhash_size; i++) |
272 | panic("Could not alloc pidhash!\n"); | 321 | INIT_HLIST_HEAD(&pid_hash[i]); |
273 | for (j = 0; j < pidhash_size; j++) | ||
274 | INIT_HLIST_HEAD(&pid_hash[i][j]); | ||
275 | } | ||
276 | } | 322 | } |
277 | 323 | ||
278 | void __init pidmap_init(void) | 324 | void __init pidmap_init(void) |
279 | { | 325 | { |
280 | int i; | ||
281 | |||
282 | pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); | 326 | pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); |
327 | /* Reserve PID 0. We never call free_pidmap(0) */ | ||
283 | set_bit(0, pidmap_array->page); | 328 | set_bit(0, pidmap_array->page); |
284 | atomic_dec(&pidmap_array->nr_free); | 329 | atomic_dec(&pidmap_array->nr_free); |
285 | 330 | ||
286 | /* | 331 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), |
287 | * Allocate PID 0, and hash it via all PID types: | 332 | __alignof__(struct pid), |
288 | */ | 333 | SLAB_PANIC, NULL, NULL); |
289 | |||
290 | for (i = 0; i < PIDTYPE_MAX; i++) | ||
291 | attach_pid(current, i, 0); | ||
292 | } | 334 | } |
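
The pid.c rewrite makes struct pid a stand-alone, reference-counted object: alloc_pid()/free_pid() manage its lifetime, a single hash replaces the per-PIDTYPE hashes, and lookups go through find_get_pid(), pid_task() and get_pid_task(). A hedged sketch of the lookup side as a caller might use the new API; the helper name is invented:

#include <linux/pid.h>
#include <linux/sched.h>

/* Resolve a numeric pid to a task without holding tasklist_lock. Illustrative only. */
static struct task_struct *example_lookup(pid_t nr)
{
	struct pid *pid;
	struct task_struct *task;

	pid = find_get_pid(nr);			/* takes a reference on struct pid */
	if (!pid)
		return NULL;
	task = get_pid_task(pid, PIDTYPE_PID);	/* reference on the task, may be NULL */
	put_pid(pid);				/* drop the struct pid reference */
	return task;				/* caller must put_task_struct() when done */
}
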
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 216f574b5f..ac6dc87444 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/interrupt.h> | 35 | #include <linux/interrupt.h> |
36 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
37 | #include <linux/time.h> | 37 | #include <linux/time.h> |
38 | #include <linux/mutex.h> | ||
38 | 39 | ||
39 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
40 | #include <asm/semaphore.h> | 41 | #include <asm/semaphore.h> |
@@ -144,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int, | |||
144 | struct itimerspec *, struct itimerspec *); | 145 | struct itimerspec *, struct itimerspec *); |
145 | static int common_timer_del(struct k_itimer *timer); | 146 | static int common_timer_del(struct k_itimer *timer); |
146 | 147 | ||
147 | static int posix_timer_fn(void *data); | 148 | static int posix_timer_fn(struct hrtimer *data); |
148 | 149 | ||
149 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 150 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); |
150 | 151 | ||
@@ -250,15 +251,18 @@ __initcall(init_posix_timers); | |||
250 | 251 | ||
251 | static void schedule_next_timer(struct k_itimer *timr) | 252 | static void schedule_next_timer(struct k_itimer *timr) |
252 | { | 253 | { |
254 | struct hrtimer *timer = &timr->it.real.timer; | ||
255 | |||
253 | if (timr->it.real.interval.tv64 == 0) | 256 | if (timr->it.real.interval.tv64 == 0) |
254 | return; | 257 | return; |
255 | 258 | ||
256 | timr->it_overrun += hrtimer_forward(&timr->it.real.timer, | 259 | timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), |
257 | timr->it.real.interval); | 260 | timr->it.real.interval); |
261 | |||
258 | timr->it_overrun_last = timr->it_overrun; | 262 | timr->it_overrun_last = timr->it_overrun; |
259 | timr->it_overrun = -1; | 263 | timr->it_overrun = -1; |
260 | ++timr->it_requeue_pending; | 264 | ++timr->it_requeue_pending; |
261 | hrtimer_restart(&timr->it.real.timer); | 265 | hrtimer_restart(timer); |
262 | } | 266 | } |
263 | 267 | ||
264 | /* | 268 | /* |
@@ -330,13 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event); | |||
330 | 334 | ||
331 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. | 335 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. |
332 | */ | 336 | */ |
333 | static int posix_timer_fn(void *data) | 337 | static int posix_timer_fn(struct hrtimer *timer) |
334 | { | 338 | { |
335 | struct k_itimer *timr = data; | 339 | struct k_itimer *timr; |
336 | unsigned long flags; | 340 | unsigned long flags; |
337 | int si_private = 0; | 341 | int si_private = 0; |
338 | int ret = HRTIMER_NORESTART; | 342 | int ret = HRTIMER_NORESTART; |
339 | 343 | ||
344 | timr = container_of(timer, struct k_itimer, it.real.timer); | ||
340 | spin_lock_irqsave(&timr->it_lock, flags); | 345 | spin_lock_irqsave(&timr->it_lock, flags); |
341 | 346 | ||
342 | if (timr->it.real.interval.tv64 != 0) | 347 | if (timr->it.real.interval.tv64 != 0) |
@@ -350,9 +355,11 @@ static int posix_timer_fn(void *data) | |||
350 | */ | 355 | */ |
351 | if (timr->it.real.interval.tv64 != 0) { | 356 | if (timr->it.real.interval.tv64 != 0) { |
352 | timr->it_overrun += | 357 | timr->it_overrun += |
353 | hrtimer_forward(&timr->it.real.timer, | 358 | hrtimer_forward(timer, |
359 | timer->base->softirq_time, | ||
354 | timr->it.real.interval); | 360 | timr->it.real.interval); |
355 | ret = HRTIMER_RESTART; | 361 | ret = HRTIMER_RESTART; |
362 | ++timr->it_requeue_pending; | ||
356 | } | 363 | } |
357 | } | 364 | } |
358 | 365 | ||
@@ -601,38 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) | |||
601 | static void | 608 | static void |
602 | common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | 609 | common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) |
603 | { | 610 | { |
604 | ktime_t remaining; | 611 | ktime_t now, remaining, iv; |
605 | struct hrtimer *timer = &timr->it.real.timer; | 612 | struct hrtimer *timer = &timr->it.real.timer; |
606 | 613 | ||
607 | memset(cur_setting, 0, sizeof(struct itimerspec)); | 614 | memset(cur_setting, 0, sizeof(struct itimerspec)); |
608 | remaining = hrtimer_get_remaining(timer); | ||
609 | 615 | ||
610 | /* Time left ? or timer pending */ | 616 | iv = timr->it.real.interval; |
611 | if (remaining.tv64 > 0 || hrtimer_active(timer)) | 617 | |
612 | goto calci; | ||
613 | /* interval timer ? */ | 618 | /* interval timer ? */ |
614 | if (timr->it.real.interval.tv64 == 0) | 619 | if (iv.tv64) |
620 | cur_setting->it_interval = ktime_to_timespec(iv); | ||
621 | else if (!hrtimer_active(timer) && | ||
622 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) | ||
615 | return; | 623 | return; |
624 | |||
625 | now = timer->base->get_time(); | ||
626 | |||
616 | /* | 627 | /* |
617 | * When a requeue is pending or this is a SIGEV_NONE timer | 628 | * When a requeue is pending or this is a SIGEV_NONE |
618 | * move the expiry time forward by intervals, so expiry is > | 629 | * timer move the expiry time forward by intervals, so |
619 | * now. | 630 | * expiry is > now. |
620 | */ | 631 | */ |
621 | if (timr->it_requeue_pending & REQUEUE_PENDING || | 632 | if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || |
622 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { | 633 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) |
623 | timr->it_overrun += | 634 | timr->it_overrun += hrtimer_forward(timer, now, iv); |
624 | hrtimer_forward(timer, timr->it.real.interval); | 635 | |
625 | remaining = hrtimer_get_remaining(timer); | 636 | remaining = ktime_sub(timer->expires, now); |
626 | } | ||
627 | calci: | ||
628 | /* interval timer ? */ | ||
629 | if (timr->it.real.interval.tv64 != 0) | ||
630 | cur_setting->it_interval = | ||
631 | ktime_to_timespec(timr->it.real.interval); | ||
632 | /* Return 0 only, when the timer is expired and not pending */ | 637 | /* Return 0 only, when the timer is expired and not pending */ |
633 | if (remaining.tv64 <= 0) | 638 | if (remaining.tv64 <= 0) { |
634 | cur_setting->it_value.tv_nsec = 1; | 639 | /* |
635 | else | 640 | * A single shot SIGEV_NONE timer must return 0, when |
641 | * it is expired ! | ||
642 | */ | ||
643 | if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) | ||
644 | cur_setting->it_value.tv_nsec = 1; | ||
645 | } else | ||
636 | cur_setting->it_value = ktime_to_timespec(remaining); | 646 | cur_setting->it_value = ktime_to_timespec(remaining); |
637 | } | 647 | } |
638 | 648 | ||
@@ -715,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
715 | 725 | ||
716 | mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; | 726 | mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; |
717 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); | 727 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); |
718 | timr->it.real.timer.data = timr; | ||
719 | timr->it.real.timer.function = posix_timer_fn; | 728 | timr->it.real.timer.function = posix_timer_fn; |
720 | 729 | ||
721 | timer->expires = timespec_to_ktime(new_setting->it_value); | 730 | timer->expires = timespec_to_ktime(new_setting->it_value); |
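
The posix-timers.c changes track the new hrtimer callback convention: the handler receives the struct hrtimer itself and recovers its enclosing object with container_of() instead of a void *data cookie, and hrtimer_forward() now takes the current time explicitly. A minimal sketch of that pattern with invented names:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

struct example_timer {
	struct hrtimer	timer;
	ktime_t		period;
	int		fired;
};

/* New-style callback: no ->data field, the hrtimer pointer is the handle. */
static int example_timer_fn(struct hrtimer *timer)
{
	struct example_timer *et = container_of(timer, struct example_timer, timer);

	et->fired++;
	/* Re-arm a periodic timer the same way schedule_next_timer() does. */
	hrtimer_forward(timer, timer->base->get_time(), et->period);
	return HRTIMER_RESTART;
}

After hrtimer_init() the owner would arm it with hrtimer_start(&et->timer, et->period, HRTIMER_REL), matching the HRTIMER_ABS/HRTIMER_REL modes used in common_timer_set() above.
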
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9fd8d4f035..ce0dfb8f4a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -41,7 +41,7 @@ config SOFTWARE_SUSPEND | |||
41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) | 41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) |
42 | ---help--- | 42 | ---help--- |
43 | Enable the possibility of suspending the machine. | 43 | Enable the possibility of suspending the machine. |
44 | It doesn't need APM. | 44 | It doesn't need ACPI or APM. |
45 | You may suspend your machine by 'swsusp' or 'shutdown -z <time>' | 45 | You may suspend your machine by 'swsusp' or 'shutdown -z <time>' |
46 | (patch for sysvinit needed). | 46 | (patch for sysvinit needed). |
47 | 47 | ||
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 04be7d0d96..8d0af3d37a 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -5,7 +5,7 @@ endif | |||
5 | 5 | ||
6 | obj-y := main.o process.o console.o | 6 | obj-y := main.o process.o console.o |
7 | obj-$(CONFIG_PM_LEGACY) += pm.o | 7 | obj-$(CONFIG_PM_LEGACY) += pm.o |
8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o | 8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o |
9 | 9 | ||
10 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | 10 | obj-$(CONFIG_SUSPEND_SMP) += smp.o |
11 | 11 | ||
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0b43847dc9..81d4d982f3 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -22,17 +22,6 @@ | |||
22 | #include "power.h" | 22 | #include "power.h" |
23 | 23 | ||
24 | 24 | ||
25 | extern suspend_disk_method_t pm_disk_mode; | ||
26 | |||
27 | extern int swsusp_shrink_memory(void); | ||
28 | extern int swsusp_suspend(void); | ||
29 | extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); | ||
30 | extern int swsusp_check(void); | ||
31 | extern int swsusp_read(struct pbe **pblist_ptr); | ||
32 | extern void swsusp_close(void); | ||
33 | extern int swsusp_resume(void); | ||
34 | |||
35 | |||
36 | static int noresume = 0; | 25 | static int noresume = 0; |
37 | char resume_file[256] = CONFIG_PM_STD_PARTITION; | 26 | char resume_file[256] = CONFIG_PM_STD_PARTITION; |
38 | dev_t swsusp_resume_device; | 27 | dev_t swsusp_resume_device; |
@@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode) | |||
70 | while(1); | 59 | while(1); |
71 | } | 60 | } |
72 | 61 | ||
73 | |||
74 | static int in_suspend __nosavedata = 0; | ||
75 | |||
76 | |||
77 | static inline void platform_finish(void) | 62 | static inline void platform_finish(void) |
78 | { | 63 | { |
79 | if (pm_disk_mode == PM_DISK_PLATFORM) { | 64 | if (pm_disk_mode == PM_DISK_PLATFORM) { |
@@ -87,7 +72,6 @@ static int prepare_processes(void) | |||
87 | int error; | 72 | int error; |
88 | 73 | ||
89 | pm_prepare_console(); | 74 | pm_prepare_console(); |
90 | sys_sync(); | ||
91 | disable_nonboot_cpus(); | 75 | disable_nonboot_cpus(); |
92 | 76 | ||
93 | if (freeze_processes()) { | 77 | if (freeze_processes()) { |
@@ -145,7 +129,7 @@ int pm_suspend_disk(void) | |||
145 | if (in_suspend) { | 129 | if (in_suspend) { |
146 | device_resume(); | 130 | device_resume(); |
147 | pr_debug("PM: writing image.\n"); | 131 | pr_debug("PM: writing image.\n"); |
148 | error = swsusp_write(pagedir_nosave, nr_copy_pages); | 132 | error = swsusp_write(); |
149 | if (!error) | 133 | if (!error) |
150 | power_down(pm_disk_mode); | 134 | power_down(pm_disk_mode); |
151 | else { | 135 | else { |
@@ -216,7 +200,7 @@ static int software_resume(void) | |||
216 | 200 | ||
217 | pr_debug("PM: Reading swsusp image.\n"); | 201 | pr_debug("PM: Reading swsusp image.\n"); |
218 | 202 | ||
219 | if ((error = swsusp_read(&pagedir_nosave))) { | 203 | if ((error = swsusp_read())) { |
220 | swsusp_free(); | 204 | swsusp_free(); |
221 | goto Thaw; | 205 | goto Thaw; |
222 | } | 206 | } |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 9cb235cba4..a6d9ef4600 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state) | |||
103 | } | 103 | } |
104 | 104 | ||
105 | 105 | ||
106 | static int suspend_enter(suspend_state_t state) | 106 | int suspend_enter(suspend_state_t state) |
107 | { | 107 | { |
108 | int error = 0; | 108 | int error = 0; |
109 | unsigned long flags; | 109 | unsigned long flags; |
@@ -272,7 +272,7 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n | |||
272 | if (*s && !strncmp(buf, *s, len)) | 272 | if (*s && !strncmp(buf, *s, len)) |
273 | break; | 273 | break; |
274 | } | 274 | } |
275 | if (*s) | 275 | if (state < PM_SUSPEND_MAX && *s) |
276 | error = enter_state(state); | 276 | error = enter_state(state); |
277 | else | 277 | else |
278 | error = -EINVAL; | 278 | error = -EINVAL; |
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 33c508e857..84063ac8fc 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/pm.h> | 25 | #include <linux/pm.h> |
26 | #include <linux/pm_legacy.h> | 26 | #include <linux/pm_legacy.h> |
27 | #include <linux/interrupt.h> | 27 | #include <linux/interrupt.h> |
28 | #include <linux/mutex.h> | ||
28 | 29 | ||
29 | int pm_active; | 30 | int pm_active; |
30 | 31 | ||
@@ -40,7 +41,7 @@ int pm_active; | |||
40 | * until a resume but that will be fine. | 41 | * until a resume but that will be fine. |
41 | */ | 42 | */ |
42 | 43 | ||
43 | static DECLARE_MUTEX(pm_devs_lock); | 44 | static DEFINE_MUTEX(pm_devs_lock); |
44 | static LIST_HEAD(pm_devs); | 45 | static LIST_HEAD(pm_devs); |
45 | 46 | ||
46 | /** | 47 | /** |
@@ -67,32 +68,13 @@ struct pm_dev *pm_register(pm_dev_t type, | |||
67 | dev->id = id; | 68 | dev->id = id; |
68 | dev->callback = callback; | 69 | dev->callback = callback; |
69 | 70 | ||
70 | down(&pm_devs_lock); | 71 | mutex_lock(&pm_devs_lock); |
71 | list_add(&dev->entry, &pm_devs); | 72 | list_add(&dev->entry, &pm_devs); |
72 | up(&pm_devs_lock); | 73 | mutex_unlock(&pm_devs_lock); |
73 | } | 74 | } |
74 | return dev; | 75 | return dev; |
75 | } | 76 | } |
76 | 77 | ||
77 | /** | ||
78 | * pm_unregister - unregister a device with power management | ||
79 | * @dev: device to unregister | ||
80 | * | ||
81 | * Remove a device from the power management notification lists. The | ||
82 | * dev passed must be a handle previously returned by pm_register. | ||
83 | */ | ||
84 | |||
85 | void pm_unregister(struct pm_dev *dev) | ||
86 | { | ||
87 | if (dev) { | ||
88 | down(&pm_devs_lock); | ||
89 | list_del(&dev->entry); | ||
90 | up(&pm_devs_lock); | ||
91 | |||
92 | kfree(dev); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | static void __pm_unregister(struct pm_dev *dev) | 78 | static void __pm_unregister(struct pm_dev *dev) |
97 | { | 79 | { |
98 | if (dev) { | 80 | if (dev) { |
@@ -118,7 +100,7 @@ void pm_unregister_all(pm_callback callback) | |||
118 | if (!callback) | 100 | if (!callback) |
119 | return; | 101 | return; |
120 | 102 | ||
121 | down(&pm_devs_lock); | 103 | mutex_lock(&pm_devs_lock); |
122 | entry = pm_devs.next; | 104 | entry = pm_devs.next; |
123 | while (entry != &pm_devs) { | 105 | while (entry != &pm_devs) { |
124 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | 106 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); |
@@ -126,7 +108,7 @@ void pm_unregister_all(pm_callback callback) | |||
126 | if (dev->callback == callback) | 108 | if (dev->callback == callback) |
127 | __pm_unregister(dev); | 109 | __pm_unregister(dev); |
128 | } | 110 | } |
129 | up(&pm_devs_lock); | 111 | mutex_unlock(&pm_devs_lock); |
130 | } | 112 | } |
131 | 113 | ||
132 | /** | 114 | /** |
@@ -234,7 +216,7 @@ int pm_send_all(pm_request_t rqst, void *data) | |||
234 | { | 216 | { |
235 | struct list_head *entry; | 217 | struct list_head *entry; |
236 | 218 | ||
237 | down(&pm_devs_lock); | 219 | mutex_lock(&pm_devs_lock); |
238 | entry = pm_devs.next; | 220 | entry = pm_devs.next; |
239 | while (entry != &pm_devs) { | 221 | while (entry != &pm_devs) { |
240 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | 222 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); |
@@ -246,18 +228,17 @@ int pm_send_all(pm_request_t rqst, void *data) | |||
246 | */ | 228 | */ |
247 | if (rqst == PM_SUSPEND) | 229 | if (rqst == PM_SUSPEND) |
248 | pm_undo_all(dev); | 230 | pm_undo_all(dev); |
249 | up(&pm_devs_lock); | 231 | mutex_unlock(&pm_devs_lock); |
250 | return status; | 232 | return status; |
251 | } | 233 | } |
252 | } | 234 | } |
253 | entry = entry->next; | 235 | entry = entry->next; |
254 | } | 236 | } |
255 | up(&pm_devs_lock); | 237 | mutex_unlock(&pm_devs_lock); |
256 | return 0; | 238 | return 0; |
257 | } | 239 | } |
258 | 240 | ||
259 | EXPORT_SYMBOL(pm_register); | 241 | EXPORT_SYMBOL(pm_register); |
260 | EXPORT_SYMBOL(pm_unregister); | ||
261 | EXPORT_SYMBOL(pm_unregister_all); | 242 | EXPORT_SYMBOL(pm_unregister_all); |
262 | EXPORT_SYMBOL(pm_send_all); | 243 | EXPORT_SYMBOL(pm_send_all); |
263 | EXPORT_SYMBOL(pm_active); | 244 | EXPORT_SYMBOL(pm_active); |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 388dba6808..f06f12f217 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -8,6 +8,7 @@ struct swsusp_info { | |||
8 | int cpus; | 8 | int cpus; |
9 | unsigned long image_pages; | 9 | unsigned long image_pages; |
10 | unsigned long pages; | 10 | unsigned long pages; |
11 | unsigned long size; | ||
11 | } __attribute__((aligned(PAGE_SIZE))); | 12 | } __attribute__((aligned(PAGE_SIZE))); |
12 | 13 | ||
13 | 14 | ||
@@ -37,21 +38,79 @@ extern struct subsystem power_subsys; | |||
37 | /* References to section boundaries */ | 38 | /* References to section boundaries */ |
38 | extern const void __nosave_begin, __nosave_end; | 39 | extern const void __nosave_begin, __nosave_end; |
39 | 40 | ||
40 | extern unsigned int nr_copy_pages; | ||
41 | extern struct pbe *pagedir_nosave; | 41 | extern struct pbe *pagedir_nosave; |
42 | 42 | ||
43 | /* Preferred image size in bytes (default 500 MB) */ | 43 | /* Preferred image size in bytes (default 500 MB) */ |
44 | extern unsigned long image_size; | 44 | extern unsigned long image_size; |
45 | extern int in_suspend; | ||
46 | extern dev_t swsusp_resume_device; | ||
45 | 47 | ||
46 | extern asmlinkage int swsusp_arch_suspend(void); | 48 | extern asmlinkage int swsusp_arch_suspend(void); |
47 | extern asmlinkage int swsusp_arch_resume(void); | 49 | extern asmlinkage int swsusp_arch_resume(void); |
48 | 50 | ||
49 | extern unsigned int count_data_pages(void); | 51 | extern unsigned int count_data_pages(void); |
50 | extern void free_pagedir(struct pbe *pblist); | 52 | |
51 | extern void release_eaten_pages(void); | 53 | struct snapshot_handle { |
52 | extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); | 54 | loff_t offset; |
55 | unsigned int page; | ||
56 | unsigned int page_offset; | ||
57 | unsigned int prev; | ||
58 | struct pbe *pbe; | ||
59 | void *buffer; | ||
60 | unsigned int buf_offset; | ||
61 | }; | ||
62 | |||
63 | #define data_of(handle) ((handle).buffer + (handle).buf_offset) | ||
64 | |||
65 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); | ||
66 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); | ||
67 | int snapshot_image_loaded(struct snapshot_handle *handle); | ||
68 | |||
69 | #define SNAPSHOT_IOC_MAGIC '3' | ||
70 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) | ||
71 | #define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2) | ||
72 | #define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) | ||
73 | #define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4) | ||
74 | #define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5) | ||
75 | #define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) | ||
76 | #define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) | ||
77 | #define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) | ||
78 | #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) | ||
79 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) | ||
80 | #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) | ||
81 | #define SNAPSHOT_IOC_MAXNR 11 | ||
82 | |||
83 | /** | ||
84 | * The bitmap is used for tracking allocated swap pages | ||
85 | * | ||
86 | * The entire bitmap consists of a number of bitmap_page | ||
87 | * structures linked with the help of the .next member. | ||
88 | * Thus each page can be allocated individually, so we only | ||
89 | * need to make 0-order memory allocations to create | ||
90 | * the bitmap. | ||
91 | */ | ||
92 | |||
93 | #define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *)) | ||
94 | #define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long)) | ||
95 | #define BITS_PER_CHUNK (sizeof(long) * 8) | ||
96 | #define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK) | ||
97 | |||
98 | struct bitmap_page { | ||
99 | unsigned long chunks[BITMAP_PAGE_CHUNKS]; | ||
100 | struct bitmap_page *next; | ||
101 | }; | ||
102 | |||
103 | extern void free_bitmap(struct bitmap_page *bitmap); | ||
104 | extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); | ||
105 | extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); | ||
106 | extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); | ||
107 | |||
108 | extern int swsusp_check(void); | ||
109 | extern int swsusp_shrink_memory(void); | ||
53 | extern void swsusp_free(void); | 110 | extern void swsusp_free(void); |
54 | extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); | 111 | extern int swsusp_suspend(void); |
55 | extern unsigned int snapshot_nr_pages(void); | 112 | extern int swsusp_resume(void); |
56 | extern struct pbe *snapshot_pblist(void); | 113 | extern int swsusp_read(void); |
57 | extern void snapshot_pblist_set(struct pbe *pblist); | 114 | extern int swsusp_write(void); |
115 | extern void swsusp_close(void); | ||
116 | extern int suspend_enter(suspend_state_t state); | ||
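
power.h now carries the swap-tracking bitmap: a chain of page-sized bitmap_page structures, each holding BITMAP_PAGE_CHUNKS longs of bits plus a next pointer. The geometry those macros imply can be checked with a small user-space mirror of the definitions (the _EX names are local to this sketch; the numbers assume 4 KiB pages and 8-byte longs and pointers):

#include <stdio.h>

/* User-space mirror of the bitmap_page sizing macros from kernel/power/power.h. */
#define PAGE_SIZE_EX		4096UL
#define BITMAP_PAGE_SIZE_EX	(PAGE_SIZE_EX - sizeof(void *))
#define BITMAP_PAGE_CHUNKS_EX	(BITMAP_PAGE_SIZE_EX / sizeof(long))
#define BITS_PER_CHUNK_EX	(sizeof(long) * 8)
#define BITMAP_PAGE_BITS_EX	(BITMAP_PAGE_CHUNKS_EX * BITS_PER_CHUNK_EX)

int main(void)
{
	/* 511 chunks * 64 bits = 32704 trackable swap pages per bitmap page */
	printf("bits per bitmap page: %lu\n", (unsigned long)BITMAP_PAGE_BITS_EX);
	printf("swap covered per bitmap page: %lu KiB\n",
	       (unsigned long)(BITMAP_PAGE_BITS_EX * PAGE_SIZE_EX / 1024));
	return 0;
}

Under those assumptions a single 0-order allocation tracks 32704 swap slots, roughly 128 MiB of image, and alloc_bitmap() simply chains further pages via .next as needed.
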
diff --git a/kernel/power/process.c b/kernel/power/process.c index 28de118f7a..b2a5f671d6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -12,11 +12,12 @@ | |||
12 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
13 | #include <linux/suspend.h> | 13 | #include <linux/suspend.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/syscalls.h> | ||
15 | 16 | ||
16 | /* | 17 | /* |
17 | * Timeout for stopping processes | 18 | * Timeout for stopping processes |
18 | */ | 19 | */ |
19 | #define TIMEOUT (6 * HZ) | 20 | #define TIMEOUT (20 * HZ) |
20 | 21 | ||
21 | 22 | ||
22 | static inline int freezeable(struct task_struct * p) | 23 | static inline int freezeable(struct task_struct * p) |
@@ -25,8 +26,7 @@ static inline int freezeable(struct task_struct * p) | |||
25 | (p->flags & PF_NOFREEZE) || | 26 | (p->flags & PF_NOFREEZE) || |
26 | (p->exit_state == EXIT_ZOMBIE) || | 27 | (p->exit_state == EXIT_ZOMBIE) || |
27 | (p->exit_state == EXIT_DEAD) || | 28 | (p->exit_state == EXIT_DEAD) || |
28 | (p->state == TASK_STOPPED) || | 29 | (p->state == TASK_STOPPED)) |
29 | (p->state == TASK_TRACED)) | ||
30 | return 0; | 30 | return 0; |
31 | return 1; | 31 | return 1; |
32 | } | 32 | } |
@@ -54,38 +54,62 @@ void refrigerator(void) | |||
54 | current->state = save; | 54 | current->state = save; |
55 | } | 55 | } |
56 | 56 | ||
57 | static inline void freeze_process(struct task_struct *p) | ||
58 | { | ||
59 | unsigned long flags; | ||
60 | |||
61 | if (!freezing(p)) { | ||
62 | freeze(p); | ||
63 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
64 | signal_wake_up(p, 0); | ||
65 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
66 | } | ||
67 | } | ||
68 | |||
57 | /* 0 = success, else # of processes that we failed to stop */ | 69 | /* 0 = success, else # of processes that we failed to stop */ |
58 | int freeze_processes(void) | 70 | int freeze_processes(void) |
59 | { | 71 | { |
60 | int todo; | 72 | int todo, nr_user, user_frozen; |
61 | unsigned long start_time; | 73 | unsigned long start_time; |
62 | struct task_struct *g, *p; | 74 | struct task_struct *g, *p; |
63 | unsigned long flags; | 75 | unsigned long flags; |
64 | 76 | ||
65 | printk( "Stopping tasks: " ); | 77 | printk( "Stopping tasks: " ); |
66 | start_time = jiffies; | 78 | start_time = jiffies; |
79 | user_frozen = 0; | ||
67 | do { | 80 | do { |
68 | todo = 0; | 81 | nr_user = todo = 0; |
69 | read_lock(&tasklist_lock); | 82 | read_lock(&tasklist_lock); |
70 | do_each_thread(g, p) { | 83 | do_each_thread(g, p) { |
71 | if (!freezeable(p)) | 84 | if (!freezeable(p)) |
72 | continue; | 85 | continue; |
73 | if (frozen(p)) | 86 | if (frozen(p)) |
74 | continue; | 87 | continue; |
75 | 88 | if (p->mm && !(p->flags & PF_BORROWED_MM)) { | |
76 | freeze(p); | 89 | /* The task is a user-space one. |
77 | spin_lock_irqsave(&p->sighand->siglock, flags); | 90 | * Freeze it unless there's a vfork completion |
78 | signal_wake_up(p, 0); | 91 | * pending |
79 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 92 | */ |
80 | todo++; | 93 | if (!p->vfork_done) |
94 | freeze_process(p); | ||
95 | nr_user++; | ||
96 | } else { | ||
97 | /* Freeze only if the user space is frozen */ | ||
98 | if (user_frozen) | ||
99 | freeze_process(p); | ||
100 | todo++; | ||
101 | } | ||
81 | } while_each_thread(g, p); | 102 | } while_each_thread(g, p); |
82 | read_unlock(&tasklist_lock); | 103 | read_unlock(&tasklist_lock); |
104 | todo += nr_user; | ||
105 | if (!user_frozen && !nr_user) { | ||
106 | sys_sync(); | ||
107 | start_time = jiffies; | ||
108 | } | ||
109 | user_frozen = !nr_user; | ||
83 | yield(); /* Yield is okay here */ | 110 | yield(); /* Yield is okay here */ |
84 | if (todo && time_after(jiffies, start_time + TIMEOUT)) { | 111 | if (todo && time_after(jiffies, start_time + TIMEOUT)) |
85 | printk( "\n" ); | ||
86 | printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); | ||
87 | break; | 112 | break; |
88 | } | ||
89 | } while(todo); | 113 | } while(todo); |
90 | 114 | ||
91 | /* This does not unfreeze processes that are already frozen | 115 | /* This does not unfreeze processes that are already frozen |
@@ -94,8 +118,14 @@ int freeze_processes(void) | |||
94 | * but it cleans up leftover PF_FREEZE requests. | 118 | * but it cleans up leftover PF_FREEZE requests. |
95 | */ | 119 | */ |
96 | if (todo) { | 120 | if (todo) { |
121 | printk( "\n" ); | ||
122 | printk(KERN_ERR " stopping tasks timed out " | ||
123 | "after %d seconds (%d tasks remaining):\n", | ||
124 | TIMEOUT / HZ, todo); | ||
97 | read_lock(&tasklist_lock); | 125 | read_lock(&tasklist_lock); |
98 | do_each_thread(g, p) | 126 | do_each_thread(g, p) { |
127 | if (freezeable(p) && !frozen(p)) | ||
128 | printk(KERN_ERR " %s\n", p->comm); | ||
99 | if (freezing(p)) { | 129 | if (freezing(p)) { |
100 | pr_debug(" clean up: %s\n", p->comm); | 130 | pr_debug(" clean up: %s\n", p->comm); |
101 | p->flags &= ~PF_FREEZE; | 131 | p->flags &= ~PF_FREEZE; |
@@ -103,7 +133,7 @@ int freeze_processes(void) | |||
103 | recalc_sigpending_tsk(p); | 133 | recalc_sigpending_tsk(p); |
104 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 134 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
105 | } | 135 | } |
106 | while_each_thread(g, p); | 136 | } while_each_thread(g, p); |
107 | read_unlock(&tasklist_lock); | 137 | read_unlock(&tasklist_lock); |
108 | return todo; | 138 | return todo; |
109 | } | 139 | } |
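
freeze_processes() above now freezes user-space tasks first, issues sys_sync() once they are gone, and only then freezes kernel threads, printing the stragglers on timeout. A kernel thread stays off that straggler list by polling the freezer in its main loop; a hedged sketch with an invented thread body (try_to_freeze() and refrigerator() are the real entry points):

#include <linux/kthread.h>
#include <linux/sched.h>

/* A freezer-aware kernel thread: without the try_to_freeze() call it would
 * show up in the "stopping tasks timed out" list printed by freeze_processes(). */
static int example_thread(void *unused)
{
	while (!kthread_should_stop()) {
		try_to_freeze();	/* enters refrigerator() while the freeze flag is set */
		/* ... do one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
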
diff --git a/kernel/power/smp.c b/kernel/power/smp.c index 911fc62b82..5957312b2d 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c | |||
@@ -49,9 +49,7 @@ void enable_nonboot_cpus(void) | |||
49 | 49 | ||
50 | printk("Thawing cpus ...\n"); | 50 | printk("Thawing cpus ...\n"); |
51 | for_each_cpu_mask(cpu, frozen_cpus) { | 51 | for_each_cpu_mask(cpu, frozen_cpus) { |
52 | error = smp_prepare_cpu(cpu); | 52 | error = cpu_up(cpu); |
53 | if (!error) | ||
54 | error = cpu_up(cpu); | ||
55 | if (!error) { | 53 | if (!error) { |
56 | printk("CPU%d is up\n", cpu); | 54 | printk("CPU%d is up\n", cpu); |
57 | continue; | 55 | continue; |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 41f66365f0..3eeedbb13b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -10,6 +10,7 @@ | |||
10 | */ | 10 | */ |
11 | 11 | ||
12 | 12 | ||
13 | #include <linux/version.h> | ||
13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
14 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
15 | #include <linux/suspend.h> | 16 | #include <linux/suspend.h> |
@@ -34,7 +35,9 @@ | |||
34 | #include "power.h" | 35 | #include "power.h" |
35 | 36 | ||
36 | struct pbe *pagedir_nosave; | 37 | struct pbe *pagedir_nosave; |
37 | unsigned int nr_copy_pages; | 38 | static unsigned int nr_copy_pages; |
39 | static unsigned int nr_meta_pages; | ||
40 | static unsigned long *buffer; | ||
38 | 41 | ||
39 | #ifdef CONFIG_HIGHMEM | 42 | #ifdef CONFIG_HIGHMEM |
40 | unsigned int count_highmem_pages(void) | 43 | unsigned int count_highmem_pages(void) |
@@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone) | |||
80 | void *kaddr; | 83 | void *kaddr; |
81 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | 84 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; |
82 | 85 | ||
83 | if (!(pfn%1000)) | 86 | if (!(pfn%10000)) |
84 | printk("."); | 87 | printk("."); |
85 | if (!pfn_valid(pfn)) | 88 | if (!pfn_valid(pfn)) |
86 | continue; | 89 | continue; |
@@ -91,10 +94,8 @@ static int save_highmem_zone(struct zone *zone) | |||
91 | * corrected eventually when the cases giving rise to this | 94 | * corrected eventually when the cases giving rise to this |
92 | * are better understood. | 95 | * are better understood. |
93 | */ | 96 | */ |
94 | if (PageReserved(page)) { | 97 | if (PageReserved(page)) |
95 | printk("highmem reserved page?!\n"); | ||
96 | continue; | 98 | continue; |
97 | } | ||
98 | BUG_ON(PageNosave(page)); | 99 | BUG_ON(PageNosave(page)); |
99 | if (PageNosaveFree(page)) | 100 | if (PageNosaveFree(page)) |
100 | continue; | 101 | continue; |
@@ -121,13 +122,15 @@ int save_highmem(void) | |||
121 | struct zone *zone; | 122 | struct zone *zone; |
122 | int res = 0; | 123 | int res = 0; |
123 | 124 | ||
124 | pr_debug("swsusp: Saving Highmem\n"); | 125 | pr_debug("swsusp: Saving Highmem"); |
126 | drain_local_pages(); | ||
125 | for_each_zone (zone) { | 127 | for_each_zone (zone) { |
126 | if (is_highmem(zone)) | 128 | if (is_highmem(zone)) |
127 | res = save_highmem_zone(zone); | 129 | res = save_highmem_zone(zone); |
128 | if (res) | 130 | if (res) |
129 | return res; | 131 | return res; |
130 | } | 132 | } |
133 | printk("\n"); | ||
131 | return 0; | 134 | return 0; |
132 | } | 135 | } |
133 | 136 | ||
@@ -237,14 +240,15 @@ static void copy_data_pages(struct pbe *pblist) | |||
237 | * free_pagedir - free pages allocated with alloc_pagedir() | 240 | * free_pagedir - free pages allocated with alloc_pagedir() |
238 | */ | 241 | */ |
239 | 242 | ||
240 | void free_pagedir(struct pbe *pblist) | 243 | static void free_pagedir(struct pbe *pblist, int clear_nosave_free) |
241 | { | 244 | { |
242 | struct pbe *pbe; | 245 | struct pbe *pbe; |
243 | 246 | ||
244 | while (pblist) { | 247 | while (pblist) { |
245 | pbe = (pblist + PB_PAGE_SKIP)->next; | 248 | pbe = (pblist + PB_PAGE_SKIP)->next; |
246 | ClearPageNosave(virt_to_page(pblist)); | 249 | ClearPageNosave(virt_to_page(pblist)); |
247 | ClearPageNosaveFree(virt_to_page(pblist)); | 250 | if (clear_nosave_free) |
251 | ClearPageNosaveFree(virt_to_page(pblist)); | ||
248 | free_page((unsigned long)pblist); | 252 | free_page((unsigned long)pblist); |
249 | pblist = pbe; | 253 | pblist = pbe; |
250 | } | 254 | } |
@@ -303,7 +307,7 @@ struct eaten_page { | |||
303 | 307 | ||
304 | static struct eaten_page *eaten_pages = NULL; | 308 | static struct eaten_page *eaten_pages = NULL; |
305 | 309 | ||
306 | void release_eaten_pages(void) | 310 | static void release_eaten_pages(void) |
307 | { | 311 | { |
308 | struct eaten_page *p, *q; | 312 | struct eaten_page *p, *q; |
309 | 313 | ||
@@ -378,7 +382,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed | |||
378 | if (!nr_pages) | 382 | if (!nr_pages) |
379 | return NULL; | 383 | return NULL; |
380 | 384 | ||
381 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
382 | pblist = alloc_image_page(gfp_mask, safe_needed); | 385 | pblist = alloc_image_page(gfp_mask, safe_needed); |
383 | /* FIXME: rewrite this ugly loop */ | 386 | /* FIXME: rewrite this ugly loop */ |
384 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | 387 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; |
@@ -387,10 +390,10 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed | |||
387 | pbe->next = alloc_image_page(gfp_mask, safe_needed); | 390 | pbe->next = alloc_image_page(gfp_mask, safe_needed); |
388 | } | 391 | } |
389 | if (!pbe) { /* get_zeroed_page() failed */ | 392 | if (!pbe) { /* get_zeroed_page() failed */ |
390 | free_pagedir(pblist); | 393 | free_pagedir(pblist, 1); |
391 | pblist = NULL; | 394 | pblist = NULL; |
392 | } else | 395 | } else |
393 | create_pbe_list(pblist, nr_pages); | 396 | create_pbe_list(pblist, nr_pages); |
394 | return pblist; | 397 | return pblist; |
395 | } | 398 | } |
396 | 399 | ||
@@ -416,6 +419,10 @@ void swsusp_free(void) | |||
416 | } | 419 | } |
417 | } | 420 | } |
418 | } | 421 | } |
422 | nr_copy_pages = 0; | ||
423 | nr_meta_pages = 0; | ||
424 | pagedir_nosave = NULL; | ||
425 | buffer = NULL; | ||
419 | } | 426 | } |
420 | 427 | ||
421 | 428 | ||
@@ -439,7 +446,7 @@ static int enough_free_mem(unsigned int nr_pages) | |||
439 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | 446 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); |
440 | } | 447 | } |
441 | 448 | ||
442 | int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) | 449 | static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) |
443 | { | 450 | { |
444 | struct pbe *p; | 451 | struct pbe *p; |
445 | 452 | ||
@@ -506,7 +513,318 @@ asmlinkage int swsusp_save(void) | |||
506 | */ | 513 | */ |
507 | 514 | ||
508 | nr_copy_pages = nr_pages; | 515 | nr_copy_pages = nr_pages; |
516 | nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
509 | 517 | ||
510 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); | 518 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); |
511 | return 0; | 519 | return 0; |
512 | } | 520 | } |
521 | |||
522 | static void init_header(struct swsusp_info *info) | ||
523 | { | ||
524 | memset(info, 0, sizeof(struct swsusp_info)); | ||
525 | info->version_code = LINUX_VERSION_CODE; | ||
526 | info->num_physpages = num_physpages; | ||
527 | memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); | ||
528 | info->cpus = num_online_cpus(); | ||
529 | info->image_pages = nr_copy_pages; | ||
530 | info->pages = nr_copy_pages + nr_meta_pages + 1; | ||
531 | info->size = info->pages; | ||
532 | info->size <<= PAGE_SHIFT; | ||
533 | } | ||
534 | |||
535 | /** | ||
536 | * pack_orig_addresses - the .orig_address fields of the PBEs from the | ||
537 | * list starting at @pbe are stored in the array @buf[] (1 page) | ||
538 | */ | ||
539 | |||
540 | static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) | ||
541 | { | ||
542 | int j; | ||
543 | |||
544 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | ||
545 | buf[j] = pbe->orig_address; | ||
546 | pbe = pbe->next; | ||
547 | } | ||
548 | if (!pbe) | ||
549 | for (; j < PAGE_SIZE / sizeof(long); j++) | ||
550 | buf[j] = 0; | ||
551 | return pbe; | ||
552 | } | ||
553 | |||
554 | /** | ||
555 | * snapshot_read_next - used for reading the system memory snapshot. | ||
556 | * | ||
557 | * On the first call to it @handle should point to a zeroed | ||
558 | * snapshot_handle structure. The structure gets updated and a pointer | ||
559 | * to it should be passed to this function on every subsequent call. | ||
560 | * | ||
561 | * The @count parameter should contain the number of bytes the caller | ||
562 | * wants to read from the snapshot. It must not be zero. | ||
563 | * | ||
564 | * On success the function returns a positive number. Then, the caller | ||
565 | * is allowed to read up to the returned number of bytes from the memory | ||
566 | * location computed by the data_of() macro. The number returned | ||
567 | * may be smaller than @count, but this only happens if the read would | ||
568 | * cross a page boundary otherwise. | ||
569 | * | ||
570 | * The function returns 0 to indicate the end of data stream condition, | ||
571 | * and a negative number is returned on error. In such cases the | ||
572 | * structure pointed to by @handle is not updated and should not be used | ||
573 | * any more. | ||
574 | */ | ||
575 | |||
576 | int snapshot_read_next(struct snapshot_handle *handle, size_t count) | ||
577 | { | ||
578 | if (handle->page > nr_meta_pages + nr_copy_pages) | ||
579 | return 0; | ||
580 | if (!buffer) { | ||
581 | /* This makes the buffer be freed by swsusp_free() */ | ||
582 | buffer = alloc_image_page(GFP_ATOMIC, 0); | ||
583 | if (!buffer) | ||
584 | return -ENOMEM; | ||
585 | } | ||
586 | if (!handle->offset) { | ||
587 | init_header((struct swsusp_info *)buffer); | ||
588 | handle->buffer = buffer; | ||
589 | handle->pbe = pagedir_nosave; | ||
590 | } | ||
591 | if (handle->prev < handle->page) { | ||
592 | if (handle->page <= nr_meta_pages) { | ||
593 | handle->pbe = pack_orig_addresses(buffer, handle->pbe); | ||
594 | if (!handle->pbe) | ||
595 | handle->pbe = pagedir_nosave; | ||
596 | } else { | ||
597 | handle->buffer = (void *)handle->pbe->address; | ||
598 | handle->pbe = handle->pbe->next; | ||
599 | } | ||
600 | handle->prev = handle->page; | ||
601 | } | ||
602 | handle->buf_offset = handle->page_offset; | ||
603 | if (handle->page_offset + count >= PAGE_SIZE) { | ||
604 | count = PAGE_SIZE - handle->page_offset; | ||
605 | handle->page_offset = 0; | ||
606 | handle->page++; | ||
607 | } else { | ||
608 | handle->page_offset += count; | ||
609 | } | ||
610 | handle->offset += count; | ||
611 | return count; | ||
612 | } | ||
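The calling convention above is easiest to see as a loop. Below is a minimal sketch, assuming the surrounding kernel context (struct snapshot_handle, data_of(), PAGE_SIZE) and a hypothetical write_fn() sink; the real consumer of this interface is save_image() in kernel/power/swap.c further down in this patch:

        /* Hypothetical sketch only -- drain the snapshot one page at a time.
         * write_fn stands in for whatever sink the caller uses (e.g. swap I/O).
         */
        static int drain_snapshot(struct snapshot_handle *snapshot,
                                  int (*write_fn)(void *buf))
        {
                int ret;

                memset(snapshot, 0, sizeof(*snapshot)); /* first call needs a zeroed handle */
                do {
                        ret = snapshot_read_next(snapshot, PAGE_SIZE);
                        if (ret > 0) {
                                int error = write_fn(data_of(*snapshot));
                                if (error)
                                        return error;
                        }
                } while (ret > 0);
                return ret < 0 ? ret : 0;
        }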
613 | |||
614 | /** | ||
615 | * mark_unsafe_pages - mark the pages that cannot be used for storing | ||
616 | * the image during resume, because they conflict with the pages that | ||
617 | * had been used before suspend | ||
618 | */ | ||
619 | |||
620 | static int mark_unsafe_pages(struct pbe *pblist) | ||
621 | { | ||
622 | struct zone *zone; | ||
623 | unsigned long zone_pfn; | ||
624 | struct pbe *p; | ||
625 | |||
626 | if (!pblist) /* a sanity check */ | ||
627 | return -EINVAL; | ||
628 | |||
629 | /* Clear page flags */ | ||
630 | for_each_zone (zone) { | ||
631 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
632 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) | ||
633 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | ||
634 | zone->zone_start_pfn)); | ||
635 | } | ||
636 | |||
637 | /* Mark orig addresses */ | ||
638 | for_each_pbe (p, pblist) { | ||
639 | if (virt_addr_valid(p->orig_address)) | ||
640 | SetPageNosaveFree(virt_to_page(p->orig_address)); | ||
641 | else | ||
642 | return -EFAULT; | ||
643 | } | ||
644 | |||
645 | return 0; | ||
646 | } | ||
647 | |||
648 | static void copy_page_backup_list(struct pbe *dst, struct pbe *src) | ||
649 | { | ||
650 | /* We assume both lists contain the same number of elements */ | ||
651 | while (src) { | ||
652 | dst->orig_address = src->orig_address; | ||
653 | dst = dst->next; | ||
654 | src = src->next; | ||
655 | } | ||
656 | } | ||
657 | |||
658 | static int check_header(struct swsusp_info *info) | ||
659 | { | ||
660 | char *reason = NULL; | ||
661 | |||
662 | if (info->version_code != LINUX_VERSION_CODE) | ||
663 | reason = "kernel version"; | ||
664 | if (info->num_physpages != num_physpages) | ||
665 | reason = "memory size"; | ||
666 | if (strcmp(info->uts.sysname,system_utsname.sysname)) | ||
667 | reason = "system type"; | ||
668 | if (strcmp(info->uts.release,system_utsname.release)) | ||
669 | reason = "kernel release"; | ||
670 | if (strcmp(info->uts.version,system_utsname.version)) | ||
671 | reason = "version"; | ||
672 | if (strcmp(info->uts.machine,system_utsname.machine)) | ||
673 | reason = "machine"; | ||
674 | if (reason) { | ||
675 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); | ||
676 | return -EPERM; | ||
677 | } | ||
678 | return 0; | ||
679 | } | ||
680 | |||
681 | /** | ||
682 | * load_header - check the image header and copy data from it | ||
683 | */ | ||
684 | |||
685 | static int load_header(struct snapshot_handle *handle, | ||
686 | struct swsusp_info *info) | ||
687 | { | ||
688 | int error; | ||
689 | struct pbe *pblist; | ||
690 | |||
691 | error = check_header(info); | ||
692 | if (!error) { | ||
693 | pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0); | ||
694 | if (!pblist) | ||
695 | return -ENOMEM; | ||
696 | pagedir_nosave = pblist; | ||
697 | handle->pbe = pblist; | ||
698 | nr_copy_pages = info->image_pages; | ||
699 | nr_meta_pages = info->pages - info->image_pages - 1; | ||
700 | } | ||
701 | return error; | ||
702 | } | ||
703 | |||
704 | /** | ||
705 | * unpack_orig_addresses - copy the elements of @buf[] (1 page) to | ||
706 | * the PBEs in the list starting at @pbe | ||
707 | */ | ||
708 | |||
709 | static inline struct pbe *unpack_orig_addresses(unsigned long *buf, | ||
710 | struct pbe *pbe) | ||
711 | { | ||
712 | int j; | ||
713 | |||
714 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | ||
715 | pbe->orig_address = buf[j]; | ||
716 | pbe = pbe->next; | ||
717 | } | ||
718 | return pbe; | ||
719 | } | ||
720 | |||
721 | /** | ||
722 | * create_image - use metadata contained in the PBE list | ||
723 | * pointed to by pagedir_nosave to mark the pages that will | ||
724 | * be overwritten in the process of restoring the system | ||
725 | * memory state from the image, and allocate memory for | ||
726 | * the image while avoiding those pages | ||
727 | */ | ||
728 | |||
729 | static int create_image(struct snapshot_handle *handle) | ||
730 | { | ||
731 | int error = 0; | ||
732 | struct pbe *p, *pblist; | ||
733 | |||
734 | p = pagedir_nosave; | ||
735 | error = mark_unsafe_pages(p); | ||
736 | if (!error) { | ||
737 | pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); | ||
738 | if (pblist) | ||
739 | copy_page_backup_list(pblist, p); | ||
740 | free_pagedir(p, 0); | ||
741 | if (!pblist) | ||
742 | error = -ENOMEM; | ||
743 | } | ||
744 | if (!error) | ||
745 | error = alloc_data_pages(pblist, GFP_ATOMIC, 1); | ||
746 | if (!error) { | ||
747 | release_eaten_pages(); | ||
748 | pagedir_nosave = pblist; | ||
749 | } else { | ||
750 | pagedir_nosave = NULL; | ||
751 | handle->pbe = NULL; | ||
752 | nr_copy_pages = 0; | ||
753 | nr_meta_pages = 0; | ||
754 | } | ||
755 | return error; | ||
756 | } | ||
757 | |||
758 | /** | ||
759 | * snapshot_write_next - used for writing the system memory snapshot. | ||
760 | * | ||
761 | * On the first call to it @handle should point to a zeroed | ||
762 | * snapshot_handle structure. The structure gets updated and a pointer | ||
763 | * to it should be passed to this function every next time. | ||
764 | * | ||
765 | * The @count parameter should contain the number of bytes the caller | ||
766 | * wants to write to the image. It must not be zero. | ||
767 | * | ||
768 | * On success the function returns a positive number. Then, the caller | ||
769 | * is allowed to write up to the returned number of bytes to the memory | ||
770 | * location computed by the data_of() macro. The number returned | ||
771 | * may be smaller than @count, but this only happens if the write would | ||
772 | * cross a page boundary otherwise. | ||
773 | * | ||
774 | * The function returns 0 to indicate the "end of file" condition, | ||
775 | * and a negative number is returned on error. In such cases the | ||
776 | * structure pointed to by @handle is not updated and should not be used | ||
777 | * any more. | ||
778 | */ | ||
779 | |||
780 | int snapshot_write_next(struct snapshot_handle *handle, size_t count) | ||
781 | { | ||
782 | int error = 0; | ||
783 | |||
784 | if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) | ||
785 | return 0; | ||
786 | if (!buffer) { | ||
787 | /* This makes the buffer be freed by swsusp_free() */ | ||
788 | buffer = alloc_image_page(GFP_ATOMIC, 0); | ||
789 | if (!buffer) | ||
790 | return -ENOMEM; | ||
791 | } | ||
792 | if (!handle->offset) | ||
793 | handle->buffer = buffer; | ||
794 | if (handle->prev < handle->page) { | ||
795 | if (!handle->prev) { | ||
796 | error = load_header(handle, (struct swsusp_info *)buffer); | ||
797 | if (error) | ||
798 | return error; | ||
799 | } else if (handle->prev <= nr_meta_pages) { | ||
800 | handle->pbe = unpack_orig_addresses(buffer, handle->pbe); | ||
801 | if (!handle->pbe) { | ||
802 | error = create_image(handle); | ||
803 | if (error) | ||
804 | return error; | ||
805 | handle->pbe = pagedir_nosave; | ||
806 | handle->buffer = (void *)handle->pbe->address; | ||
807 | } | ||
808 | } else { | ||
809 | handle->pbe = handle->pbe->next; | ||
810 | handle->buffer = (void *)handle->pbe->address; | ||
811 | } | ||
812 | handle->prev = handle->page; | ||
813 | } | ||
814 | handle->buf_offset = handle->page_offset; | ||
815 | if (handle->page_offset + count >= PAGE_SIZE) { | ||
816 | count = PAGE_SIZE - handle->page_offset; | ||
817 | handle->page_offset = 0; | ||
818 | handle->page++; | ||
819 | } else { | ||
820 | handle->page_offset += count; | ||
821 | } | ||
822 | handle->offset += count; | ||
823 | return count; | ||
824 | } | ||
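For orientation, the stream consumed above (and produced by snapshot_read_next()) is laid out page by page as follows, which is exactly what the handle->page / handle->prev bookkeeping tracks:

        page 0                                   : struct swsusp_info header (validated by load_header())
        pages 1 .. nr_meta_pages                 : packed .orig_address arrays (unpack_orig_addresses())
        pages nr_meta_pages+1 .. +nr_copy_pages  : image data, copied straight into each pbe->address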
825 | |||
826 | int snapshot_image_loaded(struct snapshot_handle *handle) | ||
827 | { | ||
828 | return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || | ||
829 | handle->page <= nr_meta_pages + nr_copy_pages); | ||
830 | } | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c new file mode 100644 index 0000000000..044b8e0c10 --- /dev/null +++ b/kernel/power/swap.c | |||
@@ -0,0 +1,545 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/swap.c | ||
3 | * | ||
4 | * This file provides functions for reading the suspend image from | ||
5 | * and writing it to a swap partition. | ||
6 | * | ||
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> | ||
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | ||
9 | * | ||
10 | * This file is released under the GPLv2. | ||
11 | * | ||
12 | */ | ||
13 | |||
14 | #include <linux/module.h> | ||
15 | #include <linux/smp_lock.h> | ||
16 | #include <linux/file.h> | ||
17 | #include <linux/utsname.h> | ||
18 | #include <linux/version.h> | ||
19 | #include <linux/delay.h> | ||
20 | #include <linux/bitops.h> | ||
21 | #include <linux/genhd.h> | ||
22 | #include <linux/device.h> | ||
23 | #include <linux/buffer_head.h> | ||
24 | #include <linux/bio.h> | ||
25 | #include <linux/swap.h> | ||
26 | #include <linux/swapops.h> | ||
27 | #include <linux/pm.h> | ||
28 | |||
29 | #include "power.h" | ||
30 | |||
31 | extern char resume_file[]; | ||
32 | |||
33 | #define SWSUSP_SIG "S1SUSPEND" | ||
34 | |||
35 | static struct swsusp_header { | ||
36 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; | ||
37 | swp_entry_t image; | ||
38 | char orig_sig[10]; | ||
39 | char sig[10]; | ||
40 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | ||
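The reserved[] member pads the structure to exactly one page, so the two 10-byte signatures always sit in the last 20 bytes of the swap header page; assuming 4096-byte pages and an 8-byte swp_entry_t (both architecture-dependent), reserved works out to 4096 - 20 - 8 = 4068 bytes. A hypothetical compile-time check, not part of the patch, would be:

        /* Hypothetical sanity check (not in the patch): the header must fill
         * exactly one page so the signature lands in its last bytes. */
        static inline void swsusp_header_layout_check(void)
        {
                BUILD_BUG_ON(sizeof(struct swsusp_header) != PAGE_SIZE);
        }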
41 | |||
42 | /* | ||
43 | * Saving part... | ||
44 | */ | ||
45 | |||
46 | static unsigned short root_swap = 0xffff; | ||
47 | |||
48 | static int mark_swapfiles(swp_entry_t start) | ||
49 | { | ||
50 | int error; | ||
51 | |||
52 | rw_swap_page_sync(READ, | ||
53 | swp_entry(root_swap, 0), | ||
54 | virt_to_page((unsigned long)&swsusp_header)); | ||
55 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | ||
56 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | ||
57 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | ||
58 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | ||
59 | swsusp_header.image = start; | ||
60 | error = rw_swap_page_sync(WRITE, | ||
61 | swp_entry(root_swap, 0), | ||
62 | virt_to_page((unsigned long) | ||
63 | &swsusp_header)); | ||
64 | } else { | ||
65 | pr_debug("swsusp: Partition is not swap space.\n"); | ||
66 | error = -ENODEV; | ||
67 | } | ||
68 | return error; | ||
69 | } | ||
70 | |||
71 | /** | ||
72 | * swsusp_swap_check - check if the resume device is a swap device | ||
73 | * and get its index (if so) | ||
74 | */ | ||
75 | |||
76 | static int swsusp_swap_check(void) /* This is called before saving image */ | ||
77 | { | ||
78 | int res = swap_type_of(swsusp_resume_device); | ||
79 | |||
80 | if (res >= 0) { | ||
81 | root_swap = res; | ||
82 | return 0; | ||
83 | } | ||
84 | return res; | ||
85 | } | ||
86 | |||
87 | /** | ||
88 | * write_page - Write one page to given swap location. | ||
89 | * @buf: Address we're writing. | ||
90 | * @offset: Offset of the swap page we're writing to. | ||
91 | */ | ||
92 | |||
93 | static int write_page(void *buf, unsigned long offset) | ||
94 | { | ||
95 | swp_entry_t entry; | ||
96 | int error = -ENOSPC; | ||
97 | |||
98 | if (offset) { | ||
99 | entry = swp_entry(root_swap, offset); | ||
100 | error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); | ||
101 | } | ||
102 | return error; | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * The swap map is a data structure used for keeping track of each page | ||
107 | * written to a swap partition. It consists of many swap_map_page | ||
108 | * structures, each containing an array of MAP_PAGE_ENTRIES swap offsets. | ||
109 | * These structures are stored on the swap and linked together with the | ||
110 | * help of the .next_swap member. | ||
111 | * | ||
112 | * The swap map is created during suspend. The swap map pages are | ||
113 | * allocated and populated one at a time, so we only need one memory | ||
114 | * page to set up the entire structure. | ||
115 | * | ||
116 | * During resume we also only need to use one swap_map_page structure | ||
117 | * at a time. | ||
118 | */ | ||
119 | |||
120 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) | ||
121 | |||
122 | struct swap_map_page { | ||
123 | unsigned long entries[MAP_PAGE_ENTRIES]; | ||
124 | unsigned long next_swap; | ||
125 | }; | ||
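Concretely, assuming 4 KiB pages: MAP_PAGE_ENTRIES is 4096 / 8 - 1 = 511 with 64-bit longs (or 4096 / 4 - 1 = 1023 with 32-bit longs), so each swap_map_page records the swap offsets of that many image pages plus the link to the next map page, and an image of N pages costs roughly N / 511 (or N / 1023) extra swap pages for the map itself.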
126 | |||
127 | /** | ||
128 | * The swap_map_handle structure is used for handling swap in | ||
129 | * a file-like way | ||
130 | */ | ||
131 | |||
132 | struct swap_map_handle { | ||
133 | struct swap_map_page *cur; | ||
134 | unsigned long cur_swap; | ||
135 | struct bitmap_page *bitmap; | ||
136 | unsigned int k; | ||
137 | }; | ||
138 | |||
139 | static void release_swap_writer(struct swap_map_handle *handle) | ||
140 | { | ||
141 | if (handle->cur) | ||
142 | free_page((unsigned long)handle->cur); | ||
143 | handle->cur = NULL; | ||
144 | if (handle->bitmap) | ||
145 | free_bitmap(handle->bitmap); | ||
146 | handle->bitmap = NULL; | ||
147 | } | ||
148 | |||
149 | static int get_swap_writer(struct swap_map_handle *handle) | ||
150 | { | ||
151 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); | ||
152 | if (!handle->cur) | ||
153 | return -ENOMEM; | ||
154 | handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0)); | ||
155 | if (!handle->bitmap) { | ||
156 | release_swap_writer(handle); | ||
157 | return -ENOMEM; | ||
158 | } | ||
159 | handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); | ||
160 | if (!handle->cur_swap) { | ||
161 | release_swap_writer(handle); | ||
162 | return -ENOSPC; | ||
163 | } | ||
164 | handle->k = 0; | ||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | static int swap_write_page(struct swap_map_handle *handle, void *buf) | ||
169 | { | ||
170 | int error; | ||
171 | unsigned long offset; | ||
172 | |||
173 | if (!handle->cur) | ||
174 | return -EINVAL; | ||
175 | offset = alloc_swap_page(root_swap, handle->bitmap); | ||
176 | error = write_page(buf, offset); | ||
177 | if (error) | ||
178 | return error; | ||
179 | handle->cur->entries[handle->k++] = offset; | ||
180 | if (handle->k >= MAP_PAGE_ENTRIES) { | ||
181 | offset = alloc_swap_page(root_swap, handle->bitmap); | ||
182 | if (!offset) | ||
183 | return -ENOSPC; | ||
184 | handle->cur->next_swap = offset; | ||
185 | error = write_page(handle->cur, handle->cur_swap); | ||
186 | if (error) | ||
187 | return error; | ||
188 | memset(handle->cur, 0, PAGE_SIZE); | ||
189 | handle->cur_swap = offset; | ||
190 | handle->k = 0; | ||
191 | } | ||
192 | return 0; | ||
193 | } | ||
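To make the rollover concrete: once the current map page is full (handle->k reaches MAP_PAGE_ENTRIES), swap_write_page() allocates a swap page for the next map page, records its offset in .next_swap, writes the now-full map page to handle->cur_swap, clears the in-memory page, and adopts the new offset as cur_swap. The data page itself is always written and recorded before this flush, so no entry is lost across the boundary.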
194 | |||
195 | static int flush_swap_writer(struct swap_map_handle *handle) | ||
196 | { | ||
197 | if (handle->cur && handle->cur_swap) | ||
198 | return write_page(handle->cur, handle->cur_swap); | ||
199 | else | ||
200 | return -EINVAL; | ||
201 | } | ||
202 | |||
203 | /** | ||
204 | * save_image - save the suspend image data | ||
205 | */ | ||
206 | |||
207 | static int save_image(struct swap_map_handle *handle, | ||
208 | struct snapshot_handle *snapshot, | ||
209 | unsigned int nr_pages) | ||
210 | { | ||
211 | unsigned int m; | ||
212 | int ret; | ||
213 | int error = 0; | ||
214 | |||
215 | printk("Saving image data pages (%u pages) ... ", nr_pages); | ||
216 | m = nr_pages / 100; | ||
217 | if (!m) | ||
218 | m = 1; | ||
219 | nr_pages = 0; | ||
220 | do { | ||
221 | ret = snapshot_read_next(snapshot, PAGE_SIZE); | ||
222 | if (ret > 0) { | ||
223 | error = swap_write_page(handle, data_of(*snapshot)); | ||
224 | if (error) | ||
225 | break; | ||
226 | if (!(nr_pages % m)) | ||
227 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
228 | nr_pages++; | ||
229 | } | ||
230 | } while (ret > 0); | ||
231 | if (!error) | ||
232 | printk("\b\b\b\bdone\n"); | ||
233 | return error; | ||
234 | } | ||
235 | |||
236 | /** | ||
237 | * enough_swap - Make sure we have enough swap to save the image. | ||
238 | * | ||
239 | * Returns TRUE or FALSE after checking the total amount of swap | ||
240 | * space available from the resume partition. | ||
241 | */ | ||
242 | |||
243 | static int enough_swap(unsigned int nr_pages) | ||
244 | { | ||
245 | unsigned int free_swap = count_swap_pages(root_swap, 1); | ||
246 | |||
247 | pr_debug("swsusp: free swap pages: %u\n", free_swap); | ||
248 | return free_swap > (nr_pages + PAGES_FOR_IO + | ||
249 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
250 | } | ||
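As a rough worked example (all numbers are illustrative assumptions, not taken from the patch): with 4 KiB pages, a 256 MiB image is nr_pages = 65536; the term (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE adds a few hundred pages of metadata for typical PBES_PER_PAGE values from power.h, and PAGES_FOR_IO reserves room for the block layer, so enough_swap() requires somewhat more than 65536 + PAGES_FOR_IO free swap pages on the resume device.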
251 | |||
252 | /** | ||
253 | * swsusp_write - Write entire image and metadata. | ||
254 | * | ||
255 | * It is important _NOT_ to umount filesystems at this point. We want | ||
256 | * them synced (in case something goes wrong) but we DO not want to mark | ||
257 | * filesystem clean: it is not. (And it does not matter, if we resume | ||
258 | * correctly, we'll mark system clean, anyway.) | ||
259 | */ | ||
260 | |||
261 | int swsusp_write(void) | ||
262 | { | ||
263 | struct swap_map_handle handle; | ||
264 | struct snapshot_handle snapshot; | ||
265 | struct swsusp_info *header; | ||
266 | unsigned long start; | ||
267 | int error; | ||
268 | |||
269 | if ((error = swsusp_swap_check())) { | ||
270 | printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); | ||
271 | return error; | ||
272 | } | ||
273 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | ||
274 | error = snapshot_read_next(&snapshot, PAGE_SIZE); | ||
275 | if (error < PAGE_SIZE) | ||
276 | return error < 0 ? error : -EFAULT; | ||
277 | header = (struct swsusp_info *)data_of(snapshot); | ||
278 | if (!enough_swap(header->pages)) { | ||
279 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | ||
280 | return -ENOSPC; | ||
281 | } | ||
282 | error = get_swap_writer(&handle); | ||
283 | if (!error) { | ||
284 | start = handle.cur_swap; | ||
285 | error = swap_write_page(&handle, header); | ||
286 | } | ||
287 | if (!error) | ||
288 | error = save_image(&handle, &snapshot, header->pages - 1); | ||
289 | if (!error) { | ||
290 | flush_swap_writer(&handle); | ||
291 | printk("S"); | ||
292 | error = mark_swapfiles(swp_entry(root_swap, start)); | ||
293 | printk("|\n"); | ||
294 | } | ||
295 | if (error) | ||
296 | free_all_swap_pages(root_swap, handle.bitmap); | ||
297 | release_swap_writer(&handle); | ||
298 | return error; | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * Using bio to read from swap. | ||
303 | * This code requires a bit more work than just using buffer heads | ||
304 | * but it is the recommended way for 2.5/2.6. | ||
305 | * The following are to signal the beginning and end of I/O. Bios | ||
306 | * finish asynchronously, while we want them to happen synchronously. | ||
307 | * A simple atomic_t, and a wait loop take care of this problem. | ||
308 | */ | ||
309 | |||
310 | static atomic_t io_done = ATOMIC_INIT(0); | ||
311 | |||
312 | static int end_io(struct bio *bio, unsigned int num, int err) | ||
313 | { | ||
314 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
315 | panic("I/O error reading memory image"); | ||
316 | atomic_set(&io_done, 0); | ||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | static struct block_device *resume_bdev; | ||
321 | |||
322 | /** | ||
323 | * submit - submit BIO request. | ||
324 | * @rw: READ or WRITE. | ||
325 | * @page_off: physical offset of the page. | ||
326 | * @page: page we're reading or writing. | ||
327 | * | ||
328 | * Straight from the textbook - allocate and initialize the bio. | ||
329 | * If we're writing, make sure the page is marked as dirty. | ||
330 | * Then submit it and wait. | ||
331 | */ | ||
332 | |||
333 | static int submit(int rw, pgoff_t page_off, void *page) | ||
334 | { | ||
335 | int error = 0; | ||
336 | struct bio *bio; | ||
337 | |||
338 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
339 | if (!bio) | ||
340 | return -ENOMEM; | ||
341 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
342 | bio->bi_bdev = resume_bdev; | ||
343 | bio->bi_end_io = end_io; | ||
344 | |||
345 | if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { | ||
346 | printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); | ||
347 | error = -EFAULT; | ||
348 | goto Done; | ||
349 | } | ||
350 | |||
351 | atomic_set(&io_done, 1); | ||
352 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
353 | while (atomic_read(&io_done)) | ||
354 | yield(); | ||
355 | if (rw == READ) | ||
356 | bio_set_pages_dirty(bio); | ||
357 | Done: | ||
358 | bio_put(bio); | ||
359 | return error; | ||
360 | } | ||
361 | |||
362 | static int bio_read_page(pgoff_t page_off, void *page) | ||
363 | { | ||
364 | return submit(READ, page_off, page); | ||
365 | } | ||
366 | |||
367 | static int bio_write_page(pgoff_t page_off, void *page) | ||
368 | { | ||
369 | return submit(WRITE, page_off, page); | ||
370 | } | ||
371 | |||
372 | /** | ||
373 | * The following functions allow us to read data using a swap map | ||
374 | * in a file-like way | ||
375 | */ | ||
376 | |||
377 | static void release_swap_reader(struct swap_map_handle *handle) | ||
378 | { | ||
379 | if (handle->cur) | ||
380 | free_page((unsigned long)handle->cur); | ||
381 | handle->cur = NULL; | ||
382 | } | ||
383 | |||
384 | static int get_swap_reader(struct swap_map_handle *handle, | ||
385 | swp_entry_t start) | ||
386 | { | ||
387 | int error; | ||
388 | |||
389 | if (!swp_offset(start)) | ||
390 | return -EINVAL; | ||
391 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
392 | if (!handle->cur) | ||
393 | return -ENOMEM; | ||
394 | error = bio_read_page(swp_offset(start), handle->cur); | ||
395 | if (error) { | ||
396 | release_swap_reader(handle); | ||
397 | return error; | ||
398 | } | ||
399 | handle->k = 0; | ||
400 | return 0; | ||
401 | } | ||
402 | |||
403 | static int swap_read_page(struct swap_map_handle *handle, void *buf) | ||
404 | { | ||
405 | unsigned long offset; | ||
406 | int error; | ||
407 | |||
408 | if (!handle->cur) | ||
409 | return -EINVAL; | ||
410 | offset = handle->cur->entries[handle->k]; | ||
411 | if (!offset) | ||
412 | return -EFAULT; | ||
413 | error = bio_read_page(offset, buf); | ||
414 | if (error) | ||
415 | return error; | ||
416 | if (++handle->k >= MAP_PAGE_ENTRIES) { | ||
417 | handle->k = 0; | ||
418 | offset = handle->cur->next_swap; | ||
419 | if (!offset) | ||
420 | release_swap_reader(handle); | ||
421 | else | ||
422 | error = bio_read_page(offset, handle->cur); | ||
423 | } | ||
424 | return error; | ||
425 | } | ||
426 | |||
427 | /** | ||
428 | * load_image - load the image using the swap map handle | ||
429 | * @handle and the snapshot handle @snapshot | ||
430 | * (assume there are @nr_pages pages to load) | ||
431 | */ | ||
432 | |||
433 | static int load_image(struct swap_map_handle *handle, | ||
434 | struct snapshot_handle *snapshot, | ||
435 | unsigned int nr_pages) | ||
436 | { | ||
437 | unsigned int m; | ||
438 | int ret; | ||
439 | int error = 0; | ||
440 | |||
441 | printk("Loading image data pages (%u pages) ... ", nr_pages); | ||
442 | m = nr_pages / 100; | ||
443 | if (!m) | ||
444 | m = 1; | ||
445 | nr_pages = 0; | ||
446 | do { | ||
447 | ret = snapshot_write_next(snapshot, PAGE_SIZE); | ||
448 | if (ret > 0) { | ||
449 | error = swap_read_page(handle, data_of(*snapshot)); | ||
450 | if (error) | ||
451 | break; | ||
452 | if (!(nr_pages % m)) | ||
453 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
454 | nr_pages++; | ||
455 | } | ||
456 | } while (ret > 0); | ||
457 | if (!error) { | ||
458 | printk("\b\b\b\bdone\n"); | ||
459 | if (!snapshot_image_loaded(snapshot)) | ||
460 | error = -ENODATA; | ||
461 | } | ||
462 | return error; | ||
463 | } | ||
464 | |||
465 | int swsusp_read(void) | ||
466 | { | ||
467 | int error; | ||
468 | struct swap_map_handle handle; | ||
469 | struct snapshot_handle snapshot; | ||
470 | struct swsusp_info *header; | ||
471 | |||
472 | if (IS_ERR(resume_bdev)) { | ||
473 | pr_debug("swsusp: block device not initialised\n"); | ||
474 | return PTR_ERR(resume_bdev); | ||
475 | } | ||
476 | |||
477 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | ||
478 | error = snapshot_write_next(&snapshot, PAGE_SIZE); | ||
479 | if (error < PAGE_SIZE) | ||
480 | return error < 0 ? error : -EFAULT; | ||
481 | header = (struct swsusp_info *)data_of(snapshot); | ||
482 | error = get_swap_reader(&handle, swsusp_header.image); | ||
483 | if (!error) | ||
484 | error = swap_read_page(&handle, header); | ||
485 | if (!error) | ||
486 | error = load_image(&handle, &snapshot, header->pages - 1); | ||
487 | release_swap_reader(&handle); | ||
488 | |||
489 | blkdev_put(resume_bdev); | ||
490 | |||
491 | if (!error) | ||
492 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
493 | else | ||
494 | pr_debug("swsusp: Error %d resuming\n", error); | ||
495 | return error; | ||
496 | } | ||
497 | |||
498 | /** | ||
499 | * swsusp_check - Check for swsusp signature in the resume device | ||
500 | */ | ||
501 | |||
502 | int swsusp_check(void) | ||
503 | { | ||
504 | int error; | ||
505 | |||
506 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | ||
507 | if (!IS_ERR(resume_bdev)) { | ||
508 | set_blocksize(resume_bdev, PAGE_SIZE); | ||
509 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | ||
510 | if ((error = bio_read_page(0, &swsusp_header))) | ||
511 | return error; | ||
512 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | ||
513 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
514 | /* Reset swap signature now */ | ||
515 | error = bio_write_page(0, &swsusp_header); | ||
516 | } else { | ||
517 | return -EINVAL; | ||
518 | } | ||
519 | if (error) | ||
520 | blkdev_put(resume_bdev); | ||
521 | else | ||
522 | pr_debug("swsusp: Signature found, resuming\n"); | ||
523 | } else { | ||
524 | error = PTR_ERR(resume_bdev); | ||
525 | } | ||
526 | |||
527 | if (error) | ||
528 | pr_debug("swsusp: Error %d check for resume file\n", error); | ||
529 | |||
530 | return error; | ||
531 | } | ||
532 | |||
533 | /** | ||
534 | * swsusp_close - close swap device. | ||
535 | */ | ||
536 | |||
537 | void swsusp_close(void) | ||
538 | { | ||
539 | if (IS_ERR(resume_bdev)) { | ||
540 | pr_debug("swsusp: block device not initialised\n"); | ||
541 | return; | ||
542 | } | ||
543 | |||
544 | blkdev_put(resume_bdev); | ||
545 | } | ||
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 4e90905f0e..c4016cbbd3 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -31,41 +31,24 @@ | |||
31 | * Fixed runaway init | 31 | * Fixed runaway init |
32 | * | 32 | * |
33 | * Rafael J. Wysocki <rjw@sisk.pl> | 33 | * Rafael J. Wysocki <rjw@sisk.pl> |
34 | * Added the swap map data structure and reworked the handling of swap | 34 | * Reworked the freeing of memory and the handling of swap |
35 | * | 35 | * |
36 | * More state savers are welcome. Especially for the scsi layer... | 36 | * More state savers are welcome. Especially for the scsi layer... |
37 | * | 37 | * |
38 | * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt | 38 | * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt |
39 | */ | 39 | */ |
40 | 40 | ||
41 | #include <linux/module.h> | ||
42 | #include <linux/mm.h> | 41 | #include <linux/mm.h> |
43 | #include <linux/suspend.h> | 42 | #include <linux/suspend.h> |
44 | #include <linux/smp_lock.h> | ||
45 | #include <linux/file.h> | ||
46 | #include <linux/utsname.h> | ||
47 | #include <linux/version.h> | ||
48 | #include <linux/delay.h> | ||
49 | #include <linux/bitops.h> | ||
50 | #include <linux/spinlock.h> | 43 | #include <linux/spinlock.h> |
51 | #include <linux/genhd.h> | ||
52 | #include <linux/kernel.h> | 44 | #include <linux/kernel.h> |
53 | #include <linux/major.h> | 45 | #include <linux/major.h> |
54 | #include <linux/swap.h> | 46 | #include <linux/swap.h> |
55 | #include <linux/pm.h> | 47 | #include <linux/pm.h> |
56 | #include <linux/device.h> | ||
57 | #include <linux/buffer_head.h> | ||
58 | #include <linux/swapops.h> | 48 | #include <linux/swapops.h> |
59 | #include <linux/bootmem.h> | 49 | #include <linux/bootmem.h> |
60 | #include <linux/syscalls.h> | 50 | #include <linux/syscalls.h> |
61 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
62 | #include <linux/bio.h> | ||
63 | |||
64 | #include <asm/uaccess.h> | ||
65 | #include <asm/mmu_context.h> | ||
66 | #include <asm/pgtable.h> | ||
67 | #include <asm/tlbflush.h> | ||
68 | #include <asm/io.h> | ||
69 | 52 | ||
70 | #include "power.h" | 53 | #include "power.h" |
71 | 54 | ||
@@ -77,6 +60,8 @@ | |||
77 | */ | 60 | */ |
78 | unsigned long image_size = 500 * 1024 * 1024; | 61 | unsigned long image_size = 500 * 1024 * 1024; |
79 | 62 | ||
63 | int in_suspend __nosavedata = 0; | ||
64 | |||
80 | #ifdef CONFIG_HIGHMEM | 65 | #ifdef CONFIG_HIGHMEM |
81 | unsigned int count_highmem_pages(void); | 66 | unsigned int count_highmem_pages(void); |
82 | int save_highmem(void); | 67 | int save_highmem(void); |
@@ -87,473 +72,97 @@ static int restore_highmem(void) { return 0; } | |||
87 | static unsigned int count_highmem_pages(void) { return 0; } | 72 | static unsigned int count_highmem_pages(void) { return 0; } |
88 | #endif | 73 | #endif |
89 | 74 | ||
90 | extern char resume_file[]; | ||
91 | |||
92 | #define SWSUSP_SIG "S1SUSPEND" | ||
93 | |||
94 | static struct swsusp_header { | ||
95 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; | ||
96 | swp_entry_t image; | ||
97 | char orig_sig[10]; | ||
98 | char sig[10]; | ||
99 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | ||
100 | |||
101 | static struct swsusp_info swsusp_info; | ||
102 | |||
103 | /* | ||
104 | * Saving part... | ||
105 | */ | ||
106 | |||
107 | static unsigned short root_swap = 0xffff; | ||
108 | |||
109 | static int mark_swapfiles(swp_entry_t start) | ||
110 | { | ||
111 | int error; | ||
112 | |||
113 | rw_swap_page_sync(READ, | ||
114 | swp_entry(root_swap, 0), | ||
115 | virt_to_page((unsigned long)&swsusp_header)); | ||
116 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | ||
117 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | ||
118 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | ||
119 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | ||
120 | swsusp_header.image = start; | ||
121 | error = rw_swap_page_sync(WRITE, | ||
122 | swp_entry(root_swap, 0), | ||
123 | virt_to_page((unsigned long) | ||
124 | &swsusp_header)); | ||
125 | } else { | ||
126 | pr_debug("swsusp: Partition is not swap space.\n"); | ||
127 | error = -ENODEV; | ||
128 | } | ||
129 | return error; | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Check whether the swap device is the specified resume | ||
134 | * device, irrespective of whether they are specified by | ||
135 | * identical names. | ||
136 | * | ||
137 | * (Thus, device inode aliasing is allowed. You can say /dev/hda4 | ||
138 | * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs] | ||
139 | * and they'll be considered the same device. This is *necessary* for | ||
140 | * devfs, since the resume code can only recognize the form /dev/hda4, | ||
141 | * but the suspend code would see the long name.) | ||
142 | */ | ||
143 | static inline int is_resume_device(const struct swap_info_struct *swap_info) | ||
144 | { | ||
145 | struct file *file = swap_info->swap_file; | ||
146 | struct inode *inode = file->f_dentry->d_inode; | ||
147 | |||
148 | return S_ISBLK(inode->i_mode) && | ||
149 | swsusp_resume_device == MKDEV(imajor(inode), iminor(inode)); | ||
150 | } | ||
151 | |||
152 | static int swsusp_swap_check(void) /* This is called before saving image */ | ||
153 | { | ||
154 | int i; | ||
155 | |||
156 | if (!swsusp_resume_device) | ||
157 | return -ENODEV; | ||
158 | spin_lock(&swap_lock); | ||
159 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
160 | if (!(swap_info[i].flags & SWP_WRITEOK)) | ||
161 | continue; | ||
162 | if (is_resume_device(swap_info + i)) { | ||
163 | spin_unlock(&swap_lock); | ||
164 | root_swap = i; | ||
165 | return 0; | ||
166 | } | ||
167 | } | ||
168 | spin_unlock(&swap_lock); | ||
169 | return -ENODEV; | ||
170 | } | ||
171 | |||
172 | /** | ||
173 | * write_page - Write one page to a fresh swap location. | ||
174 | * @addr: Address we're writing. | ||
175 | * @loc: Place to store the entry we used. | ||
176 | * | ||
177 | * Allocate a new swap entry and 'sync' it. Note we discard -EIO | ||
178 | * errors. That is an artifact left over from swsusp. It did not | ||
179 | * check the return of rw_swap_page_sync() at all, since most pages | ||
180 | * written back to swap would return -EIO. | ||
181 | * This is a partial improvement, since we will at least return other | ||
182 | * errors, though we need to eventually fix the damn code. | ||
183 | */ | ||
184 | static int write_page(unsigned long addr, swp_entry_t *loc) | ||
185 | { | ||
186 | swp_entry_t entry; | ||
187 | int error = -ENOSPC; | ||
188 | |||
189 | entry = get_swap_page_of_type(root_swap); | ||
190 | if (swp_offset(entry)) { | ||
191 | error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); | ||
192 | if (!error || error == -EIO) | ||
193 | *loc = entry; | ||
194 | } | ||
195 | return error; | ||
196 | } | ||
197 | |||
198 | /** | 75 | /** |
199 | * Swap map-handling functions | 76 | * The following functions are used for tracing the allocated |
200 | * | 77 | * swap pages, so that they can be freed in case of an error. |
201 | * The swap map is a data structure used for keeping track of each page | ||
202 | * written to the swap. It consists of many swap_map_page structures | ||
203 | * that contain each an array of MAP_PAGE_SIZE swap entries. | ||
204 | * These structures are linked together with the help of either the | ||
205 | * .next (in memory) or the .next_swap (in swap) member. | ||
206 | * | 78 | * |
207 | * The swap map is created during suspend. At that time we need to keep | 79 | * The functions operate on a linked bitmap structure defined |
208 | * it in memory, because we have to free all of the allocated swap | 80 | * in power.h |
209 | * entries if an error occurs. The memory needed is preallocated | ||
210 | * so that we know in advance if there's enough of it. | ||
211 | * | ||
212 | * The first swap_map_page structure is filled with the swap entries that | ||
213 | * correspond to the first MAP_PAGE_SIZE data pages written to swap and | ||
214 | * so on. After the all of the data pages have been written, the order | ||
215 | * of the swap_map_page structures in the map is reversed so that they | ||
216 | * can be read from swap in the original order. This causes the data | ||
217 | * pages to be loaded in exactly the same order in which they have been | ||
218 | * saved. | ||
219 | * | ||
220 | * During resume we only need to use one swap_map_page structure | ||
221 | * at a time, which means that we only need to use two memory pages for | ||
222 | * reading the image - one for reading the swap_map_page structures | ||
223 | * and the second for reading the data pages from swap. | ||
224 | */ | 81 | */ |
225 | 82 | ||
226 | #define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ | 83 | void free_bitmap(struct bitmap_page *bitmap) |
227 | / sizeof(swp_entry_t)) | ||
228 | |||
229 | struct swap_map_page { | ||
230 | swp_entry_t entries[MAP_PAGE_SIZE]; | ||
231 | swp_entry_t next_swap; | ||
232 | struct swap_map_page *next; | ||
233 | }; | ||
234 | |||
235 | static inline void free_swap_map(struct swap_map_page *swap_map) | ||
236 | { | 84 | { |
237 | struct swap_map_page *swp; | 85 | struct bitmap_page *bp; |
238 | 86 | ||
239 | while (swap_map) { | 87 | while (bitmap) { |
240 | swp = swap_map->next; | 88 | bp = bitmap->next; |
241 | free_page((unsigned long)swap_map); | 89 | free_page((unsigned long)bitmap); |
242 | swap_map = swp; | 90 | bitmap = bp; |
243 | } | 91 | } |
244 | } | 92 | } |
245 | 93 | ||
246 | static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) | 94 | struct bitmap_page *alloc_bitmap(unsigned int nr_bits) |
247 | { | 95 | { |
248 | struct swap_map_page *swap_map, *swp; | 96 | struct bitmap_page *bitmap, *bp; |
249 | unsigned n = 0; | 97 | unsigned int n; |
250 | 98 | ||
251 | if (!nr_pages) | 99 | if (!nr_bits) |
252 | return NULL; | 100 | return NULL; |
253 | 101 | ||
254 | pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); | 102 | bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); |
255 | swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | 103 | bp = bitmap; |
256 | swp = swap_map; | 104 | for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) { |
257 | for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { | 105 | bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); |
258 | swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | 106 | bp = bp->next; |
259 | swp = swp->next; | 107 | if (!bp) { |
260 | if (!swp) { | 108 | free_bitmap(bitmap); |
261 | free_swap_map(swap_map); | ||
262 | return NULL; | 109 | return NULL; |
263 | } | 110 | } |
264 | } | 111 | } |
265 | return swap_map; | 112 | return bitmap; |
266 | } | 113 | } |
267 | 114 | ||
268 | /** | 115 | static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) |
269 | * reverse_swap_map - reverse the order of pages in the swap map | ||
270 | * @swap_map | ||
271 | */ | ||
272 | |||
273 | static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) | ||
274 | { | ||
275 | struct swap_map_page *prev, *next; | ||
276 | |||
277 | prev = NULL; | ||
278 | while (swap_map) { | ||
279 | next = swap_map->next; | ||
280 | swap_map->next = prev; | ||
281 | prev = swap_map; | ||
282 | swap_map = next; | ||
283 | } | ||
284 | return prev; | ||
285 | } | ||
286 | |||
287 | /** | ||
288 | * free_swap_map_entries - free the swap entries allocated to store | ||
289 | * the swap map @swap_map (this is only called in case of an error) | ||
290 | */ | ||
291 | static inline void free_swap_map_entries(struct swap_map_page *swap_map) | ||
292 | { | ||
293 | while (swap_map) { | ||
294 | if (swap_map->next_swap.val) | ||
295 | swap_free(swap_map->next_swap); | ||
296 | swap_map = swap_map->next; | ||
297 | } | ||
298 | } | ||
299 | |||
300 | /** | ||
301 | * save_swap_map - save the swap map used for tracing the data pages | ||
302 | * stored in the swap | ||
303 | */ | ||
304 | |||
305 | static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) | ||
306 | { | ||
307 | swp_entry_t entry = (swp_entry_t){0}; | ||
308 | int error; | ||
309 | |||
310 | while (swap_map) { | ||
311 | swap_map->next_swap = entry; | ||
312 | if ((error = write_page((unsigned long)swap_map, &entry))) | ||
313 | return error; | ||
314 | swap_map = swap_map->next; | ||
315 | } | ||
316 | *start = entry; | ||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | /** | ||
321 | * free_image_entries - free the swap entries allocated to store | ||
322 | * the image data pages (this is only called in case of an error) | ||
323 | */ | ||
324 | |||
325 | static inline void free_image_entries(struct swap_map_page *swp) | ||
326 | { | 116 | { |
327 | unsigned k; | 117 | unsigned int n; |
328 | 118 | ||
329 | while (swp) { | 119 | n = BITMAP_PAGE_BITS; |
330 | for (k = 0; k < MAP_PAGE_SIZE; k++) | 120 | while (bitmap && n <= bit) { |
331 | if (swp->entries[k].val) | 121 | n += BITMAP_PAGE_BITS; |
332 | swap_free(swp->entries[k]); | 122 | bitmap = bitmap->next; |
333 | swp = swp->next; | ||
334 | } | 123 | } |
335 | } | 124 | if (!bitmap) |
336 | 125 | return -EINVAL; | |
337 | /** | 126 | n -= BITMAP_PAGE_BITS; |
338 | * The swap_map_handle structure is used for handling the swap map in | 127 | bit -= n; |
339 | * a file-alike way | 128 | n = 0; |
340 | */ | 129 | while (bit >= BITS_PER_CHUNK) { |
341 | 130 | bit -= BITS_PER_CHUNK; | |
342 | struct swap_map_handle { | 131 | n++; |
343 | struct swap_map_page *cur; | ||
344 | unsigned int k; | ||
345 | }; | ||
346 | |||
347 | static inline void init_swap_map_handle(struct swap_map_handle *handle, | ||
348 | struct swap_map_page *map) | ||
349 | { | ||
350 | handle->cur = map; | ||
351 | handle->k = 0; | ||
352 | } | ||
353 | |||
354 | static inline int swap_map_write_page(struct swap_map_handle *handle, | ||
355 | unsigned long addr) | ||
356 | { | ||
357 | int error; | ||
358 | |||
359 | error = write_page(addr, handle->cur->entries + handle->k); | ||
360 | if (error) | ||
361 | return error; | ||
362 | if (++handle->k >= MAP_PAGE_SIZE) { | ||
363 | handle->cur = handle->cur->next; | ||
364 | handle->k = 0; | ||
365 | } | 132 | } |
133 | bitmap->chunks[n] |= (1UL << bit); | ||
366 | return 0; | 134 | return 0; |
367 | } | 135 | } |
368 | 136 | ||
369 | /** | 137 | unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) |
370 | * save_image_data - save the data pages pointed to by the PBEs | ||
371 | * from the list @pblist using the swap map handle @handle | ||
372 | * (assume there are @nr_pages data pages to save) | ||
373 | */ | ||
374 | |||
375 | static int save_image_data(struct pbe *pblist, | ||
376 | struct swap_map_handle *handle, | ||
377 | unsigned int nr_pages) | ||
378 | { | ||
379 | unsigned int m; | ||
380 | struct pbe *p; | ||
381 | int error = 0; | ||
382 | |||
383 | printk("Saving image data pages (%u pages) ... ", nr_pages); | ||
384 | m = nr_pages / 100; | ||
385 | if (!m) | ||
386 | m = 1; | ||
387 | nr_pages = 0; | ||
388 | for_each_pbe (p, pblist) { | ||
389 | error = swap_map_write_page(handle, p->address); | ||
390 | if (error) | ||
391 | break; | ||
392 | if (!(nr_pages % m)) | ||
393 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
394 | nr_pages++; | ||
395 | } | ||
396 | if (!error) | ||
397 | printk("\b\b\b\bdone\n"); | ||
398 | return error; | ||
399 | } | ||
400 | |||
401 | static void dump_info(void) | ||
402 | { | ||
403 | pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code); | ||
404 | pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages); | ||
405 | pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname); | ||
406 | pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename); | ||
407 | pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release); | ||
408 | pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version); | ||
409 | pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine); | ||
410 | pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); | ||
411 | pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); | ||
412 | pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); | ||
413 | pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); | ||
414 | } | ||
415 | |||
416 | static void init_header(unsigned int nr_pages) | ||
417 | { | ||
418 | memset(&swsusp_info, 0, sizeof(swsusp_info)); | ||
419 | swsusp_info.version_code = LINUX_VERSION_CODE; | ||
420 | swsusp_info.num_physpages = num_physpages; | ||
421 | memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); | ||
422 | |||
423 | swsusp_info.cpus = num_online_cpus(); | ||
424 | swsusp_info.image_pages = nr_pages; | ||
425 | swsusp_info.pages = nr_pages + | ||
426 | ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; | ||
427 | } | ||
428 | |||
429 | /** | ||
430 | * pack_orig_addresses - the .orig_address fields of the PBEs from the | ||
431 | * list starting at @pbe are stored in the array @buf[] (1 page) | ||
432 | */ | ||
433 | |||
434 | static inline struct pbe *pack_orig_addresses(unsigned long *buf, | ||
435 | struct pbe *pbe) | ||
436 | { | ||
437 | int j; | ||
438 | |||
439 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | ||
440 | buf[j] = pbe->orig_address; | ||
441 | pbe = pbe->next; | ||
442 | } | ||
443 | if (!pbe) | ||
444 | for (; j < PAGE_SIZE / sizeof(long); j++) | ||
445 | buf[j] = 0; | ||
446 | return pbe; | ||
447 | } | ||
448 | |||
449 | /** | ||
450 | * save_image_metadata - save the .orig_address fields of the PBEs | ||
451 | * from the list @pblist using the swap map handle @handle | ||
452 | */ | ||
453 | |||
454 | static int save_image_metadata(struct pbe *pblist, | ||
455 | struct swap_map_handle *handle) | ||
456 | { | 138 | { |
457 | unsigned long *buf; | 139 | unsigned long offset; |
458 | unsigned int n = 0; | ||
459 | struct pbe *p; | ||
460 | int error = 0; | ||
461 | 140 | ||
462 | printk("Saving image metadata ... "); | 141 | offset = swp_offset(get_swap_page_of_type(swap)); |
463 | buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); | 142 | if (offset) { |
464 | if (!buf) | 143 | if (bitmap_set(bitmap, offset)) { |
465 | return -ENOMEM; | 144 | swap_free(swp_entry(swap, offset)); |
466 | p = pblist; | 145 | offset = 0; |
467 | while (p) { | 146 | } |
468 | p = pack_orig_addresses(buf, p); | ||
469 | error = swap_map_write_page(handle, (unsigned long)buf); | ||
470 | if (error) | ||
471 | break; | ||
472 | n++; | ||
473 | } | 147 | } |
474 | free_page((unsigned long)buf); | 148 | return offset; |
475 | if (!error) | ||
476 | printk("done (%u pages saved)\n", n); | ||
477 | return error; | ||
478 | } | 149 | } |
479 | 150 | ||
480 | /** | 151 | void free_all_swap_pages(int swap, struct bitmap_page *bitmap) |
481 | * enough_swap - Make sure we have enough swap to save the image. | ||
482 | * | ||
483 | * Returns TRUE or FALSE after checking the total amount of swap | ||
484 | * space avaiable from the resume partition. | ||
485 | */ | ||
486 | |||
487 | static int enough_swap(unsigned int nr_pages) | ||
488 | { | 152 | { |
489 | unsigned int free_swap = swap_info[root_swap].pages - | 153 | unsigned int bit, n; |
490 | swap_info[root_swap].inuse_pages; | 154 | unsigned long test; |
491 | |||
492 | pr_debug("swsusp: free swap pages: %u\n", free_swap); | ||
493 | return free_swap > (nr_pages + PAGES_FOR_IO + | ||
494 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
495 | } | ||
496 | 155 | ||
497 | /** | 156 | bit = 0; |
498 | * swsusp_write - Write entire image and metadata. | 157 | while (bitmap) { |
499 | * | 158 | for (n = 0; n < BITMAP_PAGE_CHUNKS; n++) |
500 | * It is important _NOT_ to umount filesystems at this point. We want | 159 | for (test = 1UL; test; test <<= 1) { |
501 | * them synced (in case something goes wrong) but we DO not want to mark | 160 | if (bitmap->chunks[n] & test) |
502 | * filesystem clean: it is not. (And it does not matter, if we resume | 161 | swap_free(swp_entry(swap, bit)); |
503 | * correctly, we'll mark system clean, anyway.) | 162 | bit++; |
504 | */ | 163 | } |
505 | 164 | bitmap = bitmap->next; | |
506 | int swsusp_write(struct pbe *pblist, unsigned int nr_pages) | ||
507 | { | ||
508 | struct swap_map_page *swap_map; | ||
509 | struct swap_map_handle handle; | ||
510 | swp_entry_t start; | ||
511 | int error; | ||
512 | |||
513 | if ((error = swsusp_swap_check())) { | ||
514 | printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); | ||
515 | return error; | ||
516 | } | ||
517 | if (!enough_swap(nr_pages)) { | ||
518 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | ||
519 | return -ENOSPC; | ||
520 | } | 165 | } |
521 | |||
522 | init_header(nr_pages); | ||
523 | swap_map = alloc_swap_map(swsusp_info.pages); | ||
524 | if (!swap_map) | ||
525 | return -ENOMEM; | ||
526 | init_swap_map_handle(&handle, swap_map); | ||
527 | |||
528 | error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); | ||
529 | if (!error) | ||
530 | error = save_image_metadata(pblist, &handle); | ||
531 | if (!error) | ||
532 | error = save_image_data(pblist, &handle, nr_pages); | ||
533 | if (error) | ||
534 | goto Free_image_entries; | ||
535 | |||
536 | swap_map = reverse_swap_map(swap_map); | ||
537 | error = save_swap_map(swap_map, &start); | ||
538 | if (error) | ||
539 | goto Free_map_entries; | ||
540 | |||
541 | dump_info(); | ||
542 | printk( "S" ); | ||
543 | error = mark_swapfiles(start); | ||
544 | printk( "|\n" ); | ||
545 | if (error) | ||
546 | goto Free_map_entries; | ||
547 | |||
548 | Free_swap_map: | ||
549 | free_swap_map(swap_map); | ||
550 | return error; | ||
551 | |||
552 | Free_map_entries: | ||
553 | free_swap_map_entries(swap_map); | ||
554 | Free_image_entries: | ||
555 | free_image_entries(swap_map); | ||
556 | goto Free_swap_map; | ||
557 | } | 166 | } |
558 | 167 | ||
559 | /** | 168 | /** |
@@ -662,379 +271,3 @@ int swsusp_resume(void) | |||
662 | local_irq_enable(); | 271 | local_irq_enable(); |
663 | return error; | 272 | return error; |
664 | } | 273 | } |
665 | |||
666 | /** | ||
667 | * mark_unsafe_pages - mark the pages that cannot be used for storing | ||
668 | * the image during resume, because they conflict with the pages that | ||
669 | * had been used before suspend | ||
670 | */ | ||
671 | |||
672 | static void mark_unsafe_pages(struct pbe *pblist) | ||
673 | { | ||
674 | struct zone *zone; | ||
675 | unsigned long zone_pfn; | ||
676 | struct pbe *p; | ||
677 | |||
678 | if (!pblist) /* a sanity check */ | ||
679 | return; | ||
680 | |||
681 | /* Clear page flags */ | ||
682 | for_each_zone (zone) { | ||
683 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
684 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) | ||
685 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | ||
686 | zone->zone_start_pfn)); | ||
687 | } | ||
688 | |||
689 | /* Mark orig addresses */ | ||
690 | for_each_pbe (p, pblist) | ||
691 | SetPageNosaveFree(virt_to_page(p->orig_address)); | ||
692 | |||
693 | } | ||
694 | |||
695 | static void copy_page_backup_list(struct pbe *dst, struct pbe *src) | ||
696 | { | ||
697 | /* We assume both lists contain the same number of elements */ | ||
698 | while (src) { | ||
699 | dst->orig_address = src->orig_address; | ||
700 | dst = dst->next; | ||
701 | src = src->next; | ||
702 | } | ||
703 | } | ||
704 | |||
705 | /* | ||
706 | * Using bio to read from swap. | ||
707 | * This code requires a bit more work than just using buffer heads | ||
708 | * but, it is the recommended way for 2.5/2.6. | ||
709 | * The following are to signal the beginning and end of I/O. Bios | ||
710 | * finish asynchronously, while we want them to happen synchronously. | ||
711 | * A simple atomic_t, and a wait loop take care of this problem. | ||
712 | */ | ||
713 | |||
714 | static atomic_t io_done = ATOMIC_INIT(0); | ||
715 | |||
716 | static int end_io(struct bio *bio, unsigned int num, int err) | ||
717 | { | ||
718 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
719 | panic("I/O error reading memory image"); | ||
720 | atomic_set(&io_done, 0); | ||
721 | return 0; | ||
722 | } | ||
723 | |||
724 | static struct block_device *resume_bdev; | ||
725 | |||
726 | /** | ||
727 | * submit - submit BIO request. | ||
728 | * @rw: READ or WRITE. | ||
729 | * @off physical offset of page. | ||
730 | * @page: page we're reading or writing. | ||
731 | * | ||
732 | * Straight from the textbook - allocate and initialize the bio. | ||
733 | * If we're writing, make sure the page is marked as dirty. | ||
734 | * Then submit it and wait. | ||
735 | */ | ||
736 | |||
737 | static int submit(int rw, pgoff_t page_off, void *page) | ||
738 | { | ||
739 | int error = 0; | ||
740 | struct bio *bio; | ||
741 | |||
742 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
743 | if (!bio) | ||
744 | return -ENOMEM; | ||
745 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
746 | bio->bi_bdev = resume_bdev; | ||
747 | bio->bi_end_io = end_io; | ||
748 | |||
749 | if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { | ||
750 | printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); | ||
751 | error = -EFAULT; | ||
752 | goto Done; | ||
753 | } | ||
754 | |||
755 | |||
756 | atomic_set(&io_done, 1); | ||
757 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
758 | while (atomic_read(&io_done)) | ||
759 | yield(); | ||
760 | if (rw == READ) | ||
761 | bio_set_pages_dirty(bio); | ||
762 | Done: | ||
763 | bio_put(bio); | ||
764 | return error; | ||
765 | } | ||
766 | |||
767 | static int bio_read_page(pgoff_t page_off, void *page) | ||
768 | { | ||
769 | return submit(READ, page_off, page); | ||
770 | } | ||
771 | |||
772 | static int bio_write_page(pgoff_t page_off, void *page) | ||
773 | { | ||
774 | return submit(WRITE, page_off, page); | ||
775 | } | ||
776 | |||
777 | /** | ||
778 | * The following functions allow us to read data using a swap map | ||
779 | * in a file-alike way | ||
780 | */ | ||
781 | |||
782 | static inline void release_swap_map_reader(struct swap_map_handle *handle) | ||
783 | { | ||
784 | if (handle->cur) | ||
785 | free_page((unsigned long)handle->cur); | ||
786 | handle->cur = NULL; | ||
787 | } | ||
788 | |||
789 | static inline int get_swap_map_reader(struct swap_map_handle *handle, | ||
790 | swp_entry_t start) | ||
791 | { | ||
792 | int error; | ||
793 | |||
794 | if (!swp_offset(start)) | ||
795 | return -EINVAL; | ||
796 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
797 | if (!handle->cur) | ||
798 | return -ENOMEM; | ||
799 | error = bio_read_page(swp_offset(start), handle->cur); | ||
800 | if (error) { | ||
801 | release_swap_map_reader(handle); | ||
802 | return error; | ||
803 | } | ||
804 | handle->k = 0; | ||
805 | return 0; | ||
806 | } | ||
807 | |||
808 | static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) | ||
809 | { | ||
810 | unsigned long offset; | ||
811 | int error; | ||
812 | |||
813 | if (!handle->cur) | ||
814 | return -EINVAL; | ||
815 | offset = swp_offset(handle->cur->entries[handle->k]); | ||
816 | if (!offset) | ||
817 | return -EINVAL; | ||
818 | error = bio_read_page(offset, buf); | ||
819 | if (error) | ||
820 | return error; | ||
821 | if (++handle->k >= MAP_PAGE_SIZE) { | ||
822 | handle->k = 0; | ||
823 | offset = swp_offset(handle->cur->next_swap); | ||
824 | if (!offset) | ||
825 | release_swap_map_reader(handle); | ||
826 | else | ||
827 | error = bio_read_page(offset, handle->cur); | ||
828 | } | ||
829 | return error; | ||
830 | } | ||
831 | |||
832 | static int check_header(void) | ||
833 | { | ||
834 | char *reason = NULL; | ||
835 | |||
836 | dump_info(); | ||
837 | if (swsusp_info.version_code != LINUX_VERSION_CODE) | ||
838 | reason = "kernel version"; | ||
839 | if (swsusp_info.num_physpages != num_physpages) | ||
840 | reason = "memory size"; | ||
841 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) | ||
842 | reason = "system type"; | ||
843 | if (strcmp(swsusp_info.uts.release,system_utsname.release)) | ||
844 | reason = "kernel release"; | ||
845 | if (strcmp(swsusp_info.uts.version,system_utsname.version)) | ||
846 | reason = "version"; | ||
847 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | ||
848 | reason = "machine"; | ||
849 | if (reason) { | ||
850 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); | ||
851 | return -EPERM; | ||
852 | } | ||
853 | return 0; | ||
854 | } | ||
855 | |||
856 | /** | ||
857 | * load_image_data - load the image data using the swap map handle | ||
858 | * @handle and store them using the page backup list @pblist | ||
859 | * (assume there are @nr_pages pages to load) | ||
860 | */ | ||
861 | |||
862 | static int load_image_data(struct pbe *pblist, | ||
863 | struct swap_map_handle *handle, | ||
864 | unsigned int nr_pages) | ||
865 | { | ||
866 | int error; | ||
867 | unsigned int m; | ||
868 | struct pbe *p; | ||
869 | |||
870 | if (!pblist) | ||
871 | return -EINVAL; | ||
872 | printk("Loading image data pages (%u pages) ... ", nr_pages); | ||
873 | m = nr_pages / 100; | ||
874 | if (!m) | ||
875 | m = 1; | ||
876 | nr_pages = 0; | ||
877 | p = pblist; | ||
878 | while (p) { | ||
879 | error = swap_map_read_page(handle, (void *)p->address); | ||
880 | if (error) | ||
881 | break; | ||
882 | p = p->next; | ||
883 | if (!(nr_pages % m)) | ||
884 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
885 | nr_pages++; | ||
886 | } | ||
887 | if (!error) | ||
888 | printk("\b\b\b\bdone\n"); | ||
889 | return error; | ||
890 | } | ||
891 | |||
892 | /** | ||
893 | * unpack_orig_addresses - copy the elements of @buf[] (1 page) to | ||
894 | * the PBEs in the list starting at @pbe | ||
895 | */ | ||
896 | |||
897 | static inline struct pbe *unpack_orig_addresses(unsigned long *buf, | ||
898 | struct pbe *pbe) | ||
899 | { | ||
900 | int j; | ||
901 | |||
902 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | ||
903 | pbe->orig_address = buf[j]; | ||
904 | pbe = pbe->next; | ||
905 | } | ||
906 | return pbe; | ||
907 | } | ||
908 | |||
909 | /** | ||
910 | * load_image_metadata - load the image metadata using the swap map | ||
911 | * handle @handle and put them into the PBEs in the list @pblist | ||
912 | */ | ||
913 | |||
914 | static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) | ||
915 | { | ||
916 | struct pbe *p; | ||
917 | unsigned long *buf; | ||
918 | unsigned int n = 0; | ||
919 | int error = 0; | ||
920 | |||
921 | printk("Loading image metadata ... "); | ||
922 | buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); | ||
923 | if (!buf) | ||
924 | return -ENOMEM; | ||
925 | p = pblist; | ||
926 | while (p) { | ||
927 | error = swap_map_read_page(handle, buf); | ||
928 | if (error) | ||
929 | break; | ||
930 | p = unpack_orig_addresses(buf, p); | ||
931 | n++; | ||
932 | } | ||
933 | free_page((unsigned long)buf); | ||
934 | if (!error) | ||
935 | printk("done (%u pages loaded)\n", n); | ||
936 | return error; | ||
937 | } | ||
938 | |||
939 | int swsusp_read(struct pbe **pblist_ptr) | ||
940 | { | ||
941 | int error; | ||
942 | struct pbe *p, *pblist; | ||
943 | struct swap_map_handle handle; | ||
944 | unsigned int nr_pages; | ||
945 | |||
946 | if (IS_ERR(resume_bdev)) { | ||
947 | pr_debug("swsusp: block device not initialised\n"); | ||
948 | return PTR_ERR(resume_bdev); | ||
949 | } | ||
950 | |||
951 | error = get_swap_map_reader(&handle, swsusp_header.image); | ||
952 | if (!error) | ||
953 | error = swap_map_read_page(&handle, &swsusp_info); | ||
954 | if (!error) | ||
955 | error = check_header(); | ||
956 | if (error) | ||
957 | return error; | ||
958 | nr_pages = swsusp_info.image_pages; | ||
959 | p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); | ||
960 | if (!p) | ||
961 | return -ENOMEM; | ||
962 | error = load_image_metadata(p, &handle); | ||
963 | if (!error) { | ||
964 | mark_unsafe_pages(p); | ||
965 | pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); | ||
966 | if (pblist) | ||
967 | copy_page_backup_list(pblist, p); | ||
968 | free_pagedir(p); | ||
969 | if (!pblist) | ||
970 | error = -ENOMEM; | ||
971 | |||
972 | /* Allocate memory for the image and read the data from swap */ | ||
973 | if (!error) | ||
974 | error = alloc_data_pages(pblist, GFP_ATOMIC, 1); | ||
975 | if (!error) { | ||
976 | release_eaten_pages(); | ||
977 | error = load_image_data(pblist, &handle, nr_pages); | ||
978 | } | ||
979 | if (!error) | ||
980 | *pblist_ptr = pblist; | ||
981 | } | ||
982 | release_swap_map_reader(&handle); | ||
983 | |||
984 | blkdev_put(resume_bdev); | ||
985 | |||
986 | if (!error) | ||
987 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
988 | else | ||
989 | pr_debug("swsusp: Error %d resuming\n", error); | ||
990 | return error; | ||
991 | } | ||
992 | |||
993 | /** | ||
994 | * swsusp_check - Check for swsusp signature in the resume device | ||
995 | */ | ||
996 | |||
997 | int swsusp_check(void) | ||
998 | { | ||
999 | int error; | ||
1000 | |||
1001 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | ||
1002 | if (!IS_ERR(resume_bdev)) { | ||
1003 | set_blocksize(resume_bdev, PAGE_SIZE); | ||
1004 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | ||
1005 | if ((error = bio_read_page(0, &swsusp_header))) | ||
1006 | return error; | ||
1007 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | ||
1008 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
1009 | /* Reset swap signature now */ | ||
1010 | error = bio_write_page(0, &swsusp_header); | ||
1011 | } else { | ||
1012 | return -EINVAL; | ||
1013 | } | ||
1014 | if (error) | ||
1015 | blkdev_put(resume_bdev); | ||
1016 | else | ||
1017 | pr_debug("swsusp: Signature found, resuming\n"); | ||
1018 | } else { | ||
1019 | error = PTR_ERR(resume_bdev); | ||
1020 | } | ||
1021 | |||
1022 | if (error) | ||
1023 | pr_debug("swsusp: Error %d check for resume file\n", error); | ||
1024 | |||
1025 | return error; | ||
1026 | } | ||
1027 | |||
1028 | /** | ||
1029 | * swsusp_close - close swap device. | ||
1030 | */ | ||
1031 | |||
1032 | void swsusp_close(void) | ||
1033 | { | ||
1034 | if (IS_ERR(resume_bdev)) { | ||
1035 | pr_debug("swsusp: block device not initialised\n"); | ||
1036 | return; | ||
1037 | } | ||
1038 | |||
1039 | blkdev_put(resume_bdev); | ||
1040 | } | ||
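
For orientation: the reader above walks a simple on-swap structure in which the image is described by a chain of map pages, each holding a page's worth of swap offsets plus a next_swap link, and swap_map_read_page() consumes one offset per call, pulling in the next map page once the current one is exhausted. The stand-alone sketch below models only that traversal; the structure layouts, MAP_PAGE_ENTRIES and read_page() are illustrative stand-ins, not the kernel's own definitions.

    /*
     * Sketch of the swap-map chain walk, outside the kernel, for clarity only.
     */
    #include <stddef.h>

    #define MAP_PAGE_ENTRIES 100    /* assumption; the real count derives from PAGE_SIZE */

    struct swap_map_page {
            unsigned long entries[MAP_PAGE_ENTRIES];
            unsigned long next_swap;        /* offset of the next map page, 0 ends the chain */
    };

    struct swap_map_handle {
            struct swap_map_page *cur;      /* current map page, already read into memory */
            unsigned int k;                 /* index of the next unread entry */
    };

    static int read_page(unsigned long offset, void *buf)
    {
            (void)offset; (void)buf;        /* stand-in for bio_read_page() */
            return 0;
    }

    static int read_next_data_page(struct swap_map_handle *handle, void *buf)
    {
            unsigned long offset;
            int error;

            if (!handle->cur || !(offset = handle->cur->entries[handle->k]))
                    return -1;                      /* no map page or no more entries */
            error = read_page(offset, buf);         /* one data page per call */
            if (error)
                    return error;
            if (++handle->k >= MAP_PAGE_ENTRIES) {  /* current map page exhausted */
                    handle->k = 0;
                    offset = handle->cur->next_swap;
                    if (offset)
                            error = read_page(offset, handle->cur); /* follow the chain */
                    else
                            handle->cur = NULL;     /* end of the image */
            }
            return error;
    }
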
diff --git a/kernel/power/user.c b/kernel/power/user.c new file mode 100644 index 0000000000..3f1539fbe4 --- /dev/null +++ b/kernel/power/user.c | |||
@@ -0,0 +1,333 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/user.c | ||
3 | * | ||
4 | * This file provides the user space interface for software suspend/resume. | ||
5 | * | ||
6 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | * | ||
8 | * This file is released under the GPLv2. | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | #include <linux/suspend.h> | ||
13 | #include <linux/syscalls.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/device.h> | ||
16 | #include <linux/miscdevice.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/swap.h> | ||
19 | #include <linux/swapops.h> | ||
20 | #include <linux/pm.h> | ||
21 | #include <linux/fs.h> | ||
22 | |||
23 | #include <asm/uaccess.h> | ||
24 | |||
25 | #include "power.h" | ||
26 | |||
27 | #define SNAPSHOT_MINOR 231 | ||
28 | |||
29 | static struct snapshot_data { | ||
30 | struct snapshot_handle handle; | ||
31 | int swap; | ||
32 | struct bitmap_page *bitmap; | ||
33 | int mode; | ||
34 | char frozen; | ||
35 | char ready; | ||
36 | } snapshot_state; | ||
37 | |||
38 | static atomic_t device_available = ATOMIC_INIT(1); | ||
39 | |||
40 | static int snapshot_open(struct inode *inode, struct file *filp) | ||
41 | { | ||
42 | struct snapshot_data *data; | ||
43 | |||
44 | if (!atomic_add_unless(&device_available, -1, 0)) | ||
45 | return -EBUSY; | ||
46 | |||
47 | if ((filp->f_flags & O_ACCMODE) == O_RDWR) | ||
48 | return -ENOSYS; | ||
49 | |||
50 | nonseekable_open(inode, filp); | ||
51 | data = &snapshot_state; | ||
52 | filp->private_data = data; | ||
53 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); | ||
54 | if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { | ||
55 | data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; | ||
56 | data->mode = O_RDONLY; | ||
57 | } else { | ||
58 | data->swap = -1; | ||
59 | data->mode = O_WRONLY; | ||
60 | } | ||
61 | data->bitmap = NULL; | ||
62 | data->frozen = 0; | ||
63 | data->ready = 0; | ||
64 | |||
65 | return 0; | ||
66 | } | ||
67 | |||
68 | static int snapshot_release(struct inode *inode, struct file *filp) | ||
69 | { | ||
70 | struct snapshot_data *data; | ||
71 | |||
72 | swsusp_free(); | ||
73 | data = filp->private_data; | ||
74 | free_all_swap_pages(data->swap, data->bitmap); | ||
75 | free_bitmap(data->bitmap); | ||
76 | if (data->frozen) { | ||
77 | down(&pm_sem); | ||
78 | thaw_processes(); | ||
79 | enable_nonboot_cpus(); | ||
80 | up(&pm_sem); | ||
81 | } | ||
82 | atomic_inc(&device_available); | ||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | static ssize_t snapshot_read(struct file *filp, char __user *buf, | ||
87 | size_t count, loff_t *offp) | ||
88 | { | ||
89 | struct snapshot_data *data; | ||
90 | ssize_t res; | ||
91 | |||
92 | data = filp->private_data; | ||
93 | res = snapshot_read_next(&data->handle, count); | ||
94 | if (res > 0) { | ||
95 | if (copy_to_user(buf, data_of(data->handle), res)) | ||
96 | res = -EFAULT; | ||
97 | else | ||
98 | *offp = data->handle.offset; | ||
99 | } | ||
100 | return res; | ||
101 | } | ||
102 | |||
103 | static ssize_t snapshot_write(struct file *filp, const char __user *buf, | ||
104 | size_t count, loff_t *offp) | ||
105 | { | ||
106 | struct snapshot_data *data; | ||
107 | ssize_t res; | ||
108 | |||
109 | data = filp->private_data; | ||
110 | res = snapshot_write_next(&data->handle, count); | ||
111 | if (res > 0) { | ||
112 | if (copy_from_user(data_of(data->handle), buf, res)) | ||
113 | res = -EFAULT; | ||
114 | else | ||
115 | *offp = data->handle.offset; | ||
116 | } | ||
117 | return res; | ||
118 | } | ||
119 | |||
120 | static int snapshot_ioctl(struct inode *inode, struct file *filp, | ||
121 | unsigned int cmd, unsigned long arg) | ||
122 | { | ||
123 | int error = 0; | ||
124 | struct snapshot_data *data; | ||
125 | loff_t offset, avail; | ||
126 | |||
127 | if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) | ||
128 | return -ENOTTY; | ||
129 | if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) | ||
130 | return -ENOTTY; | ||
131 | if (!capable(CAP_SYS_ADMIN)) | ||
132 | return -EPERM; | ||
133 | |||
134 | data = filp->private_data; | ||
135 | |||
136 | switch (cmd) { | ||
137 | |||
138 | case SNAPSHOT_FREEZE: | ||
139 | if (data->frozen) | ||
140 | break; | ||
141 | down(&pm_sem); | ||
142 | disable_nonboot_cpus(); | ||
143 | if (freeze_processes()) { | ||
144 | thaw_processes(); | ||
145 | enable_nonboot_cpus(); | ||
146 | error = -EBUSY; | ||
147 | } | ||
148 | up(&pm_sem); | ||
149 | if (!error) | ||
150 | data->frozen = 1; | ||
151 | break; | ||
152 | |||
153 | case SNAPSHOT_UNFREEZE: | ||
154 | if (!data->frozen) | ||
155 | break; | ||
156 | down(&pm_sem); | ||
157 | thaw_processes(); | ||
158 | enable_nonboot_cpus(); | ||
159 | up(&pm_sem); | ||
160 | data->frozen = 0; | ||
161 | break; | ||
162 | |||
163 | case SNAPSHOT_ATOMIC_SNAPSHOT: | ||
164 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { | ||
165 | error = -EPERM; | ||
166 | break; | ||
167 | } | ||
168 | down(&pm_sem); | ||
169 | /* Free memory before shutting down devices. */ | ||
170 | error = swsusp_shrink_memory(); | ||
171 | if (!error) { | ||
172 | error = device_suspend(PMSG_FREEZE); | ||
173 | if (!error) { | ||
174 | in_suspend = 1; | ||
175 | error = swsusp_suspend(); | ||
176 | device_resume(); | ||
177 | } | ||
178 | } | ||
179 | up(&pm_sem); | ||
180 | if (!error) | ||
181 | error = put_user(in_suspend, (unsigned int __user *)arg); | ||
182 | if (!error) | ||
183 | data->ready = 1; | ||
184 | break; | ||
185 | |||
186 | case SNAPSHOT_ATOMIC_RESTORE: | ||
187 | if (data->mode != O_WRONLY || !data->frozen || | ||
188 | !snapshot_image_loaded(&data->handle)) { | ||
189 | error = -EPERM; | ||
190 | break; | ||
191 | } | ||
192 | down(&pm_sem); | ||
193 | pm_prepare_console(); | ||
194 | error = device_suspend(PMSG_FREEZE); | ||
195 | if (!error) { | ||
196 | error = swsusp_resume(); | ||
197 | device_resume(); | ||
198 | } | ||
199 | pm_restore_console(); | ||
200 | up(&pm_sem); | ||
201 | break; | ||
202 | |||
203 | case SNAPSHOT_FREE: | ||
204 | swsusp_free(); | ||
205 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); | ||
206 | data->ready = 0; | ||
207 | break; | ||
208 | |||
209 | case SNAPSHOT_SET_IMAGE_SIZE: | ||
210 | image_size = arg; | ||
211 | break; | ||
212 | |||
213 | case SNAPSHOT_AVAIL_SWAP: | ||
214 | avail = count_swap_pages(data->swap, 1); | ||
215 | avail <<= PAGE_SHIFT; | ||
216 | error = put_user(avail, (loff_t __user *)arg); | ||
217 | break; | ||
218 | |||
219 | case SNAPSHOT_GET_SWAP_PAGE: | ||
220 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | ||
221 | error = -ENODEV; | ||
222 | break; | ||
223 | } | ||
224 | if (!data->bitmap) { | ||
225 | data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0)); | ||
226 | if (!data->bitmap) { | ||
227 | error = -ENOMEM; | ||
228 | break; | ||
229 | } | ||
230 | } | ||
231 | offset = alloc_swap_page(data->swap, data->bitmap); | ||
232 | if (offset) { | ||
233 | offset <<= PAGE_SHIFT; | ||
234 | error = put_user(offset, (loff_t __user *)arg); | ||
235 | } else { | ||
236 | error = -ENOSPC; | ||
237 | } | ||
238 | break; | ||
239 | |||
240 | case SNAPSHOT_FREE_SWAP_PAGES: | ||
241 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | ||
242 | error = -ENODEV; | ||
243 | break; | ||
244 | } | ||
245 | free_all_swap_pages(data->swap, data->bitmap); | ||
246 | free_bitmap(data->bitmap); | ||
247 | data->bitmap = NULL; | ||
248 | break; | ||
249 | |||
250 | case SNAPSHOT_SET_SWAP_FILE: | ||
251 | if (!data->bitmap) { | ||
252 | /* | ||
253 | * User space encodes device types as two-byte values, | ||
254 | * so we need to recode them | ||
255 | */ | ||
256 | if (old_decode_dev(arg)) { | ||
257 | data->swap = swap_type_of(old_decode_dev(arg)); | ||
258 | if (data->swap < 0) | ||
259 | error = -ENODEV; | ||
260 | } else { | ||
261 | data->swap = -1; | ||
262 | error = -EINVAL; | ||
263 | } | ||
264 | } else { | ||
265 | error = -EPERM; | ||
266 | } | ||
267 | break; | ||
268 | |||
269 | case SNAPSHOT_S2RAM: | ||
270 | if (!data->frozen) { | ||
271 | error = -EPERM; | ||
272 | break; | ||
273 | } | ||
274 | |||
275 | if (down_trylock(&pm_sem)) { | ||
276 | error = -EBUSY; | ||
277 | break; | ||
278 | } | ||
279 | |||
280 | if (pm_ops->prepare) { | ||
281 | error = pm_ops->prepare(PM_SUSPEND_MEM); | ||
282 | if (error) | ||
283 | goto OutS3; | ||
284 | } | ||
285 | |||
286 | /* Put devices to sleep */ | ||
287 | error = device_suspend(PMSG_SUSPEND); | ||
288 | if (error) { | ||
289 | printk(KERN_ERR "Failed to suspend some devices.\n"); | ||
290 | } else { | ||
291 | /* Enter S3, system is already frozen */ | ||
292 | suspend_enter(PM_SUSPEND_MEM); | ||
293 | |||
294 | /* Wake up devices */ | ||
295 | device_resume(); | ||
296 | } | ||
297 | |||
298 | if (pm_ops->finish) | ||
299 | pm_ops->finish(PM_SUSPEND_MEM); | ||
300 | |||
301 | OutS3: | ||
302 | up(&pm_sem); | ||
303 | break; | ||
304 | |||
305 | default: | ||
306 | error = -ENOTTY; | ||
307 | |||
308 | } | ||
309 | |||
310 | return error; | ||
311 | } | ||
312 | |||
313 | static struct file_operations snapshot_fops = { | ||
314 | .open = snapshot_open, | ||
315 | .release = snapshot_release, | ||
316 | .read = snapshot_read, | ||
317 | .write = snapshot_write, | ||
318 | .llseek = no_llseek, | ||
319 | .ioctl = snapshot_ioctl, | ||
320 | }; | ||
321 | |||
322 | static struct miscdevice snapshot_device = { | ||
323 | .minor = SNAPSHOT_MINOR, | ||
324 | .name = "snapshot", | ||
325 | .fops = &snapshot_fops, | ||
326 | }; | ||
327 | |||
328 | static int __init snapshot_device_init(void) | ||
329 | { | ||
330 | return misc_register(&snapshot_device); | ||
331 | }; | ||
332 | |||
333 | device_initcall(snapshot_device_init); | ||
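
The ioctl set above is meant to be driven by a user-space hibernation tool. A rough sketch of the suspend-side sequence follows; it assumes the misc device shows up as /dev/snapshot and that the SNAPSHOT_* ioctl numbers are made available to user space from the kernel's power headers (both are assumptions about the environment, not something this file provides), and it streams the image to stdout where a real tool would write it to swap pages obtained via SNAPSHOT_GET_SWAP_PAGE.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    /* SNAPSHOT_FREEZE, SNAPSHOT_ATOMIC_SNAPSHOT, SNAPSHOT_FREE and
     * SNAPSHOT_UNFREEZE must come from the kernel's power ioctl definitions;
     * they are not declared here. */

    int main(void)
    {
            char page[4096];
            unsigned int in_suspend = 0;
            ssize_t n;
            int err = 1;
            int fd = open("/dev/snapshot", O_RDONLY);  /* read-only = snapshot mode */

            if (fd < 0)
                    return 1;
            if (ioctl(fd, SNAPSHOT_FREEZE, 0))         /* freeze user space first */
                    goto out;
            if (ioctl(fd, SNAPSHOT_ATOMIC_SNAPSHOT, &in_suspend))
                    goto thaw;
            if (in_suspend) {
                    /* Still the instance that took the snapshot: stream the image. */
                    while ((n = read(fd, page, sizeof(page))) > 0)
                            fwrite(page, 1, (size_t)n, stdout);
                    ioctl(fd, SNAPSHOT_FREE, 0);
            }
            err = 0;
    thaw:
            ioctl(fd, SNAPSHOT_UNFREEZE, 0);
    out:
            close(fd);
            return err;
    }
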
diff --git a/kernel/printk.c b/kernel/printk.c index 13ced0f782..c056f33244 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -122,44 +122,6 @@ static char *log_buf = __log_buf; | |||
122 | static int log_buf_len = __LOG_BUF_LEN; | 122 | static int log_buf_len = __LOG_BUF_LEN; |
123 | static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ | 123 | static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ |
124 | 124 | ||
125 | /* | ||
126 | * Setup a list of consoles. Called from init/main.c | ||
127 | */ | ||
128 | static int __init console_setup(char *str) | ||
129 | { | ||
130 | char name[sizeof(console_cmdline[0].name)]; | ||
131 | char *s, *options; | ||
132 | int idx; | ||
133 | |||
134 | /* | ||
135 | * Decode str into name, index, options. | ||
136 | */ | ||
137 | if (str[0] >= '0' && str[0] <= '9') { | ||
138 | strcpy(name, "ttyS"); | ||
139 | strncpy(name + 4, str, sizeof(name) - 5); | ||
140 | } else | ||
141 | strncpy(name, str, sizeof(name) - 1); | ||
142 | name[sizeof(name) - 1] = 0; | ||
143 | if ((options = strchr(str, ',')) != NULL) | ||
144 | *(options++) = 0; | ||
145 | #ifdef __sparc__ | ||
146 | if (!strcmp(str, "ttya")) | ||
147 | strcpy(name, "ttyS0"); | ||
148 | if (!strcmp(str, "ttyb")) | ||
149 | strcpy(name, "ttyS1"); | ||
150 | #endif | ||
151 | for (s = name; *s; s++) | ||
152 | if ((*s >= '0' && *s <= '9') || *s == ',') | ||
153 | break; | ||
154 | idx = simple_strtoul(s, NULL, 10); | ||
155 | *s = 0; | ||
156 | |||
157 | add_preferred_console(name, idx, options); | ||
158 | return 1; | ||
159 | } | ||
160 | |||
161 | __setup("console=", console_setup); | ||
162 | |||
163 | static int __init log_buf_len_setup(char *str) | 125 | static int __init log_buf_len_setup(char *str) |
164 | { | 126 | { |
165 | unsigned long size = memparse(str, &str); | 127 | unsigned long size = memparse(str, &str); |
@@ -398,8 +360,7 @@ static void call_console_drivers(unsigned long start, unsigned long end) | |||
398 | unsigned long cur_index, start_print; | 360 | unsigned long cur_index, start_print; |
399 | static int msg_level = -1; | 361 | static int msg_level = -1; |
400 | 362 | ||
401 | if (((long)(start - end)) > 0) | 363 | BUG_ON(((long)(start - end)) > 0); |
402 | BUG(); | ||
403 | 364 | ||
404 | cur_index = start; | 365 | cur_index = start; |
405 | start_print = start; | 366 | start_print = start; |
@@ -659,6 +620,44 @@ static void call_console_drivers(unsigned long start, unsigned long end) | |||
659 | 620 | ||
660 | #endif | 621 | #endif |
661 | 622 | ||
623 | /* | ||
624 | * Set up a list of consoles. Called from init/main.c | ||
625 | */ | ||
626 | static int __init console_setup(char *str) | ||
627 | { | ||
628 | char name[sizeof(console_cmdline[0].name)]; | ||
629 | char *s, *options; | ||
630 | int idx; | ||
631 | |||
632 | /* | ||
633 | * Decode str into name, index, options. | ||
634 | */ | ||
635 | if (str[0] >= '0' && str[0] <= '9') { | ||
636 | strcpy(name, "ttyS"); | ||
637 | strncpy(name + 4, str, sizeof(name) - 5); | ||
638 | } else { | ||
639 | strncpy(name, str, sizeof(name) - 1); | ||
640 | } | ||
641 | name[sizeof(name) - 1] = 0; | ||
642 | if ((options = strchr(str, ',')) != NULL) | ||
643 | *(options++) = 0; | ||
644 | #ifdef __sparc__ | ||
645 | if (!strcmp(str, "ttya")) | ||
646 | strcpy(name, "ttyS0"); | ||
647 | if (!strcmp(str, "ttyb")) | ||
648 | strcpy(name, "ttyS1"); | ||
649 | #endif | ||
650 | for (s = name; *s; s++) | ||
651 | if ((*s >= '0' && *s <= '9') || *s == ',') | ||
652 | break; | ||
653 | idx = simple_strtoul(s, NULL, 10); | ||
654 | *s = 0; | ||
655 | |||
656 | add_preferred_console(name, idx, options); | ||
657 | return 1; | ||
658 | } | ||
659 | __setup("console=", console_setup); | ||
660 | |||
662 | /** | 661 | /** |
663 | * add_preferred_console - add a device to the list of preferred consoles. | 662 | * add_preferred_console - add a device to the list of preferred consoles. |
664 | * @name: device name | 663 | * @name: device name |
@@ -708,8 +707,7 @@ int __init add_preferred_console(char *name, int idx, char *options) | |||
708 | */ | 707 | */ |
709 | void acquire_console_sem(void) | 708 | void acquire_console_sem(void) |
710 | { | 709 | { |
711 | if (in_interrupt()) | 710 | BUG_ON(in_interrupt()); |
712 | BUG(); | ||
713 | down(&console_sem); | 711 | down(&console_sem); |
714 | console_locked = 1; | 712 | console_locked = 1; |
715 | console_may_schedule = 1; | 713 | console_may_schedule = 1; |
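
console_setup() itself is only relocated by this hunk; for reference, this is how a console= argument decomposes into a (name, index, options) triple. The stand-alone sketch below mirrors the decoding with a fixed-size buffer purely to show the resulting values; it is an illustration, not the kernel code.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void decode(const char *arg)
    {
            char str[64], name[16], *options, *s;
            int idx;

            strncpy(str, arg, sizeof(str) - 1);
            str[sizeof(str) - 1] = 0;

            if (str[0] >= '0' && str[0] <= '9') {   /* bare "N,..." means ttySN */
                    strcpy(name, "ttyS");
                    strncpy(name + 4, str, sizeof(name) - 5);
            } else {
                    strncpy(name, str, sizeof(name) - 1);
            }
            name[sizeof(name) - 1] = 0;
            options = strchr(str, ',');
            if (options)
                    *options++ = 0;
            for (s = name; *s; s++)                 /* split "ttyS0" into "ttyS" + 0 */
                    if ((*s >= '0' && *s <= '9') || *s == ',')
                            break;
            idx = (int)strtoul(s, NULL, 10);
            *s = 0;
            printf("console=%s -> name=%s idx=%d options=%s\n",
                   arg, name, idx, options ? options : "(none)");
    }

    int main(void)
    {
            decode("ttyS0,115200n8");       /* -> name=ttyS idx=0 options=115200n8 */
            decode("1,9600");               /* -> name=ttyS idx=1 options=9600     */
            return 0;
    }
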
diff --git a/kernel/profile.c b/kernel/profile.c index f89248e6d7..68afe121e5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/cpu.h> | 23 | #include <linux/cpu.h> |
24 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
25 | #include <linux/highmem.h> | 25 | #include <linux/highmem.h> |
26 | #include <linux/mutex.h> | ||
26 | #include <asm/sections.h> | 27 | #include <asm/sections.h> |
27 | #include <asm/semaphore.h> | 28 | #include <asm/semaphore.h> |
28 | 29 | ||
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL; | |||
44 | #ifdef CONFIG_SMP | 45 | #ifdef CONFIG_SMP |
45 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); | 46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); |
46 | static DEFINE_PER_CPU(int, cpu_profile_flip); | 47 | static DEFINE_PER_CPU(int, cpu_profile_flip); |
47 | static DECLARE_MUTEX(profile_flip_mutex); | 48 | static DEFINE_MUTEX(profile_flip_mutex); |
48 | #endif /* CONFIG_SMP */ | 49 | #endif /* CONFIG_SMP */ |
49 | 50 | ||
50 | static int __init profile_setup(char * str) | 51 | static int __init profile_setup(char * str) |
@@ -86,72 +87,52 @@ void __init profile_init(void) | |||
86 | 87 | ||
87 | #ifdef CONFIG_PROFILING | 88 | #ifdef CONFIG_PROFILING |
88 | 89 | ||
89 | static DECLARE_RWSEM(profile_rwsem); | 90 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); |
90 | static DEFINE_RWLOCK(handoff_lock); | 91 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); |
91 | static struct notifier_block * task_exit_notifier; | 92 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); |
92 | static struct notifier_block * task_free_notifier; | ||
93 | static struct notifier_block * munmap_notifier; | ||
94 | 93 | ||
95 | void profile_task_exit(struct task_struct * task) | 94 | void profile_task_exit(struct task_struct * task) |
96 | { | 95 | { |
97 | down_read(&profile_rwsem); | 96 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); |
98 | notifier_call_chain(&task_exit_notifier, 0, task); | ||
99 | up_read(&profile_rwsem); | ||
100 | } | 97 | } |
101 | 98 | ||
102 | int profile_handoff_task(struct task_struct * task) | 99 | int profile_handoff_task(struct task_struct * task) |
103 | { | 100 | { |
104 | int ret; | 101 | int ret; |
105 | read_lock(&handoff_lock); | 102 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); |
106 | ret = notifier_call_chain(&task_free_notifier, 0, task); | ||
107 | read_unlock(&handoff_lock); | ||
108 | return (ret == NOTIFY_OK) ? 1 : 0; | 103 | return (ret == NOTIFY_OK) ? 1 : 0; |
109 | } | 104 | } |
110 | 105 | ||
111 | void profile_munmap(unsigned long addr) | 106 | void profile_munmap(unsigned long addr) |
112 | { | 107 | { |
113 | down_read(&profile_rwsem); | 108 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); |
114 | notifier_call_chain(&munmap_notifier, 0, (void *)addr); | ||
115 | up_read(&profile_rwsem); | ||
116 | } | 109 | } |
117 | 110 | ||
118 | int task_handoff_register(struct notifier_block * n) | 111 | int task_handoff_register(struct notifier_block * n) |
119 | { | 112 | { |
120 | int err = -EINVAL; | 113 | return atomic_notifier_chain_register(&task_free_notifier, n); |
121 | |||
122 | write_lock(&handoff_lock); | ||
123 | err = notifier_chain_register(&task_free_notifier, n); | ||
124 | write_unlock(&handoff_lock); | ||
125 | return err; | ||
126 | } | 114 | } |
127 | 115 | ||
128 | int task_handoff_unregister(struct notifier_block * n) | 116 | int task_handoff_unregister(struct notifier_block * n) |
129 | { | 117 | { |
130 | int err = -EINVAL; | 118 | return atomic_notifier_chain_unregister(&task_free_notifier, n); |
131 | |||
132 | write_lock(&handoff_lock); | ||
133 | err = notifier_chain_unregister(&task_free_notifier, n); | ||
134 | write_unlock(&handoff_lock); | ||
135 | return err; | ||
136 | } | 119 | } |
137 | 120 | ||
138 | int profile_event_register(enum profile_type type, struct notifier_block * n) | 121 | int profile_event_register(enum profile_type type, struct notifier_block * n) |
139 | { | 122 | { |
140 | int err = -EINVAL; | 123 | int err = -EINVAL; |
141 | 124 | ||
142 | down_write(&profile_rwsem); | ||
143 | |||
144 | switch (type) { | 125 | switch (type) { |
145 | case PROFILE_TASK_EXIT: | 126 | case PROFILE_TASK_EXIT: |
146 | err = notifier_chain_register(&task_exit_notifier, n); | 127 | err = blocking_notifier_chain_register( |
128 | &task_exit_notifier, n); | ||
147 | break; | 129 | break; |
148 | case PROFILE_MUNMAP: | 130 | case PROFILE_MUNMAP: |
149 | err = notifier_chain_register(&munmap_notifier, n); | 131 | err = blocking_notifier_chain_register( |
132 | &munmap_notifier, n); | ||
150 | break; | 133 | break; |
151 | } | 134 | } |
152 | 135 | ||
153 | up_write(&profile_rwsem); | ||
154 | |||
155 | return err; | 136 | return err; |
156 | } | 137 | } |
157 | 138 | ||
@@ -160,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n) | |||
160 | { | 141 | { |
161 | int err = -EINVAL; | 142 | int err = -EINVAL; |
162 | 143 | ||
163 | down_write(&profile_rwsem); | ||
164 | |||
165 | switch (type) { | 144 | switch (type) { |
166 | case PROFILE_TASK_EXIT: | 145 | case PROFILE_TASK_EXIT: |
167 | err = notifier_chain_unregister(&task_exit_notifier, n); | 146 | err = blocking_notifier_chain_unregister( |
147 | &task_exit_notifier, n); | ||
168 | break; | 148 | break; |
169 | case PROFILE_MUNMAP: | 149 | case PROFILE_MUNMAP: |
170 | err = notifier_chain_unregister(&munmap_notifier, n); | 150 | err = blocking_notifier_chain_unregister( |
151 | &munmap_notifier, n); | ||
171 | break; | 152 | break; |
172 | } | 153 | } |
173 | 154 | ||
174 | up_write(&profile_rwsem); | ||
175 | return err; | 155 | return err; |
176 | } | 156 | } |
177 | 157 | ||
@@ -243,7 +223,7 @@ static void profile_flip_buffers(void) | |||
243 | { | 223 | { |
244 | int i, j, cpu; | 224 | int i, j, cpu; |
245 | 225 | ||
246 | down(&profile_flip_mutex); | 226 | mutex_lock(&profile_flip_mutex); |
247 | j = per_cpu(cpu_profile_flip, get_cpu()); | 227 | j = per_cpu(cpu_profile_flip, get_cpu()); |
248 | put_cpu(); | 228 | put_cpu(); |
249 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); | 229 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); |
@@ -259,14 +239,14 @@ static void profile_flip_buffers(void) | |||
259 | hits[i].hits = hits[i].pc = 0; | 239 | hits[i].hits = hits[i].pc = 0; |
260 | } | 240 | } |
261 | } | 241 | } |
262 | up(&profile_flip_mutex); | 242 | mutex_unlock(&profile_flip_mutex); |
263 | } | 243 | } |
264 | 244 | ||
265 | static void profile_discard_flip_buffers(void) | 245 | static void profile_discard_flip_buffers(void) |
266 | { | 246 | { |
267 | int i, cpu; | 247 | int i, cpu; |
268 | 248 | ||
269 | down(&profile_flip_mutex); | 249 | mutex_lock(&profile_flip_mutex); |
270 | i = per_cpu(cpu_profile_flip, get_cpu()); | 250 | i = per_cpu(cpu_profile_flip, get_cpu()); |
271 | put_cpu(); | 251 | put_cpu(); |
272 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); | 252 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); |
@@ -274,7 +254,7 @@ static void profile_discard_flip_buffers(void) | |||
274 | struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; | 254 | struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; |
275 | memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); | 255 | memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); |
276 | } | 256 | } |
277 | up(&profile_flip_mutex); | 257 | mutex_unlock(&profile_flip_mutex); |
278 | } | 258 | } |
279 | 259 | ||
280 | void profile_hit(int type, void *__pc) | 260 | void profile_hit(int type, void *__pc) |
@@ -319,7 +299,7 @@ out: | |||
319 | } | 299 | } |
320 | 300 | ||
321 | #ifdef CONFIG_HOTPLUG_CPU | 301 | #ifdef CONFIG_HOTPLUG_CPU |
322 | static int __devinit profile_cpu_callback(struct notifier_block *info, | 302 | static int profile_cpu_callback(struct notifier_block *info, |
323 | unsigned long action, void *__cpu) | 303 | unsigned long action, void *__cpu) |
324 | { | 304 | { |
325 | int node, cpu = (unsigned long)__cpu; | 305 | int node, cpu = (unsigned long)__cpu; |
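
For context on the conversion above: clients of the profile notifiers keep using profile_event_register()/task_handoff_register() unchanged; only the backing implementation moves from hand-rolled rwsem/rwlock protection to the blocking/atomic notifier heads. A minimal (illustrative) module-side caller of the task-exit chain might look like this; the names my_exit_notify/my_exit_nb are made up for the example.

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/notifier.h>
    #include <linux/profile.h>
    #include <linux/sched.h>

    static int my_exit_notify(struct notifier_block *nb, unsigned long val, void *data)
    {
            struct task_struct *task = data;

            /* Called from profile_task_exit(); the chain is a blocking one,
             * so sleeping here is permitted. */
            printk(KERN_DEBUG "task %d exiting\n", task->pid);
            return NOTIFY_OK;
    }

    static struct notifier_block my_exit_nb = {
            .notifier_call = my_exit_notify,
    };

    static int __init my_init(void)
    {
            return profile_event_register(PROFILE_TASK_EXIT, &my_exit_nb);
    }

    static void __exit my_cleanup(void)
    {
            profile_event_unregister(PROFILE_TASK_EXIT, &my_exit_nb);
    }

    module_init(my_init);
    module_exit(my_cleanup);
    MODULE_LICENSE("GPL");
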
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 5f33cdb6ff..921c22ad16 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -30,14 +30,13 @@ | |||
30 | */ | 30 | */ |
31 | void __ptrace_link(task_t *child, task_t *new_parent) | 31 | void __ptrace_link(task_t *child, task_t *new_parent) |
32 | { | 32 | { |
33 | if (!list_empty(&child->ptrace_list)) | 33 | BUG_ON(!list_empty(&child->ptrace_list)); |
34 | BUG(); | ||
35 | if (child->parent == new_parent) | 34 | if (child->parent == new_parent) |
36 | return; | 35 | return; |
37 | list_add(&child->ptrace_list, &child->parent->ptrace_children); | 36 | list_add(&child->ptrace_list, &child->parent->ptrace_children); |
38 | REMOVE_LINKS(child); | 37 | remove_parent(child); |
39 | child->parent = new_parent; | 38 | child->parent = new_parent; |
40 | SET_LINKS(child); | 39 | add_parent(child); |
41 | } | 40 | } |
42 | 41 | ||
43 | /* | 42 | /* |
@@ -57,10 +56,6 @@ void ptrace_untrace(task_t *child) | |||
57 | signal_wake_up(child, 1); | 56 | signal_wake_up(child, 1); |
58 | } | 57 | } |
59 | } | 58 | } |
60 | if (child->signal->flags & SIGNAL_GROUP_EXIT) { | ||
61 | sigaddset(&child->pending.signal, SIGKILL); | ||
62 | signal_wake_up(child, 1); | ||
63 | } | ||
64 | spin_unlock(&child->sighand->siglock); | 59 | spin_unlock(&child->sighand->siglock); |
65 | } | 60 | } |
66 | 61 | ||
@@ -72,17 +67,18 @@ void ptrace_untrace(task_t *child) | |||
72 | */ | 67 | */ |
73 | void __ptrace_unlink(task_t *child) | 68 | void __ptrace_unlink(task_t *child) |
74 | { | 69 | { |
75 | if (!child->ptrace) | 70 | BUG_ON(!child->ptrace); |
76 | BUG(); | 71 | |
77 | child->ptrace = 0; | 72 | child->ptrace = 0; |
78 | if (!list_empty(&child->ptrace_list)) { | 73 | if (!list_empty(&child->ptrace_list)) { |
79 | list_del_init(&child->ptrace_list); | 74 | list_del_init(&child->ptrace_list); |
80 | REMOVE_LINKS(child); | 75 | remove_parent(child); |
81 | child->parent = child->real_parent; | 76 | child->parent = child->real_parent; |
82 | SET_LINKS(child); | 77 | add_parent(child); |
83 | } | 78 | } |
84 | 79 | ||
85 | ptrace_untrace(child); | 80 | if (child->state == TASK_TRACED) |
81 | ptrace_untrace(child); | ||
86 | } | 82 | } |
87 | 83 | ||
88 | /* | 84 | /* |
@@ -152,12 +148,34 @@ int ptrace_may_attach(struct task_struct *task) | |||
152 | int ptrace_attach(struct task_struct *task) | 148 | int ptrace_attach(struct task_struct *task) |
153 | { | 149 | { |
154 | int retval; | 150 | int retval; |
155 | task_lock(task); | 151 | |
156 | retval = -EPERM; | 152 | retval = -EPERM; |
157 | if (task->pid <= 1) | 153 | if (task->pid <= 1) |
158 | goto bad; | 154 | goto out; |
159 | if (task->tgid == current->tgid) | 155 | if (task->tgid == current->tgid) |
160 | goto bad; | 156 | goto out; |
157 | |||
158 | repeat: | ||
159 | /* | ||
160 | * Nasty, nasty. | ||
161 | * | ||
162 | * We want to hold both the task-lock and the | ||
163 | * tasklist_lock for writing at the same time. | ||
164 | * But that's against the rules (tasklist_lock | ||
165 | * is taken for reading by interrupts on other | ||
166 | * cpu's that may have task_lock). | ||
167 | */ | ||
168 | task_lock(task); | ||
169 | local_irq_disable(); | ||
170 | if (!write_trylock(&tasklist_lock)) { | ||
171 | local_irq_enable(); | ||
172 | task_unlock(task); | ||
173 | do { | ||
174 | cpu_relax(); | ||
175 | } while (!write_can_lock(&tasklist_lock)); | ||
176 | goto repeat; | ||
177 | } | ||
178 | |||
161 | /* the same process cannot be attached many times */ | 179 | /* the same process cannot be attached many times */ |
162 | if (task->ptrace & PT_PTRACED) | 180 | if (task->ptrace & PT_PTRACED) |
163 | goto bad; | 181 | goto bad; |
@@ -170,36 +188,39 @@ int ptrace_attach(struct task_struct *task) | |||
170 | ? PT_ATTACHED : 0); | 188 | ? PT_ATTACHED : 0); |
171 | if (capable(CAP_SYS_PTRACE)) | 189 | if (capable(CAP_SYS_PTRACE)) |
172 | task->ptrace |= PT_PTRACE_CAP; | 190 | task->ptrace |= PT_PTRACE_CAP; |
173 | task_unlock(task); | ||
174 | 191 | ||
175 | write_lock_irq(&tasklist_lock); | ||
176 | __ptrace_link(task, current); | 192 | __ptrace_link(task, current); |
177 | write_unlock_irq(&tasklist_lock); | ||
178 | 193 | ||
179 | force_sig_specific(SIGSTOP, task); | 194 | force_sig_specific(SIGSTOP, task); |
180 | return 0; | ||
181 | 195 | ||
182 | bad: | 196 | bad: |
197 | write_unlock_irq(&tasklist_lock); | ||
183 | task_unlock(task); | 198 | task_unlock(task); |
199 | out: | ||
184 | return retval; | 200 | return retval; |
185 | } | 201 | } |
186 | 202 | ||
203 | void __ptrace_detach(struct task_struct *child, unsigned int data) | ||
204 | { | ||
205 | child->exit_code = data; | ||
206 | /* .. re-parent .. */ | ||
207 | __ptrace_unlink(child); | ||
208 | /* .. and wake it up. */ | ||
209 | if (child->exit_state != EXIT_ZOMBIE) | ||
210 | wake_up_process(child); | ||
211 | } | ||
212 | |||
187 | int ptrace_detach(struct task_struct *child, unsigned int data) | 213 | int ptrace_detach(struct task_struct *child, unsigned int data) |
188 | { | 214 | { |
189 | if (!valid_signal(data)) | 215 | if (!valid_signal(data)) |
190 | return -EIO; | 216 | return -EIO; |
191 | 217 | ||
192 | /* Architecture-specific hardware disable .. */ | 218 | /* Architecture-specific hardware disable .. */ |
193 | ptrace_disable(child); | 219 | ptrace_disable(child); |
194 | 220 | ||
195 | /* .. re-parent .. */ | ||
196 | child->exit_code = data; | ||
197 | |||
198 | write_lock_irq(&tasklist_lock); | 221 | write_lock_irq(&tasklist_lock); |
199 | __ptrace_unlink(child); | 222 | if (child->ptrace) |
200 | /* .. and wake it up. */ | 223 | __ptrace_detach(child, data); |
201 | if (child->exit_state != EXIT_ZOMBIE) | ||
202 | wake_up_process(child); | ||
203 | write_unlock_irq(&tasklist_lock); | 224 | write_unlock_irq(&tasklist_lock); |
204 | 225 | ||
205 | return 0; | 226 | return 0; |
@@ -242,8 +263,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
242 | if (write) { | 263 | if (write) { |
243 | copy_to_user_page(vma, page, addr, | 264 | copy_to_user_page(vma, page, addr, |
244 | maddr + offset, buf, bytes); | 265 | maddr + offset, buf, bytes); |
245 | if (!PageCompound(page)) | 266 | set_page_dirty_lock(page); |
246 | set_page_dirty_lock(page); | ||
247 | } else { | 267 | } else { |
248 | copy_from_user_page(vma, page, addr, | 268 | copy_from_user_page(vma, page, addr, |
249 | buf, maddr + offset, bytes); | 269 | buf, maddr + offset, bytes); |
@@ -417,21 +437,22 @@ int ptrace_request(struct task_struct *child, long request, | |||
417 | */ | 437 | */ |
418 | int ptrace_traceme(void) | 438 | int ptrace_traceme(void) |
419 | { | 439 | { |
420 | int ret; | 440 | int ret = -EPERM; |
421 | 441 | ||
422 | /* | 442 | /* |
423 | * Are we already being traced? | 443 | * Are we already being traced? |
424 | */ | 444 | */ |
425 | if (current->ptrace & PT_PTRACED) | 445 | task_lock(current); |
426 | return -EPERM; | 446 | if (!(current->ptrace & PT_PTRACED)) { |
427 | ret = security_ptrace(current->parent, current); | 447 | ret = security_ptrace(current->parent, current); |
428 | if (ret) | 448 | /* |
429 | return -EPERM; | 449 | * Set the ptrace bit in the process ptrace flags. |
430 | /* | 450 | */ |
431 | * Set the ptrace bit in the process ptrace flags. | 451 | if (!ret) |
432 | */ | 452 | current->ptrace |= PT_PTRACED; |
433 | current->ptrace |= PT_PTRACED; | 453 | } |
434 | return 0; | 454 | task_unlock(current); |
455 | return ret; | ||
435 | } | 456 | } |
436 | 457 | ||
437 | /** | 458 | /** |
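
From user space these paths are reached through ptrace(2): PTRACE_ATTACH lands in ptrace_attach() above (which now takes the task lock and tasklist_lock together), PTRACE_TRACEME in ptrace_traceme(), and PTRACE_DETACH in ptrace_detach(). A minimal hedged sketch of the attach/detach side, with most error handling omitted:

    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    static void trace_existing(pid_t pid)
    {
            if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
                    perror("PTRACE_ATTACH");        /* e.g. -EPERM for pid 1 or self */
                    return;
            }
            waitpid(pid, NULL, 0);                  /* wait for the forced SIGSTOP */
            /* ... inspect the tracee with PTRACE_PEEKDATA and friends ... */
            ptrace(PTRACE_DETACH, pid, NULL, NULL); /* reaches ptrace_detach() */
    }
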
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 0cf8146bd5..2058f88c7b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -47,15 +47,16 @@ | |||
47 | #include <linux/notifier.h> | 47 | #include <linux/notifier.h> |
48 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | #include <linux/mutex.h> | ||
50 | 51 | ||
51 | /* Definition for rcupdate control block. */ | 52 | /* Definition for rcupdate control block. */ |
52 | struct rcu_ctrlblk rcu_ctrlblk = { | 53 | static struct rcu_ctrlblk rcu_ctrlblk = { |
53 | .cur = -300, | 54 | .cur = -300, |
54 | .completed = -300, | 55 | .completed = -300, |
55 | .lock = SPIN_LOCK_UNLOCKED, | 56 | .lock = SPIN_LOCK_UNLOCKED, |
56 | .cpumask = CPU_MASK_NONE, | 57 | .cpumask = CPU_MASK_NONE, |
57 | }; | 58 | }; |
58 | struct rcu_ctrlblk rcu_bh_ctrlblk = { | 59 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { |
59 | .cur = -300, | 60 | .cur = -300, |
60 | .completed = -300, | 61 | .completed = -300, |
61 | .lock = SPIN_LOCK_UNLOCKED, | 62 | .lock = SPIN_LOCK_UNLOCKED, |
@@ -67,7 +68,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | |||
67 | 68 | ||
68 | /* Fake initialization required by compiler */ | 69 | /* Fake initialization required by compiler */ |
69 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | 70 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; |
70 | static int maxbatch = 10000; | 71 | static int blimit = 10; |
72 | static int qhimark = 10000; | ||
73 | static int qlowmark = 100; | ||
74 | #ifdef CONFIG_SMP | ||
75 | static int rsinterval = 1000; | ||
76 | #endif | ||
77 | |||
78 | static atomic_t rcu_barrier_cpu_count; | ||
79 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
80 | static struct completion rcu_barrier_completion; | ||
81 | |||
82 | #ifdef CONFIG_SMP | ||
83 | static void force_quiescent_state(struct rcu_data *rdp, | ||
84 | struct rcu_ctrlblk *rcp) | ||
85 | { | ||
86 | int cpu; | ||
87 | cpumask_t cpumask; | ||
88 | set_need_resched(); | ||
89 | if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { | ||
90 | rdp->last_rs_qlen = rdp->qlen; | ||
91 | /* | ||
92 | * Don't send IPI to itself. With irqs disabled, | ||
93 | * rdp->cpu is the current cpu. | ||
94 | */ | ||
95 | cpumask = rcp->cpumask; | ||
96 | cpu_clear(rdp->cpu, cpumask); | ||
97 | for_each_cpu_mask(cpu, cpumask) | ||
98 | smp_send_reschedule(cpu); | ||
99 | } | ||
100 | } | ||
101 | #else | ||
102 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
103 | struct rcu_ctrlblk *rcp) | ||
104 | { | ||
105 | set_need_resched(); | ||
106 | } | ||
107 | #endif | ||
71 | 108 | ||
72 | /** | 109 | /** |
73 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 110 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
@@ -92,17 +129,13 @@ void fastcall call_rcu(struct rcu_head *head, | |||
92 | rdp = &__get_cpu_var(rcu_data); | 129 | rdp = &__get_cpu_var(rcu_data); |
93 | *rdp->nxttail = head; | 130 | *rdp->nxttail = head; |
94 | rdp->nxttail = &head->next; | 131 | rdp->nxttail = &head->next; |
95 | 132 | if (unlikely(++rdp->qlen > qhimark)) { | |
96 | if (unlikely(++rdp->count > 10000)) | 133 | rdp->blimit = INT_MAX; |
97 | set_need_resched(); | 134 | force_quiescent_state(rdp, &rcu_ctrlblk); |
98 | 135 | } | |
99 | local_irq_restore(flags); | 136 | local_irq_restore(flags); |
100 | } | 137 | } |
101 | 138 | ||
102 | static atomic_t rcu_barrier_cpu_count; | ||
103 | static struct semaphore rcu_barrier_sema; | ||
104 | static struct completion rcu_barrier_completion; | ||
105 | |||
106 | /** | 139 | /** |
107 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | 140 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. |
108 | * @head: structure to be used for queueing the RCU updates. | 141 | * @head: structure to be used for queueing the RCU updates. |
@@ -131,12 +164,12 @@ void fastcall call_rcu_bh(struct rcu_head *head, | |||
131 | rdp = &__get_cpu_var(rcu_bh_data); | 164 | rdp = &__get_cpu_var(rcu_bh_data); |
132 | *rdp->nxttail = head; | 165 | *rdp->nxttail = head; |
133 | rdp->nxttail = &head->next; | 166 | rdp->nxttail = &head->next; |
134 | rdp->count++; | 167 | |
135 | /* | 168 | if (unlikely(++rdp->qlen > qhimark)) { |
136 | * Should we directly call rcu_do_batch() here ? | 169 | rdp->blimit = INT_MAX; |
137 | * if (unlikely(rdp->count > 10000)) | 170 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); |
138 | * rcu_do_batch(rdp); | 171 | } |
139 | */ | 172 | |
140 | local_irq_restore(flags); | 173 | local_irq_restore(flags); |
141 | } | 174 | } |
142 | 175 | ||
@@ -175,13 +208,13 @@ static void rcu_barrier_func(void *notused) | |||
175 | void rcu_barrier(void) | 208 | void rcu_barrier(void) |
176 | { | 209 | { |
177 | BUG_ON(in_interrupt()); | 210 | BUG_ON(in_interrupt()); |
178 | /* Take cpucontrol semaphore to protect against CPU hotplug */ | 211 | /* Take cpucontrol mutex to protect against CPU hotplug */ |
179 | down(&rcu_barrier_sema); | 212 | mutex_lock(&rcu_barrier_mutex); |
180 | init_completion(&rcu_barrier_completion); | 213 | init_completion(&rcu_barrier_completion); |
181 | atomic_set(&rcu_barrier_cpu_count, 0); | 214 | atomic_set(&rcu_barrier_cpu_count, 0); |
182 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); | 215 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); |
183 | wait_for_completion(&rcu_barrier_completion); | 216 | wait_for_completion(&rcu_barrier_completion); |
184 | up(&rcu_barrier_sema); | 217 | mutex_unlock(&rcu_barrier_mutex); |
185 | } | 218 | } |
186 | EXPORT_SYMBOL_GPL(rcu_barrier); | 219 | EXPORT_SYMBOL_GPL(rcu_barrier); |
187 | 220 | ||
@@ -199,10 +232,12 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
199 | next = rdp->donelist = list->next; | 232 | next = rdp->donelist = list->next; |
200 | list->func(list); | 233 | list->func(list); |
201 | list = next; | 234 | list = next; |
202 | rdp->count--; | 235 | rdp->qlen--; |
203 | if (++count >= maxbatch) | 236 | if (++count >= rdp->blimit) |
204 | break; | 237 | break; |
205 | } | 238 | } |
239 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
240 | rdp->blimit = blimit; | ||
206 | if (!rdp->donelist) | 241 | if (!rdp->donelist) |
207 | rdp->donetail = &rdp->donelist; | 242 | rdp->donetail = &rdp->donelist; |
208 | else | 243 | else |
@@ -381,8 +416,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | |||
381 | rdp->curtail = &rdp->curlist; | 416 | rdp->curtail = &rdp->curlist; |
382 | } | 417 | } |
383 | 418 | ||
384 | local_irq_disable(); | ||
385 | if (rdp->nxtlist && !rdp->curlist) { | 419 | if (rdp->nxtlist && !rdp->curlist) { |
420 | local_irq_disable(); | ||
386 | rdp->curlist = rdp->nxtlist; | 421 | rdp->curlist = rdp->nxtlist; |
387 | rdp->curtail = rdp->nxttail; | 422 | rdp->curtail = rdp->nxttail; |
388 | rdp->nxtlist = NULL; | 423 | rdp->nxtlist = NULL; |
@@ -407,9 +442,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | |||
407 | rcu_start_batch(rcp); | 442 | rcu_start_batch(rcp); |
408 | spin_unlock(&rcp->lock); | 443 | spin_unlock(&rcp->lock); |
409 | } | 444 | } |
410 | } else { | ||
411 | local_irq_enable(); | ||
412 | } | 445 | } |
446 | |||
413 | rcu_check_quiescent_state(rcp, rdp); | 447 | rcu_check_quiescent_state(rcp, rdp); |
414 | if (rdp->donelist) | 448 | if (rdp->donelist) |
415 | rcu_do_batch(rdp); | 449 | rcu_do_batch(rdp); |
@@ -445,12 +479,31 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | |||
445 | return 0; | 479 | return 0; |
446 | } | 480 | } |
447 | 481 | ||
482 | /* | ||
483 | * Check to see if there is any immediate RCU-related work to be done | ||
484 | * by the current CPU, returning 1 if so. This function is part of the | ||
485 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
486 | */ | ||
448 | int rcu_pending(int cpu) | 487 | int rcu_pending(int cpu) |
449 | { | 488 | { |
450 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | 489 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || |
451 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | 490 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); |
452 | } | 491 | } |
453 | 492 | ||
493 | /* | ||
494 | * Check to see if any future RCU-related work will need to be done | ||
495 | * by the current CPU, even if none need be done immediately, returning | ||
496 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
497 | * an exported member of the RCU API. | ||
498 | */ | ||
499 | int rcu_needs_cpu(int cpu) | ||
500 | { | ||
501 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
502 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
503 | |||
504 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
505 | } | ||
506 | |||
454 | void rcu_check_callbacks(int cpu, int user) | 507 | void rcu_check_callbacks(int cpu, int user) |
455 | { | 508 | { |
456 | if (user || | 509 | if (user || |
@@ -473,6 +526,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | |||
473 | rdp->quiescbatch = rcp->completed; | 526 | rdp->quiescbatch = rcp->completed; |
474 | rdp->qs_pending = 0; | 527 | rdp->qs_pending = 0; |
475 | rdp->cpu = cpu; | 528 | rdp->cpu = cpu; |
529 | rdp->blimit = blimit; | ||
476 | } | 530 | } |
477 | 531 | ||
478 | static void __devinit rcu_online_cpu(int cpu) | 532 | static void __devinit rcu_online_cpu(int cpu) |
@@ -485,7 +539,7 @@ static void __devinit rcu_online_cpu(int cpu) | |||
485 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | 539 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); |
486 | } | 540 | } |
487 | 541 | ||
488 | static int __devinit rcu_cpu_notify(struct notifier_block *self, | 542 | static int rcu_cpu_notify(struct notifier_block *self, |
489 | unsigned long action, void *hcpu) | 543 | unsigned long action, void *hcpu) |
490 | { | 544 | { |
491 | long cpu = (long)hcpu; | 545 | long cpu = (long)hcpu; |
@@ -502,7 +556,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, | |||
502 | return NOTIFY_OK; | 556 | return NOTIFY_OK; |
503 | } | 557 | } |
504 | 558 | ||
505 | static struct notifier_block __devinitdata rcu_nb = { | 559 | static struct notifier_block rcu_nb = { |
506 | .notifier_call = rcu_cpu_notify, | 560 | .notifier_call = rcu_cpu_notify, |
507 | }; | 561 | }; |
508 | 562 | ||
@@ -514,7 +568,6 @@ static struct notifier_block __devinitdata rcu_nb = { | |||
514 | */ | 568 | */ |
515 | void __init rcu_init(void) | 569 | void __init rcu_init(void) |
516 | { | 570 | { |
517 | sema_init(&rcu_barrier_sema, 1); | ||
518 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | 571 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, |
519 | (void *)(long)smp_processor_id()); | 572 | (void *)(long)smp_processor_id()); |
520 | /* Register notifier for non-boot CPUs */ | 573 | /* Register notifier for non-boot CPUs */ |
@@ -567,9 +620,14 @@ void synchronize_kernel(void) | |||
567 | synchronize_rcu(); | 620 | synchronize_rcu(); |
568 | } | 621 | } |
569 | 622 | ||
570 | module_param(maxbatch, int, 0); | 623 | module_param(blimit, int, 0); |
624 | module_param(qhimark, int, 0); | ||
625 | module_param(qlowmark, int, 0); | ||
626 | #ifdef CONFIG_SMP | ||
627 | module_param(rsinterval, int, 0); | ||
628 | #endif | ||
571 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 629 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
572 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ | 630 | EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ |
573 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ | 631 | EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ |
574 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 632 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
575 | EXPORT_SYMBOL(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ | 633 | EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ |
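
For context on what the new qlen/blimit accounting measures: every call_rcu() queues one rcu_head on the calling CPU's list (qlen), rcu_do_batch() now invokes at most blimit callbacks per pass, and crossing qhimark lifts that limit and triggers force_quiescent_state(). A typical, purely illustrative caller looks like the following; my_node and its helpers are invented for the example.

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct my_node {
            int key;
            struct rcu_head rcu;
    };

    static void my_node_free_rcu(struct rcu_head *head)
    {
            /* Runs after a grace period, from rcu_do_batch(), at most blimit
             * callbacks per pass (INT_MAX while qlen is above qhimark). */
            kfree(container_of(head, struct my_node, rcu));
    }

    static void my_node_release(struct my_node *node)
    {
            call_rcu(&node->rcu, my_node_free_rcu);
    }
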
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7712912dbc..8154e7589d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -54,15 +54,15 @@ static int verbose; /* Print more debug info. */ | |||
54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ |
55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | 55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ |
56 | 56 | ||
57 | MODULE_PARM(nreaders, "i"); | 57 | module_param(nreaders, int, 0); |
58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
59 | MODULE_PARM(stat_interval, "i"); | 59 | module_param(stat_interval, int, 0); |
60 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 60 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
61 | MODULE_PARM(verbose, "i"); | 61 | module_param(verbose, bool, 0); |
62 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 62 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
63 | MODULE_PARM(test_no_idle_hz, "i"); | 63 | module_param(test_no_idle_hz, bool, 0); |
64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | 64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); |
65 | MODULE_PARM(shuffle_interval, "i"); | 65 | module_param(shuffle_interval, int, 0); |
66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | 66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); |
67 | #define TORTURE_FLAG "rcutorture: " | 67 | #define TORTURE_FLAG "rcutorture: " |
68 | #define PRINTK_STRING(s) \ | 68 | #define PRINTK_STRING(s) \ |
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page) | |||
301 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 301 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
302 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 302 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
303 | 303 | ||
304 | for_each_cpu(cpu) { | 304 | for_each_possible_cpu(cpu) { |
305 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 305 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
306 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | 306 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; |
307 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | 307 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; |
@@ -441,6 +441,16 @@ rcu_torture_shuffle(void *arg) | |||
441 | return 0; | 441 | return 0; |
442 | } | 442 | } |
443 | 443 | ||
444 | static inline void | ||
445 | rcu_torture_print_module_parms(char *tag) | ||
446 | { | ||
447 | printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " | ||
448 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | ||
449 | "shuffle_interval = %d\n", | ||
450 | tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, | ||
451 | shuffle_interval); | ||
452 | } | ||
453 | |||
444 | static void | 454 | static void |
445 | rcu_torture_cleanup(void) | 455 | rcu_torture_cleanup(void) |
446 | { | 456 | { |
@@ -483,9 +493,10 @@ rcu_torture_cleanup(void) | |||
483 | rcu_barrier(); | 493 | rcu_barrier(); |
484 | 494 | ||
485 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 495 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
486 | printk(KERN_ALERT TORTURE_FLAG | 496 | if (atomic_read(&n_rcu_torture_error)) |
487 | "--- End of test: %s\n", | 497 | rcu_torture_print_module_parms("End of test: FAILURE"); |
488 | atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); | 498 | else |
499 | rcu_torture_print_module_parms("End of test: SUCCESS"); | ||
489 | } | 500 | } |
490 | 501 | ||
491 | static int | 502 | static int |
@@ -501,11 +512,7 @@ rcu_torture_init(void) | |||
501 | nrealreaders = nreaders; | 512 | nrealreaders = nreaders; |
502 | else | 513 | else |
503 | nrealreaders = 2 * num_online_cpus(); | 514 | nrealreaders = 2 * num_online_cpus(); |
504 | printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " | 515 | rcu_torture_print_module_parms("Start of test"); |
505 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | ||
506 | "shuffle_interval = %d\n", | ||
507 | nrealreaders, stat_interval, verbose, test_no_idle_hz, | ||
508 | shuffle_interval); | ||
509 | fullstop = 0; | 516 | fullstop = 0; |
510 | 517 | ||
511 | /* Set up the freelist. */ | 518 | /* Set up the freelist. */ |
@@ -528,7 +535,7 @@ rcu_torture_init(void) | |||
528 | atomic_set(&n_rcu_torture_error, 0); | 535 | atomic_set(&n_rcu_torture_error, 0); |
529 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 536 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
530 | atomic_set(&rcu_torture_wcount[i], 0); | 537 | atomic_set(&rcu_torture_wcount[i], 0); |
531 | for_each_cpu(cpu) { | 538 | for_each_possible_cpu(cpu) { |
532 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 539 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
533 | per_cpu(rcu_torture_count, cpu)[i] = 0; | 540 | per_cpu(rcu_torture_count, cpu)[i] = 0; |
534 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | 541 | per_cpu(rcu_torture_batch, cpu)[i] = 0; |
diff --git a/kernel/relay.c b/kernel/relay.c new file mode 100644 index 0000000000..33345e7348 --- /dev/null +++ b/kernel/relay.c | |||
@@ -0,0 +1,1012 @@ | |||
1 | /* | ||
2 | * Public API and common code for kernel->userspace relay file support. | ||
3 | * | ||
4 | * See Documentation/filesystems/relayfs.txt for an overview of relayfs. | ||
5 | * | ||
6 | * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp | ||
7 | * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) | ||
8 | * | ||
9 | * Moved to kernel/relay.c by Paul Mundt, 2006. | ||
10 | * | ||
11 | * This file is released under the GPL. | ||
12 | */ | ||
13 | #include <linux/errno.h> | ||
14 | #include <linux/stddef.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <linux/relay.h> | ||
19 | #include <linux/vmalloc.h> | ||
20 | #include <linux/mm.h> | ||
21 | |||
22 | /* | ||
23 | * close() vm_op implementation for relay file mapping. | ||
24 | */ | ||
25 | static void relay_file_mmap_close(struct vm_area_struct *vma) | ||
26 | { | ||
27 | struct rchan_buf *buf = vma->vm_private_data; | ||
28 | buf->chan->cb->buf_unmapped(buf, vma->vm_file); | ||
29 | } | ||
30 | |||
31 | /* | ||
32 | * nopage() vm_op implementation for relay file mapping. | ||
33 | */ | ||
34 | static struct page *relay_buf_nopage(struct vm_area_struct *vma, | ||
35 | unsigned long address, | ||
36 | int *type) | ||
37 | { | ||
38 | struct page *page; | ||
39 | struct rchan_buf *buf = vma->vm_private_data; | ||
40 | unsigned long offset = address - vma->vm_start; | ||
41 | |||
42 | if (address > vma->vm_end) | ||
43 | return NOPAGE_SIGBUS; /* Disallow mremap */ | ||
44 | if (!buf) | ||
45 | return NOPAGE_OOM; | ||
46 | |||
47 | page = vmalloc_to_page(buf->start + offset); | ||
48 | if (!page) | ||
49 | return NOPAGE_OOM; | ||
50 | get_page(page); | ||
51 | |||
52 | if (type) | ||
53 | *type = VM_FAULT_MINOR; | ||
54 | |||
55 | return page; | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * vm_ops for relay file mappings. | ||
60 | */ | ||
61 | static struct vm_operations_struct relay_file_mmap_ops = { | ||
62 | .nopage = relay_buf_nopage, | ||
63 | .close = relay_file_mmap_close, | ||
64 | }; | ||
65 | |||
66 | /** | ||
67 | * relay_mmap_buf: - mmap channel buffer to process address space | ||
68 | * @buf: relay channel buffer | ||
69 | * @vma: vm_area_struct describing memory to be mapped | ||
70 | * | ||
71 | * Returns 0 if ok, negative on error | ||
72 | * | ||
73 | * Caller should already have grabbed mmap_sem. | ||
74 | */ | ||
75 | int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) | ||
76 | { | ||
77 | unsigned long length = vma->vm_end - vma->vm_start; | ||
78 | struct file *filp = vma->vm_file; | ||
79 | |||
80 | if (!buf) | ||
81 | return -EBADF; | ||
82 | |||
83 | if (length != (unsigned long)buf->chan->alloc_size) | ||
84 | return -EINVAL; | ||
85 | |||
86 | vma->vm_ops = &relay_file_mmap_ops; | ||
87 | vma->vm_private_data = buf; | ||
88 | buf->chan->cb->buf_mapped(buf, filp); | ||
89 | |||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | /** | ||
94 | * relay_alloc_buf - allocate a channel buffer | ||
95 | * @buf: the buffer struct | ||
96 | * @size: total size of the buffer | ||
97 | * | ||
98 | * Returns a pointer to the resulting buffer, NULL if unsuccessful. The | ||
99 | * passed in size will get page aligned, if it isn't already. | ||
100 | */ | ||
101 | static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) | ||
102 | { | ||
103 | void *mem; | ||
104 | unsigned int i, j, n_pages; | ||
105 | |||
106 | *size = PAGE_ALIGN(*size); | ||
107 | n_pages = *size >> PAGE_SHIFT; | ||
108 | |||
109 | buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); | ||
110 | if (!buf->page_array) | ||
111 | return NULL; | ||
112 | |||
113 | for (i = 0; i < n_pages; i++) { | ||
114 | buf->page_array[i] = alloc_page(GFP_KERNEL); | ||
115 | if (unlikely(!buf->page_array[i])) | ||
116 | goto depopulate; | ||
117 | } | ||
118 | mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); | ||
119 | if (!mem) | ||
120 | goto depopulate; | ||
121 | |||
122 | memset(mem, 0, *size); | ||
123 | buf->page_count = n_pages; | ||
124 | return mem; | ||
125 | |||
126 | depopulate: | ||
127 | for (j = 0; j < i; j++) | ||
128 | __free_page(buf->page_array[j]); | ||
129 | kfree(buf->page_array); | ||
130 | return NULL; | ||
131 | } | ||
132 | |||
133 | /** | ||
134 | * relay_create_buf - allocate and initialize a channel buffer | ||
135 | * @chan: the relay channel (supplies the allocation size and | ||
136 | * the number of sub-buffers) | ||
137 | * | ||
138 | * Returns channel buffer if successful, NULL otherwise | ||
139 | */ | ||
140 | struct rchan_buf *relay_create_buf(struct rchan *chan) | ||
141 | { | ||
142 | struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL); | ||
143 | if (!buf) | ||
144 | return NULL; | ||
145 | |||
146 | buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); | ||
147 | if (!buf->padding) | ||
148 | goto free_buf; | ||
149 | |||
150 | buf->start = relay_alloc_buf(buf, &chan->alloc_size); | ||
151 | if (!buf->start) | ||
152 | goto free_buf; | ||
153 | |||
154 | buf->chan = chan; | ||
155 | kref_get(&buf->chan->kref); | ||
156 | return buf; | ||
157 | |||
158 | free_buf: | ||
159 | kfree(buf->padding); | ||
160 | kfree(buf); | ||
161 | return NULL; | ||
162 | } | ||
163 | |||
164 | /** | ||
165 | * relay_destroy_channel - free the channel struct | ||
166 | * | ||
167 | * Should only be called from kref_put(). | ||
168 | */ | ||
169 | void relay_destroy_channel(struct kref *kref) | ||
170 | { | ||
171 | struct rchan *chan = container_of(kref, struct rchan, kref); | ||
172 | kfree(chan); | ||
173 | } | ||
174 | |||
175 | /** | ||
176 | * relay_destroy_buf - destroy an rchan_buf struct and associated buffer | ||
177 | * @buf: the buffer struct | ||
178 | */ | ||
179 | void relay_destroy_buf(struct rchan_buf *buf) | ||
180 | { | ||
181 | struct rchan *chan = buf->chan; | ||
182 | unsigned int i; | ||
183 | |||
184 | if (likely(buf->start)) { | ||
185 | vunmap(buf->start); | ||
186 | for (i = 0; i < buf->page_count; i++) | ||
187 | __free_page(buf->page_array[i]); | ||
188 | kfree(buf->page_array); | ||
189 | } | ||
190 | kfree(buf->padding); | ||
191 | kfree(buf); | ||
192 | kref_put(&chan->kref, relay_destroy_channel); | ||
193 | } | ||
194 | |||
195 | /** | ||
196 | * relay_remove_buf - remove a channel buffer | ||
197 | * | ||
198 | * Removes the file from the filesystem, which also frees the | ||
199 | * rchan_buf struct and the channel buffer. Should only be called from | ||
200 | * kref_put(). | ||
201 | */ | ||
202 | void relay_remove_buf(struct kref *kref) | ||
203 | { | ||
204 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); | ||
205 | buf->chan->cb->remove_buf_file(buf->dentry); | ||
206 | relay_destroy_buf(buf); | ||
207 | } | ||
208 | |||
209 | /** | ||
210 | * relay_buf_empty - boolean, is the channel buffer empty? | ||
211 | * @buf: channel buffer | ||
212 | * | ||
213 | * Returns 1 if the buffer is empty, 0 otherwise. | ||
214 | */ | ||
215 | int relay_buf_empty(struct rchan_buf *buf) | ||
216 | { | ||
217 | return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; | ||
218 | } | ||
219 | EXPORT_SYMBOL_GPL(relay_buf_empty); | ||
220 | |||
221 | /** | ||
222 | * relay_buf_full - boolean, is the channel buffer full? | ||
223 | * @buf: channel buffer | ||
224 | * | ||
225 | * Returns 1 if the buffer is full, 0 otherwise. | ||
226 | */ | ||
227 | int relay_buf_full(struct rchan_buf *buf) | ||
228 | { | ||
229 | size_t ready = buf->subbufs_produced - buf->subbufs_consumed; | ||
230 | return (ready >= buf->chan->n_subbufs) ? 1 : 0; | ||
231 | } | ||
232 | EXPORT_SYMBOL_GPL(relay_buf_full); | ||
233 | |||
234 | /* | ||
235 | * High-level relay kernel API and associated functions. | ||
236 | */ | ||
237 | |||
238 | /* | ||
239 | * rchan_callback implementations defining default channel behavior. Used | ||
240 | * in place of corresponding NULL values in client callback struct. | ||
241 | */ | ||
242 | |||
243 | /* | ||
244 | * subbuf_start() default callback. Allows the sub-buffer switch unless the buffer is full. | ||
245 | */ | ||
246 | static int subbuf_start_default_callback (struct rchan_buf *buf, | ||
247 | void *subbuf, | ||
248 | void *prev_subbuf, | ||
249 | size_t prev_padding) | ||
250 | { | ||
251 | if (relay_buf_full(buf)) | ||
252 | return 0; | ||
253 | |||
254 | return 1; | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * buf_mapped() default callback. Does nothing. | ||
259 | */ | ||
260 | static void buf_mapped_default_callback(struct rchan_buf *buf, | ||
261 | struct file *filp) | ||
262 | { | ||
263 | } | ||
264 | |||
265 | /* | ||
266 | * buf_unmapped() default callback. Does nothing. | ||
267 | */ | ||
268 | static void buf_unmapped_default_callback(struct rchan_buf *buf, | ||
269 | struct file *filp) | ||
270 | { | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * create_buf_file() default callback. Does nothing. | ||
275 | */ | ||
276 | static struct dentry *create_buf_file_default_callback(const char *filename, | ||
277 | struct dentry *parent, | ||
278 | int mode, | ||
279 | struct rchan_buf *buf, | ||
280 | int *is_global) | ||
281 | { | ||
282 | return NULL; | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * remove_buf_file() default callback. Does nothing. | ||
287 | */ | ||
288 | static int remove_buf_file_default_callback(struct dentry *dentry) | ||
289 | { | ||
290 | return -EINVAL; | ||
291 | } | ||
292 | |||
293 | /* relay channel default callbacks */ | ||
294 | static struct rchan_callbacks default_channel_callbacks = { | ||
295 | .subbuf_start = subbuf_start_default_callback, | ||
296 | .buf_mapped = buf_mapped_default_callback, | ||
297 | .buf_unmapped = buf_unmapped_default_callback, | ||
298 | .create_buf_file = create_buf_file_default_callback, | ||
299 | .remove_buf_file = remove_buf_file_default_callback, | ||
300 | }; | ||
301 | |||
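The defaults above only cover hooks a client leaves NULL; in practice a client supplies at least create_buf_file() and remove_buf_file() so the per-cpu buffers appear somewhere readable. A minimal sketch of such a callback table, assuming debugfs as the backing filesystem (my_create_buf_file, my_remove_buf_file and the debugfs choice are illustrative, not part of this file):

#include <linux/relay.h>
#include <linux/debugfs.h>

/* Hypothetical client callbacks; only relay_file_operations and the
 * callback signatures come from this file. */
static struct dentry *my_create_buf_file(const char *filename,
					 struct dentry *parent,
					 int mode,
					 struct rchan_buf *buf,
					 int *is_global)
{
	/* Expose the buffer as a debugfs file served by relay_file_operations. */
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static int my_remove_buf_file(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct rchan_callbacks my_callbacks = {
	.create_buf_file = my_create_buf_file,
	.remove_buf_file = my_remove_buf_file,
	/* NULL entries are filled in with the defaults above. */
};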
302 | /** | ||
303 | * wakeup_readers - wake up readers waiting on a channel | ||
304 | * @private: the channel buffer | ||
305 | * | ||
306 | * This is the work function used to defer reader waking. The | ||
307 | * reason waking is deferred is that calling it directly from the write | ||
308 | * path causes problems if the write is coming from, say, the scheduler. | ||
309 | */ | ||
310 | static void wakeup_readers(void *private) | ||
311 | { | ||
312 | struct rchan_buf *buf = private; | ||
313 | wake_up_interruptible(&buf->read_wait); | ||
314 | } | ||
315 | |||
316 | /** | ||
317 | * __relay_reset - reset a channel buffer | ||
318 | * @buf: the channel buffer | ||
319 | * @init: 1 if this is a first-time initialization | ||
320 | * | ||
321 | * See relay_reset for description of effect. | ||
322 | */ | ||
323 | static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) | ||
324 | { | ||
325 | size_t i; | ||
326 | |||
327 | if (init) { | ||
328 | init_waitqueue_head(&buf->read_wait); | ||
329 | kref_init(&buf->kref); | ||
330 | INIT_WORK(&buf->wake_readers, NULL, NULL); | ||
331 | } else { | ||
332 | cancel_delayed_work(&buf->wake_readers); | ||
333 | flush_scheduled_work(); | ||
334 | } | ||
335 | |||
336 | buf->subbufs_produced = 0; | ||
337 | buf->subbufs_consumed = 0; | ||
338 | buf->bytes_consumed = 0; | ||
339 | buf->finalized = 0; | ||
340 | buf->data = buf->start; | ||
341 | buf->offset = 0; | ||
342 | |||
343 | for (i = 0; i < buf->chan->n_subbufs; i++) | ||
344 | buf->padding[i] = 0; | ||
345 | |||
346 | buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); | ||
347 | } | ||
348 | |||
349 | /** | ||
350 | * relay_reset - reset the channel | ||
351 | * @chan: the channel | ||
352 | * | ||
353 | * This has the effect of erasing all data from all channel buffers | ||
354 | * and restarting the channel in its initial state. The buffers | ||
355 | * are not freed, so any mappings are still in effect. | ||
356 | * | ||
357 | * NOTE: Care should be taken that the channel isn't actually | ||
358 | * being used by anything when this call is made. | ||
359 | */ | ||
360 | void relay_reset(struct rchan *chan) | ||
361 | { | ||
362 | unsigned int i; | ||
363 | struct rchan_buf *prev = NULL; | ||
364 | |||
365 | if (!chan) | ||
366 | return; | ||
367 | |||
368 | for (i = 0; i < NR_CPUS; i++) { | ||
369 | if (!chan->buf[i] || chan->buf[i] == prev) | ||
370 | break; | ||
371 | __relay_reset(chan->buf[i], 0); | ||
372 | prev = chan->buf[i]; | ||
373 | } | ||
374 | } | ||
375 | EXPORT_SYMBOL_GPL(relay_reset); | ||
376 | |||
377 | /** | ||
378 | * relay_open_buf - create a new relay channel buffer | ||
379 | * | ||
380 | * Internal - used by relay_open(). | ||
381 | */ | ||
382 | static struct rchan_buf *relay_open_buf(struct rchan *chan, | ||
383 | const char *filename, | ||
384 | struct dentry *parent, | ||
385 | int *is_global) | ||
386 | { | ||
387 | struct rchan_buf *buf; | ||
388 | struct dentry *dentry; | ||
389 | |||
390 | if (*is_global) | ||
391 | return chan->buf[0]; | ||
392 | |||
393 | buf = relay_create_buf(chan); | ||
394 | if (!buf) | ||
395 | return NULL; | ||
396 | |||
397 | /* Create file in fs */ | ||
398 | dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, | ||
399 | buf, is_global); | ||
400 | if (!dentry) { | ||
401 | relay_destroy_buf(buf); | ||
402 | return NULL; | ||
403 | } | ||
404 | |||
405 | buf->dentry = dentry; | ||
406 | __relay_reset(buf, 1); | ||
407 | |||
408 | return buf; | ||
409 | } | ||
410 | |||
411 | /** | ||
412 | * relay_close_buf - close a channel buffer | ||
413 | * @buf: channel buffer | ||
414 | * | ||
415 | * Marks the buffer finalized and restores the default callbacks. | ||
416 | * The channel buffer and channel buffer data structure are then freed | ||
417 | * automatically when the last reference is given up. | ||
418 | */ | ||
419 | static inline void relay_close_buf(struct rchan_buf *buf) | ||
420 | { | ||
421 | buf->finalized = 1; | ||
422 | cancel_delayed_work(&buf->wake_readers); | ||
423 | flush_scheduled_work(); | ||
424 | kref_put(&buf->kref, relay_remove_buf); | ||
425 | } | ||
426 | |||
427 | static inline void setup_callbacks(struct rchan *chan, | ||
428 | struct rchan_callbacks *cb) | ||
429 | { | ||
430 | if (!cb) { | ||
431 | chan->cb = &default_channel_callbacks; | ||
432 | return; | ||
433 | } | ||
434 | |||
435 | if (!cb->subbuf_start) | ||
436 | cb->subbuf_start = subbuf_start_default_callback; | ||
437 | if (!cb->buf_mapped) | ||
438 | cb->buf_mapped = buf_mapped_default_callback; | ||
439 | if (!cb->buf_unmapped) | ||
440 | cb->buf_unmapped = buf_unmapped_default_callback; | ||
441 | if (!cb->create_buf_file) | ||
442 | cb->create_buf_file = create_buf_file_default_callback; | ||
443 | if (!cb->remove_buf_file) | ||
444 | cb->remove_buf_file = remove_buf_file_default_callback; | ||
445 | chan->cb = cb; | ||
446 | } | ||
447 | |||
448 | /** | ||
449 | * relay_open - create a new relay channel | ||
450 | * @base_filename: base name of files to create | ||
451 | * @parent: dentry of parent directory, NULL for root directory | ||
452 | * @subbuf_size: size of sub-buffers | ||
453 | * @n_subbufs: number of sub-buffers | ||
454 | * @cb: client callback functions | ||
455 | * | ||
456 | * Returns channel pointer if successful, NULL otherwise. | ||
457 | * | ||
458 | * Creates a channel buffer for each cpu using the sizes and | ||
459 | * attributes specified. The created channel buffer files | ||
460 | * will be named base_filename0...base_filenameN-1. File | ||
461 | * permissions will be S_IRUSR. | ||
462 | */ | ||
463 | struct rchan *relay_open(const char *base_filename, | ||
464 | struct dentry *parent, | ||
465 | size_t subbuf_size, | ||
466 | size_t n_subbufs, | ||
467 | struct rchan_callbacks *cb) | ||
468 | { | ||
469 | unsigned int i; | ||
470 | struct rchan *chan; | ||
471 | char *tmpname; | ||
472 | int is_global = 0; | ||
473 | |||
474 | if (!base_filename) | ||
475 | return NULL; | ||
476 | |||
477 | if (!(subbuf_size && n_subbufs)) | ||
478 | return NULL; | ||
479 | |||
480 | chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL); | ||
481 | if (!chan) | ||
482 | return NULL; | ||
483 | |||
484 | chan->version = RELAYFS_CHANNEL_VERSION; | ||
485 | chan->n_subbufs = n_subbufs; | ||
486 | chan->subbuf_size = subbuf_size; | ||
487 | chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); | ||
488 | setup_callbacks(chan, cb); | ||
489 | kref_init(&chan->kref); | ||
490 | |||
491 | tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
492 | if (!tmpname) | ||
493 | goto free_chan; | ||
494 | |||
495 | for_each_online_cpu(i) { | ||
496 | sprintf(tmpname, "%s%d", base_filename, i); | ||
497 | chan->buf[i] = relay_open_buf(chan, tmpname, parent, | ||
498 | &is_global); | ||
499 | if (!chan->buf[i]) | ||
500 | goto free_bufs; | ||
501 | |||
502 | chan->buf[i]->cpu = i; | ||
503 | } | ||
504 | |||
505 | kfree(tmpname); | ||
506 | return chan; | ||
507 | |||
508 | free_bufs: | ||
509 | for (i = 0; i < NR_CPUS; i++) { | ||
510 | if (!chan->buf[i]) | ||
511 | break; | ||
512 | relay_close_buf(chan->buf[i]); | ||
513 | if (is_global) | ||
514 | break; | ||
515 | } | ||
516 | kfree(tmpname); | ||
517 | |||
518 | free_chan: | ||
519 | kref_put(&chan->kref, relay_destroy_channel); | ||
520 | return NULL; | ||
521 | } | ||
522 | EXPORT_SYMBOL_GPL(relay_open); | ||
523 | |||
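A hedged end-to-end sketch of a client bringing a channel up and tearing it down with relay_open()/relay_close(); the directory name, the sub-buffer geometry and the my_callbacks table from the earlier sketch are all illustrative assumptions:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/relay.h>
#include <linux/debugfs.h>

static struct rchan *my_chan;
static struct dentry *my_dir;

static int __init my_trace_init(void)
{
	/* Parent directory for the per-cpu files "cpu0".."cpuN-1". */
	my_dir = debugfs_create_dir("my_trace", NULL);
	if (!my_dir)
		return -ENOMEM;

	/* Per cpu: 8 sub-buffers of 16KB each. */
	my_chan = relay_open("cpu", my_dir, 16 * 1024, 8, &my_callbacks);
	if (!my_chan) {
		debugfs_remove(my_dir);
		return -ENOMEM;
	}
	return 0;
}

static void __exit my_trace_exit(void)
{
	relay_flush(my_chan);	/* push out any partially filled sub-buffer */
	relay_close(my_chan);
	debugfs_remove(my_dir);
}

module_init(my_trace_init);
module_exit(my_trace_exit);
MODULE_LICENSE("GPL");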
524 | /** | ||
525 | * relay_switch_subbuf - switch to a new sub-buffer | ||
526 | * @buf: channel buffer | ||
527 | * @length: size of current event | ||
528 | * | ||
529 | * Returns either the length passed in or 0 if full. | ||
530 | * | ||
531 | * Performs sub-buffer-switch tasks such as invoking callbacks, | ||
532 | * updating padding counts, waking up readers, etc. | ||
533 | */ | ||
534 | size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) | ||
535 | { | ||
536 | void *old, *new; | ||
537 | size_t old_subbuf, new_subbuf; | ||
538 | |||
539 | if (unlikely(length > buf->chan->subbuf_size)) | ||
540 | goto toobig; | ||
541 | |||
542 | if (buf->offset != buf->chan->subbuf_size + 1) { | ||
543 | buf->prev_padding = buf->chan->subbuf_size - buf->offset; | ||
544 | old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; | ||
545 | buf->padding[old_subbuf] = buf->prev_padding; | ||
546 | buf->subbufs_produced++; | ||
547 | buf->dentry->d_inode->i_size += buf->chan->subbuf_size - | ||
548 | buf->padding[old_subbuf]; | ||
549 | smp_mb(); | ||
550 | if (waitqueue_active(&buf->read_wait)) { | ||
551 | PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); | ||
552 | schedule_delayed_work(&buf->wake_readers, 1); | ||
553 | } | ||
554 | } | ||
555 | |||
556 | old = buf->data; | ||
557 | new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; | ||
558 | new = buf->start + new_subbuf * buf->chan->subbuf_size; | ||
559 | buf->offset = 0; | ||
560 | if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { | ||
561 | buf->offset = buf->chan->subbuf_size + 1; | ||
562 | return 0; | ||
563 | } | ||
564 | buf->data = new; | ||
565 | buf->padding[new_subbuf] = 0; | ||
566 | |||
567 | if (unlikely(length + buf->offset > buf->chan->subbuf_size)) | ||
568 | goto toobig; | ||
569 | |||
570 | return length; | ||
571 | |||
572 | toobig: | ||
573 | buf->chan->last_toobig = length; | ||
574 | return 0; | ||
575 | } | ||
576 | EXPORT_SYMBOL_GPL(relay_switch_subbuf); | ||
577 | |||
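relay_switch_subbuf() is normally reached indirectly, through the inline write helpers in include/linux/relay.h; a sketch of a producer using relay_reserve(), where the event layout and my_chan are assumptions carried over from the earlier sketches:

#include <linux/types.h>
#include <linux/relay.h>
#include <linux/sched.h>	/* sched_clock() */

/* Hypothetical event format; nothing in relay.c prescribes one. */
struct my_event {
	u32 type;
	u64 timestamp;
};

static void my_log(u32 type)
{
	struct my_event *ev;

	/*
	 * relay_reserve() hands back space in the current sub-buffer and
	 * lets relay_switch_subbuf() move on to the next one when the event
	 * doesn't fit.  NULL means the switch was refused (e.g. the default
	 * subbuf_start callback saw a full, non-overwriting buffer), so the
	 * event is simply dropped.
	 */
	ev = relay_reserve(my_chan, sizeof(*ev));
	if (!ev)
		return;

	ev->type = type;
	ev->timestamp = sched_clock();
}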
578 | /** | ||
579 | * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count | ||
580 | * @chan: the channel | ||
581 | * @cpu: the cpu associated with the channel buffer to update | ||
582 | * @subbufs_consumed: number of sub-buffers to add to current buf's count | ||
583 | * | ||
584 | * Adds to the channel buffer's consumed sub-buffer count. | ||
585 | * subbufs_consumed should be the number of sub-buffers newly consumed, | ||
586 | * not the total consumed. | ||
587 | * | ||
588 | * NOTE: kernel clients don't need to call this function if the channel | ||
589 | * mode is 'overwrite'. | ||
590 | */ | ||
591 | void relay_subbufs_consumed(struct rchan *chan, | ||
592 | unsigned int cpu, | ||
593 | size_t subbufs_consumed) | ||
594 | { | ||
595 | struct rchan_buf *buf; | ||
596 | |||
597 | if (!chan) | ||
598 | return; | ||
599 | |||
600 | if (cpu >= NR_CPUS || !chan->buf[cpu]) | ||
601 | return; | ||
602 | |||
603 | buf = chan->buf[cpu]; | ||
604 | buf->subbufs_consumed += subbufs_consumed; | ||
605 | if (buf->subbufs_consumed > buf->subbufs_produced) | ||
606 | buf->subbufs_consumed = buf->subbufs_produced; | ||
607 | } | ||
608 | EXPORT_SYMBOL_GPL(relay_subbufs_consumed); | ||
609 | |||
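For no-overwrite channels the call above expects the delta since the last report, not a running total; a small bookkeeping sketch (my_chan and the per-cpu counters are hypothetical):

/* Track what has already been reported so only newly finished
 * sub-buffers are passed to relay_subbufs_consumed(). */
static size_t my_reported[NR_CPUS];

static void my_report_consumed(unsigned int cpu, size_t done_so_far)
{
	size_t newly = done_so_far - my_reported[cpu];

	if (newly) {
		relay_subbufs_consumed(my_chan, cpu, newly);
		my_reported[cpu] = done_so_far;
	}
}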
610 | /** | ||
611 | * relay_close - close the channel | ||
612 | * @chan: the channel | ||
613 | * | ||
614 | * Closes all channel buffers and frees the channel. | ||
615 | */ | ||
616 | void relay_close(struct rchan *chan) | ||
617 | { | ||
618 | unsigned int i; | ||
619 | struct rchan_buf *prev = NULL; | ||
620 | |||
621 | if (!chan) | ||
622 | return; | ||
623 | |||
624 | for (i = 0; i < NR_CPUS; i++) { | ||
625 | if (!chan->buf[i] || chan->buf[i] == prev) | ||
626 | break; | ||
627 | relay_close_buf(chan->buf[i]); | ||
628 | prev = chan->buf[i]; | ||
629 | } | ||
630 | |||
631 | if (chan->last_toobig) | ||
632 | printk(KERN_WARNING "relay: one or more items not logged " | ||
633 | "[item size (%Zd) > sub-buffer size (%Zd)]\n", | ||
634 | chan->last_toobig, chan->subbuf_size); | ||
635 | |||
636 | kref_put(&chan->kref, relay_destroy_channel); | ||
637 | } | ||
638 | EXPORT_SYMBOL_GPL(relay_close); | ||
639 | |||
640 | /** | ||
641 | * relay_flush - flush the channel | ||
642 | * @chan: the channel | ||
643 | * | ||
644 | * Flushes all channel buffers, i.e. forces a sub-buffer switch on each. | ||
645 | */ | ||
646 | void relay_flush(struct rchan *chan) | ||
647 | { | ||
648 | unsigned int i; | ||
649 | struct rchan_buf *prev = NULL; | ||
650 | |||
651 | if (!chan) | ||
652 | return; | ||
653 | |||
654 | for (i = 0; i < NR_CPUS; i++) { | ||
655 | if (!chan->buf[i] || chan->buf[i] == prev) | ||
656 | break; | ||
657 | relay_switch_subbuf(chan->buf[i], 0); | ||
658 | prev = chan->buf[i]; | ||
659 | } | ||
660 | } | ||
661 | EXPORT_SYMBOL_GPL(relay_flush); | ||
662 | |||
663 | /** | ||
664 | * relay_file_open - open file op for relay files | ||
665 | * @inode: the inode | ||
666 | * @filp: the file | ||
667 | * | ||
668 | * Increments the channel buffer refcount. | ||
669 | */ | ||
670 | static int relay_file_open(struct inode *inode, struct file *filp) | ||
671 | { | ||
672 | struct rchan_buf *buf = inode->u.generic_ip; | ||
673 | kref_get(&buf->kref); | ||
674 | filp->private_data = buf; | ||
675 | |||
676 | return 0; | ||
677 | } | ||
678 | |||
679 | /** | ||
680 | * relay_file_mmap - mmap file op for relay files | ||
681 | * @filp: the file | ||
682 | * @vma: the vma describing what to map | ||
683 | * | ||
684 | * Calls upon relay_mmap_buf to map the file into user space. | ||
685 | */ | ||
686 | static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) | ||
687 | { | ||
688 | struct rchan_buf *buf = filp->private_data; | ||
689 | return relay_mmap_buf(buf, vma); | ||
690 | } | ||
691 | |||
692 | /** | ||
693 | * relay_file_poll - poll file op for relay files | ||
694 | * @filp: the file | ||
695 | * @wait: poll table | ||
696 | * | ||
697 | * Poll implementation. | ||
698 | */ | ||
699 | static unsigned int relay_file_poll(struct file *filp, poll_table *wait) | ||
700 | { | ||
701 | unsigned int mask = 0; | ||
702 | struct rchan_buf *buf = filp->private_data; | ||
703 | |||
704 | if (buf->finalized) | ||
705 | return POLLERR; | ||
706 | |||
707 | if (filp->f_mode & FMODE_READ) { | ||
708 | poll_wait(filp, &buf->read_wait, wait); | ||
709 | if (!relay_buf_empty(buf)) | ||
710 | mask |= POLLIN | POLLRDNORM; | ||
711 | } | ||
712 | |||
713 | return mask; | ||
714 | } | ||
715 | |||
716 | /** | ||
717 | * relay_file_release - release file op for relay files | ||
718 | * @inode: the inode | ||
719 | * @filp: the file | ||
720 | * | ||
721 | * Decrements the channel buffer refcount, as the filesystem is | ||
722 | * no longer using it. | ||
723 | */ | ||
724 | static int relay_file_release(struct inode *inode, struct file *filp) | ||
725 | { | ||
726 | struct rchan_buf *buf = filp->private_data; | ||
727 | kref_put(&buf->kref, relay_remove_buf); | ||
728 | |||
729 | return 0; | ||
730 | } | ||
731 | |||
732 | /** | ||
733 | * relay_file_read_consume - update the consumed count for the buffer | ||
734 | */ | ||
735 | static void relay_file_read_consume(struct rchan_buf *buf, | ||
736 | size_t read_pos, | ||
737 | size_t bytes_consumed) | ||
738 | { | ||
739 | size_t subbuf_size = buf->chan->subbuf_size; | ||
740 | size_t n_subbufs = buf->chan->n_subbufs; | ||
741 | size_t read_subbuf; | ||
742 | |||
743 | if (buf->bytes_consumed + bytes_consumed > subbuf_size) { | ||
744 | relay_subbufs_consumed(buf->chan, buf->cpu, 1); | ||
745 | buf->bytes_consumed = 0; | ||
746 | } | ||
747 | |||
748 | buf->bytes_consumed += bytes_consumed; | ||
749 | read_subbuf = read_pos / buf->chan->subbuf_size; | ||
750 | if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { | ||
751 | if ((read_subbuf == buf->subbufs_produced % n_subbufs) && | ||
752 | (buf->offset == subbuf_size)) | ||
753 | return; | ||
754 | relay_subbufs_consumed(buf->chan, buf->cpu, 1); | ||
755 | buf->bytes_consumed = 0; | ||
756 | } | ||
757 | } | ||
758 | |||
759 | /** | ||
760 | * relay_file_read_avail - boolean, are there unconsumed bytes available? | ||
761 | */ | ||
762 | static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) | ||
763 | { | ||
764 | size_t subbuf_size = buf->chan->subbuf_size; | ||
765 | size_t n_subbufs = buf->chan->n_subbufs; | ||
766 | size_t produced = buf->subbufs_produced; | ||
767 | size_t consumed = buf->subbufs_consumed; | ||
768 | |||
769 | relay_file_read_consume(buf, read_pos, 0); | ||
770 | |||
771 | if (unlikely(buf->offset > subbuf_size)) { | ||
772 | if (produced == consumed) | ||
773 | return 0; | ||
774 | return 1; | ||
775 | } | ||
776 | |||
777 | if (unlikely(produced - consumed >= n_subbufs)) { | ||
778 | consumed = (produced / n_subbufs) * n_subbufs; | ||
779 | buf->subbufs_consumed = consumed; | ||
780 | } | ||
781 | |||
782 | produced = (produced % n_subbufs) * subbuf_size + buf->offset; | ||
783 | consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; | ||
784 | |||
785 | if (consumed > produced) | ||
786 | produced += n_subbufs * subbuf_size; | ||
787 | |||
788 | if (consumed == produced) | ||
789 | return 0; | ||
790 | |||
791 | return 1; | ||
792 | } | ||
793 | |||
794 | /** | ||
795 | * relay_file_read_subbuf_avail - return bytes available in sub-buffer | ||
796 | */ | ||
797 | static size_t relay_file_read_subbuf_avail(size_t read_pos, | ||
798 | struct rchan_buf *buf) | ||
799 | { | ||
800 | size_t padding, avail = 0; | ||
801 | size_t read_subbuf, read_offset, write_subbuf, write_offset; | ||
802 | size_t subbuf_size = buf->chan->subbuf_size; | ||
803 | |||
804 | write_subbuf = (buf->data - buf->start) / subbuf_size; | ||
805 | write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset; | ||
806 | read_subbuf = read_pos / subbuf_size; | ||
807 | read_offset = read_pos % subbuf_size; | ||
808 | padding = buf->padding[read_subbuf]; | ||
809 | |||
810 | if (read_subbuf == write_subbuf) { | ||
811 | if (read_offset + padding < write_offset) | ||
812 | avail = write_offset - (read_offset + padding); | ||
813 | } else | ||
814 | avail = (subbuf_size - padding) - read_offset; | ||
815 | |||
816 | return avail; | ||
817 | } | ||
818 | |||
819 | /** | ||
820 | * relay_file_read_start_pos - find the first available byte to read | ||
821 | * | ||
822 | * If the read_pos is in the middle of padding, return the | ||
823 | * position of the first actually available byte, otherwise | ||
824 | * return the original value. | ||
825 | */ | ||
826 | static size_t relay_file_read_start_pos(size_t read_pos, | ||
827 | struct rchan_buf *buf) | ||
828 | { | ||
829 | size_t read_subbuf, padding, padding_start, padding_end; | ||
830 | size_t subbuf_size = buf->chan->subbuf_size; | ||
831 | size_t n_subbufs = buf->chan->n_subbufs; | ||
832 | |||
833 | read_subbuf = read_pos / subbuf_size; | ||
834 | padding = buf->padding[read_subbuf]; | ||
835 | padding_start = (read_subbuf + 1) * subbuf_size - padding; | ||
836 | padding_end = (read_subbuf + 1) * subbuf_size; | ||
837 | if (read_pos >= padding_start && read_pos < padding_end) { | ||
838 | read_subbuf = (read_subbuf + 1) % n_subbufs; | ||
839 | read_pos = read_subbuf * subbuf_size; | ||
840 | } | ||
841 | |||
842 | return read_pos; | ||
843 | } | ||
844 | |||
845 | /** | ||
846 | * relay_file_read_end_pos - return the new read position | ||
847 | */ | ||
848 | static size_t relay_file_read_end_pos(struct rchan_buf *buf, | ||
849 | size_t read_pos, | ||
850 | size_t count) | ||
851 | { | ||
852 | size_t read_subbuf, padding, end_pos; | ||
853 | size_t subbuf_size = buf->chan->subbuf_size; | ||
854 | size_t n_subbufs = buf->chan->n_subbufs; | ||
855 | |||
856 | read_subbuf = read_pos / subbuf_size; | ||
857 | padding = buf->padding[read_subbuf]; | ||
858 | if (read_pos % subbuf_size + count + padding == subbuf_size) | ||
859 | end_pos = (read_subbuf + 1) * subbuf_size; | ||
860 | else | ||
861 | end_pos = read_pos + count; | ||
862 | if (end_pos >= subbuf_size * n_subbufs) | ||
863 | end_pos = 0; | ||
864 | |||
865 | return end_pos; | ||
866 | } | ||
867 | |||
868 | /** | ||
869 | * subbuf_read_actor - read up to one subbuf's worth of data | ||
870 | */ | ||
871 | static int subbuf_read_actor(size_t read_start, | ||
872 | struct rchan_buf *buf, | ||
873 | size_t avail, | ||
874 | read_descriptor_t *desc, | ||
875 | read_actor_t actor) | ||
876 | { | ||
877 | void *from; | ||
878 | int ret = 0; | ||
879 | |||
880 | from = buf->start + read_start; | ||
881 | ret = avail; | ||
882 | if (copy_to_user(desc->arg.data, from, avail)) { | ||
883 | desc->error = -EFAULT; | ||
884 | ret = 0; | ||
885 | } | ||
886 | desc->arg.data += ret; | ||
887 | desc->written += ret; | ||
888 | desc->count -= ret; | ||
889 | |||
890 | return ret; | ||
891 | } | ||
892 | |||
893 | /** | ||
894 | * subbuf_send_actor - send up to one subbuf's worth of data | ||
895 | */ | ||
896 | static int subbuf_send_actor(size_t read_start, | ||
897 | struct rchan_buf *buf, | ||
898 | size_t avail, | ||
899 | read_descriptor_t *desc, | ||
900 | read_actor_t actor) | ||
901 | { | ||
902 | unsigned long pidx, poff; | ||
903 | unsigned int subbuf_pages; | ||
904 | int ret = 0; | ||
905 | |||
906 | subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT; | ||
907 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; | ||
908 | poff = read_start & ~PAGE_MASK; | ||
909 | while (avail) { | ||
910 | struct page *p = buf->page_array[pidx]; | ||
911 | unsigned int len; | ||
912 | |||
913 | len = PAGE_SIZE - poff; | ||
914 | if (len > avail) | ||
915 | len = avail; | ||
916 | |||
917 | len = actor(desc, p, poff, len); | ||
918 | if (desc->error) | ||
919 | break; | ||
920 | |||
921 | avail -= len; | ||
922 | ret += len; | ||
923 | poff = 0; | ||
924 | pidx = (pidx + 1) % subbuf_pages; | ||
925 | } | ||
926 | |||
927 | return ret; | ||
928 | } | ||
929 | |||
930 | typedef int (*subbuf_actor_t) (size_t read_start, | ||
931 | struct rchan_buf *buf, | ||
932 | size_t avail, | ||
933 | read_descriptor_t *desc, | ||
934 | read_actor_t actor); | ||
935 | |||
936 | /** | ||
937 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries | ||
938 | */ | ||
939 | static inline ssize_t relay_file_read_subbufs(struct file *filp, | ||
940 | loff_t *ppos, | ||
941 | size_t count, | ||
942 | subbuf_actor_t subbuf_actor, | ||
943 | read_actor_t actor, | ||
944 | void *target) | ||
945 | { | ||
946 | struct rchan_buf *buf = filp->private_data; | ||
947 | size_t read_start, avail; | ||
948 | read_descriptor_t desc; | ||
949 | int ret; | ||
950 | |||
951 | if (!count) | ||
952 | return 0; | ||
953 | |||
954 | desc.written = 0; | ||
955 | desc.count = count; | ||
956 | desc.arg.data = target; | ||
957 | desc.error = 0; | ||
958 | |||
959 | mutex_lock(&filp->f_dentry->d_inode->i_mutex); | ||
960 | do { | ||
961 | if (!relay_file_read_avail(buf, *ppos)) | ||
962 | break; | ||
963 | |||
964 | read_start = relay_file_read_start_pos(*ppos, buf); | ||
965 | avail = relay_file_read_subbuf_avail(read_start, buf); | ||
966 | if (!avail) | ||
967 | break; | ||
968 | |||
969 | avail = min(desc.count, avail); | ||
970 | ret = subbuf_actor(read_start, buf, avail, &desc, actor); | ||
971 | if (desc.error < 0) | ||
972 | break; | ||
973 | |||
974 | if (ret) { | ||
975 | relay_file_read_consume(buf, read_start, ret); | ||
976 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | ||
977 | } | ||
978 | } while (desc.count && ret); | ||
979 | mutex_unlock(&filp->f_dentry->d_inode->i_mutex); | ||
980 | |||
981 | return desc.written; | ||
982 | } | ||
983 | |||
984 | static ssize_t relay_file_read(struct file *filp, | ||
985 | char __user *buffer, | ||
986 | size_t count, | ||
987 | loff_t *ppos) | ||
988 | { | ||
989 | return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor, | ||
990 | NULL, buffer); | ||
991 | } | ||
992 | |||
993 | static ssize_t relay_file_sendfile(struct file *filp, | ||
994 | loff_t *ppos, | ||
995 | size_t count, | ||
996 | read_actor_t actor, | ||
997 | void *target) | ||
998 | { | ||
999 | return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor, | ||
1000 | actor, target); | ||
1001 | } | ||
1002 | |||
1003 | struct file_operations relay_file_operations = { | ||
1004 | .open = relay_file_open, | ||
1005 | .poll = relay_file_poll, | ||
1006 | .mmap = relay_file_mmap, | ||
1007 | .read = relay_file_read, | ||
1008 | .llseek = no_llseek, | ||
1009 | .release = relay_file_release, | ||
1010 | .sendfile = relay_file_sendfile, | ||
1011 | }; | ||
1012 | EXPORT_SYMBOL_GPL(relay_file_operations); | ||
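The file operations above are all a user-space consumer needs for the simple read(2)-based protocol: poll() blocks until a sub-buffer completes and read() consumes data as it goes. A sketch of such a reader, where the debugfs path is an assumption following the directory layout used in the earlier sketches:

#include <stdio.h>
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

int main(void)
{
	char buf[16 * 1024];
	struct pollfd pfd;
	ssize_t n;

	/* Path depends on where the client's create_buf_file() put the file. */
	pfd.fd = open("/sys/kernel/debug/my_trace/cpu0", O_RDONLY);
	if (pfd.fd < 0)
		return 1;
	pfd.events = POLLIN;

	while (poll(&pfd, 1, -1) > 0) {
		n = read(pfd.fd, buf, sizeof(buf));
		if (n < 0)
			break;
		if (n > 0)
			fwrite(buf, 1, n, stdout);	/* hand off to post-processing */
	}

	close(pfd.fd);
	return 0;
}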
diff --git a/kernel/sched.c b/kernel/sched.c index bc38804e40..c13f1bd2df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/syscalls.h> | 49 | #include <linux/syscalls.h> |
50 | #include <linux/times.h> | 50 | #include <linux/times.h> |
51 | #include <linux/acct.h> | 51 | #include <linux/acct.h> |
52 | #include <linux/kprobes.h> | ||
52 | #include <asm/tlb.h> | 53 | #include <asm/tlb.h> |
53 | 54 | ||
54 | #include <asm/unistd.h> | 55 | #include <asm/unistd.h> |
@@ -144,7 +145,8 @@ | |||
144 | (v1) * (v2_max) / (v1_max) | 145 | (v1) * (v2_max) / (v1_max) |
145 | 146 | ||
146 | #define DELTA(p) \ | 147 | #define DELTA(p) \ |
147 | (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) | 148 | (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ |
149 | INTERACTIVE_DELTA) | ||
148 | 150 | ||
149 | #define TASK_INTERACTIVE(p) \ | 151 | #define TASK_INTERACTIVE(p) \ |
150 | ((p)->prio <= (p)->static_prio - DELTA(p)) | 152 | ((p)->prio <= (p)->static_prio - DELTA(p)) |
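A quick worked check of the rewritten DELTA(), assuming the usual MAX_BONUS of 10 and INTERACTIVE_DELTA of 2 (neither define is visible in this hunk):

    nice   0:  old  0*10/40 + 2                    =  2
               new  (0+20)*10/40 - 20*10/40 + 2    =  5 - 5 + 2 =  2
    nice -15:  old  -15*10/40 + 2                  = -3 + 2 = -1   (C division truncates toward zero)
               new  (-15+20)*10/40 - 20*10/40 + 2  =  1 - 5 + 2 = -2

Shifting TASK_NICE(p) by +20 keeps SCALE()'s dividend non-negative, so negative nice levels are now effectively rounded down rather than toward zero.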
@@ -178,13 +180,6 @@ static unsigned int task_timeslice(task_t *p) | |||
178 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | 180 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
179 | < (long long) (sd)->cache_hot_time) | 181 | < (long long) (sd)->cache_hot_time) |
180 | 182 | ||
181 | void __put_task_struct_cb(struct rcu_head *rhp) | ||
182 | { | ||
183 | __put_task_struct(container_of(rhp, struct task_struct, rcu)); | ||
184 | } | ||
185 | |||
186 | EXPORT_SYMBOL_GPL(__put_task_struct_cb); | ||
187 | |||
188 | /* | 183 | /* |
189 | * These are the runqueue data structures: | 184 | * These are the runqueue data structures: |
190 | */ | 185 | */ |
@@ -215,7 +210,6 @@ struct runqueue { | |||
215 | */ | 210 | */ |
216 | unsigned long nr_running; | 211 | unsigned long nr_running; |
217 | #ifdef CONFIG_SMP | 212 | #ifdef CONFIG_SMP |
218 | unsigned long prio_bias; | ||
219 | unsigned long cpu_load[3]; | 213 | unsigned long cpu_load[3]; |
220 | #endif | 214 | #endif |
221 | unsigned long long nr_switches; | 215 | unsigned long long nr_switches; |
@@ -245,6 +239,7 @@ struct runqueue { | |||
245 | 239 | ||
246 | task_t *migration_thread; | 240 | task_t *migration_thread; |
247 | struct list_head migration_queue; | 241 | struct list_head migration_queue; |
242 | int cpu; | ||
248 | #endif | 243 | #endif |
249 | 244 | ||
250 | #ifdef CONFIG_SCHEDSTATS | 245 | #ifdef CONFIG_SCHEDSTATS |
@@ -669,68 +664,17 @@ static int effective_prio(task_t *p) | |||
669 | return prio; | 664 | return prio; |
670 | } | 665 | } |
671 | 666 | ||
672 | #ifdef CONFIG_SMP | ||
673 | static inline void inc_prio_bias(runqueue_t *rq, int prio) | ||
674 | { | ||
675 | rq->prio_bias += MAX_PRIO - prio; | ||
676 | } | ||
677 | |||
678 | static inline void dec_prio_bias(runqueue_t *rq, int prio) | ||
679 | { | ||
680 | rq->prio_bias -= MAX_PRIO - prio; | ||
681 | } | ||
682 | |||
683 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
684 | { | ||
685 | rq->nr_running++; | ||
686 | if (rt_task(p)) { | ||
687 | if (p != rq->migration_thread) | ||
688 | /* | ||
689 | * The migration thread does the actual balancing. Do | ||
690 | * not bias by its priority as the ultra high priority | ||
691 | * will skew balancing adversely. | ||
692 | */ | ||
693 | inc_prio_bias(rq, p->prio); | ||
694 | } else | ||
695 | inc_prio_bias(rq, p->static_prio); | ||
696 | } | ||
697 | |||
698 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
699 | { | ||
700 | rq->nr_running--; | ||
701 | if (rt_task(p)) { | ||
702 | if (p != rq->migration_thread) | ||
703 | dec_prio_bias(rq, p->prio); | ||
704 | } else | ||
705 | dec_prio_bias(rq, p->static_prio); | ||
706 | } | ||
707 | #else | ||
708 | static inline void inc_prio_bias(runqueue_t *rq, int prio) | ||
709 | { | ||
710 | } | ||
711 | |||
712 | static inline void dec_prio_bias(runqueue_t *rq, int prio) | ||
713 | { | ||
714 | } | ||
715 | |||
716 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
717 | { | ||
718 | rq->nr_running++; | ||
719 | } | ||
720 | |||
721 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
722 | { | ||
723 | rq->nr_running--; | ||
724 | } | ||
725 | #endif | ||
726 | |||
727 | /* | 667 | /* |
728 | * __activate_task - move a task to the runqueue. | 668 | * __activate_task - move a task to the runqueue. |
729 | */ | 669 | */ |
730 | static inline void __activate_task(task_t *p, runqueue_t *rq) | 670 | static void __activate_task(task_t *p, runqueue_t *rq) |
731 | { | 671 | { |
732 | enqueue_task(p, rq->active); | 672 | prio_array_t *target = rq->active; |
733 | inc_nr_running(p, rq); | 673 | |
674 | if (batch_task(p)) | ||
675 | target = rq->expired; | ||
676 | enqueue_task(p, target); | ||
677 | rq->nr_running++; | ||
734 | } | 678 | } |
735 | 679 | ||
736 | /* | 680 | /* |
@@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) | |||
739 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 683 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
740 | { | 684 | { |
741 | enqueue_task_head(p, rq->active); | 685 | enqueue_task_head(p, rq->active); |
742 | inc_nr_running(p, rq); | 686 | rq->nr_running++; |
743 | } | 687 | } |
744 | 688 | ||
745 | static int recalc_task_prio(task_t *p, unsigned long long now) | 689 | static int recalc_task_prio(task_t *p, unsigned long long now) |
@@ -748,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
748 | unsigned long long __sleep_time = now - p->timestamp; | 692 | unsigned long long __sleep_time = now - p->timestamp; |
749 | unsigned long sleep_time; | 693 | unsigned long sleep_time; |
750 | 694 | ||
751 | if (unlikely(p->policy == SCHED_BATCH)) | 695 | if (batch_task(p)) |
752 | sleep_time = 0; | 696 | sleep_time = 0; |
753 | else { | 697 | else { |
754 | if (__sleep_time > NS_MAX_SLEEP_AVG) | 698 | if (__sleep_time > NS_MAX_SLEEP_AVG) |
@@ -760,27 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
760 | if (likely(sleep_time > 0)) { | 704 | if (likely(sleep_time > 0)) { |
761 | /* | 705 | /* |
762 | * User tasks that sleep a long time are categorised as | 706 | * User tasks that sleep a long time are categorised as |
763 | * idle and will get just interactive status to stay active & | 707 | * idle. They will only have their sleep_avg increased to a |
764 | * prevent them suddenly becoming cpu hogs and starving | 708 | * level that makes them just interactive priority to stay |
765 | * other processes. | 709 | * active yet prevent them suddenly becoming cpu hogs and |
710 | * starving other processes. | ||
766 | */ | 711 | */ |
767 | if (p->mm && p->activated != -1 && | 712 | if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { |
768 | sleep_time > INTERACTIVE_SLEEP(p)) { | 713 | unsigned long ceiling; |
769 | p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - | ||
770 | DEF_TIMESLICE); | ||
771 | } else { | ||
772 | /* | ||
773 | * The lower the sleep avg a task has the more | ||
774 | * rapidly it will rise with sleep time. | ||
775 | */ | ||
776 | sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; | ||
777 | 714 | ||
715 | ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | ||
716 | DEF_TIMESLICE); | ||
717 | if (p->sleep_avg < ceiling) | ||
718 | p->sleep_avg = ceiling; | ||
719 | } else { | ||
778 | /* | 720 | /* |
779 | * Tasks waking from uninterruptible sleep are | 721 | * Tasks waking from uninterruptible sleep are |
780 | * limited in their sleep_avg rise as they | 722 | * limited in their sleep_avg rise as they |
781 | * are likely to be waiting on I/O | 723 | * are likely to be waiting on I/O |
782 | */ | 724 | */ |
783 | if (p->activated == -1 && p->mm) { | 725 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
784 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 726 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) |
785 | sleep_time = 0; | 727 | sleep_time = 0; |
786 | else if (p->sleep_avg + sleep_time >= | 728 | else if (p->sleep_avg + sleep_time >= |
@@ -835,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
835 | * This checks to make sure it's not an uninterruptible task | 777 | * This checks to make sure it's not an uninterruptible task |
836 | * that is now waking up. | 778 | * that is now waking up. |
837 | */ | 779 | */ |
838 | if (!p->activated) { | 780 | if (p->sleep_type == SLEEP_NORMAL) { |
839 | /* | 781 | /* |
840 | * Tasks which were woken up by interrupts (ie. hw events) | 782 | * Tasks which were woken up by interrupts (ie. hw events) |
841 | * are most likely of interactive nature. So we give them | 783 | * are most likely of interactive nature. So we give them |
@@ -844,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
844 | * on a CPU, first time around: | 786 | * on a CPU, first time around: |
845 | */ | 787 | */ |
846 | if (in_interrupt()) | 788 | if (in_interrupt()) |
847 | p->activated = 2; | 789 | p->sleep_type = SLEEP_INTERRUPTED; |
848 | else { | 790 | else { |
849 | /* | 791 | /* |
850 | * Normal first-time wakeups get a credit too for | 792 | * Normal first-time wakeups get a credit too for |
851 | * on-runqueue time, but it will be weighted down: | 793 | * on-runqueue time, but it will be weighted down: |
852 | */ | 794 | */ |
853 | p->activated = 1; | 795 | p->sleep_type = SLEEP_INTERACTIVE; |
854 | } | 796 | } |
855 | } | 797 | } |
856 | p->timestamp = now; | 798 | p->timestamp = now; |
@@ -863,7 +805,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
863 | */ | 805 | */ |
864 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 806 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
865 | { | 807 | { |
866 | dec_nr_running(p, rq); | 808 | rq->nr_running--; |
867 | dequeue_task(p, p->array); | 809 | dequeue_task(p, p->array); |
868 | p->array = NULL; | 810 | p->array = NULL; |
869 | } | 811 | } |
@@ -1007,61 +949,27 @@ void kick_process(task_t *p) | |||
1007 | * We want to under-estimate the load of migration sources, to | 949 | * We want to under-estimate the load of migration sources, to |
1008 | * balance conservatively. | 950 | * balance conservatively. |
1009 | */ | 951 | */ |
1010 | static unsigned long __source_load(int cpu, int type, enum idle_type idle) | 952 | static inline unsigned long source_load(int cpu, int type) |
1011 | { | 953 | { |
1012 | runqueue_t *rq = cpu_rq(cpu); | 954 | runqueue_t *rq = cpu_rq(cpu); |
1013 | unsigned long running = rq->nr_running; | 955 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
1014 | unsigned long source_load, cpu_load = rq->cpu_load[type-1], | ||
1015 | load_now = running * SCHED_LOAD_SCALE; | ||
1016 | |||
1017 | if (type == 0) | 956 | if (type == 0) |
1018 | source_load = load_now; | 957 | return load_now; |
1019 | else | ||
1020 | source_load = min(cpu_load, load_now); | ||
1021 | 958 | ||
1022 | if (running > 1 || (idle == NOT_IDLE && running)) | 959 | return min(rq->cpu_load[type-1], load_now); |
1023 | /* | ||
1024 | * If we are busy rebalancing the load is biased by | ||
1025 | * priority to create 'nice' support across cpus. When | ||
1026 | * idle rebalancing we should only bias the source_load if | ||
1027 | * there is more than one task running on that queue to | ||
1028 | * prevent idle rebalance from trying to pull tasks from a | ||
1029 | * queue with only one running task. | ||
1030 | */ | ||
1031 | source_load = source_load * rq->prio_bias / running; | ||
1032 | |||
1033 | return source_load; | ||
1034 | } | ||
1035 | |||
1036 | static inline unsigned long source_load(int cpu, int type) | ||
1037 | { | ||
1038 | return __source_load(cpu, type, NOT_IDLE); | ||
1039 | } | 960 | } |
1040 | 961 | ||
1041 | /* | 962 | /* |
1042 | * Return a high guess at the load of a migration-target cpu | 963 | * Return a high guess at the load of a migration-target cpu |
1043 | */ | 964 | */ |
1044 | static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) | 965 | static inline unsigned long target_load(int cpu, int type) |
1045 | { | 966 | { |
1046 | runqueue_t *rq = cpu_rq(cpu); | 967 | runqueue_t *rq = cpu_rq(cpu); |
1047 | unsigned long running = rq->nr_running; | 968 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
1048 | unsigned long target_load, cpu_load = rq->cpu_load[type-1], | ||
1049 | load_now = running * SCHED_LOAD_SCALE; | ||
1050 | |||
1051 | if (type == 0) | 969 | if (type == 0) |
1052 | target_load = load_now; | 970 | return load_now; |
1053 | else | ||
1054 | target_load = max(cpu_load, load_now); | ||
1055 | 971 | ||
1056 | if (running > 1 || (idle == NOT_IDLE && running)) | 972 | return max(rq->cpu_load[type-1], load_now); |
1057 | target_load = target_load * rq->prio_bias / running; | ||
1058 | |||
1059 | return target_load; | ||
1060 | } | ||
1061 | |||
1062 | static inline unsigned long target_load(int cpu, int type) | ||
1063 | { | ||
1064 | return __target_load(cpu, type, NOT_IDLE); | ||
1065 | } | 973 | } |
1066 | 974 | ||
1067 | /* | 975 | /* |
@@ -1294,9 +1202,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
1294 | } | 1202 | } |
1295 | } | 1203 | } |
1296 | 1204 | ||
1297 | if (p->last_waker_cpu != this_cpu) | ||
1298 | goto out_set_cpu; | ||
1299 | |||
1300 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1205 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1301 | goto out_set_cpu; | 1206 | goto out_set_cpu; |
1302 | 1207 | ||
@@ -1367,8 +1272,6 @@ out_set_cpu: | |||
1367 | cpu = task_cpu(p); | 1272 | cpu = task_cpu(p); |
1368 | } | 1273 | } |
1369 | 1274 | ||
1370 | p->last_waker_cpu = this_cpu; | ||
1371 | |||
1372 | out_activate: | 1275 | out_activate: |
1373 | #endif /* CONFIG_SMP */ | 1276 | #endif /* CONFIG_SMP */ |
1374 | if (old_state == TASK_UNINTERRUPTIBLE) { | 1277 | if (old_state == TASK_UNINTERRUPTIBLE) { |
@@ -1377,19 +1280,19 @@ out_activate: | |||
1377 | * Tasks on involuntary sleep don't earn | 1280 | * Tasks on involuntary sleep don't earn |
1378 | * sleep_avg beyond just interactive state. | 1281 | * sleep_avg beyond just interactive state. |
1379 | */ | 1282 | */ |
1380 | p->activated = -1; | 1283 | p->sleep_type = SLEEP_NONINTERACTIVE; |
1381 | } | 1284 | } else |
1382 | 1285 | ||
1383 | /* | 1286 | /* |
1384 | * Tasks that have marked their sleep as noninteractive get | 1287 | * Tasks that have marked their sleep as noninteractive get |
1385 | * woken up without updating their sleep average. (i.e. their | 1288 | * woken up with their sleep average not weighted in an |
1386 | * sleep is handled in a priority-neutral manner, no priority | 1289 | * interactive way. |
1387 | * boost and no penalty.) | ||
1388 | */ | 1290 | */ |
1389 | if (old_state & TASK_NONINTERACTIVE) | 1291 | if (old_state & TASK_NONINTERACTIVE) |
1390 | __activate_task(p, rq); | 1292 | p->sleep_type = SLEEP_NONINTERACTIVE; |
1391 | else | 1293 | |
1392 | activate_task(p, rq, cpu == this_cpu); | 1294 | |
1295 | activate_task(p, rq, cpu == this_cpu); | ||
1393 | /* | 1296 | /* |
1394 | * Sync wakeups (i.e. those types of wakeups where the waker | 1297 | * Sync wakeups (i.e. those types of wakeups where the waker |
1395 | * has indicated that it will leave the CPU in short order) | 1298 | * has indicated that it will leave the CPU in short order) |
@@ -1450,12 +1353,9 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1450 | #ifdef CONFIG_SCHEDSTATS | 1353 | #ifdef CONFIG_SCHEDSTATS |
1451 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1354 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1452 | #endif | 1355 | #endif |
1453 | #if defined(CONFIG_SMP) | 1356 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
1454 | p->last_waker_cpu = cpu; | ||
1455 | #if defined(__ARCH_WANT_UNLOCKED_CTXSW) | ||
1456 | p->oncpu = 0; | 1357 | p->oncpu = 0; |
1457 | #endif | 1358 | #endif |
1458 | #endif | ||
1459 | #ifdef CONFIG_PREEMPT | 1359 | #ifdef CONFIG_PREEMPT |
1460 | /* Want to start with kernel preemption disabled. */ | 1360 | /* Want to start with kernel preemption disabled. */ |
1461 | task_thread_info(p)->preempt_count = 1; | 1361 | task_thread_info(p)->preempt_count = 1; |
@@ -1530,7 +1430,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1530 | list_add_tail(&p->run_list, ¤t->run_list); | 1430 | list_add_tail(&p->run_list, ¤t->run_list); |
1531 | p->array = current->array; | 1431 | p->array = current->array; |
1532 | p->array->nr_active++; | 1432 | p->array->nr_active++; |
1533 | inc_nr_running(p, rq); | 1433 | rq->nr_running++; |
1534 | } | 1434 | } |
1535 | set_need_resched(); | 1435 | set_need_resched(); |
1536 | } else | 1436 | } else |
@@ -1656,8 +1556,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | |||
1656 | finish_lock_switch(rq, prev); | 1556 | finish_lock_switch(rq, prev); |
1657 | if (mm) | 1557 | if (mm) |
1658 | mmdrop(mm); | 1558 | mmdrop(mm); |
1659 | if (unlikely(prev_task_flags & PF_DEAD)) | 1559 | if (unlikely(prev_task_flags & PF_DEAD)) { |
1560 | /* | ||
1561 | * Remove function-return probe instances associated with this | ||
1562 | * task and put them back on the free list. | ||
1563 | */ | ||
1564 | kprobe_flush_task(prev); | ||
1660 | put_task_struct(prev); | 1565 | put_task_struct(prev); |
1566 | } | ||
1661 | } | 1567 | } |
1662 | 1568 | ||
1663 | /** | 1569 | /** |
@@ -1727,7 +1633,7 @@ unsigned long nr_uninterruptible(void) | |||
1727 | { | 1633 | { |
1728 | unsigned long i, sum = 0; | 1634 | unsigned long i, sum = 0; |
1729 | 1635 | ||
1730 | for_each_cpu(i) | 1636 | for_each_possible_cpu(i) |
1731 | sum += cpu_rq(i)->nr_uninterruptible; | 1637 | sum += cpu_rq(i)->nr_uninterruptible; |
1732 | 1638 | ||
1733 | /* | 1639 | /* |
@@ -1744,7 +1650,7 @@ unsigned long long nr_context_switches(void) | |||
1744 | { | 1650 | { |
1745 | unsigned long long i, sum = 0; | 1651 | unsigned long long i, sum = 0; |
1746 | 1652 | ||
1747 | for_each_cpu(i) | 1653 | for_each_possible_cpu(i) |
1748 | sum += cpu_rq(i)->nr_switches; | 1654 | sum += cpu_rq(i)->nr_switches; |
1749 | 1655 | ||
1750 | return sum; | 1656 | return sum; |
@@ -1754,17 +1660,35 @@ unsigned long nr_iowait(void) | |||
1754 | { | 1660 | { |
1755 | unsigned long i, sum = 0; | 1661 | unsigned long i, sum = 0; |
1756 | 1662 | ||
1757 | for_each_cpu(i) | 1663 | for_each_possible_cpu(i) |
1758 | sum += atomic_read(&cpu_rq(i)->nr_iowait); | 1664 | sum += atomic_read(&cpu_rq(i)->nr_iowait); |
1759 | 1665 | ||
1760 | return sum; | 1666 | return sum; |
1761 | } | 1667 | } |
1762 | 1668 | ||
1669 | unsigned long nr_active(void) | ||
1670 | { | ||
1671 | unsigned long i, running = 0, uninterruptible = 0; | ||
1672 | |||
1673 | for_each_online_cpu(i) { | ||
1674 | running += cpu_rq(i)->nr_running; | ||
1675 | uninterruptible += cpu_rq(i)->nr_uninterruptible; | ||
1676 | } | ||
1677 | |||
1678 | if (unlikely((long)uninterruptible < 0)) | ||
1679 | uninterruptible = 0; | ||
1680 | |||
1681 | return running + uninterruptible; | ||
1682 | } | ||
1683 | |||
1763 | #ifdef CONFIG_SMP | 1684 | #ifdef CONFIG_SMP |
1764 | 1685 | ||
1765 | /* | 1686 | /* |
1766 | * double_rq_lock - safely lock two runqueues | 1687 | * double_rq_lock - safely lock two runqueues |
1767 | * | 1688 | * |
1689 | * We must take them in cpu order to match code in | ||
1690 | * dependent_sleeper and wake_dependent_sleeper. | ||
1691 | * | ||
1768 | * Note this does not disable interrupts like task_rq_lock, | 1692 | * Note this does not disable interrupts like task_rq_lock, |
1769 | * you need to do so manually before calling. | 1693 | * you need to do so manually before calling. |
1770 | */ | 1694 | */ |
@@ -1776,7 +1700,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
1776 | spin_lock(&rq1->lock); | 1700 | spin_lock(&rq1->lock); |
1777 | __acquire(rq2->lock); /* Fake it out ;) */ | 1701 | __acquire(rq2->lock); /* Fake it out ;) */ |
1778 | } else { | 1702 | } else { |
1779 | if (rq1 < rq2) { | 1703 | if (rq1->cpu < rq2->cpu) { |
1780 | spin_lock(&rq1->lock); | 1704 | spin_lock(&rq1->lock); |
1781 | spin_lock(&rq2->lock); | 1705 | spin_lock(&rq2->lock); |
1782 | } else { | 1706 | } else { |
@@ -1812,7 +1736,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
1812 | __acquires(this_rq->lock) | 1736 | __acquires(this_rq->lock) |
1813 | { | 1737 | { |
1814 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1738 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1815 | if (busiest < this_rq) { | 1739 | if (busiest->cpu < this_rq->cpu) { |
1816 | spin_unlock(&this_rq->lock); | 1740 | spin_unlock(&this_rq->lock); |
1817 | spin_lock(&busiest->lock); | 1741 | spin_lock(&busiest->lock); |
1818 | spin_lock(&this_rq->lock); | 1742 | spin_lock(&this_rq->lock); |
@@ -1875,9 +1799,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1875 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 1799 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1876 | { | 1800 | { |
1877 | dequeue_task(p, src_array); | 1801 | dequeue_task(p, src_array); |
1878 | dec_nr_running(p, src_rq); | 1802 | src_rq->nr_running--; |
1879 | set_task_cpu(p, this_cpu); | 1803 | set_task_cpu(p, this_cpu); |
1880 | inc_nr_running(p, this_rq); | 1804 | this_rq->nr_running++; |
1881 | enqueue_task(p, this_array); | 1805 | enqueue_task(p, this_array); |
1882 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 1806 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1883 | + this_rq->timestamp_last_tick; | 1807 | + this_rq->timestamp_last_tick; |
@@ -2056,9 +1980,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2056 | 1980 | ||
2057 | /* Bias balancing toward cpus of our domain */ | 1981 | /* Bias balancing toward cpus of our domain */ |
2058 | if (local_group) | 1982 | if (local_group) |
2059 | load = __target_load(i, load_idx, idle); | 1983 | load = target_load(i, load_idx); |
2060 | else | 1984 | else |
2061 | load = __source_load(i, load_idx, idle); | 1985 | load = source_load(i, load_idx); |
2062 | 1986 | ||
2063 | avg_load += load; | 1987 | avg_load += load; |
2064 | } | 1988 | } |
@@ -2171,7 +2095,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, | |||
2171 | int i; | 2095 | int i; |
2172 | 2096 | ||
2173 | for_each_cpu_mask(i, group->cpumask) { | 2097 | for_each_cpu_mask(i, group->cpumask) { |
2174 | load = __source_load(i, 0, idle); | 2098 | load = source_load(i, 0); |
2175 | 2099 | ||
2176 | if (load > max_load) { | 2100 | if (load > max_load) { |
2177 | max_load = load; | 2101 | max_load = load; |
@@ -2959,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count); | |||
2959 | 2883 | ||
2960 | #endif | 2884 | #endif |
2961 | 2885 | ||
2886 | static inline int interactive_sleep(enum sleep_type sleep_type) | ||
2887 | { | ||
2888 | return (sleep_type == SLEEP_INTERACTIVE || | ||
2889 | sleep_type == SLEEP_INTERRUPTED); | ||
2890 | } | ||
2891 | |||
2962 | /* | 2892 | /* |
2963 | * schedule() is the main scheduler function. | 2893 | * schedule() is the main scheduler function. |
2964 | */ | 2894 | */ |
@@ -2978,13 +2908,11 @@ asmlinkage void __sched schedule(void) | |||
2978 | * schedule() atomically, we ignore that path for now. | 2908 | * schedule() atomically, we ignore that path for now. |
2979 | * Otherwise, whine if we are scheduling when we should not be. | 2909 | * Otherwise, whine if we are scheduling when we should not be. |
2980 | */ | 2910 | */ |
2981 | if (likely(!current->exit_state)) { | 2911 | if (unlikely(in_atomic() && !current->exit_state)) { |
2982 | if (unlikely(in_atomic())) { | 2912 | printk(KERN_ERR "BUG: scheduling while atomic: " |
2983 | printk(KERN_ERR "scheduling while atomic: " | 2913 | "%s/0x%08x/%d\n", |
2984 | "%s/0x%08x/%d\n", | 2914 | current->comm, preempt_count(), current->pid); |
2985 | current->comm, preempt_count(), current->pid); | 2915 | dump_stack(); |
2986 | dump_stack(); | ||
2987 | } | ||
2988 | } | 2916 | } |
2989 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 2917 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
2990 | 2918 | ||
@@ -3084,12 +3012,12 @@ go_idle: | |||
3084 | queue = array->queue + idx; | 3012 | queue = array->queue + idx; |
3085 | next = list_entry(queue->next, task_t, run_list); | 3013 | next = list_entry(queue->next, task_t, run_list); |
3086 | 3014 | ||
3087 | if (!rt_task(next) && next->activated > 0) { | 3015 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { |
3088 | unsigned long long delta = now - next->timestamp; | 3016 | unsigned long long delta = now - next->timestamp; |
3089 | if (unlikely((long long)(now - next->timestamp) < 0)) | 3017 | if (unlikely((long long)(now - next->timestamp) < 0)) |
3090 | delta = 0; | 3018 | delta = 0; |
3091 | 3019 | ||
3092 | if (next->activated == 1) | 3020 | if (next->sleep_type == SLEEP_INTERACTIVE) |
3093 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | 3021 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
3094 | 3022 | ||
3095 | array = next->array; | 3023 | array = next->array; |
@@ -3099,10 +3027,9 @@ go_idle: | |||
3099 | dequeue_task(next, array); | 3027 | dequeue_task(next, array); |
3100 | next->prio = new_prio; | 3028 | next->prio = new_prio; |
3101 | enqueue_task(next, array); | 3029 | enqueue_task(next, array); |
3102 | } else | 3030 | } |
3103 | requeue_task(next, array); | ||
3104 | } | 3031 | } |
3105 | next->activated = 0; | 3032 | next->sleep_type = SLEEP_NORMAL; |
3106 | switch_tasks: | 3033 | switch_tasks: |
3107 | if (next == rq->idle) | 3034 | if (next == rq->idle) |
3108 | schedstat_inc(rq, sched_goidle); | 3035 | schedstat_inc(rq, sched_goidle); |
@@ -3571,10 +3498,8 @@ void set_user_nice(task_t *p, long nice) | |||
3571 | goto out_unlock; | 3498 | goto out_unlock; |
3572 | } | 3499 | } |
3573 | array = p->array; | 3500 | array = p->array; |
3574 | if (array) { | 3501 | if (array) |
3575 | dequeue_task(p, array); | 3502 | dequeue_task(p, array); |
3576 | dec_prio_bias(rq, p->static_prio); | ||
3577 | } | ||
3578 | 3503 | ||
3579 | old_prio = p->prio; | 3504 | old_prio = p->prio; |
3580 | new_prio = NICE_TO_PRIO(nice); | 3505 | new_prio = NICE_TO_PRIO(nice); |
@@ -3584,7 +3509,6 @@ void set_user_nice(task_t *p, long nice) | |||
3584 | 3509 | ||
3585 | if (array) { | 3510 | if (array) { |
3586 | enqueue_task(p, array); | 3511 | enqueue_task(p, array); |
3587 | inc_prio_bias(rq, p->static_prio); | ||
3588 | /* | 3512 | /* |
3589 | * If the task increased its priority or is running and | 3513 | * If the task increased its priority or is running and |
3590 | * lowered its priority, then reschedule its CPU: | 3514 | * lowered its priority, then reschedule its CPU: |
@@ -4129,6 +4053,8 @@ static inline void __cond_resched(void) | |||
4129 | */ | 4053 | */ |
4130 | if (unlikely(preempt_count())) | 4054 | if (unlikely(preempt_count())) |
4131 | return; | 4055 | return; |
4056 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
4057 | return; | ||
4132 | do { | 4058 | do { |
4133 | add_preempt_count(PREEMPT_ACTIVE); | 4059 | add_preempt_count(PREEMPT_ACTIVE); |
4134 | schedule(); | 4060 | schedule(); |
@@ -4434,6 +4360,7 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
4434 | runqueue_t *rq = cpu_rq(cpu); | 4360 | runqueue_t *rq = cpu_rq(cpu); |
4435 | unsigned long flags; | 4361 | unsigned long flags; |
4436 | 4362 | ||
4363 | idle->timestamp = sched_clock(); | ||
4437 | idle->sleep_avg = 0; | 4364 | idle->sleep_avg = 0; |
4438 | idle->array = NULL; | 4365 | idle->array = NULL; |
4439 | idle->prio = MAX_PRIO; | 4366 | idle->prio = MAX_PRIO; |
@@ -4861,7 +4788,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4861 | /* Register at highest priority so that task migration (migrate_all_tasks) | 4788 | /* Register at highest priority so that task migration (migrate_all_tasks) |
4862 | * happens before everything else. | 4789 | * happens before everything else. |
4863 | */ | 4790 | */ |
4864 | static struct notifier_block __devinitdata migration_notifier = { | 4791 | static struct notifier_block migration_notifier = { |
4865 | .notifier_call = migration_call, | 4792 | .notifier_call = migration_call, |
4866 | .priority = 10 | 4793 | .priority = 10 |
4867 | }; | 4794 | }; |
@@ -5159,7 +5086,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
5159 | #define MAX_DOMAIN_DISTANCE 32 | 5086 | #define MAX_DOMAIN_DISTANCE 32 |
5160 | 5087 | ||
5161 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | 5088 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = |
5162 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; | 5089 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = |
5090 | /* | ||
5091 | * Architectures may override the migration cost and thus avoid | ||
5092 | * boot-time calibration. Unit is nanoseconds. Mostly useful for | ||
5093 | * virtualized hardware: | ||
5094 | */ | ||
5095 | #ifdef CONFIG_DEFAULT_MIGRATION_COST | ||
5096 | CONFIG_DEFAULT_MIGRATION_COST | ||
5097 | #else | ||
5098 | -1LL | ||
5099 | #endif | ||
5100 | }; | ||
5163 | 5101 | ||
5164 | /* | 5102 | /* |
5165 | * Allow override of migration cost - in units of microseconds. | 5103 | * Allow override of migration cost - in units of microseconds. |
@@ -5664,11 +5602,31 @@ static int cpu_to_cpu_group(int cpu) | |||
5664 | } | 5602 | } |
5665 | #endif | 5603 | #endif |
5666 | 5604 | ||
5605 | #ifdef CONFIG_SCHED_MC | ||
5606 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | ||
5607 | static struct sched_group sched_group_core[NR_CPUS]; | ||
5608 | #endif | ||
5609 | |||
5610 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | ||
5611 | static int cpu_to_core_group(int cpu) | ||
5612 | { | ||
5613 | return first_cpu(cpu_sibling_map[cpu]); | ||
5614 | } | ||
5615 | #elif defined(CONFIG_SCHED_MC) | ||
5616 | static int cpu_to_core_group(int cpu) | ||
5617 | { | ||
5618 | return cpu; | ||
5619 | } | ||
5620 | #endif | ||
5621 | |||
5667 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 5622 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
5668 | static struct sched_group sched_group_phys[NR_CPUS]; | 5623 | static struct sched_group sched_group_phys[NR_CPUS]; |
5669 | static int cpu_to_phys_group(int cpu) | 5624 | static int cpu_to_phys_group(int cpu) |
5670 | { | 5625 | { |
5671 | #ifdef CONFIG_SCHED_SMT | 5626 | #if defined(CONFIG_SCHED_MC) |
5627 | cpumask_t mask = cpu_coregroup_map(cpu); | ||
5628 | return first_cpu(mask); | ||
5629 | #elif defined(CONFIG_SCHED_SMT) | ||
5672 | return first_cpu(cpu_sibling_map[cpu]); | 5630 | return first_cpu(cpu_sibling_map[cpu]); |
5673 | #else | 5631 | #else |
5674 | return cpu; | 5632 | return cpu; |
@@ -5691,6 +5649,32 @@ static int cpu_to_allnodes_group(int cpu) | |||
5691 | { | 5649 | { |
5692 | return cpu_to_node(cpu); | 5650 | return cpu_to_node(cpu); |
5693 | } | 5651 | } |
5652 | static void init_numa_sched_groups_power(struct sched_group *group_head) | ||
5653 | { | ||
5654 | struct sched_group *sg = group_head; | ||
5655 | int j; | ||
5656 | |||
5657 | if (!sg) | ||
5658 | return; | ||
5659 | next_sg: | ||
5660 | for_each_cpu_mask(j, sg->cpumask) { | ||
5661 | struct sched_domain *sd; | ||
5662 | |||
5663 | sd = &per_cpu(phys_domains, j); | ||
5664 | if (j != first_cpu(sd->groups->cpumask)) { | ||
5665 | /* | ||
5666 | * Only add "power" once for each | ||
5667 | * physical package. | ||
5668 | */ | ||
5669 | continue; | ||
5670 | } | ||
5671 | |||
5672 | sg->cpu_power += sd->groups->cpu_power; | ||
5673 | } | ||
5674 | sg = sg->next; | ||
5675 | if (sg != group_head) | ||
5676 | goto next_sg; | ||
5677 | } | ||
5694 | #endif | 5678 | #endif |
5695 | 5679 | ||
5696 | /* | 5680 | /* |
@@ -5766,6 +5750,17 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5766 | sd->parent = p; | 5750 | sd->parent = p; |
5767 | sd->groups = &sched_group_phys[group]; | 5751 | sd->groups = &sched_group_phys[group]; |
5768 | 5752 | ||
5753 | #ifdef CONFIG_SCHED_MC | ||
5754 | p = sd; | ||
5755 | sd = &per_cpu(core_domains, i); | ||
5756 | group = cpu_to_core_group(i); | ||
5757 | *sd = SD_MC_INIT; | ||
5758 | sd->span = cpu_coregroup_map(i); | ||
5759 | cpus_and(sd->span, sd->span, *cpu_map); | ||
5760 | sd->parent = p; | ||
5761 | sd->groups = &sched_group_core[group]; | ||
5762 | #endif | ||
5763 | |||
5769 | #ifdef CONFIG_SCHED_SMT | 5764 | #ifdef CONFIG_SCHED_SMT |
5770 | p = sd; | 5765 | p = sd; |
5771 | sd = &per_cpu(cpu_domains, i); | 5766 | sd = &per_cpu(cpu_domains, i); |
@@ -5791,6 +5786,19 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5791 | } | 5786 | } |
5792 | #endif | 5787 | #endif |
5793 | 5788 | ||
5789 | #ifdef CONFIG_SCHED_MC | ||
5790 | /* Set up multi-core groups */ | ||
5791 | for_each_cpu_mask(i, *cpu_map) { | ||
5792 | cpumask_t this_core_map = cpu_coregroup_map(i); | ||
5793 | cpus_and(this_core_map, this_core_map, *cpu_map); | ||
5794 | if (i != first_cpu(this_core_map)) | ||
5795 | continue; | ||
5796 | init_sched_build_groups(sched_group_core, this_core_map, | ||
5797 | &cpu_to_core_group); | ||
5798 | } | ||
5799 | #endif | ||
5800 | |||
5801 | |||
5794 | /* Set up physical groups */ | 5802 | /* Set up physical groups */ |
5795 | for (i = 0; i < MAX_NUMNODES; i++) { | 5803 | for (i = 0; i < MAX_NUMNODES; i++) { |
5796 | cpumask_t nodemask = node_to_cpumask(i); | 5804 | cpumask_t nodemask = node_to_cpumask(i); |
@@ -5887,51 +5895,38 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5887 | power = SCHED_LOAD_SCALE; | 5895 | power = SCHED_LOAD_SCALE; |
5888 | sd->groups->cpu_power = power; | 5896 | sd->groups->cpu_power = power; |
5889 | #endif | 5897 | #endif |
5898 | #ifdef CONFIG_SCHED_MC | ||
5899 | sd = &per_cpu(core_domains, i); | ||
5900 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | ||
5901 | * SCHED_LOAD_SCALE / 10; | ||
5902 | sd->groups->cpu_power = power; | ||
5890 | 5903 | ||
5891 | sd = &per_cpu(phys_domains, i); | 5904 | sd = &per_cpu(phys_domains, i); |
5905 | |||
5906 | /* | ||
5907 | * This has to be < 2 * SCHED_LOAD_SCALE | ||
5908 | * Lets keep it SCHED_LOAD_SCALE, so that | ||
5909 | * while calculating NUMA group's cpu_power | ||
5910 | * we can simply do | ||
5911 | * numa_group->cpu_power += phys_group->cpu_power; | ||
5912 | * | ||
5913 | * See "only add power once for each physical pkg" | ||
5914 | * comment below | ||
5915 | */ | ||
5916 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | ||
5917 | #else | ||
5918 | sd = &per_cpu(phys_domains, i); | ||
5892 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 5919 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
5893 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 5920 | (cpus_weight(sd->groups->cpumask)-1) / 10; |
5894 | sd->groups->cpu_power = power; | 5921 | sd->groups->cpu_power = power; |
5895 | |||
5896 | #ifdef CONFIG_NUMA | ||
5897 | sd = &per_cpu(allnodes_domains, i); | ||
5898 | if (sd->groups) { | ||
5899 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
5900 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
5901 | sd->groups->cpu_power = power; | ||
5902 | } | ||
5903 | #endif | 5922 | #endif |
5904 | } | 5923 | } |
5905 | 5924 | ||
5906 | #ifdef CONFIG_NUMA | 5925 | #ifdef CONFIG_NUMA |
5907 | for (i = 0; i < MAX_NUMNODES; i++) { | 5926 | for (i = 0; i < MAX_NUMNODES; i++) |
5908 | struct sched_group *sg = sched_group_nodes[i]; | 5927 | init_numa_sched_groups_power(sched_group_nodes[i]); |
5909 | int j; | ||
5910 | |||
5911 | if (sg == NULL) | ||
5912 | continue; | ||
5913 | next_sg: | ||
5914 | for_each_cpu_mask(j, sg->cpumask) { | ||
5915 | struct sched_domain *sd; | ||
5916 | int power; | ||
5917 | |||
5918 | sd = &per_cpu(phys_domains, j); | ||
5919 | if (j != first_cpu(sd->groups->cpumask)) { | ||
5920 | /* | ||
5921 | * Only add "power" once for each | ||
5922 | * physical package. | ||
5923 | */ | ||
5924 | continue; | ||
5925 | } | ||
5926 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
5927 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
5928 | 5928 | ||
5929 | sg->cpu_power += power; | 5929 | init_numa_sched_groups_power(sched_group_allnodes); |
5930 | } | ||
5931 | sg = sg->next; | ||
5932 | if (sg != sched_group_nodes[i]) | ||
5933 | goto next_sg; | ||
5934 | } | ||
5935 | #endif | 5930 | #endif |
5936 | 5931 | ||
5937 | /* Attach the domains */ | 5932 | /* Attach the domains */ |
@@ -5939,6 +5934,8 @@ next_sg: | |||
5939 | struct sched_domain *sd; | 5934 | struct sched_domain *sd; |
5940 | #ifdef CONFIG_SCHED_SMT | 5935 | #ifdef CONFIG_SCHED_SMT |
5941 | sd = &per_cpu(cpu_domains, i); | 5936 | sd = &per_cpu(cpu_domains, i); |
5937 | #elif defined(CONFIG_SCHED_MC) | ||
5938 | sd = &per_cpu(core_domains, i); | ||
5942 | #else | 5939 | #else |
5943 | sd = &per_cpu(phys_domains, i); | 5940 | sd = &per_cpu(phys_domains, i); |
5944 | #endif | 5941 | #endif |
@@ -6111,7 +6108,7 @@ void __init sched_init(void) | |||
6111 | runqueue_t *rq; | 6108 | runqueue_t *rq; |
6112 | int i, j, k; | 6109 | int i, j, k; |
6113 | 6110 | ||
6114 | for_each_cpu(i) { | 6111 | for_each_possible_cpu(i) { |
6115 | prio_array_t *array; | 6112 | prio_array_t *array; |
6116 | 6113 | ||
6117 | rq = cpu_rq(i); | 6114 | rq = cpu_rq(i); |
@@ -6129,6 +6126,7 @@ void __init sched_init(void) | |||
6129 | rq->push_cpu = 0; | 6126 | rq->push_cpu = 0; |
6130 | rq->migration_thread = NULL; | 6127 | rq->migration_thread = NULL; |
6131 | INIT_LIST_HEAD(&rq->migration_queue); | 6128 | INIT_LIST_HEAD(&rq->migration_queue); |
6129 | rq->cpu = i; | ||
6132 | #endif | 6130 | #endif |
6133 | atomic_set(&rq->nr_iowait, 0); | 6131 | atomic_set(&rq->nr_iowait, 0); |
6134 | 6132 | ||
@@ -6169,7 +6167,7 @@ void __might_sleep(char *file, int line) | |||
6169 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 6167 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
6170 | return; | 6168 | return; |
6171 | prev_jiffy = jiffies; | 6169 | prev_jiffy = jiffies; |
6172 | printk(KERN_ERR "Debug: sleeping function called from invalid" | 6170 | printk(KERN_ERR "BUG: sleeping function called from invalid" |
6173 | " context at %s:%d\n", file, line); | 6171 | " context at %s:%d\n", file, line); |
6174 | printk("in_atomic():%d, irqs_disabled():%d\n", | 6172 | printk("in_atomic():%d, irqs_disabled():%d\n", |
6175 | in_atomic(), irqs_disabled()); | 6173 | in_atomic(), irqs_disabled()); |
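
A minimal userspace sketch of the cpu_power formula the CONFIG_SCHED_MC hunks above apply to a core-level sched group (full load scale plus 10% of the scale per extra sibling). The SCHED_LOAD_SCALE value of 128 and the two-sibling core are illustrative assumptions, not values stated in the diff.

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 128UL  /* assumed value of the kernel constant */

    int main(void)
    {
        unsigned long siblings = 2; /* stands in for cpus_weight(sd->groups->cpumask) */
        unsigned long power;

        /* core domain: full scale plus 10% of the scale per extra sibling */
        power = SCHED_LOAD_SCALE + (siblings - 1) * SCHED_LOAD_SCALE / 10;
        printf("core-level cpu_power = %lu\n", power); /* prints 140 for a 2-thread core */

        return 0;
    }
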
diff --git a/kernel/signal.c b/kernel/signal.c index b373fc2420..e5f8aea78f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -22,7 +22,6 @@ | |||
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
24 | #include <linux/ptrace.h> | 24 | #include <linux/ptrace.h> |
25 | #include <linux/posix-timers.h> | ||
26 | #include <linux/signal.h> | 25 | #include <linux/signal.h> |
27 | #include <linux/audit.h> | 26 | #include <linux/audit.h> |
28 | #include <linux/capability.h> | 27 | #include <linux/capability.h> |
@@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep; | |||
147 | #define sig_kernel_stop(sig) \ | 146 | #define sig_kernel_stop(sig) \ |
148 | (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) | 147 | (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) |
149 | 148 | ||
149 | #define sig_needs_tasklist(sig) ((sig) == SIGCONT) | ||
150 | |||
150 | #define sig_user_defined(t, signr) \ | 151 | #define sig_user_defined(t, signr) \ |
151 | (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ | 152 | (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ |
152 | ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) | 153 | ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) |
@@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q) | |||
292 | kmem_cache_free(sigqueue_cachep, q); | 293 | kmem_cache_free(sigqueue_cachep, q); |
293 | } | 294 | } |
294 | 295 | ||
295 | static void flush_sigqueue(struct sigpending *queue) | 296 | void flush_sigqueue(struct sigpending *queue) |
296 | { | 297 | { |
297 | struct sigqueue *q; | 298 | struct sigqueue *q; |
298 | 299 | ||
@@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue) | |||
307 | /* | 308 | /* |
308 | * Flush all pending signals for a task. | 309 | * Flush all pending signals for a task. |
309 | */ | 310 | */ |
310 | 311 | void flush_signals(struct task_struct *t) | |
311 | void | ||
312 | flush_signals(struct task_struct *t) | ||
313 | { | 312 | { |
314 | unsigned long flags; | 313 | unsigned long flags; |
315 | 314 | ||
@@ -321,109 +320,6 @@ flush_signals(struct task_struct *t) | |||
321 | } | 320 | } |
322 | 321 | ||
323 | /* | 322 | /* |
324 | * This function expects the tasklist_lock write-locked. | ||
325 | */ | ||
326 | void __exit_sighand(struct task_struct *tsk) | ||
327 | { | ||
328 | struct sighand_struct * sighand = tsk->sighand; | ||
329 | |||
330 | /* Ok, we're done with the signal handlers */ | ||
331 | tsk->sighand = NULL; | ||
332 | if (atomic_dec_and_test(&sighand->count)) | ||
333 | sighand_free(sighand); | ||
334 | } | ||
335 | |||
336 | void exit_sighand(struct task_struct *tsk) | ||
337 | { | ||
338 | write_lock_irq(&tasklist_lock); | ||
339 | rcu_read_lock(); | ||
340 | if (tsk->sighand != NULL) { | ||
341 | struct sighand_struct *sighand = rcu_dereference(tsk->sighand); | ||
342 | spin_lock(&sighand->siglock); | ||
343 | __exit_sighand(tsk); | ||
344 | spin_unlock(&sighand->siglock); | ||
345 | } | ||
346 | rcu_read_unlock(); | ||
347 | write_unlock_irq(&tasklist_lock); | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * This function expects the tasklist_lock write-locked. | ||
352 | */ | ||
353 | void __exit_signal(struct task_struct *tsk) | ||
354 | { | ||
355 | struct signal_struct * sig = tsk->signal; | ||
356 | struct sighand_struct * sighand; | ||
357 | |||
358 | if (!sig) | ||
359 | BUG(); | ||
360 | if (!atomic_read(&sig->count)) | ||
361 | BUG(); | ||
362 | rcu_read_lock(); | ||
363 | sighand = rcu_dereference(tsk->sighand); | ||
364 | spin_lock(&sighand->siglock); | ||
365 | posix_cpu_timers_exit(tsk); | ||
366 | if (atomic_dec_and_test(&sig->count)) { | ||
367 | posix_cpu_timers_exit_group(tsk); | ||
368 | tsk->signal = NULL; | ||
369 | __exit_sighand(tsk); | ||
370 | spin_unlock(&sighand->siglock); | ||
371 | flush_sigqueue(&sig->shared_pending); | ||
372 | } else { | ||
373 | /* | ||
374 | * If there is any task waiting for the group exit | ||
375 | * then notify it: | ||
376 | */ | ||
377 | if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { | ||
378 | wake_up_process(sig->group_exit_task); | ||
379 | sig->group_exit_task = NULL; | ||
380 | } | ||
381 | if (tsk == sig->curr_target) | ||
382 | sig->curr_target = next_thread(tsk); | ||
383 | tsk->signal = NULL; | ||
384 | /* | ||
385 | * Accumulate here the counters for all threads but the | ||
386 | * group leader as they die, so they can be added into | ||
387 | * the process-wide totals when those are taken. | ||
388 | * The group leader stays around as a zombie as long | ||
389 | * as there are other threads. When it gets reaped, | ||
390 | * the exit.c code will add its counts into these totals. | ||
391 | * We won't ever get here for the group leader, since it | ||
392 | * will have been the last reference on the signal_struct. | ||
393 | */ | ||
394 | sig->utime = cputime_add(sig->utime, tsk->utime); | ||
395 | sig->stime = cputime_add(sig->stime, tsk->stime); | ||
396 | sig->min_flt += tsk->min_flt; | ||
397 | sig->maj_flt += tsk->maj_flt; | ||
398 | sig->nvcsw += tsk->nvcsw; | ||
399 | sig->nivcsw += tsk->nivcsw; | ||
400 | sig->sched_time += tsk->sched_time; | ||
401 | __exit_sighand(tsk); | ||
402 | spin_unlock(&sighand->siglock); | ||
403 | sig = NULL; /* Marker for below. */ | ||
404 | } | ||
405 | rcu_read_unlock(); | ||
406 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | ||
407 | flush_sigqueue(&tsk->pending); | ||
408 | if (sig) { | ||
409 | /* | ||
410 | * We are cleaning up the signal_struct here. | ||
411 | */ | ||
412 | exit_thread_group_keys(sig); | ||
413 | kmem_cache_free(signal_cachep, sig); | ||
414 | } | ||
415 | } | ||
416 | |||
417 | void exit_signal(struct task_struct *tsk) | ||
418 | { | ||
419 | atomic_dec(&tsk->signal->live); | ||
420 | |||
421 | write_lock_irq(&tasklist_lock); | ||
422 | __exit_signal(tsk); | ||
423 | write_unlock_irq(&tasklist_lock); | ||
424 | } | ||
425 | |||
426 | /* | ||
427 | * Flush all handlers for a task. | 323 | * Flush all handlers for a task. |
428 | */ | 324 | */ |
429 | 325 | ||
@@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
695 | } | 591 | } |
696 | 592 | ||
697 | /* forward decl */ | 593 | /* forward decl */ |
698 | static void do_notify_parent_cldstop(struct task_struct *tsk, | 594 | static void do_notify_parent_cldstop(struct task_struct *tsk, int why); |
699 | int to_self, | ||
700 | int why); | ||
701 | 595 | ||
702 | /* | 596 | /* |
703 | * Handle magic process-wide effects of stop/continue signals. | 597 | * Handle magic process-wide effects of stop/continue signals. |
@@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
747 | p->signal->group_stop_count = 0; | 641 | p->signal->group_stop_count = 0; |
748 | p->signal->flags = SIGNAL_STOP_CONTINUED; | 642 | p->signal->flags = SIGNAL_STOP_CONTINUED; |
749 | spin_unlock(&p->sighand->siglock); | 643 | spin_unlock(&p->sighand->siglock); |
750 | do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); | 644 | do_notify_parent_cldstop(p, CLD_STOPPED); |
751 | spin_lock(&p->sighand->siglock); | 645 | spin_lock(&p->sighand->siglock); |
752 | } | 646 | } |
753 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); | 647 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); |
@@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
788 | p->signal->flags = SIGNAL_STOP_CONTINUED; | 682 | p->signal->flags = SIGNAL_STOP_CONTINUED; |
789 | p->signal->group_exit_code = 0; | 683 | p->signal->group_exit_code = 0; |
790 | spin_unlock(&p->sighand->siglock); | 684 | spin_unlock(&p->sighand->siglock); |
791 | do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); | 685 | do_notify_parent_cldstop(p, CLD_CONTINUED); |
792 | spin_lock(&p->sighand->siglock); | 686 | spin_lock(&p->sighand->siglock); |
793 | } else { | 687 | } else { |
794 | /* | 688 | /* |
@@ -875,8 +769,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
875 | { | 769 | { |
876 | int ret = 0; | 770 | int ret = 0; |
877 | 771 | ||
878 | if (!irqs_disabled()) | 772 | BUG_ON(!irqs_disabled()); |
879 | BUG(); | ||
880 | assert_spin_locked(&t->sighand->siglock); | 773 | assert_spin_locked(&t->sighand->siglock); |
881 | 774 | ||
882 | /* Short-circuit ignored signals. */ | 775 | /* Short-circuit ignored signals. */ |
@@ -975,7 +868,6 @@ __group_complete_signal(int sig, struct task_struct *p) | |||
975 | if (t == NULL) | 868 | if (t == NULL) |
976 | /* restart balancing at this thread */ | 869 | /* restart balancing at this thread */ |
977 | t = p->signal->curr_target = p; | 870 | t = p->signal->curr_target = p; |
978 | BUG_ON(t->tgid != p->tgid); | ||
979 | 871 | ||
980 | while (!wants_signal(sig, t)) { | 872 | while (!wants_signal(sig, t)) { |
981 | t = next_thread(t); | 873 | t = next_thread(t); |
@@ -1120,27 +1012,37 @@ void zap_other_threads(struct task_struct *p) | |||
1120 | /* | 1012 | /* |
1121 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | 1013 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. |
1122 | */ | 1014 | */ |
1015 | struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) | ||
1016 | { | ||
1017 | struct sighand_struct *sighand; | ||
1018 | |||
1019 | for (;;) { | ||
1020 | sighand = rcu_dereference(tsk->sighand); | ||
1021 | if (unlikely(sighand == NULL)) | ||
1022 | break; | ||
1023 | |||
1024 | spin_lock_irqsave(&sighand->siglock, *flags); | ||
1025 | if (likely(sighand == tsk->sighand)) | ||
1026 | break; | ||
1027 | spin_unlock_irqrestore(&sighand->siglock, *flags); | ||
1028 | } | ||
1029 | |||
1030 | return sighand; | ||
1031 | } | ||
1032 | |||
1123 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | 1033 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) |
1124 | { | 1034 | { |
1125 | unsigned long flags; | 1035 | unsigned long flags; |
1126 | struct sighand_struct *sp; | ||
1127 | int ret; | 1036 | int ret; |
1128 | 1037 | ||
1129 | retry: | ||
1130 | ret = check_kill_permission(sig, info, p); | 1038 | ret = check_kill_permission(sig, info, p); |
1131 | if (!ret && sig && (sp = rcu_dereference(p->sighand))) { | 1039 | |
1132 | spin_lock_irqsave(&sp->siglock, flags); | 1040 | if (!ret && sig) { |
1133 | if (p->sighand != sp) { | 1041 | ret = -ESRCH; |
1134 | spin_unlock_irqrestore(&sp->siglock, flags); | 1042 | if (lock_task_sighand(p, &flags)) { |
1135 | goto retry; | 1043 | ret = __group_send_sig_info(sig, info, p); |
1136 | } | 1044 | unlock_task_sighand(p, &flags); |
1137 | if ((atomic_read(&sp->count) == 0) || | ||
1138 | (atomic_read(&p->usage) == 0)) { | ||
1139 | spin_unlock_irqrestore(&sp->siglock, flags); | ||
1140 | return -ESRCH; | ||
1141 | } | 1045 | } |
1142 | ret = __group_send_sig_info(sig, info, p); | ||
1143 | spin_unlock_irqrestore(&sp->siglock, flags); | ||
1144 | } | 1046 | } |
1145 | 1047 | ||
1146 | return ret; | 1048 | return ret; |
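
The hunk above replaces the open-coded sighand retry loop with lock_task_sighand(). Below is a minimal caller sketch modeled on the new group_send_sig_info(); unlock_task_sighand() is assumed to be the matching helper that drops sighand->siglock and restores the saved flags (it is called above but not defined in this hunk), and the caller is assumed to hold rcu_read_lock() or a read lock on tasklist_lock, as the comment above requires.

    #include <linux/errno.h>
    #include <linux/sched.h>
    #include <linux/signal.h>

    static int sketch_group_send(int sig, struct siginfo *info, struct task_struct *p)
    {
        unsigned long flags;
        int ret = -ESRCH;

        if (lock_task_sighand(p, &flags)) {
            /* p->sighand is pinned and siglock is held with IRQs off */
            ret = __group_send_sig_info(sig, info, p);
            unlock_task_sighand(p, &flags);
        }
        return ret;
    }
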
@@ -1189,7 +1091,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
1189 | struct task_struct *p; | 1091 | struct task_struct *p; |
1190 | 1092 | ||
1191 | rcu_read_lock(); | 1093 | rcu_read_lock(); |
1192 | if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { | 1094 | if (unlikely(sig_needs_tasklist(sig))) { |
1193 | read_lock(&tasklist_lock); | 1095 | read_lock(&tasklist_lock); |
1194 | acquired_tasklist_lock = 1; | 1096 | acquired_tasklist_lock = 1; |
1195 | } | 1097 | } |
@@ -1405,12 +1307,10 @@ void sigqueue_free(struct sigqueue *q) | |||
1405 | __sigqueue_free(q); | 1307 | __sigqueue_free(q); |
1406 | } | 1308 | } |
1407 | 1309 | ||
1408 | int | 1310 | int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) |
1409 | send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | ||
1410 | { | 1311 | { |
1411 | unsigned long flags; | 1312 | unsigned long flags; |
1412 | int ret = 0; | 1313 | int ret = 0; |
1413 | struct sighand_struct *sh; | ||
1414 | 1314 | ||
1415 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1315 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1416 | 1316 | ||
@@ -1424,48 +1324,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1424 | */ | 1324 | */ |
1425 | rcu_read_lock(); | 1325 | rcu_read_lock(); |
1426 | 1326 | ||
1427 | if (unlikely(p->flags & PF_EXITING)) { | 1327 | if (!likely(lock_task_sighand(p, &flags))) { |
1428 | ret = -1; | 1328 | ret = -1; |
1429 | goto out_err; | 1329 | goto out_err; |
1430 | } | 1330 | } |
1431 | 1331 | ||
1432 | retry: | ||
1433 | sh = rcu_dereference(p->sighand); | ||
1434 | |||
1435 | spin_lock_irqsave(&sh->siglock, flags); | ||
1436 | if (p->sighand != sh) { | ||
1437 | /* We raced with exec() in a multithreaded process... */ | ||
1438 | spin_unlock_irqrestore(&sh->siglock, flags); | ||
1439 | goto retry; | ||
1440 | } | ||
1441 | |||
1442 | /* | ||
1443 | * We do the check here again to handle the following scenario: | ||
1444 | * | ||
1445 | * CPU 0 CPU 1 | ||
1446 | * send_sigqueue | ||
1447 | * check PF_EXITING | ||
1448 | * interrupt exit code running | ||
1449 | * __exit_signal | ||
1450 | * lock sighand->siglock | ||
1451 | * unlock sighand->siglock | ||
1452 | * lock sh->siglock | ||
1453 | * add(tsk->pending) flush_sigqueue(tsk->pending) | ||
1454 | * | ||
1455 | */ | ||
1456 | |||
1457 | if (unlikely(p->flags & PF_EXITING)) { | ||
1458 | ret = -1; | ||
1459 | goto out; | ||
1460 | } | ||
1461 | |||
1462 | if (unlikely(!list_empty(&q->list))) { | 1332 | if (unlikely(!list_empty(&q->list))) { |
1463 | /* | 1333 | /* |
1464 | * If an SI_TIMER entry is already queue just increment | 1334 | * If an SI_TIMER entry is already queue just increment |
1465 | * the overrun count. | 1335 | * the overrun count. |
1466 | */ | 1336 | */ |
1467 | if (q->info.si_code != SI_TIMER) | 1337 | BUG_ON(q->info.si_code != SI_TIMER); |
1468 | BUG(); | ||
1469 | q->info.si_overrun++; | 1338 | q->info.si_overrun++; |
1470 | goto out; | 1339 | goto out; |
1471 | } | 1340 | } |
@@ -1481,7 +1350,7 @@ retry: | |||
1481 | signal_wake_up(p, sig == SIGKILL); | 1350 | signal_wake_up(p, sig == SIGKILL); |
1482 | 1351 | ||
1483 | out: | 1352 | out: |
1484 | spin_unlock_irqrestore(&sh->siglock, flags); | 1353 | unlock_task_sighand(p, &flags); |
1485 | out_err: | 1354 | out_err: |
1486 | rcu_read_unlock(); | 1355 | rcu_read_unlock(); |
1487 | 1356 | ||
@@ -1513,8 +1382,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1513 | * the overrun count. Other uses should not try to | 1382 | * the overrun count. Other uses should not try to |
1514 | * send the signal multiple times. | 1383 | * send the signal multiple times. |
1515 | */ | 1384 | */ |
1516 | if (q->info.si_code != SI_TIMER) | 1385 | BUG_ON(q->info.si_code != SI_TIMER); |
1517 | BUG(); | ||
1518 | q->info.si_overrun++; | 1386 | q->info.si_overrun++; |
1519 | goto out; | 1387 | goto out; |
1520 | } | 1388 | } |
@@ -1613,14 +1481,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) | |||
1613 | spin_unlock_irqrestore(&psig->siglock, flags); | 1481 | spin_unlock_irqrestore(&psig->siglock, flags); |
1614 | } | 1482 | } |
1615 | 1483 | ||
1616 | static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) | 1484 | static void do_notify_parent_cldstop(struct task_struct *tsk, int why) |
1617 | { | 1485 | { |
1618 | struct siginfo info; | 1486 | struct siginfo info; |
1619 | unsigned long flags; | 1487 | unsigned long flags; |
1620 | struct task_struct *parent; | 1488 | struct task_struct *parent; |
1621 | struct sighand_struct *sighand; | 1489 | struct sighand_struct *sighand; |
1622 | 1490 | ||
1623 | if (to_self) | 1491 | if (tsk->ptrace & PT_PTRACED) |
1624 | parent = tsk->parent; | 1492 | parent = tsk->parent; |
1625 | else { | 1493 | else { |
1626 | tsk = tsk->group_leader; | 1494 | tsk = tsk->group_leader; |
@@ -1689,13 +1557,14 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) | |||
1689 | /* Let the debugger run. */ | 1557 | /* Let the debugger run. */ |
1690 | set_current_state(TASK_TRACED); | 1558 | set_current_state(TASK_TRACED); |
1691 | spin_unlock_irq(¤t->sighand->siglock); | 1559 | spin_unlock_irq(¤t->sighand->siglock); |
1560 | try_to_freeze(); | ||
1692 | read_lock(&tasklist_lock); | 1561 | read_lock(&tasklist_lock); |
1693 | if (likely(current->ptrace & PT_PTRACED) && | 1562 | if (likely(current->ptrace & PT_PTRACED) && |
1694 | likely(current->parent != current->real_parent || | 1563 | likely(current->parent != current->real_parent || |
1695 | !(current->ptrace & PT_ATTACHED)) && | 1564 | !(current->ptrace & PT_ATTACHED)) && |
1696 | (likely(current->parent->signal != current->signal) || | 1565 | (likely(current->parent->signal != current->signal) || |
1697 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { | 1566 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { |
1698 | do_notify_parent_cldstop(current, 1, CLD_TRAPPED); | 1567 | do_notify_parent_cldstop(current, CLD_TRAPPED); |
1699 | read_unlock(&tasklist_lock); | 1568 | read_unlock(&tasklist_lock); |
1700 | schedule(); | 1569 | schedule(); |
1701 | } else { | 1570 | } else { |
@@ -1744,25 +1613,17 @@ void ptrace_notify(int exit_code) | |||
1744 | static void | 1613 | static void |
1745 | finish_stop(int stop_count) | 1614 | finish_stop(int stop_count) |
1746 | { | 1615 | { |
1747 | int to_self; | ||
1748 | |||
1749 | /* | 1616 | /* |
1750 | * If there are no other threads in the group, or if there is | 1617 | * If there are no other threads in the group, or if there is |
1751 | * a group stop in progress and we are the last to stop, | 1618 | * a group stop in progress and we are the last to stop, |
1752 | * report to the parent. When ptraced, every thread reports itself. | 1619 | * report to the parent. When ptraced, every thread reports itself. |
1753 | */ | 1620 | */ |
1754 | if (stop_count < 0 || (current->ptrace & PT_PTRACED)) | 1621 | if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { |
1755 | to_self = 1; | 1622 | read_lock(&tasklist_lock); |
1756 | else if (stop_count == 0) | 1623 | do_notify_parent_cldstop(current, CLD_STOPPED); |
1757 | to_self = 0; | 1624 | read_unlock(&tasklist_lock); |
1758 | else | 1625 | } |
1759 | goto out; | ||
1760 | |||
1761 | read_lock(&tasklist_lock); | ||
1762 | do_notify_parent_cldstop(current, to_self, CLD_STOPPED); | ||
1763 | read_unlock(&tasklist_lock); | ||
1764 | 1626 | ||
1765 | out: | ||
1766 | schedule(); | 1627 | schedule(); |
1767 | /* | 1628 | /* |
1768 | * Now we don't run again until continued. | 1629 | * Now we don't run again until continued. |
@@ -1776,12 +1637,10 @@ out: | |||
1776 | * Returns nonzero if we've actually stopped and released the siglock. | 1637 | * Returns nonzero if we've actually stopped and released the siglock. |
1777 | * Returns zero if we didn't stop and still hold the siglock. | 1638 | * Returns zero if we didn't stop and still hold the siglock. |
1778 | */ | 1639 | */ |
1779 | static int | 1640 | static int do_signal_stop(int signr) |
1780 | do_signal_stop(int signr) | ||
1781 | { | 1641 | { |
1782 | struct signal_struct *sig = current->signal; | 1642 | struct signal_struct *sig = current->signal; |
1783 | struct sighand_struct *sighand = current->sighand; | 1643 | int stop_count; |
1784 | int stop_count = -1; | ||
1785 | 1644 | ||
1786 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) | 1645 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) |
1787 | return 0; | 1646 | return 0; |
@@ -1791,86 +1650,37 @@ do_signal_stop(int signr) | |||
1791 | * There is a group stop in progress. We don't need to | 1650 | * There is a group stop in progress. We don't need to |
1792 | * start another one. | 1651 | * start another one. |
1793 | */ | 1652 | */ |
1794 | signr = sig->group_exit_code; | ||
1795 | stop_count = --sig->group_stop_count; | 1653 | stop_count = --sig->group_stop_count; |
1796 | current->exit_code = signr; | 1654 | } else { |
1797 | set_current_state(TASK_STOPPED); | ||
1798 | if (stop_count == 0) | ||
1799 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1800 | spin_unlock_irq(&sighand->siglock); | ||
1801 | } | ||
1802 | else if (thread_group_empty(current)) { | ||
1803 | /* | ||
1804 | * Lock must be held through transition to stopped state. | ||
1805 | */ | ||
1806 | current->exit_code = current->signal->group_exit_code = signr; | ||
1807 | set_current_state(TASK_STOPPED); | ||
1808 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1809 | spin_unlock_irq(&sighand->siglock); | ||
1810 | } | ||
1811 | else { | ||
1812 | /* | 1655 | /* |
1813 | * There is no group stop already in progress. | 1656 | * There is no group stop already in progress. |
1814 | * We must initiate one now, but that requires | 1657 | * We must initiate one now. |
1815 | * dropping siglock to get both the tasklist lock | ||
1816 | * and siglock again in the proper order. Note that | ||
1817 | * this allows an intervening SIGCONT to be posted. | ||
1818 | * We need to check for that and bail out if necessary. | ||
1819 | */ | 1658 | */ |
1820 | struct task_struct *t; | 1659 | struct task_struct *t; |
1821 | 1660 | ||
1822 | spin_unlock_irq(&sighand->siglock); | 1661 | sig->group_exit_code = signr; |
1823 | |||
1824 | /* signals can be posted during this window */ | ||
1825 | |||
1826 | read_lock(&tasklist_lock); | ||
1827 | spin_lock_irq(&sighand->siglock); | ||
1828 | 1662 | ||
1829 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) { | 1663 | stop_count = 0; |
1664 | for (t = next_thread(current); t != current; t = next_thread(t)) | ||
1830 | /* | 1665 | /* |
1831 | * Another stop or continue happened while we | 1666 | * Setting state to TASK_STOPPED for a group |
1832 | * didn't have the lock. We can just swallow this | 1667 | * stop is always done with the siglock held, |
1833 | * signal now. If we raced with a SIGCONT, that | 1668 | * so this check has no races. |
1834 | * should have just cleared it now. If we raced | ||
1835 | * with another processor delivering a stop signal, | ||
1836 | * then the SIGCONT that wakes us up should clear it. | ||
1837 | */ | 1669 | */ |
1838 | read_unlock(&tasklist_lock); | 1670 | if (!t->exit_state && |
1839 | return 0; | 1671 | !(t->state & (TASK_STOPPED|TASK_TRACED))) { |
1840 | } | 1672 | stop_count++; |
1841 | 1673 | signal_wake_up(t, 0); | |
1842 | if (sig->group_stop_count == 0) { | 1674 | } |
1843 | sig->group_exit_code = signr; | 1675 | sig->group_stop_count = stop_count; |
1844 | stop_count = 0; | ||
1845 | for (t = next_thread(current); t != current; | ||
1846 | t = next_thread(t)) | ||
1847 | /* | ||
1848 | * Setting state to TASK_STOPPED for a group | ||
1849 | * stop is always done with the siglock held, | ||
1850 | * so this check has no races. | ||
1851 | */ | ||
1852 | if (!t->exit_state && | ||
1853 | !(t->state & (TASK_STOPPED|TASK_TRACED))) { | ||
1854 | stop_count++; | ||
1855 | signal_wake_up(t, 0); | ||
1856 | } | ||
1857 | sig->group_stop_count = stop_count; | ||
1858 | } | ||
1859 | else { | ||
1860 | /* A race with another thread while unlocked. */ | ||
1861 | signr = sig->group_exit_code; | ||
1862 | stop_count = --sig->group_stop_count; | ||
1863 | } | ||
1864 | |||
1865 | current->exit_code = signr; | ||
1866 | set_current_state(TASK_STOPPED); | ||
1867 | if (stop_count == 0) | ||
1868 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1869 | |||
1870 | spin_unlock_irq(&sighand->siglock); | ||
1871 | read_unlock(&tasklist_lock); | ||
1872 | } | 1676 | } |
1873 | 1677 | ||
1678 | if (stop_count == 0) | ||
1679 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1680 | current->exit_code = sig->group_exit_code; | ||
1681 | __set_current_state(TASK_STOPPED); | ||
1682 | |||
1683 | spin_unlock_irq(¤t->sighand->siglock); | ||
1874 | finish_stop(stop_count); | 1684 | finish_stop(stop_count); |
1875 | return 1; | 1685 | return 1; |
1876 | } | 1686 | } |
@@ -1922,6 +1732,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
1922 | sigset_t *mask = ¤t->blocked; | 1732 | sigset_t *mask = ¤t->blocked; |
1923 | int signr = 0; | 1733 | int signr = 0; |
1924 | 1734 | ||
1735 | try_to_freeze(); | ||
1736 | |||
1925 | relock: | 1737 | relock: |
1926 | spin_lock_irq(¤t->sighand->siglock); | 1738 | spin_lock_irq(¤t->sighand->siglock); |
1927 | for (;;) { | 1739 | for (;;) { |
@@ -1942,9 +1754,9 @@ relock: | |||
1942 | /* Let the debugger run. */ | 1754 | /* Let the debugger run. */ |
1943 | ptrace_stop(signr, signr, info); | 1755 | ptrace_stop(signr, signr, info); |
1944 | 1756 | ||
1945 | /* We're back. Did the debugger cancel the sig or group_exit? */ | 1757 | /* We're back. Did the debugger cancel the sig? */ |
1946 | signr = current->exit_code; | 1758 | signr = current->exit_code; |
1947 | if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) | 1759 | if (signr == 0) |
1948 | continue; | 1760 | continue; |
1949 | 1761 | ||
1950 | current->exit_code = 0; | 1762 | current->exit_code = 0; |
@@ -1988,7 +1800,7 @@ relock: | |||
1988 | continue; | 1800 | continue; |
1989 | 1801 | ||
1990 | /* Init gets no signals it doesn't want. */ | 1802 | /* Init gets no signals it doesn't want. */ |
1991 | if (current->pid == 1) | 1803 | if (current == child_reaper) |
1992 | continue; | 1804 | continue; |
1993 | 1805 | ||
1994 | if (sig_kernel_stop(signr)) { | 1806 | if (sig_kernel_stop(signr)) { |
@@ -2099,10 +1911,11 @@ long do_no_restart_syscall(struct restart_block *param) | |||
2099 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | 1911 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) |
2100 | { | 1912 | { |
2101 | int error; | 1913 | int error; |
2102 | sigset_t old_block; | ||
2103 | 1914 | ||
2104 | spin_lock_irq(¤t->sighand->siglock); | 1915 | spin_lock_irq(¤t->sighand->siglock); |
2105 | old_block = current->blocked; | 1916 | if (oldset) |
1917 | *oldset = current->blocked; | ||
1918 | |||
2106 | error = 0; | 1919 | error = 0; |
2107 | switch (how) { | 1920 | switch (how) { |
2108 | case SIG_BLOCK: | 1921 | case SIG_BLOCK: |
@@ -2119,8 +1932,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | |||
2119 | } | 1932 | } |
2120 | recalc_sigpending(); | 1933 | recalc_sigpending(); |
2121 | spin_unlock_irq(¤t->sighand->siglock); | 1934 | spin_unlock_irq(¤t->sighand->siglock); |
2122 | if (oldset) | 1935 | |
2123 | *oldset = old_block; | ||
2124 | return error; | 1936 | return error; |
2125 | } | 1937 | } |
2126 | 1938 | ||
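
A short process-context sketch of the reworked sigprocmask() above: the previous mask is now copied to *oldset under siglock, so a caller simply passes a sigset_t to save and later restore. The helper name and the choice of SIGUSR1 are illustrative only.

    #include <linux/sched.h>
    #include <linux/signal.h>

    static void sketch_block_sigusr1(void)
    {
        sigset_t newset, saved;

        siginitset(&newset, sigmask(SIGUSR1));
        sigprocmask(SIG_BLOCK, &newset, &saved);   /* old mask saved under siglock */

        /* ... run with SIGUSR1 blocked for the current task ... */

        sigprocmask(SIG_SETMASK, &saved, NULL);    /* restore; a NULL oldset is skipped */
    }
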
@@ -2307,7 +2119,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, | |||
2307 | 2119 | ||
2308 | timeout = schedule_timeout_interruptible(timeout); | 2120 | timeout = schedule_timeout_interruptible(timeout); |
2309 | 2121 | ||
2310 | try_to_freeze(); | ||
2311 | spin_lock_irq(¤t->sighand->siglock); | 2122 | spin_lock_irq(¤t->sighand->siglock); |
2312 | sig = dequeue_signal(current, &these, &info); | 2123 | sig = dequeue_signal(current, &these, &info); |
2313 | current->blocked = current->real_blocked; | 2124 | current->blocked = current->real_blocked; |
@@ -2429,8 +2240,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) | |||
2429 | return kill_proc_info(sig, &info, pid); | 2240 | return kill_proc_info(sig, &info, pid); |
2430 | } | 2241 | } |
2431 | 2242 | ||
2432 | int | 2243 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
2433 | do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | ||
2434 | { | 2244 | { |
2435 | struct k_sigaction *k; | 2245 | struct k_sigaction *k; |
2436 | sigset_t mask; | 2246 | sigset_t mask; |
@@ -2454,6 +2264,9 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | |||
2454 | *oact = *k; | 2264 | *oact = *k; |
2455 | 2265 | ||
2456 | if (act) { | 2266 | if (act) { |
2267 | sigdelsetmask(&act->sa.sa_mask, | ||
2268 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2269 | *k = *act; | ||
2457 | /* | 2270 | /* |
2458 | * POSIX 3.3.1.3: | 2271 | * POSIX 3.3.1.3: |
2459 | * "Setting a signal action to SIG_IGN for a signal that is | 2272 | * "Setting a signal action to SIG_IGN for a signal that is |
@@ -2466,21 +2279,8 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | |||
2466 | * be discarded, whether or not it is blocked" | 2279 | * be discarded, whether or not it is blocked" |
2467 | */ | 2280 | */ |
2468 | if (act->sa.sa_handler == SIG_IGN || | 2281 | if (act->sa.sa_handler == SIG_IGN || |
2469 | (act->sa.sa_handler == SIG_DFL && | 2282 | (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) { |
2470 | sig_kernel_ignore(sig))) { | ||
2471 | /* | ||
2472 | * This is a fairly rare case, so we only take the | ||
2473 | * tasklist_lock once we're sure we'll need it. | ||
2474 | * Now we must do this little unlock and relock | ||
2475 | * dance to maintain the lock hierarchy. | ||
2476 | */ | ||
2477 | struct task_struct *t = current; | 2283 | struct task_struct *t = current; |
2478 | spin_unlock_irq(&t->sighand->siglock); | ||
2479 | read_lock(&tasklist_lock); | ||
2480 | spin_lock_irq(&t->sighand->siglock); | ||
2481 | *k = *act; | ||
2482 | sigdelsetmask(&k->sa.sa_mask, | ||
2483 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2484 | sigemptyset(&mask); | 2284 | sigemptyset(&mask); |
2485 | sigaddset(&mask, sig); | 2285 | sigaddset(&mask, sig); |
2486 | rm_from_queue_full(&mask, &t->signal->shared_pending); | 2286 | rm_from_queue_full(&mask, &t->signal->shared_pending); |
@@ -2489,14 +2289,7 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | |||
2489 | recalc_sigpending_tsk(t); | 2289 | recalc_sigpending_tsk(t); |
2490 | t = next_thread(t); | 2290 | t = next_thread(t); |
2491 | } while (t != current); | 2291 | } while (t != current); |
2492 | spin_unlock_irq(¤t->sighand->siglock); | ||
2493 | read_unlock(&tasklist_lock); | ||
2494 | return 0; | ||
2495 | } | 2292 | } |
2496 | |||
2497 | *k = *act; | ||
2498 | sigdelsetmask(&k->sa.sa_mask, | ||
2499 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2500 | } | 2293 | } |
2501 | 2294 | ||
2502 | spin_unlock_irq(¤t->sighand->siglock); | 2295 | spin_unlock_irq(¤t->sighand->siglock); |
@@ -2702,6 +2495,7 @@ sys_signal(int sig, __sighandler_t handler) | |||
2702 | 2495 | ||
2703 | new_sa.sa.sa_handler = handler; | 2496 | new_sa.sa.sa_handler = handler; |
2704 | new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; | 2497 | new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; |
2498 | sigemptyset(&new_sa.sa.sa_mask); | ||
2705 | 2499 | ||
2706 | ret = do_sigaction(sig, &new_sa, &old_sa); | 2500 | ret = do_sigaction(sig, &new_sa, &old_sa); |
2707 | 2501 | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c index ad3295cdde..336f92d64e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
17 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
18 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
19 | #include <linux/smp.h> | ||
19 | 20 | ||
20 | #include <asm/irq.h> | 21 | #include <asm/irq.h> |
21 | /* | 22 | /* |
@@ -445,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu) | |||
445 | } | 446 | } |
446 | #endif /* CONFIG_HOTPLUG_CPU */ | 447 | #endif /* CONFIG_HOTPLUG_CPU */ |
447 | 448 | ||
448 | static int __devinit cpu_callback(struct notifier_block *nfb, | 449 | static int cpu_callback(struct notifier_block *nfb, |
449 | unsigned long action, | 450 | unsigned long action, |
450 | void *hcpu) | 451 | void *hcpu) |
451 | { | 452 | { |
@@ -483,7 +484,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
483 | return NOTIFY_OK; | 484 | return NOTIFY_OK; |
484 | } | 485 | } |
485 | 486 | ||
486 | static struct notifier_block __devinitdata cpu_nfb = { | 487 | static struct notifier_block cpu_nfb = { |
487 | .notifier_call = cpu_callback | 488 | .notifier_call = cpu_callback |
488 | }; | 489 | }; |
489 | 490 | ||
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void) | |||
495 | register_cpu_notifier(&cpu_nfb); | 496 | register_cpu_notifier(&cpu_nfb); |
496 | return 0; | 497 | return 0; |
497 | } | 498 | } |
499 | |||
500 | #ifdef CONFIG_SMP | ||
501 | /* | ||
502 | * Call a function on all processors | ||
503 | */ | ||
504 | int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) | ||
505 | { | ||
506 | int ret = 0; | ||
507 | |||
508 | preempt_disable(); | ||
509 | ret = smp_call_function(func, info, retry, wait); | ||
510 | local_irq_disable(); | ||
511 | func(info); | ||
512 | local_irq_enable(); | ||
513 | preempt_enable(); | ||
514 | return ret; | ||
515 | } | ||
516 | EXPORT_SYMBOL(on_each_cpu); | ||
517 | #endif | ||
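
A usage sketch for the on_each_cpu() helper added above; the per-CPU counter and the callback are hypothetical, only the helper itself comes from this patch.

    #include <linux/percpu.h>
    #include <linux/smp.h>

    static DEFINE_PER_CPU(unsigned long, sketch_ticks);

    static void sketch_bump(void *info)
    {
        /* runs on every CPU with local interrupts disabled */
        __get_cpu_var(sketch_ticks)++;
    }

    static void sketch_bump_all_cpus(void)
    {
        /* retry = 0, wait = 1: return only after every CPU has run the callback */
        on_each_cpu(sketch_bump, NULL, 0, 1);
    }
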
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index c67189a25d..14c7faf029 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -1,12 +1,11 @@ | |||
1 | /* | 1 | /* |
2 | * Detect Soft Lockups | 2 | * Detect Soft Lockups |
3 | * | 3 | * |
4 | * started by Ingo Molnar, (C) 2005, Red Hat | 4 | * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. |
5 | * | 5 | * |
6 | * this code detects soft lockups: incidents in where on a CPU | 6 | * this code detects soft lockups: incidents in where on a CPU |
7 | * the kernel does not reschedule for 10 seconds or more. | 7 | * the kernel does not reschedule for 10 seconds or more. |
8 | */ | 8 | */ |
9 | |||
10 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
11 | #include <linux/cpu.h> | 10 | #include <linux/cpu.h> |
12 | #include <linux/init.h> | 11 | #include <linux/init.h> |
@@ -17,13 +16,14 @@ | |||
17 | 16 | ||
18 | static DEFINE_SPINLOCK(print_lock); | 17 | static DEFINE_SPINLOCK(print_lock); |
19 | 18 | ||
20 | static DEFINE_PER_CPU(unsigned long, timestamp) = 0; | 19 | static DEFINE_PER_CPU(unsigned long, touch_timestamp); |
21 | static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; | 20 | static DEFINE_PER_CPU(unsigned long, print_timestamp); |
22 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | 21 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); |
23 | 22 | ||
24 | static int did_panic = 0; | 23 | static int did_panic = 0; |
25 | static int softlock_panic(struct notifier_block *this, unsigned long event, | 24 | |
26 | void *ptr) | 25 | static int |
26 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
27 | { | 27 | { |
28 | did_panic = 1; | 28 | did_panic = 1; |
29 | 29 | ||
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = { | |||
36 | 36 | ||
37 | void touch_softlockup_watchdog(void) | 37 | void touch_softlockup_watchdog(void) |
38 | { | 38 | { |
39 | per_cpu(timestamp, raw_smp_processor_id()) = jiffies; | 39 | per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; |
40 | } | 40 | } |
41 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 41 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
42 | 42 | ||
@@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog); | |||
44 | * This callback runs from the timer interrupt, and checks | 44 | * This callback runs from the timer interrupt, and checks |
45 | * whether the watchdog thread has hung or not: | 45 | * whether the watchdog thread has hung or not: |
46 | */ | 46 | */ |
47 | void softlockup_tick(struct pt_regs *regs) | 47 | void softlockup_tick(void) |
48 | { | 48 | { |
49 | int this_cpu = smp_processor_id(); | 49 | int this_cpu = smp_processor_id(); |
50 | unsigned long timestamp = per_cpu(timestamp, this_cpu); | 50 | unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); |
51 | 51 | ||
52 | if (per_cpu(print_timestamp, this_cpu) == timestamp) | 52 | /* prevent double reports: */ |
53 | if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || | ||
54 | did_panic || | ||
55 | !per_cpu(watchdog_task, this_cpu)) | ||
53 | return; | 56 | return; |
54 | 57 | ||
55 | /* Do not cause a second panic when there already was one */ | 58 | /* do not print during early bootup: */ |
56 | if (did_panic) | 59 | if (unlikely(system_state != SYSTEM_RUNNING)) { |
60 | touch_softlockup_watchdog(); | ||
57 | return; | 61 | return; |
62 | } | ||
58 | 63 | ||
59 | if (time_after(jiffies, timestamp + 10*HZ)) { | 64 | /* Wake up the high-prio watchdog task every second: */ |
60 | per_cpu(print_timestamp, this_cpu) = timestamp; | 65 | if (time_after(jiffies, touch_timestamp + HZ)) |
66 | wake_up_process(per_cpu(watchdog_task, this_cpu)); | ||
67 | |||
68 | /* Warn about unreasonable 10+ seconds delays: */ | ||
69 | if (time_after(jiffies, touch_timestamp + 10*HZ)) { | ||
70 | per_cpu(print_timestamp, this_cpu) = touch_timestamp; | ||
61 | 71 | ||
62 | spin_lock(&print_lock); | 72 | spin_lock(&print_lock); |
63 | printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", | 73 | printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", |
64 | this_cpu); | 74 | this_cpu); |
65 | show_regs(regs); | 75 | dump_stack(); |
66 | spin_unlock(&print_lock); | 76 | spin_unlock(&print_lock); |
67 | } | 77 | } |
68 | } | 78 | } |
@@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu) | |||
77 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 87 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
78 | current->flags |= PF_NOFREEZE; | 88 | current->flags |= PF_NOFREEZE; |
79 | 89 | ||
80 | set_current_state(TASK_INTERRUPTIBLE); | ||
81 | |||
82 | /* | 90 | /* |
83 | * Run briefly once per second - if this gets delayed for | 91 | * Run briefly once per second to reset the softlockup timestamp. |
84 | * more than 10 seconds then the debug-printout triggers | 92 | * If this gets delayed for more than 10 seconds then the |
85 | * in softlockup_tick(): | 93 | * debug-printout triggers in softlockup_tick(). |
86 | */ | 94 | */ |
87 | while (!kthread_should_stop()) { | 95 | while (!kthread_should_stop()) { |
88 | msleep_interruptible(1000); | 96 | set_current_state(TASK_INTERRUPTIBLE); |
89 | touch_softlockup_watchdog(); | 97 | touch_softlockup_watchdog(); |
98 | schedule(); | ||
90 | } | 99 | } |
91 | __set_current_state(TASK_RUNNING); | ||
92 | 100 | ||
93 | return 0; | 101 | return 0; |
94 | } | 102 | } |
@@ -96,7 +104,7 @@ static int watchdog(void * __bind_cpu) | |||
96 | /* | 104 | /* |
97 | * Create/destroy watchdog threads as CPUs come and go: | 105 | * Create/destroy watchdog threads as CPUs come and go: |
98 | */ | 106 | */ |
99 | static int __devinit | 107 | static int |
100 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
101 | { | 109 | { |
102 | int hotcpu = (unsigned long)hcpu; | 110 | int hotcpu = (unsigned long)hcpu; |
@@ -110,11 +118,11 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
110 | printk("watchdog for %i failed\n", hotcpu); | 118 | printk("watchdog for %i failed\n", hotcpu); |
111 | return NOTIFY_BAD; | 119 | return NOTIFY_BAD; |
112 | } | 120 | } |
121 | per_cpu(touch_timestamp, hotcpu) = jiffies; | ||
113 | per_cpu(watchdog_task, hotcpu) = p; | 122 | per_cpu(watchdog_task, hotcpu) = p; |
114 | kthread_bind(p, hotcpu); | 123 | kthread_bind(p, hotcpu); |
115 | break; | 124 | break; |
116 | case CPU_ONLINE: | 125 | case CPU_ONLINE: |
117 | |||
118 | wake_up_process(per_cpu(watchdog_task, hotcpu)); | 126 | wake_up_process(per_cpu(watchdog_task, hotcpu)); |
119 | break; | 127 | break; |
120 | #ifdef CONFIG_HOTPLUG_CPU | 128 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -132,7 +140,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
132 | return NOTIFY_OK; | 140 | return NOTIFY_OK; |
133 | } | 141 | } |
134 | 142 | ||
135 | static struct notifier_block __devinitdata cpu_nfb = { | 143 | static struct notifier_block cpu_nfb = { |
136 | .notifier_call = cpu_callback | 144 | .notifier_call = cpu_callback |
137 | }; | 145 | }; |
138 | 146 | ||
@@ -144,6 +152,5 @@ __init void spawn_softlockup_task(void) | |||
144 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 152 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
145 | register_cpu_notifier(&cpu_nfb); | 153 | register_cpu_notifier(&cpu_nfb); |
146 | 154 | ||
147 | notifier_chain_register(&panic_notifier_list, &panic_block); | 155 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); |
148 | } | 156 | } |
149 | |||
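
For context on touch_softlockup_watchdog(), which the rework above keys off the per-CPU touch_timestamp: a long busy-wait that legitimately holds a CPU can pet the watchdog so softlockup_tick() does not warn after 10 seconds. The wait loop and the ready() callback here are hypothetical.

    #include <linux/sched.h>
    #include <asm/processor.h>

    static void sketch_wait_for_hardware(int (*ready)(void))
    {
        while (!ready()) {
            cpu_relax();
            touch_softlockup_watchdog();    /* refresh this CPU's touch_timestamp */
        }
    }
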
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0375fcd592..d1b810782b 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock); | |||
179 | #define BUILD_LOCK_OPS(op, locktype) \ | 179 | #define BUILD_LOCK_OPS(op, locktype) \ |
180 | void __lockfunc _##op##_lock(locktype##_t *lock) \ | 180 | void __lockfunc _##op##_lock(locktype##_t *lock) \ |
181 | { \ | 181 | { \ |
182 | preempt_disable(); \ | ||
183 | for (;;) { \ | 182 | for (;;) { \ |
183 | preempt_disable(); \ | ||
184 | if (likely(_raw_##op##_trylock(lock))) \ | 184 | if (likely(_raw_##op##_trylock(lock))) \ |
185 | break; \ | 185 | break; \ |
186 | preempt_enable(); \ | 186 | preempt_enable(); \ |
187 | \ | ||
187 | if (!(lock)->break_lock) \ | 188 | if (!(lock)->break_lock) \ |
188 | (lock)->break_lock = 1; \ | 189 | (lock)->break_lock = 1; \ |
189 | while (!op##_can_lock(lock) && (lock)->break_lock) \ | 190 | while (!op##_can_lock(lock) && (lock)->break_lock) \ |
190 | cpu_relax(); \ | 191 | cpu_relax(); \ |
191 | preempt_disable(); \ | ||
192 | } \ | 192 | } \ |
193 | (lock)->break_lock = 0; \ | 193 | (lock)->break_lock = 0; \ |
194 | } \ | 194 | } \ |
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ | |||
199 | { \ | 199 | { \ |
200 | unsigned long flags; \ | 200 | unsigned long flags; \ |
201 | \ | 201 | \ |
202 | preempt_disable(); \ | ||
203 | for (;;) { \ | 202 | for (;;) { \ |
203 | preempt_disable(); \ | ||
204 | local_irq_save(flags); \ | 204 | local_irq_save(flags); \ |
205 | if (likely(_raw_##op##_trylock(lock))) \ | 205 | if (likely(_raw_##op##_trylock(lock))) \ |
206 | break; \ | 206 | break; \ |
207 | local_irq_restore(flags); \ | 207 | local_irq_restore(flags); \ |
208 | \ | ||
209 | preempt_enable(); \ | 208 | preempt_enable(); \ |
209 | \ | ||
210 | if (!(lock)->break_lock) \ | 210 | if (!(lock)->break_lock) \ |
211 | (lock)->break_lock = 1; \ | 211 | (lock)->break_lock = 1; \ |
212 | while (!op##_can_lock(lock) && (lock)->break_lock) \ | 212 | while (!op##_can_lock(lock) && (lock)->break_lock) \ |
213 | cpu_relax(); \ | 213 | cpu_relax(); \ |
214 | preempt_disable(); \ | ||
215 | } \ | 214 | } \ |
216 | (lock)->break_lock = 0; \ | 215 | (lock)->break_lock = 0; \ |
217 | return flags; \ | 216 | return flags; \ |
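
To make the BUILD_LOCK_OPS() change above easier to read, here is the generated spin variant written out by hand (the same code the macro expands to for op=spin under CONFIG_PREEMPT && CONFIG_SMP): preemption is now re-enabled while spinning on break_lock and disabled again only around each trylock attempt.

    void __lockfunc _spin_lock(spinlock_t *lock)
    {
        for (;;) {
            preempt_disable();
            if (likely(_raw_spin_trylock(lock)))
                break;
            preempt_enable();

            if (!(lock)->break_lock)
                (lock)->break_lock = 1;
            while (!spin_can_lock(lock) && (lock)->break_lock)
                cpu_relax();
        }
        (lock)->break_lock = 0;
    }
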
diff --git a/kernel/sys.c b/kernel/sys.c index f91218a546..0b6ec0e793 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -95,99 +95,304 @@ int cad_pid = 1; | |||
95 | * and the like. | 95 | * and the like. |
96 | */ | 96 | */ |
97 | 97 | ||
98 | static struct notifier_block *reboot_notifier_list; | 98 | static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); |
99 | static DEFINE_RWLOCK(notifier_lock); | 99 | |
100 | /* | ||
101 | * Notifier chain core routines. The exported routines below | ||
102 | * are layered on top of these, with appropriate locking added. | ||
103 | */ | ||
104 | |||
105 | static int notifier_chain_register(struct notifier_block **nl, | ||
106 | struct notifier_block *n) | ||
107 | { | ||
108 | while ((*nl) != NULL) { | ||
109 | if (n->priority > (*nl)->priority) | ||
110 | break; | ||
111 | nl = &((*nl)->next); | ||
112 | } | ||
113 | n->next = *nl; | ||
114 | rcu_assign_pointer(*nl, n); | ||
115 | return 0; | ||
116 | } | ||
117 | |||
118 | static int notifier_chain_unregister(struct notifier_block **nl, | ||
119 | struct notifier_block *n) | ||
120 | { | ||
121 | while ((*nl) != NULL) { | ||
122 | if ((*nl) == n) { | ||
123 | rcu_assign_pointer(*nl, n->next); | ||
124 | return 0; | ||
125 | } | ||
126 | nl = &((*nl)->next); | ||
127 | } | ||
128 | return -ENOENT; | ||
129 | } | ||
130 | |||
131 | static int __kprobes notifier_call_chain(struct notifier_block **nl, | ||
132 | unsigned long val, void *v) | ||
133 | { | ||
134 | int ret = NOTIFY_DONE; | ||
135 | struct notifier_block *nb; | ||
136 | |||
137 | nb = rcu_dereference(*nl); | ||
138 | while (nb) { | ||
139 | ret = nb->notifier_call(nb, val, v); | ||
140 | if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) | ||
141 | break; | ||
142 | nb = rcu_dereference(nb->next); | ||
143 | } | ||
144 | return ret; | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * Atomic notifier chain routines. Registration and unregistration | ||
149 | * use a spinlock, and call_chain is synchronized by RCU (no locks). | ||
150 | */ | ||
100 | 151 | ||
101 | /** | 152 | /** |
102 | * notifier_chain_register - Add notifier to a notifier chain | 153 | * atomic_notifier_chain_register - Add notifier to an atomic notifier chain |
103 | * @list: Pointer to root list pointer | 154 | * @nh: Pointer to head of the atomic notifier chain |
104 | * @n: New entry in notifier chain | 155 | * @n: New entry in notifier chain |
105 | * | 156 | * |
106 | * Adds a notifier to a notifier chain. | 157 | * Adds a notifier to an atomic notifier chain. |
107 | * | 158 | * |
108 | * Currently always returns zero. | 159 | * Currently always returns zero. |
109 | */ | 160 | */ |
161 | |||
162 | int atomic_notifier_chain_register(struct atomic_notifier_head *nh, | ||
163 | struct notifier_block *n) | ||
164 | { | ||
165 | unsigned long flags; | ||
166 | int ret; | ||
167 | |||
168 | spin_lock_irqsave(&nh->lock, flags); | ||
169 | ret = notifier_chain_register(&nh->head, n); | ||
170 | spin_unlock_irqrestore(&nh->lock, flags); | ||
171 | return ret; | ||
172 | } | ||
173 | |||
174 | EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); | ||
175 | |||
176 | /** | ||
177 | * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain | ||
178 | * @nh: Pointer to head of the atomic notifier chain | ||
179 | * @n: Entry to remove from notifier chain | ||
180 | * | ||
181 | * Removes a notifier from an atomic notifier chain. | ||
182 | * | ||
183 | * Returns zero on success or %-ENOENT on failure. | ||
184 | */ | ||
185 | int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, | ||
186 | struct notifier_block *n) | ||
187 | { | ||
188 | unsigned long flags; | ||
189 | int ret; | ||
190 | |||
191 | spin_lock_irqsave(&nh->lock, flags); | ||
192 | ret = notifier_chain_unregister(&nh->head, n); | ||
193 | spin_unlock_irqrestore(&nh->lock, flags); | ||
194 | synchronize_rcu(); | ||
195 | return ret; | ||
196 | } | ||
197 | |||
198 | EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); | ||
199 | |||
200 | /** | ||
201 | * atomic_notifier_call_chain - Call functions in an atomic notifier chain | ||
202 | * @nh: Pointer to head of the atomic notifier chain | ||
203 | * @val: Value passed unmodified to notifier function | ||
204 | * @v: Pointer passed unmodified to notifier function | ||
205 | * | ||
206 | * Calls each function in a notifier chain in turn. The functions | ||
207 | * run in an atomic context, so they must not block. | ||
208 | * This routine uses RCU to synchronize with changes to the chain. | ||
209 | * | ||
210 | * If the return value of the notifier can be and'ed | ||
211 | * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain | ||
212 | * will return immediately, with the return value of | ||
213 | * the notifier function which halted execution. | ||
214 | * Otherwise the return value is the return value | ||
215 | * of the last notifier function called. | ||
216 | */ | ||
110 | 217 | ||
111 | int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) | 218 | int atomic_notifier_call_chain(struct atomic_notifier_head *nh, |
219 | unsigned long val, void *v) | ||
112 | { | 220 | { |
113 | write_lock(¬ifier_lock); | 221 | int ret; |
114 | while(*list) | 222 | |
115 | { | 223 | rcu_read_lock(); |
116 | if(n->priority > (*list)->priority) | 224 | ret = notifier_call_chain(&nh->head, val, v); |
117 | break; | 225 | rcu_read_unlock(); |
118 | list= &((*list)->next); | 226 | return ret; |
119 | } | ||
120 | n->next = *list; | ||
121 | *list=n; | ||
122 | write_unlock(¬ifier_lock); | ||
123 | return 0; | ||
124 | } | 227 | } |
125 | 228 | ||
126 | EXPORT_SYMBOL(notifier_chain_register); | 229 | EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); |
230 | |||
231 | /* | ||
232 | * Blocking notifier chain routines. All access to the chain is | ||
233 | * synchronized by an rwsem. | ||
234 | */ | ||
127 | 235 | ||
128 | /** | 236 | /** |
129 | * notifier_chain_unregister - Remove notifier from a notifier chain | 237 | * blocking_notifier_chain_register - Add notifier to a blocking notifier chain |
130 | * @nl: Pointer to root list pointer | 238 | * @nh: Pointer to head of the blocking notifier chain |
131 | * @n: New entry in notifier chain | 239 | * @n: New entry in notifier chain |
132 | * | 240 | * |
133 | * Removes a notifier from a notifier chain. | 241 | * Adds a notifier to a blocking notifier chain. |
242 | * Must be called in process context. | ||
134 | * | 243 | * |
135 | * Returns zero on success, or %-ENOENT on failure. | 244 | * Currently always returns zero. |
136 | */ | 245 | */ |
137 | 246 | ||
138 | int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) | 247 | int blocking_notifier_chain_register(struct blocking_notifier_head *nh, |
248 | struct notifier_block *n) | ||
139 | { | 249 | { |
140 | write_lock(¬ifier_lock); | 250 | int ret; |
141 | while((*nl)!=NULL) | 251 | |
142 | { | 252 | /* |
143 | if((*nl)==n) | 253 | * This code gets used during boot-up, when task switching is |
144 | { | 254 | * not yet working and interrupts must remain disabled. At |
145 | *nl=n->next; | 255 | * such times we must not call down_write(). |
146 | write_unlock(¬ifier_lock); | 256 | */ |
147 | return 0; | 257 | if (unlikely(system_state == SYSTEM_BOOTING)) |
148 | } | 258 | return notifier_chain_register(&nh->head, n); |
149 | nl=&((*nl)->next); | 259 | |
150 | } | 260 | down_write(&nh->rwsem); |
151 | write_unlock(¬ifier_lock); | 261 | ret = notifier_chain_register(&nh->head, n); |
152 | return -ENOENT; | 262 | up_write(&nh->rwsem); |
263 | return ret; | ||
264 | } | ||
265 | |||
266 | EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); | ||
267 | |||
268 | /** | ||
269 | * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain | ||
270 | * @nh: Pointer to head of the blocking notifier chain | ||
271 | * @n: Entry to remove from notifier chain | ||
272 | * | ||
273 | * Removes a notifier from a blocking notifier chain. | ||
274 | * Must be called from process context. | ||
275 | * | ||
276 | * Returns zero on success or %-ENOENT on failure. | ||
277 | */ | ||
278 | int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, | ||
279 | struct notifier_block *n) | ||
280 | { | ||
281 | int ret; | ||
282 | |||
283 | /* | ||
284 | * This code gets used during boot-up, when task switching is | ||
285 | * not yet working and interrupts must remain disabled. At | ||
286 | * such times we must not call down_write(). | ||
287 | */ | ||
288 | if (unlikely(system_state == SYSTEM_BOOTING)) | ||
289 | return notifier_chain_unregister(&nh->head, n); | ||
290 | |||
291 | down_write(&nh->rwsem); | ||
292 | ret = notifier_chain_unregister(&nh->head, n); | ||
293 | up_write(&nh->rwsem); | ||
294 | return ret; | ||
153 | } | 295 | } |
154 | 296 | ||
155 | EXPORT_SYMBOL(notifier_chain_unregister); | 297 | EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); |
156 | 298 | ||
157 | /** | 299 | /** |
158 | * notifier_call_chain - Call functions in a notifier chain | 300 | * blocking_notifier_call_chain - Call functions in a blocking notifier chain |
159 | * @n: Pointer to root pointer of notifier chain | 301 | * @nh: Pointer to head of the blocking notifier chain |
160 | * @val: Value passed unmodified to notifier function | 302 | * @val: Value passed unmodified to notifier function |
161 | * @v: Pointer passed unmodified to notifier function | 303 | * @v: Pointer passed unmodified to notifier function |
162 | * | 304 | * |
163 | * Calls each function in a notifier chain in turn. | 305 | * Calls each function in a notifier chain in turn. The functions |
306 | * run in a process context, so they are allowed to block. | ||
164 | * | 307 | * |
165 | * If the return value of the notifier can be and'd | 308 | * If the return value of the notifier can be and'ed |
166 | * with %NOTIFY_STOP_MASK, then notifier_call_chain | 309 | * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain |
167 | * will return immediately, with the return value of | 310 | * will return immediately, with the return value of |
168 | * the notifier function which halted execution. | 311 | * the notifier function which halted execution. |
169 | * Otherwise, the return value is the return value | 312 | * Otherwise the return value is the return value |
170 | * of the last notifier function called. | 313 | * of the last notifier function called. |
171 | */ | 314 | */ |
172 | 315 | ||
173 | int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) | 316 | int blocking_notifier_call_chain(struct blocking_notifier_head *nh, |
317 | unsigned long val, void *v) | ||
174 | { | 318 | { |
175 | int ret=NOTIFY_DONE; | 319 | int ret; |
176 | struct notifier_block *nb = *n; | ||
177 | 320 | ||
178 | while(nb) | 321 | down_read(&nh->rwsem); |
179 | { | 322 | ret = notifier_call_chain(&nh->head, val, v); |
180 | ret=nb->notifier_call(nb,val,v); | 323 | up_read(&nh->rwsem); |
181 | if(ret&NOTIFY_STOP_MASK) | ||
182 | { | ||
183 | return ret; | ||
184 | } | ||
185 | nb=nb->next; | ||
186 | } | ||
187 | return ret; | 324 | return ret; |
188 | } | 325 | } |
189 | 326 | ||
190 | EXPORT_SYMBOL(notifier_call_chain); | 327 | EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); |
328 | |||
329 | /* | ||
330 | * Raw notifier chain routines. There is no protection; | ||
331 | * the caller must provide it. Use at your own risk! | ||
332 | */ | ||
333 | |||
334 | /** | ||
335 | * raw_notifier_chain_register - Add notifier to a raw notifier chain | ||
336 | * @nh: Pointer to head of the raw notifier chain | ||
337 | * @n: New entry in notifier chain | ||
338 | * | ||
339 | * Adds a notifier to a raw notifier chain. | ||
340 | * All locking must be provided by the caller. | ||
341 | * | ||
342 | * Currently always returns zero. | ||
343 | */ | ||
344 | |||
345 | int raw_notifier_chain_register(struct raw_notifier_head *nh, | ||
346 | struct notifier_block *n) | ||
347 | { | ||
348 | return notifier_chain_register(&nh->head, n); | ||
349 | } | ||
350 | |||
351 | EXPORT_SYMBOL_GPL(raw_notifier_chain_register); | ||
352 | |||
353 | /** | ||
354 | * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain | ||
355 | * @nh: Pointer to head of the raw notifier chain | ||
356 | * @n: Entry to remove from notifier chain | ||
357 | * | ||
358 | * Removes a notifier from a raw notifier chain. | ||
359 | * All locking must be provided by the caller. | ||
360 | * | ||
361 | * Returns zero on success or %-ENOENT on failure. | ||
362 | */ | ||
363 | int raw_notifier_chain_unregister(struct raw_notifier_head *nh, | ||
364 | struct notifier_block *n) | ||
365 | { | ||
366 | return notifier_chain_unregister(&nh->head, n); | ||
367 | } | ||
368 | |||
369 | EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); | ||
370 | |||
371 | /** | ||
372 | * raw_notifier_call_chain - Call functions in a raw notifier chain | ||
373 | * @nh: Pointer to head of the raw notifier chain | ||
374 | * @val: Value passed unmodified to notifier function | ||
375 | * @v: Pointer passed unmodified to notifier function | ||
376 | * | ||
377 | * Calls each function in a notifier chain in turn. The functions | ||
378 | * run in an undefined context. | ||
379 | * All locking must be provided by the caller. | ||
380 | * | ||
381 | * If the return value of the notifier can be and'ed | ||
382 | * with %NOTIFY_STOP_MASK then raw_notifier_call_chain | ||
383 | * will return immediately, with the return value of | ||
384 | * the notifier function which halted execution. | ||
385 | * Otherwise the return value is the return value | ||
386 | * of the last notifier function called. | ||
387 | */ | ||
388 | |||
389 | int raw_notifier_call_chain(struct raw_notifier_head *nh, | ||
390 | unsigned long val, void *v) | ||
391 | { | ||
392 | return notifier_call_chain(&nh->head, val, v); | ||
393 | } | ||
394 | |||
395 | EXPORT_SYMBOL_GPL(raw_notifier_call_chain); | ||
191 | 396 | ||
192 | /** | 397 | /** |
193 | * register_reboot_notifier - Register function to be called at reboot time | 398 | * register_reboot_notifier - Register function to be called at reboot time |
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain); | |||
196 | * Registers a function with the list of functions | 401 | * Registers a function with the list of functions |
197 | * to be called at reboot time. | 402 | * to be called at reboot time. |
198 | * | 403 | * |
199 | * Currently always returns zero, as notifier_chain_register | 404 | * Currently always returns zero, as blocking_notifier_chain_register |
200 | * always returns zero. | 405 | * always returns zero. |
201 | */ | 406 | */ |
202 | 407 | ||
203 | int register_reboot_notifier(struct notifier_block * nb) | 408 | int register_reboot_notifier(struct notifier_block * nb) |
204 | { | 409 | { |
205 | return notifier_chain_register(&reboot_notifier_list, nb); | 410 | return blocking_notifier_chain_register(&reboot_notifier_list, nb); |
206 | } | 411 | } |
207 | 412 | ||
208 | EXPORT_SYMBOL(register_reboot_notifier); | 413 | EXPORT_SYMBOL(register_reboot_notifier); |
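The hunks above replace the old global notifier_chain_* API with atomic, blocking and raw variants, and turn reboot_notifier_list into a BLOCKING_NOTIFIER_HEAD. A minimal sketch of a client of the converted reboot chain follows; the example_* module and symbol names are invented for illustration, only the chain API itself comes from the patch:

/* Hypothetical client of the new blocking reboot chain. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

/* Runs in process context via blocking_notifier_call_chain(). */
static int example_reboot_event(struct notifier_block *nb,
                                unsigned long event, void *cmd)
{
        printk(KERN_INFO "example: reboot event %lu\n", event);
        return NOTIFY_DONE;     /* let the rest of the chain run */
}

static struct notifier_block example_reboot_nb = {
        .notifier_call = example_reboot_event,
        .priority      = 0,     /* chains are kept sorted by priority */
};

static int __init example_init(void)
{
        /* Wrapper around blocking_notifier_chain_register() above. */
        return register_reboot_notifier(&example_reboot_nb);
}

static void __exit example_exit(void)
{
        unregister_reboot_notifier(&example_reboot_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

A private chain would instead be declared with ATOMIC_NOTIFIER_HEAD(), BLOCKING_NOTIFIER_HEAD() or RAW_NOTIFIER_HEAD() and invoked with the matching *_notifier_call_chain() helper shown above.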
@@ -219,23 +424,11 @@ EXPORT_SYMBOL(register_reboot_notifier); | |||
219 | 424 | ||
220 | int unregister_reboot_notifier(struct notifier_block * nb) | 425 | int unregister_reboot_notifier(struct notifier_block * nb) |
221 | { | 426 | { |
222 | return notifier_chain_unregister(&reboot_notifier_list, nb); | 427 | return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); |
223 | } | 428 | } |
224 | 429 | ||
225 | EXPORT_SYMBOL(unregister_reboot_notifier); | 430 | EXPORT_SYMBOL(unregister_reboot_notifier); |
226 | 431 | ||
227 | #ifndef CONFIG_SECURITY | ||
228 | int capable(int cap) | ||
229 | { | ||
230 | if (cap_raised(current->cap_effective, cap)) { | ||
231 | current->flags |= PF_SUPERPRIV; | ||
232 | return 1; | ||
233 | } | ||
234 | return 0; | ||
235 | } | ||
236 | EXPORT_SYMBOL(capable); | ||
237 | #endif | ||
238 | |||
239 | static int set_one_prio(struct task_struct *p, int niceval, int error) | 432 | static int set_one_prio(struct task_struct *p, int niceval, int error) |
240 | { | 433 | { |
241 | int no_nice; | 434 | int no_nice; |
@@ -392,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart); | |||
392 | 585 | ||
393 | void kernel_restart_prepare(char *cmd) | 586 | void kernel_restart_prepare(char *cmd) |
394 | { | 587 | { |
395 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 588 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
396 | system_state = SYSTEM_RESTART; | 589 | system_state = SYSTEM_RESTART; |
397 | device_shutdown(); | 590 | device_shutdown(); |
398 | } | 591 | } |
@@ -442,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec); | |||
442 | 635 | ||
443 | void kernel_shutdown_prepare(enum system_states state) | 636 | void kernel_shutdown_prepare(enum system_states state) |
444 | { | 637 | { |
445 | notifier_call_chain(&reboot_notifier_list, | 638 | blocking_notifier_call_chain(&reboot_notifier_list, |
446 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); | 639 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); |
447 | system_state = state; | 640 | system_state = state; |
448 | device_shutdown(); | 641 | device_shutdown(); |
@@ -1009,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
1009 | */ | 1202 | */ |
1010 | if (tbuf) { | 1203 | if (tbuf) { |
1011 | struct tms tmp; | 1204 | struct tms tmp; |
1205 | struct task_struct *tsk = current; | ||
1206 | struct task_struct *t; | ||
1012 | cputime_t utime, stime, cutime, cstime; | 1207 | cputime_t utime, stime, cutime, cstime; |
1013 | 1208 | ||
1014 | #ifdef CONFIG_SMP | 1209 | spin_lock_irq(&tsk->sighand->siglock); |
1015 | if (thread_group_empty(current)) { | 1210 | utime = tsk->signal->utime; |
1016 | /* | 1211 | stime = tsk->signal->stime; |
1017 | * Single thread case without the use of any locks. | 1212 | t = tsk; |
1018 | * | 1213 | do { |
1019 | * We may race with release_task if two threads are | 1214 | utime = cputime_add(utime, t->utime); |
1020 | * executing. However, release task first adds up the | 1215 | stime = cputime_add(stime, t->stime); |
1021 | * counters (__exit_signal) before removing the task | 1216 | t = next_thread(t); |
1022 | * from the process tasklist (__unhash_process). | 1217 | } while (t != tsk); |
1023 | * __exit_signal also acquires and releases the | ||
1024 | * siglock which results in the proper memory ordering | ||
1025 | * so that the list modifications are always visible | ||
1026 | * after the counters have been updated. | ||
1027 | * | ||
1028 | * If the counters have been updated by the second thread | ||
1029 | * but the thread has not yet been removed from the list | ||
1030 | * then the other branch will be executing which will | ||
1031 | * block on tasklist_lock until the exit handling of the | ||
1032 | * other task is finished. | ||
1033 | * | ||
1034 | * This also implies that the sighand->siglock cannot | ||
1035 | * be held by another processor. So we can also | ||
1036 | * skip acquiring that lock. | ||
1037 | */ | ||
1038 | utime = cputime_add(current->signal->utime, current->utime); | ||
1039 | stime = cputime_add(current->signal->utime, current->stime); | ||
1040 | cutime = current->signal->cutime; | ||
1041 | cstime = current->signal->cstime; | ||
1042 | } else | ||
1043 | #endif | ||
1044 | { | ||
1045 | 1218 | ||
1046 | /* Process with multiple threads */ | 1219 | cutime = tsk->signal->cutime; |
1047 | struct task_struct *tsk = current; | 1220 | cstime = tsk->signal->cstime; |
1048 | struct task_struct *t; | 1221 | spin_unlock_irq(&tsk->sighand->siglock); |
1049 | 1222 | ||
1050 | read_lock(&tasklist_lock); | ||
1051 | utime = tsk->signal->utime; | ||
1052 | stime = tsk->signal->stime; | ||
1053 | t = tsk; | ||
1054 | do { | ||
1055 | utime = cputime_add(utime, t->utime); | ||
1056 | stime = cputime_add(stime, t->stime); | ||
1057 | t = next_thread(t); | ||
1058 | } while (t != tsk); | ||
1059 | |||
1060 | /* | ||
1061 | * While we have tasklist_lock read-locked, no dying thread | ||
1062 | * can be updating current->signal->[us]time. Instead, | ||
1063 | * we got their counts included in the live thread loop. | ||
1064 | * However, another thread can come in right now and | ||
1065 | * do a wait call that updates current->signal->c[us]time. | ||
1066 | * To make sure we always see that pair updated atomically, | ||
1067 | * we take the siglock around fetching them. | ||
1068 | */ | ||
1069 | spin_lock_irq(&tsk->sighand->siglock); | ||
1070 | cutime = tsk->signal->cutime; | ||
1071 | cstime = tsk->signal->cstime; | ||
1072 | spin_unlock_irq(&tsk->sighand->siglock); | ||
1073 | read_unlock(&tasklist_lock); | ||
1074 | } | ||
1075 | tmp.tms_utime = cputime_to_clock_t(utime); | 1223 | tmp.tms_utime = cputime_to_clock_t(utime); |
1076 | tmp.tms_stime = cputime_to_clock_t(stime); | 1224 | tmp.tms_stime = cputime_to_clock_t(stime); |
1077 | tmp.tms_cutime = cputime_to_clock_t(cutime); | 1225 | tmp.tms_cutime = cputime_to_clock_t(cutime); |
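The rewritten sys_times() above now always sums the thread group under siglock instead of special-casing single-threaded callers. The userspace contract is unchanged; for reference, a small consumer of the struct tms it fills in (ordinary userspace C, not from the commit):

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
        struct tms t;
        clock_t elapsed = times(&t);            /* wall clock, in clock ticks */
        long tck = sysconf(_SC_CLK_TCK);

        if (elapsed == (clock_t)-1 || tck <= 0)
                return 1;
        printf("user %.2fs system %.2fs (children: %.2fs/%.2fs)\n",
               (double)t.tms_utime / tck, (double)t.tms_stime / tck,
               (double)t.tms_cutime / tck, (double)t.tms_cstime / tck);
        return 0;
}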
@@ -1224,24 +1372,35 @@ asmlinkage long sys_getsid(pid_t pid) | |||
1224 | asmlinkage long sys_setsid(void) | 1372 | asmlinkage long sys_setsid(void) |
1225 | { | 1373 | { |
1226 | struct task_struct *group_leader = current->group_leader; | 1374 | struct task_struct *group_leader = current->group_leader; |
1227 | struct pid *pid; | 1375 | pid_t session; |
1228 | int err = -EPERM; | 1376 | int err = -EPERM; |
1229 | 1377 | ||
1230 | down(&tty_sem); | 1378 | mutex_lock(&tty_mutex); |
1231 | write_lock_irq(&tasklist_lock); | 1379 | write_lock_irq(&tasklist_lock); |
1232 | 1380 | ||
1233 | pid = find_pid(PIDTYPE_PGID, group_leader->pid); | 1381 | /* Fail if I am already a session leader */ |
1234 | if (pid) | 1382 | if (group_leader->signal->leader) |
1383 | goto out; | ||
1384 | |||
1385 | session = group_leader->pid; | ||
1386 | /* Fail if a process group id already exists that equals the | ||
1387 | * proposed session id. | ||
1388 | * | ||
1389 | * Don't check if session id == 1 because kernel threads use this | ||
1390 | * session id and so the check will always fail and make it so | ||
1391 | * init cannot successfully call setsid. | ||
1392 | */ | ||
1393 | if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session)) | ||
1235 | goto out; | 1394 | goto out; |
1236 | 1395 | ||
1237 | group_leader->signal->leader = 1; | 1396 | group_leader->signal->leader = 1; |
1238 | __set_special_pids(group_leader->pid, group_leader->pid); | 1397 | __set_special_pids(session, session); |
1239 | group_leader->signal->tty = NULL; | 1398 | group_leader->signal->tty = NULL; |
1240 | group_leader->signal->tty_old_pgrp = 0; | 1399 | group_leader->signal->tty_old_pgrp = 0; |
1241 | err = process_group(group_leader); | 1400 | err = process_group(group_leader); |
1242 | out: | 1401 | out: |
1243 | write_unlock_irq(&tasklist_lock); | 1402 | write_unlock_irq(&tasklist_lock); |
1244 | up(&tty_sem); | 1403 | mutex_unlock(&tty_mutex); |
1245 | return err; | 1404 | return err; |
1246 | } | 1405 | } |
1247 | 1406 | ||
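The reworked sys_setsid() fails explicitly when the caller is already a session leader or when an existing process group id collides with the proposed session id (with the documented exception for id 1 so init can call it). That is why the long-standing userspace pattern is fork-then-setsid, so the caller of setsid() is never a process group leader; an illustrative sketch, not part of the patch:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        pid_t pid = fork();

        if (pid < 0)
                return 1;
        if (pid > 0)
                exit(0);                /* parent exits, child continues */

        /* The child is not a process group leader, so this succeeds. */
        if (setsid() < 0) {
                perror("setsid");
                return 1;
        }
        printf("new session id: %d\n", (int)getsid(0));
        return 0;
}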
@@ -1375,7 +1534,7 @@ static void groups_sort(struct group_info *group_info) | |||
1375 | /* a simple bsearch */ | 1534 | /* a simple bsearch */ |
1376 | int groups_search(struct group_info *group_info, gid_t grp) | 1535 | int groups_search(struct group_info *group_info, gid_t grp) |
1377 | { | 1536 | { |
1378 | int left, right; | 1537 | unsigned int left, right; |
1379 | 1538 | ||
1380 | if (!group_info) | 1539 | if (!group_info) |
1381 | return 0; | 1540 | return 0; |
@@ -1383,7 +1542,7 @@ int groups_search(struct group_info *group_info, gid_t grp) | |||
1383 | left = 0; | 1542 | left = 0; |
1384 | right = group_info->ngroups; | 1543 | right = group_info->ngroups; |
1385 | while (left < right) { | 1544 | while (left < right) { |
1386 | int mid = (left+right)/2; | 1545 | unsigned int mid = (left+right)/2; |
1387 | int cmp = grp - GROUP_AT(group_info, mid); | 1546 | int cmp = grp - GROUP_AT(group_info, mid); |
1388 | if (cmp > 0) | 1547 | if (cmp > 0) |
1389 | left = mid + 1; | 1548 | left = mid + 1; |
@@ -1433,7 +1592,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) | |||
1433 | return -EINVAL; | 1592 | return -EINVAL; |
1434 | 1593 | ||
1435 | /* no need to grab task_lock here; it cannot change */ | 1594 | /* no need to grab task_lock here; it cannot change */ |
1436 | get_group_info(current->group_info); | ||
1437 | i = current->group_info->ngroups; | 1595 | i = current->group_info->ngroups; |
1438 | if (gidsetsize) { | 1596 | if (gidsetsize) { |
1439 | if (i > gidsetsize) { | 1597 | if (i > gidsetsize) { |
@@ -1446,7 +1604,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) | |||
1446 | } | 1604 | } |
1447 | } | 1605 | } |
1448 | out: | 1606 | out: |
1449 | put_group_info(current->group_info); | ||
1450 | return i; | 1607 | return i; |
1451 | } | 1608 | } |
1452 | 1609 | ||
@@ -1487,9 +1644,7 @@ int in_group_p(gid_t grp) | |||
1487 | { | 1644 | { |
1488 | int retval = 1; | 1645 | int retval = 1; |
1489 | if (grp != current->fsgid) { | 1646 | if (grp != current->fsgid) { |
1490 | get_group_info(current->group_info); | ||
1491 | retval = groups_search(current->group_info, grp); | 1647 | retval = groups_search(current->group_info, grp); |
1492 | put_group_info(current->group_info); | ||
1493 | } | 1648 | } |
1494 | return retval; | 1649 | return retval; |
1495 | } | 1650 | } |
@@ -1500,9 +1655,7 @@ int in_egroup_p(gid_t grp) | |||
1500 | { | 1655 | { |
1501 | int retval = 1; | 1656 | int retval = 1; |
1502 | if (grp != current->egid) { | 1657 | if (grp != current->egid) { |
1503 | get_group_info(current->group_info); | ||
1504 | retval = groups_search(current->group_info, grp); | 1658 | retval = groups_search(current->group_info, grp); |
1505 | put_group_info(current->group_info); | ||
1506 | } | 1659 | } |
1507 | return retval; | 1660 | return retval; |
1508 | } | 1661 | } |
@@ -1630,20 +1783,21 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r | |||
1630 | asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | 1783 | asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) |
1631 | { | 1784 | { |
1632 | struct rlimit new_rlim, *old_rlim; | 1785 | struct rlimit new_rlim, *old_rlim; |
1786 | unsigned long it_prof_secs; | ||
1633 | int retval; | 1787 | int retval; |
1634 | 1788 | ||
1635 | if (resource >= RLIM_NLIMITS) | 1789 | if (resource >= RLIM_NLIMITS) |
1636 | return -EINVAL; | 1790 | return -EINVAL; |
1637 | if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) | 1791 | if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) |
1638 | return -EFAULT; | 1792 | return -EFAULT; |
1639 | if (new_rlim.rlim_cur > new_rlim.rlim_max) | 1793 | if (new_rlim.rlim_cur > new_rlim.rlim_max) |
1640 | return -EINVAL; | 1794 | return -EINVAL; |
1641 | old_rlim = current->signal->rlim + resource; | 1795 | old_rlim = current->signal->rlim + resource; |
1642 | if ((new_rlim.rlim_max > old_rlim->rlim_max) && | 1796 | if ((new_rlim.rlim_max > old_rlim->rlim_max) && |
1643 | !capable(CAP_SYS_RESOURCE)) | 1797 | !capable(CAP_SYS_RESOURCE)) |
1644 | return -EPERM; | 1798 | return -EPERM; |
1645 | if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) | 1799 | if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) |
1646 | return -EPERM; | 1800 | return -EPERM; |
1647 | 1801 | ||
1648 | retval = security_task_setrlimit(resource, &new_rlim); | 1802 | retval = security_task_setrlimit(resource, &new_rlim); |
1649 | if (retval) | 1803 | if (retval) |
@@ -1653,19 +1807,40 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | |||
1653 | *old_rlim = new_rlim; | 1807 | *old_rlim = new_rlim; |
1654 | task_unlock(current->group_leader); | 1808 | task_unlock(current->group_leader); |
1655 | 1809 | ||
1656 | if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && | 1810 | if (resource != RLIMIT_CPU) |
1657 | (cputime_eq(current->signal->it_prof_expires, cputime_zero) || | 1811 | goto out; |
1658 | new_rlim.rlim_cur <= cputime_to_secs( | 1812 | |
1659 | current->signal->it_prof_expires))) { | 1813 | /* |
1660 | cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); | 1814 | * RLIMIT_CPU handling. Note that the kernel fails to return an error |
1815 | * code if it rejected the user's attempt to set RLIMIT_CPU. This is a | ||
1816 | * very long-standing error, and fixing it now risks breakage of | ||
1817 | * applications, so we live with it | ||
1818 | */ | ||
1819 | if (new_rlim.rlim_cur == RLIM_INFINITY) | ||
1820 | goto out; | ||
1821 | |||
1822 | it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); | ||
1823 | if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { | ||
1824 | unsigned long rlim_cur = new_rlim.rlim_cur; | ||
1825 | cputime_t cputime; | ||
1826 | |||
1827 | if (rlim_cur == 0) { | ||
1828 | /* | ||
1829 | * The caller is asking for an immediate RLIMIT_CPU | ||
1830 | * expiry. But we use the zero value to mean "it was | ||
1831 | * never set". So let's cheat and make it one second | ||
1832 | * instead | ||
1833 | */ | ||
1834 | rlim_cur = 1; | ||
1835 | } | ||
1836 | cputime = secs_to_cputime(rlim_cur); | ||
1661 | read_lock(&tasklist_lock); | 1837 | read_lock(&tasklist_lock); |
1662 | spin_lock_irq(¤t->sighand->siglock); | 1838 | spin_lock_irq(¤t->sighand->siglock); |
1663 | set_process_cpu_timer(current, CPUCLOCK_PROF, | 1839 | set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); |
1664 | &cputime, NULL); | ||
1665 | spin_unlock_irq(¤t->sighand->siglock); | 1840 | spin_unlock_irq(¤t->sighand->siglock); |
1666 | read_unlock(&tasklist_lock); | 1841 | read_unlock(&tasklist_lock); |
1667 | } | 1842 | } |
1668 | 1843 | out: | |
1669 | return 0; | 1844 | return 0; |
1670 | } | 1845 | } |
1671 | 1846 | ||
@@ -1677,9 +1852,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | |||
1677 | * a lot simpler! (Which we're not doing right now because we're not | 1852 | * a lot simpler! (Which we're not doing right now because we're not |
1678 | * measuring them yet). | 1853 | * measuring them yet). |
1679 | * | 1854 | * |
1680 | * This expects to be called with tasklist_lock read-locked or better, | ||
1681 | * and the siglock not locked. It may momentarily take the siglock. | ||
1682 | * | ||
1683 | * When sampling multiple threads for RUSAGE_SELF, under SMP we might have | 1855 | * When sampling multiple threads for RUSAGE_SELF, under SMP we might have |
1684 | * races with threads incrementing their own counters. But since word | 1856 | * races with threads incrementing their own counters. But since word |
1685 | * reads are atomic, we either get new values or old values and we don't | 1857 | * reads are atomic, we either get new values or old values and we don't |
@@ -1687,6 +1859,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | |||
1687 | * the c* fields from p->signal from races with exit.c updating those | 1859 | * the c* fields from p->signal from races with exit.c updating those |
1688 | * fields when reaping, so a sample either gets all the additions of a | 1860 | * fields when reaping, so a sample either gets all the additions of a |
1689 | * given child after it's reaped, or none so this sample is before reaping. | 1861 | * given child after it's reaped, or none so this sample is before reaping. |
1862 | * | ||
1863 | * tasklist_lock locking optimisation: | ||
1864 | * If we are current and single threaded, we do not need to take the tasklist | ||
1865 | * lock or the siglock. No one else can take our signal_struct away, | ||
1866 | * no one else can reap the children to update signal->c* counters, and | ||
1867 | * no one else can race with the signal-> fields. | ||
1868 | * If we do not take the tasklist_lock, the signal-> fields could be read | ||
1869 | * out of order while another thread was just exiting. So we place a | ||
1870 | * read memory barrier when we avoid the lock. On the writer side, | ||
1871 | * write memory barrier is implied in __exit_signal as __exit_signal releases | ||
1872 | * the siglock spinlock after updating the signal-> fields. | ||
1873 | * | ||
1874 | * We don't really need the siglock when we access the non c* fields | ||
1875 | * of the signal_struct (for RUSAGE_SELF) even in multithreaded | ||
1876 | * case, since we take the tasklist lock for read and the non c* signal-> | ||
1877 | * fields are updated only in __exit_signal, which is called with | ||
1878 | * tasklist_lock taken for write, hence these two threads cannot execute | ||
1879 | * concurrently. | ||
1880 | * | ||
1690 | */ | 1881 | */ |
1691 | 1882 | ||
1692 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | 1883 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) |
@@ -1694,13 +1885,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1694 | struct task_struct *t; | 1885 | struct task_struct *t; |
1695 | unsigned long flags; | 1886 | unsigned long flags; |
1696 | cputime_t utime, stime; | 1887 | cputime_t utime, stime; |
1888 | int need_lock = 0; | ||
1697 | 1889 | ||
1698 | memset((char *) r, 0, sizeof *r); | 1890 | memset((char *) r, 0, sizeof *r); |
1891 | utime = stime = cputime_zero; | ||
1699 | 1892 | ||
1700 | if (unlikely(!p->signal)) | 1893 | if (p != current || !thread_group_empty(p)) |
1701 | return; | 1894 | need_lock = 1; |
1702 | 1895 | ||
1703 | utime = stime = cputime_zero; | 1896 | if (need_lock) { |
1897 | read_lock(&tasklist_lock); | ||
1898 | if (unlikely(!p->signal)) { | ||
1899 | read_unlock(&tasklist_lock); | ||
1900 | return; | ||
1901 | } | ||
1902 | } else | ||
1903 | /* See locking comments above */ | ||
1904 | smp_rmb(); | ||
1704 | 1905 | ||
1705 | switch (who) { | 1906 | switch (who) { |
1706 | case RUSAGE_BOTH: | 1907 | case RUSAGE_BOTH: |
@@ -1740,6 +1941,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1740 | BUG(); | 1941 | BUG(); |
1741 | } | 1942 | } |
1742 | 1943 | ||
1944 | if (need_lock) | ||
1945 | read_unlock(&tasklist_lock); | ||
1743 | cputime_to_timeval(utime, &r->ru_utime); | 1946 | cputime_to_timeval(utime, &r->ru_utime); |
1744 | cputime_to_timeval(stime, &r->ru_stime); | 1947 | cputime_to_timeval(stime, &r->ru_stime); |
1745 | } | 1948 | } |
@@ -1747,9 +1950,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1747 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1950 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) |
1748 | { | 1951 | { |
1749 | struct rusage r; | 1952 | struct rusage r; |
1750 | read_lock(&tasklist_lock); | ||
1751 | k_getrusage(p, who, &r); | 1953 | k_getrusage(p, who, &r); |
1752 | read_unlock(&tasklist_lock); | ||
1753 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | 1954 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; |
1754 | } | 1955 | } |
1755 | 1956 | ||
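getrusage() no longer wraps k_getrusage() in tasklist_lock; the lock is now taken inside only when the target is not a single-threaded current. Nothing changes for userspace; a quick illustrative consumer:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        struct rusage self, children;

        if (getrusage(RUSAGE_SELF, &self) < 0 ||
            getrusage(RUSAGE_CHILDREN, &children) < 0)
                return 1;
        printf("self:     %ld.%06lds user, %ld.%06lds sys\n",
               (long)self.ru_utime.tv_sec, (long)self.ru_utime.tv_usec,
               (long)self.ru_stime.tv_sec, (long)self.ru_stime.tv_usec);
        printf("children: %ld.%06lds user, %ld.%06lds sys\n",
               (long)children.ru_utime.tv_sec, (long)children.ru_utime.tv_usec,
               (long)children.ru_stime.tv_sec, (long)children.ru_stime.tv_usec);
        return 0;
}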
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 17313b99e5..5433195040 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg); | |||
42 | cond_syscall(sys_socketcall); | 42 | cond_syscall(sys_socketcall); |
43 | cond_syscall(sys_futex); | 43 | cond_syscall(sys_futex); |
44 | cond_syscall(compat_sys_futex); | 44 | cond_syscall(compat_sys_futex); |
45 | cond_syscall(sys_set_robust_list); | ||
46 | cond_syscall(compat_sys_set_robust_list); | ||
47 | cond_syscall(sys_get_robust_list); | ||
48 | cond_syscall(compat_sys_get_robust_list); | ||
45 | cond_syscall(sys_epoll_create); | 49 | cond_syscall(sys_epoll_create); |
46 | cond_syscall(sys_epoll_ctl); | 50 | cond_syscall(sys_epoll_ctl); |
47 | cond_syscall(sys_epoll_wait); | 51 | cond_syscall(sys_epoll_wait); |
@@ -104,6 +108,8 @@ cond_syscall(sys_setreuid16); | |||
104 | cond_syscall(sys_setuid16); | 108 | cond_syscall(sys_setuid16); |
105 | cond_syscall(sys_vm86old); | 109 | cond_syscall(sys_vm86old); |
106 | cond_syscall(sys_vm86); | 110 | cond_syscall(sys_vm86); |
111 | cond_syscall(compat_sys_ipc); | ||
112 | cond_syscall(compat_sys_sysctl); | ||
107 | 113 | ||
108 | /* arch-specific weak syscall entries */ | 114 | /* arch-specific weak syscall entries */ |
109 | cond_syscall(sys_pciconfig_read); | 115 | cond_syscall(sys_pciconfig_read); |
@@ -114,3 +120,15 @@ cond_syscall(sys32_sysctl); | |||
114 | cond_syscall(ppc_rtas); | 120 | cond_syscall(ppc_rtas); |
115 | cond_syscall(sys_spu_run); | 121 | cond_syscall(sys_spu_run); |
116 | cond_syscall(sys_spu_create); | 122 | cond_syscall(sys_spu_create); |
123 | |||
124 | /* mmu depending weak syscall entries */ | ||
125 | cond_syscall(sys_mprotect); | ||
126 | cond_syscall(sys_msync); | ||
127 | cond_syscall(sys_mlock); | ||
128 | cond_syscall(sys_munlock); | ||
129 | cond_syscall(sys_mlockall); | ||
130 | cond_syscall(sys_munlockall); | ||
131 | cond_syscall(sys_mincore); | ||
132 | cond_syscall(sys_madvise); | ||
133 | cond_syscall(sys_mremap); | ||
134 | cond_syscall(sys_remap_file_pages); | ||
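The added cond_syscall() lines make the robust-futex and MMU-dependent system calls fall back to sys_ni_syscall() when the corresponding option is compiled out, so callers simply see ENOSYS. A hypothetical probe from userspace (the syscall number comes from the host headers and may be absent on older ones):

#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
#ifdef __NR_get_robust_list
        /* sys_ni_syscall() ignores its arguments and just returns -ENOSYS. */
        long ret = syscall(__NR_get_robust_list, 0, NULL, NULL);

        if (ret < 0 && errno == ENOSYS)
                printf("get_robust_list not built into this kernel\n");
        else
                printf("get_robust_list is available\n");
#endif
        return 0;
}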
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 71dd6f62ef..e82726faee 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -44,13 +44,14 @@ | |||
44 | #include <linux/limits.h> | 44 | #include <linux/limits.h> |
45 | #include <linux/dcache.h> | 45 | #include <linux/dcache.h> |
46 | #include <linux/syscalls.h> | 46 | #include <linux/syscalls.h> |
47 | #include <linux/nfs_fs.h> | ||
48 | #include <linux/acpi.h> | ||
47 | 49 | ||
48 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> |
49 | #include <asm/processor.h> | 51 | #include <asm/processor.h> |
50 | 52 | ||
51 | #ifdef CONFIG_ROOT_NFS | 53 | extern int proc_nr_files(ctl_table *table, int write, struct file *filp, |
52 | #include <linux/nfs_fs.h> | 54 | void __user *buffer, size_t *lenp, loff_t *ppos); |
53 | #endif | ||
54 | 55 | ||
55 | #if defined(CONFIG_SYSCTL) | 56 | #if defined(CONFIG_SYSCTL) |
56 | 57 | ||
@@ -126,7 +127,9 @@ extern int sysctl_hz_timer; | |||
126 | extern int acct_parm[]; | 127 | extern int acct_parm[]; |
127 | #endif | 128 | #endif |
128 | 129 | ||
129 | int randomize_va_space = 1; | 130 | #ifdef CONFIG_IA64 |
131 | extern int no_unaligned_warning; | ||
132 | #endif | ||
130 | 133 | ||
131 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, | 134 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, |
132 | ctl_table *, void **); | 135 | ctl_table *, void **); |
@@ -640,6 +643,7 @@ static ctl_table kern_table[] = { | |||
640 | .proc_handler = &proc_dointvec, | 643 | .proc_handler = &proc_dointvec, |
641 | }, | 644 | }, |
642 | #endif | 645 | #endif |
646 | #if defined(CONFIG_MMU) | ||
643 | { | 647 | { |
644 | .ctl_name = KERN_RANDOMIZE, | 648 | .ctl_name = KERN_RANDOMIZE, |
645 | .procname = "randomize_va_space", | 649 | .procname = "randomize_va_space", |
@@ -648,6 +652,7 @@ static ctl_table kern_table[] = { | |||
648 | .mode = 0644, | 652 | .mode = 0644, |
649 | .proc_handler = &proc_dointvec, | 653 | .proc_handler = &proc_dointvec, |
650 | }, | 654 | }, |
655 | #endif | ||
651 | #if defined(CONFIG_S390) && defined(CONFIG_SMP) | 656 | #if defined(CONFIG_S390) && defined(CONFIG_SMP) |
652 | { | 657 | { |
653 | .ctl_name = KERN_SPIN_RETRY, | 658 | .ctl_name = KERN_SPIN_RETRY, |
@@ -658,6 +663,26 @@ static ctl_table kern_table[] = { | |||
658 | .proc_handler = &proc_dointvec, | 663 | .proc_handler = &proc_dointvec, |
659 | }, | 664 | }, |
660 | #endif | 665 | #endif |
666 | #ifdef CONFIG_ACPI_SLEEP | ||
667 | { | ||
668 | .ctl_name = KERN_ACPI_VIDEO_FLAGS, | ||
669 | .procname = "acpi_video_flags", | ||
670 | .data = &acpi_video_flags, | ||
671 | .maxlen = sizeof (unsigned long), | ||
672 | .mode = 0644, | ||
673 | .proc_handler = &proc_doulongvec_minmax, | ||
674 | }, | ||
675 | #endif | ||
676 | #ifdef CONFIG_IA64 | ||
677 | { | ||
678 | .ctl_name = KERN_IA64_UNALIGNED, | ||
679 | .procname = "ignore-unaligned-usertrap", | ||
680 | .data = &no_unaligned_warning, | ||
681 | .maxlen = sizeof (int), | ||
682 | .mode = 0644, | ||
683 | .proc_handler = &proc_dointvec, | ||
684 | }, | ||
685 | #endif | ||
661 | { .ctl_name = 0 } | 686 | { .ctl_name = 0 } |
662 | }; | 687 | }; |
663 | 688 | ||
@@ -717,18 +742,18 @@ static ctl_table vm_table[] = { | |||
717 | { | 742 | { |
718 | .ctl_name = VM_DIRTY_WB_CS, | 743 | .ctl_name = VM_DIRTY_WB_CS, |
719 | .procname = "dirty_writeback_centisecs", | 744 | .procname = "dirty_writeback_centisecs", |
720 | .data = &dirty_writeback_centisecs, | 745 | .data = &dirty_writeback_interval, |
721 | .maxlen = sizeof(dirty_writeback_centisecs), | 746 | .maxlen = sizeof(dirty_writeback_interval), |
722 | .mode = 0644, | 747 | .mode = 0644, |
723 | .proc_handler = &dirty_writeback_centisecs_handler, | 748 | .proc_handler = &dirty_writeback_centisecs_handler, |
724 | }, | 749 | }, |
725 | { | 750 | { |
726 | .ctl_name = VM_DIRTY_EXPIRE_CS, | 751 | .ctl_name = VM_DIRTY_EXPIRE_CS, |
727 | .procname = "dirty_expire_centisecs", | 752 | .procname = "dirty_expire_centisecs", |
728 | .data = &dirty_expire_centisecs, | 753 | .data = &dirty_expire_interval, |
729 | .maxlen = sizeof(dirty_expire_centisecs), | 754 | .maxlen = sizeof(dirty_expire_interval), |
730 | .mode = 0644, | 755 | .mode = 0644, |
731 | .proc_handler = &proc_dointvec, | 756 | .proc_handler = &proc_dointvec_userhz_jiffies, |
732 | }, | 757 | }, |
733 | { | 758 | { |
734 | .ctl_name = VM_NR_PDFLUSH_THREADS, | 759 | .ctl_name = VM_NR_PDFLUSH_THREADS, |
@@ -823,9 +848,8 @@ static ctl_table vm_table[] = { | |||
823 | .data = &laptop_mode, | 848 | .data = &laptop_mode, |
824 | .maxlen = sizeof(laptop_mode), | 849 | .maxlen = sizeof(laptop_mode), |
825 | .mode = 0644, | 850 | .mode = 0644, |
826 | .proc_handler = &proc_dointvec, | 851 | .proc_handler = &proc_dointvec_jiffies, |
827 | .strategy = &sysctl_intvec, | 852 | .strategy = &sysctl_jiffies, |
828 | .extra1 = &zero, | ||
829 | }, | 853 | }, |
830 | { | 854 | { |
831 | .ctl_name = VM_BLOCK_DUMP, | 855 | .ctl_name = VM_BLOCK_DUMP, |
@@ -921,7 +945,7 @@ static ctl_table fs_table[] = { | |||
921 | .data = &files_stat, | 945 | .data = &files_stat, |
922 | .maxlen = 3*sizeof(int), | 946 | .maxlen = 3*sizeof(int), |
923 | .mode = 0444, | 947 | .mode = 0444, |
924 | .proc_handler = &proc_dointvec, | 948 | .proc_handler = &proc_nr_files, |
925 | }, | 949 | }, |
926 | { | 950 | { |
927 | .ctl_name = FS_MAXFILE, | 951 | .ctl_name = FS_MAXFILE, |
@@ -2029,6 +2053,8 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, | |||
2029 | int write, void *data) | 2053 | int write, void *data) |
2030 | { | 2054 | { |
2031 | if (write) { | 2055 | if (write) { |
2056 | if (*lvalp > LONG_MAX / HZ) | ||
2057 | return 1; | ||
2032 | *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); | 2058 | *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); |
2033 | } else { | 2059 | } else { |
2034 | int val = *valp; | 2060 | int val = *valp; |
@@ -2050,6 +2076,8 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, | |||
2050 | int write, void *data) | 2076 | int write, void *data) |
2051 | { | 2077 | { |
2052 | if (write) { | 2078 | if (write) { |
2079 | if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) | ||
2080 | return 1; | ||
2053 | *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); | 2081 | *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); |
2054 | } else { | 2082 | } else { |
2055 | int val = *valp; | 2083 | int val = *valp; |
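All of the table entries above surface as files under /proc/sys; for instance kernel/randomize_va_space, which this patch hides on no-MMU kernels. An illustrative read from userspace, not part of the commit:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/randomize_va_space", "r");
        int val;

        if (!f)
                return 1;       /* e.g. a no-MMU kernel, where the entry is absent */
        if (fscanf(f, "%d", &val) == 1)
                printf("randomize_va_space = %d\n", val);
        fclose(f);
        return 0;
}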
diff --git a/kernel/time.c b/kernel/time.c index 804539165d..b00ddc71ce 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, | |||
202 | return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); | 202 | return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); |
203 | } | 203 | } |
204 | 204 | ||
205 | long pps_offset; /* pps time offset (us) */ | ||
206 | long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */ | ||
207 | |||
208 | long pps_freq; /* frequency offset (scaled ppm) */ | ||
209 | long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ | ||
210 | |||
211 | long pps_valid = PPS_VALID; /* pps signal watchdog counter */ | ||
212 | |||
213 | int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ | ||
214 | |||
215 | long pps_jitcnt; /* jitter limit exceeded */ | ||
216 | long pps_calcnt; /* calibration intervals */ | ||
217 | long pps_errcnt; /* calibration errors */ | ||
218 | long pps_stbcnt; /* stability limit exceeded */ | ||
219 | |||
220 | /* hook for a loadable hardpps kernel module */ | ||
221 | void (*hardpps_ptr)(struct timeval *); | ||
222 | |||
223 | /* we call this to notify the arch when the clock is being | 205 | /* we call this to notify the arch when the clock is being |
224 | * controlled. If no such arch routine, do nothing. | 206 | * controlled. If no such arch routine, do nothing. |
225 | */ | 207 | */ |
@@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc) | |||
279 | result = -EINVAL; | 261 | result = -EINVAL; |
280 | goto leave; | 262 | goto leave; |
281 | } | 263 | } |
282 | time_freq = txc->freq - pps_freq; | 264 | time_freq = txc->freq; |
283 | } | 265 | } |
284 | 266 | ||
285 | if (txc->modes & ADJ_MAXERROR) { | 267 | if (txc->modes & ADJ_MAXERROR) { |
@@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc) | |||
312 | if ((time_next_adjust = txc->offset) == 0) | 294 | if ((time_next_adjust = txc->offset) == 0) |
313 | time_adjust = 0; | 295 | time_adjust = 0; |
314 | } | 296 | } |
315 | else if ( time_status & (STA_PLL | STA_PPSTIME) ) { | 297 | else if (time_status & STA_PLL) { |
316 | ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == | 298 | ltemp = txc->offset; |
317 | (STA_PPSTIME | STA_PPSSIGNAL) ? | ||
318 | pps_offset : txc->offset; | ||
319 | 299 | ||
320 | /* | 300 | /* |
321 | * Scale the phase adjustment and | 301 | * Scale the phase adjustment and |
@@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc) | |||
356 | } | 336 | } |
357 | time_freq = min(time_freq, time_tolerance); | 337 | time_freq = min(time_freq, time_tolerance); |
358 | time_freq = max(time_freq, -time_tolerance); | 338 | time_freq = max(time_freq, -time_tolerance); |
359 | } /* STA_PLL || STA_PPSTIME */ | 339 | } /* STA_PLL */ |
360 | } /* txc->modes & ADJ_OFFSET */ | 340 | } /* txc->modes & ADJ_OFFSET */ |
361 | if (txc->modes & ADJ_TICK) { | 341 | if (txc->modes & ADJ_TICK) { |
362 | tick_usec = txc->tick; | 342 | tick_usec = txc->tick; |
363 | tick_nsec = TICK_USEC_TO_NSEC(tick_usec); | 343 | tick_nsec = TICK_USEC_TO_NSEC(tick_usec); |
364 | } | 344 | } |
365 | } /* txc->modes */ | 345 | } /* txc->modes */ |
366 | leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 | 346 | leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) |
367 | || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0 | ||
368 | && (time_status & STA_PPSSIGNAL) == 0) | ||
369 | /* p. 24, (b) */ | ||
370 | || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) | ||
371 | == (STA_PPSTIME|STA_PPSJITTER)) | ||
372 | /* p. 24, (c) */ | ||
373 | || ((time_status & STA_PPSFREQ) != 0 | ||
374 | && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0)) | ||
375 | /* p. 24, (d) */ | ||
376 | result = TIME_ERROR; | 347 | result = TIME_ERROR; |
377 | 348 | ||
378 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 349 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) |
@@ -380,7 +351,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 | |||
380 | else { | 351 | else { |
381 | txc->offset = shift_right(time_offset, SHIFT_UPDATE); | 352 | txc->offset = shift_right(time_offset, SHIFT_UPDATE); |
382 | } | 353 | } |
383 | txc->freq = time_freq + pps_freq; | 354 | txc->freq = time_freq; |
384 | txc->maxerror = time_maxerror; | 355 | txc->maxerror = time_maxerror; |
385 | txc->esterror = time_esterror; | 356 | txc->esterror = time_esterror; |
386 | txc->status = time_status; | 357 | txc->status = time_status; |
@@ -388,14 +359,16 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 | |||
388 | txc->precision = time_precision; | 359 | txc->precision = time_precision; |
389 | txc->tolerance = time_tolerance; | 360 | txc->tolerance = time_tolerance; |
390 | txc->tick = tick_usec; | 361 | txc->tick = tick_usec; |
391 | txc->ppsfreq = pps_freq; | 362 | |
392 | txc->jitter = pps_jitter >> PPS_AVG; | 363 | /* PPS is not implemented, so these are zero */ |
393 | txc->shift = pps_shift; | 364 | txc->ppsfreq = 0; |
394 | txc->stabil = pps_stabil; | 365 | txc->jitter = 0; |
395 | txc->jitcnt = pps_jitcnt; | 366 | txc->shift = 0; |
396 | txc->calcnt = pps_calcnt; | 367 | txc->stabil = 0; |
397 | txc->errcnt = pps_errcnt; | 368 | txc->jitcnt = 0; |
398 | txc->stbcnt = pps_stbcnt; | 369 | txc->calcnt = 0; |
370 | txc->errcnt = 0; | ||
371 | txc->stbcnt = 0; | ||
399 | write_sequnlock_irq(&xtime_lock); | 372 | write_sequnlock_irq(&xtime_lock); |
400 | do_gettimeofday(&txc->time); | 373 | do_gettimeofday(&txc->time); |
401 | notify_arch_cmos_timer(); | 374 | notify_arch_cmos_timer(); |
@@ -437,7 +410,7 @@ EXPORT_SYMBOL(current_kernel_time); | |||
437 | * current_fs_time - Return FS time | 410 | * current_fs_time - Return FS time |
438 | * @sb: Superblock. | 411 | * @sb: Superblock. |
439 | * | 412 | * |
440 | * Return the current time truncated to the time granuality supported by | 413 | * Return the current time truncated to the time granularity supported by |
441 | * the fs. | 414 | * the fs. |
442 | */ | 415 | */ |
443 | struct timespec current_fs_time(struct super_block *sb) | 416 | struct timespec current_fs_time(struct super_block *sb) |
@@ -448,11 +421,11 @@ struct timespec current_fs_time(struct super_block *sb) | |||
448 | EXPORT_SYMBOL(current_fs_time); | 421 | EXPORT_SYMBOL(current_fs_time); |
449 | 422 | ||
450 | /** | 423 | /** |
451 | * timespec_trunc - Truncate timespec to a granuality | 424 | * timespec_trunc - Truncate timespec to a granularity |
452 | * @t: Timespec | 425 | * @t: Timespec |
453 | * @gran: Granuality in ns. | 426 | * @gran: Granularity in ns. |
454 | * | 427 | * |
455 | * Truncate a timespec to a granuality. gran must be smaller than a second. | 428 | * Truncate a timespec to a granularity. gran must be smaller than a second. |
456 | * Always rounds down. | 429 | * Always rounds down. |
457 | * | 430 | * |
458 | * This function should be only used for timestamps returned by | 431 | * This function should be only used for timestamps returned by |
@@ -637,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) | |||
637 | * | 610 | * |
638 | * Returns the timespec representation of the nsec parameter. | 611 | * Returns the timespec representation of the nsec parameter. |
639 | */ | 612 | */ |
640 | struct timespec ns_to_timespec(const nsec_t nsec) | 613 | struct timespec ns_to_timespec(const s64 nsec) |
641 | { | 614 | { |
642 | struct timespec ts; | 615 | struct timespec ts; |
643 | 616 | ||
@@ -657,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec) | |||
657 | * | 630 | * |
658 | * Returns the timeval representation of the nsec parameter. | 631 | * Returns the timeval representation of the nsec parameter. |
659 | */ | 632 | */ |
660 | struct timeval ns_to_timeval(const nsec_t nsec) | 633 | struct timeval ns_to_timeval(const s64 nsec) |
661 | { | 634 | { |
662 | struct timespec ts = ns_to_timespec(nsec); | 635 | struct timespec ts = ns_to_timespec(nsec); |
663 | struct timeval tv; | 636 | struct timeval tv; |
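With the PPS discipline removed, do_adjtimex() now hard-wires the pps-related fields of struct timex to zero while the PLL fields keep working. A read-only adjtimex() query (modes = 0) from userspace shows those fields; illustrative code, not from the commit:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = { .modes = 0 };       /* read-only query */
        int state = adjtimex(&tx);

        if (state < 0)
                return 1;
        printf("freq=%ld offset=%ld status=0x%x\n",
               tx.freq, tx.offset, tx.status);
        /* Hard-wired to zero by the patched kernel. */
        printf("ppsfreq=%ld jitter=%ld stabil=%ld\n",
               tx.ppsfreq, tx.jitter, tx.stabil);
        return state == TIME_ERROR;
}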
diff --git a/kernel/timer.c b/kernel/timer.c index b9dad39946..9e49deed46 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64); | |||
54 | /* | 54 | /* |
55 | * per-CPU timer vector definitions: | 55 | * per-CPU timer vector definitions: |
56 | */ | 56 | */ |
57 | |||
58 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) | 57 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) |
59 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) | 58 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) |
60 | #define TVN_SIZE (1 << TVN_BITS) | 59 | #define TVN_SIZE (1 << TVN_BITS) |
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64); | |||
62 | #define TVN_MASK (TVN_SIZE - 1) | 61 | #define TVN_MASK (TVN_SIZE - 1) |
63 | #define TVR_MASK (TVR_SIZE - 1) | 62 | #define TVR_MASK (TVR_SIZE - 1) |
64 | 63 | ||
65 | struct timer_base_s { | ||
66 | spinlock_t lock; | ||
67 | struct timer_list *running_timer; | ||
68 | }; | ||
69 | |||
70 | typedef struct tvec_s { | 64 | typedef struct tvec_s { |
71 | struct list_head vec[TVN_SIZE]; | 65 | struct list_head vec[TVN_SIZE]; |
72 | } tvec_t; | 66 | } tvec_t; |
@@ -76,7 +70,8 @@ typedef struct tvec_root_s { | |||
76 | } tvec_root_t; | 70 | } tvec_root_t; |
77 | 71 | ||
78 | struct tvec_t_base_s { | 72 | struct tvec_t_base_s { |
79 | struct timer_base_s t_base; | 73 | spinlock_t lock; |
74 | struct timer_list *running_timer; | ||
80 | unsigned long timer_jiffies; | 75 | unsigned long timer_jiffies; |
81 | tvec_root_t tv1; | 76 | tvec_root_t tv1; |
82 | tvec_t tv2; | 77 | tvec_t tv2; |
@@ -86,13 +81,16 @@ struct tvec_t_base_s { | |||
86 | } ____cacheline_aligned_in_smp; | 81 | } ____cacheline_aligned_in_smp; |
87 | 82 | ||
88 | typedef struct tvec_t_base_s tvec_base_t; | 83 | typedef struct tvec_t_base_s tvec_base_t; |
89 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases); | 84 | |
85 | tvec_base_t boot_tvec_bases; | ||
86 | EXPORT_SYMBOL(boot_tvec_bases); | ||
87 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; | ||
90 | 88 | ||
91 | static inline void set_running_timer(tvec_base_t *base, | 89 | static inline void set_running_timer(tvec_base_t *base, |
92 | struct timer_list *timer) | 90 | struct timer_list *timer) |
93 | { | 91 | { |
94 | #ifdef CONFIG_SMP | 92 | #ifdef CONFIG_SMP |
95 | base->t_base.running_timer = timer; | 93 | base->running_timer = timer; |
96 | #endif | 94 | #endif |
97 | } | 95 | } |
98 | 96 | ||
@@ -138,15 +136,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
138 | list_add_tail(&timer->entry, vec); | 136 | list_add_tail(&timer->entry, vec); |
139 | } | 137 | } |
140 | 138 | ||
141 | typedef struct timer_base_s timer_base_t; | ||
142 | /* | ||
143 | * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) | ||
144 | * at compile time, and we need timer->base to lock the timer. | ||
145 | */ | ||
146 | timer_base_t __init_timer_base | ||
147 | ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; | ||
148 | EXPORT_SYMBOL(__init_timer_base); | ||
149 | |||
150 | /*** | 139 | /*** |
151 | * init_timer - initialize a timer. | 140 | * init_timer - initialize a timer. |
152 | * @timer: the timer to be initialized | 141 | * @timer: the timer to be initialized |
@@ -157,7 +146,7 @@ EXPORT_SYMBOL(__init_timer_base); | |||
157 | void fastcall init_timer(struct timer_list *timer) | 146 | void fastcall init_timer(struct timer_list *timer) |
158 | { | 147 | { |
159 | timer->entry.next = NULL; | 148 | timer->entry.next = NULL; |
160 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | 149 | timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); |
161 | } | 150 | } |
162 | EXPORT_SYMBOL(init_timer); | 151 | EXPORT_SYMBOL(init_timer); |
163 | 152 | ||
@@ -173,7 +162,7 @@ static inline void detach_timer(struct timer_list *timer, | |||
173 | } | 162 | } |
174 | 163 | ||
175 | /* | 164 | /* |
176 | * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock | 165 | * We are using hashed locking: holding per_cpu(tvec_bases).lock |
177 | * means that all timers which are tied to this base via timer->base are | 166 | * means that all timers which are tied to this base via timer->base are |
178 | * locked, and the base itself is locked too. | 167 | * locked, and the base itself is locked too. |
179 | * | 168 | * |
@@ -184,10 +173,10 @@ static inline void detach_timer(struct timer_list *timer, | |||
184 | * possible to set timer->base = NULL and drop the lock: the timer remains | 173 | * possible to set timer->base = NULL and drop the lock: the timer remains |
185 | * locked. | 174 | * locked. |
186 | */ | 175 | */ |
187 | static timer_base_t *lock_timer_base(struct timer_list *timer, | 176 | static tvec_base_t *lock_timer_base(struct timer_list *timer, |
188 | unsigned long *flags) | 177 | unsigned long *flags) |
189 | { | 178 | { |
190 | timer_base_t *base; | 179 | tvec_base_t *base; |
191 | 180 | ||
192 | for (;;) { | 181 | for (;;) { |
193 | base = timer->base; | 182 | base = timer->base; |
@@ -204,8 +193,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer, | |||
204 | 193 | ||
205 | int __mod_timer(struct timer_list *timer, unsigned long expires) | 194 | int __mod_timer(struct timer_list *timer, unsigned long expires) |
206 | { | 195 | { |
207 | timer_base_t *base; | 196 | tvec_base_t *base, *new_base; |
208 | tvec_base_t *new_base; | ||
209 | unsigned long flags; | 197 | unsigned long flags; |
210 | int ret = 0; | 198 | int ret = 0; |
211 | 199 | ||
@@ -218,9 +206,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
218 | ret = 1; | 206 | ret = 1; |
219 | } | 207 | } |
220 | 208 | ||
221 | new_base = &__get_cpu_var(tvec_bases); | 209 | new_base = __get_cpu_var(tvec_bases); |
222 | 210 | ||
223 | if (base != &new_base->t_base) { | 211 | if (base != new_base) { |
224 | /* | 212 | /* |
225 | * We are trying to schedule the timer on the local CPU. | 213 | * We are trying to schedule the timer on the local CPU. |
226 | * However we can't change timer's base while it is running, | 214 | * However we can't change timer's base while it is running, |
@@ -228,21 +216,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
228 | * handler yet has not finished. This also guarantees that | 216 | * handler yet has not finished. This also guarantees that |
229 | * the timer is serialized wrt itself. | 217 | * the timer is serialized wrt itself. |
230 | */ | 218 | */ |
231 | if (unlikely(base->running_timer == timer)) { | 219 | if (likely(base->running_timer != timer)) { |
232 | /* The timer remains on a former base */ | ||
233 | new_base = container_of(base, tvec_base_t, t_base); | ||
234 | } else { | ||
235 | /* See the comment in lock_timer_base() */ | 220 | /* See the comment in lock_timer_base() */ |
236 | timer->base = NULL; | 221 | timer->base = NULL; |
237 | spin_unlock(&base->lock); | 222 | spin_unlock(&base->lock); |
238 | spin_lock(&new_base->t_base.lock); | 223 | base = new_base; |
239 | timer->base = &new_base->t_base; | 224 | spin_lock(&base->lock); |
225 | timer->base = base; | ||
240 | } | 226 | } |
241 | } | 227 | } |
242 | 228 | ||
243 | timer->expires = expires; | 229 | timer->expires = expires; |
244 | internal_add_timer(new_base, timer); | 230 | internal_add_timer(base, timer); |
245 | spin_unlock_irqrestore(&new_base->t_base.lock, flags); | 231 | spin_unlock_irqrestore(&base->lock, flags); |
246 | 232 | ||
247 | return ret; | 233 | return ret; |
248 | } | 234 | } |
@@ -258,14 +244,14 @@ EXPORT_SYMBOL(__mod_timer); | |||
258 | */ | 244 | */ |
259 | void add_timer_on(struct timer_list *timer, int cpu) | 245 | void add_timer_on(struct timer_list *timer, int cpu) |
260 | { | 246 | { |
261 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); | 247 | tvec_base_t *base = per_cpu(tvec_bases, cpu); |
262 | unsigned long flags; | 248 | unsigned long flags; |
263 | 249 | ||
264 | BUG_ON(timer_pending(timer) || !timer->function); | 250 | BUG_ON(timer_pending(timer) || !timer->function); |
265 | spin_lock_irqsave(&base->t_base.lock, flags); | 251 | spin_lock_irqsave(&base->lock, flags); |
266 | timer->base = &base->t_base; | 252 | timer->base = base; |
267 | internal_add_timer(base, timer); | 253 | internal_add_timer(base, timer); |
268 | spin_unlock_irqrestore(&base->t_base.lock, flags); | 254 | spin_unlock_irqrestore(&base->lock, flags); |
269 | } | 255 | } |
270 | 256 | ||
271 | 257 | ||
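timer->base now points straight at the per-CPU tvec_base_t (bootstrapped through boot_tvec_bases) instead of an embedded timer_base_s, but the timer API used by drivers is untouched. The usual pattern of this kernel generation, with invented example_* names, is sketched below:

/* Hypothetical user of the timer API of this era. */
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/timer.h>

static void example_timeout(unsigned long data)
{
        /* Runs in softirq context on the CPU whose base holds the timer. */
        printk(KERN_INFO "example timer fired, data=%lu\n", data);
}

static struct timer_list example_timer;

static void example_start(void)
{
        init_timer(&example_timer);             /* base = this CPU's tvec_bases */
        example_timer.function = example_timeout;
        example_timer.data = 42;
        example_timer.expires = jiffies + HZ;   /* one second from now */
        add_timer(&example_timer);              /* or mod_timer()/add_timer_on() */
}

static void example_stop(void)
{
        del_timer_sync(&example_timer);
}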
@@ -318,7 +304,7 @@ EXPORT_SYMBOL(mod_timer); | |||
318 | */ | 304 | */ |
319 | int del_timer(struct timer_list *timer) | 305 | int del_timer(struct timer_list *timer) |
320 | { | 306 | { |
321 | timer_base_t *base; | 307 | tvec_base_t *base; |
322 | unsigned long flags; | 308 | unsigned long flags; |
323 | int ret = 0; | 309 | int ret = 0; |
324 | 310 | ||
@@ -345,7 +331,7 @@ EXPORT_SYMBOL(del_timer); | |||
345 | */ | 331 | */ |
346 | int try_to_del_timer_sync(struct timer_list *timer) | 332 | int try_to_del_timer_sync(struct timer_list *timer) |
347 | { | 333 | { |
348 | timer_base_t *base; | 334 | tvec_base_t *base; |
349 | unsigned long flags; | 335 | unsigned long flags; |
350 | int ret = -1; | 336 | int ret = -1; |
351 | 337 | ||
@@ -409,7 +395,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) | |||
409 | struct timer_list *tmp; | 395 | struct timer_list *tmp; |
410 | 396 | ||
411 | tmp = list_entry(curr, struct timer_list, entry); | 397 | tmp = list_entry(curr, struct timer_list, entry); |
412 | BUG_ON(tmp->base != &base->t_base); | 398 | BUG_ON(tmp->base != base); |
413 | curr = curr->next; | 399 | curr = curr->next; |
414 | internal_add_timer(base, tmp); | 400 | internal_add_timer(base, tmp); |
415 | } | 401 | } |
@@ -431,7 +417,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
431 | { | 417 | { |
432 | struct timer_list *timer; | 418 | struct timer_list *timer; |
433 | 419 | ||
434 | spin_lock_irq(&base->t_base.lock); | 420 | spin_lock_irq(&base->lock); |
435 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 421 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
436 | struct list_head work_list = LIST_HEAD_INIT(work_list); | 422 | struct list_head work_list = LIST_HEAD_INIT(work_list); |
437 | struct list_head *head = &work_list; | 423 | struct list_head *head = &work_list; |
@@ -457,7 +443,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
457 | 443 | ||
458 | set_running_timer(base, timer); | 444 | set_running_timer(base, timer); |
459 | detach_timer(timer, 1); | 445 | detach_timer(timer, 1); |
460 | spin_unlock_irq(&base->t_base.lock); | 446 | spin_unlock_irq(&base->lock); |
461 | { | 447 | { |
462 | int preempt_count = preempt_count(); | 448 | int preempt_count = preempt_count(); |
463 | fn(data); | 449 | fn(data); |
@@ -470,11 +456,11 @@ static inline void __run_timers(tvec_base_t *base) | |||
470 | BUG(); | 456 | BUG(); |
471 | } | 457 | } |
472 | } | 458 | } |
473 | spin_lock_irq(&base->t_base.lock); | 459 | spin_lock_irq(&base->lock); |
474 | } | 460 | } |
475 | } | 461 | } |
476 | set_running_timer(base, NULL); | 462 | set_running_timer(base, NULL); |
477 | spin_unlock_irq(&base->t_base.lock); | 463 | spin_unlock_irq(&base->lock); |
478 | } | 464 | } |
479 | 465 | ||
480 | #ifdef CONFIG_NO_IDLE_HZ | 466 | #ifdef CONFIG_NO_IDLE_HZ |
@@ -489,11 +475,23 @@ unsigned long next_timer_interrupt(void) | |||
489 | struct list_head *list; | 475 | struct list_head *list; |
490 | struct timer_list *nte; | 476 | struct timer_list *nte; |
491 | unsigned long expires; | 477 | unsigned long expires; |
478 | unsigned long hr_expires = MAX_JIFFY_OFFSET; | ||
479 | ktime_t hr_delta; | ||
492 | tvec_t *varray[4]; | 480 | tvec_t *varray[4]; |
493 | int i, j; | 481 | int i, j; |
494 | 482 | ||
495 | base = &__get_cpu_var(tvec_bases); | 483 | hr_delta = hrtimer_get_next_event(); |
496 | spin_lock(&base->t_base.lock); | 484 | if (hr_delta.tv64 != KTIME_MAX) { |
485 | struct timespec tsdelta; | ||
486 | tsdelta = ktime_to_timespec(hr_delta); | ||
487 | hr_expires = timespec_to_jiffies(&tsdelta); | ||
488 | if (hr_expires < 3) | ||
489 | return hr_expires + jiffies; | ||
490 | } | ||
491 | hr_expires += jiffies; | ||
492 | |||
493 | base = __get_cpu_var(tvec_bases); | ||
494 | spin_lock(&base->lock); | ||
497 | expires = base->timer_jiffies + (LONG_MAX >> 1); | 495 | expires = base->timer_jiffies + (LONG_MAX >> 1); |
498 | list = NULL; | 496 | list = NULL; |
499 | 497 | ||
@@ -541,7 +539,27 @@ found: | |||
541 | expires = nte->expires; | 539 | expires = nte->expires; |
542 | } | 540 | } |
543 | } | 541 | } |
544 | spin_unlock(&base->t_base.lock); | 542 | spin_unlock(&base->lock); |
543 | |||
544 | /* | ||
545 | * It can happen that other CPUs service timer IRQs and increment | ||
546 | * jiffies, but we have not yet got a local timer tick to process | ||
547 | * the timer wheels. In that case, the expiry time can be before | ||
548 | * jiffies, but since the high-resolution timer here is relative to | ||
549 | * jiffies, the default expression when high-resolution timers are | ||
550 | * not active, | ||
551 | * | ||
552 | * time_before(MAX_JIFFY_OFFSET + jiffies, expires) | ||
553 | * | ||
554 | * would falsely evaluate to true. If that is the case, just | ||
555 | * return jiffies so that we can immediately fire the local timer | ||
556 | */ | ||
557 | if (time_before(expires, jiffies)) | ||
558 | return jiffies; | ||
559 | |||
560 | if (time_before(hr_expires, expires)) | ||
561 | return hr_expires; | ||
562 | |||
545 | return expires; | 563 | return expires; |
546 | } | 564 | } |
547 | #endif | 565 | #endif |
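With CONFIG_NO_IDLE_HZ, next_timer_interrupt() now has to merge two sources: the earliest timer-wheel expiry and the next hrtimer event returned by hrtimer_get_next_event(). A compact stand-alone restatement of that merge, including the early return for an imminent hrtimer and the clamp for the stale-wheel case described in the new comment (the helper names here are illustrative):

static int before(unsigned long a, unsigned long b)   /* models time_before() */
{
	return (long)(a - b) < 0;
}

static unsigned long next_event(unsigned long now,           /* jiffies */
				unsigned long wheel_expires,  /* from the tvec wheel */
				unsigned long hr_delta)       /* hrtimer delta, in jiffies */
{
	if (hr_delta < 3)               /* hrtimer fires almost immediately */
		return now + hr_delta;

	if (before(wheel_expires, now)) /* another CPU already pushed jiffies past it */
		return now;

	return before(now + hr_delta, wheel_expires) ? now + hr_delta
						     : wheel_expires;
}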
@@ -680,18 +698,9 @@ static void second_overflow(void) | |||
680 | 698 | ||
681 | /* | 699 | /* |
682 | * Compute the frequency estimate and additional phase adjustment due | 700 | * Compute the frequency estimate and additional phase adjustment due |
683 | * to frequency error for the next second. When the PPS signal is | 701 | * to frequency error for the next second. |
684 | * engaged, gnaw on the watchdog counter and update the frequency | ||
685 | * computed by the pll and the PPS signal. | ||
686 | */ | 702 | */ |
687 | pps_valid++; | 703 | ltemp = time_freq; |
688 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ | ||
689 | pps_jitter = MAXTIME; | ||
690 | pps_stabil = MAXFREQ; | ||
691 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | ||
692 | STA_PPSWANDER | STA_PPSERROR); | ||
693 | } | ||
694 | ltemp = time_freq + pps_freq; | ||
695 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); | 704 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); |
696 | 705 | ||
697 | #if HZ == 100 | 706 | #if HZ == 100 |
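With the PPS bookkeeping gone, the only frequency term left feeding time_adj is time_freq, scaled down by shift_right(). Assuming the usual definition of that helper in this file, it is a power-of-two division that rounds toward zero so negative frequency offsets are not biased; a stand-alone equivalent:

/* shift_right(x, s): divide a possibly negative value by 2^s, rounding
 * toward zero, since ">>" on a negative long is implementation-defined. */
static long shift_right_model(long x, unsigned int s)
{
	return x < 0 ? -((-x) >> s) : x >> s;
}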
@@ -717,12 +726,16 @@ static void second_overflow(void) | |||
717 | #endif | 726 | #endif |
718 | } | 727 | } |
719 | 728 | ||
720 | /* in the NTP reference this is called "hardclock()" */ | 729 | /* |
721 | static void update_wall_time_one_tick(void) | 730 | * Returns how many microseconds we need to add to xtime this tick |
731 | * in doing an adjustment requested with adjtime. | ||
732 | */ | ||
733 | static long adjtime_adjustment(void) | ||
722 | { | 734 | { |
723 | long time_adjust_step, delta_nsec; | 735 | long time_adjust_step; |
724 | 736 | ||
725 | if ((time_adjust_step = time_adjust) != 0 ) { | 737 | time_adjust_step = time_adjust; |
738 | if (time_adjust_step) { | ||
726 | /* | 739 | /* |
727 | * We are doing an adjtime thing. Prepare time_adjust_step to | 740 | * We are doing an adjtime thing. Prepare time_adjust_step to |
728 | * be within bounds. Note that a positive time_adjust means we | 741 | * be within bounds. Note that a positive time_adjust means we |
@@ -733,10 +746,19 @@ static void update_wall_time_one_tick(void) | |||
733 | */ | 746 | */ |
734 | time_adjust_step = min(time_adjust_step, (long)tickadj); | 747 | time_adjust_step = min(time_adjust_step, (long)tickadj); |
735 | time_adjust_step = max(time_adjust_step, (long)-tickadj); | 748 | time_adjust_step = max(time_adjust_step, (long)-tickadj); |
749 | } | ||
750 | return time_adjust_step; | ||
751 | } | ||
736 | 752 | ||
753 | /* in the NTP reference this is called "hardclock()" */ | ||
754 | static void update_wall_time_one_tick(void) | ||
755 | { | ||
756 | long time_adjust_step, delta_nsec; | ||
757 | |||
758 | time_adjust_step = adjtime_adjustment(); | ||
759 | if (time_adjust_step) | ||
737 | /* Reduce by this step the amount of time left */ | 760 | /* Reduce by this step the amount of time left */ |
738 | time_adjust -= time_adjust_step; | 761 | time_adjust -= time_adjust_step; |
739 | } | ||
740 | delta_nsec = tick_nsec + time_adjust_step * 1000; | 762 | delta_nsec = tick_nsec + time_adjust_step * 1000; |
741 | /* | 763 | /* |
742 | * Advance the phase, once it gets to one microsecond, then | 764 | * Advance the phase, once it gets to one microsecond, then |
@@ -759,6 +781,22 @@ static void update_wall_time_one_tick(void) | |||
759 | } | 781 | } |
760 | 782 | ||
761 | /* | 783 | /* |
784 | * Return how long ticks are at the moment, that is, how much time | ||
785 | * update_wall_time_one_tick will add to xtime next time we call it | ||
786 | * (assuming no calls to do_adjtimex in the meantime). | ||
787 | * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 | ||
788 | * bits to the right of the binary point. | ||
789 | * This function has no side-effects. | ||
790 | */ | ||
791 | u64 current_tick_length(void) | ||
792 | { | ||
793 | long delta_nsec; | ||
794 | |||
795 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; | ||
796 | return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; | ||
797 | } | ||
798 | |||
799 | /* | ||
762 | * Using a loop looks inefficient, but "ticks" is | 800 | * Using a loop looks inefficient, but "ticks" is |
763 | * usually just one (we shouldn't be losing ticks, | 801 | * usually just one (we shouldn't be losing ticks, |
764 | * we're doing this this way mainly for interrupt | 802 | * we're doing this this way mainly for interrupt |
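The hunks above factor the adjtime() step out into adjtime_adjustment() so the new current_tick_length() can report, without side effects, exactly what update_wall_time_one_tick() will add next tick: a fixed-point nanosecond value with SHIFT_SCALE - 10 fractional bits plus the phase term time_adj. A stand-alone model of that arithmetic (the NTP globals become parameters here, and SHIFT_SCALE is assumed to be the timex.h scaling constant):

#include <stdint.h>

static long adjtime_step(long time_adjust, long tickadj)
{
	/* apply at most +/- tickadj microseconds of the adjtime() total per tick */
	if (time_adjust > tickadj)
		return tickadj;
	if (time_adjust < -tickadj)
		return -tickadj;
	return time_adjust;
}

/* Length of the next tick in nanoseconds, as a fixed-point value with
 * (shift_scale - 10) fractional bits, plus the NTP phase adjustment. */
static uint64_t tick_length(long tick_nsec, long time_adjust, long tickadj,
			    long time_adj, unsigned int shift_scale)
{
	long delta_nsec = tick_nsec + adjtime_step(time_adjust, tickadj) * 1000;

	return ((uint64_t)delta_nsec << (shift_scale - 10)) + time_adj;
}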
@@ -804,7 +842,7 @@ void update_process_times(int user_tick) | |||
804 | */ | 842 | */ |
805 | static unsigned long count_active_tasks(void) | 843 | static unsigned long count_active_tasks(void) |
806 | { | 844 | { |
807 | return (nr_running() + nr_uninterruptible()) * FIXED_1; | 845 | return nr_active() * FIXED_1; |
808 | } | 846 | } |
809 | 847 | ||
810 | /* | 848 | /* |
@@ -856,7 +894,7 @@ EXPORT_SYMBOL(xtime_lock); | |||
856 | */ | 894 | */ |
857 | static void run_timer_softirq(struct softirq_action *h) | 895 | static void run_timer_softirq(struct softirq_action *h) |
858 | { | 896 | { |
859 | tvec_base_t *base = &__get_cpu_var(tvec_bases); | 897 | tvec_base_t *base = __get_cpu_var(tvec_bases); |
860 | 898 | ||
861 | hrtimer_run_queues(); | 899 | hrtimer_run_queues(); |
862 | if (time_after_eq(jiffies, base->timer_jiffies)) | 900 | if (time_after_eq(jiffies, base->timer_jiffies)) |
@@ -869,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
869 | void run_local_timers(void) | 907 | void run_local_timers(void) |
870 | { | 908 | { |
871 | raise_softirq(TIMER_SOFTIRQ); | 909 | raise_softirq(TIMER_SOFTIRQ); |
910 | softlockup_tick(); | ||
872 | } | 911 | } |
873 | 912 | ||
874 | /* | 913 | /* |
@@ -896,8 +935,9 @@ static inline void update_times(void) | |||
896 | void do_timer(struct pt_regs *regs) | 935 | void do_timer(struct pt_regs *regs) |
897 | { | 936 | { |
898 | jiffies_64++; | 937 | jiffies_64++; |
938 | /* prevent loading jiffies before storing new jiffies_64 value. */ | ||
939 | barrier(); | ||
899 | update_times(); | 940 | update_times(); |
900 | softlockup_tick(regs); | ||
901 | } | 941 | } |
902 | 942 | ||
903 | #ifdef __ARCH_WANT_SYS_ALARM | 943 | #ifdef __ARCH_WANT_SYS_ALARM |
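The new barrier() in do_timer() is a compiler fence: jiffies aliases (the low word of) jiffies_64, but the compiler does not know about the alias, so nothing must be allowed to reorder a load of jiffies in update_times() ahead of the 64-bit increment. A minimal illustration of the pattern, with stand-in variables and the fence written out instead of the kernel's barrier() macro:

#define compiler_fence() __asm__ __volatile__("" ::: "memory")

extern unsigned long long model_jiffies_64;

static void model_do_timer(void (*update_times_fn)(void))
{
	model_jiffies_64++;
	/* make sure the new value is stored before update_times_fn()
	 * reloads the (aliased) jiffies counter */
	compiler_fence();
	update_times_fn();
}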
@@ -908,19 +948,7 @@ void do_timer(struct pt_regs *regs) | |||
908 | */ | 948 | */ |
909 | asmlinkage unsigned long sys_alarm(unsigned int seconds) | 949 | asmlinkage unsigned long sys_alarm(unsigned int seconds) |
910 | { | 950 | { |
911 | struct itimerval it_new, it_old; | 951 | return alarm_setitimer(seconds); |
912 | unsigned int oldalarm; | ||
913 | |||
914 | it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; | ||
915 | it_new.it_value.tv_sec = seconds; | ||
916 | it_new.it_value.tv_usec = 0; | ||
917 | do_setitimer(ITIMER_REAL, &it_new, &it_old); | ||
918 | oldalarm = it_old.it_value.tv_sec; | ||
919 | /* ehhh.. We can't return 0 if we have an alarm pending.. */ | ||
920 | /* And we'd better return too much than too little anyway */ | ||
921 | if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) | ||
922 | oldalarm++; | ||
923 | return oldalarm; | ||
924 | } | 952 | } |
925 | 953 | ||
926 | #endif | 954 | #endif |
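sys_alarm() now just calls the shared alarm_setitimer() helper (not shown in this diff), so the rounding rule is worth keeping in view: the removed lines never report 0 while microseconds are still pending, and round half a second or more up to the next full second. Stand-alone, that rule is:

/* Round the remaining ITIMER_REAL value the way the removed code did:
 * never return 0 for a still-pending alarm, round >= 0.5s upward. */
static unsigned int round_alarm(long tv_sec, long tv_usec)
{
	unsigned int oldalarm = (unsigned int)tv_sec;

	if ((!oldalarm && tv_usec) || tv_usec >= 500000)
		oldalarm++;
	return oldalarm;
}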
@@ -1209,13 +1237,41 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
1209 | return 0; | 1237 | return 0; |
1210 | } | 1238 | } |
1211 | 1239 | ||
1212 | static void __devinit init_timers_cpu(int cpu) | 1240 | static int __devinit init_timers_cpu(int cpu) |
1213 | { | 1241 | { |
1214 | int j; | 1242 | int j; |
1215 | tvec_base_t *base; | 1243 | tvec_base_t *base; |
1244 | static char __devinitdata tvec_base_done[NR_CPUS]; | ||
1245 | |||
1246 | if (!tvec_base_done[cpu]) { | ||
1247 | static char boot_done; | ||
1216 | 1248 | ||
1217 | base = &per_cpu(tvec_bases, cpu); | 1249 | if (boot_done) { |
1218 | spin_lock_init(&base->t_base.lock); | 1250 | /* |
1251 | * The APs use this path later in boot | ||
1252 | */ | ||
1253 | base = kmalloc_node(sizeof(*base), GFP_KERNEL, | ||
1254 | cpu_to_node(cpu)); | ||
1255 | if (!base) | ||
1256 | return -ENOMEM; | ||
1257 | memset(base, 0, sizeof(*base)); | ||
1258 | per_cpu(tvec_bases, cpu) = base; | ||
1259 | } else { | ||
1260 | /* | ||
1261 | * This is for the boot CPU - we use compile-time | ||
1262 | * static initialisation because per-cpu memory isn't | ||
1263 | * ready yet and because the memory allocators are not | ||
1264 | * initialised either. | ||
1265 | */ | ||
1266 | boot_done = 1; | ||
1267 | base = &boot_tvec_bases; | ||
1268 | } | ||
1269 | tvec_base_done[cpu] = 1; | ||
1270 | } else { | ||
1271 | base = per_cpu(tvec_bases, cpu); | ||
1272 | } | ||
1273 | |||
1274 | spin_lock_init(&base->lock); | ||
1219 | for (j = 0; j < TVN_SIZE; j++) { | 1275 | for (j = 0; j < TVN_SIZE; j++) { |
1220 | INIT_LIST_HEAD(base->tv5.vec + j); | 1276 | INIT_LIST_HEAD(base->tv5.vec + j); |
1221 | INIT_LIST_HEAD(base->tv4.vec + j); | 1277 | INIT_LIST_HEAD(base->tv4.vec + j); |
@@ -1226,6 +1282,7 @@ static void __devinit init_timers_cpu(int cpu) | |||
1226 | INIT_LIST_HEAD(base->tv1.vec + j); | 1282 | INIT_LIST_HEAD(base->tv1.vec + j); |
1227 | 1283 | ||
1228 | base->timer_jiffies = jiffies; | 1284 | base->timer_jiffies = jiffies; |
1285 | return 0; | ||
1229 | } | 1286 | } |
1230 | 1287 | ||
1231 | #ifdef CONFIG_HOTPLUG_CPU | 1288 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -1236,7 +1293,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | |||
1236 | while (!list_empty(head)) { | 1293 | while (!list_empty(head)) { |
1237 | timer = list_entry(head->next, struct timer_list, entry); | 1294 | timer = list_entry(head->next, struct timer_list, entry); |
1238 | detach_timer(timer, 0); | 1295 | detach_timer(timer, 0); |
1239 | timer->base = &new_base->t_base; | 1296 | timer->base = new_base; |
1240 | internal_add_timer(new_base, timer); | 1297 | internal_add_timer(new_base, timer); |
1241 | } | 1298 | } |
1242 | } | 1299 | } |
@@ -1248,15 +1305,15 @@ static void __devinit migrate_timers(int cpu) | |||
1248 | int i; | 1305 | int i; |
1249 | 1306 | ||
1250 | BUG_ON(cpu_online(cpu)); | 1307 | BUG_ON(cpu_online(cpu)); |
1251 | old_base = &per_cpu(tvec_bases, cpu); | 1308 | old_base = per_cpu(tvec_bases, cpu); |
1252 | new_base = &get_cpu_var(tvec_bases); | 1309 | new_base = get_cpu_var(tvec_bases); |
1253 | 1310 | ||
1254 | local_irq_disable(); | 1311 | local_irq_disable(); |
1255 | spin_lock(&new_base->t_base.lock); | 1312 | spin_lock(&new_base->lock); |
1256 | spin_lock(&old_base->t_base.lock); | 1313 | spin_lock(&old_base->lock); |
1314 | |||
1315 | BUG_ON(old_base->running_timer); | ||
1257 | 1316 | ||
1258 | if (old_base->t_base.running_timer) | ||
1259 | BUG(); | ||
1260 | for (i = 0; i < TVR_SIZE; i++) | 1317 | for (i = 0; i < TVR_SIZE; i++) |
1261 | migrate_timer_list(new_base, old_base->tv1.vec + i); | 1318 | migrate_timer_list(new_base, old_base->tv1.vec + i); |
1262 | for (i = 0; i < TVN_SIZE; i++) { | 1319 | for (i = 0; i < TVN_SIZE; i++) { |
@@ -1266,20 +1323,21 @@ static void __devinit migrate_timers(int cpu) | |||
1266 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1323 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
1267 | } | 1324 | } |
1268 | 1325 | ||
1269 | spin_unlock(&old_base->t_base.lock); | 1326 | spin_unlock(&old_base->lock); |
1270 | spin_unlock(&new_base->t_base.lock); | 1327 | spin_unlock(&new_base->lock); |
1271 | local_irq_enable(); | 1328 | local_irq_enable(); |
1272 | put_cpu_var(tvec_bases); | 1329 | put_cpu_var(tvec_bases); |
1273 | } | 1330 | } |
1274 | #endif /* CONFIG_HOTPLUG_CPU */ | 1331 | #endif /* CONFIG_HOTPLUG_CPU */ |
1275 | 1332 | ||
1276 | static int __devinit timer_cpu_notify(struct notifier_block *self, | 1333 | static int timer_cpu_notify(struct notifier_block *self, |
1277 | unsigned long action, void *hcpu) | 1334 | unsigned long action, void *hcpu) |
1278 | { | 1335 | { |
1279 | long cpu = (long)hcpu; | 1336 | long cpu = (long)hcpu; |
1280 | switch(action) { | 1337 | switch(action) { |
1281 | case CPU_UP_PREPARE: | 1338 | case CPU_UP_PREPARE: |
1282 | init_timers_cpu(cpu); | 1339 | if (init_timers_cpu(cpu) < 0) |
1340 | return NOTIFY_BAD; | ||
1283 | break; | 1341 | break; |
1284 | #ifdef CONFIG_HOTPLUG_CPU | 1342 | #ifdef CONFIG_HOTPLUG_CPU |
1285 | case CPU_DEAD: | 1343 | case CPU_DEAD: |
@@ -1292,7 +1350,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self, | |||
1292 | return NOTIFY_OK; | 1350 | return NOTIFY_OK; |
1293 | } | 1351 | } |
1294 | 1352 | ||
1295 | static struct notifier_block __devinitdata timers_nb = { | 1353 | static struct notifier_block timers_nb = { |
1296 | .notifier_call = timer_cpu_notify, | 1354 | .notifier_call = timer_cpu_notify, |
1297 | }; | 1355 | }; |
1298 | 1356 | ||
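Because init_timers_cpu() can now return -ENOMEM, the CPU notifier has to veto the bring-up rather than let a CPU come online with no timer base; that is what the NOTIFY_BAD return added above does. In outline (constants and callbacks are stand-ins for the kernel's):

enum { MODEL_NOTIFY_OK, MODEL_NOTIFY_BAD };
enum { MODEL_CPU_UP_PREPARE, MODEL_CPU_DEAD };

static int model_timer_cpu_notify(int action, int cpu,
				  int (*init_cpu)(int cpu),
				  void (*migrate_timers_fn)(int cpu))
{
	switch (action) {
	case MODEL_CPU_UP_PREPARE:
		if (init_cpu(cpu) < 0)
			return MODEL_NOTIFY_BAD;    /* abort onlining this CPU */
		break;
	case MODEL_CPU_DEAD:
		migrate_timers_fn(cpu);             /* hand pending timers to a live CPU */
		break;
	}
	return MODEL_NOTIFY_OK;
}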
@@ -1307,8 +1365,8 @@ void __init init_timers(void) | |||
1307 | 1365 | ||
1308 | #ifdef CONFIG_TIME_INTERPOLATION | 1366 | #ifdef CONFIG_TIME_INTERPOLATION |
1309 | 1367 | ||
1310 | struct time_interpolator *time_interpolator; | 1368 | struct time_interpolator *time_interpolator __read_mostly; |
1311 | static struct time_interpolator *time_interpolator_list; | 1369 | static struct time_interpolator *time_interpolator_list __read_mostly; |
1312 | static DEFINE_SPINLOCK(time_interpolator_lock); | 1370 | static DEFINE_SPINLOCK(time_interpolator_lock); |
1313 | 1371 | ||
1314 | static inline u64 time_interpolator_get_cycles(unsigned int src) | 1372 | static inline u64 time_interpolator_get_cycles(unsigned int src) |
@@ -1322,10 +1380,10 @@ static inline u64 time_interpolator_get_cycles(unsigned int src) | |||
1322 | return x(); | 1380 | return x(); |
1323 | 1381 | ||
1324 | case TIME_SOURCE_MMIO64 : | 1382 | case TIME_SOURCE_MMIO64 : |
1325 | return readq((void __iomem *) time_interpolator->addr); | 1383 | return readq_relaxed((void __iomem *)time_interpolator->addr); |
1326 | 1384 | ||
1327 | case TIME_SOURCE_MMIO32 : | 1385 | case TIME_SOURCE_MMIO32 : |
1328 | return readl((void __iomem *) time_interpolator->addr); | 1386 | return readl_relaxed((void __iomem *)time_interpolator->addr); |
1329 | 1387 | ||
1330 | default: return get_cycles(); | 1388 | default: return get_cycles(); |
1331 | } | 1389 | } |
@@ -1422,7 +1480,7 @@ static void time_interpolator_update(long delta_nsec) | |||
1422 | */ | 1480 | */ |
1423 | if (jiffies % INTERPOLATOR_ADJUST == 0) | 1481 | if (jiffies % INTERPOLATOR_ADJUST == 0) |
1424 | { | 1482 | { |
1425 | if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) | 1483 | if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec) |
1426 | time_interpolator->nsec_per_cyc--; | 1484 | time_interpolator->nsec_per_cyc--; |
1427 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) | 1485 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) |
1428 | time_interpolator->nsec_per_cyc++; | 1486 | time_interpolator->nsec_per_cyc++; |
@@ -1446,8 +1504,7 @@ register_time_interpolator(struct time_interpolator *ti) | |||
1446 | unsigned long flags; | 1504 | unsigned long flags; |
1447 | 1505 | ||
1448 | /* Sanity check */ | 1506 | /* Sanity check */ |
1449 | if (ti->frequency == 0 || ti->mask == 0) | 1507 | BUG_ON(ti->frequency == 0 || ti->mask == 0); |
1450 | BUG(); | ||
1451 | 1508 | ||
1452 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; | 1509 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; |
1453 | spin_lock(&time_interpolator_lock); | 1510 | spin_lock(&time_interpolator_lock); |
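In the time-interpolator changes above, the sanity check becomes a BUG_ON() and the MMIO reads switch to the relaxed variants, but the conversion factor is unchanged: nsec_per_cyc is nanoseconds per counter cycle in fixed point with ti->shift fractional bits. A stand-alone version of that computation, plus the way such a factor is typically applied (the second helper is illustrative, not a copy of the kernel's):

#include <stdint.h>

#define MODEL_NSEC_PER_SEC 1000000000ULL

/* fixed-point ns/cycle with 'shift' fractional bits */
static uint64_t nsec_per_cyc(uint64_t frequency_hz, unsigned int shift)
{
	return (MODEL_NSEC_PER_SEC << shift) / frequency_hz;
}

/* convert an elapsed cycle count back to nanoseconds */
static uint64_t cycles_to_nsec(uint64_t cycles, uint64_t frequency_hz,
			       unsigned int shift)
{
	return (cycles * nsec_per_cyc(frequency_hz, shift)) >> shift;
}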
diff --git a/kernel/uid16.c b/kernel/uid16.c index aa25605027..187e2a4238 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -20,43 +20,67 @@ | |||
20 | 20 | ||
21 | asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) | 21 | asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) |
22 | { | 22 | { |
23 | return sys_chown(filename, low2highuid(user), low2highgid(group)); | 23 | long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); |
24 | /* avoid REGPARM breakage on x86: */ | ||
25 | prevent_tail_call(ret); | ||
26 | return ret; | ||
24 | } | 27 | } |
25 | 28 | ||
26 | asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) | 29 | asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) |
27 | { | 30 | { |
28 | return sys_lchown(filename, low2highuid(user), low2highgid(group)); | 31 | long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); |
32 | /* avoid REGPARM breakage on x86: */ | ||
33 | prevent_tail_call(ret); | ||
34 | return ret; | ||
29 | } | 35 | } |
30 | 36 | ||
31 | asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) | 37 | asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) |
32 | { | 38 | { |
33 | return sys_fchown(fd, low2highuid(user), low2highgid(group)); | 39 | long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); |
40 | /* avoid REGPARM breakage on x86: */ | ||
41 | prevent_tail_call(ret); | ||
42 | return ret; | ||
34 | } | 43 | } |
35 | 44 | ||
36 | asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) | 45 | asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) |
37 | { | 46 | { |
38 | return sys_setregid(low2highgid(rgid), low2highgid(egid)); | 47 | long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); |
48 | /* avoid REGPARM breakage on x86: */ | ||
49 | prevent_tail_call(ret); | ||
50 | return ret; | ||
39 | } | 51 | } |
40 | 52 | ||
41 | asmlinkage long sys_setgid16(old_gid_t gid) | 53 | asmlinkage long sys_setgid16(old_gid_t gid) |
42 | { | 54 | { |
43 | return sys_setgid(low2highgid(gid)); | 55 | long ret = sys_setgid(low2highgid(gid)); |
56 | /* avoid REGPARM breakage on x86: */ | ||
57 | prevent_tail_call(ret); | ||
58 | return ret; | ||
44 | } | 59 | } |
45 | 60 | ||
46 | asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) | 61 | asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) |
47 | { | 62 | { |
48 | return sys_setreuid(low2highuid(ruid), low2highuid(euid)); | 63 | long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); |
64 | /* avoid REGPARM breakage on x86: */ | ||
65 | prevent_tail_call(ret); | ||
66 | return ret; | ||
49 | } | 67 | } |
50 | 68 | ||
51 | asmlinkage long sys_setuid16(old_uid_t uid) | 69 | asmlinkage long sys_setuid16(old_uid_t uid) |
52 | { | 70 | { |
53 | return sys_setuid(low2highuid(uid)); | 71 | long ret = sys_setuid(low2highuid(uid)); |
72 | /* avoid REGPARM breakage on x86: */ | ||
73 | prevent_tail_call(ret); | ||
74 | return ret; | ||
54 | } | 75 | } |
55 | 76 | ||
56 | asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) | 77 | asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) |
57 | { | 78 | { |
58 | return sys_setresuid(low2highuid(ruid), low2highuid(euid), | 79 | long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), |
59 | low2highuid(suid)); | 80 | low2highuid(suid)); |
81 | /* avoid REGPARM breakage on x86: */ | ||
82 | prevent_tail_call(ret); | ||
83 | return ret; | ||
60 | } | 84 | } |
61 | 85 | ||
62 | asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) | 86 | asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) |
@@ -72,8 +96,11 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, | |||
72 | 96 | ||
73 | asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) | 97 | asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) |
74 | { | 98 | { |
75 | return sys_setresgid(low2highgid(rgid), low2highgid(egid), | 99 | long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), |
76 | low2highgid(sgid)); | 100 | low2highgid(sgid)); |
101 | /* avoid REGPARM breakage on x86: */ | ||
102 | prevent_tail_call(ret); | ||
103 | return ret; | ||
77 | } | 104 | } |
78 | 105 | ||
79 | asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) | 106 | asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) |
@@ -89,12 +116,18 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, | |||
89 | 116 | ||
90 | asmlinkage long sys_setfsuid16(old_uid_t uid) | 117 | asmlinkage long sys_setfsuid16(old_uid_t uid) |
91 | { | 118 | { |
92 | return sys_setfsuid(low2highuid(uid)); | 119 | long ret = sys_setfsuid(low2highuid(uid)); |
120 | /* avoid REGPARM breakage on x86: */ | ||
121 | prevent_tail_call(ret); | ||
122 | return ret; | ||
93 | } | 123 | } |
94 | 124 | ||
95 | asmlinkage long sys_setfsgid16(old_gid_t gid) | 125 | asmlinkage long sys_setfsgid16(old_gid_t gid) |
96 | { | 126 | { |
97 | return sys_setfsgid(low2highgid(gid)); | 127 | long ret = sys_setfsgid(low2highgid(gid)); |
128 | /* avoid REGPARM breakage on x86: */ | ||
129 | prevent_tail_call(ret); | ||
130 | return ret; | ||
98 | } | 131 | } |
99 | 132 | ||
100 | static int groups16_to_user(old_gid_t __user *grouplist, | 133 | static int groups16_to_user(old_gid_t __user *grouplist, |
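Every 16-bit wrapper in kernel/uid16.c now follows the same shape: store the result of the real syscall in a local, run it through prevent_tail_call(), and only then return. The point, as the repeated comment says, is to stop the compiler from turning the wrapper into a tail call on x86 with REGPARM, where a tail-called sys_*() could scribble over the caller's on-stack argument area (for syscalls those stack slots are the saved user registers). A sketch of the pattern in isolation, with an empty asm standing in for the kernel's prevent_tail_call() macro and hypothetical syscall names:

/* stand-in for prevent_tail_call(): forces 'ret' through an asm so the
 * preceding call cannot be compiled as a tail call */
#define no_tail_call(ret) __asm__ ("" : "=r" (ret) : "0" (ret))

extern long sys_real(unsigned int id);      /* hypothetical 32-bit syscall */

long sys_real16(unsigned short id16)        /* hypothetical 16-bit wrapper */
{
	long ret = sys_real((unsigned int)id16);

	no_tail_call(ret);
	return ret;
}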
diff --git a/kernel/user.c b/kernel/user.c index d9deae43a9..2116642f42 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -105,15 +105,19 @@ void free_uid(struct user_struct *up) | |||
105 | { | 105 | { |
106 | unsigned long flags; | 106 | unsigned long flags; |
107 | 107 | ||
108 | if (!up) | ||
109 | return; | ||
110 | |||
108 | local_irq_save(flags); | 111 | local_irq_save(flags); |
109 | if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { | 112 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { |
110 | uid_hash_remove(up); | 113 | uid_hash_remove(up); |
114 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
111 | key_put(up->uid_keyring); | 115 | key_put(up->uid_keyring); |
112 | key_put(up->session_keyring); | 116 | key_put(up->session_keyring); |
113 | kmem_cache_free(uid_cachep, up); | 117 | kmem_cache_free(uid_cachep, up); |
114 | spin_unlock(&uidhash_lock); | 118 | } else { |
119 | local_irq_restore(flags); | ||
115 | } | 120 | } |
116 | local_irq_restore(flags); | ||
117 | } | 121 | } |
118 | 122 | ||
119 | struct user_struct * alloc_uid(uid_t uid) | 123 | struct user_struct * alloc_uid(uid_t uid) |
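The reworked free_uid() hoists the NULL check, keeps interrupts disabled across the whole operation, and releases uidhash_lock as soon as the user_struct is unhashed, before the key_put()/kmem_cache_free() calls rather than after them. The reference-drop pattern it relies on, atomic_dec_and_lock(), only takes the lock when the final reference is being dropped. A user-space model of that pattern (pthread and C11 atomics stand in for the kernel primitives; the kernel's version additionally performs the final decrement under the lock to close a race with concurrent hash lookups):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct user_model {
	atomic_int count;
	struct user_model **hash_slot;        /* hash-table entry pointing at us */
};

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

/* Simplified atomic_dec_and_lock(): return 1 with the lock held only if
 * this call dropped the last reference. */
static int dec_and_lock(atomic_int *count, pthread_mutex_t *lock)
{
	if (atomic_fetch_sub(count, 1) != 1)
		return 0;
	pthread_mutex_lock(lock);
	return 1;
}

void put_user_model(struct user_model *up)
{
	if (!up)
		return;
	if (dec_and_lock(&up->count, &hash_lock)) {
		*up->hash_slot = NULL;            /* uid_hash_remove() */
		pthread_mutex_unlock(&hash_lock); /* unlock before the slow frees */
		free(up);
	}
}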
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b052e2c4c7..880fb415a8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/cpu.h> | 27 | #include <linux/cpu.h> |
28 | #include <linux/notifier.h> | 28 | #include <linux/notifier.h> |
29 | #include <linux/kthread.h> | 29 | #include <linux/kthread.h> |
30 | #include <linux/hardirq.h> | ||
30 | 31 | ||
31 | /* | 32 | /* |
32 | * The per-CPU workqueue (if single thread, we always use the first | 33 | * The per-CPU workqueue (if single thread, we always use the first |
@@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work) | |||
476 | } | 477 | } |
477 | EXPORT_SYMBOL(cancel_rearming_delayed_work); | 478 | EXPORT_SYMBOL(cancel_rearming_delayed_work); |
478 | 479 | ||
480 | /** | ||
481 | * execute_in_process_context - reliably execute the routine with user context | ||
482 | * @fn: the function to execute | ||
483 | * @data: data to pass to the function | ||
484 | * @ew: guaranteed storage for the execute work structure (must | ||
485 | * be available when the work executes) | ||
486 | * | ||
487 | * Executes the function immediately if process context is available, | ||
488 | * otherwise schedules the function for delayed execution. | ||
489 | * | ||
490 | * Returns: 0 - function was executed | ||
491 | * 1 - function was scheduled for execution | ||
492 | */ | ||
493 | int execute_in_process_context(void (*fn)(void *data), void *data, | ||
494 | struct execute_work *ew) | ||
495 | { | ||
496 | if (!in_interrupt()) { | ||
497 | fn(data); | ||
498 | return 0; | ||
499 | } | ||
500 | |||
501 | INIT_WORK(&ew->work, fn, data); | ||
502 | schedule_work(&ew->work); | ||
503 | |||
504 | return 1; | ||
505 | } | ||
506 | EXPORT_SYMBOL_GPL(execute_in_process_context); | ||
507 | |||
479 | int keventd_up(void) | 508 | int keventd_up(void) |
480 | { | 509 | { |
481 | return keventd_wq != NULL; | 510 | return keventd_wq != NULL; |
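A hypothetical caller of the new execute_in_process_context() helper, to show why the execute_work storage has to be supplied by the caller: if the call happens in interrupt context the work is only scheduled, so the storage must outlive the call. The device structure and release function below are made up for illustration.

#include <linux/workqueue.h>
#include <linux/slab.h>

struct mydev {
	struct execute_work release_work;    /* storage handed to the helper */
	/* ... */
};

static void mydev_release(void *data)
{
	struct mydev *dev = data;

	/* teardown that needs process context */
	kfree(dev);
}

static void mydev_put_last_ref(struct mydev *dev)
{
	/* runs mydev_release() immediately in process context,
	 * otherwise defers it to keventd */
	execute_in_process_context(mydev_release, dev, &dev->release_work);
}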
@@ -518,7 +547,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
518 | } | 547 | } |
519 | 548 | ||
520 | /* We're holding the cpucontrol mutex here */ | 549 | /* We're holding the cpucontrol mutex here */ |
521 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | 550 | static int workqueue_cpu_callback(struct notifier_block *nfb, |
522 | unsigned long action, | 551 | unsigned long action, |
523 | void *hcpu) | 552 | void *hcpu) |
524 | { | 553 | { |