Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c              33
-rw-r--r--  kernel/power/main.c        2
-rw-r--r--  kernel/power/power.h       6
-rw-r--r--  kernel/power/snapshot.c   83
-rw-r--r--  kernel/power/swsusp.c    210
-rw-r--r--  kernel/ptrace.c            2
-rw-r--r--  kernel/sched.c           160
-rw-r--r--  kernel/softlockup.c        3
-rw-r--r--  kernel/sysctl.c          136
9 files changed, 400 insertions, 235 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3619e939182e..d61ba88f34e5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,24 @@ EXPORT_SYMBOL_GPL(cpucontrol);
21 21
22static struct notifier_block *cpu_chain; 22static struct notifier_block *cpu_chain;
23 23
24/*
25 * Used to check by callers if they need to acquire the cpucontrol
26 * or not to protect a cpu from being removed. Its sometimes required to
27 * call these functions both for normal operations, and in response to
28 * a cpu being added/removed. If the context of the call is in the same
29 * thread context as a CPU hotplug thread, we dont need to take the lock
30 * since its already protected
31 * check drivers/cpufreq/cpufreq.c for its usage - Ashok Raj
32 */
33
34int current_in_cpu_hotplug(void)
35{
36 return (current->flags & PF_HOTPLUG_CPU);
37}
38
39EXPORT_SYMBOL_GPL(current_in_cpu_hotplug);
40
41
24/* Need to know about CPUs going up/down? */ 42/* Need to know about CPUs going up/down? */
25int register_cpu_notifier(struct notifier_block *nb) 43int register_cpu_notifier(struct notifier_block *nb)
26{ 44{
@@ -94,6 +112,13 @@ int cpu_down(unsigned int cpu)
94 goto out; 112 goto out;
95 } 113 }
96 114
115 /*
116 * Leave a trace in current->flags indicating we are already in
117 * process of performing CPU hotplug. Callers can check if cpucontrol
118 * is already acquired by current thread, and if so not cause
119 * a dead lock by not acquiring the lock
120 */
121 current->flags |= PF_HOTPLUG_CPU;
97 err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 122 err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
98 (void *)(long)cpu); 123 (void *)(long)cpu);
99 if (err == NOTIFY_BAD) { 124 if (err == NOTIFY_BAD) {
@@ -146,6 +171,7 @@ out_thread:
146out_allowed: 171out_allowed:
147 set_cpus_allowed(current, old_allowed); 172 set_cpus_allowed(current, old_allowed);
148out: 173out:
174 current->flags &= ~PF_HOTPLUG_CPU;
149 unlock_cpu_hotplug(); 175 unlock_cpu_hotplug();
150 return err; 176 return err;
151} 177}
@@ -163,6 +189,12 @@ int __devinit cpu_up(unsigned int cpu)
163 ret = -EINVAL; 189 ret = -EINVAL;
164 goto out; 190 goto out;
165 } 191 }
192
193 /*
194 * Leave a trace in current->flags indicating we are already in
195 * process of performing CPU hotplug.
196 */
197 current->flags |= PF_HOTPLUG_CPU;
166 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 198 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
167 if (ret == NOTIFY_BAD) { 199 if (ret == NOTIFY_BAD) {
168 printk("%s: attempt to bring up CPU %u failed\n", 200 printk("%s: attempt to bring up CPU %u failed\n",
@@ -185,6 +217,7 @@ out_notify:
185 if (ret != 0) 217 if (ret != 0)
186 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); 218 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
187out: 219out:
220 current->flags &= ~PF_HOTPLUG_CPU;
188 up(&cpucontrol); 221 up(&cpucontrol);
189 return ret; 222 return ret;
190} 223}
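
The current_in_cpu_hotplug() / PF_HOTPLUG_CPU machinery added above lets code that can be reached both normally and from inside the hotplug path decide whether it must take cpucontrol itself. A minimal sketch of the intended caller pattern, assuming a placeholder my_do_work(); see drivers/cpufreq/cpufreq.c for the real user:

/*
 * Sketch only: take the hotplug lock unless we were invoked from the
 * CPU hotplug path itself, where cpucontrol is already held and taking
 * it again would deadlock.
 */
static void my_cpu_sensitive_op(void)
{
        int need_lock = !current_in_cpu_hotplug();

        if (need_lock)
                lock_cpu_hotplug();     /* i.e. down(&cpucontrol) */

        my_do_work();                   /* CPUs cannot go away under us here */

        if (need_lock)
                unlock_cpu_hotplug();
}
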
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 18d7d693fbba..6ee2cad530e8 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -167,7 +167,7 @@ static int enter_state(suspend_state_t state)
167{ 167{
168 int error; 168 int error;
169 169
170 if (pm_ops->valid && !pm_ops->valid(state)) 170 if (pm_ops && pm_ops->valid && !pm_ops->valid(state))
171 return -ENODEV; 171 return -ENODEV;
172 if (down_trylock(&pm_sem)) 172 if (down_trylock(&pm_sem))
173 return -EBUSY; 173 return -EBUSY;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index d4fd96a135ab..6c042b5ee14b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -65,8 +65,8 @@ extern suspend_pagedir_t *pagedir_save;
65extern asmlinkage int swsusp_arch_suspend(void); 65extern asmlinkage int swsusp_arch_suspend(void);
66extern asmlinkage int swsusp_arch_resume(void); 66extern asmlinkage int swsusp_arch_resume(void);
67 67
68extern int restore_highmem(void); 68extern void free_pagedir(struct pbe *pblist);
69extern struct pbe * alloc_pagedir(unsigned nr_pages); 69extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
70extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); 70extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
71extern void swsusp_free(void); 71extern void swsusp_free(void);
72extern int enough_swap(unsigned nr_pages); 72extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 723f5179883e..4a6dbcefd378 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -88,8 +88,7 @@ static int save_highmem_zone(struct zone *zone)
88 return 0; 88 return 0;
89} 89}
90 90
91 91int save_highmem(void)
92static int save_highmem(void)
93{ 92{
94 struct zone *zone; 93 struct zone *zone;
95 int res = 0; 94 int res = 0;
@@ -120,11 +119,7 @@ int restore_highmem(void)
120 } 119 }
121 return 0; 120 return 0;
122} 121}
123#else 122#endif
124static int save_highmem(void) { return 0; }
125int restore_highmem(void) { return 0; }
126#endif /* CONFIG_HIGHMEM */
127
128 123
129static int pfn_is_nosave(unsigned long pfn) 124static int pfn_is_nosave(unsigned long pfn)
130{ 125{
@@ -216,7 +211,7 @@ static void copy_data_pages(struct pbe *pblist)
216 * free_pagedir - free pages allocated with alloc_pagedir() 211 * free_pagedir - free pages allocated with alloc_pagedir()
217 */ 212 */
218 213
219static void free_pagedir(struct pbe *pblist) 214void free_pagedir(struct pbe *pblist)
220{ 215{
221 struct pbe *pbe; 216 struct pbe *pbe;
222 217
@@ -269,9 +264,30 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
269 pr_debug("create_pbe_list(): initialized %d PBEs\n", num); 264 pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
270} 265}
271 266
272static void *alloc_image_page(void) 267/**
268 * @safe_needed - on resume, for storing the PBE list and the image,
269 * we can only use memory pages that do not conflict with the pages
270 * which had been used before suspend.
271 *
272 * The unsafe pages are marked with the PG_nosave_free flag
273 *
274 * Allocated but unusable (ie eaten) memory pages should be marked
275 * so that swsusp_free() can release them
276 */
277
278static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
273{ 279{
274 void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); 280 void *res;
281
282 if (safe_needed)
283 do {
284 res = (void *)get_zeroed_page(gfp_mask);
285 if (res && PageNosaveFree(virt_to_page(res)))
286 /* This is for swsusp_free() */
287 SetPageNosave(virt_to_page(res));
288 } while (res && PageNosaveFree(virt_to_page(res)));
289 else
290 res = (void *)get_zeroed_page(gfp_mask);
275 if (res) { 291 if (res) {
276 SetPageNosave(virt_to_page(res)); 292 SetPageNosave(virt_to_page(res));
277 SetPageNosaveFree(virt_to_page(res)); 293 SetPageNosaveFree(virt_to_page(res));
@@ -279,6 +295,11 @@ static void *alloc_image_page(void)
279 return res; 295 return res;
280} 296}
281 297
298unsigned long get_safe_page(gfp_t gfp_mask)
299{
300 return (unsigned long)alloc_image_page(gfp_mask, 1);
301}
302
282/** 303/**
283 * alloc_pagedir - Allocate the page directory. 304 * alloc_pagedir - Allocate the page directory.
284 * 305 *
@@ -292,7 +313,7 @@ static void *alloc_image_page(void)
292 * On each page we set up a list of struct_pbe elements. 313 * On each page we set up a list of struct_pbe elements.
293 */ 314 */
294 315
295struct pbe *alloc_pagedir(unsigned int nr_pages) 316struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed)
296{ 317{
297 unsigned int num; 318 unsigned int num;
298 struct pbe *pblist, *pbe; 319 struct pbe *pblist, *pbe;
@@ -301,12 +322,12 @@ struct pbe *alloc_pagedir(unsigned int nr_pages)
301 return NULL; 322 return NULL;
302 323
303 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); 324 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
304 pblist = alloc_image_page(); 325 pblist = alloc_image_page(gfp_mask, safe_needed);
305 /* FIXME: rewrite this ugly loop */ 326 /* FIXME: rewrite this ugly loop */
306 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 327 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
307 pbe = pbe->next, num += PBES_PER_PAGE) { 328 pbe = pbe->next, num += PBES_PER_PAGE) {
308 pbe += PB_PAGE_SKIP; 329 pbe += PB_PAGE_SKIP;
309 pbe->next = alloc_image_page(); 330 pbe->next = alloc_image_page(gfp_mask, safe_needed);
310 } 331 }
311 if (!pbe) { /* get_zeroed_page() failed */ 332 if (!pbe) { /* get_zeroed_page() failed */
312 free_pagedir(pblist); 333 free_pagedir(pblist);
@@ -354,24 +375,32 @@ static int enough_free_mem(unsigned int nr_pages)
354 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 375 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
355} 376}
356 377
378int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
379{
380 struct pbe *p;
381
382 for_each_pbe (p, pblist) {
383 p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
384 if (!p->address)
385 return -ENOMEM;
386 }
387 return 0;
388}
357 389
358static struct pbe *swsusp_alloc(unsigned int nr_pages) 390static struct pbe *swsusp_alloc(unsigned int nr_pages)
359{ 391{
360 struct pbe *pblist, *p; 392 struct pbe *pblist;
361 393
362 if (!(pblist = alloc_pagedir(nr_pages))) { 394 if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
363 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 395 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
364 return NULL; 396 return NULL;
365 } 397 }
366 create_pbe_list(pblist, nr_pages); 398 create_pbe_list(pblist, nr_pages);
367 399
368 for_each_pbe (p, pblist) { 400 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
369 p->address = (unsigned long)alloc_image_page(); 401 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
370 if (!p->address) { 402 swsusp_free();
371 printk(KERN_ERR "suspend: Allocating image pages failed.\n"); 403 return NULL;
372 swsusp_free();
373 return NULL;
374 }
375 } 404 }
376 405
377 return pblist; 406 return pblist;
@@ -382,11 +411,6 @@ asmlinkage int swsusp_save(void)
382 unsigned int nr_pages; 411 unsigned int nr_pages;
383 412
384 pr_debug("swsusp: critical section: \n"); 413 pr_debug("swsusp: critical section: \n");
385 if (save_highmem()) {
386 printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n");
387 restore_highmem();
388 return -ENOMEM;
389 }
390 414
391 drain_local_pages(); 415 drain_local_pages();
392 nr_pages = count_data_pages(); 416 nr_pages = count_data_pages();
@@ -406,11 +430,6 @@ asmlinkage int swsusp_save(void)
406 return -ENOMEM; 430 return -ENOMEM;
407 } 431 }
408 432
409 if (!enough_swap(nr_pages)) {
410 printk(KERN_ERR "swsusp: Not enough free swap\n");
411 return -ENOSPC;
412 }
413
414 pagedir_nosave = swsusp_alloc(nr_pages); 433 pagedir_nosave = swsusp_alloc(nr_pages);
415 if (!pagedir_nosave) 434 if (!pagedir_nosave)
416 return -ENOMEM; 435 return -ENOMEM;
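
The safe_needed branch of alloc_image_page() above is the heart of this change: on resume, a freshly allocated page is only usable if it does not collide with a page of the saved image (flagged PG_nosave_free), and rejected pages are deliberately kept allocated and marked PG_nosave so that swsusp_free() can return them later. The same idea in isolation, with a hypothetical page_collides_with_image() predicate standing in for the PageNosaveFree() test:

/*
 * Sketch of the retry loop: allocate until we hit a page the image does
 * not need; mark the "eaten" rejects so a later sweep (swsusp_free() in
 * the real code) can release them.
 */
static void *get_noncolliding_page_sketch(gfp_t gfp_mask)
{
        void *res;

        do {
                res = (void *)get_zeroed_page(gfp_mask);
                if (res && page_collides_with_image(virt_to_page(res)))
                        SetPageNosave(virt_to_page(res));       /* keep for cleanup */
        } while (res && page_collides_with_image(virt_to_page(res)));

        return res;     /* NULL only when memory is exhausted */
}
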
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index e1ab28b9b217..c05f46e7348f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -73,6 +73,14 @@
73 73
74#include "power.h" 74#include "power.h"
75 75
76#ifdef CONFIG_HIGHMEM
77int save_highmem(void);
78int restore_highmem(void);
79#else
80static int save_highmem(void) { return 0; }
81static int restore_highmem(void) { return 0; }
82#endif
83
76#define CIPHER "aes" 84#define CIPHER "aes"
77#define MAXKEY 32 85#define MAXKEY 32
78#define MAXIV 32 86#define MAXIV 32
@@ -500,6 +508,26 @@ static int write_pagedir(void)
500} 508}
501 509
502/** 510/**
511 * enough_swap - Make sure we have enough swap to save the image.
512 *
513 * Returns TRUE or FALSE after checking the total amount of swap
514 * space avaiable.
515 *
516 * FIXME: si_swapinfo(&i) returns all swap devices information.
517 * We should only consider resume_device.
518 */
519
520static int enough_swap(unsigned int nr_pages)
521{
522 struct sysinfo i;
523
524 si_swapinfo(&i);
525 pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
526 return i.freeswap > (nr_pages + PAGES_FOR_IO +
527 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
528}
529
530/**
503 * write_suspend_image - Write entire image and metadata. 531 * write_suspend_image - Write entire image and metadata.
504 * 532 *
505 */ 533 */
@@ -507,6 +535,11 @@ static int write_suspend_image(void)
507{ 535{
508 int error; 536 int error;
509 537
538 if (!enough_swap(nr_copy_pages)) {
539 printk(KERN_ERR "swsusp: Not enough free swap\n");
540 return -ENOSPC;
541 }
542
510 init_header(); 543 init_header();
511 if ((error = data_write())) 544 if ((error = data_write()))
512 goto FreeData; 545 goto FreeData;
@@ -526,27 +559,6 @@ static int write_suspend_image(void)
526 goto Done; 559 goto Done;
527} 560}
528 561
529/**
530 * enough_swap - Make sure we have enough swap to save the image.
531 *
532 * Returns TRUE or FALSE after checking the total amount of swap
533 * space avaiable.
534 *
535 * FIXME: si_swapinfo(&i) returns all swap devices information.
536 * We should only consider resume_device.
537 */
538
539int enough_swap(unsigned int nr_pages)
540{
541 struct sysinfo i;
542
543 si_swapinfo(&i);
544 pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
545 return i.freeswap > (nr_pages + PAGES_FOR_IO +
546 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
547}
548
549
550/* It is important _NOT_ to umount filesystems at this point. We want 562/* It is important _NOT_ to umount filesystems at this point. We want
551 * them synced (in case something goes wrong) but we DO not want to mark 563 * them synced (in case something goes wrong) but we DO not want to mark
552 * filesystem clean: it is not. (And it does not matter, if we resume 564 * filesystem clean: it is not. (And it does not matter, if we resume
@@ -556,12 +568,15 @@ int swsusp_write(void)
556{ 568{
557 int error; 569 int error;
558 570
571 if ((error = swsusp_swap_check())) {
572 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
573 return error;
574 }
559 lock_swapdevices(); 575 lock_swapdevices();
560 error = write_suspend_image(); 576 error = write_suspend_image();
561 /* This will unlock ignored swap devices since writing is finished */ 577 /* This will unlock ignored swap devices since writing is finished */
562 lock_swapdevices(); 578 lock_swapdevices();
563 return error; 579 return error;
564
565} 580}
566 581
567 582
@@ -569,6 +584,7 @@ int swsusp_write(void)
569int swsusp_suspend(void) 584int swsusp_suspend(void)
570{ 585{
571 int error; 586 int error;
587
572 if ((error = arch_prepare_suspend())) 588 if ((error = arch_prepare_suspend()))
573 return error; 589 return error;
574 local_irq_disable(); 590 local_irq_disable();
@@ -580,15 +596,12 @@ int swsusp_suspend(void)
580 */ 596 */
581 if ((error = device_power_down(PMSG_FREEZE))) { 597 if ((error = device_power_down(PMSG_FREEZE))) {
582 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); 598 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
583 local_irq_enable(); 599 goto Enable_irqs;
584 return error;
585 } 600 }
586 601
587 if ((error = swsusp_swap_check())) { 602 if ((error = save_highmem())) {
588 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); 603 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
589 device_power_up(); 604 goto Restore_highmem;
590 local_irq_enable();
591 return error;
592 } 605 }
593 606
594 save_processor_state(); 607 save_processor_state();
@@ -596,8 +609,10 @@ int swsusp_suspend(void)
596 printk(KERN_ERR "Error %d suspending\n", error); 609 printk(KERN_ERR "Error %d suspending\n", error);
597 /* Restore control flow magically appears here */ 610 /* Restore control flow magically appears here */
598 restore_processor_state(); 611 restore_processor_state();
612Restore_highmem:
599 restore_highmem(); 613 restore_highmem();
600 device_power_up(); 614 device_power_up();
615Enable_irqs:
601 local_irq_enable(); 616 local_irq_enable();
602 return error; 617 return error;
603} 618}
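
The reshuffled swsusp_suspend() above folds the duplicated failure paths into the usual kernel unwind-by-label shape: each step that can fail jumps to the label that undoes only what has already been done (Restore_highmem, then Enable_irqs). swsusp_suspend() is a slight variant in that even the success path falls through the "undo" steps, because control returns there after resume. The generic shape, with placeholder step/undo names:

int staged_setup_sketch(void)
{
        int error;

        error = step_a();
        if (error)
                return error;           /* nothing to unwind yet */

        error = step_b();
        if (error)
                goto Undo_a;

        error = step_c();
        if (error)
                goto Undo_b;

        return 0;                       /* everything held */

Undo_b:
        undo_b();
Undo_a:
        undo_a();
        return error;
}
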
@@ -629,127 +644,43 @@ int swsusp_resume(void)
629} 644}
630 645
631/** 646/**
632 * On resume, for storing the PBE list and the image, 647 * mark_unsafe_pages - mark the pages that cannot be used for storing
633 * we can only use memory pages that do not conflict with the pages 648 * the image during resume, because they conflict with the pages that
634 * which had been used before suspend. 649 * had been used before suspend
635 *
636 * We don't know which pages are usable until we allocate them.
637 *
638 * Allocated but unusable (ie eaten) memory pages are marked so that
639 * swsusp_free() can release them
640 */
641
642unsigned long get_safe_page(gfp_t gfp_mask)
643{
644 unsigned long m;
645
646 do {
647 m = get_zeroed_page(gfp_mask);
648 if (m && PageNosaveFree(virt_to_page(m)))
649 /* This is for swsusp_free() */
650 SetPageNosave(virt_to_page(m));
651 } while (m && PageNosaveFree(virt_to_page(m)));
652 if (m) {
653 /* This is for swsusp_free() */
654 SetPageNosave(virt_to_page(m));
655 SetPageNosaveFree(virt_to_page(m));
656 }
657 return m;
658}
659
660/**
661 * check_pagedir - We ensure here that pages that the PBEs point to
662 * won't collide with pages where we're going to restore from the loaded
663 * pages later
664 */
665
666static int check_pagedir(struct pbe *pblist)
667{
668 struct pbe *p;
669
670 /* This is necessary, so that we can free allocated pages
671 * in case of failure
672 */
673 for_each_pbe (p, pblist)
674 p->address = 0UL;
675
676 for_each_pbe (p, pblist) {
677 p->address = get_safe_page(GFP_ATOMIC);
678 if (!p->address)
679 return -ENOMEM;
680 }
681 return 0;
682}
683
684/**
685 * swsusp_pagedir_relocate - It is possible, that some memory pages
686 * occupied by the list of PBEs collide with pages where we're going to
687 * restore from the loaded pages later. We relocate them here.
688 */ 650 */
689 651
690static struct pbe *swsusp_pagedir_relocate(struct pbe *pblist) 652static void mark_unsafe_pages(struct pbe *pblist)
691{ 653{
692 struct zone *zone; 654 struct zone *zone;
693 unsigned long zone_pfn; 655 unsigned long zone_pfn;
694 struct pbe *pbpage, *tail, *p; 656 struct pbe *p;
695 void *m;
696 int rel = 0;
697 657
698 if (!pblist) /* a sanity check */ 658 if (!pblist) /* a sanity check */
699 return NULL; 659 return;
700
701 pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
702 swsusp_info.pagedir_pages);
703 660
704 /* Clear page flags */ 661 /* Clear page flags */
705
706 for_each_zone (zone) { 662 for_each_zone (zone) {
707 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 663 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
708 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) 664 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
709 ClearPageNosaveFree(pfn_to_page(zone_pfn + 665 ClearPageNosaveFree(pfn_to_page(zone_pfn +
710 zone->zone_start_pfn)); 666 zone->zone_start_pfn));
711 } 667 }
712 668
713 /* Mark orig addresses */ 669 /* Mark orig addresses */
714
715 for_each_pbe (p, pblist) 670 for_each_pbe (p, pblist)
716 SetPageNosaveFree(virt_to_page(p->orig_address)); 671 SetPageNosaveFree(virt_to_page(p->orig_address));
717 672
718 tail = pblist + PB_PAGE_SKIP; 673}
719
720 /* Relocate colliding pages */
721
722 for_each_pb_page (pbpage, pblist) {
723 if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
724 m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD);
725 if (!m)
726 return NULL;
727 memcpy(m, (void *)pbpage, PAGE_SIZE);
728 if (pbpage == pblist)
729 pblist = (struct pbe *)m;
730 else
731 tail->next = (struct pbe *)m;
732 pbpage = (struct pbe *)m;
733
734 /* We have to link the PBEs again */
735 for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
736 if (p->next) /* needed to save the end */
737 p->next = p + 1;
738
739 rel++;
740 }
741 tail = pbpage + PB_PAGE_SKIP;
742 }
743 674
744 /* This is for swsusp_free() */ 675static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
745 for_each_pb_page (pbpage, pblist) { 676{
746 SetPageNosave(virt_to_page(pbpage)); 677 /* We assume both lists contain the same number of elements */
747 SetPageNosaveFree(virt_to_page(pbpage)); 678 while (src) {
679 dst->orig_address = src->orig_address;
680 dst->swap_address = src->swap_address;
681 dst = dst->next;
682 src = src->next;
748 } 683 }
749
750 printk("swsusp: Relocated %d pages\n", rel);
751
752 return pblist;
753} 684}
754 685
755/* 686/*
@@ -888,7 +819,7 @@ static int check_sig(void)
888 * Reset swap signature now. 819 * Reset swap signature now.
889 */ 820 */
890 error = bio_write_page(0, &swsusp_header); 821 error = bio_write_page(0, &swsusp_header);
891 } else { 822 } else {
892 return -EINVAL; 823 return -EINVAL;
893 } 824 }
894 if (!error) 825 if (!error)
@@ -990,20 +921,25 @@ static int read_suspend_image(void)
990 int error = 0; 921 int error = 0;
991 struct pbe *p; 922 struct pbe *p;
992 923
993 if (!(p = alloc_pagedir(nr_copy_pages))) 924 if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0)))
994 return -ENOMEM; 925 return -ENOMEM;
995 926
996 if ((error = read_pagedir(p))) 927 if ((error = read_pagedir(p)))
997 return error; 928 return error;
998
999 create_pbe_list(p, nr_copy_pages); 929 create_pbe_list(p, nr_copy_pages);
1000 930 mark_unsafe_pages(p);
1001 if (!(pagedir_nosave = swsusp_pagedir_relocate(p))) 931 pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
932 if (pagedir_nosave) {
933 create_pbe_list(pagedir_nosave, nr_copy_pages);
934 copy_page_backup_list(pagedir_nosave, p);
935 }
936 free_pagedir(p);
937 if (!pagedir_nosave)
1002 return -ENOMEM; 938 return -ENOMEM;
1003 939
1004 /* Allocate memory for the image and read the data from swap */ 940 /* Allocate memory for the image and read the data from swap */
1005 941
1006 error = check_pagedir(pagedir_nosave); 942 error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1);
1007 943
1008 if (!error) 944 if (!error)
1009 error = data_read(pagedir_nosave); 945 error = data_read(pagedir_nosave);
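
The read_suspend_image() rework above replaces swsusp_pagedir_relocate() with a copy-into-safe-pages strategy: flag the pages the image will need (mark_unsafe_pages), build a second pagedir out of pages known not to collide (alloc_pagedir with safe_needed = 1), copy the orig/swap addresses across, and drop the old list. In outline only, with error handling trimmed (the real code lives inline in read_suspend_image()):

static struct pbe *build_safe_pagedir_sketch(struct pbe *old, unsigned int n)
{
        struct pbe *safe;

        mark_unsafe_pages(old);                 /* PG_nosave_free on image pages */
        safe = alloc_pagedir(n, GFP_ATOMIC, 1); /* safe_needed: avoid them */
        if (safe) {
                create_pbe_list(safe, n);
                copy_page_backup_list(safe, old);
        }
        free_pagedir(old);
        return safe;                            /* NULL means -ENOMEM upstream */
}
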
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5b8dd98a230e..b88d4186cd7a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -155,7 +155,7 @@ int ptrace_attach(struct task_struct *task)
155 retval = -EPERM; 155 retval = -EPERM;
156 if (task->pid <= 1) 156 if (task->pid <= 1)
157 goto bad; 157 goto bad;
158 if (task == current) 158 if (task->tgid == current->tgid)
159 goto bad; 159 goto bad;
160 /* the same process cannot be attached many times */ 160 /* the same process cannot be attached many times */
161 if (task->ptrace & PT_PTRACED) 161 if (task->ptrace & PT_PTRACED)
diff --git a/kernel/sched.c b/kernel/sched.c
index 3ce26954be12..b6506671b2be 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,6 +206,7 @@ struct runqueue {
206 */ 206 */
207 unsigned long nr_running; 207 unsigned long nr_running;
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
209 unsigned long prio_bias;
209 unsigned long cpu_load[3]; 210 unsigned long cpu_load[3];
210#endif 211#endif
211 unsigned long long nr_switches; 212 unsigned long long nr_switches;
@@ -659,13 +660,68 @@ static int effective_prio(task_t *p)
659 return prio; 660 return prio;
660} 661}
661 662
663#ifdef CONFIG_SMP
664static inline void inc_prio_bias(runqueue_t *rq, int prio)
665{
666 rq->prio_bias += MAX_PRIO - prio;
667}
668
669static inline void dec_prio_bias(runqueue_t *rq, int prio)
670{
671 rq->prio_bias -= MAX_PRIO - prio;
672}
673
674static inline void inc_nr_running(task_t *p, runqueue_t *rq)
675{
676 rq->nr_running++;
677 if (rt_task(p)) {
678 if (p != rq->migration_thread)
679 /*
680 * The migration thread does the actual balancing. Do
681 * not bias by its priority as the ultra high priority
682 * will skew balancing adversely.
683 */
684 inc_prio_bias(rq, p->prio);
685 } else
686 inc_prio_bias(rq, p->static_prio);
687}
688
689static inline void dec_nr_running(task_t *p, runqueue_t *rq)
690{
691 rq->nr_running--;
692 if (rt_task(p)) {
693 if (p != rq->migration_thread)
694 dec_prio_bias(rq, p->prio);
695 } else
696 dec_prio_bias(rq, p->static_prio);
697}
698#else
699static inline void inc_prio_bias(runqueue_t *rq, int prio)
700{
701}
702
703static inline void dec_prio_bias(runqueue_t *rq, int prio)
704{
705}
706
707static inline void inc_nr_running(task_t *p, runqueue_t *rq)
708{
709 rq->nr_running++;
710}
711
712static inline void dec_nr_running(task_t *p, runqueue_t *rq)
713{
714 rq->nr_running--;
715}
716#endif
717
662/* 718/*
663 * __activate_task - move a task to the runqueue. 719 * __activate_task - move a task to the runqueue.
664 */ 720 */
665static inline void __activate_task(task_t *p, runqueue_t *rq) 721static inline void __activate_task(task_t *p, runqueue_t *rq)
666{ 722{
667 enqueue_task(p, rq->active); 723 enqueue_task(p, rq->active);
668 rq->nr_running++; 724 inc_nr_running(p, rq);
669} 725}
670 726
671/* 727/*
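
The new prio_bias field above weights each runnable task by MAX_PRIO - prio, so a runqueue full of high-priority (negative nice) tasks reports a larger bias than one with the same nr_running of heavily niced tasks; RT tasks other than the migration thread contribute their RT priority instead. Rough numbers, assuming the conventional MAX_PRIO of 140 and NICE_TO_PRIO(nice) = 120 + nice:

/*
 * Per-task contribution to rq->prio_bias (illustrative values):
 *
 *   nice   0  -> static_prio 120 -> MAX_PRIO - 120 = 20
 *   nice -10  -> static_prio 110 -> MAX_PRIO - 110 = 30
 *   nice  19  -> static_prio 139 -> MAX_PRIO - 139 =  1
 *
 * So a runqueue running { nice -10, nice 0 } has prio_bias = 50, while
 * { nice 19, nice 19 } has prio_bias = 2, even though both have
 * nr_running == 2.
 */
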
@@ -674,7 +730,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
674static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 730static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
675{ 731{
676 enqueue_task_head(p, rq->active); 732 enqueue_task_head(p, rq->active);
677 rq->nr_running++; 733 inc_nr_running(p, rq);
678} 734}
679 735
680static int recalc_task_prio(task_t *p, unsigned long long now) 736static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -759,7 +815,8 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
759 } 815 }
760#endif 816#endif
761 817
762 p->prio = recalc_task_prio(p, now); 818 if (!rt_task(p))
819 p->prio = recalc_task_prio(p, now);
763 820
764 /* 821 /*
765 * This checks to make sure it's not an uninterruptible task 822 * This checks to make sure it's not an uninterruptible task
@@ -793,7 +850,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
793 */ 850 */
794static void deactivate_task(struct task_struct *p, runqueue_t *rq) 851static void deactivate_task(struct task_struct *p, runqueue_t *rq)
795{ 852{
796 rq->nr_running--; 853 dec_nr_running(p, rq);
797 dequeue_task(p, p->array); 854 dequeue_task(p, p->array);
798 p->array = NULL; 855 p->array = NULL;
799} 856}
@@ -808,21 +865,28 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
808#ifdef CONFIG_SMP 865#ifdef CONFIG_SMP
809static void resched_task(task_t *p) 866static void resched_task(task_t *p)
810{ 867{
811 int need_resched, nrpolling; 868 int cpu;
812 869
813 assert_spin_locked(&task_rq(p)->lock); 870 assert_spin_locked(&task_rq(p)->lock);
814 871
815 /* minimise the chance of sending an interrupt to poll_idle() */ 872 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
816 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 873 return;
817 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); 874
818 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 875 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
876
877 cpu = task_cpu(p);
878 if (cpu == smp_processor_id())
879 return;
819 880
820 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) 881 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
821 smp_send_reschedule(task_cpu(p)); 882 smp_mb();
883 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
884 smp_send_reschedule(cpu);
822} 885}
823#else 886#else
824static inline void resched_task(task_t *p) 887static inline void resched_task(task_t *p)
825{ 888{
889 assert_spin_locked(&task_rq(p)->lock);
826 set_tsk_need_resched(p); 890 set_tsk_need_resched(p);
827} 891}
828#endif 892#endif
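
The rewritten resched_task() relies on an ordering handshake with a polling idle loop instead of re-testing flags afterwards: the waker stores TIF_NEED_RESCHED, issues smp_mb(), and only sends the IPI if TIF_POLLING_NRFLAG is clear, on the assumption that a CPU still advertising POLLING is spinning on the flag and will notice it without an interrupt. A sketch of the two sides (the idle half is illustrative only, not the actual poll_idle() code):

static void waker_side_sketch(task_t *p, int cpu)
{
        set_tsk_thread_flag(p, TIF_NEED_RESCHED);
        smp_mb();       /* publish the flag before sampling POLLING */
        if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
                smp_send_reschedule(cpu);       /* target really is halted */
}

static void polling_idle_side_sketch(void)
{
        set_tsk_thread_flag(current, TIF_POLLING_NRFLAG);
        while (!test_tsk_thread_flag(current, TIF_NEED_RESCHED))
                cpu_relax();    /* the waker's store becomes visible here */
        clear_tsk_thread_flag(current, TIF_POLLING_NRFLAG);
}
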
@@ -930,27 +994,61 @@ void kick_process(task_t *p)
930 * We want to under-estimate the load of migration sources, to 994 * We want to under-estimate the load of migration sources, to
931 * balance conservatively. 995 * balance conservatively.
932 */ 996 */
933static inline unsigned long source_load(int cpu, int type) 997static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
934{ 998{
935 runqueue_t *rq = cpu_rq(cpu); 999 runqueue_t *rq = cpu_rq(cpu);
936 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1000 unsigned long running = rq->nr_running;
1001 unsigned long source_load, cpu_load = rq->cpu_load[type-1],
1002 load_now = running * SCHED_LOAD_SCALE;
1003
937 if (type == 0) 1004 if (type == 0)
938 return load_now; 1005 source_load = load_now;
1006 else
1007 source_load = min(cpu_load, load_now);
939 1008
940 return min(rq->cpu_load[type-1], load_now); 1009 if (running > 1 || (idle == NOT_IDLE && running))
1010 /*
1011 * If we are busy rebalancing the load is biased by
1012 * priority to create 'nice' support across cpus. When
1013 * idle rebalancing we should only bias the source_load if
1014 * there is more than one task running on that queue to
1015 * prevent idle rebalance from trying to pull tasks from a
1016 * queue with only one running task.
1017 */
1018 source_load = source_load * rq->prio_bias / running;
1019
1020 return source_load;
1021}
1022
1023static inline unsigned long source_load(int cpu, int type)
1024{
1025 return __source_load(cpu, type, NOT_IDLE);
941} 1026}
942 1027
943/* 1028/*
944 * Return a high guess at the load of a migration-target cpu 1029 * Return a high guess at the load of a migration-target cpu
945 */ 1030 */
946static inline unsigned long target_load(int cpu, int type) 1031static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
947{ 1032{
948 runqueue_t *rq = cpu_rq(cpu); 1033 runqueue_t *rq = cpu_rq(cpu);
949 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1034 unsigned long running = rq->nr_running;
1035 unsigned long target_load, cpu_load = rq->cpu_load[type-1],
1036 load_now = running * SCHED_LOAD_SCALE;
1037
950 if (type == 0) 1038 if (type == 0)
951 return load_now; 1039 target_load = load_now;
1040 else
1041 target_load = max(cpu_load, load_now);
1042
1043 if (running > 1 || (idle == NOT_IDLE && running))
1044 target_load = target_load * rq->prio_bias / running;
952 1045
953 return max(rq->cpu_load[type-1], load_now); 1046 return target_load;
1047}
1048
1049static inline unsigned long target_load(int cpu, int type)
1050{
1051 return __target_load(cpu, type, NOT_IDLE);
954} 1052}
955 1053
956/* 1054/*
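
__source_load()/__target_load() above then scale the raw load by prio_bias / nr_running whenever there is more than one runnable task, or any runnable task during a non-idle rebalance. Continuing the bias numbers from the earlier sketch and assuming SCHED_LOAD_SCALE = 128:

/*
 * type == 0, NOT_IDLE, two runnable tasks on each queue:
 *
 *   rq A: { nice 0, nice 0 }    load_now = 2 * 128 = 256
 *                               prio_bias = 40
 *                               biased load = 256 * 40 / 2 = 5120
 *
 *   rq B: { nice 0, nice -10 }  load_now = 256
 *                               prio_bias = 50
 *                               biased load = 256 * 50 / 2 = 6400
 *
 * Equal nr_running, but B now looks busier, so the balancer pulls from
 * B first; with a single runnable task during idle balancing the bias
 * is skipped so a lone task is not yanked around just for its nice level.
 */
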
@@ -1411,7 +1509,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1411 list_add_tail(&p->run_list, &current->run_list); 1509 list_add_tail(&p->run_list, &current->run_list);
1412 p->array = current->array; 1510 p->array = current->array;
1413 p->array->nr_active++; 1511 p->array->nr_active++;
1414 rq->nr_running++; 1512 inc_nr_running(p, rq);
1415 } 1513 }
1416 set_need_resched(); 1514 set_need_resched();
1417 } else 1515 } else
@@ -1756,9 +1854,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1756 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1854 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1757{ 1855{
1758 dequeue_task(p, src_array); 1856 dequeue_task(p, src_array);
1759 src_rq->nr_running--; 1857 dec_nr_running(p, src_rq);
1760 set_task_cpu(p, this_cpu); 1858 set_task_cpu(p, this_cpu);
1761 this_rq->nr_running++; 1859 inc_nr_running(p, this_rq);
1762 enqueue_task(p, this_array); 1860 enqueue_task(p, this_array);
1763 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1861 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1764 + this_rq->timestamp_last_tick; 1862 + this_rq->timestamp_last_tick;
@@ -1937,9 +2035,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1937 2035
1938 /* Bias balancing toward cpus of our domain */ 2036 /* Bias balancing toward cpus of our domain */
1939 if (local_group) 2037 if (local_group)
1940 load = target_load(i, load_idx); 2038 load = __target_load(i, load_idx, idle);
1941 else 2039 else
1942 load = source_load(i, load_idx); 2040 load = __source_load(i, load_idx, idle);
1943 2041
1944 avg_load += load; 2042 avg_load += load;
1945 } 2043 }
@@ -2044,14 +2142,15 @@ out_balanced:
2044/* 2142/*
2045 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2143 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2046 */ 2144 */
2047static runqueue_t *find_busiest_queue(struct sched_group *group) 2145static runqueue_t *find_busiest_queue(struct sched_group *group,
2146 enum idle_type idle)
2048{ 2147{
2049 unsigned long load, max_load = 0; 2148 unsigned long load, max_load = 0;
2050 runqueue_t *busiest = NULL; 2149 runqueue_t *busiest = NULL;
2051 int i; 2150 int i;
2052 2151
2053 for_each_cpu_mask(i, group->cpumask) { 2152 for_each_cpu_mask(i, group->cpumask) {
2054 load = source_load(i, 0); 2153 load = __source_load(i, 0, idle);
2055 2154
2056 if (load > max_load) { 2155 if (load > max_load) {
2057 max_load = load; 2156 max_load = load;
@@ -2095,7 +2194,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2095 goto out_balanced; 2194 goto out_balanced;
2096 } 2195 }
2097 2196
2098 busiest = find_busiest_queue(group); 2197 busiest = find_busiest_queue(group, idle);
2099 if (!busiest) { 2198 if (!busiest) {
2100 schedstat_inc(sd, lb_nobusyq[idle]); 2199 schedstat_inc(sd, lb_nobusyq[idle]);
2101 goto out_balanced; 2200 goto out_balanced;
@@ -2218,7 +2317,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2218 goto out_balanced; 2317 goto out_balanced;
2219 } 2318 }
2220 2319
2221 busiest = find_busiest_queue(group); 2320 busiest = find_busiest_queue(group, NEWLY_IDLE);
2222 if (!busiest) { 2321 if (!busiest) {
2223 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2322 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2224 goto out_balanced; 2323 goto out_balanced;
@@ -3451,8 +3550,10 @@ void set_user_nice(task_t *p, long nice)
3451 goto out_unlock; 3550 goto out_unlock;
3452 } 3551 }
3453 array = p->array; 3552 array = p->array;
3454 if (array) 3553 if (array) {
3455 dequeue_task(p, array); 3554 dequeue_task(p, array);
3555 dec_prio_bias(rq, p->static_prio);
3556 }
3456 3557
3457 old_prio = p->prio; 3558 old_prio = p->prio;
3458 new_prio = NICE_TO_PRIO(nice); 3559 new_prio = NICE_TO_PRIO(nice);
@@ -3462,6 +3563,7 @@ void set_user_nice(task_t *p, long nice)
3462 3563
3463 if (array) { 3564 if (array) {
3464 enqueue_task(p, array); 3565 enqueue_task(p, array);
3566 inc_prio_bias(rq, p->static_prio);
3465 /* 3567 /*
3466 * If the task increased its priority or is running and 3568 * If the task increased its priority or is running and
3467 * lowered its priority, then reschedule its CPU: 3569 * lowered its priority, then reschedule its CPU:
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index a2dcceb9437d..c67189a25d52 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -73,9 +73,6 @@ void softlockup_tick(struct pt_regs *regs)
73static int watchdog(void * __bind_cpu) 73static int watchdog(void * __bind_cpu)
74{ 74{
75 struct sched_param param = { .sched_priority = 99 }; 75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79 76
80 sched_setscheduler(current, SCHED_FIFO, &param); 77 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE; 78 current->flags |= PF_NOFREEZE;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c4f35f96884d..9990e10192e8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -169,7 +169,7 @@ struct file_operations proc_sys_file_operations = {
169 169
170extern struct proc_dir_entry *proc_sys_root; 170extern struct proc_dir_entry *proc_sys_root;
171 171
172static void register_proc_table(ctl_table *, struct proc_dir_entry *); 172static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
173static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); 173static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
174#endif 174#endif
175 175
@@ -992,10 +992,51 @@ static ctl_table dev_table[] = {
992 992
993extern void init_irq_proc (void); 993extern void init_irq_proc (void);
994 994
995static DEFINE_SPINLOCK(sysctl_lock);
996
997/* called under sysctl_lock */
998static int use_table(struct ctl_table_header *p)
999{
1000 if (unlikely(p->unregistering))
1001 return 0;
1002 p->used++;
1003 return 1;
1004}
1005
1006/* called under sysctl_lock */
1007static void unuse_table(struct ctl_table_header *p)
1008{
1009 if (!--p->used)
1010 if (unlikely(p->unregistering))
1011 complete(p->unregistering);
1012}
1013
1014/* called under sysctl_lock, will reacquire if has to wait */
1015static void start_unregistering(struct ctl_table_header *p)
1016{
1017 /*
1018 * if p->used is 0, nobody will ever touch that entry again;
1019 * we'll eliminate all paths to it before dropping sysctl_lock
1020 */
1021 if (unlikely(p->used)) {
1022 struct completion wait;
1023 init_completion(&wait);
1024 p->unregistering = &wait;
1025 spin_unlock(&sysctl_lock);
1026 wait_for_completion(&wait);
1027 spin_lock(&sysctl_lock);
1028 }
1029 /*
1030 * do not remove from the list until nobody holds it; walking the
1031 * list in do_sysctl() relies on that.
1032 */
1033 list_del_init(&p->ctl_entry);
1034}
1035
995void __init sysctl_init(void) 1036void __init sysctl_init(void)
996{ 1037{
997#ifdef CONFIG_PROC_FS 1038#ifdef CONFIG_PROC_FS
998 register_proc_table(root_table, proc_sys_root); 1039 register_proc_table(root_table, proc_sys_root, &root_table_header);
999 init_irq_proc(); 1040 init_irq_proc();
1000#endif 1041#endif
1001} 1042}
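
use_table()/unuse_table()/start_unregistering() above implement a small "usage count plus completion" idiom: lookups bump head->used under sysctl_lock and refuse tables already being torn down, while unregister_sysctl_table() parks on a stack-allocated completion until the last user drops the count, and only then frees the header. do_sysctl() and do_rw_proc() below wrap their table accesses in exactly this get/put pair. The same shape written out generically (struct and function names are placeholders, not kernel API):

struct guarded_obj {
        int used;                               /* readers currently inside */
        struct completion *unregistering;       /* non-NULL once teardown starts */
};

static DEFINE_SPINLOCK(guard_lock);

static int obj_get(struct guarded_obj *p)       /* called under guard_lock */
{
        if (unlikely(p->unregistering))
                return 0;                       /* object is going away */
        p->used++;
        return 1;
}

static void obj_put(struct guarded_obj *p)      /* called under guard_lock */
{
        if (!--p->used && unlikely(p->unregistering))
                complete(p->unregistering);     /* last user wakes the killer */
}

static void obj_wait_unused(struct guarded_obj *p)      /* under guard_lock */
{
        if (p->used) {
                struct completion wait;

                init_completion(&wait);
                p->unregistering = &wait;
                spin_unlock(&guard_lock);
                wait_for_completion(&wait);
                spin_lock(&guard_lock);
        }
        /* caller must now unlink the object before dropping guard_lock */
}
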
@@ -1004,6 +1045,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1004 void __user *newval, size_t newlen) 1045 void __user *newval, size_t newlen)
1005{ 1046{
1006 struct list_head *tmp; 1047 struct list_head *tmp;
1048 int error = -ENOTDIR;
1007 1049
1008 if (nlen <= 0 || nlen >= CTL_MAXNAME) 1050 if (nlen <= 0 || nlen >= CTL_MAXNAME)
1009 return -ENOTDIR; 1051 return -ENOTDIR;
@@ -1012,20 +1054,30 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1012 if (!oldlenp || get_user(old_len, oldlenp)) 1054 if (!oldlenp || get_user(old_len, oldlenp))
1013 return -EFAULT; 1055 return -EFAULT;
1014 } 1056 }
1057 spin_lock(&sysctl_lock);
1015 tmp = &root_table_header.ctl_entry; 1058 tmp = &root_table_header.ctl_entry;
1016 do { 1059 do {
1017 struct ctl_table_header *head = 1060 struct ctl_table_header *head =
1018 list_entry(tmp, struct ctl_table_header, ctl_entry); 1061 list_entry(tmp, struct ctl_table_header, ctl_entry);
1019 void *context = NULL; 1062 void *context = NULL;
1020 int error = parse_table(name, nlen, oldval, oldlenp, 1063
1064 if (!use_table(head))
1065 continue;
1066
1067 spin_unlock(&sysctl_lock);
1068
1069 error = parse_table(name, nlen, oldval, oldlenp,
1021 newval, newlen, head->ctl_table, 1070 newval, newlen, head->ctl_table,
1022 &context); 1071 &context);
1023 kfree(context); 1072 kfree(context);
1073
1074 spin_lock(&sysctl_lock);
1075 unuse_table(head);
1024 if (error != -ENOTDIR) 1076 if (error != -ENOTDIR)
1025 return error; 1077 break;
1026 tmp = tmp->next; 1078 } while ((tmp = tmp->next) != &root_table_header.ctl_entry);
1027 } while (tmp != &root_table_header.ctl_entry); 1079 spin_unlock(&sysctl_lock);
1028 return -ENOTDIR; 1080 return error;
1029} 1081}
1030 1082
1031asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 1083asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
@@ -1236,12 +1288,16 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1236 return NULL; 1288 return NULL;
1237 tmp->ctl_table = table; 1289 tmp->ctl_table = table;
1238 INIT_LIST_HEAD(&tmp->ctl_entry); 1290 INIT_LIST_HEAD(&tmp->ctl_entry);
1291 tmp->used = 0;
1292 tmp->unregistering = NULL;
1293 spin_lock(&sysctl_lock);
1239 if (insert_at_head) 1294 if (insert_at_head)
1240 list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); 1295 list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
1241 else 1296 else
1242 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1297 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1298 spin_unlock(&sysctl_lock);
1243#ifdef CONFIG_PROC_FS 1299#ifdef CONFIG_PROC_FS
1244 register_proc_table(table, proc_sys_root); 1300 register_proc_table(table, proc_sys_root, tmp);
1245#endif 1301#endif
1246 return tmp; 1302 return tmp;
1247} 1303}
@@ -1255,10 +1311,13 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1255 */ 1311 */
1256void unregister_sysctl_table(struct ctl_table_header * header) 1312void unregister_sysctl_table(struct ctl_table_header * header)
1257{ 1313{
1258 list_del(&header->ctl_entry); 1314 might_sleep();
1315 spin_lock(&sysctl_lock);
1316 start_unregistering(header);
1259#ifdef CONFIG_PROC_FS 1317#ifdef CONFIG_PROC_FS
1260 unregister_proc_table(header->ctl_table, proc_sys_root); 1318 unregister_proc_table(header->ctl_table, proc_sys_root);
1261#endif 1319#endif
1320 spin_unlock(&sysctl_lock);
1262 kfree(header); 1321 kfree(header);
1263} 1322}
1264 1323
@@ -1269,7 +1328,7 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1269#ifdef CONFIG_PROC_FS 1328#ifdef CONFIG_PROC_FS
1270 1329
1271/* Scan the sysctl entries in table and add them all into /proc */ 1330/* Scan the sysctl entries in table and add them all into /proc */
1272static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) 1331static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
1273{ 1332{
1274 struct proc_dir_entry *de; 1333 struct proc_dir_entry *de;
1275 int len; 1334 int len;
@@ -1305,13 +1364,14 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
1305 de = create_proc_entry(table->procname, mode, root); 1364 de = create_proc_entry(table->procname, mode, root);
1306 if (!de) 1365 if (!de)
1307 continue; 1366 continue;
1367 de->set = set;
1308 de->data = (void *) table; 1368 de->data = (void *) table;
1309 if (table->proc_handler) 1369 if (table->proc_handler)
1310 de->proc_fops = &proc_sys_file_operations; 1370 de->proc_fops = &proc_sys_file_operations;
1311 } 1371 }
1312 table->de = de; 1372 table->de = de;
1313 if (de->mode & S_IFDIR) 1373 if (de->mode & S_IFDIR)
1314 register_proc_table(table->child, de); 1374 register_proc_table(table->child, de, set);
1315 } 1375 }
1316} 1376}
1317 1377
@@ -1336,6 +1396,13 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root
1336 continue; 1396 continue;
1337 } 1397 }
1338 1398
1399 /*
1400 * In any case, mark the entry as goner; we'll keep it
1401 * around if it's busy, but we'll know to do nothing with
1402 * its fields. We are under sysctl_lock here.
1403 */
1404 de->data = NULL;
1405
1339 /* Don't unregister proc entries that are still being used.. */ 1406 /* Don't unregister proc entries that are still being used.. */
1340 if (atomic_read(&de->count)) 1407 if (atomic_read(&de->count))
1341 continue; 1408 continue;
@@ -1349,27 +1416,38 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1349 size_t count, loff_t *ppos) 1416 size_t count, loff_t *ppos)
1350{ 1417{
1351 int op; 1418 int op;
1352 struct proc_dir_entry *de; 1419 struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
1353 struct ctl_table *table; 1420 struct ctl_table *table;
1354 size_t res; 1421 size_t res;
1355 ssize_t error; 1422 ssize_t error = -ENOTDIR;
1356
1357 de = PDE(file->f_dentry->d_inode);
1358 if (!de || !de->data)
1359 return -ENOTDIR;
1360 table = (struct ctl_table *) de->data;
1361 if (!table || !table->proc_handler)
1362 return -ENOTDIR;
1363 op = (write ? 002 : 004);
1364 if (ctl_perm(table, op))
1365 return -EPERM;
1366 1423
1367 res = count; 1424 spin_lock(&sysctl_lock);
1368 1425 if (de && de->data && use_table(de->set)) {
1369 error = (*table->proc_handler) (table, write, file, buf, &res, ppos); 1426 /*
1370 if (error) 1427 * at that point we know that sysctl was not unregistered
1371 return error; 1428 * and won't be until we finish
1372 return res; 1429 */
1430 spin_unlock(&sysctl_lock);
1431 table = (struct ctl_table *) de->data;
1432 if (!table || !table->proc_handler)
1433 goto out;
1434 error = -EPERM;
1435 op = (write ? 002 : 004);
1436 if (ctl_perm(table, op))
1437 goto out;
1438
1439 /* careful: calling conventions are nasty here */
1440 res = count;
1441 error = (*table->proc_handler)(table, write, file,
1442 buf, &res, ppos);
1443 if (!error)
1444 error = res;
1445 out:
1446 spin_lock(&sysctl_lock);
1447 unuse_table(de->set);
1448 }
1449 spin_unlock(&sysctl_lock);
1450 return error;
1373} 1451}
1374 1452
1375static int proc_opensys(struct inode *inode, struct file *file) 1453static int proc_opensys(struct inode *inode, struct file *file)