-rw-r--r-- Documentation/kernel-parameters.txt | 43
-rw-r--r-- Documentation/sched-design-CFS.txt | 119
-rw-r--r-- arch/i386/kernel/smpboot.c | 12
-rw-r--r-- arch/i386/kernel/tsc.c | 9
-rw-r--r-- arch/ia64/kernel/setup.c | 6
-rw-r--r-- arch/mips/kernel/smp.c | 11
-rw-r--r-- arch/sparc/kernel/smp.c | 10
-rw-r--r-- arch/sparc64/kernel/smp.c | 27
-rw-r--r-- drivers/ide/arm/icside.c | 16
-rw-r--r-- drivers/ide/cris/ide-cris.c | 2
-rw-r--r-- drivers/ide/ide-cd.c | 6
-rw-r--r-- drivers/ide/ide-cd.h | 2
-rw-r--r-- drivers/ide/ide-disk.c | 8
-rw-r--r-- drivers/ide/ide-dma.c | 110
-rw-r--r-- drivers/ide/ide-io.c | 4
-rw-r--r-- drivers/ide/ide-iops.c | 8
-rw-r--r-- drivers/ide/ide-probe.c | 10
-rw-r--r-- drivers/ide/ide-proc.c | 34
-rw-r--r-- drivers/ide/ide-timing.h | 56
-rw-r--r-- drivers/ide/ide.c | 33
-rw-r--r-- drivers/ide/legacy/hd.c | 2
-rw-r--r-- drivers/ide/legacy/macide.c | 14
-rw-r--r-- drivers/ide/mips/au1xxx-ide.c | 24
-rw-r--r-- drivers/ide/pci/aec62xx.c | 119
-rw-r--r-- drivers/ide/pci/alim15x3.c | 78
-rw-r--r-- drivers/ide/pci/amd74xx.c | 127
-rw-r--r-- drivers/ide/pci/atiixp.c | 5
-rw-r--r-- drivers/ide/pci/cmd64x.c | 130
-rw-r--r-- drivers/ide/pci/cs5535.c | 6
-rw-r--r-- drivers/ide/pci/hpt366.c | 170
-rw-r--r-- drivers/ide/pci/it8213.c | 8
-rw-r--r-- drivers/ide/pci/it821x.c | 9
-rw-r--r-- drivers/ide/pci/jmicron.c | 20
-rw-r--r-- drivers/ide/pci/pdc202xx_new.c | 9
-rw-r--r-- drivers/ide/pci/pdc202xx_old.c | 35
-rw-r--r-- drivers/ide/pci/piix.c | 45
-rw-r--r-- drivers/ide/pci/scc_pata.c | 2
-rw-r--r-- drivers/ide/pci/serverworks.c | 103
-rw-r--r-- drivers/ide/pci/sgiioc4.c | 20
-rw-r--r-- drivers/ide/pci/siimage.c | 18
-rw-r--r-- drivers/ide/pci/sis5513.c | 34
-rw-r--r-- drivers/ide/pci/sl82c105.c | 20
-rw-r--r-- drivers/ide/pci/slc90e66.c | 5
-rw-r--r-- drivers/ide/pci/tc86c001.c | 4
-rw-r--r-- drivers/ide/pci/via82cxxx.c | 175
-rw-r--r-- drivers/ide/ppc/pmac.c | 42
-rw-r--r-- fs/jfs/endian24.h | 2
-rw-r--r-- fs/jfs/jfs_debug.c | 28
-rw-r--r-- fs/jfs/jfs_debug.h | 2
-rw-r--r-- fs/jfs/jfs_dinode.h | 42
-rw-r--r-- fs/jfs/jfs_dmap.c | 419
-rw-r--r-- fs/jfs/jfs_dmap.h | 118
-rw-r--r-- fs/jfs/jfs_dtree.c | 105
-rw-r--r-- fs/jfs/jfs_dtree.h | 2
-rw-r--r-- fs/jfs/jfs_extent.c | 102
-rw-r--r-- fs/jfs/jfs_filsys.h | 13
-rw-r--r-- fs/jfs/jfs_imap.c | 296
-rw-r--r-- fs/jfs/jfs_imap.h | 98
-rw-r--r-- fs/jfs/jfs_incore.h | 4
-rw-r--r-- fs/jfs/jfs_logmgr.c | 90
-rw-r--r-- fs/jfs/jfs_logmgr.h | 26
-rw-r--r-- fs/jfs/jfs_metapage.c | 3
-rw-r--r-- fs/jfs/jfs_mount.c | 6
-rw-r--r-- fs/jfs/jfs_txnmgr.c | 302
-rw-r--r-- fs/jfs/jfs_txnmgr.h | 2
-rw-r--r-- fs/jfs/jfs_types.h | 20
-rw-r--r-- fs/jfs/jfs_umount.c | 2
-rw-r--r-- fs/jfs/jfs_xtree.c | 428
-rw-r--r-- fs/jfs/jfs_xtree.h | 48
-rw-r--r-- fs/jfs/namei.c | 26
-rw-r--r-- fs/jfs/resize.c | 48
-rw-r--r-- fs/jfs/xattr.c | 9
-rw-r--r-- fs/proc/array.c | 59
-rw-r--r-- fs/proc/base.c | 71
-rw-r--r-- include/asm-generic/bitops/sched.h | 21
-rw-r--r-- include/asm-mips/mach-au1x00/au1xxx_ide.h | 28
-rw-r--r-- include/linux/hardirq.h | 13
-rw-r--r-- include/linux/ide.h | 18
-rw-r--r-- include/linux/sched.h | 251
-rw-r--r-- include/linux/topology.h | 12
-rw-r--r-- include/linux/wait.h | 16
-rw-r--r-- init/main.c | 5
-rw-r--r-- kernel/delayacct.c | 10
-rw-r--r-- kernel/exit.c | 5
-rw-r--r-- kernel/fork.c | 4
-rw-r--r-- kernel/posix-cpu-timers.c | 34
-rw-r--r-- kernel/sched.c | 3021
-rw-r--r-- kernel/sched_debug.c | 275
-rw-r--r-- kernel/sched_fair.c | 1131
-rw-r--r-- kernel/sched_idletask.c | 71
-rw-r--r-- kernel/sched_rt.c | 255
-rw-r--r-- kernel/sched_stats.h | 235
-rw-r--r-- kernel/softirq.c | 1
-rw-r--r-- kernel/sysctl.c | 80
-rw-r--r-- lib/Kconfig.debug | 9
95 files changed, 5518 insertions, 4098 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index af50f9bbe68e..4d880b3d1f35 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1014,49 +1014,6 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	mga=		[HW,DRM]
 
-	migration_cost=
-			[KNL,SMP] debug: override scheduler migration costs
-			Format: <level-1-usecs>,<level-2-usecs>,...
-			This debugging option can be used to override the
-			default scheduler migration cost matrix. The numbers
-			are indexed by 'CPU domain distance'.
-			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
-			box will set up an intra-core migration cost of
-			1 msec, an inter-core migration cost of 2 msecs,
-			and an inter-node migration cost of 3 msecs.
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
-	migration_debug=
-			[KNL,SMP] migration cost auto-detect verbosity
-			Format=<0|1|2>
-			If a system's migration matrix reported at bootup
-			seems erroneous then this option can be used to
-			increase verbosity of the detection process.
-			We default to 0 (no extra messages), 1 will print
-			some more information, and 2 will be really
-			verbose (probably only useful if you also have a
-			serial console attached to the system).
-
-	migration_factor=
-			[KNL,SMP] multiply/divide migration costs by a factor
-			Format=<percent>
-			This debug option can be used to proportionally
-			increase or decrease the auto-detected migration
-			costs for all entries of the migration matrix.
-			E.g. migration_factor=150 will increase migration
-			costs by 50%. (and thus the scheduler will be less
-			eager migrating cache-hot tasks)
-			migration_factor=80 will decrease migration costs
-			by 20%. (thus the scheduler will be more eager to
-			migrate tasks)
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
 	mousedev.tap_time=
 			[MOUSE] Maximum time between finger touching and
 			leaving touchpad surface for touch to be considered
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
new file mode 100644
index 000000000000..16feebb7bdc0
--- /dev/null
+++ b/Documentation/sched-design-CFS.txt
@@ -0,0 +1,119 @@
+
+This is the CFS scheduler.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically
+models an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100%
+physical power and which can run each task at precise equal speed, in
+parallel, each at 1/nr_running speed. For example: if there are 2 tasks
+running then it runs each at 50% physical power - totally in parallel.
+
+On real hardware, we can run only a single task at once, so while that
+one task runs, the other tasks that are waiting for the CPU are at a
+disadvantage - the current task gets an unfair amount of CPU time. In
+CFS this fairness imbalance is expressed and tracked via the per-task
+p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
+time the task should now run on the CPU for it to become completely fair
+and balanced.
+
+( small detail: on 'ideal' hardware, the p->wait_runtime value would
+  always be zero - no task would ever get 'out of balance' from the
+  'ideal' share of CPU time. )
+
+CFS's task picking logic is based on this p->wait_runtime value and it
+is thus very simple: it always tries to run the task with the largest
+p->wait_runtime value. In other words, CFS tries to run the task with
+the 'gravest need' for more CPU time. So CFS always tries to split up
+CPU time between runnable tasks as close to 'ideal multitasking
+hardware' as possible.
+
+Most of the rest of CFS's design just falls out of this really simple
+concept, with a few add-on embellishments like nice levels,
+multiprocessing and various algorithm variants to recognize sleepers.
+
+In practice it works like this: the system runs a task a bit, and when
+the task schedules (or a scheduler tick happens) the task's CPU usage is
+'accounted for': the (small) time it just spent using the physical CPU
+is deducted from p->wait_runtime. [minus the 'fair share' it would have
+gotten anyway]. Once p->wait_runtime gets low enough so that another
+task becomes the 'leftmost task' of the time-ordered rbtree it maintains
+(plus a small amount of 'granularity' distance relative to the leftmost
+task so that we do not over-schedule tasks and trash the cache) then the
+new leftmost task is picked and the current task is preempted.
+
+The rq->fair_clock value tracks the 'CPU time a runnable task would have
+fairly gotten, had it been runnable during that time'. So by using
+rq->fair_clock values we can accurately timestamp and measure the
+'expected CPU time' a task should have gotten. All runnable tasks are
+sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
+CFS picks the 'leftmost' task and sticks to it. As the system progresses
+forwards, newly woken tasks are put into the tree more and more to the
+right - slowly but surely giving a chance for every task to become the
+'leftmost task' and thus get on the CPU within a deterministic amount of
+time.
+
+Some implementation details:
+
+ - the introduction of Scheduling Classes: an extensible hierarchy of
+   scheduler modules. These modules encapsulate scheduling policy
+   details and are handled by the scheduler core without the core
+   code assuming too much about them.
+
+ - sched_fair.c implements the 'CFS desktop scheduler': it is a
+   replacement for the vanilla scheduler's SCHED_OTHER interactivity
+   code.
+
+   I'd like to give credit to Con Kolivas for the general approach here:
+   he has proven via RSDL/SD that 'fair scheduling' is possible and that
+   it results in better desktop scheduling. Kudos Con!
+
+   The CFS patch uses a completely different approach and implementation
+   from RSDL/SD. My goal was to make CFS's interactivity quality exceed
+   that of RSDL/SD, which is a high standard to meet :-) Testing
+   feedback is welcome to decide this one way or another. [ and, in any
+   case, all of SD's logic could be added via a kernel/sched_sd.c module
+   as well, if Con is interested in such an approach. ]
+
+   CFS's design is quite radical: it does not use runqueues, it uses a
+   time-ordered rbtree to build a 'timeline' of future task execution,
+   and thus has no 'array switch' artifacts (by which both the vanilla
+   scheduler and RSDL/SD are affected).
+
+   CFS uses nanosecond granularity accounting and does not rely on any
+   jiffies or other HZ detail. Thus the CFS scheduler has no notion of
+   'timeslices' and has no heuristics whatsoever. There is only one
+   central tunable:
+
+         /proc/sys/kernel/sched_granularity_ns
+
+   which can be used to tune the scheduler from 'desktop' (low
+   latencies) to 'server' (good batching) workloads. It defaults to a
+   setting suitable for desktop workloads. SCHED_BATCH is handled by the
+   CFS scheduler module too.
+
+   Due to its design, the CFS scheduler is not prone to any of the
+   'attacks' that exist today against the heuristics of the stock
+   scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
+   work fine and do not impact interactivity and produce the expected
+   behavior.
+
+   the CFS scheduler has a much stronger handling of nice levels and
+   SCHED_BATCH: both types of workloads should be isolated much more
+   aggressively than under the vanilla scheduler.
+
+   ( another detail: due to nanosec accounting and timeline sorting,
+     sched_yield() support is very simple under CFS, and in fact under
+     CFS sched_yield() behaves much better than under any other
+     scheduler I have tested so far. )
+
+ - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
+   way than the vanilla scheduler does. It uses 100 runqueues (for all
+   100 RT priority levels, instead of 140 in the vanilla scheduler)
+   and it needs no expired array.
+
+ - reworked/sanitized SMP load-balancing: the runqueue-walking
+   assumptions are gone from the load-balancing code now, and
+   iterators of the scheduling modules are used. The balancing code got
+   quite a bit simpler as a result.
+
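The document above boils the scheduler down to one queueing rule: every runnable task is keyed by rq->fair_clock - p->wait_runtime, and CFS always runs the task with the smallest key (the 'leftmost' one in the rbtree). The following is a minimal user-space sketch of that rule only, with a flat array standing in for the kernel's rbtree and purely illustrative task names and numbers - it is not the kernel implementation.

/*
 * Sketch of the CFS picking rule described above (illustrative only):
 * the task with the smallest "fair_clock - wait_runtime" key, i.e. the
 * task that is owed the most CPU time, is chosen to run next.
 */
#include <stdio.h>

struct task {
	const char *name;
	long long wait_runtime;		/* ns of CPU time the task is "owed" */
};

/* key used to order the timeline; a smaller key means "runs sooner" */
static long long timeline_key(long long fair_clock, const struct task *p)
{
	return fair_clock - p->wait_runtime;
}

/* pick the "leftmost" task, i.e. the one with the smallest key */
static const struct task *pick_next(long long fair_clock,
				    const struct task *tasks, int nr)
{
	const struct task *best = &tasks[0];
	int i;

	for (i = 1; i < nr; i++)
		if (timeline_key(fair_clock, &tasks[i]) <
		    timeline_key(fair_clock, best))
			best = &tasks[i];

	return best;
}

int main(void)
{
	/* illustrative tasks: wait_runtime is how much CPU time each is owed */
	struct task tasks[] = {
		{ "editor",   2000000 },	/* owed 2 ms */
		{ "compiler",  500000 },	/* owed 0.5 ms */
		{ "daemon",          0 },	/* owed nothing */
	};
	long long fair_clock = 10000000;	/* 10 ms of fair CPU time so far */

	/* prints "next: editor" - the task owed the most CPU time wins */
	printf("next: %s\n", pick_next(fair_clock, tasks, 3)->name);
	return 0;
}

Because the key subtracts wait_runtime, the task that has waited longest sorts leftmost and is picked first, which is exactly the 'gravest need' behaviour the text describes; the real scheduler gets the same effect in O(log n) by keeping the tasks in an rbtree instead of scanning an array.
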
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 88baed1e7e83..0b2954534b8e 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -941,17 +941,6 @@ exit:
941} 941}
942#endif 942#endif
943 943
944static void smp_tune_scheduling(void)
945{
946 if (cpu_khz) {
947 /* cache size in kB */
948 long cachesize = boot_cpu_data.x86_cache_size;
949
950 if (cachesize > 0)
951 max_cache_size = cachesize * 1024;
952 }
953}
954
955/* 944/*
956 * Cycle through the processors sending APIC IPIs to boot each. 945 * Cycle through the processors sending APIC IPIs to boot each.
957 */ 946 */
@@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
980 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; 969 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
981 970
982 current_thread_info()->cpu = 0; 971 current_thread_info()->cpu = 0;
983 smp_tune_scheduling();
984 972
985 set_cpu_sibling_map(0); 973 set_cpu_sibling_map(0);
986 974
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index f64b81f3033b..ea63a30ca3e8 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -4,6 +4,7 @@
4 * See comments there for proper credits. 4 * See comments there for proper credits.
5 */ 5 */
6 6
7#include <linux/sched.h>
7#include <linux/clocksource.h> 8#include <linux/clocksource.h>
8#include <linux/workqueue.h> 9#include <linux/workqueue.h>
9#include <linux/cpufreq.h> 10#include <linux/cpufreq.h>
@@ -106,8 +107,13 @@ unsigned long long sched_clock(void)
106 107
107 /* 108 /*
108 * Fall back to jiffies if there's no TSC available: 109 * Fall back to jiffies if there's no TSC available:
110 * ( But note that we still use it if the TSC is marked
111 * unstable. We do this because unlike Time Of Day,
112 * the scheduler clock tolerates small errors and it's
113 * very important for it to be as fast as the platform
 114 * can achieve it. )
109 */ 115 */
110 if (unlikely(!tsc_enabled)) 116 if (unlikely(!tsc_enabled && !tsc_unstable))
111 /* No locking but a rare wrong value is not a big deal: */ 117 /* No locking but a rare wrong value is not a big deal: */
112 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 118 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
113 119
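The comment added in the hunk above states the policy: sched_clock() keeps using the TSC even after it has been marked unstable, because the scheduler clock only needs to be fast and roughly accurate, and the coarse jiffies fallback is taken only when no TSC is available at all. A compact stand-alone sketch of that decision (illustrative user-space C, not the kernel's sched_clock()):

/*
 * Sketch of the fallback policy from the hunk above: prefer the fast
 * cycle counter even when it is "unstable"; fall back to the coarse
 * jiffies-based clock only when no cycle counter exists at all.
 */
#include <stdio.h>

#define HZ 250ULL

static unsigned long long sched_clock_sketch(int tsc_present, int tsc_unstable,
					     unsigned long long cycles,
					     unsigned long long ns_per_cycle,
					     unsigned long long jiffies)
{
	if (!tsc_present)				/* no TSC at all */
		return jiffies * (1000000000ULL / HZ);

	(void)tsc_unstable;	/* small errors are tolerated here */
	return cycles * ns_per_cycle;
}

int main(void)
{
	/* unstable TSC still used: prints the cycle-based value */
	printf("%llu\n", sched_clock_sketch(1, 1, 1000000ULL, 1ULL, 12345ULL));
	/* no TSC: prints the jiffies-based value */
	printf("%llu\n", sched_clock_sketch(0, 0, 0ULL, 0ULL, 12345ULL));
	return 0;
}
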
@@ -277,6 +283,7 @@ static struct clocksource clocksource_tsc = {
277 283
278void mark_tsc_unstable(char *reason) 284void mark_tsc_unstable(char *reason)
279{ 285{
286 sched_clock_unstable_event();
280 if (!tsc_unstable) { 287 if (!tsc_unstable) {
281 tsc_unstable = 1; 288 tsc_unstable = 1;
282 tsc_enabled = 0; 289 tsc_enabled = 0;
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index eaa6a24bc0b6..188fb73c6845 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -805,7 +805,6 @@ static void __cpuinit
805get_max_cacheline_size (void) 805get_max_cacheline_size (void)
806{ 806{
807 unsigned long line_size, max = 1; 807 unsigned long line_size, max = 1;
808 unsigned int cache_size = 0;
809 u64 l, levels, unique_caches; 808 u64 l, levels, unique_caches;
810 pal_cache_config_info_t cci; 809 pal_cache_config_info_t cci;
811 s64 status; 810 s64 status;
@@ -835,8 +834,6 @@ get_max_cacheline_size (void)
835 line_size = 1 << cci.pcci_line_size; 834 line_size = 1 << cci.pcci_line_size;
836 if (line_size > max) 835 if (line_size > max)
837 max = line_size; 836 max = line_size;
838 if (cache_size < cci.pcci_cache_size)
839 cache_size = cci.pcci_cache_size;
840 if (!cci.pcci_unified) { 837 if (!cci.pcci_unified) {
841 status = ia64_pal_cache_config_info(l, 838 status = ia64_pal_cache_config_info(l,
842 /* cache_type (instruction)= */ 1, 839 /* cache_type (instruction)= */ 1,
@@ -853,9 +850,6 @@ get_max_cacheline_size (void)
853 ia64_i_cache_stride_shift = cci.pcci_stride; 850 ia64_i_cache_stride_shift = cci.pcci_stride;
854 } 851 }
855 out: 852 out:
856#ifdef CONFIG_SMP
857 max_cache_size = max(max_cache_size, cache_size);
858#endif
859 if (max > ia64_max_cacheline_size) 853 if (max > ia64_max_cacheline_size)
860 ia64_max_cacheline_size = max; 854 ia64_max_cacheline_size = max;
861} 855}
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 67edfa7ed93a..a1b017f2dbb3 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS]; /* Map logical to physical */
51EXPORT_SYMBOL(phys_cpu_present_map); 51EXPORT_SYMBOL(phys_cpu_present_map);
52EXPORT_SYMBOL(cpu_online_map); 52EXPORT_SYMBOL(cpu_online_map);
53 53
54/* This happens early in bootup, can't really do it better */
55static void smp_tune_scheduling (void)
56{
57 struct cache_desc *cd = &current_cpu_data.scache;
58 unsigned long cachesize = cd->linesz * cd->sets * cd->ways;
59
60 if (cachesize > max_cache_size)
61 max_cache_size = cachesize;
62}
63
64extern void __init calibrate_delay(void); 54extern void __init calibrate_delay(void);
65extern ATTRIB_NORET void cpu_idle(void); 55extern ATTRIB_NORET void cpu_idle(void);
66 56
@@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
228{ 218{
229 init_new_context(current, &init_mm); 219 init_new_context(current, &init_mm);
230 current_thread_info()->cpu = 0; 220 current_thread_info()->cpu = 0;
231 smp_tune_scheduling();
232 plat_prepare_cpus(max_cpus); 221 plat_prepare_cpus(max_cpus);
233#ifndef CONFIG_HOTPLUG_CPU 222#ifndef CONFIG_HOTPLUG_CPU
234 cpu_present_map = cpu_possible_map; 223 cpu_present_map = cpu_possible_map;
diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c
index 4d9ad59031bb..4fea3ac7bff0 100644
--- a/arch/sparc/kernel/smp.c
+++ b/arch/sparc/kernel/smp.c
@@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id)
68 cpu_data(id).prom_node = cpu_node; 68 cpu_data(id).prom_node = cpu_node;
69 cpu_data(id).mid = cpu_get_hwmid(cpu_node); 69 cpu_data(id).mid = cpu_get_hwmid(cpu_node);
70 70
71 /* this is required to tune the scheduler correctly */
72 /* is it possible to have CPUs with different cache sizes? */
73 if (id == boot_cpu_id) {
74 int cache_line,cache_nlines;
75 cache_line = 0x20;
76 cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
77 cache_nlines = 0x8000;
78 cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
79 max_cache_size = cache_line * cache_nlines;
80 }
81 if (cpu_data(id).mid < 0) 71 if (cpu_data(id).mid < 0)
82 panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); 72 panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
83} 73}
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 4dcd7d0b60f2..40e40f968d61 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int multiplier)
1163 return -EINVAL; 1163 return -EINVAL;
1164} 1164}
1165 1165
1166static void __init smp_tune_scheduling(void)
1167{
1168 unsigned int smallest = ~0U;
1169 int i;
1170
1171 for (i = 0; i < NR_CPUS; i++) {
1172 unsigned int val = cpu_data(i).ecache_size;
1173
1174 if (val && val < smallest)
1175 smallest = val;
1176 }
1177
1178 /* Any value less than 256K is nonsense. */
1179 if (smallest < (256U * 1024U))
1180 smallest = 256 * 1024;
1181
1182 max_cache_size = smallest;
1183
1184 if (smallest < 1U * 1024U * 1024U)
1185 printk(KERN_INFO "Using max_cache_size of %uKB\n",
1186 smallest / 1024U);
1187 else
1188 printk(KERN_INFO "Using max_cache_size of %uMB\n",
1189 smallest / 1024U / 1024U);
1190}
1191
1192/* Constrain the number of cpus to max_cpus. */ 1166/* Constrain the number of cpus to max_cpus. */
1193void __init smp_prepare_cpus(unsigned int max_cpus) 1167void __init smp_prepare_cpus(unsigned int max_cpus)
1194{ 1168{
@@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
1206 } 1180 }
1207 1181
1208 cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy; 1182 cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
1209 smp_tune_scheduling();
1210} 1183}
1211 1184
1212void __devinit smp_prepare_boot_cpu(void) 1185void __devinit smp_prepare_boot_cpu(void)
diff --git a/drivers/ide/arm/icside.c b/drivers/ide/arm/icside.c
index 66f826252aee..444a0b84f5bd 100644
--- a/drivers/ide/arm/icside.c
+++ b/drivers/ide/arm/icside.c
@@ -448,23 +448,21 @@ static int icside_dma_test_irq(ide_drive_t *drive)
448 ICS_ARCIN_V6_INTRSTAT_1)) & 1; 448 ICS_ARCIN_V6_INTRSTAT_1)) & 1;
449} 449}
450 450
451static int icside_dma_timeout(ide_drive_t *drive) 451static void icside_dma_timeout(ide_drive_t *drive)
452{ 452{
453 printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name); 453 printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name);
454 454
455 if (icside_dma_test_irq(drive)) 455 if (icside_dma_test_irq(drive))
456 return 0; 456 return;
457 457
458 ide_dump_status(drive, "DMA timeout", 458 ide_dump_status(drive, "DMA timeout", HWIF(drive)->INB(IDE_STATUS_REG));
459 HWIF(drive)->INB(IDE_STATUS_REG));
460 459
461 return icside_dma_end(drive); 460 icside_dma_end(drive);
462} 461}
463 462
464static int icside_dma_lostirq(ide_drive_t *drive) 463static void icside_dma_lost_irq(ide_drive_t *drive)
465{ 464{
466 printk(KERN_ERR "%s: IRQ lost\n", drive->name); 465 printk(KERN_ERR "%s: IRQ lost\n", drive->name);
467 return 1;
468} 466}
469 467
470static void icside_dma_init(ide_hwif_t *hwif) 468static void icside_dma_init(ide_hwif_t *hwif)
@@ -490,8 +488,8 @@ static void icside_dma_init(ide_hwif_t *hwif)
490 hwif->dma_start = icside_dma_start; 488 hwif->dma_start = icside_dma_start;
491 hwif->ide_dma_end = icside_dma_end; 489 hwif->ide_dma_end = icside_dma_end;
492 hwif->ide_dma_test_irq = icside_dma_test_irq; 490 hwif->ide_dma_test_irq = icside_dma_test_irq;
493 hwif->ide_dma_timeout = icside_dma_timeout; 491 hwif->dma_timeout = icside_dma_timeout;
494 hwif->ide_dma_lostirq = icside_dma_lostirq; 492 hwif->dma_lost_irq = icside_dma_lost_irq;
495 493
496 hwif->drives[0].autodma = hwif->autodma; 494 hwif->drives[0].autodma = hwif->autodma;
497 hwif->drives[1].autodma = hwif->autodma; 495 hwif->drives[1].autodma = hwif->autodma;
diff --git a/drivers/ide/cris/ide-cris.c b/drivers/ide/cris/ide-cris.c
index ca0341c05e55..886091bc7db0 100644
--- a/drivers/ide/cris/ide-cris.c
+++ b/drivers/ide/cris/ide-cris.c
@@ -819,7 +819,7 @@ init_e100_ide (void)
819 hwif->dma_host_off = &cris_dma_off; 819 hwif->dma_host_off = &cris_dma_off;
820 hwif->dma_host_on = &cris_dma_on; 820 hwif->dma_host_on = &cris_dma_on;
821 hwif->dma_off_quietly = &cris_dma_off; 821 hwif->dma_off_quietly = &cris_dma_off;
822 hwif->udma_four = 0; 822 hwif->cbl = ATA_CBL_PATA40;
823 hwif->ultra_mask = cris_ultra_mask; 823 hwif->ultra_mask = cris_ultra_mask;
824 hwif->mwdma_mask = 0x07; /* Multiword DMA 0-2 */ 824 hwif->mwdma_mask = 0x07; /* Multiword DMA 0-2 */
825 hwif->autodma = 1; 825 hwif->autodma = 1;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 252ab8295edf..1486eb212ccc 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -481,7 +481,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive,
481 else 481 else
482 printk(" Unknown Error Type: "); 482 printk(" Unknown Error Type: ");
483 483
484 if (sense->sense_key < ARY_LEN(sense_key_texts)) 484 if (sense->sense_key < ARRAY_SIZE(sense_key_texts))
485 s = sense_key_texts[sense->sense_key]; 485 s = sense_key_texts[sense->sense_key];
486 486
487 printk("%s -- (Sense key=0x%02x)\n", s, sense->sense_key); 487 printk("%s -- (Sense key=0x%02x)\n", s, sense->sense_key);
@@ -491,7 +491,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive,
491 sense->ascq); 491 sense->ascq);
492 s = buf; 492 s = buf;
493 } else { 493 } else {
494 int lo = 0, mid, hi = ARY_LEN(sense_data_texts); 494 int lo = 0, mid, hi = ARRAY_SIZE(sense_data_texts);
495 unsigned long key = (sense->sense_key << 16); 495 unsigned long key = (sense->sense_key << 16);
496 key |= (sense->asc << 8); 496 key |= (sense->asc << 8);
497 if (!(sense->ascq >= 0x80 && sense->ascq <= 0xdd)) 497 if (!(sense->ascq >= 0x80 && sense->ascq <= 0xdd))
@@ -524,7 +524,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive,
524 524
525 if (failed_command != NULL) { 525 if (failed_command != NULL) {
526 526
527 int lo=0, mid, hi= ARY_LEN (packet_command_texts); 527 int lo=0, mid, hi= ARRAY_SIZE(packet_command_texts);
528 s = NULL; 528 s = NULL;
529 529
530 while (hi > lo) { 530 while (hi > lo) {
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index ad1f2ed14a37..228b29c5d2e4 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -498,8 +498,6 @@ struct cdrom_info {
498 * Descriptions of ATAPI error codes. 498 * Descriptions of ATAPI error codes.
499 */ 499 */
500 500
501#define ARY_LEN(a) ((sizeof(a) / sizeof(a[0])))
502
503/* This stuff should be in cdrom.h, since it is now generic... */ 501/* This stuff should be in cdrom.h, since it is now generic... */
504 502
505/* ATAPI sense keys (from table 140 of ATAPI 2.6) */ 503/* ATAPI sense keys (from table 140 of ATAPI 2.6) */
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index dc2175c81f5e..b1304a7f3e0a 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -1190,11 +1190,11 @@ static int idedisk_ioctl(struct inode *inode, struct file *file,
1190 return generic_ide_ioctl(drive, file, bdev, cmd, arg); 1190 return generic_ide_ioctl(drive, file, bdev, cmd, arg);
1191 1191
1192read_val: 1192read_val:
1193 down(&ide_setting_sem); 1193 mutex_lock(&ide_setting_mtx);
1194 spin_lock_irqsave(&ide_lock, flags); 1194 spin_lock_irqsave(&ide_lock, flags);
1195 err = *val; 1195 err = *val;
1196 spin_unlock_irqrestore(&ide_lock, flags); 1196 spin_unlock_irqrestore(&ide_lock, flags);
1197 up(&ide_setting_sem); 1197 mutex_unlock(&ide_setting_mtx);
1198 return err >= 0 ? put_user(err, (long __user *)arg) : err; 1198 return err >= 0 ? put_user(err, (long __user *)arg) : err;
1199 1199
1200set_val: 1200set_val:
@@ -1204,9 +1204,9 @@ set_val:
1204 if (!capable(CAP_SYS_ADMIN)) 1204 if (!capable(CAP_SYS_ADMIN))
1205 err = -EACCES; 1205 err = -EACCES;
1206 else { 1206 else {
1207 down(&ide_setting_sem); 1207 mutex_lock(&ide_setting_mtx);
1208 err = setfunc(drive, arg); 1208 err = setfunc(drive, arg);
1209 up(&ide_setting_sem); 1209 mutex_unlock(&ide_setting_mtx);
1210 } 1210 }
1211 } 1211 }
1212 return err; 1212 return err;
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index ead141e2db9e..5fe1d72ab451 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -91,45 +91,45 @@
91 91
92static const struct drive_list_entry drive_whitelist [] = { 92static const struct drive_list_entry drive_whitelist [] = {
93 93
94 { "Micropolis 2112A" , "ALL" }, 94 { "Micropolis 2112A" , NULL },
95 { "CONNER CTMA 4000" , "ALL" }, 95 { "CONNER CTMA 4000" , NULL },
96 { "CONNER CTT8000-A" , "ALL" }, 96 { "CONNER CTT8000-A" , NULL },
97 { "ST34342A" , "ALL" }, 97 { "ST34342A" , NULL },
98 { NULL , NULL } 98 { NULL , NULL }
99}; 99};
100 100
101static const struct drive_list_entry drive_blacklist [] = { 101static const struct drive_list_entry drive_blacklist [] = {
102 102
103 { "WDC AC11000H" , "ALL" }, 103 { "WDC AC11000H" , NULL },
104 { "WDC AC22100H" , "ALL" }, 104 { "WDC AC22100H" , NULL },
105 { "WDC AC32500H" , "ALL" }, 105 { "WDC AC32500H" , NULL },
106 { "WDC AC33100H" , "ALL" }, 106 { "WDC AC33100H" , NULL },
107 { "WDC AC31600H" , "ALL" }, 107 { "WDC AC31600H" , NULL },
108 { "WDC AC32100H" , "24.09P07" }, 108 { "WDC AC32100H" , "24.09P07" },
109 { "WDC AC23200L" , "21.10N21" }, 109 { "WDC AC23200L" , "21.10N21" },
110 { "Compaq CRD-8241B" , "ALL" }, 110 { "Compaq CRD-8241B" , NULL },
111 { "CRD-8400B" , "ALL" }, 111 { "CRD-8400B" , NULL },
112 { "CRD-8480B", "ALL" }, 112 { "CRD-8480B", NULL },
113 { "CRD-8482B", "ALL" }, 113 { "CRD-8482B", NULL },
114 { "CRD-84" , "ALL" }, 114 { "CRD-84" , NULL },
115 { "SanDisk SDP3B" , "ALL" }, 115 { "SanDisk SDP3B" , NULL },
116 { "SanDisk SDP3B-64" , "ALL" }, 116 { "SanDisk SDP3B-64" , NULL },
117 { "SANYO CD-ROM CRD" , "ALL" }, 117 { "SANYO CD-ROM CRD" , NULL },
118 { "HITACHI CDR-8" , "ALL" }, 118 { "HITACHI CDR-8" , NULL },
119 { "HITACHI CDR-8335" , "ALL" }, 119 { "HITACHI CDR-8335" , NULL },
120 { "HITACHI CDR-8435" , "ALL" }, 120 { "HITACHI CDR-8435" , NULL },
121 { "Toshiba CD-ROM XM-6202B" , "ALL" }, 121 { "Toshiba CD-ROM XM-6202B" , NULL },
122 { "TOSHIBA CD-ROM XM-1702BC", "ALL" }, 122 { "TOSHIBA CD-ROM XM-1702BC", NULL },
123 { "CD-532E-A" , "ALL" }, 123 { "CD-532E-A" , NULL },
124 { "E-IDE CD-ROM CR-840", "ALL" }, 124 { "E-IDE CD-ROM CR-840", NULL },
125 { "CD-ROM Drive/F5A", "ALL" }, 125 { "CD-ROM Drive/F5A", NULL },
126 { "WPI CDD-820", "ALL" }, 126 { "WPI CDD-820", NULL },
127 { "SAMSUNG CD-ROM SC-148C", "ALL" }, 127 { "SAMSUNG CD-ROM SC-148C", NULL },
128 { "SAMSUNG CD-ROM SC", "ALL" }, 128 { "SAMSUNG CD-ROM SC", NULL },
129 { "ATAPI CD-ROM DRIVE 40X MAXIMUM", "ALL" }, 129 { "ATAPI CD-ROM DRIVE 40X MAXIMUM", NULL },
130 { "_NEC DV5800A", "ALL" }, 130 { "_NEC DV5800A", NULL },
131 { "SAMSUNG CD-ROM SN-124", "N001" }, 131 { "SAMSUNG CD-ROM SN-124", "N001" },
132 { "Seagate STT20000A", "ALL" }, 132 { "Seagate STT20000A", NULL },
133 { NULL , NULL } 133 { NULL , NULL }
134 134
135}; 135};
@@ -147,8 +147,8 @@ int ide_in_drive_list(struct hd_driveid *id, const struct drive_list_entry *driv
147{ 147{
148 for ( ; drive_table->id_model ; drive_table++) 148 for ( ; drive_table->id_model ; drive_table++)
149 if ((!strcmp(drive_table->id_model, id->model)) && 149 if ((!strcmp(drive_table->id_model, id->model)) &&
150 ((strstr(id->fw_rev, drive_table->id_firmware)) || 150 (!drive_table->id_firmware ||
151 (!strcmp(drive_table->id_firmware, "ALL")))) 151 strstr(id->fw_rev, drive_table->id_firmware)))
152 return 1; 152 return 1;
153 return 0; 153 return 0;
154} 154}
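The two hunks above change the drive-table convention: a NULL id_firmware entry now means "match any firmware revision", replacing the old "ALL" sentinel string, and ide_in_drive_list() only does the strstr() substring check when a specific revision is listed. A stand-alone sketch of that matching rule, in plain user-space C with illustrative names rather than the driver's own types:

/*
 * Simplified model of the list-matching rule above: NULL firmware
 * matches any revision, otherwise the reported firmware string must
 * contain the listed revision as a substring.
 */
#include <stdio.h>
#include <string.h>

struct entry {
	const char *model;
	const char *firmware;	/* NULL == any firmware revision */
};

static int in_list(const char *model, const char *fw_rev,
		   const struct entry *table)
{
	for (; table->model; table++)
		if (strcmp(table->model, model) == 0 &&
		    (table->firmware == NULL ||
		     strstr(fw_rev, table->firmware) != NULL))
			return 1;
	return 0;
}

int main(void)
{
	static const struct entry blacklist[] = {
		{ "WDC AC32100H", "24.09P07" },	/* only this revision */
		{ "CRD-8400B",    NULL },	/* any revision */
		{ NULL, NULL }
	};

	/* 1: model listed with NULL firmware, so any revision matches */
	printf("%d\n", in_list("CRD-8400B", "1.00", blacklist));
	/* 0: model listed, but the firmware revision differs */
	printf("%d\n", in_list("WDC AC32100H", "30.00X00", blacklist));
	return 0;
}

With the NULL convention the table no longer needs a magic string, and a missing firmware field can never be mistaken for a real revision.
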
@@ -702,8 +702,22 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base)
702 mask = id->dma_mword & hwif->mwdma_mask; 702 mask = id->dma_mword & hwif->mwdma_mask;
703 break; 703 break;
704 case XFER_SW_DMA_0: 704 case XFER_SW_DMA_0:
705 if (id->field_valid & 2) 705 if (id->field_valid & 2) {
706 mask = id->dma_1word & hwif->swdma_mask; 706 mask = id->dma_1word & hwif->swdma_mask;
707 } else if (id->tDMA) {
708 /*
709 * ide_fix_driveid() doesn't convert ->tDMA to the
710 * CPU endianness so we need to do it here
711 */
712 u8 mode = le16_to_cpu(id->tDMA);
713
714 /*
715 * if the mode is valid convert it to the mask
716 * (the maximum allowed mode is XFER_SW_DMA_2)
717 */
718 if (mode <= 2)
719 mask = ((2 << mode) - 1) & hwif->swdma_mask;
720 }
707 break; 721 break;
708 default: 722 default:
709 BUG(); 723 BUG();
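The new XFER_SW_DMA_0 branch above falls back to the pre-EIDE ->tDMA field when the EIDE words are not valid, turning the reported mode into a bitmask with (2 << mode) - 1 so that every mode up to and including the reported one is advertised. A tiny stand-alone illustration of that arithmetic (plain user-space C, not driver code):

/*
 * Illustration of the mode-to-mask conversion used in the hunk above:
 * "(2 << mode) - 1" sets bits 0..mode, so a drive reporting SW DMA
 * mode 2 advertises modes 0, 1 and 2.
 */
#include <stdio.h>

int main(void)
{
	unsigned int mode;

	for (mode = 0; mode <= 2; mode++)
		printf("tDMA mode %u -> mask 0x%02x\n", mode, (2 << mode) - 1);

	return 0;	/* prints 0x01, 0x03, 0x07 */
}
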
@@ -847,27 +861,27 @@ int ide_set_dma(ide_drive_t *drive)
847 return rc; 861 return rc;
848} 862}
849 863
850EXPORT_SYMBOL_GPL(ide_set_dma);
851
852#ifdef CONFIG_BLK_DEV_IDEDMA_PCI 864#ifdef CONFIG_BLK_DEV_IDEDMA_PCI
853int __ide_dma_lostirq (ide_drive_t *drive) 865void ide_dma_lost_irq (ide_drive_t *drive)
854{ 866{
855 printk("%s: DMA interrupt recovery\n", drive->name); 867 printk("%s: DMA interrupt recovery\n", drive->name);
856 return 1;
857} 868}
858 869
859EXPORT_SYMBOL(__ide_dma_lostirq); 870EXPORT_SYMBOL(ide_dma_lost_irq);
860 871
861int __ide_dma_timeout (ide_drive_t *drive) 872void ide_dma_timeout (ide_drive_t *drive)
862{ 873{
874 ide_hwif_t *hwif = HWIF(drive);
875
863 printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name); 876 printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name);
864 if (HWIF(drive)->ide_dma_test_irq(drive))
865 return 0;
866 877
867 return HWIF(drive)->ide_dma_end(drive); 878 if (hwif->ide_dma_test_irq(drive))
879 return;
880
881 hwif->ide_dma_end(drive);
868} 882}
869 883
870EXPORT_SYMBOL(__ide_dma_timeout); 884EXPORT_SYMBOL(ide_dma_timeout);
871 885
872/* 886/*
873 * Needed for allowing full modular support of ide-driver 887 * Needed for allowing full modular support of ide-driver
@@ -1018,10 +1032,10 @@ void ide_setup_dma (ide_hwif_t *hwif, unsigned long dma_base, unsigned int num_p
1018 hwif->ide_dma_end = &__ide_dma_end; 1032 hwif->ide_dma_end = &__ide_dma_end;
1019 if (!hwif->ide_dma_test_irq) 1033 if (!hwif->ide_dma_test_irq)
1020 hwif->ide_dma_test_irq = &__ide_dma_test_irq; 1034 hwif->ide_dma_test_irq = &__ide_dma_test_irq;
1021 if (!hwif->ide_dma_timeout) 1035 if (!hwif->dma_timeout)
1022 hwif->ide_dma_timeout = &__ide_dma_timeout; 1036 hwif->dma_timeout = &ide_dma_timeout;
1023 if (!hwif->ide_dma_lostirq) 1037 if (!hwif->dma_lost_irq)
1024 hwif->ide_dma_lostirq = &__ide_dma_lostirq; 1038 hwif->dma_lost_irq = &ide_dma_lost_irq;
1025 1039
1026 if (hwif->chipset != ide_trm290) { 1040 if (hwif->chipset != ide_trm290) {
1027 u8 dma_stat = hwif->INB(hwif->dma_status); 1041 u8 dma_stat = hwif->INB(hwif->dma_status);
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index bfe8f1b712ba..c5b5011da56e 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -1350,7 +1350,7 @@ static ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
1350 hwif->INB(IDE_STATUS_REG)); 1350 hwif->INB(IDE_STATUS_REG));
1351 } else { 1351 } else {
1352 printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); 1352 printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name);
1353 (void) hwif->ide_dma_timeout(drive); 1353 hwif->dma_timeout(drive);
1354 } 1354 }
1355 1355
1356 /* 1356 /*
@@ -1466,7 +1466,7 @@ void ide_timer_expiry (unsigned long data)
1466 startstop = handler(drive); 1466 startstop = handler(drive);
1467 } else if (drive_is_ready(drive)) { 1467 } else if (drive_is_ready(drive)) {
1468 if (drive->waiting_for_dma) 1468 if (drive->waiting_for_dma)
1469 (void) hwgroup->hwif->ide_dma_lostirq(drive); 1469 hwgroup->hwif->dma_lost_irq(drive);
1470 (void)ide_ack_intr(hwif); 1470 (void)ide_ack_intr(hwif);
1471 printk(KERN_WARNING "%s: lost interrupt\n", drive->name); 1471 printk(KERN_WARNING "%s: lost interrupt\n", drive->name);
1472 startstop = handler(drive); 1472 startstop = handler(drive);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index f0be5f665a0e..92578b6832e9 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -574,7 +574,10 @@ u8 eighty_ninty_three (ide_drive_t *drive)
574 ide_hwif_t *hwif = drive->hwif; 574 ide_hwif_t *hwif = drive->hwif;
575 struct hd_driveid *id = drive->id; 575 struct hd_driveid *id = drive->id;
576 576
577 if (hwif->udma_four == 0) 577 if (hwif->cbl == ATA_CBL_PATA40_SHORT)
578 return 1;
579
580 if (hwif->cbl != ATA_CBL_PATA80)
578 goto no_80w; 581 goto no_80w;
579 582
580 /* Check for SATA but only if we are ATA5 or higher */ 583 /* Check for SATA but only if we are ATA5 or higher */
@@ -600,7 +603,8 @@ no_80w:
600 603
601 printk(KERN_WARNING "%s: %s side 80-wire cable detection failed, " 604 printk(KERN_WARNING "%s: %s side 80-wire cable detection failed, "
602 "limiting max speed to UDMA33\n", 605 "limiting max speed to UDMA33\n",
603 drive->name, hwif->udma_four ? "drive" : "host"); 606 drive->name,
607 hwif->cbl == ATA_CBL_PATA80 ? "drive" : "host");
604 608
605 drive->udma33_warned = 1; 609 drive->udma33_warned = 1;
606 610
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index f5ce22c38f82..cc5801399467 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -144,7 +144,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd)
144 local_irq_enable(); 144 local_irq_enable();
145 ide_fix_driveid(id); 145 ide_fix_driveid(id);
146 146
147#if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) 147#if defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA)
148 /* 148 /*
149 * EATA SCSI controllers do a hardware ATA emulation: 149 * EATA SCSI controllers do a hardware ATA emulation:
150 * Ignore them if there is a driver for them available. 150 * Ignore them if there is a driver for them available.
@@ -154,7 +154,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd)
154 printk("%s: EATA SCSI HBA %.10s\n", drive->name, id->model); 154 printk("%s: EATA SCSI HBA %.10s\n", drive->name, id->model);
155 goto err_misc; 155 goto err_misc;
156 } 156 }
157#endif /* CONFIG_SCSI_EATA_DMA || CONFIG_SCSI_EATA_PIO */ 157#endif /* CONFIG_SCSI_EATA || CONFIG_SCSI_EATA_PIO */
158 158
159 /* 159 /*
160 * WIN_IDENTIFY returns little-endian info, 160 * WIN_IDENTIFY returns little-endian info,
@@ -1025,7 +1025,7 @@ static int init_irq (ide_hwif_t *hwif)
1025 BUG_ON(irqs_disabled()); 1025 BUG_ON(irqs_disabled());
1026 BUG_ON(hwif == NULL); 1026 BUG_ON(hwif == NULL);
1027 1027
1028 down(&ide_cfg_sem); 1028 mutex_lock(&ide_cfg_mtx);
1029 hwif->hwgroup = NULL; 1029 hwif->hwgroup = NULL;
1030#if MAX_HWIFS > 1 1030#if MAX_HWIFS > 1
1031 /* 1031 /*
@@ -1154,7 +1154,7 @@ static int init_irq (ide_hwif_t *hwif)
1154 printk(" (%sed with %s)", 1154 printk(" (%sed with %s)",
1155 hwif->sharing_irq ? "shar" : "serializ", match->name); 1155 hwif->sharing_irq ? "shar" : "serializ", match->name);
1156 printk("\n"); 1156 printk("\n");
1157 up(&ide_cfg_sem); 1157 mutex_unlock(&ide_cfg_mtx);
1158 return 0; 1158 return 0;
1159out_unlink: 1159out_unlink:
1160 spin_lock_irq(&ide_lock); 1160 spin_lock_irq(&ide_lock);
@@ -1177,7 +1177,7 @@ out_unlink:
1177 } 1177 }
1178 spin_unlock_irq(&ide_lock); 1178 spin_unlock_irq(&ide_lock);
1179out_up: 1179out_up:
1180 up(&ide_cfg_sem); 1180 mutex_unlock(&ide_cfg_mtx);
1181 return 1; 1181 return 1;
1182} 1182}
1183 1183
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index ea94c9aa1220..fc1d8ae6a803 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -156,7 +156,7 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int d
156{ 156{
157 ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL; 157 ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL;
158 158
159 down(&ide_setting_sem); 159 mutex_lock(&ide_setting_mtx);
160 while ((*p) && strcmp((*p)->name, name) < 0) 160 while ((*p) && strcmp((*p)->name, name) < 0)
161 p = &((*p)->next); 161 p = &((*p)->next);
162 if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL) 162 if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL)
@@ -177,10 +177,10 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int d
177 if (auto_remove) 177 if (auto_remove)
178 setting->auto_remove = 1; 178 setting->auto_remove = 1;
179 *p = setting; 179 *p = setting;
180 up(&ide_setting_sem); 180 mutex_unlock(&ide_setting_mtx);
181 return 0; 181 return 0;
182abort: 182abort:
183 up(&ide_setting_sem); 183 mutex_unlock(&ide_setting_mtx);
184 kfree(setting); 184 kfree(setting);
185 return -1; 185 return -1;
186} 186}
@@ -224,7 +224,7 @@ static void __ide_remove_setting (ide_drive_t *drive, char *name)
224 * 224 *
225 * Automatically remove all the driver specific settings for this 225 * Automatically remove all the driver specific settings for this
226 * drive. This function may not be called from IRQ context. The 226 * drive. This function may not be called from IRQ context. The
227 * caller must hold ide_setting_sem. 227 * caller must hold ide_setting_mtx.
228 */ 228 */
229 229
230static void auto_remove_settings (ide_drive_t *drive) 230static void auto_remove_settings (ide_drive_t *drive)
@@ -269,7 +269,7 @@ static ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name)
269 * @setting: drive setting 269 * @setting: drive setting
270 * 270 *
271 * Read a drive setting and return the value. The caller 271 * Read a drive setting and return the value. The caller
272 * must hold the ide_setting_sem when making this call. 272 * must hold the ide_setting_mtx when making this call.
273 * 273 *
274 * BUGS: the data return and error are the same return value 274 * BUGS: the data return and error are the same return value
275 * so an error -EINVAL and true return of the same value cannot 275 * so an error -EINVAL and true return of the same value cannot
@@ -306,7 +306,7 @@ static int ide_read_setting(ide_drive_t *drive, ide_settings_t *setting)
306 * @val: value 306 * @val: value
307 * 307 *
308 * Write a drive setting if it is possible. The caller 308 * Write a drive setting if it is possible. The caller
309 * must hold the ide_setting_sem when making this call. 309 * must hold the ide_setting_mtx when making this call.
310 * 310 *
311 * BUGS: the data return and error are the same return value 311 * BUGS: the data return and error are the same return value
312 * so an error -EINVAL and true return of the same value cannot 312 * so an error -EINVAL and true return of the same value cannot
@@ -367,7 +367,7 @@ static int set_xfer_rate (ide_drive_t *drive, int arg)
367 * @drive: drive being configured 367 * @drive: drive being configured
368 * 368 *
369 * Add the generic parts of the system settings to the /proc files. 369 * Add the generic parts of the system settings to the /proc files.
370 * The caller must not be holding the ide_setting_sem. 370 * The caller must not be holding the ide_setting_mtx.
371 */ 371 */
372 372
373void ide_add_generic_settings (ide_drive_t *drive) 373void ide_add_generic_settings (ide_drive_t *drive)
@@ -408,7 +408,7 @@ static int proc_ide_read_settings
408 408
409 proc_ide_settings_warn(); 409 proc_ide_settings_warn();
410 410
411 down(&ide_setting_sem); 411 mutex_lock(&ide_setting_mtx);
412 out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n"); 412 out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n");
413 out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n"); 413 out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n");
414 while(setting) { 414 while(setting) {
@@ -428,7 +428,7 @@ static int proc_ide_read_settings
428 setting = setting->next; 428 setting = setting->next;
429 } 429 }
430 len = out - page; 430 len = out - page;
431 up(&ide_setting_sem); 431 mutex_unlock(&ide_setting_mtx);
432 PROC_IDE_READ_RETURN(page,start,off,count,eof,len); 432 PROC_IDE_READ_RETURN(page,start,off,count,eof,len);
433} 433}
434 434
@@ -508,16 +508,16 @@ static int proc_ide_write_settings(struct file *file, const char __user *buffer,
508 ++p; 508 ++p;
509 } 509 }
510 510
511 down(&ide_setting_sem); 511 mutex_lock(&ide_setting_mtx);
512 setting = ide_find_setting_by_name(drive, name); 512 setting = ide_find_setting_by_name(drive, name);
513 if (!setting) 513 if (!setting)
514 { 514 {
515 up(&ide_setting_sem); 515 mutex_unlock(&ide_setting_mtx);
516 goto parse_error; 516 goto parse_error;
517 } 517 }
518 if (for_real) 518 if (for_real)
519 ide_write_setting(drive, setting, val * setting->div_factor / setting->mul_factor); 519 ide_write_setting(drive, setting, val * setting->div_factor / setting->mul_factor);
520 up(&ide_setting_sem); 520 mutex_unlock(&ide_setting_mtx);
521 } 521 }
522 } while (!for_real++); 522 } while (!for_real++);
523 free_page((unsigned long)buf); 523 free_page((unsigned long)buf);
@@ -705,7 +705,7 @@ EXPORT_SYMBOL(ide_proc_register_driver);
705 * Clean up the driver specific /proc files and IDE settings 705 * Clean up the driver specific /proc files and IDE settings
706 * for a given drive. 706 * for a given drive.
707 * 707 *
708 * Takes ide_setting_sem and ide_lock. 708 * Takes ide_setting_mtx and ide_lock.
709 * Caller must hold none of the locks. 709 * Caller must hold none of the locks.
710 */ 710 */
711 711
@@ -715,10 +715,10 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver)
715 715
716 ide_remove_proc_entries(drive->proc, driver->proc); 716 ide_remove_proc_entries(drive->proc, driver->proc);
717 717
718 down(&ide_setting_sem); 718 mutex_lock(&ide_setting_mtx);
719 spin_lock_irqsave(&ide_lock, flags); 719 spin_lock_irqsave(&ide_lock, flags);
720 /* 720 /*
721 * ide_setting_sem protects the settings list 721 * ide_setting_mtx protects the settings list
722 * ide_lock protects the use of settings 722 * ide_lock protects the use of settings
723 * 723 *
724 * so we need to hold both, ide_settings_sem because we want to 724 * so we need to hold both, ide_settings_sem because we want to
@@ -726,11 +726,11 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver)
726 * a setting out that is being used. 726 * a setting out that is being used.
727 * 727 *
728 * OTOH both ide_{read,write}_setting are only ever used under 728 * OTOH both ide_{read,write}_setting are only ever used under
729 * ide_setting_sem. 729 * ide_setting_mtx.
730 */ 730 */
731 auto_remove_settings(drive); 731 auto_remove_settings(drive);
732 spin_unlock_irqrestore(&ide_lock, flags); 732 spin_unlock_irqrestore(&ide_lock, flags);
733 up(&ide_setting_sem); 733 mutex_unlock(&ide_setting_mtx);
734} 734}
735 735
736EXPORT_SYMBOL(ide_proc_unregister_driver); 736EXPORT_SYMBOL(ide_proc_unregister_driver);
diff --git a/drivers/ide/ide-timing.h b/drivers/ide/ide-timing.h
index c0864b1e9228..e6cb8593b5ba 100644
--- a/drivers/ide/ide-timing.h
+++ b/drivers/ide/ide-timing.h
@@ -102,66 +102,16 @@ static struct ide_timing ide_timing[] = {
102#define EZ(v,unit) ((v)?ENOUGH(v,unit):0) 102#define EZ(v,unit) ((v)?ENOUGH(v,unit):0)
103 103
104#define XFER_MODE 0xf0 104#define XFER_MODE 0xf0
105#define XFER_UDMA_133 0x48
106#define XFER_UDMA_100 0x44
107#define XFER_UDMA_66 0x42
108#define XFER_UDMA 0x40
109#define XFER_MWDMA 0x20 105#define XFER_MWDMA 0x20
110#define XFER_SWDMA 0x10
111#define XFER_EPIO 0x01 106#define XFER_EPIO 0x01
112#define XFER_PIO 0x00 107#define XFER_PIO 0x00
113 108
114static short ide_find_best_mode(ide_drive_t *drive, int map) 109static short ide_find_best_pio_mode(ide_drive_t *drive)
115{ 110{
116 struct hd_driveid *id = drive->id; 111 struct hd_driveid *id = drive->id;
117 short best = 0; 112 short best = 0;
118 113
119 if (!id) 114 if (id->field_valid & 2) { /* EIDE PIO modes */
120 return XFER_PIO_SLOW;
121
122 if ((map & XFER_UDMA) && (id->field_valid & 4)) { /* Want UDMA and UDMA bitmap valid */
123
124 if ((map & XFER_UDMA_133) == XFER_UDMA_133)
125 if ((best = (id->dma_ultra & 0x0040) ? XFER_UDMA_6 : 0)) return best;
126
127 if ((map & XFER_UDMA_100) == XFER_UDMA_100)
128 if ((best = (id->dma_ultra & 0x0020) ? XFER_UDMA_5 : 0)) return best;
129
130 if ((map & XFER_UDMA_66) == XFER_UDMA_66)
131 if ((best = (id->dma_ultra & 0x0010) ? XFER_UDMA_4 :
132 (id->dma_ultra & 0x0008) ? XFER_UDMA_3 : 0)) return best;
133
134 if ((best = (id->dma_ultra & 0x0004) ? XFER_UDMA_2 :
135 (id->dma_ultra & 0x0002) ? XFER_UDMA_1 :
136 (id->dma_ultra & 0x0001) ? XFER_UDMA_0 : 0)) return best;
137 }
138
139 if ((map & XFER_MWDMA) && (id->field_valid & 2)) { /* Want MWDMA and drive has EIDE fields */
140
141 if ((best = (id->dma_mword & 0x0004) ? XFER_MW_DMA_2 :
142 (id->dma_mword & 0x0002) ? XFER_MW_DMA_1 :
143 (id->dma_mword & 0x0001) ? XFER_MW_DMA_0 : 0)) return best;
144 }
145
146 if (map & XFER_SWDMA) { /* Want SWDMA */
147
148 if (id->field_valid & 2) { /* EIDE SWDMA */
149
150 if ((best = (id->dma_1word & 0x0004) ? XFER_SW_DMA_2 :
151 (id->dma_1word & 0x0002) ? XFER_SW_DMA_1 :
152 (id->dma_1word & 0x0001) ? XFER_SW_DMA_0 : 0)) return best;
153 }
154
155 if (id->capability & 1) { /* Pre-EIDE style SWDMA */
156
157 if ((best = (id->tDMA == 2) ? XFER_SW_DMA_2 :
158 (id->tDMA == 1) ? XFER_SW_DMA_1 :
159 (id->tDMA == 0) ? XFER_SW_DMA_0 : 0)) return best;
160 }
161 }
162
163
164 if ((map & XFER_EPIO) && (id->field_valid & 2)) { /* EIDE PIO modes */
165 115
166 if ((best = (drive->id->eide_pio_modes & 4) ? XFER_PIO_5 : 116 if ((best = (drive->id->eide_pio_modes & 4) ? XFER_PIO_5 :
167 (drive->id->eide_pio_modes & 2) ? XFER_PIO_4 : 117 (drive->id->eide_pio_modes & 2) ? XFER_PIO_4 :
@@ -262,7 +212,7 @@ static int ide_timing_compute(ide_drive_t *drive, short speed, struct ide_timing
262 */ 212 */
263 213
264 if ((speed & XFER_MODE) != XFER_PIO) { 214 if ((speed & XFER_MODE) != XFER_PIO) {
265 ide_timing_compute(drive, ide_find_best_mode(drive, XFER_PIO | XFER_EPIO), &p, T, UT); 215 ide_timing_compute(drive, ide_find_best_pio_mode(drive), &p, T, UT);
266 ide_timing_merge(&p, t, t, IDE_TIMING_ALL); 216 ide_timing_merge(&p, t, t, IDE_TIMING_ALL);
267 } 217 }
268 218
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 0cd76bf66833..c948a5c17a5d 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -169,7 +169,7 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR,
169static int idebus_parameter; /* holds the "idebus=" parameter */ 169static int idebus_parameter; /* holds the "idebus=" parameter */
170static int system_bus_speed; /* holds what we think is VESA/PCI bus speed */ 170static int system_bus_speed; /* holds what we think is VESA/PCI bus speed */
171 171
172DECLARE_MUTEX(ide_cfg_sem); 172DEFINE_MUTEX(ide_cfg_mtx);
173 __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); 173 __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock);
174 174
175#ifdef CONFIG_IDEPCI_PCIBUS_ORDER 175#ifdef CONFIG_IDEPCI_PCIBUS_ORDER
@@ -460,6 +460,8 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
460 hwif->mwdma_mask = tmp_hwif->mwdma_mask; 460 hwif->mwdma_mask = tmp_hwif->mwdma_mask;
461 hwif->swdma_mask = tmp_hwif->swdma_mask; 461 hwif->swdma_mask = tmp_hwif->swdma_mask;
462 462
463 hwif->cbl = tmp_hwif->cbl;
464
463 hwif->chipset = tmp_hwif->chipset; 465 hwif->chipset = tmp_hwif->chipset;
464 hwif->hold = tmp_hwif->hold; 466 hwif->hold = tmp_hwif->hold;
465 467
@@ -496,8 +498,8 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
496 hwif->ide_dma_clear_irq = tmp_hwif->ide_dma_clear_irq; 498 hwif->ide_dma_clear_irq = tmp_hwif->ide_dma_clear_irq;
497 hwif->dma_host_on = tmp_hwif->dma_host_on; 499 hwif->dma_host_on = tmp_hwif->dma_host_on;
498 hwif->dma_host_off = tmp_hwif->dma_host_off; 500 hwif->dma_host_off = tmp_hwif->dma_host_off;
499 hwif->ide_dma_lostirq = tmp_hwif->ide_dma_lostirq; 501 hwif->dma_lost_irq = tmp_hwif->dma_lost_irq;
500 hwif->ide_dma_timeout = tmp_hwif->ide_dma_timeout; 502 hwif->dma_timeout = tmp_hwif->dma_timeout;
501 503
502 hwif->OUTB = tmp_hwif->OUTB; 504 hwif->OUTB = tmp_hwif->OUTB;
503 hwif->OUTBSYNC = tmp_hwif->OUTBSYNC; 505 hwif->OUTBSYNC = tmp_hwif->OUTBSYNC;
@@ -533,7 +535,6 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
533 hwif->extra_base = tmp_hwif->extra_base; 535 hwif->extra_base = tmp_hwif->extra_base;
534 hwif->extra_ports = tmp_hwif->extra_ports; 536 hwif->extra_ports = tmp_hwif->extra_ports;
535 hwif->autodma = tmp_hwif->autodma; 537 hwif->autodma = tmp_hwif->autodma;
536 hwif->udma_four = tmp_hwif->udma_four;
537 538
538 hwif->hwif_data = tmp_hwif->hwif_data; 539 hwif->hwif_data = tmp_hwif->hwif_data;
539} 540}
@@ -564,7 +565,7 @@ void ide_unregister(unsigned int index)
564{ 565{
565 ide_drive_t *drive; 566 ide_drive_t *drive;
566 ide_hwif_t *hwif, *g; 567 ide_hwif_t *hwif, *g;
567 static ide_hwif_t tmp_hwif; /* protected by ide_cfg_sem */ 568 static ide_hwif_t tmp_hwif; /* protected by ide_cfg_mtx */
568 ide_hwgroup_t *hwgroup; 569 ide_hwgroup_t *hwgroup;
569 int irq_count = 0, unit; 570 int irq_count = 0, unit;
570 571
@@ -572,7 +573,7 @@ void ide_unregister(unsigned int index)
572 573
573 BUG_ON(in_interrupt()); 574 BUG_ON(in_interrupt());
574 BUG_ON(irqs_disabled()); 575 BUG_ON(irqs_disabled());
575 down(&ide_cfg_sem); 576 mutex_lock(&ide_cfg_mtx);
576 spin_lock_irq(&ide_lock); 577 spin_lock_irq(&ide_lock);
577 hwif = &ide_hwifs[index]; 578 hwif = &ide_hwifs[index];
578 if (!hwif->present) 579 if (!hwif->present)
@@ -679,7 +680,7 @@ void ide_unregister(unsigned int index)
679 680
680abort: 681abort:
681 spin_unlock_irq(&ide_lock); 682 spin_unlock_irq(&ide_lock);
682 up(&ide_cfg_sem); 683 mutex_unlock(&ide_cfg_mtx);
683} 684}
684 685
685EXPORT_SYMBOL(ide_unregister); 686EXPORT_SYMBOL(ide_unregister);
@@ -817,9 +818,9 @@ EXPORT_SYMBOL(ide_register_hw);
817 * Locks for IDE setting functionality 818 * Locks for IDE setting functionality
818 */ 819 */
819 820
820DECLARE_MUTEX(ide_setting_sem); 821DEFINE_MUTEX(ide_setting_mtx);
821 822
822EXPORT_SYMBOL_GPL(ide_setting_sem); 823EXPORT_SYMBOL_GPL(ide_setting_mtx);
823 824
824/** 825/**
825 * ide_spin_wait_hwgroup - wait for group 826 * ide_spin_wait_hwgroup - wait for group
@@ -1192,11 +1193,11 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
1192 } 1193 }
1193 1194
1194read_val: 1195read_val:
1195 down(&ide_setting_sem); 1196 mutex_lock(&ide_setting_mtx);
1196 spin_lock_irqsave(&ide_lock, flags); 1197 spin_lock_irqsave(&ide_lock, flags);
1197 err = *val; 1198 err = *val;
1198 spin_unlock_irqrestore(&ide_lock, flags); 1199 spin_unlock_irqrestore(&ide_lock, flags);
1199 up(&ide_setting_sem); 1200 mutex_unlock(&ide_setting_mtx);
1200 return err >= 0 ? put_user(err, (long __user *)arg) : err; 1201 return err >= 0 ? put_user(err, (long __user *)arg) : err;
1201 1202
1202set_val: 1203set_val:
@@ -1206,9 +1207,9 @@ set_val:
1206 if (!capable(CAP_SYS_ADMIN)) 1207 if (!capable(CAP_SYS_ADMIN))
1207 err = -EACCES; 1208 err = -EACCES;
1208 else { 1209 else {
1209 down(&ide_setting_sem); 1210 mutex_lock(&ide_setting_mtx);
1210 err = setfunc(drive, arg); 1211 err = setfunc(drive, arg);
1211 up(&ide_setting_sem); 1212 mutex_unlock(&ide_setting_mtx);
1212 } 1213 }
1213 } 1214 }
1214 return err; 1215 return err;
@@ -1548,7 +1549,11 @@ static int __init ide_setup(char *s)
1548 goto bad_option; 1549 goto bad_option;
1549 case -7: /* ata66 */ 1550 case -7: /* ata66 */
1550#ifdef CONFIG_BLK_DEV_IDEPCI 1551#ifdef CONFIG_BLK_DEV_IDEPCI
1551 hwif->udma_four = 1; 1552 /*
1553 * Use ATA_CBL_PATA40_SHORT so drive side
1554 * cable detection is also overriden.
1555 */
1556 hwif->cbl = ATA_CBL_PATA40_SHORT;
1552 goto obsolete_option; 1557 goto obsolete_option;
1553#else 1558#else
1554 goto bad_hwif; 1559 goto bad_hwif;
diff --git a/drivers/ide/legacy/hd.c b/drivers/ide/legacy/hd.c
index 45ed03591cd8..661c12f6dda6 100644
--- a/drivers/ide/legacy/hd.c
+++ b/drivers/ide/legacy/hd.c
@@ -130,7 +130,7 @@ struct hd_i_struct {
130 130
131#ifdef HD_TYPE 131#ifdef HD_TYPE
132static struct hd_i_struct hd_info[] = { HD_TYPE }; 132static struct hd_i_struct hd_info[] = { HD_TYPE };
133static int NR_HD = ((sizeof (hd_info))/(sizeof (struct hd_i_struct))); 133static int NR_HD = ARRAY_SIZE(hd_info);
134#else 134#else
135static struct hd_i_struct hd_info[MAX_HD]; 135static struct hd_i_struct hd_info[MAX_HD];
136static int NR_HD; 136static int NR_HD;
diff --git a/drivers/ide/legacy/macide.c b/drivers/ide/legacy/macide.c
index c211fc78345d..b557c45a5a9d 100644
--- a/drivers/ide/legacy/macide.c
+++ b/drivers/ide/legacy/macide.c
@@ -77,15 +77,6 @@ int macide_ack_intr(ide_hwif_t* hwif)
77 return 0; 77 return 0;
78} 78}
79 79
80#ifdef CONFIG_BLK_DEV_MAC_MEDIABAY
81static void macide_mediabay_interrupt(int irq, void *dev_id)
82{
83 int state = baboon->mb_status & 0x04;
84
85 printk(KERN_INFO "macide: media bay %s detected\n", state? "removal":"insertion");
86}
87#endif
88
89/* 80/*
90 * Probe for a Macintosh IDE interface 81 * Probe for a Macintosh IDE interface
91 */ 82 */
@@ -128,11 +119,6 @@ void macide_init(void)
128 ide_drive_t *drive = &ide_hwifs[index].drives[0]; 119 ide_drive_t *drive = &ide_hwifs[index].drives[0];
129 drive->capacity64 = drive->cyl*drive->head*drive->sect; 120 drive->capacity64 = drive->cyl*drive->head*drive->sect;
130 121
131#ifdef CONFIG_BLK_DEV_MAC_MEDIABAY
132 request_irq(IRQ_BABOON_2, macide_mediabay_interrupt,
133 IRQ_FLG_FAST, "mediabay",
134 macide_mediabay_interrupt);
135#endif
136 } 122 }
137 break; 123 break;
138 124
diff --git a/drivers/ide/mips/au1xxx-ide.c b/drivers/ide/mips/au1xxx-ide.c
index ca95e990862e..2e7013a2a7f6 100644
--- a/drivers/ide/mips/au1xxx-ide.c
+++ b/drivers/ide/mips/au1xxx-ide.c
@@ -381,9 +381,7 @@ static int auide_dma_setup(ide_drive_t *drive)
381 381
382static int auide_dma_check(ide_drive_t *drive) 382static int auide_dma_check(ide_drive_t *drive)
383{ 383{
384 u8 speed; 384 u8 speed = ide_max_dma_mode(drive);
385
386#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA
387 385
388 if( dbdma_init_done == 0 ){ 386 if( dbdma_init_done == 0 ){
389 auide_hwif.white_list = ide_in_drive_list(drive->id, 387 auide_hwif.white_list = ide_in_drive_list(drive->id,
@@ -394,7 +392,6 @@ static int auide_dma_check(ide_drive_t *drive)
394 auide_ddma_init(&auide_hwif); 392 auide_ddma_init(&auide_hwif);
395 dbdma_init_done = 1; 393 dbdma_init_done = 1;
396 } 394 }
397#endif
398 395
399 /* Is the drive in our DMA black list? */ 396 /* Is the drive in our DMA black list? */
400 397
@@ -409,8 +406,6 @@ static int auide_dma_check(ide_drive_t *drive)
409 else 406 else
410 drive->using_dma = 1; 407 drive->using_dma = 1;
411 408
412 speed = ide_find_best_mode(drive, XFER_PIO | XFER_MWDMA);
413
414 if (drive->autodma && (speed & XFER_MODE) != XFER_PIO) 409 if (drive->autodma && (speed & XFER_MODE) != XFER_PIO)
415 return 0; 410 return 0;
416 411
@@ -456,10 +451,9 @@ static void auide_dma_off_quietly(ide_drive_t *drive)
456 drive->using_dma = 0; 451 drive->using_dma = 0;
457} 452}
458 453
459static int auide_dma_lostirq(ide_drive_t *drive) 454static void auide_dma_lost_irq(ide_drive_t *drive)
460{ 455{
461 printk(KERN_ERR "%s: IRQ lost\n", drive->name); 456 printk(KERN_ERR "%s: IRQ lost\n", drive->name);
462 return 0;
463} 457}
464 458
465static void auide_ddma_tx_callback(int irq, void *param) 459static void auide_ddma_tx_callback(int irq, void *param)
@@ -489,16 +483,16 @@ static void auide_init_dbdma_dev(dbdev_tab_t *dev, u32 dev_id, u32 tsize, u32 de
489 483
490#if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA) 484#if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA)
491 485
492static int auide_dma_timeout(ide_drive_t *drive) 486static void auide_dma_timeout(ide_drive_t *drive)
493{ 487{
494// printk("%s\n", __FUNCTION__); 488 ide_hwif_t *hwif = HWIF(drive);
495 489
496 printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name); 490 printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name);
497 491
498 if (HWIF(drive)->ide_dma_test_irq(drive)) 492 if (hwif->ide_dma_test_irq(drive))
499 return 0; 493 return;
500 494
501 return HWIF(drive)->ide_dma_end(drive); 495 hwif->ide_dma_end(drive);
502} 496}
503 497
504 498
@@ -721,7 +715,7 @@ static int au_ide_probe(struct device *dev)
721 715
722#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA 716#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA
723 hwif->dma_off_quietly = &auide_dma_off_quietly; 717 hwif->dma_off_quietly = &auide_dma_off_quietly;
724 hwif->ide_dma_timeout = &auide_dma_timeout; 718 hwif->dma_timeout = &auide_dma_timeout;
725 719
726 hwif->ide_dma_check = &auide_dma_check; 720 hwif->ide_dma_check = &auide_dma_check;
727 hwif->dma_exec_cmd = &auide_dma_exec_cmd; 721 hwif->dma_exec_cmd = &auide_dma_exec_cmd;
@@ -731,7 +725,7 @@ static int au_ide_probe(struct device *dev)
731 hwif->ide_dma_test_irq = &auide_dma_test_irq; 725 hwif->ide_dma_test_irq = &auide_dma_test_irq;
732 hwif->dma_host_off = &auide_dma_host_off; 726 hwif->dma_host_off = &auide_dma_host_off;
733 hwif->dma_host_on = &auide_dma_host_on; 727 hwif->dma_host_on = &auide_dma_host_on;
734 hwif->ide_dma_lostirq = &auide_dma_lostirq; 728 hwif->dma_lost_irq = &auide_dma_lost_irq;
735 hwif->ide_dma_on = &auide_dma_on; 729 hwif->ide_dma_on = &auide_dma_on;
736 730
737 hwif->autodma = 1; 731 hwif->autodma = 1;
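The au1xxx conversion above follows the tree-wide change from int-returning ide_dma_lostirq/ide_dma_timeout hooks to void dma_lost_irq/dma_timeout hooks. A compilable sketch of the new handler shape; the stub_* types below are placeholders, not the real ide_hwif_t/ide_drive_t from <linux/ide.h>.

#include <stdio.h>

struct stub_drive { const char *name; };

struct stub_hwif {
        void (*dma_lost_irq)(struct stub_drive *);
        void (*dma_timeout)(struct stub_drive *);
};

/* New-style handlers: report the condition and return, no status code. */
static void my_dma_lost_irq(struct stub_drive *drive)
{
        printf("%s: IRQ lost\n", drive->name);
}

static void my_dma_timeout(struct stub_drive *drive)
{
        printf("%s: DMA timeout occurred\n", drive->name);
}

int main(void)
{
        struct stub_drive drive = { "hda" };
        struct stub_hwif hwif = { my_dma_lost_irq, my_dma_timeout };

        hwif.dma_lost_irq(&drive);
        hwif.dma_timeout(&drive);
        return 0;
}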
diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index b173bc66ce1e..e5d09367627e 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/aec62xx.c Version 0.21 Apr 21, 2007 2 * linux/drivers/ide/pci/aec62xx.c Version 0.24 May 24, 2007
3 * 3 *
4 * Copyright (C) 1999-2002 Andre Hedrick <andre@linux-ide.org> 4 * Copyright (C) 1999-2002 Andre Hedrick <andre@linux-ide.org>
5 * Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com> 5 * Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com>
@@ -140,25 +140,10 @@ static int aec6260_tune_chipset (ide_drive_t *drive, u8 xferspeed)
140 return(ide_config_drive_speed(drive, speed)); 140 return(ide_config_drive_speed(drive, speed));
141} 141}
142 142
143static int aec62xx_tune_chipset (ide_drive_t *drive, u8 speed)
144{
145 switch (HWIF(drive)->pci_dev->device) {
146 case PCI_DEVICE_ID_ARTOP_ATP865:
147 case PCI_DEVICE_ID_ARTOP_ATP865R:
148 case PCI_DEVICE_ID_ARTOP_ATP860:
149 case PCI_DEVICE_ID_ARTOP_ATP860R:
150 return ((int) aec6260_tune_chipset(drive, speed));
151 case PCI_DEVICE_ID_ARTOP_ATP850UF:
152 return ((int) aec6210_tune_chipset(drive, speed));
153 default:
154 return -1;
155 }
156}
157
158static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio) 143static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio)
159{ 144{
160 pio = ide_get_best_pio_mode(drive, pio, 4, NULL); 145 pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
161 (void) aec62xx_tune_chipset(drive, pio + XFER_PIO_0); 146 (void) HWIF(drive)->speedproc(drive, pio + XFER_PIO_0);
162} 147}
163 148
164static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive) 149static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive)
@@ -172,12 +157,9 @@ static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive)
172 return -1; 157 return -1;
173} 158}
174 159
175static int aec62xx_irq_timeout (ide_drive_t *drive) 160static void aec62xx_dma_lost_irq (ide_drive_t *drive)
176{ 161{
177 ide_hwif_t *hwif = HWIF(drive); 162 switch (HWIF(drive)->pci_dev->device) {
178 struct pci_dev *dev = hwif->pci_dev;
179
180 switch(dev->device) {
181 case PCI_DEVICE_ID_ARTOP_ATP860: 163 case PCI_DEVICE_ID_ARTOP_ATP860:
182 case PCI_DEVICE_ID_ARTOP_ATP860R: 164 case PCI_DEVICE_ID_ARTOP_ATP860R:
183 case PCI_DEVICE_ID_ARTOP_ATP865: 165 case PCI_DEVICE_ID_ARTOP_ATP865:
@@ -186,7 +168,6 @@ static int aec62xx_irq_timeout (ide_drive_t *drive)
186 default: 168 default:
187 break; 169 break;
188 } 170 }
189 return 0;
190} 171}
191 172
192static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const char *name) 173static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const char *name)
@@ -224,64 +205,46 @@ static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const ch
224 205
225static void __devinit init_hwif_aec62xx(ide_hwif_t *hwif) 206static void __devinit init_hwif_aec62xx(ide_hwif_t *hwif)
226{ 207{
227 struct pci_dev *dev = hwif->pci_dev; 208 struct pci_dev *dev = hwif->pci_dev;
209 u8 reg54 = 0, mask = hwif->channel ? 0xf0 : 0x0f;
210 unsigned long flags;
228 211
229 hwif->autodma = 0;
230 hwif->tuneproc = &aec62xx_tune_drive; 212 hwif->tuneproc = &aec62xx_tune_drive;
231 hwif->speedproc = &aec62xx_tune_chipset;
232 213
233 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) 214 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) {
234 hwif->serialized = hwif->channel; 215 if(hwif->mate)
235 216 hwif->mate->serialized = hwif->serialized = 1;
236 if (hwif->mate) 217 hwif->speedproc = &aec6210_tune_chipset;
237 hwif->mate->serialized = hwif->serialized; 218 } else
219 hwif->speedproc = &aec6260_tune_chipset;
238 220
239 if (!hwif->dma_base) { 221 if (!hwif->dma_base) {
240 hwif->drives[0].autotune = 1; 222 hwif->drives[0].autotune = hwif->drives[1].autotune = 1;
241 hwif->drives[1].autotune = 1;
242 return; 223 return;
243 } 224 }
244 225
245 hwif->ultra_mask = hwif->cds->udma_mask; 226 hwif->ultra_mask = hwif->cds->udma_mask;
246
247 /* atp865 and atp865r */
248 if (hwif->ultra_mask == 0x3f) {
249 /* check bit 0x10 of DMA status register */
250 if (inb(pci_resource_start(dev, 4) + 2) & 0x10)
251 hwif->ultra_mask = 0x7f; /* udma0-6 */
252 }
253
254 hwif->mwdma_mask = 0x07; 227 hwif->mwdma_mask = 0x07;
255 228
256 hwif->ide_dma_check = &aec62xx_config_drive_xfer_rate; 229 hwif->ide_dma_check = &aec62xx_config_drive_xfer_rate;
257 hwif->ide_dma_lostirq = &aec62xx_irq_timeout; 230 hwif->dma_lost_irq = &aec62xx_dma_lost_irq;
258
259 if (!noautodma)
260 hwif->autodma = 1;
261 hwif->drives[0].autodma = hwif->autodma;
262 hwif->drives[1].autodma = hwif->autodma;
263}
264
265static void __devinit init_dma_aec62xx(ide_hwif_t *hwif, unsigned long dmabase)
266{
267 struct pci_dev *dev = hwif->pci_dev;
268 231
269 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) { 232 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) {
270 u8 reg54h = 0;
271 unsigned long flags;
272
273 spin_lock_irqsave(&ide_lock, flags); 233 spin_lock_irqsave(&ide_lock, flags);
274 pci_read_config_byte(dev, 0x54, &reg54h); 234 pci_read_config_byte (dev, 0x54, &reg54);
275 pci_write_config_byte(dev, 0x54, reg54h & ~(hwif->channel ? 0xF0 : 0x0F)); 235 pci_write_config_byte(dev, 0x54, (reg54 & ~mask));
276 spin_unlock_irqrestore(&ide_lock, flags); 236 spin_unlock_irqrestore(&ide_lock, flags);
277 } else { 237 } else if (hwif->cbl != ATA_CBL_PATA40_SHORT) {
278 u8 ata66 = 0; 238 u8 ata66 = 0, mask = hwif->channel ? 0x02 : 0x01;
239
279 pci_read_config_byte(hwif->pci_dev, 0x49, &ata66); 240 pci_read_config_byte(hwif->pci_dev, 0x49, &ata66);
280 if (!(hwif->udma_four)) 241
281 hwif->udma_four = (ata66&(hwif->channel?0x02:0x01))?0:1; 242 hwif->cbl = (ata66 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
282 } 243 }
283 244
284 ide_setup_dma(hwif, dmabase, 8); 245 if (!noautodma)
246 hwif->autodma = 1;
247 hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma;
285} 248}
286 249
287static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d) 250static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d)
@@ -291,16 +254,12 @@ static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d
291 254
292static int __devinit init_setup_aec6x80(struct pci_dev *dev, ide_pci_device_t *d) 255static int __devinit init_setup_aec6x80(struct pci_dev *dev, ide_pci_device_t *d)
293{ 256{
294 unsigned long bar4reg = pci_resource_start(dev, 4); 257 unsigned long dma_base = pci_resource_start(dev, 4);
295 258
296 if (inb(bar4reg+2) & 0x10) { 259 if (inb(dma_base + 2) & 0x10) {
297 strcpy(d->name, "AEC6880"); 260 d->name = (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) ?
298 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) 261 "AEC6880R" : "AEC6880";
299 strcpy(d->name, "AEC6880R"); 262 d->udma_mask = 0x7f; /* udma0-6 */
300 } else {
301 strcpy(d->name, "AEC6280");
302 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R)
303 strcpy(d->name, "AEC6280R");
304 } 263 }
305 264
306 return ide_setup_pci_device(dev, d); 265 return ide_setup_pci_device(dev, d);
@@ -312,7 +271,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
312 .init_setup = init_setup_aec62xx, 271 .init_setup = init_setup_aec62xx,
313 .init_chipset = init_chipset_aec62xx, 272 .init_chipset = init_chipset_aec62xx,
314 .init_hwif = init_hwif_aec62xx, 273 .init_hwif = init_hwif_aec62xx,
315 .init_dma = init_dma_aec62xx,
316 .channels = 2, 274 .channels = 2,
317 .autodma = AUTODMA, 275 .autodma = AUTODMA,
318 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, 276 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
@@ -323,7 +281,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
323 .init_setup = init_setup_aec62xx, 281 .init_setup = init_setup_aec62xx,
324 .init_chipset = init_chipset_aec62xx, 282 .init_chipset = init_chipset_aec62xx,
325 .init_hwif = init_hwif_aec62xx, 283 .init_hwif = init_hwif_aec62xx,
326 .init_dma = init_dma_aec62xx,
327 .channels = 2, 284 .channels = 2,
328 .autodma = NOAUTODMA, 285 .autodma = NOAUTODMA,
329 .bootable = OFF_BOARD, 286 .bootable = OFF_BOARD,
@@ -333,28 +290,25 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
333 .init_setup = init_setup_aec62xx, 290 .init_setup = init_setup_aec62xx,
334 .init_chipset = init_chipset_aec62xx, 291 .init_chipset = init_chipset_aec62xx,
335 .init_hwif = init_hwif_aec62xx, 292 .init_hwif = init_hwif_aec62xx,
336 .init_dma = init_dma_aec62xx,
337 .channels = 2, 293 .channels = 2,
338 .autodma = AUTODMA, 294 .autodma = AUTODMA,
339 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, 295 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
340 .bootable = NEVER_BOARD, 296 .bootable = NEVER_BOARD,
341 .udma_mask = 0x1f, /* udma0-4 */ 297 .udma_mask = 0x1f, /* udma0-4 */
342 },{ /* 3 */ 298 },{ /* 3 */
343 .name = "AEC6X80", 299 .name = "AEC6280",
344 .init_setup = init_setup_aec6x80, 300 .init_setup = init_setup_aec6x80,
345 .init_chipset = init_chipset_aec62xx, 301 .init_chipset = init_chipset_aec62xx,
346 .init_hwif = init_hwif_aec62xx, 302 .init_hwif = init_hwif_aec62xx,
347 .init_dma = init_dma_aec62xx,
348 .channels = 2, 303 .channels = 2,
349 .autodma = AUTODMA, 304 .autodma = AUTODMA,
350 .bootable = OFF_BOARD, 305 .bootable = OFF_BOARD,
351 .udma_mask = 0x3f, /* udma0-5 */ 306 .udma_mask = 0x3f, /* udma0-5 */
352 },{ /* 4 */ 307 },{ /* 4 */
353 .name = "AEC6X80R", 308 .name = "AEC6280R",
354 .init_setup = init_setup_aec6x80, 309 .init_setup = init_setup_aec6x80,
355 .init_chipset = init_chipset_aec62xx, 310 .init_chipset = init_chipset_aec62xx,
356 .init_hwif = init_hwif_aec62xx, 311 .init_hwif = init_hwif_aec62xx,
357 .init_dma = init_dma_aec62xx,
358 .channels = 2, 312 .channels = 2,
359 .autodma = AUTODMA, 313 .autodma = AUTODMA,
360 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, 314 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
@@ -370,13 +324,16 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
370 * 324 *
371 * Called when the PCI registration layer (or the IDE initialization) 325 * Called when the PCI registration layer (or the IDE initialization)
372 * finds a device matching our IDE device tables. 326 * finds a device matching our IDE device tables.
327 *
328 * NOTE: since we're going to modify the 'name' field for AEC-6[26]80[R]
329 * chips, pass a local copy of 'struct pci_device_id' down the call chain.
373 */ 330 */
374 331
375static int __devinit aec62xx_init_one(struct pci_dev *dev, const struct pci_device_id *id) 332static int __devinit aec62xx_init_one(struct pci_dev *dev, const struct pci_device_id *id)
376{ 333{
377 ide_pci_device_t *d = &aec62xx_chipsets[id->driver_data]; 334 ide_pci_device_t d = aec62xx_chipsets[id->driver_data];
378 335
379 return d->init_setup(dev, d); 336 return d.init_setup(dev, &d);
380} 337}
381 338
382static struct pci_device_id aec62xx_pci_tbl[] = { 339static struct pci_device_id aec62xx_pci_tbl[] = {
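The NOTE added above is the reason aec62xx_init_one() now copies the chipset descriptor onto the stack: init_setup_aec6x80() rewrites d->name, and modifying the static aec62xx_chipsets[] entry in place would leak that change into any later probe. A small user-space model of the pattern; the struct and helper names are invented.

#include <stdio.h>

struct chip_desc {
        const char    *name;
        unsigned char  udma_mask;
};

static const struct chip_desc chipsets[] = {
        { "AEC6280", 0x3f },
};

/* Pretend probe: may rename the chip depending on what the hardware reports. */
static int probe_one(struct chip_desc *d, int has_66mhz_clock)
{
        if (has_66mhz_clock) {
                d->name = "AEC6880";
                d->udma_mask = 0x7f;
        }
        printf("registered %s (udma_mask 0x%02x)\n", d->name, d->udma_mask);
        return 0;
}

int main(void)
{
        /* Work on a local copy so the shared table is never modified. */
        struct chip_desc d = chipsets[0];

        probe_one(&d, 1);
        printf("table entry still reads %s\n", chipsets[0].name);
        return 0;
}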
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index 27525ec2e19a..8a6b27b3bcc3 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/alim15x3.c Version 0.21 2007/02/03 2 * linux/drivers/ide/pci/alim15x3.c Version 0.25 Jun 9 2007
3 * 3 *
4 * Copyright (C) 1998-2000 Michel Aubry, Maintainer 4 * Copyright (C) 1998-2000 Michel Aubry, Maintainer
5 * Copyright (C) 1998-2000 Andrzej Krzysztofowicz, Maintainer 5 * Copyright (C) 1998-2000 Andrzej Krzysztofowicz, Maintainer
@@ -10,6 +10,7 @@
10 * Copyright (C) 2002 Alan Cox <alan@redhat.com> 10 * Copyright (C) 2002 Alan Cox <alan@redhat.com>
11 * ALi (now ULi M5228) support by Clear Zhang <Clear.Zhang@ali.com.tw> 11 * ALi (now ULi M5228) support by Clear Zhang <Clear.Zhang@ali.com.tw>
12 * Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com> 12 * Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com>
13 * Copyright (C) 2007 Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
13 * 14 *
14 * (U)DMA capable version of ali 1533/1543(C), 1535(D) 15 * (U)DMA capable version of ali 1533/1543(C), 1535(D)
15 * 16 *
@@ -36,6 +37,7 @@
36#include <linux/hdreg.h> 37#include <linux/hdreg.h>
37#include <linux/ide.h> 38#include <linux/ide.h>
38#include <linux/init.h> 39#include <linux/init.h>
40#include <linux/dmi.h>
39 41
40#include <asm/io.h> 42#include <asm/io.h>
41 43
@@ -583,6 +585,35 @@ out:
583 return 0; 585 return 0;
584} 586}
585 587
588/*
589 * Cable special cases
590 */
591
592static struct dmi_system_id cable_dmi_table[] = {
593 {
594 .ident = "HP Pavilion N5430",
595 .matches = {
596 DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
597 DMI_MATCH(DMI_BOARD_NAME, "OmniBook N32N-736"),
598 },
599 },
600 { }
601};
602
603static int ali_cable_override(struct pci_dev *pdev)
604{
605 /* Fujitsu P2000 */
606 if (pdev->subsystem_vendor == 0x10CF &&
607 pdev->subsystem_device == 0x10AF)
608 return 1;
609
610 /* Systems by DMI */
611 if (dmi_check_system(cable_dmi_table))
612 return 1;
613
614 return 0;
615}
616
586/** 617/**
587 * ata66_ali15x3 - check for UDMA 66 support 618 * ata66_ali15x3 - check for UDMA 66 support
588 * @hwif: IDE interface 619 * @hwif: IDE interface
@@ -594,37 +625,31 @@ out:
594 * FIXME: frobs bits that are not defined on newer ALi devices 625 * FIXME: frobs bits that are not defined on newer ALi devices
595 */ 626 */
596 627
597static unsigned int __devinit ata66_ali15x3 (ide_hwif_t *hwif) 628static u8 __devinit ata66_ali15x3(ide_hwif_t *hwif)
598{ 629{
599 struct pci_dev *dev = hwif->pci_dev; 630 struct pci_dev *dev = hwif->pci_dev;
600 unsigned int ata66 = 0;
601 u8 cable_80_pin[2] = { 0, 0 };
602
603 unsigned long flags; 631 unsigned long flags;
604 u8 tmpbyte; 632 u8 cbl = ATA_CBL_PATA40, tmpbyte;
605 633
606 local_irq_save(flags); 634 local_irq_save(flags);
607 635
608 if (m5229_revision >= 0xC2) { 636 if (m5229_revision >= 0xC2) {
609 /* 637 /*
610 * Ultra66 cable detection (from Host View) 638 * m5229 80-pin cable detection (from Host View)
611 * m5229, 0x4a, bit0: primary, bit1: secondary 80 pin 639 *
612 */ 640 * 0x4a bit0 is 0 => primary channel has 80-pin
613 pci_read_config_byte(dev, 0x4a, &tmpbyte); 641 * 0x4a bit1 is 0 => secondary channel has 80-pin
614 /* 642 *
615 * 0x4a, bit0 is 0 => primary channel 643 * Certain laptops use short but suitable cables
616 * has 80-pin (from host view) 644 * and don't implement the detect logic.
617 */
618 if (!(tmpbyte & 0x01)) cable_80_pin[0] = 1;
619 /*
620 * 0x4a, bit1 is 0 => secondary channel
621 * has 80-pin (from host view)
622 */
623 if (!(tmpbyte & 0x02)) cable_80_pin[1] = 1;
624 /*
625 * Allow ata66 if cable of current channel has 80 pins
626 */ 645 */
627 ata66 = (hwif->channel)?cable_80_pin[1]:cable_80_pin[0]; 646 if (ali_cable_override(dev))
647 cbl = ATA_CBL_PATA40_SHORT;
648 else {
649 pci_read_config_byte(dev, 0x4a, &tmpbyte);
650 if ((tmpbyte & (1 << hwif->channel)) == 0)
651 cbl = ATA_CBL_PATA80;
652 }
628 } else { 653 } else {
629 /* 654 /*
630 * check m1533, 0x5e, bit 1~4 == 1001 => & 00011110 = 00010010 655 * check m1533, 0x5e, bit 1~4 == 1001 => & 00011110 = 00010010
@@ -657,7 +682,7 @@ static unsigned int __devinit ata66_ali15x3 (ide_hwif_t *hwif)
657 682
658 local_irq_restore(flags); 683 local_irq_restore(flags);
659 684
660 return(ata66); 685 return cbl;
661} 686}
662 687
663/** 688/**
@@ -708,8 +733,9 @@ static void __devinit init_hwif_common_ali15x3 (ide_hwif_t *hwif)
708 hwif->dma_setup = &ali15x3_dma_setup; 733 hwif->dma_setup = &ali15x3_dma_setup;
709 if (!noautodma) 734 if (!noautodma)
710 hwif->autodma = 1; 735 hwif->autodma = 1;
711 if (!(hwif->udma_four)) 736
712 hwif->udma_four = ata66_ali15x3(hwif); 737 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
738 hwif->cbl = ata66_ali15x3(hwif);
713 } 739 }
714 hwif->drives[0].autodma = hwif->autodma; 740 hwif->drives[0].autodma = hwif->autodma;
715 hwif->drives[1].autodma = hwif->autodma; 741 hwif->drives[1].autodma = hwif->autodma;
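ali_cable_override() above combines a PCI subsystem-ID quirk (the Fujitsu P2000) with a DMI board match; dmi_check_system() does the string matching inside the kernel. A user-space approximation of the same lookup, where the blacklist entry comes from the hunk and everything else is stubbed:

#include <stdio.h>
#include <string.h>

struct board_id { const char *vendor; const char *name; };

static const struct board_id short_cable_boards[] = {
        { "Hewlett-Packard", "OmniBook N32N-736" },     /* HP Pavilion N5430 */
};

static int cable_override(const struct board_id *board,
                          unsigned short subsys_vendor,
                          unsigned short subsys_device)
{
        size_t i;

        /* Fujitsu P2000: matched on PCI subsystem IDs rather than DMI. */
        if (subsys_vendor == 0x10CF && subsys_device == 0x10AF)
                return 1;

        for (i = 0; i < sizeof(short_cable_boards) / sizeof(short_cable_boards[0]); i++)
                if (!strcmp(board->vendor, short_cable_boards[i].vendor) &&
                    !strcmp(board->name, short_cable_boards[i].name))
                        return 1;

        return 0;
}

int main(void)
{
        struct board_id b = { "Hewlett-Packard", "OmniBook N32N-736" };

        printf("override: %d\n", cable_override(&b, 0, 0));
        return 0;
}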
diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index a2be65fcf89c..84ed30cdb324 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -1,10 +1,11 @@
1/* 1/*
2 * Version 2.16 2 * Version 2.20
3 * 3 *
4 * AMD 755/756/766/8111 and nVidia nForce/2/2s/3/3s/CK804/MCP04 4 * AMD 755/756/766/8111 and nVidia nForce/2/2s/3/3s/CK804/MCP04
5 * IDE driver for Linux. 5 * IDE driver for Linux.
6 * 6 *
7 * Copyright (c) 2000-2002 Vojtech Pavlik 7 * Copyright (c) 2000-2002 Vojtech Pavlik
8 * Copyright (c) 2007 Bartlomiej Zolnierkiewicz
8 * 9 *
9 * Based on the work of: 10 * Based on the work of:
10 * Andre Hedrick 11 * Andre Hedrick
@@ -37,11 +38,6 @@
37#define AMD_ADDRESS_SETUP (0x0c + amd_config->base) 38#define AMD_ADDRESS_SETUP (0x0c + amd_config->base)
38#define AMD_UDMA_TIMING (0x10 + amd_config->base) 39#define AMD_UDMA_TIMING (0x10 + amd_config->base)
39 40
40#define AMD_UDMA 0x07
41#define AMD_UDMA_33 0x01
42#define AMD_UDMA_66 0x02
43#define AMD_UDMA_100 0x03
44#define AMD_UDMA_133 0x04
45#define AMD_CHECK_SWDMA 0x08 41#define AMD_CHECK_SWDMA 0x08
46#define AMD_BAD_SWDMA 0x10 42#define AMD_BAD_SWDMA 0x10
47#define AMD_BAD_FIFO 0x20 43#define AMD_BAD_FIFO 0x20
@@ -53,32 +49,33 @@
53 49
54static struct amd_ide_chip { 50static struct amd_ide_chip {
55 unsigned short id; 51 unsigned short id;
56 unsigned long base; 52 u8 base;
57 unsigned char flags; 53 u8 udma_mask;
54 u8 flags;
58} amd_ide_chips[] = { 55} amd_ide_chips[] = {
59 { PCI_DEVICE_ID_AMD_COBRA_7401, 0x40, AMD_UDMA_33 | AMD_BAD_SWDMA }, 56 { PCI_DEVICE_ID_AMD_COBRA_7401, 0x40, ATA_UDMA2, AMD_BAD_SWDMA },
60 { PCI_DEVICE_ID_AMD_VIPER_7409, 0x40, AMD_UDMA_66 | AMD_CHECK_SWDMA }, 57 { PCI_DEVICE_ID_AMD_VIPER_7409, 0x40, ATA_UDMA4, AMD_CHECK_SWDMA },
61 { PCI_DEVICE_ID_AMD_VIPER_7411, 0x40, AMD_UDMA_100 | AMD_BAD_FIFO }, 58 { PCI_DEVICE_ID_AMD_VIPER_7411, 0x40, ATA_UDMA5, AMD_BAD_FIFO },
62 { PCI_DEVICE_ID_AMD_OPUS_7441, 0x40, AMD_UDMA_100 }, 59 { PCI_DEVICE_ID_AMD_OPUS_7441, 0x40, ATA_UDMA5, },
63 { PCI_DEVICE_ID_AMD_8111_IDE, 0x40, AMD_UDMA_133 | AMD_CHECK_SERENADE }, 60 { PCI_DEVICE_ID_AMD_8111_IDE, 0x40, ATA_UDMA6, AMD_CHECK_SERENADE },
64 { PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, 0x50, AMD_UDMA_100 }, 61 { PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, 0x50, ATA_UDMA5, },
65 { PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE, 0x50, AMD_UDMA_133 }, 62 { PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE, 0x50, ATA_UDMA6, },
66 { PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE, 0x50, AMD_UDMA_133 }, 63 { PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE, 0x50, ATA_UDMA6, },
67 { PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA, 0x50, AMD_UDMA_133 }, 64 { PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA, 0x50, ATA_UDMA6, },
68 { PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE, 0x50, AMD_UDMA_133 }, 65 { PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE, 0x50, ATA_UDMA6, },
69 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE, 0x50, AMD_UDMA_133 }, 66 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE, 0x50, ATA_UDMA6, },
70 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA, 0x50, AMD_UDMA_133 }, 67 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA, 0x50, ATA_UDMA6, },
71 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2, 0x50, AMD_UDMA_133 }, 68 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2, 0x50, ATA_UDMA6, },
72 { PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE, 0x50, AMD_UDMA_133 }, 69 { PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE, 0x50, ATA_UDMA6, },
73 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE, 0x50, AMD_UDMA_133 }, 70 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE, 0x50, ATA_UDMA6, },
74 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE, 0x50, AMD_UDMA_133 }, 71 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE, 0x50, ATA_UDMA6, },
75 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, AMD_UDMA_133 }, 72 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, ATA_UDMA6, },
76 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, AMD_UDMA_133 }, 73 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, ATA_UDMA6, },
77 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, AMD_UDMA_133 }, 74 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, ATA_UDMA6, },
78 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, AMD_UDMA_133 }, 75 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, ATA_UDMA6, },
79 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE, 0x50, AMD_UDMA_133 }, 76 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE, 0x50, ATA_UDMA6, },
80 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE, 0x50, AMD_UDMA_133 }, 77 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE, 0x50, ATA_UDMA6, },
81 { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, AMD_UDMA_100 }, 78 { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, ATA_UDMA5, },
82 { 0 } 79 { 0 }
83}; 80};
84 81
@@ -87,7 +84,7 @@ static ide_pci_device_t *amd_chipset;
87static unsigned int amd_80w; 84static unsigned int amd_80w;
88static unsigned int amd_clock; 85static unsigned int amd_clock;
89 86
90static char *amd_dma[] = { "MWDMA16", "UDMA33", "UDMA66", "UDMA100", "UDMA133" }; 87static char *amd_dma[] = { "16", "25", "33", "44", "66", "100", "133" };
91static unsigned char amd_cyc2udma[] = { 6, 6, 5, 4, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 7 }; 88static unsigned char amd_cyc2udma[] = { 6, 6, 5, 4, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 7 };
92 89
93/* 90/*
@@ -128,7 +125,7 @@ static int amd74xx_get_info(char *buffer, char **addr, off_t offset, int count)
128 125
129 pci_read_config_byte(dev, PCI_REVISION_ID, &t); 126 pci_read_config_byte(dev, PCI_REVISION_ID, &t);
130 amd_print("Revision: IDE %#x", t); 127 amd_print("Revision: IDE %#x", t);
131 amd_print("Highest DMA rate: %s", amd_dma[amd_config->flags & AMD_UDMA]); 128 amd_print("Highest DMA rate: UDMA%s", amd_dma[fls(amd_config->udma_mask) - 1]);
132 129
133 amd_print("BM-DMA base: %#lx", amd_base); 130 amd_print("BM-DMA base: %#lx", amd_base);
134 amd_print("PCI clock: %d.%dMHz", amd_clock / 1000, amd_clock / 100 % 10); 131 amd_print("PCI clock: %d.%dMHz", amd_clock / 1000, amd_clock / 100 % 10);
@@ -221,12 +218,12 @@ static void amd_set_speed(struct pci_dev *dev, unsigned char dn, struct ide_timi
221 pci_write_config_byte(dev, AMD_DRIVE_TIMING + (3 - dn), 218 pci_write_config_byte(dev, AMD_DRIVE_TIMING + (3 - dn),
222 ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1)); 219 ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1));
223 220
224 switch (amd_config->flags & AMD_UDMA) { 221 switch (amd_config->udma_mask) {
225 case AMD_UDMA_33: t = timing->udma ? (0xc0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; 222 case ATA_UDMA2: t = timing->udma ? (0xc0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break;
226 case AMD_UDMA_66: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 2, 10)]) : 0x03; break; 223 case ATA_UDMA4: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 2, 10)]) : 0x03; break;
227 case AMD_UDMA_100: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 10)]) : 0x03; break; 224 case ATA_UDMA5: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 10)]) : 0x03; break;
228 case AMD_UDMA_133: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 15)]) : 0x03; break; 225 case ATA_UDMA6: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 15)]) : 0x03; break;
229 default: return; 226 default: return;
230 } 227 }
231 228
232 pci_write_config_byte(dev, AMD_UDMA_TIMING + (3 - dn), t); 229 pci_write_config_byte(dev, AMD_UDMA_TIMING + (3 - dn), t);
@@ -248,7 +245,7 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed)
248 ide_config_drive_speed(drive, speed); 245 ide_config_drive_speed(drive, speed);
249 246
250 T = 1000000000 / amd_clock; 247 T = 1000000000 / amd_clock;
251 UT = T / min_t(int, max_t(int, amd_config->flags & AMD_UDMA, 1), 2); 248 UT = (amd_config->udma_mask == ATA_UDMA2) ? T : (T / 2);
252 249
253 ide_timing_compute(drive, speed, &t, T, UT); 250 ide_timing_compute(drive, speed, &t, T, UT);
254 251
@@ -277,29 +274,19 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed)
277static void amd74xx_tune_drive(ide_drive_t *drive, u8 pio) 274static void amd74xx_tune_drive(ide_drive_t *drive, u8 pio)
278{ 275{
279 if (pio == 255) { 276 if (pio == 255) {
280 amd_set_drive(drive, ide_find_best_mode(drive, XFER_PIO | XFER_EPIO)); 277 amd_set_drive(drive, ide_find_best_pio_mode(drive));
281 return; 278 return;
282 } 279 }
283 280
284 amd_set_drive(drive, XFER_PIO_0 + min_t(byte, pio, 5)); 281 amd_set_drive(drive, XFER_PIO_0 + min_t(byte, pio, 5));
285} 282}
286 283
287/*
288 * amd74xx_dmaproc() is a callback from upper layers that can do
289 * a lot, but we use it for DMA/PIO tuning only, delegating everything
290 * else to the default ide_dmaproc().
291 */
292
293static int amd74xx_ide_dma_check(ide_drive_t *drive) 284static int amd74xx_ide_dma_check(ide_drive_t *drive)
294{ 285{
295 int w80 = HWIF(drive)->udma_four; 286 u8 speed = ide_max_dma_mode(drive);
296 287
297 u8 speed = ide_find_best_mode(drive, 288 if (speed == 0)
298 XFER_PIO | XFER_EPIO | XFER_MWDMA | XFER_UDMA | 289 speed = ide_find_best_pio_mode(drive);
299 ((amd_config->flags & AMD_BAD_SWDMA) ? 0 : XFER_SWDMA) |
300 (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_66 ? XFER_UDMA_66 : 0) |
301 (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_100 ? XFER_UDMA_100 : 0) |
302 (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_133 ? XFER_UDMA_133 : 0));
303 290
304 amd_set_drive(drive, speed); 291 amd_set_drive(drive, speed);
305 292
@@ -334,10 +321,10 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
334 * Check 80-wire cable presence. 321 * Check 80-wire cable presence.
335 */ 322 */
336 323
337 switch (amd_config->flags & AMD_UDMA) { 324 switch (amd_config->udma_mask) {
338 325
339 case AMD_UDMA_133: 326 case ATA_UDMA6:
340 case AMD_UDMA_100: 327 case ATA_UDMA5:
341 pci_read_config_byte(dev, AMD_CABLE_DETECT, &t); 328 pci_read_config_byte(dev, AMD_CABLE_DETECT, &t);
342 pci_read_config_dword(dev, AMD_UDMA_TIMING, &u); 329 pci_read_config_dword(dev, AMD_UDMA_TIMING, &u);
343 amd_80w = ((t & 0x3) ? 1 : 0) | ((t & 0xc) ? 2 : 0); 330 amd_80w = ((t & 0x3) ? 1 : 0) | ((t & 0xc) ? 2 : 0);
@@ -349,7 +336,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
349 } 336 }
350 break; 337 break;
351 338
352 case AMD_UDMA_66: 339 case ATA_UDMA4:
353 /* no host side cable detection */ 340 /* no host side cable detection */
354 amd_80w = 0x03; 341 amd_80w = 0x03;
355 break; 342 break;
@@ -370,7 +357,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
370 if ((amd_config->flags & AMD_CHECK_SERENADE) && 357 if ((amd_config->flags & AMD_CHECK_SERENADE) &&
371 dev->subsystem_vendor == PCI_VENDOR_ID_AMD && 358 dev->subsystem_vendor == PCI_VENDOR_ID_AMD &&
372 dev->subsystem_device == PCI_DEVICE_ID_AMD_SERENADE) 359 dev->subsystem_device == PCI_DEVICE_ID_AMD_SERENADE)
373 amd_config->flags = AMD_UDMA_100; 360 amd_config->udma_mask = ATA_UDMA5;
374 361
375/* 362/*
376 * Determine the system bus clock. 363 * Determine the system bus clock.
@@ -395,8 +382,9 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
395 */ 382 */
396 383
397 pci_read_config_byte(dev, PCI_REVISION_ID, &t); 384 pci_read_config_byte(dev, PCI_REVISION_ID, &t);
398 printk(KERN_INFO "%s: %s (rev %02x) %s controller\n", 385 printk(KERN_INFO "%s: %s (rev %02x) UDMA%s controller\n",
399 amd_chipset->name, pci_name(dev), t, amd_dma[amd_config->flags & AMD_UDMA]); 386 amd_chipset->name, pci_name(dev), t,
387 amd_dma[fls(amd_config->udma_mask) - 1]);
400 388
401/* 389/*
402 * Register /proc/ide/amd74xx entry 390 * Register /proc/ide/amd74xx entry
@@ -437,12 +425,19 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif)
437 return; 425 return;
438 426
439 hwif->atapi_dma = 1; 427 hwif->atapi_dma = 1;
440 hwif->ultra_mask = 0x7f;
441 hwif->mwdma_mask = 0x07;
442 hwif->swdma_mask = 0x07;
443 428
444 if (!hwif->udma_four) 429 hwif->ultra_mask = amd_config->udma_mask;
445 hwif->udma_four = (amd_80w >> hwif->channel) & 1; 430 hwif->mwdma_mask = 0x07;
431 if ((amd_config->flags & AMD_BAD_SWDMA) == 0)
432 hwif->swdma_mask = 0x07;
433
434 if (hwif->cbl != ATA_CBL_PATA40_SHORT) {
435 if ((amd_80w >> hwif->channel) & 1)
436 hwif->cbl = ATA_CBL_PATA80;
437 else
438 hwif->cbl = ATA_CBL_PATA40;
439 }
440
446 hwif->ide_dma_check = &amd74xx_ide_dma_check; 441 hwif->ide_dma_check = &amd74xx_ide_dma_check;
447 if (!noautodma) 442 if (!noautodma)
448 hwif->autodma = 1; 443 hwif->autodma = 1;
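With the per-chip udma_mask in place, the highest supported mode is simply fls(mask) - 1, which is what now indexes the reworked amd_dma[] rate table. A quick stand-alone check of that arithmetic; fls_compat() stands in for the kernel's fls().

#include <stdio.h>

/* Position of the highest set bit, 1-based; 0 for an empty mask. */
static int fls_compat(unsigned int x)
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        unsigned int udma_mask = 0x3f;  /* ATA_UDMA5-style mask: modes 0..5 */
        static const char *rate[] = { "16", "25", "33", "44", "66", "100", "133" };

        /* 0x3f -> fls() == 6 -> index 5 -> "100", i.e. UDMA/100. */
        printf("Highest DMA rate: UDMA%s\n", rate[fls_compat(udma_mask) - 1]);
        return 0;
}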
diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index 8ab33faf6f76..2761510309b3 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -264,10 +264,11 @@ static void __devinit init_hwif_atiixp(ide_hwif_t *hwif)
264 hwif->swdma_mask = 0x04; 264 hwif->swdma_mask = 0x04;
265 265
266 pci_read_config_byte(pdev, ATIIXP_IDE_UDMA_MODE + ch, &udma_mode); 266 pci_read_config_byte(pdev, ATIIXP_IDE_UDMA_MODE + ch, &udma_mode);
267
267 if ((udma_mode & 0x07) >= 0x04 || (udma_mode & 0x70) >= 0x40) 268 if ((udma_mode & 0x07) >= 0x04 || (udma_mode & 0x70) >= 0x40)
268 hwif->udma_four = 1; 269 hwif->cbl = ATA_CBL_PATA80;
269 else 270 else
270 hwif->udma_four = 0; 271 hwif->cbl = ATA_CBL_PATA40;
271 272
272 hwif->dma_host_on = &atiixp_dma_host_on; 273 hwif->dma_host_on = &atiixp_dma_host_on;
273 hwif->dma_host_off = &atiixp_dma_host_off; 274 hwif->dma_host_off = &atiixp_dma_host_off;
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 7c57dc696f52..8631b6c8aa15 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/cmd64x.c Version 1.47 Mar 19, 2007 2 * linux/drivers/ide/pci/cmd64x.c Version 1.50 May 10, 2007
3 * 3 *
4 * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines. 4 * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines.
5 * Due to massive hardware bugs, UltraDMA is only supported 5 * Due to massive hardware bugs, UltraDMA is only supported
@@ -52,9 +52,6 @@
52#define ARTTIM23_DIS_RA2 0x04 52#define ARTTIM23_DIS_RA2 0x04
53#define ARTTIM23_DIS_RA3 0x08 53#define ARTTIM23_DIS_RA3 0x08
54#define ARTTIM23_INTR_CH1 0x10 54#define ARTTIM23_INTR_CH1 0x10
55#define ARTTIM2 0x57
56#define ARTTIM3 0x57
57#define DRWTIM23 0x58
58#define DRWTIM2 0x58 55#define DRWTIM2 0x58
59#define BRST 0x59 56#define BRST 0x59
60#define DRWTIM3 0x5b 57#define DRWTIM3 0x5b
@@ -469,71 +466,43 @@ static int cmd646_1_ide_dma_end (ide_drive_t *drive)
469 466
470static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const char *name) 467static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const char *name)
471{ 468{
472 u32 class_rev = 0;
473 u8 mrdmode = 0; 469 u8 mrdmode = 0;
474 470
475 pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); 471 if (dev->device == PCI_DEVICE_ID_CMD_646) {
476 class_rev &= 0xff; 472 u8 rev = 0;
477 473
478 switch(dev->device) { 474 pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
479 case PCI_DEVICE_ID_CMD_643: 475
480 break; 476 switch (rev) {
481 case PCI_DEVICE_ID_CMD_646: 477 case 0x07:
482 printk(KERN_INFO "%s: chipset revision 0x%02X, ", name, class_rev); 478 case 0x05:
483 switch(class_rev) { 479 printk("%s: UltraDMA capable", name);
484 case 0x07:
485 case 0x05:
486 printk("UltraDMA Capable");
487 break;
488 case 0x03:
489 printk("MultiWord DMA Force Limited");
490 break;
491 case 0x01:
492 default:
493 printk("MultiWord DMA Limited, IRQ workaround enabled");
494 break;
495 }
496 printk("\n");
497 break;
498 case PCI_DEVICE_ID_CMD_648:
499 case PCI_DEVICE_ID_CMD_649:
500 break; 480 break;
481 case 0x03:
501 default: 482 default:
483 printk("%s: MultiWord DMA force limited", name);
484 break;
485 case 0x01:
486 printk("%s: MultiWord DMA limited, "
487 "IRQ workaround enabled\n", name);
502 break; 488 break;
489 }
503 } 490 }
504 491
505 /* Set a good latency timer and cache line size value. */ 492 /* Set a good latency timer and cache line size value. */
506 (void) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 64); 493 (void) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 64);
507 /* FIXME: pci_set_master() to ensure a good latency timer value */ 494 /* FIXME: pci_set_master() to ensure a good latency timer value */
508 495
509 /* Setup interrupts. */ 496 /*
510 (void) pci_read_config_byte(dev, MRDMODE, &mrdmode); 497 * Enable interrupts, select MEMORY READ LINE for reads.
511 mrdmode &= ~(0x30); 498 *
512 (void) pci_write_config_byte(dev, MRDMODE, mrdmode); 499 * NOTE: although not mentioned in the PCI0646U specs,
513 500 * bits 0-1 are write only and won't be read back as
514 /* Use MEMORY READ LINE for reads. 501 * set or not -- PCI0646U2 specs clarify this point.
515 * NOTE: Although not mentioned in the PCI0646U specs,
516 * these bits are write only and won't be read
517 * back as set or not. The PCI0646U2 specs clarify
518 * this point.
519 */ 502 */
520 (void) pci_write_config_byte(dev, MRDMODE, mrdmode | 0x02); 503 (void) pci_read_config_byte (dev, MRDMODE, &mrdmode);
521 504 mrdmode &= ~0x30;
522 /* Set reasonable active/recovery/address-setup values. */ 505 (void) pci_write_config_byte(dev, MRDMODE, (mrdmode | 0x02));
523 (void) pci_write_config_byte(dev, ARTTIM0, 0x40);
524 (void) pci_write_config_byte(dev, DRWTIM0, 0x3f);
525 (void) pci_write_config_byte(dev, ARTTIM1, 0x40);
526 (void) pci_write_config_byte(dev, DRWTIM1, 0x3f);
527#ifdef __i386__
528 (void) pci_write_config_byte(dev, ARTTIM23, 0x1c);
529#else
530 (void) pci_write_config_byte(dev, ARTTIM23, 0x5c);
531#endif
532 (void) pci_write_config_byte(dev, DRWTIM23, 0x3f);
533 (void) pci_write_config_byte(dev, DRWTIM3, 0x3f);
534#ifdef CONFIG_PPC
535 (void) pci_write_config_byte(dev, UDIDETCR0, 0xf0);
536#endif /* CONFIG_PPC */
537 506
538#if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS) 507#if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
539 508
@@ -548,29 +517,27 @@ static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const cha
548 return 0; 517 return 0;
549} 518}
550 519
551static unsigned int __devinit ata66_cmd64x(ide_hwif_t *hwif) 520static u8 __devinit ata66_cmd64x(ide_hwif_t *hwif)
552{ 521{
553 u8 ata66 = 0, mask = (hwif->channel) ? 0x02 : 0x01; 522 struct pci_dev *dev = hwif->pci_dev;
523 u8 bmidecsr = 0, mask = hwif->channel ? 0x02 : 0x01;
554 524
555 switch(hwif->pci_dev->device) { 525 switch (dev->device) {
556 case PCI_DEVICE_ID_CMD_643: 526 case PCI_DEVICE_ID_CMD_648:
557 case PCI_DEVICE_ID_CMD_646: 527 case PCI_DEVICE_ID_CMD_649:
558 return ata66; 528 pci_read_config_byte(dev, BMIDECSR, &bmidecsr);
559 default: 529 return (bmidecsr & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
560 break; 530 default:
531 return ATA_CBL_PATA40;
561 } 532 }
562 pci_read_config_byte(hwif->pci_dev, BMIDECSR, &ata66);
563 return (ata66 & mask) ? 1 : 0;
564} 533}
565 534
566static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) 535static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
567{ 536{
568 struct pci_dev *dev = hwif->pci_dev; 537 struct pci_dev *dev = hwif->pci_dev;
569 unsigned int class_rev; 538 u8 rev = 0;
570 539
571 hwif->autodma = 0; 540 pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
572 pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev);
573 class_rev &= 0xff;
574 541
575 hwif->tuneproc = &cmd64x_tune_drive; 542 hwif->tuneproc = &cmd64x_tune_drive;
576 hwif->speedproc = &cmd64x_tune_chipset; 543 hwif->speedproc = &cmd64x_tune_chipset;
@@ -580,8 +547,8 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
580 if (!hwif->dma_base) 547 if (!hwif->dma_base)
581 return; 548 return;
582 549
583 hwif->atapi_dma = 1; 550 hwif->atapi_dma = 1;
584 551 hwif->mwdma_mask = 0x07;
585 hwif->ultra_mask = hwif->cds->udma_mask; 552 hwif->ultra_mask = hwif->cds->udma_mask;
586 553
587 /* 554 /*
@@ -596,16 +563,15 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
596 * 563 *
597 * So we only do UltraDMA on revision 0x05 and 0x07 chipsets. 564 * So we only do UltraDMA on revision 0x05 and 0x07 chipsets.
598 */ 565 */
599 if (dev->device == PCI_DEVICE_ID_CMD_646 && class_rev < 5) 566 if (dev->device == PCI_DEVICE_ID_CMD_646 && rev < 5)
600 hwif->ultra_mask = 0x00; 567 hwif->ultra_mask = 0x00;
601 568
602 hwif->mwdma_mask = 0x07;
603
604 hwif->ide_dma_check = &cmd64x_config_drive_for_dma; 569 hwif->ide_dma_check = &cmd64x_config_drive_for_dma;
605 if (!(hwif->udma_four))
606 hwif->udma_four = ata66_cmd64x(hwif);
607 570
608 switch(dev->device) { 571 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
572 hwif->cbl = ata66_cmd64x(hwif);
573
574 switch (dev->device) {
609 case PCI_DEVICE_ID_CMD_648: 575 case PCI_DEVICE_ID_CMD_648:
610 case PCI_DEVICE_ID_CMD_649: 576 case PCI_DEVICE_ID_CMD_649:
611 alt_irq_bits: 577 alt_irq_bits:
@@ -614,10 +580,10 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
614 break; 580 break;
615 case PCI_DEVICE_ID_CMD_646: 581 case PCI_DEVICE_ID_CMD_646:
616 hwif->chipset = ide_cmd646; 582 hwif->chipset = ide_cmd646;
617 if (class_rev == 0x01) { 583 if (rev == 0x01) {
618 hwif->ide_dma_end = &cmd646_1_ide_dma_end; 584 hwif->ide_dma_end = &cmd646_1_ide_dma_end;
619 break; 585 break;
620 } else if (class_rev >= 0x03) 586 } else if (rev >= 0x03)
621 goto alt_irq_bits; 587 goto alt_irq_bits;
622 /* fall thru */ 588 /* fall thru */
623 default: 589 default:
@@ -626,11 +592,9 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
626 break; 592 break;
627 } 593 }
628 594
629
630 if (!noautodma) 595 if (!noautodma)
631 hwif->autodma = 1; 596 hwif->autodma = 1;
632 hwif->drives[0].autodma = hwif->autodma; 597 hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma;
633 hwif->drives[1].autodma = hwif->autodma;
634} 598}
635 599
636static int __devinit init_setup_cmd64x(struct pci_dev *dev, ide_pci_device_t *d) 600static int __devinit init_setup_cmd64x(struct pci_dev *dev, ide_pci_device_t *d)
diff --git a/drivers/ide/pci/cs5535.c b/drivers/ide/pci/cs5535.c
index 41925c47ef05..10f61f38243c 100644
--- a/drivers/ide/pci/cs5535.c
+++ b/drivers/ide/pci/cs5535.c
@@ -187,7 +187,8 @@ static u8 __devinit cs5535_cable_detect(struct pci_dev *dev)
187 187
188 /* if a 80 wire cable was detected */ 188 /* if a 80 wire cable was detected */
189 pci_read_config_byte(dev, CS5535_CABLE_DETECT, &bit); 189 pci_read_config_byte(dev, CS5535_CABLE_DETECT, &bit);
190 return (bit & 1); 190
191 return (bit & 1) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
191} 192}
192 193
193/**** 194/****
@@ -212,8 +213,7 @@ static void __devinit init_hwif_cs5535(ide_hwif_t *hwif)
212 hwif->ultra_mask = 0x1F; 213 hwif->ultra_mask = 0x1F;
213 hwif->mwdma_mask = 0x07; 214 hwif->mwdma_mask = 0x07;
214 215
215 216 hwif->cbl = cs5535_cable_detect(hwif->pci_dev);
216 hwif->udma_four = cs5535_cable_detect(hwif->pci_dev);
217 217
218 if (!noautodma) 218 if (!noautodma)
219 hwif->autodma = 1; 219 hwif->autodma = 1;
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index c33d0b0f11c9..4b6bae8eee82 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/hpt366.c Version 1.06 Jun 27, 2007 2 * linux/drivers/ide/pci/hpt366.c Version 1.10 Jun 29, 2007
3 * 3 *
4 * Copyright (C) 1999-2003 Andre Hedrick <andre@linux-ide.org> 4 * Copyright (C) 1999-2003 Andre Hedrick <andre@linux-ide.org>
5 * Portions Copyright (C) 2001 Sun Microsystems, Inc. 5 * Portions Copyright (C) 2001 Sun Microsystems, Inc.
@@ -77,7 +77,7 @@
77 * since they may tamper with its fields 77 * since they may tamper with its fields
78 * - prefix the driver startup messages with the real chip name 78 * - prefix the driver startup messages with the real chip name
79 * - claim the extra 240 bytes of I/O space for all chips 79 * - claim the extra 240 bytes of I/O space for all chips
80 * - optimize the rate masking/filtering and the drive list lookup code 80 * - optimize the UltraDMA filtering and the drive list lookup code
81 * - use pci_get_slot() to get to the function 1 of HPT36x/374 81 * - use pci_get_slot() to get to the function 1 of HPT36x/374
82 * - cache offset of the channel's misc. control registers (MCRs) being used 82 * - cache offset of the channel's misc. control registers (MCRs) being used
83 * throughout the driver 83 * throughout the driver
@@ -99,9 +99,9 @@
99 * stop duplicating it for each channel by storing the pointer in the pci_dev 99 * stop duplicating it for each channel by storing the pointer in the pci_dev
100 * structure: first, at the init_setup stage, point it to a static "template" 100 * structure: first, at the init_setup stage, point it to a static "template"
101 * with only the chip type and its specific base DPLL frequency, the highest 101 * with only the chip type and its specific base DPLL frequency, the highest
102 * supported DMA mode, and the chip settings table pointer filled, then, at 102 * UltraDMA mode, and the chip settings table pointer filled, then, at the
103 * the init_chipset stage, allocate per-chip instance and fill it with the 103 * init_chipset stage, allocate per-chip instance and fill it with the rest
104 * rest of the necessary information 104 * of the necessary information
105 * - get rid of the constant thresholds in the HPT37x PCI clock detection code, 105 * - get rid of the constant thresholds in the HPT37x PCI clock detection code,
106 * switch to calculating PCI clock frequency based on the chip's base DPLL 106 * switch to calculating PCI clock frequency based on the chip's base DPLL
107 * frequency 107 * frequency
@@ -112,6 +112,7 @@
112 * also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips; 112 * also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips;
113 * unify HPT36x/37x timing setup code and the speedproc handlers by joining 113 * unify HPT36x/37x timing setup code and the speedproc handlers by joining
114 * the register setting lists into the table indexed by the clock selected 114 * the register setting lists into the table indexed by the clock selected
115 * - set the correct hwif->ultra_mask for each individual chip
115 * Sergei Shtylyov, <sshtylyov@ru.mvista.com> or <source@mvista.com> 116 * Sergei Shtylyov, <sshtylyov@ru.mvista.com> or <source@mvista.com>
116 */ 117 */
117 118
@@ -391,7 +392,7 @@ enum ata_clock {
391 392
392struct hpt_info { 393struct hpt_info {
393 u8 chip_type; /* Chip type */ 394 u8 chip_type; /* Chip type */
394 u8 max_mode; /* Speeds allowed */ 395 u8 max_ultra; /* Max. UltraDMA mode allowed */
395 u8 dpll_clk; /* DPLL clock in MHz */ 396 u8 dpll_clk; /* DPLL clock in MHz */
396 u8 pci_clk; /* PCI clock in MHz */ 397 u8 pci_clk; /* PCI clock in MHz */
397 u32 **settings; /* Chipset settings table */ 398 u32 **settings; /* Chipset settings table */
@@ -430,77 +431,77 @@ static u32 *hpt37x_settings[NUM_ATA_CLOCKS] = {
430 431
431static struct hpt_info hpt36x __devinitdata = { 432static struct hpt_info hpt36x __devinitdata = {
432 .chip_type = HPT36x, 433 .chip_type = HPT36x,
433 .max_mode = (HPT366_ALLOW_ATA66_4 || HPT366_ALLOW_ATA66_3) ? 2 : 1, 434 .max_ultra = HPT366_ALLOW_ATA66_3 ? (HPT366_ALLOW_ATA66_4 ? 4 : 3) : 2,
434 .dpll_clk = 0, /* no DPLL */ 435 .dpll_clk = 0, /* no DPLL */
435 .settings = hpt36x_settings 436 .settings = hpt36x_settings
436}; 437};
437 438
438static struct hpt_info hpt370 __devinitdata = { 439static struct hpt_info hpt370 __devinitdata = {
439 .chip_type = HPT370, 440 .chip_type = HPT370,
440 .max_mode = HPT370_ALLOW_ATA100_5 ? 3 : 2, 441 .max_ultra = HPT370_ALLOW_ATA100_5 ? 5 : 4,
441 .dpll_clk = 48, 442 .dpll_clk = 48,
442 .settings = hpt37x_settings 443 .settings = hpt37x_settings
443}; 444};
444 445
445static struct hpt_info hpt370a __devinitdata = { 446static struct hpt_info hpt370a __devinitdata = {
446 .chip_type = HPT370A, 447 .chip_type = HPT370A,
447 .max_mode = HPT370_ALLOW_ATA100_5 ? 3 : 2, 448 .max_ultra = HPT370_ALLOW_ATA100_5 ? 5 : 4,
448 .dpll_clk = 48, 449 .dpll_clk = 48,
449 .settings = hpt37x_settings 450 .settings = hpt37x_settings
450}; 451};
451 452
452static struct hpt_info hpt374 __devinitdata = { 453static struct hpt_info hpt374 __devinitdata = {
453 .chip_type = HPT374, 454 .chip_type = HPT374,
454 .max_mode = 3, 455 .max_ultra = 5,
455 .dpll_clk = 48, 456 .dpll_clk = 48,
456 .settings = hpt37x_settings 457 .settings = hpt37x_settings
457}; 458};
458 459
459static struct hpt_info hpt372 __devinitdata = { 460static struct hpt_info hpt372 __devinitdata = {
460 .chip_type = HPT372, 461 .chip_type = HPT372,
461 .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, 462 .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5,
462 .dpll_clk = 55, 463 .dpll_clk = 55,
463 .settings = hpt37x_settings 464 .settings = hpt37x_settings
464}; 465};
465 466
466static struct hpt_info hpt372a __devinitdata = { 467static struct hpt_info hpt372a __devinitdata = {
467 .chip_type = HPT372A, 468 .chip_type = HPT372A,
468 .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, 469 .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5,
469 .dpll_clk = 66, 470 .dpll_clk = 66,
470 .settings = hpt37x_settings 471 .settings = hpt37x_settings
471}; 472};
472 473
473static struct hpt_info hpt302 __devinitdata = { 474static struct hpt_info hpt302 __devinitdata = {
474 .chip_type = HPT302, 475 .chip_type = HPT302,
475 .max_mode = HPT302_ALLOW_ATA133_6 ? 4 : 3, 476 .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5,
476 .dpll_clk = 66, 477 .dpll_clk = 66,
477 .settings = hpt37x_settings 478 .settings = hpt37x_settings
478}; 479};
479 480
480static struct hpt_info hpt371 __devinitdata = { 481static struct hpt_info hpt371 __devinitdata = {
481 .chip_type = HPT371, 482 .chip_type = HPT371,
482 .max_mode = HPT371_ALLOW_ATA133_6 ? 4 : 3, 483 .max_ultra = HPT371_ALLOW_ATA133_6 ? 6 : 5,
483 .dpll_clk = 66, 484 .dpll_clk = 66,
484 .settings = hpt37x_settings 485 .settings = hpt37x_settings
485}; 486};
486 487
487static struct hpt_info hpt372n __devinitdata = { 488static struct hpt_info hpt372n __devinitdata = {
488 .chip_type = HPT372N, 489 .chip_type = HPT372N,
489 .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, 490 .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5,
490 .dpll_clk = 77, 491 .dpll_clk = 77,
491 .settings = hpt37x_settings 492 .settings = hpt37x_settings
492}; 493};
493 494
494static struct hpt_info hpt302n __devinitdata = { 495static struct hpt_info hpt302n __devinitdata = {
495 .chip_type = HPT302N, 496 .chip_type = HPT302N,
496 .max_mode = HPT302_ALLOW_ATA133_6 ? 4 : 3, 497 .max_ultra = HPT302_ALLOW_ATA133_6 ? 6 : 5,
497 .dpll_clk = 77, 498 .dpll_clk = 77,
498 .settings = hpt37x_settings 499 .settings = hpt37x_settings
499}; 500};
500 501
501static struct hpt_info hpt371n __devinitdata = { 502static struct hpt_info hpt371n __devinitdata = {
502 .chip_type = HPT371N, 503 .chip_type = HPT371N,
503 .max_mode = HPT371_ALLOW_ATA133_6 ? 4 : 3, 504 .max_ultra = HPT371_ALLOW_ATA133_6 ? 6 : 5,
504 .dpll_clk = 77, 505 .dpll_clk = 77,
505 .settings = hpt37x_settings 506 .settings = hpt37x_settings
506}; 507};
@@ -523,53 +524,38 @@ static int check_in_drive_list(ide_drive_t *drive, const char **list)
523static u8 hpt3xx_udma_filter(ide_drive_t *drive) 524static u8 hpt3xx_udma_filter(ide_drive_t *drive)
524{ 525{
525 struct hpt_info *info = pci_get_drvdata(HWIF(drive)->pci_dev); 526 struct hpt_info *info = pci_get_drvdata(HWIF(drive)->pci_dev);
526 u8 chip_type = info->chip_type;
527 u8 mode = info->max_mode;
528 u8 mask; 527 u8 mask;
529 528
530 switch (mode) { 529 switch (info->chip_type) {
531 case 0x04: 530 case HPT370A:
532 mask = 0x7f; 531 if (!HPT370_ALLOW_ATA100_5 ||
533 break; 532 check_in_drive_list(drive, bad_ata100_5))
534 case 0x03: 533 return 0x1f;
534 else
535 return 0x3f;
536 case HPT370:
537 if (!HPT370_ALLOW_ATA100_5 ||
538 check_in_drive_list(drive, bad_ata100_5))
539 mask = 0x1f;
540 else
535 mask = 0x3f; 541 mask = 0x3f;
536 if (chip_type >= HPT374) 542 break;
537 break; 543 case HPT36x:
538 if (!check_in_drive_list(drive, bad_ata100_5)) 544 if (!HPT366_ALLOW_ATA66_4 ||
539 goto check_bad_ata33; 545 check_in_drive_list(drive, bad_ata66_4))
540 /* fall thru */ 546 mask = 0x0f;
541 case 0x02: 547 else
542 mask = 0x1f; 548 mask = 0x1f;
543 549
544 /* 550 if (!HPT366_ALLOW_ATA66_3 ||
545 * CHECK ME, Does this need to be changed to HPT374 ?? 551 check_in_drive_list(drive, bad_ata66_3))
546 */
547 if (chip_type >= HPT370)
548 goto check_bad_ata33;
549 if (HPT366_ALLOW_ATA66_4 &&
550 !check_in_drive_list(drive, bad_ata66_4))
551 goto check_bad_ata33;
552
553 mask = 0x0f;
554 if (HPT366_ALLOW_ATA66_3 &&
555 !check_in_drive_list(drive, bad_ata66_3))
556 goto check_bad_ata33;
557 /* fall thru */
558 case 0x01:
559 mask = 0x07; 552 mask = 0x07;
560 553 break;
561 check_bad_ata33: 554 default:
562 if (chip_type >= HPT370A) 555 return 0x7f;
563 break;
564 if (!check_in_drive_list(drive, bad_ata33))
565 break;
566 /* fall thru */
567 case 0x00:
568 default:
569 mask = 0x00;
570 break;
571 } 556 }
572 return mask; 557
558 return check_in_drive_list(drive, bad_ata33) ? 0x00 : mask;
573} 559}
574 560
575static u32 get_speed_setting(u8 speed, struct hpt_info *info) 561static u32 get_speed_setting(u8 speed, struct hpt_info *info)
@@ -737,7 +723,7 @@ static int hpt366_config_drive_xfer_rate(ide_drive_t *drive)
737 * This is specific to the HPT366 UDMA chipset 723 * This is specific to the HPT366 UDMA chipset
738 * by HighPoint|Triones Technologies, Inc. 724 * by HighPoint|Triones Technologies, Inc.
739 */ 725 */
740static int hpt366_ide_dma_lostirq(ide_drive_t *drive) 726static void hpt366_dma_lost_irq(ide_drive_t *drive)
741{ 727{
742 struct pci_dev *dev = HWIF(drive)->pci_dev; 728 struct pci_dev *dev = HWIF(drive)->pci_dev;
743 u8 mcr1 = 0, mcr3 = 0, scr1 = 0; 729 u8 mcr1 = 0, mcr3 = 0, scr1 = 0;
@@ -749,7 +735,7 @@ static int hpt366_ide_dma_lostirq(ide_drive_t *drive)
749 drive->name, __FUNCTION__, mcr1, mcr3, scr1); 735 drive->name, __FUNCTION__, mcr1, mcr3, scr1);
750 if (scr1 & 0x10) 736 if (scr1 & 0x10)
751 pci_write_config_byte(dev, 0x5a, scr1 & ~0x10); 737 pci_write_config_byte(dev, 0x5a, scr1 & ~0x10);
752 return __ide_dma_lostirq(drive); 738 ide_dma_lost_irq(drive);
753} 739}
754 740
755static void hpt370_clear_engine(ide_drive_t *drive) 741static void hpt370_clear_engine(ide_drive_t *drive)
@@ -799,10 +785,10 @@ static int hpt370_ide_dma_end(ide_drive_t *drive)
799 return __ide_dma_end(drive); 785 return __ide_dma_end(drive);
800} 786}
801 787
802static int hpt370_ide_dma_timeout(ide_drive_t *drive) 788static void hpt370_dma_timeout(ide_drive_t *drive)
803{ 789{
804 hpt370_irq_timeout(drive); 790 hpt370_irq_timeout(drive);
805 return __ide_dma_timeout(drive); 791 ide_dma_timeout(drive);
806} 792}
807 793
808/* returns 1 if DMA IRQ issued, 0 otherwise */ 794/* returns 1 if DMA IRQ issued, 0 otherwise */
@@ -1150,7 +1136,7 @@ static unsigned int __devinit init_chipset_hpt366(struct pci_dev *dev, const cha
1150 * Select 66 MHz DPLL clock only if UltraATA/133 mode is 1136 * Select 66 MHz DPLL clock only if UltraATA/133 mode is
1151 * supported/enabled, use 50 MHz DPLL clock otherwise... 1137 * supported/enabled, use 50 MHz DPLL clock otherwise...
1152 */ 1138 */
1153 if (info->max_mode == 0x04) { 1139 if (info->max_ultra == 6) {
1154 dpll_clk = 66; 1140 dpll_clk = 66;
1155 clock = ATA_CLOCK_66MHZ; 1141 clock = ATA_CLOCK_66MHZ;
1156 } else if (dpll_clk) { /* HPT36x chips don't have DPLL */ 1142 } else if (dpll_clk) { /* HPT36x chips don't have DPLL */
@@ -1243,7 +1229,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1243 struct pci_dev *dev = hwif->pci_dev; 1229 struct pci_dev *dev = hwif->pci_dev;
1244 struct hpt_info *info = pci_get_drvdata(dev); 1230 struct hpt_info *info = pci_get_drvdata(dev);
1245 int serialize = HPT_SERIALIZE_IO; 1231 int serialize = HPT_SERIALIZE_IO;
1246 u8 scr1 = 0, ata66 = (hwif->channel) ? 0x01 : 0x02; 1232 u8 scr1 = 0, ata66 = hwif->channel ? 0x01 : 0x02;
1247 u8 chip_type = info->chip_type; 1233 u8 chip_type = info->chip_type;
1248 u8 new_mcr, old_mcr = 0; 1234 u8 new_mcr, old_mcr = 0;
1249 1235
@@ -1256,7 +1242,9 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1256 hwif->intrproc = &hpt3xx_intrproc; 1242 hwif->intrproc = &hpt3xx_intrproc;
1257 hwif->maskproc = &hpt3xx_maskproc; 1243 hwif->maskproc = &hpt3xx_maskproc;
1258 hwif->busproc = &hpt3xx_busproc; 1244 hwif->busproc = &hpt3xx_busproc;
1259 hwif->udma_filter = &hpt3xx_udma_filter; 1245
1246 if (chip_type <= HPT370A)
1247 hwif->udma_filter = &hpt3xx_udma_filter;
1260 1248
1261 /* 1249 /*
1262 * HPT3xxN chips have some complications: 1250 * HPT3xxN chips have some complications:
@@ -1305,7 +1293,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1305 return; 1293 return;
1306 } 1294 }
1307 1295
1308 hwif->ultra_mask = 0x7f; 1296 hwif->ultra_mask = hwif->cds->udma_mask;
1309 hwif->mwdma_mask = 0x07; 1297 hwif->mwdma_mask = 0x07;
1310 1298
1311 /* 1299 /*
@@ -1342,8 +1330,8 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1342 } else 1330 } else
1343 pci_read_config_byte (dev, 0x5a, &scr1); 1331 pci_read_config_byte (dev, 0x5a, &scr1);
1344 1332
1345 if (!hwif->udma_four) 1333 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
1346 hwif->udma_four = (scr1 & ata66) ? 0 : 1; 1334 hwif->cbl = (scr1 & ata66) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
1347 1335
1348 hwif->ide_dma_check = &hpt366_config_drive_xfer_rate; 1336 hwif->ide_dma_check = &hpt366_config_drive_xfer_rate;
1349 1337
@@ -1353,9 +1341,9 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1353 } else if (chip_type >= HPT370) { 1341 } else if (chip_type >= HPT370) {
1354 hwif->dma_start = &hpt370_ide_dma_start; 1342 hwif->dma_start = &hpt370_ide_dma_start;
1355 hwif->ide_dma_end = &hpt370_ide_dma_end; 1343 hwif->ide_dma_end = &hpt370_ide_dma_end;
1356 hwif->ide_dma_timeout = &hpt370_ide_dma_timeout; 1344 hwif->dma_timeout = &hpt370_dma_timeout;
1357 } else 1345 } else
1358 hwif->ide_dma_lostirq = &hpt366_ide_dma_lostirq; 1346 hwif->dma_lost_irq = &hpt366_dma_lost_irq;
1359 1347
1360 if (!noautodma) 1348 if (!noautodma)
1361 hwif->autodma = 1; 1349 hwif->autodma = 1;
@@ -1503,9 +1491,35 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d)
1503 1491
1504 pci_read_config_byte(dev, PCI_REVISION_ID, &rev); 1492 pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
1505 1493
1506 if (rev > 6) 1494 switch (rev) {
1495 case 0:
1496 case 1:
1497 case 2:
1498 /*
1499 * HPT36x chips have one channel per function and have
1500 * both channel enable bits located differently and visible
1501 * to both functions -- really stupid design decision... :-(
1502 * Bit 4 is for the primary channel, bit 5 for the secondary.
1503 */
1504 d->channels = 1;
1505 d->enablebits[0].mask = d->enablebits[0].val = 0x10;
1506
1507 d->udma_mask = HPT366_ALLOW_ATA66_3 ?
1508 (HPT366_ALLOW_ATA66_4 ? 0x1f : 0x0f) : 0x07;
1509 break;
1510 case 3:
1511 case 4:
1512 d->udma_mask = HPT370_ALLOW_ATA100_5 ? 0x3f : 0x1f;
1513 break;
1514 default:
1507 rev = 6; 1515 rev = 6;
1508 1516 /* fall thru */
1517 case 5:
1518 case 6:
1519 d->udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f;
1520 break;
1521 }
1522
1509 d->name = chipset_names[rev]; 1523 d->name = chipset_names[rev];
1510 1524
1511 pci_set_drvdata(dev, info[rev]); 1525 pci_set_drvdata(dev, info[rev]);
@@ -1513,15 +1527,6 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d)
1513 if (rev > 2) 1527 if (rev > 2)
1514 goto init_single; 1528 goto init_single;
1515 1529
1516 /*
1517 * HPT36x chips have one channel per function and have
1518 * both channel enable bits located differently and visible
1519 * to both functions -- really stupid design decision... :-(
1520 * Bit 4 is for the primary channel, bit 5 for the secondary.
1521 */
1522 d->channels = 1;
1523 d->enablebits[0].mask = d->enablebits[0].val = 0x10;
1524
1525 if ((dev2 = pci_get_slot(dev->bus, dev->devfn + 1)) != NULL) { 1530 if ((dev2 = pci_get_slot(dev->bus, dev->devfn + 1)) != NULL) {
1526 u8 mcr1 = 0, pin1 = 0, pin2 = 0; 1531 u8 mcr1 = 0, pin1 = 0, pin2 = 0;
1527 int ret; 1532 int ret;
@@ -1573,6 +1578,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1573 .channels = 2, 1578 .channels = 2,
1574 .autodma = AUTODMA, 1579 .autodma = AUTODMA,
1575 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1580 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1581 .udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f,
1576 .bootable = OFF_BOARD, 1582 .bootable = OFF_BOARD,
1577 .extra = 240 1583 .extra = 240
1578 },{ /* 2 */ 1584 },{ /* 2 */
@@ -1584,6 +1590,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1584 .channels = 2, 1590 .channels = 2,
1585 .autodma = AUTODMA, 1591 .autodma = AUTODMA,
1586 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1592 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1593 .udma_mask = HPT302_ALLOW_ATA133_6 ? 0x7f : 0x3f,
1587 .bootable = OFF_BOARD, 1594 .bootable = OFF_BOARD,
1588 .extra = 240 1595 .extra = 240
1589 },{ /* 3 */ 1596 },{ /* 3 */
@@ -1595,6 +1602,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1595 .channels = 2, 1602 .channels = 2,
1596 .autodma = AUTODMA, 1603 .autodma = AUTODMA,
1597 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1604 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1605 .udma_mask = HPT371_ALLOW_ATA133_6 ? 0x7f : 0x3f,
1598 .bootable = OFF_BOARD, 1606 .bootable = OFF_BOARD,
1599 .extra = 240 1607 .extra = 240
1600 },{ /* 4 */ 1608 },{ /* 4 */
@@ -1606,6 +1614,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1606 .channels = 2, /* 4 */ 1614 .channels = 2, /* 4 */
1607 .autodma = AUTODMA, 1615 .autodma = AUTODMA,
1608 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1616 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1617 .udma_mask = 0x3f,
1609 .bootable = OFF_BOARD, 1618 .bootable = OFF_BOARD,
1610 .extra = 240 1619 .extra = 240
1611 },{ /* 5 */ 1620 },{ /* 5 */
@@ -1617,6 +1626,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1617 .channels = 2, /* 4 */ 1626 .channels = 2, /* 4 */
1618 .autodma = AUTODMA, 1627 .autodma = AUTODMA,
1619 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1628 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1629 .udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f,
1620 .bootable = OFF_BOARD, 1630 .bootable = OFF_BOARD,
1621 .extra = 240 1631 .extra = 240
1622 } 1632 }
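
The init_setup_hpt366() hunk above replaces the bare "rev > 6" clamp with a switch that derives both the channel layout and the allowed UDMA mask from the PCI revision ID, using the HPT366_ALLOW_* build-time options visible in the hunk. A minimal sketch of that mapping in isolation (the helper name is invented for illustration; the option macros and mask values are the ones shown in the diff):

/* Sketch: pick a UDMA mask from the HPT36x/37x PCI revision ID. */
static u8 hpt3xx_rev_to_udma_mask(u8 rev)
{
	switch (rev) {
	case 0:
	case 1:
	case 2:		/* HPT36x: UDMA66 at best, gated by build options */
		return HPT366_ALLOW_ATA66_3 ?
		       (HPT366_ALLOW_ATA66_4 ? 0x1f : 0x0f) : 0x07;
	case 3:
	case 4:		/* HPT370/370A: UDMA100 */
		return HPT370_ALLOW_ATA100_5 ? 0x3f : 0x1f;
	default:	/* HPT372 and newer: UDMA133 */
		return HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f;
	}
}

With the mask stored in ide_pci_device_t.udma_mask, init_hwif_hpt366() can copy hwif->cds->udma_mask into hwif->ultra_mask instead of hard-coding 0x7f, which is exactly what the earlier hunk in this file does.
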
diff --git a/drivers/ide/pci/it8213.c b/drivers/ide/pci/it8213.c
index c04a02687b95..ff48c23e571e 100644
--- a/drivers/ide/pci/it8213.c
+++ b/drivers/ide/pci/it8213.c
@@ -231,7 +231,7 @@ static int it8213_config_drive_for_dma (ide_drive_t *drive)
231 231
232static void __devinit init_hwif_it8213(ide_hwif_t *hwif) 232static void __devinit init_hwif_it8213(ide_hwif_t *hwif)
233{ 233{
234 u8 reg42h = 0, ata66 = 0; 234 u8 reg42h = 0;
235 235
236 hwif->speedproc = &it8213_tune_chipset; 236 hwif->speedproc = &it8213_tune_chipset;
237 hwif->tuneproc = &it8213_tuneproc; 237 hwif->tuneproc = &it8213_tuneproc;
@@ -250,11 +250,11 @@ static void __devinit init_hwif_it8213(ide_hwif_t *hwif)
250 hwif->swdma_mask = 0x04; 250 hwif->swdma_mask = 0x04;
251 251
252 pci_read_config_byte(hwif->pci_dev, 0x42, &reg42h); 252 pci_read_config_byte(hwif->pci_dev, 0x42, &reg42h);
253 ata66 = (reg42h & 0x02) ? 0 : 1;
254 253
255 hwif->ide_dma_check = &it8213_config_drive_for_dma; 254 hwif->ide_dma_check = &it8213_config_drive_for_dma;
256 if (!(hwif->udma_four)) 255
257 hwif->udma_four = ata66; 256 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
257 hwif->cbl = (reg42h & 0x02) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
258 258
259 /* 259 /*
260 * The BIOS often doesn't set up DMA on this controller 260 * The BIOS often doesn't set up DMA on this controller
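
This hunk is one instance of the pattern applied throughout the series: the boolean hwif->udma_four flag becomes hwif->cbl, which carries an ATA_CBL_* cable type, and host drivers only overwrite it when it has not already been forced to ATA_CBL_PATA40_SHORT (a short 40-pin hookup, typically a laptop drive wired directly to the controller, that is still treated as safe for the higher UDMA modes). A minimal sketch of the shape most of these drivers end up with; the "mychip" names, register 0x42 and bit 1 are placeholders rather than it8213 specifics:

/* Sketch: generic 2.6.22-era cable detection hook. */
static u8 __devinit mychip_cable_detect(ide_hwif_t *hwif)
{
	u8 reg = 0;

	/* placeholder register/bit for the chip's cable-detect strap */
	pci_read_config_byte(hwif->pci_dev, 0x42, &reg);

	return (reg & 0x02) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
}

static void __devinit init_hwif_mychip(ide_hwif_t *hwif)
{
	/* Respect a short-cable override set earlier (e.g. by a quirk). */
	if (hwif->cbl != ATA_CBL_PATA40_SHORT)
		hwif->cbl = mychip_cable_detect(hwif);
}
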
diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c
index 3aeb7f1b7916..8197b653ba1e 100644
--- a/drivers/ide/pci/it821x.c
+++ b/drivers/ide/pci/it821x.c
@@ -491,10 +491,10 @@ static int it821x_config_drive_for_dma (ide_drive_t *drive)
491 * the needed logic onboard. 491 * the needed logic onboard.
492 */ 492 */
493 493
494static unsigned int __devinit ata66_it821x(ide_hwif_t *hwif) 494static u8 __devinit ata66_it821x(ide_hwif_t *hwif)
495{ 495{
496 /* The reference driver also only does disk side */ 496 /* The reference driver also only does disk side */
497 return 1; 497 return ATA_CBL_PATA80;
498} 498}
499 499
500/** 500/**
@@ -662,8 +662,9 @@ static void __devinit init_hwif_it821x(ide_hwif_t *hwif)
662 hwif->mwdma_mask = 0x07; 662 hwif->mwdma_mask = 0x07;
663 663
664 hwif->ide_dma_check = &it821x_config_drive_for_dma; 664 hwif->ide_dma_check = &it821x_config_drive_for_dma;
665 if (!(hwif->udma_four)) 665
666 hwif->udma_four = ata66_it821x(hwif); 666 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
667 hwif->cbl = ata66_it821x(hwif);
667 668
668 /* 669 /*
669 * The BIOS often doesn't set up DMA on this controller 670 * The BIOS often doesn't set up DMA on this controller
diff --git a/drivers/ide/pci/jmicron.c b/drivers/ide/pci/jmicron.c
index 76ed25147229..a6008f63e71e 100644
--- a/drivers/ide/pci/jmicron.c
+++ b/drivers/ide/pci/jmicron.c
@@ -25,10 +25,10 @@ typedef enum {
25 * ata66_jmicron - Cable check 25 * ata66_jmicron - Cable check
26 * @hwif: IDE port 26 * @hwif: IDE port
27 * 27 *
28 * Return 1 if the cable is 80pin 28 * Returns the cable type.
29 */ 29 */
30 30
31static int __devinit ata66_jmicron(ide_hwif_t *hwif) 31static u8 __devinit ata66_jmicron(ide_hwif_t *hwif)
32{ 32{
33 struct pci_dev *pdev = hwif->pci_dev; 33 struct pci_dev *pdev = hwif->pci_dev;
34 34
@@ -70,16 +70,17 @@ static int __devinit ata66_jmicron(ide_hwif_t *hwif)
70 { 70 {
71 case PORT_PATA0: 71 case PORT_PATA0:
72 if (control & (1 << 3)) /* 40/80 pin primary */ 72 if (control & (1 << 3)) /* 40/80 pin primary */
73 return 0; 73 return ATA_CBL_PATA40;
74 return 1; 74 return ATA_CBL_PATA80;
75 case PORT_PATA1: 75 case PORT_PATA1:
76 if (control5 & (1 << 19)) /* 40/80 pin secondary */ 76 if (control5 & (1 << 19)) /* 40/80 pin secondary */
77 return 0; 77 return ATA_CBL_PATA40;
78 return 1; 78 return ATA_CBL_PATA80;
79 case PORT_SATA: 79 case PORT_SATA:
80 break; 80 break;
81 } 81 }
82 return 1; /* Avoid bogus "control reaches end of non-void function" */ 82 /* Avoid bogus "control reaches end of non-void function" */
83 return ATA_CBL_PATA80;
83} 84}
84 85
85static void jmicron_tuneproc (ide_drive_t *drive, byte mode_wanted) 86static void jmicron_tuneproc (ide_drive_t *drive, byte mode_wanted)
@@ -159,8 +160,9 @@ static void __devinit init_hwif_jmicron(ide_hwif_t *hwif)
159 hwif->mwdma_mask = 0x07; 160 hwif->mwdma_mask = 0x07;
160 161
161 hwif->ide_dma_check = &jmicron_config_drive_for_dma; 162 hwif->ide_dma_check = &jmicron_config_drive_for_dma;
162 if (!(hwif->udma_four)) 163
163 hwif->udma_four = ata66_jmicron(hwif); 164 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
165 hwif->cbl = ata66_jmicron(hwif);
164 166
165 hwif->autodma = 1; 167 hwif->autodma = 1;
166 hwif->drives[0].autodma = hwif->autodma; 168 hwif->drives[0].autodma = hwif->autodma;
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index 0765dce6948e..ee5020df005d 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -225,7 +225,10 @@ static void pdcnew_tune_drive(ide_drive_t *drive, u8 pio)
225 225
226static u8 pdcnew_cable_detect(ide_hwif_t *hwif) 226static u8 pdcnew_cable_detect(ide_hwif_t *hwif)
227{ 227{
228 return get_indexed_reg(hwif, 0x0b) & 0x04; 228 if (get_indexed_reg(hwif, 0x0b) & 0x04)
229 return ATA_CBL_PATA40;
230 else
231 return ATA_CBL_PATA80;
229} 232}
230 233
231static int pdcnew_config_drive_xfer_rate(ide_drive_t *drive) 234static int pdcnew_config_drive_xfer_rate(ide_drive_t *drive)
@@ -509,8 +512,8 @@ static void __devinit init_hwif_pdc202new(ide_hwif_t *hwif)
509 512
510 hwif->ide_dma_check = &pdcnew_config_drive_xfer_rate; 513 hwif->ide_dma_check = &pdcnew_config_drive_xfer_rate;
511 514
512 if (!hwif->udma_four) 515 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
513 hwif->udma_four = pdcnew_cable_detect(hwif) ? 0 : 1; 516 hwif->cbl = pdcnew_cable_detect(hwif);
514 517
515 if (!noautodma) 518 if (!noautodma)
516 hwif->autodma = 1; 519 hwif->autodma = 1;
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index 23844687deea..41ac4a94959f 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -152,8 +152,10 @@ static void pdc202xx_tune_drive(ide_drive_t *drive, u8 pio)
152static u8 pdc202xx_old_cable_detect (ide_hwif_t *hwif) 152static u8 pdc202xx_old_cable_detect (ide_hwif_t *hwif)
153{ 153{
154 u16 CIS = 0, mask = (hwif->channel) ? (1<<11) : (1<<10); 154 u16 CIS = 0, mask = (hwif->channel) ? (1<<11) : (1<<10);
155
155 pci_read_config_word(hwif->pci_dev, 0x50, &CIS); 156 pci_read_config_word(hwif->pci_dev, 0x50, &CIS);
156 return (CIS & mask) ? 1 : 0; 157
158 return (CIS & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
157} 159}
158 160
159/* 161/*
@@ -267,18 +269,24 @@ somebody_else:
267 return (dma_stat & 4) == 4; /* return 1 if INTR asserted */ 269 return (dma_stat & 4) == 4; /* return 1 if INTR asserted */
268} 270}
269 271
270static int pdc202xx_ide_dma_lostirq(ide_drive_t *drive) 272static void pdc202xx_dma_lost_irq(ide_drive_t *drive)
271{ 273{
272 if (HWIF(drive)->resetproc != NULL) 274 ide_hwif_t *hwif = HWIF(drive);
273 HWIF(drive)->resetproc(drive); 275
274 return __ide_dma_lostirq(drive); 276 if (hwif->resetproc != NULL)
277 hwif->resetproc(drive);
278
279 ide_dma_lost_irq(drive);
275} 280}
276 281
277static int pdc202xx_ide_dma_timeout(ide_drive_t *drive) 282static void pdc202xx_dma_timeout(ide_drive_t *drive)
278{ 283{
279 if (HWIF(drive)->resetproc != NULL) 284 ide_hwif_t *hwif = HWIF(drive);
280 HWIF(drive)->resetproc(drive); 285
281 return __ide_dma_timeout(drive); 286 if (hwif->resetproc != NULL)
287 hwif->resetproc(drive);
288
289 ide_dma_timeout(drive);
282} 290}
283 291
284static void pdc202xx_reset_host (ide_hwif_t *hwif) 292static void pdc202xx_reset_host (ide_hwif_t *hwif)
@@ -347,12 +355,13 @@ static void __devinit init_hwif_pdc202xx(ide_hwif_t *hwif)
347 hwif->err_stops_fifo = 1; 355 hwif->err_stops_fifo = 1;
348 356
349 hwif->ide_dma_check = &pdc202xx_config_drive_xfer_rate; 357 hwif->ide_dma_check = &pdc202xx_config_drive_xfer_rate;
350 hwif->ide_dma_lostirq = &pdc202xx_ide_dma_lostirq; 358 hwif->dma_lost_irq = &pdc202xx_dma_lost_irq;
351 hwif->ide_dma_timeout = &pdc202xx_ide_dma_timeout; 359 hwif->dma_timeout = &pdc202xx_dma_timeout;
352 360
353 if (hwif->pci_dev->device != PCI_DEVICE_ID_PROMISE_20246) { 361 if (hwif->pci_dev->device != PCI_DEVICE_ID_PROMISE_20246) {
354 if (!(hwif->udma_four)) 362 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
355 hwif->udma_four = (pdc202xx_old_cable_detect(hwif)) ? 0 : 1; 363 hwif->cbl = pdc202xx_old_cable_detect(hwif);
364
356 hwif->dma_start = &pdc202xx_old_ide_dma_start; 365 hwif->dma_start = &pdc202xx_old_ide_dma_start;
357 hwif->ide_dma_end = &pdc202xx_old_ide_dma_end; 366 hwif->ide_dma_end = &pdc202xx_old_ide_dma_end;
358 } 367 }
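
The other recurring change in this file (and in sgiioc4, sl82c105 and pmac below) is the ide_dma_lostirq/ide_dma_timeout methods becoming dma_lost_irq/dma_timeout: the hooks now return void and finish by calling the generic ide_dma_lost_irq()/ide_dma_timeout() helpers instead of the old __ide_dma_* variants, so the chip-specific part shrinks to the extra reset work. A sketch of that shape, with a hypothetical mychip_reset() standing in for the chip specifics:

/* Sketch: chip-specific lost-IRQ/timeout handlers wrapping the generic ones. */
static void mychip_dma_lost_irq(ide_drive_t *drive)
{
	mychip_reset(drive->hwif);	/* hypothetical chip reset helper */
	ide_dma_lost_irq(drive);	/* generic recovery/bookkeeping */
}

static void mychip_dma_timeout(ide_drive_t *drive)
{
	mychip_reset(drive->hwif);
	ide_dma_timeout(drive);
}

static void __devinit init_hwif_mychip(ide_hwif_t *hwif)
{
	hwif->dma_lost_irq = &mychip_dma_lost_irq;
	hwif->dma_timeout  = &mychip_dma_timeout;
}
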
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 8b219dd63024..2e0b29ef596a 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/piix.c Version 0.47 February 8, 2007 2 * linux/drivers/ide/pci/piix.c Version 0.50 Jun 10, 2007
3 * 3 *
4 * Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer 4 * Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer
5 * Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org> 5 * Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org>
@@ -394,14 +394,45 @@ static void piix_dma_clear_irq(ide_drive_t *drive)
394 hwif->OUTB(dma_stat, hwif->dma_status); 394 hwif->OUTB(dma_stat, hwif->dma_status);
395} 395}
396 396
397static int __devinit piix_cable_detect(ide_hwif_t *hwif) 397struct ich_laptop {
398 u16 device;
399 u16 subvendor;
400 u16 subdevice;
401};
402
403/*
404 * List of laptops that use short cables rather than 80 wire
405 */
406
407static const struct ich_laptop ich_laptop[] = {
408 /* devid, subvendor, subdev */
409 { 0x27DF, 0x0005, 0x0280 }, /* ICH7 on Acer 5602WLMi */
410 { 0x27DF, 0x1025, 0x0110 }, /* ICH7 on Acer 3682WLMi */
411 { 0x27DF, 0x1043, 0x1267 }, /* ICH7 on Asus W5F */
412 { 0x24CA, 0x1025, 0x0061 }, /* ICH4 on Acer Aspire 2023WLMi */
413 /* end marker */
414 { 0, }
415};
416
417static u8 __devinit piix_cable_detect(ide_hwif_t *hwif)
398{ 418{
399 struct pci_dev *dev = hwif->pci_dev; 419 struct pci_dev *pdev = hwif->pci_dev;
420 const struct ich_laptop *lap = &ich_laptop[0];
400 u8 reg54h = 0, mask = hwif->channel ? 0xc0 : 0x30; 421 u8 reg54h = 0, mask = hwif->channel ? 0xc0 : 0x30;
401 422
402 pci_read_config_byte(dev, 0x54, &reg54h); 423 /* check for specials */
424 while (lap->device) {
425 if (lap->device == pdev->device &&
426 lap->subvendor == pdev->subsystem_vendor &&
427 lap->subdevice == pdev->subsystem_device) {
428 return ATA_CBL_PATA40_SHORT;
429 }
430 lap++;
431 }
432
433 pci_read_config_byte(pdev, 0x54, &reg54h);
403 434
404 return (reg54h & mask) ? 1 : 0; 435 return (reg54h & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
405} 436}
406 437
407/** 438/**
@@ -444,8 +475,8 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif)
444 hwif->swdma_mask = 0x04; 475 hwif->swdma_mask = 0x04;
445 476
446 if (hwif->ultra_mask & 0x78) { 477 if (hwif->ultra_mask & 0x78) {
447 if (!hwif->udma_four) 478 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
448 hwif->udma_four = piix_cable_detect(hwif); 479 hwif->cbl = piix_cable_detect(hwif);
449 } 480 }
450 481
451 if (no_piix_dma) 482 if (no_piix_dma)
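
The new ich_laptop[] table is how the ATA_CBL_PATA40_SHORT case actually gets set on PIIX/ICH: a handful of laptops are known to use a short 40-wire hookup that is still good for the faster UDMA modes, and they are matched on PCI device plus subsystem IDs before the normal 80-wire detection runs. The lookup is a plain linear scan to the zero terminator; pulled out on its own it would look like this (sketch only, the helper name is invented):

/* Sketch: match a PCI device against the zero-terminated quirk table. */
static int ich_is_short_cable_laptop(const struct pci_dev *pdev)
{
	const struct ich_laptop *lap;

	for (lap = &ich_laptop[0]; lap->device; lap++) {
		if (lap->device == pdev->device &&
		    lap->subvendor == pdev->subsystem_vendor &&
		    lap->subdevice == pdev->subsystem_device)
			return 1;
	}
	return 0;
}

piix_cable_detect() returns ATA_CBL_PATA40_SHORT for these machines so that the 40-wire reading in register 0x54 does not end up capping the attached drive at UDMA/33.
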
diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c
index 55bc0a32e34f..7b87488e3daa 100644
--- a/drivers/ide/pci/scc_pata.c
+++ b/drivers/ide/pci/scc_pata.c
@@ -716,7 +716,7 @@ static void __devinit init_hwif_scc(ide_hwif_t *hwif)
716 hwif->atapi_dma = 1; 716 hwif->atapi_dma = 1;
717 717
718 /* we support 80c cable only. */ 718 /* we support 80c cable only. */
719 hwif->udma_four = 1; 719 hwif->cbl = ATA_CBL_PATA80;
720 720
721 hwif->autodma = 0; 721 hwif->autodma = 0;
722 if (!noautodma) 722 if (!noautodma)
diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index d9c4fd1ae996..1371b5bf6bf0 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/serverworks.c Version 0.11 Jun 2 2007 2 * linux/drivers/ide/pci/serverworks.c Version 0.20 Jun 3 2007
3 * 3 *
4 * Copyright (C) 1998-2000 Michel Aubry 4 * Copyright (C) 1998-2000 Michel Aubry
5 * Copyright (C) 1998-2000 Andrzej Krzysztofowicz 5 * Copyright (C) 1998-2000 Andrzej Krzysztofowicz
@@ -151,84 +151,11 @@ static int svwks_tune_chipset (ide_drive_t *drive, u8 xferspeed)
151 if(dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4 && 151 if(dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4 &&
152 drive->media == ide_disk && speed >= XFER_UDMA_0) 152 drive->media == ide_disk && speed >= XFER_UDMA_0)
153 BUG(); 153 BUG();
154 154
155 pci_read_config_byte(dev, drive_pci[drive->dn], &pio_timing);
156 pci_read_config_byte(dev, drive_pci2[drive->dn], &dma_timing);
157 pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing); 155 pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing);
158 pci_read_config_word(dev, 0x4A, &csb5_pio); 156 pci_read_config_word(dev, 0x4A, &csb5_pio);
159 pci_read_config_byte(dev, 0x54, &ultra_enable); 157 pci_read_config_byte(dev, 0x54, &ultra_enable);
160 158
161 /* If we are in RAID mode (eg AMI MegaIDE) then we can't it
162 turns out trust the firmware configuration */
163
164 if ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE)
165 goto oem_setup_failed;
166
167 /* Per Specified Design by OEM, and ASIC Architect */
168 if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) ||
169 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) {
170 if (!drive->init_speed) {
171 u8 dma_stat = inb(hwif->dma_status);
172
173 if (((ultra_enable << (7-drive->dn) & 0x80) == 0x80) &&
174 ((dma_stat & (1<<(5+unit))) == (1<<(5+unit)))) {
175 drive->current_speed = drive->init_speed = XFER_UDMA_0 + udma_modes[(ultra_timing >> (4*unit)) & ~(0xF0)];
176 return 0;
177 } else if ((dma_timing) &&
178 ((dma_stat&(1<<(5+unit)))==(1<<(5+unit)))) {
179 u8 dmaspeed;
180
181 switch (dma_timing & 0x77) {
182 case 0x20:
183 dmaspeed = XFER_MW_DMA_2;
184 break;
185 case 0x21:
186 dmaspeed = XFER_MW_DMA_1;
187 break;
188 case 0x77:
189 dmaspeed = XFER_MW_DMA_0;
190 break;
191 default:
192 goto dma_pio;
193 }
194
195 drive->current_speed = drive->init_speed = dmaspeed;
196 return 0;
197 }
198dma_pio:
199 if (pio_timing) {
200 u8 piospeed;
201
202 switch (pio_timing & 0x7f) {
203 case 0x20:
204 piospeed = XFER_PIO_4;
205 break;
206 case 0x22:
207 piospeed = XFER_PIO_3;
208 break;
209 case 0x34:
210 piospeed = XFER_PIO_2;
211 break;
212 case 0x47:
213 piospeed = XFER_PIO_1;
214 break;
215 case 0x5d:
216 piospeed = XFER_PIO_0;
217 break;
218 default:
219 goto oem_setup_failed;
220 }
221
222 drive->current_speed = drive->init_speed = piospeed;
223 return 0;
224 }
225 }
226 }
227
228oem_setup_failed:
229
230 pio_timing = 0;
231 dma_timing = 0;
232 ultra_timing &= ~(0x0F << (4*unit)); 159 ultra_timing &= ~(0x0F << (4*unit));
233 ultra_enable &= ~(0x01 << drive->dn); 160 ultra_enable &= ~(0x01 << drive->dn);
234 csb5_pio &= ~(0x0F << (4*drive->dn)); 161 csb5_pio &= ~(0x0F << (4*drive->dn));
@@ -402,9 +329,9 @@ static unsigned int __devinit init_chipset_svwks (struct pci_dev *dev, const cha
402 return dev->irq; 329 return dev->irq;
403} 330}
404 331
405static unsigned int __devinit ata66_svwks_svwks (ide_hwif_t *hwif) 332static u8 __devinit ata66_svwks_svwks(ide_hwif_t *hwif)
406{ 333{
407 return 1; 334 return ATA_CBL_PATA80;
408} 335}
409 336
410/* On Dell PowerEdge servers with a CSB5/CSB6, the top two bits 337/* On Dell PowerEdge servers with a CSB5/CSB6, the top two bits
@@ -414,7 +341,7 @@ static unsigned int __devinit ata66_svwks_svwks (ide_hwif_t *hwif)
414 * Bit 14 clear = primary IDE channel does not have 80-pin cable. 341 * Bit 14 clear = primary IDE channel does not have 80-pin cable.
415 * Bit 14 set = primary IDE channel has 80-pin cable. 342 * Bit 14 set = primary IDE channel has 80-pin cable.
416 */ 343 */
417static unsigned int __devinit ata66_svwks_dell (ide_hwif_t *hwif) 344static u8 __devinit ata66_svwks_dell(ide_hwif_t *hwif)
418{ 345{
419 struct pci_dev *dev = hwif->pci_dev; 346 struct pci_dev *dev = hwif->pci_dev;
420 if (dev->subsystem_vendor == PCI_VENDOR_ID_DELL && 347 if (dev->subsystem_vendor == PCI_VENDOR_ID_DELL &&
@@ -422,8 +349,8 @@ static unsigned int __devinit ata66_svwks_dell (ide_hwif_t *hwif)
422 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE || 349 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE ||
423 dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE)) 350 dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE))
424 return ((1 << (hwif->channel + 14)) & 351 return ((1 << (hwif->channel + 14)) &
425 dev->subsystem_device) ? 1 : 0; 352 dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
426 return 0; 353 return ATA_CBL_PATA40;
427} 354}
428 355
429/* Sun Cobalt Alpine hardware avoids the 80-pin cable 356/* Sun Cobalt Alpine hardware avoids the 80-pin cable
@@ -432,18 +359,18 @@ static unsigned int __devinit ata66_svwks_dell (ide_hwif_t *hwif)
432 * 359 *
433 * WARNING: this only works on Alpine hardware! 360 * WARNING: this only works on Alpine hardware!
434 */ 361 */
435static unsigned int __devinit ata66_svwks_cobalt (ide_hwif_t *hwif) 362static u8 __devinit ata66_svwks_cobalt(ide_hwif_t *hwif)
436{ 363{
437 struct pci_dev *dev = hwif->pci_dev; 364 struct pci_dev *dev = hwif->pci_dev;
438 if (dev->subsystem_vendor == PCI_VENDOR_ID_SUN && 365 if (dev->subsystem_vendor == PCI_VENDOR_ID_SUN &&
439 dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 366 dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
440 dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE) 367 dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE)
441 return ((1 << (hwif->channel + 14)) & 368 return ((1 << (hwif->channel + 14)) &
442 dev->subsystem_device) ? 1 : 0; 369 dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
443 return 0; 370 return ATA_CBL_PATA40;
444} 371}
445 372
446static unsigned int __devinit ata66_svwks (ide_hwif_t *hwif) 373static u8 __devinit ata66_svwks(ide_hwif_t *hwif)
447{ 374{
448 struct pci_dev *dev = hwif->pci_dev; 375 struct pci_dev *dev = hwif->pci_dev;
449 376
@@ -462,9 +389,9 @@ static unsigned int __devinit ata66_svwks (ide_hwif_t *hwif)
462 /* Per Specified Design by OEM, and ASIC Architect */ 389 /* Per Specified Design by OEM, and ASIC Architect */
463 if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) || 390 if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) ||
464 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) 391 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2))
465 return 1; 392 return ATA_CBL_PATA80;
466 393
467 return 0; 394 return ATA_CBL_PATA40;
468} 395}
469 396
470static void __devinit init_hwif_svwks (ide_hwif_t *hwif) 397static void __devinit init_hwif_svwks (ide_hwif_t *hwif)
@@ -495,8 +422,8 @@ static void __devinit init_hwif_svwks (ide_hwif_t *hwif)
495 422
496 hwif->ide_dma_check = &svwks_config_drive_xfer_rate; 423 hwif->ide_dma_check = &svwks_config_drive_xfer_rate;
497 if (hwif->pci_dev->device != PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) { 424 if (hwif->pci_dev->device != PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) {
498 if (!hwif->udma_four) 425 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
499 hwif->udma_four = ata66_svwks(hwif); 426 hwif->cbl = ata66_svwks(hwif);
500 } 427 }
501 if (!noautodma) 428 if (!noautodma)
502 hwif->autodma = 1; 429 hwif->autodma = 1;
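
The Dell and Cobalt helpers above encode the cable information in the PCI subsystem device ID itself: bit 14 describes the primary channel and bit 15 the secondary, hence the (1 << (hwif->channel + 14)) test. Reduced to the decode step (sketch, hypothetical helper name):

/* Sketch: decode the per-channel 80-wire bits from the subsystem ID. */
static u8 svwks_cbl_from_subsys(u16 subsystem_device, int channel)
{
	return (subsystem_device & (1 << (channel + 14))) ?
		ATA_CBL_PATA80 : ATA_CBL_PATA40;
}

For example, a subsystem ID of 0xc000 has bits 14 and 15 set, so both channels report an 80-wire cable, while 0x4000 marks only the primary channel.
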
diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c
index d3185e29a38e..d396b2929ed8 100644
--- a/drivers/ide/pci/sgiioc4.c
+++ b/drivers/ide/pci/sgiioc4.c
@@ -316,14 +316,6 @@ static void sgiioc4_dma_host_off(ide_drive_t * drive)
316 sgiioc4_clearirq(drive); 316 sgiioc4_clearirq(drive);
317} 317}
318 318
319static int
320sgiioc4_ide_dma_lostirq(ide_drive_t * drive)
321{
322 HWIF(drive)->resetproc(drive);
323
324 return __ide_dma_lostirq(drive);
325}
326
327static void 319static void
328sgiioc4_resetproc(ide_drive_t * drive) 320sgiioc4_resetproc(ide_drive_t * drive)
329{ 321{
@@ -331,6 +323,14 @@ sgiioc4_resetproc(ide_drive_t * drive)
331 sgiioc4_clearirq(drive); 323 sgiioc4_clearirq(drive);
332} 324}
333 325
326static void
327sgiioc4_dma_lost_irq(ide_drive_t * drive)
328{
329 sgiioc4_resetproc(drive);
330
331 ide_dma_lost_irq(drive);
332}
333
334static u8 334static u8
335sgiioc4_INB(unsigned long port) 335sgiioc4_INB(unsigned long port)
336{ 336{
@@ -607,8 +607,8 @@ ide_init_sgiioc4(ide_hwif_t * hwif)
607 hwif->ide_dma_test_irq = &sgiioc4_ide_dma_test_irq; 607 hwif->ide_dma_test_irq = &sgiioc4_ide_dma_test_irq;
608 hwif->dma_host_on = &sgiioc4_dma_host_on; 608 hwif->dma_host_on = &sgiioc4_dma_host_on;
609 hwif->dma_host_off = &sgiioc4_dma_host_off; 609 hwif->dma_host_off = &sgiioc4_dma_host_off;
610 hwif->ide_dma_lostirq = &sgiioc4_ide_dma_lostirq; 610 hwif->dma_lost_irq = &sgiioc4_dma_lost_irq;
611 hwif->ide_dma_timeout = &__ide_dma_timeout; 611 hwif->dma_timeout = &ide_dma_timeout;
612 612
613 hwif->INB = &sgiioc4_INB; 613 hwif->INB = &sgiioc4_INB;
614} 614}
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index 1a4444e7226a..1c3e35487893 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -933,16 +933,17 @@ static void __devinit init_iops_siimage(ide_hwif_t *hwif)
933 * interface. 933 * interface.
934 */ 934 */
935 935
936static unsigned int __devinit ata66_siimage(ide_hwif_t *hwif) 936static u8 __devinit ata66_siimage(ide_hwif_t *hwif)
937{ 937{
938 unsigned long addr = siimage_selreg(hwif, 0); 938 unsigned long addr = siimage_selreg(hwif, 0);
939 if (pci_get_drvdata(hwif->pci_dev) == NULL) { 939 u8 ata66 = 0;
940 u8 ata66 = 0; 940
941 if (pci_get_drvdata(hwif->pci_dev) == NULL)
941 pci_read_config_byte(hwif->pci_dev, addr, &ata66); 942 pci_read_config_byte(hwif->pci_dev, addr, &ata66);
942 return (ata66 & 0x01) ? 1 : 0; 943 else
943 } 944 ata66 = hwif->INB(addr);
944 945
945 return (hwif->INB(addr) & 0x01) ? 1 : 0; 946 return (ata66 & 0x01) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
946} 947}
947 948
948/** 949/**
@@ -988,8 +989,9 @@ static void __devinit init_hwif_siimage(ide_hwif_t *hwif)
988 hwif->atapi_dma = 1; 989 hwif->atapi_dma = 1;
989 990
990 hwif->ide_dma_check = &siimage_config_drive_for_dma; 991 hwif->ide_dma_check = &siimage_config_drive_for_dma;
991 if (!(hwif->udma_four)) 992
992 hwif->udma_four = ata66_siimage(hwif); 993 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
994 hwif->cbl = ata66_siimage(hwif);
993 995
994 if (hwif->mmio) { 996 if (hwif->mmio) {
995 hwif->ide_dma_test_irq = &siimage_mmio_ide_dma_test_irq; 997 hwif->ide_dma_test_irq = &siimage_mmio_ide_dma_test_irq;
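
ata66_siimage() now reads the cable bit once and only the access path differs: when the driver has stashed an MMIO mapping in the PCI drvdata (the usual siimage arrangement, assumed here) it goes through hwif->INB(), otherwise it falls back to a config-space read of the same selection register. A stripped-down sketch of that split; in the real driver the address comes from siimage_selreg(), taken as given here:

/* Sketch: one cable bit, two access paths depending on MMIO availability. */
static u8 __devinit mychip_cable_from_selreg(ide_hwif_t *hwif, unsigned long addr)
{
	u8 reg = 0;

	if (pci_get_drvdata(hwif->pci_dev) == NULL)	/* no MMIO mapping */
		pci_read_config_byte(hwif->pci_dev, addr, &reg);
	else
		reg = hwif->INB(addr);			/* memory-mapped copy */

	return (reg & 0x01) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
}
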
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index ec0adad9ef61..f875183ac8d9 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/sis5513.c Version 0.20 Mar 4, 2007 2 * linux/drivers/ide/pci/sis5513.c Version 0.25 Jun 10, 2007
3 * 3 *
4 * Copyright (C) 1999-2000 Andre Hedrick <andre@linux-ide.org> 4 * Copyright (C) 1999-2000 Andre Hedrick <andre@linux-ide.org>
5 * Copyright (C) 2002 Lionel Bouton <Lionel.Bouton@inet6.fr>, Maintainer 5 * Copyright (C) 2002 Lionel Bouton <Lionel.Bouton@inet6.fr>, Maintainer
@@ -796,10 +796,33 @@ static unsigned int __devinit init_chipset_sis5513 (struct pci_dev *dev, const c
796 return 0; 796 return 0;
797} 797}
798 798
799static unsigned int __devinit ata66_sis5513 (ide_hwif_t *hwif) 799struct sis_laptop {
800 u16 device;
801 u16 subvendor;
802 u16 subdevice;
803};
804
805static const struct sis_laptop sis_laptop[] = {
806 /* devid, subvendor, subdev */
807 { 0x5513, 0x1043, 0x1107 }, /* ASUS A6K */
808 /* end marker */
809 { 0, }
810};
811
812static u8 __devinit ata66_sis5513(ide_hwif_t *hwif)
800{ 813{
814 struct pci_dev *pdev = hwif->pci_dev;
815 const struct sis_laptop *lap = &sis_laptop[0];
801 u8 ata66 = 0; 816 u8 ata66 = 0;
802 817
818 while (lap->device) {
819 if (lap->device == pdev->device &&
820 lap->subvendor == pdev->subsystem_vendor &&
821 lap->subdevice == pdev->subsystem_device)
822 return ATA_CBL_PATA40_SHORT;
823 lap++;
824 }
825
803 if (chipset_family >= ATA_133) { 826 if (chipset_family >= ATA_133) {
804 u16 regw = 0; 827 u16 regw = 0;
805 u16 reg_addr = hwif->channel ? 0x52: 0x50; 828 u16 reg_addr = hwif->channel ? 0x52: 0x50;
@@ -811,7 +834,8 @@ static unsigned int __devinit ata66_sis5513 (ide_hwif_t *hwif)
811 pci_read_config_byte(hwif->pci_dev, 0x48, &reg48h); 834 pci_read_config_byte(hwif->pci_dev, 0x48, &reg48h);
812 ata66 = (reg48h & mask) ? 0 : 1; 835 ata66 = (reg48h & mask) ? 0 : 1;
813 } 836 }
814 return ata66; 837
838 return ata66 ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
815} 839}
816 840
817static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif) 841static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif)
@@ -841,8 +865,8 @@ static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif)
841 if (!chipset_family) 865 if (!chipset_family)
842 return; 866 return;
843 867
844 if (!(hwif->udma_four)) 868 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
845 hwif->udma_four = ata66_sis5513(hwif); 869 hwif->cbl = ata66_sis5513(hwif);
846 870
847 if (chipset_family > ATA_16) { 871 if (chipset_family > ATA_16) {
848 hwif->ide_dma_check = &sis5513_config_xfer_rate; 872 hwif->ide_dma_check = &sis5513_config_xfer_rate;
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index 7c383d9cc472..487879842af4 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -195,7 +195,7 @@ static inline void sl82c105_reset_host(struct pci_dev *dev)
195 * This function is called when the IDE timer expires, the drive 195 * This function is called when the IDE timer expires, the drive
196 * indicates that it is READY, and we were waiting for DMA to complete. 196 * indicates that it is READY, and we were waiting for DMA to complete.
197 */ 197 */
198static int sl82c105_ide_dma_lostirq(ide_drive_t *drive) 198static void sl82c105_dma_lost_irq(ide_drive_t *drive)
199{ 199{
200 ide_hwif_t *hwif = HWIF(drive); 200 ide_hwif_t *hwif = HWIF(drive);
201 struct pci_dev *dev = hwif->pci_dev; 201 struct pci_dev *dev = hwif->pci_dev;
@@ -222,9 +222,6 @@ static int sl82c105_ide_dma_lostirq(ide_drive_t *drive)
222 } 222 }
223 223
224 sl82c105_reset_host(dev); 224 sl82c105_reset_host(dev);
225
226 /* __ide_dma_lostirq would return 1, so we do as well */
227 return 1;
228} 225}
229 226
230/* 227/*
@@ -244,15 +241,12 @@ static void sl82c105_dma_start(ide_drive_t *drive)
244 ide_dma_start(drive); 241 ide_dma_start(drive);
245} 242}
246 243
247static int sl82c105_ide_dma_timeout(ide_drive_t *drive) 244static void sl82c105_dma_timeout(ide_drive_t *drive)
248{ 245{
249 ide_hwif_t *hwif = HWIF(drive); 246 DBG(("sl82c105_dma_timeout(drive:%s)\n", drive->name));
250 struct pci_dev *dev = hwif->pci_dev;
251 247
252 DBG(("sl82c105_ide_dma_timeout(drive:%s)\n", drive->name)); 248 sl82c105_reset_host(HWIF(drive)->pci_dev);
253 249 ide_dma_timeout(drive);
254 sl82c105_reset_host(dev);
255 return __ide_dma_timeout(drive);
256} 250}
257 251
258static int sl82c105_ide_dma_on(ide_drive_t *drive) 252static int sl82c105_ide_dma_on(ide_drive_t *drive)
@@ -441,9 +435,9 @@ static void __devinit init_hwif_sl82c105(ide_hwif_t *hwif)
441 hwif->ide_dma_check = &sl82c105_ide_dma_check; 435 hwif->ide_dma_check = &sl82c105_ide_dma_check;
442 hwif->ide_dma_on = &sl82c105_ide_dma_on; 436 hwif->ide_dma_on = &sl82c105_ide_dma_on;
443 hwif->dma_off_quietly = &sl82c105_dma_off_quietly; 437 hwif->dma_off_quietly = &sl82c105_dma_off_quietly;
444 hwif->ide_dma_lostirq = &sl82c105_ide_dma_lostirq; 438 hwif->dma_lost_irq = &sl82c105_dma_lost_irq;
445 hwif->dma_start = &sl82c105_dma_start; 439 hwif->dma_start = &sl82c105_dma_start;
446 hwif->ide_dma_timeout = &sl82c105_ide_dma_timeout; 440 hwif->dma_timeout = &sl82c105_dma_timeout;
447 441
448 if (!noautodma) 442 if (!noautodma)
449 hwif->autodma = 1; 443 hwif->autodma = 1;
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index c40f291f91e0..575dbbd8b482 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -199,10 +199,9 @@ static void __devinit init_hwif_slc90e66 (ide_hwif_t *hwif)
199 hwif->mwdma_mask = 0x06; 199 hwif->mwdma_mask = 0x06;
200 hwif->swdma_mask = 0x04; 200 hwif->swdma_mask = 0x04;
201 201
202 if (!hwif->udma_four) { 202 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
203 /* bit[0(1)]: 0:80, 1:40 */ 203 /* bit[0(1)]: 0:80, 1:40 */
204 hwif->udma_four = (reg47 & mask) ? 0 : 1; 204 hwif->cbl = (reg47 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
205 }
206 205
207 hwif->ide_dma_check = &slc90e66_config_drive_xfer_rate; 206 hwif->ide_dma_check = &slc90e66_config_drive_xfer_rate;
208 207
diff --git a/drivers/ide/pci/tc86c001.c b/drivers/ide/pci/tc86c001.c
index cee619bb2eaf..8de1f8e22494 100644
--- a/drivers/ide/pci/tc86c001.c
+++ b/drivers/ide/pci/tc86c001.c
@@ -220,13 +220,13 @@ static void __devinit init_hwif_tc86c001(ide_hwif_t *hwif)
220 hwif->ide_dma_check = &tc86c001_config_drive_xfer_rate; 220 hwif->ide_dma_check = &tc86c001_config_drive_xfer_rate;
221 hwif->dma_start = &tc86c001_dma_start; 221 hwif->dma_start = &tc86c001_dma_start;
222 222
223 if (!hwif->udma_four) { 223 if (hwif->cbl != ATA_CBL_PATA40_SHORT) {
224 /* 224 /*
225 * System Control 1 Register bit 13 (PDIAGN): 225 * System Control 1 Register bit 13 (PDIAGN):
226 * 0=80-pin cable, 1=40-pin cable 226 * 0=80-pin cable, 1=40-pin cable
227 */ 227 */
228 scr1 = hwif->INW(sc_base + 0x00); 228 scr1 = hwif->INW(sc_base + 0x00);
229 hwif->udma_four = (scr1 & 0x2000) ? 0 : 1; 229 hwif->cbl = (scr1 & 0x2000) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
230 } 230 }
231 231
232 if (!noautodma) 232 if (!noautodma)
diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c
index a508550c4095..d21dd2e7eeb3 100644
--- a/drivers/ide/pci/via82cxxx.c
+++ b/drivers/ide/pci/via82cxxx.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * 2 *
3 * Version 3.38 3 * Version 3.45
4 * 4 *
5 * VIA IDE driver for Linux. Supported southbridges: 5 * VIA IDE driver for Linux. Supported southbridges:
6 * 6 *
@@ -9,6 +9,7 @@
9 * vt8235, vt8237, vt8237a 9 * vt8235, vt8237, vt8237a
10 * 10 *
11 * Copyright (c) 2000-2002 Vojtech Pavlik 11 * Copyright (c) 2000-2002 Vojtech Pavlik
12 * Copyright (c) 2007 Bartlomiej Zolnierkiewicz
12 * 13 *
13 * Based on the work of: 14 * Based on the work of:
14 * Michel Aubry 15 * Michel Aubry
@@ -33,6 +34,8 @@
33#include <linux/pci.h> 34#include <linux/pci.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/ide.h> 36#include <linux/ide.h>
37#include <linux/dmi.h>
38
36#include <asm/io.h> 39#include <asm/io.h>
37 40
38#ifdef CONFIG_PPC_CHRP 41#ifdef CONFIG_PPC_CHRP
@@ -41,8 +44,6 @@
41 44
42#include "ide-timing.h" 45#include "ide-timing.h"
43 46
44#define DISPLAY_VIA_TIMINGS
45
46#define VIA_IDE_ENABLE 0x40 47#define VIA_IDE_ENABLE 0x40
47#define VIA_IDE_CONFIG 0x41 48#define VIA_IDE_CONFIG 0x41
48#define VIA_FIFO_CONFIG 0x43 49#define VIA_FIFO_CONFIG 0x43
@@ -54,18 +55,12 @@
54#define VIA_ADDRESS_SETUP 0x4c 55#define VIA_ADDRESS_SETUP 0x4c
55#define VIA_UDMA_TIMING 0x50 56#define VIA_UDMA_TIMING 0x50
56 57
57#define VIA_UDMA 0x007 58#define VIA_BAD_PREQ 0x01 /* Crashes if PREQ# till DDACK# set */
58#define VIA_UDMA_NONE 0x000 59#define VIA_BAD_CLK66 0x02 /* 66 MHz clock doesn't work correctly */
59#define VIA_UDMA_33 0x001 60#define VIA_SET_FIFO 0x04 /* Needs to have FIFO split set */
60#define VIA_UDMA_66 0x002 61#define VIA_NO_UNMASK 0x08 /* Doesn't work with IRQ unmasking on */
61#define VIA_UDMA_100 0x003 62#define VIA_BAD_ID 0x10 /* Has wrong vendor ID (0x1107) */
62#define VIA_UDMA_133 0x004 63#define VIA_BAD_AST 0x20 /* Don't touch Address Setup Timing */
63#define VIA_BAD_PREQ 0x010 /* Crashes if PREQ# till DDACK# set */
64#define VIA_BAD_CLK66 0x020 /* 66 MHz clock doesn't work correctly */
65#define VIA_SET_FIFO 0x040 /* Needs to have FIFO split set */
66#define VIA_NO_UNMASK 0x080 /* Doesn't work with IRQ unmasking on */
67#define VIA_BAD_ID 0x100 /* Has wrong vendor ID (0x1107) */
68#define VIA_BAD_AST 0x200 /* Don't touch Address Setup Timing */
69 64
70/* 65/*
71 * VIA SouthBridge chips. 66 * VIA SouthBridge chips.
@@ -76,36 +71,37 @@ static struct via_isa_bridge {
76 u16 id; 71 u16 id;
77 u8 rev_min; 72 u8 rev_min;
78 u8 rev_max; 73 u8 rev_max;
79 u16 flags; 74 u8 udma_mask;
75 u8 flags;
80} via_isa_bridges[] = { 76} via_isa_bridges[] = {
81 { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 77 { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
82 { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 78 { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
83 { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 79 { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
84 { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 80 { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
85 { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 81 { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
86 { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 82 { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
87 { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 83 { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
88 { "vt8233a", PCI_DEVICE_ID_VIA_8233A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 84 { "vt8233a", PCI_DEVICE_ID_VIA_8233A, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
89 { "vt8233c", PCI_DEVICE_ID_VIA_8233C_0, 0x00, 0x2f, VIA_UDMA_100 }, 85 { "vt8233c", PCI_DEVICE_ID_VIA_8233C_0, 0x00, 0x2f, ATA_UDMA5, },
90 { "vt8233", PCI_DEVICE_ID_VIA_8233_0, 0x00, 0x2f, VIA_UDMA_100 }, 86 { "vt8233", PCI_DEVICE_ID_VIA_8233_0, 0x00, 0x2f, ATA_UDMA5, },
91 { "vt8231", PCI_DEVICE_ID_VIA_8231, 0x00, 0x2f, VIA_UDMA_100 }, 87 { "vt8231", PCI_DEVICE_ID_VIA_8231, 0x00, 0x2f, ATA_UDMA5, },
92 { "vt82c686b", PCI_DEVICE_ID_VIA_82C686, 0x40, 0x4f, VIA_UDMA_100 }, 88 { "vt82c686b", PCI_DEVICE_ID_VIA_82C686, 0x40, 0x4f, ATA_UDMA5, },
93 { "vt82c686a", PCI_DEVICE_ID_VIA_82C686, 0x10, 0x2f, VIA_UDMA_66 }, 89 { "vt82c686a", PCI_DEVICE_ID_VIA_82C686, 0x10, 0x2f, ATA_UDMA4, },
94 { "vt82c686", PCI_DEVICE_ID_VIA_82C686, 0x00, 0x0f, VIA_UDMA_33 | VIA_BAD_CLK66 }, 90 { "vt82c686", PCI_DEVICE_ID_VIA_82C686, 0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 },
95 { "vt82c596b", PCI_DEVICE_ID_VIA_82C596, 0x10, 0x2f, VIA_UDMA_66 }, 91 { "vt82c596b", PCI_DEVICE_ID_VIA_82C596, 0x10, 0x2f, ATA_UDMA4, },
96 { "vt82c596a", PCI_DEVICE_ID_VIA_82C596, 0x00, 0x0f, VIA_UDMA_33 | VIA_BAD_CLK66 }, 92 { "vt82c596a", PCI_DEVICE_ID_VIA_82C596, 0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 },
97 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, VIA_UDMA_33 | VIA_SET_FIFO }, 93 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, ATA_UDMA2, VIA_SET_FIFO },
98 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, VIA_UDMA_33 | VIA_SET_FIFO | VIA_BAD_PREQ }, 94 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, ATA_UDMA2, VIA_SET_FIFO | VIA_BAD_PREQ },
99 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, VIA_UDMA_33 | VIA_SET_FIFO }, 95 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, ATA_UDMA2, VIA_SET_FIFO },
100 { "vt82c586a", PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, VIA_UDMA_33 | VIA_SET_FIFO }, 96 { "vt82c586a", PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, ATA_UDMA2, VIA_SET_FIFO },
101 { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, VIA_UDMA_NONE | VIA_SET_FIFO }, 97 { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, 0x00, VIA_SET_FIFO },
102 { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK }, 98 { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, 0x00, VIA_SET_FIFO | VIA_NO_UNMASK },
103 { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID }, 99 { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, 0x00, VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID },
104 { NULL } 100 { NULL }
105}; 101};
106 102
107static unsigned int via_clock; 103static unsigned int via_clock;
108static char *via_dma[] = { "MWDMA16", "UDMA33", "UDMA66", "UDMA100", "UDMA133" }; 104static char *via_dma[] = { "16", "25", "33", "44", "66", "100", "133" };
109 105
110struct via82cxxx_dev 106struct via82cxxx_dev
111{ 107{
@@ -140,12 +136,12 @@ static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing)
140 pci_write_config_byte(dev, VIA_DRIVE_TIMING + (3 - dn), 136 pci_write_config_byte(dev, VIA_DRIVE_TIMING + (3 - dn),
141 ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1)); 137 ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1));
142 138
143 switch (vdev->via_config->flags & VIA_UDMA) { 139 switch (vdev->via_config->udma_mask) {
144 case VIA_UDMA_33: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; 140 case ATA_UDMA2: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break;
145 case VIA_UDMA_66: t = timing->udma ? (0xe8 | (FIT(timing->udma, 2, 9) - 2)) : 0x0f; break; 141 case ATA_UDMA4: t = timing->udma ? (0xe8 | (FIT(timing->udma, 2, 9) - 2)) : 0x0f; break;
146 case VIA_UDMA_100: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; 142 case ATA_UDMA5: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break;
147 case VIA_UDMA_133: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; 143 case ATA_UDMA6: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break;
148 default: return; 144 default: return;
149 } 145 }
150 146
151 pci_write_config_byte(dev, VIA_UDMA_TIMING + (3 - dn), t); 147 pci_write_config_byte(dev, VIA_UDMA_TIMING + (3 - dn), t);
@@ -173,12 +169,12 @@ static int via_set_drive(ide_drive_t *drive, u8 speed)
173 169
174 T = 1000000000 / via_clock; 170 T = 1000000000 / via_clock;
175 171
176 switch (vdev->via_config->flags & VIA_UDMA) { 172 switch (vdev->via_config->udma_mask) {
177 case VIA_UDMA_33: UT = T; break; 173 case ATA_UDMA2: UT = T; break;
178 case VIA_UDMA_66: UT = T/2; break; 174 case ATA_UDMA4: UT = T/2; break;
179 case VIA_UDMA_100: UT = T/3; break; 175 case ATA_UDMA5: UT = T/3; break;
180 case VIA_UDMA_133: UT = T/4; break; 176 case ATA_UDMA6: UT = T/4; break;
181 default: UT = T; 177 default: UT = T;
182 } 178 }
183 179
184 ide_timing_compute(drive, speed, &t, T, UT); 180 ide_timing_compute(drive, speed, &t, T, UT);
@@ -208,8 +204,7 @@ static int via_set_drive(ide_drive_t *drive, u8 speed)
208static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio) 204static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio)
209{ 205{
210 if (pio == 255) { 206 if (pio == 255) {
211 via_set_drive(drive, 207 via_set_drive(drive, ide_find_best_pio_mode(drive));
212 ide_find_best_mode(drive, XFER_PIO | XFER_EPIO));
213 return; 208 return;
214 } 209 }
215 210
@@ -226,16 +221,10 @@ static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio)
226 221
227static int via82cxxx_ide_dma_check (ide_drive_t *drive) 222static int via82cxxx_ide_dma_check (ide_drive_t *drive)
228{ 223{
229 ide_hwif_t *hwif = HWIF(drive); 224 u8 speed = ide_max_dma_mode(drive);
230 struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
231 u16 w80 = hwif->udma_four;
232 225
233 u16 speed = ide_find_best_mode(drive, 226 if (speed == 0)
234 XFER_PIO | XFER_EPIO | XFER_SWDMA | XFER_MWDMA | 227 speed = ide_find_best_pio_mode(drive);
235 (vdev->via_config->flags & VIA_UDMA ? XFER_UDMA : 0) |
236 (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_66 ? XFER_UDMA_66 : 0) |
237 (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_100 ? XFER_UDMA_100 : 0) |
238 (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_133 ? XFER_UDMA_133 : 0));
239 228
240 via_set_drive(drive, speed); 229 via_set_drive(drive, speed);
241 230
@@ -272,8 +261,8 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
272{ 261{
273 int i; 262 int i;
274 263
275 switch (vdev->via_config->flags & VIA_UDMA) { 264 switch (vdev->via_config->udma_mask) {
276 case VIA_UDMA_66: 265 case ATA_UDMA4:
277 for (i = 24; i >= 0; i -= 8) 266 for (i = 24; i >= 0; i -= 8)
278 if (((u >> (i & 16)) & 8) && 267 if (((u >> (i & 16)) & 8) &&
279 ((u >> i) & 0x20) && 268 ((u >> i) & 0x20) &&
@@ -286,7 +275,7 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
286 } 275 }
287 break; 276 break;
288 277
289 case VIA_UDMA_100: 278 case ATA_UDMA5:
290 for (i = 24; i >= 0; i -= 8) 279 for (i = 24; i >= 0; i -= 8)
291 if (((u >> i) & 0x10) || 280 if (((u >> i) & 0x10) ||
292 (((u >> i) & 0x20) && 281 (((u >> i) & 0x20) &&
@@ -298,7 +287,7 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
298 } 287 }
299 break; 288 break;
300 289
301 case VIA_UDMA_133: 290 case ATA_UDMA6:
302 for (i = 24; i >= 0; i -= 8) 291 for (i = 24; i >= 0; i -= 8)
303 if (((u >> i) & 0x10) || 292 if (((u >> i) & 0x10) ||
304 (((u >> i) & 0x20) && 293 (((u >> i) & 0x20) &&
@@ -353,7 +342,7 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const
353 342
354 via_cable_detect(vdev, u); 343 via_cable_detect(vdev, u);
355 344
356 if ((via_config->flags & VIA_UDMA) == VIA_UDMA_66) { 345 if (via_config->udma_mask == ATA_UDMA4) {
357 /* Enable Clk66 */ 346 /* Enable Clk66 */
358 pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008); 347 pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008);
359 } else if (via_config->flags & VIA_BAD_CLK66) { 348 } else if (via_config->flags & VIA_BAD_CLK66) {
@@ -416,16 +405,54 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const
416 */ 405 */
417 406
418 pci_read_config_byte(isa, PCI_REVISION_ID, &t); 407 pci_read_config_byte(isa, PCI_REVISION_ID, &t);
419 printk(KERN_INFO "VP_IDE: VIA %s (rev %02x) IDE %s " 408 printk(KERN_INFO "VP_IDE: VIA %s (rev %02x) IDE %sDMA%s "
420 "controller on pci%s\n", 409 "controller on pci%s\n",
421 via_config->name, t, 410 via_config->name, t,
422 via_dma[via_config->flags & VIA_UDMA], 411 via_config->udma_mask ? "U" : "MW",
412 via_dma[via_config->udma_mask ?
413 (fls(via_config->udma_mask) - 1) : 0],
423 pci_name(dev)); 414 pci_name(dev));
424 415
425 pci_dev_put(isa); 416 pci_dev_put(isa);
426 return 0; 417 return 0;
427} 418}
428 419
420/*
421 * Cable special cases
422 */
423
424static struct dmi_system_id cable_dmi_table[] = {
425 {
426 .ident = "Acer Ferrari 3400",
427 .matches = {
428 DMI_MATCH(DMI_BOARD_VENDOR, "Acer,Inc."),
429 DMI_MATCH(DMI_BOARD_NAME, "Ferrari 3400"),
430 },
431 },
432 { }
433};
434
435static int via_cable_override(void)
436{
437 /* Systems by DMI */
438 if (dmi_check_system(cable_dmi_table))
439 return 1;
440 return 0;
441}
442
443static u8 __devinit via82cxxx_cable_detect(ide_hwif_t *hwif)
444{
445 struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
446
447 if (via_cable_override())
448 return ATA_CBL_PATA40_SHORT;
449
450 if ((vdev->via_80w >> hwif->channel) & 1)
451 return ATA_CBL_PATA80;
452 else
453 return ATA_CBL_PATA40;
454}
455
429static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif) 456static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif)
430{ 457{
431 struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); 458 struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
@@ -454,12 +481,14 @@ static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif)
454 return; 481 return;
455 482
456 hwif->atapi_dma = 1; 483 hwif->atapi_dma = 1;
457 hwif->ultra_mask = 0x7f; 484
485 hwif->ultra_mask = vdev->via_config->udma_mask;
458 hwif->mwdma_mask = 0x07; 486 hwif->mwdma_mask = 0x07;
459 hwif->swdma_mask = 0x07; 487 hwif->swdma_mask = 0x07;
460 488
461 if (!hwif->udma_four) 489 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
462 hwif->udma_four = (vdev->via_80w >> hwif->channel) & 1; 490 hwif->cbl = via82cxxx_cable_detect(hwif);
491
463 hwif->ide_dma_check = &via82cxxx_ide_dma_check; 492 hwif->ide_dma_check = &via82cxxx_ide_dma_check;
464 if (!noautodma) 493 if (!noautodma)
465 hwif->autodma = 1; 494 hwif->autodma = 1;
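
In via82cxxx the old VIA_UDMA_33/66/100/133 level encoding in the flags word is replaced by a real udma_mask bitmask (ATA_UDMA2/4/5/6 expand to 0x07, 0x1f, 0x3f and 0x7f), which can be copied straight into hwif->ultra_mask and also drives the banner printk via fls(). A small sketch of that mask-to-label step, reusing the via_dma[] string table from the hunk (the helper name is invented):

/* Sketch: turn a UDMA bitmask into the highest-mode label for the banner. */
static const char *via_dma_label(u8 udma_mask)
{
	static const char *via_dma[] =
		{ "16", "25", "33", "44", "66", "100", "133" };

	/* fls() returns the highest set bit, 1-based; 0 means MWDMA16 only. */
	return via_dma[udma_mask ? fls(udma_mask) - 1 : 0];
}

The new cable_dmi_table/via_cable_override() pair supplies the ATA_CBL_PATA40_SHORT case for the Acer Ferrari 3400, mirroring the PCI-subsystem quirk tables added to piix and sis5513 above.
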
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index 45fc36f0f219..e46f47206542 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -942,8 +942,8 @@ pmac_ide_tune_chipset (ide_drive_t *drive, byte speed)
942 return 1; 942 return 1;
943 case XFER_UDMA_4: 943 case XFER_UDMA_4:
944 case XFER_UDMA_3: 944 case XFER_UDMA_3:
945 if (HWIF(drive)->udma_four == 0) 945 if (drive->hwif->cbl != ATA_CBL_PATA80)
946 return 1; 946 return 1;
947 case XFER_UDMA_2: 947 case XFER_UDMA_2:
948 case XFER_UDMA_1: 948 case XFER_UDMA_1:
949 case XFER_UDMA_0: 949 case XFER_UDMA_0:
@@ -1244,7 +1244,7 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
1244 hwif->chipset = ide_pmac; 1244 hwif->chipset = ide_pmac;
1245 hwif->noprobe = !hwif->io_ports[IDE_DATA_OFFSET] || pmif->mediabay; 1245 hwif->noprobe = !hwif->io_ports[IDE_DATA_OFFSET] || pmif->mediabay;
1246 hwif->hold = pmif->mediabay; 1246 hwif->hold = pmif->mediabay;
1247 hwif->udma_four = pmif->cable_80; 1247 hwif->cbl = pmif->cable_80 ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
1248 hwif->drives[0].unmask = 1; 1248 hwif->drives[0].unmask = 1;
1249 hwif->drives[1].unmask = 1; 1249 hwif->drives[1].unmask = 1;
1250 hwif->tuneproc = pmac_ide_tuneproc; 1250 hwif->tuneproc = pmac_ide_tuneproc;
@@ -1821,28 +1821,11 @@ pmac_ide_dma_check(ide_drive_t *drive)
1821 enable = 0; 1821 enable = 0;
1822 1822
1823 if (enable) { 1823 if (enable) {
1824 short mode; 1824 u8 mode = ide_max_dma_mode(drive);
1825 1825
1826 map = XFER_MWDMA; 1826 if (mode >= XFER_UDMA_0)
1827 if (pmif->kind == controller_kl_ata4
1828 || pmif->kind == controller_un_ata6
1829 || pmif->kind == controller_k2_ata6
1830 || pmif->kind == controller_sh_ata6) {
1831 map |= XFER_UDMA;
1832 if (pmif->cable_80) {
1833 map |= XFER_UDMA_66;
1834 if (pmif->kind == controller_un_ata6 ||
1835 pmif->kind == controller_k2_ata6 ||
1836 pmif->kind == controller_sh_ata6)
1837 map |= XFER_UDMA_100;
1838 if (pmif->kind == controller_sh_ata6)
1839 map |= XFER_UDMA_133;
1840 }
1841 }
1842 mode = ide_find_best_mode(drive, map);
1843 if (mode & XFER_UDMA)
1844 drive->using_dma = pmac_ide_udma_enable(drive, mode); 1827 drive->using_dma = pmac_ide_udma_enable(drive, mode);
1845 else if (mode & XFER_MWDMA) 1828 else if (mode >= XFER_MW_DMA_0)
1846 drive->using_dma = pmac_ide_mdma_enable(drive, mode); 1829 drive->using_dma = pmac_ide_mdma_enable(drive, mode);
1847 hwif->OUTB(0, IDE_CONTROL_REG); 1830 hwif->OUTB(0, IDE_CONTROL_REG);
1848 /* Apply settings to controller */ 1831 /* Apply settings to controller */
@@ -2004,20 +1987,19 @@ static void pmac_ide_dma_host_on(ide_drive_t *drive)
2004{ 1987{
2005} 1988}
2006 1989
2007static int 1990static void
2008pmac_ide_dma_lostirq (ide_drive_t *drive) 1991pmac_ide_dma_lost_irq (ide_drive_t *drive)
2009{ 1992{
2010 pmac_ide_hwif_t* pmif = (pmac_ide_hwif_t *)HWIF(drive)->hwif_data; 1993 pmac_ide_hwif_t* pmif = (pmac_ide_hwif_t *)HWIF(drive)->hwif_data;
2011 volatile struct dbdma_regs __iomem *dma; 1994 volatile struct dbdma_regs __iomem *dma;
2012 unsigned long status; 1995 unsigned long status;
2013 1996
2014 if (pmif == NULL) 1997 if (pmif == NULL)
2015 return 0; 1998 return;
2016 dma = pmif->dma_regs; 1999 dma = pmif->dma_regs;
2017 2000
2018 status = readl(&dma->status); 2001 status = readl(&dma->status);
2019 printk(KERN_ERR "ide-pmac lost interrupt, dma status: %lx\n", status); 2002 printk(KERN_ERR "ide-pmac lost interrupt, dma status: %lx\n", status);
2020 return 0;
2021} 2003}
2022 2004
2023/* 2005/*
@@ -2057,8 +2039,8 @@ pmac_ide_setup_dma(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
2057 hwif->ide_dma_test_irq = &pmac_ide_dma_test_irq; 2039 hwif->ide_dma_test_irq = &pmac_ide_dma_test_irq;
2058 hwif->dma_host_off = &pmac_ide_dma_host_off; 2040 hwif->dma_host_off = &pmac_ide_dma_host_off;
2059 hwif->dma_host_on = &pmac_ide_dma_host_on; 2041 hwif->dma_host_on = &pmac_ide_dma_host_on;
2060 hwif->ide_dma_timeout = &__ide_dma_timeout; 2042 hwif->dma_timeout = &ide_dma_timeout;
2061 hwif->ide_dma_lostirq = &pmac_ide_dma_lostirq; 2043 hwif->dma_lost_irq = &pmac_ide_dma_lost_irq;
2062 2044
2063 hwif->atapi_dma = 1; 2045 hwif->atapi_dma = 1;
2064 switch(pmif->kind) { 2046 switch(pmif->kind) {
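
pmac_ide_dma_check() no longer builds an XFER_* capability map by hand; ide_max_dma_mode() already takes the host's DMA masks (and, for UDMA, the detected cable type) into account and returns the best transfer mode, or 0 when DMA is not usable. The selection then reduces to two range checks on the returned mode. A sketch of that pattern, with hypothetical enable helpers standing in for the pmac-specific ones:

/* Sketch: pick UDMA vs MWDMA vs nothing from ide_max_dma_mode(). */
static int mychip_dma_check(ide_drive_t *drive)
{
	u8 mode = ide_max_dma_mode(drive);	/* 0 if no DMA mode usable */

	if (mode >= XFER_UDMA_0)
		return mychip_udma_enable(drive, mode);	/* hypothetical */
	if (mode >= XFER_MW_DMA_0)
		return mychip_mdma_enable(drive, mode);	/* hypothetical */
	return 0;					/* stay on PIO */
}
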
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
index 79494c4f2b10..fa92f7f1d0d0 100644
--- a/fs/jfs/endian24.h
+++ b/fs/jfs/endian24.h
@@ -29,7 +29,7 @@
29 __u32 __x = (x); \ 29 __u32 __x = (x); \
30 ((__u32)( \ 30 ((__u32)( \
31 ((__x & (__u32)0x000000ffUL) << 16) | \ 31 ((__x & (__u32)0x000000ffUL) << 16) | \
32 (__x & (__u32)0x0000ff00UL) | \ 32 (__x & (__u32)0x0000ff00UL) | \
33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \ 33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \
34}) 34})
35 35
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 9c5d59632aac..887f5759e536 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -26,34 +26,6 @@
26#include "jfs_filsys.h" 26#include "jfs_filsys.h"
27#include "jfs_debug.h" 27#include "jfs_debug.h"
28 28
29#ifdef CONFIG_JFS_DEBUG
30void dump_mem(char *label, void *data, int length)
31{
32 int i, j;
33 int *intptr = data;
34 char *charptr = data;
35 char buf[10], line[80];
36
37 printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length,
38 data);
39 for (i = 0; i < length; i += 16) {
40 line[0] = 0;
41 for (j = 0; (j < 4) && (i + j * 4 < length); j++) {
42 sprintf(buf, " %08x", intptr[i / 4 + j]);
43 strcat(line, buf);
44 }
45 buf[0] = ' ';
46 buf[2] = 0;
47 for (j = 0; (j < 16) && (i + j < length); j++) {
48 buf[1] =
49 isprint(charptr[i + j]) ? charptr[i + j] : '.';
50 strcat(line, buf);
51 }
52 printk("%s\n", line);
53 }
54}
55#endif
56
57#ifdef PROC_FS_JFS /* see jfs_debug.h */ 29#ifdef PROC_FS_JFS /* see jfs_debug.h */
58 30
59static struct proc_dir_entry *base; 31static struct proc_dir_entry *base;
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 7378798f0b21..044c1e654cc0 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,6 @@ extern void jfs_proc_clean(void);
62 62
63extern int jfsloglevel; 63extern int jfsloglevel;
64 64
65extern void dump_mem(char *label, void *data, int length);
66extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); 65extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
67 66
68/* information message: e.g., configuration, major event */ 67/* information message: e.g., configuration, major event */
@@ -94,7 +93,6 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
94 * --------- 93 * ---------
95 */ 94 */
96#else /* CONFIG_JFS_DEBUG */ 95#else /* CONFIG_JFS_DEBUG */
97#define dump_mem(label,data,length) do {} while (0)
98#define ASSERT(p) do {} while (0) 96#define ASSERT(p) do {} while (0)
99#define jfs_info(fmt, arg...) do {} while (0) 97#define jfs_info(fmt, arg...) do {} while (0)
100#define jfs_debug(fmt, arg...) do {} while (0) 98#define jfs_debug(fmt, arg...) do {} while (0)
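
The private dump_mem() hex dumper and its header stub are dropped from JFS. The kernel's generic print_hex_dump() helper (lib/hexdump.c, assumed available in a tree of this vintage) covers the same need, so any remaining debugging call site could be expressed along these lines; this is a sketch, not code from the patch:

#include <linux/kernel.h>

/* Sketch: dump 'length' bytes at 'data', much like the removed dump_mem(). */
static inline void jfs_dump(const char *label, const void *data, int length)
{
	printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n",
	       label, length, data);
	/* 16 bytes per row, grouped as 4-byte words, with ASCII column */
	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16, 4,
		       data, length, true);
}
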
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 40b20111383c..c387540d3425 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -19,23 +19,23 @@
19#define _H_JFS_DINODE 19#define _H_JFS_DINODE
20 20
21/* 21/*
22 * jfs_dinode.h: on-disk inode manager 22 * jfs_dinode.h: on-disk inode manager
23 */ 23 */
24 24
25#define INODESLOTSIZE 128 25#define INODESLOTSIZE 128
26#define L2INODESLOTSIZE 7 26#define L2INODESLOTSIZE 7
27#define log2INODESIZE 9 /* log2(bytes per dinode) */ 27#define log2INODESIZE 9 /* log2(bytes per dinode) */
28 28
29 29
30/* 30/*
31 * on-disk inode : 512 bytes 31 * on-disk inode : 512 bytes
32 * 32 *
33 * note: align 64-bit fields on 8-byte boundary. 33 * note: align 64-bit fields on 8-byte boundary.
34 */ 34 */
35struct dinode { 35struct dinode {
36 /* 36 /*
37 * I. base area (128 bytes) 37 * I. base area (128 bytes)
38 * ------------------------ 38 * ------------------------
39 * 39 *
40 * define generic/POSIX attributes 40 * define generic/POSIX attributes
41 */ 41 */
@@ -70,16 +70,16 @@ struct dinode {
70 __le32 di_acltype; /* 4: Type of ACL */ 70 __le32 di_acltype; /* 4: Type of ACL */
71 71
72 /* 72 /*
73 * Extension Areas. 73 * Extension Areas.
74 * 74 *
75 * Historically, the inode was partitioned into 4 128-byte areas, 75 * Historically, the inode was partitioned into 4 128-byte areas,
76 * the last 3 being defined as unions which could have multiple 76 * the last 3 being defined as unions which could have multiple
77 * uses. The first 96 bytes had been completely unused until 77 * uses. The first 96 bytes had been completely unused until
78 * an index table was added to the directory. It is now more 78 * an index table was added to the directory. It is now more
79 * useful to describe the last 3/4 of the inode as a single 79 * useful to describe the last 3/4 of the inode as a single
80 * union. We would probably be better off redesigning the 80 * union. We would probably be better off redesigning the
81 * entire structure from scratch, but we don't want to break 81 * entire structure from scratch, but we don't want to break
82 * commonality with OS/2's JFS at this time. 82 * commonality with OS/2's JFS at this time.
83 */ 83 */
84 union { 84 union {
85 struct { 85 struct {
@@ -95,7 +95,7 @@ struct dinode {
95 } _dir; /* (384) */ 95 } _dir; /* (384) */
96#define di_dirtable u._dir._table 96#define di_dirtable u._dir._table
97#define di_dtroot u._dir._dtroot 97#define di_dtroot u._dir._dtroot
98#define di_parent di_dtroot.header.idotdot 98#define di_parent di_dtroot.header.idotdot
99#define di_DASD di_dtroot.header.DASD 99#define di_DASD di_dtroot.header.DASD
100 100
101 struct { 101 struct {
@@ -127,14 +127,14 @@ struct dinode {
127#define di_inlinedata u._file._u2._special._u 127#define di_inlinedata u._file._u2._special._u
128#define di_rdev u._file._u2._special._u._rdev 128#define di_rdev u._file._u2._special._u._rdev
129#define di_fastsymlink u._file._u2._special._u._fastsymlink 129#define di_fastsymlink u._file._u2._special._u._fastsymlink
130#define di_inlineea u._file._u2._special._inlineea 130#define di_inlineea u._file._u2._special._inlineea
131 } u; 131 } u;
132}; 132};
133 133
134/* extended mode bits (on-disk inode di_mode) */ 134/* extended mode bits (on-disk inode di_mode) */
135#define IFJOURNAL 0x00010000 /* journalled file */ 135#define IFJOURNAL 0x00010000 /* journalled file */
136#define ISPARSE 0x00020000 /* sparse file enabled */ 136#define ISPARSE 0x00020000 /* sparse file enabled */
137#define INLINEEA 0x00040000 /* inline EA area free */ 137#define INLINEEA 0x00040000 /* inline EA area free */
138#define ISWAPFILE 0x00800000 /* file open for pager swap space */ 138#define ISWAPFILE 0x00800000 /* file open for pager swap space */
139 139
140/* more extended mode bits: attributes for OS/2 */ 140/* more extended mode bits: attributes for OS/2 */
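
Since the extended bits above start at 0x00010000, they sit above the low sixteen bits of di_mode and compose with ordinary bitwise tests; a small standalone illustration (the 0100644 POSIX mode is an assumed example value, and only flag values quoted in this hunk are used):

#include <stdio.h>
#include <stdint.h>

/* Illustration only: combine and test the extended JFS mode bits
 * listed above.  The low bits carry an ordinary POSIX mode here.
 */
#define IFJOURNAL       0x00010000      /* journalled file */
#define ISPARSE         0x00020000      /* sparse file enabled */
#define ISWAPFILE       0x00800000      /* file open for pager swap space */

int main(void)
{
        uint32_t di_mode = 0100644 | IFJOURNAL | ISPARSE;

        printf("sparse=%d swap=%d\n",
               (di_mode & ISPARSE) != 0,        /* 1 */
               (di_mode & ISWAPFILE) != 0);     /* 0 */
        return 0;
}
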
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index f3b1ebb22280..e1985066b1c6 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -154,12 +154,12 @@ static const s8 budtab[256] = {
154 * the in-core descriptor is initialized from disk. 154 * the in-core descriptor is initialized from disk.
155 * 155 *
156 * PARAMETERS: 156 * PARAMETERS:
157 * ipbmap - pointer to in-core inode for the block map. 157 * ipbmap - pointer to in-core inode for the block map.
158 * 158 *
159 * RETURN VALUES: 159 * RETURN VALUES:
160 * 0 - success 160 * 0 - success
161 * -ENOMEM - insufficient memory 161 * -ENOMEM - insufficient memory
162 * -EIO - i/o error 162 * -EIO - i/o error
163 */ 163 */
164int dbMount(struct inode *ipbmap) 164int dbMount(struct inode *ipbmap)
165{ 165{
@@ -232,11 +232,11 @@ int dbMount(struct inode *ipbmap)
232 * the memory for this descriptor is freed. 232 * the memory for this descriptor is freed.
233 * 233 *
234 * PARAMETERS: 234 * PARAMETERS:
235 * ipbmap - pointer to in-core inode for the block map. 235 * ipbmap - pointer to in-core inode for the block map.
236 * 236 *
237 * RETURN VALUES: 237 * RETURN VALUES:
238 * 0 - success 238 * 0 - success
239 * -EIO - i/o error 239 * -EIO - i/o error
240 */ 240 */
241int dbUnmount(struct inode *ipbmap, int mounterror) 241int dbUnmount(struct inode *ipbmap, int mounterror)
242{ 242{
@@ -320,13 +320,13 @@ int dbSync(struct inode *ipbmap)
320 * at a time. 320 * at a time.
321 * 321 *
322 * PARAMETERS: 322 * PARAMETERS:
323 * ip - pointer to in-core inode; 323 * ip - pointer to in-core inode;
324 * blkno - starting block number to be freed. 324 * blkno - starting block number to be freed.
325 * nblocks - number of blocks to be freed. 325 * nblocks - number of blocks to be freed.
326 * 326 *
327 * RETURN VALUES: 327 * RETURN VALUES:
328 * 0 - success 328 * 0 - success
329 * -EIO - i/o error 329 * -EIO - i/o error
330 */ 330 */
331int dbFree(struct inode *ip, s64 blkno, s64 nblocks) 331int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
332{ 332{
@@ -395,23 +395,23 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
395/* 395/*
396 * NAME: dbUpdatePMap() 396 * NAME: dbUpdatePMap()
397 * 397 *
398 * FUNCTION: update the allocation state (free or allocate) of the 398 * FUNCTION: update the allocation state (free or allocate) of the
399 * specified block range in the persistent block allocation map. 399 * specified block range in the persistent block allocation map.
400 * 400 *
401 * the blocks will be updated in the persistent map one 401 * the blocks will be updated in the persistent map one
402 * dmap at a time. 402 * dmap at a time.
403 * 403 *
404 * PARAMETERS: 404 * PARAMETERS:
405 * ipbmap - pointer to in-core inode for the block map. 405 * ipbmap - pointer to in-core inode for the block map.
406 * free - 'true' if block range is to be freed from the persistent 406 * free - 'true' if block range is to be freed from the persistent
407 * map; 'false' if it is to be allocated. 407 * map; 'false' if it is to be allocated.
408 * blkno - starting block number of the range. 408 * blkno - starting block number of the range.
409 * nblocks - number of contiguous blocks in the range. 409 * nblocks - number of contiguous blocks in the range.
410 * tblk - transaction block; 410 * tblk - transaction block;
411 * 411 *
412 * RETURN VALUES: 412 * RETURN VALUES:
413 * 0 - success 413 * 0 - success
414 * -EIO - i/o error 414 * -EIO - i/o error
415 */ 415 */
416int 416int
417dbUpdatePMap(struct inode *ipbmap, 417dbUpdatePMap(struct inode *ipbmap,
@@ -573,7 +573,7 @@ dbUpdatePMap(struct inode *ipbmap,
573/* 573/*
574 * NAME: dbNextAG() 574 * NAME: dbNextAG()
575 * 575 *
576 * FUNCTION: find the preferred allocation group for new allocations. 576 * FUNCTION: find the preferred allocation group for new allocations.
577 * 577 *
578 * Within the allocation groups, we maintain a preferred 578 * Within the allocation groups, we maintain a preferred
579 * allocation group which consists of a group with at least 579 * allocation group which consists of a group with at least
@@ -589,10 +589,10 @@ dbUpdatePMap(struct inode *ipbmap,
589 * empty ags around for large allocations. 589 * empty ags around for large allocations.
590 * 590 *
591 * PARAMETERS: 591 * PARAMETERS:
592 * ipbmap - pointer to in-core inode for the block map. 592 * ipbmap - pointer to in-core inode for the block map.
593 * 593 *
594 * RETURN VALUES: 594 * RETURN VALUES:
595 * the preferred allocation group number. 595 * the preferred allocation group number.
596 */ 596 */
597int dbNextAG(struct inode *ipbmap) 597int dbNextAG(struct inode *ipbmap)
598{ 598{
@@ -656,7 +656,7 @@ unlock:
656/* 656/*
657 * NAME: dbAlloc() 657 * NAME: dbAlloc()
658 * 658 *
659 * FUNCTION: attempt to allocate a specified number of contiguous free 659 * FUNCTION: attempt to allocate a specified number of contiguous free
660 * blocks from the working allocation block map. 660 * blocks from the working allocation block map.
661 * 661 *
662 * the block allocation policy uses hints and a multi-step 662 * the block allocation policy uses hints and a multi-step
@@ -680,16 +680,16 @@ unlock:
680 * size or requests that specify no hint value. 680 * size or requests that specify no hint value.
681 * 681 *
682 * PARAMETERS: 682 * PARAMETERS:
683 * ip - pointer to in-core inode; 683 * ip - pointer to in-core inode;
684 * hint - allocation hint. 684 * hint - allocation hint.
685 * nblocks - number of contiguous blocks in the range. 685 * nblocks - number of contiguous blocks in the range.
686 * results - on successful return, set to the starting block number 686 * results - on successful return, set to the starting block number
687 * of the newly allocated contiguous range. 687 * of the newly allocated contiguous range.
688 * 688 *
689 * RETURN VALUES: 689 * RETURN VALUES:
690 * 0 - success 690 * 0 - success
691 * -ENOSPC - insufficient disk resources 691 * -ENOSPC - insufficient disk resources
692 * -EIO - i/o error 692 * -EIO - i/o error
693 */ 693 */
694int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) 694int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
695{ 695{
@@ -706,12 +706,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
706 /* assert that nblocks is valid */ 706 /* assert that nblocks is valid */
707 assert(nblocks > 0); 707 assert(nblocks > 0);
708 708
709#ifdef _STILL_TO_PORT
710 /* DASD limit check F226941 */
711 if (OVER_LIMIT(ip, nblocks))
712 return -ENOSPC;
713#endif /* _STILL_TO_PORT */
714
715 /* get the log2 number of blocks to be allocated. 709 /* get the log2 number of blocks to be allocated.
716 * if the number of blocks is not a log2 multiple, 710 * if the number of blocks is not a log2 multiple,
717 * it will be rounded up to the next log2 multiple. 711 * it will be rounded up to the next log2 multiple.
@@ -720,7 +714,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
720 714
721 bmp = JFS_SBI(ip->i_sb)->bmap; 715 bmp = JFS_SBI(ip->i_sb)->bmap;
722 716
723//retry: /* serialize w.r.t.extendfs() */
724 mapSize = bmp->db_mapsize; 717 mapSize = bmp->db_mapsize;
725 718
726 /* the hint should be within the map */ 719 /* the hint should be within the map */
@@ -879,17 +872,17 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
879/* 872/*
880 * NAME: dbAllocExact() 873 * NAME: dbAllocExact()
881 * 874 *
882 * FUNCTION: try to allocate the requested extent; 875 * FUNCTION: try to allocate the requested extent;
883 * 876 *
884 * PARAMETERS: 877 * PARAMETERS:
885 * ip - pointer to in-core inode; 878 * ip - pointer to in-core inode;
886 * blkno - extent address; 879 * blkno - extent address;
887 * nblocks - extent length; 880 * nblocks - extent length;
888 * 881 *
889 * RETURN VALUES: 882 * RETURN VALUES:
890 * 0 - success 883 * 0 - success
891 * -ENOSPC - insufficient disk resources 884 * -ENOSPC - insufficient disk resources
892 * -EIO - i/o error 885 * -EIO - i/o error
893 */ 886 */
894int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) 887int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
895{ 888{
@@ -946,7 +939,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
946/* 939/*
947 * NAME: dbReAlloc() 940 * NAME: dbReAlloc()
948 * 941 *
949 * FUNCTION: attempt to extend a current allocation by a specified 942 * FUNCTION: attempt to extend a current allocation by a specified
950 * number of blocks. 943 * number of blocks.
951 * 944 *
952 * this routine attempts to satisfy the allocation request 945 * this routine attempts to satisfy the allocation request
@@ -959,21 +952,21 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
959 * number of blocks required. 952 * number of blocks required.
960 * 953 *
961 * PARAMETERS: 954 * PARAMETERS:
962 * ip - pointer to in-core inode requiring allocation. 955 * ip - pointer to in-core inode requiring allocation.
963 * blkno - starting block of the current allocation. 956 * blkno - starting block of the current allocation.
964 * nblocks - number of contiguous blocks within the current 957 * nblocks - number of contiguous blocks within the current
965 * allocation. 958 * allocation.
966 * addnblocks - number of blocks to add to the allocation. 959 * addnblocks - number of blocks to add to the allocation.
967 * results - on successful return, set to the starting block number 960 * results - on successful return, set to the starting block number
968 * of the existing allocation if the existing allocation 961 * of the existing allocation if the existing allocation
969 * was extended in place or to a newly allocated contiguous 962 * was extended in place or to a newly allocated contiguous
970 * range if the existing allocation could not be extended 963 * range if the existing allocation could not be extended
971 * in place. 964 * in place.
972 * 965 *
973 * RETURN VALUES: 966 * RETURN VALUES:
974 * 0 - success 967 * 0 - success
975 * -ENOSPC - insufficient disk resources 968 * -ENOSPC - insufficient disk resources
976 * -EIO - i/o error 969 * -EIO - i/o error
977 */ 970 */
978int 971int
979dbReAlloc(struct inode *ip, 972dbReAlloc(struct inode *ip,
@@ -1004,7 +997,7 @@ dbReAlloc(struct inode *ip,
1004/* 997/*
1005 * NAME: dbExtend() 998 * NAME: dbExtend()
1006 * 999 *
1007 * FUNCTION: attempt to extend a current allocation by a specified 1000 * FUNCTION: attempt to extend a current allocation by a specified
1008 * number of blocks. 1001 * number of blocks.
1009 * 1002 *
1010 * this routine attempts to satisfy the allocation request 1003 * this routine attempts to satisfy the allocation request
@@ -1013,16 +1006,16 @@ dbReAlloc(struct inode *ip,
1013 * immediately following the current allocation. 1006 * immediately following the current allocation.
1014 * 1007 *
1015 * PARAMETERS: 1008 * PARAMETERS:
1016 * ip - pointer to in-core inode requiring allocation. 1009 * ip - pointer to in-core inode requiring allocation.
1017 * blkno - starting block of the current allocation. 1010 * blkno - starting block of the current allocation.
1018 * nblocks - number of contiguous blocks within the current 1011 * nblocks - number of contiguous blocks within the current
1019 * allocation. 1012 * allocation.
1020 * addnblocks - number of blocks to add to the allocation. 1013 * addnblocks - number of blocks to add to the allocation.
1021 * 1014 *
1022 * RETURN VALUES: 1015 * RETURN VALUES:
1023 * 0 - success 1016 * 0 - success
1024 * -ENOSPC - insufficient disk resources 1017 * -ENOSPC - insufficient disk resources
1025 * -EIO - i/o error 1018 * -EIO - i/o error
1026 */ 1019 */
1027static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) 1020static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1028{ 1021{
@@ -1109,19 +1102,19 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1109/* 1102/*
1110 * NAME: dbAllocNext() 1103 * NAME: dbAllocNext()
1111 * 1104 *
1112 * FUNCTION: attempt to allocate the blocks of the specified block 1105 * FUNCTION: attempt to allocate the blocks of the specified block
1113 * range within a dmap. 1106 * range within a dmap.
1114 * 1107 *
1115 * PARAMETERS: 1108 * PARAMETERS:
1116 * bmp - pointer to bmap descriptor 1109 * bmp - pointer to bmap descriptor
1117 * dp - pointer to dmap. 1110 * dp - pointer to dmap.
1118 * blkno - starting block number of the range. 1111 * blkno - starting block number of the range.
1119 * nblocks - number of contiguous free blocks of the range. 1112 * nblocks - number of contiguous free blocks of the range.
1120 * 1113 *
1121 * RETURN VALUES: 1114 * RETURN VALUES:
1122 * 0 - success 1115 * 0 - success
1123 * -ENOSPC - insufficient disk resources 1116 * -ENOSPC - insufficient disk resources
1124 * -EIO - i/o error 1117 * -EIO - i/o error
1125 * 1118 *
1126 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1119 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1127 */ 1120 */
@@ -1233,7 +1226,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1233/* 1226/*
1234 * NAME: dbAllocNear() 1227 * NAME: dbAllocNear()
1235 * 1228 *
1236 * FUNCTION: attempt to allocate a number of contiguous free blocks near 1229 * FUNCTION: attempt to allocate a number of contiguous free blocks near
1237 * a specified block (hint) within a dmap. 1230 * a specified block (hint) within a dmap.
1238 * 1231 *
1239 * starting with the dmap leaf that covers the hint, we'll 1232 * starting with the dmap leaf that covers the hint, we'll
@@ -1242,18 +1235,18 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1242 * the desired free space. 1235 * the desired free space.
1243 * 1236 *
1244 * PARAMETERS: 1237 * PARAMETERS:
1245 * bmp - pointer to bmap descriptor 1238 * bmp - pointer to bmap descriptor
1246 * dp - pointer to dmap. 1239 * dp - pointer to dmap.
1247 * blkno - block number to allocate near. 1240 * blkno - block number to allocate near.
1248 * nblocks - actual number of contiguous free blocks desired. 1241 * nblocks - actual number of contiguous free blocks desired.
1249 * l2nb - log2 number of contiguous free blocks desired. 1242 * l2nb - log2 number of contiguous free blocks desired.
1250 * results - on successful return, set to the starting block number 1243 * results - on successful return, set to the starting block number
1251 * of the newly allocated range. 1244 * of the newly allocated range.
1252 * 1245 *
1253 * RETURN VALUES: 1246 * RETURN VALUES:
1254 * 0 - success 1247 * 0 - success
1255 * -ENOSPC - insufficient disk resources 1248 * -ENOSPC - insufficient disk resources
1256 * -EIO - i/o error 1249 * -EIO - i/o error
1257 * 1250 *
1258 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1251 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1259 */ 1252 */
@@ -1316,7 +1309,7 @@ dbAllocNear(struct bmap * bmp,
1316/* 1309/*
1317 * NAME: dbAllocAG() 1310 * NAME: dbAllocAG()
1318 * 1311 *
1319 * FUNCTION: attempt to allocate the specified number of contiguous 1312 * FUNCTION: attempt to allocate the specified number of contiguous
1320 * free blocks within the specified allocation group. 1313 * free blocks within the specified allocation group.
1321 * 1314 *
1322 * unless the allocation group size is equal to the number 1315 * unless the allocation group size is equal to the number
@@ -1353,17 +1346,17 @@ dbAllocNear(struct bmap * bmp,
1353 * the allocation group. 1346 * the allocation group.
1354 * 1347 *
1355 * PARAMETERS: 1348 * PARAMETERS:
1356 * bmp - pointer to bmap descriptor 1349 * bmp - pointer to bmap descriptor
1357 * agno - allocation group number. 1350 * agno - allocation group number.
1358 * nblocks - actual number of contiguous free blocks desired. 1351 * nblocks - actual number of contiguous free blocks desired.
1359 * l2nb - log2 number of contiguous free blocks desired. 1352 * l2nb - log2 number of contiguous free blocks desired.
1360 * results - on successful return, set to the starting block number 1353 * results - on successful return, set to the starting block number
1361 * of the newly allocated range. 1354 * of the newly allocated range.
1362 * 1355 *
1363 * RETURN VALUES: 1356 * RETURN VALUES:
1364 * 0 - success 1357 * 0 - success
1365 * -ENOSPC - insufficient disk resources 1358 * -ENOSPC - insufficient disk resources
1366 * -EIO - i/o error 1359 * -EIO - i/o error
1367 * 1360 *
1368 * note: IWRITE_LOCK(ipmap) held on entry/exit; 1361 * note: IWRITE_LOCK(ipmap) held on entry/exit;
1369 */ 1362 */
@@ -1546,7 +1539,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1546/* 1539/*
1547 * NAME: dbAllocAny() 1540 * NAME: dbAllocAny()
1548 * 1541 *
1549 * FUNCTION: attempt to allocate the specified number of contiguous 1542 * FUNCTION: attempt to allocate the specified number of contiguous
1550 * free blocks anywhere in the file system. 1543 * free blocks anywhere in the file system.
1551 * 1544 *
1552 * dbAllocAny() attempts to find the sufficient free space by 1545 * dbAllocAny() attempts to find the sufficient free space by
@@ -1556,16 +1549,16 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1556 * desired free space is allocated. 1549 * desired free space is allocated.
1557 * 1550 *
1558 * PARAMETERS: 1551 * PARAMETERS:
1559 * bmp - pointer to bmap descriptor 1552 * bmp - pointer to bmap descriptor
1560 * nblocks - actual number of contiguous free blocks desired. 1553 * nblocks - actual number of contiguous free blocks desired.
1561 * l2nb - log2 number of contiguous free blocks desired. 1554 * l2nb - log2 number of contiguous free blocks desired.
1562 * results - on successful return, set to the starting block number 1555 * results - on successful return, set to the starting block number
1563 * of the newly allocated range. 1556 * of the newly allocated range.
1564 * 1557 *
1565 * RETURN VALUES: 1558 * RETURN VALUES:
1566 * 0 - success 1559 * 0 - success
1567 * -ENOSPC - insufficient disk resources 1560 * -ENOSPC - insufficient disk resources
1568 * -EIO - i/o error 1561 * -EIO - i/o error
1569 * 1562 *
1570 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1563 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1571 */ 1564 */
@@ -1598,9 +1591,9 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1598/* 1591/*
1599 * NAME: dbFindCtl() 1592 * NAME: dbFindCtl()
1600 * 1593 *
1601 * FUNCTION: starting at a specified dmap control page level and block 1594 * FUNCTION: starting at a specified dmap control page level and block
1602 * number, search down the dmap control levels for a range of 1595 * number, search down the dmap control levels for a range of
1603 * contiguous free blocks large enough to satisfy an allocation 1596 * contiguous free blocks large enough to satisfy an allocation
1604 * request for the specified number of free blocks. 1597 * request for the specified number of free blocks.
1605 * 1598 *
1606 * if sufficient contiguous free blocks are found, this routine 1599 * if sufficient contiguous free blocks are found, this routine
@@ -1609,17 +1602,17 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1609 * is sufficient in size. 1602 * is sufficient in size.
1610 * 1603 *
1611 * PARAMETERS: 1604 * PARAMETERS:
1612 * bmp - pointer to bmap descriptor 1605 * bmp - pointer to bmap descriptor
1613 * level - starting dmap control page level. 1606 * level - starting dmap control page level.
1614 * l2nb - log2 number of contiguous free blocks desired. 1607 * l2nb - log2 number of contiguous free blocks desired.
1615 * *blkno - on entry, starting block number for conducting the search. 1608 * *blkno - on entry, starting block number for conducting the search.
1616 * on successful return, the first block within a dmap page 1609 * on successful return, the first block within a dmap page
1617 * that contains or starts a range of contiguous free blocks. 1610 * that contains or starts a range of contiguous free blocks.
1618 * 1611 *
1619 * RETURN VALUES: 1612 * RETURN VALUES:
1620 * 0 - success 1613 * 0 - success
1621 * -ENOSPC - insufficient disk resources 1614 * -ENOSPC - insufficient disk resources
1622 * -EIO - i/o error 1615 * -EIO - i/o error
1623 * 1616 *
1624 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1617 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1625 */ 1618 */
@@ -1699,7 +1692,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1699/* 1692/*
1700 * NAME: dbAllocCtl() 1693 * NAME: dbAllocCtl()
1701 * 1694 *
1702 * FUNCTION: attempt to allocate a specified number of contiguous 1695 * FUNCTION: attempt to allocate a specified number of contiguous
1703 * blocks starting within a specific dmap. 1696 * blocks starting within a specific dmap.
1704 * 1697 *
1705 * this routine is called by higher level routines that search 1698 * this routine is called by higher level routines that search
@@ -1726,18 +1719,18 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1726 * first dmap (i.e. blkno). 1719 * first dmap (i.e. blkno).
1727 * 1720 *
1728 * PARAMETERS: 1721 * PARAMETERS:
1729 * bmp - pointer to bmap descriptor 1722 * bmp - pointer to bmap descriptor
1730 * nblocks - actual number of contiguous free blocks to allocate. 1723 * nblocks - actual number of contiguous free blocks to allocate.
1731 * l2nb - log2 number of contiguous free blocks to allocate. 1724 * l2nb - log2 number of contiguous free blocks to allocate.
1732 * blkno - starting block number of the dmap to start the allocation 1725 * blkno - starting block number of the dmap to start the allocation
1733 * from. 1726 * from.
1734 * results - on successful return, set to the starting block number 1727 * results - on successful return, set to the starting block number
1735 * of the newly allocated range. 1728 * of the newly allocated range.
1736 * 1729 *
1737 * RETURN VALUES: 1730 * RETURN VALUES:
1738 * 0 - success 1731 * 0 - success
1739 * -ENOSPC - insufficient disk resources 1732 * -ENOSPC - insufficient disk resources
1740 * -EIO - i/o error 1733 * -EIO - i/o error
1741 * 1734 *
1742 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1735 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1743 */ 1736 */
@@ -1870,7 +1863,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1870/* 1863/*
1871 * NAME: dbAllocDmapLev() 1864 * NAME: dbAllocDmapLev()
1872 * 1865 *
1873 * FUNCTION: attempt to allocate a specified number of contiguous blocks 1866 * FUNCTION: attempt to allocate a specified number of contiguous blocks
1874 * from a specified dmap. 1867 * from a specified dmap.
1875 * 1868 *
1876 * this routine checks if the contiguous blocks are available. 1869 * this routine checks if the contiguous blocks are available.
@@ -1878,17 +1871,17 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1878 * returned. 1871 * returned.
1879 * 1872 *
1880 * PARAMETERS: 1873 * PARAMETERS:
1881 * mp - pointer to bmap descriptor 1874 * mp - pointer to bmap descriptor
1882 * dp - pointer to dmap to attempt to allocate blocks from. 1875 * dp - pointer to dmap to attempt to allocate blocks from.
1883 * l2nb - log2 number of contiguous block desired. 1876 * l2nb - log2 number of contiguous block desired.
1884 * nblocks - actual number of contiguous block desired. 1877 * nblocks - actual number of contiguous block desired.
1885 * results - on successful return, set to the starting block number 1878 * results - on successful return, set to the starting block number
1886 * of the newly allocated range. 1879 * of the newly allocated range.
1887 * 1880 *
1888 * RETURN VALUES: 1881 * RETURN VALUES:
1889 * 0 - success 1882 * 0 - success
1890 * -ENOSPC - insufficient disk resources 1883 * -ENOSPC - insufficient disk resources
1891 * -EIO - i/o error 1884 * -EIO - i/o error
1892 * 1885 *
1893 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or 1886 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
1894 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; 1887 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
@@ -1933,7 +1926,7 @@ dbAllocDmapLev(struct bmap * bmp,
1933/* 1926/*
1934 * NAME: dbAllocDmap() 1927 * NAME: dbAllocDmap()
1935 * 1928 *
1936 * FUNCTION: adjust the disk allocation map to reflect the allocation 1929 * FUNCTION: adjust the disk allocation map to reflect the allocation
1937 * of a specified block range within a dmap. 1930 * of a specified block range within a dmap.
1938 * 1931 *
1939 * this routine allocates the specified blocks from the dmap 1932 * this routine allocates the specified blocks from the dmap
@@ -1946,14 +1939,14 @@ dbAllocDmapLev(struct bmap * bmp,
1946 * covers this dmap. 1939 * covers this dmap.
1947 * 1940 *
1948 * PARAMETERS: 1941 * PARAMETERS:
1949 * bmp - pointer to bmap descriptor 1942 * bmp - pointer to bmap descriptor
1950 * dp - pointer to dmap to allocate the block range from. 1943 * dp - pointer to dmap to allocate the block range from.
1951 * blkno - starting block number of the block to be allocated. 1944 * blkno - starting block number of the block to be allocated.
1952 * nblocks - number of blocks to be allocated. 1945 * nblocks - number of blocks to be allocated.
1953 * 1946 *
1954 * RETURN VALUES: 1947 * RETURN VALUES:
1955 * 0 - success 1948 * 0 - success
1956 * -EIO - i/o error 1949 * -EIO - i/o error
1957 * 1950 *
1958 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 1951 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
1959 */ 1952 */
@@ -1989,7 +1982,7 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
1989/* 1982/*
1990 * NAME: dbFreeDmap() 1983 * NAME: dbFreeDmap()
1991 * 1984 *
1992 * FUNCTION: adjust the disk allocation map to reflect the allocation 1985 * FUNCTION: adjust the disk allocation map to reflect the allocation
1993 * of a specified block range within a dmap. 1986 * of a specified block range within a dmap.
1994 * 1987 *
1995 * this routine frees the specified blocks from the dmap through 1988 * this routine frees the specified blocks from the dmap through
@@ -1997,18 +1990,18 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
1997 * causes the maximum string of free blocks within the dmap to 1990 * causes the maximum string of free blocks within the dmap to
1998 * change (i.e. the value of the root of the dmap's dmtree), this 1991 * change (i.e. the value of the root of the dmap's dmtree), this
1999 * routine will cause this change to be reflected up through the 1992 * routine will cause this change to be reflected up through the
2000 * appropriate levels of the dmap control pages by a call to 1993 * appropriate levels of the dmap control pages by a call to
2001 * dbAdjCtl() for the L0 dmap control page that covers this dmap. 1994 * dbAdjCtl() for the L0 dmap control page that covers this dmap.
2002 * 1995 *
2003 * PARAMETERS: 1996 * PARAMETERS:
2004 * bmp - pointer to bmap descriptor 1997 * bmp - pointer to bmap descriptor
2005 * dp - pointer to dmap to free the block range from. 1998 * dp - pointer to dmap to free the block range from.
2006 * blkno - starting block number of the block to be freed. 1999 * blkno - starting block number of the block to be freed.
2007 * nblocks - number of blocks to be freed. 2000 * nblocks - number of blocks to be freed.
2008 * 2001 *
2009 * RETURN VALUES: 2002 * RETURN VALUES:
2010 * 0 - success 2003 * 0 - success
2011 * -EIO - i/o error 2004 * -EIO - i/o error
2012 * 2005 *
2013 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2006 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2014 */ 2007 */
@@ -2055,7 +2048,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2055/* 2048/*
2056 * NAME: dbAllocBits() 2049 * NAME: dbAllocBits()
2057 * 2050 *
2058 * FUNCTION: allocate a specified block range from a dmap. 2051 * FUNCTION: allocate a specified block range from a dmap.
2059 * 2052 *
2060 * this routine updates the dmap to reflect the working 2053 * this routine updates the dmap to reflect the working
2061 * state allocation of the specified block range. it directly 2054 * state allocation of the specified block range. it directly
@@ -2065,10 +2058,10 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2065 * dmap's dmtree, as a whole, to reflect the allocated range. 2058 * dmap's dmtree, as a whole, to reflect the allocated range.
2066 * 2059 *
2067 * PARAMETERS: 2060 * PARAMETERS:
2068 * bmp - pointer to bmap descriptor 2061 * bmp - pointer to bmap descriptor
2069 * dp - pointer to dmap to allocate bits from. 2062 * dp - pointer to dmap to allocate bits from.
2070 * blkno - starting block number of the bits to be allocated. 2063 * blkno - starting block number of the bits to be allocated.
2071 * nblocks - number of bits to be allocated. 2064 * nblocks - number of bits to be allocated.
2072 * 2065 *
2073 * RETURN VALUES: none 2066 * RETURN VALUES: none
2074 * 2067 *
@@ -2149,7 +2142,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2149 * the allocated words. 2142 * the allocated words.
2150 */ 2143 */
2151 for (; nwords > 0; nwords -= nw) { 2144 for (; nwords > 0; nwords -= nw) {
2152 if (leaf[word] < BUDMIN) { 2145 if (leaf[word] < BUDMIN) {
2153 jfs_error(bmp->db_ipbmap->i_sb, 2146 jfs_error(bmp->db_ipbmap->i_sb,
2154 "dbAllocBits: leaf page " 2147 "dbAllocBits: leaf page "
2155 "corrupt"); 2148 "corrupt");
@@ -2202,7 +2195,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2202/* 2195/*
2203 * NAME: dbFreeBits() 2196 * NAME: dbFreeBits()
2204 * 2197 *
2205 * FUNCTION: free a specified block range from a dmap. 2198 * FUNCTION: free a specified block range from a dmap.
2206 * 2199 *
2207 * this routine updates the dmap to reflect the working 2200 * this routine updates the dmap to reflect the working
2208 * state allocation of the specified block range. it directly 2201 * state allocation of the specified block range. it directly
@@ -2212,10 +2205,10 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2212 * dmtree, as a whole, to reflect the deallocated range. 2205 * dmtree, as a whole, to reflect the deallocated range.
2213 * 2206 *
2214 * PARAMETERS: 2207 * PARAMETERS:
2215 * bmp - pointer to bmap descriptor 2208 * bmp - pointer to bmap descriptor
2216 * dp - pointer to dmap to free bits from. 2209 * dp - pointer to dmap to free bits from.
2217 * blkno - starting block number of the bits to be freed. 2210 * blkno - starting block number of the bits to be freed.
2218 * nblocks - number of bits to be freed. 2211 * nblocks - number of bits to be freed.
2219 * 2212 *
2220 * RETURN VALUES: 0 for success 2213 * RETURN VALUES: 0 for success
2221 * 2214 *
@@ -2388,19 +2381,19 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2388 * the new root value and the next dmap control page level to 2381 * the new root value and the next dmap control page level to
2389 * be adjusted. 2382 * be adjusted.
2390 * PARAMETERS: 2383 * PARAMETERS:
2391 * bmp - pointer to bmap descriptor 2384 * bmp - pointer to bmap descriptor
2392 * blkno - the first block of a block range within a dmap. it is 2385 * blkno - the first block of a block range within a dmap. it is
2393 * the allocation or deallocation of this block range that 2386 * the allocation or deallocation of this block range that
2394 * requires the dmap control page to be adjusted. 2387 * requires the dmap control page to be adjusted.
2395 * newval - the new value of the lower level dmap or dmap control 2388 * newval - the new value of the lower level dmap or dmap control
2396 * page root. 2389 * page root.
2397 * alloc - 'true' if adjustment is due to an allocation. 2390 * alloc - 'true' if adjustment is due to an allocation.
2398 * level - current level of dmap control page (i.e. L0, L1, L2) to 2391 * level - current level of dmap control page (i.e. L0, L1, L2) to
2399 * be adjusted. 2392 * be adjusted.
2400 * 2393 *
2401 * RETURN VALUES: 2394 * RETURN VALUES:
2402 * 0 - success 2395 * 0 - success
2403 * -EIO - i/o error 2396 * -EIO - i/o error
2404 * 2397 *
2405 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2398 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2406 */ 2399 */
@@ -2544,16 +2537,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2544/* 2537/*
2545 * NAME: dbSplit() 2538 * NAME: dbSplit()
2546 * 2539 *
2547 * FUNCTION: update the leaf of a dmtree with a new value, splitting 2540 * FUNCTION: update the leaf of a dmtree with a new value, splitting
2548 * the leaf from the binary buddy system of the dmtree's 2541 * the leaf from the binary buddy system of the dmtree's
2549 * leaves, as required. 2542 * leaves, as required.
2550 * 2543 *
2551 * PARAMETERS: 2544 * PARAMETERS:
2552 * tp - pointer to the tree containing the leaf. 2545 * tp - pointer to the tree containing the leaf.
2553 * leafno - the number of the leaf to be updated. 2546 * leafno - the number of the leaf to be updated.
2554 * splitsz - the size the binary buddy system starting at the leaf 2547 * splitsz - the size the binary buddy system starting at the leaf
2555 * must be split to, specified as the log2 number of blocks. 2548 * must be split to, specified as the log2 number of blocks.
2556 * newval - the new value for the leaf. 2549 * newval - the new value for the leaf.
2557 * 2550 *
2558 * RETURN VALUES: none 2551 * RETURN VALUES: none
2559 * 2552 *
@@ -2600,7 +2593,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2600/* 2593/*
2601 * NAME: dbBackSplit() 2594 * NAME: dbBackSplit()
2602 * 2595 *
2603 * FUNCTION: back split the binary buddy system of dmtree leaves 2596 * FUNCTION: back split the binary buddy system of dmtree leaves
2604 * that hold a specified leaf until the specified leaf 2597 * that hold a specified leaf until the specified leaf
2605 * starts its own binary buddy system. 2598 * starts its own binary buddy system.
2606 * 2599 *
@@ -2617,8 +2610,8 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2617 * in which a previous join operation must be backed out. 2610 * in which a previous join operation must be backed out.
2618 * 2611 *
2619 * PARAMETERS: 2612 * PARAMETERS:
2620 * tp - pointer to the tree containing the leaf. 2613 * tp - pointer to the tree containing the leaf.
2621 * leafno - the number of the leaf to be updated. 2614 * leafno - the number of the leaf to be updated.
2622 * 2615 *
2623 * RETURN VALUES: none 2616 * RETURN VALUES: none
2624 * 2617 *
@@ -2692,14 +2685,14 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
2692/* 2685/*
2693 * NAME: dbJoin() 2686 * NAME: dbJoin()
2694 * 2687 *
2695 * FUNCTION: update the leaf of a dmtree with a new value, joining 2688 * FUNCTION: update the leaf of a dmtree with a new value, joining
2696 * the leaf with other leaves of the dmtree into a multi-leaf 2689 * the leaf with other leaves of the dmtree into a multi-leaf
2697 * binary buddy system, as required. 2690 * binary buddy system, as required.
2698 * 2691 *
2699 * PARAMETERS: 2692 * PARAMETERS:
2700 * tp - pointer to the tree containing the leaf. 2693 * tp - pointer to the tree containing the leaf.
2701 * leafno - the number of the leaf to be updated. 2694 * leafno - the number of the leaf to be updated.
2702 * newval - the new value for the leaf. 2695 * newval - the new value for the leaf.
2703 * 2696 *
2704 * RETURN VALUES: none 2697 * RETURN VALUES: none
2705 */ 2698 */
@@ -2785,15 +2778,15 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
2785/* 2778/*
2786 * NAME: dbAdjTree() 2779 * NAME: dbAdjTree()
2787 * 2780 *
2788 * FUNCTION: update a leaf of a dmtree with a new value, adjusting 2781 * FUNCTION: update a leaf of a dmtree with a new value, adjusting
2789 * the dmtree, as required, to reflect the new leaf value. 2782 * the dmtree, as required, to reflect the new leaf value.
2790 * the combination of any buddies must already be done before 2783 * the combination of any buddies must already be done before
2791 * this is called. 2784 * this is called.
2792 * 2785 *
2793 * PARAMETERS: 2786 * PARAMETERS:
2794 * tp - pointer to the tree to be adjusted. 2787 * tp - pointer to the tree to be adjusted.
2795 * leafno - the number of the leaf to be updated. 2788 * leafno - the number of the leaf to be updated.
2796 * newval - the new value for the leaf. 2789 * newval - the new value for the leaf.
2797 * 2790 *
2798 * RETURN VALUES: none 2791 * RETURN VALUES: none
2799 */ 2792 */
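
dbSplit(), dbBackSplit() and dbJoin() above manage the dmtree leaves as a binary buddy system. As a generic illustration of that relationship only, not of the JFS routines themselves: the buddy of a leaf at buddy size l2size is the leaf whose index differs in exactly that bit.

#include <stdio.h>

/* Generic binary-buddy sketch, for illustration: two blocks of size
 * (1 << l2size) are buddies when their indices differ only in bit
 * l2size, so the buddy is found by flipping that bit.
 */
static int buddy_of(int leafno, int l2size)
{
        return leafno ^ (1 << l2size);
}

int main(void)
{
        printf("%d\n", buddy_of(0, 2));         /* leaves 0..3 pair with 4..7: prints 4 */
        printf("%d\n", buddy_of(12, 1));        /* leaves 12..13 pair with 14..15: prints 14 */
        return 0;
}
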
@@ -2852,7 +2845,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2852/* 2845/*
2853 * NAME: dbFindLeaf() 2846 * NAME: dbFindLeaf()
2854 * 2847 *
2855 * FUNCTION: search a dmtree_t for sufficient free blocks, returning 2848 * FUNCTION: search a dmtree_t for sufficient free blocks, returning
2856 * the index of a leaf describing the free blocks if 2849 * the index of a leaf describing the free blocks if
2857 * sufficient free blocks are found. 2850 * sufficient free blocks are found.
2858 * 2851 *
@@ -2861,15 +2854,15 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2861 * free space. 2854 * free space.
2862 * 2855 *
2863 * PARAMETERS: 2856 * PARAMETERS:
2864 * tp - pointer to the tree to be searched. 2857 * tp - pointer to the tree to be searched.
2865 * l2nb - log2 number of free blocks to search for. 2858 * l2nb - log2 number of free blocks to search for.
2866 * leafidx - return pointer to be set to the index of the leaf 2859 * leafidx - return pointer to be set to the index of the leaf
2867 * describing at least l2nb free blocks if sufficient 2860 * describing at least l2nb free blocks if sufficient
2868 * free blocks are found. 2861 * free blocks are found.
2869 * 2862 *
2870 * RETURN VALUES: 2863 * RETURN VALUES:
2871 * 0 - success 2864 * 0 - success
2872 * -ENOSPC - insufficient free blocks. 2865 * -ENOSPC - insufficient free blocks.
2873 */ 2866 */
2874static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) 2867static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2875{ 2868{
@@ -2916,18 +2909,18 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2916/* 2909/*
2917 * NAME: dbFindBits() 2910 * NAME: dbFindBits()
2918 * 2911 *
2919 * FUNCTION: find a specified number of binary buddy free bits within a 2912 * FUNCTION: find a specified number of binary buddy free bits within a
2920 * dmap bitmap word value. 2913 * dmap bitmap word value.
2921 * 2914 *
2922 * this routine searches the bitmap value for (1 << l2nb) free 2915 * this routine searches the bitmap value for (1 << l2nb) free
2923 * bits at (1 << l2nb) alignments within the value. 2916 * bits at (1 << l2nb) alignments within the value.
2924 * 2917 *
2925 * PARAMETERS: 2918 * PARAMETERS:
2926 * word - dmap bitmap word value. 2919 * word - dmap bitmap word value.
2927 * l2nb - number of free bits specified as a log2 number. 2920 * l2nb - number of free bits specified as a log2 number.
2928 * 2921 *
2929 * RETURN VALUES: 2922 * RETURN VALUES:
2930 * starting bit number of free bits. 2923 * starting bit number of free bits.
2931 */ 2924 */
2932static int dbFindBits(u32 word, int l2nb) 2925static int dbFindBits(u32 word, int l2nb)
2933{ 2926{
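
The search dbFindBits() describes, (1 << l2nb) free bits at a (1 << l2nb) alignment within one bitmap word, can be sketched as below. This is a simplified stand-in, not the kernel routine: bit 0 here is the least-significant bit and a clear bit is taken to mean free, which may differ from the on-disk dmap convention.

#include <stdint.h>
#include <stdio.h>

/* Simplified sketch of the aligned-run search described above: look
 * for (1 << l2nb) consecutive clear bits starting on a (1 << l2nb)
 * boundary inside a 32-bit word.  Returns the starting bit number or
 * -1.  Bit numbering and free-bit polarity are assumptions of this
 * sketch, not necessarily those of the dmap format.
 */
static int find_aligned_free_bits(uint32_t word, int l2nb)
{
        int nb = 1 << l2nb;
        uint32_t mask = (nb == 32) ? 0xffffffffu : ((1u << nb) - 1);
        int bit;

        for (bit = 0; bit <= 32 - nb; bit += nb)
                if (((word >> bit) & mask) == 0)
                        return bit;
        return -1;
}

int main(void)
{
        /* bits 4..7 are the only clear nibble, so the first aligned
         * run of 4 free bits starts at bit 4 */
        printf("%d\n", find_aligned_free_bits(0xffffff0fu, 2));  /* prints 4 */
        return 0;
}
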
@@ -2963,14 +2956,14 @@ static int dbFindBits(u32 word, int l2nb)
2963/* 2956/*
2964 * NAME: dbMaxBud(u8 *cp) 2957 * NAME: dbMaxBud(u8 *cp)
2965 * 2958 *
2966 * FUNCTION: determine the largest binary buddy string of free 2959 * FUNCTION: determine the largest binary buddy string of free
2967 * bits within 32-bits of the map. 2960 * bits within 32-bits of the map.
2968 * 2961 *
2969 * PARAMETERS: 2962 * PARAMETERS:
2970 * cp - pointer to the 32-bit value. 2963 * cp - pointer to the 32-bit value.
2971 * 2964 *
2972 * RETURN VALUES: 2965 * RETURN VALUES:
2973 * largest binary buddy of free bits within a dmap word. 2966 * largest binary buddy of free bits within a dmap word.
2974 */ 2967 */
2975static int dbMaxBud(u8 * cp) 2968static int dbMaxBud(u8 * cp)
2976{ 2969{
@@ -3000,14 +2993,14 @@ static int dbMaxBud(u8 * cp)
3000/* 2993/*
3001 * NAME: cnttz(uint word) 2994 * NAME: cnttz(uint word)
3002 * 2995 *
3003 * FUNCTION: determine the number of trailing zeros within a 32-bit 2996 * FUNCTION: determine the number of trailing zeros within a 32-bit
3004 * value. 2997 * value.
3005 * 2998 *
3006 * PARAMETERS: 2999 * PARAMETERS:
3007 * value - 32-bit value to be examined. 3000 * value - 32-bit value to be examined.
3008 * 3001 *
3009 * RETURN VALUES: 3002 * RETURN VALUES:
3010 * count of trailing zeros 3003 * count of trailing zeros
3011 */ 3004 */
3012static int cnttz(u32 word) 3005static int cnttz(u32 word)
3013{ 3006{
@@ -3025,14 +3018,14 @@ static int cnttz(u32 word)
3025/* 3018/*
3026 * NAME: cntlz(u32 value) 3019 * NAME: cntlz(u32 value)
3027 * 3020 *
3028 * FUNCTION: determine the number of leading zeros within a 32-bit 3021 * FUNCTION: determine the number of leading zeros within a 32-bit
3029 * value. 3022 * value.
3030 * 3023 *
3031 * PARAMETERS: 3024 * PARAMETERS:
3032 * value - 32-bit value to be examined. 3025 * value - 32-bit value to be examined.
3033 * 3026 *
3034 * RETURN VALUES: 3027 * RETURN VALUES:
3035 * count of leading zeros 3028 * count of leading zeros
3036 */ 3029 */
3037static int cntlz(u32 value) 3030static int cntlz(u32 value)
3038{ 3031{
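
cnttz() and cntlz() above are plain bit counts; a minimal loop-based sketch of the same two operations follows (not the kernel's implementation; mapping an input of 0 to 32 is just this sketch's convention):

#include <stdint.h>
#include <stdio.h>

/* Portable loop versions of the two counts described above,
 * for illustration only.
 */
static int trailing_zeros32(uint32_t v)
{
        int n;

        if (v == 0)
                return 32;
        for (n = 0; (v & 1) == 0; v >>= 1)
                n++;
        return n;
}

static int leading_zeros32(uint32_t v)
{
        int n;

        if (v == 0)
                return 32;
        for (n = 0; (v & 0x80000000u) == 0; v <<= 1)
                n++;
        return n;
}

int main(void)
{
        printf("%d %d\n", trailing_zeros32(0x00000008u),  /* 3 */
                          leading_zeros32(0x00000008u));  /* 28 */
        return 0;
}
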
@@ -3050,14 +3043,14 @@ static int cntlz(u32 value)
3050 * NAME: blkstol2(s64 nb) 3043 * NAME: blkstol2(s64 nb)
3051 * 3044 *
3052 * FUNCTION: convert a block count to its log2 value. if the block 3045 * FUNCTION: convert a block count to its log2 value. if the block
3053 * count is not a l2 multiple, it is rounded up to the next 3046 * count is not a l2 multiple, it is rounded up to the next
3054 * larger l2 multiple. 3047 * larger l2 multiple.
3055 * 3048 *
3056 * PARAMETERS: 3049 * PARAMETERS:
3057 * nb - number of blocks 3050 * nb - number of blocks
3058 * 3051 *
3059 * RETURN VALUES: 3052 * RETURN VALUES:
3060 * log2 number of blocks 3053 * log2 number of blocks
3061 */ 3054 */
3062static int blkstol2(s64 nb) 3055static int blkstol2(s64 nb)
3063{ 3056{
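
The rounding rule blkstol2() describes, the log2 of a block count rounded up to the next power of two when the count is not an exact power, can be illustrated with a simple loop (a sketch, not the kernel's implementation):

#include <stdint.h>
#include <stdio.h>

/* Illustration of the rounding rule described above: find the
 * smallest power of two >= nb and return its exponent.
 */
static int blocks_to_l2(int64_t nb)
{
        int l2 = 0;
        int64_t p = 1;

        while (p < nb) {
                p <<= 1;
                l2++;
        }
        return l2;              /* exact when nb is already a power of two */
}

int main(void)
{
        printf("%d %d %d\n", blocks_to_l2(8),   /* 3 */
                             blocks_to_l2(9),   /* 4: rounded up */
                             blocks_to_l2(1));  /* 0 */
        return 0;
}
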
@@ -3099,13 +3092,13 @@ static int blkstol2(s64 nb)
3099 * at a time. 3092 * at a time.
3100 * 3093 *
3101 * PARAMETERS: 3094 * PARAMETERS:
3102 * ip - pointer to in-core inode; 3095 * ip - pointer to in-core inode;
3103 * blkno - starting block number to be freed. 3096 * blkno - starting block number to be freed.
3104 * nblocks - number of blocks to be freed. 3097 * nblocks - number of blocks to be freed.
3105 * 3098 *
3106 * RETURN VALUES: 3099 * RETURN VALUES:
3107 * 0 - success 3100 * 0 - success
3108 * -EIO - i/o error 3101 * -EIO - i/o error
3109 */ 3102 */
3110int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) 3103int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
3111{ 3104{
@@ -3278,10 +3271,10 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3278 * L2 3271 * L2
3279 * | 3272 * |
3280 * L1---------------------------------L1 3273 * L1---------------------------------L1
3281 * | | 3274 * | |
3282 * L0---------L0---------L0 L0---------L0---------L0 3275 * L0---------L0---------L0 L0---------L0---------L0
3283 * | | | | | | 3276 * | | | | | |
3284 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; 3277 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm;
3285 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm 3278 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
3286 * 3279 *
3287 * <---old---><----------------------------extend-----------------------> 3280 * <---old---><----------------------------extend----------------------->
@@ -3307,7 +3300,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3307 (long long) blkno, (long long) nblocks, (long long) newsize); 3300 (long long) blkno, (long long) nblocks, (long long) newsize);
3308 3301
3309 /* 3302 /*
3310 * initialize bmap control page. 3303 * initialize bmap control page.
3311 * 3304 *
3312 * all the data in bmap control page should exclude 3305 * all the data in bmap control page should exclude
3313 * the mkfs hidden dmap page. 3306 * the mkfs hidden dmap page.
@@ -3330,7 +3323,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3330 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; 3323 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
3331 3324
3332 /* 3325 /*
3333 * reconfigure db_agfree[] 3326 * reconfigure db_agfree[]
3334 * from old AG configuration to new AG configuration; 3327 * from old AG configuration to new AG configuration;
3335 * 3328 *
3336 * coalesce contiguous k (newAGSize/oldAGSize) AGs; 3329 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
@@ -3362,7 +3355,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3362 bmp->db_maxag = bmp->db_maxag / k; 3355 bmp->db_maxag = bmp->db_maxag / k;
3363 3356
3364 /* 3357 /*
3365 * extend bmap 3358 * extend bmap
3366 * 3359 *
3367 * update bit maps and corresponding level control pages; 3360 * update bit maps and corresponding level control pages;
3368 * global control page db_nfree, db_agfree[agno], db_maxfreebud; 3361 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
@@ -3410,7 +3403,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3410 /* compute start L0 */ 3403 /* compute start L0 */
3411 j = 0; 3404 j = 0;
3412 l1leaf = l1dcp->stree + CTLLEAFIND; 3405 l1leaf = l1dcp->stree + CTLLEAFIND;
3413 p += nbperpage; /* 1st L0 of L1.k */ 3406 p += nbperpage; /* 1st L0 of L1.k */
3414 } 3407 }
3415 3408
3416 /* 3409 /*
@@ -3548,7 +3541,7 @@ errout:
3548 return -EIO; 3541 return -EIO;
3549 3542
3550 /* 3543 /*
3551 * finalize bmap control page 3544 * finalize bmap control page
3552 */ 3545 */
3553finalize: 3546finalize:
3554 3547
@@ -3567,7 +3560,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3567 int i, n; 3560 int i, n;
3568 3561
3569 /* 3562 /*
3570 * finalize bmap control page 3563 * finalize bmap control page
3571 */ 3564 */
3572//finalize: 3565//finalize:
3573 /* 3566 /*
@@ -3953,8 +3946,8 @@ static int dbGetL2AGSize(s64 nblocks)
3953 * convert number of map pages to the zero origin top dmapctl level 3946 * convert number of map pages to the zero origin top dmapctl level
3954 */ 3947 */
3955#define BMAPPGTOLEV(npages) \ 3948#define BMAPPGTOLEV(npages) \
3956 (((npages) <= 3 + MAXL0PAGES) ? 0 \ 3949 (((npages) <= 3 + MAXL0PAGES) ? 0 : \
3957 : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) 3950 ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
3958 3951
3959s64 dbMapFileSizeToMapSize(struct inode * ipbmap) 3952s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
3960{ 3953{
@@ -3981,8 +3974,8 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
3981 factor = 3974 factor =
3982 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); 3975 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
3983 complete = (u32) npages / factor; 3976 complete = (u32) npages / factor;
3984 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL 3977 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL :
3985 : ((i == 1) ? LPERCTL : 1)); 3978 ((i == 1) ? LPERCTL : 1));
3986 3979
3987 /* pages in last/incomplete child */ 3980 /* pages in last/incomplete child */
3988 npages = (u32) npages % factor; 3981 npages = (u32) npages % factor;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 45ea454c74bd..11e6d471b364 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -83,7 +83,7 @@ static __inline signed char TREEMAX(signed char *cp)
83 * - 1 is added to account for the control page of the map. 83 * - 1 is added to account for the control page of the map.
84 */ 84 */
85#define BLKTODMAP(b,s) \ 85#define BLKTODMAP(b,s) \
86 ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) 86 ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))
87 87
88/* 88/*
89 * convert disk block number to the logical block number of the LEVEL 0 89 * convert disk block number to the logical block number of the LEVEL 0
@@ -98,7 +98,7 @@ static __inline signed char TREEMAX(signed char *cp)
98 * - 1 is added to account for the control page of the map. 98 * - 1 is added to account for the control page of the map.
99 */ 99 */
100#define BLKTOL0(b,s) \ 100#define BLKTOL0(b,s) \
101 (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) 101 (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s))
102 102
103/* 103/*
104 * convert disk block number to the logical block number of the LEVEL 1 104 * convert disk block number to the logical block number of the LEVEL 1
@@ -120,7 +120,7 @@ static __inline signed char TREEMAX(signed char *cp)
120 * at the specified level which describes the disk block. 120 * at the specified level which describes the disk block.
121 */ 121 */
122#define BLKTOCTL(b,s,l) \ 122#define BLKTOCTL(b,s,l) \
123 (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) 123 (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
124 124
125/* 125/*
126 * convert aggregate map size to the zero origin dmapctl level of the 126 * convert aggregate map size to the zero origin dmapctl level of the
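
A worked example of the BLKTODMAP() arithmetic a few hunks above may help: b >> 13 counts the dmap pages preceding block b (a dmap covers 8K blocks, per the struct dmap comment further down), the >> 23, >> 33 and constant terms presumably add the interleaved and fixed control pages, including the "+ 1" map control page noted in that macro's comment, and the final shift by s turns a page count into a block address. The values of s and the page interpretation below are assumptions made only for this sketch.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the BLKTODMAP() expression for illustration; it is not a
 * replacement for the macro.
 */
static int64_t blk_to_dmap(int64_t b, int s)
{
        return ((b >> 13) + (b >> 23) + (b >> 33) + 3 + 1) << s;
}

int main(void)
{
        /* with s == 0 (one block per map page, an assumption for the
         * example): block 0 maps to page 4, and a block in the fourth
         * dmap (3 * 8192) maps to page 7 */
        printf("%lld %lld\n",
               (long long)blk_to_dmap(0, 0),
               (long long)blk_to_dmap(3 * 8192, 0));
        return 0;
}
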
@@ -145,27 +145,27 @@ static __inline signed char TREEMAX(signed char *cp)
145 * dmaptree must be consistent with dmapctl. 145 * dmaptree must be consistent with dmapctl.
146 */ 146 */
147struct dmaptree { 147struct dmaptree {
148 __le32 nleafs; /* 4: number of tree leafs */ 148 __le32 nleafs; /* 4: number of tree leafs */
149 __le32 l2nleafs; /* 4: l2 number of tree leafs */ 149 __le32 l2nleafs; /* 4: l2 number of tree leafs */
150 __le32 leafidx; /* 4: index of first tree leaf */ 150 __le32 leafidx; /* 4: index of first tree leaf */
151 __le32 height; /* 4: height of the tree */ 151 __le32 height; /* 4: height of the tree */
152 s8 budmin; /* 1: min l2 tree leaf value to combine */ 152 s8 budmin; /* 1: min l2 tree leaf value to combine */
153 s8 stree[TREESIZE]; /* TREESIZE: tree */ 153 s8 stree[TREESIZE]; /* TREESIZE: tree */
154 u8 pad[2]; /* 2: pad to word boundary */ 154 u8 pad[2]; /* 2: pad to word boundary */
155}; /* - 360 - */ 155}; /* - 360 - */
156 156
157/* 157/*
158 * dmap page per 8K blocks bitmap 158 * dmap page per 8K blocks bitmap
159 */ 159 */
160struct dmap { 160struct dmap {
161 __le32 nblocks; /* 4: num blks covered by this dmap */ 161 __le32 nblocks; /* 4: num blks covered by this dmap */
162 __le32 nfree; /* 4: num of free blks in this dmap */ 162 __le32 nfree; /* 4: num of free blks in this dmap */
163 __le64 start; /* 8: starting blkno for this dmap */ 163 __le64 start; /* 8: starting blkno for this dmap */
164 struct dmaptree tree; /* 360: dmap tree */ 164 struct dmaptree tree; /* 360: dmap tree */
165 u8 pad[1672]; /* 1672: pad to 2048 bytes */ 165 u8 pad[1672]; /* 1672: pad to 2048 bytes */
166 __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ 166 __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */
167 __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ 167 __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */
168}; /* - 4096 - */ 168}; /* - 4096 - */
169 169
170/* 170/*
171 * disk map control page per level. 171 * disk map control page per level.
@@ -173,14 +173,14 @@ struct dmap {
173 * dmapctl must be consistent with dmaptree. 173 * dmapctl must be consistent with dmaptree.
174 */ 174 */
175struct dmapctl { 175struct dmapctl {
176 __le32 nleafs; /* 4: number of tree leafs */ 176 __le32 nleafs; /* 4: number of tree leafs */
177 __le32 l2nleafs; /* 4: l2 number of tree leafs */ 177 __le32 l2nleafs; /* 4: l2 number of tree leafs */
178 __le32 leafidx; /* 4: index of the first tree leaf */ 178 __le32 leafidx; /* 4: index of the first tree leaf */
179 __le32 height; /* 4: height of tree */ 179 __le32 height; /* 4: height of tree */
180 s8 budmin; /* 1: minimum l2 tree leaf value */ 180 s8 budmin; /* 1: minimum l2 tree leaf value */
181 s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ 181 s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */
182 u8 pad[2714]; /* 2714: pad to 4096 */ 182 u8 pad[2714]; /* 2714: pad to 4096 */
183}; /* - 4096 - */ 183}; /* - 4096 - */
184 184
185/* 185/*
186 * common definition for dmaptree within dmap and dmapctl 186 * common definition for dmaptree within dmap and dmapctl
@@ -202,41 +202,41 @@ typedef union dmtree {
202 * on-disk aggregate disk allocation map descriptor. 202 * on-disk aggregate disk allocation map descriptor.
203 */ 203 */
204struct dbmap_disk { 204struct dbmap_disk {
205 __le64 dn_mapsize; /* 8: number of blocks in aggregate */ 205 __le64 dn_mapsize; /* 8: number of blocks in aggregate */
206 __le64 dn_nfree; /* 8: num free blks in aggregate map */ 206 __le64 dn_nfree; /* 8: num free blks in aggregate map */
207 __le32 dn_l2nbperpage; /* 4: number of blks per page */ 207 __le32 dn_l2nbperpage; /* 4: number of blks per page */
208 __le32 dn_numag; /* 4: total number of ags */ 208 __le32 dn_numag; /* 4: total number of ags */
209 __le32 dn_maxlevel; /* 4: number of active ags */ 209 __le32 dn_maxlevel; /* 4: number of active ags */
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
217 __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ 217 __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */
218 __le64 dn_agsize; /* 8: num of blks per alloc group */ 218 __le64 dn_agsize; /* 8: num of blks per alloc group */
219 s8 dn_maxfreebud; /* 1: max free buddy system */ 219 s8 dn_maxfreebud; /* 1: max free buddy system */
220 u8 pad[3007]; /* 3007: pad to 4096 */ 220 u8 pad[3007]; /* 3007: pad to 4096 */
221}; /* - 4096 - */ 221}; /* - 4096 - */
222 222
223struct dbmap { 223struct dbmap {
224 s64 dn_mapsize; /* number of blocks in aggregate */ 224 s64 dn_mapsize; /* number of blocks in aggregate */
225 s64 dn_nfree; /* num free blks in aggregate map */ 225 s64 dn_nfree; /* num free blks in aggregate map */
226 int dn_l2nbperpage; /* number of blks per page */ 226 int dn_l2nbperpage; /* number of blks per page */
227 int dn_numag; /* total number of ags */ 227 int dn_numag; /* total number of ags */
228 int dn_maxlevel; /* number of active ags */ 228 int dn_maxlevel; /* number of active ags */
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheigth; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
236 s64 dn_agfree[MAXAG]; /* per AG free count */ 236 s64 dn_agfree[MAXAG]; /* per AG free count */
237 s64 dn_agsize; /* num of blks per alloc group */ 237 s64 dn_agsize; /* num of blks per alloc group */
238 signed char dn_maxfreebud; /* max free buddy system */ 238 signed char dn_maxfreebud; /* max free buddy system */
239}; /* - 4096 - */ 239}; /* - 4096 - */
240/* 240/*
241 * in-memory aggregate disk allocation map descriptor. 241 * in-memory aggregate disk allocation map descriptor.
242 */ 242 */
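
For illustration only (not part of this patch): struct dbmap_disk above is the little-endian on-disk image of the map descriptor, padded by pad[] to exactly one 4096-byte metadata page, while struct dbmap is its native in-memory counterpart. A hypothetical loader for a few of the fields could look like the sketch below; dbmap_from_disk is an invented name, and the real conversion in JFS happens inline at mount time.

static void dbmap_from_disk(struct dbmap *dst, const struct dbmap_disk *src)
{
	/* pad[3007] keeps the on-disk descriptor exactly one 4K page */
	BUILD_BUG_ON(sizeof(struct dbmap_disk) != 4096);

	dst->dn_mapsize     = le64_to_cpu(src->dn_mapsize);
	dst->dn_nfree       = le64_to_cpu(src->dn_nfree);
	dst->dn_l2nbperpage = le32_to_cpu(src->dn_l2nbperpage);
	dst->dn_numag       = le32_to_cpu(src->dn_numag);
	/* ...the remaining fields convert the same way, dn_agfree[] in a loop */
}
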
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 6d62f3222892..c14ba3cfa818 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -315,8 +315,8 @@ static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
315 lv = &llck->lv[llck->index]; 315 lv = &llck->lv[llck->index];
316 316
317 /* 317 /*
318 * Linelock slot size is twice the size of directory table 318 * Linelock slot size is twice the size of directory table
319 * slot size. 512 entries per page. 319 * slot size. 512 entries per page.
320 */ 320 */
321 lv->offset = ((index - 2) & 511) >> 1; 321 lv->offset = ((index - 2) & 511) >> 1;
322 lv->length = 1; 322 lv->length = 1;
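
A quick worked example of the arithmetic above (illustrative, not part of the patch): index 2 is the first regular directory-table entry, a page holds 512 table slots, and each linelock slot covers two of them, hence ((index - 2) & 511) >> 1:

/*
 *   index   2 -> ((  2 - 2) & 511) >> 1 =   0   (first linelock slot)
 *   index  11 -> (( 11 - 2) & 511) >> 1 =   4
 *   index 513 -> ((513 - 2) & 511) >> 1 = 255   (last slot of the page)
 */
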
@@ -615,7 +615,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
615 btstack->nsplit = 1; 615 btstack->nsplit = 1;
616 616
617 /* 617 /*
618 * search down tree from root: 618 * search down tree from root:
619 * 619 *
620 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 620 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
621 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 621 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -659,7 +659,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
659 } 659 }
660 if (cmp == 0) { 660 if (cmp == 0) {
661 /* 661 /*
662 * search hit 662 * search hit
663 */ 663 */
664 /* search hit - leaf page: 664 /* search hit - leaf page:
665 * return the entry found 665 * return the entry found
@@ -723,7 +723,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
723 } 723 }
724 724
725 /* 725 /*
726 * search miss 726 * search miss
727 * 727 *
728 * base is the smallest index with key (Kj) greater than 728 * base is the smallest index with key (Kj) greater than
729 * search key (K) and may be zero or (maxindex + 1) index. 729 * search key (K) and may be zero or (maxindex + 1) index.
@@ -834,7 +834,7 @@ int dtInsert(tid_t tid, struct inode *ip,
834 struct lv *lv; 834 struct lv *lv;
835 835
836 /* 836 /*
837 * retrieve search result 837 * retrieve search result
838 * 838 *
839 * dtSearch() returns (leaf page pinned, index at which to insert). 839 * dtSearch() returns (leaf page pinned, index at which to insert).
840 * n.b. dtSearch() may return index of (maxindex + 1) of 840 * n.b. dtSearch() may return index of (maxindex + 1) of
@@ -843,7 +843,7 @@ int dtInsert(tid_t tid, struct inode *ip,
843 DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); 843 DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
844 844
845 /* 845 /*
846 * insert entry for new key 846 * insert entry for new key
847 */ 847 */
848 if (DO_INDEX(ip)) { 848 if (DO_INDEX(ip)) {
849 if (JFS_IP(ip)->next_index == DIREND) { 849 if (JFS_IP(ip)->next_index == DIREND) {
@@ -860,9 +860,9 @@ int dtInsert(tid_t tid, struct inode *ip,
860 data.leaf.ino = *fsn; 860 data.leaf.ino = *fsn;
861 861
862 /* 862 /*
863 * leaf page does not have enough room for new entry: 863 * leaf page does not have enough room for new entry:
864 * 864 *
865 * extend/split the leaf page; 865 * extend/split the leaf page;
866 * 866 *
867 * dtSplitUp() will insert the entry and unpin the leaf page. 867 * dtSplitUp() will insert the entry and unpin the leaf page.
868 */ 868 */
@@ -877,9 +877,9 @@ int dtInsert(tid_t tid, struct inode *ip,
877 } 877 }
878 878
879 /* 879 /*
880 * leaf page does have enough room for new entry: 880 * leaf page does have enough room for new entry:
881 * 881 *
882 * insert the new data entry into the leaf page; 882 * insert the new data entry into the leaf page;
883 */ 883 */
884 BT_MARK_DIRTY(mp, ip); 884 BT_MARK_DIRTY(mp, ip);
885 /* 885 /*
@@ -967,13 +967,13 @@ static int dtSplitUp(tid_t tid,
967 } 967 }
968 968
969 /* 969 /*
970 * split leaf page 970 * split leaf page
971 * 971 *
972 * The split routines insert the new entry, and 972 * The split routines insert the new entry, and
973 * acquire txLock as appropriate. 973 * acquire txLock as appropriate.
974 */ 974 */
975 /* 975 /*
976 * split root leaf page: 976 * split root leaf page:
977 */ 977 */
978 if (sp->header.flag & BT_ROOT) { 978 if (sp->header.flag & BT_ROOT) {
979 /* 979 /*
@@ -1012,7 +1012,7 @@ static int dtSplitUp(tid_t tid,
1012 } 1012 }
1013 1013
1014 /* 1014 /*
1015 * extend first leaf page 1015 * extend first leaf page
1016 * 1016 *
1017 * extend the 1st extent if less than buffer page size 1017 * extend the 1st extent if less than buffer page size
1018 * (dtExtendPage() returns leaf page unpinned) 1018 * (dtExtendPage() returns leaf page unpinned)
@@ -1068,7 +1068,7 @@ static int dtSplitUp(tid_t tid,
1068 } 1068 }
1069 1069
1070 /* 1070 /*
1071 * split leaf page <sp> into <sp> and a new right page <rp>. 1071 * split leaf page <sp> into <sp> and a new right page <rp>.
1072 * 1072 *
1073 * return <rp> pinned and its extent descriptor <rpxd> 1073 * return <rp> pinned and its extent descriptor <rpxd>
1074 */ 1074 */
@@ -1433,7 +1433,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1433 rp->header.freecnt = rp->header.maxslot - fsi; 1433 rp->header.freecnt = rp->header.maxslot - fsi;
1434 1434
1435 /* 1435 /*
1436 * sequential append at tail: append without split 1436 * sequential append at tail: append without split
1437 * 1437 *
1438 * If splitting the last page on a level because of appending 1438 * If splitting the last page on a level because of appending
1439 * an entry to it (skip is maxentry), it's likely that the access is 1439 * an entry to it (skip is maxentry), it's likely that the access is
@@ -1467,7 +1467,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1467 } 1467 }
1468 1468
1469 /* 1469 /*
1470 * non-sequential insert (at possibly middle page) 1470 * non-sequential insert (at possibly middle page)
1471 */ 1471 */
1472 1472
1473 /* 1473 /*
@@ -1508,7 +1508,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1508 left = 0; 1508 left = 0;
1509 1509
1510 /* 1510 /*
1511 * compute fill factor for split pages 1511 * compute fill factor for split pages
1512 * 1512 *
1513 * <nxt> traces the next entry to move to rp 1513 * <nxt> traces the next entry to move to rp
1514 * <off> traces the next entry to stay in sp 1514 * <off> traces the next entry to stay in sp
@@ -1551,7 +1551,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1551 /* <nxt> points to the 1st entry to move */ 1551 /* <nxt> points to the 1st entry to move */
1552 1552
1553 /* 1553 /*
1554 * move entries to right page 1554 * move entries to right page
1555 * 1555 *
1556 * dtMoveEntry() initializes rp and reserves entry for insertion 1556 * dtMoveEntry() initializes rp and reserves entry for insertion
1557 * 1557 *
@@ -1677,7 +1677,7 @@ static int dtExtendPage(tid_t tid,
1677 return (rc); 1677 return (rc);
1678 1678
1679 /* 1679 /*
1680 * extend the extent 1680 * extend the extent
1681 */ 1681 */
1682 pxdlist = split->pxdlist; 1682 pxdlist = split->pxdlist;
1683 pxd = &pxdlist->pxd[pxdlist->npxd]; 1683 pxd = &pxdlist->pxd[pxdlist->npxd];
@@ -1722,7 +1722,7 @@ static int dtExtendPage(tid_t tid,
1722 } 1722 }
1723 1723
1724 /* 1724 /*
1725 * extend the page 1725 * extend the page
1726 */ 1726 */
1727 sp->header.self = *pxd; 1727 sp->header.self = *pxd;
1728 1728
@@ -1739,9 +1739,6 @@ static int dtExtendPage(tid_t tid,
1739 /* update buffer extent descriptor of extended page */ 1739 /* update buffer extent descriptor of extended page */
1740 xlen = lengthPXD(pxd); 1740 xlen = lengthPXD(pxd);
1741 xsize = xlen << JFS_SBI(sb)->l2bsize; 1741 xsize = xlen << JFS_SBI(sb)->l2bsize;
1742#ifdef _STILL_TO_PORT
1743 bmSetXD(smp, xaddr, xsize);
1744#endif /* _STILL_TO_PORT */
1745 1742
1746 /* 1743 /*
1747 * copy old stbl to new stbl at start of extended area 1744 * copy old stbl to new stbl at start of extended area
@@ -1836,7 +1833,7 @@ static int dtExtendPage(tid_t tid,
1836 } 1833 }
1837 1834
1838 /* 1835 /*
1839 * update parent entry on the parent/root page 1836 * update parent entry on the parent/root page
1840 */ 1837 */
1841 /* 1838 /*
1842 * acquire a transaction lock on the parent/root page 1839 * acquire a transaction lock on the parent/root page
@@ -1904,7 +1901,7 @@ static int dtSplitRoot(tid_t tid,
1904 sp = &JFS_IP(ip)->i_dtroot; 1901 sp = &JFS_IP(ip)->i_dtroot;
1905 1902
1906 /* 1903 /*
1907 * allocate/initialize a single (right) child page 1904 * allocate/initialize a single (right) child page
1908 * 1905 *
1909 * N.B. at first split, a one (or two) block to fit new entry 1906 * N.B. at first split, a one (or two) block to fit new entry
1910 * is allocated; at subsequent split, a full page is allocated; 1907 * is allocated; at subsequent split, a full page is allocated;
@@ -1943,7 +1940,7 @@ static int dtSplitRoot(tid_t tid,
1943 rp->header.prev = 0; 1940 rp->header.prev = 0;
1944 1941
1945 /* 1942 /*
1946 * move in-line root page into new right page extent 1943 * move in-line root page into new right page extent
1947 */ 1944 */
1948 /* linelock header + copied entries + new stbl (1st slot) in new page */ 1945 /* linelock header + copied entries + new stbl (1st slot) in new page */
1949 ASSERT(dtlck->index == 0); 1946 ASSERT(dtlck->index == 0);
@@ -2016,7 +2013,7 @@ static int dtSplitRoot(tid_t tid,
2016 dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); 2013 dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2017 2014
2018 /* 2015 /*
2019 * reset parent/root page 2016 * reset parent/root page
2020 * 2017 *
2021 * set the 1st entry offset to 0, which forces the left-most key 2018 * set the 1st entry offset to 0, which forces the left-most key
2022 * at any level of the tree to be less than any search key. 2019 * at any level of the tree to be less than any search key.
@@ -2102,7 +2099,7 @@ int dtDelete(tid_t tid,
2102 dtpage_t *np; 2099 dtpage_t *np;
2103 2100
2104 /* 2101 /*
2105 * search for the entry to delete: 2102 * search for the entry to delete:
2106 * 2103 *
2107 * dtSearch() returns (leaf page pinned, index at which to delete). 2104 * dtSearch() returns (leaf page pinned, index at which to delete).
2108 */ 2105 */
@@ -2253,7 +2250,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2253 int i; 2250 int i;
2254 2251
2255 /* 2252 /*
2256 * keep the root leaf page which has become empty 2253 * keep the root leaf page which has become empty
2257 */ 2254 */
2258 if (BT_IS_ROOT(fmp)) { 2255 if (BT_IS_ROOT(fmp)) {
2259 /* 2256 /*
@@ -2269,7 +2266,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2269 } 2266 }
2270 2267
2271 /* 2268 /*
2272 * free the non-root leaf page 2269 * free the non-root leaf page
2273 */ 2270 */
2274 /* 2271 /*
2275 * acquire a transaction lock on the page 2272 * acquire a transaction lock on the page
@@ -2299,7 +2296,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2299 discard_metapage(fmp); 2296 discard_metapage(fmp);
2300 2297
2301 /* 2298 /*
2302 * propagate page deletion up the directory tree 2299 * propagate page deletion up the directory tree
2303 * 2300 *
2304 * If the delete from the parent page makes it empty, 2301 * If the delete from the parent page makes it empty,
2305 * continue all the way up the tree. 2302 * continue all the way up the tree.
@@ -2440,10 +2437,10 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2440 2437
2441#ifdef _NOTYET 2438#ifdef _NOTYET
2442/* 2439/*
2443 * NAME: dtRelocate() 2440 * NAME: dtRelocate()
2444 * 2441 *
2445 * FUNCTION: relocate dtpage (internal or leaf) of directory; 2442 * FUNCTION: relocate dtpage (internal or leaf) of directory;
2446 * This function is mainly used by defragfs utility. 2443 * This function is mainly used by defragfs utility.
2447 */ 2444 */
2448int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, 2445int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2449 s64 nxaddr) 2446 s64 nxaddr)
@@ -2471,8 +2468,8 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2471 xlen); 2468 xlen);
2472 2469
2473 /* 2470 /*
2474 * 1. get the internal parent dtpage covering 2471 * 1. get the internal parent dtpage covering
2475 * router entry for the target page to be relocated; 2472 * router entry for the target page to be relocated;
2476 */ 2473 */
2477 rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); 2474 rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
2478 if (rc) 2475 if (rc)
@@ -2483,7 +2480,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2483 jfs_info("dtRelocate: parent router entry validated."); 2480 jfs_info("dtRelocate: parent router entry validated.");
2484 2481
2485 /* 2482 /*
2486 * 2. relocate the target dtpage 2483 * 2. relocate the target dtpage
2487 */ 2484 */
2488 /* read in the target page from src extent */ 2485 /* read in the target page from src extent */
2489 DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); 2486 DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
@@ -2581,9 +2578,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2581 2578
2582 /* update the buffer extent descriptor of the dtpage */ 2579 /* update the buffer extent descriptor of the dtpage */
2583 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; 2580 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2584#ifdef _STILL_TO_PORT 2581
2585 bmSetXD(mp, nxaddr, xsize);
2586#endif /* _STILL_TO_PORT */
2587 /* unpin the relocated page */ 2582 /* unpin the relocated page */
2588 DT_PUTPAGE(mp); 2583 DT_PUTPAGE(mp);
2589 jfs_info("dtRelocate: target dtpage relocated."); 2584 jfs_info("dtRelocate: target dtpage relocated.");
@@ -2594,7 +2589,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2594 */ 2589 */
2595 2590
2596 /* 2591 /*
2597 * 3. acquire maplock for the source extent to be freed; 2592 * 3. acquire maplock for the source extent to be freed;
2598 */ 2593 */
2599 /* for dtpage relocation, write a LOG_NOREDOPAGE record 2594 /* for dtpage relocation, write a LOG_NOREDOPAGE record
2600 * for the source dtpage (logredo() will init NoRedoPage 2595 * for the source dtpage (logredo() will init NoRedoPage
@@ -2609,7 +2604,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2609 pxdlock->index = 1; 2604 pxdlock->index = 1;
2610 2605
2611 /* 2606 /*
2612 * 4. update the parent router entry for relocation; 2607 * 4. update the parent router entry for relocation;
2613 * 2608 *
2614 * acquire tlck for the parent entry covering the target dtpage; 2609 * acquire tlck for the parent entry covering the target dtpage;
2615 * write LOG_REDOPAGE to apply after image only; 2610 * write LOG_REDOPAGE to apply after image only;
@@ -2637,7 +2632,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2637 * NAME: dtSearchNode() 2632 * NAME: dtSearchNode()
2638 * 2633 *
2639 * FUNCTION: Search for a dtpage containing a specified address 2634 * FUNCTION: Search for a dtpage containing a specified address
2640 * This function is mainly used by defragfs utility. 2635 * This function is mainly used by defragfs utility.
2641 * 2636 *
2642 * NOTE: Search result on stack, the found page is pinned at exit. 2637 * NOTE: Search result on stack, the found page is pinned at exit.
2643 * The result page must be an internal dtpage. 2638 * The result page must be an internal dtpage.
@@ -2660,7 +2655,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2660 BT_CLR(btstack); /* reset stack */ 2655 BT_CLR(btstack); /* reset stack */
2661 2656
2662 /* 2657 /*
2663 * descend tree to the level with specified leftmost page 2658 * descend tree to the level with specified leftmost page
2664 * 2659 *
2665 * by convention, root bn = 0. 2660 * by convention, root bn = 0.
2666 */ 2661 */
@@ -2699,7 +2694,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2699 } 2694 }
2700 2695
2701 /* 2696 /*
2702 * search each page at the current level 2697 * search each page at the current level
2703 */ 2698 */
2704 loop: 2699 loop:
2705 stbl = DT_GETSTBL(p); 2700 stbl = DT_GETSTBL(p);
@@ -3044,9 +3039,9 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3044 if (DO_INDEX(ip)) { 3039 if (DO_INDEX(ip)) {
3045 /* 3040 /*
3046 * persistent index is stored in directory entries. 3041 * persistent index is stored in directory entries.
3047 * Special cases: 0 = . 3042 * Special cases: 0 = .
3048 * 1 = .. 3043 * 1 = ..
3049 * -1 = End of directory 3044 * -1 = End of directory
3050 */ 3045 */
3051 do_index = 1; 3046 do_index = 1;
3052 3047
@@ -3128,10 +3123,10 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3128 /* 3123 /*
3129 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 3124 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
3130 * 3125 *
3131 * pn = index = 0: First entry "." 3126 * pn = index = 0: First entry "."
3132 * pn = 0; index = 1: Second entry ".." 3127 * pn = 0; index = 1: Second entry ".."
3133 * pn > 0: Real entries, pn=1 -> leftmost page 3128 * pn > 0: Real entries, pn=1 -> leftmost page
3134 * pn = index = -1: No more entries 3129 * pn = index = -1: No more entries
3135 */ 3130 */
3136 dtpos = filp->f_pos; 3131 dtpos = filp->f_pos;
3137 if (dtpos == 0) { 3132 if (dtpos == 0) {
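
For readers unfamiliar with the legacy scheme described in the comment above, f_pos packs a page number and an in-page index; a rough sketch of that layout follows. The struct name legacy_dtpos and the field widths are assumptions made for illustration, not taken from the JFS headers.

struct legacy_dtpos {
	s16 pn;		/* 0 = header page ("." and ".."), >0 = leaf page, -1 = end */
	s16 index;	/* slot within the page; 0 = ".", 1 = "..", -1 = end */
	s32 unused;
};	/* decoded from the 64-bit filp->f_pos value read into dtpos above */
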
@@ -3351,7 +3346,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3351 BT_CLR(btstack); /* reset stack */ 3346 BT_CLR(btstack); /* reset stack */
3352 3347
3353 /* 3348 /*
3354 * descend leftmost path of the tree 3349 * descend leftmost path of the tree
3355 * 3350 *
3356 * by convention, root bn = 0. 3351 * by convention, root bn = 0.
3357 */ 3352 */
@@ -4531,7 +4526,7 @@ int dtModify(tid_t tid, struct inode *ip,
4531 struct ldtentry *entry; 4526 struct ldtentry *entry;
4532 4527
4533 /* 4528 /*
4534 * search for the entry to modify: 4529 * search for the entry to modify:
4535 * 4530 *
4536 * dtSearch() returns (leaf page pinned, index at which to modify). 4531 * dtSearch() returns (leaf page pinned, index at which to modify).
4537 */ 4532 */
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index af8513f78648..8561c6ecece0 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -35,7 +35,7 @@ typedef union {
35 35
36 36
37/* 37/*
38 * entry segment/slot 38 * entry segment/slot
39 * 39 *
40 * an entry consists of type dependent head/only segment/slot and 40 * an entry consists of type dependent head/only segment/slot and
41 * additional segments/slots linked via next field; 41 * additional segments/slots linked via next field;
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index a35bdca6a805..7ae1e3281de9 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -34,8 +34,8 @@ static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
34#endif 34#endif
35static s64 extRoundDown(s64 nb); 35static s64 extRoundDown(s64 nb);
36 36
37#define DPD(a) (printk("(a): %d\n",(a))) 37#define DPD(a) (printk("(a): %d\n",(a)))
38#define DPC(a) (printk("(a): %c\n",(a))) 38#define DPC(a) (printk("(a): %c\n",(a)))
39#define DPL1(a) \ 39#define DPL1(a) \
40{ \ 40{ \
41 if ((a) >> 32) \ 41 if ((a) >> 32) \
@@ -51,19 +51,19 @@ static s64 extRoundDown(s64 nb);
51 printk("(a): %x\n",(a) << 32); \ 51 printk("(a): %x\n",(a) << 32); \
52} 52}
53 53
54#define DPD1(a) (printk("(a): %d ",(a))) 54#define DPD1(a) (printk("(a): %d ",(a)))
55#define DPX(a) (printk("(a): %08x\n",(a))) 55#define DPX(a) (printk("(a): %08x\n",(a)))
56#define DPX1(a) (printk("(a): %08x ",(a))) 56#define DPX1(a) (printk("(a): %08x ",(a)))
57#define DPS(a) (printk("%s\n",(a))) 57#define DPS(a) (printk("%s\n",(a)))
58#define DPE(a) (printk("\nENTERING: %s\n",(a))) 58#define DPE(a) (printk("\nENTERING: %s\n",(a)))
59#define DPE1(a) (printk("\nENTERING: %s",(a))) 59#define DPE1(a) (printk("\nENTERING: %s",(a)))
60#define DPS1(a) (printk(" %s ",(a))) 60#define DPS1(a) (printk(" %s ",(a)))
61 61
62 62
63/* 63/*
64 * NAME: extAlloc() 64 * NAME: extAlloc()
65 * 65 *
66 * FUNCTION: allocate an extent for a specified page range within a 66 * FUNCTION: allocate an extent for a specified page range within a
67 * file. 67 * file.
68 * 68 *
69 * PARAMETERS: 69 * PARAMETERS:
@@ -78,9 +78,9 @@ static s64 extRoundDown(s64 nb);
78 * should be marked as allocated but not recorded. 78 * should be marked as allocated but not recorded.
79 * 79 *
80 * RETURN VALUES: 80 * RETURN VALUES:
81 * 0 - success 81 * 0 - success
82 * -EIO - i/o error. 82 * -EIO - i/o error.
83 * -ENOSPC - insufficient disk resources. 83 * -ENOSPC - insufficient disk resources.
84 */ 84 */
85int 85int
86extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) 86extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
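
All of the allocation routines in this file follow the same 0 / -EIO / -ENOSPC convention listed above, so a caller needs nothing more than the usual pattern (hypothetical sketch, not taken from the patch):

	rc = extAlloc(ip, xlen, pno, &xad, abnr);
	if (rc)
		return rc;	/* -EIO or -ENOSPC propagated unchanged */
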
@@ -192,9 +192,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
192 192
193#ifdef _NOTYET 193#ifdef _NOTYET
194/* 194/*
195 * NAME: extRealloc() 195 * NAME: extRealloc()
196 * 196 *
197 * FUNCTION: extend the allocation of a file extent containing a 197 * FUNCTION: extend the allocation of a file extent containing a
198 * partial back last page. 198 * partial back last page.
199 * 199 *
200 * PARAMETERS: 200 * PARAMETERS:
@@ -207,9 +207,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
207 * should be marked as allocated but not recorded. 207 * should be marked as allocated but not recorded.
208 * 208 *
209 * RETURN VALUES: 209 * RETURN VALUES:
210 * 0 - success 210 * 0 - success
211 * -EIO - i/o error. 211 * -EIO - i/o error.
212 * -ENOSPC - insufficient disk resources. 212 * -ENOSPC - insufficient disk resources.
213 */ 213 */
214int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) 214int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
215{ 215{
@@ -345,9 +345,9 @@ exit:
345 345
346 346
347/* 347/*
348 * NAME: extHint() 348 * NAME: extHint()
349 * 349 *
350 * FUNCTION: produce an extent allocation hint for a file offset. 350 * FUNCTION: produce an extent allocation hint for a file offset.
351 * 351 *
352 * PARAMETERS: 352 * PARAMETERS:
353 * ip - the inode of the file. 353 * ip - the inode of the file.
@@ -356,8 +356,8 @@ exit:
356 * the hint. 356 * the hint.
357 * 357 *
358 * RETURN VALUES: 358 * RETURN VALUES:
359 * 0 - success 359 * 0 - success
360 * -EIO - i/o error. 360 * -EIO - i/o error.
361 */ 361 */
362int extHint(struct inode *ip, s64 offset, xad_t * xp) 362int extHint(struct inode *ip, s64 offset, xad_t * xp)
363{ 363{
@@ -387,7 +387,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
387 lxdl.nlxd = 1; 387 lxdl.nlxd = 1;
388 lxdl.lxd = &lxd; 388 lxdl.lxd = &lxd;
389 LXDoffset(&lxd, prev) 389 LXDoffset(&lxd, prev)
390 LXDlength(&lxd, nbperpage); 390 LXDlength(&lxd, nbperpage);
391 391
392 xadl.maxnxad = 1; 392 xadl.maxnxad = 1;
393 xadl.nxad = 0; 393 xadl.nxad = 0;
@@ -397,11 +397,11 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
397 if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) 397 if ((rc = xtLookupList(ip, &lxdl, &xadl, 0)))
398 return (rc); 398 return (rc);
399 399
400 /* check if not extent exists for the previous page. 400 /* check if no extent exists for the previous page.
401 * this is possible for sparse files. 401 * this is possible for sparse files.
402 */ 402 */
403 if (xadl.nxad == 0) { 403 if (xadl.nxad == 0) {
404// assert(ISSPARSE(ip)); 404// assert(ISSPARSE(ip));
405 return (0); 405 return (0);
406 } 406 }
407 407
@@ -410,28 +410,28 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
410 */ 410 */
411 xp->flag &= XAD_NOTRECORDED; 411 xp->flag &= XAD_NOTRECORDED;
412 412
413 if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { 413 if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) {
414 jfs_error(ip->i_sb, "extHint: corrupt xtree"); 414 jfs_error(ip->i_sb, "extHint: corrupt xtree");
415 return -EIO; 415 return -EIO;
416 } 416 }
417 417
418 return (0); 418 return (0);
419} 419}
420 420
421 421
422/* 422/*
423 * NAME: extRecord() 423 * NAME: extRecord()
424 * 424 *
425 * FUNCTION: change a page within a file from not recorded to recorded. 425 * FUNCTION: change a page within a file from not recorded to recorded.
426 * 426 *
427 * PARAMETERS: 427 * PARAMETERS:
428 * ip - inode of the file. 428 * ip - inode of the file.
429 * cp - cbuf of the file page. 429 * cp - cbuf of the file page.
430 * 430 *
431 * RETURN VALUES: 431 * RETURN VALUES:
432 * 0 - success 432 * 0 - success
433 * -EIO - i/o error. 433 * -EIO - i/o error.
434 * -ENOSPC - insufficient disk resources. 434 * -ENOSPC - insufficient disk resources.
435 */ 435 */
436int extRecord(struct inode *ip, xad_t * xp) 436int extRecord(struct inode *ip, xad_t * xp)
437{ 437{
@@ -451,9 +451,9 @@ int extRecord(struct inode *ip, xad_t * xp)
451 451
452#ifdef _NOTYET 452#ifdef _NOTYET
453/* 453/*
454 * NAME: extFill() 454 * NAME: extFill()
455 * 455 *
456 * FUNCTION: allocate disk space for a file page that represents 456 * FUNCTION: allocate disk space for a file page that represents
457 * a file hole. 457 * a file hole.
458 * 458 *
459 * PARAMETERS: 459 * PARAMETERS:
@@ -461,16 +461,16 @@ int extRecord(struct inode *ip, xad_t * xp)
461 * cp - cbuf of the file page represent the hole. 461 * cp - cbuf of the file page represent the hole.
462 * 462 *
463 * RETURN VALUES: 463 * RETURN VALUES:
464 * 0 - success 464 * 0 - success
465 * -EIO - i/o error. 465 * -EIO - i/o error.
466 * -ENOSPC - insufficient disk resources. 466 * -ENOSPC - insufficient disk resources.
467 */ 467 */
468int extFill(struct inode *ip, xad_t * xp) 468int extFill(struct inode *ip, xad_t * xp)
469{ 469{
470 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; 470 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
471 s64 blkno = offsetXAD(xp) >> ip->i_blkbits; 471 s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
472 472
473// assert(ISSPARSE(ip)); 473// assert(ISSPARSE(ip));
474 474
475 /* initialize the extent allocation hint */ 475 /* initialize the extent allocation hint */
476 XADaddress(xp, 0); 476 XADaddress(xp, 0);
@@ -489,7 +489,7 @@ int extFill(struct inode *ip, xad_t * xp)
489/* 489/*
490 * NAME: extBalloc() 490 * NAME: extBalloc()
491 * 491 *
492 * FUNCTION: allocate disk blocks to form an extent. 492 * FUNCTION: allocate disk blocks to form an extent.
493 * 493 *
494 * initially, we will try to allocate disk blocks for the 494 * initially, we will try to allocate disk blocks for the
495 * requested size (nblocks). if this fails (nblocks 495 * requested size (nblocks). if this fails (nblocks
@@ -513,9 +513,9 @@ int extFill(struct inode *ip, xad_t * xp)
513 * allocated block range. 513 * allocated block range.
514 * 514 *
515 * RETURN VALUES: 515 * RETURN VALUES:
516 * 0 - success 516 * 0 - success
517 * -EIO - i/o error. 517 * -EIO - i/o error.
518 * -ENOSPC - insufficient disk resources. 518 * -ENOSPC - insufficient disk resources.
519 */ 519 */
520static int 520static int
521extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) 521extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
@@ -580,7 +580,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
580/* 580/*
581 * NAME: extBrealloc() 581 * NAME: extBrealloc()
582 * 582 *
583 * FUNCTION: attempt to extend an extent's allocation. 583 * FUNCTION: attempt to extend an extent's allocation.
584 * 584 *
585 * Initially, we will try to extend the extent's allocation 585 * Initially, we will try to extend the extent's allocation
586 * in place. If this fails, we'll try to move the extent 586 * in place. If this fails, we'll try to move the extent
@@ -597,8 +597,8 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
597 * 597 *
598 * PARAMETERS: 598 * PARAMETERS:
599 * ip - the inode of the file. 599 * ip - the inode of the file.
600 * blkno - starting block number of the extents current allocation. 600 * blkno - starting block number of the extents current allocation.
601 * nblks - number of blocks within the extents current allocation. 601 * nblks - number of blocks within the extents current allocation.
602 * newnblks - pointer to a s64 value. on entry, this value is the 602 * newnblks - pointer to a s64 value. on entry, this value is the
603 * the new desired extent size (number of blocks). on 603 * the new desired extent size (number of blocks). on
604 * successful exit, this value is set to the extent's actual 604 * successful exit, this value is set to the extent's actual
@@ -606,9 +606,9 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
606 * newblkno - the starting block number of the extents new allocation. 606 * newblkno - the starting block number of the extents new allocation.
607 * 607 *
608 * RETURN VALUES: 608 * RETURN VALUES:
609 * 0 - success 609 * 0 - success
610 * -EIO - i/o error. 610 * -EIO - i/o error.
611 * -ENOSPC - insufficient disk resources. 611 * -ENOSPC - insufficient disk resources.
612 */ 612 */
613static int 613static int
614extBrealloc(struct inode *ip, 614extBrealloc(struct inode *ip,
@@ -634,16 +634,16 @@ extBrealloc(struct inode *ip,
634 634
635 635
636/* 636/*
637 * NAME: extRoundDown() 637 * NAME: extRoundDown()
638 * 638 *
639 * FUNCTION: round down a specified number of blocks to the next 639 * FUNCTION: round down a specified number of blocks to the next
640 * smallest power of 2 number. 640 * smallest power of 2 number.
641 * 641 *
642 * PARAMETERS: 642 * PARAMETERS:
643 * nb - the number of blocks to round down. 643 * nb - the number of blocks to round down.
644 * 644 *
645 * RETURN VALUES: 645 * RETURN VALUES:
646 * next smallest power of 2 number. 646 * next smallest power of 2 number.
647 */ 647 */
648static s64 extRoundDown(s64 nb) 648static s64 extRoundDown(s64 nb)
649{ 649{
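
The body of extRoundDown() is outside this hunk; purely as a reference, "round down to the next smallest power of 2" can be computed as below. round_down_pow2 is an illustrative stand-in, not the code JFS actually uses.

static s64 round_down_pow2(s64 nb)
{
	u64 k = 1;

	/* keep doubling while the doubled value still fits in nb */
	while ((k << 1) <= (u64) nb)
		k <<= 1;
	return (s64) k;	/* largest power of two <= nb, for nb >= 1 */
}
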
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 38f70ac03bec..b3f5463fbe52 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -34,9 +34,9 @@
34#define JFS_UNICODE 0x00000001 /* unicode name */ 34#define JFS_UNICODE 0x00000001 /* unicode name */
35 35
36/* mount time flags for error handling */ 36/* mount time flags for error handling */
37#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ 37#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */
38#define JFS_ERR_CONTINUE 0x00000004 /* continue */ 38#define JFS_ERR_CONTINUE 0x00000004 /* continue */
39#define JFS_ERR_PANIC 0x00000008 /* panic */ 39#define JFS_ERR_PANIC 0x00000008 /* panic */
40 40
41/* Quota support */ 41/* Quota support */
42#define JFS_USRQUOTA 0x00000010 42#define JFS_USRQUOTA 0x00000010
@@ -83,7 +83,6 @@
83/* case-insensitive name/directory support */ 83/* case-insensitive name/directory support */
84 84
85#define JFS_AIX 0x80000000 /* AIX support */ 85#define JFS_AIX 0x80000000 /* AIX support */
86/* POSIX name/directory support - Never implemented*/
87 86
88/* 87/*
89 * buffer cache configuration 88 * buffer cache configuration
@@ -113,10 +112,10 @@
113#define IDATASIZE 256 /* inode inline data size */ 112#define IDATASIZE 256 /* inode inline data size */
114#define IXATTRSIZE 128 /* inode inline extended attribute size */ 113#define IXATTRSIZE 128 /* inode inline extended attribute size */
115 114
116#define XTPAGE_SIZE 4096 115#define XTPAGE_SIZE 4096
117#define log2_PAGESIZE 12 116#define log2_PAGESIZE 12
118 117
119#define IAG_SIZE 4096 118#define IAG_SIZE 4096
120#define IAG_EXTENT_SIZE 4096 119#define IAG_EXTENT_SIZE 4096
121#define INOSPERIAG 4096 /* number of disk inodes per iag */ 120#define INOSPERIAG 4096 /* number of disk inodes per iag */
122#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ 121#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */
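
The l2 constants above are simply base-2 logarithms of their counterparts, so the values can be sanity-checked at a glance (illustrative only):

/*
 *   XTPAGE_SIZE = 1 << log2_PAGESIZE   ->  4096 = 1 << 12
 *   INOSPERIAG  = 1 << L2INOSPERIAG    ->  4096 = 1 << 12
 */
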
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index c6530227cda6..3870ba8b9086 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -93,21 +93,21 @@ static int copy_from_dinode(struct dinode *, struct inode *);
93static void copy_to_dinode(struct dinode *, struct inode *); 93static void copy_to_dinode(struct dinode *, struct inode *);
94 94
95/* 95/*
96 * NAME: diMount() 96 * NAME: diMount()
97 * 97 *
98 * FUNCTION: initialize the incore inode map control structures for 98 * FUNCTION: initialize the incore inode map control structures for
99 * a fileset or aggregate init time. 99 * a fileset or aggregate init time.
100 * 100 *
101 * the inode map's control structure (dinomap) is 101 * the inode map's control structure (dinomap) is
102 * brought in from disk and placed in virtual memory. 102 * brought in from disk and placed in virtual memory.
103 * 103 *
104 * PARAMETERS: 104 * PARAMETERS:
105 * ipimap - pointer to inode map inode for the aggregate or fileset. 105 * ipimap - pointer to inode map inode for the aggregate or fileset.
106 * 106 *
107 * RETURN VALUES: 107 * RETURN VALUES:
108 * 0 - success 108 * 0 - success
109 * -ENOMEM - insufficient free virtual memory. 109 * -ENOMEM - insufficient free virtual memory.
110 * -EIO - i/o error. 110 * -EIO - i/o error.
111 */ 111 */
112int diMount(struct inode *ipimap) 112int diMount(struct inode *ipimap)
113{ 113{
@@ -180,18 +180,18 @@ int diMount(struct inode *ipimap)
180 180
181 181
182/* 182/*
183 * NAME: diUnmount() 183 * NAME: diUnmount()
184 * 184 *
185 * FUNCTION: write to disk the incore inode map control structures for 185 * FUNCTION: write to disk the incore inode map control structures for
186 * a fileset or aggregate at unmount time. 186 * a fileset or aggregate at unmount time.
187 * 187 *
188 * PARAMETERS: 188 * PARAMETERS:
189 * ipimap - pointer to inode map inode for the aggregate or fileset. 189 * ipimap - pointer to inode map inode for the aggregate or fileset.
190 * 190 *
191 * RETURN VALUES: 191 * RETURN VALUES:
192 * 0 - success 192 * 0 - success
193 * -ENOMEM - insufficient free virtual memory. 193 * -ENOMEM - insufficient free virtual memory.
194 * -EIO - i/o error. 194 * -EIO - i/o error.
195 */ 195 */
196int diUnmount(struct inode *ipimap, int mounterror) 196int diUnmount(struct inode *ipimap, int mounterror)
197{ 197{
@@ -274,9 +274,9 @@ int diSync(struct inode *ipimap)
274 274
275 275
276/* 276/*
277 * NAME: diRead() 277 * NAME: diRead()
278 * 278 *
279 * FUNCTION: initialize an incore inode from disk. 279 * FUNCTION: initialize an incore inode from disk.
280 * 280 *
281 * on entry, the specified incore inode should itself 281 * on entry, the specified incore inode should itself
282 * specify the disk inode number corresponding to the 282 * specify the disk inode number corresponding to the
@@ -285,7 +285,7 @@ int diSync(struct inode *ipimap)
285 * this routine handles incore inode initialization for 285 * this routine handles incore inode initialization for
286 * both "special" and "regular" inodes. special inodes 286 * both "special" and "regular" inodes. special inodes
287 * are those required early in the mount process and 287 * are those required early in the mount process and
288 * require special handling since much of the file system 288 * require special handling since much of the file system
289 * is not yet initialized. these "special" inodes are 289 * is not yet initialized. these "special" inodes are
290 * identified by a NULL inode map inode pointer and are 290 * identified by a NULL inode map inode pointer and are
291 * actually initialized by a call to diReadSpecial(). 291 * actually initialized by a call to diReadSpecial().
@@ -298,12 +298,12 @@ int diSync(struct inode *ipimap)
298 * incore inode. 298 * incore inode.
299 * 299 *
300 * PARAMETERS: 300 * PARAMETERS:
301 * ip - pointer to incore inode to be initialized from disk. 301 * ip - pointer to incore inode to be initialized from disk.
302 * 302 *
303 * RETURN VALUES: 303 * RETURN VALUES:
304 * 0 - success 304 * 0 - success
305 * -EIO - i/o error. 305 * -EIO - i/o error.
306 * -ENOMEM - insufficient memory 306 * -ENOMEM - insufficient memory
307 * 307 *
308 */ 308 */
309int diRead(struct inode *ip) 309int diRead(struct inode *ip)
@@ -410,26 +410,26 @@ int diRead(struct inode *ip)
410 410
411 411
412/* 412/*
413 * NAME: diReadSpecial() 413 * NAME: diReadSpecial()
414 * 414 *
415 * FUNCTION: initialize a 'special' inode from disk. 415 * FUNCTION: initialize a 'special' inode from disk.
416 * 416 *
417 * this routine handles aggregate level inodes. The 417 * this routine handles aggregate level inodes. The
418 * inode cache cannot differentiate between the 418 * inode cache cannot differentiate between the
419 * aggregate inodes and the filesystem inodes, so we 419 * aggregate inodes and the filesystem inodes, so we
420 * handle these here. We don't actually use the aggregate 420 * handle these here. We don't actually use the aggregate
421 * inode map, since these inodes are at a fixed location 421 * inode map, since these inodes are at a fixed location
422 * and in some cases the aggregate inode map isn't initialized 422 * and in some cases the aggregate inode map isn't initialized
423 * yet. 423 * yet.
424 * 424 *
425 * PARAMETERS: 425 * PARAMETERS:
426 * sb - filesystem superblock 426 * sb - filesystem superblock
427 * inum - aggregate inode number 427 * inum - aggregate inode number
428 * secondary - 1 if secondary aggregate inode table 428 * secondary - 1 if secondary aggregate inode table
429 * 429 *
430 * RETURN VALUES: 430 * RETURN VALUES:
431 * new inode - success 431 * new inode - success
432 * NULL - i/o error. 432 * NULL - i/o error.
433 */ 433 */
434struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) 434struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
435{ 435{
@@ -502,12 +502,12 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
502} 502}
503 503
504/* 504/*
505 * NAME: diWriteSpecial() 505 * NAME: diWriteSpecial()
506 * 506 *
507 * FUNCTION: Write the special inode to disk 507 * FUNCTION: Write the special inode to disk
508 * 508 *
509 * PARAMETERS: 509 * PARAMETERS:
510 * ip - special inode 510 * ip - special inode
511 * secondary - 1 if secondary aggregate inode table 511 * secondary - 1 if secondary aggregate inode table
512 * 512 *
513 * RETURN VALUES: none 513 * RETURN VALUES: none
@@ -554,9 +554,9 @@ void diWriteSpecial(struct inode *ip, int secondary)
554} 554}
555 555
556/* 556/*
557 * NAME: diFreeSpecial() 557 * NAME: diFreeSpecial()
558 * 558 *
559 * FUNCTION: Free allocated space for special inode 559 * FUNCTION: Free allocated space for special inode
560 */ 560 */
561void diFreeSpecial(struct inode *ip) 561void diFreeSpecial(struct inode *ip)
562{ 562{
@@ -572,9 +572,9 @@ void diFreeSpecial(struct inode *ip)
572 572
573 573
574/* 574/*
575 * NAME: diWrite() 575 * NAME: diWrite()
576 * 576 *
577 * FUNCTION: write the on-disk inode portion of the in-memory inode 577 * FUNCTION: write the on-disk inode portion of the in-memory inode
578 * to its corresponding on-disk inode. 578 * to its corresponding on-disk inode.
579 * 579 *
580 * on entry, the specified incore inode should itself 580 * on entry, the specified incore inode should itself
@@ -589,11 +589,11 @@ void diFreeSpecial(struct inode *ip)
589 * 589 *
590 * PARAMETERS: 590 * PARAMETERS:
591 * tid - transaction id 591 * tid - transaction id
592 * ip - pointer to incore inode to be written to the inode extent. 592 * ip - pointer to incore inode to be written to the inode extent.
593 * 593 *
594 * RETURN VALUES: 594 * RETURN VALUES:
595 * 0 - success 595 * 0 - success
596 * -EIO - i/o error. 596 * -EIO - i/o error.
597 */ 597 */
598int diWrite(tid_t tid, struct inode *ip) 598int diWrite(tid_t tid, struct inode *ip)
599{ 599{
@@ -730,7 +730,7 @@ int diWrite(tid_t tid, struct inode *ip)
730 ilinelock = (struct linelock *) & tlck->lock; 730 ilinelock = (struct linelock *) & tlck->lock;
731 731
732 /* 732 /*
733 * regular file: 16 byte (XAD slot) granularity 733 * regular file: 16 byte (XAD slot) granularity
734 */ 734 */
735 if (type & tlckXTREE) { 735 if (type & tlckXTREE) {
736 xtpage_t *p, *xp; 736 xtpage_t *p, *xp;
@@ -755,7 +755,7 @@ int diWrite(tid_t tid, struct inode *ip)
755 xad->flag &= ~(XAD_NEW | XAD_EXTENDED); 755 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
756 } 756 }
757 /* 757 /*
758 * directory: 32 byte (directory entry slot) granularity 758 * directory: 32 byte (directory entry slot) granularity
759 */ 759 */
760 else if (type & tlckDTREE) { 760 else if (type & tlckDTREE) {
761 dtpage_t *p, *xp; 761 dtpage_t *p, *xp;
@@ -800,9 +800,8 @@ int diWrite(tid_t tid, struct inode *ip)
800 } 800 }
801 801
802 /* 802 /*
803 * lock/copy inode base: 128 byte slot granularity 803 * lock/copy inode base: 128 byte slot granularity
804 */ 804 */
805// baseDinode:
806 lv = & dilinelock->lv[dilinelock->index]; 805 lv = & dilinelock->lv[dilinelock->index];
807 lv->offset = dioffset >> L2INODESLOTSIZE; 806 lv->offset = dioffset >> L2INODESLOTSIZE;
808 copy_to_dinode(dp, ip); 807 copy_to_dinode(dp, ip);
@@ -813,17 +812,6 @@ int diWrite(tid_t tid, struct inode *ip)
813 lv->length = 1; 812 lv->length = 1;
814 dilinelock->index++; 813 dilinelock->index++;
815 814
816#ifdef _JFS_FASTDASD
817 /*
818 * We aren't logging changes to the DASD used in directory inodes,
819 * but we need to write them to disk. If we don't unmount cleanly,
820 * mount will recalculate the DASD used.
821 */
822 if (S_ISDIR(ip->i_mode)
823 && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED))
824 memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd));
825#endif /* _JFS_FASTDASD */
826
827 /* release the buffer holding the updated on-disk inode. 815 /* release the buffer holding the updated on-disk inode.
828 * the buffer will be later written by commit processing. 816 * the buffer will be later written by commit processing.
829 */ 817 */
@@ -834,9 +822,9 @@ int diWrite(tid_t tid, struct inode *ip)
834 822
835 823
836/* 824/*
837 * NAME: diFree(ip) 825 * NAME: diFree(ip)
838 * 826 *
839 * FUNCTION: free a specified inode from the inode working map 827 * FUNCTION: free a specified inode from the inode working map
840 * for a fileset or aggregate. 828 * for a fileset or aggregate.
841 * 829 *
842 * if the inode to be freed represents the first (only) 830 * if the inode to be freed represents the first (only)
@@ -865,11 +853,11 @@ int diWrite(tid_t tid, struct inode *ip)
865 * any updates and are held until all updates are complete. 853 * any updates and are held until all updates are complete.
866 * 854 *
867 * PARAMETERS: 855 * PARAMETERS:
868 * ip - inode to be freed. 856 * ip - inode to be freed.
869 * 857 *
870 * RETURN VALUES: 858 * RETURN VALUES:
871 * 0 - success 859 * 0 - success
872 * -EIO - i/o error. 860 * -EIO - i/o error.
873 */ 861 */
874int diFree(struct inode *ip) 862int diFree(struct inode *ip)
875{ 863{
@@ -902,7 +890,8 @@ int diFree(struct inode *ip)
902 * the map. 890 * the map.
903 */ 891 */
904 if (iagno >= imap->im_nextiag) { 892 if (iagno >= imap->im_nextiag) {
905 dump_mem("imap", imap, 32); 893 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
894 imap, 32, 0);
906 jfs_error(ip->i_sb, 895 jfs_error(ip->i_sb,
907 "diFree: inum = %d, iagno = %d, nextiag = %d", 896 "diFree: inum = %d, iagno = %d, nextiag = %d",
908 (uint) inum, iagno, imap->im_nextiag); 897 (uint) inum, iagno, imap->im_nextiag);
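
For context (not part of the diff itself): dump_mem() was a JFS-private debugging helper, and the hunk above switches to the kernel's generic hex-dump routine. To the best of my recollection its signature in kernels of this era is the one below, which matches the arguments used above: 16 bytes per row, 4-byte groups, and no trailing ASCII column.

void print_hex_dump(const char *level, const char *prefix_str,
		    int prefix_type, int rowsize, int groupsize,
		    const void *buf, size_t len, bool ascii);

/* as used above */
print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, imap, 32, 0);
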
@@ -964,8 +953,8 @@ int diFree(struct inode *ip)
964 return -EIO; 953 return -EIO;
965 } 954 }
966 /* 955 /*
967 * inode extent still has some inodes or below low water mark: 956 * inode extent still has some inodes or below low water mark:
968 * keep the inode extent; 957 * keep the inode extent;
969 */ 958 */
970 if (bitmap || 959 if (bitmap ||
971 imap->im_agctl[agno].numfree < 96 || 960 imap->im_agctl[agno].numfree < 96 ||
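
Read concretely (the numbers come from the code, the scenario is illustrative): if other inodes of the same extent are still in use (bitmap != 0), or the allocation group has fewer than 96 free inodes left (the "low water mark"), the extent is kept backed so the next allocation in that group stays cheap, rather than being released and reallocated.
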
@@ -1047,12 +1036,12 @@ int diFree(struct inode *ip)
1047 1036
1048 1037
1049 /* 1038 /*
1050 * inode extent has become free and above low water mark: 1039 * inode extent has become free and above low water mark:
1051 * free the inode extent; 1040 * free the inode extent;
1052 */ 1041 */
1053 1042
1054 /* 1043 /*
1055 * prepare to update iag list(s) (careful update step 1) 1044 * prepare to update iag list(s) (careful update step 1)
1056 */ 1045 */
1057 amp = bmp = cmp = dmp = NULL; 1046 amp = bmp = cmp = dmp = NULL;
1058 fwd = back = -1; 1047 fwd = back = -1;
@@ -1152,7 +1141,7 @@ int diFree(struct inode *ip)
1152 invalidate_pxd_metapages(ip, freepxd); 1141 invalidate_pxd_metapages(ip, freepxd);
1153 1142
1154 /* 1143 /*
1155 * update iag list(s) (careful update step 2) 1144 * update iag list(s) (careful update step 2)
1156 */ 1145 */
1157 /* add the iag to the ag extent free list if this is the 1146 /* add the iag to the ag extent free list if this is the
1158 * first free extent for the iag. 1147 * first free extent for the iag.
@@ -1338,20 +1327,20 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
1338 1327
1339 1328
1340/* 1329/*
1341 * NAME: diAlloc(pip,dir,ip) 1330 * NAME: diAlloc(pip,dir,ip)
1342 * 1331 *
1343 * FUNCTION: allocate a disk inode from the inode working map 1332 * FUNCTION: allocate a disk inode from the inode working map
1344 * for a fileset or aggregate. 1333 * for a fileset or aggregate.
1345 * 1334 *
1346 * PARAMETERS: 1335 * PARAMETERS:
1347 * pip - pointer to incore inode for the parent inode. 1336 * pip - pointer to incore inode for the parent inode.
1348 * dir - 'true' if the new disk inode is for a directory. 1337 * dir - 'true' if the new disk inode is for a directory.
1349 * ip - pointer to a new inode 1338 * ip - pointer to a new inode
1350 * 1339 *
1351 * RETURN VALUES: 1340 * RETURN VALUES:
1352 * 0 - success. 1341 * 0 - success.
1353 * -ENOSPC - insufficient disk resources. 1342 * -ENOSPC - insufficient disk resources.
1354 * -EIO - i/o error. 1343 * -EIO - i/o error.
1355 */ 1344 */
1356int diAlloc(struct inode *pip, bool dir, struct inode *ip) 1345int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1357{ 1346{
@@ -1433,7 +1422,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1433 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); 1422 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
1434 1423
1435 /* 1424 /*
1436 * try to allocate from the IAG 1425 * try to allocate from the IAG
1437 */ 1426 */
1438 /* check if the inode may be allocated from the iag 1427 /* check if the inode may be allocated from the iag
1439 * (i.e. the inode has free inodes or new extent can be added). 1428 * (i.e. the inode has free inodes or new extent can be added).
@@ -1633,9 +1622,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1633 1622
1634 1623
1635/* 1624/*
1636 * NAME: diAllocAG(imap,agno,dir,ip) 1625 * NAME: diAllocAG(imap,agno,dir,ip)
1637 * 1626 *
1638 * FUNCTION: allocate a disk inode from the allocation group. 1627 * FUNCTION: allocate a disk inode from the allocation group.
1639 * 1628 *
1640 * this routine first determines if a new extent of free 1629 * this routine first determines if a new extent of free
1641 * inodes should be added for the allocation group, with 1630 * inodes should be added for the allocation group, with
@@ -1649,17 +1638,17 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1649 * PRE CONDITION: Already have the AG lock for this AG. 1638 * PRE CONDITION: Already have the AG lock for this AG.
1650 * 1639 *
1651 * PARAMETERS: 1640 * PARAMETERS:
1652 * imap - pointer to inode map control structure. 1641 * imap - pointer to inode map control structure.
1653 * agno - allocation group to allocate from. 1642 * agno - allocation group to allocate from.
1654 * dir - 'true' if the new disk inode is for a directory. 1643 * dir - 'true' if the new disk inode is for a directory.
1655 * ip - pointer to the new inode to be filled in on successful return 1644 * ip - pointer to the new inode to be filled in on successful return
1656 * with the disk inode number allocated, its extent address 1645 * with the disk inode number allocated, its extent address
1657 * and the start of the ag. 1646 * and the start of the ag.
1658 * 1647 *
1659 * RETURN VALUES: 1648 * RETURN VALUES:
1660 * 0 - success. 1649 * 0 - success.
1661 * -ENOSPC - insufficient disk resources. 1650 * -ENOSPC - insufficient disk resources.
1662 * -EIO - i/o error. 1651 * -EIO - i/o error.
1663 */ 1652 */
1664static int 1653static int
1665diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) 1654diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
@@ -1709,9 +1698,9 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1709 1698
1710 1699
1711/* 1700/*
1712 * NAME: diAllocAny(imap,agno,dir,iap) 1701 * NAME: diAllocAny(imap,agno,dir,iap)
1713 * 1702 *
1714 * FUNCTION: allocate a disk inode from any other allocation group. 1703 * FUNCTION: allocate a disk inode from any other allocation group.
1715 * 1704 *
1716 * this routine is called when an allocation attempt within 1705 * this routine is called when an allocation attempt within
1717 * the primary allocation group has failed. if attempts to 1706 * the primary allocation group has failed. if attempts to
@@ -1719,17 +1708,17 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1719 * specified primary group. 1708 * specified primary group.
1720 * 1709 *
1721 * PARAMETERS: 1710 * PARAMETERS:
1722 * imap - pointer to inode map control structure. 1711 * imap - pointer to inode map control structure.
1723 * agno - primary allocation group (to avoid). 1712 * agno - primary allocation group (to avoid).
1724 * dir - 'true' if the new disk inode is for a directory. 1713 * dir - 'true' if the new disk inode is for a directory.
1725 * ip - pointer to a new inode to be filled in on successful return 1714 * ip - pointer to a new inode to be filled in on successful return
1726 * with the disk inode number allocated, its extent address 1715 * with the disk inode number allocated, its extent address
1727 * and the start of the ag. 1716 * and the start of the ag.
1728 * 1717 *
1729 * RETURN VALUES: 1718 * RETURN VALUES:
1730 * 0 - success. 1719 * 0 - success.
1731 * -ENOSPC - insufficient disk resources. 1720 * -ENOSPC - insufficient disk resources.
1732 * -EIO - i/o error. 1721 * -EIO - i/o error.
1733 */ 1722 */
1734static int 1723static int
1735diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) 1724diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
@@ -1772,9 +1761,9 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
1772 1761
1773 1762
1774/* 1763/*
1775 * NAME: diAllocIno(imap,agno,ip) 1764 * NAME: diAllocIno(imap,agno,ip)
1776 * 1765 *
1777 * FUNCTION: allocate a disk inode from the allocation group's free 1766 * FUNCTION: allocate a disk inode from the allocation group's free
1778 * inode list, returning an error if this free list is 1767 * inode list, returning an error if this free list is
1779 * empty (i.e. no iags on the list). 1768 * empty (i.e. no iags on the list).
1780 * 1769 *
@@ -1785,16 +1774,16 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
1785 * PRE CONDITION: Already have AG lock for this AG. 1774 * PRE CONDITION: Already have AG lock for this AG.
1786 * 1775 *
1787 * PARAMETERS: 1776 * PARAMETERS:
1788 * imap - pointer to inode map control structure. 1777 * imap - pointer to inode map control structure.
1789 * agno - allocation group. 1778 * agno - allocation group.
1790 * ip - pointer to new inode to be filled in on successful return 1779 * ip - pointer to new inode to be filled in on successful return
1791 * with the disk inode number allocated, its extent address 1780 * with the disk inode number allocated, its extent address
1792 * and the start of the ag. 1781 * and the start of the ag.
1793 * 1782 *
1794 * RETURN VALUES: 1783 * RETURN VALUES:
1795 * 0 - success. 1784 * 0 - success.
1796 * -ENOSPC - insufficient disk resources. 1785 * -ENOSPC - insufficient disk resources.
1797 * -EIO - i/o error. 1786 * -EIO - i/o error.
1798 */ 1787 */
1799static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) 1788static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1800{ 1789{
@@ -1890,7 +1879,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1890 1879
1891 1880
1892/* 1881/*
1893 * NAME: diAllocExt(imap,agno,ip) 1882 * NAME: diAllocExt(imap,agno,ip)
1894 * 1883 *
1895 * FUNCTION: add a new extent of free inodes to an iag, allocating 1884 * FUNCTION: add a new extent of free inodes to an iag, allocating
1896 * an inode from this extent to satisfy the current allocation 1885 * an inode from this extent to satisfy the current allocation
@@ -1910,16 +1899,16 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1910 * for the purpose of satisfying this request. 1899 * for the purpose of satisfying this request.
1911 * 1900 *
1912 * PARAMETERS: 1901 * PARAMETERS:
1913 * imap - pointer to inode map control structure. 1902 * imap - pointer to inode map control structure.
1914 * agno - allocation group number. 1903 * agno - allocation group number.
1915 * ip - pointer to new inode to be filled in on successful return 1904 * ip - pointer to new inode to be filled in on successful return
1916 * with the disk inode number allocated, its extent address 1905 * with the disk inode number allocated, its extent address
1917 * and the start of the ag. 1906 * and the start of the ag.
1918 * 1907 *
1919 * RETURN VALUES: 1908 * RETURN VALUES:
1920 * 0 - success. 1909 * 0 - success.
1921 * -ENOSPC - insufficient disk resources. 1910 * -ENOSPC - insufficient disk resources.
1922 * -EIO - i/o error. 1911 * -EIO - i/o error.
1923 */ 1912 */
1924static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) 1913static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1925{ 1914{
@@ -2010,7 +1999,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
2010 1999
2011 2000
2012/* 2001/*
2013 * NAME: diAllocBit(imap,iagp,ino) 2002 * NAME: diAllocBit(imap,iagp,ino)
2014 * 2003 *
2015 * FUNCTION: allocate a backed inode from an iag. 2004 * FUNCTION: allocate a backed inode from an iag.
2016 * 2005 *
@@ -2030,14 +2019,14 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
2030 * this AG. Must have read lock on imap inode. 2019 * this AG. Must have read lock on imap inode.
2031 * 2020 *
2032 * PARAMETERS: 2021 * PARAMETERS:
2033 * imap - pointer to inode map control structure. 2022 * imap - pointer to inode map control structure.
2034 * iagp - pointer to iag. 2023 * iagp - pointer to iag.
2035 * ino - inode number to be allocated within the iag. 2024 * ino - inode number to be allocated within the iag.
2036 * 2025 *
2037 * RETURN VALUES: 2026 * RETURN VALUES:
2038 * 0 - success. 2027 * 0 - success.
2039 * -ENOSPC - insufficient disk resources. 2028 * -ENOSPC - insufficient disk resources.
2040 * -EIO - i/o error. 2029 * -EIO - i/o error.
2041 */ 2030 */
2042static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) 2031static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2043{ 2032{
@@ -2144,11 +2133,11 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2144 2133
2145 2134
2146/* 2135/*
2147 * NAME: diNewExt(imap,iagp,extno) 2136 * NAME: diNewExt(imap,iagp,extno)
2148 * 2137 *
2149 * FUNCTION: initialize a new extent of inodes for an iag, allocating 2138 * FUNCTION: initialize a new extent of inodes for an iag, allocating
2150 * the first inode of the extent for use for the current 2139 * the first inode of the extent for use for the current
2151 * allocation request. 2140 * allocation request.
2152 * 2141 *
2153 * disk resources are allocated for the new extent of inodes 2142 * disk resources are allocated for the new extent of inodes
2154 * and the inodes themselves are initialized to reflect their 2143 * and the inodes themselves are initialized to reflect their
@@ -2177,14 +2166,14 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2177 * this AG. Must have read lock on imap inode. 2166 * this AG. Must have read lock on imap inode.
2178 * 2167 *
2179 * PARAMETERS: 2168 * PARAMETERS:
2180 * imap - pointer to inode map control structure. 2169 * imap - pointer to inode map control structure.
2181 * iagp - pointer to iag. 2170 * iagp - pointer to iag.
2182 * extno - extent number. 2171 * extno - extent number.
2183 * 2172 *
2184 * RETURN VALUES: 2173 * RETURN VALUES:
2185 * 0 - success. 2174 * 0 - success.
2186 * -ENOSPC - insufficient disk resources. 2175 * -ENOSPC - insufficient disk resources.
2187 * -EIO - i/o error. 2176 * -EIO - i/o error.
2188 */ 2177 */
2189static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) 2178static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2190{ 2179{
@@ -2430,7 +2419,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2430 2419
2431 2420
2432/* 2421/*
2433 * NAME: diNewIAG(imap,iagnop,agno) 2422 * NAME: diNewIAG(imap,iagnop,agno)
2434 * 2423 *
2435 * FUNCTION: allocate a new iag for an allocation group. 2424 * FUNCTION: allocate a new iag for an allocation group.
2436 * 2425 *
@@ -2443,16 +2432,16 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2443 * and returned to satisfy the request. 2432 * and returned to satisfy the request.
2444 * 2433 *
2445 * PARAMETERS: 2434 * PARAMETERS:
2446 * imap - pointer to inode map control structure. 2435 * imap - pointer to inode map control structure.
2447 * iagnop - pointer to an iag number set with the number of the 2436 * iagnop - pointer to an iag number set with the number of the
2448 * newly allocated iag upon successful return. 2437 * newly allocated iag upon successful return.
2449 * agno - allocation group number. 2438 * agno - allocation group number.
2450 * bpp - Buffer pointer to be filled in with new IAG's buffer 2439 * bpp - Buffer pointer to be filled in with new IAG's buffer
2451 * 2440 *
2452 * RETURN VALUES: 2441 * RETURN VALUES:
2453 * 0 - success. 2442 * 0 - success.
2454 * -ENOSPC - insufficient disk resources. 2443 * -ENOSPC - insufficient disk resources.
2455 * -EIO - i/o error. 2444 * -EIO - i/o error.
2456 * 2445 *
2457 * serialization: 2446 * serialization:
2458 * AG lock held on entry/exit; 2447 * AG lock held on entry/exit;
@@ -2461,7 +2450,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2461 * 2450 *
2462 * note: new iag transaction: 2451 * note: new iag transaction:
2463 * . synchronously write iag; 2452 * . synchronously write iag;
2464 * . write log of xtree and inode of imap; 2453 * . write log of xtree and inode of imap;
2465 * . commit; 2454 * . commit;
2466 * . synchronous write of xtree (right to left, bottom to top); 2455 * . synchronous write of xtree (right to left, bottom to top);
2467 * . at start of logredo(): init in-memory imap with one additional iag page; 2456 * . at start of logredo(): init in-memory imap with one additional iag page;
@@ -2481,9 +2470,6 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2481 s64 xaddr = 0; 2470 s64 xaddr = 0;
2482 s64 blkno; 2471 s64 blkno;
2483 tid_t tid; 2472 tid_t tid;
2484#ifdef _STILL_TO_PORT
2485 xad_t xad;
2486#endif /* _STILL_TO_PORT */
2487 struct inode *iplist[1]; 2473 struct inode *iplist[1];
2488 2474
2489 /* pick up pointers to the inode map and mount inodes */ 2475 /* pick up pointers to the inode map and mount inodes */
@@ -2674,15 +2660,15 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2674} 2660}
2675 2661
2676/* 2662/*
2677 * NAME: diIAGRead() 2663 * NAME: diIAGRead()
2678 * 2664 *
2679 * FUNCTION: get the buffer for the specified iag within a fileset 2665 * FUNCTION: get the buffer for the specified iag within a fileset
2680 * or aggregate inode map. 2666 * or aggregate inode map.
2681 * 2667 *
2682 * PARAMETERS: 2668 * PARAMETERS:
2683 * imap - pointer to inode map control structure. 2669 * imap - pointer to inode map control structure.
2684 * iagno - iag number. 2670 * iagno - iag number.
2685 * bpp - point to buffer pointer to be filled in on successful 2671 * bpp - point to buffer pointer to be filled in on successful
2686 * exit. 2672 * exit.
2687 * 2673 *
2688 * SERIALIZATION: 2674 * SERIALIZATION:
@@ -2691,8 +2677,8 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2691 * the read lock is unnecessary.) 2677 * the read lock is unnecessary.)
2692 * 2678 *
2693 * RETURN VALUES: 2679 * RETURN VALUES:
2694 * 0 - success. 2680 * 0 - success.
2695 * -EIO - i/o error. 2681 * -EIO - i/o error.
2696 */ 2682 */
2697static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) 2683static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2698{ 2684{
@@ -2712,17 +2698,17 @@ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2712} 2698}
2713 2699
2714/* 2700/*
2715 * NAME: diFindFree() 2701 * NAME: diFindFree()
2716 * 2702 *
2717 * FUNCTION: find the first free bit in a word starting at 2703 * FUNCTION: find the first free bit in a word starting at
2718 * the specified bit position. 2704 * the specified bit position.
2719 * 2705 *
2720 * PARAMETERS: 2706 * PARAMETERS:
2721 * word - word to be examined. 2707 * word - word to be examined.
2722 * start - starting bit position. 2708 * start - starting bit position.
2723 * 2709 *
2724 * RETURN VALUES: 2710 * RETURN VALUES:
2725 * bit position of first free bit in the word or 32 if 2711 * bit position of first free bit in the word or 32 if
2726 * no free bits were found. 2712 * no free bits were found.
2727 */ 2713 */
2728static int diFindFree(u32 word, int start) 2714static int diFindFree(u32 word, int start)
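The contract documented above is simple enough to restate in a few lines. The sketch below is illustrative only (find_free_bit is a made-up name, not the kernel routine); it assumes the convention used by the iag allocation maps further down, where the high-order bit of a map word is bit 0 and a clear bit marks a free inode:

#include <stdint.h>

/* Return the position of the first free (clear) bit in 'word' at or
 * after 'start', or 32 if every remaining bit is set.  Bit 0 is taken
 * to be the most significant bit of the word. */
static int find_free_bit(uint32_t word, int start)
{
	int bitno;

	for (word <<= start, bitno = start; bitno < 32; bitno++, word <<= 1) {
		if ((word & 0x80000000u) == 0)
			break;		/* clear bit: a free inode slot */
	}
	return bitno;			/* 32: nothing free from 'start' on */
}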
@@ -2897,7 +2883,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2897 atomic_read(&imap->im_numfree)); 2883 atomic_read(&imap->im_numfree));
2898 2884
2899 /* 2885 /*
2900 * reconstruct imap 2886 * reconstruct imap
2901 * 2887 *
2902 * coalesce contiguous k (newAGSize/oldAGSize) AGs; 2888 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
2903 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; 2889 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
@@ -2913,7 +2899,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2913 } 2899 }
2914 2900
2915 /* 2901 /*
2916 * process each iag page of the map. 2902 * process each iag page of the map.
2917 * 2903 *
2918 * rebuild AG Free Inode List, AG Free Inode Extent List; 2904 * rebuild AG Free Inode List, AG Free Inode Extent List;
2919 */ 2905 */
@@ -2932,7 +2918,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2932 2918
2933 /* leave free iag in the free iag list */ 2919 /* leave free iag in the free iag list */
2934 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { 2920 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2935 release_metapage(bp); 2921 release_metapage(bp);
2936 continue; 2922 continue;
2937 } 2923 }
2938 2924
@@ -3063,13 +3049,13 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno,
3063} 3049}
3064 3050
3065/* 3051/*
3066 * NAME: copy_from_dinode() 3052 * NAME: copy_from_dinode()
3067 * 3053 *
3068 * FUNCTION: Copies inode info from disk inode to in-memory inode 3054 * FUNCTION: Copies inode info from disk inode to in-memory inode
3069 * 3055 *
3070 * RETURN VALUES: 3056 * RETURN VALUES:
3071 * 0 - success 3057 * 0 - success
3072 * -ENOMEM - insufficient memory 3058 * -ENOMEM - insufficient memory
3073 */ 3059 */
3074static int copy_from_dinode(struct dinode * dip, struct inode *ip) 3060static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3075{ 3061{
@@ -3151,9 +3137,9 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3151} 3137}
3152 3138
3153/* 3139/*
3154 * NAME: copy_to_dinode() 3140 * NAME: copy_to_dinode()
3155 * 3141 *
3156 * FUNCTION: Copies inode info from in-memory inode to disk inode 3142 * FUNCTION: Copies inode info from in-memory inode to disk inode
3157 */ 3143 */
3158static void copy_to_dinode(struct dinode * dip, struct inode *ip) 3144static void copy_to_dinode(struct dinode * dip, struct inode *ip)
3159{ 3145{
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h
index 4f9c346ed498..610a0e9d8941 100644
--- a/fs/jfs/jfs_imap.h
+++ b/fs/jfs/jfs_imap.h
@@ -24,17 +24,17 @@
24 * jfs_imap.h: disk inode manager 24 * jfs_imap.h: disk inode manager
25 */ 25 */
26 26
27#define EXTSPERIAG 128 /* number of disk inode extent per iag */ 27#define EXTSPERIAG 128 /* number of disk inode extent per iag */
28#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ 28#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */
29#define SMAPSZ 4 /* number of words per summary map */ 29#define SMAPSZ 4 /* number of words per summary map */
30#define EXTSPERSUM 32 /* number of extents per summary map entry */ 30#define EXTSPERSUM 32 /* number of extents per summary map entry */
31#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ 31#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */
32#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ 32#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */
33#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ 33#define MAXIAGS ((1<<20)-1) /* maximum number of iags */
34#define MAXAG 128 /* maximum number of allocation groups */ 34#define MAXAG 128 /* maximum number of allocation groups */
35 35
36#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ 36#define AMAPSIZE 512 /* bytes in the IAG allocation maps */
37#define SMAPSIZE 16 /* bytes in the IAG summary maps */ 37#define SMAPSIZE 16 /* bytes in the IAG summary maps */
38 38
39/* convert inode number to iag number */ 39/* convert inode number to iag number */
40#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) 40#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG)
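For orientation, these constants tie together as follows: an iag carries EXTSPERIAG extents, and INOTOIAG() divides an inode number by the inodes-per-iag count with a shift. The fragment below is only a worked example; INOSPEREXT (32) and L2INOSPERIAG (12) live in jfs_filsys.h and are assumed here rather than quoted:

#include <stdio.h>

#define EXTSPERIAG	128				/* as above */
#define INOSPEREXT	32				/* assumed, from jfs_filsys.h */
#define INOSPERIAG	(EXTSPERIAG * INOSPEREXT)	/* 4096 */
#define L2INOSPERIAG	12				/* assumed, log2(INOSPERIAG) */
#define INOTOIAG(ino)	((ino) >> L2INOSPERIAG)

int main(void)
{
	unsigned int ino = 10000;
	unsigned int iag = INOTOIAG(ino);		/* iag holding the inode */
	unsigned int off = ino & (INOSPERIAG - 1);	/* inode within that iag */

	printf("ino %u -> iag %u, extent %u, bit %u\n",
	       ino, iag, off / INOSPEREXT, off % INOSPEREXT);
	return 0;
}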
@@ -60,31 +60,31 @@
60 * inode allocation group page (per 4096 inodes of an AG) 60 * inode allocation group page (per 4096 inodes of an AG)
61 */ 61 */
62struct iag { 62struct iag {
63 __le64 agstart; /* 8: starting block of ag */ 63 __le64 agstart; /* 8: starting block of ag */
64 __le32 iagnum; /* 4: inode allocation group number */ 64 __le32 iagnum; /* 4: inode allocation group number */
65 __le32 inofreefwd; /* 4: ag inode free list forward */ 65 __le32 inofreefwd; /* 4: ag inode free list forward */
66 __le32 inofreeback; /* 4: ag inode free list back */ 66 __le32 inofreeback; /* 4: ag inode free list back */
67 __le32 extfreefwd; /* 4: ag inode extent free list forward */ 67 __le32 extfreefwd; /* 4: ag inode extent free list forward */
68 __le32 extfreeback; /* 4: ag inode extent free list back */ 68 __le32 extfreeback; /* 4: ag inode extent free list back */
69 __le32 iagfree; /* 4: iag free list */ 69 __le32 iagfree; /* 4: iag free list */
70 70
71 /* summary map: 1 bit per inode extent */ 71 /* summary map: 1 bit per inode extent */
72 __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; 72 __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes;
73 * note: this indicates free and backed 73 * note: this indicates free and backed
74 * inodes, if the extent is not backed the 74 * inodes, if the extent is not backed the
75 * value will be 1. if the extent is 75 * value will be 1. if the extent is
76 * backed but all inodes are being used the 76 * backed but all inodes are being used the
77 * value will be 1. if the extent is 77 * value will be 1. if the extent is
78 * backed but at least one of the inodes is 78 * backed but at least one of the inodes is
79 * free the value will be 0. 79 * free the value will be 0.
80 */ 80 */
81 __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ 81 __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */
82 __le32 nfreeinos; /* 4: number of free inodes */ 82 __le32 nfreeinos; /* 4: number of free inodes */
83 __le32 nfreeexts; /* 4: number of free extents */ 83 __le32 nfreeexts; /* 4: number of free extents */
84 /* (72) */ 84 /* (72) */
85 u8 pad[1976]; /* 1976: pad to 2048 bytes */ 85 u8 pad[1976]; /* 1976: pad to 2048 bytes */
86 /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ 86 /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */
87 __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ 87 __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */
88 __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ 88 __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */
89 pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ 89 pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */
90}; /* (4096) */ 90}; /* (4096) */
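The summary maps above exist so an allocator can find a usable extent without scanning all 128 wmap words: a clear inosmap bit means "backed extent with at least one free inode", and only then is the corresponding wmap word examined bit by bit. A hedged, host-endian sketch of that first step (the kernel operates on the __le32 fields and uses its own helpers):

#include <stdint.h>

#define SMAPSZ		4	/* summary words per iag, as above */
#define EXTSPERSUM	32	/* extents covered by each summary word */

/* Return the number of an extent that is backed and has a free inode,
 * or -1 if the summary map shows none.  Bit 0 is taken as the
 * high-order bit of each word, matching the on-disk maps. */
static int pick_free_extent(const uint32_t inosmap[SMAPSZ])
{
	for (int w = 0; w < SMAPSZ; w++) {
		if (inosmap[w] == 0xffffffffu)
			continue;	/* nothing usable under this word */
		for (int b = 0; b < EXTSPERSUM; b++)
			if (!(inosmap[w] & (0x80000000u >> b)))
				return w * EXTSPERSUM + b;
	}
	return -1;
}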
@@ -93,44 +93,44 @@ struct iag {
93 * per AG control information (in inode map control page) 93 * per AG control information (in inode map control page)
94 */ 94 */
95struct iagctl_disk { 95struct iagctl_disk {
96 __le32 inofree; /* 4: free inode list anchor */ 96 __le32 inofree; /* 4: free inode list anchor */
97 __le32 extfree; /* 4: free extent list anchor */ 97 __le32 extfree; /* 4: free extent list anchor */
98 __le32 numinos; /* 4: number of backed inodes */ 98 __le32 numinos; /* 4: number of backed inodes */
99 __le32 numfree; /* 4: number of free inodes */ 99 __le32 numfree; /* 4: number of free inodes */
100}; /* (16) */ 100}; /* (16) */
101 101
102struct iagctl { 102struct iagctl {
103 int inofree; /* free inode list anchor */ 103 int inofree; /* free inode list anchor */
104 int extfree; /* free extent list anchor */ 104 int extfree; /* free extent list anchor */
105 int numinos; /* number of backed inodes */ 105 int numinos; /* number of backed inodes */
106 int numfree; /* number of free inodes */ 106 int numfree; /* number of free inodes */
107}; 107};
108 108
109/* 109/*
110 * per fileset/aggregate inode map control page 110 * per fileset/aggregate inode map control page
111 */ 111 */
112struct dinomap_disk { 112struct dinomap_disk {
113 __le32 in_freeiag; /* 4: free iag list anchor */ 113 __le32 in_freeiag; /* 4: free iag list anchor */
114 __le32 in_nextiag; /* 4: next free iag number */ 114 __le32 in_nextiag; /* 4: next free iag number */
115 __le32 in_numinos; /* 4: num of backed inodes */ 115 __le32 in_numinos; /* 4: num of backed inodes */
116 __le32 in_numfree; /* 4: num of free backed inodes */ 116 __le32 in_numfree; /* 4: num of free backed inodes */
117 __le32 in_nbperiext; /* 4: num of blocks per inode extent */ 117 __le32 in_nbperiext; /* 4: num of blocks per inode extent */
118 __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ 118 __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */
119 __le32 in_diskblock; /* 4: for standalone test driver */ 119 __le32 in_diskblock; /* 4: for standalone test driver */
120 __le32 in_maxag; /* 4: for standalone test driver */ 120 __le32 in_maxag; /* 4: for standalone test driver */
121 u8 pad[2016]; /* 2016: pad to 2048 */ 121 u8 pad[2016]; /* 2016: pad to 2048 */
122 struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ 122 struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */
123}; /* (4096) */ 123}; /* (4096) */
124 124
125struct dinomap { 125struct dinomap {
126 int in_freeiag; /* free iag list anchor */ 126 int in_freeiag; /* free iag list anchor */
127 int in_nextiag; /* next free iag number */ 127 int in_nextiag; /* next free iag number */
128 int in_numinos; /* num of backed inodes */ 128 int in_numinos; /* num of backed inodes */
129 int in_numfree; /* num of free backed inodes */ 129 int in_numfree; /* num of free backed inodes */
130 int in_nbperiext; /* num of blocks per inode extent */ 130 int in_nbperiext; /* num of blocks per inode extent */
131 int in_l2nbperiext; /* l2 of in_nbperiext */ 131 int in_l2nbperiext; /* l2 of in_nbperiext */
132 int in_diskblock; /* for standalone test driver */ 132 int in_diskblock; /* for standalone test driver */
133 int in_maxag; /* for standalone test driver */ 133 int in_maxag; /* for standalone test driver */
134 struct iagctl in_agctl[MAXAG]; /* AG control information */ 134 struct iagctl in_agctl[MAXAG]; /* AG control information */
135}; 135};
136 136
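Each *_disk structure above is the raw on-disk layout with little-endian fields, while its unsuffixed twin holds native ints for in-core use, so the imap code converts field by field when it reads or writes a control page. A kernel-style sketch of that conversion, shown only to make the pairing explicit (the real copies sit in the jfs_imap.c mount and resize paths):

/* Sketch: translate one on-disk AG control entry to its in-core form
 * using the kernel's le32_to_cpu() helper. */
static void iagctl_from_disk(struct iagctl *ic, const struct iagctl_disk *d)
{
	ic->inofree = le32_to_cpu(d->inofree);
	ic->extfree = le32_to_cpu(d->extfree);
	ic->numinos = le32_to_cpu(d->numinos);
	ic->numfree = le32_to_cpu(d->numfree);
}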
@@ -139,9 +139,9 @@ struct dinomap {
139 */ 139 */
140struct inomap { 140struct inomap {
141 struct dinomap im_imap; /* 4096: inode allocation control */ 141 struct dinomap im_imap; /* 4096: inode allocation control */
142 struct inode *im_ipimap; /* 4: ptr to inode for imap */ 142 struct inode *im_ipimap; /* 4: ptr to inode for imap */
143 struct mutex im_freelock; /* 4: iag free list lock */ 143 struct mutex im_freelock; /* 4: iag free list lock */
144 struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ 144 struct mutex im_aglock[MAXAG]; /* 512: per AG locks */
145 u32 *im_DBGdimap; 145 u32 *im_DBGdimap;
146 atomic_t im_numinos; /* num of backed inodes */ 146 atomic_t im_numinos; /* num of backed inodes */
147 atomic_t im_numfree; /* num of free backed inodes */ 147 atomic_t im_numfree; /* num of free backed inodes */
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 8f453eff3c83..cb8f30985ad1 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -40,7 +40,7 @@ struct jfs_inode_info {
40 uint mode2; /* jfs-specific mode */ 40 uint mode2; /* jfs-specific mode */
41 uint saved_uid; /* saved for uid mount option */ 41 uint saved_uid; /* saved for uid mount option */
42 uint saved_gid; /* saved for gid mount option */ 42 uint saved_gid; /* saved for gid mount option */
43 pxd_t ixpxd; /* inode extent descriptor */ 43 pxd_t ixpxd; /* inode extent descriptor */
44 dxd_t acl; /* dxd describing acl */ 44 dxd_t acl; /* dxd describing acl */
45 dxd_t ea; /* dxd describing ea */ 45 dxd_t ea; /* dxd describing ea */
46 time_t otime; /* time created */ 46 time_t otime; /* time created */
@@ -190,7 +190,7 @@ struct jfs_sb_info {
190 uint gengen; /* inode generation generator*/ 190 uint gengen; /* inode generation generator*/
191 uint inostamp; /* shows inode belongs to fileset*/ 191 uint inostamp; /* shows inode belongs to fileset*/
192 192
193 /* Formerly in ipbmap */ 193 /* Formerly in ipbmap */
194 struct bmap *bmap; /* incore bmap descriptor */ 194 struct bmap *bmap; /* incore bmap descriptor */
195 struct nls_table *nls_tab; /* current codepage */ 195 struct nls_table *nls_tab; /* current codepage */
196 struct inode *direct_inode; /* metadata inode */ 196 struct inode *direct_inode; /* metadata inode */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 44a2f33cb98d..de3e4a506dbc 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -244,7 +244,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
244 goto writeRecord; 244 goto writeRecord;
245 245
246 /* 246 /*
247 * initialize/update page/transaction recovery lsn 247 * initialize/update page/transaction recovery lsn
248 */ 248 */
249 lsn = log->lsn; 249 lsn = log->lsn;
250 250
@@ -263,7 +263,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
263 } 263 }
264 264
265 /* 265 /*
266 * initialize/update lsn of tblock of the page 266 * initialize/update lsn of tblock of the page
267 * 267 *
268 * transaction inherits oldest lsn of pages associated 268 * transaction inherits oldest lsn of pages associated
269 * with allocation/deallocation of resources (their 269 * with allocation/deallocation of resources (their
@@ -307,7 +307,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
307 LOGSYNC_UNLOCK(log, flags); 307 LOGSYNC_UNLOCK(log, flags);
308 308
309 /* 309 /*
310 * write the log record 310 * write the log record
311 */ 311 */
312 writeRecord: 312 writeRecord:
313 lsn = lmWriteRecord(log, tblk, lrd, tlck); 313 lsn = lmWriteRecord(log, tblk, lrd, tlck);
@@ -372,7 +372,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
372 goto moveLrd; 372 goto moveLrd;
373 373
374 /* 374 /*
375 * move log record data 375 * move log record data
376 */ 376 */
377 /* retrieve source meta-data page to log */ 377 /* retrieve source meta-data page to log */
378 if (tlck->flag & tlckPAGELOCK) { 378 if (tlck->flag & tlckPAGELOCK) {
@@ -465,7 +465,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
465 } 465 }
466 466
467 /* 467 /*
468 * move log record descriptor 468 * move log record descriptor
469 */ 469 */
470 moveLrd: 470 moveLrd:
471 lrd->length = cpu_to_le16(len); 471 lrd->length = cpu_to_le16(len);
@@ -574,7 +574,7 @@ static int lmNextPage(struct jfs_log * log)
574 LOGGC_LOCK(log); 574 LOGGC_LOCK(log);
575 575
576 /* 576 /*
577 * write or queue the full page at the tail of write queue 577 * write or queue the full page at the tail of write queue
578 */ 578 */
579 /* get the tail tblk on commit queue */ 579 /* get the tail tblk on commit queue */
580 if (list_empty(&log->cqueue)) 580 if (list_empty(&log->cqueue))
@@ -625,7 +625,7 @@ static int lmNextPage(struct jfs_log * log)
625 LOGGC_UNLOCK(log); 625 LOGGC_UNLOCK(log);
626 626
627 /* 627 /*
628 * allocate/initialize next page 628 * allocate/initialize next page
629 */ 629 */
630 /* if log wraps, the first data page of log is 2 630 /* if log wraps, the first data page of log is 2
631 * (0 never used, 1 is superblock). 631 * (0 never used, 1 is superblock).
@@ -953,7 +953,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
953 } 953 }
954 954
955 /* 955 /*
956 * forward syncpt 956 * forward syncpt
957 */ 957 */
958 /* if last sync is same as last syncpt, 958 /* if last sync is same as last syncpt,
959 * invoke sync point forward processing to update sync. 959 * invoke sync point forward processing to update sync.
@@ -989,7 +989,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
989 lsn = log->lsn; 989 lsn = log->lsn;
990 990
991 /* 991 /*
992 * setup next syncpt trigger (SWAG) 992 * setup next syncpt trigger (SWAG)
993 */ 993 */
994 logsize = log->logsize; 994 logsize = log->logsize;
995 995
@@ -1000,11 +1000,11 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
1000 if (more < 2 * LOGPSIZE) { 1000 if (more < 2 * LOGPSIZE) {
1001 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); 1001 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
1002 /* 1002 /*
1003 * log wrapping 1003 * log wrapping
1004 * 1004 *
1005 * option 1 - panic ? No.! 1005 * option 1 - panic ? No.!
1006 * option 2 - shutdown file systems 1006 * option 2 - shutdown file systems
1007 * associated with log ? 1007 * associated with log ?
1008 * option 3 - extend log ? 1008 * option 3 - extend log ?
1009 */ 1009 */
1010 /* 1010 /*
@@ -1062,7 +1062,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
1062/* 1062/*
1063 * NAME: lmLogOpen() 1063 * NAME: lmLogOpen()
1064 * 1064 *
1065 * FUNCTION: open the log on first open; 1065 * FUNCTION: open the log on first open;
1066 * insert filesystem in the active list of the log. 1066 * insert filesystem in the active list of the log.
1067 * 1067 *
1068 * PARAMETER: ipmnt - file system mount inode 1068 * PARAMETER: ipmnt - file system mount inode
@@ -1113,7 +1113,7 @@ int lmLogOpen(struct super_block *sb)
1113 init_waitqueue_head(&log->syncwait); 1113 init_waitqueue_head(&log->syncwait);
1114 1114
1115 /* 1115 /*
1116 * external log as separate logical volume 1116 * external log as separate logical volume
1117 * 1117 *
1118 * file systems to log may have n-to-1 relationship; 1118 * file systems to log may have n-to-1 relationship;
1119 */ 1119 */
@@ -1155,7 +1155,7 @@ journal_found:
1155 return 0; 1155 return 0;
1156 1156
1157 /* 1157 /*
1158 * unwind on error 1158 * unwind on error
1159 */ 1159 */
1160 shutdown: /* unwind lbmLogInit() */ 1160 shutdown: /* unwind lbmLogInit() */
1161 list_del(&log->journal_list); 1161 list_del(&log->journal_list);
@@ -1427,7 +1427,7 @@ int lmLogInit(struct jfs_log * log)
1427 return 0; 1427 return 0;
1428 1428
1429 /* 1429 /*
1430 * unwind on error 1430 * unwind on error
1431 */ 1431 */
1432 errout30: /* release log page */ 1432 errout30: /* release log page */
1433 log->wqueue = NULL; 1433 log->wqueue = NULL;
@@ -1480,7 +1480,7 @@ int lmLogClose(struct super_block *sb)
1480 1480
1481 if (test_bit(log_INLINELOG, &log->flag)) { 1481 if (test_bit(log_INLINELOG, &log->flag)) {
1482 /* 1482 /*
1483 * in-line log in host file system 1483 * in-line log in host file system
1484 */ 1484 */
1485 rc = lmLogShutdown(log); 1485 rc = lmLogShutdown(log);
1486 kfree(log); 1486 kfree(log);
@@ -1504,7 +1504,7 @@ int lmLogClose(struct super_block *sb)
1504 goto out; 1504 goto out;
1505 1505
1506 /* 1506 /*
1507 * external log as separate logical volume 1507 * external log as separate logical volume
1508 */ 1508 */
1509 list_del(&log->journal_list); 1509 list_del(&log->journal_list);
1510 bdev = log->bdev; 1510 bdev = log->bdev;
@@ -1622,20 +1622,26 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1622 if (!list_empty(&log->synclist)) { 1622 if (!list_empty(&log->synclist)) {
1623 struct logsyncblk *lp; 1623 struct logsyncblk *lp;
1624 1624
1625 printk(KERN_ERR "jfs_flush_journal: synclist not empty\n");
1625 list_for_each_entry(lp, &log->synclist, synclist) { 1626 list_for_each_entry(lp, &log->synclist, synclist) {
1626 if (lp->xflag & COMMIT_PAGE) { 1627 if (lp->xflag & COMMIT_PAGE) {
1627 struct metapage *mp = (struct metapage *)lp; 1628 struct metapage *mp = (struct metapage *)lp;
1628 dump_mem("orphan metapage", lp, 1629 print_hex_dump(KERN_ERR, "metapage: ",
1629 sizeof(struct metapage)); 1630 DUMP_PREFIX_ADDRESS, 16, 4,
1630 dump_mem("page", mp->page, sizeof(struct page)); 1631 mp, sizeof(struct metapage), 0);
1631 } 1632 print_hex_dump(KERN_ERR, "page: ",
1632 else 1633 DUMP_PREFIX_ADDRESS, 16,
1633 dump_mem("orphan tblock", lp, 1634 sizeof(long), mp->page,
1634 sizeof(struct tblock)); 1635 sizeof(struct page), 0);
1636 } else
1637 print_hex_dump(KERN_ERR, "tblock:",
1638 DUMP_PREFIX_ADDRESS, 16, 4,
1639 lp, sizeof(struct tblock), 0);
1635 } 1640 }
1636 } 1641 }
1642#else
1643 WARN_ON(!list_empty(&log->synclist));
1637#endif 1644#endif
1638 //assert(list_empty(&log->synclist));
1639 clear_bit(log_FLUSH, &log->flag); 1645 clear_bit(log_FLUSH, &log->flag);
1640} 1646}
1641 1647
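The hunk above (and the similar ones in jfs_metapage.c and jfs_txnmgr.c below) replaces the JFS-private dump_mem() helper with the generic print_hex_dump(). As the new calls show, the arguments are the log level, a prefix string, the prefix style (DUMP_PREFIX_ADDRESS here), the row size in bytes, the group size, the buffer, its length, and whether to append an ASCII column. A minimal kernel-style use, with dump_object() being an illustrative name:

#include <linux/kernel.h>

/* Hex-dump an arbitrary object at KERN_ERR, 16 bytes per row in
 * 4-byte groups, prefixed with the buffer address, no ASCII column. */
static void dump_object(const void *buf, size_t len)
{
	print_hex_dump(KERN_ERR, "object: ", DUMP_PREFIX_ADDRESS,
		       16, 4, buf, len, 0);
}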
@@ -1723,7 +1729,7 @@ int lmLogShutdown(struct jfs_log * log)
1723 * 1729 *
1724 * PARAMETER: log - pointer to log's inode. 1730 * PARAMETER: log - pointer to log's inode.
1725 * fsdev - kdev_t of filesystem. 1731 * fsdev - kdev_t of filesystem.
1726 * serial - pointer to returned log serial number 1732 * serial - pointer to returned log serial number
1727 * activate - insert/remove device from active list. 1733 * activate - insert/remove device from active list.
1728 * 1734 *
1729 * RETURN: 0 - success 1735 * RETURN: 0 - success
@@ -1963,7 +1969,7 @@ static void lbmfree(struct lbuf * bp)
1963 * FUNCTION: add a log buffer to the log redrive list 1969 * FUNCTION: add a log buffer to the log redrive list
1964 * 1970 *
1965 * PARAMETER: 1971 * PARAMETER:
1966 * bp - log buffer 1972 * bp - log buffer
1967 * 1973 *
1968 * NOTES: 1974 * NOTES:
1969 * Takes log_redrive_lock. 1975 * Takes log_redrive_lock.
@@ -2054,7 +2060,7 @@ static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
2054 bp->l_flag = flag; 2060 bp->l_flag = flag;
2055 2061
2056 /* 2062 /*
2057 * insert bp at tail of write queue associated with log 2063 * insert bp at tail of write queue associated with log
2058 * 2064 *
2059 * (request is either for bp already/currently at head of queue 2065 * (request is either for bp already/currently at head of queue
2060 * or new bp to be inserted at tail) 2066 * or new bp to be inserted at tail)
@@ -2117,7 +2123,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
2117 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); 2123 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
2118 2124
2119 /* 2125 /*
2120 * initiate pageout of the page 2126 * initiate pageout of the page
2121 */ 2127 */
2122 lbmStartIO(bp); 2128 lbmStartIO(bp);
2123} 2129}
@@ -2128,7 +2134,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
2128 * 2134 *
2129 * FUNCTION: Interface to DD strategy routine 2135 * FUNCTION: Interface to DD strategy routine
2130 * 2136 *
2131 * RETURN: none 2137 * RETURN: none
2132 * 2138 *
2133 * serialization: LCACHE_LOCK() is NOT held during log i/o; 2139 * serialization: LCACHE_LOCK() is NOT held during log i/o;
2134 */ 2140 */
@@ -2222,7 +2228,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2222 bio_put(bio); 2228 bio_put(bio);
2223 2229
2224 /* 2230 /*
2225 * pagein completion 2231 * pagein completion
2226 */ 2232 */
2227 if (bp->l_flag & lbmREAD) { 2233 if (bp->l_flag & lbmREAD) {
2228 bp->l_flag &= ~lbmREAD; 2234 bp->l_flag &= ~lbmREAD;
@@ -2236,7 +2242,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2236 } 2242 }
2237 2243
2238 /* 2244 /*
2239 * pageout completion 2245 * pageout completion
2240 * 2246 *
2241 * the bp at the head of write queue has completed pageout. 2247 * the bp at the head of write queue has completed pageout.
2242 * 2248 *
@@ -2302,7 +2308,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2302 } 2308 }
2303 2309
2304 /* 2310 /*
2305 * synchronous pageout: 2311 * synchronous pageout:
2306 * 2312 *
2307 * buffer has not necessarily been removed from write queue 2313 * buffer has not necessarily been removed from write queue
2308 * (e.g., synchronous write of partial-page with COMMIT): 2314 * (e.g., synchronous write of partial-page with COMMIT):
@@ -2316,7 +2322,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2316 } 2322 }
2317 2323
2318 /* 2324 /*
2319 * Group Commit pageout: 2325 * Group Commit pageout:
2320 */ 2326 */
2321 else if (bp->l_flag & lbmGC) { 2327 else if (bp->l_flag & lbmGC) {
2322 LCACHE_UNLOCK(flags); 2328 LCACHE_UNLOCK(flags);
@@ -2324,7 +2330,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2324 } 2330 }
2325 2331
2326 /* 2332 /*
2327 * asynchronous pageout: 2333 * asynchronous pageout:
2328 * 2334 *
2329 * buffer must have been removed from write queue: 2335 * buffer must have been removed from write queue:
2330 * insert buffer at head of freelist where it can be recycled 2336 * insert buffer at head of freelist where it can be recycled
@@ -2375,7 +2381,7 @@ int jfsIOWait(void *arg)
2375 * FUNCTION: format file system log 2381 * FUNCTION: format file system log
2376 * 2382 *
2377 * PARAMETERS: 2383 * PARAMETERS:
2378 * log - volume log 2384 * log - volume log
2379 * logAddress - start address of log space in FS block 2385 * logAddress - start address of log space in FS block
2380 * logSize - length of log space in FS block; 2386 * logSize - length of log space in FS block;
2381 * 2387 *
@@ -2407,16 +2413,16 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2407 npages = logSize >> sbi->l2nbperpage; 2413 npages = logSize >> sbi->l2nbperpage;
2408 2414
2409 /* 2415 /*
2410 * log space: 2416 * log space:
2411 * 2417 *
2412 * page 0 - reserved; 2418 * page 0 - reserved;
2413 * page 1 - log superblock; 2419 * page 1 - log superblock;
2414 * page 2 - log data page: A SYNC log record is written 2420 * page 2 - log data page: A SYNC log record is written
2415 * into this page at logform time; 2421 * into this page at logform time;
2416 * pages 3-N - log data page: set to empty log data pages; 2422 * pages 3-N - log data page: set to empty log data pages;
2417 */ 2423 */
2418 /* 2424 /*
2419 * init log superblock: log page 1 2425 * init log superblock: log page 1
2420 */ 2426 */
2421 logsuper = (struct logsuper *) bp->l_ldata; 2427 logsuper = (struct logsuper *) bp->l_ldata;
2422 2428
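As a quick sanity check on the layout comment above: with logSize given in file-system blocks, npages = logSize >> l2nbperpage, page 0 stays reserved, page 1 holds the log superblock, page 2 receives the SYNC record at format time, and pages 3 through npages-1 are formatted as empty data pages. An illustrative calculation (the block counts are made-up values, not taken from the code):

#include <stdio.h>

int main(void)
{
	long log_size = 8192;	/* assumed log length in FS blocks */
	int l2nbperpage = 3;	/* assumed: 8 FS blocks per 4K log page */
	long npages = log_size >> l2nbperpage;

	printf("npages = %ld\n", npages);
	printf("page 0: reserved\npage 1: log superblock\n");
	printf("page 2: data page with SYNC record\n");
	printf("pages 3..%ld: empty data pages\n", npages - 1);
	return 0;
}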
@@ -2436,7 +2442,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2436 goto exit; 2442 goto exit;
2437 2443
2438 /* 2444 /*
2439 * init pages 2 to npages-1 as log data pages: 2445 * init pages 2 to npages-1 as log data pages:
2440 * 2446 *
2441 * log page sequence number (lpsn) initialization: 2447 * log page sequence number (lpsn) initialization:
2442 * 2448 *
@@ -2479,7 +2485,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2479 goto exit; 2485 goto exit;
2480 2486
2481 /* 2487 /*
2482 * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) 2488 * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2)
2483 */ 2489 */
2484 for (lspn = 0; lspn < npages - 3; lspn++) { 2490 for (lspn = 0; lspn < npages - 3; lspn++) {
2485 lp->h.page = lp->t.page = cpu_to_le32(lspn); 2491 lp->h.page = lp->t.page = cpu_to_le32(lspn);
@@ -2495,7 +2501,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2495 rc = 0; 2501 rc = 0;
2496exit: 2502exit:
2497 /* 2503 /*
2498 * finalize log 2504 * finalize log
2499 */ 2505 */
2500 /* release the buffer */ 2506 /* release the buffer */
2501 lbmFree(bp); 2507 lbmFree(bp);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index a53fb17ea219..1f85ef0ec045 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -144,7 +144,7 @@ struct logpage {
144 * 144 *
145 * (this comment should be rewritten !) 145 * (this comment should be rewritten !)
146 * jfs uses only "after" log records (only a single writer is allowed 146 * jfs uses only "after" log records (only a single writer is allowed
147 * in a page, pages are written to temporary paging space if 147 * in a page, pages are written to temporary paging space if
148 * if they must be written to disk before commit, and i/o is 148 * if they must be written to disk before commit, and i/o is
149 * scheduled for modified pages to their home location after 149 * scheduled for modified pages to their home location after
150 * the log records containing the after values and the commit 150 * the log records containing the after values and the commit
@@ -153,7 +153,7 @@ struct logpage {
153 * 153 *
154 * a log record consists of a data area of variable length followed by 154 * a log record consists of a data area of variable length followed by
155 * a descriptor of fixed size LOGRDSIZE bytes. 155 * a descriptor of fixed size LOGRDSIZE bytes.
156 * the data area is rounded up to an integral number of 4-bytes and 156 * the data area is rounded up to an integral number of 4-bytes and
157 * must be no longer than LOGPSIZE. 157 * must be no longer than LOGPSIZE.
158 * the descriptor is of size of multiple of 4-bytes and aligned on a 158 * the descriptor is of size of multiple of 4-bytes and aligned on a
159 * 4-byte boundary. 159 * 4-byte boundary.
@@ -215,13 +215,13 @@ struct lrd {
215 union { 215 union {
216 216
217 /* 217 /*
218 * COMMIT: commit 218 * COMMIT: commit
219 * 219 *
220 * transaction commit: no type-dependent information; 220 * transaction commit: no type-dependent information;
221 */ 221 */
222 222
223 /* 223 /*
224 * REDOPAGE: after-image 224 * REDOPAGE: after-image
225 * 225 *
226 * apply after-image; 226 * apply after-image;
227 * 227 *
@@ -236,7 +236,7 @@ struct lrd {
236 } redopage; /* (20) */ 236 } redopage; /* (20) */
237 237
238 /* 238 /*
239 * NOREDOPAGE: the page is freed 239 * NOREDOPAGE: the page is freed
240 * 240 *
241 * do not apply after-image records which precede this record 241 * do not apply after-image records which precede this record
242 * in the log with the same page block number to this page. 242 * in the log with the same page block number to this page.
@@ -252,7 +252,7 @@ struct lrd {
252 } noredopage; /* (20) */ 252 } noredopage; /* (20) */
253 253
254 /* 254 /*
255 * UPDATEMAP: update block allocation map 255 * UPDATEMAP: update block allocation map
256 * 256 *
257 * either in-line PXD, 257 * either in-line PXD,
258 * or out-of-line XADLIST; 258 * or out-of-line XADLIST;
@@ -268,7 +268,7 @@ struct lrd {
268 } updatemap; /* (20) */ 268 } updatemap; /* (20) */
269 269
270 /* 270 /*
271 * NOREDOINOEXT: the inode extent is freed 271 * NOREDOINOEXT: the inode extent is freed
272 * 272 *
273 * do not apply after-image records which precede this 273 * do not apply after-image records which precede this
274 * record in the log with the any of the 4 page block 274 * record in the log with the any of the 4 page block
@@ -286,7 +286,7 @@ struct lrd {
286 } noredoinoext; /* (20) */ 286 } noredoinoext; /* (20) */
287 287
288 /* 288 /*
289 * SYNCPT: log sync point 289 * SYNCPT: log sync point
290 * 290 *
291 * replay log upto syncpt address specified; 291 * replay log upto syncpt address specified;
292 */ 292 */
@@ -295,13 +295,13 @@ struct lrd {
295 } syncpt; 295 } syncpt;
296 296
297 /* 297 /*
298 * MOUNT: file system mount 298 * MOUNT: file system mount
299 * 299 *
300 * file system mount: no type-dependent information; 300 * file system mount: no type-dependent information;
301 */ 301 */
302 302
303 /* 303 /*
304 * ? FREEXTENT: free specified extent(s) 304 * ? FREEXTENT: free specified extent(s)
305 * 305 *
306 * free specified extent(s) from block allocation map 306 * free specified extent(s) from block allocation map
307 * N.B.: nextents should be length of data/sizeof(xad_t) 307 * N.B.: nextents should be length of data/sizeof(xad_t)
@@ -314,7 +314,7 @@ struct lrd {
314 } freextent; 314 } freextent;
315 315
316 /* 316 /*
317 * ? NOREDOFILE: this file is freed 317 * ? NOREDOFILE: this file is freed
318 * 318 *
319 * do not apply records which precede this record in the log 319 * do not apply records which precede this record in the log
320 * with the same inode number. 320 * with the same inode number.
@@ -330,7 +330,7 @@ struct lrd {
330 } noredofile; 330 } noredofile;
331 331
332 /* 332 /*
333 * ? NEWPAGE: 333 * ? NEWPAGE:
334 * 334 *
335 * metadata type dependent 335 * metadata type dependent
336 */ 336 */
@@ -342,7 +342,7 @@ struct lrd {
342 } newpage; 342 } newpage;
343 343
344 /* 344 /*
345 * ? DUMMY: filler 345 * ? DUMMY: filler
346 * 346 *
347 * no type-dependent information 347 * no type-dependent information
348 */ 348 */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 43d4f69afbec..77c7f1129dde 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -472,7 +472,8 @@ add_failed:
472 printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); 472 printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n");
473 goto skip; 473 goto skip;
474dump_bio: 474dump_bio:
475 dump_mem("bio", bio, sizeof(*bio)); 475 print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16,
476 4, bio, sizeof(*bio), 0);
476skip: 477skip:
477 bio_put(bio); 478 bio_put(bio);
478 unlock_page(page); 479 unlock_page(page);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 4dd479834897..644429acb8c0 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -80,7 +80,7 @@ static int logMOUNT(struct super_block *sb);
80 */ 80 */
81int jfs_mount(struct super_block *sb) 81int jfs_mount(struct super_block *sb)
82{ 82{
83 int rc = 0; /* Return code */ 83 int rc = 0; /* Return code */
84 struct jfs_sb_info *sbi = JFS_SBI(sb); 84 struct jfs_sb_info *sbi = JFS_SBI(sb);
85 struct inode *ipaimap = NULL; 85 struct inode *ipaimap = NULL;
86 struct inode *ipaimap2 = NULL; 86 struct inode *ipaimap2 = NULL;
@@ -169,7 +169,7 @@ int jfs_mount(struct super_block *sb)
169 sbi->ipaimap2 = NULL; 169 sbi->ipaimap2 = NULL;
170 170
171 /* 171 /*
172 * mount (the only/single) fileset 172 * mount (the only/single) fileset
173 */ 173 */
174 /* 174 /*
175 * open fileset inode allocation map (aka fileset inode) 175 * open fileset inode allocation map (aka fileset inode)
@@ -195,7 +195,7 @@ int jfs_mount(struct super_block *sb)
195 goto out; 195 goto out;
196 196
197 /* 197 /*
198 * unwind on error 198 * unwind on error
199 */ 199 */
200 errout41: /* close fileset inode allocation map inode */ 200 errout41: /* close fileset inode allocation map inode */
201 diFreeSpecial(ipimap); 201 diFreeSpecial(ipimap);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 25430d0b0d59..7aa1f7004eaf 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -18,7 +18,7 @@
18 */ 18 */
19 19
20/* 20/*
21 * jfs_txnmgr.c: transaction manager 21 * jfs_txnmgr.c: transaction manager
22 * 22 *
23 * notes: 23 * notes:
24 * transaction starts with txBegin() and ends with txCommit() 24 * transaction starts with txBegin() and ends with txCommit()
@@ -60,7 +60,7 @@
60#include "jfs_debug.h" 60#include "jfs_debug.h"
61 61
62/* 62/*
63 * transaction management structures 63 * transaction management structures
64 */ 64 */
65static struct { 65static struct {
66 int freetid; /* index of a free tid structure */ 66 int freetid; /* index of a free tid structure */
@@ -103,19 +103,19 @@ module_param(nTxLock, int, 0);
103MODULE_PARM_DESC(nTxLock, 103MODULE_PARM_DESC(nTxLock,
104 "Number of transaction locks (max:65536)"); 104 "Number of transaction locks (max:65536)");
105 105
106struct tblock *TxBlock; /* transaction block table */ 106struct tblock *TxBlock; /* transaction block table */
107static int TxLockLWM; /* Low water mark for number of txLocks used */ 107static int TxLockLWM; /* Low water mark for number of txLocks used */
108static int TxLockHWM; /* High water mark for number of txLocks used */ 108static int TxLockHWM; /* High water mark for number of txLocks used */
109static int TxLockVHWM; /* Very High water mark */ 109static int TxLockVHWM; /* Very High water mark */
110struct tlock *TxLock; /* transaction lock table */ 110struct tlock *TxLock; /* transaction lock table */
111 111
112/* 112/*
113 * transaction management lock 113 * transaction management lock
114 */ 114 */
115static DEFINE_SPINLOCK(jfsTxnLock); 115static DEFINE_SPINLOCK(jfsTxnLock);
116 116
117#define TXN_LOCK() spin_lock(&jfsTxnLock) 117#define TXN_LOCK() spin_lock(&jfsTxnLock)
118#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) 118#define TXN_UNLOCK() spin_unlock(&jfsTxnLock)
119 119
120#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); 120#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock);
121#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) 121#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags)
@@ -148,7 +148,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
148#define TXN_WAKEUP(event) wake_up_all(event) 148#define TXN_WAKEUP(event) wake_up_all(event)
149 149
150/* 150/*
151 * statistics 151 * statistics
152 */ 152 */
153static struct { 153static struct {
154 tid_t maxtid; /* 4: biggest tid ever used */ 154 tid_t maxtid; /* 4: biggest tid ever used */
@@ -181,8 +181,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
181static void LogSyncRelease(struct metapage * mp); 181static void LogSyncRelease(struct metapage * mp);
182 182
183/* 183/*
184 * transaction block/lock management 184 * transaction block/lock management
185 * --------------------------------- 185 * ---------------------------------
186 */ 186 */
187 187
188/* 188/*
@@ -227,9 +227,9 @@ static void txLockFree(lid_t lid)
227} 227}
228 228
229/* 229/*
230 * NAME: txInit() 230 * NAME: txInit()
231 * 231 *
232 * FUNCTION: initialize transaction management structures 232 * FUNCTION: initialize transaction management structures
233 * 233 *
234 * RETURN: 234 * RETURN:
235 * 235 *
@@ -333,9 +333,9 @@ int txInit(void)
333} 333}
334 334
335/* 335/*
336 * NAME: txExit() 336 * NAME: txExit()
337 * 337 *
338 * FUNCTION: clean up when module is unloaded 338 * FUNCTION: clean up when module is unloaded
339 */ 339 */
340void txExit(void) 340void txExit(void)
341{ 341{
@@ -346,12 +346,12 @@ void txExit(void)
346} 346}
347 347
348/* 348/*
349 * NAME: txBegin() 349 * NAME: txBegin()
350 * 350 *
351 * FUNCTION: start a transaction. 351 * FUNCTION: start a transaction.
352 * 352 *
353 * PARAMETER: sb - superblock 353 * PARAMETER: sb - superblock
354 * flag - force for nested tx; 354 * flag - force for nested tx;
355 * 355 *
356 * RETURN: tid - transaction id 356 * RETURN: tid - transaction id
357 * 357 *
@@ -447,13 +447,13 @@ tid_t txBegin(struct super_block *sb, int flag)
447} 447}
448 448
449/* 449/*
450 * NAME: txBeginAnon() 450 * NAME: txBeginAnon()
451 * 451 *
452 * FUNCTION: start an anonymous transaction. 452 * FUNCTION: start an anonymous transaction.
453 * Blocks if logsync or available tlocks are low to prevent 453 * Blocks if logsync or available tlocks are low to prevent
454 * anonymous tlocks from depleting supply. 454 * anonymous tlocks from depleting supply.
455 * 455 *
456 * PARAMETER: sb - superblock 456 * PARAMETER: sb - superblock
457 * 457 *
458 * RETURN: none 458 * RETURN: none
459 */ 459 */
@@ -489,11 +489,11 @@ void txBeginAnon(struct super_block *sb)
489} 489}
490 490
491/* 491/*
492 * txEnd() 492 * txEnd()
493 * 493 *
494 * function: free specified transaction block. 494 * function: free specified transaction block.
495 * 495 *
496 * logsync barrier processing: 496 * logsync barrier processing:
497 * 497 *
498 * serialization: 498 * serialization:
499 */ 499 */
@@ -577,13 +577,13 @@ wakeup:
577} 577}
578 578
579/* 579/*
580 * txLock() 580 * txLock()
581 * 581 *
582 * function: acquire a transaction lock on the specified <mp> 582 * function: acquire a transaction lock on the specified <mp>
583 * 583 *
584 * parameter: 584 * parameter:
585 * 585 *
586 * return: transaction lock id 586 * return: transaction lock id
587 * 587 *
588 * serialization: 588 * serialization:
589 */ 589 */
@@ -829,12 +829,16 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
829 /* Only locks on ipimap or ipaimap should reach here */ 829 /* Only locks on ipimap or ipaimap should reach here */
830 /* assert(jfs_ip->fileset == AGGREGATE_I); */ 830 /* assert(jfs_ip->fileset == AGGREGATE_I); */
831 if (jfs_ip->fileset != AGGREGATE_I) { 831 if (jfs_ip->fileset != AGGREGATE_I) {
832 jfs_err("txLock: trying to lock locked page!"); 832 printk(KERN_ERR "txLock: trying to lock locked page!");
833 dump_mem("ip", ip, sizeof(struct inode)); 833 print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4,
834 dump_mem("mp", mp, sizeof(struct metapage)); 834 ip, sizeof(*ip), 0);
835 dump_mem("Locker's tblk", tid_to_tblock(tid), 835 print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4,
836 sizeof(struct tblock)); 836 mp, sizeof(*mp), 0);
837 dump_mem("Tlock", tlck, sizeof(struct tlock)); 837 print_hex_dump(KERN_ERR, "Locker's tblock: ",
838 DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid),
839 sizeof(struct tblock), 0);
840 print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4,
841 tlck, sizeof(*tlck), 0);
838 BUG(); 842 BUG();
839 } 843 }
840 INCREMENT(stattx.waitlock); /* statistics */ 844 INCREMENT(stattx.waitlock); /* statistics */
@@ -857,17 +861,17 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
857} 861}
858 862
859/* 863/*
860 * NAME: txRelease() 864 * NAME: txRelease()
861 * 865 *
862 * FUNCTION: Release buffers associated with transaction locks, but don't 866 * FUNCTION: Release buffers associated with transaction locks, but don't
863 * mark homeok yet. This allows other transactions to modify 867 * mark homeok yet. This allows other transactions to modify
864 * buffers, but won't let them go to disk until commit record 868 * buffers, but won't let them go to disk until commit record
865 * actually gets written. 869 * actually gets written.
866 * 870 *
867 * PARAMETER: 871 * PARAMETER:
868 * tblk - 872 * tblk -
869 * 873 *
870 * RETURN: Errors from subroutines. 874 * RETURN: Errors from subroutines.
871 */ 875 */
872static void txRelease(struct tblock * tblk) 876static void txRelease(struct tblock * tblk)
873{ 877{
@@ -896,10 +900,10 @@ static void txRelease(struct tblock * tblk)
896} 900}
897 901
898/* 902/*
899 * NAME: txUnlock() 903 * NAME: txUnlock()
900 * 904 *
901 * FUNCTION: Initiates pageout of pages modified by tid in journalled 905 * FUNCTION: Initiates pageout of pages modified by tid in journalled
902 * objects and frees their lockwords. 906 * objects and frees their lockwords.
903 */ 907 */
904static void txUnlock(struct tblock * tblk) 908static void txUnlock(struct tblock * tblk)
905{ 909{
@@ -983,10 +987,10 @@ static void txUnlock(struct tblock * tblk)
983} 987}
984 988
985/* 989/*
986 * txMaplock() 990 * txMaplock()
987 * 991 *
988 * function: allocate a transaction lock for freed page/entry; 992 * function: allocate a transaction lock for freed page/entry;
989 * for freed page, maplock is used as xtlock/dtlock type; 993 * for freed page, maplock is used as xtlock/dtlock type;
990 */ 994 */
991struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) 995struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
992{ 996{
@@ -1057,7 +1061,7 @@ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
1057} 1061}
1058 1062
1059/* 1063/*
1060 * txLinelock() 1064 * txLinelock()
1061 * 1065 *
1062 * function: allocate a transaction lock for log vector list 1066 * function: allocate a transaction lock for log vector list
1063 */ 1067 */
@@ -1092,39 +1096,39 @@ struct linelock *txLinelock(struct linelock * tlock)
1092} 1096}
1093 1097
1094/* 1098/*
1095 * transaction commit management 1099 * transaction commit management
1096 * ----------------------------- 1100 * -----------------------------
1097 */ 1101 */
1098 1102
1099/* 1103/*
1100 * NAME: txCommit() 1104 * NAME: txCommit()
1101 * 1105 *
1102 * FUNCTION: commit the changes to the objects specified in 1106 * FUNCTION: commit the changes to the objects specified in
1103 * clist. For journalled segments only the 1107 * clist. For journalled segments only the
1104 * changes of the caller are committed, ie by tid. 1108 * changes of the caller are committed, ie by tid.
1105 * for non-journalled segments the data are flushed to 1109 * for non-journalled segments the data are flushed to
1106 * disk and then the change to the disk inode and indirect 1110 * disk and then the change to the disk inode and indirect
1107 * blocks committed (so blocks newly allocated to the 1111 * blocks committed (so blocks newly allocated to the
1108 * segment will be made a part of the segment atomically). 1112 * segment will be made a part of the segment atomically).
1109 * 1113 *
1110 * all of the segments specified in clist must be in 1114 * all of the segments specified in clist must be in
1111 * one file system. no more than 6 segments are needed 1115 * one file system. no more than 6 segments are needed
1112 * to handle all unix svcs. 1116 * to handle all unix svcs.
1113 * 1117 *
1114 * if the i_nlink field (i.e. disk inode link count) 1118 * if the i_nlink field (i.e. disk inode link count)
1115 * is zero, and the type of inode is a regular file or 1119 * is zero, and the type of inode is a regular file or
1116 * directory, or symbolic link, the inode is truncated 1120 * directory, or symbolic link, the inode is truncated
1117 * to zero length. the truncation is committed but the 1121 * to zero length. the truncation is committed but the
1118 * VM resources are unaffected until it is closed (see 1122 * VM resources are unaffected until it is closed (see
1119 * iput and iclose). 1123 * iput and iclose).
1120 * 1124 *
1121 * PARAMETER: 1125 * PARAMETER:
1122 * 1126 *
1123 * RETURN: 1127 * RETURN:
1124 * 1128 *
1125 * serialization: 1129 * serialization:
1126 * on entry the inode lock on each segment is assumed 1130 * on entry the inode lock on each segment is assumed
1127 * to be held. 1131 * to be held.
1128 * 1132 *
1129 * i/o error: 1133 * i/o error:
1130 */ 1134 */
@@ -1175,7 +1179,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1175 if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) 1179 if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
1176 tblk->xflag |= COMMIT_LAZY; 1180 tblk->xflag |= COMMIT_LAZY;
1177 /* 1181 /*
1178 * prepare non-journaled objects for commit 1182 * prepare non-journaled objects for commit
1179 * 1183 *
1180 * flush data pages of non-journaled file 1184 * flush data pages of non-journaled file
1181 * to prevent the file getting non-initialized disk blocks 1185 * to prevent the file getting non-initialized disk blocks
@@ -1186,7 +1190,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1186 cd.nip = nip; 1190 cd.nip = nip;
1187 1191
1188 /* 1192 /*
1189 * acquire transaction lock on (on-disk) inodes 1193 * acquire transaction lock on (on-disk) inodes
1190 * 1194 *
1191 * update on-disk inode from in-memory inode 1195 * update on-disk inode from in-memory inode
1192 * acquiring transaction locks for AFTER records 1196 * acquiring transaction locks for AFTER records
@@ -1262,7 +1266,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1262 } 1266 }
1263 1267
1264 /* 1268 /*
1265 * write log records from transaction locks 1269 * write log records from transaction locks
1266 * 1270 *
1267 * txUpdateMap() resets XAD_NEW in XAD. 1271 * txUpdateMap() resets XAD_NEW in XAD.
1268 */ 1272 */
@@ -1294,7 +1298,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1294 !test_cflag(COMMIT_Nolink, tblk->u.ip))); 1298 !test_cflag(COMMIT_Nolink, tblk->u.ip)));
1295 1299
1296 /* 1300 /*
1297 * write COMMIT log record 1301 * write COMMIT log record
1298 */ 1302 */
1299 lrd->type = cpu_to_le16(LOG_COMMIT); 1303 lrd->type = cpu_to_le16(LOG_COMMIT);
1300 lrd->length = 0; 1304 lrd->length = 0;
@@ -1303,7 +1307,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1303 lmGroupCommit(log, tblk); 1307 lmGroupCommit(log, tblk);
1304 1308
1305 /* 1309 /*
1306 * - transaction is now committed - 1310 * - transaction is now committed -
1307 */ 1311 */
1308 1312
1309 /* 1313 /*
@@ -1314,11 +1318,11 @@ int txCommit(tid_t tid, /* transaction identifier */
1314 txForce(tblk); 1318 txForce(tblk);
1315 1319
1316 /* 1320 /*
1317 * update allocation map. 1321 * update allocation map.
1318 * 1322 *
1319 * update inode allocation map and inode: 1323 * update inode allocation map and inode:
1320 * free pager lock on memory object of inode if any. 1324 * free pager lock on memory object of inode if any.
1321 * update block allocation map. 1325 * update block allocation map.
1322 * 1326 *
1323 * txUpdateMap() resets XAD_NEW in XAD. 1327 * txUpdateMap() resets XAD_NEW in XAD.
1324 */ 1328 */
@@ -1326,7 +1330,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1326 txUpdateMap(tblk); 1330 txUpdateMap(tblk);
1327 1331
1328 /* 1332 /*
1329 * free transaction locks and pageout/free pages 1333 * free transaction locks and pageout/free pages
1330 */ 1334 */
1331 txRelease(tblk); 1335 txRelease(tblk);
1332 1336
@@ -1335,7 +1339,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1335 1339
1336 1340
1337 /* 1341 /*
1338 * reset in-memory object state 1342 * reset in-memory object state
1339 */ 1343 */
1340 for (k = 0; k < cd.nip; k++) { 1344 for (k = 0; k < cd.nip; k++) {
1341 ip = cd.iplist[k]; 1345 ip = cd.iplist[k];
@@ -1358,11 +1362,11 @@ int txCommit(tid_t tid, /* transaction identifier */
1358} 1362}
1359 1363
1360/* 1364/*
1361 * NAME: txLog() 1365 * NAME: txLog()
1362 * 1366 *
1363 * FUNCTION: Writes AFTER log records for all lines modified 1367 * FUNCTION: Writes AFTER log records for all lines modified
1364 * by tid for segments specified by inodes in comdata. 1368 * by tid for segments specified by inodes in comdata.
1365 * Code assumes only WRITELOCKS are recorded in lockwords. 1369 * Code assumes only WRITELOCKS are recorded in lockwords.
1366 * 1370 *
1367 * PARAMETERS: 1371 * PARAMETERS:
1368 * 1372 *
@@ -1421,12 +1425,12 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
1421} 1425}
1422 1426
1423/* 1427/*
1424 * diLog() 1428 * diLog()
1425 * 1429 *
1426 * function: log inode tlock and format maplock to update bmap; 1430 * function: log inode tlock and format maplock to update bmap;
1427 */ 1431 */
1428static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1432static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1429 struct tlock * tlck, struct commit * cd) 1433 struct tlock * tlck, struct commit * cd)
1430{ 1434{
1431 int rc = 0; 1435 int rc = 0;
1432 struct metapage *mp; 1436 struct metapage *mp;
@@ -1442,7 +1446,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1442 pxd = &lrd->log.redopage.pxd; 1446 pxd = &lrd->log.redopage.pxd;
1443 1447
1444 /* 1448 /*
1445 * inode after image 1449 * inode after image
1446 */ 1450 */
1447 if (tlck->type & tlckENTRY) { 1451 if (tlck->type & tlckENTRY) {
1448 /* log after-image for logredo(): */ 1452 /* log after-image for logredo(): */
@@ -1456,7 +1460,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1456 tlck->flag |= tlckWRITEPAGE; 1460 tlck->flag |= tlckWRITEPAGE;
1457 } else if (tlck->type & tlckFREE) { 1461 } else if (tlck->type & tlckFREE) {
1458 /* 1462 /*
1459 * free inode extent 1463 * free inode extent
1460 * 1464 *
1461 * (pages of the freed inode extent have been invalidated and 1465 * (pages of the freed inode extent have been invalidated and
1462 * a maplock for free of the extent has been formatted at 1466 * a maplock for free of the extent has been formatted at
@@ -1498,7 +1502,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1498 jfs_err("diLog: UFO type tlck:0x%p", tlck); 1502 jfs_err("diLog: UFO type tlck:0x%p", tlck);
1499#ifdef _JFS_WIP 1503#ifdef _JFS_WIP
1500 /* 1504 /*
1501 * alloc/free external EA extent 1505 * alloc/free external EA extent
1502 * 1506 *
1503 * a maplock for txUpdateMap() to update bPWMAP for alloc/free 1507 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
1504 * of the extent has been formatted at txLock() time; 1508 * of the extent has been formatted at txLock() time;
@@ -1534,9 +1538,9 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1534} 1538}
1535 1539
1536/* 1540/*
1537 * dataLog() 1541 * dataLog()
1538 * 1542 *
1539 * function: log data tlock 1543 * function: log data tlock
1540 */ 1544 */
1541static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1545static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1542 struct tlock * tlck) 1546 struct tlock * tlck)
@@ -1580,9 +1584,9 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1580} 1584}
1581 1585
1582/* 1586/*
1583 * dtLog() 1587 * dtLog()
1584 * 1588 *
1585 * function: log dtree tlock and format maplock to update bmap; 1589 * function: log dtree tlock and format maplock to update bmap;
1586 */ 1590 */
1587static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1591static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1588 struct tlock * tlck) 1592 struct tlock * tlck)
@@ -1603,10 +1607,10 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1603 lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); 1607 lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1604 1608
1605 /* 1609 /*
1606 * page extension via relocation: entry insertion; 1610 * page extension via relocation: entry insertion;
1607 * page extension in-place: entry insertion; 1611 * page extension in-place: entry insertion;
1608 * new right page from page split, reinitialized in-line 1612 * new right page from page split, reinitialized in-line
1609 * root from root page split: entry insertion; 1613 * root from root page split: entry insertion;
1610 */ 1614 */
1611 if (tlck->type & (tlckNEW | tlckEXTEND)) { 1615 if (tlck->type & (tlckNEW | tlckEXTEND)) {
1612 /* log after-image of the new page for logredo(): 1616 /* log after-image of the new page for logredo():
@@ -1641,8 +1645,8 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1641 } 1645 }
1642 1646
1643 /* 1647 /*
1644 * entry insertion/deletion, 1648 * entry insertion/deletion,
1645 * sibling page link update (old right page before split); 1649 * sibling page link update (old right page before split);
1646 */ 1650 */
1647 if (tlck->type & (tlckENTRY | tlckRELINK)) { 1651 if (tlck->type & (tlckENTRY | tlckRELINK)) {
1648 /* log after-image for logredo(): */ 1652 /* log after-image for logredo(): */
@@ -1658,11 +1662,11 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1658 } 1662 }
1659 1663
1660 /* 1664 /*
1661 * page deletion: page has been invalidated 1665 * page deletion: page has been invalidated
1662 * page relocation: source extent 1666 * page relocation: source extent
1663 * 1667 *
1664 * a maplock for free of the page has been formatted 1668 * a maplock for free of the page has been formatted
 1665 * at txLock() time; 1669 * at txLock() time;
1666 */ 1670 */
1667 if (tlck->type & (tlckFREE | tlckRELOCATE)) { 1671 if (tlck->type & (tlckFREE | tlckRELOCATE)) {
1668 /* log LOG_NOREDOPAGE of the deleted page for logredo() 1672 /* log LOG_NOREDOPAGE of the deleted page for logredo()
@@ -1683,9 +1687,9 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1683} 1687}
1684 1688
1685/* 1689/*
1686 * xtLog() 1690 * xtLog()
1687 * 1691 *
1688 * function: log xtree tlock and format maplock to update bmap; 1692 * function: log xtree tlock and format maplock to update bmap;
1689 */ 1693 */
1690static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1694static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1691 struct tlock * tlck) 1695 struct tlock * tlck)
@@ -1725,8 +1729,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1725 xadlock = (struct xdlistlock *) maplock; 1729 xadlock = (struct xdlistlock *) maplock;
1726 1730
1727 /* 1731 /*
1728 * entry insertion/extension; 1732 * entry insertion/extension;
1729 * sibling page link update (old right page before split); 1733 * sibling page link update (old right page before split);
1730 */ 1734 */
1731 if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { 1735 if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
1732 /* log after-image for logredo(): 1736 /* log after-image for logredo():
@@ -1801,7 +1805,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1801 } 1805 }
1802 1806
1803 /* 1807 /*
1804 * page deletion: file deletion/truncation (ref. xtTruncate()) 1808 * page deletion: file deletion/truncation (ref. xtTruncate())
1805 * 1809 *
1806 * (page will be invalidated after log is written and bmap 1810 * (page will be invalidated after log is written and bmap
1807 * is updated from the page); 1811 * is updated from the page);
@@ -1908,13 +1912,13 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1908 } 1912 }
1909 1913
1910 /* 1914 /*
1911 * page/entry truncation: file truncation (ref. xtTruncate()) 1915 * page/entry truncation: file truncation (ref. xtTruncate())
1912 * 1916 *
1913 * |----------+------+------+---------------| 1917 * |----------+------+------+---------------|
1914 * | | | 1918 * | | |
1915 * | | hwm - hwm before truncation 1919 * | | hwm - hwm before truncation
1916 * | next - truncation point 1920 * | next - truncation point
1917 * lwm - lwm before truncation 1921 * lwm - lwm before truncation
1918 * header ? 1922 * header ?
1919 */ 1923 */
1920 if (tlck->type & tlckTRUNCATE) { 1924 if (tlck->type & tlckTRUNCATE) {
@@ -1937,7 +1941,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1937 twm = xtlck->twm.offset; 1941 twm = xtlck->twm.offset;
1938 1942
1939 /* 1943 /*
1940 * write log records 1944 * write log records
1941 */ 1945 */
1942 /* log after-image for logredo(): 1946 /* log after-image for logredo():
1943 * 1947 *
@@ -1997,7 +2001,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1997 } 2001 }
1998 2002
1999 /* 2003 /*
2000 * format maplock(s) for txUpdateMap() to update bmap 2004 * format maplock(s) for txUpdateMap() to update bmap
2001 */ 2005 */
2002 maplock->index = 0; 2006 maplock->index = 0;
2003 2007
@@ -2069,9 +2073,9 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2069} 2073}
2070 2074
2071/* 2075/*
2072 * mapLog() 2076 * mapLog()
2073 * 2077 *
2074 * function: log from maplock of freed data extents; 2078 * function: log from maplock of freed data extents;
2075 */ 2079 */
2076static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 2080static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2077 struct tlock * tlck) 2081 struct tlock * tlck)
@@ -2081,7 +2085,7 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2081 pxd_t *pxd; 2085 pxd_t *pxd;
2082 2086
2083 /* 2087 /*
2084 * page relocation: free the source page extent 2088 * page relocation: free the source page extent
2085 * 2089 *
2086 * a maplock for txUpdateMap() for free of the page 2090 * a maplock for txUpdateMap() for free of the page
2087 * has been formatted at txLock() time saving the src 2091 * has been formatted at txLock() time saving the src
@@ -2155,10 +2159,10 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2155} 2159}
2156 2160
2157/* 2161/*
2158 * txEA() 2162 * txEA()
2159 * 2163 *
2160 * function: acquire maplock for EA/ACL extents or 2164 * function: acquire maplock for EA/ACL extents or
2161 * set COMMIT_INLINE flag; 2165 * set COMMIT_INLINE flag;
2162 */ 2166 */
2163void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) 2167void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2164{ 2168{
@@ -2207,10 +2211,10 @@ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2207} 2211}
2208 2212
2209/* 2213/*
2210 * txForce() 2214 * txForce()
2211 * 2215 *
2212 * function: synchronously write pages locked by transaction 2216 * function: synchronously write pages locked by transaction
2213 * after txLog() but before txUpdateMap(); 2217 * after txLog() but before txUpdateMap();
2214 */ 2218 */
2215static void txForce(struct tblock * tblk) 2219static void txForce(struct tblock * tblk)
2216{ 2220{
@@ -2273,10 +2277,10 @@ static void txForce(struct tblock * tblk)
2273} 2277}
2274 2278
2275/* 2279/*
2276 * txUpdateMap() 2280 * txUpdateMap()
2277 * 2281 *
2278 * function: update persistent allocation map (and working map 2282 * function: update persistent allocation map (and working map
2279 * if appropriate); 2283 * if appropriate);
2280 * 2284 *
2281 * parameter: 2285 * parameter:
2282 */ 2286 */
@@ -2298,7 +2302,7 @@ static void txUpdateMap(struct tblock * tblk)
2298 2302
2299 2303
2300 /* 2304 /*
2301 * update block allocation map 2305 * update block allocation map
2302 * 2306 *
2303 * update allocation state in pmap (and wmap) and 2307 * update allocation state in pmap (and wmap) and
2304 * update lsn of the pmap page; 2308 * update lsn of the pmap page;
@@ -2382,7 +2386,7 @@ static void txUpdateMap(struct tblock * tblk)
2382 } 2386 }
2383 } 2387 }
2384 /* 2388 /*
2385 * update inode allocation map 2389 * update inode allocation map
2386 * 2390 *
2387 * update allocation state in pmap and 2391 * update allocation state in pmap and
2388 * update lsn of the pmap page; 2392 * update lsn of the pmap page;
@@ -2407,24 +2411,24 @@ static void txUpdateMap(struct tblock * tblk)
2407} 2411}
2408 2412
2409/* 2413/*
2410 * txAllocPMap() 2414 * txAllocPMap()
2411 * 2415 *
2412 * function: allocate from persistent map; 2416 * function: allocate from persistent map;
2413 * 2417 *
2414 * parameter: 2418 * parameter:
2415 * ipbmap - 2419 * ipbmap -
 2416 * maplock - 2420 * maplock -
2417 * xad list: 2421 * xad list:
2418 * pxd: 2422 * pxd:
2419 * 2423 *
2420 * maptype - 2424 * maptype -
2421 * allocate from persistent map; 2425 * allocate from persistent map;
2422 * free from persistent map; 2426 * free from persistent map;
 2423 * (e.g., tmp file - free from working map at release 2427 * (e.g., tmp file - free from working map at release
2424 * of last reference); 2428 * of last reference);
2425 * free from persistent and working map; 2429 * free from persistent and working map;
2426 * 2430 *
2427 * lsn - log sequence number; 2431 * lsn - log sequence number;
2428 */ 2432 */
2429static void txAllocPMap(struct inode *ip, struct maplock * maplock, 2433static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2430 struct tblock * tblk) 2434 struct tblock * tblk)
@@ -2478,9 +2482,9 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2478} 2482}
2479 2483
2480/* 2484/*
2481 * txFreeMap() 2485 * txFreeMap()
2482 * 2486 *
2483 * function: free from persistent and/or working map; 2487 * function: free from persistent and/or working map;
2484 * 2488 *
2485 * todo: optimization 2489 * todo: optimization
2486 */ 2490 */
@@ -2579,9 +2583,9 @@ void txFreeMap(struct inode *ip,
2579} 2583}
2580 2584
2581/* 2585/*
2582 * txFreelock() 2586 * txFreelock()
2583 * 2587 *
2584 * function: remove tlock from inode anonymous locklist 2588 * function: remove tlock from inode anonymous locklist
2585 */ 2589 */
2586void txFreelock(struct inode *ip) 2590void txFreelock(struct inode *ip)
2587{ 2591{
@@ -2619,7 +2623,7 @@ void txFreelock(struct inode *ip)
2619} 2623}
2620 2624
2621/* 2625/*
2622 * txAbort() 2626 * txAbort()
2623 * 2627 *
2624 * function: abort tx before commit; 2628 * function: abort tx before commit;
2625 * 2629 *
@@ -2679,7 +2683,7 @@ void txAbort(tid_t tid, int dirty)
2679} 2683}
2680 2684
2681/* 2685/*
2682 * txLazyCommit(void) 2686 * txLazyCommit(void)
2683 * 2687 *
2684 * All transactions except those changing ipimap (COMMIT_FORCE) are 2688 * All transactions except those changing ipimap (COMMIT_FORCE) are
 2685 * processed by this routine. This ensures that the inode and block 2689 * processed by this routine. This ensures that the inode and block
@@ -2728,7 +2732,7 @@ static void txLazyCommit(struct tblock * tblk)
2728} 2732}
2729 2733
2730/* 2734/*
2731 * jfs_lazycommit(void) 2735 * jfs_lazycommit(void)
2732 * 2736 *
2733 * To be run as a kernel daemon. If lbmIODone is called in an interrupt 2737 * To be run as a kernel daemon. If lbmIODone is called in an interrupt
2734 * context, or where blocking is not wanted, this routine will process 2738 * context, or where blocking is not wanted, this routine will process
@@ -2913,7 +2917,7 @@ void txResume(struct super_block *sb)
2913} 2917}
2914 2918
2915/* 2919/*
2916 * jfs_sync(void) 2920 * jfs_sync(void)
2917 * 2921 *
2918 * To be run as a kernel daemon. This is awakened when tlocks run low. 2922 * To be run as a kernel daemon. This is awakened when tlocks run low.
2919 * We write any inodes that have anonymous tlocks so they will become 2923 * We write any inodes that have anonymous tlocks so they will become
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h
index 7863cf21afca..ab7288937019 100644
--- a/fs/jfs/jfs_txnmgr.h
+++ b/fs/jfs/jfs_txnmgr.h
@@ -94,7 +94,7 @@ extern struct tblock *TxBlock; /* transaction block table */
94 */ 94 */
95struct tlock { 95struct tlock {
96 lid_t next; /* 2: index next lockword on tid locklist 96 lid_t next; /* 2: index next lockword on tid locklist
97 * next lockword on freelist 97 * next lockword on freelist
98 */ 98 */
99 tid_t tid; /* 2: transaction id holding lock */ 99 tid_t tid; /* 2: transaction id holding lock */
100 100
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 09b252958687..649f9817accd 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -21,7 +21,7 @@
21/* 21/*
22 * jfs_types.h: 22 * jfs_types.h:
23 * 23 *
24 * basic type/utility definitions 24 * basic type/utility definitions
25 * 25 *
26 * note: this header file must be the 1st include file 26 * note: this header file must be the 1st include file
27 * of JFS include list in all JFS .c file. 27 * of JFS include list in all JFS .c file.
@@ -54,8 +54,8 @@ struct timestruc_t {
54 */ 54 */
55 55
56#define LEFTMOSTONE 0x80000000 56#define LEFTMOSTONE 0x80000000
57#define HIGHORDER 0x80000000u /* high order bit on */ 57#define HIGHORDER 0x80000000u /* high order bit on */
 58#define ONES 0xffffffffu /* all bits on */ 58#define ONES 0xffffffffu /* all bits on */
59 59
60/* 60/*
61 * logical xd (lxd) 61 * logical xd (lxd)
@@ -148,7 +148,7 @@ typedef struct {
148#define sizeDXD(dxd) le32_to_cpu((dxd)->size) 148#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
149 149
150/* 150/*
151 * directory entry argument 151 * directory entry argument
152 */ 152 */
153struct component_name { 153struct component_name {
154 int namlen; 154 int namlen;
@@ -160,14 +160,14 @@ struct component_name {
160 * DASD limit information - stored in directory inode 160 * DASD limit information - stored in directory inode
161 */ 161 */
162struct dasd { 162struct dasd {
163 u8 thresh; /* Alert Threshold (in percent) */ 163 u8 thresh; /* Alert Threshold (in percent) */
164 u8 delta; /* Alert Threshold delta (in percent) */ 164 u8 delta; /* Alert Threshold delta (in percent) */
165 u8 rsrvd1; 165 u8 rsrvd1;
166 u8 limit_hi; /* DASD limit (in logical blocks) */ 166 u8 limit_hi; /* DASD limit (in logical blocks) */
167 __le32 limit_lo; /* DASD limit (in logical blocks) */ 167 __le32 limit_lo; /* DASD limit (in logical blocks) */
168 u8 rsrvd2[3]; 168 u8 rsrvd2[3];
169 u8 used_hi; /* DASD usage (in logical blocks) */ 169 u8 used_hi; /* DASD usage (in logical blocks) */
170 __le32 used_lo; /* DASD usage (in logical blocks) */ 170 __le32 used_lo; /* DASD usage (in logical blocks) */
171}; 171};
172 172
173#define DASDLIMIT(dasdp) \ 173#define DASDLIMIT(dasdp) \
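
Both the limit and the usage in struct dasd are split into an 8-bit high part and a little-endian 32-bit low part; DASDLIMIT(), whose body falls outside this hunk, presumably stitches them back into one 40-bit block count. A sketch of that recombination, written as a function for readability (the exact expression is an assumption, not a quote of the macro):

static inline u64 dasd_limit_sketch(const struct dasd *dasdp)
{
	/* assumed recombination of the hi/lo pair into a 40-bit block count */
	return ((u64) dasdp->limit_hi << 32) | le32_to_cpu(dasdp->limit_lo);
}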
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index a386f48c73fc..7971f37534a3 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -60,7 +60,7 @@ int jfs_umount(struct super_block *sb)
60 jfs_info("UnMount JFS: sb:0x%p", sb); 60 jfs_info("UnMount JFS: sb:0x%p", sb);
61 61
62 /* 62 /*
63 * update superblock and close log 63 * update superblock and close log
64 * 64 *
65 * if mounted read-write and log based recovery was enabled 65 * if mounted read-write and log based recovery was enabled
66 */ 66 */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index acc97c46d8a4..1543906a2e0d 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -16,7 +16,7 @@
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18/* 18/*
19 * jfs_xtree.c: extent allocation descriptor B+-tree manager 19 * jfs_xtree.c: extent allocation descriptor B+-tree manager
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
@@ -32,30 +32,30 @@
32/* 32/*
33 * xtree local flag 33 * xtree local flag
34 */ 34 */
35#define XT_INSERT 0x00000001 35#define XT_INSERT 0x00000001
36 36
37/* 37/*
38 * xtree key/entry comparison: extent offset 38 * xtree key/entry comparison: extent offset
39 * 39 *
40 * return: 40 * return:
41 * -1: k < start of extent 41 * -1: k < start of extent
42 * 0: start_of_extent <= k <= end_of_extent 42 * 0: start_of_extent <= k <= end_of_extent
43 * 1: k > end_of_extent 43 * 1: k > end_of_extent
44 */ 44 */
45#define XT_CMP(CMP, K, X, OFFSET64)\ 45#define XT_CMP(CMP, K, X, OFFSET64)\
46{\ 46{\
47 OFFSET64 = offsetXAD(X);\ 47 OFFSET64 = offsetXAD(X);\
48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ 48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
49 ((K) < OFFSET64) ? -1 : 0;\ 49 ((K) < OFFSET64) ? -1 : 0;\
50} 50}
51 51
52/* write a xad entry */ 52/* write a xad entry */
53#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ 53#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
54{\ 54{\
55 (XAD)->flag = (FLAG);\ 55 (XAD)->flag = (FLAG);\
56 XADoffset((XAD), (OFF));\ 56 XADoffset((XAD), (OFF));\
57 XADlength((XAD), (LEN));\ 57 XADlength((XAD), (LEN));\
58 XADaddress((XAD), (ADDR));\ 58 XADaddress((XAD), (ADDR));\
59} 59}
60 60
61#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) 61#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
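
XT_CMP() above is a statement macro; read as a function, the three-way comparison it performs is simply a range test against the extent described by the xad. An illustrative function form (the tree code itself uses the macro, this is only a restatement):

static inline int xt_cmp_sketch(s64 k, xad_t *x)
{
	s64 off = offsetXAD(x);

	if (k < off)
		return -1;			/* k below the extent */
	if (k >= off + lengthXAD(x))
		return 1;			/* k past the end of the extent */
	return 0;				/* start <= k <= end: hit */
}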
@@ -76,13 +76,13 @@
76 MP = NULL;\ 76 MP = NULL;\
77 RC = -EIO;\ 77 RC = -EIO;\
78 }\ 78 }\
79 }\ 79 }\
80} 80}
81 81
82/* for consistency */ 82/* for consistency */
83#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) 83#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
84 84
85#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ 85#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) 86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
87/* xtree entry parameter descriptor */ 87/* xtree entry parameter descriptor */
88struct xtsplit { 88struct xtsplit {
@@ -97,7 +97,7 @@ struct xtsplit {
97 97
98 98
99/* 99/*
100 * statistics 100 * statistics
101 */ 101 */
102#ifdef CONFIG_JFS_STATISTICS 102#ifdef CONFIG_JFS_STATISTICS
103static struct { 103static struct {
@@ -136,7 +136,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
136#endif /* _STILL_TO_PORT */ 136#endif /* _STILL_TO_PORT */
137 137
138/* 138/*
139 * xtLookup() 139 * xtLookup()
140 * 140 *
141 * function: map a single page into a physical extent; 141 * function: map a single page into a physical extent;
142 */ 142 */
@@ -179,7 +179,7 @@ int xtLookup(struct inode *ip, s64 lstart,
179 } 179 }
180 180
181 /* 181 /*
182 * compute the physical extent covering logical extent 182 * compute the physical extent covering logical extent
183 * 183 *
184 * N.B. search may have failed (e.g., hole in sparse file), 184 * N.B. search may have failed (e.g., hole in sparse file),
185 * and returned the index of the next entry. 185 * and returned the index of the next entry.
@@ -220,27 +220,27 @@ int xtLookup(struct inode *ip, s64 lstart,
220 220
221 221
222/* 222/*
223 * xtLookupList() 223 * xtLookupList()
224 * 224 *
 225 * function: map a single logical extent into a list of physical extents; 225 * function: map a single logical extent into a list of physical extents;
226 * 226 *
227 * parameter: 227 * parameter:
228 * struct inode *ip, 228 * struct inode *ip,
229 * struct lxdlist *lxdlist, lxd list (in) 229 * struct lxdlist *lxdlist, lxd list (in)
230 * struct xadlist *xadlist, xad list (in/out) 230 * struct xadlist *xadlist, xad list (in/out)
231 * int flag) 231 * int flag)
232 * 232 *
233 * coverage of lxd by xad under assumption of 233 * coverage of lxd by xad under assumption of
234 * . lxd's are ordered and disjoint. 234 * . lxd's are ordered and disjoint.
235 * . xad's are ordered and disjoint. 235 * . xad's are ordered and disjoint.
236 * 236 *
237 * return: 237 * return:
238 * 0: success 238 * 0: success
239 * 239 *
240 * note: a page being written (even a single byte) is backed fully, 240 * note: a page being written (even a single byte) is backed fully,
241 * except the last page which is only backed with blocks 241 * except the last page which is only backed with blocks
242 * required to cover the last byte; 242 * required to cover the last byte;
243 * the extent backing a page is fully contained within an xad; 243 * the extent backing a page is fully contained within an xad;
244 */ 244 */
245int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, 245int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
246 struct xadlist * xadlist, int flag) 246 struct xadlist * xadlist, int flag)
@@ -284,7 +284,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
284 return rc; 284 return rc;
285 285
286 /* 286 /*
287 * compute the physical extent covering logical extent 287 * compute the physical extent covering logical extent
288 * 288 *
289 * N.B. search may have failed (e.g., hole in sparse file), 289 * N.B. search may have failed (e.g., hole in sparse file),
290 * and returned the index of the next entry. 290 * and returned the index of the next entry.
@@ -343,7 +343,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
343 if (lstart >= size) 343 if (lstart >= size)
344 goto mapend; 344 goto mapend;
345 345
346 /* compare with the current xad */ 346 /* compare with the current xad */
347 goto compare1; 347 goto compare1;
348 } 348 }
349 /* lxd is covered by xad */ 349 /* lxd is covered by xad */
@@ -430,7 +430,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
430 /* 430 /*
431 * lxd is partially covered by xad 431 * lxd is partially covered by xad
432 */ 432 */
433 else { /* (xend < lend) */ 433 else { /* (xend < lend) */
434 434
435 /* 435 /*
436 * get next xad 436 * get next xad
@@ -477,22 +477,22 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
477 477
478 478
479/* 479/*
480 * xtSearch() 480 * xtSearch()
481 * 481 *
482 * function: search for the xad entry covering specified offset. 482 * function: search for the xad entry covering specified offset.
483 * 483 *
484 * parameters: 484 * parameters:
485 * ip - file object; 485 * ip - file object;
486 * xoff - extent offset; 486 * xoff - extent offset;
487 * nextp - address of next extent (if any) for search miss 487 * nextp - address of next extent (if any) for search miss
488 * cmpp - comparison result: 488 * cmpp - comparison result:
489 * btstack - traverse stack; 489 * btstack - traverse stack;
490 * flag - search process flag (XT_INSERT); 490 * flag - search process flag (XT_INSERT);
491 * 491 *
492 * returns: 492 * returns:
493 * btstack contains (bn, index) of search path traversed to the entry. 493 * btstack contains (bn, index) of search path traversed to the entry.
494 * *cmpp is set to result of comparison with the entry returned. 494 * *cmpp is set to result of comparison with the entry returned.
495 * the page containing the entry is pinned at exit. 495 * the page containing the entry is pinned at exit.
496 */ 496 */
497static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, 497static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
498 int *cmpp, struct btstack * btstack, int flag) 498 int *cmpp, struct btstack * btstack, int flag)
@@ -517,7 +517,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
517 btstack->nsplit = 0; 517 btstack->nsplit = 0;
518 518
519 /* 519 /*
520 * search down tree from root: 520 * search down tree from root:
521 * 521 *
522 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 522 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
523 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 523 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -642,7 +642,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
642 XT_CMP(cmp, xoff, &p->xad[index], t64); 642 XT_CMP(cmp, xoff, &p->xad[index], t64);
643 if (cmp == 0) { 643 if (cmp == 0) {
644 /* 644 /*
645 * search hit 645 * search hit
646 */ 646 */
647 /* search hit - leaf page: 647 /* search hit - leaf page:
648 * return the entry found 648 * return the entry found
@@ -692,7 +692,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
692 } 692 }
693 693
694 /* 694 /*
695 * search miss 695 * search miss
696 * 696 *
697 * base is the smallest index with key (Kj) greater than 697 * base is the smallest index with key (Kj) greater than
698 * search key (K) and may be zero or maxentry index. 698 * search key (K) and may be zero or maxentry index.
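
The miss case above feeds the descent rule documented earlier: child Pi covers Ki <= K < Kj, so on a miss the search continues in the child at base - 1. A hedged sketch of the per-page step (names mirror the surrounding context; the real loop in xtSearch() is structured differently, this only restates the comparison logic):

static int xt_page_step_sketch(xtpage_t *p, s64 k, int *indexp)
{
	int lo = XTENTRYSTART;
	int hi = le16_to_cpu(p->header.nextindex) - 1;

	while (lo <= hi) {
		int mid = (lo + hi) >> 1;
		s64 off = offsetXAD(&p->xad[mid]);

		if (k < off)
			hi = mid - 1;
		else if (k >= off + lengthXAD(&p->xad[mid]))
			lo = mid + 1;
		else {
			*indexp = mid;		/* search hit */
			return 0;
		}
	}
	*indexp = lo - 1;			/* search miss: descend via entry base - 1 */
	return -1;
}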
@@ -773,22 +773,22 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
773} 773}
774 774
775/* 775/*
776 * xtInsert() 776 * xtInsert()
777 * 777 *
778 * function: 778 * function:
779 * 779 *
780 * parameter: 780 * parameter:
781 * tid - transaction id; 781 * tid - transaction id;
782 * ip - file object; 782 * ip - file object;
783 * xflag - extent flag (XAD_NOTRECORDED): 783 * xflag - extent flag (XAD_NOTRECORDED):
784 * xoff - extent offset; 784 * xoff - extent offset;
785 * xlen - extent length; 785 * xlen - extent length;
786 * xaddrp - extent address pointer (in/out): 786 * xaddrp - extent address pointer (in/out):
787 * if (*xaddrp) 787 * if (*xaddrp)
788 * caller allocated data extent at *xaddrp; 788 * caller allocated data extent at *xaddrp;
789 * else 789 * else
790 * allocate data extent and return its xaddr; 790 * allocate data extent and return its xaddr;
791 * flag - 791 * flag -
792 * 792 *
793 * return: 793 * return:
794 */ 794 */
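
The xaddrp parameter is the interesting one: a caller that has already allocated the data extent passes its address in, otherwise it passes 0 and xtInsert() allocates the extent and returns the address. A hedged usage sketch (the trailing part of the signature is assumed from jfs_xtree.h rather than shown in this hunk):

static int xt_insert_usage_sketch(tid_t tid, struct inode *ip, s64 xoff, int xlen)
{
	s64 xaddr = 0;	/* 0: let xtInsert() allocate the data extent itself */
	int rc;

	rc = xtInsert(tid, ip, 0 /* xflag */, xoff, xlen, &xaddr, 0);
	if (rc == 0)
		jfs_info("extent of %d blocks recorded at 0x%lx", xlen, (ulong) xaddr);
	return rc;
}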
@@ -813,7 +813,7 @@ int xtInsert(tid_t tid, /* transaction id */
813 jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); 813 jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
814 814
815 /* 815 /*
816 * search for the entry location at which to insert: 816 * search for the entry location at which to insert:
817 * 817 *
 818 * xtFastSearch() and xtSearch() both return (leaf page 818 * xtFastSearch() and xtSearch() both return (leaf page
819 * pinned, index at which to insert). 819 * pinned, index at which to insert).
@@ -853,13 +853,13 @@ int xtInsert(tid_t tid, /* transaction id */
853 } 853 }
854 854
855 /* 855 /*
856 * insert entry for new extent 856 * insert entry for new extent
857 */ 857 */
858 xflag |= XAD_NEW; 858 xflag |= XAD_NEW;
859 859
860 /* 860 /*
861 * if the leaf page is full, split the page and 861 * if the leaf page is full, split the page and
862 * propagate up the router entry for the new page from split 862 * propagate up the router entry for the new page from split
863 * 863 *
864 * The xtSplitUp() will insert the entry and unpin the leaf page. 864 * The xtSplitUp() will insert the entry and unpin the leaf page.
865 */ 865 */
@@ -886,7 +886,7 @@ int xtInsert(tid_t tid, /* transaction id */
886 } 886 }
887 887
888 /* 888 /*
889 * insert the new entry into the leaf page 889 * insert the new entry into the leaf page
890 */ 890 */
891 /* 891 /*
892 * acquire a transaction lock on the leaf page; 892 * acquire a transaction lock on the leaf page;
@@ -930,16 +930,16 @@ int xtInsert(tid_t tid, /* transaction id */
930 930
931 931
932/* 932/*
933 * xtSplitUp() 933 * xtSplitUp()
934 * 934 *
935 * function: 935 * function:
936 * split full pages as propagating insertion up the tree 936 * split full pages as propagating insertion up the tree
937 * 937 *
938 * parameter: 938 * parameter:
939 * tid - transaction id; 939 * tid - transaction id;
940 * ip - file object; 940 * ip - file object;
941 * split - entry parameter descriptor; 941 * split - entry parameter descriptor;
942 * btstack - traverse stack from xtSearch() 942 * btstack - traverse stack from xtSearch()
943 * 943 *
944 * return: 944 * return:
945 */ 945 */
@@ -1199,22 +1199,22 @@ xtSplitUp(tid_t tid,
1199 1199
1200 1200
1201/* 1201/*
1202 * xtSplitPage() 1202 * xtSplitPage()
1203 * 1203 *
1204 * function: 1204 * function:
1205 * split a full non-root page into 1205 * split a full non-root page into
1206 * original/split/left page and new right page 1206 * original/split/left page and new right page
1207 * i.e., the original/split page remains as left page. 1207 * i.e., the original/split page remains as left page.
1208 * 1208 *
1209 * parameter: 1209 * parameter:
1210 * int tid, 1210 * int tid,
1211 * struct inode *ip, 1211 * struct inode *ip,
1212 * struct xtsplit *split, 1212 * struct xtsplit *split,
1213 * struct metapage **rmpp, 1213 * struct metapage **rmpp,
1214 * u64 *rbnp, 1214 * u64 *rbnp,
1215 * 1215 *
1216 * return: 1216 * return:
1217 * Pointer to page in which to insert or NULL on error. 1217 * Pointer to page in which to insert or NULL on error.
1218 */ 1218 */
1219static int 1219static int
1220xtSplitPage(tid_t tid, struct inode *ip, 1220xtSplitPage(tid_t tid, struct inode *ip,
@@ -1248,9 +1248,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
1248 rbn = addressPXD(pxd); 1248 rbn = addressPXD(pxd);
1249 1249
1250 /* Allocate blocks to quota. */ 1250 /* Allocate blocks to quota. */
1251 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { 1251 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
1252 rc = -EDQUOT; 1252 rc = -EDQUOT;
1253 goto clean_up; 1253 goto clean_up;
1254 } 1254 }
1255 1255
1256 quota_allocation += lengthPXD(pxd); 1256 quota_allocation += lengthPXD(pxd);
@@ -1304,7 +1304,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1304 skip = split->index; 1304 skip = split->index;
1305 1305
1306 /* 1306 /*
1307 * sequential append at tail (after last entry of last page) 1307 * sequential append at tail (after last entry of last page)
1308 * 1308 *
1309 * if splitting the last page on a level because of appending 1309 * if splitting the last page on a level because of appending
 1310 * an entry to it (skip is maxentry), it's likely that the access is 1310 * an entry to it (skip is maxentry), it's likely that the access is
@@ -1342,7 +1342,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1342 } 1342 }
1343 1343
1344 /* 1344 /*
1345 * non-sequential insert (at possibly middle page) 1345 * non-sequential insert (at possibly middle page)
1346 */ 1346 */
1347 1347
1348 /* 1348 /*
@@ -1465,25 +1465,24 @@ xtSplitPage(tid_t tid, struct inode *ip,
1465 1465
1466 1466
1467/* 1467/*
1468 * xtSplitRoot() 1468 * xtSplitRoot()
1469 * 1469 *
1470 * function: 1470 * function:
1471 * split the full root page into 1471 * split the full root page into original/root/split page and new
1472 * original/root/split page and new right page 1472 * right page
1473 * i.e., root remains fixed in tree anchor (inode) and 1473 * i.e., root remains fixed in tree anchor (inode) and the root is
1474 * the root is copied to a single new right child page 1474 * copied to a single new right child page since root page <<
1475 * since root page << non-root page, and 1475 * non-root page, and the split root page contains a single entry
1476 * the split root page contains a single entry for the 1476 * for the new right child page.
1477 * new right child page.
1478 * 1477 *
1479 * parameter: 1478 * parameter:
1480 * int tid, 1479 * int tid,
1481 * struct inode *ip, 1480 * struct inode *ip,
1482 * struct xtsplit *split, 1481 * struct xtsplit *split,
1483 * struct metapage **rmpp) 1482 * struct metapage **rmpp)
1484 * 1483 *
1485 * return: 1484 * return:
1486 * Pointer to page in which to insert or NULL on error. 1485 * Pointer to page in which to insert or NULL on error.
1487 */ 1486 */
1488static int 1487static int
1489xtSplitRoot(tid_t tid, 1488xtSplitRoot(tid_t tid,
@@ -1505,7 +1504,7 @@ xtSplitRoot(tid_t tid,
1505 INCREMENT(xtStat.split); 1504 INCREMENT(xtStat.split);
1506 1505
1507 /* 1506 /*
1508 * allocate a single (right) child page 1507 * allocate a single (right) child page
1509 */ 1508 */
1510 pxdlist = split->pxdlist; 1509 pxdlist = split->pxdlist;
1511 pxd = &pxdlist->pxd[pxdlist->npxd]; 1510 pxd = &pxdlist->pxd[pxdlist->npxd];
@@ -1573,7 +1572,7 @@ xtSplitRoot(tid_t tid,
1573 } 1572 }
1574 1573
1575 /* 1574 /*
1576 * reset the root 1575 * reset the root
1577 * 1576 *
1578 * init root with the single entry for the new right page 1577 * init root with the single entry for the new right page
 1579 * set the 1st entry offset to 0, which forces the left-most key 1578 * set the 1st entry offset to 0, which forces the left-most key
@@ -1610,7 +1609,7 @@ xtSplitRoot(tid_t tid,
1610 1609
1611 1610
1612/* 1611/*
1613 * xtExtend() 1612 * xtExtend()
1614 * 1613 *
1615 * function: extend in-place; 1614 * function: extend in-place;
1616 * 1615 *
@@ -1677,7 +1676,7 @@ int xtExtend(tid_t tid, /* transaction id */
1677 goto extendOld; 1676 goto extendOld;
1678 1677
1679 /* 1678 /*
1680 * extent overflow: insert entry for new extent 1679 * extent overflow: insert entry for new extent
1681 */ 1680 */
1682//insertNew: 1681//insertNew:
1683 xoff = offsetXAD(xad) + MAXXLEN; 1682 xoff = offsetXAD(xad) + MAXXLEN;
@@ -1685,8 +1684,8 @@ int xtExtend(tid_t tid, /* transaction id */
1685 nextindex = le16_to_cpu(p->header.nextindex); 1684 nextindex = le16_to_cpu(p->header.nextindex);
1686 1685
1687 /* 1686 /*
1688 * if the leaf page is full, insert the new entry and 1687 * if the leaf page is full, insert the new entry and
1689 * propagate up the router entry for the new page from split 1688 * propagate up the router entry for the new page from split
1690 * 1689 *
1691 * The xtSplitUp() will insert the entry and unpin the leaf page. 1690 * The xtSplitUp() will insert the entry and unpin the leaf page.
1692 */ 1691 */
@@ -1731,7 +1730,7 @@ int xtExtend(tid_t tid, /* transaction id */
1731 } 1730 }
1732 } 1731 }
1733 /* 1732 /*
1734 * insert the new entry into the leaf page 1733 * insert the new entry into the leaf page
1735 */ 1734 */
1736 else { 1735 else {
1737 /* insert the new entry: mark the entry NEW */ 1736 /* insert the new entry: mark the entry NEW */
@@ -1771,11 +1770,11 @@ int xtExtend(tid_t tid, /* transaction id */
1771 1770
1772#ifdef _NOTYET 1771#ifdef _NOTYET
1773/* 1772/*
1774 * xtTailgate() 1773 * xtTailgate()
1775 * 1774 *
1776 * function: split existing 'tail' extent 1775 * function: split existing 'tail' extent
1777 * (split offset >= start offset of tail extent), and 1776 * (split offset >= start offset of tail extent), and
1778 * relocate and extend the split tail half; 1777 * relocate and extend the split tail half;
1779 * 1778 *
1780 * note: existing extent may or may not have been committed. 1779 * note: existing extent may or may not have been committed.
1781 * caller is responsible for pager buffer cache update, and 1780 * caller is responsible for pager buffer cache update, and
@@ -1804,7 +1803,7 @@ int xtTailgate(tid_t tid, /* transaction id */
1804 1803
1805/* 1804/*
1806printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", 1805printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1807 (ulong)xoff, xlen, (ulong)xaddr); 1806 (ulong)xoff, xlen, (ulong)xaddr);
1808*/ 1807*/
1809 1808
1810 /* there must exist extent to be tailgated */ 1809 /* there must exist extent to be tailgated */
@@ -1842,18 +1841,18 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1842 xad = &p->xad[index]; 1841 xad = &p->xad[index];
1843/* 1842/*
1844printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", 1843printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1845 (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); 1844 (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
1846*/ 1845*/
1847 if ((llen = xoff - offsetXAD(xad)) == 0) 1846 if ((llen = xoff - offsetXAD(xad)) == 0)
1848 goto updateOld; 1847 goto updateOld;
1849 1848
1850 /* 1849 /*
1851 * partially replace extent: insert entry for new extent 1850 * partially replace extent: insert entry for new extent
1852 */ 1851 */
1853//insertNew: 1852//insertNew:
1854 /* 1853 /*
1855 * if the leaf page is full, insert the new entry and 1854 * if the leaf page is full, insert the new entry and
1856 * propagate up the router entry for the new page from split 1855 * propagate up the router entry for the new page from split
1857 * 1856 *
1858 * The xtSplitUp() will insert the entry and unpin the leaf page. 1857 * The xtSplitUp() will insert the entry and unpin the leaf page.
1859 */ 1858 */
@@ -1898,7 +1897,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1898 } 1897 }
1899 } 1898 }
1900 /* 1899 /*
1901 * insert the new entry into the leaf page 1900 * insert the new entry into the leaf page
1902 */ 1901 */
1903 else { 1902 else {
1904 /* insert the new entry: mark the entry NEW */ 1903 /* insert the new entry: mark the entry NEW */
@@ -1955,17 +1954,17 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1955#endif /* _NOTYET */ 1954#endif /* _NOTYET */
1956 1955
1957/* 1956/*
1958 * xtUpdate() 1957 * xtUpdate()
1959 * 1958 *
1960 * function: update XAD; 1959 * function: update XAD;
1961 * 1960 *
1962 * update extent for allocated_but_not_recorded or 1961 * update extent for allocated_but_not_recorded or
1963 * compressed extent; 1962 * compressed extent;
1964 * 1963 *
1965 * parameter: 1964 * parameter:
1966 * nxad - new XAD; 1965 * nxad - new XAD;
1967 * logical extent of the specified XAD must be completely 1966 * logical extent of the specified XAD must be completely
1968 * contained by an existing XAD; 1967 * contained by an existing XAD;
1969 */ 1968 */
1970int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) 1969int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1971{ /* new XAD */ 1970{ /* new XAD */
@@ -2416,19 +2415,19 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p);
2416 2415
2417 2416
2418/* 2417/*
2419 * xtAppend() 2418 * xtAppend()
2420 * 2419 *
 2421 * function: grow in append mode from the specified contiguous region; 2420 * function: grow in append mode from the specified contiguous region;
2422 * 2421 *
2423 * parameter: 2422 * parameter:
2424 * tid - transaction id; 2423 * tid - transaction id;
2425 * ip - file object; 2424 * ip - file object;
2426 * xflag - extent flag: 2425 * xflag - extent flag:
2427 * xoff - extent offset; 2426 * xoff - extent offset;
2428 * maxblocks - max extent length; 2427 * maxblocks - max extent length;
2429 * xlen - extent length (in/out); 2428 * xlen - extent length (in/out);
2430 * xaddrp - extent address pointer (in/out): 2429 * xaddrp - extent address pointer (in/out):
2431 * flag - 2430 * flag -
2432 * 2431 *
2433 * return: 2432 * return:
2434 */ 2433 */
@@ -2460,7 +2459,7 @@ int xtAppend(tid_t tid, /* transaction id */
2460 (ulong) xoff, maxblocks, xlen, (ulong) xaddr); 2459 (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
2461 2460
2462 /* 2461 /*
2463 * search for the entry location at which to insert: 2462 * search for the entry location at which to insert:
2464 * 2463 *
 2465 * xtFastSearch() and xtSearch() both return (leaf page 2464 * xtFastSearch() and xtSearch() both return (leaf page
2466 * pinned, index at which to insert). 2465 * pinned, index at which to insert).
@@ -2482,13 +2481,13 @@ int xtAppend(tid_t tid, /* transaction id */
2482 xlen = min(xlen, (int)(next - xoff)); 2481 xlen = min(xlen, (int)(next - xoff));
2483//insert: 2482//insert:
2484 /* 2483 /*
2485 * insert entry for new extent 2484 * insert entry for new extent
2486 */ 2485 */
2487 xflag |= XAD_NEW; 2486 xflag |= XAD_NEW;
2488 2487
2489 /* 2488 /*
2490 * if the leaf page is full, split the page and 2489 * if the leaf page is full, split the page and
2491 * propagate up the router entry for the new page from split 2490 * propagate up the router entry for the new page from split
2492 * 2491 *
2493 * The xtSplitUp() will insert the entry and unpin the leaf page. 2492 * The xtSplitUp() will insert the entry and unpin the leaf page.
2494 */ 2493 */
@@ -2545,7 +2544,7 @@ int xtAppend(tid_t tid, /* transaction id */
2545 return 0; 2544 return 0;
2546 2545
2547 /* 2546 /*
2548 * insert the new entry into the leaf page 2547 * insert the new entry into the leaf page
2549 */ 2548 */
2550 insertLeaf: 2549 insertLeaf:
2551 /* 2550 /*
@@ -2589,17 +2588,17 @@ int xtAppend(tid_t tid, /* transaction id */
2589 2588
 2590/* - TBD for defragmentation/reorganization - 2589/* - TBD for defragmentation/reorganization -
2591 * 2590 *
2592 * xtDelete() 2591 * xtDelete()
2593 * 2592 *
2594 * function: 2593 * function:
2595 * delete the entry with the specified key. 2594 * delete the entry with the specified key.
2596 * 2595 *
2597 * N.B.: whole extent of the entry is assumed to be deleted. 2596 * N.B.: whole extent of the entry is assumed to be deleted.
2598 * 2597 *
2599 * parameter: 2598 * parameter:
2600 * 2599 *
2601 * return: 2600 * return:
2602 * ENOENT: if the entry is not found. 2601 * ENOENT: if the entry is not found.
2603 * 2602 *
2604 * exception: 2603 * exception:
2605 */ 2604 */
@@ -2665,10 +2664,10 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
2665 2664
 2666/* - TBD for defragmentation/reorganization - 2665/* - TBD for defragmentation/reorganization -
2667 * 2666 *
2668 * xtDeleteUp() 2667 * xtDeleteUp()
2669 * 2668 *
2670 * function: 2669 * function:
2671 * free empty pages as propagating deletion up the tree 2670 * free empty pages as propagating deletion up the tree
2672 * 2671 *
2673 * parameter: 2672 * parameter:
2674 * 2673 *
@@ -2815,15 +2814,15 @@ xtDeleteUp(tid_t tid, struct inode *ip,
2815 2814
2816 2815
2817/* 2816/*
2818 * NAME: xtRelocate() 2817 * NAME: xtRelocate()
2819 * 2818 *
2820 * FUNCTION: relocate xtpage or data extent of regular file; 2819 * FUNCTION: relocate xtpage or data extent of regular file;
2821 * This function is mainly used by defragfs utility. 2820 * This function is mainly used by defragfs utility.
2822 * 2821 *
2823 * NOTE: This routine does not have the logic to handle 2822 * NOTE: This routine does not have the logic to handle
2824 * uncommitted allocated extent. The caller should call 2823 * uncommitted allocated extent. The caller should call
 2825 * txCommit() to commit all the allocation before calling 2824 * txCommit() to commit all the allocation before calling
2826 * this routine. 2825 * this routine.
2827 */ 2826 */
2828int 2827int
2829xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ 2828xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
@@ -2865,8 +2864,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2865 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); 2864 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
2866 2865
2867 /* 2866 /*
2868 * 1. get and validate the parent xtpage/xad entry 2867 * 1. get and validate the parent xtpage/xad entry
2869 * covering the source extent to be relocated; 2868 * covering the source extent to be relocated;
2870 */ 2869 */
2871 if (xtype == DATAEXT) { 2870 if (xtype == DATAEXT) {
2872 /* search in leaf entry */ 2871 /* search in leaf entry */
@@ -2910,7 +2909,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2910 jfs_info("xtRelocate: parent xad entry validated."); 2909 jfs_info("xtRelocate: parent xad entry validated.");
2911 2910
2912 /* 2911 /*
2913 * 2. relocate the extent 2912 * 2. relocate the extent
2914 */ 2913 */
2915 if (xtype == DATAEXT) { 2914 if (xtype == DATAEXT) {
2916 /* if the extent is allocated-but-not-recorded 2915 /* if the extent is allocated-but-not-recorded
@@ -2923,7 +2922,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2923 XT_PUTPAGE(pmp); 2922 XT_PUTPAGE(pmp);
2924 2923
2925 /* 2924 /*
2926 * cmRelocate() 2925 * cmRelocate()
2927 * 2926 *
2928 * copy target data pages to be relocated; 2927 * copy target data pages to be relocated;
2929 * 2928 *
@@ -2945,8 +2944,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2945 pno = offset >> CM_L2BSIZE; 2944 pno = offset >> CM_L2BSIZE;
2946 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; 2945 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
2947/* 2946/*
2948 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - 2947 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
2949 (offset >> CM_L2BSIZE) + 1; 2948 (offset >> CM_L2BSIZE) + 1;
2950*/ 2949*/
2951 sxaddr = oxaddr; 2950 sxaddr = oxaddr;
2952 dxaddr = nxaddr; 2951 dxaddr = nxaddr;
@@ -2981,7 +2980,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2981 2980
2982 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); 2981 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2983 jfs_info("xtRelocate: target data extent relocated."); 2982 jfs_info("xtRelocate: target data extent relocated.");
2984 } else { /* (xtype == XTPAGE) */ 2983 } else { /* (xtype == XTPAGE) */
2985 2984
2986 /* 2985 /*
2987 * read in the target xtpage from the source extent; 2986 * read in the target xtpage from the source extent;
@@ -3026,16 +3025,14 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3026 */ 3025 */
3027 if (lmp) { 3026 if (lmp) {
3028 BT_MARK_DIRTY(lmp, ip); 3027 BT_MARK_DIRTY(lmp, ip);
3029 tlck = 3028 tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
3030 txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
3031 lp->header.next = cpu_to_le64(nxaddr); 3029 lp->header.next = cpu_to_le64(nxaddr);
3032 XT_PUTPAGE(lmp); 3030 XT_PUTPAGE(lmp);
3033 } 3031 }
3034 3032
3035 if (rmp) { 3033 if (rmp) {
3036 BT_MARK_DIRTY(rmp, ip); 3034 BT_MARK_DIRTY(rmp, ip);
3037 tlck = 3035 tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
3038 txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
3039 rp->header.prev = cpu_to_le64(nxaddr); 3036 rp->header.prev = cpu_to_le64(nxaddr);
3040 XT_PUTPAGE(rmp); 3037 XT_PUTPAGE(rmp);
3041 } 3038 }
@@ -3062,7 +3059,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3062 * scan may be skipped by commit() and logredo(); 3059 * scan may be skipped by commit() and logredo();
3063 */ 3060 */
3064 BT_MARK_DIRTY(mp, ip); 3061 BT_MARK_DIRTY(mp, ip);
3065 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ 3062 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
3066 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); 3063 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
3067 xtlck = (struct xtlock *) & tlck->lock; 3064 xtlck = (struct xtlock *) & tlck->lock;
3068 3065
@@ -3084,7 +3081,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3084 } 3081 }
3085 3082
3086 /* 3083 /*
3087 * 3. acquire maplock for the source extent to be freed; 3084 * 3. acquire maplock for the source extent to be freed;
3088 * 3085 *
3089 * acquire a maplock saving the src relocated extent address; 3086 * acquire a maplock saving the src relocated extent address;
3090 * to free of the extent at commit time; 3087 * to free of the extent at commit time;
@@ -3105,7 +3102,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3105 * is no buffer associated with this lock since the buffer 3102 * is no buffer associated with this lock since the buffer
3106 * has been redirected to the target location. 3103 * has been redirected to the target location.
3107 */ 3104 */
3108 else /* (xtype == XTPAGE) */ 3105 else /* (xtype == XTPAGE) */
3109 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); 3106 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
3110 3107
3111 pxdlock = (struct pxd_lock *) & tlck->lock; 3108 pxdlock = (struct pxd_lock *) & tlck->lock;
@@ -3115,7 +3112,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3115 pxdlock->index = 1; 3112 pxdlock->index = 1;
3116 3113
3117 /* 3114 /*
3118 * 4. update the parent xad entry for relocation; 3115 * 4. update the parent xad entry for relocation;
3119 * 3116 *
3120 * acquire tlck for the parent entry with XAD_NEW as entry 3117 * acquire tlck for the parent entry with XAD_NEW as entry
3121 * update which will write LOG_REDOPAGE and update bmap for 3118 * update which will write LOG_REDOPAGE and update bmap for
@@ -3143,22 +3140,22 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3143 3140
3144 3141
3145/* 3142/*
3146 * xtSearchNode() 3143 * xtSearchNode()
3147 * 3144 *
3148 * function: search for the internal xad entry covering specified extent. 3145 * function: search for the internal xad entry covering specified extent.
3149 * This function is mainly used by defragfs utility. 3146 * This function is mainly used by defragfs utility.
3150 * 3147 *
3151 * parameters: 3148 * parameters:
3152 * ip - file object; 3149 * ip - file object;
3153 * xad - extent to find; 3150 * xad - extent to find;
3154 * cmpp - comparison result: 3151 * cmpp - comparison result:
3155 * btstack - traverse stack; 3152 * btstack - traverse stack;
3156 * flag - search process flag; 3153 * flag - search process flag;
3157 * 3154 *
3158 * returns: 3155 * returns:
3159 * btstack contains (bn, index) of search path traversed to the entry. 3156 * btstack contains (bn, index) of search path traversed to the entry.
3160 * *cmpp is set to result of comparison with the entry returned. 3157 * *cmpp is set to result of comparison with the entry returned.
3161 * the page containing the entry is pinned at exit. 3158 * the page containing the entry is pinned at exit.
3162 */ 3159 */
3163static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ 3160static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3164 int *cmpp, struct btstack * btstack, int flag) 3161 int *cmpp, struct btstack * btstack, int flag)
@@ -3181,7 +3178,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3181 xaddr = addressXAD(xad); 3178 xaddr = addressXAD(xad);
3182 3179
3183 /* 3180 /*
3184 * search down tree from root: 3181 * search down tree from root:
3185 * 3182 *
3186 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 3183 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
3187 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 3184 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -3217,7 +3214,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3217 XT_CMP(cmp, xoff, &p->xad[index], t64); 3214 XT_CMP(cmp, xoff, &p->xad[index], t64);
3218 if (cmp == 0) { 3215 if (cmp == 0) {
3219 /* 3216 /*
3220 * search hit 3217 * search hit
3221 * 3218 *
3222 * verify for exact match; 3219 * verify for exact match;
3223 */ 3220 */
@@ -3245,7 +3242,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3245 } 3242 }
3246 3243
3247 /* 3244 /*
3248 * search miss - non-leaf page: 3245 * search miss - non-leaf page:
3249 * 3246 *
3250 * base is the smallest index with key (Kj) greater than 3247 * base is the smallest index with key (Kj) greater than
3251 * search key (K) and may be zero or maxentry index. 3248 * search key (K) and may be zero or maxentry index.
@@ -3268,15 +3265,15 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3268 3265
3269 3266
3270/* 3267/*
3271 * xtRelink() 3268 * xtRelink()
3272 * 3269 *
3273 * function: 3270 * function:
3274 * link around a freed page. 3271 * link around a freed page.
3275 * 3272 *
3276 * Parameter: 3273 * Parameter:
3277 * int tid, 3274 * int tid,
3278 * struct inode *ip, 3275 * struct inode *ip,
3279 * xtpage_t *p) 3276 * xtpage_t *p)
3280 * 3277 *
3281 * returns: 3278 * returns:
3282 */ 3279 */
@@ -3338,7 +3335,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
3338 3335
3339 3336
3340/* 3337/*
3341 * xtInitRoot() 3338 * xtInitRoot()
3342 * 3339 *
3343 * initialize file root (inline in inode) 3340 * initialize file root (inline in inode)
3344 */ 3341 */
@@ -3385,42 +3382,42 @@ void xtInitRoot(tid_t tid, struct inode *ip)
3385#define MAX_TRUNCATE_LEAVES 50 3382#define MAX_TRUNCATE_LEAVES 50
3386 3383
3387/* 3384/*
3388 * xtTruncate() 3385 * xtTruncate()
3389 * 3386 *
3390 * function: 3387 * function:
3391 * traverse for truncation logging backward bottom up; 3388 * traverse for truncation logging backward bottom up;
3392 * terminate at the last extent entry at the current subtree 3389 * terminate at the last extent entry at the current subtree
3393 * root page covering new down size. 3390 * root page covering new down size.
3394 * truncation may occur within the last extent entry. 3391 * truncation may occur within the last extent entry.
3395 * 3392 *
3396 * parameter: 3393 * parameter:
3397 * int tid, 3394 * int tid,
3398 * struct inode *ip, 3395 * struct inode *ip,
3399 * s64 newsize, 3396 * s64 newsize,
3400 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} 3397 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
3401 * 3398 *
3402 * return: 3399 * return:
3403 * 3400 *
3404 * note: 3401 * note:
3405 * PWMAP: 3402 * PWMAP:
3406 * 1. truncate (non-COMMIT_NOLINK file) 3403 * 1. truncate (non-COMMIT_NOLINK file)
3407 * by jfs_truncate() or jfs_open(O_TRUNC): 3404 * by jfs_truncate() or jfs_open(O_TRUNC):
3408 * xtree is updated; 3405 * xtree is updated;
3409 * 2. truncate index table of directory when last entry removed 3406 * 2. truncate index table of directory when last entry removed
3410 * map update via tlock at commit time; 3407 * map update via tlock at commit time;
3411 * PMAP: 3408 * PMAP:
3412 * Call xtTruncate_pmap instead 3409 * Call xtTruncate_pmap instead
3413 * WMAP: 3410 * WMAP:
3414 * 1. remove (free zero link count) on last reference release 3411 * 1. remove (free zero link count) on last reference release
3415 * (pmap has been freed at commit zero link count); 3412 * (pmap has been freed at commit zero link count);
3416 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): 3413 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file):
3417 * xtree is updated; 3414 * xtree is updated;
3418 * map update directly at truncation time; 3415 * map update directly at truncation time;
3419 * 3416 *
3420 * if (DELETE) 3417 * if (DELETE)
3421 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); 3418 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
3422 * else if (TRUNCATE) 3419 * else if (TRUNCATE)
3423 * must write LOG_NOREDOPAGE for deleted index page; 3420 * must write LOG_NOREDOPAGE for deleted index page;
3424 * 3421 *
3425 * pages may already have been tlocked by anonymous transactions 3422 * pages may already have been tlocked by anonymous transactions
3426 * during file growth (i.e., write) before truncation; 3423 * during file growth (i.e., write) before truncation;
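
For the common PWMAP case listed above, callers such as jfs_truncate() wrap xtTruncate() in a transaction of its own. A hedged sketch of that wrapping (the COMMIT_TRUNCATE/COMMIT_PWMAP flag names and the txBegin()/txCommit()/txEnd() pairing are taken to be as in jfs_txnmgr.h; the locking real callers hold around this sequence is omitted):

static void truncate_pwmap_sketch(struct inode *ip, loff_t length)
{
	struct inode *iplist[1] = { ip };
	tid_t tid;

	tid = txBegin(ip->i_sb, 0);
	xtTruncate(tid, ip, length, COMMIT_TRUNCATE | COMMIT_PWMAP);
	txCommit(tid, 1, iplist, 0);
	txEnd(tid);
}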
@@ -3493,7 +3490,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3493 * retained in the new sized file. 3490 * retained in the new sized file.
3494 * if type is PMAP, the data and index pages are NOT 3491 * if type is PMAP, the data and index pages are NOT
3495 * freed, and the data and index blocks are NOT freed 3492 * freed, and the data and index blocks are NOT freed
3496 * from working map. 3493 * from working map.
3497 * (this will allow continued access of data/index of 3494 * (this will allow continued access of data/index of
3498 * temporary file (zerolink count file truncated to zero-length)). 3495 * temporary file (zerolink count file truncated to zero-length)).
3499 */ 3496 */
@@ -3542,7 +3539,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3542 goto getChild; 3539 goto getChild;
3543 3540
3544 /* 3541 /*
3545 * leaf page 3542 * leaf page
3546 */ 3543 */
3547 freed = 0; 3544 freed = 0;
3548 3545
@@ -3916,7 +3913,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3916 } 3913 }
3917 3914
3918 /* 3915 /*
3919 * internal page: go down to child page of current entry 3916 * internal page: go down to child page of current entry
3920 */ 3917 */
3921 getChild: 3918 getChild:
3922 /* save current parent entry for the child page */ 3919 /* save current parent entry for the child page */
@@ -3965,7 +3962,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3965 3962
3966 3963
3967/* 3964/*
3968 * xtTruncate_pmap() 3965 * xtTruncate_pmap()
3969 * 3966 *
3970 * function: 3967 * function:
 3971 * Perform truncate to zero length for deleted file, leaving the 3968 * Perform truncate to zero length for deleted file, leaving the
@@ -3974,9 +3971,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3974 * is committed to disk. 3971 * is committed to disk.
3975 * 3972 *
3976 * parameter: 3973 * parameter:
3977 * tid_t tid, 3974 * tid_t tid,
3978 * struct inode *ip, 3975 * struct inode *ip,
3979 * s64 committed_size) 3976 * s64 committed_size)
3980 * 3977 *
3981 * return: new committed size 3978 * return: new committed size
3982 * 3979 *
@@ -4050,7 +4047,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4050 } 4047 }
4051 4048
4052 /* 4049 /*
4053 * leaf page 4050 * leaf page
4054 */ 4051 */
4055 4052
4056 if (++locked_leaves > MAX_TRUNCATE_LEAVES) { 4053 if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
@@ -4062,7 +4059,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4062 xoff = offsetXAD(xad); 4059 xoff = offsetXAD(xad);
4063 xlen = lengthXAD(xad); 4060 xlen = lengthXAD(xad);
4064 XT_PUTPAGE(mp); 4061 XT_PUTPAGE(mp);
4065 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; 4062 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
4066 } 4063 }
4067 tlck = txLock(tid, ip, mp, tlckXTREE); 4064 tlck = txLock(tid, ip, mp, tlckXTREE);
4068 tlck->type = tlckXTREE | tlckFREE; 4065 tlck->type = tlckXTREE | tlckFREE;
@@ -4099,8 +4096,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4099 */ 4096 */
4100 tlck = txLock(tid, ip, mp, tlckXTREE); 4097 tlck = txLock(tid, ip, mp, tlckXTREE);
4101 xtlck = (struct xtlock *) & tlck->lock; 4098 xtlck = (struct xtlock *) & tlck->lock;
4102 xtlck->hwm.offset = 4099 xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
4103 le16_to_cpu(p->header.nextindex) - 1;
4104 tlck->type = tlckXTREE | tlckFREE; 4100 tlck->type = tlckXTREE | tlckFREE;
4105 4101
4106 XT_PUTPAGE(mp); 4102 XT_PUTPAGE(mp);
@@ -4118,7 +4114,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4118 else 4114 else
4119 index--; 4115 index--;
4120 /* 4116 /*
4121 * internal page: go down to child page of current entry 4117 * internal page: go down to child page of current entry
4122 */ 4118 */
4123 getChild: 4119 getChild:
4124 /* save current parent entry for the child page */ 4120 /* save current parent entry for the child page */
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 164f6f2b1019..70815c8a3d6a 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -19,14 +19,14 @@
19#define _H_JFS_XTREE 19#define _H_JFS_XTREE
20 20
21/* 21/*
22 * jfs_xtree.h: extent allocation descriptor B+-tree manager 22 * jfs_xtree.h: extent allocation descriptor B+-tree manager
23 */ 23 */
24 24
25#include "jfs_btree.h" 25#include "jfs_btree.h"
26 26
27 27
28/* 28/*
29 * extent allocation descriptor (xad) 29 * extent allocation descriptor (xad)
30 */ 30 */
31typedef struct xad { 31typedef struct xad {
32 unsigned flag:8; /* 1: flag */ 32 unsigned flag:8; /* 1: flag */
@@ -38,30 +38,30 @@ typedef struct xad {
38 __le32 addr2; /* 4: address in unit of fsblksize */ 38 __le32 addr2; /* 4: address in unit of fsblksize */
39} xad_t; /* (16) */ 39} xad_t; /* (16) */
40 40
41#define MAXXLEN ((1 << 24) - 1) 41#define MAXXLEN ((1 << 24) - 1)
42 42
43#define XTSLOTSIZE 16 43#define XTSLOTSIZE 16
44#define L2XTSLOTSIZE 4 44#define L2XTSLOTSIZE 4
45 45
46/* xad_t field construction */ 46/* xad_t field construction */
47#define XADoffset(xad, offset64)\ 47#define XADoffset(xad, offset64)\
48{\ 48{\
49 (xad)->off1 = ((u64)offset64) >> 32;\ 49 (xad)->off1 = ((u64)offset64) >> 32;\
50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ 50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
51} 51}
52#define XADaddress(xad, address64)\ 52#define XADaddress(xad, address64)\
53{\ 53{\
54 (xad)->addr1 = ((u64)address64) >> 32;\ 54 (xad)->addr1 = ((u64)address64) >> 32;\
55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ 55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
56} 56}
57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) 57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
58 58
59/* xad_t field extraction */ 59/* xad_t field extraction */
60#define offsetXAD(xad)\ 60#define offsetXAD(xad)\
61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) 61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
62#define addressXAD(xad)\ 62#define addressXAD(xad)\
63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) 63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2))
64#define lengthXAD(xad) __le24_to_cpu((xad)->len) 64#define lengthXAD(xad) __le24_to_cpu((xad)->len)
65 65
66/* xad list */ 66/* xad list */
67struct xadlist { 67struct xadlist {
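
Editor's note: the XADoffset()/offsetXAD() macros above simply split a 40-bit file offset into an 8-bit high part (off1) and a little-endian 32-bit low part (off2); XADaddress()/addressXAD() do the same for block addresses. A minimal user-space sketch of that split, with the endianness conversion omitted and all demo_* names invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* Illustration only: mirrors XADoffset()/offsetXAD() without the
 * __cpu_to_le32()/__le32_to_cpu() conversions of the real macros. */
struct demo_xad {
	uint8_t  off1;   /* high 8 bits of the 40-bit offset */
	uint32_t off2;   /* low 32 bits of the 40-bit offset */
};

static void demo_set_offset(struct demo_xad *x, uint64_t off)
{
	x->off1 = (uint8_t)(off >> 32);
	x->off2 = (uint32_t)(off & 0xffffffff);
}

static int64_t demo_get_offset(const struct demo_xad *x)
{
	return ((int64_t)x->off1 << 32) | x->off2;
}

int main(void)
{
	struct demo_xad x;

	demo_set_offset(&x, 0x12abcdef01ULL);	/* a 40-bit block offset */
	printf("off1=%#x off2=%#x back=%#llx\n",
	       x.off1, x.off2, (unsigned long long)demo_get_offset(&x));
	return 0;
}
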
@@ -71,22 +71,22 @@ struct xadlist {
71}; 71};
72 72
73/* xad_t flags */ 73/* xad_t flags */
74#define XAD_NEW 0x01 /* new */ 74#define XAD_NEW 0x01 /* new */
75#define XAD_EXTENDED 0x02 /* extended */ 75#define XAD_EXTENDED 0x02 /* extended */
76#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ 76#define XAD_COMPRESSED 0x04 /* compressed with recorded length */
77#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ 77#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */
78#define XAD_COW 0x10 /* copy-on-write */ 78#define XAD_COW 0x10 /* copy-on-write */
79 79
80 80
81/* possible values for maxentry */ 81/* possible values for maxentry */
82#define XTROOTINITSLOT_DIR 6 82#define XTROOTINITSLOT_DIR 6
83#define XTROOTINITSLOT 10 83#define XTROOTINITSLOT 10
84#define XTROOTMAXSLOT 18 84#define XTROOTMAXSLOT 18
85#define XTPAGEMAXSLOT 256 85#define XTPAGEMAXSLOT 256
86#define XTENTRYSTART 2 86#define XTENTRYSTART 2
87 87
88/* 88/*
89 * xtree page: 89 * xtree page:
90 */ 90 */
91typedef union { 91typedef union {
92 struct xtheader { 92 struct xtheader {
@@ -106,7 +106,7 @@ typedef union {
106} xtpage_t; 106} xtpage_t;
107 107
108/* 108/*
109 * external declaration 109 * external declaration
110 */ 110 */
111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, 111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen,
112 int *pflag, s64 * paddr, int *plen, int flag); 112 int *pflag, s64 * paddr, int *plen, int flag);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 41c204771262..25161c4121e4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -328,7 +328,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
328 * dentry - child directory dentry 328 * dentry - child directory dentry
329 * 329 *
330 * RETURN: -EINVAL - if name is . or .. 330 * RETURN: -EINVAL - if name is . or ..
331 * -EINVAL - if . or .. exist but are invalid. 331 * -EINVAL - if . or .. exist but are invalid.
332 * errors from subroutines 332 * errors from subroutines
333 * 333 *
334 * note: 334 * note:
@@ -517,7 +517,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
517 inode_dec_link_count(ip); 517 inode_dec_link_count(ip);
518 518
519 /* 519 /*
520 * commit zero link count object 520 * commit zero link count object
521 */ 521 */
522 if (ip->i_nlink == 0) { 522 if (ip->i_nlink == 0) {
523 assert(!test_cflag(COMMIT_Nolink, ip)); 523 assert(!test_cflag(COMMIT_Nolink, ip));
@@ -596,7 +596,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
596/* 596/*
597 * NAME: commitZeroLink() 597 * NAME: commitZeroLink()
598 * 598 *
599 * FUNCTION: for non-directory, called by jfs_remove(), 599 * FUNCTION: for non-directory, called by jfs_remove(),
600 * truncate a regular file, directory or symbolic 600 * truncate a regular file, directory or symbolic
601 * link to zero length. return 0 if type is not 601 * link to zero length. return 0 if type is not
602 * one of these. 602 * one of these.
@@ -676,7 +676,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip)
676/* 676/*
677 * NAME: jfs_free_zero_link() 677 * NAME: jfs_free_zero_link()
678 * 678 *
679 * FUNCTION: for non-directory, called by iClose(), 679 * FUNCTION: for non-directory, called by iClose(),
680 * free resources of a file from cache and WORKING map 680 * free resources of a file from cache and WORKING map
681 * for a file previously committed with zero link count 681 * for a file previously committed with zero link count
682 * while associated with a pager object, 682 * while associated with a pager object,
@@ -855,12 +855,12 @@ static int jfs_link(struct dentry *old_dentry,
855 * NAME: jfs_symlink(dip, dentry, name) 855 * NAME: jfs_symlink(dip, dentry, name)
856 * 856 *
857 * FUNCTION: creates a symbolic link to <symlink> by name <name> 857 * FUNCTION: creates a symbolic link to <symlink> by name <name>
858 * in directory <dip> 858 * in directory <dip>
859 * 859 *
860 * PARAMETER: dip - parent directory vnode 860 * PARAMETER: dip - parent directory vnode
861 * dentry - dentry of symbolic link 861 * dentry - dentry of symbolic link
862 * name - the path name of the existing object 862 * name - the path name of the existing object
863 * that will be the source of the link 863 * that will be the source of the link
864 * 864 *
865 * RETURN: errors from subroutines 865 * RETURN: errors from subroutines
866 * 866 *
@@ -1052,9 +1052,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1052 1052
1053 1053
1054/* 1054/*
1055 * NAME: jfs_rename 1055 * NAME: jfs_rename
1056 * 1056 *
1057 * FUNCTION: rename a file or directory 1057 * FUNCTION: rename a file or directory
1058 */ 1058 */
1059static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, 1059static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1060 struct inode *new_dir, struct dentry *new_dentry) 1060 struct inode *new_dir, struct dentry *new_dentry)
@@ -1331,9 +1331,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1331 1331
1332 1332
1333/* 1333/*
1334 * NAME: jfs_mknod 1334 * NAME: jfs_mknod
1335 * 1335 *
1336 * FUNCTION: Create a special file (device) 1336 * FUNCTION: Create a special file (device)
1337 */ 1337 */
1338static int jfs_mknod(struct inode *dir, struct dentry *dentry, 1338static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1339 int mode, dev_t rdev) 1339 int mode, dev_t rdev)
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 79d625f3f733..71984ee95346 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -29,17 +29,17 @@
29#include "jfs_txnmgr.h" 29#include "jfs_txnmgr.h"
30#include "jfs_debug.h" 30#include "jfs_debug.h"
31 31
32#define BITSPERPAGE (PSIZE << 3) 32#define BITSPERPAGE (PSIZE << 3)
33#define L2MEGABYTE 20 33#define L2MEGABYTE 20
34#define MEGABYTE (1 << L2MEGABYTE) 34#define MEGABYTE (1 << L2MEGABYTE)
35#define MEGABYTE32 (MEGABYTE << 5) 35#define MEGABYTE32 (MEGABYTE << 5)
36 36
37/* convert block number to bmap file page number */ 37/* convert block number to bmap file page number */
38#define BLKTODMAPN(b)\ 38#define BLKTODMAPN(b)\
39 (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) 39 (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1)
40 40
41/* 41/*
42 * jfs_extendfs() 42 * jfs_extendfs()
43 * 43 *
44 * function: extend file system; 44 * function: extend file system;
45 * 45 *
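
Editor's note: BLKTODMAPN() above maps an aggregate block number to the page index of its dmap inside the block-allocation-map file; judging from the shift amounts, each dmap page covers 2^13 blocks and the remaining terms account for the map's summary/control pages. A stand-alone check of the arithmetic (function name invented):

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as the BLKTODMAPN() macro in fs/jfs/resize.c. */
static uint64_t blk_to_dmap_page(uint64_t b)
{
	return (b >> 13) + (b >> 23) + (b >> 33) + 3 + 1;
}

int main(void)
{
	/* each dmap covers 2^13 blocks; the "+ 3 + 1" is the fixed
	 * overhead implied by the macro, not recomputed here */
	printf("%llu\n", (unsigned long long)blk_to_dmap_page(0));        /* 4    */
	printf("%llu\n", (unsigned long long)blk_to_dmap_page(1 << 13));  /* 5    */
	printf("%llu\n", (unsigned long long)blk_to_dmap_page(1 << 23));  /* 1029 */
	return 0;
}
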
@@ -48,9 +48,9 @@
48 * workspace space 48 * workspace space
49 * 49 *
50 * input: 50 * input:
51 * new LVSize: in LV blocks (required) 51 * new LVSize: in LV blocks (required)
52 * new LogSize: in LV blocks (optional) 52 * new LogSize: in LV blocks (optional)
53 * new FSSize: in LV blocks (optional) 53 * new FSSize: in LV blocks (optional)
54 * 54 *
55 * new configuration: 55 * new configuration:
56 * 1. set new LogSize as specified or default from new LVSize; 56 * 1. set new LogSize as specified or default from new LVSize;
@@ -125,8 +125,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
125 } 125 }
126 126
127 /* 127 /*
128 * reconfigure LV spaces 128 * reconfigure LV spaces
129 * --------------------- 129 * ---------------------
130 * 130 *
131 * validate new size, or, if not specified, determine new size 131 * validate new size, or, if not specified, determine new size
132 */ 132 */
@@ -198,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
198 log_formatted = 1; 198 log_formatted = 1;
199 } 199 }
200 /* 200 /*
201 * quiesce file system 201 * quiesce file system
202 * 202 *
203 * (prepare to move the inline log and to prevent map update) 203 * (prepare to move the inline log and to prevent map update)
204 * 204 *
@@ -270,8 +270,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
270 } 270 }
271 271
272 /* 272 /*
273 * extend block allocation map 273 * extend block allocation map
274 * --------------------------- 274 * ---------------------------
275 * 275 *
276 * extendfs() for new extension, retry after crash recovery; 276 * extendfs() for new extension, retry after crash recovery;
277 * 277 *
@@ -283,7 +283,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
283 * s_size: aggregate size in physical blocks; 283 * s_size: aggregate size in physical blocks;
284 */ 284 */
285 /* 285 /*
286 * compute the new block allocation map configuration 286 * compute the new block allocation map configuration
287 * 287 *
288 * map dinode: 288 * map dinode:
289 * di_size: map file size in byte; 289 * di_size: map file size in byte;
@@ -301,7 +301,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
301 newNpages = BLKTODMAPN(t64) + 1; 301 newNpages = BLKTODMAPN(t64) + 1;
302 302
303 /* 303 /*
304 * extend map from current map (WITHOUT growing mapfile) 304 * extend map from current map (WITHOUT growing mapfile)
305 * 305 *
306 * map new extension with unmapped part of the last partial 306 * map new extension with unmapped part of the last partial
307 * dmap page, if applicable, and extra page(s) allocated 307 * dmap page, if applicable, and extra page(s) allocated
@@ -341,8 +341,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
341 XSize -= nblocks; 341 XSize -= nblocks;
342 342
343 /* 343 /*
344 * grow map file to cover remaining extension 344 * grow map file to cover remaining extension
345 * and/or one extra dmap page for next extendfs(); 345 * and/or one extra dmap page for next extendfs();
346 * 346 *
347 * allocate new map pages and its backing blocks, and 347 * allocate new map pages and its backing blocks, and
348 * update map file xtree 348 * update map file xtree
@@ -422,8 +422,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
422 dbFinalizeBmap(ipbmap); 422 dbFinalizeBmap(ipbmap);
423 423
424 /* 424 /*
425 * update inode allocation map 425 * update inode allocation map
426 * --------------------------- 426 * ---------------------------
427 * 427 *
428 * move iag lists from old to new iag; 428 * move iag lists from old to new iag;
429 * agstart field is not updated for logredo() to reconstruct 429 * agstart field is not updated for logredo() to reconstruct
@@ -442,8 +442,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
442 } 442 }
443 443
444 /* 444 /*
445 * finalize 445 * finalize
446 * -------- 446 * --------
447 * 447 *
448 * extension is committed when on-disk super block is 448 * extension is committed when on-disk super block is
449 * updated with new descriptors: logredo will recover 449 * updated with new descriptors: logredo will recover
@@ -480,7 +480,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
480 diFreeSpecial(ipbmap2); 480 diFreeSpecial(ipbmap2);
481 481
482 /* 482 /*
483 * update superblock 483 * update superblock
484 */ 484 */
485 if ((rc = readSuper(sb, &bh))) 485 if ((rc = readSuper(sb, &bh)))
486 goto error_out; 486 goto error_out;
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
530 530
531 resume: 531 resume:
532 /* 532 /*
533 * resume file system transactions 533 * resume file system transactions
534 */ 534 */
535 txResume(sb); 535 txResume(sb);
536 536
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index b753ba216450..b2375f0774b7 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -63,9 +63,9 @@
63 * 63 *
64 * On-disk: 64 * On-disk:
65 * 65 *
66 * FEALISTs are stored on disk using blocks allocated by dbAlloc() and 66 * FEALISTs are stored on disk using blocks allocated by dbAlloc() and
67 * written directly. An EA list may be in-lined in the inode if there is 67 * written directly. An EA list may be in-lined in the inode if there is
68 * sufficient room available. 68 * sufficient room available.
69 */ 69 */
70 70
71struct ea_buffer { 71struct ea_buffer {
@@ -590,7 +590,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
590 size_check: 590 size_check:
591 if (EALIST_SIZE(ea_buf->xattr) != ea_size) { 591 if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
592 printk(KERN_ERR "ea_get: invalid extended attribute\n"); 592 printk(KERN_ERR "ea_get: invalid extended attribute\n");
593 dump_mem("xattr", ea_buf->xattr, ea_size); 593 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
594 ea_buf->xattr, ea_size, 1);
594 ea_release(inode, ea_buf); 595 ea_release(inode, ea_buf);
595 rc = -EIO; 596 rc = -EIO;
596 goto clean_up; 597 goto clean_up;
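
Editor's note on the dump_mem() -> print_hex_dump() conversion above: read against the generic hexdump helper's parameter order, the new call's arguments line up as annotated below. The annotation is my reading of the call, not part of the patch:

print_hex_dump(KERN_ERR,            /* log level */
	       "",                  /* prefix string before each line */
	       DUMP_PREFIX_ADDRESS, /* prefix each row with its address */
	       16,                  /* rowsize: 16 bytes per output line */
	       1,                   /* groupsize: dump byte by byte */
	       ea_buf->xattr,       /* buffer to dump */
	       ea_size,             /* number of bytes */
	       1);                  /* also print an ASCII column */
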
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 74f30e0c0381..98e78e2f18d6 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -165,7 +165,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
165 rcu_read_lock(); 165 rcu_read_lock();
166 buffer += sprintf(buffer, 166 buffer += sprintf(buffer,
167 "State:\t%s\n" 167 "State:\t%s\n"
168 "SleepAVG:\t%lu%%\n"
169 "Tgid:\t%d\n" 168 "Tgid:\t%d\n"
170 "Pid:\t%d\n" 169 "Pid:\t%d\n"
171 "PPid:\t%d\n" 170 "PPid:\t%d\n"
@@ -173,7 +172,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
173 "Uid:\t%d\t%d\t%d\t%d\n" 172 "Uid:\t%d\t%d\t%d\t%d\n"
174 "Gid:\t%d\t%d\t%d\t%d\n", 173 "Gid:\t%d\t%d\t%d\t%d\n",
175 get_task_state(p), 174 get_task_state(p),
176 (p->sleep_avg/1024)*100/(1020000000/1024),
177 p->tgid, p->pid, 175 p->tgid, p->pid,
178 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, 176 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
179 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, 177 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
@@ -312,6 +310,41 @@ int proc_pid_status(struct task_struct *task, char * buffer)
312 return buffer - orig; 310 return buffer - orig;
313} 311}
314 312
313static clock_t task_utime(struct task_struct *p)
314{
315 clock_t utime = cputime_to_clock_t(p->utime),
316 total = utime + cputime_to_clock_t(p->stime);
317 u64 temp;
318
319 /*
320 * Use CFS's precise accounting:
321 */
322 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
323
324 if (total) {
325 temp *= utime;
326 do_div(temp, total);
327 }
328 utime = (clock_t)temp;
329
330 return utime;
331}
332
333static clock_t task_stime(struct task_struct *p)
334{
335 clock_t stime = cputime_to_clock_t(p->stime);
336
337 /*
338 * Use CFS's precise accounting. (we subtract utime from
339 * the total, to make sure the total observed by userspace
340 * grows monotonically - apps rely on that):
341 */
342 stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
343
344 return stime;
345}
346
347
315static int do_task_stat(struct task_struct *task, char * buffer, int whole) 348static int do_task_stat(struct task_struct *task, char * buffer, int whole)
316{ 349{
317 unsigned long vsize, eip, esp, wchan = ~0UL; 350 unsigned long vsize, eip, esp, wchan = ~0UL;
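
Editor's note: task_utime()/task_stime() above scale the tick-sampled utime:stime ratio onto the nanosecond-accurate se.sum_exec_runtime, and derive stime by subtraction so the utime+stime total seen through /proc/<pid>/stat stays monotonic. A stand-alone sketch of the same arithmetic, with made-up sample numbers:

#include <stdio.h>
#include <stdint.h>

/* Same split as task_utime()/task_stime(), done in user space:
 * 'total_clk' plays the role of nsec_to_clock_t(se.sum_exec_runtime). */
static void split_runtime(uint64_t total_clk, uint64_t tick_utime,
			  uint64_t tick_stime,
			  uint64_t *utime, uint64_t *stime)
{
	uint64_t ticks = tick_utime + tick_stime;
	uint64_t u = total_clk;

	if (ticks) {
		u *= tick_utime;
		u /= ticks;		/* do_div() in the kernel version */
	}
	*utime = u;
	*stime = total_clk - u;		/* keeps utime + stime == total_clk */
}

int main(void)
{
	uint64_t u, s;

	/* sample numbers: 500 clock ticks of precise runtime, sampled
	 * as 30 user ticks and 10 system ticks */
	split_runtime(500, 30, 10, &u, &s);
	printf("utime=%llu stime=%llu\n",
	       (unsigned long long)u, (unsigned long long)s);	/* 375 125 */
	return 0;
}
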
@@ -326,7 +359,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
326 unsigned long long start_time; 359 unsigned long long start_time;
327 unsigned long cmin_flt = 0, cmaj_flt = 0; 360 unsigned long cmin_flt = 0, cmaj_flt = 0;
328 unsigned long min_flt = 0, maj_flt = 0; 361 unsigned long min_flt = 0, maj_flt = 0;
329 cputime_t cutime, cstime, utime, stime; 362 cputime_t cutime, cstime;
363 clock_t utime, stime;
330 unsigned long rsslim = 0; 364 unsigned long rsslim = 0;
331 char tcomm[sizeof(task->comm)]; 365 char tcomm[sizeof(task->comm)];
332 unsigned long flags; 366 unsigned long flags;
@@ -344,7 +378,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
344 378
345 sigemptyset(&sigign); 379 sigemptyset(&sigign);
346 sigemptyset(&sigcatch); 380 sigemptyset(&sigcatch);
347 cutime = cstime = utime = stime = cputime_zero; 381 cutime = cstime = cputime_zero;
382 utime = stime = 0;
348 383
349 rcu_read_lock(); 384 rcu_read_lock();
350 if (lock_task_sighand(task, &flags)) { 385 if (lock_task_sighand(task, &flags)) {
@@ -370,15 +405,15 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
370 do { 405 do {
371 min_flt += t->min_flt; 406 min_flt += t->min_flt;
372 maj_flt += t->maj_flt; 407 maj_flt += t->maj_flt;
373 utime = cputime_add(utime, t->utime); 408 utime += task_utime(t);
374 stime = cputime_add(stime, t->stime); 409 stime += task_stime(t);
375 t = next_thread(t); 410 t = next_thread(t);
376 } while (t != task); 411 } while (t != task);
377 412
378 min_flt += sig->min_flt; 413 min_flt += sig->min_flt;
379 maj_flt += sig->maj_flt; 414 maj_flt += sig->maj_flt;
380 utime = cputime_add(utime, sig->utime); 415 utime += cputime_to_clock_t(sig->utime);
381 stime = cputime_add(stime, sig->stime); 416 stime += cputime_to_clock_t(sig->stime);
382 } 417 }
383 418
384 sid = signal_session(sig); 419 sid = signal_session(sig);
@@ -394,8 +429,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
394 if (!whole) { 429 if (!whole) {
395 min_flt = task->min_flt; 430 min_flt = task->min_flt;
396 maj_flt = task->maj_flt; 431 maj_flt = task->maj_flt;
397 utime = task->utime; 432 utime = task_utime(task);
398 stime = task->stime; 433 stime = task_stime(task);
399 } 434 }
400 435
401 /* scale priority and nice values from timeslices to -20..20 */ 436 /* scale priority and nice values from timeslices to -20..20 */
@@ -426,8 +461,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
426 cmin_flt, 461 cmin_flt,
427 maj_flt, 462 maj_flt,
428 cmaj_flt, 463 cmaj_flt,
429 cputime_to_clock_t(utime), 464 utime,
430 cputime_to_clock_t(stime), 465 stime,
431 cputime_to_clock_t(cutime), 466 cputime_to_clock_t(cutime),
432 cputime_to_clock_t(cstime), 467 cputime_to_clock_t(cstime),
433 priority, 468 priority,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a5fa1fdafc4e..46ea5d56e1bb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -296,7 +296,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
296 */ 296 */
297static int proc_pid_schedstat(struct task_struct *task, char *buffer) 297static int proc_pid_schedstat(struct task_struct *task, char *buffer)
298{ 298{
299 return sprintf(buffer, "%lu %lu %lu\n", 299 return sprintf(buffer, "%llu %llu %lu\n",
300 task->sched_info.cpu_time, 300 task->sched_info.cpu_time,
301 task->sched_info.run_delay, 301 task->sched_info.run_delay,
302 task->sched_info.pcnt); 302 task->sched_info.pcnt);
@@ -929,6 +929,69 @@ static const struct file_operations proc_fault_inject_operations = {
929}; 929};
930#endif 930#endif
931 931
932#ifdef CONFIG_SCHED_DEBUG
933/*
934 * Print out various scheduling related per-task fields:
935 */
936static int sched_show(struct seq_file *m, void *v)
937{
938 struct inode *inode = m->private;
939 struct task_struct *p;
940
941 WARN_ON(!inode);
942
943 p = get_proc_task(inode);
944 if (!p)
945 return -ESRCH;
946 proc_sched_show_task(p, m);
947
948 put_task_struct(p);
949
950 return 0;
951}
952
953static ssize_t
954sched_write(struct file *file, const char __user *buf,
955 size_t count, loff_t *offset)
956{
957 struct inode *inode = file->f_path.dentry->d_inode;
958 struct task_struct *p;
959
960 WARN_ON(!inode);
961
962 p = get_proc_task(inode);
963 if (!p)
964 return -ESRCH;
965 proc_sched_set_task(p);
966
967 put_task_struct(p);
968
969 return count;
970}
971
972static int sched_open(struct inode *inode, struct file *filp)
973{
974 int ret;
975
976 ret = single_open(filp, sched_show, NULL);
977 if (!ret) {
978 struct seq_file *m = filp->private_data;
979
980 m->private = inode;
981 }
982 return ret;
983}
984
985static const struct file_operations proc_pid_sched_operations = {
986 .open = sched_open,
987 .read = seq_read,
988 .write = sched_write,
989 .llseek = seq_lseek,
990 .release = seq_release,
991};
992
993#endif
994
932static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) 995static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
933{ 996{
934 struct inode *inode = dentry->d_inode; 997 struct inode *inode = dentry->d_inode;
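
Editor's note: the block above wires up a per-task /proc/<pid>/sched file (see the CONFIG_SCHED_DEBUG entries added to tgid_base_stuff and tid_base_stuff further down). Reading it calls proc_sched_show_task(); writing anything to it calls proc_sched_set_task(), which I take to reset the per-task statistics — that interpretation is mine, the body of that helper is not shown here. A minimal user-space peek at the file:

#include <stdio.h>

/* Dump the current task's scheduler statistics (requires a kernel
 * built with CONFIG_SCHED_DEBUG, otherwise the file does not exist). */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f) {
		perror("/proc/self/sched");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
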
@@ -1963,6 +2026,9 @@ static const struct pid_entry tgid_base_stuff[] = {
1963 INF("environ", S_IRUSR, pid_environ), 2026 INF("environ", S_IRUSR, pid_environ),
1964 INF("auxv", S_IRUSR, pid_auxv), 2027 INF("auxv", S_IRUSR, pid_auxv),
1965 INF("status", S_IRUGO, pid_status), 2028 INF("status", S_IRUGO, pid_status),
2029#ifdef CONFIG_SCHED_DEBUG
2030 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
2031#endif
1966 INF("cmdline", S_IRUGO, pid_cmdline), 2032 INF("cmdline", S_IRUGO, pid_cmdline),
1967 INF("stat", S_IRUGO, tgid_stat), 2033 INF("stat", S_IRUGO, tgid_stat),
1968 INF("statm", S_IRUGO, pid_statm), 2034 INF("statm", S_IRUGO, pid_statm),
@@ -2247,6 +2313,9 @@ static const struct pid_entry tid_base_stuff[] = {
2247 INF("environ", S_IRUSR, pid_environ), 2313 INF("environ", S_IRUSR, pid_environ),
2248 INF("auxv", S_IRUSR, pid_auxv), 2314 INF("auxv", S_IRUSR, pid_auxv),
2249 INF("status", S_IRUGO, pid_status), 2315 INF("status", S_IRUGO, pid_status),
2316#ifdef CONFIG_SCHED_DEBUG
2317 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
2318#endif
2250 INF("cmdline", S_IRUGO, pid_cmdline), 2319 INF("cmdline", S_IRUGO, pid_cmdline),
2251 INF("stat", S_IRUGO, tid_stat), 2320 INF("stat", S_IRUGO, tid_stat),
2252 INF("statm", S_IRUGO, pid_statm), 2321 INF("statm", S_IRUGO, pid_statm),
diff --git a/include/asm-generic/bitops/sched.h b/include/asm-generic/bitops/sched.h
index 815bb0148060..604fab7031a6 100644
--- a/include/asm-generic/bitops/sched.h
+++ b/include/asm-generic/bitops/sched.h
@@ -6,28 +6,23 @@
6 6
7/* 7/*
8 * Every architecture must define this function. It's the fastest 8 * Every architecture must define this function. It's the fastest
9 * way of searching a 140-bit bitmap where the first 100 bits are 9 * way of searching a 100-bit bitmap. It's guaranteed that at least
10 * unlikely to be set. It's guaranteed that at least one of the 140 10 * one of the 100 bits is cleared.
11 * bits is cleared.
12 */ 11 */
13static inline int sched_find_first_bit(const unsigned long *b) 12static inline int sched_find_first_bit(const unsigned long *b)
14{ 13{
15#if BITS_PER_LONG == 64 14#if BITS_PER_LONG == 64
16 if (unlikely(b[0])) 15 if (b[0])
17 return __ffs(b[0]); 16 return __ffs(b[0]);
18 if (likely(b[1])) 17 return __ffs(b[1]) + 64;
19 return __ffs(b[1]) + 64;
20 return __ffs(b[2]) + 128;
21#elif BITS_PER_LONG == 32 18#elif BITS_PER_LONG == 32
22 if (unlikely(b[0])) 19 if (b[0])
23 return __ffs(b[0]); 20 return __ffs(b[0]);
24 if (unlikely(b[1])) 21 if (b[1])
25 return __ffs(b[1]) + 32; 22 return __ffs(b[1]) + 32;
26 if (unlikely(b[2])) 23 if (b[2])
27 return __ffs(b[2]) + 64; 24 return __ffs(b[2]) + 64;
28 if (b[3]) 25 return __ffs(b[3]) + 96;
29 return __ffs(b[3]) + 96;
30 return __ffs(b[4]) + 128;
31#else 26#else
32#error BITS_PER_LONG not defined 27#error BITS_PER_LONG not defined
33#endif 28#endif
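
Editor's note: the bitmap this helper scans shrinks from 140 bits to 100, presumably because only the MAX_RT_PRIO (100) realtime levels still use a priority bitmap in this series; it therefore fits in two 64-bit words or four 32-bit words, which is why the extra word and the unlikely() hints disappear above. A user-space sketch of the 64-bit branch, using the GCC builtin in place of __ffs() (assumption: __ffs() returns the 0-based index of the least-significant set bit):

#include <stdio.h>

/* User-space stand-in for the BITS_PER_LONG == 64 branch of
 * sched_find_first_bit(); assumes a 64-bit unsigned long and that
 * the caller guarantees at least one bit is set. */
static int find_first_bit100(const unsigned long b[2])
{
	if (b[0])
		return __builtin_ctzl(b[0]);
	return __builtin_ctzl(b[1]) + 64;
}

int main(void)
{
	unsigned long high_only[2] = { 0, 1UL << 3 };	/* bit 67 set */
	unsigned long low_set[2]   = { 1UL << 5, 0 };	/* bit 5 set  */

	printf("%d %d\n", find_first_bit100(low_set),
	       find_first_bit100(high_only));		/* prints: 5 67 */
	return 0;
}
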
diff --git a/include/asm-mips/mach-au1x00/au1xxx_ide.h b/include/asm-mips/mach-au1x00/au1xxx_ide.h
index 8fcae21adbd5..4663e8b415c9 100644
--- a/include/asm-mips/mach-au1x00/au1xxx_ide.h
+++ b/include/asm-mips/mach-au1x00/au1xxx_ide.h
@@ -88,26 +88,26 @@ static const struct drive_list_entry dma_white_list [] = {
88/* 88/*
89 * Hitachi 89 * Hitachi
90 */ 90 */
91 { "HITACHI_DK14FA-20" , "ALL" }, 91 { "HITACHI_DK14FA-20" , NULL },
92 { "HTS726060M9AT00" , "ALL" }, 92 { "HTS726060M9AT00" , NULL },
93/* 93/*
94 * Maxtor 94 * Maxtor
95 */ 95 */
96 { "Maxtor 6E040L0" , "ALL" }, 96 { "Maxtor 6E040L0" , NULL },
97 { "Maxtor 6Y080P0" , "ALL" }, 97 { "Maxtor 6Y080P0" , NULL },
98 { "Maxtor 6Y160P0" , "ALL" }, 98 { "Maxtor 6Y160P0" , NULL },
99/* 99/*
100 * Seagate 100 * Seagate
101 */ 101 */
102 { "ST3120026A" , "ALL" }, 102 { "ST3120026A" , NULL },
103 { "ST320014A" , "ALL" }, 103 { "ST320014A" , NULL },
104 { "ST94011A" , "ALL" }, 104 { "ST94011A" , NULL },
105 { "ST340016A" , "ALL" }, 105 { "ST340016A" , NULL },
106/* 106/*
107 * Western Digital 107 * Western Digital
108 */ 108 */
109 { "WDC WD400UE-00HCT0" , "ALL" }, 109 { "WDC WD400UE-00HCT0" , NULL },
110 { "WDC WD400JB-00JJC0" , "ALL" }, 110 { "WDC WD400JB-00JJC0" , NULL },
111 { NULL , NULL } 111 { NULL , NULL }
112}; 112};
113 113
@@ -116,9 +116,9 @@ static const struct drive_list_entry dma_black_list [] = {
116/* 116/*
117 * Western Digital 117 * Western Digital
118 */ 118 */
119 { "WDC WD100EB-00CGH0" , "ALL" }, 119 { "WDC WD100EB-00CGH0" , NULL },
120 { "WDC WD200BB-00AUA1" , "ALL" }, 120 { "WDC WD200BB-00AUA1" , NULL },
121 { "WDC AC24300L" , "ALL" }, 121 { "WDC AC24300L" , NULL },
122 { NULL , NULL } 122 { NULL , NULL }
123}; 123};
124#endif 124#endif
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 7803014f3a11..8d302298a161 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -79,6 +79,19 @@
79#endif 79#endif
80 80
81#ifdef CONFIG_PREEMPT 81#ifdef CONFIG_PREEMPT
82# define PREEMPT_CHECK_OFFSET 1
83#else
84# define PREEMPT_CHECK_OFFSET 0
85#endif
86
87/*
88 * Check whether we were atomic before we did preempt_disable():
89 * (used by the scheduler)
90 */
91#define in_atomic_preempt_off() \
92 ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
93
94#ifdef CONFIG_PREEMPT
82# define preemptible() (preempt_count() == 0 && !irqs_disabled()) 95# define preemptible() (preempt_count() == 0 && !irqs_disabled())
83# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) 96# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
84#else 97#else
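
Editor's note: in_atomic_preempt_off() above masks PREEMPT_ACTIVE out of preempt_count() and compares the remainder with the offset the caller is expected to hold — 1 under CONFIG_PREEMPT (the caller has already done its own preempt_disable()) and 0 otherwise. A sketch of the kind of scheduler-side debug check the "(used by the scheduler)" comment refers to; the message and surrounding code are my guess, not quoted from the patch:

/* Sketch only: flag "scheduling while atomic" from inside schedule(),
 * after the scheduler's own preempt_disable() has been accounted for. */
static void sched_debug_check(struct task_struct *prev)
{
	if (unlikely(in_atomic_preempt_off()) && likely(!prev->exit_state)) {
		printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
		       prev->comm, prev->pid, preempt_count());
		dump_stack();
	}
}
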
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 1e365acdd369..19ab25804056 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -25,6 +25,7 @@
25#include <asm/system.h> 25#include <asm/system.h>
26#include <asm/io.h> 26#include <asm/io.h>
27#include <asm/semaphore.h> 27#include <asm/semaphore.h>
28#include <asm/mutex.h>
28 29
29/****************************************************************************** 30/******************************************************************************
30 * IDE driver configuration options (play with these as desired): 31 * IDE driver configuration options (play with these as desired):
@@ -685,6 +686,8 @@ typedef struct hwif_s {
685 u8 mwdma_mask; 686 u8 mwdma_mask;
686 u8 swdma_mask; 687 u8 swdma_mask;
687 688
689 u8 cbl; /* cable type */
690
688 hwif_chipset_t chipset; /* sub-module for tuning.. */ 691 hwif_chipset_t chipset; /* sub-module for tuning.. */
689 692
690 struct pci_dev *pci_dev; /* for pci chipsets */ 693 struct pci_dev *pci_dev; /* for pci chipsets */
@@ -735,8 +738,8 @@ typedef struct hwif_s {
735 void (*ide_dma_clear_irq)(ide_drive_t *drive); 738 void (*ide_dma_clear_irq)(ide_drive_t *drive);
736 void (*dma_host_on)(ide_drive_t *drive); 739 void (*dma_host_on)(ide_drive_t *drive);
737 void (*dma_host_off)(ide_drive_t *drive); 740 void (*dma_host_off)(ide_drive_t *drive);
738 int (*ide_dma_lostirq)(ide_drive_t *drive); 741 void (*dma_lost_irq)(ide_drive_t *drive);
739 int (*ide_dma_timeout)(ide_drive_t *drive); 742 void (*dma_timeout)(ide_drive_t *drive);
740 743
741 void (*OUTB)(u8 addr, unsigned long port); 744 void (*OUTB)(u8 addr, unsigned long port);
742 void (*OUTBSYNC)(ide_drive_t *drive, u8 addr, unsigned long port); 745 void (*OUTBSYNC)(ide_drive_t *drive, u8 addr, unsigned long port);
@@ -791,7 +794,6 @@ typedef struct hwif_s {
791 unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ 794 unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */
792 unsigned reset : 1; /* reset after probe */ 795 unsigned reset : 1; /* reset after probe */
793 unsigned autodma : 1; /* auto-attempt using DMA at boot */ 796 unsigned autodma : 1; /* auto-attempt using DMA at boot */
794 unsigned udma_four : 1; /* 1=ATA-66 capable, 0=default */
795 unsigned no_lba48 : 1; /* 1 = cannot do LBA48 */ 797 unsigned no_lba48 : 1; /* 1 = cannot do LBA48 */
796 unsigned no_lba48_dma : 1; /* 1 = cannot do LBA48 DMA */ 798 unsigned no_lba48_dma : 1; /* 1 = cannot do LBA48 DMA */
797 unsigned auto_poll : 1; /* supports nop auto-poll */ 799 unsigned auto_poll : 1; /* supports nop auto-poll */
@@ -863,7 +865,7 @@ typedef struct hwgroup_s {
863 865
864typedef struct ide_driver_s ide_driver_t; 866typedef struct ide_driver_s ide_driver_t;
865 867
866extern struct semaphore ide_setting_sem; 868extern struct mutex ide_setting_mtx;
867 869
868int set_io_32bit(ide_drive_t *, int); 870int set_io_32bit(ide_drive_t *, int);
869int set_pio_mode(ide_drive_t *, int); 871int set_pio_mode(ide_drive_t *, int);
@@ -1304,8 +1306,8 @@ extern int __ide_dma_check(ide_drive_t *);
1304extern int ide_dma_setup(ide_drive_t *); 1306extern int ide_dma_setup(ide_drive_t *);
1305extern void ide_dma_start(ide_drive_t *); 1307extern void ide_dma_start(ide_drive_t *);
1306extern int __ide_dma_end(ide_drive_t *); 1308extern int __ide_dma_end(ide_drive_t *);
1307extern int __ide_dma_lostirq(ide_drive_t *); 1309extern void ide_dma_lost_irq(ide_drive_t *);
1308extern int __ide_dma_timeout(ide_drive_t *); 1310extern void ide_dma_timeout(ide_drive_t *);
1309#endif /* CONFIG_BLK_DEV_IDEDMA_PCI */ 1311#endif /* CONFIG_BLK_DEV_IDEDMA_PCI */
1310 1312
1311#else 1313#else
@@ -1382,11 +1384,11 @@ extern const ide_pio_timings_t ide_pio_timings[6];
1382 1384
1383 1385
1384extern spinlock_t ide_lock; 1386extern spinlock_t ide_lock;
1385extern struct semaphore ide_cfg_sem; 1387extern struct mutex ide_cfg_mtx;
1386/* 1388/*
1387 * Structure locking: 1389 * Structure locking:
1388 * 1390 *
1389 * ide_cfg_sem and ide_lock together protect changes to 1391 * ide_cfg_mtx and ide_lock together protect changes to
1390 * ide_hwif_t->{next,hwgroup} 1392 * ide_hwif_t->{next,hwgroup}
1391 * ide_drive_t->next 1393 * ide_drive_t->next
1392 * 1394 *
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 693f0e6c54d4..cfb680585ab8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,8 @@
34#define SCHED_FIFO 1 34#define SCHED_FIFO 1
35#define SCHED_RR 2 35#define SCHED_RR 2
36#define SCHED_BATCH 3 36#define SCHED_BATCH 3
37/* SCHED_ISO: reserved but not implemented yet */
38#define SCHED_IDLE 5
37 39
38#ifdef __KERNEL__ 40#ifdef __KERNEL__
39 41
@@ -130,6 +132,26 @@ extern unsigned long nr_active(void);
130extern unsigned long nr_iowait(void); 132extern unsigned long nr_iowait(void);
131extern unsigned long weighted_cpuload(const int cpu); 133extern unsigned long weighted_cpuload(const int cpu);
132 134
135struct seq_file;
136struct cfs_rq;
137#ifdef CONFIG_SCHED_DEBUG
138extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
139extern void proc_sched_set_task(struct task_struct *p);
140extern void
141print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
142#else
143static inline void
144proc_sched_show_task(struct task_struct *p, struct seq_file *m)
145{
146}
147static inline void proc_sched_set_task(struct task_struct *p)
148{
149}
150static inline void
151print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
152{
153}
154#endif
133 155
134/* 156/*
135 * Task state bitmask. NOTE! These bits are also 157 * Task state bitmask. NOTE! These bits are also
@@ -193,6 +215,7 @@ struct task_struct;
193extern void sched_init(void); 215extern void sched_init(void);
194extern void sched_init_smp(void); 216extern void sched_init_smp(void);
195extern void init_idle(struct task_struct *idle, int cpu); 217extern void init_idle(struct task_struct *idle, int cpu);
218extern void init_idle_bootup_task(struct task_struct *idle);
196 219
197extern cpumask_t nohz_cpu_mask; 220extern cpumask_t nohz_cpu_mask;
198#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 221#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
@@ -479,7 +502,7 @@ struct signal_struct {
479 * from jiffies_to_ns(utime + stime) if sched_clock uses something 502 * from jiffies_to_ns(utime + stime) if sched_clock uses something
480 * other than jiffies.) 503 * other than jiffies.)
481 */ 504 */
482 unsigned long long sched_time; 505 unsigned long long sum_sched_runtime;
483 506
484 /* 507 /*
485 * We don't bother to synchronize most readers of this at all, 508 * We don't bother to synchronize most readers of this at all,
@@ -521,31 +544,6 @@ struct signal_struct {
521#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ 544#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
522#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ 545#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
523 546
524
525/*
526 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
527 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
528 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
529 * values are inverted: lower p->prio value means higher priority.
530 *
531 * The MAX_USER_RT_PRIO value allows the actual maximum
532 * RT priority to be separate from the value exported to
533 * user-space. This allows kernel threads to set their
534 * priority to a value higher than any user task. Note:
535 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
536 */
537
538#define MAX_USER_RT_PRIO 100
539#define MAX_RT_PRIO MAX_USER_RT_PRIO
540
541#define MAX_PRIO (MAX_RT_PRIO + 40)
542
543#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
544#define rt_task(p) rt_prio((p)->prio)
545#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
546#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
547#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
548
549/* 547/*
550 * Some day this will be a full-fledged user tracking system.. 548 * Some day this will be a full-fledged user tracking system..
551 */ 549 */
@@ -583,13 +581,13 @@ struct reclaim_state;
583#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 581#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
584struct sched_info { 582struct sched_info {
585 /* cumulative counters */ 583 /* cumulative counters */
586 unsigned long cpu_time, /* time spent on the cpu */ 584 unsigned long pcnt; /* # of times run on this cpu */
587 run_delay, /* time spent waiting on a runqueue */ 585 unsigned long long cpu_time, /* time spent on the cpu */
588 pcnt; /* # of timeslices run on this cpu */ 586 run_delay; /* time spent waiting on a runqueue */
589 587
590 /* timestamps */ 588 /* timestamps */
591 unsigned long last_arrival, /* when we last ran on a cpu */ 589 unsigned long long last_arrival,/* when we last ran on a cpu */
592 last_queued; /* when we were last queued to run */ 590 last_queued; /* when we were last queued to run */
593}; 591};
594#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 592#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
595 593
@@ -639,18 +637,24 @@ static inline int sched_info_on(void)
639#endif 637#endif
640} 638}
641 639
642enum idle_type 640enum cpu_idle_type {
643{ 641 CPU_IDLE,
644 SCHED_IDLE, 642 CPU_NOT_IDLE,
645 NOT_IDLE, 643 CPU_NEWLY_IDLE,
646 NEWLY_IDLE, 644 CPU_MAX_IDLE_TYPES
647 MAX_IDLE_TYPES
648}; 645};
649 646
650/* 647/*
651 * sched-domains (multiprocessor balancing) declarations: 648 * sched-domains (multiprocessor balancing) declarations:
652 */ 649 */
653#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ 650
651/*
652 * Increase resolution of nice-level calculations:
653 */
654#define SCHED_LOAD_SHIFT 10
655#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
656
657#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 5)
654 658
655#ifdef CONFIG_SMP 659#ifdef CONFIG_SMP
656#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ 660#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
@@ -719,14 +723,14 @@ struct sched_domain {
719 723
720#ifdef CONFIG_SCHEDSTATS 724#ifdef CONFIG_SCHEDSTATS
721 /* load_balance() stats */ 725 /* load_balance() stats */
722 unsigned long lb_cnt[MAX_IDLE_TYPES]; 726 unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
723 unsigned long lb_failed[MAX_IDLE_TYPES]; 727 unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
724 unsigned long lb_balanced[MAX_IDLE_TYPES]; 728 unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
725 unsigned long lb_imbalance[MAX_IDLE_TYPES]; 729 unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
726 unsigned long lb_gained[MAX_IDLE_TYPES]; 730 unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
727 unsigned long lb_hot_gained[MAX_IDLE_TYPES]; 731 unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
728 unsigned long lb_nobusyg[MAX_IDLE_TYPES]; 732 unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
729 unsigned long lb_nobusyq[MAX_IDLE_TYPES]; 733 unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
730 734
731 /* Active load balancing */ 735 /* Active load balancing */
732 unsigned long alb_cnt; 736 unsigned long alb_cnt;
@@ -753,12 +757,6 @@ struct sched_domain {
753extern int partition_sched_domains(cpumask_t *partition1, 757extern int partition_sched_domains(cpumask_t *partition1,
754 cpumask_t *partition2); 758 cpumask_t *partition2);
755 759
756/*
757 * Maximum cache size the migration-costs auto-tuning code will
758 * search from:
759 */
760extern unsigned int max_cache_size;
761
762#endif /* CONFIG_SMP */ 760#endif /* CONFIG_SMP */
763 761
764 762
@@ -809,14 +807,86 @@ struct mempolicy;
809struct pipe_inode_info; 807struct pipe_inode_info;
810struct uts_namespace; 808struct uts_namespace;
811 809
812enum sleep_type { 810struct rq;
813 SLEEP_NORMAL, 811struct sched_domain;
814 SLEEP_NONINTERACTIVE, 812
815 SLEEP_INTERACTIVE, 813struct sched_class {
816 SLEEP_INTERRUPTED, 814 struct sched_class *next;
815
816 void (*enqueue_task) (struct rq *rq, struct task_struct *p,
817 int wakeup, u64 now);
818 void (*dequeue_task) (struct rq *rq, struct task_struct *p,
819 int sleep, u64 now);
820 void (*yield_task) (struct rq *rq, struct task_struct *p);
821
822 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
823
824 struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
825 void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
826
827 int (*load_balance) (struct rq *this_rq, int this_cpu,
828 struct rq *busiest,
829 unsigned long max_nr_move, unsigned long max_load_move,
830 struct sched_domain *sd, enum cpu_idle_type idle,
831 int *all_pinned, unsigned long *total_load_moved);
832
833 void (*set_curr_task) (struct rq *rq);
834 void (*task_tick) (struct rq *rq, struct task_struct *p);
835 void (*task_new) (struct rq *rq, struct task_struct *p);
817}; 836};
818 837
819struct prio_array; 838struct load_weight {
839 unsigned long weight, inv_weight;
840};
841
842/*
843 * CFS stats for a schedulable entity (task, task-group etc)
844 *
845 * Current field usage histogram:
846 *
847 * 4 se->block_start
848 * 4 se->run_node
849 * 4 se->sleep_start
850 * 4 se->sleep_start_fair
851 * 6 se->load.weight
852 * 7 se->delta_fair
853 * 15 se->wait_runtime
854 */
855struct sched_entity {
856 long wait_runtime;
857 unsigned long delta_fair_run;
858 unsigned long delta_fair_sleep;
859 unsigned long delta_exec;
860 s64 fair_key;
861 struct load_weight load; /* for load-balancing */
862 struct rb_node run_node;
863 unsigned int on_rq;
864
865 u64 wait_start_fair;
866 u64 wait_start;
867 u64 exec_start;
868 u64 sleep_start;
869 u64 sleep_start_fair;
870 u64 block_start;
871 u64 sleep_max;
872 u64 block_max;
873 u64 exec_max;
874 u64 wait_max;
875 u64 last_ran;
876
877 u64 sum_exec_runtime;
878 s64 sum_wait_runtime;
879 s64 sum_sleep_runtime;
880 unsigned long wait_runtime_overruns;
881 unsigned long wait_runtime_underruns;
882#ifdef CONFIG_FAIR_GROUP_SCHED
883 struct sched_entity *parent;
884 /* rq on which this entity is (to be) queued: */
885 struct cfs_rq *cfs_rq;
886 /* rq "owned" by this entity/group: */
887 struct cfs_rq *my_q;
888#endif
889};
820 890
821struct task_struct { 891struct task_struct {
822 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 892 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
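
Editor's note: struct sched_class above is the new per-policy hook table (in this series the fair, realtime and idle policies each supply one), and struct sched_entity is the per-task accounting that CFS keys its rbtree on. To make the shape concrete, here is a purely hypothetical class instance wired up with stub callbacks — every demo_* name is invented for illustration and the table is deliberately incomplete:

/* Hypothetical, incomplete sketch of how a policy fills in sched_class;
 * only the signatures are taken from the declaration above. */
static void demo_enqueue_task(struct rq *rq, struct task_struct *p,
			      int wakeup, u64 now)
{
	/* put p on this policy's runqueue, e.g. insert into an rbtree */
}

static void demo_dequeue_task(struct rq *rq, struct task_struct *p,
			      int sleep, u64 now)
{
	/* remove p from this policy's runqueue */
}

static struct task_struct *demo_pick_next_task(struct rq *rq, u64 now)
{
	return NULL;	/* nothing runnable in this demo policy */
}

static struct sched_class demo_sched_class = {
	.enqueue_task	= demo_enqueue_task,
	.dequeue_task	= demo_dequeue_task,
	.pick_next_task	= demo_pick_next_task,
	/* yield_task, check_preempt_curr, put_prev_task, load_balance,
	 * set_curr_task, task_tick and task_new omitted for brevity */
};
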
@@ -832,23 +902,20 @@ struct task_struct {
832 int oncpu; 902 int oncpu;
833#endif 903#endif
834#endif 904#endif
835 int load_weight; /* for niceness load balancing purposes */ 905
836 int prio, static_prio, normal_prio; 906 int prio, static_prio, normal_prio;
837 struct list_head run_list; 907 struct list_head run_list;
838 struct prio_array *array; 908 struct sched_class *sched_class;
909 struct sched_entity se;
839 910
840 unsigned short ioprio; 911 unsigned short ioprio;
841#ifdef CONFIG_BLK_DEV_IO_TRACE 912#ifdef CONFIG_BLK_DEV_IO_TRACE
842 unsigned int btrace_seq; 913 unsigned int btrace_seq;
843#endif 914#endif
844 unsigned long sleep_avg;
845 unsigned long long timestamp, last_ran;
846 unsigned long long sched_time; /* sched_clock time spent running */
847 enum sleep_type sleep_type;
848 915
849 unsigned int policy; 916 unsigned int policy;
850 cpumask_t cpus_allowed; 917 cpumask_t cpus_allowed;
851 unsigned int time_slice, first_time_slice; 918 unsigned int time_slice;
852 919
853#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 920#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
854 struct sched_info sched_info; 921 struct sched_info sched_info;
@@ -1078,6 +1145,37 @@ struct task_struct {
1078#endif 1145#endif
1079}; 1146};
1080 1147
1148/*
1149 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
1150 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
1151 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
1152 * values are inverted: lower p->prio value means higher priority.
1153 *
1154 * The MAX_USER_RT_PRIO value allows the actual maximum
1155 * RT priority to be separate from the value exported to
1156 * user-space. This allows kernel threads to set their
1157 * priority to a value higher than any user task. Note:
1158 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
1159 */
1160
1161#define MAX_USER_RT_PRIO 100
1162#define MAX_RT_PRIO MAX_USER_RT_PRIO
1163
1164#define MAX_PRIO (MAX_RT_PRIO + 40)
1165#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
1166
1167static inline int rt_prio(int prio)
1168{
1169 if (unlikely(prio < MAX_RT_PRIO))
1170 return 1;
1171 return 0;
1172}
1173
1174static inline int rt_task(struct task_struct *p)
1175{
1176 return rt_prio(p->prio);
1177}
1178
1081static inline pid_t process_group(struct task_struct *tsk) 1179static inline pid_t process_group(struct task_struct *tsk)
1082{ 1180{
1083 return tsk->signal->pgrp; 1181 return tsk->signal->pgrp;
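
Editor's note: the relocated priority block above also turns rt_prio()/rt_task() from macros into inline functions; per its comment, priorities 0..99 are realtime and 100..139 are normal, with DEFAULT_PRIO at 120. A trivial stand-alone check of those boundaries (constants copied from the hunk):

#include <stdio.h>

#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO	 MAX_USER_RT_PRIO
#define MAX_PRIO	 (MAX_RT_PRIO + 40)
#define DEFAULT_PRIO	 (MAX_RT_PRIO + 20)

/* same test as the new inline rt_prio() */
static int rt_prio(int prio)
{
	return prio < MAX_RT_PRIO;
}

int main(void)
{
	printf("prio 0:   rt=%d\n", rt_prio(0));		/* 1 */
	printf("prio 99:  rt=%d\n", rt_prio(99));		/* 1 */
	printf("prio %d: rt=%d (default)\n",
	       DEFAULT_PRIO, rt_prio(DEFAULT_PRIO));		/* 120, 0 */
	printf("last valid prio: %d\n", MAX_PRIO - 1);		/* 139 */
	return 0;
}
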
@@ -1223,7 +1321,7 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1223 1321
1224extern unsigned long long sched_clock(void); 1322extern unsigned long long sched_clock(void);
1225extern unsigned long long 1323extern unsigned long long
1226current_sched_time(const struct task_struct *current_task); 1324task_sched_runtime(struct task_struct *task);
1227 1325
1228/* sched_exec is called by processes performing an exec */ 1326/* sched_exec is called by processes performing an exec */
1229#ifdef CONFIG_SMP 1327#ifdef CONFIG_SMP
@@ -1232,6 +1330,8 @@ extern void sched_exec(void);
1232#define sched_exec() {} 1330#define sched_exec() {}
1233#endif 1331#endif
1234 1332
1333extern void sched_clock_unstable_event(void);
1334
1235#ifdef CONFIG_HOTPLUG_CPU 1335#ifdef CONFIG_HOTPLUG_CPU
1236extern void idle_task_exit(void); 1336extern void idle_task_exit(void);
1237#else 1337#else
@@ -1240,6 +1340,14 @@ static inline void idle_task_exit(void) {}
1240 1340
1241extern void sched_idle_next(void); 1341extern void sched_idle_next(void);
1242 1342
1343extern unsigned int sysctl_sched_granularity;
1344extern unsigned int sysctl_sched_wakeup_granularity;
1345extern unsigned int sysctl_sched_batch_wakeup_granularity;
1346extern unsigned int sysctl_sched_stat_granularity;
1347extern unsigned int sysctl_sched_runtime_limit;
1348extern unsigned int sysctl_sched_child_runs_first;
1349extern unsigned int sysctl_sched_features;
1350
1243#ifdef CONFIG_RT_MUTEXES 1351#ifdef CONFIG_RT_MUTEXES
1244extern int rt_mutex_getprio(struct task_struct *p); 1352extern int rt_mutex_getprio(struct task_struct *p);
1245extern void rt_mutex_setprio(struct task_struct *p, int prio); 1353extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1317,8 +1425,8 @@ extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
1317#else 1425#else
1318 static inline void kick_process(struct task_struct *tsk) { } 1426 static inline void kick_process(struct task_struct *tsk) { }
1319#endif 1427#endif
1320extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags)); 1428extern void sched_fork(struct task_struct *p, int clone_flags);
1321extern void FASTCALL(sched_exit(struct task_struct * p)); 1429extern void sched_dead(struct task_struct *p);
1322 1430
1323extern int in_group_p(gid_t); 1431extern int in_group_p(gid_t);
1324extern int in_egroup_p(gid_t); 1432extern int in_egroup_p(gid_t);
@@ -1406,7 +1514,7 @@ extern struct mm_struct * mm_alloc(void);
1406extern void FASTCALL(__mmdrop(struct mm_struct *)); 1514extern void FASTCALL(__mmdrop(struct mm_struct *));
1407static inline void mmdrop(struct mm_struct * mm) 1515static inline void mmdrop(struct mm_struct * mm)
1408{ 1516{
1409 if (atomic_dec_and_test(&mm->mm_count)) 1517 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
1410 __mmdrop(mm); 1518 __mmdrop(mm);
1411} 1519}
1412 1520
@@ -1638,10 +1746,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
1638 return task_thread_info(p)->cpu; 1746 return task_thread_info(p)->cpu;
1639} 1747}
1640 1748
1641static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 1749extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
1642{
1643 task_thread_info(p)->cpu = cpu;
1644}
1645 1750
1646#else 1751#else
1647 1752
diff --git a/include/linux/topology.h b/include/linux/topology.h
index a9d1f049cc15..da6c39b2d051 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -98,7 +98,7 @@
98 .cache_nice_tries = 0, \ 98 .cache_nice_tries = 0, \
99 .busy_idx = 0, \ 99 .busy_idx = 0, \
100 .idle_idx = 0, \ 100 .idle_idx = 0, \
101 .newidle_idx = 1, \ 101 .newidle_idx = 0, \
102 .wake_idx = 0, \ 102 .wake_idx = 0, \
103 .forkexec_idx = 0, \ 103 .forkexec_idx = 0, \
104 .flags = SD_LOAD_BALANCE \ 104 .flags = SD_LOAD_BALANCE \
@@ -128,14 +128,15 @@
128 .imbalance_pct = 125, \ 128 .imbalance_pct = 125, \
129 .cache_nice_tries = 1, \ 129 .cache_nice_tries = 1, \
130 .busy_idx = 2, \ 130 .busy_idx = 2, \
131 .idle_idx = 1, \ 131 .idle_idx = 0, \
132 .newidle_idx = 2, \ 132 .newidle_idx = 0, \
133 .wake_idx = 1, \ 133 .wake_idx = 1, \
134 .forkexec_idx = 1, \ 134 .forkexec_idx = 1, \
135 .flags = SD_LOAD_BALANCE \ 135 .flags = SD_LOAD_BALANCE \
136 | SD_BALANCE_NEWIDLE \ 136 | SD_BALANCE_NEWIDLE \
137 | SD_BALANCE_EXEC \ 137 | SD_BALANCE_EXEC \
138 | SD_WAKE_AFFINE \ 138 | SD_WAKE_AFFINE \
139 | SD_WAKE_IDLE \
139 | SD_SHARE_PKG_RESOURCES\ 140 | SD_SHARE_PKG_RESOURCES\
140 | BALANCE_FOR_MC_POWER, \ 141 | BALANCE_FOR_MC_POWER, \
141 .last_balance = jiffies, \ 142 .last_balance = jiffies, \
@@ -158,14 +159,15 @@
158 .imbalance_pct = 125, \ 159 .imbalance_pct = 125, \
159 .cache_nice_tries = 1, \ 160 .cache_nice_tries = 1, \
160 .busy_idx = 2, \ 161 .busy_idx = 2, \
161 .idle_idx = 1, \ 162 .idle_idx = 0, \
162 .newidle_idx = 2, \ 163 .newidle_idx = 0, \
163 .wake_idx = 1, \ 164 .wake_idx = 1, \
164 .forkexec_idx = 1, \ 165 .forkexec_idx = 1, \
165 .flags = SD_LOAD_BALANCE \ 166 .flags = SD_LOAD_BALANCE \
166 | SD_BALANCE_NEWIDLE \ 167 | SD_BALANCE_NEWIDLE \
167 | SD_BALANCE_EXEC \ 168 | SD_BALANCE_EXEC \
168 | SD_WAKE_AFFINE \ 169 | SD_WAKE_AFFINE \
170 | SD_WAKE_IDLE \
169 | BALANCE_FOR_PKG_POWER,\ 171 | BALANCE_FOR_PKG_POWER,\
170 .last_balance = jiffies, \ 172 .last_balance = jiffies, \
171 .balance_interval = 1, \ 173 .balance_interval = 1, \
diff --git a/include/linux/wait.h b/include/linux/wait.h
index e820d00e1383..0e686280450b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -366,15 +366,15 @@ static inline void remove_wait_queue_locked(wait_queue_head_t *q,
366 366
367/* 367/*
368 * These are the old interfaces to sleep waiting for an event. 368 * These are the old interfaces to sleep waiting for an event.
369 * They are racy. DO NOT use them, use the wait_event* interfaces above. 369 * They are racy. DO NOT use them, use the wait_event* interfaces above.
370 * We plan to remove these interfaces during 2.7. 370 * We plan to remove these interfaces.
371 */ 371 */
372extern void FASTCALL(sleep_on(wait_queue_head_t *q)); 372extern void sleep_on(wait_queue_head_t *q);
373extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, 373extern long sleep_on_timeout(wait_queue_head_t *q,
374 signed long timeout)); 374 signed long timeout);
375extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); 375extern void interruptible_sleep_on(wait_queue_head_t *q);
376extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, 376extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
377 signed long timeout)); 377 signed long timeout);
378 378
379/* 379/*
380 * Waitqueues which are removed from the waitqueue_head at wakeup time 380 * Waitqueues which are removed from the waitqueue_head at wakeup time
diff --git a/init/main.c b/init/main.c
index eb8bdbae4fc7..0eb1c7463fe4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -436,15 +436,16 @@ static void noinline __init_refok rest_init(void)
436 436
437 /* 437 /*
438 * The boot idle thread must execute schedule() 438 * The boot idle thread must execute schedule()
439 * at least one to get things moving: 439 * at least once to get things moving:
440 */ 440 */
441 init_idle_bootup_task(current);
441 preempt_enable_no_resched(); 442 preempt_enable_no_resched();
442 schedule(); 443 schedule();
443 preempt_disable(); 444 preempt_disable();
444 445
445 /* Call into cpu_idle with preempt disabled */ 446 /* Call into cpu_idle with preempt disabled */
446 cpu_idle(); 447 cpu_idle();
447} 448}
448 449
449/* Check for early params. */ 450/* Check for early params. */
450static int __init do_early_param(char *param, char *val) 451static int __init do_early_param(char *param, char *val)
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c0148ae992c4..81e697829633 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -99,9 +99,10 @@ void __delayacct_blkio_end(void)
99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
100{ 100{
101 s64 tmp; 101 s64 tmp;
102 struct timespec ts; 102 unsigned long t1;
103 unsigned long t1,t2,t3; 103 unsigned long long t2, t3;
104 unsigned long flags; 104 unsigned long flags;
105 struct timespec ts;
105 106
106 /* Though tsk->delays accessed later, early exit avoids 107 /* Though tsk->delays accessed later, early exit avoids
107 * unnecessary returning of other data 108 * unnecessary returning of other data
@@ -124,11 +125,10 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
124 125
125 d->cpu_count += t1; 126 d->cpu_count += t1;
126 127
127 jiffies_to_timespec(t2, &ts); 128 tmp = (s64)d->cpu_delay_total + t2;
128 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; 129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
130 130
131 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; 131 tmp = (s64)d->cpu_run_virtual_total + t3;
132 d->cpu_run_virtual_total = 132 d->cpu_run_virtual_total =
133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; 133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
134 134
diff --git a/kernel/exit.c b/kernel/exit.c
index 5c8ecbaa19a5..ca6a11b73023 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->maj_flt += tsk->maj_flt; 122 sig->maj_flt += tsk->maj_flt;
123 sig->nvcsw += tsk->nvcsw; 123 sig->nvcsw += tsk->nvcsw;
124 sig->nivcsw += tsk->nivcsw; 124 sig->nivcsw += tsk->nivcsw;
125 sig->sched_time += tsk->sched_time;
126 sig->inblock += task_io_get_inblock(tsk); 125 sig->inblock += task_io_get_inblock(tsk);
127 sig->oublock += task_io_get_oublock(tsk); 126 sig->oublock += task_io_get_oublock(tsk);
127 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
128 sig = NULL; /* Marker for below. */ 128 sig = NULL; /* Marker for below. */
129 } 129 }
130 130
@@ -182,7 +182,6 @@ repeat:
182 zap_leader = (leader->exit_signal == -1); 182 zap_leader = (leader->exit_signal == -1);
183 } 183 }
184 184
185 sched_exit(p);
186 write_unlock_irq(&tasklist_lock); 185 write_unlock_irq(&tasklist_lock);
187 proc_flush_task(p); 186 proc_flush_task(p);
188 release_thread(p); 187 release_thread(p);
@@ -291,7 +290,7 @@ static void reparent_to_kthreadd(void)
291 /* Set the exit signal to SIGCHLD so we signal init on exit */ 290 /* Set the exit signal to SIGCHLD so we signal init on exit */
292 current->exit_signal = SIGCHLD; 291 current->exit_signal = SIGCHLD;
293 292
294 if (!has_rt_policy(current) && (task_nice(current) < 0)) 293 if (task_nice(current) < 0)
295 set_user_nice(current, 0); 294 set_user_nice(current, 0);
296 /* cpus_allowed? */ 295 /* cpus_allowed? */
297 /* rt_priority? */ 296 /* rt_priority? */
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cda1bcd..da3a155bba0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,7 +877,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
877 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 877 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
878 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 878 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
879 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 879 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
880 sig->sched_time = 0; 880 sig->sum_sched_runtime = 0;
881 INIT_LIST_HEAD(&sig->cpu_timers[0]); 881 INIT_LIST_HEAD(&sig->cpu_timers[0]);
882 INIT_LIST_HEAD(&sig->cpu_timers[1]); 882 INIT_LIST_HEAD(&sig->cpu_timers[1]);
883 INIT_LIST_HEAD(&sig->cpu_timers[2]); 883 INIT_LIST_HEAD(&sig->cpu_timers[2]);
@@ -1040,7 +1040,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1040 1040
1041 p->utime = cputime_zero; 1041 p->utime = cputime_zero;
1042 p->stime = cputime_zero; 1042 p->stime = cputime_zero;
1043 p->sched_time = 0; 1043
1044#ifdef CONFIG_TASK_XACCT 1044#ifdef CONFIG_TASK_XACCT
1045 p->rchar = 0; /* I/O counter: bytes read */ 1045 p->rchar = 0; /* I/O counter: bytes read */
1046 p->wchar = 0; /* I/O counter: bytes written */ 1046 p->wchar = 0; /* I/O counter: bytes written */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1de710e18373..b53c8fcd9d82 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p)
161} 161}
162static inline unsigned long long sched_ns(struct task_struct *p) 162static inline unsigned long long sched_ns(struct task_struct *p)
163{ 163{
164 return (p == current) ? current_sched_time(p) : p->sched_time; 164 return task_sched_runtime(p);
165} 165}
166 166
167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
246 } while (t != p); 246 } while (t != p);
247 break; 247 break;
248 case CPUCLOCK_SCHED: 248 case CPUCLOCK_SCHED:
249 cpu->sched = p->signal->sched_time; 249 cpu->sched = p->signal->sum_sched_runtime;
250 /* Add in each other live thread. */ 250 /* Add in each other live thread. */
251 while ((t = next_thread(t)) != p) { 251 while ((t = next_thread(t)) != p) {
252 cpu->sched += t->sched_time; 252 cpu->sched += t->se.sum_exec_runtime;
253 } 253 }
254 cpu->sched += sched_ns(p); 254 cpu->sched += sched_ns(p);
255 break; 255 break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
422 */ 422 */
423static void cleanup_timers(struct list_head *head, 423static void cleanup_timers(struct list_head *head,
424 cputime_t utime, cputime_t stime, 424 cputime_t utime, cputime_t stime,
425 unsigned long long sched_time) 425 unsigned long long sum_exec_runtime)
426{ 426{
427 struct cpu_timer_list *timer, *next; 427 struct cpu_timer_list *timer, *next;
428 cputime_t ptime = cputime_add(utime, stime); 428 cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head,
451 ++head; 451 ++head;
452 list_for_each_entry_safe(timer, next, head, entry) { 452 list_for_each_entry_safe(timer, next, head, entry) {
453 list_del_init(&timer->entry); 453 list_del_init(&timer->entry);
454 if (timer->expires.sched < sched_time) { 454 if (timer->expires.sched < sum_exec_runtime) {
455 timer->expires.sched = 0; 455 timer->expires.sched = 0;
456 } else { 456 } else {
457 timer->expires.sched -= sched_time; 457 timer->expires.sched -= sum_exec_runtime;
458 } 458 }
459 } 459 }
460} 460}
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head,
467void posix_cpu_timers_exit(struct task_struct *tsk) 467void posix_cpu_timers_exit(struct task_struct *tsk)
468{ 468{
469 cleanup_timers(tsk->cpu_timers, 469 cleanup_timers(tsk->cpu_timers,
470 tsk->utime, tsk->stime, tsk->sched_time); 470 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
471 471
472} 472}
473void posix_cpu_timers_exit_group(struct task_struct *tsk) 473void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
475 cleanup_timers(tsk->signal->cpu_timers, 475 cleanup_timers(tsk->signal->cpu_timers,
476 cputime_add(tsk->utime, tsk->signal->utime), 476 cputime_add(tsk->utime, tsk->signal->utime),
477 cputime_add(tsk->stime, tsk->signal->stime), 477 cputime_add(tsk->stime, tsk->signal->stime),
478 tsk->sched_time + tsk->signal->sched_time); 478 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
479} 479}
480 480
481 481
@@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p,
536 nsleft = max_t(unsigned long long, nsleft, 1); 536 nsleft = max_t(unsigned long long, nsleft, 1);
537 do { 537 do {
538 if (likely(!(t->flags & PF_EXITING))) { 538 if (likely(!(t->flags & PF_EXITING))) {
539 ns = t->sched_time + nsleft; 539 ns = t->se.sum_exec_runtime + nsleft;
540 if (t->it_sched_expires == 0 || 540 if (t->it_sched_expires == 0 ||
541 t->it_sched_expires > ns) { 541 t->it_sched_expires > ns) {
542 t->it_sched_expires = ns; 542 t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk,
1004 struct cpu_timer_list *t = list_first_entry(timers, 1004 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 1005 struct cpu_timer_list,
1006 entry); 1006 entry);
1007 if (!--maxfire || tsk->sched_time < t->expires.sched) { 1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 1008 tsk->it_sched_expires = t->expires.sched;
1009 break; 1009 break;
1010 } 1010 }
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk,
1024 int maxfire; 1024 int maxfire;
1025 struct signal_struct *const sig = tsk->signal; 1025 struct signal_struct *const sig = tsk->signal;
1026 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1026 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1027 unsigned long long sched_time, sched_expires; 1027 unsigned long long sum_sched_runtime, sched_expires;
1028 struct task_struct *t; 1028 struct task_struct *t;
1029 struct list_head *timers = sig->cpu_timers; 1029 struct list_head *timers = sig->cpu_timers;
1030 1030
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk,
1044 */ 1044 */
1045 utime = sig->utime; 1045 utime = sig->utime;
1046 stime = sig->stime; 1046 stime = sig->stime;
1047 sched_time = sig->sched_time; 1047 sum_sched_runtime = sig->sum_sched_runtime;
1048 t = tsk; 1048 t = tsk;
1049 do { 1049 do {
1050 utime = cputime_add(utime, t->utime); 1050 utime = cputime_add(utime, t->utime);
1051 stime = cputime_add(stime, t->stime); 1051 stime = cputime_add(stime, t->stime);
1052 sched_time += t->sched_time; 1052 sum_sched_runtime += t->se.sum_exec_runtime;
1053 t = next_thread(t); 1053 t = next_thread(t);
1054 } while (t != tsk); 1054 } while (t != tsk);
1055 ptime = cputime_add(utime, stime); 1055 ptime = cputime_add(utime, stime);
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk,
1090 struct cpu_timer_list *t = list_first_entry(timers, 1090 struct cpu_timer_list *t = list_first_entry(timers,
1091 struct cpu_timer_list, 1091 struct cpu_timer_list,
1092 entry); 1092 entry);
1093 if (!--maxfire || sched_time < t->expires.sched) { 1093 if (!--maxfire || sum_sched_runtime < t->expires.sched) {
1094 sched_expires = t->expires.sched; 1094 sched_expires = t->expires.sched;
1095 break; 1095 break;
1096 } 1096 }
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk,
1182 virt_left = cputime_sub(virt_expires, utime); 1182 virt_left = cputime_sub(virt_expires, utime);
1183 virt_left = cputime_div_non_zero(virt_left, nthreads); 1183 virt_left = cputime_div_non_zero(virt_left, nthreads);
1184 if (sched_expires) { 1184 if (sched_expires) {
1185 sched_left = sched_expires - sched_time; 1185 sched_left = sched_expires - sum_sched_runtime;
1186 do_div(sched_left, nthreads); 1186 do_div(sched_left, nthreads);
1187 sched_left = max_t(unsigned long long, sched_left, 1); 1187 sched_left = max_t(unsigned long long, sched_left, 1);
1188 } else { 1188 } else {
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk,
1208 t->it_virt_expires = ticks; 1208 t->it_virt_expires = ticks;
1209 } 1209 }
1210 1210
1211 sched = t->sched_time + sched_left; 1211 sched = t->se.sum_exec_runtime + sched_left;
1212 if (sched_expires && (t->it_sched_expires == 0 || 1212 if (sched_expires && (t->it_sched_expires == 0 ||
1213 t->it_sched_expires > sched)) { 1213 t->it_sched_expires > sched)) {
1214 t->it_sched_expires = sched; 1214 t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1300 1300
1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
1302 (tsk->it_sched_expires == 0 || 1302 (tsk->it_sched_expires == 0 ||
1303 tsk->sched_time < tsk->it_sched_expires)) 1303 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1304 return; 1304 return;
1305 1305
1306#undef UNEXPIRED 1306#undef UNEXPIRED
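For clarity, after this rename the group-wide CPUCLOCK_SCHED value is composed as in the sketch below (illustrative helper name, mirroring the summation loops in cpu_clock_sample_group_locked() and check_process_timers() above): dead threads are accounted in signal->sum_sched_runtime, live ones contribute se.sum_exec_runtime.

/* Hedged sketch, not part of the patch; the real callers hold the
 * appropriate sighand/tasklist locking around the thread walk. */
static unsigned long long group_sched_runtime(struct task_struct *p)
{
	unsigned long long sum = p->signal->sum_sched_runtime;
	struct task_struct *t = p;

	do {
		sum += t->se.sum_exec_runtime;
		t = next_thread(t);
	} while (t != p);

	return sum;
}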
diff --git a/kernel/sched.c b/kernel/sched.c
index 50e1a3122699..9fbced64bfee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -16,13 +16,19 @@
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
19 */ 25 */
20 26
21#include <linux/mm.h> 27#include <linux/mm.h>
22#include <linux/module.h> 28#include <linux/module.h>
23#include <linux/nmi.h> 29#include <linux/nmi.h>
24#include <linux/init.h> 30#include <linux/init.h>
25#include <asm/uaccess.h> 31#include <linux/uaccess.h>
26#include <linux/highmem.h> 32#include <linux/highmem.h>
27#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -53,9 +59,9 @@
53#include <linux/kprobes.h> 59#include <linux/kprobes.h>
54#include <linux/delayacct.h> 60#include <linux/delayacct.h>
55#include <linux/reciprocal_div.h> 61#include <linux/reciprocal_div.h>
62#include <linux/unistd.h>
56 63
57#include <asm/tlb.h> 64#include <asm/tlb.h>
58#include <asm/unistd.h>
59 65
60/* 66/*
61 * Scheduler clock - returns current time in nanosec units. 67 * Scheduler clock - returns current time in nanosec units.
@@ -91,6 +97,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 97#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 98#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93 99
100#define NICE_0_LOAD SCHED_LOAD_SCALE
101#define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
94/* 103/*
95 * These are the 'tuning knobs' of the scheduler: 104 * These are the 'tuning knobs' of the scheduler:
96 * 105 *
@@ -100,87 +109,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
100 */ 109 */
101#define MIN_TIMESLICE max(5 * HZ / 1000, 1) 110#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
102#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
103#define ON_RUNQUEUE_WEIGHT 30
104#define CHILD_PENALTY 95
105#define PARENT_PENALTY 100
106#define EXIT_WEIGHT 3
107#define PRIO_BONUS_RATIO 25
108#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
109#define INTERACTIVE_DELTA 2
110#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
111#define STARVATION_LIMIT (MAX_SLEEP_AVG)
112#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
113
114/*
115 * If a task is 'interactive' then we reinsert it in the active
116 * array after it has expired its current timeslice. (it will not
117 * continue to run immediately, it will still roundrobin with
118 * other interactive tasks.)
119 *
120 * This part scales the interactivity limit depending on niceness.
121 *
122 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
123 * Here are a few examples of different nice levels:
124 *
125 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
126 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
127 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
129 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
130 *
131 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
132 * priority range a task can explore, a value of '1' means the
133 * task is rated interactive.)
134 *
135 * Ie. nice +19 tasks can never get 'interactive' enough to be
136 * reinserted into the active array. And only heavily CPU-hog nice -20
137 * tasks will be expired. Default nice 0 tasks are somewhere between,
138 * it takes some effort for them to get interactive, but it's not
139 * too hard.
140 */
141
142#define CURRENT_BONUS(p) \
143 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
144 MAX_SLEEP_AVG)
145
146#define GRANULARITY (10 * HZ / 1000 ? : 1)
147
148#ifdef CONFIG_SMP
149#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
150 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
151 num_online_cpus())
152#else
153#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
154 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
155#endif
156
157#define SCALE(v1,v1_max,v2_max) \
158 (v1) * (v2_max) / (v1_max)
159
160#define DELTA(p) \
161 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
162 INTERACTIVE_DELTA)
163
164#define TASK_INTERACTIVE(p) \
165 ((p)->prio <= (p)->static_prio - DELTA(p))
166
167#define INTERACTIVE_SLEEP(p) \
168 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
170
171#define TASK_PREEMPTS_CURR(p, rq) \
172 ((p)->prio < (rq)->curr->prio)
173
174#define SCALE_PRIO(x, prio) \
175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
176
177static unsigned int static_prio_timeslice(int static_prio)
178{
179 if (static_prio < NICE_TO_PRIO(0))
180 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
181 else
182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
183}
184 112
185#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
186/* 114/*
@@ -203,28 +131,87 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203} 131}
204#endif 132#endif
205 133
134#define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
206/* 137/*
207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
208 * to time slice values: [800ms ... 100ms ... 5ms] 139 * to time slice values: [800ms ... 100ms ... 5ms]
209 *
210 * The higher a thread's priority, the bigger timeslices
211 * it gets during one round of execution. But even the lowest
212 * priority thread gets MIN_TIMESLICE worth of execution time.
213 */ 140 */
141static unsigned int static_prio_timeslice(int static_prio)
142{
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
145
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
150}
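A worked expansion of this mapping, assuming the usual constants defined outside this hunk (MAX_PRIO = 140, MAX_USER_PRIO = 40, DEF_TIMESLICE = 100 ms worth of jiffies):

/*
 * static_prio_timeslice() examples (illustrative, under the
 * assumptions named above):
 *
 *   nice -20 (prio 100): 4*DEF_TIMESLICE * (140-100)/20 = 800 ms
 *   nice   0 (prio 120):   DEF_TIMESLICE * (140-120)/20 = 100 ms
 *   nice +10 (prio 130):   DEF_TIMESLICE * (140-130)/20 =  50 ms
 *   nice +19 (prio 139): special-cased above to a single jiffy
 */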
151
152static inline int rt_policy(int policy)
153{
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157}
214 158
215static inline unsigned int task_timeslice(struct task_struct *p) 159static inline int task_has_rt_policy(struct task_struct *p)
216{ 160{
217 return static_prio_timeslice(p->static_prio); 161 return rt_policy(p->policy);
218} 162}
219 163
220/* 164/*
221 * These are the runqueue data structures: 165 * This is the priority-queue data structure of the RT scheduling class:
222 */ 166 */
167struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170};
171
172struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
176};
177
178/* CFS-related fields in a runqueue */
179struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192#ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
 194 * It is set to NULL otherwise (i.e. when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
223 198
224struct prio_array { 199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
225 unsigned int nr_active; 200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
226 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 201 * (like users, containers etc.)
227 struct list_head queue[MAX_PRIO]; 202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207#endif
208};
209
210/* Real-Time classes' related field in a runqueue: */
211struct rt_rq {
212 struct rt_prio_array active;
213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
228}; 215};
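A sketch of how an rt_prio_array is consumed (roughly what the pick-next path in sched_rt.c does; the helper name here is illustrative): find the first set priority bit, then take the head of that priority's list.

/* Hedged sketch, not part of this hunk. */
static struct task_struct *rt_array_pick_sketch(struct rt_prio_array *array)
{
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	if (idx >= MAX_RT_PRIO)
		return NULL;			/* no runnable RT task */

	queue = array->queue + idx;
	return list_entry(queue->next, struct task_struct, run_list);
}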
229 216
230/* 217/*
@@ -235,22 +222,28 @@ struct prio_array {
235 * acquire operations must be ordered by ascending &runqueue. 222 * acquire operations must be ordered by ascending &runqueue.
236 */ 223 */
237struct rq { 224struct rq {
238 spinlock_t lock; 225 spinlock_t lock; /* runqueue lock */
239 226
240 /* 227 /*
241 * nr_running and cpu_load should be in the same cacheline because 228 * nr_running and cpu_load should be in the same cacheline because
242 * remote CPUs use both these fields when doing load calculation. 229 * remote CPUs use both these fields when doing load calculation.
243 */ 230 */
244 unsigned long nr_running; 231 unsigned long nr_running;
245 unsigned long raw_weighted_load; 232 #define CPU_LOAD_IDX_MAX 5
246#ifdef CONFIG_SMP 233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick; 234 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ 235#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently; 236 unsigned char in_nohz_recently;
251#endif 237#endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243#ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
252#endif 245#endif
253 unsigned long long nr_switches; 246 struct rt_rq rt;
254 247
255 /* 248 /*
256 * This is part of a global counter where only the total sum 249 * This is part of a global counter where only the total sum
@@ -260,14 +253,18 @@ struct rq {
260 */ 253 */
261 unsigned long nr_uninterruptible; 254 unsigned long nr_uninterruptible;
262 255
263 unsigned long expired_timestamp;
264 /* Cached timestamp set by update_cpu_clock() */
265 unsigned long long most_recent_timestamp;
266 struct task_struct *curr, *idle; 256 struct task_struct *curr, *idle;
267 unsigned long next_balance; 257 unsigned long next_balance;
268 struct mm_struct *prev_mm; 258 struct mm_struct *prev_mm;
269 struct prio_array *active, *expired, arrays[2]; 259
270 int best_expired_prio; 260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
271 atomic_t nr_iowait; 268 atomic_t nr_iowait;
272 269
273#ifdef CONFIG_SMP 270#ifdef CONFIG_SMP
@@ -307,6 +304,11 @@ struct rq {
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex); 305static DEFINE_MUTEX(sched_hotcpu_mutex);
309 306
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308{
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310}
311
310static inline int cpu_of(struct rq *rq) 312static inline int cpu_of(struct rq *rq)
311{ 313{
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
@@ -317,6 +319,52 @@ static inline int cpu_of(struct rq *rq)
317} 319}
318 320
319/* 321/*
322 * Per-runqueue clock, as finegrained as the platform can give us:
323 */
324static unsigned long long __rq_clock(struct rq *rq)
325{
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355}
356
357static inline unsigned long long rq_clock(struct rq *rq)
358{
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365}
366
367/*
320 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
321 * See detach_destroy_domains: synchronize_sched for details. 369 * See detach_destroy_domains: synchronize_sched for details.
322 * 370 *
@@ -331,6 +379,18 @@ static inline int cpu_of(struct rq *rq)
331#define task_rq(p) cpu_rq(task_cpu(p)) 379#define task_rq(p) cpu_rq(task_cpu(p))
332#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
333 381
382#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p)
385{
386 p->se.cfs_rq = &task_rq(p)->cfs;
387}
388#else
389static inline void set_task_cfs_rq(struct task_struct *p)
390{
391}
392#endif
393
334#ifndef prepare_arch_switch 394#ifndef prepare_arch_switch
335# define prepare_arch_switch(next) do { } while (0) 395# define prepare_arch_switch(next) do { } while (0)
336#endif 396#endif
@@ -460,134 +520,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
460 spin_unlock_irqrestore(&rq->lock, *flags); 520 spin_unlock_irqrestore(&rq->lock, *flags);
461} 521}
462 522
463#ifdef CONFIG_SCHEDSTATS
464/*
465 * bump this up when changing the output format or the meaning of an existing
466 * format, so that tools can adapt (or abort)
467 */
468#define SCHEDSTAT_VERSION 14
469
470static int show_schedstat(struct seq_file *seq, void *v)
471{
472 int cpu;
473
474 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
475 seq_printf(seq, "timestamp %lu\n", jiffies);
476 for_each_online_cpu(cpu) {
477 struct rq *rq = cpu_rq(cpu);
478#ifdef CONFIG_SMP
479 struct sched_domain *sd;
480 int dcnt = 0;
481#endif
482
483 /* runqueue-specific stats */
484 seq_printf(seq,
485 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
486 cpu, rq->yld_both_empty,
487 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
488 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
489 rq->ttwu_cnt, rq->ttwu_local,
490 rq->rq_sched_info.cpu_time,
491 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
492
493 seq_printf(seq, "\n");
494
495#ifdef CONFIG_SMP
496 /* domain-specific stats */
497 preempt_disable();
498 for_each_domain(cpu, sd) {
499 enum idle_type itype;
500 char mask_str[NR_CPUS];
501
502 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
503 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
504 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
505 itype++) {
506 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
507 "%lu",
508 sd->lb_cnt[itype],
509 sd->lb_balanced[itype],
510 sd->lb_failed[itype],
511 sd->lb_imbalance[itype],
512 sd->lb_gained[itype],
513 sd->lb_hot_gained[itype],
514 sd->lb_nobusyq[itype],
515 sd->lb_nobusyg[itype]);
516 }
517 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
518 " %lu %lu %lu\n",
519 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
520 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
521 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
522 sd->ttwu_wake_remote, sd->ttwu_move_affine,
523 sd->ttwu_move_balance);
524 }
525 preempt_enable();
526#endif
527 }
528 return 0;
529}
530
531static int schedstat_open(struct inode *inode, struct file *file)
532{
533 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
534 char *buf = kmalloc(size, GFP_KERNEL);
535 struct seq_file *m;
536 int res;
537
538 if (!buf)
539 return -ENOMEM;
540 res = single_open(file, show_schedstat, NULL);
541 if (!res) {
542 m = file->private_data;
543 m->buf = buf;
544 m->size = size;
545 } else
546 kfree(buf);
547 return res;
548}
549
550const struct file_operations proc_schedstat_operations = {
551 .open = schedstat_open,
552 .read = seq_read,
553 .llseek = seq_lseek,
554 .release = single_release,
555};
556
557/*
558 * Expects runqueue lock to be held for atomicity of update
559 */
560static inline void
561rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
562{
563 if (rq) {
564 rq->rq_sched_info.run_delay += delta_jiffies;
565 rq->rq_sched_info.pcnt++;
566 }
567}
568
569/*
570 * Expects runqueue lock to be held for atomicity of update
571 */
572static inline void
573rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
574{
575 if (rq)
576 rq->rq_sched_info.cpu_time += delta_jiffies;
577}
578# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
579# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
580#else /* !CONFIG_SCHEDSTATS */
581static inline void
582rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
583{}
584static inline void
585rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
586{}
587# define schedstat_inc(rq, field) do { } while (0)
588# define schedstat_add(rq, field, amt) do { } while (0)
589#endif
590
591/* 523/*
592 * this_rq_lock - lock this runqueue and disable interrupts. 524 * this_rq_lock - lock this runqueue and disable interrupts.
593 */ 525 */
@@ -603,177 +535,172 @@ static inline struct rq *this_rq_lock(void)
603 return rq; 535 return rq;
604} 536}
605 537
606#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
607/* 538/*
608 * Called when a process is dequeued from the active array and given 539 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
609 * the cpu. We should note that with the exception of interactive
610 * tasks, the expired queue will become the active queue after the active
611 * queue is empty, without explicitly dequeuing and requeuing tasks in the
612 * expired queue. (Interactive tasks may be requeued directly to the
613 * active queue, thus delaying tasks in the expired queue from running;
614 * see scheduler_tick()).
615 *
616 * This function is only called from sched_info_arrive(), rather than
617 * dequeue_task(). Even though a task may be queued and dequeued multiple
618 * times as it is shuffled about, we're really interested in knowing how
619 * long it was from the *first* time it was queued to the time that it
620 * finally hit a cpu.
621 */ 540 */
622static inline void sched_info_dequeued(struct task_struct *t) 541void sched_clock_unstable_event(void)
623{ 542{
624 t->sched_info.last_queued = 0; 543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
625} 550}
626 551
627/* 552/*
628 * Called when a task finally hits the cpu. We can now calculate how 553 * resched_task - mark a task 'to be rescheduled now'.
629 * long it was waiting to run. We also note when it began so that we 554 *
630 * can keep stats on how long its timeslice is. 555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
631 */ 558 */
632static void sched_info_arrive(struct task_struct *t) 559#ifdef CONFIG_SMP
560
561#ifndef tsk_is_polling
562#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563#endif
564
565static void resched_task(struct task_struct *p)
633{ 566{
634 unsigned long now = jiffies, delta_jiffies = 0; 567 int cpu;
568
569 assert_spin_locked(&task_rq(p)->lock);
570
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
635 573
636 if (t->sched_info.last_queued) 574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
637 delta_jiffies = now - t->sched_info.last_queued; 575
638 sched_info_dequeued(t); 576 cpu = task_cpu(p);
639 t->sched_info.run_delay += delta_jiffies; 577 if (cpu == smp_processor_id())
640 t->sched_info.last_arrival = now; 578 return;
641 t->sched_info.pcnt++;
642 579
643 rq_sched_info_arrive(task_rq(t), delta_jiffies); 580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
644} 584}
645 585
646/* 586static void resched_cpu(int cpu)
647 * Called when a process is queued into either the active or expired
648 * array. The time is noted and later used to determine how long we
649 * had to wait for us to reach the cpu. Since the expired queue will
650 * become the active queue after active queue is empty, without dequeuing
651 * and requeuing any tasks, we are interested in queuing to either. It
652 * is unusual but not impossible for tasks to be dequeued and immediately
653 * requeued in the same or another array: this can happen in sched_yield(),
654 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
655 * to runqueue.
656 *
657 * This function is only called from enqueue_task(), but also only updates
658 * the timestamp if it is already not set. It's assumed that
659 * sched_info_dequeued() will clear that stamp when appropriate.
660 */
661static inline void sched_info_queued(struct task_struct *t)
662{ 587{
663 if (unlikely(sched_info_on())) 588 struct rq *rq = cpu_rq(cpu);
664 if (!t->sched_info.last_queued) 589 unsigned long flags;
665 t->sched_info.last_queued = jiffies; 590
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
666} 595}
596#else
597static inline void resched_task(struct task_struct *p)
598{
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
601}
602#endif
667 603
668/* 604static u64 div64_likely32(u64 divident, unsigned long divisor)
669 * Called when a process ceases being the active-running process, either
670 * voluntarily or involuntarily. Now we can calculate how long we ran.
671 */
672static inline void sched_info_depart(struct task_struct *t)
673{ 605{
674 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 606#if BITS_PER_LONG == 32
607 if (likely(divident <= 0xffffffffULL))
608 return (u32)divident / divisor;
609 do_div(divident, divisor);
675 610
676 t->sched_info.cpu_time += delta_jiffies; 611 return divident;
677 rq_sched_info_depart(task_rq(t), delta_jiffies); 612#else
613 return divident / divisor;
614#endif
678} 615}
679 616
680/* 617#if BITS_PER_LONG == 32
681 * Called when tasks are switched involuntarily due, typically, to expiring 618# define WMULT_CONST (~0UL)
682 * their time slice. (This may also be called when switching to or from 619#else
683 * the idle task.) We are only called when prev != next. 620# define WMULT_CONST (1UL << 32)
684 */ 621#endif
685static inline void 622
686__sched_info_switch(struct task_struct *prev, struct task_struct *next) 623#define WMULT_SHIFT 32
624
625static inline unsigned long
626calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
687{ 628{
688 struct rq *rq = task_rq(prev); 629 u64 tmp;
689 630
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
633
634 tmp = (u64)delta_exec * weight;
690 /* 635 /*
691 * prev now departs the cpu. It's not interesting to record 636 * Check whether we'd overflow the 64-bit multiplication:
692 * stats about how efficient we were at scheduling the idle
693 * process, however.
694 */ 637 */
695 if (prev != rq->idle) 638 if (unlikely(tmp > WMULT_CONST)) {
696 sched_info_depart(prev); 639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
643 }
697 644
698 if (next != rq->idle) 645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
699 sched_info_arrive(next);
700}
701static inline void
702sched_info_switch(struct task_struct *prev, struct task_struct *next)
703{
704 if (unlikely(sched_info_on()))
705 __sched_info_switch(prev, next);
706} 646}
707#else
708#define sched_info_queued(t) do { } while (0)
709#define sched_info_switch(t, next) do { } while (0)
710#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
711 647
712/* 648static inline unsigned long
713 * Adding/removing a task to/from a priority array: 649calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
714 */
715static void dequeue_task(struct task_struct *p, struct prio_array *array)
716{ 650{
717 array->nr_active--; 651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
718 list_del(&p->run_list);
719 if (list_empty(array->queue + p->prio))
720 __clear_bit(p->prio, array->bitmap);
721} 652}
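A worked example of the fixed-point scaling above, with illustrative numbers (assuming the 64-bit WMULT_CONST = 2^32):

/*
 * calc_delta_fair() illustration (not from the patch):
 *
 *   delta_exec = 10000000 ns (10 ms), weight = NICE_0_LOAD = 1024,
 *   lw->weight = 2048 (e.g. two nice-0 tasks queued)
 *
 *   inv_weight = WMULT_CONST / 2048 = 2097152
 *   tmp        = 10000000 * 1024    = 10240000000  (> WMULT_CONST,
 *                so the split-shift branch of calc_delta_mine() runs)
 *   result     = ((tmp >> 16) * 2097152) >> 16 = 5000000 ns
 *
 * i.e. with twice the nice-0 load queued, the fair clock advances at
 * half the wall-clock rate, subject to the sysctl_sched_runtime_limit
 * clamp.
 */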
722 653
723static void enqueue_task(struct task_struct *p, struct prio_array *array) 654static void update_load_add(struct load_weight *lw, unsigned long inc)
724{ 655{
725 sched_info_queued(p); 656 lw->weight += inc;
726 list_add_tail(&p->run_list, array->queue + p->prio); 657 lw->inv_weight = 0;
727 __set_bit(p->prio, array->bitmap);
728 array->nr_active++;
729 p->array = array;
730} 658}
731 659
732/* 660static void update_load_sub(struct load_weight *lw, unsigned long dec)
733 * Put task to the end of the run list without the overhead of dequeue
734 * followed by enqueue.
735 */
736static void requeue_task(struct task_struct *p, struct prio_array *array)
737{ 661{
738 list_move_tail(&p->run_list, array->queue + p->prio); 662 lw->weight -= dec;
663 lw->inv_weight = 0;
739} 664}
740 665
741static inline void 666static void __update_curr_load(struct rq *rq, struct load_stat *ls)
742enqueue_task_head(struct task_struct *p, struct prio_array *array)
743{ 667{
744 list_add(&p->run_list, array->queue + p->prio); 668 if (rq->curr != rq->idle && ls->load.weight) {
745 __set_bit(p->prio, array->bitmap); 669 ls->delta_exec += ls->delta_stat;
746 array->nr_active++; 670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
747 p->array = array; 671 ls->delta_stat = 0;
672 }
748} 673}
749 674
750/* 675/*
751 * __normal_prio - return the priority that is based on the static 676 * Update delta_exec, delta_fair fields for rq.
752 * priority but is modified by bonuses/penalties.
753 * 677 *
754 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 678 * delta_fair clock advances at a rate inversely proportional to
755 * into the -5 ... 0 ... +5 bonus/penalty range. 679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
756 * 682 *
757 * We use 25% of the full 0...39 priority range so that: 683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
758 * 686 *
759 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 687 * This function is called /before/ updating rq->ls.load
760 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 688 * and when switching tasks.
761 *
762 * Both properties are important to certain workloads.
763 */ 689 */
764 690static void update_curr_load(struct rq *rq, u64 now)
765static inline int __normal_prio(struct task_struct *p)
766{ 691{
767 int bonus, prio; 692 struct load_stat *ls = &rq->ls;
768 693 u64 start;
769 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
770 694
771 prio = p->static_prio - bonus; 695 start = ls->load_update_start;
772 if (prio < MAX_RT_PRIO) 696 ls->load_update_start = now;
773 prio = MAX_RT_PRIO; 697 ls->delta_stat += now - start;
774 if (prio > MAX_PRIO-1) 698 /*
775 prio = MAX_PRIO-1; 699 * Stagger updates to ls->delta_fair. Very frequent updates
776 return prio; 700 * can be expensive.
701 */
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
777} 704}
778 705
779/* 706/*
@@ -791,53 +718,146 @@ static inline int __normal_prio(struct task_struct *p)
791 * this code will need modification 718 * this code will need modification
792 */ 719 */
793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 720#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
794#define LOAD_WEIGHT(lp) \ 721#define load_weight(lp) \
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796#define PRIO_TO_LOAD_WEIGHT(prio) \ 723#define PRIO_TO_LOAD_WEIGHT(prio) \
797 LOAD_WEIGHT(static_prio_timeslice(prio)) 724 load_weight(static_prio_timeslice(prio))
798#define RTPRIO_TO_LOAD_WEIGHT(rp) \ 725#define RTPRIO_TO_LOAD_WEIGHT(rp) \
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
800 727
801static void set_load_weight(struct task_struct *p) 728#define WEIGHT_IDLEPRIO 2
802{ 729#define WMULT_IDLEPRIO (1 << 31)
803 if (has_rt_policy(p)) { 730
804#ifdef CONFIG_SMP 731/*
805 if (p == task_rq(p)->migration_thread) 732 * Nice levels are multiplicative, with a gentle 10% change for every
806 /* 733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * The migration thread does the actual balancing. 734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * Giving its load any weight will skew balancing 735 * that remained on nice 0.
809 * adversely. 736 *
810 */ 737 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 p->load_weight = 0; 738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 else 739 * it's +10% CPU usage.
813#endif 740 */
814 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 741static const int prio_to_weight[40] = {
815 } else 742/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
816 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 743/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
817} 744/* 0 */ NICE_0_LOAD /* 1024 */,
745/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
746/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
747};
748
749static const u32 prio_to_wmult[40] = {
750 48356, 60446, 75558, 94446, 118058, 147573,
751 184467, 230589, 288233, 360285, 450347,
752 562979, 703746, 879575, 1099582, 1374389,
 753 1717986, 2147483, 2684354, 3355443, 4194304,
 754 5244160, 6557201, 8196502, 10250518, 12782640,
755 16025997, 19976592, 24970740, 31350126, 39045157,
756 49367440, 61356675, 76695844, 95443717, 119304647,
757 148102320, 186737708, 238609294, 286331153,
758};
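Two quick sanity checks on the tables above (illustrative arithmetic, not part of the patch):

/*
 * 1) prio_to_wmult[i] is roughly 2^32 / prio_to_weight[i]:
 *      nice -5: 2^32 / 3125 = 1374389    nice 0: 2^32 / 1024 = 4194304
 *      nice +5: 2^32 /  336 = 12782640
 *
 * 2) The "10% effect": with one nice-0 (weight 1024) and one nice-1
 *    (weight 819) task runnable, CPU time splits roughly
 *      1024 / 1843 ~= 56%   vs.   819 / 1843 ~= 44%
 */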
818 759
819static inline void 760static inline void
820inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 761inc_load(struct rq *rq, const struct task_struct *p, u64 now)
821{ 762{
822 rq->raw_weighted_load += p->load_weight; 763 update_curr_load(rq, now);
764 update_load_add(&rq->ls.load, p->se.load.weight);
823} 765}
824 766
825static inline void 767static inline void
826dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 768dec_load(struct rq *rq, const struct task_struct *p, u64 now)
827{ 769{
828 rq->raw_weighted_load -= p->load_weight; 770 update_curr_load(rq, now);
771 update_load_sub(&rq->ls.load, p->se.load.weight);
829} 772}
830 773
831static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 774static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
832{ 775{
833 rq->nr_running++; 776 rq->nr_running++;
834 inc_raw_weighted_load(rq, p); 777 inc_load(rq, p, now);
835} 778}
836 779
837static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 780static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
838{ 781{
839 rq->nr_running--; 782 rq->nr_running--;
840 dec_raw_weighted_load(rq, p); 783 dec_load(rq, p, now);
784}
785
786static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
787
788/*
789 * runqueue iterator, to support SMP load-balancing between different
790 * scheduling classes, without having to expose their internal data
791 * structures to the load-balancing proper:
792 */
793struct rq_iterator {
794 void *arg;
795 struct task_struct *(*start)(void *);
796 struct task_struct *(*next)(void *);
797};
798
799static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
800 unsigned long max_nr_move, unsigned long max_load_move,
801 struct sched_domain *sd, enum cpu_idle_type idle,
802 int *all_pinned, unsigned long *load_moved,
803 int this_best_prio, int best_prio, int best_prio_seen,
804 struct rq_iterator *iterator);
805
806#include "sched_stats.h"
807#include "sched_rt.c"
808#include "sched_fair.c"
809#include "sched_idletask.c"
810#ifdef CONFIG_SCHED_DEBUG
811# include "sched_debug.c"
812#endif
813
814#define sched_class_highest (&rt_sched_class)
815
816static void set_load_weight(struct task_struct *p)
817{
818 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
819 p->se.wait_runtime = 0;
820
821 if (task_has_rt_policy(p)) {
822 p->se.load.weight = prio_to_weight[0] * 2;
823 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
824 return;
825 }
826
827 /*
828 * SCHED_IDLE tasks get minimal weight:
829 */
830 if (p->policy == SCHED_IDLE) {
831 p->se.load.weight = WEIGHT_IDLEPRIO;
832 p->se.load.inv_weight = WMULT_IDLEPRIO;
833 return;
834 }
835
836 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
837 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
838}
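For reference, the weights this assigns to a few representative tasks, derived from the code and tables above (illustrative summary, not part of the patch):

/*
 *   SCHED_FIFO / SCHED_RR   : prio_to_weight[0] * 2 = 177636
 *   SCHED_IDLE              : WEIGHT_IDLEPRIO       = 2
 *   SCHED_NORMAL, nice   0  : prio_to_weight[20]    = 1024 (NICE_0_LOAD)
 *   SCHED_NORMAL, nice +19  : prio_to_weight[39]    = 15
 */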
839
840static void
841enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
842{
843 sched_info_queued(p);
844 p->sched_class->enqueue_task(rq, p, wakeup, now);
845 p->se.on_rq = 1;
846}
847
848static void
849dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
850{
851 p->sched_class->dequeue_task(rq, p, sleep, now);
852 p->se.on_rq = 0;
853}
854
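enqueue_task(), dequeue_task() and check_preempt_curr() above all dispatch through the task's scheduling class; below is a trimmed sketch of that interface showing only the hooks used in this file (the real struct sched_class in include/linux/sched.h carries more methods).

/* Hedged sketch, not the full definition. */
struct sched_class_sketch {
	void (*enqueue_task)(struct rq *rq, struct task_struct *p,
			     int wakeup, u64 now);
	void (*dequeue_task)(struct rq *rq, struct task_struct *p,
			     int sleep, u64 now);
	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p);
};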
855/*
856 * __normal_prio - return the priority that is based on the static prio
857 */
858static inline int __normal_prio(struct task_struct *p)
859{
860 return p->static_prio;
841} 861}
842 862
843/* 863/*
@@ -851,7 +871,7 @@ static inline int normal_prio(struct task_struct *p)
851{ 871{
852 int prio; 872 int prio;
853 873
854 if (has_rt_policy(p)) 874 if (task_has_rt_policy(p))
855 prio = MAX_RT_PRIO-1 - p->rt_priority; 875 prio = MAX_RT_PRIO-1 - p->rt_priority;
856 else 876 else
857 prio = __normal_prio(p); 877 prio = __normal_prio(p);
@@ -879,222 +899,47 @@ static int effective_prio(struct task_struct *p)
879} 899}
880 900
881/* 901/*
882 * __activate_task - move a task to the runqueue. 902 * activate_task - move a task to the runqueue.
883 */
884static void __activate_task(struct task_struct *p, struct rq *rq)
885{
886 struct prio_array *target = rq->active;
887
888 if (batch_task(p))
889 target = rq->expired;
890 enqueue_task(p, target);
891 inc_nr_running(p, rq);
892}
893
894/*
895 * __activate_idle_task - move idle task to the _front_ of runqueue.
896 */
897static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
898{
899 enqueue_task_head(p, rq->active);
900 inc_nr_running(p, rq);
901}
902
903/*
904 * Recalculate p->normal_prio and p->prio after having slept,
905 * updating the sleep-average too:
906 */ 903 */
907static int recalc_task_prio(struct task_struct *p, unsigned long long now) 904static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
908{ 905{
909 /* Caller must always ensure 'now >= p->timestamp' */ 906 u64 now = rq_clock(rq);
910 unsigned long sleep_time = now - p->timestamp;
911
912 if (batch_task(p))
913 sleep_time = 0;
914
915 if (likely(sleep_time > 0)) {
916 /*
917 * This ceiling is set to the lowest priority that would allow
918 * a task to be reinserted into the active array on timeslice
919 * completion.
920 */
921 unsigned long ceiling = INTERACTIVE_SLEEP(p);
922
923 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
924 /*
925 * Prevents user tasks from achieving best priority
926 * with one single large enough sleep.
927 */
928 p->sleep_avg = ceiling;
929 /*
930 * Using INTERACTIVE_SLEEP() as a ceiling places a
931 * nice(0) task 1ms sleep away from promotion, and
932 * gives it 700ms to round-robin with no chance of
933 * being demoted. This is more than generous, so
934 * mark this sleep as non-interactive to prevent the
935 * on-runqueue bonus logic from intervening should
936 * this task not receive cpu immediately.
937 */
938 p->sleep_type = SLEEP_NONINTERACTIVE;
939 } else {
940 /*
941 * Tasks waking from uninterruptible sleep are
942 * limited in their sleep_avg rise as they
943 * are likely to be waiting on I/O
944 */
945 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
946 if (p->sleep_avg >= ceiling)
947 sleep_time = 0;
948 else if (p->sleep_avg + sleep_time >=
949 ceiling) {
950 p->sleep_avg = ceiling;
951 sleep_time = 0;
952 }
953 }
954 907
955 /* 908 if (p->state == TASK_UNINTERRUPTIBLE)
956 * This code gives a bonus to interactive tasks. 909 rq->nr_uninterruptible--;
957 *
958 * The boost works by updating the 'average sleep time'
959 * value here, based on ->timestamp. The more time a
960 * task spends sleeping, the higher the average gets -
961 * and the higher the priority boost gets as well.
962 */
963 p->sleep_avg += sleep_time;
964
965 }
966 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
967 p->sleep_avg = NS_MAX_SLEEP_AVG;
968 }
969 910
970 return effective_prio(p); 911 enqueue_task(rq, p, wakeup, now);
912 inc_nr_running(p, rq, now);
971} 913}
972 914
973/* 915/*
974 * activate_task - move a task to the runqueue and do priority recalculation 916 * activate_idle_task - move idle task to the _front_ of runqueue.
975 *
976 * Update all the scheduling statistics stuff. (sleep average
977 * calculation, priority modifiers, etc.)
978 */ 917 */
979static void activate_task(struct task_struct *p, struct rq *rq, int local) 918static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
980{ 919{
981 unsigned long long now; 920 u64 now = rq_clock(rq);
982 921
983 if (rt_task(p)) 922 if (p->state == TASK_UNINTERRUPTIBLE)
984 goto out; 923 rq->nr_uninterruptible--;
985
986 now = sched_clock();
987#ifdef CONFIG_SMP
988 if (!local) {
989 /* Compensate for drifting sched_clock */
990 struct rq *this_rq = this_rq();
991 now = (now - this_rq->most_recent_timestamp)
992 + rq->most_recent_timestamp;
993 }
994#endif
995
996 /*
997 * Sleep time is in units of nanosecs, so shift by 20 to get a
998 * milliseconds-range estimation of the amount of time that the task
999 * spent sleeping:
1000 */
1001 if (unlikely(prof_on == SLEEP_PROFILING)) {
1002 if (p->state == TASK_UNINTERRUPTIBLE)
1003 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1004 (now - p->timestamp) >> 20);
1005 }
1006
1007 p->prio = recalc_task_prio(p, now);
1008 924
1009 /* 925 enqueue_task(rq, p, 0, now);
1010 * This checks to make sure it's not an uninterruptible task 926 inc_nr_running(p, rq, now);
1011 * that is now waking up.
1012 */
1013 if (p->sleep_type == SLEEP_NORMAL) {
1014 /*
1015 * Tasks which were woken up by interrupts (ie. hw events)
1016 * are most likely of interactive nature. So we give them
1017 * the credit of extending their sleep time to the period
1018 * of time they spend on the runqueue, waiting for execution
1019 * on a CPU, first time around:
1020 */
1021 if (in_interrupt())
1022 p->sleep_type = SLEEP_INTERRUPTED;
1023 else {
1024 /*
1025 * Normal first-time wakeups get a credit too for
1026 * on-runqueue time, but it will be weighted down:
1027 */
1028 p->sleep_type = SLEEP_INTERACTIVE;
1029 }
1030 }
1031 p->timestamp = now;
1032out:
1033 __activate_task(p, rq);
1034} 927}
1035 928
1036/* 929/*
1037 * deactivate_task - remove a task from the runqueue. 930 * deactivate_task - remove a task from the runqueue.
1038 */ 931 */
1039static void deactivate_task(struct task_struct *p, struct rq *rq) 932static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1040{
1041 dec_nr_running(p, rq);
1042 dequeue_task(p, p->array);
1043 p->array = NULL;
1044}
1045
1046/*
1047 * resched_task - mark a task 'to be rescheduled now'.
1048 *
1049 * On UP this means the setting of the need_resched flag, on SMP it
1050 * might also involve a cross-CPU call to trigger the scheduler on
1051 * the target CPU.
1052 */
1053#ifdef CONFIG_SMP
1054
1055#ifndef tsk_is_polling
1056#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1057#endif
1058
1059static void resched_task(struct task_struct *p)
1060{ 933{
1061 int cpu; 934 u64 now = rq_clock(rq);
1062 935
1063 assert_spin_locked(&task_rq(p)->lock); 936 if (p->state == TASK_UNINTERRUPTIBLE)
937 rq->nr_uninterruptible++;
1064 938
1065 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 939 dequeue_task(rq, p, sleep, now);
1066 return; 940 dec_nr_running(p, rq, now);
1067
1068 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1069
1070 cpu = task_cpu(p);
1071 if (cpu == smp_processor_id())
1072 return;
1073
1074 /* NEED_RESCHED must be visible before we test polling */
1075 smp_mb();
1076 if (!tsk_is_polling(p))
1077 smp_send_reschedule(cpu);
1078} 941}
1079 942
1080static void resched_cpu(int cpu)
1081{
1082 struct rq *rq = cpu_rq(cpu);
1083 unsigned long flags;
1084
1085 if (!spin_trylock_irqsave(&rq->lock, flags))
1086 return;
1087 resched_task(cpu_curr(cpu));
1088 spin_unlock_irqrestore(&rq->lock, flags);
1089}
1090#else
1091static inline void resched_task(struct task_struct *p)
1092{
1093 assert_spin_locked(&task_rq(p)->lock);
1094 set_tsk_need_resched(p);
1095}
1096#endif
1097
1098/** 943/**
1099 * task_curr - is this task currently executing on a CPU? 944 * task_curr - is this task currently executing on a CPU?
1100 * @p: the task in question. 945 * @p: the task in question.
@@ -1107,10 +952,42 @@ inline int task_curr(const struct task_struct *p)
1107/* Used instead of source_load when we know the type == 0 */ 952/* Used instead of source_load when we know the type == 0 */
1108unsigned long weighted_cpuload(const int cpu) 953unsigned long weighted_cpuload(const int cpu)
1109{ 954{
1110 return cpu_rq(cpu)->raw_weighted_load; 955 return cpu_rq(cpu)->ls.load.weight;
956}
957
958static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
959{
960#ifdef CONFIG_SMP
961 task_thread_info(p)->cpu = cpu;
962 set_task_cfs_rq(p);
963#endif
1111} 964}
1112 965
1113#ifdef CONFIG_SMP 966#ifdef CONFIG_SMP
967
968void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
969{
970 int old_cpu = task_cpu(p);
971 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
972 u64 clock_offset, fair_clock_offset;
973
974 clock_offset = old_rq->clock - new_rq->clock;
975 fair_clock_offset = old_rq->cfs.fair_clock -
976 new_rq->cfs.fair_clock;
977 if (p->se.wait_start)
978 p->se.wait_start -= clock_offset;
979 if (p->se.wait_start_fair)
980 p->se.wait_start_fair -= fair_clock_offset;
981 if (p->se.sleep_start)
982 p->se.sleep_start -= clock_offset;
983 if (p->se.block_start)
984 p->se.block_start -= clock_offset;
985 if (p->se.sleep_start_fair)
986 p->se.sleep_start_fair -= fair_clock_offset;
987
988 __set_task_cpu(p, new_cpu);
989}
990
1114struct migration_req { 991struct migration_req {
1115 struct list_head list; 992 struct list_head list;
1116 993
@@ -1133,7 +1010,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1133 * If the task is not on a runqueue (and not running), then 1010 * If the task is not on a runqueue (and not running), then
1134 * it is sufficient to simply update the task's cpu field. 1011 * it is sufficient to simply update the task's cpu field.
1135 */ 1012 */
1136 if (!p->array && !task_running(rq, p)) { 1013 if (!p->se.on_rq && !task_running(rq, p)) {
1137 set_task_cpu(p, dest_cpu); 1014 set_task_cpu(p, dest_cpu);
1138 return 0; 1015 return 0;
1139 } 1016 }
@@ -1158,9 +1035,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1158void wait_task_inactive(struct task_struct *p) 1035void wait_task_inactive(struct task_struct *p)
1159{ 1036{
1160 unsigned long flags; 1037 unsigned long flags;
1038 int running, on_rq;
1161 struct rq *rq; 1039 struct rq *rq;
1162 struct prio_array *array;
1163 int running;
1164 1040
1165repeat: 1041repeat:
1166 /* 1042 /*
@@ -1192,7 +1068,7 @@ repeat:
1192 */ 1068 */
1193 rq = task_rq_lock(p, &flags); 1069 rq = task_rq_lock(p, &flags);
1194 running = task_running(rq, p); 1070 running = task_running(rq, p);
1195 array = p->array; 1071 on_rq = p->se.on_rq;
1196 task_rq_unlock(rq, &flags); 1072 task_rq_unlock(rq, &flags);
1197 1073
1198 /* 1074 /*
@@ -1215,7 +1091,7 @@ repeat:
1215 * running right now), it's preempted, and we should 1091 * running right now), it's preempted, and we should
1216 * yield - it could be a while. 1092 * yield - it could be a while.
1217 */ 1093 */
1218 if (unlikely(array)) { 1094 if (unlikely(on_rq)) {
1219 yield(); 1095 yield();
1220 goto repeat; 1096 goto repeat;
1221 } 1097 }
@@ -1261,11 +1137,12 @@ void kick_process(struct task_struct *p)
1261static inline unsigned long source_load(int cpu, int type) 1137static inline unsigned long source_load(int cpu, int type)
1262{ 1138{
1263 struct rq *rq = cpu_rq(cpu); 1139 struct rq *rq = cpu_rq(cpu);
1140 unsigned long total = weighted_cpuload(cpu);
1264 1141
1265 if (type == 0) 1142 if (type == 0)
1266 return rq->raw_weighted_load; 1143 return total;
1267 1144
1268 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1145 return min(rq->cpu_load[type-1], total);
1269} 1146}
1270 1147
1271/* 1148/*
@@ -1275,11 +1152,12 @@ static inline unsigned long source_load(int cpu, int type)
1275static inline unsigned long target_load(int cpu, int type) 1152static inline unsigned long target_load(int cpu, int type)
1276{ 1153{
1277 struct rq *rq = cpu_rq(cpu); 1154 struct rq *rq = cpu_rq(cpu);
1155 unsigned long total = weighted_cpuload(cpu);
1278 1156
1279 if (type == 0) 1157 if (type == 0)
1280 return rq->raw_weighted_load; 1158 return total;
1281 1159
1282 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1160 return max(rq->cpu_load[type-1], total);
1283} 1161}
1284 1162
1285/* 1163/*
@@ -1288,9 +1166,10 @@ static inline unsigned long target_load(int cpu, int type)
1288static inline unsigned long cpu_avg_load_per_task(int cpu) 1166static inline unsigned long cpu_avg_load_per_task(int cpu)
1289{ 1167{
1290 struct rq *rq = cpu_rq(cpu); 1168 struct rq *rq = cpu_rq(cpu);
1169 unsigned long total = weighted_cpuload(cpu);
1291 unsigned long n = rq->nr_running; 1170 unsigned long n = rq->nr_running;
1292 1171
1293 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1172 return n ? total / n : SCHED_LOAD_SCALE;
1294} 1173}
1295 1174
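[Editor's note] Sketch of the estimates used by source_load(), target_load() and cpu_avg_load_per_task() above: the pull side takes the minimum of the decayed cpu_load[] history and the instantaneous weighted load, the push side takes the maximum. Standalone illustration with invented numbers, not kernel code.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	unsigned long cpu_load_hist = 2048;	/* decayed history, cpu_load[type-1] */
	unsigned long now_load      = 1024;	/* instantaneous weighted load */

	/* Pulling from a CPU: use the smaller value, so a momentary spike in
	 * history does not make the CPU look busier than it is right now. */
	unsigned long src = min_ul(cpu_load_hist, now_load);

	/* Pushing to a CPU: use the larger value, so a momentary dip does not
	 * make the target look emptier than it usually is. */
	unsigned long dst = max_ul(cpu_load_hist, now_load);

	printf("source estimate %lu, target estimate %lu\n", src, dst);
	return 0;
}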
1296/* 1175/*
@@ -1392,9 +1271,9 @@ static int sched_balance_self(int cpu, int flag)
1392 struct sched_domain *tmp, *sd = NULL; 1271 struct sched_domain *tmp, *sd = NULL;
1393 1272
1394 for_each_domain(cpu, tmp) { 1273 for_each_domain(cpu, tmp) {
1395 /* 1274 /*
1396 * If power savings logic is enabled for a domain, stop there. 1275 * If power savings logic is enabled for a domain, stop there.
1397 */ 1276 */
1398 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1277 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1399 break; 1278 break;
1400 if (tmp->flags & flag) 1279 if (tmp->flags & flag)
@@ -1477,9 +1356,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1477 if (idle_cpu(i)) 1356 if (idle_cpu(i))
1478 return i; 1357 return i;
1479 } 1358 }
1480 } 1359 } else {
1481 else
1482 break; 1360 break;
1361 }
1483 } 1362 }
1484 return cpu; 1363 return cpu;
1485} 1364}
@@ -1521,7 +1400,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1521 if (!(old_state & state)) 1400 if (!(old_state & state))
1522 goto out; 1401 goto out;
1523 1402
1524 if (p->array) 1403 if (p->se.on_rq)
1525 goto out_running; 1404 goto out_running;
1526 1405
1527 cpu = task_cpu(p); 1406 cpu = task_cpu(p);
@@ -1576,11 +1455,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1576 * of the current CPU: 1455 * of the current CPU:
1577 */ 1456 */
1578 if (sync) 1457 if (sync)
1579 tl -= current->load_weight; 1458 tl -= current->se.load.weight;
1580 1459
1581 if ((tl <= load && 1460 if ((tl <= load &&
1582 tl + target_load(cpu, idx) <= tl_per_task) || 1461 tl + target_load(cpu, idx) <= tl_per_task) ||
1583 100*(tl + p->load_weight) <= imbalance*load) { 1462 100*(tl + p->se.load.weight) <= imbalance*load) {
1584 /* 1463 /*
1585 * This domain has SD_WAKE_AFFINE and 1464 * This domain has SD_WAKE_AFFINE and
1586 * p is cache cold in this domain, and 1465 * p is cache cold in this domain, and
@@ -1614,7 +1493,7 @@ out_set_cpu:
1614 old_state = p->state; 1493 old_state = p->state;
1615 if (!(old_state & state)) 1494 if (!(old_state & state))
1616 goto out; 1495 goto out;
1617 if (p->array) 1496 if (p->se.on_rq)
1618 goto out_running; 1497 goto out_running;
1619 1498
1620 this_cpu = smp_processor_id(); 1499 this_cpu = smp_processor_id();
@@ -1623,25 +1502,7 @@ out_set_cpu:
1623 1502
1624out_activate: 1503out_activate:
1625#endif /* CONFIG_SMP */ 1504#endif /* CONFIG_SMP */
1626 if (old_state == TASK_UNINTERRUPTIBLE) { 1505 activate_task(rq, p, 1);
1627 rq->nr_uninterruptible--;
1628 /*
1629 * Tasks on involuntary sleep don't earn
1630 * sleep_avg beyond just interactive state.
1631 */
1632 p->sleep_type = SLEEP_NONINTERACTIVE;
1633 } else
1634
1635 /*
1636 * Tasks that have marked their sleep as noninteractive get
1637 * woken up with their sleep average not weighted in an
1638 * interactive way.
1639 */
1640 if (old_state & TASK_NONINTERACTIVE)
1641 p->sleep_type = SLEEP_NONINTERACTIVE;
1642
1643
1644 activate_task(p, rq, cpu == this_cpu);
1645 /* 1506 /*
1646 * Sync wakeups (i.e. those types of wakeups where the waker 1507 * Sync wakeups (i.e. those types of wakeups where the waker
1647 * has indicated that it will leave the CPU in short order) 1508 * has indicated that it will leave the CPU in short order)
@@ -1650,10 +1511,8 @@ out_activate:
1650 * the waker guarantees that the freshly woken up task is going 1511 * the waker guarantees that the freshly woken up task is going
1651 * to be considered on this CPU.) 1512 * to be considered on this CPU.)
1652 */ 1513 */
1653 if (!sync || cpu != this_cpu) { 1514 if (!sync || cpu != this_cpu)
1654 if (TASK_PREEMPTS_CURR(p, rq)) 1515 check_preempt_curr(rq, p);
1655 resched_task(rq->curr);
1656 }
1657 success = 1; 1516 success = 1;
1658 1517
1659out_running: 1518out_running:
@@ -1676,19 +1535,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1676 return try_to_wake_up(p, state, 0); 1535 return try_to_wake_up(p, state, 0);
1677} 1536}
1678 1537
1679static void task_running_tick(struct rq *rq, struct task_struct *p);
1680/* 1538/*
1681 * Perform scheduler related setup for a newly forked process p. 1539 * Perform scheduler related setup for a newly forked process p.
1682 * p is forked by current. 1540 * p is forked by current.
1683 */ 1541 *
1684void fastcall sched_fork(struct task_struct *p, int clone_flags) 1542 * __sched_fork() is basic setup used by init_idle() too:
1685{ 1543 */
1686 int cpu = get_cpu(); 1544static void __sched_fork(struct task_struct *p)
1545{
1546 p->se.wait_start_fair = 0;
1547 p->se.wait_start = 0;
1548 p->se.exec_start = 0;
1549 p->se.sum_exec_runtime = 0;
1550 p->se.delta_exec = 0;
1551 p->se.delta_fair_run = 0;
1552 p->se.delta_fair_sleep = 0;
1553 p->se.wait_runtime = 0;
1554 p->se.sum_wait_runtime = 0;
1555 p->se.sum_sleep_runtime = 0;
1556 p->se.sleep_start = 0;
1557 p->se.sleep_start_fair = 0;
1558 p->se.block_start = 0;
1559 p->se.sleep_max = 0;
1560 p->se.block_max = 0;
1561 p->se.exec_max = 0;
1562 p->se.wait_max = 0;
1563 p->se.wait_runtime_overruns = 0;
1564 p->se.wait_runtime_underruns = 0;
1687 1565
1688#ifdef CONFIG_SMP 1566 INIT_LIST_HEAD(&p->run_list);
1689 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1567 p->se.on_rq = 0;
1690#endif
1691 set_task_cpu(p, cpu);
1692 1568
1693 /* 1569 /*
1694 * We mark the process as running here, but have not actually 1570 * We mark the process as running here, but have not actually
@@ -1697,16 +1573,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1697 * event cannot wake it up and insert it on the runqueue either. 1573 * event cannot wake it up and insert it on the runqueue either.
1698 */ 1574 */
1699 p->state = TASK_RUNNING; 1575 p->state = TASK_RUNNING;
1576}
1577
1578/*
1579 * fork()/clone()-time setup:
1580 */
1581void sched_fork(struct task_struct *p, int clone_flags)
1582{
1583 int cpu = get_cpu();
1584
1585 __sched_fork(p);
1586
1587#ifdef CONFIG_SMP
1588 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1589#endif
1590 __set_task_cpu(p, cpu);
1700 1591
1701 /* 1592 /*
1702 * Make sure we do not leak PI boosting priority to the child: 1593 * Make sure we do not leak PI boosting priority to the child:
1703 */ 1594 */
1704 p->prio = current->normal_prio; 1595 p->prio = current->normal_prio;
1705 1596
1706 INIT_LIST_HEAD(&p->run_list);
1707 p->array = NULL;
1708#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1597#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1709 if (unlikely(sched_info_on())) 1598 if (likely(sched_info_on()))
1710 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1599 memset(&p->sched_info, 0, sizeof(p->sched_info));
1711#endif 1600#endif
1712#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1601#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1716,34 +1605,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1716 /* Want to start with kernel preemption disabled. */ 1605 /* Want to start with kernel preemption disabled. */
1717 task_thread_info(p)->preempt_count = 1; 1606 task_thread_info(p)->preempt_count = 1;
1718#endif 1607#endif
1719 /*
1720 * Share the timeslice between parent and child, thus the
1721 * total amount of pending timeslices in the system doesn't change,
1722 * resulting in more scheduling fairness.
1723 */
1724 local_irq_disable();
1725 p->time_slice = (current->time_slice + 1) >> 1;
1726 /*
1727 * The remainder of the first timeslice might be recovered by
1728 * the parent if the child exits early enough.
1729 */
1730 p->first_time_slice = 1;
1731 current->time_slice >>= 1;
1732 p->timestamp = sched_clock();
1733 if (unlikely(!current->time_slice)) {
1734 /*
1735 * This case is rare, it happens when the parent has only
1736 * a single jiffy left from its timeslice. Taking the
1737 * runqueue lock is not a problem.
1738 */
1739 current->time_slice = 1;
1740 task_running_tick(cpu_rq(cpu), current);
1741 }
1742 local_irq_enable();
1743 put_cpu(); 1608 put_cpu();
1744} 1609}
1745 1610
1746/* 1611/*
1612 * After fork, child runs first. (default) If set to 0 then
1613 * parent will (try to) run first.
1614 */
1615unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1616
1617/*
1747 * wake_up_new_task - wake up a newly created task for the first time. 1618 * wake_up_new_task - wake up a newly created task for the first time.
1748 * 1619 *
1749 * This function will do some initial scheduler statistics housekeeping 1620 * This function will do some initial scheduler statistics housekeeping
@@ -1752,107 +1623,27 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1752 */ 1623 */
1753void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1624void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1754{ 1625{
1755 struct rq *rq, *this_rq;
1756 unsigned long flags; 1626 unsigned long flags;
1757 int this_cpu, cpu; 1627 struct rq *rq;
1628 int this_cpu;
1758 1629
1759 rq = task_rq_lock(p, &flags); 1630 rq = task_rq_lock(p, &flags);
1760 BUG_ON(p->state != TASK_RUNNING); 1631 BUG_ON(p->state != TASK_RUNNING);
1761 this_cpu = smp_processor_id(); 1632 this_cpu = smp_processor_id(); /* parent's CPU */
1762 cpu = task_cpu(p);
1763
1764 /*
1765 * We decrease the sleep average of forking parents
1766 * and children as well, to keep max-interactive tasks
1767 * from forking tasks that are max-interactive. The parent
1768 * (current) is done further down, under its lock.
1769 */
1770 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1771 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1772 1633
1773 p->prio = effective_prio(p); 1634 p->prio = effective_prio(p);
1774 1635
1775 if (likely(cpu == this_cpu)) { 1636 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1776 if (!(clone_flags & CLONE_VM)) { 1637 task_cpu(p) != this_cpu || !current->se.on_rq) {
1777 /* 1638 activate_task(rq, p, 0);
1778 * The VM isn't cloned, so we're in a good position to
1779 * do child-runs-first in anticipation of an exec. This
1780 * usually avoids a lot of COW overhead.
1781 */
1782 if (unlikely(!current->array))
1783 __activate_task(p, rq);
1784 else {
1785 p->prio = current->prio;
1786 p->normal_prio = current->normal_prio;
1787 list_add_tail(&p->run_list, &current->run_list);
1788 p->array = current->array;
1789 p->array->nr_active++;
1790 inc_nr_running(p, rq);
1791 }
1792 set_need_resched();
1793 } else
1794 /* Run child last */
1795 __activate_task(p, rq);
1796 /*
1797 * We skip the following code due to cpu == this_cpu
1798 *
1799 * task_rq_unlock(rq, &flags);
1800 * this_rq = task_rq_lock(current, &flags);
1801 */
1802 this_rq = rq;
1803 } else { 1639 } else {
1804 this_rq = cpu_rq(this_cpu);
1805
1806 /* 1640 /*
1807 * Not the local CPU - must adjust timestamp. This should 1641 * Let the scheduling class do new task startup
1808 * get optimised away in the !CONFIG_SMP case. 1642 * management (if any):
1809 */ 1643 */
1810 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) 1644 p->sched_class->task_new(rq, p);
1811 + rq->most_recent_timestamp;
1812 __activate_task(p, rq);
1813 if (TASK_PREEMPTS_CURR(p, rq))
1814 resched_task(rq->curr);
1815
1816 /*
1817 * Parent and child are on different CPUs, now get the
1818 * parent runqueue to update the parent's ->sleep_avg:
1819 */
1820 task_rq_unlock(rq, &flags);
1821 this_rq = task_rq_lock(current, &flags);
1822 }
1823 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1824 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1825 task_rq_unlock(this_rq, &flags);
1826}
1827
1828/*
1829 * Potentially available exiting-child timeslices are
1830 * retrieved here - this way the parent does not get
1831 * penalized for creating too many threads.
1832 *
1833 * (this cannot be used to 'generate' timeslices
1834 * artificially, because any timeslice recovered here
1835 * was given away by the parent in the first place.)
1836 */
1837void fastcall sched_exit(struct task_struct *p)
1838{
1839 unsigned long flags;
1840 struct rq *rq;
1841
1842 /*
1843 * If the child was a (relative-) CPU hog then decrease
1844 * the sleep_avg of the parent as well.
1845 */
1846 rq = task_rq_lock(p->parent, &flags);
1847 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1848 p->parent->time_slice += p->time_slice;
1849 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1850 p->parent->time_slice = task_timeslice(p);
1851 } 1645 }
1852 if (p->sleep_avg < p->parent->sleep_avg) 1646 check_preempt_curr(rq, p);
1853 p->parent->sleep_avg = p->parent->sleep_avg /
1854 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1855 (EXIT_WEIGHT + 1);
1856 task_rq_unlock(rq, &flags); 1647 task_rq_unlock(rq, &flags);
1857} 1648}
1858 1649
@@ -1917,7 +1708,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1917 /* 1708 /*
1918 * Remove function-return probe instances associated with this 1709 * Remove function-return probe instances associated with this
1919 * task and put them back on the free list. 1710 * task and put them back on the free list.
1920 */ 1711 */
1921 kprobe_flush_task(prev); 1712 kprobe_flush_task(prev);
1922 put_task_struct(prev); 1713 put_task_struct(prev);
1923 } 1714 }
@@ -1945,13 +1736,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1945 * context_switch - switch to the new MM and the new 1736 * context_switch - switch to the new MM and the new
1946 * thread's register state. 1737 * thread's register state.
1947 */ 1738 */
1948static inline struct task_struct * 1739static inline void
1949context_switch(struct rq *rq, struct task_struct *prev, 1740context_switch(struct rq *rq, struct task_struct *prev,
1950 struct task_struct *next) 1741 struct task_struct *next)
1951{ 1742{
1952 struct mm_struct *mm = next->mm; 1743 struct mm_struct *mm, *oldmm;
1953 struct mm_struct *oldmm = prev->active_mm;
1954 1744
1745 prepare_task_switch(rq, next);
1746 mm = next->mm;
1747 oldmm = prev->active_mm;
1955 /* 1748 /*
1956 * For paravirt, this is coupled with an exit in switch_to to 1749 * For paravirt, this is coupled with an exit in switch_to to
1957 * combine the page table reload and the switch backend into 1750 * combine the page table reload and the switch backend into
@@ -1959,16 +1752,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1959 */ 1752 */
1960 arch_enter_lazy_cpu_mode(); 1753 arch_enter_lazy_cpu_mode();
1961 1754
1962 if (!mm) { 1755 if (unlikely(!mm)) {
1963 next->active_mm = oldmm; 1756 next->active_mm = oldmm;
1964 atomic_inc(&oldmm->mm_count); 1757 atomic_inc(&oldmm->mm_count);
1965 enter_lazy_tlb(oldmm, next); 1758 enter_lazy_tlb(oldmm, next);
1966 } else 1759 } else
1967 switch_mm(oldmm, mm, next); 1760 switch_mm(oldmm, mm, next);
1968 1761
1969 if (!prev->mm) { 1762 if (unlikely(!prev->mm)) {
1970 prev->active_mm = NULL; 1763 prev->active_mm = NULL;
1971 WARN_ON(rq->prev_mm);
1972 rq->prev_mm = oldmm; 1764 rq->prev_mm = oldmm;
1973 } 1765 }
1974 /* 1766 /*
@@ -1984,7 +1776,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1984 /* Here we just switch the register state and the stack. */ 1776 /* Here we just switch the register state and the stack. */
1985 switch_to(prev, next, prev); 1777 switch_to(prev, next, prev);
1986 1778
1987 return prev; 1779 barrier();
1780 /*
1781 * this_rq must be evaluated again because prev may have moved
1782 * CPUs since it called schedule(), thus the 'rq' on its stack
1783 * frame will be invalid.
1784 */
1785 finish_task_switch(this_rq(), prev);
1988} 1786}
1989 1787
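[Editor's note] Toy model of the "!mm" branch in context_switch() above: a kernel thread has no address space of its own, so it borrows the previous task's active_mm and takes a reference on it. Simplified userspace refcounting with made-up structs, not kernel code.

#include <stdio.h>

struct toy_mm { const char *owner; int mm_count; };

struct toy_task { struct toy_mm *mm; struct toy_mm *active_mm; };

int main(void)
{
	struct toy_mm user_mm = { "user process", 1 };
	struct toy_task prev = { &user_mm, &user_mm };	/* user task */
	struct toy_task next = { NULL, NULL };		/* kernel thread: no mm */

	struct toy_mm *oldmm = prev.active_mm;

	if (!next.mm) {			/* borrow instead of switching */
		next.active_mm = oldmm;
		oldmm->mm_count++;
	}

	printf("kernel thread borrows mm of '%s' (refs=%d)\n",
	       next.active_mm->owner, next.active_mm->mm_count);
	return 0;
}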
1990/* 1788/*
@@ -2057,17 +1855,65 @@ unsigned long nr_active(void)
2057 return running + uninterruptible; 1855 return running + uninterruptible;
2058} 1856}
2059 1857
2060#ifdef CONFIG_SMP
2061
2062/* 1858/*
2063 * Is this task likely cache-hot: 1859 * Update rq->cpu_load[] statistics. This function is usually called every
1860 * scheduler tick (TICK_NSEC).
2064 */ 1861 */
2065static inline int 1862static void update_cpu_load(struct rq *this_rq)
2066task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2067{ 1863{
2068 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1864 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1865 unsigned long total_load = this_rq->ls.load.weight;
1866 unsigned long this_load = total_load;
1867 struct load_stat *ls = &this_rq->ls;
1868 u64 now = __rq_clock(this_rq);
1869 int i, scale;
1870
1871 this_rq->nr_load_updates++;
1872 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1873 goto do_avg;
1874
1875 /* Update delta_fair/delta_exec fields first */
1876 update_curr_load(this_rq, now);
1877
1878 fair_delta64 = ls->delta_fair + 1;
1879 ls->delta_fair = 0;
1880
1881 exec_delta64 = ls->delta_exec + 1;
1882 ls->delta_exec = 0;
1883
1884 sample_interval64 = now - ls->load_update_last;
1885 ls->load_update_last = now;
1886
1887 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1888 sample_interval64 = TICK_NSEC;
1889
1890 if (exec_delta64 > sample_interval64)
1891 exec_delta64 = sample_interval64;
1892
1893 idle_delta64 = sample_interval64 - exec_delta64;
1894
1895 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1896 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1897
1898 this_load = (unsigned long)tmp64;
1899
1900do_avg:
1901
1902 /* Update our load: */
1903 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1904 unsigned long old_load, new_load;
1905
1906 /* scale is effectively 1 << i now, and >> i divides by scale */
1907
1908 old_load = this_rq->cpu_load[i];
1909 new_load = this_load;
1910
1911 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1912 }
2069} 1913}
2070 1914
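[Editor's note] Worked userspace example of the cpu_load[] decay rule in update_cpu_load() above, cpu_load[i] = (old * (scale - 1) + new) >> i with scale = 1 << i: higher indices follow the sampled load more slowly. CPU_LOAD_IDX_MAX = 5 and the sample values are assumptions for the demo; the precise-load path using div64_64() is left out.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5	/* assumed to match the kernel's array size */

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0, 0, 0, 0, 0 };
	unsigned long samples[] = { 1024, 1024, 0, 0, 2048 };
	int i, t, scale;

	for (t = 0; t < 5; t++) {
		for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = samples[t];

			/* scale is 1 << i, so >> i divides by scale */
			cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
		}
		printf("tick %d: load[0]=%lu load[2]=%lu load[4]=%lu\n",
		       t, cpu_load[0], cpu_load[2], cpu_load[4]);
	}
	return 0;
}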
1915#ifdef CONFIG_SMP
1916
2071/* 1917/*
2072 * double_rq_lock - safely lock two runqueues 1918 * double_rq_lock - safely lock two runqueues
2073 * 1919 *
@@ -2184,23 +2030,17 @@ void sched_exec(void)
2184 * pull_task - move a task from a remote runqueue to the local runqueue. 2030 * pull_task - move a task from a remote runqueue to the local runqueue.
2185 * Both runqueues must be locked. 2031 * Both runqueues must be locked.
2186 */ 2032 */
2187static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2033static void pull_task(struct rq *src_rq, struct task_struct *p,
2188 struct task_struct *p, struct rq *this_rq, 2034 struct rq *this_rq, int this_cpu)
2189 struct prio_array *this_array, int this_cpu)
2190{ 2035{
2191 dequeue_task(p, src_array); 2036 deactivate_task(src_rq, p, 0);
2192 dec_nr_running(p, src_rq);
2193 set_task_cpu(p, this_cpu); 2037 set_task_cpu(p, this_cpu);
2194 inc_nr_running(p, this_rq); 2038 activate_task(this_rq, p, 0);
2195 enqueue_task(p, this_array);
2196 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2197 + this_rq->most_recent_timestamp;
2198 /* 2039 /*
2199 * Note that idle threads have a prio of MAX_PRIO, for this test 2040 * Note that idle threads have a prio of MAX_PRIO, for this test
2200 * to be always true for them. 2041 * to be always true for them.
2201 */ 2042 */
2202 if (TASK_PREEMPTS_CURR(p, this_rq)) 2043 check_preempt_curr(this_rq, p);
2203 resched_task(this_rq->curr);
2204} 2044}
2205 2045
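[Editor's note] Toy model of what pull_task() above does: take the task off the source run list, retag its CPU, put it on the destination run list, then check whether it should preempt the destination's current task. The structs are simplified stand-ins, not the kernel's rq or task_struct.

#include <stdio.h>

struct toy_task { const char *comm; int cpu; int prio; };

struct toy_rq { struct toy_task *tasks[8]; int nr; int curr_prio; };

static void rq_del(struct toy_rq *rq, struct toy_task *t)
{
	int i;

	for (i = 0; i < rq->nr; i++) {
		if (rq->tasks[i] == t) {
			rq->tasks[i] = rq->tasks[--rq->nr];
			return;
		}
	}
}

static void rq_add(struct toy_rq *rq, struct toy_task *t)
{
	rq->tasks[rq->nr++] = t;
}

int main(void)
{
	struct toy_task p = { "mover", 0, 110 };
	struct toy_rq src = { { &p }, 1, 120 };
	struct toy_rq dst = { { NULL }, 0, 130 };

	rq_del(&src, &p);		/* "deactivate" on the busiest runqueue */
	p.cpu = 1;			/* what set_task_cpu() does, conceptually */
	rq_add(&dst, &p);		/* "activate" on this runqueue */

	if (p.prio < dst.curr_prio)	/* lower value means higher priority */
		printf("%s should preempt the destination's current task\n", p.comm);

	return 0;
}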
2206/* 2046/*
@@ -2208,7 +2048,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2208 */ 2048 */
2209static 2049static
2210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2050int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2211 struct sched_domain *sd, enum idle_type idle, 2051 struct sched_domain *sd, enum cpu_idle_type idle,
2212 int *all_pinned) 2052 int *all_pinned)
2213{ 2053{
2214 /* 2054 /*
@@ -2225,132 +2065,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2225 return 0; 2065 return 0;
2226 2066
2227 /* 2067 /*
2228 * Aggressive migration if: 2068 * Aggressive migration if too many balance attempts have failed:
2229 * 1) task is cache cold, or
2230 * 2) too many balance attempts have failed.
2231 */ 2069 */
2232 2070 if (sd->nr_balance_failed > sd->cache_nice_tries)
2233 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2234#ifdef CONFIG_SCHEDSTATS
2235 if (task_hot(p, rq->most_recent_timestamp, sd))
2236 schedstat_inc(sd, lb_hot_gained[idle]);
2237#endif
2238 return 1; 2071 return 1;
2239 }
2240 2072
2241 if (task_hot(p, rq->most_recent_timestamp, sd))
2242 return 0;
2243 return 1; 2073 return 1;
2244} 2074}
2245 2075
2246#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2076static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2247
2248/*
2249 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2250 * load from busiest to this_rq, as part of a balancing operation within
2251 * "domain". Returns the number of tasks moved.
2252 *
2253 * Called with both runqueues locked.
2254 */
2255static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2256 unsigned long max_nr_move, unsigned long max_load_move, 2077 unsigned long max_nr_move, unsigned long max_load_move,
2257 struct sched_domain *sd, enum idle_type idle, 2078 struct sched_domain *sd, enum cpu_idle_type idle,
2258 int *all_pinned) 2079 int *all_pinned, unsigned long *load_moved,
2080 int this_best_prio, int best_prio, int best_prio_seen,
2081 struct rq_iterator *iterator)
2259{ 2082{
2260 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2083 int pulled = 0, pinned = 0, skip_for_load;
2261 best_prio_seen, skip_for_load; 2084 struct task_struct *p;
2262 struct prio_array *array, *dst_array; 2085 long rem_load_move = max_load_move;
2263 struct list_head *head, *curr;
2264 struct task_struct *tmp;
2265 long rem_load_move;
2266 2086
2267 if (max_nr_move == 0 || max_load_move == 0) 2087 if (max_nr_move == 0 || max_load_move == 0)
2268 goto out; 2088 goto out;
2269 2089
2270 rem_load_move = max_load_move;
2271 pinned = 1; 2090 pinned = 1;
2272 this_best_prio = rq_best_prio(this_rq);
2273 best_prio = rq_best_prio(busiest);
2274 /*
2275 * Enable handling of the case where there is more than one task
2276 * with the best priority. If the current running task is one
2277 * of those with prio==best_prio we know it won't be moved
2278 * and therefore it's safe to override the skip (based on load) of
2279 * any task we find with that prio.
2280 */
2281 best_prio_seen = best_prio == busiest->curr->prio;
2282 2091
2283 /* 2092 /*
2284 * We first consider expired tasks. Those will likely not be 2093 * Start the load-balancing iterator:
2285 * executed in the near future, and they are most likely to
2286 * be cache-cold, thus switching CPUs has the least effect
2287 * on them.
2288 */ 2094 */
2289 if (busiest->expired->nr_active) { 2095 p = iterator->start(iterator->arg);
2290 array = busiest->expired; 2096next:
2291 dst_array = this_rq->expired; 2097 if (!p)
2292 } else {
2293 array = busiest->active;
2294 dst_array = this_rq->active;
2295 }
2296
2297new_array:
2298 /* Start searching at priority 0: */
2299 idx = 0;
2300skip_bitmap:
2301 if (!idx)
2302 idx = sched_find_first_bit(array->bitmap);
2303 else
2304 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2305 if (idx >= MAX_PRIO) {
2306 if (array == busiest->expired && busiest->active->nr_active) {
2307 array = busiest->active;
2308 dst_array = this_rq->active;
2309 goto new_array;
2310 }
2311 goto out; 2098 goto out;
2312 }
2313
2314 head = array->queue + idx;
2315 curr = head->prev;
2316skip_queue:
2317 tmp = list_entry(curr, struct task_struct, run_list);
2318
2319 curr = curr->prev;
2320
2321 /* 2099 /*
2322 * To help distribute high priority tasks across CPUs we don't 2100
2322 * To help distribute high priority tasks across CPUs we don't 2100
2323 * skip a task if it will be the highest priority task (i.e. smallest 2101 * skip a task if it will be the highest priority task (i.e. smallest
2324 * prio value) on its new queue regardless of its load weight 2102 * prio value) on its new queue regardless of its load weight
2325 */ 2103 */
2326 skip_for_load = tmp->load_weight > rem_load_move; 2104 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2327 if (skip_for_load && idx < this_best_prio) 2105 SCHED_LOAD_SCALE_FUZZ;
2328 skip_for_load = !best_prio_seen && idx == best_prio; 2106 if (skip_for_load && p->prio < this_best_prio)
2107 skip_for_load = !best_prio_seen && p->prio == best_prio;
2329 if (skip_for_load || 2108 if (skip_for_load ||
2330 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2109 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2331 2110
2332 best_prio_seen |= idx == best_prio; 2111 best_prio_seen |= p->prio == best_prio;
2333 if (curr != head) 2112 p = iterator->next(iterator->arg);
2334 goto skip_queue; 2113 goto next;
2335 idx++;
2336 goto skip_bitmap;
2337 } 2114 }
2338 2115
2339 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2116 pull_task(busiest, p, this_rq, this_cpu);
2340 pulled++; 2117 pulled++;
2341 rem_load_move -= tmp->load_weight; 2118 rem_load_move -= p->se.load.weight;
2342 2119
2343 /* 2120 /*
2344 * We only want to steal up to the prescribed number of tasks 2121 * We only want to steal up to the prescribed number of tasks
2345 * and the prescribed amount of weighted load. 2122 * and the prescribed amount of weighted load.
2346 */ 2123 */
2347 if (pulled < max_nr_move && rem_load_move > 0) { 2124 if (pulled < max_nr_move && rem_load_move > 0) {
2348 if (idx < this_best_prio) 2125 if (p->prio < this_best_prio)
2349 this_best_prio = idx; 2126 this_best_prio = p->prio;
2350 if (curr != head) 2127 p = iterator->next(iterator->arg);
2351 goto skip_queue; 2128 goto next;
2352 idx++;
2353 goto skip_bitmap;
2354 } 2129 }
2355out: 2130out:
2356 /* 2131 /*
@@ -2362,18 +2137,48 @@ out:
2362 2137
2363 if (all_pinned) 2138 if (all_pinned)
2364 *all_pinned = pinned; 2139 *all_pinned = pinned;
2140 *load_moved = max_load_move - rem_load_move;
2365 return pulled; 2141 return pulled;
2366} 2142}
2367 2143
2368/* 2144/*
2145 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2146 * load from busiest to this_rq, as part of a balancing operation within
2147 * "domain". Returns the number of tasks moved.
2148 *
2149 * Called with both runqueues locked.
2150 */
2151static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2152 unsigned long max_nr_move, unsigned long max_load_move,
2153 struct sched_domain *sd, enum cpu_idle_type idle,
2154 int *all_pinned)
2155{
2156 struct sched_class *class = sched_class_highest;
2157 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2158 long rem_load_move = max_load_move;
2159
2160 do {
2161 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2162 max_nr_move, (unsigned long)rem_load_move,
2163 sd, idle, all_pinned, &load_moved);
2164 total_nr_moved += nr_moved;
2165 max_nr_move -= nr_moved;
2166 rem_load_move -= load_moved;
2167 class = class->next;
2168 } while (class && max_nr_move && rem_load_move > 0);
2169
2170 return total_nr_moved;
2171}
2172
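[Editor's note] Minimal sketch of the pattern the new move_tasks() uses above: walk a linked list of scheduling classes, highest-priority class first, and let each class move tasks until the quota runs out. The struct and callbacks here are invented stand-ins, not the kernel's struct sched_class.

#include <stdio.h>

struct fake_class {
	const char *name;
	struct fake_class *next;
	int (*move)(int max_nr);	/* returns how many tasks it moved */
};

static int move_rt(int max_nr)   { return max_nr >= 1 ? 1 : 0; }
static int move_fair(int max_nr) { return max_nr >= 2 ? 2 : max_nr; }

static struct fake_class fair_class = { "fair", NULL,        move_fair };
static struct fake_class rt_class   = { "rt",   &fair_class, move_rt   };

int main(void)
{
	struct fake_class *class = &rt_class;	/* highest-priority class first */
	int max_nr_move = 3, total = 0;

	do {
		int moved = class->move(max_nr_move);

		total += moved;
		max_nr_move -= moved;
		printf("%s class moved %d task(s)\n", class->name, moved);
		class = class->next;
	} while (class && max_nr_move);

	printf("total moved: %d\n", total);
	return 0;
}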
2173/*
2369 * find_busiest_group finds and returns the busiest CPU group within the 2174 * find_busiest_group finds and returns the busiest CPU group within the
2370 * domain. It calculates and returns the amount of weighted load which 2175 * domain. It calculates and returns the amount of weighted load which
2371 * should be moved to restore balance via the imbalance parameter. 2176 * should be moved to restore balance via the imbalance parameter.
2372 */ 2177 */
2373static struct sched_group * 2178static struct sched_group *
2374find_busiest_group(struct sched_domain *sd, int this_cpu, 2179find_busiest_group(struct sched_domain *sd, int this_cpu,
2375 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2180 unsigned long *imbalance, enum cpu_idle_type idle,
2376 cpumask_t *cpus, int *balance) 2181 int *sd_idle, cpumask_t *cpus, int *balance)
2377{ 2182{
2378 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2183 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2379 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2184 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2391,9 +2196,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2391 max_load = this_load = total_load = total_pwr = 0; 2196 max_load = this_load = total_load = total_pwr = 0;
2392 busiest_load_per_task = busiest_nr_running = 0; 2197 busiest_load_per_task = busiest_nr_running = 0;
2393 this_load_per_task = this_nr_running = 0; 2198 this_load_per_task = this_nr_running = 0;
2394 if (idle == NOT_IDLE) 2199 if (idle == CPU_NOT_IDLE)
2395 load_idx = sd->busy_idx; 2200 load_idx = sd->busy_idx;
2396 else if (idle == NEWLY_IDLE) 2201 else if (idle == CPU_NEWLY_IDLE)
2397 load_idx = sd->newidle_idx; 2202 load_idx = sd->newidle_idx;
2398 else 2203 else
2399 load_idx = sd->idle_idx; 2204 load_idx = sd->idle_idx;
@@ -2437,7 +2242,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2437 2242
2438 avg_load += load; 2243 avg_load += load;
2439 sum_nr_running += rq->nr_running; 2244 sum_nr_running += rq->nr_running;
2440 sum_weighted_load += rq->raw_weighted_load; 2245 sum_weighted_load += weighted_cpuload(i);
2441 } 2246 }
2442 2247
2443 /* 2248 /*
@@ -2477,8 +2282,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2477 * Busy processors will not participate in power savings 2282 * Busy processors will not participate in power savings
2478 * balance. 2283 * balance.
2479 */ 2284 */
2480 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2285 if (idle == CPU_NOT_IDLE ||
2481 goto group_next; 2286 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2287 goto group_next;
2482 2288
2483 /* 2289 /*
2484 * If the local group is idle or completely loaded 2290 * If the local group is idle or completely loaded
@@ -2488,42 +2294,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2488 !this_nr_running)) 2294 !this_nr_running))
2489 power_savings_balance = 0; 2295 power_savings_balance = 0;
2490 2296
2491 /* 2297 /*
2492 * If a group is already running at full capacity or idle, 2298 * If a group is already running at full capacity or idle,
2493 * don't include that group in power savings calculations 2299 * don't include that group in power savings calculations
2494 */ 2300 */
2495 if (!power_savings_balance || sum_nr_running >= group_capacity 2301 if (!power_savings_balance || sum_nr_running >= group_capacity
2496 || !sum_nr_running) 2302 || !sum_nr_running)
2497 goto group_next; 2303 goto group_next;
2498 2304
2499 /* 2305 /*
2500 * Calculate the group which has the least non-idle load. 2306 * Calculate the group which has the least non-idle load.
2501 * This is the group from where we need to pick up the load 2307 * This is the group from where we need to pick up the load
2502 * for saving power 2308 * for saving power
2503 */ 2309 */
2504 if ((sum_nr_running < min_nr_running) || 2310 if ((sum_nr_running < min_nr_running) ||
2505 (sum_nr_running == min_nr_running && 2311 (sum_nr_running == min_nr_running &&
2506 first_cpu(group->cpumask) < 2312 first_cpu(group->cpumask) <
2507 first_cpu(group_min->cpumask))) { 2313 first_cpu(group_min->cpumask))) {
2508 group_min = group; 2314 group_min = group;
2509 min_nr_running = sum_nr_running; 2315 min_nr_running = sum_nr_running;
2510 min_load_per_task = sum_weighted_load / 2316 min_load_per_task = sum_weighted_load /
2511 sum_nr_running; 2317 sum_nr_running;
2512 } 2318 }
2513 2319
2514 /* 2320 /*
2515 * Calculate the group which is almost near its 2321 * Calculate the group which is almost near its
2516 * capacity but still has some space to pick up some load 2322 * capacity but still has some space to pick up some load
2517 * from other group and save more power 2323 * from other group and save more power
2518 */ 2324 */
2519 if (sum_nr_running <= group_capacity - 1) { 2325 if (sum_nr_running <= group_capacity - 1) {
2520 if (sum_nr_running > leader_nr_running || 2326 if (sum_nr_running > leader_nr_running ||
2521 (sum_nr_running == leader_nr_running && 2327 (sum_nr_running == leader_nr_running &&
2522 first_cpu(group->cpumask) > 2328 first_cpu(group->cpumask) >
2523 first_cpu(group_leader->cpumask))) { 2329 first_cpu(group_leader->cpumask))) {
2524 group_leader = group; 2330 group_leader = group;
2525 leader_nr_running = sum_nr_running; 2331 leader_nr_running = sum_nr_running;
2526 } 2332 }
2527 } 2333 }
2528group_next: 2334group_next:
2529#endif 2335#endif
@@ -2578,7 +2384,7 @@ group_next:
2578 * a think about bumping its value to force at least one task to be 2384 * a think about bumping its value to force at least one task to be
2579 * moved 2385 * moved
2580 */ 2386 */
2581 if (*imbalance < busiest_load_per_task) { 2387 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2582 unsigned long tmp, pwr_now, pwr_move; 2388 unsigned long tmp, pwr_now, pwr_move;
2583 unsigned int imbn; 2389 unsigned int imbn;
2584 2390
@@ -2592,7 +2398,8 @@ small_imbalance:
2592 } else 2398 } else
2593 this_load_per_task = SCHED_LOAD_SCALE; 2399 this_load_per_task = SCHED_LOAD_SCALE;
2594 2400
2595 if (max_load - this_load >= busiest_load_per_task * imbn) { 2401 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2402 busiest_load_per_task * imbn) {
2596 *imbalance = busiest_load_per_task; 2403 *imbalance = busiest_load_per_task;
2597 return busiest; 2404 return busiest;
2598 } 2405 }
@@ -2639,7 +2446,7 @@ small_imbalance:
2639 2446
2640out_balanced: 2447out_balanced:
2641#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2448#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2642 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2449 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2643 goto ret; 2450 goto ret;
2644 2451
2645 if (this == group_leader && group_leader != group_min) { 2452 if (this == group_leader && group_leader != group_min) {
@@ -2656,7 +2463,7 @@ ret:
2656 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2463 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2657 */ 2464 */
2658static struct rq * 2465static struct rq *
2659find_busiest_queue(struct sched_group *group, enum idle_type idle, 2466find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2660 unsigned long imbalance, cpumask_t *cpus) 2467 unsigned long imbalance, cpumask_t *cpus)
2661{ 2468{
2662 struct rq *busiest = NULL, *rq; 2469 struct rq *busiest = NULL, *rq;
@@ -2664,17 +2471,19 @@ find_busiest_queue(struct sched_group *group, enum idle_type idle,
2664 int i; 2471 int i;
2665 2472
2666 for_each_cpu_mask(i, group->cpumask) { 2473 for_each_cpu_mask(i, group->cpumask) {
2474 unsigned long wl;
2667 2475
2668 if (!cpu_isset(i, *cpus)) 2476 if (!cpu_isset(i, *cpus))
2669 continue; 2477 continue;
2670 2478
2671 rq = cpu_rq(i); 2479 rq = cpu_rq(i);
2480 wl = weighted_cpuload(i);
2672 2481
2673 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2482 if (rq->nr_running == 1 && wl > imbalance)
2674 continue; 2483 continue;
2675 2484
2676 if (rq->raw_weighted_load > max_load) { 2485 if (wl > max_load) {
2677 max_load = rq->raw_weighted_load; 2486 max_load = wl;
2678 busiest = rq; 2487 busiest = rq;
2679 } 2488 }
2680 } 2489 }
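[Editor's note] Userspace sketch of the selection loop in find_busiest_queue() above: CPUs whose single runnable task alone exceeds the imbalance are skipped (nothing movable there), and the largest weighted load among the rest wins. Numbers are invented.

#include <stdio.h>

int main(void)
{
	unsigned long wl[]         = { 3072, 1024, 4096, 2048 };	/* weighted load */
	unsigned int  nr_running[] = {    3,    1,    1,    2 };
	unsigned long imbalance = 1536, max_load = 0;
	int i, busiest = -1;

	for (i = 0; i < 4; i++) {
		if (nr_running[i] == 1 && wl[i] > imbalance)
			continue;	/* lone heavy task: cannot be balanced away */
		if (wl[i] > max_load) {
			max_load = wl[i];
			busiest = i;
		}
	}

	printf("busiest cpu: %d (load %lu)\n", busiest, max_load);
	return 0;
}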
@@ -2698,7 +2507,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2698 * tasks if there is an imbalance. 2507 * tasks if there is an imbalance.
2699 */ 2508 */
2700static int load_balance(int this_cpu, struct rq *this_rq, 2509static int load_balance(int this_cpu, struct rq *this_rq,
2701 struct sched_domain *sd, enum idle_type idle, 2510 struct sched_domain *sd, enum cpu_idle_type idle,
2702 int *balance) 2511 int *balance)
2703{ 2512{
2704 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2513 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
@@ -2711,10 +2520,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2711 /* 2520 /*
2712 * When power savings policy is enabled for the parent domain, idle 2521 * When power savings policy is enabled for the parent domain, idle
2713 * sibling can pick up load irrespective of busy siblings. In this case, 2522 * sibling can pick up load irrespective of busy siblings. In this case,
2714 * let the state of idle sibling percolate up as IDLE, instead of 2523 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2715 * portraying it as NOT_IDLE. 2524 * portraying it as CPU_NOT_IDLE.
2716 */ 2525 */
2717 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2526 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2718 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2527 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2719 sd_idle = 1; 2528 sd_idle = 1;
2720 2529
@@ -2848,7 +2657,7 @@ out_one_pinned:
2848 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2657 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2849 * tasks if there is an imbalance. 2658 * tasks if there is an imbalance.
2850 * 2659 *
2851 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2660 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2852 * this_rq is locked. 2661 * this_rq is locked.
2853 */ 2662 */
2854static int 2663static int
@@ -2865,31 +2674,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2865 * When power savings policy is enabled for the parent domain, idle 2674 * When power savings policy is enabled for the parent domain, idle
2866 * sibling can pick up load irrespective of busy siblings. In this case, 2675 * sibling can pick up load irrespective of busy siblings. In this case,
2867 * let the state of idle sibling percolate up as IDLE, instead of 2676 * let the state of idle sibling percolate up as IDLE, instead of
2868 * portraying it as NOT_IDLE. 2677 * portraying it as CPU_NOT_IDLE.
2869 */ 2678 */
2870 if (sd->flags & SD_SHARE_CPUPOWER && 2679 if (sd->flags & SD_SHARE_CPUPOWER &&
2871 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2872 sd_idle = 1; 2681 sd_idle = 1;
2873 2682
2874 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2683 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2875redo: 2684redo:
2876 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2685 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2877 &sd_idle, &cpus, NULL); 2686 &sd_idle, &cpus, NULL);
2878 if (!group) { 2687 if (!group) {
2879 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2688 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2880 goto out_balanced; 2689 goto out_balanced;
2881 } 2690 }
2882 2691
2883 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2692 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2884 &cpus); 2693 &cpus);
2885 if (!busiest) { 2694 if (!busiest) {
2886 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2695 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2887 goto out_balanced; 2696 goto out_balanced;
2888 } 2697 }
2889 2698
2890 BUG_ON(busiest == this_rq); 2699 BUG_ON(busiest == this_rq);
2891 2700
2892 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2701 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2893 2702
2894 nr_moved = 0; 2703 nr_moved = 0;
2895 if (busiest->nr_running > 1) { 2704 if (busiest->nr_running > 1) {
@@ -2897,7 +2706,7 @@ redo:
2897 double_lock_balance(this_rq, busiest); 2706 double_lock_balance(this_rq, busiest);
2898 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2707 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2899 minus_1_or_zero(busiest->nr_running), 2708 minus_1_or_zero(busiest->nr_running),
2900 imbalance, sd, NEWLY_IDLE, NULL); 2709 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2901 spin_unlock(&busiest->lock); 2710 spin_unlock(&busiest->lock);
2902 2711
2903 if (!nr_moved) { 2712 if (!nr_moved) {
@@ -2908,7 +2717,7 @@ redo:
2908 } 2717 }
2909 2718
2910 if (!nr_moved) { 2719 if (!nr_moved) {
2911 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2720 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2912 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2913 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2914 return -1; 2723 return -1;
@@ -2918,7 +2727,7 @@ redo:
2918 return nr_moved; 2727 return nr_moved;
2919 2728
2920out_balanced: 2729out_balanced:
2921 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2730 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2922 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2923 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2924 return -1; 2733 return -1;
@@ -2934,8 +2743,8 @@ out_balanced:
2934static void idle_balance(int this_cpu, struct rq *this_rq) 2743static void idle_balance(int this_cpu, struct rq *this_rq)
2935{ 2744{
2936 struct sched_domain *sd; 2745 struct sched_domain *sd;
2937 int pulled_task = 0; 2746 int pulled_task = -1;
2938 unsigned long next_balance = jiffies + 60 * HZ; 2747 unsigned long next_balance = jiffies + HZ;
2939 2748
2940 for_each_domain(this_cpu, sd) { 2749 for_each_domain(this_cpu, sd) {
2941 unsigned long interval; 2750 unsigned long interval;
@@ -2954,12 +2763,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
2954 if (pulled_task) 2763 if (pulled_task)
2955 break; 2764 break;
2956 } 2765 }
2957 if (!pulled_task) 2766 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2958 /* 2767 /*
2959 * We are going idle. next_balance may be set based on 2768 * We are going idle. next_balance may be set based on
2960 * a busy processor. So reset next_balance. 2769 * a busy processor. So reset next_balance.
2961 */ 2770 */
2962 this_rq->next_balance = next_balance; 2771 this_rq->next_balance = next_balance;
2772 }
2963} 2773}
2964 2774
2965/* 2775/*
@@ -3003,7 +2813,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3003 schedstat_inc(sd, alb_cnt); 2813 schedstat_inc(sd, alb_cnt);
3004 2814
3005 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2815 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
3006 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2816 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
3007 NULL)) 2817 NULL))
3008 schedstat_inc(sd, alb_pushed); 2818 schedstat_inc(sd, alb_pushed);
3009 else 2819 else
@@ -3012,32 +2822,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3012 spin_unlock(&target_rq->lock); 2822 spin_unlock(&target_rq->lock);
3013} 2823}
3014 2824
3015static void update_load(struct rq *this_rq)
3016{
3017 unsigned long this_load;
3018 unsigned int i, scale;
3019
3020 this_load = this_rq->raw_weighted_load;
3021
3022 /* Update our load: */
3023 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
3024 unsigned long old_load, new_load;
3025
3026 /* scale is effectively 1 << i now, and >> i divides by scale */
3027
3028 old_load = this_rq->cpu_load[i];
3029 new_load = this_load;
3030 /*
3031 * Round up the averaging division if load is increasing. This
3032 * prevents us from getting stuck on 9 if the load is 10, for
3033 * example.
3034 */
3035 if (new_load > old_load)
3036 new_load += scale-1;
3037 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3038 }
3039}
3040
3041#ifdef CONFIG_NO_HZ 2825#ifdef CONFIG_NO_HZ
3042static struct { 2826static struct {
3043 atomic_t load_balancer; 2827 atomic_t load_balancer;
@@ -3120,7 +2904,7 @@ static DEFINE_SPINLOCK(balancing);
3120 * 2904 *
3121 * Balancing parameters are set up in arch_init_sched_domains. 2905 * Balancing parameters are set up in arch_init_sched_domains.
3122 */ 2906 */
3123static inline void rebalance_domains(int cpu, enum idle_type idle) 2907static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3124{ 2908{
3125 int balance = 1; 2909 int balance = 1;
3126 struct rq *rq = cpu_rq(cpu); 2910 struct rq *rq = cpu_rq(cpu);
@@ -3134,13 +2918,16 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3134 continue; 2918 continue;
3135 2919
3136 interval = sd->balance_interval; 2920 interval = sd->balance_interval;
3137 if (idle != SCHED_IDLE) 2921 if (idle != CPU_IDLE)
3138 interval *= sd->busy_factor; 2922 interval *= sd->busy_factor;
3139 2923
3140 /* scale ms to jiffies */ 2924 /* scale ms to jiffies */
3141 interval = msecs_to_jiffies(interval); 2925 interval = msecs_to_jiffies(interval);
3142 if (unlikely(!interval)) 2926 if (unlikely(!interval))
3143 interval = 1; 2927 interval = 1;
2928 if (interval > HZ*NR_CPUS/10)
2929 interval = HZ*NR_CPUS/10;
2930
3144 2931
3145 if (sd->flags & SD_SERIALIZE) { 2932 if (sd->flags & SD_SERIALIZE) {
3146 if (!spin_trylock(&balancing)) 2933 if (!spin_trylock(&balancing))
@@ -3154,7 +2941,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3154 * longer idle, or one of our SMT siblings is 2941 * longer idle, or one of our SMT siblings is
3155 * not idle. 2942 * not idle.
3156 */ 2943 */
3157 idle = NOT_IDLE; 2944 idle = CPU_NOT_IDLE;
3158 } 2945 }
3159 sd->last_balance = jiffies; 2946 sd->last_balance = jiffies;
3160 } 2947 }
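[Editor's note] Rough userspace model of the interval handling in the hunk above: the per-domain balance interval (in ms) is stretched by busy_factor when the CPU is not idle, converted to jiffies, floored at 1 and, per the new clamp, capped at HZ*NR_CPUS/10. HZ, NR_CPUS and the tunable values below are examples only.

#include <stdio.h>

#define HZ      250
#define NR_CPUS 4

int main(void)
{
	unsigned long interval_ms = 64;		/* sd->balance_interval */
	unsigned long busy_factor = 32;		/* sd->busy_factor */
	int cpu_is_idle = 0;

	unsigned long interval = interval_ms;

	if (!cpu_is_idle)
		interval *= busy_factor;

	interval = interval * HZ / 1000;	/* roughly msecs_to_jiffies() */
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)	/* the newly added upper bound */
		interval = HZ * NR_CPUS / 10;

	printf("rebalance every %lu jiffies\n", interval);
	return 0;
}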
@@ -3182,11 +2969,12 @@ out:
3182 */ 2969 */
3183static void run_rebalance_domains(struct softirq_action *h) 2970static void run_rebalance_domains(struct softirq_action *h)
3184{ 2971{
3185 int local_cpu = smp_processor_id(); 2972 int this_cpu = smp_processor_id();
3186 struct rq *local_rq = cpu_rq(local_cpu); 2973 struct rq *this_rq = cpu_rq(this_cpu);
3187 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; 2974 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2975 CPU_IDLE : CPU_NOT_IDLE;
3188 2976
3189 rebalance_domains(local_cpu, idle); 2977 rebalance_domains(this_cpu, idle);
3190 2978
3191#ifdef CONFIG_NO_HZ 2979#ifdef CONFIG_NO_HZ
3192 /* 2980 /*
@@ -3194,13 +2982,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3194 * balancing on behalf of the other idle cpus whose ticks are 2982 * balancing on behalf of the other idle cpus whose ticks are
3195 * stopped. 2983 * stopped.
3196 */ 2984 */
3197 if (local_rq->idle_at_tick && 2985 if (this_rq->idle_at_tick &&
3198 atomic_read(&nohz.load_balancer) == local_cpu) { 2986 atomic_read(&nohz.load_balancer) == this_cpu) {
3199 cpumask_t cpus = nohz.cpu_mask; 2987 cpumask_t cpus = nohz.cpu_mask;
3200 struct rq *rq; 2988 struct rq *rq;
3201 int balance_cpu; 2989 int balance_cpu;
3202 2990
3203 cpu_clear(local_cpu, cpus); 2991 cpu_clear(this_cpu, cpus);
3204 for_each_cpu_mask(balance_cpu, cpus) { 2992 for_each_cpu_mask(balance_cpu, cpus) {
3205 /* 2993 /*
3206 * If this cpu gets work to do, stop the load balancing 2994 * If this cpu gets work to do, stop the load balancing
@@ -3213,8 +3001,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3213 rebalance_domains(balance_cpu, SCHED_IDLE); 3001 rebalance_domains(balance_cpu, SCHED_IDLE);
3214 3002
3215 rq = cpu_rq(balance_cpu); 3003 rq = cpu_rq(balance_cpu);
3216 if (time_after(local_rq->next_balance, rq->next_balance)) 3004 if (time_after(this_rq->next_balance, rq->next_balance))
3217 local_rq->next_balance = rq->next_balance; 3005 this_rq->next_balance = rq->next_balance;
3218 } 3006 }
3219 } 3007 }
3220#endif 3008#endif
@@ -3227,9 +3015,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3227 * idle load balancing owner or decide to stop the periodic load balancing, 3015 * idle load balancing owner or decide to stop the periodic load balancing,
3228 * if the whole system is idle. 3016 * if the whole system is idle.
3229 */ 3017 */
3230static inline void trigger_load_balance(int cpu) 3018static inline void trigger_load_balance(struct rq *rq, int cpu)
3231{ 3019{
3232 struct rq *rq = cpu_rq(cpu);
3233#ifdef CONFIG_NO_HZ 3020#ifdef CONFIG_NO_HZ
3234 /* 3021 /*
3235 * If we were in the nohz mode recently and busy at the current 3022 * If we were in the nohz mode recently and busy at the current
@@ -3281,13 +3068,29 @@ static inline void trigger_load_balance(int cpu)
3281 if (time_after_eq(jiffies, rq->next_balance)) 3068 if (time_after_eq(jiffies, rq->next_balance))
3282 raise_softirq(SCHED_SOFTIRQ); 3069 raise_softirq(SCHED_SOFTIRQ);
3283} 3070}
3284#else 3071
3072#else /* CONFIG_SMP */
3073
3285/* 3074/*
3286 * on UP we do not need to balance between CPUs: 3075 * on UP we do not need to balance between CPUs:
3287 */ 3076 */
3288static inline void idle_balance(int cpu, struct rq *rq) 3077static inline void idle_balance(int cpu, struct rq *rq)
3289{ 3078{
3290} 3079}
3080
3081/* Avoid "used but not defined" warning on UP */
3082static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 unsigned long max_nr_move, unsigned long max_load_move,
3084 struct sched_domain *sd, enum cpu_idle_type idle,
3085 int *all_pinned, unsigned long *load_moved,
3086 int this_best_prio, int best_prio, int best_prio_seen,
3087 struct rq_iterator *iterator)
3088{
3089 *load_moved = 0;
3090
3091 return 0;
3092}
3093
3291#endif 3094#endif
3292 3095
3293DEFINE_PER_CPU(struct kernel_stat, kstat); 3096DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3295,54 +3098,28 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
3295EXPORT_PER_CPU_SYMBOL(kstat); 3098EXPORT_PER_CPU_SYMBOL(kstat);
3296 3099
3297/* 3100/*
3298 * This is called on clock ticks and on context switches. 3101 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3299 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3102 * that have not yet been banked in case the task is currently running.
3300 */
3301static inline void
3302update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3303{
3304 p->sched_time += now - p->last_ran;
3305 p->last_ran = rq->most_recent_timestamp = now;
3306}
3307
3308/*
3309 * Return current->sched_time plus any more ns on the sched_clock
3310 * that have not yet been banked.
3311 */ 3103 */
3312unsigned long long current_sched_time(const struct task_struct *p) 3104unsigned long long task_sched_runtime(struct task_struct *p)
3313{ 3105{
3314 unsigned long long ns;
3315 unsigned long flags; 3106 unsigned long flags;
3107 u64 ns, delta_exec;
3108 struct rq *rq;
3316 3109
3317 local_irq_save(flags); 3110 rq = task_rq_lock(p, &flags);
3318 ns = p->sched_time + sched_clock() - p->last_ran; 3111 ns = p->se.sum_exec_runtime;
3319 local_irq_restore(flags); 3112 if (rq->curr == p) {
3113 delta_exec = rq_clock(rq) - p->se.exec_start;
3114 if ((s64)delta_exec > 0)
3115 ns += delta_exec;
3116 }
3117 task_rq_unlock(rq, &flags);
3320 3118
3321 return ns; 3119 return ns;
3322} 3120}
3323 3121
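[Editor's note] Sketch of the arithmetic behind task_sched_runtime() above: the banked sum_exec_runtime plus, if the task is currently on the CPU, the not-yet-banked slice since exec_start, with a guard against a negative delta. Plain variables stand in for rq and task state; this is not kernel code.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sum_exec_runtime = 5000000;	/* ns already accounted */
	uint64_t exec_start       = 9000000;	/* clock when it last went on-CPU */
	uint64_t rq_clock_now     = 9300000;	/* current runqueue clock */
	int currently_running     = 1;

	uint64_t ns = sum_exec_runtime;

	if (currently_running) {
		int64_t delta_exec = (int64_t)(rq_clock_now - exec_start);

		if (delta_exec > 0)		/* guard against clock skew */
			ns += (uint64_t)delta_exec;
	}

	printf("total runtime so far: %llu ns\n", (unsigned long long)ns);
	return 0;
}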
3324/* 3122/*
3325 * We place interactive tasks back into the active array, if possible.
3326 *
3327 * To guarantee that this does not starve expired tasks we ignore the
3328 * interactivity of a task if the first expired task had to wait more
3329 * than a 'reasonable' amount of time. This deadline timeout is
3330 * load-dependent, as the frequency of array switched decreases with
3331 * increasing number of running tasks. We also ignore the interactivity
3332 * if a better static_prio task has expired:
3333 */
3334static inline int expired_starving(struct rq *rq)
3335{
3336 if (rq->curr->static_prio > rq->best_expired_prio)
3337 return 1;
3338 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3339 return 0;
3340 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3341 return 1;
3342 return 0;
3343}
3344
3345/*
3346 * Account user cpu time to a process. 3123 * Account user cpu time to a process.
3347 * @p: the process that the cpu time gets accounted to 3124 * @p: the process that the cpu time gets accounted to
3348 * @hardirq_offset: the offset to subtract from hardirq_count() 3125 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3415,81 +3192,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3415 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3192 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3416} 3193}
3417 3194
3418static void task_running_tick(struct rq *rq, struct task_struct *p)
3419{
3420 if (p->array != rq->active) {
3421 /* Task has expired but was not scheduled yet */
3422 set_tsk_need_resched(p);
3423 return;
3424 }
3425 spin_lock(&rq->lock);
3426 /*
3427 * The task was running during this tick - update the
3428 * time slice counter. Note: we do not update a thread's
3429 * priority until it either goes to sleep or uses up its
3430 * timeslice. This makes it possible for interactive tasks
3431 * to use up their timeslices at their highest priority levels.
3432 */
3433 if (rt_task(p)) {
3434 /*
3435 * RR tasks need a special form of timeslice management.
3436 * FIFO tasks have no timeslices.
3437 */
3438 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3439 p->time_slice = task_timeslice(p);
3440 p->first_time_slice = 0;
3441 set_tsk_need_resched(p);
3442
3443 /* put it at the end of the queue: */
3444 requeue_task(p, rq->active);
3445 }
3446 goto out_unlock;
3447 }
3448 if (!--p->time_slice) {
3449 dequeue_task(p, rq->active);
3450 set_tsk_need_resched(p);
3451 p->prio = effective_prio(p);
3452 p->time_slice = task_timeslice(p);
3453 p->first_time_slice = 0;
3454
3455 if (!rq->expired_timestamp)
3456 rq->expired_timestamp = jiffies;
3457 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3458 enqueue_task(p, rq->expired);
3459 if (p->static_prio < rq->best_expired_prio)
3460 rq->best_expired_prio = p->static_prio;
3461 } else
3462 enqueue_task(p, rq->active);
3463 } else {
3464 /*
3465 * Prevent a too long timeslice allowing a task to monopolize
3466 * the CPU. We do this by splitting up the timeslice into
3467 * smaller pieces.
3468 *
3469 * Note: this does not mean the task's timeslices expire or
3470 * get lost in any way, they just might be preempted by
3471 * another task of equal priority. (one with higher
3472 * priority would have preempted this task already.) We
3473 * requeue this task to the end of the list on this priority
3474 * level, which is in essence a round-robin of tasks with
3475 * equal priority.
3476 *
3477 * This only applies to tasks in the interactive
3478 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3479 */
3480 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3481 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3482 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3483 (p->array == rq->active)) {
3484
3485 requeue_task(p, rq->active);
3486 set_tsk_need_resched(p);
3487 }
3488 }
3489out_unlock:
3490 spin_unlock(&rq->lock);
3491}
3492
3493/* 3195/*
3494 * This function gets called by the timer code, with HZ frequency. 3196 * This function gets called by the timer code, with HZ frequency.
3495 * We call it with interrupts disabled. 3197 * We call it with interrupts disabled.
@@ -3499,20 +3201,19 @@ out_unlock:
3499 */ 3201 */
3500void scheduler_tick(void) 3202void scheduler_tick(void)
3501{ 3203{
3502 unsigned long long now = sched_clock();
3503 struct task_struct *p = current;
3504 int cpu = smp_processor_id(); 3204 int cpu = smp_processor_id();
3505 int idle_at_tick = idle_cpu(cpu);
3506 struct rq *rq = cpu_rq(cpu); 3205 struct rq *rq = cpu_rq(cpu);
3206 struct task_struct *curr = rq->curr;
3507 3207
3508 update_cpu_clock(p, rq, now); 3208 spin_lock(&rq->lock);
3209 if (curr != rq->idle) /* FIXME: needed? */
3210 curr->sched_class->task_tick(rq, curr);
3211 update_cpu_load(rq);
3212 spin_unlock(&rq->lock);
3509 3213
3510 if (!idle_at_tick)
3511 task_running_tick(rq, p);
3512#ifdef CONFIG_SMP 3214#ifdef CONFIG_SMP
3513 update_load(rq); 3215 rq->idle_at_tick = idle_cpu(cpu);
3514 rq->idle_at_tick = idle_at_tick; 3216 trigger_load_balance(rq, cpu);
3515 trigger_load_balance(cpu);
3516#endif 3217#endif
3517} 3218}
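The rewritten scheduler_tick() above no longer knows anything about timeslices or arrays; it simply dispatches to the current task's scheduling class. The fragment below is a self-contained sketch of that delegation pattern using invented demo types -- it is not the kernel's struct sched_class (which this patch defines elsewhere), just an illustration of why the core function shrank to a lock, one indirect call and a load update.

#include <stdio.h>

struct demo_task;

/* Illustrative stand-in for a per-policy class with a tick hook. */
struct demo_sched_class {
	const char *name;
	void (*task_tick)(struct demo_task *t);
};

struct demo_task {
	const struct demo_sched_class *sched_class;
	const char *comm;
};

static void fair_task_tick(struct demo_task *t)
{
	printf("%s: charge exec time, maybe set need_resched\n", t->comm);
}

static const struct demo_sched_class demo_fair_class = {
	.name = "fair", .task_tick = fair_task_tick,
};

static void demo_scheduler_tick(struct demo_task *curr)
{
	/* The core stays policy-agnostic; the class does the work. */
	curr->sched_class->task_tick(curr);
}

int main(void)
{
	struct demo_task t = { .sched_class = &demo_fair_class, .comm = "cc1" };

	demo_scheduler_tick(&t);
	return 0;
}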
3518 3219
@@ -3554,170 +3255,129 @@ EXPORT_SYMBOL(sub_preempt_count);
3554 3255
3555#endif 3256#endif
3556 3257
3557static inline int interactive_sleep(enum sleep_type sleep_type) 3258/*
3259 * Print scheduling while atomic bug:
3260 */
3261static noinline void __schedule_bug(struct task_struct *prev)
3558{ 3262{
3559 return (sleep_type == SLEEP_INTERACTIVE || 3263 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3560 sleep_type == SLEEP_INTERRUPTED); 3264 prev->comm, preempt_count(), prev->pid);
3265 debug_show_held_locks(prev);
3266 if (irqs_disabled())
3267 print_irqtrace_events(prev);
3268 dump_stack();
3561} 3269}
3562 3270
3563/* 3271/*
3564 * schedule() is the main scheduler function. 3272 * Various schedule()-time debugging checks and statistics:
3565 */ 3273 */
3566asmlinkage void __sched schedule(void) 3274static inline void schedule_debug(struct task_struct *prev)
3567{ 3275{
3568 struct task_struct *prev, *next;
3569 struct prio_array *array;
3570 struct list_head *queue;
3571 unsigned long long now;
3572 unsigned long run_time;
3573 int cpu, idx, new_prio;
3574 long *switch_count;
3575 struct rq *rq;
3576
3577 /* 3276 /*
3578 * Test if we are atomic. Since do_exit() needs to call into 3277 * Test if we are atomic. Since do_exit() needs to call into
3579 * schedule() atomically, we ignore that path for now. 3278 * schedule() atomically, we ignore that path for now.
3580 * Otherwise, whine if we are scheduling when we should not be. 3279 * Otherwise, whine if we are scheduling when we should not be.
3581 */ 3280 */
3582 if (unlikely(in_atomic() && !current->exit_state)) { 3281 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3583 printk(KERN_ERR "BUG: scheduling while atomic: " 3282 __schedule_bug(prev);
3584 "%s/0x%08x/%d\n",
3585 current->comm, preempt_count(), current->pid);
3586 debug_show_held_locks(current);
3587 if (irqs_disabled())
3588 print_irqtrace_events(current);
3589 dump_stack();
3590 }
3591 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3592 3283
3593need_resched: 3284 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3594 preempt_disable();
3595 prev = current;
3596 release_kernel_lock(prev);
3597need_resched_nonpreemptible:
3598 rq = this_rq();
3599 3285
3600 /* 3286 schedstat_inc(this_rq(), sched_cnt);
3601 * The idle thread is not allowed to schedule! 3287}
3602 * Remove this check after it has been exercised a bit.
3603 */
3604 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3605 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3606 dump_stack();
3607 }
3608 3288
3609 schedstat_inc(rq, sched_cnt); 3289/*
3610 now = sched_clock(); 3290 * Pick up the highest-prio task:
3611 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3291 */
3612 run_time = now - prev->timestamp; 3292static inline struct task_struct *
3613 if (unlikely((long long)(now - prev->timestamp) < 0)) 3293pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3614 run_time = 0; 3294{
3615 } else 3295 struct sched_class *class;
3616 run_time = NS_MAX_SLEEP_AVG; 3296 struct task_struct *p;
3617 3297
3618 /* 3298 /*
3619 * Tasks charged proportionately less run_time at high sleep_avg to 3299 * Optimization: we know that if all tasks are in
3620 * delay them losing their interactive status 3300 * the fair class we can call that function directly:
3621 */ 3301 */
3622 run_time /= (CURRENT_BONUS(prev) ? : 1); 3302 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3623 3303 p = fair_sched_class.pick_next_task(rq, now);
3624 spin_lock_irq(&rq->lock); 3304 if (likely(p))
3625 3305 return p;
3626 switch_count = &prev->nivcsw;
3627 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3628 switch_count = &prev->nvcsw;
3629 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3630 unlikely(signal_pending(prev))))
3631 prev->state = TASK_RUNNING;
3632 else {
3633 if (prev->state == TASK_UNINTERRUPTIBLE)
3634 rq->nr_uninterruptible++;
3635 deactivate_task(prev, rq);
3636 }
3637 }
3638
3639 cpu = smp_processor_id();
3640 if (unlikely(!rq->nr_running)) {
3641 idle_balance(cpu, rq);
3642 if (!rq->nr_running) {
3643 next = rq->idle;
3644 rq->expired_timestamp = 0;
3645 goto switch_tasks;
3646 }
3647 } 3306 }
3648 3307
3649 array = rq->active; 3308 class = sched_class_highest;
3650 if (unlikely(!array->nr_active)) { 3309 for ( ; ; ) {
3310 p = class->pick_next_task(rq, now);
3311 if (p)
3312 return p;
3651 /* 3313 /*
3652 * Switch the active and expired arrays. 3314 * Will never be NULL as the idle class always
3315 * returns a non-NULL p:
3653 */ 3316 */
3654 schedstat_inc(rq, sched_switch); 3317 class = class->next;
3655 rq->active = rq->expired;
3656 rq->expired = array;
3657 array = rq->active;
3658 rq->expired_timestamp = 0;
3659 rq->best_expired_prio = MAX_PRIO;
3660 } 3318 }
3319}
3320
3321/*
3322 * schedule() is the main scheduler function.
3323 */
3324asmlinkage void __sched schedule(void)
3325{
3326 struct task_struct *prev, *next;
3327 long *switch_count;
3328 struct rq *rq;
3329 u64 now;
3330 int cpu;
3661 3331
3662 idx = sched_find_first_bit(array->bitmap); 3332need_resched:
3663 queue = array->queue + idx; 3333 preempt_disable();
3664 next = list_entry(queue->next, struct task_struct, run_list); 3334 cpu = smp_processor_id();
3335 rq = cpu_rq(cpu);
3336 rcu_qsctr_inc(cpu);
3337 prev = rq->curr;
3338 switch_count = &prev->nivcsw;
3665 3339
3666 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3340 release_kernel_lock(prev);
3667 unsigned long long delta = now - next->timestamp; 3341need_resched_nonpreemptible:
3668 if (unlikely((long long)(now - next->timestamp) < 0))
3669 delta = 0;
3670 3342
3671 if (next->sleep_type == SLEEP_INTERACTIVE) 3343 schedule_debug(prev);
3672 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3673 3344
3674 array = next->array; 3345 spin_lock_irq(&rq->lock);
3675 new_prio = recalc_task_prio(next, next->timestamp + delta); 3346 clear_tsk_need_resched(prev);
3676 3347
3677 if (unlikely(next->prio != new_prio)) { 3348 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3678 dequeue_task(next, array); 3349 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3679 next->prio = new_prio; 3350 unlikely(signal_pending(prev)))) {
3680 enqueue_task(next, array); 3351 prev->state = TASK_RUNNING;
3352 } else {
3353 deactivate_task(rq, prev, 1);
3681 } 3354 }
3355 switch_count = &prev->nvcsw;
3682 } 3356 }
3683 next->sleep_type = SLEEP_NORMAL;
3684switch_tasks:
3685 if (next == rq->idle)
3686 schedstat_inc(rq, sched_goidle);
3687 prefetch(next);
3688 prefetch_stack(next);
3689 clear_tsk_need_resched(prev);
3690 rcu_qsctr_inc(task_cpu(prev));
3691 3357
3692 update_cpu_clock(prev, rq, now); 3358 if (unlikely(!rq->nr_running))
3359 idle_balance(cpu, rq);
3693 3360
3694 prev->sleep_avg -= run_time; 3361 now = __rq_clock(rq);
3695 if ((long)prev->sleep_avg <= 0) 3362 prev->sched_class->put_prev_task(rq, prev, now);
3696 prev->sleep_avg = 0; 3363 next = pick_next_task(rq, prev, now);
3697 prev->timestamp = prev->last_ran = now;
3698 3364
3699 sched_info_switch(prev, next); 3365 sched_info_switch(prev, next);
3366
3700 if (likely(prev != next)) { 3367 if (likely(prev != next)) {
3701 next->timestamp = next->last_ran = now;
3702 rq->nr_switches++; 3368 rq->nr_switches++;
3703 rq->curr = next; 3369 rq->curr = next;
3704 ++*switch_count; 3370 ++*switch_count;
3705 3371
3706 prepare_task_switch(rq, next); 3372 context_switch(rq, prev, next); /* unlocks the rq */
3707 prev = context_switch(rq, prev, next);
3708 barrier();
3709 /*
3710 * this_rq must be evaluated again because prev may have moved
3711 * CPUs since it called schedule(), thus the 'rq' on its stack
3712 * frame will be invalid.
3713 */
3714 finish_task_switch(this_rq(), prev);
3715 } else 3373 } else
3716 spin_unlock_irq(&rq->lock); 3374 spin_unlock_irq(&rq->lock);
3717 3375
3718 prev = current; 3376 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3719 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3377 cpu = smp_processor_id();
3378 rq = cpu_rq(cpu);
3720 goto need_resched_nonpreemptible; 3379 goto need_resched_nonpreemptible;
3380 }
3721 preempt_enable_no_resched(); 3381 preempt_enable_no_resched();
3722 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3382 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3723 goto need_resched; 3383 goto need_resched;
@@ -4045,74 +3705,85 @@ out:
4045} 3705}
4046EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3706EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4047 3707
4048 3708static inline void
4049#define SLEEP_ON_VAR \ 3709sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4050 unsigned long flags; \ 3710{
4051 wait_queue_t wait; \ 3711 spin_lock_irqsave(&q->lock, *flags);
4052 init_waitqueue_entry(&wait, current); 3712 __add_wait_queue(q, wait);
4053
4054#define SLEEP_ON_HEAD \
4055 spin_lock_irqsave(&q->lock,flags); \
4056 __add_wait_queue(q, &wait); \
4057 spin_unlock(&q->lock); 3713 spin_unlock(&q->lock);
3714}
4058 3715
4059#define SLEEP_ON_TAIL \ 3716static inline void
4060 spin_lock_irq(&q->lock); \ 3717sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4061 __remove_wait_queue(q, &wait); \ 3718{
4062 spin_unlock_irqrestore(&q->lock, flags); 3719 spin_lock_irq(&q->lock);
3720 __remove_wait_queue(q, wait);
3721 spin_unlock_irqrestore(&q->lock, *flags);
3722}
4063 3723
4064void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3724void __sched interruptible_sleep_on(wait_queue_head_t *q)
4065{ 3725{
4066 SLEEP_ON_VAR 3726 unsigned long flags;
3727 wait_queue_t wait;
3728
3729 init_waitqueue_entry(&wait, current);
4067 3730
4068 current->state = TASK_INTERRUPTIBLE; 3731 current->state = TASK_INTERRUPTIBLE;
4069 3732
4070 SLEEP_ON_HEAD 3733 sleep_on_head(q, &wait, &flags);
4071 schedule(); 3734 schedule();
4072 SLEEP_ON_TAIL 3735 sleep_on_tail(q, &wait, &flags);
4073} 3736}
4074EXPORT_SYMBOL(interruptible_sleep_on); 3737EXPORT_SYMBOL(interruptible_sleep_on);
4075 3738
4076long fastcall __sched 3739long __sched
4077interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3740interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4078{ 3741{
4079 SLEEP_ON_VAR 3742 unsigned long flags;
3743 wait_queue_t wait;
3744
3745 init_waitqueue_entry(&wait, current);
4080 3746
4081 current->state = TASK_INTERRUPTIBLE; 3747 current->state = TASK_INTERRUPTIBLE;
4082 3748
4083 SLEEP_ON_HEAD 3749 sleep_on_head(q, &wait, &flags);
4084 timeout = schedule_timeout(timeout); 3750 timeout = schedule_timeout(timeout);
4085 SLEEP_ON_TAIL 3751 sleep_on_tail(q, &wait, &flags);
4086 3752
4087 return timeout; 3753 return timeout;
4088} 3754}
4089EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3755EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4090 3756
4091void fastcall __sched sleep_on(wait_queue_head_t *q) 3757void __sched sleep_on(wait_queue_head_t *q)
4092{ 3758{
4093 SLEEP_ON_VAR 3759 unsigned long flags;
3760 wait_queue_t wait;
3761
3762 init_waitqueue_entry(&wait, current);
4094 3763
4095 current->state = TASK_UNINTERRUPTIBLE; 3764 current->state = TASK_UNINTERRUPTIBLE;
4096 3765
4097 SLEEP_ON_HEAD 3766 sleep_on_head(q, &wait, &flags);
4098 schedule(); 3767 schedule();
4099 SLEEP_ON_TAIL 3768 sleep_on_tail(q, &wait, &flags);
4100} 3769}
4101EXPORT_SYMBOL(sleep_on); 3770EXPORT_SYMBOL(sleep_on);
4102 3771
4103long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3772long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4104{ 3773{
4105 SLEEP_ON_VAR 3774 unsigned long flags;
3775 wait_queue_t wait;
3776
3777 init_waitqueue_entry(&wait, current);
4106 3778
4107 current->state = TASK_UNINTERRUPTIBLE; 3779 current->state = TASK_UNINTERRUPTIBLE;
4108 3780
4109 SLEEP_ON_HEAD 3781 sleep_on_head(q, &wait, &flags);
4110 timeout = schedule_timeout(timeout); 3782 timeout = schedule_timeout(timeout);
4111 SLEEP_ON_TAIL 3783 sleep_on_tail(q, &wait, &flags);
4112 3784
4113 return timeout; 3785 return timeout;
4114} 3786}
4115
4116EXPORT_SYMBOL(sleep_on_timeout); 3787EXPORT_SYMBOL(sleep_on_timeout);
4117 3788
4118#ifdef CONFIG_RT_MUTEXES 3789#ifdef CONFIG_RT_MUTEXES
@@ -4129,29 +3800,30 @@ EXPORT_SYMBOL(sleep_on_timeout);
4129 */ 3800 */
4130void rt_mutex_setprio(struct task_struct *p, int prio) 3801void rt_mutex_setprio(struct task_struct *p, int prio)
4131{ 3802{
4132 struct prio_array *array;
4133 unsigned long flags; 3803 unsigned long flags;
3804 int oldprio, on_rq;
4134 struct rq *rq; 3805 struct rq *rq;
4135 int oldprio; 3806 u64 now;
4136 3807
4137 BUG_ON(prio < 0 || prio > MAX_PRIO); 3808 BUG_ON(prio < 0 || prio > MAX_PRIO);
4138 3809
4139 rq = task_rq_lock(p, &flags); 3810 rq = task_rq_lock(p, &flags);
3811 now = rq_clock(rq);
4140 3812
4141 oldprio = p->prio; 3813 oldprio = p->prio;
4142 array = p->array; 3814 on_rq = p->se.on_rq;
4143 if (array) 3815 if (on_rq)
4144 dequeue_task(p, array); 3816 dequeue_task(rq, p, 0, now);
3817
3818 if (rt_prio(prio))
3819 p->sched_class = &rt_sched_class;
3820 else
3821 p->sched_class = &fair_sched_class;
3822
4145 p->prio = prio; 3823 p->prio = prio;
4146 3824
4147 if (array) { 3825 if (on_rq) {
4148 /* 3826 enqueue_task(rq, p, 0, now);
4149 * If changing to an RT priority then queue it
4150 * in the active array!
4151 */
4152 if (rt_task(p))
4153 array = rq->active;
4154 enqueue_task(p, array);
4155 /* 3827 /*
4156 * Reschedule if we are currently running on this runqueue and 3828 * Reschedule if we are currently running on this runqueue and
4157 * our priority decreased, or if we are not currently running on 3829 * our priority decreased, or if we are not currently running on
@@ -4160,8 +3832,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4160 if (task_running(rq, p)) { 3832 if (task_running(rq, p)) {
4161 if (p->prio > oldprio) 3833 if (p->prio > oldprio)
4162 resched_task(rq->curr); 3834 resched_task(rq->curr);
4163 } else if (TASK_PREEMPTS_CURR(p, rq)) 3835 } else {
4164 resched_task(rq->curr); 3836 check_preempt_curr(rq, p);
3837 }
4165 } 3838 }
4166 task_rq_unlock(rq, &flags); 3839 task_rq_unlock(rq, &flags);
4167} 3840}
@@ -4170,10 +3843,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4170 3843
4171void set_user_nice(struct task_struct *p, long nice) 3844void set_user_nice(struct task_struct *p, long nice)
4172{ 3845{
4173 struct prio_array *array; 3846 int old_prio, delta, on_rq;
4174 int old_prio, delta;
4175 unsigned long flags; 3847 unsigned long flags;
4176 struct rq *rq; 3848 struct rq *rq;
3849 u64 now;
4177 3850
4178 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3851 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4179 return; 3852 return;
@@ -4182,20 +3855,21 @@ void set_user_nice(struct task_struct *p, long nice)
4182 * the task might be in the middle of scheduling on another CPU. 3855 * the task might be in the middle of scheduling on another CPU.
4183 */ 3856 */
4184 rq = task_rq_lock(p, &flags); 3857 rq = task_rq_lock(p, &flags);
3858 now = rq_clock(rq);
4185 /* 3859 /*
4186 * The RT priorities are set via sched_setscheduler(), but we still 3860 * The RT priorities are set via sched_setscheduler(), but we still
4187 * allow the 'normal' nice value to be set - but as expected 3861 * allow the 'normal' nice value to be set - but as expected
4188 * it won't have any effect on scheduling until the task is 3862 * it won't have any effect on scheduling until the task is
4189 * not SCHED_NORMAL/SCHED_BATCH: 3863 * SCHED_FIFO/SCHED_RR:
4190 */ 3864 */
4191 if (has_rt_policy(p)) { 3865 if (task_has_rt_policy(p)) {
4192 p->static_prio = NICE_TO_PRIO(nice); 3866 p->static_prio = NICE_TO_PRIO(nice);
4193 goto out_unlock; 3867 goto out_unlock;
4194 } 3868 }
4195 array = p->array; 3869 on_rq = p->se.on_rq;
4196 if (array) { 3870 if (on_rq) {
4197 dequeue_task(p, array); 3871 dequeue_task(rq, p, 0, now);
4198 dec_raw_weighted_load(rq, p); 3872 dec_load(rq, p, now);
4199 } 3873 }
4200 3874
4201 p->static_prio = NICE_TO_PRIO(nice); 3875 p->static_prio = NICE_TO_PRIO(nice);
@@ -4204,9 +3878,9 @@ void set_user_nice(struct task_struct *p, long nice)
4204 p->prio = effective_prio(p); 3878 p->prio = effective_prio(p);
4205 delta = p->prio - old_prio; 3879 delta = p->prio - old_prio;
4206 3880
4207 if (array) { 3881 if (on_rq) {
4208 enqueue_task(p, array); 3882 enqueue_task(rq, p, 0, now);
4209 inc_raw_weighted_load(rq, p); 3883 inc_load(rq, p, now);
4210 /* 3884 /*
4211 * If the task increased its priority or is running and 3885 * If the task increased its priority or is running and
4212 * lowered its priority, then reschedule its CPU: 3886 * lowered its priority, then reschedule its CPU:
@@ -4326,20 +4000,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4326} 4000}
4327 4001
4328/* Actually do priority change: must hold rq lock. */ 4002/* Actually do priority change: must hold rq lock. */
4329static void __setscheduler(struct task_struct *p, int policy, int prio) 4003static void
4004__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4330{ 4005{
4331 BUG_ON(p->array); 4006 BUG_ON(p->se.on_rq);
4332 4007
4333 p->policy = policy; 4008 p->policy = policy;
4009 switch (p->policy) {
4010 case SCHED_NORMAL:
4011 case SCHED_BATCH:
4012 case SCHED_IDLE:
4013 p->sched_class = &fair_sched_class;
4014 break;
4015 case SCHED_FIFO:
4016 case SCHED_RR:
4017 p->sched_class = &rt_sched_class;
4018 break;
4019 }
4020
4334 p->rt_priority = prio; 4021 p->rt_priority = prio;
4335 p->normal_prio = normal_prio(p); 4022 p->normal_prio = normal_prio(p);
4336 /* we are holding p->pi_lock already */ 4023 /* we are holding p->pi_lock already */
4337 p->prio = rt_mutex_getprio(p); 4024 p->prio = rt_mutex_getprio(p);
4338 /*
4339 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4340 */
4341 if (policy == SCHED_BATCH)
4342 p->sleep_avg = 0;
4343 set_load_weight(p); 4025 set_load_weight(p);
4344} 4026}
4345 4027
@@ -4354,8 +4036,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4354int sched_setscheduler(struct task_struct *p, int policy, 4036int sched_setscheduler(struct task_struct *p, int policy,
4355 struct sched_param *param) 4037 struct sched_param *param)
4356{ 4038{
4357 int retval, oldprio, oldpolicy = -1; 4039 int retval, oldprio, oldpolicy = -1, on_rq;
4358 struct prio_array *array;
4359 unsigned long flags; 4040 unsigned long flags;
4360 struct rq *rq; 4041 struct rq *rq;
4361 4042
@@ -4366,27 +4047,27 @@ recheck:
4366 if (policy < 0) 4047 if (policy < 0)
4367 policy = oldpolicy = p->policy; 4048 policy = oldpolicy = p->policy;
4368 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4049 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4369 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4050 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4051 policy != SCHED_IDLE)
4370 return -EINVAL; 4052 return -EINVAL;
4371 /* 4053 /*
4372 * Valid priorities for SCHED_FIFO and SCHED_RR are 4054 * Valid priorities for SCHED_FIFO and SCHED_RR are
4373 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4055 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4374 * SCHED_BATCH is 0. 4056 * SCHED_BATCH and SCHED_IDLE is 0.
4375 */ 4057 */
4376 if (param->sched_priority < 0 || 4058 if (param->sched_priority < 0 ||
4377 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4059 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4378 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4060 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4379 return -EINVAL; 4061 return -EINVAL;
4380 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4062 if (rt_policy(policy) != (param->sched_priority != 0))
4381 return -EINVAL; 4063 return -EINVAL;
4382 4064
4383 /* 4065 /*
4384 * Allow unprivileged RT tasks to decrease priority: 4066 * Allow unprivileged RT tasks to decrease priority:
4385 */ 4067 */
4386 if (!capable(CAP_SYS_NICE)) { 4068 if (!capable(CAP_SYS_NICE)) {
4387 if (is_rt_policy(policy)) { 4069 if (rt_policy(policy)) {
4388 unsigned long rlim_rtprio; 4070 unsigned long rlim_rtprio;
4389 unsigned long flags;
4390 4071
4391 if (!lock_task_sighand(p, &flags)) 4072 if (!lock_task_sighand(p, &flags))
4392 return -ESRCH; 4073 return -ESRCH;
@@ -4402,6 +4083,12 @@ recheck:
4402 param->sched_priority > rlim_rtprio) 4083 param->sched_priority > rlim_rtprio)
4403 return -EPERM; 4084 return -EPERM;
4404 } 4085 }
4086 /*
4087 * Like positive nice levels, dont allow tasks to
4088 * move out of SCHED_IDLE either:
4089 */
4090 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4091 return -EPERM;
4405 4092
4406 /* can't change other user's priorities */ 4093 /* can't change other user's priorities */
4407 if ((current->euid != p->euid) && 4094 if ((current->euid != p->euid) &&
@@ -4429,13 +4116,13 @@ recheck:
4429 spin_unlock_irqrestore(&p->pi_lock, flags); 4116 spin_unlock_irqrestore(&p->pi_lock, flags);
4430 goto recheck; 4117 goto recheck;
4431 } 4118 }
4432 array = p->array; 4119 on_rq = p->se.on_rq;
4433 if (array) 4120 if (on_rq)
4434 deactivate_task(p, rq); 4121 deactivate_task(rq, p, 0);
4435 oldprio = p->prio; 4122 oldprio = p->prio;
4436 __setscheduler(p, policy, param->sched_priority); 4123 __setscheduler(rq, p, policy, param->sched_priority);
4437 if (array) { 4124 if (on_rq) {
4438 __activate_task(p, rq); 4125 activate_task(rq, p, 0);
4439 /* 4126 /*
4440 * Reschedule if we are currently running on this runqueue and 4127 * Reschedule if we are currently running on this runqueue and
4441 * our priority decreased, or if we are not currently running on 4128 * our priority decreased, or if we are not currently running on
@@ -4444,8 +4131,9 @@ recheck:
4444 if (task_running(rq, p)) { 4131 if (task_running(rq, p)) {
4445 if (p->prio > oldprio) 4132 if (p->prio > oldprio)
4446 resched_task(rq->curr); 4133 resched_task(rq->curr);
4447 } else if (TASK_PREEMPTS_CURR(p, rq)) 4134 } else {
4448 resched_task(rq->curr); 4135 check_preempt_curr(rq, p);
4136 }
4449 } 4137 }
4450 __task_rq_unlock(rq); 4138 __task_rq_unlock(rq);
4451 spin_unlock_irqrestore(&p->pi_lock, flags); 4139 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4717,41 +4405,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4717/** 4405/**
4718 * sys_sched_yield - yield the current processor to other threads. 4406 * sys_sched_yield - yield the current processor to other threads.
4719 * 4407 *
4720 * This function yields the current CPU by moving the calling thread 4408 * This function yields the current CPU to other tasks. If there are no
4721 * to the expired array. If there are no other threads running on this 4409 * other threads running on this CPU then this function will return.
4722 * CPU then this function will return.
4723 */ 4410 */
4724asmlinkage long sys_sched_yield(void) 4411asmlinkage long sys_sched_yield(void)
4725{ 4412{
4726 struct rq *rq = this_rq_lock(); 4413 struct rq *rq = this_rq_lock();
4727 struct prio_array *array = current->array, *target = rq->expired;
4728 4414
4729 schedstat_inc(rq, yld_cnt); 4415 schedstat_inc(rq, yld_cnt);
4730 /* 4416 if (unlikely(rq->nr_running == 1))
4731 * We implement yielding by moving the task into the expired
4732 * queue.
4733 *
4734 * (special rule: RT tasks will just roundrobin in the active
4735 * array.)
4736 */
4737 if (rt_task(current))
4738 target = rq->active;
4739
4740 if (array->nr_active == 1) {
4741 schedstat_inc(rq, yld_act_empty); 4417 schedstat_inc(rq, yld_act_empty);
4742 if (!rq->expired->nr_active) 4418 else
4743 schedstat_inc(rq, yld_both_empty); 4419 current->sched_class->yield_task(rq, current);
4744 } else if (!rq->expired->nr_active)
4745 schedstat_inc(rq, yld_exp_empty);
4746
4747 if (array != target) {
4748 dequeue_task(current, array);
4749 enqueue_task(current, target);
4750 } else
4751 /*
4752 * requeue_task is cheaper so perform that if possible.
4753 */
4754 requeue_task(current, array);
4755 4420
4756 /* 4421 /*
4757 * Since we are going to call schedule() anyway, there's 4422 * Since we are going to call schedule() anyway, there's
@@ -4902,6 +4567,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4902 break; 4567 break;
4903 case SCHED_NORMAL: 4568 case SCHED_NORMAL:
4904 case SCHED_BATCH: 4569 case SCHED_BATCH:
4570 case SCHED_IDLE:
4905 ret = 0; 4571 ret = 0;
4906 break; 4572 break;
4907 } 4573 }
@@ -4926,6 +4592,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4926 break; 4592 break;
4927 case SCHED_NORMAL: 4593 case SCHED_NORMAL:
4928 case SCHED_BATCH: 4594 case SCHED_BATCH:
4595 case SCHED_IDLE:
4929 ret = 0; 4596 ret = 0;
4930 } 4597 }
4931 return ret; 4598 return ret;
@@ -4960,7 +4627,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4960 goto out_unlock; 4627 goto out_unlock;
4961 4628
4962 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4629 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4963 0 : task_timeslice(p), &t); 4630 0 : static_prio_timeslice(p->static_prio), &t);
4964 read_unlock(&tasklist_lock); 4631 read_unlock(&tasklist_lock);
4965 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4632 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4966out_nounlock: 4633out_nounlock:
@@ -5035,6 +4702,9 @@ void show_state_filter(unsigned long state_filter)
5035 4702
5036 touch_all_softlockup_watchdogs(); 4703 touch_all_softlockup_watchdogs();
5037 4704
4705#ifdef CONFIG_SCHED_DEBUG
4706 sysrq_sched_debug_show();
4707#endif
5038 read_unlock(&tasklist_lock); 4708 read_unlock(&tasklist_lock);
5039 /* 4709 /*
5040 * Only show locks if all tasks are dumped: 4710 * Only show locks if all tasks are dumped:
@@ -5043,6 +4713,11 @@ void show_state_filter(unsigned long state_filter)
5043 debug_show_all_locks(); 4713 debug_show_all_locks();
5044} 4714}
5045 4715
4716void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4717{
4718 idle->sched_class = &idle_sched_class;
4719}
4720
5046/** 4721/**
5047 * init_idle - set up an idle thread for a given CPU 4722 * init_idle - set up an idle thread for a given CPU
5048 * @idle: task in question 4723 * @idle: task in question
@@ -5056,13 +4731,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5056 struct rq *rq = cpu_rq(cpu); 4731 struct rq *rq = cpu_rq(cpu);
5057 unsigned long flags; 4732 unsigned long flags;
5058 4733
5059 idle->timestamp = sched_clock(); 4734 __sched_fork(idle);
5060 idle->sleep_avg = 0; 4735 idle->se.exec_start = sched_clock();
5061 idle->array = NULL; 4736
5062 idle->prio = idle->normal_prio = MAX_PRIO; 4737 idle->prio = idle->normal_prio = MAX_PRIO;
5063 idle->state = TASK_RUNNING;
5064 idle->cpus_allowed = cpumask_of_cpu(cpu); 4738 idle->cpus_allowed = cpumask_of_cpu(cpu);
5065 set_task_cpu(idle, cpu); 4739 __set_task_cpu(idle, cpu);
5066 4740
5067 spin_lock_irqsave(&rq->lock, flags); 4741 spin_lock_irqsave(&rq->lock, flags);
5068 rq->curr = rq->idle = idle; 4742 rq->curr = rq->idle = idle;
@@ -5077,6 +4751,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5077#else 4751#else
5078 task_thread_info(idle)->preempt_count = 0; 4752 task_thread_info(idle)->preempt_count = 0;
5079#endif 4753#endif
4754 /*
4755 * The idle tasks have their own, simple scheduling class:
4756 */
4757 idle->sched_class = &idle_sched_class;
5080} 4758}
5081 4759
5082/* 4760/*
@@ -5088,6 +4766,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5088 */ 4766 */
5089cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4767cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5090 4768
4769/*
4770 * Increase the granularity value when there are more CPUs,
4771 * because with more CPUs the 'effective latency' as visible
4772 * to users decreases. But the relationship is not linear,
4773 * so pick a second-best guess by going with the log2 of the
4774 * number of CPUs.
4775 *
4776 * This idea comes from the SD scheduler of Con Kolivas:
4777 */
4778static inline void sched_init_granularity(void)
4779{
4780 unsigned int factor = 1 + ilog2(num_online_cpus());
4781 const unsigned long gran_limit = 10000000;
4782
4783 sysctl_sched_granularity *= factor;
4784 if (sysctl_sched_granularity > gran_limit)
4785 sysctl_sched_granularity = gran_limit;
4786
4787 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4788 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4789}
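As a worked example of sched_init_granularity(): the factor is 1 + ilog2(number of online CPUs), so one CPU keeps the base value, 8 CPUs quadruple it, and 64 CPUs would multiply it by 7 before hitting the 10 ms clamp. The 2 ms base granularity used below is an assumption for illustration (the real default is set with the CFS sysctls elsewhere in this patch); only the scaling and clamping mirror the function above.

#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long gran_limit = 10000000;	/* 10 ms cap */
	const unsigned long base_gran = 2000000;	/* assumed 2 ms base */
	unsigned int cpus[] = { 1, 2, 8, 64 };
	unsigned int i;

	for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned int factor = 1 + ilog2_u(cpus[i]);
		unsigned long gran = base_gran * factor;

		if (gran > gran_limit)
			gran = gran_limit;
		printf("%2u CPUs: factor %u, granularity %lu ns, "
		       "runtime_limit %lu ns, wakeup %lu ns\n",
		       cpus[i], factor, gran, gran * 4, gran / 2);
	}
	return 0;
}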
4790
5091#ifdef CONFIG_SMP 4791#ifdef CONFIG_SMP
5092/* 4792/*
5093 * This is how migration works: 4793 * This is how migration works:
@@ -5161,7 +4861,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
5161static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4861static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5162{ 4862{
5163 struct rq *rq_dest, *rq_src; 4863 struct rq *rq_dest, *rq_src;
5164 int ret = 0; 4864 int ret = 0, on_rq;
5165 4865
5166 if (unlikely(cpu_is_offline(dest_cpu))) 4866 if (unlikely(cpu_is_offline(dest_cpu)))
5167 return ret; 4867 return ret;
@@ -5177,20 +4877,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5177 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4877 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5178 goto out; 4878 goto out;
5179 4879
4880 on_rq = p->se.on_rq;
4881 if (on_rq)
4882 deactivate_task(rq_src, p, 0);
5180 set_task_cpu(p, dest_cpu); 4883 set_task_cpu(p, dest_cpu);
5181 if (p->array) { 4884 if (on_rq) {
5182 /* 4885 activate_task(rq_dest, p, 0);
5183 * Sync timestamp with rq_dest's before activating. 4886 check_preempt_curr(rq_dest, p);
5184 * The same thing could be achieved by doing this step
5185 * afterwards, and pretending it was a local activate.
5186 * This way is cleaner and logically correct.
5187 */
5188 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5189 + rq_dest->most_recent_timestamp;
5190 deactivate_task(p, rq_src);
5191 __activate_task(p, rq_dest);
5192 if (TASK_PREEMPTS_CURR(p, rq_dest))
5193 resched_task(rq_dest->curr);
5194 } 4887 }
5195 ret = 1; 4888 ret = 1;
5196out: 4889out:
@@ -5342,7 +5035,8 @@ static void migrate_live_tasks(int src_cpu)
5342 write_unlock_irq(&tasklist_lock); 5035 write_unlock_irq(&tasklist_lock);
5343} 5036}
5344 5037
5345/* Schedules idle task to be the next runnable task on current CPU. 5038/*
5039 * Schedules idle task to be the next runnable task on current CPU.
5346 * It does so by boosting its priority to highest possible and adding it to 5040 * It does so by boosting its priority to highest possible and adding it to
5347 * the _front_ of the runqueue. Used by CPU offline code. 5041 * the _front_ of the runqueue. Used by CPU offline code.
5348 */ 5042 */
@@ -5362,10 +5056,10 @@ void sched_idle_next(void)
5362 */ 5056 */
5363 spin_lock_irqsave(&rq->lock, flags); 5057 spin_lock_irqsave(&rq->lock, flags);
5364 5058
5365 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5059 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5366 5060
5367 /* Add idle task to the _front_ of its priority queue: */ 5061 /* Add idle task to the _front_ of its priority queue: */
5368 __activate_idle_task(p, rq); 5062 activate_idle_task(p, rq);
5369 5063
5370 spin_unlock_irqrestore(&rq->lock, flags); 5064 spin_unlock_irqrestore(&rq->lock, flags);
5371} 5065}
@@ -5415,16 +5109,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5415static void migrate_dead_tasks(unsigned int dead_cpu) 5109static void migrate_dead_tasks(unsigned int dead_cpu)
5416{ 5110{
5417 struct rq *rq = cpu_rq(dead_cpu); 5111 struct rq *rq = cpu_rq(dead_cpu);
5418 unsigned int arr, i; 5112 struct task_struct *next;
5419 5113
5420 for (arr = 0; arr < 2; arr++) { 5114 for ( ; ; ) {
5421 for (i = 0; i < MAX_PRIO; i++) { 5115 if (!rq->nr_running)
5422 struct list_head *list = &rq->arrays[arr].queue[i]; 5116 break;
5423 5117 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5424 while (!list_empty(list)) 5118 if (!next)
5425 migrate_dead(dead_cpu, list_entry(list->next, 5119 break;
5426 struct task_struct, run_list)); 5120 migrate_dead(dead_cpu, next);
5427 }
5428 } 5121 }
5429} 5122}
5430#endif /* CONFIG_HOTPLUG_CPU */ 5123#endif /* CONFIG_HOTPLUG_CPU */
@@ -5448,14 +5141,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5448 5141
5449 case CPU_UP_PREPARE: 5142 case CPU_UP_PREPARE:
5450 case CPU_UP_PREPARE_FROZEN: 5143 case CPU_UP_PREPARE_FROZEN:
5451 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5144 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5452 if (IS_ERR(p)) 5145 if (IS_ERR(p))
5453 return NOTIFY_BAD; 5146 return NOTIFY_BAD;
5454 p->flags |= PF_NOFREEZE; 5147 p->flags |= PF_NOFREEZE;
5455 kthread_bind(p, cpu); 5148 kthread_bind(p, cpu);
5456 /* Must be high prio: stop_machine expects to yield to it. */ 5149 /* Must be high prio: stop_machine expects to yield to it. */
5457 rq = task_rq_lock(p, &flags); 5150 rq = task_rq_lock(p, &flags);
5458 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5151 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5459 task_rq_unlock(rq, &flags); 5152 task_rq_unlock(rq, &flags);
5460 cpu_rq(cpu)->migration_thread = p; 5153 cpu_rq(cpu)->migration_thread = p;
5461 break; 5154 break;
@@ -5486,9 +5179,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5486 rq->migration_thread = NULL; 5179 rq->migration_thread = NULL;
5487 /* Idle task back to normal (off runqueue, low prio) */ 5180 /* Idle task back to normal (off runqueue, low prio) */
5488 rq = task_rq_lock(rq->idle, &flags); 5181 rq = task_rq_lock(rq->idle, &flags);
5489 deactivate_task(rq->idle, rq); 5182 deactivate_task(rq, rq->idle, 0);
5490 rq->idle->static_prio = MAX_PRIO; 5183 rq->idle->static_prio = MAX_PRIO;
5491 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5184 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5185 rq->idle->sched_class = &idle_sched_class;
5492 migrate_dead_tasks(cpu); 5186 migrate_dead_tasks(cpu);
5493 task_rq_unlock(rq, &flags); 5187 task_rq_unlock(rq, &flags);
5494 migrate_nr_uninterruptible(rq); 5188 migrate_nr_uninterruptible(rq);
@@ -5797,483 +5491,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5797 5491
5798#define SD_NODES_PER_DOMAIN 16 5492#define SD_NODES_PER_DOMAIN 16
5799 5493
5800/*
5801 * Self-tuning task migration cost measurement between source and target CPUs.
5802 *
5803 * This is done by measuring the cost of manipulating buffers of varying
5804 * sizes. For a given buffer-size here are the steps that are taken:
5805 *
5806 * 1) the source CPU reads+dirties a shared buffer
5807 * 2) the target CPU reads+dirties the same shared buffer
5808 *
5809 * We measure how long they take, in the following 4 scenarios:
5810 *
5811 * - source: CPU1, target: CPU2 | cost1
5812 * - source: CPU2, target: CPU1 | cost2
5813 * - source: CPU1, target: CPU1 | cost3
5814 * - source: CPU2, target: CPU2 | cost4
5815 *
5816 * We then calculate the (cost1+cost2)-(cost3+cost4) difference - this is
5817 * the cost of migration.
5818 *
5819 * We then start off from a small buffer-size and iterate up to larger
5820 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5821 * doing a maximum search for the cost. (The maximum cost for a migration
5822 * normally occurs when the working set size is around the effective cache
5823 * size.)
5824 */
5825#define SEARCH_SCOPE 2
5826#define MIN_CACHE_SIZE (64*1024U)
5827#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5828#define ITERATIONS 1
5829#define SIZE_THRESH 130
5830#define COST_THRESH 130
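A toy calculation of the differencing described above, with invented nanosecond timings: the two cross-CPU passes pay for refilling the cache, the two same-CPU passes run warm, and the surplus is attributed to migration. In the real code measure_one() supplies the timings, and measure_cost() additionally averages over ITERATIONS and perturbed buffer sizes.

#include <stdio.h>

int main(void)
{
	unsigned long long cost1 = 900000;	/* CPU1 -> CPU2 (cold)  */
	unsigned long long cost2 = 880000;	/* CPU2 -> CPU1 (cold)  */
	unsigned long long cost3 = 300000;	/* CPU1 -> CPU1 (warm)  */
	unsigned long long cost4 = 320000;	/* CPU2 -> CPU2 (warm)  */
	unsigned long long migration_cost;

	/* Cross-CPU cost minus the warm same-CPU baseline. */
	migration_cost = (cost1 + cost2) - (cost3 + cost4);
	printf("estimated migration cost: %llu ns\n", migration_cost);
	return 0;
}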
5831
5832/*
5833 * The migration cost is a function of 'domain distance'. Domain
5834 * distance is the number of steps a CPU has to iterate down its
5835 * domain tree to share a domain with the other CPU. The farther
5836 * two CPUs are from each other, the larger the distance gets.
5837 *
5838 * Note that we use the distance only to cache measurement results,
5839 * the distance value is not used numerically otherwise. When two
5840 * CPUs have the same distance it is assumed that the migration
5841 * cost is the same. (this is a simplification but quite practical)
5842 */
5843#define MAX_DOMAIN_DISTANCE 32
5844
5845static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5846 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5847/*
5848 * Architectures may override the migration cost and thus avoid
5849 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5850 * virtualized hardware:
5851 */
5852#ifdef CONFIG_DEFAULT_MIGRATION_COST
5853 CONFIG_DEFAULT_MIGRATION_COST
5854#else
5855 -1LL
5856#endif
5857};
5858
5859/*
5860 * Allow override of migration cost - in units of microseconds.
5861 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5862 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5863 */
5864static int __init migration_cost_setup(char *str)
5865{
5866 int ints[MAX_DOMAIN_DISTANCE+1], i;
5867
5868 str = get_options(str, ARRAY_SIZE(ints), ints);
5869
5870 printk("#ints: %d\n", ints[0]);
5871 for (i = 1; i <= ints[0]; i++) {
5872 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5873 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5874 }
5875 return 1;
5876}
5877
5878__setup ("migration_cost=", migration_cost_setup);
5879
5880/*
5881 * Global multiplier (divisor) for migration-cutoff values,
5882 * in percentiles. E.g. use a value of 150 to get 1.5 times
5883 * longer cache-hot cutoff times.
5884 *
5885 * (We scale it from 100 to 128 to make long long handling easier.)
5886 */
5887
5888#define MIGRATION_FACTOR_SCALE 128
5889
5890static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5891
5892static int __init setup_migration_factor(char *str)
5893{
5894 get_option(&str, &migration_factor);
5895 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5896 return 1;
5897}
5898
5899__setup("migration_factor=", setup_migration_factor);
5900
5901/*
5902 * Estimated distance of two CPUs, measured via the number of domains
5903 * we have to pass for the two CPUs to be in the same span:
5904 */
5905static unsigned long domain_distance(int cpu1, int cpu2)
5906{
5907 unsigned long distance = 0;
5908 struct sched_domain *sd;
5909
5910 for_each_domain(cpu1, sd) {
5911 WARN_ON(!cpu_isset(cpu1, sd->span));
5912 if (cpu_isset(cpu2, sd->span))
5913 return distance;
5914 distance++;
5915 }
5916 if (distance >= MAX_DOMAIN_DISTANCE) {
5917 WARN_ON(1);
5918 distance = MAX_DOMAIN_DISTANCE-1;
5919 }
5920
5921 return distance;
5922}
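For a feel of what domain_distance() returns: on a hypothetical two-package, four-CPU box, a CPU in the same package is already covered by the lowest domain, giving distance 0, while a CPU in the other package is only covered by the top-level span and gets distance 1. The sketch below models the domains as plain bitmask spans; the real hierarchy is the per-CPU sched_domain list that build_sched_domains() constructs.

#include <stdio.h>

#define NLEVELS 2

/* level 0: own package, level 1: whole machine (toy topology). */
static const unsigned long span[4][NLEVELS] = {
	[0] = { 0x3, 0xf },	/* CPU0: package {0,1} */
	[1] = { 0x3, 0xf },
	[2] = { 0xc, 0xf },	/* CPU2: package {2,3} */
	[3] = { 0xc, 0xf },
};

static unsigned long toy_domain_distance(int cpu1, int cpu2)
{
	unsigned long distance;

	for (distance = 0; distance < NLEVELS; distance++)
		if (span[cpu1][distance] & (1UL << cpu2))
			break;
	return distance;
}

int main(void)
{
	/* Prints d(0,1)=0 (same package) and d(0,2)=1 (other package). */
	printf("d(0,1)=%lu d(0,2)=%lu\n",
	       toy_domain_distance(0, 1), toy_domain_distance(0, 2));
	return 0;
}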
5923
5924static unsigned int migration_debug;
5925
5926static int __init setup_migration_debug(char *str)
5927{
5928 get_option(&str, &migration_debug);
5929 return 1;
5930}
5931
5932__setup("migration_debug=", setup_migration_debug);
5933
5934/*
5935 * Maximum cache-size that the scheduler should try to measure.
5936 * Architectures with larger caches should tune this up during
5937 * bootup. Gets used in the domain-setup code (i.e. during SMP
5938 * bootup).
5939 */
5940unsigned int max_cache_size;
5941
5942static int __init setup_max_cache_size(char *str)
5943{
5944 get_option(&str, &max_cache_size);
5945 return 1;
5946}
5947
5948__setup("max_cache_size=", setup_max_cache_size);
5949
5950/*
5951 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5952 * is the operation that is timed, so we try to generate unpredictable
5953 * cachemisses that still end up filling the L2 cache:
5954 */
5955static void touch_cache(void *__cache, unsigned long __size)
5956{
5957 unsigned long size = __size / sizeof(long);
5958 unsigned long chunk1 = size / 3;
5959 unsigned long chunk2 = 2 * size / 3;
5960 unsigned long *cache = __cache;
5961 int i;
5962
5963 for (i = 0; i < size/6; i += 8) {
5964 switch (i % 6) {
5965 case 0: cache[i]++;
5966 case 1: cache[size-1-i]++;
5967 case 2: cache[chunk1-i]++;
5968 case 3: cache[chunk1+i]++;
5969 case 4: cache[chunk2-i]++;
5970 case 5: cache[chunk2+i]++;
5971 }
5972 }
5973}
5974
5975/*
5976 * Measure the cache-cost of one task migration. Returns in units of nsec.
5977 */
5978static unsigned long long
5979measure_one(void *cache, unsigned long size, int source, int target)
5980{
5981 cpumask_t mask, saved_mask;
5982 unsigned long long t0, t1, t2, t3, cost;
5983
5984 saved_mask = current->cpus_allowed;
5985
5986 /*
5987 * Flush source caches to RAM and invalidate them:
5988 */
5989 sched_cacheflush();
5990
5991 /*
5992 * Migrate to the source CPU:
5993 */
5994 mask = cpumask_of_cpu(source);
5995 set_cpus_allowed(current, mask);
5996 WARN_ON(smp_processor_id() != source);
5997
5998 /*
5999 * Dirty the working set:
6000 */
6001 t0 = sched_clock();
6002 touch_cache(cache, size);
6003 t1 = sched_clock();
6004
6005 /*
6006 * Migrate to the target CPU, dirty the L2 cache and access
6007 * the shared buffer. (which represents the working set
6008 * of a migrated task.)
6009 */
6010 mask = cpumask_of_cpu(target);
6011 set_cpus_allowed(current, mask);
6012 WARN_ON(smp_processor_id() != target);
6013
6014 t2 = sched_clock();
6015 touch_cache(cache, size);
6016 t3 = sched_clock();
6017
6018 cost = t1-t0 + t3-t2;
6019
6020 if (migration_debug >= 2)
6021 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
6022 source, target, t1-t0, t1-t0, t3-t2, cost);
6023 /*
6024 * Flush target caches to RAM and invalidate them:
6025 */
6026 sched_cacheflush();
6027
6028 set_cpus_allowed(current, saved_mask);
6029
6030 return cost;
6031}
6032
6033/*
6034 * Measure a series of task migrations and return the average
6035 * result. Since this code runs early during bootup the system
6036 * is 'undisturbed' and the average latency makes sense.
6037 *
6038 * The algorithm in essence auto-detects the relevant cache-size,
6039 * so it will properly detect different cachesizes for different
6040 * cache-hierarchies, depending on how the CPUs are connected.
6041 *
6042 * Architectures can prime the upper limit of the search range via
6043 * max_cache_size, otherwise the search range defaults to 20MB...64K.
6044 */
6045static unsigned long long
6046measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
6047{
6048 unsigned long long cost1, cost2;
6049 int i;
6050
6051 /*
6052 * Measure the migration cost of 'size' bytes, over an
6053 * average of 10 runs:
6054 *
6055 * (We perturb the cache size by a small (0..4k)
6056 * value to compensate size/alignment related artifacts.
6057 * We also subtract the cost of the operation done on
6058 * the same CPU.)
6059 */
6060 cost1 = 0;
6061
6062 /*
6063 * dry run, to make sure we start off cache-cold on cpu1,
6064 * and to get any vmalloc pagefaults in advance:
6065 */
6066 measure_one(cache, size, cpu1, cpu2);
6067 for (i = 0; i < ITERATIONS; i++)
6068 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6069
6070 measure_one(cache, size, cpu2, cpu1);
6071 for (i = 0; i < ITERATIONS; i++)
6072 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6073
6074 /*
6075 * (We measure the non-migrating [cached] cost on both
6076 * cpu1 and cpu2, to handle CPUs with different speeds)
6077 */
6078 cost2 = 0;
6079
6080 measure_one(cache, size, cpu1, cpu1);
6081 for (i = 0; i < ITERATIONS; i++)
6082 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6083
6084 measure_one(cache, size, cpu2, cpu2);
6085 for (i = 0; i < ITERATIONS; i++)
6086 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6087
6088 /*
6089 * Get the per-iteration migration cost:
6090 */
6091 do_div(cost1, 2 * ITERATIONS);
6092 do_div(cost2, 2 * ITERATIONS);
6093
6094 return cost1 - cost2;
6095}
6096
6097static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6098{
6099 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6100 unsigned int max_size, size, size_found = 0;
6101 long long cost = 0, prev_cost;
6102 void *cache;
6103
6104 /*
6105 * Search from max_cache_size*5 down to 64K - the real relevant
6106 * cachesize has to lie somewhere in between.
6107 */
6108 if (max_cache_size) {
6109 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6110 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6111 } else {
6112 /*
6113 * Since we have no estimation about the relevant
6114 * search range
6115 */
6116 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6117 size = MIN_CACHE_SIZE;
6118 }
6119
6120 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
6121 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6122 return 0;
6123 }
6124
6125 /*
6126 * Allocate the working set:
6127 */
6128 cache = vmalloc(max_size);
6129 if (!cache) {
6130 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
6131 return 1000000; /* return 1 msec on very small boxen */
6132 }
6133
6134 while (size <= max_size) {
6135 prev_cost = cost;
6136 cost = measure_cost(cpu1, cpu2, cache, size);
6137
6138 /*
6139 * Update the max:
6140 */
6141 if (cost > 0) {
6142 if (max_cost < cost) {
6143 max_cost = cost;
6144 size_found = size;
6145 }
6146 }
6147 /*
6148 * Calculate average fluctuation, we use this to prevent
6149 * noise from triggering an early break out of the loop:
6150 */
6151 fluct = abs(cost - prev_cost);
6152 avg_fluct = (avg_fluct + fluct)/2;
6153
6154 if (migration_debug)
6155 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6156 "(%8Ld %8Ld)\n",
6157 cpu1, cpu2, size,
6158 (long)cost / 1000000,
6159 ((long)cost / 100000) % 10,
6160 (long)max_cost / 1000000,
6161 ((long)max_cost / 100000) % 10,
6162 domain_distance(cpu1, cpu2),
6163 cost, avg_fluct);
6164
6165 /*
6166 * If we iterated at least 20% past the previous maximum,
6167 * and the cost has dropped by more than 20% already,
6168 * (taking fluctuations into account) then we assume to
6169 * have found the maximum and break out of the loop early:
6170 */
6171 if (size_found && (size*100 > size_found*SIZE_THRESH))
6172 if (cost+avg_fluct <= 0 ||
6173 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
6174
6175 if (migration_debug)
6176 printk("-> found max.\n");
6177 break;
6178 }
6179 /*
6180 * Increase the cachesize in 10% steps:
6181 */
6182 size = size * 10 / 9;
6183 }
6184
6185 if (migration_debug)
6186 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6187 cpu1, cpu2, size_found, max_cost);
6188
6189 vfree(cache);
6190
6191 /*
6192 * A task is considered 'cache cold' if at least 2 times
6193 * the worst-case cost of migration has passed.
6194 *
6195 * (this limit is only listened to if the load-balancing
6196 * situation is 'nice' - if there is a large imbalance we
6197 * ignore it for the sake of CPU utilization and
6198 * processing fairness.)
6199 */
6200 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6201}
6202
6203static void calibrate_migration_costs(const cpumask_t *cpu_map)
6204{
6205 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6206 unsigned long j0, j1, distance, max_distance = 0;
6207 struct sched_domain *sd;
6208
6209 j0 = jiffies;
6210
6211 /*
6212 * First pass - calculate the cacheflush times:
6213 */
6214 for_each_cpu_mask(cpu1, *cpu_map) {
6215 for_each_cpu_mask(cpu2, *cpu_map) {
6216 if (cpu1 == cpu2)
6217 continue;
6218 distance = domain_distance(cpu1, cpu2);
6219 max_distance = max(max_distance, distance);
6220 /*
6221 * No result cached yet?
6222 */
6223 if (migration_cost[distance] == -1LL)
6224 migration_cost[distance] =
6225 measure_migration_cost(cpu1, cpu2);
6226 }
6227 }
6228 /*
6229 * Second pass - update the sched domain hierarchy with
6230 * the new cache-hot-time estimations:
6231 */
6232 for_each_cpu_mask(cpu, *cpu_map) {
6233 distance = 0;
6234 for_each_domain(cpu, sd) {
6235 sd->cache_hot_time = migration_cost[distance];
6236 distance++;
6237 }
6238 }
6239 /*
6240 * Print the matrix:
6241 */
6242 if (migration_debug)
6243 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6244 max_cache_size,
6245#ifdef CONFIG_X86
6246 cpu_khz/1000
6247#else
6248 -1
6249#endif
6250 );
6251 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6252 printk("migration_cost=");
6253 for (distance = 0; distance <= max_distance; distance++) {
6254 if (distance)
6255 printk(",");
6256 printk("%ld", (long)migration_cost[distance] / 1000);
6257 }
6258 printk("\n");
6259 }
6260 j1 = jiffies;
6261 if (migration_debug)
6262 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6263
6264 /*
6265 * Move back to the original CPU. NUMA-Q gets confused
6266 * if we migrate to another quad during bootup.
6267 */
6268 if (raw_smp_processor_id() != orig_cpu) {
6269 cpumask_t mask = cpumask_of_cpu(orig_cpu),
6270 saved_mask = current->cpus_allowed;
6271
6272 set_cpus_allowed(current, mask);
6273 set_cpus_allowed(current, saved_mask);
6274 }
6275}
6276
6277#ifdef CONFIG_NUMA 5494#ifdef CONFIG_NUMA
6278 5495
6279/** 5496/**
@@ -6574,7 +5791,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6574static int build_sched_domains(const cpumask_t *cpu_map) 5791static int build_sched_domains(const cpumask_t *cpu_map)
6575{ 5792{
6576 int i; 5793 int i;
6577 struct sched_domain *sd;
6578#ifdef CONFIG_NUMA 5794#ifdef CONFIG_NUMA
6579 struct sched_group **sched_group_nodes = NULL; 5795 struct sched_group **sched_group_nodes = NULL;
6580 int sd_allnodes = 0; 5796 int sd_allnodes = 0;
@@ -6582,7 +5798,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6582 /* 5798 /*
6583 * Allocate the per-node list of sched groups 5799 * Allocate the per-node list of sched groups
6584 */ 5800 */
6585 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5801 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6586 GFP_KERNEL); 5802 GFP_KERNEL);
6587 if (!sched_group_nodes) { 5803 if (!sched_group_nodes) {
6588 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5804 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6601,8 +5817,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6601 cpus_and(nodemask, nodemask, *cpu_map); 5817 cpus_and(nodemask, nodemask, *cpu_map);
6602 5818
6603#ifdef CONFIG_NUMA 5819#ifdef CONFIG_NUMA
6604 if (cpus_weight(*cpu_map) 5820 if (cpus_weight(*cpu_map) >
6605 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5821 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6606 sd = &per_cpu(allnodes_domains, i); 5822 sd = &per_cpu(allnodes_domains, i);
6607 *sd = SD_ALLNODES_INIT; 5823 *sd = SD_ALLNODES_INIT;
6608 sd->span = *cpu_map; 5824 sd->span = *cpu_map;
@@ -6661,7 +5877,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6661 if (i != first_cpu(this_sibling_map)) 5877 if (i != first_cpu(this_sibling_map))
6662 continue; 5878 continue;
6663 5879
6664 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 5880 init_sched_build_groups(this_sibling_map, cpu_map,
5881 &cpu_to_cpu_group);
6665 } 5882 }
6666#endif 5883#endif
6667 5884
@@ -6672,11 +5889,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6672 cpus_and(this_core_map, this_core_map, *cpu_map); 5889 cpus_and(this_core_map, this_core_map, *cpu_map);
6673 if (i != first_cpu(this_core_map)) 5890 if (i != first_cpu(this_core_map))
6674 continue; 5891 continue;
6675 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 5892 init_sched_build_groups(this_core_map, cpu_map,
5893 &cpu_to_core_group);
6676 } 5894 }
6677#endif 5895#endif
6678 5896
6679
6680 /* Set up physical groups */ 5897 /* Set up physical groups */
6681 for (i = 0; i < MAX_NUMNODES; i++) { 5898 for (i = 0; i < MAX_NUMNODES; i++) {
6682 cpumask_t nodemask = node_to_cpumask(i); 5899 cpumask_t nodemask = node_to_cpumask(i);
@@ -6691,7 +5908,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6691#ifdef CONFIG_NUMA 5908#ifdef CONFIG_NUMA
6692 /* Set up node groups */ 5909 /* Set up node groups */
6693 if (sd_allnodes) 5910 if (sd_allnodes)
6694 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 5911 init_sched_build_groups(*cpu_map, cpu_map,
5912 &cpu_to_allnodes_group);
6695 5913
6696 for (i = 0; i < MAX_NUMNODES; i++) { 5914 for (i = 0; i < MAX_NUMNODES; i++) {
6697 /* Set up node groups */ 5915 /* Set up node groups */
@@ -6719,6 +5937,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6719 sched_group_nodes[i] = sg; 5937 sched_group_nodes[i] = sg;
6720 for_each_cpu_mask(j, nodemask) { 5938 for_each_cpu_mask(j, nodemask) {
6721 struct sched_domain *sd; 5939 struct sched_domain *sd;
5940
6722 sd = &per_cpu(node_domains, j); 5941 sd = &per_cpu(node_domains, j);
6723 sd->groups = sg; 5942 sd->groups = sg;
6724 } 5943 }
@@ -6763,19 +5982,22 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6763 /* Calculate CPU power for physical packages and nodes */ 5982 /* Calculate CPU power for physical packages and nodes */
6764#ifdef CONFIG_SCHED_SMT 5983#ifdef CONFIG_SCHED_SMT
6765 for_each_cpu_mask(i, *cpu_map) { 5984 for_each_cpu_mask(i, *cpu_map) {
6766 sd = &per_cpu(cpu_domains, i); 5985 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5986
6767 init_sched_groups_power(i, sd); 5987 init_sched_groups_power(i, sd);
6768 } 5988 }
6769#endif 5989#endif
6770#ifdef CONFIG_SCHED_MC 5990#ifdef CONFIG_SCHED_MC
6771 for_each_cpu_mask(i, *cpu_map) { 5991 for_each_cpu_mask(i, *cpu_map) {
6772 sd = &per_cpu(core_domains, i); 5992 struct sched_domain *sd = &per_cpu(core_domains, i);
5993
6773 init_sched_groups_power(i, sd); 5994 init_sched_groups_power(i, sd);
6774 } 5995 }
6775#endif 5996#endif
6776 5997
6777 for_each_cpu_mask(i, *cpu_map) { 5998 for_each_cpu_mask(i, *cpu_map) {
6778 sd = &per_cpu(phys_domains, i); 5999 struct sched_domain *sd = &per_cpu(phys_domains, i);
6000
6779 init_sched_groups_power(i, sd); 6001 init_sched_groups_power(i, sd);
6780 } 6002 }
6781 6003
@@ -6803,10 +6025,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6803#endif 6025#endif
6804 cpu_attach_domain(sd, i); 6026 cpu_attach_domain(sd, i);
6805 } 6027 }
6806 /*
6807 * Tune cache-hot values:
6808 */
6809 calibrate_migration_costs(cpu_map);
6810 6028
6811 return 0; 6029 return 0;
6812 6030
@@ -7013,10 +6231,12 @@ void __init sched_init_smp(void)
7013 /* Move init over to a non-isolated CPU */ 6231 /* Move init over to a non-isolated CPU */
7014 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6232 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7015 BUG(); 6233 BUG();
6234 sched_init_granularity();
7016} 6235}
7017#else 6236#else
7018void __init sched_init_smp(void) 6237void __init sched_init_smp(void)
7019{ 6238{
6239 sched_init_granularity();
7020} 6240}
7021#endif /* CONFIG_SMP */ 6241#endif /* CONFIG_SMP */
7022 6242
@@ -7030,28 +6250,51 @@ int in_sched_functions(unsigned long addr)
7030 && addr < (unsigned long)__sched_text_end); 6250 && addr < (unsigned long)__sched_text_end);
7031} 6251}
7032 6252
6253static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6254{
6255 cfs_rq->tasks_timeline = RB_ROOT;
6256 cfs_rq->fair_clock = 1;
6257#ifdef CONFIG_FAIR_GROUP_SCHED
6258 cfs_rq->rq = rq;
6259#endif
6260}
6261
7033void __init sched_init(void) 6262void __init sched_init(void)
7034{ 6263{
7035 int i, j, k; 6264 u64 now = sched_clock();
7036 int highest_cpu = 0; 6265 int highest_cpu = 0;
6266 int i, j;
6267
6268 /*
6269 * Link up the scheduling class hierarchy:
6270 */
6271 rt_sched_class.next = &fair_sched_class;
6272 fair_sched_class.next = &idle_sched_class;
6273 idle_sched_class.next = NULL;
7037 6274
7038 for_each_possible_cpu(i) { 6275 for_each_possible_cpu(i) {
7039 struct prio_array *array; 6276 struct rt_prio_array *array;
7040 struct rq *rq; 6277 struct rq *rq;
7041 6278
7042 rq = cpu_rq(i); 6279 rq = cpu_rq(i);
7043 spin_lock_init(&rq->lock); 6280 spin_lock_init(&rq->lock);
7044 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6281 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7045 rq->nr_running = 0; 6282 rq->nr_running = 0;
7046 rq->active = rq->arrays; 6283 rq->clock = 1;
7047 rq->expired = rq->arrays + 1; 6284 init_cfs_rq(&rq->cfs, rq);
7048 rq->best_expired_prio = MAX_PRIO; 6285#ifdef CONFIG_FAIR_GROUP_SCHED
6286 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6287 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6288#endif
6289 rq->ls.load_update_last = now;
6290 rq->ls.load_update_start = now;
7049 6291
6292 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6293 rq->cpu_load[j] = 0;
7050#ifdef CONFIG_SMP 6294#ifdef CONFIG_SMP
7051 rq->sd = NULL; 6295 rq->sd = NULL;
7052 for (j = 1; j < 3; j++)
7053 rq->cpu_load[j] = 0;
7054 rq->active_balance = 0; 6296 rq->active_balance = 0;
6297 rq->next_balance = jiffies;
7055 rq->push_cpu = 0; 6298 rq->push_cpu = 0;
7056 rq->cpu = i; 6299 rq->cpu = i;
7057 rq->migration_thread = NULL; 6300 rq->migration_thread = NULL;
@@ -7059,16 +6302,14 @@ void __init sched_init(void)
7059#endif 6302#endif
7060 atomic_set(&rq->nr_iowait, 0); 6303 atomic_set(&rq->nr_iowait, 0);
7061 6304
7062 for (j = 0; j < 2; j++) { 6305 array = &rq->rt.active;
7063 array = rq->arrays + j; 6306 for (j = 0; j < MAX_RT_PRIO; j++) {
7064 for (k = 0; k < MAX_PRIO; k++) { 6307 INIT_LIST_HEAD(array->queue + j);
7065 INIT_LIST_HEAD(array->queue + k); 6308 __clear_bit(j, array->bitmap);
7066 __clear_bit(k, array->bitmap);
7067 }
7068 // delimiter for bitsearch
7069 __set_bit(MAX_PRIO, array->bitmap);
7070 } 6309 }
7071 highest_cpu = i; 6310 highest_cpu = i;
6311 /* delimiter for bitsearch: */
6312 __set_bit(MAX_RT_PRIO, array->bitmap);
7072 } 6313 }
7073 6314
7074 set_load_weight(&init_task); 6315 set_load_weight(&init_task);
@@ -7095,6 +6336,10 @@ void __init sched_init(void)
7095 * when this runqueue becomes "idle". 6336 * when this runqueue becomes "idle".
7096 */ 6337 */
7097 init_idle(current, smp_processor_id()); 6338 init_idle(current, smp_processor_id());
6339 /*
6340 * During early bootup we pretend to be a normal task:
6341 */
6342 current->sched_class = &fair_sched_class;
7098} 6343}
7099 6344
7100#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6345#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7125,29 +6370,55 @@ EXPORT_SYMBOL(__might_sleep);
7125#ifdef CONFIG_MAGIC_SYSRQ 6370#ifdef CONFIG_MAGIC_SYSRQ
7126void normalize_rt_tasks(void) 6371void normalize_rt_tasks(void)
7127{ 6372{
7128 struct prio_array *array;
7129 struct task_struct *g, *p; 6373 struct task_struct *g, *p;
7130 unsigned long flags; 6374 unsigned long flags;
7131 struct rq *rq; 6375 struct rq *rq;
6376 int on_rq;
7132 6377
7133 read_lock_irq(&tasklist_lock); 6378 read_lock_irq(&tasklist_lock);
7134
7135 do_each_thread(g, p) { 6379 do_each_thread(g, p) {
7136 if (!rt_task(p)) 6380 p->se.fair_key = 0;
6381 p->se.wait_runtime = 0;
6382 p->se.wait_start_fair = 0;
6383 p->se.wait_start = 0;
6384 p->se.exec_start = 0;
6385 p->se.sleep_start = 0;
6386 p->se.sleep_start_fair = 0;
6387 p->se.block_start = 0;
6388 task_rq(p)->cfs.fair_clock = 0;
6389 task_rq(p)->clock = 0;
6390
6391 if (!rt_task(p)) {
6392 /*
6393 * Renice negative nice level userspace
6394 * tasks back to 0:
6395 */
6396 if (TASK_NICE(p) < 0 && p->mm)
6397 set_user_nice(p, 0);
7137 continue; 6398 continue;
6399 }
7138 6400
7139 spin_lock_irqsave(&p->pi_lock, flags); 6401 spin_lock_irqsave(&p->pi_lock, flags);
7140 rq = __task_rq_lock(p); 6402 rq = __task_rq_lock(p);
6403#ifdef CONFIG_SMP
6404 /*
6405 * Do not touch the migration thread:
6406 */
6407 if (p == rq->migration_thread)
6408 goto out_unlock;
6409#endif
7141 6410
7142 array = p->array; 6411 on_rq = p->se.on_rq;
7143 if (array) 6412 if (on_rq)
7144 deactivate_task(p, task_rq(p)); 6413 deactivate_task(task_rq(p), p, 0);
7145 __setscheduler(p, SCHED_NORMAL, 0); 6414 __setscheduler(rq, p, SCHED_NORMAL, 0);
7146 if (array) { 6415 if (on_rq) {
7147 __activate_task(p, task_rq(p)); 6416 activate_task(task_rq(p), p, 0);
7148 resched_task(rq->curr); 6417 resched_task(rq->curr);
7149 } 6418 }
7150 6419#ifdef CONFIG_SMP
6420 out_unlock:
6421#endif
7151 __task_rq_unlock(rq); 6422 __task_rq_unlock(rq);
7152 spin_unlock_irqrestore(&p->pi_lock, flags); 6423 spin_unlock_irqrestore(&p->pi_lock, flags);
7153 } while_each_thread(g, p); 6424 } while_each_thread(g, p);
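normalize_rt_tasks() is the handler behind the magic-SysRq "nice all RT tasks" key; with CONFIG_MAGIC_SYSRQ enabled it can be triggered from userspace by writing the key character to /proc/sysrq-trigger. A minimal sketch, assuming the mainline key binding ('n', wired up in drivers/char/sysrq.c, which is not part of this diff):

/* sysrq_normalize.c - trigger the SysRq that ends up in normalize_rt_tasks().
 * Assumes CONFIG_MAGIC_SYSRQ, root privileges, and the mainline 'n' key
 * binding from drivers/char/sysrq.c (not shown in this diff). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sysrq-trigger", "w");

	if (!f) {
		perror("/proc/sysrq-trigger");
		return 1;
	}
	fputc('n', f);
	fclose(f);
	return 0;
}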
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
new file mode 100644
index 000000000000..1baf87cceb7c
--- /dev/null
+++ b/kernel/sched_debug.c
@@ -0,0 +1,275 @@
1/*
 2 * kernel/sched_debug.c
3 *
4 * Print the CFS rbtree
5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched.h>
15#include <linux/seq_file.h>
16#include <linux/kallsyms.h>
17#include <linux/utsname.h>
18
19/*
20 * This allows printing both to /proc/sched_debug and
21 * to the console
22 */
23#define SEQ_printf(m, x...) \
24 do { \
25 if (m) \
26 seq_printf(m, x); \
27 else \
28 printk(x); \
29 } while (0)
30
31static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
33{
34 if (rq->curr == p)
35 SEQ_printf(m, "R");
36 else
37 SEQ_printf(m, " ");
38
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d "
40 "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
41 p->comm, p->pid,
42 (long long)p->se.fair_key,
43 (long long)(p->se.fair_key - rq->cfs.fair_clock),
44 (long long)p->se.wait_runtime,
45 (long long)(p->nvcsw + p->nivcsw),
46 p->prio,
47 (long long)p->se.sum_exec_runtime,
48 (long long)p->se.sum_wait_runtime,
49 (long long)p->se.sum_sleep_runtime,
50 (long long)p->se.wait_runtime_overruns,
51 (long long)p->se.wait_runtime_underruns);
52}
53
54static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
55{
56 struct task_struct *g, *p;
57
58 SEQ_printf(m,
59 "\nrunnable tasks:\n"
60 " task PID tree-key delta waiting"
61 " switches prio"
62 " sum-exec sum-wait sum-sleep"
63 " wait-overrun wait-underrun\n"
64 "------------------------------------------------------------------"
65 "----------------"
66 "------------------------------------------------"
67 "--------------------------------\n");
68
69 read_lock_irq(&tasklist_lock);
70
71 do_each_thread(g, p) {
72 if (!p->se.on_rq || task_cpu(p) != rq_cpu)
73 continue;
74
75 print_task(m, rq, p, now);
76 } while_each_thread(g, p);
77
78 read_unlock_irq(&tasklist_lock);
79}
80
81static void
82print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
83{
84 s64 wait_runtime_rq_sum = 0;
85 struct task_struct *p;
86 struct rb_node *curr;
87 unsigned long flags;
88 struct rq *rq = &per_cpu(runqueues, cpu);
89
90 spin_lock_irqsave(&rq->lock, flags);
91 curr = first_fair(cfs_rq);
92 while (curr) {
93 p = rb_entry(curr, struct task_struct, se.run_node);
94 wait_runtime_rq_sum += p->se.wait_runtime;
95
96 curr = rb_next(curr);
97 }
98 spin_unlock_irqrestore(&rq->lock, flags);
99
100 SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
101 (long long)wait_runtime_rq_sum);
102}
103
104void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
105{
106 SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
107
108#define P(x) \
109 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
110
111 P(fair_clock);
112 P(exec_clock);
113 P(wait_runtime);
114 P(wait_runtime_overruns);
115 P(wait_runtime_underruns);
116 P(sleeper_bonus);
117#undef P
118
119 print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
120}
121
122static void print_cpu(struct seq_file *m, int cpu, u64 now)
123{
124 struct rq *rq = &per_cpu(runqueues, cpu);
125
126#ifdef CONFIG_X86
127 {
128 unsigned int freq = cpu_khz ? : 1;
129
130 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
131 cpu, freq / 1000, (freq % 1000));
132 }
133#else
134 SEQ_printf(m, "\ncpu#%d\n", cpu);
135#endif
136
137#define P(x) \
138 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
139
140 P(nr_running);
141 SEQ_printf(m, " .%-30s: %lu\n", "load",
142 rq->ls.load.weight);
143 P(ls.delta_fair);
144 P(ls.delta_exec);
145 P(nr_switches);
146 P(nr_load_updates);
147 P(nr_uninterruptible);
148 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
149 P(next_balance);
150 P(curr->pid);
151 P(clock);
152 P(prev_clock_raw);
153 P(clock_warps);
154 P(clock_overflows);
155 P(clock_unstable_events);
156 P(clock_max_delta);
157 P(cpu_load[0]);
158 P(cpu_load[1]);
159 P(cpu_load[2]);
160 P(cpu_load[3]);
161 P(cpu_load[4]);
162#undef P
163
164 print_cfs_stats(m, cpu, now);
165
166 print_rq(m, rq, cpu, now);
167}
168
169static int sched_debug_show(struct seq_file *m, void *v)
170{
171 u64 now = ktime_to_ns(ktime_get());
172 int cpu;
173
174 SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v20, %s %.*s\n",
175 init_utsname()->release,
176 (int)strcspn(init_utsname()->version, " "),
177 init_utsname()->version);
178
179 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
180
181 for_each_online_cpu(cpu)
182 print_cpu(m, cpu, now);
183
184 SEQ_printf(m, "\n");
185
186 return 0;
187}
188
189void sysrq_sched_debug_show(void)
190{
191 sched_debug_show(NULL, NULL);
192}
193
194static int sched_debug_open(struct inode *inode, struct file *filp)
195{
196 return single_open(filp, sched_debug_show, NULL);
197}
198
199static struct file_operations sched_debug_fops = {
200 .open = sched_debug_open,
201 .read = seq_read,
202 .llseek = seq_lseek,
203 .release = seq_release,
204};
205
206static int __init init_sched_debug_procfs(void)
207{
208 struct proc_dir_entry *pe;
209
210 pe = create_proc_entry("sched_debug", 0644, NULL);
211 if (!pe)
212 return -ENOMEM;
213
214 pe->proc_fops = &sched_debug_fops;
215
216 return 0;
217}
218
219__initcall(init_sched_debug_procfs);
220
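Once init_sched_debug_procfs() has registered the entry, the full report can be read like any other procfs file. A minimal userspace reader, assuming a kernel built with this debug code (CONFIG_SCHED_DEBUG):

/* read_sched_debug.c - dump /proc/sched_debug to stdout.
 * Assumes a kernel with CONFIG_SCHED_DEBUG, which creates this file. */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/sched_debug", "r");

	if (!f) {
		perror("/proc/sched_debug");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}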
221void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
222{
223 unsigned long flags;
224 int num_threads = 1;
225
226 rcu_read_lock();
227 if (lock_task_sighand(p, &flags)) {
228 num_threads = atomic_read(&p->signal->count);
229 unlock_task_sighand(p, &flags);
230 }
231 rcu_read_unlock();
232
233 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
234 SEQ_printf(m, "----------------------------------------------\n");
235#define P(F) \
236 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
237
238 P(se.wait_start);
239 P(se.wait_start_fair);
240 P(se.exec_start);
241 P(se.sleep_start);
242 P(se.sleep_start_fair);
243 P(se.block_start);
244 P(se.sleep_max);
245 P(se.block_max);
246 P(se.exec_max);
247 P(se.wait_max);
248 P(se.wait_runtime);
249 P(se.wait_runtime_overruns);
250 P(se.wait_runtime_underruns);
251 P(se.sum_wait_runtime);
252 P(se.sum_exec_runtime);
253 SEQ_printf(m, "%-25s:%20Ld\n",
254 "nr_switches", (long long)(p->nvcsw + p->nivcsw));
255 P(se.load.weight);
256 P(policy);
257 P(prio);
258#undef P
259
260 {
261 u64 t0, t1;
262
263 t0 = sched_clock();
264 t1 = sched_clock();
265 SEQ_printf(m, "%-25s:%20Ld\n",
266 "clock-delta", (long long)(t1-t0));
267 }
268}
269
270void proc_sched_set_task(struct task_struct *p)
271{
272 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
273 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
274 p->se.sum_exec_runtime = 0;
275}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
new file mode 100644
index 000000000000..6971db0a7160
--- /dev/null
+++ b/kernel/sched_fair.c
@@ -0,0 +1,1131 @@
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 */
19
20/*
21 * Preemption granularity:
22 * (default: 2 msec, units: nanoseconds)
23 *
24 * NOTE: this granularity value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat
26 * larger than this value. (to see the precise effective timeslice
27 * length of your workload, run vmstat and monitor the context-switches
28 * field)
29 *
 30 * On SMP systems the value of this is multiplied by 1 plus the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
33 */
34unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
35
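The SMP scaling mentioned above is applied at boot by sched_init_granularity() (its body is not part of this hunk). A minimal userspace sketch of the arithmetic, assuming the 1 + log2(nr_cpus) factor implied by the examples in the comment:

/* gran_scale.c - sketch of the SMP scaling of sysctl_sched_granularity.
 * Assumes the 1 + log2(ncpus) factor described above; the base value is
 * hard-coded here rather than read from a live kernel. */
#include <stdio.h>
#include <unistd.h>

static unsigned int factor(unsigned long ncpus)
{
	unsigned int f = 1;

	while (ncpus >>= 1)		/* integer log2 */
		f++;
	return f;			/* 2 on 2-way, 3 on 4-way, 4 on 8-way ... */
}

int main(void)
{
	unsigned long base_ns = 2000000;	/* 2 msec (2000000000ULL/HZ with HZ=1000) */
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);

	if (ncpus < 1)
		ncpus = 1;
	printf("%ld CPUs -> effective granularity %lu ns\n",
	       ncpus, base_ns * factor((unsigned long)ncpus));
	return 0;
}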
36/*
37 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds)
39 *
40 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies.
43 */
44unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
45 10000000000ULL/HZ;
46
47/*
48 * SCHED_OTHER wake-up granularity.
49 * (default: 1 msec, units: nanoseconds)
50 *
51 * This option delays the preemption effects of decoupled workloads
52 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies.
54 */
55unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
56
57unsigned int sysctl_sched_stat_granularity __read_mostly;
58
59/*
60 * Initialized in sched_init_granularity():
61 */
62unsigned int sysctl_sched_runtime_limit __read_mostly;
63
64/*
65 * Debugging: various feature bits
66 */
67enum {
68 SCHED_FEAT_FAIR_SLEEPERS = 1,
69 SCHED_FEAT_SLEEPER_AVG = 2,
70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
71 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
72 SCHED_FEAT_START_DEBIT = 16,
73 SCHED_FEAT_SKIP_INITIAL = 32,
74};
75
76unsigned int sysctl_sched_features __read_mostly =
77 SCHED_FEAT_FAIR_SLEEPERS *1 |
78 SCHED_FEAT_SLEEPER_AVG *1 |
79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
80 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
81 SCHED_FEAT_START_DEBIT *1 |
82 SCHED_FEAT_SKIP_INITIAL *0;
83
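sysctl_sched_features packs the flags above into a single bitmask, using the *1/*0 multipliers as a compact per-feature on/off switch. A small standalone sketch of how such a mask is tested and toggled (local variables only, not the kernel sysctl itself):

/* feature_bits.c - sketch of a feature bitmask in the style of
 * sysctl_sched_features. The enum mirrors the flags above; the mask is
 * a local variable, not the kernel's. */
#include <stdio.h>

enum {
	FEAT_FAIR_SLEEPERS	= 1,
	FEAT_SLEEPER_AVG	= 2,
	FEAT_START_DEBIT	= 16,
	FEAT_SKIP_INITIAL	= 32,
};

int main(void)
{
	/* same *1 / *0 trick as above: multiply by 1 to enable, 0 to disable */
	unsigned int features =
		FEAT_FAIR_SLEEPERS	*1 |
		FEAT_SLEEPER_AVG	*1 |
		FEAT_START_DEBIT	*1 |
		FEAT_SKIP_INITIAL	*0;

	if (features & FEAT_FAIR_SLEEPERS)
		printf("fair sleepers enabled\n");

	features &= ~FEAT_SLEEPER_AVG;		/* switch one feature off at run time */
	printf("mask is now 0x%x\n", features);
	return 0;
}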
84extern struct sched_class fair_sched_class;
85
86/**************************************************************
87 * CFS operations on generic schedulable entities:
88 */
89
90#ifdef CONFIG_FAIR_GROUP_SCHED
91
92/* cpu runqueue to which this cfs_rq is attached */
93static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
94{
95 return cfs_rq->rq;
96}
97
98/* currently running entity (if any) on this cfs_rq */
99static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
100{
101 return cfs_rq->curr;
102}
103
104/* An entity is a task if it doesn't "own" a runqueue */
105#define entity_is_task(se) (!se->my_q)
106
107static inline void
108set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
109{
110 cfs_rq->curr = se;
111}
112
113#else /* CONFIG_FAIR_GROUP_SCHED */
114
115static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
116{
117 return container_of(cfs_rq, struct rq, cfs);
118}
119
120static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
121{
122 struct rq *rq = rq_of(cfs_rq);
123
124 if (unlikely(rq->curr->sched_class != &fair_sched_class))
125 return NULL;
126
127 return &rq->curr->se;
128}
129
130#define entity_is_task(se) 1
131
132static inline void
133set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
134
135#endif /* CONFIG_FAIR_GROUP_SCHED */
136
137static inline struct task_struct *task_of(struct sched_entity *se)
138{
139 return container_of(se, struct task_struct, se);
140}
141
142
143/**************************************************************
144 * Scheduling class tree data structure manipulation methods:
145 */
146
147/*
148 * Enqueue an entity into the rb-tree:
149 */
150static inline void
151__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
152{
153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
154 struct rb_node *parent = NULL;
155 struct sched_entity *entry;
156 s64 key = se->fair_key;
157 int leftmost = 1;
158
159 /*
160 * Find the right place in the rbtree:
161 */
162 while (*link) {
163 parent = *link;
164 entry = rb_entry(parent, struct sched_entity, run_node);
165 /*
 166 * We don't care about collisions. Nodes with
167 * the same key stay together.
168 */
169 if (key - entry->fair_key < 0) {
170 link = &parent->rb_left;
171 } else {
172 link = &parent->rb_right;
173 leftmost = 0;
174 }
175 }
176
177 /*
178 * Maintain a cache of leftmost tree entries (it is frequently
179 * used):
180 */
181 if (leftmost)
182 cfs_rq->rb_leftmost = &se->run_node;
183
184 rb_link_node(&se->run_node, parent, link);
185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
186 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++;
188 se->on_rq = 1;
189}
190
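Note that the insertion above orders nodes by the sign of the key difference rather than by comparing keys directly; with signed 64-bit keys this keeps the ordering consistent even if the clock-derived key were ever to wrap. A standalone sketch of the difference-based test (illustrative values; the userspace version does the subtraction in unsigned arithmetic to avoid undefined signed overflow):

/* key_cmp.c - sketch of the "key - entry->fair_key < 0" ordering used in
 * __enqueue_entity(). Values are illustrative; the unsigned cast avoids
 * undefined signed overflow in a userspace build. */
#include <stdio.h>
#include <stdint.h>

static int key_before(int64_t a, int64_t b)
{
	return (int64_t)((uint64_t)a - (uint64_t)b) < 0;
}

int main(void)
{
	int64_t near_max = INT64_MAX - 10;			/* just before a wrap */
	int64_t wrapped  = (int64_t)((uint64_t)near_max + 100);	/* past the wrap */

	/* direct comparison claims the later (wrapped) key sorts first ... */
	printf("direct:     %d\n", wrapped < near_max);
	/* ... the difference-based test still puts the earlier key first */
	printf("difference: %d\n", key_before(near_max, wrapped));
	return 0;
}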
191static inline void
192__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
193{
194 if (cfs_rq->rb_leftmost == &se->run_node)
195 cfs_rq->rb_leftmost = rb_next(&se->run_node);
196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
197 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--;
199 se->on_rq = 0;
200}
201
202static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
203{
204 return cfs_rq->rb_leftmost;
205}
206
207static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
208{
209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
210}
211
212/**************************************************************
213 * Scheduling class statistics methods:
214 */
215
216/*
217 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially:
219 */
220static long
221niced_granularity(struct sched_entity *curr, unsigned long granularity)
222{
223 u64 tmp;
224
225 /*
226 * Negative nice levels get the same granularity as nice-0:
227 */
228 if (likely(curr->load.weight >= NICE_0_LOAD))
229 return granularity;
230 /*
231 * Positive nice level tasks get linearly finer
232 * granularity:
233 */
234 tmp = curr->load.weight * (u64)granularity;
235
236 /*
237 * It will always fit into 'long':
238 */
239 return (long) (tmp >> NICE_0_SHIFT);
240}
241
242static inline void
243limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
244{
245 long limit = sysctl_sched_runtime_limit;
246
247 /*
248 * Niced tasks have the same history dynamic range as
249 * non-niced tasks:
250 */
251 if (unlikely(se->wait_runtime > limit)) {
252 se->wait_runtime = limit;
253 schedstat_inc(se, wait_runtime_overruns);
254 schedstat_inc(cfs_rq, wait_runtime_overruns);
255 }
256 if (unlikely(se->wait_runtime < -limit)) {
257 se->wait_runtime = -limit;
258 schedstat_inc(se, wait_runtime_underruns);
259 schedstat_inc(cfs_rq, wait_runtime_underruns);
260 }
261}
262
263static inline void
264__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
265{
266 se->wait_runtime += delta;
267 schedstat_add(se, sum_wait_runtime, delta);
268 limit_wait_runtime(cfs_rq, se);
269}
270
271static void
272add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
273{
274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
275 __add_wait_runtime(cfs_rq, se, delta);
276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
277}
278
279/*
280 * Update the current task's runtime statistics. Skip current tasks that
281 * are not in our scheduling class.
282 */
283static inline void
284__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
285{
286 unsigned long delta, delta_exec, delta_fair;
287 long delta_mine;
288 struct load_weight *lw = &cfs_rq->load;
289 unsigned long load = lw->weight;
290
291 if (unlikely(!load))
292 return;
293
294 delta_exec = curr->delta_exec;
295#ifdef CONFIG_SCHEDSTATS
296 if (unlikely(delta_exec > curr->exec_max))
297 curr->exec_max = delta_exec;
298#endif
299
300 curr->sum_exec_runtime += delta_exec;
301 cfs_rq->exec_clock += delta_exec;
302
303 delta_fair = calc_delta_fair(delta_exec, lw);
304 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
305
306 if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) {
307 delta = calc_delta_mine(cfs_rq->sleeper_bonus,
308 curr->load.weight, lw);
309 if (unlikely(delta > cfs_rq->sleeper_bonus))
310 delta = cfs_rq->sleeper_bonus;
311
312 cfs_rq->sleeper_bonus -= delta;
313 delta_mine -= delta;
314 }
315
316 cfs_rq->fair_clock += delta_fair;
317 /*
318 * We executed delta_exec amount of time on the CPU,
319 * but we were only entitled to delta_mine amount of
320 * time during that period (if nr_running == 1 then
321 * the two values are equal)
322 * [Note: delta_mine - delta_exec is negative]:
323 */
324 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
325}
326
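In other words, __update_curr() credits the running task with delta_mine = delta_exec * weight / total_weight and charges the difference to wait_runtime. A worked sketch of that arithmetic, simplified here to a plain division (the kernel's calc_delta_mine() avoids the division with an inverse-weight multiply) and with illustrative weights:

/* fair_share.c - worked example of the delta_mine accounting above.
 * Simplified: plain division instead of calc_delta_mine()'s shifted
 * inverse-weight multiply; all numbers are illustrative. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_exec = 10000000;		/* current task ran for 10 ms */
	unsigned long w_curr = 1024;		/* weight of the current task */
	unsigned long w_total = 2048;		/* total weight of the runqueue */

	int64_t mine = (int64_t)(delta_exec * w_curr / w_total);

	/* entitled to 5 ms of the 10 ms it ran: wait_runtime drops by 5 ms */
	printf("delta_mine = %lld ns, wait_runtime delta = %lld ns\n",
	       (long long)mine, (long long)(mine - (int64_t)delta_exec));
	return 0;
}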
327static void update_curr(struct cfs_rq *cfs_rq, u64 now)
328{
329 struct sched_entity *curr = cfs_rq_curr(cfs_rq);
330 unsigned long delta_exec;
331
332 if (unlikely(!curr))
333 return;
334
335 /*
336 * Get the amount of time the current task was running
337 * since the last time we changed load (this cannot
338 * overflow on 32 bits):
339 */
340 delta_exec = (unsigned long)(now - curr->exec_start);
341
342 curr->delta_exec += delta_exec;
343
344 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
345 __update_curr(cfs_rq, curr, now);
346 curr->delta_exec = 0;
347 }
348 curr->exec_start = now;
349}
350
351static inline void
352update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
353{
354 se->wait_start_fair = cfs_rq->fair_clock;
355 se->wait_start = now;
356}
357
358/*
359 * We calculate fair deltas here, so protect against the random effects
360 * of a multiplication overflow by capping it to the runtime limit:
361 */
362#if BITS_PER_LONG == 32
363static inline unsigned long
364calc_weighted(unsigned long delta, unsigned long weight, int shift)
365{
366 u64 tmp = (u64)delta * weight >> shift;
367
368 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
369 return sysctl_sched_runtime_limit*2;
370 return tmp;
371}
372#else
373static inline unsigned long
374calc_weighted(unsigned long delta, unsigned long weight, int shift)
375{
376 return delta * weight >> shift;
377}
378#endif
379
380/*
381 * Task is being enqueued - update stats:
382 */
383static void
384update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
385{
386 s64 key;
387
388 /*
389 * Are we enqueueing a waiting task? (for current tasks
390 * a dequeue/enqueue event is a NOP)
391 */
392 if (se != cfs_rq_curr(cfs_rq))
393 update_stats_wait_start(cfs_rq, se, now);
394 /*
395 * Update the key:
396 */
397 key = cfs_rq->fair_clock;
398
399 /*
400 * Optimize the common nice 0 case:
401 */
402 if (likely(se->load.weight == NICE_0_LOAD)) {
403 key -= se->wait_runtime;
404 } else {
405 u64 tmp;
406
407 if (se->wait_runtime < 0) {
408 tmp = -se->wait_runtime;
409 key += (tmp * se->load.inv_weight) >>
410 (WMULT_SHIFT - NICE_0_SHIFT);
411 } else {
412 tmp = se->wait_runtime;
413 key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
414 }
415 }
416
417 se->fair_key = key;
418}
419
420/*
421 * Note: must be called with a freshly updated rq->fair_clock.
422 */
423static inline void
424__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
425{
426 unsigned long delta_fair = se->delta_fair_run;
427
428#ifdef CONFIG_SCHEDSTATS
429 {
430 s64 delta_wait = now - se->wait_start;
431 if (unlikely(delta_wait > se->wait_max))
432 se->wait_max = delta_wait;
433 }
434#endif
435
436 if (unlikely(se->load.weight != NICE_0_LOAD))
437 delta_fair = calc_weighted(delta_fair, se->load.weight,
438 NICE_0_SHIFT);
439
440 add_wait_runtime(cfs_rq, se, delta_fair);
441}
442
443static void
444update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
445{
446 unsigned long delta_fair;
447
448 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
449 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
450
451 se->delta_fair_run += delta_fair;
452 if (unlikely(abs(se->delta_fair_run) >=
453 sysctl_sched_stat_granularity)) {
454 __update_stats_wait_end(cfs_rq, se, now);
455 se->delta_fair_run = 0;
456 }
457
458 se->wait_start_fair = 0;
459 se->wait_start = 0;
460}
461
462static inline void
463update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
464{
465 update_curr(cfs_rq, now);
466 /*
467 * Mark the end of the wait period if dequeueing a
468 * waiting task:
469 */
470 if (se != cfs_rq_curr(cfs_rq))
471 update_stats_wait_end(cfs_rq, se, now);
472}
473
474/*
475 * We are picking a new current task - update its stats:
476 */
477static inline void
478update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
479{
480 /*
481 * We are starting a new run period:
482 */
483 se->exec_start = now;
484}
485
486/*
487 * We are descheduling a task - update its stats:
488 */
489static inline void
490update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
491{
492 se->exec_start = 0;
493}
494
495/**************************************************
496 * Scheduling class queueing methods:
497 */
498
499static void
500__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
501{
502 unsigned long load = cfs_rq->load.weight, delta_fair;
503 long prev_runtime;
504
505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
506 load = rq_of(cfs_rq)->cpu_load[2];
507
508 delta_fair = se->delta_fair_sleep;
509
510 /*
511 * Fix up delta_fair with the effect of us running
512 * during the whole sleep period:
513 */
514 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
515 delta_fair = div64_likely32((u64)delta_fair * load,
516 load + se->load.weight);
517
518 if (unlikely(se->load.weight != NICE_0_LOAD))
519 delta_fair = calc_weighted(delta_fair, se->load.weight,
520 NICE_0_SHIFT);
521
522 prev_runtime = se->wait_runtime;
523 __add_wait_runtime(cfs_rq, se, delta_fair);
524 delta_fair = se->wait_runtime - prev_runtime;
525
526 /*
527 * Track the amount of bonus we've given to sleepers:
528 */
529 cfs_rq->sleeper_bonus += delta_fair;
530
531 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
532}
533
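With SCHED_FEAT_SLEEPER_AVG the sleep credit is discounted by the share the sleeper itself would have consumed had it been runnable: delta_fair * load / (load + its own weight). A small sketch of that discount with made-up numbers:

/* sleeper_avg.c - sketch of the SLEEPER_AVG discount in __enqueue_sleeper().
 * All numbers are illustrative. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_fair = 4000000;	/* 4 ms of fair-clock time slept */
	unsigned long load = 2048;	/* weight of the already-runnable tasks */
	unsigned long se_weight = 1024;	/* weight of the task waking up */

	uint64_t credited = delta_fair * load / (load + se_weight);

	printf("raw credit %llu ns -> credited %llu ns\n",
	       (unsigned long long)delta_fair,
	       (unsigned long long)credited);
	return 0;
}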
534static void
535enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
536{
537 struct task_struct *tsk = task_of(se);
538 unsigned long delta_fair;
539
540 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
541 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
542 return;
543
544 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
545 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
546
547 se->delta_fair_sleep += delta_fair;
548 if (unlikely(abs(se->delta_fair_sleep) >=
549 sysctl_sched_stat_granularity)) {
550 __enqueue_sleeper(cfs_rq, se, now);
551 se->delta_fair_sleep = 0;
552 }
553
554 se->sleep_start_fair = 0;
555
556#ifdef CONFIG_SCHEDSTATS
557 if (se->sleep_start) {
558 u64 delta = now - se->sleep_start;
559
560 if ((s64)delta < 0)
561 delta = 0;
562
563 if (unlikely(delta > se->sleep_max))
564 se->sleep_max = delta;
565
566 se->sleep_start = 0;
567 se->sum_sleep_runtime += delta;
568 }
569 if (se->block_start) {
570 u64 delta = now - se->block_start;
571
572 if ((s64)delta < 0)
573 delta = 0;
574
575 if (unlikely(delta > se->block_max))
576 se->block_max = delta;
577
578 se->block_start = 0;
579 se->sum_sleep_runtime += delta;
580 }
581#endif
582}
583
584static void
585enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
586 int wakeup, u64 now)
587{
588 /*
589 * Update the fair clock.
590 */
591 update_curr(cfs_rq, now);
592
593 if (wakeup)
594 enqueue_sleeper(cfs_rq, se, now);
595
596 update_stats_enqueue(cfs_rq, se, now);
597 __enqueue_entity(cfs_rq, se);
598}
599
600static void
601dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
602 int sleep, u64 now)
603{
604 update_stats_dequeue(cfs_rq, se, now);
605 if (sleep) {
606 se->sleep_start_fair = cfs_rq->fair_clock;
607#ifdef CONFIG_SCHEDSTATS
608 if (entity_is_task(se)) {
609 struct task_struct *tsk = task_of(se);
610
611 if (tsk->state & TASK_INTERRUPTIBLE)
612 se->sleep_start = now;
613 if (tsk->state & TASK_UNINTERRUPTIBLE)
614 se->block_start = now;
615 }
616 cfs_rq->wait_runtime -= se->wait_runtime;
617#endif
618 }
619 __dequeue_entity(cfs_rq, se);
620}
621
622/*
623 * Preempt the current task with a newly woken task if needed:
624 */
625static void
626__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
627 struct sched_entity *curr, unsigned long granularity)
628{
629 s64 __delta = curr->fair_key - se->fair_key;
630
631 /*
632 * Take scheduling granularity into account - do not
633 * preempt the current task unless the best task has
634 * a larger than sched_granularity fairness advantage:
635 */
636 if (__delta > niced_granularity(curr, granularity))
637 resched_task(rq_of(cfs_rq)->curr);
638}
639
640static inline void
641set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
642{
643 /*
 644 * Any task has to be enqueued before it gets to execute on
645 * a CPU. So account for the time it spent waiting on the
646 * runqueue. (note, here we rely on pick_next_task() having
647 * done a put_prev_task_fair() shortly before this, which
648 * updated rq->fair_clock - used by update_stats_wait_end())
649 */
650 update_stats_wait_end(cfs_rq, se, now);
651 update_stats_curr_start(cfs_rq, se, now);
652 set_cfs_rq_curr(cfs_rq, se);
653}
654
655static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
656{
657 struct sched_entity *se = __pick_next_entity(cfs_rq);
658
659 set_next_entity(cfs_rq, se, now);
660
661 return se;
662}
663
664static void
665put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
666{
667 /*
668 * If still on the runqueue then deactivate_task()
669 * was not called and update_curr() has to be done:
670 */
671 if (prev->on_rq)
672 update_curr(cfs_rq, now);
673
674 update_stats_curr_end(cfs_rq, prev, now);
675
676 if (prev->on_rq)
677 update_stats_wait_start(cfs_rq, prev, now);
678 set_cfs_rq_curr(cfs_rq, NULL);
679}
680
681static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
682{
683 struct rq *rq = rq_of(cfs_rq);
684 struct sched_entity *next;
685 u64 now = __rq_clock(rq);
686
687 /*
688 * Dequeue and enqueue the task to update its
689 * position within the tree:
690 */
691 dequeue_entity(cfs_rq, curr, 0, now);
692 enqueue_entity(cfs_rq, curr, 0, now);
693
694 /*
695 * Reschedule if another task tops the current one.
696 */
697 next = __pick_next_entity(cfs_rq);
698 if (next == curr)
699 return;
700
701 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
702}
703
704/**************************************************
705 * CFS operations on tasks:
706 */
707
708#ifdef CONFIG_FAIR_GROUP_SCHED
709
710/* Walk up scheduling entities hierarchy */
711#define for_each_sched_entity(se) \
712 for (; se; se = se->parent)
713
714static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
715{
716 return p->se.cfs_rq;
717}
718
719/* runqueue on which this entity is (to be) queued */
720static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
721{
722 return se->cfs_rq;
723}
724
725/* runqueue "owned" by this group */
726static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
727{
728 return grp->my_q;
729}
730
731/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
732 * another cpu ('this_cpu')
733 */
734static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
735{
736 /* A later patch will take group into account */
737 return &cpu_rq(this_cpu)->cfs;
738}
739
 740/* Iterate through all leaf cfs_rqs on a runqueue */
741#define for_each_leaf_cfs_rq(rq, cfs_rq) \
742 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
743
744/* Do the two (enqueued) tasks belong to the same group ? */
745static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
746{
747 if (curr->se.cfs_rq == p->se.cfs_rq)
748 return 1;
749
750 return 0;
751}
752
753#else /* CONFIG_FAIR_GROUP_SCHED */
754
755#define for_each_sched_entity(se) \
756 for (; se; se = NULL)
757
758static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
759{
760 return &task_rq(p)->cfs;
761}
762
763static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
764{
765 struct task_struct *p = task_of(se);
766 struct rq *rq = task_rq(p);
767
768 return &rq->cfs;
769}
770
771/* runqueue "owned" by this group */
772static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
773{
774 return NULL;
775}
776
777static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
778{
779 return &cpu_rq(this_cpu)->cfs;
780}
781
782#define for_each_leaf_cfs_rq(rq, cfs_rq) \
783 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
784
785static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
786{
787 return 1;
788}
789
790#endif /* CONFIG_FAIR_GROUP_SCHED */
791
792/*
793 * The enqueue_task method is called before nr_running is
794 * increased. Here we update the fair scheduling stats and
795 * then put the task into the rbtree:
796 */
797static void
798enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
799{
800 struct cfs_rq *cfs_rq;
801 struct sched_entity *se = &p->se;
802
803 for_each_sched_entity(se) {
804 if (se->on_rq)
805 break;
806 cfs_rq = cfs_rq_of(se);
807 enqueue_entity(cfs_rq, se, wakeup, now);
808 }
809}
810
811/*
812 * The dequeue_task method is called before nr_running is
813 * decreased. We remove the task from the rbtree and
814 * update the fair scheduling stats:
815 */
816static void
817dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
818{
819 struct cfs_rq *cfs_rq;
820 struct sched_entity *se = &p->se;
821
822 for_each_sched_entity(se) {
823 cfs_rq = cfs_rq_of(se);
824 dequeue_entity(cfs_rq, se, sleep, now);
825 /* Don't dequeue parent if it has other entities besides us */
826 if (cfs_rq->load.weight)
827 break;
828 }
829}
830
831/*
832 * sched_yield() support is very simple - we dequeue and enqueue
833 */
834static void yield_task_fair(struct rq *rq, struct task_struct *p)
835{
836 struct cfs_rq *cfs_rq = task_cfs_rq(p);
837 u64 now = __rq_clock(rq);
838
839 /*
840 * Dequeue and enqueue the task to update its
841 * position within the tree:
842 */
843 dequeue_entity(cfs_rq, &p->se, 0, now);
844 enqueue_entity(cfs_rq, &p->se, 0, now);
845}
846
847/*
848 * Preempt the current task with a newly woken task if needed:
849 */
850static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
851{
852 struct task_struct *curr = rq->curr;
853 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
854 unsigned long gran;
855
856 if (unlikely(rt_prio(p->prio))) {
857 update_curr(cfs_rq, rq_clock(rq));
858 resched_task(curr);
859 return;
860 }
861
862 gran = sysctl_sched_wakeup_granularity;
863 /*
864 * Batch tasks prefer throughput over latency:
865 */
866 if (unlikely(p->policy == SCHED_BATCH))
867 gran = sysctl_sched_batch_wakeup_granularity;
868
869 if (is_same_group(curr, p))
870 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
871}
872
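check_preempt_curr_fair() above is where SCHED_BATCH tasks pick up the larger batch wakeup granularity. A minimal userspace sketch of opting a process into SCHED_BATCH via the long-standing sched_setscheduler() interface:

/* batchify.c - switch the calling process to SCHED_BATCH so wakeup
 * preemption uses sysctl_sched_batch_wakeup_granularity. */
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_BATCH
#define SCHED_BATCH 3		/* some older libc headers lack the define */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_BATCH, &sp) != 0) {	/* 0 == self */
		perror("sched_setscheduler");
		return 1;
	}
	printf("policy is now %d (SCHED_BATCH)\n", sched_getscheduler(0));
	/* ... run the throughput-oriented workload here ... */
	return 0;
}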
873static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
874{
875 struct cfs_rq *cfs_rq = &rq->cfs;
876 struct sched_entity *se;
877
878 if (unlikely(!cfs_rq->nr_running))
879 return NULL;
880
881 do {
882 se = pick_next_entity(cfs_rq, now);
883 cfs_rq = group_cfs_rq(se);
884 } while (cfs_rq);
885
886 return task_of(se);
887}
888
889/*
890 * Account for a descheduled task:
891 */
892static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
893{
894 struct sched_entity *se = &prev->se;
895 struct cfs_rq *cfs_rq;
896
897 for_each_sched_entity(se) {
898 cfs_rq = cfs_rq_of(se);
899 put_prev_entity(cfs_rq, se, now);
900 }
901}
902
903/**************************************************
904 * Fair scheduling class load-balancing methods:
905 */
906
907/*
908 * Load-balancing iterator. Note: while the runqueue stays locked
909 * during the whole iteration, the current task might be
910 * dequeued so the iterator has to be dequeue-safe. Here we
911 * achieve that by always pre-iterating before returning
912 * the current task:
913 */
914static inline struct task_struct *
915__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
916{
917 struct task_struct *p;
918
919 if (!curr)
920 return NULL;
921
922 p = rb_entry(curr, struct task_struct, se.run_node);
923 cfs_rq->rb_load_balance_curr = rb_next(curr);
924
925 return p;
926}
927
928static struct task_struct *load_balance_start_fair(void *arg)
929{
930 struct cfs_rq *cfs_rq = arg;
931
932 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
933}
934
935static struct task_struct *load_balance_next_fair(void *arg)
936{
937 struct cfs_rq *cfs_rq = arg;
938
939 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
940}
941
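The pre-iterate-then-return pattern above is what makes the walk dequeue-safe: by the time the caller sees a task, the cursor already points past it. A standalone sketch of the same idea on a plain singly linked list (the node type here is made up for illustration):

/* safe_iter.c - sketch of the dequeue-safe iterator pattern used by
 * load_balance_start_fair()/load_balance_next_fair(). The list type is
 * illustrative, not a kernel type. */
#include <stdio.h>

struct node { int val; struct node *next; };

static struct node *cursor;		/* plays the role of rb_load_balance_curr */

static struct node *iter_next(void)
{
	struct node *cur = cursor;

	if (!cur)
		return NULL;
	cursor = cur->next;		/* advance first ... */
	return cur;			/* ... so 'cur' may be unlinked safely */
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *n;

	cursor = &a;
	while ((n = iter_next()) != NULL) {
		printf("%d\n", n->val);
		n->next = NULL;		/* "dequeue" what was just returned */
	}
	return 0;
}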
942static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
943{
944 struct sched_entity *curr;
945 struct task_struct *p;
946
947 if (!cfs_rq->nr_running)
948 return MAX_PRIO;
949
950 curr = __pick_next_entity(cfs_rq);
951 p = task_of(curr);
952
953 return p->prio;
954}
955
956static int
957load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
958 unsigned long max_nr_move, unsigned long max_load_move,
959 struct sched_domain *sd, enum cpu_idle_type idle,
960 int *all_pinned, unsigned long *total_load_moved)
961{
962 struct cfs_rq *busy_cfs_rq;
963 unsigned long load_moved, total_nr_moved = 0, nr_moved;
964 long rem_load_move = max_load_move;
965 struct rq_iterator cfs_rq_iterator;
966
967 cfs_rq_iterator.start = load_balance_start_fair;
968 cfs_rq_iterator.next = load_balance_next_fair;
969
970 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
971 struct cfs_rq *this_cfs_rq;
972 long imbalance;
973 unsigned long maxload;
974 int this_best_prio, best_prio, best_prio_seen = 0;
975
976 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
977
978 imbalance = busy_cfs_rq->load.weight -
979 this_cfs_rq->load.weight;
980 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
981 if (imbalance <= 0)
982 continue;
983
984 /* Don't pull more than imbalance/2 */
985 imbalance /= 2;
986 maxload = min(rem_load_move, imbalance);
987
988 this_best_prio = cfs_rq_best_prio(this_cfs_rq);
989 best_prio = cfs_rq_best_prio(busy_cfs_rq);
990
991 /*
992 * Enable handling of the case where there is more than one task
993 * with the best priority. If the current running task is one
994 * of those with prio==best_prio we know it won't be moved
995 * and therefore it's safe to override the skip (based on load)
996 * of any task we find with that prio.
997 */
998 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
999 best_prio_seen = 1;
1000
1001 /* pass busy_cfs_rq argument into
1002 * load_balance_[start|next]_fair iterators
1003 */
1004 cfs_rq_iterator.arg = busy_cfs_rq;
1005 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
1006 max_nr_move, maxload, sd, idle, all_pinned,
1007 &load_moved, this_best_prio, best_prio,
1008 best_prio_seen, &cfs_rq_iterator);
1009
1010 total_nr_moved += nr_moved;
1011 max_nr_move -= nr_moved;
1012 rem_load_move -= load_moved;
1013
1014 if (max_nr_move <= 0 || rem_load_move <= 0)
1015 break;
1016 }
1017
1018 *total_load_moved = max_load_move - rem_load_move;
1019
1020 return total_nr_moved;
1021}
1022
1023/*
1024 * scheduler tick hitting a task of our scheduling class:
1025 */
1026static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1027{
1028 struct cfs_rq *cfs_rq;
1029 struct sched_entity *se = &curr->se;
1030
1031 for_each_sched_entity(se) {
1032 cfs_rq = cfs_rq_of(se);
1033 entity_tick(cfs_rq, se);
1034 }
1035}
1036
1037/*
1038 * Share the fairness runtime between parent and child, thus the
1039 * total amount of pressure for CPU stays equal - new tasks
1040 * get a chance to run but frequent forkers are not allowed to
1041 * monopolize the CPU. Note: the parent runqueue is locked,
1042 * the child is not running yet.
1043 */
1044static void task_new_fair(struct rq *rq, struct task_struct *p)
1045{
1046 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1047 struct sched_entity *se = &p->se;
1048 u64 now = rq_clock(rq);
1049
1050 sched_info_queued(p);
1051
1052 update_stats_enqueue(cfs_rq, se, now);
1053 /*
1054 * Child runs first: we let it run before the parent
1055 * until it reschedules once. We set up the key so that
1056 * it will preempt the parent:
1057 */
1058 p->se.fair_key = current->se.fair_key -
1059 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
1060 /*
1061 * The first wait is dominated by the child-runs-first logic,
1062 * so do not credit it with that waiting time yet:
1063 */
1064 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1065 p->se.wait_start_fair = 0;
1066
1067 /*
1068 * The statistical average of wait_runtime is about
1069 * -granularity/2, so initialize the task with that:
1070 */
1071 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1072 p->se.wait_runtime = -(sysctl_sched_granularity / 2);
1073
1074 __enqueue_entity(cfs_rq, se);
1075 inc_nr_running(p, rq, now);
1076}
1077
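task_new_fair() therefore starts the child just left of the parent in the tree and, with START_DEBIT, with roughly half a granularity of negative wait_runtime. A worked sketch of the initial values (the parent key and the granularity are made-up numbers):

/* fork_key.c - worked example of the child's initial key and wait_runtime
 * as set up in task_new_fair(). All values are illustrative. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t parent_key = 123456789;		/* parent's se.fair_key */
	int64_t granularity = 2000000;		/* 2 ms, nice-0 */

	/* child sorts just before the parent, so it will preempt it */
	int64_t child_key = parent_key - granularity - 1;

	/* START_DEBIT: begin life owing about half a granularity */
	int64_t child_wait_runtime = -(granularity / 2);

	printf("child key %lld (parent %lld), wait_runtime %lld ns\n",
	       (long long)child_key, (long long)parent_key,
	       (long long)child_wait_runtime);
	return 0;
}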
1078#ifdef CONFIG_FAIR_GROUP_SCHED
1079/* Account for a task changing its policy or group.
1080 *
1081 * This routine is mostly called to set cfs_rq->curr field when a task
1082 * migrates between groups/classes.
1083 */
1084static void set_curr_task_fair(struct rq *rq)
1085{
1086 struct task_struct *curr = rq->curr;
1087 struct sched_entity *se = &curr->se;
1088 u64 now = rq_clock(rq);
1089 struct cfs_rq *cfs_rq;
1090
1091 for_each_sched_entity(se) {
1092 cfs_rq = cfs_rq_of(se);
1093 set_next_entity(cfs_rq, se, now);
1094 }
1095}
1096#else
1097static void set_curr_task_fair(struct rq *rq)
1098{
1099}
1100#endif
1101
1102/*
1103 * All the scheduling class methods:
1104 */
1105struct sched_class fair_sched_class __read_mostly = {
1106 .enqueue_task = enqueue_task_fair,
1107 .dequeue_task = dequeue_task_fair,
1108 .yield_task = yield_task_fair,
1109
1110 .check_preempt_curr = check_preempt_curr_fair,
1111
1112 .pick_next_task = pick_next_task_fair,
1113 .put_prev_task = put_prev_task_fair,
1114
1115 .load_balance = load_balance_fair,
1116
1117 .set_curr_task = set_curr_task_fair,
1118 .task_tick = task_tick_fair,
1119 .task_new = task_new_fair,
1120};
1121
1122#ifdef CONFIG_SCHED_DEBUG
1123void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
1124{
1125 struct rq *rq = cpu_rq(cpu);
1126 struct cfs_rq *cfs_rq;
1127
1128 for_each_leaf_cfs_rq(rq, cfs_rq)
1129 print_cfs_rq(m, cpu, cfs_rq, now);
1130}
1131#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
new file mode 100644
index 000000000000..41841e741c4a
--- /dev/null
+++ b/kernel/sched_idletask.c
@@ -0,0 +1,71 @@
1/*
2 * idle-task scheduling class.
3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c)
6 */
7
8/*
9 * Idle tasks are unconditionally rescheduled:
10 */
11static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
12{
13 resched_task(rq->idle);
14}
15
16static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
17{
18 schedstat_inc(rq, sched_goidle);
19
20 return rq->idle;
21}
22
23/*
24 * It is not legal to sleep in the idle task - print a warning
25 * message if some code attempts to do it:
26 */
27static void
28dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
29{
30 spin_unlock_irq(&rq->lock);
31 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
32 dump_stack();
33 spin_lock_irq(&rq->lock);
34}
35
36static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
37{
38}
39
40static int
41load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, unsigned long *total_load_moved)
45{
46 return 0;
47}
48
49static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{
51}
52
53/*
54 * Simple, special scheduling class for the per-CPU idle tasks:
55 */
56static struct sched_class idle_sched_class __read_mostly = {
57 /* no enqueue/yield_task for idle tasks */
58
59 /* dequeue is not valid, we print a debug message there: */
60 .dequeue_task = dequeue_task_idle,
61
62 .check_preempt_curr = check_preempt_curr_idle,
63
64 .pick_next_task = pick_next_task_idle,
65 .put_prev_task = put_prev_task_idle,
66
67 .load_balance = load_balance_idle,
68
69 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */
71};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
new file mode 100644
index 000000000000..1192a2741b99
--- /dev/null
+++ b/kernel/sched_rt.c
@@ -0,0 +1,255 @@
1/*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6/*
7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class.
9 */
10static inline void update_curr_rt(struct rq *rq, u64 now)
11{
12 struct task_struct *curr = rq->curr;
13 u64 delta_exec;
14
15 if (!task_has_rt_policy(curr))
16 return;
17
18 delta_exec = now - curr->se.exec_start;
19 if (unlikely((s64)delta_exec < 0))
20 delta_exec = 0;
21 if (unlikely(delta_exec > curr->se.exec_max))
22 curr->se.exec_max = delta_exec;
23
24 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = now;
26}
27
28static void
29enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
30{
31 struct rt_prio_array *array = &rq->rt.active;
32
33 list_add_tail(&p->run_list, array->queue + p->prio);
34 __set_bit(p->prio, array->bitmap);
35}
36
37/*
38 * Adding/removing a task to/from a priority array:
39 */
40static void
41dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
42{
43 struct rt_prio_array *array = &rq->rt.active;
44
45 update_curr_rt(rq, now);
46
47 list_del(&p->run_list);
48 if (list_empty(array->queue + p->prio))
49 __clear_bit(p->prio, array->bitmap);
50}
51
52/*
53 * Put task to the end of the run list without the overhead of dequeue
54 * followed by enqueue.
55 */
56static void requeue_task_rt(struct rq *rq, struct task_struct *p)
57{
58 struct rt_prio_array *array = &rq->rt.active;
59
60 list_move_tail(&p->run_list, array->queue + p->prio);
61}
62
63static void
64yield_task_rt(struct rq *rq, struct task_struct *p)
65{
66 requeue_task_rt(rq, p);
67}
68
69/*
70 * Preempt the current task with a newly woken task if needed:
71 */
72static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
73{
74 if (p->prio < rq->curr->prio)
75 resched_task(rq->curr);
76}
77
78static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
79{
80 struct rt_prio_array *array = &rq->rt.active;
81 struct task_struct *next;
82 struct list_head *queue;
83 int idx;
84
85 idx = sched_find_first_bit(array->bitmap);
86 if (idx >= MAX_RT_PRIO)
87 return NULL;
88
89 queue = array->queue + idx;
90 next = list_entry(queue->next, struct task_struct, run_list);
91
92 next->se.exec_start = now;
93
94 return next;
95}
96
97static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
98{
99 update_curr_rt(rq, now);
100 p->se.exec_start = 0;
101}
102
103/*
104 * Load-balancing iterator. Note: while the runqueue stays locked
105 * during the whole iteration, the current task might be
106 * dequeued so the iterator has to be dequeue-safe. Here we
107 * achieve that by always pre-iterating before returning
108 * the current task:
109 */
110static struct task_struct *load_balance_start_rt(void *arg)
111{
112 struct rq *rq = arg;
113 struct rt_prio_array *array = &rq->rt.active;
114 struct list_head *head, *curr;
115 struct task_struct *p;
116 int idx;
117
118 idx = sched_find_first_bit(array->bitmap);
119 if (idx >= MAX_RT_PRIO)
120 return NULL;
121
122 head = array->queue + idx;
123 curr = head->prev;
124
125 p = list_entry(curr, struct task_struct, run_list);
126
127 curr = curr->prev;
128
129 rq->rt.rt_load_balance_idx = idx;
130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr;
132
133 return p;
134}
135
136static struct task_struct *load_balance_next_rt(void *arg)
137{
138 struct rq *rq = arg;
139 struct rt_prio_array *array = &rq->rt.active;
140 struct list_head *head, *curr;
141 struct task_struct *p;
142 int idx;
143
144 idx = rq->rt.rt_load_balance_idx;
145 head = rq->rt.rt_load_balance_head;
146 curr = rq->rt.rt_load_balance_curr;
147
148 /*
149 * If we arrived back to the head again then
150 * iterate to the next queue (if any):
151 */
152 if (unlikely(head == curr)) {
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
154
155 if (next_idx >= MAX_RT_PRIO)
156 return NULL;
157
158 idx = next_idx;
159 head = array->queue + idx;
160 curr = head->prev;
161
162 rq->rt.rt_load_balance_idx = idx;
163 rq->rt.rt_load_balance_head = head;
164 }
165
166 p = list_entry(curr, struct task_struct, run_list);
167
168 curr = curr->prev;
169
170 rq->rt.rt_load_balance_curr = curr;
171
172 return p;
173}
174
175static int
176load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 unsigned long max_nr_move, unsigned long max_load_move,
178 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, unsigned long *load_moved)
180{
181 int this_best_prio, best_prio, best_prio_seen = 0;
182 int nr_moved;
183 struct rq_iterator rt_rq_iterator;
184
185 best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
186 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
187
188 /*
189 * Enable handling of the case where there is more than one task
190 * with the best priority. If the current running task is one
191 * of those with prio==best_prio we know it won't be moved
192 * and therefore it's safe to override the skip (based on load)
193 * of any task we find with that prio.
194 */
195 if (busiest->curr->prio == best_prio)
196 best_prio_seen = 1;
197
198 rt_rq_iterator.start = load_balance_start_rt;
199 rt_rq_iterator.next = load_balance_next_rt;
200 /* pass 'busiest' rq argument into
201 * load_balance_[start|next]_rt iterators
202 */
203 rt_rq_iterator.arg = busiest;
204
205 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
206 max_load_move, sd, idle, all_pinned, load_moved,
207 this_best_prio, best_prio, best_prio_seen,
208 &rt_rq_iterator);
209
210 return nr_moved;
211}
212
213static void task_tick_rt(struct rq *rq, struct task_struct *p)
214{
215 /*
216 * RR tasks need a special form of timeslice management.
217 * FIFO tasks have no timeslices.
218 */
219 if (p->policy != SCHED_RR)
220 return;
221
222 if (--p->time_slice)
223 return;
224
225 p->time_slice = static_prio_timeslice(p->static_prio);
226 set_tsk_need_resched(p);
227
228 /* put it at the end of the queue: */
229 requeue_task_rt(rq, p);
230}
231
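task_tick_rt() refills the quantum of SCHED_RR tasks from static_prio_timeslice(); SCHED_FIFO tasks never get one. From userspace the effective RR quantum of a task can be queried with the long-standing sched_rr_get_interval() call:

/* rr_quantum.c - query the SCHED_RR timeslice of a task (0 == self). */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <time.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;
	struct timespec ts;

	if (sched_rr_get_interval(pid, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("RR quantum: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}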
232/*
233 * No parent/child timeslice management necessary for RT tasks,
234 * just activate them:
235 */
236static void task_new_rt(struct rq *rq, struct task_struct *p)
237{
238 activate_task(rq, p, 1);
239}
240
241static struct sched_class rt_sched_class __read_mostly = {
242 .enqueue_task = enqueue_task_rt,
243 .dequeue_task = dequeue_task_rt,
244 .yield_task = yield_task_rt,
245
246 .check_preempt_curr = check_preempt_curr_rt,
247
248 .pick_next_task = pick_next_task_rt,
249 .put_prev_task = put_prev_task_rt,
250
251 .load_balance = load_balance_rt,
252
253 .task_tick = task_tick_rt,
254 .task_new = task_new_rt,
255};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
new file mode 100644
index 000000000000..c63c38f6fa6e
--- /dev/null
+++ b/kernel/sched_stats.h
@@ -0,0 +1,235 @@
1
2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 14
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12
13 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
14 seq_printf(seq, "timestamp %lu\n", jiffies);
15 for_each_online_cpu(cpu) {
16 struct rq *rq = cpu_rq(cpu);
17#ifdef CONFIG_SMP
18 struct sched_domain *sd;
19 int dcnt = 0;
20#endif
21
22 /* runqueue-specific stats */
23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
27 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
28 rq->ttwu_cnt, rq->ttwu_local,
29 rq->rq_sched_info.cpu_time,
30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
31
32 seq_printf(seq, "\n");
33
34#ifdef CONFIG_SMP
35 /* domain-specific stats */
36 preempt_disable();
37 for_each_domain(cpu, sd) {
38 enum cpu_idle_type itype;
39 char mask_str[NR_CPUS];
40
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
42 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
46 "%lu",
47 sd->lb_cnt[itype],
48 sd->lb_balanced[itype],
49 sd->lb_failed[itype],
50 sd->lb_imbalance[itype],
51 sd->lb_gained[itype],
52 sd->lb_hot_gained[itype],
53 sd->lb_nobusyq[itype],
54 sd->lb_nobusyg[itype]);
55 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
57 " %lu %lu %lu\n",
58 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
59 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
60 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
61 sd->ttwu_wake_remote, sd->ttwu_move_affine,
62 sd->ttwu_move_balance);
63 }
64 preempt_enable();
65#endif
66 }
67 return 0;
68}
69
70static int schedstat_open(struct inode *inode, struct file *file)
71{
72 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
73 char *buf = kmalloc(size, GFP_KERNEL);
74 struct seq_file *m;
75 int res;
76
77 if (!buf)
78 return -ENOMEM;
79 res = single_open(file, show_schedstat, NULL);
80 if (!res) {
81 m = file->private_data;
82 m->buf = buf;
83 m->size = size;
84 } else
85 kfree(buf);
86 return res;
87}
88
89const struct file_operations proc_schedstat_operations = {
90 .open = schedstat_open,
91 .read = seq_read,
92 .llseek = seq_lseek,
93 .release = single_release,
94};
95
96/*
97 * Expects runqueue lock to be held for atomicity of update
98 */
99static inline void
100rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
101{
102 if (rq) {
103 rq->rq_sched_info.run_delay += delta;
104 rq->rq_sched_info.pcnt++;
105 }
106}
107
108/*
109 * Expects runqueue lock to be held for atomicity of update
110 */
111static inline void
112rq_sched_info_depart(struct rq *rq, unsigned long long delta)
113{
114 if (rq)
115 rq->rq_sched_info.cpu_time += delta;
116}
117# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
118# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
119#else /* !CONFIG_SCHEDSTATS */
120static inline void
121rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
122{}
123static inline void
124rq_sched_info_depart(struct rq *rq, unsigned long long delta)
125{}
126# define schedstat_inc(rq, field) do { } while (0)
127# define schedstat_add(rq, field, amt) do { } while (0)
128#endif
129
130#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
131/*
132 * Called when a process is dequeued from the active array and given
133 * the cpu. We should note that with the exception of interactive
134 * tasks, the expired queue will become the active queue after the active
135 * queue is empty, without explicitly dequeuing and requeuing tasks in the
136 * expired queue. (Interactive tasks may be requeued directly to the
137 * active queue, thus delaying tasks in the expired queue from running;
138 * see scheduler_tick()).
139 *
140 * This function is only called from sched_info_arrive(), rather than
141 * dequeue_task(). Even though a task may be queued and dequeued multiple
142 * times as it is shuffled about, we're really interested in knowing how
143 * long it was from the *first* time it was queued to the time that it
144 * finally hit a cpu.
145 */
146static inline void sched_info_dequeued(struct task_struct *t)
147{
148 t->sched_info.last_queued = 0;
149}
150
151/*
152 * Called when a task finally hits the cpu. We can now calculate how
153 * long it was waiting to run. We also note when it began so that we
154 * can keep stats on how long its timeslice is.
155 */
156static void sched_info_arrive(struct task_struct *t)
157{
158 unsigned long long now = sched_clock(), delta = 0;
159
160 if (t->sched_info.last_queued)
161 delta = now - t->sched_info.last_queued;
162 sched_info_dequeued(t);
163 t->sched_info.run_delay += delta;
164 t->sched_info.last_arrival = now;
165 t->sched_info.pcnt++;
166
167 rq_sched_info_arrive(task_rq(t), delta);
168}
169
170/*
171 * Called when a process is queued into either the active or expired
 172 * array. The time is noted and later used to determine how long the task
 173 * had to wait to reach the cpu. Since the expired queue will
174 * become the active queue after active queue is empty, without dequeuing
175 * and requeuing any tasks, we are interested in queuing to either. It
176 * is unusual but not impossible for tasks to be dequeued and immediately
177 * requeued in the same or another array: this can happen in sched_yield(),
178 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
179 * to runqueue.
180 *
181 * This function is only called from enqueue_task(), but also only updates
 182 * the timestamp if it is not already set. It's assumed that
183 * sched_info_dequeued() will clear that stamp when appropriate.
184 */
185static inline void sched_info_queued(struct task_struct *t)
186{
187 if (unlikely(sched_info_on()))
188 if (!t->sched_info.last_queued)
189 t->sched_info.last_queued = sched_clock();
190}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily.  Now we can calculate how long we ran.
+ */
+static inline void sched_info_depart(struct task_struct *t)
+{
+	unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
+
+	t->sched_info.cpu_time += delta;
+	rq_sched_info_depart(task_rq(t), delta);
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice.  (This may also be called when switching to or from
+ * the idle task.)  We are only called when prev != next.
+ */
+static inline void
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+	struct rq *rq = task_rq(prev);
+
+	/*
+	 * prev now departs the cpu.  It's not interesting to record
+	 * stats about how efficient we were at scheduling the idle
+	 * process, however.
+	 */
+	if (prev != rq->idle)
+		sched_info_depart(prev);
+
+	if (next != rq->idle)
+		sched_info_arrive(next);
+}
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+	if (unlikely(sched_info_on()))
+		__sched_info_switch(prev, next);
+}
+#else
+#define sched_info_queued(t)		do { } while (0)
+#define sched_info_switch(t, next)	do { } while (0)
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+
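
The accounting above boils down to three per-task counters: run_delay (time spent runnable but waiting on a runqueue, measured from the *first* enqueue), cpu_time (time actually spent on a cpu), and pcnt (how many times the task was given a cpu). The user-space sketch below is not part of the patch; it mirrors the queued -> arrive -> depart update rules with a mock clock and a mock sched_info structure, purely to illustrate how the counters grow.

/* Illustrative user-space mock of the sched_info bookkeeping above.
 * mock_sched_info and clock_ns are stand-ins for the kernel's
 * task_struct::sched_info and sched_clock(); the update rules mirror
 * sched_info_queued()/sched_info_arrive()/sched_info_depart(). */
#include <stdio.h>

struct mock_sched_info {
	unsigned long long last_queued;		/* when first placed on a runqueue */
	unsigned long long last_arrival;	/* when it last got a cpu */
	unsigned long long run_delay;		/* total time runnable but waiting */
	unsigned long long cpu_time;		/* total time spent on a cpu */
	unsigned long pcnt;			/* number of times it ran */
};

static unsigned long long clock_ns;		/* stand-in for sched_clock() */

static void mock_info_queued(struct mock_sched_info *si)
{
	if (!si->last_queued)			/* only the *first* enqueue counts */
		si->last_queued = clock_ns;
}

static void mock_info_arrive(struct mock_sched_info *si)
{
	if (si->last_queued)
		si->run_delay += clock_ns - si->last_queued;
	si->last_queued = 0;			/* sched_info_dequeued() */
	si->last_arrival = clock_ns;
	si->pcnt++;
}

static void mock_info_depart(struct mock_sched_info *si)
{
	si->cpu_time += clock_ns - si->last_arrival;
}

int main(void)
{
	struct mock_sched_info si = { 0 };

	clock_ns = 1000; mock_info_queued(&si);	/* woken, starts waiting...     */
	clock_ns = 4000; mock_info_arrive(&si);	/* ...waited 3000 ns, now runs  */
	clock_ns = 9000; mock_info_depart(&si);	/* ...ran for 5000 ns           */

	printf("run_delay=%llu cpu_time=%llu pcnt=%lu\n",
	       si.run_delay, si.cpu_time, si.pcnt);
	return 0;
}

With this mock timeline the program prints run_delay=3000 cpu_time=5000 pcnt=1, i.e. the task waited 3000 ns on a runqueue and then ran for 5000 ns.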
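
For completeness, these per-task totals are what /proc/<pid>/schedstat exposes when schedstats or task delay accounting is enabled. The reader below assumes the conventional three whitespace-separated fields (cpu_time, run_delay, pcnt, the first two in sched_clock() nanoseconds); it is a sketch, and the layout should be checked against Documentation/sched-stats.txt on the target kernel.

/* Print a task's scheduler statistics from /proc/<pid>/schedstat.
 * The field layout (cpu_time run_delay pcnt) is an assumption based on
 * the schedstats documentation; adjust if your kernel differs. */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64];
	unsigned long long cpu_time, run_delay;
	unsigned long pcnt;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/schedstat",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%llu %llu %lu", &cpu_time, &run_delay, &pcnt) != 3) {
		fprintf(stderr, "unexpected format in %s\n", path);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("on-cpu time:   %llu ns\n", cpu_time);
	printf("runqueue wait: %llu ns\n", run_delay);
	printf("timeslices:    %lu\n", pcnt);
	return 0;
}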
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0b9886a00e74..73217a9e2875 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -488,7 +488,6 @@ void __init softirq_init(void)
 
 static int ksoftirqd(void * __bind_cpu)
 {
-	set_user_nice(current, 19);
 	current->flags |= PF_NOFREEZE;
 
 	set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 30ee462ee79f..51f5dac42a00 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -206,7 +206,87 @@ static ctl_table root_table[] = {
 	{ .ctl_name = 0 }
 };
 
+#ifdef CONFIG_SCHED_DEBUG
+static unsigned long min_sched_granularity_ns = 100000;		/* 100 usecs */
+static unsigned long max_sched_granularity_ns = 1000000000;	/* 1 second */
+static unsigned long min_wakeup_granularity_ns;			/* 0 usecs */
+static unsigned long max_wakeup_granularity_ns = 1000000000;	/* 1 second */
+#endif
+
 static ctl_table kern_table[] = {
+#ifdef CONFIG_SCHED_DEBUG
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_granularity_ns",
+		.data		= &sysctl_sched_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_wakeup_granularity_ns",
+		.data		= &sysctl_sched_wakeup_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_wakeup_granularity_ns,
+		.extra2		= &max_wakeup_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_batch_wakeup_granularity_ns",
+		.data		= &sysctl_sched_batch_wakeup_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_wakeup_granularity_ns,
+		.extra2		= &max_wakeup_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_stat_granularity_ns",
+		.data		= &sysctl_sched_stat_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_wakeup_granularity_ns,
+		.extra2		= &max_wakeup_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_runtime_limit_ns",
+		.data		= &sysctl_sched_runtime_limit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_child_runs_first",
+		.data		= &sysctl_sched_child_runs_first,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_features",
+		.data		= &sysctl_sched_features,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.ctl_name	= KERN_PANIC,
 		.procname	= "panic",
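
All of the new entries use CTL_UNNUMBERED, so they are reachable only through procfs, under /proc/sys/kernel/ (the file name is the .procname field), and the proc_dointvec_minmax handlers clamp writes to the bounds wired in via extra1/extra2. Below is a sketch of reading and nudging one of the knobs from user space; it assumes a kernel built with CONFIG_SCHED_DEBUG=y and root privileges for the write, and is an illustration rather than part of the patch.

/* Read and update /proc/sys/kernel/sched_granularity_ns.
 * The path assumes CONFIG_SCHED_DEBUG=y; writes are clamped by the
 * kernel to the bounds in the table above (100000 .. 1000000000 ns
 * for this particular knob). */
#include <stdio.h>

static const char *knob = "/proc/sys/kernel/sched_granularity_ns";

int main(void)
{
	unsigned int cur;
	FILE *f = fopen(knob, "r");

	if (!f || fscanf(f, "%u", &cur) != 1) {
		perror(knob);
		return 1;
	}
	fclose(f);
	printf("current granularity: %u ns\n", cur);

	/* Double it, staying within the table's documented upper bound. */
	f = fopen(knob, "w");
	if (!f) {
		perror(knob);		/* typically requires root */
		return 1;
	}
	fprintf(f, "%u\n", cur * 2 > 1000000000u ? 1000000000u : cur * 2);
	fclose(f);
	return 0;
}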
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index da95e10cfd70..fab32a286371 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -105,6 +105,15 @@ config DETECT_SOFTLOCKUP
 	  can be detected via the NMI-watchdog, on platforms that
 	  support it.)
 
+config SCHED_DEBUG
+	bool "Collect scheduler debugging info"
+	depends on DEBUG_KERNEL && PROC_FS
+	default y
+	help
+	  If you say Y here, the /proc/sched_debug file will be provided
+	  that can help debug the scheduler. The runtime overhead of this
+	  option is minimal.
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
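
The /proc/sched_debug file added by the new SCHED_DEBUG option is plain text, so inspecting it needs nothing more than an ordinary read. The sketch below simply copies the file to stdout and makes no assumptions about its layout; it is illustrative only.

/* Dump /proc/sched_debug (present when CONFIG_SCHED_DEBUG=y). */
#include <stdio.h>

int main(void)
{
	char buf[4096];
	size_t n;
	FILE *f = fopen("/proc/sched_debug", "r");

	if (!f) {
		perror("/proc/sched_debug");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
	return 0;
}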