-rw-r--r-- Documentation/kernel-parameters.txt | 43
-rw-r--r-- Documentation/sched-design-CFS.txt | 119
-rw-r--r-- arch/i386/kernel/smpboot.c | 12
-rw-r--r-- arch/i386/kernel/tsc.c | 9
-rw-r--r-- arch/ia64/kernel/setup.c | 6
-rw-r--r-- arch/mips/kernel/smp.c | 11
-rw-r--r-- arch/sparc/kernel/smp.c | 10
-rw-r--r-- arch/sparc64/kernel/smp.c | 27
-rw-r--r-- drivers/ide/arm/icside.c | 16
-rw-r--r-- drivers/ide/cris/ide-cris.c | 2
-rw-r--r-- drivers/ide/ide-cd.c | 6
-rw-r--r-- drivers/ide/ide-cd.h | 2
-rw-r--r-- drivers/ide/ide-disk.c | 8
-rw-r--r-- drivers/ide/ide-dma.c | 110
-rw-r--r-- drivers/ide/ide-io.c | 4
-rw-r--r-- drivers/ide/ide-iops.c | 8
-rw-r--r-- drivers/ide/ide-probe.c | 10
-rw-r--r-- drivers/ide/ide-proc.c | 34
-rw-r--r-- drivers/ide/ide-timing.h | 56
-rw-r--r-- drivers/ide/ide.c | 33
-rw-r--r-- drivers/ide/legacy/hd.c | 2
-rw-r--r-- drivers/ide/legacy/macide.c | 14
-rw-r--r-- drivers/ide/mips/au1xxx-ide.c | 24
-rw-r--r-- drivers/ide/pci/aec62xx.c | 119
-rw-r--r-- drivers/ide/pci/alim15x3.c | 78
-rw-r--r-- drivers/ide/pci/amd74xx.c | 127
-rw-r--r-- drivers/ide/pci/atiixp.c | 5
-rw-r--r-- drivers/ide/pci/cmd64x.c | 130
-rw-r--r-- drivers/ide/pci/cs5535.c | 6
-rw-r--r-- drivers/ide/pci/hpt366.c | 170
-rw-r--r-- drivers/ide/pci/it8213.c | 8
-rw-r--r-- drivers/ide/pci/it821x.c | 9
-rw-r--r-- drivers/ide/pci/jmicron.c | 20
-rw-r--r-- drivers/ide/pci/pdc202xx_new.c | 9
-rw-r--r-- drivers/ide/pci/pdc202xx_old.c | 35
-rw-r--r-- drivers/ide/pci/piix.c | 45
-rw-r--r-- drivers/ide/pci/scc_pata.c | 2
-rw-r--r-- drivers/ide/pci/serverworks.c | 103
-rw-r--r-- drivers/ide/pci/sgiioc4.c | 20
-rw-r--r-- drivers/ide/pci/siimage.c | 18
-rw-r--r-- drivers/ide/pci/sis5513.c | 34
-rw-r--r-- drivers/ide/pci/sl82c105.c | 20
-rw-r--r-- drivers/ide/pci/slc90e66.c | 5
-rw-r--r-- drivers/ide/pci/tc86c001.c | 4
-rw-r--r-- drivers/ide/pci/via82cxxx.c | 175
-rw-r--r-- drivers/ide/ppc/pmac.c | 42
-rw-r--r-- fs/jfs/endian24.h | 2
-rw-r--r-- fs/jfs/jfs_debug.c | 28
-rw-r--r-- fs/jfs/jfs_debug.h | 2
-rw-r--r-- fs/jfs/jfs_dinode.h | 42
-rw-r--r-- fs/jfs/jfs_dmap.c | 419
-rw-r--r-- fs/jfs/jfs_dmap.h | 118
-rw-r--r-- fs/jfs/jfs_dtree.c | 105
-rw-r--r-- fs/jfs/jfs_dtree.h | 2
-rw-r--r-- fs/jfs/jfs_extent.c | 102
-rw-r--r-- fs/jfs/jfs_filsys.h | 13
-rw-r--r-- fs/jfs/jfs_imap.c | 296
-rw-r--r-- fs/jfs/jfs_imap.h | 98
-rw-r--r-- fs/jfs/jfs_incore.h | 4
-rw-r--r-- fs/jfs/jfs_logmgr.c | 90
-rw-r--r-- fs/jfs/jfs_logmgr.h | 26
-rw-r--r-- fs/jfs/jfs_metapage.c | 3
-rw-r--r-- fs/jfs/jfs_mount.c | 6
-rw-r--r-- fs/jfs/jfs_txnmgr.c | 302
-rw-r--r-- fs/jfs/jfs_txnmgr.h | 2
-rw-r--r-- fs/jfs/jfs_types.h | 20
-rw-r--r-- fs/jfs/jfs_umount.c | 2
-rw-r--r-- fs/jfs/jfs_xtree.c | 428
-rw-r--r-- fs/jfs/jfs_xtree.h | 48
-rw-r--r-- fs/jfs/namei.c | 26
-rw-r--r-- fs/jfs/resize.c | 48
-rw-r--r-- fs/jfs/xattr.c | 9
-rw-r--r-- fs/proc/array.c | 59
-rw-r--r-- fs/proc/base.c | 71
-rw-r--r-- include/asm-generic/bitops/sched.h | 21
-rw-r--r-- include/asm-mips/mach-au1x00/au1xxx_ide.h | 28
-rw-r--r-- include/linux/hardirq.h | 13
-rw-r--r-- include/linux/ide.h | 18
-rw-r--r-- include/linux/sched.h | 251
-rw-r--r-- include/linux/topology.h | 12
-rw-r--r-- include/linux/wait.h | 16
-rw-r--r-- init/main.c | 5
-rw-r--r-- kernel/delayacct.c | 10
-rw-r--r-- kernel/exit.c | 5
-rw-r--r-- kernel/fork.c | 4
-rw-r--r-- kernel/posix-cpu-timers.c | 34
-rw-r--r-- kernel/sched.c | 3021
-rw-r--r-- kernel/sched_debug.c | 275
-rw-r--r-- kernel/sched_fair.c | 1131
-rw-r--r-- kernel/sched_idletask.c | 71
-rw-r--r-- kernel/sched_rt.c | 255
-rw-r--r-- kernel/sched_stats.h | 235
-rw-r--r-- kernel/softirq.c | 1
-rw-r--r-- kernel/sysctl.c | 80
-rw-r--r-- lib/Kconfig.debug | 9
95 files changed, 5518 insertions, 4098 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index af50f9bbe68e..4d880b3d1f35 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1014,49 +1014,6 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	mga=		[HW,DRM]
 
-	migration_cost=
-			[KNL,SMP] debug: override scheduler migration costs
-			Format: <level-1-usecs>,<level-2-usecs>,...
-			This debugging option can be used to override the
-			default scheduler migration cost matrix. The numbers
-			are indexed by 'CPU domain distance'.
-			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
-			box will set up an intra-core migration cost of
-			1 msec, an inter-core migration cost of 2 msecs,
-			and an inter-node migration cost of 3 msecs.
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
-	migration_debug=
-			[KNL,SMP] migration cost auto-detect verbosity
-			Format=<0|1|2>
-			If a system's migration matrix reported at bootup
-			seems erroneous then this option can be used to
-			increase verbosity of the detection process.
-			We default to 0 (no extra messages), 1 will print
-			some more information, and 2 will be really
-			verbose (probably only useful if you also have a
-			serial console attached to the system).
-
-	migration_factor=
-			[KNL,SMP] multiply/divide migration costs by a factor
-			Format=<percent>
-			This debug option can be used to proportionally
-			increase or decrease the auto-detected migration
-			costs for all entries of the migration matrix.
-			E.g. migration_factor=150 will increase migration
-			costs by 50%. (and thus the scheduler will be less
-			eager migrating cache-hot tasks)
-			migration_factor=80 will decrease migration costs
-			by 20%. (thus the scheduler will be more eager to
-			migrate tasks)
-
-			WARNING: using the wrong values here can break
-			scheduler performance, so it's only for scheduler
-			development purposes, not production environments.
-
 	mousedev.tap_time=
 			[MOUSE] Maximum time between finger touching and
 			leaving touchpad surface for touch to be considered
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
new file mode 100644
index 000000000000..16feebb7bdc0
--- /dev/null
+++ b/Documentation/sched-design-CFS.txt
@@ -0,0 +1,119 @@
+
+This is the CFS scheduler.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically
+models an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100%
+physical power and which can run each task at precise equal speed, in
+parallel, each at 1/nr_running speed. For example: if there are 2 tasks
+running then it runs each at 50% physical power - totally in parallel.
+
+On real hardware, we can run only a single task at once, so while that
+one task runs, the other tasks that are waiting for the CPU are at a
+disadvantage - the current task gets an unfair amount of CPU time. In
+CFS this fairness imbalance is expressed and tracked via the per-task
+p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of
+time the task should now run on the CPU for it to become completely fair
+and balanced.
+
+( small detail: on 'ideal' hardware, the p->wait_runtime value would
+  always be zero - no task would ever get 'out of balance' from the
+  'ideal' share of CPU time. )
+
+CFS's task picking logic is based on this p->wait_runtime value and it
+is thus very simple: it always tries to run the task with the largest
+p->wait_runtime value. In other words, CFS tries to run the task with
+the 'gravest need' for more CPU time. So CFS always tries to split up
+CPU time between runnable tasks as close to 'ideal multitasking
+hardware' as possible.
+
+Most of the rest of CFS's design just falls out of this really simple
+concept, with a few add-on embellishments like nice levels,
+multiprocessing and various algorithm variants to recognize sleepers.
+
+In practice it works like this: the system runs a task a bit, and when
+the task schedules (or a scheduler tick happens) the task's CPU usage is
+'accounted for': the (small) time it just spent using the physical CPU
+is deducted from p->wait_runtime. [minus the 'fair share' it would have
+gotten anyway]. Once p->wait_runtime gets low enough so that another
+task becomes the 'leftmost task' of the time-ordered rbtree it maintains
+(plus a small amount of 'granularity' distance relative to the leftmost
+task so that we do not over-schedule tasks and trash the cache) then the
+new leftmost task is picked and the current task is preempted.
+
+The rq->fair_clock value tracks the 'CPU time a runnable task would have
+fairly gotten, had it been runnable during that time'. So by using
+rq->fair_clock values we can accurately timestamp and measure the
+'expected CPU time' a task should have gotten. All runnable tasks are
+sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and
+CFS picks the 'leftmost' task and sticks to it. As the system progresses
+forwards, newly woken tasks are put into the tree more and more to the
+right - slowly but surely giving a chance for every task to become the
+'leftmost task' and thus get on the CPU within a deterministic amount of
+time.
+
+Some implementation details:
+
+ - the introduction of Scheduling Classes: an extensible hierarchy of
+   scheduler modules. These modules encapsulate scheduling policy
+   details and are handled by the scheduler core without the core
+   code assuming too much about them.
+
+ - sched_fair.c implements the 'CFS desktop scheduler': it is a
+   replacement for the vanilla scheduler's SCHED_OTHER interactivity
+   code.
+
+   I'd like to give credit to Con Kolivas for the general approach here:
+   he has proven via RSDL/SD that 'fair scheduling' is possible and that
+   it results in better desktop scheduling. Kudos Con!
+
+   The CFS patch uses a completely different approach and implementation
+   from RSDL/SD. My goal was to make CFS's interactivity quality exceed
+   that of RSDL/SD, which is a high standard to meet :-) Testing
+   feedback is welcome to decide this one way or another. [ and, in any
+   case, all of SD's logic could be added via a kernel/sched_sd.c module
+   as well, if Con is interested in such an approach. ]
+
+   CFS's design is quite radical: it does not use runqueues, it uses a
+   time-ordered rbtree to build a 'timeline' of future task execution,
+   and thus has no 'array switch' artifacts (by which both the vanilla
+   scheduler and RSDL/SD are affected).
+
+   CFS uses nanosecond granularity accounting and does not rely on any
+   jiffies or other HZ detail. Thus the CFS scheduler has no notion of
+   'timeslices' and has no heuristics whatsoever. There is only one
+   central tunable:
+
+         /proc/sys/kernel/sched_granularity_ns
+
+   which can be used to tune the scheduler from 'desktop' (low
+   latencies) to 'server' (good batching) workloads. It defaults to a
+   setting suitable for desktop workloads. SCHED_BATCH is handled by the
+   CFS scheduler module too.
+
+   Due to its design, the CFS scheduler is not prone to any of the
+   'attacks' that exist today against the heuristics of the stock
+   scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
+   work fine and do not impact interactivity and produce the expected
+   behavior.
+
+   the CFS scheduler has a much stronger handling of nice levels and
+   SCHED_BATCH: both types of workloads should be isolated much more
+   aggressively than under the vanilla scheduler.
+
+   ( another detail: due to nanosec accounting and timeline sorting,
+     sched_yield() support is very simple under CFS, and in fact under
+     CFS sched_yield() behaves much better than under any other
+     scheduler I have tested so far. )
+
+ - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
+   way than the vanilla scheduler does. It uses 100 runqueues (for all
+   100 RT priority levels, instead of 140 in the vanilla scheduler)
+   and it needs no expired array.
+
+ - reworked/sanitized SMP load-balancing: the runqueue-walking
+   assumptions are gone from the load-balancing code now, and
+   iterators of the scheduling modules are used. The balancing code got
+   quite a bit simpler as a result.
+
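The document above boils the scheduler down to one queueing rule: every runnable task is keyed by rq->fair_clock - p->wait_runtime, and CFS always runs the task with the smallest key (the 'leftmost' one in the rbtree). The following is a minimal user-space sketch of that rule only, with a flat array standing in for the kernel's rbtree and purely illustrative task names and numbers - it is not the kernel implementation.

/*
 * Sketch of the CFS picking rule described above (illustrative only):
 * the task with the smallest "fair_clock - wait_runtime" key, i.e. the
 * task that is owed the most CPU time, is chosen to run next.
 */
#include <stdio.h>

struct task {
	const char *name;
	long long wait_runtime;		/* ns of CPU time the task is "owed" */
};

/* key used to order the timeline; a smaller key means "runs sooner" */
static long long timeline_key(long long fair_clock, const struct task *p)
{
	return fair_clock - p->wait_runtime;
}

/* pick the "leftmost" task, i.e. the one with the smallest key */
static const struct task *pick_next(long long fair_clock,
				    const struct task *tasks, int nr)
{
	const struct task *best = &tasks[0];
	int i;

	for (i = 1; i < nr; i++)
		if (timeline_key(fair_clock, &tasks[i]) <
		    timeline_key(fair_clock, best))
			best = &tasks[i];

	return best;
}

int main(void)
{
	/* illustrative tasks: wait_runtime is how much CPU time each is owed */
	struct task tasks[] = {
		{ "editor",   2000000 },	/* owed 2 ms */
		{ "compiler",  500000 },	/* owed 0.5 ms */
		{ "daemon",          0 },	/* owed nothing */
	};
	long long fair_clock = 10000000;	/* 10 ms of fair CPU time so far */

	/* prints "next: editor" - the task owed the most CPU time wins */
	printf("next: %s\n", pick_next(fair_clock, tasks, 3)->name);
	return 0;
}

Because the key subtracts wait_runtime, the task that has waited longest sorts leftmost and is picked first, which is exactly the 'gravest need' behaviour the text describes; the real scheduler gets the same effect in O(log n) by keeping the tasks in an rbtree instead of scanning an array.
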
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 88baed1e7e83..0b2954534b8e 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -941,17 +941,6 @@ exit:
941} 941}
942#endif 942#endif
943 943
944static void smp_tune_scheduling(void)
945{
946 if (cpu_khz) {
947 /* cache size in kB */
948 long cachesize = boot_cpu_data.x86_cache_size;
949
950 if (cachesize > 0)
951 max_cache_size = cachesize * 1024;
952 }
953}
954
955/* 944/*
956 * Cycle through the processors sending APIC IPIs to boot each. 945 * Cycle through the processors sending APIC IPIs to boot each.
957 */ 946 */
@@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
980 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; 969 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
981 970
982 current_thread_info()->cpu = 0; 971 current_thread_info()->cpu = 0;
983 smp_tune_scheduling();
984 972
985 set_cpu_sibling_map(0); 973 set_cpu_sibling_map(0);
986 974
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index f64b81f3033b..ea63a30ca3e8 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -4,6 +4,7 @@
4 * See comments there for proper credits. 4 * See comments there for proper credits.
5 */ 5 */
6 6
7#include <linux/sched.h>
7#include <linux/clocksource.h> 8#include <linux/clocksource.h>
8#include <linux/workqueue.h> 9#include <linux/workqueue.h>
9#include <linux/cpufreq.h> 10#include <linux/cpufreq.h>
@@ -106,8 +107,13 @@ unsigned long long sched_clock(void)
106 107
107 /* 108 /*
108 * Fall back to jiffies if there's no TSC available: 109 * Fall back to jiffies if there's no TSC available:
110 * ( But note that we still use it if the TSC is marked
111 * unstable. We do this because unlike Time Of Day,
112 * the scheduler clock tolerates small errors and it's
113 * very important for it to be as fast as the platform
 114 * can achieve it. )
109 */ 115 */
110 if (unlikely(!tsc_enabled)) 116 if (unlikely(!tsc_enabled && !tsc_unstable))
111 /* No locking but a rare wrong value is not a big deal: */ 117 /* No locking but a rare wrong value is not a big deal: */
112 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 118 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
113 119
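The comment added in the hunk above states the policy: sched_clock() keeps using the TSC even after it has been marked unstable, because the scheduler clock only needs to be fast and roughly accurate, and the coarse jiffies fallback is taken only when no TSC is available at all. A compact stand-alone sketch of that decision (illustrative user-space C, not the kernel's sched_clock()):

/*
 * Sketch of the fallback policy from the hunk above: prefer the fast
 * cycle counter even when it is "unstable"; fall back to the coarse
 * jiffies-based clock only when no cycle counter exists at all.
 */
#include <stdio.h>

#define HZ 250ULL

static unsigned long long sched_clock_sketch(int tsc_present, int tsc_unstable,
					     unsigned long long cycles,
					     unsigned long long ns_per_cycle,
					     unsigned long long jiffies)
{
	if (!tsc_present)				/* no TSC at all */
		return jiffies * (1000000000ULL / HZ);

	(void)tsc_unstable;	/* small errors are tolerated here */
	return cycles * ns_per_cycle;
}

int main(void)
{
	/* unstable TSC still used: prints the cycle-based value */
	printf("%llu\n", sched_clock_sketch(1, 1, 1000000ULL, 1ULL, 12345ULL));
	/* no TSC: prints the jiffies-based value */
	printf("%llu\n", sched_clock_sketch(0, 0, 0ULL, 0ULL, 12345ULL));
	return 0;
}
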
@@ -277,6 +283,7 @@ static struct clocksource clocksource_tsc = {
277 283
278void mark_tsc_unstable(char *reason) 284void mark_tsc_unstable(char *reason)
279{ 285{
286 sched_clock_unstable_event();
280 if (!tsc_unstable) { 287 if (!tsc_unstable) {
281 tsc_unstable = 1; 288 tsc_unstable = 1;
282 tsc_enabled = 0; 289 tsc_enabled = 0;
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index eaa6a24bc0b6..188fb73c6845 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -805,7 +805,6 @@ static void __cpuinit
805get_max_cacheline_size (void) 805get_max_cacheline_size (void)
806{ 806{
807 unsigned long line_size, max = 1; 807 unsigned long line_size, max = 1;
808 unsigned int cache_size = 0;
809 u64 l, levels, unique_caches; 808 u64 l, levels, unique_caches;
810 pal_cache_config_info_t cci; 809 pal_cache_config_info_t cci;
811 s64 status; 810 s64 status;
@@ -835,8 +834,6 @@ get_max_cacheline_size (void)
835 line_size = 1 << cci.pcci_line_size; 834 line_size = 1 << cci.pcci_line_size;
836 if (line_size > max) 835 if (line_size > max)
837 max = line_size; 836 max = line_size;
838 if (cache_size < cci.pcci_cache_size)
839 cache_size = cci.pcci_cache_size;
840 if (!cci.pcci_unified) { 837 if (!cci.pcci_unified) {
841 status = ia64_pal_cache_config_info(l, 838 status = ia64_pal_cache_config_info(l,
842 /* cache_type (instruction)= */ 1, 839 /* cache_type (instruction)= */ 1,
@@ -853,9 +850,6 @@ get_max_cacheline_size (void)
853 ia64_i_cache_stride_shift = cci.pcci_stride; 850 ia64_i_cache_stride_shift = cci.pcci_stride;
854 } 851 }
855 out: 852 out:
856#ifdef CONFIG_SMP
857 max_cache_size = max(max_cache_size, cache_size);
858#endif
859 if (max > ia64_max_cacheline_size) 853 if (max > ia64_max_cacheline_size)
860 ia64_max_cacheline_size = max; 854 ia64_max_cacheline_size = max;
861} 855}
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 67edfa7ed93a..a1b017f2dbb3 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS]; /* Map logical to physical */
51EXPORT_SYMBOL(phys_cpu_present_map); 51EXPORT_SYMBOL(phys_cpu_present_map);
52EXPORT_SYMBOL(cpu_online_map); 52EXPORT_SYMBOL(cpu_online_map);
53 53
54/* This happens early in bootup, can't really do it better */
55static void smp_tune_scheduling (void)
56{
57 struct cache_desc *cd = &current_cpu_data.scache;
58 unsigned long cachesize = cd->linesz * cd->sets * cd->ways;
59
60 if (cachesize > max_cache_size)
61 max_cache_size = cachesize;
62}
63
64extern void __init calibrate_delay(void); 54extern void __init calibrate_delay(void);
65extern ATTRIB_NORET void cpu_idle(void); 55extern ATTRIB_NORET void cpu_idle(void);
66 56
@@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
228{ 218{
229 init_new_context(current, &init_mm); 219 init_new_context(current, &init_mm);
230 current_thread_info()->cpu = 0; 220 current_thread_info()->cpu = 0;
231 smp_tune_scheduling();
232 plat_prepare_cpus(max_cpus); 221 plat_prepare_cpus(max_cpus);
233#ifndef CONFIG_HOTPLUG_CPU 222#ifndef CONFIG_HOTPLUG_CPU
234 cpu_present_map = cpu_possible_map; 223 cpu_present_map = cpu_possible_map;
diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c
index 4d9ad59031bb..4fea3ac7bff0 100644
--- a/arch/sparc/kernel/smp.c
+++ b/arch/sparc/kernel/smp.c
@@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id)
68 cpu_data(id).prom_node = cpu_node; 68 cpu_data(id).prom_node = cpu_node;
69 cpu_data(id).mid = cpu_get_hwmid(cpu_node); 69 cpu_data(id).mid = cpu_get_hwmid(cpu_node);
70 70
71 /* this is required to tune the scheduler correctly */
72 /* is it possible to have CPUs with different cache sizes? */
73 if (id == boot_cpu_id) {
74 int cache_line,cache_nlines;
75 cache_line = 0x20;
76 cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
77 cache_nlines = 0x8000;
78 cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
79 max_cache_size = cache_line * cache_nlines;
80 }
81 if (cpu_data(id).mid < 0) 71 if (cpu_data(id).mid < 0)
82 panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); 72 panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
83} 73}
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 4dcd7d0b60f2..40e40f968d61 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int multiplier)
1163 return -EINVAL; 1163 return -EINVAL;
1164} 1164}
1165 1165
1166static void __init smp_tune_scheduling(void)
1167{
1168 unsigned int smallest = ~0U;
1169 int i;
1170
1171 for (i = 0; i < NR_CPUS; i++) {
1172 unsigned int val = cpu_data(i).ecache_size;
1173
1174 if (val && val < smallest)
1175 smallest = val;
1176 }
1177
1178 /* Any value less than 256K is nonsense. */
1179 if (smallest < (256U * 1024U))
1180 smallest = 256 * 1024;
1181
1182 max_cache_size = smallest;
1183
1184 if (smallest < 1U * 1024U * 1024U)
1185 printk(KERN_INFO "Using max_cache_size of %uKB\n",
1186 smallest / 1024U);
1187 else
1188 printk(KERN_INFO "Using max_cache_size of %uMB\n",
1189 smallest / 1024U / 1024U);
1190}
1191
1192/* Constrain the number of cpus to max_cpus. */ 1166/* Constrain the number of cpus to max_cpus. */
1193void __init smp_prepare_cpus(unsigned int max_cpus) 1167void __init smp_prepare_cpus(unsigned int max_cpus)
1194{ 1168{
@@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
1206 } 1180 }
1207 1181
1208 cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy; 1182 cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
1209 smp_tune_scheduling();
1210} 1183}
1211 1184
1212void __devinit smp_prepare_boot_cpu(void) 1185void __devinit smp_prepare_boot_cpu(void)
diff --git a/drivers/ide/arm/icside.c b/drivers/ide/arm/icside.c
index 66f826252aee..444a0b84f5bd 100644
--- a/drivers/ide/arm/icside.c
+++ b/drivers/ide/arm/icside.c
@@ -448,23 +448,21 @@ static int icside_dma_test_irq(ide_drive_t *drive)
448 ICS_ARCIN_V6_INTRSTAT_1)) & 1; 448 ICS_ARCIN_V6_INTRSTAT_1)) & 1;
449} 449}
450 450
451static int icside_dma_timeout(ide_drive_t *drive) 451static void icside_dma_timeout(ide_drive_t *drive)
452{ 452{
453 printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name); 453 printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name);
454 454
455 if (icside_dma_test_irq(drive)) 455 if (icside_dma_test_irq(drive))
456 return 0; 456 return;
457 457
458 ide_dump_status(drive, "DMA timeout", 458 ide_dump_status(drive, "DMA timeout", HWIF(drive)->INB(IDE_STATUS_REG));
459 HWIF(drive)->INB(IDE_STATUS_REG));
460 459
461 return icside_dma_end(drive); 460 icside_dma_end(drive);
462} 461}
463 462
464static int icside_dma_lostirq(ide_drive_t *drive) 463static void icside_dma_lost_irq(ide_drive_t *drive)
465{ 464{
466 printk(KERN_ERR "%s: IRQ lost\n", drive->name); 465 printk(KERN_ERR "%s: IRQ lost\n", drive->name);
467 return 1;
468} 466}
469 467
470static void icside_dma_init(ide_hwif_t *hwif) 468static void icside_dma_init(ide_hwif_t *hwif)
@@ -490,8 +488,8 @@ static void icside_dma_init(ide_hwif_t *hwif)
490 hwif->dma_start = icside_dma_start; 488 hwif->dma_start = icside_dma_start;
491 hwif->ide_dma_end = icside_dma_end; 489 hwif->ide_dma_end = icside_dma_end;
492 hwif->ide_dma_test_irq = icside_dma_test_irq; 490 hwif->ide_dma_test_irq = icside_dma_test_irq;
493 hwif->ide_dma_timeout = icside_dma_timeout; 491 hwif->dma_timeout = icside_dma_timeout;
494 hwif->ide_dma_lostirq = icside_dma_lostirq; 492 hwif->dma_lost_irq = icside_dma_lost_irq;
495 493
496 hwif->drives[0].autodma = hwif->autodma; 494 hwif->drives[0].autodma = hwif->autodma;
497 hwif->drives[1].autodma = hwif->autodma; 495 hwif->drives[1].autodma = hwif->autodma;
diff --git a/drivers/ide/cris/ide-cris.c b/drivers/ide/cris/ide-cris.c
index ca0341c05e55..886091bc7db0 100644
--- a/drivers/ide/cris/ide-cris.c
+++ b/drivers/ide/cris/ide-cris.c
@@ -819,7 +819,7 @@ init_e100_ide (void)
819 hwif->dma_host_off = &cris_dma_off; 819 hwif->dma_host_off = &cris_dma_off;
820 hwif->dma_host_on = &cris_dma_on; 820 hwif->dma_host_on = &cris_dma_on;
821 hwif->dma_off_quietly = &cris_dma_off; 821 hwif->dma_off_quietly = &cris_dma_off;
822 hwif->udma_four = 0; 822 hwif->cbl = ATA_CBL_PATA40;
823 hwif->ultra_mask = cris_ultra_mask; 823 hwif->ultra_mask = cris_ultra_mask;
824 hwif->mwdma_mask = 0x07; /* Multiword DMA 0-2 */ 824 hwif->mwdma_mask = 0x07; /* Multiword DMA 0-2 */
825 hwif->autodma = 1; 825 hwif->autodma = 1;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 252ab8295edf..1486eb212ccc 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -481,7 +481,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive,
481 else 481 else
482 printk(" Unknown Error Type: "); 482 printk(" Unknown Error Type: ");
483 483
484 if (sense->sense_key < ARY_LEN(sense_key_texts)) 484 if (sense->sense_key < ARRAY_SIZE(sense_key_texts))
485 s = sense_key_texts[sense->sense_key]; 485 s = sense_key_texts[sense->sense_key];
486 486
487 printk("%s -- (Sense key=0x%02x)\n", s, sense->sense_key); 487 printk("%s -- (Sense key=0x%02x)\n", s, sense->sense_key);
@@ -491,7 +491,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive,
491 sense->ascq); 491 sense->ascq);
492 s = buf; 492 s = buf;
493 } else { 493 } else {
494 int lo = 0, mid, hi = ARY_LEN(sense_data_texts); 494 int lo = 0, mid, hi = ARRAY_SIZE(sense_data_texts);
495 unsigned long key = (sense->sense_key << 16); 495 unsigned long key = (sense->sense_key << 16);
496 key |= (sense->asc << 8); 496 key |= (sense->asc << 8);
497 if (!(sense->ascq >= 0x80 && sense->ascq <= 0xdd)) 497 if (!(sense->ascq >= 0x80 && sense->ascq <= 0xdd))
@@ -524,7 +524,7 @@ void cdrom_analyze_sense_data(ide_drive_t *drive,
524 524
525 if (failed_command != NULL) { 525 if (failed_command != NULL) {
526 526
527 int lo=0, mid, hi= ARY_LEN (packet_command_texts); 527 int lo=0, mid, hi= ARRAY_SIZE(packet_command_texts);
528 s = NULL; 528 s = NULL;
529 529
530 while (hi > lo) { 530 while (hi > lo) {
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index ad1f2ed14a37..228b29c5d2e4 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -498,8 +498,6 @@ struct cdrom_info {
498 * Descriptions of ATAPI error codes. 498 * Descriptions of ATAPI error codes.
499 */ 499 */
500 500
501#define ARY_LEN(a) ((sizeof(a) / sizeof(a[0])))
502
503/* This stuff should be in cdrom.h, since it is now generic... */ 501/* This stuff should be in cdrom.h, since it is now generic... */
504 502
505/* ATAPI sense keys (from table 140 of ATAPI 2.6) */ 503/* ATAPI sense keys (from table 140 of ATAPI 2.6) */
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index dc2175c81f5e..b1304a7f3e0a 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -1190,11 +1190,11 @@ static int idedisk_ioctl(struct inode *inode, struct file *file,
1190 return generic_ide_ioctl(drive, file, bdev, cmd, arg); 1190 return generic_ide_ioctl(drive, file, bdev, cmd, arg);
1191 1191
1192read_val: 1192read_val:
1193 down(&ide_setting_sem); 1193 mutex_lock(&ide_setting_mtx);
1194 spin_lock_irqsave(&ide_lock, flags); 1194 spin_lock_irqsave(&ide_lock, flags);
1195 err = *val; 1195 err = *val;
1196 spin_unlock_irqrestore(&ide_lock, flags); 1196 spin_unlock_irqrestore(&ide_lock, flags);
1197 up(&ide_setting_sem); 1197 mutex_unlock(&ide_setting_mtx);
1198 return err >= 0 ? put_user(err, (long __user *)arg) : err; 1198 return err >= 0 ? put_user(err, (long __user *)arg) : err;
1199 1199
1200set_val: 1200set_val:
@@ -1204,9 +1204,9 @@ set_val:
1204 if (!capable(CAP_SYS_ADMIN)) 1204 if (!capable(CAP_SYS_ADMIN))
1205 err = -EACCES; 1205 err = -EACCES;
1206 else { 1206 else {
1207 down(&ide_setting_sem); 1207 mutex_lock(&ide_setting_mtx);
1208 err = setfunc(drive, arg); 1208 err = setfunc(drive, arg);
1209 up(&ide_setting_sem); 1209 mutex_unlock(&ide_setting_mtx);
1210 } 1210 }
1211 } 1211 }
1212 return err; 1212 return err;
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index ead141e2db9e..5fe1d72ab451 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -91,45 +91,45 @@
91 91
92static const struct drive_list_entry drive_whitelist [] = { 92static const struct drive_list_entry drive_whitelist [] = {
93 93
94 { "Micropolis 2112A" , "ALL" }, 94 { "Micropolis 2112A" , NULL },
95 { "CONNER CTMA 4000" , "ALL" }, 95 { "CONNER CTMA 4000" , NULL },
96 { "CONNER CTT8000-A" , "ALL" }, 96 { "CONNER CTT8000-A" , NULL },
97 { "ST34342A" , "ALL" }, 97 { "ST34342A" , NULL },
98 { NULL , NULL } 98 { NULL , NULL }
99}; 99};
100 100
101static const struct drive_list_entry drive_blacklist [] = { 101static const struct drive_list_entry drive_blacklist [] = {
102 102
103 { "WDC AC11000H" , "ALL" }, 103 { "WDC AC11000H" , NULL },
104 { "WDC AC22100H" , "ALL" }, 104 { "WDC AC22100H" , NULL },
105 { "WDC AC32500H" , "ALL" }, 105 { "WDC AC32500H" , NULL },
106 { "WDC AC33100H" , "ALL" }, 106 { "WDC AC33100H" , NULL },
107 { "WDC AC31600H" , "ALL" }, 107 { "WDC AC31600H" , NULL },
108 { "WDC AC32100H" , "24.09P07" }, 108 { "WDC AC32100H" , "24.09P07" },
109 { "WDC AC23200L" , "21.10N21" }, 109 { "WDC AC23200L" , "21.10N21" },
110 { "Compaq CRD-8241B" , "ALL" }, 110 { "Compaq CRD-8241B" , NULL },
111 { "CRD-8400B" , "ALL" }, 111 { "CRD-8400B" , NULL },
112 { "CRD-8480B", "ALL" }, 112 { "CRD-8480B", NULL },
113 { "CRD-8482B", "ALL" }, 113 { "CRD-8482B", NULL },
114 { "CRD-84" , "ALL" }, 114 { "CRD-84" , NULL },
115 { "SanDisk SDP3B" , "ALL" }, 115 { "SanDisk SDP3B" , NULL },
116 { "SanDisk SDP3B-64" , "ALL" }, 116 { "SanDisk SDP3B-64" , NULL },
117 { "SANYO CD-ROM CRD" , "ALL" }, 117 { "SANYO CD-ROM CRD" , NULL },
118 { "HITACHI CDR-8" , "ALL" }, 118 { "HITACHI CDR-8" , NULL },
119 { "HITACHI CDR-8335" , "ALL" }, 119 { "HITACHI CDR-8335" , NULL },
120 { "HITACHI CDR-8435" , "ALL" }, 120 { "HITACHI CDR-8435" , NULL },
121 { "Toshiba CD-ROM XM-6202B" , "ALL" }, 121 { "Toshiba CD-ROM XM-6202B" , NULL },
122 { "TOSHIBA CD-ROM XM-1702BC", "ALL" }, 122 { "TOSHIBA CD-ROM XM-1702BC", NULL },
123 { "CD-532E-A" , "ALL" }, 123 { "CD-532E-A" , NULL },
124 { "E-IDE CD-ROM CR-840", "ALL" }, 124 { "E-IDE CD-ROM CR-840", NULL },
125 { "CD-ROM Drive/F5A", "ALL" }, 125 { "CD-ROM Drive/F5A", NULL },
126 { "WPI CDD-820", "ALL" }, 126 { "WPI CDD-820", NULL },
127 { "SAMSUNG CD-ROM SC-148C", "ALL" }, 127 { "SAMSUNG CD-ROM SC-148C", NULL },
128 { "SAMSUNG CD-ROM SC", "ALL" }, 128 { "SAMSUNG CD-ROM SC", NULL },
129 { "ATAPI CD-ROM DRIVE 40X MAXIMUM", "ALL" }, 129 { "ATAPI CD-ROM DRIVE 40X MAXIMUM", NULL },
130 { "_NEC DV5800A", "ALL" }, 130 { "_NEC DV5800A", NULL },
131 { "SAMSUNG CD-ROM SN-124", "N001" }, 131 { "SAMSUNG CD-ROM SN-124", "N001" },
132 { "Seagate STT20000A", "ALL" }, 132 { "Seagate STT20000A", NULL },
133 { NULL , NULL } 133 { NULL , NULL }
134 134
135}; 135};
@@ -147,8 +147,8 @@ int ide_in_drive_list(struct hd_driveid *id, const struct drive_list_entry *driv
147{ 147{
148 for ( ; drive_table->id_model ; drive_table++) 148 for ( ; drive_table->id_model ; drive_table++)
149 if ((!strcmp(drive_table->id_model, id->model)) && 149 if ((!strcmp(drive_table->id_model, id->model)) &&
150 ((strstr(id->fw_rev, drive_table->id_firmware)) || 150 (!drive_table->id_firmware ||
151 (!strcmp(drive_table->id_firmware, "ALL")))) 151 strstr(id->fw_rev, drive_table->id_firmware)))
152 return 1; 152 return 1;
153 return 0; 153 return 0;
154} 154}
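The two hunks above change the drive-table convention: a NULL id_firmware entry now means "match any firmware revision", replacing the old "ALL" sentinel string, and ide_in_drive_list() only does the strstr() substring check when a specific revision is listed. A stand-alone sketch of that matching rule, in plain user-space C with illustrative names rather than the driver's own types:

/*
 * Simplified model of the list-matching rule above: NULL firmware
 * matches any revision, otherwise the reported firmware string must
 * contain the listed revision as a substring.
 */
#include <stdio.h>
#include <string.h>

struct entry {
	const char *model;
	const char *firmware;	/* NULL == any firmware revision */
};

static int in_list(const char *model, const char *fw_rev,
		   const struct entry *table)
{
	for (; table->model; table++)
		if (strcmp(table->model, model) == 0 &&
		    (table->firmware == NULL ||
		     strstr(fw_rev, table->firmware) != NULL))
			return 1;
	return 0;
}

int main(void)
{
	static const struct entry blacklist[] = {
		{ "WDC AC32100H", "24.09P07" },	/* only this revision */
		{ "CRD-8400B",    NULL },	/* any revision */
		{ NULL, NULL }
	};

	/* 1: model listed with NULL firmware, so any revision matches */
	printf("%d\n", in_list("CRD-8400B", "1.00", blacklist));
	/* 0: model listed, but the firmware revision differs */
	printf("%d\n", in_list("WDC AC32100H", "30.00X00", blacklist));
	return 0;
}

With the NULL convention the table no longer needs a magic string, and a missing firmware field can never be mistaken for a real revision.
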
@@ -702,8 +702,22 @@ static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base)
702 mask = id->dma_mword & hwif->mwdma_mask; 702 mask = id->dma_mword & hwif->mwdma_mask;
703 break; 703 break;
704 case XFER_SW_DMA_0: 704 case XFER_SW_DMA_0:
705 if (id->field_valid & 2) 705 if (id->field_valid & 2) {
706 mask = id->dma_1word & hwif->swdma_mask; 706 mask = id->dma_1word & hwif->swdma_mask;
707 } else if (id->tDMA) {
708 /*
709 * ide_fix_driveid() doesn't convert ->tDMA to the
710 * CPU endianness so we need to do it here
711 */
712 u8 mode = le16_to_cpu(id->tDMA);
713
714 /*
715 * if the mode is valid convert it to the mask
716 * (the maximum allowed mode is XFER_SW_DMA_2)
717 */
718 if (mode <= 2)
719 mask = ((2 << mode) - 1) & hwif->swdma_mask;
720 }
707 break; 721 break;
708 default: 722 default:
709 BUG(); 723 BUG();
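The new XFER_SW_DMA_0 branch above falls back to the pre-EIDE ->tDMA field when the EIDE words are not valid, turning the reported mode into a bitmask with (2 << mode) - 1 so that every mode up to and including the reported one is advertised. A tiny stand-alone illustration of that arithmetic (plain user-space C, not driver code):

/*
 * Illustration of the mode-to-mask conversion used in the hunk above:
 * "(2 << mode) - 1" sets bits 0..mode, so a drive reporting SW DMA
 * mode 2 advertises modes 0, 1 and 2.
 */
#include <stdio.h>

int main(void)
{
	unsigned int mode;

	for (mode = 0; mode <= 2; mode++)
		printf("tDMA mode %u -> mask 0x%02x\n", mode, (2 << mode) - 1);

	return 0;	/* prints 0x01, 0x03, 0x07 */
}
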
@@ -847,27 +861,27 @@ int ide_set_dma(ide_drive_t *drive)
847 return rc; 861 return rc;
848} 862}
849 863
850EXPORT_SYMBOL_GPL(ide_set_dma);
851
852#ifdef CONFIG_BLK_DEV_IDEDMA_PCI 864#ifdef CONFIG_BLK_DEV_IDEDMA_PCI
853int __ide_dma_lostirq (ide_drive_t *drive) 865void ide_dma_lost_irq (ide_drive_t *drive)
854{ 866{
855 printk("%s: DMA interrupt recovery\n", drive->name); 867 printk("%s: DMA interrupt recovery\n", drive->name);
856 return 1;
857} 868}
858 869
859EXPORT_SYMBOL(__ide_dma_lostirq); 870EXPORT_SYMBOL(ide_dma_lost_irq);
860 871
861int __ide_dma_timeout (ide_drive_t *drive) 872void ide_dma_timeout (ide_drive_t *drive)
862{ 873{
874 ide_hwif_t *hwif = HWIF(drive);
875
863 printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name); 876 printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name);
864 if (HWIF(drive)->ide_dma_test_irq(drive))
865 return 0;
866 877
867 return HWIF(drive)->ide_dma_end(drive); 878 if (hwif->ide_dma_test_irq(drive))
879 return;
880
881 hwif->ide_dma_end(drive);
868} 882}
869 883
870EXPORT_SYMBOL(__ide_dma_timeout); 884EXPORT_SYMBOL(ide_dma_timeout);
871 885
872/* 886/*
873 * Needed for allowing full modular support of ide-driver 887 * Needed for allowing full modular support of ide-driver
@@ -1018,10 +1032,10 @@ void ide_setup_dma (ide_hwif_t *hwif, unsigned long dma_base, unsigned int num_p
1018 hwif->ide_dma_end = &__ide_dma_end; 1032 hwif->ide_dma_end = &__ide_dma_end;
1019 if (!hwif->ide_dma_test_irq) 1033 if (!hwif->ide_dma_test_irq)
1020 hwif->ide_dma_test_irq = &__ide_dma_test_irq; 1034 hwif->ide_dma_test_irq = &__ide_dma_test_irq;
1021 if (!hwif->ide_dma_timeout) 1035 if (!hwif->dma_timeout)
1022 hwif->ide_dma_timeout = &__ide_dma_timeout; 1036 hwif->dma_timeout = &ide_dma_timeout;
1023 if (!hwif->ide_dma_lostirq) 1037 if (!hwif->dma_lost_irq)
1024 hwif->ide_dma_lostirq = &__ide_dma_lostirq; 1038 hwif->dma_lost_irq = &ide_dma_lost_irq;
1025 1039
1026 if (hwif->chipset != ide_trm290) { 1040 if (hwif->chipset != ide_trm290) {
1027 u8 dma_stat = hwif->INB(hwif->dma_status); 1041 u8 dma_stat = hwif->INB(hwif->dma_status);
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index bfe8f1b712ba..c5b5011da56e 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -1350,7 +1350,7 @@ static ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
1350 hwif->INB(IDE_STATUS_REG)); 1350 hwif->INB(IDE_STATUS_REG));
1351 } else { 1351 } else {
1352 printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name); 1352 printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name);
1353 (void) hwif->ide_dma_timeout(drive); 1353 hwif->dma_timeout(drive);
1354 } 1354 }
1355 1355
1356 /* 1356 /*
@@ -1466,7 +1466,7 @@ void ide_timer_expiry (unsigned long data)
1466 startstop = handler(drive); 1466 startstop = handler(drive);
1467 } else if (drive_is_ready(drive)) { 1467 } else if (drive_is_ready(drive)) {
1468 if (drive->waiting_for_dma) 1468 if (drive->waiting_for_dma)
1469 (void) hwgroup->hwif->ide_dma_lostirq(drive); 1469 hwgroup->hwif->dma_lost_irq(drive);
1470 (void)ide_ack_intr(hwif); 1470 (void)ide_ack_intr(hwif);
1471 printk(KERN_WARNING "%s: lost interrupt\n", drive->name); 1471 printk(KERN_WARNING "%s: lost interrupt\n", drive->name);
1472 startstop = handler(drive); 1472 startstop = handler(drive);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index f0be5f665a0e..92578b6832e9 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -574,7 +574,10 @@ u8 eighty_ninty_three (ide_drive_t *drive)
574 ide_hwif_t *hwif = drive->hwif; 574 ide_hwif_t *hwif = drive->hwif;
575 struct hd_driveid *id = drive->id; 575 struct hd_driveid *id = drive->id;
576 576
577 if (hwif->udma_four == 0) 577 if (hwif->cbl == ATA_CBL_PATA40_SHORT)
578 return 1;
579
580 if (hwif->cbl != ATA_CBL_PATA80)
578 goto no_80w; 581 goto no_80w;
579 582
580 /* Check for SATA but only if we are ATA5 or higher */ 583 /* Check for SATA but only if we are ATA5 or higher */
@@ -600,7 +603,8 @@ no_80w:
600 603
601 printk(KERN_WARNING "%s: %s side 80-wire cable detection failed, " 604 printk(KERN_WARNING "%s: %s side 80-wire cable detection failed, "
602 "limiting max speed to UDMA33\n", 605 "limiting max speed to UDMA33\n",
603 drive->name, hwif->udma_four ? "drive" : "host"); 606 drive->name,
607 hwif->cbl == ATA_CBL_PATA80 ? "drive" : "host");
604 608
605 drive->udma33_warned = 1; 609 drive->udma33_warned = 1;
606 610
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index f5ce22c38f82..cc5801399467 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -144,7 +144,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd)
144 local_irq_enable(); 144 local_irq_enable();
145 ide_fix_driveid(id); 145 ide_fix_driveid(id);
146 146
147#if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) 147#if defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA)
148 /* 148 /*
149 * EATA SCSI controllers do a hardware ATA emulation: 149 * EATA SCSI controllers do a hardware ATA emulation:
150 * Ignore them if there is a driver for them available. 150 * Ignore them if there is a driver for them available.
@@ -154,7 +154,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd)
154 printk("%s: EATA SCSI HBA %.10s\n", drive->name, id->model); 154 printk("%s: EATA SCSI HBA %.10s\n", drive->name, id->model);
155 goto err_misc; 155 goto err_misc;
156 } 156 }
157#endif /* CONFIG_SCSI_EATA_DMA || CONFIG_SCSI_EATA_PIO */ 157#endif /* CONFIG_SCSI_EATA || CONFIG_SCSI_EATA_PIO */
158 158
159 /* 159 /*
160 * WIN_IDENTIFY returns little-endian info, 160 * WIN_IDENTIFY returns little-endian info,
@@ -1025,7 +1025,7 @@ static int init_irq (ide_hwif_t *hwif)
1025 BUG_ON(irqs_disabled()); 1025 BUG_ON(irqs_disabled());
1026 BUG_ON(hwif == NULL); 1026 BUG_ON(hwif == NULL);
1027 1027
1028 down(&ide_cfg_sem); 1028 mutex_lock(&ide_cfg_mtx);
1029 hwif->hwgroup = NULL; 1029 hwif->hwgroup = NULL;
1030#if MAX_HWIFS > 1 1030#if MAX_HWIFS > 1
1031 /* 1031 /*
@@ -1154,7 +1154,7 @@ static int init_irq (ide_hwif_t *hwif)
1154 printk(" (%sed with %s)", 1154 printk(" (%sed with %s)",
1155 hwif->sharing_irq ? "shar" : "serializ", match->name); 1155 hwif->sharing_irq ? "shar" : "serializ", match->name);
1156 printk("\n"); 1156 printk("\n");
1157 up(&ide_cfg_sem); 1157 mutex_unlock(&ide_cfg_mtx);
1158 return 0; 1158 return 0;
1159out_unlink: 1159out_unlink:
1160 spin_lock_irq(&ide_lock); 1160 spin_lock_irq(&ide_lock);
@@ -1177,7 +1177,7 @@ out_unlink:
1177 } 1177 }
1178 spin_unlock_irq(&ide_lock); 1178 spin_unlock_irq(&ide_lock);
1179out_up: 1179out_up:
1180 up(&ide_cfg_sem); 1180 mutex_unlock(&ide_cfg_mtx);
1181 return 1; 1181 return 1;
1182} 1182}
1183 1183
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index ea94c9aa1220..fc1d8ae6a803 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -156,7 +156,7 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int d
156{ 156{
157 ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL; 157 ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL;
158 158
159 down(&ide_setting_sem); 159 mutex_lock(&ide_setting_mtx);
160 while ((*p) && strcmp((*p)->name, name) < 0) 160 while ((*p) && strcmp((*p)->name, name) < 0)
161 p = &((*p)->next); 161 p = &((*p)->next);
162 if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL) 162 if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL)
@@ -177,10 +177,10 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int d
177 if (auto_remove) 177 if (auto_remove)
178 setting->auto_remove = 1; 178 setting->auto_remove = 1;
179 *p = setting; 179 *p = setting;
180 up(&ide_setting_sem); 180 mutex_unlock(&ide_setting_mtx);
181 return 0; 181 return 0;
182abort: 182abort:
183 up(&ide_setting_sem); 183 mutex_unlock(&ide_setting_mtx);
184 kfree(setting); 184 kfree(setting);
185 return -1; 185 return -1;
186} 186}
@@ -224,7 +224,7 @@ static void __ide_remove_setting (ide_drive_t *drive, char *name)
224 * 224 *
225 * Automatically remove all the driver specific settings for this 225 * Automatically remove all the driver specific settings for this
226 * drive. This function may not be called from IRQ context. The 226 * drive. This function may not be called from IRQ context. The
227 * caller must hold ide_setting_sem. 227 * caller must hold ide_setting_mtx.
228 */ 228 */
229 229
230static void auto_remove_settings (ide_drive_t *drive) 230static void auto_remove_settings (ide_drive_t *drive)
@@ -269,7 +269,7 @@ static ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name)
269 * @setting: drive setting 269 * @setting: drive setting
270 * 270 *
271 * Read a drive setting and return the value. The caller 271 * Read a drive setting and return the value. The caller
272 * must hold the ide_setting_sem when making this call. 272 * must hold the ide_setting_mtx when making this call.
273 * 273 *
274 * BUGS: the data return and error are the same return value 274 * BUGS: the data return and error are the same return value
275 * so an error -EINVAL and true return of the same value cannot 275 * so an error -EINVAL and true return of the same value cannot
@@ -306,7 +306,7 @@ static int ide_read_setting(ide_drive_t *drive, ide_settings_t *setting)
306 * @val: value 306 * @val: value
307 * 307 *
308 * Write a drive setting if it is possible. The caller 308 * Write a drive setting if it is possible. The caller
309 * must hold the ide_setting_sem when making this call. 309 * must hold the ide_setting_mtx when making this call.
310 * 310 *
311 * BUGS: the data return and error are the same return value 311 * BUGS: the data return and error are the same return value
312 * so an error -EINVAL and true return of the same value cannot 312 * so an error -EINVAL and true return of the same value cannot
@@ -367,7 +367,7 @@ static int set_xfer_rate (ide_drive_t *drive, int arg)
367 * @drive: drive being configured 367 * @drive: drive being configured
368 * 368 *
369 * Add the generic parts of the system settings to the /proc files. 369 * Add the generic parts of the system settings to the /proc files.
370 * The caller must not be holding the ide_setting_sem. 370 * The caller must not be holding the ide_setting_mtx.
371 */ 371 */
372 372
373void ide_add_generic_settings (ide_drive_t *drive) 373void ide_add_generic_settings (ide_drive_t *drive)
@@ -408,7 +408,7 @@ static int proc_ide_read_settings
408 408
409 proc_ide_settings_warn(); 409 proc_ide_settings_warn();
410 410
411 down(&ide_setting_sem); 411 mutex_lock(&ide_setting_mtx);
412 out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n"); 412 out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n");
413 out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n"); 413 out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n");
414 while(setting) { 414 while(setting) {
@@ -428,7 +428,7 @@ static int proc_ide_read_settings
428 setting = setting->next; 428 setting = setting->next;
429 } 429 }
430 len = out - page; 430 len = out - page;
431 up(&ide_setting_sem); 431 mutex_unlock(&ide_setting_mtx);
432 PROC_IDE_READ_RETURN(page,start,off,count,eof,len); 432 PROC_IDE_READ_RETURN(page,start,off,count,eof,len);
433} 433}
434 434
@@ -508,16 +508,16 @@ static int proc_ide_write_settings(struct file *file, const char __user *buffer,
508 ++p; 508 ++p;
509 } 509 }
510 510
511 down(&ide_setting_sem); 511 mutex_lock(&ide_setting_mtx);
512 setting = ide_find_setting_by_name(drive, name); 512 setting = ide_find_setting_by_name(drive, name);
513 if (!setting) 513 if (!setting)
514 { 514 {
515 up(&ide_setting_sem); 515 mutex_unlock(&ide_setting_mtx);
516 goto parse_error; 516 goto parse_error;
517 } 517 }
518 if (for_real) 518 if (for_real)
519 ide_write_setting(drive, setting, val * setting->div_factor / setting->mul_factor); 519 ide_write_setting(drive, setting, val * setting->div_factor / setting->mul_factor);
520 up(&ide_setting_sem); 520 mutex_unlock(&ide_setting_mtx);
521 } 521 }
522 } while (!for_real++); 522 } while (!for_real++);
523 free_page((unsigned long)buf); 523 free_page((unsigned long)buf);
@@ -705,7 +705,7 @@ EXPORT_SYMBOL(ide_proc_register_driver);
705 * Clean up the driver specific /proc files and IDE settings 705 * Clean up the driver specific /proc files and IDE settings
706 * for a given drive. 706 * for a given drive.
707 * 707 *
708 * Takes ide_setting_sem and ide_lock. 708 * Takes ide_setting_mtx and ide_lock.
709 * Caller must hold none of the locks. 709 * Caller must hold none of the locks.
710 */ 710 */
711 711
@@ -715,10 +715,10 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver)
715 715
716 ide_remove_proc_entries(drive->proc, driver->proc); 716 ide_remove_proc_entries(drive->proc, driver->proc);
717 717
718 down(&ide_setting_sem); 718 mutex_lock(&ide_setting_mtx);
719 spin_lock_irqsave(&ide_lock, flags); 719 spin_lock_irqsave(&ide_lock, flags);
720 /* 720 /*
721 * ide_setting_sem protects the settings list 721 * ide_setting_mtx protects the settings list
722 * ide_lock protects the use of settings 722 * ide_lock protects the use of settings
723 * 723 *
724 * so we need to hold both, ide_settings_sem because we want to 724 * so we need to hold both, ide_settings_sem because we want to
@@ -726,11 +726,11 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver)
726 * a setting out that is being used. 726 * a setting out that is being used.
727 * 727 *
728 * OTOH both ide_{read,write}_setting are only ever used under 728 * OTOH both ide_{read,write}_setting are only ever used under
729 * ide_setting_sem. 729 * ide_setting_mtx.
730 */ 730 */
731 auto_remove_settings(drive); 731 auto_remove_settings(drive);
732 spin_unlock_irqrestore(&ide_lock, flags); 732 spin_unlock_irqrestore(&ide_lock, flags);
733 up(&ide_setting_sem); 733 mutex_unlock(&ide_setting_mtx);
734} 734}
735 735
736EXPORT_SYMBOL(ide_proc_unregister_driver); 736EXPORT_SYMBOL(ide_proc_unregister_driver);
diff --git a/drivers/ide/ide-timing.h b/drivers/ide/ide-timing.h
index c0864b1e9228..e6cb8593b5ba 100644
--- a/drivers/ide/ide-timing.h
+++ b/drivers/ide/ide-timing.h
@@ -102,66 +102,16 @@ static struct ide_timing ide_timing[] = {
102#define EZ(v,unit) ((v)?ENOUGH(v,unit):0) 102#define EZ(v,unit) ((v)?ENOUGH(v,unit):0)
103 103
104#define XFER_MODE 0xf0 104#define XFER_MODE 0xf0
105#define XFER_UDMA_133 0x48
106#define XFER_UDMA_100 0x44
107#define XFER_UDMA_66 0x42
108#define XFER_UDMA 0x40
109#define XFER_MWDMA 0x20 105#define XFER_MWDMA 0x20
110#define XFER_SWDMA 0x10
111#define XFER_EPIO 0x01 106#define XFER_EPIO 0x01
112#define XFER_PIO 0x00 107#define XFER_PIO 0x00
113 108
114static short ide_find_best_mode(ide_drive_t *drive, int map) 109static short ide_find_best_pio_mode(ide_drive_t *drive)
115{ 110{
116 struct hd_driveid *id = drive->id; 111 struct hd_driveid *id = drive->id;
117 short best = 0; 112 short best = 0;
118 113
119 if (!id) 114 if (id->field_valid & 2) { /* EIDE PIO modes */
120 return XFER_PIO_SLOW;
121
122 if ((map & XFER_UDMA) && (id->field_valid & 4)) { /* Want UDMA and UDMA bitmap valid */
123
124 if ((map & XFER_UDMA_133) == XFER_UDMA_133)
125 if ((best = (id->dma_ultra & 0x0040) ? XFER_UDMA_6 : 0)) return best;
126
127 if ((map & XFER_UDMA_100) == XFER_UDMA_100)
128 if ((best = (id->dma_ultra & 0x0020) ? XFER_UDMA_5 : 0)) return best;
129
130 if ((map & XFER_UDMA_66) == XFER_UDMA_66)
131 if ((best = (id->dma_ultra & 0x0010) ? XFER_UDMA_4 :
132 (id->dma_ultra & 0x0008) ? XFER_UDMA_3 : 0)) return best;
133
134 if ((best = (id->dma_ultra & 0x0004) ? XFER_UDMA_2 :
135 (id->dma_ultra & 0x0002) ? XFER_UDMA_1 :
136 (id->dma_ultra & 0x0001) ? XFER_UDMA_0 : 0)) return best;
137 }
138
139 if ((map & XFER_MWDMA) && (id->field_valid & 2)) { /* Want MWDMA and drive has EIDE fields */
140
141 if ((best = (id->dma_mword & 0x0004) ? XFER_MW_DMA_2 :
142 (id->dma_mword & 0x0002) ? XFER_MW_DMA_1 :
143 (id->dma_mword & 0x0001) ? XFER_MW_DMA_0 : 0)) return best;
144 }
145
146 if (map & XFER_SWDMA) { /* Want SWDMA */
147
148 if (id->field_valid & 2) { /* EIDE SWDMA */
149
150 if ((best = (id->dma_1word & 0x0004) ? XFER_SW_DMA_2 :
151 (id->dma_1word & 0x0002) ? XFER_SW_DMA_1 :
152 (id->dma_1word & 0x0001) ? XFER_SW_DMA_0 : 0)) return best;
153 }
154
155 if (id->capability & 1) { /* Pre-EIDE style SWDMA */
156
157 if ((best = (id->tDMA == 2) ? XFER_SW_DMA_2 :
158 (id->tDMA == 1) ? XFER_SW_DMA_1 :
159 (id->tDMA == 0) ? XFER_SW_DMA_0 : 0)) return best;
160 }
161 }
162
163
164 if ((map & XFER_EPIO) && (id->field_valid & 2)) { /* EIDE PIO modes */
165 115
166 if ((best = (drive->id->eide_pio_modes & 4) ? XFER_PIO_5 : 116 if ((best = (drive->id->eide_pio_modes & 4) ? XFER_PIO_5 :
167 (drive->id->eide_pio_modes & 2) ? XFER_PIO_4 : 117 (drive->id->eide_pio_modes & 2) ? XFER_PIO_4 :
@@ -262,7 +212,7 @@ static int ide_timing_compute(ide_drive_t *drive, short speed, struct ide_timing
262 */ 212 */
263 213
264 if ((speed & XFER_MODE) != XFER_PIO) { 214 if ((speed & XFER_MODE) != XFER_PIO) {
265 ide_timing_compute(drive, ide_find_best_mode(drive, XFER_PIO | XFER_EPIO), &p, T, UT); 215 ide_timing_compute(drive, ide_find_best_pio_mode(drive), &p, T, UT);
266 ide_timing_merge(&p, t, t, IDE_TIMING_ALL); 216 ide_timing_merge(&p, t, t, IDE_TIMING_ALL);
267 } 217 }
268 218
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 0cd76bf66833..c948a5c17a5d 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -169,7 +169,7 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR,
169static int idebus_parameter; /* holds the "idebus=" parameter */ 169static int idebus_parameter; /* holds the "idebus=" parameter */
170static int system_bus_speed; /* holds what we think is VESA/PCI bus speed */ 170static int system_bus_speed; /* holds what we think is VESA/PCI bus speed */
171 171
172DECLARE_MUTEX(ide_cfg_sem); 172DEFINE_MUTEX(ide_cfg_mtx);
173 __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); 173 __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock);
174 174
175#ifdef CONFIG_IDEPCI_PCIBUS_ORDER 175#ifdef CONFIG_IDEPCI_PCIBUS_ORDER
@@ -460,6 +460,8 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
460 hwif->mwdma_mask = tmp_hwif->mwdma_mask; 460 hwif->mwdma_mask = tmp_hwif->mwdma_mask;
461 hwif->swdma_mask = tmp_hwif->swdma_mask; 461 hwif->swdma_mask = tmp_hwif->swdma_mask;
462 462
463 hwif->cbl = tmp_hwif->cbl;
464
463 hwif->chipset = tmp_hwif->chipset; 465 hwif->chipset = tmp_hwif->chipset;
464 hwif->hold = tmp_hwif->hold; 466 hwif->hold = tmp_hwif->hold;
465 467
@@ -496,8 +498,8 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
496 hwif->ide_dma_clear_irq = tmp_hwif->ide_dma_clear_irq; 498 hwif->ide_dma_clear_irq = tmp_hwif->ide_dma_clear_irq;
497 hwif->dma_host_on = tmp_hwif->dma_host_on; 499 hwif->dma_host_on = tmp_hwif->dma_host_on;
498 hwif->dma_host_off = tmp_hwif->dma_host_off; 500 hwif->dma_host_off = tmp_hwif->dma_host_off;
499 hwif->ide_dma_lostirq = tmp_hwif->ide_dma_lostirq; 501 hwif->dma_lost_irq = tmp_hwif->dma_lost_irq;
500 hwif->ide_dma_timeout = tmp_hwif->ide_dma_timeout; 502 hwif->dma_timeout = tmp_hwif->dma_timeout;
501 503
502 hwif->OUTB = tmp_hwif->OUTB; 504 hwif->OUTB = tmp_hwif->OUTB;
503 hwif->OUTBSYNC = tmp_hwif->OUTBSYNC; 505 hwif->OUTBSYNC = tmp_hwif->OUTBSYNC;
@@ -533,7 +535,6 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
533 hwif->extra_base = tmp_hwif->extra_base; 535 hwif->extra_base = tmp_hwif->extra_base;
534 hwif->extra_ports = tmp_hwif->extra_ports; 536 hwif->extra_ports = tmp_hwif->extra_ports;
535 hwif->autodma = tmp_hwif->autodma; 537 hwif->autodma = tmp_hwif->autodma;
536 hwif->udma_four = tmp_hwif->udma_four;
537 538
538 hwif->hwif_data = tmp_hwif->hwif_data; 539 hwif->hwif_data = tmp_hwif->hwif_data;
539} 540}
@@ -564,7 +565,7 @@ void ide_unregister(unsigned int index)
564{ 565{
565 ide_drive_t *drive; 566 ide_drive_t *drive;
566 ide_hwif_t *hwif, *g; 567 ide_hwif_t *hwif, *g;
567 static ide_hwif_t tmp_hwif; /* protected by ide_cfg_sem */ 568 static ide_hwif_t tmp_hwif; /* protected by ide_cfg_mtx */
568 ide_hwgroup_t *hwgroup; 569 ide_hwgroup_t *hwgroup;
569 int irq_count = 0, unit; 570 int irq_count = 0, unit;
570 571
@@ -572,7 +573,7 @@ void ide_unregister(unsigned int index)
572 573
573 BUG_ON(in_interrupt()); 574 BUG_ON(in_interrupt());
574 BUG_ON(irqs_disabled()); 575 BUG_ON(irqs_disabled());
575 down(&ide_cfg_sem); 576 mutex_lock(&ide_cfg_mtx);
576 spin_lock_irq(&ide_lock); 577 spin_lock_irq(&ide_lock);
577 hwif = &ide_hwifs[index]; 578 hwif = &ide_hwifs[index];
578 if (!hwif->present) 579 if (!hwif->present)
@@ -679,7 +680,7 @@ void ide_unregister(unsigned int index)
679 680
680abort: 681abort:
681 spin_unlock_irq(&ide_lock); 682 spin_unlock_irq(&ide_lock);
682 up(&ide_cfg_sem); 683 mutex_unlock(&ide_cfg_mtx);
683} 684}
684 685
685EXPORT_SYMBOL(ide_unregister); 686EXPORT_SYMBOL(ide_unregister);
@@ -817,9 +818,9 @@ EXPORT_SYMBOL(ide_register_hw);
817 * Locks for IDE setting functionality 818 * Locks for IDE setting functionality
818 */ 819 */
819 820
820DECLARE_MUTEX(ide_setting_sem); 821DEFINE_MUTEX(ide_setting_mtx);
821 822
822EXPORT_SYMBOL_GPL(ide_setting_sem); 823EXPORT_SYMBOL_GPL(ide_setting_mtx);
823 824
824/** 825/**
825 * ide_spin_wait_hwgroup - wait for group 826 * ide_spin_wait_hwgroup - wait for group
@@ -1192,11 +1193,11 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
1192 } 1193 }
1193 1194
1194read_val: 1195read_val:
1195 down(&ide_setting_sem); 1196 mutex_lock(&ide_setting_mtx);
1196 spin_lock_irqsave(&ide_lock, flags); 1197 spin_lock_irqsave(&ide_lock, flags);
1197 err = *val; 1198 err = *val;
1198 spin_unlock_irqrestore(&ide_lock, flags); 1199 spin_unlock_irqrestore(&ide_lock, flags);
1199 up(&ide_setting_sem); 1200 mutex_unlock(&ide_setting_mtx);
1200 return err >= 0 ? put_user(err, (long __user *)arg) : err; 1201 return err >= 0 ? put_user(err, (long __user *)arg) : err;
1201 1202
1202set_val: 1203set_val:
@@ -1206,9 +1207,9 @@ set_val:
1206 if (!capable(CAP_SYS_ADMIN)) 1207 if (!capable(CAP_SYS_ADMIN))
1207 err = -EACCES; 1208 err = -EACCES;
1208 else { 1209 else {
1209 down(&ide_setting_sem); 1210 mutex_lock(&ide_setting_mtx);
1210 err = setfunc(drive, arg); 1211 err = setfunc(drive, arg);
1211 up(&ide_setting_sem); 1212 mutex_unlock(&ide_setting_mtx);
1212 } 1213 }
1213 } 1214 }
1214 return err; 1215 return err;
@@ -1548,7 +1549,11 @@ static int __init ide_setup(char *s)
1548 goto bad_option; 1549 goto bad_option;
1549 case -7: /* ata66 */ 1550 case -7: /* ata66 */
1550#ifdef CONFIG_BLK_DEV_IDEPCI 1551#ifdef CONFIG_BLK_DEV_IDEPCI
1551 hwif->udma_four = 1; 1552 /*
1553 * Use ATA_CBL_PATA40_SHORT so drive side
1554 * cable detection is also overriden.
1555 */
1556 hwif->cbl = ATA_CBL_PATA40_SHORT;
1552 goto obsolete_option; 1557 goto obsolete_option;
1553#else 1558#else
1554 goto bad_hwif; 1559 goto bad_hwif;
diff --git a/drivers/ide/legacy/hd.c b/drivers/ide/legacy/hd.c
index 45ed03591cd8..661c12f6dda6 100644
--- a/drivers/ide/legacy/hd.c
+++ b/drivers/ide/legacy/hd.c
@@ -130,7 +130,7 @@ struct hd_i_struct {
130 130
131#ifdef HD_TYPE 131#ifdef HD_TYPE
132static struct hd_i_struct hd_info[] = { HD_TYPE }; 132static struct hd_i_struct hd_info[] = { HD_TYPE };
133static int NR_HD = ((sizeof (hd_info))/(sizeof (struct hd_i_struct))); 133static int NR_HD = ARRAY_SIZE(hd_info);
134#else 134#else
135static struct hd_i_struct hd_info[MAX_HD]; 135static struct hd_i_struct hd_info[MAX_HD];
136static int NR_HD; 136static int NR_HD;
diff --git a/drivers/ide/legacy/macide.c b/drivers/ide/legacy/macide.c
index c211fc78345d..b557c45a5a9d 100644
--- a/drivers/ide/legacy/macide.c
+++ b/drivers/ide/legacy/macide.c
@@ -77,15 +77,6 @@ int macide_ack_intr(ide_hwif_t* hwif)
77 return 0; 77 return 0;
78} 78}
79 79
80#ifdef CONFIG_BLK_DEV_MAC_MEDIABAY
81static void macide_mediabay_interrupt(int irq, void *dev_id)
82{
83 int state = baboon->mb_status & 0x04;
84
85 printk(KERN_INFO "macide: media bay %s detected\n", state? "removal":"insertion");
86}
87#endif
88
89/* 80/*
90 * Probe for a Macintosh IDE interface 81 * Probe for a Macintosh IDE interface
91 */ 82 */
@@ -128,11 +119,6 @@ void macide_init(void)
128 ide_drive_t *drive = &ide_hwifs[index].drives[0]; 119 ide_drive_t *drive = &ide_hwifs[index].drives[0];
129 drive->capacity64 = drive->cyl*drive->head*drive->sect; 120 drive->capacity64 = drive->cyl*drive->head*drive->sect;
130 121
131#ifdef CONFIG_BLK_DEV_MAC_MEDIABAY
132 request_irq(IRQ_BABOON_2, macide_mediabay_interrupt,
133 IRQ_FLG_FAST, "mediabay",
134 macide_mediabay_interrupt);
135#endif
136 } 122 }
137 break; 123 break;
138 124
diff --git a/drivers/ide/mips/au1xxx-ide.c b/drivers/ide/mips/au1xxx-ide.c
index ca95e990862e..2e7013a2a7f6 100644
--- a/drivers/ide/mips/au1xxx-ide.c
+++ b/drivers/ide/mips/au1xxx-ide.c
@@ -381,9 +381,7 @@ static int auide_dma_setup(ide_drive_t *drive)
381 381
382static int auide_dma_check(ide_drive_t *drive) 382static int auide_dma_check(ide_drive_t *drive)
383{ 383{
384 u8 speed; 384 u8 speed = ide_max_dma_mode(drive);
385
386#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA
387 385
388 if( dbdma_init_done == 0 ){ 386 if( dbdma_init_done == 0 ){
389 auide_hwif.white_list = ide_in_drive_list(drive->id, 387 auide_hwif.white_list = ide_in_drive_list(drive->id,
@@ -394,7 +392,6 @@ static int auide_dma_check(ide_drive_t *drive)
394 auide_ddma_init(&auide_hwif); 392 auide_ddma_init(&auide_hwif);
395 dbdma_init_done = 1; 393 dbdma_init_done = 1;
396 } 394 }
397#endif
398 395
399 /* Is the drive in our DMA black list? */ 396 /* Is the drive in our DMA black list? */
400 397
@@ -409,8 +406,6 @@ static int auide_dma_check(ide_drive_t *drive)
409 else 406 else
410 drive->using_dma = 1; 407 drive->using_dma = 1;
411 408
412 speed = ide_find_best_mode(drive, XFER_PIO | XFER_MWDMA);
413
414 if (drive->autodma && (speed & XFER_MODE) != XFER_PIO) 409 if (drive->autodma && (speed & XFER_MODE) != XFER_PIO)
415 return 0; 410 return 0;
416 411
@@ -456,10 +451,9 @@ static void auide_dma_off_quietly(ide_drive_t *drive)
456 drive->using_dma = 0; 451 drive->using_dma = 0;
457} 452}
458 453
459static int auide_dma_lostirq(ide_drive_t *drive) 454static void auide_dma_lost_irq(ide_drive_t *drive)
460{ 455{
461 printk(KERN_ERR "%s: IRQ lost\n", drive->name); 456 printk(KERN_ERR "%s: IRQ lost\n", drive->name);
462 return 0;
463} 457}
464 458
465static void auide_ddma_tx_callback(int irq, void *param) 459static void auide_ddma_tx_callback(int irq, void *param)
@@ -489,16 +483,16 @@ static void auide_init_dbdma_dev(dbdev_tab_t *dev, u32 dev_id, u32 tsize, u32 de
489 483
490#if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA) 484#if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA)
491 485
492static int auide_dma_timeout(ide_drive_t *drive) 486static void auide_dma_timeout(ide_drive_t *drive)
493{ 487{
494// printk("%s\n", __FUNCTION__); 488 ide_hwif_t *hwif = HWIF(drive);
495 489
496 printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name); 490 printk(KERN_ERR "%s: DMA timeout occurred: ", drive->name);
497 491
498 if (HWIF(drive)->ide_dma_test_irq(drive)) 492 if (hwif->ide_dma_test_irq(drive))
499 return 0; 493 return;
500 494
501 return HWIF(drive)->ide_dma_end(drive); 495 hwif->ide_dma_end(drive);
502} 496}
503 497
504 498
@@ -721,7 +715,7 @@ static int au_ide_probe(struct device *dev)
721 715
722#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA 716#ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA
723 hwif->dma_off_quietly = &auide_dma_off_quietly; 717 hwif->dma_off_quietly = &auide_dma_off_quietly;
724 hwif->ide_dma_timeout = &auide_dma_timeout; 718 hwif->dma_timeout = &auide_dma_timeout;
725 719
726 hwif->ide_dma_check = &auide_dma_check; 720 hwif->ide_dma_check = &auide_dma_check;
727 hwif->dma_exec_cmd = &auide_dma_exec_cmd; 721 hwif->dma_exec_cmd = &auide_dma_exec_cmd;
@@ -731,7 +725,7 @@ static int au_ide_probe(struct device *dev)
731 hwif->ide_dma_test_irq = &auide_dma_test_irq; 725 hwif->ide_dma_test_irq = &auide_dma_test_irq;
732 hwif->dma_host_off = &auide_dma_host_off; 726 hwif->dma_host_off = &auide_dma_host_off;
733 hwif->dma_host_on = &auide_dma_host_on; 727 hwif->dma_host_on = &auide_dma_host_on;
734 hwif->ide_dma_lostirq = &auide_dma_lostirq; 728 hwif->dma_lost_irq = &auide_dma_lost_irq;
735 hwif->ide_dma_on = &auide_dma_on; 729 hwif->ide_dma_on = &auide_dma_on;
736 730
737 hwif->autodma = 1; 731 hwif->autodma = 1;
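The au1xxx conversion above follows the tree-wide change from int-returning ide_dma_lostirq/ide_dma_timeout hooks to void dma_lost_irq/dma_timeout hooks. A compilable sketch of the new handler shape; the stub_* types below are placeholders, not the real ide_hwif_t/ide_drive_t from <linux/ide.h>.

#include <stdio.h>

struct stub_drive { const char *name; };

struct stub_hwif {
        void (*dma_lost_irq)(struct stub_drive *);
        void (*dma_timeout)(struct stub_drive *);
};

/* New-style handlers: report the condition and return, no status code. */
static void my_dma_lost_irq(struct stub_drive *drive)
{
        printf("%s: IRQ lost\n", drive->name);
}

static void my_dma_timeout(struct stub_drive *drive)
{
        printf("%s: DMA timeout occurred\n", drive->name);
}

int main(void)
{
        struct stub_drive drive = { "hda" };
        struct stub_hwif hwif = { my_dma_lost_irq, my_dma_timeout };

        hwif.dma_lost_irq(&drive);
        hwif.dma_timeout(&drive);
        return 0;
}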
diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index b173bc66ce1e..e5d09367627e 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/aec62xx.c Version 0.21 Apr 21, 2007 2 * linux/drivers/ide/pci/aec62xx.c Version 0.24 May 24, 2007
3 * 3 *
4 * Copyright (C) 1999-2002 Andre Hedrick <andre@linux-ide.org> 4 * Copyright (C) 1999-2002 Andre Hedrick <andre@linux-ide.org>
5 * Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com> 5 * Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com>
@@ -140,25 +140,10 @@ static int aec6260_tune_chipset (ide_drive_t *drive, u8 xferspeed)
140 return(ide_config_drive_speed(drive, speed)); 140 return(ide_config_drive_speed(drive, speed));
141} 141}
142 142
143static int aec62xx_tune_chipset (ide_drive_t *drive, u8 speed)
144{
145 switch (HWIF(drive)->pci_dev->device) {
146 case PCI_DEVICE_ID_ARTOP_ATP865:
147 case PCI_DEVICE_ID_ARTOP_ATP865R:
148 case PCI_DEVICE_ID_ARTOP_ATP860:
149 case PCI_DEVICE_ID_ARTOP_ATP860R:
150 return ((int) aec6260_tune_chipset(drive, speed));
151 case PCI_DEVICE_ID_ARTOP_ATP850UF:
152 return ((int) aec6210_tune_chipset(drive, speed));
153 default:
154 return -1;
155 }
156}
157
158static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio) 143static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio)
159{ 144{
160 pio = ide_get_best_pio_mode(drive, pio, 4, NULL); 145 pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
161 (void) aec62xx_tune_chipset(drive, pio + XFER_PIO_0); 146 (void) HWIF(drive)->speedproc(drive, pio + XFER_PIO_0);
162} 147}
163 148
164static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive) 149static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive)
@@ -172,12 +157,9 @@ static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive)
172 return -1; 157 return -1;
173} 158}
174 159
175static int aec62xx_irq_timeout (ide_drive_t *drive) 160static void aec62xx_dma_lost_irq (ide_drive_t *drive)
176{ 161{
177 ide_hwif_t *hwif = HWIF(drive); 162 switch (HWIF(drive)->pci_dev->device) {
178 struct pci_dev *dev = hwif->pci_dev;
179
180 switch(dev->device) {
181 case PCI_DEVICE_ID_ARTOP_ATP860: 163 case PCI_DEVICE_ID_ARTOP_ATP860:
182 case PCI_DEVICE_ID_ARTOP_ATP860R: 164 case PCI_DEVICE_ID_ARTOP_ATP860R:
183 case PCI_DEVICE_ID_ARTOP_ATP865: 165 case PCI_DEVICE_ID_ARTOP_ATP865:
@@ -186,7 +168,6 @@ static int aec62xx_irq_timeout (ide_drive_t *drive)
186 default: 168 default:
187 break; 169 break;
188 } 170 }
189 return 0;
190} 171}
191 172
192static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const char *name) 173static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const char *name)
@@ -224,64 +205,46 @@ static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const ch
224 205
225static void __devinit init_hwif_aec62xx(ide_hwif_t *hwif) 206static void __devinit init_hwif_aec62xx(ide_hwif_t *hwif)
226{ 207{
227 struct pci_dev *dev = hwif->pci_dev; 208 struct pci_dev *dev = hwif->pci_dev;
209 u8 reg54 = 0, mask = hwif->channel ? 0xf0 : 0x0f;
210 unsigned long flags;
228 211
229 hwif->autodma = 0;
230 hwif->tuneproc = &aec62xx_tune_drive; 212 hwif->tuneproc = &aec62xx_tune_drive;
231 hwif->speedproc = &aec62xx_tune_chipset;
232 213
233 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) 214 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) {
234 hwif->serialized = hwif->channel; 215 if(hwif->mate)
235 216 hwif->mate->serialized = hwif->serialized = 1;
236 if (hwif->mate) 217 hwif->speedproc = &aec6210_tune_chipset;
237 hwif->mate->serialized = hwif->serialized; 218 } else
219 hwif->speedproc = &aec6260_tune_chipset;
238 220
239 if (!hwif->dma_base) { 221 if (!hwif->dma_base) {
240 hwif->drives[0].autotune = 1; 222 hwif->drives[0].autotune = hwif->drives[1].autotune = 1;
241 hwif->drives[1].autotune = 1;
242 return; 223 return;
243 } 224 }
244 225
245 hwif->ultra_mask = hwif->cds->udma_mask; 226 hwif->ultra_mask = hwif->cds->udma_mask;
246
247 /* atp865 and atp865r */
248 if (hwif->ultra_mask == 0x3f) {
249 /* check bit 0x10 of DMA status register */
250 if (inb(pci_resource_start(dev, 4) + 2) & 0x10)
251 hwif->ultra_mask = 0x7f; /* udma0-6 */
252 }
253
254 hwif->mwdma_mask = 0x07; 227 hwif->mwdma_mask = 0x07;
255 228
256 hwif->ide_dma_check = &aec62xx_config_drive_xfer_rate; 229 hwif->ide_dma_check = &aec62xx_config_drive_xfer_rate;
257 hwif->ide_dma_lostirq = &aec62xx_irq_timeout; 230 hwif->dma_lost_irq = &aec62xx_dma_lost_irq;
258
259 if (!noautodma)
260 hwif->autodma = 1;
261 hwif->drives[0].autodma = hwif->autodma;
262 hwif->drives[1].autodma = hwif->autodma;
263}
264
265static void __devinit init_dma_aec62xx(ide_hwif_t *hwif, unsigned long dmabase)
266{
267 struct pci_dev *dev = hwif->pci_dev;
268 231
269 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) { 232 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF) {
270 u8 reg54h = 0;
271 unsigned long flags;
272
273 spin_lock_irqsave(&ide_lock, flags); 233 spin_lock_irqsave(&ide_lock, flags);
274 pci_read_config_byte(dev, 0x54, &reg54h); 234 pci_read_config_byte (dev, 0x54, &reg54);
275 pci_write_config_byte(dev, 0x54, reg54h & ~(hwif->channel ? 0xF0 : 0x0F)); 235 pci_write_config_byte(dev, 0x54, (reg54 & ~mask));
276 spin_unlock_irqrestore(&ide_lock, flags); 236 spin_unlock_irqrestore(&ide_lock, flags);
277 } else { 237 } else if (hwif->cbl != ATA_CBL_PATA40_SHORT) {
278 u8 ata66 = 0; 238 u8 ata66 = 0, mask = hwif->channel ? 0x02 : 0x01;
239
279 pci_read_config_byte(hwif->pci_dev, 0x49, &ata66); 240 pci_read_config_byte(hwif->pci_dev, 0x49, &ata66);
280 if (!(hwif->udma_four)) 241
281 hwif->udma_four = (ata66&(hwif->channel?0x02:0x01))?0:1; 242 hwif->cbl = (ata66 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
282 } 243 }
283 244
284 ide_setup_dma(hwif, dmabase, 8); 245 if (!noautodma)
246 hwif->autodma = 1;
247 hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma;
285} 248}
286 249
287static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d) 250static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d)
@@ -291,16 +254,12 @@ static int __devinit init_setup_aec62xx(struct pci_dev *dev, ide_pci_device_t *d
291 254
292static int __devinit init_setup_aec6x80(struct pci_dev *dev, ide_pci_device_t *d) 255static int __devinit init_setup_aec6x80(struct pci_dev *dev, ide_pci_device_t *d)
293{ 256{
294 unsigned long bar4reg = pci_resource_start(dev, 4); 257 unsigned long dma_base = pci_resource_start(dev, 4);
295 258
296 if (inb(bar4reg+2) & 0x10) { 259 if (inb(dma_base + 2) & 0x10) {
297 strcpy(d->name, "AEC6880"); 260 d->name = (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) ?
298 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R) 261 "AEC6880R" : "AEC6880";
299 strcpy(d->name, "AEC6880R"); 262 d->udma_mask = 0x7f; /* udma0-6 */
300 } else {
301 strcpy(d->name, "AEC6280");
302 if (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R)
303 strcpy(d->name, "AEC6280R");
304 } 263 }
305 264
306 return ide_setup_pci_device(dev, d); 265 return ide_setup_pci_device(dev, d);
@@ -312,7 +271,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
312 .init_setup = init_setup_aec62xx, 271 .init_setup = init_setup_aec62xx,
313 .init_chipset = init_chipset_aec62xx, 272 .init_chipset = init_chipset_aec62xx,
314 .init_hwif = init_hwif_aec62xx, 273 .init_hwif = init_hwif_aec62xx,
315 .init_dma = init_dma_aec62xx,
316 .channels = 2, 274 .channels = 2,
317 .autodma = AUTODMA, 275 .autodma = AUTODMA,
318 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, 276 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
@@ -323,7 +281,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
323 .init_setup = init_setup_aec62xx, 281 .init_setup = init_setup_aec62xx,
324 .init_chipset = init_chipset_aec62xx, 282 .init_chipset = init_chipset_aec62xx,
325 .init_hwif = init_hwif_aec62xx, 283 .init_hwif = init_hwif_aec62xx,
326 .init_dma = init_dma_aec62xx,
327 .channels = 2, 284 .channels = 2,
328 .autodma = NOAUTODMA, 285 .autodma = NOAUTODMA,
329 .bootable = OFF_BOARD, 286 .bootable = OFF_BOARD,
@@ -333,28 +290,25 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
333 .init_setup = init_setup_aec62xx, 290 .init_setup = init_setup_aec62xx,
334 .init_chipset = init_chipset_aec62xx, 291 .init_chipset = init_chipset_aec62xx,
335 .init_hwif = init_hwif_aec62xx, 292 .init_hwif = init_hwif_aec62xx,
336 .init_dma = init_dma_aec62xx,
337 .channels = 2, 293 .channels = 2,
338 .autodma = AUTODMA, 294 .autodma = AUTODMA,
339 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, 295 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
340 .bootable = NEVER_BOARD, 296 .bootable = NEVER_BOARD,
341 .udma_mask = 0x1f, /* udma0-4 */ 297 .udma_mask = 0x1f, /* udma0-4 */
342 },{ /* 3 */ 298 },{ /* 3 */
343 .name = "AEC6X80", 299 .name = "AEC6280",
344 .init_setup = init_setup_aec6x80, 300 .init_setup = init_setup_aec6x80,
345 .init_chipset = init_chipset_aec62xx, 301 .init_chipset = init_chipset_aec62xx,
346 .init_hwif = init_hwif_aec62xx, 302 .init_hwif = init_hwif_aec62xx,
347 .init_dma = init_dma_aec62xx,
348 .channels = 2, 303 .channels = 2,
349 .autodma = AUTODMA, 304 .autodma = AUTODMA,
350 .bootable = OFF_BOARD, 305 .bootable = OFF_BOARD,
351 .udma_mask = 0x3f, /* udma0-5 */ 306 .udma_mask = 0x3f, /* udma0-5 */
352 },{ /* 4 */ 307 },{ /* 4 */
353 .name = "AEC6X80R", 308 .name = "AEC6280R",
354 .init_setup = init_setup_aec6x80, 309 .init_setup = init_setup_aec6x80,
355 .init_chipset = init_chipset_aec62xx, 310 .init_chipset = init_chipset_aec62xx,
356 .init_hwif = init_hwif_aec62xx, 311 .init_hwif = init_hwif_aec62xx,
357 .init_dma = init_dma_aec62xx,
358 .channels = 2, 312 .channels = 2,
359 .autodma = AUTODMA, 313 .autodma = AUTODMA,
360 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}}, 314 .enablebits = {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
@@ -370,13 +324,16 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
370 * 324 *
371 * Called when the PCI registration layer (or the IDE initialization) 325 * Called when the PCI registration layer (or the IDE initialization)
372 * finds a device matching our IDE device tables. 326 * finds a device matching our IDE device tables.
327 *
328 * NOTE: since we're going to modify the 'name' field for AEC-6[26]80[R]
329 * chips, pass a local copy of 'struct pci_device_id' down the call chain.
373 */ 330 */
374 331
375static int __devinit aec62xx_init_one(struct pci_dev *dev, const struct pci_device_id *id) 332static int __devinit aec62xx_init_one(struct pci_dev *dev, const struct pci_device_id *id)
376{ 333{
377 ide_pci_device_t *d = &aec62xx_chipsets[id->driver_data]; 334 ide_pci_device_t d = aec62xx_chipsets[id->driver_data];
378 335
379 return d->init_setup(dev, d); 336 return d.init_setup(dev, &d);
380} 337}
381 338
382static struct pci_device_id aec62xx_pci_tbl[] = { 339static struct pci_device_id aec62xx_pci_tbl[] = {
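The NOTE added above is the reason aec62xx_init_one() now copies the chipset descriptor onto the stack: init_setup_aec6x80() rewrites d->name, and modifying the static aec62xx_chipsets[] entry in place would leak that change into any later probe. A small user-space model of the pattern; the struct and helper names are invented.

#include <stdio.h>

struct chip_desc {
        const char    *name;
        unsigned char  udma_mask;
};

static const struct chip_desc chipsets[] = {
        { "AEC6280", 0x3f },
};

/* Pretend probe: may rename the chip depending on what the hardware reports. */
static int probe_one(struct chip_desc *d, int has_66mhz_clock)
{
        if (has_66mhz_clock) {
                d->name = "AEC6880";
                d->udma_mask = 0x7f;
        }
        printf("registered %s (udma_mask 0x%02x)\n", d->name, d->udma_mask);
        return 0;
}

int main(void)
{
        /* Work on a local copy so the shared table is never modified. */
        struct chip_desc d = chipsets[0];

        probe_one(&d, 1);
        printf("table entry still reads %s\n", chipsets[0].name);
        return 0;
}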
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index 27525ec2e19a..8a6b27b3bcc3 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/alim15x3.c Version 0.21 2007/02/03 2 * linux/drivers/ide/pci/alim15x3.c Version 0.25 Jun 9 2007
3 * 3 *
4 * Copyright (C) 1998-2000 Michel Aubry, Maintainer 4 * Copyright (C) 1998-2000 Michel Aubry, Maintainer
5 * Copyright (C) 1998-2000 Andrzej Krzysztofowicz, Maintainer 5 * Copyright (C) 1998-2000 Andrzej Krzysztofowicz, Maintainer
@@ -10,6 +10,7 @@
10 * Copyright (C) 2002 Alan Cox <alan@redhat.com> 10 * Copyright (C) 2002 Alan Cox <alan@redhat.com>
11 * ALi (now ULi M5228) support by Clear Zhang <Clear.Zhang@ali.com.tw> 11 * ALi (now ULi M5228) support by Clear Zhang <Clear.Zhang@ali.com.tw>
12 * Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com> 12 * Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com>
13 * Copyright (C) 2007 Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
13 * 14 *
14 * (U)DMA capable version of ali 1533/1543(C), 1535(D) 15 * (U)DMA capable version of ali 1533/1543(C), 1535(D)
15 * 16 *
@@ -36,6 +37,7 @@
36#include <linux/hdreg.h> 37#include <linux/hdreg.h>
37#include <linux/ide.h> 38#include <linux/ide.h>
38#include <linux/init.h> 39#include <linux/init.h>
40#include <linux/dmi.h>
39 41
40#include <asm/io.h> 42#include <asm/io.h>
41 43
@@ -583,6 +585,35 @@ out:
583 return 0; 585 return 0;
584} 586}
585 587
588/*
589 * Cable special cases
590 */
591
592static struct dmi_system_id cable_dmi_table[] = {
593 {
594 .ident = "HP Pavilion N5430",
595 .matches = {
596 DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
597 DMI_MATCH(DMI_BOARD_NAME, "OmniBook N32N-736"),
598 },
599 },
600 { }
601};
602
603static int ali_cable_override(struct pci_dev *pdev)
604{
605 /* Fujitsu P2000 */
606 if (pdev->subsystem_vendor == 0x10CF &&
607 pdev->subsystem_device == 0x10AF)
608 return 1;
609
610 /* Systems by DMI */
611 if (dmi_check_system(cable_dmi_table))
612 return 1;
613
614 return 0;
615}
616
586/** 617/**
587 * ata66_ali15x3 - check for UDMA 66 support 618 * ata66_ali15x3 - check for UDMA 66 support
588 * @hwif: IDE interface 619 * @hwif: IDE interface
@@ -594,37 +625,31 @@ out:
594 * FIXME: frobs bits that are not defined on newer ALi devices 625 * FIXME: frobs bits that are not defined on newer ALi devices
595 */ 626 */
596 627
597static unsigned int __devinit ata66_ali15x3 (ide_hwif_t *hwif) 628static u8 __devinit ata66_ali15x3(ide_hwif_t *hwif)
598{ 629{
599 struct pci_dev *dev = hwif->pci_dev; 630 struct pci_dev *dev = hwif->pci_dev;
600 unsigned int ata66 = 0;
601 u8 cable_80_pin[2] = { 0, 0 };
602
603 unsigned long flags; 631 unsigned long flags;
604 u8 tmpbyte; 632 u8 cbl = ATA_CBL_PATA40, tmpbyte;
605 633
606 local_irq_save(flags); 634 local_irq_save(flags);
607 635
608 if (m5229_revision >= 0xC2) { 636 if (m5229_revision >= 0xC2) {
609 /* 637 /*
610 * Ultra66 cable detection (from Host View) 638 * m5229 80-pin cable detection (from Host View)
611 * m5229, 0x4a, bit0: primary, bit1: secondary 80 pin 639 *
612 */ 640 * 0x4a bit0 is 0 => primary channel has 80-pin
613 pci_read_config_byte(dev, 0x4a, &tmpbyte); 641 * 0x4a bit1 is 0 => secondary channel has 80-pin
614 /* 642 *
615 * 0x4a, bit0 is 0 => primary channel 643 * Certain laptops use short but suitable cables
616 * has 80-pin (from host view) 644 * and don't implement the detect logic.
617 */
618 if (!(tmpbyte & 0x01)) cable_80_pin[0] = 1;
619 /*
620 * 0x4a, bit1 is 0 => secondary channel
621 * has 80-pin (from host view)
622 */
623 if (!(tmpbyte & 0x02)) cable_80_pin[1] = 1;
624 /*
625 * Allow ata66 if cable of current channel has 80 pins
626 */ 645 */
627 ata66 = (hwif->channel)?cable_80_pin[1]:cable_80_pin[0]; 646 if (ali_cable_override(dev))
647 cbl = ATA_CBL_PATA40_SHORT;
648 else {
649 pci_read_config_byte(dev, 0x4a, &tmpbyte);
650 if ((tmpbyte & (1 << hwif->channel)) == 0)
651 cbl = ATA_CBL_PATA80;
652 }
628 } else { 653 } else {
629 /* 654 /*
630 * check m1533, 0x5e, bit 1~4 == 1001 => & 00011110 = 00010010 655 * check m1533, 0x5e, bit 1~4 == 1001 => & 00011110 = 00010010
@@ -657,7 +682,7 @@ static unsigned int __devinit ata66_ali15x3 (ide_hwif_t *hwif)
657 682
658 local_irq_restore(flags); 683 local_irq_restore(flags);
659 684
660 return(ata66); 685 return cbl;
661} 686}
662 687
663/** 688/**
@@ -708,8 +733,9 @@ static void __devinit init_hwif_common_ali15x3 (ide_hwif_t *hwif)
708 hwif->dma_setup = &ali15x3_dma_setup; 733 hwif->dma_setup = &ali15x3_dma_setup;
709 if (!noautodma) 734 if (!noautodma)
710 hwif->autodma = 1; 735 hwif->autodma = 1;
711 if (!(hwif->udma_four)) 736
712 hwif->udma_four = ata66_ali15x3(hwif); 737 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
738 hwif->cbl = ata66_ali15x3(hwif);
713 } 739 }
714 hwif->drives[0].autodma = hwif->autodma; 740 hwif->drives[0].autodma = hwif->autodma;
715 hwif->drives[1].autodma = hwif->autodma; 741 hwif->drives[1].autodma = hwif->autodma;
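ali_cable_override() above combines a PCI subsystem-ID quirk (the Fujitsu P2000) with a DMI board match; dmi_check_system() does the string matching inside the kernel. A user-space approximation of the same lookup, where the blacklist entry comes from the hunk and everything else is stubbed:

#include <stdio.h>
#include <string.h>

struct board_id { const char *vendor; const char *name; };

static const struct board_id short_cable_boards[] = {
        { "Hewlett-Packard", "OmniBook N32N-736" },     /* HP Pavilion N5430 */
};

static int cable_override(const struct board_id *board,
                          unsigned short subsys_vendor,
                          unsigned short subsys_device)
{
        size_t i;

        /* Fujitsu P2000: matched on PCI subsystem IDs rather than DMI. */
        if (subsys_vendor == 0x10CF && subsys_device == 0x10AF)
                return 1;

        for (i = 0; i < sizeof(short_cable_boards) / sizeof(short_cable_boards[0]); i++)
                if (!strcmp(board->vendor, short_cable_boards[i].vendor) &&
                    !strcmp(board->name, short_cable_boards[i].name))
                        return 1;

        return 0;
}

int main(void)
{
        struct board_id b = { "Hewlett-Packard", "OmniBook N32N-736" };

        printf("override: %d\n", cable_override(&b, 0, 0));
        return 0;
}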
diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index a2be65fcf89c..84ed30cdb324 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -1,10 +1,11 @@
1/* 1/*
2 * Version 2.16 2 * Version 2.20
3 * 3 *
4 * AMD 755/756/766/8111 and nVidia nForce/2/2s/3/3s/CK804/MCP04 4 * AMD 755/756/766/8111 and nVidia nForce/2/2s/3/3s/CK804/MCP04
5 * IDE driver for Linux. 5 * IDE driver for Linux.
6 * 6 *
7 * Copyright (c) 2000-2002 Vojtech Pavlik 7 * Copyright (c) 2000-2002 Vojtech Pavlik
8 * Copyright (c) 2007 Bartlomiej Zolnierkiewicz
8 * 9 *
9 * Based on the work of: 10 * Based on the work of:
10 * Andre Hedrick 11 * Andre Hedrick
@@ -37,11 +38,6 @@
37#define AMD_ADDRESS_SETUP (0x0c + amd_config->base) 38#define AMD_ADDRESS_SETUP (0x0c + amd_config->base)
38#define AMD_UDMA_TIMING (0x10 + amd_config->base) 39#define AMD_UDMA_TIMING (0x10 + amd_config->base)
39 40
40#define AMD_UDMA 0x07
41#define AMD_UDMA_33 0x01
42#define AMD_UDMA_66 0x02
43#define AMD_UDMA_100 0x03
44#define AMD_UDMA_133 0x04
45#define AMD_CHECK_SWDMA 0x08 41#define AMD_CHECK_SWDMA 0x08
46#define AMD_BAD_SWDMA 0x10 42#define AMD_BAD_SWDMA 0x10
47#define AMD_BAD_FIFO 0x20 43#define AMD_BAD_FIFO 0x20
@@ -53,32 +49,33 @@
53 49
54static struct amd_ide_chip { 50static struct amd_ide_chip {
55 unsigned short id; 51 unsigned short id;
56 unsigned long base; 52 u8 base;
57 unsigned char flags; 53 u8 udma_mask;
54 u8 flags;
58} amd_ide_chips[] = { 55} amd_ide_chips[] = {
59 { PCI_DEVICE_ID_AMD_COBRA_7401, 0x40, AMD_UDMA_33 | AMD_BAD_SWDMA }, 56 { PCI_DEVICE_ID_AMD_COBRA_7401, 0x40, ATA_UDMA2, AMD_BAD_SWDMA },
60 { PCI_DEVICE_ID_AMD_VIPER_7409, 0x40, AMD_UDMA_66 | AMD_CHECK_SWDMA }, 57 { PCI_DEVICE_ID_AMD_VIPER_7409, 0x40, ATA_UDMA4, AMD_CHECK_SWDMA },
61 { PCI_DEVICE_ID_AMD_VIPER_7411, 0x40, AMD_UDMA_100 | AMD_BAD_FIFO }, 58 { PCI_DEVICE_ID_AMD_VIPER_7411, 0x40, ATA_UDMA5, AMD_BAD_FIFO },
62 { PCI_DEVICE_ID_AMD_OPUS_7441, 0x40, AMD_UDMA_100 }, 59 { PCI_DEVICE_ID_AMD_OPUS_7441, 0x40, ATA_UDMA5, },
63 { PCI_DEVICE_ID_AMD_8111_IDE, 0x40, AMD_UDMA_133 | AMD_CHECK_SERENADE }, 60 { PCI_DEVICE_ID_AMD_8111_IDE, 0x40, ATA_UDMA6, AMD_CHECK_SERENADE },
64 { PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, 0x50, AMD_UDMA_100 }, 61 { PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, 0x50, ATA_UDMA5, },
65 { PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE, 0x50, AMD_UDMA_133 }, 62 { PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE, 0x50, ATA_UDMA6, },
66 { PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE, 0x50, AMD_UDMA_133 }, 63 { PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE, 0x50, ATA_UDMA6, },
67 { PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA, 0x50, AMD_UDMA_133 }, 64 { PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA, 0x50, ATA_UDMA6, },
68 { PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE, 0x50, AMD_UDMA_133 }, 65 { PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE, 0x50, ATA_UDMA6, },
69 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE, 0x50, AMD_UDMA_133 }, 66 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE, 0x50, ATA_UDMA6, },
70 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA, 0x50, AMD_UDMA_133 }, 67 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA, 0x50, ATA_UDMA6, },
71 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2, 0x50, AMD_UDMA_133 }, 68 { PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2, 0x50, ATA_UDMA6, },
72 { PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE, 0x50, AMD_UDMA_133 }, 69 { PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE, 0x50, ATA_UDMA6, },
73 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE, 0x50, AMD_UDMA_133 }, 70 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE, 0x50, ATA_UDMA6, },
74 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE, 0x50, AMD_UDMA_133 }, 71 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE, 0x50, ATA_UDMA6, },
75 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, AMD_UDMA_133 }, 72 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, ATA_UDMA6, },
76 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, AMD_UDMA_133 }, 73 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, ATA_UDMA6, },
77 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, AMD_UDMA_133 }, 74 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, ATA_UDMA6, },
78 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, AMD_UDMA_133 }, 75 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, ATA_UDMA6, },
79 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE, 0x50, AMD_UDMA_133 }, 76 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE, 0x50, ATA_UDMA6, },
80 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE, 0x50, AMD_UDMA_133 }, 77 { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE, 0x50, ATA_UDMA6, },
81 { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, AMD_UDMA_100 }, 78 { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, ATA_UDMA5, },
82 { 0 } 79 { 0 }
83}; 80};
84 81
@@ -87,7 +84,7 @@ static ide_pci_device_t *amd_chipset;
87static unsigned int amd_80w; 84static unsigned int amd_80w;
88static unsigned int amd_clock; 85static unsigned int amd_clock;
89 86
90static char *amd_dma[] = { "MWDMA16", "UDMA33", "UDMA66", "UDMA100", "UDMA133" }; 87static char *amd_dma[] = { "16", "25", "33", "44", "66", "100", "133" };
91static unsigned char amd_cyc2udma[] = { 6, 6, 5, 4, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 7 }; 88static unsigned char amd_cyc2udma[] = { 6, 6, 5, 4, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 7 };
92 89
93/* 90/*
@@ -128,7 +125,7 @@ static int amd74xx_get_info(char *buffer, char **addr, off_t offset, int count)
128 125
129 pci_read_config_byte(dev, PCI_REVISION_ID, &t); 126 pci_read_config_byte(dev, PCI_REVISION_ID, &t);
130 amd_print("Revision: IDE %#x", t); 127 amd_print("Revision: IDE %#x", t);
131 amd_print("Highest DMA rate: %s", amd_dma[amd_config->flags & AMD_UDMA]); 128 amd_print("Highest DMA rate: UDMA%s", amd_dma[fls(amd_config->udma_mask) - 1]);
132 129
133 amd_print("BM-DMA base: %#lx", amd_base); 130 amd_print("BM-DMA base: %#lx", amd_base);
134 amd_print("PCI clock: %d.%dMHz", amd_clock / 1000, amd_clock / 100 % 10); 131 amd_print("PCI clock: %d.%dMHz", amd_clock / 1000, amd_clock / 100 % 10);
@@ -221,12 +218,12 @@ static void amd_set_speed(struct pci_dev *dev, unsigned char dn, struct ide_timi
221 pci_write_config_byte(dev, AMD_DRIVE_TIMING + (3 - dn), 218 pci_write_config_byte(dev, AMD_DRIVE_TIMING + (3 - dn),
222 ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1)); 219 ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1));
223 220
224 switch (amd_config->flags & AMD_UDMA) { 221 switch (amd_config->udma_mask) {
225 case AMD_UDMA_33: t = timing->udma ? (0xc0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; 222 case ATA_UDMA2: t = timing->udma ? (0xc0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break;
226 case AMD_UDMA_66: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 2, 10)]) : 0x03; break; 223 case ATA_UDMA4: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 2, 10)]) : 0x03; break;
227 case AMD_UDMA_100: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 10)]) : 0x03; break; 224 case ATA_UDMA5: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 10)]) : 0x03; break;
228 case AMD_UDMA_133: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 15)]) : 0x03; break; 225 case ATA_UDMA6: t = timing->udma ? (0xc0 | amd_cyc2udma[FIT(timing->udma, 1, 15)]) : 0x03; break;
229 default: return; 226 default: return;
230 } 227 }
231 228
232 pci_write_config_byte(dev, AMD_UDMA_TIMING + (3 - dn), t); 229 pci_write_config_byte(dev, AMD_UDMA_TIMING + (3 - dn), t);
@@ -248,7 +245,7 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed)
248 ide_config_drive_speed(drive, speed); 245 ide_config_drive_speed(drive, speed);
249 246
250 T = 1000000000 / amd_clock; 247 T = 1000000000 / amd_clock;
251 UT = T / min_t(int, max_t(int, amd_config->flags & AMD_UDMA, 1), 2); 248 UT = (amd_config->udma_mask == ATA_UDMA2) ? T : (T / 2);
252 249
253 ide_timing_compute(drive, speed, &t, T, UT); 250 ide_timing_compute(drive, speed, &t, T, UT);
254 251
@@ -277,29 +274,19 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed)
277static void amd74xx_tune_drive(ide_drive_t *drive, u8 pio) 274static void amd74xx_tune_drive(ide_drive_t *drive, u8 pio)
278{ 275{
279 if (pio == 255) { 276 if (pio == 255) {
280 amd_set_drive(drive, ide_find_best_mode(drive, XFER_PIO | XFER_EPIO)); 277 amd_set_drive(drive, ide_find_best_pio_mode(drive));
281 return; 278 return;
282 } 279 }
283 280
284 amd_set_drive(drive, XFER_PIO_0 + min_t(byte, pio, 5)); 281 amd_set_drive(drive, XFER_PIO_0 + min_t(byte, pio, 5));
285} 282}
286 283
287/*
288 * amd74xx_dmaproc() is a callback from upper layers that can do
289 * a lot, but we use it for DMA/PIO tuning only, delegating everything
290 * else to the default ide_dmaproc().
291 */
292
293static int amd74xx_ide_dma_check(ide_drive_t *drive) 284static int amd74xx_ide_dma_check(ide_drive_t *drive)
294{ 285{
295 int w80 = HWIF(drive)->udma_four; 286 u8 speed = ide_max_dma_mode(drive);
296 287
297 u8 speed = ide_find_best_mode(drive, 288 if (speed == 0)
298 XFER_PIO | XFER_EPIO | XFER_MWDMA | XFER_UDMA | 289 speed = ide_find_best_pio_mode(drive);
299 ((amd_config->flags & AMD_BAD_SWDMA) ? 0 : XFER_SWDMA) |
300 (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_66 ? XFER_UDMA_66 : 0) |
301 (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_100 ? XFER_UDMA_100 : 0) |
302 (w80 && (amd_config->flags & AMD_UDMA) >= AMD_UDMA_133 ? XFER_UDMA_133 : 0));
303 290
304 amd_set_drive(drive, speed); 291 amd_set_drive(drive, speed);
305 292
@@ -334,10 +321,10 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
334 * Check 80-wire cable presence. 321 * Check 80-wire cable presence.
335 */ 322 */
336 323
337 switch (amd_config->flags & AMD_UDMA) { 324 switch (amd_config->udma_mask) {
338 325
339 case AMD_UDMA_133: 326 case ATA_UDMA6:
340 case AMD_UDMA_100: 327 case ATA_UDMA5:
341 pci_read_config_byte(dev, AMD_CABLE_DETECT, &t); 328 pci_read_config_byte(dev, AMD_CABLE_DETECT, &t);
342 pci_read_config_dword(dev, AMD_UDMA_TIMING, &u); 329 pci_read_config_dword(dev, AMD_UDMA_TIMING, &u);
343 amd_80w = ((t & 0x3) ? 1 : 0) | ((t & 0xc) ? 2 : 0); 330 amd_80w = ((t & 0x3) ? 1 : 0) | ((t & 0xc) ? 2 : 0);
@@ -349,7 +336,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
349 } 336 }
350 break; 337 break;
351 338
352 case AMD_UDMA_66: 339 case ATA_UDMA4:
353 /* no host side cable detection */ 340 /* no host side cable detection */
354 amd_80w = 0x03; 341 amd_80w = 0x03;
355 break; 342 break;
@@ -370,7 +357,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
370 if ((amd_config->flags & AMD_CHECK_SERENADE) && 357 if ((amd_config->flags & AMD_CHECK_SERENADE) &&
371 dev->subsystem_vendor == PCI_VENDOR_ID_AMD && 358 dev->subsystem_vendor == PCI_VENDOR_ID_AMD &&
372 dev->subsystem_device == PCI_DEVICE_ID_AMD_SERENADE) 359 dev->subsystem_device == PCI_DEVICE_ID_AMD_SERENADE)
373 amd_config->flags = AMD_UDMA_100; 360 amd_config->udma_mask = ATA_UDMA5;
374 361
375/* 362/*
376 * Determine the system bus clock. 363 * Determine the system bus clock.
@@ -395,8 +382,9 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
395 */ 382 */
396 383
397 pci_read_config_byte(dev, PCI_REVISION_ID, &t); 384 pci_read_config_byte(dev, PCI_REVISION_ID, &t);
398 printk(KERN_INFO "%s: %s (rev %02x) %s controller\n", 385 printk(KERN_INFO "%s: %s (rev %02x) UDMA%s controller\n",
399 amd_chipset->name, pci_name(dev), t, amd_dma[amd_config->flags & AMD_UDMA]); 386 amd_chipset->name, pci_name(dev), t,
387 amd_dma[fls(amd_config->udma_mask) - 1]);
400 388
401/* 389/*
402 * Register /proc/ide/amd74xx entry 390 * Register /proc/ide/amd74xx entry
@@ -437,12 +425,19 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif)
437 return; 425 return;
438 426
439 hwif->atapi_dma = 1; 427 hwif->atapi_dma = 1;
440 hwif->ultra_mask = 0x7f;
441 hwif->mwdma_mask = 0x07;
442 hwif->swdma_mask = 0x07;
443 428
444 if (!hwif->udma_four) 429 hwif->ultra_mask = amd_config->udma_mask;
445 hwif->udma_four = (amd_80w >> hwif->channel) & 1; 430 hwif->mwdma_mask = 0x07;
431 if ((amd_config->flags & AMD_BAD_SWDMA) == 0)
432 hwif->swdma_mask = 0x07;
433
434 if (hwif->cbl != ATA_CBL_PATA40_SHORT) {
435 if ((amd_80w >> hwif->channel) & 1)
436 hwif->cbl = ATA_CBL_PATA80;
437 else
438 hwif->cbl = ATA_CBL_PATA40;
439 }
440
446 hwif->ide_dma_check = &amd74xx_ide_dma_check; 441 hwif->ide_dma_check = &amd74xx_ide_dma_check;
447 if (!noautodma) 442 if (!noautodma)
448 hwif->autodma = 1; 443 hwif->autodma = 1;
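With the per-chip udma_mask in place, the highest supported mode is simply fls(mask) - 1, which is what now indexes the reworked amd_dma[] rate table. A quick stand-alone check of that arithmetic; fls_compat() stands in for the kernel's fls().

#include <stdio.h>

/* Position of the highest set bit, 1-based; 0 for an empty mask. */
static int fls_compat(unsigned int x)
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        unsigned int udma_mask = 0x3f;  /* ATA_UDMA5-style mask: modes 0..5 */
        static const char *rate[] = { "16", "25", "33", "44", "66", "100", "133" };

        /* 0x3f -> fls() == 6 -> index 5 -> "100", i.e. UDMA/100. */
        printf("Highest DMA rate: UDMA%s\n", rate[fls_compat(udma_mask) - 1]);
        return 0;
}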
diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index 8ab33faf6f76..2761510309b3 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -264,10 +264,11 @@ static void __devinit init_hwif_atiixp(ide_hwif_t *hwif)
264 hwif->swdma_mask = 0x04; 264 hwif->swdma_mask = 0x04;
265 265
266 pci_read_config_byte(pdev, ATIIXP_IDE_UDMA_MODE + ch, &udma_mode); 266 pci_read_config_byte(pdev, ATIIXP_IDE_UDMA_MODE + ch, &udma_mode);
267
267 if ((udma_mode & 0x07) >= 0x04 || (udma_mode & 0x70) >= 0x40) 268 if ((udma_mode & 0x07) >= 0x04 || (udma_mode & 0x70) >= 0x40)
268 hwif->udma_four = 1; 269 hwif->cbl = ATA_CBL_PATA80;
269 else 270 else
270 hwif->udma_four = 0; 271 hwif->cbl = ATA_CBL_PATA40;
271 272
272 hwif->dma_host_on = &atiixp_dma_host_on; 273 hwif->dma_host_on = &atiixp_dma_host_on;
273 hwif->dma_host_off = &atiixp_dma_host_off; 274 hwif->dma_host_off = &atiixp_dma_host_off;
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 7c57dc696f52..8631b6c8aa15 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/cmd64x.c Version 1.47 Mar 19, 2007 2 * linux/drivers/ide/pci/cmd64x.c Version 1.50 May 10, 2007
3 * 3 *
4 * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines. 4 * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines.
5 * Due to massive hardware bugs, UltraDMA is only supported 5 * Due to massive hardware bugs, UltraDMA is only supported
@@ -52,9 +52,6 @@
52#define ARTTIM23_DIS_RA2 0x04 52#define ARTTIM23_DIS_RA2 0x04
53#define ARTTIM23_DIS_RA3 0x08 53#define ARTTIM23_DIS_RA3 0x08
54#define ARTTIM23_INTR_CH1 0x10 54#define ARTTIM23_INTR_CH1 0x10
55#define ARTTIM2 0x57
56#define ARTTIM3 0x57
57#define DRWTIM23 0x58
58#define DRWTIM2 0x58 55#define DRWTIM2 0x58
59#define BRST 0x59 56#define BRST 0x59
60#define DRWTIM3 0x5b 57#define DRWTIM3 0x5b
@@ -469,71 +466,43 @@ static int cmd646_1_ide_dma_end (ide_drive_t *drive)
469 466
470static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const char *name) 467static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const char *name)
471{ 468{
472 u32 class_rev = 0;
473 u8 mrdmode = 0; 469 u8 mrdmode = 0;
474 470
475 pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev); 471 if (dev->device == PCI_DEVICE_ID_CMD_646) {
476 class_rev &= 0xff; 472 u8 rev = 0;
477 473
478 switch(dev->device) { 474 pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
479 case PCI_DEVICE_ID_CMD_643: 475
480 break; 476 switch (rev) {
481 case PCI_DEVICE_ID_CMD_646: 477 case 0x07:
482 printk(KERN_INFO "%s: chipset revision 0x%02X, ", name, class_rev); 478 case 0x05:
483 switch(class_rev) { 479 printk("%s: UltraDMA capable", name);
484 case 0x07:
485 case 0x05:
486 printk("UltraDMA Capable");
487 break;
488 case 0x03:
489 printk("MultiWord DMA Force Limited");
490 break;
491 case 0x01:
492 default:
493 printk("MultiWord DMA Limited, IRQ workaround enabled");
494 break;
495 }
496 printk("\n");
497 break;
498 case PCI_DEVICE_ID_CMD_648:
499 case PCI_DEVICE_ID_CMD_649:
500 break; 480 break;
481 case 0x03:
501 default: 482 default:
483 printk("%s: MultiWord DMA force limited", name);
484 break;
485 case 0x01:
486 printk("%s: MultiWord DMA limited, "
487 "IRQ workaround enabled\n", name);
502 break; 488 break;
489 }
503 } 490 }
504 491
505 /* Set a good latency timer and cache line size value. */ 492 /* Set a good latency timer and cache line size value. */
506 (void) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 64); 493 (void) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 64);
507 /* FIXME: pci_set_master() to ensure a good latency timer value */ 494 /* FIXME: pci_set_master() to ensure a good latency timer value */
508 495
509 /* Setup interrupts. */ 496 /*
510 (void) pci_read_config_byte(dev, MRDMODE, &mrdmode); 497 * Enable interrupts, select MEMORY READ LINE for reads.
511 mrdmode &= ~(0x30); 498 *
512 (void) pci_write_config_byte(dev, MRDMODE, mrdmode); 499 * NOTE: although not mentioned in the PCI0646U specs,
513 500 * bits 0-1 are write only and won't be read back as
514 /* Use MEMORY READ LINE for reads. 501 * set or not -- PCI0646U2 specs clarify this point.
515 * NOTE: Although not mentioned in the PCI0646U specs,
516 * these bits are write only and won't be read
517 * back as set or not. The PCI0646U2 specs clarify
518 * this point.
519 */ 502 */
520 (void) pci_write_config_byte(dev, MRDMODE, mrdmode | 0x02); 503 (void) pci_read_config_byte (dev, MRDMODE, &mrdmode);
521 504 mrdmode &= ~0x30;
522 /* Set reasonable active/recovery/address-setup values. */ 505 (void) pci_write_config_byte(dev, MRDMODE, (mrdmode | 0x02));
523 (void) pci_write_config_byte(dev, ARTTIM0, 0x40);
524 (void) pci_write_config_byte(dev, DRWTIM0, 0x3f);
525 (void) pci_write_config_byte(dev, ARTTIM1, 0x40);
526 (void) pci_write_config_byte(dev, DRWTIM1, 0x3f);
527#ifdef __i386__
528 (void) pci_write_config_byte(dev, ARTTIM23, 0x1c);
529#else
530 (void) pci_write_config_byte(dev, ARTTIM23, 0x5c);
531#endif
532 (void) pci_write_config_byte(dev, DRWTIM23, 0x3f);
533 (void) pci_write_config_byte(dev, DRWTIM3, 0x3f);
534#ifdef CONFIG_PPC
535 (void) pci_write_config_byte(dev, UDIDETCR0, 0xf0);
536#endif /* CONFIG_PPC */
537 506
538#if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS) 507#if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
539 508
@@ -548,29 +517,27 @@ static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const cha
548 return 0; 517 return 0;
549} 518}
550 519
551static unsigned int __devinit ata66_cmd64x(ide_hwif_t *hwif) 520static u8 __devinit ata66_cmd64x(ide_hwif_t *hwif)
552{ 521{
553 u8 ata66 = 0, mask = (hwif->channel) ? 0x02 : 0x01; 522 struct pci_dev *dev = hwif->pci_dev;
523 u8 bmidecsr = 0, mask = hwif->channel ? 0x02 : 0x01;
554 524
555 switch(hwif->pci_dev->device) { 525 switch (dev->device) {
556 case PCI_DEVICE_ID_CMD_643: 526 case PCI_DEVICE_ID_CMD_648:
557 case PCI_DEVICE_ID_CMD_646: 527 case PCI_DEVICE_ID_CMD_649:
558 return ata66; 528 pci_read_config_byte(dev, BMIDECSR, &bmidecsr);
559 default: 529 return (bmidecsr & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
560 break; 530 default:
531 return ATA_CBL_PATA40;
561 } 532 }
562 pci_read_config_byte(hwif->pci_dev, BMIDECSR, &ata66);
563 return (ata66 & mask) ? 1 : 0;
564} 533}
565 534
566static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif) 535static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
567{ 536{
568 struct pci_dev *dev = hwif->pci_dev; 537 struct pci_dev *dev = hwif->pci_dev;
569 unsigned int class_rev; 538 u8 rev = 0;
570 539
571 hwif->autodma = 0; 540 pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
572 pci_read_config_dword(dev, PCI_CLASS_REVISION, &class_rev);
573 class_rev &= 0xff;
574 541
575 hwif->tuneproc = &cmd64x_tune_drive; 542 hwif->tuneproc = &cmd64x_tune_drive;
576 hwif->speedproc = &cmd64x_tune_chipset; 543 hwif->speedproc = &cmd64x_tune_chipset;
@@ -580,8 +547,8 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
580 if (!hwif->dma_base) 547 if (!hwif->dma_base)
581 return; 548 return;
582 549
583 hwif->atapi_dma = 1; 550 hwif->atapi_dma = 1;
584 551 hwif->mwdma_mask = 0x07;
585 hwif->ultra_mask = hwif->cds->udma_mask; 552 hwif->ultra_mask = hwif->cds->udma_mask;
586 553
587 /* 554 /*
@@ -596,16 +563,15 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
596 * 563 *
597 * So we only do UltraDMA on revision 0x05 and 0x07 chipsets. 564 * So we only do UltraDMA on revision 0x05 and 0x07 chipsets.
598 */ 565 */
599 if (dev->device == PCI_DEVICE_ID_CMD_646 && class_rev < 5) 566 if (dev->device == PCI_DEVICE_ID_CMD_646 && rev < 5)
600 hwif->ultra_mask = 0x00; 567 hwif->ultra_mask = 0x00;
601 568
602 hwif->mwdma_mask = 0x07;
603
604 hwif->ide_dma_check = &cmd64x_config_drive_for_dma; 569 hwif->ide_dma_check = &cmd64x_config_drive_for_dma;
605 if (!(hwif->udma_four))
606 hwif->udma_four = ata66_cmd64x(hwif);
607 570
608 switch(dev->device) { 571 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
572 hwif->cbl = ata66_cmd64x(hwif);
573
574 switch (dev->device) {
609 case PCI_DEVICE_ID_CMD_648: 575 case PCI_DEVICE_ID_CMD_648:
610 case PCI_DEVICE_ID_CMD_649: 576 case PCI_DEVICE_ID_CMD_649:
611 alt_irq_bits: 577 alt_irq_bits:
@@ -614,10 +580,10 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
614 break; 580 break;
615 case PCI_DEVICE_ID_CMD_646: 581 case PCI_DEVICE_ID_CMD_646:
616 hwif->chipset = ide_cmd646; 582 hwif->chipset = ide_cmd646;
617 if (class_rev == 0x01) { 583 if (rev == 0x01) {
618 hwif->ide_dma_end = &cmd646_1_ide_dma_end; 584 hwif->ide_dma_end = &cmd646_1_ide_dma_end;
619 break; 585 break;
620 } else if (class_rev >= 0x03) 586 } else if (rev >= 0x03)
621 goto alt_irq_bits; 587 goto alt_irq_bits;
622 /* fall thru */ 588 /* fall thru */
623 default: 589 default:
@@ -626,11 +592,9 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
626 break; 592 break;
627 } 593 }
628 594
629
630 if (!noautodma) 595 if (!noautodma)
631 hwif->autodma = 1; 596 hwif->autodma = 1;
632 hwif->drives[0].autodma = hwif->autodma; 597 hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma;
633 hwif->drives[1].autodma = hwif->autodma;
634} 598}
635 599
636static int __devinit init_setup_cmd64x(struct pci_dev *dev, ide_pci_device_t *d) 600static int __devinit init_setup_cmd64x(struct pci_dev *dev, ide_pci_device_t *d)
diff --git a/drivers/ide/pci/cs5535.c b/drivers/ide/pci/cs5535.c
index 41925c47ef05..10f61f38243c 100644
--- a/drivers/ide/pci/cs5535.c
+++ b/drivers/ide/pci/cs5535.c
@@ -187,7 +187,8 @@ static u8 __devinit cs5535_cable_detect(struct pci_dev *dev)
187 187
188 /* if a 80 wire cable was detected */ 188 /* if a 80 wire cable was detected */
189 pci_read_config_byte(dev, CS5535_CABLE_DETECT, &bit); 189 pci_read_config_byte(dev, CS5535_CABLE_DETECT, &bit);
190 return (bit & 1); 190
191 return (bit & 1) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
191} 192}
192 193
193/**** 194/****
@@ -212,8 +213,7 @@ static void __devinit init_hwif_cs5535(ide_hwif_t *hwif)
212 hwif->ultra_mask = 0x1F; 213 hwif->ultra_mask = 0x1F;
213 hwif->mwdma_mask = 0x07; 214 hwif->mwdma_mask = 0x07;
214 215
215 216 hwif->cbl = cs5535_cable_detect(hwif->pci_dev);
216 hwif->udma_four = cs5535_cable_detect(hwif->pci_dev);
217 217
218 if (!noautodma) 218 if (!noautodma)
219 hwif->autodma = 1; 219 hwif->autodma = 1;
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index c33d0b0f11c9..4b6bae8eee82 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/hpt366.c Version 1.06 Jun 27, 2007 2 * linux/drivers/ide/pci/hpt366.c Version 1.10 Jun 29, 2007
3 * 3 *
4 * Copyright (C) 1999-2003 Andre Hedrick <andre@linux-ide.org> 4 * Copyright (C) 1999-2003 Andre Hedrick <andre@linux-ide.org>
5 * Portions Copyright (C) 2001 Sun Microsystems, Inc. 5 * Portions Copyright (C) 2001 Sun Microsystems, Inc.
@@ -77,7 +77,7 @@
77 * since they may tamper with its fields 77 * since they may tamper with its fields
78 * - prefix the driver startup messages with the real chip name 78 * - prefix the driver startup messages with the real chip name
79 * - claim the extra 240 bytes of I/O space for all chips 79 * - claim the extra 240 bytes of I/O space for all chips
80 * - optimize the rate masking/filtering and the drive list lookup code 80 * - optimize the UltraDMA filtering and the drive list lookup code
81 * - use pci_get_slot() to get to the function 1 of HPT36x/374 81 * - use pci_get_slot() to get to the function 1 of HPT36x/374
82 * - cache offset of the channel's misc. control registers (MCRs) being used 82 * - cache offset of the channel's misc. control registers (MCRs) being used
83 * throughout the driver 83 * throughout the driver
@@ -99,9 +99,9 @@
99 * stop duplicating it for each channel by storing the pointer in the pci_dev 99 * stop duplicating it for each channel by storing the pointer in the pci_dev
100 * structure: first, at the init_setup stage, point it to a static "template" 100 * structure: first, at the init_setup stage, point it to a static "template"
101 * with only the chip type and its specific base DPLL frequency, the highest 101 * with only the chip type and its specific base DPLL frequency, the highest
102 * supported DMA mode, and the chip settings table pointer filled, then, at 102 * UltraDMA mode, and the chip settings table pointer filled, then, at the
103 * the init_chipset stage, allocate per-chip instance and fill it with the 103 * init_chipset stage, allocate per-chip instance and fill it with the rest
104 * rest of the necessary information 104 * of the necessary information
105 * - get rid of the constant thresholds in the HPT37x PCI clock detection code, 105 * - get rid of the constant thresholds in the HPT37x PCI clock detection code,
106 * switch to calculating PCI clock frequency based on the chip's base DPLL 106 * switch to calculating PCI clock frequency based on the chip's base DPLL
107 * frequency 107 * frequency
@@ -112,6 +112,7 @@
112 * also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips; 112 * also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips;
113 * unify HPT36x/37x timing setup code and the speedproc handlers by joining 113 * unify HPT36x/37x timing setup code and the speedproc handlers by joining
114 * the register setting lists into the table indexed by the clock selected 114 * the register setting lists into the table indexed by the clock selected
115 * - set the correct hwif->ultra_mask for each individual chip
115 * Sergei Shtylyov, <sshtylyov@ru.mvista.com> or <source@mvista.com> 116 * Sergei Shtylyov, <sshtylyov@ru.mvista.com> or <source@mvista.com>
116 */ 117 */
117 118
@@ -391,7 +392,7 @@ enum ata_clock {
391 392
392struct hpt_info { 393struct hpt_info {
393 u8 chip_type; /* Chip type */ 394 u8 chip_type; /* Chip type */
394 u8 max_mode; /* Speeds allowed */ 395 u8 max_ultra; /* Max. UltraDMA mode allowed */
395 u8 dpll_clk; /* DPLL clock in MHz */ 396 u8 dpll_clk; /* DPLL clock in MHz */
396 u8 pci_clk; /* PCI clock in MHz */ 397 u8 pci_clk; /* PCI clock in MHz */
397 u32 **settings; /* Chipset settings table */ 398 u32 **settings; /* Chipset settings table */
@@ -430,77 +431,77 @@ static u32 *hpt37x_settings[NUM_ATA_CLOCKS] = {
430 431
431static struct hpt_info hpt36x __devinitdata = { 432static struct hpt_info hpt36x __devinitdata = {
432 .chip_type = HPT36x, 433 .chip_type = HPT36x,
433 .max_mode = (HPT366_ALLOW_ATA66_4 || HPT366_ALLOW_ATA66_3) ? 2 : 1, 434 .max_ultra = HPT366_ALLOW_ATA66_3 ? (HPT366_ALLOW_ATA66_4 ? 4 : 3) : 2,
434 .dpll_clk = 0, /* no DPLL */ 435 .dpll_clk = 0, /* no DPLL */
435 .settings = hpt36x_settings 436 .settings = hpt36x_settings
436}; 437};
437 438
438static struct hpt_info hpt370 __devinitdata = { 439static struct hpt_info hpt370 __devinitdata = {
439 .chip_type = HPT370, 440 .chip_type = HPT370,
440 .max_mode = HPT370_ALLOW_ATA100_5 ? 3 : 2, 441 .max_ultra = HPT370_ALLOW_ATA100_5 ? 5 : 4,
441 .dpll_clk = 48, 442 .dpll_clk = 48,
442 .settings = hpt37x_settings 443 .settings = hpt37x_settings
443}; 444};
444 445
445static struct hpt_info hpt370a __devinitdata = { 446static struct hpt_info hpt370a __devinitdata = {
446 .chip_type = HPT370A, 447 .chip_type = HPT370A,
447 .max_mode = HPT370_ALLOW_ATA100_5 ? 3 : 2, 448 .max_ultra = HPT370_ALLOW_ATA100_5 ? 5 : 4,
448 .dpll_clk = 48, 449 .dpll_clk = 48,
449 .settings = hpt37x_settings 450 .settings = hpt37x_settings
450}; 451};
451 452
452static struct hpt_info hpt374 __devinitdata = { 453static struct hpt_info hpt374 __devinitdata = {
453 .chip_type = HPT374, 454 .chip_type = HPT374,
454 .max_mode = 3, 455 .max_ultra = 5,
455 .dpll_clk = 48, 456 .dpll_clk = 48,
456 .settings = hpt37x_settings 457 .settings = hpt37x_settings
457}; 458};
458 459
459static struct hpt_info hpt372 __devinitdata = { 460static struct hpt_info hpt372 __devinitdata = {
460 .chip_type = HPT372, 461 .chip_type = HPT372,
461 .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, 462 .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5,
462 .dpll_clk = 55, 463 .dpll_clk = 55,
463 .settings = hpt37x_settings 464 .settings = hpt37x_settings
464}; 465};
465 466
466static struct hpt_info hpt372a __devinitdata = { 467static struct hpt_info hpt372a __devinitdata = {
467 .chip_type = HPT372A, 468 .chip_type = HPT372A,
468 .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, 469 .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5,
469 .dpll_clk = 66, 470 .dpll_clk = 66,
470 .settings = hpt37x_settings 471 .settings = hpt37x_settings
471}; 472};
472 473
473static struct hpt_info hpt302 __devinitdata = { 474static struct hpt_info hpt302 __devinitdata = {
474 .chip_type = HPT302, 475 .chip_type = HPT302,
475 .max_mode = HPT302_ALLOW_ATA133_6 ? 4 : 3, 476 .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5,
476 .dpll_clk = 66, 477 .dpll_clk = 66,
477 .settings = hpt37x_settings 478 .settings = hpt37x_settings
478}; 479};
479 480
480static struct hpt_info hpt371 __devinitdata = { 481static struct hpt_info hpt371 __devinitdata = {
481 .chip_type = HPT371, 482 .chip_type = HPT371,
482 .max_mode = HPT371_ALLOW_ATA133_6 ? 4 : 3, 483 .max_ultra = HPT371_ALLOW_ATA133_6 ? 6 : 5,
483 .dpll_clk = 66, 484 .dpll_clk = 66,
484 .settings = hpt37x_settings 485 .settings = hpt37x_settings
485}; 486};
486 487
487static struct hpt_info hpt372n __devinitdata = { 488static struct hpt_info hpt372n __devinitdata = {
488 .chip_type = HPT372N, 489 .chip_type = HPT372N,
489 .max_mode = HPT372_ALLOW_ATA133_6 ? 4 : 3, 490 .max_ultra = HPT372_ALLOW_ATA133_6 ? 6 : 5,
490 .dpll_clk = 77, 491 .dpll_clk = 77,
491 .settings = hpt37x_settings 492 .settings = hpt37x_settings
492}; 493};
493 494
494static struct hpt_info hpt302n __devinitdata = { 495static struct hpt_info hpt302n __devinitdata = {
495 .chip_type = HPT302N, 496 .chip_type = HPT302N,
496 .max_mode = HPT302_ALLOW_ATA133_6 ? 4 : 3, 497 .max_ultra = HPT302_ALLOW_ATA133_6 ? 6 : 5,
497 .dpll_clk = 77, 498 .dpll_clk = 77,
498 .settings = hpt37x_settings 499 .settings = hpt37x_settings
499}; 500};
500 501
501static struct hpt_info hpt371n __devinitdata = { 502static struct hpt_info hpt371n __devinitdata = {
502 .chip_type = HPT371N, 503 .chip_type = HPT371N,
503 .max_mode = HPT371_ALLOW_ATA133_6 ? 4 : 3, 504 .max_ultra = HPT371_ALLOW_ATA133_6 ? 6 : 5,
504 .dpll_clk = 77, 505 .dpll_clk = 77,
505 .settings = hpt37x_settings 506 .settings = hpt37x_settings
506}; 507};
@@ -523,53 +524,38 @@ static int check_in_drive_list(ide_drive_t *drive, const char **list)
523static u8 hpt3xx_udma_filter(ide_drive_t *drive) 524static u8 hpt3xx_udma_filter(ide_drive_t *drive)
524{ 525{
525 struct hpt_info *info = pci_get_drvdata(HWIF(drive)->pci_dev); 526 struct hpt_info *info = pci_get_drvdata(HWIF(drive)->pci_dev);
526 u8 chip_type = info->chip_type;
527 u8 mode = info->max_mode;
528 u8 mask; 527 u8 mask;
529 528
530 switch (mode) { 529 switch (info->chip_type) {
531 case 0x04: 530 case HPT370A:
532 mask = 0x7f; 531 if (!HPT370_ALLOW_ATA100_5 ||
533 break; 532 check_in_drive_list(drive, bad_ata100_5))
534 case 0x03: 533 return 0x1f;
534 else
535 return 0x3f;
536 case HPT370:
537 if (!HPT370_ALLOW_ATA100_5 ||
538 check_in_drive_list(drive, bad_ata100_5))
539 mask = 0x1f;
540 else
535 mask = 0x3f; 541 mask = 0x3f;
536 if (chip_type >= HPT374) 542 break;
537 break; 543 case HPT36x:
538 if (!check_in_drive_list(drive, bad_ata100_5)) 544 if (!HPT366_ALLOW_ATA66_4 ||
539 goto check_bad_ata33; 545 check_in_drive_list(drive, bad_ata66_4))
540 /* fall thru */ 546 mask = 0x0f;
541 case 0x02: 547 else
542 mask = 0x1f; 548 mask = 0x1f;
543 549
544 /* 550 if (!HPT366_ALLOW_ATA66_3 ||
545 * CHECK ME, Does this need to be changed to HPT374 ?? 551 check_in_drive_list(drive, bad_ata66_3))
546 */
547 if (chip_type >= HPT370)
548 goto check_bad_ata33;
549 if (HPT366_ALLOW_ATA66_4 &&
550 !check_in_drive_list(drive, bad_ata66_4))
551 goto check_bad_ata33;
552
553 mask = 0x0f;
554 if (HPT366_ALLOW_ATA66_3 &&
555 !check_in_drive_list(drive, bad_ata66_3))
556 goto check_bad_ata33;
557 /* fall thru */
558 case 0x01:
559 mask = 0x07; 552 mask = 0x07;
560 553 break;
561 check_bad_ata33: 554 default:
562 if (chip_type >= HPT370A) 555 return 0x7f;
563 break;
564 if (!check_in_drive_list(drive, bad_ata33))
565 break;
566 /* fall thru */
567 case 0x00:
568 default:
569 mask = 0x00;
570 break;
571 } 556 }
572 return mask; 557
558 return check_in_drive_list(drive, bad_ata33) ? 0x00 : mask;
573} 559}
574 560
575static u32 get_speed_setting(u8 speed, struct hpt_info *info) 561static u32 get_speed_setting(u8 speed, struct hpt_info *info)
@@ -737,7 +723,7 @@ static int hpt366_config_drive_xfer_rate(ide_drive_t *drive)
737 * This is specific to the HPT366 UDMA chipset 723 * This is specific to the HPT366 UDMA chipset
738 * by HighPoint|Triones Technologies, Inc. 724 * by HighPoint|Triones Technologies, Inc.
739 */ 725 */
740static int hpt366_ide_dma_lostirq(ide_drive_t *drive) 726static void hpt366_dma_lost_irq(ide_drive_t *drive)
741{ 727{
742 struct pci_dev *dev = HWIF(drive)->pci_dev; 728 struct pci_dev *dev = HWIF(drive)->pci_dev;
743 u8 mcr1 = 0, mcr3 = 0, scr1 = 0; 729 u8 mcr1 = 0, mcr3 = 0, scr1 = 0;
@@ -749,7 +735,7 @@ static int hpt366_ide_dma_lostirq(ide_drive_t *drive)
749 drive->name, __FUNCTION__, mcr1, mcr3, scr1); 735 drive->name, __FUNCTION__, mcr1, mcr3, scr1);
750 if (scr1 & 0x10) 736 if (scr1 & 0x10)
751 pci_write_config_byte(dev, 0x5a, scr1 & ~0x10); 737 pci_write_config_byte(dev, 0x5a, scr1 & ~0x10);
752 return __ide_dma_lostirq(drive); 738 ide_dma_lost_irq(drive);
753} 739}
754 740
755static void hpt370_clear_engine(ide_drive_t *drive) 741static void hpt370_clear_engine(ide_drive_t *drive)
@@ -799,10 +785,10 @@ static int hpt370_ide_dma_end(ide_drive_t *drive)
799 return __ide_dma_end(drive); 785 return __ide_dma_end(drive);
800} 786}
801 787
802static int hpt370_ide_dma_timeout(ide_drive_t *drive) 788static void hpt370_dma_timeout(ide_drive_t *drive)
803{ 789{
804 hpt370_irq_timeout(drive); 790 hpt370_irq_timeout(drive);
805 return __ide_dma_timeout(drive); 791 ide_dma_timeout(drive);
806} 792}
807 793
808/* returns 1 if DMA IRQ issued, 0 otherwise */ 794/* returns 1 if DMA IRQ issued, 0 otherwise */
@@ -1150,7 +1136,7 @@ static unsigned int __devinit init_chipset_hpt366(struct pci_dev *dev, const cha
1150 * Select 66 MHz DPLL clock only if UltraATA/133 mode is 1136 * Select 66 MHz DPLL clock only if UltraATA/133 mode is
1151 * supported/enabled, use 50 MHz DPLL clock otherwise... 1137 * supported/enabled, use 50 MHz DPLL clock otherwise...
1152 */ 1138 */
1153 if (info->max_mode == 0x04) { 1139 if (info->max_ultra == 6) {
1154 dpll_clk = 66; 1140 dpll_clk = 66;
1155 clock = ATA_CLOCK_66MHZ; 1141 clock = ATA_CLOCK_66MHZ;
1156 } else if (dpll_clk) { /* HPT36x chips don't have DPLL */ 1142 } else if (dpll_clk) { /* HPT36x chips don't have DPLL */
@@ -1243,7 +1229,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1243 struct pci_dev *dev = hwif->pci_dev; 1229 struct pci_dev *dev = hwif->pci_dev;
1244 struct hpt_info *info = pci_get_drvdata(dev); 1230 struct hpt_info *info = pci_get_drvdata(dev);
1245 int serialize = HPT_SERIALIZE_IO; 1231 int serialize = HPT_SERIALIZE_IO;
1246 u8 scr1 = 0, ata66 = (hwif->channel) ? 0x01 : 0x02; 1232 u8 scr1 = 0, ata66 = hwif->channel ? 0x01 : 0x02;
1247 u8 chip_type = info->chip_type; 1233 u8 chip_type = info->chip_type;
1248 u8 new_mcr, old_mcr = 0; 1234 u8 new_mcr, old_mcr = 0;
1249 1235
@@ -1256,7 +1242,9 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1256 hwif->intrproc = &hpt3xx_intrproc; 1242 hwif->intrproc = &hpt3xx_intrproc;
1257 hwif->maskproc = &hpt3xx_maskproc; 1243 hwif->maskproc = &hpt3xx_maskproc;
1258 hwif->busproc = &hpt3xx_busproc; 1244 hwif->busproc = &hpt3xx_busproc;
1259 hwif->udma_filter = &hpt3xx_udma_filter; 1245
1246 if (chip_type <= HPT370A)
1247 hwif->udma_filter = &hpt3xx_udma_filter;
1260 1248
1261 /* 1249 /*
1262 * HPT3xxN chips have some complications: 1250 * HPT3xxN chips have some complications:
@@ -1305,7 +1293,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1305 return; 1293 return;
1306 } 1294 }
1307 1295
1308 hwif->ultra_mask = 0x7f; 1296 hwif->ultra_mask = hwif->cds->udma_mask;
1309 hwif->mwdma_mask = 0x07; 1297 hwif->mwdma_mask = 0x07;
1310 1298
1311 /* 1299 /*
@@ -1342,8 +1330,8 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1342 } else 1330 } else
1343 pci_read_config_byte (dev, 0x5a, &scr1); 1331 pci_read_config_byte (dev, 0x5a, &scr1);
1344 1332
1345 if (!hwif->udma_four) 1333 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
1346 hwif->udma_four = (scr1 & ata66) ? 0 : 1; 1334 hwif->cbl = (scr1 & ata66) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
1347 1335
1348 hwif->ide_dma_check = &hpt366_config_drive_xfer_rate; 1336 hwif->ide_dma_check = &hpt366_config_drive_xfer_rate;
1349 1337
@@ -1353,9 +1341,9 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
1353 } else if (chip_type >= HPT370) { 1341 } else if (chip_type >= HPT370) {
1354 hwif->dma_start = &hpt370_ide_dma_start; 1342 hwif->dma_start = &hpt370_ide_dma_start;
1355 hwif->ide_dma_end = &hpt370_ide_dma_end; 1343 hwif->ide_dma_end = &hpt370_ide_dma_end;
1356 hwif->ide_dma_timeout = &hpt370_ide_dma_timeout; 1344 hwif->dma_timeout = &hpt370_dma_timeout;
1357 } else 1345 } else
1358 hwif->ide_dma_lostirq = &hpt366_ide_dma_lostirq; 1346 hwif->dma_lost_irq = &hpt366_dma_lost_irq;
1359 1347
1360 if (!noautodma) 1348 if (!noautodma)
1361 hwif->autodma = 1; 1349 hwif->autodma = 1;
@@ -1503,9 +1491,35 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d)
1503 1491
1504 pci_read_config_byte(dev, PCI_REVISION_ID, &rev); 1492 pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
1505 1493
1506 if (rev > 6) 1494 switch (rev) {
1495 case 0:
1496 case 1:
1497 case 2:
1498 /*
1499 * HPT36x chips have one channel per function and have
1500 * both channel enable bits located differently and visible
1501 * to both functions -- really stupid design decision... :-(
1502 * Bit 4 is for the primary channel, bit 5 for the secondary.
1503 */
1504 d->channels = 1;
1505 d->enablebits[0].mask = d->enablebits[0].val = 0x10;
1506
1507 d->udma_mask = HPT366_ALLOW_ATA66_3 ?
1508 (HPT366_ALLOW_ATA66_4 ? 0x1f : 0x0f) : 0x07;
1509 break;
1510 case 3:
1511 case 4:
1512 d->udma_mask = HPT370_ALLOW_ATA100_5 ? 0x3f : 0x1f;
1513 break;
1514 default:
1507 rev = 6; 1515 rev = 6;
1508 1516 /* fall thru */
1517 case 5:
1518 case 6:
1519 d->udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f;
1520 break;
1521 }
1522
1509 d->name = chipset_names[rev]; 1523 d->name = chipset_names[rev];
1510 1524
1511 pci_set_drvdata(dev, info[rev]); 1525 pci_set_drvdata(dev, info[rev]);
@@ -1513,15 +1527,6 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d)
1513 if (rev > 2) 1527 if (rev > 2)
1514 goto init_single; 1528 goto init_single;
1515 1529
1516 /*
1517 * HPT36x chips have one channel per function and have
1518 * both channel enable bits located differently and visible
1519 * to both functions -- really stupid design decision... :-(
1520 * Bit 4 is for the primary channel, bit 5 for the secondary.
1521 */
1522 d->channels = 1;
1523 d->enablebits[0].mask = d->enablebits[0].val = 0x10;
1524
1525 if ((dev2 = pci_get_slot(dev->bus, dev->devfn + 1)) != NULL) { 1530 if ((dev2 = pci_get_slot(dev->bus, dev->devfn + 1)) != NULL) {
1526 u8 mcr1 = 0, pin1 = 0, pin2 = 0; 1531 u8 mcr1 = 0, pin1 = 0, pin2 = 0;
1527 int ret; 1532 int ret;
@@ -1573,6 +1578,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1573 .channels = 2, 1578 .channels = 2,
1574 .autodma = AUTODMA, 1579 .autodma = AUTODMA,
1575 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1580 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1581 .udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f,
1576 .bootable = OFF_BOARD, 1582 .bootable = OFF_BOARD,
1577 .extra = 240 1583 .extra = 240
1578 },{ /* 2 */ 1584 },{ /* 2 */
@@ -1584,6 +1590,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1584 .channels = 2, 1590 .channels = 2,
1585 .autodma = AUTODMA, 1591 .autodma = AUTODMA,
1586 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1592 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1593 .udma_mask = HPT302_ALLOW_ATA133_6 ? 0x7f : 0x3f,
1587 .bootable = OFF_BOARD, 1594 .bootable = OFF_BOARD,
1588 .extra = 240 1595 .extra = 240
1589 },{ /* 3 */ 1596 },{ /* 3 */
@@ -1595,6 +1602,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1595 .channels = 2, 1602 .channels = 2,
1596 .autodma = AUTODMA, 1603 .autodma = AUTODMA,
1597 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1604 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1605 .udma_mask = HPT371_ALLOW_ATA133_6 ? 0x7f : 0x3f,
1598 .bootable = OFF_BOARD, 1606 .bootable = OFF_BOARD,
1599 .extra = 240 1607 .extra = 240
1600 },{ /* 4 */ 1608 },{ /* 4 */
@@ -1606,6 +1614,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1606 .channels = 2, /* 4 */ 1614 .channels = 2, /* 4 */
1607 .autodma = AUTODMA, 1615 .autodma = AUTODMA,
1608 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1616 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1617 .udma_mask = 0x3f,
1609 .bootable = OFF_BOARD, 1618 .bootable = OFF_BOARD,
1610 .extra = 240 1619 .extra = 240
1611 },{ /* 5 */ 1620 },{ /* 5 */
@@ -1617,6 +1626,7 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
1617 .channels = 2, /* 4 */ 1626 .channels = 2, /* 4 */
1618 .autodma = AUTODMA, 1627 .autodma = AUTODMA,
1619 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}}, 1628 .enablebits = {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
1629 .udma_mask = HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f,
1620 .bootable = OFF_BOARD, 1630 .bootable = OFF_BOARD,
1621 .extra = 240 1631 .extra = 240
1622 } 1632 }
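
The init_setup_hpt366() hunk above replaces the bare "rev > 6" clamp with a switch that derives both the channel layout and the allowed UDMA mask from the PCI revision ID, using the HPT366_ALLOW_* build-time options visible in the hunk. A minimal sketch of that mapping in isolation (the helper name is invented for illustration; the option macros and mask values are the ones shown in the diff):

/* Sketch: pick a UDMA mask from the HPT36x/37x PCI revision ID. */
static u8 hpt3xx_rev_to_udma_mask(u8 rev)
{
	switch (rev) {
	case 0:
	case 1:
	case 2:		/* HPT36x: UDMA66 at best, gated by build options */
		return HPT366_ALLOW_ATA66_3 ?
		       (HPT366_ALLOW_ATA66_4 ? 0x1f : 0x0f) : 0x07;
	case 3:
	case 4:		/* HPT370/370A: UDMA100 */
		return HPT370_ALLOW_ATA100_5 ? 0x3f : 0x1f;
	default:	/* HPT372 and newer: UDMA133 */
		return HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f;
	}
}

With the mask stored in ide_pci_device_t.udma_mask, init_hwif_hpt366() can copy hwif->cds->udma_mask into hwif->ultra_mask instead of hard-coding 0x7f, which is exactly what the earlier hunk in this file does.
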
diff --git a/drivers/ide/pci/it8213.c b/drivers/ide/pci/it8213.c
index c04a02687b95..ff48c23e571e 100644
--- a/drivers/ide/pci/it8213.c
+++ b/drivers/ide/pci/it8213.c
@@ -231,7 +231,7 @@ static int it8213_config_drive_for_dma (ide_drive_t *drive)
231 231
232static void __devinit init_hwif_it8213(ide_hwif_t *hwif) 232static void __devinit init_hwif_it8213(ide_hwif_t *hwif)
233{ 233{
234 u8 reg42h = 0, ata66 = 0; 234 u8 reg42h = 0;
235 235
236 hwif->speedproc = &it8213_tune_chipset; 236 hwif->speedproc = &it8213_tune_chipset;
237 hwif->tuneproc = &it8213_tuneproc; 237 hwif->tuneproc = &it8213_tuneproc;
@@ -250,11 +250,11 @@ static void __devinit init_hwif_it8213(ide_hwif_t *hwif)
250 hwif->swdma_mask = 0x04; 250 hwif->swdma_mask = 0x04;
251 251
252 pci_read_config_byte(hwif->pci_dev, 0x42, &reg42h); 252 pci_read_config_byte(hwif->pci_dev, 0x42, &reg42h);
253 ata66 = (reg42h & 0x02) ? 0 : 1;
254 253
255 hwif->ide_dma_check = &it8213_config_drive_for_dma; 254 hwif->ide_dma_check = &it8213_config_drive_for_dma;
256 if (!(hwif->udma_four)) 255
257 hwif->udma_four = ata66; 256 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
257 hwif->cbl = (reg42h & 0x02) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
258 258
259 /* 259 /*
260 * The BIOS often doesn't set up DMA on this controller 260 * The BIOS often doesn't set up DMA on this controller
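
This hunk is one instance of the pattern applied throughout the series: the boolean hwif->udma_four flag becomes hwif->cbl, which carries an ATA_CBL_* cable type, and host drivers only overwrite it when it has not already been forced to ATA_CBL_PATA40_SHORT (a short 40-pin hookup, typically a laptop drive wired directly to the controller, that is still treated as safe for the higher UDMA modes). A minimal sketch of the shape most of these drivers end up with; the "mychip" names, register 0x42 and bit 1 are placeholders rather than it8213 specifics:

/* Sketch: generic 2.6.22-era cable detection hook. */
static u8 __devinit mychip_cable_detect(ide_hwif_t *hwif)
{
	u8 reg = 0;

	/* placeholder register/bit for the chip's cable-detect strap */
	pci_read_config_byte(hwif->pci_dev, 0x42, &reg);

	return (reg & 0x02) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
}

static void __devinit init_hwif_mychip(ide_hwif_t *hwif)
{
	/* Respect a short-cable override set earlier (e.g. by a quirk). */
	if (hwif->cbl != ATA_CBL_PATA40_SHORT)
		hwif->cbl = mychip_cable_detect(hwif);
}
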
diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c
index 3aeb7f1b7916..8197b653ba1e 100644
--- a/drivers/ide/pci/it821x.c
+++ b/drivers/ide/pci/it821x.c
@@ -491,10 +491,10 @@ static int it821x_config_drive_for_dma (ide_drive_t *drive)
491 * the needed logic onboard. 491 * the needed logic onboard.
492 */ 492 */
493 493
494static unsigned int __devinit ata66_it821x(ide_hwif_t *hwif) 494static u8 __devinit ata66_it821x(ide_hwif_t *hwif)
495{ 495{
496 /* The reference driver also only does disk side */ 496 /* The reference driver also only does disk side */
497 return 1; 497 return ATA_CBL_PATA80;
498} 498}
499 499
500/** 500/**
@@ -662,8 +662,9 @@ static void __devinit init_hwif_it821x(ide_hwif_t *hwif)
662 hwif->mwdma_mask = 0x07; 662 hwif->mwdma_mask = 0x07;
663 663
664 hwif->ide_dma_check = &it821x_config_drive_for_dma; 664 hwif->ide_dma_check = &it821x_config_drive_for_dma;
665 if (!(hwif->udma_four)) 665
666 hwif->udma_four = ata66_it821x(hwif); 666 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
667 hwif->cbl = ata66_it821x(hwif);
667 668
668 /* 669 /*
669 * The BIOS often doesn't set up DMA on this controller 670 * The BIOS often doesn't set up DMA on this controller
diff --git a/drivers/ide/pci/jmicron.c b/drivers/ide/pci/jmicron.c
index 76ed25147229..a6008f63e71e 100644
--- a/drivers/ide/pci/jmicron.c
+++ b/drivers/ide/pci/jmicron.c
@@ -25,10 +25,10 @@ typedef enum {
25 * ata66_jmicron - Cable check 25 * ata66_jmicron - Cable check
26 * @hwif: IDE port 26 * @hwif: IDE port
27 * 27 *
28 * Return 1 if the cable is 80pin 28 * Returns the cable type.
29 */ 29 */
30 30
31static int __devinit ata66_jmicron(ide_hwif_t *hwif) 31static u8 __devinit ata66_jmicron(ide_hwif_t *hwif)
32{ 32{
33 struct pci_dev *pdev = hwif->pci_dev; 33 struct pci_dev *pdev = hwif->pci_dev;
34 34
@@ -70,16 +70,17 @@ static int __devinit ata66_jmicron(ide_hwif_t *hwif)
70 { 70 {
71 case PORT_PATA0: 71 case PORT_PATA0:
72 if (control & (1 << 3)) /* 40/80 pin primary */ 72 if (control & (1 << 3)) /* 40/80 pin primary */
73 return 0; 73 return ATA_CBL_PATA40;
74 return 1; 74 return ATA_CBL_PATA80;
75 case PORT_PATA1: 75 case PORT_PATA1:
76 if (control5 & (1 << 19)) /* 40/80 pin secondary */ 76 if (control5 & (1 << 19)) /* 40/80 pin secondary */
77 return 0; 77 return ATA_CBL_PATA40;
78 return 1; 78 return ATA_CBL_PATA80;
79 case PORT_SATA: 79 case PORT_SATA:
80 break; 80 break;
81 } 81 }
82 return 1; /* Avoid bogus "control reaches end of non-void function" */ 82 /* Avoid bogus "control reaches end of non-void function" */
83 return ATA_CBL_PATA80;
83} 84}
84 85
85static void jmicron_tuneproc (ide_drive_t *drive, byte mode_wanted) 86static void jmicron_tuneproc (ide_drive_t *drive, byte mode_wanted)
@@ -159,8 +160,9 @@ static void __devinit init_hwif_jmicron(ide_hwif_t *hwif)
159 hwif->mwdma_mask = 0x07; 160 hwif->mwdma_mask = 0x07;
160 161
161 hwif->ide_dma_check = &jmicron_config_drive_for_dma; 162 hwif->ide_dma_check = &jmicron_config_drive_for_dma;
162 if (!(hwif->udma_four)) 163
163 hwif->udma_four = ata66_jmicron(hwif); 164 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
165 hwif->cbl = ata66_jmicron(hwif);
164 166
165 hwif->autodma = 1; 167 hwif->autodma = 1;
166 hwif->drives[0].autodma = hwif->autodma; 168 hwif->drives[0].autodma = hwif->autodma;
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index 0765dce6948e..ee5020df005d 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -225,7 +225,10 @@ static void pdcnew_tune_drive(ide_drive_t *drive, u8 pio)
225 225
226static u8 pdcnew_cable_detect(ide_hwif_t *hwif) 226static u8 pdcnew_cable_detect(ide_hwif_t *hwif)
227{ 227{
228 return get_indexed_reg(hwif, 0x0b) & 0x04; 228 if (get_indexed_reg(hwif, 0x0b) & 0x04)
229 return ATA_CBL_PATA40;
230 else
231 return ATA_CBL_PATA80;
229} 232}
230 233
231static int pdcnew_config_drive_xfer_rate(ide_drive_t *drive) 234static int pdcnew_config_drive_xfer_rate(ide_drive_t *drive)
@@ -509,8 +512,8 @@ static void __devinit init_hwif_pdc202new(ide_hwif_t *hwif)
509 512
510 hwif->ide_dma_check = &pdcnew_config_drive_xfer_rate; 513 hwif->ide_dma_check = &pdcnew_config_drive_xfer_rate;
511 514
512 if (!hwif->udma_four) 515 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
513 hwif->udma_four = pdcnew_cable_detect(hwif) ? 0 : 1; 516 hwif->cbl = pdcnew_cable_detect(hwif);
514 517
515 if (!noautodma) 518 if (!noautodma)
516 hwif->autodma = 1; 519 hwif->autodma = 1;
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index 23844687deea..41ac4a94959f 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -152,8 +152,10 @@ static void pdc202xx_tune_drive(ide_drive_t *drive, u8 pio)
152static u8 pdc202xx_old_cable_detect (ide_hwif_t *hwif) 152static u8 pdc202xx_old_cable_detect (ide_hwif_t *hwif)
153{ 153{
154 u16 CIS = 0, mask = (hwif->channel) ? (1<<11) : (1<<10); 154 u16 CIS = 0, mask = (hwif->channel) ? (1<<11) : (1<<10);
155
155 pci_read_config_word(hwif->pci_dev, 0x50, &CIS); 156 pci_read_config_word(hwif->pci_dev, 0x50, &CIS);
156 return (CIS & mask) ? 1 : 0; 157
158 return (CIS & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
157} 159}
158 160
159/* 161/*
@@ -267,18 +269,24 @@ somebody_else:
267 return (dma_stat & 4) == 4; /* return 1 if INTR asserted */ 269 return (dma_stat & 4) == 4; /* return 1 if INTR asserted */
268} 270}
269 271
270static int pdc202xx_ide_dma_lostirq(ide_drive_t *drive) 272static void pdc202xx_dma_lost_irq(ide_drive_t *drive)
271{ 273{
272 if (HWIF(drive)->resetproc != NULL) 274 ide_hwif_t *hwif = HWIF(drive);
273 HWIF(drive)->resetproc(drive); 275
274 return __ide_dma_lostirq(drive); 276 if (hwif->resetproc != NULL)
277 hwif->resetproc(drive);
278
279 ide_dma_lost_irq(drive);
275} 280}
276 281
277static int pdc202xx_ide_dma_timeout(ide_drive_t *drive) 282static void pdc202xx_dma_timeout(ide_drive_t *drive)
278{ 283{
279 if (HWIF(drive)->resetproc != NULL) 284 ide_hwif_t *hwif = HWIF(drive);
280 HWIF(drive)->resetproc(drive); 285
281 return __ide_dma_timeout(drive); 286 if (hwif->resetproc != NULL)
287 hwif->resetproc(drive);
288
289 ide_dma_timeout(drive);
282} 290}
283 291
284static void pdc202xx_reset_host (ide_hwif_t *hwif) 292static void pdc202xx_reset_host (ide_hwif_t *hwif)
@@ -347,12 +355,13 @@ static void __devinit init_hwif_pdc202xx(ide_hwif_t *hwif)
347 hwif->err_stops_fifo = 1; 355 hwif->err_stops_fifo = 1;
348 356
349 hwif->ide_dma_check = &pdc202xx_config_drive_xfer_rate; 357 hwif->ide_dma_check = &pdc202xx_config_drive_xfer_rate;
350 hwif->ide_dma_lostirq = &pdc202xx_ide_dma_lostirq; 358 hwif->dma_lost_irq = &pdc202xx_dma_lost_irq;
351 hwif->ide_dma_timeout = &pdc202xx_ide_dma_timeout; 359 hwif->dma_timeout = &pdc202xx_dma_timeout;
352 360
353 if (hwif->pci_dev->device != PCI_DEVICE_ID_PROMISE_20246) { 361 if (hwif->pci_dev->device != PCI_DEVICE_ID_PROMISE_20246) {
354 if (!(hwif->udma_four)) 362 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
355 hwif->udma_four = (pdc202xx_old_cable_detect(hwif)) ? 0 : 1; 363 hwif->cbl = pdc202xx_old_cable_detect(hwif);
364
356 hwif->dma_start = &pdc202xx_old_ide_dma_start; 365 hwif->dma_start = &pdc202xx_old_ide_dma_start;
357 hwif->ide_dma_end = &pdc202xx_old_ide_dma_end; 366 hwif->ide_dma_end = &pdc202xx_old_ide_dma_end;
358 } 367 }
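
The other recurring change in this file (and in sgiioc4, sl82c105 and pmac below) is the ide_dma_lostirq/ide_dma_timeout methods becoming dma_lost_irq/dma_timeout: the hooks now return void and finish by calling the generic ide_dma_lost_irq()/ide_dma_timeout() helpers instead of the old __ide_dma_* variants, so the chip-specific part shrinks to the extra reset work. A sketch of that shape, with a hypothetical mychip_reset() standing in for the chip specifics:

/* Sketch: chip-specific lost-IRQ/timeout handlers wrapping the generic ones. */
static void mychip_dma_lost_irq(ide_drive_t *drive)
{
	mychip_reset(drive->hwif);	/* hypothetical chip reset helper */
	ide_dma_lost_irq(drive);	/* generic recovery/bookkeeping */
}

static void mychip_dma_timeout(ide_drive_t *drive)
{
	mychip_reset(drive->hwif);
	ide_dma_timeout(drive);
}

static void __devinit init_hwif_mychip(ide_hwif_t *hwif)
{
	hwif->dma_lost_irq = &mychip_dma_lost_irq;
	hwif->dma_timeout  = &mychip_dma_timeout;
}
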
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 8b219dd63024..2e0b29ef596a 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/piix.c Version 0.47 February 8, 2007 2 * linux/drivers/ide/pci/piix.c Version 0.50 Jun 10, 2007
3 * 3 *
4 * Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer 4 * Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer
5 * Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org> 5 * Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org>
@@ -394,14 +394,45 @@ static void piix_dma_clear_irq(ide_drive_t *drive)
394 hwif->OUTB(dma_stat, hwif->dma_status); 394 hwif->OUTB(dma_stat, hwif->dma_status);
395} 395}
396 396
397static int __devinit piix_cable_detect(ide_hwif_t *hwif) 397struct ich_laptop {
398 u16 device;
399 u16 subvendor;
400 u16 subdevice;
401};
402
403/*
404 * List of laptops that use short cables rather than 80 wire
405 */
406
407static const struct ich_laptop ich_laptop[] = {
408 /* devid, subvendor, subdev */
409 { 0x27DF, 0x0005, 0x0280 }, /* ICH7 on Acer 5602WLMi */
410 { 0x27DF, 0x1025, 0x0110 }, /* ICH7 on Acer 3682WLMi */
411 { 0x27DF, 0x1043, 0x1267 }, /* ICH7 on Asus W5F */
412 { 0x24CA, 0x1025, 0x0061 }, /* ICH4 on Acer Aspire 2023WLMi */
413 /* end marker */
414 { 0, }
415};
416
417static u8 __devinit piix_cable_detect(ide_hwif_t *hwif)
398{ 418{
399 struct pci_dev *dev = hwif->pci_dev; 419 struct pci_dev *pdev = hwif->pci_dev;
420 const struct ich_laptop *lap = &ich_laptop[0];
400 u8 reg54h = 0, mask = hwif->channel ? 0xc0 : 0x30; 421 u8 reg54h = 0, mask = hwif->channel ? 0xc0 : 0x30;
401 422
402 pci_read_config_byte(dev, 0x54, &reg54h); 423 /* check for specials */
424 while (lap->device) {
425 if (lap->device == pdev->device &&
426 lap->subvendor == pdev->subsystem_vendor &&
427 lap->subdevice == pdev->subsystem_device) {
428 return ATA_CBL_PATA40_SHORT;
429 }
430 lap++;
431 }
432
433 pci_read_config_byte(pdev, 0x54, &reg54h);
403 434
404 return (reg54h & mask) ? 1 : 0; 435 return (reg54h & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
405} 436}
406 437
407/** 438/**
@@ -444,8 +475,8 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif)
444 hwif->swdma_mask = 0x04; 475 hwif->swdma_mask = 0x04;
445 476
446 if (hwif->ultra_mask & 0x78) { 477 if (hwif->ultra_mask & 0x78) {
447 if (!hwif->udma_four) 478 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
448 hwif->udma_four = piix_cable_detect(hwif); 479 hwif->cbl = piix_cable_detect(hwif);
449 } 480 }
450 481
451 if (no_piix_dma) 482 if (no_piix_dma)
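
The new ich_laptop[] table is how the ATA_CBL_PATA40_SHORT case actually gets set on PIIX/ICH: a handful of laptops are known to use a short 40-wire hookup that is still good for the faster UDMA modes, and they are matched on PCI device plus subsystem IDs before the normal 80-wire detection runs. The lookup is a plain linear scan to the zero terminator; pulled out on its own it would look like this (sketch only, the helper name is invented):

/* Sketch: match a PCI device against the zero-terminated quirk table. */
static int ich_is_short_cable_laptop(const struct pci_dev *pdev)
{
	const struct ich_laptop *lap;

	for (lap = &ich_laptop[0]; lap->device; lap++) {
		if (lap->device == pdev->device &&
		    lap->subvendor == pdev->subsystem_vendor &&
		    lap->subdevice == pdev->subsystem_device)
			return 1;
	}
	return 0;
}

piix_cable_detect() returns ATA_CBL_PATA40_SHORT for these machines so that the 40-wire reading in register 0x54 does not end up capping the attached drive at UDMA/33.
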
diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c
index 55bc0a32e34f..7b87488e3daa 100644
--- a/drivers/ide/pci/scc_pata.c
+++ b/drivers/ide/pci/scc_pata.c
@@ -716,7 +716,7 @@ static void __devinit init_hwif_scc(ide_hwif_t *hwif)
716 hwif->atapi_dma = 1; 716 hwif->atapi_dma = 1;
717 717
718 /* we support 80c cable only. */ 718 /* we support 80c cable only. */
719 hwif->udma_four = 1; 719 hwif->cbl = ATA_CBL_PATA80;
720 720
721 hwif->autodma = 0; 721 hwif->autodma = 0;
722 if (!noautodma) 722 if (!noautodma)
diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index d9c4fd1ae996..1371b5bf6bf0 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/serverworks.c Version 0.11 Jun 2 2007 2 * linux/drivers/ide/pci/serverworks.c Version 0.20 Jun 3 2007
3 * 3 *
4 * Copyright (C) 1998-2000 Michel Aubry 4 * Copyright (C) 1998-2000 Michel Aubry
5 * Copyright (C) 1998-2000 Andrzej Krzysztofowicz 5 * Copyright (C) 1998-2000 Andrzej Krzysztofowicz
@@ -151,84 +151,11 @@ static int svwks_tune_chipset (ide_drive_t *drive, u8 xferspeed)
151 if(dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4 && 151 if(dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4 &&
152 drive->media == ide_disk && speed >= XFER_UDMA_0) 152 drive->media == ide_disk && speed >= XFER_UDMA_0)
153 BUG(); 153 BUG();
154 154
155 pci_read_config_byte(dev, drive_pci[drive->dn], &pio_timing);
156 pci_read_config_byte(dev, drive_pci2[drive->dn], &dma_timing);
157 pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing); 155 pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing);
158 pci_read_config_word(dev, 0x4A, &csb5_pio); 156 pci_read_config_word(dev, 0x4A, &csb5_pio);
159 pci_read_config_byte(dev, 0x54, &ultra_enable); 157 pci_read_config_byte(dev, 0x54, &ultra_enable);
160 158
161 /* If we are in RAID mode (eg AMI MegaIDE) then we can't it
162 turns out trust the firmware configuration */
163
164 if ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE)
165 goto oem_setup_failed;
166
167 /* Per Specified Design by OEM, and ASIC Architect */
168 if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) ||
169 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) {
170 if (!drive->init_speed) {
171 u8 dma_stat = inb(hwif->dma_status);
172
173 if (((ultra_enable << (7-drive->dn) & 0x80) == 0x80) &&
174 ((dma_stat & (1<<(5+unit))) == (1<<(5+unit)))) {
175 drive->current_speed = drive->init_speed = XFER_UDMA_0 + udma_modes[(ultra_timing >> (4*unit)) & ~(0xF0)];
176 return 0;
177 } else if ((dma_timing) &&
178 ((dma_stat&(1<<(5+unit)))==(1<<(5+unit)))) {
179 u8 dmaspeed;
180
181 switch (dma_timing & 0x77) {
182 case 0x20:
183 dmaspeed = XFER_MW_DMA_2;
184 break;
185 case 0x21:
186 dmaspeed = XFER_MW_DMA_1;
187 break;
188 case 0x77:
189 dmaspeed = XFER_MW_DMA_0;
190 break;
191 default:
192 goto dma_pio;
193 }
194
195 drive->current_speed = drive->init_speed = dmaspeed;
196 return 0;
197 }
198dma_pio:
199 if (pio_timing) {
200 u8 piospeed;
201
202 switch (pio_timing & 0x7f) {
203 case 0x20:
204 piospeed = XFER_PIO_4;
205 break;
206 case 0x22:
207 piospeed = XFER_PIO_3;
208 break;
209 case 0x34:
210 piospeed = XFER_PIO_2;
211 break;
212 case 0x47:
213 piospeed = XFER_PIO_1;
214 break;
215 case 0x5d:
216 piospeed = XFER_PIO_0;
217 break;
218 default:
219 goto oem_setup_failed;
220 }
221
222 drive->current_speed = drive->init_speed = piospeed;
223 return 0;
224 }
225 }
226 }
227
228oem_setup_failed:
229
230 pio_timing = 0;
231 dma_timing = 0;
232 ultra_timing &= ~(0x0F << (4*unit)); 159 ultra_timing &= ~(0x0F << (4*unit));
233 ultra_enable &= ~(0x01 << drive->dn); 160 ultra_enable &= ~(0x01 << drive->dn);
234 csb5_pio &= ~(0x0F << (4*drive->dn)); 161 csb5_pio &= ~(0x0F << (4*drive->dn));
@@ -402,9 +329,9 @@ static unsigned int __devinit init_chipset_svwks (struct pci_dev *dev, const cha
402 return dev->irq; 329 return dev->irq;
403} 330}
404 331
405static unsigned int __devinit ata66_svwks_svwks (ide_hwif_t *hwif) 332static u8 __devinit ata66_svwks_svwks(ide_hwif_t *hwif)
406{ 333{
407 return 1; 334 return ATA_CBL_PATA80;
408} 335}
409 336
410/* On Dell PowerEdge servers with a CSB5/CSB6, the top two bits 337/* On Dell PowerEdge servers with a CSB5/CSB6, the top two bits
@@ -414,7 +341,7 @@ static unsigned int __devinit ata66_svwks_svwks (ide_hwif_t *hwif)
414 * Bit 14 clear = primary IDE channel does not have 80-pin cable. 341 * Bit 14 clear = primary IDE channel does not have 80-pin cable.
415 * Bit 14 set = primary IDE channel has 80-pin cable. 342 * Bit 14 set = primary IDE channel has 80-pin cable.
416 */ 343 */
417static unsigned int __devinit ata66_svwks_dell (ide_hwif_t *hwif) 344static u8 __devinit ata66_svwks_dell(ide_hwif_t *hwif)
418{ 345{
419 struct pci_dev *dev = hwif->pci_dev; 346 struct pci_dev *dev = hwif->pci_dev;
420 if (dev->subsystem_vendor == PCI_VENDOR_ID_DELL && 347 if (dev->subsystem_vendor == PCI_VENDOR_ID_DELL &&
@@ -422,8 +349,8 @@ static unsigned int __devinit ata66_svwks_dell (ide_hwif_t *hwif)
422 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE || 349 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE ||
423 dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE)) 350 dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE))
424 return ((1 << (hwif->channel + 14)) & 351 return ((1 << (hwif->channel + 14)) &
425 dev->subsystem_device) ? 1 : 0; 352 dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
426 return 0; 353 return ATA_CBL_PATA40;
427} 354}
428 355
429/* Sun Cobalt Alpine hardware avoids the 80-pin cable 356/* Sun Cobalt Alpine hardware avoids the 80-pin cable
@@ -432,18 +359,18 @@ static unsigned int __devinit ata66_svwks_dell (ide_hwif_t *hwif)
432 * 359 *
433 * WARNING: this only works on Alpine hardware! 360 * WARNING: this only works on Alpine hardware!
434 */ 361 */
435static unsigned int __devinit ata66_svwks_cobalt (ide_hwif_t *hwif) 362static u8 __devinit ata66_svwks_cobalt(ide_hwif_t *hwif)
436{ 363{
437 struct pci_dev *dev = hwif->pci_dev; 364 struct pci_dev *dev = hwif->pci_dev;
438 if (dev->subsystem_vendor == PCI_VENDOR_ID_SUN && 365 if (dev->subsystem_vendor == PCI_VENDOR_ID_SUN &&
439 dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 366 dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
440 dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE) 367 dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE)
441 return ((1 << (hwif->channel + 14)) & 368 return ((1 << (hwif->channel + 14)) &
442 dev->subsystem_device) ? 1 : 0; 369 dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
443 return 0; 370 return ATA_CBL_PATA40;
444} 371}
445 372
446static unsigned int __devinit ata66_svwks (ide_hwif_t *hwif) 373static u8 __devinit ata66_svwks(ide_hwif_t *hwif)
447{ 374{
448 struct pci_dev *dev = hwif->pci_dev; 375 struct pci_dev *dev = hwif->pci_dev;
449 376
@@ -462,9 +389,9 @@ static unsigned int __devinit ata66_svwks (ide_hwif_t *hwif)
462 /* Per Specified Design by OEM, and ASIC Architect */ 389 /* Per Specified Design by OEM, and ASIC Architect */
463 if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) || 390 if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) ||
464 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) 391 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2))
465 return 1; 392 return ATA_CBL_PATA80;
466 393
467 return 0; 394 return ATA_CBL_PATA40;
468} 395}
469 396
470static void __devinit init_hwif_svwks (ide_hwif_t *hwif) 397static void __devinit init_hwif_svwks (ide_hwif_t *hwif)
@@ -495,8 +422,8 @@ static void __devinit init_hwif_svwks (ide_hwif_t *hwif)
495 422
496 hwif->ide_dma_check = &svwks_config_drive_xfer_rate; 423 hwif->ide_dma_check = &svwks_config_drive_xfer_rate;
497 if (hwif->pci_dev->device != PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) { 424 if (hwif->pci_dev->device != PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) {
498 if (!hwif->udma_four) 425 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
499 hwif->udma_four = ata66_svwks(hwif); 426 hwif->cbl = ata66_svwks(hwif);
500 } 427 }
501 if (!noautodma) 428 if (!noautodma)
502 hwif->autodma = 1; 429 hwif->autodma = 1;
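
The Dell and Cobalt helpers above encode the cable information in the PCI subsystem device ID itself: bit 14 describes the primary channel and bit 15 the secondary, hence the (1 << (hwif->channel + 14)) test. Reduced to the decode step (sketch, hypothetical helper name):

/* Sketch: decode the per-channel 80-wire bits from the subsystem ID. */
static u8 svwks_cbl_from_subsys(u16 subsystem_device, int channel)
{
	return (subsystem_device & (1 << (channel + 14))) ?
		ATA_CBL_PATA80 : ATA_CBL_PATA40;
}

For example, a subsystem ID of 0xc000 has bits 14 and 15 set, so both channels report an 80-wire cable, while 0x4000 marks only the primary channel.
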
diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c
index d3185e29a38e..d396b2929ed8 100644
--- a/drivers/ide/pci/sgiioc4.c
+++ b/drivers/ide/pci/sgiioc4.c
@@ -316,14 +316,6 @@ static void sgiioc4_dma_host_off(ide_drive_t * drive)
316 sgiioc4_clearirq(drive); 316 sgiioc4_clearirq(drive);
317} 317}
318 318
319static int
320sgiioc4_ide_dma_lostirq(ide_drive_t * drive)
321{
322 HWIF(drive)->resetproc(drive);
323
324 return __ide_dma_lostirq(drive);
325}
326
327static void 319static void
328sgiioc4_resetproc(ide_drive_t * drive) 320sgiioc4_resetproc(ide_drive_t * drive)
329{ 321{
@@ -331,6 +323,14 @@ sgiioc4_resetproc(ide_drive_t * drive)
331 sgiioc4_clearirq(drive); 323 sgiioc4_clearirq(drive);
332} 324}
333 325
326static void
327sgiioc4_dma_lost_irq(ide_drive_t * drive)
328{
329 sgiioc4_resetproc(drive);
330
331 ide_dma_lost_irq(drive);
332}
333
334static u8 334static u8
335sgiioc4_INB(unsigned long port) 335sgiioc4_INB(unsigned long port)
336{ 336{
@@ -607,8 +607,8 @@ ide_init_sgiioc4(ide_hwif_t * hwif)
607 hwif->ide_dma_test_irq = &sgiioc4_ide_dma_test_irq; 607 hwif->ide_dma_test_irq = &sgiioc4_ide_dma_test_irq;
608 hwif->dma_host_on = &sgiioc4_dma_host_on; 608 hwif->dma_host_on = &sgiioc4_dma_host_on;
609 hwif->dma_host_off = &sgiioc4_dma_host_off; 609 hwif->dma_host_off = &sgiioc4_dma_host_off;
610 hwif->ide_dma_lostirq = &sgiioc4_ide_dma_lostirq; 610 hwif->dma_lost_irq = &sgiioc4_dma_lost_irq;
611 hwif->ide_dma_timeout = &__ide_dma_timeout; 611 hwif->dma_timeout = &ide_dma_timeout;
612 612
613 hwif->INB = &sgiioc4_INB; 613 hwif->INB = &sgiioc4_INB;
614} 614}
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index 1a4444e7226a..1c3e35487893 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -933,16 +933,17 @@ static void __devinit init_iops_siimage(ide_hwif_t *hwif)
933 * interface. 933 * interface.
934 */ 934 */
935 935
936static unsigned int __devinit ata66_siimage(ide_hwif_t *hwif) 936static u8 __devinit ata66_siimage(ide_hwif_t *hwif)
937{ 937{
938 unsigned long addr = siimage_selreg(hwif, 0); 938 unsigned long addr = siimage_selreg(hwif, 0);
939 if (pci_get_drvdata(hwif->pci_dev) == NULL) { 939 u8 ata66 = 0;
940 u8 ata66 = 0; 940
941 if (pci_get_drvdata(hwif->pci_dev) == NULL)
941 pci_read_config_byte(hwif->pci_dev, addr, &ata66); 942 pci_read_config_byte(hwif->pci_dev, addr, &ata66);
942 return (ata66 & 0x01) ? 1 : 0; 943 else
943 } 944 ata66 = hwif->INB(addr);
944 945
945 return (hwif->INB(addr) & 0x01) ? 1 : 0; 946 return (ata66 & 0x01) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
946} 947}
947 948
948/** 949/**
@@ -988,8 +989,9 @@ static void __devinit init_hwif_siimage(ide_hwif_t *hwif)
988 hwif->atapi_dma = 1; 989 hwif->atapi_dma = 1;
989 990
990 hwif->ide_dma_check = &siimage_config_drive_for_dma; 991 hwif->ide_dma_check = &siimage_config_drive_for_dma;
991 if (!(hwif->udma_four)) 992
992 hwif->udma_four = ata66_siimage(hwif); 993 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
994 hwif->cbl = ata66_siimage(hwif);
993 995
994 if (hwif->mmio) { 996 if (hwif->mmio) {
995 hwif->ide_dma_test_irq = &siimage_mmio_ide_dma_test_irq; 997 hwif->ide_dma_test_irq = &siimage_mmio_ide_dma_test_irq;
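
ata66_siimage() now reads the cable bit once and only the access path differs: when the driver has stashed an MMIO mapping in the PCI drvdata (the usual siimage arrangement, assumed here) it goes through hwif->INB(), otherwise it falls back to a config-space read of the same selection register. A stripped-down sketch of that split; in the real driver the address comes from siimage_selreg(), taken as given here:

/* Sketch: one cable bit, two access paths depending on MMIO availability. */
static u8 __devinit mychip_cable_from_selreg(ide_hwif_t *hwif, unsigned long addr)
{
	u8 reg = 0;

	if (pci_get_drvdata(hwif->pci_dev) == NULL)	/* no MMIO mapping */
		pci_read_config_byte(hwif->pci_dev, addr, &reg);
	else
		reg = hwif->INB(addr);			/* memory-mapped copy */

	return (reg & 0x01) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
}
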
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index ec0adad9ef61..f875183ac8d9 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/drivers/ide/pci/sis5513.c Version 0.20 Mar 4, 2007 2 * linux/drivers/ide/pci/sis5513.c Version 0.25 Jun 10, 2007
3 * 3 *
4 * Copyright (C) 1999-2000 Andre Hedrick <andre@linux-ide.org> 4 * Copyright (C) 1999-2000 Andre Hedrick <andre@linux-ide.org>
5 * Copyright (C) 2002 Lionel Bouton <Lionel.Bouton@inet6.fr>, Maintainer 5 * Copyright (C) 2002 Lionel Bouton <Lionel.Bouton@inet6.fr>, Maintainer
@@ -796,10 +796,33 @@ static unsigned int __devinit init_chipset_sis5513 (struct pci_dev *dev, const c
796 return 0; 796 return 0;
797} 797}
798 798
799static unsigned int __devinit ata66_sis5513 (ide_hwif_t *hwif) 799struct sis_laptop {
800 u16 device;
801 u16 subvendor;
802 u16 subdevice;
803};
804
805static const struct sis_laptop sis_laptop[] = {
806 /* devid, subvendor, subdev */
807 { 0x5513, 0x1043, 0x1107 }, /* ASUS A6K */
808 /* end marker */
809 { 0, }
810};
811
812static u8 __devinit ata66_sis5513(ide_hwif_t *hwif)
800{ 813{
814 struct pci_dev *pdev = hwif->pci_dev;
815 const struct sis_laptop *lap = &sis_laptop[0];
801 u8 ata66 = 0; 816 u8 ata66 = 0;
802 817
818 while (lap->device) {
819 if (lap->device == pdev->device &&
820 lap->subvendor == pdev->subsystem_vendor &&
821 lap->subdevice == pdev->subsystem_device)
822 return ATA_CBL_PATA40_SHORT;
823 lap++;
824 }
825
803 if (chipset_family >= ATA_133) { 826 if (chipset_family >= ATA_133) {
804 u16 regw = 0; 827 u16 regw = 0;
805 u16 reg_addr = hwif->channel ? 0x52: 0x50; 828 u16 reg_addr = hwif->channel ? 0x52: 0x50;
@@ -811,7 +834,8 @@ static unsigned int __devinit ata66_sis5513 (ide_hwif_t *hwif)
811 pci_read_config_byte(hwif->pci_dev, 0x48, &reg48h); 834 pci_read_config_byte(hwif->pci_dev, 0x48, &reg48h);
812 ata66 = (reg48h & mask) ? 0 : 1; 835 ata66 = (reg48h & mask) ? 0 : 1;
813 } 836 }
814 return ata66; 837
838 return ata66 ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
815} 839}
816 840
817static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif) 841static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif)
@@ -841,8 +865,8 @@ static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif)
841 if (!chipset_family) 865 if (!chipset_family)
842 return; 866 return;
843 867
844 if (!(hwif->udma_four)) 868 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
845 hwif->udma_four = ata66_sis5513(hwif); 869 hwif->cbl = ata66_sis5513(hwif);
846 870
847 if (chipset_family > ATA_16) { 871 if (chipset_family > ATA_16) {
848 hwif->ide_dma_check = &sis5513_config_xfer_rate; 872 hwif->ide_dma_check = &sis5513_config_xfer_rate;
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index 7c383d9cc472..487879842af4 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -195,7 +195,7 @@ static inline void sl82c105_reset_host(struct pci_dev *dev)
195 * This function is called when the IDE timer expires, the drive 195 * This function is called when the IDE timer expires, the drive
196 * indicates that it is READY, and we were waiting for DMA to complete. 196 * indicates that it is READY, and we were waiting for DMA to complete.
197 */ 197 */
198static int sl82c105_ide_dma_lostirq(ide_drive_t *drive) 198static void sl82c105_dma_lost_irq(ide_drive_t *drive)
199{ 199{
200 ide_hwif_t *hwif = HWIF(drive); 200 ide_hwif_t *hwif = HWIF(drive);
201 struct pci_dev *dev = hwif->pci_dev; 201 struct pci_dev *dev = hwif->pci_dev;
@@ -222,9 +222,6 @@ static int sl82c105_ide_dma_lostirq(ide_drive_t *drive)
222 } 222 }
223 223
224 sl82c105_reset_host(dev); 224 sl82c105_reset_host(dev);
225
226 /* __ide_dma_lostirq would return 1, so we do as well */
227 return 1;
228} 225}
229 226
230/* 227/*
@@ -244,15 +241,12 @@ static void sl82c105_dma_start(ide_drive_t *drive)
244 ide_dma_start(drive); 241 ide_dma_start(drive);
245} 242}
246 243
247static int sl82c105_ide_dma_timeout(ide_drive_t *drive) 244static void sl82c105_dma_timeout(ide_drive_t *drive)
248{ 245{
249 ide_hwif_t *hwif = HWIF(drive); 246 DBG(("sl82c105_dma_timeout(drive:%s)\n", drive->name));
250 struct pci_dev *dev = hwif->pci_dev;
251 247
252 DBG(("sl82c105_ide_dma_timeout(drive:%s)\n", drive->name)); 248 sl82c105_reset_host(HWIF(drive)->pci_dev);
253 249 ide_dma_timeout(drive);
254 sl82c105_reset_host(dev);
255 return __ide_dma_timeout(drive);
256} 250}
257 251
258static int sl82c105_ide_dma_on(ide_drive_t *drive) 252static int sl82c105_ide_dma_on(ide_drive_t *drive)
@@ -441,9 +435,9 @@ static void __devinit init_hwif_sl82c105(ide_hwif_t *hwif)
441 hwif->ide_dma_check = &sl82c105_ide_dma_check; 435 hwif->ide_dma_check = &sl82c105_ide_dma_check;
442 hwif->ide_dma_on = &sl82c105_ide_dma_on; 436 hwif->ide_dma_on = &sl82c105_ide_dma_on;
443 hwif->dma_off_quietly = &sl82c105_dma_off_quietly; 437 hwif->dma_off_quietly = &sl82c105_dma_off_quietly;
444 hwif->ide_dma_lostirq = &sl82c105_ide_dma_lostirq; 438 hwif->dma_lost_irq = &sl82c105_dma_lost_irq;
445 hwif->dma_start = &sl82c105_dma_start; 439 hwif->dma_start = &sl82c105_dma_start;
446 hwif->ide_dma_timeout = &sl82c105_ide_dma_timeout; 440 hwif->dma_timeout = &sl82c105_dma_timeout;
447 441
448 if (!noautodma) 442 if (!noautodma)
449 hwif->autodma = 1; 443 hwif->autodma = 1;
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index c40f291f91e0..575dbbd8b482 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -199,10 +199,9 @@ static void __devinit init_hwif_slc90e66 (ide_hwif_t *hwif)
199 hwif->mwdma_mask = 0x06; 199 hwif->mwdma_mask = 0x06;
200 hwif->swdma_mask = 0x04; 200 hwif->swdma_mask = 0x04;
201 201
202 if (!hwif->udma_four) { 202 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
203 /* bit[0(1)]: 0:80, 1:40 */ 203 /* bit[0(1)]: 0:80, 1:40 */
204 hwif->udma_four = (reg47 & mask) ? 0 : 1; 204 hwif->cbl = (reg47 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
205 }
206 205
207 hwif->ide_dma_check = &slc90e66_config_drive_xfer_rate; 206 hwif->ide_dma_check = &slc90e66_config_drive_xfer_rate;
208 207
diff --git a/drivers/ide/pci/tc86c001.c b/drivers/ide/pci/tc86c001.c
index cee619bb2eaf..8de1f8e22494 100644
--- a/drivers/ide/pci/tc86c001.c
+++ b/drivers/ide/pci/tc86c001.c
@@ -220,13 +220,13 @@ static void __devinit init_hwif_tc86c001(ide_hwif_t *hwif)
220 hwif->ide_dma_check = &tc86c001_config_drive_xfer_rate; 220 hwif->ide_dma_check = &tc86c001_config_drive_xfer_rate;
221 hwif->dma_start = &tc86c001_dma_start; 221 hwif->dma_start = &tc86c001_dma_start;
222 222
223 if (!hwif->udma_four) { 223 if (hwif->cbl != ATA_CBL_PATA40_SHORT) {
224 /* 224 /*
225 * System Control 1 Register bit 13 (PDIAGN): 225 * System Control 1 Register bit 13 (PDIAGN):
226 * 0=80-pin cable, 1=40-pin cable 226 * 0=80-pin cable, 1=40-pin cable
227 */ 227 */
228 scr1 = hwif->INW(sc_base + 0x00); 228 scr1 = hwif->INW(sc_base + 0x00);
229 hwif->udma_four = (scr1 & 0x2000) ? 0 : 1; 229 hwif->cbl = (scr1 & 0x2000) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
230 } 230 }
231 231
232 if (!noautodma) 232 if (!noautodma)
diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c
index a508550c4095..d21dd2e7eeb3 100644
--- a/drivers/ide/pci/via82cxxx.c
+++ b/drivers/ide/pci/via82cxxx.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * 2 *
3 * Version 3.38 3 * Version 3.45
4 * 4 *
5 * VIA IDE driver for Linux. Supported southbridges: 5 * VIA IDE driver for Linux. Supported southbridges:
6 * 6 *
@@ -9,6 +9,7 @@
9 * vt8235, vt8237, vt8237a 9 * vt8235, vt8237, vt8237a
10 * 10 *
11 * Copyright (c) 2000-2002 Vojtech Pavlik 11 * Copyright (c) 2000-2002 Vojtech Pavlik
12 * Copyright (c) 2007 Bartlomiej Zolnierkiewicz
12 * 13 *
13 * Based on the work of: 14 * Based on the work of:
14 * Michel Aubry 15 * Michel Aubry
@@ -33,6 +34,8 @@
33#include <linux/pci.h> 34#include <linux/pci.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/ide.h> 36#include <linux/ide.h>
37#include <linux/dmi.h>
38
36#include <asm/io.h> 39#include <asm/io.h>
37 40
38#ifdef CONFIG_PPC_CHRP 41#ifdef CONFIG_PPC_CHRP
@@ -41,8 +44,6 @@
41 44
42#include "ide-timing.h" 45#include "ide-timing.h"
43 46
44#define DISPLAY_VIA_TIMINGS
45
46#define VIA_IDE_ENABLE 0x40 47#define VIA_IDE_ENABLE 0x40
47#define VIA_IDE_CONFIG 0x41 48#define VIA_IDE_CONFIG 0x41
48#define VIA_FIFO_CONFIG 0x43 49#define VIA_FIFO_CONFIG 0x43
@@ -54,18 +55,12 @@
54#define VIA_ADDRESS_SETUP 0x4c 55#define VIA_ADDRESS_SETUP 0x4c
55#define VIA_UDMA_TIMING 0x50 56#define VIA_UDMA_TIMING 0x50
56 57
57#define VIA_UDMA 0x007 58#define VIA_BAD_PREQ 0x01 /* Crashes if PREQ# till DDACK# set */
58#define VIA_UDMA_NONE 0x000 59#define VIA_BAD_CLK66 0x02 /* 66 MHz clock doesn't work correctly */
59#define VIA_UDMA_33 0x001 60#define VIA_SET_FIFO 0x04 /* Needs to have FIFO split set */
60#define VIA_UDMA_66 0x002 61#define VIA_NO_UNMASK 0x08 /* Doesn't work with IRQ unmasking on */
61#define VIA_UDMA_100 0x003 62#define VIA_BAD_ID 0x10 /* Has wrong vendor ID (0x1107) */
62#define VIA_UDMA_133 0x004 63#define VIA_BAD_AST 0x20 /* Don't touch Address Setup Timing */
63#define VIA_BAD_PREQ 0x010 /* Crashes if PREQ# till DDACK# set */
64#define VIA_BAD_CLK66 0x020 /* 66 MHz clock doesn't work correctly */
65#define VIA_SET_FIFO 0x040 /* Needs to have FIFO split set */
66#define VIA_NO_UNMASK 0x080 /* Doesn't work with IRQ unmasking on */
67#define VIA_BAD_ID 0x100 /* Has wrong vendor ID (0x1107) */
68#define VIA_BAD_AST 0x200 /* Don't touch Address Setup Timing */
69 64
70/* 65/*
71 * VIA SouthBridge chips. 66 * VIA SouthBridge chips.
@@ -76,36 +71,37 @@ static struct via_isa_bridge {
76 u16 id; 71 u16 id;
77 u8 rev_min; 72 u8 rev_min;
78 u8 rev_max; 73 u8 rev_max;
79 u16 flags; 74 u8 udma_mask;
75 u8 flags;
80} via_isa_bridges[] = { 76} via_isa_bridges[] = {
81 { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 77 { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
82 { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 78 { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
83 { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 79 { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
84 { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 80 { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
85 { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 81 { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
86 { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 82 { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
87 { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 83 { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
88 { "vt8233a", PCI_DEVICE_ID_VIA_8233A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, 84 { "vt8233a", PCI_DEVICE_ID_VIA_8233A, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
89 { "vt8233c", PCI_DEVICE_ID_VIA_8233C_0, 0x00, 0x2f, VIA_UDMA_100 }, 85 { "vt8233c", PCI_DEVICE_ID_VIA_8233C_0, 0x00, 0x2f, ATA_UDMA5, },
90 { "vt8233", PCI_DEVICE_ID_VIA_8233_0, 0x00, 0x2f, VIA_UDMA_100 }, 86 { "vt8233", PCI_DEVICE_ID_VIA_8233_0, 0x00, 0x2f, ATA_UDMA5, },
91 { "vt8231", PCI_DEVICE_ID_VIA_8231, 0x00, 0x2f, VIA_UDMA_100 }, 87 { "vt8231", PCI_DEVICE_ID_VIA_8231, 0x00, 0x2f, ATA_UDMA5, },
92 { "vt82c686b", PCI_DEVICE_ID_VIA_82C686, 0x40, 0x4f, VIA_UDMA_100 }, 88 { "vt82c686b", PCI_DEVICE_ID_VIA_82C686, 0x40, 0x4f, ATA_UDMA5, },
93 { "vt82c686a", PCI_DEVICE_ID_VIA_82C686, 0x10, 0x2f, VIA_UDMA_66 }, 89 { "vt82c686a", PCI_DEVICE_ID_VIA_82C686, 0x10, 0x2f, ATA_UDMA4, },
94 { "vt82c686", PCI_DEVICE_ID_VIA_82C686, 0x00, 0x0f, VIA_UDMA_33 | VIA_BAD_CLK66 }, 90 { "vt82c686", PCI_DEVICE_ID_VIA_82C686, 0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 },
95 { "vt82c596b", PCI_DEVICE_ID_VIA_82C596, 0x10, 0x2f, VIA_UDMA_66 }, 91 { "vt82c596b", PCI_DEVICE_ID_VIA_82C596, 0x10, 0x2f, ATA_UDMA4, },
96 { "vt82c596a", PCI_DEVICE_ID_VIA_82C596, 0x00, 0x0f, VIA_UDMA_33 | VIA_BAD_CLK66 }, 92 { "vt82c596a", PCI_DEVICE_ID_VIA_82C596, 0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 },
97 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, VIA_UDMA_33 | VIA_SET_FIFO }, 93 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, ATA_UDMA2, VIA_SET_FIFO },
98 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, VIA_UDMA_33 | VIA_SET_FIFO | VIA_BAD_PREQ }, 94 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, ATA_UDMA2, VIA_SET_FIFO | VIA_BAD_PREQ },
99 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, VIA_UDMA_33 | VIA_SET_FIFO }, 95 { "vt82c586b", PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, ATA_UDMA2, VIA_SET_FIFO },
100 { "vt82c586a", PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, VIA_UDMA_33 | VIA_SET_FIFO }, 96 { "vt82c586a", PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, ATA_UDMA2, VIA_SET_FIFO },
101 { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, VIA_UDMA_NONE | VIA_SET_FIFO }, 97 { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, 0x00, VIA_SET_FIFO },
102 { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK }, 98 { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, 0x00, VIA_SET_FIFO | VIA_NO_UNMASK },
103 { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID }, 99 { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, 0x00, VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID },
104 { NULL } 100 { NULL }
105}; 101};
106 102
107static unsigned int via_clock; 103static unsigned int via_clock;
108static char *via_dma[] = { "MWDMA16", "UDMA33", "UDMA66", "UDMA100", "UDMA133" }; 104static char *via_dma[] = { "16", "25", "33", "44", "66", "100", "133" };
109 105
110struct via82cxxx_dev 106struct via82cxxx_dev
111{ 107{
@@ -140,12 +136,12 @@ static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing)
140 pci_write_config_byte(dev, VIA_DRIVE_TIMING + (3 - dn), 136 pci_write_config_byte(dev, VIA_DRIVE_TIMING + (3 - dn),
141 ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1)); 137 ((FIT(timing->active, 1, 16) - 1) << 4) | (FIT(timing->recover, 1, 16) - 1));
142 138
143 switch (vdev->via_config->flags & VIA_UDMA) { 139 switch (vdev->via_config->udma_mask) {
144 case VIA_UDMA_33: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break; 140 case ATA_UDMA2: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 5) - 2)) : 0x03; break;
145 case VIA_UDMA_66: t = timing->udma ? (0xe8 | (FIT(timing->udma, 2, 9) - 2)) : 0x0f; break; 141 case ATA_UDMA4: t = timing->udma ? (0xe8 | (FIT(timing->udma, 2, 9) - 2)) : 0x0f; break;
146 case VIA_UDMA_100: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; 142 case ATA_UDMA5: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break;
147 case VIA_UDMA_133: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break; 143 case ATA_UDMA6: t = timing->udma ? (0xe0 | (FIT(timing->udma, 2, 9) - 2)) : 0x07; break;
148 default: return; 144 default: return;
149 } 145 }
150 146
151 pci_write_config_byte(dev, VIA_UDMA_TIMING + (3 - dn), t); 147 pci_write_config_byte(dev, VIA_UDMA_TIMING + (3 - dn), t);
@@ -173,12 +169,12 @@ static int via_set_drive(ide_drive_t *drive, u8 speed)
173 169
174 T = 1000000000 / via_clock; 170 T = 1000000000 / via_clock;
175 171
176 switch (vdev->via_config->flags & VIA_UDMA) { 172 switch (vdev->via_config->udma_mask) {
177 case VIA_UDMA_33: UT = T; break; 173 case ATA_UDMA2: UT = T; break;
178 case VIA_UDMA_66: UT = T/2; break; 174 case ATA_UDMA4: UT = T/2; break;
179 case VIA_UDMA_100: UT = T/3; break; 175 case ATA_UDMA5: UT = T/3; break;
180 case VIA_UDMA_133: UT = T/4; break; 176 case ATA_UDMA6: UT = T/4; break;
181 default: UT = T; 177 default: UT = T;
182 } 178 }
183 179
184 ide_timing_compute(drive, speed, &t, T, UT); 180 ide_timing_compute(drive, speed, &t, T, UT);
@@ -208,8 +204,7 @@ static int via_set_drive(ide_drive_t *drive, u8 speed)
208static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio) 204static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio)
209{ 205{
210 if (pio == 255) { 206 if (pio == 255) {
211 via_set_drive(drive, 207 via_set_drive(drive, ide_find_best_pio_mode(drive));
212 ide_find_best_mode(drive, XFER_PIO | XFER_EPIO));
213 return; 208 return;
214 } 209 }
215 210
@@ -226,16 +221,10 @@ static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio)
226 221
227static int via82cxxx_ide_dma_check (ide_drive_t *drive) 222static int via82cxxx_ide_dma_check (ide_drive_t *drive)
228{ 223{
229 ide_hwif_t *hwif = HWIF(drive); 224 u8 speed = ide_max_dma_mode(drive);
230 struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
231 u16 w80 = hwif->udma_four;
232 225
233 u16 speed = ide_find_best_mode(drive, 226 if (speed == 0)
234 XFER_PIO | XFER_EPIO | XFER_SWDMA | XFER_MWDMA | 227 speed = ide_find_best_pio_mode(drive);
235 (vdev->via_config->flags & VIA_UDMA ? XFER_UDMA : 0) |
236 (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_66 ? XFER_UDMA_66 : 0) |
237 (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_100 ? XFER_UDMA_100 : 0) |
238 (w80 && (vdev->via_config->flags & VIA_UDMA) >= VIA_UDMA_133 ? XFER_UDMA_133 : 0));
239 228
240 via_set_drive(drive, speed); 229 via_set_drive(drive, speed);
241 230
@@ -272,8 +261,8 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
272{ 261{
273 int i; 262 int i;
274 263
275 switch (vdev->via_config->flags & VIA_UDMA) { 264 switch (vdev->via_config->udma_mask) {
276 case VIA_UDMA_66: 265 case ATA_UDMA4:
277 for (i = 24; i >= 0; i -= 8) 266 for (i = 24; i >= 0; i -= 8)
278 if (((u >> (i & 16)) & 8) && 267 if (((u >> (i & 16)) & 8) &&
279 ((u >> i) & 0x20) && 268 ((u >> i) & 0x20) &&
@@ -286,7 +275,7 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
286 } 275 }
287 break; 276 break;
288 277
289 case VIA_UDMA_100: 278 case ATA_UDMA5:
290 for (i = 24; i >= 0; i -= 8) 279 for (i = 24; i >= 0; i -= 8)
291 if (((u >> i) & 0x10) || 280 if (((u >> i) & 0x10) ||
292 (((u >> i) & 0x20) && 281 (((u >> i) & 0x20) &&
@@ -298,7 +287,7 @@ static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
298 } 287 }
299 break; 288 break;
300 289
301 case VIA_UDMA_133: 290 case ATA_UDMA6:
302 for (i = 24; i >= 0; i -= 8) 291 for (i = 24; i >= 0; i -= 8)
303 if (((u >> i) & 0x10) || 292 if (((u >> i) & 0x10) ||
304 (((u >> i) & 0x20) && 293 (((u >> i) & 0x20) &&
@@ -353,7 +342,7 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const
353 342
354 via_cable_detect(vdev, u); 343 via_cable_detect(vdev, u);
355 344
356 if ((via_config->flags & VIA_UDMA) == VIA_UDMA_66) { 345 if (via_config->udma_mask == ATA_UDMA4) {
357 /* Enable Clk66 */ 346 /* Enable Clk66 */
358 pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008); 347 pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008);
359 } else if (via_config->flags & VIA_BAD_CLK66) { 348 } else if (via_config->flags & VIA_BAD_CLK66) {
@@ -416,16 +405,54 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const
416 */ 405 */
417 406
418 pci_read_config_byte(isa, PCI_REVISION_ID, &t); 407 pci_read_config_byte(isa, PCI_REVISION_ID, &t);
419 printk(KERN_INFO "VP_IDE: VIA %s (rev %02x) IDE %s " 408 printk(KERN_INFO "VP_IDE: VIA %s (rev %02x) IDE %sDMA%s "
420 "controller on pci%s\n", 409 "controller on pci%s\n",
421 via_config->name, t, 410 via_config->name, t,
422 via_dma[via_config->flags & VIA_UDMA], 411 via_config->udma_mask ? "U" : "MW",
412 via_dma[via_config->udma_mask ?
413 (fls(via_config->udma_mask) - 1) : 0],
423 pci_name(dev)); 414 pci_name(dev));
424 415
425 pci_dev_put(isa); 416 pci_dev_put(isa);
426 return 0; 417 return 0;
427} 418}
428 419
420/*
421 * Cable special cases
422 */
423
424static struct dmi_system_id cable_dmi_table[] = {
425 {
426 .ident = "Acer Ferrari 3400",
427 .matches = {
428 DMI_MATCH(DMI_BOARD_VENDOR, "Acer,Inc."),
429 DMI_MATCH(DMI_BOARD_NAME, "Ferrari 3400"),
430 },
431 },
432 { }
433};
434
435static int via_cable_override(void)
436{
437 /* Systems by DMI */
438 if (dmi_check_system(cable_dmi_table))
439 return 1;
440 return 0;
441}
442
443static u8 __devinit via82cxxx_cable_detect(ide_hwif_t *hwif)
444{
445 struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
446
447 if (via_cable_override())
448 return ATA_CBL_PATA40_SHORT;
449
450 if ((vdev->via_80w >> hwif->channel) & 1)
451 return ATA_CBL_PATA80;
452 else
453 return ATA_CBL_PATA40;
454}
455
429static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif) 456static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif)
430{ 457{
431 struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); 458 struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev);
@@ -454,12 +481,14 @@ static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif)
454 return; 481 return;
455 482
456 hwif->atapi_dma = 1; 483 hwif->atapi_dma = 1;
457 hwif->ultra_mask = 0x7f; 484
485 hwif->ultra_mask = vdev->via_config->udma_mask;
458 hwif->mwdma_mask = 0x07; 486 hwif->mwdma_mask = 0x07;
459 hwif->swdma_mask = 0x07; 487 hwif->swdma_mask = 0x07;
460 488
461 if (!hwif->udma_four) 489 if (hwif->cbl != ATA_CBL_PATA40_SHORT)
462 hwif->udma_four = (vdev->via_80w >> hwif->channel) & 1; 490 hwif->cbl = via82cxxx_cable_detect(hwif);
491
463 hwif->ide_dma_check = &via82cxxx_ide_dma_check; 492 hwif->ide_dma_check = &via82cxxx_ide_dma_check;
464 if (!noautodma) 493 if (!noautodma)
465 hwif->autodma = 1; 494 hwif->autodma = 1;
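
In via82cxxx the old VIA_UDMA_33/66/100/133 level encoding in the flags word is replaced by a real udma_mask bitmask (ATA_UDMA2/4/5/6 expand to 0x07, 0x1f, 0x3f and 0x7f), which can be copied straight into hwif->ultra_mask and also drives the banner printk via fls(). A small sketch of that mask-to-label step, reusing the via_dma[] string table from the hunk (the helper name is invented):

/* Sketch: turn a UDMA bitmask into the highest-mode label for the banner. */
static const char *via_dma_label(u8 udma_mask)
{
	static const char *via_dma[] =
		{ "16", "25", "33", "44", "66", "100", "133" };

	/* fls() returns the highest set bit, 1-based; 0 means MWDMA16 only. */
	return via_dma[udma_mask ? fls(udma_mask) - 1 : 0];
}

The new cable_dmi_table/via_cable_override() pair supplies the ATA_CBL_PATA40_SHORT case for the Acer Ferrari 3400, mirroring the PCI-subsystem quirk tables added to piix and sis5513 above.
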
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index 45fc36f0f219..e46f47206542 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -942,8 +942,8 @@ pmac_ide_tune_chipset (ide_drive_t *drive, byte speed)
942 return 1; 942 return 1;
943 case XFER_UDMA_4: 943 case XFER_UDMA_4:
944 case XFER_UDMA_3: 944 case XFER_UDMA_3:
945 if (HWIF(drive)->udma_four == 0) 945 if (drive->hwif->cbl != ATA_CBL_PATA80)
946 return 1; 946 return 1;
947 case XFER_UDMA_2: 947 case XFER_UDMA_2:
948 case XFER_UDMA_1: 948 case XFER_UDMA_1:
949 case XFER_UDMA_0: 949 case XFER_UDMA_0:
@@ -1244,7 +1244,7 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
1244 hwif->chipset = ide_pmac; 1244 hwif->chipset = ide_pmac;
1245 hwif->noprobe = !hwif->io_ports[IDE_DATA_OFFSET] || pmif->mediabay; 1245 hwif->noprobe = !hwif->io_ports[IDE_DATA_OFFSET] || pmif->mediabay;
1246 hwif->hold = pmif->mediabay; 1246 hwif->hold = pmif->mediabay;
1247 hwif->udma_four = pmif->cable_80; 1247 hwif->cbl = pmif->cable_80 ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
1248 hwif->drives[0].unmask = 1; 1248 hwif->drives[0].unmask = 1;
1249 hwif->drives[1].unmask = 1; 1249 hwif->drives[1].unmask = 1;
1250 hwif->tuneproc = pmac_ide_tuneproc; 1250 hwif->tuneproc = pmac_ide_tuneproc;
@@ -1821,28 +1821,11 @@ pmac_ide_dma_check(ide_drive_t *drive)
1821 enable = 0; 1821 enable = 0;
1822 1822
1823 if (enable) { 1823 if (enable) {
1824 short mode; 1824 u8 mode = ide_max_dma_mode(drive);
1825 1825
1826 map = XFER_MWDMA; 1826 if (mode >= XFER_UDMA_0)
1827 if (pmif->kind == controller_kl_ata4
1828 || pmif->kind == controller_un_ata6
1829 || pmif->kind == controller_k2_ata6
1830 || pmif->kind == controller_sh_ata6) {
1831 map |= XFER_UDMA;
1832 if (pmif->cable_80) {
1833 map |= XFER_UDMA_66;
1834 if (pmif->kind == controller_un_ata6 ||
1835 pmif->kind == controller_k2_ata6 ||
1836 pmif->kind == controller_sh_ata6)
1837 map |= XFER_UDMA_100;
1838 if (pmif->kind == controller_sh_ata6)
1839 map |= XFER_UDMA_133;
1840 }
1841 }
1842 mode = ide_find_best_mode(drive, map);
1843 if (mode & XFER_UDMA)
1844 drive->using_dma = pmac_ide_udma_enable(drive, mode); 1827 drive->using_dma = pmac_ide_udma_enable(drive, mode);
1845 else if (mode & XFER_MWDMA) 1828 else if (mode >= XFER_MW_DMA_0)
1846 drive->using_dma = pmac_ide_mdma_enable(drive, mode); 1829 drive->using_dma = pmac_ide_mdma_enable(drive, mode);
1847 hwif->OUTB(0, IDE_CONTROL_REG); 1830 hwif->OUTB(0, IDE_CONTROL_REG);
1848 /* Apply settings to controller */ 1831 /* Apply settings to controller */
@@ -2004,20 +1987,19 @@ static void pmac_ide_dma_host_on(ide_drive_t *drive)
2004{ 1987{
2005} 1988}
2006 1989
2007static int 1990static void
2008pmac_ide_dma_lostirq (ide_drive_t *drive) 1991pmac_ide_dma_lost_irq (ide_drive_t *drive)
2009{ 1992{
2010 pmac_ide_hwif_t* pmif = (pmac_ide_hwif_t *)HWIF(drive)->hwif_data; 1993 pmac_ide_hwif_t* pmif = (pmac_ide_hwif_t *)HWIF(drive)->hwif_data;
2011 volatile struct dbdma_regs __iomem *dma; 1994 volatile struct dbdma_regs __iomem *dma;
2012 unsigned long status; 1995 unsigned long status;
2013 1996
2014 if (pmif == NULL) 1997 if (pmif == NULL)
2015 return 0; 1998 return;
2016 dma = pmif->dma_regs; 1999 dma = pmif->dma_regs;
2017 2000
2018 status = readl(&dma->status); 2001 status = readl(&dma->status);
2019 printk(KERN_ERR "ide-pmac lost interrupt, dma status: %lx\n", status); 2002 printk(KERN_ERR "ide-pmac lost interrupt, dma status: %lx\n", status);
2020 return 0;
2021} 2003}
2022 2004
2023/* 2005/*
@@ -2057,8 +2039,8 @@ pmac_ide_setup_dma(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
2057 hwif->ide_dma_test_irq = &pmac_ide_dma_test_irq; 2039 hwif->ide_dma_test_irq = &pmac_ide_dma_test_irq;
2058 hwif->dma_host_off = &pmac_ide_dma_host_off; 2040 hwif->dma_host_off = &pmac_ide_dma_host_off;
2059 hwif->dma_host_on = &pmac_ide_dma_host_on; 2041 hwif->dma_host_on = &pmac_ide_dma_host_on;
2060 hwif->ide_dma_timeout = &__ide_dma_timeout; 2042 hwif->dma_timeout = &ide_dma_timeout;
2061 hwif->ide_dma_lostirq = &pmac_ide_dma_lostirq; 2043 hwif->dma_lost_irq = &pmac_ide_dma_lost_irq;
2062 2044
2063 hwif->atapi_dma = 1; 2045 hwif->atapi_dma = 1;
2064 switch(pmif->kind) { 2046 switch(pmif->kind) {
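
pmac_ide_dma_check() no longer builds an XFER_* capability map by hand; ide_max_dma_mode() already takes the host's DMA masks (and, for UDMA, the detected cable type) into account and returns the best transfer mode, or 0 when DMA is not usable. The selection then reduces to two range checks on the returned mode. A sketch of that pattern, with hypothetical enable helpers standing in for the pmac-specific ones:

/* Sketch: pick UDMA vs MWDMA vs nothing from ide_max_dma_mode(). */
static int mychip_dma_check(ide_drive_t *drive)
{
	u8 mode = ide_max_dma_mode(drive);	/* 0 if no DMA mode usable */

	if (mode >= XFER_UDMA_0)
		return mychip_udma_enable(drive, mode);	/* hypothetical */
	if (mode >= XFER_MW_DMA_0)
		return mychip_mdma_enable(drive, mode);	/* hypothetical */
	return 0;					/* stay on PIO */
}
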
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
index 79494c4f2b10..fa92f7f1d0d0 100644
--- a/fs/jfs/endian24.h
+++ b/fs/jfs/endian24.h
@@ -29,7 +29,7 @@
29 __u32 __x = (x); \ 29 __u32 __x = (x); \
30 ((__u32)( \ 30 ((__u32)( \
31 ((__x & (__u32)0x000000ffUL) << 16) | \ 31 ((__x & (__u32)0x000000ffUL) << 16) | \
32 (__x & (__u32)0x0000ff00UL) | \ 32 (__x & (__u32)0x0000ff00UL) | \
33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \ 33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \
34}) 34})
35 35
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 9c5d59632aac..887f5759e536 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -26,34 +26,6 @@
26#include "jfs_filsys.h" 26#include "jfs_filsys.h"
27#include "jfs_debug.h" 27#include "jfs_debug.h"
28 28
29#ifdef CONFIG_JFS_DEBUG
30void dump_mem(char *label, void *data, int length)
31{
32 int i, j;
33 int *intptr = data;
34 char *charptr = data;
35 char buf[10], line[80];
36
37 printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length,
38 data);
39 for (i = 0; i < length; i += 16) {
40 line[0] = 0;
41 for (j = 0; (j < 4) && (i + j * 4 < length); j++) {
42 sprintf(buf, " %08x", intptr[i / 4 + j]);
43 strcat(line, buf);
44 }
45 buf[0] = ' ';
46 buf[2] = 0;
47 for (j = 0; (j < 16) && (i + j < length); j++) {
48 buf[1] =
49 isprint(charptr[i + j]) ? charptr[i + j] : '.';
50 strcat(line, buf);
51 }
52 printk("%s\n", line);
53 }
54}
55#endif
56
57#ifdef PROC_FS_JFS /* see jfs_debug.h */ 29#ifdef PROC_FS_JFS /* see jfs_debug.h */
58 30
59static struct proc_dir_entry *base; 31static struct proc_dir_entry *base;
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 7378798f0b21..044c1e654cc0 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,6 @@ extern void jfs_proc_clean(void);
62 62
63extern int jfsloglevel; 63extern int jfsloglevel;
64 64
65extern void dump_mem(char *label, void *data, int length);
66extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); 65extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
67 66
68/* information message: e.g., configuration, major event */ 67/* information message: e.g., configuration, major event */
@@ -94,7 +93,6 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
94 * --------- 93 * ---------
95 */ 94 */
96#else /* CONFIG_JFS_DEBUG */ 95#else /* CONFIG_JFS_DEBUG */
97#define dump_mem(label,data,length) do {} while (0)
98#define ASSERT(p) do {} while (0) 96#define ASSERT(p) do {} while (0)
99#define jfs_info(fmt, arg...) do {} while (0) 97#define jfs_info(fmt, arg...) do {} while (0)
100#define jfs_debug(fmt, arg...) do {} while (0) 98#define jfs_debug(fmt, arg...) do {} while (0)
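
The private dump_mem() hex dumper and its header stub are dropped from JFS. The kernel's generic print_hex_dump() helper (lib/hexdump.c, assumed available in a tree of this vintage) covers the same need, so any remaining debugging call site could be expressed along these lines; this is a sketch, not code from the patch:

#include <linux/kernel.h>

/* Sketch: dump 'length' bytes at 'data', much like the removed dump_mem(). */
static inline void jfs_dump(const char *label, const void *data, int length)
{
	printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n",
	       label, length, data);
	/* 16 bytes per row, grouped as 4-byte words, with ASCII column */
	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16, 4,
		       data, length, true);
}
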
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 40b20111383c..c387540d3425 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -19,23 +19,23 @@
19#define _H_JFS_DINODE 19#define _H_JFS_DINODE
20 20
21/* 21/*
22 * jfs_dinode.h: on-disk inode manager 22 * jfs_dinode.h: on-disk inode manager
23 */ 23 */
24 24
25#define INODESLOTSIZE 128 25#define INODESLOTSIZE 128
26#define L2INODESLOTSIZE 7 26#define L2INODESLOTSIZE 7
27#define log2INODESIZE 9 /* log2(bytes per dinode) */ 27#define log2INODESIZE 9 /* log2(bytes per dinode) */
28 28
29 29
30/* 30/*
31 * on-disk inode : 512 bytes 31 * on-disk inode : 512 bytes
32 * 32 *
33 * note: align 64-bit fields on 8-byte boundary. 33 * note: align 64-bit fields on 8-byte boundary.
34 */ 34 */
35struct dinode { 35struct dinode {
36 /* 36 /*
37 * I. base area (128 bytes) 37 * I. base area (128 bytes)
38 * ------------------------ 38 * ------------------------
39 * 39 *
40 * define generic/POSIX attributes 40 * define generic/POSIX attributes
41 */ 41 */
@@ -70,16 +70,16 @@ struct dinode {
70 __le32 di_acltype; /* 4: Type of ACL */ 70 __le32 di_acltype; /* 4: Type of ACL */
71 71
72 /* 72 /*
73 * Extension Areas. 73 * Extension Areas.
74 * 74 *
75 * Historically, the inode was partitioned into 4 128-byte areas, 75 * Historically, the inode was partitioned into 4 128-byte areas,
76 * the last 3 being defined as unions which could have multiple 76 * the last 3 being defined as unions which could have multiple
77 * uses. The first 96 bytes had been completely unused until 77 * uses. The first 96 bytes had been completely unused until
78 * an index table was added to the directory. It is now more 78 * an index table was added to the directory. It is now more
79 * useful to describe the last 3/4 of the inode as a single 79 * useful to describe the last 3/4 of the inode as a single
80 * union. We would probably be better off redesigning the 80 * union. We would probably be better off redesigning the
81 * entire structure from scratch, but we don't want to break 81 * entire structure from scratch, but we don't want to break
82 * commonality with OS/2's JFS at this time. 82 * commonality with OS/2's JFS at this time.
83 */ 83 */
84 union { 84 union {
85 struct { 85 struct {
@@ -95,7 +95,7 @@ struct dinode {
95 } _dir; /* (384) */ 95 } _dir; /* (384) */
96#define di_dirtable u._dir._table 96#define di_dirtable u._dir._table
97#define di_dtroot u._dir._dtroot 97#define di_dtroot u._dir._dtroot
98#define di_parent di_dtroot.header.idotdot 98#define di_parent di_dtroot.header.idotdot
99#define di_DASD di_dtroot.header.DASD 99#define di_DASD di_dtroot.header.DASD
100 100
101 struct { 101 struct {
@@ -127,14 +127,14 @@ struct dinode {
127#define di_inlinedata u._file._u2._special._u 127#define di_inlinedata u._file._u2._special._u
128#define di_rdev u._file._u2._special._u._rdev 128#define di_rdev u._file._u2._special._u._rdev
129#define di_fastsymlink u._file._u2._special._u._fastsymlink 129#define di_fastsymlink u._file._u2._special._u._fastsymlink
130#define di_inlineea u._file._u2._special._inlineea 130#define di_inlineea u._file._u2._special._inlineea
131 } u; 131 } u;
132}; 132};
133 133
134/* extended mode bits (on-disk inode di_mode) */ 134/* extended mode bits (on-disk inode di_mode) */
135#define IFJOURNAL 0x00010000 /* journalled file */ 135#define IFJOURNAL 0x00010000 /* journalled file */
136#define ISPARSE 0x00020000 /* sparse file enabled */ 136#define ISPARSE 0x00020000 /* sparse file enabled */
137#define INLINEEA 0x00040000 /* inline EA area free */ 137#define INLINEEA 0x00040000 /* inline EA area free */
138#define ISWAPFILE 0x00800000 /* file open for pager swap space */ 138#define ISWAPFILE 0x00800000 /* file open for pager swap space */
139 139
140/* more extended mode bits: attributes for OS/2 */ 140/* more extended mode bits: attributes for OS/2 */
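
Since the extended bits above start at 0x00010000, they sit above the low sixteen bits of di_mode and compose with ordinary bitwise tests; a small standalone illustration (the 0100644 POSIX mode is an assumed example value, and only flag values quoted in this hunk are used):

#include <stdio.h>
#include <stdint.h>

/* Illustration only: combine and test the extended JFS mode bits
 * listed above.  The low bits carry an ordinary POSIX mode here.
 */
#define IFJOURNAL       0x00010000      /* journalled file */
#define ISPARSE         0x00020000      /* sparse file enabled */
#define ISWAPFILE       0x00800000      /* file open for pager swap space */

int main(void)
{
        uint32_t di_mode = 0100644 | IFJOURNAL | ISPARSE;

        printf("sparse=%d swap=%d\n",
               (di_mode & ISPARSE) != 0,        /* 1 */
               (di_mode & ISWAPFILE) != 0);     /* 0 */
        return 0;
}
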
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index f3b1ebb22280..e1985066b1c6 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -154,12 +154,12 @@ static const s8 budtab[256] = {
154 * the in-core descriptor is initialized from disk. 154 * the in-core descriptor is initialized from disk.
155 * 155 *
156 * PARAMETERS: 156 * PARAMETERS:
157 * ipbmap - pointer to in-core inode for the block map. 157 * ipbmap - pointer to in-core inode for the block map.
158 * 158 *
159 * RETURN VALUES: 159 * RETURN VALUES:
160 * 0 - success 160 * 0 - success
161 * -ENOMEM - insufficient memory 161 * -ENOMEM - insufficient memory
162 * -EIO - i/o error 162 * -EIO - i/o error
163 */ 163 */
164int dbMount(struct inode *ipbmap) 164int dbMount(struct inode *ipbmap)
165{ 165{
@@ -232,11 +232,11 @@ int dbMount(struct inode *ipbmap)
232 * the memory for this descriptor is freed. 232 * the memory for this descriptor is freed.
233 * 233 *
234 * PARAMETERS: 234 * PARAMETERS:
235 * ipbmap - pointer to in-core inode for the block map. 235 * ipbmap - pointer to in-core inode for the block map.
236 * 236 *
237 * RETURN VALUES: 237 * RETURN VALUES:
238 * 0 - success 238 * 0 - success
239 * -EIO - i/o error 239 * -EIO - i/o error
240 */ 240 */
241int dbUnmount(struct inode *ipbmap, int mounterror) 241int dbUnmount(struct inode *ipbmap, int mounterror)
242{ 242{
@@ -320,13 +320,13 @@ int dbSync(struct inode *ipbmap)
320 * at a time. 320 * at a time.
321 * 321 *
322 * PARAMETERS: 322 * PARAMETERS:
323 * ip - pointer to in-core inode; 323 * ip - pointer to in-core inode;
324 * blkno - starting block number to be freed. 324 * blkno - starting block number to be freed.
325 * nblocks - number of blocks to be freed. 325 * nblocks - number of blocks to be freed.
326 * 326 *
327 * RETURN VALUES: 327 * RETURN VALUES:
328 * 0 - success 328 * 0 - success
329 * -EIO - i/o error 329 * -EIO - i/o error
330 */ 330 */
331int dbFree(struct inode *ip, s64 blkno, s64 nblocks) 331int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
332{ 332{
@@ -395,23 +395,23 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
395/* 395/*
396 * NAME: dbUpdatePMap() 396 * NAME: dbUpdatePMap()
397 * 397 *
398 * FUNCTION: update the allocation state (free or allocate) of the 398 * FUNCTION: update the allocation state (free or allocate) of the
399 * specified block range in the persistent block allocation map. 399 * specified block range in the persistent block allocation map.
400 * 400 *
401 * the blocks will be updated in the persistent map one 401 * the blocks will be updated in the persistent map one
402 * dmap at a time. 402 * dmap at a time.
403 * 403 *
404 * PARAMETERS: 404 * PARAMETERS:
405 * ipbmap - pointer to in-core inode for the block map. 405 * ipbmap - pointer to in-core inode for the block map.
406 * free - 'true' if block range is to be freed from the persistent 406 * free - 'true' if block range is to be freed from the persistent
407 * map; 'false' if it is to be allocated. 407 * map; 'false' if it is to be allocated.
408 * blkno - starting block number of the range. 408 * blkno - starting block number of the range.
409 * nblocks - number of contiguous blocks in the range. 409 * nblocks - number of contiguous blocks in the range.
410 * tblk - transaction block; 410 * tblk - transaction block;
411 * 411 *
412 * RETURN VALUES: 412 * RETURN VALUES:
413 * 0 - success 413 * 0 - success
414 * -EIO - i/o error 414 * -EIO - i/o error
415 */ 415 */
416int 416int
417dbUpdatePMap(struct inode *ipbmap, 417dbUpdatePMap(struct inode *ipbmap,
@@ -573,7 +573,7 @@ dbUpdatePMap(struct inode *ipbmap,
573/* 573/*
574 * NAME: dbNextAG() 574 * NAME: dbNextAG()
575 * 575 *
576 * FUNCTION: find the preferred allocation group for new allocations. 576 * FUNCTION: find the preferred allocation group for new allocations.
577 * 577 *
578 * Within the allocation groups, we maintain a preferred 578 * Within the allocation groups, we maintain a preferred
579 * allocation group which consists of a group with at least 579 * allocation group which consists of a group with at least
@@ -589,10 +589,10 @@ dbUpdatePMap(struct inode *ipbmap,
589 * empty ags around for large allocations. 589 * empty ags around for large allocations.
590 * 590 *
591 * PARAMETERS: 591 * PARAMETERS:
592 * ipbmap - pointer to in-core inode for the block map. 592 * ipbmap - pointer to in-core inode for the block map.
593 * 593 *
594 * RETURN VALUES: 594 * RETURN VALUES:
595 * the preferred allocation group number. 595 * the preferred allocation group number.
596 */ 596 */
597int dbNextAG(struct inode *ipbmap) 597int dbNextAG(struct inode *ipbmap)
598{ 598{
@@ -656,7 +656,7 @@ unlock:
656/* 656/*
657 * NAME: dbAlloc() 657 * NAME: dbAlloc()
658 * 658 *
659 * FUNCTION: attempt to allocate a specified number of contiguous free 659 * FUNCTION: attempt to allocate a specified number of contiguous free
660 * blocks from the working allocation block map. 660 * blocks from the working allocation block map.
661 * 661 *
662 * the block allocation policy uses hints and a multi-step 662 * the block allocation policy uses hints and a multi-step
@@ -680,16 +680,16 @@ unlock:
680 * size or requests that specify no hint value. 680 * size or requests that specify no hint value.
681 * 681 *
682 * PARAMETERS: 682 * PARAMETERS:
683 * ip - pointer to in-core inode; 683 * ip - pointer to in-core inode;
684 * hint - allocation hint. 684 * hint - allocation hint.
685 * nblocks - number of contiguous blocks in the range. 685 * nblocks - number of contiguous blocks in the range.
686 * results - on successful return, set to the starting block number 686 * results - on successful return, set to the starting block number
687 * of the newly allocated contiguous range. 687 * of the newly allocated contiguous range.
688 * 688 *
689 * RETURN VALUES: 689 * RETURN VALUES:
690 * 0 - success 690 * 0 - success
691 * -ENOSPC - insufficient disk resources 691 * -ENOSPC - insufficient disk resources
692 * -EIO - i/o error 692 * -EIO - i/o error
693 */ 693 */
694int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) 694int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
695{ 695{
@@ -706,12 +706,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
706 /* assert that nblocks is valid */ 706 /* assert that nblocks is valid */
707 assert(nblocks > 0); 707 assert(nblocks > 0);
708 708
709#ifdef _STILL_TO_PORT
710 /* DASD limit check F226941 */
711 if (OVER_LIMIT(ip, nblocks))
712 return -ENOSPC;
713#endif /* _STILL_TO_PORT */
714
715 /* get the log2 number of blocks to be allocated. 709 /* get the log2 number of blocks to be allocated.
716 * if the number of blocks is not a log2 multiple, 710 * if the number of blocks is not a log2 multiple,
717 * it will be rounded up to the next log2 multiple. 711 * it will be rounded up to the next log2 multiple.
@@ -720,7 +714,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
720 714
721 bmp = JFS_SBI(ip->i_sb)->bmap; 715 bmp = JFS_SBI(ip->i_sb)->bmap;
722 716
723//retry: /* serialize w.r.t.extendfs() */
724 mapSize = bmp->db_mapsize; 717 mapSize = bmp->db_mapsize;
725 718
726 /* the hint should be within the map */ 719 /* the hint should be within the map */
@@ -879,17 +872,17 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
879/* 872/*
880 * NAME: dbAllocExact() 873 * NAME: dbAllocExact()
881 * 874 *
882 * FUNCTION: try to allocate the requested extent; 875 * FUNCTION: try to allocate the requested extent;
883 * 876 *
884 * PARAMETERS: 877 * PARAMETERS:
885 * ip - pointer to in-core inode; 878 * ip - pointer to in-core inode;
886 * blkno - extent address; 879 * blkno - extent address;
887 * nblocks - extent length; 880 * nblocks - extent length;
888 * 881 *
889 * RETURN VALUES: 882 * RETURN VALUES:
890 * 0 - success 883 * 0 - success
891 * -ENOSPC - insufficient disk resources 884 * -ENOSPC - insufficient disk resources
892 * -EIO - i/o error 885 * -EIO - i/o error
893 */ 886 */
894int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) 887int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
895{ 888{
@@ -946,7 +939,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
946/* 939/*
947 * NAME: dbReAlloc() 940 * NAME: dbReAlloc()
948 * 941 *
949 * FUNCTION: attempt to extend a current allocation by a specified 942 * FUNCTION: attempt to extend a current allocation by a specified
950 * number of blocks. 943 * number of blocks.
951 * 944 *
952 * this routine attempts to satisfy the allocation request 945 * this routine attempts to satisfy the allocation request
@@ -959,21 +952,21 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
959 * number of blocks required. 952 * number of blocks required.
960 * 953 *
961 * PARAMETERS: 954 * PARAMETERS:
962 * ip - pointer to in-core inode requiring allocation. 955 * ip - pointer to in-core inode requiring allocation.
963 * blkno - starting block of the current allocation. 956 * blkno - starting block of the current allocation.
964 * nblocks - number of contiguous blocks within the current 957 * nblocks - number of contiguous blocks within the current
965 * allocation. 958 * allocation.
966 * addnblocks - number of blocks to add to the allocation. 959 * addnblocks - number of blocks to add to the allocation.
967 * results - on successful return, set to the starting block number 960 * results - on successful return, set to the starting block number
968 * of the existing allocation if the existing allocation 961 * of the existing allocation if the existing allocation
969 * was extended in place or to a newly allocated contiguous 962 * was extended in place or to a newly allocated contiguous
970 * range if the existing allocation could not be extended 963 * range if the existing allocation could not be extended
971 * in place. 964 * in place.
972 * 965 *
973 * RETURN VALUES: 966 * RETURN VALUES:
974 * 0 - success 967 * 0 - success
975 * -ENOSPC - insufficient disk resources 968 * -ENOSPC - insufficient disk resources
976 * -EIO - i/o error 969 * -EIO - i/o error
977 */ 970 */
978int 971int
979dbReAlloc(struct inode *ip, 972dbReAlloc(struct inode *ip,
@@ -1004,7 +997,7 @@ dbReAlloc(struct inode *ip,
1004/* 997/*
1005 * NAME: dbExtend() 998 * NAME: dbExtend()
1006 * 999 *
1007 * FUNCTION: attempt to extend a current allocation by a specified 1000 * FUNCTION: attempt to extend a current allocation by a specified
1008 * number of blocks. 1001 * number of blocks.
1009 * 1002 *
1010 * this routine attempts to satisfy the allocation request 1003 * this routine attempts to satisfy the allocation request
@@ -1013,16 +1006,16 @@ dbReAlloc(struct inode *ip,
1013 * immediately following the current allocation. 1006 * immediately following the current allocation.
1014 * 1007 *
1015 * PARAMETERS: 1008 * PARAMETERS:
1016 * ip - pointer to in-core inode requiring allocation. 1009 * ip - pointer to in-core inode requiring allocation.
1017 * blkno - starting block of the current allocation. 1010 * blkno - starting block of the current allocation.
1018 * nblocks - number of contiguous blocks within the current 1011 * nblocks - number of contiguous blocks within the current
1019 * allocation. 1012 * allocation.
1020 * addnblocks - number of blocks to add to the allocation. 1013 * addnblocks - number of blocks to add to the allocation.
1021 * 1014 *
1022 * RETURN VALUES: 1015 * RETURN VALUES:
1023 * 0 - success 1016 * 0 - success
1024 * -ENOSPC - insufficient disk resources 1017 * -ENOSPC - insufficient disk resources
1025 * -EIO - i/o error 1018 * -EIO - i/o error
1026 */ 1019 */
1027static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) 1020static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1028{ 1021{
@@ -1109,19 +1102,19 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1109/* 1102/*
1110 * NAME: dbAllocNext() 1103 * NAME: dbAllocNext()
1111 * 1104 *
1112 * FUNCTION: attempt to allocate the blocks of the specified block 1105 * FUNCTION: attempt to allocate the blocks of the specified block
1113 * range within a dmap. 1106 * range within a dmap.
1114 * 1107 *
1115 * PARAMETERS: 1108 * PARAMETERS:
1116 * bmp - pointer to bmap descriptor 1109 * bmp - pointer to bmap descriptor
1117 * dp - pointer to dmap. 1110 * dp - pointer to dmap.
1118 * blkno - starting block number of the range. 1111 * blkno - starting block number of the range.
1119 * nblocks - number of contiguous free blocks of the range. 1112 * nblocks - number of contiguous free blocks of the range.
1120 * 1113 *
1121 * RETURN VALUES: 1114 * RETURN VALUES:
1122 * 0 - success 1115 * 0 - success
1123 * -ENOSPC - insufficient disk resources 1116 * -ENOSPC - insufficient disk resources
1124 * -EIO - i/o error 1117 * -EIO - i/o error
1125 * 1118 *
1126 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1119 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1127 */ 1120 */
@@ -1233,7 +1226,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1233/* 1226/*
1234 * NAME: dbAllocNear() 1227 * NAME: dbAllocNear()
1235 * 1228 *
1236 * FUNCTION: attempt to allocate a number of contiguous free blocks near 1229 * FUNCTION: attempt to allocate a number of contiguous free blocks near
1237 * a specified block (hint) within a dmap. 1230 * a specified block (hint) within a dmap.
1238 * 1231 *
1239 * starting with the dmap leaf that covers the hint, we'll 1232 * starting with the dmap leaf that covers the hint, we'll
@@ -1242,18 +1235,18 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1242 * the desired free space. 1235 * the desired free space.
1243 * 1236 *
1244 * PARAMETERS: 1237 * PARAMETERS:
1245 * bmp - pointer to bmap descriptor 1238 * bmp - pointer to bmap descriptor
1246 * dp - pointer to dmap. 1239 * dp - pointer to dmap.
1247 * blkno - block number to allocate near. 1240 * blkno - block number to allocate near.
1248 * nblocks - actual number of contiguous free blocks desired. 1241 * nblocks - actual number of contiguous free blocks desired.
1249 * l2nb - log2 number of contiguous free blocks desired. 1242 * l2nb - log2 number of contiguous free blocks desired.
1250 * results - on successful return, set to the starting block number 1243 * results - on successful return, set to the starting block number
1251 * of the newly allocated range. 1244 * of the newly allocated range.
1252 * 1245 *
1253 * RETURN VALUES: 1246 * RETURN VALUES:
1254 * 0 - success 1247 * 0 - success
1255 * -ENOSPC - insufficient disk resources 1248 * -ENOSPC - insufficient disk resources
1256 * -EIO - i/o error 1249 * -EIO - i/o error
1257 * 1250 *
1258 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1251 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1259 */ 1252 */
@@ -1316,7 +1309,7 @@ dbAllocNear(struct bmap * bmp,
1316/* 1309/*
1317 * NAME: dbAllocAG() 1310 * NAME: dbAllocAG()
1318 * 1311 *
1319 * FUNCTION: attempt to allocate the specified number of contiguous 1312 * FUNCTION: attempt to allocate the specified number of contiguous
1320 * free blocks within the specified allocation group. 1313 * free blocks within the specified allocation group.
1321 * 1314 *
1322 * unless the allocation group size is equal to the number 1315 * unless the allocation group size is equal to the number
@@ -1353,17 +1346,17 @@ dbAllocNear(struct bmap * bmp,
1353 * the allocation group. 1346 * the allocation group.
1354 * 1347 *
1355 * PARAMETERS: 1348 * PARAMETERS:
1356 * bmp - pointer to bmap descriptor 1349 * bmp - pointer to bmap descriptor
1357 * agno - allocation group number. 1350 * agno - allocation group number.
1358 * nblocks - actual number of contiguous free blocks desired. 1351 * nblocks - actual number of contiguous free blocks desired.
1359 * l2nb - log2 number of contiguous free blocks desired. 1352 * l2nb - log2 number of contiguous free blocks desired.
1360 * results - on successful return, set to the starting block number 1353 * results - on successful return, set to the starting block number
1361 * of the newly allocated range. 1354 * of the newly allocated range.
1362 * 1355 *
1363 * RETURN VALUES: 1356 * RETURN VALUES:
1364 * 0 - success 1357 * 0 - success
1365 * -ENOSPC - insufficient disk resources 1358 * -ENOSPC - insufficient disk resources
1366 * -EIO - i/o error 1359 * -EIO - i/o error
1367 * 1360 *
1368 * note: IWRITE_LOCK(ipmap) held on entry/exit; 1361 * note: IWRITE_LOCK(ipmap) held on entry/exit;
1369 */ 1362 */
@@ -1546,7 +1539,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1546/* 1539/*
1547 * NAME: dbAllocAny() 1540 * NAME: dbAllocAny()
1548 * 1541 *
1549 * FUNCTION: attempt to allocate the specified number of contiguous 1542 * FUNCTION: attempt to allocate the specified number of contiguous
1550 * free blocks anywhere in the file system. 1543 * free blocks anywhere in the file system.
1551 * 1544 *
1552 * dbAllocAny() attempts to find the sufficient free space by 1545 * dbAllocAny() attempts to find the sufficient free space by
@@ -1556,16 +1549,16 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1556 * desired free space is allocated. 1549 * desired free space is allocated.
1557 * 1550 *
1558 * PARAMETERS: 1551 * PARAMETERS:
1559 * bmp - pointer to bmap descriptor 1552 * bmp - pointer to bmap descriptor
1560 * nblocks - actual number of contiguous free blocks desired. 1553 * nblocks - actual number of contiguous free blocks desired.
1561 * l2nb - log2 number of contiguous free blocks desired. 1554 * l2nb - log2 number of contiguous free blocks desired.
1562 * results - on successful return, set to the starting block number 1555 * results - on successful return, set to the starting block number
1563 * of the newly allocated range. 1556 * of the newly allocated range.
1564 * 1557 *
1565 * RETURN VALUES: 1558 * RETURN VALUES:
1566 * 0 - success 1559 * 0 - success
1567 * -ENOSPC - insufficient disk resources 1560 * -ENOSPC - insufficient disk resources
1568 * -EIO - i/o error 1561 * -EIO - i/o error
1569 * 1562 *
1570 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1563 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1571 */ 1564 */
@@ -1598,9 +1591,9 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1598/* 1591/*
1599 * NAME: dbFindCtl() 1592 * NAME: dbFindCtl()
1600 * 1593 *
1601 * FUNCTION: starting at a specified dmap control page level and block 1594 * FUNCTION: starting at a specified dmap control page level and block
1602 * number, search down the dmap control levels for a range of 1595 * number, search down the dmap control levels for a range of
1603 * contiguous free blocks large enough to satisfy an allocation 1596 * contiguous free blocks large enough to satisfy an allocation
1604 * request for the specified number of free blocks. 1597 * request for the specified number of free blocks.
1605 * 1598 *
1606 * if sufficient contiguous free blocks are found, this routine 1599 * if sufficient contiguous free blocks are found, this routine
@@ -1609,17 +1602,17 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1609 * is sufficient in size. 1602 * is sufficient in size.
1610 * 1603 *
1611 * PARAMETERS: 1604 * PARAMETERS:
1612 * bmp - pointer to bmap descriptor 1605 * bmp - pointer to bmap descriptor
1613 * level - starting dmap control page level. 1606 * level - starting dmap control page level.
1614 * l2nb - log2 number of contiguous free blocks desired. 1607 * l2nb - log2 number of contiguous free blocks desired.
1615 * *blkno - on entry, starting block number for conducting the search. 1608 * *blkno - on entry, starting block number for conducting the search.
1616 * on successful return, the first block within a dmap page 1609 * on successful return, the first block within a dmap page
1617 * that contains or starts a range of contiguous free blocks. 1610 * that contains or starts a range of contiguous free blocks.
1618 * 1611 *
1619 * RETURN VALUES: 1612 * RETURN VALUES:
1620 * 0 - success 1613 * 0 - success
1621 * -ENOSPC - insufficient disk resources 1614 * -ENOSPC - insufficient disk resources
1622 * -EIO - i/o error 1615 * -EIO - i/o error
1623 * 1616 *
1624 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1617 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1625 */ 1618 */
@@ -1699,7 +1692,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1699/* 1692/*
1700 * NAME: dbAllocCtl() 1693 * NAME: dbAllocCtl()
1701 * 1694 *
1702 * FUNCTION: attempt to allocate a specified number of contiguous 1695 * FUNCTION: attempt to allocate a specified number of contiguous
1703 * blocks starting within a specific dmap. 1696 * blocks starting within a specific dmap.
1704 * 1697 *
1705 * this routine is called by higher level routines that search 1698 * this routine is called by higher level routines that search
@@ -1726,18 +1719,18 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1726 * first dmap (i.e. blkno). 1719 * first dmap (i.e. blkno).
1727 * 1720 *
1728 * PARAMETERS: 1721 * PARAMETERS:
1729 * bmp - pointer to bmap descriptor 1722 * bmp - pointer to bmap descriptor
1730 * nblocks - actual number of contiguous free blocks to allocate. 1723 * nblocks - actual number of contiguous free blocks to allocate.
1731 * l2nb - log2 number of contiguous free blocks to allocate. 1724 * l2nb - log2 number of contiguous free blocks to allocate.
1732 * blkno - starting block number of the dmap to start the allocation 1725 * blkno - starting block number of the dmap to start the allocation
1733 * from. 1726 * from.
1734 * results - on successful return, set to the starting block number 1727 * results - on successful return, set to the starting block number
1735 * of the newly allocated range. 1728 * of the newly allocated range.
1736 * 1729 *
1737 * RETURN VALUES: 1730 * RETURN VALUES:
1738 * 0 - success 1731 * 0 - success
1739 * -ENOSPC - insufficient disk resources 1732 * -ENOSPC - insufficient disk resources
1740 * -EIO - i/o error 1733 * -EIO - i/o error
1741 * 1734 *
1742 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1735 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1743 */ 1736 */
@@ -1870,7 +1863,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1870/* 1863/*
1871 * NAME: dbAllocDmapLev() 1864 * NAME: dbAllocDmapLev()
1872 * 1865 *
1873 * FUNCTION: attempt to allocate a specified number of contiguous blocks 1866 * FUNCTION: attempt to allocate a specified number of contiguous blocks
1874 * from a specified dmap. 1867 * from a specified dmap.
1875 * 1868 *
1876 * this routine checks if the contiguous blocks are available. 1869 * this routine checks if the contiguous blocks are available.
@@ -1878,17 +1871,17 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1878 * returned. 1871 * returned.
1879 * 1872 *
1880 * PARAMETERS: 1873 * PARAMETERS:
1881 * mp - pointer to bmap descriptor 1874 * mp - pointer to bmap descriptor
1882 * dp - pointer to dmap to attempt to allocate blocks from. 1875 * dp - pointer to dmap to attempt to allocate blocks from.
1883 * l2nb - log2 number of contiguous block desired. 1876 * l2nb - log2 number of contiguous block desired.
1884 * nblocks - actual number of contiguous block desired. 1877 * nblocks - actual number of contiguous block desired.
1885 * results - on successful return, set to the starting block number 1878 * results - on successful return, set to the starting block number
1886 * of the newly allocated range. 1879 * of the newly allocated range.
1887 * 1880 *
1888 * RETURN VALUES: 1881 * RETURN VALUES:
1889 * 0 - success 1882 * 0 - success
1890 * -ENOSPC - insufficient disk resources 1883 * -ENOSPC - insufficient disk resources
1891 * -EIO - i/o error 1884 * -EIO - i/o error
1892 * 1885 *
1893 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or 1886 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
1894 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; 1887 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
@@ -1933,7 +1926,7 @@ dbAllocDmapLev(struct bmap * bmp,
1933/* 1926/*
1934 * NAME: dbAllocDmap() 1927 * NAME: dbAllocDmap()
1935 * 1928 *
1936 * FUNCTION: adjust the disk allocation map to reflect the allocation 1929 * FUNCTION: adjust the disk allocation map to reflect the allocation
1937 * of a specified block range within a dmap. 1930 * of a specified block range within a dmap.
1938 * 1931 *
1939 * this routine allocates the specified blocks from the dmap 1932 * this routine allocates the specified blocks from the dmap
@@ -1946,14 +1939,14 @@ dbAllocDmapLev(struct bmap * bmp,
1946 * covers this dmap. 1939 * covers this dmap.
1947 * 1940 *
1948 * PARAMETERS: 1941 * PARAMETERS:
1949 * bmp - pointer to bmap descriptor 1942 * bmp - pointer to bmap descriptor
1950 * dp - pointer to dmap to allocate the block range from. 1943 * dp - pointer to dmap to allocate the block range from.
1951 * blkno - starting block number of the block to be allocated. 1944 * blkno - starting block number of the block to be allocated.
1952 * nblocks - number of blocks to be allocated. 1945 * nblocks - number of blocks to be allocated.
1953 * 1946 *
1954 * RETURN VALUES: 1947 * RETURN VALUES:
1955 * 0 - success 1948 * 0 - success
1956 * -EIO - i/o error 1949 * -EIO - i/o error
1957 * 1950 *
1958 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 1951 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
1959 */ 1952 */
@@ -1989,7 +1982,7 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
1989/* 1982/*
1990 * NAME: dbFreeDmap() 1983 * NAME: dbFreeDmap()
1991 * 1984 *
1992 * FUNCTION: adjust the disk allocation map to reflect the allocation 1985 * FUNCTION: adjust the disk allocation map to reflect the allocation
1993 * of a specified block range within a dmap. 1986 * of a specified block range within a dmap.
1994 * 1987 *
1995 * this routine frees the specified blocks from the dmap through 1988 * this routine frees the specified blocks from the dmap through
@@ -1997,18 +1990,18 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
1997 * causes the maximum string of free blocks within the dmap to 1990 * causes the maximum string of free blocks within the dmap to
1998 * change (i.e. the value of the root of the dmap's dmtree), this 1991 * change (i.e. the value of the root of the dmap's dmtree), this
1999 * routine will cause this change to be reflected up through the 1992 * routine will cause this change to be reflected up through the
2000 * appropriate levels of the dmap control pages by a call to 1993 * appropriate levels of the dmap control pages by a call to
2001 * dbAdjCtl() for the L0 dmap control page that covers this dmap. 1994 * dbAdjCtl() for the L0 dmap control page that covers this dmap.
2002 * 1995 *
2003 * PARAMETERS: 1996 * PARAMETERS:
2004 * bmp - pointer to bmap descriptor 1997 * bmp - pointer to bmap descriptor
2005 * dp - pointer to dmap to free the block range from. 1998 * dp - pointer to dmap to free the block range from.
2006 * blkno - starting block number of the block to be freed. 1999 * blkno - starting block number of the block to be freed.
2007 * nblocks - number of blocks to be freed. 2000 * nblocks - number of blocks to be freed.
2008 * 2001 *
2009 * RETURN VALUES: 2002 * RETURN VALUES:
2010 * 0 - success 2003 * 0 - success
2011 * -EIO - i/o error 2004 * -EIO - i/o error
2012 * 2005 *
2013 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2006 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2014 */ 2007 */
@@ -2055,7 +2048,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2055/* 2048/*
2056 * NAME: dbAllocBits() 2049 * NAME: dbAllocBits()
2057 * 2050 *
2058 * FUNCTION: allocate a specified block range from a dmap. 2051 * FUNCTION: allocate a specified block range from a dmap.
2059 * 2052 *
2060 * this routine updates the dmap to reflect the working 2053 * this routine updates the dmap to reflect the working
2061 * state allocation of the specified block range. it directly 2054 * state allocation of the specified block range. it directly
@@ -2065,10 +2058,10 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2065 * dmap's dmtree, as a whole, to reflect the allocated range. 2058 * dmap's dmtree, as a whole, to reflect the allocated range.
2066 * 2059 *
2067 * PARAMETERS: 2060 * PARAMETERS:
2068 * bmp - pointer to bmap descriptor 2061 * bmp - pointer to bmap descriptor
2069 * dp - pointer to dmap to allocate bits from. 2062 * dp - pointer to dmap to allocate bits from.
2070 * blkno - starting block number of the bits to be allocated. 2063 * blkno - starting block number of the bits to be allocated.
2071 * nblocks - number of bits to be allocated. 2064 * nblocks - number of bits to be allocated.
2072 * 2065 *
2073 * RETURN VALUES: none 2066 * RETURN VALUES: none
2074 * 2067 *
@@ -2149,7 +2142,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2149 * the allocated words. 2142 * the allocated words.
2150 */ 2143 */
2151 for (; nwords > 0; nwords -= nw) { 2144 for (; nwords > 0; nwords -= nw) {
2152 if (leaf[word] < BUDMIN) { 2145 if (leaf[word] < BUDMIN) {
2153 jfs_error(bmp->db_ipbmap->i_sb, 2146 jfs_error(bmp->db_ipbmap->i_sb,
2154 "dbAllocBits: leaf page " 2147 "dbAllocBits: leaf page "
2155 "corrupt"); 2148 "corrupt");
@@ -2202,7 +2195,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2202/* 2195/*
2203 * NAME: dbFreeBits() 2196 * NAME: dbFreeBits()
2204 * 2197 *
2205 * FUNCTION: free a specified block range from a dmap. 2198 * FUNCTION: free a specified block range from a dmap.
2206 * 2199 *
2207 * this routine updates the dmap to reflect the working 2200 * this routine updates the dmap to reflect the working
2208 * state allocation of the specified block range. it directly 2201 * state allocation of the specified block range. it directly
@@ -2212,10 +2205,10 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2212 * dmtree, as a whole, to reflect the deallocated range. 2205 * dmtree, as a whole, to reflect the deallocated range.
2213 * 2206 *
2214 * PARAMETERS: 2207 * PARAMETERS:
2215 * bmp - pointer to bmap descriptor 2208 * bmp - pointer to bmap descriptor
2216 * dp - pointer to dmap to free bits from. 2209 * dp - pointer to dmap to free bits from.
2217 * blkno - starting block number of the bits to be freed. 2210 * blkno - starting block number of the bits to be freed.
2218 * nblocks - number of bits to be freed. 2211 * nblocks - number of bits to be freed.
2219 * 2212 *
2220 * RETURN VALUES: 0 for success 2213 * RETURN VALUES: 0 for success
2221 * 2214 *
@@ -2388,19 +2381,19 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2388 * the new root value and the next dmap control page level to 2381 * the new root value and the next dmap control page level to
2389 * be adjusted. 2382 * be adjusted.
2390 * PARAMETERS: 2383 * PARAMETERS:
2391 * bmp - pointer to bmap descriptor 2384 * bmp - pointer to bmap descriptor
2392 * blkno - the first block of a block range within a dmap. it is 2385 * blkno - the first block of a block range within a dmap. it is
2393 * the allocation or deallocation of this block range that 2386 * the allocation or deallocation of this block range that
2394 * requires the dmap control page to be adjusted. 2387 * requires the dmap control page to be adjusted.
2395 * newval - the new value of the lower level dmap or dmap control 2388 * newval - the new value of the lower level dmap or dmap control
2396 * page root. 2389 * page root.
2397 * alloc - 'true' if adjustment is due to an allocation. 2390 * alloc - 'true' if adjustment is due to an allocation.
2398 * level - current level of dmap control page (i.e. L0, L1, L2) to 2391 * level - current level of dmap control page (i.e. L0, L1, L2) to
2399 * be adjusted. 2392 * be adjusted.
2400 * 2393 *
2401 * RETURN VALUES: 2394 * RETURN VALUES:
2402 * 0 - success 2395 * 0 - success
2403 * -EIO - i/o error 2396 * -EIO - i/o error
2404 * 2397 *
2405 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2398 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2406 */ 2399 */
@@ -2544,16 +2537,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2544/* 2537/*
2545 * NAME: dbSplit() 2538 * NAME: dbSplit()
2546 * 2539 *
2547 * FUNCTION: update the leaf of a dmtree with a new value, splitting 2540 * FUNCTION: update the leaf of a dmtree with a new value, splitting
2548 * the leaf from the binary buddy system of the dmtree's 2541 * the leaf from the binary buddy system of the dmtree's
2549 * leaves, as required. 2542 * leaves, as required.
2550 * 2543 *
2551 * PARAMETERS: 2544 * PARAMETERS:
2552 * tp - pointer to the tree containing the leaf. 2545 * tp - pointer to the tree containing the leaf.
2553 * leafno - the number of the leaf to be updated. 2546 * leafno - the number of the leaf to be updated.
2554 * splitsz - the size the binary buddy system starting at the leaf 2547 * splitsz - the size the binary buddy system starting at the leaf
2555 * must be split to, specified as the log2 number of blocks. 2548 * must be split to, specified as the log2 number of blocks.
2556 * newval - the new value for the leaf. 2549 * newval - the new value for the leaf.
2557 * 2550 *
2558 * RETURN VALUES: none 2551 * RETURN VALUES: none
2559 * 2552 *
@@ -2600,7 +2593,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2600/* 2593/*
2601 * NAME: dbBackSplit() 2594 * NAME: dbBackSplit()
2602 * 2595 *
2603 * FUNCTION: back split the binary buddy system of dmtree leaves 2596 * FUNCTION: back split the binary buddy system of dmtree leaves
2604 * that hold a specified leaf until the specified leaf 2597 * that hold a specified leaf until the specified leaf
2605 * starts its own binary buddy system. 2598 * starts its own binary buddy system.
2606 * 2599 *
@@ -2617,8 +2610,8 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2617 * in which a previous join operation must be backed out. 2610 * in which a previous join operation must be backed out.
2618 * 2611 *
2619 * PARAMETERS: 2612 * PARAMETERS:
2620 * tp - pointer to the tree containing the leaf. 2613 * tp - pointer to the tree containing the leaf.
2621 * leafno - the number of the leaf to be updated. 2614 * leafno - the number of the leaf to be updated.
2622 * 2615 *
2623 * RETURN VALUES: none 2616 * RETURN VALUES: none
2624 * 2617 *
@@ -2692,14 +2685,14 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
2692/* 2685/*
2693 * NAME: dbJoin() 2686 * NAME: dbJoin()
2694 * 2687 *
2695 * FUNCTION: update the leaf of a dmtree with a new value, joining 2688 * FUNCTION: update the leaf of a dmtree with a new value, joining
2696 * the leaf with other leaves of the dmtree into a multi-leaf 2689 * the leaf with other leaves of the dmtree into a multi-leaf
2697 * binary buddy system, as required. 2690 * binary buddy system, as required.
2698 * 2691 *
2699 * PARAMETERS: 2692 * PARAMETERS:
2700 * tp - pointer to the tree containing the leaf. 2693 * tp - pointer to the tree containing the leaf.
2701 * leafno - the number of the leaf to be updated. 2694 * leafno - the number of the leaf to be updated.
2702 * newval - the new value for the leaf. 2695 * newval - the new value for the leaf.
2703 * 2696 *
2704 * RETURN VALUES: none 2697 * RETURN VALUES: none
2705 */ 2698 */
@@ -2785,15 +2778,15 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
2785/* 2778/*
2786 * NAME: dbAdjTree() 2779 * NAME: dbAdjTree()
2787 * 2780 *
2788 * FUNCTION: update a leaf of a dmtree with a new value, adjusting 2781 * FUNCTION: update a leaf of a dmtree with a new value, adjusting
2789 * the dmtree, as required, to reflect the new leaf value. 2782 * the dmtree, as required, to reflect the new leaf value.
2790 * the combination of any buddies must already be done before 2783 * the combination of any buddies must already be done before
2791 * this is called. 2784 * this is called.
2792 * 2785 *
2793 * PARAMETERS: 2786 * PARAMETERS:
2794 * tp - pointer to the tree to be adjusted. 2787 * tp - pointer to the tree to be adjusted.
2795 * leafno - the number of the leaf to be updated. 2788 * leafno - the number of the leaf to be updated.
2796 * newval - the new value for the leaf. 2789 * newval - the new value for the leaf.
2797 * 2790 *
2798 * RETURN VALUES: none 2791 * RETURN VALUES: none
2799 */ 2792 */
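
dbSplit(), dbBackSplit() and dbJoin() above manage the dmtree leaves as a binary buddy system. As a generic illustration of that relationship only, not of the JFS routines themselves: the buddy of a leaf at buddy size l2size is the leaf whose index differs in exactly that bit.

#include <stdio.h>

/* Generic binary-buddy sketch, for illustration: two blocks of size
 * (1 << l2size) are buddies when their indices differ only in bit
 * l2size, so the buddy is found by flipping that bit.
 */
static int buddy_of(int leafno, int l2size)
{
        return leafno ^ (1 << l2size);
}

int main(void)
{
        printf("%d\n", buddy_of(0, 2));         /* leaves 0..3 pair with 4..7: prints 4 */
        printf("%d\n", buddy_of(12, 1));        /* leaves 12..13 pair with 14..15: prints 14 */
        return 0;
}
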
@@ -2852,7 +2845,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2852/* 2845/*
2853 * NAME: dbFindLeaf() 2846 * NAME: dbFindLeaf()
2854 * 2847 *
2855 * FUNCTION: search a dmtree_t for sufficient free blocks, returning 2848 * FUNCTION: search a dmtree_t for sufficient free blocks, returning
2856 * the index of a leaf describing the free blocks if 2849 * the index of a leaf describing the free blocks if
2857 * sufficient free blocks are found. 2850 * sufficient free blocks are found.
2858 * 2851 *
@@ -2861,15 +2854,15 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2861 * free space. 2854 * free space.
2862 * 2855 *
2863 * PARAMETERS: 2856 * PARAMETERS:
2864 * tp - pointer to the tree to be searched. 2857 * tp - pointer to the tree to be searched.
2865 * l2nb - log2 number of free blocks to search for. 2858 * l2nb - log2 number of free blocks to search for.
2866 * leafidx - return pointer to be set to the index of the leaf 2859 * leafidx - return pointer to be set to the index of the leaf
2867 * describing at least l2nb free blocks if sufficient 2860 * describing at least l2nb free blocks if sufficient
2868 * free blocks are found. 2861 * free blocks are found.
2869 * 2862 *
2870 * RETURN VALUES: 2863 * RETURN VALUES:
2871 * 0 - success 2864 * 0 - success
2872 * -ENOSPC - insufficient free blocks. 2865 * -ENOSPC - insufficient free blocks.
2873 */ 2866 */
2874static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) 2867static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2875{ 2868{
@@ -2916,18 +2909,18 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2916/* 2909/*
2917 * NAME: dbFindBits() 2910 * NAME: dbFindBits()
2918 * 2911 *
2919 * FUNCTION: find a specified number of binary buddy free bits within a 2912 * FUNCTION: find a specified number of binary buddy free bits within a
2920 * dmap bitmap word value. 2913 * dmap bitmap word value.
2921 * 2914 *
2922 * this routine searches the bitmap value for (1 << l2nb) free 2915 * this routine searches the bitmap value for (1 << l2nb) free
2923 * bits at (1 << l2nb) alignments within the value. 2916 * bits at (1 << l2nb) alignments within the value.
2924 * 2917 *
2925 * PARAMETERS: 2918 * PARAMETERS:
2926 * word - dmap bitmap word value. 2919 * word - dmap bitmap word value.
2927 * l2nb - number of free bits specified as a log2 number. 2920 * l2nb - number of free bits specified as a log2 number.
2928 * 2921 *
2929 * RETURN VALUES: 2922 * RETURN VALUES:
2930 * starting bit number of free bits. 2923 * starting bit number of free bits.
2931 */ 2924 */
2932static int dbFindBits(u32 word, int l2nb) 2925static int dbFindBits(u32 word, int l2nb)
2933{ 2926{
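
The search dbFindBits() describes, (1 << l2nb) free bits at a (1 << l2nb) alignment within one bitmap word, can be sketched as below. This is a simplified stand-in, not the kernel routine: bit 0 here is the least-significant bit and a clear bit is taken to mean free, which may differ from the on-disk dmap convention.

#include <stdint.h>
#include <stdio.h>

/* Simplified sketch of the aligned-run search described above: look
 * for (1 << l2nb) consecutive clear bits starting on a (1 << l2nb)
 * boundary inside a 32-bit word.  Returns the starting bit number or
 * -1.  Bit numbering and free-bit polarity are assumptions of this
 * sketch, not necessarily those of the dmap format.
 */
static int find_aligned_free_bits(uint32_t word, int l2nb)
{
        int nb = 1 << l2nb;
        uint32_t mask = (nb == 32) ? 0xffffffffu : ((1u << nb) - 1);
        int bit;

        for (bit = 0; bit <= 32 - nb; bit += nb)
                if (((word >> bit) & mask) == 0)
                        return bit;
        return -1;
}

int main(void)
{
        /* bits 4..7 are the only clear nibble, so the first aligned
         * run of 4 free bits starts at bit 4 */
        printf("%d\n", find_aligned_free_bits(0xffffff0fu, 2));  /* prints 4 */
        return 0;
}
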
@@ -2963,14 +2956,14 @@ static int dbFindBits(u32 word, int l2nb)
2963/* 2956/*
2964 * NAME: dbMaxBud(u8 *cp) 2957 * NAME: dbMaxBud(u8 *cp)
2965 * 2958 *
2966 * FUNCTION: determine the largest binary buddy string of free 2959 * FUNCTION: determine the largest binary buddy string of free
2967 * bits within 32-bits of the map. 2960 * bits within 32-bits of the map.
2968 * 2961 *
2969 * PARAMETERS: 2962 * PARAMETERS:
2970 * cp - pointer to the 32-bit value. 2963 * cp - pointer to the 32-bit value.
2971 * 2964 *
2972 * RETURN VALUES: 2965 * RETURN VALUES:
2973 * largest binary buddy of free bits within a dmap word. 2966 * largest binary buddy of free bits within a dmap word.
2974 */ 2967 */
2975static int dbMaxBud(u8 * cp) 2968static int dbMaxBud(u8 * cp)
2976{ 2969{
@@ -3000,14 +2993,14 @@ static int dbMaxBud(u8 * cp)
3000/* 2993/*
3001 * NAME: cnttz(uint word) 2994 * NAME: cnttz(uint word)
3002 * 2995 *
3003 * FUNCTION: determine the number of trailing zeros within a 32-bit 2996 * FUNCTION: determine the number of trailing zeros within a 32-bit
3004 * value. 2997 * value.
3005 * 2998 *
3006 * PARAMETERS: 2999 * PARAMETERS:
3007 * value - 32-bit value to be examined. 3000 * value - 32-bit value to be examined.
3008 * 3001 *
3009 * RETURN VALUES: 3002 * RETURN VALUES:
3010 * count of trailing zeros 3003 * count of trailing zeros
3011 */ 3004 */
3012static int cnttz(u32 word) 3005static int cnttz(u32 word)
3013{ 3006{
@@ -3025,14 +3018,14 @@ static int cnttz(u32 word)
3025/* 3018/*
3026 * NAME: cntlz(u32 value) 3019 * NAME: cntlz(u32 value)
3027 * 3020 *
3028 * FUNCTION: determine the number of leading zeros within a 32-bit 3021 * FUNCTION: determine the number of leading zeros within a 32-bit
3029 * value. 3022 * value.
3030 * 3023 *
3031 * PARAMETERS: 3024 * PARAMETERS:
3032 * value - 32-bit value to be examined. 3025 * value - 32-bit value to be examined.
3033 * 3026 *
3034 * RETURN VALUES: 3027 * RETURN VALUES:
3035 * count of leading zeros 3028 * count of leading zeros
3036 */ 3029 */
3037static int cntlz(u32 value) 3030static int cntlz(u32 value)
3038{ 3031{
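
cnttz() and cntlz() above are plain bit counts; a minimal loop-based sketch of the same two operations follows (not the kernel's implementation; mapping an input of 0 to 32 is just this sketch's convention):

#include <stdint.h>
#include <stdio.h>

/* Portable loop versions of the two counts described above,
 * for illustration only.
 */
static int trailing_zeros32(uint32_t v)
{
        int n;

        if (v == 0)
                return 32;
        for (n = 0; (v & 1) == 0; v >>= 1)
                n++;
        return n;
}

static int leading_zeros32(uint32_t v)
{
        int n;

        if (v == 0)
                return 32;
        for (n = 0; (v & 0x80000000u) == 0; v <<= 1)
                n++;
        return n;
}

int main(void)
{
        printf("%d %d\n", trailing_zeros32(0x00000008u),  /* 3 */
                          leading_zeros32(0x00000008u));  /* 28 */
        return 0;
}
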
@@ -3050,14 +3043,14 @@ static int cntlz(u32 value)
3050 * NAME: blkstol2(s64 nb) 3043 * NAME: blkstol2(s64 nb)
3051 * 3044 *
3052 * FUNCTION: convert a block count to its log2 value. if the block 3045 * FUNCTION: convert a block count to its log2 value. if the block
3053 * count is not a l2 multiple, it is rounded up to the next 3046 * count is not a l2 multiple, it is rounded up to the next
3054 * larger l2 multiple. 3047 * larger l2 multiple.
3055 * 3048 *
3056 * PARAMETERS: 3049 * PARAMETERS:
3057 * nb - number of blocks 3050 * nb - number of blocks
3058 * 3051 *
3059 * RETURN VALUES: 3052 * RETURN VALUES:
3060 * log2 number of blocks 3053 * log2 number of blocks
3061 */ 3054 */
3062static int blkstol2(s64 nb) 3055static int blkstol2(s64 nb)
3063{ 3056{
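
The rounding rule blkstol2() describes, the log2 of a block count rounded up to the next power of two when the count is not an exact power, can be illustrated with a simple loop (a sketch, not the kernel's implementation):

#include <stdint.h>
#include <stdio.h>

/* Illustration of the rounding rule described above: find the
 * smallest power of two >= nb and return its exponent.
 */
static int blocks_to_l2(int64_t nb)
{
        int l2 = 0;
        int64_t p = 1;

        while (p < nb) {
                p <<= 1;
                l2++;
        }
        return l2;              /* exact when nb is already a power of two */
}

int main(void)
{
        printf("%d %d %d\n", blocks_to_l2(8),   /* 3 */
                             blocks_to_l2(9),   /* 4: rounded up */
                             blocks_to_l2(1));  /* 0 */
        return 0;
}
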
@@ -3099,13 +3092,13 @@ static int blkstol2(s64 nb)
3099 * at a time. 3092 * at a time.
3100 * 3093 *
3101 * PARAMETERS: 3094 * PARAMETERS:
3102 * ip - pointer to in-core inode; 3095 * ip - pointer to in-core inode;
3103 * blkno - starting block number to be freed. 3096 * blkno - starting block number to be freed.
3104 * nblocks - number of blocks to be freed. 3097 * nblocks - number of blocks to be freed.
3105 * 3098 *
3106 * RETURN VALUES: 3099 * RETURN VALUES:
3107 * 0 - success 3100 * 0 - success
3108 * -EIO - i/o error 3101 * -EIO - i/o error
3109 */ 3102 */
3110int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) 3103int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
3111{ 3104{
@@ -3278,10 +3271,10 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3278 * L2 3271 * L2
3279 * | 3272 * |
3280 * L1---------------------------------L1 3273 * L1---------------------------------L1
3281 * | | 3274 * | |
3282 * L0---------L0---------L0 L0---------L0---------L0 3275 * L0---------L0---------L0 L0---------L0---------L0
3283 * | | | | | | 3276 * | | | | | |
3284 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; 3277 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm;
3285 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm 3278 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
3286 * 3279 *
3287 * <---old---><----------------------------extend-----------------------> 3280 * <---old---><----------------------------extend----------------------->
@@ -3307,7 +3300,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3307 (long long) blkno, (long long) nblocks, (long long) newsize); 3300 (long long) blkno, (long long) nblocks, (long long) newsize);
3308 3301
3309 /* 3302 /*
3310 * initialize bmap control page. 3303 * initialize bmap control page.
3311 * 3304 *
3312 * all the data in bmap control page should exclude 3305 * all the data in bmap control page should exclude
3313 * the mkfs hidden dmap page. 3306 * the mkfs hidden dmap page.
@@ -3330,7 +3323,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3330 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; 3323 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
3331 3324
3332 /* 3325 /*
3333 * reconfigure db_agfree[] 3326 * reconfigure db_agfree[]
3334 * from old AG configuration to new AG configuration; 3327 * from old AG configuration to new AG configuration;
3335 * 3328 *
3336 * coalesce contiguous k (newAGSize/oldAGSize) AGs; 3329 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
@@ -3362,7 +3355,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3362 bmp->db_maxag = bmp->db_maxag / k; 3355 bmp->db_maxag = bmp->db_maxag / k;
3363 3356
3364 /* 3357 /*
3365 * extend bmap 3358 * extend bmap
3366 * 3359 *
3367 * update bit maps and corresponding level control pages; 3360 * update bit maps and corresponding level control pages;
3368 * global control page db_nfree, db_agfree[agno], db_maxfreebud; 3361 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
@@ -3410,7 +3403,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3410 /* compute start L0 */ 3403 /* compute start L0 */
3411 j = 0; 3404 j = 0;
3412 l1leaf = l1dcp->stree + CTLLEAFIND; 3405 l1leaf = l1dcp->stree + CTLLEAFIND;
3413 p += nbperpage; /* 1st L0 of L1.k */ 3406 p += nbperpage; /* 1st L0 of L1.k */
3414 } 3407 }
3415 3408
3416 /* 3409 /*
@@ -3548,7 +3541,7 @@ errout:
3548 return -EIO; 3541 return -EIO;
3549 3542
3550 /* 3543 /*
3551 * finalize bmap control page 3544 * finalize bmap control page
3552 */ 3545 */
3553finalize: 3546finalize:
3554 3547
@@ -3567,7 +3560,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3567 int i, n; 3560 int i, n;
3568 3561
3569 /* 3562 /*
3570 * finalize bmap control page 3563 * finalize bmap control page
3571 */ 3564 */
3572//finalize: 3565//finalize:
3573 /* 3566 /*
@@ -3953,8 +3946,8 @@ static int dbGetL2AGSize(s64 nblocks)
3953 * convert number of map pages to the zero origin top dmapctl level 3946 * convert number of map pages to the zero origin top dmapctl level
3954 */ 3947 */
3955#define BMAPPGTOLEV(npages) \ 3948#define BMAPPGTOLEV(npages) \
3956 (((npages) <= 3 + MAXL0PAGES) ? 0 \ 3949 (((npages) <= 3 + MAXL0PAGES) ? 0 : \
3957 : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) 3950 ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
3958 3951
3959s64 dbMapFileSizeToMapSize(struct inode * ipbmap) 3952s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
3960{ 3953{
@@ -3981,8 +3974,8 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
3981 factor = 3974 factor =
3982 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); 3975 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
3983 complete = (u32) npages / factor; 3976 complete = (u32) npages / factor;
3984 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL 3977 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL :
3985 : ((i == 1) ? LPERCTL : 1)); 3978 ((i == 1) ? LPERCTL : 1));
3986 3979
3987 /* pages in last/incomplete child */ 3980 /* pages in last/incomplete child */
3988 npages = (u32) npages % factor; 3981 npages = (u32) npages % factor;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 45ea454c74bd..11e6d471b364 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -83,7 +83,7 @@ static __inline signed char TREEMAX(signed char *cp)
83 * - 1 is added to account for the control page of the map. 83 * - 1 is added to account for the control page of the map.
84 */ 84 */
85#define BLKTODMAP(b,s) \ 85#define BLKTODMAP(b,s) \
86 ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) 86 ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))
87 87
88/* 88/*
89 * convert disk block number to the logical block number of the LEVEL 0 89 * convert disk block number to the logical block number of the LEVEL 0
@@ -98,7 +98,7 @@ static __inline signed char TREEMAX(signed char *cp)
98 * - 1 is added to account for the control page of the map. 98 * - 1 is added to account for the control page of the map.
99 */ 99 */
100#define BLKTOL0(b,s) \ 100#define BLKTOL0(b,s) \
101 (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) 101 (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s))
102 102
103/* 103/*
104 * convert disk block number to the logical block number of the LEVEL 1 104 * convert disk block number to the logical block number of the LEVEL 1
@@ -120,7 +120,7 @@ static __inline signed char TREEMAX(signed char *cp)
120 * at the specified level which describes the disk block. 120 * at the specified level which describes the disk block.
121 */ 121 */
122#define BLKTOCTL(b,s,l) \ 122#define BLKTOCTL(b,s,l) \
123 (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) 123 (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
124 124
125/* 125/*
126 * convert aggregate map size to the zero origin dmapctl level of the 126 * convert aggregate map size to the zero origin dmapctl level of the
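
A worked example of the BLKTODMAP() arithmetic a few hunks above may help: b >> 13 counts the dmap pages preceding block b (a dmap covers 8K blocks, per the struct dmap comment further down), the >> 23, >> 33 and constant terms presumably add the interleaved and fixed control pages, including the "+ 1" map control page noted in that macro's comment, and the final shift by s turns a page count into a block address. The values of s and the page interpretation below are assumptions made only for this sketch.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the BLKTODMAP() expression for illustration; it is not a
 * replacement for the macro.
 */
static int64_t blk_to_dmap(int64_t b, int s)
{
        return ((b >> 13) + (b >> 23) + (b >> 33) + 3 + 1) << s;
}

int main(void)
{
        /* with s == 0 (one block per map page, an assumption for the
         * example): block 0 maps to page 4, and a block in the fourth
         * dmap (3 * 8192) maps to page 7 */
        printf("%lld %lld\n",
               (long long)blk_to_dmap(0, 0),
               (long long)blk_to_dmap(3 * 8192, 0));
        return 0;
}
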
@@ -145,27 +145,27 @@ static __inline signed char TREEMAX(signed char *cp)
145 * dmaptree must be consistent with dmapctl. 145 * dmaptree must be consistent with dmapctl.
146 */ 146 */
147struct dmaptree { 147struct dmaptree {
148 __le32 nleafs; /* 4: number of tree leafs */ 148 __le32 nleafs; /* 4: number of tree leafs */
149 __le32 l2nleafs; /* 4: l2 number of tree leafs */ 149 __le32 l2nleafs; /* 4: l2 number of tree leafs */
150 __le32 leafidx; /* 4: index of first tree leaf */ 150 __le32 leafidx; /* 4: index of first tree leaf */
151 __le32 height; /* 4: height of the tree */ 151 __le32 height; /* 4: height of the tree */
152 s8 budmin; /* 1: min l2 tree leaf value to combine */ 152 s8 budmin; /* 1: min l2 tree leaf value to combine */
153 s8 stree[TREESIZE]; /* TREESIZE: tree */ 153 s8 stree[TREESIZE]; /* TREESIZE: tree */
154 u8 pad[2]; /* 2: pad to word boundary */ 154 u8 pad[2]; /* 2: pad to word boundary */
155}; /* - 360 - */ 155}; /* - 360 - */
156 156
157/* 157/*
158 * dmap page per 8K blocks bitmap 158 * dmap page per 8K blocks bitmap
159 */ 159 */
160struct dmap { 160struct dmap {
161 __le32 nblocks; /* 4: num blks covered by this dmap */ 161 __le32 nblocks; /* 4: num blks covered by this dmap */
162 __le32 nfree; /* 4: num of free blks in this dmap */ 162 __le32 nfree; /* 4: num of free blks in this dmap */
163 __le64 start; /* 8: starting blkno for this dmap */ 163 __le64 start; /* 8: starting blkno for this dmap */
164 struct dmaptree tree; /* 360: dmap tree */ 164 struct dmaptree tree; /* 360: dmap tree */
165 u8 pad[1672]; /* 1672: pad to 2048 bytes */ 165 u8 pad[1672]; /* 1672: pad to 2048 bytes */
166 __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ 166 __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */
167 __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ 167 __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */
168}; /* - 4096 - */ 168}; /* - 4096 - */
169 169
170/* 170/*
171 * disk map control page per level. 171 * disk map control page per level.
@@ -173,14 +173,14 @@ struct dmap {
173 * dmapctl must be consistent with dmaptree. 173 * dmapctl must be consistent with dmaptree.
174 */ 174 */
175struct dmapctl { 175struct dmapctl {
176 __le32 nleafs; /* 4: number of tree leafs */ 176 __le32 nleafs; /* 4: number of tree leafs */
177 __le32 l2nleafs; /* 4: l2 number of tree leafs */ 177 __le32 l2nleafs; /* 4: l2 number of tree leafs */
178 __le32 leafidx; /* 4: index of the first tree leaf */ 178 __le32 leafidx; /* 4: index of the first tree leaf */
179 __le32 height; /* 4: height of tree */ 179 __le32 height; /* 4: height of tree */
180 s8 budmin; /* 1: minimum l2 tree leaf value */ 180 s8 budmin; /* 1: minimum l2 tree leaf value */
181 s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ 181 s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */
182 u8 pad[2714]; /* 2714: pad to 4096 */ 182 u8 pad[2714]; /* 2714: pad to 4096 */
183}; /* - 4096 - */ 183}; /* - 4096 - */
184 184
185/* 185/*
186 * common definition for dmaptree within dmap and dmapctl 186 * common definition for dmaptree within dmap and dmapctl
@@ -202,41 +202,41 @@ typedef union dmtree {
202 * on-disk aggregate disk allocation map descriptor. 202 * on-disk aggregate disk allocation map descriptor.
203 */ 203 */
204struct dbmap_disk { 204struct dbmap_disk {
205 __le64 dn_mapsize; /* 8: number of blocks in aggregate */ 205 __le64 dn_mapsize; /* 8: number of blocks in aggregate */
206 __le64 dn_nfree; /* 8: num free blks in aggregate map */ 206 __le64 dn_nfree; /* 8: num free blks in aggregate map */
207 __le32 dn_l2nbperpage; /* 4: number of blks per page */ 207 __le32 dn_l2nbperpage; /* 4: number of blks per page */
208 __le32 dn_numag; /* 4: total number of ags */ 208 __le32 dn_numag; /* 4: total number of ags */
209 __le32 dn_maxlevel; /* 4: number of active ags */ 209 __le32 dn_maxlevel; /* 4: number of active ags */
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
217 __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ 217 __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */
218 __le64 dn_agsize; /* 8: num of blks per alloc group */ 218 __le64 dn_agsize; /* 8: num of blks per alloc group */
219 s8 dn_maxfreebud; /* 1: max free buddy system */ 219 s8 dn_maxfreebud; /* 1: max free buddy system */
220 u8 pad[3007]; /* 3007: pad to 4096 */ 220 u8 pad[3007]; /* 3007: pad to 4096 */
221}; /* - 4096 - */ 221}; /* - 4096 - */
222 222
223struct dbmap { 223struct dbmap {
224 s64 dn_mapsize; /* number of blocks in aggregate */ 224 s64 dn_mapsize; /* number of blocks in aggregate */
225 s64 dn_nfree; /* num free blks in aggregate map */ 225 s64 dn_nfree; /* num free blks in aggregate map */
226 int dn_l2nbperpage; /* number of blks per page */ 226 int dn_l2nbperpage; /* number of blks per page */
227 int dn_numag; /* total number of ags */ 227 int dn_numag; /* total number of ags */
228 int dn_maxlevel; /* number of active ags */ 228 int dn_maxlevel; /* number of active ags */
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheigth; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
236 s64 dn_agfree[MAXAG]; /* per AG free count */ 236 s64 dn_agfree[MAXAG]; /* per AG free count */
237 s64 dn_agsize; /* num of blks per alloc group */ 237 s64 dn_agsize; /* num of blks per alloc group */
238 signed char dn_maxfreebud; /* max free buddy system */ 238 signed char dn_maxfreebud; /* max free buddy system */
239}; /* - 4096 - */ 239}; /* - 4096 - */
240/* 240/*
241 * in-memory aggregate disk allocation map descriptor. 241 * in-memory aggregate disk allocation map descriptor.
242 */ 242 */
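
For illustration only (not part of this patch): struct dbmap_disk above is the little-endian on-disk image of the map descriptor, padded by pad[] to exactly one 4096-byte metadata page, while struct dbmap is its native in-memory counterpart. A hypothetical loader for a few of the fields could look like the sketch below; dbmap_from_disk is an invented name, and the real conversion in JFS happens inline at mount time.

static void dbmap_from_disk(struct dbmap *dst, const struct dbmap_disk *src)
{
	/* pad[3007] keeps the on-disk descriptor exactly one 4K page */
	BUILD_BUG_ON(sizeof(struct dbmap_disk) != 4096);

	dst->dn_mapsize     = le64_to_cpu(src->dn_mapsize);
	dst->dn_nfree       = le64_to_cpu(src->dn_nfree);
	dst->dn_l2nbperpage = le32_to_cpu(src->dn_l2nbperpage);
	dst->dn_numag       = le32_to_cpu(src->dn_numag);
	/* ...the remaining fields convert the same way, dn_agfree[] in a loop */
}
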
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 6d62f3222892..c14ba3cfa818 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -315,8 +315,8 @@ static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
315 lv = &llck->lv[llck->index]; 315 lv = &llck->lv[llck->index];
316 316
317 /* 317 /*
318 * Linelock slot size is twice the size of directory table 318 * Linelock slot size is twice the size of directory table
319 * slot size. 512 entries per page. 319 * slot size. 512 entries per page.
320 */ 320 */
321 lv->offset = ((index - 2) & 511) >> 1; 321 lv->offset = ((index - 2) & 511) >> 1;
322 lv->length = 1; 322 lv->length = 1;
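
A quick worked example of the arithmetic above (illustrative, not part of the patch): index 2 is the first regular directory-table entry, a page holds 512 table slots, and each linelock slot covers two of them, hence ((index - 2) & 511) >> 1:

/*
 *   index   2 -> ((  2 - 2) & 511) >> 1 =   0   (first linelock slot)
 *   index  11 -> (( 11 - 2) & 511) >> 1 =   4
 *   index 513 -> ((513 - 2) & 511) >> 1 = 255   (last slot of the page)
 */
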
@@ -615,7 +615,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
615 btstack->nsplit = 1; 615 btstack->nsplit = 1;
616 616
617 /* 617 /*
618 * search down tree from root: 618 * search down tree from root:
619 * 619 *
620 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 620 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
621 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 621 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -659,7 +659,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
659 } 659 }
660 if (cmp == 0) { 660 if (cmp == 0) {
661 /* 661 /*
662 * search hit 662 * search hit
663 */ 663 */
664 /* search hit - leaf page: 664 /* search hit - leaf page:
665 * return the entry found 665 * return the entry found
@@ -723,7 +723,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
723 } 723 }
724 724
725 /* 725 /*
726 * search miss 726 * search miss
727 * 727 *
728 * base is the smallest index with key (Kj) greater than 728 * base is the smallest index with key (Kj) greater than
729 * search key (K) and may be zero or (maxindex + 1) index. 729 * search key (K) and may be zero or (maxindex + 1) index.
@@ -834,7 +834,7 @@ int dtInsert(tid_t tid, struct inode *ip,
834 struct lv *lv; 834 struct lv *lv;
835 835
836 /* 836 /*
837 * retrieve search result 837 * retrieve search result
838 * 838 *
839 * dtSearch() returns (leaf page pinned, index at which to insert). 839 * dtSearch() returns (leaf page pinned, index at which to insert).
840 * n.b. dtSearch() may return index of (maxindex + 1) of 840 * n.b. dtSearch() may return index of (maxindex + 1) of
@@ -843,7 +843,7 @@ int dtInsert(tid_t tid, struct inode *ip,
843 DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); 843 DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
844 844
845 /* 845 /*
846 * insert entry for new key 846 * insert entry for new key
847 */ 847 */
848 if (DO_INDEX(ip)) { 848 if (DO_INDEX(ip)) {
849 if (JFS_IP(ip)->next_index == DIREND) { 849 if (JFS_IP(ip)->next_index == DIREND) {
@@ -860,9 +860,9 @@ int dtInsert(tid_t tid, struct inode *ip,
860 data.leaf.ino = *fsn; 860 data.leaf.ino = *fsn;
861 861
862 /* 862 /*
863 * leaf page does not have enough room for new entry: 863 * leaf page does not have enough room for new entry:
864 * 864 *
865 * extend/split the leaf page; 865 * extend/split the leaf page;
866 * 866 *
867 * dtSplitUp() will insert the entry and unpin the leaf page. 867 * dtSplitUp() will insert the entry and unpin the leaf page.
868 */ 868 */
@@ -877,9 +877,9 @@ int dtInsert(tid_t tid, struct inode *ip,
877 } 877 }
878 878
879 /* 879 /*
880 * leaf page does have enough room for new entry: 880 * leaf page does have enough room for new entry:
881 * 881 *
882 * insert the new data entry into the leaf page; 882 * insert the new data entry into the leaf page;
883 */ 883 */
884 BT_MARK_DIRTY(mp, ip); 884 BT_MARK_DIRTY(mp, ip);
885 /* 885 /*
@@ -967,13 +967,13 @@ static int dtSplitUp(tid_t tid,
967 } 967 }
968 968
969 /* 969 /*
970 * split leaf page 970 * split leaf page
971 * 971 *
972 * The split routines insert the new entry, and 972 * The split routines insert the new entry, and
973 * acquire txLock as appropriate. 973 * acquire txLock as appropriate.
974 */ 974 */
975 /* 975 /*
976 * split root leaf page: 976 * split root leaf page:
977 */ 977 */
978 if (sp->header.flag & BT_ROOT) { 978 if (sp->header.flag & BT_ROOT) {
979 /* 979 /*
@@ -1012,7 +1012,7 @@ static int dtSplitUp(tid_t tid,
1012 } 1012 }
1013 1013
1014 /* 1014 /*
1015 * extend first leaf page 1015 * extend first leaf page
1016 * 1016 *
1017 * extend the 1st extent if less than buffer page size 1017 * extend the 1st extent if less than buffer page size
1018 * (dtExtendPage() returns leaf page unpinned) 1018 * (dtExtendPage() returns leaf page unpinned)
@@ -1068,7 +1068,7 @@ static int dtSplitUp(tid_t tid,
1068 } 1068 }
1069 1069
1070 /* 1070 /*
1071 * split leaf page <sp> into <sp> and a new right page <rp>. 1071 * split leaf page <sp> into <sp> and a new right page <rp>.
1072 * 1072 *
1073 * return <rp> pinned and its extent descriptor <rpxd> 1073 * return <rp> pinned and its extent descriptor <rpxd>
1074 */ 1074 */
@@ -1433,7 +1433,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1433 rp->header.freecnt = rp->header.maxslot - fsi; 1433 rp->header.freecnt = rp->header.maxslot - fsi;
1434 1434
1435 /* 1435 /*
1436 * sequential append at tail: append without split 1436 * sequential append at tail: append without split
1437 * 1437 *
1438 * If splitting the last page on a level because of appending 1438 * If splitting the last page on a level because of appending
1439 * an entry to it (skip is maxentry), it's likely that the access is 1439 * an entry to it (skip is maxentry), it's likely that the access is
@@ -1467,7 +1467,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1467 } 1467 }
1468 1468
1469 /* 1469 /*
1470 * non-sequential insert (at possibly middle page) 1470 * non-sequential insert (at possibly middle page)
1471 */ 1471 */
1472 1472
1473 /* 1473 /*
@@ -1508,7 +1508,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1508 left = 0; 1508 left = 0;
1509 1509
1510 /* 1510 /*
1511 * compute fill factor for split pages 1511 * compute fill factor for split pages
1512 * 1512 *
1513 * <nxt> traces the next entry to move to rp 1513 * <nxt> traces the next entry to move to rp
1514 * <off> traces the next entry to stay in sp 1514 * <off> traces the next entry to stay in sp
@@ -1551,7 +1551,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1551 /* <nxt> points to the 1st entry to move */ 1551 /* <nxt> points to the 1st entry to move */
1552 1552
1553 /* 1553 /*
1554 * move entries to right page 1554 * move entries to right page
1555 * 1555 *
1556 * dtMoveEntry() initializes rp and reserves entry for insertion 1556 * dtMoveEntry() initializes rp and reserves entry for insertion
1557 * 1557 *
@@ -1677,7 +1677,7 @@ static int dtExtendPage(tid_t tid,
1677 return (rc); 1677 return (rc);
1678 1678
1679 /* 1679 /*
1680 * extend the extent 1680 * extend the extent
1681 */ 1681 */
1682 pxdlist = split->pxdlist; 1682 pxdlist = split->pxdlist;
1683 pxd = &pxdlist->pxd[pxdlist->npxd]; 1683 pxd = &pxdlist->pxd[pxdlist->npxd];
@@ -1722,7 +1722,7 @@ static int dtExtendPage(tid_t tid,
1722 } 1722 }
1723 1723
1724 /* 1724 /*
1725 * extend the page 1725 * extend the page
1726 */ 1726 */
1727 sp->header.self = *pxd; 1727 sp->header.self = *pxd;
1728 1728
@@ -1739,9 +1739,6 @@ static int dtExtendPage(tid_t tid,
1739 /* update buffer extent descriptor of extended page */ 1739 /* update buffer extent descriptor of extended page */
1740 xlen = lengthPXD(pxd); 1740 xlen = lengthPXD(pxd);
1741 xsize = xlen << JFS_SBI(sb)->l2bsize; 1741 xsize = xlen << JFS_SBI(sb)->l2bsize;
1742#ifdef _STILL_TO_PORT
1743 bmSetXD(smp, xaddr, xsize);
1744#endif /* _STILL_TO_PORT */
1745 1742
1746 /* 1743 /*
1747 * copy old stbl to new stbl at start of extended area 1744 * copy old stbl to new stbl at start of extended area
@@ -1836,7 +1833,7 @@ static int dtExtendPage(tid_t tid,
1836 } 1833 }
1837 1834
1838 /* 1835 /*
1839 * update parent entry on the parent/root page 1836 * update parent entry on the parent/root page
1840 */ 1837 */
1841 /* 1838 /*
1842 * acquire a transaction lock on the parent/root page 1839 * acquire a transaction lock on the parent/root page
@@ -1904,7 +1901,7 @@ static int dtSplitRoot(tid_t tid,
1904 sp = &JFS_IP(ip)->i_dtroot; 1901 sp = &JFS_IP(ip)->i_dtroot;
1905 1902
1906 /* 1903 /*
1907 * allocate/initialize a single (right) child page 1904 * allocate/initialize a single (right) child page
1908 * 1905 *
1909 * N.B. at first split, a one (or two) block to fit new entry 1906 * N.B. at first split, a one (or two) block to fit new entry
1910 * is allocated; at subsequent split, a full page is allocated; 1907 * is allocated; at subsequent split, a full page is allocated;
@@ -1943,7 +1940,7 @@ static int dtSplitRoot(tid_t tid,
1943 rp->header.prev = 0; 1940 rp->header.prev = 0;
1944 1941
1945 /* 1942 /*
1946 * move in-line root page into new right page extent 1943 * move in-line root page into new right page extent
1947 */ 1944 */
1948 /* linelock header + copied entries + new stbl (1st slot) in new page */ 1945 /* linelock header + copied entries + new stbl (1st slot) in new page */
1949 ASSERT(dtlck->index == 0); 1946 ASSERT(dtlck->index == 0);
@@ -2016,7 +2013,7 @@ static int dtSplitRoot(tid_t tid,
2016 dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); 2013 dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2017 2014
2018 /* 2015 /*
2019 * reset parent/root page 2016 * reset parent/root page
2020 * 2017 *
2021 * set the 1st entry offset to 0, which forces the left-most key 2018 * set the 1st entry offset to 0, which forces the left-most key
2022 * at any level of the tree to be less than any search key. 2019 * at any level of the tree to be less than any search key.
@@ -2102,7 +2099,7 @@ int dtDelete(tid_t tid,
2102 dtpage_t *np; 2099 dtpage_t *np;
2103 2100
2104 /* 2101 /*
2105 * search for the entry to delete: 2102 * search for the entry to delete:
2106 * 2103 *
2107 * dtSearch() returns (leaf page pinned, index at which to delete). 2104 * dtSearch() returns (leaf page pinned, index at which to delete).
2108 */ 2105 */
@@ -2253,7 +2250,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2253 int i; 2250 int i;
2254 2251
2255 /* 2252 /*
2256 * keep the root leaf page which has become empty 2253 * keep the root leaf page which has become empty
2257 */ 2254 */
2258 if (BT_IS_ROOT(fmp)) { 2255 if (BT_IS_ROOT(fmp)) {
2259 /* 2256 /*
@@ -2269,7 +2266,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2269 } 2266 }
2270 2267
2271 /* 2268 /*
2272 * free the non-root leaf page 2269 * free the non-root leaf page
2273 */ 2270 */
2274 /* 2271 /*
2275 * acquire a transaction lock on the page 2272 * acquire a transaction lock on the page
@@ -2299,7 +2296,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2299 discard_metapage(fmp); 2296 discard_metapage(fmp);
2300 2297
2301 /* 2298 /*
2302 * propagate page deletion up the directory tree 2299 * propagate page deletion up the directory tree
2303 * 2300 *
2304 * If the delete from the parent page makes it empty, 2301 * If the delete from the parent page makes it empty,
2305 * continue all the way up the tree. 2302 * continue all the way up the tree.
@@ -2440,10 +2437,10 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2440 2437
2441#ifdef _NOTYET 2438#ifdef _NOTYET
2442/* 2439/*
2443 * NAME: dtRelocate() 2440 * NAME: dtRelocate()
2444 * 2441 *
2445 * FUNCTION: relocate dtpage (internal or leaf) of directory; 2442 * FUNCTION: relocate dtpage (internal or leaf) of directory;
2446 * This function is mainly used by defragfs utility. 2443 * This function is mainly used by defragfs utility.
2447 */ 2444 */
2448int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, 2445int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2449 s64 nxaddr) 2446 s64 nxaddr)
@@ -2471,8 +2468,8 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2471 xlen); 2468 xlen);
2472 2469
2473 /* 2470 /*
2474 * 1. get the internal parent dtpage covering 2471 * 1. get the internal parent dtpage covering
2475 * router entry for the target page to be relocated; 2472 * router entry for the target page to be relocated;
2476 */ 2473 */
2477 rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); 2474 rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
2478 if (rc) 2475 if (rc)
@@ -2483,7 +2480,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2483 jfs_info("dtRelocate: parent router entry validated."); 2480 jfs_info("dtRelocate: parent router entry validated.");
2484 2481
2485 /* 2482 /*
2486 * 2. relocate the target dtpage 2483 * 2. relocate the target dtpage
2487 */ 2484 */
2488 /* read in the target page from src extent */ 2485 /* read in the target page from src extent */
2489 DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); 2486 DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
@@ -2581,9 +2578,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2581 2578
2582 /* update the buffer extent descriptor of the dtpage */ 2579 /* update the buffer extent descriptor of the dtpage */
2583 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; 2580 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2584#ifdef _STILL_TO_PORT 2581
2585 bmSetXD(mp, nxaddr, xsize);
2586#endif /* _STILL_TO_PORT */
2587 /* unpin the relocated page */ 2582 /* unpin the relocated page */
2588 DT_PUTPAGE(mp); 2583 DT_PUTPAGE(mp);
2589 jfs_info("dtRelocate: target dtpage relocated."); 2584 jfs_info("dtRelocate: target dtpage relocated.");
@@ -2594,7 +2589,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2594 */ 2589 */
2595 2590
2596 /* 2591 /*
2597 * 3. acquire maplock for the source extent to be freed; 2592 * 3. acquire maplock for the source extent to be freed;
2598 */ 2593 */
2599 /* for dtpage relocation, write a LOG_NOREDOPAGE record 2594 /* for dtpage relocation, write a LOG_NOREDOPAGE record
2600 * for the source dtpage (logredo() will init NoRedoPage 2595 * for the source dtpage (logredo() will init NoRedoPage
@@ -2609,7 +2604,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2609 pxdlock->index = 1; 2604 pxdlock->index = 1;
2610 2605
2611 /* 2606 /*
2612 * 4. update the parent router entry for relocation; 2607 * 4. update the parent router entry for relocation;
2613 * 2608 *
2614 * acquire tlck for the parent entry covering the target dtpage; 2609 * acquire tlck for the parent entry covering the target dtpage;
2615 * write LOG_REDOPAGE to apply after image only; 2610 * write LOG_REDOPAGE to apply after image only;
@@ -2637,7 +2632,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2637 * NAME: dtSearchNode() 2632 * NAME: dtSearchNode()
2638 * 2633 *
2639 * FUNCTION: Search for a dtpage containing a specified address 2634 * FUNCTION: Search for a dtpage containing a specified address
2640 * This function is mainly used by defragfs utility. 2635 * This function is mainly used by defragfs utility.
2641 * 2636 *
2642 * NOTE: Search result on stack, the found page is pinned at exit. 2637 * NOTE: Search result on stack, the found page is pinned at exit.
2643 * The result page must be an internal dtpage. 2638 * The result page must be an internal dtpage.
@@ -2660,7 +2655,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2660 BT_CLR(btstack); /* reset stack */ 2655 BT_CLR(btstack); /* reset stack */
2661 2656
2662 /* 2657 /*
2663 * descend tree to the level with specified leftmost page 2658 * descend tree to the level with specified leftmost page
2664 * 2659 *
2665 * by convention, root bn = 0. 2660 * by convention, root bn = 0.
2666 */ 2661 */
@@ -2699,7 +2694,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2699 } 2694 }
2700 2695
2701 /* 2696 /*
2702 * search each page at the current level 2697 * search each page at the current level
2703 */ 2698 */
2704 loop: 2699 loop:
2705 stbl = DT_GETSTBL(p); 2700 stbl = DT_GETSTBL(p);
@@ -3044,9 +3039,9 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3044 if (DO_INDEX(ip)) { 3039 if (DO_INDEX(ip)) {
3045 /* 3040 /*
3046 * persistent index is stored in directory entries. 3041 * persistent index is stored in directory entries.
3047 * Special cases: 0 = . 3042 * Special cases: 0 = .
3048 * 1 = .. 3043 * 1 = ..
3049 * -1 = End of directory 3044 * -1 = End of directory
3050 */ 3045 */
3051 do_index = 1; 3046 do_index = 1;
3052 3047
@@ -3128,10 +3123,10 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3128 /* 3123 /*
3129 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 3124 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
3130 * 3125 *
3131 * pn = index = 0: First entry "." 3126 * pn = index = 0: First entry "."
3132 * pn = 0; index = 1: Second entry ".." 3127 * pn = 0; index = 1: Second entry ".."
3133 * pn > 0: Real entries, pn=1 -> leftmost page 3128 * pn > 0: Real entries, pn=1 -> leftmost page
3134 * pn = index = -1: No more entries 3129 * pn = index = -1: No more entries
3135 */ 3130 */
3136 dtpos = filp->f_pos; 3131 dtpos = filp->f_pos;
3137 if (dtpos == 0) { 3132 if (dtpos == 0) {
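
For readers unfamiliar with the legacy scheme described in the comment above, f_pos packs a page number and an in-page index; a rough sketch of that layout follows. The struct name legacy_dtpos and the field widths are assumptions made for illustration, not taken from the JFS headers.

struct legacy_dtpos {
	s16 pn;		/* 0 = header page ("." and ".."), >0 = leaf page, -1 = end */
	s16 index;	/* slot within the page; 0 = ".", 1 = "..", -1 = end */
	s32 unused;
};	/* decoded from the 64-bit filp->f_pos value read into dtpos above */
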
@@ -3351,7 +3346,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3351 BT_CLR(btstack); /* reset stack */ 3346 BT_CLR(btstack); /* reset stack */
3352 3347
3353 /* 3348 /*
3354 * descend leftmost path of the tree 3349 * descend leftmost path of the tree
3355 * 3350 *
3356 * by convention, root bn = 0. 3351 * by convention, root bn = 0.
3357 */ 3352 */
@@ -4531,7 +4526,7 @@ int dtModify(tid_t tid, struct inode *ip,
4531 struct ldtentry *entry; 4526 struct ldtentry *entry;
4532 4527
4533 /* 4528 /*
4534 * search for the entry to modify: 4529 * search for the entry to modify:
4535 * 4530 *
4536 * dtSearch() returns (leaf page pinned, index at which to modify). 4531 * dtSearch() returns (leaf page pinned, index at which to modify).
4537 */ 4532 */
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index af8513f78648..8561c6ecece0 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -35,7 +35,7 @@ typedef union {
35 35
36 36
37/* 37/*
38 * entry segment/slot 38 * entry segment/slot
39 * 39 *
40 * an entry consists of type dependent head/only segment/slot and 40 * an entry consists of type dependent head/only segment/slot and
41 * additional segments/slots linked via next field; 41 * additional segments/slots linked via next field;
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index a35bdca6a805..7ae1e3281de9 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -34,8 +34,8 @@ static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
34#endif 34#endif
35static s64 extRoundDown(s64 nb); 35static s64 extRoundDown(s64 nb);
36 36
37#define DPD(a) (printk("(a): %d\n",(a))) 37#define DPD(a) (printk("(a): %d\n",(a)))
38#define DPC(a) (printk("(a): %c\n",(a))) 38#define DPC(a) (printk("(a): %c\n",(a)))
39#define DPL1(a) \ 39#define DPL1(a) \
40{ \ 40{ \
41 if ((a) >> 32) \ 41 if ((a) >> 32) \
@@ -51,19 +51,19 @@ static s64 extRoundDown(s64 nb);
51 printk("(a): %x\n",(a) << 32); \ 51 printk("(a): %x\n",(a) << 32); \
52} 52}
53 53
54#define DPD1(a) (printk("(a): %d ",(a))) 54#define DPD1(a) (printk("(a): %d ",(a)))
55#define DPX(a) (printk("(a): %08x\n",(a))) 55#define DPX(a) (printk("(a): %08x\n",(a)))
56#define DPX1(a) (printk("(a): %08x ",(a))) 56#define DPX1(a) (printk("(a): %08x ",(a)))
57#define DPS(a) (printk("%s\n",(a))) 57#define DPS(a) (printk("%s\n",(a)))
58#define DPE(a) (printk("\nENTERING: %s\n",(a))) 58#define DPE(a) (printk("\nENTERING: %s\n",(a)))
59#define DPE1(a) (printk("\nENTERING: %s",(a))) 59#define DPE1(a) (printk("\nENTERING: %s",(a)))
60#define DPS1(a) (printk(" %s ",(a))) 60#define DPS1(a) (printk(" %s ",(a)))
61 61
62 62
63/* 63/*
64 * NAME: extAlloc() 64 * NAME: extAlloc()
65 * 65 *
66 * FUNCTION: allocate an extent for a specified page range within a 66 * FUNCTION: allocate an extent for a specified page range within a
67 * file. 67 * file.
68 * 68 *
69 * PARAMETERS: 69 * PARAMETERS:
@@ -78,9 +78,9 @@ static s64 extRoundDown(s64 nb);
78 * should be marked as allocated but not recorded. 78 * should be marked as allocated but not recorded.
79 * 79 *
80 * RETURN VALUES: 80 * RETURN VALUES:
81 * 0 - success 81 * 0 - success
82 * -EIO - i/o error. 82 * -EIO - i/o error.
83 * -ENOSPC - insufficient disk resources. 83 * -ENOSPC - insufficient disk resources.
84 */ 84 */
85int 85int
86extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) 86extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
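
All of the allocation routines in this file follow the same 0 / -EIO / -ENOSPC convention listed above, so a caller needs nothing more than the usual pattern (hypothetical sketch, not taken from the patch):

	rc = extAlloc(ip, xlen, pno, &xad, abnr);
	if (rc)
		return rc;	/* -EIO or -ENOSPC propagated unchanged */
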
@@ -192,9 +192,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
192 192
193#ifdef _NOTYET 193#ifdef _NOTYET
194/* 194/*
195 * NAME: extRealloc() 195 * NAME: extRealloc()
196 * 196 *
197 * FUNCTION: extend the allocation of a file extent containing a 197 * FUNCTION: extend the allocation of a file extent containing a
198 * partial back last page. 198 * partial back last page.
199 * 199 *
200 * PARAMETERS: 200 * PARAMETERS:
@@ -207,9 +207,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
207 * should be marked as allocated but not recorded. 207 * should be marked as allocated but not recorded.
208 * 208 *
209 * RETURN VALUES: 209 * RETURN VALUES:
210 * 0 - success 210 * 0 - success
211 * -EIO - i/o error. 211 * -EIO - i/o error.
212 * -ENOSPC - insufficient disk resources. 212 * -ENOSPC - insufficient disk resources.
213 */ 213 */
214int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) 214int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
215{ 215{
@@ -345,9 +345,9 @@ exit:
345 345
346 346
347/* 347/*
348 * NAME: extHint() 348 * NAME: extHint()
349 * 349 *
350 * FUNCTION: produce an extent allocation hint for a file offset. 350 * FUNCTION: produce an extent allocation hint for a file offset.
351 * 351 *
352 * PARAMETERS: 352 * PARAMETERS:
353 * ip - the inode of the file. 353 * ip - the inode of the file.
@@ -356,8 +356,8 @@ exit:
356 * the hint. 356 * the hint.
357 * 357 *
358 * RETURN VALUES: 358 * RETURN VALUES:
359 * 0 - success 359 * 0 - success
360 * -EIO - i/o error. 360 * -EIO - i/o error.
361 */ 361 */
362int extHint(struct inode *ip, s64 offset, xad_t * xp) 362int extHint(struct inode *ip, s64 offset, xad_t * xp)
363{ 363{
@@ -387,7 +387,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
387 lxdl.nlxd = 1; 387 lxdl.nlxd = 1;
388 lxdl.lxd = &lxd; 388 lxdl.lxd = &lxd;
389 LXDoffset(&lxd, prev) 389 LXDoffset(&lxd, prev)
390 LXDlength(&lxd, nbperpage); 390 LXDlength(&lxd, nbperpage);
391 391
392 xadl.maxnxad = 1; 392 xadl.maxnxad = 1;
393 xadl.nxad = 0; 393 xadl.nxad = 0;
@@ -397,11 +397,11 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
397 if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) 397 if ((rc = xtLookupList(ip, &lxdl, &xadl, 0)))
398 return (rc); 398 return (rc);
399 399
400 /* check if not extent exists for the previous page. 400 /* check if no extent exists for the previous page.
401 * this is possible for sparse files. 401 * this is possible for sparse files.
402 */ 402 */
403 if (xadl.nxad == 0) { 403 if (xadl.nxad == 0) {
404// assert(ISSPARSE(ip)); 404// assert(ISSPARSE(ip));
405 return (0); 405 return (0);
406 } 406 }
407 407
@@ -410,28 +410,28 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
410 */ 410 */
411 xp->flag &= XAD_NOTRECORDED; 411 xp->flag &= XAD_NOTRECORDED;
412 412
413 if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { 413 if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) {
414 jfs_error(ip->i_sb, "extHint: corrupt xtree"); 414 jfs_error(ip->i_sb, "extHint: corrupt xtree");
415 return -EIO; 415 return -EIO;
416 } 416 }
417 417
418 return (0); 418 return (0);
419} 419}
420 420
421 421
422/* 422/*
423 * NAME: extRecord() 423 * NAME: extRecord()
424 * 424 *
425 * FUNCTION: change a page within a file from not recorded to recorded. 425 * FUNCTION: change a page within a file from not recorded to recorded.
426 * 426 *
427 * PARAMETERS: 427 * PARAMETERS:
428 * ip - inode of the file. 428 * ip - inode of the file.
429 * cp - cbuf of the file page. 429 * cp - cbuf of the file page.
430 * 430 *
431 * RETURN VALUES: 431 * RETURN VALUES:
432 * 0 - success 432 * 0 - success
433 * -EIO - i/o error. 433 * -EIO - i/o error.
434 * -ENOSPC - insufficient disk resources. 434 * -ENOSPC - insufficient disk resources.
435 */ 435 */
436int extRecord(struct inode *ip, xad_t * xp) 436int extRecord(struct inode *ip, xad_t * xp)
437{ 437{
@@ -451,9 +451,9 @@ int extRecord(struct inode *ip, xad_t * xp)
451 451
452#ifdef _NOTYET 452#ifdef _NOTYET
453/* 453/*
454 * NAME: extFill() 454 * NAME: extFill()
455 * 455 *
456 * FUNCTION: allocate disk space for a file page that represents 456 * FUNCTION: allocate disk space for a file page that represents
457 * a file hole. 457 * a file hole.
458 * 458 *
459 * PARAMETERS: 459 * PARAMETERS:
@@ -461,16 +461,16 @@ int extRecord(struct inode *ip, xad_t * xp)
461 * cp - cbuf of the file page represent the hole. 461 * cp - cbuf of the file page represent the hole.
462 * 462 *
463 * RETURN VALUES: 463 * RETURN VALUES:
464 * 0 - success 464 * 0 - success
465 * -EIO - i/o error. 465 * -EIO - i/o error.
466 * -ENOSPC - insufficient disk resources. 466 * -ENOSPC - insufficient disk resources.
467 */ 467 */
468int extFill(struct inode *ip, xad_t * xp) 468int extFill(struct inode *ip, xad_t * xp)
469{ 469{
470 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; 470 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
471 s64 blkno = offsetXAD(xp) >> ip->i_blkbits; 471 s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
472 472
473// assert(ISSPARSE(ip)); 473// assert(ISSPARSE(ip));
474 474
475 /* initialize the extent allocation hint */ 475 /* initialize the extent allocation hint */
476 XADaddress(xp, 0); 476 XADaddress(xp, 0);
@@ -489,7 +489,7 @@ int extFill(struct inode *ip, xad_t * xp)
489/* 489/*
490 * NAME: extBalloc() 490 * NAME: extBalloc()
491 * 491 *
492 * FUNCTION: allocate disk blocks to form an extent. 492 * FUNCTION: allocate disk blocks to form an extent.
493 * 493 *
494 * initially, we will try to allocate disk blocks for the 494 * initially, we will try to allocate disk blocks for the
495 * requested size (nblocks). if this fails (nblocks 495 * requested size (nblocks). if this fails (nblocks
@@ -513,9 +513,9 @@ int extFill(struct inode *ip, xad_t * xp)
513 * allocated block range. 513 * allocated block range.
514 * 514 *
515 * RETURN VALUES: 515 * RETURN VALUES:
516 * 0 - success 516 * 0 - success
517 * -EIO - i/o error. 517 * -EIO - i/o error.
518 * -ENOSPC - insufficient disk resources. 518 * -ENOSPC - insufficient disk resources.
519 */ 519 */
520static int 520static int
521extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) 521extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
@@ -580,7 +580,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
580/* 580/*
581 * NAME: extBrealloc() 581 * NAME: extBrealloc()
582 * 582 *
583 * FUNCTION: attempt to extend an extent's allocation. 583 * FUNCTION: attempt to extend an extent's allocation.
584 * 584 *
585 * Initially, we will try to extend the extent's allocation 585 * Initially, we will try to extend the extent's allocation
586 * in place. If this fails, we'll try to move the extent 586 * in place. If this fails, we'll try to move the extent
@@ -597,8 +597,8 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
597 * 597 *
598 * PARAMETERS: 598 * PARAMETERS:
599 * ip - the inode of the file. 599 * ip - the inode of the file.
600 * blkno - starting block number of the extents current allocation. 600 * blkno - starting block number of the extents current allocation.
601 * nblks - number of blocks within the extents current allocation. 601 * nblks - number of blocks within the extents current allocation.
602 * newnblks - pointer to a s64 value. on entry, this value is the 602 * newnblks - pointer to a s64 value. on entry, this value is the
603 * the new desired extent size (number of blocks). on 603 * the new desired extent size (number of blocks). on
604 * successful exit, this value is set to the extent's actual 604 * successful exit, this value is set to the extent's actual
@@ -606,9 +606,9 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
606 * newblkno - the starting block number of the extents new allocation. 606 * newblkno - the starting block number of the extents new allocation.
607 * 607 *
608 * RETURN VALUES: 608 * RETURN VALUES:
609 * 0 - success 609 * 0 - success
610 * -EIO - i/o error. 610 * -EIO - i/o error.
611 * -ENOSPC - insufficient disk resources. 611 * -ENOSPC - insufficient disk resources.
612 */ 612 */
613static int 613static int
614extBrealloc(struct inode *ip, 614extBrealloc(struct inode *ip,
@@ -634,16 +634,16 @@ extBrealloc(struct inode *ip,
634 634
635 635
636/* 636/*
637 * NAME: extRoundDown() 637 * NAME: extRoundDown()
638 * 638 *
639 * FUNCTION: round down a specified number of blocks to the next 639 * FUNCTION: round down a specified number of blocks to the next
640 * smallest power of 2 number. 640 * smallest power of 2 number.
641 * 641 *
642 * PARAMETERS: 642 * PARAMETERS:
643 * nb - the number of blocks to round down. 643 * nb - the number of blocks to round down.
644 * 644 *
645 * RETURN VALUES: 645 * RETURN VALUES:
646 * next smallest power of 2 number. 646 * next smallest power of 2 number.
647 */ 647 */
648static s64 extRoundDown(s64 nb) 648static s64 extRoundDown(s64 nb)
649{ 649{
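
The body of extRoundDown() is outside this hunk; purely as a reference, "round down to the next smallest power of 2" can be computed as below. round_down_pow2 is an illustrative stand-in, not the code JFS actually uses.

static s64 round_down_pow2(s64 nb)
{
	u64 k = 1;

	/* keep doubling while the doubled value still fits in nb */
	while ((k << 1) <= (u64) nb)
		k <<= 1;
	return (s64) k;	/* largest power of two <= nb, for nb >= 1 */
}
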
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 38f70ac03bec..b3f5463fbe52 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -34,9 +34,9 @@
34#define JFS_UNICODE 0x00000001 /* unicode name */ 34#define JFS_UNICODE 0x00000001 /* unicode name */
35 35
36/* mount time flags for error handling */ 36/* mount time flags for error handling */
37#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ 37#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */
38#define JFS_ERR_CONTINUE 0x00000004 /* continue */ 38#define JFS_ERR_CONTINUE 0x00000004 /* continue */
39#define JFS_ERR_PANIC 0x00000008 /* panic */ 39#define JFS_ERR_PANIC 0x00000008 /* panic */
40 40
41/* Quota support */ 41/* Quota support */
42#define JFS_USRQUOTA 0x00000010 42#define JFS_USRQUOTA 0x00000010
@@ -83,7 +83,6 @@
83/* case-insensitive name/directory support */ 83/* case-insensitive name/directory support */
84 84
85#define JFS_AIX 0x80000000 /* AIX support */ 85#define JFS_AIX 0x80000000 /* AIX support */
86/* POSIX name/directory support - Never implemented*/
87 86
88/* 87/*
89 * buffer cache configuration 88 * buffer cache configuration
@@ -113,10 +112,10 @@
113#define IDATASIZE 256 /* inode inline data size */ 112#define IDATASIZE 256 /* inode inline data size */
114#define IXATTRSIZE 128 /* inode inline extended attribute size */ 113#define IXATTRSIZE 128 /* inode inline extended attribute size */
115 114
116#define XTPAGE_SIZE 4096 115#define XTPAGE_SIZE 4096
117#define log2_PAGESIZE 12 116#define log2_PAGESIZE 12
118 117
119#define IAG_SIZE 4096 118#define IAG_SIZE 4096
120#define IAG_EXTENT_SIZE 4096 119#define IAG_EXTENT_SIZE 4096
121#define INOSPERIAG 4096 /* number of disk inodes per iag */ 120#define INOSPERIAG 4096 /* number of disk inodes per iag */
122#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ 121#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */
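
The l2 constants above are simply base-2 logarithms of their counterparts, so the values can be sanity-checked at a glance (illustrative only):

/*
 *   XTPAGE_SIZE = 1 << log2_PAGESIZE   ->  4096 = 1 << 12
 *   INOSPERIAG  = 1 << L2INOSPERIAG    ->  4096 = 1 << 12
 */
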
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index c6530227cda6..3870ba8b9086 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -93,21 +93,21 @@ static int copy_from_dinode(struct dinode *, struct inode *);
93static void copy_to_dinode(struct dinode *, struct inode *); 93static void copy_to_dinode(struct dinode *, struct inode *);
94 94
95/* 95/*
96 * NAME: diMount() 96 * NAME: diMount()
97 * 97 *
98 * FUNCTION: initialize the incore inode map control structures for 98 * FUNCTION: initialize the incore inode map control structures for
99 * a fileset or aggregate init time. 99 * a fileset or aggregate init time.
100 * 100 *
101 * the inode map's control structure (dinomap) is 101 * the inode map's control structure (dinomap) is
102 * brought in from disk and placed in virtual memory. 102 * brought in from disk and placed in virtual memory.
103 * 103 *
104 * PARAMETERS: 104 * PARAMETERS:
105 * ipimap - pointer to inode map inode for the aggregate or fileset. 105 * ipimap - pointer to inode map inode for the aggregate or fileset.
106 * 106 *
107 * RETURN VALUES: 107 * RETURN VALUES:
108 * 0 - success 108 * 0 - success
109 * -ENOMEM - insufficient free virtual memory. 109 * -ENOMEM - insufficient free virtual memory.
110 * -EIO - i/o error. 110 * -EIO - i/o error.
111 */ 111 */
112int diMount(struct inode *ipimap) 112int diMount(struct inode *ipimap)
113{ 113{
@@ -180,18 +180,18 @@ int diMount(struct inode *ipimap)
180 180
181 181
182/* 182/*
183 * NAME: diUnmount() 183 * NAME: diUnmount()
184 * 184 *
185 * FUNCTION: write to disk the incore inode map control structures for 185 * FUNCTION: write to disk the incore inode map control structures for
186 * a fileset or aggregate at unmount time. 186 * a fileset or aggregate at unmount time.
187 * 187 *
188 * PARAMETERS: 188 * PARAMETERS:
189 * ipimap - pointer to inode map inode for the aggregate or fileset. 189 * ipimap - pointer to inode map inode for the aggregate or fileset.
190 * 190 *
191 * RETURN VALUES: 191 * RETURN VALUES:
192 * 0 - success 192 * 0 - success
193 * -ENOMEM - insufficient free virtual memory. 193 * -ENOMEM - insufficient free virtual memory.
194 * -EIO - i/o error. 194 * -EIO - i/o error.
195 */ 195 */
196int diUnmount(struct inode *ipimap, int mounterror) 196int diUnmount(struct inode *ipimap, int mounterror)
197{ 197{
@@ -274,9 +274,9 @@ int diSync(struct inode *ipimap)
274 274
275 275
276/* 276/*
277 * NAME: diRead() 277 * NAME: diRead()
278 * 278 *
279 * FUNCTION: initialize an incore inode from disk. 279 * FUNCTION: initialize an incore inode from disk.
280 * 280 *
281 * on entry, the specified incore inode should itself 281 * on entry, the specified incore inode should itself
282 * specify the disk inode number corresponding to the 282 * specify the disk inode number corresponding to the
@@ -285,7 +285,7 @@ int diSync(struct inode *ipimap)
285 * this routine handles incore inode initialization for 285 * this routine handles incore inode initialization for
286 * both "special" and "regular" inodes. special inodes 286 * both "special" and "regular" inodes. special inodes
287 * are those required early in the mount process and 287 * are those required early in the mount process and
288 * require special handling since much of the file system 288 * require special handling since much of the file system
289 * is not yet initialized. these "special" inodes are 289 * is not yet initialized. these "special" inodes are
290 * identified by a NULL inode map inode pointer and are 290 * identified by a NULL inode map inode pointer and are
291 * actually initialized by a call to diReadSpecial(). 291 * actually initialized by a call to diReadSpecial().
@@ -298,12 +298,12 @@ int diSync(struct inode *ipimap)
298 * incore inode. 298 * incore inode.
299 * 299 *
300 * PARAMETERS: 300 * PARAMETERS:
301 * ip - pointer to incore inode to be initialized from disk. 301 * ip - pointer to incore inode to be initialized from disk.
302 * 302 *
303 * RETURN VALUES: 303 * RETURN VALUES:
304 * 0 - success 304 * 0 - success
305 * -EIO - i/o error. 305 * -EIO - i/o error.
306 * -ENOMEM - insufficient memory 306 * -ENOMEM - insufficient memory
307 * 307 *
308 */ 308 */
309int diRead(struct inode *ip) 309int diRead(struct inode *ip)
@@ -410,26 +410,26 @@ int diRead(struct inode *ip)
410 410
411 411
412/* 412/*
413 * NAME: diReadSpecial() 413 * NAME: diReadSpecial()
414 * 414 *
415 * FUNCTION: initialize a 'special' inode from disk. 415 * FUNCTION: initialize a 'special' inode from disk.
416 * 416 *
417 * this routine handles aggregate level inodes. The 417 * this routine handles aggregate level inodes. The
418 * inode cache cannot differentiate between the 418 * inode cache cannot differentiate between the
419 * aggregate inodes and the filesystem inodes, so we 419 * aggregate inodes and the filesystem inodes, so we
420 * handle these here. We don't actually use the aggregate 420 * handle these here. We don't actually use the aggregate
421 * inode map, since these inodes are at a fixed location 421 * inode map, since these inodes are at a fixed location
422 * and in some cases the aggregate inode map isn't initialized 422 * and in some cases the aggregate inode map isn't initialized
423 * yet. 423 * yet.
424 * 424 *
425 * PARAMETERS: 425 * PARAMETERS:
426 * sb - filesystem superblock 426 * sb - filesystem superblock
427 * inum - aggregate inode number 427 * inum - aggregate inode number
428 * secondary - 1 if secondary aggregate inode table 428 * secondary - 1 if secondary aggregate inode table
429 * 429 *
430 * RETURN VALUES: 430 * RETURN VALUES:
431 * new inode - success 431 * new inode - success
432 * NULL - i/o error. 432 * NULL - i/o error.
433 */ 433 */
434struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) 434struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
435{ 435{
@@ -502,12 +502,12 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
502} 502}
503 503
504/* 504/*
505 * NAME: diWriteSpecial() 505 * NAME: diWriteSpecial()
506 * 506 *
507 * FUNCTION: Write the special inode to disk 507 * FUNCTION: Write the special inode to disk
508 * 508 *
509 * PARAMETERS: 509 * PARAMETERS:
510 * ip - special inode 510 * ip - special inode
511 * secondary - 1 if secondary aggregate inode table 511 * secondary - 1 if secondary aggregate inode table
512 * 512 *
513 * RETURN VALUES: none 513 * RETURN VALUES: none
@@ -554,9 +554,9 @@ void diWriteSpecial(struct inode *ip, int secondary)
554} 554}
555 555
556/* 556/*
557 * NAME: diFreeSpecial() 557 * NAME: diFreeSpecial()
558 * 558 *
559 * FUNCTION: Free allocated space for special inode 559 * FUNCTION: Free allocated space for special inode
560 */ 560 */
561void diFreeSpecial(struct inode *ip) 561void diFreeSpecial(struct inode *ip)
562{ 562{
@@ -572,9 +572,9 @@ void diFreeSpecial(struct inode *ip)
572 572
573 573
574/* 574/*
575 * NAME: diWrite() 575 * NAME: diWrite()
576 * 576 *
577 * FUNCTION: write the on-disk inode portion of the in-memory inode 577 * FUNCTION: write the on-disk inode portion of the in-memory inode
578 * to its corresponding on-disk inode. 578 * to its corresponding on-disk inode.
579 * 579 *
580 * on entry, the specified incore inode should itself 580 * on entry, the specified incore inode should itself
@@ -589,11 +589,11 @@ void diFreeSpecial(struct inode *ip)
589 * 589 *
590 * PARAMETERS: 590 * PARAMETERS:
591 * tid - transaction id 591 * tid - transaction id
592 * ip - pointer to incore inode to be written to the inode extent. 592 * ip - pointer to incore inode to be written to the inode extent.
593 * 593 *
594 * RETURN VALUES: 594 * RETURN VALUES:
595 * 0 - success 595 * 0 - success
596 * -EIO - i/o error. 596 * -EIO - i/o error.
597 */ 597 */
598int diWrite(tid_t tid, struct inode *ip) 598int diWrite(tid_t tid, struct inode *ip)
599{ 599{
@@ -730,7 +730,7 @@ int diWrite(tid_t tid, struct inode *ip)
730 ilinelock = (struct linelock *) & tlck->lock; 730 ilinelock = (struct linelock *) & tlck->lock;
731 731
732 /* 732 /*
733 * regular file: 16 byte (XAD slot) granularity 733 * regular file: 16 byte (XAD slot) granularity
734 */ 734 */
735 if (type & tlckXTREE) { 735 if (type & tlckXTREE) {
736 xtpage_t *p, *xp; 736 xtpage_t *p, *xp;
@@ -755,7 +755,7 @@ int diWrite(tid_t tid, struct inode *ip)
755 xad->flag &= ~(XAD_NEW | XAD_EXTENDED); 755 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
756 } 756 }
757 /* 757 /*
758 * directory: 32 byte (directory entry slot) granularity 758 * directory: 32 byte (directory entry slot) granularity
759 */ 759 */
760 else if (type & tlckDTREE) { 760 else if (type & tlckDTREE) {
761 dtpage_t *p, *xp; 761 dtpage_t *p, *xp;
@@ -800,9 +800,8 @@ int diWrite(tid_t tid, struct inode *ip)
800 } 800 }
801 801
802 /* 802 /*
803 * lock/copy inode base: 128 byte slot granularity 803 * lock/copy inode base: 128 byte slot granularity
804 */ 804 */
805// baseDinode:
806 lv = & dilinelock->lv[dilinelock->index]; 805 lv = & dilinelock->lv[dilinelock->index];
807 lv->offset = dioffset >> L2INODESLOTSIZE; 806 lv->offset = dioffset >> L2INODESLOTSIZE;
808 copy_to_dinode(dp, ip); 807 copy_to_dinode(dp, ip);
@@ -813,17 +812,6 @@ int diWrite(tid_t tid, struct inode *ip)
813 lv->length = 1; 812 lv->length = 1;
814 dilinelock->index++; 813 dilinelock->index++;
815 814
816#ifdef _JFS_FASTDASD
817 /*
818 * We aren't logging changes to the DASD used in directory inodes,
819 * but we need to write them to disk. If we don't unmount cleanly,
820 * mount will recalculate the DASD used.
821 */
822 if (S_ISDIR(ip->i_mode)
823 && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED))
824 memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd));
825#endif /* _JFS_FASTDASD */
826
827 /* release the buffer holding the updated on-disk inode. 815 /* release the buffer holding the updated on-disk inode.
828 * the buffer will be later written by commit processing. 816 * the buffer will be later written by commit processing.
829 */ 817 */
@@ -834,9 +822,9 @@ int diWrite(tid_t tid, struct inode *ip)
834 822
835 823
836/* 824/*
837 * NAME: diFree(ip) 825 * NAME: diFree(ip)
838 * 826 *
839 * FUNCTION: free a specified inode from the inode working map 827 * FUNCTION: free a specified inode from the inode working map
840 * for a fileset or aggregate. 828 * for a fileset or aggregate.
841 * 829 *
842 * if the inode to be freed represents the first (only) 830 * if the inode to be freed represents the first (only)
@@ -865,11 +853,11 @@ int diWrite(tid_t tid, struct inode *ip)
865 * any updates and are held until all updates are complete. 853 * any updates and are held until all updates are complete.
866 * 854 *
867 * PARAMETERS: 855 * PARAMETERS:
868 * ip - inode to be freed. 856 * ip - inode to be freed.
869 * 857 *
870 * RETURN VALUES: 858 * RETURN VALUES:
871 * 0 - success 859 * 0 - success
872 * -EIO - i/o error. 860 * -EIO - i/o error.
873 */ 861 */
874int diFree(struct inode *ip) 862int diFree(struct inode *ip)
875{ 863{
@@ -902,7 +890,8 @@ int diFree(struct inode *ip)
902 * the map. 890 * the map.
903 */ 891 */
904 if (iagno >= imap->im_nextiag) { 892 if (iagno >= imap->im_nextiag) {
905 dump_mem("imap", imap, 32); 893 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
894 imap, 32, 0);
906 jfs_error(ip->i_sb, 895 jfs_error(ip->i_sb,
907 "diFree: inum = %d, iagno = %d, nextiag = %d", 896 "diFree: inum = %d, iagno = %d, nextiag = %d",
908 (uint) inum, iagno, imap->im_nextiag); 897 (uint) inum, iagno, imap->im_nextiag);
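
For context (not part of the diff itself): dump_mem() was a JFS-private debugging helper, and the hunk above switches to the kernel's generic hex-dump routine. To the best of my recollection its signature in kernels of this era is the one below, which matches the arguments used above: 16 bytes per row, 4-byte groups, and no trailing ASCII column.

void print_hex_dump(const char *level, const char *prefix_str,
		    int prefix_type, int rowsize, int groupsize,
		    const void *buf, size_t len, bool ascii);

/* as used above */
print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, imap, 32, 0);
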
@@ -964,8 +953,8 @@ int diFree(struct inode *ip)
964 return -EIO; 953 return -EIO;
965 } 954 }
966 /* 955 /*
967 * inode extent still has some inodes or below low water mark: 956 * inode extent still has some inodes or below low water mark:
968 * keep the inode extent; 957 * keep the inode extent;
969 */ 958 */
970 if (bitmap || 959 if (bitmap ||
971 imap->im_agctl[agno].numfree < 96 || 960 imap->im_agctl[agno].numfree < 96 ||
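
Read concretely (the numbers come from the code, the scenario is illustrative): if other inodes of the same extent are still in use (bitmap != 0), or the allocation group has fewer than 96 free inodes left (the "low water mark"), the extent is kept backed so the next allocation in that group stays cheap, rather than being released and reallocated.
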
@@ -1047,12 +1036,12 @@ int diFree(struct inode *ip)
1047 1036
1048 1037
1049 /* 1038 /*
1050 * inode extent has become free and above low water mark: 1039 * inode extent has become free and above low water mark:
1051 * free the inode extent; 1040 * free the inode extent;
1052 */ 1041 */
1053 1042
1054 /* 1043 /*
1055 * prepare to update iag list(s) (careful update step 1) 1044 * prepare to update iag list(s) (careful update step 1)
1056 */ 1045 */
1057 amp = bmp = cmp = dmp = NULL; 1046 amp = bmp = cmp = dmp = NULL;
1058 fwd = back = -1; 1047 fwd = back = -1;
@@ -1152,7 +1141,7 @@ int diFree(struct inode *ip)
1152 invalidate_pxd_metapages(ip, freepxd); 1141 invalidate_pxd_metapages(ip, freepxd);
1153 1142
1154 /* 1143 /*
1155 * update iag list(s) (careful update step 2) 1144 * update iag list(s) (careful update step 2)
1156 */ 1145 */
1157 /* add the iag to the ag extent free list if this is the 1146 /* add the iag to the ag extent free list if this is the
1158 * first free extent for the iag. 1147 * first free extent for the iag.
@@ -1338,20 +1327,20 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
1338 1327
1339 1328
1340/* 1329/*
1341 * NAME: diAlloc(pip,dir,ip) 1330 * NAME: diAlloc(pip,dir,ip)
1342 * 1331 *
1343 * FUNCTION: allocate a disk inode from the inode working map 1332 * FUNCTION: allocate a disk inode from the inode working map
1344 * for a fileset or aggregate. 1333 * for a fileset or aggregate.
1345 * 1334 *
1346 * PARAMETERS: 1335 * PARAMETERS:
1347 * pip - pointer to incore inode for the parent inode. 1336 * pip - pointer to incore inode for the parent inode.
1348 * dir - 'true' if the new disk inode is for a directory. 1337 * dir - 'true' if the new disk inode is for a directory.
1349 * ip - pointer to a new inode 1338 * ip - pointer to a new inode
1350 * 1339 *
1351 * RETURN VALUES: 1340 * RETURN VALUES:
1352 * 0 - success. 1341 * 0 - success.
1353 * -ENOSPC - insufficient disk resources. 1342 * -ENOSPC - insufficient disk resources.
1354 * -EIO - i/o error. 1343 * -EIO - i/o error.
1355 */ 1344 */
1356int diAlloc(struct inode *pip, bool dir, struct inode *ip) 1345int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1357{ 1346{
@@ -1433,7 +1422,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1433 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); 1422 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
1434 1423
1435 /* 1424 /*
1436 * try to allocate from the IAG 1425 * try to allocate from the IAG
1437 */ 1426 */
1438 /* check if the inode may be allocated from the iag 1427 /* check if the inode may be allocated from the iag
1439 * (i.e. the inode has free inodes or new extent can be added). 1428 * (i.e. the inode has free inodes or new extent can be added).
@@ -1633,9 +1622,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1633 1622
1634 1623
1635/* 1624/*
1636 * NAME: diAllocAG(imap,agno,dir,ip) 1625 * NAME: diAllocAG(imap,agno,dir,ip)
1637 * 1626 *
1638 * FUNCTION: allocate a disk inode from the allocation group. 1627 * FUNCTION: allocate a disk inode from the allocation group.
1639 * 1628 *
1640 * this routine first determines if a new extent of free 1629 * this routine first determines if a new extent of free
1641 * inodes should be added for the allocation group, with 1630 * inodes should be added for the allocation group, with
@@ -1649,17 +1638,17 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1649 * PRE CONDITION: Already have the AG lock for this AG. 1638 * PRE CONDITION: Already have the AG lock for this AG.
1650 * 1639 *
1651 * PARAMETERS: 1640 * PARAMETERS:
1652 * imap - pointer to inode map control structure. 1641 * imap - pointer to inode map control structure.
1653 * agno - allocation group to allocate from. 1642 * agno - allocation group to allocate from.
1654 * dir - 'true' if the new disk inode is for a directory. 1643 * dir - 'true' if the new disk inode is for a directory.
1655 * ip - pointer to the new inode to be filled in on successful return 1644 * ip - pointer to the new inode to be filled in on successful return
1656 * with the disk inode number allocated, its extent address 1645 * with the disk inode number allocated, its extent address
1657 * and the start of the ag. 1646 * and the start of the ag.
1658 * 1647 *
1659 * RETURN VALUES: 1648 * RETURN VALUES:
1660 * 0 - success. 1649 * 0 - success.
1661 * -ENOSPC - insufficient disk resources. 1650 * -ENOSPC - insufficient disk resources.
1662 * -EIO - i/o error. 1651 * -EIO - i/o error.
1663 */ 1652 */
1664static int 1653static int
1665diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) 1654diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
@@ -1709,9 +1698,9 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1709 1698
1710 1699
1711/* 1700/*
1712 * NAME: diAllocAny(imap,agno,dir,iap) 1701 * NAME: diAllocAny(imap,agno,dir,iap)
1713 * 1702 *
1714 * FUNCTION: allocate a disk inode from any other allocation group. 1703 * FUNCTION: allocate a disk inode from any other allocation group.
1715 * 1704 *
1716 * this routine is called when an allocation attempt within 1705 * this routine is called when an allocation attempt within
1717 * the primary allocation group has failed. if attempts to 1706 * the primary allocation group has failed. if attempts to
@@ -1719,17 +1708,17 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1719 * specified primary group. 1708 * specified primary group.
1720 * 1709 *
1721 * PARAMETERS: 1710 * PARAMETERS:
1722 * imap - pointer to inode map control structure. 1711 * imap - pointer to inode map control structure.
1723 * agno - primary allocation group (to avoid). 1712 * agno - primary allocation group (to avoid).
1724 * dir - 'true' if the new disk inode is for a directory. 1713 * dir - 'true' if the new disk inode is for a directory.
1725 * ip - pointer to a new inode to be filled in on successful return 1714 * ip - pointer to a new inode to be filled in on successful return
1726 * with the disk inode number allocated, its extent address 1715 * with the disk inode number allocated, its extent address
1727 * and the start of the ag. 1716 * and the start of the ag.
1728 * 1717 *
1729 * RETURN VALUES: 1718 * RETURN VALUES:
1730 * 0 - success. 1719 * 0 - success.
1731 * -ENOSPC - insufficient disk resources. 1720 * -ENOSPC - insufficient disk resources.
1732 * -EIO - i/o error. 1721 * -EIO - i/o error.
1733 */ 1722 */
1734static int 1723static int
1735diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) 1724diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
@@ -1772,9 +1761,9 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
1772 1761
1773 1762
1774/* 1763/*
1775 * NAME: diAllocIno(imap,agno,ip) 1764 * NAME: diAllocIno(imap,agno,ip)
1776 * 1765 *
1777 * FUNCTION: allocate a disk inode from the allocation group's free 1766 * FUNCTION: allocate a disk inode from the allocation group's free
1778 * inode list, returning an error if this free list is 1767 * inode list, returning an error if this free list is
1779 * empty (i.e. no iags on the list). 1768 * empty (i.e. no iags on the list).
1780 * 1769 *
@@ -1785,16 +1774,16 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
1785 * PRE CONDITION: Already have AG lock for this AG. 1774 * PRE CONDITION: Already have AG lock for this AG.
1786 * 1775 *
1787 * PARAMETERS: 1776 * PARAMETERS:
1788 * imap - pointer to inode map control structure. 1777 * imap - pointer to inode map control structure.
1789 * agno - allocation group. 1778 * agno - allocation group.
1790 * ip - pointer to new inode to be filled in on successful return 1779 * ip - pointer to new inode to be filled in on successful return
1791 * with the disk inode number allocated, its extent address 1780 * with the disk inode number allocated, its extent address
1792 * and the start of the ag. 1781 * and the start of the ag.
1793 * 1782 *
1794 * RETURN VALUES: 1783 * RETURN VALUES:
1795 * 0 - success. 1784 * 0 - success.
1796 * -ENOSPC - insufficient disk resources. 1785 * -ENOSPC - insufficient disk resources.
1797 * -EIO - i/o error. 1786 * -EIO - i/o error.
1798 */ 1787 */
1799static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) 1788static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1800{ 1789{
@@ -1890,7 +1879,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1890 1879
1891 1880
1892/* 1881/*
1893 * NAME: diAllocExt(imap,agno,ip) 1882 * NAME: diAllocExt(imap,agno,ip)
1894 * 1883 *
1895 * FUNCTION: add a new extent of free inodes to an iag, allocating 1884 * FUNCTION: add a new extent of free inodes to an iag, allocating
1896 * an inode from this extent to satisfy the current allocation 1885 * an inode from this extent to satisfy the current allocation
@@ -1910,16 +1899,16 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1910 * for the purpose of satisfying this request. 1899 * for the purpose of satisfying this request.
1911 * 1900 *
1912 * PARAMETERS: 1901 * PARAMETERS:
1913 * imap - pointer to inode map control structure. 1902 * imap - pointer to inode map control structure.
1914 * agno - allocation group number. 1903 * agno - allocation group number.
1915 * ip - pointer to new inode to be filled in on successful return 1904 * ip - pointer to new inode to be filled in on successful return
1916 * with the disk inode number allocated, its extent address 1905 * with the disk inode number allocated, its extent address
1917 * and the start of the ag. 1906 * and the start of the ag.
1918 * 1907 *
1919 * RETURN VALUES: 1908 * RETURN VALUES:
1920 * 0 - success. 1909 * 0 - success.
1921 * -ENOSPC - insufficient disk resources. 1910 * -ENOSPC - insufficient disk resources.
1922 * -EIO - i/o error. 1911 * -EIO - i/o error.
1923 */ 1912 */
1924static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) 1913static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1925{ 1914{
@@ -2010,7 +1999,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
2010 1999
2011 2000
2012/* 2001/*
2013 * NAME: diAllocBit(imap,iagp,ino) 2002 * NAME: diAllocBit(imap,iagp,ino)
2014 * 2003 *
2015 * FUNCTION: allocate a backed inode from an iag. 2004 * FUNCTION: allocate a backed inode from an iag.
2016 * 2005 *
@@ -2030,14 +2019,14 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
2030 * this AG. Must have read lock on imap inode. 2019 * this AG. Must have read lock on imap inode.
2031 * 2020 *
2032 * PARAMETERS: 2021 * PARAMETERS:
2033 * imap - pointer to inode map control structure. 2022 * imap - pointer to inode map control structure.
2034 * iagp - pointer to iag. 2023 * iagp - pointer to iag.
2035 * ino - inode number to be allocated within the iag. 2024 * ino - inode number to be allocated within the iag.
2036 * 2025 *
2037 * RETURN VALUES: 2026 * RETURN VALUES:
2038 * 0 - success. 2027 * 0 - success.
2039 * -ENOSPC - insufficient disk resources. 2028 * -ENOSPC - insufficient disk resources.
2040 * -EIO - i/o error. 2029 * -EIO - i/o error.
2041 */ 2030 */
2042static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) 2031static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2043{ 2032{
@@ -2144,11 +2133,11 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2144 2133
2145 2134
2146/* 2135/*
2147 * NAME: diNewExt(imap,iagp,extno) 2136 * NAME: diNewExt(imap,iagp,extno)
2148 * 2137 *
2149 * FUNCTION: initialize a new extent of inodes for an iag, allocating 2138 * FUNCTION: initialize a new extent of inodes for an iag, allocating
2150 * the first inode of the extent for use for the current 2139 * the first inode of the extent for use for the current
2151 * allocation request. 2140 * allocation request.
2152 * 2141 *
2153 * disk resources are allocated for the new extent of inodes 2142 * disk resources are allocated for the new extent of inodes
2154 * and the inodes themselves are initialized to reflect their 2143 * and the inodes themselves are initialized to reflect their
@@ -2177,14 +2166,14 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2177 * this AG. Must have read lock on imap inode. 2166 * this AG. Must have read lock on imap inode.
2178 * 2167 *
2179 * PARAMETERS: 2168 * PARAMETERS:
2180 * imap - pointer to inode map control structure. 2169 * imap - pointer to inode map control structure.
2181 * iagp - pointer to iag. 2170 * iagp - pointer to iag.
2182 * extno - extent number. 2171 * extno - extent number.
2183 * 2172 *
2184 * RETURN VALUES: 2173 * RETURN VALUES:
2185 * 0 - success. 2174 * 0 - success.
2186 * -ENOSPC - insufficient disk resources. 2175 * -ENOSPC - insufficient disk resources.
2187 * -EIO - i/o error. 2176 * -EIO - i/o error.
2188 */ 2177 */
2189static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) 2178static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2190{ 2179{
@@ -2430,7 +2419,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2430 2419
2431 2420
2432/* 2421/*
2433 * NAME: diNewIAG(imap,iagnop,agno) 2422 * NAME: diNewIAG(imap,iagnop,agno)
2434 * 2423 *
2435 * FUNCTION: allocate a new iag for an allocation group. 2424 * FUNCTION: allocate a new iag for an allocation group.
2436 * 2425 *
@@ -2443,16 +2432,16 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2443 * and returned to satisfy the request. 2432 * and returned to satisfy the request.
2444 * 2433 *
2445 * PARAMETERS: 2434 * PARAMETERS:
2446 * imap - pointer to inode map control structure. 2435 * imap - pointer to inode map control structure.
2447 * iagnop - pointer to an iag number set with the number of the 2436 * iagnop - pointer to an iag number set with the number of the
2448 * newly allocated iag upon successful return. 2437 * newly allocated iag upon successful return.
2449 * agno - allocation group number. 2438 * agno - allocation group number.
2450 * bpp - Buffer pointer to be filled in with new IAG's buffer 2439 * bpp - Buffer pointer to be filled in with new IAG's buffer
2451 * 2440 *
2452 * RETURN VALUES: 2441 * RETURN VALUES:
2453 * 0 - success. 2442 * 0 - success.
2454 * -ENOSPC - insufficient disk resources. 2443 * -ENOSPC - insufficient disk resources.
2455 * -EIO - i/o error. 2444 * -EIO - i/o error.
2456 * 2445 *
2457 * serialization: 2446 * serialization:
2458 * AG lock held on entry/exit; 2447 * AG lock held on entry/exit;
@@ -2461,7 +2450,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2461 * 2450 *
2462 * note: new iag transaction: 2451 * note: new iag transaction:
2463 * . synchronously write iag; 2452 * . synchronously write iag;
2464 * . write log of xtree and inode of imap; 2453 * . write log of xtree and inode of imap;
2465 * . commit; 2454 * . commit;
2466 * . synchronous write of xtree (right to left, bottom to top); 2455 * . synchronous write of xtree (right to left, bottom to top);
2467 * . at start of logredo(): init in-memory imap with one additional iag page; 2456 * . at start of logredo(): init in-memory imap with one additional iag page;
@@ -2481,9 +2470,6 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2481 s64 xaddr = 0; 2470 s64 xaddr = 0;
2482 s64 blkno; 2471 s64 blkno;
2483 tid_t tid; 2472 tid_t tid;
2484#ifdef _STILL_TO_PORT
2485 xad_t xad;
2486#endif /* _STILL_TO_PORT */
2487 struct inode *iplist[1]; 2473 struct inode *iplist[1];
2488 2474
2489 /* pick up pointers to the inode map and mount inodes */ 2475 /* pick up pointers to the inode map and mount inodes */
@@ -2674,15 +2660,15 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2674} 2660}
2675 2661
2676/* 2662/*
2677 * NAME: diIAGRead() 2663 * NAME: diIAGRead()
2678 * 2664 *
2679 * FUNCTION: get the buffer for the specified iag within a fileset 2665 * FUNCTION: get the buffer for the specified iag within a fileset
2680 * or aggregate inode map. 2666 * or aggregate inode map.
2681 * 2667 *
2682 * PARAMETERS: 2668 * PARAMETERS:
2683 * imap - pointer to inode map control structure. 2669 * imap - pointer to inode map control structure.
2684 * iagno - iag number. 2670 * iagno - iag number.
2685 * bpp - point to buffer pointer to be filled in on successful 2671 * bpp - point to buffer pointer to be filled in on successful
2686 * exit. 2672 * exit.
2687 * 2673 *
2688 * SERIALIZATION: 2674 * SERIALIZATION:
@@ -2691,8 +2677,8 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2691 * the read lock is unnecessary.) 2677 * the read lock is unnecessary.)
2692 * 2678 *
2693 * RETURN VALUES: 2679 * RETURN VALUES:
2694 * 0 - success. 2680 * 0 - success.
2695 * -EIO - i/o error. 2681 * -EIO - i/o error.
2696 */ 2682 */
2697static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) 2683static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2698{ 2684{
@@ -2712,17 +2698,17 @@ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2712} 2698}
2713 2699
2714/* 2700/*
2715 * NAME: diFindFree() 2701 * NAME: diFindFree()
2716 * 2702 *
2717 * FUNCTION: find the first free bit in a word starting at 2703 * FUNCTION: find the first free bit in a word starting at
2718 * the specified bit position. 2704 * the specified bit position.
2719 * 2705 *
2720 * PARAMETERS: 2706 * PARAMETERS:
2721 * word - word to be examined. 2707 * word - word to be examined.
2722 * start - starting bit position. 2708 * start - starting bit position.
2723 * 2709 *
2724 * RETURN VALUES: 2710 * RETURN VALUES:
2725 * bit position of first free bit in the word or 32 if 2711 * bit position of first free bit in the word or 32 if
2726 * no free bits were found. 2712 * no free bits were found.
2727 */ 2713 */
2728static int diFindFree(u32 word, int start) 2714static int diFindFree(u32 word, int start)
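The contract documented above is simple enough to restate in a few lines. The sketch below is illustrative only (find_free_bit is a made-up name, not the kernel routine); it assumes the convention used by the iag allocation maps further down, where the high-order bit of a map word is bit 0 and a clear bit marks a free inode:

#include <stdint.h>

/* Return the position of the first free (clear) bit in 'word' at or
 * after 'start', or 32 if every remaining bit is set.  Bit 0 is taken
 * to be the most significant bit of the word. */
static int find_free_bit(uint32_t word, int start)
{
	int bitno;

	for (word <<= start, bitno = start; bitno < 32; bitno++, word <<= 1) {
		if ((word & 0x80000000u) == 0)
			break;		/* clear bit: a free inode slot */
	}
	return bitno;			/* 32: nothing free from 'start' on */
}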
@@ -2897,7 +2883,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2897 atomic_read(&imap->im_numfree)); 2883 atomic_read(&imap->im_numfree));
2898 2884
2899 /* 2885 /*
2900 * reconstruct imap 2886 * reconstruct imap
2901 * 2887 *
2902 * coalesce contiguous k (newAGSize/oldAGSize) AGs; 2888 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
2903 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; 2889 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
@@ -2913,7 +2899,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2913 } 2899 }
2914 2900
2915 /* 2901 /*
2916 * process each iag page of the map. 2902 * process each iag page of the map.
2917 * 2903 *
2918 * rebuild AG Free Inode List, AG Free Inode Extent List; 2904 * rebuild AG Free Inode List, AG Free Inode Extent List;
2919 */ 2905 */
@@ -2932,7 +2918,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2932 2918
2933 /* leave free iag in the free iag list */ 2919 /* leave free iag in the free iag list */
2934 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { 2920 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2935 release_metapage(bp); 2921 release_metapage(bp);
2936 continue; 2922 continue;
2937 } 2923 }
2938 2924
@@ -3063,13 +3049,13 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno,
3063} 3049}
3064 3050
3065/* 3051/*
3066 * NAME: copy_from_dinode() 3052 * NAME: copy_from_dinode()
3067 * 3053 *
3068 * FUNCTION: Copies inode info from disk inode to in-memory inode 3054 * FUNCTION: Copies inode info from disk inode to in-memory inode
3069 * 3055 *
3070 * RETURN VALUES: 3056 * RETURN VALUES:
3071 * 0 - success 3057 * 0 - success
3072 * -ENOMEM - insufficient memory 3058 * -ENOMEM - insufficient memory
3073 */ 3059 */
3074static int copy_from_dinode(struct dinode * dip, struct inode *ip) 3060static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3075{ 3061{
@@ -3151,9 +3137,9 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3151} 3137}
3152 3138
3153/* 3139/*
3154 * NAME: copy_to_dinode() 3140 * NAME: copy_to_dinode()
3155 * 3141 *
3156 * FUNCTION: Copies inode info from in-memory inode to disk inode 3142 * FUNCTION: Copies inode info from in-memory inode to disk inode
3157 */ 3143 */
3158static void copy_to_dinode(struct dinode * dip, struct inode *ip) 3144static void copy_to_dinode(struct dinode * dip, struct inode *ip)
3159{ 3145{
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h
index 4f9c346ed498..610a0e9d8941 100644
--- a/fs/jfs/jfs_imap.h
+++ b/fs/jfs/jfs_imap.h
@@ -24,17 +24,17 @@
24 * jfs_imap.h: disk inode manager 24 * jfs_imap.h: disk inode manager
25 */ 25 */
26 26
27#define EXTSPERIAG 128 /* number of disk inode extent per iag */ 27#define EXTSPERIAG 128 /* number of disk inode extent per iag */
28#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ 28#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */
29#define SMAPSZ 4 /* number of words per summary map */ 29#define SMAPSZ 4 /* number of words per summary map */
30#define EXTSPERSUM 32 /* number of extents per summary map entry */ 30#define EXTSPERSUM 32 /* number of extents per summary map entry */
31#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ 31#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */
32#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ 32#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */
33#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ 33#define MAXIAGS ((1<<20)-1) /* maximum number of iags */
34#define MAXAG 128 /* maximum number of allocation groups */ 34#define MAXAG 128 /* maximum number of allocation groups */
35 35
36#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ 36#define AMAPSIZE 512 /* bytes in the IAG allocation maps */
37#define SMAPSIZE 16 /* bytes in the IAG summary maps */ 37#define SMAPSIZE 16 /* bytes in the IAG summary maps */
38 38
39/* convert inode number to iag number */ 39/* convert inode number to iag number */
40#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) 40#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG)
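For orientation, these constants tie together as follows: an iag carries EXTSPERIAG extents, and INOTOIAG() divides an inode number by the inodes-per-iag count with a shift. The fragment below is only a worked example; INOSPEREXT (32) and L2INOSPERIAG (12) live in jfs_filsys.h and are assumed here rather than quoted:

#include <stdio.h>

#define EXTSPERIAG	128				/* as above */
#define INOSPEREXT	32				/* assumed, from jfs_filsys.h */
#define INOSPERIAG	(EXTSPERIAG * INOSPEREXT)	/* 4096 */
#define L2INOSPERIAG	12				/* assumed, log2(INOSPERIAG) */
#define INOTOIAG(ino)	((ino) >> L2INOSPERIAG)

int main(void)
{
	unsigned int ino = 10000;
	unsigned int iag = INOTOIAG(ino);		/* iag holding the inode */
	unsigned int off = ino & (INOSPERIAG - 1);	/* inode within that iag */

	printf("ino %u -> iag %u, extent %u, bit %u\n",
	       ino, iag, off / INOSPEREXT, off % INOSPEREXT);
	return 0;
}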
@@ -60,31 +60,31 @@
60 * inode allocation group page (per 4096 inodes of an AG) 60 * inode allocation group page (per 4096 inodes of an AG)
61 */ 61 */
62struct iag { 62struct iag {
63 __le64 agstart; /* 8: starting block of ag */ 63 __le64 agstart; /* 8: starting block of ag */
64 __le32 iagnum; /* 4: inode allocation group number */ 64 __le32 iagnum; /* 4: inode allocation group number */
65 __le32 inofreefwd; /* 4: ag inode free list forward */ 65 __le32 inofreefwd; /* 4: ag inode free list forward */
66 __le32 inofreeback; /* 4: ag inode free list back */ 66 __le32 inofreeback; /* 4: ag inode free list back */
67 __le32 extfreefwd; /* 4: ag inode extent free list forward */ 67 __le32 extfreefwd; /* 4: ag inode extent free list forward */
68 __le32 extfreeback; /* 4: ag inode extent free list back */ 68 __le32 extfreeback; /* 4: ag inode extent free list back */
69 __le32 iagfree; /* 4: iag free list */ 69 __le32 iagfree; /* 4: iag free list */
70 70
71 /* summary map: 1 bit per inode extent */ 71 /* summary map: 1 bit per inode extent */
72 __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; 72 __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes;
73 * note: this indicates free and backed 73 * note: this indicates free and backed
74 * inodes, if the extent is not backed the 74 * inodes, if the extent is not backed the
75 * value will be 1. if the extent is 75 * value will be 1. if the extent is
76 * backed but all inodes are being used the 76 * backed but all inodes are being used the
77 * value will be 1. if the extent is 77 * value will be 1. if the extent is
78 * backed but at least one of the inodes is 78 * backed but at least one of the inodes is
79 * free the value will be 0. 79 * free the value will be 0.
80 */ 80 */
81 __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ 81 __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */
82 __le32 nfreeinos; /* 4: number of free inodes */ 82 __le32 nfreeinos; /* 4: number of free inodes */
83 __le32 nfreeexts; /* 4: number of free extents */ 83 __le32 nfreeexts; /* 4: number of free extents */
84 /* (72) */ 84 /* (72) */
85 u8 pad[1976]; /* 1976: pad to 2048 bytes */ 85 u8 pad[1976]; /* 1976: pad to 2048 bytes */
86 /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ 86 /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */
87 __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ 87 __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */
88 __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ 88 __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */
89 pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ 89 pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */
90}; /* (4096) */ 90}; /* (4096) */
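The summary maps above exist so an allocator can find a usable extent without scanning all 128 wmap words: a clear inosmap bit means "backed extent with at least one free inode", and only then is the corresponding wmap word examined bit by bit. A hedged, host-endian sketch of that first step (the kernel operates on the __le32 fields and uses its own helpers):

#include <stdint.h>

#define SMAPSZ		4	/* summary words per iag, as above */
#define EXTSPERSUM	32	/* extents covered by each summary word */

/* Return the number of an extent that is backed and has a free inode,
 * or -1 if the summary map shows none.  Bit 0 is taken as the
 * high-order bit of each word, matching the on-disk maps. */
static int pick_free_extent(const uint32_t inosmap[SMAPSZ])
{
	for (int w = 0; w < SMAPSZ; w++) {
		if (inosmap[w] == 0xffffffffu)
			continue;	/* nothing usable under this word */
		for (int b = 0; b < EXTSPERSUM; b++)
			if (!(inosmap[w] & (0x80000000u >> b)))
				return w * EXTSPERSUM + b;
	}
	return -1;
}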
@@ -93,44 +93,44 @@ struct iag {
93 * per AG control information (in inode map control page) 93 * per AG control information (in inode map control page)
94 */ 94 */
95struct iagctl_disk { 95struct iagctl_disk {
96 __le32 inofree; /* 4: free inode list anchor */ 96 __le32 inofree; /* 4: free inode list anchor */
97 __le32 extfree; /* 4: free extent list anchor */ 97 __le32 extfree; /* 4: free extent list anchor */
98 __le32 numinos; /* 4: number of backed inodes */ 98 __le32 numinos; /* 4: number of backed inodes */
99 __le32 numfree; /* 4: number of free inodes */ 99 __le32 numfree; /* 4: number of free inodes */
100}; /* (16) */ 100}; /* (16) */
101 101
102struct iagctl { 102struct iagctl {
103 int inofree; /* free inode list anchor */ 103 int inofree; /* free inode list anchor */
104 int extfree; /* free extent list anchor */ 104 int extfree; /* free extent list anchor */
105 int numinos; /* number of backed inodes */ 105 int numinos; /* number of backed inodes */
106 int numfree; /* number of free inodes */ 106 int numfree; /* number of free inodes */
107}; 107};
108 108
109/* 109/*
110 * per fileset/aggregate inode map control page 110 * per fileset/aggregate inode map control page
111 */ 111 */
112struct dinomap_disk { 112struct dinomap_disk {
113 __le32 in_freeiag; /* 4: free iag list anchor */ 113 __le32 in_freeiag; /* 4: free iag list anchor */
114 __le32 in_nextiag; /* 4: next free iag number */ 114 __le32 in_nextiag; /* 4: next free iag number */
115 __le32 in_numinos; /* 4: num of backed inodes */ 115 __le32 in_numinos; /* 4: num of backed inodes */
116 __le32 in_numfree; /* 4: num of free backed inodes */ 116 __le32 in_numfree; /* 4: num of free backed inodes */
117 __le32 in_nbperiext; /* 4: num of blocks per inode extent */ 117 __le32 in_nbperiext; /* 4: num of blocks per inode extent */
118 __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ 118 __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */
119 __le32 in_diskblock; /* 4: for standalone test driver */ 119 __le32 in_diskblock; /* 4: for standalone test driver */
120 __le32 in_maxag; /* 4: for standalone test driver */ 120 __le32 in_maxag; /* 4: for standalone test driver */
121 u8 pad[2016]; /* 2016: pad to 2048 */ 121 u8 pad[2016]; /* 2016: pad to 2048 */
122 struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ 122 struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */
123}; /* (4096) */ 123}; /* (4096) */
124 124
125struct dinomap { 125struct dinomap {
126 int in_freeiag; /* free iag list anchor */ 126 int in_freeiag; /* free iag list anchor */
127 int in_nextiag; /* next free iag number */ 127 int in_nextiag; /* next free iag number */
128 int in_numinos; /* num of backed inodes */ 128 int in_numinos; /* num of backed inodes */
129 int in_numfree; /* num of free backed inodes */ 129 int in_numfree; /* num of free backed inodes */
130 int in_nbperiext; /* num of blocks per inode extent */ 130 int in_nbperiext; /* num of blocks per inode extent */
131 int in_l2nbperiext; /* l2 of in_nbperiext */ 131 int in_l2nbperiext; /* l2 of in_nbperiext */
132 int in_diskblock; /* for standalone test driver */ 132 int in_diskblock; /* for standalone test driver */
133 int in_maxag; /* for standalone test driver */ 133 int in_maxag; /* for standalone test driver */
134 struct iagctl in_agctl[MAXAG]; /* AG control information */ 134 struct iagctl in_agctl[MAXAG]; /* AG control information */
135}; 135};
136 136
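Each *_disk structure above is the raw on-disk layout with little-endian fields, while its unsuffixed twin holds native ints for in-core use, so the imap code converts field by field when it reads or writes a control page. A kernel-style sketch of that conversion, shown only to make the pairing explicit (the real copies sit in the jfs_imap.c mount and resize paths):

/* Sketch: translate one on-disk AG control entry to its in-core form
 * using the kernel's le32_to_cpu() helper. */
static void iagctl_from_disk(struct iagctl *ic, const struct iagctl_disk *d)
{
	ic->inofree = le32_to_cpu(d->inofree);
	ic->extfree = le32_to_cpu(d->extfree);
	ic->numinos = le32_to_cpu(d->numinos);
	ic->numfree = le32_to_cpu(d->numfree);
}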
@@ -139,9 +139,9 @@ struct dinomap {
139 */ 139 */
140struct inomap { 140struct inomap {
141 struct dinomap im_imap; /* 4096: inode allocation control */ 141 struct dinomap im_imap; /* 4096: inode allocation control */
142 struct inode *im_ipimap; /* 4: ptr to inode for imap */ 142 struct inode *im_ipimap; /* 4: ptr to inode for imap */
143 struct mutex im_freelock; /* 4: iag free list lock */ 143 struct mutex im_freelock; /* 4: iag free list lock */
144 struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ 144 struct mutex im_aglock[MAXAG]; /* 512: per AG locks */
145 u32 *im_DBGdimap; 145 u32 *im_DBGdimap;
146 atomic_t im_numinos; /* num of backed inodes */ 146 atomic_t im_numinos; /* num of backed inodes */
147 atomic_t im_numfree; /* num of free backed inodes */ 147 atomic_t im_numfree; /* num of free backed inodes */
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 8f453eff3c83..cb8f30985ad1 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -40,7 +40,7 @@ struct jfs_inode_info {
40 uint mode2; /* jfs-specific mode */ 40 uint mode2; /* jfs-specific mode */
41 uint saved_uid; /* saved for uid mount option */ 41 uint saved_uid; /* saved for uid mount option */
42 uint saved_gid; /* saved for gid mount option */ 42 uint saved_gid; /* saved for gid mount option */
43 pxd_t ixpxd; /* inode extent descriptor */ 43 pxd_t ixpxd; /* inode extent descriptor */
44 dxd_t acl; /* dxd describing acl */ 44 dxd_t acl; /* dxd describing acl */
45 dxd_t ea; /* dxd describing ea */ 45 dxd_t ea; /* dxd describing ea */
46 time_t otime; /* time created */ 46 time_t otime; /* time created */
@@ -190,7 +190,7 @@ struct jfs_sb_info {
190 uint gengen; /* inode generation generator*/ 190 uint gengen; /* inode generation generator*/
191 uint inostamp; /* shows inode belongs to fileset*/ 191 uint inostamp; /* shows inode belongs to fileset*/
192 192
193 /* Formerly in ipbmap */ 193 /* Formerly in ipbmap */
194 struct bmap *bmap; /* incore bmap descriptor */ 194 struct bmap *bmap; /* incore bmap descriptor */
195 struct nls_table *nls_tab; /* current codepage */ 195 struct nls_table *nls_tab; /* current codepage */
196 struct inode *direct_inode; /* metadata inode */ 196 struct inode *direct_inode; /* metadata inode */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 44a2f33cb98d..de3e4a506dbc 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -244,7 +244,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
244 goto writeRecord; 244 goto writeRecord;
245 245
246 /* 246 /*
247 * initialize/update page/transaction recovery lsn 247 * initialize/update page/transaction recovery lsn
248 */ 248 */
249 lsn = log->lsn; 249 lsn = log->lsn;
250 250
@@ -263,7 +263,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
263 } 263 }
264 264
265 /* 265 /*
266 * initialize/update lsn of tblock of the page 266 * initialize/update lsn of tblock of the page
267 * 267 *
268 * transaction inherits oldest lsn of pages associated 268 * transaction inherits oldest lsn of pages associated
269 * with allocation/deallocation of resources (their 269 * with allocation/deallocation of resources (their
@@ -307,7 +307,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
307 LOGSYNC_UNLOCK(log, flags); 307 LOGSYNC_UNLOCK(log, flags);
308 308
309 /* 309 /*
310 * write the log record 310 * write the log record
311 */ 311 */
312 writeRecord: 312 writeRecord:
313 lsn = lmWriteRecord(log, tblk, lrd, tlck); 313 lsn = lmWriteRecord(log, tblk, lrd, tlck);
@@ -372,7 +372,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
372 goto moveLrd; 372 goto moveLrd;
373 373
374 /* 374 /*
375 * move log record data 375 * move log record data
376 */ 376 */
377 /* retrieve source meta-data page to log */ 377 /* retrieve source meta-data page to log */
378 if (tlck->flag & tlckPAGELOCK) { 378 if (tlck->flag & tlckPAGELOCK) {
@@ -465,7 +465,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
465 } 465 }
466 466
467 /* 467 /*
468 * move log record descriptor 468 * move log record descriptor
469 */ 469 */
470 moveLrd: 470 moveLrd:
471 lrd->length = cpu_to_le16(len); 471 lrd->length = cpu_to_le16(len);
@@ -574,7 +574,7 @@ static int lmNextPage(struct jfs_log * log)
574 LOGGC_LOCK(log); 574 LOGGC_LOCK(log);
575 575
576 /* 576 /*
577 * write or queue the full page at the tail of write queue 577 * write or queue the full page at the tail of write queue
578 */ 578 */
579 /* get the tail tblk on commit queue */ 579 /* get the tail tblk on commit queue */
580 if (list_empty(&log->cqueue)) 580 if (list_empty(&log->cqueue))
@@ -625,7 +625,7 @@ static int lmNextPage(struct jfs_log * log)
625 LOGGC_UNLOCK(log); 625 LOGGC_UNLOCK(log);
626 626
627 /* 627 /*
628 * allocate/initialize next page 628 * allocate/initialize next page
629 */ 629 */
630 /* if log wraps, the first data page of log is 2 630 /* if log wraps, the first data page of log is 2
631 * (0 never used, 1 is superblock). 631 * (0 never used, 1 is superblock).
@@ -953,7 +953,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
953 } 953 }
954 954
955 /* 955 /*
956 * forward syncpt 956 * forward syncpt
957 */ 957 */
958 /* if last sync is same as last syncpt, 958 /* if last sync is same as last syncpt,
959 * invoke sync point forward processing to update sync. 959 * invoke sync point forward processing to update sync.
@@ -989,7 +989,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
989 lsn = log->lsn; 989 lsn = log->lsn;
990 990
991 /* 991 /*
992 * setup next syncpt trigger (SWAG) 992 * setup next syncpt trigger (SWAG)
993 */ 993 */
994 logsize = log->logsize; 994 logsize = log->logsize;
995 995
@@ -1000,11 +1000,11 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
1000 if (more < 2 * LOGPSIZE) { 1000 if (more < 2 * LOGPSIZE) {
1001 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); 1001 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
1002 /* 1002 /*
1003 * log wrapping 1003 * log wrapping
1004 * 1004 *
1005 * option 1 - panic ? No.! 1005 * option 1 - panic ? No.!
1006 * option 2 - shutdown file systems 1006 * option 2 - shutdown file systems
1007 * associated with log ? 1007 * associated with log ?
1008 * option 3 - extend log ? 1008 * option 3 - extend log ?
1009 */ 1009 */
1010 /* 1010 /*
@@ -1062,7 +1062,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
1062/* 1062/*
1063 * NAME: lmLogOpen() 1063 * NAME: lmLogOpen()
1064 * 1064 *
1065 * FUNCTION: open the log on first open; 1065 * FUNCTION: open the log on first open;
1066 * insert filesystem in the active list of the log. 1066 * insert filesystem in the active list of the log.
1067 * 1067 *
1068 * PARAMETER: ipmnt - file system mount inode 1068 * PARAMETER: ipmnt - file system mount inode
@@ -1113,7 +1113,7 @@ int lmLogOpen(struct super_block *sb)
1113 init_waitqueue_head(&log->syncwait); 1113 init_waitqueue_head(&log->syncwait);
1114 1114
1115 /* 1115 /*
1116 * external log as separate logical volume 1116 * external log as separate logical volume
1117 * 1117 *
1118 * file systems to log may have n-to-1 relationship; 1118 * file systems to log may have n-to-1 relationship;
1119 */ 1119 */
@@ -1155,7 +1155,7 @@ journal_found:
1155 return 0; 1155 return 0;
1156 1156
1157 /* 1157 /*
1158 * unwind on error 1158 * unwind on error
1159 */ 1159 */
1160 shutdown: /* unwind lbmLogInit() */ 1160 shutdown: /* unwind lbmLogInit() */
1161 list_del(&log->journal_list); 1161 list_del(&log->journal_list);
@@ -1427,7 +1427,7 @@ int lmLogInit(struct jfs_log * log)
1427 return 0; 1427 return 0;
1428 1428
1429 /* 1429 /*
1430 * unwind on error 1430 * unwind on error
1431 */ 1431 */
1432 errout30: /* release log page */ 1432 errout30: /* release log page */
1433 log->wqueue = NULL; 1433 log->wqueue = NULL;
@@ -1480,7 +1480,7 @@ int lmLogClose(struct super_block *sb)
1480 1480
1481 if (test_bit(log_INLINELOG, &log->flag)) { 1481 if (test_bit(log_INLINELOG, &log->flag)) {
1482 /* 1482 /*
1483 * in-line log in host file system 1483 * in-line log in host file system
1484 */ 1484 */
1485 rc = lmLogShutdown(log); 1485 rc = lmLogShutdown(log);
1486 kfree(log); 1486 kfree(log);
@@ -1504,7 +1504,7 @@ int lmLogClose(struct super_block *sb)
1504 goto out; 1504 goto out;
1505 1505
1506 /* 1506 /*
1507 * external log as separate logical volume 1507 * external log as separate logical volume
1508 */ 1508 */
1509 list_del(&log->journal_list); 1509 list_del(&log->journal_list);
1510 bdev = log->bdev; 1510 bdev = log->bdev;
@@ -1622,20 +1622,26 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1622 if (!list_empty(&log->synclist)) { 1622 if (!list_empty(&log->synclist)) {
1623 struct logsyncblk *lp; 1623 struct logsyncblk *lp;
1624 1624
1625 printk(KERN_ERR "jfs_flush_journal: synclist not empty\n");
1625 list_for_each_entry(lp, &log->synclist, synclist) { 1626 list_for_each_entry(lp, &log->synclist, synclist) {
1626 if (lp->xflag & COMMIT_PAGE) { 1627 if (lp->xflag & COMMIT_PAGE) {
1627 struct metapage *mp = (struct metapage *)lp; 1628 struct metapage *mp = (struct metapage *)lp;
1628 dump_mem("orphan metapage", lp, 1629 print_hex_dump(KERN_ERR, "metapage: ",
1629 sizeof(struct metapage)); 1630 DUMP_PREFIX_ADDRESS, 16, 4,
1630 dump_mem("page", mp->page, sizeof(struct page)); 1631 mp, sizeof(struct metapage), 0);
1631 } 1632 print_hex_dump(KERN_ERR, "page: ",
1632 else 1633 DUMP_PREFIX_ADDRESS, 16,
1633 dump_mem("orphan tblock", lp, 1634 sizeof(long), mp->page,
1634 sizeof(struct tblock)); 1635 sizeof(struct page), 0);
1636 } else
1637 print_hex_dump(KERN_ERR, "tblock:",
1638 DUMP_PREFIX_ADDRESS, 16, 4,
1639 lp, sizeof(struct tblock), 0);
1635 } 1640 }
1636 } 1641 }
1642#else
1643 WARN_ON(!list_empty(&log->synclist));
1637#endif 1644#endif
1638 //assert(list_empty(&log->synclist));
1639 clear_bit(log_FLUSH, &log->flag); 1645 clear_bit(log_FLUSH, &log->flag);
1640} 1646}
1641 1647
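The hunk above (and the similar ones in jfs_metapage.c and jfs_txnmgr.c below) replaces the JFS-private dump_mem() helper with the generic print_hex_dump(). As the new calls show, the arguments are the log level, a prefix string, the prefix style (DUMP_PREFIX_ADDRESS here), the row size in bytes, the group size, the buffer, its length, and whether to append an ASCII column. A minimal kernel-style use, with dump_object() being an illustrative name:

#include <linux/kernel.h>

/* Hex-dump an arbitrary object at KERN_ERR, 16 bytes per row in
 * 4-byte groups, prefixed with the buffer address, no ASCII column. */
static void dump_object(const void *buf, size_t len)
{
	print_hex_dump(KERN_ERR, "object: ", DUMP_PREFIX_ADDRESS,
		       16, 4, buf, len, 0);
}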
@@ -1723,7 +1729,7 @@ int lmLogShutdown(struct jfs_log * log)
1723 * 1729 *
1724 * PARAMETER: log - pointer to log's inode. 1730 * PARAMETER: log - pointer to log's inode.
1725 * fsdev - kdev_t of filesystem. 1731 * fsdev - kdev_t of filesystem.
1726 * serial - pointer to returned log serial number 1732 * serial - pointer to returned log serial number
1727 * activate - insert/remove device from active list. 1733 * activate - insert/remove device from active list.
1728 * 1734 *
1729 * RETURN: 0 - success 1735 * RETURN: 0 - success
@@ -1963,7 +1969,7 @@ static void lbmfree(struct lbuf * bp)
1963 * FUNCTION: add a log buffer to the log redrive list 1969 * FUNCTION: add a log buffer to the log redrive list
1964 * 1970 *
1965 * PARAMETER: 1971 * PARAMETER:
1966 * bp - log buffer 1972 * bp - log buffer
1967 * 1973 *
1968 * NOTES: 1974 * NOTES:
1969 * Takes log_redrive_lock. 1975 * Takes log_redrive_lock.
@@ -2054,7 +2060,7 @@ static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
2054 bp->l_flag = flag; 2060 bp->l_flag = flag;
2055 2061
2056 /* 2062 /*
2057 * insert bp at tail of write queue associated with log 2063 * insert bp at tail of write queue associated with log
2058 * 2064 *
2059 * (request is either for bp already/currently at head of queue 2065 * (request is either for bp already/currently at head of queue
2060 * or new bp to be inserted at tail) 2066 * or new bp to be inserted at tail)
@@ -2117,7 +2123,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
2117 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); 2123 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
2118 2124
2119 /* 2125 /*
2120 * initiate pageout of the page 2126 * initiate pageout of the page
2121 */ 2127 */
2122 lbmStartIO(bp); 2128 lbmStartIO(bp);
2123} 2129}
@@ -2128,7 +2134,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
2128 * 2134 *
2129 * FUNCTION: Interface to DD strategy routine 2135 * FUNCTION: Interface to DD strategy routine
2130 * 2136 *
2131 * RETURN: none 2137 * RETURN: none
2132 * 2138 *
2133 * serialization: LCACHE_LOCK() is NOT held during log i/o; 2139 * serialization: LCACHE_LOCK() is NOT held during log i/o;
2134 */ 2140 */
@@ -2222,7 +2228,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2222 bio_put(bio); 2228 bio_put(bio);
2223 2229
2224 /* 2230 /*
2225 * pagein completion 2231 * pagein completion
2226 */ 2232 */
2227 if (bp->l_flag & lbmREAD) { 2233 if (bp->l_flag & lbmREAD) {
2228 bp->l_flag &= ~lbmREAD; 2234 bp->l_flag &= ~lbmREAD;
@@ -2236,7 +2242,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2236 } 2242 }
2237 2243
2238 /* 2244 /*
2239 * pageout completion 2245 * pageout completion
2240 * 2246 *
2241 * the bp at the head of write queue has completed pageout. 2247 * the bp at the head of write queue has completed pageout.
2242 * 2248 *
@@ -2302,7 +2308,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2302 } 2308 }
2303 2309
2304 /* 2310 /*
2305 * synchronous pageout: 2311 * synchronous pageout:
2306 * 2312 *
2307 * buffer has not necessarily been removed from write queue 2313 * buffer has not necessarily been removed from write queue
2308 * (e.g., synchronous write of partial-page with COMMIT): 2314 * (e.g., synchronous write of partial-page with COMMIT):
@@ -2316,7 +2322,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2316 } 2322 }
2317 2323
2318 /* 2324 /*
2319 * Group Commit pageout: 2325 * Group Commit pageout:
2320 */ 2326 */
2321 else if (bp->l_flag & lbmGC) { 2327 else if (bp->l_flag & lbmGC) {
2322 LCACHE_UNLOCK(flags); 2328 LCACHE_UNLOCK(flags);
@@ -2324,7 +2330,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2324 } 2330 }
2325 2331
2326 /* 2332 /*
2327 * asynchronous pageout: 2333 * asynchronous pageout:
2328 * 2334 *
2329 * buffer must have been removed from write queue: 2335 * buffer must have been removed from write queue:
2330 * insert buffer at head of freelist where it can be recycled 2336 * insert buffer at head of freelist where it can be recycled
@@ -2375,7 +2381,7 @@ int jfsIOWait(void *arg)
2375 * FUNCTION: format file system log 2381 * FUNCTION: format file system log
2376 * 2382 *
2377 * PARAMETERS: 2383 * PARAMETERS:
2378 * log - volume log 2384 * log - volume log
2379 * logAddress - start address of log space in FS block 2385 * logAddress - start address of log space in FS block
2380 * logSize - length of log space in FS block; 2386 * logSize - length of log space in FS block;
2381 * 2387 *
@@ -2407,16 +2413,16 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2407 npages = logSize >> sbi->l2nbperpage; 2413 npages = logSize >> sbi->l2nbperpage;
2408 2414
2409 /* 2415 /*
2410 * log space: 2416 * log space:
2411 * 2417 *
2412 * page 0 - reserved; 2418 * page 0 - reserved;
2413 * page 1 - log superblock; 2419 * page 1 - log superblock;
2414 * page 2 - log data page: A SYNC log record is written 2420 * page 2 - log data page: A SYNC log record is written
2415 * into this page at logform time; 2421 * into this page at logform time;
2416 * pages 3-N - log data page: set to empty log data pages; 2422 * pages 3-N - log data page: set to empty log data pages;
2417 */ 2423 */
2418 /* 2424 /*
2419 * init log superblock: log page 1 2425 * init log superblock: log page 1
2420 */ 2426 */
2421 logsuper = (struct logsuper *) bp->l_ldata; 2427 logsuper = (struct logsuper *) bp->l_ldata;
2422 2428
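As a quick sanity check on the layout comment above: with logSize given in file-system blocks, npages = logSize >> l2nbperpage, page 0 stays reserved, page 1 holds the log superblock, page 2 receives the SYNC record at format time, and pages 3 through npages-1 are formatted as empty data pages. An illustrative calculation (the block counts are made-up values, not taken from the code):

#include <stdio.h>

int main(void)
{
	long log_size = 8192;	/* assumed log length in FS blocks */
	int l2nbperpage = 3;	/* assumed: 8 FS blocks per 4K log page */
	long npages = log_size >> l2nbperpage;

	printf("npages = %ld\n", npages);
	printf("page 0: reserved\npage 1: log superblock\n");
	printf("page 2: data page with SYNC record\n");
	printf("pages 3..%ld: empty data pages\n", npages - 1);
	return 0;
}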
@@ -2436,7 +2442,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2436 goto exit; 2442 goto exit;
2437 2443
2438 /* 2444 /*
2439 * init pages 2 to npages-1 as log data pages: 2445 * init pages 2 to npages-1 as log data pages:
2440 * 2446 *
2441 * log page sequence number (lpsn) initialization: 2447 * log page sequence number (lpsn) initialization:
2442 * 2448 *
@@ -2479,7 +2485,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2479 goto exit; 2485 goto exit;
2480 2486
2481 /* 2487 /*
2482 * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) 2488 * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2)
2483 */ 2489 */
2484 for (lspn = 0; lspn < npages - 3; lspn++) { 2490 for (lspn = 0; lspn < npages - 3; lspn++) {
2485 lp->h.page = lp->t.page = cpu_to_le32(lspn); 2491 lp->h.page = lp->t.page = cpu_to_le32(lspn);
@@ -2495,7 +2501,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2495 rc = 0; 2501 rc = 0;
2496exit: 2502exit:
2497 /* 2503 /*
2498 * finalize log 2504 * finalize log
2499 */ 2505 */
2500 /* release the buffer */ 2506 /* release the buffer */
2501 lbmFree(bp); 2507 lbmFree(bp);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index a53fb17ea219..1f85ef0ec045 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -144,7 +144,7 @@ struct logpage {
144 * 144 *
145 * (this comment should be rewritten !) 145 * (this comment should be rewritten !)
146 * jfs uses only "after" log records (only a single writer is allowed 146 * jfs uses only "after" log records (only a single writer is allowed
147 * in a page, pages are written to temporary paging space if 147 * in a page, pages are written to temporary paging space if
148 * if they must be written to disk before commit, and i/o is 148 * if they must be written to disk before commit, and i/o is
149 * scheduled for modified pages to their home location after 149 * scheduled for modified pages to their home location after
150 * the log records containing the after values and the commit 150 * the log records containing the after values and the commit
@@ -153,7 +153,7 @@ struct logpage {
153 * 153 *
154 * a log record consists of a data area of variable length followed by 154 * a log record consists of a data area of variable length followed by
155 * a descriptor of fixed size LOGRDSIZE bytes. 155 * a descriptor of fixed size LOGRDSIZE bytes.
156 * the data area is rounded up to an integral number of 4-bytes and 156 * the data area is rounded up to an integral number of 4-bytes and
157 * must be no longer than LOGPSIZE. 157 * must be no longer than LOGPSIZE.
158 * the descriptor is of size of multiple of 4-bytes and aligned on a 158 * the descriptor is of size of multiple of 4-bytes and aligned on a
159 * 4-byte boundary. 159 * 4-byte boundary.
@@ -215,13 +215,13 @@ struct lrd {
215 union { 215 union {
216 216
217 /* 217 /*
218 * COMMIT: commit 218 * COMMIT: commit
219 * 219 *
220 * transaction commit: no type-dependent information; 220 * transaction commit: no type-dependent information;
221 */ 221 */
222 222
223 /* 223 /*
224 * REDOPAGE: after-image 224 * REDOPAGE: after-image
225 * 225 *
226 * apply after-image; 226 * apply after-image;
227 * 227 *
@@ -236,7 +236,7 @@ struct lrd {
236 } redopage; /* (20) */ 236 } redopage; /* (20) */
237 237
238 /* 238 /*
239 * NOREDOPAGE: the page is freed 239 * NOREDOPAGE: the page is freed
240 * 240 *
241 * do not apply after-image records which precede this record 241 * do not apply after-image records which precede this record
242 * in the log with the same page block number to this page. 242 * in the log with the same page block number to this page.
@@ -252,7 +252,7 @@ struct lrd {
252 } noredopage; /* (20) */ 252 } noredopage; /* (20) */
253 253
254 /* 254 /*
255 * UPDATEMAP: update block allocation map 255 * UPDATEMAP: update block allocation map
256 * 256 *
257 * either in-line PXD, 257 * either in-line PXD,
258 * or out-of-line XADLIST; 258 * or out-of-line XADLIST;
@@ -268,7 +268,7 @@ struct lrd {
268 } updatemap; /* (20) */ 268 } updatemap; /* (20) */
269 269
270 /* 270 /*
271 * NOREDOINOEXT: the inode extent is freed 271 * NOREDOINOEXT: the inode extent is freed
272 * 272 *
273 * do not apply after-image records which precede this 273 * do not apply after-image records which precede this
274 * record in the log with the any of the 4 page block 274 * record in the log with the any of the 4 page block
@@ -286,7 +286,7 @@ struct lrd {
286 } noredoinoext; /* (20) */ 286 } noredoinoext; /* (20) */
287 287
288 /* 288 /*
289 * SYNCPT: log sync point 289 * SYNCPT: log sync point
290 * 290 *
291 * replay log upto syncpt address specified; 291 * replay log upto syncpt address specified;
292 */ 292 */
@@ -295,13 +295,13 @@ struct lrd {
295 } syncpt; 295 } syncpt;
296 296
297 /* 297 /*
298 * MOUNT: file system mount 298 * MOUNT: file system mount
299 * 299 *
300 * file system mount: no type-dependent information; 300 * file system mount: no type-dependent information;
301 */ 301 */
302 302
303 /* 303 /*
304 * ? FREEXTENT: free specified extent(s) 304 * ? FREEXTENT: free specified extent(s)
305 * 305 *
306 * free specified extent(s) from block allocation map 306 * free specified extent(s) from block allocation map
307 * N.B.: nextents should be length of data/sizeof(xad_t) 307 * N.B.: nextents should be length of data/sizeof(xad_t)
@@ -314,7 +314,7 @@ struct lrd {
314 } freextent; 314 } freextent;
315 315
316 /* 316 /*
317 * ? NOREDOFILE: this file is freed 317 * ? NOREDOFILE: this file is freed
318 * 318 *
319 * do not apply records which precede this record in the log 319 * do not apply records which precede this record in the log
320 * with the same inode number. 320 * with the same inode number.
@@ -330,7 +330,7 @@ struct lrd {
330 } noredofile; 330 } noredofile;
331 331
332 /* 332 /*
333 * ? NEWPAGE: 333 * ? NEWPAGE:
334 * 334 *
335 * metadata type dependent 335 * metadata type dependent
336 */ 336 */
@@ -342,7 +342,7 @@ struct lrd {
342 } newpage; 342 } newpage;
343 343
344 /* 344 /*
345 * ? DUMMY: filler 345 * ? DUMMY: filler
346 * 346 *
347 * no type-dependent information 347 * no type-dependent information
348 */ 348 */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 43d4f69afbec..77c7f1129dde 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -472,7 +472,8 @@ add_failed:
472 printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); 472 printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n");
473 goto skip; 473 goto skip;
474dump_bio: 474dump_bio:
475 dump_mem("bio", bio, sizeof(*bio)); 475 print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16,
476 4, bio, sizeof(*bio), 0);
476skip: 477skip:
477 bio_put(bio); 478 bio_put(bio);
478 unlock_page(page); 479 unlock_page(page);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 4dd479834897..644429acb8c0 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -80,7 +80,7 @@ static int logMOUNT(struct super_block *sb);
80 */ 80 */
81int jfs_mount(struct super_block *sb) 81int jfs_mount(struct super_block *sb)
82{ 82{
83 int rc = 0; /* Return code */ 83 int rc = 0; /* Return code */
84 struct jfs_sb_info *sbi = JFS_SBI(sb); 84 struct jfs_sb_info *sbi = JFS_SBI(sb);
85 struct inode *ipaimap = NULL; 85 struct inode *ipaimap = NULL;
86 struct inode *ipaimap2 = NULL; 86 struct inode *ipaimap2 = NULL;
@@ -169,7 +169,7 @@ int jfs_mount(struct super_block *sb)
169 sbi->ipaimap2 = NULL; 169 sbi->ipaimap2 = NULL;
170 170
171 /* 171 /*
172 * mount (the only/single) fileset 172 * mount (the only/single) fileset
173 */ 173 */
174 /* 174 /*
175 * open fileset inode allocation map (aka fileset inode) 175 * open fileset inode allocation map (aka fileset inode)
@@ -195,7 +195,7 @@ int jfs_mount(struct super_block *sb)
195 goto out; 195 goto out;
196 196
197 /* 197 /*
198 * unwind on error 198 * unwind on error
199 */ 199 */
200 errout41: /* close fileset inode allocation map inode */ 200 errout41: /* close fileset inode allocation map inode */
201 diFreeSpecial(ipimap); 201 diFreeSpecial(ipimap);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 25430d0b0d59..7aa1f7004eaf 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -18,7 +18,7 @@
18 */ 18 */
19 19
20/* 20/*
21 * jfs_txnmgr.c: transaction manager 21 * jfs_txnmgr.c: transaction manager
22 * 22 *
23 * notes: 23 * notes:
24 * transaction starts with txBegin() and ends with txCommit() 24 * transaction starts with txBegin() and ends with txCommit()
@@ -60,7 +60,7 @@
60#include "jfs_debug.h" 60#include "jfs_debug.h"
61 61
62/* 62/*
63 * transaction management structures 63 * transaction management structures
64 */ 64 */
65static struct { 65static struct {
66 int freetid; /* index of a free tid structure */ 66 int freetid; /* index of a free tid structure */
@@ -103,19 +103,19 @@ module_param(nTxLock, int, 0);
103MODULE_PARM_DESC(nTxLock, 103MODULE_PARM_DESC(nTxLock,
104 "Number of transaction locks (max:65536)"); 104 "Number of transaction locks (max:65536)");
105 105
106struct tblock *TxBlock; /* transaction block table */ 106struct tblock *TxBlock; /* transaction block table */
107static int TxLockLWM; /* Low water mark for number of txLocks used */ 107static int TxLockLWM; /* Low water mark for number of txLocks used */
108static int TxLockHWM; /* High water mark for number of txLocks used */ 108static int TxLockHWM; /* High water mark for number of txLocks used */
109static int TxLockVHWM; /* Very High water mark */ 109static int TxLockVHWM; /* Very High water mark */
110struct tlock *TxLock; /* transaction lock table */ 110struct tlock *TxLock; /* transaction lock table */
111 111
112/* 112/*
113 * transaction management lock 113 * transaction management lock
114 */ 114 */
115static DEFINE_SPINLOCK(jfsTxnLock); 115static DEFINE_SPINLOCK(jfsTxnLock);
116 116
117#define TXN_LOCK() spin_lock(&jfsTxnLock) 117#define TXN_LOCK() spin_lock(&jfsTxnLock)
118#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) 118#define TXN_UNLOCK() spin_unlock(&jfsTxnLock)
119 119
120#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); 120#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock);
121#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) 121#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags)
@@ -148,7 +148,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
148#define TXN_WAKEUP(event) wake_up_all(event) 148#define TXN_WAKEUP(event) wake_up_all(event)
149 149
150/* 150/*
151 * statistics 151 * statistics
152 */ 152 */
153static struct { 153static struct {
154 tid_t maxtid; /* 4: biggest tid ever used */ 154 tid_t maxtid; /* 4: biggest tid ever used */
@@ -181,8 +181,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
181static void LogSyncRelease(struct metapage * mp); 181static void LogSyncRelease(struct metapage * mp);
182 182
183/* 183/*
184 * transaction block/lock management 184 * transaction block/lock management
185 * --------------------------------- 185 * ---------------------------------
186 */ 186 */
187 187
188/* 188/*
@@ -227,9 +227,9 @@ static void txLockFree(lid_t lid)
227} 227}
228 228
229/* 229/*
230 * NAME: txInit() 230 * NAME: txInit()
231 * 231 *
232 * FUNCTION: initialize transaction management structures 232 * FUNCTION: initialize transaction management structures
233 * 233 *
234 * RETURN: 234 * RETURN:
235 * 235 *
@@ -333,9 +333,9 @@ int txInit(void)
333} 333}
334 334
335/* 335/*
336 * NAME: txExit() 336 * NAME: txExit()
337 * 337 *
338 * FUNCTION: clean up when module is unloaded 338 * FUNCTION: clean up when module is unloaded
339 */ 339 */
340void txExit(void) 340void txExit(void)
341{ 341{
@@ -346,12 +346,12 @@ void txExit(void)
346} 346}
347 347
348/* 348/*
349 * NAME: txBegin() 349 * NAME: txBegin()
350 * 350 *
351 * FUNCTION: start a transaction. 351 * FUNCTION: start a transaction.
352 * 352 *
353 * PARAMETER: sb - superblock 353 * PARAMETER: sb - superblock
354 * flag - force for nested tx; 354 * flag - force for nested tx;
355 * 355 *
356 * RETURN: tid - transaction id 356 * RETURN: tid - transaction id
357 * 357 *
@@ -447,13 +447,13 @@ tid_t txBegin(struct super_block *sb, int flag)
447} 447}
448 448
449/* 449/*
450 * NAME: txBeginAnon() 450 * NAME: txBeginAnon()
451 * 451 *
452 * FUNCTION: start an anonymous transaction. 452 * FUNCTION: start an anonymous transaction.
453 * Blocks if logsync or available tlocks are low to prevent 453 * Blocks if logsync or available tlocks are low to prevent
454 * anonymous tlocks from depleting supply. 454 * anonymous tlocks from depleting supply.
455 * 455 *
456 * PARAMETER: sb - superblock 456 * PARAMETER: sb - superblock
457 * 457 *
458 * RETURN: none 458 * RETURN: none
459 */ 459 */
@@ -489,11 +489,11 @@ void txBeginAnon(struct super_block *sb)
489} 489}
490 490
491/* 491/*
492 * txEnd() 492 * txEnd()
493 * 493 *
494 * function: free specified transaction block. 494 * function: free specified transaction block.
495 * 495 *
496 * logsync barrier processing: 496 * logsync barrier processing:
497 * 497 *
498 * serialization: 498 * serialization:
499 */ 499 */
@@ -577,13 +577,13 @@ wakeup:
577} 577}
578 578
579/* 579/*
580 * txLock() 580 * txLock()
581 * 581 *
582 * function: acquire a transaction lock on the specified <mp> 582 * function: acquire a transaction lock on the specified <mp>
583 * 583 *
584 * parameter: 584 * parameter:
585 * 585 *
586 * return: transaction lock id 586 * return: transaction lock id
587 * 587 *
588 * serialization: 588 * serialization:
589 */ 589 */
@@ -829,12 +829,16 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
829 /* Only locks on ipimap or ipaimap should reach here */ 829 /* Only locks on ipimap or ipaimap should reach here */
830 /* assert(jfs_ip->fileset == AGGREGATE_I); */ 830 /* assert(jfs_ip->fileset == AGGREGATE_I); */
831 if (jfs_ip->fileset != AGGREGATE_I) { 831 if (jfs_ip->fileset != AGGREGATE_I) {
832 jfs_err("txLock: trying to lock locked page!"); 832 printk(KERN_ERR "txLock: trying to lock locked page!");
833 dump_mem("ip", ip, sizeof(struct inode)); 833 print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4,
834 dump_mem("mp", mp, sizeof(struct metapage)); 834 ip, sizeof(*ip), 0);
835 dump_mem("Locker's tblk", tid_to_tblock(tid), 835 print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4,
836 sizeof(struct tblock)); 836 mp, sizeof(*mp), 0);
837 dump_mem("Tlock", tlck, sizeof(struct tlock)); 837 print_hex_dump(KERN_ERR, "Locker's tblock: ",
838 DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid),
839 sizeof(struct tblock), 0);
840 print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4,
841 tlck, sizeof(*tlck), 0);
838 BUG(); 842 BUG();
839 } 843 }
840 INCREMENT(stattx.waitlock); /* statistics */ 844 INCREMENT(stattx.waitlock); /* statistics */
@@ -857,17 +861,17 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
857} 861}
858 862
859/* 863/*
860 * NAME: txRelease() 864 * NAME: txRelease()
861 * 865 *
862 * FUNCTION: Release buffers associated with transaction locks, but don't 866 * FUNCTION: Release buffers associated with transaction locks, but don't
863 * mark homeok yet. This allows other transactions to modify 867 * mark homeok yet. This allows other transactions to modify
864 * buffers, but won't let them go to disk until commit record 868 * buffers, but won't let them go to disk until commit record
865 * actually gets written. 869 * actually gets written.
866 * 870 *
867 * PARAMETER: 871 * PARAMETER:
868 * tblk - 872 * tblk -
869 * 873 *
870 * RETURN: Errors from subroutines. 874 * RETURN: Errors from subroutines.
871 */ 875 */
872static void txRelease(struct tblock * tblk) 876static void txRelease(struct tblock * tblk)
873{ 877{
@@ -896,10 +900,10 @@ static void txRelease(struct tblock * tblk)
896} 900}
897 901
898/* 902/*
899 * NAME: txUnlock() 903 * NAME: txUnlock()
900 * 904 *
901 * FUNCTION: Initiates pageout of pages modified by tid in journalled 905 * FUNCTION: Initiates pageout of pages modified by tid in journalled
902 * objects and frees their lockwords. 906 * objects and frees their lockwords.
903 */ 907 */
904static void txUnlock(struct tblock * tblk) 908static void txUnlock(struct tblock * tblk)
905{ 909{
@@ -983,10 +987,10 @@ static void txUnlock(struct tblock * tblk)
983} 987}
984 988
985/* 989/*
986 * txMaplock() 990 * txMaplock()
987 * 991 *
988 * function: allocate a transaction lock for freed page/entry; 992 * function: allocate a transaction lock for freed page/entry;
989 * for freed page, maplock is used as xtlock/dtlock type; 993 * for freed page, maplock is used as xtlock/dtlock type;
990 */ 994 */
991struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) 995struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
992{ 996{
@@ -1057,7 +1061,7 @@ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
1057} 1061}
1058 1062
1059/* 1063/*
1060 * txLinelock() 1064 * txLinelock()
1061 * 1065 *
1062 * function: allocate a transaction lock for log vector list 1066 * function: allocate a transaction lock for log vector list
1063 */ 1067 */
@@ -1092,39 +1096,39 @@ struct linelock *txLinelock(struct linelock * tlock)
1092} 1096}
1093 1097
1094/* 1098/*
1095 * transaction commit management 1099 * transaction commit management
1096 * ----------------------------- 1100 * -----------------------------
1097 */ 1101 */
1098 1102
1099/* 1103/*
1100 * NAME: txCommit() 1104 * NAME: txCommit()
1101 * 1105 *
1102 * FUNCTION: commit the changes to the objects specified in 1106 * FUNCTION: commit the changes to the objects specified in
1103 * clist. For journalled segments only the 1107 * clist. For journalled segments only the
1104 * changes of the caller are committed, ie by tid. 1108 * changes of the caller are committed, ie by tid.
1105 * for non-journalled segments the data are flushed to 1109 * for non-journalled segments the data are flushed to
1106 * disk and then the change to the disk inode and indirect 1110 * disk and then the change to the disk inode and indirect
1107 * blocks committed (so blocks newly allocated to the 1111 * blocks committed (so blocks newly allocated to the
1108 * segment will be made a part of the segment atomically). 1112 * segment will be made a part of the segment atomically).
1109 * 1113 *
1110 * all of the segments specified in clist must be in 1114 * all of the segments specified in clist must be in
1111 * one file system. no more than 6 segments are needed 1115 * one file system. no more than 6 segments are needed
1112 * to handle all unix svcs. 1116 * to handle all unix svcs.
1113 * 1117 *
1114 * if the i_nlink field (i.e. disk inode link count) 1118 * if the i_nlink field (i.e. disk inode link count)
1115 * is zero, and the type of inode is a regular file or 1119 * is zero, and the type of inode is a regular file or
1116 * directory, or symbolic link, the inode is truncated 1120 * directory, or symbolic link, the inode is truncated
1117 * to zero length. the truncation is committed but the 1121 * to zero length. the truncation is committed but the
1118 * VM resources are unaffected until it is closed (see 1122 * VM resources are unaffected until it is closed (see
1119 * iput and iclose). 1123 * iput and iclose).
1120 * 1124 *
1121 * PARAMETER: 1125 * PARAMETER:
1122 * 1126 *
1123 * RETURN: 1127 * RETURN:
1124 * 1128 *
1125 * serialization: 1129 * serialization:
1126 * on entry the inode lock on each segment is assumed 1130 * on entry the inode lock on each segment is assumed
1127 * to be held. 1131 * to be held.
1128 * 1132 *
1129 * i/o error: 1133 * i/o error:
1130 */ 1134 */
@@ -1175,7 +1179,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1175 if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) 1179 if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
1176 tblk->xflag |= COMMIT_LAZY; 1180 tblk->xflag |= COMMIT_LAZY;
1177 /* 1181 /*
1178 * prepare non-journaled objects for commit 1182 * prepare non-journaled objects for commit
1179 * 1183 *
1180 * flush data pages of non-journaled file 1184 * flush data pages of non-journaled file
1181 * to prevent the file getting non-initialized disk blocks 1185 * to prevent the file getting non-initialized disk blocks
@@ -1186,7 +1190,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1186 cd.nip = nip; 1190 cd.nip = nip;
1187 1191
1188 /* 1192 /*
1189 * acquire transaction lock on (on-disk) inodes 1193 * acquire transaction lock on (on-disk) inodes
1190 * 1194 *
1191 * update on-disk inode from in-memory inode 1195 * update on-disk inode from in-memory inode
1192 * acquiring transaction locks for AFTER records 1196 * acquiring transaction locks for AFTER records
@@ -1262,7 +1266,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1262 } 1266 }
1263 1267
1264 /* 1268 /*
1265 * write log records from transaction locks 1269 * write log records from transaction locks
1266 * 1270 *
1267 * txUpdateMap() resets XAD_NEW in XAD. 1271 * txUpdateMap() resets XAD_NEW in XAD.
1268 */ 1272 */
@@ -1294,7 +1298,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1294 !test_cflag(COMMIT_Nolink, tblk->u.ip))); 1298 !test_cflag(COMMIT_Nolink, tblk->u.ip)));
1295 1299
1296 /* 1300 /*
1297 * write COMMIT log record 1301 * write COMMIT log record
1298 */ 1302 */
1299 lrd->type = cpu_to_le16(LOG_COMMIT); 1303 lrd->type = cpu_to_le16(LOG_COMMIT);
1300 lrd->length = 0; 1304 lrd->length = 0;
@@ -1303,7 +1307,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1303 lmGroupCommit(log, tblk); 1307 lmGroupCommit(log, tblk);
1304 1308
1305 /* 1309 /*
1306 * - transaction is now committed - 1310 * - transaction is now committed -
1307 */ 1311 */
1308 1312
1309 /* 1313 /*
@@ -1314,11 +1318,11 @@ int txCommit(tid_t tid, /* transaction identifier */
1314 txForce(tblk); 1318 txForce(tblk);
1315 1319
1316 /* 1320 /*
1317 * update allocation map. 1321 * update allocation map.
1318 * 1322 *
1319 * update inode allocation map and inode: 1323 * update inode allocation map and inode:
1320 * free pager lock on memory object of inode if any. 1324 * free pager lock on memory object of inode if any.
1321 * update block allocation map. 1325 * update block allocation map.
1322 * 1326 *
1323 * txUpdateMap() resets XAD_NEW in XAD. 1327 * txUpdateMap() resets XAD_NEW in XAD.
1324 */ 1328 */
@@ -1326,7 +1330,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1326 txUpdateMap(tblk); 1330 txUpdateMap(tblk);
1327 1331
1328 /* 1332 /*
1329 * free transaction locks and pageout/free pages 1333 * free transaction locks and pageout/free pages
1330 */ 1334 */
1331 txRelease(tblk); 1335 txRelease(tblk);
1332 1336
@@ -1335,7 +1339,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1335 1339
1336 1340
1337 /* 1341 /*
1338 * reset in-memory object state 1342 * reset in-memory object state
1339 */ 1343 */
1340 for (k = 0; k < cd.nip; k++) { 1344 for (k = 0; k < cd.nip; k++) {
1341 ip = cd.iplist[k]; 1345 ip = cd.iplist[k];
@@ -1358,11 +1362,11 @@ int txCommit(tid_t tid, /* transaction identifier */
1358} 1362}
1359 1363
1360/* 1364/*
1361 * NAME: txLog() 1365 * NAME: txLog()
1362 * 1366 *
1363 * FUNCTION: Writes AFTER log records for all lines modified 1367 * FUNCTION: Writes AFTER log records for all lines modified
1364 * by tid for segments specified by inodes in comdata. 1368 * by tid for segments specified by inodes in comdata.
1365 * Code assumes only WRITELOCKS are recorded in lockwords. 1369 * Code assumes only WRITELOCKS are recorded in lockwords.
1366 * 1370 *
1367 * PARAMETERS: 1371 * PARAMETERS:
1368 * 1372 *
@@ -1421,12 +1425,12 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
1421} 1425}
1422 1426
1423/* 1427/*
1424 * diLog() 1428 * diLog()
1425 * 1429 *
1426 * function: log inode tlock and format maplock to update bmap; 1430 * function: log inode tlock and format maplock to update bmap;
1427 */ 1431 */
1428static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1432static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1429 struct tlock * tlck, struct commit * cd) 1433 struct tlock * tlck, struct commit * cd)
1430{ 1434{
1431 int rc = 0; 1435 int rc = 0;
1432 struct metapage *mp; 1436 struct metapage *mp;
@@ -1442,7 +1446,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1442 pxd = &lrd->log.redopage.pxd; 1446 pxd = &lrd->log.redopage.pxd;
1443 1447
1444 /* 1448 /*
1445 * inode after image 1449 * inode after image
1446 */ 1450 */
1447 if (tlck->type & tlckENTRY) { 1451 if (tlck->type & tlckENTRY) {
1448 /* log after-image for logredo(): */ 1452 /* log after-image for logredo(): */
@@ -1456,7 +1460,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1456 tlck->flag |= tlckWRITEPAGE; 1460 tlck->flag |= tlckWRITEPAGE;
1457 } else if (tlck->type & tlckFREE) { 1461 } else if (tlck->type & tlckFREE) {
1458 /* 1462 /*
1459 * free inode extent 1463 * free inode extent
1460 * 1464 *
1461 * (pages of the freed inode extent have been invalidated and 1465 * (pages of the freed inode extent have been invalidated and
1462 * a maplock for free of the extent has been formatted at 1466 * a maplock for free of the extent has been formatted at
@@ -1498,7 +1502,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1498 jfs_err("diLog: UFO type tlck:0x%p", tlck); 1502 jfs_err("diLog: UFO type tlck:0x%p", tlck);
1499#ifdef _JFS_WIP 1503#ifdef _JFS_WIP
1500 /* 1504 /*
1501 * alloc/free external EA extent 1505 * alloc/free external EA extent
1502 * 1506 *
1503 * a maplock for txUpdateMap() to update bPWMAP for alloc/free 1507 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
1504 * of the extent has been formatted at txLock() time; 1508 * of the extent has been formatted at txLock() time;
@@ -1534,9 +1538,9 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1534} 1538}
1535 1539
1536/* 1540/*
1537 * dataLog() 1541 * dataLog()
1538 * 1542 *
1539 * function: log data tlock 1543 * function: log data tlock
1540 */ 1544 */
1541static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1545static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1542 struct tlock * tlck) 1546 struct tlock * tlck)
@@ -1580,9 +1584,9 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1580} 1584}
1581 1585
1582/* 1586/*
1583 * dtLog() 1587 * dtLog()
1584 * 1588 *
1585 * function: log dtree tlock and format maplock to update bmap; 1589 * function: log dtree tlock and format maplock to update bmap;
1586 */ 1590 */
1587static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1591static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1588 struct tlock * tlck) 1592 struct tlock * tlck)
@@ -1603,10 +1607,10 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1603 lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); 1607 lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1604 1608
1605 /* 1609 /*
1606 * page extension via relocation: entry insertion; 1610 * page extension via relocation: entry insertion;
1607 * page extension in-place: entry insertion; 1611 * page extension in-place: entry insertion;
1608 * new right page from page split, reinitialized in-line 1612 * new right page from page split, reinitialized in-line
1609 * root from root page split: entry insertion; 1613 * root from root page split: entry insertion;
1610 */ 1614 */
1611 if (tlck->type & (tlckNEW | tlckEXTEND)) { 1615 if (tlck->type & (tlckNEW | tlckEXTEND)) {
1612 /* log after-image of the new page for logredo(): 1616 /* log after-image of the new page for logredo():
@@ -1641,8 +1645,8 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1641 } 1645 }
1642 1646
1643 /* 1647 /*
1644 * entry insertion/deletion, 1648 * entry insertion/deletion,
1645 * sibling page link update (old right page before split); 1649 * sibling page link update (old right page before split);
1646 */ 1650 */
1647 if (tlck->type & (tlckENTRY | tlckRELINK)) { 1651 if (tlck->type & (tlckENTRY | tlckRELINK)) {
1648 /* log after-image for logredo(): */ 1652 /* log after-image for logredo(): */
@@ -1658,11 +1662,11 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1658 } 1662 }
1659 1663
1660 /* 1664 /*
1661 * page deletion: page has been invalidated 1665 * page deletion: page has been invalidated
1662 * page relocation: source extent 1666 * page relocation: source extent
1663 * 1667 *
1664 * a maplock for free of the page has been formatted 1668 * a maplock for free of the page has been formatted
 1665 * at txLock() time; 1669 * at txLock() time;
1666 */ 1670 */
1667 if (tlck->type & (tlckFREE | tlckRELOCATE)) { 1671 if (tlck->type & (tlckFREE | tlckRELOCATE)) {
1668 /* log LOG_NOREDOPAGE of the deleted page for logredo() 1672 /* log LOG_NOREDOPAGE of the deleted page for logredo()
@@ -1683,9 +1687,9 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1683} 1687}
1684 1688
1685/* 1689/*
1686 * xtLog() 1690 * xtLog()
1687 * 1691 *
1688 * function: log xtree tlock and format maplock to update bmap; 1692 * function: log xtree tlock and format maplock to update bmap;
1689 */ 1693 */
1690static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1694static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1691 struct tlock * tlck) 1695 struct tlock * tlck)
@@ -1725,8 +1729,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1725 xadlock = (struct xdlistlock *) maplock; 1729 xadlock = (struct xdlistlock *) maplock;
1726 1730
1727 /* 1731 /*
1728 * entry insertion/extension; 1732 * entry insertion/extension;
1729 * sibling page link update (old right page before split); 1733 * sibling page link update (old right page before split);
1730 */ 1734 */
1731 if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { 1735 if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
1732 /* log after-image for logredo(): 1736 /* log after-image for logredo():
@@ -1801,7 +1805,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1801 } 1805 }
1802 1806
1803 /* 1807 /*
1804 * page deletion: file deletion/truncation (ref. xtTruncate()) 1808 * page deletion: file deletion/truncation (ref. xtTruncate())
1805 * 1809 *
1806 * (page will be invalidated after log is written and bmap 1810 * (page will be invalidated after log is written and bmap
1807 * is updated from the page); 1811 * is updated from the page);
@@ -1908,13 +1912,13 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1908 } 1912 }
1909 1913
1910 /* 1914 /*
1911 * page/entry truncation: file truncation (ref. xtTruncate()) 1915 * page/entry truncation: file truncation (ref. xtTruncate())
1912 * 1916 *
1913 * |----------+------+------+---------------| 1917 * |----------+------+------+---------------|
1914 * | | | 1918 * | | |
1915 * | | hwm - hwm before truncation 1919 * | | hwm - hwm before truncation
1916 * | next - truncation point 1920 * | next - truncation point
1917 * lwm - lwm before truncation 1921 * lwm - lwm before truncation
1918 * header ? 1922 * header ?
1919 */ 1923 */
1920 if (tlck->type & tlckTRUNCATE) { 1924 if (tlck->type & tlckTRUNCATE) {
@@ -1937,7 +1941,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1937 twm = xtlck->twm.offset; 1941 twm = xtlck->twm.offset;
1938 1942
1939 /* 1943 /*
1940 * write log records 1944 * write log records
1941 */ 1945 */
1942 /* log after-image for logredo(): 1946 /* log after-image for logredo():
1943 * 1947 *
@@ -1997,7 +2001,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1997 } 2001 }
1998 2002
1999 /* 2003 /*
2000 * format maplock(s) for txUpdateMap() to update bmap 2004 * format maplock(s) for txUpdateMap() to update bmap
2001 */ 2005 */
2002 maplock->index = 0; 2006 maplock->index = 0;
2003 2007
@@ -2069,9 +2073,9 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2069} 2073}
2070 2074
2071/* 2075/*
2072 * mapLog() 2076 * mapLog()
2073 * 2077 *
2074 * function: log from maplock of freed data extents; 2078 * function: log from maplock of freed data extents;
2075 */ 2079 */
2076static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 2080static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2077 struct tlock * tlck) 2081 struct tlock * tlck)
@@ -2081,7 +2085,7 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2081 pxd_t *pxd; 2085 pxd_t *pxd;
2082 2086
2083 /* 2087 /*
2084 * page relocation: free the source page extent 2088 * page relocation: free the source page extent
2085 * 2089 *
2086 * a maplock for txUpdateMap() for free of the page 2090 * a maplock for txUpdateMap() for free of the page
2087 * has been formatted at txLock() time saving the src 2091 * has been formatted at txLock() time saving the src
@@ -2155,10 +2159,10 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2155} 2159}
2156 2160
2157/* 2161/*
2158 * txEA() 2162 * txEA()
2159 * 2163 *
2160 * function: acquire maplock for EA/ACL extents or 2164 * function: acquire maplock for EA/ACL extents or
2161 * set COMMIT_INLINE flag; 2165 * set COMMIT_INLINE flag;
2162 */ 2166 */
2163void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) 2167void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2164{ 2168{
@@ -2207,10 +2211,10 @@ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2207} 2211}
2208 2212
2209/* 2213/*
2210 * txForce() 2214 * txForce()
2211 * 2215 *
2212 * function: synchronously write pages locked by transaction 2216 * function: synchronously write pages locked by transaction
2213 * after txLog() but before txUpdateMap(); 2217 * after txLog() but before txUpdateMap();
2214 */ 2218 */
2215static void txForce(struct tblock * tblk) 2219static void txForce(struct tblock * tblk)
2216{ 2220{
@@ -2273,10 +2277,10 @@ static void txForce(struct tblock * tblk)
2273} 2277}
2274 2278
2275/* 2279/*
2276 * txUpdateMap() 2280 * txUpdateMap()
2277 * 2281 *
2278 * function: update persistent allocation map (and working map 2282 * function: update persistent allocation map (and working map
2279 * if appropriate); 2283 * if appropriate);
2280 * 2284 *
2281 * parameter: 2285 * parameter:
2282 */ 2286 */
@@ -2298,7 +2302,7 @@ static void txUpdateMap(struct tblock * tblk)
2298 2302
2299 2303
2300 /* 2304 /*
2301 * update block allocation map 2305 * update block allocation map
2302 * 2306 *
2303 * update allocation state in pmap (and wmap) and 2307 * update allocation state in pmap (and wmap) and
2304 * update lsn of the pmap page; 2308 * update lsn of the pmap page;
@@ -2382,7 +2386,7 @@ static void txUpdateMap(struct tblock * tblk)
2382 } 2386 }
2383 } 2387 }
2384 /* 2388 /*
2385 * update inode allocation map 2389 * update inode allocation map
2386 * 2390 *
2387 * update allocation state in pmap and 2391 * update allocation state in pmap and
2388 * update lsn of the pmap page; 2392 * update lsn of the pmap page;
@@ -2407,24 +2411,24 @@ static void txUpdateMap(struct tblock * tblk)
2407} 2411}
2408 2412
2409/* 2413/*
2410 * txAllocPMap() 2414 * txAllocPMap()
2411 * 2415 *
2412 * function: allocate from persistent map; 2416 * function: allocate from persistent map;
2413 * 2417 *
2414 * parameter: 2418 * parameter:
2415 * ipbmap - 2419 * ipbmap -
 2416 * maplock - 2420 * maplock -
2417 * xad list: 2421 * xad list:
2418 * pxd: 2422 * pxd:
2419 * 2423 *
2420 * maptype - 2424 * maptype -
2421 * allocate from persistent map; 2425 * allocate from persistent map;
2422 * free from persistent map; 2426 * free from persistent map;
 2423 * (e.g., tmp file - free from working map at release 2427 * (e.g., tmp file - free from working map at release
2424 * of last reference); 2428 * of last reference);
2425 * free from persistent and working map; 2429 * free from persistent and working map;
2426 * 2430 *
2427 * lsn - log sequence number; 2431 * lsn - log sequence number;
2428 */ 2432 */
2429static void txAllocPMap(struct inode *ip, struct maplock * maplock, 2433static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2430 struct tblock * tblk) 2434 struct tblock * tblk)
@@ -2478,9 +2482,9 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2478} 2482}
2479 2483
2480/* 2484/*
2481 * txFreeMap() 2485 * txFreeMap()
2482 * 2486 *
2483 * function: free from persistent and/or working map; 2487 * function: free from persistent and/or working map;
2484 * 2488 *
2485 * todo: optimization 2489 * todo: optimization
2486 */ 2490 */
@@ -2579,9 +2583,9 @@ void txFreeMap(struct inode *ip,
2579} 2583}
2580 2584
2581/* 2585/*
2582 * txFreelock() 2586 * txFreelock()
2583 * 2587 *
2584 * function: remove tlock from inode anonymous locklist 2588 * function: remove tlock from inode anonymous locklist
2585 */ 2589 */
2586void txFreelock(struct inode *ip) 2590void txFreelock(struct inode *ip)
2587{ 2591{
@@ -2619,7 +2623,7 @@ void txFreelock(struct inode *ip)
2619} 2623}
2620 2624
2621/* 2625/*
2622 * txAbort() 2626 * txAbort()
2623 * 2627 *
2624 * function: abort tx before commit; 2628 * function: abort tx before commit;
2625 * 2629 *
@@ -2679,7 +2683,7 @@ void txAbort(tid_t tid, int dirty)
2679} 2683}
2680 2684
2681/* 2685/*
2682 * txLazyCommit(void) 2686 * txLazyCommit(void)
2683 * 2687 *
2684 * All transactions except those changing ipimap (COMMIT_FORCE) are 2688 * All transactions except those changing ipimap (COMMIT_FORCE) are
 2685 * processed by this routine. This ensures that the inode and block 2689 * processed by this routine. This ensures that the inode and block
@@ -2728,7 +2732,7 @@ static void txLazyCommit(struct tblock * tblk)
2728} 2732}
2729 2733
2730/* 2734/*
2731 * jfs_lazycommit(void) 2735 * jfs_lazycommit(void)
2732 * 2736 *
2733 * To be run as a kernel daemon. If lbmIODone is called in an interrupt 2737 * To be run as a kernel daemon. If lbmIODone is called in an interrupt
2734 * context, or where blocking is not wanted, this routine will process 2738 * context, or where blocking is not wanted, this routine will process
@@ -2913,7 +2917,7 @@ void txResume(struct super_block *sb)
2913} 2917}
2914 2918
2915/* 2919/*
2916 * jfs_sync(void) 2920 * jfs_sync(void)
2917 * 2921 *
2918 * To be run as a kernel daemon. This is awakened when tlocks run low. 2922 * To be run as a kernel daemon. This is awakened when tlocks run low.
2919 * We write any inodes that have anonymous tlocks so they will become 2923 * We write any inodes that have anonymous tlocks so they will become
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h
index 7863cf21afca..ab7288937019 100644
--- a/fs/jfs/jfs_txnmgr.h
+++ b/fs/jfs/jfs_txnmgr.h
@@ -94,7 +94,7 @@ extern struct tblock *TxBlock; /* transaction block table */
94 */ 94 */
95struct tlock { 95struct tlock {
96 lid_t next; /* 2: index next lockword on tid locklist 96 lid_t next; /* 2: index next lockword on tid locklist
97 * next lockword on freelist 97 * next lockword on freelist
98 */ 98 */
99 tid_t tid; /* 2: transaction id holding lock */ 99 tid_t tid; /* 2: transaction id holding lock */
100 100
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 09b252958687..649f9817accd 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -21,7 +21,7 @@
21/* 21/*
22 * jfs_types.h: 22 * jfs_types.h:
23 * 23 *
24 * basic type/utility definitions 24 * basic type/utility definitions
25 * 25 *
26 * note: this header file must be the 1st include file 26 * note: this header file must be the 1st include file
27 * of JFS include list in all JFS .c file. 27 * of JFS include list in all JFS .c file.
@@ -54,8 +54,8 @@ struct timestruc_t {
54 */ 54 */
55 55
56#define LEFTMOSTONE 0x80000000 56#define LEFTMOSTONE 0x80000000
57#define HIGHORDER 0x80000000u /* high order bit on */ 57#define HIGHORDER 0x80000000u /* high order bit on */
 58#define ONES 0xffffffffu /* all bits on */ 58#define ONES 0xffffffffu /* all bits on */
59 59
60/* 60/*
61 * logical xd (lxd) 61 * logical xd (lxd)
@@ -148,7 +148,7 @@ typedef struct {
148#define sizeDXD(dxd) le32_to_cpu((dxd)->size) 148#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
149 149
150/* 150/*
151 * directory entry argument 151 * directory entry argument
152 */ 152 */
153struct component_name { 153struct component_name {
154 int namlen; 154 int namlen;
@@ -160,14 +160,14 @@ struct component_name {
160 * DASD limit information - stored in directory inode 160 * DASD limit information - stored in directory inode
161 */ 161 */
162struct dasd { 162struct dasd {
163 u8 thresh; /* Alert Threshold (in percent) */ 163 u8 thresh; /* Alert Threshold (in percent) */
164 u8 delta; /* Alert Threshold delta (in percent) */ 164 u8 delta; /* Alert Threshold delta (in percent) */
165 u8 rsrvd1; 165 u8 rsrvd1;
166 u8 limit_hi; /* DASD limit (in logical blocks) */ 166 u8 limit_hi; /* DASD limit (in logical blocks) */
167 __le32 limit_lo; /* DASD limit (in logical blocks) */ 167 __le32 limit_lo; /* DASD limit (in logical blocks) */
168 u8 rsrvd2[3]; 168 u8 rsrvd2[3];
169 u8 used_hi; /* DASD usage (in logical blocks) */ 169 u8 used_hi; /* DASD usage (in logical blocks) */
170 __le32 used_lo; /* DASD usage (in logical blocks) */ 170 __le32 used_lo; /* DASD usage (in logical blocks) */
171}; 171};
172 172
173#define DASDLIMIT(dasdp) \ 173#define DASDLIMIT(dasdp) \
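
Both the limit and the usage in struct dasd are split into an 8-bit high part and a little-endian 32-bit low part; DASDLIMIT(), whose body falls outside this hunk, presumably stitches them back into one 40-bit block count. A sketch of that recombination, written as a function for readability (the exact expression is an assumption, not a quote of the macro):

static inline u64 dasd_limit_sketch(const struct dasd *dasdp)
{
	/* assumed recombination of the hi/lo pair into a 40-bit block count */
	return ((u64) dasdp->limit_hi << 32) | le32_to_cpu(dasdp->limit_lo);
}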
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index a386f48c73fc..7971f37534a3 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -60,7 +60,7 @@ int jfs_umount(struct super_block *sb)
60 jfs_info("UnMount JFS: sb:0x%p", sb); 60 jfs_info("UnMount JFS: sb:0x%p", sb);
61 61
62 /* 62 /*
63 * update superblock and close log 63 * update superblock and close log
64 * 64 *
65 * if mounted read-write and log based recovery was enabled 65 * if mounted read-write and log based recovery was enabled
66 */ 66 */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index acc97c46d8a4..1543906a2e0d 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -16,7 +16,7 @@
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18/* 18/*
19 * jfs_xtree.c: extent allocation descriptor B+-tree manager 19 * jfs_xtree.c: extent allocation descriptor B+-tree manager
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
@@ -32,30 +32,30 @@
32/* 32/*
33 * xtree local flag 33 * xtree local flag
34 */ 34 */
35#define XT_INSERT 0x00000001 35#define XT_INSERT 0x00000001
36 36
37/* 37/*
38 * xtree key/entry comparison: extent offset 38 * xtree key/entry comparison: extent offset
39 * 39 *
40 * return: 40 * return:
41 * -1: k < start of extent 41 * -1: k < start of extent
42 * 0: start_of_extent <= k <= end_of_extent 42 * 0: start_of_extent <= k <= end_of_extent
43 * 1: k > end_of_extent 43 * 1: k > end_of_extent
44 */ 44 */
45#define XT_CMP(CMP, K, X, OFFSET64)\ 45#define XT_CMP(CMP, K, X, OFFSET64)\
46{\ 46{\
47 OFFSET64 = offsetXAD(X);\ 47 OFFSET64 = offsetXAD(X);\
48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ 48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
49 ((K) < OFFSET64) ? -1 : 0;\ 49 ((K) < OFFSET64) ? -1 : 0;\
50} 50}
51 51
52/* write a xad entry */ 52/* write a xad entry */
53#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ 53#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
54{\ 54{\
55 (XAD)->flag = (FLAG);\ 55 (XAD)->flag = (FLAG);\
56 XADoffset((XAD), (OFF));\ 56 XADoffset((XAD), (OFF));\
57 XADlength((XAD), (LEN));\ 57 XADlength((XAD), (LEN));\
58 XADaddress((XAD), (ADDR));\ 58 XADaddress((XAD), (ADDR));\
59} 59}
60 60
61#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) 61#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
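
XT_CMP() above is a statement macro; read as a function, the three-way comparison it performs is simply a range test against the extent described by the xad. An illustrative function form (the tree code itself uses the macro, this is only a restatement):

static inline int xt_cmp_sketch(s64 k, xad_t *x)
{
	s64 off = offsetXAD(x);

	if (k < off)
		return -1;			/* k below the extent */
	if (k >= off + lengthXAD(x))
		return 1;			/* k past the end of the extent */
	return 0;				/* start <= k <= end: hit */
}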
@@ -76,13 +76,13 @@
76 MP = NULL;\ 76 MP = NULL;\
77 RC = -EIO;\ 77 RC = -EIO;\
78 }\ 78 }\
79 }\ 79 }\
80} 80}
81 81
82/* for consistency */ 82/* for consistency */
83#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) 83#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
84 84
85#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ 85#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) 86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
87/* xtree entry parameter descriptor */ 87/* xtree entry parameter descriptor */
88struct xtsplit { 88struct xtsplit {
@@ -97,7 +97,7 @@ struct xtsplit {
97 97
98 98
99/* 99/*
100 * statistics 100 * statistics
101 */ 101 */
102#ifdef CONFIG_JFS_STATISTICS 102#ifdef CONFIG_JFS_STATISTICS
103static struct { 103static struct {
@@ -136,7 +136,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
136#endif /* _STILL_TO_PORT */ 136#endif /* _STILL_TO_PORT */
137 137
138/* 138/*
139 * xtLookup() 139 * xtLookup()
140 * 140 *
141 * function: map a single page into a physical extent; 141 * function: map a single page into a physical extent;
142 */ 142 */
@@ -179,7 +179,7 @@ int xtLookup(struct inode *ip, s64 lstart,
179 } 179 }
180 180
181 /* 181 /*
182 * compute the physical extent covering logical extent 182 * compute the physical extent covering logical extent
183 * 183 *
184 * N.B. search may have failed (e.g., hole in sparse file), 184 * N.B. search may have failed (e.g., hole in sparse file),
185 * and returned the index of the next entry. 185 * and returned the index of the next entry.
@@ -220,27 +220,27 @@ int xtLookup(struct inode *ip, s64 lstart,
220 220
221 221
222/* 222/*
223 * xtLookupList() 223 * xtLookupList()
224 * 224 *
 225 * function: map a single logical extent into a list of physical extents; 225 * function: map a single logical extent into a list of physical extents;
226 * 226 *
227 * parameter: 227 * parameter:
228 * struct inode *ip, 228 * struct inode *ip,
229 * struct lxdlist *lxdlist, lxd list (in) 229 * struct lxdlist *lxdlist, lxd list (in)
230 * struct xadlist *xadlist, xad list (in/out) 230 * struct xadlist *xadlist, xad list (in/out)
231 * int flag) 231 * int flag)
232 * 232 *
233 * coverage of lxd by xad under assumption of 233 * coverage of lxd by xad under assumption of
234 * . lxd's are ordered and disjoint. 234 * . lxd's are ordered and disjoint.
235 * . xad's are ordered and disjoint. 235 * . xad's are ordered and disjoint.
236 * 236 *
237 * return: 237 * return:
238 * 0: success 238 * 0: success
239 * 239 *
240 * note: a page being written (even a single byte) is backed fully, 240 * note: a page being written (even a single byte) is backed fully,
241 * except the last page which is only backed with blocks 241 * except the last page which is only backed with blocks
242 * required to cover the last byte; 242 * required to cover the last byte;
243 * the extent backing a page is fully contained within an xad; 243 * the extent backing a page is fully contained within an xad;
244 */ 244 */
245int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, 245int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
246 struct xadlist * xadlist, int flag) 246 struct xadlist * xadlist, int flag)
@@ -284,7 +284,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
284 return rc; 284 return rc;
285 285
286 /* 286 /*
287 * compute the physical extent covering logical extent 287 * compute the physical extent covering logical extent
288 * 288 *
289 * N.B. search may have failed (e.g., hole in sparse file), 289 * N.B. search may have failed (e.g., hole in sparse file),
290 * and returned the index of the next entry. 290 * and returned the index of the next entry.
@@ -343,7 +343,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
343 if (lstart >= size) 343 if (lstart >= size)
344 goto mapend; 344 goto mapend;
345 345
346 /* compare with the current xad */ 346 /* compare with the current xad */
347 goto compare1; 347 goto compare1;
348 } 348 }
349 /* lxd is covered by xad */ 349 /* lxd is covered by xad */
@@ -430,7 +430,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
430 /* 430 /*
431 * lxd is partially covered by xad 431 * lxd is partially covered by xad
432 */ 432 */
433 else { /* (xend < lend) */ 433 else { /* (xend < lend) */
434 434
435 /* 435 /*
436 * get next xad 436 * get next xad
@@ -477,22 +477,22 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
477 477
478 478
479/* 479/*
480 * xtSearch() 480 * xtSearch()
481 * 481 *
482 * function: search for the xad entry covering specified offset. 482 * function: search for the xad entry covering specified offset.
483 * 483 *
484 * parameters: 484 * parameters:
485 * ip - file object; 485 * ip - file object;
486 * xoff - extent offset; 486 * xoff - extent offset;
487 * nextp - address of next extent (if any) for search miss 487 * nextp - address of next extent (if any) for search miss
488 * cmpp - comparison result: 488 * cmpp - comparison result:
489 * btstack - traverse stack; 489 * btstack - traverse stack;
490 * flag - search process flag (XT_INSERT); 490 * flag - search process flag (XT_INSERT);
491 * 491 *
492 * returns: 492 * returns:
493 * btstack contains (bn, index) of search path traversed to the entry. 493 * btstack contains (bn, index) of search path traversed to the entry.
494 * *cmpp is set to result of comparison with the entry returned. 494 * *cmpp is set to result of comparison with the entry returned.
495 * the page containing the entry is pinned at exit. 495 * the page containing the entry is pinned at exit.
496 */ 496 */
497static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, 497static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
498 int *cmpp, struct btstack * btstack, int flag) 498 int *cmpp, struct btstack * btstack, int flag)
@@ -517,7 +517,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
517 btstack->nsplit = 0; 517 btstack->nsplit = 0;
518 518
519 /* 519 /*
520 * search down tree from root: 520 * search down tree from root:
521 * 521 *
522 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 522 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
523 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 523 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -642,7 +642,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
642 XT_CMP(cmp, xoff, &p->xad[index], t64); 642 XT_CMP(cmp, xoff, &p->xad[index], t64);
643 if (cmp == 0) { 643 if (cmp == 0) {
644 /* 644 /*
645 * search hit 645 * search hit
646 */ 646 */
647 /* search hit - leaf page: 647 /* search hit - leaf page:
648 * return the entry found 648 * return the entry found
@@ -692,7 +692,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
692 } 692 }
693 693
694 /* 694 /*
695 * search miss 695 * search miss
696 * 696 *
697 * base is the smallest index with key (Kj) greater than 697 * base is the smallest index with key (Kj) greater than
698 * search key (K) and may be zero or maxentry index. 698 * search key (K) and may be zero or maxentry index.
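
The miss case above feeds the descent rule documented earlier: child Pi covers Ki <= K < Kj, so on a miss the search continues in the child at base - 1. A hedged sketch of the per-page step (names mirror the surrounding context; the real loop in xtSearch() is structured differently, this only restates the comparison logic):

static int xt_page_step_sketch(xtpage_t *p, s64 k, int *indexp)
{
	int lo = XTENTRYSTART;
	int hi = le16_to_cpu(p->header.nextindex) - 1;

	while (lo <= hi) {
		int mid = (lo + hi) >> 1;
		s64 off = offsetXAD(&p->xad[mid]);

		if (k < off)
			hi = mid - 1;
		else if (k >= off + lengthXAD(&p->xad[mid]))
			lo = mid + 1;
		else {
			*indexp = mid;		/* search hit */
			return 0;
		}
	}
	*indexp = lo - 1;			/* search miss: descend via entry base - 1 */
	return -1;
}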
@@ -773,22 +773,22 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
773} 773}
774 774
775/* 775/*
776 * xtInsert() 776 * xtInsert()
777 * 777 *
778 * function: 778 * function:
779 * 779 *
780 * parameter: 780 * parameter:
781 * tid - transaction id; 781 * tid - transaction id;
782 * ip - file object; 782 * ip - file object;
783 * xflag - extent flag (XAD_NOTRECORDED): 783 * xflag - extent flag (XAD_NOTRECORDED):
784 * xoff - extent offset; 784 * xoff - extent offset;
785 * xlen - extent length; 785 * xlen - extent length;
786 * xaddrp - extent address pointer (in/out): 786 * xaddrp - extent address pointer (in/out):
787 * if (*xaddrp) 787 * if (*xaddrp)
788 * caller allocated data extent at *xaddrp; 788 * caller allocated data extent at *xaddrp;
789 * else 789 * else
790 * allocate data extent and return its xaddr; 790 * allocate data extent and return its xaddr;
791 * flag - 791 * flag -
792 * 792 *
793 * return: 793 * return:
794 */ 794 */
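
The xaddrp parameter is the interesting one: a caller that has already allocated the data extent passes its address in, otherwise it passes 0 and xtInsert() allocates the extent and returns the address. A hedged usage sketch (the trailing part of the signature is assumed from jfs_xtree.h rather than shown in this hunk):

static int xt_insert_usage_sketch(tid_t tid, struct inode *ip, s64 xoff, int xlen)
{
	s64 xaddr = 0;	/* 0: let xtInsert() allocate the data extent itself */
	int rc;

	rc = xtInsert(tid, ip, 0 /* xflag */, xoff, xlen, &xaddr, 0);
	if (rc == 0)
		jfs_info("extent of %d blocks recorded at 0x%lx", xlen, (ulong) xaddr);
	return rc;
}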
@@ -813,7 +813,7 @@ int xtInsert(tid_t tid, /* transaction id */
813 jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); 813 jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
814 814
815 /* 815 /*
816 * search for the entry location at which to insert: 816 * search for the entry location at which to insert:
817 * 817 *
 818 * xtFastSearch() and xtSearch() both return (leaf page 818 * xtFastSearch() and xtSearch() both return (leaf page
819 * pinned, index at which to insert). 819 * pinned, index at which to insert).
@@ -853,13 +853,13 @@ int xtInsert(tid_t tid, /* transaction id */
853 } 853 }
854 854
855 /* 855 /*
856 * insert entry for new extent 856 * insert entry for new extent
857 */ 857 */
858 xflag |= XAD_NEW; 858 xflag |= XAD_NEW;
859 859
860 /* 860 /*
861 * if the leaf page is full, split the page and 861 * if the leaf page is full, split the page and
862 * propagate up the router entry for the new page from split 862 * propagate up the router entry for the new page from split
863 * 863 *
864 * The xtSplitUp() will insert the entry and unpin the leaf page. 864 * The xtSplitUp() will insert the entry and unpin the leaf page.
865 */ 865 */
@@ -886,7 +886,7 @@ int xtInsert(tid_t tid, /* transaction id */
886 } 886 }
887 887
888 /* 888 /*
889 * insert the new entry into the leaf page 889 * insert the new entry into the leaf page
890 */ 890 */
891 /* 891 /*
892 * acquire a transaction lock on the leaf page; 892 * acquire a transaction lock on the leaf page;
@@ -930,16 +930,16 @@ int xtInsert(tid_t tid, /* transaction id */
930 930
931 931
932/* 932/*
933 * xtSplitUp() 933 * xtSplitUp()
934 * 934 *
935 * function: 935 * function:
936 * split full pages as propagating insertion up the tree 936 * split full pages as propagating insertion up the tree
937 * 937 *
938 * parameter: 938 * parameter:
939 * tid - transaction id; 939 * tid - transaction id;
940 * ip - file object; 940 * ip - file object;
941 * split - entry parameter descriptor; 941 * split - entry parameter descriptor;
942 * btstack - traverse stack from xtSearch() 942 * btstack - traverse stack from xtSearch()
943 * 943 *
944 * return: 944 * return:
945 */ 945 */
@@ -1199,22 +1199,22 @@ xtSplitUp(tid_t tid,
1199 1199
1200 1200
1201/* 1201/*
1202 * xtSplitPage() 1202 * xtSplitPage()
1203 * 1203 *
1204 * function: 1204 * function:
1205 * split a full non-root page into 1205 * split a full non-root page into
1206 * original/split/left page and new right page 1206 * original/split/left page and new right page
1207 * i.e., the original/split page remains as left page. 1207 * i.e., the original/split page remains as left page.
1208 * 1208 *
1209 * parameter: 1209 * parameter:
1210 * int tid, 1210 * int tid,
1211 * struct inode *ip, 1211 * struct inode *ip,
1212 * struct xtsplit *split, 1212 * struct xtsplit *split,
1213 * struct metapage **rmpp, 1213 * struct metapage **rmpp,
1214 * u64 *rbnp, 1214 * u64 *rbnp,
1215 * 1215 *
1216 * return: 1216 * return:
1217 * Pointer to page in which to insert or NULL on error. 1217 * Pointer to page in which to insert or NULL on error.
1218 */ 1218 */
1219static int 1219static int
1220xtSplitPage(tid_t tid, struct inode *ip, 1220xtSplitPage(tid_t tid, struct inode *ip,
@@ -1248,9 +1248,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
1248 rbn = addressPXD(pxd); 1248 rbn = addressPXD(pxd);
1249 1249
1250 /* Allocate blocks to quota. */ 1250 /* Allocate blocks to quota. */
1251 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { 1251 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
1252 rc = -EDQUOT; 1252 rc = -EDQUOT;
1253 goto clean_up; 1253 goto clean_up;
1254 } 1254 }
1255 1255
1256 quota_allocation += lengthPXD(pxd); 1256 quota_allocation += lengthPXD(pxd);
@@ -1304,7 +1304,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1304 skip = split->index; 1304 skip = split->index;
1305 1305
1306 /* 1306 /*
1307 * sequential append at tail (after last entry of last page) 1307 * sequential append at tail (after last entry of last page)
1308 * 1308 *
1309 * if splitting the last page on a level because of appending 1309 * if splitting the last page on a level because of appending
 1310 * an entry to it (skip is maxentry), it's likely that the access is 1310 * an entry to it (skip is maxentry), it's likely that the access is
@@ -1342,7 +1342,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1342 } 1342 }
1343 1343
1344 /* 1344 /*
1345 * non-sequential insert (at possibly middle page) 1345 * non-sequential insert (at possibly middle page)
1346 */ 1346 */
1347 1347
1348 /* 1348 /*
@@ -1465,25 +1465,24 @@ xtSplitPage(tid_t tid, struct inode *ip,
1465 1465
1466 1466
1467/* 1467/*
1468 * xtSplitRoot() 1468 * xtSplitRoot()
1469 * 1469 *
1470 * function: 1470 * function:
1471 * split the full root page into 1471 * split the full root page into original/root/split page and new
1472 * original/root/split page and new right page 1472 * right page
1473 * i.e., root remains fixed in tree anchor (inode) and 1473 * i.e., root remains fixed in tree anchor (inode) and the root is
1474 * the root is copied to a single new right child page 1474 * copied to a single new right child page since root page <<
1475 * since root page << non-root page, and 1475 * non-root page, and the split root page contains a single entry
1476 * the split root page contains a single entry for the 1476 * for the new right child page.
1477 * new right child page.
1478 * 1477 *
1479 * parameter: 1478 * parameter:
1480 * int tid, 1479 * int tid,
1481 * struct inode *ip, 1480 * struct inode *ip,
1482 * struct xtsplit *split, 1481 * struct xtsplit *split,
1483 * struct metapage **rmpp) 1482 * struct metapage **rmpp)
1484 * 1483 *
1485 * return: 1484 * return:
1486 * Pointer to page in which to insert or NULL on error. 1485 * Pointer to page in which to insert or NULL on error.
1487 */ 1486 */
1488static int 1487static int
1489xtSplitRoot(tid_t tid, 1488xtSplitRoot(tid_t tid,
@@ -1505,7 +1504,7 @@ xtSplitRoot(tid_t tid,
1505 INCREMENT(xtStat.split); 1504 INCREMENT(xtStat.split);
1506 1505
1507 /* 1506 /*
1508 * allocate a single (right) child page 1507 * allocate a single (right) child page
1509 */ 1508 */
1510 pxdlist = split->pxdlist; 1509 pxdlist = split->pxdlist;
1511 pxd = &pxdlist->pxd[pxdlist->npxd]; 1510 pxd = &pxdlist->pxd[pxdlist->npxd];
@@ -1573,7 +1572,7 @@ xtSplitRoot(tid_t tid,
1573 } 1572 }
1574 1573
1575 /* 1574 /*
1576 * reset the root 1575 * reset the root
1577 * 1576 *
1578 * init root with the single entry for the new right page 1577 * init root with the single entry for the new right page
 1579 * set the 1st entry offset to 0, which forces the left-most key 1578 * set the 1st entry offset to 0, which forces the left-most key
@@ -1610,7 +1609,7 @@ xtSplitRoot(tid_t tid,
1610 1609
1611 1610
1612/* 1611/*
1613 * xtExtend() 1612 * xtExtend()
1614 * 1613 *
1615 * function: extend in-place; 1614 * function: extend in-place;
1616 * 1615 *
@@ -1677,7 +1676,7 @@ int xtExtend(tid_t tid, /* transaction id */
1677 goto extendOld; 1676 goto extendOld;
1678 1677
1679 /* 1678 /*
1680 * extent overflow: insert entry for new extent 1679 * extent overflow: insert entry for new extent
1681 */ 1680 */
1682//insertNew: 1681//insertNew:
1683 xoff = offsetXAD(xad) + MAXXLEN; 1682 xoff = offsetXAD(xad) + MAXXLEN;
@@ -1685,8 +1684,8 @@ int xtExtend(tid_t tid, /* transaction id */
1685 nextindex = le16_to_cpu(p->header.nextindex); 1684 nextindex = le16_to_cpu(p->header.nextindex);
1686 1685
1687 /* 1686 /*
1688 * if the leaf page is full, insert the new entry and 1687 * if the leaf page is full, insert the new entry and
1689 * propagate up the router entry for the new page from split 1688 * propagate up the router entry for the new page from split
1690 * 1689 *
1691 * The xtSplitUp() will insert the entry and unpin the leaf page. 1690 * The xtSplitUp() will insert the entry and unpin the leaf page.
1692 */ 1691 */
@@ -1731,7 +1730,7 @@ int xtExtend(tid_t tid, /* transaction id */
1731 } 1730 }
1732 } 1731 }
1733 /* 1732 /*
1734 * insert the new entry into the leaf page 1733 * insert the new entry into the leaf page
1735 */ 1734 */
1736 else { 1735 else {
1737 /* insert the new entry: mark the entry NEW */ 1736 /* insert the new entry: mark the entry NEW */
@@ -1771,11 +1770,11 @@ int xtExtend(tid_t tid, /* transaction id */
1771 1770
1772#ifdef _NOTYET 1771#ifdef _NOTYET
1773/* 1772/*
1774 * xtTailgate() 1773 * xtTailgate()
1775 * 1774 *
1776 * function: split existing 'tail' extent 1775 * function: split existing 'tail' extent
1777 * (split offset >= start offset of tail extent), and 1776 * (split offset >= start offset of tail extent), and
1778 * relocate and extend the split tail half; 1777 * relocate and extend the split tail half;
1779 * 1778 *
1780 * note: existing extent may or may not have been committed. 1779 * note: existing extent may or may not have been committed.
1781 * caller is responsible for pager buffer cache update, and 1780 * caller is responsible for pager buffer cache update, and
@@ -1804,7 +1803,7 @@ int xtTailgate(tid_t tid, /* transaction id */
1804 1803
1805/* 1804/*
1806printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", 1805printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1807 (ulong)xoff, xlen, (ulong)xaddr); 1806 (ulong)xoff, xlen, (ulong)xaddr);
1808*/ 1807*/
1809 1808
1810 /* there must exist extent to be tailgated */ 1809 /* there must exist extent to be tailgated */
@@ -1842,18 +1841,18 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1842 xad = &p->xad[index]; 1841 xad = &p->xad[index];
1843/* 1842/*
1844printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", 1843printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1845 (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); 1844 (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
1846*/ 1845*/
1847 if ((llen = xoff - offsetXAD(xad)) == 0) 1846 if ((llen = xoff - offsetXAD(xad)) == 0)
1848 goto updateOld; 1847 goto updateOld;
1849 1848
1850 /* 1849 /*
1851 * partially replace extent: insert entry for new extent 1850 * partially replace extent: insert entry for new extent
1852 */ 1851 */
1853//insertNew: 1852//insertNew:
1854 /* 1853 /*
1855 * if the leaf page is full, insert the new entry and 1854 * if the leaf page is full, insert the new entry and
1856 * propagate up the router entry for the new page from split 1855 * propagate up the router entry for the new page from split
1857 * 1856 *
1858 * The xtSplitUp() will insert the entry and unpin the leaf page. 1857 * The xtSplitUp() will insert the entry and unpin the leaf page.
1859 */ 1858 */
@@ -1898,7 +1897,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1898 } 1897 }
1899 } 1898 }
1900 /* 1899 /*
1901 * insert the new entry into the leaf page 1900 * insert the new entry into the leaf page
1902 */ 1901 */
1903 else { 1902 else {
1904 /* insert the new entry: mark the entry NEW */ 1903 /* insert the new entry: mark the entry NEW */
@@ -1955,17 +1954,17 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1955#endif /* _NOTYET */ 1954#endif /* _NOTYET */
1956 1955
1957/* 1956/*
1958 * xtUpdate() 1957 * xtUpdate()
1959 * 1958 *
1960 * function: update XAD; 1959 * function: update XAD;
1961 * 1960 *
1962 * update extent for allocated_but_not_recorded or 1961 * update extent for allocated_but_not_recorded or
1963 * compressed extent; 1962 * compressed extent;
1964 * 1963 *
1965 * parameter: 1964 * parameter:
1966 * nxad - new XAD; 1965 * nxad - new XAD;
1967 * logical extent of the specified XAD must be completely 1966 * logical extent of the specified XAD must be completely
1968 * contained by an existing XAD; 1967 * contained by an existing XAD;
1969 */ 1968 */
1970int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) 1969int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1971{ /* new XAD */ 1970{ /* new XAD */
@@ -2416,19 +2415,19 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p);
2416 2415
2417 2416
2418/* 2417/*
2419 * xtAppend() 2418 * xtAppend()
2420 * 2419 *
 2421 * function: grow in append mode from the specified contiguous region; 2420 * function: grow in append mode from the specified contiguous region;
2422 * 2421 *
2423 * parameter: 2422 * parameter:
2424 * tid - transaction id; 2423 * tid - transaction id;
2425 * ip - file object; 2424 * ip - file object;
2426 * xflag - extent flag: 2425 * xflag - extent flag:
2427 * xoff - extent offset; 2426 * xoff - extent offset;
2428 * maxblocks - max extent length; 2427 * maxblocks - max extent length;
2429 * xlen - extent length (in/out); 2428 * xlen - extent length (in/out);
2430 * xaddrp - extent address pointer (in/out): 2429 * xaddrp - extent address pointer (in/out):
2431 * flag - 2430 * flag -
2432 * 2431 *
2433 * return: 2432 * return:
2434 */ 2433 */
@@ -2460,7 +2459,7 @@ int xtAppend(tid_t tid, /* transaction id */
2460 (ulong) xoff, maxblocks, xlen, (ulong) xaddr); 2459 (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
2461 2460
2462 /* 2461 /*
2463 * search for the entry location at which to insert: 2462 * search for the entry location at which to insert:
2464 * 2463 *
 2465 * xtFastSearch() and xtSearch() both return (leaf page 2464 * xtFastSearch() and xtSearch() both return (leaf page
2466 * pinned, index at which to insert). 2465 * pinned, index at which to insert).
@@ -2482,13 +2481,13 @@ int xtAppend(tid_t tid, /* transaction id */
2482 xlen = min(xlen, (int)(next - xoff)); 2481 xlen = min(xlen, (int)(next - xoff));
2483//insert: 2482//insert:
2484 /* 2483 /*
2485 * insert entry for new extent 2484 * insert entry for new extent
2486 */ 2485 */
2487 xflag |= XAD_NEW; 2486 xflag |= XAD_NEW;
2488 2487
2489 /* 2488 /*
2490 * if the leaf page is full, split the page and 2489 * if the leaf page is full, split the page and
2491 * propagate up the router entry for the new page from split 2490 * propagate up the router entry for the new page from split
2492 * 2491 *
2493 * The xtSplitUp() will insert the entry and unpin the leaf page. 2492 * The xtSplitUp() will insert the entry and unpin the leaf page.
2494 */ 2493 */
@@ -2545,7 +2544,7 @@ int xtAppend(tid_t tid, /* transaction id */
2545 return 0; 2544 return 0;
2546 2545
2547 /* 2546 /*
2548 * insert the new entry into the leaf page 2547 * insert the new entry into the leaf page
2549 */ 2548 */
2550 insertLeaf: 2549 insertLeaf:
2551 /* 2550 /*
@@ -2589,17 +2588,17 @@ int xtAppend(tid_t tid, /* transaction id */
2589 2588
 2590/* - TBD for defragmentation/reorganization - 2589/* - TBD for defragmentation/reorganization -
2591 * 2590 *
2592 * xtDelete() 2591 * xtDelete()
2593 * 2592 *
2594 * function: 2593 * function:
2595 * delete the entry with the specified key. 2594 * delete the entry with the specified key.
2596 * 2595 *
2597 * N.B.: whole extent of the entry is assumed to be deleted. 2596 * N.B.: whole extent of the entry is assumed to be deleted.
2598 * 2597 *
2599 * parameter: 2598 * parameter:
2600 * 2599 *
2601 * return: 2600 * return:
2602 * ENOENT: if the entry is not found. 2601 * ENOENT: if the entry is not found.
2603 * 2602 *
2604 * exception: 2603 * exception:
2605 */ 2604 */
@@ -2665,10 +2664,10 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
2665 2664
 2666/* - TBD for defragmentation/reorganization - 2665/* - TBD for defragmentation/reorganization -
2667 * 2666 *
2668 * xtDeleteUp() 2667 * xtDeleteUp()
2669 * 2668 *
2670 * function: 2669 * function:
2671 * free empty pages as propagating deletion up the tree 2670 * free empty pages as propagating deletion up the tree
2672 * 2671 *
2673 * parameter: 2672 * parameter:
2674 * 2673 *
@@ -2815,15 +2814,15 @@ xtDeleteUp(tid_t tid, struct inode *ip,
2815 2814
2816 2815
2817/* 2816/*
2818 * NAME: xtRelocate() 2817 * NAME: xtRelocate()
2819 * 2818 *
2820 * FUNCTION: relocate xtpage or data extent of regular file; 2819 * FUNCTION: relocate xtpage or data extent of regular file;
2821 * This function is mainly used by defragfs utility. 2820 * This function is mainly used by defragfs utility.
2822 * 2821 *
2823 * NOTE: This routine does not have the logic to handle 2822 * NOTE: This routine does not have the logic to handle
2824 * uncommitted allocated extent. The caller should call 2823 * uncommitted allocated extent. The caller should call
 2825 * txCommit() to commit all the allocation before calling 2824 * txCommit() to commit all the allocation before calling
2826 * this routine. 2825 * this routine.
2827 */ 2826 */
2828int 2827int
2829xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ 2828xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
@@ -2865,8 +2864,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2865 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); 2864 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
2866 2865
2867 /* 2866 /*
2868 * 1. get and validate the parent xtpage/xad entry 2867 * 1. get and validate the parent xtpage/xad entry
2869 * covering the source extent to be relocated; 2868 * covering the source extent to be relocated;
2870 */ 2869 */
2871 if (xtype == DATAEXT) { 2870 if (xtype == DATAEXT) {
2872 /* search in leaf entry */ 2871 /* search in leaf entry */
@@ -2910,7 +2909,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2910 jfs_info("xtRelocate: parent xad entry validated."); 2909 jfs_info("xtRelocate: parent xad entry validated.");
2911 2910
2912 /* 2911 /*
2913 * 2. relocate the extent 2912 * 2. relocate the extent
2914 */ 2913 */
2915 if (xtype == DATAEXT) { 2914 if (xtype == DATAEXT) {
2916 /* if the extent is allocated-but-not-recorded 2915 /* if the extent is allocated-but-not-recorded
@@ -2923,7 +2922,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2923 XT_PUTPAGE(pmp); 2922 XT_PUTPAGE(pmp);
2924 2923
2925 /* 2924 /*
2926 * cmRelocate() 2925 * cmRelocate()
2927 * 2926 *
2928 * copy target data pages to be relocated; 2927 * copy target data pages to be relocated;
2929 * 2928 *
@@ -2945,8 +2944,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2945 pno = offset >> CM_L2BSIZE; 2944 pno = offset >> CM_L2BSIZE;
2946 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; 2945 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
2947/* 2946/*
2948 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - 2947 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
2949 (offset >> CM_L2BSIZE) + 1; 2948 (offset >> CM_L2BSIZE) + 1;
2950*/ 2949*/
2951 sxaddr = oxaddr; 2950 sxaddr = oxaddr;
2952 dxaddr = nxaddr; 2951 dxaddr = nxaddr;
@@ -2981,7 +2980,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2981 2980
2982 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); 2981 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2983 jfs_info("xtRelocate: target data extent relocated."); 2982 jfs_info("xtRelocate: target data extent relocated.");
2984 } else { /* (xtype == XTPAGE) */ 2983 } else { /* (xtype == XTPAGE) */
2985 2984
2986 /* 2985 /*
2987 * read in the target xtpage from the source extent; 2986 * read in the target xtpage from the source extent;
@@ -3026,16 +3025,14 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3026 */ 3025 */
3027 if (lmp) { 3026 if (lmp) {
3028 BT_MARK_DIRTY(lmp, ip); 3027 BT_MARK_DIRTY(lmp, ip);
3029 tlck = 3028 tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
3030 txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
3031 lp->header.next = cpu_to_le64(nxaddr); 3029 lp->header.next = cpu_to_le64(nxaddr);
3032 XT_PUTPAGE(lmp); 3030 XT_PUTPAGE(lmp);
3033 } 3031 }
3034 3032
3035 if (rmp) { 3033 if (rmp) {
3036 BT_MARK_DIRTY(rmp, ip); 3034 BT_MARK_DIRTY(rmp, ip);
3037 tlck = 3035 tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
3038 txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
3039 rp->header.prev = cpu_to_le64(nxaddr); 3036 rp->header.prev = cpu_to_le64(nxaddr);
3040 XT_PUTPAGE(rmp); 3037 XT_PUTPAGE(rmp);
3041 } 3038 }
@@ -3062,7 +3059,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3062 * scan may be skipped by commit() and logredo(); 3059 * scan may be skipped by commit() and logredo();
3063 */ 3060 */
3064 BT_MARK_DIRTY(mp, ip); 3061 BT_MARK_DIRTY(mp, ip);
3065 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ 3062 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
3066 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); 3063 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
3067 xtlck = (struct xtlock *) & tlck->lock; 3064 xtlck = (struct xtlock *) & tlck->lock;
3068 3065
@@ -3084,7 +3081,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3084 } 3081 }
3085 3082
3086 /* 3083 /*
3087 * 3. acquire maplock for the source extent to be freed; 3084 * 3. acquire maplock for the source extent to be freed;
3088 * 3085 *
3089 * acquire a maplock saving the src relocated extent address; 3086 * acquire a maplock saving the src relocated extent address;
3090 * to free of the extent at commit time; 3087 * to free of the extent at commit time;
@@ -3105,7 +3102,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3105 * is no buffer associated with this lock since the buffer 3102 * is no buffer associated with this lock since the buffer
3106 * has been redirected to the target location. 3103 * has been redirected to the target location.
3107 */ 3104 */
3108 else /* (xtype == XTPAGE) */ 3105 else /* (xtype == XTPAGE) */
3109 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); 3106 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
3110 3107
3111 pxdlock = (struct pxd_lock *) & tlck->lock; 3108 pxdlock = (struct pxd_lock *) & tlck->lock;
@@ -3115,7 +3112,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3115 pxdlock->index = 1; 3112 pxdlock->index = 1;
3116 3113
3117 /* 3114 /*
3118 * 4. update the parent xad entry for relocation; 3115 * 4. update the parent xad entry for relocation;
3119 * 3116 *
3120 * acquire tlck for the parent entry with XAD_NEW as entry 3117 * acquire tlck for the parent entry with XAD_NEW as entry
3121 * update which will write LOG_REDOPAGE and update bmap for 3118 * update which will write LOG_REDOPAGE and update bmap for
@@ -3143,22 +3140,22 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3143 3140
3144 3141
3145/* 3142/*
3146 * xtSearchNode() 3143 * xtSearchNode()
3147 * 3144 *
3148 * function: search for the internal xad entry covering specified extent. 3145 * function: search for the internal xad entry covering specified extent.
3149 * This function is mainly used by defragfs utility. 3146 * This function is mainly used by defragfs utility.
3150 * 3147 *
3151 * parameters: 3148 * parameters:
3152 * ip - file object; 3149 * ip - file object;
3153 * xad - extent to find; 3150 * xad - extent to find;
3154 * cmpp - comparison result: 3151 * cmpp - comparison result:
3155 * btstack - traverse stack; 3152 * btstack - traverse stack;
3156 * flag - search process flag; 3153 * flag - search process flag;
3157 * 3154 *
3158 * returns: 3155 * returns:
3159 * btstack contains (bn, index) of search path traversed to the entry. 3156 * btstack contains (bn, index) of search path traversed to the entry.
3160 * *cmpp is set to result of comparison with the entry returned. 3157 * *cmpp is set to result of comparison with the entry returned.
3161 * the page containing the entry is pinned at exit. 3158 * the page containing the entry is pinned at exit.
3162 */ 3159 */
3163static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ 3160static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3164 int *cmpp, struct btstack * btstack, int flag) 3161 int *cmpp, struct btstack * btstack, int flag)
@@ -3181,7 +3178,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3181 xaddr = addressXAD(xad); 3178 xaddr = addressXAD(xad);
3182 3179
3183 /* 3180 /*
3184 * search down tree from root: 3181 * search down tree from root:
3185 * 3182 *
3186 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 3183 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
3187 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 3184 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -3217,7 +3214,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3217 XT_CMP(cmp, xoff, &p->xad[index], t64); 3214 XT_CMP(cmp, xoff, &p->xad[index], t64);
3218 if (cmp == 0) { 3215 if (cmp == 0) {
3219 /* 3216 /*
3220 * search hit 3217 * search hit
3221 * 3218 *
3222 * verify for exact match; 3219 * verify for exact match;
3223 */ 3220 */
@@ -3245,7 +3242,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3245 } 3242 }
3246 3243
3247 /* 3244 /*
3248 * search miss - non-leaf page: 3245 * search miss - non-leaf page:
3249 * 3246 *
3250 * base is the smallest index with key (Kj) greater than 3247 * base is the smallest index with key (Kj) greater than
3251 * search key (K) and may be zero or maxentry index. 3248 * search key (K) and may be zero or maxentry index.
@@ -3268,15 +3265,15 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3268 3265
3269 3266
3270/* 3267/*
3271 * xtRelink() 3268 * xtRelink()
3272 * 3269 *
3273 * function: 3270 * function:
3274 * link around a freed page. 3271 * link around a freed page.
3275 * 3272 *
3276 * Parameter: 3273 * Parameter:
3277 * int tid, 3274 * int tid,
3278 * struct inode *ip, 3275 * struct inode *ip,
3279 * xtpage_t *p) 3276 * xtpage_t *p)
3280 * 3277 *
3281 * returns: 3278 * returns:
3282 */ 3279 */
@@ -3338,7 +3335,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
3338 3335
3339 3336
3340/* 3337/*
3341 * xtInitRoot() 3338 * xtInitRoot()
3342 * 3339 *
3343 * initialize file root (inline in inode) 3340 * initialize file root (inline in inode)
3344 */ 3341 */
@@ -3385,42 +3382,42 @@ void xtInitRoot(tid_t tid, struct inode *ip)
3385#define MAX_TRUNCATE_LEAVES 50 3382#define MAX_TRUNCATE_LEAVES 50
3386 3383
3387/* 3384/*
3388 * xtTruncate() 3385 * xtTruncate()
3389 * 3386 *
3390 * function: 3387 * function:
3391 * traverse for truncation logging backward bottom up; 3388 * traverse for truncation logging backward bottom up;
3392 * terminate at the last extent entry at the current subtree 3389 * terminate at the last extent entry at the current subtree
3393 * root page covering new down size. 3390 * root page covering new down size.
3394 * truncation may occur within the last extent entry. 3391 * truncation may occur within the last extent entry.
3395 * 3392 *
3396 * parameter: 3393 * parameter:
3397 * int tid, 3394 * int tid,
3398 * struct inode *ip, 3395 * struct inode *ip,
3399 * s64 newsize, 3396 * s64 newsize,
3400 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} 3397 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
3401 * 3398 *
3402 * return: 3399 * return:
3403 * 3400 *
3404 * note: 3401 * note:
3405 * PWMAP: 3402 * PWMAP:
3406 * 1. truncate (non-COMMIT_NOLINK file) 3403 * 1. truncate (non-COMMIT_NOLINK file)
3407 * by jfs_truncate() or jfs_open(O_TRUNC): 3404 * by jfs_truncate() or jfs_open(O_TRUNC):
3408 * xtree is updated; 3405 * xtree is updated;
3409 * 2. truncate index table of directory when last entry removed 3406 * 2. truncate index table of directory when last entry removed
3410 * map update via tlock at commit time; 3407 * map update via tlock at commit time;
3411 * PMAP: 3408 * PMAP:
3412 * Call xtTruncate_pmap instead 3409 * Call xtTruncate_pmap instead
3413 * WMAP: 3410 * WMAP:
3414 * 1. remove (free zero link count) on last reference release 3411 * 1. remove (free zero link count) on last reference release
3415 * (pmap has been freed at commit zero link count); 3412 * (pmap has been freed at commit zero link count);
3416 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): 3413 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file):
3417 * xtree is updated; 3414 * xtree is updated;
3418 * map update directly at truncation time; 3415 * map update directly at truncation time;
3419 * 3416 *
3420 * if (DELETE) 3417 * if (DELETE)
3421 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); 3418 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
3422 * else if (TRUNCATE) 3419 * else if (TRUNCATE)
3423 * must write LOG_NOREDOPAGE for deleted index page; 3420 * must write LOG_NOREDOPAGE for deleted index page;
3424 * 3421 *
3425 * pages may already have been tlocked by anonymous transactions 3422 * pages may already have been tlocked by anonymous transactions
3426 * during file growth (i.e., write) before truncation; 3423 * during file growth (i.e., write) before truncation;
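
For the common PWMAP case listed above, callers such as jfs_truncate() wrap xtTruncate() in a transaction of its own. A hedged sketch of that wrapping (the COMMIT_TRUNCATE/COMMIT_PWMAP flag names and the txBegin()/txCommit()/txEnd() pairing are taken to be as in jfs_txnmgr.h; the locking real callers hold around this sequence is omitted):

static void truncate_pwmap_sketch(struct inode *ip, loff_t length)
{
	struct inode *iplist[1] = { ip };
	tid_t tid;

	tid = txBegin(ip->i_sb, 0);
	xtTruncate(tid, ip, length, COMMIT_TRUNCATE | COMMIT_PWMAP);
	txCommit(tid, 1, iplist, 0);
	txEnd(tid);
}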
@@ -3493,7 +3490,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3493 * retained in the new sized file. 3490 * retained in the new sized file.
3494 * if type is PMAP, the data and index pages are NOT 3491 * if type is PMAP, the data and index pages are NOT
3495 * freed, and the data and index blocks are NOT freed 3492 * freed, and the data and index blocks are NOT freed
3496 * from working map. 3493 * from working map.
3497 * (this will allow continued access of data/index of 3494 * (this will allow continued access of data/index of
3498 * temporary file (zerolink count file truncated to zero-length)). 3495 * temporary file (zerolink count file truncated to zero-length)).
3499 */ 3496 */
@@ -3542,7 +3539,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3542 goto getChild; 3539 goto getChild;
3543 3540
3544 /* 3541 /*
3545 * leaf page 3542 * leaf page
3546 */ 3543 */
3547 freed = 0; 3544 freed = 0;
3548 3545
@@ -3916,7 +3913,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3916 } 3913 }
3917 3914
3918 /* 3915 /*
3919 * internal page: go down to child page of current entry 3916 * internal page: go down to child page of current entry
3920 */ 3917 */
3921 getChild: 3918 getChild:
3922 /* save current parent entry for the child page */ 3919 /* save current parent entry for the child page */
@@ -3965,7 +3962,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3965 3962
3966 3963
3967/* 3964/*
3968 * xtTruncate_pmap() 3965 * xtTruncate_pmap()
3969 * 3966 *
3970 * function: 3967 * function:
 3971 * Perform truncate to zero length for deleted file, leaving the 3968 * Perform truncate to zero length for deleted file, leaving the
@@ -3974,9 +3971,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3974 * is committed to disk. 3971 * is committed to disk.
3975 * 3972 *
3976 * parameter: 3973 * parameter:
3977 * tid_t tid, 3974 * tid_t tid,
3978 * struct inode *ip, 3975 * struct inode *ip,
3979 * s64 committed_size) 3976 * s64 committed_size)
3980 * 3977 *
3981 * return: new committed size 3978 * return: new committed size
3982 * 3979 *
@@ -4050,7 +4047,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4050 } 4047 }
4051 4048
4052 /* 4049 /*
4053 * leaf page 4050 * leaf page
4054 */ 4051 */
4055 4052
4056 if (++locked_leaves > MAX_TRUNCATE_LEAVES) { 4053 if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
@@ -4062,7 +4059,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4062 xoff = offsetXAD(xad); 4059 xoff = offsetXAD(xad);
4063 xlen = lengthXAD(xad); 4060 xlen = lengthXAD(xad);
4064 XT_PUTPAGE(mp); 4061 XT_PUTPAGE(mp);
4065 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; 4062 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
4066 } 4063 }
4067 tlck = txLock(tid, ip, mp, tlckXTREE); 4064 tlck = txLock(tid, ip, mp, tlckXTREE);
4068 tlck->type = tlckXTREE | tlckFREE; 4065 tlck->type = tlckXTREE | tlckFREE;
@@ -4099,8 +4096,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4099 */ 4096 */
4100 tlck = txLock(tid, ip, mp, tlckXTREE); 4097 tlck = txLock(tid, ip, mp, tlckXTREE);
4101 xtlck = (struct xtlock *) & tlck->lock; 4098 xtlck = (struct xtlock *) & tlck->lock;
4102 xtlck->hwm.offset = 4099 xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
4103 le16_to_cpu(p->header.nextindex) - 1;
4104 tlck->type = tlckXTREE | tlckFREE; 4100 tlck->type = tlckXTREE | tlckFREE;
4105 4101
4106 XT_PUTPAGE(mp); 4102 XT_PUTPAGE(mp);
@@ -4118,7 +4114,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4118 else 4114 else
4119 index--; 4115 index--;
4120 /* 4116 /*
4121 * internal page: go down to child page of current entry 4117 * internal page: go down to child page of current entry
4122 */ 4118 */
4123 getChild: 4119 getChild:
4124 /* save current parent entry for the child page */ 4120 /* save current parent entry for the child page */
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 164f6f2b1019..70815c8a3d6a 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -19,14 +19,14 @@
19#define _H_JFS_XTREE 19#define _H_JFS_XTREE
20 20
21/* 21/*
22 * jfs_xtree.h: extent allocation descriptor B+-tree manager 22 * jfs_xtree.h: extent allocation descriptor B+-tree manager
23 */ 23 */
24 24
25#include "jfs_btree.h" 25#include "jfs_btree.h"
26 26
27 27
28/* 28/*
29 * extent allocation descriptor (xad) 29 * extent allocation descriptor (xad)
30 */ 30 */
31typedef struct xad { 31typedef struct xad {
32 unsigned flag:8; /* 1: flag */ 32 unsigned flag:8; /* 1: flag */
@@ -38,30 +38,30 @@ typedef struct xad {
38 __le32 addr2; /* 4: address in unit of fsblksize */ 38 __le32 addr2; /* 4: address in unit of fsblksize */
39} xad_t; /* (16) */ 39} xad_t; /* (16) */
40 40
41#define MAXXLEN ((1 << 24) - 1) 41#define MAXXLEN ((1 << 24) - 1)
42 42
43#define XTSLOTSIZE 16 43#define XTSLOTSIZE 16
44#define L2XTSLOTSIZE 4 44#define L2XTSLOTSIZE 4
45 45
46/* xad_t field construction */ 46/* xad_t field construction */
47#define XADoffset(xad, offset64)\ 47#define XADoffset(xad, offset64)\
48{\ 48{\
49 (xad)->off1 = ((u64)offset64) >> 32;\ 49 (xad)->off1 = ((u64)offset64) >> 32;\
50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ 50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
51} 51}
52#define XADaddress(xad, address64)\ 52#define XADaddress(xad, address64)\
53{\ 53{\
54 (xad)->addr1 = ((u64)address64) >> 32;\ 54 (xad)->addr1 = ((u64)address64) >> 32;\
55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ 55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
56} 56}
57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) 57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
58 58
59/* xad_t field extraction */ 59/* xad_t field extraction */
60#define offsetXAD(xad)\ 60#define offsetXAD(xad)\
61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) 61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
62#define addressXAD(xad)\ 62#define addressXAD(xad)\
63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) 63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2))
64#define lengthXAD(xad) __le24_to_cpu((xad)->len) 64#define lengthXAD(xad) __le24_to_cpu((xad)->len)
65 65
66/* xad list */ 66/* xad list */
67struct xadlist { 67struct xadlist {
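
Editor's note: the XADoffset()/offsetXAD() macros above simply split a 40-bit file offset into an 8-bit high part (off1) and a little-endian 32-bit low part (off2); XADaddress()/addressXAD() do the same for block addresses. A minimal user-space sketch of that split, with the endianness conversion omitted and all demo_* names invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* Illustration only: mirrors XADoffset()/offsetXAD() without the
 * __cpu_to_le32()/__le32_to_cpu() conversions of the real macros. */
struct demo_xad {
	uint8_t  off1;   /* high 8 bits of the 40-bit offset */
	uint32_t off2;   /* low 32 bits of the 40-bit offset */
};

static void demo_set_offset(struct demo_xad *x, uint64_t off)
{
	x->off1 = (uint8_t)(off >> 32);
	x->off2 = (uint32_t)(off & 0xffffffff);
}

static int64_t demo_get_offset(const struct demo_xad *x)
{
	return ((int64_t)x->off1 << 32) | x->off2;
}

int main(void)
{
	struct demo_xad x;

	demo_set_offset(&x, 0x12abcdef01ULL);	/* a 40-bit block offset */
	printf("off1=%#x off2=%#x back=%#llx\n",
	       x.off1, x.off2, (unsigned long long)demo_get_offset(&x));
	return 0;
}
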
@@ -71,22 +71,22 @@ struct xadlist {
71}; 71};
72 72
73/* xad_t flags */ 73/* xad_t flags */
74#define XAD_NEW 0x01 /* new */ 74#define XAD_NEW 0x01 /* new */
75#define XAD_EXTENDED 0x02 /* extended */ 75#define XAD_EXTENDED 0x02 /* extended */
76#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ 76#define XAD_COMPRESSED 0x04 /* compressed with recorded length */
77#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ 77#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */
78#define XAD_COW 0x10 /* copy-on-write */ 78#define XAD_COW 0x10 /* copy-on-write */
79 79
80 80
81/* possible values for maxentry */ 81/* possible values for maxentry */
82#define XTROOTINITSLOT_DIR 6 82#define XTROOTINITSLOT_DIR 6
83#define XTROOTINITSLOT 10 83#define XTROOTINITSLOT 10
84#define XTROOTMAXSLOT 18 84#define XTROOTMAXSLOT 18
85#define XTPAGEMAXSLOT 256 85#define XTPAGEMAXSLOT 256
86#define XTENTRYSTART 2 86#define XTENTRYSTART 2
87 87
88/* 88/*
89 * xtree page: 89 * xtree page:
90 */ 90 */
91typedef union { 91typedef union {
92 struct xtheader { 92 struct xtheader {
@@ -106,7 +106,7 @@ typedef union {
106} xtpage_t; 106} xtpage_t;
107 107
108/* 108/*
109 * external declaration 109 * external declaration
110 */ 110 */
111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, 111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen,
112 int *pflag, s64 * paddr, int *plen, int flag); 112 int *pflag, s64 * paddr, int *plen, int flag);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 41c204771262..25161c4121e4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -328,7 +328,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
328 * dentry - child directory dentry 328 * dentry - child directory dentry
329 * 329 *
330 * RETURN: -EINVAL - if name is . or .. 330 * RETURN: -EINVAL - if name is . or ..
331 * -EINVAL - if . or .. exist but are invalid. 331 * -EINVAL - if . or .. exist but are invalid.
332 * errors from subroutines 332 * errors from subroutines
333 * 333 *
334 * note: 334 * note:
@@ -517,7 +517,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
517 inode_dec_link_count(ip); 517 inode_dec_link_count(ip);
518 518
519 /* 519 /*
520 * commit zero link count object 520 * commit zero link count object
521 */ 521 */
522 if (ip->i_nlink == 0) { 522 if (ip->i_nlink == 0) {
523 assert(!test_cflag(COMMIT_Nolink, ip)); 523 assert(!test_cflag(COMMIT_Nolink, ip));
@@ -596,7 +596,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
596/* 596/*
597 * NAME: commitZeroLink() 597 * NAME: commitZeroLink()
598 * 598 *
599 * FUNCTION: for non-directory, called by jfs_remove(), 599 * FUNCTION: for non-directory, called by jfs_remove(),
600 * truncate a regular file, directory or symbolic 600 * truncate a regular file, directory or symbolic
601 * link to zero length. return 0 if type is not 601 * link to zero length. return 0 if type is not
602 * one of these. 602 * one of these.
@@ -676,7 +676,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip)
676/* 676/*
677 * NAME: jfs_free_zero_link() 677 * NAME: jfs_free_zero_link()
678 * 678 *
679 * FUNCTION: for non-directory, called by iClose(), 679 * FUNCTION: for non-directory, called by iClose(),
680 * free resources of a file from cache and WORKING map 680 * free resources of a file from cache and WORKING map
681 * for a file previously committed with zero link count 681 * for a file previously committed with zero link count
682 * while associated with a pager object, 682 * while associated with a pager object,
@@ -855,12 +855,12 @@ static int jfs_link(struct dentry *old_dentry,
855 * NAME: jfs_symlink(dip, dentry, name) 855 * NAME: jfs_symlink(dip, dentry, name)
856 * 856 *
857 * FUNCTION: creates a symbolic link to <symlink> by name <name> 857 * FUNCTION: creates a symbolic link to <symlink> by name <name>
858 * in directory <dip> 858 * in directory <dip>
859 * 859 *
860 * PARAMETER: dip - parent directory vnode 860 * PARAMETER: dip - parent directory vnode
861 * dentry - dentry of symbolic link 861 * dentry - dentry of symbolic link
862 * name - the path name of the existing object 862 * name - the path name of the existing object
863 * that will be the source of the link 863 * that will be the source of the link
864 * 864 *
865 * RETURN: errors from subroutines 865 * RETURN: errors from subroutines
866 * 866 *
@@ -1052,9 +1052,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1052 1052
1053 1053
1054/* 1054/*
1055 * NAME: jfs_rename 1055 * NAME: jfs_rename
1056 * 1056 *
1057 * FUNCTION: rename a file or directory 1057 * FUNCTION: rename a file or directory
1058 */ 1058 */
1059static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, 1059static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1060 struct inode *new_dir, struct dentry *new_dentry) 1060 struct inode *new_dir, struct dentry *new_dentry)
@@ -1331,9 +1331,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1331 1331
1332 1332
1333/* 1333/*
1334 * NAME: jfs_mknod 1334 * NAME: jfs_mknod
1335 * 1335 *
1336 * FUNCTION: Create a special file (device) 1336 * FUNCTION: Create a special file (device)
1337 */ 1337 */
1338static int jfs_mknod(struct inode *dir, struct dentry *dentry, 1338static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1339 int mode, dev_t rdev) 1339 int mode, dev_t rdev)
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 79d625f3f733..71984ee95346 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -29,17 +29,17 @@
29#include "jfs_txnmgr.h" 29#include "jfs_txnmgr.h"
30#include "jfs_debug.h" 30#include "jfs_debug.h"
31 31
32#define BITSPERPAGE (PSIZE << 3) 32#define BITSPERPAGE (PSIZE << 3)
33#define L2MEGABYTE 20 33#define L2MEGABYTE 20
34#define MEGABYTE (1 << L2MEGABYTE) 34#define MEGABYTE (1 << L2MEGABYTE)
35#define MEGABYTE32 (MEGABYTE << 5) 35#define MEGABYTE32 (MEGABYTE << 5)
36 36
37/* convert block number to bmap file page number */ 37/* convert block number to bmap file page number */
38#define BLKTODMAPN(b)\ 38#define BLKTODMAPN(b)\
39 (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) 39 (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1)
40 40
41/* 41/*
42 * jfs_extendfs() 42 * jfs_extendfs()
43 * 43 *
44 * function: extend file system; 44 * function: extend file system;
45 * 45 *
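
Editor's note: BLKTODMAPN() above maps an aggregate block number to the page index of its dmap inside the block-allocation-map file; judging from the shift amounts, each dmap page covers 2^13 blocks and the remaining terms account for the map's summary/control pages. A stand-alone check of the arithmetic (function name invented):

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as the BLKTODMAPN() macro in fs/jfs/resize.c. */
static uint64_t blk_to_dmap_page(uint64_t b)
{
	return (b >> 13) + (b >> 23) + (b >> 33) + 3 + 1;
}

int main(void)
{
	/* each dmap covers 2^13 blocks; the "+ 3 + 1" is the fixed
	 * overhead implied by the macro, not recomputed here */
	printf("%llu\n", (unsigned long long)blk_to_dmap_page(0));        /* 4    */
	printf("%llu\n", (unsigned long long)blk_to_dmap_page(1 << 13));  /* 5    */
	printf("%llu\n", (unsigned long long)blk_to_dmap_page(1 << 23));  /* 1029 */
	return 0;
}
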
@@ -48,9 +48,9 @@
48 * workspace space 48 * workspace space
49 * 49 *
50 * input: 50 * input:
51 * new LVSize: in LV blocks (required) 51 * new LVSize: in LV blocks (required)
52 * new LogSize: in LV blocks (optional) 52 * new LogSize: in LV blocks (optional)
53 * new FSSize: in LV blocks (optional) 53 * new FSSize: in LV blocks (optional)
54 * 54 *
55 * new configuration: 55 * new configuration:
56 * 1. set new LogSize as specified or default from new LVSize; 56 * 1. set new LogSize as specified or default from new LVSize;
@@ -125,8 +125,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
125 } 125 }
126 126
127 /* 127 /*
128 * reconfigure LV spaces 128 * reconfigure LV spaces
129 * --------------------- 129 * ---------------------
130 * 130 *
131 * validate new size, or, if not specified, determine new size 131 * validate new size, or, if not specified, determine new size
132 */ 132 */
@@ -198,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
198 log_formatted = 1; 198 log_formatted = 1;
199 } 199 }
200 /* 200 /*
201 * quiesce file system 201 * quiesce file system
202 * 202 *
203 * (prepare to move the inline log and to prevent map update) 203 * (prepare to move the inline log and to prevent map update)
204 * 204 *
@@ -270,8 +270,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
270 } 270 }
271 271
272 /* 272 /*
273 * extend block allocation map 273 * extend block allocation map
274 * --------------------------- 274 * ---------------------------
275 * 275 *
276 * extendfs() for new extension, retry after crash recovery; 276 * extendfs() for new extension, retry after crash recovery;
277 * 277 *
@@ -283,7 +283,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
283 * s_size: aggregate size in physical blocks; 283 * s_size: aggregate size in physical blocks;
284 */ 284 */
285 /* 285 /*
286 * compute the new block allocation map configuration 286 * compute the new block allocation map configuration
287 * 287 *
288 * map dinode: 288 * map dinode:
289 * di_size: map file size in byte; 289 * di_size: map file size in byte;
@@ -301,7 +301,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
301 newNpages = BLKTODMAPN(t64) + 1; 301 newNpages = BLKTODMAPN(t64) + 1;
302 302
303 /* 303 /*
304 * extend map from current map (WITHOUT growing mapfile) 304 * extend map from current map (WITHOUT growing mapfile)
305 * 305 *
306 * map new extension with unmapped part of the last partial 306 * map new extension with unmapped part of the last partial
307 * dmap page, if applicable, and extra page(s) allocated 307 * dmap page, if applicable, and extra page(s) allocated
@@ -341,8 +341,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
341 XSize -= nblocks; 341 XSize -= nblocks;
342 342
343 /* 343 /*
344 * grow map file to cover remaining extension 344 * grow map file to cover remaining extension
345 * and/or one extra dmap page for next extendfs(); 345 * and/or one extra dmap page for next extendfs();
346 * 346 *
347 * allocate new map pages and its backing blocks, and 347 * allocate new map pages and its backing blocks, and
348 * update map file xtree 348 * update map file xtree
@@ -422,8 +422,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
422 dbFinalizeBmap(ipbmap); 422 dbFinalizeBmap(ipbmap);
423 423
424 /* 424 /*
425 * update inode allocation map 425 * update inode allocation map
426 * --------------------------- 426 * ---------------------------
427 * 427 *
428 * move iag lists from old to new iag; 428 * move iag lists from old to new iag;
429 * agstart field is not updated for logredo() to reconstruct 429 * agstart field is not updated for logredo() to reconstruct
@@ -442,8 +442,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
442 } 442 }
443 443
444 /* 444 /*
445 * finalize 445 * finalize
446 * -------- 446 * --------
447 * 447 *
448 * extension is committed when on-disk super block is 448 * extension is committed when on-disk super block is
449 * updated with new descriptors: logredo will recover 449 * updated with new descriptors: logredo will recover
@@ -480,7 +480,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
480 diFreeSpecial(ipbmap2); 480 diFreeSpecial(ipbmap2);
481 481
482 /* 482 /*
483 * update superblock 483 * update superblock
484 */ 484 */
485 if ((rc = readSuper(sb, &bh))) 485 if ((rc = readSuper(sb, &bh)))
486 goto error_out; 486 goto error_out;
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
530 530
531 resume: 531 resume:
532 /* 532 /*
533 * resume file system transactions 533 * resume file system transactions
534 */ 534 */
535 txResume(sb); 535 txResume(sb);
536 536
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index b753ba216450..b2375f0774b7 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -63,9 +63,9 @@
63 * 63 *
64 * On-disk: 64 * On-disk:
65 * 65 *
66 * FEALISTs are stored on disk using blocks allocated by dbAlloc() and 66 * FEALISTs are stored on disk using blocks allocated by dbAlloc() and
67 * written directly. An EA list may be in-lined in the inode if there is 67 * written directly. An EA list may be in-lined in the inode if there is
68 * sufficient room available. 68 * sufficient room available.
69 */ 69 */
70 70
71struct ea_buffer { 71struct ea_buffer {
@@ -590,7 +590,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
590 size_check: 590 size_check:
591 if (EALIST_SIZE(ea_buf->xattr) != ea_size) { 591 if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
592 printk(KERN_ERR "ea_get: invalid extended attribute\n"); 592 printk(KERN_ERR "ea_get: invalid extended attribute\n");
593 dump_mem("xattr", ea_buf->xattr, ea_size); 593 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
594 ea_buf->xattr, ea_size, 1);
594 ea_release(inode, ea_buf); 595 ea_release(inode, ea_buf);
595 rc = -EIO; 596 rc = -EIO;
596 goto clean_up; 597 goto clean_up;
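
Editor's note on the dump_mem() -> print_hex_dump() conversion above: read against the generic hexdump helper's parameter order, the new call's arguments line up as annotated below. The annotation is my reading of the call, not part of the patch:

print_hex_dump(KERN_ERR,            /* log level */
	       "",                  /* prefix string before each line */
	       DUMP_PREFIX_ADDRESS, /* prefix each row with its address */
	       16,                  /* rowsize: 16 bytes per output line */
	       1,                   /* groupsize: dump byte by byte */
	       ea_buf->xattr,       /* buffer to dump */
	       ea_size,             /* number of bytes */
	       1);                  /* also print an ASCII column */
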
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 74f30e0c0381..98e78e2f18d6 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -165,7 +165,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
165 rcu_read_lock(); 165 rcu_read_lock();
166 buffer += sprintf(buffer, 166 buffer += sprintf(buffer,
167 "State:\t%s\n" 167 "State:\t%s\n"
168 "SleepAVG:\t%lu%%\n"
169 "Tgid:\t%d\n" 168 "Tgid:\t%d\n"
170 "Pid:\t%d\n" 169 "Pid:\t%d\n"
171 "PPid:\t%d\n" 170 "PPid:\t%d\n"
@@ -173,7 +172,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
173 "Uid:\t%d\t%d\t%d\t%d\n" 172 "Uid:\t%d\t%d\t%d\t%d\n"
174 "Gid:\t%d\t%d\t%d\t%d\n", 173 "Gid:\t%d\t%d\t%d\t%d\n",
175 get_task_state(p), 174 get_task_state(p),
176 (p->sleep_avg/1024)*100/(1020000000/1024),
177 p->tgid, p->pid, 175 p->tgid, p->pid,
178 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, 176 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
179 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, 177 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
@@ -312,6 +310,41 @@ int proc_pid_status(struct task_struct *task, char * buffer)
312 return buffer - orig; 310 return buffer - orig;
313} 311}
314 312
313static clock_t task_utime(struct task_struct *p)
314{
315 clock_t utime = cputime_to_clock_t(p->utime),
316 total = utime + cputime_to_clock_t(p->stime);
317 u64 temp;
318
319 /*
320 * Use CFS's precise accounting:
321 */
322 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
323
324 if (total) {
325 temp *= utime;
326 do_div(temp, total);
327 }
328 utime = (clock_t)temp;
329
330 return utime;
331}
332
333static clock_t task_stime(struct task_struct *p)
334{
335 clock_t stime = cputime_to_clock_t(p->stime);
336
337 /*
338 * Use CFS's precise accounting. (we subtract utime from
339 * the total, to make sure the total observed by userspace
340 * grows monotonically - apps rely on that):
341 */
342 stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
343
344 return stime;
345}
346
347
315static int do_task_stat(struct task_struct *task, char * buffer, int whole) 348static int do_task_stat(struct task_struct *task, char * buffer, int whole)
316{ 349{
317 unsigned long vsize, eip, esp, wchan = ~0UL; 350 unsigned long vsize, eip, esp, wchan = ~0UL;
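
Editor's note: task_utime()/task_stime() above scale the tick-sampled utime:stime ratio onto the nanosecond-accurate se.sum_exec_runtime, and derive stime by subtraction so the utime+stime total seen through /proc/<pid>/stat stays monotonic. A stand-alone sketch of the same arithmetic, with made-up sample numbers:

#include <stdio.h>
#include <stdint.h>

/* Same split as task_utime()/task_stime(), done in user space:
 * 'total_clk' plays the role of nsec_to_clock_t(se.sum_exec_runtime). */
static void split_runtime(uint64_t total_clk, uint64_t tick_utime,
			  uint64_t tick_stime,
			  uint64_t *utime, uint64_t *stime)
{
	uint64_t ticks = tick_utime + tick_stime;
	uint64_t u = total_clk;

	if (ticks) {
		u *= tick_utime;
		u /= ticks;		/* do_div() in the kernel version */
	}
	*utime = u;
	*stime = total_clk - u;		/* keeps utime + stime == total_clk */
}

int main(void)
{
	uint64_t u, s;

	/* sample numbers: 500 clock ticks of precise runtime, sampled
	 * as 30 user ticks and 10 system ticks */
	split_runtime(500, 30, 10, &u, &s);
	printf("utime=%llu stime=%llu\n",
	       (unsigned long long)u, (unsigned long long)s);	/* 375 125 */
	return 0;
}
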
@@ -326,7 +359,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
326 unsigned long long start_time; 359 unsigned long long start_time;
327 unsigned long cmin_flt = 0, cmaj_flt = 0; 360 unsigned long cmin_flt = 0, cmaj_flt = 0;
328 unsigned long min_flt = 0, maj_flt = 0; 361 unsigned long min_flt = 0, maj_flt = 0;
329 cputime_t cutime, cstime, utime, stime; 362 cputime_t cutime, cstime;
363 clock_t utime, stime;
330 unsigned long rsslim = 0; 364 unsigned long rsslim = 0;
331 char tcomm[sizeof(task->comm)]; 365 char tcomm[sizeof(task->comm)];
332 unsigned long flags; 366 unsigned long flags;
@@ -344,7 +378,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
344 378
345 sigemptyset(&sigign); 379 sigemptyset(&sigign);
346 sigemptyset(&sigcatch); 380 sigemptyset(&sigcatch);
347 cutime = cstime = utime = stime = cputime_zero; 381 cutime = cstime = cputime_zero;
382 utime = stime = 0;
348 383
349 rcu_read_lock(); 384 rcu_read_lock();
350 if (lock_task_sighand(task, &flags)) { 385 if (lock_task_sighand(task, &flags)) {
@@ -370,15 +405,15 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
370 do { 405 do {
371 min_flt += t->min_flt; 406 min_flt += t->min_flt;
372 maj_flt += t->maj_flt; 407 maj_flt += t->maj_flt;
373 utime = cputime_add(utime, t->utime); 408 utime += task_utime(t);
374 stime = cputime_add(stime, t->stime); 409 stime += task_stime(t);
375 t = next_thread(t); 410 t = next_thread(t);
376 } while (t != task); 411 } while (t != task);
377 412
378 min_flt += sig->min_flt; 413 min_flt += sig->min_flt;
379 maj_flt += sig->maj_flt; 414 maj_flt += sig->maj_flt;
380 utime = cputime_add(utime, sig->utime); 415 utime += cputime_to_clock_t(sig->utime);
381 stime = cputime_add(stime, sig->stime); 416 stime += cputime_to_clock_t(sig->stime);
382 } 417 }
383 418
384 sid = signal_session(sig); 419 sid = signal_session(sig);
@@ -394,8 +429,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
394 if (!whole) { 429 if (!whole) {
395 min_flt = task->min_flt; 430 min_flt = task->min_flt;
396 maj_flt = task->maj_flt; 431 maj_flt = task->maj_flt;
397 utime = task->utime; 432 utime = task_utime(task);
398 stime = task->stime; 433 stime = task_stime(task);
399 } 434 }
400 435
401 /* scale priority and nice values from timeslices to -20..20 */ 436 /* scale priority and nice values from timeslices to -20..20 */
@@ -426,8 +461,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
426 cmin_flt, 461 cmin_flt,
427 maj_flt, 462 maj_flt,
428 cmaj_flt, 463 cmaj_flt,
429 cputime_to_clock_t(utime), 464 utime,
430 cputime_to_clock_t(stime), 465 stime,
431 cputime_to_clock_t(cutime), 466 cputime_to_clock_t(cutime),
432 cputime_to_clock_t(cstime), 467 cputime_to_clock_t(cstime),
433 priority, 468 priority,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a5fa1fdafc4e..46ea5d56e1bb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -296,7 +296,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
296 */ 296 */
297static int proc_pid_schedstat(struct task_struct *task, char *buffer) 297static int proc_pid_schedstat(struct task_struct *task, char *buffer)
298{ 298{
299 return sprintf(buffer, "%lu %lu %lu\n", 299 return sprintf(buffer, "%llu %llu %lu\n",
300 task->sched_info.cpu_time, 300 task->sched_info.cpu_time,
301 task->sched_info.run_delay, 301 task->sched_info.run_delay,
302 task->sched_info.pcnt); 302 task->sched_info.pcnt);
@@ -929,6 +929,69 @@ static const struct file_operations proc_fault_inject_operations = {
929}; 929};
930#endif 930#endif
931 931
932#ifdef CONFIG_SCHED_DEBUG
933/*
934 * Print out various scheduling related per-task fields:
935 */
936static int sched_show(struct seq_file *m, void *v)
937{
938 struct inode *inode = m->private;
939 struct task_struct *p;
940
941 WARN_ON(!inode);
942
943 p = get_proc_task(inode);
944 if (!p)
945 return -ESRCH;
946 proc_sched_show_task(p, m);
947
948 put_task_struct(p);
949
950 return 0;
951}
952
953static ssize_t
954sched_write(struct file *file, const char __user *buf,
955 size_t count, loff_t *offset)
956{
957 struct inode *inode = file->f_path.dentry->d_inode;
958 struct task_struct *p;
959
960 WARN_ON(!inode);
961
962 p = get_proc_task(inode);
963 if (!p)
964 return -ESRCH;
965 proc_sched_set_task(p);
966
967 put_task_struct(p);
968
969 return count;
970}
971
972static int sched_open(struct inode *inode, struct file *filp)
973{
974 int ret;
975
976 ret = single_open(filp, sched_show, NULL);
977 if (!ret) {
978 struct seq_file *m = filp->private_data;
979
980 m->private = inode;
981 }
982 return ret;
983}
984
985static const struct file_operations proc_pid_sched_operations = {
986 .open = sched_open,
987 .read = seq_read,
988 .write = sched_write,
989 .llseek = seq_lseek,
990 .release = seq_release,
991};
992
993#endif
994
932static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) 995static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
933{ 996{
934 struct inode *inode = dentry->d_inode; 997 struct inode *inode = dentry->d_inode;
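
Editor's note: the block above wires up a per-task /proc/<pid>/sched file (see the CONFIG_SCHED_DEBUG entries added to tgid_base_stuff and tid_base_stuff further down). Reading it calls proc_sched_show_task(); writing anything to it calls proc_sched_set_task(), which I take to reset the per-task statistics — that interpretation is mine, the body of that helper is not shown here. A minimal user-space peek at the file:

#include <stdio.h>

/* Dump the current task's scheduler statistics (requires a kernel
 * built with CONFIG_SCHED_DEBUG, otherwise the file does not exist). */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f) {
		perror("/proc/self/sched");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
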
@@ -1963,6 +2026,9 @@ static const struct pid_entry tgid_base_stuff[] = {
1963 INF("environ", S_IRUSR, pid_environ), 2026 INF("environ", S_IRUSR, pid_environ),
1964 INF("auxv", S_IRUSR, pid_auxv), 2027 INF("auxv", S_IRUSR, pid_auxv),
1965 INF("status", S_IRUGO, pid_status), 2028 INF("status", S_IRUGO, pid_status),
2029#ifdef CONFIG_SCHED_DEBUG
2030 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
2031#endif
1966 INF("cmdline", S_IRUGO, pid_cmdline), 2032 INF("cmdline", S_IRUGO, pid_cmdline),
1967 INF("stat", S_IRUGO, tgid_stat), 2033 INF("stat", S_IRUGO, tgid_stat),
1968 INF("statm", S_IRUGO, pid_statm), 2034 INF("statm", S_IRUGO, pid_statm),
@@ -2247,6 +2313,9 @@ static const struct pid_entry tid_base_stuff[] = {
2247 INF("environ", S_IRUSR, pid_environ), 2313 INF("environ", S_IRUSR, pid_environ),
2248 INF("auxv", S_IRUSR, pid_auxv), 2314 INF("auxv", S_IRUSR, pid_auxv),
2249 INF("status", S_IRUGO, pid_status), 2315 INF("status", S_IRUGO, pid_status),
2316#ifdef CONFIG_SCHED_DEBUG
2317 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
2318#endif
2250 INF("cmdline", S_IRUGO, pid_cmdline), 2319 INF("cmdline", S_IRUGO, pid_cmdline),
2251 INF("stat", S_IRUGO, tid_stat), 2320 INF("stat", S_IRUGO, tid_stat),
2252 INF("statm", S_IRUGO, pid_statm), 2321 INF("statm", S_IRUGO, pid_statm),
diff --git a/include/asm-generic/bitops/sched.h b/include/asm-generic/bitops/sched.h
index 815bb0148060..604fab7031a6 100644
--- a/include/asm-generic/bitops/sched.h
+++ b/include/asm-generic/bitops/sched.h
@@ -6,28 +6,23 @@
6 6
7/* 7/*
8 * Every architecture must define this function. It's the fastest 8 * Every architecture must define this function. It's the fastest
9 * way of searching a 140-bit bitmap where the first 100 bits are 9 * way of searching a 100-bit bitmap. It's guaranteed that at least
10 * unlikely to be set. It's guaranteed that at least one of the 140 10 * one of the 100 bits is cleared.
11 * bits is cleared.
12 */ 11 */
13static inline int sched_find_first_bit(const unsigned long *b) 12static inline int sched_find_first_bit(const unsigned long *b)
14{ 13{
15#if BITS_PER_LONG == 64 14#if BITS_PER_LONG == 64
16 if (unlikely(b[0])) 15 if (b[0])
17 return __ffs(b[0]); 16 return __ffs(b[0]);
18 if (likely(b[1])) 17 return __ffs(b[1]) + 64;
19 return __ffs(b[1]) + 64;
20 return __ffs(b[2]) + 128;
21#elif BITS_PER_LONG == 32 18#elif BITS_PER_LONG == 32
22 if (unlikely(b[0])) 19 if (b[0])
23 return __ffs(b[0]); 20 return __ffs(b[0]);
24 if (unlikely(b[1])) 21 if (b[1])
25 return __ffs(b[1]) + 32; 22 return __ffs(b[1]) + 32;
26 if (unlikely(b[2])) 23 if (b[2])
27 return __ffs(b[2]) + 64; 24 return __ffs(b[2]) + 64;
28 if (b[3]) 25 return __ffs(b[3]) + 96;
29 return __ffs(b[3]) + 96;
30 return __ffs(b[4]) + 128;
31#else 26#else
32#error BITS_PER_LONG not defined 27#error BITS_PER_LONG not defined
33#endif 28#endif
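
Editor's note: the bitmap this helper scans shrinks from 140 bits to 100, presumably because only the MAX_RT_PRIO (100) realtime levels still use a priority bitmap in this series; it therefore fits in two 64-bit words or four 32-bit words, which is why the extra word and the unlikely() hints disappear above. A user-space sketch of the 64-bit branch, using the GCC builtin in place of __ffs() (assumption: __ffs() returns the 0-based index of the least-significant set bit):

#include <stdio.h>

/* User-space stand-in for the BITS_PER_LONG == 64 branch of
 * sched_find_first_bit(); assumes a 64-bit unsigned long and that
 * the caller guarantees at least one bit is set. */
static int find_first_bit100(const unsigned long b[2])
{
	if (b[0])
		return __builtin_ctzl(b[0]);
	return __builtin_ctzl(b[1]) + 64;
}

int main(void)
{
	unsigned long high_only[2] = { 0, 1UL << 3 };	/* bit 67 set */
	unsigned long low_set[2]   = { 1UL << 5, 0 };	/* bit 5 set  */

	printf("%d %d\n", find_first_bit100(low_set),
	       find_first_bit100(high_only));		/* prints: 5 67 */
	return 0;
}
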
diff --git a/include/asm-mips/mach-au1x00/au1xxx_ide.h b/include/asm-mips/mach-au1x00/au1xxx_ide.h
index 8fcae21adbd5..4663e8b415c9 100644
--- a/include/asm-mips/mach-au1x00/au1xxx_ide.h
+++ b/include/asm-mips/mach-au1x00/au1xxx_ide.h
@@ -88,26 +88,26 @@ static const struct drive_list_entry dma_white_list [] = {
88/* 88/*
89 * Hitachi 89 * Hitachi
90 */ 90 */
91 { "HITACHI_DK14FA-20" , "ALL" }, 91 { "HITACHI_DK14FA-20" , NULL },
92 { "HTS726060M9AT00" , "ALL" }, 92 { "HTS726060M9AT00" , NULL },
93/* 93/*
94 * Maxtor 94 * Maxtor
95 */ 95 */
96 { "Maxtor 6E040L0" , "ALL" }, 96 { "Maxtor 6E040L0" , NULL },
97 { "Maxtor 6Y080P0" , "ALL" }, 97 { "Maxtor 6Y080P0" , NULL },
98 { "Maxtor 6Y160P0" , "ALL" }, 98 { "Maxtor 6Y160P0" , NULL },
99/* 99/*
100 * Seagate 100 * Seagate
101 */ 101 */
102 { "ST3120026A" , "ALL" }, 102 { "ST3120026A" , NULL },
103 { "ST320014A" , "ALL" }, 103 { "ST320014A" , NULL },
104 { "ST94011A" , "ALL" }, 104 { "ST94011A" , NULL },
105 { "ST340016A" , "ALL" }, 105 { "ST340016A" , NULL },
106/* 106/*
107 * Western Digital 107 * Western Digital
108 */ 108 */
109 { "WDC WD400UE-00HCT0" , "ALL" }, 109 { "WDC WD400UE-00HCT0" , NULL },
110 { "WDC WD400JB-00JJC0" , "ALL" }, 110 { "WDC WD400JB-00JJC0" , NULL },
111 { NULL , NULL } 111 { NULL , NULL }
112}; 112};
113 113
@@ -116,9 +116,9 @@ static const struct drive_list_entry dma_black_list [] = {
116/* 116/*
117 * Western Digital 117 * Western Digital
118 */ 118 */
119 { "WDC WD100EB-00CGH0" , "ALL" }, 119 { "WDC WD100EB-00CGH0" , NULL },
120 { "WDC WD200BB-00AUA1" , "ALL" }, 120 { "WDC WD200BB-00AUA1" , NULL },
121 { "WDC AC24300L" , "ALL" }, 121 { "WDC AC24300L" , NULL },
122 { NULL , NULL } 122 { NULL , NULL }
123}; 123};
124#endif 124#endif
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 7803014f3a11..8d302298a161 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -79,6 +79,19 @@
79#endif 79#endif
80 80
81#ifdef CONFIG_PREEMPT 81#ifdef CONFIG_PREEMPT
82# define PREEMPT_CHECK_OFFSET 1
83#else
84# define PREEMPT_CHECK_OFFSET 0
85#endif
86
87/*
88 * Check whether we were atomic before we did preempt_disable():
89 * (used by the scheduler)
90 */
91#define in_atomic_preempt_off() \
92 ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
93
94#ifdef CONFIG_PREEMPT
82# define preemptible() (preempt_count() == 0 && !irqs_disabled()) 95# define preemptible() (preempt_count() == 0 && !irqs_disabled())
83# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) 96# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
84#else 97#else
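
Editor's note: in_atomic_preempt_off() above masks PREEMPT_ACTIVE out of preempt_count() and compares the remainder with the offset the caller is expected to hold — 1 under CONFIG_PREEMPT (the caller has already done its own preempt_disable()) and 0 otherwise. A sketch of the kind of scheduler-side debug check the "(used by the scheduler)" comment refers to; the message and surrounding code are my guess, not quoted from the patch:

/* Sketch only: flag "scheduling while atomic" from inside schedule(),
 * after the scheduler's own preempt_disable() has been accounted for. */
static void sched_debug_check(struct task_struct *prev)
{
	if (unlikely(in_atomic_preempt_off()) && likely(!prev->exit_state)) {
		printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
		       prev->comm, prev->pid, preempt_count());
		dump_stack();
	}
}
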
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 1e365acdd369..19ab25804056 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -25,6 +25,7 @@
25#include <asm/system.h> 25#include <asm/system.h>
26#include <asm/io.h> 26#include <asm/io.h>
27#include <asm/semaphore.h> 27#include <asm/semaphore.h>
28#include <asm/mutex.h>
28 29
29/****************************************************************************** 30/******************************************************************************
30 * IDE driver configuration options (play with these as desired): 31 * IDE driver configuration options (play with these as desired):
@@ -685,6 +686,8 @@ typedef struct hwif_s {
685 u8 mwdma_mask; 686 u8 mwdma_mask;
686 u8 swdma_mask; 687 u8 swdma_mask;
687 688
689 u8 cbl; /* cable type */
690
688 hwif_chipset_t chipset; /* sub-module for tuning.. */ 691 hwif_chipset_t chipset; /* sub-module for tuning.. */
689 692
690 struct pci_dev *pci_dev; /* for pci chipsets */ 693 struct pci_dev *pci_dev; /* for pci chipsets */
@@ -735,8 +738,8 @@ typedef struct hwif_s {
735 void (*ide_dma_clear_irq)(ide_drive_t *drive); 738 void (*ide_dma_clear_irq)(ide_drive_t *drive);
736 void (*dma_host_on)(ide_drive_t *drive); 739 void (*dma_host_on)(ide_drive_t *drive);
737 void (*dma_host_off)(ide_drive_t *drive); 740 void (*dma_host_off)(ide_drive_t *drive);
738 int (*ide_dma_lostirq)(ide_drive_t *drive); 741 void (*dma_lost_irq)(ide_drive_t *drive);
739 int (*ide_dma_timeout)(ide_drive_t *drive); 742 void (*dma_timeout)(ide_drive_t *drive);
740 743
741 void (*OUTB)(u8 addr, unsigned long port); 744 void (*OUTB)(u8 addr, unsigned long port);
742 void (*OUTBSYNC)(ide_drive_t *drive, u8 addr, unsigned long port); 745 void (*OUTBSYNC)(ide_drive_t *drive, u8 addr, unsigned long port);
@@ -791,7 +794,6 @@ typedef struct hwif_s {
791 unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ 794 unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */
792 unsigned reset : 1; /* reset after probe */ 795 unsigned reset : 1; /* reset after probe */
793 unsigned autodma : 1; /* auto-attempt using DMA at boot */ 796 unsigned autodma : 1; /* auto-attempt using DMA at boot */
794 unsigned udma_four : 1; /* 1=ATA-66 capable, 0=default */
795 unsigned no_lba48 : 1; /* 1 = cannot do LBA48 */ 797 unsigned no_lba48 : 1; /* 1 = cannot do LBA48 */
796 unsigned no_lba48_dma : 1; /* 1 = cannot do LBA48 DMA */ 798 unsigned no_lba48_dma : 1; /* 1 = cannot do LBA48 DMA */
797 unsigned auto_poll : 1; /* supports nop auto-poll */ 799 unsigned auto_poll : 1; /* supports nop auto-poll */
@@ -863,7 +865,7 @@ typedef struct hwgroup_s {
863 865
864typedef struct ide_driver_s ide_driver_t; 866typedef struct ide_driver_s ide_driver_t;
865 867
866extern struct semaphore ide_setting_sem; 868extern struct mutex ide_setting_mtx;
867 869
868int set_io_32bit(ide_drive_t *, int); 870int set_io_32bit(ide_drive_t *, int);
869int set_pio_mode(ide_drive_t *, int); 871int set_pio_mode(ide_drive_t *, int);
@@ -1304,8 +1306,8 @@ extern int __ide_dma_check(ide_drive_t *);
1304extern int ide_dma_setup(ide_drive_t *); 1306extern int ide_dma_setup(ide_drive_t *);
1305extern void ide_dma_start(ide_drive_t *); 1307extern void ide_dma_start(ide_drive_t *);
1306extern int __ide_dma_end(ide_drive_t *); 1308extern int __ide_dma_end(ide_drive_t *);
1307extern int __ide_dma_lostirq(ide_drive_t *); 1309extern void ide_dma_lost_irq(ide_drive_t *);
1308extern int __ide_dma_timeout(ide_drive_t *); 1310extern void ide_dma_timeout(ide_drive_t *);
1309#endif /* CONFIG_BLK_DEV_IDEDMA_PCI */ 1311#endif /* CONFIG_BLK_DEV_IDEDMA_PCI */
1310 1312
1311#else 1313#else
@@ -1382,11 +1384,11 @@ extern const ide_pio_timings_t ide_pio_timings[6];
1382 1384
1383 1385
1384extern spinlock_t ide_lock; 1386extern spinlock_t ide_lock;
1385extern struct semaphore ide_cfg_sem; 1387extern struct mutex ide_cfg_mtx;
1386/* 1388/*
1387 * Structure locking: 1389 * Structure locking:
1388 * 1390 *
1389 * ide_cfg_sem and ide_lock together protect changes to 1391 * ide_cfg_mtx and ide_lock together protect changes to
1390 * ide_hwif_t->{next,hwgroup} 1392 * ide_hwif_t->{next,hwgroup}
1391 * ide_drive_t->next 1393 * ide_drive_t->next
1392 * 1394 *
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 693f0e6c54d4..cfb680585ab8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,8 @@
34#define SCHED_FIFO 1 34#define SCHED_FIFO 1
35#define SCHED_RR 2 35#define SCHED_RR 2
36#define SCHED_BATCH 3 36#define SCHED_BATCH 3
37/* SCHED_ISO: reserved but not implemented yet */
38#define SCHED_IDLE 5
37 39
38#ifdef __KERNEL__ 40#ifdef __KERNEL__
39 41
@@ -130,6 +132,26 @@ extern unsigned long nr_active(void);
130extern unsigned long nr_iowait(void); 132extern unsigned long nr_iowait(void);
131extern unsigned long weighted_cpuload(const int cpu); 133extern unsigned long weighted_cpuload(const int cpu);
132 134
135struct seq_file;
136struct cfs_rq;
137#ifdef CONFIG_SCHED_DEBUG
138extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
139extern void proc_sched_set_task(struct task_struct *p);
140extern void
141print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
142#else
143static inline void
144proc_sched_show_task(struct task_struct *p, struct seq_file *m)
145{
146}
147static inline void proc_sched_set_task(struct task_struct *p)
148{
149}
150static inline void
151print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
152{
153}
154#endif
133 155
134/* 156/*
135 * Task state bitmask. NOTE! These bits are also 157 * Task state bitmask. NOTE! These bits are also
@@ -193,6 +215,7 @@ struct task_struct;
193extern void sched_init(void); 215extern void sched_init(void);
194extern void sched_init_smp(void); 216extern void sched_init_smp(void);
195extern void init_idle(struct task_struct *idle, int cpu); 217extern void init_idle(struct task_struct *idle, int cpu);
218extern void init_idle_bootup_task(struct task_struct *idle);
196 219
197extern cpumask_t nohz_cpu_mask; 220extern cpumask_t nohz_cpu_mask;
198#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 221#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
@@ -479,7 +502,7 @@ struct signal_struct {
479 * from jiffies_to_ns(utime + stime) if sched_clock uses something 502 * from jiffies_to_ns(utime + stime) if sched_clock uses something
480 * other than jiffies.) 503 * other than jiffies.)
481 */ 504 */
482 unsigned long long sched_time; 505 unsigned long long sum_sched_runtime;
483 506
484 /* 507 /*
485 * We don't bother to synchronize most readers of this at all, 508 * We don't bother to synchronize most readers of this at all,
@@ -521,31 +544,6 @@ struct signal_struct {
521#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ 544#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
522#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ 545#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
523 546
524
525/*
526 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
527 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
528 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
529 * values are inverted: lower p->prio value means higher priority.
530 *
531 * The MAX_USER_RT_PRIO value allows the actual maximum
532 * RT priority to be separate from the value exported to
533 * user-space. This allows kernel threads to set their
534 * priority to a value higher than any user task. Note:
535 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
536 */
537
538#define MAX_USER_RT_PRIO 100
539#define MAX_RT_PRIO MAX_USER_RT_PRIO
540
541#define MAX_PRIO (MAX_RT_PRIO + 40)
542
543#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
544#define rt_task(p) rt_prio((p)->prio)
545#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
546#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
547#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
548
549/* 547/*
550 * Some day this will be a full-fledged user tracking system.. 548 * Some day this will be a full-fledged user tracking system..
551 */ 549 */
@@ -583,13 +581,13 @@ struct reclaim_state;
583#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 581#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
584struct sched_info { 582struct sched_info {
585 /* cumulative counters */ 583 /* cumulative counters */
586 unsigned long cpu_time, /* time spent on the cpu */ 584 unsigned long pcnt; /* # of times run on this cpu */
587 run_delay, /* time spent waiting on a runqueue */ 585 unsigned long long cpu_time, /* time spent on the cpu */
588 pcnt; /* # of timeslices run on this cpu */ 586 run_delay; /* time spent waiting on a runqueue */
589 587
590 /* timestamps */ 588 /* timestamps */
591 unsigned long last_arrival, /* when we last ran on a cpu */ 589 unsigned long long last_arrival,/* when we last ran on a cpu */
592 last_queued; /* when we were last queued to run */ 590 last_queued; /* when we were last queued to run */
593}; 591};
594#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 592#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
595 593
@@ -639,18 +637,24 @@ static inline int sched_info_on(void)
639#endif 637#endif
640} 638}
641 639
642enum idle_type 640enum cpu_idle_type {
643{ 641 CPU_IDLE,
644 SCHED_IDLE, 642 CPU_NOT_IDLE,
645 NOT_IDLE, 643 CPU_NEWLY_IDLE,
646 NEWLY_IDLE, 644 CPU_MAX_IDLE_TYPES
647 MAX_IDLE_TYPES
648}; 645};
649 646
650/* 647/*
651 * sched-domains (multiprocessor balancing) declarations: 648 * sched-domains (multiprocessor balancing) declarations:
652 */ 649 */
653#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ 650
651/*
652 * Increase resolution of nice-level calculations:
653 */
654#define SCHED_LOAD_SHIFT 10
655#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
656
657#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 5)
654 658
655#ifdef CONFIG_SMP 659#ifdef CONFIG_SMP
656#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ 660#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
@@ -719,14 +723,14 @@ struct sched_domain {
719 723
720#ifdef CONFIG_SCHEDSTATS 724#ifdef CONFIG_SCHEDSTATS
721 /* load_balance() stats */ 725 /* load_balance() stats */
722 unsigned long lb_cnt[MAX_IDLE_TYPES]; 726 unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
723 unsigned long lb_failed[MAX_IDLE_TYPES]; 727 unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
724 unsigned long lb_balanced[MAX_IDLE_TYPES]; 728 unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
725 unsigned long lb_imbalance[MAX_IDLE_TYPES]; 729 unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
726 unsigned long lb_gained[MAX_IDLE_TYPES]; 730 unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
727 unsigned long lb_hot_gained[MAX_IDLE_TYPES]; 731 unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
728 unsigned long lb_nobusyg[MAX_IDLE_TYPES]; 732 unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
729 unsigned long lb_nobusyq[MAX_IDLE_TYPES]; 733 unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
730 734
731 /* Active load balancing */ 735 /* Active load balancing */
732 unsigned long alb_cnt; 736 unsigned long alb_cnt;
@@ -753,12 +757,6 @@ struct sched_domain {
753extern int partition_sched_domains(cpumask_t *partition1, 757extern int partition_sched_domains(cpumask_t *partition1,
754 cpumask_t *partition2); 758 cpumask_t *partition2);
755 759
756/*
757 * Maximum cache size the migration-costs auto-tuning code will
758 * search from:
759 */
760extern unsigned int max_cache_size;
761
762#endif /* CONFIG_SMP */ 760#endif /* CONFIG_SMP */
763 761
764 762
@@ -809,14 +807,86 @@ struct mempolicy;
809struct pipe_inode_info; 807struct pipe_inode_info;
810struct uts_namespace; 808struct uts_namespace;
811 809
812enum sleep_type { 810struct rq;
813 SLEEP_NORMAL, 811struct sched_domain;
814 SLEEP_NONINTERACTIVE, 812
815 SLEEP_INTERACTIVE, 813struct sched_class {
816 SLEEP_INTERRUPTED, 814 struct sched_class *next;
815
816 void (*enqueue_task) (struct rq *rq, struct task_struct *p,
817 int wakeup, u64 now);
818 void (*dequeue_task) (struct rq *rq, struct task_struct *p,
819 int sleep, u64 now);
820 void (*yield_task) (struct rq *rq, struct task_struct *p);
821
822 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
823
824 struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
825 void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
826
827 int (*load_balance) (struct rq *this_rq, int this_cpu,
828 struct rq *busiest,
829 unsigned long max_nr_move, unsigned long max_load_move,
830 struct sched_domain *sd, enum cpu_idle_type idle,
831 int *all_pinned, unsigned long *total_load_moved);
832
833 void (*set_curr_task) (struct rq *rq);
834 void (*task_tick) (struct rq *rq, struct task_struct *p);
835 void (*task_new) (struct rq *rq, struct task_struct *p);
817}; 836};
818 837
819struct prio_array; 838struct load_weight {
839 unsigned long weight, inv_weight;
840};
841
842/*
843 * CFS stats for a schedulable entity (task, task-group etc)
844 *
845 * Current field usage histogram:
846 *
847 * 4 se->block_start
848 * 4 se->run_node
849 * 4 se->sleep_start
850 * 4 se->sleep_start_fair
851 * 6 se->load.weight
852 * 7 se->delta_fair
853 * 15 se->wait_runtime
854 */
855struct sched_entity {
856 long wait_runtime;
857 unsigned long delta_fair_run;
858 unsigned long delta_fair_sleep;
859 unsigned long delta_exec;
860 s64 fair_key;
861 struct load_weight load; /* for load-balancing */
862 struct rb_node run_node;
863 unsigned int on_rq;
864
865 u64 wait_start_fair;
866 u64 wait_start;
867 u64 exec_start;
868 u64 sleep_start;
869 u64 sleep_start_fair;
870 u64 block_start;
871 u64 sleep_max;
872 u64 block_max;
873 u64 exec_max;
874 u64 wait_max;
875 u64 last_ran;
876
877 u64 sum_exec_runtime;
878 s64 sum_wait_runtime;
879 s64 sum_sleep_runtime;
880 unsigned long wait_runtime_overruns;
881 unsigned long wait_runtime_underruns;
882#ifdef CONFIG_FAIR_GROUP_SCHED
883 struct sched_entity *parent;
884 /* rq on which this entity is (to be) queued: */
885 struct cfs_rq *cfs_rq;
886 /* rq "owned" by this entity/group: */
887 struct cfs_rq *my_q;
888#endif
889};
820 890
821struct task_struct { 891struct task_struct {
822 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 892 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
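
Editor's note: struct sched_class above is the new per-policy hook table (in this series the fair, realtime and idle policies each supply one), and struct sched_entity is the per-task accounting that CFS keys its rbtree on. To make the shape concrete, here is a purely hypothetical class instance wired up with stub callbacks — every demo_* name is invented for illustration and the table is deliberately incomplete:

/* Hypothetical, incomplete sketch of how a policy fills in sched_class;
 * only the signatures are taken from the declaration above. */
static void demo_enqueue_task(struct rq *rq, struct task_struct *p,
			      int wakeup, u64 now)
{
	/* put p on this policy's runqueue, e.g. insert into an rbtree */
}

static void demo_dequeue_task(struct rq *rq, struct task_struct *p,
			      int sleep, u64 now)
{
	/* remove p from this policy's runqueue */
}

static struct task_struct *demo_pick_next_task(struct rq *rq, u64 now)
{
	return NULL;	/* nothing runnable in this demo policy */
}

static struct sched_class demo_sched_class = {
	.enqueue_task	= demo_enqueue_task,
	.dequeue_task	= demo_dequeue_task,
	.pick_next_task	= demo_pick_next_task,
	/* yield_task, check_preempt_curr, put_prev_task, load_balance,
	 * set_curr_task, task_tick and task_new omitted for brevity */
};
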
@@ -832,23 +902,20 @@ struct task_struct {
832 int oncpu; 902 int oncpu;
833#endif 903#endif
834#endif 904#endif
835 int load_weight; /* for niceness load balancing purposes */ 905
836 int prio, static_prio, normal_prio; 906 int prio, static_prio, normal_prio;
837 struct list_head run_list; 907 struct list_head run_list;
838 struct prio_array *array; 908 struct sched_class *sched_class;
909 struct sched_entity se;
839 910
840 unsigned short ioprio; 911 unsigned short ioprio;
841#ifdef CONFIG_BLK_DEV_IO_TRACE 912#ifdef CONFIG_BLK_DEV_IO_TRACE
842 unsigned int btrace_seq; 913 unsigned int btrace_seq;
843#endif 914#endif
844 unsigned long sleep_avg;
845 unsigned long long timestamp, last_ran;
846 unsigned long long sched_time; /* sched_clock time spent running */
847 enum sleep_type sleep_type;
848 915
849 unsigned int policy; 916 unsigned int policy;
850 cpumask_t cpus_allowed; 917 cpumask_t cpus_allowed;
851 unsigned int time_slice, first_time_slice; 918 unsigned int time_slice;
852 919
853#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 920#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
854 struct sched_info sched_info; 921 struct sched_info sched_info;
@@ -1078,6 +1145,37 @@ struct task_struct {
1078#endif 1145#endif
1079}; 1146};
1080 1147
1148/*
1149 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
1150 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
1151 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
1152 * values are inverted: lower p->prio value means higher priority.
1153 *
1154 * The MAX_USER_RT_PRIO value allows the actual maximum
1155 * RT priority to be separate from the value exported to
1156 * user-space. This allows kernel threads to set their
1157 * priority to a value higher than any user task. Note:
1158 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
1159 */
1160
1161#define MAX_USER_RT_PRIO 100
1162#define MAX_RT_PRIO MAX_USER_RT_PRIO
1163
1164#define MAX_PRIO (MAX_RT_PRIO + 40)
1165#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
1166
1167static inline int rt_prio(int prio)
1168{
1169 if (unlikely(prio < MAX_RT_PRIO))
1170 return 1;
1171 return 0;
1172}
1173
1174static inline int rt_task(struct task_struct *p)
1175{
1176 return rt_prio(p->prio);
1177}
1178
1081static inline pid_t process_group(struct task_struct *tsk) 1179static inline pid_t process_group(struct task_struct *tsk)
1082{ 1180{
1083 return tsk->signal->pgrp; 1181 return tsk->signal->pgrp;
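
Editor's note: the relocated priority block above also turns rt_prio()/rt_task() from macros into inline functions; per its comment, priorities 0..99 are realtime and 100..139 are normal, with DEFAULT_PRIO at 120. A trivial stand-alone check of those boundaries (constants copied from the hunk):

#include <stdio.h>

#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO	 MAX_USER_RT_PRIO
#define MAX_PRIO	 (MAX_RT_PRIO + 40)
#define DEFAULT_PRIO	 (MAX_RT_PRIO + 20)

/* same test as the new inline rt_prio() */
static int rt_prio(int prio)
{
	return prio < MAX_RT_PRIO;
}

int main(void)
{
	printf("prio 0:   rt=%d\n", rt_prio(0));		/* 1 */
	printf("prio 99:  rt=%d\n", rt_prio(99));		/* 1 */
	printf("prio %d: rt=%d (default)\n",
	       DEFAULT_PRIO, rt_prio(DEFAULT_PRIO));		/* 120, 0 */
	printf("last valid prio: %d\n", MAX_PRIO - 1);		/* 139 */
	return 0;
}
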
@@ -1223,7 +1321,7 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1223 1321
1224extern unsigned long long sched_clock(void); 1322extern unsigned long long sched_clock(void);
1225extern unsigned long long 1323extern unsigned long long
1226current_sched_time(const struct task_struct *current_task); 1324task_sched_runtime(struct task_struct *task);
1227 1325
1228/* sched_exec is called by processes performing an exec */ 1326/* sched_exec is called by processes performing an exec */
1229#ifdef CONFIG_SMP 1327#ifdef CONFIG_SMP
@@ -1232,6 +1330,8 @@ extern void sched_exec(void);
1232#define sched_exec() {} 1330#define sched_exec() {}
1233#endif 1331#endif
1234 1332
1333extern void sched_clock_unstable_event(void);
1334
1235#ifdef CONFIG_HOTPLUG_CPU 1335#ifdef CONFIG_HOTPLUG_CPU
1236extern void idle_task_exit(void); 1336extern void idle_task_exit(void);
1237#else 1337#else
@@ -1240,6 +1340,14 @@ static inline void idle_task_exit(void) {}
1240 1340
1241extern void sched_idle_next(void); 1341extern void sched_idle_next(void);
1242 1342
1343extern unsigned int sysctl_sched_granularity;
1344extern unsigned int sysctl_sched_wakeup_granularity;
1345extern unsigned int sysctl_sched_batch_wakeup_granularity;
1346extern unsigned int sysctl_sched_stat_granularity;
1347extern unsigned int sysctl_sched_runtime_limit;
1348extern unsigned int sysctl_sched_child_runs_first;
1349extern unsigned int sysctl_sched_features;
1350
1243#ifdef CONFIG_RT_MUTEXES 1351#ifdef CONFIG_RT_MUTEXES
1244extern int rt_mutex_getprio(struct task_struct *p); 1352extern int rt_mutex_getprio(struct task_struct *p);
1245extern void rt_mutex_setprio(struct task_struct *p, int prio); 1353extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1317,8 +1425,8 @@ extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
1317#else 1425#else
1318 static inline void kick_process(struct task_struct *tsk) { } 1426 static inline void kick_process(struct task_struct *tsk) { }
1319#endif 1427#endif
1320extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags)); 1428extern void sched_fork(struct task_struct *p, int clone_flags);
1321extern void FASTCALL(sched_exit(struct task_struct * p)); 1429extern void sched_dead(struct task_struct *p);
1322 1430
1323extern int in_group_p(gid_t); 1431extern int in_group_p(gid_t);
1324extern int in_egroup_p(gid_t); 1432extern int in_egroup_p(gid_t);
@@ -1406,7 +1514,7 @@ extern struct mm_struct * mm_alloc(void);
1406extern void FASTCALL(__mmdrop(struct mm_struct *)); 1514extern void FASTCALL(__mmdrop(struct mm_struct *));
1407static inline void mmdrop(struct mm_struct * mm) 1515static inline void mmdrop(struct mm_struct * mm)
1408{ 1516{
1409 if (atomic_dec_and_test(&mm->mm_count)) 1517 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
1410 __mmdrop(mm); 1518 __mmdrop(mm);
1411} 1519}
1412 1520
@@ -1638,10 +1746,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
1638 return task_thread_info(p)->cpu; 1746 return task_thread_info(p)->cpu;
1639} 1747}
1640 1748
1641static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 1749extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
1642{
1643 task_thread_info(p)->cpu = cpu;
1644}
1645 1750
1646#else 1751#else
1647 1752
diff --git a/include/linux/topology.h b/include/linux/topology.h
index a9d1f049cc15..da6c39b2d051 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -98,7 +98,7 @@
98 .cache_nice_tries = 0, \ 98 .cache_nice_tries = 0, \
99 .busy_idx = 0, \ 99 .busy_idx = 0, \
100 .idle_idx = 0, \ 100 .idle_idx = 0, \
101 .newidle_idx = 1, \ 101 .newidle_idx = 0, \
102 .wake_idx = 0, \ 102 .wake_idx = 0, \
103 .forkexec_idx = 0, \ 103 .forkexec_idx = 0, \
104 .flags = SD_LOAD_BALANCE \ 104 .flags = SD_LOAD_BALANCE \
@@ -128,14 +128,15 @@
128 .imbalance_pct = 125, \ 128 .imbalance_pct = 125, \
129 .cache_nice_tries = 1, \ 129 .cache_nice_tries = 1, \
130 .busy_idx = 2, \ 130 .busy_idx = 2, \
131 .idle_idx = 1, \ 131 .idle_idx = 0, \
132 .newidle_idx = 2, \ 132 .newidle_idx = 0, \
133 .wake_idx = 1, \ 133 .wake_idx = 1, \
134 .forkexec_idx = 1, \ 134 .forkexec_idx = 1, \
135 .flags = SD_LOAD_BALANCE \ 135 .flags = SD_LOAD_BALANCE \
136 | SD_BALANCE_NEWIDLE \ 136 | SD_BALANCE_NEWIDLE \
137 | SD_BALANCE_EXEC \ 137 | SD_BALANCE_EXEC \
138 | SD_WAKE_AFFINE \ 138 | SD_WAKE_AFFINE \
139 | SD_WAKE_IDLE \
139 | SD_SHARE_PKG_RESOURCES\ 140 | SD_SHARE_PKG_RESOURCES\
140 | BALANCE_FOR_MC_POWER, \ 141 | BALANCE_FOR_MC_POWER, \
141 .last_balance = jiffies, \ 142 .last_balance = jiffies, \
@@ -158,14 +159,15 @@
158 .imbalance_pct = 125, \ 159 .imbalance_pct = 125, \
159 .cache_nice_tries = 1, \ 160 .cache_nice_tries = 1, \
160 .busy_idx = 2, \ 161 .busy_idx = 2, \
161 .idle_idx = 1, \ 162 .idle_idx = 0, \
162 .newidle_idx = 2, \ 163 .newidle_idx = 0, \
163 .wake_idx = 1, \ 164 .wake_idx = 1, \
164 .forkexec_idx = 1, \ 165 .forkexec_idx = 1, \
165 .flags = SD_LOAD_BALANCE \ 166 .flags = SD_LOAD_BALANCE \
166 | SD_BALANCE_NEWIDLE \ 167 | SD_BALANCE_NEWIDLE \
167 | SD_BALANCE_EXEC \ 168 | SD_BALANCE_EXEC \
168 | SD_WAKE_AFFINE \ 169 | SD_WAKE_AFFINE \
170 | SD_WAKE_IDLE \
169 | BALANCE_FOR_PKG_POWER,\ 171 | BALANCE_FOR_PKG_POWER,\
170 .last_balance = jiffies, \ 172 .last_balance = jiffies, \
171 .balance_interval = 1, \ 173 .balance_interval = 1, \
diff --git a/include/linux/wait.h b/include/linux/wait.h
index e820d00e1383..0e686280450b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -366,15 +366,15 @@ static inline void remove_wait_queue_locked(wait_queue_head_t *q,
366 366
367/* 367/*
368 * These are the old interfaces to sleep waiting for an event. 368 * These are the old interfaces to sleep waiting for an event.
369 * They are racy. DO NOT use them, use the wait_event* interfaces above. 369 * They are racy. DO NOT use them, use the wait_event* interfaces above.
370 * We plan to remove these interfaces during 2.7. 370 * We plan to remove these interfaces.
371 */ 371 */
372extern void FASTCALL(sleep_on(wait_queue_head_t *q)); 372extern void sleep_on(wait_queue_head_t *q);
373extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, 373extern long sleep_on_timeout(wait_queue_head_t *q,
374 signed long timeout)); 374 signed long timeout);
375extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); 375extern void interruptible_sleep_on(wait_queue_head_t *q);
376extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, 376extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
377 signed long timeout)); 377 signed long timeout);
378 378
379/* 379/*
380 * Waitqueues which are removed from the waitqueue_head at wakeup time 380 * Waitqueues which are removed from the waitqueue_head at wakeup time
diff --git a/init/main.c b/init/main.c
index eb8bdbae4fc7..0eb1c7463fe4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -436,15 +436,16 @@ static void noinline __init_refok rest_init(void)
436 436
437 /* 437 /*
438 * The boot idle thread must execute schedule() 438 * The boot idle thread must execute schedule()
439 * at least one to get things moving: 439 * at least once to get things moving:
440 */ 440 */
441 init_idle_bootup_task(current);
441 preempt_enable_no_resched(); 442 preempt_enable_no_resched();
442 schedule(); 443 schedule();
443 preempt_disable(); 444 preempt_disable();
444 445
445 /* Call into cpu_idle with preempt disabled */ 446 /* Call into cpu_idle with preempt disabled */
446 cpu_idle(); 447 cpu_idle();
447} 448}
448 449
449/* Check for early params. */ 450/* Check for early params. */
450static int __init do_early_param(char *param, char *val) 451static int __init do_early_param(char *param, char *val)
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c0148ae992c4..81e697829633 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -99,9 +99,10 @@ void __delayacct_blkio_end(void)
99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
100{ 100{
101 s64 tmp; 101 s64 tmp;
102 struct timespec ts; 102 unsigned long t1;
103 unsigned long t1,t2,t3; 103 unsigned long long t2, t3;
104 unsigned long flags; 104 unsigned long flags;
105 struct timespec ts;
105 106
106 /* Though tsk->delays accessed later, early exit avoids 107 /* Though tsk->delays accessed later, early exit avoids
107 * unnecessary returning of other data 108 * unnecessary returning of other data
@@ -124,11 +125,10 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
124 125
125 d->cpu_count += t1; 126 d->cpu_count += t1;
126 127
127 jiffies_to_timespec(t2, &ts); 128 tmp = (s64)d->cpu_delay_total + t2;
128 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; 129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
130 130
131 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; 131 tmp = (s64)d->cpu_run_virtual_total + t3;
132 d->cpu_run_virtual_total = 132 d->cpu_run_virtual_total =
133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; 133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
134 134
diff --git a/kernel/exit.c b/kernel/exit.c
index 5c8ecbaa19a5..ca6a11b73023 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->maj_flt += tsk->maj_flt; 122 sig->maj_flt += tsk->maj_flt;
123 sig->nvcsw += tsk->nvcsw; 123 sig->nvcsw += tsk->nvcsw;
124 sig->nivcsw += tsk->nivcsw; 124 sig->nivcsw += tsk->nivcsw;
125 sig->sched_time += tsk->sched_time;
126 sig->inblock += task_io_get_inblock(tsk); 125 sig->inblock += task_io_get_inblock(tsk);
127 sig->oublock += task_io_get_oublock(tsk); 126 sig->oublock += task_io_get_oublock(tsk);
127 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
128 sig = NULL; /* Marker for below. */ 128 sig = NULL; /* Marker for below. */
129 } 129 }
130 130
@@ -182,7 +182,6 @@ repeat:
182 zap_leader = (leader->exit_signal == -1); 182 zap_leader = (leader->exit_signal == -1);
183 } 183 }
184 184
185 sched_exit(p);
186 write_unlock_irq(&tasklist_lock); 185 write_unlock_irq(&tasklist_lock);
187 proc_flush_task(p); 186 proc_flush_task(p);
188 release_thread(p); 187 release_thread(p);
@@ -291,7 +290,7 @@ static void reparent_to_kthreadd(void)
291 /* Set the exit signal to SIGCHLD so we signal init on exit */ 290 /* Set the exit signal to SIGCHLD so we signal init on exit */
292 current->exit_signal = SIGCHLD; 291 current->exit_signal = SIGCHLD;
293 292
294 if (!has_rt_policy(current) && (task_nice(current) < 0)) 293 if (task_nice(current) < 0)
295 set_user_nice(current, 0); 294 set_user_nice(current, 0);
296 /* cpus_allowed? */ 295 /* cpus_allowed? */
297 /* rt_priority? */ 296 /* rt_priority? */
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cda1bcd..da3a155bba0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,7 +877,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
877 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 877 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
878 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 878 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
879 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 879 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
880 sig->sched_time = 0; 880 sig->sum_sched_runtime = 0;
881 INIT_LIST_HEAD(&sig->cpu_timers[0]); 881 INIT_LIST_HEAD(&sig->cpu_timers[0]);
882 INIT_LIST_HEAD(&sig->cpu_timers[1]); 882 INIT_LIST_HEAD(&sig->cpu_timers[1]);
883 INIT_LIST_HEAD(&sig->cpu_timers[2]); 883 INIT_LIST_HEAD(&sig->cpu_timers[2]);
@@ -1040,7 +1040,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1040 1040
1041 p->utime = cputime_zero; 1041 p->utime = cputime_zero;
1042 p->stime = cputime_zero; 1042 p->stime = cputime_zero;
1043 p->sched_time = 0; 1043
1044#ifdef CONFIG_TASK_XACCT 1044#ifdef CONFIG_TASK_XACCT
1045 p->rchar = 0; /* I/O counter: bytes read */ 1045 p->rchar = 0; /* I/O counter: bytes read */
1046 p->wchar = 0; /* I/O counter: bytes written */ 1046 p->wchar = 0; /* I/O counter: bytes written */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1de710e18373..b53c8fcd9d82 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p)
161} 161}
162static inline unsigned long long sched_ns(struct task_struct *p) 162static inline unsigned long long sched_ns(struct task_struct *p)
163{ 163{
164 return (p == current) ? current_sched_time(p) : p->sched_time; 164 return task_sched_runtime(p);
165} 165}
166 166
167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
246 } while (t != p); 246 } while (t != p);
247 break; 247 break;
248 case CPUCLOCK_SCHED: 248 case CPUCLOCK_SCHED:
249 cpu->sched = p->signal->sched_time; 249 cpu->sched = p->signal->sum_sched_runtime;
250 /* Add in each other live thread. */ 250 /* Add in each other live thread. */
251 while ((t = next_thread(t)) != p) { 251 while ((t = next_thread(t)) != p) {
252 cpu->sched += t->sched_time; 252 cpu->sched += t->se.sum_exec_runtime;
253 } 253 }
254 cpu->sched += sched_ns(p); 254 cpu->sched += sched_ns(p);
255 break; 255 break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
422 */ 422 */
423static void cleanup_timers(struct list_head *head, 423static void cleanup_timers(struct list_head *head,
424 cputime_t utime, cputime_t stime, 424 cputime_t utime, cputime_t stime,
425 unsigned long long sched_time) 425 unsigned long long sum_exec_runtime)
426{ 426{
427 struct cpu_timer_list *timer, *next; 427 struct cpu_timer_list *timer, *next;
428 cputime_t ptime = cputime_add(utime, stime); 428 cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head,
451 ++head; 451 ++head;
452 list_for_each_entry_safe(timer, next, head, entry) { 452 list_for_each_entry_safe(timer, next, head, entry) {
453 list_del_init(&timer->entry); 453 list_del_init(&timer->entry);
454 if (timer->expires.sched < sched_time) { 454 if (timer->expires.sched < sum_exec_runtime) {
455 timer->expires.sched = 0; 455 timer->expires.sched = 0;
456 } else { 456 } else {
457 timer->expires.sched -= sched_time; 457 timer->expires.sched -= sum_exec_runtime;
458 } 458 }
459 } 459 }
460} 460}
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head,
467void posix_cpu_timers_exit(struct task_struct *tsk) 467void posix_cpu_timers_exit(struct task_struct *tsk)
468{ 468{
469 cleanup_timers(tsk->cpu_timers, 469 cleanup_timers(tsk->cpu_timers,
470 tsk->utime, tsk->stime, tsk->sched_time); 470 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
471 471
472} 472}
473void posix_cpu_timers_exit_group(struct task_struct *tsk) 473void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
475 cleanup_timers(tsk->signal->cpu_timers, 475 cleanup_timers(tsk->signal->cpu_timers,
476 cputime_add(tsk->utime, tsk->signal->utime), 476 cputime_add(tsk->utime, tsk->signal->utime),
477 cputime_add(tsk->stime, tsk->signal->stime), 477 cputime_add(tsk->stime, tsk->signal->stime),
478 tsk->sched_time + tsk->signal->sched_time); 478 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
479} 479}
480 480
481 481
@@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p,
536 nsleft = max_t(unsigned long long, nsleft, 1); 536 nsleft = max_t(unsigned long long, nsleft, 1);
537 do { 537 do {
538 if (likely(!(t->flags & PF_EXITING))) { 538 if (likely(!(t->flags & PF_EXITING))) {
539 ns = t->sched_time + nsleft; 539 ns = t->se.sum_exec_runtime + nsleft;
540 if (t->it_sched_expires == 0 || 540 if (t->it_sched_expires == 0 ||
541 t->it_sched_expires > ns) { 541 t->it_sched_expires > ns) {
542 t->it_sched_expires = ns; 542 t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk,
1004 struct cpu_timer_list *t = list_first_entry(timers, 1004 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 1005 struct cpu_timer_list,
1006 entry); 1006 entry);
1007 if (!--maxfire || tsk->sched_time < t->expires.sched) { 1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 1008 tsk->it_sched_expires = t->expires.sched;
1009 break; 1009 break;
1010 } 1010 }
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk,
1024 int maxfire; 1024 int maxfire;
1025 struct signal_struct *const sig = tsk->signal; 1025 struct signal_struct *const sig = tsk->signal;
1026 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1026 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1027 unsigned long long sched_time, sched_expires; 1027 unsigned long long sum_sched_runtime, sched_expires;
1028 struct task_struct *t; 1028 struct task_struct *t;
1029 struct list_head *timers = sig->cpu_timers; 1029 struct list_head *timers = sig->cpu_timers;
1030 1030
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk,
1044 */ 1044 */
1045 utime = sig->utime; 1045 utime = sig->utime;
1046 stime = sig->stime; 1046 stime = sig->stime;
1047 sched_time = sig->sched_time; 1047 sum_sched_runtime = sig->sum_sched_runtime;
1048 t = tsk; 1048 t = tsk;
1049 do { 1049 do {
1050 utime = cputime_add(utime, t->utime); 1050 utime = cputime_add(utime, t->utime);
1051 stime = cputime_add(stime, t->stime); 1051 stime = cputime_add(stime, t->stime);
1052 sched_time += t->sched_time; 1052 sum_sched_runtime += t->se.sum_exec_runtime;
1053 t = next_thread(t); 1053 t = next_thread(t);
1054 } while (t != tsk); 1054 } while (t != tsk);
1055 ptime = cputime_add(utime, stime); 1055 ptime = cputime_add(utime, stime);
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk,
1090 struct cpu_timer_list *t = list_first_entry(timers, 1090 struct cpu_timer_list *t = list_first_entry(timers,
1091 struct cpu_timer_list, 1091 struct cpu_timer_list,
1092 entry); 1092 entry);
1093 if (!--maxfire || sched_time < t->expires.sched) { 1093 if (!--maxfire || sum_sched_runtime < t->expires.sched) {
1094 sched_expires = t->expires.sched; 1094 sched_expires = t->expires.sched;
1095 break; 1095 break;
1096 } 1096 }
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk,
1182 virt_left = cputime_sub(virt_expires, utime); 1182 virt_left = cputime_sub(virt_expires, utime);
1183 virt_left = cputime_div_non_zero(virt_left, nthreads); 1183 virt_left = cputime_div_non_zero(virt_left, nthreads);
1184 if (sched_expires) { 1184 if (sched_expires) {
1185 sched_left = sched_expires - sched_time; 1185 sched_left = sched_expires - sum_sched_runtime;
1186 do_div(sched_left, nthreads); 1186 do_div(sched_left, nthreads);
1187 sched_left = max_t(unsigned long long, sched_left, 1); 1187 sched_left = max_t(unsigned long long, sched_left, 1);
1188 } else { 1188 } else {
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk,
1208 t->it_virt_expires = ticks; 1208 t->it_virt_expires = ticks;
1209 } 1209 }
1210 1210
1211 sched = t->sched_time + sched_left; 1211 sched = t->se.sum_exec_runtime + sched_left;
1212 if (sched_expires && (t->it_sched_expires == 0 || 1212 if (sched_expires && (t->it_sched_expires == 0 ||
1213 t->it_sched_expires > sched)) { 1213 t->it_sched_expires > sched)) {
1214 t->it_sched_expires = sched; 1214 t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1300 1300
1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
1302 (tsk->it_sched_expires == 0 || 1302 (tsk->it_sched_expires == 0 ||
1303 tsk->sched_time < tsk->it_sched_expires)) 1303 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1304 return; 1304 return;
1305 1305
1306#undef UNEXPIRED 1306#undef UNEXPIRED
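For clarity, after this rename the group-wide CPUCLOCK_SCHED value is composed as in the sketch below (illustrative helper name, mirroring the summation loops in cpu_clock_sample_group_locked() and check_process_timers() above): dead threads are accounted in signal->sum_sched_runtime, live ones contribute se.sum_exec_runtime.

/* Hedged sketch, not part of the patch; the real callers hold the
 * appropriate sighand/tasklist locking around the thread walk. */
static unsigned long long group_sched_runtime(struct task_struct *p)
{
	unsigned long long sum = p->signal->sum_sched_runtime;
	struct task_struct *t = p;

	do {
		sum += t->se.sum_exec_runtime;
		t = next_thread(t);
	} while (t != p);

	return sum;
}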
diff --git a/kernel/sched.c b/kernel/sched.c
index 50e1a3122699..9fbced64bfee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -16,13 +16,19 @@
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
19 */ 25 */
20 26
21#include <linux/mm.h> 27#include <linux/mm.h>
22#include <linux/module.h> 28#include <linux/module.h>
23#include <linux/nmi.h> 29#include <linux/nmi.h>
24#include <linux/init.h> 30#include <linux/init.h>
25#include <asm/uaccess.h> 31#include <linux/uaccess.h>
26#include <linux/highmem.h> 32#include <linux/highmem.h>
27#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -53,9 +59,9 @@
53#include <linux/kprobes.h> 59#include <linux/kprobes.h>
54#include <linux/delayacct.h> 60#include <linux/delayacct.h>
55#include <linux/reciprocal_div.h> 61#include <linux/reciprocal_div.h>
62#include <linux/unistd.h>
56 63
57#include <asm/tlb.h> 64#include <asm/tlb.h>
58#include <asm/unistd.h>
59 65
60/* 66/*
61 * Scheduler clock - returns current time in nanosec units. 67 * Scheduler clock - returns current time in nanosec units.
@@ -91,6 +97,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 97#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 98#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93 99
100#define NICE_0_LOAD SCHED_LOAD_SCALE
101#define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
94/* 103/*
95 * These are the 'tuning knobs' of the scheduler: 104 * These are the 'tuning knobs' of the scheduler:
96 * 105 *
@@ -100,87 +109,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
100 */ 109 */
101#define MIN_TIMESLICE max(5 * HZ / 1000, 1) 110#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
102#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
103#define ON_RUNQUEUE_WEIGHT 30
104#define CHILD_PENALTY 95
105#define PARENT_PENALTY 100
106#define EXIT_WEIGHT 3
107#define PRIO_BONUS_RATIO 25
108#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
109#define INTERACTIVE_DELTA 2
110#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
111#define STARVATION_LIMIT (MAX_SLEEP_AVG)
112#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
113
114/*
115 * If a task is 'interactive' then we reinsert it in the active
116 * array after it has expired its current timeslice. (it will not
117 * continue to run immediately, it will still roundrobin with
118 * other interactive tasks.)
119 *
120 * This part scales the interactivity limit depending on niceness.
121 *
122 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
123 * Here are a few examples of different nice levels:
124 *
125 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
126 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
127 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
129 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
130 *
131 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
132 * priority range a task can explore, a value of '1' means the
133 * task is rated interactive.)
134 *
135 * Ie. nice +19 tasks can never get 'interactive' enough to be
136 * reinserted into the active array. And only heavily CPU-hog nice -20
137 * tasks will be expired. Default nice 0 tasks are somewhere between,
138 * it takes some effort for them to get interactive, but it's not
139 * too hard.
140 */
141
142#define CURRENT_BONUS(p) \
143 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
144 MAX_SLEEP_AVG)
145
146#define GRANULARITY (10 * HZ / 1000 ? : 1)
147
148#ifdef CONFIG_SMP
149#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
150 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
151 num_online_cpus())
152#else
153#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
154 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
155#endif
156
157#define SCALE(v1,v1_max,v2_max) \
158 (v1) * (v2_max) / (v1_max)
159
160#define DELTA(p) \
161 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
162 INTERACTIVE_DELTA)
163
164#define TASK_INTERACTIVE(p) \
165 ((p)->prio <= (p)->static_prio - DELTA(p))
166
167#define INTERACTIVE_SLEEP(p) \
168 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
170
171#define TASK_PREEMPTS_CURR(p, rq) \
172 ((p)->prio < (rq)->curr->prio)
173
174#define SCALE_PRIO(x, prio) \
175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
176
177static unsigned int static_prio_timeslice(int static_prio)
178{
179 if (static_prio < NICE_TO_PRIO(0))
180 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
181 else
182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
183}
184 112
185#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
186/* 114/*
@@ -203,28 +131,87 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203} 131}
204#endif 132#endif
205 133
134#define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
206/* 137/*
207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
208 * to time slice values: [800ms ... 100ms ... 5ms] 139 * to time slice values: [800ms ... 100ms ... 5ms]
209 *
210 * The higher a thread's priority, the bigger timeslices
211 * it gets during one round of execution. But even the lowest
212 * priority thread gets MIN_TIMESLICE worth of execution time.
213 */ 140 */
141static unsigned int static_prio_timeslice(int static_prio)
142{
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
145
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
150}
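A worked expansion of this mapping, assuming the usual constants defined outside this hunk (MAX_PRIO = 140, MAX_USER_PRIO = 40, DEF_TIMESLICE = 100 ms worth of jiffies):

/*
 * static_prio_timeslice() examples (illustrative, under the
 * assumptions named above):
 *
 *   nice -20 (prio 100): 4*DEF_TIMESLICE * (140-100)/20 = 800 ms
 *   nice   0 (prio 120):   DEF_TIMESLICE * (140-120)/20 = 100 ms
 *   nice +10 (prio 130):   DEF_TIMESLICE * (140-130)/20 =  50 ms
 *   nice +19 (prio 139): special-cased above to a single jiffy
 */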
151
152static inline int rt_policy(int policy)
153{
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157}
214 158
215static inline unsigned int task_timeslice(struct task_struct *p) 159static inline int task_has_rt_policy(struct task_struct *p)
216{ 160{
217 return static_prio_timeslice(p->static_prio); 161 return rt_policy(p->policy);
218} 162}
219 163
220/* 164/*
221 * These are the runqueue data structures: 165 * This is the priority-queue data structure of the RT scheduling class:
222 */ 166 */
167struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170};
171
172struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
176};
177
178/* CFS-related fields in a runqueue */
179struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192#ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
 194 * It is set to NULL otherwise (i.e. when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
223 198
224struct prio_array { 199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
225 unsigned int nr_active; 200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
226 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 201 * (like users, containers etc.)
227 struct list_head queue[MAX_PRIO]; 202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207#endif
208};
209
210/* Real-Time classes' related field in a runqueue: */
211struct rt_rq {
212 struct rt_prio_array active;
213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
228}; 215};
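A sketch of how an rt_prio_array is consumed (roughly what the pick-next path in sched_rt.c does; the helper name here is illustrative): find the first set priority bit, then take the head of that priority's list.

/* Hedged sketch, not part of this hunk. */
static struct task_struct *rt_array_pick_sketch(struct rt_prio_array *array)
{
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	if (idx >= MAX_RT_PRIO)
		return NULL;			/* no runnable RT task */

	queue = array->queue + idx;
	return list_entry(queue->next, struct task_struct, run_list);
}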
229 216
230/* 217/*
@@ -235,22 +222,28 @@ struct prio_array {
235 * acquire operations must be ordered by ascending &runqueue. 222 * acquire operations must be ordered by ascending &runqueue.
236 */ 223 */
237struct rq { 224struct rq {
238 spinlock_t lock; 225 spinlock_t lock; /* runqueue lock */
239 226
240 /* 227 /*
241 * nr_running and cpu_load should be in the same cacheline because 228 * nr_running and cpu_load should be in the same cacheline because
242 * remote CPUs use both these fields when doing load calculation. 229 * remote CPUs use both these fields when doing load calculation.
243 */ 230 */
244 unsigned long nr_running; 231 unsigned long nr_running;
245 unsigned long raw_weighted_load; 232 #define CPU_LOAD_IDX_MAX 5
246#ifdef CONFIG_SMP 233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick; 234 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ 235#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently; 236 unsigned char in_nohz_recently;
251#endif 237#endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243#ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
252#endif 245#endif
253 unsigned long long nr_switches; 246 struct rt_rq rt;
254 247
255 /* 248 /*
256 * This is part of a global counter where only the total sum 249 * This is part of a global counter where only the total sum
@@ -260,14 +253,18 @@ struct rq {
260 */ 253 */
261 unsigned long nr_uninterruptible; 254 unsigned long nr_uninterruptible;
262 255
263 unsigned long expired_timestamp;
264 /* Cached timestamp set by update_cpu_clock() */
265 unsigned long long most_recent_timestamp;
266 struct task_struct *curr, *idle; 256 struct task_struct *curr, *idle;
267 unsigned long next_balance; 257 unsigned long next_balance;
268 struct mm_struct *prev_mm; 258 struct mm_struct *prev_mm;
269 struct prio_array *active, *expired, arrays[2]; 259
270 int best_expired_prio; 260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
271 atomic_t nr_iowait; 268 atomic_t nr_iowait;
272 269
273#ifdef CONFIG_SMP 270#ifdef CONFIG_SMP
@@ -307,6 +304,11 @@ struct rq {
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex); 305static DEFINE_MUTEX(sched_hotcpu_mutex);
309 306
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308{
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310}
311
310static inline int cpu_of(struct rq *rq) 312static inline int cpu_of(struct rq *rq)
311{ 313{
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
@@ -317,6 +319,52 @@ static inline int cpu_of(struct rq *rq)
317} 319}
318 320
319/* 321/*
322 * Per-runqueue clock, as finegrained as the platform can give us:
323 */
324static unsigned long long __rq_clock(struct rq *rq)
325{
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355}
356
357static inline unsigned long long rq_clock(struct rq *rq)
358{
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365}
366
367/*
320 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
321 * See detach_destroy_domains: synchronize_sched for details. 369 * See detach_destroy_domains: synchronize_sched for details.
322 * 370 *
@@ -331,6 +379,18 @@ static inline int cpu_of(struct rq *rq)
331#define task_rq(p) cpu_rq(task_cpu(p)) 379#define task_rq(p) cpu_rq(task_cpu(p))
332#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
333 381
382#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p)
385{
386 p->se.cfs_rq = &task_rq(p)->cfs;
387}
388#else
389static inline void set_task_cfs_rq(struct task_struct *p)
390{
391}
392#endif
393
334#ifndef prepare_arch_switch 394#ifndef prepare_arch_switch
335# define prepare_arch_switch(next) do { } while (0) 395# define prepare_arch_switch(next) do { } while (0)
336#endif 396#endif
@@ -460,134 +520,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
460 spin_unlock_irqrestore(&rq->lock, *flags); 520 spin_unlock_irqrestore(&rq->lock, *flags);
461} 521}
462 522
463#ifdef CONFIG_SCHEDSTATS
464/*
465 * bump this up when changing the output format or the meaning of an existing
466 * format, so that tools can adapt (or abort)
467 */
468#define SCHEDSTAT_VERSION 14
469
470static int show_schedstat(struct seq_file *seq, void *v)
471{
472 int cpu;
473
474 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
475 seq_printf(seq, "timestamp %lu\n", jiffies);
476 for_each_online_cpu(cpu) {
477 struct rq *rq = cpu_rq(cpu);
478#ifdef CONFIG_SMP
479 struct sched_domain *sd;
480 int dcnt = 0;
481#endif
482
483 /* runqueue-specific stats */
484 seq_printf(seq,
485 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
486 cpu, rq->yld_both_empty,
487 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
488 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
489 rq->ttwu_cnt, rq->ttwu_local,
490 rq->rq_sched_info.cpu_time,
491 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
492
493 seq_printf(seq, "\n");
494
495#ifdef CONFIG_SMP
496 /* domain-specific stats */
497 preempt_disable();
498 for_each_domain(cpu, sd) {
499 enum idle_type itype;
500 char mask_str[NR_CPUS];
501
502 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
503 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
504 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
505 itype++) {
506 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
507 "%lu",
508 sd->lb_cnt[itype],
509 sd->lb_balanced[itype],
510 sd->lb_failed[itype],
511 sd->lb_imbalance[itype],
512 sd->lb_gained[itype],
513 sd->lb_hot_gained[itype],
514 sd->lb_nobusyq[itype],
515 sd->lb_nobusyg[itype]);
516 }
517 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
518 " %lu %lu %lu\n",
519 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
520 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
521 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
522 sd->ttwu_wake_remote, sd->ttwu_move_affine,
523 sd->ttwu_move_balance);
524 }
525 preempt_enable();
526#endif
527 }
528 return 0;
529}
530
531static int schedstat_open(struct inode *inode, struct file *file)
532{
533 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
534 char *buf = kmalloc(size, GFP_KERNEL);
535 struct seq_file *m;
536 int res;
537
538 if (!buf)
539 return -ENOMEM;
540 res = single_open(file, show_schedstat, NULL);
541 if (!res) {
542 m = file->private_data;
543 m->buf = buf;
544 m->size = size;
545 } else
546 kfree(buf);
547 return res;
548}
549
550const struct file_operations proc_schedstat_operations = {
551 .open = schedstat_open,
552 .read = seq_read,
553 .llseek = seq_lseek,
554 .release = single_release,
555};
556
557/*
558 * Expects runqueue lock to be held for atomicity of update
559 */
560static inline void
561rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
562{
563 if (rq) {
564 rq->rq_sched_info.run_delay += delta_jiffies;
565 rq->rq_sched_info.pcnt++;
566 }
567}
568
569/*
570 * Expects runqueue lock to be held for atomicity of update
571 */
572static inline void
573rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
574{
575 if (rq)
576 rq->rq_sched_info.cpu_time += delta_jiffies;
577}
578# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
579# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
580#else /* !CONFIG_SCHEDSTATS */
581static inline void
582rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
583{}
584static inline void
585rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
586{}
587# define schedstat_inc(rq, field) do { } while (0)
588# define schedstat_add(rq, field, amt) do { } while (0)
589#endif
590
591/* 523/*
592 * this_rq_lock - lock this runqueue and disable interrupts. 524 * this_rq_lock - lock this runqueue and disable interrupts.
593 */ 525 */
@@ -603,177 +535,172 @@ static inline struct rq *this_rq_lock(void)
603 return rq; 535 return rq;
604} 536}
605 537
606#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
607/* 538/*
608 * Called when a process is dequeued from the active array and given 539 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
609 * the cpu. We should note that with the exception of interactive
610 * tasks, the expired queue will become the active queue after the active
611 * queue is empty, without explicitly dequeuing and requeuing tasks in the
612 * expired queue. (Interactive tasks may be requeued directly to the
613 * active queue, thus delaying tasks in the expired queue from running;
614 * see scheduler_tick()).
615 *
616 * This function is only called from sched_info_arrive(), rather than
617 * dequeue_task(). Even though a task may be queued and dequeued multiple
618 * times as it is shuffled about, we're really interested in knowing how
619 * long it was from the *first* time it was queued to the time that it
620 * finally hit a cpu.
621 */ 540 */
622static inline void sched_info_dequeued(struct task_struct *t) 541void sched_clock_unstable_event(void)
623{ 542{
624 t->sched_info.last_queued = 0; 543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
625} 550}
626 551
627/* 552/*
628 * Called when a task finally hits the cpu. We can now calculate how 553 * resched_task - mark a task 'to be rescheduled now'.
629 * long it was waiting to run. We also note when it began so that we 554 *
630 * can keep stats on how long its timeslice is. 555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
631 */ 558 */
632static void sched_info_arrive(struct task_struct *t) 559#ifdef CONFIG_SMP
560
561#ifndef tsk_is_polling
562#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563#endif
564
565static void resched_task(struct task_struct *p)
633{ 566{
634 unsigned long now = jiffies, delta_jiffies = 0; 567 int cpu;
568
569 assert_spin_locked(&task_rq(p)->lock);
570
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
635 573
636 if (t->sched_info.last_queued) 574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
637 delta_jiffies = now - t->sched_info.last_queued; 575
638 sched_info_dequeued(t); 576 cpu = task_cpu(p);
639 t->sched_info.run_delay += delta_jiffies; 577 if (cpu == smp_processor_id())
640 t->sched_info.last_arrival = now; 578 return;
641 t->sched_info.pcnt++;
642 579
643 rq_sched_info_arrive(task_rq(t), delta_jiffies); 580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
644} 584}
645 585
646/* 586static void resched_cpu(int cpu)
647 * Called when a process is queued into either the active or expired
648 * array. The time is noted and later used to determine how long we
649 * had to wait for us to reach the cpu. Since the expired queue will
650 * become the active queue after active queue is empty, without dequeuing
651 * and requeuing any tasks, we are interested in queuing to either. It
652 * is unusual but not impossible for tasks to be dequeued and immediately
653 * requeued in the same or another array: this can happen in sched_yield(),
654 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
655 * to runqueue.
656 *
657 * This function is only called from enqueue_task(), but also only updates
658 * the timestamp if it is already not set. It's assumed that
659 * sched_info_dequeued() will clear that stamp when appropriate.
660 */
661static inline void sched_info_queued(struct task_struct *t)
662{ 587{
663 if (unlikely(sched_info_on())) 588 struct rq *rq = cpu_rq(cpu);
664 if (!t->sched_info.last_queued) 589 unsigned long flags;
665 t->sched_info.last_queued = jiffies; 590
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
666} 595}
596#else
597static inline void resched_task(struct task_struct *p)
598{
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
601}
602#endif
667 603
668/* 604static u64 div64_likely32(u64 divident, unsigned long divisor)
669 * Called when a process ceases being the active-running process, either
670 * voluntarily or involuntarily. Now we can calculate how long we ran.
671 */
672static inline void sched_info_depart(struct task_struct *t)
673{ 605{
674 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 606#if BITS_PER_LONG == 32
607 if (likely(divident <= 0xffffffffULL))
608 return (u32)divident / divisor;
609 do_div(divident, divisor);
675 610
676 t->sched_info.cpu_time += delta_jiffies; 611 return divident;
677 rq_sched_info_depart(task_rq(t), delta_jiffies); 612#else
613 return divident / divisor;
614#endif
678} 615}
679 616
680/* 617#if BITS_PER_LONG == 32
681 * Called when tasks are switched involuntarily due, typically, to expiring 618# define WMULT_CONST (~0UL)
682 * their time slice. (This may also be called when switching to or from 619#else
683 * the idle task.) We are only called when prev != next. 620# define WMULT_CONST (1UL << 32)
684 */ 621#endif
685static inline void 622
686__sched_info_switch(struct task_struct *prev, struct task_struct *next) 623#define WMULT_SHIFT 32
624
625static inline unsigned long
626calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
687{ 628{
688 struct rq *rq = task_rq(prev); 629 u64 tmp;
689 630
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
633
634 tmp = (u64)delta_exec * weight;
690 /* 635 /*
691 * prev now departs the cpu. It's not interesting to record 636 * Check whether we'd overflow the 64-bit multiplication:
692 * stats about how efficient we were at scheduling the idle
693 * process, however.
694 */ 637 */
695 if (prev != rq->idle) 638 if (unlikely(tmp > WMULT_CONST)) {
696 sched_info_depart(prev); 639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
643 }
697 644
698 if (next != rq->idle) 645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
699 sched_info_arrive(next);
700}
701static inline void
702sched_info_switch(struct task_struct *prev, struct task_struct *next)
703{
704 if (unlikely(sched_info_on()))
705 __sched_info_switch(prev, next);
706} 646}
707#else
708#define sched_info_queued(t) do { } while (0)
709#define sched_info_switch(t, next) do { } while (0)
710#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
711 647
712/* 648static inline unsigned long
713 * Adding/removing a task to/from a priority array: 649calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
714 */
715static void dequeue_task(struct task_struct *p, struct prio_array *array)
716{ 650{
717 array->nr_active--; 651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
718 list_del(&p->run_list);
719 if (list_empty(array->queue + p->prio))
720 __clear_bit(p->prio, array->bitmap);
721} 652}
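A worked example of the fixed-point scaling above, with illustrative numbers (assuming the 64-bit WMULT_CONST = 2^32):

/*
 * calc_delta_fair() illustration (not from the patch):
 *
 *   delta_exec = 10000000 ns (10 ms), weight = NICE_0_LOAD = 1024,
 *   lw->weight = 2048 (e.g. two nice-0 tasks queued)
 *
 *   inv_weight = WMULT_CONST / 2048 = 2097152
 *   tmp        = 10000000 * 1024    = 10240000000  (> WMULT_CONST,
 *                so the split-shift branch of calc_delta_mine() runs)
 *   result     = ((tmp >> 16) * 2097152) >> 16 = 5000000 ns
 *
 * i.e. with twice the nice-0 load queued, the fair clock advances at
 * half the wall-clock rate, subject to the sysctl_sched_runtime_limit
 * clamp.
 */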
722 653
723static void enqueue_task(struct task_struct *p, struct prio_array *array) 654static void update_load_add(struct load_weight *lw, unsigned long inc)
724{ 655{
725 sched_info_queued(p); 656 lw->weight += inc;
726 list_add_tail(&p->run_list, array->queue + p->prio); 657 lw->inv_weight = 0;
727 __set_bit(p->prio, array->bitmap);
728 array->nr_active++;
729 p->array = array;
730} 658}
731 659
732/* 660static void update_load_sub(struct load_weight *lw, unsigned long dec)
733 * Put task to the end of the run list without the overhead of dequeue
734 * followed by enqueue.
735 */
736static void requeue_task(struct task_struct *p, struct prio_array *array)
737{ 661{
738 list_move_tail(&p->run_list, array->queue + p->prio); 662 lw->weight -= dec;
663 lw->inv_weight = 0;
739} 664}
740 665
741static inline void 666static void __update_curr_load(struct rq *rq, struct load_stat *ls)
742enqueue_task_head(struct task_struct *p, struct prio_array *array)
743{ 667{
744 list_add(&p->run_list, array->queue + p->prio); 668 if (rq->curr != rq->idle && ls->load.weight) {
745 __set_bit(p->prio, array->bitmap); 669 ls->delta_exec += ls->delta_stat;
746 array->nr_active++; 670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
747 p->array = array; 671 ls->delta_stat = 0;
672 }
748} 673}
749 674
750/* 675/*
751 * __normal_prio - return the priority that is based on the static 676 * Update delta_exec, delta_fair fields for rq.
752 * priority but is modified by bonuses/penalties.
753 * 677 *
754 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 678 * delta_fair clock advances at a rate inversely proportional to
755 * into the -5 ... 0 ... +5 bonus/penalty range. 679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
756 * 682 *
757 * We use 25% of the full 0...39 priority range so that: 683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
758 * 686 *
759 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 687 * This function is called /before/ updating rq->ls.load
760 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 688 * and when switching tasks.
761 *
762 * Both properties are important to certain workloads.
763 */ 689 */
764 690static void update_curr_load(struct rq *rq, u64 now)
765static inline int __normal_prio(struct task_struct *p)
766{ 691{
767 int bonus, prio; 692 struct load_stat *ls = &rq->ls;
768 693 u64 start;
769 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
770 694
771 prio = p->static_prio - bonus; 695 start = ls->load_update_start;
772 if (prio < MAX_RT_PRIO) 696 ls->load_update_start = now;
773 prio = MAX_RT_PRIO; 697 ls->delta_stat += now - start;
774 if (prio > MAX_PRIO-1) 698 /*
775 prio = MAX_PRIO-1; 699 * Stagger updates to ls->delta_fair. Very frequent updates
776 return prio; 700 * can be expensive.
701 */
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
777} 704}
778 705
779/* 706/*
@@ -791,53 +718,146 @@ static inline int __normal_prio(struct task_struct *p)
791 * this code will need modification 718 * this code will need modification
792 */ 719 */
793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 720#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
794#define LOAD_WEIGHT(lp) \ 721#define load_weight(lp) \
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796#define PRIO_TO_LOAD_WEIGHT(prio) \ 723#define PRIO_TO_LOAD_WEIGHT(prio) \
797 LOAD_WEIGHT(static_prio_timeslice(prio)) 724 load_weight(static_prio_timeslice(prio))
798#define RTPRIO_TO_LOAD_WEIGHT(rp) \ 725#define RTPRIO_TO_LOAD_WEIGHT(rp) \
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
800 727
801static void set_load_weight(struct task_struct *p) 728#define WEIGHT_IDLEPRIO 2
802{ 729#define WMULT_IDLEPRIO (1 << 31)
803 if (has_rt_policy(p)) { 730
804#ifdef CONFIG_SMP 731/*
805 if (p == task_rq(p)->migration_thread) 732 * Nice levels are multiplicative, with a gentle 10% change for every
806 /* 733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * The migration thread does the actual balancing. 734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * Giving its load any weight will skew balancing 735 * that remained on nice 0.
809 * adversely. 736 *
810 */ 737 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 p->load_weight = 0; 738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 else 739 * it's +10% CPU usage.
813#endif 740 */
814 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 741static const int prio_to_weight[40] = {
815 } else 742/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
816 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 743/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
817} 744/* 0 */ NICE_0_LOAD /* 1024 */,
745/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
746/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
747};
748
749static const u32 prio_to_wmult[40] = {
750 48356, 60446, 75558, 94446, 118058, 147573,
751 184467, 230589, 288233, 360285, 450347,
752 562979, 703746, 879575, 1099582, 1374389,
 753 1717986, 2147483, 2684354, 3355443, 4194304,
 754 5244160, 6557201, 8196502, 10250518, 12782640,
755 16025997, 19976592, 24970740, 31350126, 39045157,
756 49367440, 61356675, 76695844, 95443717, 119304647,
757 148102320, 186737708, 238609294, 286331153,
758};
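Two quick sanity checks on the tables above (illustrative arithmetic, not part of the patch):

/*
 * 1) prio_to_wmult[i] is roughly 2^32 / prio_to_weight[i]:
 *      nice -5: 2^32 / 3125 = 1374389    nice 0: 2^32 / 1024 = 4194304
 *      nice +5: 2^32 /  336 = 12782640
 *
 * 2) The "10% effect": with one nice-0 (weight 1024) and one nice-1
 *    (weight 819) task runnable, CPU time splits roughly
 *      1024 / 1843 ~= 56%   vs.   819 / 1843 ~= 44%
 */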
818 759
819static inline void 760static inline void
820inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 761inc_load(struct rq *rq, const struct task_struct *p, u64 now)
821{ 762{
822 rq->raw_weighted_load += p->load_weight; 763 update_curr_load(rq, now);
764 update_load_add(&rq->ls.load, p->se.load.weight);
823} 765}
824 766
825static inline void 767static inline void
826dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 768dec_load(struct rq *rq, const struct task_struct *p, u64 now)
827{ 769{
828 rq->raw_weighted_load -= p->load_weight; 770 update_curr_load(rq, now);
771 update_load_sub(&rq->ls.load, p->se.load.weight);
829} 772}
830 773
831static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 774static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
832{ 775{
833 rq->nr_running++; 776 rq->nr_running++;
834 inc_raw_weighted_load(rq, p); 777 inc_load(rq, p, now);
835} 778}
836 779
837static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 780static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
838{ 781{
839 rq->nr_running--; 782 rq->nr_running--;
840 dec_raw_weighted_load(rq, p); 783 dec_load(rq, p, now);
784}
785
786static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
787
788/*
789 * runqueue iterator, to support SMP load-balancing between different
790 * scheduling classes, without having to expose their internal data
791 * structures to the load-balancing proper:
792 */
793struct rq_iterator {
794 void *arg;
795 struct task_struct *(*start)(void *);
796 struct task_struct *(*next)(void *);
797};
798
799static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
800 unsigned long max_nr_move, unsigned long max_load_move,
801 struct sched_domain *sd, enum cpu_idle_type idle,
802 int *all_pinned, unsigned long *load_moved,
803 int this_best_prio, int best_prio, int best_prio_seen,
804 struct rq_iterator *iterator);
805
806#include "sched_stats.h"
807#include "sched_rt.c"
808#include "sched_fair.c"
809#include "sched_idletask.c"
810#ifdef CONFIG_SCHED_DEBUG
811# include "sched_debug.c"
812#endif
813
814#define sched_class_highest (&rt_sched_class)
815
816static void set_load_weight(struct task_struct *p)
817{
818 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
819 p->se.wait_runtime = 0;
820
821 if (task_has_rt_policy(p)) {
822 p->se.load.weight = prio_to_weight[0] * 2;
823 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
824 return;
825 }
826
827 /*
828 * SCHED_IDLE tasks get minimal weight:
829 */
830 if (p->policy == SCHED_IDLE) {
831 p->se.load.weight = WEIGHT_IDLEPRIO;
832 p->se.load.inv_weight = WMULT_IDLEPRIO;
833 return;
834 }
835
836 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
837 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
838}
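For reference, the weights this assigns to a few representative tasks, derived from the code and tables above (illustrative summary, not part of the patch):

/*
 *   SCHED_FIFO / SCHED_RR   : prio_to_weight[0] * 2 = 177636
 *   SCHED_IDLE              : WEIGHT_IDLEPRIO       = 2
 *   SCHED_NORMAL, nice   0  : prio_to_weight[20]    = 1024 (NICE_0_LOAD)
 *   SCHED_NORMAL, nice +19  : prio_to_weight[39]    = 15
 */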
839
840static void
841enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
842{
843 sched_info_queued(p);
844 p->sched_class->enqueue_task(rq, p, wakeup, now);
845 p->se.on_rq = 1;
846}
847
848static void
849dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
850{
851 p->sched_class->dequeue_task(rq, p, sleep, now);
852 p->se.on_rq = 0;
853}
854
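enqueue_task(), dequeue_task() and check_preempt_curr() above all dispatch through the task's scheduling class; below is a trimmed sketch of that interface showing only the hooks used in this file (the real struct sched_class in include/linux/sched.h carries more methods).

/* Hedged sketch, not the full definition. */
struct sched_class_sketch {
	void (*enqueue_task)(struct rq *rq, struct task_struct *p,
			     int wakeup, u64 now);
	void (*dequeue_task)(struct rq *rq, struct task_struct *p,
			     int sleep, u64 now);
	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p);
};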
855/*
856 * __normal_prio - return the priority that is based on the static prio
857 */
858static inline int __normal_prio(struct task_struct *p)
859{
860 return p->static_prio;
841} 861}
842 862
843/* 863/*
@@ -851,7 +871,7 @@ static inline int normal_prio(struct task_struct *p)
851{ 871{
852 int prio; 872 int prio;
853 873
854 if (has_rt_policy(p)) 874 if (task_has_rt_policy(p))
855 prio = MAX_RT_PRIO-1 - p->rt_priority; 875 prio = MAX_RT_PRIO-1 - p->rt_priority;
856 else 876 else
857 prio = __normal_prio(p); 877 prio = __normal_prio(p);
@@ -879,222 +899,47 @@ static int effective_prio(struct task_struct *p)
879} 899}
880 900
881/* 901/*
882 * __activate_task - move a task to the runqueue. 902 * activate_task - move a task to the runqueue.
883 */
884static void __activate_task(struct task_struct *p, struct rq *rq)
885{
886 struct prio_array *target = rq->active;
887
888 if (batch_task(p))
889 target = rq->expired;
890 enqueue_task(p, target);
891 inc_nr_running(p, rq);
892}
893
894/*
895 * __activate_idle_task - move idle task to the _front_ of runqueue.
896 */
897static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
898{
899 enqueue_task_head(p, rq->active);
900 inc_nr_running(p, rq);
901}
902
903/*
904 * Recalculate p->normal_prio and p->prio after having slept,
905 * updating the sleep-average too:
906 */ 903 */
907static int recalc_task_prio(struct task_struct *p, unsigned long long now) 904static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
908{ 905{
909 /* Caller must always ensure 'now >= p->timestamp' */ 906 u64 now = rq_clock(rq);
910 unsigned long sleep_time = now - p->timestamp;
911
912 if (batch_task(p))
913 sleep_time = 0;
914
915 if (likely(sleep_time > 0)) {
916 /*
917 * This ceiling is set to the lowest priority that would allow
918 * a task to be reinserted into the active array on timeslice
919 * completion.
920 */
921 unsigned long ceiling = INTERACTIVE_SLEEP(p);
922
923 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
924 /*
925 * Prevents user tasks from achieving best priority
926 * with one single large enough sleep.
927 */
928 p->sleep_avg = ceiling;
929 /*
930 * Using INTERACTIVE_SLEEP() as a ceiling places a
931 * nice(0) task 1ms sleep away from promotion, and
932 * gives it 700ms to round-robin with no chance of
933 * being demoted. This is more than generous, so
934 * mark this sleep as non-interactive to prevent the
935 * on-runqueue bonus logic from intervening should
936 * this task not receive cpu immediately.
937 */
938 p->sleep_type = SLEEP_NONINTERACTIVE;
939 } else {
940 /*
941 * Tasks waking from uninterruptible sleep are
942 * limited in their sleep_avg rise as they
943 * are likely to be waiting on I/O
944 */
945 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
946 if (p->sleep_avg >= ceiling)
947 sleep_time = 0;
948 else if (p->sleep_avg + sleep_time >=
949 ceiling) {
950 p->sleep_avg = ceiling;
951 sleep_time = 0;
952 }
953 }
954 907
955 /* 908 if (p->state == TASK_UNINTERRUPTIBLE)
956 * This code gives a bonus to interactive tasks. 909 rq->nr_uninterruptible--;
957 *
958 * The boost works by updating the 'average sleep time'
959 * value here, based on ->timestamp. The more time a
960 * task spends sleeping, the higher the average gets -
961 * and the higher the priority boost gets as well.
962 */
963 p->sleep_avg += sleep_time;
964
965 }
966 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
967 p->sleep_avg = NS_MAX_SLEEP_AVG;
968 }
969 910
970 return effective_prio(p); 911 enqueue_task(rq, p, wakeup, now);
912 inc_nr_running(p, rq, now);
971} 913}
972 914
973/* 915/*
974 * activate_task - move a task to the runqueue and do priority recalculation 916 * activate_idle_task - move idle task to the _front_ of runqueue.
975 *
976 * Update all the scheduling statistics stuff. (sleep average
977 * calculation, priority modifiers, etc.)
978 */ 917 */
979static void activate_task(struct task_struct *p, struct rq *rq, int local) 918static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
980{ 919{
981 unsigned long long now; 920 u64 now = rq_clock(rq);
982 921
983 if (rt_task(p)) 922 if (p->state == TASK_UNINTERRUPTIBLE)
984 goto out; 923 rq->nr_uninterruptible--;
985
986 now = sched_clock();
987#ifdef CONFIG_SMP
988 if (!local) {
989 /* Compensate for drifting sched_clock */
990 struct rq *this_rq = this_rq();
991 now = (now - this_rq->most_recent_timestamp)
992 + rq->most_recent_timestamp;
993 }
994#endif
995
996 /*
997 * Sleep time is in units of nanosecs, so shift by 20 to get a
998 * milliseconds-range estimation of the amount of time that the task
999 * spent sleeping:
1000 */
1001 if (unlikely(prof_on == SLEEP_PROFILING)) {
1002 if (p->state == TASK_UNINTERRUPTIBLE)
1003 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1004 (now - p->timestamp) >> 20);
1005 }
1006
1007 p->prio = recalc_task_prio(p, now);
1008 924
1009 /* 925 enqueue_task(rq, p, 0, now);
1010 * This checks to make sure it's not an uninterruptible task 926 inc_nr_running(p, rq, now);
1011 * that is now waking up.
1012 */
1013 if (p->sleep_type == SLEEP_NORMAL) {
1014 /*
1015 * Tasks which were woken up by interrupts (ie. hw events)
1016 * are most likely of interactive nature. So we give them
1017 * the credit of extending their sleep time to the period
1018 * of time they spend on the runqueue, waiting for execution
1019 * on a CPU, first time around:
1020 */
1021 if (in_interrupt())
1022 p->sleep_type = SLEEP_INTERRUPTED;
1023 else {
1024 /*
1025 * Normal first-time wakeups get a credit too for
1026 * on-runqueue time, but it will be weighted down:
1027 */
1028 p->sleep_type = SLEEP_INTERACTIVE;
1029 }
1030 }
1031 p->timestamp = now;
1032out:
1033 __activate_task(p, rq);
1034} 927}
1035 928
1036/* 929/*
1037 * deactivate_task - remove a task from the runqueue. 930 * deactivate_task - remove a task from the runqueue.
1038 */ 931 */
1039static void deactivate_task(struct task_struct *p, struct rq *rq) 932static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1040{
1041 dec_nr_running(p, rq);
1042 dequeue_task(p, p->array);
1043 p->array = NULL;
1044}
1045
1046/*
1047 * resched_task - mark a task 'to be rescheduled now'.
1048 *
1049 * On UP this means the setting of the need_resched flag, on SMP it
1050 * might also involve a cross-CPU call to trigger the scheduler on
1051 * the target CPU.
1052 */
1053#ifdef CONFIG_SMP
1054
1055#ifndef tsk_is_polling
1056#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1057#endif
1058
1059static void resched_task(struct task_struct *p)
1060{ 933{
1061 int cpu; 934 u64 now = rq_clock(rq);
1062 935
1063 assert_spin_locked(&task_rq(p)->lock); 936 if (p->state == TASK_UNINTERRUPTIBLE)
937 rq->nr_uninterruptible++;
1064 938
1065 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 939 dequeue_task(rq, p, sleep, now);
1066 return; 940 dec_nr_running(p, rq, now);
1067
1068 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1069
1070 cpu = task_cpu(p);
1071 if (cpu == smp_processor_id())
1072 return;
1073
1074 /* NEED_RESCHED must be visible before we test polling */
1075 smp_mb();
1076 if (!tsk_is_polling(p))
1077 smp_send_reschedule(cpu);
1078} 941}
1079 942
1080static void resched_cpu(int cpu)
1081{
1082 struct rq *rq = cpu_rq(cpu);
1083 unsigned long flags;
1084
1085 if (!spin_trylock_irqsave(&rq->lock, flags))
1086 return;
1087 resched_task(cpu_curr(cpu));
1088 spin_unlock_irqrestore(&rq->lock, flags);
1089}
1090#else
1091static inline void resched_task(struct task_struct *p)
1092{
1093 assert_spin_locked(&task_rq(p)->lock);
1094 set_tsk_need_resched(p);
1095}
1096#endif
1097
1098/** 943/**
1099 * task_curr - is this task currently executing on a CPU? 944 * task_curr - is this task currently executing on a CPU?
1100 * @p: the task in question. 945 * @p: the task in question.
@@ -1107,10 +952,42 @@ inline int task_curr(const struct task_struct *p)
1107/* Used instead of source_load when we know the type == 0 */ 952/* Used instead of source_load when we know the type == 0 */
1108unsigned long weighted_cpuload(const int cpu) 953unsigned long weighted_cpuload(const int cpu)
1109{ 954{
1110 return cpu_rq(cpu)->raw_weighted_load; 955 return cpu_rq(cpu)->ls.load.weight;
956}
957
958static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
959{
960#ifdef CONFIG_SMP
961 task_thread_info(p)->cpu = cpu;
962 set_task_cfs_rq(p);
963#endif
1111} 964}
1112 965
1113#ifdef CONFIG_SMP 966#ifdef CONFIG_SMP
967
968void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
969{
970 int old_cpu = task_cpu(p);
971 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
972 u64 clock_offset, fair_clock_offset;
973
974 clock_offset = old_rq->clock - new_rq->clock;
975 fair_clock_offset = old_rq->cfs.fair_clock -
976 new_rq->cfs.fair_clock;
977 if (p->se.wait_start)
978 p->se.wait_start -= clock_offset;
979 if (p->se.wait_start_fair)
980 p->se.wait_start_fair -= fair_clock_offset;
981 if (p->se.sleep_start)
982 p->se.sleep_start -= clock_offset;
983 if (p->se.block_start)
984 p->se.block_start -= clock_offset;
985 if (p->se.sleep_start_fair)
986 p->se.sleep_start_fair -= fair_clock_offset;
987
988 __set_task_cpu(p, new_cpu);
989}
990
1114struct migration_req { 991struct migration_req {
1115 struct list_head list; 992 struct list_head list;
1116 993
@@ -1133,7 +1010,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1133 * If the task is not on a runqueue (and not running), then 1010 * If the task is not on a runqueue (and not running), then
1134 * it is sufficient to simply update the task's cpu field. 1011 * it is sufficient to simply update the task's cpu field.
1135 */ 1012 */
1136 if (!p->array && !task_running(rq, p)) { 1013 if (!p->se.on_rq && !task_running(rq, p)) {
1137 set_task_cpu(p, dest_cpu); 1014 set_task_cpu(p, dest_cpu);
1138 return 0; 1015 return 0;
1139 } 1016 }
@@ -1158,9 +1035,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1158void wait_task_inactive(struct task_struct *p) 1035void wait_task_inactive(struct task_struct *p)
1159{ 1036{
1160 unsigned long flags; 1037 unsigned long flags;
1038 int running, on_rq;
1161 struct rq *rq; 1039 struct rq *rq;
1162 struct prio_array *array;
1163 int running;
1164 1040
1165repeat: 1041repeat:
1166 /* 1042 /*
@@ -1192,7 +1068,7 @@ repeat:
1192 */ 1068 */
1193 rq = task_rq_lock(p, &flags); 1069 rq = task_rq_lock(p, &flags);
1194 running = task_running(rq, p); 1070 running = task_running(rq, p);
1195 array = p->array; 1071 on_rq = p->se.on_rq;
1196 task_rq_unlock(rq, &flags); 1072 task_rq_unlock(rq, &flags);
1197 1073
1198 /* 1074 /*
@@ -1215,7 +1091,7 @@ repeat:
1215 * running right now), it's preempted, and we should 1091 * running right now), it's preempted, and we should
1216 * yield - it could be a while. 1092 * yield - it could be a while.
1217 */ 1093 */
1218 if (unlikely(array)) { 1094 if (unlikely(on_rq)) {
1219 yield(); 1095 yield();
1220 goto repeat; 1096 goto repeat;
1221 } 1097 }
@@ -1261,11 +1137,12 @@ void kick_process(struct task_struct *p)
1261static inline unsigned long source_load(int cpu, int type) 1137static inline unsigned long source_load(int cpu, int type)
1262{ 1138{
1263 struct rq *rq = cpu_rq(cpu); 1139 struct rq *rq = cpu_rq(cpu);
1140 unsigned long total = weighted_cpuload(cpu);
1264 1141
1265 if (type == 0) 1142 if (type == 0)
1266 return rq->raw_weighted_load; 1143 return total;
1267 1144
1268 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1145 return min(rq->cpu_load[type-1], total);
1269} 1146}
1270 1147
1271/* 1148/*
@@ -1275,11 +1152,12 @@ static inline unsigned long source_load(int cpu, int type)
1275static inline unsigned long target_load(int cpu, int type) 1152static inline unsigned long target_load(int cpu, int type)
1276{ 1153{
1277 struct rq *rq = cpu_rq(cpu); 1154 struct rq *rq = cpu_rq(cpu);
1155 unsigned long total = weighted_cpuload(cpu);
1278 1156
1279 if (type == 0) 1157 if (type == 0)
1280 return rq->raw_weighted_load; 1158 return total;
1281 1159
1282 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1160 return max(rq->cpu_load[type-1], total);
1283} 1161}
1284 1162
1285/* 1163/*
@@ -1288,9 +1166,10 @@ static inline unsigned long target_load(int cpu, int type)
1288static inline unsigned long cpu_avg_load_per_task(int cpu) 1166static inline unsigned long cpu_avg_load_per_task(int cpu)
1289{ 1167{
1290 struct rq *rq = cpu_rq(cpu); 1168 struct rq *rq = cpu_rq(cpu);
1169 unsigned long total = weighted_cpuload(cpu);
1291 unsigned long n = rq->nr_running; 1170 unsigned long n = rq->nr_running;
1292 1171
1293 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1172 return n ? total / n : SCHED_LOAD_SCALE;
1294} 1173}
1295 1174
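[Editor's note] Sketch of the estimates used by source_load(), target_load() and cpu_avg_load_per_task() above: the pull side takes the minimum of the decayed cpu_load[] history and the instantaneous weighted load, the push side takes the maximum. Standalone illustration with invented numbers, not kernel code.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	unsigned long cpu_load_hist = 2048;	/* decayed history, cpu_load[type-1] */
	unsigned long now_load      = 1024;	/* instantaneous weighted load */

	/* Pulling from a CPU: use the smaller value, so a momentary spike in
	 * history does not make the CPU look busier than it is right now. */
	unsigned long src = min_ul(cpu_load_hist, now_load);

	/* Pushing to a CPU: use the larger value, so a momentary dip does not
	 * make the target look emptier than it usually is. */
	unsigned long dst = max_ul(cpu_load_hist, now_load);

	printf("source estimate %lu, target estimate %lu\n", src, dst);
	return 0;
}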
1296/* 1175/*
@@ -1392,9 +1271,9 @@ static int sched_balance_self(int cpu, int flag)
1392 struct sched_domain *tmp, *sd = NULL; 1271 struct sched_domain *tmp, *sd = NULL;
1393 1272
1394 for_each_domain(cpu, tmp) { 1273 for_each_domain(cpu, tmp) {
1395 /* 1274 /*
1396 * If power savings logic is enabled for a domain, stop there. 1275 * If power savings logic is enabled for a domain, stop there.
1397 */ 1276 */
1398 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1277 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1399 break; 1278 break;
1400 if (tmp->flags & flag) 1279 if (tmp->flags & flag)
@@ -1477,9 +1356,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1477 if (idle_cpu(i)) 1356 if (idle_cpu(i))
1478 return i; 1357 return i;
1479 } 1358 }
1480 } 1359 } else {
1481 else
1482 break; 1360 break;
1361 }
1483 } 1362 }
1484 return cpu; 1363 return cpu;
1485} 1364}
@@ -1521,7 +1400,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1521 if (!(old_state & state)) 1400 if (!(old_state & state))
1522 goto out; 1401 goto out;
1523 1402
1524 if (p->array) 1403 if (p->se.on_rq)
1525 goto out_running; 1404 goto out_running;
1526 1405
1527 cpu = task_cpu(p); 1406 cpu = task_cpu(p);
@@ -1576,11 +1455,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1576 * of the current CPU: 1455 * of the current CPU:
1577 */ 1456 */
1578 if (sync) 1457 if (sync)
1579 tl -= current->load_weight; 1458 tl -= current->se.load.weight;
1580 1459
1581 if ((tl <= load && 1460 if ((tl <= load &&
1582 tl + target_load(cpu, idx) <= tl_per_task) || 1461 tl + target_load(cpu, idx) <= tl_per_task) ||
1583 100*(tl + p->load_weight) <= imbalance*load) { 1462 100*(tl + p->se.load.weight) <= imbalance*load) {
1584 /* 1463 /*
1585 * This domain has SD_WAKE_AFFINE and 1464 * This domain has SD_WAKE_AFFINE and
1586 * p is cache cold in this domain, and 1465 * p is cache cold in this domain, and
@@ -1614,7 +1493,7 @@ out_set_cpu:
1614 old_state = p->state; 1493 old_state = p->state;
1615 if (!(old_state & state)) 1494 if (!(old_state & state))
1616 goto out; 1495 goto out;
1617 if (p->array) 1496 if (p->se.on_rq)
1618 goto out_running; 1497 goto out_running;
1619 1498
1620 this_cpu = smp_processor_id(); 1499 this_cpu = smp_processor_id();
@@ -1623,25 +1502,7 @@ out_set_cpu:
1623 1502
1624out_activate: 1503out_activate:
1625#endif /* CONFIG_SMP */ 1504#endif /* CONFIG_SMP */
1626 if (old_state == TASK_UNINTERRUPTIBLE) { 1505 activate_task(rq, p, 1);
1627 rq->nr_uninterruptible--;
1628 /*
1629 * Tasks on involuntary sleep don't earn
1630 * sleep_avg beyond just interactive state.
1631 */
1632 p->sleep_type = SLEEP_NONINTERACTIVE;
1633 } else
1634
1635 /*
1636 * Tasks that have marked their sleep as noninteractive get
1637 * woken up with their sleep average not weighted in an
1638 * interactive way.
1639 */
1640 if (old_state & TASK_NONINTERACTIVE)
1641 p->sleep_type = SLEEP_NONINTERACTIVE;
1642
1643
1644 activate_task(p, rq, cpu == this_cpu);
1645 /* 1506 /*
1646 * Sync wakeups (i.e. those types of wakeups where the waker 1507 * Sync wakeups (i.e. those types of wakeups where the waker
1647 * has indicated that it will leave the CPU in short order) 1508 * has indicated that it will leave the CPU in short order)
@@ -1650,10 +1511,8 @@ out_activate:
1650 * the waker guarantees that the freshly woken up task is going 1511 * the waker guarantees that the freshly woken up task is going
1651 * to be considered on this CPU.) 1512 * to be considered on this CPU.)
1652 */ 1513 */
1653 if (!sync || cpu != this_cpu) { 1514 if (!sync || cpu != this_cpu)
1654 if (TASK_PREEMPTS_CURR(p, rq)) 1515 check_preempt_curr(rq, p);
1655 resched_task(rq->curr);
1656 }
1657 success = 1; 1516 success = 1;
1658 1517
1659out_running: 1518out_running:
@@ -1676,19 +1535,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1676 return try_to_wake_up(p, state, 0); 1535 return try_to_wake_up(p, state, 0);
1677} 1536}
1678 1537
1679static void task_running_tick(struct rq *rq, struct task_struct *p);
1680/* 1538/*
1681 * Perform scheduler related setup for a newly forked process p. 1539 * Perform scheduler related setup for a newly forked process p.
1682 * p is forked by current. 1540 * p is forked by current.
1683 */ 1541 *
1684void fastcall sched_fork(struct task_struct *p, int clone_flags) 1542 * __sched_fork() is basic setup used by init_idle() too:
1685{ 1543 */
1686 int cpu = get_cpu(); 1544static void __sched_fork(struct task_struct *p)
1545{
1546 p->se.wait_start_fair = 0;
1547 p->se.wait_start = 0;
1548 p->se.exec_start = 0;
1549 p->se.sum_exec_runtime = 0;
1550 p->se.delta_exec = 0;
1551 p->se.delta_fair_run = 0;
1552 p->se.delta_fair_sleep = 0;
1553 p->se.wait_runtime = 0;
1554 p->se.sum_wait_runtime = 0;
1555 p->se.sum_sleep_runtime = 0;
1556 p->se.sleep_start = 0;
1557 p->se.sleep_start_fair = 0;
1558 p->se.block_start = 0;
1559 p->se.sleep_max = 0;
1560 p->se.block_max = 0;
1561 p->se.exec_max = 0;
1562 p->se.wait_max = 0;
1563 p->se.wait_runtime_overruns = 0;
1564 p->se.wait_runtime_underruns = 0;
1687 1565
1688#ifdef CONFIG_SMP 1566 INIT_LIST_HEAD(&p->run_list);
1689 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1567 p->se.on_rq = 0;
1690#endif
1691 set_task_cpu(p, cpu);
1692 1568
1693 /* 1569 /*
1694 * We mark the process as running here, but have not actually 1570 * We mark the process as running here, but have not actually
@@ -1697,16 +1573,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1697 * event cannot wake it up and insert it on the runqueue either. 1573 * event cannot wake it up and insert it on the runqueue either.
1698 */ 1574 */
1699 p->state = TASK_RUNNING; 1575 p->state = TASK_RUNNING;
1576}
1577
1578/*
1579 * fork()/clone()-time setup:
1580 */
1581void sched_fork(struct task_struct *p, int clone_flags)
1582{
1583 int cpu = get_cpu();
1584
1585 __sched_fork(p);
1586
1587#ifdef CONFIG_SMP
1588 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1589#endif
1590 __set_task_cpu(p, cpu);
1700 1591
1701 /* 1592 /*
1702 * Make sure we do not leak PI boosting priority to the child: 1593 * Make sure we do not leak PI boosting priority to the child:
1703 */ 1594 */
1704 p->prio = current->normal_prio; 1595 p->prio = current->normal_prio;
1705 1596
1706 INIT_LIST_HEAD(&p->run_list);
1707 p->array = NULL;
1708#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1597#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1709 if (unlikely(sched_info_on())) 1598 if (likely(sched_info_on()))
1710 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1599 memset(&p->sched_info, 0, sizeof(p->sched_info));
1711#endif 1600#endif
1712#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1601#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1716,34 +1605,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1716 /* Want to start with kernel preemption disabled. */ 1605 /* Want to start with kernel preemption disabled. */
1717 task_thread_info(p)->preempt_count = 1; 1606 task_thread_info(p)->preempt_count = 1;
1718#endif 1607#endif
1719 /*
1720 * Share the timeslice between parent and child, thus the
1721 * total amount of pending timeslices in the system doesn't change,
1722 * resulting in more scheduling fairness.
1723 */
1724 local_irq_disable();
1725 p->time_slice = (current->time_slice + 1) >> 1;
1726 /*
1727 * The remainder of the first timeslice might be recovered by
1728 * the parent if the child exits early enough.
1729 */
1730 p->first_time_slice = 1;
1731 current->time_slice >>= 1;
1732 p->timestamp = sched_clock();
1733 if (unlikely(!current->time_slice)) {
1734 /*
1735 * This case is rare, it happens when the parent has only
1736 * a single jiffy left from its timeslice. Taking the
1737 * runqueue lock is not a problem.
1738 */
1739 current->time_slice = 1;
1740 task_running_tick(cpu_rq(cpu), current);
1741 }
1742 local_irq_enable();
1743 put_cpu(); 1608 put_cpu();
1744} 1609}
1745 1610
1746/* 1611/*
1612 * After fork, child runs first. (default) If set to 0 then
1613 * parent will (try to) run first.
1614 */
1615unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1616
1617/*
1747 * wake_up_new_task - wake up a newly created task for the first time. 1618 * wake_up_new_task - wake up a newly created task for the first time.
1748 * 1619 *
1749 * This function will do some initial scheduler statistics housekeeping 1620 * This function will do some initial scheduler statistics housekeeping
@@ -1752,107 +1623,27 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1752 */ 1623 */
1753void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1624void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1754{ 1625{
1755 struct rq *rq, *this_rq;
1756 unsigned long flags; 1626 unsigned long flags;
1757 int this_cpu, cpu; 1627 struct rq *rq;
1628 int this_cpu;
1758 1629
1759 rq = task_rq_lock(p, &flags); 1630 rq = task_rq_lock(p, &flags);
1760 BUG_ON(p->state != TASK_RUNNING); 1631 BUG_ON(p->state != TASK_RUNNING);
1761 this_cpu = smp_processor_id(); 1632 this_cpu = smp_processor_id(); /* parent's CPU */
1762 cpu = task_cpu(p);
1763
1764 /*
1765 * We decrease the sleep average of forking parents
1766 * and children as well, to keep max-interactive tasks
1767 * from forking tasks that are max-interactive. The parent
1768 * (current) is done further down, under its lock.
1769 */
1770 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1771 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1772 1633
1773 p->prio = effective_prio(p); 1634 p->prio = effective_prio(p);
1774 1635
1775 if (likely(cpu == this_cpu)) { 1636 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1776 if (!(clone_flags & CLONE_VM)) { 1637 task_cpu(p) != this_cpu || !current->se.on_rq) {
1777 /* 1638 activate_task(rq, p, 0);
1778 * The VM isn't cloned, so we're in a good position to
1779 * do child-runs-first in anticipation of an exec. This
1780 * usually avoids a lot of COW overhead.
1781 */
1782 if (unlikely(!current->array))
1783 __activate_task(p, rq);
1784 else {
1785 p->prio = current->prio;
1786 p->normal_prio = current->normal_prio;
1787 list_add_tail(&p->run_list, &current->run_list);
1788 p->array = current->array;
1789 p->array->nr_active++;
1790 inc_nr_running(p, rq);
1791 }
1792 set_need_resched();
1793 } else
1794 /* Run child last */
1795 __activate_task(p, rq);
1796 /*
1797 * We skip the following code due to cpu == this_cpu
1798 *
1799 * task_rq_unlock(rq, &flags);
1800 * this_rq = task_rq_lock(current, &flags);
1801 */
1802 this_rq = rq;
1803 } else { 1639 } else {
1804 this_rq = cpu_rq(this_cpu);
1805
1806 /* 1640 /*
1807 * Not the local CPU - must adjust timestamp. This should 1641 * Let the scheduling class do new task startup
1808 * get optimised away in the !CONFIG_SMP case. 1642 * management (if any):
1809 */ 1643 */
1810 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) 1644 p->sched_class->task_new(rq, p);
1811 + rq->most_recent_timestamp;
1812 __activate_task(p, rq);
1813 if (TASK_PREEMPTS_CURR(p, rq))
1814 resched_task(rq->curr);
1815
1816 /*
1817 * Parent and child are on different CPUs, now get the
1818 * parent runqueue to update the parent's ->sleep_avg:
1819 */
1820 task_rq_unlock(rq, &flags);
1821 this_rq = task_rq_lock(current, &flags);
1822 }
1823 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1824 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1825 task_rq_unlock(this_rq, &flags);
1826}
1827
1828/*
1829 * Potentially available exiting-child timeslices are
1830 * retrieved here - this way the parent does not get
1831 * penalized for creating too many threads.
1832 *
1833 * (this cannot be used to 'generate' timeslices
1834 * artificially, because any timeslice recovered here
1835 * was given away by the parent in the first place.)
1836 */
1837void fastcall sched_exit(struct task_struct *p)
1838{
1839 unsigned long flags;
1840 struct rq *rq;
1841
1842 /*
1843 * If the child was a (relative-) CPU hog then decrease
1844 * the sleep_avg of the parent as well.
1845 */
1846 rq = task_rq_lock(p->parent, &flags);
1847 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1848 p->parent->time_slice += p->time_slice;
1849 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1850 p->parent->time_slice = task_timeslice(p);
1851 } 1645 }
1852 if (p->sleep_avg < p->parent->sleep_avg) 1646 check_preempt_curr(rq, p);
1853 p->parent->sleep_avg = p->parent->sleep_avg /
1854 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1855 (EXIT_WEIGHT + 1);
1856 task_rq_unlock(rq, &flags); 1647 task_rq_unlock(rq, &flags);
1857} 1648}
1858 1649
@@ -1917,7 +1708,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1917 /* 1708 /*
1918 * Remove function-return probe instances associated with this 1709 * Remove function-return probe instances associated with this
1919 * task and put them back on the free list. 1710 * task and put them back on the free list.
1920 */ 1711 */
1921 kprobe_flush_task(prev); 1712 kprobe_flush_task(prev);
1922 put_task_struct(prev); 1713 put_task_struct(prev);
1923 } 1714 }
@@ -1945,13 +1736,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1945 * context_switch - switch to the new MM and the new 1736 * context_switch - switch to the new MM and the new
1946 * thread's register state. 1737 * thread's register state.
1947 */ 1738 */
1948static inline struct task_struct * 1739static inline void
1949context_switch(struct rq *rq, struct task_struct *prev, 1740context_switch(struct rq *rq, struct task_struct *prev,
1950 struct task_struct *next) 1741 struct task_struct *next)
1951{ 1742{
1952 struct mm_struct *mm = next->mm; 1743 struct mm_struct *mm, *oldmm;
1953 struct mm_struct *oldmm = prev->active_mm;
1954 1744
1745 prepare_task_switch(rq, next);
1746 mm = next->mm;
1747 oldmm = prev->active_mm;
1955 /* 1748 /*
1956 * For paravirt, this is coupled with an exit in switch_to to 1749 * For paravirt, this is coupled with an exit in switch_to to
1957 * combine the page table reload and the switch backend into 1750 * combine the page table reload and the switch backend into
@@ -1959,16 +1752,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1959 */ 1752 */
1960 arch_enter_lazy_cpu_mode(); 1753 arch_enter_lazy_cpu_mode();
1961 1754
1962 if (!mm) { 1755 if (unlikely(!mm)) {
1963 next->active_mm = oldmm; 1756 next->active_mm = oldmm;
1964 atomic_inc(&oldmm->mm_count); 1757 atomic_inc(&oldmm->mm_count);
1965 enter_lazy_tlb(oldmm, next); 1758 enter_lazy_tlb(oldmm, next);
1966 } else 1759 } else
1967 switch_mm(oldmm, mm, next); 1760 switch_mm(oldmm, mm, next);
1968 1761
1969 if (!prev->mm) { 1762 if (unlikely(!prev->mm)) {
1970 prev->active_mm = NULL; 1763 prev->active_mm = NULL;
1971 WARN_ON(rq->prev_mm);
1972 rq->prev_mm = oldmm; 1764 rq->prev_mm = oldmm;
1973 } 1765 }
1974 /* 1766 /*
@@ -1984,7 +1776,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1984 /* Here we just switch the register state and the stack. */ 1776 /* Here we just switch the register state and the stack. */
1985 switch_to(prev, next, prev); 1777 switch_to(prev, next, prev);
1986 1778
1987 return prev; 1779 barrier();
1780 /*
1781 * this_rq must be evaluated again because prev may have moved
1782 * CPUs since it called schedule(), thus the 'rq' on its stack
1783 * frame will be invalid.
1784 */
1785 finish_task_switch(this_rq(), prev);
1988} 1786}
1989 1787
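[Editor's note] Toy model of the "!mm" branch in context_switch() above: a kernel thread has no address space of its own, so it borrows the previous task's active_mm and takes a reference on it. Simplified userspace refcounting with made-up structs, not kernel code.

#include <stdio.h>

struct toy_mm { const char *owner; int mm_count; };

struct toy_task { struct toy_mm *mm; struct toy_mm *active_mm; };

int main(void)
{
	struct toy_mm user_mm = { "user process", 1 };
	struct toy_task prev = { &user_mm, &user_mm };	/* user task */
	struct toy_task next = { NULL, NULL };		/* kernel thread: no mm */

	struct toy_mm *oldmm = prev.active_mm;

	if (!next.mm) {			/* borrow instead of switching */
		next.active_mm = oldmm;
		oldmm->mm_count++;
	}

	printf("kernel thread borrows mm of '%s' (refs=%d)\n",
	       next.active_mm->owner, next.active_mm->mm_count);
	return 0;
}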
1990/* 1788/*
@@ -2057,17 +1855,65 @@ unsigned long nr_active(void)
2057 return running + uninterruptible; 1855 return running + uninterruptible;
2058} 1856}
2059 1857
2060#ifdef CONFIG_SMP
2061
2062/* 1858/*
2063 * Is this task likely cache-hot: 1859 * Update rq->cpu_load[] statistics. This function is usually called every
1860 * scheduler tick (TICK_NSEC).
2064 */ 1861 */
2065static inline int 1862static void update_cpu_load(struct rq *this_rq)
2066task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2067{ 1863{
2068 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1864 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1865 unsigned long total_load = this_rq->ls.load.weight;
1866 unsigned long this_load = total_load;
1867 struct load_stat *ls = &this_rq->ls;
1868 u64 now = __rq_clock(this_rq);
1869 int i, scale;
1870
1871 this_rq->nr_load_updates++;
1872 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1873 goto do_avg;
1874
1875 /* Update delta_fair/delta_exec fields first */
1876 update_curr_load(this_rq, now);
1877
1878 fair_delta64 = ls->delta_fair + 1;
1879 ls->delta_fair = 0;
1880
1881 exec_delta64 = ls->delta_exec + 1;
1882 ls->delta_exec = 0;
1883
1884 sample_interval64 = now - ls->load_update_last;
1885 ls->load_update_last = now;
1886
1887 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1888 sample_interval64 = TICK_NSEC;
1889
1890 if (exec_delta64 > sample_interval64)
1891 exec_delta64 = sample_interval64;
1892
1893 idle_delta64 = sample_interval64 - exec_delta64;
1894
1895 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1896 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1897
1898 this_load = (unsigned long)tmp64;
1899
1900do_avg:
1901
1902 /* Update our load: */
1903 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1904 unsigned long old_load, new_load;
1905
1906 /* scale is effectively 1 << i now, and >> i divides by scale */
1907
1908 old_load = this_rq->cpu_load[i];
1909 new_load = this_load;
1910
1911 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1912 }
2069} 1913}
2070 1914
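[Editor's note] Worked userspace example of the cpu_load[] decay rule in update_cpu_load() above, cpu_load[i] = (old * (scale - 1) + new) >> i with scale = 1 << i: higher indices follow the sampled load more slowly. CPU_LOAD_IDX_MAX = 5 and the sample values are assumptions for the demo; the precise-load path using div64_64() is left out.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5	/* assumed to match the kernel's array size */

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0, 0, 0, 0, 0 };
	unsigned long samples[] = { 1024, 1024, 0, 0, 2048 };
	int i, t, scale;

	for (t = 0; t < 5; t++) {
		for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = samples[t];

			/* scale is 1 << i, so >> i divides by scale */
			cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
		}
		printf("tick %d: load[0]=%lu load[2]=%lu load[4]=%lu\n",
		       t, cpu_load[0], cpu_load[2], cpu_load[4]);
	}
	return 0;
}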
1915#ifdef CONFIG_SMP
1916
2071/* 1917/*
2072 * double_rq_lock - safely lock two runqueues 1918 * double_rq_lock - safely lock two runqueues
2073 * 1919 *
@@ -2184,23 +2030,17 @@ void sched_exec(void)
2184 * pull_task - move a task from a remote runqueue to the local runqueue. 2030 * pull_task - move a task from a remote runqueue to the local runqueue.
2185 * Both runqueues must be locked. 2031 * Both runqueues must be locked.
2186 */ 2032 */
2187static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2033static void pull_task(struct rq *src_rq, struct task_struct *p,
2188 struct task_struct *p, struct rq *this_rq, 2034 struct rq *this_rq, int this_cpu)
2189 struct prio_array *this_array, int this_cpu)
2190{ 2035{
2191 dequeue_task(p, src_array); 2036 deactivate_task(src_rq, p, 0);
2192 dec_nr_running(p, src_rq);
2193 set_task_cpu(p, this_cpu); 2037 set_task_cpu(p, this_cpu);
2194 inc_nr_running(p, this_rq); 2038 activate_task(this_rq, p, 0);
2195 enqueue_task(p, this_array);
2196 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2197 + this_rq->most_recent_timestamp;
2198 /* 2039 /*
2199 * Note that idle threads have a prio of MAX_PRIO, for this test 2040 * Note that idle threads have a prio of MAX_PRIO, for this test
2200 * to be always true for them. 2041 * to be always true for them.
2201 */ 2042 */
2202 if (TASK_PREEMPTS_CURR(p, this_rq)) 2043 check_preempt_curr(this_rq, p);
2203 resched_task(this_rq->curr);
2204} 2044}
2205 2045
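[Editor's note] Toy model of what pull_task() above does: take the task off the source run list, retag its CPU, put it on the destination run list, then check whether it should preempt the destination's current task. The structs are simplified stand-ins, not the kernel's rq or task_struct.

#include <stdio.h>

struct toy_task { const char *comm; int cpu; int prio; };

struct toy_rq { struct toy_task *tasks[8]; int nr; int curr_prio; };

static void rq_del(struct toy_rq *rq, struct toy_task *t)
{
	int i;

	for (i = 0; i < rq->nr; i++) {
		if (rq->tasks[i] == t) {
			rq->tasks[i] = rq->tasks[--rq->nr];
			return;
		}
	}
}

static void rq_add(struct toy_rq *rq, struct toy_task *t)
{
	rq->tasks[rq->nr++] = t;
}

int main(void)
{
	struct toy_task p = { "mover", 0, 110 };
	struct toy_rq src = { { &p }, 1, 120 };
	struct toy_rq dst = { { NULL }, 0, 130 };

	rq_del(&src, &p);		/* "deactivate" on the busiest runqueue */
	p.cpu = 1;			/* what set_task_cpu() does, conceptually */
	rq_add(&dst, &p);		/* "activate" on this runqueue */

	if (p.prio < dst.curr_prio)	/* lower value means higher priority */
		printf("%s should preempt the destination's current task\n", p.comm);

	return 0;
}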
2206/* 2046/*
@@ -2208,7 +2048,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2208 */ 2048 */
2209static 2049static
2210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2050int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2211 struct sched_domain *sd, enum idle_type idle, 2051 struct sched_domain *sd, enum cpu_idle_type idle,
2212 int *all_pinned) 2052 int *all_pinned)
2213{ 2053{
2214 /* 2054 /*
@@ -2225,132 +2065,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2225 return 0; 2065 return 0;
2226 2066
2227 /* 2067 /*
2228 * Aggressive migration if: 2068 * Aggressive migration if too many balance attempts have failed:
2229 * 1) task is cache cold, or
2230 * 2) too many balance attempts have failed.
2231 */ 2069 */
2232 2070 if (sd->nr_balance_failed > sd->cache_nice_tries)
2233 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2234#ifdef CONFIG_SCHEDSTATS
2235 if (task_hot(p, rq->most_recent_timestamp, sd))
2236 schedstat_inc(sd, lb_hot_gained[idle]);
2237#endif
2238 return 1; 2071 return 1;
2239 }
2240 2072
2241 if (task_hot(p, rq->most_recent_timestamp, sd))
2242 return 0;
2243 return 1; 2073 return 1;
2244} 2074}
2245 2075
2246#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2076static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2247
2248/*
2249 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2250 * load from busiest to this_rq, as part of a balancing operation within
2251 * "domain". Returns the number of tasks moved.
2252 *
2253 * Called with both runqueues locked.
2254 */
2255static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2256 unsigned long max_nr_move, unsigned long max_load_move, 2077 unsigned long max_nr_move, unsigned long max_load_move,
2257 struct sched_domain *sd, enum idle_type idle, 2078 struct sched_domain *sd, enum cpu_idle_type idle,
2258 int *all_pinned) 2079 int *all_pinned, unsigned long *load_moved,
2080 int this_best_prio, int best_prio, int best_prio_seen,
2081 struct rq_iterator *iterator)
2259{ 2082{
2260 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2083 int pulled = 0, pinned = 0, skip_for_load;
2261 best_prio_seen, skip_for_load; 2084 struct task_struct *p;
2262 struct prio_array *array, *dst_array; 2085 long rem_load_move = max_load_move;
2263 struct list_head *head, *curr;
2264 struct task_struct *tmp;
2265 long rem_load_move;
2266 2086
2267 if (max_nr_move == 0 || max_load_move == 0) 2087 if (max_nr_move == 0 || max_load_move == 0)
2268 goto out; 2088 goto out;
2269 2089
2270 rem_load_move = max_load_move;
2271 pinned = 1; 2090 pinned = 1;
2272 this_best_prio = rq_best_prio(this_rq);
2273 best_prio = rq_best_prio(busiest);
2274 /*
2275 * Enable handling of the case where there is more than one task
2276 * with the best priority. If the current running task is one
2277 * of those with prio==best_prio we know it won't be moved
2278 * and therefore it's safe to override the skip (based on load) of
2279 * any task we find with that prio.
2280 */
2281 best_prio_seen = best_prio == busiest->curr->prio;
2282 2091
2283 /* 2092 /*
2284 * We first consider expired tasks. Those will likely not be 2093 * Start the load-balancing iterator:
2285 * executed in the near future, and they are most likely to
2286 * be cache-cold, thus switching CPUs has the least effect
2287 * on them.
2288 */ 2094 */
2289 if (busiest->expired->nr_active) { 2095 p = iterator->start(iterator->arg);
2290 array = busiest->expired; 2096next:
2291 dst_array = this_rq->expired; 2097 if (!p)
2292 } else {
2293 array = busiest->active;
2294 dst_array = this_rq->active;
2295 }
2296
2297new_array:
2298 /* Start searching at priority 0: */
2299 idx = 0;
2300skip_bitmap:
2301 if (!idx)
2302 idx = sched_find_first_bit(array->bitmap);
2303 else
2304 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2305 if (idx >= MAX_PRIO) {
2306 if (array == busiest->expired && busiest->active->nr_active) {
2307 array = busiest->active;
2308 dst_array = this_rq->active;
2309 goto new_array;
2310 }
2311 goto out; 2098 goto out;
2312 }
2313
2314 head = array->queue + idx;
2315 curr = head->prev;
2316skip_queue:
2317 tmp = list_entry(curr, struct task_struct, run_list);
2318
2319 curr = curr->prev;
2320
2321 /* 2099 /*
2322 * To help distribute high priority tasks across CPUs we don't 2100
2322 * To help distribute high priority tasks across CPUs we don't 2100
2323 * skip a task if it will be the highest priority task (i.e. smallest 2101 * skip a task if it will be the highest priority task (i.e. smallest
2324 * prio value) on its new queue regardless of its load weight 2102 * prio value) on its new queue regardless of its load weight
2325 */ 2103 */
2326 skip_for_load = tmp->load_weight > rem_load_move; 2104 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2327 if (skip_for_load && idx < this_best_prio) 2105 SCHED_LOAD_SCALE_FUZZ;
2328 skip_for_load = !best_prio_seen && idx == best_prio; 2106 if (skip_for_load && p->prio < this_best_prio)
2107 skip_for_load = !best_prio_seen && p->prio == best_prio;
2329 if (skip_for_load || 2108 if (skip_for_load ||
2330 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2109 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2331 2110
2332 best_prio_seen |= idx == best_prio; 2111 best_prio_seen |= p->prio == best_prio;
2333 if (curr != head) 2112 p = iterator->next(iterator->arg);
2334 goto skip_queue; 2113 goto next;
2335 idx++;
2336 goto skip_bitmap;
2337 } 2114 }
2338 2115
2339 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2116 pull_task(busiest, p, this_rq, this_cpu);
2340 pulled++; 2117 pulled++;
2341 rem_load_move -= tmp->load_weight; 2118 rem_load_move -= p->se.load.weight;
2342 2119
2343 /* 2120 /*
2344 * We only want to steal up to the prescribed number of tasks 2121 * We only want to steal up to the prescribed number of tasks
2345 * and the prescribed amount of weighted load. 2122 * and the prescribed amount of weighted load.
2346 */ 2123 */
2347 if (pulled < max_nr_move && rem_load_move > 0) { 2124 if (pulled < max_nr_move && rem_load_move > 0) {
2348 if (idx < this_best_prio) 2125 if (p->prio < this_best_prio)
2349 this_best_prio = idx; 2126 this_best_prio = p->prio;
2350 if (curr != head) 2127 p = iterator->next(iterator->arg);
2351 goto skip_queue; 2128 goto next;
2352 idx++;
2353 goto skip_bitmap;
2354 } 2129 }
2355out: 2130out:
2356 /* 2131 /*
@@ -2362,18 +2137,48 @@ out:
2362 2137
2363 if (all_pinned) 2138 if (all_pinned)
2364 *all_pinned = pinned; 2139 *all_pinned = pinned;
2140 *load_moved = max_load_move - rem_load_move;
2365 return pulled; 2141 return pulled;
2366} 2142}
2367 2143
2368/* 2144/*
2145 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2146 * load from busiest to this_rq, as part of a balancing operation within
2147 * "domain". Returns the number of tasks moved.
2148 *
2149 * Called with both runqueues locked.
2150 */
2151static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2152 unsigned long max_nr_move, unsigned long max_load_move,
2153 struct sched_domain *sd, enum cpu_idle_type idle,
2154 int *all_pinned)
2155{
2156 struct sched_class *class = sched_class_highest;
2157 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2158 long rem_load_move = max_load_move;
2159
2160 do {
2161 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2162 max_nr_move, (unsigned long)rem_load_move,
2163 sd, idle, all_pinned, &load_moved);
2164 total_nr_moved += nr_moved;
2165 max_nr_move -= nr_moved;
2166 rem_load_move -= load_moved;
2167 class = class->next;
2168 } while (class && max_nr_move && rem_load_move > 0);
2169
2170 return total_nr_moved;
2171}
2172
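[Editor's note] Minimal sketch of the pattern the new move_tasks() uses above: walk a linked list of scheduling classes, highest-priority class first, and let each class move tasks until the quota runs out. The struct and callbacks here are invented stand-ins, not the kernel's struct sched_class.

#include <stdio.h>

struct fake_class {
	const char *name;
	struct fake_class *next;
	int (*move)(int max_nr);	/* returns how many tasks it moved */
};

static int move_rt(int max_nr)   { return max_nr >= 1 ? 1 : 0; }
static int move_fair(int max_nr) { return max_nr >= 2 ? 2 : max_nr; }

static struct fake_class fair_class = { "fair", NULL,        move_fair };
static struct fake_class rt_class   = { "rt",   &fair_class, move_rt   };

int main(void)
{
	struct fake_class *class = &rt_class;	/* highest-priority class first */
	int max_nr_move = 3, total = 0;

	do {
		int moved = class->move(max_nr_move);

		total += moved;
		max_nr_move -= moved;
		printf("%s class moved %d task(s)\n", class->name, moved);
		class = class->next;
	} while (class && max_nr_move);

	printf("total moved: %d\n", total);
	return 0;
}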
2173/*
2369 * find_busiest_group finds and returns the busiest CPU group within the 2174 * find_busiest_group finds and returns the busiest CPU group within the
2370 * domain. It calculates and returns the amount of weighted load which 2175 * domain. It calculates and returns the amount of weighted load which
2371 * should be moved to restore balance via the imbalance parameter. 2176 * should be moved to restore balance via the imbalance parameter.
2372 */ 2177 */
2373static struct sched_group * 2178static struct sched_group *
2374find_busiest_group(struct sched_domain *sd, int this_cpu, 2179find_busiest_group(struct sched_domain *sd, int this_cpu,
2375 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2180 unsigned long *imbalance, enum cpu_idle_type idle,
2376 cpumask_t *cpus, int *balance) 2181 int *sd_idle, cpumask_t *cpus, int *balance)
2377{ 2182{
2378 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2183 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2379 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2184 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2391,9 +2196,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2391 max_load = this_load = total_load = total_pwr = 0; 2196 max_load = this_load = total_load = total_pwr = 0;
2392 busiest_load_per_task = busiest_nr_running = 0; 2197 busiest_load_per_task = busiest_nr_running = 0;
2393 this_load_per_task = this_nr_running = 0; 2198 this_load_per_task = this_nr_running = 0;
2394 if (idle == NOT_IDLE) 2199 if (idle == CPU_NOT_IDLE)
2395 load_idx = sd->busy_idx; 2200 load_idx = sd->busy_idx;
2396 else if (idle == NEWLY_IDLE) 2201 else if (idle == CPU_NEWLY_IDLE)
2397 load_idx = sd->newidle_idx; 2202 load_idx = sd->newidle_idx;
2398 else 2203 else
2399 load_idx = sd->idle_idx; 2204 load_idx = sd->idle_idx;
@@ -2437,7 +2242,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2437 2242
2438 avg_load += load; 2243 avg_load += load;
2439 sum_nr_running += rq->nr_running; 2244 sum_nr_running += rq->nr_running;
2440 sum_weighted_load += rq->raw_weighted_load; 2245 sum_weighted_load += weighted_cpuload(i);
2441 } 2246 }
2442 2247
2443 /* 2248 /*
@@ -2477,8 +2282,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2477 * Busy processors will not participate in power savings 2282 * Busy processors will not participate in power savings
2478 * balance. 2283 * balance.
2479 */ 2284 */
2480 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2285 if (idle == CPU_NOT_IDLE ||
2481 goto group_next; 2286 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2287 goto group_next;
2482 2288
2483 /* 2289 /*
2484 * If the local group is idle or completely loaded 2290 * If the local group is idle or completely loaded
@@ -2488,42 +2294,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2488 !this_nr_running)) 2294 !this_nr_running))
2489 power_savings_balance = 0; 2295 power_savings_balance = 0;
2490 2296
2491 /* 2297 /*
2492 * If a group is already running at full capacity or idle, 2298 * If a group is already running at full capacity or idle,
2493 * don't include that group in power savings calculations 2299 * don't include that group in power savings calculations
2494 */ 2300 */
2495 if (!power_savings_balance || sum_nr_running >= group_capacity 2301 if (!power_savings_balance || sum_nr_running >= group_capacity
2496 || !sum_nr_running) 2302 || !sum_nr_running)
2497 goto group_next; 2303 goto group_next;
2498 2304
2499 /* 2305 /*
2500 * Calculate the group which has the least non-idle load. 2306 * Calculate the group which has the least non-idle load.
2501 * This is the group from where we need to pick up the load 2307 * This is the group from where we need to pick up the load
2502 * for saving power 2308 * for saving power
2503 */ 2309 */
2504 if ((sum_nr_running < min_nr_running) || 2310 if ((sum_nr_running < min_nr_running) ||
2505 (sum_nr_running == min_nr_running && 2311 (sum_nr_running == min_nr_running &&
2506 first_cpu(group->cpumask) < 2312 first_cpu(group->cpumask) <
2507 first_cpu(group_min->cpumask))) { 2313 first_cpu(group_min->cpumask))) {
2508 group_min = group; 2314 group_min = group;
2509 min_nr_running = sum_nr_running; 2315 min_nr_running = sum_nr_running;
2510 min_load_per_task = sum_weighted_load / 2316 min_load_per_task = sum_weighted_load /
2511 sum_nr_running; 2317 sum_nr_running;
2512 } 2318 }
2513 2319
2514 /* 2320 /*
2515 * Calculate the group which is almost near its 2321 * Calculate the group which is almost near its
2516 * capacity but still has some space to pick up some load 2322 * capacity but still has some space to pick up some load
2517 * from other group and save more power 2323 * from other group and save more power
2518 */ 2324 */
2519 if (sum_nr_running <= group_capacity - 1) { 2325 if (sum_nr_running <= group_capacity - 1) {
2520 if (sum_nr_running > leader_nr_running || 2326 if (sum_nr_running > leader_nr_running ||
2521 (sum_nr_running == leader_nr_running && 2327 (sum_nr_running == leader_nr_running &&
2522 first_cpu(group->cpumask) > 2328 first_cpu(group->cpumask) >
2523 first_cpu(group_leader->cpumask))) { 2329 first_cpu(group_leader->cpumask))) {
2524 group_leader = group; 2330 group_leader = group;
2525 leader_nr_running = sum_nr_running; 2331 leader_nr_running = sum_nr_running;
2526 } 2332 }
2527 } 2333 }
2528group_next: 2334group_next:
2529#endif 2335#endif
@@ -2578,7 +2384,7 @@ group_next:
2578 * a think about bumping its value to force at least one task to be 2384 * a think about bumping its value to force at least one task to be
2579 * moved 2385 * moved
2580 */ 2386 */
2581 if (*imbalance < busiest_load_per_task) { 2387 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2582 unsigned long tmp, pwr_now, pwr_move; 2388 unsigned long tmp, pwr_now, pwr_move;
2583 unsigned int imbn; 2389 unsigned int imbn;
2584 2390
@@ -2592,7 +2398,8 @@ small_imbalance:
2592 } else 2398 } else
2593 this_load_per_task = SCHED_LOAD_SCALE; 2399 this_load_per_task = SCHED_LOAD_SCALE;
2594 2400
2595 if (max_load - this_load >= busiest_load_per_task * imbn) { 2401 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2402 busiest_load_per_task * imbn) {
2596 *imbalance = busiest_load_per_task; 2403 *imbalance = busiest_load_per_task;
2597 return busiest; 2404 return busiest;
2598 } 2405 }
@@ -2639,7 +2446,7 @@ small_imbalance:
2639 2446
2640out_balanced: 2447out_balanced:
2641#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2448#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2642 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2449 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2643 goto ret; 2450 goto ret;
2644 2451
2645 if (this == group_leader && group_leader != group_min) { 2452 if (this == group_leader && group_leader != group_min) {
@@ -2656,7 +2463,7 @@ ret:
2656 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2463 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2657 */ 2464 */
2658static struct rq * 2465static struct rq *
2659find_busiest_queue(struct sched_group *group, enum idle_type idle, 2466find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2660 unsigned long imbalance, cpumask_t *cpus) 2467 unsigned long imbalance, cpumask_t *cpus)
2661{ 2468{
2662 struct rq *busiest = NULL, *rq; 2469 struct rq *busiest = NULL, *rq;
@@ -2664,17 +2471,19 @@ find_busiest_queue(struct sched_group *group, enum idle_type idle,
2664 int i; 2471 int i;
2665 2472
2666 for_each_cpu_mask(i, group->cpumask) { 2473 for_each_cpu_mask(i, group->cpumask) {
2474 unsigned long wl;
2667 2475
2668 if (!cpu_isset(i, *cpus)) 2476 if (!cpu_isset(i, *cpus))
2669 continue; 2477 continue;
2670 2478
2671 rq = cpu_rq(i); 2479 rq = cpu_rq(i);
2480 wl = weighted_cpuload(i);
2672 2481
2673 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2482 if (rq->nr_running == 1 && wl > imbalance)
2674 continue; 2483 continue;
2675 2484
2676 if (rq->raw_weighted_load > max_load) { 2485 if (wl > max_load) {
2677 max_load = rq->raw_weighted_load; 2486 max_load = wl;
2678 busiest = rq; 2487 busiest = rq;
2679 } 2488 }
2680 } 2489 }
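[Editor's note] Userspace sketch of the selection loop in find_busiest_queue() above: CPUs whose single runnable task alone exceeds the imbalance are skipped (nothing movable there), and the largest weighted load among the rest wins. Numbers are invented.

#include <stdio.h>

int main(void)
{
	unsigned long wl[]         = { 3072, 1024, 4096, 2048 };	/* weighted load */
	unsigned int  nr_running[] = {    3,    1,    1,    2 };
	unsigned long imbalance = 1536, max_load = 0;
	int i, busiest = -1;

	for (i = 0; i < 4; i++) {
		if (nr_running[i] == 1 && wl[i] > imbalance)
			continue;	/* lone heavy task: cannot be balanced away */
		if (wl[i] > max_load) {
			max_load = wl[i];
			busiest = i;
		}
	}

	printf("busiest cpu: %d (load %lu)\n", busiest, max_load);
	return 0;
}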
@@ -2698,7 +2507,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2698 * tasks if there is an imbalance. 2507 * tasks if there is an imbalance.
2699 */ 2508 */
2700static int load_balance(int this_cpu, struct rq *this_rq, 2509static int load_balance(int this_cpu, struct rq *this_rq,
2701 struct sched_domain *sd, enum idle_type idle, 2510 struct sched_domain *sd, enum cpu_idle_type idle,
2702 int *balance) 2511 int *balance)
2703{ 2512{
2704 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2513 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
@@ -2711,10 +2520,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2711 /* 2520 /*
2712 * When power savings policy is enabled for the parent domain, idle 2521 * When power savings policy is enabled for the parent domain, idle
2713 * sibling can pick up load irrespective of busy siblings. In this case, 2522 * sibling can pick up load irrespective of busy siblings. In this case,
2714 * let the state of idle sibling percolate up as IDLE, instead of 2523 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2715 * portraying it as NOT_IDLE. 2524 * portraying it as CPU_NOT_IDLE.
2716 */ 2525 */
2717 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2526 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2718 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2527 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2719 sd_idle = 1; 2528 sd_idle = 1;
2720 2529
@@ -2848,7 +2657,7 @@ out_one_pinned:
2848 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2657 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2849 * tasks if there is an imbalance. 2658 * tasks if there is an imbalance.
2850 * 2659 *
2851 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2660 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2852 * this_rq is locked. 2661 * this_rq is locked.
2853 */ 2662 */
2854static int 2663static int
@@ -2865,31 +2674,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2865 * When power savings policy is enabled for the parent domain, idle 2674 * When power savings policy is enabled for the parent domain, idle
2866 * sibling can pick up load irrespective of busy siblings. In this case, 2675 * sibling can pick up load irrespective of busy siblings. In this case,
2867 * let the state of idle sibling percolate up as IDLE, instead of 2676 * let the state of idle sibling percolate up as IDLE, instead of
2868 * portraying it as NOT_IDLE. 2677 * portraying it as CPU_NOT_IDLE.
2869 */ 2678 */
2870 if (sd->flags & SD_SHARE_CPUPOWER && 2679 if (sd->flags & SD_SHARE_CPUPOWER &&
2871 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2872 sd_idle = 1; 2681 sd_idle = 1;
2873 2682
2874 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2683 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2875redo: 2684redo:
2876 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2685 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2877 &sd_idle, &cpus, NULL); 2686 &sd_idle, &cpus, NULL);
2878 if (!group) { 2687 if (!group) {
2879 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2688 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2880 goto out_balanced; 2689 goto out_balanced;
2881 } 2690 }
2882 2691
2883 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2692 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2884 &cpus); 2693 &cpus);
2885 if (!busiest) { 2694 if (!busiest) {
2886 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2695 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2887 goto out_balanced; 2696 goto out_balanced;
2888 } 2697 }
2889 2698
2890 BUG_ON(busiest == this_rq); 2699 BUG_ON(busiest == this_rq);
2891 2700
2892 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2701 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2893 2702
2894 nr_moved = 0; 2703 nr_moved = 0;
2895 if (busiest->nr_running > 1) { 2704 if (busiest->nr_running > 1) {
@@ -2897,7 +2706,7 @@ redo:
2897 double_lock_balance(this_rq, busiest); 2706 double_lock_balance(this_rq, busiest);
2898 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2707 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2899 minus_1_or_zero(busiest->nr_running), 2708 minus_1_or_zero(busiest->nr_running),
2900 imbalance, sd, NEWLY_IDLE, NULL); 2709 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2901 spin_unlock(&busiest->lock); 2710 spin_unlock(&busiest->lock);
2902 2711
2903 if (!nr_moved) { 2712 if (!nr_moved) {
@@ -2908,7 +2717,7 @@ redo:
2908 } 2717 }
2909 2718
2910 if (!nr_moved) { 2719 if (!nr_moved) {
2911 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2720 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2912 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2913 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2914 return -1; 2723 return -1;
@@ -2918,7 +2727,7 @@ redo:
2918 return nr_moved; 2727 return nr_moved;
2919 2728
2920out_balanced: 2729out_balanced:
2921 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2730 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2922 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2923 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2924 return -1; 2733 return -1;
@@ -2934,8 +2743,8 @@ out_balanced:
2934static void idle_balance(int this_cpu, struct rq *this_rq) 2743static void idle_balance(int this_cpu, struct rq *this_rq)
2935{ 2744{
2936 struct sched_domain *sd; 2745 struct sched_domain *sd;
2937 int pulled_task = 0; 2746 int pulled_task = -1;
2938 unsigned long next_balance = jiffies + 60 * HZ; 2747 unsigned long next_balance = jiffies + HZ;
2939 2748
2940 for_each_domain(this_cpu, sd) { 2749 for_each_domain(this_cpu, sd) {
2941 unsigned long interval; 2750 unsigned long interval;
@@ -2954,12 +2763,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
2954 if (pulled_task) 2763 if (pulled_task)
2955 break; 2764 break;
2956 } 2765 }
2957 if (!pulled_task) 2766 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2958 /* 2767 /*
2959 * We are going idle. next_balance may be set based on 2768 * We are going idle. next_balance may be set based on
2960 * a busy processor. So reset next_balance. 2769 * a busy processor. So reset next_balance.
2961 */ 2770 */
2962 this_rq->next_balance = next_balance; 2771 this_rq->next_balance = next_balance;
2772 }
2963} 2773}
2964 2774
2965/* 2775/*
@@ -3003,7 +2813,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3003 schedstat_inc(sd, alb_cnt); 2813 schedstat_inc(sd, alb_cnt);
3004 2814
3005 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2815 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
3006 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2816 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
3007 NULL)) 2817 NULL))
3008 schedstat_inc(sd, alb_pushed); 2818 schedstat_inc(sd, alb_pushed);
3009 else 2819 else
@@ -3012,32 +2822,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3012 spin_unlock(&target_rq->lock); 2822 spin_unlock(&target_rq->lock);
3013} 2823}
3014 2824
3015static void update_load(struct rq *this_rq)
3016{
3017 unsigned long this_load;
3018 unsigned int i, scale;
3019
3020 this_load = this_rq->raw_weighted_load;
3021
3022 /* Update our load: */
3023 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
3024 unsigned long old_load, new_load;
3025
3026 /* scale is effectively 1 << i now, and >> i divides by scale */
3027
3028 old_load = this_rq->cpu_load[i];
3029 new_load = this_load;
3030 /*
3031 * Round up the averaging division if load is increasing. This
3032 * prevents us from getting stuck on 9 if the load is 10, for
3033 * example.
3034 */
3035 if (new_load > old_load)
3036 new_load += scale-1;
3037 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3038 }
3039}
3040
3041#ifdef CONFIG_NO_HZ 2825#ifdef CONFIG_NO_HZ
3042static struct { 2826static struct {
3043 atomic_t load_balancer; 2827 atomic_t load_balancer;
@@ -3120,7 +2904,7 @@ static DEFINE_SPINLOCK(balancing);
3120 * 2904 *
3121 * Balancing parameters are set up in arch_init_sched_domains. 2905 * Balancing parameters are set up in arch_init_sched_domains.
3122 */ 2906 */
3123static inline void rebalance_domains(int cpu, enum idle_type idle) 2907static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3124{ 2908{
3125 int balance = 1; 2909 int balance = 1;
3126 struct rq *rq = cpu_rq(cpu); 2910 struct rq *rq = cpu_rq(cpu);
@@ -3134,13 +2918,16 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3134 continue; 2918 continue;
3135 2919
3136 interval = sd->balance_interval; 2920 interval = sd->balance_interval;
3137 if (idle != SCHED_IDLE) 2921 if (idle != CPU_IDLE)
3138 interval *= sd->busy_factor; 2922 interval *= sd->busy_factor;
3139 2923
3140 /* scale ms to jiffies */ 2924 /* scale ms to jiffies */
3141 interval = msecs_to_jiffies(interval); 2925 interval = msecs_to_jiffies(interval);
3142 if (unlikely(!interval)) 2926 if (unlikely(!interval))
3143 interval = 1; 2927 interval = 1;
2928 if (interval > HZ*NR_CPUS/10)
2929 interval = HZ*NR_CPUS/10;
2930
3144 2931
3145 if (sd->flags & SD_SERIALIZE) { 2932 if (sd->flags & SD_SERIALIZE) {
3146 if (!spin_trylock(&balancing)) 2933 if (!spin_trylock(&balancing))
@@ -3154,7 +2941,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3154 * longer idle, or one of our SMT siblings is 2941 * longer idle, or one of our SMT siblings is
3155 * not idle. 2942 * not idle.
3156 */ 2943 */
3157 idle = NOT_IDLE; 2944 idle = CPU_NOT_IDLE;
3158 } 2945 }
3159 sd->last_balance = jiffies; 2946 sd->last_balance = jiffies;
3160 } 2947 }
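[Editor's note] Rough userspace model of the interval handling in the hunk above: the per-domain balance interval (in ms) is stretched by busy_factor when the CPU is not idle, converted to jiffies, floored at 1 and, per the new clamp, capped at HZ*NR_CPUS/10. HZ, NR_CPUS and the tunable values below are examples only.

#include <stdio.h>

#define HZ      250
#define NR_CPUS 4

int main(void)
{
	unsigned long interval_ms = 64;		/* sd->balance_interval */
	unsigned long busy_factor = 32;		/* sd->busy_factor */
	int cpu_is_idle = 0;

	unsigned long interval = interval_ms;

	if (!cpu_is_idle)
		interval *= busy_factor;

	interval = interval * HZ / 1000;	/* roughly msecs_to_jiffies() */
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)	/* the newly added upper bound */
		interval = HZ * NR_CPUS / 10;

	printf("rebalance every %lu jiffies\n", interval);
	return 0;
}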
@@ -3182,11 +2969,12 @@ out:
3182 */ 2969 */
3183static void run_rebalance_domains(struct softirq_action *h) 2970static void run_rebalance_domains(struct softirq_action *h)
3184{ 2971{
3185 int local_cpu = smp_processor_id(); 2972 int this_cpu = smp_processor_id();
3186 struct rq *local_rq = cpu_rq(local_cpu); 2973 struct rq *this_rq = cpu_rq(this_cpu);
3187 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; 2974 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2975 CPU_IDLE : CPU_NOT_IDLE;
3188 2976
3189 rebalance_domains(local_cpu, idle); 2977 rebalance_domains(this_cpu, idle);
3190 2978
3191#ifdef CONFIG_NO_HZ 2979#ifdef CONFIG_NO_HZ
3192 /* 2980 /*
@@ -3194,13 +2982,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3194 * balancing on behalf of the other idle cpus whose ticks are 2982 * balancing on behalf of the other idle cpus whose ticks are
3195 * stopped. 2983 * stopped.
3196 */ 2984 */
3197 if (local_rq->idle_at_tick && 2985 if (this_rq->idle_at_tick &&
3198 atomic_read(&nohz.load_balancer) == local_cpu) { 2986 atomic_read(&nohz.load_balancer) == this_cpu) {
3199 cpumask_t cpus = nohz.cpu_mask; 2987 cpumask_t cpus = nohz.cpu_mask;
3200 struct rq *rq; 2988 struct rq *rq;
3201 int balance_cpu; 2989 int balance_cpu;
3202 2990
3203 cpu_clear(local_cpu, cpus); 2991 cpu_clear(this_cpu, cpus);
3204 for_each_cpu_mask(balance_cpu, cpus) { 2992 for_each_cpu_mask(balance_cpu, cpus) {
3205 /* 2993 /*
3206 * If this cpu gets work to do, stop the load balancing 2994 * If this cpu gets work to do, stop the load balancing
@@ -3213,8 +3001,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3213 rebalance_domains(balance_cpu, SCHED_IDLE); 3001 rebalance_domains(balance_cpu, SCHED_IDLE);
3214 3002
3215 rq = cpu_rq(balance_cpu); 3003 rq = cpu_rq(balance_cpu);
3216 if (time_after(local_rq->next_balance, rq->next_balance)) 3004 if (time_after(this_rq->next_balance, rq->next_balance))
3217 local_rq->next_balance = rq->next_balance; 3005 this_rq->next_balance = rq->next_balance;
3218 } 3006 }
3219 } 3007 }
3220#endif 3008#endif
@@ -3227,9 +3015,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3227 * idle load balancing owner or decide to stop the periodic load balancing, 3015 * idle load balancing owner or decide to stop the periodic load balancing,
3228 * if the whole system is idle. 3016 * if the whole system is idle.
3229 */ 3017 */
3230static inline void trigger_load_balance(int cpu) 3018static inline void trigger_load_balance(struct rq *rq, int cpu)
3231{ 3019{
3232 struct rq *rq = cpu_rq(cpu);
3233#ifdef CONFIG_NO_HZ 3020#ifdef CONFIG_NO_HZ
3234 /* 3021 /*
3235 * If we were in the nohz mode recently and busy at the current 3022 * If we were in the nohz mode recently and busy at the current
@@ -3281,13 +3068,29 @@ static inline void trigger_load_balance(int cpu)
3281 if (time_after_eq(jiffies, rq->next_balance)) 3068 if (time_after_eq(jiffies, rq->next_balance))
3282 raise_softirq(SCHED_SOFTIRQ); 3069 raise_softirq(SCHED_SOFTIRQ);
3283} 3070}
3284#else 3071
3072#else /* CONFIG_SMP */
3073
3285/* 3074/*
3286 * on UP we do not need to balance between CPUs: 3075 * on UP we do not need to balance between CPUs:
3287 */ 3076 */
3288static inline void idle_balance(int cpu, struct rq *rq) 3077static inline void idle_balance(int cpu, struct rq *rq)
3289{ 3078{
3290} 3079}
3080
3081/* Avoid "used but not defined" warning on UP */
3082static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 unsigned long max_nr_move, unsigned long max_load_move,
3084 struct sched_domain *sd, enum cpu_idle_type idle,
3085 int *all_pinned, unsigned long *load_moved,
3086 int this_best_prio, int best_prio, int best_prio_seen,
3087 struct rq_iterator *iterator)
3088{
3089 *load_moved = 0;
3090
3091 return 0;
3092}
3093
3291#endif 3094#endif
3292 3095
3293DEFINE_PER_CPU(struct kernel_stat, kstat); 3096DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3295,54 +3098,28 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
3295EXPORT_PER_CPU_SYMBOL(kstat); 3098EXPORT_PER_CPU_SYMBOL(kstat);
3296 3099
3297/* 3100/*
3298 * This is called on clock ticks and on context switches. 3101 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3299 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3102 * that have not yet been banked in case the task is currently running.
3300 */
3301static inline void
3302update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3303{
3304 p->sched_time += now - p->last_ran;
3305 p->last_ran = rq->most_recent_timestamp = now;
3306}
3307
3308/*
3309 * Return current->sched_time plus any more ns on the sched_clock
3310 * that have not yet been banked.
3311 */ 3103 */
3312unsigned long long current_sched_time(const struct task_struct *p) 3104unsigned long long task_sched_runtime(struct task_struct *p)
3313{ 3105{
3314 unsigned long long ns;
3315 unsigned long flags; 3106 unsigned long flags;
3107 u64 ns, delta_exec;
3108 struct rq *rq;
3316 3109
3317 local_irq_save(flags); 3110 rq = task_rq_lock(p, &flags);
3318 ns = p->sched_time + sched_clock() - p->last_ran; 3111 ns = p->se.sum_exec_runtime;
3319 local_irq_restore(flags); 3112 if (rq->curr == p) {
3113 delta_exec = rq_clock(rq) - p->se.exec_start;
3114 if ((s64)delta_exec > 0)
3115 ns += delta_exec;
3116 }
3117 task_rq_unlock(rq, &flags);
3320 3118
3321 return ns; 3119 return ns;
3322} 3120}
3323 3121
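[Editor's note] Sketch of the arithmetic behind task_sched_runtime() above: the banked sum_exec_runtime plus, if the task is currently on the CPU, the not-yet-banked slice since exec_start, with a guard against a negative delta. Plain variables stand in for rq and task state; this is not kernel code.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sum_exec_runtime = 5000000;	/* ns already accounted */
	uint64_t exec_start       = 9000000;	/* clock when it last went on-CPU */
	uint64_t rq_clock_now     = 9300000;	/* current runqueue clock */
	int currently_running     = 1;

	uint64_t ns = sum_exec_runtime;

	if (currently_running) {
		int64_t delta_exec = (int64_t)(rq_clock_now - exec_start);

		if (delta_exec > 0)		/* guard against clock skew */
			ns += (uint64_t)delta_exec;
	}

	printf("total runtime so far: %llu ns\n", (unsigned long long)ns);
	return 0;
}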
3324/* 3122/*
3325 * We place interactive tasks back into the active array, if possible.
3326 *
3327 * To guarantee that this does not starve expired tasks we ignore the
3328 * interactivity of a task if the first expired task had to wait more
3329 * than a 'reasonable' amount of time. This deadline timeout is
3330 * load-dependent, as the frequency of array switched decreases with
3331 * increasing number of running tasks. We also ignore the interactivity
3332 * if a better static_prio task has expired:
3333 */
3334static inline int expired_starving(struct rq *rq)
3335{
3336 if (rq->curr->static_prio > rq->best_expired_prio)
3337 return 1;
3338 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3339 return 0;
3340 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3341 return 1;
3342 return 0;
3343}
3344
3345/*
3346 * Account user cpu time to a process. 3123 * Account user cpu time to a process.
3347 * @p: the process that the cpu time gets accounted to 3124 * @p: the process that the cpu time gets accounted to
3348 * @hardirq_offset: the offset to subtract from hardirq_count() 3125 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3415,81 +3192,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3415 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3192 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3416} 3193}
3417 3194
3418static void task_running_tick(struct rq *rq, struct task_struct *p)
3419{
3420 if (p->array != rq->active) {
3421 /* Task has expired but was not scheduled yet */
3422 set_tsk_need_resched(p);
3423 return;
3424 }
3425 spin_lock(&rq->lock);
3426 /*
3427 * The task was running during this tick - update the
3428 * time slice counter. Note: we do not update a thread's
3429 * priority until it either goes to sleep or uses up its
3430 * timeslice. This makes it possible for interactive tasks
3431 * to use up their timeslices at their highest priority levels.
3432 */
3433 if (rt_task(p)) {
3434 /*
3435 * RR tasks need a special form of timeslice management.
3436 * FIFO tasks have no timeslices.
3437 */
3438 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3439 p->time_slice = task_timeslice(p);
3440 p->first_time_slice = 0;
3441 set_tsk_need_resched(p);
3442
3443 /* put it at the end of the queue: */
3444 requeue_task(p, rq->active);
3445 }
3446 goto out_unlock;
3447 }
3448 if (!--p->time_slice) {
3449 dequeue_task(p, rq->active);
3450 set_tsk_need_resched(p);
3451 p->prio = effective_prio(p);
3452 p->time_slice = task_timeslice(p);
3453 p->first_time_slice = 0;
3454
3455 if (!rq->expired_timestamp)
3456 rq->expired_timestamp = jiffies;
3457 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3458 enqueue_task(p, rq->expired);
3459 if (p->static_prio < rq->best_expired_prio)
3460 rq->best_expired_prio = p->static_prio;
3461 } else
3462 enqueue_task(p, rq->active);
3463 } else {
3464 /*
3465 * Prevent a too long timeslice allowing a task to monopolize
3466 * the CPU. We do this by splitting up the timeslice into
3467 * smaller pieces.
3468 *
3469 * Note: this does not mean the task's timeslices expire or
3470 * get lost in any way, they just might be preempted by
3471 * another task of equal priority. (one with higher
3472 * priority would have preempted this task already.) We
3473 * requeue this task to the end of the list on this priority
3474 * level, which is in essence a round-robin of tasks with
3475 * equal priority.
3476 *
3477 * This only applies to tasks in the interactive
3478 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3479 */
3480 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3481 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3482 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3483 (p->array == rq->active)) {
3484
3485 requeue_task(p, rq->active);
3486 set_tsk_need_resched(p);
3487 }
3488 }
3489out_unlock:
3490 spin_unlock(&rq->lock);
3491}
3492
3493/* 3195/*
3494 * This function gets called by the timer code, with HZ frequency. 3196 * This function gets called by the timer code, with HZ frequency.
3495 * We call it with interrupts disabled. 3197 * We call it with interrupts disabled.
@@ -3499,20 +3201,19 @@ out_unlock:
3499 */ 3201 */
3500void scheduler_tick(void) 3202void scheduler_tick(void)
3501{ 3203{
3502 unsigned long long now = sched_clock();
3503 struct task_struct *p = current;
3504 int cpu = smp_processor_id(); 3204 int cpu = smp_processor_id();
3505 int idle_at_tick = idle_cpu(cpu);
3506 struct rq *rq = cpu_rq(cpu); 3205 struct rq *rq = cpu_rq(cpu);
3206 struct task_struct *curr = rq->curr;
3507 3207
3508 update_cpu_clock(p, rq, now); 3208 spin_lock(&rq->lock);
3209 if (curr != rq->idle) /* FIXME: needed? */
3210 curr->sched_class->task_tick(rq, curr);
3211 update_cpu_load(rq);
3212 spin_unlock(&rq->lock);
3509 3213
3510 if (!idle_at_tick)
3511 task_running_tick(rq, p);
3512#ifdef CONFIG_SMP 3214#ifdef CONFIG_SMP
3513 update_load(rq); 3215 rq->idle_at_tick = idle_cpu(cpu);
3514 rq->idle_at_tick = idle_at_tick; 3216 trigger_load_balance(rq, cpu);
3515 trigger_load_balance(cpu);
3516#endif 3217#endif
3517} 3218}
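The rewritten scheduler_tick() above no longer knows anything about timeslices or arrays; it simply dispatches to the current task's scheduling class. The fragment below is a self-contained sketch of that delegation pattern using invented demo types -- it is not the kernel's struct sched_class (which this patch defines elsewhere), just an illustration of why the core function shrank to a lock, one indirect call and a load update.

#include <stdio.h>

struct demo_task;

/* Illustrative stand-in for a per-policy class with a tick hook. */
struct demo_sched_class {
	const char *name;
	void (*task_tick)(struct demo_task *t);
};

struct demo_task {
	const struct demo_sched_class *sched_class;
	const char *comm;
};

static void fair_task_tick(struct demo_task *t)
{
	printf("%s: charge exec time, maybe set need_resched\n", t->comm);
}

static const struct demo_sched_class demo_fair_class = {
	.name = "fair", .task_tick = fair_task_tick,
};

static void demo_scheduler_tick(struct demo_task *curr)
{
	/* The core stays policy-agnostic; the class does the work. */
	curr->sched_class->task_tick(curr);
}

int main(void)
{
	struct demo_task t = { .sched_class = &demo_fair_class, .comm = "cc1" };

	demo_scheduler_tick(&t);
	return 0;
}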
3518 3219
@@ -3554,170 +3255,129 @@ EXPORT_SYMBOL(sub_preempt_count);
3554 3255
3555#endif 3256#endif
3556 3257
3557static inline int interactive_sleep(enum sleep_type sleep_type) 3258/*
3259 * Print scheduling while atomic bug:
3260 */
3261static noinline void __schedule_bug(struct task_struct *prev)
3558{ 3262{
3559 return (sleep_type == SLEEP_INTERACTIVE || 3263 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3560 sleep_type == SLEEP_INTERRUPTED); 3264 prev->comm, preempt_count(), prev->pid);
3265 debug_show_held_locks(prev);
3266 if (irqs_disabled())
3267 print_irqtrace_events(prev);
3268 dump_stack();
3561} 3269}
3562 3270
3563/* 3271/*
3564 * schedule() is the main scheduler function. 3272 * Various schedule()-time debugging checks and statistics:
3565 */ 3273 */
3566asmlinkage void __sched schedule(void) 3274static inline void schedule_debug(struct task_struct *prev)
3567{ 3275{
3568 struct task_struct *prev, *next;
3569 struct prio_array *array;
3570 struct list_head *queue;
3571 unsigned long long now;
3572 unsigned long run_time;
3573 int cpu, idx, new_prio;
3574 long *switch_count;
3575 struct rq *rq;
3576
3577 /* 3276 /*
3578 * Test if we are atomic. Since do_exit() needs to call into 3277 * Test if we are atomic. Since do_exit() needs to call into
3579 * schedule() atomically, we ignore that path for now. 3278 * schedule() atomically, we ignore that path for now.
3580 * Otherwise, whine if we are scheduling when we should not be. 3279 * Otherwise, whine if we are scheduling when we should not be.
3581 */ 3280 */
3582 if (unlikely(in_atomic() && !current->exit_state)) { 3281 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3583 printk(KERN_ERR "BUG: scheduling while atomic: " 3282 __schedule_bug(prev);
3584 "%s/0x%08x/%d\n",
3585 current->comm, preempt_count(), current->pid);
3586 debug_show_held_locks(current);
3587 if (irqs_disabled())
3588 print_irqtrace_events(current);
3589 dump_stack();
3590 }
3591 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3592 3283
3593need_resched: 3284 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3594 preempt_disable();
3595 prev = current;
3596 release_kernel_lock(prev);
3597need_resched_nonpreemptible:
3598 rq = this_rq();
3599 3285
3600 /* 3286 schedstat_inc(this_rq(), sched_cnt);
3601 * The idle thread is not allowed to schedule! 3287}
3602 * Remove this check after it has been exercised a bit.
3603 */
3604 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3605 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3606 dump_stack();
3607 }
3608 3288
3609 schedstat_inc(rq, sched_cnt); 3289/*
3610 now = sched_clock(); 3290 * Pick up the highest-prio task:
3611 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3291 */
3612 run_time = now - prev->timestamp; 3292static inline struct task_struct *
3613 if (unlikely((long long)(now - prev->timestamp) < 0)) 3293pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3614 run_time = 0; 3294{
3615 } else 3295 struct sched_class *class;
3616 run_time = NS_MAX_SLEEP_AVG; 3296 struct task_struct *p;
3617 3297
3618 /* 3298 /*
3619 * Tasks charged proportionately less run_time at high sleep_avg to 3299 * Optimization: we know that if all tasks are in
3620 * delay them losing their interactive status 3300 * the fair class we can call that function directly:
3621 */ 3301 */
3622 run_time /= (CURRENT_BONUS(prev) ? : 1); 3302 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3623 3303 p = fair_sched_class.pick_next_task(rq, now);
3624 spin_lock_irq(&rq->lock); 3304 if (likely(p))
3625 3305 return p;
3626 switch_count = &prev->nivcsw;
3627 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3628 switch_count = &prev->nvcsw;
3629 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3630 unlikely(signal_pending(prev))))
3631 prev->state = TASK_RUNNING;
3632 else {
3633 if (prev->state == TASK_UNINTERRUPTIBLE)
3634 rq->nr_uninterruptible++;
3635 deactivate_task(prev, rq);
3636 }
3637 }
3638
3639 cpu = smp_processor_id();
3640 if (unlikely(!rq->nr_running)) {
3641 idle_balance(cpu, rq);
3642 if (!rq->nr_running) {
3643 next = rq->idle;
3644 rq->expired_timestamp = 0;
3645 goto switch_tasks;
3646 }
3647 } 3306 }
3648 3307
3649 array = rq->active; 3308 class = sched_class_highest;
3650 if (unlikely(!array->nr_active)) { 3309 for ( ; ; ) {
3310 p = class->pick_next_task(rq, now);
3311 if (p)
3312 return p;
3651 /* 3313 /*
3652 * Switch the active and expired arrays. 3314 * Will never be NULL as the idle class always
3315 * returns a non-NULL p:
3653 */ 3316 */
3654 schedstat_inc(rq, sched_switch); 3317 class = class->next;
3655 rq->active = rq->expired;
3656 rq->expired = array;
3657 array = rq->active;
3658 rq->expired_timestamp = 0;
3659 rq->best_expired_prio = MAX_PRIO;
3660 } 3318 }
3319}
3320
3321/*
3322 * schedule() is the main scheduler function.
3323 */
3324asmlinkage void __sched schedule(void)
3325{
3326 struct task_struct *prev, *next;
3327 long *switch_count;
3328 struct rq *rq;
3329 u64 now;
3330 int cpu;
3661 3331
3662 idx = sched_find_first_bit(array->bitmap); 3332need_resched:
3663 queue = array->queue + idx; 3333 preempt_disable();
3664 next = list_entry(queue->next, struct task_struct, run_list); 3334 cpu = smp_processor_id();
3335 rq = cpu_rq(cpu);
3336 rcu_qsctr_inc(cpu);
3337 prev = rq->curr;
3338 switch_count = &prev->nivcsw;
3665 3339
3666 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3340 release_kernel_lock(prev);
3667 unsigned long long delta = now - next->timestamp; 3341need_resched_nonpreemptible:
3668 if (unlikely((long long)(now - next->timestamp) < 0))
3669 delta = 0;
3670 3342
3671 if (next->sleep_type == SLEEP_INTERACTIVE) 3343 schedule_debug(prev);
3672 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3673 3344
3674 array = next->array; 3345 spin_lock_irq(&rq->lock);
3675 new_prio = recalc_task_prio(next, next->timestamp + delta); 3346 clear_tsk_need_resched(prev);
3676 3347
3677 if (unlikely(next->prio != new_prio)) { 3348 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3678 dequeue_task(next, array); 3349 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3679 next->prio = new_prio; 3350 unlikely(signal_pending(prev)))) {
3680 enqueue_task(next, array); 3351 prev->state = TASK_RUNNING;
3352 } else {
3353 deactivate_task(rq, prev, 1);
3681 } 3354 }
3355 switch_count = &prev->nvcsw;
3682 } 3356 }
3683 next->sleep_type = SLEEP_NORMAL;
3684switch_tasks:
3685 if (next == rq->idle)
3686 schedstat_inc(rq, sched_goidle);
3687 prefetch(next);
3688 prefetch_stack(next);
3689 clear_tsk_need_resched(prev);
3690 rcu_qsctr_inc(task_cpu(prev));
3691 3357
3692 update_cpu_clock(prev, rq, now); 3358 if (unlikely(!rq->nr_running))
3359 idle_balance(cpu, rq);
3693 3360
3694 prev->sleep_avg -= run_time; 3361 now = __rq_clock(rq);
3695 if ((long)prev->sleep_avg <= 0) 3362 prev->sched_class->put_prev_task(rq, prev, now);
3696 prev->sleep_avg = 0; 3363 next = pick_next_task(rq, prev, now);
3697 prev->timestamp = prev->last_ran = now;
3698 3364
3699 sched_info_switch(prev, next); 3365 sched_info_switch(prev, next);
3366
3700 if (likely(prev != next)) { 3367 if (likely(prev != next)) {
3701 next->timestamp = next->last_ran = now;
3702 rq->nr_switches++; 3368 rq->nr_switches++;
3703 rq->curr = next; 3369 rq->curr = next;
3704 ++*switch_count; 3370 ++*switch_count;
3705 3371
3706 prepare_task_switch(rq, next); 3372 context_switch(rq, prev, next); /* unlocks the rq */
3707 prev = context_switch(rq, prev, next);
3708 barrier();
3709 /*
3710 * this_rq must be evaluated again because prev may have moved
3711 * CPUs since it called schedule(), thus the 'rq' on its stack
3712 * frame will be invalid.
3713 */
3714 finish_task_switch(this_rq(), prev);
3715 } else 3373 } else
3716 spin_unlock_irq(&rq->lock); 3374 spin_unlock_irq(&rq->lock);
3717 3375
3718 prev = current; 3376 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3719 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3377 cpu = smp_processor_id();
3378 rq = cpu_rq(cpu);
3720 goto need_resched_nonpreemptible; 3379 goto need_resched_nonpreemptible;
3380 }
3721 preempt_enable_no_resched(); 3381 preempt_enable_no_resched();
3722 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3382 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3723 goto need_resched; 3383 goto need_resched;
@@ -4045,74 +3705,85 @@ out:
4045} 3705}
4046EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3706EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4047 3707
4048 3708static inline void
4049#define SLEEP_ON_VAR \ 3709sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4050 unsigned long flags; \ 3710{
4051 wait_queue_t wait; \ 3711 spin_lock_irqsave(&q->lock, *flags);
4052 init_waitqueue_entry(&wait, current); 3712 __add_wait_queue(q, wait);
4053
4054#define SLEEP_ON_HEAD \
4055 spin_lock_irqsave(&q->lock,flags); \
4056 __add_wait_queue(q, &wait); \
4057 spin_unlock(&q->lock); 3713 spin_unlock(&q->lock);
3714}
4058 3715
4059#define SLEEP_ON_TAIL \ 3716static inline void
4060 spin_lock_irq(&q->lock); \ 3717sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4061 __remove_wait_queue(q, &wait); \ 3718{
4062 spin_unlock_irqrestore(&q->lock, flags); 3719 spin_lock_irq(&q->lock);
3720 __remove_wait_queue(q, wait);
3721 spin_unlock_irqrestore(&q->lock, *flags);
3722}
4063 3723
4064void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3724void __sched interruptible_sleep_on(wait_queue_head_t *q)
4065{ 3725{
4066 SLEEP_ON_VAR 3726 unsigned long flags;
3727 wait_queue_t wait;
3728
3729 init_waitqueue_entry(&wait, current);
4067 3730
4068 current->state = TASK_INTERRUPTIBLE; 3731 current->state = TASK_INTERRUPTIBLE;
4069 3732
4070 SLEEP_ON_HEAD 3733 sleep_on_head(q, &wait, &flags);
4071 schedule(); 3734 schedule();
4072 SLEEP_ON_TAIL 3735 sleep_on_tail(q, &wait, &flags);
4073} 3736}
4074EXPORT_SYMBOL(interruptible_sleep_on); 3737EXPORT_SYMBOL(interruptible_sleep_on);
4075 3738
4076long fastcall __sched 3739long __sched
4077interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3740interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4078{ 3741{
4079 SLEEP_ON_VAR 3742 unsigned long flags;
3743 wait_queue_t wait;
3744
3745 init_waitqueue_entry(&wait, current);
4080 3746
4081 current->state = TASK_INTERRUPTIBLE; 3747 current->state = TASK_INTERRUPTIBLE;
4082 3748
4083 SLEEP_ON_HEAD 3749 sleep_on_head(q, &wait, &flags);
4084 timeout = schedule_timeout(timeout); 3750 timeout = schedule_timeout(timeout);
4085 SLEEP_ON_TAIL 3751 sleep_on_tail(q, &wait, &flags);
4086 3752
4087 return timeout; 3753 return timeout;
4088} 3754}
4089EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3755EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4090 3756
4091void fastcall __sched sleep_on(wait_queue_head_t *q) 3757void __sched sleep_on(wait_queue_head_t *q)
4092{ 3758{
4093 SLEEP_ON_VAR 3759 unsigned long flags;
3760 wait_queue_t wait;
3761
3762 init_waitqueue_entry(&wait, current);
4094 3763
4095 current->state = TASK_UNINTERRUPTIBLE; 3764 current->state = TASK_UNINTERRUPTIBLE;
4096 3765
4097 SLEEP_ON_HEAD 3766 sleep_on_head(q, &wait, &flags);
4098 schedule(); 3767 schedule();
4099 SLEEP_ON_TAIL 3768 sleep_on_tail(q, &wait, &flags);
4100} 3769}
4101EXPORT_SYMBOL(sleep_on); 3770EXPORT_SYMBOL(sleep_on);
4102 3771
4103long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3772long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4104{ 3773{
4105 SLEEP_ON_VAR 3774 unsigned long flags;
3775 wait_queue_t wait;
3776
3777 init_waitqueue_entry(&wait, current);
4106 3778
4107 current->state = TASK_UNINTERRUPTIBLE; 3779 current->state = TASK_UNINTERRUPTIBLE;
4108 3780
4109 SLEEP_ON_HEAD 3781 sleep_on_head(q, &wait, &flags);
4110 timeout = schedule_timeout(timeout); 3782 timeout = schedule_timeout(timeout);
4111 SLEEP_ON_TAIL 3783 sleep_on_tail(q, &wait, &flags);
4112 3784
4113 return timeout; 3785 return timeout;
4114} 3786}
4115
4116EXPORT_SYMBOL(sleep_on_timeout); 3787EXPORT_SYMBOL(sleep_on_timeout);
4117 3788
4118#ifdef CONFIG_RT_MUTEXES 3789#ifdef CONFIG_RT_MUTEXES
@@ -4129,29 +3800,30 @@ EXPORT_SYMBOL(sleep_on_timeout);
4129 */ 3800 */
4130void rt_mutex_setprio(struct task_struct *p, int prio) 3801void rt_mutex_setprio(struct task_struct *p, int prio)
4131{ 3802{
4132 struct prio_array *array;
4133 unsigned long flags; 3803 unsigned long flags;
3804 int oldprio, on_rq;
4134 struct rq *rq; 3805 struct rq *rq;
4135 int oldprio; 3806 u64 now;
4136 3807
4137 BUG_ON(prio < 0 || prio > MAX_PRIO); 3808 BUG_ON(prio < 0 || prio > MAX_PRIO);
4138 3809
4139 rq = task_rq_lock(p, &flags); 3810 rq = task_rq_lock(p, &flags);
3811 now = rq_clock(rq);
4140 3812
4141 oldprio = p->prio; 3813 oldprio = p->prio;
4142 array = p->array; 3814 on_rq = p->se.on_rq;
4143 if (array) 3815 if (on_rq)
4144 dequeue_task(p, array); 3816 dequeue_task(rq, p, 0, now);
3817
3818 if (rt_prio(prio))
3819 p->sched_class = &rt_sched_class;
3820 else
3821 p->sched_class = &fair_sched_class;
3822
4145 p->prio = prio; 3823 p->prio = prio;
4146 3824
4147 if (array) { 3825 if (on_rq) {
4148 /* 3826 enqueue_task(rq, p, 0, now);
4149 * If changing to an RT priority then queue it
4150 * in the active array!
4151 */
4152 if (rt_task(p))
4153 array = rq->active;
4154 enqueue_task(p, array);
4155 /* 3827 /*
4156 * Reschedule if we are currently running on this runqueue and 3828 * Reschedule if we are currently running on this runqueue and
4157 * our priority decreased, or if we are not currently running on 3829 * our priority decreased, or if we are not currently running on
@@ -4160,8 +3832,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4160 if (task_running(rq, p)) { 3832 if (task_running(rq, p)) {
4161 if (p->prio > oldprio) 3833 if (p->prio > oldprio)
4162 resched_task(rq->curr); 3834 resched_task(rq->curr);
4163 } else if (TASK_PREEMPTS_CURR(p, rq)) 3835 } else {
4164 resched_task(rq->curr); 3836 check_preempt_curr(rq, p);
3837 }
4165 } 3838 }
4166 task_rq_unlock(rq, &flags); 3839 task_rq_unlock(rq, &flags);
4167} 3840}
@@ -4170,10 +3843,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4170 3843
4171void set_user_nice(struct task_struct *p, long nice) 3844void set_user_nice(struct task_struct *p, long nice)
4172{ 3845{
4173 struct prio_array *array; 3846 int old_prio, delta, on_rq;
4174 int old_prio, delta;
4175 unsigned long flags; 3847 unsigned long flags;
4176 struct rq *rq; 3848 struct rq *rq;
3849 u64 now;
4177 3850
4178 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3851 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4179 return; 3852 return;
@@ -4182,20 +3855,21 @@ void set_user_nice(struct task_struct *p, long nice)
4182 * the task might be in the middle of scheduling on another CPU. 3855 * the task might be in the middle of scheduling on another CPU.
4183 */ 3856 */
4184 rq = task_rq_lock(p, &flags); 3857 rq = task_rq_lock(p, &flags);
3858 now = rq_clock(rq);
4185 /* 3859 /*
4186 * The RT priorities are set via sched_setscheduler(), but we still 3860 * The RT priorities are set via sched_setscheduler(), but we still
4187 * allow the 'normal' nice value to be set - but as expected 3861 * allow the 'normal' nice value to be set - but as expected
4188 * it won't have any effect on scheduling until the task is 3862 * it won't have any effect on scheduling until the task is
4189 * not SCHED_NORMAL/SCHED_BATCH: 3863 * SCHED_FIFO/SCHED_RR:
4190 */ 3864 */
4191 if (has_rt_policy(p)) { 3865 if (task_has_rt_policy(p)) {
4192 p->static_prio = NICE_TO_PRIO(nice); 3866 p->static_prio = NICE_TO_PRIO(nice);
4193 goto out_unlock; 3867 goto out_unlock;
4194 } 3868 }
4195 array = p->array; 3869 on_rq = p->se.on_rq;
4196 if (array) { 3870 if (on_rq) {
4197 dequeue_task(p, array); 3871 dequeue_task(rq, p, 0, now);
4198 dec_raw_weighted_load(rq, p); 3872 dec_load(rq, p, now);
4199 } 3873 }
4200 3874
4201 p->static_prio = NICE_TO_PRIO(nice); 3875 p->static_prio = NICE_TO_PRIO(nice);
@@ -4204,9 +3878,9 @@ void set_user_nice(struct task_struct *p, long nice)
4204 p->prio = effective_prio(p); 3878 p->prio = effective_prio(p);
4205 delta = p->prio - old_prio; 3879 delta = p->prio - old_prio;
4206 3880
4207 if (array) { 3881 if (on_rq) {
4208 enqueue_task(p, array); 3882 enqueue_task(rq, p, 0, now);
4209 inc_raw_weighted_load(rq, p); 3883 inc_load(rq, p, now);
4210 /* 3884 /*
4211 * If the task increased its priority or is running and 3885 * If the task increased its priority or is running and
4212 * lowered its priority, then reschedule its CPU: 3886 * lowered its priority, then reschedule its CPU:
@@ -4326,20 +4000,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4326} 4000}
4327 4001
4328/* Actually do priority change: must hold rq lock. */ 4002/* Actually do priority change: must hold rq lock. */
4329static void __setscheduler(struct task_struct *p, int policy, int prio) 4003static void
4004__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4330{ 4005{
4331 BUG_ON(p->array); 4006 BUG_ON(p->se.on_rq);
4332 4007
4333 p->policy = policy; 4008 p->policy = policy;
4009 switch (p->policy) {
4010 case SCHED_NORMAL:
4011 case SCHED_BATCH:
4012 case SCHED_IDLE:
4013 p->sched_class = &fair_sched_class;
4014 break;
4015 case SCHED_FIFO:
4016 case SCHED_RR:
4017 p->sched_class = &rt_sched_class;
4018 break;
4019 }
4020
4334 p->rt_priority = prio; 4021 p->rt_priority = prio;
4335 p->normal_prio = normal_prio(p); 4022 p->normal_prio = normal_prio(p);
4336 /* we are holding p->pi_lock already */ 4023 /* we are holding p->pi_lock already */
4337 p->prio = rt_mutex_getprio(p); 4024 p->prio = rt_mutex_getprio(p);
4338 /*
4339 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4340 */
4341 if (policy == SCHED_BATCH)
4342 p->sleep_avg = 0;
4343 set_load_weight(p); 4025 set_load_weight(p);
4344} 4026}
4345 4027
@@ -4354,8 +4036,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4354int sched_setscheduler(struct task_struct *p, int policy, 4036int sched_setscheduler(struct task_struct *p, int policy,
4355 struct sched_param *param) 4037 struct sched_param *param)
4356{ 4038{
4357 int retval, oldprio, oldpolicy = -1; 4039 int retval, oldprio, oldpolicy = -1, on_rq;
4358 struct prio_array *array;
4359 unsigned long flags; 4040 unsigned long flags;
4360 struct rq *rq; 4041 struct rq *rq;
4361 4042
@@ -4366,27 +4047,27 @@ recheck:
4366 if (policy < 0) 4047 if (policy < 0)
4367 policy = oldpolicy = p->policy; 4048 policy = oldpolicy = p->policy;
4368 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4049 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4369 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4050 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4051 policy != SCHED_IDLE)
4370 return -EINVAL; 4052 return -EINVAL;
4371 /* 4053 /*
4372 * Valid priorities for SCHED_FIFO and SCHED_RR are 4054 * Valid priorities for SCHED_FIFO and SCHED_RR are
4373 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4055 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4374 * SCHED_BATCH is 0. 4056 * SCHED_BATCH and SCHED_IDLE is 0.
4375 */ 4057 */
4376 if (param->sched_priority < 0 || 4058 if (param->sched_priority < 0 ||
4377 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4059 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4378 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4060 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4379 return -EINVAL; 4061 return -EINVAL;
4380 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4062 if (rt_policy(policy) != (param->sched_priority != 0))
4381 return -EINVAL; 4063 return -EINVAL;
4382 4064
4383 /* 4065 /*
4384 * Allow unprivileged RT tasks to decrease priority: 4066 * Allow unprivileged RT tasks to decrease priority:
4385 */ 4067 */
4386 if (!capable(CAP_SYS_NICE)) { 4068 if (!capable(CAP_SYS_NICE)) {
4387 if (is_rt_policy(policy)) { 4069 if (rt_policy(policy)) {
4388 unsigned long rlim_rtprio; 4070 unsigned long rlim_rtprio;
4389 unsigned long flags;
4390 4071
4391 if (!lock_task_sighand(p, &flags)) 4072 if (!lock_task_sighand(p, &flags))
4392 return -ESRCH; 4073 return -ESRCH;
@@ -4402,6 +4083,12 @@ recheck:
4402 param->sched_priority > rlim_rtprio) 4083 param->sched_priority > rlim_rtprio)
4403 return -EPERM; 4084 return -EPERM;
4404 } 4085 }
4086 /*
4087 * Like positive nice levels, dont allow tasks to
4088 * move out of SCHED_IDLE either:
4089 */
4090 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4091 return -EPERM;
4405 4092
4406 /* can't change other user's priorities */ 4093 /* can't change other user's priorities */
4407 if ((current->euid != p->euid) && 4094 if ((current->euid != p->euid) &&
@@ -4429,13 +4116,13 @@ recheck:
4429 spin_unlock_irqrestore(&p->pi_lock, flags); 4116 spin_unlock_irqrestore(&p->pi_lock, flags);
4430 goto recheck; 4117 goto recheck;
4431 } 4118 }
4432 array = p->array; 4119 on_rq = p->se.on_rq;
4433 if (array) 4120 if (on_rq)
4434 deactivate_task(p, rq); 4121 deactivate_task(rq, p, 0);
4435 oldprio = p->prio; 4122 oldprio = p->prio;
4436 __setscheduler(p, policy, param->sched_priority); 4123 __setscheduler(rq, p, policy, param->sched_priority);
4437 if (array) { 4124 if (on_rq) {
4438 __activate_task(p, rq); 4125 activate_task(rq, p, 0);
4439 /* 4126 /*
4440 * Reschedule if we are currently running on this runqueue and 4127 * Reschedule if we are currently running on this runqueue and
4441 * our priority decreased, or if we are not currently running on 4128 * our priority decreased, or if we are not currently running on
@@ -4444,8 +4131,9 @@ recheck:
4444 if (task_running(rq, p)) { 4131 if (task_running(rq, p)) {
4445 if (p->prio > oldprio) 4132 if (p->prio > oldprio)
4446 resched_task(rq->curr); 4133 resched_task(rq->curr);
4447 } else if (TASK_PREEMPTS_CURR(p, rq)) 4134 } else {
4448 resched_task(rq->curr); 4135 check_preempt_curr(rq, p);
4136 }
4449 } 4137 }
4450 __task_rq_unlock(rq); 4138 __task_rq_unlock(rq);
4451 spin_unlock_irqrestore(&p->pi_lock, flags); 4139 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4717,41 +4405,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4717/** 4405/**
4718 * sys_sched_yield - yield the current processor to other threads. 4406 * sys_sched_yield - yield the current processor to other threads.
4719 * 4407 *
4720 * This function yields the current CPU by moving the calling thread 4408 * This function yields the current CPU to other tasks. If there are no
4721 * to the expired array. If there are no other threads running on this 4409 * other threads running on this CPU then this function will return.
4722 * CPU then this function will return.
4723 */ 4410 */
4724asmlinkage long sys_sched_yield(void) 4411asmlinkage long sys_sched_yield(void)
4725{ 4412{
4726 struct rq *rq = this_rq_lock(); 4413 struct rq *rq = this_rq_lock();
4727 struct prio_array *array = current->array, *target = rq->expired;
4728 4414
4729 schedstat_inc(rq, yld_cnt); 4415 schedstat_inc(rq, yld_cnt);
4730 /* 4416 if (unlikely(rq->nr_running == 1))
4731 * We implement yielding by moving the task into the expired
4732 * queue.
4733 *
4734 * (special rule: RT tasks will just roundrobin in the active
4735 * array.)
4736 */
4737 if (rt_task(current))
4738 target = rq->active;
4739
4740 if (array->nr_active == 1) {
4741 schedstat_inc(rq, yld_act_empty); 4417 schedstat_inc(rq, yld_act_empty);
4742 if (!rq->expired->nr_active) 4418 else
4743 schedstat_inc(rq, yld_both_empty); 4419 current->sched_class->yield_task(rq, current);
4744 } else if (!rq->expired->nr_active)
4745 schedstat_inc(rq, yld_exp_empty);
4746
4747 if (array != target) {
4748 dequeue_task(current, array);
4749 enqueue_task(current, target);
4750 } else
4751 /*
4752 * requeue_task is cheaper so perform that if possible.
4753 */
4754 requeue_task(current, array);
4755 4420
4756 /* 4421 /*
4757 * Since we are going to call schedule() anyway, there's 4422 * Since we are going to call schedule() anyway, there's
@@ -4902,6 +4567,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4902 break; 4567 break;
4903 case SCHED_NORMAL: 4568 case SCHED_NORMAL:
4904 case SCHED_BATCH: 4569 case SCHED_BATCH:
4570 case SCHED_IDLE:
4905 ret = 0; 4571 ret = 0;
4906 break; 4572 break;
4907 } 4573 }
@@ -4926,6 +4592,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4926 break; 4592 break;
4927 case SCHED_NORMAL: 4593 case SCHED_NORMAL:
4928 case SCHED_BATCH: 4594 case SCHED_BATCH:
4595 case SCHED_IDLE:
4929 ret = 0; 4596 ret = 0;
4930 } 4597 }
4931 return ret; 4598 return ret;
@@ -4960,7 +4627,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4960 goto out_unlock; 4627 goto out_unlock;
4961 4628
4962 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4629 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4963 0 : task_timeslice(p), &t); 4630 0 : static_prio_timeslice(p->static_prio), &t);
4964 read_unlock(&tasklist_lock); 4631 read_unlock(&tasklist_lock);
4965 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4632 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4966out_nounlock: 4633out_nounlock:
@@ -5035,6 +4702,9 @@ void show_state_filter(unsigned long state_filter)
5035 4702
5036 touch_all_softlockup_watchdogs(); 4703 touch_all_softlockup_watchdogs();
5037 4704
4705#ifdef CONFIG_SCHED_DEBUG
4706 sysrq_sched_debug_show();
4707#endif
5038 read_unlock(&tasklist_lock); 4708 read_unlock(&tasklist_lock);
5039 /* 4709 /*
5040 * Only show locks if all tasks are dumped: 4710 * Only show locks if all tasks are dumped:
@@ -5043,6 +4713,11 @@ void show_state_filter(unsigned long state_filter)
5043 debug_show_all_locks(); 4713 debug_show_all_locks();
5044} 4714}
5045 4715
4716void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4717{
4718 idle->sched_class = &idle_sched_class;
4719}
4720
5046/** 4721/**
5047 * init_idle - set up an idle thread for a given CPU 4722 * init_idle - set up an idle thread for a given CPU
5048 * @idle: task in question 4723 * @idle: task in question
@@ -5056,13 +4731,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5056 struct rq *rq = cpu_rq(cpu); 4731 struct rq *rq = cpu_rq(cpu);
5057 unsigned long flags; 4732 unsigned long flags;
5058 4733
5059 idle->timestamp = sched_clock(); 4734 __sched_fork(idle);
5060 idle->sleep_avg = 0; 4735 idle->se.exec_start = sched_clock();
5061 idle->array = NULL; 4736
5062 idle->prio = idle->normal_prio = MAX_PRIO; 4737 idle->prio = idle->normal_prio = MAX_PRIO;
5063 idle->state = TASK_RUNNING;
5064 idle->cpus_allowed = cpumask_of_cpu(cpu); 4738 idle->cpus_allowed = cpumask_of_cpu(cpu);
5065 set_task_cpu(idle, cpu); 4739 __set_task_cpu(idle, cpu);
5066 4740
5067 spin_lock_irqsave(&rq->lock, flags); 4741 spin_lock_irqsave(&rq->lock, flags);
5068 rq->curr = rq->idle = idle; 4742 rq->curr = rq->idle = idle;
@@ -5077,6 +4751,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5077#else 4751#else
5078 task_thread_info(idle)->preempt_count = 0; 4752 task_thread_info(idle)->preempt_count = 0;
5079#endif 4753#endif
4754 /*
4755 * The idle tasks have their own, simple scheduling class:
4756 */
4757 idle->sched_class = &idle_sched_class;
5080} 4758}
5081 4759
5082/* 4760/*
@@ -5088,6 +4766,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5088 */ 4766 */
5089cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4767cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5090 4768
4769/*
4770 * Increase the granularity value when there are more CPUs,
4771 * because with more CPUs the 'effective latency' as visible
4772 * to users decreases. But the relationship is not linear,
4773 * so pick a second-best guess by going with the log2 of the
4774 * number of CPUs.
4775 *
4776 * This idea comes from the SD scheduler of Con Kolivas:
4777 */
4778static inline void sched_init_granularity(void)
4779{
4780 unsigned int factor = 1 + ilog2(num_online_cpus());
4781 const unsigned long gran_limit = 10000000;
4782
4783 sysctl_sched_granularity *= factor;
4784 if (sysctl_sched_granularity > gran_limit)
4785 sysctl_sched_granularity = gran_limit;
4786
4787 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4788 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4789}
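As a worked example of sched_init_granularity(): the factor is 1 + ilog2(number of online CPUs), so one CPU keeps the base value, 8 CPUs quadruple it, and 64 CPUs would multiply it by 7 before hitting the 10 ms clamp. The 2 ms base granularity used below is an assumption for illustration (the real default is set with the CFS sysctls elsewhere in this patch); only the scaling and clamping mirror the function above.

#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long gran_limit = 10000000;	/* 10 ms cap */
	const unsigned long base_gran = 2000000;	/* assumed 2 ms base */
	unsigned int cpus[] = { 1, 2, 8, 64 };
	unsigned int i;

	for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned int factor = 1 + ilog2_u(cpus[i]);
		unsigned long gran = base_gran * factor;

		if (gran > gran_limit)
			gran = gran_limit;
		printf("%2u CPUs: factor %u, granularity %lu ns, "
		       "runtime_limit %lu ns, wakeup %lu ns\n",
		       cpus[i], factor, gran, gran * 4, gran / 2);
	}
	return 0;
}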
4790
5091#ifdef CONFIG_SMP 4791#ifdef CONFIG_SMP
5092/* 4792/*
5093 * This is how migration works: 4793 * This is how migration works:
@@ -5161,7 +4861,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
5161static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4861static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5162{ 4862{
5163 struct rq *rq_dest, *rq_src; 4863 struct rq *rq_dest, *rq_src;
5164 int ret = 0; 4864 int ret = 0, on_rq;
5165 4865
5166 if (unlikely(cpu_is_offline(dest_cpu))) 4866 if (unlikely(cpu_is_offline(dest_cpu)))
5167 return ret; 4867 return ret;
@@ -5177,20 +4877,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5177 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4877 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5178 goto out; 4878 goto out;
5179 4879
4880 on_rq = p->se.on_rq;
4881 if (on_rq)
4882 deactivate_task(rq_src, p, 0);
5180 set_task_cpu(p, dest_cpu); 4883 set_task_cpu(p, dest_cpu);
5181 if (p->array) { 4884 if (on_rq) {
5182 /* 4885 activate_task(rq_dest, p, 0);
5183 * Sync timestamp with rq_dest's before activating. 4886 check_preempt_curr(rq_dest, p);
5184 * The same thing could be achieved by doing this step
5185 * afterwards, and pretending it was a local activate.
5186 * This way is cleaner and logically correct.
5187 */
5188 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5189 + rq_dest->most_recent_timestamp;
5190 deactivate_task(p, rq_src);
5191 __activate_task(p, rq_dest);
5192 if (TASK_PREEMPTS_CURR(p, rq_dest))
5193 resched_task(rq_dest->curr);
5194 } 4887 }
5195 ret = 1; 4888 ret = 1;
5196out: 4889out:
@@ -5342,7 +5035,8 @@ static void migrate_live_tasks(int src_cpu)
5342 write_unlock_irq(&tasklist_lock); 5035 write_unlock_irq(&tasklist_lock);
5343} 5036}
5344 5037
5345/* Schedules idle task to be the next runnable task on current CPU. 5038/*
5039 * Schedules idle task to be the next runnable task on current CPU.
5346 * It does so by boosting its priority to highest possible and adding it to 5040 * It does so by boosting its priority to highest possible and adding it to
5347 * the _front_ of the runqueue. Used by CPU offline code. 5041 * the _front_ of the runqueue. Used by CPU offline code.
5348 */ 5042 */
@@ -5362,10 +5056,10 @@ void sched_idle_next(void)
5362 */ 5056 */
5363 spin_lock_irqsave(&rq->lock, flags); 5057 spin_lock_irqsave(&rq->lock, flags);
5364 5058
5365 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5059 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5366 5060
5367 /* Add idle task to the _front_ of its priority queue: */ 5061 /* Add idle task to the _front_ of its priority queue: */
5368 __activate_idle_task(p, rq); 5062 activate_idle_task(p, rq);
5369 5063
5370 spin_unlock_irqrestore(&rq->lock, flags); 5064 spin_unlock_irqrestore(&rq->lock, flags);
5371} 5065}
@@ -5415,16 +5109,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5415static void migrate_dead_tasks(unsigned int dead_cpu) 5109static void migrate_dead_tasks(unsigned int dead_cpu)
5416{ 5110{
5417 struct rq *rq = cpu_rq(dead_cpu); 5111 struct rq *rq = cpu_rq(dead_cpu);
5418 unsigned int arr, i; 5112 struct task_struct *next;
5419 5113
5420 for (arr = 0; arr < 2; arr++) { 5114 for ( ; ; ) {
5421 for (i = 0; i < MAX_PRIO; i++) { 5115 if (!rq->nr_running)
5422 struct list_head *list = &rq->arrays[arr].queue[i]; 5116 break;
5423 5117 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5424 while (!list_empty(list)) 5118 if (!next)
5425 migrate_dead(dead_cpu, list_entry(list->next, 5119 break;
5426 struct task_struct, run_list)); 5120 migrate_dead(dead_cpu, next);
5427 }
5428 } 5121 }
5429} 5122}
5430#endif /* CONFIG_HOTPLUG_CPU */ 5123#endif /* CONFIG_HOTPLUG_CPU */
@@ -5448,14 +5141,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5448 5141
5449 case CPU_UP_PREPARE: 5142 case CPU_UP_PREPARE:
5450 case CPU_UP_PREPARE_FROZEN: 5143 case CPU_UP_PREPARE_FROZEN:
5451 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5144 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5452 if (IS_ERR(p)) 5145 if (IS_ERR(p))
5453 return NOTIFY_BAD; 5146 return NOTIFY_BAD;
5454 p->flags |= PF_NOFREEZE; 5147 p->flags |= PF_NOFREEZE;
5455 kthread_bind(p, cpu); 5148 kthread_bind(p, cpu);
5456 /* Must be high prio: stop_machine expects to yield to it. */ 5149 /* Must be high prio: stop_machine expects to yield to it. */
5457 rq = task_rq_lock(p, &flags); 5150 rq = task_rq_lock(p, &flags);
5458 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5151 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5459 task_rq_unlock(rq, &flags); 5152 task_rq_unlock(rq, &flags);
5460 cpu_rq(cpu)->migration_thread = p; 5153 cpu_rq(cpu)->migration_thread = p;
5461 break; 5154 break;
@@ -5486,9 +5179,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5486 rq->migration_thread = NULL; 5179 rq->migration_thread = NULL;
5487 /* Idle task back to normal (off runqueue, low prio) */ 5180 /* Idle task back to normal (off runqueue, low prio) */
5488 rq = task_rq_lock(rq->idle, &flags); 5181 rq = task_rq_lock(rq->idle, &flags);
5489 deactivate_task(rq->idle, rq); 5182 deactivate_task(rq, rq->idle, 0);
5490 rq->idle->static_prio = MAX_PRIO; 5183 rq->idle->static_prio = MAX_PRIO;
5491 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5184 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5185 rq->idle->sched_class = &idle_sched_class;
5492 migrate_dead_tasks(cpu); 5186 migrate_dead_tasks(cpu);
5493 task_rq_unlock(rq, &flags); 5187 task_rq_unlock(rq, &flags);
5494 migrate_nr_uninterruptible(rq); 5188 migrate_nr_uninterruptible(rq);
@@ -5797,483 +5491,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5797 5491
5798#define SD_NODES_PER_DOMAIN 16 5492#define SD_NODES_PER_DOMAIN 16
5799 5493
5800/*
5801 * Self-tuning task migration cost measurement between source and target CPUs.
5802 *
5803 * This is done by measuring the cost of manipulating buffers of varying
5804 * sizes. For a given buffer-size here are the steps that are taken:
5805 *
5806 * 1) the source CPU reads+dirties a shared buffer
5807 * 2) the target CPU reads+dirties the same shared buffer
5808 *
5809 * We measure how long they take, in the following 4 scenarios:
5810 *
5811 * - source: CPU1, target: CPU2 | cost1
5812 * - source: CPU2, target: CPU1 | cost2
5813 * - source: CPU1, target: CPU1 | cost3
5814 * - source: CPU2, target: CPU2 | cost4
5815 *
5816 * We then calculate the (cost1+cost2)-(cost3+cost4) difference - this is
5817 * the cost of migration.
5818 *
5819 * We then start off from a small buffer-size and iterate up to larger
5820 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5821 * doing a maximum search for the cost. (The maximum cost for a migration
5822 * normally occurs when the working set size is around the effective cache
5823 * size.)
5824 */
5825#define SEARCH_SCOPE 2
5826#define MIN_CACHE_SIZE (64*1024U)
5827#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5828#define ITERATIONS 1
5829#define SIZE_THRESH 130
5830#define COST_THRESH 130
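A toy calculation of the differencing described above, with invented nanosecond timings: the two cross-CPU passes pay for refilling the cache, the two same-CPU passes run warm, and the surplus is attributed to migration. In the real code measure_one() supplies the timings, and measure_cost() additionally averages over ITERATIONS and perturbed buffer sizes.

#include <stdio.h>

int main(void)
{
	unsigned long long cost1 = 900000;	/* CPU1 -> CPU2 (cold)  */
	unsigned long long cost2 = 880000;	/* CPU2 -> CPU1 (cold)  */
	unsigned long long cost3 = 300000;	/* CPU1 -> CPU1 (warm)  */
	unsigned long long cost4 = 320000;	/* CPU2 -> CPU2 (warm)  */
	unsigned long long migration_cost;

	/* Cross-CPU cost minus the warm same-CPU baseline. */
	migration_cost = (cost1 + cost2) - (cost3 + cost4);
	printf("estimated migration cost: %llu ns\n", migration_cost);
	return 0;
}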
5831
5832/*
5833 * The migration cost is a function of 'domain distance'. Domain
5834 * distance is the number of steps a CPU has to iterate down its
5835 * domain tree to share a domain with the other CPU. The farther
5836 * two CPUs are from each other, the larger the distance gets.
5837 *
5838 * Note that we use the distance only to cache measurement results,
5839 * the distance value is not used numerically otherwise. When two
5840 * CPUs have the same distance it is assumed that the migration
5841 * cost is the same. (this is a simplification but quite practical)
5842 */
5843#define MAX_DOMAIN_DISTANCE 32
5844
5845static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5846 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5847/*
5848 * Architectures may override the migration cost and thus avoid
5849 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5850 * virtualized hardware:
5851 */
5852#ifdef CONFIG_DEFAULT_MIGRATION_COST
5853 CONFIG_DEFAULT_MIGRATION_COST
5854#else
5855 -1LL
5856#endif
5857};
5858
5859/*
5860 * Allow override of migration cost - in units of microseconds.
5861 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5862 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5863 */
5864static int __init migration_cost_setup(char *str)
5865{
5866 int ints[MAX_DOMAIN_DISTANCE+1], i;
5867
5868 str = get_options(str, ARRAY_SIZE(ints), ints);
5869
5870 printk("#ints: %d\n", ints[0]);
5871 for (i = 1; i <= ints[0]; i++) {
5872 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5873 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5874 }
5875 return 1;
5876}
5877
5878__setup ("migration_cost=", migration_cost_setup);
5879
5880/*
5881 * Global multiplier (divisor) for migration-cutoff values,
5882 * in percentiles. E.g. use a value of 150 to get 1.5 times
5883 * longer cache-hot cutoff times.
5884 *
5885 * (We scale it from 100 to 128 to make long long handling easier.)
5886 */
5887
5888#define MIGRATION_FACTOR_SCALE 128
5889
5890static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5891
5892static int __init setup_migration_factor(char *str)
5893{
5894 get_option(&str, &migration_factor);
5895 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5896 return 1;
5897}
5898
5899__setup("migration_factor=", setup_migration_factor);
5900
5901/*
5902 * Estimated distance of two CPUs, measured via the number of domains
5903 * we have to pass for the two CPUs to be in the same span:
5904 */
5905static unsigned long domain_distance(int cpu1, int cpu2)
5906{
5907 unsigned long distance = 0;
5908 struct sched_domain *sd;
5909
5910 for_each_domain(cpu1, sd) {
5911 WARN_ON(!cpu_isset(cpu1, sd->span));
5912 if (cpu_isset(cpu2, sd->span))
5913 return distance;
5914 distance++;
5915 }
5916 if (distance >= MAX_DOMAIN_DISTANCE) {
5917 WARN_ON(1);
5918 distance = MAX_DOMAIN_DISTANCE-1;
5919 }
5920
5921 return distance;
5922}
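For a feel of what domain_distance() returns: on a hypothetical two-package, four-CPU box, a CPU in the same package is already covered by the lowest domain, giving distance 0, while a CPU in the other package is only covered by the top-level span and gets distance 1. The sketch below models the domains as plain bitmask spans; the real hierarchy is the per-CPU sched_domain list that build_sched_domains() constructs.

#include <stdio.h>

#define NLEVELS 2

/* level 0: own package, level 1: whole machine (toy topology). */
static const unsigned long span[4][NLEVELS] = {
	[0] = { 0x3, 0xf },	/* CPU0: package {0,1} */
	[1] = { 0x3, 0xf },
	[2] = { 0xc, 0xf },	/* CPU2: package {2,3} */
	[3] = { 0xc, 0xf },
};

static unsigned long toy_domain_distance(int cpu1, int cpu2)
{
	unsigned long distance;

	for (distance = 0; distance < NLEVELS; distance++)
		if (span[cpu1][distance] & (1UL << cpu2))
			break;
	return distance;
}

int main(void)
{
	/* Prints d(0,1)=0 (same package) and d(0,2)=1 (other package). */
	printf("d(0,1)=%lu d(0,2)=%lu\n",
	       toy_domain_distance(0, 1), toy_domain_distance(0, 2));
	return 0;
}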
5923
5924static unsigned int migration_debug;
5925
5926static int __init setup_migration_debug(char *str)
5927{
5928 get_option(&str, &migration_debug);
5929 return 1;
5930}
5931
5932__setup("migration_debug=", setup_migration_debug);
5933
5934/*
5935 * Maximum cache-size that the scheduler should try to measure.
5936 * Architectures with larger caches should tune this up during
5937 * bootup. Gets used in the domain-setup code (i.e. during SMP
5938 * bootup).
5939 */
5940unsigned int max_cache_size;
5941
5942static int __init setup_max_cache_size(char *str)
5943{
5944 get_option(&str, &max_cache_size);
5945 return 1;
5946}
5947
5948__setup("max_cache_size=", setup_max_cache_size);
5949
5950/*
5951 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5952 * is the operation that is timed, so we try to generate unpredictable
5953 * cachemisses that still end up filling the L2 cache:
5954 */
5955static void touch_cache(void *__cache, unsigned long __size)
5956{
5957 unsigned long size = __size / sizeof(long);
5958 unsigned long chunk1 = size / 3;
5959 unsigned long chunk2 = 2 * size / 3;
5960 unsigned long *cache = __cache;
5961 int i;
5962
5963 for (i = 0; i < size/6; i += 8) {
5964 switch (i % 6) {
5965 case 0: cache[i]++;
5966 case 1: cache[size-1-i]++;
5967 case 2: cache[chunk1-i]++;
5968 case 3: cache[chunk1+i]++;
5969 case 4: cache[chunk2-i]++;
5970 case 5: cache[chunk2+i]++;
5971 }
5972 }
5973}
5974
5975/*
5976 * Measure the cache-cost of one task migration. Returns in units of nsec.
5977 */
5978static unsigned long long
5979measure_one(void *cache, unsigned long size, int source, int target)
5980{
5981 cpumask_t mask, saved_mask;
5982 unsigned long long t0, t1, t2, t3, cost;
5983
5984 saved_mask = current->cpus_allowed;
5985
5986 /*
5987 * Flush source caches to RAM and invalidate them:
5988 */
5989 sched_cacheflush();
5990
5991 /*
5992 * Migrate to the source CPU:
5993 */
5994 mask = cpumask_of_cpu(source);
5995 set_cpus_allowed(current, mask);
5996 WARN_ON(smp_processor_id() != source);
5997
5998 /*
5999 * Dirty the working set:
6000 */
6001 t0 = sched_clock();
6002 touch_cache(cache, size);
6003 t1 = sched_clock();
6004
6005 /*
6006 * Migrate to the target CPU, dirty the L2 cache and access
6007 * the shared buffer. (which represents the working set
6008 * of a migrated task.)
6009 */
6010 mask = cpumask_of_cpu(target);
6011 set_cpus_allowed(current, mask);
6012 WARN_ON(smp_processor_id() != target);
6013
6014 t2 = sched_clock();
6015 touch_cache(cache, size);
6016 t3 = sched_clock();
6017
6018 cost = t1-t0 + t3-t2;
6019
6020 if (migration_debug >= 2)
6021 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
6022 source, target, t1-t0, t1-t0, t3-t2, cost);
6023 /*
6024 * Flush target caches to RAM and invalidate them:
6025 */
6026 sched_cacheflush();
6027
6028 set_cpus_allowed(current, saved_mask);
6029
6030 return cost;
6031}
6032
6033/*
6034 * Measure a series of task migrations and return the average
6035 * result. Since this code runs early during bootup the system
6036 * is 'undisturbed' and the average latency makes sense.
6037 *
6038 * The algorithm in essence auto-detects the relevant cache-size,
6039 * so it will properly detect different cachesizes for different
6040 * cache-hierarchies, depending on how the CPUs are connected.
6041 *
6042 * Architectures can prime the upper limit of the search range via
6043 * max_cache_size, otherwise the search range defaults to 20MB...64K.
6044 */
6045static unsigned long long
6046measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
6047{
6048 unsigned long long cost1, cost2;
6049 int i;
6050
6051 /*
6052 * Measure the migration cost of 'size' bytes, over an
6053 * average of 10 runs:
6054 *
6055 * (We perturb the cache size by a small (0..4k)
6056 * value to compensate size/alignment related artifacts.
6057 * We also subtract the cost of the operation done on
6058 * the same CPU.)
6059 */
6060 cost1 = 0;
6061
6062 /*
6063 * dry run, to make sure we start off cache-cold on cpu1,
6064 * and to get any vmalloc pagefaults in advance:
6065 */
6066 measure_one(cache, size, cpu1, cpu2);
6067 for (i = 0; i < ITERATIONS; i++)
6068 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6069
6070 measure_one(cache, size, cpu2, cpu1);
6071 for (i = 0; i < ITERATIONS; i++)
6072 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6073
6074 /*
6075 * (We measure the non-migrating [cached] cost on both
6076 * cpu1 and cpu2, to handle CPUs with different speeds)
6077 */
6078 cost2 = 0;
6079
6080 measure_one(cache, size, cpu1, cpu1);
6081 for (i = 0; i < ITERATIONS; i++)
6082 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6083
6084 measure_one(cache, size, cpu2, cpu2);
6085 for (i = 0; i < ITERATIONS; i++)
6086 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6087
6088 /*
6089 * Get the per-iteration migration cost:
6090 */
6091 do_div(cost1, 2 * ITERATIONS);
6092 do_div(cost2, 2 * ITERATIONS);
6093
6094 return cost1 - cost2;
6095}
6096
6097static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6098{
6099 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6100 unsigned int max_size, size, size_found = 0;
6101 long long cost = 0, prev_cost;
6102 void *cache;
6103
6104 /*
6105 * Search from max_cache_size*5 down to 64K - the real relevant
6106 * cachesize has to lie somewhere in between.
6107 */
6108 if (max_cache_size) {
6109 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6110 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6111 } else {
6112 /*
6113 * Since we have no estimation about the relevant
6114 * search range
6115 */
6116 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6117 size = MIN_CACHE_SIZE;
6118 }
6119
6120 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
6121 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6122 return 0;
6123 }
6124
6125 /*
6126 * Allocate the working set:
6127 */
6128 cache = vmalloc(max_size);
6129 if (!cache) {
6130 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
6131 return 1000000; /* return 1 msec on very small boxen */
6132 }
6133
6134 while (size <= max_size) {
6135 prev_cost = cost;
6136 cost = measure_cost(cpu1, cpu2, cache, size);
6137
6138 /*
6139 * Update the max:
6140 */
6141 if (cost > 0) {
6142 if (max_cost < cost) {
6143 max_cost = cost;
6144 size_found = size;
6145 }
6146 }
6147 /*
6148 * Calculate average fluctuation, we use this to prevent
6149 * noise from triggering an early break out of the loop:
6150 */
6151 fluct = abs(cost - prev_cost);
6152 avg_fluct = (avg_fluct + fluct)/2;
6153
6154 if (migration_debug)
6155 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6156 "(%8Ld %8Ld)\n",
6157 cpu1, cpu2, size,
6158 (long)cost / 1000000,
6159 ((long)cost / 100000) % 10,
6160 (long)max_cost / 1000000,
6161 ((long)max_cost / 100000) % 10,
6162 domain_distance(cpu1, cpu2),
6163 cost, avg_fluct);
6164
6165 /*
6166 * If we iterated at least 20% past the previous maximum,
6167 * and the cost has dropped by more than 20% already,
6168 * (taking fluctuations into account) then we assume to
6169 * have found the maximum and break out of the loop early:
6170 */
6171 if (size_found && (size*100 > size_found*SIZE_THRESH))
6172 if (cost+avg_fluct <= 0 ||
6173 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
6174
6175 if (migration_debug)
6176 printk("-> found max.\n");
6177 break;
6178 }
6179 /*
6180 * Increase the cachesize in 10% steps:
6181 */
6182 size = size * 10 / 9;
6183 }
6184
6185 if (migration_debug)
6186 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6187 cpu1, cpu2, size_found, max_cost);
6188
6189 vfree(cache);
6190
6191 /*
6192 * A task is considered 'cache cold' if at least 2 times
6193 * the worst-case cost of migration has passed.
6194 *
6195 * (this limit is only listened to if the load-balancing
6196 * situation is 'nice' - if there is a large imbalance we
6197 * ignore it for the sake of CPU utilization and
6198 * processing fairness.)
6199 */
6200 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6201}
6202
6203static void calibrate_migration_costs(const cpumask_t *cpu_map)
6204{
6205 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6206 unsigned long j0, j1, distance, max_distance = 0;
6207 struct sched_domain *sd;
6208
6209 j0 = jiffies;
6210
6211 /*
6212 * First pass - calculate the cacheflush times:
6213 */
6214 for_each_cpu_mask(cpu1, *cpu_map) {
6215 for_each_cpu_mask(cpu2, *cpu_map) {
6216 if (cpu1 == cpu2)
6217 continue;
6218 distance = domain_distance(cpu1, cpu2);
6219 max_distance = max(max_distance, distance);
6220 /*
6221 * No result cached yet?
6222 */
6223 if (migration_cost[distance] == -1LL)
6224 migration_cost[distance] =
6225 measure_migration_cost(cpu1, cpu2);
6226 }
6227 }
6228 /*
6229 * Second pass - update the sched domain hierarchy with
6230 * the new cache-hot-time estimations:
6231 */
6232 for_each_cpu_mask(cpu, *cpu_map) {
6233 distance = 0;
6234 for_each_domain(cpu, sd) {
6235 sd->cache_hot_time = migration_cost[distance];
6236 distance++;
6237 }
6238 }
6239 /*
6240 * Print the matrix:
6241 */
6242 if (migration_debug)
6243 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6244 max_cache_size,
6245#ifdef CONFIG_X86
6246 cpu_khz/1000
6247#else
6248 -1
6249#endif
6250 );
6251 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6252 printk("migration_cost=");
6253 for (distance = 0; distance <= max_distance; distance++) {
6254 if (distance)
6255 printk(",");
6256 printk("%ld", (long)migration_cost[distance] / 1000);
6257 }
6258 printk("\n");
6259 }
6260 j1 = jiffies;
6261 if (migration_debug)
6262 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6263
6264 /*
6265 * Move back to the original CPU. NUMA-Q gets confused
6266 * if we migrate to another quad during bootup.
6267 */
6268 if (raw_smp_processor_id() != orig_cpu) {
6269 cpumask_t mask = cpumask_of_cpu(orig_cpu),
6270 saved_mask = current->cpus_allowed;
6271
6272 set_cpus_allowed(current, mask);
6273 set_cpus_allowed(current, saved_mask);
6274 }
6275}
6276
6277#ifdef CONFIG_NUMA 5494#ifdef CONFIG_NUMA
6278 5495
6279/** 5496/**
@@ -6574,7 +5791,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6574static int build_sched_domains(const cpumask_t *cpu_map) 5791static int build_sched_domains(const cpumask_t *cpu_map)
6575{ 5792{
6576 int i; 5793 int i;
6577 struct sched_domain *sd;
6578#ifdef CONFIG_NUMA 5794#ifdef CONFIG_NUMA
6579 struct sched_group **sched_group_nodes = NULL; 5795 struct sched_group **sched_group_nodes = NULL;
6580 int sd_allnodes = 0; 5796 int sd_allnodes = 0;
@@ -6582,7 +5798,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6582 /* 5798 /*
6583 * Allocate the per-node list of sched groups 5799 * Allocate the per-node list of sched groups
6584 */ 5800 */
6585 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5801 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6586 GFP_KERNEL); 5802 GFP_KERNEL);
6587 if (!sched_group_nodes) { 5803 if (!sched_group_nodes) {
6588 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5804 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6601,8 +5817,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6601 cpus_and(nodemask, nodemask, *cpu_map); 5817 cpus_and(nodemask, nodemask, *cpu_map);
6602 5818
6603#ifdef CONFIG_NUMA 5819#ifdef CONFIG_NUMA
6604 if (cpus_weight(*cpu_map) 5820 if (cpus_weight(*cpu_map) >
6605 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5821 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6606 sd = &per_cpu(allnodes_domains, i); 5822 sd = &per_cpu(allnodes_domains, i);
6607 *sd = SD_ALLNODES_INIT; 5823 *sd = SD_ALLNODES_INIT;
6608 sd->span = *cpu_map; 5824 sd->span = *cpu_map;
@@ -6661,7 +5877,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6661 if (i != first_cpu(this_sibling_map)) 5877 if (i != first_cpu(this_sibling_map))
6662 continue; 5878 continue;
6663 5879
6664 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 5880 init_sched_build_groups(this_sibling_map, cpu_map,
5881 &cpu_to_cpu_group);
6665 } 5882 }
6666#endif 5883#endif
6667 5884
@@ -6672,11 +5889,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6672 cpus_and(this_core_map, this_core_map, *cpu_map); 5889 cpus_and(this_core_map, this_core_map, *cpu_map);
6673 if (i != first_cpu(this_core_map)) 5890 if (i != first_cpu(this_core_map))
6674 continue; 5891 continue;
6675 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 5892 init_sched_build_groups(this_core_map, cpu_map,
5893 &cpu_to_core_group);
6676 } 5894 }
6677#endif 5895#endif
6678 5896
6679
6680 /* Set up physical groups */ 5897 /* Set up physical groups */
6681 for (i = 0; i < MAX_NUMNODES; i++) { 5898 for (i = 0; i < MAX_NUMNODES; i++) {
6682 cpumask_t nodemask = node_to_cpumask(i); 5899 cpumask_t nodemask = node_to_cpumask(i);
@@ -6691,7 +5908,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6691#ifdef CONFIG_NUMA 5908#ifdef CONFIG_NUMA
6692 /* Set up node groups */ 5909 /* Set up node groups */
6693 if (sd_allnodes) 5910 if (sd_allnodes)
6694 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 5911 init_sched_build_groups(*cpu_map, cpu_map,
5912 &cpu_to_allnodes_group);
6695 5913
6696 for (i = 0; i < MAX_NUMNODES; i++) { 5914 for (i = 0; i < MAX_NUMNODES; i++) {
6697 /* Set up node groups */ 5915 /* Set up node groups */
@@ -6719,6 +5937,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6719 sched_group_nodes[i] = sg; 5937 sched_group_nodes[i] = sg;
6720 for_each_cpu_mask(j, nodemask) { 5938 for_each_cpu_mask(j, nodemask) {
6721 struct sched_domain *sd; 5939 struct sched_domain *sd;
5940
6722 sd = &per_cpu(node_domains, j); 5941 sd = &per_cpu(node_domains, j);
6723 sd->groups = sg; 5942 sd->groups = sg;
6724 } 5943 }
@@ -6763,19 +5982,22 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6763 /* Calculate CPU power for physical packages and nodes */ 5982 /* Calculate CPU power for physical packages and nodes */
6764#ifdef CONFIG_SCHED_SMT 5983#ifdef CONFIG_SCHED_SMT
6765 for_each_cpu_mask(i, *cpu_map) { 5984 for_each_cpu_mask(i, *cpu_map) {
6766 sd = &per_cpu(cpu_domains, i); 5985 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5986
6767 init_sched_groups_power(i, sd); 5987 init_sched_groups_power(i, sd);
6768 } 5988 }
6769#endif 5989#endif
6770#ifdef CONFIG_SCHED_MC 5990#ifdef CONFIG_SCHED_MC
6771 for_each_cpu_mask(i, *cpu_map) { 5991 for_each_cpu_mask(i, *cpu_map) {
6772 sd = &per_cpu(core_domains, i); 5992 struct sched_domain *sd = &per_cpu(core_domains, i);
5993
6773 init_sched_groups_power(i, sd); 5994 init_sched_groups_power(i, sd);
6774 } 5995 }
6775#endif 5996#endif
6776 5997
6777 for_each_cpu_mask(i, *cpu_map) { 5998 for_each_cpu_mask(i, *cpu_map) {
6778 sd = &per_cpu(phys_domains, i); 5999 struct sched_domain *sd = &per_cpu(phys_domains, i);
6000
6779 init_sched_groups_power(i, sd); 6001 init_sched_groups_power(i, sd);
6780 } 6002 }
6781 6003
@@ -6803,10 +6025,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6803#endif 6025#endif
6804 cpu_attach_domain(sd, i); 6026 cpu_attach_domain(sd, i);
6805 } 6027 }
6806 /*
6807 * Tune cache-hot values:
6808 */
6809 calibrate_migration_costs(cpu_map);
6810 6028
6811 return 0; 6029 return 0;
6812 6030
@@ -7013,10 +6231,12 @@ void __init sched_init_smp(void)
7013 /* Move init over to a non-isolated CPU */ 6231 /* Move init over to a non-isolated CPU */
7014 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6232 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7015 BUG(); 6233 BUG();
6234 sched_init_granularity();
7016} 6235}
7017#else 6236#else
7018void __init sched_init_smp(void) 6237void __init sched_init_smp(void)
7019{ 6238{
6239 sched_init_granularity();
7020} 6240}
7021#endif /* CONFIG_SMP */ 6241#endif /* CONFIG_SMP */
7022 6242
@@ -7030,28 +6250,51 @@ int in_sched_functions(unsigned long addr)
7030 && addr < (unsigned long)__sched_text_end); 6250 && addr < (unsigned long)__sched_text_end);
7031} 6251}
7032 6252
6253static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6254{
6255 cfs_rq->tasks_timeline = RB_ROOT;
6256 cfs_rq->fair_clock = 1;
6257#ifdef CONFIG_FAIR_GROUP_SCHED
6258 cfs_rq->rq = rq;
6259#endif
6260}
6261
7033void __init sched_init(void) 6262void __init sched_init(void)
7034{ 6263{
7035 int i, j, k; 6264 u64 now = sched_clock();
7036 int highest_cpu = 0; 6265 int highest_cpu = 0;
6266 int i, j;
6267
6268 /*
6269 * Link up the scheduling class hierarchy:
6270 */
6271 rt_sched_class.next = &fair_sched_class;
6272 fair_sched_class.next = &idle_sched_class;
6273 idle_sched_class.next = NULL;
7037 6274
7038 for_each_possible_cpu(i) { 6275 for_each_possible_cpu(i) {
7039 struct prio_array *array; 6276 struct rt_prio_array *array;
7040 struct rq *rq; 6277 struct rq *rq;
7041 6278
7042 rq = cpu_rq(i); 6279 rq = cpu_rq(i);
7043 spin_lock_init(&rq->lock); 6280 spin_lock_init(&rq->lock);
7044 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6281 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7045 rq->nr_running = 0; 6282 rq->nr_running = 0;
7046 rq->active = rq->arrays; 6283 rq->clock = 1;
7047 rq->expired = rq->arrays + 1; 6284 init_cfs_rq(&rq->cfs, rq);
7048 rq->best_expired_prio = MAX_PRIO; 6285#ifdef CONFIG_FAIR_GROUP_SCHED
6286 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6287 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6288#endif
6289 rq->ls.load_update_last = now;
6290 rq->ls.load_update_start = now;
7049 6291
6292 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6293 rq->cpu_load[j] = 0;
7050#ifdef CONFIG_SMP 6294#ifdef CONFIG_SMP
7051 rq->sd = NULL; 6295 rq->sd = NULL;
7052 for (j = 1; j < 3; j++)
7053 rq->cpu_load[j] = 0;
7054 rq->active_balance = 0; 6296 rq->active_balance = 0;
6297 rq->next_balance = jiffies;
7055 rq->push_cpu = 0; 6298 rq->push_cpu = 0;
7056 rq->cpu = i; 6299 rq->cpu = i;
7057 rq->migration_thread = NULL; 6300 rq->migration_thread = NULL;
@@ -7059,16 +6302,14 @@ void __init sched_init(void)
7059#endif 6302#endif
7060 atomic_set(&rq->nr_iowait, 0); 6303 atomic_set(&rq->nr_iowait, 0);
7061 6304
7062 for (j = 0; j < 2; j++) { 6305 array = &rq->rt.active;
7063 array = rq->arrays + j; 6306 for (j = 0; j < MAX_RT_PRIO; j++) {
7064 for (k = 0; k < MAX_PRIO; k++) { 6307 INIT_LIST_HEAD(array->queue + j);
7065 INIT_LIST_HEAD(array->queue + k); 6308 __clear_bit(j, array->bitmap);
7066 __clear_bit(k, array->bitmap);
7067 }
7068 // delimiter for bitsearch
7069 __set_bit(MAX_PRIO, array->bitmap);
7070 } 6309 }
7071 highest_cpu = i; 6310 highest_cpu = i;
6311 /* delimiter for bitsearch: */
6312 __set_bit(MAX_RT_PRIO, array->bitmap);
7072 } 6313 }
7073 6314
7074 set_load_weight(&init_task); 6315 set_load_weight(&init_task);
@@ -7095,6 +6336,10 @@ void __init sched_init(void)
7095 * when this runqueue becomes "idle". 6336 * when this runqueue becomes "idle".
7096 */ 6337 */
7097 init_idle(current, smp_processor_id()); 6338 init_idle(current, smp_processor_id());
6339 /*
6340 * During early bootup we pretend to be a normal task:
6341 */
6342 current->sched_class = &fair_sched_class;
7098} 6343}
7099 6344
7100#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6345#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7125,29 +6370,55 @@ EXPORT_SYMBOL(__might_sleep);
7125#ifdef CONFIG_MAGIC_SYSRQ 6370#ifdef CONFIG_MAGIC_SYSRQ
7126void normalize_rt_tasks(void) 6371void normalize_rt_tasks(void)
7127{ 6372{
7128 struct prio_array *array;
7129 struct task_struct *g, *p; 6373 struct task_struct *g, *p;
7130 unsigned long flags; 6374 unsigned long flags;
7131 struct rq *rq; 6375 struct rq *rq;
6376 int on_rq;
7132 6377
7133 read_lock_irq(&tasklist_lock); 6378 read_lock_irq(&tasklist_lock);
7134
7135 do_each_thread(g, p) { 6379 do_each_thread(g, p) {
7136 if (!rt_task(p)) 6380 p->se.fair_key = 0;
6381 p->se.wait_runtime = 0;
6382 p->se.wait_start_fair = 0;
6383 p->se.wait_start = 0;
6384 p->se.exec_start = 0;
6385 p->se.sleep_start = 0;
6386 p->se.sleep_start_fair = 0;
6387 p->se.block_start = 0;
6388 task_rq(p)->cfs.fair_clock = 0;
6389 task_rq(p)->clock = 0;
6390
6391 if (!rt_task(p)) {
6392 /*
6393 * Renice negative nice level userspace
6394 * tasks back to 0:
6395 */
6396 if (TASK_NICE(p) < 0 && p->mm)
6397 set_user_nice(p, 0);
7137 continue; 6398 continue;
6399 }
7138 6400
7139 spin_lock_irqsave(&p->pi_lock, flags); 6401 spin_lock_irqsave(&p->pi_lock, flags);
7140 rq = __task_rq_lock(p); 6402 rq = __task_rq_lock(p);
6403#ifdef CONFIG_SMP
6404 /*
6405 * Do not touch the migration thread:
6406 */
6407 if (p == rq->migration_thread)
6408 goto out_unlock;
6409#endif
7141 6410
7142 array = p->array; 6411 on_rq = p->se.on_rq;
7143 if (array) 6412 if (on_rq)
7144 deactivate_task(p, task_rq(p)); 6413 deactivate_task(task_rq(p), p, 0);
7145 __setscheduler(p, SCHED_NORMAL, 0); 6414 __setscheduler(rq, p, SCHED_NORMAL, 0);
7146 if (array) { 6415 if (on_rq) {
7147 __activate_task(p, task_rq(p)); 6416 activate_task(task_rq(p), p, 0);
7148 resched_task(rq->curr); 6417 resched_task(rq->curr);
7149 } 6418 }
7150 6419#ifdef CONFIG_SMP
6420 out_unlock:
6421#endif
7151 __task_rq_unlock(rq); 6422 __task_rq_unlock(rq);
7152 spin_unlock_irqrestore(&p->pi_lock, flags); 6423 spin_unlock_irqrestore(&p->pi_lock, flags);
7153 } while_each_thread(g, p); 6424 } while_each_thread(g, p);
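normalize_rt_tasks() is the handler behind the magic-SysRq "nice all RT tasks" key; with CONFIG_MAGIC_SYSRQ enabled it can be triggered from userspace by writing the key character to /proc/sysrq-trigger. A minimal sketch, assuming the mainline key binding ('n', wired up in drivers/char/sysrq.c, which is not part of this diff):

/* sysrq_normalize.c - trigger the SysRq that ends up in normalize_rt_tasks().
 * Assumes CONFIG_MAGIC_SYSRQ, root privileges, and the mainline 'n' key
 * binding from drivers/char/sysrq.c (not shown in this diff). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sysrq-trigger", "w");

	if (!f) {
		perror("/proc/sysrq-trigger");
		return 1;
	}
	fputc('n', f);
	fclose(f);
	return 0;
}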
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
new file mode 100644
index 000000000000..1baf87cceb7c
--- /dev/null
+++ b/kernel/sched_debug.c
@@ -0,0 +1,275 @@
1/*
 2 * kernel/sched_debug.c
3 *
4 * Print the CFS rbtree
5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched.h>
15#include <linux/seq_file.h>
16#include <linux/kallsyms.h>
17#include <linux/utsname.h>
18
19/*
20 * This allows printing both to /proc/sched_debug and
21 * to the console
22 */
23#define SEQ_printf(m, x...) \
24 do { \
25 if (m) \
26 seq_printf(m, x); \
27 else \
28 printk(x); \
29 } while (0)
30
31static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
33{
34 if (rq->curr == p)
35 SEQ_printf(m, "R");
36 else
37 SEQ_printf(m, " ");
38
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d "
40 "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
41 p->comm, p->pid,
42 (long long)p->se.fair_key,
43 (long long)(p->se.fair_key - rq->cfs.fair_clock),
44 (long long)p->se.wait_runtime,
45 (long long)(p->nvcsw + p->nivcsw),
46 p->prio,
47 (long long)p->se.sum_exec_runtime,
48 (long long)p->se.sum_wait_runtime,
49 (long long)p->se.sum_sleep_runtime,
50 (long long)p->se.wait_runtime_overruns,
51 (long long)p->se.wait_runtime_underruns);
52}
53
54static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
55{
56 struct task_struct *g, *p;
57
58 SEQ_printf(m,
59 "\nrunnable tasks:\n"
60 " task PID tree-key delta waiting"
61 " switches prio"
62 " sum-exec sum-wait sum-sleep"
63 " wait-overrun wait-underrun\n"
64 "------------------------------------------------------------------"
65 "----------------"
66 "------------------------------------------------"
67 "--------------------------------\n");
68
69 read_lock_irq(&tasklist_lock);
70
71 do_each_thread(g, p) {
72 if (!p->se.on_rq || task_cpu(p) != rq_cpu)
73 continue;
74
75 print_task(m, rq, p, now);
76 } while_each_thread(g, p);
77
78 read_unlock_irq(&tasklist_lock);
79}
80
81static void
82print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
83{
84 s64 wait_runtime_rq_sum = 0;
85 struct task_struct *p;
86 struct rb_node *curr;
87 unsigned long flags;
88 struct rq *rq = &per_cpu(runqueues, cpu);
89
90 spin_lock_irqsave(&rq->lock, flags);
91 curr = first_fair(cfs_rq);
92 while (curr) {
93 p = rb_entry(curr, struct task_struct, se.run_node);
94 wait_runtime_rq_sum += p->se.wait_runtime;
95
96 curr = rb_next(curr);
97 }
98 spin_unlock_irqrestore(&rq->lock, flags);
99
100 SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
101 (long long)wait_runtime_rq_sum);
102}
103
104void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
105{
106 SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
107
108#define P(x) \
109 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
110
111 P(fair_clock);
112 P(exec_clock);
113 P(wait_runtime);
114 P(wait_runtime_overruns);
115 P(wait_runtime_underruns);
116 P(sleeper_bonus);
117#undef P
118
119 print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
120}
121
122static void print_cpu(struct seq_file *m, int cpu, u64 now)
123{
124 struct rq *rq = &per_cpu(runqueues, cpu);
125
126#ifdef CONFIG_X86
127 {
128 unsigned int freq = cpu_khz ? : 1;
129
130 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
131 cpu, freq / 1000, (freq % 1000));
132 }
133#else
134 SEQ_printf(m, "\ncpu#%d\n", cpu);
135#endif
136
137#define P(x) \
138 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
139
140 P(nr_running);
141 SEQ_printf(m, " .%-30s: %lu\n", "load",
142 rq->ls.load.weight);
143 P(ls.delta_fair);
144 P(ls.delta_exec);
145 P(nr_switches);
146 P(nr_load_updates);
147 P(nr_uninterruptible);
148 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
149 P(next_balance);
150 P(curr->pid);
151 P(clock);
152 P(prev_clock_raw);
153 P(clock_warps);
154 P(clock_overflows);
155 P(clock_unstable_events);
156 P(clock_max_delta);
157 P(cpu_load[0]);
158 P(cpu_load[1]);
159 P(cpu_load[2]);
160 P(cpu_load[3]);
161 P(cpu_load[4]);
162#undef P
163
164 print_cfs_stats(m, cpu, now);
165
166 print_rq(m, rq, cpu, now);
167}
168
169static int sched_debug_show(struct seq_file *m, void *v)
170{
171 u64 now = ktime_to_ns(ktime_get());
172 int cpu;
173
174 SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v20, %s %.*s\n",
175 init_utsname()->release,
176 (int)strcspn(init_utsname()->version, " "),
177 init_utsname()->version);
178
179 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
180
181 for_each_online_cpu(cpu)
182 print_cpu(m, cpu, now);
183
184 SEQ_printf(m, "\n");
185
186 return 0;
187}
188
189void sysrq_sched_debug_show(void)
190{
191 sched_debug_show(NULL, NULL);
192}
193
194static int sched_debug_open(struct inode *inode, struct file *filp)
195{
196 return single_open(filp, sched_debug_show, NULL);
197}
198
199static struct file_operations sched_debug_fops = {
200 .open = sched_debug_open,
201 .read = seq_read,
202 .llseek = seq_lseek,
203 .release = seq_release,
204};
205
206static int __init init_sched_debug_procfs(void)
207{
208 struct proc_dir_entry *pe;
209
210 pe = create_proc_entry("sched_debug", 0644, NULL);
211 if (!pe)
212 return -ENOMEM;
213
214 pe->proc_fops = &sched_debug_fops;
215
216 return 0;
217}
218
219__initcall(init_sched_debug_procfs);
220
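Once init_sched_debug_procfs() has registered the entry, the full report can be read like any other procfs file. A minimal userspace reader, assuming a kernel built with this debug code (CONFIG_SCHED_DEBUG):

/* read_sched_debug.c - dump /proc/sched_debug to stdout.
 * Assumes a kernel with CONFIG_SCHED_DEBUG, which creates this file. */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/sched_debug", "r");

	if (!f) {
		perror("/proc/sched_debug");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}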
221void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
222{
223 unsigned long flags;
224 int num_threads = 1;
225
226 rcu_read_lock();
227 if (lock_task_sighand(p, &flags)) {
228 num_threads = atomic_read(&p->signal->count);
229 unlock_task_sighand(p, &flags);
230 }
231 rcu_read_unlock();
232
233 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
234 SEQ_printf(m, "----------------------------------------------\n");
235#define P(F) \
236 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
237
238 P(se.wait_start);
239 P(se.wait_start_fair);
240 P(se.exec_start);
241 P(se.sleep_start);
242 P(se.sleep_start_fair);
243 P(se.block_start);
244 P(se.sleep_max);
245 P(se.block_max);
246 P(se.exec_max);
247 P(se.wait_max);
248 P(se.wait_runtime);
249 P(se.wait_runtime_overruns);
250 P(se.wait_runtime_underruns);
251 P(se.sum_wait_runtime);
252 P(se.sum_exec_runtime);
253 SEQ_printf(m, "%-25s:%20Ld\n",
254 "nr_switches", (long long)(p->nvcsw + p->nivcsw));
255 P(se.load.weight);
256 P(policy);
257 P(prio);
258#undef P
259
260 {
261 u64 t0, t1;
262
263 t0 = sched_clock();
264 t1 = sched_clock();
265 SEQ_printf(m, "%-25s:%20Ld\n",
266 "clock-delta", (long long)(t1-t0));
267 }
268}
269
270void proc_sched_set_task(struct task_struct *p)
271{
272 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
273 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
274 p->se.sum_exec_runtime = 0;
275}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
new file mode 100644
index 000000000000..6971db0a7160
--- /dev/null
+++ b/kernel/sched_fair.c
@@ -0,0 +1,1131 @@
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 */
19
20/*
21 * Preemption granularity:
22 * (default: 2 msec, units: nanoseconds)
23 *
24 * NOTE: this granularity value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat
26 * larger than this value. (to see the precise effective timeslice
27 * length of your workload, run vmstat and monitor the context-switches
28 * field)
29 *
 30 * On SMP systems the value of this is multiplied by 1 plus the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
33 */
34unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
35
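The SMP scaling mentioned above is applied at boot by sched_init_granularity() (its body is not part of this hunk). A minimal userspace sketch of the arithmetic, assuming the 1 + log2(nr_cpus) factor implied by the examples in the comment:

/* gran_scale.c - sketch of the SMP scaling of sysctl_sched_granularity.
 * Assumes the 1 + log2(ncpus) factor described above; the base value is
 * hard-coded here rather than read from a live kernel. */
#include <stdio.h>
#include <unistd.h>

static unsigned int factor(unsigned long ncpus)
{
	unsigned int f = 1;

	while (ncpus >>= 1)		/* integer log2 */
		f++;
	return f;			/* 2 on 2-way, 3 on 4-way, 4 on 8-way ... */
}

int main(void)
{
	unsigned long base_ns = 2000000;	/* 2 msec (2000000000ULL/HZ with HZ=1000) */
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);

	if (ncpus < 1)
		ncpus = 1;
	printf("%ld CPUs -> effective granularity %lu ns\n",
	       ncpus, base_ns * factor((unsigned long)ncpus));
	return 0;
}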
36/*
37 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds)
39 *
40 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies.
43 */
44unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
45 10000000000ULL/HZ;
46
47/*
48 * SCHED_OTHER wake-up granularity.
49 * (default: 1 msec, units: nanoseconds)
50 *
51 * This option delays the preemption effects of decoupled workloads
52 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies.
54 */
55unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
56
57unsigned int sysctl_sched_stat_granularity __read_mostly;
58
59/*
60 * Initialized in sched_init_granularity():
61 */
62unsigned int sysctl_sched_runtime_limit __read_mostly;
63
64/*
65 * Debugging: various feature bits
66 */
67enum {
68 SCHED_FEAT_FAIR_SLEEPERS = 1,
69 SCHED_FEAT_SLEEPER_AVG = 2,
70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
71 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
72 SCHED_FEAT_START_DEBIT = 16,
73 SCHED_FEAT_SKIP_INITIAL = 32,
74};
75
76unsigned int sysctl_sched_features __read_mostly =
77 SCHED_FEAT_FAIR_SLEEPERS *1 |
78 SCHED_FEAT_SLEEPER_AVG *1 |
79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
80 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
81 SCHED_FEAT_START_DEBIT *1 |
82 SCHED_FEAT_SKIP_INITIAL *0;
83
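sysctl_sched_features packs the flags above into a single bitmask, using the *1/*0 multipliers as a compact per-feature on/off switch. A small standalone sketch of how such a mask is tested and toggled (local variables only, not the kernel sysctl itself):

/* feature_bits.c - sketch of a feature bitmask in the style of
 * sysctl_sched_features. The enum mirrors the flags above; the mask is
 * a local variable, not the kernel's. */
#include <stdio.h>

enum {
	FEAT_FAIR_SLEEPERS	= 1,
	FEAT_SLEEPER_AVG	= 2,
	FEAT_START_DEBIT	= 16,
	FEAT_SKIP_INITIAL	= 32,
};

int main(void)
{
	/* same *1 / *0 trick as above: multiply by 1 to enable, 0 to disable */
	unsigned int features =
		FEAT_FAIR_SLEEPERS	*1 |
		FEAT_SLEEPER_AVG	*1 |
		FEAT_START_DEBIT	*1 |
		FEAT_SKIP_INITIAL	*0;

	if (features & FEAT_FAIR_SLEEPERS)
		printf("fair sleepers enabled\n");

	features &= ~FEAT_SLEEPER_AVG;		/* switch one feature off at run time */
	printf("mask is now 0x%x\n", features);
	return 0;
}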
84extern struct sched_class fair_sched_class;
85
86/**************************************************************
87 * CFS operations on generic schedulable entities:
88 */
89
90#ifdef CONFIG_FAIR_GROUP_SCHED
91
92/* cpu runqueue to which this cfs_rq is attached */
93static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
94{
95 return cfs_rq->rq;
96}
97
98/* currently running entity (if any) on this cfs_rq */
99static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
100{
101 return cfs_rq->curr;
102}
103
104/* An entity is a task if it doesn't "own" a runqueue */
105#define entity_is_task(se) (!se->my_q)
106
107static inline void
108set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
109{
110 cfs_rq->curr = se;
111}
112
113#else /* CONFIG_FAIR_GROUP_SCHED */
114
115static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
116{
117 return container_of(cfs_rq, struct rq, cfs);
118}
119
120static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
121{
122 struct rq *rq = rq_of(cfs_rq);
123
124 if (unlikely(rq->curr->sched_class != &fair_sched_class))
125 return NULL;
126
127 return &rq->curr->se;
128}
129
130#define entity_is_task(se) 1
131
132static inline void
133set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
134
135#endif /* CONFIG_FAIR_GROUP_SCHED */
136
137static inline struct task_struct *task_of(struct sched_entity *se)
138{
139 return container_of(se, struct task_struct, se);
140}
141
142
143/**************************************************************
144 * Scheduling class tree data structure manipulation methods:
145 */
146
147/*
148 * Enqueue an entity into the rb-tree:
149 */
150static inline void
151__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
152{
153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
154 struct rb_node *parent = NULL;
155 struct sched_entity *entry;
156 s64 key = se->fair_key;
157 int leftmost = 1;
158
159 /*
160 * Find the right place in the rbtree:
161 */
162 while (*link) {
163 parent = *link;
164 entry = rb_entry(parent, struct sched_entity, run_node);
165 /*
 166 * We don't care about collisions. Nodes with
167 * the same key stay together.
168 */
169 if (key - entry->fair_key < 0) {
170 link = &parent->rb_left;
171 } else {
172 link = &parent->rb_right;
173 leftmost = 0;
174 }
175 }
176
177 /*
178 * Maintain a cache of leftmost tree entries (it is frequently
179 * used):
180 */
181 if (leftmost)
182 cfs_rq->rb_leftmost = &se->run_node;
183
184 rb_link_node(&se->run_node, parent, link);
185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
186 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++;
188 se->on_rq = 1;
189}
190
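Note that the insertion above orders nodes by the sign of the key difference rather than by comparing keys directly; with signed 64-bit keys this keeps the ordering consistent even if the clock-derived key were ever to wrap. A standalone sketch of the difference-based test (illustrative values; the userspace version does the subtraction in unsigned arithmetic to avoid undefined signed overflow):

/* key_cmp.c - sketch of the "key - entry->fair_key < 0" ordering used in
 * __enqueue_entity(). Values are illustrative; the unsigned cast avoids
 * undefined signed overflow in a userspace build. */
#include <stdio.h>
#include <stdint.h>

static int key_before(int64_t a, int64_t b)
{
	return (int64_t)((uint64_t)a - (uint64_t)b) < 0;
}

int main(void)
{
	int64_t near_max = INT64_MAX - 10;			/* just before a wrap */
	int64_t wrapped  = (int64_t)((uint64_t)near_max + 100);	/* past the wrap */

	/* direct comparison claims the later (wrapped) key sorts first ... */
	printf("direct:     %d\n", wrapped < near_max);
	/* ... the difference-based test still puts the earlier key first */
	printf("difference: %d\n", key_before(near_max, wrapped));
	return 0;
}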
191static inline void
192__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
193{
194 if (cfs_rq->rb_leftmost == &se->run_node)
195 cfs_rq->rb_leftmost = rb_next(&se->run_node);
196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
197 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--;
199 se->on_rq = 0;
200}
201
202static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
203{
204 return cfs_rq->rb_leftmost;
205}
206
207static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
208{
209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
210}
211
212/**************************************************************
213 * Scheduling class statistics methods:
214 */
215
216/*
217 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially:
219 */
220static long
221niced_granularity(struct sched_entity *curr, unsigned long granularity)
222{
223 u64 tmp;
224
225 /*
226 * Negative nice levels get the same granularity as nice-0:
227 */
228 if (likely(curr->load.weight >= NICE_0_LOAD))
229 return granularity;
230 /*
231 * Positive nice level tasks get linearly finer
232 * granularity:
233 */
234 tmp = curr->load.weight * (u64)granularity;
235
236 /*
237 * It will always fit into 'long':
238 */
239 return (long) (tmp >> NICE_0_SHIFT);
240}
241
242static inline void
243limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
244{
245 long limit = sysctl_sched_runtime_limit;
246
247 /*
248 * Niced tasks have the same history dynamic range as
249 * non-niced tasks:
250 */
251 if (unlikely(se->wait_runtime > limit)) {
252 se->wait_runtime = limit;
253 schedstat_inc(se, wait_runtime_overruns);
254 schedstat_inc(cfs_rq, wait_runtime_overruns);
255 }
256 if (unlikely(se->wait_runtime < -limit)) {
257 se->wait_runtime = -limit;
258 schedstat_inc(se, wait_runtime_underruns);
259 schedstat_inc(cfs_rq, wait_runtime_underruns);
260 }
261}
262
263static inline void
264__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
265{
266 se->wait_runtime += delta;
267 schedstat_add(se, sum_wait_runtime, delta);
268 limit_wait_runtime(cfs_rq, se);
269}
270
271static void
272add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
273{
274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
275 __add_wait_runtime(cfs_rq, se, delta);
276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
277}
278
279/*
280 * Update the current task's runtime statistics. Skip current tasks that
281 * are not in our scheduling class.
282 */
283static inline void
284__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
285{
286 unsigned long delta, delta_exec, delta_fair;
287 long delta_mine;
288 struct load_weight *lw = &cfs_rq->load;
289 unsigned long load = lw->weight;
290
291 if (unlikely(!load))
292 return;
293
294 delta_exec = curr->delta_exec;
295#ifdef CONFIG_SCHEDSTATS
296 if (unlikely(delta_exec > curr->exec_max))
297 curr->exec_max = delta_exec;
298#endif
299
300 curr->sum_exec_runtime += delta_exec;
301 cfs_rq->exec_clock += delta_exec;
302
303 delta_fair = calc_delta_fair(delta_exec, lw);
304 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
305
306 if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) {
307 delta = calc_delta_mine(cfs_rq->sleeper_bonus,
308 curr->load.weight, lw);
309 if (unlikely(delta > cfs_rq->sleeper_bonus))
310 delta = cfs_rq->sleeper_bonus;
311
312 cfs_rq->sleeper_bonus -= delta;
313 delta_mine -= delta;
314 }
315
316 cfs_rq->fair_clock += delta_fair;
317 /*
318 * We executed delta_exec amount of time on the CPU,
319 * but we were only entitled to delta_mine amount of
320 * time during that period (if nr_running == 1 then
321 * the two values are equal)
322 * [Note: delta_mine - delta_exec is negative]:
323 */
324 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
325}
326
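In other words, __update_curr() credits the running task with delta_mine = delta_exec * weight / total_weight and charges the difference to wait_runtime. A worked sketch of that arithmetic, simplified here to a plain division (the kernel's calc_delta_mine() avoids the division with an inverse-weight multiply) and with illustrative weights:

/* fair_share.c - worked example of the delta_mine accounting above.
 * Simplified: plain division instead of calc_delta_mine()'s shifted
 * inverse-weight multiply; all numbers are illustrative. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_exec = 10000000;		/* current task ran for 10 ms */
	unsigned long w_curr = 1024;		/* weight of the current task */
	unsigned long w_total = 2048;		/* total weight of the runqueue */

	int64_t mine = (int64_t)(delta_exec * w_curr / w_total);

	/* entitled to 5 ms of the 10 ms it ran: wait_runtime drops by 5 ms */
	printf("delta_mine = %lld ns, wait_runtime delta = %lld ns\n",
	       (long long)mine, (long long)(mine - (int64_t)delta_exec));
	return 0;
}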
327static void update_curr(struct cfs_rq *cfs_rq, u64 now)
328{
329 struct sched_entity *curr = cfs_rq_curr(cfs_rq);
330 unsigned long delta_exec;
331
332 if (unlikely(!curr))
333 return;
334
335 /*
336 * Get the amount of time the current task was running
337 * since the last time we changed load (this cannot
338 * overflow on 32 bits):
339 */
340 delta_exec = (unsigned long)(now - curr->exec_start);
341
342 curr->delta_exec += delta_exec;
343
344 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
345 __update_curr(cfs_rq, curr, now);
346 curr->delta_exec = 0;
347 }
348 curr->exec_start = now;
349}
350
351static inline void
352update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
353{
354 se->wait_start_fair = cfs_rq->fair_clock;
355 se->wait_start = now;
356}
357
358/*
359 * We calculate fair deltas here, so protect against the random effects
360 * of a multiplication overflow by capping it to the runtime limit:
361 */
362#if BITS_PER_LONG == 32
363static inline unsigned long
364calc_weighted(unsigned long delta, unsigned long weight, int shift)
365{
366 u64 tmp = (u64)delta * weight >> shift;
367
368 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
369 return sysctl_sched_runtime_limit*2;
370 return tmp;
371}
372#else
373static inline unsigned long
374calc_weighted(unsigned long delta, unsigned long weight, int shift)
375{
376 return delta * weight >> shift;
377}
378#endif
379
380/*
381 * Task is being enqueued - update stats:
382 */
383static void
384update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
385{
386 s64 key;
387
388 /*
389 * Are we enqueueing a waiting task? (for current tasks
390 * a dequeue/enqueue event is a NOP)
391 */
392 if (se != cfs_rq_curr(cfs_rq))
393 update_stats_wait_start(cfs_rq, se, now);
394 /*
395 * Update the key:
396 */
397 key = cfs_rq->fair_clock;
398
399 /*
400 * Optimize the common nice 0 case:
401 */
402 if (likely(se->load.weight == NICE_0_LOAD)) {
403 key -= se->wait_runtime;
404 } else {
405 u64 tmp;
406
407 if (se->wait_runtime < 0) {
408 tmp = -se->wait_runtime;
409 key += (tmp * se->load.inv_weight) >>
410 (WMULT_SHIFT - NICE_0_SHIFT);
411 } else {
412 tmp = se->wait_runtime;
413 key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
414 }
415 }
416
417 se->fair_key = key;
418}
419
420/*
421 * Note: must be called with a freshly updated rq->fair_clock.
422 */
423static inline void
424__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
425{
426 unsigned long delta_fair = se->delta_fair_run;
427
428#ifdef CONFIG_SCHEDSTATS
429 {
430 s64 delta_wait = now - se->wait_start;
431 if (unlikely(delta_wait > se->wait_max))
432 se->wait_max = delta_wait;
433 }
434#endif
435
436 if (unlikely(se->load.weight != NICE_0_LOAD))
437 delta_fair = calc_weighted(delta_fair, se->load.weight,
438 NICE_0_SHIFT);
439
440 add_wait_runtime(cfs_rq, se, delta_fair);
441}
442
443static void
444update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
445{
446 unsigned long delta_fair;
447
448 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
449 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
450
451 se->delta_fair_run += delta_fair;
452 if (unlikely(abs(se->delta_fair_run) >=
453 sysctl_sched_stat_granularity)) {
454 __update_stats_wait_end(cfs_rq, se, now);
455 se->delta_fair_run = 0;
456 }
457
458 se->wait_start_fair = 0;
459 se->wait_start = 0;
460}
461
462static inline void
463update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
464{
465 update_curr(cfs_rq, now);
466 /*
467 * Mark the end of the wait period if dequeueing a
468 * waiting task:
469 */
470 if (se != cfs_rq_curr(cfs_rq))
471 update_stats_wait_end(cfs_rq, se, now);
472}
473
474/*
475 * We are picking a new current task - update its stats:
476 */
477static inline void
478update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
479{
480 /*
481 * We are starting a new run period:
482 */
483 se->exec_start = now;
484}
485
486/*
487 * We are descheduling a task - update its stats:
488 */
489static inline void
490update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
491{
492 se->exec_start = 0;
493}
494
495/**************************************************
496 * Scheduling class queueing methods:
497 */
498
499static void
500__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
501{
502 unsigned long load = cfs_rq->load.weight, delta_fair;
503 long prev_runtime;
504
505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
506 load = rq_of(cfs_rq)->cpu_load[2];
507
508 delta_fair = se->delta_fair_sleep;
509
510 /*
511 * Fix up delta_fair with the effect of us running
512 * during the whole sleep period:
513 */
514 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
515 delta_fair = div64_likely32((u64)delta_fair * load,
516 load + se->load.weight);
517
518 if (unlikely(se->load.weight != NICE_0_LOAD))
519 delta_fair = calc_weighted(delta_fair, se->load.weight,
520 NICE_0_SHIFT);
521
522 prev_runtime = se->wait_runtime;
523 __add_wait_runtime(cfs_rq, se, delta_fair);
524 delta_fair = se->wait_runtime - prev_runtime;
525
526 /*
527 * Track the amount of bonus we've given to sleepers:
528 */
529 cfs_rq->sleeper_bonus += delta_fair;
530
531 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
532}
533
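With SCHED_FEAT_SLEEPER_AVG the sleep credit is discounted by the share the sleeper itself would have consumed had it been runnable: delta_fair * load / (load + its own weight). A small sketch of that discount with made-up numbers:

/* sleeper_avg.c - sketch of the SLEEPER_AVG discount in __enqueue_sleeper().
 * All numbers are illustrative. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_fair = 4000000;	/* 4 ms of fair-clock time slept */
	unsigned long load = 2048;	/* weight of the already-runnable tasks */
	unsigned long se_weight = 1024;	/* weight of the task waking up */

	uint64_t credited = delta_fair * load / (load + se_weight);

	printf("raw credit %llu ns -> credited %llu ns\n",
	       (unsigned long long)delta_fair,
	       (unsigned long long)credited);
	return 0;
}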
534static void
535enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
536{
537 struct task_struct *tsk = task_of(se);
538 unsigned long delta_fair;
539
540 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
541 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
542 return;
543
544 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
545 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
546
547 se->delta_fair_sleep += delta_fair;
548 if (unlikely(abs(se->delta_fair_sleep) >=
549 sysctl_sched_stat_granularity)) {
550 __enqueue_sleeper(cfs_rq, se, now);
551 se->delta_fair_sleep = 0;
552 }
553
554 se->sleep_start_fair = 0;
555
556#ifdef CONFIG_SCHEDSTATS
557 if (se->sleep_start) {
558 u64 delta = now - se->sleep_start;
559
560 if ((s64)delta < 0)
561 delta = 0;
562
563 if (unlikely(delta > se->sleep_max))
564 se->sleep_max = delta;
565
566 se->sleep_start = 0;
567 se->sum_sleep_runtime += delta;
568 }
569 if (se->block_start) {
570 u64 delta = now - se->block_start;
571
572 if ((s64)delta < 0)
573 delta = 0;
574
575 if (unlikely(delta > se->block_max))
576 se->block_max = delta;
577
578 se->block_start = 0;
579 se->sum_sleep_runtime += delta;
580 }
581#endif
582}
583
584static void
585enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
586 int wakeup, u64 now)
587{
588 /*
589 * Update the fair clock.
590 */
591 update_curr(cfs_rq, now);
592
593 if (wakeup)
594 enqueue_sleeper(cfs_rq, se, now);
595
596 update_stats_enqueue(cfs_rq, se, now);
597 __enqueue_entity(cfs_rq, se);
598}
599
600static void
601dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
602 int sleep, u64 now)
603{
604 update_stats_dequeue(cfs_rq, se, now);
605 if (sleep) {
606 se->sleep_start_fair = cfs_rq->fair_clock;
607#ifdef CONFIG_SCHEDSTATS
608 if (entity_is_task(se)) {
609 struct task_struct *tsk = task_of(se);
610
611 if (tsk->state & TASK_INTERRUPTIBLE)
612 se->sleep_start = now;
613 if (tsk->state & TASK_UNINTERRUPTIBLE)
614 se->block_start = now;
615 }
616 cfs_rq->wait_runtime -= se->wait_runtime;
617#endif
618 }
619 __dequeue_entity(cfs_rq, se);
620}
621
622/*
623 * Preempt the current task with a newly woken task if needed:
624 */
625static void
626__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
627 struct sched_entity *curr, unsigned long granularity)
628{
629 s64 __delta = curr->fair_key - se->fair_key;
630
631 /*
632 * Take scheduling granularity into account - do not
633 * preempt the current task unless the best task has
634 * a larger than sched_granularity fairness advantage:
635 */
636 if (__delta > niced_granularity(curr, granularity))
637 resched_task(rq_of(cfs_rq)->curr);
638}
639
640static inline void
641set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
642{
643 /*
 644 * Any task has to be enqueued before it gets to execute on
645 * a CPU. So account for the time it spent waiting on the
646 * runqueue. (note, here we rely on pick_next_task() having
647 * done a put_prev_task_fair() shortly before this, which
648 * updated rq->fair_clock - used by update_stats_wait_end())
649 */
650 update_stats_wait_end(cfs_rq, se, now);
651 update_stats_curr_start(cfs_rq, se, now);
652 set_cfs_rq_curr(cfs_rq, se);
653}
654
655static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
656{
657 struct sched_entity *se = __pick_next_entity(cfs_rq);
658
659 set_next_entity(cfs_rq, se, now);
660
661 return se;
662}
663
664static void
665put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
666{
667 /*
668 * If still on the runqueue then deactivate_task()
669 * was not called and update_curr() has to be done:
670 */
671 if (prev->on_rq)
672 update_curr(cfs_rq, now);
673
674 update_stats_curr_end(cfs_rq, prev, now);
675
676 if (prev->on_rq)
677 update_stats_wait_start(cfs_rq, prev, now);
678 set_cfs_rq_curr(cfs_rq, NULL);
679}
680
681static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
682{
683 struct rq *rq = rq_of(cfs_rq);
684 struct sched_entity *next;
685 u64 now = __rq_clock(rq);
686
687 /*
688 * Dequeue and enqueue the task to update its
689 * position within the tree:
690 */
691 dequeue_entity(cfs_rq, curr, 0, now);
692 enqueue_entity(cfs_rq, curr, 0, now);
693
694 /*
695 * Reschedule if another task tops the current one.
696 */
697 next = __pick_next_entity(cfs_rq);
698 if (next == curr)
699 return;
700
701 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
702}
703
704/**************************************************
705 * CFS operations on tasks:
706 */
707
708#ifdef CONFIG_FAIR_GROUP_SCHED
709
710/* Walk up scheduling entities hierarchy */
711#define for_each_sched_entity(se) \
712 for (; se; se = se->parent)
713
714static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
715{
716 return p->se.cfs_rq;
717}
718
719/* runqueue on which this entity is (to be) queued */
720static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
721{
722 return se->cfs_rq;
723}
724
725/* runqueue "owned" by this group */
726static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
727{
728 return grp->my_q;
729}
730
731/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
732 * another cpu ('this_cpu')
733 */
734static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
735{
736 /* A later patch will take group into account */
737 return &cpu_rq(this_cpu)->cfs;
738}
739
 740/* Iterate through all leaf cfs_rqs on a runqueue */
741#define for_each_leaf_cfs_rq(rq, cfs_rq) \
742 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
743
744/* Do the two (enqueued) tasks belong to the same group ? */
745static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
746{
747 if (curr->se.cfs_rq == p->se.cfs_rq)
748 return 1;
749
750 return 0;
751}
752
753#else /* CONFIG_FAIR_GROUP_SCHED */
754
755#define for_each_sched_entity(se) \
756 for (; se; se = NULL)
757
758static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
759{
760 return &task_rq(p)->cfs;
761}
762
763static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
764{
765 struct task_struct *p = task_of(se);
766 struct rq *rq = task_rq(p);
767
768 return &rq->cfs;
769}
770
771/* runqueue "owned" by this group */
772static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
773{
774 return NULL;
775}
776
777static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
778{
779 return &cpu_rq(this_cpu)->cfs;
780}
781
782#define for_each_leaf_cfs_rq(rq, cfs_rq) \
783 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
784
785static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
786{
787 return 1;
788}
789
790#endif /* CONFIG_FAIR_GROUP_SCHED */
791
792/*
793 * The enqueue_task method is called before nr_running is
794 * increased. Here we update the fair scheduling stats and
795 * then put the task into the rbtree:
796 */
797static void
798enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
799{
800 struct cfs_rq *cfs_rq;
801 struct sched_entity *se = &p->se;
802
803 for_each_sched_entity(se) {
804 if (se->on_rq)
805 break;
806 cfs_rq = cfs_rq_of(se);
807 enqueue_entity(cfs_rq, se, wakeup, now);
808 }
809}
810
811/*
812 * The dequeue_task method is called before nr_running is
813 * decreased. We remove the task from the rbtree and
814 * update the fair scheduling stats:
815 */
816static void
817dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
818{
819 struct cfs_rq *cfs_rq;
820 struct sched_entity *se = &p->se;
821
822 for_each_sched_entity(se) {
823 cfs_rq = cfs_rq_of(se);
824 dequeue_entity(cfs_rq, se, sleep, now);
825 /* Don't dequeue parent if it has other entities besides us */
826 if (cfs_rq->load.weight)
827 break;
828 }
829}
830
831/*
832 * sched_yield() support is very simple - we dequeue and enqueue
833 */
834static void yield_task_fair(struct rq *rq, struct task_struct *p)
835{
836 struct cfs_rq *cfs_rq = task_cfs_rq(p);
837 u64 now = __rq_clock(rq);
838
839 /*
840 * Dequeue and enqueue the task to update its
841 * position within the tree:
842 */
843 dequeue_entity(cfs_rq, &p->se, 0, now);
844 enqueue_entity(cfs_rq, &p->se, 0, now);
845}
846
847/*
848 * Preempt the current task with a newly woken task if needed:
849 */
850static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
851{
852 struct task_struct *curr = rq->curr;
853 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
854 unsigned long gran;
855
856 if (unlikely(rt_prio(p->prio))) {
857 update_curr(cfs_rq, rq_clock(rq));
858 resched_task(curr);
859 return;
860 }
861
862 gran = sysctl_sched_wakeup_granularity;
863 /*
864 * Batch tasks prefer throughput over latency:
865 */
866 if (unlikely(p->policy == SCHED_BATCH))
867 gran = sysctl_sched_batch_wakeup_granularity;
868
869 if (is_same_group(curr, p))
870 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
871}
872
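check_preempt_curr_fair() above is where SCHED_BATCH tasks pick up the larger batch wakeup granularity. A minimal userspace sketch of opting a process into SCHED_BATCH via the long-standing sched_setscheduler() interface:

/* batchify.c - switch the calling process to SCHED_BATCH so wakeup
 * preemption uses sysctl_sched_batch_wakeup_granularity. */
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_BATCH
#define SCHED_BATCH 3		/* some older libc headers lack the define */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_BATCH, &sp) != 0) {	/* 0 == self */
		perror("sched_setscheduler");
		return 1;
	}
	printf("policy is now %d (SCHED_BATCH)\n", sched_getscheduler(0));
	/* ... run the throughput-oriented workload here ... */
	return 0;
}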
873static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
874{
875 struct cfs_rq *cfs_rq = &rq->cfs;
876 struct sched_entity *se;
877
878 if (unlikely(!cfs_rq->nr_running))
879 return NULL;
880
881 do {
882 se = pick_next_entity(cfs_rq, now);
883 cfs_rq = group_cfs_rq(se);
884 } while (cfs_rq);
885
886 return task_of(se);
887}
888
889/*
890 * Account for a descheduled task:
891 */
892static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
893{
894 struct sched_entity *se = &prev->se;
895 struct cfs_rq *cfs_rq;
896
897 for_each_sched_entity(se) {
898 cfs_rq = cfs_rq_of(se);
899 put_prev_entity(cfs_rq, se, now);
900 }
901}
902
903/**************************************************
904 * Fair scheduling class load-balancing methods:
905 */
906
907/*
908 * Load-balancing iterator. Note: while the runqueue stays locked
909 * during the whole iteration, the current task might be
910 * dequeued so the iterator has to be dequeue-safe. Here we
911 * achieve that by always pre-iterating before returning
912 * the current task:
913 */
914static inline struct task_struct *
915__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
916{
917 struct task_struct *p;
918
919 if (!curr)
920 return NULL;
921
922 p = rb_entry(curr, struct task_struct, se.run_node);
923 cfs_rq->rb_load_balance_curr = rb_next(curr);
924
925 return p;
926}
927
928static struct task_struct *load_balance_start_fair(void *arg)
929{
930 struct cfs_rq *cfs_rq = arg;
931
932 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
933}
934
935static struct task_struct *load_balance_next_fair(void *arg)
936{
937 struct cfs_rq *cfs_rq = arg;
938
939 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
940}
941
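The pre-iterate-then-return pattern above is what makes the walk dequeue-safe: by the time the caller sees a task, the cursor already points past it. A standalone sketch of the same idea on a plain singly linked list (the node type here is made up for illustration):

/* safe_iter.c - sketch of the dequeue-safe iterator pattern used by
 * load_balance_start_fair()/load_balance_next_fair(). The list type is
 * illustrative, not a kernel type. */
#include <stdio.h>

struct node { int val; struct node *next; };

static struct node *cursor;		/* plays the role of rb_load_balance_curr */

static struct node *iter_next(void)
{
	struct node *cur = cursor;

	if (!cur)
		return NULL;
	cursor = cur->next;		/* advance first ... */
	return cur;			/* ... so 'cur' may be unlinked safely */
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *n;

	cursor = &a;
	while ((n = iter_next()) != NULL) {
		printf("%d\n", n->val);
		n->next = NULL;		/* "dequeue" what was just returned */
	}
	return 0;
}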
942static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
943{
944 struct sched_entity *curr;
945 struct task_struct *p;
946
947 if (!cfs_rq->nr_running)
948 return MAX_PRIO;
949
950 curr = __pick_next_entity(cfs_rq);
951 p = task_of(curr);
952
953 return p->prio;
954}
955
956static int
957load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
958 unsigned long max_nr_move, unsigned long max_load_move,
959 struct sched_domain *sd, enum cpu_idle_type idle,
960 int *all_pinned, unsigned long *total_load_moved)
961{
962 struct cfs_rq *busy_cfs_rq;
963 unsigned long load_moved, total_nr_moved = 0, nr_moved;
964 long rem_load_move = max_load_move;
965 struct rq_iterator cfs_rq_iterator;
966
967 cfs_rq_iterator.start = load_balance_start_fair;
968 cfs_rq_iterator.next = load_balance_next_fair;
969
970 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
971 struct cfs_rq *this_cfs_rq;
972 long imbalance;
973 unsigned long maxload;
974 int this_best_prio, best_prio, best_prio_seen = 0;
975
976 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
977
978 imbalance = busy_cfs_rq->load.weight -
979 this_cfs_rq->load.weight;
980 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
981 if (imbalance <= 0)
982 continue;
983
984 /* Don't pull more than imbalance/2 */
985 imbalance /= 2;
986 maxload = min(rem_load_move, imbalance);
987
988 this_best_prio = cfs_rq_best_prio(this_cfs_rq);
989 best_prio = cfs_rq_best_prio(busy_cfs_rq);
990
991 /*
992 * Enable handling of the case where there is more than one task
993 * with the best priority. If the current running task is one
994 * of those with prio==best_prio we know it won't be moved
995 * and therefore it's safe to override the skip (based on load)
996 * of any task we find with that prio.
997 */
998 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
999 best_prio_seen = 1;
1000
1001 /* pass busy_cfs_rq argument into
1002 * load_balance_[start|next]_fair iterators
1003 */
1004 cfs_rq_iterator.arg = busy_cfs_rq;
1005 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
1006 max_nr_move, maxload, sd, idle, all_pinned,
1007 &load_moved, this_best_prio, best_prio,
1008 best_prio_seen, &cfs_rq_iterator);
1009
1010 total_nr_moved += nr_moved;
1011 max_nr_move -= nr_moved;
1012 rem_load_move -= load_moved;
1013
1014 if (max_nr_move <= 0 || rem_load_move <= 0)
1015 break;
1016 }
1017
1018 *total_load_moved = max_load_move - rem_load_move;
1019
1020 return total_nr_moved;
1021}
1022
1023/*
1024 * scheduler tick hitting a task of our scheduling class:
1025 */
1026static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1027{
1028 struct cfs_rq *cfs_rq;
1029 struct sched_entity *se = &curr->se;
1030
1031 for_each_sched_entity(se) {
1032 cfs_rq = cfs_rq_of(se);
1033 entity_tick(cfs_rq, se);
1034 }
1035}
1036
1037/*
1038 * Share the fairness runtime between parent and child, thus the
1039 * total amount of pressure for CPU stays equal - new tasks
1040 * get a chance to run but frequent forkers are not allowed to
1041 * monopolize the CPU. Note: the parent runqueue is locked,
1042 * the child is not running yet.
1043 */
1044static void task_new_fair(struct rq *rq, struct task_struct *p)
1045{
1046 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1047 struct sched_entity *se = &p->se;
1048 u64 now = rq_clock(rq);
1049
1050 sched_info_queued(p);
1051
1052 update_stats_enqueue(cfs_rq, se, now);
1053 /*
1054 * Child runs first: we let it run before the parent
1055 * until it reschedules once. We set up the key so that
1056 * it will preempt the parent:
1057 */
1058 p->se.fair_key = current->se.fair_key -
1059 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
1060 /*
1061 * The first wait is dominated by the child-runs-first logic,
1062 * so do not credit it with that waiting time yet:
1063 */
1064 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1065 p->se.wait_start_fair = 0;
1066
1067 /*
1068 * The statistical average of wait_runtime is about
1069 * -granularity/2, so initialize the task with that:
1070 */
1071 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1072 p->se.wait_runtime = -(sysctl_sched_granularity / 2);
1073
1074 __enqueue_entity(cfs_rq, se);
1075 inc_nr_running(p, rq, now);
1076}
1077
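task_new_fair() therefore starts the child just left of the parent in the tree and, with START_DEBIT, with roughly half a granularity of negative wait_runtime. A worked sketch of the initial values (the parent key and the granularity are made-up numbers):

/* fork_key.c - worked example of the child's initial key and wait_runtime
 * as set up in task_new_fair(). All values are illustrative. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t parent_key = 123456789;		/* parent's se.fair_key */
	int64_t granularity = 2000000;		/* 2 ms, nice-0 */

	/* child sorts just before the parent, so it will preempt it */
	int64_t child_key = parent_key - granularity - 1;

	/* START_DEBIT: begin life owing about half a granularity */
	int64_t child_wait_runtime = -(granularity / 2);

	printf("child key %lld (parent %lld), wait_runtime %lld ns\n",
	       (long long)child_key, (long long)parent_key,
	       (long long)child_wait_runtime);
	return 0;
}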
1078#ifdef CONFIG_FAIR_GROUP_SCHED
1079/* Account for a task changing its policy or group.
1080 *
1081 * This routine is mostly called to set cfs_rq->curr field when a task
1082 * migrates between groups/classes.
1083 */
1084static void set_curr_task_fair(struct rq *rq)
1085{
1086 struct task_struct *curr = rq->curr;
1087 struct sched_entity *se = &curr->se;
1088 u64 now = rq_clock(rq);
1089 struct cfs_rq *cfs_rq;
1090
1091 for_each_sched_entity(se) {
1092 cfs_rq = cfs_rq_of(se);
1093 set_next_entity(cfs_rq, se, now);
1094 }
1095}
1096#else
1097static void set_curr_task_fair(struct rq *rq)
1098{
1099}
1100#endif
1101
1102/*
1103 * All the scheduling class methods:
1104 */
1105struct sched_class fair_sched_class __read_mostly = {
1106 .enqueue_task = enqueue_task_fair,
1107 .dequeue_task = dequeue_task_fair,
1108 .yield_task = yield_task_fair,
1109
1110 .check_preempt_curr = check_preempt_curr_fair,
1111
1112 .pick_next_task = pick_next_task_fair,
1113 .put_prev_task = put_prev_task_fair,
1114
1115 .load_balance = load_balance_fair,
1116
1117 .set_curr_task = set_curr_task_fair,
1118 .task_tick = task_tick_fair,
1119 .task_new = task_new_fair,
1120};
1121
1122#ifdef CONFIG_SCHED_DEBUG
1123void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
1124{
1125 struct rq *rq = cpu_rq(cpu);
1126 struct cfs_rq *cfs_rq;
1127
1128 for_each_leaf_cfs_rq(rq, cfs_rq)
1129 print_cfs_rq(m, cpu, cfs_rq, now);
1130}
1131#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
new file mode 100644
index 000000000000..41841e741c4a
--- /dev/null
+++ b/kernel/sched_idletask.c
@@ -0,0 +1,71 @@
1/*
2 * idle-task scheduling class.
3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c)
6 */
7
8/*
9 * Idle tasks are unconditionally rescheduled:
10 */
11static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
12{
13 resched_task(rq->idle);
14}
15
16static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
17{
18 schedstat_inc(rq, sched_goidle);
19
20 return rq->idle;
21}
22
23/*
24 * It is not legal to sleep in the idle task - print a warning
25 * message if some code attempts to do it:
26 */
27static void
28dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
29{
30 spin_unlock_irq(&rq->lock);
31 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
32 dump_stack();
33 spin_lock_irq(&rq->lock);
34}
35
36static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
37{
38}
39
40static int
41load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, unsigned long *total_load_moved)
45{
46 return 0;
47}
48
49static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{
51}
52
53/*
54 * Simple, special scheduling class for the per-CPU idle tasks:
55 */
56static struct sched_class idle_sched_class __read_mostly = {
57 /* no enqueue/yield_task for idle tasks */
58
59 /* dequeue is not valid, we print a debug message there: */
60 .dequeue_task = dequeue_task_idle,
61
62 .check_preempt_curr = check_preempt_curr_idle,
63
64 .pick_next_task = pick_next_task_idle,
65 .put_prev_task = put_prev_task_idle,
66
67 .load_balance = load_balance_idle,
68
69 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */
71};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
new file mode 100644
index 000000000000..1192a2741b99
--- /dev/null
+++ b/kernel/sched_rt.c
@@ -0,0 +1,255 @@
1/*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6/*
7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class.
9 */
10static inline void update_curr_rt(struct rq *rq, u64 now)
11{
12 struct task_struct *curr = rq->curr;
13 u64 delta_exec;
14
15 if (!task_has_rt_policy(curr))
16 return;
17
18 delta_exec = now - curr->se.exec_start;
19 if (unlikely((s64)delta_exec < 0))
20 delta_exec = 0;
21 if (unlikely(delta_exec > curr->se.exec_max))
22 curr->se.exec_max = delta_exec;
23
24 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = now;
26}
27
28static void
29enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
30{
31 struct rt_prio_array *array = &rq->rt.active;
32
33 list_add_tail(&p->run_list, array->queue + p->prio);
34 __set_bit(p->prio, array->bitmap);
35}
36
37/*
38 * Adding/removing a task to/from a priority array:
39 */
40static void
41dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
42{
43 struct rt_prio_array *array = &rq->rt.active;
44
45 update_curr_rt(rq, now);
46
47 list_del(&p->run_list);
48 if (list_empty(array->queue + p->prio))
49 __clear_bit(p->prio, array->bitmap);
50}
51
52/*
53 * Put task to the end of the run list without the overhead of dequeue
54 * followed by enqueue.
55 */
56static void requeue_task_rt(struct rq *rq, struct task_struct *p)
57{
58 struct rt_prio_array *array = &rq->rt.active;
59
60 list_move_tail(&p->run_list, array->queue + p->prio);
61}
62
63static void
64yield_task_rt(struct rq *rq, struct task_struct *p)
65{
66 requeue_task_rt(rq, p);
67}
68
69/*
70 * Preempt the current task with a newly woken task if needed:
71 */
72static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
73{
74 if (p->prio < rq->curr->prio)
75 resched_task(rq->curr);
76}
77
78static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
79{
80 struct rt_prio_array *array = &rq->rt.active;
81 struct task_struct *next;
82 struct list_head *queue;
83 int idx;
84
85 idx = sched_find_first_bit(array->bitmap);
86 if (idx >= MAX_RT_PRIO)
87 return NULL;
88
89 queue = array->queue + idx;
90 next = list_entry(queue->next, struct task_struct, run_list);
91
92 next->se.exec_start = now;
93
94 return next;
95}
96
97static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
98{
99 update_curr_rt(rq, now);
100 p->se.exec_start = 0;
101}
102
103/*
104 * Load-balancing iterator. Note: while the runqueue stays locked
105 * during the whole iteration, the current task might be
106 * dequeued so the iterator has to be dequeue-safe. Here we
107 * achieve that by always pre-iterating before returning
108 * the current task:
109 */
110static struct task_struct *load_balance_start_rt(void *arg)
111{
112 struct rq *rq = arg;
113 struct rt_prio_array *array = &rq->rt.active;
114 struct list_head *head, *curr;
115 struct task_struct *p;
116 int idx;
117
118 idx = sched_find_first_bit(array->bitmap);
119 if (idx >= MAX_RT_PRIO)
120 return NULL;
121
122 head = array->queue + idx;
123 curr = head->prev;
124
125 p = list_entry(curr, struct task_struct, run_list);
126
127 curr = curr->prev;
128
129 rq->rt.rt_load_balance_idx = idx;
130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr;
132
133 return p;
134}
135
136static struct task_struct *load_balance_next_rt(void *arg)
137{
138 struct rq *rq = arg;
139 struct rt_prio_array *array = &rq->rt.active;
140 struct list_head *head, *curr;
141 struct task_struct *p;
142 int idx;
143
144 idx = rq->rt.rt_load_balance_idx;
145 head = rq->rt.rt_load_balance_head;
146 curr = rq->rt.rt_load_balance_curr;
147
148 /*
149 * If we arrived back to the head again then
150 * iterate to the next queue (if any):
151 */
152 if (unlikely(head == curr)) {
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
154
155 if (next_idx >= MAX_RT_PRIO)
156 return NULL;
157
158 idx = next_idx;
159 head = array->queue + idx;
160 curr = head->prev;
161
162 rq->rt.rt_load_balance_idx = idx;
163 rq->rt.rt_load_balance_head = head;
164 }
165
166 p = list_entry(curr, struct task_struct, run_list);
167
168 curr = curr->prev;
169
170 rq->rt.rt_load_balance_curr = curr;
171
172 return p;
173}
174
175static int
176load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 unsigned long max_nr_move, unsigned long max_load_move,
178 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, unsigned long *load_moved)
180{
181 int this_best_prio, best_prio, best_prio_seen = 0;
182 int nr_moved;
183 struct rq_iterator rt_rq_iterator;
184
185 best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
186 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
187
188 /*
189 * Enable handling of the case where there is more than one task
190 * with the best priority. If the current running task is one
191 * of those with prio==best_prio we know it won't be moved
192 * and therefore it's safe to override the skip (based on load)
193 * of any task we find with that prio.
194 */
195 if (busiest->curr->prio == best_prio)
196 best_prio_seen = 1;
197
198 rt_rq_iterator.start = load_balance_start_rt;
199 rt_rq_iterator.next = load_balance_next_rt;
200 /* pass 'busiest' rq argument into
201 * load_balance_[start|next]_rt iterators
202 */
203 rt_rq_iterator.arg = busiest;
204
205 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
206 max_load_move, sd, idle, all_pinned, load_moved,
207 this_best_prio, best_prio, best_prio_seen,
208 &rt_rq_iterator);
209
210 return nr_moved;
211}
212
213static void task_tick_rt(struct rq *rq, struct task_struct *p)
214{
215 /*
216 * RR tasks need a special form of timeslice management.
217 * FIFO tasks have no timeslices.
218 */
219 if (p->policy != SCHED_RR)
220 return;
221
222 if (--p->time_slice)
223 return;
224
225 p->time_slice = static_prio_timeslice(p->static_prio);
226 set_tsk_need_resched(p);
227
228 /* put it at the end of the queue: */
229 requeue_task_rt(rq, p);
230}
231
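task_tick_rt() refills the quantum of SCHED_RR tasks from static_prio_timeslice(); SCHED_FIFO tasks never get one. From userspace the effective RR quantum of a task can be queried with the long-standing sched_rr_get_interval() call:

/* rr_quantum.c - query the SCHED_RR timeslice of a task (0 == self). */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <time.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;
	struct timespec ts;

	if (sched_rr_get_interval(pid, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("RR quantum: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}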
232/*
233 * No parent/child timeslice management necessary for RT tasks,
234 * just activate them:
235 */
236static void task_new_rt(struct rq *rq, struct task_struct *p)
237{
238 activate_task(rq, p, 1);
239}
240
241static struct sched_class rt_sched_class __read_mostly = {
242 .enqueue_task = enqueue_task_rt,
243 .dequeue_task = dequeue_task_rt,
244 .yield_task = yield_task_rt,
245
246 .check_preempt_curr = check_preempt_curr_rt,
247
248 .pick_next_task = pick_next_task_rt,
249 .put_prev_task = put_prev_task_rt,
250
251 .load_balance = load_balance_rt,
252
253 .task_tick = task_tick_rt,
254 .task_new = task_new_rt,
255};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
new file mode 100644
index 000000000000..c63c38f6fa6e
--- /dev/null
+++ b/kernel/sched_stats.h
@@ -0,0 +1,235 @@
1
2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 14
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12
13 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
14 seq_printf(seq, "timestamp %lu\n", jiffies);
15 for_each_online_cpu(cpu) {
16 struct rq *rq = cpu_rq(cpu);
17#ifdef CONFIG_SMP
18 struct sched_domain *sd;
19 int dcnt = 0;
20#endif
21
22 /* runqueue-specific stats */
23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
27 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
28 rq->ttwu_cnt, rq->ttwu_local,
29 rq->rq_sched_info.cpu_time,
30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
31
32 seq_printf(seq, "\n");
33
34#ifdef CONFIG_SMP
35 /* domain-specific stats */
36 preempt_disable();
37 for_each_domain(cpu, sd) {
38 enum cpu_idle_type itype;
39 char mask_str[NR_CPUS];
40
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
42 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
46 "%lu",
47 sd->lb_cnt[itype],
48 sd->lb_balanced[itype],
49 sd->lb_failed[itype],
50 sd->lb_imbalance[itype],
51 sd->lb_gained[itype],
52 sd->lb_hot_gained[itype],
53 sd->lb_nobusyq[itype],
54 sd->lb_nobusyg[itype]);
55 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
57 " %lu %lu %lu\n",
58 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
59 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
60 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
61 sd->ttwu_wake_remote, sd->ttwu_move_affine,
62 sd->ttwu_move_balance);
63 }
64 preempt_enable();
65#endif
66 }
67 return 0;
68}
69
70static int schedstat_open(struct inode *inode, struct file *file)
71{
72 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
73 char *buf = kmalloc(size, GFP_KERNEL);
74 struct seq_file *m;
75 int res;
76
77 if (!buf)
78 return -ENOMEM;
79 res = single_open(file, show_schedstat, NULL);
80 if (!res) {
81 m = file->private_data;
82 m->buf = buf;
83 m->size = size;
84 } else
85 kfree(buf);
86 return res;
87}
88
89const struct file_operations proc_schedstat_operations = {
90 .open = schedstat_open,
91 .read = seq_read,
92 .llseek = seq_lseek,
93 .release = single_release,
94};
95
96/*
97 * Expects runqueue lock to be held for atomicity of update
98 */
99static inline void
100rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
101{
102 if (rq) {
103 rq->rq_sched_info.run_delay += delta;
104 rq->rq_sched_info.pcnt++;
105 }
106}
107
108/*
109 * Expects runqueue lock to be held for atomicity of update
110 */
111static inline void
112rq_sched_info_depart(struct rq *rq, unsigned long long delta)
113{
114 if (rq)
115 rq->rq_sched_info.cpu_time += delta;
116}
117# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
118# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
119#else /* !CONFIG_SCHEDSTATS */
120static inline void
121rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
122{}
123static inline void
124rq_sched_info_depart(struct rq *rq, unsigned long long delta)
125{}
126# define schedstat_inc(rq, field) do { } while (0)
127# define schedstat_add(rq, field, amt) do { } while (0)
128#endif
129
130#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
131/*
132 * Called when a process is dequeued from the active array and given
133 * the cpu. We should note that with the exception of interactive
134 * tasks, the expired queue will become the active queue after the active
135 * queue is empty, without explicitly dequeuing and requeuing tasks in the
136 * expired queue. (Interactive tasks may be requeued directly to the
137 * active queue, thus delaying tasks in the expired queue from running;
138 * see scheduler_tick()).
139 *
140 * This function is only called from sched_info_arrive(), rather than
141 * dequeue_task(). Even though a task may be queued and dequeued multiple
142 * times as it is shuffled about, we're really interested in knowing how
143 * long it was from the *first* time it was queued to the time that it
144 * finally hit a cpu.
145 */
146static inline void sched_info_dequeued(struct task_struct *t)
147{
148 t->sched_info.last_queued = 0;
149}
150
151/*
152 * Called when a task finally hits the cpu. We can now calculate how
153 * long it was waiting to run. We also note when it began so that we
154 * can keep stats on how long its timeslice is.
155 */
156static void sched_info_arrive(struct task_struct *t)
157{
158 unsigned long long now = sched_clock(), delta = 0;
159
160 if (t->sched_info.last_queued)
161 delta = now - t->sched_info.last_queued;
162 sched_info_dequeued(t);
163 t->sched_info.run_delay += delta;
164 t->sched_info.last_arrival = now;
165 t->sched_info.pcnt++;
166
167 rq_sched_info_arrive(task_rq(t), delta);
168}
169
170/*
171 * Called when a process is queued into either the active or expired
 172 * array. The time is noted and later used to determine how long the task
 173 * had to wait to reach the cpu. Since the expired queue will
174 * become the active queue after active queue is empty, without dequeuing
175 * and requeuing any tasks, we are interested in queuing to either. It
176 * is unusual but not impossible for tasks to be dequeued and immediately
177 * requeued in the same or another array: this can happen in sched_yield(),
178 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
179 * to runqueue.
180 *
181 * This function is only called from enqueue_task(), but also only updates
 182 * the timestamp if it is not already set. It's assumed that
183 * sched_info_dequeued() will clear that stamp when appropriate.
184 */
185static inline void sched_info_queued(struct task_struct *t)
186{
187 if (unlikely(sched_info_on()))
188 if (!t->sched_info.last_queued)
189 t->sched_info.last_queued = sched_clock();
190}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily.  Now we can calculate how long we ran.
+ */
+static inline void sched_info_depart(struct task_struct *t)
+{
+	unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
+
+	t->sched_info.cpu_time += delta;
+	rq_sched_info_depart(task_rq(t), delta);
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice.  (This may also be called when switching to or from
+ * the idle task.)  We are only called when prev != next.
+ */
+static inline void
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+	struct rq *rq = task_rq(prev);
+
+	/*
+	 * prev now departs the cpu.  It's not interesting to record
+	 * stats about how efficient we were at scheduling the idle
+	 * process, however.
+	 */
+	if (prev != rq->idle)
+		sched_info_depart(prev);
+
+	if (next != rq->idle)
+		sched_info_arrive(next);
+}
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+	if (unlikely(sched_info_on()))
+		__sched_info_switch(prev, next);
+}
+#else
+#define sched_info_queued(t)		do { } while (0)
+#define sched_info_switch(t, next)	do { } while (0)
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+
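
The accounting above boils down to three per-task counters: run_delay (time spent runnable but waiting on a runqueue, measured from the *first* enqueue), cpu_time (time actually spent on a cpu), and pcnt (how many times the task was given a cpu). The user-space sketch below is not part of the patch; it mirrors the queued -> arrive -> depart update rules with a mock clock and a mock sched_info structure, purely to illustrate how the counters grow.

/* Illustrative user-space mock of the sched_info bookkeeping above.
 * mock_sched_info and clock_ns are stand-ins for the kernel's
 * task_struct::sched_info and sched_clock(); the update rules mirror
 * sched_info_queued()/sched_info_arrive()/sched_info_depart(). */
#include <stdio.h>

struct mock_sched_info {
	unsigned long long last_queued;		/* when first placed on a runqueue */
	unsigned long long last_arrival;	/* when it last got a cpu */
	unsigned long long run_delay;		/* total time runnable but waiting */
	unsigned long long cpu_time;		/* total time spent on a cpu */
	unsigned long pcnt;			/* number of times it ran */
};

static unsigned long long clock_ns;		/* stand-in for sched_clock() */

static void mock_info_queued(struct mock_sched_info *si)
{
	if (!si->last_queued)			/* only the *first* enqueue counts */
		si->last_queued = clock_ns;
}

static void mock_info_arrive(struct mock_sched_info *si)
{
	if (si->last_queued)
		si->run_delay += clock_ns - si->last_queued;
	si->last_queued = 0;			/* sched_info_dequeued() */
	si->last_arrival = clock_ns;
	si->pcnt++;
}

static void mock_info_depart(struct mock_sched_info *si)
{
	si->cpu_time += clock_ns - si->last_arrival;
}

int main(void)
{
	struct mock_sched_info si = { 0 };

	clock_ns = 1000; mock_info_queued(&si);	/* woken, starts waiting...     */
	clock_ns = 4000; mock_info_arrive(&si);	/* ...waited 3000 ns, now runs  */
	clock_ns = 9000; mock_info_depart(&si);	/* ...ran for 5000 ns           */

	printf("run_delay=%llu cpu_time=%llu pcnt=%lu\n",
	       si.run_delay, si.cpu_time, si.pcnt);
	return 0;
}

With this mock timeline the program prints run_delay=3000 cpu_time=5000 pcnt=1, i.e. the task waited 3000 ns on a runqueue and then ran for 5000 ns.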
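
For completeness, these per-task totals are what /proc/<pid>/schedstat exposes when schedstats or task delay accounting is enabled. The reader below assumes the conventional three whitespace-separated fields (cpu_time, run_delay, pcnt, the first two in sched_clock() nanoseconds); it is a sketch, and the layout should be checked against Documentation/sched-stats.txt on the target kernel.

/* Print a task's scheduler statistics from /proc/<pid>/schedstat.
 * The field layout (cpu_time run_delay pcnt) is an assumption based on
 * the schedstats documentation; adjust if your kernel differs. */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64];
	unsigned long long cpu_time, run_delay;
	unsigned long pcnt;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/schedstat",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%llu %llu %lu", &cpu_time, &run_delay, &pcnt) != 3) {
		fprintf(stderr, "unexpected format in %s\n", path);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("on-cpu time:   %llu ns\n", cpu_time);
	printf("runqueue wait: %llu ns\n", run_delay);
	printf("timeslices:    %lu\n", pcnt);
	return 0;
}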
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0b9886a00e74..73217a9e2875 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -488,7 +488,6 @@ void __init softirq_init(void)
 
 static int ksoftirqd(void * __bind_cpu)
 {
-	set_user_nice(current, 19);
 	current->flags |= PF_NOFREEZE;
 
 	set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 30ee462ee79f..51f5dac42a00 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -206,7 +206,87 @@ static ctl_table root_table[] = {
 	{ .ctl_name = 0 }
 };
 
+#ifdef CONFIG_SCHED_DEBUG
+static unsigned long min_sched_granularity_ns = 100000;		/* 100 usecs */
+static unsigned long max_sched_granularity_ns = 1000000000;	/* 1 second */
+static unsigned long min_wakeup_granularity_ns;			/* 0 usecs */
+static unsigned long max_wakeup_granularity_ns = 1000000000;	/* 1 second */
+#endif
+
 static ctl_table kern_table[] = {
+#ifdef CONFIG_SCHED_DEBUG
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_granularity_ns",
+		.data		= &sysctl_sched_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_wakeup_granularity_ns",
+		.data		= &sysctl_sched_wakeup_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_wakeup_granularity_ns,
+		.extra2		= &max_wakeup_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_batch_wakeup_granularity_ns",
+		.data		= &sysctl_sched_batch_wakeup_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_wakeup_granularity_ns,
+		.extra2		= &max_wakeup_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_stat_granularity_ns",
+		.data		= &sysctl_sched_stat_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_wakeup_granularity_ns,
+		.extra2		= &max_wakeup_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_runtime_limit_ns",
+		.data		= &sysctl_sched_runtime_limit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_child_runs_first",
+		.data		= &sysctl_sched_child_runs_first,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_features",
+		.data		= &sysctl_sched_features,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.ctl_name	= KERN_PANIC,
 		.procname	= "panic",
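
All of the new entries use CTL_UNNUMBERED, so they are reachable only through procfs, under /proc/sys/kernel/ (the file name is the .procname field), and the proc_dointvec_minmax handlers clamp writes to the bounds wired in via extra1/extra2. Below is a sketch of reading and nudging one of the knobs from user space; it assumes a kernel built with CONFIG_SCHED_DEBUG=y and root privileges for the write, and is an illustration rather than part of the patch.

/* Read and update /proc/sys/kernel/sched_granularity_ns.
 * The path assumes CONFIG_SCHED_DEBUG=y; writes are clamped by the
 * kernel to the bounds in the table above (100000 .. 1000000000 ns
 * for this particular knob). */
#include <stdio.h>

static const char *knob = "/proc/sys/kernel/sched_granularity_ns";

int main(void)
{
	unsigned int cur;
	FILE *f = fopen(knob, "r");

	if (!f || fscanf(f, "%u", &cur) != 1) {
		perror(knob);
		return 1;
	}
	fclose(f);
	printf("current granularity: %u ns\n", cur);

	/* Double it, staying within the table's documented upper bound. */
	f = fopen(knob, "w");
	if (!f) {
		perror(knob);		/* typically requires root */
		return 1;
	}
	fprintf(f, "%u\n", cur * 2 > 1000000000u ? 1000000000u : cur * 2);
	fclose(f);
	return 0;
}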
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index da95e10cfd70..fab32a286371 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -105,6 +105,15 @@ config DETECT_SOFTLOCKUP
 	  can be detected via the NMI-watchdog, on platforms that
 	  support it.)
 
+config SCHED_DEBUG
+	bool "Collect scheduler debugging info"
+	depends on DEBUG_KERNEL && PROC_FS
+	default y
+	help
+	  If you say Y here, the /proc/sched_debug file will be provided
+	  that can help debug the scheduler. The runtime overhead of this
+	  option is minimal.
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
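
The /proc/sched_debug file added by the new SCHED_DEBUG option is plain text, so inspecting it needs nothing more than an ordinary read. The sketch below simply copies the file to stdout and makes no assumptions about its layout; it is illustrative only.

/* Dump /proc/sched_debug (present when CONFIG_SCHED_DEBUG=y). */
#include <stdio.h>

int main(void)
{
	char buf[4096];
	size_t n;
	FILE *f = fopen("/proc/sched_debug", "r");

	if (!f) {
		perror("/proc/sched_debug");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
	return 0;
}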