-rw-r--r--  Documentation/networking/ip-sysctl.txt | 256
-rw-r--r--  Documentation/scheduler/sched-domains.txt | 7
-rw-r--r--  Documentation/scheduler/sched-rt-group.txt | 4
-rw-r--r--  MAINTAINERS | 4
-rw-r--r--  arch/x86/mm/ioremap.c | 25
-rw-r--r--  block/bsg.c | 3
-rw-r--r--  drivers/ata/libata-acpi.c | 19
-rw-r--r--  drivers/ata/pata_sis.c | 1
-rw-r--r--  drivers/char/ipmi/ipmi_watchdog.c | 3
-rw-r--r--  drivers/char/pcmcia/ipwireless/hardware.c | 4
-rw-r--r--  drivers/char/rtc.c | 3
-rw-r--r--  drivers/char/tpm/tpm_tis.c | 1
-rw-r--r--  drivers/isdn/i4l/isdn_common.c | 4
-rw-r--r--  drivers/media/video/ov7670.c | 4
-rw-r--r--  drivers/message/fusion/mptbase.c | 11
-rw-r--r--  drivers/message/fusion/mptspi.c | 9
-rw-r--r--  drivers/net/irda/nsc-ircc.c | 1
-rw-r--r--  drivers/net/irda/via-ircc.c | 3
-rw-r--r--  drivers/net/tun.c | 6
-rw-r--r--  drivers/net/wireless/hostap/hostap_cs.c | 12
-rw-r--r--  drivers/net/wireless/iwlwifi/iwl-3945.c | 4
-rw-r--r--  drivers/net/wireless/libertas/scan.c | 4
-rw-r--r--  drivers/net/wireless/rt2x00/rt2400pci.c | 11
-rw-r--r--  drivers/net/wireless/rt2x00/rt2500pci.c | 11
-rw-r--r--  drivers/net/wireless/rt2x00/rt2500usb.c | 7
-rw-r--r--  drivers/net/wireless/rt2x00/rt61pci.c | 9
-rw-r--r--  drivers/net/wireless/rt2x00/rt73usb.c | 9
-rw-r--r--  drivers/net/wireless/zd1211rw/zd_mac.c | 1
-rw-r--r--  drivers/net/wireless/zd1211rw/zd_usb.c | 1
-rw-r--r--  drivers/rapidio/rio-driver.c | 2
-rw-r--r--  drivers/rtc/rtc-fm3130.c | 2
-rw-r--r--  drivers/rtc/rtc-pcf8563.c | 1
-rw-r--r--  drivers/scsi/ipr.c | 6
-rw-r--r--  drivers/scsi/scsi_lib.c | 9
-rw-r--r--  drivers/serial/8250.c | 3
-rw-r--r--  drivers/ssb/driver_pcicore.c | 7
-rw-r--r--  drivers/usb/host/ohci-au1xxx.c | 2
-rw-r--r--  drivers/usb/host/ohci-lh7a404.c | 2
-rw-r--r--  drivers/usb/host/ohci-s3c2410.c | 2
-rw-r--r--  drivers/usb/host/ohci-sa1111.c | 2
-rw-r--r--  drivers/video/fb_defio.c | 20
-rw-r--r--  fs/cifs/cifsacl.c | 10
-rw-r--r--  fs/cifs/inode.c | 20
-rw-r--r--  fs/exec.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 14
-rw-r--r--  fs/xfs/xfs_log.c | 15
-rw-r--r--  include/asm-avr32/setup.h | 2
-rw-r--r--  include/asm-frv/system.h | 2
-rw-r--r--  include/asm-x86/desc.h | 4
-rw-r--r--  include/linux/sched.h | 42
-rw-r--r--  include/linux/xfrm.h | 1
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/cpu.c | 24
-rw-r--r--  kernel/cpuset.c | 14
-rw-r--r--  kernel/kthread.c | 1
-rw-r--r--  kernel/sched.c | 730
-rw-r--r--  kernel/sched_clock.c | 13
-rw-r--r--  kernel/sched_cpupri.c | 174
-rw-r--r--  kernel/sched_cpupri.h | 36
-rw-r--r--  kernel/sched_debug.c | 64
-rw-r--r--  kernel/sched_fair.c | 413
-rw-r--r--  kernel/sched_features.h | 7
-rw-r--r--  kernel/sched_rt.c | 405
-rw-r--r--  kernel/sched_stats.h | 42
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  mm/slub.c | 4
-rw-r--r--  net/ipv4/fib_trie.c | 17
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 2
-rw-r--r--  net/ipv4/tcp_probe.c | 2
-rw-r--r--  net/ipv6/addrconf.c | 4
-rw-r--r--  net/ipv6/exthdrs.c | 2
-rw-r--r--  net/irda/irnetlink.c | 4
-rw-r--r--  net/mac80211/main.c | 4
-rw-r--r--  net/mac80211/mlme.c | 13
-rw-r--r--  net/mac80211/rc80211_pid.h | 5
-rw-r--r--  net/mac80211/rc80211_pid_algo.c | 31
-rw-r--r--  net/netfilter/nf_conntrack_proto_tcp.c | 10
-rw-r--r--  net/netlabel/netlabel_cipso_v4.c | 7
-rw-r--r--  net/netlabel/netlabel_mgmt.c | 12
-rw-r--r--  net/netlabel/netlabel_unlabeled.c | 6
-rw-r--r--  net/sctp/sm_statefuns.c | 9
-rw-r--r--  net/sctp/ulpevent.c | 5
-rw-r--r--  net/xfrm/xfrm_user.c | 3
83 files changed, 1944 insertions, 739 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 17f1f91af35c..946b66e1b652 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -148,9 +148,9 @@ tcp_available_congestion_control - STRING
148 but not loaded. 148 but not loaded.
149 149
150tcp_base_mss - INTEGER 150tcp_base_mss - INTEGER
151 The initial value of search_low to be used by Packetization Layer 151 The initial value of search_low to be used by the packetization layer
152 Path MTU Discovery (MTU probing). If MTU probing is enabled, 152 Path MTU discovery (MTU probing). If MTU probing is enabled,
153 this is the inital MSS used by the connection. 153 this is the initial MSS used by the connection.
154 154
155tcp_congestion_control - STRING 155tcp_congestion_control - STRING
156 Set the congestion control algorithm to be used for new 156 Set the congestion control algorithm to be used for new
@@ -185,10 +185,9 @@ tcp_frto - INTEGER
185 timeouts. It is particularly beneficial in wireless environments 185 timeouts. It is particularly beneficial in wireless environments
186 where packet loss is typically due to random radio interference 186 where packet loss is typically due to random radio interference
187 rather than intermediate router congestion. F-RTO is sender-side 187 rather than intermediate router congestion. F-RTO is sender-side
188 only modification. Therefore it does not require any support from 188 only modification. Therefore it does not require any support from
189 the peer, but in a typical case, however, where wireless link is 189 the peer.
190 the local access link and most of the data flows downlink, the 190
191 faraway servers should have F-RTO enabled to take advantage of it.
192 If set to 1, basic version is enabled. 2 enables SACK enhanced 191 If set to 1, basic version is enabled. 2 enables SACK enhanced
193 F-RTO if flow uses SACK. The basic version can be used also when 192 F-RTO if flow uses SACK. The basic version can be used also when
194 SACK is in use though scenario(s) with it exists where F-RTO 193 SACK is in use though scenario(s) with it exists where F-RTO
@@ -276,7 +275,7 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
276 memory. 275 memory.
277 276
278tcp_moderate_rcvbuf - BOOLEAN 277tcp_moderate_rcvbuf - BOOLEAN
279 If set, TCP performs receive buffer autotuning, attempting to 278 If set, TCP performs receive buffer auto-tuning, attempting to
280 automatically size the buffer (no greater than tcp_rmem[2]) to 279 automatically size the buffer (no greater than tcp_rmem[2]) to
281 match the size required by the path for full throughput. Enabled by 280 match the size required by the path for full throughput. Enabled by
282 default. 281 default.
@@ -336,7 +335,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
336 pressure. 335 pressure.
337 Default: 8K 336 Default: 8K
338 337
339 default: default size of receive buffer used by TCP sockets. 338 default: initial size of receive buffer used by TCP sockets.
340 This value overrides net.core.rmem_default used by other protocols. 339 This value overrides net.core.rmem_default used by other protocols.
341 Default: 87380 bytes. This value results in window of 65535 with 340 Default: 87380 bytes. This value results in window of 65535 with
342 default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit 341 default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit
@@ -344,8 +343,10 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
344 343
345 max: maximal size of receive buffer allowed for automatically 344 max: maximal size of receive buffer allowed for automatically
346 selected receiver buffers for TCP socket. This value does not override 345 selected receiver buffers for TCP socket. This value does not override
347 net.core.rmem_max, "static" selection via SO_RCVBUF does not use this. 346 net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables
348 Default: 87380*2 bytes. 347 automatic tuning of that socket's receive buffer size, in which
348 case this value is ignored.
349 Default: between 87380B and 4MB, depending on RAM size.
349 350
350tcp_sack - BOOLEAN 351tcp_sack - BOOLEAN
351 Enable select acknowledgments (SACKS). 352 Enable select acknowledgments (SACKS).
@@ -358,7 +359,7 @@ tcp_slow_start_after_idle - BOOLEAN
358 Default: 1 359 Default: 1
359 360
360tcp_stdurg - BOOLEAN 361tcp_stdurg - BOOLEAN
361 Use the Host requirements interpretation of the TCP urg pointer field. 362 Use the Host requirements interpretation of the TCP urgent pointer field.
362 Most hosts use the older BSD interpretation, so if you turn this on 363 Most hosts use the older BSD interpretation, so if you turn this on
363 Linux might not communicate correctly with them. 364 Linux might not communicate correctly with them.
364 Default: FALSE 365 Default: FALSE
@@ -371,12 +372,12 @@ tcp_synack_retries - INTEGER
371tcp_syncookies - BOOLEAN 372tcp_syncookies - BOOLEAN
372 Only valid when the kernel was compiled with CONFIG_SYNCOOKIES 373 Only valid when the kernel was compiled with CONFIG_SYNCOOKIES
373 Send out syncookies when the syn backlog queue of a socket 374 Send out syncookies when the syn backlog queue of a socket
374 overflows. This is to prevent against the common 'syn flood attack' 375 overflows. This is to prevent against the common 'SYN flood attack'
375 Default: FALSE 376 Default: FALSE
376 377
377 Note, that syncookies is fallback facility. 378 Note, that syncookies is fallback facility.
378 It MUST NOT be used to help highly loaded servers to stand 379 It MUST NOT be used to help highly loaded servers to stand
379 against legal connection rate. If you see synflood warnings 380 against legal connection rate. If you see SYN flood warnings
380 in your logs, but investigation shows that they occur 381 in your logs, but investigation shows that they occur
381 because of overload with legal connections, you should tune 382 because of overload with legal connections, you should tune
382 another parameters until this warning disappear. 383 another parameters until this warning disappear.
@@ -386,7 +387,7 @@ tcp_syncookies - BOOLEAN
386 to use TCP extensions, can result in serious degradation 387 to use TCP extensions, can result in serious degradation
387 of some services (f.e. SMTP relaying), visible not by you, 388 of some services (f.e. SMTP relaying), visible not by you,
388 but your clients and relays, contacting you. While you see 389 but your clients and relays, contacting you. While you see
389 synflood warnings in logs not being really flooded, your server 390 SYN flood warnings in logs not being really flooded, your server
390 is seriously misconfigured. 391 is seriously misconfigured.
391 392
392tcp_syn_retries - INTEGER 393tcp_syn_retries - INTEGER
@@ -419,19 +420,21 @@ tcp_window_scaling - BOOLEAN
419 Enable window scaling as defined in RFC1323. 420 Enable window scaling as defined in RFC1323.
420 421
421tcp_wmem - vector of 3 INTEGERs: min, default, max 422tcp_wmem - vector of 3 INTEGERs: min, default, max
422 min: Amount of memory reserved for send buffers for TCP socket. 423 min: Amount of memory reserved for send buffers for TCP sockets.
423 Each TCP socket has rights to use it due to fact of its birth. 424 Each TCP socket has rights to use it due to fact of its birth.
424 Default: 4K 425 Default: 4K
425 426
426 default: Amount of memory allowed for send buffers for TCP socket 427 default: initial size of send buffer used by TCP sockets. This
427 by default. This value overrides net.core.wmem_default used 428 value overrides net.core.wmem_default used by other protocols.
428 by other protocols, it is usually lower than net.core.wmem_default. 429 It is usually lower than net.core.wmem_default.
429 Default: 16K 430 Default: 16K
430 431
431 max: Maximal amount of memory allowed for automatically selected 432 max: Maximal amount of memory allowed for automatically tuned
432 send buffers for TCP socket. This value does not override 433 send buffers for TCP sockets. This value does not override
433 net.core.wmem_max, "static" selection via SO_SNDBUF does not use this. 434 net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables
434 Default: 128K 435 automatic tuning of that socket's send buffer size, in which case
436 this value is ignored.
437 Default: between 64K and 4MB, depending on RAM size.
435 438
436tcp_workaround_signed_windows - BOOLEAN 439tcp_workaround_signed_windows - BOOLEAN
437 If set, assume no receipt of a window scaling option means the 440 If set, assume no receipt of a window scaling option means the
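The tcp_rmem/tcp_wmem entries above note that calling setsockopt() with SO_RCVBUF or SO_SNDBUF switches off receive/send buffer auto-tuning for that socket. A minimal userspace sketch of that trade-off (illustrative only, not part of this patch; the 256 KB size is an arbitrary example):

    /* Pinning SO_RCVBUF opts this socket out of tcp_rmem auto-tuning. */
    #include <stdio.h>
    #include <sys/socket.h>

    int main(void)
    {
            int fd = socket(AF_INET, SOCK_STREAM, 0);
            int rcvbuf = 256 * 1024;        /* arbitrary example size */

            if (fd < 0) {
                    perror("socket");
                    return 1;
            }
            /* From here on the kernel no longer grows the buffer
             * automatically; tcp_rmem[2] is ignored for this socket. */
            if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf,
                           sizeof(rcvbuf)) < 0)
                    perror("setsockopt(SO_RCVBUF)");
            return 0;
    }
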
@@ -1060,24 +1063,193 @@ bridge-nf-filter-pppoe-tagged - BOOLEAN
1060 Default: 1 1063 Default: 1
1061 1064
1062 1065
1063UNDOCUMENTED: 1066proc/sys/net/sctp/* Variables:
1067
1068addip_enable - BOOLEAN
1069 Enable or disable extension of Dynamic Address Reconfiguration
1070 (ADD-IP) functionality specified in RFC5061. This extension provides
1071 the ability to dynamically add and remove new addresses for the SCTP
1072 associations.
1073
1074 1: Enable extension.
1075
1076 0: Disable extension.
1077
1078 Default: 0
1079
1080addip_noauth_enable - BOOLEAN
1081 Dynamic Address Reconfiguration (ADD-IP) requires the use of
1082 authentication to protect the operations of adding or removing new
1083 addresses. This requirement is mandated so that unauthorized hosts
1084 would not be able to hijack associations. However, older
1085 implementations may not have implemented this requirement while
1086 allowing the ADD-IP extension. For reasons of interoperability,
1087 we provide this variable to control the enforcement of the
1088 authentication requirement.
1089
1090 1: Allow ADD-IP extension to be used without authentication. This
1091 should only be set in a closed environment for interoperability
1092 with older implementations.
1093
1094 0: Enforce the authentication requirement
1095
1096 Default: 0
1097
1098auth_enable - BOOLEAN
1099 Enable or disable Authenticated Chunks extension. This extension
1100 provides the ability to send and receive authenticated chunks and is
1101 required for secure operation of Dynamic Address Reconfiguration
1102 (ADD-IP) extension.
1103
1104 1: Enable this extension.
1105 0: Disable this extension.
1106
1107 Default: 0
1108
1109prsctp_enable - BOOLEAN
1110 Enable or disable the Partial Reliability extension (RFC3758) which
1111 is used to notify peers that a given DATA should no longer be expected.
1112
1113 1: Enable extension
1114 0: Disable
1115
1116 Default: 1
1117
1118max_burst - INTEGER
1119 The limit of the number of new packets that can be initially sent. It
1120 controls how bursty the generated traffic can be.
1121
1122 Default: 4
1123
1124association_max_retrans - INTEGER
1125 Set the maximum number for retransmissions that an association can
1126 attempt deciding that the remote end is unreachable. If this value
1127 is exceeded, the association is terminated.
1128
1129 Default: 10
1130
1131max_init_retransmits - INTEGER
1132 The maximum number of retransmissions of INIT and COOKIE-ECHO chunks
1133 that an association will attempt before declaring the destination
1134 unreachable and terminating.
1135
1136 Default: 8
1137
1138path_max_retrans - INTEGER
1139 The maximum number of retransmissions that will be attempted on a given
1140 path. Once this threshold is exceeded, the path is considered
1141 unreachable, and new traffic will use a different path when the
1142 association is multihomed.
1143
1144 Default: 5
1145
1146rto_initial - INTEGER
1147 The initial round trip timeout value in milliseconds that will be used
1148 in calculating round trip times. This is the initial time interval
1149 for retransmissions.
1150
1151 Default: 3000
1064 1152
1065dev_weight FIXME 1153rto_max - INTEGER
1066discovery_slots FIXME 1154 The maximum value (in milliseconds) of the round trip timeout. This
1067discovery_timeout FIXME 1155 is the largest time interval that can elapse between retransmissions.
1068fast_poll_increase FIXME 1156
1069ip6_queue_maxlen FIXME 1157 Default: 60000
1070lap_keepalive_time FIXME 1158
1071lo_cong FIXME 1159rto_min - INTEGER
1072max_baud_rate FIXME 1160 The minimum value (in milliseconds) of the round trip timeout. This
1073max_dgram_qlen FIXME 1161 is the smallest time interval the can elapse between retransmissions.
1074max_noreply_time FIXME 1162
1075max_tx_data_size FIXME 1163 Default: 1000
1076max_tx_window FIXME 1164
1077min_tx_turn_time FIXME 1165hb_interval - INTEGER
1078mod_cong FIXME 1166 The interval (in milliseconds) between HEARTBEAT chunks. These chunks
1079no_cong FIXME 1167 are sent at the specified interval on idle paths to probe the state of
1080no_cong_thresh FIXME 1168 a given path between 2 associations.
1081slot_timeout FIXME 1169
1082warn_noreply_time FIXME 1170 Default: 30000
1171
1172sack_timeout - INTEGER
1173 The amount of time (in milliseconds) that the implementation will wait
1174 to send a SACK.
1175
1176 Default: 200
1177
1178valid_cookie_life - INTEGER
1179 The default lifetime of the SCTP cookie (in milliseconds). The cookie
1180 is used during association establishment.
1181
1182 Default: 60000
1183
1184cookie_preserve_enable - BOOLEAN
1185 Enable or disable the ability to extend the lifetime of the SCTP cookie
1186 that is used during the establishment phase of SCTP association
1187
1188 1: Enable cookie lifetime extension.
1189 0: Disable
1190
1191 Default: 1
1192
1193rcvbuf_policy - INTEGER
1194 Determines if the receive buffer is attributed to the socket or to
1195 association. SCTP supports the capability to create multiple
1196 associations on a single socket. When using this capability, it is
1197 possible that a single stalled association that's buffering a lot
1198 of data may block other associations from delivering their data by
1199 consuming all of the receive buffer space. To work around this,
1200 the rcvbuf_policy could be set to attribute the receiver buffer space
1201 to each association instead of the socket. This prevents the described
1202 blocking.
1203
1204 1: rcvbuf space is per association
1205 0: recbuf space is per socket
1206
1207 Default: 0
1208
1209sndbuf_policy - INTEGER
1210 Similar to rcvbuf_policy above, this applies to send buffer space.
1211
1212 1: Send buffer is tracked per association
1213 0: Send buffer is tracked per socket.
1214
1215 Default: 0
1216
1217sctp_mem - vector of 3 INTEGERs: min, pressure, max
1218 Number of pages allowed for queueing by all SCTP sockets.
1219
1220 min: Below this number of pages SCTP is not bothered about its
1221 memory appetite. When amount of memory allocated by SCTP exceeds
1222 this number, SCTP starts to moderate memory usage.
1223
1224 pressure: This value was introduced to follow format of tcp_mem.
1225
1226 max: Number of pages allowed for queueing by all SCTP sockets.
1227
1228 Default is calculated at boot time from amount of available memory.
1229
1230sctp_rmem - vector of 3 INTEGERs: min, default, max
1231 See tcp_rmem for a description.
1232
1233sctp_wmem - vector of 3 INTEGERs: min, default, max
1234 See tcp_wmem for a description.
1235
1236UNDOCUMENTED:
1083 1237
1238/proc/sys/net/core/*
1239 dev_weight FIXME
1240
1241/proc/sys/net/unix/*
1242 max_dgram_qlen FIXME
1243
1244/proc/sys/net/irda/*
1245 fast_poll_increase FIXME
1246 warn_noreply_time FIXME
1247 discovery_slots FIXME
1248 slot_timeout FIXME
1249 max_baud_rate FIXME
1250 discovery_timeout FIXME
1251 lap_keepalive_time FIXME
1252 max_noreply_time FIXME
1253 max_tx_data_size FIXME
1254 max_tx_window FIXME
1255 min_tx_turn_time FIXME
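
The SCTP section added above documents tunables that live under /proc/sys/net/sctp/. As a hedged illustration (a sketch, not part of the patch), a small C helper that enables the Authenticated Chunks and ADD-IP extensions by writing to those files; the paths come straight from the text above and error handling is kept minimal:

    /* Sketch: enable SCTP AUTH and ADD-IP via /proc/sys/net/sctp/. */
    #include <stdio.h>

    static int write_sysctl(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return -1;
            }
            fputs(val, f);
            return fclose(f);
    }

    int main(void)
    {
            /* ADD-IP is only safe with authentication, so enable both. */
            write_sysctl("/proc/sys/net/sctp/auth_enable", "1");
            write_sysctl("/proc/sys/net/sctp/addip_enable", "1");
            return 0;
    }
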
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
index a9e990ab980f..373ceacc367e 100644
--- a/Documentation/scheduler/sched-domains.txt
+++ b/Documentation/scheduler/sched-domains.txt
@@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
61arch_init_sched_domains function. This function will attach domains to all 61arch_init_sched_domains function. This function will attach domains to all
62CPUs using cpu_attach_domain. 62CPUs using cpu_attach_domain.
63 63
64Implementors should change the line 64The sched-domains debugging infrastructure can be enabled by enabling
65#undef SCHED_DOMAIN_DEBUG 65CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
66to
67#define SCHED_DOMAIN_DEBUG
68in kernel/sched.c as this enables an error checking parse of the sched domains
69which should catch most possible errors (described above). It also prints out 66which should catch most possible errors (described above). It also prints out
70the domain structure in a visual format. 67the domain structure in a visual format.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 14f901f639ee..3ef339f491e0 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
510.00015s. So this group can be scheduled with a period of 0.005s and a run time 510.00015s. So this group can be scheduled with a period of 0.005s and a run time
52of 0.00015s. 52of 0.00015s.
53 53
54The remaining CPU time will be used for user input and other tass. Because 54The remaining CPU time will be used for user input and other tasks. Because
55realtime tasks have explicitly allocated the CPU time they need to perform 55realtime tasks have explicitly allocated the CPU time they need to perform
56their tasks, buffer underruns in the graphocs or audio can be eliminated. 56their tasks, buffer underruns in the graphics or audio can be eliminated.
57 57
58NOTE: the above example is not fully implemented as of yet (2.6.25). We still 58NOTE: the above example is not fully implemented as of yet (2.6.25). We still
59lack an EDF scheduler to make non-uniform periods usable. 59lack an EDF scheduler to make non-uniform periods usable.
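
The example above (a group that needs 3% of the CPU over a 0.005 s period, i.e. 150 us of runtime every 5000 us) maps directly onto the RT group scheduling knobs. A hedged sketch, assuming a task group mounted at the hypothetical path /cgroup/audio with the usual cpu.rt_period_us and cpu.rt_runtime_us files:

    /* Sketch: give an "audio" group 150us of RT time every 5000us. */
    #include <stdio.h>

    static void set_knob(const char *path, long us)
    {
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return;
            }
            fprintf(f, "%ld\n", us);
            fclose(f);
    }

    int main(void)
    {
            /* 0.03 * 5000us = 150us, matching the document's arithmetic. */
            set_knob("/cgroup/audio/cpu.rt_period_us", 5000);
            set_knob("/cgroup/audio/cpu.rt_runtime_us", 150);
            return 0;
    }
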
diff --git a/MAINTAINERS b/MAINTAINERS
index 6476125363e0..56a2f678019e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3082,8 +3082,8 @@ L: linux-scsi@vger.kernel.org
3082S: Maintained 3082S: Maintained
3083 3083
3084OPROFILE 3084OPROFILE
3085P: Philippe Elie 3085P: Robert Richter
3086M: phil.el@wanadoo.fr 3086M: robert.richter@amd.com
3087L: oprofile-list@lists.sf.net 3087L: oprofile-list@lists.sf.net
3088S: Maintained 3088S: Maintained
3089 3089
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2b2bb3f9b683..d1b867101e5f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -300,6 +300,29 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
300} 300}
301EXPORT_SYMBOL(ioremap_cache); 301EXPORT_SYMBOL(ioremap_cache);
302 302
303static void __iomem *ioremap_default(resource_size_t phys_addr,
304 unsigned long size)
305{
306 unsigned long flags;
307 void *ret;
308 int err;
309
310 /*
311 * - WB for WB-able memory and no other conflicting mappings
312 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
313 * - Inherit from confliting mappings otherwise
314 */
315 err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
316 if (err < 0)
317 return NULL;
318
319 ret = (void *) __ioremap_caller(phys_addr, size, flags,
320 __builtin_return_address(0));
321
322 free_memtype(phys_addr, phys_addr + size);
323 return (void __iomem *)ret;
324}
325
303/** 326/**
304 * iounmap - Free a IO remapping 327 * iounmap - Free a IO remapping
305 * @addr: virtual address from ioremap_* 328 * @addr: virtual address from ioremap_*
@@ -365,7 +388,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
365 if (page_is_ram(start >> PAGE_SHIFT)) 388 if (page_is_ram(start >> PAGE_SHIFT))
366 return __va(phys); 389 return __va(phys);
367 390
368 addr = (void *)ioremap(start, PAGE_SIZE); 391 addr = (void *)ioremap_default(start, PAGE_SIZE);
369 if (addr) 392 if (addr)
370 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 393 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
371 394
diff --git a/block/bsg.c b/block/bsg.c
index f0b7cd343216..54d617f7df3e 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -709,11 +709,12 @@ static void bsg_kref_release_function(struct kref *kref)
709{ 709{
710 struct bsg_class_device *bcd = 710 struct bsg_class_device *bcd =
711 container_of(kref, struct bsg_class_device, ref); 711 container_of(kref, struct bsg_class_device, ref);
712 struct device *parent = bcd->parent;
712 713
713 if (bcd->release) 714 if (bcd->release)
714 bcd->release(bcd->parent); 715 bcd->release(bcd->parent);
715 716
716 put_device(bcd->parent); 717 put_device(parent);
717} 718}
718 719
719static int bsg_put_device(struct bsg_device *bd) 720static int bsg_put_device(struct bsg_device *bd)
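
The bsg.c change above saves bcd->parent into a local variable before invoking the release callback, because the callback may free the structure that embeds bcd, making the later bcd->parent read a use-after-free. A self-contained sketch of the same pattern with hypothetical types (not the kernel code):

    struct parent;                              /* opaque, refcounted */

    struct child {
            struct parent *parent;
            void (*release)(struct parent *);   /* may free the child */
    };

    static void put_parent(struct parent *p)
    {
            (void)p;                            /* drop a reference in real code */
    }

    static void child_teardown(struct child *c)
    {
            /* Save what we need first: release() may free 'c' itself. */
            struct parent *parent = c->parent;

            if (c->release)
                    c->release(parent);

            put_parent(parent);                 /* 'c' must not be touched here */
    }
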
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index 3ff8b14420d9..9330b7922f62 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -29,14 +29,16 @@
29enum { 29enum {
30 ATA_ACPI_FILTER_SETXFER = 1 << 0, 30 ATA_ACPI_FILTER_SETXFER = 1 << 0,
31 ATA_ACPI_FILTER_LOCK = 1 << 1, 31 ATA_ACPI_FILTER_LOCK = 1 << 1,
32 ATA_ACPI_FILTER_DIPM = 1 << 2,
32 33
33 ATA_ACPI_FILTER_DEFAULT = ATA_ACPI_FILTER_SETXFER | 34 ATA_ACPI_FILTER_DEFAULT = ATA_ACPI_FILTER_SETXFER |
34 ATA_ACPI_FILTER_LOCK, 35 ATA_ACPI_FILTER_LOCK |
36 ATA_ACPI_FILTER_DIPM,
35}; 37};
36 38
37static unsigned int ata_acpi_gtf_filter = ATA_ACPI_FILTER_DEFAULT; 39static unsigned int ata_acpi_gtf_filter = ATA_ACPI_FILTER_DEFAULT;
38module_param_named(acpi_gtf_filter, ata_acpi_gtf_filter, int, 0644); 40module_param_named(acpi_gtf_filter, ata_acpi_gtf_filter, int, 0644);
39MODULE_PARM_DESC(acpi_gtf_filter, "filter mask for ACPI _GTF commands, set to filter out (0x1=set xfermode, 0x2=lock/freeze lock)"); 41MODULE_PARM_DESC(acpi_gtf_filter, "filter mask for ACPI _GTF commands, set to filter out (0x1=set xfermode, 0x2=lock/freeze lock, 0x4=DIPM)");
40 42
41#define NO_PORT_MULT 0xffff 43#define NO_PORT_MULT 0xffff
42#define SATA_ADR(root, pmp) (((root) << 16) | (pmp)) 44#define SATA_ADR(root, pmp) (((root) << 16) | (pmp))
@@ -195,6 +197,10 @@ static void ata_acpi_handle_hotplug(struct ata_port *ap, struct ata_device *dev,
195 /* This device does not support hotplug */ 197 /* This device does not support hotplug */
196 return; 198 return;
197 199
200 if (event == ACPI_NOTIFY_BUS_CHECK ||
201 event == ACPI_NOTIFY_DEVICE_CHECK)
202 status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
203
198 spin_lock_irqsave(ap->lock, flags); 204 spin_lock_irqsave(ap->lock, flags);
199 205
200 switch (event) { 206 switch (event) {
@@ -202,7 +208,6 @@ static void ata_acpi_handle_hotplug(struct ata_port *ap, struct ata_device *dev,
202 case ACPI_NOTIFY_DEVICE_CHECK: 208 case ACPI_NOTIFY_DEVICE_CHECK:
203 ata_ehi_push_desc(ehi, "ACPI event"); 209 ata_ehi_push_desc(ehi, "ACPI event");
204 210
205 status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
206 if (ACPI_FAILURE(status)) { 211 if (ACPI_FAILURE(status)) {
207 ata_port_printk(ap, KERN_ERR, 212 ata_port_printk(ap, KERN_ERR,
208 "acpi: failed to determine bay status (0x%x)\n", 213 "acpi: failed to determine bay status (0x%x)\n",
@@ -690,6 +695,14 @@ static int ata_acpi_filter_tf(const struct ata_taskfile *tf,
690 return 1; 695 return 1;
691 } 696 }
692 697
698 if (ata_acpi_gtf_filter & ATA_ACPI_FILTER_DIPM) {
699 /* inhibit enabling DIPM */
700 if (tf->command == ATA_CMD_SET_FEATURES &&
701 tf->feature == SETFEATURES_SATA_ENABLE &&
702 tf->nsect == SATA_DIPM)
703 return 1;
704 }
705
693 return 0; 706 return 0;
694} 707}
695 708
diff --git a/drivers/ata/pata_sis.c b/drivers/ata/pata_sis.c
index e82c66e8d31b..26345d7b531c 100644
--- a/drivers/ata/pata_sis.c
+++ b/drivers/ata/pata_sis.c
@@ -56,6 +56,7 @@ static const struct sis_laptop sis_laptop[] = {
56 { 0x5513, 0x1043, 0x1107 }, /* ASUS A6K */ 56 { 0x5513, 0x1043, 0x1107 }, /* ASUS A6K */
57 { 0x5513, 0x1734, 0x105F }, /* FSC Amilo A1630 */ 57 { 0x5513, 0x1734, 0x105F }, /* FSC Amilo A1630 */
58 { 0x5513, 0x1071, 0x8640 }, /* EasyNote K5305 */ 58 { 0x5513, 0x1071, 0x8640 }, /* EasyNote K5305 */
59 { 0x5513, 0x1039, 0x5513 }, /* Targa Visionary 1000 */
59 /* end marker */ 60 /* end marker */
60 { 0, } 61 { 0, }
61}; 62};
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 1b9a87047817..0e6df289cb46 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -755,9 +755,8 @@ static ssize_t ipmi_write(struct file *file,
755 rv = ipmi_heartbeat(); 755 rv = ipmi_heartbeat();
756 if (rv) 756 if (rv)
757 return rv; 757 return rv;
758 return 1;
759 } 758 }
760 return 0; 759 return len;
761} 760}
762 761
763static ssize_t ipmi_read(struct file *file, 762static ssize_t ipmi_read(struct file *file,
diff --git a/drivers/char/pcmcia/ipwireless/hardware.c b/drivers/char/pcmcia/ipwireless/hardware.c
index ba6340ae98af..929101ecbae2 100644
--- a/drivers/char/pcmcia/ipwireless/hardware.c
+++ b/drivers/char/pcmcia/ipwireless/hardware.c
@@ -590,8 +590,10 @@ static struct ipw_rx_packet *pool_allocate(struct ipw_hardware *hw,
590 packet = kmalloc(sizeof(struct ipw_rx_packet) + 590 packet = kmalloc(sizeof(struct ipw_rx_packet) +
591 old_packet->length + minimum_free_space, 591 old_packet->length + minimum_free_space,
592 GFP_ATOMIC); 592 GFP_ATOMIC);
593 if (!packet) 593 if (!packet) {
594 kfree(old_packet);
594 return NULL; 595 return NULL;
596 }
595 memcpy(packet, old_packet, 597 memcpy(packet, old_packet,
596 sizeof(struct ipw_rx_packet) 598 sizeof(struct ipw_rx_packet)
597 + old_packet->length); 599 + old_packet->length);
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 5f80a9dff573..909cac93fa2a 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -678,12 +678,13 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
678 if (arg != (1<<tmp)) 678 if (arg != (1<<tmp))
679 return -EINVAL; 679 return -EINVAL;
680 680
681 rtc_freq = arg;
682
681 spin_lock_irqsave(&rtc_lock, flags); 683 spin_lock_irqsave(&rtc_lock, flags);
682 if (hpet_set_periodic_freq(arg)) { 684 if (hpet_set_periodic_freq(arg)) {
683 spin_unlock_irqrestore(&rtc_lock, flags); 685 spin_unlock_irqrestore(&rtc_lock, flags);
684 return 0; 686 return 0;
685 } 687 }
686 rtc_freq = arg;
687 688
688 val = CMOS_READ(RTC_FREQ_SELECT) & 0xf0; 689 val = CMOS_READ(RTC_FREQ_SELECT) & 0xf0;
689 val |= (16 - tmp); 690 val |= (16 - tmp);
diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
index 13a4bdd4e4d6..c7a977bc03e8 100644
--- a/drivers/char/tpm/tpm_tis.c
+++ b/drivers/char/tpm/tpm_tis.c
@@ -623,6 +623,7 @@ static struct pnp_device_id tpm_pnp_tbl[] __devinitdata = {
623 {"IFX0102", 0}, /* Infineon */ 623 {"IFX0102", 0}, /* Infineon */
624 {"BCM0101", 0}, /* Broadcom */ 624 {"BCM0101", 0}, /* Broadcom */
625 {"NSC1200", 0}, /* National */ 625 {"NSC1200", 0}, /* National */
626 {"ICO0102", 0}, /* Intel */
626 /* Add new here */ 627 /* Add new here */
627 {"", 0}, /* User Specified */ 628 {"", 0}, /* User Specified */
628 {"", 0} /* Terminator */ 629 {"", 0} /* Terminator */
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 0f3c66de69bc..8d8c6b736167 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -1977,8 +1977,10 @@ isdn_writebuf_stub(int drvidx, int chan, const u_char __user * buf, int len)
1977 if (!skb) 1977 if (!skb)
1978 return -ENOMEM; 1978 return -ENOMEM;
1979 skb_reserve(skb, hl); 1979 skb_reserve(skb, hl);
1980 if (copy_from_user(skb_put(skb, len), buf, len)) 1980 if (copy_from_user(skb_put(skb, len), buf, len)) {
1981 dev_kfree_skb(skb);
1981 return -EFAULT; 1982 return -EFAULT;
1983 }
1982 ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, 1, skb); 1984 ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, 1, skb);
1983 if (ret <= 0) 1985 if (ret <= 0)
1984 dev_kfree_skb(skb); 1986 dev_kfree_skb(skb);
diff --git a/drivers/media/video/ov7670.c b/drivers/media/video/ov7670.c
index 2bc6bdc9c1f2..d7bfd30f74a9 100644
--- a/drivers/media/video/ov7670.c
+++ b/drivers/media/video/ov7670.c
@@ -406,8 +406,10 @@ static int ov7670_read(struct i2c_client *c, unsigned char reg,
406 int ret; 406 int ret;
407 407
408 ret = i2c_smbus_read_byte_data(c, reg); 408 ret = i2c_smbus_read_byte_data(c, reg);
409 if (ret >= 0) 409 if (ret >= 0) {
410 *value = (unsigned char) ret; 410 *value = (unsigned char) ret;
411 ret = 0;
412 }
411 return ret; 413 return ret;
412} 414}
413 415
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index db3c892f87fb..d40d6d15ae20 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -1686,9 +1686,14 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
1686 ioc->bus_type = SAS; 1686 ioc->bus_type = SAS;
1687 } 1687 }
1688 1688
1689 if (ioc->bus_type == SAS && mpt_msi_enable == -1) 1689 if (mpt_msi_enable == -1) {
1690 ioc->msi_enable = 1; 1690 /* Enable on SAS, disable on FC and SPI */
1691 else 1691 if (ioc->bus_type == SAS)
1692 ioc->msi_enable = 1;
1693 else
1694 ioc->msi_enable = 0;
1695 } else
1696 /* follow flag: 0 - disable; 1 - enable */
1692 ioc->msi_enable = mpt_msi_enable; 1697 ioc->msi_enable = mpt_msi_enable;
1693 1698
1694 if (ioc->errata_flag_1064) 1699 if (ioc->errata_flag_1064)
diff --git a/drivers/message/fusion/mptspi.c b/drivers/message/fusion/mptspi.c
index 25bcfcf36f2e..1effca4e40e1 100644
--- a/drivers/message/fusion/mptspi.c
+++ b/drivers/message/fusion/mptspi.c
@@ -1266,13 +1266,18 @@ mptspi_dv_renegotiate(struct _MPT_SCSI_HOST *hd)
1266static int 1266static int
1267mptspi_ioc_reset(MPT_ADAPTER *ioc, int reset_phase) 1267mptspi_ioc_reset(MPT_ADAPTER *ioc, int reset_phase)
1268{ 1268{
1269 struct _MPT_SCSI_HOST *hd = shost_priv(ioc->sh);
1270 int rc; 1269 int rc;
1271 1270
1272 rc = mptscsih_ioc_reset(ioc, reset_phase); 1271 rc = mptscsih_ioc_reset(ioc, reset_phase);
1273 1272
1274 if (reset_phase == MPT_IOC_POST_RESET) 1273 /* only try to do a renegotiation if we're properly set up
1274 * if we get an ioc fault on bringup, ioc->sh will be NULL */
1275 if (reset_phase == MPT_IOC_POST_RESET &&
1276 ioc->sh) {
1277 struct _MPT_SCSI_HOST *hd = shost_priv(ioc->sh);
1278
1275 mptspi_dv_renegotiate(hd); 1279 mptspi_dv_renegotiate(hd);
1280 }
1276 1281
1277 return rc; 1282 return rc;
1278} 1283}
diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c
index a7714da7c283..effc1ce8179a 100644
--- a/drivers/net/irda/nsc-ircc.c
+++ b/drivers/net/irda/nsc-ircc.c
@@ -152,6 +152,7 @@ static chipio_t pnp_info;
152static const struct pnp_device_id nsc_ircc_pnp_table[] = { 152static const struct pnp_device_id nsc_ircc_pnp_table[] = {
153 { .id = "NSC6001", .driver_data = 0 }, 153 { .id = "NSC6001", .driver_data = 0 },
154 { .id = "IBM0071", .driver_data = 0 }, 154 { .id = "IBM0071", .driver_data = 0 },
155 { .id = "HWPC224", .driver_data = 0 },
155 { } 156 { }
156}; 157};
157 158
diff --git a/drivers/net/irda/via-ircc.c b/drivers/net/irda/via-ircc.c
index 58e128784585..04ad3573b159 100644
--- a/drivers/net/irda/via-ircc.c
+++ b/drivers/net/irda/via-ircc.c
@@ -1546,6 +1546,7 @@ static int via_ircc_net_open(struct net_device *dev)
1546 IRDA_WARNING("%s, unable to allocate dma2=%d\n", 1546 IRDA_WARNING("%s, unable to allocate dma2=%d\n",
1547 driver_name, self->io.dma2); 1547 driver_name, self->io.dma2);
1548 free_irq(self->io.irq, self); 1548 free_irq(self->io.irq, self);
1549 free_dma(self->io.dma);
1549 return -EAGAIN; 1550 return -EAGAIN;
1550 } 1551 }
1551 } 1552 }
@@ -1606,6 +1607,8 @@ static int via_ircc_net_close(struct net_device *dev)
1606 EnAllInt(iobase, OFF); 1607 EnAllInt(iobase, OFF);
1607 free_irq(self->io.irq, dev); 1608 free_irq(self->io.irq, dev);
1608 free_dma(self->io.dma); 1609 free_dma(self->io.dma);
1610 if (self->io.dma2 != self->io.dma)
1611 free_dma(self->io.dma2);
1609 1612
1610 return 0; 1613 return 0;
1611} 1614}
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7ab94c825b57..b9018bfa0a97 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -602,6 +602,12 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
602 tun->attached = 1; 602 tun->attached = 1;
603 get_net(dev_net(tun->dev)); 603 get_net(dev_net(tun->dev));
604 604
605 /* Make sure persistent devices do not get stuck in
606 * xoff state.
607 */
608 if (netif_running(tun->dev))
609 netif_wake_queue(tun->dev);
610
605 strcpy(ifr->ifr_name, tun->dev->name); 611 strcpy(ifr->ifr_name, tun->dev->name);
606 return 0; 612 return 0;
607 613
diff --git a/drivers/net/wireless/hostap/hostap_cs.c b/drivers/net/wireless/hostap/hostap_cs.c
index 80039a0ae027..3b4e55cf33cd 100644
--- a/drivers/net/wireless/hostap/hostap_cs.c
+++ b/drivers/net/wireless/hostap/hostap_cs.c
@@ -777,8 +777,10 @@ static int hostap_cs_suspend(struct pcmcia_device *link)
777 int dev_open = 0; 777 int dev_open = 0;
778 struct hostap_interface *iface = NULL; 778 struct hostap_interface *iface = NULL;
779 779
780 if (dev) 780 if (!dev)
781 iface = netdev_priv(dev); 781 return -ENODEV;
782
783 iface = netdev_priv(dev);
782 784
783 PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_PM_SUSPEND\n", dev_info); 785 PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_PM_SUSPEND\n", dev_info);
784 if (iface && iface->local) 786 if (iface && iface->local)
@@ -798,8 +800,10 @@ static int hostap_cs_resume(struct pcmcia_device *link)
798 int dev_open = 0; 800 int dev_open = 0;
799 struct hostap_interface *iface = NULL; 801 struct hostap_interface *iface = NULL;
800 802
801 if (dev) 803 if (!dev)
802 iface = netdev_priv(dev); 804 return -ENODEV;
805
806 iface = netdev_priv(dev);
803 807
804 PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_PM_RESUME\n", dev_info); 808 PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_PM_RESUME\n", dev_info);
805 809
diff --git a/drivers/net/wireless/iwlwifi/iwl-3945.c b/drivers/net/wireless/iwlwifi/iwl-3945.c
index f5387a7a76c0..55ac850744b3 100644
--- a/drivers/net/wireless/iwlwifi/iwl-3945.c
+++ b/drivers/net/wireless/iwlwifi/iwl-3945.c
@@ -449,7 +449,7 @@ static void iwl3945_dbg_report_frame(struct iwl3945_priv *priv,
449 449
450 if (print_summary) { 450 if (print_summary) {
451 char *title; 451 char *title;
452 u32 rate; 452 int rate;
453 453
454 if (hundred) 454 if (hundred)
455 title = "100Frames"; 455 title = "100Frames";
@@ -487,7 +487,7 @@ static void iwl3945_dbg_report_frame(struct iwl3945_priv *priv,
487 * but you can hack it to show more, if you'd like to. */ 487 * but you can hack it to show more, if you'd like to. */
488 if (dataframe) 488 if (dataframe)
489 IWL_DEBUG_RX("%s: mhd=0x%04x, dst=0x%02x, " 489 IWL_DEBUG_RX("%s: mhd=0x%04x, dst=0x%02x, "
490 "len=%u, rssi=%d, chnl=%d, rate=%u, \n", 490 "len=%u, rssi=%d, chnl=%d, rate=%d, \n",
491 title, fc, header->addr1[5], 491 title, fc, header->addr1[5],
492 length, rssi, channel, rate); 492 length, rssi, channel, rate);
493 else { 493 else {
diff --git a/drivers/net/wireless/libertas/scan.c b/drivers/net/wireless/libertas/scan.c
index d448c9702a0f..387d4878af2f 100644
--- a/drivers/net/wireless/libertas/scan.c
+++ b/drivers/net/wireless/libertas/scan.c
@@ -567,11 +567,11 @@ static int lbs_process_bss(struct bss_descriptor *bss,
567 pos += 8; 567 pos += 8;
568 568
569 /* beacon interval is 2 bytes long */ 569 /* beacon interval is 2 bytes long */
570 bss->beaconperiod = le16_to_cpup((void *) pos); 570 bss->beaconperiod = get_unaligned_le16(pos);
571 pos += 2; 571 pos += 2;
572 572
573 /* capability information is 2 bytes long */ 573 /* capability information is 2 bytes long */
574 bss->capability = le16_to_cpup((void *) pos); 574 bss->capability = get_unaligned_le16(pos);
575 lbs_deb_scan("process_bss: capabilities 0x%04x\n", bss->capability); 575 lbs_deb_scan("process_bss: capabilities 0x%04x\n", bss->capability);
576 pos += 2; 576 pos += 2;
577 577
diff --git a/drivers/net/wireless/rt2x00/rt2400pci.c b/drivers/net/wireless/rt2x00/rt2400pci.c
index 560b9c73c0b9..b36ed1c6c746 100644
--- a/drivers/net/wireless/rt2x00/rt2400pci.c
+++ b/drivers/net/wireless/rt2x00/rt2400pci.c
@@ -731,6 +731,17 @@ static int rt2400pci_init_registers(struct rt2x00_dev *rt2x00dev)
731 (rt2x00dev->rx->data_size / 128)); 731 (rt2x00dev->rx->data_size / 128));
732 rt2x00pci_register_write(rt2x00dev, CSR9, reg); 732 rt2x00pci_register_write(rt2x00dev, CSR9, reg);
733 733
734 rt2x00pci_register_read(rt2x00dev, CSR14, &reg);
735 rt2x00_set_field32(&reg, CSR14_TSF_COUNT, 0);
736 rt2x00_set_field32(&reg, CSR14_TSF_SYNC, 0);
737 rt2x00_set_field32(&reg, CSR14_TBCN, 0);
738 rt2x00_set_field32(&reg, CSR14_TCFP, 0);
739 rt2x00_set_field32(&reg, CSR14_TATIMW, 0);
740 rt2x00_set_field32(&reg, CSR14_BEACON_GEN, 0);
741 rt2x00_set_field32(&reg, CSR14_CFP_COUNT_PRELOAD, 0);
742 rt2x00_set_field32(&reg, CSR14_TBCM_PRELOAD, 0);
743 rt2x00pci_register_write(rt2x00dev, CSR14, reg);
744
734 rt2x00pci_register_write(rt2x00dev, CNT3, 0x3f080000); 745 rt2x00pci_register_write(rt2x00dev, CNT3, 0x3f080000);
735 746
736 rt2x00pci_register_read(rt2x00dev, ARCSR0, &reg); 747 rt2x00pci_register_read(rt2x00dev, ARCSR0, &reg);
diff --git a/drivers/net/wireless/rt2x00/rt2500pci.c b/drivers/net/wireless/rt2x00/rt2500pci.c
index a5ed54b69262..f7731fb82555 100644
--- a/drivers/net/wireless/rt2x00/rt2500pci.c
+++ b/drivers/net/wireless/rt2x00/rt2500pci.c
@@ -824,6 +824,17 @@ static int rt2500pci_init_registers(struct rt2x00_dev *rt2x00dev)
824 rt2x00_set_field32(&reg, CSR11_CW_SELECT, 0); 824 rt2x00_set_field32(&reg, CSR11_CW_SELECT, 0);
825 rt2x00pci_register_write(rt2x00dev, CSR11, reg); 825 rt2x00pci_register_write(rt2x00dev, CSR11, reg);
826 826
827 rt2x00pci_register_read(rt2x00dev, CSR14, &reg);
828 rt2x00_set_field32(&reg, CSR14_TSF_COUNT, 0);
829 rt2x00_set_field32(&reg, CSR14_TSF_SYNC, 0);
830 rt2x00_set_field32(&reg, CSR14_TBCN, 0);
831 rt2x00_set_field32(&reg, CSR14_TCFP, 0);
832 rt2x00_set_field32(&reg, CSR14_TATIMW, 0);
833 rt2x00_set_field32(&reg, CSR14_BEACON_GEN, 0);
834 rt2x00_set_field32(&reg, CSR14_CFP_COUNT_PRELOAD, 0);
835 rt2x00_set_field32(&reg, CSR14_TBCM_PRELOAD, 0);
836 rt2x00pci_register_write(rt2x00dev, CSR14, reg);
837
827 rt2x00pci_register_write(rt2x00dev, CNT3, 0); 838 rt2x00pci_register_write(rt2x00dev, CNT3, 0);
828 839
829 rt2x00pci_register_read(rt2x00dev, TXCSR8, &reg); 840 rt2x00pci_register_read(rt2x00dev, TXCSR8, &reg);
diff --git a/drivers/net/wireless/rt2x00/rt2500usb.c b/drivers/net/wireless/rt2x00/rt2500usb.c
index 61e59c17a60a..d90512f97b39 100644
--- a/drivers/net/wireless/rt2x00/rt2500usb.c
+++ b/drivers/net/wireless/rt2x00/rt2500usb.c
@@ -801,6 +801,13 @@ static int rt2500usb_init_registers(struct rt2x00_dev *rt2x00dev)
801 rt2x00_set_field16(&reg, TXRX_CSR8_BBP_ID1_VALID, 0); 801 rt2x00_set_field16(&reg, TXRX_CSR8_BBP_ID1_VALID, 0);
802 rt2500usb_register_write(rt2x00dev, TXRX_CSR8, reg); 802 rt2500usb_register_write(rt2x00dev, TXRX_CSR8, reg);
803 803
804 rt2500usb_register_read(rt2x00dev, TXRX_CSR19, &reg);
805 rt2x00_set_field16(&reg, TXRX_CSR19_TSF_COUNT, 0);
806 rt2x00_set_field16(&reg, TXRX_CSR19_TSF_SYNC, 0);
807 rt2x00_set_field16(&reg, TXRX_CSR19_TBCN, 0);
808 rt2x00_set_field16(&reg, TXRX_CSR19_BEACON_GEN, 0);
809 rt2500usb_register_write(rt2x00dev, TXRX_CSR19, reg);
810
804 rt2500usb_register_write(rt2x00dev, TXRX_CSR21, 0xe78f); 811 rt2500usb_register_write(rt2x00dev, TXRX_CSR21, 0xe78f);
805 rt2500usb_register_write(rt2x00dev, MAC_CSR9, 0xff1d); 812 rt2500usb_register_write(rt2x00dev, MAC_CSR9, 0xff1d);
806 813
diff --git a/drivers/net/wireless/rt2x00/rt61pci.c b/drivers/net/wireless/rt2x00/rt61pci.c
index 14bc7b281659..c3afb5cbe807 100644
--- a/drivers/net/wireless/rt2x00/rt61pci.c
+++ b/drivers/net/wireless/rt2x00/rt61pci.c
@@ -1201,6 +1201,15 @@ static int rt61pci_init_registers(struct rt2x00_dev *rt2x00dev)
1201 rt2x00_set_field32(&reg, TXRX_CSR8_ACK_CTS_54MBS, 42); 1201 rt2x00_set_field32(&reg, TXRX_CSR8_ACK_CTS_54MBS, 42);
1202 rt2x00pci_register_write(rt2x00dev, TXRX_CSR8, reg); 1202 rt2x00pci_register_write(rt2x00dev, TXRX_CSR8, reg);
1203 1203
1204 rt2x00pci_register_read(rt2x00dev, TXRX_CSR9, &reg);
1205 rt2x00_set_field32(&reg, TXRX_CSR9_BEACON_INTERVAL, 0);
1206 rt2x00_set_field32(&reg, TXRX_CSR9_TSF_TICKING, 0);
1207 rt2x00_set_field32(&reg, TXRX_CSR9_TSF_SYNC, 0);
1208 rt2x00_set_field32(&reg, TXRX_CSR9_TBTT_ENABLE, 0);
1209 rt2x00_set_field32(&reg, TXRX_CSR9_BEACON_GEN, 0);
1210 rt2x00_set_field32(&reg, TXRX_CSR9_TIMESTAMP_COMPENSATE, 0);
1211 rt2x00pci_register_write(rt2x00dev, TXRX_CSR9, reg);
1212
1204 rt2x00pci_register_write(rt2x00dev, TXRX_CSR15, 0x0000000f); 1213 rt2x00pci_register_write(rt2x00dev, TXRX_CSR15, 0x0000000f);
1205 1214
1206 rt2x00pci_register_write(rt2x00dev, MAC_CSR6, 0x00000fff); 1215 rt2x00pci_register_write(rt2x00dev, MAC_CSR6, 0x00000fff);
diff --git a/drivers/net/wireless/rt2x00/rt73usb.c b/drivers/net/wireless/rt2x00/rt73usb.c
index 83cc0147f698..46e9e081fbf1 100644
--- a/drivers/net/wireless/rt2x00/rt73usb.c
+++ b/drivers/net/wireless/rt2x00/rt73usb.c
@@ -1006,6 +1006,15 @@ static int rt73usb_init_registers(struct rt2x00_dev *rt2x00dev)
1006 rt2x00_set_field32(&reg, TXRX_CSR8_ACK_CTS_54MBS, 42); 1006 rt2x00_set_field32(&reg, TXRX_CSR8_ACK_CTS_54MBS, 42);
1007 rt73usb_register_write(rt2x00dev, TXRX_CSR8, reg); 1007 rt73usb_register_write(rt2x00dev, TXRX_CSR8, reg);
1008 1008
1009 rt73usb_register_read(rt2x00dev, TXRX_CSR9, &reg);
1010 rt2x00_set_field32(&reg, TXRX_CSR9_BEACON_INTERVAL, 0);
1011 rt2x00_set_field32(&reg, TXRX_CSR9_TSF_TICKING, 0);
1012 rt2x00_set_field32(&reg, TXRX_CSR9_TSF_SYNC, 0);
1013 rt2x00_set_field32(&reg, TXRX_CSR9_TBTT_ENABLE, 0);
1014 rt2x00_set_field32(&reg, TXRX_CSR9_BEACON_GEN, 0);
1015 rt2x00_set_field32(&reg, TXRX_CSR9_TIMESTAMP_COMPENSATE, 0);
1016 rt73usb_register_write(rt2x00dev, TXRX_CSR9, reg);
1017
1009 rt73usb_register_write(rt2x00dev, TXRX_CSR15, 0x0000000f); 1018 rt73usb_register_write(rt2x00dev, TXRX_CSR15, 0x0000000f);
1010 1019
1011 rt73usb_register_read(rt2x00dev, MAC_CSR6, &reg); 1020 rt73usb_register_read(rt2x00dev, MAC_CSR6, &reg);
diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index 418606ac1c3b..694e95d35fd4 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -765,6 +765,7 @@ static void zd_op_remove_interface(struct ieee80211_hw *hw,
765{ 765{
766 struct zd_mac *mac = zd_hw_mac(hw); 766 struct zd_mac *mac = zd_hw_mac(hw);
767 mac->type = IEEE80211_IF_TYPE_INVALID; 767 mac->type = IEEE80211_IF_TYPE_INVALID;
768 zd_set_beacon_interval(&mac->chip, 0);
768 zd_write_mac_addr(&mac->chip, NULL); 769 zd_write_mac_addr(&mac->chip, NULL);
769} 770}
770 771
diff --git a/drivers/net/wireless/zd1211rw/zd_usb.c b/drivers/net/wireless/zd1211rw/zd_usb.c
index 8941f5eb96c2..6cdad9764604 100644
--- a/drivers/net/wireless/zd1211rw/zd_usb.c
+++ b/drivers/net/wireless/zd1211rw/zd_usb.c
@@ -64,6 +64,7 @@ static struct usb_device_id usb_ids[] = {
64 { USB_DEVICE(0x079b, 0x0062), .driver_info = DEVICE_ZD1211B }, 64 { USB_DEVICE(0x079b, 0x0062), .driver_info = DEVICE_ZD1211B },
65 { USB_DEVICE(0x1582, 0x6003), .driver_info = DEVICE_ZD1211B }, 65 { USB_DEVICE(0x1582, 0x6003), .driver_info = DEVICE_ZD1211B },
66 { USB_DEVICE(0x050d, 0x705c), .driver_info = DEVICE_ZD1211B }, 66 { USB_DEVICE(0x050d, 0x705c), .driver_info = DEVICE_ZD1211B },
67 { USB_DEVICE(0x083a, 0xe506), .driver_info = DEVICE_ZD1211B },
67 { USB_DEVICE(0x083a, 0x4505), .driver_info = DEVICE_ZD1211B }, 68 { USB_DEVICE(0x083a, 0x4505), .driver_info = DEVICE_ZD1211B },
68 { USB_DEVICE(0x0471, 0x1236), .driver_info = DEVICE_ZD1211B }, 69 { USB_DEVICE(0x0471, 0x1236), .driver_info = DEVICE_ZD1211B },
69 { USB_DEVICE(0x13b1, 0x0024), .driver_info = DEVICE_ZD1211B }, 70 { USB_DEVICE(0x13b1, 0x0024), .driver_info = DEVICE_ZD1211B },
diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c
index 3ce9f3defc12..956d3e79f6aa 100644
--- a/drivers/rapidio/rio-driver.c
+++ b/drivers/rapidio/rio-driver.c
@@ -101,8 +101,8 @@ static int rio_device_probe(struct device *dev)
101 if (error >= 0) { 101 if (error >= 0) {
102 rdev->driver = rdrv; 102 rdev->driver = rdrv;
103 error = 0; 103 error = 0;
104 } else
104 rio_dev_put(rdev); 105 rio_dev_put(rdev);
105 }
106 } 106 }
107 return error; 107 return error;
108} 108}
diff --git a/drivers/rtc/rtc-fm3130.c b/drivers/rtc/rtc-fm3130.c
index 11644c8fca82..abfdfcbaa059 100644
--- a/drivers/rtc/rtc-fm3130.c
+++ b/drivers/rtc/rtc-fm3130.c
@@ -55,7 +55,7 @@ struct fm3130 {
55 int alarm; 55 int alarm;
56}; 56};
57static const struct i2c_device_id fm3130_id[] = { 57static const struct i2c_device_id fm3130_id[] = {
58 { "fm3130-rtc", 0 }, 58 { "fm3130", 0 },
59 { } 59 { }
60}; 60};
61MODULE_DEVICE_TABLE(i2c, fm3130_id); 61MODULE_DEVICE_TABLE(i2c, fm3130_id);
diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c
index 0fc4c3630780..748a502a6355 100644
--- a/drivers/rtc/rtc-pcf8563.c
+++ b/drivers/rtc/rtc-pcf8563.c
@@ -302,6 +302,7 @@ static int pcf8563_remove(struct i2c_client *client)
302 302
303static const struct i2c_device_id pcf8563_id[] = { 303static const struct i2c_device_id pcf8563_id[] = {
304 { "pcf8563", 0 }, 304 { "pcf8563", 0 },
305 { "rtc8564", 0 },
305 { } 306 { }
306}; 307};
307MODULE_DEVICE_TABLE(i2c, pcf8563_id); 308MODULE_DEVICE_TABLE(i2c, pcf8563_id);
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 999e91ea7451..e7a3a6554425 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -71,6 +71,7 @@
71#include <linux/module.h> 71#include <linux/module.h>
72#include <linux/moduleparam.h> 72#include <linux/moduleparam.h>
73#include <linux/libata.h> 73#include <linux/libata.h>
74#include <linux/hdreg.h>
74#include <asm/io.h> 75#include <asm/io.h>
75#include <asm/irq.h> 76#include <asm/irq.h>
76#include <asm/processor.h> 77#include <asm/processor.h>
@@ -4913,8 +4914,11 @@ static int ipr_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
4913 struct ipr_resource_entry *res; 4914 struct ipr_resource_entry *res;
4914 4915
4915 res = (struct ipr_resource_entry *)sdev->hostdata; 4916 res = (struct ipr_resource_entry *)sdev->hostdata;
4916 if (res && ipr_is_gata(res)) 4917 if (res && ipr_is_gata(res)) {
4918 if (cmd == HDIO_GET_IDENTITY)
4919 return -ENOTTY;
4917 return ata_scsi_ioctl(sdev, cmd, arg); 4920 return ata_scsi_ioctl(sdev, cmd, arg);
4921 }
4918 4922
4919 return -EINVAL; 4923 return -EINVAL;
4920} 4924}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a82d2fe80fb5..cbf55d59a54c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -207,6 +207,15 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
207 */ 207 */
208 blk_execute_rq(req->q, NULL, req, 1); 208 blk_execute_rq(req->q, NULL, req, 1);
209 209
210 /*
211 * Some devices (USB mass-storage in particular) may transfer
212 * garbage data together with a residue indicating that the data
213 * is invalid. Prevent the garbage from being misinterpreted
214 * and prevent security leaks by zeroing out the excess data.
215 */
216 if (unlikely(req->data_len > 0 && req->data_len <= bufflen))
217 memset(buffer + (bufflen - req->data_len), 0, req->data_len);
218
210 ret = req->errors; 219 ret = req->errors;
211 out: 220 out:
212 blk_put_request(req); 221 blk_put_request(req);
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 1bc00b721e9d..be95e55b228b 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -2623,6 +2623,9 @@ static struct console serial8250_console = {
2623 2623
2624static int __init serial8250_console_init(void) 2624static int __init serial8250_console_init(void)
2625{ 2625{
2626 if (nr_uarts > UART_NR)
2627 nr_uarts = UART_NR;
2628
2626 serial8250_isa_init_ports(); 2629 serial8250_isa_init_ports();
2627 register_console(&serial8250_console); 2630 register_console(&serial8250_console);
2628 return 0; 2631 return 0;
diff --git a/drivers/ssb/driver_pcicore.c b/drivers/ssb/driver_pcicore.c
index d28c53868093..538c570df337 100644
--- a/drivers/ssb/driver_pcicore.c
+++ b/drivers/ssb/driver_pcicore.c
@@ -537,6 +537,13 @@ int ssb_pcicore_dev_irqvecs_enable(struct ssb_pcicore *pc,
537 int err = 0; 537 int err = 0;
538 u32 tmp; 538 u32 tmp;
539 539
540 if (dev->bus->bustype != SSB_BUSTYPE_PCI) {
541 /* This SSB device is not on a PCI host-bus. So the IRQs are
542 * not routed through the PCI core.
543 * So we must not enable routing through the PCI core. */
544 goto out;
545 }
546
540 if (!pdev) 547 if (!pdev)
541 goto out; 548 goto out;
542 bus = pdev->bus; 549 bus = pdev->bus;
diff --git a/drivers/usb/host/ohci-au1xxx.c b/drivers/usb/host/ohci-au1xxx.c
index f90fe0c7373f..68c17f5ea8ea 100644
--- a/drivers/usb/host/ohci-au1xxx.c
+++ b/drivers/usb/host/ohci-au1xxx.c
@@ -8,7 +8,7 @@
8 * Bus Glue for AMD Alchemy Au1xxx 8 * Bus Glue for AMD Alchemy Au1xxx
9 * 9 *
10 * Written by Christopher Hoover <ch@hpl.hp.com> 10 * Written by Christopher Hoover <ch@hpl.hp.com>
11 * Based on fragments of previous driver by Rusell King et al. 11 * Based on fragments of previous driver by Russell King et al.
12 * 12 *
13 * Modified for LH7A404 from ohci-sa1111.c 13 * Modified for LH7A404 from ohci-sa1111.c
14 * by Durgesh Pattamatta <pattamattad@sharpsec.com> 14 * by Durgesh Pattamatta <pattamattad@sharpsec.com>
diff --git a/drivers/usb/host/ohci-lh7a404.c b/drivers/usb/host/ohci-lh7a404.c
index 13c12ed22252..1ef5d482c145 100644
--- a/drivers/usb/host/ohci-lh7a404.c
+++ b/drivers/usb/host/ohci-lh7a404.c
@@ -8,7 +8,7 @@
8 * Bus Glue for Sharp LH7A404 8 * Bus Glue for Sharp LH7A404
9 * 9 *
10 * Written by Christopher Hoover <ch@hpl.hp.com> 10 * Written by Christopher Hoover <ch@hpl.hp.com>
11 * Based on fragments of previous driver by Rusell King et al. 11 * Based on fragments of previous driver by Russell King et al.
12 * 12 *
13 * Modified for LH7A404 from ohci-sa1111.c 13 * Modified for LH7A404 from ohci-sa1111.c
14 * by Durgesh Pattamatta <pattamattad@sharpsec.com> 14 * by Durgesh Pattamatta <pattamattad@sharpsec.com>
diff --git a/drivers/usb/host/ohci-s3c2410.c b/drivers/usb/host/ohci-s3c2410.c
index ead4772f0f27..3c7a740cfe0c 100644
--- a/drivers/usb/host/ohci-s3c2410.c
+++ b/drivers/usb/host/ohci-s3c2410.c
@@ -8,7 +8,7 @@
8 * USB Bus Glue for Samsung S3C2410 8 * USB Bus Glue for Samsung S3C2410
9 * 9 *
10 * Written by Christopher Hoover <ch@hpl.hp.com> 10 * Written by Christopher Hoover <ch@hpl.hp.com>
11 * Based on fragments of previous driver by Rusell King et al. 11 * Based on fragments of previous driver by Russell King et al.
12 * 12 *
13 * Modified for S3C2410 from ohci-sa1111.c, ohci-omap.c and ohci-lh7a40.c 13 * Modified for S3C2410 from ohci-sa1111.c, ohci-omap.c and ohci-lh7a40.c
14 * by Ben Dooks, <ben@simtec.co.uk> 14 * by Ben Dooks, <ben@simtec.co.uk>
diff --git a/drivers/usb/host/ohci-sa1111.c b/drivers/usb/host/ohci-sa1111.c
index 0f48f2d99226..2e9dceb9bb99 100644
--- a/drivers/usb/host/ohci-sa1111.c
+++ b/drivers/usb/host/ohci-sa1111.c
@@ -8,7 +8,7 @@
8 * SA1111 Bus Glue 8 * SA1111 Bus Glue
9 * 9 *
10 * Written by Christopher Hoover <ch@hpl.hp.com> 10 * Written by Christopher Hoover <ch@hpl.hp.com>
11 * Based on fragments of previous driver by Rusell King et al. 11 * Based on fragments of previous driver by Russell King et al.
12 * 12 *
13 * This file is licenced under the GPL. 13 * This file is licenced under the GPL.
14 */ 14 */
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index 24843fdd5395..59df132cc375 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -74,6 +74,7 @@ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma,
74{ 74{
75 struct fb_info *info = vma->vm_private_data; 75 struct fb_info *info = vma->vm_private_data;
76 struct fb_deferred_io *fbdefio = info->fbdefio; 76 struct fb_deferred_io *fbdefio = info->fbdefio;
77 struct page *cur;
77 78
78 /* this is a callback we get when userspace first tries to 79 /* this is a callback we get when userspace first tries to
79 write to the page. we schedule a workqueue. that workqueue 80 write to the page. we schedule a workqueue. that workqueue
@@ -83,7 +84,24 @@ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma,
83 84
84 /* protect against the workqueue changing the page list */ 85 /* protect against the workqueue changing the page list */
85 mutex_lock(&fbdefio->lock); 86 mutex_lock(&fbdefio->lock);
86 list_add(&page->lru, &fbdefio->pagelist); 87
88 /* we loop through the pagelist before adding in order
89 to keep the pagelist sorted */
90 list_for_each_entry(cur, &fbdefio->pagelist, lru) {
91 /* this check is to catch the case where a new
92 process could start writing to the same page
93 through a new pte. this new access can cause the
94 mkwrite even when the original ps's pte is marked
95 writable */
96 if (unlikely(cur == page))
97 goto page_already_added;
98 else if (cur->index > page->index)
99 break;
100 }
101
102 list_add_tail(&page->lru, &cur->lru);
103
104page_already_added:
87 mutex_unlock(&fbdefio->lock); 105 mutex_unlock(&fbdefio->lock);
88 106
89 /* come back after delay to process the deferred IO */ 107 /* come back after delay to process the deferred IO */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 34902cff5400..0e9fc2ba90ee 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -34,11 +34,11 @@
34static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { 34static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
35 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, 35 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
36 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, 36 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
37 {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, 37 {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
38 {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, 38 {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
39 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"}, 39 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
40 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"}, 40 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
41 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} } 41 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
42; 42;
43 43
44 44
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 722be543ceec..2e904bd111c8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode,
219 rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, 219 rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
220 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 220 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
221 CIFS_MOUNT_MAP_SPECIAL_CHR); 221 CIFS_MOUNT_MAP_SPECIAL_CHR);
222 if (rc) { 222 if (rc == -EREMOTE && !is_dfs_referral) {
223 if (rc == -EREMOTE && !is_dfs_referral) { 223 is_dfs_referral = true;
224 is_dfs_referral = true; 224 cFYI(DBG2, ("DFS ref"));
225 cFYI(DBG2, ("DFS ref")); 225 /* for DFS, server does not give us real inode data */
226 /* for DFS, server does not give us real inode data */ 226 fill_fake_finddataunix(&find_data, sb);
227 fill_fake_finddataunix(&find_data, sb); 227 rc = 0;
228 rc = 0; 228 } else if (rc)
229 } 229 goto cgiiu_exit;
230 } 230
231 num_of_bytes = le64_to_cpu(find_data.NumOfBytes); 231 num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
232 end_of_file = le64_to_cpu(find_data.EndOfFile); 232 end_of_file = le64_to_cpu(find_data.EndOfFile);
233 233
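
The cifs hunk above flattens the nested error check: -EREMOTE on a DFS referral point is converted into success with synthesized attributes, while any other error exits through cgiiu_exit. A hedged, simplified sketch of that control-flow shape (all names below are hypothetical, and the surrounding DFS retry loop is omitted):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #ifndef EREMOTE
    #define EREMOTE 66                      /* Linux value, in case libc lacks it */
    #endif

    struct find_data { long num_bytes, end_of_file; };

    /* pretend server query; returns -EREMOTE for a DFS referral point */
    static int query_path_info(const char *path, struct find_data *fd)
    {
            if (strcmp(path, "/dfs/link") == 0)
                    return -EREMOTE;
            fd->num_bytes = 4096;
            fd->end_of_file = 4096;
            return 0;
    }

    static int get_inode_info(const char *path)
    {
            struct find_data fd = { 0 };
            int is_dfs_referral = 0;
            int rc = query_path_info(path, &fd);

            if (rc == -EREMOTE && !is_dfs_referral) {
                    is_dfs_referral = 1;
                    /* server gives no real data for a referral: fake it */
                    memset(&fd, 0, sizeof(fd));
                    rc = 0;
            } else if (rc) {
                    goto out;                       /* genuine failure */
            }

            printf("%s: %ld bytes (dfs=%d)\n", path, fd.num_bytes, is_dfs_referral);
    out:
            return rc;
    }

    int main(void)
    {
            get_inode_info("/plain/file");
            get_inode_info("/dfs/link");
            return 0;
    }
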
@@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
236 *pinode = new_inode(sb); 236 *pinode = new_inode(sb);
237 if (*pinode == NULL) { 237 if (*pinode == NULL) {
238 rc = -ENOMEM; 238 rc = -ENOMEM;
239 goto cgiiu_exit; 239 goto cgiiu_exit;
240 } 240 }
241 /* Is an i_ino of zero legal? */ 241 /* Is an i_ino of zero legal? */
242 /* note ino incremented to unique num in new_inode */ 242 /* note ino incremented to unique num in new_inode */
diff --git a/fs/exec.c b/fs/exec.c
index da94a6f05df3..fd9234379e8d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -610,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
610 bprm->exec -= stack_shift; 610 bprm->exec -= stack_shift;
611 611
612 down_write(&mm->mmap_sem); 612 down_write(&mm->mmap_sem);
613 vm_flags = vma->vm_flags; 613 vm_flags = VM_STACK_FLAGS;
614 614
615 /* 615 /*
616 * Adjust stack execute permissions; explicitly enable for 616 * Adjust stack execute permissions; explicitly enable for
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 394d25a131a5..80e20d9f2780 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1554,8 +1554,8 @@ out:
1554 */ 1554 */
1555int ocfs2_file_lock(struct file *file, int ex, int trylock) 1555int ocfs2_file_lock(struct file *file, int ex, int trylock)
1556{ 1556{
1557 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE; 1557 int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1558 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0; 1558 unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
1559 unsigned long flags; 1559 unsigned long flags;
1560 struct ocfs2_file_private *fp = file->private_data; 1560 struct ocfs2_file_private *fp = file->private_data;
1561 struct ocfs2_lock_res *lockres = &fp->fp_flock; 1561 struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1582,7 +1582,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1582 * Get the lock at NLMODE to start - that way we 1582 * Get the lock at NLMODE to start - that way we
1583 * can cancel the upconvert request if need be. 1583 * can cancel the upconvert request if need be.
1584 */ 1584 */
1585 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); 1585 ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
1586 if (ret < 0) { 1586 if (ret < 0) {
1587 mlog_errno(ret); 1587 mlog_errno(ret);
1588 goto out; 1588 goto out;
@@ -1597,7 +1597,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1597 } 1597 }
1598 1598
1599 lockres->l_action = OCFS2_AST_CONVERT; 1599 lockres->l_action = OCFS2_AST_CONVERT;
1600 lkm_flags |= LKM_CONVERT; 1600 lkm_flags |= DLM_LKF_CONVERT;
1601 lockres->l_requested = level; 1601 lockres->l_requested = level;
1602 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 1602 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1603 1603
@@ -1664,7 +1664,7 @@ void ocfs2_file_unlock(struct file *file)
1664 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) 1664 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1665 return; 1665 return;
1666 1666
1667 if (lockres->l_level == LKM_NLMODE) 1667 if (lockres->l_level == DLM_LOCK_NL)
1668 return; 1668 return;
1669 1669
1670 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", 1670 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
@@ -1678,11 +1678,11 @@ void ocfs2_file_unlock(struct file *file)
1678 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 1678 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1679 lockres->l_blocking = DLM_LOCK_EX; 1679 lockres->l_blocking = DLM_LOCK_EX;
1680 1680
1681 gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); 1681 gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
1682 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1682 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1683 spin_unlock_irqrestore(&lockres->l_lock, flags); 1683 spin_unlock_irqrestore(&lockres->l_lock, flags);
1684 1684
1685 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); 1685 ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
1686 if (ret) { 1686 if (ret) {
1687 mlog_errno(ret); 1687 mlog_errno(ret);
1688 return; 1688 return;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index afaee301b0ee..ad3d26ddfe31 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2427,13 +2427,20 @@ restart:
2427 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { 2427 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
2428 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2428 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2429 2429
2430 /* If I'm the only one writing to this iclog, sync it to disk */ 2430 /*
2431 if (atomic_read(&iclog->ic_refcnt) == 1) { 2431 * If I'm the only one writing to this iclog, sync it to disk.
2432 * We need to do an atomic compare and decrement here to avoid
2433 * racing with concurrent atomic_dec_and_lock() calls in
2434 * xlog_state_release_iclog() when there is more than one
2435 * reference to the iclog.
2436 */
2437 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
2438 /* we are the only one */
2432 spin_unlock(&log->l_icloglock); 2439 spin_unlock(&log->l_icloglock);
2433 if ((error = xlog_state_release_iclog(log, iclog))) 2440 error = xlog_state_release_iclog(log, iclog);
2441 if (error)
2434 return error; 2442 return error;
2435 } else { 2443 } else {
2436 atomic_dec(&iclog->ic_refcnt);
2437 spin_unlock(&log->l_icloglock); 2444 spin_unlock(&log->l_icloglock);
2438 } 2445 }
2439 goto restart; 2446 goto restart;
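
The xlog change above replaces a racy atomic_read()/atomic_dec() pair with atomic_add_unless(&refcnt, -1, 1): the count is decremented only when it is not already 1, and a zero return tells the caller it holds the last reference and must take the sync path itself. Below is a standalone model of that primitive built on C11 atomics; it is an illustration, not the kernel's implementation:

    #include <stdatomic.h>
    #include <stdio.h>

    /*
     * Add 'a' to '*v' unless '*v' == 'u'.  Returns 1 if the add was done,
     * 0 if the value was already 'u' (mirrors atomic_add_unless()).
     */
    static int atomic_add_unless(atomic_int *v, int a, int u)
    {
            int c = atomic_load(v);

            while (c != u) {
                    if (atomic_compare_exchange_weak(v, &c, c + a))
                            return 1;
                    /* on failure, c was reloaded with the current value; retry */
            }
            return 0;
    }

    int main(void)
    {
            atomic_int refcnt = 2;

            /* not the last reference: just drop ours */
            if (!atomic_add_unless(&refcnt, -1, 1))
                    printf("sole owner, sync to disk\n");
            else
                    printf("dropped ref, now %d\n", atomic_load(&refcnt));

            /* now refcnt == 1: we are the only holder */
            if (!atomic_add_unless(&refcnt, -1, 1))
                    printf("sole owner, sync to disk\n");

            return 0;
    }
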
diff --git a/include/asm-avr32/setup.h b/include/asm-avr32/setup.h
index ea3070ff13a5..ff5b7cf6be4d 100644
--- a/include/asm-avr32/setup.h
+++ b/include/asm-avr32/setup.h
@@ -2,7 +2,7 @@
2 * Copyright (C) 2004-2006 Atmel Corporation 2 * Copyright (C) 2004-2006 Atmel Corporation
3 * 3 *
4 * Based on linux/include/asm-arm/setup.h 4 * Based on linux/include/asm-arm/setup.h
5 * Copyright (C) 1997-1999 Russel King 5 * Copyright (C) 1997-1999 Russell King
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
diff --git a/include/asm-frv/system.h b/include/asm-frv/system.h
index d3a12a9079f7..7742ec000cc4 100644
--- a/include/asm-frv/system.h
+++ b/include/asm-frv/system.h
@@ -87,7 +87,7 @@ do { \
87} while(0) 87} while(0)
88 88
89#define irqs_disabled() \ 89#define irqs_disabled() \
90 ({unsigned long flags; local_save_flags(flags); flags; }) 90 ({unsigned long flags; local_save_flags(flags); !!flags; })
91 91
92#define local_irq_save(flags) \ 92#define local_irq_save(flags) \
93do { \ 93do { \
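
The frv change above adds a double negation so irqs_disabled() yields a strict 0/1 instead of the raw flag bits; a caller that compares the result against 1, or stores it in a one-bit field, would otherwise misbehave. A tiny illustration of the idiom:

    #include <stdio.h>

    int main(void)
    {
            unsigned long flags = 0x10;     /* e.g. an interrupt-mask bit */

            /* raw value: truthy, but not equal to 1 */
            printf("%d\n", (flags) == 1);   /* prints 0 -- surprising */

            /* normalized: any non-zero value becomes exactly 1 */
            printf("%d\n", (!!flags) == 1); /* prints 1 */
            return 0;
    }
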
diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h
index 268a012bcd79..28bddbcb38be 100644
--- a/include/asm-x86/desc.h
+++ b/include/asm-x86/desc.h
@@ -192,8 +192,8 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)
192 unsigned cpu = smp_processor_id(); 192 unsigned cpu = smp_processor_id();
193 ldt_desc ldt; 193 ldt_desc ldt;
194 194
195 set_tssldt_descriptor(&ldt, (unsigned long)addr, 195 set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
196 DESC_LDT, entries * sizeof(ldt) - 1); 196 entries * LDT_ENTRY_SIZE - 1);
197 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, 197 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
198 &ldt, DESC_LDT); 198 &ldt, DESC_LDT);
199 asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); 199 asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
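
The descriptor fix above computes the LDT limit from LDT_ENTRY_SIZE (one LDT entry) instead of sizeof(ldt), where ldt is the descriptor that describes the LDT itself and is twice as large on 64-bit, so the old limit overshot the table. A quick arithmetic check, assuming the usual x86 sizes of 8 and 16 bytes:

    #include <stdio.h>

    #define LDT_ENTRY_SIZE   8      /* one LDT entry (assumed, as on x86) */
    #define LDT_DESC_SIZE   16      /* descriptor describing the LDT, 64-bit */

    int main(void)
    {
            unsigned int entries = 3;

            /* correct limit: offset of the last addressable byte of the table */
            printf("limit = %u\n", entries * LDT_ENTRY_SIZE - 1);   /* 23 */

            /* old computation over-stated the limit on 64-bit */
            printf("bogus = %u\n", entries * LDT_DESC_SIZE - 1);    /* 47 */
            return 0;
    }
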
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33a8f42041fa..f6cd60f2de63 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -134,7 +134,6 @@ extern unsigned long nr_running(void);
134extern unsigned long nr_uninterruptible(void); 134extern unsigned long nr_uninterruptible(void);
135extern unsigned long nr_active(void); 135extern unsigned long nr_active(void);
136extern unsigned long nr_iowait(void); 136extern unsigned long nr_iowait(void);
137extern unsigned long weighted_cpuload(const int cpu);
138 137
139struct seq_file; 138struct seq_file;
140struct cfs_rq; 139struct cfs_rq;
@@ -784,6 +783,8 @@ struct sched_domain {
784 unsigned int balance_interval; /* initialise to 1. units in ms. */ 783 unsigned int balance_interval; /* initialise to 1. units in ms. */
785 unsigned int nr_balance_failed; /* initialise to 0 */ 784 unsigned int nr_balance_failed; /* initialise to 0 */
786 785
786 u64 last_update;
787
787#ifdef CONFIG_SCHEDSTATS 788#ifdef CONFIG_SCHEDSTATS
788 /* load_balance() stats */ 789 /* load_balance() stats */
789 unsigned int lb_count[CPU_MAX_IDLE_TYPES]; 790 unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -823,23 +824,6 @@ extern int arch_reinit_sched_domains(void);
823 824
824#endif /* CONFIG_SMP */ 825#endif /* CONFIG_SMP */
825 826
826/*
827 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
828 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
829 * task of nice 0 or enough lower priority tasks to bring up the
830 * weighted_cpuload
831 */
832static inline int above_background_load(void)
833{
834 unsigned long cpu;
835
836 for_each_online_cpu(cpu) {
837 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
838 return 1;
839 }
840 return 0;
841}
842
843struct io_context; /* See blkdev.h */ 827struct io_context; /* See blkdev.h */
844#define NGROUPS_SMALL 32 828#define NGROUPS_SMALL 32
845#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) 829#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
@@ -921,8 +905,8 @@ struct sched_class {
921 void (*set_cpus_allowed)(struct task_struct *p, 905 void (*set_cpus_allowed)(struct task_struct *p,
922 const cpumask_t *newmask); 906 const cpumask_t *newmask);
923 907
924 void (*join_domain)(struct rq *rq); 908 void (*rq_online)(struct rq *rq);
925 void (*leave_domain)(struct rq *rq); 909 void (*rq_offline)(struct rq *rq);
926 910
927 void (*switched_from) (struct rq *this_rq, struct task_struct *task, 911 void (*switched_from) (struct rq *this_rq, struct task_struct *task,
928 int running); 912 int running);
@@ -1039,6 +1023,7 @@ struct task_struct {
1039#endif 1023#endif
1040 1024
1041 int prio, static_prio, normal_prio; 1025 int prio, static_prio, normal_prio;
1026 unsigned int rt_priority;
1042 const struct sched_class *sched_class; 1027 const struct sched_class *sched_class;
1043 struct sched_entity se; 1028 struct sched_entity se;
1044 struct sched_rt_entity rt; 1029 struct sched_rt_entity rt;
@@ -1122,7 +1107,6 @@ struct task_struct {
1122 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1107 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1123 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1108 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1124 1109
1125 unsigned int rt_priority;
1126 cputime_t utime, stime, utimescaled, stimescaled; 1110 cputime_t utime, stime, utimescaled, stimescaled;
1127 cputime_t gtime; 1111 cputime_t gtime;
1128 cputime_t prev_utime, prev_stime; 1112 cputime_t prev_utime, prev_stime;
@@ -1141,12 +1125,12 @@ struct task_struct {
1141 gid_t gid,egid,sgid,fsgid; 1125 gid_t gid,egid,sgid,fsgid;
1142 struct group_info *group_info; 1126 struct group_info *group_info;
1143 kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; 1127 kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset;
1144 unsigned securebits;
1145 struct user_struct *user; 1128 struct user_struct *user;
1129 unsigned securebits;
1146#ifdef CONFIG_KEYS 1130#ifdef CONFIG_KEYS
1131 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1147 struct key *request_key_auth; /* assumed request_key authority */ 1132 struct key *request_key_auth; /* assumed request_key authority */
1148 struct key *thread_keyring; /* keyring private to this thread */ 1133 struct key *thread_keyring; /* keyring private to this thread */
1149 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1150#endif 1134#endif
1151 char comm[TASK_COMM_LEN]; /* executable name excluding path 1135 char comm[TASK_COMM_LEN]; /* executable name excluding path
1152 - access with [gs]et_task_comm (which lock 1136 - access with [gs]et_task_comm (which lock
@@ -1233,8 +1217,8 @@ struct task_struct {
1233# define MAX_LOCK_DEPTH 48UL 1217# define MAX_LOCK_DEPTH 48UL
1234 u64 curr_chain_key; 1218 u64 curr_chain_key;
1235 int lockdep_depth; 1219 int lockdep_depth;
1236 struct held_lock held_locks[MAX_LOCK_DEPTH];
1237 unsigned int lockdep_recursion; 1220 unsigned int lockdep_recursion;
1221 struct held_lock held_locks[MAX_LOCK_DEPTH];
1238#endif 1222#endif
1239 1223
1240/* journalling filesystem info */ 1224/* journalling filesystem info */
@@ -1262,10 +1246,6 @@ struct task_struct {
1262 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1246 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1263 cputime_t acct_stimexpd;/* stime since last update */ 1247 cputime_t acct_stimexpd;/* stime since last update */
1264#endif 1248#endif
1265#ifdef CONFIG_NUMA
1266 struct mempolicy *mempolicy;
1267 short il_next;
1268#endif
1269#ifdef CONFIG_CPUSETS 1249#ifdef CONFIG_CPUSETS
1270 nodemask_t mems_allowed; 1250 nodemask_t mems_allowed;
1271 int cpuset_mems_generation; 1251 int cpuset_mems_generation;
@@ -1285,6 +1265,10 @@ struct task_struct {
1285 struct list_head pi_state_list; 1265 struct list_head pi_state_list;
1286 struct futex_pi_state *pi_state_cache; 1266 struct futex_pi_state *pi_state_cache;
1287#endif 1267#endif
1268#ifdef CONFIG_NUMA
1269 struct mempolicy *mempolicy;
1270 short il_next;
1271#endif
1288 atomic_t fs_excl; /* holding fs exclusive resources */ 1272 atomic_t fs_excl; /* holding fs exclusive resources */
1289 struct rcu_head rcu; 1273 struct rcu_head rcu;
1290 1274
@@ -1504,6 +1488,7 @@ static inline void put_task_struct(struct task_struct *t)
1504#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1488#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1505#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1489#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1506#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1490#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1491#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
1507#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1492#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1508#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1493#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1509#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ 1494#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
@@ -1637,6 +1622,7 @@ extern unsigned int sysctl_sched_child_runs_first;
1637extern unsigned int sysctl_sched_features; 1622extern unsigned int sysctl_sched_features;
1638extern unsigned int sysctl_sched_migration_cost; 1623extern unsigned int sysctl_sched_migration_cost;
1639extern unsigned int sysctl_sched_nr_migrate; 1624extern unsigned int sysctl_sched_nr_migrate;
1625extern unsigned int sysctl_sched_shares_ratelimit;
1640 1626
1641int sched_nr_latency_handler(struct ctl_table *table, int write, 1627int sched_nr_latency_handler(struct ctl_table *table, int write,
1642 struct file *file, void __user *buffer, size_t *length, 1628 struct file *file, void __user *buffer, size_t *length,
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 2ca6bae88721..fb0c215a3051 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -339,6 +339,7 @@ struct xfrm_usersa_info {
339#define XFRM_STATE_NOPMTUDISC 4 339#define XFRM_STATE_NOPMTUDISC 4
340#define XFRM_STATE_WILDRECV 8 340#define XFRM_STATE_WILDRECV 8
341#define XFRM_STATE_ICMP 16 341#define XFRM_STATE_ICMP 16
342#define XFRM_STATE_AF_UNSPEC 32
342}; 343};
343 344
344struct xfrm_usersa_id { 345struct xfrm_usersa_id {
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..6c55301112e0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
6 exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
27obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 27obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
28obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 28obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
29obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 29obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
30obj-$(CONFIG_SMP) += cpu.o spinlock.o 30obj-$(CONFIG_SMP) += spinlock.o
31obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 31obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
32obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 32obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
33obj-$(CONFIG_UID16) += uid16.o 33obj-$(CONFIG_UID16) += uid16.o
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 69obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
70obj-$(CONFIG_MARKERS) += marker.o 70obj-$(CONFIG_MARKERS) += marker.o
71obj-$(CONFIG_LATENCYTOP) += latencytop.o 71obj-$(CONFIG_LATENCYTOP) += latencytop.o
72obj-$(CONFIG_SMP) += sched_cpupri.o
72 73
73ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 74ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
74# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 75# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..b11f06dc149a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/*
19 * Represents all cpu's present in the system
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
30 * Represents all cpu's that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
18/* Serializes the updates to cpu_online_map, cpu_present_map */ 40/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 41static DEFINE_MUTEX(cpu_add_remove_lock);
20 42
@@ -403,3 +425,5 @@ out:
403 cpu_maps_update_done(); 425 cpu_maps_update_done();
404} 426}
405#endif /* CONFIG_PM_SLEEP_SMP */ 427#endif /* CONFIG_PM_SLEEP_SMP */
428
429#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fceb97e989c..64a05da9bc4c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1194 1194
1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1196 return -ENOSPC; 1196 return -ENOSPC;
1197 if (tsk->flags & PF_THREAD_BOUND) {
1198 cpumask_t mask;
1199
1200 mutex_lock(&callback_mutex);
1201 mask = cs->cpus_allowed;
1202 mutex_unlock(&callback_mutex);
1203 if (!cpus_equal(tsk->cpus_allowed, mask))
1204 return -EINVAL;
1205 }
1197 1206
1198 return security_task_setscheduler(tsk, 0, NULL); 1207 return security_task_setscheduler(tsk, 0, NULL);
1199} 1208}
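
The cpuset hook above now refuses to move a PF_THREAD_BOUND kthread (one pinned via kthread_bind(), see the kernel/kthread.c hunk below) into a cpuset unless the cpuset's cpus_allowed is exactly the mask the thread is already bound to. A standalone sketch of that admission check using plain bitmasks; the types are hypothetical and the callback_mutex locking is omitted:

    #include <errno.h>
    #include <stdio.h>

    #define PF_THREAD_BOUND 0x04000000      /* thread pinned to one cpu */

    struct task   { unsigned int flags; unsigned long cpus_allowed; };
    struct cpuset { unsigned long cpus_allowed; };

    /* return 0 if 'tsk' may be attached to 'cs', or a negative errno */
    static int can_attach(const struct cpuset *cs, const struct task *tsk)
    {
            if (!cs->cpus_allowed)
                    return -ENOSPC;         /* empty cpuset */

            /* a bound kthread may only join a cpuset with its exact mask */
            if ((tsk->flags & PF_THREAD_BOUND) &&
                tsk->cpus_allowed != cs->cpus_allowed)
                    return -EINVAL;

            return 0;
    }

    int main(void)
    {
            struct task   kthread = { PF_THREAD_BOUND, 1UL << 2 }; /* bound to cpu 2 */
            struct cpuset wide    = { 0xfUL };                     /* cpus 0-3 */
            struct cpuset exact   = { 1UL << 2 };                  /* cpu 2 only */

            printf("%d %d\n", can_attach(&wide, &kthread),         /* -22 (EINVAL) */
                              can_attach(&exact, &kthread));       /* 0 */
            return 0;
    }
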
@@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1207 struct mm_struct *mm; 1216 struct mm_struct *mm;
1208 struct cpuset *cs = cgroup_cs(cont); 1217 struct cpuset *cs = cgroup_cs(cont);
1209 struct cpuset *oldcs = cgroup_cs(oldcont); 1218 struct cpuset *oldcs = cgroup_cs(oldcont);
1219 int err;
1210 1220
1211 mutex_lock(&callback_mutex); 1221 mutex_lock(&callback_mutex);
1212 guarantee_online_cpus(cs, &cpus); 1222 guarantee_online_cpus(cs, &cpus);
1213 set_cpus_allowed_ptr(tsk, &cpus); 1223 err = set_cpus_allowed_ptr(tsk, &cpus);
1214 mutex_unlock(&callback_mutex); 1224 mutex_unlock(&callback_mutex);
1225 if (err)
1226 return;
1215 1227
1216 from = oldcs->mems_allowed; 1228 from = oldcs->mems_allowed;
1217 to = cs->mems_allowed; 1229 to = cs->mems_allowed;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..97747cdd37c9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
183 k->flags |= PF_THREAD_BOUND;
183} 184}
184EXPORT_SYMBOL(kthread_bind); 185EXPORT_SYMBOL(kthread_bind);
185 186
diff --git a/kernel/sched.c b/kernel/sched.c
index 94ead43eda62..d16c8d9fbd8b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,6 +74,8 @@
74#include <asm/tlb.h> 74#include <asm/tlb.h>
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77#include "sched_cpupri.h"
78
77/* 79/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 80 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 81 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +291,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 291static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 292/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 293static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 294#endif /* CONFIG_FAIR_GROUP_SCHED */
293 295
294#ifdef CONFIG_RT_GROUP_SCHED 296#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 297static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 298static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 299#endif /* CONFIG_RT_GROUP_SCHED */
298#else 300#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 301#define root_task_group init_task_group
300#endif 302#endif /* CONFIG_FAIR_GROUP_SCHED */
301 303
302/* task_group_lock serializes add/remove of task groups and also changes to 304/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 305 * a task group's cpu shares.
@@ -307,9 +309,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 309#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 310#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 311# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 312#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 313# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 314#endif /* CONFIG_USER_SCHED */
313 315
314/* 316/*
315 * A weight of 0 or 1 can cause arithmetics problems. 317 * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
363#else 365#else
364 366
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 367static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
368static inline struct task_group *task_group(struct task_struct *p)
369{
370 return NULL;
371}
366 372
367#endif /* CONFIG_GROUP_SCHED */ 373#endif /* CONFIG_GROUP_SCHED */
368 374
@@ -373,6 +379,7 @@ struct cfs_rq {
373 379
374 u64 exec_clock; 380 u64 exec_clock;
375 u64 min_vruntime; 381 u64 min_vruntime;
382 u64 pair_start;
376 383
377 struct rb_root tasks_timeline; 384 struct rb_root tasks_timeline;
378 struct rb_node *rb_leftmost; 385 struct rb_node *rb_leftmost;
@@ -401,6 +408,31 @@ struct cfs_rq {
401 */ 408 */
402 struct list_head leaf_cfs_rq_list; 409 struct list_head leaf_cfs_rq_list;
403 struct task_group *tg; /* group that "owns" this runqueue */ 410 struct task_group *tg; /* group that "owns" this runqueue */
411
412#ifdef CONFIG_SMP
413 /*
414 * the part of load.weight contributed by tasks
415 */
416 unsigned long task_weight;
417
418 /*
419 * h_load = weight * f(tg)
420 *
421 * Where f(tg) is the recursive weight fraction assigned to
422 * this group.
423 */
424 unsigned long h_load;
425
426 /*
427 * this cpu's part of tg->shares
428 */
429 unsigned long shares;
430
431 /*
432 * load.weight at the time we set shares
433 */
434 unsigned long rq_weight;
435#endif
404#endif 436#endif
405}; 437};
406 438
@@ -452,6 +484,9 @@ struct root_domain {
452 */ 484 */
453 cpumask_t rto_mask; 485 cpumask_t rto_mask;
454 atomic_t rto_count; 486 atomic_t rto_count;
487#ifdef CONFIG_SMP
488 struct cpupri cpupri;
489#endif
455}; 490};
456 491
457/* 492/*
@@ -526,6 +561,9 @@ struct rq {
526 int push_cpu; 561 int push_cpu;
527 /* cpu of this runqueue: */ 562 /* cpu of this runqueue: */
528 int cpu; 563 int cpu;
564 int online;
565
566 unsigned long avg_load_per_task;
529 567
530 struct task_struct *migration_thread; 568 struct task_struct *migration_thread;
531 struct list_head migration_queue; 569 struct list_head migration_queue;
@@ -749,6 +787,12 @@ late_initcall(sched_init_debug);
749const_debug unsigned int sysctl_sched_nr_migrate = 32; 787const_debug unsigned int sysctl_sched_nr_migrate = 32;
750 788
751/* 789/*
790 * ratelimit for updating the group shares.
791 * default: 0.5ms
792 */
793const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
794
795/*
752 * period over which we measure -rt task cpu usage in us. 796 * period over which we measure -rt task cpu usage in us.
753 * default: 1s 797 * default: 1s
754 */ 798 */
@@ -775,82 +819,6 @@ static inline u64 global_rt_runtime(void)
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 819 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776} 820}
777 821
778unsigned long long time_sync_thresh = 100000;
779
780static DEFINE_PER_CPU(unsigned long long, time_offset);
781static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
782
783/*
784 * Global lock which we take every now and then to synchronize
785 * the CPUs time. This method is not warp-safe, but it's good
786 * enough to synchronize slowly diverging time sources and thus
787 * it's good enough for tracing:
788 */
789static DEFINE_SPINLOCK(time_sync_lock);
790static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794 /*
795 * We want this inlined, to not get tracer function calls
796 * in this critical section:
797 */
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818 /*
819 * Only call sched_clock() if the scheduler has already been
820 * initialized (some code might call cpu_clock() very early):
821 */
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830/*
831 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
832 * clock constructed from sched_clock():
833 */
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
854#ifndef prepare_arch_switch 822#ifndef prepare_arch_switch
855# define prepare_arch_switch(next) do { } while (0) 823# define prepare_arch_switch(next) do { } while (0)
856#endif 824#endif
@@ -1313,15 +1281,15 @@ void wake_up_idle_cpu(int cpu)
1313 if (!tsk_is_polling(rq->idle)) 1281 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu); 1282 smp_send_reschedule(cpu);
1315} 1283}
1316#endif 1284#endif /* CONFIG_NO_HZ */
1317 1285
1318#else 1286#else /* !CONFIG_SMP */
1319static void __resched_task(struct task_struct *p, int tif_bit) 1287static void __resched_task(struct task_struct *p, int tif_bit)
1320{ 1288{
1321 assert_spin_locked(&task_rq(p)->lock); 1289 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit); 1290 set_tsk_thread_flag(p, tif_bit);
1323} 1291}
1324#endif 1292#endif /* CONFIG_SMP */
1325 1293
1326#if BITS_PER_LONG == 32 1294#if BITS_PER_LONG == 32
1327# define WMULT_CONST (~0UL) 1295# define WMULT_CONST (~0UL)
@@ -1336,6 +1304,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1336 */ 1304 */
1337#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1305#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1338 1306
1307/*
1308 * delta *= weight / lw
1309 */
1339static unsigned long 1310static unsigned long
1340calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1311calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341 struct load_weight *lw) 1312 struct load_weight *lw)
@@ -1363,12 +1334,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1334 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1364} 1335}
1365 1336
1366static inline unsigned long
1367calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1368{
1369 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1370}
1371
1372static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1337static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1373{ 1338{
1374 lw->weight += inc; 1339 lw->weight += inc;
@@ -1479,17 +1444,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1479#ifdef CONFIG_SMP 1444#ifdef CONFIG_SMP
1480static unsigned long source_load(int cpu, int type); 1445static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type); 1446static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1447static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else /* CONFIG_SMP */ 1448
1449static unsigned long cpu_avg_load_per_task(int cpu)
1450{
1451 struct rq *rq = cpu_rq(cpu);
1452
1453 if (rq->nr_running)
1454 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1455
1456 return rq->avg_load_per_task;
1457}
1485 1458
1486#ifdef CONFIG_FAIR_GROUP_SCHED 1459#ifdef CONFIG_FAIR_GROUP_SCHED
1487static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1460
1461typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1462
1463/*
1464 * Iterate the full tree, calling @down when first entering a node and @up when
1465 * leaving it for the final time.
1466 */
1467static void
1468walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1488{ 1469{
1470 struct task_group *parent, *child;
1471
1472 rcu_read_lock();
1473 parent = &root_task_group;
1474down:
1475 (*down)(parent, cpu, sd);
1476 list_for_each_entry_rcu(child, &parent->children, siblings) {
1477 parent = child;
1478 goto down;
1479
1480up:
1481 continue;
1482 }
1483 (*up)(parent, cpu, sd);
1484
1485 child = parent;
1486 parent = parent->parent;
1487 if (parent)
1488 goto up;
1489 rcu_read_unlock();
1490}
1491
1492static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1493
1494/*
1495 * Calculate and set the cpu's group shares.
1496 */
1497static void
1498__update_group_shares_cpu(struct task_group *tg, int cpu,
1499 unsigned long sd_shares, unsigned long sd_rq_weight)
1500{
1501 int boost = 0;
1502 unsigned long shares;
1503 unsigned long rq_weight;
1504
1505 if (!tg->se[cpu])
1506 return;
1507
1508 rq_weight = tg->cfs_rq[cpu]->load.weight;
1509
1510 /*
1511 * If there are currently no tasks on the cpu pretend there is one of
1512 * average load so that when a new task gets to run here it will not
1513 * get delayed by group starvation.
1514 */
1515 if (!rq_weight) {
1516 boost = 1;
1517 rq_weight = NICE_0_LOAD;
1518 }
1519
1520 if (unlikely(rq_weight > sd_rq_weight))
1521 rq_weight = sd_rq_weight;
1522
1523 /*
1524 * \Sum shares * rq_weight
1525 * shares = -----------------------
1526 * \Sum rq_weight
1527 *
1528 */
1529 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1530
1531 /*
1532 * record the actual number of shares, not the boosted amount.
1533 */
1534 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1535 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1536
1537 if (shares < MIN_SHARES)
1538 shares = MIN_SHARES;
1539 else if (shares > MAX_SHARES)
1540 shares = MAX_SHARES;
1541
1542 __set_se_shares(tg->se[cpu], shares);
1543}
1544
1545/*
1546 * Re-compute the task group their per cpu shares over the given domain.
1547 * This needs to be done in a bottom-up fashion because the rq weight of a
1548 * parent group depends on the shares of its child groups.
1549 */
1550static void
1551tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1552{
1553 unsigned long rq_weight = 0;
1554 unsigned long shares = 0;
1555 int i;
1556
1557 for_each_cpu_mask(i, sd->span) {
1558 rq_weight += tg->cfs_rq[i]->load.weight;
1559 shares += tg->cfs_rq[i]->shares;
1560 }
1561
1562 if ((!shares && rq_weight) || shares > tg->shares)
1563 shares = tg->shares;
1564
1565 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1566 shares = tg->shares;
1567
1568 if (!rq_weight)
1569 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1570
1571 for_each_cpu_mask(i, sd->span) {
1572 struct rq *rq = cpu_rq(i);
1573 unsigned long flags;
1574
1575 spin_lock_irqsave(&rq->lock, flags);
1576 __update_group_shares_cpu(tg, i, shares, rq_weight);
1577 spin_unlock_irqrestore(&rq->lock, flags);
1578 }
1579}
1580
1581/*
1582 * Compute the cpu's hierarchical load factor for each task group.
1583 * This needs to be done in a top-down fashion because the load of a child
1584 * group is a fraction of its parents load.
1585 */
1586static void
1587tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1588{
1589 unsigned long load;
1590
1591 if (!tg->parent) {
1592 load = cpu_rq(cpu)->load.weight;
1593 } else {
1594 load = tg->parent->cfs_rq[cpu]->h_load;
1595 load *= tg->cfs_rq[cpu]->shares;
1596 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1597 }
1598
1599 tg->cfs_rq[cpu]->h_load = load;
1600}
1601
1602static void
1603tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1604{
1605}
1606
1607static void update_shares(struct sched_domain *sd)
1608{
1609 u64 now = cpu_clock(raw_smp_processor_id());
1610 s64 elapsed = now - sd->last_update;
1611
1612 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1613 sd->last_update = now;
1614 walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1615 }
1489} 1616}
1617
1618static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1619{
1620 spin_unlock(&rq->lock);
1621 update_shares(sd);
1622 spin_lock(&rq->lock);
1623}
1624
1625static void update_h_load(int cpu)
1626{
1627 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1628}
1629
1630#else
1631
1632static inline void update_shares(struct sched_domain *sd)
1633{
1634}
1635
1636static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1637{
1638}
1639
1490#endif 1640#endif
1491 1641
1492#endif /* CONFIG_SMP */ 1642#endif
1643
1644#ifdef CONFIG_FAIR_GROUP_SCHED
1645static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1646{
1647#ifdef CONFIG_SMP
1648 cfs_rq->shares = shares;
1649#endif
1650}
1651#endif
1493 1652
1494#include "sched_stats.h" 1653#include "sched_stats.h"
1495#include "sched_idletask.c" 1654#include "sched_idletask.c"
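
The heart of the group-fairness rework above is __update_group_shares_cpu(): each CPU receives the slice shares_i = tg->shares * rq_weight_i / Σ rq_weight, clamped to [MIN_SHARES, MAX_SHARES], and tg_shares_up() recomputes this bottom-up over a domain no more often than sysctl_sched_shares_ratelimit allows. A userspace sketch of just that arithmetic; the MIN/MAX values are placeholders rather than the kernel's, and the idle-CPU boost to NICE_0_LOAD is left out:

    #include <stdio.h>

    #define MIN_SHARES         2UL
    #define MAX_SHARES   (1UL << 18)

    /*
     * Distribute 'tg_shares' over the CPUs of a domain in proportion to
     * each CPU's runqueue weight, as __update_group_shares_cpu() does.
     */
    static void distribute_shares(unsigned long tg_shares,
                                  const unsigned long *rq_weight, int cpus)
    {
            unsigned long sum = 0;
            int i;

            for (i = 0; i < cpus; i++)
                    sum += rq_weight[i];

            for (i = 0; i < cpus; i++) {
                    /* shares = \Sum shares * rq_weight / \Sum rq_weight */
                    unsigned long shares = tg_shares * rq_weight[i] / (sum + 1);

                    if (shares < MIN_SHARES)
                            shares = MIN_SHARES;
                    else if (shares > MAX_SHARES)
                            shares = MAX_SHARES;

                    printf("cpu%d: weight %lu -> shares %lu\n",
                           i, rq_weight[i], shares);
            }
    }

    int main(void)
    {
            /* three CPUs: one busy, one moderately loaded, one nearly idle */
            unsigned long rq_weight[] = { 3072, 1024, 0 };

            distribute_shares(1024, rq_weight, 3);
            return 0;
    }
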
@@ -1500,27 +1659,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1500#endif 1659#endif
1501 1660
1502#define sched_class_highest (&rt_sched_class) 1661#define sched_class_highest (&rt_sched_class)
1662#define for_each_class(class) \
1663 for (class = sched_class_highest; class; class = class->next)
1503 1664
1504static inline void inc_load(struct rq *rq, const struct task_struct *p) 1665static void inc_nr_running(struct rq *rq)
1505{
1506 update_load_add(&rq->load, p->se.load.weight);
1507}
1508
1509static inline void dec_load(struct rq *rq, const struct task_struct *p)
1510{
1511 update_load_sub(&rq->load, p->se.load.weight);
1512}
1513
1514static void inc_nr_running(struct task_struct *p, struct rq *rq)
1515{ 1666{
1516 rq->nr_running++; 1667 rq->nr_running++;
1517 inc_load(rq, p);
1518} 1668}
1519 1669
1520static void dec_nr_running(struct task_struct *p, struct rq *rq) 1670static void dec_nr_running(struct rq *rq)
1521{ 1671{
1522 rq->nr_running--; 1672 rq->nr_running--;
1523 dec_load(rq, p);
1524} 1673}
1525 1674
1526static void set_load_weight(struct task_struct *p) 1675static void set_load_weight(struct task_struct *p)
@@ -1544,6 +1693,12 @@ static void set_load_weight(struct task_struct *p)
1544 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1693 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1545} 1694}
1546 1695
1696static void update_avg(u64 *avg, u64 sample)
1697{
1698 s64 diff = sample - *avg;
1699 *avg += diff >> 3;
1700}
1701
1547static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1702static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1548{ 1703{
1549 sched_info_queued(p); 1704 sched_info_queued(p);
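
update_avg() in the hunk above is a cheap exponential moving average: each new sample moves the average one eighth of the way toward it (diff >> 3), so no history needs to be stored. The same arithmetic, runnable on its own:

    #include <stdio.h>
    #include <stdint.h>

    static void update_avg(uint64_t *avg, uint64_t sample)
    {
            int64_t diff = sample - *avg;

            *avg += diff >> 3;      /* move 1/8 of the way toward the sample */
    }

    int main(void)
    {
            uint64_t avg = 0;
            uint64_t samples[] = { 800, 800, 800, 100, 100 };

            for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    update_avg(&avg, samples[i]);
                    printf("sample %llu -> avg %llu\n",
                           (unsigned long long)samples[i],
                           (unsigned long long)avg);
            }
            return 0;
    }
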
@@ -1553,6 +1708,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1553 1708
1554static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1709static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1555{ 1710{
1711 if (sleep && p->se.last_wakeup) {
1712 update_avg(&p->se.avg_overlap,
1713 p->se.sum_exec_runtime - p->se.last_wakeup);
1714 p->se.last_wakeup = 0;
1715 }
1716
1717 sched_info_dequeued(p);
1556 p->sched_class->dequeue_task(rq, p, sleep); 1718 p->sched_class->dequeue_task(rq, p, sleep);
1557 p->se.on_rq = 0; 1719 p->se.on_rq = 0;
1558} 1720}
@@ -1612,7 +1774,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1612 rq->nr_uninterruptible--; 1774 rq->nr_uninterruptible--;
1613 1775
1614 enqueue_task(rq, p, wakeup); 1776 enqueue_task(rq, p, wakeup);
1615 inc_nr_running(p, rq); 1777 inc_nr_running(rq);
1616} 1778}
1617 1779
1618/* 1780/*
@@ -1624,7 +1786,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1624 rq->nr_uninterruptible++; 1786 rq->nr_uninterruptible++;
1625 1787
1626 dequeue_task(rq, p, sleep); 1788 dequeue_task(rq, p, sleep);
1627 dec_nr_running(p, rq); 1789 dec_nr_running(rq);
1628} 1790}
1629 1791
1630/** 1792/**
@@ -1636,12 +1798,6 @@ inline int task_curr(const struct task_struct *p)
1636 return cpu_curr(task_cpu(p)) == p; 1798 return cpu_curr(task_cpu(p)) == p;
1637} 1799}
1638 1800
1639/* Used instead of source_load when we know the type == 0 */
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
1645static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1801static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1646{ 1802{
1647 set_task_rq(p, cpu); 1803 set_task_rq(p, cpu);
@@ -1670,6 +1826,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 1826
1671#ifdef CONFIG_SMP 1827#ifdef CONFIG_SMP
1672 1828
1829/* Used instead of source_load when we know the type == 0 */
1830static unsigned long weighted_cpuload(const int cpu)
1831{
1832 return cpu_rq(cpu)->load.weight;
1833}
1834
1673/* 1835/*
1674 * Is this task likely cache-hot: 1836 * Is this task likely cache-hot:
1675 */ 1837 */
@@ -1880,7 +2042,7 @@ static unsigned long source_load(int cpu, int type)
1880 struct rq *rq = cpu_rq(cpu); 2042 struct rq *rq = cpu_rq(cpu);
1881 unsigned long total = weighted_cpuload(cpu); 2043 unsigned long total = weighted_cpuload(cpu);
1882 2044
1883 if (type == 0) 2045 if (type == 0 || !sched_feat(LB_BIAS))
1884 return total; 2046 return total;
1885 2047
1886 return min(rq->cpu_load[type-1], total); 2048 return min(rq->cpu_load[type-1], total);
@@ -1895,25 +2057,13 @@ static unsigned long target_load(int cpu, int type)
1895 struct rq *rq = cpu_rq(cpu); 2057 struct rq *rq = cpu_rq(cpu);
1896 unsigned long total = weighted_cpuload(cpu); 2058 unsigned long total = weighted_cpuload(cpu);
1897 2059
1898 if (type == 0) 2060 if (type == 0 || !sched_feat(LB_BIAS))
1899 return total; 2061 return total;
1900 2062
1901 return max(rq->cpu_load[type-1], total); 2063 return max(rq->cpu_load[type-1], total);
1902} 2064}
1903 2065
1904/* 2066/*
1905 * Return the average load per task on the cpu's run queue
1906 */
1907static unsigned long cpu_avg_load_per_task(int cpu)
1908{
1909 struct rq *rq = cpu_rq(cpu);
1910 unsigned long total = weighted_cpuload(cpu);
1911 unsigned long n = rq->nr_running;
1912
1913 return n ? total / n : SCHED_LOAD_SCALE;
1914}
1915
1916/*
1917 * find_idlest_group finds and returns the least busy CPU group within the 2067 * find_idlest_group finds and returns the least busy CPU group within the
1918 * domain. 2068 * domain.
1919 */ 2069 */
@@ -2019,6 +2169,9 @@ static int sched_balance_self(int cpu, int flag)
2019 sd = tmp; 2169 sd = tmp;
2020 } 2170 }
2021 2171
2172 if (sd)
2173 update_shares(sd);
2174
2022 while (sd) { 2175 while (sd) {
2023 cpumask_t span, tmpmask; 2176 cpumask_t span, tmpmask;
2024 struct sched_group *group; 2177 struct sched_group *group;
@@ -2085,6 +2238,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2085 if (!sched_feat(SYNC_WAKEUPS)) 2238 if (!sched_feat(SYNC_WAKEUPS))
2086 sync = 0; 2239 sync = 0;
2087 2240
2241#ifdef CONFIG_SMP
2242 if (sched_feat(LB_WAKEUP_UPDATE)) {
2243 struct sched_domain *sd;
2244
2245 this_cpu = raw_smp_processor_id();
2246 cpu = task_cpu(p);
2247
2248 for_each_domain(this_cpu, sd) {
2249 if (cpu_isset(cpu, sd->span)) {
2250 update_shares(sd);
2251 break;
2252 }
2253 }
2254 }
2255#endif
2256
2088 smp_wmb(); 2257 smp_wmb();
2089 rq = task_rq_lock(p, &flags); 2258 rq = task_rq_lock(p, &flags);
2090 old_state = p->state; 2259 old_state = p->state;
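
The LB_WAKEUP_UPDATE block above walks this CPU's scheduling domains from the smallest upward and refreshes the group shares of the first domain whose span contains the waking task's CPU, so the placement decision that follows sees fresh share values. A sketch of the "smallest domain spanning both CPUs" search with bitmask spans; the domain layout below is invented for illustration:

    #include <stdio.h>

    struct domain {
            const char *name;
            unsigned long span;             /* bit i set => cpu i is in the domain */
            struct domain *parent;          /* next larger domain */
    };

    /* walk from the base (smallest) domain upward, like for_each_domain() */
    static struct domain *find_domain(struct domain *base, int target_cpu)
    {
            struct domain *sd;

            for (sd = base; sd; sd = sd->parent)
                    if (sd->span & (1UL << target_cpu))
                            return sd;      /* first (smallest) match wins */
            return NULL;
    }

    int main(void)
    {
            struct domain node = { "node", 0xffUL, NULL };          /* cpus 0-7 */
            struct domain core = { "core", 0x03UL, &node };         /* cpus 0-1 */

            /* domains of cpu 0, smallest first */
            printf("%s\n", find_domain(&core, 1)->name);            /* core */
            printf("%s\n", find_domain(&core, 5)->name);            /* node */
            return 0;
    }
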
@@ -2131,7 +2300,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2131 } 2300 }
2132 } 2301 }
2133 } 2302 }
2134#endif 2303#endif /* CONFIG_SCHEDSTATS */
2135 2304
2136out_activate: 2305out_activate:
2137#endif /* CONFIG_SMP */ 2306#endif /* CONFIG_SMP */
@@ -2157,6 +2326,8 @@ out_running:
2157 p->sched_class->task_wake_up(rq, p); 2326 p->sched_class->task_wake_up(rq, p);
2158#endif 2327#endif
2159out: 2328out:
2329 current->se.last_wakeup = current->se.sum_exec_runtime;
2330
2160 task_rq_unlock(rq, &flags); 2331 task_rq_unlock(rq, &flags);
2161 2332
2162 return success; 2333 return success;
@@ -2277,7 +2448,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2277 * management (if any): 2448 * management (if any):
2278 */ 2449 */
2279 p->sched_class->task_new(rq, p); 2450 p->sched_class->task_new(rq, p);
2280 inc_nr_running(p, rq); 2451 inc_nr_running(rq);
2281 } 2452 }
2282 check_preempt_curr(rq, p); 2453 check_preempt_curr(rq, p);
2283#ifdef CONFIG_SMP 2454#ifdef CONFIG_SMP
@@ -2331,7 +2502,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2331 notifier->ops->sched_out(notifier, next); 2502 notifier->ops->sched_out(notifier, next);
2332} 2503}
2333 2504
2334#else 2505#else /* !CONFIG_PREEMPT_NOTIFIERS */
2335 2506
2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2507static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2337{ 2508{
@@ -2343,7 +2514,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2343{ 2514{
2344} 2515}
2345 2516
2346#endif 2517#endif /* CONFIG_PREEMPT_NOTIFIERS */
2347 2518
2348/** 2519/**
2349 * prepare_task_switch - prepare to switch tasks 2520 * prepare_task_switch - prepare to switch tasks
@@ -2785,7 +2956,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2785 enum cpu_idle_type idle, int *all_pinned, 2956 enum cpu_idle_type idle, int *all_pinned,
2786 int *this_best_prio, struct rq_iterator *iterator) 2957 int *this_best_prio, struct rq_iterator *iterator)
2787{ 2958{
2788 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2959 int loops = 0, pulled = 0, pinned = 0;
2789 struct task_struct *p; 2960 struct task_struct *p;
2790 long rem_load_move = max_load_move; 2961 long rem_load_move = max_load_move;
2791 2962
@@ -2801,14 +2972,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2801next: 2972next:
2802 if (!p || loops++ > sysctl_sched_nr_migrate) 2973 if (!p || loops++ > sysctl_sched_nr_migrate)
2803 goto out; 2974 goto out;
2804 /* 2975
2805 * To help distribute high priority tasks across CPUs we don't 2976 if ((p->se.load.weight >> 1) > rem_load_move ||
2806 * skip a task if it will be the highest priority task (i.e. smallest
2807 * prio value) on its new queue regardless of its load weight
2808 */
2809 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2810 SCHED_LOAD_SCALE_FUZZ;
2811 if ((skip_for_load && p->prio >= *this_best_prio) ||
2812 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2977 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2813 p = iterator->next(iterator->arg); 2978 p = iterator->next(iterator->arg);
2814 goto next; 2979 goto next;
@@ -2863,6 +3028,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2863 max_load_move - total_load_moved, 3028 max_load_move - total_load_moved,
2864 sd, idle, all_pinned, &this_best_prio); 3029 sd, idle, all_pinned, &this_best_prio);
2865 class = class->next; 3030 class = class->next;
3031
3032 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3033 break;
3034
2866 } while (class && max_load_move > total_load_moved); 3035 } while (class && max_load_move > total_load_moved);
2867 3036
2868 return total_load_moved > 0; 3037 return total_load_moved > 0;
@@ -2939,6 +3108,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2939 max_load = this_load = total_load = total_pwr = 0; 3108 max_load = this_load = total_load = total_pwr = 0;
2940 busiest_load_per_task = busiest_nr_running = 0; 3109 busiest_load_per_task = busiest_nr_running = 0;
2941 this_load_per_task = this_nr_running = 0; 3110 this_load_per_task = this_nr_running = 0;
3111
2942 if (idle == CPU_NOT_IDLE) 3112 if (idle == CPU_NOT_IDLE)
2943 load_idx = sd->busy_idx; 3113 load_idx = sd->busy_idx;
2944 else if (idle == CPU_NEWLY_IDLE) 3114 else if (idle == CPU_NEWLY_IDLE)
@@ -2953,6 +3123,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2953 int __group_imb = 0; 3123 int __group_imb = 0;
2954 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3124 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2955 unsigned long sum_nr_running, sum_weighted_load; 3125 unsigned long sum_nr_running, sum_weighted_load;
3126 unsigned long sum_avg_load_per_task;
3127 unsigned long avg_load_per_task;
2956 3128
2957 local_group = cpu_isset(this_cpu, group->cpumask); 3129 local_group = cpu_isset(this_cpu, group->cpumask);
2958 3130
@@ -2961,6 +3133,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2961 3133
2962 /* Tally up the load of all CPUs in the group */ 3134 /* Tally up the load of all CPUs in the group */
2963 sum_weighted_load = sum_nr_running = avg_load = 0; 3135 sum_weighted_load = sum_nr_running = avg_load = 0;
3136 sum_avg_load_per_task = avg_load_per_task = 0;
3137
2964 max_cpu_load = 0; 3138 max_cpu_load = 0;
2965 min_cpu_load = ~0UL; 3139 min_cpu_load = ~0UL;
2966 3140
@@ -2994,6 +3168,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2994 avg_load += load; 3168 avg_load += load;
2995 sum_nr_running += rq->nr_running; 3169 sum_nr_running += rq->nr_running;
2996 sum_weighted_load += weighted_cpuload(i); 3170 sum_weighted_load += weighted_cpuload(i);
3171
3172 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2997 } 3173 }
2998 3174
2999 /* 3175 /*
@@ -3015,7 +3191,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3015 avg_load = sg_div_cpu_power(group, 3191 avg_load = sg_div_cpu_power(group,
3016 avg_load * SCHED_LOAD_SCALE); 3192 avg_load * SCHED_LOAD_SCALE);
3017 3193
3018 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3194
3195 /*
3196 * Consider the group unbalanced when the imbalance is larger
3197 * than the average weight of two tasks.
3198 *
3199 * APZ: with cgroup the avg task weight can vary wildly and
3200 * might not be a suitable number - should we keep a
3201 * normalized nr_running number somewhere that negates
3202 * the hierarchy?
3203 */
3204 avg_load_per_task = sg_div_cpu_power(group,
3205 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3206
3207 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3019 __group_imb = 1; 3208 __group_imb = 1;
3020 3209
3021 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3210 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
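
The hunk above derives the group-imbalance threshold from the measured average task weight instead of the fixed SCHED_LOAD_SCALE: the group is flagged imbalanced when the spread between its most and least loaded CPU exceeds twice that average, i.e. when roughly two average tasks could usefully be moved. The comparison with illustrative numbers; the sg_div_cpu_power() normalization is folded into the made-up avg_load_per_task value:

    #include <stdio.h>

    int main(void)
    {
            /* per-cpu weighted loads inside one sched group (made-up numbers) */
            unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
            unsigned long avg_load_per_task = 800;

            int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;

            printf("spread=%lu threshold=%lu imbalanced=%d\n",
                   max_cpu_load - min_cpu_load, 2 * avg_load_per_task, group_imb);
            return 0;
    }
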
@@ -3156,9 +3345,9 @@ small_imbalance:
3156 if (busiest_load_per_task > this_load_per_task) 3345 if (busiest_load_per_task > this_load_per_task)
3157 imbn = 1; 3346 imbn = 1;
3158 } else 3347 } else
3159 this_load_per_task = SCHED_LOAD_SCALE; 3348 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3160 3349
3161 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3350 if (max_load - this_load + 2*busiest_load_per_task >=
3162 busiest_load_per_task * imbn) { 3351 busiest_load_per_task * imbn) {
3163 *imbalance = busiest_load_per_task; 3352 *imbalance = busiest_load_per_task;
3164 return busiest; 3353 return busiest;
@@ -3284,6 +3473,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3284 schedstat_inc(sd, lb_count[idle]); 3473 schedstat_inc(sd, lb_count[idle]);
3285 3474
3286redo: 3475redo:
3476 update_shares(sd);
3287 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3477 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3288 cpus, balance); 3478 cpus, balance);
3289 3479
@@ -3386,8 +3576,9 @@ redo:
3386 3576
3387 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3577 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3388 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3578 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3389 return -1; 3579 ld_moved = -1;
3390 return ld_moved; 3580
3581 goto out;
3391 3582
3392out_balanced: 3583out_balanced:
3393 schedstat_inc(sd, lb_balanced[idle]); 3584 schedstat_inc(sd, lb_balanced[idle]);
@@ -3402,8 +3593,13 @@ out_one_pinned:
3402 3593
3403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3594 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3595 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3405 return -1; 3596 ld_moved = -1;
3406 return 0; 3597 else
3598 ld_moved = 0;
3599out:
3600 if (ld_moved)
3601 update_shares(sd);
3602 return ld_moved;
3407} 3603}
3408 3604
3409/* 3605/*
@@ -3438,6 +3634,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3438 3634
3439 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3635 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3440redo: 3636redo:
3637 update_shares_locked(this_rq, sd);
3441 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3638 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3442 &sd_idle, cpus, NULL); 3639 &sd_idle, cpus, NULL);
3443 if (!group) { 3640 if (!group) {
@@ -3481,6 +3678,7 @@ redo:
3481 } else 3678 } else
3482 sd->nr_balance_failed = 0; 3679 sd->nr_balance_failed = 0;
3483 3680
3681 update_shares_locked(this_rq, sd);
3484 return ld_moved; 3682 return ld_moved;
3485 3683
3486out_balanced: 3684out_balanced:
@@ -3672,6 +3870,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3672 /* Earliest time when we have to do rebalance again */ 3870 /* Earliest time when we have to do rebalance again */
3673 unsigned long next_balance = jiffies + 60*HZ; 3871 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0; 3872 int update_next_balance = 0;
3873 int need_serialize;
3675 cpumask_t tmp; 3874 cpumask_t tmp;
3676 3875
3677 for_each_domain(cpu, sd) { 3876 for_each_domain(cpu, sd) {
@@ -3689,8 +3888,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3689 if (interval > HZ*NR_CPUS/10) 3888 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10; 3889 interval = HZ*NR_CPUS/10;
3691 3890
3891 need_serialize = sd->flags & SD_SERIALIZE;
3692 3892
3693 if (sd->flags & SD_SERIALIZE) { 3893 if (need_serialize) {
3694 if (!spin_trylock(&balancing)) 3894 if (!spin_trylock(&balancing))
3695 goto out; 3895 goto out;
3696 } 3896 }
@@ -3706,7 +3906,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3706 } 3906 }
3707 sd->last_balance = jiffies; 3907 sd->last_balance = jiffies;
3708 } 3908 }
3709 if (sd->flags & SD_SERIALIZE) 3909 if (need_serialize)
3710 spin_unlock(&balancing); 3910 spin_unlock(&balancing);
3711out: 3911out:
3712 if (time_after(next_balance, sd->last_balance + interval)) { 3912 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4070,6 +4270,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4070 prev->comm, prev->pid, preempt_count()); 4270 prev->comm, prev->pid, preempt_count());
4071 4271
4072 debug_show_held_locks(prev); 4272 debug_show_held_locks(prev);
4273 print_modules();
4073 if (irqs_disabled()) 4274 if (irqs_disabled())
4074 print_irqtrace_events(prev); 4275 print_irqtrace_events(prev);
4075 4276
@@ -4143,7 +4344,7 @@ asmlinkage void __sched schedule(void)
4143 struct task_struct *prev, *next; 4344 struct task_struct *prev, *next;
4144 unsigned long *switch_count; 4345 unsigned long *switch_count;
4145 struct rq *rq; 4346 struct rq *rq;
4146 int cpu; 4347 int cpu, hrtick = sched_feat(HRTICK);
4147 4348
4148need_resched: 4349need_resched:
4149 preempt_disable(); 4350 preempt_disable();
@@ -4158,7 +4359,8 @@ need_resched_nonpreemptible:
4158 4359
4159 schedule_debug(prev); 4360 schedule_debug(prev);
4160 4361
4161 hrtick_clear(rq); 4362 if (hrtick)
4363 hrtick_clear(rq);
4162 4364
4163 /* 4365 /*
4164 * Do the rq-clock update outside the rq lock: 4366 * Do the rq-clock update outside the rq lock:
@@ -4204,7 +4406,8 @@ need_resched_nonpreemptible:
4204 } else 4406 } else
4205 spin_unlock_irq(&rq->lock); 4407 spin_unlock_irq(&rq->lock);
4206 4408
4207 hrtick_set(rq); 4409 if (hrtick)
4410 hrtick_set(rq);
4208 4411
4209 if (unlikely(reacquire_kernel_lock(current) < 0)) 4412 if (unlikely(reacquire_kernel_lock(current) < 0))
4210 goto need_resched_nonpreemptible; 4413 goto need_resched_nonpreemptible;
@@ -4586,10 +4789,8 @@ void set_user_nice(struct task_struct *p, long nice)
4586 goto out_unlock; 4789 goto out_unlock;
4587 } 4790 }
4588 on_rq = p->se.on_rq; 4791 on_rq = p->se.on_rq;
4589 if (on_rq) { 4792 if (on_rq)
4590 dequeue_task(rq, p, 0); 4793 dequeue_task(rq, p, 0);
4591 dec_load(rq, p);
4592 }
4593 4794
4594 p->static_prio = NICE_TO_PRIO(nice); 4795 p->static_prio = NICE_TO_PRIO(nice);
4595 set_load_weight(p); 4796 set_load_weight(p);
@@ -4599,7 +4800,6 @@ void set_user_nice(struct task_struct *p, long nice)
4599 4800
4600 if (on_rq) { 4801 if (on_rq) {
4601 enqueue_task(rq, p, 0); 4802 enqueue_task(rq, p, 0);
4602 inc_load(rq, p);
4603 /* 4803 /*
4604 * If the task increased its priority or is running and 4804 * If the task increased its priority or is running and
4605 * lowered its priority, then reschedule its CPU: 4805 * lowered its priority, then reschedule its CPU:
@@ -5070,24 +5270,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5070 return sched_setaffinity(pid, &new_mask); 5270 return sched_setaffinity(pid, &new_mask);
5071} 5271}
5072 5272
5073/*
5074 * Represents all cpu's present in the system
5075 * In systems capable of hotplug, this map could dynamically grow
5076 * as new cpu's are detected in the system via any platform specific
5077 * method, such as ACPI for e.g.
5078 */
5079
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask) 5273long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{ 5274{
5093 struct task_struct *p; 5275 struct task_struct *p;
@@ -5571,6 +5753,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5571 goto out; 5753 goto out;
5572 } 5754 }
5573 5755
5756 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5757 !cpus_equal(p->cpus_allowed, *new_mask))) {
5758 ret = -EINVAL;
5759 goto out;
5760 }
5761
5574 if (p->sched_class->set_cpus_allowed) 5762 if (p->sched_class->set_cpus_allowed)
5575 p->sched_class->set_cpus_allowed(p, new_mask); 5763 p->sched_class->set_cpus_allowed(p, new_mask);
5576 else { 5764 else {
@@ -5622,10 +5810,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5622 double_rq_lock(rq_src, rq_dest); 5810 double_rq_lock(rq_src, rq_dest);
5623 /* Already moved. */ 5811 /* Already moved. */
5624 if (task_cpu(p) != src_cpu) 5812 if (task_cpu(p) != src_cpu)
5625 goto out; 5813 goto done;
5626 /* Affinity changed (again). */ 5814 /* Affinity changed (again). */
5627 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 5815 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5628 goto out; 5816 goto fail;
5629 5817
5630 on_rq = p->se.on_rq; 5818 on_rq = p->se.on_rq;
5631 if (on_rq) 5819 if (on_rq)
@@ -5636,8 +5824,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5636 activate_task(rq_dest, p, 0); 5824 activate_task(rq_dest, p, 0);
5637 check_preempt_curr(rq_dest, p); 5825 check_preempt_curr(rq_dest, p);
5638 } 5826 }
5827done:
5639 ret = 1; 5828 ret = 1;
5640out: 5829fail:
5641 double_rq_unlock(rq_src, rq_dest); 5830 double_rq_unlock(rq_src, rq_dest);
5642 return ret; 5831 return ret;
5643} 5832}
@@ -6059,6 +6248,36 @@ static void unregister_sched_domain_sysctl(void)
6059} 6248}
6060#endif 6249#endif
6061 6250
6251static void set_rq_online(struct rq *rq)
6252{
6253 if (!rq->online) {
6254 const struct sched_class *class;
6255
6256 cpu_set(rq->cpu, rq->rd->online);
6257 rq->online = 1;
6258
6259 for_each_class(class) {
6260 if (class->rq_online)
6261 class->rq_online(rq);
6262 }
6263 }
6264}
6265
6266static void set_rq_offline(struct rq *rq)
6267{
6268 if (rq->online) {
6269 const struct sched_class *class;
6270
6271 for_each_class(class) {
6272 if (class->rq_offline)
6273 class->rq_offline(rq);
6274 }
6275
6276 cpu_clear(rq->cpu, rq->rd->online);
6277 rq->online = 0;
6278 }
6279}
6280
6062/* 6281/*
6063 * migration_call - callback that gets triggered when a CPU is added. 6282 * migration_call - callback that gets triggered when a CPU is added.
6064 * Here we can start up the necessary migration thread for the new CPU. 6283 * Here we can start up the necessary migration thread for the new CPU.
@@ -6096,7 +6315,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6096 spin_lock_irqsave(&rq->lock, flags); 6315 spin_lock_irqsave(&rq->lock, flags);
6097 if (rq->rd) { 6316 if (rq->rd) {
6098 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6317 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6099 cpu_set(cpu, rq->rd->online); 6318
6319 set_rq_online(rq);
6100 } 6320 }
6101 spin_unlock_irqrestore(&rq->lock, flags); 6321 spin_unlock_irqrestore(&rq->lock, flags);
6102 break; 6322 break;
@@ -6157,7 +6377,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6157 spin_lock_irqsave(&rq->lock, flags); 6377 spin_lock_irqsave(&rq->lock, flags);
6158 if (rq->rd) { 6378 if (rq->rd) {
6159 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6379 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6160 cpu_clear(cpu, rq->rd->online); 6380 set_rq_offline(rq);
6161 } 6381 }
6162 spin_unlock_irqrestore(&rq->lock, flags); 6382 spin_unlock_irqrestore(&rq->lock, flags);
6163 break; 6383 break;
@@ -6191,6 +6411,28 @@ void __init migration_init(void)
6191 6411
6192#ifdef CONFIG_SCHED_DEBUG 6412#ifdef CONFIG_SCHED_DEBUG
6193 6413
6414static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6415{
6416 switch (lvl) {
6417 case SD_LV_NONE:
6418 return "NONE";
6419 case SD_LV_SIBLING:
6420 return "SIBLING";
6421 case SD_LV_MC:
6422 return "MC";
6423 case SD_LV_CPU:
6424 return "CPU";
6425 case SD_LV_NODE:
6426 return "NODE";
6427 case SD_LV_ALLNODES:
6428 return "ALLNODES";
6429 case SD_LV_MAX:
6430 return "MAX";
6431
6432 }
6433 return "MAX";
6434}
6435
6194static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6436static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6195 cpumask_t *groupmask) 6437 cpumask_t *groupmask)
6196{ 6438{
@@ -6210,7 +6452,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6210 return -1; 6452 return -1;
6211 } 6453 }
6212 6454
6213 printk(KERN_CONT "span %s\n", str); 6455 printk(KERN_CONT "span %s level %s\n",
6456 str, sd_level_to_string(sd->level));
6214 6457
6215 if (!cpu_isset(cpu, sd->span)) { 6458 if (!cpu_isset(cpu, sd->span)) {
6216 printk(KERN_ERR "ERROR: domain->span does not contain " 6459 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6294,9 +6537,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6294 } 6537 }
6295 kfree(groupmask); 6538 kfree(groupmask);
6296} 6539}
6297#else 6540#else /* !CONFIG_SCHED_DEBUG */
6298# define sched_domain_debug(sd, cpu) do { } while (0) 6541# define sched_domain_debug(sd, cpu) do { } while (0)
6299#endif 6542#endif /* CONFIG_SCHED_DEBUG */
6300 6543
6301static int sd_degenerate(struct sched_domain *sd) 6544static int sd_degenerate(struct sched_domain *sd)
6302{ 6545{
@@ -6356,20 +6599,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6356static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6599static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6357{ 6600{
6358 unsigned long flags; 6601 unsigned long flags;
6359 const struct sched_class *class;
6360 6602
6361 spin_lock_irqsave(&rq->lock, flags); 6603 spin_lock_irqsave(&rq->lock, flags);
6362 6604
6363 if (rq->rd) { 6605 if (rq->rd) {
6364 struct root_domain *old_rd = rq->rd; 6606 struct root_domain *old_rd = rq->rd;
6365 6607
6366 for (class = sched_class_highest; class; class = class->next) { 6608 if (cpu_isset(rq->cpu, old_rd->online))
6367 if (class->leave_domain) 6609 set_rq_offline(rq);
6368 class->leave_domain(rq);
6369 }
6370 6610
6371 cpu_clear(rq->cpu, old_rd->span); 6611 cpu_clear(rq->cpu, old_rd->span);
6372 cpu_clear(rq->cpu, old_rd->online);
6373 6612
6374 if (atomic_dec_and_test(&old_rd->refcount)) 6613 if (atomic_dec_and_test(&old_rd->refcount))
6375 kfree(old_rd); 6614 kfree(old_rd);
@@ -6380,12 +6619,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6380 6619
6381 cpu_set(rq->cpu, rd->span); 6620 cpu_set(rq->cpu, rd->span);
6382 if (cpu_isset(rq->cpu, cpu_online_map)) 6621 if (cpu_isset(rq->cpu, cpu_online_map))
6383 cpu_set(rq->cpu, rd->online); 6622 set_rq_online(rq);
6384
6385 for (class = sched_class_highest; class; class = class->next) {
6386 if (class->join_domain)
6387 class->join_domain(rq);
6388 }
6389 6623
6390 spin_unlock_irqrestore(&rq->lock, flags); 6624 spin_unlock_irqrestore(&rq->lock, flags);
6391} 6625}
@@ -6396,6 +6630,8 @@ static void init_rootdomain(struct root_domain *rd)
6396 6630
6397 cpus_clear(rd->span); 6631 cpus_clear(rd->span);
6398 cpus_clear(rd->online); 6632 cpus_clear(rd->online);
6633
6634 cpupri_init(&rd->cpupri);
6399} 6635}
6400 6636
6401static void init_defrootdomain(void) 6637static void init_defrootdomain(void)
@@ -6590,7 +6826,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6590 cpus_or(*span, *span, *nodemask); 6826 cpus_or(*span, *span, *nodemask);
6591 } 6827 }
6592} 6828}
6593#endif 6829#endif /* CONFIG_NUMA */
6594 6830
6595int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6831int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6596 6832
@@ -6609,7 +6845,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6609 *sg = &per_cpu(sched_group_cpus, cpu); 6845 *sg = &per_cpu(sched_group_cpus, cpu);
6610 return cpu; 6846 return cpu;
6611} 6847}
6612#endif 6848#endif /* CONFIG_SCHED_SMT */
6613 6849
6614/* 6850/*
6615 * multi-core sched-domains: 6851 * multi-core sched-domains:
@@ -6617,7 +6853,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6617#ifdef CONFIG_SCHED_MC 6853#ifdef CONFIG_SCHED_MC
6618static DEFINE_PER_CPU(struct sched_domain, core_domains); 6854static DEFINE_PER_CPU(struct sched_domain, core_domains);
6619static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6855static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6620#endif 6856#endif /* CONFIG_SCHED_MC */
6621 6857
6622#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6858#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6623static int 6859static int
@@ -6719,7 +6955,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6719 sg = sg->next; 6955 sg = sg->next;
6720 } while (sg != group_head); 6956 } while (sg != group_head);
6721} 6957}
6722#endif 6958#endif /* CONFIG_NUMA */
6723 6959
6724#ifdef CONFIG_NUMA 6960#ifdef CONFIG_NUMA
6725/* Free memory allocated for various sched_group structures */ 6961/* Free memory allocated for various sched_group structures */
@@ -6756,11 +6992,11 @@ next_sg:
6756 sched_group_nodes_bycpu[cpu] = NULL; 6992 sched_group_nodes_bycpu[cpu] = NULL;
6757 } 6993 }
6758} 6994}
6759#else 6995#else /* !CONFIG_NUMA */
6760static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 6996static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6761{ 6997{
6762} 6998}
6763#endif 6999#endif /* CONFIG_NUMA */
6764 7000
6765/* 7001/*
6766 * Initialize sched groups cpu_power. 7002 * Initialize sched groups cpu_power.
@@ -7469,7 +7705,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7469#endif 7705#endif
7470 return err; 7706 return err;
7471} 7707}
7472#endif 7708#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7473 7709
7474/* 7710/*
7475 * Force a reinitialization of the sched domains hierarchy. The domains 7711 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7480,21 +7716,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7480static int update_sched_domains(struct notifier_block *nfb, 7716static int update_sched_domains(struct notifier_block *nfb,
7481 unsigned long action, void *hcpu) 7717 unsigned long action, void *hcpu)
7482{ 7718{
7719 int cpu = (int)(long)hcpu;
7720
7483 switch (action) { 7721 switch (action) {
7484 case CPU_UP_PREPARE:
7485 case CPU_UP_PREPARE_FROZEN:
7486 case CPU_DOWN_PREPARE: 7722 case CPU_DOWN_PREPARE:
7487 case CPU_DOWN_PREPARE_FROZEN: 7723 case CPU_DOWN_PREPARE_FROZEN:
7724 disable_runtime(cpu_rq(cpu));
7725 /* fall-through */
7726 case CPU_UP_PREPARE:
7727 case CPU_UP_PREPARE_FROZEN:
7488 detach_destroy_domains(&cpu_online_map); 7728 detach_destroy_domains(&cpu_online_map);
7489 free_sched_domains(); 7729 free_sched_domains();
7490 return NOTIFY_OK; 7730 return NOTIFY_OK;
7491 7731
7492 case CPU_UP_CANCELED: 7732
7493 case CPU_UP_CANCELED_FROZEN:
7494 case CPU_DOWN_FAILED: 7733 case CPU_DOWN_FAILED:
7495 case CPU_DOWN_FAILED_FROZEN: 7734 case CPU_DOWN_FAILED_FROZEN:
7496 case CPU_ONLINE: 7735 case CPU_ONLINE:
7497 case CPU_ONLINE_FROZEN: 7736 case CPU_ONLINE_FROZEN:
7737 enable_runtime(cpu_rq(cpu));
7738 /* fall-through */
7739 case CPU_UP_CANCELED:
7740 case CPU_UP_CANCELED_FROZEN:
7498 case CPU_DEAD: 7741 case CPU_DEAD:
7499 case CPU_DEAD_FROZEN: 7742 case CPU_DEAD_FROZEN:
7500 /* 7743 /*
@@ -7694,8 +7937,8 @@ void __init sched_init(void)
7694 7937
7695 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7938 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7696 ptr += nr_cpu_ids * sizeof(void **); 7939 ptr += nr_cpu_ids * sizeof(void **);
7697#endif 7940#endif /* CONFIG_USER_SCHED */
7698#endif 7941#endif /* CONFIG_FAIR_GROUP_SCHED */
7699#ifdef CONFIG_RT_GROUP_SCHED 7942#ifdef CONFIG_RT_GROUP_SCHED
7700 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7943 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7701 ptr += nr_cpu_ids * sizeof(void **); 7944 ptr += nr_cpu_ids * sizeof(void **);
@@ -7709,8 +7952,8 @@ void __init sched_init(void)
7709 7952
7710 root_task_group.rt_rq = (struct rt_rq **)ptr; 7953 root_task_group.rt_rq = (struct rt_rq **)ptr;
7711 ptr += nr_cpu_ids * sizeof(void **); 7954 ptr += nr_cpu_ids * sizeof(void **);
7712#endif 7955#endif /* CONFIG_USER_SCHED */
7713#endif 7956#endif /* CONFIG_RT_GROUP_SCHED */
7714 } 7957 }
7715 7958
7716#ifdef CONFIG_SMP 7959#ifdef CONFIG_SMP
@@ -7726,8 +7969,8 @@ void __init sched_init(void)
7726#ifdef CONFIG_USER_SCHED 7969#ifdef CONFIG_USER_SCHED
7727 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7970 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7728 global_rt_period(), RUNTIME_INF); 7971 global_rt_period(), RUNTIME_INF);
7729#endif 7972#endif /* CONFIG_USER_SCHED */
7730#endif 7973#endif /* CONFIG_RT_GROUP_SCHED */
7731 7974
7732#ifdef CONFIG_GROUP_SCHED 7975#ifdef CONFIG_GROUP_SCHED
7733 list_add(&init_task_group.list, &task_groups); 7976 list_add(&init_task_group.list, &task_groups);
@@ -7737,8 +7980,8 @@ void __init sched_init(void)
7737 INIT_LIST_HEAD(&root_task_group.children); 7980 INIT_LIST_HEAD(&root_task_group.children);
7738 init_task_group.parent = &root_task_group; 7981 init_task_group.parent = &root_task_group;
7739 list_add(&init_task_group.siblings, &root_task_group.children); 7982 list_add(&init_task_group.siblings, &root_task_group.children);
7740#endif 7983#endif /* CONFIG_USER_SCHED */
7741#endif 7984#endif /* CONFIG_GROUP_SCHED */
7742 7985
7743 for_each_possible_cpu(i) { 7986 for_each_possible_cpu(i) {
7744 struct rq *rq; 7987 struct rq *rq;
@@ -7818,6 +8061,7 @@ void __init sched_init(void)
7818 rq->next_balance = jiffies; 8061 rq->next_balance = jiffies;
7819 rq->push_cpu = 0; 8062 rq->push_cpu = 0;
7820 rq->cpu = i; 8063 rq->cpu = i;
8064 rq->online = 0;
7821 rq->migration_thread = NULL; 8065 rq->migration_thread = NULL;
7822 INIT_LIST_HEAD(&rq->migration_queue); 8066 INIT_LIST_HEAD(&rq->migration_queue);
7823 rq_attach_root(rq, &def_root_domain); 8067 rq_attach_root(rq, &def_root_domain);
@@ -8057,7 +8301,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8057{ 8301{
8058 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8302 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8059} 8303}
8060#else 8304#else /* !CONFIG_FAIR_GROUP_SCHED */
8061static inline void free_fair_sched_group(struct task_group *tg) 8305static inline void free_fair_sched_group(struct task_group *tg)
8062{ 8306{
8063} 8307}
@@ -8075,7 +8319,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8075static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8319static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8076{ 8320{
8077} 8321}
8078#endif 8322#endif /* CONFIG_FAIR_GROUP_SCHED */
8079 8323
8080#ifdef CONFIG_RT_GROUP_SCHED 8324#ifdef CONFIG_RT_GROUP_SCHED
8081static void free_rt_sched_group(struct task_group *tg) 8325static void free_rt_sched_group(struct task_group *tg)
@@ -8146,7 +8390,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8146{ 8390{
8147 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8391 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8148} 8392}
8149#else 8393#else /* !CONFIG_RT_GROUP_SCHED */
8150static inline void free_rt_sched_group(struct task_group *tg) 8394static inline void free_rt_sched_group(struct task_group *tg)
8151{ 8395{
8152} 8396}
@@ -8164,7 +8408,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8164static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8408static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8165{ 8409{
8166} 8410}
8167#endif 8411#endif /* CONFIG_RT_GROUP_SCHED */
8168 8412
8169#ifdef CONFIG_GROUP_SCHED 8413#ifdef CONFIG_GROUP_SCHED
8170static void free_sched_group(struct task_group *tg) 8414static void free_sched_group(struct task_group *tg)
@@ -8275,17 +8519,14 @@ void sched_move_task(struct task_struct *tsk)
8275 8519
8276 task_rq_unlock(rq, &flags); 8520 task_rq_unlock(rq, &flags);
8277} 8521}
8278#endif 8522#endif /* CONFIG_GROUP_SCHED */
8279 8523
8280#ifdef CONFIG_FAIR_GROUP_SCHED 8524#ifdef CONFIG_FAIR_GROUP_SCHED
8281static void set_se_shares(struct sched_entity *se, unsigned long shares) 8525static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8282{ 8526{
8283 struct cfs_rq *cfs_rq = se->cfs_rq; 8527 struct cfs_rq *cfs_rq = se->cfs_rq;
8284 struct rq *rq = cfs_rq->rq;
8285 int on_rq; 8528 int on_rq;
8286 8529
8287 spin_lock_irq(&rq->lock);
8288
8289 on_rq = se->on_rq; 8530 on_rq = se->on_rq;
8290 if (on_rq) 8531 if (on_rq)
8291 dequeue_entity(cfs_rq, se, 0); 8532 dequeue_entity(cfs_rq, se, 0);
@@ -8295,8 +8536,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
8295 8536
8296 if (on_rq) 8537 if (on_rq)
8297 enqueue_entity(cfs_rq, se, 0); 8538 enqueue_entity(cfs_rq, se, 0);
8539}
8298 8540
8299 spin_unlock_irq(&rq->lock); 8541static void set_se_shares(struct sched_entity *se, unsigned long shares)
8542{
8543 struct cfs_rq *cfs_rq = se->cfs_rq;
8544 struct rq *rq = cfs_rq->rq;
8545 unsigned long flags;
8546
8547 spin_lock_irqsave(&rq->lock, flags);
8548 __set_se_shares(se, shares);
8549 spin_unlock_irqrestore(&rq->lock, flags);
8300} 8550}
8301 8551
8302static DEFINE_MUTEX(shares_mutex); 8552static DEFINE_MUTEX(shares_mutex);
@@ -8335,8 +8585,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8335 * w/o tripping rebalance_share or load_balance_fair. 8585 * w/o tripping rebalance_share or load_balance_fair.
8336 */ 8586 */
8337 tg->shares = shares; 8587 tg->shares = shares;
8338 for_each_possible_cpu(i) 8588 for_each_possible_cpu(i) {
8589 /*
8590 * force a rebalance
8591 */
8592 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8339 set_se_shares(tg->se[i], shares); 8593 set_se_shares(tg->se[i], shares);
8594 }
8340 8595
8341 /* 8596 /*
8342 * Enable load balance activity on this group, by inserting it back on 8597 * Enable load balance activity on this group, by inserting it back on
@@ -8375,7 +8630,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8375#ifdef CONFIG_CGROUP_SCHED 8630#ifdef CONFIG_CGROUP_SCHED
8376static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8631static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8377{ 8632{
8378 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8633 struct task_group *tgi, *parent = tg->parent;
8379 unsigned long total = 0; 8634 unsigned long total = 0;
8380 8635
8381 if (!parent) { 8636 if (!parent) {
@@ -8399,7 +8654,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8399 } 8654 }
8400 rcu_read_unlock(); 8655 rcu_read_unlock();
8401 8656
8402 return total + to_ratio(period, runtime) < 8657 return total + to_ratio(period, runtime) <=
8403 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8658 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8404 parent->rt_bandwidth.rt_runtime); 8659 parent->rt_bandwidth.rt_runtime);
8405} 8660}
@@ -8519,16 +8774,21 @@ long sched_group_rt_period(struct task_group *tg)
8519 8774
8520static int sched_rt_global_constraints(void) 8775static int sched_rt_global_constraints(void)
8521{ 8776{
8777 struct task_group *tg = &root_task_group;
8778 u64 rt_runtime, rt_period;
8522 int ret = 0; 8779 int ret = 0;
8523 8780
8781 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8782 rt_runtime = tg->rt_bandwidth.rt_runtime;
8783
8524 mutex_lock(&rt_constraints_mutex); 8784 mutex_lock(&rt_constraints_mutex);
8525 if (!__rt_schedulable(NULL, 1, 0)) 8785 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8526 ret = -EINVAL; 8786 ret = -EINVAL;
8527 mutex_unlock(&rt_constraints_mutex); 8787 mutex_unlock(&rt_constraints_mutex);
8528 8788
8529 return ret; 8789 return ret;
8530} 8790}
8531#else 8791#else /* !CONFIG_RT_GROUP_SCHED */
8532static int sched_rt_global_constraints(void) 8792static int sched_rt_global_constraints(void)
8533{ 8793{
8534 unsigned long flags; 8794 unsigned long flags;
@@ -8546,7 +8806,7 @@ static int sched_rt_global_constraints(void)
8546 8806
8547 return 0; 8807 return 0;
8548} 8808}
8549#endif 8809#endif /* CONFIG_RT_GROUP_SCHED */
8550 8810
8551int sched_rt_handler(struct ctl_table *table, int write, 8811int sched_rt_handler(struct ctl_table *table, int write,
8552 struct file *filp, void __user *buffer, size_t *lenp, 8812 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8654,7 +8914,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8654 8914
8655 return (u64) tg->shares; 8915 return (u64) tg->shares;
8656} 8916}
8657#endif 8917#endif /* CONFIG_FAIR_GROUP_SCHED */
8658 8918
8659#ifdef CONFIG_RT_GROUP_SCHED 8919#ifdef CONFIG_RT_GROUP_SCHED
8660static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8920static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8678,7 +8938,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8678{ 8938{
8679 return sched_group_rt_period(cgroup_tg(cgrp)); 8939 return sched_group_rt_period(cgroup_tg(cgrp));
8680} 8940}
8681#endif 8941#endif /* CONFIG_RT_GROUP_SCHED */
8682 8942
8683static struct cftype cpu_files[] = { 8943static struct cftype cpu_files[] = {
8684#ifdef CONFIG_FAIR_GROUP_SCHED 8944#ifdef CONFIG_FAIR_GROUP_SCHED
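A few hunks up, the RT group admission test changes from '<' to '<=' and sched_rt_global_constraints() now feeds the root task group's own period and runtime into __rt_schedulable(); both hinge on comparing bandwidth ratios (runtime/period) of children against their parent. Below is a stand-alone sketch of that admission rule; the fixed-point to_ratio() helper here is an assumption standing in for the kernel's, so only the comparison is meant to match.

#include <stdint.h>
#include <stdio.h>

/* Assumed stand-in for the kernel's to_ratio(): turn runtime/period into a
 * fixed-point fraction so per-group ratios can be summed and compared. */
static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
        return (runtime_ns << 16) / period_ns;
}

int main(void)
{
        /* Parent group: 950 ms of RT runtime per 1 s period. */
        uint64_t parent = to_ratio(1000000000ULL, 950000000ULL);

        /* Two children asking for 500 ms/s and 450 ms/s. */
        uint64_t total = to_ratio(1000000000ULL, 500000000ULL) +
                         to_ratio(1000000000ULL, 450000000ULL);

        /* The '<' test rejected this exact fit; '<=' admits children that
         * use the parent's bandwidth completely but never more. */
        printf("admit: %s\n", total <= parent ? "yes" : "no");
        return 0;
}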
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 8affbfd0cdb0..22ed55d1167f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -330,3 +330,16 @@ unsigned long long __attribute__((weak)) sched_clock(void)
330{ 330{
331 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 331 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
332} 332}
333
334unsigned long long cpu_clock(int cpu)
335{
336 unsigned long long clock;
337 unsigned long flags;
338
339 local_irq_save(flags);
340 clock = sched_clock_cpu(cpu);
341 local_irq_restore(flags);
342
343 return clock;
344}
345EXPORT_SYMBOL_GPL(cpu_clock);
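The new cpu_clock() above is a thin wrapper that reads sched_clock_cpu() with local interrupts disabled, so callers get a per-CPU nanosecond timestamp. A user-space sketch of the calling pattern follows; the clock_gettime() stand-in is an assumption used only so the example runs outside the kernel.

#include <stdio.h>
#include <time.h>

/* User-space stand-in for cpu_clock(cpu): a monotonic nanosecond timestamp.
 * The kernel version instead disables local interrupts and reads
 * sched_clock_cpu(cpu), as in the hunk above. */
static unsigned long long cpu_clock(int cpu)
{
        struct timespec ts;

        (void)cpu;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
        unsigned long long t0 = cpu_clock(0);
        /* ... section being timed ... */
        unsigned long long t1 = cpu_clock(0);

        printf("section took %llu ns\n", t1 - t0);
        return 0;
}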
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
1/*
2 * kernel/sched_cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include "sched_cpupri.h"
31
32/* Convert between a 140 based task->prio, and our 102 based cpupri */
33static int convert_prio(int prio)
34{
35 int cpupri;
36
37 if (prio == CPUPRI_INVALID)
38 cpupri = CPUPRI_INVALID;
39 else if (prio == MAX_PRIO)
40 cpupri = CPUPRI_IDLE;
41 else if (prio >= MAX_RT_PRIO)
42 cpupri = CPUPRI_NORMAL;
43 else
44 cpupri = MAX_RT_PRIO - prio + 1;
45
46 return cpupri;
47}
48
49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53
54/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context
57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs
59 *
60 * Note: This function returns the recommended CPUs as calculated during the
 61 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current
65 * priority configuration.
66 *
67 * Returns: (int)bool - CPUs were found
68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask)
71{
72 int idx = 0;
73 int task_pri = convert_prio(p->prio);
74
75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78
79 if (idx >= task_pri)
80 break;
81
82 cpus_and(mask, p->cpus_allowed, vec->mask);
83
84 if (cpus_empty(mask))
85 continue;
86
87 *lowest_mask = mask;
88 return 1;
89 }
90
91 return 0;
92}
93
94/**
95 * cpupri_set - update the cpu priority setting
96 * @cp: The cpupri context
97 * @cpu: The target cpu
98 * @pri: The priority (INVALID-RT99) to assign to this CPU
99 *
100 * Note: Assumes cpu_rq(cpu)->lock is locked
101 *
102 * Returns: (void)
103 */
104void cpupri_set(struct cpupri *cp, int cpu, int newpri)
105{
106 int *currpri = &cp->cpu_to_pri[cpu];
107 int oldpri = *currpri;
108 unsigned long flags;
109
110 newpri = convert_prio(newpri);
111
112 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
113
114 if (newpri == oldpri)
115 return;
116
117 /*
118 * If the cpu was currently mapped to a different value, we
119 * first need to unmap the old value
120 */
121 if (likely(oldpri != CPUPRI_INVALID)) {
122 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
123
124 spin_lock_irqsave(&vec->lock, flags);
125
126 vec->count--;
127 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask);
130
131 spin_unlock_irqrestore(&vec->lock, flags);
132 }
133
134 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136
137 spin_lock_irqsave(&vec->lock, flags);
138
139 cpu_set(cpu, vec->mask);
140 vec->count++;
141 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active);
143
144 spin_unlock_irqrestore(&vec->lock, flags);
145 }
146
147 *currpri = newpri;
148}
149
150/**
151 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context
153 *
154 * Returns: (void)
155 */
156void cpupri_init(struct cpupri *cp)
157{
158 int i;
159
160 memset(cp, 0, sizeof(*cp));
161
162 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
163 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
164
165 spin_lock_init(&vec->lock);
166 vec->count = 0;
167 cpus_clear(vec->mask);
168 }
169
170 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID;
172}
173
174
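Per the header comment, cpupri exists so RT migration decisions can answer "which CPUs currently run something lower-priority than this task?" with a couple of bitmap searches. The toy user-space model below mirrors that two-level structure (an active-class bitmap plus a per-class CPU mask); every name in it is invented for illustration and none of it is the kernel API.

#include <stdio.h>

/* Toy model for 8 CPUs: one bit per priority class saying "some CPU sits at
 * this level", plus a per-class CPU mask.  The kernel walks its class bitmap
 * with find_first_bit()/find_next_bit() and locks each vector; this sketch
 * scans the classes in order and has no locking. */
#define NR_CLASSES 6

struct toy_cpupri {
        unsigned int class_active;        /* bit n: class n is non-empty   */
        unsigned char cpus[NR_CLASSES];   /* bit c: CPU c is in this class */
        int cpu_class[8];
};

static void toy_set(struct toy_cpupri *cp, int cpu, int newclass)
{
        int old = cp->cpu_class[cpu];

        cp->cpus[old] &= (unsigned char)~(1u << cpu);
        if (!cp->cpus[old])
                cp->class_active &= ~(1u << old);

        cp->cpus[newclass] |= (unsigned char)(1u << cpu);
        cp->class_active |= 1u << newclass;
        cp->cpu_class[cpu] = newclass;
}

/* Lowest-priority CPUs a task of class task_class could be pushed to,
 * restricted to its affinity mask; 0 means no suitable CPU. */
static unsigned char toy_find(const struct toy_cpupri *cp, int task_class,
                              unsigned char affinity)
{
        int idx;

        for (idx = 0; idx < task_class; idx++) {
                unsigned char mask;

                if (!(cp->class_active & (1u << idx)))
                        continue;
                mask = cp->cpus[idx] & affinity;
                if (mask)
                        return mask;
        }
        return 0;
}

int main(void)
{
        struct toy_cpupri cp = { .class_active = 1u,
                                 .cpus = { [0] = 0xff } };   /* all idle */

        toy_set(&cp, 2, 3);   /* CPU 2 picks up an RT task of class 3 */
        toy_set(&cp, 5, 1);   /* CPU 5 runs a normal task             */

        /* A class-4 task with full affinity should land on an idle CPU. */
        printf("candidates: 0x%x\n", toy_find(&cp, 4, 0xff));
        return 0;
}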
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..f25811b0f931
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8
9#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */
13
14struct cpupri_vec {
15 spinlock_t lock;
16 int count;
17 cpumask_t mask;
18};
19
20struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS];
24};
25
26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp);
31#else
32#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0)
34#endif
35
36#endif /* _LINUX_CPUPRI_H */
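For reference, convert_prio() in the new file above maps the 140-level p->prio scale onto this 102-level one: MAX_PRIO becomes CPUPRI_IDLE, any non-RT priority becomes CPUPRI_NORMAL, and RT priorities land on 2..101 with a larger cpupri meaning a higher RT priority. A small stand-alone check of that mapping; MAX_RT_PRIO and MAX_PRIO are hard-coded to the usual 100/140 here, which is an assumption about the configuration.

#include <stdio.h>

#define MAX_RT_PRIO     100                /* assumed, as in the usual config */
#define MAX_PRIO        (MAX_RT_PRIO + 40)

/* Same mapping as convert_prio() above, minus the INVALID case. */
static int convert_prio(int prio)
{
        if (prio == MAX_PRIO)
                return 0;                  /* CPUPRI_IDLE   */
        if (prio >= MAX_RT_PRIO)
                return 1;                  /* CPUPRI_NORMAL */
        return MAX_RT_PRIO - prio + 1;     /* 2..101        */
}

int main(void)
{
        /* prio 0 is the highest RT priority, prio 99 the lowest. */
        printf("%d %d %d %d\n",
               convert_prio(0),            /* 101              */
               convert_prio(99),           /* 2                */
               convert_prio(120),          /* 1: a nice-0 task */
               convert_prio(MAX_PRIO));    /* 0: idle          */
        return 0;
}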
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8bb713040ac9..bbe6b31c3c56 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
119 struct sched_entity *last; 119 struct sched_entity *last;
120 unsigned long flags; 120 unsigned long flags;
121 121
122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) 122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = ""; 123 char path[128] = "";
126 struct cgroup *cgroup = NULL; 124 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg; 125 struct task_group *tg = cfs_rq->tg;
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
133 cgroup_path(cgroup, path, sizeof(path)); 131 cgroup_path(cgroup, path, sizeof(path));
134 132
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
134#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 136#endif
137 137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
@@ -162,11 +162,64 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
164#ifdef CONFIG_SCHEDSTATS 164#ifdef CONFIG_SCHEDSTATS
165 SEQ_printf(m, " .%-30s: %d\n", "bkl_count", 165#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
166 rq->bkl_count); 166
167 P(yld_exp_empty);
168 P(yld_act_empty);
169 P(yld_both_empty);
170 P(yld_count);
171
172 P(sched_switch);
173 P(sched_count);
174 P(sched_goidle);
175
176 P(ttwu_count);
177 P(ttwu_local);
178
179 P(bkl_count);
180
181#undef P
167#endif 182#endif
168 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", 183 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
169 cfs_rq->nr_spread_over); 184 cfs_rq->nr_spread_over);
185#ifdef CONFIG_FAIR_GROUP_SCHED
186#ifdef CONFIG_SMP
187 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
188#endif
189#endif
190}
191
192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{
194#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
195 char path[128] = "";
196 struct cgroup *cgroup = NULL;
197 struct task_group *tg = rt_rq->tg;
198
199 if (tg)
200 cgroup = tg->css.cgroup;
201
202 if (cgroup)
203 cgroup_path(cgroup, path, sizeof(path));
204
205 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
206#else
207 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
208#endif
209
210
211#define P(x) \
212 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
213#define PN(x) \
214 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
215
216 P(rt_nr_running);
217 P(rt_throttled);
218 PN(rt_time);
219 PN(rt_runtime);
220
221#undef PN
222#undef P
170} 223}
171 224
172static void print_cpu(struct seq_file *m, int cpu) 225static void print_cpu(struct seq_file *m, int cpu)
@@ -208,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu)
208#undef PN 261#undef PN
209 262
210 print_cfs_stats(m, cpu); 263 print_cfs_stats(m, cpu);
264 print_rt_stats(m, cpu);
211 265
212 print_rq(m, rq, cpu); 266 print_rq(m, rq, cpu);
213} 267}
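The schedstat and rt_rq dumps above lean on a small preprocessor trick: the P()/PN() macros stringize the member name with #, so the printed label can never drift away from the field being printed. A minimal stand-alone illustration of the same pattern, with plain printf() in place of SEQ_printf() and an invented stats struct:

#include <stdio.h>

struct stats {
        int yld_count;
        int ttwu_count;
        long long rt_time;
};

/* #x turns the member name into its printed label, so label and value can
 * never drift apart -- the same trick as P()/PN() in sched_debug.c. */
#define P(x)    printf(" .%-30s: %lld\n", #x, (long long)(s.x))

int main(void)
{
        struct stats s = { .yld_count = 3, .ttwu_count = 17, .rt_time = 42 };

        P(yld_count);
        P(ttwu_count);
        P(rt_time);
#undef P
        return 0;
}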
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..f2aa987027d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 10000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334#endif 334#endif
335 335
336/* 336/*
337 * delta *= w / rw
338 */
339static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se)
341{
342 for_each_sched_entity(se) {
343 delta = calc_delta_mine(delta,
344 se->load.weight, &cfs_rq_of(se)->load);
345 }
346
347 return delta;
348}
349
350/*
351 * delta *= rw / w
352 */
353static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{
356 for_each_sched_entity(se) {
357 delta = calc_delta_mine(delta,
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360
361 return delta;
362}
363
364/*
337 * The idea is to set a period in which each task runs once. 365 * The idea is to set a period in which each task runs once.
338 * 366 *
339 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 367 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
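calc_delta_weight() and calc_delta_fair() above scale a time delta by w/rw (the entity's share of its runqueue weight) or by the inverse, walking up the group hierarchy. A worked stand-alone example of the first scaling; plain 64-bit integer math stands in for calc_delta_mine(), so the rounding, not the idea, is an assumption.

#include <stdio.h>

/* delta *= w / rw: the entity's share of the runqueue weight. */
static unsigned long scale_by_weight(unsigned long delta_ns,
                                     unsigned long weight,
                                     unsigned long rq_weight)
{
        return (unsigned long)((unsigned long long)delta_ns * weight
                               / rq_weight);
}

int main(void)
{
        /* Three nice-0 entities (weight 1024 each) sharing a 30 ms period
         * get a 10 ms slice apiece. */
        unsigned long period = 30000000UL;   /* 30 ms in ns */

        printf("slice = %lu ns\n", scale_by_weight(period, 1024, 3 * 1024));
        return 0;
}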
@@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running)
362 */ 390 */
363static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
364{ 392{
365 u64 slice = __sched_period(cfs_rq->nr_running); 393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
366
367 for_each_sched_entity(se) {
368 cfs_rq = cfs_rq_of(se);
369
370 slice *= se->load.weight;
371 do_div(slice, cfs_rq->load.weight);
372 }
373
374
375 return slice;
376} 394}
377 395
378/* 396/*
379 * We calculate the vruntime slice of a to be inserted task 397 * We calculate the vruntime slice of a to be inserted task
380 * 398 *
381 * vs = s/w = p/rw 399 * vs = s*rw/w = p
382 */ 400 */
383static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
384{ 402{
385 unsigned long nr_running = cfs_rq->nr_running; 403 unsigned long nr_running = cfs_rq->nr_running;
386 unsigned long weight;
387 u64 vslice;
388 404
389 if (!se->on_rq) 405 if (!se->on_rq)
390 nr_running++; 406 nr_running++;
391 407
392 vslice = __sched_period(nr_running); 408 return __sched_period(nr_running);
409}
410
411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{
425 struct load_weight lw = {
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
393 429
394 for_each_sched_entity(se) { 430 for_each_sched_entity(se) {
395 cfs_rq = cfs_rq_of(se); 431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
 434#ifdef CONFIG_FAIR_GROUP_SCHED
435 struct cfs_rq *cfs_rq = se->my_q;
 436 struct task_group *tg = NULL;
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
396 457
397 weight = cfs_rq->load.weight; 458 if (se->load.weight < NICE_0_LOAD) {
398 if (!se->on_rq) 459 se_lw = &lw;
399 weight += se->load.weight; 460 rw += NICE_0_LOAD - se->load.weight;
461 }
400 462
401 vslice *= NICE_0_LOAD; 463 delta = calc_delta_mine(delta, rw, se_lw);
402 do_div(vslice, weight);
403 } 464 }
404 465
405 return vslice; 466 return delta;
406} 467}
407 468
408/* 469/*
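The effect of the clamp in calc_delta_asym() above is easiest to see with numbers: an entity at or above NICE_0_LOAD is scaled by rw/w as usual, while a lighter (positive-nice) entity has its weight treated as NICE_0_LOAD, with rw padded accordingly, which keeps the scaled delta from exploding. A stand-alone sketch of the non-group branch only, with integer math standing in for calc_delta_mine():

#include <stdio.h>

#define NICE_0_LOAD     1024UL

/* Arithmetic of the non-group branch of calc_delta_asym() above. */
static unsigned long asym_scale(unsigned long delta, unsigned long w,
                                unsigned long rw)
{
        if (w < NICE_0_LOAD) {
                rw += NICE_0_LOAD - w;   /* treat the entity as nice-0 ... */
                w = NICE_0_LOAD;         /* ... for the division below     */
        }
        return (unsigned long)((unsigned long long)delta * rw / w);
}

int main(void)
{
        unsigned long gran = 10000000UL;   /* 10 ms                */
        unsigned long rw = 1024 + 15;      /* nice 0 plus nice +19 */

        /* Unclamped rw/w would inflate the nice +19 case by ~69x;
         * the clamp keeps it around 2x. */
        printf("nice 0:   %lu ns\n", asym_scale(gran, 1024, rw));
        printf("nice +19: %lu ns\n", asym_scale(gran, 15, rw));
        return 0;
}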
@@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
419 480
420 curr->sum_exec_runtime += delta_exec; 481 curr->sum_exec_runtime += delta_exec;
421 schedstat_add(cfs_rq, exec_clock, delta_exec); 482 schedstat_add(cfs_rq, exec_clock, delta_exec);
422 delta_exec_weighted = delta_exec; 483 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
423 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
424 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
425 &curr->load);
426 }
427 curr->vruntime += delta_exec_weighted; 484 curr->vruntime += delta_exec_weighted;
428} 485}
429 486
@@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
510 * Scheduling class queueing methods: 567 * Scheduling class queueing methods:
511 */ 568 */
512 569
570#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
571static void
572add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
573{
574 cfs_rq->task_weight += weight;
575}
576#else
577static inline void
578add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
579{
580}
581#endif
582
513static void 583static void
514account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 584account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
515{ 585{
516 update_load_add(&cfs_rq->load, se->load.weight); 586 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se))
590 add_cfs_task_weight(cfs_rq, se->load.weight);
517 cfs_rq->nr_running++; 591 cfs_rq->nr_running++;
518 se->on_rq = 1; 592 se->on_rq = 1;
519 list_add(&se->group_node, &cfs_rq->tasks); 593 list_add(&se->group_node, &cfs_rq->tasks);
@@ -523,6 +597,10 @@ static void
523account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 597account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
524{ 598{
525 update_load_sub(&cfs_rq->load, se->load.weight); 599 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se))
603 add_cfs_task_weight(cfs_rq, -se->load.weight);
526 cfs_rq->nr_running--; 604 cfs_rq->nr_running--;
527 se->on_rq = 0; 605 se->on_rq = 0;
528 list_del_init(&se->group_node); 606 list_del_init(&se->group_node);
@@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
609 687
610 if (!initial) { 688 if (!initial) {
611 /* sleeps upto a single latency don't count. */ 689 /* sleeps upto a single latency don't count. */
612 if (sched_feat(NEW_FAIR_SLEEPERS)) 690 if (sched_feat(NEW_FAIR_SLEEPERS)) {
613 vruntime -= sysctl_sched_latency; 691 unsigned long thresh = sysctl_sched_latency;
692
693 /*
694 * convert the sleeper threshold into virtual time
695 */
696 if (sched_feat(NORMALIZED_SLEEPER))
697 thresh = calc_delta_fair(thresh, se);
698
699 vruntime -= thresh;
700 }
614 701
615 /* ensure we never gain time by being placed backwards. */ 702 /* ensure we never gain time by being placed backwards. */
616 vruntime = max_vruntime(se->vruntime, vruntime); 703 vruntime = max_vruntime(se->vruntime, vruntime);
@@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
639 __enqueue_entity(cfs_rq, se); 726 __enqueue_entity(cfs_rq, se);
640} 727}
641 728
642static void update_avg(u64 *avg, u64 sample)
643{
644 s64 diff = sample - *avg;
645 *avg += diff >> 3;
646}
647
648static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
649{
650 if (!se->last_wakeup)
651 return;
652
653 update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
654 se->last_wakeup = 0;
655}
656
657static void 729static void
658dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 730dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
659{ 731{
@@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
664 736
665 update_stats_dequeue(cfs_rq, se); 737 update_stats_dequeue(cfs_rq, se);
666 if (sleep) { 738 if (sleep) {
667 update_avg_stats(cfs_rq, se);
668#ifdef CONFIG_SCHEDSTATS 739#ifdef CONFIG_SCHEDSTATS
669 if (entity_is_task(se)) { 740 if (entity_is_task(se)) {
670 struct task_struct *tsk = task_of(se); 741 struct task_struct *tsk = task_of(se);
@@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
726 se->prev_sum_exec_runtime = se->sum_exec_runtime; 797 se->prev_sum_exec_runtime = se->sum_exec_runtime;
727} 798}
728 799
729static int
730wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
731
732static struct sched_entity * 800static struct sched_entity *
733pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 801pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
734{ 802{
735 if (!cfs_rq->next) 803 struct rq *rq = rq_of(cfs_rq);
736 return se; 804 u64 pair_slice = rq->clock - cfs_rq->pair_start;
737 805
738 if (wakeup_preempt_entity(cfs_rq->next, se) != 0) 806 if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
807 cfs_rq->pair_start = rq->clock;
739 return se; 808 return se;
809 }
740 810
741 return cfs_rq->next; 811 return cfs_rq->next;
742} 812}
@@ -835,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
835 hrtick_start(rq, delta, requeue); 905 hrtick_start(rq, delta, requeue);
836 } 906 }
837} 907}
838#else 908#else /* !CONFIG_SCHED_HRTICK */
839static inline void 909static inline void
840hrtick_start_fair(struct rq *rq, struct task_struct *p) 910hrtick_start_fair(struct rq *rq, struct task_struct *p)
841{ 911{
@@ -976,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p)
976 } 1046 }
977 return cpu; 1047 return cpu;
978} 1048}
979#else 1049#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
980static inline int wake_idle(int cpu, struct task_struct *p) 1050static inline int wake_idle(int cpu, struct task_struct *p)
981{ 1051{
982 return cpu; 1052 return cpu;
@@ -987,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p)
987 1057
988static const struct sched_class fair_sched_class; 1058static const struct sched_class fair_sched_class;
989 1059
1060#ifdef CONFIG_FAIR_GROUP_SCHED
1061/*
1062 * effective_load() calculates the load change as seen from the root_task_group
1063 *
1064 * Adding load to a group doesn't make a group heavier, but can cause movement
1065 * of group shares between cpus. Assuming the shares were perfectly aligned one
1066 * can calculate the shift in shares.
1067 *
1068 * The problem is that perfectly aligning the shares is rather expensive, hence
1069 * we try to avoid doing that too often - see update_shares(), which ratelimits
1070 * this change.
1071 *
 1072 * We compensate for this by not only taking the current delta into account, but
1073 * also considering the delta between when the shares were last adjusted and
1074 * now.
1075 *
 1076 * We still saw a performance dip; some tracing showed us that between
 1077 * cgroup:/ and cgroup:/foo balancing, the number of affine wakeups increased
 1078 * significantly. Therefore try to bias the error in the direction of failing
 1079 * the affine wakeup.
1080 *
1081 */
1082static long effective_load(struct task_group *tg, int cpu,
1083 long wl, long wg)
1084{
1085 struct sched_entity *se = tg->se[cpu];
1086 long more_w;
1087
1088 if (!tg->parent)
1089 return wl;
1090
1091 /*
1092 * By not taking the decrease of shares on the other cpu into
1093 * account our error leans towards reducing the affine wakeups.
1094 */
1095 if (!wl && sched_feat(ASYM_EFF_LOAD))
1096 return wl;
1097
1098 /*
1099 * Instead of using this increment, also add the difference
1100 * between when the shares were last updated and now.
1101 */
1102 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1103 wl += more_w;
1104 wg += more_w;
1105
1106 for_each_sched_entity(se) {
1107#define D(n) (likely(n) ? (n) : 1)
1108
1109 long S, rw, s, a, b;
1110
1111 S = se->my_q->tg->shares;
1112 s = se->my_q->shares;
1113 rw = se->my_q->rq_weight;
1114
1115 a = S*(rw + wl);
1116 b = S*rw + s*wg;
1117
1118 wl = s*(a-b)/D(b);
1119 /*
1120 * Assume the group is already running and will
1121 * thus already be accounted for in the weight.
1122 *
1123 * That is, moving shares between CPUs, does not
1124 * alter the group weight.
1125 */
1126 wg = 0;
1127#undef D
1128 }
1129
1130 return wl;
1131}
1132
1133#else
1134
1135static inline unsigned long effective_load(struct task_group *tg, int cpu,
1136 unsigned long wl, unsigned long wg)
1137{
1138 return wl;
1139}
1140
1141#endif
1142
990static int 1143static int
991wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1144wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
992 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1145 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
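Each loop iteration in effective_load() above computes wl = s*(a-b)/b with a = S*(rw+wl) and b = S*rw + s*wg, an estimate of how the group's share s on this CPU shifts when wl of weight lands on the CPU and wg lands on the group. A stand-alone run of one such step; the S, s and rw values below are invented purely to show the magnitude of the result.

#include <stdio.h>

/* One iteration of the effective_load() loop above: S = group shares,
 * s = this CPU's share of them, rw = this CPU's runqueue weight,
 * wl/wg = weight added on this CPU / to the group as a whole. */
static long effective_load_step(long S, long s, long rw, long wl, long wg)
{
        long long a = (long long)S * (rw + wl);
        long long b = (long long)S * rw + (long long)s * wg;

        return (long)((long long)s * (a - b) / (b ? b : 1));   /* D(b) guard */
}

int main(void)
{
        /* A group with 1024 shares, 512 of them on this CPU whose cfs_rq
         * weighs 2048, when a nice-0 task (weight 1024) wakes up here. */
        printf("root-visible load change: %ld\n",
               effective_load_step(1024, 512, 2048, 1024, 1024));
        return 0;
}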
@@ -994,8 +1147,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
994 unsigned int imbalance) 1147 unsigned int imbalance)
995{ 1148{
996 struct task_struct *curr = this_rq->curr; 1149 struct task_struct *curr = this_rq->curr;
1150 struct task_group *tg;
997 unsigned long tl = this_load; 1151 unsigned long tl = this_load;
998 unsigned long tl_per_task; 1152 unsigned long tl_per_task;
1153 unsigned long weight;
999 int balanced; 1154 int balanced;
1000 1155
1001 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1156 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1006,19 +1161,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1006 * effect of the currently running task from the load 1161 * effect of the currently running task from the load
1007 * of the current CPU: 1162 * of the current CPU:
1008 */ 1163 */
1009 if (sync) 1164 if (sync) {
1010 tl -= current->se.load.weight; 1165 tg = task_group(current);
1166 weight = current->se.load.weight;
1167
1168 tl += effective_load(tg, this_cpu, -weight, -weight);
1169 load += effective_load(tg, prev_cpu, 0, -weight);
1170 }
1011 1171
1012 balanced = 100*(tl + p->se.load.weight) <= imbalance*load; 1172 tg = task_group(p);
1173 weight = p->se.load.weight;
1174
1175 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
1176 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1013 1177
1014 /* 1178 /*
1015 * If the currently running task will sleep within 1179 * If the currently running task will sleep within
1016 * a reasonable amount of time then attract this newly 1180 * a reasonable amount of time then attract this newly
1017 * woken task: 1181 * woken task:
1018 */ 1182 */
1019 if (sync && balanced && curr->sched_class == &fair_sched_class) { 1183 if (sync && balanced) {
1020 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1184 if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1021 p->se.avg_overlap < sysctl_sched_migration_cost) 1185 p->se.avg_overlap < sysctl_sched_migration_cost)
1022 return 1; 1186 return 1;
1023 } 1187 }
1024 1188
@@ -1111,11 +1275,13 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1111 unsigned long gran = sysctl_sched_wakeup_granularity; 1275 unsigned long gran = sysctl_sched_wakeup_granularity;
1112 1276
1113 /* 1277 /*
1114 * More easily preempt - nice tasks, while not making 1278 * More easily preempt - nice tasks, while not making it harder for
1115 * it harder for + nice tasks. 1279 * + nice tasks.
1116 */ 1280 */
1117 if (unlikely(se->load.weight > NICE_0_LOAD)) 1281 if (sched_feat(ASYM_GRAN))
1118 gran = calc_delta_fair(gran, &se->load); 1282 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
1283 else
1284 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1119 1285
1120 return gran; 1286 return gran;
1121} 1287}
@@ -1177,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1177 return; 1343 return;
1178 } 1344 }
1179 1345
1180 se->last_wakeup = se->sum_exec_runtime;
1181 if (unlikely(se == pse)) 1346 if (unlikely(se == pse))
1182 return; 1347 return;
1183 1348
@@ -1275,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1275 struct task_struct *p = NULL; 1440 struct task_struct *p = NULL;
1276 struct sched_entity *se; 1441 struct sched_entity *se;
1277 1442
1278 if (next == &cfs_rq->tasks) 1443 while (next != &cfs_rq->tasks) {
1279 return NULL;
1280
1281 /* Skip over entities that are not tasks */
1282 do {
1283 se = list_entry(next, struct sched_entity, group_node); 1444 se = list_entry(next, struct sched_entity, group_node);
1284 next = next->next; 1445 next = next->next;
1285 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1286 1446
1287 if (next == &cfs_rq->tasks) 1447 /* Skip over entities that are not tasks */
1288 return NULL; 1448 if (entity_is_task(se)) {
1449 p = task_of(se);
1450 break;
1451 }
1452 }
1289 1453
1290 cfs_rq->balance_iterator = next; 1454 cfs_rq->balance_iterator = next;
1291
1292 if (entity_is_task(se))
1293 p = task_of(se);
1294
1295 return p; 1455 return p;
1296} 1456}
1297 1457
@@ -1309,75 +1469,82 @@ static struct task_struct *load_balance_next_fair(void *arg)
1309 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1469 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1310} 1470}
1311 1471
1312#ifdef CONFIG_FAIR_GROUP_SCHED 1472static unsigned long
1313static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 1473__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1474 unsigned long max_load_move, struct sched_domain *sd,
1475 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
1476 struct cfs_rq *cfs_rq)
1314{ 1477{
1315 struct sched_entity *curr; 1478 struct rq_iterator cfs_rq_iterator;
1316 struct task_struct *p;
1317
1318 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1319 return MAX_PRIO;
1320
1321 curr = cfs_rq->curr;
1322 if (!curr)
1323 curr = __pick_next_entity(cfs_rq);
1324 1479
1325 p = task_of(curr); 1480 cfs_rq_iterator.start = load_balance_start_fair;
1481 cfs_rq_iterator.next = load_balance_next_fair;
1482 cfs_rq_iterator.arg = cfs_rq;
1326 1483
1327 return p->prio; 1484 return balance_tasks(this_rq, this_cpu, busiest,
1485 max_load_move, sd, idle, all_pinned,
1486 this_best_prio, &cfs_rq_iterator);
1328} 1487}
1329#endif
1330 1488
1489#ifdef CONFIG_FAIR_GROUP_SCHED
1331static unsigned long 1490static unsigned long
1332load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1491load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1333 unsigned long max_load_move, 1492 unsigned long max_load_move,
1334 struct sched_domain *sd, enum cpu_idle_type idle, 1493 struct sched_domain *sd, enum cpu_idle_type idle,
1335 int *all_pinned, int *this_best_prio) 1494 int *all_pinned, int *this_best_prio)
1336{ 1495{
1337 struct cfs_rq *busy_cfs_rq;
1338 long rem_load_move = max_load_move; 1496 long rem_load_move = max_load_move;
1339 struct rq_iterator cfs_rq_iterator; 1497 int busiest_cpu = cpu_of(busiest);
1340 1498 struct task_group *tg;
1341 cfs_rq_iterator.start = load_balance_start_fair;
1342 cfs_rq_iterator.next = load_balance_next_fair;
1343 1499
1344 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1500 rcu_read_lock();
1345#ifdef CONFIG_FAIR_GROUP_SCHED 1501 update_h_load(busiest_cpu);
1346 struct cfs_rq *this_cfs_rq;
1347 long imbalance;
1348 unsigned long maxload;
1349 1502
1350 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1503 list_for_each_entry(tg, &task_groups, list) {
1504 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1505 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1506 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
1507 u64 rem_load, moved_load;
1351 1508
1352 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1509 /*
1353 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 1510 * empty group
1354 if (imbalance <= 0) 1511 */
1512 if (!busiest_cfs_rq->task_weight)
1355 continue; 1513 continue;
1356 1514
1357 /* Don't pull more than imbalance/2 */ 1515 rem_load = (u64)rem_load_move * busiest_weight;
1358 imbalance /= 2; 1516 rem_load = div_u64(rem_load, busiest_h_load + 1);
1359 maxload = min(rem_load_move, imbalance);
1360 1517
1361 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1518 moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1362#else 1519 rem_load, sd, idle, all_pinned, this_best_prio,
1363# define maxload rem_load_move 1520 tg->cfs_rq[busiest_cpu]);
1364#endif 1521
1365 /* 1522 if (!moved_load)
1366 * pass busy_cfs_rq argument into 1523 continue;
1367 * load_balance_[start|next]_fair iterators 1524
1368 */ 1525 moved_load *= busiest_h_load;
1369 cfs_rq_iterator.arg = busy_cfs_rq; 1526 moved_load = div_u64(moved_load, busiest_weight + 1);
1370 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1371 maxload, sd, idle, all_pinned,
1372 this_best_prio,
1373 &cfs_rq_iterator);
1374 1527
1375 if (rem_load_move <= 0) 1528 rem_load_move -= moved_load;
1529 if (rem_load_move < 0)
1376 break; 1530 break;
1377 } 1531 }
1532 rcu_read_unlock();
1378 1533
1379 return max_load_move - rem_load_move; 1534 return max_load_move - rem_load_move;
1380} 1535}
1536#else
1537static unsigned long
1538load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1539 unsigned long max_load_move,
1540 struct sched_domain *sd, enum cpu_idle_type idle,
1541 int *all_pinned, int *this_best_prio)
1542{
1543 return __load_balance_fair(this_rq, this_cpu, busiest,
1544 max_load_move, sd, idle, all_pinned,
1545 this_best_prio, &busiest->cfs);
1546}
1547#endif
1381 1548
1382static int 1549static int
1383move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1550move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
@@ -1402,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1402 1569
1403 return 0; 1570 return 0;
1404} 1571}
1405#endif 1572#endif /* CONFIG_SMP */
1406 1573
1407/* 1574/*
1408 * scheduler tick hitting a task of our scheduling class: 1575 * scheduler tick hitting a task of our scheduling class:
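
Note on the group-scheduling hunk above: load_balance_fair() now scales the balancer's remaining budget into each group's local weight via h_load before calling __load_balance_fair(), and scales the moved amount back afterwards. The userspace-only sketch below shows just that arithmetic; the names weight/h_load and the "+1" guards come from the hunk, while the toy_* helpers and main() scaffolding are made up for illustration (div_u64() is approximated with plain 64-bit division).

/* Userspace sketch of the h_load scaling used by load_balance_fair() above. */
#include <stdint.h>
#include <stdio.h>

/* Convert a global load budget into a group's local weight and back. */
static uint64_t to_local(uint64_t rem_load_move, uint64_t weight, uint64_t h_load)
{
	return (rem_load_move * weight) / (h_load + 1);	/* rem_load in the hunk */
}

static uint64_t to_global(uint64_t moved_local, uint64_t weight, uint64_t h_load)
{
	return (moved_local * h_load) / (weight + 1);	/* moved_load in the hunk */
}

int main(void)
{
	uint64_t budget = 2048;	/* max_load_move, in global load units */
	uint64_t weight = 1024;	/* busiest_cfs_rq->load.weight */
	uint64_t h_load = 512;	/* busiest_cfs_rq->h_load (hierarchical load) */

	uint64_t local = to_local(budget, weight, h_load);
	printf("local budget handed to __load_balance_fair: %llu\n",
	       (unsigned long long)local);
	printf("global load accounted after moving all of it: %llu\n",
	       (unsigned long long)to_global(local, weight, h_load));
	return 0;
}
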
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..862b06bd560a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,5 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(NORMALIZED_SLEEPER, 1)
2SCHED_FEAT(WAKEUP_PREEMPT, 1) 3SCHED_FEAT(WAKEUP_PREEMPT, 1)
3SCHED_FEAT(START_DEBIT, 1) 4SCHED_FEAT(START_DEBIT, 1)
4SCHED_FEAT(AFFINE_WAKEUPS, 1) 5SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@ -6,5 +7,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1) 7SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
10SCHED_FEAT(DEADLINE, 1) 11SCHED_FEAT(LB_BIAS, 0)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0f3c19197fa4..47ceac9e8552 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
12 12
13static inline void rt_set_overload(struct rq *rq) 13static inline void rt_set_overload(struct rq *rq)
14{ 14{
15 if (!rq->online)
16 return;
17
15 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /* 19 /*
17 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
26 29
27static inline void rt_clear_overload(struct rq *rq) 30static inline void rt_clear_overload(struct rq *rq)
28{ 31{
32 if (!rq->online)
33 return;
34
29 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -155,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
155 return &rt_rq->tg->rt_bandwidth; 161 return &rt_rq->tg->rt_bandwidth;
156} 162}
157 163
158#else 164#else /* !CONFIG_RT_GROUP_SCHED */
159 165
160static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 166static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
161{ 167{
@@ -220,49 +226,10 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
220 return &def_rt_bandwidth; 226 return &def_rt_bandwidth;
221} 227}
222 228
223#endif 229#endif /* CONFIG_RT_GROUP_SCHED */
224
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{
227 int i, idle = 1;
228 cpumask_t span;
229
230 if (rt_b->rt_runtime == RUNTIME_INF)
231 return 1;
232
233 span = sched_rt_period_mask();
234 for_each_cpu_mask(i, span) {
235 int enqueue = 0;
236 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
237 struct rq *rq = rq_of_rt_rq(rt_rq);
238
239 spin_lock(&rq->lock);
240 if (rt_rq->rt_time) {
241 u64 runtime;
242
243 spin_lock(&rt_rq->rt_runtime_lock);
244 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
247 rt_rq->rt_throttled = 0;
248 enqueue = 1;
249 }
250 if (rt_rq->rt_time || rt_rq->rt_nr_running)
251 idle = 0;
252 spin_unlock(&rt_rq->rt_runtime_lock);
253 } else if (rt_rq->rt_nr_running)
254 idle = 0;
255
256 if (enqueue)
257 sched_rt_rq_enqueue(rt_rq);
258 spin_unlock(&rq->lock);
259 }
260
261 return idle;
262}
263 230
264#ifdef CONFIG_SMP 231#ifdef CONFIG_SMP
265static int balance_runtime(struct rt_rq *rt_rq) 232static int do_balance_runtime(struct rt_rq *rt_rq)
266{ 233{
267 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 234 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
268 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 235 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
@@ -281,6 +248,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
281 continue; 248 continue;
282 249
283 spin_lock(&iter->rt_runtime_lock); 250 spin_lock(&iter->rt_runtime_lock);
251 if (iter->rt_runtime == RUNTIME_INF)
252 goto next;
253
284 diff = iter->rt_runtime - iter->rt_time; 254 diff = iter->rt_runtime - iter->rt_time;
285 if (diff > 0) { 255 if (diff > 0) {
286 do_div(diff, weight); 256 do_div(diff, weight);
@@ -294,13 +264,163 @@ static int balance_runtime(struct rt_rq *rt_rq)
294 break; 264 break;
295 } 265 }
296 } 266 }
267next:
297 spin_unlock(&iter->rt_runtime_lock); 268 spin_unlock(&iter->rt_runtime_lock);
298 } 269 }
299 spin_unlock(&rt_b->rt_runtime_lock); 270 spin_unlock(&rt_b->rt_runtime_lock);
300 271
301 return more; 272 return more;
302} 273}
303#endif 274
275static void __disable_runtime(struct rq *rq)
276{
277 struct root_domain *rd = rq->rd;
278 struct rt_rq *rt_rq;
279
280 if (unlikely(!scheduler_running))
281 return;
282
283 for_each_leaf_rt_rq(rt_rq, rq) {
284 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
285 s64 want;
286 int i;
287
288 spin_lock(&rt_b->rt_runtime_lock);
289 spin_lock(&rt_rq->rt_runtime_lock);
290 if (rt_rq->rt_runtime == RUNTIME_INF ||
291 rt_rq->rt_runtime == rt_b->rt_runtime)
292 goto balanced;
293 spin_unlock(&rt_rq->rt_runtime_lock);
294
295 want = rt_b->rt_runtime - rt_rq->rt_runtime;
296
297 for_each_cpu_mask(i, rd->span) {
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff;
300
301 if (iter == rt_rq)
302 continue;
303
304 spin_lock(&iter->rt_runtime_lock);
305 if (want > 0) {
306 diff = min_t(s64, iter->rt_runtime, want);
307 iter->rt_runtime -= diff;
308 want -= diff;
309 } else {
310 iter->rt_runtime -= want;
311 want -= want;
312 }
313 spin_unlock(&iter->rt_runtime_lock);
314
315 if (!want)
316 break;
317 }
318
319 spin_lock(&rt_rq->rt_runtime_lock);
320 BUG_ON(want);
321balanced:
322 rt_rq->rt_runtime = RUNTIME_INF;
323 spin_unlock(&rt_rq->rt_runtime_lock);
324 spin_unlock(&rt_b->rt_runtime_lock);
325 }
326}
327
328static void disable_runtime(struct rq *rq)
329{
330 unsigned long flags;
331
332 spin_lock_irqsave(&rq->lock, flags);
333 __disable_runtime(rq);
334 spin_unlock_irqrestore(&rq->lock, flags);
335}
336
337static void __enable_runtime(struct rq *rq)
338{
339 struct rt_rq *rt_rq;
340
341 if (unlikely(!scheduler_running))
342 return;
343
344 for_each_leaf_rt_rq(rt_rq, rq) {
345 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
346
347 spin_lock(&rt_b->rt_runtime_lock);
348 spin_lock(&rt_rq->rt_runtime_lock);
349 rt_rq->rt_runtime = rt_b->rt_runtime;
350 rt_rq->rt_time = 0;
351 spin_unlock(&rt_rq->rt_runtime_lock);
352 spin_unlock(&rt_b->rt_runtime_lock);
353 }
354}
355
356static void enable_runtime(struct rq *rq)
357{
358 unsigned long flags;
359
360 spin_lock_irqsave(&rq->lock, flags);
361 __enable_runtime(rq);
362 spin_unlock_irqrestore(&rq->lock, flags);
363}
364
365static int balance_runtime(struct rt_rq *rt_rq)
366{
367 int more = 0;
368
369 if (rt_rq->rt_time > rt_rq->rt_runtime) {
370 spin_unlock(&rt_rq->rt_runtime_lock);
371 more = do_balance_runtime(rt_rq);
372 spin_lock(&rt_rq->rt_runtime_lock);
373 }
374
375 return more;
376}
377#else /* !CONFIG_SMP */
378static inline int balance_runtime(struct rt_rq *rt_rq)
379{
380 return 0;
381}
382#endif /* CONFIG_SMP */
383
384static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
385{
386 int i, idle = 1;
387 cpumask_t span;
388
389 if (rt_b->rt_runtime == RUNTIME_INF)
390 return 1;
391
392 span = sched_rt_period_mask();
393 for_each_cpu_mask(i, span) {
394 int enqueue = 0;
395 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
396 struct rq *rq = rq_of_rt_rq(rt_rq);
397
398 spin_lock(&rq->lock);
399 if (rt_rq->rt_time) {
400 u64 runtime;
401
402 spin_lock(&rt_rq->rt_runtime_lock);
403 if (rt_rq->rt_throttled)
404 balance_runtime(rt_rq);
405 runtime = rt_rq->rt_runtime;
406 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
407 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
408 rt_rq->rt_throttled = 0;
409 enqueue = 1;
410 }
411 if (rt_rq->rt_time || rt_rq->rt_nr_running)
412 idle = 0;
413 spin_unlock(&rt_rq->rt_runtime_lock);
414 } else if (rt_rq->rt_nr_running)
415 idle = 0;
416
417 if (enqueue)
418 sched_rt_rq_enqueue(rt_rq);
419 spin_unlock(&rq->lock);
420 }
421
422 return idle;
423}
304 424
305static inline int rt_se_prio(struct sched_rt_entity *rt_se) 425static inline int rt_se_prio(struct sched_rt_entity *rt_se)
306{ 426{
@@ -327,18 +447,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
327 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 447 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
328 return 0; 448 return 0;
329 449
330#ifdef CONFIG_SMP 450 balance_runtime(rt_rq);
331 if (rt_rq->rt_time > runtime) { 451 runtime = sched_rt_runtime(rt_rq);
332 int more; 452 if (runtime == RUNTIME_INF)
333 453 return 0;
334 spin_unlock(&rt_rq->rt_runtime_lock);
335 more = balance_runtime(rt_rq);
336 spin_lock(&rt_rq->rt_runtime_lock);
337
338 if (more)
339 runtime = sched_rt_runtime(rt_rq);
340 }
341#endif
342 454
343 if (rt_rq->rt_time > runtime) { 455 if (rt_rq->rt_time > runtime) {
344 rt_rq->rt_throttled = 1; 456 rt_rq->rt_throttled = 1;
@@ -392,12 +504,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
392 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 504 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
393 rt_rq->rt_nr_running++; 505 rt_rq->rt_nr_running++;
394#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 506#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
395 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 507 if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
508 struct rq *rq = rq_of_rt_rq(rt_rq);
509
396 rt_rq->highest_prio = rt_se_prio(rt_se); 510 rt_rq->highest_prio = rt_se_prio(rt_se);
511#ifdef CONFIG_SMP
512 if (rq->online)
513 cpupri_set(&rq->rd->cpupri, rq->cpu,
514 rt_se_prio(rt_se));
515#endif
516 }
397#endif 517#endif
398#ifdef CONFIG_SMP 518#ifdef CONFIG_SMP
399 if (rt_se->nr_cpus_allowed > 1) { 519 if (rt_se->nr_cpus_allowed > 1) {
400 struct rq *rq = rq_of_rt_rq(rt_rq); 520 struct rq *rq = rq_of_rt_rq(rt_rq);
521
401 rq->rt.rt_nr_migratory++; 522 rq->rt.rt_nr_migratory++;
402 } 523 }
403 524
@@ -417,6 +538,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
417static inline 538static inline
418void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 539void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
419{ 540{
541#ifdef CONFIG_SMP
542 int highest_prio = rt_rq->highest_prio;
543#endif
544
420 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 545 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
421 WARN_ON(!rt_rq->rt_nr_running); 546 WARN_ON(!rt_rq->rt_nr_running);
422 rt_rq->rt_nr_running--; 547 rt_rq->rt_nr_running--;
@@ -440,6 +565,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
440 rq->rt.rt_nr_migratory--; 565 rq->rt.rt_nr_migratory--;
441 } 566 }
442 567
568 if (rt_rq->highest_prio != highest_prio) {
569 struct rq *rq = rq_of_rt_rq(rt_rq);
570
571 if (rq->online)
572 cpupri_set(&rq->rd->cpupri, rq->cpu,
573 rt_rq->highest_prio);
574 }
575
443 update_rt_migration(rq_of_rt_rq(rt_rq)); 576 update_rt_migration(rq_of_rt_rq(rt_rq));
444#endif /* CONFIG_SMP */ 577#endif /* CONFIG_SMP */
445#ifdef CONFIG_RT_GROUP_SCHED 578#ifdef CONFIG_RT_GROUP_SCHED
@@ -455,6 +588,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
455 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 588 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
456 struct rt_prio_array *array = &rt_rq->active; 589 struct rt_prio_array *array = &rt_rq->active;
457 struct rt_rq *group_rq = group_rt_rq(rt_se); 590 struct rt_rq *group_rq = group_rt_rq(rt_se);
591 struct list_head *queue = array->queue + rt_se_prio(rt_se);
458 592
459 /* 593 /*
460 * Don't enqueue the group if its throttled, or when empty. 594 * Don't enqueue the group if its throttled, or when empty.
@@ -465,7 +599,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
465 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 599 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
466 return; 600 return;
467 601
468 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 602 if (rt_se->nr_cpus_allowed == 1)
603 list_add(&rt_se->run_list, queue);
604 else
605 list_add_tail(&rt_se->run_list, queue);
606
469 __set_bit(rt_se_prio(rt_se), array->bitmap); 607 __set_bit(rt_se_prio(rt_se), array->bitmap);
470 608
471 inc_rt_tasks(rt_se, rt_rq); 609 inc_rt_tasks(rt_se, rt_rq);
@@ -532,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
532 rt_se->timeout = 0; 670 rt_se->timeout = 0;
533 671
534 enqueue_rt_entity(rt_se); 672 enqueue_rt_entity(rt_se);
673
674 inc_cpu_load(rq, p->se.load.weight);
535} 675}
536 676
537static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 677static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -540,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
540 680
541 update_curr_rt(rq); 681 update_curr_rt(rq);
542 dequeue_rt_entity(rt_se); 682 dequeue_rt_entity(rt_se);
683
684 dec_cpu_load(rq, p->se.load.weight);
543} 685}
544 686
545/* 687/*
@@ -550,10 +692,12 @@ static
550void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 692void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
551{ 693{
552 struct rt_prio_array *array = &rt_rq->active; 694 struct rt_prio_array *array = &rt_rq->active;
553 struct list_head *queue = array->queue + rt_se_prio(rt_se);
554 695
555 if (on_rt_rq(rt_se)) 696 if (on_rt_rq(rt_se)) {
556 list_move_tail(&rt_se->run_list, queue); 697 list_del_init(&rt_se->run_list);
698 list_add_tail(&rt_se->run_list,
699 array->queue + rt_se_prio(rt_se));
700 }
557} 701}
558 702
559static void requeue_task_rt(struct rq *rq, struct task_struct *p) 703static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -616,8 +760,37 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
616 */ 760 */
617static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 761static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
618{ 762{
619 if (p->prio < rq->curr->prio) 763 if (p->prio < rq->curr->prio) {
620 resched_task(rq->curr); 764 resched_task(rq->curr);
765 return;
766 }
767
768#ifdef CONFIG_SMP
769 /*
770 * If:
771 *
772 * - the newly woken task is of equal priority to the current task
773 * - the newly woken task is non-migratable while current is migratable
774 * - current will be preempted on the next reschedule
775 *
776 * we should check to see if current can readily move to a different
777 * cpu. If so, we will reschedule to allow the push logic to try
778 * to move current somewhere else, making room for our non-migratable
779 * task.
780 */
781 if((p->prio == rq->curr->prio)
782 && p->rt.nr_cpus_allowed == 1
783 && rq->curr->rt.nr_cpus_allowed != 1) {
784 cpumask_t mask;
785
786 if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
787 /*
788 * There appears to be other cpus that can accept
789 * current, so lets reschedule to try and push it away
790 */
791 resched_task(rq->curr);
792 }
793#endif
621} 794}
622 795
623static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 796static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -720,73 +893,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
720 893
721static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 894static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
722 895
723static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
724{
725 int lowest_prio = -1;
726 int lowest_cpu = -1;
727 int count = 0;
728 int cpu;
729
730 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
731
732 /*
733 * Scan each rq for the lowest prio.
734 */
735 for_each_cpu_mask(cpu, *lowest_mask) {
736 struct rq *rq = cpu_rq(cpu);
737
738 /* We look for lowest RT prio or non-rt CPU */
739 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
740 /*
741 * if we already found a low RT queue
742 * and now we found this non-rt queue
743 * clear the mask and set our bit.
744 * Otherwise just return the queue as is
745 * and the count==1 will cause the algorithm
746 * to use the first bit found.
747 */
748 if (lowest_cpu != -1) {
749 cpus_clear(*lowest_mask);
750 cpu_set(rq->cpu, *lowest_mask);
751 }
752 return 1;
753 }
754
755 /* no locking for now */
756 if ((rq->rt.highest_prio > task->prio)
757 && (rq->rt.highest_prio >= lowest_prio)) {
758 if (rq->rt.highest_prio > lowest_prio) {
759 /* new low - clear old data */
760 lowest_prio = rq->rt.highest_prio;
761 lowest_cpu = cpu;
762 count = 0;
763 }
764 count++;
765 } else
766 cpu_clear(cpu, *lowest_mask);
767 }
768
769 /*
770 * Clear out all the set bits that represent
771 * runqueues that were of higher prio than
772 * the lowest_prio.
773 */
774 if (lowest_cpu > 0) {
775 /*
776 * Perhaps we could add another cpumask op to
777 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
778 * Then that could be optimized to use memset and such.
779 */
780 for_each_cpu_mask(cpu, *lowest_mask) {
781 if (cpu >= lowest_cpu)
782 break;
783 cpu_clear(cpu, *lowest_mask);
784 }
785 }
786
787 return count;
788}
789
790static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 896static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
791{ 897{
792 int first; 898 int first;
@@ -808,17 +914,12 @@ static int find_lowest_rq(struct task_struct *task)
808 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 914 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
809 int this_cpu = smp_processor_id(); 915 int this_cpu = smp_processor_id();
810 int cpu = task_cpu(task); 916 int cpu = task_cpu(task);
811 int count = find_lowest_cpus(task, lowest_mask);
812 917
813 if (!count) 918 if (task->rt.nr_cpus_allowed == 1)
814 return -1; /* No targets found */ 919 return -1; /* No other targets possible */
815 920
816 /* 921 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
817 * There is no sense in performing an optimal search if only one 922 return -1; /* No targets found */
818 * target is found.
819 */
820 if (count == 1)
821 return first_cpu(*lowest_mask);
822 923
823 /* 924 /*
824 * At this point we have built a mask of cpus representing the 925 * At this point we have built a mask of cpus representing the
@@ -1163,17 +1264,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1163} 1264}
1164 1265
1165/* Assumes rq->lock is held */ 1266/* Assumes rq->lock is held */
1166static void join_domain_rt(struct rq *rq) 1267static void rq_online_rt(struct rq *rq)
1167{ 1268{
1168 if (rq->rt.overloaded) 1269 if (rq->rt.overloaded)
1169 rt_set_overload(rq); 1270 rt_set_overload(rq);
1271
1272 __enable_runtime(rq);
1273
1274 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
1170} 1275}
1171 1276
1172/* Assumes rq->lock is held */ 1277/* Assumes rq->lock is held */
1173static void leave_domain_rt(struct rq *rq) 1278static void rq_offline_rt(struct rq *rq)
1174{ 1279{
1175 if (rq->rt.overloaded) 1280 if (rq->rt.overloaded)
1176 rt_clear_overload(rq); 1281 rt_clear_overload(rq);
1282
1283 __disable_runtime(rq);
1284
1285 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1177} 1286}
1178 1287
1179/* 1288/*
@@ -1336,8 +1445,8 @@ static const struct sched_class rt_sched_class = {
1336 .load_balance = load_balance_rt, 1445 .load_balance = load_balance_rt,
1337 .move_one_task = move_one_task_rt, 1446 .move_one_task = move_one_task_rt,
1338 .set_cpus_allowed = set_cpus_allowed_rt, 1447 .set_cpus_allowed = set_cpus_allowed_rt,
1339 .join_domain = join_domain_rt, 1448 .rq_online = rq_online_rt,
1340 .leave_domain = leave_domain_rt, 1449 .rq_offline = rq_offline_rt,
1341 .pre_schedule = pre_schedule_rt, 1450 .pre_schedule = pre_schedule_rt,
1342 .post_schedule = post_schedule_rt, 1451 .post_schedule = post_schedule_rt,
1343 .task_wake_up = task_wake_up_rt, 1452 .task_wake_up = task_wake_up_rt,
@@ -1350,3 +1459,17 @@ static const struct sched_class rt_sched_class = {
1350 .prio_changed = prio_changed_rt, 1459 .prio_changed = prio_changed_rt,
1351 .switched_to = switched_to_rt, 1460 .switched_to = switched_to_rt,
1352}; 1461};
1462
1463#ifdef CONFIG_SCHED_DEBUG
1464extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1465
1466static void print_rt_stats(struct seq_file *m, int cpu)
1467{
1468 struct rt_rq *rt_rq;
1469
1470 rcu_read_lock();
1471 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1472 print_rt_rq(m, cpu, rt_rq);
1473 rcu_read_unlock();
1474}
1475#endif /* CONFIG_SCHED_DEBUG */
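
Note on the sched_rt.c hunks above: do_balance_runtime() lets a throttled rt_rq borrow spare runtime from its peers, splitting each peer's unused budget by the number of CPUs in the domain and never exceeding the period. The toy program below sketches only that proportional transfer; the rt_rq/rt_bandwidth locking, RUNTIME_INF handling and the __disable_runtime()/__enable_runtime() paths are omitted, and all toy_* names are illustrative.

/* Userspace sketch of the proportional borrowing in do_balance_runtime(). */
#include <stdint.h>
#include <stdio.h>

struct toy_rt_rq {
	int64_t rt_runtime;	/* budget this CPU may spend per period */
	int64_t rt_time;	/* budget already spent this period */
};

/* Borrow spare runtime for *dst from its peers, never exceeding rt_period. */
static void toy_balance_runtime(struct toy_rt_rq *dst, struct toy_rt_rq *peers,
				int nr_peers, int64_t rt_period)
{
	int weight = nr_peers + 1;	/* CPUs in the root domain span */

	for (int i = 0; i < nr_peers; i++) {
		int64_t diff = peers[i].rt_runtime - peers[i].rt_time;

		if (diff <= 0)
			continue;
		diff /= weight;				/* take only a fair share */
		if (dst->rt_runtime + diff > rt_period)
			diff = rt_period - dst->rt_runtime;
		peers[i].rt_runtime -= diff;
		dst->rt_runtime += diff;
		if (dst->rt_runtime == rt_period)
			break;				/* cannot use more than that */
	}
}

int main(void)
{
	struct toy_rt_rq dst = { .rt_runtime = 950000, .rt_time = 950000 };
	struct toy_rt_rq peers[2] = {
		{ .rt_runtime = 950000, .rt_time = 100000 },
		{ .rt_runtime = 950000, .rt_time = 900000 },
	};

	toy_balance_runtime(&dst, peers, 2, 1000000);
	printf("dst runtime after borrowing: %lld\n", (long long)dst.rt_runtime);
	return 0;
}
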
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 80179ef7450e..8385d43987e2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
118 if (rq) 118 if (rq)
119 rq->rq_sched_info.cpu_time += delta; 119 rq->rq_sched_info.cpu_time += delta;
120} 120}
121
122static inline void
123rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
124{
125 if (rq)
126 rq->rq_sched_info.run_delay += delta;
127}
121# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 128# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
122# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 129# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
123# define schedstat_set(var, val) do { var = (val); } while (0) 130# define schedstat_set(var, val) do { var = (val); } while (0)
@@ -126,6 +133,9 @@ static inline void
126rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 133rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
127{} 134{}
128static inline void 135static inline void
136rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
137{}
138static inline void
129rq_sched_info_depart(struct rq *rq, unsigned long long delta) 139rq_sched_info_depart(struct rq *rq, unsigned long long delta)
130{} 140{}
131# define schedstat_inc(rq, field) do { } while (0) 141# define schedstat_inc(rq, field) do { } while (0)
@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
134#endif 144#endif
135 145
136#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 146#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
147static inline void sched_info_reset_dequeued(struct task_struct *t)
148{
149 t->sched_info.last_queued = 0;
150}
151
137/* 152/*
138 * Called when a process is dequeued from the active array and given 153 * Called when a process is dequeued from the active array and given
139 * the cpu. We should note that with the exception of interactive 154 * the cpu. We should note that with the exception of interactive
@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
143 * active queue, thus delaying tasks in the expired queue from running; 158 * active queue, thus delaying tasks in the expired queue from running;
144 * see scheduler_tick()). 159 * see scheduler_tick()).
145 * 160 *
146 * This function is only called from sched_info_arrive(), rather than 161 * Though we are interested in knowing how long it was from the *first* time a
147 * dequeue_task(). Even though a task may be queued and dequeued multiple 162 * task was queued to the time that it finally hit a cpu, we call this routine
148 * times as it is shuffled about, we're really interested in knowing how 163 * from dequeue_task() to account for possible rq->clock skew across cpus. The
149 * long it was from the *first* time it was queued to the time that it 164 * delta taken on each cpu would annul the skew.
150 * finally hit a cpu.
151 */ 165 */
152static inline void sched_info_dequeued(struct task_struct *t) 166static inline void sched_info_dequeued(struct task_struct *t)
153{ 167{
154 t->sched_info.last_queued = 0; 168 unsigned long long now = task_rq(t)->clock, delta = 0;
169
170 if (unlikely(sched_info_on()))
171 if (t->sched_info.last_queued)
172 delta = now - t->sched_info.last_queued;
173 sched_info_reset_dequeued(t);
174 t->sched_info.run_delay += delta;
175
176 rq_sched_info_dequeued(task_rq(t), delta);
155} 177}
156 178
157/* 179/*
@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
165 187
166 if (t->sched_info.last_queued) 188 if (t->sched_info.last_queued)
167 delta = now - t->sched_info.last_queued; 189 delta = now - t->sched_info.last_queued;
168 sched_info_dequeued(t); 190 sched_info_reset_dequeued(t);
169 t->sched_info.run_delay += delta; 191 t->sched_info.run_delay += delta;
170 t->sched_info.last_arrival = now; 192 t->sched_info.last_arrival = now;
171 t->sched_info.pcount++; 193 t->sched_info.pcount++;
@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
242 __sched_info_switch(prev, next); 264 __sched_info_switch(prev, next);
243} 265}
244#else 266#else
245#define sched_info_queued(t) do { } while (0) 267#define sched_info_queued(t) do { } while (0)
246#define sched_info_switch(t, next) do { } while (0) 268#define sched_info_reset_dequeued(t) do { } while (0)
269#define sched_info_dequeued(t) do { } while (0)
270#define sched_info_switch(t, next) do { } while (0)
247#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
248 272
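
Note on the sched_stats.h hunk above: sched_info_dequeued() now charges queue-wait time on the runqueue the task is leaving, so per-CPU clock skew cancels out. A hedged sketch of just that bookkeeping follows; the per-rq clock is modelled as a plain counter passed by the caller and the toy_* names are made up.

/* Userspace sketch of the run_delay bookkeeping added above. */
#include <stdio.h>

struct toy_sched_info {
	unsigned long long last_queued;	/* rq clock when the task was queued */
	unsigned long long run_delay;	/* total time spent waiting on a runqueue */
};

static void toy_sched_info_queued(struct toy_sched_info *si, unsigned long long now)
{
	if (!si->last_queued)
		si->last_queued = now;
}

/* Called when the task leaves the runqueue, with that runqueue's clock. */
static void toy_sched_info_dequeued(struct toy_sched_info *si, unsigned long long now)
{
	unsigned long long delta = 0;

	if (si->last_queued)
		delta = now - si->last_queued;
	si->last_queued = 0;		/* sched_info_reset_dequeued() */
	si->run_delay += delta;		/* also fed to rq_sched_info_dequeued() */
}

int main(void)
{
	struct toy_sched_info si = { 0, 0 };

	toy_sched_info_queued(&si, 100);
	toy_sched_info_dequeued(&si, 160);	/* waited 60 "ticks" on this rq */
	printf("run_delay = %llu\n", si.run_delay);
	return 0;
}
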
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..fe8cdc80ff02 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -266,6 +266,14 @@ static struct ctl_table kern_table[] = {
266 }, 266 },
267 { 267 {
268 .ctl_name = CTL_UNNUMBERED, 268 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_shares_ratelimit",
270 .data = &sysctl_sched_shares_ratelimit,
271 .maxlen = sizeof(unsigned int),
272 .mode = 0644,
273 .proc_handler = &proc_dointvec,
274 },
275 {
276 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_child_runs_first", 277 .procname = "sched_child_runs_first",
270 .data = &sysctl_sched_child_runs_first, 278 .data = &sysctl_sched_child_runs_first,
271 .maxlen = sizeof(unsigned int), 279 .maxlen = sizeof(unsigned int),
diff --git a/mm/slub.c b/mm/slub.c
index 1a427c0ae83b..315c392253c7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1628,9 +1628,11 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1628 void **object; 1628 void **object;
1629 struct kmem_cache_cpu *c; 1629 struct kmem_cache_cpu *c;
1630 unsigned long flags; 1630 unsigned long flags;
1631 unsigned int objsize;
1631 1632
1632 local_irq_save(flags); 1633 local_irq_save(flags);
1633 c = get_cpu_slab(s, smp_processor_id()); 1634 c = get_cpu_slab(s, smp_processor_id());
1635 objsize = c->objsize;
1634 if (unlikely(!c->freelist || !node_match(c, node))) 1636 if (unlikely(!c->freelist || !node_match(c, node)))
1635 1637
1636 object = __slab_alloc(s, gfpflags, node, addr, c); 1638 object = __slab_alloc(s, gfpflags, node, addr, c);
@@ -1643,7 +1645,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1643 local_irq_restore(flags); 1645 local_irq_restore(flags);
1644 1646
1645 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1647 if (unlikely((gfpflags & __GFP_ZERO) && object))
1646 memset(object, 0, c->objsize); 1648 memset(object, 0, objsize);
1647 1649
1648 return object; 1650 return object;
1649} 1651}
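
Note on the slub hunk above: c->objsize is snapshotted while interrupts are still disabled, so the later memset() no longer dereferences a per-CPU structure the task may no longer own. The sketch below shows the general pattern only; it is a userspace stand-in with made-up toy_* names, not real per-CPU data.

/* Pattern: copy the protected field into a local, use only the local later. */
#include <string.h>
#include <stdio.h>

struct toy_cpu_cache {
	unsigned int objsize;
	char slab[256];
};

static void *toy_alloc_zeroed(struct toy_cpu_cache *c)
{
	unsigned int objsize;
	void *object;

	/* --- protected region (local_irq_save() in the kernel) --- */
	objsize = c->objsize;		/* snapshot before leaving the region */
	object = c->slab;
	/* --- protection dropped (local_irq_restore()) ---
	 * From here on, 'c' may belong to a different CPU; only the local
	 * 'objsize' snapshot is safe to use. */

	memset(object, 0, objsize);
	return object;
}

int main(void)
{
	struct toy_cpu_cache c = { .objsize = 64 };

	printf("zeroed %u bytes at %p\n", c.objsize, toy_alloc_zeroed(&c));
	return 0;
}
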
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4b02d14e7ab9..e1600ad8fb0e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1359,17 +1359,17 @@ static int check_leaf(struct trie *t, struct leaf *l,
1359 t->stats.semantic_match_miss++; 1359 t->stats.semantic_match_miss++;
1360#endif 1360#endif
1361 if (err <= 0) 1361 if (err <= 0)
1362 return plen; 1362 return err;
1363 } 1363 }
1364 1364
1365 return -1; 1365 return 1;
1366} 1366}
1367 1367
1368static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, 1368static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1369 struct fib_result *res) 1369 struct fib_result *res)
1370{ 1370{
1371 struct trie *t = (struct trie *) tb->tb_data; 1371 struct trie *t = (struct trie *) tb->tb_data;
1372 int plen, ret = 0; 1372 int ret;
1373 struct node *n; 1373 struct node *n;
1374 struct tnode *pn; 1374 struct tnode *pn;
1375 int pos, bits; 1375 int pos, bits;
@@ -1393,10 +1393,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1393 1393
1394 /* Just a leaf? */ 1394 /* Just a leaf? */
1395 if (IS_LEAF(n)) { 1395 if (IS_LEAF(n)) {
1396 plen = check_leaf(t, (struct leaf *)n, key, flp, res); 1396 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1397 if (plen < 0)
1398 goto failed;
1399 ret = 0;
1400 goto found; 1397 goto found;
1401 } 1398 }
1402 1399
@@ -1421,11 +1418,9 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1421 } 1418 }
1422 1419
1423 if (IS_LEAF(n)) { 1420 if (IS_LEAF(n)) {
1424 plen = check_leaf(t, (struct leaf *)n, key, flp, res); 1421 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1425 if (plen < 0) 1422 if (ret > 0)
1426 goto backtrace; 1423 goto backtrace;
1427
1428 ret = 0;
1429 goto found; 1424 goto found;
1430 } 1425 }
1431 1426
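
Note on the fib_trie hunk above: check_leaf() now returns the semantic-match result directly, so a value <= 0 is a final answer (0 = match, negative = error) and 1 means "keep backtracking". The toy below only illustrates that return convention; no real trie is built and every name is made up.

/* Toy illustration of the <=0 / >0 return convention. */
#include <stdio.h>

static int toy_check_leaf(int leaf_key, int wanted_key)
{
	if (leaf_key == wanted_key)
		return 0;	/* match: result filled in, stop here */
	return 1;		/* no match: caller should backtrack */
}

static int toy_lookup(const int *leaves, int n, int wanted_key)
{
	for (int i = 0; i < n; i++) {
		int ret = toy_check_leaf(leaves[i], wanted_key);

		if (ret > 0)
			continue;	/* "goto backtrace" in the hunk */
		return ret;		/* 0 or a negative error */
	}
	return -1;	/* nothing matched anywhere */
}

int main(void)
{
	int leaves[] = { 10, 20, 30 };

	printf("lookup(20) = %d\n", toy_lookup(leaves, 3, 20));
	printf("lookup(99) = %d\n", toy_lookup(leaves, 3, 99));
	return 0;
}
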
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 7750c97fde7b..ffeaffc3fffe 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -439,8 +439,8 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
439 unsigned int *len) 439 unsigned int *len)
440{ 440{
441 unsigned long subid; 441 unsigned long subid;
442 unsigned int size;
443 unsigned long *optr; 442 unsigned long *optr;
443 size_t size;
444 444
445 size = eoc - ctx->pointer + 1; 445 size = eoc - ctx->pointer + 1;
446 446
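
Note on the snmp hunk above: the length of the span "eoc - ctx->pointer + 1" is now a size_t, the unsigned type sized for object lengths, rather than a plain unsigned int. A minimal, self-contained illustration:

/* size_t holds a pointer-span length without the truncation risk a 32-bit
 * unsigned int would have for large buffers on 64-bit hosts. */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	unsigned char buf[64];
	unsigned char *ptr = buf + 4;	/* plays the role of ctx->pointer */
	unsigned char *eoc = buf + 63;	/* end of contents */

	size_t size = (size_t)(eoc - ptr + 1);
	printf("span covers %zu bytes\n", size);
	return 0;
}
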
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 5ff0ce6e9d39..7ddc30f0744f 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -224,7 +224,7 @@ static __init int tcpprobe_init(void)
224 if (bufsize < 0) 224 if (bufsize < 0)
225 return -EINVAL; 225 return -EINVAL;
226 226
227 tcp_probe.log = kcalloc(sizeof(struct tcp_log), bufsize, GFP_KERNEL); 227 tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
228 if (!tcp_probe.log) 228 if (!tcp_probe.log)
229 goto err0; 229 goto err0;
230 230
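
Note on the tcp_probe hunk above: the fix restores kcalloc()'s documented argument order, element count first and element size second. The userspace calloc() analogue below follows the same convention; struct toy_log_entry is a made-up stand-in for struct tcp_log.

/* calloc() takes (count, element size), just like kcalloc() with a gfp flag
 * appended; swapping the two still allocates the right number of bytes but
 * defeats the intent of the interface. */
#include <stdio.h>
#include <stdlib.h>

struct toy_log_entry { unsigned int seq, len; };

int main(void)
{
	size_t bufsize = 128;	/* number of entries, as in tcp_probe */
	struct toy_log_entry *log = calloc(bufsize, sizeof(struct toy_log_entry));

	if (!log)
		return 1;
	printf("allocated %zu entries of %zu bytes\n",
	       bufsize, sizeof(struct toy_log_entry));
	free(log);
	return 0;
}
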
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 147588f4c7c0..ff61a5cdb0b3 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -749,12 +749,12 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
749 } 749 }
750 write_unlock_bh(&idev->lock); 750 write_unlock_bh(&idev->lock);
751 751
752 addrconf_del_timer(ifp);
753
752 ipv6_ifa_notify(RTM_DELADDR, ifp); 754 ipv6_ifa_notify(RTM_DELADDR, ifp);
753 755
754 atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp); 756 atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp);
755 757
756 addrconf_del_timer(ifp);
757
758 /* 758 /*
759 * Purge or update corresponding prefix 759 * Purge or update corresponding prefix
760 * 760 *
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 3cd1c993d52b..dcf94fdfb863 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -445,7 +445,7 @@ looped_back:
445 kfree_skb(skb); 445 kfree_skb(skb);
446 return -1; 446 return -1;
447 } 447 }
448 if (!ipv6_chk_home_addr(&init_net, addr)) { 448 if (!ipv6_chk_home_addr(dev_net(skb->dst->dev), addr)) {
449 IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), 449 IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
450 IPSTATS_MIB_INADDRERRORS); 450 IPSTATS_MIB_INADDRERRORS);
451 kfree_skb(skb); 451 kfree_skb(skb);
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index 9e1fb82e3220..2f05ec1037ab 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -101,8 +101,8 @@ static int irda_nl_get_mode(struct sk_buff *skb, struct genl_info *info)
101 101
102 hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq, 102 hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
103 &irda_nl_family, 0, IRDA_NL_CMD_GET_MODE); 103 &irda_nl_family, 0, IRDA_NL_CMD_GET_MODE);
104 if (IS_ERR(hdr)) { 104 if (hdr == NULL) {
105 ret = PTR_ERR(hdr); 105 ret = -EMSGSIZE;
106 goto err_out; 106 goto err_out;
107 } 107 }
108 108
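
Note on the irnetlink hunk above: genlmsg_put() reports failure with a NULL pointer rather than an ERR_PTR()-encoded error, so the caller must test for NULL and choose its own error code. The toy below models only that calling convention; toy_put_header() is a made-up stand-in, not the genetlink API.

/* NULL-on-failure convention: the caller supplies the errno. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static char msg_buf[32];

/* Stand-in for genlmsg_put(): NULL means "no room in the message". */
static void *toy_put_header(size_t need)
{
	if (need > sizeof(msg_buf))
		return NULL;
	return memset(msg_buf, 0, need);
}

static int toy_build_reply(size_t need)
{
	void *hdr = toy_put_header(need);

	if (hdr == NULL)
		return -EMSGSIZE;	/* caller chooses the error code */
	return 0;
}

int main(void)
{
	printf("small reply: %d\n", toy_build_reply(16));
	printf("oversized reply: %d\n", toy_build_reply(64));
	return 0;
}
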
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 98c0b5e56ecc..df0836ff1a20 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -530,8 +530,6 @@ static int ieee80211_stop(struct net_device *dev)
530 local->sta_hw_scanning = 0; 530 local->sta_hw_scanning = 0;
531 } 531 }
532 532
533 flush_workqueue(local->hw.workqueue);
534
535 sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; 533 sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
536 kfree(sdata->u.sta.extra_ie); 534 kfree(sdata->u.sta.extra_ie);
537 sdata->u.sta.extra_ie = NULL; 535 sdata->u.sta.extra_ie = NULL;
@@ -555,6 +553,8 @@ static int ieee80211_stop(struct net_device *dev)
555 553
556 ieee80211_led_radio(local, 0); 554 ieee80211_led_radio(local, 0);
557 555
556 flush_workqueue(local->hw.workqueue);
557
558 tasklet_disable(&local->tx_pending_tasklet); 558 tasklet_disable(&local->tx_pending_tasklet);
559 tasklet_disable(&local->tasklet); 559 tasklet_disable(&local->tasklet);
560 } 560 }
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 4d2b582dd055..b404537c0bcd 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -547,15 +547,14 @@ static void ieee80211_set_associated(struct net_device *dev,
547 sdata->bss_conf.ht_bss_conf = &conf->ht_bss_conf; 547 sdata->bss_conf.ht_bss_conf = &conf->ht_bss_conf;
548 } 548 }
549 549
550 netif_carrier_on(dev);
551 ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET; 550 ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
552 memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN); 551 memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
553 memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN); 552 memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN);
554 ieee80211_sta_send_associnfo(dev, ifsta); 553 ieee80211_sta_send_associnfo(dev, ifsta);
555 } else { 554 } else {
555 netif_carrier_off(dev);
556 ieee80211_sta_tear_down_BA_sessions(dev, ifsta->bssid); 556 ieee80211_sta_tear_down_BA_sessions(dev, ifsta->bssid);
557 ifsta->flags &= ~IEEE80211_STA_ASSOCIATED; 557 ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
558 netif_carrier_off(dev);
559 ieee80211_reset_erp_info(dev); 558 ieee80211_reset_erp_info(dev);
560 559
561 sdata->bss_conf.assoc_ht = 0; 560 sdata->bss_conf.assoc_ht = 0;
@@ -569,6 +568,10 @@ static void ieee80211_set_associated(struct net_device *dev,
569 568
570 sdata->bss_conf.assoc = assoc; 569 sdata->bss_conf.assoc = assoc;
571 ieee80211_bss_info_change_notify(sdata, changed); 570 ieee80211_bss_info_change_notify(sdata, changed);
571
572 if (assoc)
573 netif_carrier_on(dev);
574
572 wrqu.ap_addr.sa_family = ARPHRD_ETHER; 575 wrqu.ap_addr.sa_family = ARPHRD_ETHER;
573 wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); 576 wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
574} 577}
@@ -3611,8 +3614,10 @@ static int ieee80211_sta_find_ibss(struct net_device *dev,
3611 spin_unlock_bh(&local->sta_bss_lock); 3614 spin_unlock_bh(&local->sta_bss_lock);
3612 3615
3613#ifdef CONFIG_MAC80211_IBSS_DEBUG 3616#ifdef CONFIG_MAC80211_IBSS_DEBUG
3614 printk(KERN_DEBUG " sta_find_ibss: selected %s current " 3617 if (found)
3615 "%s\n", print_mac(mac, bssid), print_mac(mac2, ifsta->bssid)); 3618 printk(KERN_DEBUG " sta_find_ibss: selected %s current "
3619 "%s\n", print_mac(mac, bssid),
3620 print_mac(mac2, ifsta->bssid));
3616#endif /* CONFIG_MAC80211_IBSS_DEBUG */ 3621#endif /* CONFIG_MAC80211_IBSS_DEBUG */
3617 if (found && memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0 && 3622 if (found && memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0 &&
3618 (bss = ieee80211_rx_bss_get(dev, bssid, 3623 (bss = ieee80211_rx_bss_get(dev, bssid,
diff --git a/net/mac80211/rc80211_pid.h b/net/mac80211/rc80211_pid.h
index 04afc13ed825..4ea7b97d1af1 100644
--- a/net/mac80211/rc80211_pid.h
+++ b/net/mac80211/rc80211_pid.h
@@ -141,7 +141,6 @@ struct rc_pid_events_file_info {
141 * rate behaviour values (lower means we should trust more what we learnt 141 * rate behaviour values (lower means we should trust more what we learnt
142 * about behaviour of rates, higher means we should trust more the natural 142 * about behaviour of rates, higher means we should trust more the natural
143 * ordering of rates) 143 * ordering of rates)
144 * @fast_start: if Y, push high rates right after initialization
145 */ 144 */
146struct rc_pid_debugfs_entries { 145struct rc_pid_debugfs_entries {
147 struct dentry *dir; 146 struct dentry *dir;
@@ -154,7 +153,6 @@ struct rc_pid_debugfs_entries {
154 struct dentry *sharpen_factor; 153 struct dentry *sharpen_factor;
155 struct dentry *sharpen_duration; 154 struct dentry *sharpen_duration;
156 struct dentry *norm_offset; 155 struct dentry *norm_offset;
157 struct dentry *fast_start;
158}; 156};
159 157
160void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf, 158void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf,
@@ -267,9 +265,6 @@ struct rc_pid_info {
267 /* Normalization offset. */ 265 /* Normalization offset. */
268 unsigned int norm_offset; 266 unsigned int norm_offset;
269 267
270 /* Fast starst parameter. */
271 unsigned int fast_start;
272
273 /* Rates information. */ 268 /* Rates information. */
274 struct rc_pid_rateinfo *rinfo; 269 struct rc_pid_rateinfo *rinfo;
275 270
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
index a849b745bdb5..bcd27c1d7594 100644
--- a/net/mac80211/rc80211_pid_algo.c
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -398,13 +398,25 @@ static void *rate_control_pid_alloc(struct ieee80211_local *local)
398 return NULL; 398 return NULL;
399 } 399 }
400 400
401 pinfo->target = RC_PID_TARGET_PF;
402 pinfo->sampling_period = RC_PID_INTERVAL;
403 pinfo->coeff_p = RC_PID_COEFF_P;
404 pinfo->coeff_i = RC_PID_COEFF_I;
405 pinfo->coeff_d = RC_PID_COEFF_D;
406 pinfo->smoothing_shift = RC_PID_SMOOTHING_SHIFT;
407 pinfo->sharpen_factor = RC_PID_SHARPENING_FACTOR;
408 pinfo->sharpen_duration = RC_PID_SHARPENING_DURATION;
409 pinfo->norm_offset = RC_PID_NORM_OFFSET;
410 pinfo->rinfo = rinfo;
411 pinfo->oldrate = 0;
412
401 /* Sort the rates. This is optimized for the most common case (i.e. 413 /* Sort the rates. This is optimized for the most common case (i.e.
402 * almost-sorted CCK+OFDM rates). Kind of bubble-sort with reversed 414 * almost-sorted CCK+OFDM rates). Kind of bubble-sort with reversed
403 * mapping too. */ 415 * mapping too. */
404 for (i = 0; i < sband->n_bitrates; i++) { 416 for (i = 0; i < sband->n_bitrates; i++) {
405 rinfo[i].index = i; 417 rinfo[i].index = i;
406 rinfo[i].rev_index = i; 418 rinfo[i].rev_index = i;
407 if (pinfo->fast_start) 419 if (RC_PID_FAST_START)
408 rinfo[i].diff = 0; 420 rinfo[i].diff = 0;
409 else 421 else
410 rinfo[i].diff = i * pinfo->norm_offset; 422 rinfo[i].diff = i * pinfo->norm_offset;
@@ -425,19 +437,6 @@ static void *rate_control_pid_alloc(struct ieee80211_local *local)
425 break; 437 break;
426 } 438 }
427 439
428 pinfo->target = RC_PID_TARGET_PF;
429 pinfo->sampling_period = RC_PID_INTERVAL;
430 pinfo->coeff_p = RC_PID_COEFF_P;
431 pinfo->coeff_i = RC_PID_COEFF_I;
432 pinfo->coeff_d = RC_PID_COEFF_D;
433 pinfo->smoothing_shift = RC_PID_SMOOTHING_SHIFT;
434 pinfo->sharpen_factor = RC_PID_SHARPENING_FACTOR;
435 pinfo->sharpen_duration = RC_PID_SHARPENING_DURATION;
436 pinfo->norm_offset = RC_PID_NORM_OFFSET;
437 pinfo->fast_start = RC_PID_FAST_START;
438 pinfo->rinfo = rinfo;
439 pinfo->oldrate = 0;
440
441#ifdef CONFIG_MAC80211_DEBUGFS 440#ifdef CONFIG_MAC80211_DEBUGFS
442 de = &pinfo->dentries; 441 de = &pinfo->dentries;
443 de->dir = debugfs_create_dir("rc80211_pid", 442 de->dir = debugfs_create_dir("rc80211_pid",
@@ -465,9 +464,6 @@ static void *rate_control_pid_alloc(struct ieee80211_local *local)
465 de->norm_offset = debugfs_create_u32("norm_offset", 464 de->norm_offset = debugfs_create_u32("norm_offset",
466 S_IRUSR | S_IWUSR, de->dir, 465 S_IRUSR | S_IWUSR, de->dir,
467 &pinfo->norm_offset); 466 &pinfo->norm_offset);
468 de->fast_start = debugfs_create_bool("fast_start",
469 S_IRUSR | S_IWUSR, de->dir,
470 &pinfo->fast_start);
471#endif 467#endif
472 468
473 return pinfo; 469 return pinfo;
@@ -479,7 +475,6 @@ static void rate_control_pid_free(void *priv)
479#ifdef CONFIG_MAC80211_DEBUGFS 475#ifdef CONFIG_MAC80211_DEBUGFS
480 struct rc_pid_debugfs_entries *de = &pinfo->dentries; 476 struct rc_pid_debugfs_entries *de = &pinfo->dentries;
481 477
482 debugfs_remove(de->fast_start);
483 debugfs_remove(de->norm_offset); 478 debugfs_remove(de->norm_offset);
484 debugfs_remove(de->sharpen_duration); 479 debugfs_remove(de->sharpen_duration);
485 debugfs_remove(de->sharpen_factor); 480 debugfs_remove(de->sharpen_factor);
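
Note on the rc80211_pid hunk above: the pinfo field assignments move above the rate-sorting loop because that loop already reads pinfo->norm_offset (and, before its removal, fast_start). A minimal initialise-before-use sketch, with made-up toy_* names:

/* The field a loop reads must be set before the loop runs. */
#include <stdio.h>

struct toy_pinfo { unsigned int norm_offset; };

int main(void)
{
	struct toy_pinfo pinfo = { .norm_offset = 3 };	/* set before the loop */
	unsigned int diff[4];

	for (unsigned int i = 0; i < 4; i++)
		diff[i] = i * pinfo.norm_offset;	/* would read garbage if
							 * norm_offset were still
							 * uninitialised */
	printf("diff[3] = %u\n", diff[3]);
	return 0;
}
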
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 271cd01d57ae..dd28fb239a60 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -844,9 +844,15 @@ static int tcp_packet(struct nf_conn *ct,
844 /* Attempt to reopen a closed/aborted connection. 844 /* Attempt to reopen a closed/aborted connection.
845 * Delete this connection and look up again. */ 845 * Delete this connection and look up again. */
846 write_unlock_bh(&tcp_lock); 846 write_unlock_bh(&tcp_lock);
847 if (del_timer(&ct->timeout)) 847 /* Only repeat if we can actually remove the timer.
848 * Destruction may already be in progress in process
849 * context and we must give it a chance to terminate.
850 */
851 if (del_timer(&ct->timeout)) {
848 ct->timeout.function((unsigned long)ct); 852 ct->timeout.function((unsigned long)ct);
849 return -NF_REPEAT; 853 return -NF_REPEAT;
854 }
855 return -NF_DROP;
850 } 856 }
851 /* Fall through */ 857 /* Fall through */
852 case TCP_CONNTRACK_IGNORE: 858 case TCP_CONNTRACK_IGNORE:
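
Note on the conntrack hunk above: the timeout handler is re-run (and -NF_REPEAT returned) only when del_timer() confirms the timer was still pending; if destruction is already underway elsewhere, the packet is dropped instead. The userspace model below uses an atomic exchange as a stand-in for that "was it still pending?" test; it models the ownership rule, not kernel timers, and all toy_* names are invented.

/* Only the path that actually cancels the pending teardown may run it. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int teardown_pending = 1;

/* Returns 1 if we cancelled the pending teardown, 0 if someone beat us. */
static int toy_del_timer(void)
{
	return atomic_exchange(&teardown_pending, 0);
}

static int toy_reopen_connection(void)
{
	if (toy_del_timer()) {
		/* we own the teardown now: run it and ask for a retry */
		printf("teardown run here, repeat the packet\n");
		return 1;	/* -NF_REPEAT in the hunk */
	}
	/* destruction already in progress elsewhere: just drop */
	printf("teardown already claimed, drop\n");
	return 0;		/* -NF_DROP in the hunk */
}

int main(void)
{
	toy_reopen_connection();	/* first caller wins */
	toy_reopen_connection();	/* second caller must drop */
	return 0;
}
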
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index fdc14a0d21af..9080c61b71a5 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -584,12 +584,7 @@ list_start:
584 rcu_read_unlock(); 584 rcu_read_unlock();
585 585
586 genlmsg_end(ans_skb, data); 586 genlmsg_end(ans_skb, data);
587 587 return genlmsg_reply(ans_skb, info);
588 ret_val = genlmsg_reply(ans_skb, info);
589 if (ret_val != 0)
590 goto list_failure;
591
592 return 0;
593 588
594list_retry: 589list_retry:
595 /* XXX - this limit is a guesstimate */ 590 /* XXX - this limit is a guesstimate */
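
Note on the netlabel hunks (this file and the two below): once the reply is finalised, the handlers now return the send helper's result directly instead of checking it and jumping to a failure label meant for errors that happen before the message is handed off. A tiny sketch of that shape, with made-up toy_* names:

/* Propagate the callee's return value as-is from the tail of the handler. */
#include <stdio.h>

static int toy_send_reply(int msg)
{
	return msg < 0 ? msg : 0;	/* stand-in for the send helper */
}

static int toy_list_handler(int msg)
{
	/* ... build and finalise the message ... */
	return toy_send_reply(msg);	/* its error code reaches the caller */
}

int main(void)
{
	printf("ok path: %d, error path: %d\n",
	       toy_list_handler(1), toy_list_handler(-22));
	return 0;
}
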
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 22c191267808..44be5d5261f4 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -386,11 +386,7 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
386 rcu_read_unlock(); 386 rcu_read_unlock();
387 387
388 genlmsg_end(ans_skb, data); 388 genlmsg_end(ans_skb, data);
389 389 return genlmsg_reply(ans_skb, info);
390 ret_val = genlmsg_reply(ans_skb, info);
391 if (ret_val != 0)
392 goto listdef_failure;
393 return 0;
394 390
395listdef_failure_lock: 391listdef_failure_lock:
396 rcu_read_unlock(); 392 rcu_read_unlock();
@@ -501,11 +497,7 @@ static int netlbl_mgmt_version(struct sk_buff *skb, struct genl_info *info)
501 goto version_failure; 497 goto version_failure;
502 498
503 genlmsg_end(ans_skb, data); 499 genlmsg_end(ans_skb, data);
504 500 return genlmsg_reply(ans_skb, info);
505 ret_val = genlmsg_reply(ans_skb, info);
506 if (ret_val != 0)
507 goto version_failure;
508 return 0;
509 501
510version_failure: 502version_failure:
511 kfree_skb(ans_skb); 503 kfree_skb(ans_skb);
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 52b2611a6eb6..56f80872924e 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1107,11 +1107,7 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
1107 goto list_failure; 1107 goto list_failure;
1108 1108
1109 genlmsg_end(ans_skb, data); 1109 genlmsg_end(ans_skb, data);
1110 1110 return genlmsg_reply(ans_skb, info);
1111 ret_val = genlmsg_reply(ans_skb, info);
1112 if (ret_val != 0)
1113 goto list_failure;
1114 return 0;
1115 1111
1116list_failure: 1112list_failure:
1117 kfree_skb(ans_skb); 1113 kfree_skb(ans_skb);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 0c9d5a6950fe..fcdb45d1071b 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5899,12 +5899,6 @@ static int sctp_eat_data(const struct sctp_association *asoc,
5899 return SCTP_IERROR_NO_DATA; 5899 return SCTP_IERROR_NO_DATA;
5900 } 5900 }
5901 5901
5902 /* If definately accepting the DATA chunk, record its TSN, otherwise
5903 * wait for renege processing.
5904 */
5905 if (SCTP_CMD_CHUNK_ULP == deliver)
5906 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
5907
5908 chunk->data_accepted = 1; 5902 chunk->data_accepted = 1;
5909 5903
5910 /* Note: Some chunks may get overcounted (if we drop) or overcounted 5904 /* Note: Some chunks may get overcounted (if we drop) or overcounted
@@ -5924,6 +5918,9 @@ static int sctp_eat_data(const struct sctp_association *asoc,
5924 * and discard the DATA chunk. 5918 * and discard the DATA chunk.
5925 */ 5919 */
5926 if (ntohs(data_hdr->stream) >= asoc->c.sinit_max_instreams) { 5920 if (ntohs(data_hdr->stream) >= asoc->c.sinit_max_instreams) {
5921 /* Mark tsn as received even though we drop it */
5922 sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
5923
5927 err = sctp_make_op_error(asoc, chunk, SCTP_ERROR_INV_STRM, 5924 err = sctp_make_op_error(asoc, chunk, SCTP_ERROR_INV_STRM,
5928 &data_hdr->stream, 5925 &data_hdr->stream,
5929 sizeof(data_hdr->stream)); 5926 sizeof(data_hdr->stream));
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index ce6cda6b6994..a1f654aea268 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -710,6 +710,11 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
710 if (!skb) 710 if (!skb)
711 goto fail; 711 goto fail;
712 712
713 /* Now that all memory allocations for this chunk succeeded, we
714 * can mark it as received so the tsn_map is updated correctly.
715 */
716 sctp_tsnmap_mark(&asoc->peer.tsn_map, ntohl(chunk->subh.data_hdr->tsn));
717
713 /* First calculate the padding, so we don't inadvertently 718 /* First calculate the padding, so we don't inadvertently
714 * pass up the wrong length to the user. 719 * pass up the wrong length to the user.
715 * 720 *
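
Note on the two SCTP hunks (sm_statefuns.c above and this file): the TSN is now marked as received only after every allocation for the event has succeeded, with the invalid-stream error branch marking it explicitly before dropping. The sketch below illustrates that "commit shared state only after allocations succeed" ordering; the bitmask is a made-up stand-in for the peer tsn_map.

/* Allocate first, mark received second, so a failed allocation never leaves
 * a TSN recorded for data that was never delivered. */
#include <stdlib.h>
#include <stdio.h>

static unsigned int received_map;	/* stand-in for the peer tsn_map */

static int toy_deliver_chunk(unsigned int tsn, size_t payload_len)
{
	char *copy = malloc(payload_len);	/* all allocations first */

	if (!copy)
		return -1;			/* nothing was marked yet */

	received_map |= 1u << (tsn % 32);	/* now it is safe to mark */
	/* ... hand 'copy' up to the user ... */
	free(copy);
	return 0;
}

int main(void)
{
	int ret = toy_deliver_chunk(5, 128);

	printf("deliver: %d, map: %#x\n", ret, received_map);
	return 0;
}
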
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b976d9ed10e4..04c41504f84c 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -277,9 +277,8 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *
277 memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr)); 277 memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr));
278 x->props.flags = p->flags; 278 x->props.flags = p->flags;
279 279
280 if (!x->sel.family) 280 if (!x->sel.family && !(p->flags & XFRM_STATE_AF_UNSPEC))
281 x->sel.family = p->family; 281 x->sel.family = p->family;
282
283} 282}
284 283
285/* 284/*
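
Note on the xfrm_user hunk above: the selector family is defaulted from the request's family only when the caller did not set XFRM_STATE_AF_UNSPEC. A minimal sketch of gating a default on a flag; TOY_AF_UNSPEC_FLAG and the toy_* names are invented for illustration.

/* Apply the "inherit from the request" default only without the flag. */
#include <stdio.h>

#define TOY_AF_UNSPEC_FLAG 0x10	/* stands in for XFRM_STATE_AF_UNSPEC */

static int toy_pick_family(int sel_family, int req_family, unsigned int flags)
{
	if (!sel_family && !(flags & TOY_AF_UNSPEC_FLAG))
		sel_family = req_family;
	return sel_family;
}

int main(void)
{
	printf("default applied: %d\n", toy_pick_family(0, 2, 0));
	printf("flag set, stays unspecified: %d\n",
	       toy_pick_family(0, 2, TOY_AF_UNSPEC_FLAG));
	return 0;
}
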