125 files changed, 5240 insertions, 1219 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index d9014aa0eb68..e33ee74eee77 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -227,7 +227,6 @@ Each cgroup is represented by a directory in the cgroup file system
 containing the following files describing that cgroup:
 - tasks: list of tasks (by pid) attached to that cgroup
- - releasable flag: cgroup currently removeable?
 - notify_on_release flag: run the release agent on exit?
 - release_agent: the path to use for release notifications (this file
   exists in the top cgroup only)
@@ -360,7 +359,7 @@ Now you want to do something with this cgroup.
 In this directory you can find several files:
 # ls
-notify_on_release releasable tasks
+notify_on_release tasks
 (plus whatever files added by the attached subsystems)
 Now attach your shell to this cgroup:
@@ -479,7 +478,6 @@ newly-created cgroup if an error occurs after this subsystem's
 create() method has been called for the new cgroup).
 void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
-(cgroup_mutex held by caller)
 Called before checking the reference count on each subsystem. This may
 be useful for subsystems which have some extra references even if
@@ -498,6 +496,7 @@ remain valid while the caller holds cgroup_mutex.
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
            struct cgroup *old_cgrp, struct task_struct *task)
+(cgroup_mutex held by caller)
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
@@ -511,6 +510,7 @@ void exit(struct cgroup_subsys *ss, struct task_struct *task)
 Called during task exit.
 int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+(cgroup_mutex held by caller)
 Called after creation of a cgroup to allow a subsystem to populate
 the cgroup directory with file entries.  The subsystem should make
@@ -520,6 +520,7 @@ method can return an error code, the error code is currently not
 always handled well.
 void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
+(cgroup_mutex held by caller)
 Called at the end of cgroup_clone() to do any paramater
 initialization which might be required before a task could attach.  For
@@ -527,7 +528,7 @@ example in cpusets, no task may attach before 'cpus' and 'mems' are set
 up.
 void bind(struct cgroup_subsys *ss, struct cgroup *root)
-(cgroup_mutex held by caller)
+(cgroup_mutex and ss->hierarchy_mutex held by caller)
 Called when a cgroup subsystem is rebound to a different hierarchy
 and root cgroup. Currently this will only involve movement between
diff --git a/Documentation/controllers/memcg_test.txt b/Documentation/controllers/memcg_test.txt
new file mode 100644
index 000000000000..08d4d3ea0d79
--- /dev/null
+++ b/Documentation/controllers/memcg_test.txt
@@ -0,0 +1,342 @@
+Memory Resource Controller(Memcg)  Implementation Memo.
+Last Updated: 2008/12/15
+Base Kernel Version: based on 2.6.28-rc8-mm.
+Because VM is getting complex (one of reasons is memcg...), memcg's behavior
+is complex. This is a document for memcg's internal behavior.
+Please note that implementation details can be changed.
+(*) Topics on API should be in Documentation/controllers/memory.txt.
+0. How to record usage ?
+   2 objects are used.
+   page_cgroup ....an object per page.
+        Allocated at boot or memory hotplug. Freed at memory hot removal.
+   swap_cgroup ... an entry per swp_entry.
+        Allocated at swapon(). Freed at swapoff().
+   The page_cgroup has USED bit and double count against a page_cgroup never
+   occurs. swap_cgroup is used only when a charged page is swapped-out.
+1. Charge
+   a page/swp_entry may be charged (usage += PAGE_SIZE) at
+        mem_cgroup_newpage_charge()
+          Called at new page fault and Copy-On-Write.
+        mem_cgroup_try_charge_swapin()
+          Called at do_swap_page() (page fault on swap entry) and swapoff.
+          Followed by charge-commit-cancel protocol. (With swap accounting)
+          At commit, a charge recorded in swap_cgroup is removed.
+        mem_cgroup_cache_charge()
+          Called at add_to_page_cache()
+        mem_cgroup_cache_charge_swapin()
+          Called at shmem's swapin.
+        mem_cgroup_prepare_migration()
+          Called before migration. "extra" charge is done and followed by
+          charge-commit-cancel protocol.
+          At commit, charge against oldpage or newpage will be committed.
+2. Uncharge
+  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
+        mem_cgroup_uncharge_page()
+          Called when an anonymous page is fully unmapped. I.e., mapcount goes
+          to 0. If the page is SwapCache, uncharge is delayed until
+          mem_cgroup_uncharge_swapcache().
+        mem_cgroup_uncharge_cache_page()
+          Called when a page-cache is deleted from radix-tree. If the page is
+          SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache().
+        mem_cgroup_uncharge_swapcache()
+          Called when SwapCache is removed from radix-tree. The charge itself
+          is moved to swap_cgroup. (If mem+swap controller is disabled, no
+          charge to swap occurs.)
+        mem_cgroup_uncharge_swap()
+          Called when swp_entry's refcnt goes down to 0. A charge against swap
+          disappears.
+        mem_cgroup_end_migration(old, new)
+        At success of migration old is uncharged (if necessary), a charge
+        to new page is committed. At failure, charge to old page is committed.
+3. charge-commit-cancel
+        In some case, we can't know this "charge" is valid or not at charging
+        (because of races).
+        To handle such case, there are charge-commit-cancel functions.
+                mem_cgroup_try_charge_XXX
+                mem_cgroup_commit_charge_XXX
+                mem_cgroup_cancel_charge_XXX
+        these are used in swap-in and migration.
+        At try_charge(), there are no flags to say "this page is charged".
+        at this point, usage += PAGE_SIZE.
+        At commit(), the function checks the page should be charged or not
+        and set flags or avoid charging.(usage -= PAGE_SIZE)
+        At cancel(), simply usage -= PAGE_SIZE.
+In the explanation below, we assume CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y.
+4. Anonymous
+        Anonymous page is newly allocated at
+                  - page fault into MAP_ANONYMOUS mapping.
+                  - Copy-On-Write.
+        It is charged right after it's allocated before doing any page table
+        related operations. Of course, it's uncharged when another page is used
+        for the fault address.
+        At freeing anonymous page (by exit() or munmap()), zap_pte() is called
+        and pages for ptes are freed one by one.(see mm/memory.c). Uncharges
+        are done at page_remove_rmap() when page_mapcount() goes down to 0.
+        Another page freeing is by page-reclaim (vmscan.c) and anonymous
+        pages are swapped out. In this case, the page is marked as
+        PageSwapCache(). uncharge() routine doesn't uncharge the page marked
+        as SwapCache(). It's delayed until __delete_from_swap_cache().
+        4.1 Swap-in.
+        At swap-in, the page is taken from swap-cache. There are 2 cases.
+        (a) If the SwapCache is newly allocated and read, it has no charges.
+        (b) If the SwapCache has been mapped by processes, it has been
+            charged already.
+        Swap-in is one of the most complicated operations. In do_swap_page(),
+        the following events occur when the pte is unchanged.
+        (1) the page (SwapCache) is looked up.
+        (2) lock_page()
+        (3) try_charge_swapin()
+        (4) reuse_swap_page() (may call delete_from_swap_cache())
+        (5) commit_charge_swapin()
+        (6) swap_free().
+        Consider the following situations, for example.
+        (A) The page has not been charged before (2) and reuse_swap_page()
+            doesn't call delete_from_swap_cache().
+        (B) The page has not been charged before (2) and reuse_swap_page()
+            calls delete_from_swap_cache().
+        (C) The page has been charged before (2) and reuse_swap_page() doesn't
+            call delete_from_swap_cache().
+        (D) The page has been charged before (2) and reuse_swap_page() calls
+            delete_from_swap_cache().
+            memory.usage/memsw.usage changes to this page/swp_entry will be
+         Case          (A)      (B)       (C)     (D)
+         Event
+       Before (2)     0/ 1     0/ 1      1/ 1    1/ 1
+          ===========================================
+          (3)        +1/+1    +1/+1     +1/+1   +1/+1
+          (4)          -       0/ 0       -     -1/ 0
+          (5)         0/-1     0/ 0     -1/-1    0/ 0
+          (6)          -       0/-1       -      0/-1
+          ===========================================
+       Result         1/ 1     1/ 1      1/ 1    1/ 1
+       In any case, charges to this page should be 1/ 1.
+        4.2 Swap-out.
+        At swap-out, typical state transition is below.
+        (a) add to swap cache. (marked as SwapCache)
+            swp_entry's refcnt += 1.
+        (b) fully unmapped.
+            swp_entry's refcnt += # of ptes.
+        (c) write back to swap.
+        (d) delete from swap cache. (remove from SwapCache)
+            swp_entry's refcnt -= 1.
+        At (b), the page is marked as SwapCache and not uncharged.
+        At (d), the page is removed from SwapCache and a charge in page_cgroup
+        is moved to swap_cgroup.
+        Finally, at task exit,
+        (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
+        Here, a charge in swap_cgroup disappears.
+5. Page Cache
+        Page Cache is charged at
+        - add_to_page_cache_locked().
+        uncharged at
+        - __remove_from_page_cache().
+        The logic is very clear. (About migration, see below)
+        Note: __remove_from_page_cache() is called by remove_from_page_cache()
+        and __remove_mapping().
+6. Shmem(tmpfs) Page Cache
+        Memcg's charge/uncharge have special handlers of shmem. The best way
+        to understand shmem's page state transition is to read mm/shmem.c.
+        But brief explanation of the behavior of memcg around shmem will be
+        helpful to understand the logic.
+        Shmem's page (just leaf page, not direct/indirect block) can be on
+                - radix-tree of shmem's inode.
+                - SwapCache.
+                - Both on radix-tree and SwapCache. This happens at swap-in
+                  and swap-out,
+        It's charged when...
+        - A new page is added to shmem's radix-tree.
+        - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
+        It's uncharged when
+        - A page is removed from radix-tree and not SwapCache.
+        - When SwapCache is removed, a charge is moved to swap_cgroup.
+        - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup
+          disappears.
+7. Page Migration
+        One of the most complicated functions is page-migration-handler.
+        Memcg has 2 routines. Assume that we are migrating a page's contents
+        from OLDPAGE to NEWPAGE.
+        Usual migration logic is..
+        (a) remove the page from LRU.
+        (b) allocate NEWPAGE (migration target)
+        (c) lock by lock_page().
+        (d) unmap all mappings.
+        (e-1) If necessary, replace entry in radix-tree.
+        (e-2) move contents of a page.
+        (f) map all mappings again.
+        (g) pushback the page to LRU.
+        (-) OLDPAGE will be freed.
+        Before (g), memcg should complete all necessary charge/uncharge to
+        NEWPAGE/OLDPAGE.
+        The point is....
+        - If OLDPAGE is anonymous, all charges will be dropped at (d) because
+          try_to_unmap() drops all mapcount and the page will not be
+          SwapCache.
+        - If OLDPAGE is SwapCache, charges will be kept at (g) because
+          __delete_from_swap_cache() isn't called at (e-1)
+        - If OLDPAGE is page-cache, charges will be kept at (g) because
+          remove_from_page_cache() isn't called at (e-1)
+        memcg provides following hooks.
+        - mem_cgroup_prepare_migration(OLDPAGE)
+          Called after (b) to account a charge (usage += PAGE_SIZE) against
+          memcg which OLDPAGE belongs to.
+        - mem_cgroup_end_migration(OLDPAGE, NEWPAGE)
+          Called after (f) before (g).
+          If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already
+          charged, a charge by prepare_migration() is automatically canceled.
+          If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE.
+          But zap_pte() (by exit or munmap) can be called while migration,
+          we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
+8. LRU
+        Each memcg has its own private LRU. Now, its handling is under global
+        VM's control (meaning that it's handled under global zone->lru_lock).
+        Almost all routines around memcg's LRU are called by global LRU's
+        list management functions under zone->lru_lock.
+        A special function is mem_cgroup_isolate_pages(). This scans
+        memcg's private LRU and call __isolate_lru_page() to extract a page
+        from LRU.
+        (By __isolate_lru_page(), the page is removed from both of global and
+         private LRU.)
+9. Typical Tests.
+ Tests for racy cases.
+ 9.1 Small limit to memcg.
+        When testing for racy cases, it's a good idea to set memcg's limit
+        to be very small rather than GB. Many races were found in tests under
+        xKB or xxMB limits.
+        (Memory behavior under GB and Memory behavior under MB shows very
+         different situation.)
+ 9.2 Shmem
+        Historically, memcg's shmem handling was poor and we saw some amount
+        of troubles here. This is because shmem is page-cache but can be
+        SwapCache. Test with shmem/tmpfs is always good test.
+ 9.3 Migration
+        For NUMA, migration is another special case. For an easy test, cpuset
+        is useful. Following is a sample script to do migration.
+        mount -t cgroup -o cpuset none /opt/cpuset
+        mkdir /opt/cpuset/01
+        echo 1 > /opt/cpuset/01/cpuset.cpus
+        echo 0 > /opt/cpuset/01/cpuset.mems
+        echo 1 > /opt/cpuset/01/cpuset.memory_migrate
+        mkdir /opt/cpuset/02
+        echo 1 > /opt/cpuset/02/cpuset.cpus
+        echo 1 > /opt/cpuset/02/cpuset.mems
+        echo 1 > /opt/cpuset/02/cpuset.memory_migrate
+        In the above setup, when you move a task from 01 to 02, page migration
+        from node 0 to node 1 will occur. Following is a script to migrate all
+        tasks under a cpuset.
+        --
+        move_task()
+        {
+        for pid in $1
+        do
+                /bin/echo $pid >$2/tasks 2>/dev/null
+                echo -n $pid
+                echo -n " "
+        done
+        echo END
+        }
+        G1_TASK=`cat ${G1}/tasks`
+        G2_TASK=`cat ${G2}/tasks`
+        move_task "${G1_TASK}" ${G2} &
+        --
+ 9.4 Memory hotplug.
+        Memory hotplug is another good test.
+        To offline memory, do the following.
+        # echo offline > /sys/devices/system/memory/memoryXXX/state
+        (XXX is the place of memory)
+        This is an easy way to test page migration, too.
+ 9.5 mkdir/rmdir
+        When using hierarchy, mkdir/rmdir test should be done.
+        Use tests like the following.
+        echo 1 >/opt/cgroup/01/memory.use_hierarchy
+        mkdir /opt/cgroup/01/child_a
+        mkdir /opt/cgroup/01/child_b
+        set limit to 01.
+        add limit to 01/child_b
+        run jobs under child_a and child_b
+        create/delete following groups at random while jobs are running.
+        /opt/cgroup/01/child_a/child_aa
+        /opt/cgroup/01/child_b/child_bb
+        /opt/cgroup/01/child_c
+        running new jobs in new group is also good.
+ 9.6 Mount with other subsystems.
+        Mounting with other subsystems is a good test because there is a
+        race and lock dependency with other cgroup subsystems.
+        example)
+        # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+        and do task move, mkdir, rmdir etc...under this.
diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt
index 1c07547d3f81..e1501964df1e 100644
--- a/Documentation/controllers/memory.txt
+++ b/Documentation/controllers/memory.txt
@@ -137,7 +137,32 @@ behind this approach is that a cgroup that aggressively uses a shared
 page will eventually get charged for it (once it is uncharged from
 the cgroup that brought it in -- this will happen on memory pressure).
-2.4 Reclaim
+Exception: If CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not used.
+When you do swapoff and force swapped-out pages of shmem(tmpfs) to
+be brought back into memory, charges for pages are accounted against the
+caller of swapoff rather than the users of shmem.
+2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP)
+Swap Extension allows you to record charge for swap. A swapped-in page is
+charged back to original page allocator if possible.
+When swap is accounted, following files are added.
+ - memory.memsw.usage_in_bytes.
+ - memory.memsw.limit_in_bytes.
+usage of mem+swap is limited by memsw.limit_in_bytes.
+Note: why 'mem+swap' rather than swap.
+The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
+to move account from memory to swap...there is no change in usage of
+mem+swap.
+In other words, when we want to limit the usage of swap without affecting
+global LRU, mem+swap limit is better than just limiting swap from OS point
+of view.
+2.5 Reclaim
 Each cgroup maintains a per cgroup LRU that consists of an active
 and inactive list. When a cgroup goes over its limit, we first try
@@ -207,12 +232,6 @@ exceeded.
 The memory.stat file gives accounting information. Now, the number of
 caches, RSS and Active pages/Inactive pages are shown.
-The memory.force_empty gives an interface to drop *all* charges by force.
-# echo 1 > memory.force_empty
-will drop all charges in cgroup. Currently, this is maintained for test.
 4. Testing
 Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11].
@@ -242,10 +261,106 @@ reclaimed.
 A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
 cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. Such charges are automatically dropped at
+tasks have migrated away from it.
-rmdir() if there are no tasks.
+Such charges are freed(at default) or moved to its parent. When moved,
+both of RSS and CACHES are moved to parent.
+If both of them are busy, rmdir() returns -EBUSY. See 5.1 also.
+Charges recorded in swap information are not updated at removal of cgroup.
+Recorded information is discarded and a cgroup which uses swap (swapcache)
+will be charged as a new owner of it.
+5. Misc. interfaces.
+5.1 force_empty
+  memory.force_empty interface is provided to make cgroup's memory usage empty.
+  You can use this interface only when the cgroup has no tasks.
+  When anything is written to this file, e.g.
+  # echo 0 > memory.force_empty
+  Almost all pages tracked by this memcg will be unmapped and freed. Some of
+  pages cannot be freed because it's locked or in-use. Such pages are moved
+  to parent and this cgroup will be empty. But this may return -EBUSY in
+  some too busy case.
+  Typical use case of this interface is that calling this before rmdir().
+  Because rmdir() moves all pages to parent, some out-of-use page caches can be
+  moved to the parent. If you want to avoid that, force_empty will be useful.
+5.2 stat file
+  memory.stat file includes following statistics (now)
+        cache                   - # of pages from page-cache and shmem.
+        rss                     - # of pages from anonymous memory.
+        pgpgin                  - # of event of charging
+        pgpgout                 - # of event of uncharging
+        active_anon             - # of pages on active lru of anon, shmem.
+        inactive_anon           - # of pages on inactive lru of anon, shmem
+        active_file             - # of pages on active lru of file-cache
+        inactive_file           - # of pages on inactive lru of file cache
+        unevictable             - # of pages cannot be reclaimed.(mlocked etc)
+        The following depend on CONFIG_DEBUG_VM.
+        inactive_ratio          - VM internal parameter. (see mm/page_alloc.c)
+        recent_rotated_anon     - VM internal parameter. (see mm/vmscan.c)
+        recent_rotated_file     - VM internal parameter. (see mm/vmscan.c)
+        recent_scanned_anon     - VM internal parameter. (see mm/vmscan.c)
+        recent_scanned_file     - VM internal parameter. (see mm/vmscan.c)
+  Memo:
+        recent_rotated means recent frequency of lru rotation.
+        recent_scanned means recent # of scans to lru.
+        These are shown for easier debugging; please see the code for meanings.
+5.3 swappiness
+  Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
+  The swappiness of the following cgroups can't be changed.
+  - root cgroup (uses /proc/sys/vm/swappiness).
+  - a cgroup which uses hierarchy and it has child cgroup.
+  - a cgroup which uses hierarchy and not the root of hierarchy.
+6. Hierarchy support
+The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the
+cgroup filesystem. Consider for example, the following cgroup filesystem
+hierarchy
+                root
+             /  |   \
+           /    |    \
+          a     b       c
+                        | \
+                        |  \
+                        d   e
+In the diagram above, with hierarchical accounting enabled, all memory
+usage of e, is accounted to its ancestors up until the root (i.e, c and root),
+that has memory.use_hierarchy enabled.  If one of the ancestors goes over its
+limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
+children of the ancestor.
+6.1 Enabling hierarchical accounting and reclaim
+The memory controller by default disables the hierarchy feature. Support
+can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup
+# echo 1 > memory.use_hierarchy
+The feature can be disabled by
+# echo 0 > memory.use_hierarchy
+NOTE1: Enabling/disabling will fail if the cgroup already has other
+cgroups created below it.
+NOTE2: This feature can be enabled/disabled per subtree.
-5. TODO
+7. TODO
 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/Documentation/hwmon/abituguru-datasheet b/Documentation/hwmon/abituguru-datasheet
index 4d184f2db0ea..d9251efdcec7 100644
--- a/Documentation/hwmon/abituguru-datasheet
+++ b/Documentation/hwmon/abituguru-datasheet
@@ -121,7 +121,7 @@ Once all bytes have been read data will hold 0x09, but there is no reason to
 test for this. Notice that the number of bytes is bank address dependent see
 above and below.
-After completing a successfull read it is advised to put the uGuru back in
+After completing a successful read it is advised to put the uGuru back in
 ready mode, so that it is ready for the next read / write cycle. This way
 if your program / driver is unloaded and later loaded again the detection
 algorithm described above will still work.
@@ -141,7 +141,7 @@ don't ask why this is the way it is.
 Once DATA holds 0x01 read CMD it should hold 0xAC now.
-After completing a successfull write it is advised to put the uGuru back in
+After completing a successful write it is advised to put the uGuru back in
 ready mode, so that it is ready for the next read / write cycle. This way
 if your program / driver is unloaded and later loaded again the detection
 algorithm described above will still work.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 532eacbbed62..fb849020aea9 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1562,6 +1562,9 @@ and is between 256 and 4096 characters. It is defined in the file
        nosoftlockup    [KNL] Disable the soft-lockup detector.
+        noswapaccount   [KNL] Disable accounting of swap in memory resource
+                        controller. (See Documentation/controllers/memory.txt)
        nosync          [HW,M68K] Disables sync negotiation for all devices.
        notsc           [BUGS=X86-32] Disable Time Stamp Counter
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
index 38d324d62b25..e5b071d46619 100644
--- a/Documentation/scsi/scsi_fc_transport.txt
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -191,7 +191,7 @@ Vport States:
      This is equivalent to a driver "attach" on an adapter, which is
      independent of the adapter's link state.
    - Instantiation of the vport on the FC link via ELS traffic, etc.
-      This is equivalent to a "link up" and successfull link initialization.
+      This is equivalent to a "link up" and successful link initialization.
  Further information can be found in the interfaces section below for
  Vport Creation.
@@ -320,7 +320,7 @@ Vport Creation:
      This is equivalent to a driver "attach" on an adapter, which is
      independent of the adapter's link state.
    - Instantiation of the vport on the FC link via ELS traffic, etc.
-      This is equivalent to a "link up" and successfull link initialization.
+      This is equivalent to a "link up" and successful link initialization.
  The LLDD's vport_create() function will not synchronously wait for both
  parts to be fully completed before returning. It must validate that the
diff --git a/Documentation/w1/masters/00-INDEX b/Documentation/w1/masters/00-INDEX
index 7b0ceaaad7af..d63fa024ac05 100644
--- a/Documentation/w1/masters/00-INDEX
+++ b/Documentation/w1/masters/00-INDEX
@@ -4,5 +4,7 @@ ds2482
        - The Maxim/Dallas Semiconductor DS2482 provides 1-wire busses.
 ds2490
        - The Maxim/Dallas Semiconductor DS2490 builds USB <-> W1 bridges.
+mxc-w1
+        - W1 master controller driver found on Freescale MX2/MX3 SoCs
 w1-gpio
        - GPIO 1-wire bus master driver.
diff --git a/Documentation/w1/masters/mxc-w1 b/Documentation/w1/masters/mxc-w1
new file mode 100644
index 000000000000..97f6199a7f39
--- /dev/null
+++ b/Documentation/w1/masters/mxc-w1
@@ -0,0 +1,11 @@
+Kernel driver mxc_w1
+====================
+Supported chips:
+  * Freescale MX27, MX31 and probably other i.MX SoCs
+    Datasheets:
+        http://www.freescale.com/files/32bit/doc/data_sheet/MCIMX31.pdf?fpsp=1
+        http://www.freescale.com/files/dsp/MCIMX27.pdf?fpsp=1
+Author: Originally based on Freescale code, prepared for mainline by
+        Sascha Hauer <s.hauer@pengutronix.de>
diff --git a/Documentation/w1/w1.netlink b/Documentation/w1/w1.netlink
index 3640c7c87d45..804445f745ed 100644
--- a/Documentation/w1/w1.netlink
+++ b/Documentation/w1/w1.netlink
@@ -5,69 +5,157 @@ Message types.
 =============
 There are three types of messages between w1 core and userspace:
-1. Events. They are generated each time new master or slave device found
+1. Events. They are generated each time new master or slave device
-        either due to automatic or requested search.
+        found either due to automatic or requested search.
-2. Userspace commands. Includes read/write and search/alarm search comamnds.
+2. Userspace commands.
 3. Replies to userspace commands.
 Protocol.
 ========
-[struct cn_msg] - connector header. It's length field is equal to size of the attached data.
+[struct cn_msg] - connector header.
+        Its length field is equal to size of the attached data
 [struct w1_netlink_msg] - w1 netlink header.
        __u8 type       - message type.
-                        W1_SLAVE_ADD/W1_SLAVE_REMOVE - slave add/remove events.
+                        W1_LIST_MASTERS
-                        W1_MASTER_ADD/W1_MASTER_REMOVE - master add/remove events.
+                                list current bus masters
-                        W1_MASTER_CMD - userspace command for bus master device (search/alarm search).
+                        W1_SLAVE_ADD/W1_SLAVE_REMOVE
-                        W1_SLAVE_CMD - userspace command for slave device (read/write/ search/alarm search
+                                slave add/remove events
-                                        for bus master device where given slave device found).
+                        W1_MASTER_ADD/W1_MASTER_REMOVE
+                                master add/remove events
+                        W1_MASTER_CMD
+                                userspace command for bus master
+                                device (search/alarm search)
+                        W1_SLAVE_CMD
+                                userspace command for slave device
+                                (read/write/touch)
        __u8 res        - reserved
-        __u16 len       - size of attached to this header data.
+        __u16 len       - size of data attached to this header
        union {
-                __u8 id;                         - slave unique device id
+                __u8 id[8];                      - slave unique device id
                struct w1_mst {
-                        __u32           id;      - master's id.
+                        __u32           id;      - master's id
                        __u32           res;     - reserved
                } mst;
        } id;
-[strucrt w1_netlink_cmd] - command for gived master or slave device.
+[struct w1_netlink_cmd] - command for given master or slave device.
        __u8 cmd        - command opcode.
-                        W1_CMD_READ     - read command.
+                        W1_CMD_READ     - read command
-                        W1_CMD_WRITE    - write command.
+                        W1_CMD_WRITE    - write command
-                        W1_CMD_SEARCH   - search command.
+                        W1_CMD_TOUCH    - touch command
-                        W1_CMD_ALARM_SEARCH - alarm search command.
+                                (write and sample data back to userspace)
+                        W1_CMD_SEARCH   - search command
+                        W1_CMD_ALARM_SEARCH - alarm search command
        __u8 res        - reserved
-        __u16 len       - length of data for this command.
+        __u16 len       - length of data for this command
-                        For read command data must be allocated like for write command.
+                For read command data must be allocated like for write command
-        __u8 data[0]    - data for this command.
+        __u8 data[0]    - data for this command
-Each connector message can include one or more w1_netlink_msg with zero of more attached w1_netlink_cmd messages.
+Each connector message can include one or more w1_netlink_msg with
+zero or more attached w1_netlink_cmd messages.
-For event messages there are no w1_netlink_cmd embedded structures, only connector header
+For event messages there are no w1_netlink_cmd embedded structures,
-and w1_netlink_msg strucutre with "len" field being zero and filled type (one of event types)
+only connector header and w1_netlink_msg structure with "len" field
-and id - either 8 bytes of slave unique id in host order, or master's id, which is assigned
+being zero and filled type (one of event types) and id:
-to bus master device when it is added to w1 core.
+either 8 bytes of slave unique id in host order,
+or master's id, which is assigned to bus master device
+when it is added to w1 core.
+Currently replies to userspace commands are only generated for read
+command request. One reply is generated exactly for one w1_netlink_cmd
+read request. Replies are not combined when sent - i.e. typical reply
+message looks like the following:
-Currently replies to userspace commands are only generated for read command request.
-One reply is generated exactly for one w1_netlink_cmd read request.
-Replies are not combined when sent - i.e. typical reply messages looks like the following:
 [cn_msg][w1_netlink_msg][w1_netlink_cmd]
-cn_msg.len = sizeof(struct w1_netlink_msg) + sizeof(struct w1_netlink_cmd) + cmd->len;
+cn_msg.len = sizeof(struct w1_netlink_msg) +
+             sizeof(struct w1_netlink_cmd) +
+             cmd->len;
 w1_netlink_msg.len = sizeof(struct w1_netlink_cmd) + cmd->len;
 w1_netlink_cmd.len = cmd->len;
+Replies to W1_LIST_MASTERS should send a message back to the userspace
+which will contain list of all registered master ids in the following
+format:
+        cn_msg (CN_W1_IDX.CN_W1_VAL as id, len is equal to sizeof(struct
+        w1_netlink_msg) plus number of masters multiplied by 4)
+        w1_netlink_msg (type: W1_LIST_MASTERS, len is equal to
+                number of masters multiplied by 4 (u32 size))
+        id0 ... idN
+        Each message is at most 4k in size, so if number of master devices
+        exceeds this, it will be split into several messages,
+        cn.seq will be increased for each one.
+W1 search and alarm search commands.
+request:
+[cn_msg]
+  [w1_netlink_msg type = W1_MASTER_CMD
+        id is equal to the bus master id to use for searching]
+  [w1_netlink_cmd cmd = W1_CMD_SEARCH or W1_CMD_ALARM_SEARCH]
+reply:
+  [cn_msg, ack = 1 and increasing, 0 means the last message,
+        seq is equal to the request seq]
+  [w1_netlink_msg type = W1_MASTER_CMD]
+  [w1_netlink_cmd cmd = W1_CMD_SEARCH or W1_CMD_ALARM_SEARCH
+        len is equal to number of IDs multiplied by 8]
+  [64bit-id0 ... 64bit-idN]
+Length in each header corresponds to the size of the data behind it, so
+w1_netlink_cmd->len = N * 8; where N is number of IDs in this message.
+        Can be zero.
+w1_netlink_msg->len = sizeof(struct w1_netlink_cmd) + N * 8;
+cn_msg->len = sizeof(struct w1_netlink_msg) +
+              sizeof(struct w1_netlink_cmd) +
+              N*8;
+W1 reset command.
+[cn_msg]
+  [w1_netlink_msg type = W1_MASTER_CMD
+        id is equal to the bus master id to use for searching]
+  [w1_netlink_cmd cmd = W1_CMD_RESET]
+Command status replies.
+======================
+Each command (either root, master or slave with or without w1_netlink_cmd
+structure) will be 'acked' by the w1 core. Format of the reply is the same
+as request message except that length parameters do not account for data
+requested by the user, i.e. read/write/touch IO requests will not contain
+data, so w1_netlink_cmd.len will be 0, w1_netlink_msg.len will be size
+of the w1_netlink_cmd structure and cn_msg.len will be equal to the sum
+of the sizeof(struct w1_netlink_msg) and sizeof(struct w1_netlink_cmd).
+If reply is generated for master or root command (which do not have
+w1_netlink_cmd attached), reply will contain only cn_msg and w1_netlink_msg
+structures.
+w1_netlink_msg.status field will carry positive error value
+(EINVAL for example) or zero in case of success.
+All other fields in every structure will mirror the same parameters in the
+request message (except lengths as described above).
+Status reply is generated for every w1_netlink_cmd embedded in the
+w1_netlink_msg, if there are no w1_netlink_cmd structures,
+reply will be generated for the w1_netlink_msg.
+All w1_netlink_cmd command structures are handled in every w1_netlink_msg,
+even if there were errors, only length mismatch interrupts message processing.
 Operation steps in w1 core when new command is received.
 =======================================================
-When new message (w1_netlink_msg) is received w1 core detects if it is master of slave request,
+When new message (w1_netlink_msg) is received w1 core detects if it is
-according to w1_netlink_msg.type field.
+master or slave request, according to w1_netlink_msg.type field.
 Then master or slave device is searched for.
-When found, master device (requested or those one on where slave device is found) is locked.
+When found, master device (requested or the one on which slave device
-If slave command is requested, then reset/select procedure is started to select given device.
+is found) is locked. If slave command is requested, then reset/select
+procedure is started to select given device.
 Then all requested in w1_netlink_msg operations are performed one by one.
 If command requires reply (like read command) it is sent on command completion.
@@ -82,8 +170,8 @@ Connector [1] specific documentation.
 Each connector message includes two u32 fields as "address".
 w1 uses CN_W1_IDX and CN_W1_VAL defined in include/linux/connector.h header.
 Each message also includes sequence and acknowledge numbers.
-Sequence number for event messages is appropriate bus master sequence number increased with
+Sequence number for event messages is appropriate bus master sequence number
-each event message sent "through" this master.
+increased with each event message sent "through" this master.
 Sequence number for userspace requests is set by userspace application.
 Sequence number for reply is the same as was in request, and
 acknowledge number is set to seq+1.
@@ -93,6 +181,6 @@ Additional documantion, source code examples.
 ============================================
 1. Documentation/connector
-2. http://tservice.net.ru/~s0mbre/archive/w1
+2. http://www.ioremap.net/archive/w1
-This archive includes userspace application w1d.c which
+This archive includes userspace application w1d.c which uses
-uses read/write/search commands for all master/slave devices found on the bus.
+read/write/search commands for all master/slave devices found on the bus.
diff --git a/MAINTAINERS b/MAINTAINERS
index 246878f41cf5..57e0309243cc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1360,6 +1360,11 @@ P:	Maciej W. Rozycki
 M:      macro@linux-mips.org
 S:      Maintained
+DELL LAPTOP DRIVER
+P:      Matthew Garrett
+M:      mjg59@srcf.ucam.org
+S:      Maintained
 DELL LAPTOP SMM DRIVER
 P:      Massimo Dal Zotto
 M:      dz@debian.org
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 15c62d3ca129..3bf908e2873a 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -314,7 +314,7 @@ extern char *isolated_loader;
 *      we need to call spu_release(ctx) before sleeping, and
 *      then spu_acquire(ctx) when awoken.
 *
- *      Returns with state_mutex re-acquired when successfull or
+ *      Returns with state_mutex re-acquired when successful or
 *      with -ERESTARTSYS and the state_mutex dropped when interrupted.
 */
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index 4f8d60586b07..8040376c4890 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -54,7 +54,8 @@ extern int __smp4m_processor_id(void);
 #define SMP_PRINTK(x)
 #endif
-static inline unsigned long swap(volatile unsigned long *ptr, unsigned long val)
+static inline unsigned long
+swap_ulong(volatile unsigned long *ptr, unsigned long val)
 {
        __asm__ __volatile__("swap [%1], %0\n\t" :
                             "=&r" (val), "=&r" (ptr) :
@@ -90,7 +91,7 @@ void __cpuinit smp4m_callin(void)
         * to call the scheduler code.
         */
        /* Allow master to continue. */
-        swap(&cpu_callin_map[cpuid], 1);
+        swap_ulong(&cpu_callin_map[cpuid], 1);
        /* XXX: What's up with all the flushes? */
        local_flush_cache_all();
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index b0461856acfb..a4cff5d6e380 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -982,7 +982,7 @@ static int __init longhaul_init(void)
        case 10:
                printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
        default:
-                ;;
+                ;
        }
        return -ENODEV;
diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index 088885ed51b9..e1c7611e9144 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -64,7 +64,7 @@
 #include <linux/jiffies.h>
 #include "iphase.h"               
 #include "suni.h"                 
-#define swap(x) (((x & 0xff) << 8) | ((x & 0xff00) >> 8))  
+#define swap_byte_order(x) (((x & 0xff) << 8) | ((x & 0xff00) >> 8))
 #define PRIV(dev) ((struct suni_priv *) dev->phy_data)
@@ -1306,7 +1306,7 @@ static void rx_dle_intr(struct atm_dev *dev)
          // get real pkt length  pwang_test
          trailer = (struct cpcs_trailer*)((u_char *)skb->data +
                                 skb->len - sizeof(*trailer));
-          length =  swap(trailer->length);
+          length = swap_byte_order(trailer->length);
          if ((length > iadev->rx_buf_sz) || (length > 
                              (skb->len - sizeof(struct cpcs_trailer))))
          {
@@ -2995,7 +2995,7 @@ static int ia_pkt_tx (struct atm_vcc *vcc, struct sk_buff *skb) {
                skb->len, PCI_DMA_TODEVICE);
        wr_ptr->local_pkt_addr = (buf_desc_ptr->buf_start_hi << 16) | 
                                                  buf_desc_ptr->buf_start_lo;  
-        /* wr_ptr->bytes = swap(total_len);     didn't seem to affect ?? */  
+        /* wr_ptr->bytes = swap_byte_order(total_len); didn't seem to affect?? */
        wr_ptr->bytes = skb->len;  
        /* hw bug - DLEs of 0x2d, 0x2e, 0x2f cause DMA lockup */
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 112a6ba9a96f..146c97613da0 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -32,7 +32,7 @@
 /* These are global because they are accessed in tty_io.c */
 #ifdef CONFIG_UNIX98_PTYS
-struct tty_driver *ptm_driver;
+static struct tty_driver *ptm_driver;
 static struct tty_driver *pts_driver;
 #endif
diff --git a/drivers/char/tpm/tpm_nsc.c b/drivers/char/tpm/tpm_nsc.c
index ab18c1e7b115..70efba2ee053 100644
--- a/drivers/char/tpm/tpm_nsc.c
+++ b/drivers/char/tpm/tpm_nsc.c
@@ -273,12 +273,23 @@ static void tpm_nsc_remove(struct device *dev)
        }
 }
-static struct device_driver nsc_drv = {
+static int tpm_nsc_suspend(struct platform_device *dev, pm_message_t msg)
-        .name = "tpm_nsc",
+{
-        .bus = &platform_bus_type,
+        return tpm_pm_suspend(&dev->dev, msg);
-        .owner = THIS_MODULE,
+}
-        .suspend = tpm_pm_suspend,
-        .resume = tpm_pm_resume,
+static int tpm_nsc_resume(struct platform_device *dev)
+{
+        return tpm_pm_resume(&dev->dev);
+}
+static struct platform_driver nsc_drv = {
+        .suspend         = tpm_nsc_suspend,
+        .resume          = tpm_nsc_resume,
+        .driver          = {
+                .name    = "tpm_nsc",
+                .owner   = THIS_MODULE,
+        },
 };
 static int __init init_nsc(void)
@@ -297,7 +308,7 @@ static int __init init_nsc(void)
                        return -ENODEV;
        }
-        err = driver_register(&nsc_drv);
+        err = platform_driver_register(&nsc_drv);
        if (err)
                return err;
@@ -308,17 +319,15 @@ static int __init init_nsc(void)
        /* enable the DPM module */
        tpm_write_index(nscAddrBase, NSC_LDC_INDEX, 0x01);
-        pdev = kzalloc(sizeof(struct platform_device), GFP_KERNEL);
+        pdev = platform_device_alloc("tpm_nscl0", -1);
        if (!pdev) {
                rc = -ENOMEM;
                goto err_unreg_drv;
        }
-        pdev->name = "tpm_nscl0";
-        pdev->id = -1;
        pdev->num_resources = 0;
+        pdev->dev.driver = &nsc_drv.driver;
        pdev->dev.release = tpm_nsc_remove;
-        pdev->dev.driver = &nsc_drv;
        if ((rc = platform_device_register(pdev)) < 0)
                goto err_free_dev;
@@ -377,7 +386,7 @@ err_unreg_dev:
 err_free_dev:
        kfree(pdev);
 err_unreg_drv:
-        driver_unregister(&nsc_drv);
+        platform_driver_unregister(&nsc_drv);
        return rc;
 }
@@ -390,7 +399,7 @@ static void __exit cleanup_nsc(void)
                pdev = NULL;
        }
-        driver_unregister(&nsc_drv);
+        platform_driver_unregister(&nsc_drv);
 }
 module_init(init_nsc);
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 80014213fb53..7900bd63b36d 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -969,8 +969,7 @@ int vc_resize(struct vc_data *vc, unsigned int cols, unsigned int rows)
 *      Takes the console sem and the called methods then take the tty
 *      termios_mutex and the tty ctrl_lock in that order.
 */
+static int vt_resize(struct tty_struct *tty, struct winsize *ws)
-int vt_resize(struct tty_struct *tty, struct winsize *ws)
 {
        struct vc_data *vc = tty->driver_data;
        int ret;
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index 50a071f1c945..777fba48d2d3 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -238,11 +238,11 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
 }
 /**
- * smi_request: generate SMI request
+ * dcdbas_smi_request: generate SMI request
 *
 * Called with smi_data_lock.
 */
-static int smi_request(struct smi_cmd *smi_cmd)
+int dcdbas_smi_request(struct smi_cmd *smi_cmd)
 {
        cpumask_t old_mask;
        int ret = 0;
@@ -309,14 +309,14 @@ static ssize_t smi_request_store(struct device *dev,
        switch (val) {
        case 2:
                /* Raw SMI */
-                ret = smi_request(smi_cmd);
+                ret = dcdbas_smi_request(smi_cmd);
                if (!ret)
                        ret = count;
                break;
        case 1:
                /* Calling Interface SMI */
                smi_cmd->ebx = (u32) virt_to_phys(smi_cmd->command_buffer);
-                ret = smi_request(smi_cmd);
+                ret = dcdbas_smi_request(smi_cmd);
                if (!ret)
                        ret = count;
                break;
@@ -333,6 +333,7 @@ out:
        mutex_unlock(&smi_data_lock);
        return ret;
 }
+EXPORT_SYMBOL(dcdbas_smi_request);
 /**
 * host_control_smi: generate host control SMI
diff --git a/drivers/firmware/dcdbas.h b/drivers/firmware/dcdbas.h
index 87bc3417de27..ca3cb0a54ab6 100644
--- a/drivers/firmware/dcdbas.h
+++ b/drivers/firmware/dcdbas.h
@@ -101,5 +101,7 @@ struct apm_cmd {
        } __attribute__ ((packed)) parameters;
 } __attribute__ ((packed));
+int dcdbas_smi_request(struct smi_cmd *smi_cmd);
 #endif /* _DCDBAS_H_ */
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index 3bf8ee120d42..261b9aa3f248 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -56,9 +56,9 @@ struct memmap_attribute {
        ssize_t (*show)(struct firmware_map_entry *entry, char *buf);
 };
-struct memmap_attribute memmap_start_attr = __ATTR_RO(start);
+static struct memmap_attribute memmap_start_attr = __ATTR_RO(start);
-struct memmap_attribute memmap_end_attr   = __ATTR_RO(end);
+static struct memmap_attribute memmap_end_attr   = __ATTR_RO(end);
-struct memmap_attribute memmap_type_attr  = __ATTR_RO(type);
+static struct memmap_attribute memmap_type_attr  = __ATTR_RO(type);
 /*
 * These are default attributes that are added for every memmap entry.
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index a812db243477..6ba57e91d7ab 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -2705,7 +2705,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                        sizeof(struct ietf_mpa_frame));
-        /* notify OF layer that accept event was successfull */
+        /* notify OF layer that accept event was successful */
        cm_id->add_ref(cm_id);
        cm_event.event = IW_CM_EVENT_ESTABLISHED;
diff --git a/drivers/isdn/hardware/eicon/debuglib.h b/drivers/isdn/hardware/eicon/debuglib.h
index 016410cf2273..8ea587783e14 100644
--- a/drivers/isdn/hardware/eicon/debuglib.h
+++ b/drivers/isdn/hardware/eicon/debuglib.h
@@ -235,7 +235,7 @@ typedef void ( * DbgOld) (unsigned short, char *, va_list) ;
 typedef void ( * DbgEv)  (unsigned short, unsigned long, va_list) ;
 typedef void ( * DbgIrq) (unsigned short, int, char *, va_list) ;
 typedef struct _DbgHandle_
-{ char    Registered ; /* driver successfull registered */
+{ char    Registered ; /* driver successfully registered */
 #define DBG_HANDLE_REG_NEW 0x01  /* this (new) structure    */
 #define DBG_HANDLE_REG_OLD 0x7f  /* old structure (see below)  */
 char    Version;  /* version of this structure  */
diff --git a/drivers/isdn/hardware/eicon/os_4bri.c b/drivers/isdn/hardware/eicon/os_4bri.c
index 7b4ec3f60dbf..c964b8d91ada 100644
--- a/drivers/isdn/hardware/eicon/os_4bri.c
+++ b/drivers/isdn/hardware/eicon/os_4bri.c
@@ -997,7 +997,7 @@ diva_4bri_start_adapter(PISDN_ADAPTER IoAdapter,
        diva_xdi_display_adapter_features(IoAdapter->ANum);
        for (i = 0; i < IoAdapter->tasks; i++) {
-                DBG_LOG(("A(%d) %s adapter successfull started",
+                DBG_LOG(("A(%d) %s adapter successfully started",
                         IoAdapter->QuadroList->QuadroAdapter[i]->ANum,
                         (IoAdapter->tasks == 1) ? "BRI 2.0" : "4BRI"))
                diva_xdi_didd_register_adapter(IoAdapter->QuadroList->QuadroAdapter[i]->ANum);
diff --git a/drivers/isdn/hardware/eicon/os_bri.c b/drivers/isdn/hardware/eicon/os_bri.c
index f31bba5b16ff..08f01993f46b 100644
--- a/drivers/isdn/hardware/eicon/os_bri.c
+++ b/drivers/isdn/hardware/eicon/os_bri.c
@@ -736,7 +736,7 @@ diva_bri_start_adapter(PISDN_ADAPTER IoAdapter,
        IoAdapter->Properties.Features = (word) features;
        diva_xdi_display_adapter_features(IoAdapter->ANum);
-        DBG_LOG(("A(%d) BRI adapter successfull started", IoAdapter->ANum))
+        DBG_LOG(("A(%d) BRI adapter successfully started", IoAdapter->ANum))
            /*
               Register with DIDD
             */
diff --git a/drivers/isdn/hardware/eicon/os_pri.c b/drivers/isdn/hardware/eicon/os_pri.c
index 903356547b79..5d65405c75f4 100644
--- a/drivers/isdn/hardware/eicon/os_pri.c
+++ b/drivers/isdn/hardware/eicon/os_pri.c
@@ -513,7 +513,7 @@ diva_pri_start_adapter(PISDN_ADAPTER IoAdapter,
        diva_xdi_display_adapter_features(IoAdapter->ANum);
-        DBG_LOG(("A(%d) PRI adapter successfull started", IoAdapter->ANum))
+        DBG_LOG(("A(%d) PRI adapter successfully started", IoAdapter->ANum))
        /*
           Register with DIDD
         */
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index fee7304102af..3949a1c73451 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -498,6 +498,18 @@ config SGI_GRU_DEBUG
        This option enables addition debugging code for the SGI GRU driver. If
        you are unsure, say N.
+config DELL_LAPTOP
+        tristate "Dell Laptop Extras (EXPERIMENTAL)"
+        depends on X86
+        depends on DCDBAS
+        depends on EXPERIMENTAL
+        depends on BACKLIGHT_CLASS_DEVICE
+        depends on RFKILL
+        default n
+        ---help---
+        This driver adds support for rfkill and backlight control to Dell
+        laptops.
 source "drivers/misc/c2port/Kconfig"
 endif # MISC_DEVICES
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 817f7f5ab3bd..5de863a0e395 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_ICS932S401)	+= ics932s401.o
 obj-$(CONFIG_TC1100_WMI)        += tc1100-wmi.o
 obj-$(CONFIG_LKDTM)             += lkdtm.o
 obj-$(CONFIG_TIFM_CORE)         += tifm_core.o
+obj-$(CONFIG_DELL_LAPTOP)       += dell-laptop.o
 obj-$(CONFIG_TIFM_7XX1)         += tifm_7xx1.o
 obj-$(CONFIG_PHANTOM)           += phantom.o
 obj-$(CONFIG_SGI_IOC4)          += ioc4.o
diff --git a/drivers/misc/dell-laptop.c b/drivers/misc/dell-laptop.c
new file mode 100644
index 000000000000..4d33a2068b7a
--- /dev/null
+++ b/drivers/misc/dell-laptop.c
@@ -0,0 +1,436 @@
+/*
+ *  Driver for Dell laptop extras
+ *
+ *  Copyright (c) Red Hat <mjg@redhat.com>
+ *
+ *  Based on documentation in the libsmbios package, Copyright (C) 2005 Dell
+ *  Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/backlight.h>
+#include <linux/err.h>
+#include <linux/dmi.h>
+#include <linux/io.h>
+#include <linux/rfkill.h>
+#include <linux/power_supply.h>
+#include <linux/acpi.h>
+#include "../firmware/dcdbas.h"
+#define BRIGHTNESS_TOKEN 0x7d
+/* This structure will be modified by the firmware when we enter
+ * system management mode, hence the volatiles.
+ *
+ * class/select: request identifiers, filled in by dell_send_request()
+ * input:        arguments set up by the caller before the request
+ * output:       results read back by the caller after the request
+ */
+struct calling_interface_buffer {
+        u16 class;
+        u16 select;
+        volatile u32 input[4];
+        volatile u32 output[4];
+} __packed;
+/* One token entry as copied out of a 0xda DMI table by parse_da_table().
+ * find_token_location() looks entries up by tokenID and returns location.
+ * NOTE(review): the value/stringlength union is not used in the visible
+ * code; its exact meaning should be confirmed against libsmbios. */
+struct calling_interface_token {
+        u16 tokenID;
+        u16 location;
+        union {
+                u16 value;
+                u16 stringlength;
+        };
+};
+/* Layout of the DMI type 0xda ("Calling interface") table as consumed by
+ * parse_da_table(): the generic DMI header, the Dell command I/O address
+ * and code used to raise the SMI, and a trailing token list whose final
+ * entry is a terminator. */
+struct calling_interface_structure {
+        struct dmi_header header;
+        u16 cmdIOAddress;
+        u8 cmdIOCode;
+        u32 supportedCmds;
+        struct calling_interface_token tokens[];
+} __packed;
+/* SMI command address/code taken from the 0xda table by parse_da_table() */
+static int da_command_address;
+static int da_command_code;
+/* Token array accumulated by parse_da_table() and its element count */
+static int da_num_tokens;
+static struct calling_interface_token *da_tokens;
+static struct backlight_device *dell_backlight_device;
+/* rfkill devices created by dell_setup_rfkill(); NULL when not present */
+static struct rfkill *wifi_rfkill;
+static struct rfkill *bluetooth_rfkill;
+static struct rfkill *wwan_rfkill;
+/*
+ * DMI match table: systems whose vendor string is "Dell Inc." and whose
+ * chassis type is "8".
+ *
+ * The table is const and only needed during init, so it is placed in
+ * __initconst; __initdata is meant for writable init data and mixing it
+ * with const objects can produce a section type conflict.
+ */
+static const struct dmi_system_id __initconst dell_device_table[] = {
+        {
+                .ident = "Dell laptop",
+                .matches = {
+                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                        DMI_MATCH(DMI_CHASSIS_TYPE, "8"),
+                },
+        },
+        { }
+};
+/*
+ * Parse one DMI type 0xda ("Calling interface") table.
+ *
+ * Records the SMI command address/code from the table and appends its
+ * tokens (minus the terminating entry) to the da_tokens array, growing
+ * the array as needed.  Tables too short to hold a single token are
+ * ignored.  If the array cannot be grown, the tokens collected so far
+ * are kept intact and this table's tokens are dropped.
+ */
+static void parse_da_table(const struct dmi_header *dm)
+{
+        struct calling_interface_structure *table =
+                container_of(dm, struct calling_interface_structure, header);
+        struct calling_interface_token *new_tokens;
+        int tokens;
+        /* 4 bytes of table header, plus 7 bytes of Dell header, plus at least
+           6 bytes of entry */
+        if (dm->length < 17)
+                return;
+        /* Final token is a terminator, so we don't want to copy it */
+        tokens = (dm->length-11)/sizeof(struct calling_interface_token)-1;
+        da_command_address = table->cmdIOAddress;
+        da_command_code = table->cmdIOCode;
+        /* Grow via a temporary: on failure krealloc() leaves the old block
+           allocated, so da_tokens must not be overwritten with NULL while
+           da_num_tokens still counts its entries */
+        new_tokens = krealloc(da_tokens, (da_num_tokens + tokens) *
+                              sizeof(struct calling_interface_token),
+                              GFP_KERNEL);
+        if (!new_tokens)
+                return;
+        da_tokens = new_tokens;
+        memcpy(da_tokens+da_num_tokens, table->tokens,
+               sizeof(struct calling_interface_token) * tokens);
+        da_num_tokens += tokens;
+}
+/*
+ * Dispatch on the DMI table type and hand "Calling interface" tables to
+ * parse_da_table().
+ * NOTE(review): presumably used as a DMI table walk callback - confirm
+ * at the call site.
+ */
+static void find_tokens(const struct dmi_header *dm)
+{
+        /*
+         * The other Dell-specific table types are recognised but need no
+         * handling here:
+         *   0xd4 - Indexed IO
+         *   0xd5 - Protected Area Type 1
+         *   0xd6 - Protected Area Type 2
+         */
+        if (dm->type == 0xda) /* Calling interface */
+                parse_da_table(dm);
+}
+/*
+ * Look up a token by id in the table built by parse_da_table().
+ * Returns the location of the first matching token, or -1 if no token
+ * with that id is known.
+ */
+static int find_token_location(int tokenid)
+{
+        int idx = 0;
+        while (idx < da_num_tokens) {
+                if (da_tokens[idx].tokenID == tokenid)
+                        return da_tokens[idx].location;
+                idx++;
+        }
+        return -1;
+}
+/*
+ * Issue a calling-interface request to the firmware through dcdbas.
+ *
+ * Fills in the class/select fields of @buffer, points an SMI command at
+ * the buffer's physical address and raises the SMI using the command
+ * address/code recorded by parse_da_table().  The caller sets up
+ * buffer->input beforehand and reads buffer->output afterwards.
+ *
+ * Returns @buffer.  The return value of dcdbas_smi_request() is not
+ * checked, so a failed request is not reported to the caller.
+ */
+static struct calling_interface_buffer *
+dell_send_request(struct calling_interface_buffer *buffer, int class,
+                  int select)
+{
+        struct smi_cmd command;
+        command.magic = SMI_CMD_MAGIC;
+        command.command_address = da_command_address;
+        command.command_code = da_command_code;
+        /* NOTE(review): ebx appears to be a 32-bit register field - confirm
+           the buffer's physical address always fits */
+        command.ebx = virt_to_phys(buffer);
+        /* 0x42534931 spells "BSI1" in ASCII; presumably a signature the
+           firmware expects - confirm against libsmbios */
+        command.ecx = 0x42534931;
+        buffer->class = class;
+        buffer->select = select;
+        dcdbas_smi_request(&command);
+        return buffer;
+}
+/* Derived from information in DellWirelessCtl.cpp:
+   Class 17, select 11 is radio control. It returns an array of 32-bit values.
+   result[0]: return code
+   result[1]:
+     Bit 0:      Hardware switch supported
+     Bit 1:      Wifi locator supported
+     Bit 2:      Wifi is supported
+     Bit 3:      Bluetooth is supported
+     Bit 4:      WWAN is supported
+     Bit 5:      Wireless keyboard supported
+     Bits 6-7:   Reserved
+     Bit 8:      Wifi is installed
+     Bit 9:      Bluetooth is installed
+     Bit 10:     WWAN is installed
+     Bits 11-15: Reserved
+     Bit 16:     Hardware switch is on
+     Bit 17:     Wifi is blocked
+     Bit 18:     Bluetooth is blocked
+     Bit 19:     WWAN is blocked
+     Bits 20-31: Reserved
+   result[2]: NVRAM size in bytes
+   result[3]: NVRAM format version number
+*/
+/*
+ * Block or unblock one radio via the class 17 / select 11 request.
+ *
+ * input[0] is built as: bit 0 set, the radio id in bits 8 and up, and
+ * bit 16 set when the radio is to be disabled (any state other than
+ * RFKILL_STATE_UNBLOCKED).  Always returns 0.
+ */
+static int dell_rfkill_set(int radio, enum rfkill_state state)
+{
+        struct calling_interface_buffer request;
+        u32 arg = 1 | (radio << 8);
+        if (state != RFKILL_STATE_UNBLOCKED)
+                arg |= 1 << 16;
+        memset(&request, 0, sizeof(request));
+        request.input[0] = arg;
+        dell_send_request(&request, 17, 11);
+        return 0;
+}
+/* rfkill toggle_radio callback for wifi (radio id 1); @data is unused */
+static int dell_wifi_set(void *data, enum rfkill_state state)
+{
+        return dell_rfkill_set(1, state);
+}
+/* rfkill toggle_radio callback for bluetooth (radio id 2); @data is unused */
+static int dell_bluetooth_set(void *data, enum rfkill_state state)
+{
+        return dell_rfkill_set(2, state);
+}
+/* rfkill toggle_radio callback for WWAN (radio id 3); @data is unused */
+static int dell_wwan_set(void *data, enum rfkill_state state)
+{
+        return dell_rfkill_set(3, state);
+}
+/*
+ * Query the radio status word (class 17 / select 11, output[1]) and
+ * translate the given "blocked" bit into an rfkill state:
+ *   bit clear                  -> RFKILL_STATE_UNBLOCKED
+ *   bit set, status bit 16 set -> RFKILL_STATE_SOFT_BLOCKED
+ *   bit set, status bit 16 clr -> RFKILL_STATE_HARD_BLOCKED
+ * Always returns 0.
+ */
+static int dell_rfkill_get(int bit, enum rfkill_state *state)
+{
+        struct calling_interface_buffer reply;
+        int flags;
+        memset(&reply, 0, sizeof(reply));
+        dell_send_request(&reply, 17, 11);
+        flags = reply.output[1];
+        if (!(flags & (1<<bit)))
+                *state = RFKILL_STATE_UNBLOCKED;
+        else if (flags & (1<<16))
+                *state = RFKILL_STATE_SOFT_BLOCKED;
+        else
+                *state = RFKILL_STATE_HARD_BLOCKED;
+        return 0;
+}
+/* rfkill get_state callback for wifi (status bit 17); @data is unused */
+static int dell_wifi_get(void *data, enum rfkill_state *state)
+{
+        return dell_rfkill_get(17, state);
+}
+/* rfkill get_state callback for bluetooth (status bit 18); @data is unused */
+static int dell_bluetooth_get(void *data, enum rfkill_state *state)
+{
+        return dell_rfkill_get(18, state);
+}
+/* rfkill get_state callback for WWAN (status bit 19); @data is unused */
+static int dell_wwan_get(void *data, enum rfkill_state *state)
+{
+        return dell_rfkill_get(19, state);
+}
+/*
+ * Create and register an rfkill device for each radio that the firmware
+ * reports as both supported and installed (wifi: status bits 2 and 8,
+ * bluetooth: bits 3 and 9, WWAN: bits 4 and 10).
+ *
+ * Returns 0 on success, -ENOMEM if an rfkill device cannot be allocated,
+ * or the error from rfkill_register().  On failure every device set up
+ * so far is unregistered or freed and its file-scope pointer is reset to
+ * NULL, so that later cleanup keyed on those pointers cannot touch freed
+ * memory.
+ */
+static int dell_setup_rfkill(void)
+{
+        struct calling_interface_buffer buffer;
+        int status;
+        int ret;
+        memset(&buffer, 0, sizeof(struct calling_interface_buffer));
+        dell_send_request(&buffer, 17, 11);
+        status = buffer.output[1];
+        if ((status & (1<<2|1<<8)) == (1<<2|1<<8)) {
+                wifi_rfkill = rfkill_allocate(NULL, RFKILL_TYPE_WLAN);
+                if (!wifi_rfkill) {
+                        ret = -ENOMEM;
+                        goto err_wifi;
+                }
+                wifi_rfkill->name = "dell-wifi";
+                wifi_rfkill->toggle_radio = dell_wifi_set;
+                wifi_rfkill->get_state = dell_wifi_get;
+                ret = rfkill_register(wifi_rfkill);
+                if (ret)
+                        goto err_wifi;
+        }
+        if ((status & (1<<3|1<<9)) == (1<<3|1<<9)) {
+                bluetooth_rfkill = rfkill_allocate(NULL, RFKILL_TYPE_BLUETOOTH);
+                if (!bluetooth_rfkill) {
+                        ret = -ENOMEM;
+                        goto err_bluetooth;
+                }
+                bluetooth_rfkill->name = "dell-bluetooth";
+                bluetooth_rfkill->toggle_radio = dell_bluetooth_set;
+                bluetooth_rfkill->get_state = dell_bluetooth_get;
+                ret = rfkill_register(bluetooth_rfkill);
+                if (ret)
+                        goto err_bluetooth;
+        }
+        if ((status & (1<<4|1<<10)) == (1<<4|1<<10)) {
+                wwan_rfkill = rfkill_allocate(NULL, RFKILL_TYPE_WWAN);
+                if (!wwan_rfkill) {
+                        ret = -ENOMEM;
+                        goto err_wwan;
+                }
+                wwan_rfkill->name = "dell-wwan";
+                wwan_rfkill->toggle_radio = dell_wwan_set;
+                wwan_rfkill->get_state = dell_wwan_get;
+                ret = rfkill_register(wwan_rfkill);
+                if (ret)
+                        goto err_wwan;
+        }
+        return 0;
+        /* Each label frees the device that failed at its own stage (it was
+           never successfully registered) and unregisters the one set up by
+           the previous stage before falling through. */
+err_wwan:
+        if (wwan_rfkill) {
+                rfkill_free(wwan_rfkill);
+                wwan_rfkill = NULL;
+        }
+        if (bluetooth_rfkill) {
+                rfkill_unregister(bluetooth_rfkill);
+                bluetooth_rfkill = NULL;
+        }
+err_bluetooth:
+        if (bluetooth_rfkill) {
+                rfkill_free(bluetooth_rfkill);
+                bluetooth_rfkill = NULL;
+        }
+        if (wifi_rfkill) {
+                rfkill_unregister(wifi_rfkill);
+                wifi_rfkill = NULL;
+        }
+err_wifi:
+        if (wifi_rfkill) {
+                rfkill_free(wifi_rfkill);
+                wifi_rfkill = NULL;
+        }
+        return ret;
+}
+/*
+ * update_status callback of the backlight device: hands
+ * bd->props.brightness to the firmware via a class 1 request.
+ * Returns -ENODEV when no location is found for BRIGHTNESS_TOKEN,
+ * 0 otherwise.
+ */
+static int dell_send_intensity(struct backlight_device *bd)
+{
+        struct calling_interface_buffer buffer;
+        int select;
+        memset(&buffer, 0, sizeof(buffer));
+        buffer.input[0] = find_token_location(BRIGHTNESS_TOKEN);
+        if (buffer.input[0] == -1)
+                return -ENODEV;
+        buffer.input[1] = bd->props.brightness;
+        /* 2 when power_supply_is_system_supplied() reports > 0, else 1 */
+        select = (power_supply_is_system_supplied() > 0) ? 2 : 1;
+        dell_send_request(&buffer, 1, select);
+        return 0;
+}
+/*
+ * get_brightness callback of the backlight device: issues a class 0
+ * request for BRIGHTNESS_TOKEN and returns output[1] of the reply.
+ * Returns -ENODEV when no location is found for the token.
+ */
+static int dell_get_intensity(struct backlight_device *bd)
+{
+        struct calling_interface_buffer buffer;
+        int select;
+        memset(&buffer, 0, sizeof(buffer));
+        buffer.input[0] = find_token_location(BRIGHTNESS_TOKEN);
+        if (buffer.input[0] == -1)
+                return -ENODEV;
+        /* 2 when power_supply_is_system_supplied() reports > 0, else 1 */
+        select = (power_supply_is_system_supplied() > 0) ? 2 : 1;
+        dell_send_request(&buffer, 0, select);
+        return buffer.output[1];
+}
+/* Callbacks passed to backlight_device_register() for "dell_backlight". */
+static struct backlight_ops dell_ops = {
+        .update_status  = dell_send_intensity,
+        .get_brightness = dell_get_intensity,
+};
+/*
+ * Module entry point. Bails out with -ENODEV unless the machine matches
+ * dell_device_table and the DMI walk produced da_tokens. It then sets
+ * up the rfkill switches and, unless ACPI video backlight support is
+ * reported, registers "dell_backlight" when the firmware returns a
+ * non-zero maximum brightness (output[3] of the BRIGHTNESS_TOKEN
+ * query). On failure the registered rfkill switches are unregistered
+ * and da_tokens is freed.
+ */
+static int __init dell_init(void)
+{
+        struct calling_interface_buffer buffer;
+        int max_level = 0;
+        int ret;
+        if (!dmi_check_system(dell_device_table))
+                return -ENODEV;
+        dmi_walk(find_tokens);
+        if (!da_tokens)  {
+                printk(KERN_INFO "dell-laptop: Unable to find dmi tokens\n");
+                return -ENODEV;
+        }
+        ret = dell_setup_rfkill();
+        if (ret) {
+                printk(KERN_WARNING "dell-laptop: Unable to setup rfkill\n");
+                goto undo;
+        }
+#ifdef CONFIG_ACPI
+        /* In the event of an ACPI backlight being available, don't
+         * register the platform controller.
+         */
+        if (acpi_video_backlight_support())
+                return 0;
+#endif
+        memset(&buffer, 0, sizeof(buffer));
+        buffer.input[0] = find_token_location(BRIGHTNESS_TOKEN);
+        if (buffer.input[0] != -1) {
+                dell_send_request(&buffer, 0, 2);
+                max_level = buffer.output[3];
+        }
+        /* No usable brightness range: keep the rfkill switches only. */
+        if (!max_level)
+                return 0;
+        dell_backlight_device = backlight_device_register("dell_backlight",
+                                                          NULL, NULL,
+                                                          &dell_ops);
+        if (IS_ERR(dell_backlight_device)) {
+                ret = PTR_ERR(dell_backlight_device);
+                dell_backlight_device = NULL;
+                goto undo;
+        }
+        dell_backlight_device->props.max_brightness = max_level;
+        dell_backlight_device->props.brightness =
+                dell_get_intensity(dell_backlight_device);
+        backlight_update_status(dell_backlight_device);
+        return 0;
+undo:
+        if (wifi_rfkill)
+                rfkill_unregister(wifi_rfkill);
+        if (bluetooth_rfkill)
+                rfkill_unregister(bluetooth_rfkill);
+        if (wwan_rfkill)
+                rfkill_unregister(wwan_rfkill);
+        kfree(da_tokens);
+        return ret;
+}
+/*
+ * Module exit: unregisters the backlight device and every rfkill switch
+ * that was set up, then releases the token table gathered by
+ * dell_init() (previously leaked on unload).
+ */
+static void __exit dell_exit(void)
+{
+        backlight_device_unregister(dell_backlight_device);
+        if (wifi_rfkill)
+                rfkill_unregister(wifi_rfkill);
+        if (bluetooth_rfkill)
+                rfkill_unregister(bluetooth_rfkill);
+        if (wwan_rfkill)
+                rfkill_unregister(wwan_rfkill);
+        /* dell_init() frees da_tokens only on its error path */
+        kfree(da_tokens);
+}
+module_init(dell_init);
+module_exit(dell_exit);
+MODULE_AUTHOR("Matthew Garrett <mjg@redhat.com>");
+MODULE_DESCRIPTION("Dell laptop driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("dmi:*svnDellInc.:*:ct8:*");
diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c
index 5d9bcf109c13..4abbe573fa40 100644
--- a/drivers/mtd/ubi/kapi.c
+++ b/drivers/mtd/ubi/kapi.c
@@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(ubi_leb_unmap);
 * @dtype: expected data type
 *
 * This function maps an un-mapped logical eraseblock @lnum to a physical
- * eraseblock. This means, that after a successfull invocation of this
+ * eraseblock. This means, that after a successful invocation of this
 * function the logical eraseblock @lnum will be empty (contain only %0xFF
 * bytes) and be mapped to a physical eraseblock, even if an unclean reboot
 * happens.
diff --git a/drivers/net/wireless/ath5k/dma.c b/drivers/net/wireless/ath5k/dma.c
index 7e2b1a67e5da..b65b4feb2d28 100644
--- a/drivers/net/wireless/ath5k/dma.c
+++ b/drivers/net/wireless/ath5k/dma.c
@@ -594,7 +594,7 @@ int ath5k_hw_get_isr(struct ath5k_hw *ah, enum ath5k_int *interrupt_mask)
                 * XXX: BMISS interrupts may occur after association.
                 * I found this on 5210 code but it needs testing. If this is
                 * true we should disable them before assoc and re-enable them
-                 * after a successfull assoc + some jiffies.
+                 * after a successful assoc + some jiffies.
                        interrupt_mask &= ~AR5K_INT_BMISS;
                 */
        }
diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index 9caa96a13586..a611ad857983 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -287,7 +287,7 @@ static void zd_op_stop(struct ieee80211_hw *hw)
 * @skb - a sk-buffer
 * @flags: extra flags to set in the TX status info
 * @ackssi: ACK signal strength
- * @success - True for successfull transmission of the frame
+ * @success - True for successful transmission of the frame
 *
 * This information calls ieee80211_tx_status_irqsafe() if required by the
 * control information. It copies the control information into the status
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index 162330b9d1dc..7e5155e88ac7 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -86,13 +86,11 @@ enum ds_type {
 struct ds1307 {
-        u8                      reg_addr;
        u8                      regs[11];
        enum ds_type            type;
        unsigned long           flags;
 #define HAS_NVRAM       0               /* bit 0 == sysfs file active */
 #define HAS_ALARM       1               /* bit 1 == irq claimed */
-        struct i2c_msg          msg[2];
        struct i2c_client       *client;
        struct rtc_device       *rtc;
        struct work_struct      work;
@@ -204,13 +202,9 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t)
        int             tmp;
        /* read the RTC date and time registers all at once */
-        ds1307->reg_addr = 0;
+        tmp = i2c_smbus_read_i2c_block_data(ds1307->client,
-        ds1307->msg[1].flags = I2C_M_RD;
+                DS1307_REG_SECS, 7, ds1307->regs);
-        ds1307->msg[1].len = 7;
+        if (tmp != 7) {
-        tmp = i2c_transfer(to_i2c_adapter(ds1307->client->dev.parent),
-                        ds1307->msg, 2);
-        if (tmp != 2) {
                dev_err(dev, "%s error %d\n", "read", tmp);
                return -EIO;
        }
@@ -257,7 +251,6 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
                t->tm_hour, t->tm_mday,
                t->tm_mon, t->tm_year, t->tm_wday);
-        *buf++ = 0;             /* first register addr */
        buf[DS1307_REG_SECS] = bin2bcd(t->tm_sec);
        buf[DS1307_REG_MIN] = bin2bcd(t->tm_min);
        buf[DS1307_REG_HOUR] = bin2bcd(t->tm_hour);
@@ -282,23 +275,19 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
                break;
        }
-        ds1307->msg[1].flags = 0;
-        ds1307->msg[1].len = 8;
        dev_dbg(dev, "%s: %02x %02x %02x %02x %02x %02x %02x\n",
                "write", buf[0], buf[1], buf[2], buf[3],
                buf[4], buf[5], buf[6]);
-        result = i2c_transfer(to_i2c_adapter(ds1307->client->dev.parent),
+        result = i2c_smbus_write_i2c_block_data(ds1307->client, 0, 7, buf);
-                        &ds1307->msg[1], 1);
+        if (result < 0) {
-        if (result != 1) {
+                dev_err(dev, "%s error %d\n", "write", result);
-                dev_err(dev, "%s error %d\n", "write", tmp);
+                return result;
-                return -EIO;
        }
        return 0;
 }
-static int ds1307_read_alarm(struct device *dev, struct rtc_wkalrm *t)
+static int ds1337_read_alarm(struct device *dev, struct rtc_wkalrm *t)
 {
        struct i2c_client       *client = to_i2c_client(dev);
        struct ds1307           *ds1307 = i2c_get_clientdata(client);
@@ -308,13 +297,9 @@ static int ds1307_read_alarm(struct device *dev, struct rtc_wkalrm *t)
                return -EINVAL;
        /* read all ALARM1, ALARM2, and status registers at once */
-        ds1307->reg_addr = DS1339_REG_ALARM1_SECS;
+        ret = i2c_smbus_read_i2c_block_data(client,
-        ds1307->msg[1].flags = I2C_M_RD;
+                        DS1339_REG_ALARM1_SECS, 9, ds1307->regs);
-        ds1307->msg[1].len = 9;
+        if (ret != 9) {
-        ret = i2c_transfer(to_i2c_adapter(client->dev.parent),
-                        ds1307->msg, 2);
-        if (ret != 2) {
                dev_err(dev, "%s error %d\n", "alarm read", ret);
                return -EIO;
        }
@@ -353,7 +338,7 @@ static int ds1307_read_alarm(struct device *dev, struct rtc_wkalrm *t)
        return 0;
 }
-static int ds1307_set_alarm(struct device *dev, struct rtc_wkalrm *t)
+static int ds1337_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 {
        struct i2c_client       *client = to_i2c_client(dev);
        struct ds1307           *ds1307 = i2c_get_clientdata(client);
@@ -371,13 +356,9 @@ static int ds1307_set_alarm(struct device *dev, struct rtc_wkalrm *t)
                t->enabled, t->pending);
        /* read current status of both alarms and the chip */
-        ds1307->reg_addr = DS1339_REG_ALARM1_SECS;
+        ret = i2c_smbus_read_i2c_block_data(client,
-        ds1307->msg[1].flags = I2C_M_RD;
+                        DS1339_REG_ALARM1_SECS, 9, buf);
-        ds1307->msg[1].len = 9;
+        if (ret != 9) {
-        ret = i2c_transfer(to_i2c_adapter(client->dev.parent),
-                        ds1307->msg, 2);
-        if (ret != 2) {
                dev_err(dev, "%s error %d\n", "alarm write", ret);
                return -EIO;
        }
@@ -392,7 +373,6 @@ static int ds1307_set_alarm(struct device *dev, struct rtc_wkalrm *t)
                        ds1307->regs[6], control, status);
        /* set ALARM1, using 24 hour and day-of-month modes */
-        *buf++ = DS1339_REG_ALARM1_SECS;        /* first register addr */
        buf[0] = bin2bcd(t->time.tm_sec);
        buf[1] = bin2bcd(t->time.tm_min);
        buf[2] = bin2bcd(t->time.tm_hour);
@@ -411,14 +391,11 @@ static int ds1307_set_alarm(struct device *dev, struct rtc_wkalrm *t)
        }
        buf[8] = status & ~(DS1337_BIT_A1I | DS1337_BIT_A2I);
-        ds1307->msg[1].flags = 0;
+        ret = i2c_smbus_write_i2c_block_data(client,
-        ds1307->msg[1].len = 10;
+                        DS1339_REG_ALARM1_SECS, 9, buf);
+        if (ret < 0) {
-        ret = i2c_transfer(to_i2c_adapter(client->dev.parent),
-                        &ds1307->msg[1], 1);
-        if (ret != 1) {
                dev_err(dev, "can't set alarm time\n");
-                return -EIO;
+                return ret;
        }
        return 0;
@@ -475,8 +452,8 @@ static int ds1307_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 static const struct rtc_class_ops ds13xx_rtc_ops = {
        .read_time      = ds1307_get_time,
        .set_time       = ds1307_set_time,
-        .read_alarm     = ds1307_read_alarm,
+        .read_alarm     = ds1337_read_alarm,
-        .set_alarm      = ds1307_set_alarm,
+        .set_alarm      = ds1337_set_alarm,
        .ioctl          = ds1307_ioctl,
 };
@@ -490,7 +467,6 @@ ds1307_nvram_read(struct kobject *kobj, struct bin_attribute *attr,
 {
        struct i2c_client       *client;
        struct ds1307           *ds1307;
-        struct i2c_msg          msg[2];
        int                     result;
        client = kobj_to_i2c_client(kobj);
@@ -503,24 +479,10 @@ ds1307_nvram_read(struct kobject *kobj, struct bin_attribute *attr,
        if (unlikely(!count))
                return count;
-        msg[0].addr = client->addr;
+        result = i2c_smbus_read_i2c_block_data(client, 8 + off, count, buf);
-        msg[0].flags = 0;
+        if (result < 0)
-        msg[0].len = 1;
-        msg[0].buf = buf;
-        buf[0] = 8 + off;
-        msg[1].addr = client->addr;
-        msg[1].flags = I2C_M_RD;
-        msg[1].len = count;
-        msg[1].buf = buf;
-        result = i2c_transfer(to_i2c_adapter(client->dev.parent), msg, 2);
-        if (result != 2) {
                dev_err(&client->dev, "%s error %d\n", "nvram read", result);
-                return -EIO;
+        return result;
-        }
-        return count;
 }
 static ssize_t
@@ -528,8 +490,7 @@ ds1307_nvram_write(struct kobject *kobj, struct bin_attribute *attr,
                char *buf, loff_t off, size_t count)
 {
        struct i2c_client       *client;
-        u8                      buffer[NVRAM_SIZE + 1];
+        int                     result;
-        int                     ret;
        client = kobj_to_i2c_client(kobj);
@@ -540,11 +501,12 @@ ds1307_nvram_write(struct kobject *kobj, struct bin_attribute *attr,
        if (unlikely(!count))
                return count;
-        buffer[0] = 8 + off;
+        result = i2c_smbus_write_i2c_block_data(client, 8 + off, count, buf);
-        memcpy(buffer + 1, buf, count);
+        if (result < 0) {
+                dev_err(&client->dev, "%s error %d\n", "nvram write", result);
-        ret = i2c_master_send(client, buffer, count + 1);
+                return result;
-        return (ret < 0) ? ret : (ret - 1);
+        }
+        return count;
 }
 static struct bin_attribute nvram = {
@@ -571,9 +533,11 @@ static int __devinit ds1307_probe(struct i2c_client *client,
        const struct chip_desc  *chip = &chips[id->driver_data];
        struct i2c_adapter      *adapter = to_i2c_adapter(client->dev.parent);
        int                     want_irq = false;
+        unsigned char           *buf;
        if (!i2c_check_functionality(adapter,
-                        I2C_FUNC_I2C | I2C_FUNC_SMBUS_WRITE_BYTE_DATA))
+                        I2C_FUNC_SMBUS_WRITE_BYTE_DATA |
+                        I2C_FUNC_SMBUS_I2C_BLOCK))
                return -EIO;
        if (!(ds1307 = kzalloc(sizeof(struct ds1307), GFP_KERNEL)))
@@ -581,18 +545,8 @@ static int __devinit ds1307_probe(struct i2c_client *client,
        ds1307->client = client;
        i2c_set_clientdata(client, ds1307);
-        ds1307->msg[0].addr = client->addr;
-        ds1307->msg[0].flags = 0;
-        ds1307->msg[0].len = 1;
-        ds1307->msg[0].buf = &ds1307->reg_addr;
-        ds1307->msg[1].addr = client->addr;
-        ds1307->msg[1].flags = I2C_M_RD;
-        ds1307->msg[1].len = sizeof(ds1307->regs);
-        ds1307->msg[1].buf = ds1307->regs;
        ds1307->type = id->driver_data;
+        buf = ds1307->regs;
        switch (ds1307->type) {
        case ds_1337:
@@ -602,21 +556,15 @@ static int __devinit ds1307_probe(struct i2c_client *client,
                        INIT_WORK(&ds1307->work, ds1307_work);
                        want_irq = true;
                }
-                ds1307->reg_addr = DS1337_REG_CONTROL;
-                ds1307->msg[1].len = 2;
                /* get registers that the "rtc" read below won't read... */
-                tmp = i2c_transfer(adapter, ds1307->msg, 2);
+                tmp = i2c_smbus_read_i2c_block_data(ds1307->client,
+                                DS1337_REG_CONTROL, 2, buf);
                if (tmp != 2) {
                        pr_debug("read error %d\n", tmp);
                        err = -EIO;
                        goto exit_free;
                }
-                ds1307->reg_addr = 0;
-                ds1307->msg[1].len = sizeof(ds1307->regs);
                /* oscillator off?  turn it on, so clock can tick. */
                if (ds1307->regs[0] & DS1337_BIT_nEOSC)
                        ds1307->regs[0] &= ~DS1337_BIT_nEOSC;
@@ -647,9 +595,8 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 read_rtc:
        /* read RTC registers */
+        tmp = i2c_smbus_read_i2c_block_data(ds1307->client, 0, 8, buf);
-        tmp = i2c_transfer(adapter, ds1307->msg, 2);
+        if (tmp != 8) {
-        if (tmp != 2) {
                pr_debug("read error %d\n", tmp);
                err = -EIO;
                goto exit_free;
@@ -707,22 +654,6 @@ read_rtc:
                break;
        }
-        tmp = ds1307->regs[DS1307_REG_SECS];
-        tmp = bcd2bin(tmp & 0x7f);
-        if (tmp > 60)
-                goto exit_bad;
-        tmp = bcd2bin(ds1307->regs[DS1307_REG_MIN] & 0x7f);
-        if (tmp > 60)
-                goto exit_bad;
-        tmp = bcd2bin(ds1307->regs[DS1307_REG_MDAY] & 0x3f);
-        if (tmp == 0 || tmp > 31)
-                goto exit_bad;
-        tmp = bcd2bin(ds1307->regs[DS1307_REG_MONTH] & 0x1f);
-        if (tmp == 0 || tmp > 12)
-                goto exit_bad;
        tmp = ds1307->regs[DS1307_REG_HOUR];
        switch (ds1307->type) {
        case ds_1340:
@@ -779,13 +710,6 @@ read_rtc:
        return 0;
-exit_bad:
-        dev_dbg(&client->dev, "%s: %02x %02x %02x %02x %02x %02x %02x\n",
-                        "bogus register",
-                        ds1307->regs[0], ds1307->regs[1],
-                        ds1307->regs[2], ds1307->regs[3],
-                        ds1307->regs[4], ds1307->regs[5],
-                        ds1307->regs[6]);
 exit_irq:
        if (ds1307->rtc)
                rtc_device_unregister(ds1307->rtc);
diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c
index b8f9c00633f3..d82aad5224f0 100644
--- a/drivers/s390/block/dasd_3990_erp.c
+++ b/drivers/s390/block/dasd_3990_erp.c
@@ -2621,7 +2621,7 @@ dasd_3990_erp_action(struct dasd_ccw_req * cqr)
                }
        }
-        /* double-check if current erp/cqr was successfull */
+        /* double-check if current erp/cqr was successful */
        if ((cqr->irb.scsw.cmd.cstat == 0x00) &&
            (cqr->irb.scsw.cmd.dstat ==
             (DEV_STAT_CHN_END | DEV_STAT_DEV_END))) {
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 05a14536c369..4a39084d9c95 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -199,7 +199,7 @@ struct dasd_ccw_req {
 #define DASD_CQR_ERROR          0x82    /* request is completed with error */
 #define DASD_CQR_CLEAR_PENDING  0x83    /* request is clear pending */
 #define DASD_CQR_CLEARED        0x84    /* request was cleared */
-#define DASD_CQR_SUCCESS        0x85    /* request was successfull */
+#define DASD_CQR_SUCCESS        0x85    /* request was successful */
 /* per dasd_ccw_req flags */
diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c
index 4005c44a404c..71605a179d65 100644
--- a/drivers/s390/char/tape_3590.c
+++ b/drivers/s390/char/tape_3590.c
@@ -801,7 +801,7 @@ tape_3590_done(struct tape_device *device, struct tape_request *request)
 static inline int
 tape_3590_erp_succeded(struct tape_device *device, struct tape_request *request)
 {
-        DBF_EVENT(3, "Error Recovery successfull for %s\n",
+        DBF_EVENT(3, "Error Recovery successful for %s\n",
                  tape_op_verbose[request->op]);
        return tape_3590_done(device, request);
 }
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 06b71823f399..659f8a791656 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -379,7 +379,7 @@ int cio_commit_config(struct subchannel *sch)
                if (ccode < 0) /* -EIO if msch gets a program check. */
                        return ccode;
                switch (ccode) {
-                case 0: /* successfull */
+                case 0: /* successful */
                        if (stsch(sch->schid, &schib) ||
                            !css_sch_is_valid(&schib))
                                return -ENODEV;
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 744f928a59ea..10cb0f8726e5 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -114,7 +114,7 @@ static inline int qdio_check_ccq(struct qdio_q *q, unsigned int ccq)
 * @count: count of buffers to examine
 * @auto_ack: automatically acknowledge buffers
 *
- * Returns the number of successfull extracted equal buffer states.
+ * Returns the number of successfully extracted equal buffer states.
 * Stops processing if a state is different from the last buffers state.
 */
 static int qdio_do_eqbs(struct qdio_q *q, unsigned char *state,
diff --git a/drivers/w1/masters/Kconfig b/drivers/w1/masters/Kconfig
index 90616822cd20..96d2f8e4c275 100644
--- a/drivers/w1/masters/Kconfig
+++ b/drivers/w1/masters/Kconfig
@@ -34,6 +34,12 @@ config W1_MASTER_DS2482
          This driver can also be built as a module.  If so, the module
          will be called ds2482.
+config W1_MASTER_MXC
+        tristate "Freescale MXC 1-wire busmaster"
+        depends on W1 && ARCH_MXC
+        help
+          Say Y here to enable MXC 1-wire host
 config W1_MASTER_DS1WM
        tristate "Maxim DS1WM 1-wire busmaster"
        depends on W1 && ARM && HAVE_CLK
diff --git a/drivers/w1/masters/Makefile b/drivers/w1/masters/Makefile
index bc4714a75f3a..c5a3e96fcbab 100644
--- a/drivers/w1/masters/Makefile
+++ b/drivers/w1/masters/Makefile
@@ -5,6 +5,8 @@
 obj-$(CONFIG_W1_MASTER_MATROX)          += matrox_w1.o
 obj-$(CONFIG_W1_MASTER_DS2490)          += ds2490.o
 obj-$(CONFIG_W1_MASTER_DS2482)          += ds2482.o
+obj-$(CONFIG_W1_MASTER_MXC)             += mxc_w1.o
 obj-$(CONFIG_W1_MASTER_DS1WM)           += ds1wm.o
 obj-$(CONFIG_W1_MASTER_GPIO)            += w1-gpio.o
 obj-$(CONFIG_HDQ_MASTER_OMAP)           += omap_hdq.o
diff --git a/drivers/w1/masters/mxc_w1.c b/drivers/w1/masters/mxc_w1.c
new file mode 100644
index 000000000000..b9d74d0b353e
--- /dev/null
+++ b/drivers/w1/masters/mxc_w1.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2005-2008 Freescale Semiconductor, Inc. All Rights Reserved.
+ * Copyright 2008 Luotao Fu, kernel@pengutronix.de
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include "../w1.h"
+#include "../w1_int.h"
+#include "../w1_log.h"
+/* According to the mx27 Datasheet the reset procedure should take up to about
+ * 1350us. We set the timeout to 500*100us = 50ms to be on the safe side. */
+#define MXC_W1_RESET_TIMEOUT 500
+/*
+ * MXC W1 Register offsets
+ */
+#define MXC_W1_CONTROL          0x00
+#define MXC_W1_TIME_DIVIDER     0x02
+#define MXC_W1_RESET            0x04
+#define MXC_W1_COMMAND          0x06
+#define MXC_W1_TXRX             0x08
+#define MXC_W1_INTERRUPT        0x0A
+#define MXC_W1_INTERRUPT_EN     0x0C
+/* Per-device state of the MXC 1-wire master, allocated in mxc_w1_probe(). */
+struct mxc_w1_device {
+        void __iomem *regs;             /* ioremap()ed register window */
+        unsigned int clkdiv;            /* value written to MXC_W1_TIME_DIVIDER: clock rate in MHz minus 1 */
+        struct clk *clk;                /* "owire_clk" obtained with clk_get() */
+        struct w1_bus_master bus_master;        /* callbacks handed to w1_add_master_device() */
+};
+/*
+ * Low level routine that resets the devices on the One Wire interface.
+ * It writes 0x80 to the control register and then polls that register
+ * every 100us until bit 7 reads back as 0 or more than
+ * MXC_W1_RESET_TIMEOUT polls have been counted.
+ * Returns the last sampled value of bit 7 (0 once the bit has cleared).
+ */
+static u8 mxc_w1_ds2_reset_bus(void *data)
+{
+        struct mxc_w1_device *mdev = data;
+        void __iomem *ctrl_addr = mdev->regs + MXC_W1_CONTROL;
+        unsigned int polls = 0;
+        u8 ctrl;
+        __raw_writeb(0x80, ctrl_addr);
+        for (;;) {
+                ctrl = __raw_readb(ctrl_addr);
+                if (!(ctrl & 0x80))
+                        break;
+                if (polls > MXC_W1_RESET_TIMEOUT)
+                        break;
+                polls++;
+                udelay(100);
+        }
+        return (ctrl & 0x80) ? 1 : 0;
+}
+/*
+ * Low level routine to read/write a single bit on the One Wire
+ * interface. A write 0 is issued when @bit is 0 (control bit 5),
+ * otherwise a write 1/read (control bit 4). The request bit is polled
+ * for up to 400 rounds of 1us until it reads back as 0; the datasheet
+ * states 120us at most. Returns bit 3 of the control register.
+ */
+static u8 mxc_w1_ds2_touch_bit(void *data, u8 bit)
+{
+        struct mxc_w1_device *mdev = data;
+        void __iomem *ctrl_addr = mdev->regs + MXC_W1_CONTROL;
+        const u8 req_mask = 1 << (5 - bit);
+        unsigned int budget;
+        __raw_writeb(req_mask, ctrl_addr);
+        for (budget = 400; budget; budget--) {
+                if (!(__raw_readb(ctrl_addr) & req_mask))
+                        break;
+                udelay(1);
+        }
+        return (__raw_readb(ctrl_addr) >> 3) & 0x1;
+}
+/*
+ * Bind the driver to an MXC 1-wire platform device: claim and map the
+ * register window, enable "owire_clk", program the time divider from
+ * the clock rate and register the bus master with the w1 core.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * acquired so far is released again.
+ *
+ * Not __init: the driver is registered with platform_driver_register(),
+ * so the core keeps a pointer to this function after init memory is
+ * discarded.
+ */
+static int __devinit mxc_w1_probe(struct platform_device *pdev)
+{
+        struct mxc_w1_device *mdev;
+        struct resource *res;
+        int err = 0;
+        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+        if (!res)
+                return -ENODEV;
+        mdev = kzalloc(sizeof(struct mxc_w1_device), GFP_KERNEL);
+        if (!mdev)
+                return -ENOMEM;
+        mdev->clk = clk_get(&pdev->dev, "owire_clk");
+        /* clk_get() reports failure as an ERR_PTR; NULL is still rejected */
+        if (!mdev->clk || IS_ERR(mdev->clk)) {
+                err = mdev->clk ? PTR_ERR(mdev->clk) : -ENODEV;
+                goto failed_clk;
+        }
+        mdev->clkdiv = (clk_get_rate(mdev->clk) / 1000000) - 1;
+        res = request_mem_region(res->start, resource_size(res),
+                                "mxc_w1");
+        if (!res) {
+                err = -EBUSY;
+                goto failed_req;
+        }
+        mdev->regs = ioremap(res->start, resource_size(res));
+        if (!mdev->regs) {
+                printk(KERN_ERR "Cannot map mxc_w1 registers\n");
+                /* without this the probe would report success */
+                err = -ENOMEM;
+                goto failed_ioremap;
+        }
+        clk_enable(mdev->clk);
+        __raw_writeb(mdev->clkdiv, mdev->regs + MXC_W1_TIME_DIVIDER);
+        mdev->bus_master.data = mdev;
+        mdev->bus_master.reset_bus = mxc_w1_ds2_reset_bus;
+        mdev->bus_master.touch_bit = mxc_w1_ds2_touch_bit;
+        err = w1_add_master_device(&mdev->bus_master);
+        if (err)
+                goto failed_add;
+        platform_set_drvdata(pdev, mdev);
+        return 0;
+failed_add:
+        /* balance the clk_enable() above */
+        clk_disable(mdev->clk);
+        iounmap(mdev->regs);
+failed_ioremap:
+        release_mem_region(res->start, resource_size(res));
+failed_req:
+        clk_put(mdev->clk);
+failed_clk:
+        kfree(mdev);
+        return err;
+}
+/*
+ * Disassociate the w1 device from the driver: unregister the bus
+ * master, undo the mapping, memory region and clock setup done by
+ * mxc_w1_probe(), and free the per-device state that probe allocated
+ * (it used to be leaked). Always returns 0.
+ */
+static int mxc_w1_remove(struct platform_device *pdev)
+{
+        struct mxc_w1_device *mdev = platform_get_drvdata(pdev);
+        struct resource *res;
+        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+        w1_remove_master_device(&mdev->bus_master);
+        iounmap(mdev->regs);
+        release_mem_region(res->start, resource_size(res));
+        clk_disable(mdev->clk);
+        clk_put(mdev->clk);
+        platform_set_drvdata(pdev, NULL);
+        /* nothing references mdev any more once drvdata is cleared */
+        kfree(mdev);
+        return 0;
+}
+/* Platform driver glue; devices are matched by the name "mxc_w1". */
+static struct platform_driver mxc_w1_driver = {
+        .probe = mxc_w1_probe,
+        .remove = mxc_w1_remove,
+        .driver = {
+                .name = "mxc_w1",
+        },
+};
+/* Module entry point: registers mxc_w1_driver and returns the result. */
+static int __init mxc_w1_init(void)
+{
+        int err = platform_driver_register(&mxc_w1_driver);
+        return err;
+}
+/* Module exit point: unregisters mxc_w1_driver. */
+static void mxc_w1_exit(void)
+{
+        struct platform_driver *drv = &mxc_w1_driver;
+        platform_driver_unregister(drv);
+}
+module_init(mxc_w1_init);
+module_exit(mxc_w1_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Freescale Semiconductors Inc");
+MODULE_DESCRIPTION("Driver for One-Wire on MXC");
diff --git a/drivers/w1/w1.h b/drivers/w1/w1.h
index 97304bd83ec9..d8a9709f3449 100644
--- a/drivers/w1/w1.h
+++ b/drivers/w1/w1.h
@@ -210,6 +210,7 @@ u8 w1_read_8(struct w1_master *);
 int w1_reset_bus(struct w1_master *);
 u8 w1_calc_crc8(u8 *, int);
 void w1_write_block(struct w1_master *, const u8 *, int);
+void w1_touch_block(struct w1_master *, u8 *, int);
 u8 w1_read_block(struct w1_master *, u8 *, int);
 int w1_reset_select_slave(struct w1_slave *sl);
 void w1_next_pullup(struct w1_master *, int);
diff --git a/drivers/w1/w1_io.c b/drivers/w1/w1_io.c
index 5139c25ca962..442bd8bbd4a5 100644
--- a/drivers/w1/w1_io.c
+++ b/drivers/w1/w1_io.c
@@ -238,7 +238,6 @@ EXPORT_SYMBOL_GPL(w1_read_8);
 * @param dev     the master device
 * @param buf     pointer to the data to write
 * @param len     the number of bytes to write
- * @return        the byte read
 */
 void w1_write_block(struct w1_master *dev, const u8 *buf, int len)
 {
@@ -256,6 +255,31 @@ void w1_write_block(struct w1_master *dev, const u8 *buf, int len)
 EXPORT_SYMBOL_GPL(w1_write_block);
 /**
+ * Touches a series of bytes: every bit of each byte is passed to
+ * w1_touch_bit() and the byte is replaced by the bits it returned.
+ *
+ * @param dev     the master device
+ * @param buf     pointer to the data to write; overwritten with the result
+ * @param len     the number of bytes to touch
+ */
+void w1_touch_block(struct w1_master *dev, u8 *buf, int len)
+{
+        int byte_idx;
+        for (byte_idx = 0; byte_idx < len; byte_idx++) {
+                const u8 out = buf[byte_idx];
+                u8 in = 0;
+                int bit_idx;
+                for (bit_idx = 0; bit_idx < 8; bit_idx++) {
+                        /* w1_pre_write() runs right before the last bit of a byte */
+                        if (bit_idx == 7)
+                                w1_pre_write(dev);
+                        in |= w1_touch_bit(dev, (out >> bit_idx) & 0x1) << bit_idx;
+                }
+                buf[byte_idx] = in;
+        }
+}
+EXPORT_SYMBOL_GPL(w1_touch_block);
+/**
 * Reads a series of bytes.
 *
 * @param dev     the master device
diff --git a/drivers/w1/w1_netlink.c b/drivers/w1/w1_netlink.c
index 65c5ebd0787e..fdf72851c574 100644
--- a/drivers/w1/w1_netlink.c
+++ b/drivers/w1/w1_netlink.c
@@ -47,21 +47,56 @@ void w1_netlink_send(struct w1_master *dev, struct w1_netlink_msg *msg)
        cn_netlink_send(m, 0, GFP_KERNEL);
 }
-static int w1_process_command_master(struct w1_master *dev, struct cn_msg *msg,
+/*
+ * Slave-found callback handed to w1_search_devices(): append the 64-bit id
+ * @rn to the reply being assembled in dev->priv (a cn_msg followed by a
+ * w1_netlink_msg and a w1_netlink_cmd header, then the id payload).
+ *
+ * When the payload area (dev->priv_size bytes) is exhausted, the message
+ * collected so far is sent with an incremented ack and the buffer is reset.
+ * The id that triggered the flush is then stored in the fresh buffer, so it
+ * is not lost.
+ */
+static void w1_send_slave(struct w1_master *dev, u64 rn)
-                struct w1_netlink_msg *hdr, struct w1_netlink_cmd *cmd)
+{
+        struct cn_msg *msg = dev->priv;
+        struct w1_netlink_msg *hdr = (struct w1_netlink_msg *)(msg + 1);
+        struct w1_netlink_cmd *cmd = (struct w1_netlink_cmd *)(hdr + 1);
+        u64 *data;
+        int avail;
+        avail = dev->priv_size - cmd->len;
+        if (avail <= 8) {
+                /* Buffer full: ship what has been collected and start over. */
+                msg->ack++;
+                cn_netlink_send(msg, 0, GFP_KERNEL);
+                msg->len = sizeof(struct w1_netlink_msg) + sizeof(struct w1_netlink_cmd);
+                hdr->len = sizeof(struct w1_netlink_cmd);
+                cmd->len = 0;
+                /* A payload area this small can never hold an id. */
+                if (dev->priv_size <= 8)
+                        return;
+        }
+        data = (void *)(cmd + 1) + cmd->len;
+        *data = rn;
+        cmd->len += 8;
+        hdr->len += 8;
+        msg->len += 8;
+}
+static int w1_process_search_command(struct w1_master *dev, struct cn_msg *msg,
+                unsigned int avail)
 {
-        dev_dbg(&dev->dev, "%s: %s: cmd=%02x, len=%u.\n",
+        struct w1_netlink_msg *hdr = (struct w1_netlink_msg *)(msg + 1);
-                __func__, dev->name, cmd->cmd, cmd->len);
+        struct w1_netlink_cmd *cmd = (struct w1_netlink_cmd *)(hdr + 1);
+        int search_type = (cmd->cmd == W1_CMD_ALARM_SEARCH)?W1_ALARM_SEARCH:W1_SEARCH;
-        if (cmd->cmd != W1_CMD_SEARCH && cmd->cmd != W1_CMD_ALARM_SEARCH)
+        dev->priv = msg;
-                return -EINVAL;
+        dev->priv_size = avail;
+        w1_search_devices(dev, search_type, w1_send_slave);
+        msg->ack = 0;
+        cn_netlink_send(msg, 0, GFP_KERNEL);
+        dev->priv = NULL;
+        dev->priv_size = 0;
-        w1_search_process(dev, (cmd->cmd == W1_CMD_ALARM_SEARCH)?W1_ALARM_SEARCH:W1_SEARCH);
        return 0;
 }
-static int w1_send_read_reply(struct w1_slave *sl, struct cn_msg *msg,
+static int w1_send_read_reply(struct cn_msg *msg, struct w1_netlink_msg *hdr,
-                struct w1_netlink_msg *hdr, struct w1_netlink_cmd *cmd)
+                struct w1_netlink_cmd *cmd)
 {
        void *data;
        struct w1_netlink_msg *h;
@@ -85,7 +120,8 @@ static int w1_send_read_reply(struct w1_slave *sl, struct cn_msg *msg,
        memcpy(c, cmd, sizeof(struct w1_netlink_cmd));
        cm->ack = msg->seq+1;
-        cm->len = sizeof(struct w1_netlink_msg) + sizeof(struct w1_netlink_cmd) + cmd->len;
+        cm->len = sizeof(struct w1_netlink_msg) +
+                sizeof(struct w1_netlink_cmd) + cmd->len;
        h->len = sizeof(struct w1_netlink_cmd) + cmd->len;
@@ -98,36 +134,178 @@ static int w1_send_read_reply(struct w1_slave *sl, struct cn_msg *msg,
        return err;
 }
-static int w1_process_command_slave(struct w1_slave *sl, struct cn_msg *msg,
+static int w1_process_command_io(struct w1_master *dev, struct cn_msg *msg,
                struct w1_netlink_msg *hdr, struct w1_netlink_cmd *cmd)
 {
        int err = 0;
-        dev_dbg(&sl->master->dev, "%s: %02x.%012llx.%02x: cmd=%02x, len=%u.\n",
+        switch (cmd->cmd) {
-                __func__, sl->reg_num.family, (unsigned long long)sl->reg_num.id, sl->reg_num.crc,
+        case W1_CMD_TOUCH:
-                cmd->cmd, cmd->len);
+                w1_touch_block(dev, cmd->data, cmd->len);
+                w1_send_read_reply(msg, hdr, cmd);
+                break;
+        case W1_CMD_READ:
+                w1_read_block(dev, cmd->data, cmd->len);
+                w1_send_read_reply(msg, hdr, cmd);
+                break;
+        case W1_CMD_WRITE:
+                w1_write_block(dev, cmd->data, cmd->len);
+                break;
+        default:
+                err = -EINVAL;
+                break;
+        }
+        return err;
+}
+static int w1_process_command_master(struct w1_master *dev, struct cn_msg *req_msg,
+                struct w1_netlink_msg *req_hdr, struct w1_netlink_cmd *req_cmd)
+{
+        int err = -EINVAL;
+        struct cn_msg *msg;
+        struct w1_netlink_msg *hdr;
+        struct w1_netlink_cmd *cmd;
+        msg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+        if (!msg)
+                return -ENOMEM;
+        msg->id = req_msg->id;
+        msg->seq = req_msg->seq;
+        msg->ack = 0;
+        msg->len = sizeof(struct w1_netlink_msg) + sizeof(struct w1_netlink_cmd);
+        hdr = (struct w1_netlink_msg *)(msg + 1);
+        cmd = (struct w1_netlink_cmd *)(hdr + 1);
+        hdr->type = W1_MASTER_CMD;
+        hdr->id = req_hdr->id;
+        hdr->len = sizeof(struct w1_netlink_cmd);
+        cmd->cmd = req_cmd->cmd;
+        cmd->len = 0;
        switch (cmd->cmd) {
-                case W1_CMD_READ:
+        case W1_CMD_SEARCH:
-                        w1_read_block(sl->master, cmd->data, cmd->len);
+        case W1_CMD_ALARM_SEARCH:
-                        w1_send_read_reply(sl, msg, hdr, cmd);
+                err = w1_process_search_command(dev, msg,
-                        break;
+                                PAGE_SIZE - msg->len - sizeof(struct cn_msg));
-                case W1_CMD_WRITE:
+                break;
-                        w1_write_block(sl->master, cmd->data, cmd->len);
+        case W1_CMD_READ:
-                        break;
+        case W1_CMD_WRITE:
-                case W1_CMD_SEARCH:
+        case W1_CMD_TOUCH:
-                case W1_CMD_ALARM_SEARCH:
+                err = w1_process_command_io(dev, req_msg, req_hdr, req_cmd);
-                        w1_search_process(sl->master,
+                break;
-                                        (cmd->cmd == W1_CMD_ALARM_SEARCH)?W1_ALARM_SEARCH:W1_SEARCH);
+        case W1_CMD_RESET:
-                        break;
+                err = w1_reset_bus(dev);
-                default:
+                break;
-                        err = -1;
+        default:
-                        break;
+                err = -EINVAL;
+                break;
        }
+        kfree(msg);
        return err;
 }
+/* Log a slave-addressed command, then run it as bus I/O on the slave's master. */
+static int w1_process_command_slave(struct w1_slave *sl, struct cn_msg *msg,
+                struct w1_netlink_msg *hdr, struct w1_netlink_cmd *cmd)
+{
+        struct w1_master *master = sl->master;
+        int ret;
+        dev_dbg(&master->dev, "%s: %02x.%012llx.%02x: cmd=%02x, len=%u.\n",
+                __func__, sl->reg_num.family,
+                (unsigned long long)sl->reg_num.id, sl->reg_num.crc,
+                cmd->cmd, cmd->len);
+        ret = w1_process_command_io(master, msg, hdr, cmd);
+        return ret;
+}
+/*
+ * Handle a request that addresses neither a master nor a slave.  The only
+ * supported type is W1_LIST_MASTERS: reply with the 32-bit id of every
+ * entry on w1_masters, split over several messages when one page is not
+ * enough.  Intermediate messages carry ack 1, 2, ...; the final one has
+ * ack 0.
+ *
+ * Returns 0 on success, -EPROTO for any other message type and -ENOMEM
+ * when the reply buffer cannot be allocated.
+ */
+static int w1_process_command_root(struct cn_msg *msg, struct w1_netlink_msg *mcmd)
+{
+        struct w1_master *m;
+        struct cn_msg *cn;
+        struct w1_netlink_msg *w;
+        u32 *id;
+        if (mcmd->type != W1_LIST_MASTERS) {
+                printk(KERN_NOTICE "%s: msg: %x.%x, wrong type: %u, len: %u.\n",
+                        __func__, msg->id.idx, msg->id.val, mcmd->type, mcmd->len);
+                return -EPROTO;
+        }
+        /*
+         * Zeroed allocation: the headers are sent to user space as-is,
+         * including fields that are never assigned below (for example the
+         * id union of struct w1_netlink_msg), so they must not carry stale
+         * kernel memory.
+         */
+        cn = kzalloc(PAGE_SIZE, GFP_KERNEL);
+        if (!cn)
+                return -ENOMEM;
+        cn->id.idx = CN_W1_IDX;
+        cn->id.val = CN_W1_VAL;
+        cn->seq = msg->seq;
+        cn->ack = 1;
+        cn->len = sizeof(struct w1_netlink_msg);
+        w = (struct w1_netlink_msg *)(cn + 1);
+        w->type = W1_LIST_MASTERS;
+        w->status = 0;
+        w->len = 0;
+        id = (u32 *)(w + 1);
+        mutex_lock(&w1_mlock);
+        list_for_each_entry(m, &w1_masters, w1_master_entry) {
+                /* No room for another id: flush this page and restart the payload. */
+                if (cn->len + sizeof(*id) > PAGE_SIZE - sizeof(struct cn_msg)) {
+                        cn_netlink_send(cn, 0, GFP_KERNEL);
+                        cn->ack++;
+                        cn->len = sizeof(struct w1_netlink_msg);
+                        w->len = 0;
+                        id = (u32 *)(w + 1);
+                }
+                *id = m->id;
+                w->len += sizeof(*id);
+                cn->len += sizeof(*id);
+                id++;
+        }
+        /* ack 0 marks the last (possibly only) message of the listing. */
+        cn->ack = 0;
+        cn_netlink_send(cn, 0, GFP_KERNEL);
+        mutex_unlock(&w1_mlock);
+        kfree(cn);
+        return 0;
+}
+/*
+ * Send a status reply for a request: the request's cn_msg and w1_netlink_msg
+ * headers are echoed back with an empty payload and status set to -error.
+ * When @rcmd is given, its header (with len 0) is echoed as well.
+ * Returns -ENOMEM on allocation failure, otherwise whatever
+ * cn_netlink_send() returns.
+ */
+static int w1_netlink_send_error(struct cn_msg *rcmsg, struct w1_netlink_msg *rmsg,
+                struct w1_netlink_cmd *rcmd, int error)
+{
+        struct cn_msg *reply;
+        struct w1_netlink_msg *reply_hdr;
+        struct w1_netlink_cmd *reply_cmd;
+        int ret;
+        reply = kzalloc(sizeof(*reply_hdr) + sizeof(*reply_cmd) + sizeof(*reply),
+                        GFP_KERNEL);
+        if (reply == NULL)
+                return -ENOMEM;
+        reply_hdr = (struct w1_netlink_msg *)(reply + 1);
+        reply_cmd = (struct w1_netlink_cmd *)(reply_hdr + 1);
+        /* Start from copies of the request headers, then strip the payload. */
+        memcpy(reply, rcmsg, sizeof(*reply));
+        memcpy(reply_hdr, rmsg, sizeof(*reply_hdr));
+        reply->len = sizeof(*reply_hdr);
+        reply_hdr->len = 0;
+        reply_hdr->status = (short)-error;
+        if (rcmd != NULL) {
+                memcpy(reply_cmd, rcmd, sizeof(*reply_cmd));
+                reply_cmd->len = 0;
+                reply_hdr->len += sizeof(*reply_cmd);
+                reply->len += sizeof(*reply_cmd);
+        }
+        ret = cn_netlink_send(reply, 0, GFP_KERNEL);
+        kfree(reply);
+        return ret;
+}
 static void w1_cn_callback(void *data)
 {
        struct cn_msg *msg = data;
@@ -144,6 +322,7 @@ static void w1_cn_callback(void *data)
                dev = NULL;
                sl = NULL;
+                cmd = NULL;
                memcpy(&id, m->id.id, sizeof(id));
 #if 0
@@ -155,15 +334,15 @@ static void w1_cn_callback(void *data)
                        break;
                }
-                if (!mlen)
-                        goto out_cont;
                if (m->type == W1_MASTER_CMD) {
                        dev = w1_search_master_id(m->id.mst.id);
                } else if (m->type == W1_SLAVE_CMD) {
                        sl = w1_search_slave(&id);
                        if (sl)
                                dev = sl->master;
+                } else {
+                        err = w1_process_command_root(msg, m);
+                        goto out_cont;
                }
                if (!dev) {
@@ -171,6 +350,10 @@ static void w1_cn_callback(void *data)
                        goto out_cont;
                }
+                err = 0;
+                if (!mlen)
+                        goto out_cont;
                mutex_lock(&dev->mutex);
                if (sl && w1_reset_select_slave(sl)) {
@@ -187,9 +370,12 @@ static void w1_cn_callback(void *data)
                        }
                        if (sl)
-                                w1_process_command_slave(sl, msg, m, cmd);
+                                err = w1_process_command_slave(sl, msg, m, cmd);
                        else
-                                w1_process_command_master(dev, msg, m, cmd);
+                                err = w1_process_command_master(dev, msg, m, cmd);
+                        w1_netlink_send_error(msg, m, cmd, err);
+                        err = 0;
                        cmd_data += cmd->len + sizeof(struct w1_netlink_cmd);
                        mlen -= cmd->len + sizeof(struct w1_netlink_cmd);
@@ -200,6 +386,8 @@ out_up:
                        atomic_dec(&sl->refcnt);
                mutex_unlock(&dev->mutex);
 out_cont:
+                if (!cmd || err)
+                        w1_netlink_send_error(msg, m, cmd, err);
                msg->len -= sizeof(struct w1_netlink_msg) + m->len;
                m = (struct w1_netlink_msg *)(((u8 *)m) + sizeof(struct w1_netlink_msg) + m->len);
@@ -209,11 +397,6 @@ out_cont:
                if (err == -ENODEV)
                        err = 0;
        }
-#if 0
-        if (err) {
-                printk("%s: malformed message. Dropping.\n", __func__);
-        }
-#endif
 }
 int w1_init_netlink(void)
diff --git a/drivers/w1/w1_netlink.h b/drivers/w1/w1_netlink.h
index 56122b9e9294..27e950f935b1 100644
--- a/drivers/w1/w1_netlink.h
+++ b/drivers/w1/w1_netlink.h
@@ -34,12 +34,13 @@ enum w1_netlink_message_types {
        W1_MASTER_REMOVE,
        W1_MASTER_CMD,
        W1_SLAVE_CMD,
+        W1_LIST_MASTERS,
 };
 struct w1_netlink_msg
 {
        __u8                            type;
-        __u8                            reserved;
+        __u8                            status;
        __u16                           len;
        union {
                __u8                    id[8];
@@ -51,10 +52,15 @@ struct w1_netlink_msg
        __u8                            data[0];
 };
-#define W1_CMD_READ             0x0
+enum w1_commands {
-#define W1_CMD_WRITE            0x1
+        W1_CMD_READ = 0,
-#define W1_CMD_SEARCH           0x2
+        W1_CMD_WRITE,
-#define W1_CMD_ALARM_SEARCH     0x3
+        W1_CMD_SEARCH,
+        W1_CMD_ALARM_SEARCH,
+        W1_CMD_TOUCH,
+        W1_CMD_RESET,
+        W1_CMD_MAX,
+};
 struct w1_netlink_cmd
 {
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 4b75a16de009..526187c8a12d 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -17,3 +17,27 @@ config XEN_SCRUB_PAGES
          is not accidentally visible to other domains.  Is it more
          secure, but slightly less efficient.
          If in doubt, say yes.
+config XENFS
+        tristate "Xen filesystem"
+        depends on XEN
+        default y
+        help
+          The xen filesystem provides a way for domains to share
+          information with each other and with the hypervisor.
+          For example, by reading and writing the "xenbus" file, guests
+          may pass arbitrary information to the initial domain.
+          If in doubt, say yes.
+config XEN_COMPAT_XENFS
+       bool "Create compatibility mount point /proc/xen"
+       depends on XENFS
+       default y
+       help
+         The old xenstore userspace tools expect to find "xenbus"
+         under /proc/xen, but "xenbus" is now found at the root of the
+         xenfs filesystem.  Selecting this causes the kernel to create
+         the compatibility mount point /proc/xen if it is running on
+         a xen platform.
+         If in doubt, say yes.
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index d2a8fdf0e191..ff8accc9e103 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,5 +1,7 @@
 obj-y   += grant-table.o features.o events.o manage.o
 obj-y   += xenbus/
 obj-$(CONFIG_HOTPLUG_CPU)       += cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)       += xencomm.o
 obj-$(CONFIG_XEN_BALLOON)       += balloon.o
+obj-$(CONFIG_XENFS)             += xenfs/
+\ No newline at end of file
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 9678b3e98c63..92a1ef80a288 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -136,7 +136,6 @@ EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
 /**
 * xenbus_switch_state
 * @dev: xenbus device
- * @xbt: transaction handle
 * @state: new state
 *
 * Advertise in the store a change of the given driver to the given new_state.
@@ -267,7 +266,7 @@ EXPORT_SYMBOL_GPL(xenbus_dev_error);
 * @fmt: error message format
 *
 * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
- * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly
+ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
 * closedown of this driver and its peer.
 */
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index b2a03184a246..773d1cf23283 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -40,6 +40,7 @@
 #include <linux/ctype.h>
 #include <linux/fcntl.h>
 #include <linux/mm.h>
+#include <linux/proc_fs.h>
 #include <linux/notifier.h>
 #include <linux/kthread.h>
 #include <linux/mutex.h>
@@ -55,7 +56,10 @@
 #include "xenbus_comms.h"
 #include "xenbus_probe.h"
 int xen_store_evtchn;
+EXPORT_SYMBOL(xen_store_evtchn);
 struct xenstore_domain_interface *xen_store_interface;
 static unsigned long xen_store_mfn;
@@ -166,6 +170,9 @@ static int read_backend_details(struct xenbus_device *xendev)
        return read_otherend_details(xendev, "backend-id", "backend");
 }
+/*
+ * Attribute list installed as .dev_attrs of the frontend bus.  It holds
+ * only the __ATTR_NULL terminator, i.e. no attributes are defined yet.
+ */
+static struct device_attribute xenbus_dev_attrs[] = {
+        __ATTR_NULL
+};
 /* Bus type for frontend drivers. */
 static struct xen_bus_type xenbus_frontend = {
@@ -174,12 +181,13 @@ static struct xen_bus_type xenbus_frontend = {
        .get_bus_id = frontend_bus_id,
        .probe = xenbus_probe_frontend,
        .bus = {
-                .name     = "xen",
+                .name      = "xen",
-                .match    = xenbus_match,
+                .match     = xenbus_match,
-                .uevent   = xenbus_uevent,
+                .uevent    = xenbus_uevent,
-                .probe    = xenbus_dev_probe,
+                .probe     = xenbus_dev_probe,
-                .remove   = xenbus_dev_remove,
+                .remove    = xenbus_dev_remove,
-                .shutdown = xenbus_dev_shutdown,
+                .shutdown  = xenbus_dev_shutdown,
+                .dev_attrs = xenbus_dev_attrs,
        },
 };
@@ -852,6 +860,14 @@ static int __init xenbus_probe_init(void)
        if (!xen_initial_domain())
                xenbus_probe(NULL);
+#ifdef CONFIG_XEN_COMPAT_XENFS
+        /*
+         * Create xenfs mountpoint in /proc for compatibility with
+         * utilities that expect to find "xenbus" under "/proc/xen".
+         */
+        proc_mkdir("xen", NULL);
+#endif
        return 0;
  out_unreg_back:
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 7f2f91c0e11d..e325eab4724d 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -184,6 +184,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
        return ret;
 }
+EXPORT_SYMBOL(xenbus_dev_request_and_reply);
 /* Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error. */
 static void *xs_talkv(struct xenbus_transaction t,
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
new file mode 100644
index 000000000000..25275c3bbdff
--- /dev/null
+++ b/drivers/xen/xenfs/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XENFS) += xenfs.o
+xenfs-objs = super.o xenbus.o
+\ No newline at end of file
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
new file mode 100644
index 000000000000..515741a8e6b8
--- /dev/null
+++ b/drivers/xen/xenfs/super.c
@@ -0,0 +1,64 @@
+/*
+ *  xenfs.c - a filesystem for passing info between a domain and
+ *  the hypervisor.
+ *
+ * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
+ *                              and /proc/xen compatibility mount point.
+ *                              Turned xenfs into a loadable module.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include "xenfs.h"
+#include <asm/xen/hypervisor.h>
+MODULE_DESCRIPTION("Xen filesystem");
+MODULE_LICENSE("GPL");
+/* Populate a xenfs superblock: the filesystem exposes a single file, "xenbus". */
+static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+        /*
+         * Slots 0 and 1 are left zero-initialised; the table is closed by
+         * an entry with an empty name.
+         */
+        static struct tree_descr xenfs_files[] = {
+                [2] = { "xenbus", &xenbus_file_ops, S_IRUSR | S_IWUSR },
+                [3] = { "" },
+        };
+        int err;
+        err = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+        return err;
+}
+/* get_sb hook: hand off to get_sb_single(), filling the superblock via xenfs_fill_super(). */
+static int xenfs_get_sb(struct file_system_type *fs_type,
+                        int flags, const char *dev_name,
+                        void *data, struct vfsmount *mnt)
+{
+        int err;
+        err = get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt);
+        return err;
+}
+/*
+ * Filesystem type registered by xenfs_init() under the name "xenfs";
+ * superblocks are obtained through xenfs_get_sb().
+ */
+static struct file_system_type xenfs_type = {
+        .owner =        THIS_MODULE,
+        .name =         "xenfs",
+        .get_sb =       xenfs_get_sb,
+        .kill_sb =      kill_litter_super,
+};
+/*
+ * Module init: register xenfs only when xen_pv_domain() is true.  In any
+ * other case the module still loads successfully, but only logs a notice.
+ */
+static int __init xenfs_init(void)
+{
+        if (!xen_pv_domain()) {
+                printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n");
+                return 0;
+        }
+        return register_filesystem(&xenfs_type);
+}
+/* Module exit: undo the registration done by xenfs_init(), under the same condition. */
+static void __exit xenfs_exit(void)
+{
+        if (!xen_pv_domain())
+                return;
+        unregister_filesystem(&xenfs_type);
+}
+module_init(xenfs_init);
+module_exit(xenfs_exit);
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
new file mode 100644
index 000000000000..875a4c59c594
--- /dev/null
+++ b/drivers/xen/xenfs/xenbus.c
@@ -0,0 +1,593 @@
+/*
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Changes:
+ * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
+ *                              and /proc/xen compatibility mount point.
+ *                              Turned xenfs into a loadable module.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+#include "xenfs.h"
+#include "../xenbus/xenbus_comms.h"
+#include <xen/xenbus.h>
+#include <asm/xen/hypervisor.h>
+/*
+ * An element of a list of outstanding transactions, for which we're
+ * still waiting a reply.
+ */
+struct xenbus_transaction_holder {
+        struct list_head list;                  /* link in xenbus_file_priv.transactions */
+        struct xenbus_transaction handle;       /* handle.id is parsed from the transaction-start reply */
+};
+/*
+ * A buffer of data on the queue.
+ */
+struct read_buffer {
+        struct list_head list;  /* link in a reply queue (e.g. xenbus_file_priv.read_buffers) */
+        unsigned int cons;      /* bytes of msg[] already copied out to the reader */
+        unsigned int len;       /* total number of bytes stored in msg[] */
+        char msg[];             /* payload, allocated together with the header */
+};
+/* State reached through filp->private_data by the xenbus file operations. */
+struct xenbus_file_priv {
+        /*
+         * msgbuffer_mutex is held while partial requests are built up
+         * and complete requests are acted on.  It therefore protects
+         * the "transactions" and "watches" lists, and the partial
+         * request length and buffer.
+         *
+         * reply_mutex protects the reply being built up to return to
+         * usermode.  It nests inside msgbuffer_mutex but may be held
+         * alone during a watch callback.
+         */
+        struct mutex msgbuffer_mutex;
+        /* In-progress transactions */
+        struct list_head transactions;
+        /* Active watches. */
+        struct list_head watches;
+        /* Partial request. */
+        unsigned int len;
+        /* The same storage viewed as a message header or as raw bytes. */
+        union {
+                struct xsd_sockmsg msg;
+                char buffer[PAGE_SIZE];
+        } u;
+        /* Response queue. */
+        struct mutex reply_mutex;
+        struct list_head read_buffers;
+        /* Readers sleep here until read_buffers is non-empty. */
+        wait_queue_head_t read_waitq;
+};
+/*
+ * Read out any raw xenbus messages queued up.
+ *
+ * Sleeps (interruptibly) until at least one reply buffer is queued, then
+ * copies as much queued data as fits into @ubuf, freeing every read_buffer
+ * that has been fully consumed.
+ *
+ * Returns the number of bytes copied, the error from the interrupted wait,
+ * or -EFAULT when nothing at all could be copied to @ubuf.
+ */
+static ssize_t xenbus_file_read(struct file *filp,
+                               char __user *ubuf,
+                               size_t len, loff_t *ppos)
+{
+        struct xenbus_file_priv *u = filp->private_data;
+        struct read_buffer *rb;
+        unsigned i;
+        ssize_t rc = 0;
+        int ret;
+        mutex_lock(&u->reply_mutex);
+        while (list_empty(&u->read_buffers)) {
+                /* Do not sleep with the lock held; re-check after re-taking it. */
+                mutex_unlock(&u->reply_mutex);
+                ret = wait_event_interruptible(u->read_waitq,
+                                               !list_empty(&u->read_buffers));
+                if (ret)
+                        return ret;
+                mutex_lock(&u->reply_mutex);
+        }
+        rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+        i = 0;
+        while (i < len) {
+                unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
+                /* copy_to_user() returns the number of bytes it could NOT copy. */
+                ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
+                i += sz - ret;
+                rb->cons += sz - ret;
+                if (ret != 0) {
+                        /*
+                         * Short copy: report the bytes delivered so far, or
+                         * -EFAULT if there were none.  The error is kept in
+                         * a signed variable so it is not mangled on return.
+                         */
+                        if (i == 0)
+                                rc = -EFAULT;
+                        goto out;
+                }
+                /* Clear out buffer if it has been consumed */
+                if (rb->cons == rb->len) {
+                        list_del(&rb->list);
+                        kfree(rb);
+                        if (list_empty(&u->read_buffers))
+                                break;
+                        rb = list_entry(u->read_buffers.next,
+                                        struct read_buffer, list);
+                }
+        }
+out:
+        if (rc == 0)
+                rc = i;
+        mutex_unlock(&u->reply_mutex);
+        return rc;
+}
+/*
+ * Add a buffer to the queue.  Caller must hold the appropriate lock
+ * if the queue is not local.  (Commonly the caller will build up
+ * multiple queued buffers on a temporary local list, and then add it
+ * to the appropriate list under lock once all the buffers have been
+ * successfully allocated.)
+ */
+static int queue_reply(struct list_head *queue, const void *data, size_t len)
+{
+        struct read_buffer *buf;
+        /* Nothing to queue for an empty payload; that still counts as success. */
+        if (!len)
+                return 0;
+        /* Header and payload live in one allocation (msg[] is a flexible array). */
+        buf = kmalloc(sizeof(*buf) + len, GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;
+        memcpy(buf->msg, data, len);
+        buf->len = len;
+        buf->cons = 0;
+        list_add_tail(&buf->list, queue);
+        return 0;
+}
+/*
+ * Free all the read_buffer s on a list.
+ * Caller must have sole reference to list.
+ */
+static void queue_cleanup(struct list_head *list)
+{
+        /* Pop and free the head entry until the list is empty. */
+        for (;;) {
+                struct read_buffer *head;
+                if (list_empty(list))
+                        break;
+                head = list_entry(list->next, struct read_buffer, list);
+                list_del(&head->list);
+                kfree(head);
+        }
+}
+/* Ties a xenbus watch to the open file (and the token) that registered it. */
+struct watch_adapter {
+        struct list_head list;                  /* link in xenbus_file_priv.watches */
+        struct xenbus_watch watch;              /* watch.node is an owned copy of the path */
+        struct xenbus_file_priv *dev_data;      /* file whose read queue receives the events */
+        char *token;                            /* owned copy of the token given at registration */
+};
+/*
+ * Release a watch adapter together with the path and token strings it owns.
+ * Either string may still be NULL (partially constructed adapter).
+ */
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+        kfree(watch->token);
+        kfree(watch->watch.node);
+        kfree(watch);
+}
+/*
+ * Allocate a watch adapter holding private copies of @path and @token.
+ * Returns NULL if any of the three allocations fails; nothing is leaked.
+ */
+static struct watch_adapter *alloc_watch_adapter(const char *path,
+                                                 const char *token)
+{
+        struct watch_adapter *adap;
+        adap = kzalloc(sizeof(*adap), GFP_KERNEL);
+        if (!adap)
+                return NULL;
+        adap->watch.node = kstrdup(path, GFP_KERNEL);
+        /* Only bother copying the token once the path copy succeeded. */
+        if (adap->watch.node)
+                adap->token = kstrdup(token, GFP_KERNEL);
+        if (!adap->watch.node || !adap->token) {
+                free_watch_adapter(adap);
+                return NULL;
+        }
+        return adap;
+}
+/*
+ * Watch callback installed by xenbus_write_watch(): turn a fired watch into
+ * an XS_WATCH_EVENT message (header, path, token and, when len > 2, extra
+ * data starting at vec[2]) and append it to the owning file's read queue.
+ * If any piece cannot be allocated the whole event is dropped.
+ */
+static void watch_fired(struct xenbus_watch *watch,
+                        const char **vec,
+                        unsigned int len)
+{
+        struct watch_adapter *adap;
+        struct xsd_sockmsg hdr;
+        const char *path, *token;
+        int path_len, tok_len, body_len, data_len = 0;
+        int ret;
+        LIST_HEAD(staging_q);
+        adap = container_of(watch, struct watch_adapter, watch);
+        path = vec[XS_WATCH_PATH];
+        /* Report the token stored at registration time, not one taken from vec. */
+        token = adap->token;
+        /* Lengths include the terminating NUL. */
+        path_len = strlen(path) + 1;
+        tok_len = strlen(token) + 1;
+        /*
+         * NOTE(review): if len is the number of entries in vec, vec[len] is
+         * one past the last element - confirm against the code that builds
+         * vec before relying on data_len.
+         */
+        if (len > 2)
+                data_len = vec[len] - vec[2] + 1;
+        body_len = path_len + tok_len + data_len;
+        hdr.type = XS_WATCH_EVENT;
+        hdr.len = body_len;
+        mutex_lock(&adap->dev_data->reply_mutex);
+        /* Build the event on a private list so readers never see a partial message. */
+        ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
+        if (!ret)
+                ret = queue_reply(&staging_q, path, path_len);
+        if (!ret)
+                ret = queue_reply(&staging_q, token, tok_len);
+        if (!ret && len > 2)
+                ret = queue_reply(&staging_q, vec[2], data_len);
+        if (!ret) {
+                /* success: pass reply list onto watcher */
+                list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
+                wake_up(&adap->dev_data->read_waitq);
+        } else
+                queue_cleanup(&staging_q);
+        mutex_unlock(&adap->dev_data->reply_mutex);
+}
+/*
+ * Forward the request held in u->u.msg through
+ * xenbus_dev_request_and_reply() and queue the reply (header plus body)
+ * for the reader.
+ *
+ * For XS_TRANSACTION_START the id parsed from the reply is recorded on
+ * u->transactions; for XS_TRANSACTION_END the matching record is removed.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * encoded in the reply pointer.
+ */
+static int xenbus_write_transaction(unsigned msg_type,
+                                    struct xenbus_file_priv *u)
+{
+        int rc = 0;
+        int ret;
+        void *reply;
+        struct xenbus_transaction_holder *trans = NULL;
+        LIST_HEAD(staging_q);
+        /* Allocate the tracking record up front so a started transaction is never left untracked. */
+        if (msg_type == XS_TRANSACTION_START) {
+                trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+                if (!trans) {
+                        rc = -ENOMEM;
+                        goto out;
+                }
+        }
+        reply = xenbus_dev_request_and_reply(&u->u.msg);
+        if (IS_ERR(reply)) {
+                kfree(trans);
+                rc = PTR_ERR(reply);
+                goto out;
+        }
+        if (msg_type == XS_TRANSACTION_START) {
+                trans->handle.id = simple_strtoul(reply, NULL, 0);
+                list_add(&trans->list, &u->transactions);
+        } else if (msg_type == XS_TRANSACTION_END) {
+                struct xenbus_transaction_holder *pos;
+                list_for_each_entry(pos, &u->transactions, list) {
+                        if (pos->handle.id == u->u.msg.tx_id) {
+                                trans = pos;
+                                break;
+                        }
+                }
+                /*
+                 * tx_id comes from the request message, so an id that was
+                 * never started through this file must not bring the
+                 * kernel down; there is simply nothing to untrack.
+                 */
+                if (trans) {
+                        list_del(&trans->list);
+                        kfree(trans);
+                }
+        }
+        mutex_lock(&u->reply_mutex);
+        /* Stage header and body privately so readers never see half a reply. */
+        ret = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
+        if (!ret)
+                ret = queue_reply(&staging_q, reply, u->u.msg.len);
+        if (!ret) {
+                list_splice_tail(&staging_q, &u->read_buffers);
+                wake_up(&u->read_waitq);
+        } else {
+                queue_cleanup(&staging_q);
+                rc = ret;
+        }
+        mutex_unlock(&u->reply_mutex);
+        kfree(reply);
+out:
+        return rc;
+}
+/*
+ * Handle a watch request whose body is "<path>\0<token>\0".
+ *
+ * For XS_WATCH a new watch is registered and remembered on u->watches; for
+ * any other type the first watch matching both path and token is
+ * unregistered and freed.  In both cases an "OK" reply is queued for the
+ * reader.
+ *
+ * Returns 0 on success, -EILSEQ when path or token is not NUL-terminated
+ * inside the message body, -ENOMEM on allocation failure, or the error
+ * from register_xenbus_watch().
+ */
+static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
+{
+        struct watch_adapter *watch, *tmp_watch;
+        char *path, *token;
+        int err, rc;
+        path = u->u.buffer + sizeof(u->u.msg);
+        token = memchr(path, 0, u->u.msg.len);
+        if (token == NULL) {
+                rc = -EILSEQ;
+                goto out;
+        }
+        token++;
+        /*
+         * The token is used as a C string below (kstrdup/strcmp), so it
+         * must be terminated inside the message body as well.
+         */
+        if (memchr(token, 0, path + u->u.msg.len - token) == NULL) {
+                rc = -EILSEQ;
+                goto out;
+        }
+        if (msg_type == XS_WATCH) {
+                watch = alloc_watch_adapter(path, token);
+                if (watch == NULL) {
+                        rc = -ENOMEM;
+                        goto out;
+                }
+                watch->watch.callback = watch_fired;
+                watch->dev_data = u;
+                err = register_xenbus_watch(&watch->watch);
+                if (err) {
+                        free_watch_adapter(watch);
+                        rc = err;
+                        goto out;
+                }
+                list_add(&watch->list, &u->watches);
+        } else {
+                list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+                        if (!strcmp(watch->token, token) &&
+                            !strcmp(watch->watch.node, path)) {
+                                unregister_xenbus_watch(&watch->watch);
+                                list_del(&watch->list);
+                                free_watch_adapter(watch);
+                                break;
+                        }
+                }
+        }
+        /* Success.  Synthesize a reply to say all is OK. */
+        {
+                struct {
+                        struct xsd_sockmsg hdr;
+                        char body[3];
+                } __packed reply = {
+                        {
+                                .type = msg_type,
+                                .len = sizeof(reply.body)
+                        },
+                        "OK"
+                };
+                mutex_lock(&u->reply_mutex);
+                rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
+                /* A reader may already be sleeping on the queue. */
+                if (!rc)
+                        wake_up(&u->read_waitq);
+                mutex_unlock(&u->reply_mutex);
+        }
+out:
+        return rc;
+}
+/*
+ * write() handler: accumulate user data in u->u.buffer until it holds a
+ * complete xenbus message (header plus u->u.msg.len payload bytes), then
+ * dispatch that message by type.
+ *
+ * Returns the number of bytes accepted from this write, or a negative
+ * errno: -EINVAL if the data does not fit in the buffer or the message
+ * type is not handled, -EFAULT if nothing could be copied from user
+ * space, -E2BIG if the header announces a payload larger than the buffer
+ * can hold, or the error returned by the per-type handler.  The buffered
+ * data is discarded after a dispatch and on the -EINVAL (too long) and
+ * -E2BIG errors.
+ */
+static ssize_t xenbus_file_write(struct file *filp,
+                                const char __user *ubuf,
+                                size_t len, loff_t *ppos)
+{
+        struct xenbus_file_priv *u = filp->private_data;
+        unsigned long uncopied;
+        uint32_t msg_type;
+        ssize_t rc = len;
+        int ret;
+        /*
+         * We're expecting usermode to be writing properly formed
+         * xenbus messages.  If they write an incomplete message we
+         * buffer it up.  Once it is complete, we act on it.
+         */
+        /*
+         * Make sure concurrent writers can't stomp all over each
+         * other's messages and make a mess of our partial message
+         * buffer.  We don't make any attempt to stop multiple
+         * writers from making a mess of each other's incomplete
+         * messages; we're just trying to guarantee our own internal
+         * consistency and make sure that single writes are handled
+         * atomically.
+         */
+        mutex_lock(&u->msgbuffer_mutex);
+        /* Get this out of the way early to avoid confusion */
+        if (len == 0)
+                goto out;
+        /*
+         * Can't write a xenbus message larger than we can buffer.
+         * Compare against the space left rather than adding the two
+         * lengths, so the check itself cannot wrap around.
+         */
+        if (len > sizeof(u->u.buffer) - u->len) {
+                /* On error, dump existing buffer */
+                u->len = 0;
+                rc = -EINVAL;
+                goto out;
+        }
+        /* copy_from_user() returns the number of bytes NOT copied. */
+        uncopied = copy_from_user(u->u.buffer + u->len, ubuf, len);
+        if (uncopied == len) {
+                rc = -EFAULT;
+                goto out;
+        }
+        /* Deal with a partial copy. */
+        len -= uncopied;
+        rc = len;
+        u->len += len;
+        /* Return if we haven't got a full message yet */
+        if (u->len < sizeof(u->u.msg))
+                goto out;       /* not even the header yet */
+        /*
+         * If we're expecting a message that's larger than we can
+         * possibly send, dump what we have and return an error.
+         * Written as a subtraction so that a huge msg.len cannot wrap
+         * the sum and slip past the check.
+         */
+        if (u->u.msg.len > sizeof(u->u.buffer) - sizeof(u->u.msg)) {
+                rc = -E2BIG;
+                u->len = 0;
+                goto out;
+        }
+        if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
+                goto out;       /* incomplete data portion */
+        /*
+         * OK, now we have a complete message.  Do something with it.
+         */
+        msg_type = u->u.msg.type;
+        switch (msg_type) {
+        case XS_TRANSACTION_START:
+        case XS_TRANSACTION_END:
+        case XS_DIRECTORY:
+        case XS_READ:
+        case XS_GET_PERMS:
+        case XS_RELEASE:
+        case XS_GET_DOMAIN_PATH:
+        case XS_WRITE:
+        case XS_MKDIR:
+        case XS_RM:
+        case XS_SET_PERMS:
+                /* Send out a transaction */
+                ret = xenbus_write_transaction(msg_type, u);
+                break;
+        case XS_WATCH:
+        case XS_UNWATCH:
+                /* (Un)Ask for some path to be watched for changes */
+                ret = xenbus_write_watch(msg_type, u);
+                break;
+        default:
+                ret = -EINVAL;
+                break;
+        }
+        if (ret != 0)
+                rc = ret;
+        /* Buffered message consumed */
+        u->len = 0;
+ out:
+        mutex_unlock(&u->msgbuffer_mutex);
+        return rc;
+}
+/*
+ * open() handler: allocate and initialise the per-open state and hang it
+ * off filp->private_data.  Fails with -ENOENT while xen_store_evtchn is
+ * zero and with -ENOMEM if the state cannot be allocated.
+ */
+static int xenbus_file_open(struct inode *inode, struct file *filp)
+{
+        struct xenbus_file_priv *priv;
+        if (!xen_store_evtchn)
+                return -ENOENT;
+        nonseekable_open(inode, filp);
+        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+        if (!priv)
+                return -ENOMEM;
+        /* Locks and wait queue first, then the three (empty) lists. */
+        mutex_init(&priv->msgbuffer_mutex);
+        mutex_init(&priv->reply_mutex);
+        init_waitqueue_head(&priv->read_waitq);
+        INIT_LIST_HEAD(&priv->read_buffers);
+        INIT_LIST_HEAD(&priv->watches);
+        INIT_LIST_HEAD(&priv->transactions);
+        filp->private_data = priv;
+        return 0;
+}
+/*
+ * release() handler: tear down everything attached to this open file.
+ * Outstanding transactions are ended (passing 1 to
+ * xenbus_transaction_end(), presumably "abort" -- confirm against its
+ * definition), watches are unregistered, replies that were queued but
+ * never read are discarded, and finally the private state is freed.
+ * Always returns 0.
+ */
+static int xenbus_file_release(struct inode *inode, struct file *filp)
+{
+        struct xenbus_file_priv *u = filp->private_data;
+        struct xenbus_transaction_holder *trans, *tmp;
+        struct watch_adapter *watch, *tmp_watch;
+        /*
+         * No need for locking here because there are no other users,
+         * by definition.
+         */
+        list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
+                xenbus_transaction_end(trans->handle, 1);
+                list_del(&trans->list);
+                kfree(trans);
+        }
+        list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+                unregister_xenbus_watch(&watch->watch);
+                list_del(&watch->list);
+                free_watch_adapter(watch);
+        }
+        /*
+         * Replies still sitting on read_buffers would otherwise be
+         * leaked along with u.  queue_cleanup() is the helper the
+         * transaction path uses to dispose of queued reply buffers.
+         */
+        queue_cleanup(&u->read_buffers);
+        kfree(u);
+        return 0;
+}
+/*
+ * poll() handler: register on read_waitq and report the file readable
+ * (POLLIN | POLLRDNORM) whenever at least one reply buffer is queued.
+ */
+static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
+{
+        struct xenbus_file_priv *priv = file->private_data;
+        unsigned int mask = 0;
+        poll_wait(file, &priv->read_waitq, wait);
+        if (!list_empty(&priv->read_buffers))
+                mask = POLLIN | POLLRDNORM;
+        return mask;
+}
+/* File operations table for the xenbus file, exported via xenfs.h. */
+const struct file_operations xenbus_file_ops = {
+        .open = xenbus_file_open,
+        .release = xenbus_file_release,
+        .read = xenbus_file_read,
+        .write = xenbus_file_write,
+        .poll = xenbus_file_poll,
+};
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
new file mode 100644
index 000000000000..51f08b2d0bf1
--- /dev/null
+++ b/drivers/xen/xenfs/xenfs.h
@@ -0,0 +1,6 @@
+#ifndef _XENFS_XENBUS_H
+#define _XENFS_XENBUS_H
+extern const struct file_operations xenbus_file_ops;
+#endif  /* _XENFS_XENBUS_H */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
        elf_addr_t __user *sp;
        elf_addr_t __user *u_platform;
        elf_addr_t __user *u_base_platform;
+        elf_addr_t __user *u_rand_bytes;
        const char *k_platform = ELF_PLATFORM;
        const char *k_base_platform = ELF_BASE_PLATFORM;
+        unsigned char k_rand_bytes[16];
        int items;
        elf_addr_t *elf_info;
        int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
                        return -EFAULT;
        }
+        /*
+         * Generate 16 random bytes for userspace PRNG seeding.
+         */
+        get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+        u_rand_bytes = (elf_addr_t __user *)
+                       STACK_ALLOC(p, sizeof(k_rand_bytes));
+        if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+                return -EFAULT;
        /* Create the ELF interpreter info */
        elf_info = (elf_addr_t *)current->mm->saved_auxv;
        /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
        NEW_AUX_ENT(AT_GID, cred->gid);
        NEW_AUX_ENT(AT_EGID, cred->egid);
        NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+        NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
        NEW_AUX_ENT(AT_EXECFN, bprm->exec);
        if (k_platform) {
                NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
 #include "coda_int.h"
+#ifdef CONFIG_SYSCTL
 static struct ctl_table_header *fs_table_header;
+#endif
 static ctl_table coda_table[] = {
        {
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
        {}
 };
+#ifdef CONFIG_SYSCTL
 static ctl_table fs_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
        },
        {}
 };
+#endif
 void coda_sysctl_init(void)
 {
diff --git a/fs/dcache.c b/fs/dcache.c
index e88c23b85a32..4547f66884a0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry)
        spin_unlock(&dcache_lock);
 }
-#define do_switch(x,y) do { \
-        __typeof__ (x) __tmp = x; \
-        x = y; y = __tmp; } while (0)
 /*
 * When switching names, the actual string doesn't strictly have to
 * be preserved in the target - because we're dropping the target
@@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
                        /*
                         * Both external: swap the pointers
                         */
-                        do_switch(target->d_name.name, dentry->d_name.name);
+                        swap(target->d_name.name, dentry->d_name.name);
                } else {
                        /*
                         * dentry:internal, target:external.  Steal target's
@@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
                        return;
                }
        }
-        do_switch(dentry->d_name.len, target->d_name.len);
+        swap(dentry->d_name.len, target->d_name.len);
 }
 /*
@@ -1680,7 +1676,7 @@ already_unhashed:
        /* Switch the names.. */
        switch_names(dentry, target);
-        do_switch(dentry->d_name.hash, target->d_name.hash);
+        swap(dentry->d_name.hash, target->d_name.hash);
        /* ... and switch the parents */
        if (IS_ROOT(dentry)) {
@@ -1688,7 +1684,7 @@ already_unhashed:
                target->d_parent = target;
                INIT_LIST_HEAD(&target->d_u.d_child);
        } else {
-                do_switch(dentry->d_parent, target->d_parent);
+                swap(dentry->d_parent, target->d_parent);
                /* And add them back to the (new) parent lists */
                list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
        struct dentry *dparent, *aparent;
        switch_names(dentry, anon);
-        do_switch(dentry->d_name.hash, anon->d_name.hash);
+        swap(dentry->d_name.hash, anon->d_name.hash);
        dparent = dentry->d_parent;
        aparent = anon->d_parent;
diff --git a/fs/dquot.c b/fs/dquot.c
index 61bfff64e5af..48c0571f831d 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -2090,10 +2090,12 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
        }
        if (di->dqb_valid & QIF_BTIME) {
                dm->dqb_btime = di->dqb_btime;
+                check_blim = 1;
                __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
        }
        if (di->dqb_valid & QIF_ITIME) {
                dm->dqb_itime = di->dqb_itime;
+                check_ilim = 1;
                __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
        }
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c454d5db28a5..66321a877e74 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
        inode->i_blocks = 0;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
        memset(ei->i_data, 0, sizeof(ei->i_data));
-        ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
+        ei->i_flags =
-        if (S_ISLNK(mode))
+                ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
-                ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
-        /* dirsync is only applied to directories */
-        if (!S_ISDIR(mode))
-                ei->i_flags &= ~EXT2_DIRSYNC_FL;
        ei->i_faddr = 0;
        ei->i_frag_no = 0;
        ei->i_frag_size = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 02b39a5deb74..23fff2f87783 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
 * ext2_splice_branch - splice the allocated branch onto inode.
 * @inode: owner
 * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *      ext2_alloc_branch)
 * @where: location of missing link
 * @num:   number of indirect blocks we are adding
 * @blks:  number of direct blocks we are adding
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e1..7cb4badef927 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        goto setflags_out;
                }
-                if (!S_ISDIR(inode->i_mode))
+                flags = ext2_mask_flags(inode->i_mode, flags);
-                        flags &= ~EXT2_DIRSYNC_FL;
                mutex_lock(&inode->i_mutex);
                /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..da8bdeaa2e6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
        percpu_counter_destroy(&sbi->s_dirs_counter);
        brelse (sbi->s_sbh);
        sb->s_fs_info = NULL;
+        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
+        sbi->s_blockgroup_lock =
+                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+        if (!sbi->s_blockgroup_lock) {
+                kfree(sbi);
+                return -ENOMEM;
+        }
        sb->s_fs_info = sbi;
        sbi->s_sb_block = sb_block;
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
                printk ("EXT2-fs: not enough memory\n");
                goto failed_mount;
        }
-        bgl_lock_init(&sbi->s_blockgroup_lock);
+        bgl_lock_init(sbi->s_blockgroup_lock);
        sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
        if (!sbi->s_debts) {
                printk ("EXT2-fs: not enough memory\n");
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 5655fbcbd11f..8de6c720e510 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
        ei->i_dir_start_lookup = 0;
        ei->i_disksize = 0;
-        ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
+        ei->i_flags =
-        if (S_ISLNK(mode))
+                ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
-                ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
-        /* dirsync only applies to directories */
-        if (!S_ISDIR(mode))
-                ei->i_flags &= ~EXT3_DIRSYNC_FL;
 #ifdef EXT3_FRAGMENTS
        ei->i_faddr = 0;
        ei->i_frag_no = 0;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8e..5e86ce9a86e0 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                        goto flags_out;
                }
-                if (!S_ISDIR(inode->i_mode))
+                flags = ext3_mask_flags(inode->i_mode, flags);
-                        flags &= ~EXT3_DIRSYNC_FL;
                mutex_lock(&inode->i_mutex);
                /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 1dd2abe6313e..8d6f965e502c 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
 #define assert(test) J_ASSERT(test)
 #endif
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index c22d01467bd1..01c235bc2054 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
                ext3_blkdev_remove(sbi);
        }
        sb->s_fs_info = NULL;
+        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return;
 }
@@ -1546,6 +1547,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
+        sbi->s_blockgroup_lock =
+                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+        if (!sbi->s_blockgroup_lock) {
+                kfree(sbi);
+                return -ENOMEM;
+        }
        sb->s_fs_info = sbi;
        sbi->s_mount_opt = 0;
        sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1786,7 +1794,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
-        bgl_lock_init(&sbi->s_blockgroup_lock);
+        bgl_lock_init(sbi->s_blockgroup_lock);
        for (i = 0; i < db_count; i++) {
                block = descriptor_loc(sb, logic_sb_block, i);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae66..3f54db31cdc2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2536,7 +2536,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 */
                newdepth = ext_depth(inode);
                /*
-                 * update the extent length after successfull insert of the
+                 * update the extent length after successful insert of the
                 * split extent
                 */
                orig_ex.ee_len = cpu_to_le16(ee_len -
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9fd2a5e1be4d..4b8d431d7dff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
 #define assert(test) J_ASSERT(test)
 #endif
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
        int flags;
        int err;
        unsigned long blocknr;
+        ktime_t start_time;
+        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
+        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
+        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+        /*
+         * weight the commit time higher than the average time so we don't
+         * react too strongly to vast changes in commit time
+         */
+        if (likely(journal->j_average_commit_time))
+                journal->j_average_commit_time = (commit_time*3 +
+                                journal->j_average_commit_time) / 4;
+        else
+                journal->j_average_commit_time = commit_time;
        spin_unlock(&journal->j_state_lock);
        if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..e6a117431277 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 {
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
+        transaction->t_start_time = ktime_get();
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh:     bh to be used for metadata writes
- * @credits: variable that will receive credits for the buffer
 *
 * Returns an error code or 0 on success.
 *
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-        int old_handle_count, err;
+        int err;
        pid_t pid;
        J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
         * on IO anyway.  Speeds up many-threaded, many-dir operations
         * by 30x or more...
         *
+         * We try and optimize the sleep time against what the underlying disk
+         * can do, instead of having a static sleep time.  This is useful for
+         * the case where our storage is so fast that it is more optimal to go
+         * ahead and force a flush and wait for the transaction to be committed
+         * than it is to wait for an arbitrary amount of time for new writers to
+         * join the transaction.  We achieve this by measuring how long it takes
+         * to commit a transaction, and compare it with how long this
+         * transaction has been running, and if run time < commit time then we
+         * sleep for the delta and commit.  This greatly helps super fast disks
+         * that would see slowdowns as more threads started doing fsyncs.
+         *
         * But don't do this if this process was the most recent one to
         * perform a synchronous write.  We do this to detect the case where a
         * single process is doing a stream of sync writes.  No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid) {
+                u64 commit_time, trans_time;
                journal->j_last_sync_writer = pid;
-                do {
-                        old_handle_count = transaction->t_handle_count;
+                spin_lock(&journal->j_state_lock);
-                        schedule_timeout_uninterruptible(1);
+                commit_time = journal->j_average_commit_time;
-                } while (old_handle_count != transaction->t_handle_count);
+                spin_unlock(&journal->j_state_lock);
+                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+                                                   transaction->t_start_time));
+                commit_time = min_t(u64, commit_time,
+                                    1000*jiffies_to_usecs(1));
+                if (trans_time < commit_time) {
+                        ktime_t expires = ktime_add_ns(ktime_get(),
+                                                       commit_time);
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+                }
        }
        current->journal_info = NULL;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 54ff4c77aaa3..d861096c9d81 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3868,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
        struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
        struct ocfs2_extent_rec *rec, *tmprec;
-        right_el = path_leaf_el(right_path);;
+        right_el = path_leaf_el(right_path);
        if (left_path)
                left_el = path_leaf_el(left_path);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f731ab491795..b0c4cadd4c45 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1324,7 +1324,7 @@ again:
                        goto out;
                }
-                mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
+                mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
                     lockres->l_name);
                /* At this point we've gone inside the dlm and need to
@@ -2951,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
                ocfs2_dlm_dump_lksb(&lockres->l_lksb);
                BUG();
        }
-        mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
+        mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
             lockres->l_name);
        ocfs2_wait_on_busy_lock(lockres);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e8f795f978aa..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1605,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
                            struct ocfs2_space_resv *sr)
 {
        struct inode *inode = file->f_path.dentry->d_inode;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
            !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec59504906..5edcc3f92ba7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
        offset = (unsigned long)(*ppos % PAGE_SIZE);
        pfn = (unsigned long)(*ppos / PAGE_SIZE);
-        if (pfn > saved_max_pfn)
-                return -EINVAL;
        do {
                if (count > (PAGE_SIZE - offset))
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index c97d4c931715..98a232f7196b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
 static struct inode *
 romfs_iget(struct super_block *sb, unsigned long ino)
 {
-        int nextfh;
+        int nextfh, ret;
        struct romfs_inode ri;
        struct inode *i;
@@ -526,11 +526,11 @@ romfs_iget(struct super_block *sb, unsigned long ino)
        i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
        /* Precalculate the data offset */
-        ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
+        ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
-        if (ino >= 0)
+        if (ret >= 0)
-                ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK);
+                ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
-        else
+        else
-                ino = 0;
+                ino = 0;
        ROMFS_I(i)->i_metasize = ino;
        ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4ba..a54b3e3f10a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/splice.h>
+#include <linux/memcontrol.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
diff --git a/fs/super.c b/fs/super.c
index cb20744ec789..7d67387496cb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -458,7 +458,6 @@ void sync_filesystems(int wait)
                if (sb->s_flags & MS_RDONLY)
                        continue;
                sb->s_need_sync_fs = 1;
-                async_synchronize_full_special(&sb->s_async_list);
        }
 restart:
@@ -471,6 +470,7 @@ restart:
                sb->s_count++;
                spin_unlock(&sb_lock);
                down_read(&sb->s_umount);
+                async_synchronize_full_special(&sb->s_async_list);
                if (sb->s_root && (wait || sb->s_dirt))
                        sb->s_op->sync_fs(sb, wait);
                up_read(&sb->s_umount);
diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h
index d7afa9dd6635..f3b5d4e3a2ac 100644
--- a/include/linux/auxvec.h
+++ b/include/linux/auxvec.h
@@ -23,16 +23,16 @@
 #define AT_PLATFORM 15  /* string identifying CPU for optimizations */
 #define AT_HWCAP  16    /* arch dependent hints at CPU capabilities */
 #define AT_CLKTCK 17    /* frequency at which times() increments */
+/* AT_* values 18 through 22 are reserved */
 #define AT_SECURE 23   /* secure mode boolean */
 #define AT_BASE_PLATFORM 24     /* string identifying real platform, may
                                 * differ from AT_PLATFORM. */
+#define AT_RANDOM 25    /* address of 16 random bytes */
 #define AT_EXECFN  31   /* filename of program */
 #ifdef __KERNEL__
-#define AT_VECTOR_SIZE_BASE 18 /* NEW_AUX_ENT entries in auxiliary table */
+#define AT_VECTOR_SIZE_BASE 19 /* NEW_AUX_ENT entries in auxiliary table */
  /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */
 #endif
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 08b78c09b09a..e267e62827bb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -52,9 +52,9 @@ struct cgroup_subsys_state {
         * hierarchy structure */
        struct cgroup *cgroup;
-        /* State maintained by the cgroup system to allow
+        /* State maintained by the cgroup system to allow subsystems
-         * subsystems to be "busy". Should be accessed via css_get()
+         * to be "busy". Should be accessed via css_get(),
-         * and css_put() */
+         * css_tryget() and css_put(). */
        atomic_t refcnt;
@@ -64,11 +64,14 @@ struct cgroup_subsys_state {
 /* bits in struct cgroup_subsys_state flags field */
 enum {
        CSS_ROOT, /* This CSS is the root of the subsystem */
+        CSS_REMOVED, /* This CSS is dead */
 };
 /*
- * Call css_get() to hold a reference on the cgroup;
+ * Call css_get() to hold a reference on the css; it can be used
- *
+ * for a reference obtained via:
+ * - an existing ref-counted reference to the css
+ * - task->cgroups for a locked task
 */
 static inline void css_get(struct cgroup_subsys_state *css)
@@ -77,9 +80,32 @@ static inline void css_get(struct cgroup_subsys_state *css)
        if (!test_bit(CSS_ROOT, &css->flags))
                atomic_inc(&css->refcnt);
 }
+static inline bool css_is_removed(struct cgroup_subsys_state *css)
+{
+        return test_bit(CSS_REMOVED, &css->flags);
+}
+/*
+ * Call css_tryget() to take a reference on a css if your existing
+ * (known-valid) reference isn't already ref-counted. Returns false if
+ * the css has been destroyed.
+ */
+static inline bool css_tryget(struct cgroup_subsys_state *css)
+{
+        if (test_bit(CSS_ROOT, &css->flags))
+                return true;
+        while (!atomic_inc_not_zero(&css->refcnt)) {
+                if (test_bit(CSS_REMOVED, &css->flags))
+                        return false;
+        }
+        return true;
+}
 /*
 * css_put() should be called to release a reference taken by
- * css_get()
+ * css_get() or css_tryget()
 */
 extern void __css_put(struct cgroup_subsys_state *css);
@@ -116,7 +142,7 @@ struct cgroup {
        struct list_head children;      /* my children */
        struct cgroup *parent;  /* my parent */
-        struct dentry *dentry;          /* cgroup fs entry */
+        struct dentry *dentry;          /* cgroup fs entry, RCU protected */
        /* Private pointers for each registered subsystem */
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
@@ -145,6 +171,9 @@ struct cgroup {
        int pids_use_count;
        /* Length of the current tasks_pids array */
        int pids_length;
+        /* For RCU-protected deletion */
+        struct rcu_head rcu_head;
 };
 /* A css_set is a structure holding pointers to a set of
@@ -337,9 +366,23 @@ struct cgroup_subsys {
 #define MAX_CGROUP_TYPE_NAMELEN 32
        const char *name;
-        /* Protected by RCU */
+        /*
-        struct cgroupfs_root *root;
+         * Protects sibling/children links of cgroups in this
+         * hierarchy, plus protects which hierarchy (or none) the
+         * subsystem is a part of (i.e. root/sibling).  To avoid
+         * potential deadlocks, the following operations should not be
+         * undertaken while holding any hierarchy_mutex:
+         *
+         * - allocating memory
+         * - initiating hotplug events
+         */
+        struct mutex hierarchy_mutex;
+        /*
+         * Link to parent, and list entry in parent's children.
+         * Protected by this->hierarchy_mutex and cgroup_lock()
+         */
+        struct cgroupfs_root *root;
        struct list_head sibling;
 };
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 51ea2bdea0f9..90c6074a36ca 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,8 +20,9 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask);
+extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
-extern void cpuset_cpus_allowed_locked(struct task_struct *p, cpumask_t *mask);
+extern void cpuset_cpus_allowed_locked(struct task_struct *p,
+                                       struct cpumask *mask);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -86,12 +87,13 @@ static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
-static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask)
+static inline void cpuset_cpus_allowed(struct task_struct *p,
+                                       struct cpumask *mask)
 {
        *mask = cpu_possible_map;
 }
 static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
-                                                                cpumask_t *mask)
+                                              struct cpumask *mask)
 {
        *mask = cpu_possible_map;
 }
diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index 78c775a83f7c..121720d74e15 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -194,6 +194,30 @@ struct ext2_group_desc
 #define EXT2_FL_USER_VISIBLE            FS_FL_USER_VISIBLE      /* User visible flags */
 #define EXT2_FL_USER_MODIFIABLE         FS_FL_USER_MODIFIABLE   /* User modifiable flags */
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\
+                           EXT2_SYNC_FL | EXT2_IMMUTABLE_FL | EXT2_APPEND_FL |\
+                           EXT2_NODUMP_FL | EXT2_NOATIME_FL | EXT2_COMPRBLK_FL|\
+                           EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\
+                           EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT2_REG_FLMASK (~(EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT2_OTHER_FLMASK (EXT2_NODUMP_FL | EXT2_NOATIME_FL)
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags)
+{
+        if (S_ISDIR(mode))
+                return flags;
+        else if (S_ISREG(mode))
+                return flags & EXT2_REG_FLMASK;
+        else
+                return flags & EXT2_OTHER_FLMASK;
+}
 /*
 * ioctl commands
 */
diff --git a/include/linux/ext2_fs_sb.h b/include/linux/ext2_fs_sb.h
index dc541f3653d1..1cdb66367c98 100644
--- a/include/linux/ext2_fs_sb.h
+++ b/include/linux/ext2_fs_sb.h
@@ -101,7 +101,7 @@ struct ext2_sb_info {
        struct percpu_counter s_freeblocks_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
-        struct blockgroup_lock s_blockgroup_lock;
+        struct blockgroup_lock *s_blockgroup_lock;
        /* root of the per fs reservation window tree */
        spinlock_t s_rsv_window_lock;
        struct rb_root s_rsv_window_root;
@@ -111,7 +111,7 @@ struct ext2_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext2_sb_info *sbi, unsigned int block_group)
 {
-        return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+        return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 #endif  /* _LINUX_EXT2_FS_SB */
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index d14f02918483..d76800f6ecf0 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -178,6 +178,30 @@ struct ext3_group_desc
 #define EXT3_FL_USER_VISIBLE            0x0003DFFF /* User visible flags */
 #define EXT3_FL_USER_MODIFIABLE         0x000380FF /* User modifiable flags */
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
+                           EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\
+                           EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\
+                           EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
+                           EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
+{
+        if (S_ISDIR(mode))
+                return flags;
+        else if (S_ISREG(mode))
+                return flags & EXT3_REG_FLMASK;
+        else
+                return flags & EXT3_OTHER_FLMASK;
+}
 /*
 * Inode dynamic state flags
 */
diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h
index e024e38248ff..76fdc0f4b028 100644
--- a/include/linux/ext3_fs_sb.h
+++ b/include/linux/ext3_fs_sb.h
@@ -60,7 +60,7 @@ struct ext3_sb_info {
        struct percpu_counter s_freeblocks_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
-        struct blockgroup_lock s_blockgroup_lock;
+        struct blockgroup_lock *s_blockgroup_lock;
        /* root of the per fs reservation window tree */
        spinlock_t s_rsv_window_lock;
@@ -86,7 +86,7 @@ struct ext3_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
 {
-        return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+        return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 #endif  /* _LINUX_EXT3_FS_SB */
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 346e2b80be7d..6384b19efe64 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -543,6 +543,11 @@ struct transaction_s
        unsigned long           t_expires;
        /*
+         * When this transaction started, in nanoseconds [no locking]
+         */
+        ktime_t                 t_start_time;
+        /*
         * How many handles used this transaction? [t_handle_lock]
         */
        int t_handle_count;
@@ -798,9 +803,19 @@ struct journal_s
        struct buffer_head      **j_wbuf;
        int                     j_wbufsize;
+        /*
+         * this is the pid of the last person to run a synchronous operation
+         * through the journal.
+         */
        pid_t                   j_last_sync_writer;
        /*
+         * the average amount of time in nanoseconds it takes to commit a
+         * transaction to the disk.  [j_state_lock]
+         */
+        u64                     j_average_commit_time;
+        /*
         * An opaque pointer to fs-private information.  ext3 puts its
         * superblock pointer here
         */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 6b8e2027165e..343df9ef2412 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -476,6 +476,12 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
        __val = __val < __min ? __min: __val;   \
        __val > __max ? __max: __val; })
+/*
+ * swap - swap value of @a and @b
+ */
+#define swap(a, b) ({ typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; })
 /**
 * container_of - cast a member of a structure out to the containing structure
 * @ptr:        the pointer to the member.
diff --git a/include/linux/magic.h b/include/linux/magic.h
index f7f3fdddbef0..439f6f3cb0c4 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -13,6 +13,7 @@
 #define EFS_SUPER_MAGIC         0x414A53
 #define EXT2_SUPER_MAGIC        0xEF53
 #define EXT3_SUPER_MAGIC        0xEF53
+#define XENFS_SUPER_MAGIC       0xabba1974
 #define EXT4_SUPER_MAGIC        0xEF53
 #define HPFS_SUPER_MAGIC        0xf995e849
 #define ISOFS_SUPER_MAGIC       0x9660
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1fbe14d39521..326f45c86530 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -19,22 +19,45 @@
 #ifndef _LINUX_MEMCONTROL_H
 #define _LINUX_MEMCONTROL_H
+#include <linux/cgroup.h>
 struct mem_cgroup;
 struct page_cgroup;
 struct page;
 struct mm_struct;
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/*
+ * All "charge" functions with gfp_mask should use GFP_KERNEL or
+ * (gfp_mask & GFP_RECLAIM_MASK). In the current implementation, memcg doesn't
+ * allocate memory but reclaims memory from all available zones, so the "where
+ * I want memory from" bits of gfp_mask have no meaning. Any value of those
+ * bits would work, but having a rule is better: a charge function's gfp_mask
+ * should be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK to avoid
+ * ambiguous code.
+ * (Of course, if memcg allocates memory in the future, GFP_KERNEL is sane.)
+ */
-extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask);
+/* for swap handling */
+extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+                struct page *page, gfp_t mask, struct mem_cgroup **ptr);
+extern void mem_cgroup_commit_charge_swapin(struct page *page,
+                                        struct mem_cgroup *ptr);
+extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask);
-extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
+extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
+extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
+extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
+extern void mem_cgroup_del_lru(struct page *page);
+extern void mem_cgroup_move_lists(struct page *page,
+                                  enum lru_list from, enum lru_list to);
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
-extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
+extern int mem_cgroup_shrink_usage(struct page *page,
+                        struct mm_struct *mm, gfp_t gfp_mask);
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                        struct list_head *dst,
@@ -47,12 +70,20 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-#define mm_match_cgroup(mm, cgroup)     \
+static inline
-        ((cgroup) == mem_cgroup_from_task((mm)->owner))
+int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
+{
+        struct mem_cgroup *mem;
+        rcu_read_lock();
+        mem = mem_cgroup_from_task((mm)->owner);
+        rcu_read_unlock();
+        return cgroup == mem;
+}
 extern int
-mem_cgroup_prepare_migration(struct page *page, struct page *newpage);
+mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr);
-extern void mem_cgroup_end_migration(struct page *page);
+extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
+        struct page *oldpage, struct page *newpage);
 /*
 * For memory reclaim.
@@ -65,13 +96,32 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
                                                        int priority);
 extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
                                                        int priority);
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
+                                       struct zone *zone,
+                                       enum lru_list lru);
+struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
+                                                      struct zone *zone);
+struct zone_reclaim_stat*
+mem_cgroup_get_reclaim_stat_from_page(struct page *page);
-extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
-                                        int priority, enum lru_list lru);
+extern int do_swap_account;
+#endif
+static inline bool mem_cgroup_disabled(void)
+{
+        if (mem_cgroup_subsys.disabled)
+                return true;
+        return false;
+}
+extern bool mem_cgroup_oom_called(struct task_struct *task);
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline int mem_cgroup_charge(struct page *page,
+struct mem_cgroup;
+static inline int mem_cgroup_newpage_charge(struct page *page,
                                        struct mm_struct *mm, gfp_t gfp_mask)
 {
        return 0;
@@ -83,6 +133,21 @@ static inline int mem_cgroup_cache_charge(struct page *page,
        return 0;
 }
+static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+                struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
+{
+        return 0;
+}
+static inline void mem_cgroup_commit_charge_swapin(struct page *page,
+                                          struct mem_cgroup *ptr)
+{
+}
+static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
+{
+}
 static inline void mem_cgroup_uncharge_page(struct page *page)
 {
 }
@@ -91,12 +156,33 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 }
-static inline int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
+static inline int mem_cgroup_shrink_usage(struct page *page,
+                        struct mm_struct *mm, gfp_t gfp_mask)
 {
        return 0;
 }
-static inline void mem_cgroup_move_lists(struct page *page, bool active)
+static inline void mem_cgroup_add_lru_list(struct page *page, int lru)
+{
+}
+static inline void mem_cgroup_del_lru_list(struct page *page, int lru)
+{
+        return ;
+}
+static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru)
+{
+        return ;
+}
+static inline void mem_cgroup_del_lru(struct page *page)
+{
+        return ;
+}
+static inline void
+mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to)
 {
 }
@@ -112,12 +198,14 @@ static inline int task_in_mem_cgroup(struct task_struct *task,
 }
 static inline int
-mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
+mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 {
        return 0;
 }
-static inline void mem_cgroup_end_migration(struct page *page)
+static inline void mem_cgroup_end_migration(struct mem_cgroup *mem,
+                                        struct page *oldpage,
+                                        struct page *newpage)
 {
 }
@@ -146,12 +234,42 @@ static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 {
 }
-static inline long mem_cgroup_calc_reclaim(struct mem_cgroup *mem,
+static inline bool mem_cgroup_disabled(void)
-                                        struct zone *zone, int priority,
+{
-                                        enum lru_list lru)
+        return true;
+}
+static inline bool mem_cgroup_oom_called(struct task_struct *task)
+{
+        return false;
+}
+static inline int
+mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
+{
+        return 1;
+}
+static inline unsigned long
+mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone,
+                         enum lru_list lru)
 {
        return 0;
 }
+static inline struct zone_reclaim_stat*
+mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone)
+{
+        return NULL;
+}
+static inline struct zone_reclaim_stat*
+mem_cgroup_get_reclaim_stat_from_page(struct page *page)
+{
+        return NULL;
+}
 #endif /* CONFIG_CGROUP_MEM_CONT */
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index c948350c378e..7fbb97267556 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -28,6 +28,7 @@ add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
        list_add(&page->lru, &zone->lru[l].list);
        __inc_zone_state(zone, NR_LRU_BASE + l);
+        mem_cgroup_add_lru_list(page, l);
 }
 static inline void
@@ -35,6 +36,7 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
        list_del(&page->lru);
        __dec_zone_state(zone, NR_LRU_BASE + l);
+        mem_cgroup_del_lru_list(page, l);
 }
 static inline void
@@ -54,6 +56,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
                l += page_is_file_cache(page);
        }
        __dec_zone_state(zone, NR_LRU_BASE + l);
+        mem_cgroup_del_lru_list(page, l);
 }
 /**
@@ -78,23 +81,4 @@ static inline enum lru_list page_lru(struct page *page)
        return lru;
 }
-/**
- * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @zone: zone to check
- *
- * Returns true if the zone does not have enough inactive anon pages,
- * meaning some active anon pages need to be deactivated.
- */
-static inline int inactive_anon_is_low(struct zone *zone)
-{
-        unsigned long active, inactive;
-        active = zone_page_state(zone, NR_ACTIVE_ANON);
-        inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-        if (inactive * zone->inactive_ratio < active)
-                return 1;
-        return 0;
-}
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 35a7b5e19465..09c14e213b63 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -263,6 +263,19 @@ enum zone_type {
 #error ZONES_SHIFT -- too many zones configured adjust calculation
 #endif
+struct zone_reclaim_stat {
+        /*
+         * The pageout code in vmscan.c keeps track of how many of the
+         * mem/swap backed and file backed pages are referenced.
+         * The higher the rotated/scanned ratio, the more valuable
+         * that cache is.
+         *
+         * The anon LRU stats live in [0], file LRU stats in [1]
+         */
+        unsigned long           recent_rotated[2];
+        unsigned long           recent_scanned[2];
+};
 struct zone {
        /* Fields commonly accessed by the page allocator */
        unsigned long           pages_min, pages_low, pages_high;
@@ -315,16 +328,7 @@ struct zone {
                unsigned long nr_scan;
        } lru[NR_LRU_LISTS];
-        /*
+        struct zone_reclaim_stat reclaim_stat;
-         * The pageout code in vmscan.c keeps track of how many of the
-         * mem/swap backed and file backed pages are refeferenced.
-         * The higher the rotated/scanned ratio, the more valuable
-         * that cache is.
-         *
-         * The anon LRU stats live in [0], file LRU stats in [1]
-         */
-        unsigned long           recent_rotated[2];
-        unsigned long           recent_scanned[2];
        unsigned long           pages_scanned;     /* since last reclaim */
        unsigned long           flags;             /* zone flags, see below */
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 1e6d34bfa094..602cc1fdee90 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -26,10 +26,6 @@ enum {
        PCG_LOCK,  /* page cgroup is locked */
        PCG_CACHE, /* charged as cache */
        PCG_USED, /* this object is in use. */
-        /* flags for LRU placement */
-        PCG_ACTIVE, /* page is active in this cgroup */
-        PCG_FILE, /* page is file system backed */
-        PCG_UNEVICTABLE, /* page is unevictableable */
 };
 #define TESTPCGFLAG(uname, lname)                       \
@@ -50,19 +46,6 @@ TESTPCGFLAG(Cache, CACHE)
 TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
-/* LRU management flags (from global-lru definition) */
-TESTPCGFLAG(File, FILE)
-SETPCGFLAG(File, FILE)
-CLEARPCGFLAG(File, FILE)
-TESTPCGFLAG(Active, ACTIVE)
-SETPCGFLAG(Active, ACTIVE)
-CLEARPCGFLAG(Active, ACTIVE)
-TESTPCGFLAG(Unevictable, UNEVICTABLE)
-SETPCGFLAG(Unevictable, UNEVICTABLE)
-CLEARPCGFLAG(Unevictable, UNEVICTABLE)
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
        return page_to_nid(pc->page);
@@ -105,4 +88,39 @@ static inline void page_cgroup_init(void)
 }
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#include <linux/swap.h>
+extern struct mem_cgroup *
+swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem);
+extern struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent);
+extern int swap_cgroup_swapon(int type, unsigned long max_pages);
+extern void swap_cgroup_swapoff(int type);
+#else
+#include <linux/swap.h>
+static inline
+struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
+{
+        return NULL;
+}
+static inline
+struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
+{
+        return NULL;
+}
+static inline int
+swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+        return 0;
+}
+static inline void swap_cgroup_swapoff(int type)
+{
+        return;
+}
+#endif
 #endif
diff --git a/include/linux/pid.h b/include/linux/pid.h
index bb206c56d1f0..49f1c2f66e95 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -123,6 +123,24 @@ extern struct pid *alloc_pid(struct pid_namespace *ns);
 extern void free_pid(struct pid *pid);
 /*
+ * ns_of_pid() returns the pid namespace in which the specified pid was
+ * allocated.
+ *
+ * NOTE:
+ *      ns_of_pid() is expected to be called for a process (task) that has
+ *      an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid
+ *      is expected to be non-NULL. If @pid is NULL, caller should handle
+ *      the resulting NULL pid-ns.
+ */
+static inline struct pid_namespace *ns_of_pid(struct pid *pid)
+{
+        struct pid_namespace *ns = NULL;
+        if (pid)
+                ns = pid->numbers[pid->level].ns;
+        return ns;
+}
+/*
 * the helpers to get the pid's id seen from different namespaces
 *
 * pid_nr()    : global id, i.e. the id seen from the init namespace;
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index d82fe825d62f..38d10326246a 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -79,11 +79,7 @@ static inline void zap_pid_ns_processes(struct pid_namespace *ns)
 }
 #endif /* CONFIG_PID_NS */
-static inline struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
+extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
-{
-        return tsk->nsproxy->pid_ns;
-}
 void pidhash_init(void);
 void pidmap_init(void);
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 271c1c2c9f6f..dede0a2cfc45 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -43,6 +43,10 @@ struct res_counter {
         * the routines below consider this to be IRQ-safe
         */
        spinlock_t lock;
+        /*
+         * Parent counter, used for hierarchical resource accounting
+         */
+        struct res_counter *parent;
 };
 /**
@@ -87,7 +91,7 @@ enum {
 * helpers for accounting
 */
-void res_counter_init(struct res_counter *counter);
+void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 /*
 * charge - try to consume more resource.
@@ -103,7 +107,7 @@ void res_counter_init(struct res_counter *counter);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
                unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-                unsigned long val);
+                unsigned long val, struct res_counter **limit_fail_at);
 /*
 * uncharge - tell that some portion of the resource is released
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 91dee50fe260..d30215578877 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,7 +214,8 @@ static inline void lru_cache_add_active_file(struct page *page)
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
-                                                        gfp_t gfp_mask);
+                                                  gfp_t gfp_mask, bool noswap,
+                                                  unsigned int swappiness);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
@@ -333,6 +334,22 @@ static inline void disable_swap_token(void)
        put_swap_token(swap_token_mm);
 }
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent);
+#else
+static inline void
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+{
+}
+#endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+extern void mem_cgroup_uncharge_swap(swp_entry_t ent);
+#else
+static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
+{
+}
+#endif
 #else /* CONFIG_SWAP */
 #define nr_swap_pages                           0L
@@ -409,6 +426,12 @@ static inline swp_entry_t get_swap_page(void)
 #define has_swap_token(x) 0
 #define disable_swap_token() do { } while(0)
+static inline int mem_cgroup_cache_charge_swapin(struct page *page,
+                        struct mm_struct *mm, gfp_t mask, bool locked)
+{
+        return 0;
+}
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 6369d89c25d5..f87f9614844d 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -136,8 +136,6 @@ struct xenbus_transaction
 /* Nil transaction ID. */
 #define XBT_NIL ((struct xenbus_transaction) { 0 })
-int __init xenbus_dev_init(void);
 char **xenbus_directory(struct xenbus_transaction t,
                        const char *dir, const char *node, unsigned int *num);
 void *xenbus_read(struct xenbus_transaction t,
diff --git a/init/Kconfig b/init/Kconfig
index e7893b1d3e42..a724a149bf3f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -271,59 +271,6 @@ config LOG_BUF_SHIFT
                     13 =>  8 KB
                     12 =>  4 KB
-config CGROUPS
-        bool "Control Group support"
-        help
-          This option will let you use process cgroup subsystems
-          such as Cpusets
-          Say N if unsure.
-config CGROUP_DEBUG
-        bool "Example debug cgroup subsystem"
-        depends on CGROUPS
-        default n
-        help
-          This option enables a simple cgroup subsystem that
-          exports useful debugging information about the cgroups
-          framework
-          Say N if unsure
-config CGROUP_NS
-        bool "Namespace cgroup subsystem"
-        depends on CGROUPS
-        help
-          Provides a simple namespace cgroup subsystem to
-          provide hierarchical naming of sets of namespaces,
-          for instance virtual servers and checkpoint/restart
-          jobs.
-config CGROUP_FREEZER
-        bool "control group freezer subsystem"
-        depends on CGROUPS
-        help
-          Provides a way to freeze and unfreeze all tasks in a
-          cgroup.
-config CGROUP_DEVICE
-        bool "Device controller for cgroups"
-        depends on CGROUPS && EXPERIMENTAL
-        help
-          Provides a cgroup implementing whitelists for devices which
-          a process in the cgroup can mknod or open.
-config CPUSETS
-        bool "Cpuset support"
-        depends on SMP && CGROUPS
-        help
-          This option will let you create and manage CPUSETs which
-          allow dynamically partitioning a system into sets of CPUs and
-          Memory Nodes and assigning tasks to run only within those sets.
-          This is primarily useful on large SMP or NUMA systems.
-          Say N if unsure.
 #
 # Architectures with an unreliable sched_clock() should select this:
 #
@@ -337,6 +284,8 @@ config GROUP_SCHED
        help
          This feature lets CPU scheduler recognize task groups and control CPU
          bandwidth allocation to such task groups.
+          In order to create a group from an arbitrary set of processes, use
+          CONFIG_CGROUPS. (See Control Group support.)
 config FAIR_GROUP_SCHED
        bool "Group scheduling for SCHED_OTHER"
@@ -379,6 +328,66 @@ config CGROUP_SCHED
 endchoice
+menu "Control Group support"
+config CGROUPS
+        bool "Control Group support"
+        help
+          This option adds support for grouping sets of processes together, for
+          use with process control subsystems such as Cpusets, CFS, memory
+          controls or device isolation.
+          See
+                - Documentation/cpusets.txt     (Cpusets)
+                - Documentation/scheduler/sched-design-CFS.txt  (CFS)
+                - Documentation/cgroups/ (features for grouping, isolation)
+                - Documentation/controllers/ (features for resource control)
+          Say N if unsure.
+config CGROUP_DEBUG
+        bool "Example debug cgroup subsystem"
+        depends on CGROUPS
+        default n
+        help
+          This option enables a simple cgroup subsystem that
+          exports useful debugging information about the cgroups
+          framework
+          Say N if unsure
+config CGROUP_NS
+        bool "Namespace cgroup subsystem"
+        depends on CGROUPS
+        help
+          Provides a simple namespace cgroup subsystem to
+          provide hierarchical naming of sets of namespaces,
+          for instance virtual servers and checkpoint/restart
+          jobs.
+config CGROUP_FREEZER
+        bool "control group freezer subsystem"
+        depends on CGROUPS
+        help
+          Provides a way to freeze and unfreeze all tasks in a
+          cgroup.
+config CGROUP_DEVICE
+        bool "Device controller for cgroups"
+        depends on CGROUPS && EXPERIMENTAL
+        help
+          Provides a cgroup implementing whitelists for devices which
+          a process in the cgroup can mknod or open.
+config CPUSETS
+        bool "Cpuset support"
+        depends on SMP && CGROUPS
+        help
+          This option will let you create and manage CPUSETs which
+          allow dynamically partitioning a system into sets of CPUs and
+          Memory Nodes and assigning tasks to run only within those sets.
+          This is primarily useful on large SMP or NUMA systems.
+          Say N if unsure.
 config CGROUP_CPUACCT
        bool "Simple CPU accounting cgroup subsystem"
        depends on CGROUPS
@@ -393,9 +402,6 @@ config RESOURCE_COUNTERS
          infrastructure that works with cgroups
        depends on CGROUPS
-config MM_OWNER
-        bool
 config CGROUP_MEM_RES_CTLR
        bool "Memory Resource Controller for Control Groups"
        depends on CGROUPS && RESOURCE_COUNTERS
@@ -414,11 +420,33 @@ config CGROUP_MEM_RES_CTLR
          sure you need the memory resource controller. Even when you enable
          this, you can set "cgroup_disable=memory" at your boot option to
          disable memory resource controller and you can avoid overheads.
-          (and lose benefits of memory resource contoller)
+          (and lose benefits of memory resource controller)
          This config option also selects MM_OWNER config option, which
          could in turn add some fork/exit overhead.
+config MM_OWNER
+        bool
+config CGROUP_MEM_RES_CTLR_SWAP
+        bool "Memory Resource Controller Swap Extension(EXPERIMENTAL)"
+        depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL
+        help
+          Add swap management feature to memory resource controller. When you
+          enable this, you can limit mem+swap usage per cgroup. In other words,
+          when you disable this, memory resource controller does not care about
+          usage of swap...a process can exhaust all of the swap. This extension
+          is useful when you want to avoid exhaustion of swap but this itself
+          adds more overheads and consumes memory for remembering information.
+          Especially if you use 32bit system or small memory system, please
+          be careful about enabling this. When memory resource controller
+          is disabled by boot option, this will be automatically disabled and
+          there will be no overhead from this. Even when you set this config=y,
+          if boot option "noswapaccount" is set, swap will not be accounted.
+endmenu
 config SYSFS_DEPRECATED
        bool
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index eddb6247a553..23fdb8492b8e 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -505,7 +505,8 @@ static void __do_notify(struct mqueue_inode_info *info)
                        sig_i.si_errno = 0;
                        sig_i.si_code = SI_MESGQ;
                        sig_i.si_value = info->notify.sigev_value;
-                        sig_i.si_pid = task_tgid_vnr(current);
+                        sig_i.si_pid = task_tgid_nr_ns(current,
+                                                ns_of_pid(info->notify_owner));
                        sig_i.si_uid = current_uid();
                        kill_pid_info(info->notify.sigev_signo,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f221446aa02d..c29831076e7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
        /* Tracks how many cgroups are currently defined in hierarchy.*/
        int number_of_cgroups;
-        /* A list running through the mounted hierarchies */
+        /* A list running through the active hierarchies */
        struct list_head root_list;
        /* Hierarchy-specific flags */
@@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
 #define for_each_subsys(_root, _ss) \
 list_for_each_entry(_ss, &_root->subsys_list, sibling)
-/* for_each_root() allows you to iterate across the active hierarchies */
+/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_root(_root) \
+#define for_each_active_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 /* the list of cgroups eligible for automatic release. Protected by
@@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
        rcu_read_lock();
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                struct cgroup *cgrp = cg->subsys[i]->cgroup;
+                struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
@@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
        return 0;
 }
+/**
+ * link_css_set - a helper function to link a css_set to a cgroup
+ * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
+ * @cg: the css_set to be linked
+ * @cgrp: the destination cgroup
+ */
+static void link_css_set(struct list_head *tmp_cg_links,
+                         struct css_set *cg, struct cgroup *cgrp)
+{
+        struct cg_cgroup_link *link;
+        BUG_ON(list_empty(tmp_cg_links));
+        link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
+                                cgrp_link_list);
+        link->cg = cg;
+        list_move(&link->cgrp_link_list, &cgrp->css_sets);
+        list_add(&link->cg_link_list, &cg->cg_links);
+}
 /*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
@@ -399,7 +418,6 @@ static struct css_set *find_css_set(
        int i;
        struct list_head tmp_cg_links;
-        struct cg_cgroup_link *link;
        struct hlist_head *hhead;
@@ -444,26 +462,11 @@ static struct css_set *find_css_set(
                 * only do it for the first subsystem in each
                 * hierarchy
                 */
-                if (ss->root->subsys_list.next == &ss->sibling) {
+                if (ss->root->subsys_list.next == &ss->sibling)
-                        BUG_ON(list_empty(&tmp_cg_links));
+                        link_css_set(&tmp_cg_links, res, cgrp);
-                        link = list_entry(tmp_cg_links.next,
-                                          struct cg_cgroup_link,
-                                          cgrp_link_list);
-                        list_del(&link->cgrp_link_list);
-                        list_add(&link->cgrp_link_list, &cgrp->css_sets);
-                        link->cg = res;
-                        list_add(&link->cg_link_list, &res->cg_links);
-                }
-        }
-        if (list_empty(&rootnode.subsys_list)) {
-                link = list_entry(tmp_cg_links.next,
-                                  struct cg_cgroup_link,
-                                  cgrp_link_list);
-                list_del(&link->cgrp_link_list);
-                list_add(&link->cgrp_link_list, &dummytop->css_sets);
-                link->cg = res;
-                list_add(&link->cg_link_list, &res->cg_links);
        }
+        if (list_empty(&rootnode.subsys_list))
+                link_css_set(&tmp_cg_links, res, dummytop);
        BUG_ON(!list_empty(&tmp_cg_links));
@@ -586,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
        struct cgroup_subsys *ss;
        for_each_subsys(cgrp->root, ss)
-                if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+                if (ss->pre_destroy)
                        ss->pre_destroy(ss, cgrp);
        return;
 }
+static void free_cgroup_rcu(struct rcu_head *obj)
+{
+        struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+        kfree(cgrp);
+}
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
        /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -610,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                /*
                 * Release the subsystem state objects.
                 */
-                for_each_subsys(cgrp->root, ss) {
+                for_each_subsys(cgrp->root, ss)
-                        if (cgrp->subsys[ss->subsys_id])
+                        ss->destroy(ss, cgrp);
-                                ss->destroy(ss, cgrp);
-                }
                cgrp->root->number_of_cgroups--;
                mutex_unlock(&cgroup_mutex);
-                /* Drop the active superblock reference that we took when we
+                /*
-                 * created the cgroup */
+                 * Drop the active superblock reference that we took when we
+                 * created the cgroup
+                 */
                deactivate_super(cgrp->root->sb);
-                kfree(cgrp);
+                call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
        }
        iput(inode);
 }
@@ -712,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                        BUG_ON(cgrp->subsys[i]);
                        BUG_ON(!dummytop->subsys[i]);
                        BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+                        mutex_lock(&ss->hierarchy_mutex);
                        cgrp->subsys[i] = dummytop->subsys[i];
                        cgrp->subsys[i]->cgroup = cgrp;
-                        list_add(&ss->sibling, &root->subsys_list);
+                        list_move(&ss->sibling, &root->subsys_list);
-                        rcu_assign_pointer(ss->root, root);
+                        ss->root = root;
                        if (ss->bind)
                                ss->bind(ss, cgrp);
+                        mutex_unlock(&ss->hierarchy_mutex);
                } else if (bit & removed_bits) {
                        /* We're removing this subsystem */
                        BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                        BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+                        mutex_lock(&ss->hierarchy_mutex);
                        if (ss->bind)
                                ss->bind(ss, dummytop);
                        dummytop->subsys[i]->cgroup = dummytop;
                        cgrp->subsys[i] = NULL;
-                        rcu_assign_pointer(subsys[i]->root, &rootnode);
+                        subsys[i]->root = &rootnode;
-                        list_del(&ss->sibling);
+                        list_move(&ss->sibling, &rootnode.subsys_list);
+                        mutex_unlock(&ss->hierarchy_mutex);
                } else if (bit & final_bits) {
                        /* Subsystem state should already exist */
                        BUG_ON(!cgrp->subsys[i]);
@@ -990,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                root = NULL;
        } else {
                /* New superblock */
-                struct cgroup *cgrp = &root->top_cgroup;
+                struct cgroup *root_cgrp = &root->top_cgroup;
                struct inode *inode;
                int i;
@@ -1031,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                list_add(&root->root_list, &roots);
                root_count++;
-                sb->s_root->d_fsdata = &root->top_cgroup;
+                sb->s_root->d_fsdata = root_cgrp;
                root->top_cgroup.dentry = sb->s_root;
                /* Link the top cgroup in this hierarchy into all
@@ -1042,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                        struct hlist_node *node;
                        struct css_set *cg;
-                        hlist_for_each_entry(cg, node, hhead, hlist) {
+                        hlist_for_each_entry(cg, node, hhead, hlist)
-                                struct cg_cgroup_link *link;
+                                link_css_set(&tmp_cg_links, cg, root_cgrp);
-                                BUG_ON(list_empty(&tmp_cg_links));
-                                link = list_entry(tmp_cg_links.next,
-                                                  struct cg_cgroup_link,
-                                                  cgrp_link_list);
-                                list_del(&link->cgrp_link_list);
-                                link->cg = cg;
-                                list_add(&link->cgrp_link_list,
-                                         &root->top_cgroup.css_sets);
-                                list_add(&link->cg_link_list, &cg->cg_links);
-                        }
                }
                write_unlock(&css_set_lock);
                free_cg_links(&tmp_cg_links);
-                BUG_ON(!list_empty(&cgrp->sibling));
+                BUG_ON(!list_empty(&root_cgrp->sibling));
-                BUG_ON(!list_empty(&cgrp->children));
+                BUG_ON(!list_empty(&root_cgrp->children));
                BUG_ON(root->number_of_cgroups != 1);
-                cgroup_populate_dir(cgrp);
+                cgroup_populate_dir(root_cgrp);
                mutex_unlock(&inode->i_mutex);
                mutex_unlock(&cgroup_mutex);
        }
@@ -1113,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
        }
        write_unlock(&css_set_lock);
-        if (!list_empty(&root->root_list)) {
+        list_del(&root->root_list);
-                list_del(&root->root_list);
+        root_count--;
-                root_count--;
-        }
        mutex_unlock(&cgroup_mutex);
        kfree(root);
@@ -1145,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
- * Called with cgroup_mutex held. Writes path of cgroup into buf.
+ * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * Returns 0 on success, -errno on error.
+ * reference.  Writes path of cgroup into buf.  Returns 0 on success,
+ * -errno on error.
 */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
        char *start;
+        struct dentry *dentry = rcu_dereference(cgrp->dentry);
-        if (cgrp == dummytop) {
+        if (!dentry || cgrp == dummytop) {
                /*
                 * Inactive subsystems have no dentry for their root
                 * cgroup
@@ -1165,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
        *--start = '\0';
        for (;;) {
-                int len = cgrp->dentry->d_name.len;
+                int len = dentry->d_name.len;
                if ((start -= len) < buf)
                        return -ENAMETOOLONG;
                memcpy(start, cgrp->dentry->d_name.name, len);
                cgrp = cgrp->parent;
                if (!cgrp)
                        break;
+                dentry = rcu_dereference(cgrp->dentry);
                if (!cgrp->parent)
                        continue;
                if (--start < buf)
@@ -1216,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        int retval = 0;
        struct cgroup_subsys *ss;
        struct cgroup *oldcgrp;
-        struct css_set *cg = tsk->cgroups;
+        struct css_set *cg;
        struct css_set *newcg;
        struct cgroupfs_root *root = cgrp->root;
        int subsys_id;
@@ -1236,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                }
        }
+        task_lock(tsk);
+        cg = tsk->cgroups;
+        get_css_set(cg);
+        task_unlock(tsk);
        /*
         * Locate or allocate a new css_set for this task,
         * based on its final set of cgroups
         */
        newcg = find_css_set(cg, cgrp);
+        put_css_set(cg);
        if (!newcg)
                return -ENOMEM;
@@ -1445,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
        struct cftype *cft = __d_cft(file->f_dentry);
        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-        if (!cft || cgroup_is_removed(cgrp))
+        if (cgroup_is_removed(cgrp))
                return -ENODEV;
        if (cft->write)
                return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1490,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
        struct cftype *cft = __d_cft(file->f_dentry);
        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-        if (!cft || cgroup_is_removed(cgrp))
+        if (cgroup_is_removed(cgrp))
                return -ENODEV;
        if (cft->read)
@@ -1554,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
        err = generic_file_open(inode, file);
        if (err)
                return err;
        cft = __d_cft(file->f_dentry);
-        if (!cft)
-                return -ENODEV;
        if (cft->read_map || cft->read_seq_string) {
                struct cgroup_seqfile_state *state =
                        kzalloc(sizeof(*state), GFP_USER);
@@ -1671,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
        if (!error) {
                dentry->d_fsdata = cgrp;
                inc_nlink(parent->d_inode);
-                cgrp->dentry = dentry;
+                rcu_assign_pointer(cgrp->dentry, dentry);
                dget(dentry);
        }
        dput(dentry);
@@ -1812,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 {
        struct task_struct *res;
        struct list_head *l = it->task;
+        struct cg_cgroup_link *link;
        /* If the iterator cg is NULL, we have no tasks */
        if (!it->cg_link)
@@ -1819,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
        res = list_entry(l, struct task_struct, cg_list);
        /* Advance iterator to find next entry */
        l = l->next;
-        if (l == &res->cgroups->tasks) {
+        link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
+        if (l == &link->cg->tasks) {
                /* We reached the end of this task list - move on to
                 * the next cg_cgroup_link */
                cgroup_advance_iter(cgrp, it);
@@ -2013,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 */
 static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
 {
-        int n = 0;
+        int n = 0, pid;
        struct cgroup_iter it;
        struct task_struct *tsk;
        cgroup_iter_start(cgrp, &it);
        while ((tsk = cgroup_iter_next(cgrp, &it))) {
                if (unlikely(n == npids))
                        break;
-                pidarray[n++] = task_pid_vnr(tsk);
+                pid = task_pid_vnr(tsk);
+                if (pid > 0)
+                        pidarray[n++] = pid;
        }
        cgroup_iter_end(cgrp, &it);
        return n;
@@ -2052,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
        ret = 0;
        cgrp = dentry->d_fsdata;
-        rcu_read_lock();
        cgroup_iter_start(cgrp, &it);
        while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2077,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
        }
        cgroup_iter_end(cgrp, &it);
-        rcu_read_unlock();
 err:
        return ret;
 }
@@ -2324,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
                               struct cgroup *cgrp)
 {
        css->cgroup = cgrp;
-        atomic_set(&css->refcnt, 0);
+        atomic_set(&css->refcnt, 1);
        css->flags = 0;
        if (cgrp == dummytop)
                set_bit(CSS_ROOT, &css->flags);
@@ -2332,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
        cgrp->subsys[ss->subsys_id] = css;
 }
+static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
+{
+        /* We need to take each hierarchy_mutex in a consistent order */
+        int i;
+        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                struct cgroup_subsys *ss = subsys[i];
+                if (ss->root == root)
+                        mutex_lock_nested(&ss->hierarchy_mutex, i);
+        }
+}
+static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
+{
+        int i;
+        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                struct cgroup_subsys *ss = subsys[i];
+                if (ss->root == root)
+                        mutex_unlock(&ss->hierarchy_mutex);
+        }
+}
 /*
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
@@ -2380,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                init_cgroup_css(css, ss, cgrp);
        }
+        cgroup_lock_hierarchy(root);
        list_add(&cgrp->sibling, &cgrp->parent->children);
+        cgroup_unlock_hierarchy(root);
        root->number_of_cgroups++;
        err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2431,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
        /* Check the reference count on each subsystem. Since we
         * already established that there are no tasks in the
-         * cgroup, if the css refcount is also 0, then there should
+         * cgroup, if the css refcount is also 1, then there should
         * be no outstanding references, so the subsystem is safe to
         * destroy. We scan across all subsystems rather than using
         * the per-hierarchy linked list of mounted subsystems since
@@ -2452,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
                 * matter, since it can only happen if the cgroup
                 * has been deleted and hence no longer needs the
                 * release agent to be called anyway. */
-                if (css && atomic_read(&css->refcnt))
+                if (css && (atomic_read(&css->refcnt) > 1))
                        return 1;
        }
        return 0;
 }
+/*
+ * Atomically mark all (or else none) of the cgroup's CSS objects as
+ * CSS_REMOVED. Return true on success, or false if the cgroup has
+ * busy subsystems. Call with cgroup_mutex held
+ */
+static int cgroup_clear_css_refs(struct cgroup *cgrp)
+{
+        struct cgroup_subsys *ss;
+        unsigned long flags;
+        bool failed = false;
+        local_irq_save(flags);
+        for_each_subsys(cgrp->root, ss) {
+                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+                int refcnt;
+                do {
+                        /* We can only remove a CSS with a refcnt==1 */
+                        refcnt = atomic_read(&css->refcnt);
+                        if (refcnt > 1) {
+                                failed = true;
+                                goto done;
+                        }
+                        BUG_ON(!refcnt);
+                        /*
+                         * Drop the refcnt to 0 while we check other
+                         * subsystems. This will cause any racing
+                         * css_tryget() to spin until we set the
+                         * CSS_REMOVED bits or abort
+                         */
+                } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+        }
+ done:
+        for_each_subsys(cgrp->root, ss) {
+                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+                if (failed) {
+                        /*
+                         * Restore old refcnt if we previously managed
+                         * to clear it from 1 to 0
+                         */
+                        if (!atomic_read(&css->refcnt))
+                                atomic_set(&css->refcnt, 1);
+                } else {
+                        /* Commit the fact that the CSS is removed */
+                        set_bit(CSS_REMOVED, &css->flags);
+                }
+        }
+        local_irq_restore(flags);
+        return !failed;
+}
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
        struct cgroup *cgrp = dentry->d_fsdata;
        struct dentry *d;
        struct cgroup *parent;
-        struct super_block *sb;
-        struct cgroupfs_root *root;
        /* the vfs holds both inode->i_mutex already */
@@ -2487,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        mutex_lock(&cgroup_mutex);
        parent = cgrp->parent;
-        root = cgrp->root;
-        sb = root->sb;
        if (atomic_read(&cgrp->count)
            || !list_empty(&cgrp->children)
-            || cgroup_has_css_refs(cgrp)) {
+            || !cgroup_clear_css_refs(cgrp)) {
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
        }
@@ -2502,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        if (!list_empty(&cgrp->release_list))
                list_del(&cgrp->release_list);
        spin_unlock(&release_list_lock);
-        /* delete my sibling from parent->children */
+        cgroup_lock_hierarchy(cgrp->root);
+        /* delete this cgroup from parent->children */
        list_del(&cgrp->sibling);
+        cgroup_unlock_hierarchy(cgrp->root);
        spin_lock(&cgrp->dentry->d_lock);
        d = dget(cgrp->dentry);
        spin_unlock(&d->d_lock);
@@ -2525,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
        printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
        /* Create the top cgroup state for this subsystem */
+        list_add(&ss->sibling, &rootnode.subsys_list);
        ss->root = &rootnode;
        css = ss->create(ss, dummytop);
        /* We don't handle early failures gracefully */
@@ -2544,6 +2629,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         * need to invoke fork callbacks here. */
        BUG_ON(!list_empty(&init_task.tasks));
+        mutex_init(&ss->hierarchy_mutex);
        ss->active = 1;
 }
@@ -2562,7 +2648,6 @@ int __init cgroup_init_early(void)
        INIT_HLIST_NODE(&init_css_set.hlist);
        css_set_count = 1;
        init_cgroup_root(&rootnode);
-        list_add(&rootnode.root_list, &roots);
        root_count = 1;
        init_task.cgroups = &init_css_set;
@@ -2669,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
        mutex_lock(&cgroup_mutex);
-        for_each_root(root) {
+        for_each_active_root(root) {
                struct cgroup_subsys *ss;
                struct cgroup *cgrp;
                int subsys_id;
                int count = 0;
-                /* Skip this hierarchy if it has no active subsystems */
-                if (!root->actual_subsys_bits)
-                        continue;
                seq_printf(m, "%lu:", root->subsys_bits);
                for_each_subsys(root, ss)
                        seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2800,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
 {
        if (use_task_css_set_links) {
                write_lock(&css_set_lock);
+                task_lock(child);
                if (list_empty(&child->cg_list))
                        list_add(&child->cg_list, &child->cgroups->tasks);
+                task_unlock(child);
                write_unlock(&css_set_lock);
        }
 }
@@ -2907,6 +2991,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
                mutex_unlock(&cgroup_mutex);
                return 0;
        }
+        task_lock(tsk);
        cg = tsk->cgroups;
        parent = task_cgroup(tsk, subsys->subsys_id);
@@ -2919,6 +3004,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
        /* Keep the cgroup alive */
        get_css_set(cg);
+        task_unlock(tsk);
        mutex_unlock(&cgroup_mutex);
        /* Now do the VFS work to create a cgroup */
@@ -2937,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
        }
        /* Create the cgroup directory, which also creates the cgroup */
-        ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+        ret = vfs_mkdir(inode, dentry, 0755);
        child = __d_cgrp(dentry);
        dput(dentry);
        if (ret) {
@@ -2947,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
                goto out_release;
        }
-        if (!child) {
-                printk(KERN_INFO
-                       "Couldn't find new cgroup %s\n", nodename);
-                ret = -ENOMEM;
-                goto out_release;
-        }
        /* The cgroup now exists. Retake cgroup_mutex and check
         * that we're still in the same state that we thought we
         * were. */
@@ -3049,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
 {
        struct cgroup *cgrp = css->cgroup;
        rcu_read_lock();
-        if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
+        if ((atomic_dec_return(&css->refcnt) == 1) &&
+            notify_on_release(cgrp)) {
                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                check_for_release(cgrp);
        }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 345ace5117de..647c77a88fcb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -84,7 +84,7 @@ struct cpuset {
        struct cgroup_subsys_state css;
        unsigned long flags;            /* "unsigned long" so bitops work */
-        cpumask_t cpus_allowed;         /* CPUs allowed to tasks in cpuset */
+        cpumask_var_t cpus_allowed;     /* CPUs allowed to tasks in cpuset */
        nodemask_t mems_allowed;        /* Memory Nodes allowed to tasks */
        struct cpuset *parent;          /* my parent */
@@ -195,8 +195,6 @@ static int cpuset_mems_generation;
 static struct cpuset top_cpuset = {
        .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
-        .cpus_allowed = CPU_MASK_ALL,
-        .mems_allowed = NODE_MASK_ALL,
 };
 /*
@@ -278,7 +276,7 @@ static struct file_system_type cpuset_fs_type = {
 };
 /*
- * Return in *pmask the portion of a cpusets's cpus_allowed that
+ * Return in pmask the portion of a cpusets's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
@@ -291,15 +289,16 @@ static struct file_system_type cpuset_fs_type = {
 * Call with callback_mutex held.
 */
-static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
+static void guarantee_online_cpus(const struct cpuset *cs,
+                                  struct cpumask *pmask)
 {
-        while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
+        while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
                cs = cs->parent;
        if (cs)
-                cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
+                cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
        else
-                *pmask = cpu_online_map;
+                cpumask_copy(pmask, cpu_online_mask);
-        BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
+        BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
 }
 /*
@@ -375,14 +374,9 @@ void cpuset_update_task_memory_state(void)
        struct task_struct *tsk = current;
        struct cpuset *cs;
-        if (task_cs(tsk) == &top_cpuset) {
+        rcu_read_lock();
-                /* Don't need rcu for top_cpuset.  It's never freed. */
+        my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-                my_cpusets_mem_gen = top_cpuset.mems_generation;
+        rcu_read_unlock();
-        } else {
-                rcu_read_lock();
-                my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-                rcu_read_unlock();
-        }
        if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
                mutex_lock(&callback_mutex);
@@ -414,12 +408,43 @@ void cpuset_update_task_memory_state(void)
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 {
-        return  cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
+        return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
                nodes_subset(p->mems_allowed, q->mems_allowed) &&
                is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
                is_mem_exclusive(p) <= is_mem_exclusive(q);
 }
+/**
+ * alloc_trial_cpuset - allocate a trial cpuset
+ * @cs: the cpuset that the trial cpuset duplicates
+ *
+ * The trial cpuset starts out as a byte copy of @cs (kmemdup).  A separate
+ * cpus_allowed mask is then allocated for it and filled with the contents
+ * of @cs->cpus_allowed.
+ *
+ * Returns the new trial cpuset, or NULL if either allocation fails (the
+ * partially built copy is freed in that case).  Release the result with
+ * free_trial_cpuset().
+ */
+static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
+{
+        struct cpuset *trial;
+        trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+        if (!trial)
+                return NULL;
+        if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
+                kfree(trial);
+                return NULL;
+        }
+        cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+        return trial;
+}
+/**
+ * free_trial_cpuset - free the trial cpuset
+ * @trial: the trial cpuset to be freed
+ *
+ * Frees the trial's cpus_allowed mask first, then the cpuset structure
+ * itself.
+ */
+static void free_trial_cpuset(struct cpuset *trial)
+{
+        free_cpumask_var(trial->cpus_allowed);
+        kfree(trial);
+}
 /*
 * validate_change() - Used to validate that any proposed cpuset change
 *                     follows the structural rules for cpusets.
@@ -469,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
                c = cgroup_cs(cont);
                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
                    c != cur &&
-                    cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
+                    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
                        return -EINVAL;
                if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
                    c != cur &&
@@ -479,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
        /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
        if (cgroup_task_count(cur->css.cgroup)) {
-                if (cpus_empty(trial->cpus_allowed) ||
+                if (cpumask_empty(trial->cpus_allowed) ||
                    nodes_empty(trial->mems_allowed)) {
                        return -ENOSPC;
                }
@@ -494,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
-        return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
+        return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
 }
 static void
@@ -519,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
                cp = list_first_entry(&q, struct cpuset, stack_list);
                list_del(q.next);
-                if (cpus_empty(cp->cpus_allowed))
+                if (cpumask_empty(cp->cpus_allowed))
                        continue;
                if (is_sched_load_balance(cp))
@@ -586,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 *      element of the partition (one sched domain) to be passed to
 *      partition_sched_domains().
 */
-static int generate_sched_domains(cpumask_t **domains,
+/* FIXME: see the FIXME in partition_sched_domains() */
+static int generate_sched_domains(struct cpumask **domains,
                        struct sched_domain_attr **attributes)
 {
        LIST_HEAD(q);           /* queue of cpusets to be scanned */
@@ -594,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains,
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
        int i, j, k;            /* indices for partition finding loops */
-        cpumask_t *doms;        /* resulting partition; i.e. sched domains */
+        struct cpumask *doms;   /* resulting partition; i.e. sched domains */
        struct sched_domain_attr *dattr;  /* attributes for custom domains */
        int ndoms = 0;          /* number of sched domains in result */
-        int nslot;              /* next empty doms[] cpumask_t slot */
+        int nslot;              /* next empty doms[] struct cpumask slot */
        doms = NULL;
        dattr = NULL;
@@ -605,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains,
        /* Special case for the 99% of systems with one, full, sched domain */
        if (is_sched_load_balance(&top_cpuset)) {
-                doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+                doms = kmalloc(cpumask_size(), GFP_KERNEL);
                if (!doms)
                        goto done;
@@ -614,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains,
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
-                *doms = top_cpuset.cpus_allowed;
+                cpumask_copy(doms, top_cpuset.cpus_allowed);
                ndoms = 1;
                goto done;
@@ -633,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains,
                cp = list_first_entry(&q, struct cpuset, stack_list);
                list_del(q.next);
-                if (cpus_empty(cp->cpus_allowed))
+                if (cpumask_empty(cp->cpus_allowed))
                        continue;
                /*
@@ -684,7 +710,7 @@ restart:
         * Now we know how many domains to create.
         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
         */
-        doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
+        doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
        if (!doms)
                goto done;
@@ -696,7 +722,7 @@ restart:
        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
-                cpumask_t *dp;
+                struct cpumask *dp;
                int apn = a->pn;
                if (apn < 0) {
@@ -719,14 +745,14 @@ restart:
                        continue;
                }
-                cpus_clear(*dp);
+                cpumask_clear(dp);
                if (dattr)
                        *(dattr + nslot) = SD_ATTR_INIT;
                for (j = i; j < csn; j++) {
                        struct cpuset *b = csa[j];
                        if (apn == b->pn) {
-                                cpus_or(*dp, *dp, b->cpus_allowed);
+                                cpumask_or(dp, dp, b->cpus_allowed);
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, b);
@@ -766,7 +792,7 @@ done:
 static void do_rebuild_sched_domains(struct work_struct *unused)
 {
        struct sched_domain_attr *attr;
-        cpumask_t *doms;
+        struct cpumask *doms;
        int ndoms;
        get_online_cpus();
@@ -835,7 +861,7 @@ void rebuild_sched_domains(void)
 static int cpuset_test_cpumask(struct task_struct *tsk,
                               struct cgroup_scanner *scan)
 {
-        return !cpus_equal(tsk->cpus_allowed,
+        return !cpumask_equal(&tsk->cpus_allowed,
                        (cgroup_cs(scan->cg))->cpus_allowed);
 }
@@ -853,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
 static void cpuset_change_cpumask(struct task_struct *tsk,
                                  struct cgroup_scanner *scan)
 {
-        set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
+        set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
 }
 /**
@@ -885,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 * @cs: the cpuset to consider
 * @buf: buffer of cpu numbers written to this cpuset
 */
-static int update_cpumask(struct cpuset *cs, const char *buf)
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+                          const char *buf)
 {
        struct ptr_heap heap;
-        struct cpuset trialcs;
        int retval;
        int is_load_balanced;
@@ -896,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
        if (cs == &top_cpuset)
                return -EACCES;
-        trialcs = *cs;
        /*
         * An empty cpus_allowed is ok only if the cpuset has no tasks.
         * Since cpulist_parse() fails on an empty mask, we special case
@@ -905,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
         * with tasks have cpus.
         */
        if (!*buf) {
-                cpus_clear(trialcs.cpus_allowed);
+                cpumask_clear(trialcs->cpus_allowed);
        } else {
-                retval = cpulist_parse(buf, &trialcs.cpus_allowed);
+                retval = cpulist_parse(buf, trialcs->cpus_allowed);
                if (retval < 0)
                        return retval;
-                if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
+                if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
                        return -EINVAL;
        }
-        retval = validate_change(cs, &trialcs);
+        retval = validate_change(cs, trialcs);
        if (retval < 0)
                return retval;
        /* Nothing to do if the cpus didn't change */
-        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+        if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
                return 0;
        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
        if (retval)
                return retval;
-        is_load_balanced = is_sched_load_balance(&trialcs);
+        is_load_balanced = is_sched_load_balance(trialcs);
        mutex_lock(&callback_mutex);
-        cs->cpus_allowed = trialcs.cpus_allowed;
+        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
        mutex_unlock(&callback_mutex);
        /*
@@ -1017,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
        cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
        fudge = 10;                             /* spare mmarray[] slots */
-        fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+        fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
        retval = -ENOMEM;
        /*
@@ -1104,9 +1128,9 @@ done:
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */
-static int update_nodemask(struct cpuset *cs, const char *buf)
+static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
+                           const char *buf)
 {
-        struct cpuset trialcs;
        nodemask_t oldmem;
        int retval;
@@ -1117,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
        if (cs == &top_cpuset)
                return -EACCES;
-        trialcs = *cs;
        /*
         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
         * Since nodelist_parse() fails on an empty mask, we special case
@@ -1126,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
         * with tasks have memory.
         */
        if (!*buf) {
-                nodes_clear(trialcs.mems_allowed);
+                nodes_clear(trialcs->mems_allowed);
        } else {
-                retval = nodelist_parse(buf, trialcs.mems_allowed);
+                retval = nodelist_parse(buf, trialcs->mems_allowed);
                if (retval < 0)
                        goto done;
-                if (!nodes_subset(trialcs.mems_allowed,
+                if (!nodes_subset(trialcs->mems_allowed,
                                node_states[N_HIGH_MEMORY]))
                        return -EINVAL;
        }
        oldmem = cs->mems_allowed;
-        if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+        if (nodes_equal(oldmem, trialcs->mems_allowed)) {
                retval = 0;             /* Too easy - nothing to do */
                goto done;
        }
-        retval = validate_change(cs, &trialcs);
+        retval = validate_change(cs, trialcs);
        if (retval < 0)
                goto done;
        mutex_lock(&callback_mutex);
-        cs->mems_allowed = trialcs.mems_allowed;
+        cs->mems_allowed = trialcs->mems_allowed;
        cs->mems_generation = cpuset_mems_generation++;
        mutex_unlock(&callback_mutex);
@@ -1167,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
-                if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+                if (!cpumask_empty(cs->cpus_allowed) &&
+                    is_sched_load_balance(cs))
                        async_rebuild_sched_domains();
        }
@@ -1186,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on)
 {
-        struct cpuset trialcs;
+        struct cpuset *trialcs;
        int err;
        int balance_flag_changed;
-        trialcs = *cs;
+        trialcs = alloc_trial_cpuset(cs);
+        if (!trialcs)
+                return -ENOMEM;
        if (turning_on)
-                set_bit(bit, &trialcs.flags);
+                set_bit(bit, &trialcs->flags);
        else
-                clear_bit(bit, &trialcs.flags);
+                clear_bit(bit, &trialcs->flags);
-        err = validate_change(cs, &trialcs);
+        err = validate_change(cs, trialcs);
        if (err < 0)
-                return err;
+                goto out;
        balance_flag_changed = (is_sched_load_balance(cs) !=
-                                        is_sched_load_balance(&trialcs));
+                                is_sched_load_balance(trialcs));
        mutex_lock(&callback_mutex);
-        cs->flags = trialcs.flags;
+        cs->flags = trialcs->flags;
        mutex_unlock(&callback_mutex);
-        if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
+        if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
                async_rebuild_sched_domains();
-        return 0;
+out:
+        free_trial_cpuset(trialcs);
+        return err;
 }
 /*
@@ -1311,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp)
        return val;
 }
+/* Protected by cgroup_lock */
+static cpumask_var_t cpus_attach;
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup_subsys *ss,
                             struct cgroup *cont, struct task_struct *tsk)
 {
        struct cpuset *cs = cgroup_cs(cont);
+        int ret = 0;
-        if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+        if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
                return -ENOSPC;
-        if (tsk->flags & PF_THREAD_BOUND) {
-                cpumask_t mask;
+        if (tsk->flags & PF_THREAD_BOUND) {
                mutex_lock(&callback_mutex);
-                mask = cs->cpus_allowed;
+                if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
+                        ret = -EINVAL;
                mutex_unlock(&callback_mutex);
-                if (!cpus_equal(tsk->cpus_allowed, mask))
-                        return -EINVAL;
        }
-        return security_task_setscheduler(tsk, 0, NULL);
+        return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
 }
 static void cpuset_attach(struct cgroup_subsys *ss,
                          struct cgroup *cont, struct cgroup *oldcont,
                          struct task_struct *tsk)
 {
-        cpumask_t cpus;
        nodemask_t from, to;
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);
        int err;
-        mutex_lock(&callback_mutex);
+        if (cs == &top_cpuset) {
-        guarantee_online_cpus(cs, &cpus);
+                cpumask_copy(cpus_attach, cpu_possible_mask);
-        err = set_cpus_allowed_ptr(tsk, &cpus);
+        } else {
-        mutex_unlock(&callback_mutex);
+                mutex_lock(&callback_mutex);
+                guarantee_online_cpus(cs, cpus_attach);
+                mutex_unlock(&callback_mutex);
+        }
+        err = set_cpus_allowed_ptr(tsk, cpus_attach);
        if (err)
                return;
@@ -1359,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
                        cpuset_migrate_mm(mm, &from, &to);
                mmput(mm);
        }
 }
 /* The various types of files and directories in a cpuset file system */
@@ -1454,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
                                const char *buf)
 {
        int retval = 0;
+        struct cpuset *cs = cgroup_cs(cgrp);
+        struct cpuset *trialcs;
        if (!cgroup_lock_live_group(cgrp))
                return -ENODEV;
+        /*
+         * The update helpers work on a heap-allocated scratch copy of the
+         * cpuset.  cgroup_mutex is already held at this point, so an
+         * allocation failure must leave through "out" to drop it again.
+         */
+        trialcs = alloc_trial_cpuset(cs);
+        if (!trialcs) {
+                retval = -ENOMEM;
+                goto out;
+        }
        switch (cft->private) {
        case FILE_CPULIST:
-                retval = update_cpumask(cgroup_cs(cgrp), buf);
+                retval = update_cpumask(cs, trialcs, buf);
                break;
        case FILE_MEMLIST:
-                retval = update_nodemask(cgroup_cs(cgrp), buf);
+                retval = update_nodemask(cs, trialcs, buf);
                break;
        default:
                retval = -EINVAL;
                break;
        }
+        free_trial_cpuset(trialcs);
+out:
        cgroup_unlock();
        return retval;
 }
@@ -1487,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
-        cpumask_t mask;
+        int ret;
        mutex_lock(&callback_mutex);
-        mask = cs->cpus_allowed;
+        ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
        mutex_unlock(&callback_mutex);
-        return cpulist_scnprintf(page, PAGE_SIZE, &mask);
+        return ret;
 }
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
@@ -1729,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
        parent_cs = cgroup_cs(parent);
        cs->mems_allowed = parent_cs->mems_allowed;
-        cs->cpus_allowed = parent_cs->cpus_allowed;
+        cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
        return;
 }
@@ -1755,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create(
        cs = kmalloc(sizeof(*cs), GFP_KERNEL);
        if (!cs)
                return ERR_PTR(-ENOMEM);
+        if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
+                kfree(cs);
+                return ERR_PTR(-ENOMEM);
+        }
        cpuset_update_task_memory_state();
        cs->flags = 0;
@@ -1763,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create(
        if (is_spread_slab(parent))
                set_bit(CS_SPREAD_SLAB, &cs->flags);
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-        cpus_clear(cs->cpus_allowed);
+        cpumask_clear(cs->cpus_allowed);
        nodes_clear(cs->mems_allowed);
        cs->mems_generation = cpuset_mems_generation++;
        fmeter_init(&cs->fmeter);
@@ -1790,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
        number_of_cpusets--;
+        free_cpumask_var(cs->cpus_allowed);
        kfree(cs);
 }
@@ -1813,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = {
 int __init cpuset_init_early(void)
 {
+        alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
        top_cpuset.mems_generation = cpuset_mems_generation++;
        return 0;
 }
@@ -1828,7 +1875,7 @@ int __init cpuset_init(void)
 {
        int err = 0;
-        cpus_setall(top_cpuset.cpus_allowed);
+        cpumask_setall(top_cpuset.cpus_allowed);
        nodes_setall(top_cpuset.mems_allowed);
        fmeter_init(&top_cpuset.fmeter);
@@ -1840,6 +1887,9 @@ int __init cpuset_init(void)
        if (err < 0)
                return err;
+        if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
+                BUG();
        number_of_cpusets = 1;
        return 0;
 }
@@ -1914,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
         * has online cpus, so can't be empty).
         */
        parent = cs->parent;
-        while (cpus_empty(parent->cpus_allowed) ||
+        while (cpumask_empty(parent->cpus_allowed) ||
                        nodes_empty(parent->mems_allowed))
                parent = parent->parent;
@@ -1955,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                }
                /* Continue past cpusets with all cpus, mems online */
-                if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
+                if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
@@ -1963,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
-                cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+                cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+                            cpu_online_mask);
                nodes_and(cp->mems_allowed, cp->mems_allowed,
                                                node_states[N_HIGH_MEMORY]);
                mutex_unlock(&callback_mutex);
                /* Move tasks from the empty cpuset to a parent */
-                if (cpus_empty(cp->cpus_allowed) ||
+                if (cpumask_empty(cp->cpus_allowed) ||
                     nodes_empty(cp->mems_allowed))
                        remove_tasks_in_empty_cpuset(cp);
                else {
@@ -1995,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
                                unsigned long phase, void *unused_cpu)
 {
        struct sched_domain_attr *attr;
-        cpumask_t *doms;
+        struct cpumask *doms;
        int ndoms;
        switch (phase) {
@@ -2010,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
        }
        cgroup_lock();
-        top_cpuset.cpus_allowed = cpu_online_map;
+        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
        scan_for_empty_cpusets(&top_cpuset);
        ndoms = generate_sched_domains(&doms, &attr);
        cgroup_unlock();
@@ -2055,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 void __init cpuset_init_smp(void)
 {
-        top_cpuset.cpus_allowed = cpu_online_map;
+        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
        hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2065,15 +2116,15 @@ void __init cpuset_init_smp(void)
 /**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
- * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
- * Description: Returns the cpumask_t cpus_allowed of the cpuset
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_map, even if this means going outside the
 * tasks cpuset.
 **/
-void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
        mutex_lock(&callback_mutex);
        cpuset_cpus_allowed_locked(tsk, pmask);
@@ -2084,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
 * Must be called with callback_mutex held.
 **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
 {
        task_lock(tsk);
        guarantee_online_cpus(task_cs(tsk), pmask);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7b8f2a78be3d..4018308048cf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1126,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (pid != &init_struct_pid) {
                retval = -ENOMEM;
-                pid = alloc_pid(task_active_pid_ns(p));
+                pid = alloc_pid(p->nsproxy->pid_ns);
                if (!pid)
                        goto bad_fork_cleanup_io;
                if (clone_flags & CLONE_NEWPID) {
-                        retval = pid_ns_prepare_proc(task_active_pid_ns(p));
+                        retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
                        if (retval < 0)
                                goto bad_fork_free_pid;
                }
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 43c2111cd54d..78bc3fdac0d2 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -13,7 +13,6 @@
 struct ns_cgroup {
        struct cgroup_subsys_state css;
-        spinlock_t lock;
 };
 struct cgroup_subsys ns_subsys;
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
        ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
        if (!ns_cgroup)
                return ERR_PTR(-ENOMEM);
-        spin_lock_init(&ns_cgroup->lock);
        return &ns_cgroup->css;
 }
diff --git a/kernel/pid.c b/kernel/pid.c
index af9224cdd6c0..1b3586fe753a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -474,6 +474,12 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 }
 EXPORT_SYMBOL(task_session_nr_ns);
+struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
+{
+        return ns_of_pid(task_pid(tsk));
+}
+EXPORT_SYMBOL_GPL(task_active_pid_ns);
 /*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index f275c8eca772..bf8e7534c803 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -15,10 +15,11 @@
 #include <linux/uaccess.h>
 #include <linux/mm.h>
-void res_counter_init(struct res_counter *counter)
+void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
        spin_lock_init(&counter->lock);
        counter->limit = (unsigned long long)LLONG_MAX;
+        counter->parent = parent;
 }
 int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
        return 0;
 }
-int res_counter_charge(struct res_counter *counter, unsigned long val)
+int res_counter_charge(struct res_counter *counter, unsigned long val,
+                        struct res_counter **limit_fail_at)
 {
        int ret;
        unsigned long flags;
+        struct res_counter *c, *u;
-        spin_lock_irqsave(&counter->lock, flags);
-        ret = res_counter_charge_locked(counter, val);
+        *limit_fail_at = NULL;
-        spin_unlock_irqrestore(&counter->lock, flags);
+        local_irq_save(flags);
+        for (c = counter; c != NULL; c = c->parent) {
+                spin_lock(&c->lock);
+                ret = res_counter_charge_locked(c, val);
+                spin_unlock(&c->lock);
+                if (ret < 0) {
+                        *limit_fail_at = c;
+                        goto undo;
+                }
+        }
+        ret = 0;
+        goto done;
+undo:
+        for (u = counter; u != c; u = u->parent) {
+                spin_lock(&u->lock);
+                res_counter_uncharge_locked(u, val);
+                spin_unlock(&u->lock);
+        }
+done:
+        local_irq_restore(flags);
        return ret;
 }
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 {
        unsigned long flags;
+        struct res_counter *c;
-        spin_lock_irqsave(&counter->lock, flags);
+        local_irq_save(flags);
-        res_counter_uncharge_locked(counter, val);
+        for (c = counter; c != NULL; c = c->parent) {
-        spin_unlock_irqrestore(&counter->lock, flags);
+                spin_lock(&c->lock);
+                res_counter_uncharge_locked(c, val);
+                spin_unlock(&c->lock);
+        }
+        local_irq_restore(flags);
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e0c0b4bc3f08..8e1352c75557 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1617,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
        }
 }
-#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
 /*
 * Share the fairness runtime between parent and child, thus the
 * total amount of pressure for CPU stays equal - new tasks
diff --git a/lib/sort.c b/lib/sort.c
index 6abbaf3d5858..926d00429ed2 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -32,11 +32,11 @@ static void generic_swap(void *a, void *b, int size)
 * @base: pointer to data to sort
 * @num: number of elements
 * @size: size of each element
- * @cmp: pointer to comparison function
+ * @cmp_func: pointer to comparison function
- * @swap: pointer to swap function or NULL
+ * @swap_func: pointer to swap function or NULL
 *
 * This function does a heapsort on the given array. You may provide a
- * swap function optimized to your element type.
+ * swap_func function optimized to your element type.
 *
 * Sorting time is O(n log n) both on average and worst-case. While
 * qsort is about 20% faster on average, it suffers from exploitable
@@ -45,37 +45,39 @@ static void generic_swap(void *a, void *b, int size)
 */
 void sort(void *base, size_t num, size_t size,
-          int (*cmp)(const void *, const void *),
+          int (*cmp_func)(const void *, const void *),
-          void (*swap)(void *, void *, int size))
+          void (*swap_func)(void *, void *, int size))
 {
        /* pre-scale counters for performance */
        int i = (num/2 - 1) * size, n = num * size, c, r;
-        if (!swap)
+        if (!swap_func)
-                swap = (size == 4 ? u32_swap : generic_swap);
+                swap_func = (size == 4 ? u32_swap : generic_swap);
        /* heapify */
        for ( ; i >= 0; i -= size) {
                for (r = i; r * 2 + size < n; r  = c) {
                        c = r * 2 + size;
-                        if (c < n - size && cmp(base + c, base + c + size) < 0)
+                        if (c < n - size &&
+                                        cmp_func(base + c, base + c + size) < 0)
                                c += size;
-                        if (cmp(base + r, base + c) >= 0)
+                        if (cmp_func(base + r, base + c) >= 0)
                                break;
-                        swap(base + r, base + c, size);
+                        swap_func(base + r, base + c, size);
                }
        }
        /* sort */
        for (i = n - size; i > 0; i -= size) {
-                swap(base, base + i, size);
+                swap_func(base, base + i, size);
                for (r = 0; r * 2 + size < i; r = c) {
                        c = r * 2 + size;
-                        if (c < i - size && cmp(base + c, base + c + size) < 0)
+                        if (c < i - size &&
+                                        cmp_func(base + c, base + c + size) < 0)
                                c += size;
-                        if (cmp(base + r, base + c) >= 0)
+                        if (cmp_func(base + r, base + c) >= 0)
                                break;
-                        swap(base + r, base + c, size);
+                        swap_func(base + r, base + c, size);
                }
        }
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 2f55a1e2baf7..ceba0bd03662 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
        VM_BUG_ON(!PageLocked(page));
        error = mem_cgroup_cache_charge(page, current->mm,
-                                        gfp_mask & ~__GFP_HIGHMEM);
+                                        gfp_mask & GFP_RECLAIM_MASK);
        if (error)
                goto out;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 51ee96545579..e2996b80601f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,11 +21,13 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -34,12 +36,23 @@
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
+#include "internal.h"
 #include <asm/uaccess.h>
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES      5
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
+int do_swap_account __read_mostly;
+static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+#else
+#define do_swap_account         (0)
+#endif
+static DEFINE_MUTEX(memcg_tasklist);    /* can be held under cgroup_mutex */
 /*
 * Statistics for memory cgroup.
 */
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu {
 } ____cacheline_aligned_in_smp;
 struct mem_cgroup_stat {
-        struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+        struct mem_cgroup_stat_cpu cpustat[0];
 };
 /*
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone {
        /*
         * spin_lock to protect the per cgroup LRU
         */
-        spinlock_t              lru_lock;
        struct list_head        lists[NR_LRU_LISTS];
        unsigned long           count[NR_LRU_LISTS];
+        struct zone_reclaim_stat reclaim_stat;
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
@@ -122,44 +136,73 @@ struct mem_cgroup {
         */
        struct res_counter res;
        /*
+         * the counter to account for mem+swap usage.
+         */
+        struct res_counter memsw;
+        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
         */
        struct mem_cgroup_lru_info info;
+        /*
+          protects the reclaim-related members.
+        */
+        spinlock_t reclaim_param_lock;
        int     prev_priority;  /* for recording reclaim priority */
+        /*
+         * While reclaiming in a hierarchy, we cache the last child we
+         * reclaimed from. Protected by hierarchy_mutex
+         */
+        struct mem_cgroup *last_scanned_child;
        /*
-         * statistics.
+         * Should the accounting and control be hierarchical, per subtree?
+         */
+        bool use_hierarchy;
+        unsigned long   last_oom_jiffies;
+        atomic_t        refcnt;
+        unsigned int    swappiness;
+        /*
+         * statistics. This must be placed at the end of memcg.
         */
        struct mem_cgroup_stat stat;
 };
-static struct mem_cgroup init_mem_cgroup;
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_MAPPED,
        MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
        MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
+        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
        NR_CHARGE_TYPE,
 };
 /* only for here (for easy reading.) */
 #define PCGF_CACHE      (1UL << PCG_CACHE)
 #define PCGF_USED       (1UL << PCG_USED)
-#define PCGF_ACTIVE     (1UL << PCG_ACTIVE)
 #define PCGF_LOCK       (1UL << PCG_LOCK)
-#define PCGF_FILE       (1UL << PCG_FILE)
 static const unsigned long
 pcg_default_flags[NR_CHARGE_TYPE] = {
-        PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
+        PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
-        PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
+        PCGF_USED | PCGF_LOCK, /* Anon */
-        PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+        PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
        0, /* FORCE */
 };
-/*
+/* for encoding cft->private value on file */
- * Always modified under lru lock. Then, not necessary to preempt_disable()
+#define _MEM                    (0)
- */
+#define _MEMSWAP                (1)
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val)       ((val) & 0xffff)
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                         struct page_cgroup *pc,
                                         bool charge)
@@ -167,10 +210,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        int val = (charge)? 1 : -1;
        struct mem_cgroup_stat *stat = &mem->stat;
        struct mem_cgroup_stat_cpu *cpustat;
+        int cpu = get_cpu();
-        VM_BUG_ON(!irqs_disabled());
+        cpustat = &stat->cpustat[cpu];
-        cpustat = &stat->cpustat[smp_processor_id()];
        if (PageCgroupCache(pc))
                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
        else
@@ -182,6 +224,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        else
                __mem_cgroup_stat_add_safe(cpustat,
                                MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+        put_cpu();
 }
 static struct mem_cgroup_per_zone *
@@ -197,6 +240,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
        int nid = page_cgroup_nid(pc);
        int zid = page_cgroup_zid(pc);
+        if (!mem)
+                return NULL;
        return mem_cgroup_zoneinfo(mem, nid, zid);
 }
@@ -236,77 +282,152 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
                                struct mem_cgroup, css);
 }
-static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
+static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
-                        struct page_cgroup *pc)
 {
-        int lru = LRU_BASE;
+        struct mem_cgroup *mem = NULL;
+        /*
+         * Because we hold no locks, mm->owner may be moving to another
+         * cgroup. We use css_tryget() here even if this looks
+         * pessimistic (rather than adding locks here).
+         */
+        rcu_read_lock();
+        do {
+                mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+                if (unlikely(!mem))
+                        break;
+        } while (!css_tryget(&mem->css));
+        rcu_read_unlock();
+        return mem;
+}
-        if (PageCgroupUnevictable(pc))
+static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
-                lru = LRU_UNEVICTABLE;
+{
-        else {
+        if (!mem)
-                if (PageCgroupActive(pc))
+                return true;
-                        lru += LRU_ACTIVE;
+        return css_is_removed(&mem->css);
-                if (PageCgroupFile(pc))
+}
-                        lru += LRU_FILE;
-        }
-        MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+/*
+ * Following LRU functions are allowed to be used without PCG_LOCK.
+ * Operations are called by routine of global LRU independently from memcg.
+ * What we have to take care of here is the validity of pc->mem_cgroup.
+ *
+ * Changes to pc->mem_cgroup happen when
+ * 1. charge
+ * 2. moving account
+ * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
+ * It is added to LRU before charge.
+ * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
+ * When moving account, the page is not on LRU. It's isolated.
+ */
-        mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
+void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
-        list_del(&pc->lru);
+{
+        struct page_cgroup *pc;
+        struct mem_cgroup *mem;
+        struct mem_cgroup_per_zone *mz;
+        if (mem_cgroup_disabled())
+                return;
+        pc = lookup_page_cgroup(page);
+        /* can happen while we handle swapcache. */
+        if (list_empty(&pc->lru) || !pc->mem_cgroup)
+                return;
+        /*
+         * We don't check PCG_USED bit. It's cleared when the "page" is finally
+         * removed from global LRU.
+         */
+        mz = page_cgroup_zoneinfo(pc);
+        mem = pc->mem_cgroup;
+        MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+        list_del_init(&pc->lru);
+        return;
 }
-static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
+void mem_cgroup_del_lru(struct page *page)
-                                struct page_cgroup *pc)
 {
-        int lru = LRU_BASE;
+        mem_cgroup_del_lru_list(page, page_lru(page));
+}
-        if (PageCgroupUnevictable(pc))
+void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
-                lru = LRU_UNEVICTABLE;
+{
-        else {
+        struct mem_cgroup_per_zone *mz;
-                if (PageCgroupActive(pc))
+        struct page_cgroup *pc;
-                        lru += LRU_ACTIVE;
-                if (PageCgroupFile(pc))
-                        lru += LRU_FILE;
-        }
-        MEM_CGROUP_ZSTAT(mz, lru) += 1;
+        if (mem_cgroup_disabled())
-        list_add(&pc->lru, &mz->lists[lru]);
+                return;
-        mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
+        pc = lookup_page_cgroup(page);
+        smp_rmb();
+        /* unused page is not rotated. */
+        if (!PageCgroupUsed(pc))
+                return;
+        mz = page_cgroup_zoneinfo(pc);
+        list_move(&pc->lru, &mz->lists[lru]);
 }
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
+void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 {
-        struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+        struct page_cgroup *pc;
-        int active    = PageCgroupActive(pc);
+        struct mem_cgroup_per_zone *mz;
-        int file      = PageCgroupFile(pc);
-        int unevictable = PageCgroupUnevictable(pc);
-        enum lru_list from = unevictable ? LRU_UNEVICTABLE :
-                                (LRU_FILE * !!file + !!active);
-        if (lru == from)
+        if (mem_cgroup_disabled())
+                return;
+        pc = lookup_page_cgroup(page);
+        /* barrier to sync with "charge" */
+        smp_rmb();
+        if (!PageCgroupUsed(pc))
                return;
-        MEM_CGROUP_ZSTAT(mz, from) -= 1;
+        mz = page_cgroup_zoneinfo(pc);
+        MEM_CGROUP_ZSTAT(mz, lru) += 1;
+        list_add(&pc->lru, &mz->lists[lru]);
+}
+/*
+ * When handling SwapCache, pc->mem_cgroup may change while it's linked to
+ * the LRU because the page may be reused after it's fully uncharged (because
+ * of SwapCache behavior). To handle that, unlink page_cgroup from the LRU when
+ * charging it again. This function is only used to charge SwapCache. It's done
+ * under lock_page and the caller is expected not to hold zone->lru_lock.
+ */
+static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
+{
+        unsigned long flags;
+        struct zone *zone = page_zone(page);
+        struct page_cgroup *pc = lookup_page_cgroup(page);
+        spin_lock_irqsave(&zone->lru_lock, flags);
        /*
-         * However this is done under mz->lru_lock, another flags, which
+         * Forget old LRU when this page_cgroup is *not* used. This Used bit
-         * are not related to LRU, will be modified from out-of-lock.
+         * is guarded by lock_page() because the page is SwapCache.
-         * We have to use atomic set/clear flags.
         */
-        if (is_unevictable_lru(lru)) {
+        if (!PageCgroupUsed(pc))
-                ClearPageCgroupActive(pc);
+                mem_cgroup_del_lru_list(page, page_lru(page));
-                SetPageCgroupUnevictable(pc);
+        spin_unlock_irqrestore(&zone->lru_lock, flags);
-        } else {
+}
-                if (is_active_lru(lru))
-                        SetPageCgroupActive(pc);
-                else
-                        ClearPageCgroupActive(pc);
-                ClearPageCgroupUnevictable(pc);
-        }
-        MEM_CGROUP_ZSTAT(mz, lru) += 1;
+static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
-        list_move(&pc->lru, &mz->lists[lru]);
+{
+        unsigned long flags;
+        struct zone *zone = page_zone(page);
+        struct page_cgroup *pc = lookup_page_cgroup(page);
+        spin_lock_irqsave(&zone->lru_lock, flags);
+        /* link when the page is linked to LRU but page_cgroup isn't */
+        if (PageLRU(page) && list_empty(&pc->lru))
+                mem_cgroup_add_lru_list(page, page_lru(page));
+        spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+void mem_cgroup_move_lists(struct page *page,
+                           enum lru_list from, enum lru_list to)
+{
+        if (mem_cgroup_disabled())
+                return;
+        mem_cgroup_del_lru_list(page, from);
+        mem_cgroup_add_lru_list(page, to);
 }
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -320,37 +441,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 }
 /*
- * This routine assumes that the appropriate zone's lru lock is already held
- */
-void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
-{
-        struct page_cgroup *pc;
-        struct mem_cgroup_per_zone *mz;
-        unsigned long flags;
-        if (mem_cgroup_subsys.disabled)
-                return;
-        /*
-         * We cannot lock_page_cgroup while holding zone's lru_lock,
-         * because other holders of lock_page_cgroup can be interrupted
-         * with an attempt to rotate_reclaimable_page.  But we cannot
-         * safely get to page_cgroup without it, so just try_lock it:
-         * mem_cgroup_isolate_pages allows for page left on wrong list.
-         */
-        pc = lookup_page_cgroup(page);
-        if (!trylock_page_cgroup(pc))
-                return;
-        if (pc && PageCgroupUsed(pc)) {
-                mz = page_cgroup_zoneinfo(pc);
-                spin_lock_irqsave(&mz->lru_lock, flags);
-                __mem_cgroup_move_lists(pc, lru);
-                spin_unlock_irqrestore(&mz->lru_lock, flags);
-        }
-        unlock_page_cgroup(pc);
-}
-/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for deteremining we have to reclaim mapped pages.
 */
@@ -372,39 +462,108 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 */
 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
 {
-        return mem->prev_priority;
+        int prev_priority;
+        spin_lock(&mem->reclaim_param_lock);
+        prev_priority = mem->prev_priority;
+        spin_unlock(&mem->reclaim_param_lock);
+        return prev_priority;
 }
 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
 {
+        spin_lock(&mem->reclaim_param_lock);
        if (priority < mem->prev_priority)
                mem->prev_priority = priority;
+        spin_unlock(&mem->reclaim_param_lock);
 }
 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
 {
+        spin_lock(&mem->reclaim_param_lock);
        mem->prev_priority = priority;
+        spin_unlock(&mem->reclaim_param_lock);
 }
-/*
+static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
- * Calculate # of pages to be scanned in this priority/zone.
+{
- * See also vmscan.c
+        unsigned long active;
- *
+        unsigned long inactive;
- * priority starts from "DEF_PRIORITY" and decremented in each loop.
+        unsigned long gb;
- * (see include/linux/mmzone.h)
+        unsigned long inactive_ratio;
- */
+        inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
+        active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
+        gb = (inactive + active) >> (30 - PAGE_SHIFT);
+        if (gb)
+                inactive_ratio = int_sqrt(10 * gb);
+        else
+                inactive_ratio = 1;
+        if (present_pages) {
+                present_pages[0] = inactive;
+                present_pages[1] = active;
+        }
+        return inactive_ratio;
+}
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
+{
+        unsigned long active;
+        unsigned long inactive;
+        unsigned long present_pages[2];
+        unsigned long inactive_ratio;
-long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+        inactive_ratio = calc_inactive_ratio(memcg, present_pages);
-                                        int priority, enum lru_list lru)
+        inactive = present_pages[0];
+        active = present_pages[1];
+        if (inactive * inactive_ratio < active)
+                return 1;
+        return 0;
+}
+unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
+                                       struct zone *zone,
+                                       enum lru_list lru)
 {
-        long nr_pages;
        int nid = zone->zone_pgdat->node_id;
        int zid = zone_idx(zone);
-        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
+        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-        nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
+        return MEM_CGROUP_ZSTAT(mz, lru);
+}
-        return (nr_pages >> priority);
+struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
+                                                      struct zone *zone)
+{
+        int nid = zone->zone_pgdat->node_id;
+        int zid = zone_idx(zone);
+        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+        return &mz->reclaim_stat;
+}
+struct zone_reclaim_stat *
+mem_cgroup_get_reclaim_stat_from_page(struct page *page)
+{
+        struct page_cgroup *pc;
+        struct mem_cgroup_per_zone *mz;
+        if (mem_cgroup_disabled())
+                return NULL;
+        pc = lookup_page_cgroup(page);
+        mz = page_cgroup_zoneinfo(pc);
+        if (!mz)
+                return NULL;
+        return &mz->reclaim_stat;
 }
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -429,95 +588,281 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
        mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
        src = &mz->lists[lru];
-        spin_lock(&mz->lru_lock);
        scan = 0;
        list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
                if (scan >= nr_to_scan)
                        break;
+                page = pc->page;
                if (unlikely(!PageCgroupUsed(pc)))
                        continue;
-                page = pc->page;
                if (unlikely(!PageLRU(page)))
                        continue;
-                /*
-                 * TODO: play better with lumpy reclaim, grabbing anything.
-                 */
-                if (PageUnevictable(page) ||
-                    (PageActive(page) && !active) ||
-                    (!PageActive(page) && active)) {
-                        __mem_cgroup_move_lists(pc, page_lru(page));
-                        continue;
-                }
                scan++;
-                list_move(&pc->lru, &pc_list);
                if (__isolate_lru_page(page, mode, file) == 0) {
                        list_move(&page->lru, dst);
                        nr_taken++;
                }
        }
-        list_splice(&pc_list, src);
-        spin_unlock(&mz->lru_lock);
        *scanned = scan;
        return nr_taken;
 }
+#define mem_cgroup_from_res_counter(counter, member)    \
+        container_of(counter, struct mem_cgroup, member)
 /*
- * Charge the memory controller for page usage.
+ * This routine finds the DFS walk successor. This routine should be
- * Return
+ * called with hierarchy_mutex held
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
 */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
+static struct mem_cgroup *
-                                gfp_t gfp_mask, enum charge_type ctype,
+mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
-                                struct mem_cgroup *memcg)
 {
+        struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
+        curr_cgroup = curr->css.cgroup;
+        root_cgroup = root_mem->css.cgroup;
+        if (!list_empty(&curr_cgroup->children)) {
+                /*
+                 * Walk down to children
+                 */
+                mem_cgroup_put(curr);
+                cgroup = list_entry(curr_cgroup->children.next,
+                                                struct cgroup, sibling);
+                curr = mem_cgroup_from_cont(cgroup);
+                mem_cgroup_get(curr);
+                goto done;
+        }
+visit_parent:
+        if (curr_cgroup == root_cgroup) {
+                mem_cgroup_put(curr);
+                curr = root_mem;
+                mem_cgroup_get(curr);
+                goto done;
+        }
+        /*
+         * Goto next sibling
+         */
+        if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
+                mem_cgroup_put(curr);
+                cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
+                                                sibling);
+                curr = mem_cgroup_from_cont(cgroup);
+                mem_cgroup_get(curr);
+                goto done;
+        }
+        /*
+         * Go up to next parent and next parent's sibling if need be
+         */
+        curr_cgroup = curr_cgroup->parent;
+        goto visit_parent;
+done:
+        root_mem->last_scanned_child = curr;
+        return curr;
+}
+/*
+ * Visit the first child (need not be the first child as per the ordering
+ * of the cgroup list, since we track last_scanned_child) of @mem and use
+ * that to reclaim free pages from.
+ */
+static struct mem_cgroup *
+mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
+{
+        struct cgroup *cgroup;
+        struct mem_cgroup *ret;
+        bool obsolete;
+        obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child);
+        /*
+         * Scan all children under the mem_cgroup mem
+         */
+        mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
+        if (list_empty(&root_mem->css.cgroup->children)) {
+                ret = root_mem;
+                goto done;
+        }
+        if (!root_mem->last_scanned_child || obsolete) {
+                if (obsolete && root_mem->last_scanned_child)
+                        mem_cgroup_put(root_mem->last_scanned_child);
+                cgroup = list_first_entry(&root_mem->css.cgroup->children,
+                                struct cgroup, sibling);
+                ret = mem_cgroup_from_cont(cgroup);
+                mem_cgroup_get(ret);
+        } else
+                ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
+                                                root_mem);
+done:
+        root_mem->last_scanned_child = ret;
+        mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
+        return ret;
+}
+static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
+{
+        if (do_swap_account) {
+                if (res_counter_check_under_limit(&mem->res) &&
+                        res_counter_check_under_limit(&mem->memsw))
+                        return true;
+        } else
+                if (res_counter_check_under_limit(&mem->res))
+                        return true;
+        return false;
+}
+static unsigned int get_swappiness(struct mem_cgroup *memcg)
+{
+        struct cgroup *cgrp = memcg->css.cgroup;
+        unsigned int swappiness;
+        /* root ? */
+        if (cgrp->parent == NULL)
+                return vm_swappiness;
+        spin_lock(&memcg->reclaim_param_lock);
+        swappiness = memcg->swappiness;
+        spin_unlock(&memcg->reclaim_param_lock);
+        return swappiness;
+}
+/*
+ * Dance down the hierarchy if needed to reclaim memory. We remember the
+ * last child we reclaimed from, so that we don't end up penalizing
+ * one child extensively based on its position in the children list.
+ *
+ * root_mem is the original ancestor that we've been reclaiming from.
+ */
+static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+                                                gfp_t gfp_mask, bool noswap)
+{
+        struct mem_cgroup *next_mem;
+        int ret = 0;
+        /*
+         * Reclaim unconditionally and don't check for return value.
+         * We need to reclaim in the current group and down the tree.
+         * One might think about checking for children before reclaiming,
+         * but there might be left over accounting, even after children
+         * have left.
+         */
+        ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
+                                           get_swappiness(root_mem));
+        if (mem_cgroup_check_under_limit(root_mem))
+                return 0;
+        if (!root_mem->use_hierarchy)
+                return ret;
+        next_mem = mem_cgroup_get_first_node(root_mem);
+        while (next_mem != root_mem) {
+                if (mem_cgroup_is_obsolete(next_mem)) {
+                        mem_cgroup_put(next_mem);
+                        next_mem = mem_cgroup_get_first_node(root_mem);
+                        continue;
+                }
+                ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
+                                                   get_swappiness(next_mem));
+                if (mem_cgroup_check_under_limit(root_mem))
+                        return 0;
+                mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
+                next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
+                mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
+        }
+        return ret;
+}
+bool mem_cgroup_oom_called(struct task_struct *task)
+{
+        bool ret = false;
        struct mem_cgroup *mem;
-        struct page_cgroup *pc;
+        struct mm_struct *mm;
-        unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-        struct mem_cgroup_per_zone *mz;
-        unsigned long flags;
-        pc = lookup_page_cgroup(page);
+        rcu_read_lock();
-        /* can happen at boot */
+        mm = task->mm;
-        if (unlikely(!pc))
+        if (!mm)
+                mm = &init_mm;
+        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+        if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
+                ret = true;
+        rcu_read_unlock();
+        return ret;
+}
+/*
+ * Unlike the exported interface, an "oom" parameter is added. If oom==true,
+ * the oom-killer can be invoked.
+ */
+static int __mem_cgroup_try_charge(struct mm_struct *mm,
+                        gfp_t gfp_mask, struct mem_cgroup **memcg,
+                        bool oom)
+{
+        struct mem_cgroup *mem, *mem_over_limit;
+        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+        struct res_counter *fail_res;
+        if (unlikely(test_thread_flag(TIF_MEMDIE))) {
+                /* Don't account this! */
+                *memcg = NULL;
                return 0;
-        prefetchw(pc);
+        }
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
+        mem = *memcg;
-        if (likely(!memcg)) {
+        if (likely(!mem)) {
-                rcu_read_lock();
+                mem = try_get_mem_cgroup_from_mm(mm);
-                mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+                *memcg = mem;
-                if (unlikely(!mem)) {
-                        rcu_read_unlock();
-                        return 0;
-                }
-                /*
-                 * For every charge from the cgroup, increment reference count
-                 */
-                css_get(&mem->css);
-                rcu_read_unlock();
        } else {
-                mem = memcg;
+                css_get(&mem->css);
-                css_get(&memcg->css);
        }
+        if (unlikely(!mem))
+                return 0;
+        VM_BUG_ON(mem_cgroup_is_obsolete(mem));
+        while (1) {
+                int ret;
+                bool noswap = false;
+                ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+                if (likely(!ret)) {
+                        if (!do_swap_account)
+                                break;
+                        ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
+                                                        &fail_res);
+                        if (likely(!ret))
+                                break;
+                        /* mem+swap counter fails */
+                        res_counter_uncharge(&mem->res, PAGE_SIZE);
+                        noswap = true;
+                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+                                                                        memsw);
+                } else
+                        /* mem counter fails */
+                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+                                                                        res);
-        while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
                if (!(gfp_mask & __GFP_WAIT))
-                        goto out;
+                        goto nomem;
-                if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+                ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-                        continue;
+                                                        noswap);
                /*
                 * try_to_free_mem_cgroup_pages() might not give us a full
@@ -525,49 +870,214 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                 * moved to swap cache or just unmapped from the cgroup.
                 * Check the limit again to see if the reclaim reduced the
                 * current usage of the cgroup before giving up
+                 *
                 */
-                if (res_counter_check_under_limit(&mem->res))
+                if (mem_cgroup_check_under_limit(mem_over_limit))
                        continue;
                if (!nr_retries--) {
-                        mem_cgroup_out_of_memory(mem, gfp_mask);
+                        if (oom) {
-                        goto out;
+                                mutex_lock(&memcg_tasklist);
+                                mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
+                                mutex_unlock(&memcg_tasklist);
+                                mem_over_limit->last_oom_jiffies = jiffies;
+                        }
+                        goto nomem;
                }
        }
+        return 0;
+nomem:
+        css_put(&mem->css);
+        return -ENOMEM;
+}
+static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
+{
+        struct mem_cgroup *mem;
+        swp_entry_t ent;
+        if (!PageSwapCache(page))
+                return NULL;
+        ent.val = page_private(page);
+        mem = lookup_swap_cgroup(ent);
+        if (!mem)
+                return NULL;
+        if (!css_tryget(&mem->css))
+                return NULL;
+        return mem;
+}
+/*
+ * Commit a charge obtained by __mem_cgroup_try_charge() and mark the
+ * page_cgroup as USED. If it is already USED, uncharge and return.
+ */
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+                                     struct page_cgroup *pc,
+                                     enum charge_type ctype)
+{
+        /* try_charge() can return NULL to *memcg, taking care of it. */
+        if (!mem)
+                return;
        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
                unlock_page_cgroup(pc);
                res_counter_uncharge(&mem->res, PAGE_SIZE);
+                if (do_swap_account)
+                        res_counter_uncharge(&mem->memsw, PAGE_SIZE);
                css_put(&mem->css);
+                return;
-                goto done;
        }
        pc->mem_cgroup = mem;
-        /*
+        smp_wmb();
-         * If a page is accounted as a page cache, insert to inactive list.
-         * If anon, insert to active list.
-         */
        pc->flags = pcg_default_flags[ctype];
-        mz = page_cgroup_zoneinfo(pc);
+        mem_cgroup_charge_statistics(mem, pc, true);
-        spin_lock_irqsave(&mz->lru_lock, flags);
-        __mem_cgroup_add_list(mz, pc);
-        spin_unlock_irqrestore(&mz->lru_lock, flags);
        unlock_page_cgroup(pc);
+}
-done:
+/**
-        return 0;
+ * mem_cgroup_move_account - move account of the page
+ * @pc: page_cgroup of the page.
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm the following.
+ * - page is not on LRU (isolate_page() is useful.)
+ *
+ * returns 0 at success,
+ * returns -EBUSY when lock is busy or "pc" is unstable.
+ *
+ * This function does "uncharge" from old cgroup but doesn't do "charge" to
+ * new cgroup. It should be done by a caller.
+ */
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+        struct mem_cgroup *from, struct mem_cgroup *to)
+{
+        struct mem_cgroup_per_zone *from_mz, *to_mz;
+        int nid, zid;
+        int ret = -EBUSY;
+        VM_BUG_ON(from == to);
+        VM_BUG_ON(PageLRU(pc->page));
+        nid = page_cgroup_nid(pc);
+        zid = page_cgroup_zid(pc);
+        from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
+        to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
+        if (!trylock_page_cgroup(pc))
+                return ret;
+        if (!PageCgroupUsed(pc))
+                goto out;
+        if (pc->mem_cgroup != from)
+                goto out;
+        css_put(&from->css);
+        res_counter_uncharge(&from->res, PAGE_SIZE);
+        mem_cgroup_charge_statistics(from, pc, false);
+        if (do_swap_account)
+                res_counter_uncharge(&from->memsw, PAGE_SIZE);
+        pc->mem_cgroup = to;
+        mem_cgroup_charge_statistics(to, pc, true);
+        css_get(&to->css);
+        ret = 0;
 out:
-        css_put(&mem->css);
+        unlock_page_cgroup(pc);
-        return -ENOMEM;
+        return ret;
+}
+/*
+ * move charges to its parent.
+ */
+static int mem_cgroup_move_parent(struct page_cgroup *pc,
+                                  struct mem_cgroup *child,
+                                  gfp_t gfp_mask)
+{
+        struct page *page = pc->page;
+        struct cgroup *cg = child->css.cgroup;
+        struct cgroup *pcg = cg->parent;
+        struct mem_cgroup *parent;
+        int ret;
+        /* Is ROOT ? */
+        if (!pcg)
+                return -EINVAL;
+        parent = mem_cgroup_from_cont(pcg);
+        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+        if (ret || !parent)
+                return ret;
+        if (!get_page_unless_zero(page))
+                return -EBUSY;
+        ret = isolate_lru_page(page);
+        if (ret)
+                goto cancel;
+        ret = mem_cgroup_move_account(pc, child, parent);
+        /* drop extra refcnt from try_charge() (move_account increments one) */
+        css_put(&parent->css);
+        putback_lru_page(page);
+        if (!ret) {
+                put_page(page);
+                return 0;
+        }
+        /* uncharge if move fails */
+cancel:
+        res_counter_uncharge(&parent->res, PAGE_SIZE);
+        if (do_swap_account)
+                res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+        put_page(page);
+        return ret;
+}
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
+                                gfp_t gfp_mask, enum charge_type ctype,
+                                struct mem_cgroup *memcg)
+{
+        struct mem_cgroup *mem;
+        struct page_cgroup *pc;
+        int ret;
+        pc = lookup_page_cgroup(page);
+        /* can happen at boot */
+        if (unlikely(!pc))
+                return 0;
+        prefetchw(pc);
+        mem = memcg;
+        ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+        if (ret || !mem)
+                return ret;
+        __mem_cgroup_commit_charge(mem, pc, ctype);
+        return 0;
 }
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+int mem_cgroup_newpage_charge(struct page *page,
+                              struct mm_struct *mm, gfp_t gfp_mask)
 {
-        if (mem_cgroup_subsys.disabled)
+        if (mem_cgroup_disabled())
                return 0;
        if (PageCompound(page))
                return 0;
@@ -589,7 +1099,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask)
 {
-        if (mem_cgroup_subsys.disabled)
+        struct mem_cgroup *mem = NULL;
+        int ret;
+        if (mem_cgroup_disabled())
                return 0;
        if (PageCompound(page))
                return 0;
@@ -601,6 +1114,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
         * For GFP_NOWAIT case, the page may be pre-charged before calling
         * add_to_page_cache(). (See shmem.c) check it here and avoid to call
         * charge twice. (It works but has to pay a bit larger cost.)
+         * And when the page is SwapCache, it should take swap information
+         * into account. This is under lock_page() now.
         */
        if (!(gfp_mask & __GFP_WAIT)) {
                struct page_cgroup *pc;
@@ -617,58 +1132,198 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                unlock_page_cgroup(pc);
        }
-        if (unlikely(!mm))
+        if (do_swap_account && PageSwapCache(page)) {
+                mem = try_get_mem_cgroup_from_swapcache(page);
+                if (mem)
+                        mm = NULL;
+                  else
+                        mem = NULL;
+                /* SwapCache may be still linked to LRU now. */
+                mem_cgroup_lru_del_before_commit_swapcache(page);
+        }
+        if (unlikely(!mm && !mem))
                mm = &init_mm;
        if (page_is_file_cache(page))
                return mem_cgroup_charge_common(page, mm, gfp_mask,
                                MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
-        else
-                return mem_cgroup_charge_common(page, mm, gfp_mask,
+        ret = mem_cgroup_charge_common(page, mm, gfp_mask,
-                                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
+                                MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+        if (mem)
+                css_put(&mem->css);
+        if (PageSwapCache(page))
+                mem_cgroup_lru_add_after_commit_swapcache(page);
+        if (do_swap_account && !ret && PageSwapCache(page)) {
+                swp_entry_t ent = {.val = page_private(page)};
+                /* avoid double counting */
+                mem = swap_cgroup_record(ent, NULL);
+                if (mem) {
+                        res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+                        mem_cgroup_put(mem);
+                }
+        }
+        return ret;
+}
+/*
+ * While swap-in, try_charge -> commit or cancel, the page is locked.
+ * And when try_charge() successfully returns, one refcnt to memcg without
+ * struct page_cgroup is acquired. This refcnt will be consumed by
+ * "commit()" or removed by "cancel()"
+ */
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+                                 struct page *page,
+                                 gfp_t mask, struct mem_cgroup **ptr)
+{
+        struct mem_cgroup *mem;
+        int ret;
+        if (mem_cgroup_disabled())
+                return 0;
+        if (!do_swap_account)
+                goto charge_cur_mm;
+        /*
+         * A racing thread's fault, or swapoff, may have already updated
+         * the pte, and even removed page from swap cache: return success
+         * to go on to do_swap_page()'s pte_same() test, which should fail.
+         */
+        if (!PageSwapCache(page))
+                return 0;
+        mem = try_get_mem_cgroup_from_swapcache(page);
+        if (!mem)
+                goto charge_cur_mm;
+        *ptr = mem;
+        ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+        /* drop extra refcnt from tryget */
+        css_put(&mem->css);
+        return ret;
+charge_cur_mm:
+        if (unlikely(!mm))
+                mm = &init_mm;
+        return __mem_cgroup_try_charge(mm, mask, ptr, true);
+}
+void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
+{
+        struct page_cgroup *pc;
+        if (mem_cgroup_disabled())
+                return;
+        if (!ptr)
+                return;
+        pc = lookup_page_cgroup(page);
+        mem_cgroup_lru_del_before_commit_swapcache(page);
+        __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+        mem_cgroup_lru_add_after_commit_swapcache(page);
+        /*
+         * Now swap is on-memory. This means this page may be
+         * counted both as mem and swap....double count.
+         * Fix it by uncharging from memsw. Basically, this SwapCache is stable
+         * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
+         * may call delete_from_swap_cache() before reach here.
+         */
+        if (do_swap_account && PageSwapCache(page)) {
+                swp_entry_t ent = {.val = page_private(page)};
+                struct mem_cgroup *memcg;
+                memcg = swap_cgroup_record(ent, NULL);
+                if (memcg) {
+                        res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+                        mem_cgroup_put(memcg);
+                }
+        }
+        /* add this page(page_cgroup) to the LRU we want. */
 }
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
+{
+        if (mem_cgroup_disabled())
+                return;
+        if (!mem)
+                return;
+        res_counter_uncharge(&mem->res, PAGE_SIZE);
+        if (do_swap_account)
+                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+        css_put(&mem->css);
+}
 /*
 * uncharge if !page_mapped(page)
 */
-static void
+static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
        struct page_cgroup *pc;
-        struct mem_cgroup *mem;
+        struct mem_cgroup *mem = NULL;
        struct mem_cgroup_per_zone *mz;
-        unsigned long flags;
-        if (mem_cgroup_subsys.disabled)
+        if (mem_cgroup_disabled())
-                return;
+                return NULL;
+        if (PageSwapCache(page))
+                return NULL;
        /*
         * Check if our page_cgroup is valid
         */
        pc = lookup_page_cgroup(page);
        if (unlikely(!pc || !PageCgroupUsed(pc)))
-                return;
+                return NULL;
        lock_page_cgroup(pc);
-        if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
-             || !PageCgroupUsed(pc)) {
+        mem = pc->mem_cgroup;
-                /* This happens at race in zap_pte_range() and do_swap_page()*/
-                unlock_page_cgroup(pc);
+        if (!PageCgroupUsed(pc))
-                return;
+                goto unlock_out;
+        switch (ctype) {
+        case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+                if (page_mapped(page))
+                        goto unlock_out;
+                break;
+        case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
+                if (!PageAnon(page)) {  /* Shared memory */
+                        if (page->mapping && !page_is_file_cache(page))
+                                goto unlock_out;
+                } else if (page_mapped(page)) /* Anon */
+                                goto unlock_out;
+                break;
+        default:
+                break;
        }
+        res_counter_uncharge(&mem->res, PAGE_SIZE);
+        if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+        mem_cgroup_charge_statistics(mem, pc, false);
        ClearPageCgroupUsed(pc);
-        mem = pc->mem_cgroup;
+        /*
+         * pc->mem_cgroup is not cleared here. It will be accessed when it's
+         * freed from LRU. This is safe because uncharged page is expected not
+         * to be reused (freed soon). Exception is SwapCache, it's handled by
+         * special functions.
+         */
        mz = page_cgroup_zoneinfo(pc);
-        spin_lock_irqsave(&mz->lru_lock, flags);
-        __mem_cgroup_remove_list(mz, pc);
-        spin_unlock_irqrestore(&mz->lru_lock, flags);
        unlock_page_cgroup(pc);
-        res_counter_uncharge(&mem->res, PAGE_SIZE);
+        /* at swapout, this memcg will be accessed to record to swap */
-        css_put(&mem->css);
+        if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+                css_put(&mem->css);
-        return;
+        return mem;
+unlock_out:
+        unlock_page_cgroup(pc);
+        return NULL;
 }
 void mem_cgroup_uncharge_page(struct page *page)
@@ -689,16 +1344,55 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 }
 /*
- * Before starting migration, account against new page.
+ * called from __delete_from_swap_cache() and drop "page" account.
+ * memcg information is recorded to swap_cgroup of "ent"
+ */
+void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+{
+        struct mem_cgroup *memcg;
+        memcg = __mem_cgroup_uncharge_common(page,
+                                        MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+        /* record memcg information */
+        if (do_swap_account && memcg) {
+                swap_cgroup_record(ent, memcg);
+                mem_cgroup_get(memcg);
+        }
+        if (memcg)
+                css_put(&memcg->css);
+}
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * called from swap_entry_free(). remove record in swap_cgroup and
+ * uncharge "memsw" account.
 */
-int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
+void mem_cgroup_uncharge_swap(swp_entry_t ent)
+{
+        struct mem_cgroup *memcg;
+        if (!do_swap_account)
+                return;
+        memcg = swap_cgroup_record(ent, NULL);
+        if (memcg) {
+                res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+                mem_cgroup_put(memcg);
+        }
+}
+#endif
+/*
+ * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
+ * page belongs to.
+ */
+int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 {
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;
-        enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
        int ret = 0;
-        if (mem_cgroup_subsys.disabled)
+        if (mem_cgroup_disabled())
                return 0;
        pc = lookup_page_cgroup(page);
@@ -706,41 +1400,67 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
        if (PageCgroupUsed(pc)) {
                mem = pc->mem_cgroup;
                css_get(&mem->css);
-                if (PageCgroupCache(pc)) {
-                        if (page_is_file_cache(page))
-                                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
-                        else
-                                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-                }
        }
        unlock_page_cgroup(pc);
        if (mem) {
-                ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
+                ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
-                        ctype, mem);
                css_put(&mem->css);
        }
+        *ptr = mem;
        return ret;
 }
 /* remove redundant charge if migration failed*/
-void mem_cgroup_end_migration(struct page *newpage)
+void mem_cgroup_end_migration(struct mem_cgroup *mem,
+                struct page *oldpage, struct page *newpage)
 {
+        struct page *target, *unused;
+        struct page_cgroup *pc;
+        enum charge_type ctype;
+        if (!mem)
+                return;
+        /* at migration success, oldpage->mapping is NULL. */
+        if (oldpage->mapping) {
+                target = oldpage;
+                unused = NULL;
+        } else {
+                target = newpage;
+                unused = oldpage;
+        }
+        if (PageAnon(target))
+                ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
+        else if (page_is_file_cache(target))
+                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+        else
+                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+        /* unused page is not on radix-tree now. */
+        if (unused)
+                __mem_cgroup_uncharge_common(unused, ctype);
+        pc = lookup_page_cgroup(target);
        /*
-         * At success, page->mapping is not NULL.
+         * __mem_cgroup_commit_charge() checks the PCG_USED bit of page_cgroup.
-         * special rollback care is necessary when
+         * So, double-counting is effectively avoided.
-         * 1. at migration failure. (newpage->mapping is cleared in this case)
-         * 2. the newpage was moved but not remapped again because the task
-         *    exits and the newpage is obsolete. In this case, the new page
-         *    may be a swapcache. So, we just call mem_cgroup_uncharge_page()
-         *    always for avoiding mess. The  page_cgroup will be removed if
-         *    unnecessary. File cache pages is still on radix-tree. Don't
-         *    care it.
         */
-        if (!newpage->mapping)
+        __mem_cgroup_commit_charge(mem, pc, ctype);
-                __mem_cgroup_uncharge_common(newpage,
-                                MEM_CGROUP_CHARGE_TYPE_FORCE);
+        /*
-        else if (PageAnon(newpage))
+         * Both of oldpage and newpage are still under lock_page().
-                mem_cgroup_uncharge_page(newpage);
+         * Then, we don't have to care about race in radix-tree.
+         * But we have to be careful about whether this page is still mapped.
+         *
+         * There is a case for !page_mapped(). At the start of
+         * migration, oldpage was mapped. But now, it's zapped.
+         * But we know *target* page is not freed/reused under us.
+         * mem_cgroup_uncharge_page() does all necessary checks.
+         */
+        if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+                mem_cgroup_uncharge_page(target);
 }
 /*
@@ -748,29 +1468,26 @@ void mem_cgroup_end_migration(struct page *newpage)
 * This is typically used for page reclaiming for shmem for reducing side
 * effect of page allocation from shmem, which is used by some mem_cgroup.
 */
-int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
+int mem_cgroup_shrink_usage(struct page *page,
+                            struct mm_struct *mm,
+                            gfp_t gfp_mask)
 {
-        struct mem_cgroup *mem;
+        struct mem_cgroup *mem = NULL;
        int progress = 0;
        int retry = MEM_CGROUP_RECLAIM_RETRIES;
-        if (mem_cgroup_subsys.disabled)
+        if (mem_cgroup_disabled())
                return 0;
-        if (!mm)
+        if (page)
+                mem = try_get_mem_cgroup_from_swapcache(page);
+        if (!mem && mm)
+                mem = try_get_mem_cgroup_from_mm(mm);
+        if (unlikely(!mem))
                return 0;
-        rcu_read_lock();
-        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-        if (unlikely(!mem)) {
-                rcu_read_unlock();
-                return 0;
-        }
-        css_get(&mem->css);
-        rcu_read_unlock();
        do {
-                progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+                progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true);
-                progress += res_counter_check_under_limit(&mem->res);
+                progress += mem_cgroup_check_under_limit(mem);
        } while (!progress && --retry);
        css_put(&mem->css);
@@ -779,117 +1496,295 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
        return 0;
 }
+static DEFINE_MUTEX(set_limit_mutex);
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
-                                   unsigned long long val)
+                                unsigned long long val)
 {
        int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
        int progress;
+        u64 memswlimit;
        int ret = 0;
-        while (res_counter_set_limit(&memcg->res, val)) {
+        while (retry_count) {
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
-                if (!retry_count) {
+                /*
-                        ret = -EBUSY;
+                 * Rather than hide all in some function, I do this in
+                 * open coded manner. You see what this really does.
+                 * We have to guarantee mem->res.limit <= mem->memsw.limit.
+                 */
+                mutex_lock(&set_limit_mutex);
+                memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+                if (memswlimit < val) {
+                        ret = -EINVAL;
+                        mutex_unlock(&set_limit_mutex);
                        break;
                }
-                progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
+                ret = res_counter_set_limit(&memcg->res, val);
-                if (!progress)
+                mutex_unlock(&set_limit_mutex);
-                        retry_count--;
+                if (!ret)
+                        break;
+                progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
+                                                           false);
+                if (!progress)                  retry_count--;
        }
        return ret;
 }
+int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
+                                unsigned long long val)
+{
+        int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
+        u64 memlimit, oldusage, curusage;
+        int ret;
+        if (!do_swap_account)
+                return -EINVAL;
+        while (retry_count) {
+                if (signal_pending(current)) {
+                        ret = -EINTR;
+                        break;
+                }
+                /*
+                 * Rather than hide all in some function, I do this in
+                 * open coded manner. You see what this really does.
+                 * We have to guarantee mem->res.limit <= mem->memsw.limit.
+                 */
+                mutex_lock(&set_limit_mutex);
+                memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+                if (memlimit > val) {
+                        ret = -EINVAL;
+                        mutex_unlock(&set_limit_mutex);
+                        break;
+                }
+                ret = res_counter_set_limit(&memcg->memsw, val);
+                mutex_unlock(&set_limit_mutex);
+                if (!ret)
+                        break;
+                oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+                mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
+                curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+                if (curusage >= oldusage)
+                        retry_count--;
+        }
+        return ret;
+}
 /*
 * This routine traverse page_cgroup in given list and drop them all.
 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
 */
-#define FORCE_UNCHARGE_BATCH    (128)
+static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
-static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+                                int node, int zid, enum lru_list lru)
-                            struct mem_cgroup_per_zone *mz,
-                            enum lru_list lru)
 {
-        struct page_cgroup *pc;
+        struct zone *zone;
-        struct page *page;
+        struct mem_cgroup_per_zone *mz;
-        int count = FORCE_UNCHARGE_BATCH;
+        struct page_cgroup *pc, *busy;
-        unsigned long flags;
+        unsigned long flags, loop;
        struct list_head *list;
+        int ret = 0;
+        zone = &NODE_DATA(node)->node_zones[zid];
+        mz = mem_cgroup_zoneinfo(mem, node, zid);
        list = &mz->lists[lru];
-        spin_lock_irqsave(&mz->lru_lock, flags);
+        loop = MEM_CGROUP_ZSTAT(mz, lru);
-        while (!list_empty(list)) {
+        /* give some margin against EBUSY etc...*/
-                pc = list_entry(list->prev, struct page_cgroup, lru);
+        loop += 256;
-                page = pc->page;
+        busy = NULL;
-                if (!PageCgroupUsed(pc))
+        while (loop--) {
-                        break;
+                ret = 0;
-                get_page(page);
+                spin_lock_irqsave(&zone->lru_lock, flags);
-                spin_unlock_irqrestore(&mz->lru_lock, flags);
+                if (list_empty(list)) {
-                /*
+                        spin_unlock_irqrestore(&zone->lru_lock, flags);
-                 * Check if this page is on LRU. !LRU page can be found
-                 * if it's under page migration.
-                 */
-                if (PageLRU(page)) {
-                        __mem_cgroup_uncharge_common(page,
-                                        MEM_CGROUP_CHARGE_TYPE_FORCE);
-                        put_page(page);
-                        if (--count <= 0) {
-                                count = FORCE_UNCHARGE_BATCH;
-                                cond_resched();
-                        }
-                } else {
-                        spin_lock_irqsave(&mz->lru_lock, flags);
                        break;
                }
-                spin_lock_irqsave(&mz->lru_lock, flags);
+                pc = list_entry(list->prev, struct page_cgroup, lru);
+                if (busy == pc) {
+                        list_move(&pc->lru, list);
+                        busy = 0;
+                        spin_unlock_irqrestore(&zone->lru_lock, flags);
+                        continue;
+                }
+                spin_unlock_irqrestore(&zone->lru_lock, flags);
+                ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
+                if (ret == -ENOMEM)
+                        break;
+                if (ret == -EBUSY || ret == -EINVAL) {
+                        /* found lock contention or "pc" is obsolete. */
+                        busy = pc;
+                        cond_resched();
+                } else
+                        busy = NULL;
        }
-        spin_unlock_irqrestore(&mz->lru_lock, flags);
+        if (!ret && !list_empty(list))
+                return -EBUSY;
+        return ret;
 }
 /*
 * make mem_cgroup's charge to be 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
-static int mem_cgroup_force_empty(struct mem_cgroup *mem)
+static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
 {
-        int ret = -EBUSY;
+        int ret;
-        int node, zid;
+        int node, zid, shrink;
+        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+        struct cgroup *cgrp = mem->css.cgroup;
        css_get(&mem->css);
-        /*
-         * page reclaim code (kswapd etc..) will move pages between
+        shrink = 0;
-         * active_list <-> inactive_list while we don't take a lock.
+        /* should free all ? */
-         * So, we have to do loop here until all lists are empty.
+        if (free_all)
-         */
+                goto try_to_free;
+move_account:
        while (mem->res.usage > 0) {
-                if (atomic_read(&mem->css.cgroup->count) > 0)
+                ret = -EBUSY;
+                if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+                        goto out;
+                ret = -EINTR;
+                if (signal_pending(current))
                        goto out;
                /* This is for making all *used* pages to be on LRU. */
                lru_add_drain_all();
-                for_each_node_state(node, N_POSSIBLE)
+                ret = 0;
-                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                for_each_node_state(node, N_POSSIBLE) {
-                                struct mem_cgroup_per_zone *mz;
+                        for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
                                enum lru_list l;
-                                mz = mem_cgroup_zoneinfo(mem, node, zid);
+                                for_each_lru(l) {
-                                for_each_lru(l)
+                                        ret = mem_cgroup_force_empty_list(mem,
-                                        mem_cgroup_force_empty_list(mem, mz, l);
+                                                        node, zid, l);
+                                        if (ret)
+                                                break;
+                                }
                        }
+                        if (ret)
+                                break;
+                }
+                /* it seems parent cgroup doesn't have enough mem */
+                if (ret == -ENOMEM)
+                        goto try_to_free;
                cond_resched();
        }
        ret = 0;
 out:
        css_put(&mem->css);
        return ret;
+try_to_free:
+        /* returns EBUSY if there is a task or if we come here twice. */
+        if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
+                ret = -EBUSY;
+                goto out;
+        }
+        /* we call try-to-free pages to make this cgroup empty */
+        lru_add_drain_all();
+        /* try to free all pages in this cgroup */
+        shrink = 1;
+        while (nr_retries && mem->res.usage > 0) {
+                int progress;
+                if (signal_pending(current)) {
+                        ret = -EINTR;
+                        goto out;
+                }
+                progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
+                                                false, get_swappiness(mem));
+                if (!progress) {
+                        nr_retries--;
+                        /* maybe some writeback is necessary */
+                        congestion_wait(WRITE, HZ/10);
+                }
+        }
+        lru_add_drain();
+        /* try move_account...there may be some *locked* pages. */
+        if (mem->res.usage)
+                goto move_account;
+        ret = 0;
+        goto out;
+}
+int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
+{
+        return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
+}
+static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
+{
+        return mem_cgroup_from_cont(cont)->use_hierarchy;
+}
+static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
+                                        u64 val)
+{
+        int retval = 0;
+        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+        struct cgroup *parent = cont->parent;
+        struct mem_cgroup *parent_mem = NULL;
+        if (parent)
+                parent_mem = mem_cgroup_from_cont(parent);
+        cgroup_lock();
+        /*
+         * If parent's use_hierarchy is set, we can't make any modifications
+         * in the child subtrees. If it is unset, then the change can
+         * occur, provided the current cgroup has no children.
+         *
+         * For the root cgroup, parent_mem is NULL, we allow value to be
+         * set if there are no children.
+         */
+        if ((!parent_mem || !parent_mem->use_hierarchy) &&
+                                (val == 1 || val == 0)) {
+                if (list_empty(&cont->children))
+                        mem->use_hierarchy = val;
+                else
+                        retval = -EBUSY;
+        } else
+                retval = -EINVAL;
+        cgroup_unlock();
+        return retval;
 }
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-        return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
+        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-                                    cft->private);
+        u64 val = 0;
+        int type, name;
+        type = MEMFILE_TYPE(cft->private);
+        name = MEMFILE_ATTR(cft->private);
+        switch (type) {
+        case _MEM:
+                val = res_counter_read_u64(&mem->res, name);
+                break;
+        case _MEMSWAP:
+                if (do_swap_account)
+                        val = res_counter_read_u64(&mem->memsw, name);
+                break;
+        default:
+                BUG();
+                break;
+        }
+        return val;
 }
 /*
 * The user of this function is...
@@ -899,15 +1794,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                            const char *buffer)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+        int type, name;
        unsigned long long val;
        int ret;
-        switch (cft->private) {
+        type = MEMFILE_TYPE(cft->private);
+        name = MEMFILE_ATTR(cft->private);
+        switch (name) {
        case RES_LIMIT:
                /* This function does all necessary parse...reuse it */
                ret = res_counter_memparse_write_strategy(buffer, &val);
-                if (!ret)
+                if (ret)
+                        break;
+                if (type == _MEM)
                        ret = mem_cgroup_resize_limit(memcg, val);
+                else
+                        ret = mem_cgroup_resize_memsw_limit(memcg, val);
                break;
        default:
                ret = -EINVAL; /* should be BUG() ? */
@@ -916,27 +1818,59 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
        return ret;
 }
+/*
+ * Report the smallest memory and mem+swap limits that apply to @memcg.
+ *
+ * Without use_hierarchy these are simply @memcg's own limits. With it,
+ * ancestors are visited upwards, stopping at the first one that does not
+ * use hierarchy, and the minimum of each limit seen is kept.
+ */
+static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
+                unsigned long long *mem_limit, unsigned long long *memsw_limit)
+{
+        struct cgroup *cg = memcg->css.cgroup;
+        unsigned long long lowest_mem, lowest_memsw, cur;
+        lowest_mem = res_counter_read_u64(&memcg->res, RES_LIMIT);
+        lowest_memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+        if (memcg->use_hierarchy) {
+                for (cg = cg->parent; cg; cg = cg->parent) {
+                        memcg = mem_cgroup_from_cont(cg);
+                        if (!memcg->use_hierarchy)
+                                break;
+                        cur = res_counter_read_u64(&memcg->res, RES_LIMIT);
+                        lowest_mem = min(lowest_mem, cur);
+                        cur = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+                        lowest_memsw = min(lowest_memsw, cur);
+                }
+        }
+        *mem_limit = lowest_mem;
+        *memsw_limit = lowest_memsw;
+}
+/*
+ * Reset the max_usage or failcnt value of a counter. @event is decoded
+ * with MEMFILE_TYPE()/MEMFILE_ATTR() to pick the counter (->res for _MEM,
+ * ->memsw otherwise) and the attribute; any other attribute is ignored.
+ * Always returns 0.
+ */
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
         struct mem_cgroup *mem;
+        int type, name;
         mem = mem_cgroup_from_cont(cont);
-        switch (event) {
+        type = MEMFILE_TYPE(event);
+        name = MEMFILE_ATTR(event);
+        switch (name) {
         case RES_MAX_USAGE:
-                res_counter_reset_max(&mem->res);
+                if (type == _MEM)
+                        res_counter_reset_max(&mem->res);
+                else
+                        res_counter_reset_max(&mem->memsw);
                 break;
         case RES_FAILCNT:
-                res_counter_reset_failcnt(&mem->res);
+                if (type == _MEM)
+                        res_counter_reset_failcnt(&mem->res);
+                else
+                        res_counter_reset_failcnt(&mem->memsw);
                 break;
         }
         return 0;
 }
-static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
-{
-        return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
-}
 static const struct mem_cgroup_stat_desc {
        const char *msg;
        u64 unit;
@@ -985,43 +1919,163 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
                cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
        }
+        {
+                unsigned long long limit, memsw_limit;
+                memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
+                cb->fill(cb, "hierarchical_memory_limit", limit);
+                if (do_swap_account)
+                        cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
+        }
+#ifdef CONFIG_DEBUG_VM
+        cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
+        {
+                int nid, zid;
+                struct mem_cgroup_per_zone *mz;
+                unsigned long recent_rotated[2] = {0, 0};
+                unsigned long recent_scanned[2] = {0, 0};
+                for_each_online_node(nid)
+                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                                mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+                                recent_rotated[0] +=
+                                        mz->reclaim_stat.recent_rotated[0];
+                                recent_rotated[1] +=
+                                        mz->reclaim_stat.recent_rotated[1];
+                                recent_scanned[0] +=
+                                        mz->reclaim_stat.recent_scanned[0];
+                                recent_scanned[1] +=
+                                        mz->reclaim_stat.recent_scanned[1];
+                        }
+                cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
+                cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
+                cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
+                cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
+        }
+#endif
+        return 0;
+}
+/* Read handler for the "swappiness" file: report get_swappiness() of @cgrp. */
+static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
+{
+        return get_swappiness(mem_cgroup_from_cont(cgrp));
+}
+/*
+ * Write handler for the "swappiness" file.
+ *
+ * Returns -EINVAL if @val is above 100, if @cgrp is the root cgroup, if
+ * the parent uses hierarchy, or if this group uses hierarchy and already
+ * has children. Otherwise stores @val and returns 0.
+ */
+static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
+                                       u64 val)
+{
+        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+        struct mem_cgroup *parent;
+        if (val > 100)
+                return -EINVAL;
+        if (cgrp->parent == NULL)
+                return -EINVAL;
+        parent = mem_cgroup_from_cont(cgrp->parent);
+        /*
+         * use_hierarchy is changed, and the children list is inspected,
+         * under cgroup_lock() by mem_cgroup_hierarchy_write(). Take the
+         * same lock so the check below cannot race with it.
+         */
+        cgroup_lock();
+        /* If under hierarchy, only empty-root can set this value */
+        if ((parent->use_hierarchy) ||
+            (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
+                cgroup_unlock();
+                return -EINVAL;
+        }
+        spin_lock(&memcg->reclaim_param_lock);
+        memcg->swappiness = val;
+        spin_unlock(&memcg->reclaim_param_lock);
+        cgroup_unlock();
         return 0;
 }
+/*
+ * Control files added to a memory cgroup directory by
+ * mem_cgroup_populate(). ->private is built with MEMFILE_PRIVATE() and
+ * decoded again by the read/write/trigger handlers.
+ */
 static struct cftype mem_cgroup_files[] = {
         {
                 .name = "usage_in_bytes",
-                .private = RES_USAGE,
+                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
                 .read_u64 = mem_cgroup_read,
         },
         {
                 .name = "max_usage_in_bytes",
-                .private = RES_MAX_USAGE,
+                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
                 .trigger = mem_cgroup_reset,
                 .read_u64 = mem_cgroup_read,
         },
         {
                 .name = "limit_in_bytes",
-                .private = RES_LIMIT,
+                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
                 .write_string = mem_cgroup_write,
                 .read_u64 = mem_cgroup_read,
         },
         {
                 .name = "failcnt",
-                .private = RES_FAILCNT,
+                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
                 .trigger = mem_cgroup_reset,
                 .read_u64 = mem_cgroup_read,
         },
         {
+                .name = "stat",
+                .read_map = mem_control_stat_show,
+        },
+        {
                 .name = "force_empty",
-                .trigger = mem_force_empty_write,
+                .trigger = mem_cgroup_force_empty_write,
         },
         {
-                .name = "stat",
+                .name = "use_hierarchy",
-                .read_map = mem_control_stat_show,
+                .write_u64 = mem_cgroup_hierarchy_write,
+                .read_u64 = mem_cgroup_hierarchy_read,
+        },
+        {
+                .name = "swappiness",
+                .read_u64 = mem_cgroup_swappiness_read,
+                .write_u64 = mem_cgroup_swappiness_write,
         },
 };
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * mem+swap counterparts of the counter files; they use the same handlers
+ * as the plain memory files but tag ->private with _MEMSWAP.
+ */
+static struct cftype memsw_cgroup_files[] = {
+        {
+                .name = "memsw.usage_in_bytes",
+                .read_u64 = mem_cgroup_read,
+                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+        },
+        {
+                .name = "memsw.max_usage_in_bytes",
+                .read_u64 = mem_cgroup_read,
+                .trigger = mem_cgroup_reset,
+                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+        },
+        {
+                .name = "memsw.limit_in_bytes",
+                .read_u64 = mem_cgroup_read,
+                .write_string = mem_cgroup_write,
+                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+        },
+        {
+                .name = "memsw.failcnt",
+                .read_u64 = mem_cgroup_read,
+                .trigger = mem_cgroup_reset,
+                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+        },
+};
+/*
+ * Add the memsw.* files to @cont's directory. Returns 0 without adding
+ * anything when do_swap_account is off, otherwise the result of
+ * cgroup_add_files().
+ */
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+        if (!do_swap_account)
+                return 0;
+        return cgroup_add_files(cont, ss, memsw_cgroup_files,
+                                ARRAY_SIZE(memsw_cgroup_files));
+}
+#else
+/* Stub for builds without CONFIG_CGROUP_MEM_RES_CTLR_SWAP: nothing to add. */
+static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+        return 0;
+}
+#endif
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
        struct mem_cgroup_per_node *pn;
@@ -1047,7 +2101,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
-                spin_lock_init(&mz->lru_lock);
                for_each_lru(l)
                        INIT_LIST_HEAD(&mz->lists[l]);
        }
@@ -1059,55 +2112,113 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
        kfree(mem->info.nodeinfo[node]);
 }
+/*
+ * Bytes needed for one mem_cgroup: the structure itself followed by one
+ * struct mem_cgroup_stat_cpu for each of the nr_cpu_ids cpus.
+ */
+static int mem_cgroup_size(void)
+{
+        int percpu_bytes;
+        percpu_bytes = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
+        return sizeof(struct mem_cgroup) + percpu_bytes;
+}
+/*
+ * Allocate and zero a mem_cgroup of mem_cgroup_size() bytes. kmalloc() is
+ * used when that is below PAGE_SIZE, vmalloc() otherwise. Returns NULL on
+ * allocation failure.
+ */
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
         struct mem_cgroup *mem;
+        int size = mem_cgroup_size();
-        if (sizeof(*mem) < PAGE_SIZE)
+        if (size < PAGE_SIZE)
-                mem = kmalloc(sizeof(*mem), GFP_KERNEL);
+                mem = kmalloc(size, GFP_KERNEL);
         else
-                mem = vmalloc(sizeof(*mem));
+                mem = vmalloc(size);
         if (mem)
-                memset(mem, 0, sizeof(*mem));
+                memset(mem, 0, size);
         return mem;
 }
-static void mem_cgroup_free(struct mem_cgroup *mem)
+/*
+ * When a mem_cgroup is destroyed, references from swap_cgroup may still
+ * remain (scanning for all of them at force_empty is too costly).
+ *
+ * Instead of clearing every reference at force_empty, the number of
+ * references is tracked and the mem_cgroup is freed only when that count
+ * drops to 0.
+ *
+ * Removal of the cgroup itself succeeds regardless of refs from swap.
+ */
+static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
-        if (sizeof(*mem) < PAGE_SIZE)
+        int node;
+        for_each_node_state(node, N_POSSIBLE)
+                free_mem_cgroup_per_zone_info(mem, node);
+        /* same size test as mem_cgroup_alloc(), to pick the matching free */
+        if (mem_cgroup_size() < PAGE_SIZE)
                 kfree(mem);
         else
                 vfree(mem);
 }
+/* Take a reference on @mem; paired with mem_cgroup_put(). */
+static void mem_cgroup_get(struct mem_cgroup *mem)
+{
+        atomic_inc(&mem->refcnt);
+}
+/* Drop a reference on @mem and free it once the last one is gone. */
+static void mem_cgroup_put(struct mem_cgroup *mem)
+{
+        if (!atomic_dec_and_test(&mem->refcnt))
+                return;
+        __mem_cgroup_free(mem);
+}
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * Switch swap accounting on, unless the memory controller is disabled or
+ * really_do_swap_account has been cleared.
+ */
+static void __init enable_swap_cgroup(void)
+{
+        if (mem_cgroup_disabled())
+                return;
+        if (really_do_swap_account)
+                do_swap_account = 1;
+}
+#else
+/* Swap accounting is not compiled in: nothing to enable. */
+static void __init enable_swap_cgroup(void)
+{
+}
+#endif
+/*
+ * Allocate and initialise the mem_cgroup for @cont.
+ *
+ * Returns the embedded css on success, or ERR_PTR(-ENOMEM) if either the
+ * mem_cgroup or any per-node zone info cannot be allocated.
+ */
 static struct cgroup_subsys_state *
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
-        struct mem_cgroup *mem;
+        struct mem_cgroup *mem, *parent;
         int node;
-        if (unlikely((cont->parent) == NULL)) {
+        mem = mem_cgroup_alloc();
-                mem = &init_mem_cgroup;
+        if (!mem)
-        } else {
+                return ERR_PTR(-ENOMEM);
-                mem = mem_cgroup_alloc();
-                if (!mem)
-                        return ERR_PTR(-ENOMEM);
-        }
-        res_counter_init(&mem->res);
         for_each_node_state(node, N_POSSIBLE)
                 if (alloc_mem_cgroup_per_zone_info(mem, node))
                         goto free_out;
+        /* the root cgroup has no parent */
+        if (cont->parent == NULL) {
+                enable_swap_cgroup();
+                parent = NULL;
+        } else {
+                parent = mem_cgroup_from_cont(cont->parent);
+                /* a child starts with its parent's hierarchy setting */
+                mem->use_hierarchy = parent->use_hierarchy;
+        }
+        /* counters get the parent's counters only when it uses hierarchy */
+        if (parent && parent->use_hierarchy) {
+                res_counter_init(&mem->res, &parent->res);
+                res_counter_init(&mem->memsw, &parent->memsw);
+        } else {
+                res_counter_init(&mem->res, NULL);
+                res_counter_init(&mem->memsw, NULL);
+        }
+        mem->last_scanned_child = NULL;
+        spin_lock_init(&mem->reclaim_param_lock);
+        if (parent)
+                mem->swappiness = get_swappiness(parent);
+        /* initial reference, dropped by mem_cgroup_destroy() */
+        atomic_set(&mem->refcnt, 1);
         return &mem->css;
 free_out:
-        for_each_node_state(node, N_POSSIBLE)
+        __mem_cgroup_free(mem);
-                free_mem_cgroup_per_zone_info(mem, node);
-        if (cont->parent != NULL)
-                mem_cgroup_free(mem);
         return ERR_PTR(-ENOMEM);
 }
@@ -1115,26 +2226,26 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
                                        struct cgroup *cont)
 {
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-        mem_cgroup_force_empty(mem);
+        mem_cgroup_force_empty(mem, false);
 }
+/*
+ * Drop the cgroup's reference on its mem_cgroup; the structure is freed
+ * by mem_cgroup_put() once no reference remains.
+ */
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                 struct cgroup *cont)
 {
-        int node;
+        mem_cgroup_put(mem_cgroup_from_cont(cont));
-        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-        for_each_node_state(node, N_POSSIBLE)
-                free_mem_cgroup_per_zone_info(mem, node);
-        mem_cgroup_free(mem_cgroup_from_cont(cont));
 }
+/*
+ * Add the memory controller files to @cont: first mem_cgroup_files, then,
+ * if that succeeded, the memsw files via register_memsw_files().
+ * Returns 0 on success or the first error encountered.
+ */
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
                                 struct cgroup *cont)
 {
-        return cgroup_add_files(cont, ss, mem_cgroup_files,
+        int ret;
-                                        ARRAY_SIZE(mem_cgroup_files));
+        ret = cgroup_add_files(cont, ss, mem_cgroup_files,
+                                ARRAY_SIZE(mem_cgroup_files));
+        if (!ret)
+                ret = register_memsw_files(cont, ss);
+        return ret;
 }
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -1142,25 +2253,12 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *old_cont,
                                struct task_struct *p)
 {
-        struct mm_struct *mm;
+        mutex_lock(&memcg_tasklist);
-        struct mem_cgroup *mem, *old_mem;
-        mm = get_task_mm(p);
-        if (mm == NULL)
-                return;
-        mem = mem_cgroup_from_cont(cont);
-        old_mem = mem_cgroup_from_cont(old_cont);
        /*
-         * Only thread group leaders are allowed to migrate, the mm_struct is
+         * FIXME: It's better to move charges of this process from old
-         * in effect owned by the leader
+         * memcg to new memcg. But it's just on TODO-List now.
         */
-        if (!thread_group_leader(p))
+        mutex_unlock(&memcg_tasklist);
-                goto out;
-out:
-        mmput(mm);
 }
 struct cgroup_subsys mem_cgroup_subsys = {
@@ -1173,3 +2271,13 @@ struct cgroup_subsys mem_cgroup_subsys = {
        .attach = mem_cgroup_move_task,
        .early_init = 0,
 };
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/* Handler for the "noswapaccount" boot option: clear really_do_swap_account. */
+static int __init disable_swap_account(char *s)
+{
+        really_do_swap_account = 0;
+        return 1;
+}
+__setup("noswapaccount", disable_swap_account);
+#endif
diff --git a/mm/memory.c b/mm/memory.c
index 3f8fa06b963b..e009ce870859 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2000,7 +2000,7 @@ gotten:
        cow_user_page(new_page, old_page, address, vma);
        __SetPageUptodate(new_page);
-        if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+        if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
        /*
@@ -2392,6 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *page;
        swp_entry_t entry;
        pte_t pte;
+        struct mem_cgroup *ptr = NULL;
        int ret = 0;
        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2430,7 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        lock_page(page);
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-        if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
                ret = VM_FAULT_OOM;
                unlock_page(page);
                goto out;
@@ -2448,7 +2449,19 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_nomap;
        }
-        /* The page isn't present yet, go ahead with the fault. */
+        /*
+         * The page isn't present yet, go ahead with the fault.
+         *
+         * Be careful about the sequence of operations here.
+         * To get its accounting right, reuse_swap_page() must be called
+         * while the page is counted on swap but not yet in mapcount i.e.
+         * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+         * must be called after the swap_free(), or it will never succeed.
+         * Because delete_from_swap_cache() may be called by reuse_swap_page(),
+         * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
+         * in page->private. In this case, a record in swap_cgroup is silently
+         * discarded at swap_free().
+         */
        inc_mm_counter(mm, anon_rss);
        pte = mk_pte(page, vma->vm_page_prot);
@@ -2456,10 +2469,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                write_access = 0;
        }
        flush_icache_page(vma, page);
        set_pte_at(mm, address, page_table, pte);
        page_add_anon_rmap(page, vma, address);
+        /* It's better to call commit-charge after rmap is established */
+        mem_cgroup_commit_charge_swapin(page, ptr);
        swap_free(entry);
        if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@@ -2480,7 +2494,7 @@ unlock:
 out:
        return ret;
 out_nomap:
-        mem_cgroup_uncharge_page(page);
+        mem_cgroup_cancel_charge_swapin(ptr);
        pte_unmap_unlock(page_table, ptl);
        unlock_page(page);
        page_cache_release(page);
@@ -2510,7 +2524,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto oom;
        __SetPageUptodate(page);
-        if (mem_cgroup_charge(page, mm, GFP_KERNEL))
+        if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
                goto oom_free_page;
        entry = mk_pte(page, vma->vm_page_prot);
@@ -2601,7 +2615,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                ret = VM_FAULT_OOM;
                                goto out;
                        }
-                        if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+                        if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
                                ret = VM_FAULT_OOM;
                                page_cache_release(page);
                                goto out;
diff --git a/mm/migrate.c b/mm/migrate.c
index 55373983c9c6..a30ea5fcf9f1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -121,20 +121,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
        if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
                goto out;
-        /*
-         * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
-         * Failure is not an option here: we're now expected to remove every
-         * migration pte, and will cause crashes otherwise.  Normally this
-         * is not an issue: mem_cgroup_prepare_migration bumped up the old
-         * page_cgroup count for safety, that's now attached to the new page,
-         * so this charge should just be another incrementation of the count,
-         * to keep in balance with rmap.c's mem_cgroup_uncharging.  But if
-         * there's been a force_empty, those reference counts may no longer
-         * be reliable, and this charge can actually fail: oh well, we don't
-         * make the situation any worse by proceeding as if it had succeeded.
-         */
-        mem_cgroup_charge(new, mm, GFP_ATOMIC);
        get_page(new);
        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
        if (is_write_migration_entry(entry))
@@ -378,9 +364,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
        anon = PageAnon(page);
        page->mapping = NULL;
-        if (!anon) /* This page was removed from radix-tree. */
-                mem_cgroup_uncharge_cache_page(page);
        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
@@ -614,6 +597,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
        struct page *newpage = get_new_page(page, private, &result);
        int rcu_locked = 0;
        int charge = 0;
+        struct mem_cgroup *mem;
        if (!newpage)
                return -ENOMEM;
@@ -623,24 +607,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                goto move_newpage;
        }
-        charge = mem_cgroup_prepare_migration(page, newpage);
-        if (charge == -ENOMEM) {
-                rc = -ENOMEM;
-                goto move_newpage;
-        }
        /* prepare cgroup just returns 0 or -ENOMEM */
-        BUG_ON(charge);
        rc = -EAGAIN;
        if (!trylock_page(page)) {
                if (!force)
                        goto move_newpage;
                lock_page(page);
        }
+        /* charge against new page */
+        charge = mem_cgroup_prepare_migration(page, &mem);
+        if (charge == -ENOMEM) {
+                rc = -ENOMEM;
+                goto unlock;
+        }
+        BUG_ON(charge);
        if (PageWriteback(page)) {
                if (!force)
-                        goto unlock;
+                        goto uncharge;
                wait_on_page_writeback(page);
        }
        /*
@@ -693,7 +679,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 rcu_unlock:
        if (rcu_locked)
                rcu_read_unlock();
+uncharge:
+        if (!charge)
+                mem_cgroup_end_migration(mem, page, newpage);
 unlock:
        unlock_page(page);
@@ -709,8 +697,6 @@ unlock:
        }
 move_newpage:
-        if (!charge)
-                mem_cgroup_end_migration(newpage);
        /*
         * Move the new page to the LRU. If migration was not successful
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6b9e758c98a5..40ba05061a4f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -429,7 +429,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
        unsigned long points = 0;
        struct task_struct *p;
-        cgroup_lock();
        read_lock(&tasklist_lock);
 retry:
        p = select_bad_process(&points, mem);
@@ -444,7 +443,6 @@ retry:
                goto retry;
 out:
        read_unlock(&tasklist_lock);
-        cgroup_unlock();
 }
 #endif
@@ -560,6 +558,13 @@ void pagefault_out_of_memory(void)
                /* Got some memory back in the last second. */
                return;
+        /*
+         * If this fault came from memcg, its oom-killer has already been
+         * invoked, so it is not worth escalating to a system-wide oom.
+         */
+        if (mem_cgroup_oom_called(current))
+                goto rest_and_return;
        if (sysctl_panic_on_oom)
                panic("out of memory from page fault. panic_on_oom is selected.\n");
@@ -571,6 +576,7 @@ void pagefault_out_of_memory(void)
         * Give "p" a good chance of killing itself before we
         * retry to allocate memory.
         */
+rest_and_return:
        if (!test_thread_flag(TIF_MEMDIE))
                schedule_timeout_uninterruptible(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7bf22e045318..5675b3073854 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3523,10 +3523,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                        INIT_LIST_HEAD(&zone->lru[l].list);
                        zone->lru[l].nr_scan = 0;
                }
-                zone->recent_rotated[0] = 0;
+                zone->reclaim_stat.recent_rotated[0] = 0;
-                zone->recent_rotated[1] = 0;
+                zone->reclaim_stat.recent_rotated[1] = 0;
-                zone->recent_scanned[0] = 0;
+                zone->reclaim_stat.recent_scanned[0] = 0;
-                zone->recent_scanned[1] = 0;
+                zone->reclaim_stat.recent_scanned[1] = 0;
                zap_zone_vm_stats(zone);
                zone->flags = 0;
                if (!size)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index d6507a660ed6..7006a11350c8 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -8,6 +8,7 @@
 #include <linux/memory.h>
 #include <linux/vmalloc.h>
 #include <linux/cgroup.h>
+#include <linux/swapops.h>
 static void __meminit
 __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -15,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
        pc->flags = 0;
        pc->mem_cgroup = NULL;
        pc->page = pfn_to_page(pfn);
+        INIT_LIST_HEAD(&pc->lru);
 }
 static unsigned long total_usage;
@@ -72,7 +74,7 @@ void __init page_cgroup_init(void)
        int nid, fail;
-        if (mem_cgroup_subsys.disabled)
+        if (mem_cgroup_disabled())
                return;
        for_each_online_node(nid)  {
@@ -103,13 +105,11 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
 /* __alloc_bootmem...() is protected by !slab_available() */
 static int __init_refok init_section_page_cgroup(unsigned long pfn)
 {
-        struct mem_section *section;
+        struct mem_section *section = __pfn_to_section(pfn);
        struct page_cgroup *base, *pc;
        unsigned long table_size;
        int nid, index;
-        section = __pfn_to_section(pfn);
        if (!section->page_cgroup) {
                nid = page_to_nid(pfn_to_page(pfn));
                table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
@@ -145,7 +145,6 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
                __init_page_cgroup(pc, pfn + index);
        }
-        section = __pfn_to_section(pfn);
        section->page_cgroup = base - pfn;
        total_usage += table_size;
        return 0;
@@ -248,7 +247,7 @@ void __init page_cgroup_init(void)
        unsigned long pfn;
        int fail = 0;
-        if (mem_cgroup_subsys.disabled)
+        if (mem_cgroup_disabled())
                return;
        for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
@@ -273,3 +272,199 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 }
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static DEFINE_MUTEX(swap_cgroup_mutex);
+/* Per-swap-type table of pages holding swap_cgroup records. */
+struct swap_cgroup_ctrl {
+        struct page **map;      /* array of page pointers, 'length' slots */
+        unsigned long length;   /* number of slots in map */
+};
+/* Indexed by swap type, i.e. swp_type() of a swap entry. */
+struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+/*
+ * One record per swap offset. Storing a full pointer (8 bytes on 64-bit)
+ * seems big; maybe this can be reduced once an "id" can be used for a
+ * cgroup rather than a pointer.
+ */
+struct swap_cgroup {
+        struct mem_cgroup       *val;
+};
+/* Records per map page; the mask form needs this to be a power of two. */
+#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
+#define SC_POS_MASK     (SC_PER_PAGE - 1)
+/*
+ * SwapCgroup implements "lookup" and "exchange" operations.
+ * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
+ * against SwapCache. At swap_free(), this is accessed directly from swap.
+ *
+ * This means,
+ *  - we have no race in "exchange" when we're accessed via SwapCache because
+ *    SwapCache(and its swp_entry) is under lock.
+ *  - When called via swap_free(), there is no user of this entry and no race.
+ * Then, we don't need lock around "exchange".
+ *
+ * TODO: we can push these buffers out to HIGHMEM.
+ */
+/*
+ * Fill every slot of the map for swap type @type with a zeroed page.
+ *
+ * Returns 0 on success, or immediately when swap accounting is off. On
+ * allocation failure the pages obtained so far are released and -ENOMEM
+ * is returned.
+ */
+static int swap_cgroup_prepare(int type)
+{
+        struct swap_cgroup_ctrl *ctrl;
+        unsigned long filled, i;
+        if (!do_swap_account)
+                return 0;
+        ctrl = &swap_cgroup_ctrl[type];
+        for (filled = 0; filled < ctrl->length; filled++) {
+                struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+                if (page) {
+                        ctrl->map[filled] = page;
+                        continue;
+                }
+                /* out of memory: give back what was allocated so far */
+                for (i = 0; i < filled; i++)
+                        __free_page(ctrl->map[i]);
+                return -ENOMEM;
+        }
+        return 0;
+}
+/**
+ * swap_cgroup_record - store a mem_cgroup in the record of a swap entry
+ * @ent: swap entry whose record is updated
+ * @mem: mem_cgroup to store
+ *
+ * Returns the value previously stored (which may be NULL). Also returns
+ * NULL, without touching anything, when swap accounting is off.
+ */
+struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
+{
+        unsigned long off;
+        struct swap_cgroup_ctrl *ctrl;
+        struct swap_cgroup *slot;
+        struct mem_cgroup *prev;
+        if (!do_swap_account)
+                return NULL;
+        off = swp_offset(ent);
+        ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+        /* page holding the record, then the record inside that page */
+        slot = page_address(ctrl->map[off / SC_PER_PAGE]);
+        slot += off & SC_POS_MASK;
+        prev = slot->val;
+        slot->val = mem;
+        return prev;
+}
+/**
+ * lookup_swap_cgroup - fetch the mem_cgroup recorded for a swap entry
+ * @ent: swap entry to look up
+ *
+ * Returns the recorded mem_cgroup pointer (possibly NULL), or NULL when
+ * swap accounting is off.
+ */
+struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
+{
+        unsigned long off;
+        struct swap_cgroup_ctrl *ctrl;
+        struct swap_cgroup *slot;
+        if (!do_swap_account)
+                return NULL;
+        off = swp_offset(ent);
+        ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+        /* page holding the record, then the record inside that page */
+        slot = page_address(ctrl->map[off / SC_PER_PAGE]);
+        return slot[off & SC_POS_MASK].val;
+}
+/*
+ * Set up the swap_cgroup map for swap type @type, sized for @max_pages
+ * swap pages: a vmalloc'ed array of page pointers plus one zeroed page
+ * per slot (allocated by swap_cgroup_prepare()).
+ *
+ * Returns 0 on success or when swap accounting is off, -ENOMEM if either
+ * allocation fails (the control entry is then left empty).
+ */
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+        void *array;
+        unsigned long array_size;
+        unsigned long length;
+        struct swap_cgroup_ctrl *ctrl;
+        if (!do_swap_account)
+                return 0;
+        /* number of record pages needed to cover max_pages entries */
+        length = ((max_pages/SC_PER_PAGE) + 1);
+        array_size = length * sizeof(void *);
+        array = vmalloc(array_size);
+        if (!array)
+                goto nomem;
+        memset(array, 0, array_size);
+        ctrl = &swap_cgroup_ctrl[type];
+        mutex_lock(&swap_cgroup_mutex);
+        ctrl->length = length;
+        ctrl->map = array;
+        if (swap_cgroup_prepare(type)) {
+                /* memory shortage */
+                ctrl->map = NULL;
+                ctrl->length = 0;
+                vfree(array);
+                mutex_unlock(&swap_cgroup_mutex);
+                goto nomem;
+        }
+        mutex_unlock(&swap_cgroup_mutex);
+        /* both values are unsigned long, hence %lu */
+        printk(KERN_INFO
+                "swap_cgroup: uses %lu bytes of vmalloc for pointer array space"
+                " and %lu bytes to hold mem_cgroup pointers on swap\n",
+                array_size, length * PAGE_SIZE);
+        printk(KERN_INFO
+        "swap_cgroup can be disabled by noswapaccount boot option.\n");
+        return 0;
+nomem:
+        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
+        printk(KERN_INFO
+                "swap_cgroup can be disabled by noswapaccount boot option\n");
+        return -ENOMEM;
+}
+/*
+ * Release the swap_cgroup map of swap type @type: every record page, then
+ * the pointer array itself. Does nothing when swap accounting is off or
+ * no map is installed.
+ */
+void swap_cgroup_swapoff(int type)
+{
+        unsigned long i;        /* same type as ctrl->length */
+        struct swap_cgroup_ctrl *ctrl;
+        if (!do_swap_account)
+                return;
+        mutex_lock(&swap_cgroup_mutex);
+        ctrl = &swap_cgroup_ctrl[type];
+        if (ctrl->map) {
+                for (i = 0; i < ctrl->length; i++) {
+                        struct page *page = ctrl->map[i];
+                        if (page)
+                                __free_page(page);
+                }
+                vfree(ctrl->map);
+                ctrl->map = NULL;
+                ctrl->length = 0;
+        }
+        mutex_unlock(&swap_cgroup_mutex);
+}
+#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 5941f9801363..5d0de96c9789 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -928,7 +928,11 @@ found:
        error = 1;
        if (!inode)
                goto out;
-        /* Precharge page using GFP_KERNEL while we can wait */
+        /*
+         * Charge page using GFP_KERNEL while we can wait.
+         * Charged back to the user(not to caller) when swap account is used.
+         * add_to_page_cache() will be called with GFP_NOWAIT.
+         */
        error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
        if (error)
                goto out;
@@ -1320,15 +1324,19 @@ repeat:
                } else {
                        shmem_swp_unmap(entry);
                        spin_unlock(&info->lock);
-                        unlock_page(swappage);
-                        page_cache_release(swappage);
                        if (error == -ENOMEM) {
                                /* allow reclaim from this memory cgroup */
-                                error = mem_cgroup_shrink_usage(current->mm,
+                                error = mem_cgroup_shrink_usage(swappage,
+                                                                current->mm,
                                                                gfp);
-                                if (error)
+                                if (error) {
+                                        unlock_page(swappage);
+                                        page_cache_release(swappage);
                                        goto failed;
+                                }
                        }
+                        unlock_page(swappage);
+                        page_cache_release(swappage);
                        goto repeat;
                }
        } else if (sgp == SGP_READ && !filepage) {
@@ -1379,7 +1387,7 @@ repeat:
                        /* Precharge page while we can wait, compensate after */
                        error = mem_cgroup_cache_charge(filepage, current->mm,
-                                                        gfp & ~__GFP_HIGHMEM);
+                                        GFP_KERNEL);
                        if (error) {
                                page_cache_release(filepage);
                                shmem_unacct_blocks(info->flags, 1);
diff --git a/mm/swap.c b/mm/swap.c
index ba2c0e8b8b54..8adb9feb61e1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -151,6 +151,26 @@ void  rotate_reclaimable_page(struct page *page)
        }
 }
+/*
+ * Bump recent_scanned[@file] -- and recent_rotated[@file] too when
+ * @rotated is non-zero -- in @zone's reclaim_stat and, if
+ * mem_cgroup_get_reclaim_stat_from_page() hands back a non-NULL stat
+ * for @page, in that stat as well.
+ */
+static void update_page_reclaim_stat(struct zone *zone, struct page *page,
+                                     int file, int rotated)
+{
+        struct zone_reclaim_stat *cg_stat =
+                mem_cgroup_get_reclaim_stat_from_page(page);
+        struct zone_reclaim_stat *zone_stat = &zone->reclaim_stat;
+        zone_stat->recent_scanned[file]++;
+        if (cg_stat)
+                cg_stat->recent_scanned[file]++;
+        if (!rotated)
+                return;
+        zone_stat->recent_rotated[file]++;
+        if (cg_stat)
+                cg_stat->recent_rotated[file]++;
+}
 /*
 * FIXME: speed this up?
 */
@@ -168,10 +188,8 @@ void activate_page(struct page *page)
                lru += LRU_ACTIVE;
                add_page_to_lru_list(zone, page, lru);
                __count_vm_event(PGACTIVATE);
-                mem_cgroup_move_lists(page, lru);
-                zone->recent_rotated[!!file]++;
+                update_page_reclaim_stat(zone, page, !!file, 1);
-                zone->recent_scanned[!!file]++;
        }
        spin_unlock_irq(&zone->lru_lock);
 }
@@ -386,12 +404,14 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 {
        int i;
        struct zone *zone = NULL;
        VM_BUG_ON(is_unevictable_lru(lru));
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);
                int file;
+                int active;
                if (pagezone != zone) {
                        if (zone)
@@ -403,12 +423,11 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
                VM_BUG_ON(PageUnevictable(page));
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
+                active = is_active_lru(lru);
                file = is_file_lru(lru);
-                zone->recent_scanned[file]++;
+                if (active)
-                if (is_active_lru(lru)) {
                        SetPageActive(page);
-                        zone->recent_rotated[file]++;
+                update_page_reclaim_stat(zone, page, file, active);
-                }
                add_page_to_lru_list(zone, page, lru);
        }
        if (zone)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 81c825f67a7f..3ecea98ecb45 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/page_cgroup.h>
 #include <asm/pgtable.h>
@@ -108,6 +109,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 */
 void __delete_from_swap_cache(struct page *page)
 {
+        swp_entry_t ent = {.val = page_private(page)};
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(!PageSwapCache(page));
        VM_BUG_ON(PageWriteback(page));
@@ -118,6 +121,7 @@ void __delete_from_swap_cache(struct page *page)
        total_swapcache_pages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
        INC_CACHE_INFO(del_total);
+        mem_cgroup_uncharge_swapcache(page, ent);
 }
 /**
diff --git a/mm/swapfile.c b/mm/swapfile.c
index eec5ca758a23..da422c47e2ee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
+#include <linux/page_cgroup.h>
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -470,8 +471,9 @@ out:
        return NULL;
 }
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
 {
+        unsigned long offset = swp_offset(ent);
        int count = p->swap_map[offset];
        if (count < SWAP_MAP_MAX) {
@@ -486,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
                                swap_list.next = p - swap_info;
                        nr_swap_pages++;
                        p->inuse_pages--;
+                        mem_cgroup_uncharge_swap(ent);
                }
        }
        return count;
@@ -501,7 +504,7 @@ void swap_free(swp_entry_t entry)
        p = swap_info_get(entry);
        if (p) {
-                swap_entry_free(p, swp_offset(entry));
+                swap_entry_free(p, entry);
                spin_unlock(&swap_lock);
        }
 }
@@ -581,7 +584,7 @@ int free_swap_and_cache(swp_entry_t entry)
        p = swap_info_get(entry);
        if (p) {
-                if (swap_entry_free(p, swp_offset(entry)) == 1) {
+                if (swap_entry_free(p, entry) == 1) {
                        page = find_get_page(&swapper_space, entry.val);
                        if (page && !trylock_page(page)) {
                                page_cache_release(page);
@@ -690,17 +693,18 @@ unsigned int count_swap_pages(int type, int free)
 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
+        struct mem_cgroup *ptr = NULL;
        spinlock_t *ptl;
        pte_t *pte;
        int ret = 1;
-        if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+        if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr))
                ret = -ENOMEM;
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
                if (ret > 0)
-                        mem_cgroup_uncharge_page(page);
+                        mem_cgroup_cancel_charge_swapin(ptr);
                ret = 0;
                goto out;
        }
@@ -710,6 +714,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
        page_add_anon_rmap(page, vma, addr);
+        mem_cgroup_commit_charge_swapin(page, ptr);
        swap_free(entry);
        /*
         * Move the page to the active list so it is not
@@ -1492,6 +1497,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
        spin_unlock(&swap_lock);
        mutex_unlock(&swapon_mutex);
        vfree(swap_map);
+        /* Destroy swap account information */
+        swap_cgroup_swapoff(type);
        inode = mapping->host;
        if (S_ISBLK(inode->i_mode)) {
                struct block_device *bdev = I_BDEV(inode);
@@ -1809,6 +1817,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
                }
                swap_map[page_nr] = SWAP_MAP_BAD;
        }
+        error = swap_cgroup_swapon(type, maxpages);
+        if (error)
+                goto bad_swap;
        nr_good_pages = swap_header->info.last_page -
                        swap_header->info.nr_badpages -
                        1 /* header page */;
@@ -1880,6 +1893,7 @@ bad_swap:
                bd_release(bdev);
        }
        destroy_swap_extents(p);
+        swap_cgroup_swapoff(type);
 bad_swap_2:
        spin_lock(&swap_lock);
        p->swap_file = NULL;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b07c48b09a93..9a27c44aa327 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -125,11 +125,30 @@ static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
-#define scan_global_lru(sc)     (!(sc)->mem_cgroup)
+#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
 #else
-#define scan_global_lru(sc)     (1)
+#define scanning_global_lru(sc) (1)
 #endif
+/*
+ * Select the reclaim statistics for this scan: @zone's own reclaim_stat
+ * when scanning_global_lru(@sc) holds, otherwise whatever
+ * mem_cgroup_get_reclaim_stat() returns for sc->mem_cgroup and @zone.
+ */
+static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
+                                                  struct scan_control *sc)
+{
+        if (scanning_global_lru(sc))
+                return &zone->reclaim_stat;
+        return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
+}
+/*
+ * Number of pages on LRU list @lru for this scan: the zone counter
+ * NR_LRU_BASE + @lru when scanning_global_lru(@sc) holds, otherwise the
+ * value mem_cgroup_zone_nr_pages() reports for sc->mem_cgroup in @zone.
+ */
+static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
+                                   enum lru_list lru)
+{
+        if (scanning_global_lru(sc))
+                return zone_page_state(zone, NR_LRU_BASE + lru);
+        return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
+}
 /*
 * Add a shrinker callback to be called from the vm
 */
@@ -512,7 +531,6 @@ redo:
                lru = LRU_UNEVICTABLE;
                add_page_to_unevictable_list(page);
        }
-        mem_cgroup_move_lists(page, lru);
        /*
         * page's status can change while we move it among lru. If an evictable
@@ -547,7 +565,6 @@ void putback_lru_page(struct page *page)
        lru = !!TestClearPageActive(page) + page_is_file_cache(page);
        lru_cache_add_lru(page, lru);
-        mem_cgroup_move_lists(page, lru);
        put_page(page);
 }
 #endif /* CONFIG_UNEVICTABLE_LRU */
@@ -813,6 +830,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
                return ret;
        ret = -EBUSY;
        if (likely(get_page_unless_zero(page))) {
                /*
                 * Be careful not to clear PageLRU until after we're
@@ -821,6 +839,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
                 */
                ClearPageLRU(page);
                ret = 0;
+                mem_cgroup_del_lru(page);
        }
        return ret;
@@ -1029,6 +1048,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
        struct pagevec pvec;
        unsigned long nr_scanned = 0;
        unsigned long nr_reclaimed = 0;
+        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        pagevec_init(&pvec, 1);
@@ -1070,13 +1090,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                __mod_zone_page_state(zone, NR_INACTIVE_ANON,
                                                -count[LRU_INACTIVE_ANON]);
-                if (scan_global_lru(sc)) {
+                if (scanning_global_lru(sc))
                        zone->pages_scanned += nr_scan;
-                        zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
-                        zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
+                reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
-                        zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
+                reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
-                        zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
+                reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
-                }
+                reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
                spin_unlock_irq(&zone->lru_lock);
                nr_scanned += nr_scan;
@@ -1108,7 +1129,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                if (current_is_kswapd()) {
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
                        __count_vm_events(KSWAPD_STEAL, nr_freed);
-                } else if (scan_global_lru(sc))
+                } else if (scanning_global_lru(sc))
                        __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
                __count_zone_vm_events(PGSTEAL, zone, nr_freed);
@@ -1134,10 +1155,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                        SetPageLRU(page);
                        lru = page_lru(page);
                        add_page_to_lru_list(zone, page, lru);
-                        mem_cgroup_move_lists(page, lru);
+                        if (PageActive(page)) {
-                        if (PageActive(page) && scan_global_lru(sc)) {
                                int file = !!page_is_file_cache(page);
-                                zone->recent_rotated[file]++;
+                                reclaim_stat->recent_rotated[file]++;
                        }
                        if (!pagevec_add(&pvec, page)) {
                                spin_unlock_irq(&zone->lru_lock);
@@ -1197,6 +1217,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        struct page *page;
        struct pagevec pvec;
        enum lru_list lru;
+        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
@@ -1207,10 +1228,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         * zone->pages_scanned is used for detect zone's oom
         * mem_cgroup remembers nr_scan by itself.
         */
-        if (scan_global_lru(sc)) {
+        if (scanning_global_lru(sc)) {
                zone->pages_scanned += pgscanned;
-                zone->recent_scanned[!!file] += pgmoved;
        }
+        reclaim_stat->recent_scanned[!!file] += pgmoved;
        if (file)
                __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
@@ -1251,8 +1272,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         * This helps balance scan pressure between file and anonymous
         * pages in get_scan_ratio.
         */
-        if (scan_global_lru(sc))
+        reclaim_stat->recent_rotated[!!file] += pgmoved;
-                zone->recent_rotated[!!file] += pgmoved;
        while (!list_empty(&l_inactive)) {
                page = lru_to_page(&l_inactive);
@@ -1263,7 +1283,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                ClearPageActive(page);
                list_move(&page->lru, &zone->lru[lru].list);
-                mem_cgroup_move_lists(page, lru);
+                mem_cgroup_add_lru_list(page, lru);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1292,6 +1312,38 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        pagevec_release(&pvec);
 }
+/*
+ * Zone-wide check: returns 1 when the NR_INACTIVE_ANON count scaled by
+ * zone->inactive_ratio is still below the NR_ACTIVE_ANON count, else 0.
+ */
+static int inactive_anon_is_low_global(struct zone *zone)
+{
+        unsigned long nr_active = zone_page_state(zone, NR_ACTIVE_ANON);
+        unsigned long nr_inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+        return nr_inactive * zone->inactive_ratio < nr_active;
+}
+/**
+ * inactive_anon_is_low - decide whether active anon pages should be deactivated
+ * @zone: zone being examined
+ * @sc:   scan control describing the current reclaim context
+ *
+ * When scanning_global_lru(@sc) holds, the answer comes from the zone-wide
+ * check inactive_anon_is_low_global(); otherwise it is whatever
+ * mem_cgroup_inactive_anon_is_low() reports for sc->mem_cgroup.
+ *
+ * Returns non-zero if there are too few inactive anon pages, meaning some
+ * active anon pages need to be deactivated.
+ */
+static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
+{
+        if (scanning_global_lru(sc))
+                return inactive_anon_is_low_global(zone);
+        return mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+}
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
        struct zone *zone, struct scan_control *sc, int priority)
 {
@@ -1302,8 +1354,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                return 0;
        }
-        if (lru == LRU_ACTIVE_ANON &&
+        if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
-            (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
                shrink_active_list(nr_to_scan, zone, sc, priority, file);
                return 0;
        }
@@ -1325,6 +1376,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
        unsigned long anon, file, free;
        unsigned long anon_prio, file_prio;
        unsigned long ap, fp;
+        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        /* If we have no swap space, do not bother scanning anon pages. */
        if (nr_swap_pages <= 0) {
@@ -1333,17 +1385,20 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
                return;
        }
-        anon  = zone_page_state(zone, NR_ACTIVE_ANON) +
+        anon  = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
-                zone_page_state(zone, NR_INACTIVE_ANON);
+                zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
-        file  = zone_page_state(zone, NR_ACTIVE_FILE) +
+        file  = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
-                zone_page_state(zone, NR_INACTIVE_FILE);
+                zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
-        free  = zone_page_state(zone, NR_FREE_PAGES);
+        if (scanning_global_lru(sc)) {
-        /* If we have very few page cache pages, force-scan anon pages. */
+                free  = zone_page_state(zone, NR_FREE_PAGES);
-        if (unlikely(file + free <= zone->pages_high)) {
+                /* If we have very few page cache pages,
-                percent[0] = 100;
+                   force-scan anon pages. */
-                percent[1] = 0;
+                if (unlikely(file + free <= zone->pages_high)) {
-                return;
+                        percent[0] = 100;
+                        percent[1] = 0;
+                        return;
+                }
        }
        /*
@@ -1357,17 +1412,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
         *
         * anon in [0], file in [1]
         */
-        if (unlikely(zone->recent_scanned[0] > anon / 4)) {
+        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
                spin_lock_irq(&zone->lru_lock);
-                zone->recent_scanned[0] /= 2;
+                reclaim_stat->recent_scanned[0] /= 2;
-                zone->recent_rotated[0] /= 2;
+                reclaim_stat->recent_rotated[0] /= 2;
                spin_unlock_irq(&zone->lru_lock);
        }
-        if (unlikely(zone->recent_scanned[1] > file / 4)) {
+        if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
                spin_lock_irq(&zone->lru_lock);
-                zone->recent_scanned[1] /= 2;
+                reclaim_stat->recent_scanned[1] /= 2;
-                zone->recent_rotated[1] /= 2;
+                reclaim_stat->recent_rotated[1] /= 2;
                spin_unlock_irq(&zone->lru_lock);
        }
@@ -1383,11 +1438,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
         * proportional to the fraction of recently scanned pages on
         * each list that were recently referenced and in active use.
         */
-        ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
+        ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
-        ap /= zone->recent_rotated[0] + 1;
+        ap /= reclaim_stat->recent_rotated[0] + 1;
-        fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
+        fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
-        fp /= zone->recent_rotated[1] + 1;
+        fp /= reclaim_stat->recent_rotated[1] + 1;
        /* Normalize to percentages */
        percent[0] = 100 * ap / (ap + fp + 1);
@@ -1411,30 +1466,23 @@ static void shrink_zone(int priority, struct zone *zone,
        get_scan_ratio(zone, sc, percent);
        for_each_evictable_lru(l) {
-                if (scan_global_lru(sc)) {
+                int file = is_file_lru(l);
-                        int file = is_file_lru(l);
+                int scan;
-                        int scan;
+                scan = zone_page_state(zone, NR_LRU_BASE + l);
-                        scan = zone_page_state(zone, NR_LRU_BASE + l);
+                if (priority) {
-                        if (priority) {
+                        scan >>= priority;
-                                scan >>= priority;
+                        scan = (scan * percent[file]) / 100;
-                                scan = (scan * percent[file]) / 100;
+                }
-                        }
+                if (scanning_global_lru(sc)) {
                        zone->lru[l].nr_scan += scan;
                        nr[l] = zone->lru[l].nr_scan;
                        if (nr[l] >= swap_cluster_max)
                                zone->lru[l].nr_scan = 0;
                        else
                                nr[l] = 0;
-                } else {
+                } else
-                        /*
+                        nr[l] = scan;
-                         * This reclaim occurs not because zone memory shortage
-                         * but because memory controller hits its limit.
-                         * Don't modify zone reclaim related data.
-                         */
-                        nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
-                                                                priority, l);
-                }
        }
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1467,9 +1515,7 @@ static void shrink_zone(int priority, struct zone *zone,
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-        if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
+        if (inactive_anon_is_low(zone, sc))
-                shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
-        else if (!scan_global_lru(sc))
                shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
        throttle_vm_writeout(sc->gfp_mask);
@@ -1504,7 +1550,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
                 * Take care memory controller reclaiming has small influence
                 * to global LRU.
                 */
-                if (scan_global_lru(sc)) {
+                if (scanning_global_lru(sc)) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
                        note_zone_scanning_priority(zone, priority);
@@ -1557,12 +1603,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        delayacct_freepages_start();
-        if (scan_global_lru(sc))
+        if (scanning_global_lru(sc))
                count_vm_event(ALLOCSTALL);
        /*
         * mem_cgroup will not do shrink_slab.
         */
-        if (scan_global_lru(sc)) {
+        if (scanning_global_lru(sc)) {
                for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1581,7 +1627,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                 * Don't shrink slabs when reclaiming memory from
                 * over limit cgroups
                 */
-                if (scan_global_lru(sc)) {
+                if (scanning_global_lru(sc)) {
                        shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
                        if (reclaim_state) {
                                sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1612,7 +1658,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                        congestion_wait(WRITE, HZ/10);
        }
        /* top priority shrink_zones still had more to do? don't OOM, then */
-        if (!sc->all_unreclaimable && scan_global_lru(sc))
+        if (!sc->all_unreclaimable && scanning_global_lru(sc))
                ret = sc->nr_reclaimed;
 out:
        /*
@@ -1625,7 +1671,7 @@ out:
        if (priority < 0)
                priority = 0;
-        if (scan_global_lru(sc)) {
+        if (scanning_global_lru(sc)) {
                for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1661,19 +1707,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
-                                                gfp_t gfp_mask)
+                                           gfp_t gfp_mask,
+                                           bool noswap,
+                                           unsigned int swappiness)
 {
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_swap = 1,
                .swap_cluster_max = SWAP_CLUSTER_MAX,
-                .swappiness = vm_swappiness,
+                .swappiness = swappiness,
                .order = 0,
                .mem_cgroup = mem_cont,
                .isolate_pages = mem_cgroup_isolate_pages,
        };
        struct zonelist *zonelist;
+        if (noswap)
+                sc.may_swap = 0;
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
        zonelist = NODE_DATA(numa_node_id())->node_zonelists;
@@ -1761,7 +1812,7 @@ loop_again:
                         * Do some background aging of the anon list, to give
                         * pages a chance to be referenced before reclaiming.
                         */
-                        if (inactive_anon_is_low(zone))
+                        if (inactive_anon_is_low(zone, &sc))
                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
                                                        &sc, priority, 0);
@@ -2404,6 +2455,7 @@ retry:
                __dec_zone_state(zone, NR_UNEVICTABLE);
                list_move(&page->lru, &zone->lru[l].list);
+                mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
                __inc_zone_state(zone, NR_INACTIVE_ANON + l);
                __count_vm_event(UNEVICTABLE_PGRESCUED);
        } else {
@@ -2412,6 +2464,7 @@ retry:
                 */
                SetPageUnevictable(page);
                list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
+                mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
                if (page_evictable(page, NULL))
                        goto retry;
        }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 76f06b94ab9f..c4a59824ac2c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2752,7 +2752,7 @@ int __init ip6_route_init(void)
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
-                goto out;;
+                goto out;
        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 9048fe7e7ea7..a031034720b4 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -128,7 +128,7 @@ static struct ctl_table_header *ip6_header;
 int ipv6_sysctl_register(void)
 {
-        int err = -ENOMEM;;
+        int err = -ENOMEM;
        ip6_header = register_net_sysctl_rotable(net_ipv6_ctl_path, ipv6_table);
        if (ip6_header == NULL)
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index f3965df00559..33133d27b539 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -435,7 +435,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
        int i;
        q->perturb_timer.function = sfq_perturbation;
-        q->perturb_timer.data = (unsigned long)sch;;
+        q->perturb_timer.data = (unsigned long)sch;
        init_timer_deferrable(&q->perturb_timer);
        for (i = 0; i < SFQ_HASH_DIVISOR; i++)
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 20c576f530fa..56935bbc1496 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -489,7 +489,7 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
        return 0;
 out_err:
-        /* Clean up any successfull allocations */
+        /* Clean up any successful allocations */
        sctp_auth_destroy_hmacs(ep->auth_hmacs);
        return -ENOMEM;
 }
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 5ba78701adc3..3aacd0fe7179 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -513,11 +513,14 @@ int devcgroup_inode_mknod(int mode, dev_t dev)
        struct dev_cgroup *dev_cgroup;
        struct dev_whitelist_item *wh;
+        if (!S_ISBLK(mode) && !S_ISCHR(mode))
+                return 0;
        rcu_read_lock();
        dev_cgroup = task_devcgroup(current);
-        list_for_each_entry(wh, &dev_cgroup->whitelist, list) {
+        list_for_each_entry_rcu(wh, &dev_cgroup->whitelist, list) {
                if (wh->type & DEV_ALL)
                        goto acc_check;
                if ((wh->type & DEV_BLOCK) && !S_ISBLK(mode))
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index bf107a389ac1..71e2b914363e 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -569,7 +569,7 @@ static ssize_t smk_write_cipso(struct file *file, const char __user *buf,
        if (skp == NULL)
                goto out;
-        rule += SMK_LABELLEN;;
+        rule += SMK_LABELLEN;
        ret = sscanf(rule, "%d", &maplevel);
        if (ret != 1 || maplevel > SMACK_CIPSO_MAXLEVEL)
                goto out;
diff --git a/sound/soc/au1x/dbdma2.c b/sound/soc/au1x/dbdma2.c
index 74c823d60f91..bc8d654576c0 100644
--- a/sound/soc/au1x/dbdma2.c
+++ b/sound/soc/au1x/dbdma2.c
@@ -187,7 +187,7 @@ static int au1x_pcm_dbdma_realloc(struct au1xpsc_audio_dmadata *pcd,
                                        au1x_pcm_dmatx_cb, (void *)pcd);
        if (!pcd->ddma_chan)
-                return -ENOMEM;;
+                return -ENOMEM;
        au1xxx_dbdma_set_devwidth(pcd->ddma_chan, msbits);
        au1xxx_dbdma_ring_alloc(pcd->ddma_chan, 2);
diff --git a/sound/soc/davinci/davinci-pcm.c b/sound/soc/davinci/davinci-pcm.c
index 74abc9b4f1cc..366049d8578c 100644
--- a/sound/soc/davinci/davinci-pcm.c
+++ b/sound/soc/davinci/davinci-pcm.c
@@ -212,7 +212,7 @@ davinci_pcm_pointer(struct snd_pcm_substream *substream)
        if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
                count = src - runtime->dma_addr;
        else
-                count = dst - runtime->dma_addr;;
+                count = dst - runtime->dma_addr;
        spin_unlock(&prtd->lock);