-rw-r--r--  Documentation/cgroups/hugetlb.txt | 2
-rw-r--r--  Documentation/cgroups/memory.txt | 26
-rw-r--r--  Documentation/cgroups/resource_counter.txt | 197
-rw-r--r--  Documentation/devicetree/bindings/rtc/rtc-omap.txt | 9
-rw-r--r--  Documentation/devicetree/bindings/vendor-prefixes.txt | 1
-rw-r--r--  Documentation/kdump/kdump.txt | 7
-rw-r--r--  Documentation/kernel-parameters.txt | 3
-rw-r--r--  Documentation/sysctl/kernel.txt | 40
-rw-r--r--  MAINTAINERS | 6
-rw-r--r--  arch/arm/boot/dts/am335x-boneblack.dts | 4
-rw-r--r--  arch/arm/boot/dts/am33xx.dtsi | 2
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 1
-rw-r--r--  arch/ia64/kernel/perfmon.c | 2
-rw-r--r--  arch/powerpc/include/asm/pgtable-ppc64.h | 1
-rw-r--r--  arch/powerpc/platforms/cell/spufs/inode.c | 4
-rw-r--r--  arch/sh/mm/numa.c | 2
-rw-r--r--  arch/sparc/include/asm/pgtable_64.h | 7
-rw-r--r--  arch/tile/kernel/early_printk.c | 19
-rw-r--r--  arch/tile/kernel/setup.c | 45
-rw-r--r--  arch/x86/include/asm/pgtable.h | 5
-rw-r--r--  drivers/base/Kconfig | 8
-rw-r--r--  drivers/rtc/Kconfig | 8
-rw-r--r--  drivers/rtc/interface.c | 21
-rw-r--r--  drivers/rtc/rtc-ab8500.c | 2
-rw-r--r--  drivers/rtc/rtc-ds1307.c | 127
-rw-r--r--  drivers/rtc/rtc-ds1374.c | 285
-rw-r--r--  drivers/rtc/rtc-isl12057.c | 83
-rw-r--r--  drivers/rtc/rtc-omap.c | 547
-rw-r--r--  drivers/rtc/rtc-pcf8563.c | 55
-rw-r--r--  drivers/rtc/rtc-sirfsoc.c | 66
-rw-r--r--  drivers/rtc/rtc-snvs.c | 39
-rw-r--r--  drivers/usb/storage/debug.c | 2
-rw-r--r--  fs/binfmt_elf.c | 40
-rw-r--r--  fs/binfmt_misc.c | 393
-rw-r--r--  fs/char_dev.c | 1
-rw-r--r--  fs/cifs/cifsacl.c | 2
-rw-r--r--  fs/cifs/cifssmb.c | 20
-rw-r--r--  fs/cifs/file.c | 4
-rw-r--r--  fs/cifs/sess.c | 2
-rw-r--r--  fs/cifs/smb2file.c | 4
-rw-r--r--  fs/cifs/smb2misc.c | 38
-rw-r--r--  fs/cifs/smb2ops.c | 2
-rw-r--r--  fs/cifs/smb2pdu.c | 2
-rw-r--r--  fs/cifs/smb2pdu.h | 28
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/hfs/catalog.c | 14
-rw-r--r--  fs/ncpfs/ioctl.c | 1
-rw-r--r--  fs/nilfs2/file.c | 10
-rw-r--r--  fs/nilfs2/inode.c | 32
-rw-r--r--  fs/nilfs2/namei.c | 15
-rw-r--r--  fs/nilfs2/the_nilfs.c | 3
-rw-r--r--  fs/ocfs2/aops.c | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 4
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 2
-rw-r--r--  fs/ocfs2/dir.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 12
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 18
-rw-r--r--  fs/ocfs2/dlmglue.c | 37
-rw-r--r--  fs/ocfs2/file.c | 4
-rw-r--r--  fs/ocfs2/inode.c | 3
-rw-r--r--  fs/ocfs2/move_extents.c | 3
-rw-r--r--  fs/ocfs2/ocfs2.h | 6
-rw-r--r--  fs/ocfs2/slot_map.c | 2
-rw-r--r--  fs/ocfs2/super.c | 3
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  fs/proc/array.c | 47
-rw-r--r--  fs/proc/base.c | 3
-rw-r--r--  fs/proc/generic.c | 163
-rw-r--r--  fs/proc/internal.h | 11
-rw-r--r--  fs/proc/proc_net.c | 1
-rw-r--r--  fs/proc/root.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 104
-rw-r--r--  include/linux/cgroup.h | 26
-rw-r--r--  include/linux/compaction.h | 10
-rw-r--r--  include/linux/file.h | 1
-rw-r--r--  include/linux/gfp.h | 4
-rw-r--r--  include/linux/hugetlb.h | 3
-rw-r--r--  include/linux/hugetlb_cgroup.h | 1
-rw-r--r--  include/linux/kern_levels.h | 13
-rw-r--r--  include/linux/kernel.h | 1
-rw-r--r--  include/linux/memcontrol.h | 50
-rw-r--r--  include/linux/mm_types.h | 5
-rw-r--r--  include/linux/mmzone.h | 12
-rw-r--r--  include/linux/page_cgroup.h | 105
-rw-r--r--  include/linux/page_counter.h | 51
-rw-r--r--  include/linux/percpu-refcount.h | 47
-rw-r--r--  include/linux/printk.h | 1
-rw-r--r--  include/linux/ptrace.h | 2
-rw-r--r--  include/linux/res_counter.h | 223
-rw-r--r--  include/linux/slab.h | 4
-rw-r--r--  include/linux/swap_cgroup.h | 42
-rw-r--r--  include/net/sock.h | 26
-rw-r--r--  include/uapi/linux/sysctl.h | 1
-rw-r--r--  init/Kconfig | 56
-rw-r--r--  init/main.c | 14
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/exit.c | 245
-rw-r--r--  kernel/kmod.c | 43
-rw-r--r--  kernel/panic.c | 13
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/pid_namespace.c | 28
-rw-r--r--  kernel/printk/printk.c | 49
-rw-r--r--  kernel/ptrace.c | 23
-rw-r--r--  kernel/res_counter.c | 211
-rw-r--r--  kernel/sched/core.c | 4
-rw-r--r--  kernel/sysctl.c | 9
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  lib/dma-debug.c | 43
-rw-r--r--  lib/dynamic_debug.c | 4
-rw-r--r--  lib/lcm.c | 8
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/cma.c | 14
-rw-r--r--  mm/compaction.c | 139
-rw-r--r--  mm/debug.c | 5
-rw-r--r--  mm/frontswap.c | 2
-rw-r--r--  mm/huge_memory.c | 1
-rw-r--r--  mm/hugetlb.c | 4
-rw-r--r--  mm/hugetlb_cgroup.c | 103
-rw-r--r--  mm/internal.h | 7
-rw-r--r--  mm/memcontrol.c | 1706
-rw-r--r--  mm/memory-failure.c | 4
-rw-r--r--  mm/memory_hotplug.c | 4
-rw-r--r--  mm/oom_kill.c | 4
-rw-r--r--  mm/page-writeback.c | 4
-rw-r--r--  mm/page_alloc.c | 137
-rw-r--r--  mm/page_cgroup.c | 530
-rw-r--r--  mm/page_counter.c | 192
-rw-r--r--  mm/page_isolation.c | 2
-rw-r--r--  mm/rmap.c | 4
-rw-r--r--  mm/slab.c | 23
-rw-r--r--  mm/slab.h | 8
-rw-r--r--  mm/slab_common.c | 40
-rw-r--r--  mm/slub.c | 21
-rw-r--r--  mm/swap_cgroup.c | 208
-rw-r--r--  mm/swap_state.c | 1
-rw-r--r--  mm/swapfile.c | 2
-rw-r--r--  mm/vmalloc.c | 3
-rw-r--r--  mm/vmscan.c | 18
-rw-r--r--  net/ipv4/tcp_memcontrol.c | 87
-rwxr-xr-x  scripts/checkpatch.pl | 263
-rwxr-xr-x  scripts/kernel-doc | 2
142 files changed, 3655 insertions, 3940 deletions
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt
index a9faaca1f029..106245c3aecc 100644
--- a/Documentation/cgroups/hugetlb.txt
+++ b/Documentation/cgroups/hugetlb.txt
@@ -29,7 +29,7 @@ Brief summary of control files
29 29
30 hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage 30 hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage
31 hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded 31 hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
32 hugetlb.<hugepagesize>.usage_in_bytes # show current res_counter usage for "hugepagesize" hugetlb 32 hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
33 hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit 33 hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
34 34
35For a system supporting two hugepage size (16M and 16G) the control 35For a system supporting two hugepage size (16M and 16G) the control
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 02ab997a1ed2..46b2b5080317 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -1,5 +1,10 @@
1Memory Resource Controller 1Memory Resource Controller
2 2
3NOTE: This document is hopelessly outdated and it asks for a complete
4 rewrite. It still contains a useful information so we are keeping it
5 here but make sure to check the current code if you need a deeper
6 understanding.
7
3NOTE: The Memory Resource Controller has generically been referred to as the 8NOTE: The Memory Resource Controller has generically been referred to as the
4 memory controller in this document. Do not confuse memory controller 9 memory controller in this document. Do not confuse memory controller
5 used here with the memory controller that is used in hardware. 10 used here with the memory controller that is used in hardware.
@@ -52,9 +57,9 @@ Brief summary of control files.
52 tasks # attach a task(thread) and show list of threads 57 tasks # attach a task(thread) and show list of threads
53 cgroup.procs # show list of processes 58 cgroup.procs # show list of processes
54 cgroup.event_control # an interface for event_fd() 59 cgroup.event_control # an interface for event_fd()
55 memory.usage_in_bytes # show current res_counter usage for memory 60 memory.usage_in_bytes # show current usage for memory
56 (See 5.5 for details) 61 (See 5.5 for details)
57 memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap 62 memory.memsw.usage_in_bytes # show current usage for memory+Swap
58 (See 5.5 for details) 63 (See 5.5 for details)
59 memory.limit_in_bytes # set/show limit of memory usage 64 memory.limit_in_bytes # set/show limit of memory usage
60 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage 65 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage
@@ -116,16 +121,16 @@ The memory controller is the first controller developed.
116 121
1172.1. Design 1222.1. Design
118 123
119The core of the design is a counter called the res_counter. The res_counter 124The core of the design is a counter called the page_counter. The
120tracks the current memory usage and limit of the group of processes associated 125page_counter tracks the current memory usage and limit of the group of
121with the controller. Each cgroup has a memory controller specific data 126processes associated with the controller. Each cgroup has a memory controller
122structure (mem_cgroup) associated with it. 127specific data structure (mem_cgroup) associated with it.
123 128
1242.2. Accounting 1292.2. Accounting
125 130
126 +--------------------+ 131 +--------------------+
127 | mem_cgroup | 132 | mem_cgroup |
128 | (res_counter) | 133 | (page_counter) |
129 +--------------------+ 134 +--------------------+
130 / ^ \ 135 / ^ \
131 / | \ 136 / | \
@@ -352,9 +357,8 @@ set:
3520. Configuration 3570. Configuration
353 358
354a. Enable CONFIG_CGROUPS 359a. Enable CONFIG_CGROUPS
355b. Enable CONFIG_RESOURCE_COUNTERS 360b. Enable CONFIG_MEMCG
356c. Enable CONFIG_MEMCG 361c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
357d. Enable CONFIG_MEMCG_SWAP (to use swap extension)
358d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 362d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
359 363
3601. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) 3641. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
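Note: the memory.txt update above documents the switch from res_counter to the
new page_counter as the core accounting primitive. As a rough sketch of the
charging pattern the new counter is built for (the function names follow the
include/linux/page_counter.h added by this series, but the helper and the
memcg field name are illustrative, not taken from the patch):

    #include <linux/page_counter.h>

    /* Illustrative helper, not from this patch. */
    static int my_charge(struct mem_cgroup *memcg, unsigned long nr_pages)
    {
            struct page_counter *fail;
            int ret;

            /* Charge nr_pages against the group and all of its ancestors. */
            ret = page_counter_try_charge(&memcg->memory, nr_pages, &fail);
            if (ret)
                    /* Limit hit; 'fail' names the counter that refused. */
                    return ret;
            return 0;
    }

    static void my_uncharge(struct mem_cgroup *memcg, unsigned long nr_pages)
    {
            /* Give the pages back once the resource is released. */
            page_counter_uncharge(&memcg->memory, nr_pages);
    }

Unlike the res_counter described in the (now deleted) resource_counter.txt
below, page_counter tracks pages rather than bytes and does not take a
spinlock on the charge path.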
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
deleted file mode 100644
index 762ca54eb929..000000000000
--- a/Documentation/cgroups/resource_counter.txt
+++ /dev/null
@@ -1,197 +0,0 @@
1
2 The Resource Counter
3
4The resource counter, declared at include/linux/res_counter.h,
5is supposed to facilitate the resource management by controllers
6by providing common stuff for accounting.
7
8This "stuff" includes the res_counter structure and routines
9to work with it.
10
11
12
131. Crucial parts of the res_counter structure
14
15 a. unsigned long long usage
16
17 The usage value shows the amount of a resource that is consumed
18 by a group at a given time. The units of measurement should be
19 determined by the controller that uses this counter. E.g. it can
20 be bytes, items or any other unit the controller operates on.
21
22 b. unsigned long long max_usage
23
24 The maximal value of the usage over time.
25
26 This value is useful when gathering statistical information about
27 the particular group, as it shows the actual resource requirements
28 for a particular group, not just some usage snapshot.
29
30 c. unsigned long long limit
31
32 The maximal allowed amount of resource to consume by the group. In
33 case the group requests for more resources, so that the usage value
34 would exceed the limit, the resource allocation is rejected (see
35 the next section).
36
37 d. unsigned long long failcnt
38
39 The failcnt stands for "failures counter". This is the number of
40 resource allocation attempts that failed.
41
42 c. spinlock_t lock
43
44 Protects changes of the above values.
45
46
47
482. Basic accounting routines
49
50 a. void res_counter_init(struct res_counter *rc,
51 struct res_counter *rc_parent)
52
53 Initializes the resource counter. As usual, should be the first
54 routine called for a new counter.
55
56 The struct res_counter *parent can be used to define a hierarchical
57 child -> parent relationship directly in the res_counter structure,
58 NULL can be used to define no relationship.
59
60 c. int res_counter_charge(struct res_counter *rc, unsigned long val,
61 struct res_counter **limit_fail_at)
62
63 When a resource is about to be allocated it has to be accounted
64 with the appropriate resource counter (controller should determine
65 which one to use on its own). This operation is called "charging".
66
67 This is not very important which operation - resource allocation
68 or charging - is performed first, but
69 * if the allocation is performed first, this may create a
70 temporary resource over-usage by the time resource counter is
71 charged;
72 * if the charging is performed first, then it should be uncharged
73 on error path (if the one is called).
74
75 If the charging fails and a hierarchical dependency exists, the
76 limit_fail_at parameter is set to the particular res_counter element
77 where the charging failed.
78
79 d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val)
80
81 When a resource is released (freed) it should be de-accounted
82 from the resource counter it was accounted to. This is called
83 "uncharging". The return value of this function indicate the amount
84 of charges still present in the counter.
85
86 The _locked routines imply that the res_counter->lock is taken.
87
88 e. u64 res_counter_uncharge_until
89 (struct res_counter *rc, struct res_counter *top,
90 unsigned long val)
91
92 Almost same as res_counter_uncharge() but propagation of uncharge
93 stops when rc == top. This is useful when kill a res_counter in
94 child cgroup.
95
96 2.1 Other accounting routines
97
98 There are more routines that may help you with common needs, like
99 checking whether the limit is reached or resetting the max_usage
100 value. They are all declared in include/linux/res_counter.h.
101
102
103
1043. Analyzing the resource counter registrations
105
106 a. If the failcnt value constantly grows, this means that the counter's
107 limit is too tight. Either the group is misbehaving and consumes too
108 many resources, or the configuration is not suitable for the group
109 and the limit should be increased.
110
111 b. The max_usage value can be used to quickly tune the group. One may
112 set the limits to maximal values and either load the container with
113 a common pattern or leave one for a while. After this the max_usage
114 value shows the amount of memory the container would require during
115 its common activity.
116
117 Setting the limit a bit above this value gives a pretty good
118 configuration that works in most of the cases.
119
120 c. If the max_usage is much less than the limit, but the failcnt value
121 is growing, then the group tries to allocate a big chunk of resource
122 at once.
123
124 d. If the max_usage is much less than the limit, but the failcnt value
125 is 0, then this group is given too high limit, that it does not
126 require. It is better to lower the limit a bit leaving more resource
127 for other groups.
128
129
130
1314. Communication with the control groups subsystem (cgroups)
132
133All the resource controllers that are using cgroups and resource counters
134should provide files (in the cgroup filesystem) to work with the resource
135counter fields. They are recommended to adhere to the following rules:
136
137 a. File names
138
139 Field name File name
140 ---------------------------------------------------
141 usage usage_in_<unit_of_measurement>
142 max_usage max_usage_in_<unit_of_measurement>
143 limit limit_in_<unit_of_measurement>
144 failcnt failcnt
145 lock no file :)
146
147 b. Reading from file should show the corresponding field value in the
148 appropriate format.
149
150 c. Writing to file
151
152 Field Expected behavior
153 ----------------------------------
154 usage prohibited
155 max_usage reset to usage
156 limit set the limit
157 failcnt reset to zero
158
159
160
1615. Usage example
162
163 a. Declare a task group (take a look at cgroups subsystem for this) and
164 fold a res_counter into it
165
166 struct my_group {
167 struct res_counter res;
168
169 <other fields>
170 }
171
172 b. Put hooks in resource allocation/release paths
173
174 int alloc_something(...)
175 {
176 if (res_counter_charge(res_counter_ptr, amount) < 0)
177 return -ENOMEM;
178
179 <allocate the resource and return to the caller>
180 }
181
182 void release_something(...)
183 {
184 res_counter_uncharge(res_counter_ptr, amount);
185
186 <release the resource>
187 }
188
189 In order to keep the usage value self-consistent, both the
190 "res_counter_ptr" and the "amount" in release_something() should be
191 the same as they were in the alloc_something() when the releasing
192 resource was allocated.
193
194 c. Provide the way to read res_counter values and set them (the cgroups
195 still can help with it).
196
197 c. Compile and run :)
diff --git a/Documentation/devicetree/bindings/rtc/rtc-omap.txt b/Documentation/devicetree/bindings/rtc/rtc-omap.txt
index 5a0f02d34d95..4ba4dbd34289 100644
--- a/Documentation/devicetree/bindings/rtc/rtc-omap.txt
+++ b/Documentation/devicetree/bindings/rtc/rtc-omap.txt
@@ -5,11 +5,17 @@ Required properties:
5 - "ti,da830-rtc" - for RTC IP used similar to that on DA8xx SoC family. 5 - "ti,da830-rtc" - for RTC IP used similar to that on DA8xx SoC family.
6 - "ti,am3352-rtc" - for RTC IP used similar to that on AM335x SoC family. 6 - "ti,am3352-rtc" - for RTC IP used similar to that on AM335x SoC family.
7 This RTC IP has special WAKE-EN Register to enable 7 This RTC IP has special WAKE-EN Register to enable
8 Wakeup generation for event Alarm. 8 Wakeup generation for event Alarm. It can also be
9 used to control an external PMIC via the
10 pmic_power_en pin.
9- reg: Address range of rtc register set 11- reg: Address range of rtc register set
10- interrupts: rtc timer, alarm interrupts in order 12- interrupts: rtc timer, alarm interrupts in order
11- interrupt-parent: phandle for the interrupt controller 13- interrupt-parent: phandle for the interrupt controller
12 14
15Optional properties:
16- system-power-controller: whether the rtc is controlling the system power
17 through pmic_power_en
18
13Example: 19Example:
14 20
15rtc@1c23000 { 21rtc@1c23000 {
@@ -18,4 +24,5 @@ rtc@1c23000 {
18 interrupts = <19 24 interrupts = <19
19 19>; 25 19>;
20 interrupt-parent = <&intc>; 26 interrupt-parent = <&intc>;
27 system-power-controller;
21}; 28};
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index 0d354625299c..2417cb0b493b 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -115,6 +115,7 @@ nxp NXP Semiconductors
115onnn ON Semiconductor Corp. 115onnn ON Semiconductor Corp.
116opencores OpenCores.org 116opencores OpenCores.org
117panasonic Panasonic Corporation 117panasonic Panasonic Corporation
118pericom Pericom Technology Inc.
118phytec PHYTEC Messtechnik GmbH 119phytec PHYTEC Messtechnik GmbH
119picochip Picochip Ltd 120picochip Picochip Ltd
120plathome Plat'Home Co., Ltd. 121plathome Plat'Home Co., Ltd.
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index 6c0b9f27e465..bc4bd5a44b88 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -471,6 +471,13 @@ format. Crash is available on Dave Anderson's site at the following URL:
471 471
472 http://people.redhat.com/~anderson/ 472 http://people.redhat.com/~anderson/
473 473
474Trigger Kdump on WARN()
475=======================
476
477The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This
478will cause a kdump to occur at the panic() call. In cases where a user wants
479to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1
480to achieve the same behaviour.
474 481
475Contact 482Contact
476======= 483=======
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 838f3776c924..d6eb3636fe5a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2509,6 +2509,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2509 timeout < 0: reboot immediately 2509 timeout < 0: reboot immediately
2510 Format: <timeout> 2510 Format: <timeout>
2511 2511
2512 panic_on_warn panic() instead of WARN(). Useful to cause kdump
2513 on a WARN().
2514
2512 crash_kexec_post_notifiers 2515 crash_kexec_post_notifiers
2513 Run kdump after running panic-notifiers and dumping 2516 Run kdump after running panic-notifiers and dumping
2514 kmsg. This only for the users who doubt kdump always 2517 kmsg. This only for the users who doubt kdump always
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57baff5bdb80..b5d0c8501a18 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -54,8 +54,9 @@ show up in /proc/sys/kernel:
54- overflowuid 54- overflowuid
55- panic 55- panic
56- panic_on_oops 56- panic_on_oops
57- panic_on_unrecovered_nmi
58- panic_on_stackoverflow 57- panic_on_stackoverflow
58- panic_on_unrecovered_nmi
59- panic_on_warn
59- pid_max 60- pid_max
60- powersave-nap [ PPC only ] 61- powersave-nap [ PPC only ]
61- printk 62- printk
@@ -527,19 +528,6 @@ the recommended setting is 60.
527 528
528============================================================== 529==============================================================
529 530
530panic_on_unrecovered_nmi:
531
532The default Linux behaviour on an NMI of either memory or unknown is
533to continue operation. For many environments such as scientific
534computing it is preferable that the box is taken out and the error
535dealt with than an uncorrected parity/ECC error get propagated.
536
537A small number of systems do generate NMI's for bizarre random reasons
538such as power management so the default is off. That sysctl works like
539the existing panic controls already in that directory.
540
541==============================================================
542
543panic_on_oops: 531panic_on_oops:
544 532
545Controls the kernel's behaviour when an oops or BUG is encountered. 533Controls the kernel's behaviour when an oops or BUG is encountered.
@@ -563,6 +551,30 @@ This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
563 551
564============================================================== 552==============================================================
565 553
554panic_on_unrecovered_nmi:
555
556The default Linux behaviour on an NMI of either memory or unknown is
557to continue operation. For many environments such as scientific
558computing it is preferable that the box is taken out and the error
559dealt with than an uncorrected parity/ECC error get propagated.
560
561A small number of systems do generate NMI's for bizarre random reasons
562such as power management so the default is off. That sysctl works like
563the existing panic controls already in that directory.
564
565==============================================================
566
567panic_on_warn:
568
569Calls panic() in the WARN() path when set to 1. This is useful to avoid
570a kernel rebuild when attempting to kdump at the location of a WARN().
571
5720: only WARN(), default behaviour.
573
5741: call panic() after printing out WARN() location.
575
576==============================================================
577
566perf_cpu_time_max_percent: 578perf_cpu_time_max_percent:
567 579
568Hints to the kernel how much CPU time it should be allowed to 580Hints to the kernel how much CPU time it should be allowed to
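Note: panic_on_warn, documented above, can be flipped at runtime through
procfs as well as on the kernel command line. A minimal userspace sketch,
equivalent to "echo 1 > /proc/sys/kernel/panic_on_warn" (requires root):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/panic_on_warn", "w");

            if (!f) {
                    perror("panic_on_warn");
                    return 1;
            }
            /* Any subsequent WARN() will now panic (and kdump, if configured). */
            fputs("1\n", f);
            return fclose(f) ? 1 : 0;
    }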
diff --git a/MAINTAINERS b/MAINTAINERS
index 1563a3b38960..079efaf1b5e7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2605,7 +2605,7 @@ L: cgroups@vger.kernel.org
2605L: linux-mm@kvack.org 2605L: linux-mm@kvack.org
2606S: Maintained 2606S: Maintained
2607F: mm/memcontrol.c 2607F: mm/memcontrol.c
2608F: mm/page_cgroup.c 2608F: mm/swap_cgroup.c
2609 2609
2610CORETEMP HARDWARE MONITORING DRIVER 2610CORETEMP HARDWARE MONITORING DRIVER
2611M: Fenghua Yu <fenghua.yu@intel.com> 2611M: Fenghua Yu <fenghua.yu@intel.com>
@@ -2722,7 +2722,7 @@ F: drivers/net/wireless/cw1200/
2722 2722
2723CX18 VIDEO4LINUX DRIVER 2723CX18 VIDEO4LINUX DRIVER
2724M: Andy Walls <awalls@md.metrocast.net> 2724M: Andy Walls <awalls@md.metrocast.net>
2725L: ivtv-devel@ivtvdriver.org (moderated for non-subscribers) 2725L: ivtv-devel@ivtvdriver.org (subscribers-only)
2726L: linux-media@vger.kernel.org 2726L: linux-media@vger.kernel.org
2727T: git git://linuxtv.org/media_tree.git 2727T: git git://linuxtv.org/media_tree.git
2728W: http://linuxtv.org 2728W: http://linuxtv.org
@@ -5208,7 +5208,7 @@ F: drivers/media/tuners/it913x*
5208 5208
5209IVTV VIDEO4LINUX DRIVER 5209IVTV VIDEO4LINUX DRIVER
5210M: Andy Walls <awalls@md.metrocast.net> 5210M: Andy Walls <awalls@md.metrocast.net>
5211L: ivtv-devel@ivtvdriver.org (moderated for non-subscribers) 5211L: ivtv-devel@ivtvdriver.org (subscribers-only)
5212L: linux-media@vger.kernel.org 5212L: linux-media@vger.kernel.org
5213T: git git://linuxtv.org/media_tree.git 5213T: git git://linuxtv.org/media_tree.git
5214W: http://www.ivtvdriver.org 5214W: http://www.ivtvdriver.org
diff --git a/arch/arm/boot/dts/am335x-boneblack.dts b/arch/arm/boot/dts/am335x-boneblack.dts
index 901739fcb85a..5c42d259fa68 100644
--- a/arch/arm/boot/dts/am335x-boneblack.dts
+++ b/arch/arm/boot/dts/am335x-boneblack.dts
@@ -80,3 +80,7 @@
80 status = "okay"; 80 status = "okay";
81 }; 81 };
82}; 82};
83
84&rtc {
85 system-power-controller;
86};
diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi
index befe713b3e1b..acd37057bca9 100644
--- a/arch/arm/boot/dts/am33xx.dtsi
+++ b/arch/arm/boot/dts/am33xx.dtsi
@@ -435,7 +435,7 @@
435 }; 435 };
436 436
437 rtc: rtc@44e3e000 { 437 rtc: rtc@44e3e000 {
438 compatible = "ti,da830-rtc"; 438 compatible = "ti,am3352-rtc", "ti,da830-rtc";
439 reg = <0x44e3e000 0x1000>; 439 reg = <0x44e3e000 0x1000>;
440 interrupts = <75 440 interrupts = <75
441 76>; 441 76>;
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 41a43bf26492..df22314f57cf 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -279,6 +279,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
279#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ 279#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
280#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 280#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
281 281
282#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
282#define pmd_young(pmd) pte_young(pmd_pte(pmd)) 283#define pmd_young(pmd) pte_young(pmd_pte(pmd))
283#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) 284#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
284#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd))) 285#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 5845ffea67c3..dc063fe6646a 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2662,7 +2662,7 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
2662 2662
2663 ret = -ENOMEM; 2663 ret = -ENOMEM;
2664 2664
2665 fd = get_unused_fd(); 2665 fd = get_unused_fd_flags(0);
2666 if (fd < 0) 2666 if (fd < 0)
2667 return fd; 2667 return fd;
2668 2668
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index ae153c40ab7c..9b4b1904efae 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -467,6 +467,7 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
467} 467}
468 468
469#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) 469#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
470#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
470#define pmd_young(pmd) pte_young(pmd_pte(pmd)) 471#define pmd_young(pmd) pte_young(pmd_pte(pmd))
471#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) 472#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
472#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) 473#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 65d633f20d37..1a3429e1ccb5 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -301,7 +301,7 @@ static int spufs_context_open(struct path *path)
301 int ret; 301 int ret;
302 struct file *filp; 302 struct file *filp;
303 303
304 ret = get_unused_fd(); 304 ret = get_unused_fd_flags(0);
305 if (ret < 0) 305 if (ret < 0)
306 return ret; 306 return ret;
307 307
@@ -518,7 +518,7 @@ static int spufs_gang_open(struct path *path)
518 int ret; 518 int ret;
519 struct file *filp; 519 struct file *filp;
520 520
521 ret = get_unused_fd(); 521 ret = get_unused_fd_flags(0);
522 if (ret < 0) 522 if (ret < 0)
523 return ret; 523 return ret;
524 524
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index 3d85225b9e95..bce52ba66206 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -31,7 +31,7 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
31 unsigned long bootmem_paddr; 31 unsigned long bootmem_paddr;
32 32
33 /* Don't allow bogus node assignment */ 33 /* Don't allow bogus node assignment */
34 BUG_ON(nid > MAX_NUMNODES || nid <= 0); 34 BUG_ON(nid >= MAX_NUMNODES || nid <= 0);
35 35
36 start_pfn = start >> PAGE_SHIFT; 36 start_pfn = start >> PAGE_SHIFT;
37 end_pfn = end >> PAGE_SHIFT; 37 end_pfn = end >> PAGE_SHIFT;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index bfeb626085ac..1ff9e7864168 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -667,6 +667,13 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
667} 667}
668 668
669#ifdef CONFIG_TRANSPARENT_HUGEPAGE 669#ifdef CONFIG_TRANSPARENT_HUGEPAGE
670static inline unsigned long pmd_dirty(pmd_t pmd)
671{
672 pte_t pte = __pte(pmd_val(pmd));
673
674 return pte_dirty(pte);
675}
676
670static inline unsigned long pmd_young(pmd_t pmd) 677static inline unsigned long pmd_young(pmd_t pmd)
671{ 678{
672 pte_t pte = __pte(pmd_val(pmd)); 679 pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index b608e00e7f6d..aefb2c086726 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -43,13 +43,20 @@ static struct console early_hv_console = {
43 43
44void early_panic(const char *fmt, ...) 44void early_panic(const char *fmt, ...)
45{ 45{
46 va_list ap; 46 struct va_format vaf;
47 va_list args;
48
47 arch_local_irq_disable_all(); 49 arch_local_irq_disable_all();
48 va_start(ap, fmt); 50
49 early_printk("Kernel panic - not syncing: "); 51 va_start(args, fmt);
50 early_vprintk(fmt, ap); 52
51 early_printk("\n"); 53 vaf.fmt = fmt;
52 va_end(ap); 54 vaf.va = &args;
55
56 early_printk("Kernel panic - not syncing: %pV", &vaf);
57
58 va_end(args);
59
53 dump_stack(); 60 dump_stack();
54 hv_halt(); 61 hv_halt();
55} 62}
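Note: the early_printk.c change above folds three early_printk() calls into a
single one by using the kernel's %pV format extension together with struct
va_format, so the panic banner and the formatted message are emitted in one
call. The same pattern in a hypothetical driver (a sketch, not code from this
patch):

    #include <stdarg.h>
    #include <linux/printk.h>   /* struct va_format, %pV */

    static void mydrv_warn(const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            va_start(args, fmt);
            /* %pV expands the caller's format and arguments inside one printk. */
            vaf.fmt = fmt;
            vaf.va = &args;
            printk(KERN_WARNING "mydrv: %pV\n", &vaf);
            va_end(args);
    }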
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index b9736ded06f2..7f079bbfdf4c 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -534,11 +534,10 @@ static void __init setup_memory(void)
534 } 534 }
535 } 535 }
536 physpages -= dropped_pages; 536 physpages -= dropped_pages;
537 pr_warning("Only using %ldMB memory;" 537 pr_warn("Only using %ldMB memory - ignoring %ldMB\n",
538 " ignoring %ldMB.\n", 538 physpages >> (20 - PAGE_SHIFT),
539 physpages >> (20 - PAGE_SHIFT), 539 dropped_pages >> (20 - PAGE_SHIFT));
540 dropped_pages >> (20 - PAGE_SHIFT)); 540 pr_warn("Consider using a larger page size\n");
541 pr_warning("Consider using a larger page size.\n");
542 } 541 }
543#endif 542#endif
544 543
@@ -566,9 +565,8 @@ static void __init setup_memory(void)
566 565
567#ifndef __tilegx__ 566#ifndef __tilegx__
568 if (node_end_pfn[0] > MAXMEM_PFN) { 567 if (node_end_pfn[0] > MAXMEM_PFN) {
569 pr_warning("Only using %ldMB LOWMEM.\n", 568 pr_warn("Only using %ldMB LOWMEM\n", MAXMEM >> 20);
570 MAXMEM>>20); 569 pr_warn("Use a HIGHMEM enabled kernel\n");
571 pr_warning("Use a HIGHMEM enabled kernel.\n");
572 max_low_pfn = MAXMEM_PFN; 570 max_low_pfn = MAXMEM_PFN;
573 max_pfn = MAXMEM_PFN; 571 max_pfn = MAXMEM_PFN;
574 node_end_pfn[0] = MAXMEM_PFN; 572 node_end_pfn[0] = MAXMEM_PFN;
@@ -1112,8 +1110,8 @@ static void __init load_hv_initrd(void)
1112 fd = hv_fs_findfile((HV_VirtAddr) initramfs_file); 1110 fd = hv_fs_findfile((HV_VirtAddr) initramfs_file);
1113 if (fd == HV_ENOENT) { 1111 if (fd == HV_ENOENT) {
1114 if (set_initramfs_file) { 1112 if (set_initramfs_file) {
1115 pr_warning("No such hvfs initramfs file '%s'\n", 1113 pr_warn("No such hvfs initramfs file '%s'\n",
1116 initramfs_file); 1114 initramfs_file);
1117 return; 1115 return;
1118 } else { 1116 } else {
1119 /* Try old backwards-compatible name. */ 1117 /* Try old backwards-compatible name. */
@@ -1126,8 +1124,8 @@ static void __init load_hv_initrd(void)
1126 stat = hv_fs_fstat(fd); 1124 stat = hv_fs_fstat(fd);
1127 BUG_ON(stat.size < 0); 1125 BUG_ON(stat.size < 0);
1128 if (stat.flags & HV_FS_ISDIR) { 1126 if (stat.flags & HV_FS_ISDIR) {
1129 pr_warning("Ignoring hvfs file '%s': it's a directory.\n", 1127 pr_warn("Ignoring hvfs file '%s': it's a directory\n",
1130 initramfs_file); 1128 initramfs_file);
1131 return; 1129 return;
1132 } 1130 }
1133 initrd = alloc_bootmem_pages(stat.size); 1131 initrd = alloc_bootmem_pages(stat.size);
@@ -1185,9 +1183,8 @@ static void __init validate_hv(void)
1185 HV_Topology topology = hv_inquire_topology(); 1183 HV_Topology topology = hv_inquire_topology();
1186 BUG_ON(topology.coord.x != 0 || topology.coord.y != 0); 1184 BUG_ON(topology.coord.x != 0 || topology.coord.y != 0);
1187 if (topology.width != 1 || topology.height != 1) { 1185 if (topology.width != 1 || topology.height != 1) {
1188 pr_warning("Warning: booting UP kernel on %dx%d grid;" 1186 pr_warn("Warning: booting UP kernel on %dx%d grid; will ignore all but first tile\n",
1189 " will ignore all but first tile.\n", 1187 topology.width, topology.height);
1190 topology.width, topology.height);
1191 } 1188 }
1192#endif 1189#endif
1193 1190
@@ -1208,9 +1205,8 @@ static void __init validate_hv(void)
1208 * We use a struct cpumask for this, so it must be big enough. 1205 * We use a struct cpumask for this, so it must be big enough.
1209 */ 1206 */
1210 if ((smp_height * smp_width) > nr_cpu_ids) 1207 if ((smp_height * smp_width) > nr_cpu_ids)
1211 early_panic("Hypervisor %d x %d grid too big for Linux" 1208 early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %d\n",
1212 " NR_CPUS %d\n", smp_height, smp_width, 1209 smp_height, smp_width, nr_cpu_ids);
1213 nr_cpu_ids);
1214#endif 1210#endif
1215 1211
1216 /* 1212 /*
@@ -1265,10 +1261,9 @@ static void __init validate_va(void)
1265 1261
1266 /* Kernel PCs must have their high bit set; see intvec.S. */ 1262 /* Kernel PCs must have their high bit set; see intvec.S. */
1267 if ((long)VMALLOC_START >= 0) 1263 if ((long)VMALLOC_START >= 0)
1268 early_panic( 1264 early_panic("Linux VMALLOC region below the 2GB line (%#lx)!\n"
1269 "Linux VMALLOC region below the 2GB line (%#lx)!\n" 1265 "Reconfigure the kernel with smaller VMALLOC_RESERVE\n",
1270 "Reconfigure the kernel with smaller VMALLOC_RESERVE.\n", 1266 VMALLOC_START);
1271 VMALLOC_START);
1272#endif 1267#endif
1273} 1268}
1274 1269
@@ -1395,7 +1390,7 @@ static void __init setup_cpu_maps(void)
1395 1390
1396static int __init dataplane(char *str) 1391static int __init dataplane(char *str)
1397{ 1392{
1398 pr_warning("WARNING: dataplane support disabled in this kernel\n"); 1393 pr_warn("WARNING: dataplane support disabled in this kernel\n");
1399 return 0; 1394 return 0;
1400} 1395}
1401 1396
@@ -1413,8 +1408,8 @@ void __init setup_arch(char **cmdline_p)
1413 len = hv_get_command_line((HV_VirtAddr) boot_command_line, 1408 len = hv_get_command_line((HV_VirtAddr) boot_command_line,
1414 COMMAND_LINE_SIZE); 1409 COMMAND_LINE_SIZE);
1415 if (boot_command_line[0]) 1410 if (boot_command_line[0])
1416 pr_warning("WARNING: ignoring dynamic command line \"%s\"\n", 1411 pr_warn("WARNING: ignoring dynamic command line \"%s\"\n",
1417 boot_command_line); 1412 boot_command_line);
1418 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); 1413 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
1419#else 1414#else
1420 char *hv_cmdline; 1415 char *hv_cmdline;
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c112ea63f40d..e8a5454acc99 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -100,6 +100,11 @@ static inline int pte_young(pte_t pte)
100 return pte_flags(pte) & _PAGE_ACCESSED; 100 return pte_flags(pte) & _PAGE_ACCESSED;
101} 101}
102 102
103static inline int pmd_dirty(pmd_t pmd)
104{
105 return pmd_flags(pmd) & _PAGE_DIRTY;
106}
107
103static inline int pmd_young(pmd_t pmd) 108static inline int pmd_young(pmd_t pmd)
104{ 109{
105 return pmd_flags(pmd) & _PAGE_ACCESSED; 110 return pmd_flags(pmd) & _PAGE_ACCESSED;
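Note: this hunk, together with the matching arm64, ppc64 and sparc64 hunks
above, adds a pmd_dirty() accessor that mirrors pte_dirty() for transparent
huge page mappings. A hypothetical caller (purely illustrative, not part of
this patch) could then test whether a huge mapping has been written to before
deciding to discard it:

    /* Illustrative only: true if the THP mapping may be dropped unwritten. */
    static bool thp_is_clean(pmd_t pmd)
    {
            return pmd_trans_huge(pmd) && !pmd_dirty(pmd);
    }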
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index df04227d00cf..98504ec99c7d 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -267,18 +267,24 @@ comment "Default contiguous memory area size:"
267config CMA_SIZE_MBYTES 267config CMA_SIZE_MBYTES
268 int "Size in Mega Bytes" 268 int "Size in Mega Bytes"
269 depends on !CMA_SIZE_SEL_PERCENTAGE 269 depends on !CMA_SIZE_SEL_PERCENTAGE
270 default 0 if X86
270 default 16 271 default 16
271 help 272 help
272 Defines the size (in MiB) of the default memory area for Contiguous 273 Defines the size (in MiB) of the default memory area for Contiguous
273 Memory Allocator. 274 Memory Allocator. If the size of 0 is selected, CMA is disabled by
275 default, but it can be enabled by passing cma=size[MG] to the kernel.
276
274 277
275config CMA_SIZE_PERCENTAGE 278config CMA_SIZE_PERCENTAGE
276 int "Percentage of total memory" 279 int "Percentage of total memory"
277 depends on !CMA_SIZE_SEL_MBYTES 280 depends on !CMA_SIZE_SEL_MBYTES
281 default 0 if X86
278 default 10 282 default 10
279 help 283 help
280 Defines the size of the default memory area for Contiguous Memory 284 Defines the size of the default memory area for Contiguous Memory
281 Allocator as a percentage of the total memory in the system. 285 Allocator as a percentage of the total memory in the system.
286 If 0 percent is selected, CMA is disabled by default, but it can be
287 enabled by passing cma=size[MG] to the kernel.
282 288
283choice 289choice
284 prompt "Selected region size" 290 prompt "Selected region size"
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index b682651b5307..4511ddc1ac31 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -192,6 +192,14 @@ config RTC_DRV_DS1374
192 This driver can also be built as a module. If so, the module 192 This driver can also be built as a module. If so, the module
193 will be called rtc-ds1374. 193 will be called rtc-ds1374.
194 194
195config RTC_DRV_DS1374_WDT
196 bool "Dallas/Maxim DS1374 watchdog timer"
197 depends on RTC_DRV_DS1374
198 help
199 If you say Y here you will get support for the
200 watchdog timer in the Dallas Semiconductor DS1374
201 real-time clock chips.
202
195config RTC_DRV_DS1672 203config RTC_DRV_DS1672
196 tristate "Dallas/Maxim DS1672" 204 tristate "Dallas/Maxim DS1672"
197 help 205 help
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index 5b2717f5dafa..45bfc28ee3aa 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -30,6 +30,14 @@ static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm)
30 else { 30 else {
31 memset(tm, 0, sizeof(struct rtc_time)); 31 memset(tm, 0, sizeof(struct rtc_time));
32 err = rtc->ops->read_time(rtc->dev.parent, tm); 32 err = rtc->ops->read_time(rtc->dev.parent, tm);
33 if (err < 0) {
34 dev_err(&rtc->dev, "read_time: fail to read\n");
35 return err;
36 }
37
38 err = rtc_valid_tm(tm);
39 if (err < 0)
40 dev_err(&rtc->dev, "read_time: rtc_time isn't valid\n");
33 } 41 }
34 return err; 42 return err;
35} 43}
@@ -891,11 +899,24 @@ again:
891 if (next) { 899 if (next) {
892 struct rtc_wkalrm alarm; 900 struct rtc_wkalrm alarm;
893 int err; 901 int err;
902 int retry = 3;
903
894 alarm.time = rtc_ktime_to_tm(next->expires); 904 alarm.time = rtc_ktime_to_tm(next->expires);
895 alarm.enabled = 1; 905 alarm.enabled = 1;
906reprogram:
896 err = __rtc_set_alarm(rtc, &alarm); 907 err = __rtc_set_alarm(rtc, &alarm);
897 if (err == -ETIME) 908 if (err == -ETIME)
898 goto again; 909 goto again;
910 else if (err) {
911 if (retry-- > 0)
912 goto reprogram;
913
914 timer = container_of(next, struct rtc_timer, node);
915 timerqueue_del(&rtc->timerqueue, &timer->node);
916 timer->enabled = 0;
917 dev_err(&rtc->dev, "__rtc_set_alarm: err=%d\n", err);
918 goto again;
919 }
899 } else 920 } else
900 rtc_alarm_disable(rtc); 921 rtc_alarm_disable(rtc);
901 922
diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c
index 727e2f5d14d9..866e0ef5122d 100644
--- a/drivers/rtc/rtc-ab8500.c
+++ b/drivers/rtc/rtc-ab8500.c
@@ -504,6 +504,8 @@ static int ab8500_rtc_probe(struct platform_device *pdev)
504 return err; 504 return err;
505 } 505 }
506 506
507 rtc->uie_unsupported = 1;
508
507 return 0; 509 return 0;
508} 510}
509 511
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index bb43cf703efc..4ffabb322a9a 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -35,7 +35,7 @@ enum ds_type {
35 ds_1388, 35 ds_1388,
36 ds_3231, 36 ds_3231,
37 m41t00, 37 m41t00,
38 mcp7941x, 38 mcp794xx,
39 rx_8025, 39 rx_8025,
40 last_ds_type /* always last */ 40 last_ds_type /* always last */
41 /* rs5c372 too? different address... */ 41 /* rs5c372 too? different address... */
@@ -46,7 +46,7 @@ enum ds_type {
46#define DS1307_REG_SECS 0x00 /* 00-59 */ 46#define DS1307_REG_SECS 0x00 /* 00-59 */
47# define DS1307_BIT_CH 0x80 47# define DS1307_BIT_CH 0x80
48# define DS1340_BIT_nEOSC 0x80 48# define DS1340_BIT_nEOSC 0x80
49# define MCP7941X_BIT_ST 0x80 49# define MCP794XX_BIT_ST 0x80
50#define DS1307_REG_MIN 0x01 /* 00-59 */ 50#define DS1307_REG_MIN 0x01 /* 00-59 */
51#define DS1307_REG_HOUR 0x02 /* 00-23, or 1-12{am,pm} */ 51#define DS1307_REG_HOUR 0x02 /* 00-23, or 1-12{am,pm} */
52# define DS1307_BIT_12HR 0x40 /* in REG_HOUR */ 52# define DS1307_BIT_12HR 0x40 /* in REG_HOUR */
@@ -54,7 +54,7 @@ enum ds_type {
54# define DS1340_BIT_CENTURY_EN 0x80 /* in REG_HOUR */ 54# define DS1340_BIT_CENTURY_EN 0x80 /* in REG_HOUR */
55# define DS1340_BIT_CENTURY 0x40 /* in REG_HOUR */ 55# define DS1340_BIT_CENTURY 0x40 /* in REG_HOUR */
56#define DS1307_REG_WDAY 0x03 /* 01-07 */ 56#define DS1307_REG_WDAY 0x03 /* 01-07 */
57# define MCP7941X_BIT_VBATEN 0x08 57# define MCP794XX_BIT_VBATEN 0x08
58#define DS1307_REG_MDAY 0x04 /* 01-31 */ 58#define DS1307_REG_MDAY 0x04 /* 01-31 */
59#define DS1307_REG_MONTH 0x05 /* 01-12 */ 59#define DS1307_REG_MONTH 0x05 /* 01-12 */
60# define DS1337_BIT_CENTURY 0x80 /* in REG_MONTH */ 60# define DS1337_BIT_CENTURY 0x80 /* in REG_MONTH */
@@ -159,7 +159,7 @@ static struct chip_desc chips[last_ds_type] = {
159 [ds_3231] = { 159 [ds_3231] = {
160 .alarm = 1, 160 .alarm = 1,
161 }, 161 },
162 [mcp7941x] = { 162 [mcp794xx] = {
163 .alarm = 1, 163 .alarm = 1,
164 /* this is battery backed SRAM */ 164 /* this is battery backed SRAM */
165 .nvram_offset = 0x20, 165 .nvram_offset = 0x20,
@@ -176,7 +176,8 @@ static const struct i2c_device_id ds1307_id[] = {
176 { "ds1340", ds_1340 }, 176 { "ds1340", ds_1340 },
177 { "ds3231", ds_3231 }, 177 { "ds3231", ds_3231 },
178 { "m41t00", m41t00 }, 178 { "m41t00", m41t00 },
179 { "mcp7941x", mcp7941x }, 179 { "mcp7940x", mcp794xx },
180 { "mcp7941x", mcp794xx },
180 { "pt7c4338", ds_1307 }, 181 { "pt7c4338", ds_1307 },
181 { "rx8025", rx_8025 }, 182 { "rx8025", rx_8025 },
182 { } 183 { }
@@ -439,14 +440,14 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
439 buf[DS1307_REG_HOUR] |= DS1340_BIT_CENTURY_EN 440 buf[DS1307_REG_HOUR] |= DS1340_BIT_CENTURY_EN
440 | DS1340_BIT_CENTURY; 441 | DS1340_BIT_CENTURY;
441 break; 442 break;
442 case mcp7941x: 443 case mcp794xx:
443 /* 444 /*
444 * these bits were cleared when preparing the date/time 445 * these bits were cleared when preparing the date/time
445 * values and need to be set again before writing the 446 * values and need to be set again before writing the
446 * buffer out to the device. 447 * buffer out to the device.
447 */ 448 */
448 buf[DS1307_REG_SECS] |= MCP7941X_BIT_ST; 449 buf[DS1307_REG_SECS] |= MCP794XX_BIT_ST;
449 buf[DS1307_REG_WDAY] |= MCP7941X_BIT_VBATEN; 450 buf[DS1307_REG_WDAY] |= MCP794XX_BIT_VBATEN;
450 break; 451 break;
451 default: 452 default:
452 break; 453 break;
@@ -614,26 +615,26 @@ static const struct rtc_class_ops ds13xx_rtc_ops = {
614/*----------------------------------------------------------------------*/ 615/*----------------------------------------------------------------------*/
615 616
616/* 617/*
617 * Alarm support for mcp7941x devices. 618 * Alarm support for mcp794xx devices.
618 */ 619 */
619 620
620#define MCP7941X_REG_CONTROL 0x07 621#define MCP794XX_REG_CONTROL 0x07
621# define MCP7941X_BIT_ALM0_EN 0x10 622# define MCP794XX_BIT_ALM0_EN 0x10
622# define MCP7941X_BIT_ALM1_EN 0x20 623# define MCP794XX_BIT_ALM1_EN 0x20
623#define MCP7941X_REG_ALARM0_BASE 0x0a 624#define MCP794XX_REG_ALARM0_BASE 0x0a
624#define MCP7941X_REG_ALARM0_CTRL 0x0d 625#define MCP794XX_REG_ALARM0_CTRL 0x0d
625#define MCP7941X_REG_ALARM1_BASE 0x11 626#define MCP794XX_REG_ALARM1_BASE 0x11
626#define MCP7941X_REG_ALARM1_CTRL 0x14 627#define MCP794XX_REG_ALARM1_CTRL 0x14
627# define MCP7941X_BIT_ALMX_IF (1 << 3) 628# define MCP794XX_BIT_ALMX_IF (1 << 3)
628# define MCP7941X_BIT_ALMX_C0 (1 << 4) 629# define MCP794XX_BIT_ALMX_C0 (1 << 4)
629# define MCP7941X_BIT_ALMX_C1 (1 << 5) 630# define MCP794XX_BIT_ALMX_C1 (1 << 5)
630# define MCP7941X_BIT_ALMX_C2 (1 << 6) 631# define MCP794XX_BIT_ALMX_C2 (1 << 6)
631# define MCP7941X_BIT_ALMX_POL (1 << 7) 632# define MCP794XX_BIT_ALMX_POL (1 << 7)
632# define MCP7941X_MSK_ALMX_MATCH (MCP7941X_BIT_ALMX_C0 | \ 633# define MCP794XX_MSK_ALMX_MATCH (MCP794XX_BIT_ALMX_C0 | \
633 MCP7941X_BIT_ALMX_C1 | \ 634 MCP794XX_BIT_ALMX_C1 | \
634 MCP7941X_BIT_ALMX_C2) 635 MCP794XX_BIT_ALMX_C2)
635 636
636static void mcp7941x_work(struct work_struct *work) 637static void mcp794xx_work(struct work_struct *work)
637{ 638{
638 struct ds1307 *ds1307 = container_of(work, struct ds1307, work); 639 struct ds1307 *ds1307 = container_of(work, struct ds1307, work);
639 struct i2c_client *client = ds1307->client; 640 struct i2c_client *client = ds1307->client;
@@ -642,22 +643,22 @@ static void mcp7941x_work(struct work_struct *work)
642 mutex_lock(&ds1307->rtc->ops_lock); 643 mutex_lock(&ds1307->rtc->ops_lock);
643 644
644 /* Check and clear alarm 0 interrupt flag. */ 645 /* Check and clear alarm 0 interrupt flag. */
645 reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_ALARM0_CTRL); 646 reg = i2c_smbus_read_byte_data(client, MCP794XX_REG_ALARM0_CTRL);
646 if (reg < 0) 647 if (reg < 0)
647 goto out; 648 goto out;
648 if (!(reg & MCP7941X_BIT_ALMX_IF)) 649 if (!(reg & MCP794XX_BIT_ALMX_IF))
649 goto out; 650 goto out;
650 reg &= ~MCP7941X_BIT_ALMX_IF; 651 reg &= ~MCP794XX_BIT_ALMX_IF;
651 ret = i2c_smbus_write_byte_data(client, MCP7941X_REG_ALARM0_CTRL, reg); 652 ret = i2c_smbus_write_byte_data(client, MCP794XX_REG_ALARM0_CTRL, reg);
652 if (ret < 0) 653 if (ret < 0)
653 goto out; 654 goto out;
654 655
655 /* Disable alarm 0. */ 656 /* Disable alarm 0. */
656 reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_CONTROL); 657 reg = i2c_smbus_read_byte_data(client, MCP794XX_REG_CONTROL);
657 if (reg < 0) 658 if (reg < 0)
658 goto out; 659 goto out;
659 reg &= ~MCP7941X_BIT_ALM0_EN; 660 reg &= ~MCP794XX_BIT_ALM0_EN;
660 ret = i2c_smbus_write_byte_data(client, MCP7941X_REG_CONTROL, reg); 661 ret = i2c_smbus_write_byte_data(client, MCP794XX_REG_CONTROL, reg);
661 if (ret < 0) 662 if (ret < 0)
662 goto out; 663 goto out;
663 664
@@ -669,7 +670,7 @@ out:
669 mutex_unlock(&ds1307->rtc->ops_lock); 670 mutex_unlock(&ds1307->rtc->ops_lock);
670} 671}
671 672
672static int mcp7941x_read_alarm(struct device *dev, struct rtc_wkalrm *t) 673static int mcp794xx_read_alarm(struct device *dev, struct rtc_wkalrm *t)
673{ 674{
674 struct i2c_client *client = to_i2c_client(dev); 675 struct i2c_client *client = to_i2c_client(dev);
675 struct ds1307 *ds1307 = i2c_get_clientdata(client); 676 struct ds1307 *ds1307 = i2c_get_clientdata(client);
@@ -680,11 +681,11 @@ static int mcp7941x_read_alarm(struct device *dev, struct rtc_wkalrm *t)
680 return -EINVAL; 681 return -EINVAL;
681 682
682 /* Read control and alarm 0 registers. */ 683 /* Read control and alarm 0 registers. */
683 ret = ds1307->read_block_data(client, MCP7941X_REG_CONTROL, 10, regs); 684 ret = ds1307->read_block_data(client, MCP794XX_REG_CONTROL, 10, regs);
684 if (ret < 0) 685 if (ret < 0)
685 return ret; 686 return ret;
686 687
687 t->enabled = !!(regs[0] & MCP7941X_BIT_ALM0_EN); 688 t->enabled = !!(regs[0] & MCP794XX_BIT_ALM0_EN);
688 689
689 /* Report alarm 0 time assuming 24-hour and day-of-month modes. */ 690 /* Report alarm 0 time assuming 24-hour and day-of-month modes. */
690 t->time.tm_sec = bcd2bin(ds1307->regs[3] & 0x7f); 691 t->time.tm_sec = bcd2bin(ds1307->regs[3] & 0x7f);
@@ -701,14 +702,14 @@ static int mcp7941x_read_alarm(struct device *dev, struct rtc_wkalrm *t)
701 "enabled=%d polarity=%d irq=%d match=%d\n", __func__, 702 "enabled=%d polarity=%d irq=%d match=%d\n", __func__,
702 t->time.tm_sec, t->time.tm_min, t->time.tm_hour, 703 t->time.tm_sec, t->time.tm_min, t->time.tm_hour,
703 t->time.tm_wday, t->time.tm_mday, t->time.tm_mon, t->enabled, 704 t->time.tm_wday, t->time.tm_mday, t->time.tm_mon, t->enabled,
704 !!(ds1307->regs[6] & MCP7941X_BIT_ALMX_POL), 705 !!(ds1307->regs[6] & MCP794XX_BIT_ALMX_POL),
705 !!(ds1307->regs[6] & MCP7941X_BIT_ALMX_IF), 706 !!(ds1307->regs[6] & MCP794XX_BIT_ALMX_IF),
706 (ds1307->regs[6] & MCP7941X_MSK_ALMX_MATCH) >> 4); 707 (ds1307->regs[6] & MCP794XX_MSK_ALMX_MATCH) >> 4);
707 708
708 return 0; 709 return 0;
709} 710}
710 711
711static int mcp7941x_set_alarm(struct device *dev, struct rtc_wkalrm *t) 712static int mcp794xx_set_alarm(struct device *dev, struct rtc_wkalrm *t)
712{ 713{
713 struct i2c_client *client = to_i2c_client(dev); 714 struct i2c_client *client = to_i2c_client(dev);
714 struct ds1307 *ds1307 = i2c_get_clientdata(client); 715 struct ds1307 *ds1307 = i2c_get_clientdata(client);
@@ -725,7 +726,7 @@ static int mcp7941x_set_alarm(struct device *dev, struct rtc_wkalrm *t)
725 t->enabled, t->pending); 726 t->enabled, t->pending);
726 727
727 /* Read control and alarm 0 registers. */ 728 /* Read control and alarm 0 registers. */
728 ret = ds1307->read_block_data(client, MCP7941X_REG_CONTROL, 10, regs); 729 ret = ds1307->read_block_data(client, MCP794XX_REG_CONTROL, 10, regs);
729 if (ret < 0) 730 if (ret < 0)
730 return ret; 731 return ret;
731 732
@@ -738,23 +739,23 @@ static int mcp7941x_set_alarm(struct device *dev, struct rtc_wkalrm *t)
738 regs[8] = bin2bcd(t->time.tm_mon) + 1; 739 regs[8] = bin2bcd(t->time.tm_mon) + 1;
739 740
740 /* Clear the alarm 0 interrupt flag. */ 741 /* Clear the alarm 0 interrupt flag. */
741 regs[6] &= ~MCP7941X_BIT_ALMX_IF; 742 regs[6] &= ~MCP794XX_BIT_ALMX_IF;
742 /* Set alarm match: second, minute, hour, day, date, month. */ 743 /* Set alarm match: second, minute, hour, day, date, month. */
743 regs[6] |= MCP7941X_MSK_ALMX_MATCH; 744 regs[6] |= MCP794XX_MSK_ALMX_MATCH;
744 745
745 if (t->enabled) 746 if (t->enabled)
746 regs[0] |= MCP7941X_BIT_ALM0_EN; 747 regs[0] |= MCP794XX_BIT_ALM0_EN;
747 else 748 else
748 regs[0] &= ~MCP7941X_BIT_ALM0_EN; 749 regs[0] &= ~MCP794XX_BIT_ALM0_EN;
749 750
750 ret = ds1307->write_block_data(client, MCP7941X_REG_CONTROL, 10, regs); 751 ret = ds1307->write_block_data(client, MCP794XX_REG_CONTROL, 10, regs);
751 if (ret < 0) 752 if (ret < 0)
752 return ret; 753 return ret;
753 754
754 return 0; 755 return 0;
755} 756}
756 757
757static int mcp7941x_alarm_irq_enable(struct device *dev, unsigned int enabled) 758static int mcp794xx_alarm_irq_enable(struct device *dev, unsigned int enabled)
758{ 759{
759 struct i2c_client *client = to_i2c_client(dev); 760 struct i2c_client *client = to_i2c_client(dev);
760 struct ds1307 *ds1307 = i2c_get_clientdata(client); 761 struct ds1307 *ds1307 = i2c_get_clientdata(client);
@@ -763,24 +764,24 @@ static int mcp7941x_alarm_irq_enable(struct device *dev, unsigned int enabled)
763 if (!test_bit(HAS_ALARM, &ds1307->flags)) 764 if (!test_bit(HAS_ALARM, &ds1307->flags))
764 return -EINVAL; 765 return -EINVAL;
765 766
766 reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_CONTROL); 767 reg = i2c_smbus_read_byte_data(client, MCP794XX_REG_CONTROL);
767 if (reg < 0) 768 if (reg < 0)
768 return reg; 769 return reg;
769 770
770 if (enabled) 771 if (enabled)
771 reg |= MCP7941X_BIT_ALM0_EN; 772 reg |= MCP794XX_BIT_ALM0_EN;
772 else 773 else
773 reg &= ~MCP7941X_BIT_ALM0_EN; 774 reg &= ~MCP794XX_BIT_ALM0_EN;
774 775
775 return i2c_smbus_write_byte_data(client, MCP7941X_REG_CONTROL, reg); 776 return i2c_smbus_write_byte_data(client, MCP794XX_REG_CONTROL, reg);
776} 777}
777 778
778static const struct rtc_class_ops mcp7941x_rtc_ops = { 779static const struct rtc_class_ops mcp794xx_rtc_ops = {
779 .read_time = ds1307_get_time, 780 .read_time = ds1307_get_time,
780 .set_time = ds1307_set_time, 781 .set_time = ds1307_set_time,
781 .read_alarm = mcp7941x_read_alarm, 782 .read_alarm = mcp794xx_read_alarm,
782 .set_alarm = mcp7941x_set_alarm, 783 .set_alarm = mcp794xx_set_alarm,
783 .alarm_irq_enable = mcp7941x_alarm_irq_enable, 784 .alarm_irq_enable = mcp794xx_alarm_irq_enable,
784}; 785};
785 786
786/*----------------------------------------------------------------------*/ 787/*----------------------------------------------------------------------*/
@@ -1049,10 +1050,10 @@ static int ds1307_probe(struct i2c_client *client,
1049 case ds_1388: 1050 case ds_1388:
1050 ds1307->offset = 1; /* Seconds starts at 1 */ 1051 ds1307->offset = 1; /* Seconds starts at 1 */
1051 break; 1052 break;
1052 case mcp7941x: 1053 case mcp794xx:
1053 rtc_ops = &mcp7941x_rtc_ops; 1054 rtc_ops = &mcp794xx_rtc_ops;
1054 if (ds1307->client->irq > 0 && chip->alarm) { 1055 if (ds1307->client->irq > 0 && chip->alarm) {
1055 INIT_WORK(&ds1307->work, mcp7941x_work); 1056 INIT_WORK(&ds1307->work, mcp794xx_work);
1056 want_irq = true; 1057 want_irq = true;
1057 } 1058 }
1058 break; 1059 break;
@@ -1117,18 +1118,18 @@ read_rtc:
1117 dev_warn(&client->dev, "SET TIME!\n"); 1118 dev_warn(&client->dev, "SET TIME!\n");
1118 } 1119 }
1119 break; 1120 break;
1120 case mcp7941x: 1121 case mcp794xx:
1121 /* make sure that the backup battery is enabled */ 1122 /* make sure that the backup battery is enabled */
1122 if (!(ds1307->regs[DS1307_REG_WDAY] & MCP7941X_BIT_VBATEN)) { 1123 if (!(ds1307->regs[DS1307_REG_WDAY] & MCP794XX_BIT_VBATEN)) {
1123 i2c_smbus_write_byte_data(client, DS1307_REG_WDAY, 1124 i2c_smbus_write_byte_data(client, DS1307_REG_WDAY,
1124 ds1307->regs[DS1307_REG_WDAY] 1125 ds1307->regs[DS1307_REG_WDAY]
1125 | MCP7941X_BIT_VBATEN); 1126 | MCP794XX_BIT_VBATEN);
1126 } 1127 }
1127 1128
1128 /* clock halted? turn it on, so clock can tick. */ 1129 /* clock halted? turn it on, so clock can tick. */
1129 if (!(tmp & MCP7941X_BIT_ST)) { 1130 if (!(tmp & MCP794XX_BIT_ST)) {
1130 i2c_smbus_write_byte_data(client, DS1307_REG_SECS, 1131 i2c_smbus_write_byte_data(client, DS1307_REG_SECS,
1131 MCP7941X_BIT_ST); 1132 MCP794XX_BIT_ST);
1132 dev_warn(&client->dev, "SET TIME!\n"); 1133 dev_warn(&client->dev, "SET TIME!\n");
1133 goto read_rtc; 1134 goto read_rtc;
1134 } 1135 }
diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c
index 9e6e14fb53d7..8605fde394b2 100644
--- a/drivers/rtc/rtc-ds1374.c
+++ b/drivers/rtc/rtc-ds1374.c
@@ -4,6 +4,7 @@
4 * Based on code by Randy Vinson <rvinson@mvista.com>, 4 * Based on code by Randy Vinson <rvinson@mvista.com>,
5 * which was based on the m41t00.c by Mark Greer <mgreer@mvista.com>. 5 * which was based on the m41t00.c by Mark Greer <mgreer@mvista.com>.
6 * 6 *
7 * Copyright (C) 2014 Rose Technology
7 * Copyright (C) 2006-2007 Freescale Semiconductor 8 * Copyright (C) 2006-2007 Freescale Semiconductor
8 * 9 *
9 * 2005 (c) MontaVista Software, Inc. This file is licensed under 10 * 2005 (c) MontaVista Software, Inc. This file is licensed under
@@ -26,6 +27,13 @@
26#include <linux/workqueue.h> 27#include <linux/workqueue.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
28#include <linux/pm.h> 29#include <linux/pm.h>
30#ifdef CONFIG_RTC_DRV_DS1374_WDT
31#include <linux/fs.h>
32#include <linux/ioctl.h>
33#include <linux/miscdevice.h>
34#include <linux/reboot.h>
35#include <linux/watchdog.h>
36#endif
29 37
30#define DS1374_REG_TOD0 0x00 /* Time of Day */ 38#define DS1374_REG_TOD0 0x00 /* Time of Day */
31#define DS1374_REG_TOD1 0x01 39#define DS1374_REG_TOD1 0x01
@@ -49,6 +57,14 @@ static const struct i2c_device_id ds1374_id[] = {
49}; 57};
50MODULE_DEVICE_TABLE(i2c, ds1374_id); 58MODULE_DEVICE_TABLE(i2c, ds1374_id);
51 59
60#ifdef CONFIG_OF
61static const struct of_device_id ds1374_of_match[] = {
62 { .compatible = "dallas,ds1374" },
63 { }
64};
65MODULE_DEVICE_TABLE(of, ds1374_of_match);
66#endif
67
52struct ds1374 { 68struct ds1374 {
53 struct i2c_client *client; 69 struct i2c_client *client;
54 struct rtc_device *rtc; 70 struct rtc_device *rtc;
@@ -162,6 +178,7 @@ static int ds1374_set_time(struct device *dev, struct rtc_time *time)
162 return ds1374_write_rtc(client, itime, DS1374_REG_TOD0, 4); 178 return ds1374_write_rtc(client, itime, DS1374_REG_TOD0, 4);
163} 179}
164 180
181#ifndef CONFIG_RTC_DRV_DS1374_WDT
165/* The ds1374 has a decrementer for an alarm, rather than a comparator. 182/* The ds1374 has a decrementer for an alarm, rather than a comparator.
166 * If the time of day is changed, then the alarm will need to be 183 * If the time of day is changed, then the alarm will need to be
167 * reset. 184 * reset.
@@ -263,6 +280,7 @@ out:
263 mutex_unlock(&ds1374->mutex); 280 mutex_unlock(&ds1374->mutex);
264 return ret; 281 return ret;
265} 282}
283#endif
266 284
267static irqreturn_t ds1374_irq(int irq, void *dev_id) 285static irqreturn_t ds1374_irq(int irq, void *dev_id)
268{ 286{
@@ -307,6 +325,7 @@ unlock:
307 mutex_unlock(&ds1374->mutex); 325 mutex_unlock(&ds1374->mutex);
308} 326}
309 327
328#ifndef CONFIG_RTC_DRV_DS1374_WDT
310static int ds1374_alarm_irq_enable(struct device *dev, unsigned int enabled) 329static int ds1374_alarm_irq_enable(struct device *dev, unsigned int enabled)
311{ 330{
312 struct i2c_client *client = to_i2c_client(dev); 331 struct i2c_client *client = to_i2c_client(dev);
@@ -331,15 +350,260 @@ out:
331 mutex_unlock(&ds1374->mutex); 350 mutex_unlock(&ds1374->mutex);
332 return ret; 351 return ret;
333} 352}
353#endif
334 354
335static const struct rtc_class_ops ds1374_rtc_ops = { 355static const struct rtc_class_ops ds1374_rtc_ops = {
336 .read_time = ds1374_read_time, 356 .read_time = ds1374_read_time,
337 .set_time = ds1374_set_time, 357 .set_time = ds1374_set_time,
358#ifndef CONFIG_RTC_DRV_DS1374_WDT
338 .read_alarm = ds1374_read_alarm, 359 .read_alarm = ds1374_read_alarm,
339 .set_alarm = ds1374_set_alarm, 360 .set_alarm = ds1374_set_alarm,
340 .alarm_irq_enable = ds1374_alarm_irq_enable, 361 .alarm_irq_enable = ds1374_alarm_irq_enable,
362#endif
363};
364
365#ifdef CONFIG_RTC_DRV_DS1374_WDT
366/*
367 *****************************************************************************
368 *
369 * Watchdog Driver
370 *
371 *****************************************************************************
372 */
373static struct i2c_client *save_client;
374/* Default margin */
375#define WD_TIMO 131762
376
377#define DRV_NAME "DS1374 Watchdog"
378
379static int wdt_margin = WD_TIMO;
380static unsigned long wdt_is_open;
381module_param(wdt_margin, int, 0);
382MODULE_PARM_DESC(wdt_margin, "Watchdog timeout in seconds (default 32s)");
383
384static const struct watchdog_info ds1374_wdt_info = {
385 .identity = "DS1374 WTD",
386 .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING |
387 WDIOF_MAGICCLOSE,
341}; 388};
342 389
390static int ds1374_wdt_settimeout(unsigned int timeout)
391{
392 int ret = -ENOIOCTLCMD;
393 int cr;
394
395 ret = cr = i2c_smbus_read_byte_data(save_client, DS1374_REG_CR);
396 if (ret < 0)
397 goto out;
398
399 /* Disable any existing watchdog/alarm before setting the new one */
400 cr &= ~DS1374_REG_CR_WACE;
401
402 ret = i2c_smbus_write_byte_data(save_client, DS1374_REG_CR, cr);
403 if (ret < 0)
404 goto out;
405
406 /* Set new watchdog time */
407 ret = ds1374_write_rtc(save_client, timeout, DS1374_REG_WDALM0, 3);
408 if (ret) {
409 pr_info("rtc-ds1374 - couldn't set new watchdog time\n");
410 goto out;
411 }
412
413 /* Enable watchdog timer */
414 cr |= DS1374_REG_CR_WACE | DS1374_REG_CR_WDALM;
415 cr &= ~DS1374_REG_CR_AIE;
416
417 ret = i2c_smbus_write_byte_data(save_client, DS1374_REG_CR, cr);
418 if (ret < 0)
419 goto out;
420
421 return 0;
422out:
423 return ret;
424}
425
426
427/*
428 * Reload the watchdog timer. (ie, pat the watchdog)
429 */
430static void ds1374_wdt_ping(void)
431{
432 u32 val;
433 int ret = 0;
434
435 ret = ds1374_read_rtc(save_client, &val, DS1374_REG_WDALM0, 3);
436 if (ret)
437 pr_info("WD TICK FAIL!!!!!!!!!! %i\n", ret);
438}
439
440static void ds1374_wdt_disable(void)
441{
442 int ret = -ENOIOCTLCMD;
443 int cr;
444
445 cr = i2c_smbus_read_byte_data(save_client, DS1374_REG_CR);
446 /* Disable watchdog timer */
447 cr &= ~DS1374_REG_CR_WACE;
448
449 ret = i2c_smbus_write_byte_data(save_client, DS1374_REG_CR, cr);
450}
451
452/*
453 * Watchdog device is opened, and watchdog starts running.
454 */
455static int ds1374_wdt_open(struct inode *inode, struct file *file)
456{
457 struct ds1374 *ds1374 = i2c_get_clientdata(save_client);
458
459 if (MINOR(inode->i_rdev) == WATCHDOG_MINOR) {
460 mutex_lock(&ds1374->mutex);
461 if (test_and_set_bit(0, &wdt_is_open)) {
462 mutex_unlock(&ds1374->mutex);
463 return -EBUSY;
464 }
465 /*
466 * Activate
467 */
468 wdt_is_open = 1;
469 mutex_unlock(&ds1374->mutex);
470 return nonseekable_open(inode, file);
471 }
472 return -ENODEV;
473}
474
475/*
476 * Close the watchdog device.
477 */
478static int ds1374_wdt_release(struct inode *inode, struct file *file)
479{
480 if (MINOR(inode->i_rdev) == WATCHDOG_MINOR)
481 clear_bit(0, &wdt_is_open);
482
483 return 0;
484}
485
486/*
487 * Pat the watchdog whenever device is written to.
488 */
489static ssize_t ds1374_wdt_write(struct file *file, const char __user *data,
490 size_t len, loff_t *ppos)
491{
492 if (len) {
493 ds1374_wdt_ping();
494 return 1;
495 }
496 return 0;
497}
498
499static ssize_t ds1374_wdt_read(struct file *file, char __user *data,
500 size_t len, loff_t *ppos)
501{
502 return 0;
503}
504
505/*
506 * Handle commands from user-space.
507 */
508static long ds1374_wdt_ioctl(struct file *file, unsigned int cmd,
509 unsigned long arg)
510{
511 int new_margin, options;
512
513 switch (cmd) {
514 case WDIOC_GETSUPPORT:
515 return copy_to_user((struct watchdog_info __user *)arg,
516 &ds1374_wdt_info, sizeof(ds1374_wdt_info)) ? -EFAULT : 0;
517
518 case WDIOC_GETSTATUS:
519 case WDIOC_GETBOOTSTATUS:
520 return put_user(0, (int __user *)arg);
521 case WDIOC_KEEPALIVE:
522 ds1374_wdt_ping();
523 return 0;
524 case WDIOC_SETTIMEOUT:
525 if (get_user(new_margin, (int __user *)arg))
526 return -EFAULT;
527
528 if (new_margin < 1 || new_margin > 16777216)
529 return -EINVAL;
530
531 wdt_margin = new_margin;
532 ds1374_wdt_settimeout(new_margin);
533 ds1374_wdt_ping();
534 /* fallthrough */
535 case WDIOC_GETTIMEOUT:
536 return put_user(wdt_margin, (int __user *)arg);
537 case WDIOC_SETOPTIONS:
538 if (copy_from_user(&options, (int __user *)arg, sizeof(int)))
539 return -EFAULT;
540
541 if (options & WDIOS_DISABLECARD) {
542 pr_info("rtc-ds1374: disable watchdog\n");
543 ds1374_wdt_disable();
544 }
545
546 if (options & WDIOS_ENABLECARD) {
547 pr_info("rtc-ds1374: enable watchdog\n");
548 ds1374_wdt_settimeout(wdt_margin);
549 ds1374_wdt_ping();
550 }
551
552 return -EINVAL;
553 }
554 return -ENOTTY;
555}
556
557static long ds1374_wdt_unlocked_ioctl(struct file *file, unsigned int cmd,
558 unsigned long arg)
559{
560 int ret;
561 struct ds1374 *ds1374 = i2c_get_clientdata(save_client);
562
563 mutex_lock(&ds1374->mutex);
564 ret = ds1374_wdt_ioctl(file, cmd, arg);
565 mutex_unlock(&ds1374->mutex);
566
567 return ret;
568}
569
570static int ds1374_wdt_notify_sys(struct notifier_block *this,
571 unsigned long code, void *unused)
572{
573 if (code == SYS_DOWN || code == SYS_HALT)
574 /* Disable Watchdog */
575 ds1374_wdt_disable();
576 return NOTIFY_DONE;
577}
578
579static const struct file_operations ds1374_wdt_fops = {
580 .owner = THIS_MODULE,
581 .read = ds1374_wdt_read,
582 .unlocked_ioctl = ds1374_wdt_unlocked_ioctl,
583 .write = ds1374_wdt_write,
584 .open = ds1374_wdt_open,
585 .release = ds1374_wdt_release,
586 .llseek = no_llseek,
587};
588
589static struct miscdevice ds1374_miscdev = {
590 .minor = WATCHDOG_MINOR,
591 .name = "watchdog",
592 .fops = &ds1374_wdt_fops,
593};
594
595static struct notifier_block ds1374_wdt_notifier = {
596 .notifier_call = ds1374_wdt_notify_sys,
597};
598
599#endif /*CONFIG_RTC_DRV_DS1374_WDT*/
600/*
601 *****************************************************************************
602 *
603 * Driver Interface
604 *
605 *****************************************************************************
606 */
343static int ds1374_probe(struct i2c_client *client, 607static int ds1374_probe(struct i2c_client *client,
344 const struct i2c_device_id *id) 608 const struct i2c_device_id *id)
345{ 609{
@@ -378,12 +642,33 @@ static int ds1374_probe(struct i2c_client *client,
378 return PTR_ERR(ds1374->rtc); 642 return PTR_ERR(ds1374->rtc);
379 } 643 }
380 644
645#ifdef CONFIG_RTC_DRV_DS1374_WDT
646 save_client = client;
647 ret = misc_register(&ds1374_miscdev);
648 if (ret)
649 return ret;
650 ret = register_reboot_notifier(&ds1374_wdt_notifier);
651 if (ret) {
652 misc_deregister(&ds1374_miscdev);
653 return ret;
654 }
655 ds1374_wdt_settimeout(131072);
656#endif
657
381 return 0; 658 return 0;
382} 659}
383 660
384static int ds1374_remove(struct i2c_client *client) 661static int ds1374_remove(struct i2c_client *client)
385{ 662{
386 struct ds1374 *ds1374 = i2c_get_clientdata(client); 663 struct ds1374 *ds1374 = i2c_get_clientdata(client);
664#ifdef CONFIG_RTC_DRV_DS1374_WDT
665 int res;
666
667 res = misc_deregister(&ds1374_miscdev);
668 if (!res)
669 ds1374_miscdev.parent = NULL;
670 unregister_reboot_notifier(&ds1374_wdt_notifier);
671#endif
387 672
388 if (client->irq > 0) { 673 if (client->irq > 0) {
389 mutex_lock(&ds1374->mutex); 674 mutex_lock(&ds1374->mutex);
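The new CONFIG_RTC_DRV_DS1374_WDT code exposes the chip's watchdog through the standard /dev/watchdog misc device, so a generic watchdog daemon can drive it. A minimal user-space sketch of that interface is below; note that this driver stores the WDIOC_SETTIMEOUT value straight into wdt_margin, so check the effective units against what the driver programs rather than assuming seconds.

/* Sketch of a /dev/watchdog client; error handling trimmed for brevity. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
        int timeout = 60;
        int fd = open("/dev/watchdog", O_WRONLY);

        if (fd < 0) {
                perror("open /dev/watchdog");
                return 1;
        }

        ioctl(fd, WDIOC_SETTIMEOUT, &timeout);  /* program the margin */

        for (;;) {
                ioctl(fd, WDIOC_KEEPALIVE, 0);  /* pat the watchdog */
                sleep(timeout / 2);
        }
}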
diff --git a/drivers/rtc/rtc-isl12057.c b/drivers/rtc/rtc-isl12057.c
index 455b601d731d..6e1fcfb5d7e6 100644
--- a/drivers/rtc/rtc-isl12057.c
+++ b/drivers/rtc/rtc-isl12057.c
@@ -41,6 +41,7 @@
41#define ISL12057_REG_RTC_DW 0x03 /* Day of the Week */ 41#define ISL12057_REG_RTC_DW 0x03 /* Day of the Week */
42#define ISL12057_REG_RTC_DT 0x04 /* Date */ 42#define ISL12057_REG_RTC_DT 0x04 /* Date */
43#define ISL12057_REG_RTC_MO 0x05 /* Month */ 43#define ISL12057_REG_RTC_MO 0x05 /* Month */
44#define ISL12057_REG_RTC_MO_CEN BIT(7) /* Century bit */
44#define ISL12057_REG_RTC_YR 0x06 /* Year */ 45#define ISL12057_REG_RTC_YR 0x06 /* Year */
45#define ISL12057_RTC_SEC_LEN 7 46#define ISL12057_RTC_SEC_LEN 7
46 47
@@ -88,7 +89,7 @@ static void isl12057_rtc_regs_to_tm(struct rtc_time *tm, u8 *regs)
88 tm->tm_min = bcd2bin(regs[ISL12057_REG_RTC_MN]); 89 tm->tm_min = bcd2bin(regs[ISL12057_REG_RTC_MN]);
89 90
90 if (regs[ISL12057_REG_RTC_HR] & ISL12057_REG_RTC_HR_MIL) { /* AM/PM */ 91 if (regs[ISL12057_REG_RTC_HR] & ISL12057_REG_RTC_HR_MIL) { /* AM/PM */
91 tm->tm_hour = bcd2bin(regs[ISL12057_REG_RTC_HR] & 0x0f); 92 tm->tm_hour = bcd2bin(regs[ISL12057_REG_RTC_HR] & 0x1f);
92 if (regs[ISL12057_REG_RTC_HR] & ISL12057_REG_RTC_HR_PM) 93 if (regs[ISL12057_REG_RTC_HR] & ISL12057_REG_RTC_HR_PM)
93 tm->tm_hour += 12; 94 tm->tm_hour += 12;
94 } else { /* 24 hour mode */ 95 } else { /* 24 hour mode */
@@ -97,26 +98,37 @@ static void isl12057_rtc_regs_to_tm(struct rtc_time *tm, u8 *regs)
97 98
98 tm->tm_mday = bcd2bin(regs[ISL12057_REG_RTC_DT]); 99 tm->tm_mday = bcd2bin(regs[ISL12057_REG_RTC_DT]);
99 tm->tm_wday = bcd2bin(regs[ISL12057_REG_RTC_DW]) - 1; /* starts at 1 */ 100 tm->tm_wday = bcd2bin(regs[ISL12057_REG_RTC_DW]) - 1; /* starts at 1 */
100 tm->tm_mon = bcd2bin(regs[ISL12057_REG_RTC_MO]) - 1; /* starts at 1 */ 101 tm->tm_mon = bcd2bin(regs[ISL12057_REG_RTC_MO] & 0x1f) - 1; /* ditto */
101 tm->tm_year = bcd2bin(regs[ISL12057_REG_RTC_YR]) + 100; 102 tm->tm_year = bcd2bin(regs[ISL12057_REG_RTC_YR]) + 100;
103
104 /* Check if years register has overflown from 99 to 00 */
105 if (regs[ISL12057_REG_RTC_MO] & ISL12057_REG_RTC_MO_CEN)
106 tm->tm_year += 100;
102} 107}
103 108
104static int isl12057_rtc_tm_to_regs(u8 *regs, struct rtc_time *tm) 109static int isl12057_rtc_tm_to_regs(u8 *regs, struct rtc_time *tm)
105{ 110{
111 u8 century_bit;
112
106 /* 113 /*
107 * The clock has an 8 bit wide bcd-coded register for the year. 114 * The clock has an 8 bit wide bcd-coded register for the year.
115 * It also has a century bit encoded in MO flag which provides
116 * information about overflow of year register from 99 to 00.
108 * tm_year is an offset from 1900 and we are interested in the 117 * tm_year is an offset from 1900 and we are interested in the
109 * 2000-2099 range, so any value less than 100 is invalid. 118 * 2000-2199 range, so any value less than 100 or larger than
119 * 299 is invalid.
110 */ 120 */
111 if (tm->tm_year < 100) 121 if (tm->tm_year < 100 || tm->tm_year > 299)
112 return -EINVAL; 122 return -EINVAL;
113 123
124 century_bit = (tm->tm_year > 199) ? ISL12057_REG_RTC_MO_CEN : 0;
125
114 regs[ISL12057_REG_RTC_SC] = bin2bcd(tm->tm_sec); 126 regs[ISL12057_REG_RTC_SC] = bin2bcd(tm->tm_sec);
115 regs[ISL12057_REG_RTC_MN] = bin2bcd(tm->tm_min); 127 regs[ISL12057_REG_RTC_MN] = bin2bcd(tm->tm_min);
116 regs[ISL12057_REG_RTC_HR] = bin2bcd(tm->tm_hour); /* 24-hour format */ 128 regs[ISL12057_REG_RTC_HR] = bin2bcd(tm->tm_hour); /* 24-hour format */
117 regs[ISL12057_REG_RTC_DT] = bin2bcd(tm->tm_mday); 129 regs[ISL12057_REG_RTC_DT] = bin2bcd(tm->tm_mday);
118 regs[ISL12057_REG_RTC_MO] = bin2bcd(tm->tm_mon + 1); 130 regs[ISL12057_REG_RTC_MO] = bin2bcd(tm->tm_mon + 1) | century_bit;
119 regs[ISL12057_REG_RTC_YR] = bin2bcd(tm->tm_year - 100); 131 regs[ISL12057_REG_RTC_YR] = bin2bcd(tm->tm_year % 100);
120 regs[ISL12057_REG_RTC_DW] = bin2bcd(tm->tm_wday + 1); 132 regs[ISL12057_REG_RTC_DW] = bin2bcd(tm->tm_wday + 1);
121 133
122 return 0; 134 return 0;
@@ -152,17 +164,33 @@ static int isl12057_rtc_read_time(struct device *dev, struct rtc_time *tm)
152{ 164{
153 struct isl12057_rtc_data *data = dev_get_drvdata(dev); 165 struct isl12057_rtc_data *data = dev_get_drvdata(dev);
154 u8 regs[ISL12057_RTC_SEC_LEN]; 166 u8 regs[ISL12057_RTC_SEC_LEN];
167 unsigned int sr;
155 int ret; 168 int ret;
156 169
157 mutex_lock(&data->lock); 170 mutex_lock(&data->lock);
171 ret = regmap_read(data->regmap, ISL12057_REG_SR, &sr);
172 if (ret) {
173 dev_err(dev, "%s: unable to read oscillator status flag (%d)\n",
174 __func__, ret);
175 goto out;
176 } else {
177 if (sr & ISL12057_REG_SR_OSF) {
178 ret = -ENODATA;
179 goto out;
180 }
181 }
182
158 ret = regmap_bulk_read(data->regmap, ISL12057_REG_RTC_SC, regs, 183 ret = regmap_bulk_read(data->regmap, ISL12057_REG_RTC_SC, regs,
159 ISL12057_RTC_SEC_LEN); 184 ISL12057_RTC_SEC_LEN);
185 if (ret)
186 dev_err(dev, "%s: unable to read RTC time section (%d)\n",
187 __func__, ret);
188
189out:
160 mutex_unlock(&data->lock); 190 mutex_unlock(&data->lock);
161 191
162 if (ret) { 192 if (ret)
163 dev_err(dev, "%s: RTC read failed\n", __func__);
164 return ret; 193 return ret;
165 }
166 194
167 isl12057_rtc_regs_to_tm(tm, regs); 195 isl12057_rtc_regs_to_tm(tm, regs);
168 196
@@ -182,10 +210,24 @@ static int isl12057_rtc_set_time(struct device *dev, struct rtc_time *tm)
182 mutex_lock(&data->lock); 210 mutex_lock(&data->lock);
183 ret = regmap_bulk_write(data->regmap, ISL12057_REG_RTC_SC, regs, 211 ret = regmap_bulk_write(data->regmap, ISL12057_REG_RTC_SC, regs,
184 ISL12057_RTC_SEC_LEN); 212 ISL12057_RTC_SEC_LEN);
185 mutex_unlock(&data->lock); 213 if (ret) {
214 dev_err(dev, "%s: unable to write RTC time section (%d)\n",
215 __func__, ret);
216 goto out;
217 }
186 218
187 if (ret) 219 /*
188 dev_err(dev, "%s: RTC write failed\n", __func__); 220 * Now that RTC time has been updated, let's clear oscillator
221 * failure flag, if needed.
222 */
223 ret = regmap_update_bits(data->regmap, ISL12057_REG_SR,
224 ISL12057_REG_SR_OSF, 0);
225 if (ret < 0)
226 dev_err(dev, "%s: unable to clear osc. failure bit (%d)\n",
227 __func__, ret);
228
229out:
230 mutex_unlock(&data->lock);
189 231
190 return ret; 232 return ret;
191} 233}
@@ -203,15 +245,8 @@ static int isl12057_check_rtc_status(struct device *dev, struct regmap *regmap)
203 ret = regmap_update_bits(regmap, ISL12057_REG_INT, 245 ret = regmap_update_bits(regmap, ISL12057_REG_INT,
204 ISL12057_REG_INT_EOSC, 0); 246 ISL12057_REG_INT_EOSC, 0);
205 if (ret < 0) { 247 if (ret < 0) {
206 dev_err(dev, "Unable to enable oscillator\n"); 248 dev_err(dev, "%s: unable to enable oscillator (%d)\n",
207 return ret; 249 __func__, ret);
208 }
209
210 /* Clear oscillator failure bit if needed */
211 ret = regmap_update_bits(regmap, ISL12057_REG_SR,
212 ISL12057_REG_SR_OSF, 0);
213 if (ret < 0) {
214 dev_err(dev, "Unable to clear oscillator failure bit\n");
215 return ret; 250 return ret;
216 } 251 }
217 252
@@ -219,7 +254,8 @@ static int isl12057_check_rtc_status(struct device *dev, struct regmap *regmap)
219 ret = regmap_update_bits(regmap, ISL12057_REG_SR, 254 ret = regmap_update_bits(regmap, ISL12057_REG_SR,
220 ISL12057_REG_SR_A1F, 0); 255 ISL12057_REG_SR_A1F, 0);
221 if (ret < 0) { 256 if (ret < 0) {
222 dev_err(dev, "Unable to clear alarm bit\n"); 257 dev_err(dev, "%s: unable to clear alarm bit (%d)\n",
258 __func__, ret);
223 return ret; 259 return ret;
224 } 260 }
225 261
@@ -253,7 +289,8 @@ static int isl12057_probe(struct i2c_client *client,
253 regmap = devm_regmap_init_i2c(client, &isl12057_rtc_regmap_config); 289 regmap = devm_regmap_init_i2c(client, &isl12057_rtc_regmap_config);
254 if (IS_ERR(regmap)) { 290 if (IS_ERR(regmap)) {
255 ret = PTR_ERR(regmap); 291 ret = PTR_ERR(regmap);
256 dev_err(dev, "regmap allocation failed: %d\n", ret); 292 dev_err(dev, "%s: regmap allocation failed (%d)\n",
293 __func__, ret);
257 return ret; 294 return ret;
258 } 295 }
259 296
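The isl12057 changes extend the usable range from 2000-2099 to 2000-2199 by treating bit 7 of the month register as a century flag. The conversion reduces to the arithmetic below, shown as a standalone sketch; bcd2bin()/bin2bcd() here are local stand-ins for the kernel helpers the driver uses.

/* Sketch of the century-bit arithmetic; tm_year is the offset from 1900. */
#include <stdbool.h>

static unsigned int bcd2bin(unsigned char v) { return (v >> 4) * 10 + (v & 0x0f); }
static unsigned char bin2bcd(unsigned int v) { return ((v / 10) << 4) | (v % 10); }

static int regs_to_tm_year(unsigned char yr_bcd, bool century)
{
        int tm_year = bcd2bin(yr_bcd) + 100;    /* 20xx */

        if (century)
                tm_year += 100;                 /* year register wrapped: 21xx */
        return tm_year;
}

static int tm_year_to_regs(int tm_year, unsigned char *yr_bcd, bool *century)
{
        if (tm_year < 100 || tm_year > 299)
                return -1;                      /* outside 2000-2199 */

        *century = tm_year > 199;
        *yr_bcd = bin2bcd(tm_year % 100);
        return 0;
}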
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index 21142e6574a9..4f1c6ca97211 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -1,10 +1,11 @@
1/* 1/*
2 * TI OMAP1 Real Time Clock interface for Linux 2 * TI OMAP Real Time Clock interface for Linux
3 * 3 *
4 * Copyright (C) 2003 MontaVista Software, Inc. 4 * Copyright (C) 2003 MontaVista Software, Inc.
5 * Author: George G. Davis <gdavis@mvista.com> or <source@mvista.com> 5 * Author: George G. Davis <gdavis@mvista.com> or <source@mvista.com>
6 * 6 *
7 * Copyright (C) 2006 David Brownell (new RTC framework) 7 * Copyright (C) 2006 David Brownell (new RTC framework)
8 * Copyright (C) 2014 Johan Hovold <johan@kernel.org>
8 * 9 *
9 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License 11 * modify it under the terms of the GNU General Public License
@@ -25,7 +26,8 @@
25#include <linux/pm_runtime.h> 26#include <linux/pm_runtime.h>
26#include <linux/io.h> 27#include <linux/io.h>
27 28
28/* The OMAP1 RTC is a year/month/day/hours/minutes/seconds BCD clock 29/*
30 * The OMAP RTC is a year/month/day/hours/minutes/seconds BCD clock
29 * with century-range alarm matching, driven by the 32kHz clock. 31 * with century-range alarm matching, driven by the 32kHz clock.
30 * 32 *
31 * The main user-visible ways it differs from PC RTCs are by omitting 33 * The main user-visible ways it differs from PC RTCs are by omitting
@@ -39,10 +41,6 @@
39 * the SoC). See the BOARD-SPECIFIC CUSTOMIZATION comment. 41 * the SoC). See the BOARD-SPECIFIC CUSTOMIZATION comment.
40 */ 42 */
41 43
42#define DRIVER_NAME "omap_rtc"
43
44#define OMAP_RTC_BASE 0xfffb4800
45
46/* RTC registers */ 44/* RTC registers */
47#define OMAP_RTC_SECONDS_REG 0x00 45#define OMAP_RTC_SECONDS_REG 0x00
48#define OMAP_RTC_MINUTES_REG 0x04 46#define OMAP_RTC_MINUTES_REG 0x04
@@ -72,6 +70,15 @@
72 70
73#define OMAP_RTC_IRQWAKEEN 0x7c 71#define OMAP_RTC_IRQWAKEEN 0x7c
74 72
73#define OMAP_RTC_ALARM2_SECONDS_REG 0x80
74#define OMAP_RTC_ALARM2_MINUTES_REG 0x84
75#define OMAP_RTC_ALARM2_HOURS_REG 0x88
76#define OMAP_RTC_ALARM2_DAYS_REG 0x8c
77#define OMAP_RTC_ALARM2_MONTHS_REG 0x90
78#define OMAP_RTC_ALARM2_YEARS_REG 0x94
79
80#define OMAP_RTC_PMIC_REG 0x98
81
75/* OMAP_RTC_CTRL_REG bit fields: */ 82/* OMAP_RTC_CTRL_REG bit fields: */
76#define OMAP_RTC_CTRL_SPLIT BIT(7) 83#define OMAP_RTC_CTRL_SPLIT BIT(7)
77#define OMAP_RTC_CTRL_DISABLE BIT(6) 84#define OMAP_RTC_CTRL_DISABLE BIT(6)
@@ -84,6 +91,7 @@
84 91
85/* OMAP_RTC_STATUS_REG bit fields: */ 92/* OMAP_RTC_STATUS_REG bit fields: */
86#define OMAP_RTC_STATUS_POWER_UP BIT(7) 93#define OMAP_RTC_STATUS_POWER_UP BIT(7)
94#define OMAP_RTC_STATUS_ALARM2 BIT(7)
87#define OMAP_RTC_STATUS_ALARM BIT(6) 95#define OMAP_RTC_STATUS_ALARM BIT(6)
88#define OMAP_RTC_STATUS_1D_EVENT BIT(5) 96#define OMAP_RTC_STATUS_1D_EVENT BIT(5)
89#define OMAP_RTC_STATUS_1H_EVENT BIT(4) 97#define OMAP_RTC_STATUS_1H_EVENT BIT(4)
@@ -93,6 +101,7 @@
93#define OMAP_RTC_STATUS_BUSY BIT(0) 101#define OMAP_RTC_STATUS_BUSY BIT(0)
94 102
95/* OMAP_RTC_INTERRUPTS_REG bit fields: */ 103/* OMAP_RTC_INTERRUPTS_REG bit fields: */
104#define OMAP_RTC_INTERRUPTS_IT_ALARM2 BIT(4)
96#define OMAP_RTC_INTERRUPTS_IT_ALARM BIT(3) 105#define OMAP_RTC_INTERRUPTS_IT_ALARM BIT(3)
97#define OMAP_RTC_INTERRUPTS_IT_TIMER BIT(2) 106#define OMAP_RTC_INTERRUPTS_IT_TIMER BIT(2)
98 107
@@ -102,61 +111,82 @@
102/* OMAP_RTC_IRQWAKEEN bit fields: */ 111/* OMAP_RTC_IRQWAKEEN bit fields: */
103#define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN BIT(1) 112#define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN BIT(1)
104 113
114/* OMAP_RTC_PMIC bit fields: */
115#define OMAP_RTC_PMIC_POWER_EN_EN BIT(16)
116
105/* OMAP_RTC_KICKER values */ 117/* OMAP_RTC_KICKER values */
106#define KICK0_VALUE 0x83e70b13 118#define KICK0_VALUE 0x83e70b13
107#define KICK1_VALUE 0x95a4f1e0 119#define KICK1_VALUE 0x95a4f1e0
108 120
109#define OMAP_RTC_HAS_KICKER BIT(0) 121struct omap_rtc_device_type {
110 122 bool has_32kclk_en;
111/* 123 bool has_kicker;
112 * Few RTC IP revisions has special WAKE-EN Register to enable Wakeup 124 bool has_irqwakeen;
113 * generation for event Alarm. 125 bool has_pmic_mode;
114 */ 126 bool has_power_up_reset;
115#define OMAP_RTC_HAS_IRQWAKEEN BIT(1) 127};
116 128
117/* 129struct omap_rtc {
118 * Some RTC IP revisions (like those in AM335x and DRA7x) need 130 struct rtc_device *rtc;
119 * the 32KHz clock to be explicitly enabled. 131 void __iomem *base;
120 */ 132 int irq_alarm;
121#define OMAP_RTC_HAS_32KCLK_EN BIT(2) 133 int irq_timer;
134 u8 interrupts_reg;
135 bool is_pmic_controller;
136 const struct omap_rtc_device_type *type;
137};
122 138
123static void __iomem *rtc_base; 139static inline u8 rtc_read(struct omap_rtc *rtc, unsigned int reg)
140{
141 return readb(rtc->base + reg);
142}
124 143
125#define rtc_read(addr) readb(rtc_base + (addr)) 144static inline u32 rtc_readl(struct omap_rtc *rtc, unsigned int reg)
126#define rtc_write(val, addr) writeb(val, rtc_base + (addr)) 145{
146 return readl(rtc->base + reg);
147}
127 148
128#define rtc_writel(val, addr) writel(val, rtc_base + (addr)) 149static inline void rtc_write(struct omap_rtc *rtc, unsigned int reg, u8 val)
150{
151 writeb(val, rtc->base + reg);
152}
129 153
154static inline void rtc_writel(struct omap_rtc *rtc, unsigned int reg, u32 val)
155{
156 writel(val, rtc->base + reg);
157}
130 158
131/* we rely on the rtc framework to handle locking (rtc->ops_lock), 159/*
160 * We rely on the rtc framework to handle locking (rtc->ops_lock),
132 * so the only other requirement is that register accesses which 161 * so the only other requirement is that register accesses which
133 * require BUSY to be clear are made with IRQs locally disabled 162 * require BUSY to be clear are made with IRQs locally disabled
134 */ 163 */
135static void rtc_wait_not_busy(void) 164static void rtc_wait_not_busy(struct omap_rtc *rtc)
136{ 165{
137 int count = 0; 166 int count;
138 u8 status; 167 u8 status;
139 168
140 /* BUSY may stay active for 1/32768 second (~30 usec) */ 169 /* BUSY may stay active for 1/32768 second (~30 usec) */
141 for (count = 0; count < 50; count++) { 170 for (count = 0; count < 50; count++) {
142 status = rtc_read(OMAP_RTC_STATUS_REG); 171 status = rtc_read(rtc, OMAP_RTC_STATUS_REG);
143 if ((status & (u8)OMAP_RTC_STATUS_BUSY) == 0) 172 if (!(status & OMAP_RTC_STATUS_BUSY))
144 break; 173 break;
145 udelay(1); 174 udelay(1);
146 } 175 }
147 /* now we have ~15 usec to read/write various registers */ 176 /* now we have ~15 usec to read/write various registers */
148} 177}
149 178
150static irqreturn_t rtc_irq(int irq, void *rtc) 179static irqreturn_t rtc_irq(int irq, void *dev_id)
151{ 180{
152 unsigned long events = 0; 181 struct omap_rtc *rtc = dev_id;
153 u8 irq_data; 182 unsigned long events = 0;
183 u8 irq_data;
154 184
155 irq_data = rtc_read(OMAP_RTC_STATUS_REG); 185 irq_data = rtc_read(rtc, OMAP_RTC_STATUS_REG);
156 186
157 /* alarm irq? */ 187 /* alarm irq? */
158 if (irq_data & OMAP_RTC_STATUS_ALARM) { 188 if (irq_data & OMAP_RTC_STATUS_ALARM) {
159 rtc_write(OMAP_RTC_STATUS_ALARM, OMAP_RTC_STATUS_REG); 189 rtc_write(rtc, OMAP_RTC_STATUS_REG, OMAP_RTC_STATUS_ALARM);
160 events |= RTC_IRQF | RTC_AF; 190 events |= RTC_IRQF | RTC_AF;
161 } 191 }
162 192
@@ -164,23 +194,21 @@ static irqreturn_t rtc_irq(int irq, void *rtc)
164 if (irq_data & OMAP_RTC_STATUS_1S_EVENT) 194 if (irq_data & OMAP_RTC_STATUS_1S_EVENT)
165 events |= RTC_IRQF | RTC_UF; 195 events |= RTC_IRQF | RTC_UF;
166 196
167 rtc_update_irq(rtc, 1, events); 197 rtc_update_irq(rtc->rtc, 1, events);
168 198
169 return IRQ_HANDLED; 199 return IRQ_HANDLED;
170} 200}
171 201
172static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) 202static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
173{ 203{
204 struct omap_rtc *rtc = dev_get_drvdata(dev);
174 u8 reg, irqwake_reg = 0; 205 u8 reg, irqwake_reg = 0;
175 struct platform_device *pdev = to_platform_device(dev);
176 const struct platform_device_id *id_entry =
177 platform_get_device_id(pdev);
178 206
179 local_irq_disable(); 207 local_irq_disable();
180 rtc_wait_not_busy(); 208 rtc_wait_not_busy(rtc);
181 reg = rtc_read(OMAP_RTC_INTERRUPTS_REG); 209 reg = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG);
182 if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) 210 if (rtc->type->has_irqwakeen)
183 irqwake_reg = rtc_read(OMAP_RTC_IRQWAKEEN); 211 irqwake_reg = rtc_read(rtc, OMAP_RTC_IRQWAKEEN);
184 212
185 if (enabled) { 213 if (enabled) {
186 reg |= OMAP_RTC_INTERRUPTS_IT_ALARM; 214 reg |= OMAP_RTC_INTERRUPTS_IT_ALARM;
@@ -189,10 +217,10 @@ static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
189 reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM; 217 reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM;
190 irqwake_reg &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; 218 irqwake_reg &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN;
191 } 219 }
192 rtc_wait_not_busy(); 220 rtc_wait_not_busy(rtc);
193 rtc_write(reg, OMAP_RTC_INTERRUPTS_REG); 221 rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, reg);
194 if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) 222 if (rtc->type->has_irqwakeen)
195 rtc_write(irqwake_reg, OMAP_RTC_IRQWAKEEN); 223 rtc_write(rtc, OMAP_RTC_IRQWAKEEN, irqwake_reg);
196 local_irq_enable(); 224 local_irq_enable();
197 225
198 return 0; 226 return 0;
@@ -230,39 +258,47 @@ static void bcd2tm(struct rtc_time *tm)
230 tm->tm_year = bcd2bin(tm->tm_year) + 100; 258 tm->tm_year = bcd2bin(tm->tm_year) + 100;
231} 259}
232 260
261static void omap_rtc_read_time_raw(struct omap_rtc *rtc, struct rtc_time *tm)
262{
263 tm->tm_sec = rtc_read(rtc, OMAP_RTC_SECONDS_REG);
264 tm->tm_min = rtc_read(rtc, OMAP_RTC_MINUTES_REG);
265 tm->tm_hour = rtc_read(rtc, OMAP_RTC_HOURS_REG);
266 tm->tm_mday = rtc_read(rtc, OMAP_RTC_DAYS_REG);
267 tm->tm_mon = rtc_read(rtc, OMAP_RTC_MONTHS_REG);
268 tm->tm_year = rtc_read(rtc, OMAP_RTC_YEARS_REG);
269}
233 270
234static int omap_rtc_read_time(struct device *dev, struct rtc_time *tm) 271static int omap_rtc_read_time(struct device *dev, struct rtc_time *tm)
235{ 272{
273 struct omap_rtc *rtc = dev_get_drvdata(dev);
274
236 /* we don't report wday/yday/isdst ... */ 275 /* we don't report wday/yday/isdst ... */
237 local_irq_disable(); 276 local_irq_disable();
238 rtc_wait_not_busy(); 277 rtc_wait_not_busy(rtc);
239 278 omap_rtc_read_time_raw(rtc, tm);
240 tm->tm_sec = rtc_read(OMAP_RTC_SECONDS_REG);
241 tm->tm_min = rtc_read(OMAP_RTC_MINUTES_REG);
242 tm->tm_hour = rtc_read(OMAP_RTC_HOURS_REG);
243 tm->tm_mday = rtc_read(OMAP_RTC_DAYS_REG);
244 tm->tm_mon = rtc_read(OMAP_RTC_MONTHS_REG);
245 tm->tm_year = rtc_read(OMAP_RTC_YEARS_REG);
246
247 local_irq_enable(); 279 local_irq_enable();
248 280
249 bcd2tm(tm); 281 bcd2tm(tm);
282
250 return 0; 283 return 0;
251} 284}
252 285
253static int omap_rtc_set_time(struct device *dev, struct rtc_time *tm) 286static int omap_rtc_set_time(struct device *dev, struct rtc_time *tm)
254{ 287{
288 struct omap_rtc *rtc = dev_get_drvdata(dev);
289
255 if (tm2bcd(tm) < 0) 290 if (tm2bcd(tm) < 0)
256 return -EINVAL; 291 return -EINVAL;
292
257 local_irq_disable(); 293 local_irq_disable();
258 rtc_wait_not_busy(); 294 rtc_wait_not_busy(rtc);
259 295
260 rtc_write(tm->tm_year, OMAP_RTC_YEARS_REG); 296 rtc_write(rtc, OMAP_RTC_YEARS_REG, tm->tm_year);
261 rtc_write(tm->tm_mon, OMAP_RTC_MONTHS_REG); 297 rtc_write(rtc, OMAP_RTC_MONTHS_REG, tm->tm_mon);
262 rtc_write(tm->tm_mday, OMAP_RTC_DAYS_REG); 298 rtc_write(rtc, OMAP_RTC_DAYS_REG, tm->tm_mday);
263 rtc_write(tm->tm_hour, OMAP_RTC_HOURS_REG); 299 rtc_write(rtc, OMAP_RTC_HOURS_REG, tm->tm_hour);
264 rtc_write(tm->tm_min, OMAP_RTC_MINUTES_REG); 300 rtc_write(rtc, OMAP_RTC_MINUTES_REG, tm->tm_min);
265 rtc_write(tm->tm_sec, OMAP_RTC_SECONDS_REG); 301 rtc_write(rtc, OMAP_RTC_SECONDS_REG, tm->tm_sec);
266 302
267 local_irq_enable(); 303 local_irq_enable();
268 304
@@ -271,48 +307,50 @@ static int omap_rtc_set_time(struct device *dev, struct rtc_time *tm)
271 307
272static int omap_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) 308static int omap_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
273{ 309{
310 struct omap_rtc *rtc = dev_get_drvdata(dev);
311 u8 interrupts;
312
274 local_irq_disable(); 313 local_irq_disable();
275 rtc_wait_not_busy(); 314 rtc_wait_not_busy(rtc);
276 315
277 alm->time.tm_sec = rtc_read(OMAP_RTC_ALARM_SECONDS_REG); 316 alm->time.tm_sec = rtc_read(rtc, OMAP_RTC_ALARM_SECONDS_REG);
278 alm->time.tm_min = rtc_read(OMAP_RTC_ALARM_MINUTES_REG); 317 alm->time.tm_min = rtc_read(rtc, OMAP_RTC_ALARM_MINUTES_REG);
279 alm->time.tm_hour = rtc_read(OMAP_RTC_ALARM_HOURS_REG); 318 alm->time.tm_hour = rtc_read(rtc, OMAP_RTC_ALARM_HOURS_REG);
280 alm->time.tm_mday = rtc_read(OMAP_RTC_ALARM_DAYS_REG); 319 alm->time.tm_mday = rtc_read(rtc, OMAP_RTC_ALARM_DAYS_REG);
281 alm->time.tm_mon = rtc_read(OMAP_RTC_ALARM_MONTHS_REG); 320 alm->time.tm_mon = rtc_read(rtc, OMAP_RTC_ALARM_MONTHS_REG);
282 alm->time.tm_year = rtc_read(OMAP_RTC_ALARM_YEARS_REG); 321 alm->time.tm_year = rtc_read(rtc, OMAP_RTC_ALARM_YEARS_REG);
283 322
284 local_irq_enable(); 323 local_irq_enable();
285 324
286 bcd2tm(&alm->time); 325 bcd2tm(&alm->time);
287 alm->enabled = !!(rtc_read(OMAP_RTC_INTERRUPTS_REG) 326
288 & OMAP_RTC_INTERRUPTS_IT_ALARM); 327 interrupts = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG);
328 alm->enabled = !!(interrupts & OMAP_RTC_INTERRUPTS_IT_ALARM);
289 329
290 return 0; 330 return 0;
291} 331}
292 332
293static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) 333static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
294{ 334{
335 struct omap_rtc *rtc = dev_get_drvdata(dev);
295 u8 reg, irqwake_reg = 0; 336 u8 reg, irqwake_reg = 0;
296 struct platform_device *pdev = to_platform_device(dev);
297 const struct platform_device_id *id_entry =
298 platform_get_device_id(pdev);
299 337
300 if (tm2bcd(&alm->time) < 0) 338 if (tm2bcd(&alm->time) < 0)
301 return -EINVAL; 339 return -EINVAL;
302 340
303 local_irq_disable(); 341 local_irq_disable();
304 rtc_wait_not_busy(); 342 rtc_wait_not_busy(rtc);
305 343
306 rtc_write(alm->time.tm_year, OMAP_RTC_ALARM_YEARS_REG); 344 rtc_write(rtc, OMAP_RTC_ALARM_YEARS_REG, alm->time.tm_year);
307 rtc_write(alm->time.tm_mon, OMAP_RTC_ALARM_MONTHS_REG); 345 rtc_write(rtc, OMAP_RTC_ALARM_MONTHS_REG, alm->time.tm_mon);
308 rtc_write(alm->time.tm_mday, OMAP_RTC_ALARM_DAYS_REG); 346 rtc_write(rtc, OMAP_RTC_ALARM_DAYS_REG, alm->time.tm_mday);
309 rtc_write(alm->time.tm_hour, OMAP_RTC_ALARM_HOURS_REG); 347 rtc_write(rtc, OMAP_RTC_ALARM_HOURS_REG, alm->time.tm_hour);
310 rtc_write(alm->time.tm_min, OMAP_RTC_ALARM_MINUTES_REG); 348 rtc_write(rtc, OMAP_RTC_ALARM_MINUTES_REG, alm->time.tm_min);
311 rtc_write(alm->time.tm_sec, OMAP_RTC_ALARM_SECONDS_REG); 349 rtc_write(rtc, OMAP_RTC_ALARM_SECONDS_REG, alm->time.tm_sec);
312 350
313 reg = rtc_read(OMAP_RTC_INTERRUPTS_REG); 351 reg = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG);
314 if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) 352 if (rtc->type->has_irqwakeen)
315 irqwake_reg = rtc_read(OMAP_RTC_IRQWAKEEN); 353 irqwake_reg = rtc_read(rtc, OMAP_RTC_IRQWAKEEN);
316 354
317 if (alm->enabled) { 355 if (alm->enabled) {
318 reg |= OMAP_RTC_INTERRUPTS_IT_ALARM; 356 reg |= OMAP_RTC_INTERRUPTS_IT_ALARM;
@@ -321,15 +359,79 @@ static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
321 reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM; 359 reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM;
322 irqwake_reg &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; 360 irqwake_reg &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN;
323 } 361 }
324 rtc_write(reg, OMAP_RTC_INTERRUPTS_REG); 362 rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, reg);
325 if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) 363 if (rtc->type->has_irqwakeen)
326 rtc_write(irqwake_reg, OMAP_RTC_IRQWAKEEN); 364 rtc_write(rtc, OMAP_RTC_IRQWAKEEN, irqwake_reg);
327 365
328 local_irq_enable(); 366 local_irq_enable();
329 367
330 return 0; 368 return 0;
331} 369}
332 370
371static struct omap_rtc *omap_rtc_power_off_rtc;
372
373/*
374 * omap_rtc_poweroff: RTC-controlled power off
375 *
376 * The RTC can be used to control an external PMIC via the pmic_power_en pin,
377 * which can be configured to transition to OFF on ALARM2 events.
378 *
379 * Notes:
380 * The two-second alarm offset is the shortest offset possible as the alarm
381 * registers must be set before the next timer update and the offset
382 * calculation is too heavy for everything to be done within a single access
383 * period (~15 us).
384 *
385 * Called with local interrupts disabled.
386 */
387static void omap_rtc_power_off(void)
388{
389 struct omap_rtc *rtc = omap_rtc_power_off_rtc;
390 struct rtc_time tm;
391 unsigned long now;
392 u32 val;
393
394 /* enable pmic_power_en control */
395 val = rtc_readl(rtc, OMAP_RTC_PMIC_REG);
396 rtc_writel(rtc, OMAP_RTC_PMIC_REG, val | OMAP_RTC_PMIC_POWER_EN_EN);
397
398 /* set alarm two seconds from now */
399 omap_rtc_read_time_raw(rtc, &tm);
400 bcd2tm(&tm);
401 rtc_tm_to_time(&tm, &now);
402 rtc_time_to_tm(now + 2, &tm);
403
404 if (tm2bcd(&tm) < 0) {
405 dev_err(&rtc->rtc->dev, "power off failed\n");
406 return;
407 }
408
409 rtc_wait_not_busy(rtc);
410
411 rtc_write(rtc, OMAP_RTC_ALARM2_SECONDS_REG, tm.tm_sec);
412 rtc_write(rtc, OMAP_RTC_ALARM2_MINUTES_REG, tm.tm_min);
413 rtc_write(rtc, OMAP_RTC_ALARM2_HOURS_REG, tm.tm_hour);
414 rtc_write(rtc, OMAP_RTC_ALARM2_DAYS_REG, tm.tm_mday);
415 rtc_write(rtc, OMAP_RTC_ALARM2_MONTHS_REG, tm.tm_mon);
416 rtc_write(rtc, OMAP_RTC_ALARM2_YEARS_REG, tm.tm_year);
417
418 /*
419 * enable ALARM2 interrupt
420 *
421 * NOTE: this fails on AM3352 if rtc_write (writeb) is used
422 */
423 val = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG);
424 rtc_writel(rtc, OMAP_RTC_INTERRUPTS_REG,
425 val | OMAP_RTC_INTERRUPTS_IT_ALARM2);
426
427 /*
428 * Wait for alarm to trigger (within two seconds) and external PMIC to
429 * power off the system. Add a 500 ms margin for external latencies
430 * (e.g. debounce circuits).
431 */
432 mdelay(2500);
433}
434
333static struct rtc_class_ops omap_rtc_ops = { 435static struct rtc_class_ops omap_rtc_ops = {
334 .read_time = omap_rtc_read_time, 436 .read_time = omap_rtc_read_time,
335 .set_time = omap_rtc_set_time, 437 .set_time = omap_rtc_set_time,
@@ -338,137 +440,140 @@ static struct rtc_class_ops omap_rtc_ops = {
338 .alarm_irq_enable = omap_rtc_alarm_irq_enable, 440 .alarm_irq_enable = omap_rtc_alarm_irq_enable,
339}; 441};
340 442
341static int omap_rtc_alarm; 443static const struct omap_rtc_device_type omap_rtc_default_type = {
342static int omap_rtc_timer; 444 .has_power_up_reset = true,
445};
343 446
344#define OMAP_RTC_DATA_AM3352_IDX 1 447static const struct omap_rtc_device_type omap_rtc_am3352_type = {
345#define OMAP_RTC_DATA_DA830_IDX 2 448 .has_32kclk_en = true,
449 .has_kicker = true,
450 .has_irqwakeen = true,
451 .has_pmic_mode = true,
452};
346 453
347static struct platform_device_id omap_rtc_devtype[] = { 454static const struct omap_rtc_device_type omap_rtc_da830_type = {
455 .has_kicker = true,
456};
457
458static const struct platform_device_id omap_rtc_id_table[] = {
348 { 459 {
349 .name = DRIVER_NAME, 460 .name = "omap_rtc",
350 }, 461 .driver_data = (kernel_ulong_t)&omap_rtc_default_type,
351 [OMAP_RTC_DATA_AM3352_IDX] = { 462 }, {
352 .name = "am3352-rtc", 463 .name = "am3352-rtc",
353 .driver_data = OMAP_RTC_HAS_KICKER | OMAP_RTC_HAS_IRQWAKEEN | 464 .driver_data = (kernel_ulong_t)&omap_rtc_am3352_type,
354 OMAP_RTC_HAS_32KCLK_EN, 465 }, {
355 },
356 [OMAP_RTC_DATA_DA830_IDX] = {
357 .name = "da830-rtc", 466 .name = "da830-rtc",
358 .driver_data = OMAP_RTC_HAS_KICKER, 467 .driver_data = (kernel_ulong_t)&omap_rtc_da830_type,
359 }, 468 }, {
360 {}, 469 /* sentinel */
470 }
361}; 471};
362MODULE_DEVICE_TABLE(platform, omap_rtc_devtype); 472MODULE_DEVICE_TABLE(platform, omap_rtc_id_table);
363 473
364static const struct of_device_id omap_rtc_of_match[] = { 474static const struct of_device_id omap_rtc_of_match[] = {
365 { .compatible = "ti,da830-rtc", 475 {
366 .data = &omap_rtc_devtype[OMAP_RTC_DATA_DA830_IDX], 476 .compatible = "ti,am3352-rtc",
367 }, 477 .data = &omap_rtc_am3352_type,
368 { .compatible = "ti,am3352-rtc", 478 }, {
369 .data = &omap_rtc_devtype[OMAP_RTC_DATA_AM3352_IDX], 479 .compatible = "ti,da830-rtc",
370 }, 480 .data = &omap_rtc_da830_type,
371 {}, 481 }, {
482 /* sentinel */
483 }
372}; 484};
373MODULE_DEVICE_TABLE(of, omap_rtc_of_match); 485MODULE_DEVICE_TABLE(of, omap_rtc_of_match);
374 486
375static int __init omap_rtc_probe(struct platform_device *pdev) 487static int __init omap_rtc_probe(struct platform_device *pdev)
376{ 488{
377 struct resource *res; 489 struct omap_rtc *rtc;
378 struct rtc_device *rtc; 490 struct resource *res;
379 u8 reg, new_ctrl; 491 u8 reg, mask, new_ctrl;
380 const struct platform_device_id *id_entry; 492 const struct platform_device_id *id_entry;
381 const struct of_device_id *of_id; 493 const struct of_device_id *of_id;
494 int ret;
382 495
383 of_id = of_match_device(omap_rtc_of_match, &pdev->dev); 496 rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
384 if (of_id) 497 if (!rtc)
385 pdev->id_entry = of_id->data; 498 return -ENOMEM;
386 499
387 id_entry = platform_get_device_id(pdev); 500 of_id = of_match_device(omap_rtc_of_match, &pdev->dev);
388 if (!id_entry) { 501 if (of_id) {
389 dev_err(&pdev->dev, "no matching device entry\n"); 502 rtc->type = of_id->data;
390 return -ENODEV; 503 rtc->is_pmic_controller = rtc->type->has_pmic_mode &&
504 of_property_read_bool(pdev->dev.of_node,
505 "system-power-controller");
506 } else {
507 id_entry = platform_get_device_id(pdev);
508 rtc->type = (void *)id_entry->driver_data;
391 } 509 }
392 510
393 omap_rtc_timer = platform_get_irq(pdev, 0); 511 rtc->irq_timer = platform_get_irq(pdev, 0);
394 if (omap_rtc_timer <= 0) { 512 if (rtc->irq_timer <= 0)
395 pr_debug("%s: no update irq?\n", pdev->name);
396 return -ENOENT; 513 return -ENOENT;
397 }
398 514
399 omap_rtc_alarm = platform_get_irq(pdev, 1); 515 rtc->irq_alarm = platform_get_irq(pdev, 1);
400 if (omap_rtc_alarm <= 0) { 516 if (rtc->irq_alarm <= 0)
401 pr_debug("%s: no alarm irq?\n", pdev->name);
402 return -ENOENT; 517 return -ENOENT;
403 }
404 518
405 res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 519 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
406 rtc_base = devm_ioremap_resource(&pdev->dev, res); 520 rtc->base = devm_ioremap_resource(&pdev->dev, res);
407 if (IS_ERR(rtc_base)) 521 if (IS_ERR(rtc->base))
408 return PTR_ERR(rtc_base); 522 return PTR_ERR(rtc->base);
523
524 platform_set_drvdata(pdev, rtc);
409 525
410 /* Enable the clock/module so that we can access the registers */ 526 /* Enable the clock/module so that we can access the registers */
411 pm_runtime_enable(&pdev->dev); 527 pm_runtime_enable(&pdev->dev);
412 pm_runtime_get_sync(&pdev->dev); 528 pm_runtime_get_sync(&pdev->dev);
413 529
414 if (id_entry->driver_data & OMAP_RTC_HAS_KICKER) { 530 if (rtc->type->has_kicker) {
415 rtc_writel(KICK0_VALUE, OMAP_RTC_KICK0_REG); 531 rtc_writel(rtc, OMAP_RTC_KICK0_REG, KICK0_VALUE);
416 rtc_writel(KICK1_VALUE, OMAP_RTC_KICK1_REG); 532 rtc_writel(rtc, OMAP_RTC_KICK1_REG, KICK1_VALUE);
417 }
418
419 rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
420 &omap_rtc_ops, THIS_MODULE);
421 if (IS_ERR(rtc)) {
422 pr_debug("%s: can't register RTC device, err %ld\n",
423 pdev->name, PTR_ERR(rtc));
424 goto fail0;
425 } 533 }
426 platform_set_drvdata(pdev, rtc);
427 534
428 /* clear pending irqs, and set 1/second periodic, 535 /*
429 * which we'll use instead of update irqs 536 * disable interrupts
537 *
538 * NOTE: ALARM2 is not cleared on AM3352 if rtc_write (writeb) is used
430 */ 539 */
431 rtc_write(0, OMAP_RTC_INTERRUPTS_REG); 540 rtc_writel(rtc, OMAP_RTC_INTERRUPTS_REG, 0);
432 541
433 /* enable RTC functional clock */ 542 /* enable RTC functional clock */
434 if (id_entry->driver_data & OMAP_RTC_HAS_32KCLK_EN) 543 if (rtc->type->has_32kclk_en) {
435 rtc_writel(OMAP_RTC_OSC_32KCLK_EN, OMAP_RTC_OSC_REG); 544 reg = rtc_read(rtc, OMAP_RTC_OSC_REG);
545 rtc_writel(rtc, OMAP_RTC_OSC_REG,
546 reg | OMAP_RTC_OSC_32KCLK_EN);
547 }
436 548
437 /* clear old status */ 549 /* clear old status */
438 reg = rtc_read(OMAP_RTC_STATUS_REG); 550 reg = rtc_read(rtc, OMAP_RTC_STATUS_REG);
439 if (reg & (u8) OMAP_RTC_STATUS_POWER_UP) {
440 pr_info("%s: RTC power up reset detected\n",
441 pdev->name);
442 rtc_write(OMAP_RTC_STATUS_POWER_UP, OMAP_RTC_STATUS_REG);
443 }
444 if (reg & (u8) OMAP_RTC_STATUS_ALARM)
445 rtc_write(OMAP_RTC_STATUS_ALARM, OMAP_RTC_STATUS_REG);
446 551
447 /* handle periodic and alarm irqs */ 552 mask = OMAP_RTC_STATUS_ALARM;
448 if (devm_request_irq(&pdev->dev, omap_rtc_timer, rtc_irq, 0, 553
449 dev_name(&rtc->dev), rtc)) { 554 if (rtc->type->has_pmic_mode)
450 pr_debug("%s: RTC timer interrupt IRQ%d already claimed\n", 555 mask |= OMAP_RTC_STATUS_ALARM2;
451 pdev->name, omap_rtc_timer); 556
452 goto fail0; 557 if (rtc->type->has_power_up_reset) {
453 } 558 mask |= OMAP_RTC_STATUS_POWER_UP;
454 if ((omap_rtc_timer != omap_rtc_alarm) && 559 if (reg & OMAP_RTC_STATUS_POWER_UP)
455 (devm_request_irq(&pdev->dev, omap_rtc_alarm, rtc_irq, 0, 560 dev_info(&pdev->dev, "RTC power up reset detected\n");
456 dev_name(&rtc->dev), rtc))) {
457 pr_debug("%s: RTC alarm interrupt IRQ%d already claimed\n",
458 pdev->name, omap_rtc_alarm);
459 goto fail0;
460 } 561 }
461 562
563 if (reg & mask)
564 rtc_write(rtc, OMAP_RTC_STATUS_REG, reg & mask);
565
462 /* On boards with split power, RTC_ON_NOFF won't reset the RTC */ 566 /* On boards with split power, RTC_ON_NOFF won't reset the RTC */
463 reg = rtc_read(OMAP_RTC_CTRL_REG); 567 reg = rtc_read(rtc, OMAP_RTC_CTRL_REG);
464 if (reg & (u8) OMAP_RTC_CTRL_STOP) 568 if (reg & OMAP_RTC_CTRL_STOP)
465 pr_info("%s: already running\n", pdev->name); 569 dev_info(&pdev->dev, "already running\n");
466 570
467 /* force to 24 hour mode */ 571 /* force to 24 hour mode */
468 new_ctrl = reg & (OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP); 572 new_ctrl = reg & (OMAP_RTC_CTRL_SPLIT | OMAP_RTC_CTRL_AUTO_COMP);
469 new_ctrl |= OMAP_RTC_CTRL_STOP; 573 new_ctrl |= OMAP_RTC_CTRL_STOP;
470 574
471 /* BOARD-SPECIFIC CUSTOMIZATION CAN GO HERE: 575 /*
576 * BOARD-SPECIFIC CUSTOMIZATION CAN GO HERE:
472 * 577 *
473 * - Device wake-up capability setting should come through chip 578 * - Device wake-up capability setting should come through chip
474 * init logic. OMAP1 boards should initialize the "wakeup capable" 579 * init logic. OMAP1 boards should initialize the "wakeup capable"
@@ -482,36 +587,70 @@ static int __init omap_rtc_probe(struct platform_device *pdev)
482 * is write-only, and always reads as zero...) 587 * is write-only, and always reads as zero...)
483 */ 588 */
484 589
590 if (new_ctrl & OMAP_RTC_CTRL_SPLIT)
591 dev_info(&pdev->dev, "split power mode\n");
592
593 if (reg != new_ctrl)
594 rtc_write(rtc, OMAP_RTC_CTRL_REG, new_ctrl);
595
485 device_init_wakeup(&pdev->dev, true); 596 device_init_wakeup(&pdev->dev, true);
486 597
487 if (new_ctrl & (u8) OMAP_RTC_CTRL_SPLIT) 598 rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
488 pr_info("%s: split power mode\n", pdev->name); 599 &omap_rtc_ops, THIS_MODULE);
600 if (IS_ERR(rtc->rtc)) {
601 ret = PTR_ERR(rtc->rtc);
602 goto err;
603 }
489 604
490 if (reg != new_ctrl) 605 /* handle periodic and alarm irqs */
491 rtc_write(new_ctrl, OMAP_RTC_CTRL_REG); 606 ret = devm_request_irq(&pdev->dev, rtc->irq_timer, rtc_irq, 0,
607 dev_name(&rtc->rtc->dev), rtc);
608 if (ret)
609 goto err;
610
611 if (rtc->irq_timer != rtc->irq_alarm) {
612 ret = devm_request_irq(&pdev->dev, rtc->irq_alarm, rtc_irq, 0,
613 dev_name(&rtc->rtc->dev), rtc);
614 if (ret)
615 goto err;
616 }
617
618 if (rtc->is_pmic_controller) {
619 if (!pm_power_off) {
620 omap_rtc_power_off_rtc = rtc;
621 pm_power_off = omap_rtc_power_off;
622 }
623 }
492 624
493 return 0; 625 return 0;
494 626
495fail0: 627err:
496 if (id_entry->driver_data & OMAP_RTC_HAS_KICKER) 628 device_init_wakeup(&pdev->dev, false);
497 rtc_writel(0, OMAP_RTC_KICK0_REG); 629 if (rtc->type->has_kicker)
630 rtc_writel(rtc, OMAP_RTC_KICK0_REG, 0);
498 pm_runtime_put_sync(&pdev->dev); 631 pm_runtime_put_sync(&pdev->dev);
499 pm_runtime_disable(&pdev->dev); 632 pm_runtime_disable(&pdev->dev);
500 return -EIO; 633
634 return ret;
501} 635}
502 636
503static int __exit omap_rtc_remove(struct platform_device *pdev) 637static int __exit omap_rtc_remove(struct platform_device *pdev)
504{ 638{
505 const struct platform_device_id *id_entry = 639 struct omap_rtc *rtc = platform_get_drvdata(pdev);
506 platform_get_device_id(pdev); 640
641 if (pm_power_off == omap_rtc_power_off &&
642 omap_rtc_power_off_rtc == rtc) {
643 pm_power_off = NULL;
644 omap_rtc_power_off_rtc = NULL;
645 }
507 646
508 device_init_wakeup(&pdev->dev, 0); 647 device_init_wakeup(&pdev->dev, 0);
509 648
510 /* leave rtc running, but disable irqs */ 649 /* leave rtc running, but disable irqs */
511 rtc_write(0, OMAP_RTC_INTERRUPTS_REG); 650 rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, 0);
512 651
513 if (id_entry->driver_data & OMAP_RTC_HAS_KICKER) 652 if (rtc->type->has_kicker)
514 rtc_writel(0, OMAP_RTC_KICK0_REG); 653 rtc_writel(rtc, OMAP_RTC_KICK0_REG, 0);
515 654
516 /* Disable the clock/module */ 655 /* Disable the clock/module */
517 pm_runtime_put_sync(&pdev->dev); 656 pm_runtime_put_sync(&pdev->dev);
@@ -521,20 +660,21 @@ static int __exit omap_rtc_remove(struct platform_device *pdev)
521} 660}
522 661
523#ifdef CONFIG_PM_SLEEP 662#ifdef CONFIG_PM_SLEEP
524static u8 irqstat;
525
526static int omap_rtc_suspend(struct device *dev) 663static int omap_rtc_suspend(struct device *dev)
527{ 664{
528 irqstat = rtc_read(OMAP_RTC_INTERRUPTS_REG); 665 struct omap_rtc *rtc = dev_get_drvdata(dev);
529 666
530 /* FIXME the RTC alarm is not currently acting as a wakeup event 667 rtc->interrupts_reg = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG);
668
669 /*
670 * FIXME: the RTC alarm is not currently acting as a wakeup event
531 * source on some platforms, and in fact this enable() call is just 671 * source on some platforms, and in fact this enable() call is just
532 * saving a flag that's never used... 672 * saving a flag that's never used...
533 */ 673 */
534 if (device_may_wakeup(dev)) 674 if (device_may_wakeup(dev))
535 enable_irq_wake(omap_rtc_alarm); 675 enable_irq_wake(rtc->irq_alarm);
536 else 676 else
537 rtc_write(0, OMAP_RTC_INTERRUPTS_REG); 677 rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, 0);
538 678
539 /* Disable the clock/module */ 679 /* Disable the clock/module */
540 pm_runtime_put_sync(dev); 680 pm_runtime_put_sync(dev);
@@ -544,13 +684,15 @@ static int omap_rtc_suspend(struct device *dev)
544 684
545static int omap_rtc_resume(struct device *dev) 685static int omap_rtc_resume(struct device *dev)
546{ 686{
687 struct omap_rtc *rtc = dev_get_drvdata(dev);
688
547 /* Enable the clock/module so that we can access the registers */ 689 /* Enable the clock/module so that we can access the registers */
548 pm_runtime_get_sync(dev); 690 pm_runtime_get_sync(dev);
549 691
550 if (device_may_wakeup(dev)) 692 if (device_may_wakeup(dev))
551 disable_irq_wake(omap_rtc_alarm); 693 disable_irq_wake(rtc->irq_alarm);
552 else 694 else
553 rtc_write(irqstat, OMAP_RTC_INTERRUPTS_REG); 695 rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, rtc->interrupts_reg);
554 696
555 return 0; 697 return 0;
556} 698}
@@ -560,23 +702,32 @@ static SIMPLE_DEV_PM_OPS(omap_rtc_pm_ops, omap_rtc_suspend, omap_rtc_resume);
560 702
561static void omap_rtc_shutdown(struct platform_device *pdev) 703static void omap_rtc_shutdown(struct platform_device *pdev)
562{ 704{
563 rtc_write(0, OMAP_RTC_INTERRUPTS_REG); 705 struct omap_rtc *rtc = platform_get_drvdata(pdev);
706 u8 mask;
707
708 /*
709 * Keep the ALARM interrupt enabled to allow the system to power up on
710 * alarm events.
711 */
712 mask = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG);
713 mask &= OMAP_RTC_INTERRUPTS_IT_ALARM;
714 rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, mask);
564} 715}
565 716
566MODULE_ALIAS("platform:omap_rtc");
567static struct platform_driver omap_rtc_driver = { 717static struct platform_driver omap_rtc_driver = {
568 .remove = __exit_p(omap_rtc_remove), 718 .remove = __exit_p(omap_rtc_remove),
569 .shutdown = omap_rtc_shutdown, 719 .shutdown = omap_rtc_shutdown,
570 .driver = { 720 .driver = {
571 .name = DRIVER_NAME, 721 .name = "omap_rtc",
572 .owner = THIS_MODULE, 722 .owner = THIS_MODULE,
573 .pm = &omap_rtc_pm_ops, 723 .pm = &omap_rtc_pm_ops,
574 .of_match_table = omap_rtc_of_match, 724 .of_match_table = omap_rtc_of_match,
575 }, 725 },
576 .id_table = omap_rtc_devtype, 726 .id_table = omap_rtc_id_table,
577}; 727};
578 728
579module_platform_driver_probe(omap_rtc_driver, omap_rtc_probe); 729module_platform_driver_probe(omap_rtc_driver, omap_rtc_probe);
580 730
731MODULE_ALIAS("platform:omap_rtc");
581MODULE_AUTHOR("George G. Davis (and others)"); 732MODULE_AUTHOR("George G. Davis (and others)");
582MODULE_LICENSE("GPL"); 733MODULE_LICENSE("GPL");
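Among other things, the rtc-omap rework lets the RTC act as a system power controller: probe installs a pm_power_off handler only when no other handler is registered, and remove releases it only if it still points at this driver. That handoff pattern, reduced to a sketch with placeholder names:

/* Sketch of the pm_power_off claim/release pattern; names are placeholders. */
#include <linux/pm.h>
#include <linux/platform_device.h>

static struct platform_device *poweroff_pdev;

static void my_power_off(void)
{
        /* program the hardware via poweroff_pdev, then wait for power loss */
}

static int my_probe(struct platform_device *pdev)
{
        if (!pm_power_off) {                    /* don't steal an existing handler */
                poweroff_pdev = pdev;
                pm_power_off = my_power_off;
        }
        return 0;
}

static int my_remove(struct platform_device *pdev)
{
        if (pm_power_off == my_power_off && poweroff_pdev == pdev) {
                pm_power_off = NULL;            /* release only if still ours */
                poweroff_pdev = NULL;
        }
        return 0;
}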
diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c
index c2ef0a22ee94..96fb32e7d6f8 100644
--- a/drivers/rtc/rtc-pcf8563.c
+++ b/drivers/rtc/rtc-pcf8563.c
@@ -28,6 +28,7 @@
28#define PCF8563_REG_ST2 0x01 28#define PCF8563_REG_ST2 0x01
29#define PCF8563_BIT_AIE (1 << 1) 29#define PCF8563_BIT_AIE (1 << 1)
30#define PCF8563_BIT_AF (1 << 3) 30#define PCF8563_BIT_AF (1 << 3)
31#define PCF8563_BITS_ST2_N (7 << 5)
31 32
32#define PCF8563_REG_SC 0x02 /* datetime */ 33#define PCF8563_REG_SC 0x02 /* datetime */
33#define PCF8563_REG_MN 0x03 34#define PCF8563_REG_MN 0x03
@@ -41,6 +42,13 @@
41 42
42#define PCF8563_REG_CLKO 0x0D /* clock out */ 43#define PCF8563_REG_CLKO 0x0D /* clock out */
43#define PCF8563_REG_TMRC 0x0E /* timer control */ 44#define PCF8563_REG_TMRC 0x0E /* timer control */
45#define PCF8563_TMRC_ENABLE BIT(7)
46#define PCF8563_TMRC_4096 0
47#define PCF8563_TMRC_64 1
48#define PCF8563_TMRC_1 2
49#define PCF8563_TMRC_1_60 3
50#define PCF8563_TMRC_MASK 3
51
44#define PCF8563_REG_TMR 0x0F /* timer */ 52#define PCF8563_REG_TMR 0x0F /* timer */
45 53
46#define PCF8563_SC_LV 0x80 /* low voltage */ 54#define PCF8563_SC_LV 0x80 /* low voltage */
@@ -118,22 +126,21 @@ static int pcf8563_write_block_data(struct i2c_client *client,
118 126
119static int pcf8563_set_alarm_mode(struct i2c_client *client, bool on) 127static int pcf8563_set_alarm_mode(struct i2c_client *client, bool on)
120{ 128{
121 unsigned char buf[2]; 129 unsigned char buf;
122 int err; 130 int err;
123 131
124 err = pcf8563_read_block_data(client, PCF8563_REG_ST2, 1, buf + 1); 132 err = pcf8563_read_block_data(client, PCF8563_REG_ST2, 1, &buf);
125 if (err < 0) 133 if (err < 0)
126 return err; 134 return err;
127 135
128 if (on) 136 if (on)
129 buf[1] |= PCF8563_BIT_AIE; 137 buf |= PCF8563_BIT_AIE;
130 else 138 else
131 buf[1] &= ~PCF8563_BIT_AIE; 139 buf &= ~PCF8563_BIT_AIE;
132 140
133 buf[1] &= ~PCF8563_BIT_AF; 141 buf &= ~(PCF8563_BIT_AF | PCF8563_BITS_ST2_N);
134 buf[0] = PCF8563_REG_ST2;
135 142
136 err = pcf8563_write_block_data(client, PCF8563_REG_ST2, 1, buf + 1); 143 err = pcf8563_write_block_data(client, PCF8563_REG_ST2, 1, &buf);
137 if (err < 0) { 144 if (err < 0) {
138 dev_err(&client->dev, "%s: write error\n", __func__); 145 dev_err(&client->dev, "%s: write error\n", __func__);
139 return -EIO; 146 return -EIO;
@@ -336,8 +343,8 @@ static int pcf8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *tm)
336 __func__, buf[0], buf[1], buf[2], buf[3]); 343 __func__, buf[0], buf[1], buf[2], buf[3]);
337 344
338 tm->time.tm_min = bcd2bin(buf[0] & 0x7F); 345 tm->time.tm_min = bcd2bin(buf[0] & 0x7F);
339 tm->time.tm_hour = bcd2bin(buf[1] & 0x7F); 346 tm->time.tm_hour = bcd2bin(buf[1] & 0x3F);
340 tm->time.tm_mday = bcd2bin(buf[2] & 0x1F); 347 tm->time.tm_mday = bcd2bin(buf[2] & 0x3F);
341 tm->time.tm_wday = bcd2bin(buf[3] & 0x7); 348 tm->time.tm_wday = bcd2bin(buf[3] & 0x7);
342 tm->time.tm_mon = -1; 349 tm->time.tm_mon = -1;
343 tm->time.tm_year = -1; 350 tm->time.tm_year = -1;
@@ -361,6 +368,14 @@ static int pcf8563_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *tm)
361 struct i2c_client *client = to_i2c_client(dev); 368 struct i2c_client *client = to_i2c_client(dev);
362 unsigned char buf[4]; 369 unsigned char buf[4];
363 int err; 370 int err;
371 unsigned long alarm_time;
372
373 /* The alarm has no seconds, round up to nearest minute */
374 if (tm->time.tm_sec) {
375 rtc_tm_to_time(&tm->time, &alarm_time);
376 alarm_time += 60-tm->time.tm_sec;
377 rtc_time_to_tm(alarm_time, &tm->time);
378 }
364 379
365 dev_dbg(dev, "%s, min=%d hour=%d wday=%d mday=%d " 380 dev_dbg(dev, "%s, min=%d hour=%d wday=%d mday=%d "
366 "enabled=%d pending=%d\n", __func__, 381 "enabled=%d pending=%d\n", __func__,
@@ -381,6 +396,7 @@ static int pcf8563_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *tm)
381 396
382static int pcf8563_irq_enable(struct device *dev, unsigned int enabled) 397static int pcf8563_irq_enable(struct device *dev, unsigned int enabled)
383{ 398{
399 dev_dbg(dev, "%s: en=%d\n", __func__, enabled);
384 return pcf8563_set_alarm_mode(to_i2c_client(dev), !!enabled); 400 return pcf8563_set_alarm_mode(to_i2c_client(dev), !!enabled);
385} 401}
386 402
@@ -398,6 +414,8 @@ static int pcf8563_probe(struct i2c_client *client,
398{ 414{
399 struct pcf8563 *pcf8563; 415 struct pcf8563 *pcf8563;
400 int err; 416 int err;
417 unsigned char buf;
418 unsigned char alm_pending;
401 419
402 dev_dbg(&client->dev, "%s\n", __func__); 420 dev_dbg(&client->dev, "%s\n", __func__);
403 421
@@ -415,6 +433,22 @@ static int pcf8563_probe(struct i2c_client *client,
415 pcf8563->client = client; 433 pcf8563->client = client;
416 device_set_wakeup_capable(&client->dev, 1); 434 device_set_wakeup_capable(&client->dev, 1);
417 435
436 /* Set timer to lowest frequency to save power (ref Haoyu datasheet) */
437 buf = PCF8563_TMRC_1_60;
438 err = pcf8563_write_block_data(client, PCF8563_REG_TMRC, 1, &buf);
439 if (err < 0) {
440 dev_err(&client->dev, "%s: write error\n", __func__);
441 return err;
442 }
443
444 err = pcf8563_get_alarm_mode(client, NULL, &alm_pending);
445 if (err < 0) {
446 dev_err(&client->dev, "%s: read error\n", __func__);
447 return err;
448 }
449 if (alm_pending)
450 pcf8563_set_alarm_mode(client, 0);
451
418 pcf8563->rtc = devm_rtc_device_register(&client->dev, 452 pcf8563->rtc = devm_rtc_device_register(&client->dev,
419 pcf8563_driver.driver.name, 453 pcf8563_driver.driver.name,
420 &pcf8563_rtc_ops, THIS_MODULE); 454 &pcf8563_rtc_ops, THIS_MODULE);
@@ -435,6 +469,9 @@ static int pcf8563_probe(struct i2c_client *client,
435 469
436 } 470 }
437 471
472 /* the pcf8563 alarm only supports a minute accuracy */
473 pcf8563->rtc->uie_unsupported = 1;
474
438 return 0; 475 return 0;
439} 476}
440 477
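Because the PCF8563 alarm has no seconds field, the set_alarm path above rounds any requested time with a non-zero seconds value up to the next whole minute before programming the registers (and uie_unsupported is set to match). The rounding step, isolated as a sketch using the same rtc_tm_to_time()/rtc_time_to_tm() helpers the driver relies on:

#include <linux/rtc.h>

/* Round an alarm time up to the next whole minute (sketch). */
static void round_alarm_to_minute(struct rtc_time *tm)
{
        unsigned long t;

        if (!tm->tm_sec)
                return;                 /* already on a minute boundary */

        rtc_tm_to_time(tm, &t);         /* struct rtc_time -> seconds */
        t += 60 - tm->tm_sec;           /* jump to the next minute */
        rtc_time_to_tm(t, tm);          /* and back to broken-down time */
}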
diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c
index 76e38007ba90..d2ac6688e5c7 100644
--- a/drivers/rtc/rtc-sirfsoc.c
+++ b/drivers/rtc/rtc-sirfsoc.c
@@ -47,6 +47,7 @@ struct sirfsoc_rtc_drv {
47 unsigned irq_wake; 47 unsigned irq_wake;
48 /* Overflow for every 8 years extra time */ 48 /* Overflow for every 8 years extra time */
49 u32 overflow_rtc; 49 u32 overflow_rtc;
50 spinlock_t lock;
50#ifdef CONFIG_PM 51#ifdef CONFIG_PM
51 u32 saved_counter; 52 u32 saved_counter;
52 u32 saved_overflow_rtc; 53 u32 saved_overflow_rtc;
@@ -61,7 +62,7 @@ static int sirfsoc_rtc_read_alarm(struct device *dev,
61 62
62 rtcdrv = dev_get_drvdata(dev); 63 rtcdrv = dev_get_drvdata(dev);
63 64
64 local_irq_disable(); 65 spin_lock_irq(&rtcdrv->lock);
65 66
66 rtc_count = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN); 67 rtc_count = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN);
67 68
@@ -84,7 +85,8 @@ static int sirfsoc_rtc_read_alarm(struct device *dev,
84 if (sirfsoc_rtc_iobrg_readl( 85 if (sirfsoc_rtc_iobrg_readl(
85 rtcdrv->rtc_base + RTC_STATUS) & SIRFSOC_RTC_AL0E) 86 rtcdrv->rtc_base + RTC_STATUS) & SIRFSOC_RTC_AL0E)
86 alrm->enabled = 1; 87 alrm->enabled = 1;
87 local_irq_enable(); 88
89 spin_unlock_irq(&rtcdrv->lock);
88 90
89 return 0; 91 return 0;
90} 92}
@@ -99,7 +101,7 @@ static int sirfsoc_rtc_set_alarm(struct device *dev,
99 if (alrm->enabled) { 101 if (alrm->enabled) {
100 rtc_tm_to_time(&(alrm->time), &rtc_alarm); 102 rtc_tm_to_time(&(alrm->time), &rtc_alarm);
101 103
102 local_irq_disable(); 104 spin_lock_irq(&rtcdrv->lock);
103 105
104 rtc_status_reg = sirfsoc_rtc_iobrg_readl( 106 rtc_status_reg = sirfsoc_rtc_iobrg_readl(
105 rtcdrv->rtc_base + RTC_STATUS); 107 rtcdrv->rtc_base + RTC_STATUS);
@@ -123,14 +125,15 @@ static int sirfsoc_rtc_set_alarm(struct device *dev,
123 rtc_status_reg |= SIRFSOC_RTC_AL0E; 125 rtc_status_reg |= SIRFSOC_RTC_AL0E;
124 sirfsoc_rtc_iobrg_writel( 126 sirfsoc_rtc_iobrg_writel(
125 rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS); 127 rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS);
126 local_irq_enable(); 128
129 spin_unlock_irq(&rtcdrv->lock);
127 } else { 130 } else {
128 /* 131 /*
129 * if this function was called with enabled=0 132 * if this function was called with enabled=0
130 * then it could mean that the application is 133 * then it could mean that the application is
131 * trying to cancel an ongoing alarm 134 * trying to cancel an ongoing alarm
132 */ 135 */
133 local_irq_disable(); 136 spin_lock_irq(&rtcdrv->lock);
134 137
135 rtc_status_reg = sirfsoc_rtc_iobrg_readl( 138 rtc_status_reg = sirfsoc_rtc_iobrg_readl(
136 rtcdrv->rtc_base + RTC_STATUS); 139 rtcdrv->rtc_base + RTC_STATUS);
@@ -146,7 +149,7 @@ static int sirfsoc_rtc_set_alarm(struct device *dev,
146 rtcdrv->rtc_base + RTC_STATUS); 149 rtcdrv->rtc_base + RTC_STATUS);
147 } 150 }
148 151
149 local_irq_enable(); 152 spin_unlock_irq(&rtcdrv->lock);
150 } 153 }
151 154
152 return 0; 155 return 0;
@@ -209,12 +212,38 @@ static int sirfsoc_rtc_ioctl(struct device *dev, unsigned int cmd,
209 } 212 }
210} 213}
211 214
215static int sirfsoc_rtc_alarm_irq_enable(struct device *dev,
216 unsigned int enabled)
217{
218 unsigned long rtc_status_reg = 0x0;
219 struct sirfsoc_rtc_drv *rtcdrv;
220
221 rtcdrv = dev_get_drvdata(dev);
222
223 spin_lock_irq(&rtcdrv->lock);
224
225 rtc_status_reg = sirfsoc_rtc_iobrg_readl(
226 rtcdrv->rtc_base + RTC_STATUS);
227 if (enabled)
228 rtc_status_reg |= SIRFSOC_RTC_AL0E;
229 else
230 rtc_status_reg &= ~SIRFSOC_RTC_AL0E;
231
232 sirfsoc_rtc_iobrg_writel(rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS);
233
234 spin_unlock_irq(&rtcdrv->lock);
235
236 return 0;
237
238}
239
212static const struct rtc_class_ops sirfsoc_rtc_ops = { 240static const struct rtc_class_ops sirfsoc_rtc_ops = {
213 .read_time = sirfsoc_rtc_read_time, 241 .read_time = sirfsoc_rtc_read_time,
214 .set_time = sirfsoc_rtc_set_time, 242 .set_time = sirfsoc_rtc_set_time,
215 .read_alarm = sirfsoc_rtc_read_alarm, 243 .read_alarm = sirfsoc_rtc_read_alarm,
216 .set_alarm = sirfsoc_rtc_set_alarm, 244 .set_alarm = sirfsoc_rtc_set_alarm,
217 .ioctl = sirfsoc_rtc_ioctl 245 .ioctl = sirfsoc_rtc_ioctl,
246 .alarm_irq_enable = sirfsoc_rtc_alarm_irq_enable
218}; 247};
219 248
220static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata) 249static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata)
@@ -223,6 +252,8 @@ static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata)
223 unsigned long rtc_status_reg = 0x0; 252 unsigned long rtc_status_reg = 0x0;
224 unsigned long events = 0x0; 253 unsigned long events = 0x0;
225 254
255 spin_lock(&rtcdrv->lock);
256
226 rtc_status_reg = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_STATUS); 257 rtc_status_reg = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_STATUS);
227 /* this bit will be set ONLY if an alarm was active 258 /* this bit will be set ONLY if an alarm was active
228 * and it expired NOW 259 * and it expired NOW
@@ -240,6 +271,9 @@ static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata)
240 rtc_status_reg &= ~(SIRFSOC_RTC_AL0E); 271 rtc_status_reg &= ~(SIRFSOC_RTC_AL0E);
241 } 272 }
242 sirfsoc_rtc_iobrg_writel(rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS); 273 sirfsoc_rtc_iobrg_writel(rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS);
274
275 spin_unlock(&rtcdrv->lock);
276
243 /* this should wake up any apps polling/waiting on the read 277 /* this should wake up any apps polling/waiting on the read
244 * after setting the alarm 278 * after setting the alarm
245 */ 279 */
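
Replacing local_irq_disable()/local_irq_enable() with a driver-owned spinlock is the usual fix here: disabling interrupts only affects the local CPU and does nothing to stop another CPU from entering the same RTC_STATUS read-modify-write sequence, whereas the spinlock serializes all of them. Process-context paths take the lock with spin_lock_irq(), which also masks local interrupts, so the handler, which takes the bare spin_lock(), can never deadlock against a lock holder on the same CPU. A generic sketch of the pattern; sirfsoc_dev, reg_rmw and demo_irq are illustrative names, and plain readl()/writel() stand in for the iobrg accessors:

    #include <linux/interrupt.h>
    #include <linux/io.h>
    #include <linux/spinlock.h>

    struct sirfsoc_dev {
            spinlock_t lock;                /* protects RTC_STATUS read-modify-write */
            void __iomem *base;
    };

    static void reg_rmw(struct sirfsoc_dev *d, u32 set)     /* process context */
    {
            spin_lock_irq(&d->lock);        /* serializes CPUs and masks local IRQs */
            writel(readl(d->base) | set, d->base);
            spin_unlock_irq(&d->lock);
    }

    static irqreturn_t demo_irq(int irq, void *data)        /* hard IRQ context */
    {
            struct sirfsoc_dev *d = data;

            spin_lock(&d->lock);            /* IRQs are already disabled here */
            writel(readl(d->base) & ~0x1, d->base);
            spin_unlock(&d->lock);
            return IRQ_HANDLED;
    }
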
@@ -267,6 +301,8 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev)
267 if (rtcdrv == NULL) 301 if (rtcdrv == NULL)
268 return -ENOMEM; 302 return -ENOMEM;
269 303
304 spin_lock_init(&rtcdrv->lock);
305
270 err = of_property_read_u32(np, "reg", &rtcdrv->rtc_base); 306 err = of_property_read_u32(np, "reg", &rtcdrv->rtc_base);
271 if (err) { 307 if (err) {
272 dev_err(&pdev->dev, "unable to find base address of rtc node in dtb\n"); 308 dev_err(&pdev->dev, "unable to find base address of rtc node in dtb\n");
@@ -286,14 +322,6 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev)
286 rtc_div = ((32768 / RTC_HZ) / 2) - 1; 322 rtc_div = ((32768 / RTC_HZ) / 2) - 1;
287 sirfsoc_rtc_iobrg_writel(rtc_div, rtcdrv->rtc_base + RTC_DIV); 323 sirfsoc_rtc_iobrg_writel(rtc_div, rtcdrv->rtc_base + RTC_DIV);
288 324
289 rtcdrv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
290 &sirfsoc_rtc_ops, THIS_MODULE);
291 if (IS_ERR(rtcdrv->rtc)) {
292 err = PTR_ERR(rtcdrv->rtc);
293 dev_err(&pdev->dev, "can't register RTC device\n");
294 return err;
295 }
296
297 /* 0x3 -> RTC_CLK */ 325 /* 0x3 -> RTC_CLK */
298 sirfsoc_rtc_iobrg_writel(SIRFSOC_RTC_CLK, 326 sirfsoc_rtc_iobrg_writel(SIRFSOC_RTC_CLK,
299 rtcdrv->rtc_base + RTC_CLOCK_SWITCH); 327 rtcdrv->rtc_base + RTC_CLOCK_SWITCH);
@@ -308,6 +336,14 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev)
308 rtcdrv->overflow_rtc = 336 rtcdrv->overflow_rtc =
309 sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_SW_VALUE); 337 sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_SW_VALUE);
310 338
339 rtcdrv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
340 &sirfsoc_rtc_ops, THIS_MODULE);
341 if (IS_ERR(rtcdrv->rtc)) {
342 err = PTR_ERR(rtcdrv->rtc);
343 dev_err(&pdev->dev, "can't register RTC device\n");
344 return err;
345 }
346
311 rtcdrv->irq = platform_get_irq(pdev, 0); 347 rtcdrv->irq = platform_get_irq(pdev, 0);
312 err = devm_request_irq( 348 err = devm_request_irq(
313 &pdev->dev, 349 &pdev->dev,
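
Moving devm_rtc_device_register() after the divider, clock-switch and overflow-counter setup (and after the new spin_lock_init() earlier in probe) closes a window in which the RTC class device was already visible to the core and to user space while the hardware and the lock were still uninitialized. The safe ordering is the usual one: initialize software state, then hardware, then register, then accept interrupts. A skeletal probe illustrating that order; struct demo_rtc, demo_hw_init(), demo_rtc_ops and demo_irq() are assumed helpers, not part of the patch:

    static int demo_rtc_probe(struct platform_device *pdev)
    {
            struct demo_rtc *d;

            d = devm_kzalloc(&pdev->dev, sizeof(*d), GFP_KERNEL);
            if (!d)
                    return -ENOMEM;

            spin_lock_init(&d->lock);               /* 1. software state */
            demo_hw_init(d);                        /* 2. hardware state */

            d->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
                                              &demo_rtc_ops, THIS_MODULE);
            if (IS_ERR(d->rtc))                     /* 3. expose to the RTC core */
                    return PTR_ERR(d->rtc);

            d->irq = platform_get_irq(pdev, 0);
            if (d->irq < 0)
                    return d->irq;
            return devm_request_irq(&pdev->dev, d->irq, demo_irq, 0,
                                    pdev->name, d); /* 4. finally accept IRQs */
    }
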
diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c
index fa384fe28988..2cd8ffe5c698 100644
--- a/drivers/rtc/rtc-snvs.c
+++ b/drivers/rtc/rtc-snvs.c
@@ -17,6 +17,7 @@
17#include <linux/of_device.h> 17#include <linux/of_device.h>
18#include <linux/platform_device.h> 18#include <linux/platform_device.h>
19#include <linux/rtc.h> 19#include <linux/rtc.h>
20#include <linux/clk.h>
20 21
21/* These register offsets are relative to LP (Low Power) range */ 22/* These register offsets are relative to LP (Low Power) range */
22#define SNVS_LPCR 0x04 23#define SNVS_LPCR 0x04
@@ -39,6 +40,7 @@ struct snvs_rtc_data {
39 void __iomem *ioaddr; 40 void __iomem *ioaddr;
40 int irq; 41 int irq;
41 spinlock_t lock; 42 spinlock_t lock;
43 struct clk *clk;
42}; 44};
43 45
44static u32 rtc_read_lp_counter(void __iomem *ioaddr) 46static u32 rtc_read_lp_counter(void __iomem *ioaddr)
@@ -260,6 +262,18 @@ static int snvs_rtc_probe(struct platform_device *pdev)
260 if (data->irq < 0) 262 if (data->irq < 0)
261 return data->irq; 263 return data->irq;
262 264
265 data->clk = devm_clk_get(&pdev->dev, "snvs-rtc");
266 if (IS_ERR(data->clk)) {
267 data->clk = NULL;
268 } else {
269 ret = clk_prepare_enable(data->clk);
270 if (ret) {
271 dev_err(&pdev->dev,
272 "Could not prepare or enable the snvs clock\n");
273 return ret;
274 }
275 }
276
263 platform_set_drvdata(pdev, data); 277 platform_set_drvdata(pdev, data);
264 278
265 spin_lock_init(&data->lock); 279 spin_lock_init(&data->lock);
@@ -280,7 +294,7 @@ static int snvs_rtc_probe(struct platform_device *pdev)
280 if (ret) { 294 if (ret) {
281 dev_err(&pdev->dev, "failed to request irq %d: %d\n", 295 dev_err(&pdev->dev, "failed to request irq %d: %d\n",
282 data->irq, ret); 296 data->irq, ret);
283 return ret; 297 goto error_rtc_device_register;
284 } 298 }
285 299
286 data->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, 300 data->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
@@ -288,10 +302,16 @@ static int snvs_rtc_probe(struct platform_device *pdev)
288 if (IS_ERR(data->rtc)) { 302 if (IS_ERR(data->rtc)) {
289 ret = PTR_ERR(data->rtc); 303 ret = PTR_ERR(data->rtc);
290 dev_err(&pdev->dev, "failed to register rtc: %d\n", ret); 304 dev_err(&pdev->dev, "failed to register rtc: %d\n", ret);
291 return ret; 305 goto error_rtc_device_register;
292 } 306 }
293 307
294 return 0; 308 return 0;
309
310error_rtc_device_register:
311 if (data->clk)
312 clk_disable_unprepare(data->clk);
313
314 return ret;
295} 315}
296 316
297#ifdef CONFIG_PM_SLEEP 317#ifdef CONFIG_PM_SLEEP
@@ -302,21 +322,34 @@ static int snvs_rtc_suspend(struct device *dev)
302 if (device_may_wakeup(dev)) 322 if (device_may_wakeup(dev))
303 enable_irq_wake(data->irq); 323 enable_irq_wake(data->irq);
304 324
325 if (data->clk)
326 clk_disable_unprepare(data->clk);
327
305 return 0; 328 return 0;
306} 329}
307 330
308static int snvs_rtc_resume(struct device *dev) 331static int snvs_rtc_resume(struct device *dev)
309{ 332{
310 struct snvs_rtc_data *data = dev_get_drvdata(dev); 333 struct snvs_rtc_data *data = dev_get_drvdata(dev);
334 int ret;
311 335
312 if (device_may_wakeup(dev)) 336 if (device_may_wakeup(dev))
313 disable_irq_wake(data->irq); 337 disable_irq_wake(data->irq);
314 338
339 if (data->clk) {
340 ret = clk_prepare_enable(data->clk);
341 if (ret)
342 return ret;
343 }
344
315 return 0; 345 return 0;
316} 346}
317#endif 347#endif
318 348
319static SIMPLE_DEV_PM_OPS(snvs_rtc_pm_ops, snvs_rtc_suspend, snvs_rtc_resume); 349static const struct dev_pm_ops snvs_rtc_pm_ops = {
350 .suspend_noirq = snvs_rtc_suspend,
351 .resume_noirq = snvs_rtc_resume,
352};
320 353
321static const struct of_device_id snvs_dt_ids[] = { 354static const struct of_device_id snvs_dt_ids[] = {
322 { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, 355 { .compatible = "fsl,sec-v4.0-mon-rtc-lp", },
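
The clock is treated as optional: when devm_clk_get() fails the driver simply runs without one, and otherwise every clk_prepare_enable() is mirrored by clk_disable_unprepare() on the probe error path and in suspend, with a matching re-enable in resume. Running the callbacks at the _noirq phase means the RTC interrupt is already masked by the time the clock is gated, presumably so the alarm handler cannot run against a clock-gated register block. A compressed sketch of the pairing, assuming the "snvs-rtc" clock name from the hunk above:

    data->clk = devm_clk_get(&pdev->dev, "snvs-rtc");
    if (IS_ERR(data->clk)) {
            data->clk = NULL;               /* clock is optional on this SoC */
    } else {
            ret = clk_prepare_enable(data->clk);
            if (ret)
                    return ret;             /* nothing to undo yet */
    }

    /* any later failure path, and suspend_noirq: */
    if (data->clk)
            clk_disable_unprepare(data->clk);

    /* resume_noirq: clk_prepare_enable(data->clk) again, propagating its return value */
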
diff --git a/drivers/usb/storage/debug.c b/drivers/usb/storage/debug.c
index 66a684a29938..2d81e1d8ee30 100644
--- a/drivers/usb/storage/debug.c
+++ b/drivers/usb/storage/debug.c
@@ -188,7 +188,7 @@ int usb_stor_dbg(const struct us_data *us, const char *fmt, ...)
188 188
189 va_start(args, fmt); 189 va_start(args, fmt);
190 190
191 r = dev_vprintk_emit(7, &us->pusb_dev->dev, fmt, args); 191 r = dev_vprintk_emit(LOGLEVEL_DEBUG, &us->pusb_dev->dev, fmt, args);
192 192
193 va_end(args); 193 va_end(args);
194 194
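
Replacing the bare 7 with LOGLEVEL_DEBUG keeps the call equivalent while making the severity self-documenting; the numeric LOGLEVEL_* constants mirror the syslog levels, 0 (EMERG) through 7 (DEBUG), and live in include/linux/kern_levels.h next to the KERN_* string prefixes. In context:

    #include <linux/kern_levels.h>

    /* LOGLEVEL_DEBUG evaluates to 7, matching the old literal */
    r = dev_vprintk_emit(LOGLEVEL_DEBUG, &us->pusb_dev->dev, fmt, args);
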
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d8fc0605b9d2..3a6175fe10c0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1994,18 +1994,6 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
1994 shdr4extnum->sh_info = segs; 1994 shdr4extnum->sh_info = segs;
1995} 1995}
1996 1996
1997static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma,
1998 unsigned long mm_flags)
1999{
2000 struct vm_area_struct *vma;
2001 size_t size = 0;
2002
2003 for (vma = first_vma(current, gate_vma); vma != NULL;
2004 vma = next_vma(vma, gate_vma))
2005 size += vma_dump_size(vma, mm_flags);
2006 return size;
2007}
2008
2009/* 1997/*
2010 * Actual dumper 1998 * Actual dumper
2011 * 1999 *
@@ -2017,7 +2005,8 @@ static int elf_core_dump(struct coredump_params *cprm)
2017{ 2005{
2018 int has_dumped = 0; 2006 int has_dumped = 0;
2019 mm_segment_t fs; 2007 mm_segment_t fs;
2020 int segs; 2008 int segs, i;
2009 size_t vma_data_size = 0;
2021 struct vm_area_struct *vma, *gate_vma; 2010 struct vm_area_struct *vma, *gate_vma;
2022 struct elfhdr *elf = NULL; 2011 struct elfhdr *elf = NULL;
2023 loff_t offset = 0, dataoff; 2012 loff_t offset = 0, dataoff;
@@ -2026,6 +2015,7 @@ static int elf_core_dump(struct coredump_params *cprm)
2026 struct elf_shdr *shdr4extnum = NULL; 2015 struct elf_shdr *shdr4extnum = NULL;
2027 Elf_Half e_phnum; 2016 Elf_Half e_phnum;
2028 elf_addr_t e_shoff; 2017 elf_addr_t e_shoff;
2018 elf_addr_t *vma_filesz = NULL;
2029 2019
2030 /* 2020 /*
2031 * We no longer stop all VM operations. 2021 * We no longer stop all VM operations.
@@ -2093,7 +2083,20 @@ static int elf_core_dump(struct coredump_params *cprm)
2093 2083
2094 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 2084 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
2095 2085
2096 offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags); 2086 vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL);
2087 if (!vma_filesz)
2088 goto end_coredump;
2089
2090 for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
2091 vma = next_vma(vma, gate_vma)) {
2092 unsigned long dump_size;
2093
2094 dump_size = vma_dump_size(vma, cprm->mm_flags);
2095 vma_filesz[i++] = dump_size;
2096 vma_data_size += dump_size;
2097 }
2098
2099 offset += vma_data_size;
2097 offset += elf_core_extra_data_size(); 2100 offset += elf_core_extra_data_size();
2098 e_shoff = offset; 2101 e_shoff = offset;
2099 2102
@@ -2113,7 +2116,7 @@ static int elf_core_dump(struct coredump_params *cprm)
2113 goto end_coredump; 2116 goto end_coredump;
2114 2117
2115 /* Write program headers for segments dump */ 2118 /* Write program headers for segments dump */
2116 for (vma = first_vma(current, gate_vma); vma != NULL; 2119 for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
2117 vma = next_vma(vma, gate_vma)) { 2120 vma = next_vma(vma, gate_vma)) {
2118 struct elf_phdr phdr; 2121 struct elf_phdr phdr;
2119 2122
@@ -2121,7 +2124,7 @@ static int elf_core_dump(struct coredump_params *cprm)
2121 phdr.p_offset = offset; 2124 phdr.p_offset = offset;
2122 phdr.p_vaddr = vma->vm_start; 2125 phdr.p_vaddr = vma->vm_start;
2123 phdr.p_paddr = 0; 2126 phdr.p_paddr = 0;
2124 phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); 2127 phdr.p_filesz = vma_filesz[i++];
2125 phdr.p_memsz = vma->vm_end - vma->vm_start; 2128 phdr.p_memsz = vma->vm_end - vma->vm_start;
2126 offset += phdr.p_filesz; 2129 offset += phdr.p_filesz;
2127 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; 2130 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -2149,12 +2152,12 @@ static int elf_core_dump(struct coredump_params *cprm)
2149 if (!dump_skip(cprm, dataoff - cprm->written)) 2152 if (!dump_skip(cprm, dataoff - cprm->written))
2150 goto end_coredump; 2153 goto end_coredump;
2151 2154
2152 for (vma = first_vma(current, gate_vma); vma != NULL; 2155 for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
2153 vma = next_vma(vma, gate_vma)) { 2156 vma = next_vma(vma, gate_vma)) {
2154 unsigned long addr; 2157 unsigned long addr;
2155 unsigned long end; 2158 unsigned long end;
2156 2159
2157 end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags); 2160 end = vma->vm_start + vma_filesz[i++];
2158 2161
2159 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { 2162 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
2160 struct page *page; 2163 struct page *page;
@@ -2187,6 +2190,7 @@ end_coredump:
2187cleanup: 2190cleanup:
2188 free_note_info(&info); 2191 free_note_info(&info);
2189 kfree(shdr4extnum); 2192 kfree(shdr4extnum);
2193 kfree(vma_filesz);
2190 kfree(phdr4note); 2194 kfree(phdr4note);
2191 kfree(elf); 2195 kfree(elf);
2192out: 2196out:
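
The point of the new vma_filesz array is consistency: vma_dump_size() used to be evaluated three times per VMA (once to size the file, once per program header, once when writing the data), and if its answer changed in between, the program headers no longer described the data that followed. Computing each size exactly once and replaying it from the array removes that window, and kmalloc_array() refuses the allocation outright if the element count times the element size would overflow. A stripped-down view of the pattern, assuming an indexable list of regions purely for illustration:

    size_t total = 0;
    int i;
    unsigned long *sizes = kmalloc_array(nr_regions, sizeof(*sizes), GFP_KERNEL);

    if (!sizes)
            return -ENOMEM;

    for (i = 0; i < nr_regions; i++) {
            sizes[i] = region_dump_size(region[i]); /* evaluated exactly once */
            total += sizes[i];
    }

    /* header pass:  phdr.p_filesz = sizes[i];          */
    /* data pass:    end = region_start[i] + sizes[i];  */
    kfree(sizes);
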
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd8beb9657a2..70789e198dea 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -1,21 +1,14 @@
1/* 1/*
2 * binfmt_misc.c 2 * binfmt_misc.c
3 * 3 *
4 * Copyright (C) 1997 Richard Günther 4 * Copyright (C) 1997 Richard Günther
5 * 5 *
6 * binfmt_misc detects binaries via a magic or filename extension and invokes 6 * binfmt_misc detects binaries via a magic or filename extension and invokes
7 * a specified wrapper. This should obsolete binfmt_java, binfmt_em86 and 7 * a specified wrapper. See Documentation/binfmt_misc.txt for more details.
8 * binfmt_mz.
9 *
10 * 1997-04-25 first version
11 * [...]
12 * 1997-05-19 cleanup
13 * 1997-06-26 hpa: pass the real filename rather than argv[0]
14 * 1997-06-30 minor cleanup
15 * 1997-08-09 removed extension stripping, locking cleanup
16 * 2001-02-28 AV: rewritten into something that resembles C. Original didn't.
17 */ 8 */
18 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
19#include <linux/module.h> 12#include <linux/module.h>
20#include <linux/init.h> 13#include <linux/init.h>
21#include <linux/sched.h> 14#include <linux/sched.h>
@@ -30,8 +23,13 @@
30#include <linux/mount.h> 23#include <linux/mount.h>
31#include <linux/syscalls.h> 24#include <linux/syscalls.h>
32#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/uaccess.h>
33 27
34#include <asm/uaccess.h> 28#ifdef DEBUG
29# define USE_DEBUG 1
30#else
31# define USE_DEBUG 0
32#endif
35 33
36enum { 34enum {
37 VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ 35 VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
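
USE_DEBUG turns the preprocessor DEBUG switch into an ordinary C constant, so the verbose hex dumps added further down can sit inside a plain if (USE_DEBUG): the compiler still parses and type-checks them in every configuration, then discards them as dead code when DEBUG is not defined, which is harder to keep clean with #ifdef around each call site. In sketch form:

    #ifdef DEBUG
    # define USE_DEBUG 1
    #else
    # define USE_DEBUG 0
    #endif

    if (USE_DEBUG)      /* constant-folded; eliminated entirely when 0 */
            print_hex_dump_bytes(KBUILD_MODNAME ": magic: ",
                                 DUMP_PREFIX_NONE, e->magic, e->size);
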
@@ -41,9 +39,9 @@ static LIST_HEAD(entries);
41static int enabled = 1; 39static int enabled = 1;
42 40
43enum {Enabled, Magic}; 41enum {Enabled, Magic};
44#define MISC_FMT_PRESERVE_ARGV0 (1<<31) 42#define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
45#define MISC_FMT_OPEN_BINARY (1<<30) 43#define MISC_FMT_OPEN_BINARY (1 << 30)
46#define MISC_FMT_CREDENTIALS (1<<29) 44#define MISC_FMT_CREDENTIALS (1 << 29)
47 45
48typedef struct { 46typedef struct {
49 struct list_head list; 47 struct list_head list;
@@ -87,20 +85,24 @@ static Node *check_file(struct linux_binprm *bprm)
87 char *p = strrchr(bprm->interp, '.'); 85 char *p = strrchr(bprm->interp, '.');
88 struct list_head *l; 86 struct list_head *l;
89 87
88 /* Walk all the registered handlers. */
90 list_for_each(l, &entries) { 89 list_for_each(l, &entries) {
91 Node *e = list_entry(l, Node, list); 90 Node *e = list_entry(l, Node, list);
92 char *s; 91 char *s;
93 int j; 92 int j;
94 93
94 /* Make sure this one is currently enabled. */
95 if (!test_bit(Enabled, &e->flags)) 95 if (!test_bit(Enabled, &e->flags))
96 continue; 96 continue;
97 97
98 /* Do matching based on extension if applicable. */
98 if (!test_bit(Magic, &e->flags)) { 99 if (!test_bit(Magic, &e->flags)) {
99 if (p && !strcmp(e->magic, p + 1)) 100 if (p && !strcmp(e->magic, p + 1))
100 return e; 101 return e;
101 continue; 102 continue;
102 } 103 }
103 104
105 /* Do matching based on magic & mask. */
104 s = bprm->buf + e->offset; 106 s = bprm->buf + e->offset;
105 if (e->mask) { 107 if (e->mask) {
106 for (j = 0; j < e->size; j++) 108 for (j = 0; j < e->size; j++)
@@ -123,7 +125,7 @@ static Node *check_file(struct linux_binprm *bprm)
123static int load_misc_binary(struct linux_binprm *bprm) 125static int load_misc_binary(struct linux_binprm *bprm)
124{ 126{
125 Node *fmt; 127 Node *fmt;
126 struct file * interp_file = NULL; 128 struct file *interp_file = NULL;
127 char iname[BINPRM_BUF_SIZE]; 129 char iname[BINPRM_BUF_SIZE];
128 const char *iname_addr = iname; 130 const char *iname_addr = iname;
129 int retval; 131 int retval;
@@ -131,7 +133,7 @@ static int load_misc_binary(struct linux_binprm *bprm)
131 133
132 retval = -ENOEXEC; 134 retval = -ENOEXEC;
133 if (!enabled) 135 if (!enabled)
134 goto _ret; 136 goto ret;
135 137
136 /* to keep locking time low, we copy the interpreter string */ 138 /* to keep locking time low, we copy the interpreter string */
137 read_lock(&entries_lock); 139 read_lock(&entries_lock);
@@ -140,25 +142,26 @@ static int load_misc_binary(struct linux_binprm *bprm)
140 strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); 142 strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
141 read_unlock(&entries_lock); 143 read_unlock(&entries_lock);
142 if (!fmt) 144 if (!fmt)
143 goto _ret; 145 goto ret;
144 146
145 if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { 147 if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
146 retval = remove_arg_zero(bprm); 148 retval = remove_arg_zero(bprm);
147 if (retval) 149 if (retval)
148 goto _ret; 150 goto ret;
149 } 151 }
150 152
151 if (fmt->flags & MISC_FMT_OPEN_BINARY) { 153 if (fmt->flags & MISC_FMT_OPEN_BINARY) {
152 154
153 /* if the binary should be opened on behalf of the 155 /* if the binary should be opened on behalf of the
154 * interpreter than keep it open and assign descriptor 156 * interpreter than keep it open and assign descriptor
155 * to it */ 157 * to it
156 fd_binary = get_unused_fd(); 158 */
157 if (fd_binary < 0) { 159 fd_binary = get_unused_fd_flags(0);
158 retval = fd_binary; 160 if (fd_binary < 0) {
159 goto _ret; 161 retval = fd_binary;
160 } 162 goto ret;
161 fd_install(fd_binary, bprm->file); 163 }
164 fd_install(fd_binary, bprm->file);
162 165
163 /* if the binary is not readable than enforce mm->dumpable=0 166 /* if the binary is not readable than enforce mm->dumpable=0
164 regardless of the interpreter's permissions */ 167 regardless of the interpreter's permissions */
@@ -171,32 +174,32 @@ static int load_misc_binary(struct linux_binprm *bprm)
171 bprm->interp_flags |= BINPRM_FLAGS_EXECFD; 174 bprm->interp_flags |= BINPRM_FLAGS_EXECFD;
172 bprm->interp_data = fd_binary; 175 bprm->interp_data = fd_binary;
173 176
174 } else { 177 } else {
175 allow_write_access(bprm->file); 178 allow_write_access(bprm->file);
176 fput(bprm->file); 179 fput(bprm->file);
177 bprm->file = NULL; 180 bprm->file = NULL;
178 } 181 }
179 /* make argv[1] be the path to the binary */ 182 /* make argv[1] be the path to the binary */
180 retval = copy_strings_kernel (1, &bprm->interp, bprm); 183 retval = copy_strings_kernel(1, &bprm->interp, bprm);
181 if (retval < 0) 184 if (retval < 0)
182 goto _error; 185 goto error;
183 bprm->argc++; 186 bprm->argc++;
184 187
185 /* add the interp as argv[0] */ 188 /* add the interp as argv[0] */
186 retval = copy_strings_kernel (1, &iname_addr, bprm); 189 retval = copy_strings_kernel(1, &iname_addr, bprm);
187 if (retval < 0) 190 if (retval < 0)
188 goto _error; 191 goto error;
189 bprm->argc ++; 192 bprm->argc++;
190 193
191 /* Update interp in case binfmt_script needs it. */ 194 /* Update interp in case binfmt_script needs it. */
192 retval = bprm_change_interp(iname, bprm); 195 retval = bprm_change_interp(iname, bprm);
193 if (retval < 0) 196 if (retval < 0)
194 goto _error; 197 goto error;
195 198
196 interp_file = open_exec (iname); 199 interp_file = open_exec(iname);
197 retval = PTR_ERR (interp_file); 200 retval = PTR_ERR(interp_file);
198 if (IS_ERR (interp_file)) 201 if (IS_ERR(interp_file))
199 goto _error; 202 goto error;
200 203
201 bprm->file = interp_file; 204 bprm->file = interp_file;
202 if (fmt->flags & MISC_FMT_CREDENTIALS) { 205 if (fmt->flags & MISC_FMT_CREDENTIALS) {
@@ -207,23 +210,23 @@ static int load_misc_binary(struct linux_binprm *bprm)
207 memset(bprm->buf, 0, BINPRM_BUF_SIZE); 210 memset(bprm->buf, 0, BINPRM_BUF_SIZE);
208 retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); 211 retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
209 } else 212 } else
210 retval = prepare_binprm (bprm); 213 retval = prepare_binprm(bprm);
211 214
212 if (retval < 0) 215 if (retval < 0)
213 goto _error; 216 goto error;
214 217
215 retval = search_binary_handler(bprm); 218 retval = search_binary_handler(bprm);
216 if (retval < 0) 219 if (retval < 0)
217 goto _error; 220 goto error;
218 221
219_ret: 222ret:
220 return retval; 223 return retval;
221_error: 224error:
222 if (fd_binary > 0) 225 if (fd_binary > 0)
223 sys_close(fd_binary); 226 sys_close(fd_binary);
224 bprm->interp_flags = 0; 227 bprm->interp_flags = 0;
225 bprm->interp_data = 0; 228 bprm->interp_data = 0;
226 goto _ret; 229 goto ret;
227} 230}
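
get_unused_fd() took no flags argument and was on its way out; get_unused_fd_flags(0) is the direct replacement. The surrounding pattern is the standard two-step descriptor publication: reserve a descriptor number, then make the struct file reachable through it with fd_install(); if a later step fails, the descriptor has to be closed again, which is why the error path above still calls sys_close(fd_binary). A minimal sketch of the two steps:

    int fd = get_unused_fd_flags(0);        /* reserve a descriptor number */

    if (fd < 0)
            return fd;                      /* e.g. -EMFILE */
    fd_install(fd, file);                   /* publish: fd now refers to file */
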
228 231
229/* Command parsers */ 232/* Command parsers */
@@ -250,36 +253,40 @@ static char *scanarg(char *s, char del)
250 return s; 253 return s;
251} 254}
252 255
253static char * check_special_flags (char * sfs, Node * e) 256static char *check_special_flags(char *sfs, Node *e)
254{ 257{
255 char * p = sfs; 258 char *p = sfs;
256 int cont = 1; 259 int cont = 1;
257 260
258 /* special flags */ 261 /* special flags */
259 while (cont) { 262 while (cont) {
260 switch (*p) { 263 switch (*p) {
261 case 'P': 264 case 'P':
262 p++; 265 pr_debug("register: flag: P (preserve argv0)\n");
263 e->flags |= MISC_FMT_PRESERVE_ARGV0; 266 p++;
264 break; 267 e->flags |= MISC_FMT_PRESERVE_ARGV0;
265 case 'O': 268 break;
266 p++; 269 case 'O':
267 e->flags |= MISC_FMT_OPEN_BINARY; 270 pr_debug("register: flag: O (open binary)\n");
268 break; 271 p++;
269 case 'C': 272 e->flags |= MISC_FMT_OPEN_BINARY;
270 p++; 273 break;
271 /* this flags also implies the 274 case 'C':
272 open-binary flag */ 275 pr_debug("register: flag: C (preserve creds)\n");
273 e->flags |= (MISC_FMT_CREDENTIALS | 276 p++;
274 MISC_FMT_OPEN_BINARY); 277 /* this flags also implies the
275 break; 278 open-binary flag */
276 default: 279 e->flags |= (MISC_FMT_CREDENTIALS |
277 cont = 0; 280 MISC_FMT_OPEN_BINARY);
281 break;
282 default:
283 cont = 0;
278 } 284 }
279 } 285 }
280 286
281 return p; 287 return p;
282} 288}
289
283/* 290/*
284 * This registers a new binary format, it recognises the syntax 291 * This registers a new binary format, it recognises the syntax
285 * ':name:type:offset:magic:mask:interpreter:flags' 292 * ':name:type:offset:magic:mask:interpreter:flags'
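
For reference, the register string parsed below is normally written to /proc/sys/fs/binfmt_misc/register from user space; the trailing flags field takes the P, O and C letters handled by check_special_flags() above. A small illustrative program (not part of this patch) installing a magic-based handler, using the classic Wine example from Documentation/binfmt_misc.txt; the interpreter path is whatever the system actually provides:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* :name:type:offset:magic:mask:interpreter:flags
             * type 'M' matches by magic; an empty offset defaults to 0,
             * an empty mask compares all magic bytes, and no flags are set. */
            static const char rule[] = ":DOSWin:M::MZ::/usr/bin/wine:";
            int fd = open("/proc/sys/fs/binfmt_misc/register", O_WRONLY);

            if (fd < 0 || write(fd, rule, strlen(rule)) != (ssize_t)strlen(rule)) {
                    perror("binfmt_misc register");
                    return 1;
            }
            close(fd);
            return 0;
    }
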
@@ -292,6 +299,8 @@ static Node *create_entry(const char __user *buffer, size_t count)
292 char *buf, *p; 299 char *buf, *p;
293 char del; 300 char del;
294 301
302 pr_debug("register: received %zu bytes\n", count);
303
295 /* some sanity checks */ 304 /* some sanity checks */
296 err = -EINVAL; 305 err = -EINVAL;
297 if ((count < 11) || (count > MAX_REGISTER_LENGTH)) 306 if ((count < 11) || (count > MAX_REGISTER_LENGTH))
@@ -299,7 +308,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
299 308
300 err = -ENOMEM; 309 err = -ENOMEM;
301 memsize = sizeof(Node) + count + 8; 310 memsize = sizeof(Node) + count + 8;
302 e = kmalloc(memsize, GFP_USER); 311 e = kmalloc(memsize, GFP_KERNEL);
303 if (!e) 312 if (!e)
304 goto out; 313 goto out;
305 314
@@ -307,98 +316,175 @@ static Node *create_entry(const char __user *buffer, size_t count)
307 316
308 memset(e, 0, sizeof(Node)); 317 memset(e, 0, sizeof(Node));
309 if (copy_from_user(buf, buffer, count)) 318 if (copy_from_user(buf, buffer, count))
310 goto Efault; 319 goto efault;
311 320
312 del = *p++; /* delimeter */ 321 del = *p++; /* delimeter */
313 322
314 memset(buf+count, del, 8); 323 pr_debug("register: delim: %#x {%c}\n", del, del);
324
325 /* Pad the buffer with the delim to simplify parsing below. */
326 memset(buf + count, del, 8);
315 327
328 /* Parse the 'name' field. */
316 e->name = p; 329 e->name = p;
317 p = strchr(p, del); 330 p = strchr(p, del);
318 if (!p) 331 if (!p)
319 goto Einval; 332 goto einval;
320 *p++ = '\0'; 333 *p++ = '\0';
321 if (!e->name[0] || 334 if (!e->name[0] ||
322 !strcmp(e->name, ".") || 335 !strcmp(e->name, ".") ||
323 !strcmp(e->name, "..") || 336 !strcmp(e->name, "..") ||
324 strchr(e->name, '/')) 337 strchr(e->name, '/'))
325 goto Einval; 338 goto einval;
339
340 pr_debug("register: name: {%s}\n", e->name);
341
342 /* Parse the 'type' field. */
326 switch (*p++) { 343 switch (*p++) {
327 case 'E': e->flags = 1<<Enabled; break; 344 case 'E':
328 case 'M': e->flags = (1<<Enabled) | (1<<Magic); break; 345 pr_debug("register: type: E (extension)\n");
329 default: goto Einval; 346 e->flags = 1 << Enabled;
347 break;
348 case 'M':
349 pr_debug("register: type: M (magic)\n");
350 e->flags = (1 << Enabled) | (1 << Magic);
351 break;
352 default:
353 goto einval;
330 } 354 }
331 if (*p++ != del) 355 if (*p++ != del)
332 goto Einval; 356 goto einval;
357
333 if (test_bit(Magic, &e->flags)) { 358 if (test_bit(Magic, &e->flags)) {
334 char *s = strchr(p, del); 359 /* Handle the 'M' (magic) format. */
360 char *s;
361
362 /* Parse the 'offset' field. */
363 s = strchr(p, del);
335 if (!s) 364 if (!s)
336 goto Einval; 365 goto einval;
337 *s++ = '\0'; 366 *s++ = '\0';
338 e->offset = simple_strtoul(p, &p, 10); 367 e->offset = simple_strtoul(p, &p, 10);
339 if (*p++) 368 if (*p++)
340 goto Einval; 369 goto einval;
370 pr_debug("register: offset: %#x\n", e->offset);
371
372 /* Parse the 'magic' field. */
341 e->magic = p; 373 e->magic = p;
342 p = scanarg(p, del); 374 p = scanarg(p, del);
343 if (!p) 375 if (!p)
344 goto Einval; 376 goto einval;
345 p[-1] = '\0'; 377 p[-1] = '\0';
346 if (!e->magic[0]) 378 if (p == e->magic)
347 goto Einval; 379 goto einval;
380 if (USE_DEBUG)
381 print_hex_dump_bytes(
382 KBUILD_MODNAME ": register: magic[raw]: ",
383 DUMP_PREFIX_NONE, e->magic, p - e->magic);
384
385 /* Parse the 'mask' field. */
348 e->mask = p; 386 e->mask = p;
349 p = scanarg(p, del); 387 p = scanarg(p, del);
350 if (!p) 388 if (!p)
351 goto Einval; 389 goto einval;
352 p[-1] = '\0'; 390 p[-1] = '\0';
353 if (!e->mask[0]) 391 if (p == e->mask) {
354 e->mask = NULL; 392 e->mask = NULL;
393 pr_debug("register: mask[raw]: none\n");
394 } else if (USE_DEBUG)
395 print_hex_dump_bytes(
396 KBUILD_MODNAME ": register: mask[raw]: ",
397 DUMP_PREFIX_NONE, e->mask, p - e->mask);
398
399 /*
400 * Decode the magic & mask fields.
401 * Note: while we might have accepted embedded NUL bytes from
402 * above, the unescape helpers here will stop at the first one
403 * it encounters.
404 */
355 e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); 405 e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX);
356 if (e->mask && 406 if (e->mask &&
357 string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) 407 string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
358 goto Einval; 408 goto einval;
359 if (e->size + e->offset > BINPRM_BUF_SIZE) 409 if (e->size + e->offset > BINPRM_BUF_SIZE)
360 goto Einval; 410 goto einval;
411 pr_debug("register: magic/mask length: %i\n", e->size);
412 if (USE_DEBUG) {
413 print_hex_dump_bytes(
414 KBUILD_MODNAME ": register: magic[decoded]: ",
415 DUMP_PREFIX_NONE, e->magic, e->size);
416
417 if (e->mask) {
418 int i;
419 char *masked = kmalloc(e->size, GFP_KERNEL);
420
421 print_hex_dump_bytes(
422 KBUILD_MODNAME ": register: mask[decoded]: ",
423 DUMP_PREFIX_NONE, e->mask, e->size);
424
425 if (masked) {
426 for (i = 0; i < e->size; ++i)
427 masked[i] = e->magic[i] & e->mask[i];
428 print_hex_dump_bytes(
429 KBUILD_MODNAME ": register: magic[masked]: ",
430 DUMP_PREFIX_NONE, masked, e->size);
431
432 kfree(masked);
433 }
434 }
435 }
361 } else { 436 } else {
437 /* Handle the 'E' (extension) format. */
438
439 /* Skip the 'offset' field. */
362 p = strchr(p, del); 440 p = strchr(p, del);
363 if (!p) 441 if (!p)
364 goto Einval; 442 goto einval;
365 *p++ = '\0'; 443 *p++ = '\0';
444
445 /* Parse the 'magic' field. */
366 e->magic = p; 446 e->magic = p;
367 p = strchr(p, del); 447 p = strchr(p, del);
368 if (!p) 448 if (!p)
369 goto Einval; 449 goto einval;
370 *p++ = '\0'; 450 *p++ = '\0';
371 if (!e->magic[0] || strchr(e->magic, '/')) 451 if (!e->magic[0] || strchr(e->magic, '/'))
372 goto Einval; 452 goto einval;
453 pr_debug("register: extension: {%s}\n", e->magic);
454
455 /* Skip the 'mask' field. */
373 p = strchr(p, del); 456 p = strchr(p, del);
374 if (!p) 457 if (!p)
375 goto Einval; 458 goto einval;
376 *p++ = '\0'; 459 *p++ = '\0';
377 } 460 }
461
462 /* Parse the 'interpreter' field. */
378 e->interpreter = p; 463 e->interpreter = p;
379 p = strchr(p, del); 464 p = strchr(p, del);
380 if (!p) 465 if (!p)
381 goto Einval; 466 goto einval;
382 *p++ = '\0'; 467 *p++ = '\0';
383 if (!e->interpreter[0]) 468 if (!e->interpreter[0])
384 goto Einval; 469 goto einval;
385 470 pr_debug("register: interpreter: {%s}\n", e->interpreter);
386
387 p = check_special_flags (p, e);
388 471
472 /* Parse the 'flags' field. */
473 p = check_special_flags(p, e);
389 if (*p == '\n') 474 if (*p == '\n')
390 p++; 475 p++;
391 if (p != buf + count) 476 if (p != buf + count)
392 goto Einval; 477 goto einval;
478
393 return e; 479 return e;
394 480
395out: 481out:
396 return ERR_PTR(err); 482 return ERR_PTR(err);
397 483
398Efault: 484efault:
399 kfree(e); 485 kfree(e);
400 return ERR_PTR(-EFAULT); 486 return ERR_PTR(-EFAULT);
401Einval: 487einval:
402 kfree(e); 488 kfree(e);
403 return ERR_PTR(-EINVAL); 489 return ERR_PTR(-EINVAL);
404} 490}
@@ -417,7 +503,7 @@ static int parse_command(const char __user *buffer, size_t count)
417 return -EFAULT; 503 return -EFAULT;
418 if (!count) 504 if (!count)
419 return 0; 505 return 0;
420 if (s[count-1] == '\n') 506 if (s[count - 1] == '\n')
421 count--; 507 count--;
422 if (count == 1 && s[0] == '0') 508 if (count == 1 && s[0] == '0')
423 return 1; 509 return 1;
@@ -434,7 +520,7 @@ static void entry_status(Node *e, char *page)
434{ 520{
435 char *dp; 521 char *dp;
436 char *status = "disabled"; 522 char *status = "disabled";
437 const char * flags = "flags: "; 523 const char *flags = "flags: ";
438 524
439 if (test_bit(Enabled, &e->flags)) 525 if (test_bit(Enabled, &e->flags))
440 status = "enabled"; 526 status = "enabled";
@@ -448,19 +534,15 @@ static void entry_status(Node *e, char *page)
448 dp = page + strlen(page); 534 dp = page + strlen(page);
449 535
450 /* print the special flags */ 536 /* print the special flags */
451 sprintf (dp, "%s", flags); 537 sprintf(dp, "%s", flags);
452 dp += strlen (flags); 538 dp += strlen(flags);
453 if (e->flags & MISC_FMT_PRESERVE_ARGV0) { 539 if (e->flags & MISC_FMT_PRESERVE_ARGV0)
454 *dp ++ = 'P'; 540 *dp++ = 'P';
455 } 541 if (e->flags & MISC_FMT_OPEN_BINARY)
456 if (e->flags & MISC_FMT_OPEN_BINARY) { 542 *dp++ = 'O';
457 *dp ++ = 'O'; 543 if (e->flags & MISC_FMT_CREDENTIALS)
458 } 544 *dp++ = 'C';
459 if (e->flags & MISC_FMT_CREDENTIALS) { 545 *dp++ = '\n';
460 *dp ++ = 'C';
461 }
462 *dp ++ = '\n';
463
464 546
465 if (!test_bit(Magic, &e->flags)) { 547 if (!test_bit(Magic, &e->flags)) {
466 sprintf(dp, "extension .%s\n", e->magic); 548 sprintf(dp, "extension .%s\n", e->magic);
@@ -488,7 +570,7 @@ static void entry_status(Node *e, char *page)
488 570
489static struct inode *bm_get_inode(struct super_block *sb, int mode) 571static struct inode *bm_get_inode(struct super_block *sb, int mode)
490{ 572{
491 struct inode * inode = new_inode(sb); 573 struct inode *inode = new_inode(sb);
492 574
493 if (inode) { 575 if (inode) {
494 inode->i_ino = get_next_ino(); 576 inode->i_ino = get_next_ino();
@@ -528,13 +610,14 @@ static void kill_node(Node *e)
528/* /<entry> */ 610/* /<entry> */
529 611
530static ssize_t 612static ssize_t
531bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) 613bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
532{ 614{
533 Node *e = file_inode(file)->i_private; 615 Node *e = file_inode(file)->i_private;
534 ssize_t res; 616 ssize_t res;
535 char *page; 617 char *page;
536 618
537 if (!(page = (char*) __get_free_page(GFP_KERNEL))) 619 page = (char *) __get_free_page(GFP_KERNEL);
620 if (!page)
538 return -ENOMEM; 621 return -ENOMEM;
539 622
540 entry_status(e, page); 623 entry_status(e, page);
@@ -553,20 +636,28 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
553 int res = parse_command(buffer, count); 636 int res = parse_command(buffer, count);
554 637
555 switch (res) { 638 switch (res) {
556 case 1: clear_bit(Enabled, &e->flags); 639 case 1:
557 break; 640 /* Disable this handler. */
558 case 2: set_bit(Enabled, &e->flags); 641 clear_bit(Enabled, &e->flags);
559 break; 642 break;
560 case 3: root = dget(file->f_path.dentry->d_sb->s_root); 643 case 2:
561 mutex_lock(&root->d_inode->i_mutex); 644 /* Enable this handler. */
562 645 set_bit(Enabled, &e->flags);
563 kill_node(e); 646 break;
564 647 case 3:
565 mutex_unlock(&root->d_inode->i_mutex); 648 /* Delete this handler. */
566 dput(root); 649 root = dget(file->f_path.dentry->d_sb->s_root);
567 break; 650 mutex_lock(&root->d_inode->i_mutex);
568 default: return res; 651
652 kill_node(e);
653
654 mutex_unlock(&root->d_inode->i_mutex);
655 dput(root);
656 break;
657 default:
658 return res;
569 } 659 }
660
570 return count; 661 return count;
571} 662}
572 663
@@ -654,26 +745,36 @@ bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
654 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); 745 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
655} 746}
656 747
657static ssize_t bm_status_write(struct file * file, const char __user * buffer, 748static ssize_t bm_status_write(struct file *file, const char __user *buffer,
658 size_t count, loff_t *ppos) 749 size_t count, loff_t *ppos)
659{ 750{
660 int res = parse_command(buffer, count); 751 int res = parse_command(buffer, count);
661 struct dentry *root; 752 struct dentry *root;
662 753
663 switch (res) { 754 switch (res) {
664 case 1: enabled = 0; break; 755 case 1:
665 case 2: enabled = 1; break; 756 /* Disable all handlers. */
666 case 3: root = dget(file->f_path.dentry->d_sb->s_root); 757 enabled = 0;
667 mutex_lock(&root->d_inode->i_mutex); 758 break;
668 759 case 2:
669 while (!list_empty(&entries)) 760 /* Enable all handlers. */
670 kill_node(list_entry(entries.next, Node, list)); 761 enabled = 1;
671 762 break;
672 mutex_unlock(&root->d_inode->i_mutex); 763 case 3:
673 dput(root); 764 /* Delete all handlers. */
674 break; 765 root = dget(file->f_path.dentry->d_sb->s_root);
675 default: return res; 766 mutex_lock(&root->d_inode->i_mutex);
767
768 while (!list_empty(&entries))
769 kill_node(list_entry(entries.next, Node, list));
770
771 mutex_unlock(&root->d_inode->i_mutex);
772 dput(root);
773 break;
774 default:
775 return res;
676 } 776 }
777
677 return count; 778 return count;
678} 779}
679 780
@@ -690,14 +791,16 @@ static const struct super_operations s_ops = {
690 .evict_inode = bm_evict_inode, 791 .evict_inode = bm_evict_inode,
691}; 792};
692 793
693static int bm_fill_super(struct super_block * sb, void * data, int silent) 794static int bm_fill_super(struct super_block *sb, void *data, int silent)
694{ 795{
796 int err;
695 static struct tree_descr bm_files[] = { 797 static struct tree_descr bm_files[] = {
696 [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, 798 [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
697 [3] = {"register", &bm_register_operations, S_IWUSR}, 799 [3] = {"register", &bm_register_operations, S_IWUSR},
698 /* last one */ {""} 800 /* last one */ {""}
699 }; 801 };
700 int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); 802
803 err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
701 if (!err) 804 if (!err)
702 sb->s_op = &s_ops; 805 sb->s_op = &s_ops;
703 return err; 806 return err;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f77f7702fabe..67b2007f10fe 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -117,7 +117,6 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
117 goto out; 117 goto out;
118 } 118 }
119 major = i; 119 major = i;
120 ret = major;
121 } 120 }
122 121
123 cd->major = major; 122 cd->major = major;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6d00c419cbae..1ea780bc6376 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -38,7 +38,7 @@ static const struct cifs_sid sid_everyone = {
38 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; 38 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
39/* security id for Authenticated Users system group */ 39/* security id for Authenticated Users system group */
40static const struct cifs_sid sid_authusers = { 40static const struct cifs_sid sid_authusers = {
41 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} }; 41 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
42/* group users */ 42/* group users */
43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
44 44
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 61d00a6e398f..fa13d5e79f64 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2477,14 +2477,14 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
2477 } 2477 }
2478 parm_data = (struct cifs_posix_lock *) 2478 parm_data = (struct cifs_posix_lock *)
2479 ((char *)&pSMBr->hdr.Protocol + data_offset); 2479 ((char *)&pSMBr->hdr.Protocol + data_offset);
2480 if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) 2480 if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
2481 pLockData->fl_type = F_UNLCK; 2481 pLockData->fl_type = F_UNLCK;
2482 else { 2482 else {
2483 if (parm_data->lock_type == 2483 if (parm_data->lock_type ==
2484 __constant_cpu_to_le16(CIFS_RDLCK)) 2484 cpu_to_le16(CIFS_RDLCK))
2485 pLockData->fl_type = F_RDLCK; 2485 pLockData->fl_type = F_RDLCK;
2486 else if (parm_data->lock_type == 2486 else if (parm_data->lock_type ==
2487 __constant_cpu_to_le16(CIFS_WRLCK)) 2487 cpu_to_le16(CIFS_WRLCK))
2488 pLockData->fl_type = F_WRLCK; 2488 pLockData->fl_type = F_WRLCK;
2489 2489
2490 pLockData->fl_start = le64_to_cpu(parm_data->start); 2490 pLockData->fl_start = le64_to_cpu(parm_data->start);
@@ -3276,25 +3276,25 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
3276 pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); 3276 pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
3277 3277
3278 pSMB->TotalParameterCount = 0; 3278 pSMB->TotalParameterCount = 0;
3279 pSMB->TotalDataCount = __constant_cpu_to_le32(2); 3279 pSMB->TotalDataCount = cpu_to_le32(2);
3280 pSMB->MaxParameterCount = 0; 3280 pSMB->MaxParameterCount = 0;
3281 pSMB->MaxDataCount = 0; 3281 pSMB->MaxDataCount = 0;
3282 pSMB->MaxSetupCount = 4; 3282 pSMB->MaxSetupCount = 4;
3283 pSMB->Reserved = 0; 3283 pSMB->Reserved = 0;
3284 pSMB->ParameterOffset = 0; 3284 pSMB->ParameterOffset = 0;
3285 pSMB->DataCount = __constant_cpu_to_le32(2); 3285 pSMB->DataCount = cpu_to_le32(2);
3286 pSMB->DataOffset = 3286 pSMB->DataOffset =
3287 cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, 3287 cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
3288 compression_state) - 4); /* 84 */ 3288 compression_state) - 4); /* 84 */
3289 pSMB->SetupCount = 4; 3289 pSMB->SetupCount = 4;
3290 pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); 3290 pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
3291 pSMB->ParameterCount = 0; 3291 pSMB->ParameterCount = 0;
3292 pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); 3292 pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION);
3293 pSMB->IsFsctl = 1; /* FSCTL */ 3293 pSMB->IsFsctl = 1; /* FSCTL */
3294 pSMB->IsRootFlag = 0; 3294 pSMB->IsRootFlag = 0;
3295 pSMB->Fid = fid; /* file handle always le */ 3295 pSMB->Fid = fid; /* file handle always le */
3296 /* 3 byte pad, followed by 2 byte compress state */ 3296 /* 3 byte pad, followed by 2 byte compress state */
3297 pSMB->ByteCount = __constant_cpu_to_le16(5); 3297 pSMB->ByteCount = cpu_to_le16(5);
3298 inc_rfc1001_len(pSMB, 5); 3298 inc_rfc1001_len(pSMB, 5);
3299 3299
3300 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3300 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3430,10 +3430,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
3430 cifs_acl->version = cpu_to_le16(1); 3430 cifs_acl->version = cpu_to_le16(1);
3431 if (acl_type == ACL_TYPE_ACCESS) { 3431 if (acl_type == ACL_TYPE_ACCESS) {
3432 cifs_acl->access_entry_count = cpu_to_le16(count); 3432 cifs_acl->access_entry_count = cpu_to_le16(count);
3433 cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); 3433 cifs_acl->default_entry_count = cpu_to_le16(0xFFFF);
3434 } else if (acl_type == ACL_TYPE_DEFAULT) { 3434 } else if (acl_type == ACL_TYPE_DEFAULT) {
3435 cifs_acl->default_entry_count = cpu_to_le16(count); 3435 cifs_acl->default_entry_count = cpu_to_le16(count);
3436 cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); 3436 cifs_acl->access_entry_count = cpu_to_le16(0xFFFF);
3437 } else { 3437 } else {
3438 cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); 3438 cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
3439 return 0; 3439 return 0;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d535e168a9d3..96b7e9b7706d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1066,7 +1066,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
1066 1066
1067 max_num = (max_buf - sizeof(struct smb_hdr)) / 1067 max_num = (max_buf - sizeof(struct smb_hdr)) /
1068 sizeof(LOCKING_ANDX_RANGE); 1068 sizeof(LOCKING_ANDX_RANGE);
1069 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); 1069 buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
1070 if (!buf) { 1070 if (!buf) {
1071 free_xid(xid); 1071 free_xid(xid);
1072 return -ENOMEM; 1072 return -ENOMEM;
@@ -1401,7 +1401,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
1401 1401
1402 max_num = (max_buf - sizeof(struct smb_hdr)) / 1402 max_num = (max_buf - sizeof(struct smb_hdr)) /
1403 sizeof(LOCKING_ANDX_RANGE); 1403 sizeof(LOCKING_ANDX_RANGE);
1404 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); 1404 buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
1405 if (!buf) 1405 if (!buf)
1406 return -ENOMEM; 1406 return -ENOMEM;
1407 1407
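
Both call sites allocate max_num elements of LOCKING_ANDX_RANGE; spelling that as kcalloc() keeps the zeroing behaviour of kzalloc() but also makes the allocator reject the request if the element count times the element size would overflow, instead of quietly handing back a short buffer:

    /* same as kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), ...) minus the overflow risk */
    buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
    if (!buf)
            return -ENOMEM;
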
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 446cb7fb3f58..bce6fdcd5d48 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -46,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
46 CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, 46 CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
47 USHRT_MAX)); 47 USHRT_MAX));
48 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); 48 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
49 pSMB->req.VcNumber = __constant_cpu_to_le16(1); 49 pSMB->req.VcNumber = cpu_to_le16(1);
50 50
51 /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ 51 /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
52 52
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 45992944e238..7198eac5dddd 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -111,7 +111,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
111 return -EINVAL; 111 return -EINVAL;
112 112
113 max_num = max_buf / sizeof(struct smb2_lock_element); 113 max_num = max_buf / sizeof(struct smb2_lock_element);
114 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); 114 buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
115 if (!buf) 115 if (!buf)
116 return -ENOMEM; 116 return -ENOMEM;
117 117
@@ -247,7 +247,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
247 } 247 }
248 248
249 max_num = max_buf / sizeof(struct smb2_lock_element); 249 max_num = max_buf / sizeof(struct smb2_lock_element);
250 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); 250 buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
251 if (!buf) { 251 if (!buf) {
252 free_xid(xid); 252 free_xid(xid);
253 return -ENOMEM; 253 return -ENOMEM;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 1a08a34838fc..f1cefc9763ed 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -67,27 +67,27 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
67 * indexed by command in host byte order 67 * indexed by command in host byte order
68 */ 68 */
69static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { 69static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
70 /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65), 70 /* SMB2_NEGOTIATE */ cpu_to_le16(65),
71 /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9), 71 /* SMB2_SESSION_SETUP */ cpu_to_le16(9),
72 /* SMB2_LOGOFF */ __constant_cpu_to_le16(4), 72 /* SMB2_LOGOFF */ cpu_to_le16(4),
73 /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16), 73 /* SMB2_TREE_CONNECT */ cpu_to_le16(16),
74 /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4), 74 /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4),
75 /* SMB2_CREATE */ __constant_cpu_to_le16(89), 75 /* SMB2_CREATE */ cpu_to_le16(89),
76 /* SMB2_CLOSE */ __constant_cpu_to_le16(60), 76 /* SMB2_CLOSE */ cpu_to_le16(60),
77 /* SMB2_FLUSH */ __constant_cpu_to_le16(4), 77 /* SMB2_FLUSH */ cpu_to_le16(4),
78 /* SMB2_READ */ __constant_cpu_to_le16(17), 78 /* SMB2_READ */ cpu_to_le16(17),
79 /* SMB2_WRITE */ __constant_cpu_to_le16(17), 79 /* SMB2_WRITE */ cpu_to_le16(17),
80 /* SMB2_LOCK */ __constant_cpu_to_le16(4), 80 /* SMB2_LOCK */ cpu_to_le16(4),
81 /* SMB2_IOCTL */ __constant_cpu_to_le16(49), 81 /* SMB2_IOCTL */ cpu_to_le16(49),
82 /* BB CHECK this ... not listed in documentation */ 82 /* BB CHECK this ... not listed in documentation */
83 /* SMB2_CANCEL */ __constant_cpu_to_le16(0), 83 /* SMB2_CANCEL */ cpu_to_le16(0),
84 /* SMB2_ECHO */ __constant_cpu_to_le16(4), 84 /* SMB2_ECHO */ cpu_to_le16(4),
85 /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9), 85 /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9),
86 /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9), 86 /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9),
87 /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9), 87 /* SMB2_QUERY_INFO */ cpu_to_le16(9),
88 /* SMB2_SET_INFO */ __constant_cpu_to_le16(2), 88 /* SMB2_SET_INFO */ cpu_to_le16(2),
89 /* BB FIXME can also be 44 for lease break */ 89 /* BB FIXME can also be 44 for lease break */
90 /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24) 90 /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
91}; 91};
92 92
93int 93int
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 568f323665c8..93fd0586f9ec 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -600,7 +600,7 @@ smb2_clone_range(const unsigned int xid,
600 goto cchunk_out; 600 goto cchunk_out;
601 601
602 /* For now array only one chunk long, will make more flexible later */ 602 /* For now array only one chunk long, will make more flexible later */
603 pcchunk->ChunkCount = __constant_cpu_to_le32(1); 603 pcchunk->ChunkCount = cpu_to_le32(1);
604 pcchunk->Reserved = 0; 604 pcchunk->Reserved = 0;
605 pcchunk->Reserved2 = 0; 605 pcchunk->Reserved2 = 0;
606 606
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 0ca7f6364754..3417340bf89e 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1358,7 +1358,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
1358 char *ret_data = NULL; 1358 char *ret_data = NULL;
1359 1359
1360 fsctl_input.CompressionState = 1360 fsctl_input.CompressionState =
1361 __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); 1361 cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
1362 1362
1363 rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, 1363 rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
1364 FSCTL_SET_COMPRESSION, true /* is_fsctl */, 1364 FSCTL_SET_COMPRESSION, true /* is_fsctl */,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index d84f46c5b2c5..ce858477002a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -85,7 +85,7 @@
85/* BB FIXME - analyze following length BB */ 85/* BB FIXME - analyze following length BB */
86#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ 86#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
87 87
88#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe) 88#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
89 89
90/* 90/*
91 * SMB2 Header Definition 91 * SMB2 Header Definition
@@ -96,7 +96,7 @@
96 * 96 *
97 */ 97 */
98 98
99#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64) 99#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
100 100
101struct smb2_hdr { 101struct smb2_hdr {
102 __be32 smb2_buf_length; /* big endian on wire */ 102 __be32 smb2_buf_length; /* big endian on wire */
@@ -137,16 +137,16 @@ struct smb2_transform_hdr {
137} __packed; 137} __packed;
138 138
139/* Encryption Algorithms */ 139/* Encryption Algorithms */
140#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001) 140#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
141 141
142/* 142/*
143 * SMB2 flag definitions 143 * SMB2 flag definitions
144 */ 144 */
145#define SMB2_FLAGS_SERVER_TO_REDIR __constant_cpu_to_le32(0x00000001) 145#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
146#define SMB2_FLAGS_ASYNC_COMMAND __constant_cpu_to_le32(0x00000002) 146#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
147#define SMB2_FLAGS_RELATED_OPERATIONS __constant_cpu_to_le32(0x00000004) 147#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
148#define SMB2_FLAGS_SIGNED __constant_cpu_to_le32(0x00000008) 148#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
149#define SMB2_FLAGS_DFS_OPERATIONS __constant_cpu_to_le32(0x10000000) 149#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
150 150
151/* 151/*
152 * Definitions for SMB2 Protocol Data Units (network frames) 152 * Definitions for SMB2 Protocol Data Units (network frames)
@@ -157,7 +157,7 @@ struct smb2_transform_hdr {
157 * 157 *
158 */ 158 */
159 159
160#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9) 160#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
161 161
162struct smb2_err_rsp { 162struct smb2_err_rsp {
163 struct smb2_hdr hdr; 163 struct smb2_hdr hdr;
@@ -502,12 +502,12 @@ struct create_context {
502#define SMB2_LEASE_HANDLE_CACHING_HE 0x02 502#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
503#define SMB2_LEASE_WRITE_CACHING_HE 0x04 503#define SMB2_LEASE_WRITE_CACHING_HE 0x04
504 504
505#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) 505#define SMB2_LEASE_NONE cpu_to_le32(0x00)
506#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) 506#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01)
507#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) 507#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02)
508#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04) 508#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04)
509 509
510#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02) 510#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02)
511 511
512#define SMB2_LEASE_KEY_SIZE 16 512#define SMB2_LEASE_KEY_SIZE 16
513 513
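
The __constant_cpu_to_le16/32 spellings date from when the plain macros could not be used where a compile-time constant was required; cpu_to_le16() and friends now detect constant arguments themselves (via __builtin_constant_p) and fold to a constant expression, so the plain form works in static initializers and runtime code alike, and the sparse __le16/__le32 annotations are unchanged. For instance:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    #define SMB2_PROTO_NUMBER       cpu_to_le32(0x424d53fe) /* still a constant expression */

    static const __le16 rsp_size = cpu_to_le16(65);         /* usable in a static initializer */
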
diff --git a/fs/file.c b/fs/file.c
index ab3eb6a88239..ee738ea028fa 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -869,7 +869,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
869 struct file *file = fget_raw(fildes); 869 struct file *file = fget_raw(fildes);
870 870
871 if (file) { 871 if (file) {
872 ret = get_unused_fd(); 872 ret = get_unused_fd_flags(0);
873 if (ret >= 0) 873 if (ret >= 0)
874 fd_install(ret, file); 874 fd_install(ret, file);
875 else 875 else
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index ff0316b925a5..db458ee3a546 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -162,14 +162,16 @@ err2:
162 */ 162 */
163int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) 163int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2)
164{ 164{
165 int retval; 165 __be32 k1p, k2p;
166 166
167 retval = be32_to_cpu(key1->cat.ParID) - be32_to_cpu(key2->cat.ParID); 167 k1p = key1->cat.ParID;
168 if (!retval) 168 k2p = key2->cat.ParID;
169 retval = hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len,
170 key2->cat.CName.name, key2->cat.CName.len);
171 169
172 return retval; 170 if (k1p != k2p)
171 return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1;
172
173 return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len,
174 key2->cat.CName.name, key2->cat.CName.len);
173} 175}
174 176
175/* Try to get a catalog entry for given catalog id */ 177/* Try to get a catalog entry for given catalog id */
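
The old keycmp subtracted two 32-bit parent IDs and returned the difference as an int, which truncates and can wrap: for IDs that differ by more than INT_MAX the sign of the result, and therefore the B-tree ordering, comes out wrong. The rewrite does an explicit three-way comparison instead. A worked example of the failure, with illustrative values:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    /*
     * ParID(key1) = 0x00000001, ParID(key2) = 0x80000002:
     *   0x00000001 - 0x80000002 == 0x7fffffff (mod 2^32), positive as an int,
     * so the subtraction claims key1 > key2 although key1 < key2.
     */
    static int cat_parid_cmp(__be32 a, __be32 b)
    {
            if (a == b)
                    return 0;
            return be32_to_cpu(a) < be32_to_cpu(b) ? -1 : 1;
    }
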
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index d5659d96ee7f..cf7e043a9447 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -447,7 +447,6 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
447 result = -EIO; 447 result = -EIO;
448 } 448 }
449 } 449 }
450 result = 0;
451 } 450 }
452 mutex_unlock(&server->root_setup_lock); 451 mutex_unlock(&server->root_setup_lock);
453 452
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index e9e3325f29f3..3a03e0aea1fb 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -39,21 +39,15 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
39 */ 39 */
40 struct the_nilfs *nilfs; 40 struct the_nilfs *nilfs;
41 struct inode *inode = file->f_mapping->host; 41 struct inode *inode = file->f_mapping->host;
42 int err; 42 int err = 0;
43
44 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
45 if (err)
46 return err;
47 mutex_lock(&inode->i_mutex);
48 43
49 if (nilfs_inode_dirty(inode)) { 44 if (nilfs_inode_dirty(inode)) {
50 if (datasync) 45 if (datasync)
51 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 46 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
52 0, LLONG_MAX); 47 start, end);
53 else 48 else
54 err = nilfs_construct_segment(inode->i_sb); 49 err = nilfs_construct_segment(inode->i_sb);
55 } 50 }
56 mutex_unlock(&inode->i_mutex);
57 51
58 nilfs = inode->i_sb->s_fs_info; 52 nilfs = inode->i_sb->s_fs_info;
59 if (!err) 53 if (!err)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index e1fa69b341b9..8b5969538f39 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -49,6 +49,8 @@ struct nilfs_iget_args {
49 int for_gc; 49 int for_gc;
50}; 50};
51 51
52static int nilfs_iget_test(struct inode *inode, void *opaque);
53
52void nilfs_inode_add_blocks(struct inode *inode, int n) 54void nilfs_inode_add_blocks(struct inode *inode, int n)
53{ 55{
54 struct nilfs_root *root = NILFS_I(inode)->i_root; 56 struct nilfs_root *root = NILFS_I(inode)->i_root;
@@ -348,6 +350,17 @@ const struct address_space_operations nilfs_aops = {
348 .is_partially_uptodate = block_is_partially_uptodate, 350 .is_partially_uptodate = block_is_partially_uptodate,
349}; 351};
350 352
353static int nilfs_insert_inode_locked(struct inode *inode,
354 struct nilfs_root *root,
355 unsigned long ino)
356{
357 struct nilfs_iget_args args = {
358 .ino = ino, .root = root, .cno = 0, .for_gc = 0
359 };
360
361 return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
362}
363
351struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) 364struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
352{ 365{
353 struct super_block *sb = dir->i_sb; 366 struct super_block *sb = dir->i_sb;
@@ -383,7 +396,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
383 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 396 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
384 err = nilfs_bmap_read(ii->i_bmap, NULL); 397 err = nilfs_bmap_read(ii->i_bmap, NULL);
385 if (err < 0) 398 if (err < 0)
386 goto failed_bmap; 399 goto failed_after_creation;
387 400
388 set_bit(NILFS_I_BMAP, &ii->i_state); 401 set_bit(NILFS_I_BMAP, &ii->i_state);
389 /* No lock is needed; iget() ensures it. */ 402 /* No lock is needed; iget() ensures it. */
@@ -399,21 +412,24 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
399 spin_lock(&nilfs->ns_next_gen_lock); 412 spin_lock(&nilfs->ns_next_gen_lock);
400 inode->i_generation = nilfs->ns_next_generation++; 413 inode->i_generation = nilfs->ns_next_generation++;
401 spin_unlock(&nilfs->ns_next_gen_lock); 414 spin_unlock(&nilfs->ns_next_gen_lock);
402 insert_inode_hash(inode); 415 if (nilfs_insert_inode_locked(inode, root, ino) < 0) {
416 err = -EIO;
417 goto failed_after_creation;
418 }
403 419
404 err = nilfs_init_acl(inode, dir); 420 err = nilfs_init_acl(inode, dir);
405 if (unlikely(err)) 421 if (unlikely(err))
406 goto failed_acl; /* never occur. When supporting 422 goto failed_after_creation; /* never occur. When supporting
407 nilfs_init_acl(), proper cancellation of 423 nilfs_init_acl(), proper cancellation of
408 above jobs should be considered */ 424 above jobs should be considered */
409 425
410 return inode; 426 return inode;
411 427
412 failed_acl: 428 failed_after_creation:
413 failed_bmap:
414 clear_nlink(inode); 429 clear_nlink(inode);
430 unlock_new_inode(inode);
415 iput(inode); /* raw_inode will be deleted through 431 iput(inode); /* raw_inode will be deleted through
416 generic_delete_inode() */ 432 nilfs_evict_inode() */
417 goto failed; 433 goto failed;
418 434
419 failed_ifile_create_inode: 435 failed_ifile_create_inode:
@@ -461,8 +477,8 @@ int nilfs_read_inode_common(struct inode *inode,
461 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 477 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
462 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 478 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
463 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 479 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
464 if (inode->i_nlink == 0 && inode->i_mode == 0) 480 if (inode->i_nlink == 0)
465 return -EINVAL; /* this inode is deleted */ 481 return -ESTALE; /* this inode is deleted */
466 482
467 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); 483 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
468 ii->i_flags = le32_to_cpu(raw_inode->i_flags); 484 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 9de78f08989e..0f84b257932c 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -51,9 +51,11 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
51 int err = nilfs_add_link(dentry, inode); 51 int err = nilfs_add_link(dentry, inode);
52 if (!err) { 52 if (!err) {
53 d_instantiate(dentry, inode); 53 d_instantiate(dentry, inode);
54 unlock_new_inode(inode);
54 return 0; 55 return 0;
55 } 56 }
56 inode_dec_link_count(inode); 57 inode_dec_link_count(inode);
58 unlock_new_inode(inode);
57 iput(inode); 59 iput(inode);
58 return err; 60 return err;
59} 61}
@@ -182,6 +184,7 @@ out:
182out_fail: 184out_fail:
183 drop_nlink(inode); 185 drop_nlink(inode);
184 nilfs_mark_inode_dirty(inode); 186 nilfs_mark_inode_dirty(inode);
187 unlock_new_inode(inode);
185 iput(inode); 188 iput(inode);
186 goto out; 189 goto out;
187} 190}
@@ -201,11 +204,15 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
201 inode_inc_link_count(inode); 204 inode_inc_link_count(inode);
202 ihold(inode); 205 ihold(inode);
203 206
204 err = nilfs_add_nondir(dentry, inode); 207 err = nilfs_add_link(dentry, inode);
205 if (!err) 208 if (!err) {
209 d_instantiate(dentry, inode);
206 err = nilfs_transaction_commit(dir->i_sb); 210 err = nilfs_transaction_commit(dir->i_sb);
207 else 211 } else {
212 inode_dec_link_count(inode);
213 iput(inode);
208 nilfs_transaction_abort(dir->i_sb); 214 nilfs_transaction_abort(dir->i_sb);
215 }
209 216
210 return err; 217 return err;
211} 218}
@@ -243,6 +250,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
243 250
244 nilfs_mark_inode_dirty(inode); 251 nilfs_mark_inode_dirty(inode);
245 d_instantiate(dentry, inode); 252 d_instantiate(dentry, inode);
253 unlock_new_inode(inode);
246out: 254out:
247 if (!err) 255 if (!err)
248 err = nilfs_transaction_commit(dir->i_sb); 256 err = nilfs_transaction_commit(dir->i_sb);
@@ -255,6 +263,7 @@ out_fail:
255 drop_nlink(inode); 263 drop_nlink(inode);
256 drop_nlink(inode); 264 drop_nlink(inode);
257 nilfs_mark_inode_dirty(inode); 265 nilfs_mark_inode_dirty(inode);
266 unlock_new_inode(inode);
258 iput(inode); 267 iput(inode);
259out_dir: 268out_dir:
260 drop_nlink(dir); 269 drop_nlink(dir);
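
Note: the nilfs2 hunks above switch inode creation to the insert-locked pattern: the new inode is hashed with insert_inode_locked4() and stays in the I_NEW state until the caller publishes it, so every path (success or error) must end with unlock_new_inode(). A schematic kernel-style sketch of the create path this implies (illustrative only; the demo_* helpers are hypothetical, the VFS calls are the real ones used by the patch):

    static int demo_create(struct inode *dir, struct dentry *dentry, umode_t mode)
    {
            struct inode *inode;
            int err;

            inode = demo_new_inode(dir, mode);      /* returns a hashed, locked I_NEW inode */
            if (IS_ERR(inode))
                    return PTR_ERR(inode);

            err = demo_add_link(dentry, inode);     /* directory entry for the inode */
            if (err) {
                    inode_dec_link_count(inode);
                    unlock_new_inode(inode);        /* unlock before the final iput() */
                    iput(inode);
                    return err;
            }

            d_instantiate(dentry, inode);           /* publish the dentry first ...   */
            unlock_new_inode(inode);                /* ... then let lookups proceed */
            return 0;
    }
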
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 9da25fe9ea61..69bd801afb53 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -808,8 +808,7 @@ void nilfs_put_root(struct nilfs_root *root)
808 spin_lock(&nilfs->ns_cptree_lock); 808 spin_lock(&nilfs->ns_cptree_lock);
809 rb_erase(&root->rb_node, &nilfs->ns_cptree); 809 rb_erase(&root->rb_node, &nilfs->ns_cptree);
810 spin_unlock(&nilfs->ns_cptree_lock); 810 spin_unlock(&nilfs->ns_cptree_lock);
811 if (root->ifile) 811 iput(root->ifile);
812 iput(root->ifile);
813 812
814 kfree(root); 813 kfree(root);
815 } 814 }
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1ef547e49373..d9f222987f24 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1251,7 +1251,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1251 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1251 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1252 NULL); 1252 NULL);
1253 if (ret < 0) { 1253 if (ret < 0) {
1254 ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " 1254 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
1255 "at logical block %llu", 1255 "at logical block %llu",
1256 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1256 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1257 (unsigned long long)v_blkno); 1257 (unsigned long long)v_blkno);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index eb9d48746ab4..16eff45727ee 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1127,10 +1127,10 @@ static int o2hb_thread(void *data)
1127 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 1127 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
1128 1128
1129 mlog(ML_HEARTBEAT, 1129 mlog(ML_HEARTBEAT,
1130 "start = %lu.%lu, end = %lu.%lu, msec = %u\n", 1130 "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
1131 before_hb.tv_sec, (unsigned long) before_hb.tv_usec, 1131 before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
1132 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 1132 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
1133 elapsed_msec); 1133 elapsed_msec, ret);
1134 1134
1135 if (!kthread_should_stop() && 1135 if (!kthread_should_stop() &&
1136 elapsed_msec < reg->hr_timeout_ms) { 1136 elapsed_msec < reg->hr_timeout_ms) {
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a96044004064..2e355e0f8335 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1736,7 +1736,7 @@ static void o2net_connect_expired(struct work_struct *work)
1736 o2net_idle_timeout() / 1000, 1736 o2net_idle_timeout() / 1000,
1737 o2net_idle_timeout() % 1000); 1737 o2net_idle_timeout() % 1000);
1738 1738
1739 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1739 o2net_set_nn_state(nn, NULL, 0, 0);
1740 } 1740 }
1741 spin_unlock(&nn->nn_lock); 1741 spin_unlock(&nn->nn_lock);
1742} 1742}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c43d9b4a1ec0..79d56dc981bc 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -744,7 +744,7 @@ restart:
744 if (ocfs2_read_dir_block(dir, block, &bh, 0)) { 744 if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
745 /* read error, skip block & hope for the best. 745 /* read error, skip block & hope for the best.
746 * ocfs2_read_dir_block() has released the bh. */ 746 * ocfs2_read_dir_block() has released the bh. */
747 ocfs2_error(dir->i_sb, "reading directory %llu, " 747 mlog(ML_ERROR, "reading directory %llu, "
748 "offset %lu\n", 748 "offset %lu\n",
749 (unsigned long long)OCFS2_I(dir)->ip_blkno, 749 (unsigned long long)OCFS2_I(dir)->ip_blkno,
750 block); 750 block);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 02d315fef432..50a59d2337b2 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -877,7 +877,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
877 * to be put in someone's domain map. 877 * to be put in someone's domain map.
878 * Also, explicitly disallow joining at certain troublesome 878 * Also, explicitly disallow joining at certain troublesome
879 * times (ie. during recovery). */ 879 * times (ie. during recovery). */
880 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { 880 if (dlm->dlm_state != DLM_CTXT_LEAVING) {
881 int bit = query->node_idx; 881 int bit = query->node_idx;
882 spin_lock(&dlm->spinlock); 882 spin_lock(&dlm->spinlock);
883 883
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 215e41abf101..3689b3592042 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1460,6 +1460,18 @@ way_up_top:
1460 1460
1461 /* take care of the easy cases up front */ 1461 /* take care of the easy cases up front */
1462 spin_lock(&res->spinlock); 1462 spin_lock(&res->spinlock);
1463
1464 /*
1465 * Right after dlm spinlock was released, dlm_thread could have
1466 * purged the lockres. Check if lockres got unhashed. If so
1467 * start over.
1468 */
1469 if (hlist_unhashed(&res->hash_node)) {
1470 spin_unlock(&res->spinlock);
1471 dlm_lockres_put(res);
1472 goto way_up_top;
1473 }
1474
1463 if (res->state & (DLM_LOCK_RES_RECOVERING| 1475 if (res->state & (DLM_LOCK_RES_RECOVERING|
1464 DLM_LOCK_RES_MIGRATING)) { 1476 DLM_LOCK_RES_MIGRATING)) {
1465 spin_unlock(&res->spinlock); 1477 spin_unlock(&res->spinlock);
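
Note: the dlmmaster.c change above is the revalidate-after-relock pattern: the lockres was found under dlm->spinlock, that lock was dropped, and by the time res->spinlock is taken the resource may already have been purged, so the handler checks hlist_unhashed() and restarts rather than operating on a dead object. A condensed kernel-style sketch of that shape (only hlist_unhashed() and the spinlock calls are real; the demo_* names are illustrative):

    static struct demo_res *demo_find_live(struct demo_table *table, const char *name)
    {
            struct demo_res *res;

    retry:
            spin_lock(&table->lock);
            res = demo_lookup(table, name);         /* takes a reference on success */
            spin_unlock(&table->lock);
            if (!res)
                    return NULL;

            spin_lock(&res->lock);
            if (hlist_unhashed(&res->hash_node)) {  /* purged between the two locks */
                    spin_unlock(&res->lock);
                    demo_put(res);                  /* drop the now-stale reference */
                    goto retry;
            }
            return res;                             /* returned locked and referenced */
    }
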
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 3365839d2971..79b5af5e6a7b 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1656,14 +1656,18 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1656 req.namelen = res->lockname.len; 1656 req.namelen = res->lockname.len;
1657 memcpy(req.name, res->lockname.name, res->lockname.len); 1657 memcpy(req.name, res->lockname.name, res->lockname.len);
1658 1658
1659resend:
1659 ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, 1660 ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
1660 &req, sizeof(req), nodenum, &status); 1661 &req, sizeof(req), nodenum, &status);
1661 /* XXX: negative status not handled properly here. */
1662 if (ret < 0) 1662 if (ret < 0)
1663 mlog(ML_ERROR, "Error %d when sending message %u (key " 1663 mlog(ML_ERROR, "Error %d when sending message %u (key "
1664 "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, 1664 "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
1665 dlm->key, nodenum); 1665 dlm->key, nodenum);
1666 else { 1666 else if (status == -ENOMEM) {
1667 mlog_errno(status);
1668 msleep(50);
1669 goto resend;
1670 } else {
1667 BUG_ON(status < 0); 1671 BUG_ON(status < 0);
1668 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); 1672 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
1669 *real_master = (u8) (status & 0xff); 1673 *real_master = (u8) (status & 0xff);
@@ -1705,9 +1709,13 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
1705 int ret = dlm_dispatch_assert_master(dlm, res, 1709 int ret = dlm_dispatch_assert_master(dlm, res,
1706 0, 0, flags); 1710 0, 0, flags);
1707 if (ret < 0) { 1711 if (ret < 0) {
1708 mlog_errno(-ENOMEM); 1712 mlog_errno(ret);
1709 /* retry!? */ 1713 spin_unlock(&res->spinlock);
1710 BUG(); 1714 dlm_lockres_put(res);
1715 spin_unlock(&dlm->spinlock);
1716 dlm_put(dlm);
1717 /* sender will take care of this and retry */
1718 return ret;
1711 } else 1719 } else
1712 __dlm_lockres_grab_inflight_worker(dlm, res); 1720 __dlm_lockres_grab_inflight_worker(dlm, res);
1713 spin_unlock(&res->spinlock); 1721 spin_unlock(&res->spinlock);
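
Note: dlm_do_master_requery() now resends the network message when the remote node answered -ENOMEM instead of tripping the BUG_ON(status < 0). A user-space sketch of the same back-off-and-retry idea (bounded here for illustration, unlike the unbounded resend in the patch; demo_send() is hypothetical):

    #include <errno.h>
    #include <time.h>

    static int demo_send(int node, const void *req);   /* hypothetical transport call */

    static int send_with_retry(int node, const void *req, int max_tries)
    {
            struct timespec delay = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000 };
            int status;

            for (int i = 0; i < max_tries; i++) {
                    status = demo_send(node, req);
                    if (status != -ENOMEM)
                            return status;          /* success or a hard error */
                    nanosleep(&delay, NULL);        /* give the peer time to recover */
            }
            return -ENOMEM;
    }
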
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 37297c14f9a3..1c423af04c69 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -861,8 +861,13 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
861 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing 861 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
862 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from 862 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
863 * downconverting the lock before the upconvert has fully completed. 863 * downconverting the lock before the upconvert has fully completed.
864 * Do not prevent the dc thread from downconverting if NONBLOCK lock
865 * had already returned.
864 */ 866 */
865 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); 867 if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED))
868 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
869 else
870 lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED);
866 871
867 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 872 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
868} 873}
@@ -1324,13 +1329,12 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
1324 1329
1325/* returns 0 if the mw that was removed was already satisfied, -EBUSY 1330/* returns 0 if the mw that was removed was already satisfied, -EBUSY
1326 * if the mask still hadn't reached its goal */ 1331 * if the mask still hadn't reached its goal */
1327static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, 1332static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
1328 struct ocfs2_mask_waiter *mw) 1333 struct ocfs2_mask_waiter *mw)
1329{ 1334{
1330 unsigned long flags;
1331 int ret = 0; 1335 int ret = 0;
1332 1336
1333 spin_lock_irqsave(&lockres->l_lock, flags); 1337 assert_spin_locked(&lockres->l_lock);
1334 if (!list_empty(&mw->mw_item)) { 1338 if (!list_empty(&mw->mw_item)) {
1335 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 1339 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
1336 ret = -EBUSY; 1340 ret = -EBUSY;
@@ -1338,6 +1342,18 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
1338 list_del_init(&mw->mw_item); 1342 list_del_init(&mw->mw_item);
1339 init_completion(&mw->mw_complete); 1343 init_completion(&mw->mw_complete);
1340 } 1344 }
1345
1346 return ret;
1347}
1348
1349static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
1350 struct ocfs2_mask_waiter *mw)
1351{
1352 unsigned long flags;
1353 int ret = 0;
1354
1355 spin_lock_irqsave(&lockres->l_lock, flags);
1356 ret = __lockres_remove_mask_waiter(lockres, mw);
1341 spin_unlock_irqrestore(&lockres->l_lock, flags); 1357 spin_unlock_irqrestore(&lockres->l_lock, flags);
1342 1358
1343 return ret; 1359 return ret;
@@ -1373,6 +1389,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1373 unsigned long flags; 1389 unsigned long flags;
1374 unsigned int gen; 1390 unsigned int gen;
1375 int noqueue_attempted = 0; 1391 int noqueue_attempted = 0;
1392 int dlm_locked = 0;
1376 1393
1377 ocfs2_init_mask_waiter(&mw); 1394 ocfs2_init_mask_waiter(&mw);
1378 1395
@@ -1481,6 +1498,7 @@ again:
1481 ocfs2_recover_from_dlm_error(lockres, 1); 1498 ocfs2_recover_from_dlm_error(lockres, 1);
1482 goto out; 1499 goto out;
1483 } 1500 }
1501 dlm_locked = 1;
1484 1502
1485 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", 1503 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
1486 lockres->l_name); 1504 lockres->l_name);
@@ -1514,10 +1532,17 @@ out:
1514 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && 1532 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
1515 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { 1533 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
1516 wait = 0; 1534 wait = 0;
1517 if (lockres_remove_mask_waiter(lockres, &mw)) 1535 spin_lock_irqsave(&lockres->l_lock, flags);
1536 if (__lockres_remove_mask_waiter(lockres, &mw)) {
1537 if (dlm_locked)
1538 lockres_or_flags(lockres,
1539 OCFS2_LOCK_NONBLOCK_FINISHED);
1540 spin_unlock_irqrestore(&lockres->l_lock, flags);
1518 ret = -EAGAIN; 1541 ret = -EAGAIN;
1519 else 1542 } else {
1543 spin_unlock_irqrestore(&lockres->l_lock, flags);
1520 goto again; 1544 goto again;
1545 }
1521 } 1546 }
1522 if (wait) { 1547 if (wait) {
1523 ret = ocfs2_wait_for_mask(&mw); 1548 ret = ocfs2_wait_for_mask(&mw);
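
Note: the dlmglue.c refactor splits lockres_remove_mask_waiter() into a lock-taking wrapper and a __-prefixed helper that asserts the spinlock is already held (assert_spin_locked()), so __ocfs2_cluster_lock() can set OCFS2_LOCK_NONBLOCK_FINISHED inside the same critical section. A small user-space rendering of that convention using pthreads (demo types only):

    #include <pthread.h>

    struct counter {
            pthread_mutex_t lock;
            int value;
    };

    /* Caller must hold c->lock -- the userspace stand-in for assert_spin_locked(). */
    static int __counter_bump(struct counter *c)
    {
            c->value++;
            return c->value;
    }

    static int counter_bump(struct counter *c)
    {
            int v;

            pthread_mutex_lock(&c->lock);
            v = __counter_bump(c);
            pthread_mutex_unlock(&c->lock);
            return v;
    }

    /* A caller that must combine the bump with other work under one critical
     * section uses the __ variant directly, mirroring the patch. */
    static int counter_bump_and_flag(struct counter *c, int *flag)
    {
            int v;

            pthread_mutex_lock(&c->lock);
            v = __counter_bump(c);
            *flag = 1;                      /* done atomically with the bump */
            pthread_mutex_unlock(&c->lock);
            return v;
    }
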
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 324dc93ac896..69fb9f75b082 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2381,9 +2381,7 @@ out_dio:
2381 if (ret < 0) 2381 if (ret < 0)
2382 written = ret; 2382 written = ret;
2383 2383
2384 if (!ret && ((old_size != i_size_read(inode)) || 2384 if (!ret) {
2385 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2386 has_refcount)) {
2387 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2385 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2388 if (ret < 0) 2386 if (ret < 0)
2389 written = ret; 2387 written = ret;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 437de7f768c6..c8b25de9efbb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -540,8 +540,7 @@ bail:
540 if (status < 0) 540 if (status < 0)
541 make_bad_inode(inode); 541 make_bad_inode(inode);
542 542
543 if (args && bh) 543 brelse(bh);
544 brelse(bh);
545 544
546 return status; 545 return status;
547} 546}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 74caffeeee1d..56a768d06aa6 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -904,9 +904,6 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
904 struct buffer_head *di_bh = NULL; 904 struct buffer_head *di_bh = NULL;
905 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 905 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
906 906
907 if (!inode)
908 return -ENOENT;
909
910 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 907 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
911 return -EROFS; 908 return -EROFS;
912 909
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index bbec539230fd..7d6b7d090452 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -144,6 +144,12 @@ enum ocfs2_unlock_action {
144 * before the upconvert 144 * before the upconvert
145 * has completed */ 145 * has completed */
146 146
147#define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster
148 * lock has already
149 * returned, do not block
150 * dc thread from
151 * downconverting */
152
147struct ocfs2_lock_res_ops; 153struct ocfs2_lock_res_ops;
148 154
149typedef void (*ocfs2_lock_callback)(int status, unsigned long data); 155typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index a88b2a4fcc85..d5493e361a38 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -306,7 +306,7 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
306 assert_spin_locked(&osb->osb_lock); 306 assert_spin_locked(&osb->osb_lock);
307 307
308 BUG_ON(slot_num < 0); 308 BUG_ON(slot_num < 0);
309 BUG_ON(slot_num > osb->max_slots); 309 BUG_ON(slot_num >= osb->max_slots);
310 310
311 if (!si->si_slots[slot_num].sl_valid) 311 if (!si->si_slots[slot_num].sl_valid)
312 return -ENOENT; 312 return -ENOENT;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0945814ddb7b..83723179e1ec 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1629,8 +1629,9 @@ static int __init ocfs2_init(void)
1629 1629
1630 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1630 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1631 if (!ocfs2_debugfs_root) { 1631 if (!ocfs2_debugfs_root) {
1632 status = -EFAULT; 1632 status = -ENOMEM;
1633 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1633 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1634 goto out4;
1634 } 1635 }
1635 1636
1636 ocfs2_set_locking_protocol(); 1637 ocfs2_set_locking_protocol();
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 016f01df3825..662f8dee149f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1284,7 +1284,7 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
1284 return -EOPNOTSUPP; 1284 return -EOPNOTSUPP;
1285 1285
1286 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) 1286 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
1287 ret = -ENODATA; 1287 return -ENODATA;
1288 1288
1289 xis.inode_bh = xbs.inode_bh = di_bh; 1289 xis.inode_bh = xbs.inode_bh = di_bh;
1290 di = (struct ocfs2_dinode *)di_bh->b_data; 1290 di = (struct ocfs2_dinode *)di_bh->b_data;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cd3653e4f35c..bd117d065b82 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -157,20 +157,29 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
157 struct user_namespace *user_ns = seq_user_ns(m); 157 struct user_namespace *user_ns = seq_user_ns(m);
158 struct group_info *group_info; 158 struct group_info *group_info;
159 int g; 159 int g;
160 struct fdtable *fdt = NULL; 160 struct task_struct *tracer;
161 const struct cred *cred; 161 const struct cred *cred;
162 pid_t ppid, tpid; 162 pid_t ppid, tpid = 0, tgid, ngid;
163 unsigned int max_fds = 0;
163 164
164 rcu_read_lock(); 165 rcu_read_lock();
165 ppid = pid_alive(p) ? 166 ppid = pid_alive(p) ?
166 task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; 167 task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
167 tpid = 0; 168
168 if (pid_alive(p)) { 169 tracer = ptrace_parent(p);
169 struct task_struct *tracer = ptrace_parent(p); 170 if (tracer)
170 if (tracer) 171 tpid = task_pid_nr_ns(tracer, ns);
171 tpid = task_pid_nr_ns(tracer, ns); 172
172 } 173 tgid = task_tgid_nr_ns(p, ns);
174 ngid = task_numa_group_id(p);
173 cred = get_task_cred(p); 175 cred = get_task_cred(p);
176
177 task_lock(p);
178 if (p->files)
179 max_fds = files_fdtable(p->files)->max_fds;
180 task_unlock(p);
181 rcu_read_unlock();
182
174 seq_printf(m, 183 seq_printf(m,
175 "State:\t%s\n" 184 "State:\t%s\n"
176 "Tgid:\t%d\n" 185 "Tgid:\t%d\n"
@@ -179,12 +188,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
179 "PPid:\t%d\n" 188 "PPid:\t%d\n"
180 "TracerPid:\t%d\n" 189 "TracerPid:\t%d\n"
181 "Uid:\t%d\t%d\t%d\t%d\n" 190 "Uid:\t%d\t%d\t%d\t%d\n"
182 "Gid:\t%d\t%d\t%d\t%d\n", 191 "Gid:\t%d\t%d\t%d\t%d\n"
192 "FDSize:\t%d\nGroups:\t",
183 get_task_state(p), 193 get_task_state(p),
184 task_tgid_nr_ns(p, ns), 194 tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
185 task_numa_group_id(p),
186 pid_nr_ns(pid, ns),
187 ppid, tpid,
188 from_kuid_munged(user_ns, cred->uid), 195 from_kuid_munged(user_ns, cred->uid),
189 from_kuid_munged(user_ns, cred->euid), 196 from_kuid_munged(user_ns, cred->euid),
190 from_kuid_munged(user_ns, cred->suid), 197 from_kuid_munged(user_ns, cred->suid),
@@ -192,20 +199,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
192 from_kgid_munged(user_ns, cred->gid), 199 from_kgid_munged(user_ns, cred->gid),
193 from_kgid_munged(user_ns, cred->egid), 200 from_kgid_munged(user_ns, cred->egid),
194 from_kgid_munged(user_ns, cred->sgid), 201 from_kgid_munged(user_ns, cred->sgid),
195 from_kgid_munged(user_ns, cred->fsgid)); 202 from_kgid_munged(user_ns, cred->fsgid),
196 203 max_fds);
197 task_lock(p);
198 if (p->files)
199 fdt = files_fdtable(p->files);
200 seq_printf(m,
201 "FDSize:\t%d\n"
202 "Groups:\t",
203 fdt ? fdt->max_fds : 0);
204 rcu_read_unlock();
205 204
206 group_info = cred->group_info; 205 group_info = cred->group_info;
207 task_unlock(p);
208
209 for (g = 0; g < group_info->ngroups; g++) 206 for (g = 0; g < group_info->ngroups; g++)
210 seq_printf(m, "%d ", 207 seq_printf(m, "%d ",
211 from_kgid_munged(user_ns, GROUP_AT(group_info, g))); 208 from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
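
Note: task_state() now snapshots tgid, ngid, tracer pid and max_fds while the RCU read lock and task_lock() are held, drops both, and only then formats everything in a single seq_printf(); no formatting work remains inside the critical sections. A generic user-space sketch of the snapshot-then-format idea (a pthread mutex standing in for the kernel locks):

    #include <pthread.h>
    #include <stdio.h>

    struct task_info {
            pthread_mutex_t lock;
            int tgid, ppid, max_fds;
    };

    static void print_status(struct task_info *t, FILE *out)
    {
            int tgid, ppid, max_fds;

            /* Snapshot the fields under the lock... */
            pthread_mutex_lock(&t->lock);
            tgid = t->tgid;
            ppid = t->ppid;
            max_fds = t->max_fds;
            pthread_mutex_unlock(&t->lock);

            /* ...and do the (potentially slow) formatting after dropping it. */
            fprintf(out, "Tgid:\t%d\nPPid:\t%d\nFDSize:\t%d\n", tgid, ppid, max_fds);
    }
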
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 64891f3e41bd..590aeda5af12 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2618,6 +2618,9 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2618 dput(dentry); 2618 dput(dentry);
2619 } 2619 }
2620 2620
2621 if (pid == tgid)
2622 return;
2623
2621 name.name = buf; 2624 name.name = buf;
2622 name.len = snprintf(buf, sizeof(buf), "%d", tgid); 2625 name.len = snprintf(buf, sizeof(buf), "%d", tgid);
2623 leader = d_hash_and_lookup(mnt->mnt_root, &name); 2626 leader = d_hash_and_lookup(mnt->mnt_root, &name);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 317b72641ebf..7fea13229f33 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -31,9 +31,73 @@ static DEFINE_SPINLOCK(proc_subdir_lock);
31 31
32static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) 32static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
33{ 33{
34 if (de->namelen != len) 34 if (len < de->namelen)
35 return 0; 35 return -1;
36 return !memcmp(name, de->name, len); 36 if (len > de->namelen)
37 return 1;
38
39 return memcmp(name, de->name, len);
40}
41
42static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
43{
44 return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
45 subdir_node);
46}
47
48static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
49{
50 return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry,
51 subdir_node);
52}
53
54static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
55 const char *name,
56 unsigned int len)
57{
58 struct rb_node *node = dir->subdir.rb_node;
59
60 while (node) {
61 struct proc_dir_entry *de = container_of(node,
62 struct proc_dir_entry,
63 subdir_node);
64 int result = proc_match(len, name, de);
65
66 if (result < 0)
67 node = node->rb_left;
68 else if (result > 0)
69 node = node->rb_right;
70 else
71 return de;
72 }
73 return NULL;
74}
75
76static bool pde_subdir_insert(struct proc_dir_entry *dir,
77 struct proc_dir_entry *de)
78{
79 struct rb_root *root = &dir->subdir;
80 struct rb_node **new = &root->rb_node, *parent = NULL;
81
82 /* Figure out where to put new node */
83 while (*new) {
84 struct proc_dir_entry *this =
85 container_of(*new, struct proc_dir_entry, subdir_node);
86 int result = proc_match(de->namelen, de->name, this);
87
88 parent = *new;
89 if (result < 0)
90 new = &(*new)->rb_left;
91 else if (result > 0)
92 new = &(*new)->rb_right;
93 else
94 return false;
95 }
96
97 /* Add new node and rebalance tree. */
98 rb_link_node(&de->subdir_node, parent, new);
99 rb_insert_color(&de->subdir_node, root);
100 return true;
37} 101}
38 102
39static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) 103static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
@@ -92,10 +156,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
92 break; 156 break;
93 157
94 len = next - cp; 158 len = next - cp;
95 for (de = de->subdir; de ; de = de->next) { 159 de = pde_subdir_find(de, cp, len);
96 if (proc_match(len, cp, de))
97 break;
98 }
99 if (!de) { 160 if (!de) {
100 WARN(1, "name '%s'\n", name); 161 WARN(1, "name '%s'\n", name);
101 return -ENOENT; 162 return -ENOENT;
@@ -183,19 +244,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
183 struct inode *inode; 244 struct inode *inode;
184 245
185 spin_lock(&proc_subdir_lock); 246 spin_lock(&proc_subdir_lock);
186 for (de = de->subdir; de ; de = de->next) { 247 de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
187 if (de->namelen != dentry->d_name.len) 248 if (de) {
188 continue; 249 pde_get(de);
189 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { 250 spin_unlock(&proc_subdir_lock);
190 pde_get(de); 251 inode = proc_get_inode(dir->i_sb, de);
191 spin_unlock(&proc_subdir_lock); 252 if (!inode)
192 inode = proc_get_inode(dir->i_sb, de); 253 return ERR_PTR(-ENOMEM);
193 if (!inode) 254 d_set_d_op(dentry, &simple_dentry_operations);
194 return ERR_PTR(-ENOMEM); 255 d_add(dentry, inode);
195 d_set_d_op(dentry, &simple_dentry_operations); 256 return NULL;
196 d_add(dentry, inode);
197 return NULL;
198 }
199 } 257 }
200 spin_unlock(&proc_subdir_lock); 258 spin_unlock(&proc_subdir_lock);
201 return ERR_PTR(-ENOENT); 259 return ERR_PTR(-ENOENT);
@@ -225,7 +283,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
225 return 0; 283 return 0;
226 284
227 spin_lock(&proc_subdir_lock); 285 spin_lock(&proc_subdir_lock);
228 de = de->subdir; 286 de = pde_subdir_first(de);
229 i = ctx->pos - 2; 287 i = ctx->pos - 2;
230 for (;;) { 288 for (;;) {
231 if (!de) { 289 if (!de) {
@@ -234,7 +292,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
234 } 292 }
235 if (!i) 293 if (!i)
236 break; 294 break;
237 de = de->next; 295 de = pde_subdir_next(de);
238 i--; 296 i--;
239 } 297 }
240 298
@@ -249,7 +307,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
249 } 307 }
250 spin_lock(&proc_subdir_lock); 308 spin_lock(&proc_subdir_lock);
251 ctx->pos++; 309 ctx->pos++;
252 next = de->next; 310 next = pde_subdir_next(de);
253 pde_put(de); 311 pde_put(de);
254 de = next; 312 de = next;
255 } while (de); 313 } while (de);
@@ -286,9 +344,8 @@ static const struct inode_operations proc_dir_inode_operations = {
286 344
287static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) 345static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
288{ 346{
289 struct proc_dir_entry *tmp;
290 int ret; 347 int ret;
291 348
292 ret = proc_alloc_inum(&dp->low_ino); 349 ret = proc_alloc_inum(&dp->low_ino);
293 if (ret) 350 if (ret)
294 return ret; 351 return ret;
@@ -304,21 +361,21 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
304 dp->proc_iops = &proc_file_inode_operations; 361 dp->proc_iops = &proc_file_inode_operations;
305 } else { 362 } else {
306 WARN_ON(1); 363 WARN_ON(1);
364 proc_free_inum(dp->low_ino);
307 return -EINVAL; 365 return -EINVAL;
308 } 366 }
309 367
310 spin_lock(&proc_subdir_lock); 368 spin_lock(&proc_subdir_lock);
311
312 for (tmp = dir->subdir; tmp; tmp = tmp->next)
313 if (strcmp(tmp->name, dp->name) == 0) {
314 WARN(1, "proc_dir_entry '%s/%s' already registered\n",
315 dir->name, dp->name);
316 break;
317 }
318
319 dp->next = dir->subdir;
320 dp->parent = dir; 369 dp->parent = dir;
321 dir->subdir = dp; 370 if (pde_subdir_insert(dir, dp) == false) {
371 WARN(1, "proc_dir_entry '%s/%s' already registered\n",
372 dir->name, dp->name);
373 spin_unlock(&proc_subdir_lock);
374 if (S_ISDIR(dp->mode))
375 dir->nlink--;
376 proc_free_inum(dp->low_ino);
377 return -EEXIST;
378 }
322 spin_unlock(&proc_subdir_lock); 379 spin_unlock(&proc_subdir_lock);
323 380
324 return 0; 381 return 0;
@@ -354,6 +411,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
354 ent->namelen = qstr.len; 411 ent->namelen = qstr.len;
355 ent->mode = mode; 412 ent->mode = mode;
356 ent->nlink = nlink; 413 ent->nlink = nlink;
414 ent->subdir = RB_ROOT;
357 atomic_set(&ent->count, 1); 415 atomic_set(&ent->count, 1);
358 spin_lock_init(&ent->pde_unload_lock); 416 spin_lock_init(&ent->pde_unload_lock);
359 INIT_LIST_HEAD(&ent->pde_openers); 417 INIT_LIST_HEAD(&ent->pde_openers);
@@ -485,7 +543,6 @@ void pde_put(struct proc_dir_entry *pde)
485 */ 543 */
486void remove_proc_entry(const char *name, struct proc_dir_entry *parent) 544void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
487{ 545{
488 struct proc_dir_entry **p;
489 struct proc_dir_entry *de = NULL; 546 struct proc_dir_entry *de = NULL;
490 const char *fn = name; 547 const char *fn = name;
491 unsigned int len; 548 unsigned int len;
@@ -497,14 +554,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
497 } 554 }
498 len = strlen(fn); 555 len = strlen(fn);
499 556
500 for (p = &parent->subdir; *p; p=&(*p)->next ) { 557 de = pde_subdir_find(parent, fn, len);
501 if (proc_match(len, fn, *p)) { 558 if (de)
502 de = *p; 559 rb_erase(&de->subdir_node, &parent->subdir);
503 *p = de->next;
504 de->next = NULL;
505 break;
506 }
507 }
508 spin_unlock(&proc_subdir_lock); 560 spin_unlock(&proc_subdir_lock);
509 if (!de) { 561 if (!de) {
510 WARN(1, "name '%s'\n", name); 562 WARN(1, "name '%s'\n", name);
@@ -516,16 +568,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
516 if (S_ISDIR(de->mode)) 568 if (S_ISDIR(de->mode))
517 parent->nlink--; 569 parent->nlink--;
518 de->nlink = 0; 570 de->nlink = 0;
519 WARN(de->subdir, "%s: removing non-empty directory " 571 WARN(pde_subdir_first(de),
520 "'%s/%s', leaking at least '%s'\n", __func__, 572 "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
521 de->parent->name, de->name, de->subdir->name); 573 __func__, de->parent->name, de->name, pde_subdir_first(de)->name);
522 pde_put(de); 574 pde_put(de);
523} 575}
524EXPORT_SYMBOL(remove_proc_entry); 576EXPORT_SYMBOL(remove_proc_entry);
525 577
526int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) 578int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
527{ 579{
528 struct proc_dir_entry **p;
529 struct proc_dir_entry *root = NULL, *de, *next; 580 struct proc_dir_entry *root = NULL, *de, *next;
530 const char *fn = name; 581 const char *fn = name;
531 unsigned int len; 582 unsigned int len;
@@ -537,24 +588,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
537 } 588 }
538 len = strlen(fn); 589 len = strlen(fn);
539 590
540 for (p = &parent->subdir; *p; p=&(*p)->next ) { 591 root = pde_subdir_find(parent, fn, len);
541 if (proc_match(len, fn, *p)) {
542 root = *p;
543 *p = root->next;
544 root->next = NULL;
545 break;
546 }
547 }
548 if (!root) { 592 if (!root) {
549 spin_unlock(&proc_subdir_lock); 593 spin_unlock(&proc_subdir_lock);
550 return -ENOENT; 594 return -ENOENT;
551 } 595 }
596 rb_erase(&root->subdir_node, &parent->subdir);
597
552 de = root; 598 de = root;
553 while (1) { 599 while (1) {
554 next = de->subdir; 600 next = pde_subdir_first(de);
555 if (next) { 601 if (next) {
556 de->subdir = next->next; 602 rb_erase(&next->subdir_node, &de->subdir);
557 next->next = NULL;
558 de = next; 603 de = next;
559 continue; 604 continue;
560 } 605 }
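
Note: proc_match() now returns a signed three-way result, ordering entries by name length first and falling back to memcmp() only when the lengths are equal; pde_subdir_find() and pde_subdir_insert() must agree on this total order for the subdir rbtree to stay consistent. A user-space illustration of that comparator, exercised with qsort() (demo types only, not the proc_dir_entry layout):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct demo_entry {
            const char *name;
            unsigned int namelen;
    };

    /* Length first, then byte order: cheap, and a valid total order as long as
     * every lookup and every insert uses exactly the same rule. */
    static int demo_match(const void *a, const void *b)
    {
            const struct demo_entry *x = a, *y = b;

            if (x->namelen < y->namelen)
                    return -1;
            if (x->namelen > y->namelen)
                    return 1;
            return memcmp(x->name, y->name, x->namelen);
    }

    int main(void)
    {
            struct demo_entry dir[] = {
                    { "sys", 3 }, { "net", 3 }, { "meminfo", 7 }, { "fs", 2 },
            };

            qsort(dir, 4, sizeof(dir[0]), demo_match);
            for (int i = 0; i < 4; i++)
                    printf("%s\n", dir[i].name);    /* fs, net, sys, meminfo */
            return 0;
    }
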
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa7a0ee182e1..7fb1a4869fd0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -24,10 +24,9 @@ struct mempolicy;
24 * tree) of these proc_dir_entries, so that we can dynamically 24 * tree) of these proc_dir_entries, so that we can dynamically
25 * add new files to /proc. 25 * add new files to /proc.
26 * 26 *
27 * The "next" pointer creates a linked list of one /proc directory, 27 * parent/subdir are used for the directory structure (every /proc file has a
28 * while parent/subdir create the directory structure (every 28 * parent, but "subdir" is empty for all non-directory entries).
29 * /proc file has a parent, but "subdir" is NULL for all 29 * subdir_node is used to build the rb tree "subdir" of the parent.
30 * non-directory entries).
31 */ 30 */
32struct proc_dir_entry { 31struct proc_dir_entry {
33 unsigned int low_ino; 32 unsigned int low_ino;
@@ -38,7 +37,9 @@ struct proc_dir_entry {
38 loff_t size; 37 loff_t size;
39 const struct inode_operations *proc_iops; 38 const struct inode_operations *proc_iops;
40 const struct file_operations *proc_fops; 39 const struct file_operations *proc_fops;
41 struct proc_dir_entry *next, *parent, *subdir; 40 struct proc_dir_entry *parent;
41 struct rb_root subdir;
42 struct rb_node subdir_node;
42 void *data; 43 void *data;
43 atomic_t count; /* use count */ 44 atomic_t count; /* use count */
44 atomic_t in_use; /* number of callers into module in progress; */ 45 atomic_t in_use; /* number of callers into module in progress; */
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a63af3e0a612..1bde894bc624 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net)
192 if (!netd) 192 if (!netd)
193 goto out; 193 goto out;
194 194
195 netd->subdir = RB_ROOT;
195 netd->data = net; 196 netd->data = net;
196 netd->nlink = 2; 197 netd->nlink = 2;
197 netd->namelen = 3; 198 netd->namelen = 3;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 094e44d4a6be..e74ac9f1a2c0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -251,6 +251,7 @@ struct proc_dir_entry proc_root = {
251 .proc_iops = &proc_root_inode_operations, 251 .proc_iops = &proc_root_inode_operations,
252 .proc_fops = &proc_root_operations, 252 .proc_fops = &proc_root_operations,
253 .parent = &proc_root, 253 .parent = &proc_root,
254 .subdir = RB_ROOT,
254 .name = "/proc", 255 .name = "/proc",
255}; 256};
256 257
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f6734c6b66a6..246eae84b13b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -447,58 +447,91 @@ struct mem_size_stats {
447 u64 pss; 447 u64 pss;
448}; 448};
449 449
450static void smaps_account(struct mem_size_stats *mss, struct page *page,
451 unsigned long size, bool young, bool dirty)
452{
453 int mapcount;
454
455 if (PageAnon(page))
456 mss->anonymous += size;
450 457
451static void smaps_pte_entry(pte_t ptent, unsigned long addr, 458 mss->resident += size;
452 unsigned long ptent_size, struct mm_walk *walk) 459 /* Accumulate the size in pages that have been accessed. */
460 if (young || PageReferenced(page))
461 mss->referenced += size;
462 mapcount = page_mapcount(page);
463 if (mapcount >= 2) {
464 u64 pss_delta;
465
466 if (dirty || PageDirty(page))
467 mss->shared_dirty += size;
468 else
469 mss->shared_clean += size;
470 pss_delta = (u64)size << PSS_SHIFT;
471 do_div(pss_delta, mapcount);
472 mss->pss += pss_delta;
473 } else {
474 if (dirty || PageDirty(page))
475 mss->private_dirty += size;
476 else
477 mss->private_clean += size;
478 mss->pss += (u64)size << PSS_SHIFT;
479 }
480}
481
482static void smaps_pte_entry(pte_t *pte, unsigned long addr,
483 struct mm_walk *walk)
453{ 484{
454 struct mem_size_stats *mss = walk->private; 485 struct mem_size_stats *mss = walk->private;
455 struct vm_area_struct *vma = mss->vma; 486 struct vm_area_struct *vma = mss->vma;
456 pgoff_t pgoff = linear_page_index(vma, addr); 487 pgoff_t pgoff = linear_page_index(vma, addr);
457 struct page *page = NULL; 488 struct page *page = NULL;
458 int mapcount;
459 489
460 if (pte_present(ptent)) { 490 if (pte_present(*pte)) {
461 page = vm_normal_page(vma, addr, ptent); 491 page = vm_normal_page(vma, addr, *pte);
462 } else if (is_swap_pte(ptent)) { 492 } else if (is_swap_pte(*pte)) {
463 swp_entry_t swpent = pte_to_swp_entry(ptent); 493 swp_entry_t swpent = pte_to_swp_entry(*pte);
464 494
465 if (!non_swap_entry(swpent)) 495 if (!non_swap_entry(swpent))
466 mss->swap += ptent_size; 496 mss->swap += PAGE_SIZE;
467 else if (is_migration_entry(swpent)) 497 else if (is_migration_entry(swpent))
468 page = migration_entry_to_page(swpent); 498 page = migration_entry_to_page(swpent);
469 } else if (pte_file(ptent)) { 499 } else if (pte_file(*pte)) {
470 if (pte_to_pgoff(ptent) != pgoff) 500 if (pte_to_pgoff(*pte) != pgoff)
471 mss->nonlinear += ptent_size; 501 mss->nonlinear += PAGE_SIZE;
472 } 502 }
473 503
474 if (!page) 504 if (!page)
475 return; 505 return;
476 506
477 if (PageAnon(page))
478 mss->anonymous += ptent_size;
479
480 if (page->index != pgoff) 507 if (page->index != pgoff)
481 mss->nonlinear += ptent_size; 508 mss->nonlinear += PAGE_SIZE;
482 509
483 mss->resident += ptent_size; 510 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
484 /* Accumulate the size in pages that have been accessed. */ 511}
485 if (pte_young(ptent) || PageReferenced(page)) 512
486 mss->referenced += ptent_size; 513#ifdef CONFIG_TRANSPARENT_HUGEPAGE
487 mapcount = page_mapcount(page); 514static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
488 if (mapcount >= 2) { 515 struct mm_walk *walk)
489 if (pte_dirty(ptent) || PageDirty(page)) 516{
490 mss->shared_dirty += ptent_size; 517 struct mem_size_stats *mss = walk->private;
491 else 518 struct vm_area_struct *vma = mss->vma;
492 mss->shared_clean += ptent_size; 519 struct page *page;
493 mss->pss += (ptent_size << PSS_SHIFT) / mapcount; 520
494 } else { 521 /* FOLL_DUMP will return -EFAULT on huge zero page */
495 if (pte_dirty(ptent) || PageDirty(page)) 522 page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
496 mss->private_dirty += ptent_size; 523 if (IS_ERR_OR_NULL(page))
497 else 524 return;
498 mss->private_clean += ptent_size; 525 mss->anonymous_thp += HPAGE_PMD_SIZE;
499 mss->pss += (ptent_size << PSS_SHIFT); 526 smaps_account(mss, page, HPAGE_PMD_SIZE,
500 } 527 pmd_young(*pmd), pmd_dirty(*pmd));
501} 528}
529#else
530static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
531 struct mm_walk *walk)
532{
533}
534#endif
502 535
503static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 536static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
504 struct mm_walk *walk) 537 struct mm_walk *walk)
@@ -509,9 +542,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
509 spinlock_t *ptl; 542 spinlock_t *ptl;
510 543
511 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 544 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
512 smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); 545 smaps_pmd_entry(pmd, addr, walk);
513 spin_unlock(ptl); 546 spin_unlock(ptl);
514 mss->anonymous_thp += HPAGE_PMD_SIZE;
515 return 0; 547 return 0;
516 } 548 }
517 549
@@ -524,7 +556,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
524 */ 556 */
525 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 557 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
526 for (; addr != end; pte++, addr += PAGE_SIZE) 558 for (; addr != end; pte++, addr += PAGE_SIZE)
527 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); 559 smaps_pte_entry(pte, addr, walk);
528 pte_unmap_unlock(pte - 1, ptl); 560 pte_unmap_unlock(pte - 1, ptl);
529 cond_resched(); 561 cond_resched();
530 return 0; 562 return 0;
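
Note: smaps_account() folds the old per-PTE accounting into one helper shared by the PTE and PMD paths, and switches the proportional-set-size maths to an explicit u64 with do_div(), since a plain 64-bit divide is not available on every 32-bit architecture. PSS keeps PSS_SHIFT (12) bits of fraction, so a page shared by N tasks contributes size/N without the remainder being lost to integer truncation. A user-space rendition of the arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_PSS_SHIFT 12       /* same fixed-point shift the kernel uses */

    /* One mapping's contribution to PSS, in 1/4096ths of a byte. */
    static uint64_t pss_contribution(unsigned long size, int mapcount)
    {
            uint64_t pss = (uint64_t)size << DEMO_PSS_SHIFT;

            if (mapcount > 1)
                    pss /= mapcount;        /* the kernel uses do_div() here */
            return pss;
    }

    int main(void)
    {
            /* A 4 KiB page mapped by three processes: each reports ~1365 bytes. */
            uint64_t pss = pss_contribution(4096, 3);

            printf("Pss: %llu bytes\n", (unsigned long long)(pss >> DEMO_PSS_SHIFT));
            return 0;
    }
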
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 27b0c9105da5..641e56494a92 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -113,6 +113,19 @@ static inline void css_get(struct cgroup_subsys_state *css)
113} 113}
114 114
115/** 115/**
116 * css_get_many - obtain references on the specified css
117 * @css: target css
118 * @n: number of references to get
119 *
120 * The caller must already have a reference.
121 */
122static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
123{
124 if (!(css->flags & CSS_NO_REF))
125 percpu_ref_get_many(&css->refcnt, n);
126}
127
128/**
116 * css_tryget - try to obtain a reference on the specified css 129 * css_tryget - try to obtain a reference on the specified css
117 * @css: target css 130 * @css: target css
118 * 131 *
@@ -159,6 +172,19 @@ static inline void css_put(struct cgroup_subsys_state *css)
159 percpu_ref_put(&css->refcnt); 172 percpu_ref_put(&css->refcnt);
160} 173}
161 174
175/**
176 * css_put_many - put css references
177 * @css: target css
178 * @n: number of references to put
179 *
180 * Put references obtained via css_get() and css_tryget_online().
181 */
182static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
183{
184 if (!(css->flags & CSS_NO_REF))
185 percpu_ref_put_many(&css->refcnt, n);
186}
187
162/* bits in struct cgroup flags field */ 188/* bits in struct cgroup flags field */
163enum { 189enum {
164 /* Control Group requires release notifications to userspace */ 190 /* Control Group requires release notifications to userspace */
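
Note: css_get_many()/css_put_many() above are thin wrappers around percpu_ref_get_many()/percpu_ref_put_many(): when a caller already knows it needs N references (for example when uncharging a batch of pages at once), taking or dropping them in one call is cheaper than looping N times. A user-space sketch of the batched interface shape with C11 atomics (not the percpu-ref implementation itself):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct demo_ref {
            atomic_long count;
    };

    static void demo_ref_get_many(struct demo_ref *r, long n)
    {
            atomic_fetch_add_explicit(&r->count, n, memory_order_relaxed);
    }

    /* Returns true when the last reference was dropped. */
    static bool demo_ref_put_many(struct demo_ref *r, long n)
    {
            return atomic_fetch_sub_explicit(&r->count, n,
                                             memory_order_acq_rel) == n;
    }
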
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 60bdf8dc02a3..3238ffa33f68 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -33,10 +33,11 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
33extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 33extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
34 int order, gfp_t gfp_mask, nodemask_t *mask, 34 int order, gfp_t gfp_mask, nodemask_t *mask,
35 enum migrate_mode mode, int *contended, 35 enum migrate_mode mode, int *contended,
36 struct zone **candidate_zone); 36 int alloc_flags, int classzone_idx);
37extern void compact_pgdat(pg_data_t *pgdat, int order); 37extern void compact_pgdat(pg_data_t *pgdat, int order);
38extern void reset_isolation_suitable(pg_data_t *pgdat); 38extern void reset_isolation_suitable(pg_data_t *pgdat);
39extern unsigned long compaction_suitable(struct zone *zone, int order); 39extern unsigned long compaction_suitable(struct zone *zone, int order,
40 int alloc_flags, int classzone_idx);
40 41
41/* Do not skip compaction more than 64 times */ 42/* Do not skip compaction more than 64 times */
42#define COMPACT_MAX_DEFER_SHIFT 6 43#define COMPACT_MAX_DEFER_SHIFT 6
@@ -103,7 +104,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
103static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, 104static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
104 int order, gfp_t gfp_mask, nodemask_t *nodemask, 105 int order, gfp_t gfp_mask, nodemask_t *nodemask,
105 enum migrate_mode mode, int *contended, 106 enum migrate_mode mode, int *contended,
106 struct zone **candidate_zone) 107 int alloc_flags, int classzone_idx)
107{ 108{
108 return COMPACT_CONTINUE; 109 return COMPACT_CONTINUE;
109} 110}
@@ -116,7 +117,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
116{ 117{
117} 118}
118 119
119static inline unsigned long compaction_suitable(struct zone *zone, int order) 120static inline unsigned long compaction_suitable(struct zone *zone, int order,
121 int alloc_flags, int classzone_idx)
120{ 122{
121 return COMPACT_SKIPPED; 123 return COMPACT_SKIPPED;
122} 124}
diff --git a/include/linux/file.h b/include/linux/file.h
index 4d69123377a2..f87d30882a24 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -66,7 +66,6 @@ extern void set_close_on_exec(unsigned int fd, int flag);
66extern bool get_close_on_exec(unsigned int fd); 66extern bool get_close_on_exec(unsigned int fd);
67extern void put_filp(struct file *); 67extern void put_filp(struct file *);
68extern int get_unused_fd_flags(unsigned flags); 68extern int get_unused_fd_flags(unsigned flags);
69#define get_unused_fd() get_unused_fd_flags(0)
70extern void put_unused_fd(unsigned int fd); 69extern void put_unused_fd(unsigned int fd);
71 70
72extern void fd_install(unsigned int fd, struct file *file); 71extern void fd_install(unsigned int fd, struct file *file);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 41b30fd4d041..07d2699cdb51 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -381,8 +381,8 @@ extern void free_kmem_pages(unsigned long addr, unsigned int order);
381 381
382void page_alloc_init(void); 382void page_alloc_init(void);
383void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); 383void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
384void drain_all_pages(void); 384void drain_all_pages(struct zone *zone);
385void drain_local_pages(void *dummy); 385void drain_local_pages(struct zone *zone);
386 386
387/* 387/*
388 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what 388 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6e6d338641fe..cdd149ca5cc0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -311,7 +311,8 @@ static inline struct hstate *hstate_sizelog(int page_size_log)
311{ 311{
312 if (!page_size_log) 312 if (!page_size_log)
313 return &default_hstate; 313 return &default_hstate;
314 return size_to_hstate(1 << page_size_log); 314
315 return size_to_hstate(1UL << page_size_log);
315} 316}
316 317
317static inline struct hstate *hstate_vma(struct vm_area_struct *vma) 318static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
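
Note: the hstate_sizelog() tweak above widens the shift. With page_size_log of 31 or more (e.g. 16 GiB huge pages), "1 << page_size_log" overflows or is undefined for a 32-bit int before size_to_hstate() ever sees the value, while "1UL << page_size_log" stays well defined on 64-bit kernels. A tiny illustration of why the literal's type matters (assumes a 64-bit unsigned long):

    #include <stdio.h>

    int main(void)
    {
            unsigned int log = 34;                  /* e.g. a 16 GiB huge page */
            unsigned long size = 1UL << log;        /* evaluated as unsigned long */

            /* "1 << log" would shift a 32-bit int by 34 bits, which is undefined
             * behaviour; with log == 31 it would already overflow to a negative
             * value. */
            printf("page size = %lu bytes\n", size);
            return 0;
    }
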
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 0129f89cf98d..bcc853eccc85 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -16,7 +16,6 @@
16#define _LINUX_HUGETLB_CGROUP_H 16#define _LINUX_HUGETLB_CGROUP_H
17 17
18#include <linux/mmdebug.h> 18#include <linux/mmdebug.h>
19#include <linux/res_counter.h>
20 19
21struct hugetlb_cgroup; 20struct hugetlb_cgroup;
22/* 21/*
diff --git a/include/linux/kern_levels.h b/include/linux/kern_levels.h
index 866caaa9e2bb..c2ce155d83cc 100644
--- a/include/linux/kern_levels.h
+++ b/include/linux/kern_levels.h
@@ -22,4 +22,17 @@
22 */ 22 */
23#define KERN_CONT "" 23#define KERN_CONT ""
24 24
25/* integer equivalents of KERN_<LEVEL> */
26#define LOGLEVEL_SCHED -2 /* Deferred messages from sched code
27 * are set to this special level */
28#define LOGLEVEL_DEFAULT -1 /* default (or last) loglevel */
29#define LOGLEVEL_EMERG 0 /* system is unusable */
30#define LOGLEVEL_ALERT 1 /* action must be taken immediately */
31#define LOGLEVEL_CRIT 2 /* critical conditions */
32#define LOGLEVEL_ERR 3 /* error conditions */
33#define LOGLEVEL_WARNING 4 /* warning conditions */
34#define LOGLEVEL_NOTICE 5 /* normal but significant condition */
35#define LOGLEVEL_INFO 6 /* informational */
36#define LOGLEVEL_DEBUG 7 /* debug-level messages */
37
25#endif 38#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 446d76a87ba1..233ea8107038 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -427,6 +427,7 @@ extern int panic_timeout;
427extern int panic_on_oops; 427extern int panic_on_oops;
428extern int panic_on_unrecovered_nmi; 428extern int panic_on_unrecovered_nmi;
429extern int panic_on_io_nmi; 429extern int panic_on_io_nmi;
430extern int panic_on_warn;
430extern int sysctl_panic_on_stackoverflow; 431extern int sysctl_panic_on_stackoverflow;
431/* 432/*
432 * Only to be used by arch init code. If the user over-wrote the default 433 * Only to be used by arch init code. If the user over-wrote the default
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6b75640ef5ab..6ea9f919e888 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,7 +25,6 @@
25#include <linux/jump_label.h> 25#include <linux/jump_label.h>
26 26
27struct mem_cgroup; 27struct mem_cgroup;
28struct page_cgroup;
29struct page; 28struct page;
30struct mm_struct; 29struct mm_struct;
31struct kmem_cache; 30struct kmem_cache;
@@ -68,10 +67,9 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
68struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); 67struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
69struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 68struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
70 69
71bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 70bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
72 struct mem_cgroup *memcg); 71 struct mem_cgroup *root);
73bool task_in_mem_cgroup(struct task_struct *task, 72bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
74 const struct mem_cgroup *memcg);
75 73
76extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); 74extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
77extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 75extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -79,15 +77,16 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
79extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); 77extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
80extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); 78extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
81 79
82static inline 80static inline bool mm_match_cgroup(struct mm_struct *mm,
83bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) 81 struct mem_cgroup *memcg)
84{ 82{
85 struct mem_cgroup *task_memcg; 83 struct mem_cgroup *task_memcg;
86 bool match; 84 bool match = false;
87 85
88 rcu_read_lock(); 86 rcu_read_lock();
89 task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 87 task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
90 match = __mem_cgroup_same_or_subtree(memcg, task_memcg); 88 if (task_memcg)
89 match = mem_cgroup_is_descendant(task_memcg, memcg);
91 rcu_read_unlock(); 90 rcu_read_unlock();
92 return match; 91 return match;
93} 92}
@@ -141,8 +140,8 @@ static inline bool mem_cgroup_disabled(void)
141 140
142struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, 141struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked,
143 unsigned long *flags); 142 unsigned long *flags);
144void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, 143void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
145 unsigned long flags); 144 unsigned long *flags);
146void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, 145void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
147 enum mem_cgroup_stat_index idx, int val); 146 enum mem_cgroup_stat_index idx, int val);
148 147
@@ -174,10 +173,6 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
174void mem_cgroup_split_huge_fixup(struct page *head); 173void mem_cgroup_split_huge_fixup(struct page *head);
175#endif 174#endif
176 175
177#ifdef CONFIG_DEBUG_VM
178bool mem_cgroup_bad_page_check(struct page *page);
179void mem_cgroup_print_bad_page(struct page *page);
180#endif
181#else /* CONFIG_MEMCG */ 176#else /* CONFIG_MEMCG */
182struct mem_cgroup; 177struct mem_cgroup;
183 178
@@ -297,7 +292,7 @@ static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
297} 292}
298 293
299static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, 294static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg,
300 bool locked, unsigned long flags) 295 bool *locked, unsigned long *flags)
301{ 296{
302} 297}
303 298
@@ -347,19 +342,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
347} 342}
348#endif /* CONFIG_MEMCG */ 343#endif /* CONFIG_MEMCG */
349 344
350#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
351static inline bool
352mem_cgroup_bad_page_check(struct page *page)
353{
354 return false;
355}
356
357static inline void
358mem_cgroup_print_bad_page(struct page *page)
359{
360}
361#endif
362
363enum { 345enum {
364 UNDER_LIMIT, 346 UNDER_LIMIT,
365 SOFT_LIMIT, 347 SOFT_LIMIT,
@@ -447,9 +429,8 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
447 /* 429 /*
448 * __GFP_NOFAIL allocations will move on even if charging is not 430 * __GFP_NOFAIL allocations will move on even if charging is not
449 * possible. Therefore we don't even try, and have this allocation 431 * possible. Therefore we don't even try, and have this allocation
450 * unaccounted. We could in theory charge it with 432 * unaccounted. We could in theory charge it forcibly, but we hope
451 * res_counter_charge_nofail, but we hope those allocations are rare, 433 * those allocations are rare, and won't be worth the trouble.
452 * and won't be worth the trouble.
453 */ 434 */
454 if (gfp & __GFP_NOFAIL) 435 if (gfp & __GFP_NOFAIL)
455 return true; 436 return true;
@@ -467,8 +448,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
467 * memcg_kmem_uncharge_pages: uncharge pages from memcg 448 * memcg_kmem_uncharge_pages: uncharge pages from memcg
468 * @page: pointer to struct page being freed 449 * @page: pointer to struct page being freed
469 * @order: allocation order. 450 * @order: allocation order.
470 *
471 * there is no need to specify memcg here, since it is embedded in page_cgroup
472 */ 451 */
473static inline void 452static inline void
474memcg_kmem_uncharge_pages(struct page *page, int order) 453memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -485,8 +464,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order)
485 * 464 *
486 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or 465 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
487 * failure of the allocation. if @page is NULL, this function will revert the 466 * failure of the allocation. if @page is NULL, this function will revert the
488 * charges. Otherwise, it will commit the memcg given by @memcg to the 467 * charges. Otherwise, it will commit @page to @memcg.
489 * corresponding page_cgroup.
490 */ 468 */
491static inline void 469static inline void
492memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) 470memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
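
The page-stat interface above now hands the lock state back through pointers, so a caller would look roughly like the sketch below; the stat index is a placeholder, not something defined by this patch:

	bool locked;
	unsigned long flags;
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	/* idx is some enum mem_cgroup_stat_index value; -1 decrements it */
	mem_cgroup_update_page_stat(memcg, idx, -1);
	mem_cgroup_end_page_stat(memcg, &locked, &flags);
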
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 004e9d17b47e..bf9f57529dcf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -22,6 +22,7 @@
22#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) 22#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
23 23
24struct address_space; 24struct address_space;
25struct mem_cgroup;
25 26
26#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) 27#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
27#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ 28#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
@@ -167,6 +168,10 @@ struct page {
167 struct page *first_page; /* Compound tail pages */ 168 struct page *first_page; /* Compound tail pages */
168 }; 169 };
169 170
171#ifdef CONFIG_MEMCG
172 struct mem_cgroup *mem_cgroup;
173#endif
174
170 /* 175 /*
171 * On machines where all RAM is mapped into kernel address space, 176 * On machines where all RAM is mapped into kernel address space,
172 * we can simply calculate the virtual address. On machines with 177 * we can simply calculate the virtual address. On machines with
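
With CONFIG_MEMCG enabled, this new mem_cgroup pointer in struct page takes over the role of the separate page_cgroup array removed later in this series: the owning memory cgroup is recorded directly in the page descriptor instead of being looked up through lookup_page_cgroup().
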
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ffe66e381c04..3879d7664dfc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -722,9 +722,6 @@ typedef struct pglist_data {
722 int nr_zones; 722 int nr_zones;
723#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ 723#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
724 struct page *node_mem_map; 724 struct page *node_mem_map;
725#ifdef CONFIG_MEMCG
726 struct page_cgroup *node_page_cgroup;
727#endif
728#endif 725#endif
729#ifndef CONFIG_NO_BOOTMEM 726#ifndef CONFIG_NO_BOOTMEM
730 struct bootmem_data *bdata; 727 struct bootmem_data *bdata;
@@ -1078,7 +1075,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
1078#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) 1075#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
1079 1076
1080struct page; 1077struct page;
1081struct page_cgroup;
1082struct mem_section { 1078struct mem_section {
1083 /* 1079 /*
1084 * This is, logically, a pointer to an array of struct 1080 * This is, logically, a pointer to an array of struct
@@ -1096,14 +1092,6 @@ struct mem_section {
1096 1092
1097 /* See declaration of similar field in struct zone */ 1093 /* See declaration of similar field in struct zone */
1098 unsigned long *pageblock_flags; 1094 unsigned long *pageblock_flags;
1099#ifdef CONFIG_MEMCG
1100 /*
1101 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
1102 * section. (see memcontrol.h/page_cgroup.h about this.)
1103 */
1104 struct page_cgroup *page_cgroup;
1105 unsigned long pad;
1106#endif
1107 /* 1095 /*
1108 * WARNING: mem_section must be a power-of-2 in size for the 1096 * WARNING: mem_section must be a power-of-2 in size for the
1109 * calculation and use of SECTION_ROOT_MASK to make sense. 1097 * calculation and use of SECTION_ROOT_MASK to make sense.
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
deleted file mode 100644
index 5c831f1eca79..000000000000
--- a/include/linux/page_cgroup.h
+++ /dev/null
@@ -1,105 +0,0 @@
1#ifndef __LINUX_PAGE_CGROUP_H
2#define __LINUX_PAGE_CGROUP_H
3
4enum {
5 /* flags for mem_cgroup */
6 PCG_USED = 0x01, /* This page is charged to a memcg */
7 PCG_MEM = 0x02, /* This page holds a memory charge */
8 PCG_MEMSW = 0x04, /* This page holds a memory+swap charge */
9};
10
11struct pglist_data;
12
13#ifdef CONFIG_MEMCG
14struct mem_cgroup;
15
16/*
17 * Page Cgroup can be considered as an extended mem_map.
18 * A page_cgroup page is associated with every page descriptor. The
19 * page_cgroup helps us identify information about the cgroup
20 * All page cgroups are allocated at boot or memory hotplug event,
21 * then the page cgroup for pfn always exists.
22 */
23struct page_cgroup {
24 unsigned long flags;
25 struct mem_cgroup *mem_cgroup;
26};
27
28extern void pgdat_page_cgroup_init(struct pglist_data *pgdat);
29
30#ifdef CONFIG_SPARSEMEM
31static inline void page_cgroup_init_flatmem(void)
32{
33}
34extern void page_cgroup_init(void);
35#else
36extern void page_cgroup_init_flatmem(void);
37static inline void page_cgroup_init(void)
38{
39}
40#endif
41
42struct page_cgroup *lookup_page_cgroup(struct page *page);
43
44static inline int PageCgroupUsed(struct page_cgroup *pc)
45{
46 return !!(pc->flags & PCG_USED);
47}
48#else /* !CONFIG_MEMCG */
49struct page_cgroup;
50
51static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
52{
53}
54
55static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
56{
57 return NULL;
58}
59
60static inline void page_cgroup_init(void)
61{
62}
63
64static inline void page_cgroup_init_flatmem(void)
65{
66}
67#endif /* CONFIG_MEMCG */
68
69#include <linux/swap.h>
70
71#ifdef CONFIG_MEMCG_SWAP
72extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
73 unsigned short old, unsigned short new);
74extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
75extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
76extern int swap_cgroup_swapon(int type, unsigned long max_pages);
77extern void swap_cgroup_swapoff(int type);
78#else
79
80static inline
81unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
82{
83 return 0;
84}
85
86static inline
87unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
88{
89 return 0;
90}
91
92static inline int
93swap_cgroup_swapon(int type, unsigned long max_pages)
94{
95 return 0;
96}
97
98static inline void swap_cgroup_swapoff(int type)
99{
100 return;
101}
102
103#endif /* CONFIG_MEMCG_SWAP */
104
105#endif /* __LINUX_PAGE_CGROUP_H */
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
new file mode 100644
index 000000000000..955421575d16
--- /dev/null
+++ b/include/linux/page_counter.h
@@ -0,0 +1,51 @@
1#ifndef _LINUX_PAGE_COUNTER_H
2#define _LINUX_PAGE_COUNTER_H
3
4#include <linux/atomic.h>
5#include <linux/kernel.h>
6#include <asm/page.h>
7
8struct page_counter {
9 atomic_long_t count;
10 unsigned long limit;
11 struct page_counter *parent;
12
13 /* legacy */
14 unsigned long watermark;
15 unsigned long failcnt;
16};
17
18#if BITS_PER_LONG == 32
19#define PAGE_COUNTER_MAX LONG_MAX
20#else
21#define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE)
22#endif
23
24static inline void page_counter_init(struct page_counter *counter,
25 struct page_counter *parent)
26{
27 atomic_long_set(&counter->count, 0);
28 counter->limit = PAGE_COUNTER_MAX;
29 counter->parent = parent;
30}
31
32static inline unsigned long page_counter_read(struct page_counter *counter)
33{
34 return atomic_long_read(&counter->count);
35}
36
37void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
38void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
39int page_counter_try_charge(struct page_counter *counter,
40 unsigned long nr_pages,
41 struct page_counter **fail);
42void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
43int page_counter_limit(struct page_counter *counter, unsigned long limit);
44int page_counter_memparse(const char *buf, unsigned long *nr_pages);
45
46static inline void page_counter_reset_watermark(struct page_counter *counter)
47{
48 counter->watermark = page_counter_read(counter);
49}
50
51#endif /* _LINUX_PAGE_COUNTER_H */
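
A minimal sketch of the intended usage, assuming page_counter_try_charge() returns 0 on success and non-zero when a limit in the hierarchy would be exceeded, with *fail pointing at the counter that refused the charge; names and the limit value are illustrative:

	static int charge_example(struct page_counter *parent,
				  struct page_counter *child,
				  unsigned long nr_pages)
	{
		struct page_counter *fail;

		page_counter_init(parent, NULL);
		page_counter_init(child, parent);
		page_counter_limit(parent, 1024);	/* cap the hierarchy at 1024 pages */

		if (page_counter_try_charge(child, nr_pages, &fail))
			return -ENOMEM;	/* assumed: non-zero means a limit was hit */

		/* ... nr_pages are now accounted ... */
		page_counter_uncharge(child, nr_pages);
		return 0;
	}
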
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 51ce60c35f4c..530b249f7ea4 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -147,28 +147,42 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref,
147} 147}
148 148
149/** 149/**
150 * percpu_ref_get - increment a percpu refcount 150 * percpu_ref_get_many - increment a percpu refcount
151 * @ref: percpu_ref to get 151 * @ref: percpu_ref to get
152 * @nr: number of references to get
152 * 153 *
153 * Analagous to atomic_long_inc(). 154 * Analogous to atomic_long_add().
154 * 155 *
155 * This function is safe to call as long as @ref is between init and exit. 156 * This function is safe to call as long as @ref is between init and exit.
156 */ 157 */
157static inline void percpu_ref_get(struct percpu_ref *ref) 158static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
158{ 159{
159 unsigned long __percpu *percpu_count; 160 unsigned long __percpu *percpu_count;
160 161
161 rcu_read_lock_sched(); 162 rcu_read_lock_sched();
162 163
163 if (__ref_is_percpu(ref, &percpu_count)) 164 if (__ref_is_percpu(ref, &percpu_count))
164 this_cpu_inc(*percpu_count); 165 this_cpu_add(*percpu_count, nr);
165 else 166 else
166 atomic_long_inc(&ref->count); 167 atomic_long_add(nr, &ref->count);
167 168
168 rcu_read_unlock_sched(); 169 rcu_read_unlock_sched();
169} 170}
170 171
171/** 172/**
173 * percpu_ref_get - increment a percpu refcount
174 * @ref: percpu_ref to get
175 *
176 * Analagous to atomic_long_inc().
176 * Analogous to atomic_long_inc().
177 *
178 * This function is safe to call as long as @ref is between init and exit.
179 */
180static inline void percpu_ref_get(struct percpu_ref *ref)
181{
182 percpu_ref_get_many(ref, 1);
183}
184
185/**
172 * percpu_ref_tryget - try to increment a percpu refcount 186 * percpu_ref_tryget - try to increment a percpu refcount
173 * @ref: percpu_ref to try-get 187 * @ref: percpu_ref to try-get
174 * 188 *
@@ -231,29 +245,44 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
231} 245}
232 246
233/** 247/**
234 * percpu_ref_put - decrement a percpu refcount 248 * percpu_ref_put_many - decrement a percpu refcount
235 * @ref: percpu_ref to put 249 * @ref: percpu_ref to put
250 * @nr: number of references to put
236 * 251 *
237 * Decrement the refcount, and if 0, call the release function (which was passed 252 * Decrement the refcount, and if 0, call the release function (which was passed
238 * to percpu_ref_init()) 253 * to percpu_ref_init())
239 * 254 *
240 * This function is safe to call as long as @ref is between init and exit. 255 * This function is safe to call as long as @ref is between init and exit.
241 */ 256 */
242static inline void percpu_ref_put(struct percpu_ref *ref) 257static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
243{ 258{
244 unsigned long __percpu *percpu_count; 259 unsigned long __percpu *percpu_count;
245 260
246 rcu_read_lock_sched(); 261 rcu_read_lock_sched();
247 262
248 if (__ref_is_percpu(ref, &percpu_count)) 263 if (__ref_is_percpu(ref, &percpu_count))
249 this_cpu_dec(*percpu_count); 264 this_cpu_sub(*percpu_count, nr);
250 else if (unlikely(atomic_long_dec_and_test(&ref->count))) 265 else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
251 ref->release(ref); 266 ref->release(ref);
252 267
253 rcu_read_unlock_sched(); 268 rcu_read_unlock_sched();
254} 269}
255 270
256/** 271/**
272 * percpu_ref_put - decrement a percpu refcount
273 * @ref: percpu_ref to put
274 *
275 * Decrement the refcount, and if 0, call the release function (which was passed
276 * to percpu_ref_init())
277 *
278 * This function is safe to call as long as @ref is between init and exit.
279 */
280static inline void percpu_ref_put(struct percpu_ref *ref)
281{
282 percpu_ref_put_many(ref, 1);
283}
284
285/**
257 * percpu_ref_is_zero - test whether a percpu refcount reached zero 286 * percpu_ref_is_zero - test whether a percpu refcount reached zero
258 * @ref: percpu_ref to test 287 * @ref: percpu_ref to test
259 * 288 *
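
percpu_ref_get() and percpu_ref_put() are now thin wrappers around the batched variants, so a caller that pins a whole batch can take and drop the references in one operation, for example:

	/* take nr references at once instead of calling percpu_ref_get() in a loop */
	percpu_ref_get_many(ref, nr);
	/* ... hand the objects out ... */
	percpu_ref_put_many(ref, nr);	/* release() runs if the count drops to zero */
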
diff --git a/include/linux/printk.h b/include/linux/printk.h
index d78125f73ac4..3dd489f2dedc 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -118,7 +118,6 @@ int no_printk(const char *fmt, ...)
118#ifdef CONFIG_EARLY_PRINTK 118#ifdef CONFIG_EARLY_PRINTK
119extern asmlinkage __printf(1, 2) 119extern asmlinkage __printf(1, 2)
120void early_printk(const char *fmt, ...); 120void early_printk(const char *fmt, ...);
121void early_vprintk(const char *fmt, va_list ap);
122#else 121#else
123static inline __printf(1, 2) __cold 122static inline __printf(1, 2) __cold
124void early_printk(const char *s, ...) { } 123void early_printk(const char *s, ...) { }
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index cc79eff4a1ad..987a73a40ef8 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -52,7 +52,7 @@ extern void ptrace_notify(int exit_code);
52extern void __ptrace_link(struct task_struct *child, 52extern void __ptrace_link(struct task_struct *child,
53 struct task_struct *new_parent); 53 struct task_struct *new_parent);
54extern void __ptrace_unlink(struct task_struct *child); 54extern void __ptrace_unlink(struct task_struct *child);
55extern void exit_ptrace(struct task_struct *tracer); 55extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
56#define PTRACE_MODE_READ 0x01 56#define PTRACE_MODE_READ 0x01
57#define PTRACE_MODE_ATTACH 0x02 57#define PTRACE_MODE_ATTACH 0x02
58#define PTRACE_MODE_NOAUDIT 0x04 58#define PTRACE_MODE_NOAUDIT 0x04
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
deleted file mode 100644
index 56b7bc32db4f..000000000000
--- a/include/linux/res_counter.h
+++ /dev/null
@@ -1,223 +0,0 @@
1#ifndef __RES_COUNTER_H__
2#define __RES_COUNTER_H__
3
4/*
5 * Resource Counters
6 * Contain common data types and routines for resource accounting
7 *
8 * Copyright 2007 OpenVZ SWsoft Inc
9 *
10 * Author: Pavel Emelianov <xemul@openvz.org>
11 *
12 * See Documentation/cgroups/resource_counter.txt for more
13 * info about what this counter is.
14 */
15
16#include <linux/spinlock.h>
17#include <linux/errno.h>
18
19/*
20 * The core object. the cgroup that wishes to account for some
21 * resource may include this counter into its structures and use
22 * the helpers described beyond
23 */
24
25struct res_counter {
26 /*
27 * the current resource consumption level
28 */
29 unsigned long long usage;
30 /*
31 * the maximal value of the usage from the counter creation
32 */
33 unsigned long long max_usage;
34 /*
35 * the limit that usage cannot exceed
36 */
37 unsigned long long limit;
38 /*
39 * the limit that usage can be exceed
40 */
41 unsigned long long soft_limit;
42 /*
43 * the number of unsuccessful attempts to consume the resource
44 */
45 unsigned long long failcnt;
46 /*
47 * the lock to protect all of the above.
48 * the routines below consider this to be IRQ-safe
49 */
50 spinlock_t lock;
51 /*
52 * Parent counter, used for hierarchial resource accounting
53 */
54 struct res_counter *parent;
55};
56
57#define RES_COUNTER_MAX ULLONG_MAX
58
59/**
60 * Helpers to interact with userspace
61 * res_counter_read_u64() - returns the value of the specified member.
62 * res_counter_read/_write - put/get the specified fields from the
63 * res_counter struct to/from the user
64 *
65 * @counter: the counter in question
66 * @member: the field to work with (see RES_xxx below)
67 * @buf: the buffer to opeate on,...
68 * @nbytes: its size...
69 * @pos: and the offset.
70 */
71
72u64 res_counter_read_u64(struct res_counter *counter, int member);
73
74ssize_t res_counter_read(struct res_counter *counter, int member,
75 const char __user *buf, size_t nbytes, loff_t *pos,
76 int (*read_strategy)(unsigned long long val, char *s));
77
78int res_counter_memparse_write_strategy(const char *buf,
79 unsigned long long *res);
80
81/*
82 * the field descriptors. one for each member of res_counter
83 */
84
85enum {
86 RES_USAGE,
87 RES_MAX_USAGE,
88 RES_LIMIT,
89 RES_FAILCNT,
90 RES_SOFT_LIMIT,
91};
92
93/*
94 * helpers for accounting
95 */
96
97void res_counter_init(struct res_counter *counter, struct res_counter *parent);
98
99/*
100 * charge - try to consume more resource.
101 *
102 * @counter: the counter
103 * @val: the amount of the resource. each controller defines its own
104 * units, e.g. numbers, bytes, Kbytes, etc
105 *
106 * returns 0 on success and <0 if the counter->usage will exceed the
107 * counter->limit
108 *
109 * charge_nofail works the same, except that it charges the resource
110 * counter unconditionally, and returns < 0 if the after the current
111 * charge we are over limit.
112 */
113
114int __must_check res_counter_charge(struct res_counter *counter,
115 unsigned long val, struct res_counter **limit_fail_at);
116int res_counter_charge_nofail(struct res_counter *counter,
117 unsigned long val, struct res_counter **limit_fail_at);
118
119/*
120 * uncharge - tell that some portion of the resource is released
121 *
122 * @counter: the counter
123 * @val: the amount of the resource
124 *
125 * these calls check for usage underflow and show a warning on the console
126 *
127 * returns the total charges still present in @counter.
128 */
129
130u64 res_counter_uncharge(struct res_counter *counter, unsigned long val);
131
132u64 res_counter_uncharge_until(struct res_counter *counter,
133 struct res_counter *top,
134 unsigned long val);
135/**
136 * res_counter_margin - calculate chargeable space of a counter
137 * @cnt: the counter
138 *
139 * Returns the difference between the hard limit and the current usage
140 * of resource counter @cnt.
141 */
142static inline unsigned long long res_counter_margin(struct res_counter *cnt)
143{
144 unsigned long long margin;
145 unsigned long flags;
146
147 spin_lock_irqsave(&cnt->lock, flags);
148 if (cnt->limit > cnt->usage)
149 margin = cnt->limit - cnt->usage;
150 else
151 margin = 0;
152 spin_unlock_irqrestore(&cnt->lock, flags);
153 return margin;
154}
155
156/**
157 * Get the difference between the usage and the soft limit
158 * @cnt: The counter
159 *
160 * Returns 0 if usage is less than or equal to soft limit
161 * The difference between usage and soft limit, otherwise.
162 */
163static inline unsigned long long
164res_counter_soft_limit_excess(struct res_counter *cnt)
165{
166 unsigned long long excess;
167 unsigned long flags;
168
169 spin_lock_irqsave(&cnt->lock, flags);
170 if (cnt->usage <= cnt->soft_limit)
171 excess = 0;
172 else
173 excess = cnt->usage - cnt->soft_limit;
174 spin_unlock_irqrestore(&cnt->lock, flags);
175 return excess;
176}
177
178static inline void res_counter_reset_max(struct res_counter *cnt)
179{
180 unsigned long flags;
181
182 spin_lock_irqsave(&cnt->lock, flags);
183 cnt->max_usage = cnt->usage;
184 spin_unlock_irqrestore(&cnt->lock, flags);
185}
186
187static inline void res_counter_reset_failcnt(struct res_counter *cnt)
188{
189 unsigned long flags;
190
191 spin_lock_irqsave(&cnt->lock, flags);
192 cnt->failcnt = 0;
193 spin_unlock_irqrestore(&cnt->lock, flags);
194}
195
196static inline int res_counter_set_limit(struct res_counter *cnt,
197 unsigned long long limit)
198{
199 unsigned long flags;
200 int ret = -EBUSY;
201
202 spin_lock_irqsave(&cnt->lock, flags);
203 if (cnt->usage <= limit) {
204 cnt->limit = limit;
205 ret = 0;
206 }
207 spin_unlock_irqrestore(&cnt->lock, flags);
208 return ret;
209}
210
211static inline int
212res_counter_set_soft_limit(struct res_counter *cnt,
213 unsigned long long soft_limit)
214{
215 unsigned long flags;
216
217 spin_lock_irqsave(&cnt->lock, flags);
218 cnt->soft_limit = soft_limit;
219 spin_unlock_irqrestore(&cnt->lock, flags);
220 return 0;
221}
222
223#endif
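
For call sites converted away from this header, the rough correspondence to the page_counter API added above, as suggested by the conversions elsewhere in this series, is the following (res_counter accounted bytes, page_counter accounts pages):

	res_counter_charge(cnt, nr << PAGE_SHIFT, &fail)     ->  page_counter_try_charge(cnt, nr, &fail)
	res_counter_uncharge(cnt, nr << PAGE_SHIFT)          ->  page_counter_uncharge(cnt, nr)
	res_counter_read_u64(cnt, RES_USAGE) >> PAGE_SHIFT   ->  page_counter_read(cnt)
	res_counter_set_limit(cnt, limit)                    ->  page_counter_limit(cnt, limit)
	res_counter_reset_max(cnt)                           ->  page_counter_reset_watermark(cnt)
	res_counter_memparse_write_strategy(buf, &val)       ->  page_counter_memparse(buf, &nr_pages)
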
diff --git a/include/linux/slab.h b/include/linux/slab.h
index c265bec6a57d..8a2457d42fc8 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -513,10 +513,6 @@ struct memcg_cache_params {
513 513
514int memcg_update_all_caches(int num_memcgs); 514int memcg_update_all_caches(int num_memcgs);
515 515
516struct seq_file;
517int cache_show(struct kmem_cache *s, struct seq_file *m);
518void print_slabinfo_header(struct seq_file *m);
519
520/** 516/**
521 * kmalloc_array - allocate memory for an array. 517 * kmalloc_array - allocate memory for an array.
522 * @n: number of elements. 518 * @n: number of elements.
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
new file mode 100644
index 000000000000..145306bdc92f
--- /dev/null
+++ b/include/linux/swap_cgroup.h
@@ -0,0 +1,42 @@
1#ifndef __LINUX_SWAP_CGROUP_H
2#define __LINUX_SWAP_CGROUP_H
3
4#include <linux/swap.h>
5
6#ifdef CONFIG_MEMCG_SWAP
7
8extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
9 unsigned short old, unsigned short new);
10extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
11extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
12extern int swap_cgroup_swapon(int type, unsigned long max_pages);
13extern void swap_cgroup_swapoff(int type);
14
15#else
16
17static inline
18unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
19{
20 return 0;
21}
22
23static inline
24unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
25{
26 return 0;
27}
28
29static inline int
30swap_cgroup_swapon(int type, unsigned long max_pages)
31{
32 return 0;
33}
34
35static inline void swap_cgroup_swapoff(int type)
36{
37 return;
38}
39
40#endif /* CONFIG_MEMCG_SWAP */
41
42#endif /* __LINUX_SWAP_CGROUP_H */
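
These declarations move here unchanged from the removed page_cgroup.h; a typical charge path uses them along the lines of the sketch below, assuming swap_cgroup_record() returns the id previously stored for the entry (type, maxpages, entry and memcg_id are illustrative):

	unsigned short old, id;

	swap_cgroup_swapon(type, maxpages);		/* swapon: allocate the per-type id map */
	old = swap_cgroup_record(entry, memcg_id);	/* swap-out: store the owning memcg id */
	id  = lookup_swap_cgroup_id(entry);		/* swap-in/uncharge: look the owner up again */
	swap_cgroup_swapoff(type);			/* swapoff: free the map */
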
diff --git a/include/net/sock.h b/include/net/sock.h
index e6f235ebf6c9..7ff44e062a38 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -54,8 +54,8 @@
54#include <linux/security.h> 54#include <linux/security.h>
55#include <linux/slab.h> 55#include <linux/slab.h>
56#include <linux/uaccess.h> 56#include <linux/uaccess.h>
57#include <linux/page_counter.h>
57#include <linux/memcontrol.h> 58#include <linux/memcontrol.h>
58#include <linux/res_counter.h>
59#include <linux/static_key.h> 59#include <linux/static_key.h>
60#include <linux/aio.h> 60#include <linux/aio.h>
61#include <linux/sched.h> 61#include <linux/sched.h>
@@ -1062,7 +1062,7 @@ enum cg_proto_flags {
1062}; 1062};
1063 1063
1064struct cg_proto { 1064struct cg_proto {
1065 struct res_counter memory_allocated; /* Current allocated memory. */ 1065 struct page_counter memory_allocated; /* Current allocated memory. */
1066 struct percpu_counter sockets_allocated; /* Current number of sockets. */ 1066 struct percpu_counter sockets_allocated; /* Current number of sockets. */
1067 int memory_pressure; 1067 int memory_pressure;
1068 long sysctl_mem[3]; 1068 long sysctl_mem[3];
@@ -1214,34 +1214,26 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
1214 unsigned long amt, 1214 unsigned long amt,
1215 int *parent_status) 1215 int *parent_status)
1216{ 1216{
1217 struct res_counter *fail; 1217 page_counter_charge(&prot->memory_allocated, amt);
1218 int ret;
1219 1218
1220 ret = res_counter_charge_nofail(&prot->memory_allocated, 1219 if (page_counter_read(&prot->memory_allocated) >
1221 amt << PAGE_SHIFT, &fail); 1220 prot->memory_allocated.limit)
1222 if (ret < 0)
1223 *parent_status = OVER_LIMIT; 1221 *parent_status = OVER_LIMIT;
1224} 1222}
1225 1223
1226static inline void memcg_memory_allocated_sub(struct cg_proto *prot, 1224static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
1227 unsigned long amt) 1225 unsigned long amt)
1228{ 1226{
1229 res_counter_uncharge(&prot->memory_allocated, amt << PAGE_SHIFT); 1227 page_counter_uncharge(&prot->memory_allocated, amt);
1230}
1231
1232static inline u64 memcg_memory_allocated_read(struct cg_proto *prot)
1233{
1234 u64 ret;
1235 ret = res_counter_read_u64(&prot->memory_allocated, RES_USAGE);
1236 return ret >> PAGE_SHIFT;
1237} 1228}
1238 1229
1239static inline long 1230static inline long
1240sk_memory_allocated(const struct sock *sk) 1231sk_memory_allocated(const struct sock *sk)
1241{ 1232{
1242 struct proto *prot = sk->sk_prot; 1233 struct proto *prot = sk->sk_prot;
1234
1243 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) 1235 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1244 return memcg_memory_allocated_read(sk->sk_cgrp); 1236 return page_counter_read(&sk->sk_cgrp->memory_allocated);
1245 1237
1246 return atomic_long_read(prot->memory_allocated); 1238 return atomic_long_read(prot->memory_allocated);
1247} 1239}
@@ -1255,7 +1247,7 @@ sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status)
1255 memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status); 1247 memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status);
1256 /* update the root cgroup regardless */ 1248 /* update the root cgroup regardless */
1257 atomic_long_add_return(amt, prot->memory_allocated); 1249 atomic_long_add_return(amt, prot->memory_allocated);
1258 return memcg_memory_allocated_read(sk->sk_cgrp); 1250 return page_counter_read(&sk->sk_cgrp->memory_allocated);
1259 } 1251 }
1260 1252
1261 return atomic_long_add_return(amt, prot->memory_allocated); 1253 return atomic_long_add_return(amt, prot->memory_allocated);
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 43aaba1cc037..0956373b56db 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
153 KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */ 153 KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
154 KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ 154 KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
155 KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ 155 KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
156 KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */
156}; 157};
157 158
158 159
diff --git a/init/Kconfig b/init/Kconfig
index 903505e66d1d..9afb971497f4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -893,14 +893,6 @@ config ARCH_SUPPORTS_INT128
893config ARCH_WANT_NUMA_VARIABLE_LOCALITY 893config ARCH_WANT_NUMA_VARIABLE_LOCALITY
894 bool 894 bool
895 895
896config NUMA_BALANCING_DEFAULT_ENABLED
897 bool "Automatically enable NUMA aware memory/task placement"
898 default y
899 depends on NUMA_BALANCING
900 help
901 If set, automatic NUMA balancing will be enabled if running on a NUMA
902 machine.
903
904config NUMA_BALANCING 896config NUMA_BALANCING
905 bool "Memory placement aware NUMA scheduler" 897 bool "Memory placement aware NUMA scheduler"
906 depends on ARCH_SUPPORTS_NUMA_BALANCING 898 depends on ARCH_SUPPORTS_NUMA_BALANCING
@@ -913,6 +905,14 @@ config NUMA_BALANCING
913 905
914 This system will be inactive on UMA systems. 906 This system will be inactive on UMA systems.
915 907
908config NUMA_BALANCING_DEFAULT_ENABLED
909 bool "Automatically enable NUMA aware memory/task placement"
910 default y
911 depends on NUMA_BALANCING
912 help
913 If set, automatic NUMA balancing will be enabled if running on a NUMA
914 machine.
915
916menuconfig CGROUPS 916menuconfig CGROUPS
917 boolean "Control Group support" 917 boolean "Control Group support"
918 select KERNFS 918 select KERNFS
@@ -972,32 +972,17 @@ config CGROUP_CPUACCT
972 Provides a simple Resource Controller for monitoring the 972 Provides a simple Resource Controller for monitoring the
973 total CPU consumed by the tasks in a cgroup. 973 total CPU consumed by the tasks in a cgroup.
974 974
975config RESOURCE_COUNTERS 975config PAGE_COUNTER
976 bool "Resource counters" 976 bool
977 help
978 This option enables controller independent resource accounting
979 infrastructure that works with cgroups.
980 977
981config MEMCG 978config MEMCG
982 bool "Memory Resource Controller for Control Groups" 979 bool "Memory Resource Controller for Control Groups"
983 depends on RESOURCE_COUNTERS 980 select PAGE_COUNTER
984 select EVENTFD 981 select EVENTFD
985 help 982 help
986 Provides a memory resource controller that manages both anonymous 983 Provides a memory resource controller that manages both anonymous
987 memory and page cache. (See Documentation/cgroups/memory.txt) 984 memory and page cache. (See Documentation/cgroups/memory.txt)
988 985
989 Note that setting this option increases fixed memory overhead
990 associated with each page of memory in the system. By this,
991 8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory
992 usage tracking struct at boot. Total amount of this is printed out
993 at boot.
994
995 Only enable when you're ok with these trade offs and really
996 sure you need the memory resource controller. Even when you enable
997 this, you can set "cgroup_disable=memory" at your boot option to
998 disable memory resource controller and you can avoid overheads.
999 (and lose benefits of memory resource controller)
1000
1001config MEMCG_SWAP 986config MEMCG_SWAP
1002 bool "Memory Resource Controller Swap Extension" 987 bool "Memory Resource Controller Swap Extension"
1003 depends on MEMCG && SWAP 988 depends on MEMCG && SWAP
@@ -1048,7 +1033,8 @@ config MEMCG_KMEM
1048 1033
1049config CGROUP_HUGETLB 1034config CGROUP_HUGETLB
1050 bool "HugeTLB Resource Controller for Control Groups" 1035 bool "HugeTLB Resource Controller for Control Groups"
1051 depends on RESOURCE_COUNTERS && HUGETLB_PAGE 1036 depends on HUGETLB_PAGE
1037 select PAGE_COUNTER
1052 default n 1038 default n
1053 help 1039 help
1054 Provides a cgroup Resource Controller for HugeTLB pages. 1040 Provides a cgroup Resource Controller for HugeTLB pages.
@@ -1294,6 +1280,22 @@ source "usr/Kconfig"
1294 1280
1295endif 1281endif
1296 1282
1283config INIT_FALLBACK
1284 bool "Fall back to defaults if init= parameter is bad"
1285 default y
1286 help
1287 If enabled, the kernel will try the default init binaries if an
1288 explicit request from the init= parameter fails.
1289
1290 This can have unexpected effects. For example, booting
1291 with init=/sbin/kiosk_app will run /sbin/init or even /bin/sh
1292 if /sbin/kiosk_app cannot be executed.
1293
1294 The default value of Y is consistent with historical behavior.
1295 Selecting N is likely to be more appropriate for most uses,
1296 especially on kiosks and on kernels that are intended to be
1297 run under the control of a script.
1298
1297config CC_OPTIMIZE_FOR_SIZE 1299config CC_OPTIMIZE_FOR_SIZE
1298 bool "Optimize for size" 1300 bool "Optimize for size"
1299 help 1301 help
diff --git a/init/main.c b/init/main.c
index 321d0ceb26d3..ca380ec685de 100644
--- a/init/main.c
+++ b/init/main.c
@@ -51,7 +51,6 @@
51#include <linux/mempolicy.h> 51#include <linux/mempolicy.h>
52#include <linux/key.h> 52#include <linux/key.h>
53#include <linux/buffer_head.h> 53#include <linux/buffer_head.h>
54#include <linux/page_cgroup.h>
55#include <linux/debug_locks.h> 54#include <linux/debug_locks.h>
56#include <linux/debugobjects.h> 55#include <linux/debugobjects.h>
57#include <linux/lockdep.h> 56#include <linux/lockdep.h>
@@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void)
485 */ 484 */
486static void __init mm_init(void) 485static void __init mm_init(void)
487{ 486{
488 /*
489 * page_cgroup requires contiguous pages,
490 * bigger than MAX_ORDER unless SPARSEMEM.
491 */
492 page_cgroup_init_flatmem();
493 mem_init(); 487 mem_init();
494 kmem_cache_init(); 488 kmem_cache_init();
495 percpu_init_late(); 489 percpu_init_late();
@@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void)
627 initrd_start = 0; 621 initrd_start = 0;
628 } 622 }
629#endif 623#endif
630 page_cgroup_init();
631 debug_objects_mem_init(); 624 debug_objects_mem_init();
632 kmemleak_init(); 625 kmemleak_init();
633 setup_per_cpu_pageset(); 626 setup_per_cpu_pageset();
@@ -959,8 +952,13 @@ static int __ref kernel_init(void *unused)
959 ret = run_init_process(execute_command); 952 ret = run_init_process(execute_command);
960 if (!ret) 953 if (!ret)
961 return 0; 954 return 0;
955#ifndef CONFIG_INIT_FALLBACK
956 panic("Requested init %s failed (error %d).",
957 execute_command, ret);
958#else
962 pr_err("Failed to execute %s (error %d). Attempting defaults...\n", 959 pr_err("Failed to execute %s (error %d). Attempting defaults...\n",
963 execute_command, ret); 960 execute_command, ret);
961#endif
964 } 962 }
965 if (!try_to_run_init_process("/sbin/init") || 963 if (!try_to_run_init_process("/sbin/init") ||
966 !try_to_run_init_process("/etc/init") || 964 !try_to_run_init_process("/etc/init") ||
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
57obj-$(CONFIG_USER_NS) += user_namespace.o 57obj-$(CONFIG_USER_NS) += user_namespace.o
58obj-$(CONFIG_PID_NS) += pid_namespace.o 58obj-$(CONFIG_PID_NS) += pid_namespace.o
59obj-$(CONFIG_IKCONFIG) += configs.o 59obj-$(CONFIG_IKCONFIG) += configs.o
60obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
61obj-$(CONFIG_SMP) += stop_machine.o 60obj-$(CONFIG_SMP) += stop_machine.o
62obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 61obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
63obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 62obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 232c4bc8bcc9..8714e5ded8b4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk)
118 } 118 }
119 119
120 /* 120 /*
121 * Accumulate here the counters for all threads but the group leader 121 * Accumulate here the counters for all threads as they die. We could
122 * as they die, so they can be added into the process-wide totals 122 * skip the group leader because it is the last user of signal_struct,
123 * when those are taken. The group leader stays around as a zombie as 123 * but we want to avoid the race with thread_group_cputime() which can
124 * long as there are other threads. When it gets reaped, the exit.c 124 * see the empty ->thread_head list.
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */ 125 */
129 task_cputime(tsk, &utime, &stime); 126 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock); 127 write_seqlock(&sig->stats_lock);
@@ -462,6 +459,44 @@ static void exit_mm(struct task_struct *tsk)
462 clear_thread_flag(TIF_MEMDIE); 459 clear_thread_flag(TIF_MEMDIE);
463} 460}
464 461
462static struct task_struct *find_alive_thread(struct task_struct *p)
463{
464 struct task_struct *t;
465
466 for_each_thread(p, t) {
467 if (!(t->flags & PF_EXITING))
468 return t;
469 }
470 return NULL;
471}
472
473static struct task_struct *find_child_reaper(struct task_struct *father)
474 __releases(&tasklist_lock)
475 __acquires(&tasklist_lock)
476{
477 struct pid_namespace *pid_ns = task_active_pid_ns(father);
478 struct task_struct *reaper = pid_ns->child_reaper;
479
480 if (likely(reaper != father))
481 return reaper;
482
483 reaper = find_alive_thread(father);
484 if (reaper) {
485 pid_ns->child_reaper = reaper;
486 return reaper;
487 }
488
489 write_unlock_irq(&tasklist_lock);
490 if (unlikely(pid_ns == &init_pid_ns)) {
491 panic("Attempted to kill init! exitcode=0x%08x\n",
492 father->signal->group_exit_code ?: father->exit_code);
493 }
494 zap_pid_ns_processes(pid_ns);
495 write_lock_irq(&tasklist_lock);
496
497 return father;
498}
499
465/* 500/*
466 * When we die, we re-parent all our children, and try to: 501 * When we die, we re-parent all our children, and try to:
467 * 1. give them to another thread in our thread group, if such a member exists 502 * 1. give them to another thread in our thread group, if such a member exists
@@ -469,58 +504,36 @@ static void exit_mm(struct task_struct *tsk)
469 * child_subreaper for its children (like a service manager) 504 * child_subreaper for its children (like a service manager)
470 * 3. give it to the init process (PID 1) in our pid namespace 505 * 3. give it to the init process (PID 1) in our pid namespace
471 */ 506 */
472static struct task_struct *find_new_reaper(struct task_struct *father) 507static struct task_struct *find_new_reaper(struct task_struct *father,
473 __releases(&tasklist_lock) 508 struct task_struct *child_reaper)
474 __acquires(&tasklist_lock)
475{ 509{
476 struct pid_namespace *pid_ns = task_active_pid_ns(father); 510 struct task_struct *thread, *reaper;
477 struct task_struct *thread;
478 511
479 thread = father; 512 thread = find_alive_thread(father);
480 while_each_thread(father, thread) { 513 if (thread)
481 if (thread->flags & PF_EXITING)
482 continue;
483 if (unlikely(pid_ns->child_reaper == father))
484 pid_ns->child_reaper = thread;
485 return thread; 514 return thread;
486 }
487
488 if (unlikely(pid_ns->child_reaper == father)) {
489 write_unlock_irq(&tasklist_lock);
490 if (unlikely(pid_ns == &init_pid_ns)) {
491 panic("Attempted to kill init! exitcode=0x%08x\n",
492 father->signal->group_exit_code ?:
493 father->exit_code);
494 }
495
496 zap_pid_ns_processes(pid_ns);
497 write_lock_irq(&tasklist_lock);
498 } else if (father->signal->has_child_subreaper) {
499 struct task_struct *reaper;
500 515
516 if (father->signal->has_child_subreaper) {
501 /* 517 /*
502 * Find the first ancestor marked as child_subreaper. 518 * Find the first ->is_child_subreaper ancestor in our pid_ns.
503 * Note that the code below checks same_thread_group(reaper, 519 * We start from father to ensure we can not look into another
504 * pid_ns->child_reaper). This is what we need to DTRT in a 520 * namespace, this is safe because all its threads are dead.
505 * PID namespace. However we still need the check above, see
506 * http://marc.info/?l=linux-kernel&m=131385460420380
507 */ 521 */
508 for (reaper = father->real_parent; 522 for (reaper = father;
509 reaper != &init_task; 523 !same_thread_group(reaper, child_reaper);
510 reaper = reaper->real_parent) { 524 reaper = reaper->real_parent) {
511 if (same_thread_group(reaper, pid_ns->child_reaper)) 525 /* call_usermodehelper() descendants need this check */
526 if (reaper == &init_task)
512 break; 527 break;
513 if (!reaper->signal->is_child_subreaper) 528 if (!reaper->signal->is_child_subreaper)
514 continue; 529 continue;
515 thread = reaper; 530 thread = find_alive_thread(reaper);
516 do { 531 if (thread)
517 if (!(thread->flags & PF_EXITING)) 532 return thread;
518 return reaper;
519 } while_each_thread(reaper, thread);
520 } 533 }
521 } 534 }
522 535
523 return pid_ns->child_reaper; 536 return child_reaper;
524} 537}
525 538
526/* 539/*
@@ -529,15 +542,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
529static void reparent_leader(struct task_struct *father, struct task_struct *p, 542static void reparent_leader(struct task_struct *father, struct task_struct *p,
530 struct list_head *dead) 543 struct list_head *dead)
531{ 544{
532 list_move_tail(&p->sibling, &p->real_parent->children); 545 if (unlikely(p->exit_state == EXIT_DEAD))
533
534 if (p->exit_state == EXIT_DEAD)
535 return;
536 /*
537 * If this is a threaded reparent there is no need to
538 * notify anyone anything has happened.
539 */
540 if (same_thread_group(p->real_parent, father))
541 return; 546 return;
542 547
543 /* We don't want people slaying init. */ 548 /* We don't want people slaying init. */
@@ -548,49 +553,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
548 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 553 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
549 if (do_notify_parent(p, p->exit_signal)) { 554 if (do_notify_parent(p, p->exit_signal)) {
550 p->exit_state = EXIT_DEAD; 555 p->exit_state = EXIT_DEAD;
551 list_move_tail(&p->sibling, dead); 556 list_add(&p->ptrace_entry, dead);
552 } 557 }
553 } 558 }
554 559
555 kill_orphaned_pgrp(p, father); 560 kill_orphaned_pgrp(p, father);
556} 561}
557 562
558static void forget_original_parent(struct task_struct *father) 563/*
564 * This does two things:
565 *
566 * A. Make init inherit all the child processes
567 * B. Check to see if any process groups have become orphaned
568 * as a result of our exiting, and if they have any stopped
569 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
570 */
571static void forget_original_parent(struct task_struct *father,
572 struct list_head *dead)
559{ 573{
560 struct task_struct *p, *n, *reaper; 574 struct task_struct *p, *t, *reaper;
561 LIST_HEAD(dead_children);
562 575
563 write_lock_irq(&tasklist_lock); 576 if (unlikely(!list_empty(&father->ptraced)))
564 /* 577 exit_ptrace(father, dead);
565 * Note that exit_ptrace() and find_new_reaper() might
566 * drop tasklist_lock and reacquire it.
567 */
568 exit_ptrace(father);
569 reaper = find_new_reaper(father);
570 578
571 list_for_each_entry_safe(p, n, &father->children, sibling) { 579 /* Can drop and reacquire tasklist_lock */
572 struct task_struct *t = p; 580 reaper = find_child_reaper(father);
581 if (list_empty(&father->children))
582 return;
573 583
574 do { 584 reaper = find_new_reaper(father, reaper);
585 list_for_each_entry(p, &father->children, sibling) {
586 for_each_thread(p, t) {
575 t->real_parent = reaper; 587 t->real_parent = reaper;
576 if (t->parent == father) { 588 BUG_ON((!t->ptrace) != (t->parent == father));
577 BUG_ON(t->ptrace); 589 if (likely(!t->ptrace))
578 t->parent = t->real_parent; 590 t->parent = t->real_parent;
579 }
580 if (t->pdeath_signal) 591 if (t->pdeath_signal)
581 group_send_sig_info(t->pdeath_signal, 592 group_send_sig_info(t->pdeath_signal,
582 SEND_SIG_NOINFO, t); 593 SEND_SIG_NOINFO, t);
583 } while_each_thread(p, t); 594 }
584 reparent_leader(father, p, &dead_children); 595 /*
585 } 596 * If this is a threaded reparent there is no need to
586 write_unlock_irq(&tasklist_lock); 597 * notify anyone anything has happened.
587 598 */
588 BUG_ON(!list_empty(&father->children)); 599 if (!same_thread_group(reaper, father))
589 600 reparent_leader(father, p, dead);
590 list_for_each_entry_safe(p, n, &dead_children, sibling) {
591 list_del_init(&p->sibling);
592 release_task(p);
593 } 601 }
602 list_splice_tail_init(&father->children, &reaper->children);
594} 603}
595 604
596/* 605/*
@@ -600,18 +609,12 @@ static void forget_original_parent(struct task_struct *father)
600static void exit_notify(struct task_struct *tsk, int group_dead) 609static void exit_notify(struct task_struct *tsk, int group_dead)
601{ 610{
602 bool autoreap; 611 bool autoreap;
603 612 struct task_struct *p, *n;
604 /* 613 LIST_HEAD(dead);
605 * This does two things:
606 *
607 * A. Make init inherit all the child processes
608 * B. Check to see if any process groups have become orphaned
609 * as a result of our exiting, and if they have any stopped
610 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
611 */
612 forget_original_parent(tsk);
613 614
614 write_lock_irq(&tasklist_lock); 615 write_lock_irq(&tasklist_lock);
616 forget_original_parent(tsk, &dead);
617
615 if (group_dead) 618 if (group_dead)
616 kill_orphaned_pgrp(tsk->group_leader, NULL); 619 kill_orphaned_pgrp(tsk->group_leader, NULL);
617 620
@@ -629,15 +632,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
629 } 632 }
630 633
631 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; 634 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
635 if (tsk->exit_state == EXIT_DEAD)
636 list_add(&tsk->ptrace_entry, &dead);
632 637
633 /* mt-exec, de_thread() is waiting for group leader */ 638 /* mt-exec, de_thread() is waiting for group leader */
634 if (unlikely(tsk->signal->notify_count < 0)) 639 if (unlikely(tsk->signal->notify_count < 0))
635 wake_up_process(tsk->signal->group_exit_task); 640 wake_up_process(tsk->signal->group_exit_task);
636 write_unlock_irq(&tasklist_lock); 641 write_unlock_irq(&tasklist_lock);
637 642
638 /* If the process is dead, release it - nobody will wait for it */ 643 list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
639 if (autoreap) 644 list_del_init(&p->ptrace_entry);
640 release_task(tsk); 645 release_task(p);
646 }
641} 647}
642 648
643#ifdef CONFIG_DEBUG_STACK_USAGE 649#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -982,8 +988,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
982 */ 988 */
983static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 989static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
984{ 990{
985 unsigned long state; 991 int state, retval, status;
986 int retval, status, traced;
987 pid_t pid = task_pid_vnr(p); 992 pid_t pid = task_pid_vnr(p);
988 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 993 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
989 struct siginfo __user *infop; 994 struct siginfo __user *infop;
@@ -1008,21 +1013,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1008 } 1013 }
1009 return wait_noreap_copyout(wo, p, pid, uid, why, status); 1014 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1010 } 1015 }
1011
1012 traced = ptrace_reparented(p);
1013 /* 1016 /*
1014 * Move the task's state to DEAD/TRACE, only one thread can do this. 1017 * Move the task's state to DEAD/TRACE, only one thread can do this.
1015 */ 1018 */
1016 state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; 1019 state = (ptrace_reparented(p) && thread_group_leader(p)) ?
1020 EXIT_TRACE : EXIT_DEAD;
1017 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) 1021 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1018 return 0; 1022 return 0;
1019 /* 1023 /*
1020 * It can be ptraced but not reparented, check 1024 * We own this thread, nobody else can reap it.
1021 * thread_group_leader() to filter out sub-threads.
1022 */ 1025 */
1023 if (likely(!traced) && thread_group_leader(p)) { 1026 read_unlock(&tasklist_lock);
1024 struct signal_struct *psig; 1027 sched_annotate_sleep();
1025 struct signal_struct *sig; 1028
1029 /*
1030 * Check thread_group_leader() to exclude the traced sub-threads.
1031 */
1032 if (state == EXIT_DEAD && thread_group_leader(p)) {
1033 struct signal_struct *sig = p->signal;
1034 struct signal_struct *psig = current->signal;
1026 unsigned long maxrss; 1035 unsigned long maxrss;
1027 cputime_t tgutime, tgstime; 1036 cputime_t tgutime, tgstime;
1028 1037
@@ -1034,21 +1043,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1034 * accumulate in the parent's signal_struct c* fields. 1043 * accumulate in the parent's signal_struct c* fields.
1035 * 1044 *
1036 * We don't bother to take a lock here to protect these 1045 * We don't bother to take a lock here to protect these
1037 * p->signal fields, because they are only touched by 1046 * p->signal fields because the whole thread group is dead
1038 * __exit_signal, which runs with tasklist_lock 1047 * and nobody can change them.
1039 * write-locked anyway, and so is excluded here. We do 1048 *
1040 * need to protect the access to parent->signal fields, 1049 * psig->stats_lock also protects us from our sub-theads
1041 * as other threads in the parent group can be right 1050 * which can reap other children at the same time. Until
1042 * here reaping other children at the same time. 1051 * we change k_getrusage()-like users to rely on this lock
1052 * we have to take ->siglock as well.
1043 * 1053 *
1044 * We use thread_group_cputime_adjusted() to get times for 1054 * We use thread_group_cputime_adjusted() to get times for
1045 * the thread group, which consolidates times for all threads 1055 * the thread group, which consolidates times for all threads
1046 * in the group including the group leader. 1056 * in the group including the group leader.
1047 */ 1057 */
1048 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1058 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1049 spin_lock_irq(&p->real_parent->sighand->siglock); 1059 spin_lock_irq(&current->sighand->siglock);
1050 psig = p->real_parent->signal;
1051 sig = p->signal;
1052 write_seqlock(&psig->stats_lock); 1060 write_seqlock(&psig->stats_lock);
1053 psig->cutime += tgutime + sig->cutime; 1061 psig->cutime += tgutime + sig->cutime;
1054 psig->cstime += tgstime + sig->cstime; 1062 psig->cstime += tgstime + sig->cstime;
@@ -1073,16 +1081,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1073 task_io_accounting_add(&psig->ioac, &p->ioac); 1081 task_io_accounting_add(&psig->ioac, &p->ioac);
1074 task_io_accounting_add(&psig->ioac, &sig->ioac); 1082 task_io_accounting_add(&psig->ioac, &sig->ioac);
1075 write_sequnlock(&psig->stats_lock); 1083 write_sequnlock(&psig->stats_lock);
1076 spin_unlock_irq(&p->real_parent->sighand->siglock); 1084 spin_unlock_irq(&current->sighand->siglock);
1077 } 1085 }
1078 1086
1079 /*
1080 * Now we are sure this task is interesting, and no other
1081 * thread can reap it because we its state == DEAD/TRACE.
1082 */
1083 read_unlock(&tasklist_lock);
1084 sched_annotate_sleep();
1085
1086 retval = wo->wo_rusage 1087 retval = wo->wo_rusage
1087 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1088 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1088 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1089 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
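
In short, reparenting now happens in two stages: find_child_reaper() handles the case where the exiting task was its pid namespace's child_reaper, promoting a live thread or, failing that, calling zap_pid_ns_processes() (or panicking for init), and find_new_reaper() then walks the real_parent chain looking for a live thread or a child_subreaper ancestor. Children that become EXIT_DEAD are collected on a local list via ->ptrace_entry and released with release_task() only after exit_notify() has dropped tasklist_lock.
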
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 80f7a6d00519..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq; 48static struct workqueue_struct *khelper_wq;
49 49
50/*
51 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
52 * locking to protect this global - it is private to the singleton khelper
53 * thread and should only ever be modified by that thread.
54 */
55static const struct task_struct *kmod_thread_locker;
56
57#define CAP_BSET (void *)1 50#define CAP_BSET (void *)1
58#define CAP_PI (void *)2 51#define CAP_PI (void *)2
59 52
@@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info)
223static int ____call_usermodehelper(void *data) 216static int ____call_usermodehelper(void *data)
224{ 217{
225 struct subprocess_info *sub_info = data; 218 struct subprocess_info *sub_info = data;
226 int wait = sub_info->wait & ~UMH_KILLABLE;
227 struct cred *new; 219 struct cred *new;
228 int retval; 220 int retval;
229 221
@@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data)
267out: 259out:
268 sub_info->retval = retval; 260 sub_info->retval = retval;
269 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ 261 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
270 if (wait != UMH_WAIT_PROC) 262 if (!(sub_info->wait & UMH_WAIT_PROC))
271 umh_complete(sub_info); 263 umh_complete(sub_info);
272 if (!retval) 264 if (!retval)
273 return 0; 265 return 0;
274 do_exit(0); 266 do_exit(0);
275} 267}
276 268
277static int call_helper(void *data)
278{
279 /* Worker thread started blocking khelper thread. */
280 kmod_thread_locker = current;
281 return ____call_usermodehelper(data);
282}
283
284/* Keventd can't block, but this (a child) can. */ 269/* Keventd can't block, but this (a child) can. */
285static int wait_for_helper(void *data) 270static int wait_for_helper(void *data)
286{ 271{
@@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work)
323{ 308{
324 struct subprocess_info *sub_info = 309 struct subprocess_info *sub_info =
325 container_of(work, struct subprocess_info, work); 310 container_of(work, struct subprocess_info, work);
326 int wait = sub_info->wait & ~UMH_KILLABLE;
327 pid_t pid; 311 pid_t pid;
328 312
329 /* CLONE_VFORK: wait until the usermode helper has execve'd 313 if (sub_info->wait & UMH_WAIT_PROC)
330 * successfully We need the data structures to stay around
331 * until that is done. */
332 if (wait == UMH_WAIT_PROC)
333 pid = kernel_thread(wait_for_helper, sub_info, 314 pid = kernel_thread(wait_for_helper, sub_info,
334 CLONE_FS | CLONE_FILES | SIGCHLD); 315 CLONE_FS | CLONE_FILES | SIGCHLD);
335 else { 316 else
336 pid = kernel_thread(call_helper, sub_info, 317 pid = kernel_thread(____call_usermodehelper, sub_info,
337 CLONE_VFORK | SIGCHLD); 318 SIGCHLD);
338 /* Worker thread stopped blocking khelper thread. */
339 kmod_thread_locker = NULL;
340 }
341 319
342 if (pid < 0) { 320 if (pid < 0) {
343 sub_info->retval = pid; 321 sub_info->retval = pid;
@@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
571 goto out; 549 goto out;
572 } 550 }
573 /* 551 /*
574 * Worker thread must not wait for khelper thread at below
575 * wait_for_completion() if the thread was created with CLONE_VFORK
576 * flag, for khelper thread is already waiting for the thread at
577 * wait_for_completion() in do_fork().
578 */
579 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
580 retval = -EBUSY;
581 goto out;
582 }
583
584 /*
585 * Set the completion pointer only if there is a waiter. 552 * Set the completion pointer only if there is a waiter.
586 * This makes it possible to use umh_complete to free 553 * This makes it possible to use umh_complete to free
587 * the data structure in case of UMH_NO_WAIT. 554 * the data structure in case of UMH_NO_WAIT.
diff --git a/kernel/panic.c b/kernel/panic.c
index cf80672b7924..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers; 35static bool crash_kexec_post_notifiers;
36int panic_on_warn __read_mostly;
36 37
37int panic_timeout = CONFIG_PANIC_TIMEOUT; 38int panic_timeout = CONFIG_PANIC_TIMEOUT;
38EXPORT_SYMBOL_GPL(panic_timeout); 39EXPORT_SYMBOL_GPL(panic_timeout);
@@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
428 if (args) 429 if (args)
429 vprintk(args->fmt, args->args); 430 vprintk(args->fmt, args->args);
430 431
432 if (panic_on_warn) {
433 /*
434 * This thread may hit another WARN() in the panic path.
435 * Resetting this prevents additional WARN() from panicking the
436 * system on this thread. Other threads are blocked by the
437 * panic_mutex in panic().
438 */
439 panic_on_warn = 0;
440 panic("panic_on_warn set ...\n");
441 }
442
431 print_modules(); 443 print_modules();
432 dump_stack(); 444 dump_stack();
433 print_oops_end_marker(); 445 print_oops_end_marker();
@@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
485 497
486core_param(panic, panic_timeout, int, 0644); 498core_param(panic, panic_timeout, int, 0644);
487core_param(pause_on_oops, pause_on_oops, int, 0644); 499core_param(pause_on_oops, pause_on_oops, int, 0644);
500core_param(panic_on_warn, panic_on_warn, int, 0644);
488 501
489static int __init setup_crash_kexec_post_notifiers(char *s) 502static int __init setup_crash_kexec_post_notifiers(char *s)
490{ 503{
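
The new panic_on_warn knob clears itself before calling panic() so that a WARN() hit on the panic path cannot recurse; it can be enabled through the panic_on_warn boot parameter registered above or the kernel.panic_on_warn sysctl added further down. A small userspace sketch (plain C, not kernel code) of the same self-clearing guard:

#include <stdio.h>
#include <stdlib.h>

static int panic_on_warn_flag = 1;

static void fake_panic(const char *msg)
{
	fprintf(stderr, "panic: %s\n", msg);
	exit(1);
}

static void fake_warn(const char *msg)
{
	fprintf(stderr, "WARNING: %s\n", msg);
	if (panic_on_warn_flag) {
		panic_on_warn_flag = 0;   /* prevent re-entry if the panic path warns again */
		fake_panic("panic_on_warn set ...");
	}
}

int main(void)
{
	fake_warn("something looked wrong");
	return 0;
}
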
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..82430c858d69 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -341,6 +341,8 @@ out:
341 341
342out_unlock: 342out_unlock:
343 spin_unlock_irq(&pidmap_lock); 343 spin_unlock_irq(&pidmap_lock);
344 put_pid_ns(ns);
345
344out_free: 346out_free:
345 while (++i <= ns->level) 347 while (++i <= ns->level)
346 free_pidmap(pid->numbers + i); 348 free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..bc6d6a89b6e6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -190,7 +190,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
190 /* Don't allow any more processes into the pid namespace */ 190 /* Don't allow any more processes into the pid namespace */
191 disable_pid_allocation(pid_ns); 191 disable_pid_allocation(pid_ns);
192 192
193 /* Ignore SIGCHLD causing any terminated children to autoreap */ 193 /*
194 * Ignore SIGCHLD causing any terminated children to autoreap.
195 * This speeds up the namespace shutdown, plus see the comment
196 * below.
197 */
194 spin_lock_irq(&me->sighand->siglock); 198 spin_lock_irq(&me->sighand->siglock);
195 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; 199 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
196 spin_unlock_irq(&me->sighand->siglock); 200 spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +227,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
223 } 227 }
224 read_unlock(&tasklist_lock); 228 read_unlock(&tasklist_lock);
225 229
226 /* Firstly reap the EXIT_ZOMBIE children we may have. */ 230 /*
231 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
232 * sys_wait4() will also block until our children traced from the
233 * parent namespace are detached and become EXIT_DEAD.
234 */
227 do { 235 do {
228 clear_thread_flag(TIF_SIGPENDING); 236 clear_thread_flag(TIF_SIGPENDING);
229 rc = sys_wait4(-1, NULL, __WALL, NULL); 237 rc = sys_wait4(-1, NULL, __WALL, NULL);
230 } while (rc != -ECHILD); 238 } while (rc != -ECHILD);
231 239
232 /* 240 /*
233 * sys_wait4() above can't reap the TASK_DEAD children. 241 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
234 * Make sure they all go away, see free_pid(). 242 * really care, we could reparent them to the global init. We could
243 * exit and reap ->child_reaper even if it is not the last thread in
244 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
245 * pid_ns can not go away until proc_kill_sb() drops the reference.
246 *
247 * But this ns can also have other tasks injected by setns()+fork().
248 * Again, ignoring the user visible semantics we do not really need
249 * to wait until they are all reaped, but they can be reparented to
250 * us and thus we need to ensure that pid->child_reaper stays valid
251 * until they all go away. See free_pid()->wake_up_process().
252 *
253 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
254 * if reparented.
235 */ 255 */
236 for (;;) { 256 for (;;) {
237 set_current_state(TASK_UNINTERRUPTIBLE); 257 set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c8755e7e1dba..ea27c019655a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
63}; 63};
64 64
65/* Deferred messaged from sched code are marked by this special level */
66#define SCHED_MESSAGE_LOGLEVEL -2
67
68/* 65/*
69 * Low level drivers may need that to know if they can schedule in 66 * Low level drivers may need that to know if they can schedule in
70 * their unblank() callback or not. So let's export it. 67 * their unblank() callback or not. So let's export it.
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1259int do_syslog(int type, char __user *buf, int len, bool from_file) 1256int do_syslog(int type, char __user *buf, int len, bool from_file)
1260{ 1257{
1261 bool clear = false; 1258 bool clear = false;
1262 static int saved_console_loglevel = -1; 1259 static int saved_console_loglevel = LOGLEVEL_DEFAULT;
1263 int error; 1260 int error;
1264 1261
1265 error = check_syslog_permissions(type, from_file); 1262 error = check_syslog_permissions(type, from_file);
@@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1316 break; 1313 break;
1317 /* Disable logging to console */ 1314 /* Disable logging to console */
1318 case SYSLOG_ACTION_CONSOLE_OFF: 1315 case SYSLOG_ACTION_CONSOLE_OFF:
1319 if (saved_console_loglevel == -1) 1316 if (saved_console_loglevel == LOGLEVEL_DEFAULT)
1320 saved_console_loglevel = console_loglevel; 1317 saved_console_loglevel = console_loglevel;
1321 console_loglevel = minimum_console_loglevel; 1318 console_loglevel = minimum_console_loglevel;
1322 break; 1319 break;
1323 /* Enable logging to console */ 1320 /* Enable logging to console */
1324 case SYSLOG_ACTION_CONSOLE_ON: 1321 case SYSLOG_ACTION_CONSOLE_ON:
1325 if (saved_console_loglevel != -1) { 1322 if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
1326 console_loglevel = saved_console_loglevel; 1323 console_loglevel = saved_console_loglevel;
1327 saved_console_loglevel = -1; 1324 saved_console_loglevel = LOGLEVEL_DEFAULT;
1328 } 1325 }
1329 break; 1326 break;
1330 /* Set level of messages printed to console */ 1327 /* Set level of messages printed to console */
@@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1336 len = minimum_console_loglevel; 1333 len = minimum_console_loglevel;
1337 console_loglevel = len; 1334 console_loglevel = len;
1338 /* Implicitly re-enable logging to console */ 1335 /* Implicitly re-enable logging to console */
1339 saved_console_loglevel = -1; 1336 saved_console_loglevel = LOGLEVEL_DEFAULT;
1340 error = 0; 1337 error = 0;
1341 break; 1338 break;
1342 /* Number of chars in the log buffer */ 1339 /* Number of chars in the log buffer */
@@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1627 int printed_len = 0; 1624 int printed_len = 0;
1628 bool in_sched = false; 1625 bool in_sched = false;
1629 /* cpu currently holding logbuf_lock in this function */ 1626 /* cpu currently holding logbuf_lock in this function */
1630 static volatile unsigned int logbuf_cpu = UINT_MAX; 1627 static unsigned int logbuf_cpu = UINT_MAX;
1631 1628
1632 if (level == SCHED_MESSAGE_LOGLEVEL) { 1629 if (level == LOGLEVEL_SCHED) {
1633 level = -1; 1630 level = LOGLEVEL_DEFAULT;
1634 in_sched = true; 1631 in_sched = true;
1635 } 1632 }
1636 1633
@@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1695 const char *end_of_header = printk_skip_level(text); 1692 const char *end_of_header = printk_skip_level(text);
1696 switch (kern_level) { 1693 switch (kern_level) {
1697 case '0' ... '7': 1694 case '0' ... '7':
1698 if (level == -1) 1695 if (level == LOGLEVEL_DEFAULT)
1699 level = kern_level - '0'; 1696 level = kern_level - '0';
1697 /* fallthrough */
1700 case 'd': /* KERN_DEFAULT */ 1698 case 'd': /* KERN_DEFAULT */
1701 lflags |= LOG_PREFIX; 1699 lflags |= LOG_PREFIX;
1702 } 1700 }
@@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1710 } 1708 }
1711 } 1709 }
1712 1710
1713 if (level == -1) 1711 if (level == LOGLEVEL_DEFAULT)
1714 level = default_message_loglevel; 1712 level = default_message_loglevel;
1715 1713
1716 if (dict) 1714 if (dict)
@@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
1788 1786
1789asmlinkage int vprintk(const char *fmt, va_list args) 1787asmlinkage int vprintk(const char *fmt, va_list args)
1790{ 1788{
1791 return vprintk_emit(0, -1, NULL, 0, fmt, args); 1789 return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1792} 1790}
1793EXPORT_SYMBOL(vprintk); 1791EXPORT_SYMBOL(vprintk);
1794 1792
@@ -1842,7 +1840,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
1842 } 1840 }
1843#endif 1841#endif
1844 va_start(args, fmt); 1842 va_start(args, fmt);
1845 r = vprintk_emit(0, -1, NULL, 0, fmt, args); 1843 r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1846 va_end(args); 1844 va_end(args);
1847 1845
1848 return r; 1846 return r;
@@ -1881,23 +1879,20 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }
1881#ifdef CONFIG_EARLY_PRINTK 1879#ifdef CONFIG_EARLY_PRINTK
1882struct console *early_console; 1880struct console *early_console;
1883 1881
1884void early_vprintk(const char *fmt, va_list ap)
1885{
1886 if (early_console) {
1887 char buf[512];
1888 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1889
1890 early_console->write(early_console, buf, n);
1891 }
1892}
1893
1894asmlinkage __visible void early_printk(const char *fmt, ...) 1882asmlinkage __visible void early_printk(const char *fmt, ...)
1895{ 1883{
1896 va_list ap; 1884 va_list ap;
1885 char buf[512];
1886 int n;
1887
1888 if (!early_console)
1889 return;
1897 1890
1898 va_start(ap, fmt); 1891 va_start(ap, fmt);
1899 early_vprintk(fmt, ap); 1892 n = vscnprintf(buf, sizeof(buf), fmt, ap);
1900 va_end(ap); 1893 va_end(ap);
1894
1895 early_console->write(early_console, buf, n);
1901} 1896}
1902#endif 1897#endif
1903 1898
@@ -2634,7 +2629,7 @@ int printk_deferred(const char *fmt, ...)
2634 2629
2635 preempt_disable(); 2630 preempt_disable();
2636 va_start(args, fmt); 2631 va_start(args, fmt);
2637 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); 2632 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
2638 va_end(args); 2633 va_end(args);
2639 2634
2640 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2635 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
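
The printk.c conversion above replaces the private SCHED_MESSAGE_LOGLEVEL and bare -1 sentinels with shared LOGLEVEL_* constants. An illustrative standalone sketch of the idea follows; the numeric values are assumptions for the example rather than quotes from kern_levels.h:

#include <stdio.h>

enum {
	LOGLEVEL_SCHED   = -2,  /* deferred messages from the scheduler path */
	LOGLEVEL_DEFAULT = -1,  /* no explicit level: fall back to the default */
	LOGLEVEL_EMERG   = 0,
	LOGLEVEL_DEBUG   = 7,
};

/* mirrors the vprintk_emit() fallback: map "no level given" onto the default */
static int resolve_level(int level, int default_level)
{
	return level == LOGLEVEL_DEFAULT ? default_level : level;
}

int main(void)
{
	printf("%d\n", resolve_level(LOGLEVEL_DEFAULT, 4)); /* -> 4 */
	printf("%d\n", resolve_level(LOGLEVEL_DEBUG, 4));   /* -> 7 */
	return 0;
}
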
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
485 485
486/* 486/*
487 * Detach all tasks we were using ptrace on. Called with tasklist held 487 * Detach all tasks we were using ptrace on. Called with tasklist held
488 * for writing, and returns with it held too. But note it can release 488 * for writing.
489 * and reacquire the lock.
490 */ 489 */
491void exit_ptrace(struct task_struct *tracer) 490void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
492 __releases(&tasklist_lock)
493 __acquires(&tasklist_lock)
494{ 491{
495 struct task_struct *p, *n; 492 struct task_struct *p, *n;
496 LIST_HEAD(ptrace_dead);
497
498 if (likely(list_empty(&tracer->ptraced)))
499 return;
500 493
501 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 494 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
502 if (unlikely(p->ptrace & PT_EXITKILL)) 495 if (unlikely(p->ptrace & PT_EXITKILL))
503 send_sig_info(SIGKILL, SEND_SIG_FORCED, p); 496 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
504 497
505 if (__ptrace_detach(tracer, p)) 498 if (__ptrace_detach(tracer, p))
506 list_add(&p->ptrace_entry, &ptrace_dead); 499 list_add(&p->ptrace_entry, dead);
507 }
508
509 write_unlock_irq(&tasklist_lock);
510 BUG_ON(!list_empty(&tracer->ptraced));
511
512 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
513 list_del_init(&p->ptrace_entry);
514 release_task(p);
515 } 500 }
516
517 write_lock_irq(&tasklist_lock);
518} 501}
519 502
520int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 503int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
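
exit_ptrace() now only collects the tracees that need releasing onto a caller-supplied list, leaving the release_task() calls to the caller after tasklist_lock is dropped. A generic pthreads sketch (not kernel code) of this collect-under-the-lock, release-after-unlock pattern:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *tracked;            /* protected by list_lock */

static void detach_all(struct node **dead)
{
	/* caller holds list_lock, like tasklist_lock in exit_ptrace() */
	while (tracked) {
		struct node *n = tracked;

		tracked = n->next;
		n->next = *dead;        /* defer the actual release */
		*dead = n;
	}
}

int main(void)
{
	struct node *dead = NULL;

	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->id = i;
		n->next = tracked;
		tracked = n;
	}

	pthread_mutex_lock(&list_lock);
	detach_all(&dead);
	pthread_mutex_unlock(&list_lock);

	while (dead) {                  /* release outside the lock */
		struct node *n = dead;

		dead = n->next;
		printf("releasing %d\n", n->id);
		free(n);
	}
	return 0;
}
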
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15#include <linux/mm.h>
16
17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{
19 spin_lock_init(&counter->lock);
20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent;
23}
24
25static u64 res_counter_uncharge_locked(struct res_counter *counter,
26 unsigned long val)
27{
28 if (WARN_ON(counter->usage < val))
29 val = counter->usage;
30
31 counter->usage -= val;
32 return counter->usage;
33}
34
35static int res_counter_charge_locked(struct res_counter *counter,
36 unsigned long val, bool force)
37{
38 int ret = 0;
39
40 if (counter->usage + val > counter->limit) {
41 counter->failcnt++;
42 ret = -ENOMEM;
43 if (!force)
44 return ret;
45 }
46
47 counter->usage += val;
48 if (counter->usage > counter->max_usage)
49 counter->max_usage = counter->usage;
50 return ret;
51}
52
53static int __res_counter_charge(struct res_counter *counter, unsigned long val,
54 struct res_counter **limit_fail_at, bool force)
55{
56 int ret, r;
57 unsigned long flags;
58 struct res_counter *c, *u;
59
60 r = ret = 0;
61 *limit_fail_at = NULL;
62 local_irq_save(flags);
63 for (c = counter; c != NULL; c = c->parent) {
64 spin_lock(&c->lock);
65 r = res_counter_charge_locked(c, val, force);
66 spin_unlock(&c->lock);
67 if (r < 0 && !ret) {
68 ret = r;
69 *limit_fail_at = c;
70 if (!force)
71 break;
72 }
73 }
74
75 if (ret < 0 && !force) {
76 for (u = counter; u != c; u = u->parent) {
77 spin_lock(&u->lock);
78 res_counter_uncharge_locked(u, val);
79 spin_unlock(&u->lock);
80 }
81 }
82 local_irq_restore(flags);
83
84 return ret;
85}
86
87int res_counter_charge(struct res_counter *counter, unsigned long val,
88 struct res_counter **limit_fail_at)
89{
90 return __res_counter_charge(counter, val, limit_fail_at, false);
91}
92
93int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
94 struct res_counter **limit_fail_at)
95{
96 return __res_counter_charge(counter, val, limit_fail_at, true);
97}
98
99u64 res_counter_uncharge_until(struct res_counter *counter,
100 struct res_counter *top,
101 unsigned long val)
102{
103 unsigned long flags;
104 struct res_counter *c;
105 u64 ret = 0;
106
107 local_irq_save(flags);
108 for (c = counter; c != top; c = c->parent) {
109 u64 r;
110 spin_lock(&c->lock);
111 r = res_counter_uncharge_locked(c, val);
112 if (c == counter)
113 ret = r;
114 spin_unlock(&c->lock);
115 }
116 local_irq_restore(flags);
117 return ret;
118}
119
120u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
121{
122 return res_counter_uncharge_until(counter, NULL, val);
123}
124
125static inline unsigned long long *
126res_counter_member(struct res_counter *counter, int member)
127{
128 switch (member) {
129 case RES_USAGE:
130 return &counter->usage;
131 case RES_MAX_USAGE:
132 return &counter->max_usage;
133 case RES_LIMIT:
134 return &counter->limit;
135 case RES_FAILCNT:
136 return &counter->failcnt;
137 case RES_SOFT_LIMIT:
138 return &counter->soft_limit;
139 };
140
141 BUG();
142 return NULL;
143}
144
145ssize_t res_counter_read(struct res_counter *counter, int member,
146 const char __user *userbuf, size_t nbytes, loff_t *pos,
147 int (*read_strategy)(unsigned long long val, char *st_buf))
148{
149 unsigned long long *val;
150 char buf[64], *s;
151
152 s = buf;
153 val = res_counter_member(counter, member);
154 if (read_strategy)
155 s += read_strategy(*val, s);
156 else
157 s += sprintf(s, "%llu\n", *val);
158 return simple_read_from_buffer((void __user *)userbuf, nbytes,
159 pos, buf, s - buf);
160}
161
162#if BITS_PER_LONG == 32
163u64 res_counter_read_u64(struct res_counter *counter, int member)
164{
165 unsigned long flags;
166 u64 ret;
167
168 spin_lock_irqsave(&counter->lock, flags);
169 ret = *res_counter_member(counter, member);
170 spin_unlock_irqrestore(&counter->lock, flags);
171
172 return ret;
173}
174#else
175u64 res_counter_read_u64(struct res_counter *counter, int member)
176{
177 return *res_counter_member(counter, member);
178}
179#endif
180
181int res_counter_memparse_write_strategy(const char *buf,
182 unsigned long long *resp)
183{
184 char *end;
185 unsigned long long res;
186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') {
189 int rc = kstrtoull(buf + 1, 10, &res);
190
191 if (rc)
192 return rc;
193 if (res != 1)
194 return -EINVAL;
195 *resp = RES_COUNTER_MAX;
196 return 0;
197 }
198
199 res = memparse(buf, &end);
200 if (*end != '\0')
201 return -EINVAL;
202
203 if (PAGE_ALIGN(res) >= res)
204 res = PAGE_ALIGN(res);
205 else
206 res = RES_COUNTER_MAX;
207
208 *resp = res;
209
210 return 0;
211}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bb398c0c5f08..b5797b78add6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4527,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
4527#ifdef CONFIG_DEBUG_STACK_USAGE 4527#ifdef CONFIG_DEBUG_STACK_USAGE
4528 free = stack_not_used(p); 4528 free = stack_not_used(p);
4529#endif 4529#endif
4530 ppid = 0;
4530 rcu_read_lock(); 4531 rcu_read_lock();
4531 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4532 if (pid_alive(p))
4533 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4532 rcu_read_unlock(); 4534 rcu_read_unlock();
4533 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4535 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4534 task_pid_nr(p), ppid, 4536 task_pid_nr(p), ppid,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 15f2511a1b7c..7c54ff79afd7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1104,6 +1104,15 @@ static struct ctl_table kern_table[] = {
1104 .proc_handler = proc_dointvec, 1104 .proc_handler = proc_dointvec,
1105 }, 1105 },
1106#endif 1106#endif
1107 {
1108 .procname = "panic_on_warn",
1109 .data = &panic_on_warn,
1110 .maxlen = sizeof(int),
1111 .mode = 0644,
1112 .proc_handler = proc_dointvec_minmax,
1113 .extra1 = &zero,
1114 .extra2 = &one,
1115 },
1107 { } 1116 { }
1108}; 1117};
1109 1118
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 9a4f750a2963..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
140 { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
140 {} 141 {}
141}; 142};
142 143
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index add80cc02dbe..9722bd2dbc9b 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -102,6 +102,14 @@ static DEFINE_SPINLOCK(free_entries_lock);
102/* Global disable flag - will be set in case of an error */ 102/* Global disable flag - will be set in case of an error */
103static u32 global_disable __read_mostly; 103static u32 global_disable __read_mostly;
104 104
105/* Early initialization disable flag, set at the end of dma_debug_init */
106static bool dma_debug_initialized __read_mostly;
107
108static inline bool dma_debug_disabled(void)
109{
110 return global_disable || !dma_debug_initialized;
111}
112
105/* Global error count */ 113/* Global error count */
106static u32 error_count; 114static u32 error_count;
107 115
@@ -945,7 +953,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
945 struct dma_debug_entry *uninitialized_var(entry); 953 struct dma_debug_entry *uninitialized_var(entry);
946 int count; 954 int count;
947 955
948 if (global_disable) 956 if (dma_debug_disabled())
949 return 0; 957 return 0;
950 958
951 switch (action) { 959 switch (action) {
@@ -973,7 +981,7 @@ void dma_debug_add_bus(struct bus_type *bus)
973{ 981{
974 struct notifier_block *nb; 982 struct notifier_block *nb;
975 983
976 if (global_disable) 984 if (dma_debug_disabled())
977 return; 985 return;
978 986
979 nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); 987 nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
@@ -994,6 +1002,9 @@ void dma_debug_init(u32 num_entries)
994{ 1002{
995 int i; 1003 int i;
996 1004
1005 /* Do not use dma_debug_initialized here, since we really want to be
1006 * called to set dma_debug_initialized
1007 */
997 if (global_disable) 1008 if (global_disable)
998 return; 1009 return;
999 1010
@@ -1021,6 +1032,8 @@ void dma_debug_init(u32 num_entries)
1021 1032
1022 nr_total_entries = num_free_entries; 1033 nr_total_entries = num_free_entries;
1023 1034
1035 dma_debug_initialized = true;
1036
1024 pr_info("DMA-API: debugging enabled by kernel config\n"); 1037 pr_info("DMA-API: debugging enabled by kernel config\n");
1025} 1038}
1026 1039
@@ -1243,7 +1256,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
1243{ 1256{
1244 struct dma_debug_entry *entry; 1257 struct dma_debug_entry *entry;
1245 1258
1246 if (unlikely(global_disable)) 1259 if (unlikely(dma_debug_disabled()))
1247 return; 1260 return;
1248 1261
1249 if (dma_mapping_error(dev, dma_addr)) 1262 if (dma_mapping_error(dev, dma_addr))
@@ -1283,7 +1296,7 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
1283 struct hash_bucket *bucket; 1296 struct hash_bucket *bucket;
1284 unsigned long flags; 1297 unsigned long flags;
1285 1298
1286 if (unlikely(global_disable)) 1299 if (unlikely(dma_debug_disabled()))
1287 return; 1300 return;
1288 1301
1289 ref.dev = dev; 1302 ref.dev = dev;
@@ -1325,7 +1338,7 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
1325 .direction = direction, 1338 .direction = direction,
1326 }; 1339 };
1327 1340
1328 if (unlikely(global_disable)) 1341 if (unlikely(dma_debug_disabled()))
1329 return; 1342 return;
1330 1343
1331 if (map_single) 1344 if (map_single)
@@ -1342,7 +1355,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
1342 struct scatterlist *s; 1355 struct scatterlist *s;
1343 int i; 1356 int i;
1344 1357
1345 if (unlikely(global_disable)) 1358 if (unlikely(dma_debug_disabled()))
1346 return; 1359 return;
1347 1360
1348 for_each_sg(sg, s, mapped_ents, i) { 1361 for_each_sg(sg, s, mapped_ents, i) {
@@ -1395,7 +1408,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
1395 struct scatterlist *s; 1408 struct scatterlist *s;
1396 int mapped_ents = 0, i; 1409 int mapped_ents = 0, i;
1397 1410
1398 if (unlikely(global_disable)) 1411 if (unlikely(dma_debug_disabled()))
1399 return; 1412 return;
1400 1413
1401 for_each_sg(sglist, s, nelems, i) { 1414 for_each_sg(sglist, s, nelems, i) {
@@ -1427,7 +1440,7 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
1427{ 1440{
1428 struct dma_debug_entry *entry; 1441 struct dma_debug_entry *entry;
1429 1442
1430 if (unlikely(global_disable)) 1443 if (unlikely(dma_debug_disabled()))
1431 return; 1444 return;
1432 1445
1433 if (unlikely(virt == NULL)) 1446 if (unlikely(virt == NULL))
@@ -1462,7 +1475,7 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
1462 .direction = DMA_BIDIRECTIONAL, 1475 .direction = DMA_BIDIRECTIONAL,
1463 }; 1476 };
1464 1477
1465 if (unlikely(global_disable)) 1478 if (unlikely(dma_debug_disabled()))
1466 return; 1479 return;
1467 1480
1468 check_unmap(&ref); 1481 check_unmap(&ref);
@@ -1474,7 +1487,7 @@ void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
1474{ 1487{
1475 struct dma_debug_entry ref; 1488 struct dma_debug_entry ref;
1476 1489
1477 if (unlikely(global_disable)) 1490 if (unlikely(dma_debug_disabled()))
1478 return; 1491 return;
1479 1492
1480 ref.type = dma_debug_single; 1493 ref.type = dma_debug_single;
@@ -1494,7 +1507,7 @@ void debug_dma_sync_single_for_device(struct device *dev,
1494{ 1507{
1495 struct dma_debug_entry ref; 1508 struct dma_debug_entry ref;
1496 1509
1497 if (unlikely(global_disable)) 1510 if (unlikely(dma_debug_disabled()))
1498 return; 1511 return;
1499 1512
1500 ref.type = dma_debug_single; 1513 ref.type = dma_debug_single;
@@ -1515,7 +1528,7 @@ void debug_dma_sync_single_range_for_cpu(struct device *dev,
1515{ 1528{
1516 struct dma_debug_entry ref; 1529 struct dma_debug_entry ref;
1517 1530
1518 if (unlikely(global_disable)) 1531 if (unlikely(dma_debug_disabled()))
1519 return; 1532 return;
1520 1533
1521 ref.type = dma_debug_single; 1534 ref.type = dma_debug_single;
@@ -1536,7 +1549,7 @@ void debug_dma_sync_single_range_for_device(struct device *dev,
1536{ 1549{
1537 struct dma_debug_entry ref; 1550 struct dma_debug_entry ref;
1538 1551
1539 if (unlikely(global_disable)) 1552 if (unlikely(dma_debug_disabled()))
1540 return; 1553 return;
1541 1554
1542 ref.type = dma_debug_single; 1555 ref.type = dma_debug_single;
@@ -1556,7 +1569,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
1556 struct scatterlist *s; 1569 struct scatterlist *s;
1557 int mapped_ents = 0, i; 1570 int mapped_ents = 0, i;
1558 1571
1559 if (unlikely(global_disable)) 1572 if (unlikely(dma_debug_disabled()))
1560 return; 1573 return;
1561 1574
1562 for_each_sg(sg, s, nelems, i) { 1575 for_each_sg(sg, s, nelems, i) {
@@ -1589,7 +1602,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
1589 struct scatterlist *s; 1602 struct scatterlist *s;
1590 int mapped_ents = 0, i; 1603 int mapped_ents = 0, i;
1591 1604
1592 if (unlikely(global_disable)) 1605 if (unlikely(dma_debug_disabled()))
1593 return; 1606 return;
1594 1607
1595 for_each_sg(sg, s, nelems, i) { 1608 for_each_sg(sg, s, nelems, i) {
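
The dma_debug_disabled() conversion above folds the existing global_disable flag and a new "has init run yet" flag into one predicate, so every debug hook treats calls made before dma_debug_init() as disabled. A standalone sketch of the same gate, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

static bool global_disable;        /* set on error or by an explicit "off" switch */
static bool debug_initialized;     /* set at the end of init */

static inline bool debug_disabled(void)
{
	return global_disable || !debug_initialized;
}

static void debug_track_event(const char *what)
{
	if (debug_disabled())      /* early or disabled callers fall out here */
		return;
	printf("tracking: %s\n", what);
}

static void debug_init(void)
{
	if (global_disable)
		return;
	/* ... allocate hash tables, entries, etc. ... */
	debug_initialized = true;
}

int main(void)
{
	debug_track_event("before init");   /* silently ignored */
	debug_init();
	debug_track_event("after init");    /* now recorded */
	return 0;
}
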
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index dfba05521748..527799d44476 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -576,7 +576,7 @@ void __dynamic_dev_dbg(struct _ddebug *descriptor,
576 } else { 576 } else {
577 char buf[PREFIX_SIZE]; 577 char buf[PREFIX_SIZE];
578 578
579 dev_printk_emit(7, dev, "%s%s %s: %pV", 579 dev_printk_emit(LOGLEVEL_DEBUG, dev, "%s%s %s: %pV",
580 dynamic_emit_prefix(descriptor, buf), 580 dynamic_emit_prefix(descriptor, buf),
581 dev_driver_string(dev), dev_name(dev), 581 dev_driver_string(dev), dev_name(dev),
582 &vaf); 582 &vaf);
@@ -605,7 +605,7 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
605 if (dev && dev->dev.parent) { 605 if (dev && dev->dev.parent) {
606 char buf[PREFIX_SIZE]; 606 char buf[PREFIX_SIZE];
607 607
608 dev_printk_emit(7, dev->dev.parent, 608 dev_printk_emit(LOGLEVEL_DEBUG, dev->dev.parent,
609 "%s%s %s %s%s: %pV", 609 "%s%s %s %s%s: %pV",
610 dynamic_emit_prefix(descriptor, buf), 610 dynamic_emit_prefix(descriptor, buf),
611 dev_driver_string(dev->dev.parent), 611 dev_driver_string(dev->dev.parent),
diff --git a/lib/lcm.c b/lib/lcm.c
index b9c8de461e9e..51cc6b13cd52 100644
--- a/lib/lcm.c
+++ b/lib/lcm.c
@@ -7,10 +7,8 @@
7unsigned long lcm(unsigned long a, unsigned long b) 7unsigned long lcm(unsigned long a, unsigned long b)
8{ 8{
9 if (a && b) 9 if (a && b)
10 return (a * b) / gcd(a, b); 10 return (a / gcd(a, b)) * b;
11 else if (b) 11 else
12 return b; 12 return 0;
13
14 return a;
15} 13}
16EXPORT_SYMBOL_GPL(lcm); 14EXPORT_SYMBOL_GPL(lcm);
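
The lcm() rewrite divides by the gcd before multiplying, so the intermediate value never exceeds the final result, and it returns 0 when either argument is 0. A worked userspace example, assuming a 64-bit unsigned long:

#include <stdio.h>

static unsigned long gcd(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long t = a % b;

		a = b;
		b = t;
	}
	return a;
}

static unsigned long lcm(unsigned long a, unsigned long b)
{
	if (a && b)
		return (a / gcd(a, b)) * b;   /* divide first, then multiply */
	return 0;                             /* lcm with 0 defined as 0 here */
}

int main(void)
{
	/* 2^32 and 2^33: a*b would overflow 64 bits, (a/gcd)*b does not (LP64) */
	unsigned long a = 1UL << 32, b = 1UL << 33;

	printf("lcm = %lu\n", lcm(a, b));     /* 8589934592 */
	return 0;
}
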
diff --git a/mm/Makefile b/mm/Makefile
index 8405eb0023a9..b3c6ce932c64 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,7 +55,9 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
55obj-$(CONFIG_MIGRATION) += migrate.o 55obj-$(CONFIG_MIGRATION) += migrate.o
56obj-$(CONFIG_QUICKLIST) += quicklist.o 56obj-$(CONFIG_QUICKLIST) += quicklist.o
57obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 57obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
58obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o 58obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
59obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
60obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
59obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o 61obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
60obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 62obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
61obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 63obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/cma.c b/mm/cma.c
index fde706e1284f..8e9ec13d31db 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -215,9 +215,21 @@ int __init cma_declare_contiguous(phys_addr_t base,
215 bool fixed, struct cma **res_cma) 215 bool fixed, struct cma **res_cma)
216{ 216{
217 phys_addr_t memblock_end = memblock_end_of_DRAM(); 217 phys_addr_t memblock_end = memblock_end_of_DRAM();
218 phys_addr_t highmem_start = __pa(high_memory); 218 phys_addr_t highmem_start;
219 int ret = 0; 219 int ret = 0;
220 220
221#ifdef CONFIG_X86
222 /*
223 * high_memory isn't direct mapped memory so retrieving its physical
224 * address isn't appropriate. But it would be useful to check the
225 * physical address of the highmem boundary so it's justfiable to get
226 * the physical address from it. On x86 there is a validation check for
227 * this case, so the following workaround is needed to avoid it.
228 */
229 highmem_start = __pa_nodebug(high_memory);
230#else
231 highmem_start = __pa(high_memory);
232#endif
221 pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", 233 pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
222 __func__, &size, &base, &limit, &alignment); 234 __func__, &size, &base, &limit, &alignment);
223 235
diff --git a/mm/compaction.c b/mm/compaction.c
index f9792ba3537c..546e571e9d60 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -41,15 +41,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
41static unsigned long release_freepages(struct list_head *freelist) 41static unsigned long release_freepages(struct list_head *freelist)
42{ 42{
43 struct page *page, *next; 43 struct page *page, *next;
44 unsigned long count = 0; 44 unsigned long high_pfn = 0;
45 45
46 list_for_each_entry_safe(page, next, freelist, lru) { 46 list_for_each_entry_safe(page, next, freelist, lru) {
47 unsigned long pfn = page_to_pfn(page);
47 list_del(&page->lru); 48 list_del(&page->lru);
48 __free_page(page); 49 __free_page(page);
49 count++; 50 if (pfn > high_pfn)
51 high_pfn = pfn;
50 } 52 }
51 53
52 return count; 54 return high_pfn;
53} 55}
54 56
55static void map_pages(struct list_head *list) 57static void map_pages(struct list_head *list)
@@ -195,16 +197,12 @@ static void update_pageblock_skip(struct compact_control *cc,
195 197
196 /* Update where async and sync compaction should restart */ 198 /* Update where async and sync compaction should restart */
197 if (migrate_scanner) { 199 if (migrate_scanner) {
198 if (cc->finished_update_migrate)
199 return;
200 if (pfn > zone->compact_cached_migrate_pfn[0]) 200 if (pfn > zone->compact_cached_migrate_pfn[0])
201 zone->compact_cached_migrate_pfn[0] = pfn; 201 zone->compact_cached_migrate_pfn[0] = pfn;
202 if (cc->mode != MIGRATE_ASYNC && 202 if (cc->mode != MIGRATE_ASYNC &&
203 pfn > zone->compact_cached_migrate_pfn[1]) 203 pfn > zone->compact_cached_migrate_pfn[1])
204 zone->compact_cached_migrate_pfn[1] = pfn; 204 zone->compact_cached_migrate_pfn[1] = pfn;
205 } else { 205 } else {
206 if (cc->finished_update_free)
207 return;
208 if (pfn < zone->compact_cached_free_pfn) 206 if (pfn < zone->compact_cached_free_pfn)
209 zone->compact_cached_free_pfn = pfn; 207 zone->compact_cached_free_pfn = pfn;
210 } 208 }
@@ -715,7 +713,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
715 del_page_from_lru_list(page, lruvec, page_lru(page)); 713 del_page_from_lru_list(page, lruvec, page_lru(page));
716 714
717isolate_success: 715isolate_success:
718 cc->finished_update_migrate = true;
719 list_add(&page->lru, migratelist); 716 list_add(&page->lru, migratelist);
720 cc->nr_migratepages++; 717 cc->nr_migratepages++;
721 nr_isolated++; 718 nr_isolated++;
@@ -889,15 +886,6 @@ static void isolate_freepages(struct compact_control *cc)
889 block_start_pfn - pageblock_nr_pages; 886 block_start_pfn - pageblock_nr_pages;
890 887
891 /* 888 /*
892 * Set a flag that we successfully isolated in this pageblock.
893 * In the next loop iteration, zone->compact_cached_free_pfn
894 * will not be updated and thus it will effectively contain the
895 * highest pageblock we isolated pages from.
896 */
897 if (isolated)
898 cc->finished_update_free = true;
899
900 /*
901 * isolate_freepages_block() might have aborted due to async 889 * isolate_freepages_block() might have aborted due to async
902 * compaction being contended 890 * compaction being contended
903 */ 891 */
@@ -1086,9 +1074,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1086 1074
1087 /* Compaction run is not finished if the watermark is not met */ 1075 /* Compaction run is not finished if the watermark is not met */
1088 watermark = low_wmark_pages(zone); 1076 watermark = low_wmark_pages(zone);
1089 watermark += (1 << cc->order);
1090 1077
1091 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 1078 if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
1079 cc->alloc_flags))
1092 return COMPACT_CONTINUE; 1080 return COMPACT_CONTINUE;
1093 1081
1094 /* Direct compactor: Is a suitable page free? */ 1082 /* Direct compactor: Is a suitable page free? */
@@ -1114,7 +1102,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1114 * COMPACT_PARTIAL - If the allocation would succeed without compaction 1102 * COMPACT_PARTIAL - If the allocation would succeed without compaction
1115 * COMPACT_CONTINUE - If compaction should run now 1103 * COMPACT_CONTINUE - If compaction should run now
1116 */ 1104 */
1117unsigned long compaction_suitable(struct zone *zone, int order) 1105unsigned long compaction_suitable(struct zone *zone, int order,
1106 int alloc_flags, int classzone_idx)
1118{ 1107{
1119 int fragindex; 1108 int fragindex;
1120 unsigned long watermark; 1109 unsigned long watermark;
@@ -1126,21 +1115,30 @@ unsigned long compaction_suitable(struct zone *zone, int order)
1126 if (order == -1) 1115 if (order == -1)
1127 return COMPACT_CONTINUE; 1116 return COMPACT_CONTINUE;
1128 1117
1118 watermark = low_wmark_pages(zone);
1119 /*
1120 * If watermarks for high-order allocation are already met, there
1121 * should be no need for compaction at all.
1122 */
1123 if (zone_watermark_ok(zone, order, watermark, classzone_idx,
1124 alloc_flags))
1125 return COMPACT_PARTIAL;
1126
1129 /* 1127 /*
1130 * Watermarks for order-0 must be met for compaction. Note the 2UL. 1128 * Watermarks for order-0 must be met for compaction. Note the 2UL.
1131 * This is because during migration, copies of pages need to be 1129 * This is because during migration, copies of pages need to be
1132 * allocated and for a short time, the footprint is higher 1130 * allocated and for a short time, the footprint is higher
1133 */ 1131 */
1134 watermark = low_wmark_pages(zone) + (2UL << order); 1132 watermark += (2UL << order);
1135 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1133 if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
1136 return COMPACT_SKIPPED; 1134 return COMPACT_SKIPPED;
1137 1135
1138 /* 1136 /*
1139 * fragmentation index determines if allocation failures are due to 1137 * fragmentation index determines if allocation failures are due to
1140 * low memory or external fragmentation 1138 * low memory or external fragmentation
1141 * 1139 *
1142 * index of -1000 implies allocations might succeed depending on 1140 * index of -1000 would imply allocations might succeed depending on
1143 * watermarks 1141 * watermarks, but we already failed the high-order watermark check
1144 * index towards 0 implies failure is due to lack of memory 1142 * index towards 0 implies failure is due to lack of memory
1145 * index towards 1000 implies failure is due to fragmentation 1143 * index towards 1000 implies failure is due to fragmentation
1146 * 1144 *
@@ -1150,10 +1148,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
1150 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 1148 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1151 return COMPACT_SKIPPED; 1149 return COMPACT_SKIPPED;
1152 1150
1153 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
1154 0, 0))
1155 return COMPACT_PARTIAL;
1156
1157 return COMPACT_CONTINUE; 1151 return COMPACT_CONTINUE;
1158} 1152}
1159 1153
@@ -1164,8 +1158,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1164 unsigned long end_pfn = zone_end_pfn(zone); 1158 unsigned long end_pfn = zone_end_pfn(zone);
1165 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); 1159 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1166 const bool sync = cc->mode != MIGRATE_ASYNC; 1160 const bool sync = cc->mode != MIGRATE_ASYNC;
1161 unsigned long last_migrated_pfn = 0;
1167 1162
1168 ret = compaction_suitable(zone, cc->order); 1163 ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1164 cc->classzone_idx);
1169 switch (ret) { 1165 switch (ret) {
1170 case COMPACT_PARTIAL: 1166 case COMPACT_PARTIAL:
1171 case COMPACT_SKIPPED: 1167 case COMPACT_SKIPPED:
@@ -1208,6 +1204,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1208 while ((ret = compact_finished(zone, cc, migratetype)) == 1204 while ((ret = compact_finished(zone, cc, migratetype)) ==
1209 COMPACT_CONTINUE) { 1205 COMPACT_CONTINUE) {
1210 int err; 1206 int err;
1207 unsigned long isolate_start_pfn = cc->migrate_pfn;
1211 1208
1212 switch (isolate_migratepages(zone, cc)) { 1209 switch (isolate_migratepages(zone, cc)) {
1213 case ISOLATE_ABORT: 1210 case ISOLATE_ABORT:
@@ -1216,7 +1213,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1216 cc->nr_migratepages = 0; 1213 cc->nr_migratepages = 0;
1217 goto out; 1214 goto out;
1218 case ISOLATE_NONE: 1215 case ISOLATE_NONE:
1219 continue; 1216 /*
1217 * We haven't isolated and migrated anything, but
1218 * there might still be unflushed migrations from
1219 * previous cc->order aligned block.
1220 */
1221 goto check_drain;
1220 case ISOLATE_SUCCESS: 1222 case ISOLATE_SUCCESS:
1221 ; 1223 ;
1222 } 1224 }
@@ -1241,12 +1243,61 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1241 goto out; 1243 goto out;
1242 } 1244 }
1243 } 1245 }
1246
1247 /*
1248 * Record where we could have freed pages by migration and not
1249 * yet flushed them to buddy allocator. We use the pfn that
1250 * isolate_migratepages() started from in this loop iteration
1251 * - this is the lowest page that could have been isolated and
1252 * then freed by migration.
1253 */
1254 if (!last_migrated_pfn)
1255 last_migrated_pfn = isolate_start_pfn;
1256
1257check_drain:
1258 /*
1259 * Has the migration scanner moved away from the previous
1260 * cc->order aligned block where we migrated from? If yes,
1261 * flush the pages that were freed, so that they can merge and
1262 * compact_finished() can detect immediately if allocation
1263 * would succeed.
1264 */
1265 if (cc->order > 0 && last_migrated_pfn) {
1266 int cpu;
1267 unsigned long current_block_start =
1268 cc->migrate_pfn & ~((1UL << cc->order) - 1);
1269
1270 if (last_migrated_pfn < current_block_start) {
1271 cpu = get_cpu();
1272 lru_add_drain_cpu(cpu);
1273 drain_local_pages(zone);
1274 put_cpu();
1275 /* No more flushing until we migrate again */
1276 last_migrated_pfn = 0;
1277 }
1278 }
1279
1244 } 1280 }
1245 1281
1246out: 1282out:
1247 /* Release free pages and check accounting */ 1283 /*
1248 cc->nr_freepages -= release_freepages(&cc->freepages); 1284 * Release free pages and update where the free scanner should restart,
1249 VM_BUG_ON(cc->nr_freepages != 0); 1285 * so we don't leave any returned pages behind in the next attempt.
1286 */
1287 if (cc->nr_freepages > 0) {
1288 unsigned long free_pfn = release_freepages(&cc->freepages);
1289
1290 cc->nr_freepages = 0;
1291 VM_BUG_ON(free_pfn == 0);
1292 /* The cached pfn is always the first in a pageblock */
1293 free_pfn &= ~(pageblock_nr_pages-1);
1294 /*
1295 * Only go back, not forward. The cached pfn might have been
1296 * already reset to zone end in compact_finished()
1297 */
1298 if (free_pfn > zone->compact_cached_free_pfn)
1299 zone->compact_cached_free_pfn = free_pfn;
1300 }
1250 1301
1251 trace_mm_compaction_end(ret); 1302 trace_mm_compaction_end(ret);
1252 1303
@@ -1254,7 +1305,8 @@ out:
1254} 1305}
1255 1306
1256static unsigned long compact_zone_order(struct zone *zone, int order, 1307static unsigned long compact_zone_order(struct zone *zone, int order,
1257 gfp_t gfp_mask, enum migrate_mode mode, int *contended) 1308 gfp_t gfp_mask, enum migrate_mode mode, int *contended,
1309 int alloc_flags, int classzone_idx)
1258{ 1310{
1259 unsigned long ret; 1311 unsigned long ret;
1260 struct compact_control cc = { 1312 struct compact_control cc = {
@@ -1264,6 +1316,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
1264 .gfp_mask = gfp_mask, 1316 .gfp_mask = gfp_mask,
1265 .zone = zone, 1317 .zone = zone,
1266 .mode = mode, 1318 .mode = mode,
1319 .alloc_flags = alloc_flags,
1320 .classzone_idx = classzone_idx,
1267 }; 1321 };
1268 INIT_LIST_HEAD(&cc.freepages); 1322 INIT_LIST_HEAD(&cc.freepages);
1269 INIT_LIST_HEAD(&cc.migratepages); 1323 INIT_LIST_HEAD(&cc.migratepages);
@@ -1288,14 +1342,13 @@ int sysctl_extfrag_threshold = 500;
1288 * @mode: The migration mode for async, sync light, or sync migration 1342 * @mode: The migration mode for async, sync light, or sync migration
1289 * @contended: Return value that determines if compaction was aborted due to 1343 * @contended: Return value that determines if compaction was aborted due to
1290 * need_resched() or lock contention 1344 * need_resched() or lock contention
1291 * @candidate_zone: Return the zone where we think allocation should succeed
1292 * 1345 *
1293 * This is the main entry point for direct page compaction. 1346 * This is the main entry point for direct page compaction.
1294 */ 1347 */
1295unsigned long try_to_compact_pages(struct zonelist *zonelist, 1348unsigned long try_to_compact_pages(struct zonelist *zonelist,
1296 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1349 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1297 enum migrate_mode mode, int *contended, 1350 enum migrate_mode mode, int *contended,
1298 struct zone **candidate_zone) 1351 int alloc_flags, int classzone_idx)
1299{ 1352{
1300 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1353 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1301 int may_enter_fs = gfp_mask & __GFP_FS; 1354 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1303,7 +1356,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1303 struct zoneref *z; 1356 struct zoneref *z;
1304 struct zone *zone; 1357 struct zone *zone;
1305 int rc = COMPACT_DEFERRED; 1358 int rc = COMPACT_DEFERRED;
1306 int alloc_flags = 0;
1307 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ 1359 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1308 1360
1309 *contended = COMPACT_CONTENDED_NONE; 1361 *contended = COMPACT_CONTENDED_NONE;
@@ -1312,10 +1364,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1312 if (!order || !may_enter_fs || !may_perform_io) 1364 if (!order || !may_enter_fs || !may_perform_io)
1313 return COMPACT_SKIPPED; 1365 return COMPACT_SKIPPED;
1314 1366
1315#ifdef CONFIG_CMA
1316 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1317 alloc_flags |= ALLOC_CMA;
1318#endif
1319 /* Compact each zone in the list */ 1367 /* Compact each zone in the list */
1320 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1368 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1321 nodemask) { 1369 nodemask) {
@@ -1326,7 +1374,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1326 continue; 1374 continue;
1327 1375
1328 status = compact_zone_order(zone, order, gfp_mask, mode, 1376 status = compact_zone_order(zone, order, gfp_mask, mode,
1329 &zone_contended); 1377 &zone_contended, alloc_flags, classzone_idx);
1330 rc = max(status, rc); 1378 rc = max(status, rc);
1331 /* 1379 /*
1332 * It takes at least one zone that wasn't lock contended 1380 * It takes at least one zone that wasn't lock contended
@@ -1335,9 +1383,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1335 all_zones_contended &= zone_contended; 1383 all_zones_contended &= zone_contended;
1336 1384
1337 /* If a normal allocation would succeed, stop compacting */ 1385 /* If a normal allocation would succeed, stop compacting */
1338 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 1386 if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
1339 alloc_flags)) { 1387 classzone_idx, alloc_flags)) {
1340 *candidate_zone = zone;
1341 /* 1388 /*
1342 * We think the allocation will succeed in this zone, 1389 * We think the allocation will succeed in this zone,
1343 * but it is not certain, hence the false. The caller 1390 * but it is not certain, hence the false. The caller
@@ -1359,7 +1406,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1359 goto break_loop; 1406 goto break_loop;
1360 } 1407 }
1361 1408
1362 if (mode != MIGRATE_ASYNC) { 1409 if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
1363 /* 1410 /*
1364 * We think that allocation won't succeed in this zone 1411 * We think that allocation won't succeed in this zone
1365 * so we defer compaction there. If it ends up 1412 * so we defer compaction there. If it ends up
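
release_freepages() now reports the highest pfn it handed back to the buddy allocator instead of a count, which lets compact_zone() pull the cached free-scanner position back to that pageblock. A simplified userspace sketch of that contract, using a hypothetical fake_page list:

#include <stdio.h>
#include <stdlib.h>

struct fake_page { unsigned long pfn; struct fake_page *next; };

static unsigned long release_all(struct fake_page *list)
{
	unsigned long high_pfn = 0;

	while (list) {
		struct fake_page *p = list;

		list = p->next;
		if (p->pfn > high_pfn)
			high_pfn = p->pfn;   /* remember the highest pfn freed */
		free(p);                     /* "return it to the allocator" */
	}
	return high_pfn;
}

int main(void)
{
	struct fake_page *list = NULL;
	unsigned long pfns[] = { 4096, 1024, 16384 };

	for (int i = 0; i < 3; i++) {
		struct fake_page *p = malloc(sizeof(*p));

		p->pfn = pfns[i];
		p->next = list;
		list = p;
	}
	printf("restart free scanner at pfn %lu\n", release_all(list)); /* 16384 */
	return 0;
}
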
diff --git a/mm/debug.c b/mm/debug.c
index 5ce45c9a29b5..0e58f3211f89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason,
95 dump_flags(page->flags & badflags, 95 dump_flags(page->flags & badflags,
96 pageflag_names, ARRAY_SIZE(pageflag_names)); 96 pageflag_names, ARRAY_SIZE(pageflag_names));
97 } 97 }
98 mem_cgroup_print_bad_page(page); 98#ifdef CONFIG_MEMCG
99 if (page->mem_cgroup)
100 pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
101#endif
99} 102}
100 103
101void dump_page(struct page *page, const char *reason) 104void dump_page(struct page *page, const char *reason)
diff --git a/mm/frontswap.c b/mm/frontswap.c
index f2a3571c6e22..8d82809eb085 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -182,7 +182,7 @@ void __frontswap_init(unsigned type, unsigned long *map)
182 if (frontswap_ops) 182 if (frontswap_ops)
183 frontswap_ops->init(type); 183 frontswap_ops->init(type);
184 else { 184 else {
185 BUG_ON(type > MAX_SWAPFILES); 185 BUG_ON(type >= MAX_SWAPFILES);
186 set_bit(type, need_init); 186 set_bit(type, need_init);
187 } 187 }
188} 188}
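
The frontswap fix tightens an off-by-one: with MAX_SWAPFILES slots the valid swap types are 0..MAX_SWAPFILES-1, so the sanity check must reject type == MAX_SWAPFILES as well. A tiny illustration with a userspace assert standing in for BUG_ON (the MAX_SWAPFILES value here is made up for the sketch):

#include <assert.h>

#define MAX_SWAPFILES 32   /* illustrative value only */

static unsigned long need_init[MAX_SWAPFILES];

static void mark_need_init(unsigned type)
{
	/* old check only rejected type > MAX_SWAPFILES, letting type == MAX through */
	assert(type < MAX_SWAPFILES);
	need_init[type] = 1;
}

int main(void)
{
	mark_need_init(0);
	mark_need_init(MAX_SWAPFILES - 1);
	/* mark_need_init(MAX_SWAPFILES); would trip the assert, as it should */
	return 0;
}
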
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de984159cf0b..5b2c6875fc38 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
784 if (!pmd_none(*pmd)) 784 if (!pmd_none(*pmd))
785 return false; 785 return false;
786 entry = mk_pmd(zero_page, vma->vm_page_prot); 786 entry = mk_pmd(zero_page, vma->vm_page_prot);
787 entry = pmd_wrprotect(entry);
788 entry = pmd_mkhuge(entry); 787 entry = pmd_mkhuge(entry);
789 pgtable_trans_huge_deposit(mm, pmd, pgtable); 788 pgtable_trans_huge_deposit(mm, pmd, pgtable);
790 set_pmd_at(mm, haddr, pmd, entry); 789 set_pmd_at(mm, haddr, pmd, entry);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd722769927..30cd96879152 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2638,8 +2638,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2638 2638
2639 tlb_start_vma(tlb, vma); 2639 tlb_start_vma(tlb, vma);
2640 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2640 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2641 address = start;
2641again: 2642again:
2642 for (address = start; address < end; address += sz) { 2643 for (; address < end; address += sz) {
2643 ptep = huge_pte_offset(mm, address); 2644 ptep = huge_pte_offset(mm, address);
2644 if (!ptep) 2645 if (!ptep)
2645 continue; 2646 continue;
@@ -2686,6 +2687,7 @@ again:
2686 page_remove_rmap(page); 2687 page_remove_rmap(page);
2687 force_flush = !__tlb_remove_page(tlb, page); 2688 force_flush = !__tlb_remove_page(tlb, page);
2688 if (force_flush) { 2689 if (force_flush) {
2690 address += sz;
2689 spin_unlock(ptl); 2691 spin_unlock(ptl);
2690 break; 2692 break;
2691 } 2693 }
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a67c26e0f360..037e1c00a5b7 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/page_counter.h>
17#include <linux/slab.h> 18#include <linux/slab.h>
18#include <linux/hugetlb.h> 19#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h> 20#include <linux/hugetlb_cgroup.h>
@@ -23,7 +24,7 @@ struct hugetlb_cgroup {
23 /* 24 /*
24 * the counter to account for hugepages from hugetlb. 25 * the counter to account for hugepages from hugetlb.
25 */ 26 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE]; 27 struct page_counter hugepage[HUGE_MAX_HSTATE];
27}; 28};
28 29
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 30#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
@@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
60 int idx; 61 int idx;
61 62
62 for (idx = 0; idx < hugetlb_max_hstate; idx++) { 63 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
63 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) 64 if (page_counter_read(&h_cg->hugepage[idx]))
64 return true; 65 return true;
65 } 66 }
66 return false; 67 return false;
@@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
79 80
80 if (parent_h_cgroup) { 81 if (parent_h_cgroup) {
81 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) 82 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
82 res_counter_init(&h_cgroup->hugepage[idx], 83 page_counter_init(&h_cgroup->hugepage[idx],
83 &parent_h_cgroup->hugepage[idx]); 84 &parent_h_cgroup->hugepage[idx]);
84 } else { 85 } else {
85 root_h_cgroup = h_cgroup; 86 root_h_cgroup = h_cgroup;
86 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) 87 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
87 res_counter_init(&h_cgroup->hugepage[idx], NULL); 88 page_counter_init(&h_cgroup->hugepage[idx], NULL);
88 } 89 }
89 return &h_cgroup->css; 90 return &h_cgroup->css;
90} 91}
@@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
108static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, 109static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
109 struct page *page) 110 struct page *page)
110{ 111{
111 int csize; 112 unsigned int nr_pages;
112 struct res_counter *counter; 113 struct page_counter *counter;
113 struct res_counter *fail_res;
114 struct hugetlb_cgroup *page_hcg; 114 struct hugetlb_cgroup *page_hcg;
115 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); 115 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
116 116
@@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
123 if (!page_hcg || page_hcg != h_cg) 123 if (!page_hcg || page_hcg != h_cg)
124 goto out; 124 goto out;
125 125
126 csize = PAGE_SIZE << compound_order(page); 126 nr_pages = 1 << compound_order(page);
127 if (!parent) { 127 if (!parent) {
128 parent = root_h_cgroup; 128 parent = root_h_cgroup;
129 /* root has no limit */ 129 /* root has no limit */
130 res_counter_charge_nofail(&parent->hugepage[idx], 130 page_counter_charge(&parent->hugepage[idx], nr_pages);
131 csize, &fail_res);
132 } 131 }
133 counter = &h_cg->hugepage[idx]; 132 counter = &h_cg->hugepage[idx];
134 res_counter_uncharge_until(counter, counter->parent, csize); 133 /* Take the pages off the local counter */
134 page_counter_cancel(counter, nr_pages);
135 135
136 set_hugetlb_cgroup(page, parent); 136 set_hugetlb_cgroup(page, parent);
137out: 137out:
@@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
166 struct hugetlb_cgroup **ptr) 166 struct hugetlb_cgroup **ptr)
167{ 167{
168 int ret = 0; 168 int ret = 0;
169 struct res_counter *fail_res; 169 struct page_counter *counter;
170 struct hugetlb_cgroup *h_cg = NULL; 170 struct hugetlb_cgroup *h_cg = NULL;
171 unsigned long csize = nr_pages * PAGE_SIZE;
172 171
173 if (hugetlb_cgroup_disabled()) 172 if (hugetlb_cgroup_disabled())
174 goto done; 173 goto done;
@@ -187,7 +186,7 @@ again:
187 } 186 }
188 rcu_read_unlock(); 187 rcu_read_unlock();
189 188
190 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); 189 ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter);
191 css_put(&h_cg->css); 190 css_put(&h_cg->css);
192done: 191done:
193 *ptr = h_cg; 192 *ptr = h_cg;
@@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
213 struct page *page) 212 struct page *page)
214{ 213{
215 struct hugetlb_cgroup *h_cg; 214 struct hugetlb_cgroup *h_cg;
216 unsigned long csize = nr_pages * PAGE_SIZE;
217 215
218 if (hugetlb_cgroup_disabled()) 216 if (hugetlb_cgroup_disabled())
219 return; 217 return;
@@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
222 if (unlikely(!h_cg)) 220 if (unlikely(!h_cg))
223 return; 221 return;
224 set_hugetlb_cgroup(page, NULL); 222 set_hugetlb_cgroup(page, NULL);
225 res_counter_uncharge(&h_cg->hugepage[idx], csize); 223 page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
226 return; 224 return;
227} 225}
228 226
229void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 227void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
230 struct hugetlb_cgroup *h_cg) 228 struct hugetlb_cgroup *h_cg)
231{ 229{
232 unsigned long csize = nr_pages * PAGE_SIZE;
233
234 if (hugetlb_cgroup_disabled() || !h_cg) 230 if (hugetlb_cgroup_disabled() || !h_cg)
235 return; 231 return;
236 232
237 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) 233 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
238 return; 234 return;
239 235
240 res_counter_uncharge(&h_cg->hugepage[idx], csize); 236 page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
241 return; 237 return;
242} 238}
243 239
240enum {
241 RES_USAGE,
242 RES_LIMIT,
243 RES_MAX_USAGE,
244 RES_FAILCNT,
245};
246
244static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 247static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
245 struct cftype *cft) 248 struct cftype *cft)
246{ 249{
247 int idx, name; 250 struct page_counter *counter;
248 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 251 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
249 252
250 idx = MEMFILE_IDX(cft->private); 253 counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
251 name = MEMFILE_ATTR(cft->private);
252 254
253 return res_counter_read_u64(&h_cg->hugepage[idx], name); 255 switch (MEMFILE_ATTR(cft->private)) {
256 case RES_USAGE:
257 return (u64)page_counter_read(counter) * PAGE_SIZE;
258 case RES_LIMIT:
259 return (u64)counter->limit * PAGE_SIZE;
260 case RES_MAX_USAGE:
261 return (u64)counter->watermark * PAGE_SIZE;
262 case RES_FAILCNT:
263 return counter->failcnt;
264 default:
265 BUG();
266 }
254} 267}
255 268
269static DEFINE_MUTEX(hugetlb_limit_mutex);
270
256static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, 271static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
257 char *buf, size_t nbytes, loff_t off) 272 char *buf, size_t nbytes, loff_t off)
258{ 273{
259 int idx, name, ret; 274 int ret, idx;
260 unsigned long long val; 275 unsigned long nr_pages;
261 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 276 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
262 277
278 if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
279 return -EINVAL;
280
263 buf = strstrip(buf); 281 buf = strstrip(buf);
282 ret = page_counter_memparse(buf, &nr_pages);
283 if (ret)
284 return ret;
285
264 idx = MEMFILE_IDX(of_cft(of)->private); 286 idx = MEMFILE_IDX(of_cft(of)->private);
265 name = MEMFILE_ATTR(of_cft(of)->private);
266 287
267 switch (name) { 288 switch (MEMFILE_ATTR(of_cft(of)->private)) {
268 case RES_LIMIT: 289 case RES_LIMIT:
269 if (hugetlb_cgroup_is_root(h_cg)) { 290 mutex_lock(&hugetlb_limit_mutex);
270 /* Can't set limit on root */ 291 ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
271 ret = -EINVAL; 292 mutex_unlock(&hugetlb_limit_mutex);
272 break;
273 }
274 /* This function does all necessary parse...reuse it */
275 ret = res_counter_memparse_write_strategy(buf, &val);
276 if (ret)
277 break;
278 val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx]));
279 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
280 break; 293 break;
281 default: 294 default:
282 ret = -EINVAL; 295 ret = -EINVAL;
@@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
288static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, 301static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
289 char *buf, size_t nbytes, loff_t off) 302 char *buf, size_t nbytes, loff_t off)
290{ 303{
291 int idx, name, ret = 0; 304 int ret = 0;
305 struct page_counter *counter;
292 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 306 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
293 307
294 idx = MEMFILE_IDX(of_cft(of)->private); 308 counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
295 name = MEMFILE_ATTR(of_cft(of)->private);
296 309
297 switch (name) { 310 switch (MEMFILE_ATTR(of_cft(of)->private)) {
298 case RES_MAX_USAGE: 311 case RES_MAX_USAGE:
299 res_counter_reset_max(&h_cg->hugepage[idx]); 312 page_counter_reset_watermark(counter);
300 break; 313 break;
301 case RES_FAILCNT: 314 case RES_FAILCNT:
302 res_counter_reset_failcnt(&h_cg->hugepage[idx]); 315 counter->failcnt = 0;
303 break; 316 break;
304 default: 317 default:
305 ret = -EINVAL; 318 ret = -EINVAL;
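The hugetlb controller is converted above from the byte-based res_counter API to the page-based page_counter API: charge and uncharge paths pass nr_pages directly instead of a csize in bytes, the read side multiplies the page count back up by PAGE_SIZE for the u64 control files, and limit writes go through page_counter_memparse() and page_counter_limit() under the new hugetlb_limit_mutex. Below is a minimal, single-threaded userspace model of the usage/limit/watermark/failcnt bookkeeping those calls rely on; the names only mimic the kernel API, and the hard-coded 4096-byte page size is an assumption made for the example.

#include <stdio.h>

/*
 * Simplified model of a page counter: all quantities are in pages, and
 * conversion to bytes happens only when values are reported, as in
 * hugetlb_cgroup_read_u64() above. Not the kernel implementation:
 * no hierarchy, no atomics.
 */
struct page_counter_model {
        unsigned long count;            /* pages currently charged */
        unsigned long limit;            /* hard limit, in pages */
        unsigned long watermark;        /* highest count ever observed */
        unsigned long failcnt;          /* number of rejected charges */
};

static int try_charge(struct page_counter_model *c, unsigned long nr_pages)
{
        if (c->count + nr_pages > c->limit) {
                c->failcnt++;
                return -1;              /* the kernel returns -ENOMEM here */
        }
        c->count += nr_pages;
        if (c->count > c->watermark)
                c->watermark = c->count;
        return 0;
}

static void uncharge(struct page_counter_model *c, unsigned long nr_pages)
{
        c->count -= nr_pages;
}

int main(void)
{
        struct page_counter_model c = { .limit = 512 }; /* 512 x 4K = 2M limit */

        try_charge(&c, 512);            /* succeeds, watermark becomes 512 */
        uncharge(&c, 512);
        try_charge(&c, 1024);           /* rejected, failcnt becomes 1 */

        printf("usage=%lu limit=%lu max=%lu failcnt=%lu\n",
               c.count * 4096, c.limit * 4096,
               c.watermark * 4096, c.failcnt);
        return 0;
}

Keeping the counters in pages is what lets the hugetlb and memcg paths above drop the PAGE_SIZE scaling from their hot paths and confine it to the few places that present bytes to userspace.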
diff --git a/mm/internal.h b/mm/internal.h
index a4f90ba7068e..efad241f7014 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -161,13 +161,10 @@ struct compact_control {
161 unsigned long migrate_pfn; /* isolate_migratepages search base */ 161 unsigned long migrate_pfn; /* isolate_migratepages search base */
162 enum migrate_mode mode; /* Async or sync migration mode */ 162 enum migrate_mode mode; /* Async or sync migration mode */
163 bool ignore_skip_hint; /* Scan blocks even if marked skip */ 163 bool ignore_skip_hint; /* Scan blocks even if marked skip */
164 bool finished_update_free; /* True when the zone cached pfns are
165 * no longer being updated
166 */
167 bool finished_update_migrate;
168
169 int order; /* order a direct compactor needs */ 164 int order; /* order a direct compactor needs */
170 const gfp_t gfp_mask; /* gfp mask of a direct compactor */ 165 const gfp_t gfp_mask; /* gfp mask of a direct compactor */
166 const int alloc_flags; /* alloc flags of a direct compactor */
167 const int classzone_idx; /* zone index of a direct compactor */
171 struct zone *zone; 168 struct zone *zone;
172 int contended; /* Signal need_sched() or lock 169 int contended; /* Signal need_sched() or lock
173 * contention detected during 170 * contention detected during
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ee48428cf8e3..85df503ec023 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
25 * GNU General Public License for more details. 25 * GNU General Public License for more details.
26 */ 26 */
27 27
28#include <linux/res_counter.h> 28#include <linux/page_counter.h>
29#include <linux/memcontrol.h> 29#include <linux/memcontrol.h>
30#include <linux/cgroup.h> 30#include <linux/cgroup.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
@@ -51,7 +51,7 @@
51#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/vmpressure.h> 52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
54#include <linux/page_cgroup.h> 54#include <linux/swap_cgroup.h>
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/oom.h> 56#include <linux/oom.h>
57#include <linux/lockdep.h> 57#include <linux/lockdep.h>
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu {
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 143 unsigned long targets[MEM_CGROUP_NTARGETS];
144}; 144};
145 145
146struct mem_cgroup_reclaim_iter { 146struct reclaim_iter {
147 /* 147 struct mem_cgroup *position;
148 * last scanned hierarchy member. Valid only if last_dead_count
149 * matches memcg->dead_count of the hierarchy root group.
150 */
151 struct mem_cgroup *last_visited;
152 int last_dead_count;
153
154 /* scan generation, increased every round-trip */ 148 /* scan generation, increased every round-trip */
155 unsigned int generation; 149 unsigned int generation;
156}; 150};
@@ -162,10 +156,10 @@ struct mem_cgroup_per_zone {
162 struct lruvec lruvec; 156 struct lruvec lruvec;
163 unsigned long lru_size[NR_LRU_LISTS]; 157 unsigned long lru_size[NR_LRU_LISTS];
164 158
165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 159 struct reclaim_iter iter[DEF_PRIORITY + 1];
166 160
167 struct rb_node tree_node; /* RB tree node */ 161 struct rb_node tree_node; /* RB tree node */
168 unsigned long long usage_in_excess;/* Set to the value by which */ 162 unsigned long usage_in_excess;/* Set to the value by which */
169 /* the soft limit is exceeded*/ 163 /* the soft limit is exceeded*/
170 bool on_tree; 164 bool on_tree;
171 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 165 struct mem_cgroup *memcg; /* Back pointer, we cannot */
@@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198 192
199struct mem_cgroup_threshold { 193struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd; 194 struct eventfd_ctx *eventfd;
201 u64 threshold; 195 unsigned long threshold;
202}; 196};
203 197
204/* For threshold */ 198/* For threshold */
@@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
284 */ 278 */
285struct mem_cgroup { 279struct mem_cgroup {
286 struct cgroup_subsys_state css; 280 struct cgroup_subsys_state css;
287 /* 281
288 * the counter to account for memory usage 282 /* Accounted resources */
289 */ 283 struct page_counter memory;
290 struct res_counter res; 284 struct page_counter memsw;
285 struct page_counter kmem;
286
287 unsigned long soft_limit;
291 288
292 /* vmpressure notifications */ 289 /* vmpressure notifications */
293 struct vmpressure vmpressure; 290 struct vmpressure vmpressure;
@@ -296,15 +293,6 @@ struct mem_cgroup {
296 int initialized; 293 int initialized;
297 294
298 /* 295 /*
299 * the counter to account for mem+swap usage.
300 */
301 struct res_counter memsw;
302
303 /*
304 * the counter to account for kernel memory usage.
305 */
306 struct res_counter kmem;
307 /*
308 * Should the accounting and control be hierarchical, per subtree? 296 * Should the accounting and control be hierarchical, per subtree?
309 */ 297 */
310 bool use_hierarchy; 298 bool use_hierarchy;
@@ -352,7 +340,6 @@ struct mem_cgroup {
352 struct mem_cgroup_stat_cpu nocpu_base; 340 struct mem_cgroup_stat_cpu nocpu_base;
353 spinlock_t pcp_counter_lock; 341 spinlock_t pcp_counter_lock;
354 342
355 atomic_t dead_count;
356#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 343#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357 struct cg_proto tcp_mem; 344 struct cg_proto tcp_mem;
358#endif 345#endif
@@ -382,7 +369,6 @@ struct mem_cgroup {
382/* internal only representation about the status of kmem accounting. */ 369/* internal only representation about the status of kmem accounting. */
383enum { 370enum {
384 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ 371 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
385 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
386}; 372};
387 373
388#ifdef CONFIG_MEMCG_KMEM 374#ifdef CONFIG_MEMCG_KMEM
@@ -396,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
396 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 382 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
397} 383}
398 384
399static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
400{
401 /*
402 * Our caller must use css_get() first, because memcg_uncharge_kmem()
403 * will call css_put() if it sees the memcg is dead.
404 */
405 smp_wmb();
406 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
407 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
408}
409
410static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
411{
412 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
413 &memcg->kmem_account_flags);
414}
415#endif 385#endif
416 386
417/* Stuffs for move charges at task migration. */ 387/* Stuffs for move charges at task migration. */
@@ -650,7 +620,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
650 * This check can't live in kmem destruction function, 620 * This check can't live in kmem destruction function,
651 * since the charges will outlive the cgroup 621 * since the charges will outlive the cgroup
652 */ 622 */
653 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 623 WARN_ON(page_counter_read(&memcg->kmem));
654} 624}
655#else 625#else
656static void disarm_kmem_keys(struct mem_cgroup *memcg) 626static void disarm_kmem_keys(struct mem_cgroup *memcg)
@@ -664,8 +634,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
664 disarm_kmem_keys(memcg); 634 disarm_kmem_keys(memcg);
665} 635}
666 636
667static void drain_all_stock_async(struct mem_cgroup *memcg);
668
669static struct mem_cgroup_per_zone * 637static struct mem_cgroup_per_zone *
670mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 638mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
671{ 639{
@@ -706,7 +674,7 @@ soft_limit_tree_from_page(struct page *page)
706 674
707static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 675static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
708 struct mem_cgroup_tree_per_zone *mctz, 676 struct mem_cgroup_tree_per_zone *mctz,
709 unsigned long long new_usage_in_excess) 677 unsigned long new_usage_in_excess)
710{ 678{
711 struct rb_node **p = &mctz->rb_root.rb_node; 679 struct rb_node **p = &mctz->rb_root.rb_node;
712 struct rb_node *parent = NULL; 680 struct rb_node *parent = NULL;
@@ -755,10 +723,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
755 spin_unlock_irqrestore(&mctz->lock, flags); 723 spin_unlock_irqrestore(&mctz->lock, flags);
756} 724}
757 725
726static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
727{
728 unsigned long nr_pages = page_counter_read(&memcg->memory);
729 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
730 unsigned long excess = 0;
731
732 if (nr_pages > soft_limit)
733 excess = nr_pages - soft_limit;
734
735 return excess;
736}
758 737
759static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 738static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
760{ 739{
761 unsigned long long excess; 740 unsigned long excess;
762 struct mem_cgroup_per_zone *mz; 741 struct mem_cgroup_per_zone *mz;
763 struct mem_cgroup_tree_per_zone *mctz; 742 struct mem_cgroup_tree_per_zone *mctz;
764 743
@@ -769,7 +748,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
769 */ 748 */
770 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 749 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
771 mz = mem_cgroup_page_zoneinfo(memcg, page); 750 mz = mem_cgroup_page_zoneinfo(memcg, page);
772 excess = res_counter_soft_limit_excess(&memcg->res); 751 excess = soft_limit_excess(memcg);
773 /* 752 /*
774 * We have to update the tree if mz is on RB-tree or 753 * We have to update the tree if mz is on RB-tree or
775 * mem is over its softlimit. 754 * mem is over its softlimit.
@@ -825,7 +804,7 @@ retry:
825 * position in the tree. 804 * position in the tree.
826 */ 805 */
827 __mem_cgroup_remove_exceeded(mz, mctz); 806 __mem_cgroup_remove_exceeded(mz, mctz);
828 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 807 if (!soft_limit_excess(mz->memcg) ||
829 !css_tryget_online(&mz->memcg->css)) 808 !css_tryget_online(&mz->memcg->css))
830 goto retry; 809 goto retry;
831done: 810done:
@@ -1062,122 +1041,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1062 return memcg; 1041 return memcg;
1063} 1042}
1064 1043
1065/*
1066 * Returns a next (in a pre-order walk) alive memcg (with elevated css
1067 * ref. count) or NULL if the whole root's subtree has been visited.
1068 *
1069 * helper function to be used by mem_cgroup_iter
1070 */
1071static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1072 struct mem_cgroup *last_visited)
1073{
1074 struct cgroup_subsys_state *prev_css, *next_css;
1075
1076 prev_css = last_visited ? &last_visited->css : NULL;
1077skip_node:
1078 next_css = css_next_descendant_pre(prev_css, &root->css);
1079
1080 /*
1081 * Even if we found a group we have to make sure it is
1082 * alive. css && !memcg means that the groups should be
1083 * skipped and we should continue the tree walk.
1084 * last_visited css is safe to use because it is
1085 * protected by css_get and the tree walk is rcu safe.
1086 *
1087 * We do not take a reference on the root of the tree walk
1088 * because we might race with the root removal when it would
1089 * be the only node in the iterated hierarchy and mem_cgroup_iter
1090 * would end up in an endless loop because it expects that at
1091 * least one valid node will be returned. Root cannot disappear
1092 * because caller of the iterator should hold it already so
1093 * skipping css reference should be safe.
1094 */
1095 if (next_css) {
1096 struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
1097
1098 if (next_css == &root->css)
1099 return memcg;
1100
1101 if (css_tryget_online(next_css)) {
1102 /*
1103 * Make sure the memcg is initialized:
1104 * mem_cgroup_css_online() orders the the
1105 * initialization against setting the flag.
1106 */
1107 if (smp_load_acquire(&memcg->initialized))
1108 return memcg;
1109 css_put(next_css);
1110 }
1111
1112 prev_css = next_css;
1113 goto skip_node;
1114 }
1115
1116 return NULL;
1117}
1118
1119static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1120{
1121 /*
1122 * When a group in the hierarchy below root is destroyed, the
1123 * hierarchy iterator can no longer be trusted since it might
1124 * have pointed to the destroyed group. Invalidate it.
1125 */
1126 atomic_inc(&root->dead_count);
1127}
1128
1129static struct mem_cgroup *
1130mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1131 struct mem_cgroup *root,
1132 int *sequence)
1133{
1134 struct mem_cgroup *position = NULL;
1135 /*
1136 * A cgroup destruction happens in two stages: offlining and
1137 * release. They are separated by a RCU grace period.
1138 *
1139 * If the iterator is valid, we may still race with an
1140 * offlining. The RCU lock ensures the object won't be
1141 * released, tryget will fail if we lost the race.
1142 */
1143 *sequence = atomic_read(&root->dead_count);
1144 if (iter->last_dead_count == *sequence) {
1145 smp_rmb();
1146 position = iter->last_visited;
1147
1148 /*
1149 * We cannot take a reference to root because we might race
1150 * with root removal and returning NULL would end up in
1151 * an endless loop on the iterator user level when root
1152 * would be returned all the time.
1153 */
1154 if (position && position != root &&
1155 !css_tryget_online(&position->css))
1156 position = NULL;
1157 }
1158 return position;
1159}
1160
1161static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1162 struct mem_cgroup *last_visited,
1163 struct mem_cgroup *new_position,
1164 struct mem_cgroup *root,
1165 int sequence)
1166{
1167 /* root reference counting symmetric to mem_cgroup_iter_load */
1168 if (last_visited && last_visited != root)
1169 css_put(&last_visited->css);
1170 /*
1171 * We store the sequence count from the time @last_visited was
1172 * loaded successfully instead of rereading it here so that we
1173 * don't lose destruction events in between. We could have
1174 * raced with the destruction of @new_position after all.
1175 */
1176 iter->last_visited = new_position;
1177 smp_wmb();
1178 iter->last_dead_count = sequence;
1179}
1180
1181/** 1044/**
1182 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1045 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1183 * @root: hierarchy root 1046 * @root: hierarchy root
@@ -1199,8 +1062,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1199 struct mem_cgroup *prev, 1062 struct mem_cgroup *prev,
1200 struct mem_cgroup_reclaim_cookie *reclaim) 1063 struct mem_cgroup_reclaim_cookie *reclaim)
1201{ 1064{
1065 struct reclaim_iter *uninitialized_var(iter);
1066 struct cgroup_subsys_state *css = NULL;
1202 struct mem_cgroup *memcg = NULL; 1067 struct mem_cgroup *memcg = NULL;
1203 struct mem_cgroup *last_visited = NULL; 1068 struct mem_cgroup *pos = NULL;
1204 1069
1205 if (mem_cgroup_disabled()) 1070 if (mem_cgroup_disabled())
1206 return NULL; 1071 return NULL;
@@ -1209,50 +1074,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1209 root = root_mem_cgroup; 1074 root = root_mem_cgroup;
1210 1075
1211 if (prev && !reclaim) 1076 if (prev && !reclaim)
1212 last_visited = prev; 1077 pos = prev;
1213 1078
1214 if (!root->use_hierarchy && root != root_mem_cgroup) { 1079 if (!root->use_hierarchy && root != root_mem_cgroup) {
1215 if (prev) 1080 if (prev)
1216 goto out_css_put; 1081 goto out;
1217 return root; 1082 return root;
1218 } 1083 }
1219 1084
1220 rcu_read_lock(); 1085 rcu_read_lock();
1221 while (!memcg) {
1222 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1223 int uninitialized_var(seq);
1224
1225 if (reclaim) {
1226 struct mem_cgroup_per_zone *mz;
1227
1228 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1229 iter = &mz->reclaim_iter[reclaim->priority];
1230 if (prev && reclaim->generation != iter->generation) {
1231 iter->last_visited = NULL;
1232 goto out_unlock;
1233 }
1234 1086
1235 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1087 if (reclaim) {
1088 struct mem_cgroup_per_zone *mz;
1089
1090 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1091 iter = &mz->iter[reclaim->priority];
1092
1093 if (prev && reclaim->generation != iter->generation)
1094 goto out_unlock;
1095
1096 do {
1097 pos = ACCESS_ONCE(iter->position);
1098 /*
1099 * A racing update may change the position and
1100 * put the last reference, hence css_tryget(),
1101 * or retry to see the updated position.
1102 */
1103 } while (pos && !css_tryget(&pos->css));
1104 }
1105
1106 if (pos)
1107 css = &pos->css;
1108
1109 for (;;) {
1110 css = css_next_descendant_pre(css, &root->css);
1111 if (!css) {
1112 /*
1113 * Reclaimers share the hierarchy walk, and a
1114 * new one might jump in right at the end of
1115 * the hierarchy - make sure they see at least
1116 * one group and restart from the beginning.
1117 */
1118 if (!prev)
1119 continue;
1120 break;
1236 } 1121 }
1237 1122
1238 memcg = __mem_cgroup_iter_next(root, last_visited); 1123 /*
1124 * Verify the css and acquire a reference. The root
1125 * is provided by the caller, so we know it's alive
1126 * and kicking, and don't take an extra reference.
1127 */
1128 memcg = mem_cgroup_from_css(css);
1129
1130 if (css == &root->css)
1131 break;
1239 1132
1240 if (reclaim) { 1133 if (css_tryget(css)) {
1241 mem_cgroup_iter_update(iter, last_visited, memcg, root, 1134 /*
1242 seq); 1135 * Make sure the memcg is initialized:
1136 * mem_cgroup_css_online() orders the the
1137 * initialization against setting the flag.
1138 */
1139 if (smp_load_acquire(&memcg->initialized))
1140 break;
1243 1141
1244 if (!memcg) 1142 css_put(css);
1245 iter->generation++;
1246 else if (!prev && memcg)
1247 reclaim->generation = iter->generation;
1248 } 1143 }
1249 1144
1250 if (prev && !memcg) 1145 memcg = NULL;
1251 goto out_unlock; 1146 }
1147
1148 if (reclaim) {
1149 if (cmpxchg(&iter->position, pos, memcg) == pos) {
1150 if (memcg)
1151 css_get(&memcg->css);
1152 if (pos)
1153 css_put(&pos->css);
1154 }
1155
1156 /*
1157 * pairs with css_tryget when dereferencing iter->position
1158 * above.
1159 */
1160 if (pos)
1161 css_put(&pos->css);
1162
1163 if (!memcg)
1164 iter->generation++;
1165 else if (!prev)
1166 reclaim->generation = iter->generation;
1252 } 1167 }
1168
1253out_unlock: 1169out_unlock:
1254 rcu_read_unlock(); 1170 rcu_read_unlock();
1255out_css_put: 1171out:
1256 if (prev && prev != root) 1172 if (prev && prev != root)
1257 css_put(&prev->css); 1173 css_put(&prev->css);
1258 1174
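The rewritten mem_cgroup_iter() above replaces the dead_count/last_visited validation scheme with a simpler shared-cursor protocol: a reclaimer loads the per-zone position with ACCESS_ONCE(), pins it with css_tryget() (retrying if a racing update dropped the last reference), walks the hierarchy with css_next_descendant_pre(), and publishes the next position with a single cmpxchg() so that exactly one of several racing reclaimers transfers the cursor's reference from the old position to the new one. The sketch below models just that publish step in userspace with C11 atomics; struct node and its helpers are invented stand-ins for css reference counting, not kernel API.

#include <stdatomic.h>
#include <stdbool.h>

struct node {
        _Atomic long refcnt;            /* stands in for the css refcount */
};

static void node_get(struct node *n)
{
        atomic_fetch_add(&n->refcnt, 1);
}

static void node_put(struct node *n)
{
        atomic_fetch_sub(&n->refcnt, 1);
}

/* like css_tryget(): only succeeds while at least one reference remains */
static bool node_tryget(struct node *n)
{
        long old = atomic_load(&n->refcnt);

        while (old > 0)
                if (atomic_compare_exchange_weak(&n->refcnt, &old, old + 1))
                        return true;
        return false;
}

/* shared iterator cursor; the kernel keeps one per (zone, priority) */
static struct node *_Atomic cursor;

/* publish @next as the new position, as the reclaim branch above does */
static void publish(struct node *prev, struct node *next)
{
        struct node *expected = prev;

        if (atomic_compare_exchange_strong(&cursor, &expected, next)) {
                /* we won the race: the cursor's reference moves prev -> next */
                if (next)
                        node_get(next);
                if (prev)
                        node_put(prev);
        }
        /* drop the pin taken with node_tryget() when @prev was loaded */
        if (prev)
                node_put(prev);
}

int main(void)
{
        struct node a = { 1 }, b = { 1 };       /* one base reference each */

        atomic_store(&cursor, &a);
        node_get(&a);                           /* the cursor's reference on a */

        if (node_tryget(&a))                    /* a reclaimer pins the cursor, */
                publish(&a, &b);                /* walks on to b, publishes it  */

        return 0;       /* a ends with 1 ref, b with 2 (base + cursor) */
}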
@@ -1346,15 +1262,18 @@ out:
1346} 1262}
1347 1263
1348/** 1264/**
1349 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1265 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1350 * @page: the page 1266 * @page: the page
1351 * @zone: zone of the page 1267 * @zone: zone of the page
1268 *
1269 * This function is only safe when following the LRU page isolation
1270 * and putback protocol: the LRU lock must be held, and the page must
1271 * either be PageLRU() or the caller must have isolated/allocated it.
1352 */ 1272 */
1353struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1273struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1354{ 1274{
1355 struct mem_cgroup_per_zone *mz; 1275 struct mem_cgroup_per_zone *mz;
1356 struct mem_cgroup *memcg; 1276 struct mem_cgroup *memcg;
1357 struct page_cgroup *pc;
1358 struct lruvec *lruvec; 1277 struct lruvec *lruvec;
1359 1278
1360 if (mem_cgroup_disabled()) { 1279 if (mem_cgroup_disabled()) {
@@ -1362,20 +1281,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1362 goto out; 1281 goto out;
1363 } 1282 }
1364 1283
1365 pc = lookup_page_cgroup(page); 1284 memcg = page->mem_cgroup;
1366 memcg = pc->mem_cgroup;
1367
1368 /* 1285 /*
1369 * Surreptitiously switch any uncharged offlist page to root: 1286 * Swapcache readahead pages are added to the LRU - and
1370 * an uncharged page off lru does nothing to secure 1287 * possibly migrated - before they are charged.
1371 * its former mem_cgroup from sudden removal.
1372 *
1373 * Our caller holds lru_lock, and PageCgroupUsed is updated
1374 * under page_cgroup lock: between them, they make all uses
1375 * of pc->mem_cgroup safe.
1376 */ 1288 */
1377 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1289 if (!memcg)
1378 pc->mem_cgroup = memcg = root_mem_cgroup; 1290 memcg = root_mem_cgroup;
1379 1291
1380 mz = mem_cgroup_page_zoneinfo(memcg, page); 1292 mz = mem_cgroup_page_zoneinfo(memcg, page);
1381 lruvec = &mz->lruvec; 1293 lruvec = &mz->lruvec;
@@ -1414,41 +1326,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1414 VM_BUG_ON((long)(*lru_size) < 0); 1326 VM_BUG_ON((long)(*lru_size) < 0);
1415} 1327}
1416 1328
1417/* 1329bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
1418 * Checks whether given mem is same or in the root_mem_cgroup's
1419 * hierarchy subtree
1420 */
1421bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1422 struct mem_cgroup *memcg)
1423{ 1330{
1424 if (root_memcg == memcg) 1331 if (root == memcg)
1425 return true; 1332 return true;
1426 if (!root_memcg->use_hierarchy || !memcg) 1333 if (!root->use_hierarchy)
1427 return false; 1334 return false;
1428 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1335 return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
1429}
1430
1431static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1432 struct mem_cgroup *memcg)
1433{
1434 bool ret;
1435
1436 rcu_read_lock();
1437 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1438 rcu_read_unlock();
1439 return ret;
1440} 1336}
1441 1337
1442bool task_in_mem_cgroup(struct task_struct *task, 1338bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1443 const struct mem_cgroup *memcg)
1444{ 1339{
1445 struct mem_cgroup *curr = NULL; 1340 struct mem_cgroup *task_memcg;
1446 struct task_struct *p; 1341 struct task_struct *p;
1447 bool ret; 1342 bool ret;
1448 1343
1449 p = find_lock_task_mm(task); 1344 p = find_lock_task_mm(task);
1450 if (p) { 1345 if (p) {
1451 curr = get_mem_cgroup_from_mm(p->mm); 1346 task_memcg = get_mem_cgroup_from_mm(p->mm);
1452 task_unlock(p); 1347 task_unlock(p);
1453 } else { 1348 } else {
1454 /* 1349 /*
@@ -1457,19 +1352,12 @@ bool task_in_mem_cgroup(struct task_struct *task,
1457 * killed to prevent needlessly killing additional tasks. 1352 * killed to prevent needlessly killing additional tasks.
1458 */ 1353 */
1459 rcu_read_lock(); 1354 rcu_read_lock();
1460 curr = mem_cgroup_from_task(task); 1355 task_memcg = mem_cgroup_from_task(task);
1461 if (curr) 1356 css_get(&task_memcg->css);
1462 css_get(&curr->css);
1463 rcu_read_unlock(); 1357 rcu_read_unlock();
1464 } 1358 }
1465 /* 1359 ret = mem_cgroup_is_descendant(task_memcg, memcg);
1466 * We should check use_hierarchy of "memcg" not "curr". Because checking 1360 css_put(&task_memcg->css);
1467 * use_hierarchy of "curr" here make this function true if hierarchy is
1468 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1469 * hierarchy(even if use_hierarchy is disabled in "memcg").
1470 */
1471 ret = mem_cgroup_same_or_subtree(memcg, curr);
1472 css_put(&curr->css);
1473 return ret; 1361 return ret;
1474} 1362}
1475 1363
@@ -1492,7 +1380,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1492 return inactive * inactive_ratio < active; 1380 return inactive * inactive_ratio < active;
1493} 1381}
1494 1382
1495#define mem_cgroup_from_res_counter(counter, member) \ 1383#define mem_cgroup_from_counter(counter, member) \
1496 container_of(counter, struct mem_cgroup, member) 1384 container_of(counter, struct mem_cgroup, member)
1497 1385
1498/** 1386/**
@@ -1504,12 +1392,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1504 */ 1392 */
1505static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1393static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1506{ 1394{
1507 unsigned long long margin; 1395 unsigned long margin = 0;
1396 unsigned long count;
1397 unsigned long limit;
1508 1398
1509 margin = res_counter_margin(&memcg->res); 1399 count = page_counter_read(&memcg->memory);
1510 if (do_swap_account) 1400 limit = ACCESS_ONCE(memcg->memory.limit);
1511 margin = min(margin, res_counter_margin(&memcg->memsw)); 1401 if (count < limit)
1512 return margin >> PAGE_SHIFT; 1402 margin = limit - count;
1403
1404 if (do_swap_account) {
1405 count = page_counter_read(&memcg->memsw);
1406 limit = ACCESS_ONCE(memcg->memsw.limit);
1407 if (count <= limit)
1408 margin = min(margin, limit - count);
1409 }
1410
1411 return margin;
1513} 1412}
1514 1413
1515int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1414int mem_cgroup_swappiness(struct mem_cgroup *memcg)
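mem_cgroup_margin() above now works purely in pages: it computes the headroom below the limit for the memory counter and, when swap accounting is enabled, clamps it by the memsw headroom, rather than shifting a byte-based res_counter_margin() result by PAGE_SHIFT. The same arithmetic in a self-contained form; plain parameters stand in for the ACCESS_ONCE() reads of the counter fields.

#include <stdio.h>

static unsigned long headroom(unsigned long count, unsigned long limit)
{
        /* unsigned arithmetic: never report a negative margin */
        return count < limit ? limit - count : 0;
}

int main(void)
{
        unsigned long mem = headroom(300, 1024);        /* memory: 724 pages left */
        unsigned long memsw = headroom(900, 1024);      /* mem+swap: 124 pages left */
        unsigned long margin = mem < memsw ? mem : memsw;

        /* further charges are bounded by the tighter of the two counters */
        printf("margin = %lu pages\n", margin);         /* prints 124 */
        return 0;
}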
@@ -1522,37 +1421,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1522} 1421}
1523 1422
1524/* 1423/*
1525 * memcg->moving_account is used for checking possibility that some thread is
1526 * calling move_account(). When a thread on CPU-A starts moving pages under
1527 * a memcg, other threads should check memcg->moving_account under
1528 * rcu_read_lock(), like this:
1529 *
1530 * CPU-A CPU-B
1531 * rcu_read_lock()
1532 * memcg->moving_account+1 if (memcg->mocing_account)
1533 * take heavy locks.
1534 * synchronize_rcu() update something.
1535 * rcu_read_unlock()
1536 * start move here.
1537 */
1538
1539static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1540{
1541 atomic_inc(&memcg->moving_account);
1542 synchronize_rcu();
1543}
1544
1545static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1546{
1547 /*
1548 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1549 * We check NULL in callee rather than caller.
1550 */
1551 if (memcg)
1552 atomic_dec(&memcg->moving_account);
1553}
1554
1555/*
1556 * A routine for checking "mem" is under move_account() or not. 1424 * A routine for checking "mem" is under move_account() or not.
1557 * 1425 *
1558 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1426 * Checking a cgroup is mc.from or mc.to or under hierarchy of
@@ -1574,8 +1442,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1574 if (!from) 1442 if (!from)
1575 goto unlock; 1443 goto unlock;
1576 1444
1577 ret = mem_cgroup_same_or_subtree(memcg, from) 1445 ret = mem_cgroup_is_descendant(from, memcg) ||
1578 || mem_cgroup_same_or_subtree(memcg, to); 1446 mem_cgroup_is_descendant(to, memcg);
1579unlock: 1447unlock:
1580 spin_unlock(&mc.lock); 1448 spin_unlock(&mc.lock);
1581 return ret; 1449 return ret;
@@ -1597,23 +1465,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1597 return false; 1465 return false;
1598} 1466}
1599 1467
1600/*
1601 * Take this lock when
1602 * - a code tries to modify page's memcg while it's USED.
1603 * - a code tries to modify page state accounting in a memcg.
1604 */
1605static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1606 unsigned long *flags)
1607{
1608 spin_lock_irqsave(&memcg->move_lock, *flags);
1609}
1610
1611static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1612 unsigned long *flags)
1613{
1614 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1615}
1616
1617#define K(x) ((x) << (PAGE_SHIFT-10)) 1468#define K(x) ((x) << (PAGE_SHIFT-10))
1618/** 1469/**
1619 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1470 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
@@ -1644,18 +1495,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1644 1495
1645 rcu_read_unlock(); 1496 rcu_read_unlock();
1646 1497
1647 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1498 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1648 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1499 K((u64)page_counter_read(&memcg->memory)),
1649 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1500 K((u64)memcg->memory.limit), memcg->memory.failcnt);
1650 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1501 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1651 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1502 K((u64)page_counter_read(&memcg->memsw)),
1652 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1503 K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1653 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1504 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1654 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1505 K((u64)page_counter_read(&memcg->kmem)),
1655 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1506 K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1656 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1657 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1658 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1659 1507
1660 for_each_mem_cgroup_tree(iter, memcg) { 1508 for_each_mem_cgroup_tree(iter, memcg) {
1661 pr_info("Memory cgroup stats for "); 1509 pr_info("Memory cgroup stats for ");
@@ -1695,28 +1543,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1695/* 1543/*
1696 * Return the memory (and swap, if configured) limit for a memcg. 1544 * Return the memory (and swap, if configured) limit for a memcg.
1697 */ 1545 */
1698static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1546static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1699{ 1547{
1700 u64 limit; 1548 unsigned long limit;
1701
1702 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1703 1549
1704 /* 1550 limit = memcg->memory.limit;
1705 * Do not consider swap space if we cannot swap due to swappiness
1706 */
1707 if (mem_cgroup_swappiness(memcg)) { 1551 if (mem_cgroup_swappiness(memcg)) {
1708 u64 memsw; 1552 unsigned long memsw_limit;
1709 1553
1710 limit += total_swap_pages << PAGE_SHIFT; 1554 memsw_limit = memcg->memsw.limit;
1711 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1555 limit = min(limit + total_swap_pages, memsw_limit);
1712
1713 /*
1714 * If memsw is finite and limits the amount of swap space
1715 * available to this memcg, return that limit.
1716 */
1717 limit = min(limit, memsw);
1718 } 1556 }
1719
1720 return limit; 1557 return limit;
1721} 1558}
1722 1559
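mem_cgroup_get_limit() now returns the OOM footprint limit in pages, and the swap handling is condensed: when the group can swap at all, the effective limit is the memory limit plus all of swap, capped by the memsw limit. A small standalone version of that calculation follows; the example values are arbitrary and assume 4K pages.

#include <stdio.h>

/* effective OOM footprint limit, in pages, as computed above */
static unsigned long effective_limit(unsigned long memory_limit,
                                     unsigned long memsw_limit,
                                     unsigned long total_swap_pages,
                                     int swappiness)
{
        unsigned long limit = memory_limit;

        if (swappiness) {
                unsigned long with_swap = limit + total_swap_pages;

                limit = with_swap < memsw_limit ? with_swap : memsw_limit;
        }
        return limit;
}

int main(void)
{
        /* 1G memory limit, 1.5G mem+swap limit, 4G of swap, default swappiness */
        printf("%lu pages\n",
               effective_limit(262144, 393216, 1048576, 60));
        return 0;       /* prints 393216: capped by the memsw limit */
}

mem_cgroup_out_of_memory() then uses this page count directly for totalpages instead of shifting it down by PAGE_SHIFT, which is the change in the hunk that follows.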
@@ -1740,7 +1577,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1740 } 1577 }
1741 1578
1742 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1579 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1743 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1580 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1744 for_each_mem_cgroup_tree(iter, memcg) { 1581 for_each_mem_cgroup_tree(iter, memcg) {
1745 struct css_task_iter it; 1582 struct css_task_iter it;
1746 struct task_struct *task; 1583 struct task_struct *task;
@@ -1880,52 +1717,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1880 memcg->last_scanned_node = node; 1717 memcg->last_scanned_node = node;
1881 return node; 1718 return node;
1882} 1719}
1883
1884/*
1885 * Check all nodes whether it contains reclaimable pages or not.
1886 * For quick scan, we make use of scan_nodes. This will allow us to skip
1887 * unused nodes. But scan_nodes is lazily updated and may not cotain
1888 * enough new information. We need to do double check.
1889 */
1890static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1891{
1892 int nid;
1893
1894 /*
1895 * quick check...making use of scan_node.
1896 * We can skip unused nodes.
1897 */
1898 if (!nodes_empty(memcg->scan_nodes)) {
1899 for (nid = first_node(memcg->scan_nodes);
1900 nid < MAX_NUMNODES;
1901 nid = next_node(nid, memcg->scan_nodes)) {
1902
1903 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1904 return true;
1905 }
1906 }
1907 /*
1908 * Check rest of nodes.
1909 */
1910 for_each_node_state(nid, N_MEMORY) {
1911 if (node_isset(nid, memcg->scan_nodes))
1912 continue;
1913 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1914 return true;
1915 }
1916 return false;
1917}
1918
1919#else 1720#else
1920int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1721int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1921{ 1722{
1922 return 0; 1723 return 0;
1923} 1724}
1924
1925static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1926{
1927 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1928}
1929#endif 1725#endif
1930 1726
1931static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1727static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
@@ -1943,7 +1739,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1943 .priority = 0, 1739 .priority = 0,
1944 }; 1740 };
1945 1741
1946 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1742 excess = soft_limit_excess(root_memcg);
1947 1743
1948 while (1) { 1744 while (1) {
1949 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1745 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
@@ -1969,12 +1765,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1969 } 1765 }
1970 continue; 1766 continue;
1971 } 1767 }
1972 if (!mem_cgroup_reclaimable(victim, false))
1973 continue;
1974 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1768 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1975 zone, &nr_scanned); 1769 zone, &nr_scanned);
1976 *total_scanned += nr_scanned; 1770 *total_scanned += nr_scanned;
1977 if (!res_counter_soft_limit_excess(&root_memcg->res)) 1771 if (!soft_limit_excess(root_memcg))
1978 break; 1772 break;
1979 } 1773 }
1980 mem_cgroup_iter_break(root_memcg, victim); 1774 mem_cgroup_iter_break(root_memcg, victim);
@@ -2081,12 +1875,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
2081 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1875 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2082 oom_wait_memcg = oom_wait_info->memcg; 1876 oom_wait_memcg = oom_wait_info->memcg;
2083 1877
2084 /* 1878 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
2085 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 1879 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
2086 * Then we can use css_is_ancestor without taking care of RCU.
2087 */
2088 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2089 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2090 return 0; 1880 return 0;
2091 return autoremove_wake_function(wait, mode, sync, arg); 1881 return autoremove_wake_function(wait, mode, sync, arg);
2092} 1882}
@@ -2228,26 +2018,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
2228 unsigned long *flags) 2018 unsigned long *flags)
2229{ 2019{
2230 struct mem_cgroup *memcg; 2020 struct mem_cgroup *memcg;
2231 struct page_cgroup *pc;
2232 2021
2233 rcu_read_lock(); 2022 rcu_read_lock();
2234 2023
2235 if (mem_cgroup_disabled()) 2024 if (mem_cgroup_disabled())
2236 return NULL; 2025 return NULL;
2237
2238 pc = lookup_page_cgroup(page);
2239again: 2026again:
2240 memcg = pc->mem_cgroup; 2027 memcg = page->mem_cgroup;
2241 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2028 if (unlikely(!memcg))
2242 return NULL; 2029 return NULL;
2243 2030
2244 *locked = false; 2031 *locked = false;
2245 if (atomic_read(&memcg->moving_account) <= 0) 2032 if (atomic_read(&memcg->moving_account) <= 0)
2246 return memcg; 2033 return memcg;
2247 2034
2248 move_lock_mem_cgroup(memcg, flags); 2035 spin_lock_irqsave(&memcg->move_lock, *flags);
2249 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2036 if (memcg != page->mem_cgroup) {
2250 move_unlock_mem_cgroup(memcg, flags); 2037 spin_unlock_irqrestore(&memcg->move_lock, *flags);
2251 goto again; 2038 goto again;
2252 } 2039 }
2253 *locked = true; 2040 *locked = true;
@@ -2261,11 +2048,11 @@ again:
2261 * @locked: value received from mem_cgroup_begin_page_stat() 2048 * @locked: value received from mem_cgroup_begin_page_stat()
2262 * @flags: value received from mem_cgroup_begin_page_stat() 2049 * @flags: value received from mem_cgroup_begin_page_stat()
2263 */ 2050 */
2264void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, 2051void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
2265 unsigned long flags) 2052 unsigned long *flags)
2266{ 2053{
2267 if (memcg && locked) 2054 if (memcg && *locked)
2268 move_unlock_mem_cgroup(memcg, &flags); 2055 spin_unlock_irqrestore(&memcg->move_lock, *flags);
2269 2056
2270 rcu_read_unlock(); 2057 rcu_read_unlock();
2271} 2058}
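The two page-stat hunks above drop lookup_page_cgroup() in favour of the new page->mem_cgroup pointer and open-code the move_lock handling: the reader loads page->mem_cgroup, takes the group's move_lock only while charge moving may be in flight, and then rechecks the pointer under the lock, retrying if the page was moved to another group in between; mem_cgroup_end_page_stat() correspondingly takes the locked and flags values by pointer so it can unlock with the saved state. The control flow, reduced to a compilable single-threaded sketch with a pthread mutex; the kernel additionally holds rcu_read_lock() across this and uses an IRQ-saving spinlock, which the sketch omits, and all names here are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct group {
        pthread_mutex_t move_lock;
        int moving_account;             /* nonzero while charges are being moved */
};

struct page_model {
        struct group *owner;            /* plays the role of page->mem_cgroup */
};

static struct group *begin_page_stat(struct page_model *page, bool *locked)
{
        struct group *g;

again:
        g = page->owner;                /* the kernel reads this under RCU */
        if (!g)
                return NULL;

        *locked = false;
        if (g->moving_account <= 0)
                return g;               /* fast path: nobody is moving charges */

        pthread_mutex_lock(&g->move_lock);
        if (g != page->owner) {
                /* the page moved to another group meanwhile: retry */
                pthread_mutex_unlock(&g->move_lock);
                goto again;
        }
        *locked = true;
        return g;
}

static void end_page_stat(struct group *g, bool *locked)
{
        if (g && *locked)
                pthread_mutex_unlock(&g->move_lock);
}

int main(void)
{
        struct group grp = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct page_model page = { &grp };
        bool locked = false;

        struct group *owner = begin_page_stat(&page, &locked);
        /* ... update per-group statistics for the page here ... */
        end_page_stat(owner, &locked);
        return 0;
}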
@@ -2316,33 +2103,32 @@ static DEFINE_MUTEX(percpu_charge_mutex);
2316static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2103static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2317{ 2104{
2318 struct memcg_stock_pcp *stock; 2105 struct memcg_stock_pcp *stock;
2319 bool ret = true; 2106 bool ret = false;
2320 2107
2321 if (nr_pages > CHARGE_BATCH) 2108 if (nr_pages > CHARGE_BATCH)
2322 return false; 2109 return ret;
2323 2110
2324 stock = &get_cpu_var(memcg_stock); 2111 stock = &get_cpu_var(memcg_stock);
2325 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2112 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2326 stock->nr_pages -= nr_pages; 2113 stock->nr_pages -= nr_pages;
2327 else /* need to call res_counter_charge */ 2114 ret = true;
2328 ret = false; 2115 }
2329 put_cpu_var(memcg_stock); 2116 put_cpu_var(memcg_stock);
2330 return ret; 2117 return ret;
2331} 2118}
2332 2119
2333/* 2120/*
2334 * Returns stocks cached in percpu to res_counter and reset cached information. 2121 * Returns stocks cached in percpu and reset cached information.
2335 */ 2122 */
2336static void drain_stock(struct memcg_stock_pcp *stock) 2123static void drain_stock(struct memcg_stock_pcp *stock)
2337{ 2124{
2338 struct mem_cgroup *old = stock->cached; 2125 struct mem_cgroup *old = stock->cached;
2339 2126
2340 if (stock->nr_pages) { 2127 if (stock->nr_pages) {
2341 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2128 page_counter_uncharge(&old->memory, stock->nr_pages);
2342
2343 res_counter_uncharge(&old->res, bytes);
2344 if (do_swap_account) 2129 if (do_swap_account)
2345 res_counter_uncharge(&old->memsw, bytes); 2130 page_counter_uncharge(&old->memsw, stock->nr_pages);
2131 css_put_many(&old->css, stock->nr_pages);
2346 stock->nr_pages = 0; 2132 stock->nr_pages = 0;
2347 } 2133 }
2348 stock->cached = NULL; 2134 stock->cached = NULL;
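The per-cpu stock above keeps its role but now caches pre-charged pages rather than bytes: consume_stock() starts from ret = false and only succeeds when the local CPU's stock belongs to the charging memcg and holds enough pages, and drain_stock() gives leftover pages back through page_counter_uncharge() and drops the matching css references. Below is a single-"CPU" userspace model of the consume/refill cycle; the CHARGE_BATCH value is assumed for the example, and everything beyond the names taken from the diff is invented scaffolding.

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH    32UL            /* assumed batch size for the example */

struct stock_model {
        int cached_memcg;               /* id of the group the stock belongs to */
        unsigned long nr_pages;         /* pre-charged pages available locally */
};

static struct stock_model stock;        /* one of these per CPU in the kernel */

static bool consume_stock(int memcg, unsigned long nr_pages)
{
        bool ret = false;

        if (nr_pages > CHARGE_BATCH)
                return ret;

        if (memcg == stock.cached_memcg && stock.nr_pages >= nr_pages) {
                stock.nr_pages -= nr_pages;
                ret = true;
        }
        return ret;
}

static void refill_stock(int memcg, unsigned long nr_pages)
{
        if (memcg != stock.cached_memcg) {
                /* drain_stock(): leftover pages go back to the page counters */
                stock.nr_pages = 0;
                stock.cached_memcg = memcg;
        }
        stock.nr_pages += nr_pages;
}

int main(void)
{
        refill_stock(1, CHARGE_BATCH - 1);      /* overcharge parked by try_charge() */
        printf("%d\n", consume_stock(1, 8));    /* 1: served from the local stock */
        printf("%d\n", consume_stock(2, 8));    /* 0: stock belongs to another group */
        printf("%lu\n", stock.nr_pages);        /* 23 pages still cached */
        return 0;
}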
@@ -2371,7 +2157,7 @@ static void __init memcg_stock_init(void)
2371} 2157}
2372 2158
2373/* 2159/*
2374 * Cache charges(val) which is from res_counter, to local per_cpu area. 2160 * Cache charges(val) to local per_cpu area.
2375 * This will be consumed by consume_stock() function, later. 2161 * This will be consumed by consume_stock() function, later.
2376 */ 2162 */
2377static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2163static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2388,13 +2174,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2388 2174
2389/* 2175/*
2390 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2176 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2391 * of the hierarchy under it. sync flag says whether we should block 2177 * of the hierarchy under it.
2392 * until the work is done.
2393 */ 2178 */
2394static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2179static void drain_all_stock(struct mem_cgroup *root_memcg)
2395{ 2180{
2396 int cpu, curcpu; 2181 int cpu, curcpu;
2397 2182
 2183 /* If someone's already draining, avoid running more workers. */
2184 if (!mutex_trylock(&percpu_charge_mutex))
2185 return;
2398 /* Notify other cpus that system-wide "drain" is running */ 2186 /* Notify other cpus that system-wide "drain" is running */
2399 get_online_cpus(); 2187 get_online_cpus();
2400 curcpu = get_cpu(); 2188 curcpu = get_cpu();
@@ -2405,7 +2193,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2405 memcg = stock->cached; 2193 memcg = stock->cached;
2406 if (!memcg || !stock->nr_pages) 2194 if (!memcg || !stock->nr_pages)
2407 continue; 2195 continue;
2408 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2196 if (!mem_cgroup_is_descendant(memcg, root_memcg))
2409 continue; 2197 continue;
2410 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2198 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2411 if (cpu == curcpu) 2199 if (cpu == curcpu)
@@ -2415,42 +2203,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2415 } 2203 }
2416 } 2204 }
2417 put_cpu(); 2205 put_cpu();
2418
2419 if (!sync)
2420 goto out;
2421
2422 for_each_online_cpu(cpu) {
2423 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2424 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2425 flush_work(&stock->work);
2426 }
2427out:
2428 put_online_cpus(); 2206 put_online_cpus();
2429}
2430
2431/*
2432 * Tries to drain stocked charges in other cpus. This function is asynchronous
2433 * and just put a work per cpu for draining localy on each cpu. Caller can
2434 * expects some charges will be back to res_counter later but cannot wait for
2435 * it.
2436 */
2437static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2438{
2439 /*
2440 * If someone calls draining, avoid adding more kworker runs.
2441 */
2442 if (!mutex_trylock(&percpu_charge_mutex))
2443 return;
2444 drain_all_stock(root_memcg, false);
2445 mutex_unlock(&percpu_charge_mutex);
2446}
2447
2448/* This is a synchronous drain interface. */
2449static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2450{
2451 /* called when force_empty is called */
2452 mutex_lock(&percpu_charge_mutex);
2453 drain_all_stock(root_memcg, true);
2454 mutex_unlock(&percpu_charge_mutex); 2207 mutex_unlock(&percpu_charge_mutex);
2455} 2208}
2456 2209
@@ -2506,9 +2259,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2506 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2259 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2507 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2260 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2508 struct mem_cgroup *mem_over_limit; 2261 struct mem_cgroup *mem_over_limit;
2509 struct res_counter *fail_res; 2262 struct page_counter *counter;
2510 unsigned long nr_reclaimed; 2263 unsigned long nr_reclaimed;
2511 unsigned long long size;
2512 bool may_swap = true; 2264 bool may_swap = true;
2513 bool drained = false; 2265 bool drained = false;
2514 int ret = 0; 2266 int ret = 0;
@@ -2519,16 +2271,15 @@ retry:
2519 if (consume_stock(memcg, nr_pages)) 2271 if (consume_stock(memcg, nr_pages))
2520 goto done; 2272 goto done;
2521 2273
2522 size = batch * PAGE_SIZE;
2523 if (!do_swap_account || 2274 if (!do_swap_account ||
2524 !res_counter_charge(&memcg->memsw, size, &fail_res)) { 2275 !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2525 if (!res_counter_charge(&memcg->res, size, &fail_res)) 2276 if (!page_counter_try_charge(&memcg->memory, batch, &counter))
2526 goto done_restock; 2277 goto done_restock;
2527 if (do_swap_account) 2278 if (do_swap_account)
2528 res_counter_uncharge(&memcg->memsw, size); 2279 page_counter_uncharge(&memcg->memsw, batch);
2529 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2280 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2530 } else { 2281 } else {
2531 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2282 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2532 may_swap = false; 2283 may_swap = false;
2533 } 2284 }
2534 2285
@@ -2561,7 +2312,7 @@ retry:
2561 goto retry; 2312 goto retry;
2562 2313
2563 if (!drained) { 2314 if (!drained) {
2564 drain_all_stock_async(mem_over_limit); 2315 drain_all_stock(mem_over_limit);
2565 drained = true; 2316 drained = true;
2566 goto retry; 2317 goto retry;
2567 } 2318 }
@@ -2603,6 +2354,7 @@ bypass:
2603 return -EINTR; 2354 return -EINTR;
2604 2355
2605done_restock: 2356done_restock:
2357 css_get_many(&memcg->css, batch);
2606 if (batch > nr_pages) 2358 if (batch > nr_pages)
2607 refill_stock(memcg, batch - nr_pages); 2359 refill_stock(memcg, batch - nr_pages);
2608done: 2360done:
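In the new try_charge() above, charging is done in pages against the two counters in a fixed order: with swap accounting enabled memsw is tried first, then memory; if memory fails the memsw charge is rolled back, and whichever counter refused the charge is mapped back to its memcg with mem_cgroup_from_counter() to become the reclaim target (with swapping disabled for that reclaim when memsw was the limiter). On success, css_get_many() pins one css reference per charged page before any overcharge is parked in the per-cpu stock. The sketch below captures only the ordering and rollback; the counter model and return convention are illustrative, not the kernel's.

#include <stdbool.h>
#include <stddef.h>

struct counter_model {
        unsigned long count;
        unsigned long limit;
};

struct group_model {
        struct counter_model memory;    /* pages charged to memory */
        struct counter_model memsw;     /* pages charged to memory+swap */
};

static bool try_charge_counter(struct counter_model *c, unsigned long nr_pages)
{
        if (c->count + nr_pages > c->limit)
                return false;
        c->count += nr_pages;
        return true;
}

static void uncharge_counter(struct counter_model *c, unsigned long nr_pages)
{
        c->count -= nr_pages;
}

/*
 * Returns NULL on success, otherwise the counter that is over its limit,
 * mirroring how try_charge() picks mem_over_limit as the reclaim target.
 */
static struct counter_model *charge(struct group_model *g, unsigned long batch,
                                    bool do_swap_account)
{
        if (do_swap_account && !try_charge_counter(&g->memsw, batch))
                return &g->memsw;               /* reclaim, but don't swap */

        if (!try_charge_counter(&g->memory, batch)) {
                if (do_swap_account)
                        uncharge_counter(&g->memsw, batch);     /* roll back */
                return &g->memory;              /* reclaim, swapping allowed */
        }
        return NULL;                            /* both counters charged */
}

int main(void)
{
        struct group_model g = {
                .memory = { .limit = 100 },
                .memsw  = { .limit = 150 },
        };

        return charge(&g, 32, true) ? 1 : 0;    /* charges both counters, returns 0 */
}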
@@ -2611,32 +2363,14 @@ done:
2611 2363
2612static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2364static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2613{ 2365{
2614 unsigned long bytes = nr_pages * PAGE_SIZE;
2615
2616 if (mem_cgroup_is_root(memcg)) 2366 if (mem_cgroup_is_root(memcg))
2617 return; 2367 return;
2618 2368
2619 res_counter_uncharge(&memcg->res, bytes); 2369 page_counter_uncharge(&memcg->memory, nr_pages);
2620 if (do_swap_account) 2370 if (do_swap_account)
2621 res_counter_uncharge(&memcg->memsw, bytes); 2371 page_counter_uncharge(&memcg->memsw, nr_pages);
2622}
2623
2624/*
2625 * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
2626 * This is useful when moving usage to parent cgroup.
2627 */
2628static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2629 unsigned int nr_pages)
2630{
2631 unsigned long bytes = nr_pages * PAGE_SIZE;
2632
2633 if (mem_cgroup_is_root(memcg))
2634 return;
2635 2372
2636 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2373 css_put_many(&memcg->css, nr_pages);
2637 if (do_swap_account)
2638 res_counter_uncharge_until(&memcg->memsw,
2639 memcg->memsw.parent, bytes);
2640} 2374}
2641 2375
2642/* 2376/*
@@ -2665,17 +2399,15 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2665 */ 2399 */
2666struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2400struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2667{ 2401{
2668 struct mem_cgroup *memcg = NULL; 2402 struct mem_cgroup *memcg;
2669 struct page_cgroup *pc;
2670 unsigned short id; 2403 unsigned short id;
2671 swp_entry_t ent; 2404 swp_entry_t ent;
2672 2405
2673 VM_BUG_ON_PAGE(!PageLocked(page), page); 2406 VM_BUG_ON_PAGE(!PageLocked(page), page);
2674 2407
2675 pc = lookup_page_cgroup(page); 2408 memcg = page->mem_cgroup;
2676 if (PageCgroupUsed(pc)) { 2409 if (memcg) {
2677 memcg = pc->mem_cgroup; 2410 if (!css_tryget_online(&memcg->css))
2678 if (memcg && !css_tryget_online(&memcg->css))
2679 memcg = NULL; 2411 memcg = NULL;
2680 } else if (PageSwapCache(page)) { 2412 } else if (PageSwapCache(page)) {
2681 ent.val = page_private(page); 2413 ent.val = page_private(page);
@@ -2723,14 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated)
2723static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2455static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2724 bool lrucare) 2456 bool lrucare)
2725{ 2457{
2726 struct page_cgroup *pc = lookup_page_cgroup(page);
2727 int isolated; 2458 int isolated;
2728 2459
2729 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2460 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2730 /*
2731 * we don't need page_cgroup_lock about tail pages, becase they are not
2732 * accessed by any other context at this point.
2733 */
2734 2461
2735 /* 2462 /*
2736 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2463 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
@@ -2741,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2741 2468
2742 /* 2469 /*
2743 * Nobody should be changing or seriously looking at 2470 * Nobody should be changing or seriously looking at
2744 * pc->mem_cgroup and pc->flags at this point: 2471 * page->mem_cgroup at this point:
2745 * 2472 *
2746 * - the page is uncharged 2473 * - the page is uncharged
2747 * 2474 *
@@ -2753,15 +2480,12 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2753 * - a page cache insertion, a swapin fault, or a migration 2480 * - a page cache insertion, a swapin fault, or a migration
2754 * have the page locked 2481 * have the page locked
2755 */ 2482 */
2756 pc->mem_cgroup = memcg; 2483 page->mem_cgroup = memcg;
2757 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
2758 2484
2759 if (lrucare) 2485 if (lrucare)
2760 unlock_page_lru(page, isolated); 2486 unlock_page_lru(page, isolated);
2761} 2487}
2762 2488
2763static DEFINE_MUTEX(set_limit_mutex);
2764
2765#ifdef CONFIG_MEMCG_KMEM 2489#ifdef CONFIG_MEMCG_KMEM
2766/* 2490/*
2767 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2491 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
@@ -2769,8 +2493,6 @@ static DEFINE_MUTEX(set_limit_mutex);
2769 */ 2493 */
2770static DEFINE_MUTEX(memcg_slab_mutex); 2494static DEFINE_MUTEX(memcg_slab_mutex);
2771 2495
2772static DEFINE_MUTEX(activate_kmem_mutex);
2773
2774/* 2496/*
2775 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2497 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2776 * in the memcg_cache_params struct. 2498 * in the memcg_cache_params struct.
@@ -2784,36 +2506,17 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2784 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2506 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2785} 2507}
2786 2508
2787#ifdef CONFIG_SLABINFO 2509static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2788static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2510 unsigned long nr_pages)
2789{
2790 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2791 struct memcg_cache_params *params;
2792
2793 if (!memcg_kmem_is_active(memcg))
2794 return -EIO;
2795
2796 print_slabinfo_header(m);
2797
2798 mutex_lock(&memcg_slab_mutex);
2799 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2800 cache_show(memcg_params_to_cache(params), m);
2801 mutex_unlock(&memcg_slab_mutex);
2802
2803 return 0;
2804}
2805#endif
2806
2807static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2808{ 2511{
2809 struct res_counter *fail_res; 2512 struct page_counter *counter;
2810 int ret = 0; 2513 int ret = 0;
2811 2514
2812 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2515 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
2813 if (ret) 2516 if (ret < 0)
2814 return ret; 2517 return ret;
2815 2518
2816 ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); 2519 ret = try_charge(memcg, gfp, nr_pages);
2817 if (ret == -EINTR) { 2520 if (ret == -EINTR) {
2818 /* 2521 /*
2819 * try_charge() chose to bypass to root due to OOM kill or 2522 * try_charge() chose to bypass to root due to OOM kill or
@@ -2830,37 +2533,27 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2830 * when the allocation triggers should have been already 2533 * when the allocation triggers should have been already
2831 * directed to the root cgroup in memcontrol.h 2534 * directed to the root cgroup in memcontrol.h
2832 */ 2535 */
2833 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2536 page_counter_charge(&memcg->memory, nr_pages);
2834 if (do_swap_account) 2537 if (do_swap_account)
2835 res_counter_charge_nofail(&memcg->memsw, size, 2538 page_counter_charge(&memcg->memsw, nr_pages);
2836 &fail_res); 2539 css_get_many(&memcg->css, nr_pages);
2837 ret = 0; 2540 ret = 0;
2838 } else if (ret) 2541 } else if (ret)
2839 res_counter_uncharge(&memcg->kmem, size); 2542 page_counter_uncharge(&memcg->kmem, nr_pages);
2840 2543
2841 return ret; 2544 return ret;
2842} 2545}
2843 2546
2844static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2547static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
2548 unsigned long nr_pages)
2845{ 2549{
2846 res_counter_uncharge(&memcg->res, size); 2550 page_counter_uncharge(&memcg->memory, nr_pages);
2847 if (do_swap_account) 2551 if (do_swap_account)
2848 res_counter_uncharge(&memcg->memsw, size); 2552 page_counter_uncharge(&memcg->memsw, nr_pages);
2849 2553
2850 /* Not down to 0 */ 2554 page_counter_uncharge(&memcg->kmem, nr_pages);
2851 if (res_counter_uncharge(&memcg->kmem, size))
2852 return;
2853 2555
2854 /* 2556 css_put_many(&memcg->css, nr_pages);
2855 * Releases a reference taken in kmem_cgroup_css_offline in case
2856 * this last uncharge is racing with the offlining code or it is
2857 * outliving the memcg existence.
2858 *
2859 * The memory barrier imposed by test&clear is paired with the
2860 * explicit one in memcg_kmem_mark_dead().
2861 */
2862 if (memcg_kmem_test_and_clear_dead(memcg))
2863 css_put(&memcg->css);
2864} 2557}
2865 2558
2866/* 2559/*
@@ -3124,19 +2817,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3124 2817
3125int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 2818int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3126{ 2819{
2820 unsigned int nr_pages = 1 << order;
3127 int res; 2821 int res;
3128 2822
3129 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, 2823 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
3130 PAGE_SIZE << order);
3131 if (!res) 2824 if (!res)
3132 atomic_add(1 << order, &cachep->memcg_params->nr_pages); 2825 atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
3133 return res; 2826 return res;
3134} 2827}
3135 2828
3136void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 2829void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3137{ 2830{
3138 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); 2831 unsigned int nr_pages = 1 << order;
3139 atomic_sub(1 << order, &cachep->memcg_params->nr_pages); 2832
2833 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
2834 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
3140} 2835}
3141 2836
3142/* 2837/*
@@ -3257,7 +2952,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3257 return true; 2952 return true;
3258 } 2953 }
3259 2954
3260 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 2955 ret = memcg_charge_kmem(memcg, gfp, 1 << order);
3261 if (!ret) 2956 if (!ret)
3262 *_memcg = memcg; 2957 *_memcg = memcg;
3263 2958
@@ -3268,46 +2963,27 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3268void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2963void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3269 int order) 2964 int order)
3270{ 2965{
3271 struct page_cgroup *pc;
3272
3273 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2966 VM_BUG_ON(mem_cgroup_is_root(memcg));
3274 2967
3275 /* The page allocation failed. Revert */ 2968 /* The page allocation failed. Revert */
3276 if (!page) { 2969 if (!page) {
3277 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 2970 memcg_uncharge_kmem(memcg, 1 << order);
3278 return; 2971 return;
3279 } 2972 }
3280 /* 2973 page->mem_cgroup = memcg;
3281 * The page is freshly allocated and not visible to any
3282 * outside callers yet. Set up pc non-atomically.
3283 */
3284 pc = lookup_page_cgroup(page);
3285 pc->mem_cgroup = memcg;
3286 pc->flags = PCG_USED;
3287} 2974}
3288 2975
3289void __memcg_kmem_uncharge_pages(struct page *page, int order) 2976void __memcg_kmem_uncharge_pages(struct page *page, int order)
3290{ 2977{
3291 struct mem_cgroup *memcg = NULL; 2978 struct mem_cgroup *memcg = page->mem_cgroup;
3292 struct page_cgroup *pc;
3293
3294 2979
3295 pc = lookup_page_cgroup(page);
3296 if (!PageCgroupUsed(pc))
3297 return;
3298
3299 memcg = pc->mem_cgroup;
3300 pc->flags = 0;
3301
3302 /*
3303 * We trust that only if there is a memcg associated with the page, it
3304 * is a valid allocation
3305 */
3306 if (!memcg) 2980 if (!memcg)
3307 return; 2981 return;
3308 2982
3309 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2983 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3310 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 2984
2985 memcg_uncharge_kmem(memcg, 1 << order);
2986 page->mem_cgroup = NULL;
3311} 2987}
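
The commit/uncharge pair above drops the page_cgroup lookup: ownership is recorded by pointing page->mem_cgroup at the charging group and cleared again on uncharge, with a NULL pointer taking over the role of the old !PageCgroupUsed() check. A user-space sketch of that ownership protocol, with struct page and struct mem_cgroup reduced to the single field the hunk touches:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct mem_cgroup { const char *name; };
struct page { struct mem_cgroup *mem_cgroup; };

/* record ownership once the allocation has succeeded */
static void kmem_commit_charge(struct page *page, struct mem_cgroup *memcg)
{
    page->mem_cgroup = memcg;
}

/* uncharge only pages that were actually committed to a group */
static void kmem_uncharge_page(struct page *page)
{
    struct mem_cgroup *memcg = page->mem_cgroup;

    if (!memcg)  /* never charged (e.g. readahead): nothing to undo */
        return;

    printf("uncharging page owned by %s\n", memcg->name);
    page->mem_cgroup = NULL;
}

int main(void)
{
    struct mem_cgroup grp = { .name = "A" };
    struct page charged = { 0 }, readahead = { 0 };

    kmem_commit_charge(&charged, &grp);
    kmem_uncharge_page(&charged);    /* prints once, then clears the owner */
    kmem_uncharge_page(&readahead);  /* no owner: silently ignored */
    assert(charged.mem_cgroup == NULL);
    return 0;
}
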
3312#else 2988#else
3313static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 2989static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
@@ -3325,21 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3325 */ 3001 */
3326void mem_cgroup_split_huge_fixup(struct page *head) 3002void mem_cgroup_split_huge_fixup(struct page *head)
3327{ 3003{
3328 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3329 struct page_cgroup *pc;
3330 struct mem_cgroup *memcg;
3331 int i; 3004 int i;
3332 3005
3333 if (mem_cgroup_disabled()) 3006 if (mem_cgroup_disabled())
3334 return; 3007 return;
3335 3008
3336 memcg = head_pc->mem_cgroup; 3009 for (i = 1; i < HPAGE_PMD_NR; i++)
3337 for (i = 1; i < HPAGE_PMD_NR; i++) { 3010 head[i].mem_cgroup = head->mem_cgroup;
3338 pc = head_pc + i; 3011
3339 pc->mem_cgroup = memcg; 3012 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3340 pc->flags = head_pc->flags;
3341 }
3342 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3343 HPAGE_PMD_NR); 3013 HPAGE_PMD_NR);
3344} 3014}
3345#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3015#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3348,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3348 * mem_cgroup_move_account - move account of the page 3018 * mem_cgroup_move_account - move account of the page
3349 * @page: the page 3019 * @page: the page
3350 * @nr_pages: number of regular pages (>1 for huge pages) 3020 * @nr_pages: number of regular pages (>1 for huge pages)
3351 * @pc: page_cgroup of the page.
3352 * @from: mem_cgroup which the page is moved from. 3021 * @from: mem_cgroup which the page is moved from.
3353 * @to: mem_cgroup which the page is moved to. @from != @to. 3022 * @to: mem_cgroup which the page is moved to. @from != @to.
3354 * 3023 *
@@ -3361,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3361 */ 3030 */
3362static int mem_cgroup_move_account(struct page *page, 3031static int mem_cgroup_move_account(struct page *page,
3363 unsigned int nr_pages, 3032 unsigned int nr_pages,
3364 struct page_cgroup *pc,
3365 struct mem_cgroup *from, 3033 struct mem_cgroup *from,
3366 struct mem_cgroup *to) 3034 struct mem_cgroup *to)
3367{ 3035{
@@ -3381,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
3381 goto out; 3049 goto out;
3382 3050
3383 /* 3051 /*
3384 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3052 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
3385 * of its source page while we change it: page migration takes 3053 * of its source page while we change it: page migration takes
3386 * both pages off the LRU, but page cache replacement doesn't. 3054 * both pages off the LRU, but page cache replacement doesn't.
3387 */ 3055 */
@@ -3389,10 +3057,10 @@ static int mem_cgroup_move_account(struct page *page,
3389 goto out; 3057 goto out;
3390 3058
3391 ret = -EINVAL; 3059 ret = -EINVAL;
3392 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3060 if (page->mem_cgroup != from)
3393 goto out_unlock; 3061 goto out_unlock;
3394 3062
3395 move_lock_mem_cgroup(from, &flags); 3063 spin_lock_irqsave(&from->move_lock, flags);
3396 3064
3397 if (!PageAnon(page) && page_mapped(page)) { 3065 if (!PageAnon(page) && page_mapped(page)) {
3398 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3066 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@ -3409,14 +3077,15 @@ static int mem_cgroup_move_account(struct page *page,
3409 } 3077 }
3410 3078
3411 /* 3079 /*
3412 * It is safe to change pc->mem_cgroup here because the page 3080 * It is safe to change page->mem_cgroup here because the page
3413 * is referenced, charged, and isolated - we can't race with 3081 * is referenced, charged, and isolated - we can't race with
3414 * uncharging, charging, migration, or LRU putback. 3082 * uncharging, charging, migration, or LRU putback.
3415 */ 3083 */
3416 3084
3417 /* caller should have done css_get */ 3085 /* caller should have done css_get */
3418 pc->mem_cgroup = to; 3086 page->mem_cgroup = to;
3419 move_unlock_mem_cgroup(from, &flags); 3087 spin_unlock_irqrestore(&from->move_lock, flags);
3088
3420 ret = 0; 3089 ret = 0;
3421 3090
3422 local_irq_disable(); 3091 local_irq_disable();
@@ -3431,72 +3100,6 @@ out:
3431 return ret; 3100 return ret;
3432} 3101}
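
mem_cgroup_move_account() above now serializes against page-stat updates with a plain spinlock in the source group (from->move_lock) instead of the old move_lock_mem_cgroup() helpers, and re-checks page->mem_cgroup under that lock before reassigning it. A simplified single-step model of that protocol follows; a pthread mutex stands in for the irq-safe spinlock and one counter stands in for the per-cpu statistics:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct mem_cgroup {
    pthread_mutex_t move_lock;
    long file_mapped;  /* stand-in for the per-cpu stat counters */
};
struct page { struct mem_cgroup *mem_cgroup; };

static bool move_account(struct page *page,
                         struct mem_cgroup *from, struct mem_cgroup *to)
{
    bool moved = false;

    pthread_mutex_lock(&from->move_lock);
    if (page->mem_cgroup == from) {  /* still owned by @from? */
        from->file_mapped--;
        to->file_mapped++;
        page->mem_cgroup = to;       /* caller holds a reference on @to */
        moved = true;
    }
    pthread_mutex_unlock(&from->move_lock);
    return moved;
}

int main(void)
{
    struct mem_cgroup a = { PTHREAD_MUTEX_INITIALIZER, 1 };
    struct mem_cgroup b = { PTHREAD_MUTEX_INITIALIZER, 0 };
    struct page page = { &a };

    printf("moved: %d\n", move_account(&page, &a, &b));        /* 1 */
    printf("moved again: %d\n", move_account(&page, &a, &b));  /* 0: lost the race */
    return 0;
}
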
3433 3102
3434/**
3435 * mem_cgroup_move_parent - moves page to the parent group
3436 * @page: the page to move
3437 * @pc: page_cgroup of the page
3438 * @child: page's cgroup
3439 *
3440 * move charges to its parent or the root cgroup if the group has no
3441 * parent (aka use_hierarchy==0).
3442 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3443 * mem_cgroup_move_account fails) the failure is always temporary and
3444 * it signals a race with a page removal/uncharge or migration. In the
3445 * first case the page is on the way out and it will vanish from the LRU
3446 * on the next attempt and the call should be retried later.
3447 * Isolation from the LRU fails only if page has been isolated from
3448 * the LRU since we looked at it and that usually means either global
3449 * reclaim or migration going on. The page will either get back to the
3450 * LRU or vanish.
3451 * Finally mem_cgroup_move_account fails only if the page got uncharged
3452 * (!PageCgroupUsed) or moved to a different group. The page will
3453 * disappear in the next attempt.
3454 */
3455static int mem_cgroup_move_parent(struct page *page,
3456 struct page_cgroup *pc,
3457 struct mem_cgroup *child)
3458{
3459 struct mem_cgroup *parent;
3460 unsigned int nr_pages;
3461 unsigned long uninitialized_var(flags);
3462 int ret;
3463
3464 VM_BUG_ON(mem_cgroup_is_root(child));
3465
3466 ret = -EBUSY;
3467 if (!get_page_unless_zero(page))
3468 goto out;
3469 if (isolate_lru_page(page))
3470 goto put;
3471
3472 nr_pages = hpage_nr_pages(page);
3473
3474 parent = parent_mem_cgroup(child);
3475 /*
3476 * If no parent, move charges to root cgroup.
3477 */
3478 if (!parent)
3479 parent = root_mem_cgroup;
3480
3481 if (nr_pages > 1) {
3482 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3483 flags = compound_lock_irqsave(page);
3484 }
3485
3486 ret = mem_cgroup_move_account(page, nr_pages,
3487 pc, child, parent);
3488 if (!ret)
3489 __mem_cgroup_cancel_local_charge(child, nr_pages);
3490
3491 if (nr_pages > 1)
3492 compound_unlock_irqrestore(page, flags);
3493 putback_lru_page(page);
3494put:
3495 put_page(page);
3496out:
3497 return ret;
3498}
3499
3500#ifdef CONFIG_MEMCG_SWAP 3103#ifdef CONFIG_MEMCG_SWAP
3501static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3104static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3502 bool charge) 3105 bool charge)
@@ -3516,7 +3119,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3516 * 3119 *
3517 * Returns 0 on success, -EINVAL on failure. 3120 * Returns 0 on success, -EINVAL on failure.
3518 * 3121 *
3519 * The caller must have charged to @to, IOW, called res_counter_charge() about 3122 * The caller must have charged to @to, IOW, called page_counter_charge() about
3520 * both res and memsw, and called css_get(). 3123 * both res and memsw, and called css_get().
3521 */ 3124 */
3522static int mem_cgroup_move_swap_account(swp_entry_t entry, 3125static int mem_cgroup_move_swap_account(swp_entry_t entry,
@@ -3532,7 +3135,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3532 mem_cgroup_swap_statistics(to, true); 3135 mem_cgroup_swap_statistics(to, true);
3533 /* 3136 /*
3534 * This function is only called from task migration context now. 3137 * This function is only called from task migration context now.
3535 * It postpones res_counter and refcount handling till the end 3138 * It postpones page_counter and refcount handling till the end
3536 * of task migration(mem_cgroup_clear_mc()) for performance 3139 * of task migration(mem_cgroup_clear_mc()) for performance
3537 * improvement. But we cannot postpone css_get(to) because if 3140 * improvement. But we cannot postpone css_get(to) because if
3538 * the process that has been moved to @to does swap-in, the 3141 * the process that has been moved to @to does swap-in, the
@@ -3554,96 +3157,57 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3554} 3157}
3555#endif 3158#endif
3556 3159
3557#ifdef CONFIG_DEBUG_VM 3160static DEFINE_MUTEX(memcg_limit_mutex);
3558static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3559{
3560 struct page_cgroup *pc;
3561
3562 pc = lookup_page_cgroup(page);
3563 /*
3564 * Can be NULL while feeding pages into the page allocator for
3565 * the first time, i.e. during boot or memory hotplug;
3566 * or when mem_cgroup_disabled().
3567 */
3568 if (likely(pc) && PageCgroupUsed(pc))
3569 return pc;
3570 return NULL;
3571}
3572
3573bool mem_cgroup_bad_page_check(struct page *page)
3574{
3575 if (mem_cgroup_disabled())
3576 return false;
3577
3578 return lookup_page_cgroup_used(page) != NULL;
3579}
3580
3581void mem_cgroup_print_bad_page(struct page *page)
3582{
3583 struct page_cgroup *pc;
3584
3585 pc = lookup_page_cgroup_used(page);
3586 if (pc) {
3587 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3588 pc, pc->flags, pc->mem_cgroup);
3589 }
3590}
3591#endif
3592 3161
3593static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3162static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3594 unsigned long long val) 3163 unsigned long limit)
3595{ 3164{
3165 unsigned long curusage;
3166 unsigned long oldusage;
3167 bool enlarge = false;
3596 int retry_count; 3168 int retry_count;
3597 int ret = 0; 3169 int ret;
3598 int children = mem_cgroup_count_children(memcg);
3599 u64 curusage, oldusage;
3600 int enlarge;
3601 3170
3602 /* 3171 /*
3603 * For keeping hierarchical_reclaim simple, how long we should retry 3172 * For keeping hierarchical_reclaim simple, how long we should retry
3604 * is depends on callers. We set our retry-count to be function 3173 * is depends on callers. We set our retry-count to be function
3605 * of # of children which we should visit in this loop. 3174 * of # of children which we should visit in this loop.
3606 */ 3175 */
3607 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3176 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3177 mem_cgroup_count_children(memcg);
3608 3178
3609 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3179 oldusage = page_counter_read(&memcg->memory);
3610 3180
3611 enlarge = 0; 3181 do {
3612 while (retry_count) {
3613 if (signal_pending(current)) { 3182 if (signal_pending(current)) {
3614 ret = -EINTR; 3183 ret = -EINTR;
3615 break; 3184 break;
3616 } 3185 }
3617 /* 3186
3618 * Rather than hide all in some function, I do this in 3187 mutex_lock(&memcg_limit_mutex);
3619 * open coded manner. You see what this really does. 3188 if (limit > memcg->memsw.limit) {
3620 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3189 mutex_unlock(&memcg_limit_mutex);
3621 */
3622 mutex_lock(&set_limit_mutex);
3623 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
3624 ret = -EINVAL; 3190 ret = -EINVAL;
3625 mutex_unlock(&set_limit_mutex);
3626 break; 3191 break;
3627 } 3192 }
3628 3193 if (limit > memcg->memory.limit)
3629 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) 3194 enlarge = true;
3630 enlarge = 1; 3195 ret = page_counter_limit(&memcg->memory, limit);
3631 3196 mutex_unlock(&memcg_limit_mutex);
3632 ret = res_counter_set_limit(&memcg->res, val);
3633 mutex_unlock(&set_limit_mutex);
3634 3197
3635 if (!ret) 3198 if (!ret)
3636 break; 3199 break;
3637 3200
3638 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3201 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3639 3202
3640 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3203 curusage = page_counter_read(&memcg->memory);
3641 /* Usage is reduced ? */ 3204 /* Usage is reduced ? */
3642 if (curusage >= oldusage) 3205 if (curusage >= oldusage)
3643 retry_count--; 3206 retry_count--;
3644 else 3207 else
3645 oldusage = curusage; 3208 oldusage = curusage;
3646 } 3209 } while (retry_count);
3210
3647 if (!ret && enlarge) 3211 if (!ret && enlarge)
3648 memcg_oom_recover(memcg); 3212 memcg_oom_recover(memcg);
3649 3213
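
mem_cgroup_resize_limit() above keeps the same shape after the conversion: read the usage, try to install the new limit under memcg_limit_mutex, reclaim if the counter is still above it, and only consume a retry when reclaim made no forward progress. A user-space model of that loop (the memsw-ordering check and the mutex are omitted; reclaim_some() is a stand-in for try_to_free_mem_cgroup_pages()):

#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_RETRIES 5

static unsigned long usage = 1000;  /* pages currently charged */
static unsigned long limit = 2000;  /* current limit, in pages */

static bool set_limit(unsigned long new_limit)
{
    if (usage > new_limit)  /* mirrors page_counter_limit() refusing to shrink below usage */
        return false;
    limit = new_limit;
    return true;
}

static void reclaim_some(void)
{
    if (usage >= 100)
        usage -= 100;  /* pretend reclaim freed 100 pages */
}

static int resize_limit(unsigned long new_limit)
{
    int retries = RECLAIM_RETRIES;
    unsigned long old_usage = usage;

    do {
        if (set_limit(new_limit))
            return 0;
        reclaim_some();
        if (usage >= old_usage)  /* no progress: burn a retry */
            retries--;
        else
            old_usage = usage;
    } while (retries);

    return -1;  /* the kernel would report -EBUSY at this point */
}

int main(void)
{
    printf("shrink to 600: %d, usage now %lu\n", resize_limit(600), usage);
    printf("limit is now %lu\n", limit);
    return 0;
}
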
@@ -3651,52 +3215,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3651} 3215}
3652 3216
3653static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3217static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3654 unsigned long long val) 3218 unsigned long limit)
3655{ 3219{
3220 unsigned long curusage;
3221 unsigned long oldusage;
3222 bool enlarge = false;
3656 int retry_count; 3223 int retry_count;
3657 u64 oldusage, curusage; 3224 int ret;
3658 int children = mem_cgroup_count_children(memcg);
3659 int ret = -EBUSY;
3660 int enlarge = 0;
3661 3225
3662 /* see mem_cgroup_resize_res_limit */ 3226 /* see mem_cgroup_resize_res_limit */
3663 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3227 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3664 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3228 mem_cgroup_count_children(memcg);
3665 while (retry_count) { 3229
3230 oldusage = page_counter_read(&memcg->memsw);
3231
3232 do {
3666 if (signal_pending(current)) { 3233 if (signal_pending(current)) {
3667 ret = -EINTR; 3234 ret = -EINTR;
3668 break; 3235 break;
3669 } 3236 }
3670 /* 3237
3671 * Rather than hide all in some function, I do this in 3238 mutex_lock(&memcg_limit_mutex);
3672 * open coded manner. You see what this really does. 3239 if (limit < memcg->memory.limit) {
3673 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3240 mutex_unlock(&memcg_limit_mutex);
3674 */
3675 mutex_lock(&set_limit_mutex);
3676 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
3677 ret = -EINVAL; 3241 ret = -EINVAL;
3678 mutex_unlock(&set_limit_mutex);
3679 break; 3242 break;
3680 } 3243 }
3681 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) 3244 if (limit > memcg->memsw.limit)
3682 enlarge = 1; 3245 enlarge = true;
3683 ret = res_counter_set_limit(&memcg->memsw, val); 3246 ret = page_counter_limit(&memcg->memsw, limit);
3684 mutex_unlock(&set_limit_mutex); 3247 mutex_unlock(&memcg_limit_mutex);
3685 3248
3686 if (!ret) 3249 if (!ret)
3687 break; 3250 break;
3688 3251
3689 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3252 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3690 3253
3691 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3254 curusage = page_counter_read(&memcg->memsw);
3692 /* Usage is reduced ? */ 3255 /* Usage is reduced ? */
3693 if (curusage >= oldusage) 3256 if (curusage >= oldusage)
3694 retry_count--; 3257 retry_count--;
3695 else 3258 else
3696 oldusage = curusage; 3259 oldusage = curusage;
3697 } 3260 } while (retry_count);
3261
3698 if (!ret && enlarge) 3262 if (!ret && enlarge)
3699 memcg_oom_recover(memcg); 3263 memcg_oom_recover(memcg);
3264
3700 return ret; 3265 return ret;
3701} 3266}
3702 3267
@@ -3709,7 +3274,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3709 unsigned long reclaimed; 3274 unsigned long reclaimed;
3710 int loop = 0; 3275 int loop = 0;
3711 struct mem_cgroup_tree_per_zone *mctz; 3276 struct mem_cgroup_tree_per_zone *mctz;
3712 unsigned long long excess; 3277 unsigned long excess;
3713 unsigned long nr_scanned; 3278 unsigned long nr_scanned;
3714 3279
3715 if (order > 0) 3280 if (order > 0)
@@ -3735,35 +3300,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3735 nr_reclaimed += reclaimed; 3300 nr_reclaimed += reclaimed;
3736 *total_scanned += nr_scanned; 3301 *total_scanned += nr_scanned;
3737 spin_lock_irq(&mctz->lock); 3302 spin_lock_irq(&mctz->lock);
3303 __mem_cgroup_remove_exceeded(mz, mctz);
3738 3304
3739 /* 3305 /*
3740 * If we failed to reclaim anything from this memory cgroup 3306 * If we failed to reclaim anything from this memory cgroup
3741 * it is time to move on to the next cgroup 3307 * it is time to move on to the next cgroup
3742 */ 3308 */
3743 next_mz = NULL; 3309 next_mz = NULL;
3744 if (!reclaimed) { 3310 if (!reclaimed)
3745 do { 3311 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3746 /* 3312
3747 * Loop until we find yet another one. 3313 excess = soft_limit_excess(mz->memcg);
3748 *
3749 * By the time we get the soft_limit lock
3750 * again, someone might have added the
3751 * group back on the RB tree. Iterate to
3752 * make sure we get a different mem.
3753 * mem_cgroup_largest_soft_limit_node returns
3754 * NULL if no other cgroup is present on
3755 * the tree
3756 */
3757 next_mz =
3758 __mem_cgroup_largest_soft_limit_node(mctz);
3759 if (next_mz == mz)
3760 css_put(&next_mz->memcg->css);
3761 else /* next_mz == NULL or other memcg */
3762 break;
3763 } while (1);
3764 }
3765 __mem_cgroup_remove_exceeded(mz, mctz);
3766 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3767 /* 3314 /*
3768 * One school of thought says that we should not add 3315 * One school of thought says that we should not add
3769 * back the node to the tree if reclaim returns 0. 3316 * back the node to the tree if reclaim returns 0.
@@ -3792,107 +3339,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3792 return nr_reclaimed; 3339 return nr_reclaimed;
3793} 3340}
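
The soft-limit reclaim loop above re-sorts the per-zone tree by soft_limit_excess() instead of res_counter_soft_limit_excess(); presumably that helper simply reports how many pages a group sits above its soft limit, clamped at zero. A one-function sketch under that assumption:

#include <stdio.h>

/* pages over the soft limit, 0 when at or under it (assumed semantics) */
static unsigned long soft_limit_excess(unsigned long usage,
                                       unsigned long soft_limit)
{
    return usage > soft_limit ? usage - soft_limit : 0;
}

int main(void)
{
    printf("%lu\n", soft_limit_excess(1500, 1024));  /* 476 */
    printf("%lu\n", soft_limit_excess(800, 1024));   /* 0 */
    return 0;
}
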
3794 3341
3795/**
3796 * mem_cgroup_force_empty_list - clears LRU of a group
3797 * @memcg: group to clear
3798 * @node: NUMA node
3799 * @zid: zone id
3800 * @lru: lru to clear
3801 *
3802 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3803 * reclaim the pages page themselves - pages are moved to the parent (or root)
3804 * group.
3805 */
3806static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3807 int node, int zid, enum lru_list lru)
3808{
3809 struct lruvec *lruvec;
3810 unsigned long flags;
3811 struct list_head *list;
3812 struct page *busy;
3813 struct zone *zone;
3814
3815 zone = &NODE_DATA(node)->node_zones[zid];
3816 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3817 list = &lruvec->lists[lru];
3818
3819 busy = NULL;
3820 do {
3821 struct page_cgroup *pc;
3822 struct page *page;
3823
3824 spin_lock_irqsave(&zone->lru_lock, flags);
3825 if (list_empty(list)) {
3826 spin_unlock_irqrestore(&zone->lru_lock, flags);
3827 break;
3828 }
3829 page = list_entry(list->prev, struct page, lru);
3830 if (busy == page) {
3831 list_move(&page->lru, list);
3832 busy = NULL;
3833 spin_unlock_irqrestore(&zone->lru_lock, flags);
3834 continue;
3835 }
3836 spin_unlock_irqrestore(&zone->lru_lock, flags);
3837
3838 pc = lookup_page_cgroup(page);
3839
3840 if (mem_cgroup_move_parent(page, pc, memcg)) {
3841 /* found lock contention or "pc" is obsolete. */
3842 busy = page;
3843 } else
3844 busy = NULL;
3845 cond_resched();
3846 } while (!list_empty(list));
3847}
3848
3849/*
3850 * make mem_cgroup's charge to be 0 if there is no task by moving
3851 * all the charges and pages to the parent.
3852 * This enables deleting this mem_cgroup.
3853 *
3854 * Caller is responsible for holding css reference on the memcg.
3855 */
3856static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3857{
3858 int node, zid;
3859 u64 usage;
3860
3861 do {
3862 /* This is for making all *used* pages to be on LRU. */
3863 lru_add_drain_all();
3864 drain_all_stock_sync(memcg);
3865 mem_cgroup_start_move(memcg);
3866 for_each_node_state(node, N_MEMORY) {
3867 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3868 enum lru_list lru;
3869 for_each_lru(lru) {
3870 mem_cgroup_force_empty_list(memcg,
3871 node, zid, lru);
3872 }
3873 }
3874 }
3875 mem_cgroup_end_move(memcg);
3876 memcg_oom_recover(memcg);
3877 cond_resched();
3878
3879 /*
3880 * Kernel memory may not necessarily be trackable to a specific
3881 * process. So they are not migrated, and therefore we can't
3882 * expect their value to drop to 0 here.
3883 * Having res filled up with kmem only is enough.
3884 *
3885 * This is a safety check because mem_cgroup_force_empty_list
3886 * could have raced with mem_cgroup_replace_page_cache callers
3887 * so the lru seemed empty but the page could have been added
3888 * right after the check. RES_USAGE should be safe as we always
3889 * charge before adding to the LRU.
3890 */
3891 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
3892 res_counter_read_u64(&memcg->kmem, RES_USAGE);
3893 } while (usage > 0);
3894}
3895
3896/* 3342/*
3897 * Test whether @memcg has children, dead or alive. Note that this 3343 * Test whether @memcg has children, dead or alive. Note that this
3898 * function doesn't care whether @memcg has use_hierarchy enabled and 3344 * function doesn't care whether @memcg has use_hierarchy enabled and
@@ -3930,7 +3376,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3930 /* we call try-to-free pages for make this cgroup empty */ 3376 /* we call try-to-free pages for make this cgroup empty */
3931 lru_add_drain_all(); 3377 lru_add_drain_all();
3932 /* try to free all pages in this cgroup */ 3378 /* try to free all pages in this cgroup */
3933 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3379 while (nr_retries && page_counter_read(&memcg->memory)) {
3934 int progress; 3380 int progress;
3935 3381
3936 if (signal_pending(current)) 3382 if (signal_pending(current))
@@ -4001,8 +3447,8 @@ out:
4001 return retval; 3447 return retval;
4002} 3448}
4003 3449
4004static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3450static unsigned long tree_stat(struct mem_cgroup *memcg,
4005 enum mem_cgroup_stat_index idx) 3451 enum mem_cgroup_stat_index idx)
4006{ 3452{
4007 struct mem_cgroup *iter; 3453 struct mem_cgroup *iter;
4008 long val = 0; 3454 long val = 0;
@@ -4020,55 +3466,71 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4020{ 3466{
4021 u64 val; 3467 u64 val;
4022 3468
4023 if (!mem_cgroup_is_root(memcg)) { 3469 if (mem_cgroup_is_root(memcg)) {
3470 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
3471 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
3472 if (swap)
3473 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
3474 } else {
4024 if (!swap) 3475 if (!swap)
4025 return res_counter_read_u64(&memcg->res, RES_USAGE); 3476 val = page_counter_read(&memcg->memory);
4026 else 3477 else
4027 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 3478 val = page_counter_read(&memcg->memsw);
4028 } 3479 }
4029
4030 /*
4031 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4032 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4033 */
4034 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4035 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4036
4037 if (swap)
4038 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4039
4040 return val << PAGE_SHIFT; 3480 return val << PAGE_SHIFT;
4041} 3481}
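
mem_cgroup_usage() now works in pages either way: the root group derives usage from the hierarchy-wide cache/rss (and swap) statistics since its counters are not charged, every other group just reads its page counter, and the result is shifted into bytes for the u64 interface. A sketch of that split, assuming 4 KiB pages:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12  /* assumed 4 KiB pages */

struct group {
    bool is_root;
    unsigned long counter_pages;  /* page_counter_read() stand-in */
    unsigned long cache_pages;    /* tree-wide stat stand-ins */
    unsigned long rss_pages;
};

static unsigned long long usage_bytes(const struct group *g)
{
    unsigned long pages;

    if (g->is_root)  /* root: counters not maintained, sum the stats */
        pages = g->cache_pages + g->rss_pages;
    else             /* everyone else: read the counter */
        pages = g->counter_pages;

    return (unsigned long long)pages << PAGE_SHIFT;
}

int main(void)
{
    struct group root  = { .is_root = true, .cache_pages = 300, .rss_pages = 700 };
    struct group child = { .counter_pages = 256 };

    printf("root: %llu bytes\n", usage_bytes(&root));    /* 4096000 */
    printf("child: %llu bytes\n", usage_bytes(&child));  /* 1048576 */
    return 0;
}
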
4042 3482
3483enum {
3484 RES_USAGE,
3485 RES_LIMIT,
3486 RES_MAX_USAGE,
3487 RES_FAILCNT,
3488 RES_SOFT_LIMIT,
3489};
4043 3490
4044static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3491static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4045 struct cftype *cft) 3492 struct cftype *cft)
4046{ 3493{
4047 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3494 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4048 enum res_type type = MEMFILE_TYPE(cft->private); 3495 struct page_counter *counter;
4049 int name = MEMFILE_ATTR(cft->private);
4050 3496
4051 switch (type) { 3497 switch (MEMFILE_TYPE(cft->private)) {
4052 case _MEM: 3498 case _MEM:
4053 if (name == RES_USAGE) 3499 counter = &memcg->memory;
4054 return mem_cgroup_usage(memcg, false); 3500 break;
4055 return res_counter_read_u64(&memcg->res, name);
4056 case _MEMSWAP: 3501 case _MEMSWAP:
4057 if (name == RES_USAGE) 3502 counter = &memcg->memsw;
4058 return mem_cgroup_usage(memcg, true); 3503 break;
4059 return res_counter_read_u64(&memcg->memsw, name);
4060 case _KMEM: 3504 case _KMEM:
4061 return res_counter_read_u64(&memcg->kmem, name); 3505 counter = &memcg->kmem;
4062 break; 3506 break;
4063 default: 3507 default:
4064 BUG(); 3508 BUG();
4065 } 3509 }
3510
3511 switch (MEMFILE_ATTR(cft->private)) {
3512 case RES_USAGE:
3513 if (counter == &memcg->memory)
3514 return mem_cgroup_usage(memcg, false);
3515 if (counter == &memcg->memsw)
3516 return mem_cgroup_usage(memcg, true);
3517 return (u64)page_counter_read(counter) * PAGE_SIZE;
3518 case RES_LIMIT:
3519 return (u64)counter->limit * PAGE_SIZE;
3520 case RES_MAX_USAGE:
3521 return (u64)counter->watermark * PAGE_SIZE;
3522 case RES_FAILCNT:
3523 return counter->failcnt;
3524 case RES_SOFT_LIMIT:
3525 return (u64)memcg->soft_limit * PAGE_SIZE;
3526 default:
3527 BUG();
3528 }
4066} 3529}
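
mem_cgroup_read_u64() above now dispatches in two steps: MEMFILE_TYPE() selects which page_counter backs the file and MEMFILE_ATTR() selects which of its fields to report. A sketch of that scheme, assuming the usual shift-by-16 packing for MEMFILE_PRIVATE() (the macro itself is not shown in this hunk):

#include <stdio.h>

/* assumed packing: high 16 bits = counter type, low 16 bits = attribute */
#define MEMFILE_PRIVATE(type, attr) ((type) << 16 | (attr))
#define MEMFILE_TYPE(val)           ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)           ((val) & 0xffff)

enum { _MEM, _MEMSWAP, _KMEM };                              /* which counter */
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };   /* which field */

struct counter { unsigned long usage, limit, watermark, failcnt; };

static unsigned long read_file(const struct counter counters[3], int private)
{
    const struct counter *c = &counters[MEMFILE_TYPE(private)];

    switch (MEMFILE_ATTR(private)) {
    case RES_USAGE:     return c->usage;
    case RES_LIMIT:     return c->limit;
    case RES_MAX_USAGE: return c->watermark;
    case RES_FAILCNT:   return c->failcnt;
    }
    return 0;
}

int main(void)
{
    struct counter counters[3] = {
        [_MEM]     = { .usage = 100, .limit = 512, .watermark = 200 },
        [_MEMSWAP] = { .usage = 120, .limit = 1024 },
        [_KMEM]    = { .failcnt = 3 },
    };

    printf("memory.limit = %lu\n",
           read_file(counters, MEMFILE_PRIVATE(_MEM, RES_LIMIT)));
    printf("kmem.failcnt = %lu\n",
           read_file(counters, MEMFILE_PRIVATE(_KMEM, RES_FAILCNT)));
    return 0;
}
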
4067 3530
4068#ifdef CONFIG_MEMCG_KMEM 3531#ifdef CONFIG_MEMCG_KMEM
4069/* should be called with activate_kmem_mutex held */ 3532static int memcg_activate_kmem(struct mem_cgroup *memcg,
4070static int __memcg_activate_kmem(struct mem_cgroup *memcg, 3533 unsigned long nr_pages)
4071 unsigned long long limit)
4072{ 3534{
4073 int err = 0; 3535 int err = 0;
4074 int memcg_id; 3536 int memcg_id;
@@ -4115,7 +3577,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4115 * We couldn't have accounted to this cgroup, because it hasn't got the 3577 * We couldn't have accounted to this cgroup, because it hasn't got the
4116 * active bit set yet, so this should succeed. 3578 * active bit set yet, so this should succeed.
4117 */ 3579 */
4118 err = res_counter_set_limit(&memcg->kmem, limit); 3580 err = page_counter_limit(&memcg->kmem, nr_pages);
4119 VM_BUG_ON(err); 3581 VM_BUG_ON(err);
4120 3582
4121 static_key_slow_inc(&memcg_kmem_enabled_key); 3583 static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -4130,26 +3592,17 @@ out:
4130 return err; 3592 return err;
4131} 3593}
4132 3594
4133static int memcg_activate_kmem(struct mem_cgroup *memcg,
4134 unsigned long long limit)
4135{
4136 int ret;
4137
4138 mutex_lock(&activate_kmem_mutex);
4139 ret = __memcg_activate_kmem(memcg, limit);
4140 mutex_unlock(&activate_kmem_mutex);
4141 return ret;
4142}
4143
4144static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3595static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4145 unsigned long long val) 3596 unsigned long limit)
4146{ 3597{
4147 int ret; 3598 int ret;
4148 3599
3600 mutex_lock(&memcg_limit_mutex);
4149 if (!memcg_kmem_is_active(memcg)) 3601 if (!memcg_kmem_is_active(memcg))
4150 ret = memcg_activate_kmem(memcg, val); 3602 ret = memcg_activate_kmem(memcg, limit);
4151 else 3603 else
4152 ret = res_counter_set_limit(&memcg->kmem, val); 3604 ret = page_counter_limit(&memcg->kmem, limit);
3605 mutex_unlock(&memcg_limit_mutex);
4153 return ret; 3606 return ret;
4154} 3607}
4155 3608
@@ -4161,19 +3614,19 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4161 if (!parent) 3614 if (!parent)
4162 return 0; 3615 return 0;
4163 3616
4164 mutex_lock(&activate_kmem_mutex); 3617 mutex_lock(&memcg_limit_mutex);
4165 /* 3618 /*
4166 * If the parent cgroup is not kmem-active now, it cannot be activated 3619 * If the parent cgroup is not kmem-active now, it cannot be activated
4167 * after this point, because it has at least one child already. 3620 * after this point, because it has at least one child already.
4168 */ 3621 */
4169 if (memcg_kmem_is_active(parent)) 3622 if (memcg_kmem_is_active(parent))
4170 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 3623 ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
4171 mutex_unlock(&activate_kmem_mutex); 3624 mutex_unlock(&memcg_limit_mutex);
4172 return ret; 3625 return ret;
4173} 3626}
4174#else 3627#else
4175static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3628static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4176 unsigned long long val) 3629 unsigned long limit)
4177{ 3630{
4178 return -EINVAL; 3631 return -EINVAL;
4179} 3632}
@@ -4187,110 +3640,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
4187 char *buf, size_t nbytes, loff_t off) 3640 char *buf, size_t nbytes, loff_t off)
4188{ 3641{
4189 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3642 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4190 enum res_type type; 3643 unsigned long nr_pages;
4191 int name;
4192 unsigned long long val;
4193 int ret; 3644 int ret;
4194 3645
4195 buf = strstrip(buf); 3646 buf = strstrip(buf);
4196 type = MEMFILE_TYPE(of_cft(of)->private); 3647 ret = page_counter_memparse(buf, &nr_pages);
4197 name = MEMFILE_ATTR(of_cft(of)->private); 3648 if (ret)
3649 return ret;
4198 3650
4199 switch (name) { 3651 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4200 case RES_LIMIT: 3652 case RES_LIMIT:
4201 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3653 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4202 ret = -EINVAL; 3654 ret = -EINVAL;
4203 break; 3655 break;
4204 } 3656 }
4205 /* This function does all necessary parse...reuse it */ 3657 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4206 ret = res_counter_memparse_write_strategy(buf, &val); 3658 case _MEM:
4207 if (ret) 3659 ret = mem_cgroup_resize_limit(memcg, nr_pages);
4208 break; 3660 break;
4209 if (type == _MEM) 3661 case _MEMSWAP:
4210 ret = mem_cgroup_resize_limit(memcg, val); 3662 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
4211 else if (type == _MEMSWAP)
4212 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4213 else if (type == _KMEM)
4214 ret = memcg_update_kmem_limit(memcg, val);
4215 else
4216 return -EINVAL;
4217 break;
4218 case RES_SOFT_LIMIT:
4219 ret = res_counter_memparse_write_strategy(buf, &val);
4220 if (ret)
4221 break; 3663 break;
4222 /* 3664 case _KMEM:
4223 * For memsw, soft limits are hard to implement in terms 3665 ret = memcg_update_kmem_limit(memcg, nr_pages);
4224 * of semantics, for now, we support soft limits for 3666 break;
4225 * control without swap 3667 }
4226 */
4227 if (type == _MEM)
4228 ret = res_counter_set_soft_limit(&memcg->res, val);
4229 else
4230 ret = -EINVAL;
4231 break; 3668 break;
4232 default: 3669 case RES_SOFT_LIMIT:
4233 ret = -EINVAL; /* should be BUG() ? */ 3670 memcg->soft_limit = nr_pages;
3671 ret = 0;
4234 break; 3672 break;
4235 } 3673 }
4236 return ret ?: nbytes; 3674 return ret ?: nbytes;
4237} 3675}
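
mem_cgroup_write() above parses the user buffer into a page count once, via page_counter_memparse(), before dispatching on file type, replacing the per-case res_counter_memparse_write_strategy() calls. Below is a simplified user-space parser in the same spirit; the exact kernel semantics (accepted suffixes, "-1" meaning no limit, rounding) are assumptions of this sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SHIFT  12            /* assumed 4 KiB pages */
#define COUNTER_MAX (~0UL >> 1)   /* stand-in for PAGE_COUNTER_MAX */

/* Parse "-1" as "no limit", otherwise bytes with an optional K/M/G
 * suffix, and hand back a page count (rounded down). */
static int parse_limit(const char *buf, unsigned long *nr_pages)
{
    unsigned long bytes;
    char *end;

    if (!strcmp(buf, "-1")) {
        *nr_pages = COUNTER_MAX;
        return 0;
    }

    bytes = strtoul(buf, &end, 10);
    switch (*end) {
    case 'G': case 'g': bytes <<= 10;          /* fall through */
    case 'M': case 'm': bytes <<= 10;          /* fall through */
    case 'K': case 'k': bytes <<= 10; end++; break;
    }
    if (*end != '\0' && *end != '\n')
        return -1;  /* trailing junk */

    *nr_pages = bytes >> PAGE_SHIFT;
    return 0;
}

int main(void)
{
    const char *inputs[] = { "4096", "512K", "1G", "-1" };
    unsigned long nr_pages;

    for (int i = 0; i < 4; i++) {
        if (!parse_limit(inputs[i], &nr_pages))
            printf("%-6s -> %lu pages\n", inputs[i], nr_pages);
    }
    return 0;
}
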
4238 3676
4239static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4240 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4241{
4242 unsigned long long min_limit, min_memsw_limit, tmp;
4243
4244 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4245 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4246 if (!memcg->use_hierarchy)
4247 goto out;
4248
4249 while (memcg->css.parent) {
4250 memcg = mem_cgroup_from_css(memcg->css.parent);
4251 if (!memcg->use_hierarchy)
4252 break;
4253 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4254 min_limit = min(min_limit, tmp);
4255 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4256 min_memsw_limit = min(min_memsw_limit, tmp);
4257 }
4258out:
4259 *mem_limit = min_limit;
4260 *memsw_limit = min_memsw_limit;
4261}
4262
4263static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3677static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
4264 size_t nbytes, loff_t off) 3678 size_t nbytes, loff_t off)
4265{ 3679{
4266 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3680 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4267 int name; 3681 struct page_counter *counter;
4268 enum res_type type;
4269 3682
4270 type = MEMFILE_TYPE(of_cft(of)->private); 3683 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4271 name = MEMFILE_ATTR(of_cft(of)->private); 3684 case _MEM:
3685 counter = &memcg->memory;
3686 break;
3687 case _MEMSWAP:
3688 counter = &memcg->memsw;
3689 break;
3690 case _KMEM:
3691 counter = &memcg->kmem;
3692 break;
3693 default:
3694 BUG();
3695 }
4272 3696
4273 switch (name) { 3697 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4274 case RES_MAX_USAGE: 3698 case RES_MAX_USAGE:
4275 if (type == _MEM) 3699 page_counter_reset_watermark(counter);
4276 res_counter_reset_max(&memcg->res);
4277 else if (type == _MEMSWAP)
4278 res_counter_reset_max(&memcg->memsw);
4279 else if (type == _KMEM)
4280 res_counter_reset_max(&memcg->kmem);
4281 else
4282 return -EINVAL;
4283 break; 3700 break;
4284 case RES_FAILCNT: 3701 case RES_FAILCNT:
4285 if (type == _MEM) 3702 counter->failcnt = 0;
4286 res_counter_reset_failcnt(&memcg->res);
4287 else if (type == _MEMSWAP)
4288 res_counter_reset_failcnt(&memcg->memsw);
4289 else if (type == _KMEM)
4290 res_counter_reset_failcnt(&memcg->kmem);
4291 else
4292 return -EINVAL;
4293 break; 3703 break;
3704 default:
3705 BUG();
4294 } 3706 }
4295 3707
4296 return nbytes; 3708 return nbytes;
@@ -4387,6 +3799,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4387static int memcg_stat_show(struct seq_file *m, void *v) 3799static int memcg_stat_show(struct seq_file *m, void *v)
4388{ 3800{
4389 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3801 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3802 unsigned long memory, memsw;
4390 struct mem_cgroup *mi; 3803 struct mem_cgroup *mi;
4391 unsigned int i; 3804 unsigned int i;
4392 3805
@@ -4406,14 +3819,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
4406 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3819 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4407 3820
4408 /* Hierarchical information */ 3821 /* Hierarchical information */
4409 { 3822 memory = memsw = PAGE_COUNTER_MAX;
4410 unsigned long long limit, memsw_limit; 3823 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4411 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 3824 memory = min(memory, mi->memory.limit);
4412 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 3825 memsw = min(memsw, mi->memsw.limit);
4413 if (do_swap_account)
4414 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4415 memsw_limit);
4416 } 3826 }
3827 seq_printf(m, "hierarchical_memory_limit %llu\n",
3828 (u64)memory * PAGE_SIZE);
3829 if (do_swap_account)
3830 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3831 (u64)memsw * PAGE_SIZE);
4417 3832
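
The hierarchical limits reported by memcg_stat_show() are now computed inline by walking to the root and taking the minimum limit seen, then printing it in bytes. A sketch of that walk over a parent-linked chain of groups (use_hierarchy special cases omitted):

#include <stdio.h>

#define PAGE_SHIFT  12            /* assumed 4 KiB pages */
#define COUNTER_MAX (~0UL >> 1)   /* "no limit" stand-in */

struct group {
    unsigned long memory_limit;   /* in pages */
    struct group *parent;
};

static unsigned long long hierarchical_limit_bytes(struct group *g)
{
    unsigned long limit = COUNTER_MAX;

    for (; g; g = g->parent)      /* effective limit = min over ancestors */
        if (g->memory_limit < limit)
            limit = g->memory_limit;

    return (unsigned long long)limit << PAGE_SHIFT;
}

int main(void)
{
    struct group root  = { COUNTER_MAX, NULL };
    struct group mid   = { 4096, &root };   /* 16 MiB */
    struct group child = { 8192, &mid };    /* 32 MiB, capped by its parent */

    printf("hierarchical_memory_limit %llu\n",
           hierarchical_limit_bytes(&child));  /* 16777216 */
    return 0;
}
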
4418 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3833 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4419 long long val = 0; 3834 long long val = 0;
@@ -4497,7 +3912,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4497static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3912static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4498{ 3913{
4499 struct mem_cgroup_threshold_ary *t; 3914 struct mem_cgroup_threshold_ary *t;
4500 u64 usage; 3915 unsigned long usage;
4501 int i; 3916 int i;
4502 3917
4503 rcu_read_lock(); 3918 rcu_read_lock();
@@ -4596,10 +4011,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4596{ 4011{
4597 struct mem_cgroup_thresholds *thresholds; 4012 struct mem_cgroup_thresholds *thresholds;
4598 struct mem_cgroup_threshold_ary *new; 4013 struct mem_cgroup_threshold_ary *new;
4599 u64 threshold, usage; 4014 unsigned long threshold;
4015 unsigned long usage;
4600 int i, size, ret; 4016 int i, size, ret;
4601 4017
4602 ret = res_counter_memparse_write_strategy(args, &threshold); 4018 ret = page_counter_memparse(args, &threshold);
4603 if (ret) 4019 if (ret)
4604 return ret; 4020 return ret;
4605 4021
@@ -4689,7 +4105,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4689{ 4105{
4690 struct mem_cgroup_thresholds *thresholds; 4106 struct mem_cgroup_thresholds *thresholds;
4691 struct mem_cgroup_threshold_ary *new; 4107 struct mem_cgroup_threshold_ary *new;
4692 u64 usage; 4108 unsigned long usage;
4693 int i, j, size; 4109 int i, j, size;
4694 4110
4695 mutex_lock(&memcg->thresholds_lock); 4111 mutex_lock(&memcg->thresholds_lock);
@@ -4855,40 +4271,6 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4855{ 4271{
4856 mem_cgroup_sockets_destroy(memcg); 4272 mem_cgroup_sockets_destroy(memcg);
4857} 4273}
4858
4859static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4860{
4861 if (!memcg_kmem_is_active(memcg))
4862 return;
4863
4864 /*
4865 * kmem charges can outlive the cgroup. In the case of slab
4866 * pages, for instance, a page contain objects from various
4867 * processes. As we prevent from taking a reference for every
4868 * such allocation we have to be careful when doing uncharge
4869 * (see memcg_uncharge_kmem) and here during offlining.
4870 *
4871 * The idea is that only the _last_ uncharge which sees
4872 * the dead memcg will drop the last reference. An additional
4873 * reference is taken here before the group is marked dead
4874 * which is then paired with css_put during uncharge resp. here.
4875 *
4876 * Although this might sound strange as this path is called from
4877 * css_offline() when the reference might have dropped down to 0 and
4878 * shouldn't be incremented anymore (css_tryget_online() would
4879 * fail) we do not have other options because of the kmem
4880 * allocations lifetime.
4881 */
4882 css_get(&memcg->css);
4883
4884 memcg_kmem_mark_dead(memcg);
4885
4886 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
4887 return;
4888
4889 if (memcg_kmem_test_and_clear_dead(memcg))
4890 css_put(&memcg->css);
4891}
4892#else 4274#else
4893static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4275static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4894{ 4276{
@@ -4898,10 +4280,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4898static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4280static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4899{ 4281{
4900} 4282}
4901
4902static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4903{
4904}
4905#endif 4283#endif
4906 4284
4907/* 4285/*
@@ -5228,7 +4606,10 @@ static struct cftype mem_cgroup_files[] = {
5228#ifdef CONFIG_SLABINFO 4606#ifdef CONFIG_SLABINFO
5229 { 4607 {
5230 .name = "kmem.slabinfo", 4608 .name = "kmem.slabinfo",
5231 .seq_show = mem_cgroup_slabinfo_read, 4609 .seq_start = slab_start,
4610 .seq_next = slab_next,
4611 .seq_stop = slab_stop,
4612 .seq_show = memcg_slab_show,
5232 }, 4613 },
5233#endif 4614#endif
5234#endif 4615#endif
@@ -5363,9 +4744,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
5363 */ 4744 */
5364struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4745struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5365{ 4746{
5366 if (!memcg->res.parent) 4747 if (!memcg->memory.parent)
5367 return NULL; 4748 return NULL;
5368 return mem_cgroup_from_res_counter(memcg->res.parent, res); 4749 return mem_cgroup_from_counter(memcg->memory.parent, memory);
5369} 4750}
5370EXPORT_SYMBOL(parent_mem_cgroup); 4751EXPORT_SYMBOL(parent_mem_cgroup);
5371 4752
@@ -5410,9 +4791,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5410 /* root ? */ 4791 /* root ? */
5411 if (parent_css == NULL) { 4792 if (parent_css == NULL) {
5412 root_mem_cgroup = memcg; 4793 root_mem_cgroup = memcg;
5413 res_counter_init(&memcg->res, NULL); 4794 page_counter_init(&memcg->memory, NULL);
5414 res_counter_init(&memcg->memsw, NULL); 4795 page_counter_init(&memcg->memsw, NULL);
5415 res_counter_init(&memcg->kmem, NULL); 4796 page_counter_init(&memcg->kmem, NULL);
5416 } 4797 }
5417 4798
5418 memcg->last_scanned_node = MAX_NUMNODES; 4799 memcg->last_scanned_node = MAX_NUMNODES;
@@ -5451,18 +4832,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5451 memcg->swappiness = mem_cgroup_swappiness(parent); 4832 memcg->swappiness = mem_cgroup_swappiness(parent);
5452 4833
5453 if (parent->use_hierarchy) { 4834 if (parent->use_hierarchy) {
5454 res_counter_init(&memcg->res, &parent->res); 4835 page_counter_init(&memcg->memory, &parent->memory);
5455 res_counter_init(&memcg->memsw, &parent->memsw); 4836 page_counter_init(&memcg->memsw, &parent->memsw);
5456 res_counter_init(&memcg->kmem, &parent->kmem); 4837 page_counter_init(&memcg->kmem, &parent->kmem);
5457 4838
5458 /* 4839 /*
5459 * No need to take a reference to the parent because cgroup 4840 * No need to take a reference to the parent because cgroup
5460 * core guarantees its existence. 4841 * core guarantees its existence.
5461 */ 4842 */
5462 } else { 4843 } else {
5463 res_counter_init(&memcg->res, NULL); 4844 page_counter_init(&memcg->memory, NULL);
5464 res_counter_init(&memcg->memsw, NULL); 4845 page_counter_init(&memcg->memsw, NULL);
5465 res_counter_init(&memcg->kmem, NULL); 4846 page_counter_init(&memcg->kmem, NULL);
5466 /* 4847 /*
5467 * Deeper hierachy with use_hierarchy == false doesn't make 4848 * Deeper hierachy with use_hierarchy == false doesn't make
5468 * much sense so let cgroup subsystem know about this 4849 * much sense so let cgroup subsystem know about this
@@ -5487,29 +4868,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5487 return 0; 4868 return 0;
5488} 4869}
5489 4870
5490/*
5491 * Announce all parents that a group from their hierarchy is gone.
5492 */
5493static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
5494{
5495 struct mem_cgroup *parent = memcg;
5496
5497 while ((parent = parent_mem_cgroup(parent)))
5498 mem_cgroup_iter_invalidate(parent);
5499
5500 /*
5501 * if the root memcg is not hierarchical we have to check it
5502 * explicitly.
5503 */
5504 if (!root_mem_cgroup->use_hierarchy)
5505 mem_cgroup_iter_invalidate(root_mem_cgroup);
5506}
5507
5508static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4871static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5509{ 4872{
5510 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4873 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5511 struct mem_cgroup_event *event, *tmp; 4874 struct mem_cgroup_event *event, *tmp;
5512 struct cgroup_subsys_state *iter;
5513 4875
5514 /* 4876 /*
5515 * Unregister events and notify userspace. 4877 * Unregister events and notify userspace.
@@ -5523,17 +4885,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5523 } 4885 }
5524 spin_unlock(&memcg->event_list_lock); 4886 spin_unlock(&memcg->event_list_lock);
5525 4887
5526 kmem_cgroup_css_offline(memcg);
5527
5528 mem_cgroup_invalidate_reclaim_iterators(memcg);
5529
5530 /*
5531 * This requires that offlining is serialized. Right now that is
5532 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
5533 */
5534 css_for_each_descendant_post(iter, css)
5535 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
5536
5537 memcg_unregister_all_caches(memcg); 4888 memcg_unregister_all_caches(memcg);
5538 vmpressure_cleanup(&memcg->vmpressure); 4889 vmpressure_cleanup(&memcg->vmpressure);
5539} 4890}
@@ -5541,42 +4892,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5541static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4892static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5542{ 4893{
5543 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4894 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5544 /*
5545 * XXX: css_offline() would be where we should reparent all
5546 * memory to prepare the cgroup for destruction. However,
5547 * memcg does not do css_tryget_online() and res_counter charging
5548 * under the same RCU lock region, which means that charging
5549 * could race with offlining. Offlining only happens to
5550 * cgroups with no tasks in them but charges can show up
5551 * without any tasks from the swapin path when the target
5552 * memcg is looked up from the swapout record and not from the
5553 * current task as it usually is. A race like this can leak
5554 * charges and put pages with stale cgroup pointers into
5555 * circulation:
5556 *
5557 * #0 #1
5558 * lookup_swap_cgroup_id()
5559 * rcu_read_lock()
5560 * mem_cgroup_lookup()
5561 * css_tryget_online()
5562 * rcu_read_unlock()
5563 * disable css_tryget_online()
5564 * call_rcu()
5565 * offline_css()
5566 * reparent_charges()
5567 * res_counter_charge()
5568 * css_put()
5569 * css_free()
5570 * pc->mem_cgroup = dead memcg
5571 * add page to lru
5572 *
5573 * The bulk of the charges are still moved in offline_css() to
5574 * avoid pinning a lot of pages in case a long-term reference
5575 * like a swapout record is deferring the css_free() to long
5576 * after offlining. But this makes sure we catch any charges
5577 * made after offlining:
5578 */
5579 mem_cgroup_reparent_charges(memcg);
5580 4895
5581 memcg_destroy_kmem(memcg); 4896 memcg_destroy_kmem(memcg);
5582 __mem_cgroup_free(memcg); 4897 __mem_cgroup_free(memcg);
@@ -5599,10 +4914,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5599{ 4914{
5600 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4915 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5601 4916
5602 mem_cgroup_resize_limit(memcg, ULLONG_MAX); 4917 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
5603 mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); 4918 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
5604 memcg_update_kmem_limit(memcg, ULLONG_MAX); 4919 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
5605 res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); 4920 memcg->soft_limit = 0;
5606} 4921}
5607 4922
5608#ifdef CONFIG_MMU 4923#ifdef CONFIG_MMU
@@ -5758,7 +5073,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5758 unsigned long addr, pte_t ptent, union mc_target *target) 5073 unsigned long addr, pte_t ptent, union mc_target *target)
5759{ 5074{
5760 struct page *page = NULL; 5075 struct page *page = NULL;
5761 struct page_cgroup *pc;
5762 enum mc_target_type ret = MC_TARGET_NONE; 5076 enum mc_target_type ret = MC_TARGET_NONE;
5763 swp_entry_t ent = { .val = 0 }; 5077 swp_entry_t ent = { .val = 0 };
5764 5078
@@ -5772,13 +5086,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5772 if (!page && !ent.val) 5086 if (!page && !ent.val)
5773 return ret; 5087 return ret;
5774 if (page) { 5088 if (page) {
5775 pc = lookup_page_cgroup(page);
5776 /* 5089 /*
5777 * Do only loose check w/o serialization. 5090 * Do only loose check w/o serialization.
5778 * mem_cgroup_move_account() checks the pc is valid or 5091 * mem_cgroup_move_account() checks the page is valid or
5779 * not under LRU exclusion. 5092 * not under LRU exclusion.
5780 */ 5093 */
5781 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5094 if (page->mem_cgroup == mc.from) {
5782 ret = MC_TARGET_PAGE; 5095 ret = MC_TARGET_PAGE;
5783 if (target) 5096 if (target)
5784 target->page = page; 5097 target->page = page;
@@ -5806,15 +5119,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5806 unsigned long addr, pmd_t pmd, union mc_target *target) 5119 unsigned long addr, pmd_t pmd, union mc_target *target)
5807{ 5120{
5808 struct page *page = NULL; 5121 struct page *page = NULL;
5809 struct page_cgroup *pc;
5810 enum mc_target_type ret = MC_TARGET_NONE; 5122 enum mc_target_type ret = MC_TARGET_NONE;
5811 5123
5812 page = pmd_page(pmd); 5124 page = pmd_page(pmd);
5813 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5125 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5814 if (!move_anon()) 5126 if (!move_anon())
5815 return ret; 5127 return ret;
5816 pc = lookup_page_cgroup(page); 5128 if (page->mem_cgroup == mc.from) {
5817 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5818 ret = MC_TARGET_PAGE; 5129 ret = MC_TARGET_PAGE;
5819 if (target) { 5130 if (target) {
5820 get_page(page); 5131 get_page(page);
@@ -5897,7 +5208,6 @@ static void __mem_cgroup_clear_mc(void)
5897{ 5208{
5898 struct mem_cgroup *from = mc.from; 5209 struct mem_cgroup *from = mc.from;
5899 struct mem_cgroup *to = mc.to; 5210 struct mem_cgroup *to = mc.to;
5900 int i;
5901 5211
5902 /* we must uncharge all the leftover precharges from mc.to */ 5212 /* we must uncharge all the leftover precharges from mc.to */
5903 if (mc.precharge) { 5213 if (mc.precharge) {
@@ -5916,19 +5226,17 @@ static void __mem_cgroup_clear_mc(void)
5916 if (mc.moved_swap) { 5226 if (mc.moved_swap) {
5917 /* uncharge swap account from the old cgroup */ 5227 /* uncharge swap account from the old cgroup */
5918 if (!mem_cgroup_is_root(mc.from)) 5228 if (!mem_cgroup_is_root(mc.from))
5919 res_counter_uncharge(&mc.from->memsw, 5229 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5920 PAGE_SIZE * mc.moved_swap);
5921
5922 for (i = 0; i < mc.moved_swap; i++)
5923 css_put(&mc.from->css);
5924 5230
5925 /* 5231 /*
5926 * we charged both to->res and to->memsw, so we should 5232 * we charged both to->memory and to->memsw, so we
5927 * uncharge to->res. 5233 * should uncharge to->memory.
5928 */ 5234 */
5929 if (!mem_cgroup_is_root(mc.to)) 5235 if (!mem_cgroup_is_root(mc.to))
5930 res_counter_uncharge(&mc.to->res, 5236 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5931 PAGE_SIZE * mc.moved_swap); 5237
5238 css_put_many(&mc.from->css, mc.moved_swap);
5239
5932 /* we've already done css_get(mc.to) */ 5240 /* we've already done css_get(mc.to) */
5933 mc.moved_swap = 0; 5241 mc.moved_swap = 0;
5934 } 5242 }
@@ -5939,8 +5247,6 @@ static void __mem_cgroup_clear_mc(void)
5939 5247
5940static void mem_cgroup_clear_mc(void) 5248static void mem_cgroup_clear_mc(void)
5941{ 5249{
5942 struct mem_cgroup *from = mc.from;
5943
5944 /* 5250 /*
5945 * we must clear moving_task before waking up waiters at the end of 5251 * we must clear moving_task before waking up waiters at the end of
5946 * task migration. 5252 * task migration.
@@ -5951,7 +5257,6 @@ static void mem_cgroup_clear_mc(void)
5951 mc.from = NULL; 5257 mc.from = NULL;
5952 mc.to = NULL; 5258 mc.to = NULL;
5953 spin_unlock(&mc.lock); 5259 spin_unlock(&mc.lock);
5954 mem_cgroup_end_move(from);
5955} 5260}
5956 5261
5957static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5262static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
@@ -5984,7 +5289,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5984 VM_BUG_ON(mc.precharge); 5289 VM_BUG_ON(mc.precharge);
5985 VM_BUG_ON(mc.moved_charge); 5290 VM_BUG_ON(mc.moved_charge);
5986 VM_BUG_ON(mc.moved_swap); 5291 VM_BUG_ON(mc.moved_swap);
5987 mem_cgroup_start_move(from); 5292
5988 spin_lock(&mc.lock); 5293 spin_lock(&mc.lock);
5989 mc.from = from; 5294 mc.from = from;
5990 mc.to = memcg; 5295 mc.to = memcg;
@@ -6004,7 +5309,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6004static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5309static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6005 struct cgroup_taskset *tset) 5310 struct cgroup_taskset *tset)
6006{ 5311{
6007 mem_cgroup_clear_mc(); 5312 if (mc.to)
5313 mem_cgroup_clear_mc();
6008} 5314}
6009 5315
6010static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5316static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
@@ -6018,7 +5324,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6018 enum mc_target_type target_type; 5324 enum mc_target_type target_type;
6019 union mc_target target; 5325 union mc_target target;
6020 struct page *page; 5326 struct page *page;
6021 struct page_cgroup *pc;
6022 5327
6023 /* 5328 /*
6024 * We don't take compound_lock() here but no race with splitting thp 5329 * We don't take compound_lock() here but no race with splitting thp
@@ -6039,9 +5344,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6039 if (target_type == MC_TARGET_PAGE) { 5344 if (target_type == MC_TARGET_PAGE) {
6040 page = target.page; 5345 page = target.page;
6041 if (!isolate_lru_page(page)) { 5346 if (!isolate_lru_page(page)) {
6042 pc = lookup_page_cgroup(page);
6043 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5347 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6044 pc, mc.from, mc.to)) { 5348 mc.from, mc.to)) {
6045 mc.precharge -= HPAGE_PMD_NR; 5349 mc.precharge -= HPAGE_PMD_NR;
6046 mc.moved_charge += HPAGE_PMD_NR; 5350 mc.moved_charge += HPAGE_PMD_NR;
6047 } 5351 }
@@ -6069,9 +5373,7 @@ retry:
6069 page = target.page; 5373 page = target.page;
6070 if (isolate_lru_page(page)) 5374 if (isolate_lru_page(page))
6071 goto put; 5375 goto put;
6072 pc = lookup_page_cgroup(page); 5376 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
6073 if (!mem_cgroup_move_account(page, 1, pc,
6074 mc.from, mc.to)) {
6075 mc.precharge--; 5377 mc.precharge--;
6076 /* we uncharge from mc.from later. */ 5378 /* we uncharge from mc.from later. */
6077 mc.moved_charge++; 5379 mc.moved_charge++;
@@ -6115,6 +5417,13 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
6115 struct vm_area_struct *vma; 5417 struct vm_area_struct *vma;
6116 5418
6117 lru_add_drain_all(); 5419 lru_add_drain_all();
5420 /*
5421 * Signal mem_cgroup_begin_page_stat() to take the memcg's
5422 * move_lock while we're moving its pages to another memcg.
5423 * Then wait for already started RCU-only updates to finish.
5424 */
5425 atomic_inc(&mc.from->moving_account);
5426 synchronize_rcu();
6118retry: 5427retry:
6119 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5428 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
6120 /* 5429 /*
@@ -6147,6 +5456,7 @@ retry:
6147 break; 5456 break;
6148 } 5457 }
6149 up_read(&mm->mmap_sem); 5458 up_read(&mm->mmap_sem);
5459 atomic_dec(&mc.from->moving_account);
6150} 5460}
6151 5461
6152static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5462static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
@@ -6250,7 +5560,7 @@ static void __init enable_swap_cgroup(void)
6250 */ 5560 */
6251void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5561void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6252{ 5562{
6253 struct page_cgroup *pc; 5563 struct mem_cgroup *memcg;
6254 unsigned short oldid; 5564 unsigned short oldid;
6255 5565
6256 VM_BUG_ON_PAGE(PageLRU(page), page); 5566 VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -6259,20 +5569,26 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6259 if (!do_swap_account) 5569 if (!do_swap_account)
6260 return; 5570 return;
6261 5571
6262 pc = lookup_page_cgroup(page); 5572 memcg = page->mem_cgroup;
6263 5573
6264 /* Readahead page, never charged */ 5574 /* Readahead page, never charged */
6265 if (!PageCgroupUsed(pc)) 5575 if (!memcg)
6266 return; 5576 return;
6267 5577
6268 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); 5578 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
6269
6270 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
6271 VM_BUG_ON_PAGE(oldid, page); 5579 VM_BUG_ON_PAGE(oldid, page);
5580 mem_cgroup_swap_statistics(memcg, true);
5581
5582 page->mem_cgroup = NULL;
6272 5583
6273 pc->flags &= ~PCG_MEMSW; 5584 if (!mem_cgroup_is_root(memcg))
6274 css_get(&pc->mem_cgroup->css); 5585 page_counter_uncharge(&memcg->memory, 1);
6275 mem_cgroup_swap_statistics(pc->mem_cgroup, true); 5586
5587 /* XXX: caller holds IRQ-safe mapping->tree_lock */
5588 VM_BUG_ON(!irqs_disabled());
5589
5590 mem_cgroup_charge_statistics(memcg, page, -1);
5591 memcg_check_events(memcg, page);
6276} 5592}
6277 5593
6278/** 5594/**
@@ -6294,7 +5610,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
6294 memcg = mem_cgroup_lookup(id); 5610 memcg = mem_cgroup_lookup(id);
6295 if (memcg) { 5611 if (memcg) {
6296 if (!mem_cgroup_is_root(memcg)) 5612 if (!mem_cgroup_is_root(memcg))
6297 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 5613 page_counter_uncharge(&memcg->memsw, 1);
6298 mem_cgroup_swap_statistics(memcg, false); 5614 mem_cgroup_swap_statistics(memcg, false);
6299 css_put(&memcg->css); 5615 css_put(&memcg->css);
6300 } 5616 }
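Taken together, the two hunks above split swap accounting into a hand-off: mem_cgroup_swapout() parks the owning cgroup's id in the swap_cgroup map and drops the memory charge as the page leaves for swap, and mem_cgroup_uncharge_swap() later resolves that id back to a memcg and releases the memsw charge when the swap entry dies. Condensed sketch (statistics, css reference counting and the root-cgroup special case are as in the hunks):

	/* page -> swap slot 'entry' */
	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));	/* stash owner id */
	page->mem_cgroup = NULL;
	page_counter_uncharge(&memcg->memory, 1);	/* memsw stays charged for the slot */

	/* swap slot freed: the recorded id is read back and memsw uncharged */
	memcg = mem_cgroup_lookup(id);
	if (memcg)
		page_counter_uncharge(&memcg->memsw, 1);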
@@ -6330,7 +5646,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6330 goto out; 5646 goto out;
6331 5647
6332 if (PageSwapCache(page)) { 5648 if (PageSwapCache(page)) {
6333 struct page_cgroup *pc = lookup_page_cgroup(page);
6334 /* 5649 /*
6335 * Every swap fault against a single page tries to charge the 5650 * Every swap fault against a single page tries to charge the
6336 * page, bail as early as possible. shmem_unuse() encounters 5651 * page, bail as early as possible. shmem_unuse() encounters
@@ -6338,7 +5653,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6338 * the page lock, which serializes swap cache removal, which 5653 * the page lock, which serializes swap cache removal, which
6339 * in turn serializes uncharging. 5654 * in turn serializes uncharging.
6340 */ 5655 */
6341 if (PageCgroupUsed(pc)) 5656 if (page->mem_cgroup)
6342 goto out; 5657 goto out;
6343 } 5658 }
6344 5659
@@ -6452,19 +5767,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6452} 5767}
6453 5768
6454static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5769static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6455 unsigned long nr_mem, unsigned long nr_memsw,
6456 unsigned long nr_anon, unsigned long nr_file, 5770 unsigned long nr_anon, unsigned long nr_file,
6457 unsigned long nr_huge, struct page *dummy_page) 5771 unsigned long nr_huge, struct page *dummy_page)
6458{ 5772{
5773 unsigned long nr_pages = nr_anon + nr_file;
6459 unsigned long flags; 5774 unsigned long flags;
6460 5775
6461 if (!mem_cgroup_is_root(memcg)) { 5776 if (!mem_cgroup_is_root(memcg)) {
6462 if (nr_mem) 5777 page_counter_uncharge(&memcg->memory, nr_pages);
6463 res_counter_uncharge(&memcg->res, 5778 if (do_swap_account)
6464 nr_mem * PAGE_SIZE); 5779 page_counter_uncharge(&memcg->memsw, nr_pages);
6465 if (nr_memsw)
6466 res_counter_uncharge(&memcg->memsw,
6467 nr_memsw * PAGE_SIZE);
6468 memcg_oom_recover(memcg); 5780 memcg_oom_recover(memcg);
6469 } 5781 }
6470 5782
@@ -6473,27 +5785,27 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6473 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5785 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
6474 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5786 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
6475 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5787 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
6476 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); 5788 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
6477 memcg_check_events(memcg, dummy_page); 5789 memcg_check_events(memcg, dummy_page);
6478 local_irq_restore(flags); 5790 local_irq_restore(flags);
5791
5792 if (!mem_cgroup_is_root(memcg))
5793 css_put_many(&memcg->css, nr_pages);
6479} 5794}
6480 5795
6481static void uncharge_list(struct list_head *page_list) 5796static void uncharge_list(struct list_head *page_list)
6482{ 5797{
6483 struct mem_cgroup *memcg = NULL; 5798 struct mem_cgroup *memcg = NULL;
6484 unsigned long nr_memsw = 0;
6485 unsigned long nr_anon = 0; 5799 unsigned long nr_anon = 0;
6486 unsigned long nr_file = 0; 5800 unsigned long nr_file = 0;
6487 unsigned long nr_huge = 0; 5801 unsigned long nr_huge = 0;
6488 unsigned long pgpgout = 0; 5802 unsigned long pgpgout = 0;
6489 unsigned long nr_mem = 0;
6490 struct list_head *next; 5803 struct list_head *next;
6491 struct page *page; 5804 struct page *page;
6492 5805
6493 next = page_list->next; 5806 next = page_list->next;
6494 do { 5807 do {
6495 unsigned int nr_pages = 1; 5808 unsigned int nr_pages = 1;
6496 struct page_cgroup *pc;
6497 5809
6498 page = list_entry(next, struct page, lru); 5810 page = list_entry(next, struct page, lru);
6499 next = page->lru.next; 5811 next = page->lru.next;
@@ -6501,24 +5813,22 @@ static void uncharge_list(struct list_head *page_list)
6501 VM_BUG_ON_PAGE(PageLRU(page), page); 5813 VM_BUG_ON_PAGE(PageLRU(page), page);
6502 VM_BUG_ON_PAGE(page_count(page), page); 5814 VM_BUG_ON_PAGE(page_count(page), page);
6503 5815
6504 pc = lookup_page_cgroup(page); 5816 if (!page->mem_cgroup)
6505 if (!PageCgroupUsed(pc))
6506 continue; 5817 continue;
6507 5818
6508 /* 5819 /*
6509 * Nobody should be changing or seriously looking at 5820 * Nobody should be changing or seriously looking at
6510 * pc->mem_cgroup and pc->flags at this point, we have 5821 * page->mem_cgroup at this point, we have fully
6511 * fully exclusive access to the page. 5822 * exclusive access to the page.
6512 */ 5823 */
6513 5824
6514 if (memcg != pc->mem_cgroup) { 5825 if (memcg != page->mem_cgroup) {
6515 if (memcg) { 5826 if (memcg) {
6516 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 5827 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
6517 nr_anon, nr_file, nr_huge, page); 5828 nr_huge, page);
6518 pgpgout = nr_mem = nr_memsw = 0; 5829 pgpgout = nr_anon = nr_file = nr_huge = 0;
6519 nr_anon = nr_file = nr_huge = 0;
6520 } 5830 }
6521 memcg = pc->mem_cgroup; 5831 memcg = page->mem_cgroup;
6522 } 5832 }
6523 5833
6524 if (PageTransHuge(page)) { 5834 if (PageTransHuge(page)) {
@@ -6532,18 +5842,14 @@ static void uncharge_list(struct list_head *page_list)
6532 else 5842 else
6533 nr_file += nr_pages; 5843 nr_file += nr_pages;
6534 5844
6535 if (pc->flags & PCG_MEM) 5845 page->mem_cgroup = NULL;
6536 nr_mem += nr_pages;
6537 if (pc->flags & PCG_MEMSW)
6538 nr_memsw += nr_pages;
6539 pc->flags = 0;
6540 5846
6541 pgpgout++; 5847 pgpgout++;
6542 } while (next != page_list); 5848 } while (next != page_list);
6543 5849
6544 if (memcg) 5850 if (memcg)
6545 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 5851 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
6546 nr_anon, nr_file, nr_huge, page); 5852 nr_huge, page);
6547} 5853}
6548 5854
6549/** 5855/**
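With the PCG_MEM/PCG_MEMSW flags gone, uncharge_list() needs nothing but page->mem_cgroup to decide what to release, and it still batches runs of pages owned by the same cgroup into a single uncharge_batch() call. A hypothetical bulk-free caller (a sketch, not the exact release_pages() code) stays as simple as:

	LIST_HEAD(pages_to_free);

	/* collect pages whose last reference just dropped ... */
	list_add(&page->lru, &pages_to_free);

	mem_cgroup_uncharge_list(&pages_to_free);	/* one pass, batched per memcg */
	free_hot_cold_page_list(&pages_to_free, cold);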
@@ -6555,14 +5861,11 @@ static void uncharge_list(struct list_head *page_list)
6555 */ 5861 */
6556void mem_cgroup_uncharge(struct page *page) 5862void mem_cgroup_uncharge(struct page *page)
6557{ 5863{
6558 struct page_cgroup *pc;
6559
6560 if (mem_cgroup_disabled()) 5864 if (mem_cgroup_disabled())
6561 return; 5865 return;
6562 5866
6563 /* Don't touch page->lru of any random page, pre-check: */ 5867 /* Don't touch page->lru of any random page, pre-check: */
6564 pc = lookup_page_cgroup(page); 5868 if (!page->mem_cgroup)
6565 if (!PageCgroupUsed(pc))
6566 return; 5869 return;
6567 5870
6568 INIT_LIST_HEAD(&page->lru); 5871 INIT_LIST_HEAD(&page->lru);
@@ -6598,7 +5901,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
6598void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5901void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6599 bool lrucare) 5902 bool lrucare)
6600{ 5903{
6601 struct page_cgroup *pc; 5904 struct mem_cgroup *memcg;
6602 int isolated; 5905 int isolated;
6603 5906
6604 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5907 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6613,27 +5916,28 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6613 return; 5916 return;
6614 5917
6615 /* Page cache replacement: new page already charged? */ 5918 /* Page cache replacement: new page already charged? */
6616 pc = lookup_page_cgroup(newpage); 5919 if (newpage->mem_cgroup)
6617 if (PageCgroupUsed(pc))
6618 return; 5920 return;
6619 5921
6620 /* Re-entrant migration: old page already uncharged? */ 5922 /*
6621 pc = lookup_page_cgroup(oldpage); 5923 * Swapcache readahead pages can get migrated before being
6622 if (!PageCgroupUsed(pc)) 5924 * charged, and migration from compaction can happen to an
5925 * uncharged page when the PFN walker finds a page that
5926 * reclaim just put back on the LRU but has not released yet.
5927 */
5928 memcg = oldpage->mem_cgroup;
5929 if (!memcg)
6623 return; 5930 return;
6624 5931
6625 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
6626 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
6627
6628 if (lrucare) 5932 if (lrucare)
6629 lock_page_lru(oldpage, &isolated); 5933 lock_page_lru(oldpage, &isolated);
6630 5934
6631 pc->flags = 0; 5935 oldpage->mem_cgroup = NULL;
6632 5936
6633 if (lrucare) 5937 if (lrucare)
6634 unlock_page_lru(oldpage, isolated); 5938 unlock_page_lru(oldpage, isolated);
6635 5939
6636 commit_charge(newpage, pc->mem_cgroup, lrucare); 5940 commit_charge(newpage, memcg, lrucare);
6637} 5941}
6638 5942
6639/* 5943/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b852b10ec76d..e5ee0ca7ae85 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,7 +233,7 @@ void shake_page(struct page *p, int access)
233 lru_add_drain_all(); 233 lru_add_drain_all();
234 if (PageLRU(p)) 234 if (PageLRU(p))
235 return; 235 return;
236 drain_all_pages(); 236 drain_all_pages(page_zone(p));
237 if (PageLRU(p) || is_free_buddy_page(p)) 237 if (PageLRU(p) || is_free_buddy_page(p))
238 return; 238 return;
239 } 239 }
@@ -1661,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags)
1661 if (!is_free_buddy_page(page)) 1661 if (!is_free_buddy_page(page))
1662 lru_add_drain_all(); 1662 lru_add_drain_all();
1663 if (!is_free_buddy_page(page)) 1663 if (!is_free_buddy_page(page))
1664 drain_all_pages(); 1664 drain_all_pages(page_zone(page));
1665 SetPageHWPoison(page); 1665 SetPageHWPoison(page);
1666 if (!is_free_buddy_page(page)) 1666 if (!is_free_buddy_page(page))
1667 pr_info("soft offline: %#lx: page leaked\n", 1667 pr_info("soft offline: %#lx: page leaked\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1bf4807cb21e..9fab10795bea 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1725,7 +1725,7 @@ repeat:
1725 if (drain) { 1725 if (drain) {
1726 lru_add_drain_all(); 1726 lru_add_drain_all();
1727 cond_resched(); 1727 cond_resched();
1728 drain_all_pages(); 1728 drain_all_pages(zone);
1729 } 1729 }
1730 1730
1731 pfn = scan_movable_pages(start_pfn, end_pfn); 1731 pfn = scan_movable_pages(start_pfn, end_pfn);
@@ -1747,7 +1747,7 @@ repeat:
1747 lru_add_drain_all(); 1747 lru_add_drain_all();
1748 yield(); 1748 yield();
1749 /* drain pcp pages, this is synchronous. */ 1749 /* drain pcp pages, this is synchronous. */
1750 drain_all_pages(); 1750 drain_all_pages(zone);
1751 /* 1751 /*
1752 * dissolve free hugepages in the memory block before doing offlining 1752 * dissolve free hugepages in the memory block before doing offlining
1753 * actually in order to make hugetlbfs's object counting consistent. 1753 * actually in order to make hugetlbfs's object counting consistent.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5340f6b91312..3b014d326151 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -119,7 +119,7 @@ found:
119 119
120/* return true if the task is not adequate as candidate victim task. */ 120/* return true if the task is not adequate as candidate victim task. */
121static bool oom_unkillable_task(struct task_struct *p, 121static bool oom_unkillable_task(struct task_struct *p,
122 const struct mem_cgroup *memcg, const nodemask_t *nodemask) 122 struct mem_cgroup *memcg, const nodemask_t *nodemask)
123{ 123{
124 if (is_global_init(p)) 124 if (is_global_init(p))
125 return true; 125 return true;
@@ -353,7 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
353 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, 353 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
354 * swapents, oom_score_adj value, and name. 354 * swapents, oom_score_adj value, and name.
355 */ 355 */
356static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 356static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
357{ 357{
358 struct task_struct *p; 358 struct task_struct *p;
359 struct task_struct *task; 359 struct task_struct *task;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 19ceae87522d..d5d81f5384d1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page)
2357 dec_zone_page_state(page, NR_WRITEBACK); 2357 dec_zone_page_state(page, NR_WRITEBACK);
2358 inc_zone_page_state(page, NR_WRITTEN); 2358 inc_zone_page_state(page, NR_WRITTEN);
2359 } 2359 }
2360 mem_cgroup_end_page_stat(memcg, locked, memcg_flags); 2360 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
2361 return ret; 2361 return ret;
2362} 2362}
2363 2363
@@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2399 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2399 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
2400 inc_zone_page_state(page, NR_WRITEBACK); 2400 inc_zone_page_state(page, NR_WRITEBACK);
2401 } 2401 }
2402 mem_cgroup_end_page_stat(memcg, locked, memcg_flags); 2402 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
2403 return ret; 2403 return ret;
2404 2404
2405} 2405}
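Both call sites now pass &locked and &flags back to mem_cgroup_end_page_stat(), so the end side can undo exactly what the begin side did: drop move_lock and restore interrupts only if the lock was actually taken. The writeback accounting pattern, condensed from the hunk above (mem_cgroup_begin_page_stat() is assumed to keep the matching pointer-based signature):

	struct mem_cgroup *memcg;
	unsigned long flags;
	bool locked;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	if (!TestSetPageWriteback(page))
		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
	mem_cgroup_end_page_stat(memcg, &locked, &flags);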
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 616a2c956b4b..a7198c065999 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,7 +48,6 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 51#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 52#include <linux/kmemleak.h>
54#include <linux/compaction.h> 53#include <linux/compaction.h>
@@ -641,8 +640,10 @@ static inline int free_pages_check(struct page *page)
641 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 640 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
642 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 641 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
643 } 642 }
644 if (unlikely(mem_cgroup_bad_page_check(page))) 643#ifdef CONFIG_MEMCG
645 bad_reason = "cgroup check failed"; 644 if (unlikely(page->mem_cgroup))
645 bad_reason = "page still charged to cgroup";
646#endif
646 if (unlikely(bad_reason)) { 647 if (unlikely(bad_reason)) {
647 bad_page(page, bad_reason, bad_flags); 648 bad_page(page, bad_reason, bad_flags);
648 return 1; 649 return 1;
@@ -741,6 +742,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
741 int i; 742 int i;
742 int bad = 0; 743 int bad = 0;
743 744
745 VM_BUG_ON_PAGE(PageTail(page), page);
746 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page);
747
744 trace_mm_page_free(page, order); 748 trace_mm_page_free(page, order);
745 kmemcheck_free_shadow(page, order); 749 kmemcheck_free_shadow(page, order);
746 750
@@ -898,8 +902,10 @@ static inline int check_new_page(struct page *page)
898 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 902 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
899 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 903 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
900 } 904 }
901 if (unlikely(mem_cgroup_bad_page_check(page))) 905#ifdef CONFIG_MEMCG
902 bad_reason = "cgroup check failed"; 906 if (unlikely(page->mem_cgroup))
907 bad_reason = "page still charged to cgroup";
908#endif
903 if (unlikely(bad_reason)) { 909 if (unlikely(bad_reason)) {
904 bad_page(page, bad_reason, bad_flags); 910 bad_page(page, bad_reason, bad_flags);
905 return 1; 911 return 1;
@@ -1267,55 +1273,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1267#endif 1273#endif
1268 1274
1269/* 1275/*
1270 * Drain pages of the indicated processor. 1276 * Drain pcplists of the indicated processor and zone.
1271 * 1277 *
1272 * The processor must either be the current processor and the 1278 * The processor must either be the current processor and the
1273 * thread pinned to the current processor or a processor that 1279 * thread pinned to the current processor or a processor that
1274 * is not online. 1280 * is not online.
1275 */ 1281 */
1276static void drain_pages(unsigned int cpu) 1282static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1277{ 1283{
1278 unsigned long flags; 1284 unsigned long flags;
1279 struct zone *zone; 1285 struct per_cpu_pageset *pset;
1286 struct per_cpu_pages *pcp;
1280 1287
1281 for_each_populated_zone(zone) { 1288 local_irq_save(flags);
1282 struct per_cpu_pageset *pset; 1289 pset = per_cpu_ptr(zone->pageset, cpu);
1283 struct per_cpu_pages *pcp;
1284 1290
1285 local_irq_save(flags); 1291 pcp = &pset->pcp;
1286 pset = per_cpu_ptr(zone->pageset, cpu); 1292 if (pcp->count) {
1293 free_pcppages_bulk(zone, pcp->count, pcp);
1294 pcp->count = 0;
1295 }
1296 local_irq_restore(flags);
1297}
1287 1298
1288 pcp = &pset->pcp; 1299/*
1289 if (pcp->count) { 1300 * Drain pcplists of all zones on the indicated processor.
1290 free_pcppages_bulk(zone, pcp->count, pcp); 1301 *
1291 pcp->count = 0; 1302 * The processor must either be the current processor and the
1292 } 1303 * thread pinned to the current processor or a processor that
1293 local_irq_restore(flags); 1304 * is not online.
1305 */
1306static void drain_pages(unsigned int cpu)
1307{
1308 struct zone *zone;
1309
1310 for_each_populated_zone(zone) {
1311 drain_pages_zone(cpu, zone);
1294 } 1312 }
1295} 1313}
1296 1314
1297/* 1315/*
1298 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1316 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1317 *
1318 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
1319 * the single zone's pages.
1299 */ 1320 */
1300void drain_local_pages(void *arg) 1321void drain_local_pages(struct zone *zone)
1301{ 1322{
1302 drain_pages(smp_processor_id()); 1323 int cpu = smp_processor_id();
1324
1325 if (zone)
1326 drain_pages_zone(cpu, zone);
1327 else
1328 drain_pages(cpu);
1303} 1329}
1304 1330
1305/* 1331/*
1306 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1332 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1307 * 1333 *
1334 * When zone parameter is non-NULL, spill just the single zone's pages.
1335 *
1308 * Note that this code is protected against sending an IPI to an offline 1336 * Note that this code is protected against sending an IPI to an offline
1309 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1337 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1310 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1338 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1311 * nothing keeps CPUs from showing up after we populated the cpumask and 1339 * nothing keeps CPUs from showing up after we populated the cpumask and
1312 * before the call to on_each_cpu_mask(). 1340 * before the call to on_each_cpu_mask().
1313 */ 1341 */
1314void drain_all_pages(void) 1342void drain_all_pages(struct zone *zone)
1315{ 1343{
1316 int cpu; 1344 int cpu;
1317 struct per_cpu_pageset *pcp;
1318 struct zone *zone;
1319 1345
1320 /* 1346 /*
1321 * Allocate in the BSS so we wont require allocation in 1347 * Allocate in the BSS so we wont require allocation in
@@ -1330,20 +1356,31 @@ void drain_all_pages(void)
1330 * disables preemption as part of its processing 1356 * disables preemption as part of its processing
1331 */ 1357 */
1332 for_each_online_cpu(cpu) { 1358 for_each_online_cpu(cpu) {
1359 struct per_cpu_pageset *pcp;
1360 struct zone *z;
1333 bool has_pcps = false; 1361 bool has_pcps = false;
1334 for_each_populated_zone(zone) { 1362
1363 if (zone) {
1335 pcp = per_cpu_ptr(zone->pageset, cpu); 1364 pcp = per_cpu_ptr(zone->pageset, cpu);
1336 if (pcp->pcp.count) { 1365 if (pcp->pcp.count)
1337 has_pcps = true; 1366 has_pcps = true;
1338 break; 1367 } else {
1368 for_each_populated_zone(z) {
1369 pcp = per_cpu_ptr(z->pageset, cpu);
1370 if (pcp->pcp.count) {
1371 has_pcps = true;
1372 break;
1373 }
1339 } 1374 }
1340 } 1375 }
1376
1341 if (has_pcps) 1377 if (has_pcps)
1342 cpumask_set_cpu(cpu, &cpus_with_pcps); 1378 cpumask_set_cpu(cpu, &cpus_with_pcps);
1343 else 1379 else
1344 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1380 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1345 } 1381 }
1346 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1382 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
1383 zone, 1);
1347} 1384}
1348 1385
1349#ifdef CONFIG_HIBERNATION 1386#ifdef CONFIG_HIBERNATION
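drain_local_pages() and drain_all_pages() now take a zone: a non-NULL zone drains only that zone's per-cpu lists, which is what single-zone callers such as memory offlining, CMA and page isolation want, while NULL keeps the old drain-everything behaviour used by direct reclaim. The (smp_call_func_t) cast above is what lets the typed drain_local_pages(struct zone *) still be driven through on_each_cpu_mask(). The two call styles, as used later in this series:

	drain_all_pages(zone);	/* e.g. before offlining or isolating pages of 'zone' */
	drain_all_pages(NULL);	/* direct reclaim: flush pcplists of every populated zone */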
@@ -1705,7 +1742,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1705 unsigned long mark, int classzone_idx, int alloc_flags, 1742 unsigned long mark, int classzone_idx, int alloc_flags,
1706 long free_pages) 1743 long free_pages)
1707{ 1744{
1708 /* free_pages my go negative - that's OK */ 1745 /* free_pages may go negative - that's OK */
1709 long min = mark; 1746 long min = mark;
1710 int o; 1747 int o;
1711 long free_cma = 0; 1748 long free_cma = 0;
@@ -2296,7 +2333,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2296 int classzone_idx, int migratetype, enum migrate_mode mode, 2333 int classzone_idx, int migratetype, enum migrate_mode mode,
2297 int *contended_compaction, bool *deferred_compaction) 2334 int *contended_compaction, bool *deferred_compaction)
2298{ 2335{
2299 struct zone *last_compact_zone = NULL;
2300 unsigned long compact_result; 2336 unsigned long compact_result;
2301 struct page *page; 2337 struct page *page;
2302 2338
@@ -2307,7 +2343,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2307 compact_result = try_to_compact_pages(zonelist, order, gfp_mask, 2343 compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
2308 nodemask, mode, 2344 nodemask, mode,
2309 contended_compaction, 2345 contended_compaction,
2310 &last_compact_zone); 2346 alloc_flags, classzone_idx);
2311 current->flags &= ~PF_MEMALLOC; 2347 current->flags &= ~PF_MEMALLOC;
2312 2348
2313 switch (compact_result) { 2349 switch (compact_result) {
@@ -2326,10 +2362,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2326 */ 2362 */
2327 count_vm_event(COMPACTSTALL); 2363 count_vm_event(COMPACTSTALL);
2328 2364
2329 /* Page migration frees to the PCP lists but we want merging */
2330 drain_pages(get_cpu());
2331 put_cpu();
2332
2333 page = get_page_from_freelist(gfp_mask, nodemask, 2365 page = get_page_from_freelist(gfp_mask, nodemask,
2334 order, zonelist, high_zoneidx, 2366 order, zonelist, high_zoneidx,
2335 alloc_flags & ~ALLOC_NO_WATERMARKS, 2367 alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2345,14 +2377,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2345 } 2377 }
2346 2378
2347 /* 2379 /*
2348 * last_compact_zone is where try_to_compact_pages thought allocation
2349 * should succeed, so it did not defer compaction. But here we know
2350 * that it didn't succeed, so we do the defer.
2351 */
2352 if (last_compact_zone && mode != MIGRATE_ASYNC)
2353 defer_compaction(last_compact_zone, order);
2354
2355 /*
2356 * It's bad if compaction run occurs and fails. The most likely reason 2380 * It's bad if compaction run occurs and fails. The most likely reason
2357 * is that pages exist, but not enough to satisfy watermarks. 2381 * is that pages exist, but not enough to satisfy watermarks.
2358 */ 2382 */
@@ -2433,7 +2457,7 @@ retry:
2433 * pages are pinned on the per-cpu lists. Drain them and try again 2457 * pages are pinned on the per-cpu lists. Drain them and try again
2434 */ 2458 */
2435 if (!page && !drained) { 2459 if (!page && !drained) {
2436 drain_all_pages(); 2460 drain_all_pages(NULL);
2437 drained = true; 2461 drained = true;
2438 goto retry; 2462 goto retry;
2439 } 2463 }
@@ -3893,14 +3917,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3893 else 3917 else
3894 page_group_by_mobility_disabled = 0; 3918 page_group_by_mobility_disabled = 0;
3895 3919
3896 printk("Built %i zonelists in %s order, mobility grouping %s. " 3920 pr_info("Built %i zonelists in %s order, mobility grouping %s. "
3897 "Total pages: %ld\n", 3921 "Total pages: %ld\n",
3898 nr_online_nodes, 3922 nr_online_nodes,
3899 zonelist_order_name[current_zonelist_order], 3923 zonelist_order_name[current_zonelist_order],
3900 page_group_by_mobility_disabled ? "off" : "on", 3924 page_group_by_mobility_disabled ? "off" : "on",
3901 vm_total_pages); 3925 vm_total_pages);
3902#ifdef CONFIG_NUMA 3926#ifdef CONFIG_NUMA
3903 printk("Policy zone: %s\n", zone_names[policy_zone]); 3927 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
3904#endif 3928#endif
3905} 3929}
3906 3930
@@ -4832,7 +4856,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4832#endif 4856#endif
4833 init_waitqueue_head(&pgdat->kswapd_wait); 4857 init_waitqueue_head(&pgdat->kswapd_wait);
4834 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4858 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4835 pgdat_page_cgroup_init(pgdat);
4836 4859
4837 for (j = 0; j < MAX_NR_ZONES; j++) { 4860 for (j = 0; j < MAX_NR_ZONES; j++) {
4838 struct zone *zone = pgdat->node_zones + j; 4861 struct zone *zone = pgdat->node_zones + j;
@@ -5334,33 +5357,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5334 find_zone_movable_pfns_for_nodes(); 5357 find_zone_movable_pfns_for_nodes();
5335 5358
5336 /* Print out the zone ranges */ 5359 /* Print out the zone ranges */
5337 printk("Zone ranges:\n"); 5360 pr_info("Zone ranges:\n");
5338 for (i = 0; i < MAX_NR_ZONES; i++) { 5361 for (i = 0; i < MAX_NR_ZONES; i++) {
5339 if (i == ZONE_MOVABLE) 5362 if (i == ZONE_MOVABLE)
5340 continue; 5363 continue;
5341 printk(KERN_CONT " %-8s ", zone_names[i]); 5364 pr_info(" %-8s ", zone_names[i]);
5342 if (arch_zone_lowest_possible_pfn[i] == 5365 if (arch_zone_lowest_possible_pfn[i] ==
5343 arch_zone_highest_possible_pfn[i]) 5366 arch_zone_highest_possible_pfn[i])
5344 printk(KERN_CONT "empty\n"); 5367 pr_cont("empty\n");
5345 else 5368 else
5346 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5369 pr_cont("[mem %0#10lx-%0#10lx]\n",
5347 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5370 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5348 (arch_zone_highest_possible_pfn[i] 5371 (arch_zone_highest_possible_pfn[i]
5349 << PAGE_SHIFT) - 1); 5372 << PAGE_SHIFT) - 1);
5350 } 5373 }
5351 5374
5352 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5375 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5353 printk("Movable zone start for each node\n"); 5376 pr_info("Movable zone start for each node\n");
5354 for (i = 0; i < MAX_NUMNODES; i++) { 5377 for (i = 0; i < MAX_NUMNODES; i++) {
5355 if (zone_movable_pfn[i]) 5378 if (zone_movable_pfn[i])
5356 printk(" Node %d: %#010lx\n", i, 5379 pr_info(" Node %d: %#010lx\n", i,
5357 zone_movable_pfn[i] << PAGE_SHIFT); 5380 zone_movable_pfn[i] << PAGE_SHIFT);
5358 } 5381 }
5359 5382
5360 /* Print out the early node map */ 5383 /* Print out the early node map */
5361 printk("Early memory node ranges\n"); 5384 pr_info("Early memory node ranges\n");
5362 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5385 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5363 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5386 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5364 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5387 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5365 5388
5366 /* Initialise every node */ 5389 /* Initialise every node */
@@ -5496,7 +5519,7 @@ void __init mem_init_print_info(const char *str)
5496 5519
5497#undef adj_init_size 5520#undef adj_init_size
5498 5521
5499 printk("Memory: %luK/%luK available " 5522 pr_info("Memory: %luK/%luK available "
5500 "(%luK kernel code, %luK rwdata, %luK rodata, " 5523 "(%luK kernel code, %luK rwdata, %luK rodata, "
5501 "%luK init, %luK bss, %luK reserved" 5524 "%luK init, %luK bss, %luK reserved"
5502#ifdef CONFIG_HIGHMEM 5525#ifdef CONFIG_HIGHMEM
@@ -6385,7 +6408,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6385 */ 6408 */
6386 6409
6387 lru_add_drain_all(); 6410 lru_add_drain_all();
6388 drain_all_pages(); 6411 drain_all_pages(cc.zone);
6389 6412
6390 order = 0; 6413 order = 0;
6391 outer_start = start; 6414 outer_start = start;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
deleted file mode 100644
index 5331c2bd85a2..000000000000
--- a/mm/page_cgroup.c
+++ /dev/null
@@ -1,530 +0,0 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/slab.h>
8#include <linux/memory.h>
9#include <linux/vmalloc.h>
10#include <linux/cgroup.h>
11#include <linux/swapops.h>
12#include <linux/kmemleak.h>
13
14static unsigned long total_usage;
15
16#if !defined(CONFIG_SPARSEMEM)
17
18
19void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
20{
21 pgdat->node_page_cgroup = NULL;
22}
23
24struct page_cgroup *lookup_page_cgroup(struct page *page)
25{
26 unsigned long pfn = page_to_pfn(page);
27 unsigned long offset;
28 struct page_cgroup *base;
29
30 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
31#ifdef CONFIG_DEBUG_VM
32 /*
33 * The sanity checks the page allocator does upon freeing a
34 * page can reach here before the page_cgroup arrays are
35 * allocated when feeding a range of pages to the allocator
36 * for the first time during bootup or memory hotplug.
37 */
38 if (unlikely(!base))
39 return NULL;
40#endif
41 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
42 return base + offset;
43}
44
45static int __init alloc_node_page_cgroup(int nid)
46{
47 struct page_cgroup *base;
48 unsigned long table_size;
49 unsigned long nr_pages;
50
51 nr_pages = NODE_DATA(nid)->node_spanned_pages;
52 if (!nr_pages)
53 return 0;
54
55 table_size = sizeof(struct page_cgroup) * nr_pages;
56
57 base = memblock_virt_alloc_try_nid_nopanic(
58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
59 BOOTMEM_ALLOC_ACCESSIBLE, nid);
60 if (!base)
61 return -ENOMEM;
62 NODE_DATA(nid)->node_page_cgroup = base;
63 total_usage += table_size;
64 return 0;
65}
66
67void __init page_cgroup_init_flatmem(void)
68{
69
70 int nid, fail;
71
72 if (mem_cgroup_disabled())
73 return;
74
75 for_each_online_node(nid) {
76 fail = alloc_node_page_cgroup(nid);
77 if (fail)
78 goto fail;
79 }
80 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
81 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
82 " don't want memory cgroups\n");
83 return;
84fail:
85 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
86 printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
87 panic("Out of memory");
88}
89
90#else /* CONFIG_FLAT_NODE_MEM_MAP */
91
92struct page_cgroup *lookup_page_cgroup(struct page *page)
93{
94 unsigned long pfn = page_to_pfn(page);
95 struct mem_section *section = __pfn_to_section(pfn);
96#ifdef CONFIG_DEBUG_VM
97 /*
98 * The sanity checks the page allocator does upon freeing a
99 * page can reach here before the page_cgroup arrays are
100 * allocated when feeding a range of pages to the allocator
101 * for the first time during bootup or memory hotplug.
102 */
103 if (!section->page_cgroup)
104 return NULL;
105#endif
106 return section->page_cgroup + pfn;
107}
108
109static void *__meminit alloc_page_cgroup(size_t size, int nid)
110{
111 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
112 void *addr = NULL;
113
114 addr = alloc_pages_exact_nid(nid, size, flags);
115 if (addr) {
116 kmemleak_alloc(addr, size, 1, flags);
117 return addr;
118 }
119
120 if (node_state(nid, N_HIGH_MEMORY))
121 addr = vzalloc_node(size, nid);
122 else
123 addr = vzalloc(size);
124
125 return addr;
126}
127
128static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
129{
130 struct mem_section *section;
131 struct page_cgroup *base;
132 unsigned long table_size;
133
134 section = __pfn_to_section(pfn);
135
136 if (section->page_cgroup)
137 return 0;
138
139 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
140 base = alloc_page_cgroup(table_size, nid);
141
142 /*
143 * The value stored in section->page_cgroup is (base - pfn)
144 * and it does not point to the memory block allocated above,
145 * causing kmemleak false positives.
146 */
147 kmemleak_not_leak(base);
148
149 if (!base) {
150 printk(KERN_ERR "page cgroup allocation failure\n");
151 return -ENOMEM;
152 }
153
154 /*
155 * The passed "pfn" may not be aligned to SECTION. For the calculation
156 * we need to apply a mask.
157 */
158 pfn &= PAGE_SECTION_MASK;
159 section->page_cgroup = base - pfn;
160 total_usage += table_size;
161 return 0;
162}
163#ifdef CONFIG_MEMORY_HOTPLUG
164static void free_page_cgroup(void *addr)
165{
166 if (is_vmalloc_addr(addr)) {
167 vfree(addr);
168 } else {
169 struct page *page = virt_to_page(addr);
170 size_t table_size =
171 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
172
173 BUG_ON(PageReserved(page));
174 kmemleak_free(addr);
175 free_pages_exact(addr, table_size);
176 }
177}
178
179static void __free_page_cgroup(unsigned long pfn)
180{
181 struct mem_section *ms;
182 struct page_cgroup *base;
183
184 ms = __pfn_to_section(pfn);
185 if (!ms || !ms->page_cgroup)
186 return;
187 base = ms->page_cgroup + pfn;
188 free_page_cgroup(base);
189 ms->page_cgroup = NULL;
190}
191
192static int __meminit online_page_cgroup(unsigned long start_pfn,
193 unsigned long nr_pages,
194 int nid)
195{
196 unsigned long start, end, pfn;
197 int fail = 0;
198
199 start = SECTION_ALIGN_DOWN(start_pfn);
200 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
201
202 if (nid == -1) {
203 /*
204 * In this case, "nid" already exists and contains valid memory.
205 * "start_pfn" passed to us is a pfn which is an arg for
206 * online__pages(), and start_pfn should exist.
207 */
208 nid = pfn_to_nid(start_pfn);
209 VM_BUG_ON(!node_state(nid, N_ONLINE));
210 }
211
212 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
213 if (!pfn_present(pfn))
214 continue;
215 fail = init_section_page_cgroup(pfn, nid);
216 }
217 if (!fail)
218 return 0;
219
220 /* rollback */
221 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
222 __free_page_cgroup(pfn);
223
224 return -ENOMEM;
225}
226
227static int __meminit offline_page_cgroup(unsigned long start_pfn,
228 unsigned long nr_pages, int nid)
229{
230 unsigned long start, end, pfn;
231
232 start = SECTION_ALIGN_DOWN(start_pfn);
233 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
234
235 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
236 __free_page_cgroup(pfn);
237 return 0;
238
239}
240
241static int __meminit page_cgroup_callback(struct notifier_block *self,
242 unsigned long action, void *arg)
243{
244 struct memory_notify *mn = arg;
245 int ret = 0;
246 switch (action) {
247 case MEM_GOING_ONLINE:
248 ret = online_page_cgroup(mn->start_pfn,
249 mn->nr_pages, mn->status_change_nid);
250 break;
251 case MEM_OFFLINE:
252 offline_page_cgroup(mn->start_pfn,
253 mn->nr_pages, mn->status_change_nid);
254 break;
255 case MEM_CANCEL_ONLINE:
256 offline_page_cgroup(mn->start_pfn,
257 mn->nr_pages, mn->status_change_nid);
258 break;
259 case MEM_GOING_OFFLINE:
260 break;
261 case MEM_ONLINE:
262 case MEM_CANCEL_OFFLINE:
263 break;
264 }
265
266 return notifier_from_errno(ret);
267}
268
269#endif
270
271void __init page_cgroup_init(void)
272{
273 unsigned long pfn;
274 int nid;
275
276 if (mem_cgroup_disabled())
277 return;
278
279 for_each_node_state(nid, N_MEMORY) {
280 unsigned long start_pfn, end_pfn;
281
282 start_pfn = node_start_pfn(nid);
283 end_pfn = node_end_pfn(nid);
284 /*
285 * start_pfn and end_pfn may not be aligned to SECTION and the
286 * page->flags of out of node pages are not initialized. So we
287 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
288 */
289 for (pfn = start_pfn;
290 pfn < end_pfn;
291 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
292
293 if (!pfn_valid(pfn))
294 continue;
295 /*
296 * Nodes's pfns can be overlapping.
297 * We know some arch can have a nodes layout such as
298 * -------------pfn-------------->
299 * N0 | N1 | N2 | N0 | N1 | N2|....
300 */
301 if (pfn_to_nid(pfn) != nid)
302 continue;
303 if (init_section_page_cgroup(pfn, nid))
304 goto oom;
305 }
306 }
307 hotplug_memory_notifier(page_cgroup_callback, 0);
308 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
309 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
310 "don't want memory cgroups\n");
311 return;
312oom:
313 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
314 panic("Out of memory");
315}
316
317void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
318{
319 return;
320}
321
322#endif
323
324
325#ifdef CONFIG_MEMCG_SWAP
326
327static DEFINE_MUTEX(swap_cgroup_mutex);
328struct swap_cgroup_ctrl {
329 struct page **map;
330 unsigned long length;
331 spinlock_t lock;
332};
333
334static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
335
336struct swap_cgroup {
337 unsigned short id;
338};
339#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
340
341/*
342 * SwapCgroup implements "lookup" and "exchange" operations.
343 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
344 * against SwapCache. At swap_free(), this is accessed directly from swap.
345 *
346 * This means,
347 * - we have no race in "exchange" when we're accessed via SwapCache because
348 * SwapCache(and its swp_entry) is under lock.
349 * - When called via swap_free(), there is no user of this entry and no race.
350 * Then, we don't need lock around "exchange".
351 *
352 * TODO: we can push these buffers out to HIGHMEM.
353 */
354
355/*
356 * allocate buffer for swap_cgroup.
357 */
358static int swap_cgroup_prepare(int type)
359{
360 struct page *page;
361 struct swap_cgroup_ctrl *ctrl;
362 unsigned long idx, max;
363
364 ctrl = &swap_cgroup_ctrl[type];
365
366 for (idx = 0; idx < ctrl->length; idx++) {
367 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
368 if (!page)
369 goto not_enough_page;
370 ctrl->map[idx] = page;
371 }
372 return 0;
373not_enough_page:
374 max = idx;
375 for (idx = 0; idx < max; idx++)
376 __free_page(ctrl->map[idx]);
377
378 return -ENOMEM;
379}
380
381static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
382 struct swap_cgroup_ctrl **ctrlp)
383{
384 pgoff_t offset = swp_offset(ent);
385 struct swap_cgroup_ctrl *ctrl;
386 struct page *mappage;
387 struct swap_cgroup *sc;
388
389 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
390 if (ctrlp)
391 *ctrlp = ctrl;
392
393 mappage = ctrl->map[offset / SC_PER_PAGE];
394 sc = page_address(mappage);
395 return sc + offset % SC_PER_PAGE;
396}
397
398/**
399 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
400 * @ent: swap entry to be cmpxchged
401 * @old: old id
402 * @new: new id
403 *
404 * Returns old id at success, 0 at failure.
405 * (There is no mem_cgroup using 0 as its id)
406 */
407unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
408 unsigned short old, unsigned short new)
409{
410 struct swap_cgroup_ctrl *ctrl;
411 struct swap_cgroup *sc;
412 unsigned long flags;
413 unsigned short retval;
414
415 sc = lookup_swap_cgroup(ent, &ctrl);
416
417 spin_lock_irqsave(&ctrl->lock, flags);
418 retval = sc->id;
419 if (retval == old)
420 sc->id = new;
421 else
422 retval = 0;
423 spin_unlock_irqrestore(&ctrl->lock, flags);
424 return retval;
425}
426
427/**
428 * swap_cgroup_record - record mem_cgroup for this swp_entry.
429 * @ent: swap entry to be recorded into
430 * @id: mem_cgroup to be recorded
431 *
432 * Returns old value at success, 0 at failure.
433 * (Of course, old value can be 0.)
434 */
435unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
436{
437 struct swap_cgroup_ctrl *ctrl;
438 struct swap_cgroup *sc;
439 unsigned short old;
440 unsigned long flags;
441
442 sc = lookup_swap_cgroup(ent, &ctrl);
443
444 spin_lock_irqsave(&ctrl->lock, flags);
445 old = sc->id;
446 sc->id = id;
447 spin_unlock_irqrestore(&ctrl->lock, flags);
448
449 return old;
450}
451
452/**
453 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
454 * @ent: swap entry to be looked up.
455 *
456 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
457 */
458unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
459{
460 return lookup_swap_cgroup(ent, NULL)->id;
461}
462
463int swap_cgroup_swapon(int type, unsigned long max_pages)
464{
465 void *array;
466 unsigned long array_size;
467 unsigned long length;
468 struct swap_cgroup_ctrl *ctrl;
469
470 if (!do_swap_account)
471 return 0;
472
473 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
474 array_size = length * sizeof(void *);
475
476 array = vzalloc(array_size);
477 if (!array)
478 goto nomem;
479
480 ctrl = &swap_cgroup_ctrl[type];
481 mutex_lock(&swap_cgroup_mutex);
482 ctrl->length = length;
483 ctrl->map = array;
484 spin_lock_init(&ctrl->lock);
485 if (swap_cgroup_prepare(type)) {
486 /* memory shortage */
487 ctrl->map = NULL;
488 ctrl->length = 0;
489 mutex_unlock(&swap_cgroup_mutex);
490 vfree(array);
491 goto nomem;
492 }
493 mutex_unlock(&swap_cgroup_mutex);
494
495 return 0;
496nomem:
497 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
498 printk(KERN_INFO
499 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
500 return -ENOMEM;
501}
502
503void swap_cgroup_swapoff(int type)
504{
505 struct page **map;
506 unsigned long i, length;
507 struct swap_cgroup_ctrl *ctrl;
508
509 if (!do_swap_account)
510 return;
511
512 mutex_lock(&swap_cgroup_mutex);
513 ctrl = &swap_cgroup_ctrl[type];
514 map = ctrl->map;
515 length = ctrl->length;
516 ctrl->map = NULL;
517 ctrl->length = 0;
518 mutex_unlock(&swap_cgroup_mutex);
519
520 if (map) {
521 for (i = 0; i < length; i++) {
522 struct page *page = map[i];
523 if (page)
524 __free_page(page);
525 }
526 vfree(map);
527 }
528}
529
530#endif
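For scale: struct swap_cgroup is a single unsigned short, so SC_PER_PAGE is PAGE_SIZE / sizeof(unsigned short) = 2048 entries per map page with 4 KiB pages. A 4 GiB swap device therefore has 1,048,576 slots and needs DIV_ROUND_UP(1048576, 2048) = 512 map pages, about 2 MiB, plus a 4 KiB array of page pointers, allocated at swapon and freed at swapoff. (This swap-ownership code is not going away; it reappears in the new mm/swap_cgroup.c below.)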
diff --git a/mm/page_counter.c b/mm/page_counter.c
new file mode 100644
index 000000000000..a009574fbba9
--- /dev/null
+++ b/mm/page_counter.c
@@ -0,0 +1,192 @@
1/*
2 * Lockless hierarchical page accounting & limiting
3 *
4 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
5 */
6
7#include <linux/page_counter.h>
8#include <linux/atomic.h>
9#include <linux/kernel.h>
10#include <linux/string.h>
11#include <linux/sched.h>
12#include <linux/bug.h>
13#include <asm/page.h>
14
15/**
16 * page_counter_cancel - take pages out of the local counter
17 * @counter: counter
18 * @nr_pages: number of pages to cancel
19 */
20void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
21{
22 long new;
23
24 new = atomic_long_sub_return(nr_pages, &counter->count);
25 /* More uncharges than charges? */
26 WARN_ON_ONCE(new < 0);
27}
28
29/**
30 * page_counter_charge - hierarchically charge pages
31 * @counter: counter
32 * @nr_pages: number of pages to charge
33 *
34 * NOTE: This does not consider any configured counter limits.
35 */
36void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
37{
38 struct page_counter *c;
39
40 for (c = counter; c; c = c->parent) {
41 long new;
42
43 new = atomic_long_add_return(nr_pages, &c->count);
44 /*
45 * This is indeed racy, but we can live with some
46 * inaccuracy in the watermark.
47 */
48 if (new > c->watermark)
49 c->watermark = new;
50 }
51}
52
53/**
54 * page_counter_try_charge - try to hierarchically charge pages
55 * @counter: counter
56 * @nr_pages: number of pages to charge
57 * @fail: points first counter to hit its limit, if any
58 *
59 * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
60 * its ancestors has hit its configured limit.
61 */
62int page_counter_try_charge(struct page_counter *counter,
63 unsigned long nr_pages,
64 struct page_counter **fail)
65{
66 struct page_counter *c;
67
68 for (c = counter; c; c = c->parent) {
69 long new;
70 /*
71 * Charge speculatively to avoid an expensive CAS. If
72 * a bigger charge fails, it might falsely lock out a
73 * racing smaller charge and send it into reclaim
74 * early, but the error is limited to the difference
75 * between the two sizes, which is less than 2M/4M in
76 * case of a THP locking out a regular page charge.
77 *
78 * The atomic_long_add_return() implies a full memory
79 * barrier between incrementing the count and reading
80 * the limit. When racing with page_counter_limit(),
81 * we either see the new limit or the setter sees the
82 * counter has changed and retries.
83 */
84 new = atomic_long_add_return(nr_pages, &c->count);
85 if (new > c->limit) {
86 atomic_long_sub(nr_pages, &c->count);
87 /*
88 * This is racy, but we can live with some
89 * inaccuracy in the failcnt.
90 */
91 c->failcnt++;
92 *fail = c;
93 goto failed;
94 }
95 /*
96 * Just like with failcnt, we can live with some
97 * inaccuracy in the watermark.
98 */
99 if (new > c->watermark)
100 c->watermark = new;
101 }
102 return 0;
103
104failed:
105 for (c = counter; c != *fail; c = c->parent)
106 page_counter_cancel(c, nr_pages);
107
108 return -ENOMEM;
109}
110
111/**
112 * page_counter_uncharge - hierarchically uncharge pages
113 * @counter: counter
114 * @nr_pages: number of pages to uncharge
115 */
116void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
117{
118 struct page_counter *c;
119
120 for (c = counter; c; c = c->parent)
121 page_counter_cancel(c, nr_pages);
122}
123
124/**
125 * page_counter_limit - limit the number of pages allowed
126 * @counter: counter
127 * @limit: limit to set
128 *
129 * Returns 0 on success, -EBUSY if the current number of pages on the
130 * counter already exceeds the specified limit.
131 *
132 * The caller must serialize invocations on the same counter.
133 */
134int page_counter_limit(struct page_counter *counter, unsigned long limit)
135{
136 for (;;) {
137 unsigned long old;
138 long count;
139
140 /*
141 * Update the limit while making sure that it's not
142 * below the concurrently-changing counter value.
143 *
144 * The xchg implies two full memory barriers before
145 * and after, so the read-swap-read is ordered and
146 * ensures coherency with page_counter_try_charge():
147 * that function modifies the count before checking
148 * the limit, so if it sees the old limit, we see the
149 * modified counter and retry.
150 */
151 count = atomic_long_read(&counter->count);
152
153 if (count > limit)
154 return -EBUSY;
155
156 old = xchg(&counter->limit, limit);
157
158 if (atomic_long_read(&counter->count) <= count)
159 return 0;
160
161 counter->limit = old;
162 cond_resched();
163 }
164}
165
166/**
167 * page_counter_memparse - memparse() for page counter limits
168 * @buf: string to parse
169 * @nr_pages: returns the result in number of pages
170 *
171 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
172 * limited to %PAGE_COUNTER_MAX.
173 */
174int page_counter_memparse(const char *buf, unsigned long *nr_pages)
175{
176 char unlimited[] = "-1";
177 char *end;
178 u64 bytes;
179
180 if (!strncmp(buf, unlimited, sizeof(unlimited))) {
181 *nr_pages = PAGE_COUNTER_MAX;
182 return 0;
183 }
184
185 bytes = memparse(buf, &end);
186 if (*end != '\0')
187 return -EINVAL;
188
189 *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
190
191 return 0;
192}
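page_counter replaces res_counter's spinlock-protected, byte-granular counters with lockless, page-granular atomics: charges propagate up the parent chain, try_charge backs out and reports the ancestor that hit its limit, and limit changes use xchg plus a re-check to stay coherent with concurrent charges. A charging site is expected to look roughly like this (a sketch based on the API above; the surrounding memcg plumbing is omitted):

	struct page_counter *fail;
	unsigned long limit;

	if (page_counter_try_charge(&memcg->memory, nr_pages, &fail)) {
		/* 'fail' is the ancestor whose limit was hit; reclaim or bail */
		return -ENOMEM;
	}
	/* ... use the pages ... */
	page_counter_uncharge(&memcg->memory, nr_pages);

	/* limits are stored in pages but configured as byte strings */
	if (!page_counter_memparse(buf, &limit))
		page_counter_limit(&memcg->memory, limit);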
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c8778f7e208e..72f5ac381ab3 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -68,7 +68,7 @@ out:
68 68
69 spin_unlock_irqrestore(&zone->lock, flags); 69 spin_unlock_irqrestore(&zone->lock, flags);
70 if (!ret) 70 if (!ret)
71 drain_all_pages(); 71 drain_all_pages(zone);
72 return ret; 72 return ret;
73} 73}
74 74
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e4c7213210c..45eba36fd673 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1053,7 +1053,7 @@ void page_add_file_rmap(struct page *page)
1053 __inc_zone_page_state(page, NR_FILE_MAPPED); 1053 __inc_zone_page_state(page, NR_FILE_MAPPED);
1054 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1054 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1055 } 1055 }
1056 mem_cgroup_end_page_stat(memcg, locked, flags); 1056 mem_cgroup_end_page_stat(memcg, &locked, &flags);
1057} 1057}
1058 1058
1059static void page_remove_file_rmap(struct page *page) 1059static void page_remove_file_rmap(struct page *page)
@@ -1083,7 +1083,7 @@ static void page_remove_file_rmap(struct page *page)
1083 if (unlikely(PageMlocked(page))) 1083 if (unlikely(PageMlocked(page)))
1084 clear_page_mlock(page); 1084 clear_page_mlock(page);
1085out: 1085out:
1086 mem_cgroup_end_page_stat(memcg, locked, flags); 1086 mem_cgroup_end_page_stat(memcg, &locked, &flags);
1087} 1087}
1088 1088
1089/** 1089/**
diff --git a/mm/slab.c b/mm/slab.c
index f34e053ec46e..79e15f0a2a6e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2590,7 +2590,10 @@ static int cache_grow(struct kmem_cache *cachep,
2590 * Be lazy and only check for valid flags here, keeping it out of the 2590 * Be lazy and only check for valid flags here, keeping it out of the
2591 * critical path in kmem_cache_alloc(). 2591 * critical path in kmem_cache_alloc().
2592 */ 2592 */
2593 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2593 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
2594 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
2595 BUG();
2596 }
2594 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2597 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2595 2598
2596 /* Take the node list lock to change the colour_next on this node */ 2599 /* Take the node list lock to change the colour_next on this node */
@@ -3580,11 +3583,11 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3580 3583
3581 for_each_online_node(node) { 3584 for_each_online_node(node) {
3582 3585
3583 if (use_alien_caches) { 3586 if (use_alien_caches) {
3584 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3587 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3585 if (!new_alien) 3588 if (!new_alien)
3586 goto fail; 3589 goto fail;
3587 } 3590 }
3588 3591
3589 new_shared = NULL; 3592 new_shared = NULL;
3590 if (cachep->shared) { 3593 if (cachep->shared) {
@@ -4043,12 +4046,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4043 4046
4044#ifdef CONFIG_DEBUG_SLAB_LEAK 4047#ifdef CONFIG_DEBUG_SLAB_LEAK
4045 4048
4046static void *leaks_start(struct seq_file *m, loff_t *pos)
4047{
4048 mutex_lock(&slab_mutex);
4049 return seq_list_start(&slab_caches, *pos);
4050}
4051
4052static inline int add_caller(unsigned long *n, unsigned long v) 4049static inline int add_caller(unsigned long *n, unsigned long v)
4053{ 4050{
4054 unsigned long *p; 4051 unsigned long *p;
@@ -4170,7 +4167,7 @@ static int leaks_show(struct seq_file *m, void *p)
4170} 4167}
4171 4168
4172static const struct seq_operations slabstats_op = { 4169static const struct seq_operations slabstats_op = {
4173 .start = leaks_start, 4170 .start = slab_start,
4174 .next = slab_next, 4171 .next = slab_next,
4175 .stop = slab_stop, 4172 .stop = slab_stop,
4176 .show = leaks_show, 4173 .show = leaks_show,
diff --git a/mm/slab.h b/mm/slab.h
index ab019e63e3c2..1cf4005482dd 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -209,15 +209,15 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx)
209 209
210 rcu_read_lock(); 210 rcu_read_lock();
211 params = rcu_dereference(s->memcg_params); 211 params = rcu_dereference(s->memcg_params);
212 cachep = params->memcg_caches[idx];
213 rcu_read_unlock();
214 212
215 /* 213 /*
216 * Make sure we will access the up-to-date value. The code updating 214 * Make sure we will access the up-to-date value. The code updating
217 * memcg_caches issues a write barrier to match this (see 215 * memcg_caches issues a write barrier to match this (see
218 * memcg_register_cache()). 216 * memcg_register_cache()).
219 */ 217 */
220 smp_read_barrier_depends(); 218 cachep = lockless_dereference(params->memcg_caches[idx]);
219 rcu_read_unlock();
220
221 return cachep; 221 return cachep;
222} 222}
223 223
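Replacing the bare read and trailing smp_read_barrier_depends() with lockless_dereference() puts the data-dependency barrier where it belongs: on the load of the pointer, before any dereference and before the RCU read-side unlock, rather than after both. The generic pattern, with the publisher side assumed (cf. the write barrier referenced in the comment):

	rcu_read_lock();
	p = lockless_dereference(gp);	/* load + data-dependency barrier */
	if (p)
		val = p->field;		/* dereference ordered after the load */
	rcu_read_unlock();

	/* publisher (assumed counterpart, e.g. memcg_register_cache()) */
	newp->field = value;
	smp_wmb();
	gp = newp;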
@@ -357,7 +357,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
357 357
358#endif 358#endif
359 359
360void *slab_start(struct seq_file *m, loff_t *pos);
360void *slab_next(struct seq_file *m, void *p, loff_t *pos); 361void *slab_next(struct seq_file *m, void *p, loff_t *pos);
361void slab_stop(struct seq_file *m, void *p); 362void slab_stop(struct seq_file *m, void *p);
363int memcg_slab_show(struct seq_file *m, void *p);
362 364
363#endif /* MM_SLAB_H */ 365#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index dcdab81bd240..e03dd6f2a272 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -240,7 +240,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
240 size = ALIGN(size, align); 240 size = ALIGN(size, align);
241 flags = kmem_cache_flags(size, flags, name, NULL); 241 flags = kmem_cache_flags(size, flags, name, NULL);
242 242
243 list_for_each_entry(s, &slab_caches, list) { 243 list_for_each_entry_reverse(s, &slab_caches, list) {
244 if (slab_unmergeable(s)) 244 if (slab_unmergeable(s))
245 continue; 245 continue;
246 246
@@ -811,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
811#define SLABINFO_RIGHTS S_IRUSR 811#define SLABINFO_RIGHTS S_IRUSR
812#endif 812#endif
813 813
814void print_slabinfo_header(struct seq_file *m) 814static void print_slabinfo_header(struct seq_file *m)
815{ 815{
816 /* 816 /*
817 * Output format version, so at least we can change it 817 * Output format version, so at least we can change it
@@ -834,14 +834,9 @@ void print_slabinfo_header(struct seq_file *m)
834 seq_putc(m, '\n'); 834 seq_putc(m, '\n');
835} 835}
836 836
837static void *s_start(struct seq_file *m, loff_t *pos) 837void *slab_start(struct seq_file *m, loff_t *pos)
838{ 838{
839 loff_t n = *pos;
840
841 mutex_lock(&slab_mutex); 839 mutex_lock(&slab_mutex);
842 if (!n)
843 print_slabinfo_header(m);
844
845 return seq_list_start(&slab_caches, *pos); 840 return seq_list_start(&slab_caches, *pos);
846} 841}
847 842
@@ -881,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
881 } 876 }
882} 877}
883 878
884int cache_show(struct kmem_cache *s, struct seq_file *m) 879static void cache_show(struct kmem_cache *s, struct seq_file *m)
885{ 880{
886 struct slabinfo sinfo; 881 struct slabinfo sinfo;
887 882
@@ -900,17 +895,32 @@ int cache_show(struct kmem_cache *s, struct seq_file *m)
900 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); 895 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
901 slabinfo_show_stats(m, s); 896 slabinfo_show_stats(m, s);
902 seq_putc(m, '\n'); 897 seq_putc(m, '\n');
898}
899
900static int slab_show(struct seq_file *m, void *p)
901{
902 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
903
904 if (p == slab_caches.next)
905 print_slabinfo_header(m);
906 if (is_root_cache(s))
907 cache_show(s, m);
903 return 0; 908 return 0;
904} 909}
905 910
906static int s_show(struct seq_file *m, void *p) 911#ifdef CONFIG_MEMCG_KMEM
912int memcg_slab_show(struct seq_file *m, void *p)
907{ 913{
908 struct kmem_cache *s = list_entry(p, struct kmem_cache, list); 914 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
915 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
909 916
910 if (!is_root_cache(s)) 917 if (p == slab_caches.next)
911 return 0; 918 print_slabinfo_header(m);
912 return cache_show(s, m); 919 if (!is_root_cache(s) && s->memcg_params->memcg == memcg)
920 cache_show(s, m);
921 return 0;
913} 922}
923#endif
914 924
915/* 925/*
916 * slabinfo_op - iterator that generates /proc/slabinfo 926 * slabinfo_op - iterator that generates /proc/slabinfo
@@ -926,10 +936,10 @@ static int s_show(struct seq_file *m, void *p)
926 * + further values on SMP and with statistics enabled 936 * + further values on SMP and with statistics enabled
927 */ 937 */
928static const struct seq_operations slabinfo_op = { 938static const struct seq_operations slabinfo_op = {
929 .start = s_start, 939 .start = slab_start,
930 .next = slab_next, 940 .next = slab_next,
931 .stop = slab_stop, 941 .stop = slab_stop,
932 .show = s_show, 942 .show = slab_show,
933}; 943};
934 944
935static int slabinfo_open(struct inode *inode, struct file *file) 945static int slabinfo_open(struct inode *inode, struct file *file)
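The slab_common.c change above moves the /proc/slabinfo header out of the seq_file start callback: slab_show() and the new memcg_slab_show() print it only when the element being shown is the first entry of slab_caches (the "p == slab_caches.next" test). A minimal userspace sketch of that header-on-first-element pattern follows; the struct, array, and function names are hypothetical stand-ins, not the kernel's seq_file API.

#include <stdio.h>

/* Hypothetical stand-in for a kmem_cache list entry. */
struct cache { const char *name; unsigned long active_objs; };

static struct cache caches[] = {
        { "kmalloc-64",  1024 },
        { "kmalloc-128",  512 },
};

/* Analogue of slab_show(): emit the header only for the first element,
 * so a restart at a non-zero position never repeats it. */
static int cache_show(const struct cache *c)
{
        if (c == &caches[0])
                printf("# name            <active_objs>\n");
        printf("%-17s %lu\n", c->name, c->active_objs);
        return 0;
}

int main(void)
{
        for (unsigned long i = 0; i < sizeof(caches) / sizeof(caches[0]); i++)
                cache_show(&caches[i]);
        return 0;
}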
diff --git a/mm/slub.c b/mm/slub.c
index ae7b9f1ad394..386bbed76e94 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -849,12 +849,12 @@ static int check_slab(struct kmem_cache *s, struct page *page)
849 maxobj = order_objects(compound_order(page), s->size, s->reserved); 849 maxobj = order_objects(compound_order(page), s->size, s->reserved);
850 if (page->objects > maxobj) { 850 if (page->objects > maxobj) {
851 slab_err(s, page, "objects %u > max %u", 851 slab_err(s, page, "objects %u > max %u",
852 s->name, page->objects, maxobj); 852 page->objects, maxobj);
853 return 0; 853 return 0;
854 } 854 }
855 if (page->inuse > page->objects) { 855 if (page->inuse > page->objects) {
856 slab_err(s, page, "inuse %u > max %u", 856 slab_err(s, page, "inuse %u > max %u",
857 s->name, page->inuse, page->objects); 857 page->inuse, page->objects);
858 return 0; 858 return 0;
859 } 859 }
860 /* Slab_pad_check fixes things up after itself */ 860 /* Slab_pad_check fixes things up after itself */
@@ -871,7 +871,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
871 int nr = 0; 871 int nr = 0;
872 void *fp; 872 void *fp;
873 void *object = NULL; 873 void *object = NULL;
874 unsigned long max_objects; 874 int max_objects;
875 875
876 fp = page->freelist; 876 fp = page->freelist;
877 while (fp && nr <= page->objects) { 877 while (fp && nr <= page->objects) {
@@ -1377,7 +1377,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1377 int order; 1377 int order;
1378 int idx; 1378 int idx;
1379 1379
1380 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1380 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1381 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
1382 BUG();
1383 }
1381 1384
1382 page = allocate_slab(s, 1385 page = allocate_slab(s,
1383 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1386 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -2554,7 +2557,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2554 2557
2555 } else { /* Needs to be taken off a list */ 2558 } else { /* Needs to be taken off a list */
2556 2559
2557 n = get_node(s, page_to_nid(page)); 2560 n = get_node(s, page_to_nid(page));
2558 /* 2561 /*
2559 * Speculatively acquire the list_lock. 2562 * Speculatively acquire the list_lock.
2560 * If the cmpxchg does not succeed then we may 2563 * If the cmpxchg does not succeed then we may
@@ -2587,10 +2590,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2587 * The list lock was not taken therefore no list 2590 * The list lock was not taken therefore no list
2588 * activity can be necessary. 2591 * activity can be necessary.
2589 */ 2592 */
2590 if (was_frozen) 2593 if (was_frozen)
2591 stat(s, FREE_FROZEN); 2594 stat(s, FREE_FROZEN);
2592 return; 2595 return;
2593 } 2596 }
2594 2597
2595 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) 2598 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
2596 goto slab_empty; 2599 goto slab_empty;
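In the mm/slub.c hunk above, the bare BUG_ON(flags & GFP_SLAB_BUG_MASK) becomes a branch that first logs the offending gfp bits with pr_emerg() and only then calls BUG(), so the bad flags reach the console before the machine stops. A rough userspace analogue of report-then-abort, with a placeholder mask value rather than the kernel's:

#include <stdio.h>
#include <stdlib.h>

#define SLAB_BUG_MASK 0xff000000u       /* placeholder, not the kernel's GFP_SLAB_BUG_MASK */

/* Print which forbidden bits were passed before giving up,
 * instead of failing without a clue. */
static void check_alloc_flags(unsigned int flags)
{
        if (flags & SLAB_BUG_MASK) {
                fprintf(stderr, "bad alloc flags: 0x%x\n", flags & SLAB_BUG_MASK);
                abort();        /* BUG() in the kernel patch */
        }
}

int main(void)
{
        check_alloc_flags(0x00000010u); /* fine */
        check_alloc_flags(0x01000000u); /* reports the stray bit, then aborts */
        return 0;
}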
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
new file mode 100644
index 000000000000..b5f7f24b8dd1
--- /dev/null
+++ b/mm/swap_cgroup.c
@@ -0,0 +1,208 @@
1#include <linux/swap_cgroup.h>
2#include <linux/vmalloc.h>
3#include <linux/mm.h>
4
5#include <linux/swapops.h> /* depends on mm.h include */
6
7static DEFINE_MUTEX(swap_cgroup_mutex);
8struct swap_cgroup_ctrl {
9 struct page **map;
10 unsigned long length;
11 spinlock_t lock;
12};
13
14static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
15
16struct swap_cgroup {
17 unsigned short id;
18};
19#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
20
21/*
22 * SwapCgroup implements "lookup" and "exchange" operations.
23 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
24 * against SwapCache. At swap_free(), this is accessed directly from swap.
25 *
26 * This means,
27 * - we have no race in "exchange" when we're accessed via SwapCache because
28 * SwapCache(and its swp_entry) is under lock.
29 * - When called via swap_free(), there is no user of this entry and no race.
30 * Then, we don't need lock around "exchange".
31 *
32 * TODO: we can push these buffers out to HIGHMEM.
33 */
34
35/*
36 * allocate buffer for swap_cgroup.
37 */
38static int swap_cgroup_prepare(int type)
39{
40 struct page *page;
41 struct swap_cgroup_ctrl *ctrl;
42 unsigned long idx, max;
43
44 ctrl = &swap_cgroup_ctrl[type];
45
46 for (idx = 0; idx < ctrl->length; idx++) {
47 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
48 if (!page)
49 goto not_enough_page;
50 ctrl->map[idx] = page;
51 }
52 return 0;
53not_enough_page:
54 max = idx;
55 for (idx = 0; idx < max; idx++)
56 __free_page(ctrl->map[idx]);
57
58 return -ENOMEM;
59}
60
61static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
62 struct swap_cgroup_ctrl **ctrlp)
63{
64 pgoff_t offset = swp_offset(ent);
65 struct swap_cgroup_ctrl *ctrl;
66 struct page *mappage;
67 struct swap_cgroup *sc;
68
69 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
70 if (ctrlp)
71 *ctrlp = ctrl;
72
73 mappage = ctrl->map[offset / SC_PER_PAGE];
74 sc = page_address(mappage);
75 return sc + offset % SC_PER_PAGE;
76}
77
78/**
79 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
80 * @ent: swap entry to be cmpxchged
81 * @old: old id
82 * @new: new id
83 *
84 * Returns old id at success, 0 at failure.
85 * (There is no mem_cgroup using 0 as its id)
86 */
87unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
88 unsigned short old, unsigned short new)
89{
90 struct swap_cgroup_ctrl *ctrl;
91 struct swap_cgroup *sc;
92 unsigned long flags;
93 unsigned short retval;
94
95 sc = lookup_swap_cgroup(ent, &ctrl);
96
97 spin_lock_irqsave(&ctrl->lock, flags);
98 retval = sc->id;
99 if (retval == old)
100 sc->id = new;
101 else
102 retval = 0;
103 spin_unlock_irqrestore(&ctrl->lock, flags);
104 return retval;
105}
106
107/**
108 * swap_cgroup_record - record mem_cgroup for this swp_entry.
109 * @ent: swap entry to be recorded into
110 * @id: mem_cgroup to be recorded
111 *
112 * Returns old value at success, 0 at failure.
113 * (Of course, old value can be 0.)
114 */
115unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
116{
117 struct swap_cgroup_ctrl *ctrl;
118 struct swap_cgroup *sc;
119 unsigned short old;
120 unsigned long flags;
121
122 sc = lookup_swap_cgroup(ent, &ctrl);
123
124 spin_lock_irqsave(&ctrl->lock, flags);
125 old = sc->id;
126 sc->id = id;
127 spin_unlock_irqrestore(&ctrl->lock, flags);
128
129 return old;
130}
131
132/**
133 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
134 * @ent: swap entry to be looked up.
135 *
136 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
137 */
138unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
139{
140 return lookup_swap_cgroup(ent, NULL)->id;
141}
142
143int swap_cgroup_swapon(int type, unsigned long max_pages)
144{
145 void *array;
146 unsigned long array_size;
147 unsigned long length;
148 struct swap_cgroup_ctrl *ctrl;
149
150 if (!do_swap_account)
151 return 0;
152
153 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
154 array_size = length * sizeof(void *);
155
156 array = vzalloc(array_size);
157 if (!array)
158 goto nomem;
159
160 ctrl = &swap_cgroup_ctrl[type];
161 mutex_lock(&swap_cgroup_mutex);
162 ctrl->length = length;
163 ctrl->map = array;
164 spin_lock_init(&ctrl->lock);
165 if (swap_cgroup_prepare(type)) {
166 /* memory shortage */
167 ctrl->map = NULL;
168 ctrl->length = 0;
169 mutex_unlock(&swap_cgroup_mutex);
170 vfree(array);
171 goto nomem;
172 }
173 mutex_unlock(&swap_cgroup_mutex);
174
175 return 0;
176nomem:
177 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
178 printk(KERN_INFO
179 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
180 return -ENOMEM;
181}
182
183void swap_cgroup_swapoff(int type)
184{
185 struct page **map;
186 unsigned long i, length;
187 struct swap_cgroup_ctrl *ctrl;
188
189 if (!do_swap_account)
190 return;
191
192 mutex_lock(&swap_cgroup_mutex);
193 ctrl = &swap_cgroup_ctrl[type];
194 map = ctrl->map;
195 length = ctrl->length;
196 ctrl->map = NULL;
197 ctrl->length = 0;
198 mutex_unlock(&swap_cgroup_mutex);
199
200 if (map) {
201 for (i = 0; i < length; i++) {
202 struct page *page = map[i];
203 if (page)
204 __free_page(page);
205 }
206 vfree(map);
207 }
208}
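The new mm/swap_cgroup.c above stores one unsigned short cgroup id per swap slot in page-sized chunks: ctrl->map[] holds the chunk pointers, and lookup_swap_cgroup() picks a chunk with offset / SC_PER_PAGE and an entry within it with offset % SC_PER_PAGE. A small userspace sketch of that chunked-array layout; the type and function names are hypothetical, only the sizing and index arithmetic mirror the patch.

#include <stdlib.h>

#define CHUNK_BYTES 4096UL                                      /* stands in for PAGE_SIZE */
#define IDS_PER_CHUNK (CHUNK_BYTES / sizeof(unsigned short))    /* like SC_PER_PAGE */

struct id_map {
        unsigned short **chunks;        /* ctrl->map in the patch */
        unsigned long nchunks;          /* ctrl->length */
};

/* Analogue of swap_cgroup_swapon() + swap_cgroup_prepare(): size the
 * chunk table for max_slots entries and allocate zeroed chunks. */
static int id_map_init(struct id_map *m, unsigned long max_slots)
{
        m->nchunks = (max_slots + IDS_PER_CHUNK - 1) / IDS_PER_CHUNK;
        m->chunks = calloc(m->nchunks, sizeof(*m->chunks));
        if (!m->chunks)
                return -1;
        for (unsigned long i = 0; i < m->nchunks; i++) {
                m->chunks[i] = calloc(IDS_PER_CHUNK, sizeof(unsigned short));
                if (!m->chunks[i])
                        return -1;      /* the kernel version unwinds its allocations here */
        }
        return 0;
}

/* Analogue of lookup_swap_cgroup(): chunk index, then offset within the chunk. */
static unsigned short *id_map_lookup(struct id_map *m, unsigned long slot)
{
        return &m->chunks[slot / IDS_PER_CHUNK][slot % IDS_PER_CHUNK];
}

int main(void)
{
        struct id_map m;

        if (id_map_init(&m, 100000) == 0)
                *id_map_lookup(&m, 12345) = 7;  /* record an id, as swap_cgroup_record() would */
        return 0;
}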
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 154444918685..9711342987a0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,7 +17,6 @@
17#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/migrate.h> 19#include <linux/migrate.h>
20#include <linux/page_cgroup.h>
21 20
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
23 22
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8798b2e0ac59..63f55ccb9b26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,7 +38,7 @@
38#include <asm/pgtable.h> 38#include <asm/pgtable.h>
39#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
40#include <linux/swapops.h> 40#include <linux/swapops.h>
41#include <linux/page_cgroup.h> 41#include <linux/swap_cgroup.h>
42 42
43static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 43static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
44 unsigned char); 44 unsigned char);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 90520af7f186..8a18196fcdff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -463,8 +463,7 @@ overflow:
463 goto retry; 463 goto retry;
464 } 464 }
465 if (printk_ratelimit()) 465 if (printk_ratelimit())
466 printk(KERN_WARNING 466 pr_warn("vmap allocation for size %lu failed: "
467 "vmap allocation for size %lu failed: "
468 "use vmalloc=<size> to increase size.\n", size); 467 "use vmalloc=<size> to increase size.\n", size);
469 kfree(va); 468 kfree(va);
470 return ERR_PTR(-EBUSY); 469 return ERR_PTR(-EBUSY);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcb47074ae03..4636d9e822c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -260,8 +260,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
260 do_div(delta, lru_pages + 1); 260 do_div(delta, lru_pages + 1);
261 total_scan += delta; 261 total_scan += delta;
262 if (total_scan < 0) { 262 if (total_scan < 0) {
263 printk(KERN_ERR 263 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
264 "shrink_slab: %pF negative objects to delete nr=%ld\n",
265 shrinker->scan_objects, total_scan); 264 shrinker->scan_objects, total_scan);
266 total_scan = freeable; 265 total_scan = freeable;
267 } 266 }
@@ -875,7 +874,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
875 * end of the LRU a second time. 874 * end of the LRU a second time.
876 */ 875 */
877 mapping = page_mapping(page); 876 mapping = page_mapping(page);
878 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || 877 if (((dirty || writeback) && mapping &&
878 bdi_write_congested(mapping->backing_dev_info)) ||
879 (writeback && PageReclaim(page))) 879 (writeback && PageReclaim(page)))
880 nr_congested++; 880 nr_congested++;
881 881
@@ -2249,7 +2249,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
2249 return true; 2249 return true;
2250 2250
2251 /* If compaction would go ahead or the allocation would succeed, stop */ 2251 /* If compaction would go ahead or the allocation would succeed, stop */
2252 switch (compaction_suitable(zone, sc->order)) { 2252 switch (compaction_suitable(zone, sc->order, 0, 0)) {
2253 case COMPACT_PARTIAL: 2253 case COMPACT_PARTIAL:
2254 case COMPACT_CONTINUE: 2254 case COMPACT_CONTINUE:
2255 return false; 2255 return false;
@@ -2346,7 +2346,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
2346 * If compaction is not ready to start and allocation is not likely 2346 * If compaction is not ready to start and allocation is not likely
2347 * to succeed without it, then keep reclaiming. 2347 * to succeed without it, then keep reclaiming.
2348 */ 2348 */
2349 if (compaction_suitable(zone, order) == COMPACT_SKIPPED) 2349 if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED)
2350 return false; 2350 return false;
2351 2351
2352 return watermark_ok; 2352 return watermark_ok;
@@ -2824,8 +2824,8 @@ static bool zone_balanced(struct zone *zone, int order,
2824 balance_gap, classzone_idx, 0)) 2824 balance_gap, classzone_idx, 0))
2825 return false; 2825 return false;
2826 2826
2827 if (IS_ENABLED(CONFIG_COMPACTION) && order && 2827 if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
2828 compaction_suitable(zone, order) == COMPACT_SKIPPED) 2828 order, 0, classzone_idx) == COMPACT_SKIPPED)
2829 return false; 2829 return false;
2830 2830
2831 return true; 2831 return true;
@@ -2952,8 +2952,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
2952 * from memory. Do not reclaim more than needed for compaction. 2952 * from memory. Do not reclaim more than needed for compaction.
2953 */ 2953 */
2954 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 2954 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2955 compaction_suitable(zone, sc->order) != 2955 compaction_suitable(zone, sc->order, 0, classzone_idx)
2956 COMPACT_SKIPPED) 2956 != COMPACT_SKIPPED)
2957 testorder = 0; 2957 testorder = 0;
2958 2958
2959 /* 2959 /*
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 1d191357bf88..272327134a1b 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -9,13 +9,13 @@
9int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 9int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
10{ 10{
11 /* 11 /*
12 * The root cgroup does not use res_counters, but rather, 12 * The root cgroup does not use page_counters, but rather,
13 * rely on the data already collected by the network 13 * rely on the data already collected by the network
14 * subsystem 14 * subsystem
15 */ 15 */
16 struct res_counter *res_parent = NULL;
17 struct cg_proto *cg_proto, *parent_cg;
18 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 16 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
17 struct page_counter *counter_parent = NULL;
18 struct cg_proto *cg_proto, *parent_cg;
19 19
20 cg_proto = tcp_prot.proto_cgroup(memcg); 20 cg_proto = tcp_prot.proto_cgroup(memcg);
21 if (!cg_proto) 21 if (!cg_proto)
@@ -29,9 +29,9 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
29 29
30 parent_cg = tcp_prot.proto_cgroup(parent); 30 parent_cg = tcp_prot.proto_cgroup(parent);
31 if (parent_cg) 31 if (parent_cg)
32 res_parent = &parent_cg->memory_allocated; 32 counter_parent = &parent_cg->memory_allocated;
33 33
34 res_counter_init(&cg_proto->memory_allocated, res_parent); 34 page_counter_init(&cg_proto->memory_allocated, counter_parent);
35 percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); 35 percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
36 36
37 return 0; 37 return 0;
@@ -50,7 +50,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
50} 50}
51EXPORT_SYMBOL(tcp_destroy_cgroup); 51EXPORT_SYMBOL(tcp_destroy_cgroup);
52 52
53static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) 53static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
54{ 54{
55 struct cg_proto *cg_proto; 55 struct cg_proto *cg_proto;
56 int i; 56 int i;
@@ -60,20 +60,17 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
60 if (!cg_proto) 60 if (!cg_proto)
61 return -EINVAL; 61 return -EINVAL;
62 62
63 if (val > RES_COUNTER_MAX) 63 ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages);
64 val = RES_COUNTER_MAX;
65
66 ret = res_counter_set_limit(&cg_proto->memory_allocated, val);
67 if (ret) 64 if (ret)
68 return ret; 65 return ret;
69 66
70 for (i = 0; i < 3; i++) 67 for (i = 0; i < 3; i++)
71 cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, 68 cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
72 sysctl_tcp_mem[i]); 69 sysctl_tcp_mem[i]);
73 70
74 if (val == RES_COUNTER_MAX) 71 if (nr_pages == PAGE_COUNTER_MAX)
75 clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); 72 clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
76 else if (val != RES_COUNTER_MAX) { 73 else {
77 /* 74 /*
78 * The active bit needs to be written after the static_key 75 * The active bit needs to be written after the static_key
79 * update. This is what guarantees that the socket activation 76 * update. This is what guarantees that the socket activation
@@ -102,11 +99,20 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
102 return 0; 99 return 0;
103} 100}
104 101
102enum {
103 RES_USAGE,
104 RES_LIMIT,
105 RES_MAX_USAGE,
106 RES_FAILCNT,
107};
108
109static DEFINE_MUTEX(tcp_limit_mutex);
110
105static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, 111static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
106 char *buf, size_t nbytes, loff_t off) 112 char *buf, size_t nbytes, loff_t off)
107{ 113{
108 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 114 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
109 unsigned long long val; 115 unsigned long nr_pages;
110 int ret = 0; 116 int ret = 0;
111 117
112 buf = strstrip(buf); 118 buf = strstrip(buf);
@@ -114,10 +120,12 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
114 switch (of_cft(of)->private) { 120 switch (of_cft(of)->private) {
115 case RES_LIMIT: 121 case RES_LIMIT:
116 /* see memcontrol.c */ 122 /* see memcontrol.c */
117 ret = res_counter_memparse_write_strategy(buf, &val); 123 ret = page_counter_memparse(buf, &nr_pages);
118 if (ret) 124 if (ret)
119 break; 125 break;
120 ret = tcp_update_limit(memcg, val); 126 mutex_lock(&tcp_limit_mutex);
127 ret = tcp_update_limit(memcg, nr_pages);
128 mutex_unlock(&tcp_limit_mutex);
121 break; 129 break;
122 default: 130 default:
123 ret = -EINVAL; 131 ret = -EINVAL;
@@ -126,43 +134,36 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
126 return ret ?: nbytes; 134 return ret ?: nbytes;
127} 135}
128 136
129static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
130{
131 struct cg_proto *cg_proto;
132
133 cg_proto = tcp_prot.proto_cgroup(memcg);
134 if (!cg_proto)
135 return default_val;
136
137 return res_counter_read_u64(&cg_proto->memory_allocated, type);
138}
139
140static u64 tcp_read_usage(struct mem_cgroup *memcg)
141{
142 struct cg_proto *cg_proto;
143
144 cg_proto = tcp_prot.proto_cgroup(memcg);
145 if (!cg_proto)
146 return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
147
148 return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);
149}
150
151static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) 137static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
152{ 138{
153 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 139 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
140 struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
154 u64 val; 141 u64 val;
155 142
156 switch (cft->private) { 143 switch (cft->private) {
157 case RES_LIMIT: 144 case RES_LIMIT:
158 val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); 145 if (!cg_proto)
146 return PAGE_COUNTER_MAX;
147 val = cg_proto->memory_allocated.limit;
148 val *= PAGE_SIZE;
159 break; 149 break;
160 case RES_USAGE: 150 case RES_USAGE:
161 val = tcp_read_usage(memcg); 151 if (!cg_proto)
152 val = atomic_long_read(&tcp_memory_allocated);
153 else
154 val = page_counter_read(&cg_proto->memory_allocated);
155 val *= PAGE_SIZE;
162 break; 156 break;
163 case RES_FAILCNT: 157 case RES_FAILCNT:
158 if (!cg_proto)
159 return 0;
160 val = cg_proto->memory_allocated.failcnt;
161 break;
164 case RES_MAX_USAGE: 162 case RES_MAX_USAGE:
165 val = tcp_read_stat(memcg, cft->private, 0); 163 if (!cg_proto)
164 return 0;
165 val = cg_proto->memory_allocated.watermark;
166 val *= PAGE_SIZE;
166 break; 167 break;
167 default: 168 default:
168 BUG(); 169 BUG();
@@ -183,10 +184,10 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
183 184
184 switch (of_cft(of)->private) { 185 switch (of_cft(of)->private) {
185 case RES_MAX_USAGE: 186 case RES_MAX_USAGE:
186 res_counter_reset_max(&cg_proto->memory_allocated); 187 page_counter_reset_watermark(&cg_proto->memory_allocated);
187 break; 188 break;
188 case RES_FAILCNT: 189 case RES_FAILCNT:
189 res_counter_reset_failcnt(&cg_proto->memory_allocated); 190 cg_proto->memory_allocated.failcnt = 0;
190 break; 191 break;
191 } 192 }
192 193
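After the res_counter to page_counter conversion above, tcp_cgroup_read() pulls limit, watermark and failcnt straight out of cg_proto->memory_allocated (and page_counter_read() for usage), scaling page counts to bytes with PAGE_SIZE. A rough sketch of that read side; the field names are copied from the hunk, everything else (the struct layout, the helper, the sample values) is hypothetical.

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* placeholder for the architecture's page size */

/* Stand-in for struct page_counter, reduced to the fields the hunk touches. */
struct page_counter {
        long count;                     /* current usage, what page_counter_read() reports */
        unsigned long limit;            /* RES_LIMIT */
        unsigned long watermark;        /* RES_MAX_USAGE */
        unsigned long failcnt;          /* RES_FAILCNT */
};

static unsigned long long pages_to_bytes(unsigned long pages)
{
        return (unsigned long long)pages * PAGE_SIZE;
}

int main(void)
{
        struct page_counter c = { .count = 12, .limit = 64, .watermark = 20, .failcnt = 3 };

        printf("usage=%llu limit=%llu max_usage=%llu failcnt=%lu\n",
               pages_to_bytes(c.count), pages_to_bytes(c.limit),
               pages_to_bytes(c.watermark), c.failcnt);
        return 0;
}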
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 374abf443636..f0bb6d60c07b 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -7,10 +7,11 @@
7 7
8use strict; 8use strict;
9use POSIX; 9use POSIX;
10use File::Basename;
11use Cwd 'abs_path';
10 12
11my $P = $0; 13my $P = $0;
12$P =~ s@(.*)/@@g; 14my $D = dirname(abs_path($P));
13my $D = $1;
14 15
15my $V = '0.32'; 16my $V = '0.32';
16 17
@@ -438,26 +439,29 @@ our $allowed_asm_includes = qr{(?x:
438 439
439# Load common spelling mistakes and build regular expression list. 440# Load common spelling mistakes and build regular expression list.
440my $misspellings; 441my $misspellings;
441my @spelling_list;
442my %spelling_fix; 442my %spelling_fix;
443open(my $spelling, '<', $spelling_file)
444 or die "$P: Can't open $spelling_file for reading: $!\n";
445while (<$spelling>) {
446 my $line = $_;
447 443
448 $line =~ s/\s*\n?$//g; 444if (open(my $spelling, '<', $spelling_file)) {
449 $line =~ s/^\s*//g; 445 my @spelling_list;
446 while (<$spelling>) {
447 my $line = $_;
450 448
451 next if ($line =~ m/^\s*#/); 449 $line =~ s/\s*\n?$//g;
452 next if ($line =~ m/^\s*$/); 450 $line =~ s/^\s*//g;
453 451
454 my ($suspect, $fix) = split(/\|\|/, $line); 452 next if ($line =~ m/^\s*#/);
453 next if ($line =~ m/^\s*$/);
455 454
456 push(@spelling_list, $suspect); 455 my ($suspect, $fix) = split(/\|\|/, $line);
457 $spelling_fix{$suspect} = $fix; 456
457 push(@spelling_list, $suspect);
458 $spelling_fix{$suspect} = $fix;
459 }
460 close($spelling);
461 $misspellings = join("|", @spelling_list);
462} else {
463 warn "No typos will be found - file '$spelling_file': $!\n";
458} 464}
459close($spelling);
460$misspellings = join("|", @spelling_list);
461 465
462sub build_types { 466sub build_types {
463 my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; 467 my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)";
@@ -942,7 +946,7 @@ sub sanitise_line {
942sub get_quoted_string { 946sub get_quoted_string {
943 my ($line, $rawline) = @_; 947 my ($line, $rawline) = @_;
944 948
945 return "" if ($line !~ m/(\"[X]+\")/g); 949 return "" if ($line !~ m/(\"[X\t]+\")/g);
946 return substr($rawline, $-[0], $+[0] - $-[0]); 950 return substr($rawline, $-[0], $+[0] - $-[0]);
947} 951}
948 952
@@ -1843,6 +1847,7 @@ sub process {
1843 my $non_utf8_charset = 0; 1847 my $non_utf8_charset = 0;
1844 1848
1845 my $last_blank_line = 0; 1849 my $last_blank_line = 0;
1850 my $last_coalesced_string_linenr = -1;
1846 1851
1847 our @report = (); 1852 our @report = ();
1848 our $cnt_lines = 0; 1853 our $cnt_lines = 0;
@@ -2078,6 +2083,12 @@ sub process {
2078 $in_commit_log = 0; 2083 $in_commit_log = 0;
2079 } 2084 }
2080 2085
2086# Check if MAINTAINERS is being updated. If so, there's probably no need to
2087# emit the "does MAINTAINERS need updating?" message on file add/move/delete
2088 if ($line =~ /^\s*MAINTAINERS\s*\|/) {
2089 $reported_maintainer_file = 1;
2090 }
2091
2081# Check signature styles 2092# Check signature styles
2082 if (!$in_header_lines && 2093 if (!$in_header_lines &&
2083 $line =~ /^(\s*)([a-z0-9_-]+by:|$signature_tags)(\s*)(.*)/i) { 2094 $line =~ /^(\s*)([a-z0-9_-]+by:|$signature_tags)(\s*)(.*)/i) {
@@ -2246,7 +2257,7 @@ sub process {
2246 } 2257 }
2247 2258
2248# Check for various typo / spelling mistakes 2259# Check for various typo / spelling mistakes
2249 if ($in_commit_log || $line =~ /^\+/) { 2260 if (defined($misspellings) && ($in_commit_log || $line =~ /^\+/)) {
2250 while ($rawline =~ /(?:^|[^a-z@])($misspellings)(?:$|[^a-z@])/gi) { 2261 while ($rawline =~ /(?:^|[^a-z@])($misspellings)(?:$|[^a-z@])/gi) {
2251 my $typo = $1; 2262 my $typo = $1;
2252 my $typo_fix = $spelling_fix{lc($typo)}; 2263 my $typo_fix = $spelling_fix{lc($typo)};
@@ -2403,33 +2414,6 @@ sub process {
2403 "line over $max_line_length characters\n" . $herecurr); 2414 "line over $max_line_length characters\n" . $herecurr);
2404 } 2415 }
2405 2416
2406# Check for user-visible strings broken across lines, which breaks the ability
2407# to grep for the string. Make exceptions when the previous string ends in a
2408# newline (multiple lines in one string constant) or '\t', '\r', ';', or '{'
2409# (common in inline assembly) or is a octal \123 or hexadecimal \xaf value
2410 if ($line =~ /^\+\s*"/ &&
2411 $prevline =~ /"\s*$/ &&
2412 $prevrawline !~ /(?:\\(?:[ntr]|[0-7]{1,3}|x[0-9a-fA-F]{1,2})|;\s*|\{\s*)"\s*$/) {
2413 WARN("SPLIT_STRING",
2414 "quoted string split across lines\n" . $hereprev);
2415 }
2416
2417# check for missing a space in a string concatination
2418 if ($prevrawline =~ /[^\\]\w"$/ && $rawline =~ /^\+[\t ]+"\w/) {
2419 WARN('MISSING_SPACE',
2420 "break quoted strings at a space character\n" . $hereprev);
2421 }
2422
2423# check for spaces before a quoted newline
2424 if ($rawline =~ /^.*\".*\s\\n/) {
2425 if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE",
2426 "unnecessary whitespace before a quoted newline\n" . $herecurr) &&
2427 $fix) {
2428 $fixed[$fixlinenr] =~ s/^(\+.*\".*)\s+\\n/$1\\n/;
2429 }
2430
2431 }
2432
2433# check for adding lines without a newline. 2417# check for adding lines without a newline.
2434 if ($line =~ /^\+/ && defined $lines[$linenr] && $lines[$linenr] =~ /^\\ No newline at end of file/) { 2418 if ($line =~ /^\+/ && defined $lines[$linenr] && $lines[$linenr] =~ /^\\ No newline at end of file/) {
2435 WARN("MISSING_EOF_NEWLINE", 2419 WARN("MISSING_EOF_NEWLINE",
@@ -2515,7 +2499,8 @@ sub process {
2515 } 2499 }
2516 } 2500 }
2517 2501
2518 if ($line =~ /^\+.*\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic|{)/) { 2502 if ($line =~ /^\+.*(\w+\s*)?\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic|[,;\({\[\<\>])/ &&
2503 (!defined($1) || $1 !~ /sizeof\s*/)) {
2519 if (CHK("SPACING", 2504 if (CHK("SPACING",
2520 "No space is necessary after a cast\n" . $herecurr) && 2505 "No space is necessary after a cast\n" . $herecurr) &&
2521 $fix) { 2506 $fix) {
@@ -3563,14 +3548,33 @@ sub process {
3563 } 3548 }
3564 } 3549 }
3565 3550
3566 # , must have a space on the right. 3551 # , must not have a space before and must have a space on the right.
3567 } elsif ($op eq ',') { 3552 } elsif ($op eq ',') {
3553 my $rtrim_before = 0;
3554 my $space_after = 0;
3555 if ($ctx =~ /Wx./) {
3556 if (ERROR("SPACING",
3557 "space prohibited before that '$op' $at\n" . $hereptr)) {
3558 $line_fixed = 1;
3559 $rtrim_before = 1;
3560 }
3561 }
3568 if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { 3562 if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) {
3569 if (ERROR("SPACING", 3563 if (ERROR("SPACING",
3570 "space required after that '$op' $at\n" . $hereptr)) { 3564 "space required after that '$op' $at\n" . $hereptr)) {
3571 $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " ";
3572 $line_fixed = 1; 3565 $line_fixed = 1;
3573 $last_after = $n; 3566 $last_after = $n;
3567 $space_after = 1;
3568 }
3569 }
3570 if ($rtrim_before || $space_after) {
3571 if ($rtrim_before) {
3572 $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]);
3573 } else {
3574 $good = $fix_elements[$n] . trim($fix_elements[$n + 1]);
3575 }
3576 if ($space_after) {
3577 $good .= " ";
3574 } 3578 }
3575 } 3579 }
3576 3580
@@ -3814,9 +3818,27 @@ sub process {
3814# ie: &(foo->bar) should be &foo->bar and *(foo->bar) should be *foo->bar 3818# ie: &(foo->bar) should be &foo->bar and *(foo->bar) should be *foo->bar
3815 3819
3816 while ($line =~ /(?:[^&]&\s*|\*)\(\s*($Ident\s*(?:$Member\s*)+)\s*\)/g) { 3820 while ($line =~ /(?:[^&]&\s*|\*)\(\s*($Ident\s*(?:$Member\s*)+)\s*\)/g) {
3817 CHK("UNNECESSARY_PARENTHESES", 3821 my $var = $1;
3818 "Unnecessary parentheses around $1\n" . $herecurr); 3822 if (CHK("UNNECESSARY_PARENTHESES",
3819 } 3823 "Unnecessary parentheses around $var\n" . $herecurr) &&
3824 $fix) {
3825 $fixed[$fixlinenr] =~ s/\(\s*\Q$var\E\s*\)/$var/;
3826 }
3827 }
3828
3829# check for unnecessary parentheses around function pointer uses
3830# ie: (foo->bar)(); should be foo->bar();
3831# but not "if (foo->bar) (" to avoid some false positives
3832 if ($line =~ /(\bif\s*|)(\(\s*$Ident\s*(?:$Member\s*)+\))[ \t]*\(/ && $1 !~ /^if/) {
3833 my $var = $2;
3834 if (CHK("UNNECESSARY_PARENTHESES",
3835 "Unnecessary parentheses around function pointer $var\n" . $herecurr) &&
3836 $fix) {
3837 my $var2 = deparenthesize($var);
3838 $var2 =~ s/\s//g;
3839 $fixed[$fixlinenr] =~ s/\Q$var\E/$var2/;
3840 }
3841 }
3820 3842
3821#goto labels aren't indented, allow a single space however 3843#goto labels aren't indented, allow a single space however
3822 if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and 3844 if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and
@@ -4056,7 +4078,9 @@ sub process {
4056#Ignore Page<foo> variants 4078#Ignore Page<foo> variants
4057 $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && 4079 $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ &&
4058#Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show) 4080#Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show)
4059 $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/) { 4081 $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/ &&
4082#Ignore some three character SI units explicitly, like MiB and KHz
4083 $var !~ /^(?:[a-z_]*?)_?(?:[KMGT]iB|[KMGT]?Hz)(?:_[a-z_]+)?$/) {
4060 while ($var =~ m{($Ident)}g) { 4084 while ($var =~ m{($Ident)}g) {
4061 my $word = $1; 4085 my $word = $1;
4062 next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/); 4086 next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/);
@@ -4408,12 +4432,85 @@ sub process {
4408 "Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr); 4432 "Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr);
4409 } 4433 }
4410 4434
4435# Check for user-visible strings broken across lines, which breaks the ability
4436# to grep for the string. Make exceptions when the previous string ends in a
4437# newline (multiple lines in one string constant) or '\t', '\r', ';', or '{'
4438# (common in inline assembly) or is a octal \123 or hexadecimal \xaf value
4439 if ($line =~ /^\+\s*"[X\t]*"/ &&
4440 $prevline =~ /"\s*$/ &&
4441 $prevrawline !~ /(?:\\(?:[ntr]|[0-7]{1,3}|x[0-9a-fA-F]{1,2})|;\s*|\{\s*)"\s*$/) {
4442 if (WARN("SPLIT_STRING",
4443 "quoted string split across lines\n" . $hereprev) &&
4444 $fix &&
4445 $prevrawline =~ /^\+.*"\s*$/ &&
4446 $last_coalesced_string_linenr != $linenr - 1) {
4447 my $extracted_string = get_quoted_string($line, $rawline);
4448 my $comma_close = "";
4449 if ($rawline =~ /\Q$extracted_string\E(\s*\)\s*;\s*$|\s*,\s*)/) {
4450 $comma_close = $1;
4451 }
4452
4453 fix_delete_line($fixlinenr - 1, $prevrawline);
4454 fix_delete_line($fixlinenr, $rawline);
4455 my $fixedline = $prevrawline;
4456 $fixedline =~ s/"\s*$//;
4457 $fixedline .= substr($extracted_string, 1) . trim($comma_close);
4458 fix_insert_line($fixlinenr - 1, $fixedline);
4459 $fixedline = $rawline;
4460 $fixedline =~ s/\Q$extracted_string\E\Q$comma_close\E//;
4461 if ($fixedline !~ /\+\s*$/) {
4462 fix_insert_line($fixlinenr, $fixedline);
4463 }
4464 $last_coalesced_string_linenr = $linenr;
4465 }
4466 }
4467
4468# check for missing a space in a string concatenation
4469 if ($prevrawline =~ /[^\\]\w"$/ && $rawline =~ /^\+[\t ]+"\w/) {
4470 WARN('MISSING_SPACE',
4471 "break quoted strings at a space character\n" . $hereprev);
4472 }
4473
4474# check for spaces before a quoted newline
4475 if ($rawline =~ /^.*\".*\s\\n/) {
4476 if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE",
4477 "unnecessary whitespace before a quoted newline\n" . $herecurr) &&
4478 $fix) {
4479 $fixed[$fixlinenr] =~ s/^(\+.*\".*)\s+\\n/$1\\n/;
4480 }
4481
4482 }
4483
4411# concatenated string without spaces between elements 4484# concatenated string without spaces between elements
4412 if ($line =~ /"X+"[A-Z_]+/ || $line =~ /[A-Z_]+"X+"/) { 4485 if ($line =~ /"X+"[A-Z_]+/ || $line =~ /[A-Z_]+"X+"/) {
4413 CHK("CONCATENATED_STRING", 4486 CHK("CONCATENATED_STRING",
4414 "Concatenated strings should use spaces between elements\n" . $herecurr); 4487 "Concatenated strings should use spaces between elements\n" . $herecurr);
4415 } 4488 }
4416 4489
4490# uncoalesced string fragments
4491 if ($line =~ /"X*"\s*"/) {
4492 WARN("STRING_FRAGMENTS",
4493 "Consecutive strings are generally better as a single string\n" . $herecurr);
4494 }
4495
4496# check for %L{u,d,i} in strings
4497 my $string;
4498 while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) {
4499 $string = substr($rawline, $-[1], $+[1] - $-[1]);
4500 $string =~ s/%%/__/g;
4501 if ($string =~ /(?<!%)%L[udi]/) {
4502 WARN("PRINTF_L",
4503 "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr);
4504 last;
4505 }
4506 }
4507
4508# check for line continuations in quoted strings with odd counts of "
4509 if ($rawline =~ /\\$/ && $rawline =~ tr/"/"/ % 2) {
4510 WARN("LINE_CONTINUATIONS",
4511 "Avoid line continuations in quoted strings\n" . $herecurr);
4512 }
4513
4417# warn about #if 0 4514# warn about #if 0
4418 if ($line =~ /^.\s*\#\s*if\s+0\b/) { 4515 if ($line =~ /^.\s*\#\s*if\s+0\b/) {
4419 CHK("REDUNDANT_CODE", 4516 CHK("REDUNDANT_CODE",
@@ -4426,7 +4523,7 @@ sub process {
4426 my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;'; 4523 my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;';
4427 if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) { 4524 if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) {
4428 WARN('NEEDLESS_IF', 4525 WARN('NEEDLESS_IF',
4429 "$1(NULL) is safe this check is probably not required\n" . $hereprev); 4526 "$1(NULL) is safe and this check is probably not required\n" . $hereprev);
4430 } 4527 }
4431 } 4528 }
4432 4529
@@ -4458,6 +4555,28 @@ sub process {
4458 } 4555 }
4459 } 4556 }
4460 4557
4558# check for mask then right shift without a parentheses
4559 if ($^V && $^V ge 5.10.0 &&
4560 $line =~ /$LvalOrFunc\s*\&\s*($LvalOrFunc)\s*>>/ &&
4561 $4 !~ /^\&/) { # $LvalOrFunc may be &foo, ignore if so
4562 WARN("MASK_THEN_SHIFT",
4563 "Possible precedence defect with mask then right shift - may need parentheses\n" . $herecurr);
4564 }
4565
4566# check for pointer comparisons to NULL
4567 if ($^V && $^V ge 5.10.0) {
4568 while ($line =~ /\b$LvalOrFunc\s*(==|\!=)\s*NULL\b/g) {
4569 my $val = $1;
4570 my $equal = "!";
4571 $equal = "" if ($4 eq "!=");
4572 if (CHK("COMPARISON_TO_NULL",
4573 "Comparison to NULL could be written \"${equal}${val}\"\n" . $herecurr) &&
4574 $fix) {
4575 $fixed[$fixlinenr] =~ s/\b\Q$val\E\s*(?:==|\!=)\s*NULL\b/$equal$val/;
4576 }
4577 }
4578 }
4579
4461# check for bad placement of section $InitAttribute (e.g.: __initdata) 4580# check for bad placement of section $InitAttribute (e.g.: __initdata)
4462 if ($line =~ /(\b$InitAttribute\b)/) { 4581 if ($line =~ /(\b$InitAttribute\b)/) {
4463 my $attr = $1; 4582 my $attr = $1;
@@ -4652,6 +4771,15 @@ sub process {
4652 } 4771 }
4653 } 4772 }
4654 4773
4774# Check for __attribute__ weak, or __weak declarations (may have link issues)
4775 if ($^V && $^V ge 5.10.0 &&
4776 $line =~ /(?:$Declare|$DeclareMisordered)\s*$Ident\s*$balanced_parens\s*(?:$Attribute)?\s*;/ &&
4777 ($line =~ /\b__attribute__\s*\(\s*\(.*\bweak\b/ ||
4778 $line =~ /\b__weak\b/)) {
4779 ERROR("WEAK_DECLARATION",
4780 "Using weak declarations can have unintended link defects\n" . $herecurr);
4781 }
4782
4655# check for sizeof(&) 4783# check for sizeof(&)
4656 if ($line =~ /\bsizeof\s*\(\s*\&/) { 4784 if ($line =~ /\bsizeof\s*\(\s*\&/) {
4657 WARN("SIZEOF_ADDRESS", 4785 WARN("SIZEOF_ADDRESS",
@@ -4667,12 +4795,6 @@ sub process {
4667 } 4795 }
4668 } 4796 }
4669 4797
4670# check for line continuations in quoted strings with odd counts of "
4671 if ($rawline =~ /\\$/ && $rawline =~ tr/"/"/ % 2) {
4672 WARN("LINE_CONTINUATIONS",
4673 "Avoid line continuations in quoted strings\n" . $herecurr);
4674 }
4675
4676# check for struct spinlock declarations 4798# check for struct spinlock declarations
4677 if ($line =~ /^.\s*\bstruct\s+spinlock\s+\w+\s*;/) { 4799 if ($line =~ /^.\s*\bstruct\s+spinlock\s+\w+\s*;/) {
4678 WARN("USE_SPINLOCK_T", 4800 WARN("USE_SPINLOCK_T",
@@ -4908,6 +5030,17 @@ sub process {
4908 } 5030 }
4909 } 5031 }
4910 5032
5033# check for #defines like: 1 << <digit> that could be BIT(digit)
5034 if ($line =~ /#\s*define\s+\w+\s+\(?\s*1\s*([ulUL]*)\s*\<\<\s*(?:\d+|$Ident)\s*\)?/) {
5035 my $ull = "";
5036 $ull = "_ULL" if (defined($1) && $1 =~ /ll/i);
5037 if (CHK("BIT_MACRO",
5038 "Prefer using the BIT$ull macro\n" . $herecurr) &&
5039 $fix) {
5040 $fixed[$fixlinenr] =~ s/\(?\s*1\s*[ulUL]*\s*<<\s*(\d+|$Ident)\s*\)?/BIT${ull}($1)/;
5041 }
5042 }
5043
4911# check for case / default statements not preceded by break/fallthrough/switch 5044# check for case / default statements not preceded by break/fallthrough/switch
4912 if ($line =~ /^.\s*(?:case\s+(?:$Ident|$Constant)\s*|default):/) { 5045 if ($line =~ /^.\s*(?:case\s+(?:$Ident|$Constant)\s*|default):/) {
4913 my $has_break = 0; 5046 my $has_break = 0;
@@ -5071,18 +5204,6 @@ sub process {
5071 "#define of '$1' is wrong - use Kconfig variables or standard guards instead\n" . $herecurr); 5204 "#define of '$1' is wrong - use Kconfig variables or standard guards instead\n" . $herecurr);
5072 } 5205 }
5073 5206
5074# check for %L{u,d,i} in strings
5075 my $string;
5076 while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) {
5077 $string = substr($rawline, $-[1], $+[1] - $-[1]);
5078 $string =~ s/%%/__/g;
5079 if ($string =~ /(?<!%)%L[udi]/) {
5080 WARN("PRINTF_L",
5081 "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr);
5082 last;
5083 }
5084 }
5085
5086# whine mightly about in_atomic 5207# whine mightly about in_atomic
5087 if ($line =~ /\bin_atomic\s*\(/) { 5208 if ($line =~ /\bin_atomic\s*\(/) {
5088 if ($realfile =~ m@^drivers/@) { 5209 if ($realfile =~ m@^drivers/@) {
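Several of the new checkpatch.pl tests above key off specific C shapes. A hypothetical snippet that would trip four of them (BIT_MACRO, MASK_THEN_SHIFT, COMPARISON_TO_NULL, STRING_FRAGMENTS) when added by a patch:

#include <stddef.h>

/* BIT_MACRO: the new check suggests BIT(4) for 1-shifted-by-constant defines. */
#define MY_FLAG (1 << 4)

/* STRING_FRAGMENTS: adjacent literals are flagged as better written as one string. */
static const char greeting[] = "hello " "world";

int example(unsigned int status, unsigned int mask, const char *name)
{
        /* MASK_THEN_SHIFT: '>>' binds tighter than '&', so this is
         * status & (mask >> 2); parentheses would make the intent explicit. */
        unsigned int field = status & mask >> 2;

        if (name == NULL)       /* COMPARISON_TO_NULL: could be written "!name" */
                return -1;

        return (int)field;
}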
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 70bea942b413..9922e66883a5 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1753,7 +1753,7 @@ sub dump_struct($$) {
1753 # strip kmemcheck_bitfield_{begin,end}.*; 1753 # strip kmemcheck_bitfield_{begin,end}.*;
1754 $members =~ s/kmemcheck_bitfield_.*?;//gos; 1754 $members =~ s/kmemcheck_bitfield_.*?;//gos;
1755 # strip attributes 1755 # strip attributes
1756 $members =~ s/__aligned\s*\(.+\)//gos; 1756 $members =~ s/__aligned\s*\([^;]*\)//gos;
1757 1757
1758 create_parameterlist($members, ';', $file); 1758 create_parameterlist($members, ';', $file);
1759 check_sections($file, $declaration_name, "struct", $sectcheck, $struct_actual, $nested); 1759 check_sections($file, $declaration_name, "struct", $sectcheck, $struct_actual, $nested);
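The kernel-doc fix above makes the attribute-stripping substitution stop at the first ';' ("[^;]*") instead of matching greedily (".+"), so an __aligned() on one struct member can no longer swallow the members after it. A hypothetical struct illustrating the case the greedy pattern mishandled: with ".+\)" the strip could run from the first __aligned( all the way to the ')' of the function-pointer member, taking the declarations in between with it.

struct demo {
        unsigned int counter __aligned(8);      /* attribute kernel-doc strips */
        unsigned char buf[16];                  /* could vanish with the greedy match */
        void (*callback)(int value);            /* supplies a later ')' for ".+" to reach */
};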