142 files changed, 3655 insertions, 3940 deletions
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt
index a9faaca1f029..106245c3aecc 100644
--- a/Documentation/cgroups/hugetlb.txt
+++ b/Documentation/cgroups/hugetlb.txt
@@ -29,7 +29,7 @@ Brief summary of control files
 
 hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage
 hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
-hugetlb.<hugepagesize>.usage_in_bytes # show current res_counter usage for "hugepagesize" hugetlb
+hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
 hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
 
 For a system supporting two hugepage size (16M and 16G) the control
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 02ab997a1ed2..46b2b5080317 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -1,5 +1,10 @@
 Memory Resource Controller
 
+NOTE: This document is hopelessly outdated and it asks for a complete
+rewrite. It still contains a useful information so we are keeping it
+here but make sure to check the current code if you need a deeper
+understanding.
+
 NOTE: The Memory Resource Controller has generically been referred to as the
 memory controller in this document. Do not confuse memory controller
 used here with the memory controller that is used in hardware.
@@ -52,9 +57,9 @@ Brief summary of control files.
 tasks # attach a task(thread) and show list of threads
 cgroup.procs # show list of processes
 cgroup.event_control # an interface for event_fd()
-memory.usage_in_bytes # show current res_counter usage for memory
+memory.usage_in_bytes # show current usage for memory
 (See 5.5 for details)
-memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap
+memory.memsw.usage_in_bytes # show current usage for memory+Swap
 (See 5.5 for details)
 memory.limit_in_bytes # set/show limit of memory usage
 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage
@@ -116,16 +121,16 @@ The memory controller is the first controller developed.
 
 2.1. Design
 
-The core of the design is a counter called the res_counter. The res_counter
-tracks the current memory usage and limit of the group of processes associated
-with the controller. Each cgroup has a memory controller specific data
-structure (mem_cgroup) associated with it.
+The core of the design is a counter called the page_counter. The
+page_counter tracks the current memory usage and limit of the group of
+processes associated with the controller. Each cgroup has a memory controller
+specific data structure (mem_cgroup) associated with it.
 
 2.2. Accounting
 
 +--------------------+
 | mem_cgroup |
-| (res_counter) |
+| (page_counter) |
 +--------------------+
 / ^ \
 / | \
@@ -352,9 +357,8 @@ set:
 0. Configuration
 
 a. Enable CONFIG_CGROUPS
-b. Enable CONFIG_RESOURCE_COUNTERS
-c. Enable CONFIG_MEMCG
-d. Enable CONFIG_MEMCG_SWAP (to use swap extension)
+b. Enable CONFIG_MEMCG
+c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
 d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
 
 1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
deleted file mode 100644
index 762ca54eb929..000000000000
--- a/Documentation/cgroups/resource_counter.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-
-The Resource Counter
-
-The resource counter, declared at include/linux/res_counter.h,
-is supposed to facilitate the resource management by controllers
-by providing common stuff for accounting.
-
-This "stuff" includes the res_counter structure and routines
-to work with it.
-
-
-
-1. Crucial parts of the res_counter structure
-
-a. unsigned long long usage
-
-The usage value shows the amount of a resource that is consumed
-by a group at a given time. The units of measurement should be
-determined by the controller that uses this counter. E.g. it can
-be bytes, items or any other unit the controller operates on.
-
-b. unsigned long long max_usage
-
-The maximal value of the usage over time.
-
-This value is useful when gathering statistical information about
-the particular group, as it shows the actual resource requirements
-for a particular group, not just some usage snapshot.
-
-c. unsigned long long limit
-
-The maximal allowed amount of resource to consume by the group. In
-case the group requests for more resources, so that the usage value
-would exceed the limit, the resource allocation is rejected (see
-the next section).
-
-d. unsigned long long failcnt
-
-The failcnt stands for "failures counter". This is the number of
-resource allocation attempts that failed.
-
-c. spinlock_t lock
-
-Protects changes of the above values.
-
-
-
-2. Basic accounting routines
-
-a. void res_counter_init(struct res_counter *rc,
-struct res_counter *rc_parent)
-
-Initializes the resource counter. As usual, should be the first
-routine called for a new counter.
-
-The struct res_counter *parent can be used to define a hierarchical
-child -> parent relationship directly in the res_counter structure,
-NULL can be used to define no relationship.
-
-c. int res_counter_charge(struct res_counter *rc, unsigned long val,
-struct res_counter **limit_fail_at)
-
-When a resource is about to be allocated it has to be accounted
-with the appropriate resource counter (controller should determine
-which one to use on its own). This operation is called "charging".
-
-This is not very important which operation - resource allocation
-or charging - is performed first, but
-* if the allocation is performed first, this may create a
-temporary resource over-usage by the time resource counter is
-charged;
-* if the charging is performed first, then it should be uncharged
-on error path (if the one is called).
-
-If the charging fails and a hierarchical dependency exists, the
-limit_fail_at parameter is set to the particular res_counter element
-where the charging failed.
-
-d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val)
-
-When a resource is released (freed) it should be de-accounted
-from the resource counter it was accounted to. This is called
-"uncharging". The return value of this function indicate the amount
-of charges still present in the counter.
-
-The _locked routines imply that the res_counter->lock is taken.
-
-e. u64 res_counter_uncharge_until
-(struct res_counter *rc, struct res_counter *top,
-unsigned long val)
-
-Almost same as res_counter_uncharge() but propagation of uncharge
-stops when rc == top. This is useful when kill a res_counter in
-child cgroup.
-
-2.1 Other accounting routines
-
-There are more routines that may help you with common needs, like
-checking whether the limit is reached or resetting the max_usage
-value. They are all declared in include/linux/res_counter.h.
-
-
-
-3. Analyzing the resource counter registrations
-
-a. If the failcnt value constantly grows, this means that the counter's
-limit is too tight. Either the group is misbehaving and consumes too
-many resources, or the configuration is not suitable for the group
-and the limit should be increased.
-
-b. The max_usage value can be used to quickly tune the group. One may
-set the limits to maximal values and either load the container with
-a common pattern or leave one for a while. After this the max_usage
-value shows the amount of memory the container would require during
-its common activity.
-
-Setting the limit a bit above this value gives a pretty good
-configuration that works in most of the cases.
-
-c. If the max_usage is much less than the limit, but the failcnt value
-is growing, then the group tries to allocate a big chunk of resource
-at once.
-
-d. If the max_usage is much less than the limit, but the failcnt value
-is 0, then this group is given too high limit, that it does not
-require. It is better to lower the limit a bit leaving more resource
-for other groups.
-
-
-
-4. Communication with the control groups subsystem (cgroups)
-
-All the resource controllers that are using cgroups and resource counters
-should provide files (in the cgroup filesystem) to work with the resource
-counter fields. They are recommended to adhere to the following rules:
-
-a. File names
-
-Field name     File name
----------------------------------------------------
-usage          usage_in_<unit_of_measurement>
-max_usage      max_usage_in_<unit_of_measurement>
-limit          limit_in_<unit_of_measurement>
-failcnt        failcnt
-lock           no file :)
-
-b. Reading from file should show the corresponding field value in the
-appropriate format.
-
-c. Writing to file
-
-Field          Expected behavior
-----------------------------------
-usage          prohibited
-max_usage      reset to usage
-limit          set the limit
-failcnt        reset to zero
-
-
-
-5. Usage example
-
-a. Declare a task group (take a look at cgroups subsystem for this) and
-fold a res_counter into it
-
-struct my_group {
-struct res_counter res;
-
-<other fields>
-}
-
-b. Put hooks in resource allocation/release paths
-
-int alloc_something(...)
-{
-if (res_counter_charge(res_counter_ptr, amount) < 0)
-return -ENOMEM;
-
-<allocate the resource and return to the caller>
-}
-
-void release_something(...)
-{
-res_counter_uncharge(res_counter_ptr, amount);
-
-<release the resource>
-}
-
-In order to keep the usage value self-consistent, both the
-"res_counter_ptr" and the "amount" in release_something() should be
-the same as they were in the alloc_something() when the releasing
-resource was allocated.
-
-c. Provide the way to read res_counter values and set them (the cgroups
-still can help with it).
-
-c. Compile and run :)
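The charge/uncharge pattern documented in the removed file carries over to the page_counter that replaces res_counter in this series. The following is a minimal sketch only, assuming the page_counter API introduced alongside this removal (page_counter_init(), page_counter_try_charge(), page_counter_uncharge() from include/linux/page_counter.h); the return-value convention is an assumption here and should be checked against the tree.

    #include <linux/page_counter.h>

    struct my_group {
            struct page_counter counter;    /* replaces struct res_counter res */
            /* <other fields> */
    };

    static void my_group_init(struct my_group *grp, struct my_group *parent)
    {
            /* NULL parent means no hierarchical propagation */
            page_counter_init(&grp->counter, parent ? &parent->counter : NULL);
    }

    static int alloc_something(struct my_group *grp, unsigned long nr_pages)
    {
            struct page_counter *fail;
            int ret;

            /* charge first; uncharge on the error path, as with res_counter */
            ret = page_counter_try_charge(&grp->counter, nr_pages, &fail);
            if (ret)        /* assumed: 0 on success, -ENOMEM when over limit */
                    return ret;

            /* <allocate the resource and return to the caller> */
            return 0;
    }

    static void release_something(struct my_group *grp, unsigned long nr_pages)
    {
            /* must match the counter and amount used when charging */
            page_counter_uncharge(&grp->counter, nr_pages);

            /* <release the resource> */
    }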
diff --git a/Documentation/devicetree/bindings/rtc/rtc-omap.txt b/Documentation/devicetree/bindings/rtc/rtc-omap.txt
index 5a0f02d34d95..4ba4dbd34289 100644
--- a/Documentation/devicetree/bindings/rtc/rtc-omap.txt
+++ b/Documentation/devicetree/bindings/rtc/rtc-omap.txt
@@ -5,11 +5,17 @@ Required properties:
 - "ti,da830-rtc" - for RTC IP used similar to that on DA8xx SoC family.
 - "ti,am3352-rtc" - for RTC IP used similar to that on AM335x SoC family.
 This RTC IP has special WAKE-EN Register to enable
-Wakeup generation for event Alarm.
+Wakeup generation for event Alarm. It can also be
+used to control an external PMIC via the
+pmic_power_en pin.
 - reg: Address range of rtc register set
 - interrupts: rtc timer, alarm interrupts in order
 - interrupt-parent: phandle for the interrupt controller
 
+Optional properties:
+- system-power-controller: whether the rtc is controlling the system power
+through pmic_power_en
+
 Example:
 
 rtc@1c23000 {
@@ -18,4 +24,5 @@ rtc@1c23000 {
 interrupts = <19
 19>;
 interrupt-parent = <&intc>;
+system-power-controller;
 };
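A driver consuming the new property would typically test it at probe time and register a power-off hook. The sketch below is illustrative only; it assumes the generic of_device_is_system_power_controller() helper from <linux/of.h> and a hypothetical driver routine rtc_power_off() that asserts pmic_power_en.

    #include <linux/of.h>
    #include <linux/pm.h>

    static void rtc_power_off(void)
    {
            /* assert pmic_power_en via the RTC's power-enable register */
    }

    static int rtc_probe_power_off(struct device *dev)
    {
            /* ... normal RTC setup ... */

            /* "system-power-controller" present: this RTC may switch the board off */
            if (of_device_is_system_power_controller(dev->of_node) && !pm_power_off)
                    pm_power_off = rtc_power_off;

            return 0;
    }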
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index 0d354625299c..2417cb0b493b 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -115,6 +115,7 @@ nxp NXP Semiconductors
 onnn ON Semiconductor Corp.
 opencores OpenCores.org
 panasonic Panasonic Corporation
+pericom Pericom Technology Inc.
 phytec PHYTEC Messtechnik GmbH
 picochip Picochip Ltd
 plathome Plat'Home Co., Ltd.
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index 6c0b9f27e465..bc4bd5a44b88 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -471,6 +471,13 @@ format. Crash is available on Dave Anderson's site at the following URL:
 
 http://people.redhat.com/~anderson/
 
+Trigger Kdump on WARN()
+=======================
+
+The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This
+will cause a kdump to occur at the panic() call. In cases where a user wants
+to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1
+to achieve the same behaviour.
 
 Contact
 =======
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 838f3776c924..d6eb3636fe5a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2509,6 +2509,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 timeout < 0: reboot immediately
 Format: <timeout>
 
+panic_on_warn panic() instead of WARN(). Useful to cause kdump
+on a WARN().
+
 crash_kexec_post_notifiers
 Run kdump after running panic-notifiers and dumping
 kmsg. This only for the users who doubt kdump always
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57baff5bdb80..b5d0c8501a18 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -54,8 +54,9 @@ show up in /proc/sys/kernel:
 - overflowuid
 - panic
 - panic_on_oops
-- panic_on_unrecovered_nmi
 - panic_on_stackoverflow
+- panic_on_unrecovered_nmi
+- panic_on_warn
 - pid_max
 - powersave-nap [ PPC only ]
 - printk
@@ -527,19 +528,6 @@ the recommended setting is 60.
 
 ==============================================================
 
-panic_on_unrecovered_nmi:
-
-The default Linux behaviour on an NMI of either memory or unknown is
-to continue operation. For many environments such as scientific
-computing it is preferable that the box is taken out and the error
-dealt with than an uncorrected parity/ECC error get propagated.
-
-A small number of systems do generate NMI's for bizarre random reasons
-such as power management so the default is off. That sysctl works like
-the existing panic controls already in that directory.
-
-==============================================================
-
 panic_on_oops:
 
 Controls the kernel's behaviour when an oops or BUG is encountered.
@@ -563,6 +551,30 @@ This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
 
 ==============================================================
 
+panic_on_unrecovered_nmi:
+
+The default Linux behaviour on an NMI of either memory or unknown is
+to continue operation. For many environments such as scientific
+computing it is preferable that the box is taken out and the error
+dealt with than an uncorrected parity/ECC error get propagated.
+
+A small number of systems do generate NMI's for bizarre random reasons
+such as power management so the default is off. That sysctl works like
+the existing panic controls already in that directory.
+
+==============================================================
+
+panic_on_warn:
+
+Calls panic() in the WARN() path when set to 1. This is useful to avoid
+a kernel rebuild when attempting to kdump at the location of a WARN().
+
+0: only WARN(), default behaviour.
+
+1: call panic() after printing out WARN() location.
+
+==============================================================
+
 perf_cpu_time_max_percent:
 
 Hints to the kernel how much CPU time it should be allowed to
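For reference, panic_on_warn can be flipped from userspace at runtime exactly like the other panic controls; a small illustrative userspace snippet (equivalent to writing "1" into the proc file, error handling trimmed):

    #include <stdio.h>

    /* Enable panic_on_warn at runtime:
     * same effect as "echo 1 > /proc/sys/kernel/panic_on_warn". */
    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/panic_on_warn", "w");

            if (!f)
                    return 1;
            fputs("1\n", f);
            fclose(f);
            return 0;
    }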
diff --git a/MAINTAINERS b/MAINTAINERS
index 1563a3b38960..079efaf1b5e7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2605,7 +2605,7 @@ L: cgroups@vger.kernel.org
 L: linux-mm@kvack.org
 S: Maintained
 F: mm/memcontrol.c
-F: mm/page_cgroup.c
+F: mm/swap_cgroup.c
 
 CORETEMP HARDWARE MONITORING DRIVER
 M: Fenghua Yu <fenghua.yu@intel.com>
@@ -2722,7 +2722,7 @@ F: drivers/net/wireless/cw1200/
 
 CX18 VIDEO4LINUX DRIVER
 M: Andy Walls <awalls@md.metrocast.net>
-L: ivtv-devel@ivtvdriver.org (moderated for non-subscribers)
+L: ivtv-devel@ivtvdriver.org (subscribers-only)
 L: linux-media@vger.kernel.org
 T: git git://linuxtv.org/media_tree.git
 W: http://linuxtv.org
@@ -5208,7 +5208,7 @@ F: drivers/media/tuners/it913x*
 
 IVTV VIDEO4LINUX DRIVER
 M: Andy Walls <awalls@md.metrocast.net>
-L: ivtv-devel@ivtvdriver.org (moderated for non-subscribers)
+L: ivtv-devel@ivtvdriver.org (subscribers-only)
 L: linux-media@vger.kernel.org
 T: git git://linuxtv.org/media_tree.git
 W: http://www.ivtvdriver.org
diff --git a/arch/arm/boot/dts/am335x-boneblack.dts b/arch/arm/boot/dts/am335x-boneblack.dts
index 901739fcb85a..5c42d259fa68 100644
--- a/arch/arm/boot/dts/am335x-boneblack.dts
+++ b/arch/arm/boot/dts/am335x-boneblack.dts
@@ -80,3 +80,7 @@
 status = "okay";
 };
 };
+
+&rtc {
+system-power-controller;
+};
diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi
index befe713b3e1b..acd37057bca9 100644
--- a/arch/arm/boot/dts/am33xx.dtsi
+++ b/arch/arm/boot/dts/am33xx.dtsi
@@ -435,7 +435,7 @@
 };
 
 rtc: rtc@44e3e000 {
-compatible = "ti,da830-rtc";
+compatible = "ti,am3352-rtc", "ti,da830-rtc";
 reg = <0x44e3e000 0x1000>;
 interrupts = <75
 76>;
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 41a43bf26492..df22314f57cf 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -279,6 +279,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd) pte_young(pmd_pte(pmd))
 #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
 #define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 5845ffea67c3..dc063fe6646a 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2662,7 +2662,7 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 
 ret = -ENOMEM;
 
-fd = get_unused_fd();
+fd = get_unused_fd_flags(0);
 if (fd < 0)
 return fd;
 
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index ae153c40ab7c..9b4b1904efae 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -467,6 +467,7 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 }
 
 #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd) pte_young(pmd_pte(pmd))
 #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 65d633f20d37..1a3429e1ccb5 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -301,7 +301,7 @@ static int spufs_context_open(struct path *path)
 int ret;
 struct file *filp;
 
-ret = get_unused_fd();
+ret = get_unused_fd_flags(0);
 if (ret < 0)
 return ret;
 
@@ -518,7 +518,7 @@ static int spufs_gang_open(struct path *path)
 int ret;
 struct file *filp;
 
-ret = get_unused_fd();
+ret = get_unused_fd_flags(0);
 if (ret < 0)
 return ret;
 
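The get_unused_fd() calls removed throughout this series were equivalent to get_unused_fd_flags(0); the flags variant lets callers ask for O_CLOEXEC descriptors. A minimal sketch of the usual reserve-then-install pattern (illustrative only, not taken from the files above):

    #include <linux/file.h>
    #include <linux/fs.h>

    static int open_my_object(struct file *filp)
    {
            int fd;

            /* reserve a descriptor; pass O_CLOEXEC instead of 0 if it
             * should not survive exec() */
            fd = get_unused_fd_flags(0);
            if (fd < 0)
                    return fd;

            /* publish the file only once setup can no longer fail */
            fd_install(fd, filp);
            return fd;
    }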
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index 3d85225b9e95..bce52ba66206 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -31,7 +31,7 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
 unsigned long bootmem_paddr;
 
 /* Don't allow bogus node assignment */
-BUG_ON(nid > MAX_NUMNODES || nid <= 0);
+BUG_ON(nid >= MAX_NUMNODES || nid <= 0);
 
 start_pfn = start >> PAGE_SHIFT;
 end_pfn = end >> PAGE_SHIFT;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index bfeb626085ac..1ff9e7864168 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -667,6 +667,13 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline unsigned long pmd_dirty(pmd_t pmd)
+{
+pte_t pte = __pte(pmd_val(pmd));
+
+return pte_dirty(pte);
+}
+
 static inline unsigned long pmd_young(pmd_t pmd)
 {
 pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index b608e00e7f6d..aefb2c086726 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -43,13 +43,20 @@ static struct console early_hv_console = {
 
 void early_panic(const char *fmt, ...)
 {
-va_list ap;
+struct va_format vaf;
+va_list args;
+
 arch_local_irq_disable_all();
-va_start(ap, fmt);
-early_printk("Kernel panic - not syncing: ");
-early_vprintk(fmt, ap);
-early_printk("\n");
-va_end(ap);
+
+va_start(args, fmt);
+
+vaf.fmt = fmt;
+vaf.va = &args;
+
+early_printk("Kernel panic - not syncing: %pV", &vaf);
+
+va_end(args);
+
 dump_stack();
 hv_halt();
 }
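The conversion above relies on the printk %pV extension, which lets one printk consume a caller's format string and va_list through struct va_format rather than building the message piecewise. A generic sketch of the idiom, using a hypothetical helper name rather than anything from the file above:

    #include <linux/kernel.h>
    #include <linux/printk.h>

    static void my_report(const char *prefix, const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            va_start(args, fmt);
            vaf.fmt = fmt;
            vaf.va = &args;
            /* the whole message is emitted as a single printk record */
            printk(KERN_ERR "%s: %pV", prefix, &vaf);
            va_end(args);
    }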
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index b9736ded06f2..7f079bbfdf4c 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -534,11 +534,10 @@ static void __init setup_memory(void)
 }
 }
 physpages -= dropped_pages;
-pr_warning("Only using %ldMB memory;"
-" ignoring %ldMB.\n",
-physpages >> (20 - PAGE_SHIFT),
-dropped_pages >> (20 - PAGE_SHIFT));
-pr_warning("Consider using a larger page size.\n");
+pr_warn("Only using %ldMB memory - ignoring %ldMB\n",
+physpages >> (20 - PAGE_SHIFT),
+dropped_pages >> (20 - PAGE_SHIFT));
+pr_warn("Consider using a larger page size\n");
 }
 #endif
 
@@ -566,9 +565,8 @@ static void __init setup_memory(void)
 
 #ifndef __tilegx__
 if (node_end_pfn[0] > MAXMEM_PFN) {
-pr_warning("Only using %ldMB LOWMEM.\n",
-MAXMEM>>20);
-pr_warning("Use a HIGHMEM enabled kernel.\n");
+pr_warn("Only using %ldMB LOWMEM\n", MAXMEM >> 20);
+pr_warn("Use a HIGHMEM enabled kernel\n");
 max_low_pfn = MAXMEM_PFN;
 max_pfn = MAXMEM_PFN;
 node_end_pfn[0] = MAXMEM_PFN;
@@ -1112,8 +1110,8 @@ static void __init load_hv_initrd(void)
 fd = hv_fs_findfile((HV_VirtAddr) initramfs_file);
 if (fd == HV_ENOENT) {
 if (set_initramfs_file) {
-pr_warning("No such hvfs initramfs file '%s'\n",
-initramfs_file);
+pr_warn("No such hvfs initramfs file '%s'\n",
+initramfs_file);
 return;
 } else {
 /* Try old backwards-compatible name. */
@@ -1126,8 +1124,8 @@ static void __init load_hv_initrd(void)
 stat = hv_fs_fstat(fd);
 BUG_ON(stat.size < 0);
 if (stat.flags & HV_FS_ISDIR) {
-pr_warning("Ignoring hvfs file '%s': it's a directory.\n",
-initramfs_file);
+pr_warn("Ignoring hvfs file '%s': it's a directory\n",
+initramfs_file);
 return;
 }
 initrd = alloc_bootmem_pages(stat.size);
@@ -1185,9 +1183,8 @@ static void __init validate_hv(void)
 HV_Topology topology = hv_inquire_topology();
 BUG_ON(topology.coord.x != 0 || topology.coord.y != 0);
 if (topology.width != 1 || topology.height != 1) {
-pr_warning("Warning: booting UP kernel on %dx%d grid;"
-" will ignore all but first tile.\n",
-topology.width, topology.height);
+pr_warn("Warning: booting UP kernel on %dx%d grid; will ignore all but first tile\n",
+topology.width, topology.height);
 }
 #endif
 
@@ -1208,9 +1205,8 @@ static void __init validate_hv(void)
 * We use a struct cpumask for this, so it must be big enough.
 */
 if ((smp_height * smp_width) > nr_cpu_ids)
-early_panic("Hypervisor %d x %d grid too big for Linux"
-" NR_CPUS %d\n", smp_height, smp_width,
-nr_cpu_ids);
+early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %d\n",
+smp_height, smp_width, nr_cpu_ids);
 #endif
 
 /*
@@ -1265,10 +1261,9 @@ static void __init validate_va(void)
 
 /* Kernel PCs must have their high bit set; see intvec.S. */
 if ((long)VMALLOC_START >= 0)
-early_panic(
-"Linux VMALLOC region below the 2GB line (%#lx)!\n"
-"Reconfigure the kernel with smaller VMALLOC_RESERVE.\n",
-VMALLOC_START);
+early_panic("Linux VMALLOC region below the 2GB line (%#lx)!\n"
+"Reconfigure the kernel with smaller VMALLOC_RESERVE\n",
+VMALLOC_START);
 #endif
 }
 
@@ -1395,7 +1390,7 @@ static void __init setup_cpu_maps(void)
 
 static int __init dataplane(char *str)
 {
-pr_warning("WARNING: dataplane support disabled in this kernel\n");
+pr_warn("WARNING: dataplane support disabled in this kernel\n");
 return 0;
 }
 
@@ -1413,8 +1408,8 @@ void __init setup_arch(char **cmdline_p)
 len = hv_get_command_line((HV_VirtAddr) boot_command_line,
 COMMAND_LINE_SIZE);
 if (boot_command_line[0])
-pr_warning("WARNING: ignoring dynamic command line \"%s\"\n",
-boot_command_line);
+pr_warn("WARNING: ignoring dynamic command line \"%s\"\n",
+boot_command_line);
 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
 #else
 char *hv_cmdline;
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c112ea63f40d..e8a5454acc99 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -100,6 +100,11 @@ static inline int pte_young(pte_t pte)
 return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
+static inline int pmd_dirty(pmd_t pmd)
+{
+return pmd_flags(pmd) & _PAGE_DIRTY;
+}
+
 static inline int pmd_young(pmd_t pmd)
 {
 return pmd_flags(pmd) & _PAGE_ACCESSED;
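The pmd_dirty() helpers added across the architectures above give common MM code a way to ask whether a transparent-huge-page mapping has been written to, mirroring pte_dirty() for ordinary PTEs. A hedged sketch of the kind of check a generic path might make; this is simplified for illustration and not lifted from mm/:

    #include <asm/pgtable.h>

    /* Decide whether a THP mapping could be discarded without writeback.
     * Real callers also deal with splitting, locking and anon/file
     * distinctions; this only shows where pmd_dirty() fits in. */
    static bool thp_is_clean(pmd_t pmd)
    {
            if (!pmd_trans_huge(pmd))
                    return false;

            /* pmd_dirty() reports the hardware dirty bit of the huge mapping */
            return !pmd_dirty(pmd);
    }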
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index df04227d00cf..98504ec99c7d 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -267,18 +267,24 @@ comment "Default contiguous memory area size:"
 config CMA_SIZE_MBYTES
 int "Size in Mega Bytes"
 depends on !CMA_SIZE_SEL_PERCENTAGE
+default 0 if X86
 default 16
 help
 Defines the size (in MiB) of the default memory area for Contiguous
-Memory Allocator.
+Memory Allocator. If the size of 0 is selected, CMA is disabled by
+default, but it can be enabled by passing cma=size[MG] to the kernel.
+
 
 config CMA_SIZE_PERCENTAGE
 int "Percentage of total memory"
 depends on !CMA_SIZE_SEL_MBYTES
+default 0 if X86
 default 10
 help
 Defines the size of the default memory area for Contiguous Memory
 Allocator as a percentage of the total memory in the system.
+If 0 percent is selected, CMA is disabled by default, but it can be
+enabled by passing cma=size[MG] to the kernel.
 
 choice
 prompt "Selected region size"
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index b682651b5307..4511ddc1ac31 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -192,6 +192,14 @@ config RTC_DRV_DS1374
 This driver can also be built as a module. If so, the module
 will be called rtc-ds1374.
 
+config RTC_DRV_DS1374_WDT
+bool "Dallas/Maxim DS1374 watchdog timer"
+depends on RTC_DRV_DS1374
+help
+If you say Y here you will get support for the
+watchdog timer in the Dallas Semiconductor DS1374
+real-time clock chips.
+
 config RTC_DRV_DS1672
 tristate "Dallas/Maxim DS1672"
 help
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index 5b2717f5dafa..45bfc28ee3aa 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -30,6 +30,14 @@ static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm)
 else {
 memset(tm, 0, sizeof(struct rtc_time));
 err = rtc->ops->read_time(rtc->dev.parent, tm);
+if (err < 0) {
+dev_err(&rtc->dev, "read_time: fail to read\n");
+return err;
+}
+
+err = rtc_valid_tm(tm);
+if (err < 0)
+dev_err(&rtc->dev, "read_time: rtc_time isn't valid\n");
 }
 return err;
 }
@@ -891,11 +899,24 @@ again:
 if (next) {
 struct rtc_wkalrm alarm;
 int err;
+int retry = 3;
+
 alarm.time = rtc_ktime_to_tm(next->expires);
 alarm.enabled = 1;
+reprogram:
 err = __rtc_set_alarm(rtc, &alarm);
 if (err == -ETIME)
 goto again;
+else if (err) {
+if (retry-- > 0)
+goto reprogram;
+
+timer = container_of(next, struct rtc_timer, node);
+timerqueue_del(&rtc->timerqueue, &timer->node);
+timer->enabled = 0;
+dev_err(&rtc->dev, "__rtc_set_alarm: err=%d\n", err);
+goto again;
+}
 } else
 rtc_alarm_disable(rtc);
 
diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c
index 727e2f5d14d9..866e0ef5122d 100644
--- a/drivers/rtc/rtc-ab8500.c
+++ b/drivers/rtc/rtc-ab8500.c
@@ -504,6 +504,8 @@ static int ab8500_rtc_probe(struct platform_device *pdev)
 return err;
 }
 
+rtc->uie_unsupported = 1;
+
 return 0;
 }
 
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index bb43cf703efc..4ffabb322a9a 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c | |||
@@ -35,7 +35,7 @@ enum ds_type { | |||
35 | ds_1388, | 35 | ds_1388, |
36 | ds_3231, | 36 | ds_3231, |
37 | m41t00, | 37 | m41t00, |
38 | mcp7941x, | 38 | mcp794xx, |
39 | rx_8025, | 39 | rx_8025, |
40 | last_ds_type /* always last */ | 40 | last_ds_type /* always last */ |
41 | /* rs5c372 too? different address... */ | 41 | /* rs5c372 too? different address... */ |
@@ -46,7 +46,7 @@ enum ds_type { | |||
46 | #define DS1307_REG_SECS 0x00 /* 00-59 */ | 46 | #define DS1307_REG_SECS 0x00 /* 00-59 */ |
47 | # define DS1307_BIT_CH 0x80 | 47 | # define DS1307_BIT_CH 0x80 |
48 | # define DS1340_BIT_nEOSC 0x80 | 48 | # define DS1340_BIT_nEOSC 0x80 |
49 | # define MCP7941X_BIT_ST 0x80 | 49 | # define MCP794XX_BIT_ST 0x80 |
50 | #define DS1307_REG_MIN 0x01 /* 00-59 */ | 50 | #define DS1307_REG_MIN 0x01 /* 00-59 */ |
51 | #define DS1307_REG_HOUR 0x02 /* 00-23, or 1-12{am,pm} */ | 51 | #define DS1307_REG_HOUR 0x02 /* 00-23, or 1-12{am,pm} */ |
52 | # define DS1307_BIT_12HR 0x40 /* in REG_HOUR */ | 52 | # define DS1307_BIT_12HR 0x40 /* in REG_HOUR */ |
@@ -54,7 +54,7 @@ enum ds_type { | |||
54 | # define DS1340_BIT_CENTURY_EN 0x80 /* in REG_HOUR */ | 54 | # define DS1340_BIT_CENTURY_EN 0x80 /* in REG_HOUR */ |
55 | # define DS1340_BIT_CENTURY 0x40 /* in REG_HOUR */ | 55 | # define DS1340_BIT_CENTURY 0x40 /* in REG_HOUR */ |
56 | #define DS1307_REG_WDAY 0x03 /* 01-07 */ | 56 | #define DS1307_REG_WDAY 0x03 /* 01-07 */ |
57 | # define MCP7941X_BIT_VBATEN 0x08 | 57 | # define MCP794XX_BIT_VBATEN 0x08 |
58 | #define DS1307_REG_MDAY 0x04 /* 01-31 */ | 58 | #define DS1307_REG_MDAY 0x04 /* 01-31 */ |
59 | #define DS1307_REG_MONTH 0x05 /* 01-12 */ | 59 | #define DS1307_REG_MONTH 0x05 /* 01-12 */ |
60 | # define DS1337_BIT_CENTURY 0x80 /* in REG_MONTH */ | 60 | # define DS1337_BIT_CENTURY 0x80 /* in REG_MONTH */ |
@@ -159,7 +159,7 @@ static struct chip_desc chips[last_ds_type] = { | |||
159 | [ds_3231] = { | 159 | [ds_3231] = { |
160 | .alarm = 1, | 160 | .alarm = 1, |
161 | }, | 161 | }, |
162 | [mcp7941x] = { | 162 | [mcp794xx] = { |
163 | .alarm = 1, | 163 | .alarm = 1, |
164 | /* this is battery backed SRAM */ | 164 | /* this is battery backed SRAM */ |
165 | .nvram_offset = 0x20, | 165 | .nvram_offset = 0x20, |
@@ -176,7 +176,8 @@ static const struct i2c_device_id ds1307_id[] = { | |||
176 | { "ds1340", ds_1340 }, | 176 | { "ds1340", ds_1340 }, |
177 | { "ds3231", ds_3231 }, | 177 | { "ds3231", ds_3231 }, |
178 | { "m41t00", m41t00 }, | 178 | { "m41t00", m41t00 }, |
179 | { "mcp7941x", mcp7941x }, | 179 | { "mcp7940x", mcp794xx }, |
180 | { "mcp7941x", mcp794xx }, | ||
180 | { "pt7c4338", ds_1307 }, | 181 | { "pt7c4338", ds_1307 }, |
181 | { "rx8025", rx_8025 }, | 182 | { "rx8025", rx_8025 }, |
182 | { } | 183 | { } |
@@ -439,14 +440,14 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t) | |||
439 | buf[DS1307_REG_HOUR] |= DS1340_BIT_CENTURY_EN | 440 | buf[DS1307_REG_HOUR] |= DS1340_BIT_CENTURY_EN |
440 | | DS1340_BIT_CENTURY; | 441 | | DS1340_BIT_CENTURY; |
441 | break; | 442 | break; |
442 | case mcp7941x: | 443 | case mcp794xx: |
443 | /* | 444 | /* |
444 | * these bits were cleared when preparing the date/time | 445 | * these bits were cleared when preparing the date/time |
445 | * values and need to be set again before writing the | 446 | * values and need to be set again before writing the |
446 | * buffer out to the device. | 447 | * buffer out to the device. |
447 | */ | 448 | */ |
448 | buf[DS1307_REG_SECS] |= MCP7941X_BIT_ST; | 449 | buf[DS1307_REG_SECS] |= MCP794XX_BIT_ST; |
449 | buf[DS1307_REG_WDAY] |= MCP7941X_BIT_VBATEN; | 450 | buf[DS1307_REG_WDAY] |= MCP794XX_BIT_VBATEN; |
450 | break; | 451 | break; |
451 | default: | 452 | default: |
452 | break; | 453 | break; |
@@ -614,26 +615,26 @@ static const struct rtc_class_ops ds13xx_rtc_ops = { | |||
614 | /*----------------------------------------------------------------------*/ | 615 | /*----------------------------------------------------------------------*/ |
615 | 616 | ||
616 | /* | 617 | /* |
617 | * Alarm support for mcp7941x devices. | 618 | * Alarm support for mcp794xx devices. |
618 | */ | 619 | */ |
619 | 620 | ||
620 | #define MCP7941X_REG_CONTROL 0x07 | 621 | #define MCP794XX_REG_CONTROL 0x07 |
621 | # define MCP7941X_BIT_ALM0_EN 0x10 | 622 | # define MCP794XX_BIT_ALM0_EN 0x10 |
622 | # define MCP7941X_BIT_ALM1_EN 0x20 | 623 | # define MCP794XX_BIT_ALM1_EN 0x20 |
623 | #define MCP7941X_REG_ALARM0_BASE 0x0a | 624 | #define MCP794XX_REG_ALARM0_BASE 0x0a |
624 | #define MCP7941X_REG_ALARM0_CTRL 0x0d | 625 | #define MCP794XX_REG_ALARM0_CTRL 0x0d |
625 | #define MCP7941X_REG_ALARM1_BASE 0x11 | 626 | #define MCP794XX_REG_ALARM1_BASE 0x11 |
626 | #define MCP7941X_REG_ALARM1_CTRL 0x14 | 627 | #define MCP794XX_REG_ALARM1_CTRL 0x14 |
627 | # define MCP7941X_BIT_ALMX_IF (1 << 3) | 628 | # define MCP794XX_BIT_ALMX_IF (1 << 3) |
628 | # define MCP7941X_BIT_ALMX_C0 (1 << 4) | 629 | # define MCP794XX_BIT_ALMX_C0 (1 << 4) |
629 | # define MCP7941X_BIT_ALMX_C1 (1 << 5) | 630 | # define MCP794XX_BIT_ALMX_C1 (1 << 5) |
630 | # define MCP7941X_BIT_ALMX_C2 (1 << 6) | 631 | # define MCP794XX_BIT_ALMX_C2 (1 << 6) |
631 | # define MCP7941X_BIT_ALMX_POL (1 << 7) | 632 | # define MCP794XX_BIT_ALMX_POL (1 << 7) |
632 | # define MCP7941X_MSK_ALMX_MATCH (MCP7941X_BIT_ALMX_C0 | \ | 633 | # define MCP794XX_MSK_ALMX_MATCH (MCP794XX_BIT_ALMX_C0 | \ |
633 | MCP7941X_BIT_ALMX_C1 | \ | 634 | MCP794XX_BIT_ALMX_C1 | \ |
634 | MCP7941X_BIT_ALMX_C2) | 635 | MCP794XX_BIT_ALMX_C2) |
635 | 636 | ||
636 | static void mcp7941x_work(struct work_struct *work) | 637 | static void mcp794xx_work(struct work_struct *work) |
637 | { | 638 | { |
638 | struct ds1307 *ds1307 = container_of(work, struct ds1307, work); | 639 | struct ds1307 *ds1307 = container_of(work, struct ds1307, work); |
639 | struct i2c_client *client = ds1307->client; | 640 | struct i2c_client *client = ds1307->client; |
@@ -642,22 +643,22 @@ static void mcp7941x_work(struct work_struct *work) | |||
642 | mutex_lock(&ds1307->rtc->ops_lock); | 643 | mutex_lock(&ds1307->rtc->ops_lock); |
643 | 644 | ||
644 | /* Check and clear alarm 0 interrupt flag. */ | 645 | /* Check and clear alarm 0 interrupt flag. */ |
645 | reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_ALARM0_CTRL); | 646 | reg = i2c_smbus_read_byte_data(client, MCP794XX_REG_ALARM0_CTRL); |
646 | if (reg < 0) | 647 | if (reg < 0) |
647 | goto out; | 648 | goto out; |
648 | if (!(reg & MCP7941X_BIT_ALMX_IF)) | 649 | if (!(reg & MCP794XX_BIT_ALMX_IF)) |
649 | goto out; | 650 | goto out; |
650 | reg &= ~MCP7941X_BIT_ALMX_IF; | 651 | reg &= ~MCP794XX_BIT_ALMX_IF; |
651 | ret = i2c_smbus_write_byte_data(client, MCP7941X_REG_ALARM0_CTRL, reg); | 652 | ret = i2c_smbus_write_byte_data(client, MCP794XX_REG_ALARM0_CTRL, reg); |
652 | if (ret < 0) | 653 | if (ret < 0) |
653 | goto out; | 654 | goto out; |
654 | 655 | ||
655 | /* Disable alarm 0. */ | 656 | /* Disable alarm 0. */ |
656 | reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_CONTROL); | 657 | reg = i2c_smbus_read_byte_data(client, MCP794XX_REG_CONTROL); |
657 | if (reg < 0) | 658 | if (reg < 0) |
658 | goto out; | 659 | goto out; |
659 | reg &= ~MCP7941X_BIT_ALM0_EN; | 660 | reg &= ~MCP794XX_BIT_ALM0_EN; |
660 | ret = i2c_smbus_write_byte_data(client, MCP7941X_REG_CONTROL, reg); | 661 | ret = i2c_smbus_write_byte_data(client, MCP794XX_REG_CONTROL, reg); |
661 | if (ret < 0) | 662 | if (ret < 0) |
662 | goto out; | 663 | goto out; |
663 | 664 | ||
@@ -669,7 +670,7 @@ out: | |||
669 | mutex_unlock(&ds1307->rtc->ops_lock); | 670 | mutex_unlock(&ds1307->rtc->ops_lock); |
670 | } | 671 | } |
671 | 672 | ||
672 | static int mcp7941x_read_alarm(struct device *dev, struct rtc_wkalrm *t) | 673 | static int mcp794xx_read_alarm(struct device *dev, struct rtc_wkalrm *t) |
673 | { | 674 | { |
674 | struct i2c_client *client = to_i2c_client(dev); | 675 | struct i2c_client *client = to_i2c_client(dev); |
675 | struct ds1307 *ds1307 = i2c_get_clientdata(client); | 676 | struct ds1307 *ds1307 = i2c_get_clientdata(client); |
@@ -680,11 +681,11 @@ static int mcp7941x_read_alarm(struct device *dev, struct rtc_wkalrm *t) | |||
680 | return -EINVAL; | 681 | return -EINVAL; |
681 | 682 | ||
682 | /* Read control and alarm 0 registers. */ | 683 | /* Read control and alarm 0 registers. */ |
683 | ret = ds1307->read_block_data(client, MCP7941X_REG_CONTROL, 10, regs); | 684 | ret = ds1307->read_block_data(client, MCP794XX_REG_CONTROL, 10, regs); |
684 | if (ret < 0) | 685 | if (ret < 0) |
685 | return ret; | 686 | return ret; |
686 | 687 | ||
687 | t->enabled = !!(regs[0] & MCP7941X_BIT_ALM0_EN); | 688 | t->enabled = !!(regs[0] & MCP794XX_BIT_ALM0_EN); |
688 | 689 | ||
689 | /* Report alarm 0 time assuming 24-hour and day-of-month modes. */ | 690 | /* Report alarm 0 time assuming 24-hour and day-of-month modes. */ |
690 | t->time.tm_sec = bcd2bin(ds1307->regs[3] & 0x7f); | 691 | t->time.tm_sec = bcd2bin(ds1307->regs[3] & 0x7f); |
@@ -701,14 +702,14 @@ static int mcp7941x_read_alarm(struct device *dev, struct rtc_wkalrm *t) | |||
701 | "enabled=%d polarity=%d irq=%d match=%d\n", __func__, | 702 | "enabled=%d polarity=%d irq=%d match=%d\n", __func__, |
702 | t->time.tm_sec, t->time.tm_min, t->time.tm_hour, | 703 | t->time.tm_sec, t->time.tm_min, t->time.tm_hour, |
703 | t->time.tm_wday, t->time.tm_mday, t->time.tm_mon, t->enabled, | 704 | t->time.tm_wday, t->time.tm_mday, t->time.tm_mon, t->enabled, |
704 | !!(ds1307->regs[6] & MCP7941X_BIT_ALMX_POL), | 705 | !!(ds1307->regs[6] & MCP794XX_BIT_ALMX_POL), |
705 | !!(ds1307->regs[6] & MCP7941X_BIT_ALMX_IF), | 706 | !!(ds1307->regs[6] & MCP794XX_BIT_ALMX_IF), |
706 | (ds1307->regs[6] & MCP7941X_MSK_ALMX_MATCH) >> 4); | 707 | (ds1307->regs[6] & MCP794XX_MSK_ALMX_MATCH) >> 4); |
707 | 708 | ||
708 | return 0; | 709 | return 0; |
709 | } | 710 | } |
710 | 711 | ||
711 | static int mcp7941x_set_alarm(struct device *dev, struct rtc_wkalrm *t) | 712 | static int mcp794xx_set_alarm(struct device *dev, struct rtc_wkalrm *t) |
712 | { | 713 | { |
713 | struct i2c_client *client = to_i2c_client(dev); | 714 | struct i2c_client *client = to_i2c_client(dev); |
714 | struct ds1307 *ds1307 = i2c_get_clientdata(client); | 715 | struct ds1307 *ds1307 = i2c_get_clientdata(client); |
@@ -725,7 +726,7 @@ static int mcp7941x_set_alarm(struct device *dev, struct rtc_wkalrm *t) | |||
725 | t->enabled, t->pending); | 726 | t->enabled, t->pending); |
726 | 727 | ||
727 | /* Read control and alarm 0 registers. */ | 728 | /* Read control and alarm 0 registers. */ |
728 | ret = ds1307->read_block_data(client, MCP7941X_REG_CONTROL, 10, regs); | 729 | ret = ds1307->read_block_data(client, MCP794XX_REG_CONTROL, 10, regs); |
729 | if (ret < 0) | 730 | if (ret < 0) |
730 | return ret; | 731 | return ret; |
731 | 732 | ||
@@ -738,23 +739,23 @@ static int mcp7941x_set_alarm(struct device *dev, struct rtc_wkalrm *t) | |||
738 | regs[8] = bin2bcd(t->time.tm_mon) + 1; | 739 | regs[8] = bin2bcd(t->time.tm_mon) + 1; |
739 | 740 | ||
740 | /* Clear the alarm 0 interrupt flag. */ | 741 | /* Clear the alarm 0 interrupt flag. */ |
741 | regs[6] &= ~MCP7941X_BIT_ALMX_IF; | 742 | regs[6] &= ~MCP794XX_BIT_ALMX_IF; |
742 | /* Set alarm match: second, minute, hour, day, date, month. */ | 743 | /* Set alarm match: second, minute, hour, day, date, month. */ |
743 | regs[6] |= MCP7941X_MSK_ALMX_MATCH; | 744 | regs[6] |= MCP794XX_MSK_ALMX_MATCH; |
744 | 745 | ||
745 | if (t->enabled) | 746 | if (t->enabled) |
746 | regs[0] |= MCP7941X_BIT_ALM0_EN; | 747 | regs[0] |= MCP794XX_BIT_ALM0_EN; |
747 | else | 748 | else |
748 | regs[0] &= ~MCP7941X_BIT_ALM0_EN; | 749 | regs[0] &= ~MCP794XX_BIT_ALM0_EN; |
749 | 750 | ||
750 | ret = ds1307->write_block_data(client, MCP7941X_REG_CONTROL, 10, regs); | 751 | ret = ds1307->write_block_data(client, MCP794XX_REG_CONTROL, 10, regs); |
751 | if (ret < 0) | 752 | if (ret < 0) |
752 | return ret; | 753 | return ret; |
753 | 754 | ||
754 | return 0; | 755 | return 0; |
755 | } | 756 | } |
756 | 757 | ||
757 | static int mcp7941x_alarm_irq_enable(struct device *dev, unsigned int enabled) | 758 | static int mcp794xx_alarm_irq_enable(struct device *dev, unsigned int enabled) |
758 | { | 759 | { |
759 | struct i2c_client *client = to_i2c_client(dev); | 760 | struct i2c_client *client = to_i2c_client(dev); |
760 | struct ds1307 *ds1307 = i2c_get_clientdata(client); | 761 | struct ds1307 *ds1307 = i2c_get_clientdata(client); |
@@ -763,24 +764,24 @@ static int mcp7941x_alarm_irq_enable(struct device *dev, unsigned int enabled) | |||
763 | if (!test_bit(HAS_ALARM, &ds1307->flags)) | 764 | if (!test_bit(HAS_ALARM, &ds1307->flags)) |
764 | return -EINVAL; | 765 | return -EINVAL; |
765 | 766 | ||
766 | reg = i2c_smbus_read_byte_data(client, MCP7941X_REG_CONTROL); | 767 | reg = i2c_smbus_read_byte_data(client, MCP794XX_REG_CONTROL); |
767 | if (reg < 0) | 768 | if (reg < 0) |
768 | return reg; | 769 | return reg; |
769 | 770 | ||
770 | if (enabled) | 771 | if (enabled) |
771 | reg |= MCP7941X_BIT_ALM0_EN; | 772 | reg |= MCP794XX_BIT_ALM0_EN; |
772 | else | 773 | else |
773 | reg &= ~MCP7941X_BIT_ALM0_EN; | 774 | reg &= ~MCP794XX_BIT_ALM0_EN; |
774 | 775 | ||
775 | return i2c_smbus_write_byte_data(client, MCP7941X_REG_CONTROL, reg); | 776 | return i2c_smbus_write_byte_data(client, MCP794XX_REG_CONTROL, reg); |
776 | } | 777 | } |
777 | 778 | ||
778 | static const struct rtc_class_ops mcp7941x_rtc_ops = { | 779 | static const struct rtc_class_ops mcp794xx_rtc_ops = { |
779 | .read_time = ds1307_get_time, | 780 | .read_time = ds1307_get_time, |
780 | .set_time = ds1307_set_time, | 781 | .set_time = ds1307_set_time, |
781 | .read_alarm = mcp7941x_read_alarm, | 782 | .read_alarm = mcp794xx_read_alarm, |
782 | .set_alarm = mcp7941x_set_alarm, | 783 | .set_alarm = mcp794xx_set_alarm, |
783 | .alarm_irq_enable = mcp7941x_alarm_irq_enable, | 784 | .alarm_irq_enable = mcp794xx_alarm_irq_enable, |
784 | }; | 785 | }; |
785 | 786 | ||
786 | /*----------------------------------------------------------------------*/ | 787 | /*----------------------------------------------------------------------*/ |
@@ -1049,10 +1050,10 @@ static int ds1307_probe(struct i2c_client *client, | |||
1049 | case ds_1388: | 1050 | case ds_1388: |
1050 | ds1307->offset = 1; /* Seconds starts at 1 */ | 1051 | ds1307->offset = 1; /* Seconds starts at 1 */ |
1051 | break; | 1052 | break; |
1052 | case mcp7941x: | 1053 | case mcp794xx: |
1053 | rtc_ops = &mcp7941x_rtc_ops; | 1054 | rtc_ops = &mcp794xx_rtc_ops; |
1054 | if (ds1307->client->irq > 0 && chip->alarm) { | 1055 | if (ds1307->client->irq > 0 && chip->alarm) { |
1055 | INIT_WORK(&ds1307->work, mcp7941x_work); | 1056 | INIT_WORK(&ds1307->work, mcp794xx_work); |
1056 | want_irq = true; | 1057 | want_irq = true; |
1057 | } | 1058 | } |
1058 | break; | 1059 | break; |
@@ -1117,18 +1118,18 @@ read_rtc: | |||
1117 | dev_warn(&client->dev, "SET TIME!\n"); | 1118 | dev_warn(&client->dev, "SET TIME!\n"); |
1118 | } | 1119 | } |
1119 | break; | 1120 | break; |
1120 | case mcp7941x: | 1121 | case mcp794xx: |
1121 | /* make sure that the backup battery is enabled */ | 1122 | /* make sure that the backup battery is enabled */ |
1122 | if (!(ds1307->regs[DS1307_REG_WDAY] & MCP7941X_BIT_VBATEN)) { | 1123 | if (!(ds1307->regs[DS1307_REG_WDAY] & MCP794XX_BIT_VBATEN)) { |
1123 | i2c_smbus_write_byte_data(client, DS1307_REG_WDAY, | 1124 | i2c_smbus_write_byte_data(client, DS1307_REG_WDAY, |
1124 | ds1307->regs[DS1307_REG_WDAY] | 1125 | ds1307->regs[DS1307_REG_WDAY] |
1125 | | MCP7941X_BIT_VBATEN); | 1126 | | MCP794XX_BIT_VBATEN); |
1126 | } | 1127 | } |
1127 | 1128 | ||
1128 | /* clock halted? turn it on, so clock can tick. */ | 1129 | /* clock halted? turn it on, so clock can tick. */ |
1129 | if (!(tmp & MCP7941X_BIT_ST)) { | 1130 | if (!(tmp & MCP794XX_BIT_ST)) { |
1130 | i2c_smbus_write_byte_data(client, DS1307_REG_SECS, | 1131 | i2c_smbus_write_byte_data(client, DS1307_REG_SECS, |
1131 | MCP7941X_BIT_ST); | 1132 | MCP794XX_BIT_ST); |
1132 | dev_warn(&client->dev, "SET TIME!\n"); | 1133 | dev_warn(&client->dev, "SET TIME!\n"); |
1133 | goto read_rtc; | 1134 | goto read_rtc; |
1134 | } | 1135 | } |
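
Note on the probe hunk above: it keeps an MCP794xx ticking across power loss by forcing two control bits. Per the MCP7940x/MCP7941x datasheets, ST (oscillator start) sits in bit 7 of the seconds register and VBATEN (battery backup enable) in bit 3 of the weekday register, which is why the driver rewrites DS1307_REG_SECS and DS1307_REG_WDAY. A rough sketch of that sequence in isolation, illustrative only (not part of the patch), reusing the renamed macros and the register snapshot the driver has already read:

	/* Illustrative only -- mirrors the probe logic above. */
	static int mcp794xx_enable_osc_and_battery(struct i2c_client *client,
						   u8 wday, u8 secs)
	{
		int ret = 0;

		if (!(wday & MCP794XX_BIT_VBATEN))	/* backup battery disabled */
			ret = i2c_smbus_write_byte_data(client, DS1307_REG_WDAY,
							wday | MCP794XX_BIT_VBATEN);
		if (ret >= 0 && !(secs & MCP794XX_BIT_ST))	/* oscillator halted */
			ret = i2c_smbus_write_byte_data(client, DS1307_REG_SECS,
							MCP794XX_BIT_ST);
		return ret < 0 ? ret : 0;
	}
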
diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c index 9e6e14fb53d7..8605fde394b2 100644 --- a/drivers/rtc/rtc-ds1374.c +++ b/drivers/rtc/rtc-ds1374.c | |||
@@ -4,6 +4,7 @@ | |||
4 | * Based on code by Randy Vinson <rvinson@mvista.com>, | 4 | * Based on code by Randy Vinson <rvinson@mvista.com>, |
5 | * which was based on the m41t00.c by Mark Greer <mgreer@mvista.com>. | 5 | * which was based on the m41t00.c by Mark Greer <mgreer@mvista.com>. |
6 | * | 6 | * |
7 | * Copyright (C) 2014 Rose Technology | ||
7 | * Copyright (C) 2006-2007 Freescale Semiconductor | 8 | * Copyright (C) 2006-2007 Freescale Semiconductor |
8 | * | 9 | * |
9 | * 2005 (c) MontaVista Software, Inc. This file is licensed under | 10 | * 2005 (c) MontaVista Software, Inc. This file is licensed under |
@@ -26,6 +27,13 @@ | |||
26 | #include <linux/workqueue.h> | 27 | #include <linux/workqueue.h> |
27 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
28 | #include <linux/pm.h> | 29 | #include <linux/pm.h> |
30 | #ifdef CONFIG_RTC_DRV_DS1374_WDT | ||
31 | #include <linux/fs.h> | ||
32 | #include <linux/ioctl.h> | ||
33 | #include <linux/miscdevice.h> | ||
34 | #include <linux/reboot.h> | ||
35 | #include <linux/watchdog.h> | ||
36 | #endif | ||
29 | 37 | ||
30 | #define DS1374_REG_TOD0 0x00 /* Time of Day */ | 38 | #define DS1374_REG_TOD0 0x00 /* Time of Day */ |
31 | #define DS1374_REG_TOD1 0x01 | 39 | #define DS1374_REG_TOD1 0x01 |
@@ -49,6 +57,14 @@ static const struct i2c_device_id ds1374_id[] = { | |||
49 | }; | 57 | }; |
50 | MODULE_DEVICE_TABLE(i2c, ds1374_id); | 58 | MODULE_DEVICE_TABLE(i2c, ds1374_id); |
51 | 59 | ||
60 | #ifdef CONFIG_OF | ||
61 | static const struct of_device_id ds1374_of_match[] = { | ||
62 | { .compatible = "dallas,ds1374" }, | ||
63 | { } | ||
64 | }; | ||
65 | MODULE_DEVICE_TABLE(of, ds1374_of_match); | ||
66 | #endif | ||
67 | |||
52 | struct ds1374 { | 68 | struct ds1374 { |
53 | struct i2c_client *client; | 69 | struct i2c_client *client; |
54 | struct rtc_device *rtc; | 70 | struct rtc_device *rtc; |
@@ -162,6 +178,7 @@ static int ds1374_set_time(struct device *dev, struct rtc_time *time) | |||
162 | return ds1374_write_rtc(client, itime, DS1374_REG_TOD0, 4); | 178 | return ds1374_write_rtc(client, itime, DS1374_REG_TOD0, 4); |
163 | } | 179 | } |
164 | 180 | ||
181 | #ifndef CONFIG_RTC_DRV_DS1374_WDT | ||
165 | /* The ds1374 has a decrementer for an alarm, rather than a comparator. | 182 | /* The ds1374 has a decrementer for an alarm, rather than a comparator. |
166 | * If the time of day is changed, then the alarm will need to be | 183 | * If the time of day is changed, then the alarm will need to be |
167 | * reset. | 184 | * reset. |
@@ -263,6 +280,7 @@ out: | |||
263 | mutex_unlock(&ds1374->mutex); | 280 | mutex_unlock(&ds1374->mutex); |
264 | return ret; | 281 | return ret; |
265 | } | 282 | } |
283 | #endif | ||
266 | 284 | ||
267 | static irqreturn_t ds1374_irq(int irq, void *dev_id) | 285 | static irqreturn_t ds1374_irq(int irq, void *dev_id) |
268 | { | 286 | { |
@@ -307,6 +325,7 @@ unlock: | |||
307 | mutex_unlock(&ds1374->mutex); | 325 | mutex_unlock(&ds1374->mutex); |
308 | } | 326 | } |
309 | 327 | ||
328 | #ifndef CONFIG_RTC_DRV_DS1374_WDT | ||
310 | static int ds1374_alarm_irq_enable(struct device *dev, unsigned int enabled) | 329 | static int ds1374_alarm_irq_enable(struct device *dev, unsigned int enabled) |
311 | { | 330 | { |
312 | struct i2c_client *client = to_i2c_client(dev); | 331 | struct i2c_client *client = to_i2c_client(dev); |
@@ -331,15 +350,260 @@ out: | |||
331 | mutex_unlock(&ds1374->mutex); | 350 | mutex_unlock(&ds1374->mutex); |
332 | return ret; | 351 | return ret; |
333 | } | 352 | } |
353 | #endif | ||
334 | 354 | ||
335 | static const struct rtc_class_ops ds1374_rtc_ops = { | 355 | static const struct rtc_class_ops ds1374_rtc_ops = { |
336 | .read_time = ds1374_read_time, | 356 | .read_time = ds1374_read_time, |
337 | .set_time = ds1374_set_time, | 357 | .set_time = ds1374_set_time, |
358 | #ifndef CONFIG_RTC_DRV_DS1374_WDT | ||
338 | .read_alarm = ds1374_read_alarm, | 359 | .read_alarm = ds1374_read_alarm, |
339 | .set_alarm = ds1374_set_alarm, | 360 | .set_alarm = ds1374_set_alarm, |
340 | .alarm_irq_enable = ds1374_alarm_irq_enable, | 361 | .alarm_irq_enable = ds1374_alarm_irq_enable, |
362 | #endif | ||
363 | }; | ||
364 | |||
365 | #ifdef CONFIG_RTC_DRV_DS1374_WDT | ||
366 | /* | ||
367 | ***************************************************************************** | ||
368 | * | ||
369 | * Watchdog Driver | ||
370 | * | ||
371 | ***************************************************************************** | ||
372 | */ | ||
373 | static struct i2c_client *save_client; | ||
374 | /* Default margin */ | ||
375 | #define WD_TIMO 131762 | ||
376 | |||
377 | #define DRV_NAME "DS1374 Watchdog" | ||
378 | |||
379 | static int wdt_margin = WD_TIMO; | ||
380 | static unsigned long wdt_is_open; | ||
381 | module_param(wdt_margin, int, 0); | ||
382 | MODULE_PARM_DESC(wdt_margin, "Watchdog timeout in seconds (default 32s)"); | ||
383 | |||
384 | static const struct watchdog_info ds1374_wdt_info = { | ||
385 | .identity = "DS1374 WTD", | ||
386 | .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | | ||
387 | WDIOF_MAGICCLOSE, | ||
341 | }; | 388 | }; |
342 | 389 | ||
390 | static int ds1374_wdt_settimeout(unsigned int timeout) | ||
391 | { | ||
392 | int ret = -ENOIOCTLCMD; | ||
393 | int cr; | ||
394 | |||
395 | ret = cr = i2c_smbus_read_byte_data(save_client, DS1374_REG_CR); | ||
396 | if (ret < 0) | ||
397 | goto out; | ||
398 | |||
399 | /* Disable any existing watchdog/alarm before setting the new one */ | ||
400 | cr &= ~DS1374_REG_CR_WACE; | ||
401 | |||
402 | ret = i2c_smbus_write_byte_data(save_client, DS1374_REG_CR, cr); | ||
403 | if (ret < 0) | ||
404 | goto out; | ||
405 | |||
406 | /* Set new watchdog time */ | ||
407 | ret = ds1374_write_rtc(save_client, timeout, DS1374_REG_WDALM0, 3); | ||
408 | if (ret) { | ||
409 | pr_info("rtc-ds1374 - couldn't set new watchdog time\n"); | ||
410 | goto out; | ||
411 | } | ||
412 | |||
413 | /* Enable watchdog timer */ | ||
414 | cr |= DS1374_REG_CR_WACE | DS1374_REG_CR_WDALM; | ||
415 | cr &= ~DS1374_REG_CR_AIE; | ||
416 | |||
417 | ret = i2c_smbus_write_byte_data(save_client, DS1374_REG_CR, cr); | ||
418 | if (ret < 0) | ||
419 | goto out; | ||
420 | |||
421 | return 0; | ||
422 | out: | ||
423 | return ret; | ||
424 | } | ||
425 | |||
426 | |||
427 | /* | ||
428 | * Reload the watchdog timer. (ie, pat the watchdog) | ||
429 | */ | ||
430 | static void ds1374_wdt_ping(void) | ||
431 | { | ||
432 | u32 val; | ||
433 | int ret = 0; | ||
434 | |||
435 | ret = ds1374_read_rtc(save_client, &val, DS1374_REG_WDALM0, 3); | ||
436 | if (ret) | ||
437 | pr_info("WD TICK FAIL!!!!!!!!!! %i\n", ret); | ||
438 | } | ||
439 | |||
440 | static void ds1374_wdt_disable(void) | ||
441 | { | ||
442 | int ret = -ENOIOCTLCMD; | ||
443 | int cr; | ||
444 | |||
445 | cr = i2c_smbus_read_byte_data(save_client, DS1374_REG_CR); | ||
446 | /* Disable watchdog timer */ | ||
447 | cr &= ~DS1374_REG_CR_WACE; | ||
448 | |||
449 | ret = i2c_smbus_write_byte_data(save_client, DS1374_REG_CR, cr); | ||
450 | } | ||
451 | |||
452 | /* | ||
453 | * Watchdog device is opened, and watchdog starts running. | ||
454 | */ | ||
455 | static int ds1374_wdt_open(struct inode *inode, struct file *file) | ||
456 | { | ||
457 | struct ds1374 *ds1374 = i2c_get_clientdata(save_client); | ||
458 | |||
459 | if (MINOR(inode->i_rdev) == WATCHDOG_MINOR) { | ||
460 | mutex_lock(&ds1374->mutex); | ||
461 | if (test_and_set_bit(0, &wdt_is_open)) { | ||
462 | mutex_unlock(&ds1374->mutex); | ||
463 | return -EBUSY; | ||
464 | } | ||
465 | /* | ||
466 | * Activate | ||
467 | */ | ||
468 | wdt_is_open = 1; | ||
469 | mutex_unlock(&ds1374->mutex); | ||
470 | return nonseekable_open(inode, file); | ||
471 | } | ||
472 | return -ENODEV; | ||
473 | } | ||
474 | |||
475 | /* | ||
476 | * Close the watchdog device. | ||
477 | */ | ||
478 | static int ds1374_wdt_release(struct inode *inode, struct file *file) | ||
479 | { | ||
480 | if (MINOR(inode->i_rdev) == WATCHDOG_MINOR) | ||
481 | clear_bit(0, &wdt_is_open); | ||
482 | |||
483 | return 0; | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Pat the watchdog whenever device is written to. | ||
488 | */ | ||
489 | static ssize_t ds1374_wdt_write(struct file *file, const char __user *data, | ||
490 | size_t len, loff_t *ppos) | ||
491 | { | ||
492 | if (len) { | ||
493 | ds1374_wdt_ping(); | ||
494 | return 1; | ||
495 | } | ||
496 | return 0; | ||
497 | } | ||
498 | |||
499 | static ssize_t ds1374_wdt_read(struct file *file, char __user *data, | ||
500 | size_t len, loff_t *ppos) | ||
501 | { | ||
502 | return 0; | ||
503 | } | ||
504 | |||
505 | /* | ||
506 | * Handle commands from user-space. | ||
507 | */ | ||
508 | static long ds1374_wdt_ioctl(struct file *file, unsigned int cmd, | ||
509 | unsigned long arg) | ||
510 | { | ||
511 | int new_margin, options; | ||
512 | |||
513 | switch (cmd) { | ||
514 | case WDIOC_GETSUPPORT: | ||
515 | return copy_to_user((struct watchdog_info __user *)arg, | ||
516 | &ds1374_wdt_info, sizeof(ds1374_wdt_info)) ? -EFAULT : 0; | ||
517 | |||
518 | case WDIOC_GETSTATUS: | ||
519 | case WDIOC_GETBOOTSTATUS: | ||
520 | return put_user(0, (int __user *)arg); | ||
521 | case WDIOC_KEEPALIVE: | ||
522 | ds1374_wdt_ping(); | ||
523 | return 0; | ||
524 | case WDIOC_SETTIMEOUT: | ||
525 | if (get_user(new_margin, (int __user *)arg)) | ||
526 | return -EFAULT; | ||
527 | |||
528 | if (new_margin < 1 || new_margin > 16777216) | ||
529 | return -EINVAL; | ||
530 | |||
531 | wdt_margin = new_margin; | ||
532 | ds1374_wdt_settimeout(new_margin); | ||
533 | ds1374_wdt_ping(); | ||
534 | /* fallthrough */ | ||
535 | case WDIOC_GETTIMEOUT: | ||
536 | return put_user(wdt_margin, (int __user *)arg); | ||
537 | case WDIOC_SETOPTIONS: | ||
538 | if (copy_from_user(&options, (int __user *)arg, sizeof(int))) | ||
539 | return -EFAULT; | ||
540 | |||
541 | if (options & WDIOS_DISABLECARD) { | ||
542 | pr_info("rtc-ds1374: disable watchdog\n"); | ||
543 | ds1374_wdt_disable(); | ||
544 | } | ||
545 | |||
546 | if (options & WDIOS_ENABLECARD) { | ||
547 | pr_info("rtc-ds1374: enable watchdog\n"); | ||
548 | ds1374_wdt_settimeout(wdt_margin); | ||
549 | ds1374_wdt_ping(); | ||
550 | } | ||
551 | |||
552 | return -EINVAL; | ||
553 | } | ||
554 | return -ENOTTY; | ||
555 | } | ||
556 | |||
557 | static long ds1374_wdt_unlocked_ioctl(struct file *file, unsigned int cmd, | ||
558 | unsigned long arg) | ||
559 | { | ||
560 | int ret; | ||
561 | struct ds1374 *ds1374 = i2c_get_clientdata(save_client); | ||
562 | |||
563 | mutex_lock(&ds1374->mutex); | ||
564 | ret = ds1374_wdt_ioctl(file, cmd, arg); | ||
565 | mutex_unlock(&ds1374->mutex); | ||
566 | |||
567 | return ret; | ||
568 | } | ||
569 | |||
570 | static int ds1374_wdt_notify_sys(struct notifier_block *this, | ||
571 | unsigned long code, void *unused) | ||
572 | { | ||
573 | if (code == SYS_DOWN || code == SYS_HALT) | ||
574 | /* Disable Watchdog */ | ||
575 | ds1374_wdt_disable(); | ||
576 | return NOTIFY_DONE; | ||
577 | } | ||
578 | |||
579 | static const struct file_operations ds1374_wdt_fops = { | ||
580 | .owner = THIS_MODULE, | ||
581 | .read = ds1374_wdt_read, | ||
582 | .unlocked_ioctl = ds1374_wdt_unlocked_ioctl, | ||
583 | .write = ds1374_wdt_write, | ||
584 | .open = ds1374_wdt_open, | ||
585 | .release = ds1374_wdt_release, | ||
586 | .llseek = no_llseek, | ||
587 | }; | ||
588 | |||
589 | static struct miscdevice ds1374_miscdev = { | ||
590 | .minor = WATCHDOG_MINOR, | ||
591 | .name = "watchdog", | ||
592 | .fops = &ds1374_wdt_fops, | ||
593 | }; | ||
594 | |||
595 | static struct notifier_block ds1374_wdt_notifier = { | ||
596 | .notifier_call = ds1374_wdt_notify_sys, | ||
597 | }; | ||
598 | |||
599 | #endif /*CONFIG_RTC_DRV_DS1374_WDT*/ | ||
600 | /* | ||
601 | ***************************************************************************** | ||
602 | * | ||
603 | * Driver Interface | ||
604 | * | ||
605 | ***************************************************************************** | ||
606 | */ | ||
343 | static int ds1374_probe(struct i2c_client *client, | 607 | static int ds1374_probe(struct i2c_client *client, |
344 | const struct i2c_device_id *id) | 608 | const struct i2c_device_id *id) |
345 | { | 609 | { |
@@ -378,12 +642,33 @@ static int ds1374_probe(struct i2c_client *client, | |||
378 | return PTR_ERR(ds1374->rtc); | 642 | return PTR_ERR(ds1374->rtc); |
379 | } | 643 | } |
380 | 644 | ||
645 | #ifdef CONFIG_RTC_DRV_DS1374_WDT | ||
646 | save_client = client; | ||
647 | ret = misc_register(&ds1374_miscdev); | ||
648 | if (ret) | ||
649 | return ret; | ||
650 | ret = register_reboot_notifier(&ds1374_wdt_notifier); | ||
651 | if (ret) { | ||
652 | misc_deregister(&ds1374_miscdev); | ||
653 | return ret; | ||
654 | } | ||
655 | ds1374_wdt_settimeout(131072); | ||
656 | #endif | ||
657 | |||
381 | return 0; | 658 | return 0; |
382 | } | 659 | } |
383 | 660 | ||
384 | static int ds1374_remove(struct i2c_client *client) | 661 | static int ds1374_remove(struct i2c_client *client) |
385 | { | 662 | { |
386 | struct ds1374 *ds1374 = i2c_get_clientdata(client); | 663 | struct ds1374 *ds1374 = i2c_get_clientdata(client); |
664 | #ifdef CONFIG_RTC_DRV_DS1374_WDT | ||
665 | int res; | ||
666 | |||
667 | res = misc_deregister(&ds1374_miscdev); | ||
668 | if (!res) | ||
669 | ds1374_miscdev.parent = NULL; | ||
670 | unregister_reboot_notifier(&ds1374_wdt_notifier); | ||
671 | #endif | ||
387 | 672 | ||
388 | if (client->irq > 0) { | 673 | if (client->irq > 0) { |
389 | mutex_lock(&ds1374->mutex); | 674 | mutex_lock(&ds1374->mutex); |
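
A note on units in the new DS1374 watchdog code: the value passed to ds1374_wdt_settimeout() is written straight into the three WDALM registers, so it is expressed in ticks of the chip's 24-bit watchdog counter rather than in seconds (hence the 1..16777216 bound enforced in WDIOC_SETTIMEOUT). Assuming the 4096 Hz watchdog clock described in the DS1374 datasheet, the WD_TIMO default of 131762 ticks is roughly the 32 s mentioned in the module parameter description, and the 131072 written at probe time is exactly 32 s. A hypothetical helper, not part of the patch, converting seconds to counter ticks under that assumption:

	static inline u32 ds1374_wdt_secs_to_ticks(unsigned int secs)
	{
		/* DS1374 watchdog counter is clocked at 4096 Hz per the datasheet. */
		u32 ticks = secs * 4096;

		return min_t(u32, ticks, 0xffffff);	/* counter is 24 bits wide */
	}
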
diff --git a/drivers/rtc/rtc-isl12057.c b/drivers/rtc/rtc-isl12057.c index 455b601d731d..6e1fcfb5d7e6 100644 --- a/drivers/rtc/rtc-isl12057.c +++ b/drivers/rtc/rtc-isl12057.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #define ISL12057_REG_RTC_DW 0x03 /* Day of the Week */ | 41 | #define ISL12057_REG_RTC_DW 0x03 /* Day of the Week */ |
42 | #define ISL12057_REG_RTC_DT 0x04 /* Date */ | 42 | #define ISL12057_REG_RTC_DT 0x04 /* Date */ |
43 | #define ISL12057_REG_RTC_MO 0x05 /* Month */ | 43 | #define ISL12057_REG_RTC_MO 0x05 /* Month */ |
44 | #define ISL12057_REG_RTC_MO_CEN BIT(7) /* Century bit */ | ||
44 | #define ISL12057_REG_RTC_YR 0x06 /* Year */ | 45 | #define ISL12057_REG_RTC_YR 0x06 /* Year */ |
45 | #define ISL12057_RTC_SEC_LEN 7 | 46 | #define ISL12057_RTC_SEC_LEN 7 |
46 | 47 | ||
@@ -88,7 +89,7 @@ static void isl12057_rtc_regs_to_tm(struct rtc_time *tm, u8 *regs) | |||
88 | tm->tm_min = bcd2bin(regs[ISL12057_REG_RTC_MN]); | 89 | tm->tm_min = bcd2bin(regs[ISL12057_REG_RTC_MN]); |
89 | 90 | ||
90 | if (regs[ISL12057_REG_RTC_HR] & ISL12057_REG_RTC_HR_MIL) { /* AM/PM */ | 91 | if (regs[ISL12057_REG_RTC_HR] & ISL12057_REG_RTC_HR_MIL) { /* AM/PM */ |
91 | tm->tm_hour = bcd2bin(regs[ISL12057_REG_RTC_HR] & 0x0f); | 92 | tm->tm_hour = bcd2bin(regs[ISL12057_REG_RTC_HR] & 0x1f); |
92 | if (regs[ISL12057_REG_RTC_HR] & ISL12057_REG_RTC_HR_PM) | 93 | if (regs[ISL12057_REG_RTC_HR] & ISL12057_REG_RTC_HR_PM) |
93 | tm->tm_hour += 12; | 94 | tm->tm_hour += 12; |
94 | } else { /* 24 hour mode */ | 95 | } else { /* 24 hour mode */ |
@@ -97,26 +98,37 @@ static void isl12057_rtc_regs_to_tm(struct rtc_time *tm, u8 *regs) | |||
97 | 98 | ||
98 | tm->tm_mday = bcd2bin(regs[ISL12057_REG_RTC_DT]); | 99 | tm->tm_mday = bcd2bin(regs[ISL12057_REG_RTC_DT]); |
99 | tm->tm_wday = bcd2bin(regs[ISL12057_REG_RTC_DW]) - 1; /* starts at 1 */ | 100 | tm->tm_wday = bcd2bin(regs[ISL12057_REG_RTC_DW]) - 1; /* starts at 1 */ |
100 | tm->tm_mon = bcd2bin(regs[ISL12057_REG_RTC_MO]) - 1; /* starts at 1 */ | 101 | tm->tm_mon = bcd2bin(regs[ISL12057_REG_RTC_MO] & 0x1f) - 1; /* ditto */ |
101 | tm->tm_year = bcd2bin(regs[ISL12057_REG_RTC_YR]) + 100; | 102 | tm->tm_year = bcd2bin(regs[ISL12057_REG_RTC_YR]) + 100; |
103 | |||
104 | /* Check if years register has overflown from 99 to 00 */ | ||
105 | if (regs[ISL12057_REG_RTC_MO] & ISL12057_REG_RTC_MO_CEN) | ||
106 | tm->tm_year += 100; | ||
102 | } | 107 | } |
103 | 108 | ||
104 | static int isl12057_rtc_tm_to_regs(u8 *regs, struct rtc_time *tm) | 109 | static int isl12057_rtc_tm_to_regs(u8 *regs, struct rtc_time *tm) |
105 | { | 110 | { |
111 | u8 century_bit; | ||
112 | |||
106 | /* | 113 | /* |
107 | * The clock has an 8 bit wide bcd-coded register for the year. | 114 | * The clock has an 8 bit wide bcd-coded register for the year. |
115 | * It also has a century bit encoded in MO flag which provides | ||
116 | * information about overflow of year register from 99 to 00. | ||
108 | * tm_year is an offset from 1900 and we are interested in the | 117 | * tm_year is an offset from 1900 and we are interested in the |
109 | * 2000-2099 range, so any value less than 100 is invalid. | 118 | * 2000-2199 range, so any value less than 100 or larger than |
119 | * 299 is invalid. | ||
110 | */ | 120 | */ |
111 | if (tm->tm_year < 100) | 121 | if (tm->tm_year < 100 || tm->tm_year > 299) |
112 | return -EINVAL; | 122 | return -EINVAL; |
113 | 123 | ||
124 | century_bit = (tm->tm_year > 199) ? ISL12057_REG_RTC_MO_CEN : 0; | ||
125 | |||
114 | regs[ISL12057_REG_RTC_SC] = bin2bcd(tm->tm_sec); | 126 | regs[ISL12057_REG_RTC_SC] = bin2bcd(tm->tm_sec); |
115 | regs[ISL12057_REG_RTC_MN] = bin2bcd(tm->tm_min); | 127 | regs[ISL12057_REG_RTC_MN] = bin2bcd(tm->tm_min); |
116 | regs[ISL12057_REG_RTC_HR] = bin2bcd(tm->tm_hour); /* 24-hour format */ | 128 | regs[ISL12057_REG_RTC_HR] = bin2bcd(tm->tm_hour); /* 24-hour format */ |
117 | regs[ISL12057_REG_RTC_DT] = bin2bcd(tm->tm_mday); | 129 | regs[ISL12057_REG_RTC_DT] = bin2bcd(tm->tm_mday); |
118 | regs[ISL12057_REG_RTC_MO] = bin2bcd(tm->tm_mon + 1); | 130 | regs[ISL12057_REG_RTC_MO] = bin2bcd(tm->tm_mon + 1) | century_bit; |
119 | regs[ISL12057_REG_RTC_YR] = bin2bcd(tm->tm_year - 100); | 131 | regs[ISL12057_REG_RTC_YR] = bin2bcd(tm->tm_year % 100); |
120 | regs[ISL12057_REG_RTC_DW] = bin2bcd(tm->tm_wday + 1); | 132 | regs[ISL12057_REG_RTC_DW] = bin2bcd(tm->tm_wday + 1); |
121 | 133 | ||
122 | return 0; | 134 | return 0; |
@@ -152,17 +164,33 @@ static int isl12057_rtc_read_time(struct device *dev, struct rtc_time *tm) | |||
152 | { | 164 | { |
153 | struct isl12057_rtc_data *data = dev_get_drvdata(dev); | 165 | struct isl12057_rtc_data *data = dev_get_drvdata(dev); |
154 | u8 regs[ISL12057_RTC_SEC_LEN]; | 166 | u8 regs[ISL12057_RTC_SEC_LEN]; |
167 | unsigned int sr; | ||
155 | int ret; | 168 | int ret; |
156 | 169 | ||
157 | mutex_lock(&data->lock); | 170 | mutex_lock(&data->lock); |
171 | ret = regmap_read(data->regmap, ISL12057_REG_SR, &sr); | ||
172 | if (ret) { | ||
173 | dev_err(dev, "%s: unable to read oscillator status flag (%d)\n", | ||
174 | __func__, ret); | ||
175 | goto out; | ||
176 | } else { | ||
177 | if (sr & ISL12057_REG_SR_OSF) { | ||
178 | ret = -ENODATA; | ||
179 | goto out; | ||
180 | } | ||
181 | } | ||
182 | |||
158 | ret = regmap_bulk_read(data->regmap, ISL12057_REG_RTC_SC, regs, | 183 | ret = regmap_bulk_read(data->regmap, ISL12057_REG_RTC_SC, regs, |
159 | ISL12057_RTC_SEC_LEN); | 184 | ISL12057_RTC_SEC_LEN); |
185 | if (ret) | ||
186 | dev_err(dev, "%s: unable to read RTC time section (%d)\n", | ||
187 | __func__, ret); | ||
188 | |||
189 | out: | ||
160 | mutex_unlock(&data->lock); | 190 | mutex_unlock(&data->lock); |
161 | 191 | ||
162 | if (ret) { | 192 | if (ret) |
163 | dev_err(dev, "%s: RTC read failed\n", __func__); | ||
164 | return ret; | 193 | return ret; |
165 | } | ||
166 | 194 | ||
167 | isl12057_rtc_regs_to_tm(tm, regs); | 195 | isl12057_rtc_regs_to_tm(tm, regs); |
168 | 196 | ||
@@ -182,10 +210,24 @@ static int isl12057_rtc_set_time(struct device *dev, struct rtc_time *tm) | |||
182 | mutex_lock(&data->lock); | 210 | mutex_lock(&data->lock); |
183 | ret = regmap_bulk_write(data->regmap, ISL12057_REG_RTC_SC, regs, | 211 | ret = regmap_bulk_write(data->regmap, ISL12057_REG_RTC_SC, regs, |
184 | ISL12057_RTC_SEC_LEN); | 212 | ISL12057_RTC_SEC_LEN); |
185 | mutex_unlock(&data->lock); | 213 | if (ret) { |
214 | dev_err(dev, "%s: unable to write RTC time section (%d)\n", | ||
215 | __func__, ret); | ||
216 | goto out; | ||
217 | } | ||
186 | 218 | ||
187 | if (ret) | 219 | /* |
188 | dev_err(dev, "%s: RTC write failed\n", __func__); | 220 | * Now that RTC time has been updated, let's clear oscillator |
221 | * failure flag, if needed. | ||
222 | */ | ||
223 | ret = regmap_update_bits(data->regmap, ISL12057_REG_SR, | ||
224 | ISL12057_REG_SR_OSF, 0); | ||
225 | if (ret < 0) | ||
226 | dev_err(dev, "%s: unable to clear osc. failure bit (%d)\n", | ||
227 | __func__, ret); | ||
228 | |||
229 | out: | ||
230 | mutex_unlock(&data->lock); | ||
189 | 231 | ||
190 | return ret; | 232 | return ret; |
191 | } | 233 | } |
@@ -203,15 +245,8 @@ static int isl12057_check_rtc_status(struct device *dev, struct regmap *regmap) | |||
203 | ret = regmap_update_bits(regmap, ISL12057_REG_INT, | 245 | ret = regmap_update_bits(regmap, ISL12057_REG_INT, |
204 | ISL12057_REG_INT_EOSC, 0); | 246 | ISL12057_REG_INT_EOSC, 0); |
205 | if (ret < 0) { | 247 | if (ret < 0) { |
206 | dev_err(dev, "Unable to enable oscillator\n"); | 248 | dev_err(dev, "%s: unable to enable oscillator (%d)\n", |
207 | return ret; | 249 | __func__, ret); |
208 | } | ||
209 | |||
210 | /* Clear oscillator failure bit if needed */ | ||
211 | ret = regmap_update_bits(regmap, ISL12057_REG_SR, | ||
212 | ISL12057_REG_SR_OSF, 0); | ||
213 | if (ret < 0) { | ||
214 | dev_err(dev, "Unable to clear oscillator failure bit\n"); | ||
215 | return ret; | 250 | return ret; |
216 | } | 251 | } |
217 | 252 | ||
@@ -219,7 +254,8 @@ static int isl12057_check_rtc_status(struct device *dev, struct regmap *regmap) | |||
219 | ret = regmap_update_bits(regmap, ISL12057_REG_SR, | 254 | ret = regmap_update_bits(regmap, ISL12057_REG_SR, |
220 | ISL12057_REG_SR_A1F, 0); | 255 | ISL12057_REG_SR_A1F, 0); |
221 | if (ret < 0) { | 256 | if (ret < 0) { |
222 | dev_err(dev, "Unable to clear alarm bit\n"); | 257 | dev_err(dev, "%s: unable to clear alarm bit (%d)\n", |
258 | __func__, ret); | ||
223 | return ret; | 259 | return ret; |
224 | } | 260 | } |
225 | 261 | ||
@@ -253,7 +289,8 @@ static int isl12057_probe(struct i2c_client *client, | |||
253 | regmap = devm_regmap_init_i2c(client, &isl12057_rtc_regmap_config); | 289 | regmap = devm_regmap_init_i2c(client, &isl12057_rtc_regmap_config); |
254 | if (IS_ERR(regmap)) { | 290 | if (IS_ERR(regmap)) { |
255 | ret = PTR_ERR(regmap); | 291 | ret = PTR_ERR(regmap); |
256 | dev_err(dev, "regmap allocation failed: %d\n", ret); | 292 | dev_err(dev, "%s: regmap allocation failed (%d)\n", |
293 | __func__, ret); | ||
257 | return ret; | 294 | return ret; |
258 | } | 295 | } |
259 | 296 | ||
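
The century handling added above is easiest to see with a worked example: the single ISL12057_REG_RTC_MO_CEN bit distinguishes the 2000-2099 and 2100-2199 ranges, so tm_year (years since 1900) must fall in 100..299. A small sketch of the round trip, illustrative only, reusing the register names and helpers from the patch:

	/* Writing June 2115 (tm_mon = 5, tm_year = 215): */
	u8 mo = bin2bcd(5 + 1) | ISL12057_REG_RTC_MO_CEN;	/* year > 2099, set CEN */
	u8 yr = bin2bcd(215 % 100);				/* 0x15 */

	/* Reading it back: */
	int year = bcd2bin(yr) + 100;				/* 115 */
	if (mo & ISL12057_REG_RTC_MO_CEN)
		year += 100;					/* 215, i.e. the year 2115 */
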
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index 21142e6574a9..4f1c6ca97211 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c | |||
@@ -1,10 +1,11 @@ | |||
1 | /* | 1 | /* |
2 | * TI OMAP1 Real Time Clock interface for Linux | 2 | * TI OMAP Real Time Clock interface for Linux |
3 | * | 3 | * |
4 | * Copyright (C) 2003 MontaVista Software, Inc. | 4 | * Copyright (C) 2003 MontaVista Software, Inc. |
5 | * Author: George G. Davis <gdavis@mvista.com> or <source@mvista.com> | 5 | * Author: George G. Davis <gdavis@mvista.com> or <source@mvista.com> |
6 | * | 6 | * |
7 | * Copyright (C) 2006 David Brownell (new RTC framework) | 7 | * Copyright (C) 2006 David Brownell (new RTC framework) |
8 | * Copyright (C) 2014 Johan Hovold <johan@kernel.org> | ||
8 | * | 9 | * |
9 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
10 | * modify it under the terms of the GNU General Public License | 11 | * modify it under the terms of the GNU General Public License |
@@ -25,7 +26,8 @@ | |||
25 | #include <linux/pm_runtime.h> | 26 | #include <linux/pm_runtime.h> |
26 | #include <linux/io.h> | 27 | #include <linux/io.h> |
27 | 28 | ||
28 | /* The OMAP1 RTC is a year/month/day/hours/minutes/seconds BCD clock | 29 | /* |
30 | * The OMAP RTC is a year/month/day/hours/minutes/seconds BCD clock | ||
29 | * with century-range alarm matching, driven by the 32kHz clock. | 31 | * with century-range alarm matching, driven by the 32kHz clock. |
30 | * | 32 | * |
31 | * The main user-visible ways it differs from PC RTCs are by omitting | 33 | * The main user-visible ways it differs from PC RTCs are by omitting |
@@ -39,10 +41,6 @@ | |||
39 | * the SoC). See the BOARD-SPECIFIC CUSTOMIZATION comment. | 41 | * the SoC). See the BOARD-SPECIFIC CUSTOMIZATION comment. |
40 | */ | 42 | */ |
41 | 43 | ||
42 | #define DRIVER_NAME "omap_rtc" | ||
43 | |||
44 | #define OMAP_RTC_BASE 0xfffb4800 | ||
45 | |||
46 | /* RTC registers */ | 44 | /* RTC registers */ |
47 | #define OMAP_RTC_SECONDS_REG 0x00 | 45 | #define OMAP_RTC_SECONDS_REG 0x00 |
48 | #define OMAP_RTC_MINUTES_REG 0x04 | 46 | #define OMAP_RTC_MINUTES_REG 0x04 |
@@ -72,6 +70,15 @@ | |||
72 | 70 | ||
73 | #define OMAP_RTC_IRQWAKEEN 0x7c | 71 | #define OMAP_RTC_IRQWAKEEN 0x7c |
74 | 72 | ||
73 | #define OMAP_RTC_ALARM2_SECONDS_REG 0x80 | ||
74 | #define OMAP_RTC_ALARM2_MINUTES_REG 0x84 | ||
75 | #define OMAP_RTC_ALARM2_HOURS_REG 0x88 | ||
76 | #define OMAP_RTC_ALARM2_DAYS_REG 0x8c | ||
77 | #define OMAP_RTC_ALARM2_MONTHS_REG 0x90 | ||
78 | #define OMAP_RTC_ALARM2_YEARS_REG 0x94 | ||
79 | |||
80 | #define OMAP_RTC_PMIC_REG 0x98 | ||
81 | |||
75 | /* OMAP_RTC_CTRL_REG bit fields: */ | 82 | /* OMAP_RTC_CTRL_REG bit fields: */ |
76 | #define OMAP_RTC_CTRL_SPLIT BIT(7) | 83 | #define OMAP_RTC_CTRL_SPLIT BIT(7) |
77 | #define OMAP_RTC_CTRL_DISABLE BIT(6) | 84 | #define OMAP_RTC_CTRL_DISABLE BIT(6) |
@@ -84,6 +91,7 @@ | |||
84 | 91 | ||
85 | /* OMAP_RTC_STATUS_REG bit fields: */ | 92 | /* OMAP_RTC_STATUS_REG bit fields: */ |
86 | #define OMAP_RTC_STATUS_POWER_UP BIT(7) | 93 | #define OMAP_RTC_STATUS_POWER_UP BIT(7) |
94 | #define OMAP_RTC_STATUS_ALARM2 BIT(7) | ||
87 | #define OMAP_RTC_STATUS_ALARM BIT(6) | 95 | #define OMAP_RTC_STATUS_ALARM BIT(6) |
88 | #define OMAP_RTC_STATUS_1D_EVENT BIT(5) | 96 | #define OMAP_RTC_STATUS_1D_EVENT BIT(5) |
89 | #define OMAP_RTC_STATUS_1H_EVENT BIT(4) | 97 | #define OMAP_RTC_STATUS_1H_EVENT BIT(4) |
@@ -93,6 +101,7 @@ | |||
93 | #define OMAP_RTC_STATUS_BUSY BIT(0) | 101 | #define OMAP_RTC_STATUS_BUSY BIT(0) |
94 | 102 | ||
95 | /* OMAP_RTC_INTERRUPTS_REG bit fields: */ | 103 | /* OMAP_RTC_INTERRUPTS_REG bit fields: */ |
104 | #define OMAP_RTC_INTERRUPTS_IT_ALARM2 BIT(4) | ||
96 | #define OMAP_RTC_INTERRUPTS_IT_ALARM BIT(3) | 105 | #define OMAP_RTC_INTERRUPTS_IT_ALARM BIT(3) |
97 | #define OMAP_RTC_INTERRUPTS_IT_TIMER BIT(2) | 106 | #define OMAP_RTC_INTERRUPTS_IT_TIMER BIT(2) |
98 | 107 | ||
@@ -102,61 +111,82 @@ | |||
102 | /* OMAP_RTC_IRQWAKEEN bit fields: */ | 111 | /* OMAP_RTC_IRQWAKEEN bit fields: */ |
103 | #define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN BIT(1) | 112 | #define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN BIT(1) |
104 | 113 | ||
114 | /* OMAP_RTC_PMIC bit fields: */ | ||
115 | #define OMAP_RTC_PMIC_POWER_EN_EN BIT(16) | ||
116 | |||
105 | /* OMAP_RTC_KICKER values */ | 117 | /* OMAP_RTC_KICKER values */ |
106 | #define KICK0_VALUE 0x83e70b13 | 118 | #define KICK0_VALUE 0x83e70b13 |
107 | #define KICK1_VALUE 0x95a4f1e0 | 119 | #define KICK1_VALUE 0x95a4f1e0 |
108 | 120 | ||
109 | #define OMAP_RTC_HAS_KICKER BIT(0) | 121 | struct omap_rtc_device_type { |
110 | 122 | bool has_32kclk_en; | |
111 | /* | 123 | bool has_kicker; |
112 | * Few RTC IP revisions has special WAKE-EN Register to enable Wakeup | 124 | bool has_irqwakeen; |
113 | * generation for event Alarm. | 125 | bool has_pmic_mode; |
114 | */ | 126 | bool has_power_up_reset; |
115 | #define OMAP_RTC_HAS_IRQWAKEEN BIT(1) | 127 | }; |
116 | 128 | ||
117 | /* | 129 | struct omap_rtc { |
118 | * Some RTC IP revisions (like those in AM335x and DRA7x) need | 130 | struct rtc_device *rtc; |
119 | * the 32KHz clock to be explicitly enabled. | 131 | void __iomem *base; |
120 | */ | 132 | int irq_alarm; |
121 | #define OMAP_RTC_HAS_32KCLK_EN BIT(2) | 133 | int irq_timer; |
134 | u8 interrupts_reg; | ||
135 | bool is_pmic_controller; | ||
136 | const struct omap_rtc_device_type *type; | ||
137 | }; | ||
122 | 138 | ||
123 | static void __iomem *rtc_base; | 139 | static inline u8 rtc_read(struct omap_rtc *rtc, unsigned int reg) |
140 | { | ||
141 | return readb(rtc->base + reg); | ||
142 | } | ||
124 | 143 | ||
125 | #define rtc_read(addr) readb(rtc_base + (addr)) | 144 | static inline u32 rtc_readl(struct omap_rtc *rtc, unsigned int reg) |
126 | #define rtc_write(val, addr) writeb(val, rtc_base + (addr)) | 145 | { |
146 | return readl(rtc->base + reg); | ||
147 | } | ||
127 | 148 | ||
128 | #define rtc_writel(val, addr) writel(val, rtc_base + (addr)) | 149 | static inline void rtc_write(struct omap_rtc *rtc, unsigned int reg, u8 val) |
150 | { | ||
151 | writeb(val, rtc->base + reg); | ||
152 | } | ||
129 | 153 | ||
154 | static inline void rtc_writel(struct omap_rtc *rtc, unsigned int reg, u32 val) | ||
155 | { | ||
156 | writel(val, rtc->base + reg); | ||
157 | } | ||
130 | 158 | ||
131 | /* we rely on the rtc framework to handle locking (rtc->ops_lock), | 159 | /* |
160 | * We rely on the rtc framework to handle locking (rtc->ops_lock), | ||
132 | * so the only other requirement is that register accesses which | 161 | * so the only other requirement is that register accesses which |
133 | * require BUSY to be clear are made with IRQs locally disabled | 162 | * require BUSY to be clear are made with IRQs locally disabled |
134 | */ | 163 | */ |
135 | static void rtc_wait_not_busy(void) | 164 | static void rtc_wait_not_busy(struct omap_rtc *rtc) |
136 | { | 165 | { |
137 | int count = 0; | 166 | int count; |
138 | u8 status; | 167 | u8 status; |
139 | 168 | ||
140 | /* BUSY may stay active for 1/32768 second (~30 usec) */ | 169 | /* BUSY may stay active for 1/32768 second (~30 usec) */ |
141 | for (count = 0; count < 50; count++) { | 170 | for (count = 0; count < 50; count++) { |
142 | status = rtc_read(OMAP_RTC_STATUS_REG); | 171 | status = rtc_read(rtc, OMAP_RTC_STATUS_REG); |
143 | if ((status & (u8)OMAP_RTC_STATUS_BUSY) == 0) | 172 | if (!(status & OMAP_RTC_STATUS_BUSY)) |
144 | break; | 173 | break; |
145 | udelay(1); | 174 | udelay(1); |
146 | } | 175 | } |
147 | /* now we have ~15 usec to read/write various registers */ | 176 | /* now we have ~15 usec to read/write various registers */ |
148 | } | 177 | } |
149 | 178 | ||
150 | static irqreturn_t rtc_irq(int irq, void *rtc) | 179 | static irqreturn_t rtc_irq(int irq, void *dev_id) |
151 | { | 180 | { |
152 | unsigned long events = 0; | 181 | struct omap_rtc *rtc = dev_id; |
153 | u8 irq_data; | 182 | unsigned long events = 0; |
183 | u8 irq_data; | ||
154 | 184 | ||
155 | irq_data = rtc_read(OMAP_RTC_STATUS_REG); | 185 | irq_data = rtc_read(rtc, OMAP_RTC_STATUS_REG); |
156 | 186 | ||
157 | /* alarm irq? */ | 187 | /* alarm irq? */ |
158 | if (irq_data & OMAP_RTC_STATUS_ALARM) { | 188 | if (irq_data & OMAP_RTC_STATUS_ALARM) { |
159 | rtc_write(OMAP_RTC_STATUS_ALARM, OMAP_RTC_STATUS_REG); | 189 | rtc_write(rtc, OMAP_RTC_STATUS_REG, OMAP_RTC_STATUS_ALARM); |
160 | events |= RTC_IRQF | RTC_AF; | 190 | events |= RTC_IRQF | RTC_AF; |
161 | } | 191 | } |
162 | 192 | ||
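
The hunk above is the core of the rtc-omap rework: the file-scope rtc_base pointer and the OMAP_RTC_HAS_* bitmask are replaced by a per-instance struct omap_rtc holding the mapped registers, both IRQ numbers and a pointer to a struct omap_rtc_device_type whose booleans describe the IP revision. The register helpers and the interrupt handler now take that structure, which lets the later hunks drop the platform_get_device_id() lookups from the alarm paths. A minimal sketch of the resulting access pattern, assuming an already-probed device (illustrative only; the real code read-modify-writes the wake register):

	static u8 omap_rtc_example(struct device *dev)
	{
		struct omap_rtc *rtc = dev_get_drvdata(dev);
		u8 reg = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG);

		if (rtc->type->has_irqwakeen)	/* feature flag, not a bitmask test */
			rtc_write(rtc, OMAP_RTC_IRQWAKEEN,
				  OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN);
		return reg;
	}
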
@@ -164,23 +194,21 @@ static irqreturn_t rtc_irq(int irq, void *rtc) | |||
164 | if (irq_data & OMAP_RTC_STATUS_1S_EVENT) | 194 | if (irq_data & OMAP_RTC_STATUS_1S_EVENT) |
165 | events |= RTC_IRQF | RTC_UF; | 195 | events |= RTC_IRQF | RTC_UF; |
166 | 196 | ||
167 | rtc_update_irq(rtc, 1, events); | 197 | rtc_update_irq(rtc->rtc, 1, events); |
168 | 198 | ||
169 | return IRQ_HANDLED; | 199 | return IRQ_HANDLED; |
170 | } | 200 | } |
171 | 201 | ||
172 | static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) | 202 | static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) |
173 | { | 203 | { |
204 | struct omap_rtc *rtc = dev_get_drvdata(dev); | ||
174 | u8 reg, irqwake_reg = 0; | 205 | u8 reg, irqwake_reg = 0; |
175 | struct platform_device *pdev = to_platform_device(dev); | ||
176 | const struct platform_device_id *id_entry = | ||
177 | platform_get_device_id(pdev); | ||
178 | 206 | ||
179 | local_irq_disable(); | 207 | local_irq_disable(); |
180 | rtc_wait_not_busy(); | 208 | rtc_wait_not_busy(rtc); |
181 | reg = rtc_read(OMAP_RTC_INTERRUPTS_REG); | 209 | reg = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG); |
182 | if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) | 210 | if (rtc->type->has_irqwakeen) |
183 | irqwake_reg = rtc_read(OMAP_RTC_IRQWAKEEN); | 211 | irqwake_reg = rtc_read(rtc, OMAP_RTC_IRQWAKEEN); |
184 | 212 | ||
185 | if (enabled) { | 213 | if (enabled) { |
186 | reg |= OMAP_RTC_INTERRUPTS_IT_ALARM; | 214 | reg |= OMAP_RTC_INTERRUPTS_IT_ALARM; |
@@ -189,10 +217,10 @@ static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) | |||
189 | reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM; | 217 | reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM; |
190 | irqwake_reg &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; | 218 | irqwake_reg &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; |
191 | } | 219 | } |
192 | rtc_wait_not_busy(); | 220 | rtc_wait_not_busy(rtc); |
193 | rtc_write(reg, OMAP_RTC_INTERRUPTS_REG); | 221 | rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, reg); |
194 | if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) | 222 | if (rtc->type->has_irqwakeen) |
195 | rtc_write(irqwake_reg, OMAP_RTC_IRQWAKEEN); | 223 | rtc_write(rtc, OMAP_RTC_IRQWAKEEN, irqwake_reg); |
196 | local_irq_enable(); | 224 | local_irq_enable(); |
197 | 225 | ||
198 | return 0; | 226 | return 0; |
@@ -230,39 +258,47 @@ static void bcd2tm(struct rtc_time *tm) | |||
230 | tm->tm_year = bcd2bin(tm->tm_year) + 100; | 258 | tm->tm_year = bcd2bin(tm->tm_year) + 100; |
231 | } | 259 | } |
232 | 260 | ||
261 | static void omap_rtc_read_time_raw(struct omap_rtc *rtc, struct rtc_time *tm) | ||
262 | { | ||
263 | tm->tm_sec = rtc_read(rtc, OMAP_RTC_SECONDS_REG); | ||
264 | tm->tm_min = rtc_read(rtc, OMAP_RTC_MINUTES_REG); | ||
265 | tm->tm_hour = rtc_read(rtc, OMAP_RTC_HOURS_REG); | ||
266 | tm->tm_mday = rtc_read(rtc, OMAP_RTC_DAYS_REG); | ||
267 | tm->tm_mon = rtc_read(rtc, OMAP_RTC_MONTHS_REG); | ||
268 | tm->tm_year = rtc_read(rtc, OMAP_RTC_YEARS_REG); | ||
269 | } | ||
233 | 270 | ||
234 | static int omap_rtc_read_time(struct device *dev, struct rtc_time *tm) | 271 | static int omap_rtc_read_time(struct device *dev, struct rtc_time *tm) |
235 | { | 272 | { |
273 | struct omap_rtc *rtc = dev_get_drvdata(dev); | ||
274 | |||
236 | /* we don't report wday/yday/isdst ... */ | 275 | /* we don't report wday/yday/isdst ... */ |
237 | local_irq_disable(); | 276 | local_irq_disable(); |
238 | rtc_wait_not_busy(); | 277 | rtc_wait_not_busy(rtc); |
239 | 278 | omap_rtc_read_time_raw(rtc, tm); | |
240 | tm->tm_sec = rtc_read(OMAP_RTC_SECONDS_REG); | ||
241 | tm->tm_min = rtc_read(OMAP_RTC_MINUTES_REG); | ||
242 | tm->tm_hour = rtc_read(OMAP_RTC_HOURS_REG); | ||
243 | tm->tm_mday = rtc_read(OMAP_RTC_DAYS_REG); | ||
244 | tm->tm_mon = rtc_read(OMAP_RTC_MONTHS_REG); | ||
245 | tm->tm_year = rtc_read(OMAP_RTC_YEARS_REG); | ||
246 | |||
247 | local_irq_enable(); | 279 | local_irq_enable(); |
248 | 280 | ||
249 | bcd2tm(tm); | 281 | bcd2tm(tm); |
282 | |||
250 | return 0; | 283 | return 0; |
251 | } | 284 | } |
252 | 285 | ||
253 | static int omap_rtc_set_time(struct device *dev, struct rtc_time *tm) | 286 | static int omap_rtc_set_time(struct device *dev, struct rtc_time *tm) |
254 | { | 287 | { |
288 | struct omap_rtc *rtc = dev_get_drvdata(dev); | ||
289 | |||
255 | if (tm2bcd(tm) < 0) | 290 | if (tm2bcd(tm) < 0) |
256 | return -EINVAL; | 291 | return -EINVAL; |
292 | |||
257 | local_irq_disable(); | 293 | local_irq_disable(); |
258 | rtc_wait_not_busy(); | 294 | rtc_wait_not_busy(rtc); |
259 | 295 | ||
260 | rtc_write(tm->tm_year, OMAP_RTC_YEARS_REG); | 296 | rtc_write(rtc, OMAP_RTC_YEARS_REG, tm->tm_year); |
261 | rtc_write(tm->tm_mon, OMAP_RTC_MONTHS_REG); | 297 | rtc_write(rtc, OMAP_RTC_MONTHS_REG, tm->tm_mon); |
262 | rtc_write(tm->tm_mday, OMAP_RTC_DAYS_REG); | 298 | rtc_write(rtc, OMAP_RTC_DAYS_REG, tm->tm_mday); |
263 | rtc_write(tm->tm_hour, OMAP_RTC_HOURS_REG); | 299 | rtc_write(rtc, OMAP_RTC_HOURS_REG, tm->tm_hour); |
264 | rtc_write(tm->tm_min, OMAP_RTC_MINUTES_REG); | 300 | rtc_write(rtc, OMAP_RTC_MINUTES_REG, tm->tm_min); |
265 | rtc_write(tm->tm_sec, OMAP_RTC_SECONDS_REG); | 301 | rtc_write(rtc, OMAP_RTC_SECONDS_REG, tm->tm_sec); |
266 | 302 | ||
267 | local_irq_enable(); | 303 | local_irq_enable(); |
268 | 304 | ||
@@ -271,48 +307,50 @@ static int omap_rtc_set_time(struct device *dev, struct rtc_time *tm) | |||
271 | 307 | ||
272 | static int omap_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) | 308 | static int omap_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) |
273 | { | 309 | { |
310 | struct omap_rtc *rtc = dev_get_drvdata(dev); | ||
311 | u8 interrupts; | ||
312 | |||
274 | local_irq_disable(); | 313 | local_irq_disable(); |
275 | rtc_wait_not_busy(); | 314 | rtc_wait_not_busy(rtc); |
276 | 315 | ||
277 | alm->time.tm_sec = rtc_read(OMAP_RTC_ALARM_SECONDS_REG); | 316 | alm->time.tm_sec = rtc_read(rtc, OMAP_RTC_ALARM_SECONDS_REG); |
278 | alm->time.tm_min = rtc_read(OMAP_RTC_ALARM_MINUTES_REG); | 317 | alm->time.tm_min = rtc_read(rtc, OMAP_RTC_ALARM_MINUTES_REG); |
279 | alm->time.tm_hour = rtc_read(OMAP_RTC_ALARM_HOURS_REG); | 318 | alm->time.tm_hour = rtc_read(rtc, OMAP_RTC_ALARM_HOURS_REG); |
280 | alm->time.tm_mday = rtc_read(OMAP_RTC_ALARM_DAYS_REG); | 319 | alm->time.tm_mday = rtc_read(rtc, OMAP_RTC_ALARM_DAYS_REG); |
281 | alm->time.tm_mon = rtc_read(OMAP_RTC_ALARM_MONTHS_REG); | 320 | alm->time.tm_mon = rtc_read(rtc, OMAP_RTC_ALARM_MONTHS_REG); |
282 | alm->time.tm_year = rtc_read(OMAP_RTC_ALARM_YEARS_REG); | 321 | alm->time.tm_year = rtc_read(rtc, OMAP_RTC_ALARM_YEARS_REG); |
283 | 322 | ||
284 | local_irq_enable(); | 323 | local_irq_enable(); |
285 | 324 | ||
286 | bcd2tm(&alm->time); | 325 | bcd2tm(&alm->time); |
287 | alm->enabled = !!(rtc_read(OMAP_RTC_INTERRUPTS_REG) | 326 | |
288 | & OMAP_RTC_INTERRUPTS_IT_ALARM); | 327 | interrupts = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG); |
328 | alm->enabled = !!(interrupts & OMAP_RTC_INTERRUPTS_IT_ALARM); | ||
289 | 329 | ||
290 | return 0; | 330 | return 0; |
291 | } | 331 | } |
292 | 332 | ||
293 | static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) | 333 | static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) |
294 | { | 334 | { |
335 | struct omap_rtc *rtc = dev_get_drvdata(dev); | ||
295 | u8 reg, irqwake_reg = 0; | 336 | u8 reg, irqwake_reg = 0; |
296 | struct platform_device *pdev = to_platform_device(dev); | ||
297 | const struct platform_device_id *id_entry = | ||
298 | platform_get_device_id(pdev); | ||
299 | 337 | ||
300 | if (tm2bcd(&alm->time) < 0) | 338 | if (tm2bcd(&alm->time) < 0) |
301 | return -EINVAL; | 339 | return -EINVAL; |
302 | 340 | ||
303 | local_irq_disable(); | 341 | local_irq_disable(); |
304 | rtc_wait_not_busy(); | 342 | rtc_wait_not_busy(rtc); |
305 | 343 | ||
306 | rtc_write(alm->time.tm_year, OMAP_RTC_ALARM_YEARS_REG); | 344 | rtc_write(rtc, OMAP_RTC_ALARM_YEARS_REG, alm->time.tm_year); |
307 | rtc_write(alm->time.tm_mon, OMAP_RTC_ALARM_MONTHS_REG); | 345 | rtc_write(rtc, OMAP_RTC_ALARM_MONTHS_REG, alm->time.tm_mon); |
308 | rtc_write(alm->time.tm_mday, OMAP_RTC_ALARM_DAYS_REG); | 346 | rtc_write(rtc, OMAP_RTC_ALARM_DAYS_REG, alm->time.tm_mday); |
309 | rtc_write(alm->time.tm_hour, OMAP_RTC_ALARM_HOURS_REG); | 347 | rtc_write(rtc, OMAP_RTC_ALARM_HOURS_REG, alm->time.tm_hour); |
310 | rtc_write(alm->time.tm_min, OMAP_RTC_ALARM_MINUTES_REG); | 348 | rtc_write(rtc, OMAP_RTC_ALARM_MINUTES_REG, alm->time.tm_min); |
311 | rtc_write(alm->time.tm_sec, OMAP_RTC_ALARM_SECONDS_REG); | 349 | rtc_write(rtc, OMAP_RTC_ALARM_SECONDS_REG, alm->time.tm_sec); |
312 | 350 | ||
313 | reg = rtc_read(OMAP_RTC_INTERRUPTS_REG); | 351 | reg = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG); |
314 | if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) | 352 | if (rtc->type->has_irqwakeen) |
315 | irqwake_reg = rtc_read(OMAP_RTC_IRQWAKEEN); | 353 | irqwake_reg = rtc_read(rtc, OMAP_RTC_IRQWAKEEN); |
316 | 354 | ||
317 | if (alm->enabled) { | 355 | if (alm->enabled) { |
318 | reg |= OMAP_RTC_INTERRUPTS_IT_ALARM; | 356 | reg |= OMAP_RTC_INTERRUPTS_IT_ALARM; |
@@ -321,15 +359,79 @@ static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) | |||
321 | reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM; | 359 | reg &= ~OMAP_RTC_INTERRUPTS_IT_ALARM; |
322 | irqwake_reg &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; | 360 | irqwake_reg &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; |
323 | } | 361 | } |
324 | rtc_write(reg, OMAP_RTC_INTERRUPTS_REG); | 362 | rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, reg); |
325 | if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) | 363 | if (rtc->type->has_irqwakeen) |
326 | rtc_write(irqwake_reg, OMAP_RTC_IRQWAKEEN); | 364 | rtc_write(rtc, OMAP_RTC_IRQWAKEEN, irqwake_reg); |
327 | 365 | ||
328 | local_irq_enable(); | 366 | local_irq_enable(); |
329 | 367 | ||
330 | return 0; | 368 | return 0; |
331 | } | 369 | } |
332 | 370 | ||
371 | static struct omap_rtc *omap_rtc_power_off_rtc; | ||
372 | |||
373 | /* | ||
374 | * omap_rtc_poweroff: RTC-controlled power off | ||
375 | * | ||
376 | * The RTC can be used to control an external PMIC via the pmic_power_en pin, | ||
377 | * which can be configured to transition to OFF on ALARM2 events. | ||
378 | * | ||
379 | * Notes: | ||
380 | * The two-second alarm offset is the shortest offset possible as the alarm | ||
381 | * registers must be set before the next timer update and the offset | ||
382 | * calculation is too heavy for everything to be done within a single access | ||
383 | * period (~15 us). | ||
384 | * | ||
385 | * Called with local interrupts disabled. | ||
386 | */ | ||
387 | static void omap_rtc_power_off(void) | ||
388 | { | ||
389 | struct omap_rtc *rtc = omap_rtc_power_off_rtc; | ||
390 | struct rtc_time tm; | ||
391 | unsigned long now; | ||
392 | u32 val; | ||
393 | |||
394 | /* enable pmic_power_en control */ | ||
395 | val = rtc_readl(rtc, OMAP_RTC_PMIC_REG); | ||
396 | rtc_writel(rtc, OMAP_RTC_PMIC_REG, val | OMAP_RTC_PMIC_POWER_EN_EN); | ||
397 | |||
398 | /* set alarm two seconds from now */ | ||
399 | omap_rtc_read_time_raw(rtc, &tm); | ||
400 | bcd2tm(&tm); | ||
401 | rtc_tm_to_time(&tm, &now); | ||
402 | rtc_time_to_tm(now + 2, &tm); | ||
403 | |||
404 | if (tm2bcd(&tm) < 0) { | ||
405 | dev_err(&rtc->rtc->dev, "power off failed\n"); | ||
406 | return; | ||
407 | } | ||
408 | |||
409 | rtc_wait_not_busy(rtc); | ||
410 | |||
411 | rtc_write(rtc, OMAP_RTC_ALARM2_SECONDS_REG, tm.tm_sec); | ||
412 | rtc_write(rtc, OMAP_RTC_ALARM2_MINUTES_REG, tm.tm_min); | ||
413 | rtc_write(rtc, OMAP_RTC_ALARM2_HOURS_REG, tm.tm_hour); | ||
414 | rtc_write(rtc, OMAP_RTC_ALARM2_DAYS_REG, tm.tm_mday); | ||
415 | rtc_write(rtc, OMAP_RTC_ALARM2_MONTHS_REG, tm.tm_mon); | ||
416 | rtc_write(rtc, OMAP_RTC_ALARM2_YEARS_REG, tm.tm_year); | ||
417 | |||
418 | /* | ||
419 | * enable ALARM2 interrupt | ||
420 | * | ||
421 | * NOTE: this fails on AM3352 if rtc_write (writeb) is used | ||
422 | */ | ||
423 | val = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG); | ||
424 | rtc_writel(rtc, OMAP_RTC_INTERRUPTS_REG, | ||
425 | val | OMAP_RTC_INTERRUPTS_IT_ALARM2); | ||
426 | |||
427 | /* | ||
428 | * Wait for alarm to trigger (within two seconds) and external PMIC to | ||
429 | * power off the system. Add a 500 ms margin for external latencies | ||
430 | * (e.g. debounce circuits). | ||
431 | */ | ||
432 | mdelay(2500); | ||
433 | } | ||
434 | |||
333 | static struct rtc_class_ops omap_rtc_ops = { | 435 | static struct rtc_class_ops omap_rtc_ops = { |
334 | .read_time = omap_rtc_read_time, | 436 | .read_time = omap_rtc_read_time, |
335 | .set_time = omap_rtc_set_time, | 437 | .set_time = omap_rtc_set_time, |
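
omap_rtc_power_off() above implements RTC-controlled power off: it latches pmic_power_en, programs ALARM2 for two seconds in the future (the shortest safe offset per the comment), enables the ALARM2 interrupt with a 32-bit write (the AM3352 quirk noted in the code), and then busy-waits 2.5 s for the external PMIC to cut power. The probe hunk further down only installs it when the device-tree node carries the "system-power-controller" property and no other pm_power_off handler is registered. The alarm arithmetic itself is plain epoch math on the current BCD time; shown in isolation, error checking omitted:

	struct rtc_time tm;
	unsigned long now;

	omap_rtc_read_time_raw(rtc, &tm);	/* raw BCD registers */
	bcd2tm(&tm);				/* -> binary struct rtc_time */
	rtc_tm_to_time(&tm, &now);		/* -> seconds since the epoch */
	rtc_time_to_tm(now + 2, &tm);		/* two seconds ahead */
	tm2bcd(&tm);				/* back to BCD for the ALARM2_* registers */
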
@@ -338,137 +440,140 @@ static struct rtc_class_ops omap_rtc_ops = { | |||
338 | .alarm_irq_enable = omap_rtc_alarm_irq_enable, | 440 | .alarm_irq_enable = omap_rtc_alarm_irq_enable, |
339 | }; | 441 | }; |
340 | 442 | ||
341 | static int omap_rtc_alarm; | 443 | static const struct omap_rtc_device_type omap_rtc_default_type = { |
342 | static int omap_rtc_timer; | 444 | .has_power_up_reset = true, |
445 | }; | ||
343 | 446 | ||
344 | #define OMAP_RTC_DATA_AM3352_IDX 1 | 447 | static const struct omap_rtc_device_type omap_rtc_am3352_type = { |
345 | #define OMAP_RTC_DATA_DA830_IDX 2 | 448 | .has_32kclk_en = true, |
449 | .has_kicker = true, | ||
450 | .has_irqwakeen = true, | ||
451 | .has_pmic_mode = true, | ||
452 | }; | ||
346 | 453 | ||
347 | static struct platform_device_id omap_rtc_devtype[] = { | 454 | static const struct omap_rtc_device_type omap_rtc_da830_type = { |
455 | .has_kicker = true, | ||
456 | }; | ||
457 | |||
458 | static const struct platform_device_id omap_rtc_id_table[] = { | ||
348 | { | 459 | { |
349 | .name = DRIVER_NAME, | 460 | .name = "omap_rtc", |
350 | }, | 461 | .driver_data = (kernel_ulong_t)&omap_rtc_default_type, |
351 | [OMAP_RTC_DATA_AM3352_IDX] = { | 462 | }, { |
352 | .name = "am3352-rtc", | 463 | .name = "am3352-rtc", |
353 | .driver_data = OMAP_RTC_HAS_KICKER | OMAP_RTC_HAS_IRQWAKEEN | | 464 | .driver_data = (kernel_ulong_t)&omap_rtc_am3352_type, |
354 | OMAP_RTC_HAS_32KCLK_EN, | 465 | }, { |
355 | }, | ||
356 | [OMAP_RTC_DATA_DA830_IDX] = { | ||
357 | .name = "da830-rtc", | 466 | .name = "da830-rtc", |
358 | .driver_data = OMAP_RTC_HAS_KICKER, | 467 | .driver_data = (kernel_ulong_t)&omap_rtc_da830_type, |
359 | }, | 468 | }, { |
360 | {}, | 469 | /* sentinel */ |
470 | } | ||
361 | }; | 471 | }; |
362 | MODULE_DEVICE_TABLE(platform, omap_rtc_devtype); | 472 | MODULE_DEVICE_TABLE(platform, omap_rtc_id_table); |
363 | 473 | ||
364 | static const struct of_device_id omap_rtc_of_match[] = { | 474 | static const struct of_device_id omap_rtc_of_match[] = { |
365 | { .compatible = "ti,da830-rtc", | 475 | { |
366 | .data = &omap_rtc_devtype[OMAP_RTC_DATA_DA830_IDX], | 476 | .compatible = "ti,am3352-rtc", |
367 | }, | 477 | .data = &omap_rtc_am3352_type, |
368 | { .compatible = "ti,am3352-rtc", | 478 | }, { |
369 | .data = &omap_rtc_devtype[OMAP_RTC_DATA_AM3352_IDX], | 479 | .compatible = "ti,da830-rtc", |
370 | }, | 480 | .data = &omap_rtc_da830_type, |
371 | {}, | 481 | }, { |
482 | /* sentinel */ | ||
483 | } | ||
372 | }; | 484 | }; |
373 | MODULE_DEVICE_TABLE(of, omap_rtc_of_match); | 485 | MODULE_DEVICE_TABLE(of, omap_rtc_of_match); |
374 | 486 | ||
375 | static int __init omap_rtc_probe(struct platform_device *pdev) | 487 | static int __init omap_rtc_probe(struct platform_device *pdev) |
376 | { | 488 | { |
377 | struct resource *res; | 489 | struct omap_rtc *rtc; |
378 | struct rtc_device *rtc; | 490 | struct resource *res; |
379 | u8 reg, new_ctrl; | 491 | u8 reg, mask, new_ctrl; |
380 | const struct platform_device_id *id_entry; | 492 | const struct platform_device_id *id_entry; |
381 | const struct of_device_id *of_id; | 493 | const struct of_device_id *of_id; |
494 | int ret; | ||
382 | 495 | ||
383 | of_id = of_match_device(omap_rtc_of_match, &pdev->dev); | 496 | rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); |
384 | if (of_id) | 497 | if (!rtc) |
385 | pdev->id_entry = of_id->data; | 498 | return -ENOMEM; |
386 | 499 | ||
387 | id_entry = platform_get_device_id(pdev); | 500 | of_id = of_match_device(omap_rtc_of_match, &pdev->dev); |
388 | if (!id_entry) { | 501 | if (of_id) { |
389 | dev_err(&pdev->dev, "no matching device entry\n"); | 502 | rtc->type = of_id->data; |
390 | return -ENODEV; | 503 | rtc->is_pmic_controller = rtc->type->has_pmic_mode && |
504 | of_property_read_bool(pdev->dev.of_node, | ||
505 | "system-power-controller"); | ||
506 | } else { | ||
507 | id_entry = platform_get_device_id(pdev); | ||
508 | rtc->type = (void *)id_entry->driver_data; | ||
391 | } | 509 | } |
392 | 510 | ||
393 | omap_rtc_timer = platform_get_irq(pdev, 0); | 511 | rtc->irq_timer = platform_get_irq(pdev, 0); |
394 | if (omap_rtc_timer <= 0) { | 512 | if (rtc->irq_timer <= 0) |
395 | pr_debug("%s: no update irq?\n", pdev->name); | ||
396 | return -ENOENT; | 513 | return -ENOENT; |
397 | } | ||
398 | 514 | ||
399 | omap_rtc_alarm = platform_get_irq(pdev, 1); | 515 | rtc->irq_alarm = platform_get_irq(pdev, 1); |
400 | if (omap_rtc_alarm <= 0) { | 516 | if (rtc->irq_alarm <= 0) |
401 | pr_debug("%s: no alarm irq?\n", pdev->name); | ||
402 | return -ENOENT; | 517 | return -ENOENT; |
403 | } | ||
404 | 518 | ||
405 | res = platform_get_resource(pdev, IORESOURCE_MEM, 0); | 519 | res = platform_get_resource(pdev, IORESOURCE_MEM, 0); |
406 | rtc_base = devm_ioremap_resource(&pdev->dev, res); | 520 | rtc->base = devm_ioremap_resource(&pdev->dev, res); |
407 | if (IS_ERR(rtc_base)) | 521 | if (IS_ERR(rtc->base)) |
408 | return PTR_ERR(rtc_base); | 522 | return PTR_ERR(rtc->base); |
523 | |||
524 | platform_set_drvdata(pdev, rtc); | ||
409 | 525 | ||
410 | /* Enable the clock/module so that we can access the registers */ | 526 | /* Enable the clock/module so that we can access the registers */ |
411 | pm_runtime_enable(&pdev->dev); | 527 | pm_runtime_enable(&pdev->dev); |
412 | pm_runtime_get_sync(&pdev->dev); | 528 | pm_runtime_get_sync(&pdev->dev); |
413 | 529 | ||
414 | if (id_entry->driver_data & OMAP_RTC_HAS_KICKER) { | 530 | if (rtc->type->has_kicker) { |
415 | rtc_writel(KICK0_VALUE, OMAP_RTC_KICK0_REG); | 531 | rtc_writel(rtc, OMAP_RTC_KICK0_REG, KICK0_VALUE); |
416 | rtc_writel(KICK1_VALUE, OMAP_RTC_KICK1_REG); | 532 | rtc_writel(rtc, OMAP_RTC_KICK1_REG, KICK1_VALUE); |
417 | } | ||
418 | |||
419 | rtc = devm_rtc_device_register(&pdev->dev, pdev->name, | ||
420 | &omap_rtc_ops, THIS_MODULE); | ||
421 | if (IS_ERR(rtc)) { | ||
422 | pr_debug("%s: can't register RTC device, err %ld\n", | ||
423 | pdev->name, PTR_ERR(rtc)); | ||
424 | goto fail0; | ||
425 | } | 533 | } |
426 | platform_set_drvdata(pdev, rtc); | ||
427 | 534 | ||
428 | /* clear pending irqs, and set 1/second periodic, | 535 | /* |
429 | * which we'll use instead of update irqs | 536 | * disable interrupts |
537 | * | ||
538 | * NOTE: ALARM2 is not cleared on AM3352 if rtc_write (writeb) is used | ||
430 | */ | 539 | */ |
431 | rtc_write(0, OMAP_RTC_INTERRUPTS_REG); | 540 | rtc_writel(rtc, OMAP_RTC_INTERRUPTS_REG, 0); |
432 | 541 | ||
433 | /* enable RTC functional clock */ | 542 | /* enable RTC functional clock */ |
434 | if (id_entry->driver_data & OMAP_RTC_HAS_32KCLK_EN) | 543 | if (rtc->type->has_32kclk_en) { |
435 | rtc_writel(OMAP_RTC_OSC_32KCLK_EN, OMAP_RTC_OSC_REG); | 544 | reg = rtc_read(rtc, OMAP_RTC_OSC_REG); |
545 | rtc_writel(rtc, OMAP_RTC_OSC_REG, | ||
546 | reg | OMAP_RTC_OSC_32KCLK_EN); | ||
547 | } | ||
436 | 548 | ||
437 | /* clear old status */ | 549 | /* clear old status */ |
438 | reg = rtc_read(OMAP_RTC_STATUS_REG); | 550 | reg = rtc_read(rtc, OMAP_RTC_STATUS_REG); |
439 | if (reg & (u8) OMAP_RTC_STATUS_POWER_UP) { | ||
440 | pr_info("%s: RTC power up reset detected\n", | ||
441 | pdev->name); | ||
442 | rtc_write(OMAP_RTC_STATUS_POWER_UP, OMAP_RTC_STATUS_REG); | ||
443 | } | ||
444 | if (reg & (u8) OMAP_RTC_STATUS_ALARM) | ||
445 | rtc_write(OMAP_RTC_STATUS_ALARM, OMAP_RTC_STATUS_REG); | ||
446 | 551 | ||
447 | /* handle periodic and alarm irqs */ | 552 | mask = OMAP_RTC_STATUS_ALARM; |
448 | if (devm_request_irq(&pdev->dev, omap_rtc_timer, rtc_irq, 0, | 553 | |
449 | dev_name(&rtc->dev), rtc)) { | 554 | if (rtc->type->has_pmic_mode) |
450 | pr_debug("%s: RTC timer interrupt IRQ%d already claimed\n", | 555 | mask |= OMAP_RTC_STATUS_ALARM2; |
451 | pdev->name, omap_rtc_timer); | 556 | |
452 | goto fail0; | 557 | if (rtc->type->has_power_up_reset) { |
453 | } | 558 | mask |= OMAP_RTC_STATUS_POWER_UP; |
454 | if ((omap_rtc_timer != omap_rtc_alarm) && | 559 | if (reg & OMAP_RTC_STATUS_POWER_UP) |
455 | (devm_request_irq(&pdev->dev, omap_rtc_alarm, rtc_irq, 0, | 560 | dev_info(&pdev->dev, "RTC power up reset detected\n"); |
456 | dev_name(&rtc->dev), rtc))) { | ||
457 | pr_debug("%s: RTC alarm interrupt IRQ%d already claimed\n", | ||
458 | pdev->name, omap_rtc_alarm); | ||
459 | goto fail0; | ||
460 | } | 561 | } |
461 | 562 | ||
563 | if (reg & mask) | ||
564 | rtc_write(rtc, OMAP_RTC_STATUS_REG, reg & mask); | ||
565 | |||
462 | /* On boards with split power, RTC_ON_NOFF won't reset the RTC */ | 566 | /* On boards with split power, RTC_ON_NOFF won't reset the RTC */ |
463 | reg = rtc_read(OMAP_RTC_CTRL_REG); | 567 | reg = rtc_read(rtc, OMAP_RTC_CTRL_REG); |
464 | if (reg & (u8) OMAP_RTC_CTRL_STOP) | 568 | if (reg & OMAP_RTC_CTRL_STOP) |
465 | pr_info("%s: already running\n", pdev->name); | 569 | dev_info(&pdev->dev, "already running\n"); |
466 | 570 | ||
467 | /* force to 24 hour mode */ | 571 | /* force to 24 hour mode */ |
468 | new_ctrl = reg & (OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP); | 572 | new_ctrl = reg & (OMAP_RTC_CTRL_SPLIT | OMAP_RTC_CTRL_AUTO_COMP); |
469 | new_ctrl |= OMAP_RTC_CTRL_STOP; | 573 | new_ctrl |= OMAP_RTC_CTRL_STOP; |
470 | 574 | ||
471 | /* BOARD-SPECIFIC CUSTOMIZATION CAN GO HERE: | 575 | /* |
576 | * BOARD-SPECIFIC CUSTOMIZATION CAN GO HERE: | ||
472 | * | 577 | * |
473 | * - Device wake-up capability setting should come through chip | 578 | * - Device wake-up capability setting should come through chip |
474 | * init logic. OMAP1 boards should initialize the "wakeup capable" | 579 | * init logic. OMAP1 boards should initialize the "wakeup capable" |
@@ -482,36 +587,70 @@ static int __init omap_rtc_probe(struct platform_device *pdev) | |||
482 | * is write-only, and always reads as zero...) | 587 | * is write-only, and always reads as zero...) |
483 | */ | 588 | */ |
484 | 589 | ||
590 | if (new_ctrl & OMAP_RTC_CTRL_SPLIT) | ||
591 | dev_info(&pdev->dev, "split power mode\n"); | ||
592 | |||
593 | if (reg != new_ctrl) | ||
594 | rtc_write(rtc, OMAP_RTC_CTRL_REG, new_ctrl); | ||
595 | |||
485 | device_init_wakeup(&pdev->dev, true); | 596 | device_init_wakeup(&pdev->dev, true); |
486 | 597 | ||
487 | if (new_ctrl & (u8) OMAP_RTC_CTRL_SPLIT) | 598 | rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, |
488 | pr_info("%s: split power mode\n", pdev->name); | 599 | &omap_rtc_ops, THIS_MODULE); |
600 | if (IS_ERR(rtc->rtc)) { | ||
601 | ret = PTR_ERR(rtc->rtc); | ||
602 | goto err; | ||
603 | } | ||
489 | 604 | ||
490 | if (reg != new_ctrl) | 605 | /* handle periodic and alarm irqs */ |
491 | rtc_write(new_ctrl, OMAP_RTC_CTRL_REG); | 606 | ret = devm_request_irq(&pdev->dev, rtc->irq_timer, rtc_irq, 0, |
607 | dev_name(&rtc->rtc->dev), rtc); | ||
608 | if (ret) | ||
609 | goto err; | ||
610 | |||
611 | if (rtc->irq_timer != rtc->irq_alarm) { | ||
612 | ret = devm_request_irq(&pdev->dev, rtc->irq_alarm, rtc_irq, 0, | ||
613 | dev_name(&rtc->rtc->dev), rtc); | ||
614 | if (ret) | ||
615 | goto err; | ||
616 | } | ||
617 | |||
618 | if (rtc->is_pmic_controller) { | ||
619 | if (!pm_power_off) { | ||
620 | omap_rtc_power_off_rtc = rtc; | ||
621 | pm_power_off = omap_rtc_power_off; | ||
622 | } | ||
623 | } | ||
492 | 624 | ||
493 | return 0; | 625 | return 0; |
494 | 626 | ||
495 | fail0: | 627 | err: |
496 | if (id_entry->driver_data & OMAP_RTC_HAS_KICKER) | 628 | device_init_wakeup(&pdev->dev, false); |
497 | rtc_writel(0, OMAP_RTC_KICK0_REG); | 629 | if (rtc->type->has_kicker) |
630 | rtc_writel(rtc, OMAP_RTC_KICK0_REG, 0); | ||
498 | pm_runtime_put_sync(&pdev->dev); | 631 | pm_runtime_put_sync(&pdev->dev); |
499 | pm_runtime_disable(&pdev->dev); | 632 | pm_runtime_disable(&pdev->dev); |
500 | return -EIO; | 633 | |
634 | return ret; | ||
501 | } | 635 | } |
502 | 636 | ||
503 | static int __exit omap_rtc_remove(struct platform_device *pdev) | 637 | static int __exit omap_rtc_remove(struct platform_device *pdev) |
504 | { | 638 | { |
505 | const struct platform_device_id *id_entry = | 639 | struct omap_rtc *rtc = platform_get_drvdata(pdev); |
506 | platform_get_device_id(pdev); | 640 | |
641 | if (pm_power_off == omap_rtc_power_off && | ||
642 | omap_rtc_power_off_rtc == rtc) { | ||
643 | pm_power_off = NULL; | ||
644 | omap_rtc_power_off_rtc = NULL; | ||
645 | } | ||
507 | 646 | ||
508 | device_init_wakeup(&pdev->dev, 0); | 647 | device_init_wakeup(&pdev->dev, 0); |
509 | 648 | ||
510 | /* leave rtc running, but disable irqs */ | 649 | /* leave rtc running, but disable irqs */ |
511 | rtc_write(0, OMAP_RTC_INTERRUPTS_REG); | 650 | rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, 0); |
512 | 651 | ||
513 | if (id_entry->driver_data & OMAP_RTC_HAS_KICKER) | 652 | if (rtc->type->has_kicker) |
514 | rtc_writel(0, OMAP_RTC_KICK0_REG); | 653 | rtc_writel(rtc, OMAP_RTC_KICK0_REG, 0); |
515 | 654 | ||
516 | /* Disable the clock/module */ | 655 | /* Disable the clock/module */ |
517 | pm_runtime_put_sync(&pdev->dev); | 656 | pm_runtime_put_sync(&pdev->dev); |
@@ -521,20 +660,21 @@ static int __exit omap_rtc_remove(struct platform_device *pdev) | |||
521 | } | 660 | } |
522 | 661 | ||
523 | #ifdef CONFIG_PM_SLEEP | 662 | #ifdef CONFIG_PM_SLEEP |
524 | static u8 irqstat; | ||
525 | |||
526 | static int omap_rtc_suspend(struct device *dev) | 663 | static int omap_rtc_suspend(struct device *dev) |
527 | { | 664 | { |
528 | irqstat = rtc_read(OMAP_RTC_INTERRUPTS_REG); | 665 | struct omap_rtc *rtc = dev_get_drvdata(dev); |
529 | 666 | ||
530 | /* FIXME the RTC alarm is not currently acting as a wakeup event | 667 | rtc->interrupts_reg = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG); |
668 | |||
669 | /* | ||
670 | * FIXME: the RTC alarm is not currently acting as a wakeup event | ||
531 | * source on some platforms, and in fact this enable() call is just | 671 | * source on some platforms, and in fact this enable() call is just |
532 | * saving a flag that's never used... | 672 | * saving a flag that's never used... |
533 | */ | 673 | */ |
534 | if (device_may_wakeup(dev)) | 674 | if (device_may_wakeup(dev)) |
535 | enable_irq_wake(omap_rtc_alarm); | 675 | enable_irq_wake(rtc->irq_alarm); |
536 | else | 676 | else |
537 | rtc_write(0, OMAP_RTC_INTERRUPTS_REG); | 677 | rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, 0); |
538 | 678 | ||
539 | /* Disable the clock/module */ | 679 | /* Disable the clock/module */ |
540 | pm_runtime_put_sync(dev); | 680 | pm_runtime_put_sync(dev); |
@@ -544,13 +684,15 @@ static int omap_rtc_suspend(struct device *dev) | |||
544 | 684 | ||
545 | static int omap_rtc_resume(struct device *dev) | 685 | static int omap_rtc_resume(struct device *dev) |
546 | { | 686 | { |
687 | struct omap_rtc *rtc = dev_get_drvdata(dev); | ||
688 | |||
547 | /* Enable the clock/module so that we can access the registers */ | 689 | /* Enable the clock/module so that we can access the registers */ |
548 | pm_runtime_get_sync(dev); | 690 | pm_runtime_get_sync(dev); |
549 | 691 | ||
550 | if (device_may_wakeup(dev)) | 692 | if (device_may_wakeup(dev)) |
551 | disable_irq_wake(omap_rtc_alarm); | 693 | disable_irq_wake(rtc->irq_alarm); |
552 | else | 694 | else |
553 | rtc_write(irqstat, OMAP_RTC_INTERRUPTS_REG); | 695 | rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, rtc->interrupts_reg); |
554 | 696 | ||
555 | return 0; | 697 | return 0; |
556 | } | 698 | } |
@@ -560,23 +702,32 @@ static SIMPLE_DEV_PM_OPS(omap_rtc_pm_ops, omap_rtc_suspend, omap_rtc_resume); | |||
560 | 702 | ||
561 | static void omap_rtc_shutdown(struct platform_device *pdev) | 703 | static void omap_rtc_shutdown(struct platform_device *pdev) |
562 | { | 704 | { |
563 | rtc_write(0, OMAP_RTC_INTERRUPTS_REG); | 705 | struct omap_rtc *rtc = platform_get_drvdata(pdev); |
706 | u8 mask; | ||
707 | |||
708 | /* | ||
709 | * Keep the ALARM interrupt enabled to allow the system to power up on | ||
710 | * alarm events. | ||
711 | */ | ||
712 | mask = rtc_read(rtc, OMAP_RTC_INTERRUPTS_REG); | ||
713 | mask &= OMAP_RTC_INTERRUPTS_IT_ALARM; | ||
714 | rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, mask); | ||
564 | } | 715 | } |
565 | 716 | ||
566 | MODULE_ALIAS("platform:omap_rtc"); | ||
567 | static struct platform_driver omap_rtc_driver = { | 717 | static struct platform_driver omap_rtc_driver = { |
568 | .remove = __exit_p(omap_rtc_remove), | 718 | .remove = __exit_p(omap_rtc_remove), |
569 | .shutdown = omap_rtc_shutdown, | 719 | .shutdown = omap_rtc_shutdown, |
570 | .driver = { | 720 | .driver = { |
571 | .name = DRIVER_NAME, | 721 | .name = "omap_rtc", |
572 | .owner = THIS_MODULE, | 722 | .owner = THIS_MODULE, |
573 | .pm = &omap_rtc_pm_ops, | 723 | .pm = &omap_rtc_pm_ops, |
574 | .of_match_table = omap_rtc_of_match, | 724 | .of_match_table = omap_rtc_of_match, |
575 | }, | 725 | }, |
576 | .id_table = omap_rtc_devtype, | 726 | .id_table = omap_rtc_id_table, |
577 | }; | 727 | }; |
578 | 728 | ||
579 | module_platform_driver_probe(omap_rtc_driver, omap_rtc_probe); | 729 | module_platform_driver_probe(omap_rtc_driver, omap_rtc_probe); |
580 | 730 | ||
731 | MODULE_ALIAS("platform:omap_rtc"); | ||
581 | MODULE_AUTHOR("George G. Davis (and others)"); | 732 | MODULE_AUTHOR("George G. Davis (and others)"); |
582 | MODULE_LICENSE("GPL"); | 733 | MODULE_LICENSE("GPL"); |
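
The probe and remove hunks above install the driver's power-off handler in the global pm_power_off hook only when no other handler is registered, and tear it down only if the hook still points at this driver. A minimal sketch of that hand-off pattern, with made-up names (struct my_rtc, my_rtc_power_off) standing in for the driver's own symbols:

    #include <linux/pm.h>

    struct my_rtc;                                  /* hypothetical driver state */

    static struct my_rtc *power_off_rtc;

    static void my_rtc_power_off(void)
    {
            /* program the RTC/PMIC alarm logic to cut board power here */
    }

    static void my_rtc_claim_power_off(struct my_rtc *rtc)
    {
            /* claim the hook only if nobody else owns it yet */
            if (!pm_power_off) {
                    power_off_rtc = rtc;
                    pm_power_off = my_rtc_power_off;
            }
    }

    static void my_rtc_release_power_off(struct my_rtc *rtc)
    {
            /* release it only if we are still the registered handler */
            if (pm_power_off == my_rtc_power_off && power_off_rtc == rtc) {
                    pm_power_off = NULL;
                    power_off_rtc = NULL;
            }
    }

Guarding both directions keeps two power-off providers from clobbering each other across probe/remove cycles.
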
diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c index c2ef0a22ee94..96fb32e7d6f8 100644 --- a/drivers/rtc/rtc-pcf8563.c +++ b/drivers/rtc/rtc-pcf8563.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #define PCF8563_REG_ST2 0x01 | 28 | #define PCF8563_REG_ST2 0x01 |
29 | #define PCF8563_BIT_AIE (1 << 1) | 29 | #define PCF8563_BIT_AIE (1 << 1) |
30 | #define PCF8563_BIT_AF (1 << 3) | 30 | #define PCF8563_BIT_AF (1 << 3) |
31 | #define PCF8563_BITS_ST2_N (7 << 5) | ||
31 | 32 | ||
32 | #define PCF8563_REG_SC 0x02 /* datetime */ | 33 | #define PCF8563_REG_SC 0x02 /* datetime */ |
33 | #define PCF8563_REG_MN 0x03 | 34 | #define PCF8563_REG_MN 0x03 |
@@ -41,6 +42,13 @@ | |||
41 | 42 | ||
42 | #define PCF8563_REG_CLKO 0x0D /* clock out */ | 43 | #define PCF8563_REG_CLKO 0x0D /* clock out */ |
43 | #define PCF8563_REG_TMRC 0x0E /* timer control */ | 44 | #define PCF8563_REG_TMRC 0x0E /* timer control */ |
45 | #define PCF8563_TMRC_ENABLE BIT(7) | ||
46 | #define PCF8563_TMRC_4096 0 | ||
47 | #define PCF8563_TMRC_64 1 | ||
48 | #define PCF8563_TMRC_1 2 | ||
49 | #define PCF8563_TMRC_1_60 3 | ||
50 | #define PCF8563_TMRC_MASK 3 | ||
51 | |||
44 | #define PCF8563_REG_TMR 0x0F /* timer */ | 52 | #define PCF8563_REG_TMR 0x0F /* timer */ |
45 | 53 | ||
46 | #define PCF8563_SC_LV 0x80 /* low voltage */ | 54 | #define PCF8563_SC_LV 0x80 /* low voltage */ |
@@ -118,22 +126,21 @@ static int pcf8563_write_block_data(struct i2c_client *client, | |||
118 | 126 | ||
119 | static int pcf8563_set_alarm_mode(struct i2c_client *client, bool on) | 127 | static int pcf8563_set_alarm_mode(struct i2c_client *client, bool on) |
120 | { | 128 | { |
121 | unsigned char buf[2]; | 129 | unsigned char buf; |
122 | int err; | 130 | int err; |
123 | 131 | ||
124 | err = pcf8563_read_block_data(client, PCF8563_REG_ST2, 1, buf + 1); | 132 | err = pcf8563_read_block_data(client, PCF8563_REG_ST2, 1, &buf); |
125 | if (err < 0) | 133 | if (err < 0) |
126 | return err; | 134 | return err; |
127 | 135 | ||
128 | if (on) | 136 | if (on) |
129 | buf[1] |= PCF8563_BIT_AIE; | 137 | buf |= PCF8563_BIT_AIE; |
130 | else | 138 | else |
131 | buf[1] &= ~PCF8563_BIT_AIE; | 139 | buf &= ~PCF8563_BIT_AIE; |
132 | 140 | ||
133 | buf[1] &= ~PCF8563_BIT_AF; | 141 | buf &= ~(PCF8563_BIT_AF | PCF8563_BITS_ST2_N); |
134 | buf[0] = PCF8563_REG_ST2; | ||
135 | 142 | ||
136 | err = pcf8563_write_block_data(client, PCF8563_REG_ST2, 1, buf + 1); | 143 | err = pcf8563_write_block_data(client, PCF8563_REG_ST2, 1, &buf); |
137 | if (err < 0) { | 144 | if (err < 0) { |
138 | dev_err(&client->dev, "%s: write error\n", __func__); | 145 | dev_err(&client->dev, "%s: write error\n", __func__); |
139 | return -EIO; | 146 | return -EIO; |
@@ -336,8 +343,8 @@ static int pcf8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *tm) | |||
336 | __func__, buf[0], buf[1], buf[2], buf[3]); | 343 | __func__, buf[0], buf[1], buf[2], buf[3]); |
337 | 344 | ||
338 | tm->time.tm_min = bcd2bin(buf[0] & 0x7F); | 345 | tm->time.tm_min = bcd2bin(buf[0] & 0x7F); |
339 | tm->time.tm_hour = bcd2bin(buf[1] & 0x7F); | 346 | tm->time.tm_hour = bcd2bin(buf[1] & 0x3F); |
340 | tm->time.tm_mday = bcd2bin(buf[2] & 0x1F); | 347 | tm->time.tm_mday = bcd2bin(buf[2] & 0x3F); |
341 | tm->time.tm_wday = bcd2bin(buf[3] & 0x7); | 348 | tm->time.tm_wday = bcd2bin(buf[3] & 0x7); |
342 | tm->time.tm_mon = -1; | 349 | tm->time.tm_mon = -1; |
343 | tm->time.tm_year = -1; | 350 | tm->time.tm_year = -1; |
@@ -361,6 +368,14 @@ static int pcf8563_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *tm) | |||
361 | struct i2c_client *client = to_i2c_client(dev); | 368 | struct i2c_client *client = to_i2c_client(dev); |
362 | unsigned char buf[4]; | 369 | unsigned char buf[4]; |
363 | int err; | 370 | int err; |
371 | unsigned long alarm_time; | ||
372 | |||
373 | /* The alarm has no seconds, round up to nearest minute */ | ||
374 | if (tm->time.tm_sec) { | ||
375 | rtc_tm_to_time(&tm->time, &alarm_time); | ||
376 | alarm_time += 60-tm->time.tm_sec; | ||
377 | rtc_time_to_tm(alarm_time, &tm->time); | ||
378 | } | ||
364 | 379 | ||
365 | dev_dbg(dev, "%s, min=%d hour=%d wday=%d mday=%d " | 380 | dev_dbg(dev, "%s, min=%d hour=%d wday=%d mday=%d " |
366 | "enabled=%d pending=%d\n", __func__, | 381 | "enabled=%d pending=%d\n", __func__, |
@@ -381,6 +396,7 @@ static int pcf8563_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *tm) | |||
381 | 396 | ||
382 | static int pcf8563_irq_enable(struct device *dev, unsigned int enabled) | 397 | static int pcf8563_irq_enable(struct device *dev, unsigned int enabled) |
383 | { | 398 | { |
399 | dev_dbg(dev, "%s: en=%d\n", __func__, enabled); | ||
384 | return pcf8563_set_alarm_mode(to_i2c_client(dev), !!enabled); | 400 | return pcf8563_set_alarm_mode(to_i2c_client(dev), !!enabled); |
385 | } | 401 | } |
386 | 402 | ||
@@ -398,6 +414,8 @@ static int pcf8563_probe(struct i2c_client *client, | |||
398 | { | 414 | { |
399 | struct pcf8563 *pcf8563; | 415 | struct pcf8563 *pcf8563; |
400 | int err; | 416 | int err; |
417 | unsigned char buf; | ||
418 | unsigned char alm_pending; | ||
401 | 419 | ||
402 | dev_dbg(&client->dev, "%s\n", __func__); | 420 | dev_dbg(&client->dev, "%s\n", __func__); |
403 | 421 | ||
@@ -415,6 +433,22 @@ static int pcf8563_probe(struct i2c_client *client, | |||
415 | pcf8563->client = client; | 433 | pcf8563->client = client; |
416 | device_set_wakeup_capable(&client->dev, 1); | 434 | device_set_wakeup_capable(&client->dev, 1); |
417 | 435 | ||
436 | /* Set timer to lowest frequency to save power (ref Haoyu datasheet) */ | ||
437 | buf = PCF8563_TMRC_1_60; | ||
438 | err = pcf8563_write_block_data(client, PCF8563_REG_TMRC, 1, &buf); | ||
439 | if (err < 0) { | ||
440 | dev_err(&client->dev, "%s: write error\n", __func__); | ||
441 | return err; | ||
442 | } | ||
443 | |||
444 | err = pcf8563_get_alarm_mode(client, NULL, &alm_pending); | ||
445 | if (err < 0) { | ||
446 | dev_err(&client->dev, "%s: read error\n", __func__); | ||
447 | return err; | ||
448 | } | ||
449 | if (alm_pending) | ||
450 | pcf8563_set_alarm_mode(client, 0); | ||
451 | |||
418 | pcf8563->rtc = devm_rtc_device_register(&client->dev, | 452 | pcf8563->rtc = devm_rtc_device_register(&client->dev, |
419 | pcf8563_driver.driver.name, | 453 | pcf8563_driver.driver.name, |
420 | &pcf8563_rtc_ops, THIS_MODULE); | 454 | &pcf8563_rtc_ops, THIS_MODULE); |
@@ -435,6 +469,9 @@ static int pcf8563_probe(struct i2c_client *client, | |||
435 | 469 | ||
436 | } | 470 | } |
437 | 471 | ||
472 | /* the pcf8563 alarm only supports a minute accuracy */ | ||
473 | pcf8563->rtc->uie_unsupported = 1; | ||
474 | |||
438 | return 0; | 475 | return 0; |
439 | } | 476 | } |
440 | 477 | ||
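
The pcf8563_rtc_set_alarm() hunk above rounds an alarm that carries a non-zero seconds value up to the next full minute, since the chip's alarm registers have no seconds field (which is also why uie_unsupported is set). A short sketch of that rounding, assuming only the generic struct rtc_time conversion helpers:

    #include <linux/rtc.h>

    static void round_alarm_up_to_minute(struct rtc_time *tm)
    {
            unsigned long t;

            if (!tm->tm_sec)
                    return;                 /* already on a minute boundary */

            rtc_tm_to_time(tm, &t);         /* struct rtc_time -> seconds since epoch */
            t += 60 - tm->tm_sec;           /* advance to the next minute */
            rtc_time_to_tm(t, tm);          /* back to broken-down time, tm_sec == 0 */
    }
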
diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c index 76e38007ba90..d2ac6688e5c7 100644 --- a/drivers/rtc/rtc-sirfsoc.c +++ b/drivers/rtc/rtc-sirfsoc.c | |||
@@ -47,6 +47,7 @@ struct sirfsoc_rtc_drv { | |||
47 | unsigned irq_wake; | 47 | unsigned irq_wake; |
48 | /* Overflow for every 8 years extra time */ | 48 | /* Overflow for every 8 years extra time */ |
49 | u32 overflow_rtc; | 49 | u32 overflow_rtc; |
50 | spinlock_t lock; | ||
50 | #ifdef CONFIG_PM | 51 | #ifdef CONFIG_PM |
51 | u32 saved_counter; | 52 | u32 saved_counter; |
52 | u32 saved_overflow_rtc; | 53 | u32 saved_overflow_rtc; |
@@ -61,7 +62,7 @@ static int sirfsoc_rtc_read_alarm(struct device *dev, | |||
61 | 62 | ||
62 | rtcdrv = dev_get_drvdata(dev); | 63 | rtcdrv = dev_get_drvdata(dev); |
63 | 64 | ||
64 | local_irq_disable(); | 65 | spin_lock_irq(&rtcdrv->lock); |
65 | 66 | ||
66 | rtc_count = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN); | 67 | rtc_count = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN); |
67 | 68 | ||
@@ -84,7 +85,8 @@ static int sirfsoc_rtc_read_alarm(struct device *dev, | |||
84 | if (sirfsoc_rtc_iobrg_readl( | 85 | if (sirfsoc_rtc_iobrg_readl( |
85 | rtcdrv->rtc_base + RTC_STATUS) & SIRFSOC_RTC_AL0E) | 86 | rtcdrv->rtc_base + RTC_STATUS) & SIRFSOC_RTC_AL0E) |
86 | alrm->enabled = 1; | 87 | alrm->enabled = 1; |
87 | local_irq_enable(); | 88 | |
89 | spin_unlock_irq(&rtcdrv->lock); | ||
88 | 90 | ||
89 | return 0; | 91 | return 0; |
90 | } | 92 | } |
@@ -99,7 +101,7 @@ static int sirfsoc_rtc_set_alarm(struct device *dev, | |||
99 | if (alrm->enabled) { | 101 | if (alrm->enabled) { |
100 | rtc_tm_to_time(&(alrm->time), &rtc_alarm); | 102 | rtc_tm_to_time(&(alrm->time), &rtc_alarm); |
101 | 103 | ||
102 | local_irq_disable(); | 104 | spin_lock_irq(&rtcdrv->lock); |
103 | 105 | ||
104 | rtc_status_reg = sirfsoc_rtc_iobrg_readl( | 106 | rtc_status_reg = sirfsoc_rtc_iobrg_readl( |
105 | rtcdrv->rtc_base + RTC_STATUS); | 107 | rtcdrv->rtc_base + RTC_STATUS); |
@@ -123,14 +125,15 @@ static int sirfsoc_rtc_set_alarm(struct device *dev, | |||
123 | rtc_status_reg |= SIRFSOC_RTC_AL0E; | 125 | rtc_status_reg |= SIRFSOC_RTC_AL0E; |
124 | sirfsoc_rtc_iobrg_writel( | 126 | sirfsoc_rtc_iobrg_writel( |
125 | rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS); | 127 | rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS); |
126 | local_irq_enable(); | 128 | |
129 | spin_unlock_irq(&rtcdrv->lock); | ||
127 | } else { | 130 | } else { |
128 | /* | 131 | /* |
129 | * if this function was called with enabled=0 | 132 | * if this function was called with enabled=0 |
130 | * then it could mean that the application is | 133 | * then it could mean that the application is |
131 | * trying to cancel an ongoing alarm | 134 | * trying to cancel an ongoing alarm |
132 | */ | 135 | */ |
133 | local_irq_disable(); | 136 | spin_lock_irq(&rtcdrv->lock); |
134 | 137 | ||
135 | rtc_status_reg = sirfsoc_rtc_iobrg_readl( | 138 | rtc_status_reg = sirfsoc_rtc_iobrg_readl( |
136 | rtcdrv->rtc_base + RTC_STATUS); | 139 | rtcdrv->rtc_base + RTC_STATUS); |
@@ -146,7 +149,7 @@ static int sirfsoc_rtc_set_alarm(struct device *dev, | |||
146 | rtcdrv->rtc_base + RTC_STATUS); | 149 | rtcdrv->rtc_base + RTC_STATUS); |
147 | } | 150 | } |
148 | 151 | ||
149 | local_irq_enable(); | 152 | spin_unlock_irq(&rtcdrv->lock); |
150 | } | 153 | } |
151 | 154 | ||
152 | return 0; | 155 | return 0; |
@@ -209,12 +212,38 @@ static int sirfsoc_rtc_ioctl(struct device *dev, unsigned int cmd, | |||
209 | } | 212 | } |
210 | } | 213 | } |
211 | 214 | ||
215 | static int sirfsoc_rtc_alarm_irq_enable(struct device *dev, | ||
216 | unsigned int enabled) | ||
217 | { | ||
218 | unsigned long rtc_status_reg = 0x0; | ||
219 | struct sirfsoc_rtc_drv *rtcdrv; | ||
220 | |||
221 | rtcdrv = dev_get_drvdata(dev); | ||
222 | |||
223 | spin_lock_irq(&rtcdrv->lock); | ||
224 | |||
225 | rtc_status_reg = sirfsoc_rtc_iobrg_readl( | ||
226 | rtcdrv->rtc_base + RTC_STATUS); | ||
227 | if (enabled) | ||
228 | rtc_status_reg |= SIRFSOC_RTC_AL0E; | ||
229 | else | ||
230 | rtc_status_reg &= ~SIRFSOC_RTC_AL0E; | ||
231 | |||
232 | sirfsoc_rtc_iobrg_writel(rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS); | ||
233 | |||
234 | spin_unlock_irq(&rtcdrv->lock); | ||
235 | |||
236 | return 0; | ||
237 | |||
238 | } | ||
239 | |||
212 | static const struct rtc_class_ops sirfsoc_rtc_ops = { | 240 | static const struct rtc_class_ops sirfsoc_rtc_ops = { |
213 | .read_time = sirfsoc_rtc_read_time, | 241 | .read_time = sirfsoc_rtc_read_time, |
214 | .set_time = sirfsoc_rtc_set_time, | 242 | .set_time = sirfsoc_rtc_set_time, |
215 | .read_alarm = sirfsoc_rtc_read_alarm, | 243 | .read_alarm = sirfsoc_rtc_read_alarm, |
216 | .set_alarm = sirfsoc_rtc_set_alarm, | 244 | .set_alarm = sirfsoc_rtc_set_alarm, |
217 | .ioctl = sirfsoc_rtc_ioctl | 245 | .ioctl = sirfsoc_rtc_ioctl, |
246 | .alarm_irq_enable = sirfsoc_rtc_alarm_irq_enable | ||
218 | }; | 247 | }; |
219 | 248 | ||
220 | static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata) | 249 | static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata) |
@@ -223,6 +252,8 @@ static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata) | |||
223 | unsigned long rtc_status_reg = 0x0; | 252 | unsigned long rtc_status_reg = 0x0; |
224 | unsigned long events = 0x0; | 253 | unsigned long events = 0x0; |
225 | 254 | ||
255 | spin_lock(&rtcdrv->lock); | ||
256 | |||
226 | rtc_status_reg = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_STATUS); | 257 | rtc_status_reg = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_STATUS); |
227 | /* this bit will be set ONLY if an alarm was active | 258 | /* this bit will be set ONLY if an alarm was active |
228 | * and it expired NOW | 259 | * and it expired NOW |
@@ -240,6 +271,9 @@ static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata) | |||
240 | rtc_status_reg &= ~(SIRFSOC_RTC_AL0E); | 271 | rtc_status_reg &= ~(SIRFSOC_RTC_AL0E); |
241 | } | 272 | } |
242 | sirfsoc_rtc_iobrg_writel(rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS); | 273 | sirfsoc_rtc_iobrg_writel(rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS); |
274 | |||
275 | spin_unlock(&rtcdrv->lock); | ||
276 | |||
243 | /* this should wake up any apps polling/waiting on the read | 277 | /* this should wake up any apps polling/waiting on the read |
244 | * after setting the alarm | 278 | * after setting the alarm |
245 | */ | 279 | */ |
@@ -267,6 +301,8 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev) | |||
267 | if (rtcdrv == NULL) | 301 | if (rtcdrv == NULL) |
268 | return -ENOMEM; | 302 | return -ENOMEM; |
269 | 303 | ||
304 | spin_lock_init(&rtcdrv->lock); | ||
305 | |||
270 | err = of_property_read_u32(np, "reg", &rtcdrv->rtc_base); | 306 | err = of_property_read_u32(np, "reg", &rtcdrv->rtc_base); |
271 | if (err) { | 307 | if (err) { |
272 | dev_err(&pdev->dev, "unable to find base address of rtc node in dtb\n"); | 308 | dev_err(&pdev->dev, "unable to find base address of rtc node in dtb\n"); |
@@ -286,14 +322,6 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev) | |||
286 | rtc_div = ((32768 / RTC_HZ) / 2) - 1; | 322 | rtc_div = ((32768 / RTC_HZ) / 2) - 1; |
287 | sirfsoc_rtc_iobrg_writel(rtc_div, rtcdrv->rtc_base + RTC_DIV); | 323 | sirfsoc_rtc_iobrg_writel(rtc_div, rtcdrv->rtc_base + RTC_DIV); |
288 | 324 | ||
289 | rtcdrv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, | ||
290 | &sirfsoc_rtc_ops, THIS_MODULE); | ||
291 | if (IS_ERR(rtcdrv->rtc)) { | ||
292 | err = PTR_ERR(rtcdrv->rtc); | ||
293 | dev_err(&pdev->dev, "can't register RTC device\n"); | ||
294 | return err; | ||
295 | } | ||
296 | |||
297 | /* 0x3 -> RTC_CLK */ | 325 | /* 0x3 -> RTC_CLK */ |
298 | sirfsoc_rtc_iobrg_writel(SIRFSOC_RTC_CLK, | 326 | sirfsoc_rtc_iobrg_writel(SIRFSOC_RTC_CLK, |
299 | rtcdrv->rtc_base + RTC_CLOCK_SWITCH); | 327 | rtcdrv->rtc_base + RTC_CLOCK_SWITCH); |
@@ -308,6 +336,14 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev) | |||
308 | rtcdrv->overflow_rtc = | 336 | rtcdrv->overflow_rtc = |
309 | sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_SW_VALUE); | 337 | sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_SW_VALUE); |
310 | 338 | ||
339 | rtcdrv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, | ||
340 | &sirfsoc_rtc_ops, THIS_MODULE); | ||
341 | if (IS_ERR(rtcdrv->rtc)) { | ||
342 | err = PTR_ERR(rtcdrv->rtc); | ||
343 | dev_err(&pdev->dev, "can't register RTC device\n"); | ||
344 | return err; | ||
345 | } | ||
346 | |||
311 | rtcdrv->irq = platform_get_irq(pdev, 0); | 347 | rtcdrv->irq = platform_get_irq(pdev, 0); |
312 | err = devm_request_irq( | 348 | err = devm_request_irq( |
313 | &pdev->dev, | 349 | &pdev->dev, |
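
The rtc-sirfsoc.c hunks above swap local_irq_disable()/local_irq_enable() for a per-device spinlock, so the read-modify-write of the status register is serialized against the interrupt handler as well as against other CPUs. A sketch of the pattern with a made-up device type and plain readl()/writel() standing in for the iobrg accessors:

    #include <linux/interrupt.h>
    #include <linux/io.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct my_rtc_dev {
            spinlock_t lock;
            void __iomem *status;           /* hypothetical status register */
    };

    static void my_rtc_update_status(struct my_rtc_dev *dev, u32 set, u32 clear)
    {
            u32 reg;

            spin_lock_irq(&dev->lock);      /* process context: also blocks our IRQ */
            reg = readl(dev->status);
            reg = (reg & ~clear) | set;
            writel(reg, dev->status);
            spin_unlock_irq(&dev->lock);
    }

    static irqreturn_t my_rtc_irq(int irq, void *data)
    {
            struct my_rtc_dev *dev = data;

            spin_lock(&dev->lock);          /* already in hard-IRQ context */
            /* ... acknowledge the alarm bit under the same lock ... */
            spin_unlock(&dev->lock);

            return IRQ_HANDLED;
    }
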
diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c index fa384fe28988..2cd8ffe5c698 100644 --- a/drivers/rtc/rtc-snvs.c +++ b/drivers/rtc/rtc-snvs.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/of_device.h> | 17 | #include <linux/of_device.h> |
18 | #include <linux/platform_device.h> | 18 | #include <linux/platform_device.h> |
19 | #include <linux/rtc.h> | 19 | #include <linux/rtc.h> |
20 | #include <linux/clk.h> | ||
20 | 21 | ||
21 | /* These register offsets are relative to LP (Low Power) range */ | 22 | /* These register offsets are relative to LP (Low Power) range */ |
22 | #define SNVS_LPCR 0x04 | 23 | #define SNVS_LPCR 0x04 |
@@ -39,6 +40,7 @@ struct snvs_rtc_data { | |||
39 | void __iomem *ioaddr; | 40 | void __iomem *ioaddr; |
40 | int irq; | 41 | int irq; |
41 | spinlock_t lock; | 42 | spinlock_t lock; |
43 | struct clk *clk; | ||
42 | }; | 44 | }; |
43 | 45 | ||
44 | static u32 rtc_read_lp_counter(void __iomem *ioaddr) | 46 | static u32 rtc_read_lp_counter(void __iomem *ioaddr) |
@@ -260,6 +262,18 @@ static int snvs_rtc_probe(struct platform_device *pdev) | |||
260 | if (data->irq < 0) | 262 | if (data->irq < 0) |
261 | return data->irq; | 263 | return data->irq; |
262 | 264 | ||
265 | data->clk = devm_clk_get(&pdev->dev, "snvs-rtc"); | ||
266 | if (IS_ERR(data->clk)) { | ||
267 | data->clk = NULL; | ||
268 | } else { | ||
269 | ret = clk_prepare_enable(data->clk); | ||
270 | if (ret) { | ||
271 | dev_err(&pdev->dev, | ||
272 | "Could not prepare or enable the snvs clock\n"); | ||
273 | return ret; | ||
274 | } | ||
275 | } | ||
276 | |||
263 | platform_set_drvdata(pdev, data); | 277 | platform_set_drvdata(pdev, data); |
264 | 278 | ||
265 | spin_lock_init(&data->lock); | 279 | spin_lock_init(&data->lock); |
@@ -280,7 +294,7 @@ static int snvs_rtc_probe(struct platform_device *pdev) | |||
280 | if (ret) { | 294 | if (ret) { |
281 | dev_err(&pdev->dev, "failed to request irq %d: %d\n", | 295 | dev_err(&pdev->dev, "failed to request irq %d: %d\n", |
282 | data->irq, ret); | 296 | data->irq, ret); |
283 | return ret; | 297 | goto error_rtc_device_register; |
284 | } | 298 | } |
285 | 299 | ||
286 | data->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, | 300 | data->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, |
@@ -288,10 +302,16 @@ static int snvs_rtc_probe(struct platform_device *pdev) | |||
288 | if (IS_ERR(data->rtc)) { | 302 | if (IS_ERR(data->rtc)) { |
289 | ret = PTR_ERR(data->rtc); | 303 | ret = PTR_ERR(data->rtc); |
290 | dev_err(&pdev->dev, "failed to register rtc: %d\n", ret); | 304 | dev_err(&pdev->dev, "failed to register rtc: %d\n", ret); |
291 | return ret; | 305 | goto error_rtc_device_register; |
292 | } | 306 | } |
293 | 307 | ||
294 | return 0; | 308 | return 0; |
309 | |||
310 | error_rtc_device_register: | ||
311 | if (data->clk) | ||
312 | clk_disable_unprepare(data->clk); | ||
313 | |||
314 | return ret; | ||
295 | } | 315 | } |
296 | 316 | ||
297 | #ifdef CONFIG_PM_SLEEP | 317 | #ifdef CONFIG_PM_SLEEP |
@@ -302,21 +322,34 @@ static int snvs_rtc_suspend(struct device *dev) | |||
302 | if (device_may_wakeup(dev)) | 322 | if (device_may_wakeup(dev)) |
303 | enable_irq_wake(data->irq); | 323 | enable_irq_wake(data->irq); |
304 | 324 | ||
325 | if (data->clk) | ||
326 | clk_disable_unprepare(data->clk); | ||
327 | |||
305 | return 0; | 328 | return 0; |
306 | } | 329 | } |
307 | 330 | ||
308 | static int snvs_rtc_resume(struct device *dev) | 331 | static int snvs_rtc_resume(struct device *dev) |
309 | { | 332 | { |
310 | struct snvs_rtc_data *data = dev_get_drvdata(dev); | 333 | struct snvs_rtc_data *data = dev_get_drvdata(dev); |
334 | int ret; | ||
311 | 335 | ||
312 | if (device_may_wakeup(dev)) | 336 | if (device_may_wakeup(dev)) |
313 | disable_irq_wake(data->irq); | 337 | disable_irq_wake(data->irq); |
314 | 338 | ||
339 | if (data->clk) { | ||
340 | ret = clk_prepare_enable(data->clk); | ||
341 | if (ret) | ||
342 | return ret; | ||
343 | } | ||
344 | |||
315 | return 0; | 345 | return 0; |
316 | } | 346 | } |
317 | #endif | 347 | #endif |
318 | 348 | ||
319 | static SIMPLE_DEV_PM_OPS(snvs_rtc_pm_ops, snvs_rtc_suspend, snvs_rtc_resume); | 349 | static const struct dev_pm_ops snvs_rtc_pm_ops = { |
350 | .suspend_noirq = snvs_rtc_suspend, | ||
351 | .resume_noirq = snvs_rtc_resume, | ||
352 | }; | ||
320 | 353 | ||
321 | static const struct of_device_id snvs_dt_ids[] = { | 354 | static const struct of_device_id snvs_dt_ids[] = { |
322 | { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, | 355 | { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, |
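
The snvs_rtc_probe() hunk above treats the "snvs-rtc" clock as optional: a failed devm_clk_get() just leaves the pointer NULL and every later clk_prepare_enable()/clk_disable_unprepare() call is guarded, while a clock that exists but will not enable is a hard error. A sketch of that shape as a helper (the helper itself is invented; only the calls are standard clk API):

    #include <linux/clk.h>
    #include <linux/device.h>
    #include <linux/err.h>

    static struct clk *get_optional_clock(struct device *dev, const char *name)
    {
            struct clk *clk = devm_clk_get(dev, name);
            int ret;

            if (IS_ERR(clk))
                    return NULL;            /* clock absent: run without it */

            ret = clk_prepare_enable(clk);
            if (ret) {
                    dev_err(dev, "could not enable %s clock\n", name);
                    return ERR_PTR(ret);    /* a present clock must enable */
            }

            return clk;
    }

A probe would then do data->clk = get_optional_clock(&pdev->dev, "snvs-rtc") and bail out only on IS_ERR(), mirroring the hunk above.
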
diff --git a/drivers/usb/storage/debug.c b/drivers/usb/storage/debug.c index 66a684a29938..2d81e1d8ee30 100644 --- a/drivers/usb/storage/debug.c +++ b/drivers/usb/storage/debug.c | |||
@@ -188,7 +188,7 @@ int usb_stor_dbg(const struct us_data *us, const char *fmt, ...) | |||
188 | 188 | ||
189 | va_start(args, fmt); | 189 | va_start(args, fmt); |
190 | 190 | ||
191 | r = dev_vprintk_emit(7, &us->pusb_dev->dev, fmt, args); | 191 | r = dev_vprintk_emit(LOGLEVEL_DEBUG, &us->pusb_dev->dev, fmt, args); |
192 | 192 | ||
193 | va_end(args); | 193 | va_end(args); |
194 | 194 | ||
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d8fc0605b9d2..3a6175fe10c0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
@@ -1994,18 +1994,6 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, | |||
1994 | shdr4extnum->sh_info = segs; | 1994 | shdr4extnum->sh_info = segs; |
1995 | } | 1995 | } |
1996 | 1996 | ||
1997 | static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma, | ||
1998 | unsigned long mm_flags) | ||
1999 | { | ||
2000 | struct vm_area_struct *vma; | ||
2001 | size_t size = 0; | ||
2002 | |||
2003 | for (vma = first_vma(current, gate_vma); vma != NULL; | ||
2004 | vma = next_vma(vma, gate_vma)) | ||
2005 | size += vma_dump_size(vma, mm_flags); | ||
2006 | return size; | ||
2007 | } | ||
2008 | |||
2009 | /* | 1997 | /* |
2010 | * Actual dumper | 1998 | * Actual dumper |
2011 | * | 1999 | * |
@@ -2017,7 +2005,8 @@ static int elf_core_dump(struct coredump_params *cprm) | |||
2017 | { | 2005 | { |
2018 | int has_dumped = 0; | 2006 | int has_dumped = 0; |
2019 | mm_segment_t fs; | 2007 | mm_segment_t fs; |
2020 | int segs; | 2008 | int segs, i; |
2009 | size_t vma_data_size = 0; | ||
2021 | struct vm_area_struct *vma, *gate_vma; | 2010 | struct vm_area_struct *vma, *gate_vma; |
2022 | struct elfhdr *elf = NULL; | 2011 | struct elfhdr *elf = NULL; |
2023 | loff_t offset = 0, dataoff; | 2012 | loff_t offset = 0, dataoff; |
@@ -2026,6 +2015,7 @@ static int elf_core_dump(struct coredump_params *cprm) | |||
2026 | struct elf_shdr *shdr4extnum = NULL; | 2015 | struct elf_shdr *shdr4extnum = NULL; |
2027 | Elf_Half e_phnum; | 2016 | Elf_Half e_phnum; |
2028 | elf_addr_t e_shoff; | 2017 | elf_addr_t e_shoff; |
2018 | elf_addr_t *vma_filesz = NULL; | ||
2029 | 2019 | ||
2030 | /* | 2020 | /* |
2031 | * We no longer stop all VM operations. | 2021 | * We no longer stop all VM operations. |
@@ -2093,7 +2083,20 @@ static int elf_core_dump(struct coredump_params *cprm) | |||
2093 | 2083 | ||
2094 | dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); | 2084 | dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); |
2095 | 2085 | ||
2096 | offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags); | 2086 | vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); |
2087 | if (!vma_filesz) | ||
2088 | goto end_coredump; | ||
2089 | |||
2090 | for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; | ||
2091 | vma = next_vma(vma, gate_vma)) { | ||
2092 | unsigned long dump_size; | ||
2093 | |||
2094 | dump_size = vma_dump_size(vma, cprm->mm_flags); | ||
2095 | vma_filesz[i++] = dump_size; | ||
2096 | vma_data_size += dump_size; | ||
2097 | } | ||
2098 | |||
2099 | offset += vma_data_size; | ||
2097 | offset += elf_core_extra_data_size(); | 2100 | offset += elf_core_extra_data_size(); |
2098 | e_shoff = offset; | 2101 | e_shoff = offset; |
2099 | 2102 | ||
@@ -2113,7 +2116,7 @@ static int elf_core_dump(struct coredump_params *cprm) | |||
2113 | goto end_coredump; | 2116 | goto end_coredump; |
2114 | 2117 | ||
2115 | /* Write program headers for segments dump */ | 2118 | /* Write program headers for segments dump */ |
2116 | for (vma = first_vma(current, gate_vma); vma != NULL; | 2119 | for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; |
2117 | vma = next_vma(vma, gate_vma)) { | 2120 | vma = next_vma(vma, gate_vma)) { |
2118 | struct elf_phdr phdr; | 2121 | struct elf_phdr phdr; |
2119 | 2122 | ||
@@ -2121,7 +2124,7 @@ static int elf_core_dump(struct coredump_params *cprm) | |||
2121 | phdr.p_offset = offset; | 2124 | phdr.p_offset = offset; |
2122 | phdr.p_vaddr = vma->vm_start; | 2125 | phdr.p_vaddr = vma->vm_start; |
2123 | phdr.p_paddr = 0; | 2126 | phdr.p_paddr = 0; |
2124 | phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); | 2127 | phdr.p_filesz = vma_filesz[i++]; |
2125 | phdr.p_memsz = vma->vm_end - vma->vm_start; | 2128 | phdr.p_memsz = vma->vm_end - vma->vm_start; |
2126 | offset += phdr.p_filesz; | 2129 | offset += phdr.p_filesz; |
2127 | phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; | 2130 | phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; |
@@ -2149,12 +2152,12 @@ static int elf_core_dump(struct coredump_params *cprm) | |||
2149 | if (!dump_skip(cprm, dataoff - cprm->written)) | 2152 | if (!dump_skip(cprm, dataoff - cprm->written)) |
2150 | goto end_coredump; | 2153 | goto end_coredump; |
2151 | 2154 | ||
2152 | for (vma = first_vma(current, gate_vma); vma != NULL; | 2155 | for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; |
2153 | vma = next_vma(vma, gate_vma)) { | 2156 | vma = next_vma(vma, gate_vma)) { |
2154 | unsigned long addr; | 2157 | unsigned long addr; |
2155 | unsigned long end; | 2158 | unsigned long end; |
2156 | 2159 | ||
2157 | end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags); | 2160 | end = vma->vm_start + vma_filesz[i++]; |
2158 | 2161 | ||
2159 | for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { | 2162 | for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { |
2160 | struct page *page; | 2163 | struct page *page; |
@@ -2187,6 +2190,7 @@ end_coredump: | |||
2187 | cleanup: | 2190 | cleanup: |
2188 | free_note_info(&info); | 2191 | free_note_info(&info); |
2189 | kfree(shdr4extnum); | 2192 | kfree(shdr4extnum); |
2193 | kfree(vma_filesz); | ||
2190 | kfree(phdr4note); | 2194 | kfree(phdr4note); |
2191 | kfree(elf); | 2195 | kfree(elf); |
2192 | out: | 2196 | out: |
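
The elf_core_dump() changes above compute each VMA's dump size exactly once into vma_filesz[] and feed the cached values to both the program-header pass (p_filesz) and the data-writing pass, so the two can never disagree and vma_dump_size() is not evaluated twice per mapping. An illustrative sketch of that bookkeeping; vma_count and get_vma_dump_size() are placeholders, not kernel APIs:

    #include <linux/slab.h>
    #include <linux/types.h>

    extern int vma_count;                           /* assumed: number of mappings */
    extern unsigned long get_vma_dump_size(int i);  /* assumed: size of mapping i */

    static unsigned long *precompute_dump_sizes(size_t *total)
    {
            unsigned long *sizes;
            int i;

            sizes = kmalloc_array(vma_count, sizeof(*sizes), GFP_KERNEL);
            if (!sizes)
                    return NULL;

            *total = 0;
            for (i = 0; i < vma_count; i++) {
                    sizes[i] = get_vma_dump_size(i);        /* computed once */
                    *total += sizes[i];                     /* drives the file offsets */
            }
            return sizes;   /* both the header pass and the data pass index this */
    }
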
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index fd8beb9657a2..70789e198dea 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c | |||
@@ -1,21 +1,14 @@ | |||
1 | /* | 1 | /* |
2 | * binfmt_misc.c | 2 | * binfmt_misc.c |
3 | * | 3 | * |
4 | * Copyright (C) 1997 Richard Günther | 4 | * Copyright (C) 1997 Richard Günther |
5 | * | 5 | * |
6 | * binfmt_misc detects binaries via a magic or filename extension and invokes | 6 | * binfmt_misc detects binaries via a magic or filename extension and invokes |
7 | * a specified wrapper. This should obsolete binfmt_java, binfmt_em86 and | 7 | * a specified wrapper. See Documentation/binfmt_misc.txt for more details. |
8 | * binfmt_mz. | ||
9 | * | ||
10 | * 1997-04-25 first version | ||
11 | * [...] | ||
12 | * 1997-05-19 cleanup | ||
13 | * 1997-06-26 hpa: pass the real filename rather than argv[0] | ||
14 | * 1997-06-30 minor cleanup | ||
15 | * 1997-08-09 removed extension stripping, locking cleanup | ||
16 | * 2001-02-28 AV: rewritten into something that resembles C. Original didn't. | ||
17 | */ | 8 | */ |
18 | 9 | ||
10 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
11 | |||
19 | #include <linux/module.h> | 12 | #include <linux/module.h> |
20 | #include <linux/init.h> | 13 | #include <linux/init.h> |
21 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
@@ -30,8 +23,13 @@ | |||
30 | #include <linux/mount.h> | 23 | #include <linux/mount.h> |
31 | #include <linux/syscalls.h> | 24 | #include <linux/syscalls.h> |
32 | #include <linux/fs.h> | 25 | #include <linux/fs.h> |
26 | #include <linux/uaccess.h> | ||
33 | 27 | ||
34 | #include <asm/uaccess.h> | 28 | #ifdef DEBUG |
29 | # define USE_DEBUG 1 | ||
30 | #else | ||
31 | # define USE_DEBUG 0 | ||
32 | #endif | ||
35 | 33 | ||
36 | enum { | 34 | enum { |
37 | VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ | 35 | VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ |
@@ -41,9 +39,9 @@ static LIST_HEAD(entries); | |||
41 | static int enabled = 1; | 39 | static int enabled = 1; |
42 | 40 | ||
43 | enum {Enabled, Magic}; | 41 | enum {Enabled, Magic}; |
44 | #define MISC_FMT_PRESERVE_ARGV0 (1<<31) | 42 | #define MISC_FMT_PRESERVE_ARGV0 (1 << 31) |
45 | #define MISC_FMT_OPEN_BINARY (1<<30) | 43 | #define MISC_FMT_OPEN_BINARY (1 << 30) |
46 | #define MISC_FMT_CREDENTIALS (1<<29) | 44 | #define MISC_FMT_CREDENTIALS (1 << 29) |
47 | 45 | ||
48 | typedef struct { | 46 | typedef struct { |
49 | struct list_head list; | 47 | struct list_head list; |
@@ -87,20 +85,24 @@ static Node *check_file(struct linux_binprm *bprm) | |||
87 | char *p = strrchr(bprm->interp, '.'); | 85 | char *p = strrchr(bprm->interp, '.'); |
88 | struct list_head *l; | 86 | struct list_head *l; |
89 | 87 | ||
88 | /* Walk all the registered handlers. */ | ||
90 | list_for_each(l, &entries) { | 89 | list_for_each(l, &entries) { |
91 | Node *e = list_entry(l, Node, list); | 90 | Node *e = list_entry(l, Node, list); |
92 | char *s; | 91 | char *s; |
93 | int j; | 92 | int j; |
94 | 93 | ||
94 | /* Make sure this one is currently enabled. */ | ||
95 | if (!test_bit(Enabled, &e->flags)) | 95 | if (!test_bit(Enabled, &e->flags)) |
96 | continue; | 96 | continue; |
97 | 97 | ||
98 | /* Do matching based on extension if applicable. */ | ||
98 | if (!test_bit(Magic, &e->flags)) { | 99 | if (!test_bit(Magic, &e->flags)) { |
99 | if (p && !strcmp(e->magic, p + 1)) | 100 | if (p && !strcmp(e->magic, p + 1)) |
100 | return e; | 101 | return e; |
101 | continue; | 102 | continue; |
102 | } | 103 | } |
103 | 104 | ||
105 | /* Do matching based on magic & mask. */ | ||
104 | s = bprm->buf + e->offset; | 106 | s = bprm->buf + e->offset; |
105 | if (e->mask) { | 107 | if (e->mask) { |
106 | for (j = 0; j < e->size; j++) | 108 | for (j = 0; j < e->size; j++) |
@@ -123,7 +125,7 @@ static Node *check_file(struct linux_binprm *bprm) | |||
123 | static int load_misc_binary(struct linux_binprm *bprm) | 125 | static int load_misc_binary(struct linux_binprm *bprm) |
124 | { | 126 | { |
125 | Node *fmt; | 127 | Node *fmt; |
126 | struct file * interp_file = NULL; | 128 | struct file *interp_file = NULL; |
127 | char iname[BINPRM_BUF_SIZE]; | 129 | char iname[BINPRM_BUF_SIZE]; |
128 | const char *iname_addr = iname; | 130 | const char *iname_addr = iname; |
129 | int retval; | 131 | int retval; |
@@ -131,7 +133,7 @@ static int load_misc_binary(struct linux_binprm *bprm) | |||
131 | 133 | ||
132 | retval = -ENOEXEC; | 134 | retval = -ENOEXEC; |
133 | if (!enabled) | 135 | if (!enabled) |
134 | goto _ret; | 136 | goto ret; |
135 | 137 | ||
136 | /* to keep locking time low, we copy the interpreter string */ | 138 | /* to keep locking time low, we copy the interpreter string */ |
137 | read_lock(&entries_lock); | 139 | read_lock(&entries_lock); |
@@ -140,25 +142,26 @@ static int load_misc_binary(struct linux_binprm *bprm) | |||
140 | strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); | 142 | strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); |
141 | read_unlock(&entries_lock); | 143 | read_unlock(&entries_lock); |
142 | if (!fmt) | 144 | if (!fmt) |
143 | goto _ret; | 145 | goto ret; |
144 | 146 | ||
145 | if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { | 147 | if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { |
146 | retval = remove_arg_zero(bprm); | 148 | retval = remove_arg_zero(bprm); |
147 | if (retval) | 149 | if (retval) |
148 | goto _ret; | 150 | goto ret; |
149 | } | 151 | } |
150 | 152 | ||
151 | if (fmt->flags & MISC_FMT_OPEN_BINARY) { | 153 | if (fmt->flags & MISC_FMT_OPEN_BINARY) { |
152 | 154 | ||
153 | /* if the binary should be opened on behalf of the | 155 | /* if the binary should be opened on behalf of the |
154 | * interpreter then keep it open and assign descriptor | 156 | * interpreter then keep it open and assign descriptor
155 | * to it */ | 157 | * to it |
156 | fd_binary = get_unused_fd(); | 158 | */ |
157 | if (fd_binary < 0) { | 159 | fd_binary = get_unused_fd_flags(0); |
158 | retval = fd_binary; | 160 | if (fd_binary < 0) { |
159 | goto _ret; | 161 | retval = fd_binary; |
160 | } | 162 | goto ret; |
161 | fd_install(fd_binary, bprm->file); | 163 | } |
164 | fd_install(fd_binary, bprm->file); | ||
162 | 165 | ||
163 | /* if the binary is not readable then enforce mm->dumpable=0 | 166 | /* if the binary is not readable then enforce mm->dumpable=0
164 | regardless of the interpreter's permissions */ | 167 | regardless of the interpreter's permissions */ |
@@ -171,32 +174,32 @@ static int load_misc_binary(struct linux_binprm *bprm) | |||
171 | bprm->interp_flags |= BINPRM_FLAGS_EXECFD; | 174 | bprm->interp_flags |= BINPRM_FLAGS_EXECFD; |
172 | bprm->interp_data = fd_binary; | 175 | bprm->interp_data = fd_binary; |
173 | 176 | ||
174 | } else { | 177 | } else { |
175 | allow_write_access(bprm->file); | 178 | allow_write_access(bprm->file); |
176 | fput(bprm->file); | 179 | fput(bprm->file); |
177 | bprm->file = NULL; | 180 | bprm->file = NULL; |
178 | } | 181 | } |
179 | /* make argv[1] be the path to the binary */ | 182 | /* make argv[1] be the path to the binary */ |
180 | retval = copy_strings_kernel (1, &bprm->interp, bprm); | 183 | retval = copy_strings_kernel(1, &bprm->interp, bprm); |
181 | if (retval < 0) | 184 | if (retval < 0) |
182 | goto _error; | 185 | goto error; |
183 | bprm->argc++; | 186 | bprm->argc++; |
184 | 187 | ||
185 | /* add the interp as argv[0] */ | 188 | /* add the interp as argv[0] */ |
186 | retval = copy_strings_kernel (1, &iname_addr, bprm); | 189 | retval = copy_strings_kernel(1, &iname_addr, bprm); |
187 | if (retval < 0) | 190 | if (retval < 0) |
188 | goto _error; | 191 | goto error; |
189 | bprm->argc ++; | 192 | bprm->argc++; |
190 | 193 | ||
191 | /* Update interp in case binfmt_script needs it. */ | 194 | /* Update interp in case binfmt_script needs it. */ |
192 | retval = bprm_change_interp(iname, bprm); | 195 | retval = bprm_change_interp(iname, bprm); |
193 | if (retval < 0) | 196 | if (retval < 0) |
194 | goto _error; | 197 | goto error; |
195 | 198 | ||
196 | interp_file = open_exec (iname); | 199 | interp_file = open_exec(iname); |
197 | retval = PTR_ERR (interp_file); | 200 | retval = PTR_ERR(interp_file); |
198 | if (IS_ERR (interp_file)) | 201 | if (IS_ERR(interp_file)) |
199 | goto _error; | 202 | goto error; |
200 | 203 | ||
201 | bprm->file = interp_file; | 204 | bprm->file = interp_file; |
202 | if (fmt->flags & MISC_FMT_CREDENTIALS) { | 205 | if (fmt->flags & MISC_FMT_CREDENTIALS) { |
@@ -207,23 +210,23 @@ static int load_misc_binary(struct linux_binprm *bprm) | |||
207 | memset(bprm->buf, 0, BINPRM_BUF_SIZE); | 210 | memset(bprm->buf, 0, BINPRM_BUF_SIZE); |
208 | retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); | 211 | retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); |
209 | } else | 212 | } else |
210 | retval = prepare_binprm (bprm); | 213 | retval = prepare_binprm(bprm); |
211 | 214 | ||
212 | if (retval < 0) | 215 | if (retval < 0) |
213 | goto _error; | 216 | goto error; |
214 | 217 | ||
215 | retval = search_binary_handler(bprm); | 218 | retval = search_binary_handler(bprm); |
216 | if (retval < 0) | 219 | if (retval < 0) |
217 | goto _error; | 220 | goto error; |
218 | 221 | ||
219 | _ret: | 222 | ret: |
220 | return retval; | 223 | return retval; |
221 | _error: | 224 | error: |
222 | if (fd_binary > 0) | 225 | if (fd_binary > 0) |
223 | sys_close(fd_binary); | 226 | sys_close(fd_binary); |
224 | bprm->interp_flags = 0; | 227 | bprm->interp_flags = 0; |
225 | bprm->interp_data = 0; | 228 | bprm->interp_data = 0; |
226 | goto _ret; | 229 | goto ret; |
227 | } | 230 | } |
228 | 231 | ||
229 | /* Command parsers */ | 232 | /* Command parsers */ |
@@ -250,36 +253,40 @@ static char *scanarg(char *s, char del) | |||
250 | return s; | 253 | return s; |
251 | } | 254 | } |
252 | 255 | ||
253 | static char * check_special_flags (char * sfs, Node * e) | 256 | static char *check_special_flags(char *sfs, Node *e) |
254 | { | 257 | { |
255 | char * p = sfs; | 258 | char *p = sfs; |
256 | int cont = 1; | 259 | int cont = 1; |
257 | 260 | ||
258 | /* special flags */ | 261 | /* special flags */ |
259 | while (cont) { | 262 | while (cont) { |
260 | switch (*p) { | 263 | switch (*p) { |
261 | case 'P': | 264 | case 'P': |
262 | p++; | 265 | pr_debug("register: flag: P (preserve argv0)\n"); |
263 | e->flags |= MISC_FMT_PRESERVE_ARGV0; | 266 | p++; |
264 | break; | 267 | e->flags |= MISC_FMT_PRESERVE_ARGV0; |
265 | case 'O': | 268 | break; |
266 | p++; | 269 | case 'O': |
267 | e->flags |= MISC_FMT_OPEN_BINARY; | 270 | pr_debug("register: flag: O (open binary)\n"); |
268 | break; | 271 | p++; |
269 | case 'C': | 272 | e->flags |= MISC_FMT_OPEN_BINARY; |
270 | p++; | 273 | break; |
271 | /* this flags also implies the | 274 | case 'C': |
272 | open-binary flag */ | 275 | pr_debug("register: flag: C (preserve creds)\n"); |
273 | e->flags |= (MISC_FMT_CREDENTIALS | | 276 | p++; |
274 | MISC_FMT_OPEN_BINARY); | 277 | /* this flags also implies the |
275 | break; | 278 | open-binary flag */ |
276 | default: | 279 | e->flags |= (MISC_FMT_CREDENTIALS | |
277 | cont = 0; | 280 | MISC_FMT_OPEN_BINARY); |
281 | break; | ||
282 | default: | ||
283 | cont = 0; | ||
278 | } | 284 | } |
279 | } | 285 | } |
280 | 286 | ||
281 | return p; | 287 | return p; |
282 | } | 288 | } |
289 | |||
283 | /* | 290 | /* |
284 | * This registers a new binary format, it recognises the syntax | 291 | * This registers a new binary format, it recognises the syntax |
285 | * ':name:type:offset:magic:mask:interpreter:flags' | 292 | * ':name:type:offset:magic:mask:interpreter:flags' |
@@ -292,6 +299,8 @@ static Node *create_entry(const char __user *buffer, size_t count) | |||
292 | char *buf, *p; | 299 | char *buf, *p; |
293 | char del; | 300 | char del; |
294 | 301 | ||
302 | pr_debug("register: received %zu bytes\n", count); | ||
303 | |||
295 | /* some sanity checks */ | 304 | /* some sanity checks */ |
296 | err = -EINVAL; | 305 | err = -EINVAL; |
297 | if ((count < 11) || (count > MAX_REGISTER_LENGTH)) | 306 | if ((count < 11) || (count > MAX_REGISTER_LENGTH)) |
@@ -299,7 +308,7 @@ static Node *create_entry(const char __user *buffer, size_t count) | |||
299 | 308 | ||
300 | err = -ENOMEM; | 309 | err = -ENOMEM; |
301 | memsize = sizeof(Node) + count + 8; | 310 | memsize = sizeof(Node) + count + 8; |
302 | e = kmalloc(memsize, GFP_USER); | 311 | e = kmalloc(memsize, GFP_KERNEL); |
303 | if (!e) | 312 | if (!e) |
304 | goto out; | 313 | goto out; |
305 | 314 | ||
@@ -307,98 +316,175 @@ static Node *create_entry(const char __user *buffer, size_t count) | |||
307 | 316 | ||
308 | memset(e, 0, sizeof(Node)); | 317 | memset(e, 0, sizeof(Node)); |
309 | if (copy_from_user(buf, buffer, count)) | 318 | if (copy_from_user(buf, buffer, count)) |
310 | goto Efault; | 319 | goto efault; |
311 | 320 | ||
312 | del = *p++; /* delimeter */ | 321 | del = *p++; /* delimeter */ |
313 | 322 | ||
314 | memset(buf+count, del, 8); | 323 | pr_debug("register: delim: %#x {%c}\n", del, del); |
324 | |||
325 | /* Pad the buffer with the delim to simplify parsing below. */ | ||
326 | memset(buf + count, del, 8); | ||
315 | 327 | ||
328 | /* Parse the 'name' field. */ | ||
316 | e->name = p; | 329 | e->name = p; |
317 | p = strchr(p, del); | 330 | p = strchr(p, del); |
318 | if (!p) | 331 | if (!p) |
319 | goto Einval; | 332 | goto einval; |
320 | *p++ = '\0'; | 333 | *p++ = '\0'; |
321 | if (!e->name[0] || | 334 | if (!e->name[0] || |
322 | !strcmp(e->name, ".") || | 335 | !strcmp(e->name, ".") || |
323 | !strcmp(e->name, "..") || | 336 | !strcmp(e->name, "..") || |
324 | strchr(e->name, '/')) | 337 | strchr(e->name, '/')) |
325 | goto Einval; | 338 | goto einval; |
339 | |||
340 | pr_debug("register: name: {%s}\n", e->name); | ||
341 | |||
342 | /* Parse the 'type' field. */ | ||
326 | switch (*p++) { | 343 | switch (*p++) { |
327 | case 'E': e->flags = 1<<Enabled; break; | 344 | case 'E': |
328 | case 'M': e->flags = (1<<Enabled) | (1<<Magic); break; | 345 | pr_debug("register: type: E (extension)\n"); |
329 | default: goto Einval; | 346 | e->flags = 1 << Enabled; |
347 | break; | ||
348 | case 'M': | ||
349 | pr_debug("register: type: M (magic)\n"); | ||
350 | e->flags = (1 << Enabled) | (1 << Magic); | ||
351 | break; | ||
352 | default: | ||
353 | goto einval; | ||
330 | } | 354 | } |
331 | if (*p++ != del) | 355 | if (*p++ != del) |
332 | goto Einval; | 356 | goto einval; |
357 | |||
333 | if (test_bit(Magic, &e->flags)) { | 358 | if (test_bit(Magic, &e->flags)) { |
334 | char *s = strchr(p, del); | 359 | /* Handle the 'M' (magic) format. */ |
360 | char *s; | ||
361 | |||
362 | /* Parse the 'offset' field. */ | ||
363 | s = strchr(p, del); | ||
335 | if (!s) | 364 | if (!s) |
336 | goto Einval; | 365 | goto einval; |
337 | *s++ = '\0'; | 366 | *s++ = '\0'; |
338 | e->offset = simple_strtoul(p, &p, 10); | 367 | e->offset = simple_strtoul(p, &p, 10); |
339 | if (*p++) | 368 | if (*p++) |
340 | goto Einval; | 369 | goto einval; |
370 | pr_debug("register: offset: %#x\n", e->offset); | ||
371 | |||
372 | /* Parse the 'magic' field. */ | ||
341 | e->magic = p; | 373 | e->magic = p; |
342 | p = scanarg(p, del); | 374 | p = scanarg(p, del); |
343 | if (!p) | 375 | if (!p) |
344 | goto Einval; | 376 | goto einval; |
345 | p[-1] = '\0'; | 377 | p[-1] = '\0'; |
346 | if (!e->magic[0]) | 378 | if (p == e->magic) |
347 | goto Einval; | 379 | goto einval; |
380 | if (USE_DEBUG) | ||
381 | print_hex_dump_bytes( | ||
382 | KBUILD_MODNAME ": register: magic[raw]: ", | ||
383 | DUMP_PREFIX_NONE, e->magic, p - e->magic); | ||
384 | |||
385 | /* Parse the 'mask' field. */ | ||
348 | e->mask = p; | 386 | e->mask = p; |
349 | p = scanarg(p, del); | 387 | p = scanarg(p, del); |
350 | if (!p) | 388 | if (!p) |
351 | goto Einval; | 389 | goto einval; |
352 | p[-1] = '\0'; | 390 | p[-1] = '\0'; |
353 | if (!e->mask[0]) | 391 | if (p == e->mask) { |
354 | e->mask = NULL; | 392 | e->mask = NULL; |
393 | pr_debug("register: mask[raw]: none\n"); | ||
394 | } else if (USE_DEBUG) | ||
395 | print_hex_dump_bytes( | ||
396 | KBUILD_MODNAME ": register: mask[raw]: ", | ||
397 | DUMP_PREFIX_NONE, e->mask, p - e->mask); | ||
398 | |||
399 | /* | ||
400 | * Decode the magic & mask fields. | ||
401 | * Note: while we might have accepted embedded NUL bytes from | ||
402 | * above, the unescape helpers here will stop at the first one | ||
403 | * it encounters. | ||
404 | */ | ||
355 | e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); | 405 | e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); |
356 | if (e->mask && | 406 | if (e->mask && |
357 | string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) | 407 | string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) |
358 | goto Einval; | 408 | goto einval; |
359 | if (e->size + e->offset > BINPRM_BUF_SIZE) | 409 | if (e->size + e->offset > BINPRM_BUF_SIZE) |
360 | goto Einval; | 410 | goto einval; |
411 | pr_debug("register: magic/mask length: %i\n", e->size); | ||
412 | if (USE_DEBUG) { | ||
413 | print_hex_dump_bytes( | ||
414 | KBUILD_MODNAME ": register: magic[decoded]: ", | ||
415 | DUMP_PREFIX_NONE, e->magic, e->size); | ||
416 | |||
417 | if (e->mask) { | ||
418 | int i; | ||
419 | char *masked = kmalloc(e->size, GFP_KERNEL); | ||
420 | |||
421 | print_hex_dump_bytes( | ||
422 | KBUILD_MODNAME ": register: mask[decoded]: ", | ||
423 | DUMP_PREFIX_NONE, e->mask, e->size); | ||
424 | |||
425 | if (masked) { | ||
426 | for (i = 0; i < e->size; ++i) | ||
427 | masked[i] = e->magic[i] & e->mask[i]; | ||
428 | print_hex_dump_bytes( | ||
429 | KBUILD_MODNAME ": register: magic[masked]: ", | ||
430 | DUMP_PREFIX_NONE, masked, e->size); | ||
431 | |||
432 | kfree(masked); | ||
433 | } | ||
434 | } | ||
435 | } | ||
361 | } else { | 436 | } else { |
437 | /* Handle the 'E' (extension) format. */ | ||
438 | |||
439 | /* Skip the 'offset' field. */ | ||
362 | p = strchr(p, del); | 440 | p = strchr(p, del); |
363 | if (!p) | 441 | if (!p) |
364 | goto Einval; | 442 | goto einval; |
365 | *p++ = '\0'; | 443 | *p++ = '\0'; |
444 | |||
445 | /* Parse the 'magic' field. */ | ||
366 | e->magic = p; | 446 | e->magic = p; |
367 | p = strchr(p, del); | 447 | p = strchr(p, del); |
368 | if (!p) | 448 | if (!p) |
369 | goto Einval; | 449 | goto einval; |
370 | *p++ = '\0'; | 450 | *p++ = '\0'; |
371 | if (!e->magic[0] || strchr(e->magic, '/')) | 451 | if (!e->magic[0] || strchr(e->magic, '/')) |
372 | goto Einval; | 452 | goto einval; |
453 | pr_debug("register: extension: {%s}\n", e->magic); | ||
454 | |||
455 | /* Skip the 'mask' field. */ | ||
373 | p = strchr(p, del); | 456 | p = strchr(p, del); |
374 | if (!p) | 457 | if (!p) |
375 | goto Einval; | 458 | goto einval; |
376 | *p++ = '\0'; | 459 | *p++ = '\0'; |
377 | } | 460 | } |
461 | |||
462 | /* Parse the 'interpreter' field. */ | ||
378 | e->interpreter = p; | 463 | e->interpreter = p; |
379 | p = strchr(p, del); | 464 | p = strchr(p, del); |
380 | if (!p) | 465 | if (!p) |
381 | goto Einval; | 466 | goto einval; |
382 | *p++ = '\0'; | 467 | *p++ = '\0'; |
383 | if (!e->interpreter[0]) | 468 | if (!e->interpreter[0]) |
384 | goto Einval; | 469 | goto einval; |
385 | 470 | pr_debug("register: interpreter: {%s}\n", e->interpreter); | |
386 | |||
387 | p = check_special_flags (p, e); | ||
388 | 471 | ||
472 | /* Parse the 'flags' field. */ | ||
473 | p = check_special_flags(p, e); | ||
389 | if (*p == '\n') | 474 | if (*p == '\n') |
390 | p++; | 475 | p++; |
391 | if (p != buf + count) | 476 | if (p != buf + count) |
392 | goto Einval; | 477 | goto einval; |
478 | |||
393 | return e; | 479 | return e; |
394 | 480 | ||
395 | out: | 481 | out: |
396 | return ERR_PTR(err); | 482 | return ERR_PTR(err); |
397 | 483 | ||
398 | Efault: | 484 | efault: |
399 | kfree(e); | 485 | kfree(e); |
400 | return ERR_PTR(-EFAULT); | 486 | return ERR_PTR(-EFAULT); |
401 | Einval: | 487 | einval: |
402 | kfree(e); | 488 | kfree(e); |
403 | return ERR_PTR(-EINVAL); | 489 | return ERR_PTR(-EINVAL); |
404 | } | 490 | } |
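
create_entry() above parses registration strings of the form ':name:type:offset:magic:mask:interpreter:flags'. As a usage illustration, a small user-space program that registers one extension entry and one magic entry; the entry names, the interpreter path and the conventional /proc/sys/fs/binfmt_misc mount point are assumptions of the example, not taken from this patch:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* field order: :name:type:offset:magic:mask:interpreter:flags */
            const char *ext_entry   = ":myfmt:E::xyz::/usr/local/bin/myinterp:";
            const char *magic_entry = ":mymagic:M::\\x7fMAG::/usr/local/bin/myinterp:P";
            int fd = open("/proc/sys/fs/binfmt_misc/register", O_WRONLY);

            if (fd < 0) {
                    perror("open register");
                    return 1;
            }

            /* 'E': match on the .xyz filename extension, empty offset and mask */
            if (write(fd, ext_entry, strlen(ext_entry)) < 0)
                    perror("register extension entry");

            /* 'M': match 0x7f 'M' 'A' 'G' at offset 0, flag P preserves argv[0] */
            if (write(fd, magic_entry, strlen(magic_entry)) < 0)
                    perror("register magic entry");

            close(fd);
            return 0;
    }
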
@@ -417,7 +503,7 @@ static int parse_command(const char __user *buffer, size_t count) | |||
417 | return -EFAULT; | 503 | return -EFAULT; |
418 | if (!count) | 504 | if (!count) |
419 | return 0; | 505 | return 0; |
420 | if (s[count-1] == '\n') | 506 | if (s[count - 1] == '\n') |
421 | count--; | 507 | count--; |
422 | if (count == 1 && s[0] == '0') | 508 | if (count == 1 && s[0] == '0') |
423 | return 1; | 509 | return 1; |
@@ -434,7 +520,7 @@ static void entry_status(Node *e, char *page) | |||
434 | { | 520 | { |
435 | char *dp; | 521 | char *dp; |
436 | char *status = "disabled"; | 522 | char *status = "disabled"; |
437 | const char * flags = "flags: "; | 523 | const char *flags = "flags: "; |
438 | 524 | ||
439 | if (test_bit(Enabled, &e->flags)) | 525 | if (test_bit(Enabled, &e->flags)) |
440 | status = "enabled"; | 526 | status = "enabled"; |
@@ -448,19 +534,15 @@ static void entry_status(Node *e, char *page) | |||
448 | dp = page + strlen(page); | 534 | dp = page + strlen(page); |
449 | 535 | ||
450 | /* print the special flags */ | 536 | /* print the special flags */ |
451 | sprintf (dp, "%s", flags); | 537 | sprintf(dp, "%s", flags); |
452 | dp += strlen (flags); | 538 | dp += strlen(flags); |
453 | if (e->flags & MISC_FMT_PRESERVE_ARGV0) { | 539 | if (e->flags & MISC_FMT_PRESERVE_ARGV0) |
454 | *dp ++ = 'P'; | 540 | *dp++ = 'P'; |
455 | } | 541 | if (e->flags & MISC_FMT_OPEN_BINARY) |
456 | if (e->flags & MISC_FMT_OPEN_BINARY) { | 542 | *dp++ = 'O'; |
457 | *dp ++ = 'O'; | 543 | if (e->flags & MISC_FMT_CREDENTIALS) |
458 | } | 544 | *dp++ = 'C'; |
459 | if (e->flags & MISC_FMT_CREDENTIALS) { | 545 | *dp++ = '\n'; |
460 | *dp ++ = 'C'; | ||
461 | } | ||
462 | *dp ++ = '\n'; | ||
463 | |||
464 | 546 | ||
465 | if (!test_bit(Magic, &e->flags)) { | 547 | if (!test_bit(Magic, &e->flags)) { |
466 | sprintf(dp, "extension .%s\n", e->magic); | 548 | sprintf(dp, "extension .%s\n", e->magic); |
@@ -488,7 +570,7 @@ static void entry_status(Node *e, char *page) | |||
488 | 570 | ||
489 | static struct inode *bm_get_inode(struct super_block *sb, int mode) | 571 | static struct inode *bm_get_inode(struct super_block *sb, int mode) |
490 | { | 572 | { |
491 | struct inode * inode = new_inode(sb); | 573 | struct inode *inode = new_inode(sb); |
492 | 574 | ||
493 | if (inode) { | 575 | if (inode) { |
494 | inode->i_ino = get_next_ino(); | 576 | inode->i_ino = get_next_ino(); |
@@ -528,13 +610,14 @@ static void kill_node(Node *e) | |||
528 | /* /<entry> */ | 610 | /* /<entry> */ |
529 | 611 | ||
530 | static ssize_t | 612 | static ssize_t |
531 | bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) | 613 | bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) |
532 | { | 614 | { |
533 | Node *e = file_inode(file)->i_private; | 615 | Node *e = file_inode(file)->i_private; |
534 | ssize_t res; | 616 | ssize_t res; |
535 | char *page; | 617 | char *page; |
536 | 618 | ||
537 | if (!(page = (char*) __get_free_page(GFP_KERNEL))) | 619 | page = (char *) __get_free_page(GFP_KERNEL); |
620 | if (!page) | ||
538 | return -ENOMEM; | 621 | return -ENOMEM; |
539 | 622 | ||
540 | entry_status(e, page); | 623 | entry_status(e, page); |
@@ -553,20 +636,28 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, | |||
553 | int res = parse_command(buffer, count); | 636 | int res = parse_command(buffer, count); |
554 | 637 | ||
555 | switch (res) { | 638 | switch (res) { |
556 | case 1: clear_bit(Enabled, &e->flags); | 639 | case 1: |
557 | break; | 640 | /* Disable this handler. */ |
558 | case 2: set_bit(Enabled, &e->flags); | 641 | clear_bit(Enabled, &e->flags); |
559 | break; | 642 | break; |
560 | case 3: root = dget(file->f_path.dentry->d_sb->s_root); | 643 | case 2: |
561 | mutex_lock(&root->d_inode->i_mutex); | 644 | /* Enable this handler. */ |
562 | 645 | set_bit(Enabled, &e->flags); | |
563 | kill_node(e); | 646 | break; |
564 | 647 | case 3: | |
565 | mutex_unlock(&root->d_inode->i_mutex); | 648 | /* Delete this handler. */ |
566 | dput(root); | 649 | root = dget(file->f_path.dentry->d_sb->s_root); |
567 | break; | 650 | mutex_lock(&root->d_inode->i_mutex); |
568 | default: return res; | 651 | |
652 | kill_node(e); | ||
653 | |||
654 | mutex_unlock(&root->d_inode->i_mutex); | ||
655 | dput(root); | ||
656 | break; | ||
657 | default: | ||
658 | return res; | ||
569 | } | 659 | } |
660 | |||
570 | return count; | 661 | return count; |
571 | } | 662 | } |
572 | 663 | ||
@@ -654,26 +745,36 @@ bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) | |||
654 | return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); | 745 | return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); |
655 | } | 746 | } |
656 | 747 | ||
657 | static ssize_t bm_status_write(struct file * file, const char __user * buffer, | 748 | static ssize_t bm_status_write(struct file *file, const char __user *buffer, |
658 | size_t count, loff_t *ppos) | 749 | size_t count, loff_t *ppos) |
659 | { | 750 | { |
660 | int res = parse_command(buffer, count); | 751 | int res = parse_command(buffer, count); |
661 | struct dentry *root; | 752 | struct dentry *root; |
662 | 753 | ||
663 | switch (res) { | 754 | switch (res) { |
664 | case 1: enabled = 0; break; | 755 | case 1: |
665 | case 2: enabled = 1; break; | 756 | /* Disable all handlers. */ |
666 | case 3: root = dget(file->f_path.dentry->d_sb->s_root); | 757 | enabled = 0; |
667 | mutex_lock(&root->d_inode->i_mutex); | 758 | break; |
668 | 759 | case 2: | |
669 | while (!list_empty(&entries)) | 760 | /* Enable all handlers. */ |
670 | kill_node(list_entry(entries.next, Node, list)); | 761 | enabled = 1; |
671 | 762 | break; | |
672 | mutex_unlock(&root->d_inode->i_mutex); | 763 | case 3: |
673 | dput(root); | 764 | /* Delete all handlers. */ |
674 | break; | 765 | root = dget(file->f_path.dentry->d_sb->s_root); |
675 | default: return res; | 766 | mutex_lock(&root->d_inode->i_mutex); |
767 | |||
768 | while (!list_empty(&entries)) | ||
769 | kill_node(list_entry(entries.next, Node, list)); | ||
770 | |||
771 | mutex_unlock(&root->d_inode->i_mutex); | ||
772 | dput(root); | ||
773 | break; | ||
774 | default: | ||
775 | return res; | ||
676 | } | 776 | } |
777 | |||
677 | return count; | 778 | return count; |
678 | } | 779 | } |
679 | 780 | ||
@@ -690,14 +791,16 @@ static const struct super_operations s_ops = { | |||
690 | .evict_inode = bm_evict_inode, | 791 | .evict_inode = bm_evict_inode, |
691 | }; | 792 | }; |
692 | 793 | ||
693 | static int bm_fill_super(struct super_block * sb, void * data, int silent) | 794 | static int bm_fill_super(struct super_block *sb, void *data, int silent) |
694 | { | 795 | { |
796 | int err; | ||
695 | static struct tree_descr bm_files[] = { | 797 | static struct tree_descr bm_files[] = { |
696 | [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, | 798 | [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, |
697 | [3] = {"register", &bm_register_operations, S_IWUSR}, | 799 | [3] = {"register", &bm_register_operations, S_IWUSR}, |
698 | /* last one */ {""} | 800 | /* last one */ {""} |
699 | }; | 801 | }; |
700 | int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); | 802 | |
803 | err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); | ||
701 | if (!err) | 804 | if (!err) |
702 | sb->s_op = &s_ops; | 805 | sb->s_op = &s_ops; |
703 | return err; | 806 | return err; |
diff --git a/fs/char_dev.c b/fs/char_dev.c index f77f7702fabe..67b2007f10fe 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c | |||
@@ -117,7 +117,6 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, | |||
117 | goto out; | 117 | goto out; |
118 | } | 118 | } |
119 | major = i; | 119 | major = i; |
120 | ret = major; | ||
121 | } | 120 | } |
122 | 121 | ||
123 | cd->major = major; | 122 | cd->major = major; |
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6d00c419cbae..1ea780bc6376 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c | |||
@@ -38,7 +38,7 @@ static const struct cifs_sid sid_everyone = { | |||
38 | 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; | 38 | 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; |
39 | /* security id for Authenticated Users system group */ | 39 | /* security id for Authenticated Users system group */ |
40 | static const struct cifs_sid sid_authusers = { | 40 | static const struct cifs_sid sid_authusers = { |
41 | 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} }; | 41 | 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; |
42 | /* group users */ | 42 | /* group users */ |
43 | static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; | 43 | static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; |
44 | 44 | ||
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 61d00a6e398f..fa13d5e79f64 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c | |||
@@ -2477,14 +2477,14 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, | |||
2477 | } | 2477 | } |
2478 | parm_data = (struct cifs_posix_lock *) | 2478 | parm_data = (struct cifs_posix_lock *) |
2479 | ((char *)&pSMBr->hdr.Protocol + data_offset); | 2479 | ((char *)&pSMBr->hdr.Protocol + data_offset); |
2480 | if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) | 2480 | if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) |
2481 | pLockData->fl_type = F_UNLCK; | 2481 | pLockData->fl_type = F_UNLCK; |
2482 | else { | 2482 | else { |
2483 | if (parm_data->lock_type == | 2483 | if (parm_data->lock_type == |
2484 | __constant_cpu_to_le16(CIFS_RDLCK)) | 2484 | cpu_to_le16(CIFS_RDLCK)) |
2485 | pLockData->fl_type = F_RDLCK; | 2485 | pLockData->fl_type = F_RDLCK; |
2486 | else if (parm_data->lock_type == | 2486 | else if (parm_data->lock_type == |
2487 | __constant_cpu_to_le16(CIFS_WRLCK)) | 2487 | cpu_to_le16(CIFS_WRLCK)) |
2488 | pLockData->fl_type = F_WRLCK; | 2488 | pLockData->fl_type = F_WRLCK; |
2489 | 2489 | ||
2490 | pLockData->fl_start = le64_to_cpu(parm_data->start); | 2490 | pLockData->fl_start = le64_to_cpu(parm_data->start); |
@@ -3276,25 +3276,25 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, | |||
3276 | pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); | 3276 | pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); |
3277 | 3277 | ||
3278 | pSMB->TotalParameterCount = 0; | 3278 | pSMB->TotalParameterCount = 0; |
3279 | pSMB->TotalDataCount = __constant_cpu_to_le32(2); | 3279 | pSMB->TotalDataCount = cpu_to_le32(2); |
3280 | pSMB->MaxParameterCount = 0; | 3280 | pSMB->MaxParameterCount = 0; |
3281 | pSMB->MaxDataCount = 0; | 3281 | pSMB->MaxDataCount = 0; |
3282 | pSMB->MaxSetupCount = 4; | 3282 | pSMB->MaxSetupCount = 4; |
3283 | pSMB->Reserved = 0; | 3283 | pSMB->Reserved = 0; |
3284 | pSMB->ParameterOffset = 0; | 3284 | pSMB->ParameterOffset = 0; |
3285 | pSMB->DataCount = __constant_cpu_to_le32(2); | 3285 | pSMB->DataCount = cpu_to_le32(2); |
3286 | pSMB->DataOffset = | 3286 | pSMB->DataOffset = |
3287 | cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, | 3287 | cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, |
3288 | compression_state) - 4); /* 84 */ | 3288 | compression_state) - 4); /* 84 */ |
3289 | pSMB->SetupCount = 4; | 3289 | pSMB->SetupCount = 4; |
3290 | pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); | 3290 | pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); |
3291 | pSMB->ParameterCount = 0; | 3291 | pSMB->ParameterCount = 0; |
3292 | pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); | 3292 | pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); |
3293 | pSMB->IsFsctl = 1; /* FSCTL */ | 3293 | pSMB->IsFsctl = 1; /* FSCTL */ |
3294 | pSMB->IsRootFlag = 0; | 3294 | pSMB->IsRootFlag = 0; |
3295 | pSMB->Fid = fid; /* file handle always le */ | 3295 | pSMB->Fid = fid; /* file handle always le */ |
3296 | /* 3 byte pad, followed by 2 byte compress state */ | 3296 | /* 3 byte pad, followed by 2 byte compress state */ |
3297 | pSMB->ByteCount = __constant_cpu_to_le16(5); | 3297 | pSMB->ByteCount = cpu_to_le16(5); |
3298 | inc_rfc1001_len(pSMB, 5); | 3298 | inc_rfc1001_len(pSMB, 5); |
3299 | 3299 | ||
3300 | rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, | 3300 | rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, |
@@ -3430,10 +3430,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, | |||
3430 | cifs_acl->version = cpu_to_le16(1); | 3430 | cifs_acl->version = cpu_to_le16(1); |
3431 | if (acl_type == ACL_TYPE_ACCESS) { | 3431 | if (acl_type == ACL_TYPE_ACCESS) { |
3432 | cifs_acl->access_entry_count = cpu_to_le16(count); | 3432 | cifs_acl->access_entry_count = cpu_to_le16(count); |
3433 | cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); | 3433 | cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); |
3434 | } else if (acl_type == ACL_TYPE_DEFAULT) { | 3434 | } else if (acl_type == ACL_TYPE_DEFAULT) { |
3435 | cifs_acl->default_entry_count = cpu_to_le16(count); | 3435 | cifs_acl->default_entry_count = cpu_to_le16(count); |
3436 | cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); | 3436 | cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); |
3437 | } else { | 3437 | } else { |
3438 | cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); | 3438 | cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); |
3439 | return 0; | 3439 | return 0; |
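These cifs hunks (and the smb2 ones further down) drop the __constant_cpu_to_le{16,32} wrappers in favour of plain cpu_to_le{16,32}; the plain forms already fold to a constant at compile time when given a constant argument, so the __constant_ variants buy nothing. An illustrative userspace stand-in for the same idea; my_cpu_to_le16 is a made-up name, not a kernel macro:

#include <stdint.h>
#include <stdio.h>

/* On a little-endian host this is the identity, on a big-endian host it
 * swaps.  With a constant argument the compiler folds the whole
 * expression, which is why no separate "__constant_" variant is needed. */
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define my_cpu_to_le16(x) ((uint16_t)((((uint16_t)(x) & 0x00ff) << 8) | \
				      (((uint16_t)(x) & 0xff00) >> 8)))
#else
#define my_cpu_to_le16(x) ((uint16_t)(x))
#endif

/* Usable in a static initializer because it folds at build time. */
static const uint16_t vc_number = my_cpu_to_le16(1);

int main(void)
{
	uint16_t runtime_val = 0x1234;

	printf("constant: 0x%04x\n", vc_number);
	printf("runtime:  0x%04x\n", my_cpu_to_le16(runtime_val));
	return 0;
}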
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d535e168a9d3..96b7e9b7706d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -1066,7 +1066,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
1066 | 1066 | ||
1067 | max_num = (max_buf - sizeof(struct smb_hdr)) / | 1067 | max_num = (max_buf - sizeof(struct smb_hdr)) / |
1068 | sizeof(LOCKING_ANDX_RANGE); | 1068 | sizeof(LOCKING_ANDX_RANGE); |
1069 | buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); | 1069 | buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); |
1070 | if (!buf) { | 1070 | if (!buf) { |
1071 | free_xid(xid); | 1071 | free_xid(xid); |
1072 | return -ENOMEM; | 1072 | return -ENOMEM; |
@@ -1401,7 +1401,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, | |||
1401 | 1401 | ||
1402 | max_num = (max_buf - sizeof(struct smb_hdr)) / | 1402 | max_num = (max_buf - sizeof(struct smb_hdr)) / |
1403 | sizeof(LOCKING_ANDX_RANGE); | 1403 | sizeof(LOCKING_ANDX_RANGE); |
1404 | buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); | 1404 | buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); |
1405 | if (!buf) | 1405 | if (!buf) |
1406 | return -ENOMEM; | 1406 | return -ENOMEM; |
1407 | 1407 | ||
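Replacing kzalloc(max_num * sizeof(...)) with kcalloc(max_num, sizeof(...)) lets the allocator reject a multiplication that would overflow instead of letting the caller wrap it into a silently short buffer. A hedged userspace sketch of the check that calloc-style interfaces perform (checked_calloc is an illustrative name):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* With kzalloc(n * size) the multiplication happens in the caller and can
 * wrap before the allocator ever sees it; a calloc-style interface gets n
 * and size separately and can refuse the request. */
static void *checked_calloc(size_t n, size_t size)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;		/* would overflow */
	return calloc(n, size);		/* zeroed, like kzalloc/kcalloc */
}

int main(void)
{
	struct { uint64_t start, len; } *ranges;
	size_t max_num = 128;

	ranges = checked_calloc(max_num, sizeof(*ranges));
	if (!ranges)
		return 1;
	printf("allocated %zu zeroed entries\n", max_num);
	free(ranges);
	return 0;
}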
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 446cb7fb3f58..bce6fdcd5d48 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c | |||
@@ -46,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) | |||
46 | CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, | 46 | CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, |
47 | USHRT_MAX)); | 47 | USHRT_MAX)); |
48 | pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); | 48 | pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); |
49 | pSMB->req.VcNumber = __constant_cpu_to_le16(1); | 49 | pSMB->req.VcNumber = cpu_to_le16(1); |
50 | 50 | ||
51 | /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ | 51 | /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ |
52 | 52 | ||
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 45992944e238..7198eac5dddd 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c | |||
@@ -111,7 +111,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, | |||
111 | return -EINVAL; | 111 | return -EINVAL; |
112 | 112 | ||
113 | max_num = max_buf / sizeof(struct smb2_lock_element); | 113 | max_num = max_buf / sizeof(struct smb2_lock_element); |
114 | buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); | 114 | buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); |
115 | if (!buf) | 115 | if (!buf) |
116 | return -ENOMEM; | 116 | return -ENOMEM; |
117 | 117 | ||
@@ -247,7 +247,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
247 | } | 247 | } |
248 | 248 | ||
249 | max_num = max_buf / sizeof(struct smb2_lock_element); | 249 | max_num = max_buf / sizeof(struct smb2_lock_element); |
250 | buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); | 250 | buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); |
251 | if (!buf) { | 251 | if (!buf) { |
252 | free_xid(xid); | 252 | free_xid(xid); |
253 | return -ENOMEM; | 253 | return -ENOMEM; |
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1a08a34838fc..f1cefc9763ed 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c | |||
@@ -67,27 +67,27 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) | |||
67 | * indexed by command in host byte order | 67 | * indexed by command in host byte order |
68 | */ | 68 | */ |
69 | static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { | 69 | static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { |
70 | /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65), | 70 | /* SMB2_NEGOTIATE */ cpu_to_le16(65), |
71 | /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9), | 71 | /* SMB2_SESSION_SETUP */ cpu_to_le16(9), |
72 | /* SMB2_LOGOFF */ __constant_cpu_to_le16(4), | 72 | /* SMB2_LOGOFF */ cpu_to_le16(4), |
73 | /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16), | 73 | /* SMB2_TREE_CONNECT */ cpu_to_le16(16), |
74 | /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4), | 74 | /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4), |
75 | /* SMB2_CREATE */ __constant_cpu_to_le16(89), | 75 | /* SMB2_CREATE */ cpu_to_le16(89), |
76 | /* SMB2_CLOSE */ __constant_cpu_to_le16(60), | 76 | /* SMB2_CLOSE */ cpu_to_le16(60), |
77 | /* SMB2_FLUSH */ __constant_cpu_to_le16(4), | 77 | /* SMB2_FLUSH */ cpu_to_le16(4), |
78 | /* SMB2_READ */ __constant_cpu_to_le16(17), | 78 | /* SMB2_READ */ cpu_to_le16(17), |
79 | /* SMB2_WRITE */ __constant_cpu_to_le16(17), | 79 | /* SMB2_WRITE */ cpu_to_le16(17), |
80 | /* SMB2_LOCK */ __constant_cpu_to_le16(4), | 80 | /* SMB2_LOCK */ cpu_to_le16(4), |
81 | /* SMB2_IOCTL */ __constant_cpu_to_le16(49), | 81 | /* SMB2_IOCTL */ cpu_to_le16(49), |
82 | /* BB CHECK this ... not listed in documentation */ | 82 | /* BB CHECK this ... not listed in documentation */ |
83 | /* SMB2_CANCEL */ __constant_cpu_to_le16(0), | 83 | /* SMB2_CANCEL */ cpu_to_le16(0), |
84 | /* SMB2_ECHO */ __constant_cpu_to_le16(4), | 84 | /* SMB2_ECHO */ cpu_to_le16(4), |
85 | /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9), | 85 | /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9), |
86 | /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9), | 86 | /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9), |
87 | /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9), | 87 | /* SMB2_QUERY_INFO */ cpu_to_le16(9), |
88 | /* SMB2_SET_INFO */ __constant_cpu_to_le16(2), | 88 | /* SMB2_SET_INFO */ cpu_to_le16(2), |
89 | /* BB FIXME can also be 44 for lease break */ | 89 | /* BB FIXME can also be 44 for lease break */ |
90 | /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24) | 90 | /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) |
91 | }; | 91 | }; |
92 | 92 | ||
93 | int | 93 | int |
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 568f323665c8..93fd0586f9ec 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c | |||
@@ -600,7 +600,7 @@ smb2_clone_range(const unsigned int xid, | |||
600 | goto cchunk_out; | 600 | goto cchunk_out; |
601 | 601 | ||
602 | /* For now array only one chunk long, will make more flexible later */ | 602 | /* For now array only one chunk long, will make more flexible later */ |
603 | pcchunk->ChunkCount = __constant_cpu_to_le32(1); | 603 | pcchunk->ChunkCount = cpu_to_le32(1); |
604 | pcchunk->Reserved = 0; | 604 | pcchunk->Reserved = 0; |
605 | pcchunk->Reserved2 = 0; | 605 | pcchunk->Reserved2 = 0; |
606 | 606 | ||
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0ca7f6364754..3417340bf89e 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c | |||
@@ -1358,7 +1358,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, | |||
1358 | char *ret_data = NULL; | 1358 | char *ret_data = NULL; |
1359 | 1359 | ||
1360 | fsctl_input.CompressionState = | 1360 | fsctl_input.CompressionState = |
1361 | __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); | 1361 | cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); |
1362 | 1362 | ||
1363 | rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, | 1363 | rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, |
1364 | FSCTL_SET_COMPRESSION, true /* is_fsctl */, | 1364 | FSCTL_SET_COMPRESSION, true /* is_fsctl */, |
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index d84f46c5b2c5..ce858477002a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h | |||
@@ -85,7 +85,7 @@ | |||
85 | /* BB FIXME - analyze following length BB */ | 85 | /* BB FIXME - analyze following length BB */ |
86 | #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ | 86 | #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ |
87 | 87 | ||
88 | #define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe) | 88 | #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) |
89 | 89 | ||
90 | /* | 90 | /* |
91 | * SMB2 Header Definition | 91 | * SMB2 Header Definition |
@@ -96,7 +96,7 @@ | |||
96 | * | 96 | * |
97 | */ | 97 | */ |
98 | 98 | ||
99 | #define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64) | 99 | #define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) |
100 | 100 | ||
101 | struct smb2_hdr { | 101 | struct smb2_hdr { |
102 | __be32 smb2_buf_length; /* big endian on wire */ | 102 | __be32 smb2_buf_length; /* big endian on wire */ |
@@ -137,16 +137,16 @@ struct smb2_transform_hdr { | |||
137 | } __packed; | 137 | } __packed; |
138 | 138 | ||
139 | /* Encryption Algorithms */ | 139 | /* Encryption Algorithms */ |
140 | #define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001) | 140 | #define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) |
141 | 141 | ||
142 | /* | 142 | /* |
143 | * SMB2 flag definitions | 143 | * SMB2 flag definitions |
144 | */ | 144 | */ |
145 | #define SMB2_FLAGS_SERVER_TO_REDIR __constant_cpu_to_le32(0x00000001) | 145 | #define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) |
146 | #define SMB2_FLAGS_ASYNC_COMMAND __constant_cpu_to_le32(0x00000002) | 146 | #define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) |
147 | #define SMB2_FLAGS_RELATED_OPERATIONS __constant_cpu_to_le32(0x00000004) | 147 | #define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) |
148 | #define SMB2_FLAGS_SIGNED __constant_cpu_to_le32(0x00000008) | 148 | #define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) |
149 | #define SMB2_FLAGS_DFS_OPERATIONS __constant_cpu_to_le32(0x10000000) | 149 | #define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) |
150 | 150 | ||
151 | /* | 151 | /* |
152 | * Definitions for SMB2 Protocol Data Units (network frames) | 152 | * Definitions for SMB2 Protocol Data Units (network frames) |
@@ -157,7 +157,7 @@ struct smb2_transform_hdr { | |||
157 | * | 157 | * |
158 | */ | 158 | */ |
159 | 159 | ||
160 | #define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9) | 160 | #define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) |
161 | 161 | ||
162 | struct smb2_err_rsp { | 162 | struct smb2_err_rsp { |
163 | struct smb2_hdr hdr; | 163 | struct smb2_hdr hdr; |
@@ -502,12 +502,12 @@ struct create_context { | |||
502 | #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 | 502 | #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 |
503 | #define SMB2_LEASE_WRITE_CACHING_HE 0x04 | 503 | #define SMB2_LEASE_WRITE_CACHING_HE 0x04 |
504 | 504 | ||
505 | #define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) | 505 | #define SMB2_LEASE_NONE cpu_to_le32(0x00) |
506 | #define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) | 506 | #define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01) |
507 | #define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) | 507 | #define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02) |
508 | #define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04) | 508 | #define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04) |
509 | 509 | ||
510 | #define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02) | 510 | #define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02) |
511 | 511 | ||
512 | #define SMB2_LEASE_KEY_SIZE 16 | 512 | #define SMB2_LEASE_KEY_SIZE 16 |
513 | 513 | ||
diff --git a/fs/file.c b/fs/file.c --- a/fs/file.c +++ b/fs/file.c | |||
@@ -869,7 +869,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) | |||
869 | struct file *file = fget_raw(fildes); | 869 | struct file *file = fget_raw(fildes); |
870 | 870 | ||
871 | if (file) { | 871 | if (file) { |
872 | ret = get_unused_fd(); | 872 | ret = get_unused_fd_flags(0); |
873 | if (ret >= 0) | 873 | if (ret >= 0) |
874 | fd_install(ret, file); | 874 | fd_install(ret, file); |
875 | else | 875 | else |
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index ff0316b925a5..db458ee3a546 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c | |||
@@ -162,14 +162,16 @@ err2: | |||
162 | */ | 162 | */ |
163 | int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) | 163 | int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) |
164 | { | 164 | { |
165 | int retval; | 165 | __be32 k1p, k2p; |
166 | 166 | ||
167 | retval = be32_to_cpu(key1->cat.ParID) - be32_to_cpu(key2->cat.ParID); | 167 | k1p = key1->cat.ParID; |
168 | if (!retval) | 168 | k2p = key2->cat.ParID; |
169 | retval = hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, | ||
170 | key2->cat.CName.name, key2->cat.CName.len); | ||
171 | 169 | ||
172 | return retval; | 170 | if (k1p != k2p) |
171 | return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; | ||
172 | |||
173 | return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, | ||
174 | key2->cat.CName.name, key2->cat.CName.len); | ||
173 | } | 175 | } |
174 | 176 | ||
175 | /* Try to get a catalog entry for given catalog id */ | 177 | /* Try to get a catalog entry for given catalog id */ |
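The old hfs_cat_keycmp() computed be32_to_cpu(a) - be32_to_cpu(b), and that subtraction, truncated to int, can wrap when the parent IDs differ by more than INT_MAX, reporting the wrong ordering; the new code compares explicitly. A small userspace demonstration of the pitfall (names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Broken: the u32 subtraction truncated to int can wrap, so a genuinely
 * larger key can compare as "smaller". */
static int cmp_by_subtraction(uint32_t a, uint32_t b)
{
	return (int)(a - b);
}

/* Fixed: explicit three-way comparison, as in the new hfs_cat_keycmp(). */
static int cmp_three_way(uint32_t a, uint32_t b)
{
	if (a != b)
		return a < b ? -1 : 1;
	return 0;
}

int main(void)
{
	uint32_t small = 1, big = 0x90000000u;	/* differ by more than INT_MAX */

	printf("subtraction says big %s small\n",
	       cmp_by_subtraction(big, small) > 0 ? ">" : "<=");
	printf("three-way says big %s small\n",
	       cmp_three_way(big, small) > 0 ? ">" : "<=");
	return 0;
}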
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index d5659d96ee7f..cf7e043a9447 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c | |||
@@ -447,7 +447,6 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg | |||
447 | result = -EIO; | 447 | result = -EIO; |
448 | } | 448 | } |
449 | } | 449 | } |
450 | result = 0; | ||
451 | } | 450 | } |
452 | mutex_unlock(&server->root_setup_lock); | 451 | mutex_unlock(&server->root_setup_lock); |
453 | 452 | ||
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index e9e3325f29f3..3a03e0aea1fb 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c | |||
@@ -39,21 +39,15 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
39 | */ | 39 | */ |
40 | struct the_nilfs *nilfs; | 40 | struct the_nilfs *nilfs; |
41 | struct inode *inode = file->f_mapping->host; | 41 | struct inode *inode = file->f_mapping->host; |
42 | int err; | 42 | int err = 0; |
43 | |||
44 | err = filemap_write_and_wait_range(inode->i_mapping, start, end); | ||
45 | if (err) | ||
46 | return err; | ||
47 | mutex_lock(&inode->i_mutex); | ||
48 | 43 | ||
49 | if (nilfs_inode_dirty(inode)) { | 44 | if (nilfs_inode_dirty(inode)) { |
50 | if (datasync) | 45 | if (datasync) |
51 | err = nilfs_construct_dsync_segment(inode->i_sb, inode, | 46 | err = nilfs_construct_dsync_segment(inode->i_sb, inode, |
52 | 0, LLONG_MAX); | 47 | start, end); |
53 | else | 48 | else |
54 | err = nilfs_construct_segment(inode->i_sb); | 49 | err = nilfs_construct_segment(inode->i_sb); |
55 | } | 50 | } |
56 | mutex_unlock(&inode->i_mutex); | ||
57 | 51 | ||
58 | nilfs = inode->i_sb->s_fs_info; | 52 | nilfs = inode->i_sb->s_fs_info; |
59 | if (!err) | 53 | if (!err) |
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index e1fa69b341b9..8b5969538f39 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c | |||
@@ -49,6 +49,8 @@ struct nilfs_iget_args { | |||
49 | int for_gc; | 49 | int for_gc; |
50 | }; | 50 | }; |
51 | 51 | ||
52 | static int nilfs_iget_test(struct inode *inode, void *opaque); | ||
53 | |||
52 | void nilfs_inode_add_blocks(struct inode *inode, int n) | 54 | void nilfs_inode_add_blocks(struct inode *inode, int n) |
53 | { | 55 | { |
54 | struct nilfs_root *root = NILFS_I(inode)->i_root; | 56 | struct nilfs_root *root = NILFS_I(inode)->i_root; |
@@ -348,6 +350,17 @@ const struct address_space_operations nilfs_aops = { | |||
348 | .is_partially_uptodate = block_is_partially_uptodate, | 350 | .is_partially_uptodate = block_is_partially_uptodate, |
349 | }; | 351 | }; |
350 | 352 | ||
353 | static int nilfs_insert_inode_locked(struct inode *inode, | ||
354 | struct nilfs_root *root, | ||
355 | unsigned long ino) | ||
356 | { | ||
357 | struct nilfs_iget_args args = { | ||
358 | .ino = ino, .root = root, .cno = 0, .for_gc = 0 | ||
359 | }; | ||
360 | |||
361 | return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); | ||
362 | } | ||
363 | |||
351 | struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) | 364 | struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) |
352 | { | 365 | { |
353 | struct super_block *sb = dir->i_sb; | 366 | struct super_block *sb = dir->i_sb; |
@@ -383,7 +396,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) | |||
383 | if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { | 396 | if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { |
384 | err = nilfs_bmap_read(ii->i_bmap, NULL); | 397 | err = nilfs_bmap_read(ii->i_bmap, NULL); |
385 | if (err < 0) | 398 | if (err < 0) |
386 | goto failed_bmap; | 399 | goto failed_after_creation; |
387 | 400 | ||
388 | set_bit(NILFS_I_BMAP, &ii->i_state); | 401 | set_bit(NILFS_I_BMAP, &ii->i_state); |
389 | /* No lock is needed; iget() ensures it. */ | 402 | /* No lock is needed; iget() ensures it. */ |
@@ -399,21 +412,24 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) | |||
399 | spin_lock(&nilfs->ns_next_gen_lock); | 412 | spin_lock(&nilfs->ns_next_gen_lock); |
400 | inode->i_generation = nilfs->ns_next_generation++; | 413 | inode->i_generation = nilfs->ns_next_generation++; |
401 | spin_unlock(&nilfs->ns_next_gen_lock); | 414 | spin_unlock(&nilfs->ns_next_gen_lock); |
402 | insert_inode_hash(inode); | 415 | if (nilfs_insert_inode_locked(inode, root, ino) < 0) { |
416 | err = -EIO; | ||
417 | goto failed_after_creation; | ||
418 | } | ||
403 | 419 | ||
404 | err = nilfs_init_acl(inode, dir); | 420 | err = nilfs_init_acl(inode, dir); |
405 | if (unlikely(err)) | 421 | if (unlikely(err)) |
406 | goto failed_acl; /* never occur. When supporting | 422 | goto failed_after_creation; /* never occur. When supporting |
407 | nilfs_init_acl(), proper cancellation of | 423 | nilfs_init_acl(), proper cancellation of |
408 | above jobs should be considered */ | 424 | above jobs should be considered */ |
409 | 425 | ||
410 | return inode; | 426 | return inode; |
411 | 427 | ||
412 | failed_acl: | 428 | failed_after_creation: |
413 | failed_bmap: | ||
414 | clear_nlink(inode); | 429 | clear_nlink(inode); |
430 | unlock_new_inode(inode); | ||
415 | iput(inode); /* raw_inode will be deleted through | 431 | iput(inode); /* raw_inode will be deleted through |
416 | generic_delete_inode() */ | 432 | nilfs_evict_inode() */ |
417 | goto failed; | 433 | goto failed; |
418 | 434 | ||
419 | failed_ifile_create_inode: | 435 | failed_ifile_create_inode: |
@@ -461,8 +477,8 @@ int nilfs_read_inode_common(struct inode *inode, | |||
461 | inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); | 477 | inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); |
462 | inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); | 478 | inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); |
463 | inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); | 479 | inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); |
464 | if (inode->i_nlink == 0 && inode->i_mode == 0) | 480 | if (inode->i_nlink == 0) |
465 | return -EINVAL; /* this inode is deleted */ | 481 | return -ESTALE; /* this inode is deleted */ |
466 | 482 | ||
467 | inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); | 483 | inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); |
468 | ii->i_flags = le32_to_cpu(raw_inode->i_flags); | 484 | ii->i_flags = le32_to_cpu(raw_inode->i_flags); |
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 9de78f08989e..0f84b257932c 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c | |||
@@ -51,9 +51,11 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) | |||
51 | int err = nilfs_add_link(dentry, inode); | 51 | int err = nilfs_add_link(dentry, inode); |
52 | if (!err) { | 52 | if (!err) { |
53 | d_instantiate(dentry, inode); | 53 | d_instantiate(dentry, inode); |
54 | unlock_new_inode(inode); | ||
54 | return 0; | 55 | return 0; |
55 | } | 56 | } |
56 | inode_dec_link_count(inode); | 57 | inode_dec_link_count(inode); |
58 | unlock_new_inode(inode); | ||
57 | iput(inode); | 59 | iput(inode); |
58 | return err; | 60 | return err; |
59 | } | 61 | } |
@@ -182,6 +184,7 @@ out: | |||
182 | out_fail: | 184 | out_fail: |
183 | drop_nlink(inode); | 185 | drop_nlink(inode); |
184 | nilfs_mark_inode_dirty(inode); | 186 | nilfs_mark_inode_dirty(inode); |
187 | unlock_new_inode(inode); | ||
185 | iput(inode); | 188 | iput(inode); |
186 | goto out; | 189 | goto out; |
187 | } | 190 | } |
@@ -201,11 +204,15 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, | |||
201 | inode_inc_link_count(inode); | 204 | inode_inc_link_count(inode); |
202 | ihold(inode); | 205 | ihold(inode); |
203 | 206 | ||
204 | err = nilfs_add_nondir(dentry, inode); | 207 | err = nilfs_add_link(dentry, inode); |
205 | if (!err) | 208 | if (!err) { |
209 | d_instantiate(dentry, inode); | ||
206 | err = nilfs_transaction_commit(dir->i_sb); | 210 | err = nilfs_transaction_commit(dir->i_sb); |
207 | else | 211 | } else { |
212 | inode_dec_link_count(inode); | ||
213 | iput(inode); | ||
208 | nilfs_transaction_abort(dir->i_sb); | 214 | nilfs_transaction_abort(dir->i_sb); |
215 | } | ||
209 | 216 | ||
210 | return err; | 217 | return err; |
211 | } | 218 | } |
@@ -243,6 +250,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
243 | 250 | ||
244 | nilfs_mark_inode_dirty(inode); | 251 | nilfs_mark_inode_dirty(inode); |
245 | d_instantiate(dentry, inode); | 252 | d_instantiate(dentry, inode); |
253 | unlock_new_inode(inode); | ||
246 | out: | 254 | out: |
247 | if (!err) | 255 | if (!err) |
248 | err = nilfs_transaction_commit(dir->i_sb); | 256 | err = nilfs_transaction_commit(dir->i_sb); |
@@ -255,6 +263,7 @@ out_fail: | |||
255 | drop_nlink(inode); | 263 | drop_nlink(inode); |
256 | drop_nlink(inode); | 264 | drop_nlink(inode); |
257 | nilfs_mark_inode_dirty(inode); | 265 | nilfs_mark_inode_dirty(inode); |
266 | unlock_new_inode(inode); | ||
258 | iput(inode); | 267 | iput(inode); |
259 | out_dir: | 268 | out_dir: |
260 | drop_nlink(dir); | 269 | drop_nlink(dir); |
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 9da25fe9ea61..69bd801afb53 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c | |||
@@ -808,8 +808,7 @@ void nilfs_put_root(struct nilfs_root *root) | |||
808 | spin_lock(&nilfs->ns_cptree_lock); | 808 | spin_lock(&nilfs->ns_cptree_lock); |
809 | rb_erase(&root->rb_node, &nilfs->ns_cptree); | 809 | rb_erase(&root->rb_node, &nilfs->ns_cptree); |
810 | spin_unlock(&nilfs->ns_cptree_lock); | 810 | spin_unlock(&nilfs->ns_cptree_lock); |
811 | if (root->ifile) | 811 | iput(root->ifile); |
812 | iput(root->ifile); | ||
813 | 812 | ||
814 | kfree(root); | 813 | kfree(root); |
815 | } | 814 | } |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1ef547e49373..d9f222987f24 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -1251,7 +1251,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, | |||
1251 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | 1251 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, |
1252 | NULL); | 1252 | NULL); |
1253 | if (ret < 0) { | 1253 | if (ret < 0) { |
1254 | ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " | 1254 | mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " |
1255 | "at logical block %llu", | 1255 | "at logical block %llu", |
1256 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1256 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1257 | (unsigned long long)v_blkno); | 1257 | (unsigned long long)v_blkno); |
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index eb9d48746ab4..16eff45727ee 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -1127,10 +1127,10 @@ static int o2hb_thread(void *data) | |||
1127 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); | 1127 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); |
1128 | 1128 | ||
1129 | mlog(ML_HEARTBEAT, | 1129 | mlog(ML_HEARTBEAT, |
1130 | "start = %lu.%lu, end = %lu.%lu, msec = %u\n", | 1130 | "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", |
1131 | before_hb.tv_sec, (unsigned long) before_hb.tv_usec, | 1131 | before_hb.tv_sec, (unsigned long) before_hb.tv_usec, |
1132 | after_hb.tv_sec, (unsigned long) after_hb.tv_usec, | 1132 | after_hb.tv_sec, (unsigned long) after_hb.tv_usec, |
1133 | elapsed_msec); | 1133 | elapsed_msec, ret); |
1134 | 1134 | ||
1135 | if (!kthread_should_stop() && | 1135 | if (!kthread_should_stop() && |
1136 | elapsed_msec < reg->hr_timeout_ms) { | 1136 | elapsed_msec < reg->hr_timeout_ms) { |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a96044004064..2e355e0f8335 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -1736,7 +1736,7 @@ static void o2net_connect_expired(struct work_struct *work) | |||
1736 | o2net_idle_timeout() / 1000, | 1736 | o2net_idle_timeout() / 1000, |
1737 | o2net_idle_timeout() % 1000); | 1737 | o2net_idle_timeout() % 1000); |
1738 | 1738 | ||
1739 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | 1739 | o2net_set_nn_state(nn, NULL, 0, 0); |
1740 | } | 1740 | } |
1741 | spin_unlock(&nn->nn_lock); | 1741 | spin_unlock(&nn->nn_lock); |
1742 | } | 1742 | } |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c43d9b4a1ec0..79d56dc981bc 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -744,7 +744,7 @@ restart: | |||
744 | if (ocfs2_read_dir_block(dir, block, &bh, 0)) { | 744 | if (ocfs2_read_dir_block(dir, block, &bh, 0)) { |
745 | /* read error, skip block & hope for the best. | 745 | /* read error, skip block & hope for the best. |
746 | * ocfs2_read_dir_block() has released the bh. */ | 746 | * ocfs2_read_dir_block() has released the bh. */ |
747 | ocfs2_error(dir->i_sb, "reading directory %llu, " | 747 | mlog(ML_ERROR, "reading directory %llu, " |
748 | "offset %lu\n", | 748 | "offset %lu\n", |
749 | (unsigned long long)OCFS2_I(dir)->ip_blkno, | 749 | (unsigned long long)OCFS2_I(dir)->ip_blkno, |
750 | block); | 750 | block); |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 02d315fef432..50a59d2337b2 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -877,7 +877,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, | |||
877 | * to be put in someone's domain map. | 877 | * to be put in someone's domain map. |
878 | * Also, explicitly disallow joining at certain troublesome | 878 | * Also, explicitly disallow joining at certain troublesome |
879 | * times (ie. during recovery). */ | 879 | * times (ie. during recovery). */ |
880 | if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { | 880 | if (dlm->dlm_state != DLM_CTXT_LEAVING) { |
881 | int bit = query->node_idx; | 881 | int bit = query->node_idx; |
882 | spin_lock(&dlm->spinlock); | 882 | spin_lock(&dlm->spinlock); |
883 | 883 | ||
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 215e41abf101..3689b3592042 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -1460,6 +1460,18 @@ way_up_top: | |||
1460 | 1460 | ||
1461 | /* take care of the easy cases up front */ | 1461 | /* take care of the easy cases up front */ |
1462 | spin_lock(&res->spinlock); | 1462 | spin_lock(&res->spinlock); |
1463 | |||
1464 | /* | ||
1465 | * Right after dlm spinlock was released, dlm_thread could have | ||
1466 | * purged the lockres. Check if lockres got unhashed. If so | ||
1467 | * start over. | ||
1468 | */ | ||
1469 | if (hlist_unhashed(&res->hash_node)) { | ||
1470 | spin_unlock(&res->spinlock); | ||
1471 | dlm_lockres_put(res); | ||
1472 | goto way_up_top; | ||
1473 | } | ||
1474 | |||
1463 | if (res->state & (DLM_LOCK_RES_RECOVERING| | 1475 | if (res->state & (DLM_LOCK_RES_RECOVERING| |
1464 | DLM_LOCK_RES_MIGRATING)) { | 1476 | DLM_LOCK_RES_MIGRATING)) { |
1465 | spin_unlock(&res->spinlock); | 1477 | spin_unlock(&res->spinlock); |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 3365839d2971..79b5af5e6a7b 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -1656,14 +1656,18 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
1656 | req.namelen = res->lockname.len; | 1656 | req.namelen = res->lockname.len; |
1657 | memcpy(req.name, res->lockname.name, res->lockname.len); | 1657 | memcpy(req.name, res->lockname.name, res->lockname.len); |
1658 | 1658 | ||
1659 | resend: | ||
1659 | ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, | 1660 | ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, |
1660 | &req, sizeof(req), nodenum, &status); | 1661 | &req, sizeof(req), nodenum, &status); |
1661 | /* XXX: negative status not handled properly here. */ | ||
1662 | if (ret < 0) | 1662 | if (ret < 0) |
1663 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 1663 | mlog(ML_ERROR, "Error %d when sending message %u (key " |
1664 | "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, | 1664 | "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, |
1665 | dlm->key, nodenum); | 1665 | dlm->key, nodenum); |
1666 | else { | 1666 | else if (status == -ENOMEM) { |
1667 | mlog_errno(status); | ||
1668 | msleep(50); | ||
1669 | goto resend; | ||
1670 | } else { | ||
1667 | BUG_ON(status < 0); | 1671 | BUG_ON(status < 0); |
1668 | BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); | 1672 | BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); |
1669 | *real_master = (u8) (status & 0xff); | 1673 | *real_master = (u8) (status & 0xff); |
@@ -1705,9 +1709,13 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, | |||
1705 | int ret = dlm_dispatch_assert_master(dlm, res, | 1709 | int ret = dlm_dispatch_assert_master(dlm, res, |
1706 | 0, 0, flags); | 1710 | 0, 0, flags); |
1707 | if (ret < 0) { | 1711 | if (ret < 0) { |
1708 | mlog_errno(-ENOMEM); | 1712 | mlog_errno(ret); |
1709 | /* retry!? */ | 1713 | spin_unlock(&res->spinlock); |
1710 | BUG(); | 1714 | dlm_lockres_put(res); |
1715 | spin_unlock(&dlm->spinlock); | ||
1716 | dlm_put(dlm); | ||
1717 | /* sender will take care of this and retry */ | ||
1718 | return ret; | ||
1711 | } else | 1719 | } else |
1712 | __dlm_lockres_grab_inflight_worker(dlm, res); | 1720 | __dlm_lockres_grab_inflight_worker(dlm, res); |
1713 | spin_unlock(&res->spinlock); | 1721 | spin_unlock(&res->spinlock); |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 37297c14f9a3..1c423af04c69 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -861,8 +861,13 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo | |||
861 | * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing | 861 | * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing |
862 | * the OCFS2_LOCK_BUSY flag to prevent the dc thread from | 862 | * the OCFS2_LOCK_BUSY flag to prevent the dc thread from |
863 | * downconverting the lock before the upconvert has fully completed. | 863 | * downconverting the lock before the upconvert has fully completed. |
864 | * Do not prevent the dc thread from downconverting if NONBLOCK lock | ||
865 | * had already returned. | ||
864 | */ | 866 | */ |
865 | lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); | 867 | if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED)) |
868 | lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); | ||
869 | else | ||
870 | lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED); | ||
866 | 871 | ||
867 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | 872 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); |
868 | } | 873 | } |
@@ -1324,13 +1329,12 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, | |||
1324 | 1329 | ||
1325 | /* returns 0 if the mw that was removed was already satisfied, -EBUSY | 1330 | /* returns 0 if the mw that was removed was already satisfied, -EBUSY |
1326 | * if the mask still hadn't reached its goal */ | 1331 | * if the mask still hadn't reached its goal */ |
1327 | static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, | 1332 | static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, |
1328 | struct ocfs2_mask_waiter *mw) | 1333 | struct ocfs2_mask_waiter *mw) |
1329 | { | 1334 | { |
1330 | unsigned long flags; | ||
1331 | int ret = 0; | 1335 | int ret = 0; |
1332 | 1336 | ||
1333 | spin_lock_irqsave(&lockres->l_lock, flags); | 1337 | assert_spin_locked(&lockres->l_lock); |
1334 | if (!list_empty(&mw->mw_item)) { | 1338 | if (!list_empty(&mw->mw_item)) { |
1335 | if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) | 1339 | if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) |
1336 | ret = -EBUSY; | 1340 | ret = -EBUSY; |
@@ -1338,6 +1342,18 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, | |||
1338 | list_del_init(&mw->mw_item); | 1342 | list_del_init(&mw->mw_item); |
1339 | init_completion(&mw->mw_complete); | 1343 | init_completion(&mw->mw_complete); |
1340 | } | 1344 | } |
1345 | |||
1346 | return ret; | ||
1347 | } | ||
1348 | |||
1349 | static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, | ||
1350 | struct ocfs2_mask_waiter *mw) | ||
1351 | { | ||
1352 | unsigned long flags; | ||
1353 | int ret = 0; | ||
1354 | |||
1355 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1356 | ret = __lockres_remove_mask_waiter(lockres, mw); | ||
1341 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1357 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
1342 | 1358 | ||
1343 | return ret; | 1359 | return ret; |
@@ -1373,6 +1389,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, | |||
1373 | unsigned long flags; | 1389 | unsigned long flags; |
1374 | unsigned int gen; | 1390 | unsigned int gen; |
1375 | int noqueue_attempted = 0; | 1391 | int noqueue_attempted = 0; |
1392 | int dlm_locked = 0; | ||
1376 | 1393 | ||
1377 | ocfs2_init_mask_waiter(&mw); | 1394 | ocfs2_init_mask_waiter(&mw); |
1378 | 1395 | ||
@@ -1481,6 +1498,7 @@ again: | |||
1481 | ocfs2_recover_from_dlm_error(lockres, 1); | 1498 | ocfs2_recover_from_dlm_error(lockres, 1); |
1482 | goto out; | 1499 | goto out; |
1483 | } | 1500 | } |
1501 | dlm_locked = 1; | ||
1484 | 1502 | ||
1485 | mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", | 1503 | mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", |
1486 | lockres->l_name); | 1504 | lockres->l_name); |
@@ -1514,10 +1532,17 @@ out: | |||
1514 | if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && | 1532 | if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && |
1515 | mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { | 1533 | mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { |
1516 | wait = 0; | 1534 | wait = 0; |
1517 | if (lockres_remove_mask_waiter(lockres, &mw)) | 1535 | spin_lock_irqsave(&lockres->l_lock, flags); |
1536 | if (__lockres_remove_mask_waiter(lockres, &mw)) { | ||
1537 | if (dlm_locked) | ||
1538 | lockres_or_flags(lockres, | ||
1539 | OCFS2_LOCK_NONBLOCK_FINISHED); | ||
1540 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1518 | ret = -EAGAIN; | 1541 | ret = -EAGAIN; |
1519 | else | 1542 | } else { |
1543 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1520 | goto again; | 1544 | goto again; |
1545 | } | ||
1521 | } | 1546 | } |
1522 | if (wait) { | 1547 | if (wait) { |
1523 | ret = ocfs2_wait_for_mask(&mw); | 1548 | ret = ocfs2_wait_for_mask(&mw); |
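The dlmglue change splits lockres_remove_mask_waiter() into a locking wrapper and a __-prefixed variant that asserts l_lock is already held, so __ocfs2_cluster_lock() can remove the waiter and set OCFS2_LOCK_NONBLOCK_FINISHED under a single lock acquisition. A pthread sketch of the same "__helper assumes the lock is held" idiom; all names below are illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int waiters;

/* __remove_waiter(): caller must already hold 'lock'; the kernel version
 * documents this with assert_spin_locked(). */
static int __remove_waiter(void)
{
	if (waiters == 0)
		return -1;
	waiters--;
	return 0;
}

/* remove_waiter(): takes the lock itself, then defers to the __ variant. */
static int remove_waiter(void)
{
	int ret;

	pthread_mutex_lock(&lock);
	ret = __remove_waiter();
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	waiters = 1;

	/* A caller that needs to do more work under the same lock hold: */
	pthread_mutex_lock(&lock);
	if (__remove_waiter() == 0)
		printf("removed waiter and can update flags atomically\n");
	pthread_mutex_unlock(&lock);

	printf("second removal: %d\n", remove_waiter());
	return 0;
}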
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 324dc93ac896..69fb9f75b082 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -2381,9 +2381,7 @@ out_dio: | |||
2381 | if (ret < 0) | 2381 | if (ret < 0) |
2382 | written = ret; | 2382 | written = ret; |
2383 | 2383 | ||
2384 | if (!ret && ((old_size != i_size_read(inode)) || | 2384 | if (!ret) { |
2385 | (old_clusters != OCFS2_I(inode)->ip_clusters) || | ||
2386 | has_refcount)) { | ||
2387 | ret = jbd2_journal_force_commit(osb->journal->j_journal); | 2385 | ret = jbd2_journal_force_commit(osb->journal->j_journal); |
2388 | if (ret < 0) | 2386 | if (ret < 0) |
2389 | written = ret; | 2387 | written = ret; |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 437de7f768c6..c8b25de9efbb 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -540,8 +540,7 @@ bail: | |||
540 | if (status < 0) | 540 | if (status < 0) |
541 | make_bad_inode(inode); | 541 | make_bad_inode(inode); |
542 | 542 | ||
543 | if (args && bh) | 543 | brelse(bh); |
544 | brelse(bh); | ||
545 | 544 | ||
546 | return status; | 545 | return status; |
547 | } | 546 | } |
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 74caffeeee1d..56a768d06aa6 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c | |||
@@ -904,9 +904,6 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) | |||
904 | struct buffer_head *di_bh = NULL; | 904 | struct buffer_head *di_bh = NULL; |
905 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 905 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
906 | 906 | ||
907 | if (!inode) | ||
908 | return -ENOENT; | ||
909 | |||
910 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | 907 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) |
911 | return -EROFS; | 908 | return -EROFS; |
912 | 909 | ||
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index bbec539230fd..7d6b7d090452 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -144,6 +144,12 @@ enum ocfs2_unlock_action { | |||
144 | * before the upconvert | 144 | * before the upconvert |
145 | * has completed */ | 145 | * has completed */ |
146 | 146 | ||
147 | #define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster | ||
148 | * lock has already | ||
149 | * returned, do not block | ||
150 | * dc thread from | ||
151 | * downconverting */ | ||
152 | |||
147 | struct ocfs2_lock_res_ops; | 153 | struct ocfs2_lock_res_ops; |
148 | 154 | ||
149 | typedef void (*ocfs2_lock_callback)(int status, unsigned long data); | 155 | typedef void (*ocfs2_lock_callback)(int status, unsigned long data); |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index a88b2a4fcc85..d5493e361a38 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -306,7 +306,7 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, | |||
306 | assert_spin_locked(&osb->osb_lock); | 306 | assert_spin_locked(&osb->osb_lock); |
307 | 307 | ||
308 | BUG_ON(slot_num < 0); | 308 | BUG_ON(slot_num < 0); |
309 | BUG_ON(slot_num > osb->max_slots); | 309 | BUG_ON(slot_num >= osb->max_slots); |
310 | 310 | ||
311 | if (!si->si_slots[slot_num].sl_valid) | 311 | if (!si->si_slots[slot_num].sl_valid) |
312 | return -ENOENT; | 312 | return -ENOENT; |
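The slot_map assertion BUG_ON(slot_num > osb->max_slots) still let slot_num == max_slots through even though si_slots[] has only max_slots entries; tightening it to >= catches the off-by-one. A minimal userspace illustration of the bounds check:

#include <assert.h>
#include <stdio.h>

#define MAX_SLOTS 8

static int slots[MAX_SLOTS];

static int slot_lookup(int slot_num)
{
	/* The old check was slot_num > MAX_SLOTS, which lets the
	 * out-of-bounds index MAX_SLOTS slip through; >= rejects it. */
	assert(slot_num >= 0);
	assert(slot_num < MAX_SLOTS);
	return slots[slot_num];
}

int main(void)
{
	printf("%d\n", slot_lookup(MAX_SLOTS - 1));	/* last valid slot */
	/* slot_lookup(MAX_SLOTS) would now trip the assertion instead of
	 * reading past the end of the array. */
	return 0;
}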
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0945814ddb7b..83723179e1ec 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -1629,8 +1629,9 @@ static int __init ocfs2_init(void) | |||
1629 | 1629 | ||
1630 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); | 1630 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); |
1631 | if (!ocfs2_debugfs_root) { | 1631 | if (!ocfs2_debugfs_root) { |
1632 | status = -EFAULT; | 1632 | status = -ENOMEM; |
1633 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | 1633 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); |
1634 | goto out4; | ||
1634 | } | 1635 | } |
1635 | 1636 | ||
1636 | ocfs2_set_locking_protocol(); | 1637 | ocfs2_set_locking_protocol(); |
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 016f01df3825..662f8dee149f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -1284,7 +1284,7 @@ int ocfs2_xattr_get_nolock(struct inode *inode, | |||
1284 | return -EOPNOTSUPP; | 1284 | return -EOPNOTSUPP; |
1285 | 1285 | ||
1286 | if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) | 1286 | if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) |
1287 | ret = -ENODATA; | 1287 | return -ENODATA; |
1288 | 1288 | ||
1289 | xis.inode_bh = xbs.inode_bh = di_bh; | 1289 | xis.inode_bh = xbs.inode_bh = di_bh; |
1290 | di = (struct ocfs2_dinode *)di_bh->b_data; | 1290 | di = (struct ocfs2_dinode *)di_bh->b_data; |
diff --git a/fs/proc/array.c b/fs/proc/array.c index cd3653e4f35c..bd117d065b82 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -157,20 +157,29 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, | |||
157 | struct user_namespace *user_ns = seq_user_ns(m); | 157 | struct user_namespace *user_ns = seq_user_ns(m); |
158 | struct group_info *group_info; | 158 | struct group_info *group_info; |
159 | int g; | 159 | int g; |
160 | struct fdtable *fdt = NULL; | 160 | struct task_struct *tracer; |
161 | const struct cred *cred; | 161 | const struct cred *cred; |
162 | pid_t ppid, tpid; | 162 | pid_t ppid, tpid = 0, tgid, ngid; |
163 | unsigned int max_fds = 0; | ||
163 | 164 | ||
164 | rcu_read_lock(); | 165 | rcu_read_lock(); |
165 | ppid = pid_alive(p) ? | 166 | ppid = pid_alive(p) ? |
166 | task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; | 167 | task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; |
167 | tpid = 0; | 168 | |
168 | if (pid_alive(p)) { | 169 | tracer = ptrace_parent(p); |
169 | struct task_struct *tracer = ptrace_parent(p); | 170 | if (tracer) |
170 | if (tracer) | 171 | tpid = task_pid_nr_ns(tracer, ns); |
171 | tpid = task_pid_nr_ns(tracer, ns); | 172 | |
172 | } | 173 | tgid = task_tgid_nr_ns(p, ns); |
174 | ngid = task_numa_group_id(p); | ||
173 | cred = get_task_cred(p); | 175 | cred = get_task_cred(p); |
176 | |||
177 | task_lock(p); | ||
178 | if (p->files) | ||
179 | max_fds = files_fdtable(p->files)->max_fds; | ||
180 | task_unlock(p); | ||
181 | rcu_read_unlock(); | ||
182 | |||
174 | seq_printf(m, | 183 | seq_printf(m, |
175 | "State:\t%s\n" | 184 | "State:\t%s\n" |
176 | "Tgid:\t%d\n" | 185 | "Tgid:\t%d\n" |
@@ -179,12 +188,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, | |||
179 | "PPid:\t%d\n" | 188 | "PPid:\t%d\n" |
180 | "TracerPid:\t%d\n" | 189 | "TracerPid:\t%d\n" |
181 | "Uid:\t%d\t%d\t%d\t%d\n" | 190 | "Uid:\t%d\t%d\t%d\t%d\n" |
182 | "Gid:\t%d\t%d\t%d\t%d\n", | 191 | "Gid:\t%d\t%d\t%d\t%d\n" |
192 | "FDSize:\t%d\nGroups:\t", | ||
183 | get_task_state(p), | 193 | get_task_state(p), |
184 | task_tgid_nr_ns(p, ns), | 194 | tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid, |
185 | task_numa_group_id(p), | ||
186 | pid_nr_ns(pid, ns), | ||
187 | ppid, tpid, | ||
188 | from_kuid_munged(user_ns, cred->uid), | 195 | from_kuid_munged(user_ns, cred->uid), |
189 | from_kuid_munged(user_ns, cred->euid), | 196 | from_kuid_munged(user_ns, cred->euid), |
190 | from_kuid_munged(user_ns, cred->suid), | 197 | from_kuid_munged(user_ns, cred->suid), |
@@ -192,20 +199,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, | |||
192 | from_kgid_munged(user_ns, cred->gid), | 199 | from_kgid_munged(user_ns, cred->gid), |
193 | from_kgid_munged(user_ns, cred->egid), | 200 | from_kgid_munged(user_ns, cred->egid), |
194 | from_kgid_munged(user_ns, cred->sgid), | 201 | from_kgid_munged(user_ns, cred->sgid), |
195 | from_kgid_munged(user_ns, cred->fsgid)); | 202 | from_kgid_munged(user_ns, cred->fsgid), |
196 | 203 | max_fds); | |
197 | task_lock(p); | ||
198 | if (p->files) | ||
199 | fdt = files_fdtable(p->files); | ||
200 | seq_printf(m, | ||
201 | "FDSize:\t%d\n" | ||
202 | "Groups:\t", | ||
203 | fdt ? fdt->max_fds : 0); | ||
204 | rcu_read_unlock(); | ||
205 | 204 | ||
206 | group_info = cred->group_info; | 205 | group_info = cred->group_info; |
207 | task_unlock(p); | ||
208 | |||
209 | for (g = 0; g < group_info->ngroups; g++) | 206 | for (g = 0; g < group_info->ngroups; g++) |
210 | seq_printf(m, "%d ", | 207 | seq_printf(m, "%d ", |
211 | from_kgid_munged(user_ns, GROUP_AT(group_info, g))); | 208 | from_kgid_munged(user_ns, GROUP_AT(group_info, g))); |
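The task_state() rework gathers ppid, tpid, tgid, ngid, the cred reference and max_fds first, drops the task lock and the RCU read lock, and only then calls seq_printf(), folding FDSize into the same header. A hedged pthread sketch of that "snapshot under the lock, format after unlocking" shape; struct and field names are illustrative:

#include <pthread.h>
#include <stdio.h>

struct task {
	pthread_mutex_t lock;
	int ppid, tpid, max_fds;
};

/* Copy everything needed while holding the lock, then format after
 * releasing it, so the slow output never runs inside the critical
 * section. */
static void print_status(struct task *t)
{
	int ppid, tpid, max_fds;

	pthread_mutex_lock(&t->lock);
	ppid = t->ppid;
	tpid = t->tpid;
	max_fds = t->max_fds;
	pthread_mutex_unlock(&t->lock);

	printf("PPid:\t%d\nTracerPid:\t%d\nFDSize:\t%d\n",
	       ppid, tpid, max_fds);
}

int main(void)
{
	struct task t = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.ppid = 1, .tpid = 0, .max_fds = 256,
	};

	print_status(&t);
	return 0;
}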
diff --git a/fs/proc/base.c b/fs/proc/base.c index 64891f3e41bd..590aeda5af12 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -2618,6 +2618,9 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) | |||
2618 | dput(dentry); | 2618 | dput(dentry); |
2619 | } | 2619 | } |
2620 | 2620 | ||
2621 | if (pid == tgid) | ||
2622 | return; | ||
2623 | |||
2621 | name.name = buf; | 2624 | name.name = buf; |
2622 | name.len = snprintf(buf, sizeof(buf), "%d", tgid); | 2625 | name.len = snprintf(buf, sizeof(buf), "%d", tgid); |
2623 | leader = d_hash_and_lookup(mnt->mnt_root, &name); | 2626 | leader = d_hash_and_lookup(mnt->mnt_root, &name); |
diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 317b72641ebf..7fea13229f33 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c | |||
@@ -31,9 +31,73 @@ static DEFINE_SPINLOCK(proc_subdir_lock); | |||
31 | 31 | ||
32 | static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) | 32 | static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) |
33 | { | 33 | { |
34 | if (de->namelen != len) | 34 | if (len < de->namelen) |
35 | return 0; | 35 | return -1; |
36 | return !memcmp(name, de->name, len); | 36 | if (len > de->namelen) |
37 | return 1; | ||
38 | |||
39 | return memcmp(name, de->name, len); | ||
40 | } | ||
41 | |||
42 | static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) | ||
43 | { | ||
44 | return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, | ||
45 | subdir_node); | ||
46 | } | ||
47 | |||
48 | static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) | ||
49 | { | ||
50 | return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry, | ||
51 | subdir_node); | ||
52 | } | ||
53 | |||
54 | static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, | ||
55 | const char *name, | ||
56 | unsigned int len) | ||
57 | { | ||
58 | struct rb_node *node = dir->subdir.rb_node; | ||
59 | |||
60 | while (node) { | ||
61 | struct proc_dir_entry *de = container_of(node, | ||
62 | struct proc_dir_entry, | ||
63 | subdir_node); | ||
64 | int result = proc_match(len, name, de); | ||
65 | |||
66 | if (result < 0) | ||
67 | node = node->rb_left; | ||
68 | else if (result > 0) | ||
69 | node = node->rb_right; | ||
70 | else | ||
71 | return de; | ||
72 | } | ||
73 | return NULL; | ||
74 | } | ||
75 | |||
76 | static bool pde_subdir_insert(struct proc_dir_entry *dir, | ||
77 | struct proc_dir_entry *de) | ||
78 | { | ||
79 | struct rb_root *root = &dir->subdir; | ||
80 | struct rb_node **new = &root->rb_node, *parent = NULL; | ||
81 | |||
82 | /* Figure out where to put new node */ | ||
83 | while (*new) { | ||
84 | struct proc_dir_entry *this = | ||
85 | container_of(*new, struct proc_dir_entry, subdir_node); | ||
86 | int result = proc_match(de->namelen, de->name, this); | ||
87 | |||
88 | parent = *new; | ||
89 | if (result < 0) | ||
90 | new = &(*new)->rb_left; | ||
91 | else if (result > 0) | ||
92 | new = &(*new)->rb_right; | ||
93 | else | ||
94 | return false; | ||
95 | } | ||
96 | |||
97 | /* Add new node and rebalance tree. */ | ||
98 | rb_link_node(&de->subdir_node, parent, new); | ||
99 | rb_insert_color(&de->subdir_node, root); | ||
100 | return true; | ||
37 | } | 101 | } |
38 | 102 | ||
39 | static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) | 103 | static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) |
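In the hunk above, proc_match() changes from a boolean "same name?" test into a three-way comparison ordered first by name length and then by memcmp(); pde_subdir_find() and pde_subdir_insert() rely on that ordering to walk the rb-tree that replaces the old subdir linked list. A userspace sketch of the same keyed lookup, with a sorted array and bsearch() standing in for the rb-tree (entry names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
	const char *name;
	unsigned int namelen;
};

/* Same ordering as the new proc_match(): shorter names sort first,
 * equal-length names are ordered by memcmp(). */
static int entry_cmp(const void *key, const void *elem)
{
	const struct entry *k = key, *e = elem;

	if (k->namelen < e->namelen)
		return -1;
	if (k->namelen > e->namelen)
		return 1;
	return memcmp(k->name, e->name, k->namelen);
}

int main(void)
{
	/* Must already be sorted by the comparator above. */
	struct entry dir[] = {
		{ "irq", 3 }, { "net", 3 }, { "sys", 3 },
		{ "stat", 4 }, { "driver", 6 },
	};
	struct entry key = { "net", 3 };
	struct entry *found;

	found = bsearch(&key, dir, sizeof(dir) / sizeof(dir[0]),
			sizeof(dir[0]), entry_cmp);
	printf("%s\n", found ? found->name : "not found");
	return 0;
}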
@@ -92,10 +156,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, | |||
92 | break; | 156 | break; |
93 | 157 | ||
94 | len = next - cp; | 158 | len = next - cp; |
95 | for (de = de->subdir; de ; de = de->next) { | 159 | de = pde_subdir_find(de, cp, len); |
96 | if (proc_match(len, cp, de)) | ||
97 | break; | ||
98 | } | ||
99 | if (!de) { | 160 | if (!de) { |
100 | WARN(1, "name '%s'\n", name); | 161 | WARN(1, "name '%s'\n", name); |
101 | return -ENOENT; | 162 | return -ENOENT; |
@@ -183,19 +244,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, | |||
183 | struct inode *inode; | 244 | struct inode *inode; |
184 | 245 | ||
185 | spin_lock(&proc_subdir_lock); | 246 | spin_lock(&proc_subdir_lock); |
186 | for (de = de->subdir; de ; de = de->next) { | 247 | de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); |
187 | if (de->namelen != dentry->d_name.len) | 248 | if (de) { |
188 | continue; | 249 | pde_get(de); |
189 | if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { | 250 | spin_unlock(&proc_subdir_lock); |
190 | pde_get(de); | 251 | inode = proc_get_inode(dir->i_sb, de); |
191 | spin_unlock(&proc_subdir_lock); | 252 | if (!inode) |
192 | inode = proc_get_inode(dir->i_sb, de); | 253 | return ERR_PTR(-ENOMEM); |
193 | if (!inode) | 254 | d_set_d_op(dentry, &simple_dentry_operations); |
194 | return ERR_PTR(-ENOMEM); | 255 | d_add(dentry, inode); |
195 | d_set_d_op(dentry, &simple_dentry_operations); | 256 | return NULL; |
196 | d_add(dentry, inode); | ||
197 | return NULL; | ||
198 | } | ||
199 | } | 257 | } |
200 | spin_unlock(&proc_subdir_lock); | 258 | spin_unlock(&proc_subdir_lock); |
201 | return ERR_PTR(-ENOENT); | 259 | return ERR_PTR(-ENOENT); |
@@ -225,7 +283,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, | |||
225 | return 0; | 283 | return 0; |
226 | 284 | ||
227 | spin_lock(&proc_subdir_lock); | 285 | spin_lock(&proc_subdir_lock); |
228 | de = de->subdir; | 286 | de = pde_subdir_first(de); |
229 | i = ctx->pos - 2; | 287 | i = ctx->pos - 2; |
230 | for (;;) { | 288 | for (;;) { |
231 | if (!de) { | 289 | if (!de) { |
@@ -234,7 +292,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, | |||
234 | } | 292 | } |
235 | if (!i) | 293 | if (!i) |
236 | break; | 294 | break; |
237 | de = de->next; | 295 | de = pde_subdir_next(de); |
238 | i--; | 296 | i--; |
239 | } | 297 | } |
240 | 298 | ||
@@ -249,7 +307,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, | |||
249 | } | 307 | } |
250 | spin_lock(&proc_subdir_lock); | 308 | spin_lock(&proc_subdir_lock); |
251 | ctx->pos++; | 309 | ctx->pos++; |
252 | next = de->next; | 310 | next = pde_subdir_next(de); |
253 | pde_put(de); | 311 | pde_put(de); |
254 | de = next; | 312 | de = next; |
255 | } while (de); | 313 | } while (de); |
@@ -286,9 +344,8 @@ static const struct inode_operations proc_dir_inode_operations = { | |||
286 | 344 | ||
287 | static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) | 345 | static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) |
288 | { | 346 | { |
289 | struct proc_dir_entry *tmp; | ||
290 | int ret; | 347 | int ret; |
291 | 348 | ||
292 | ret = proc_alloc_inum(&dp->low_ino); | 349 | ret = proc_alloc_inum(&dp->low_ino); |
293 | if (ret) | 350 | if (ret) |
294 | return ret; | 351 | return ret; |
@@ -304,21 +361,21 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp | |||
304 | dp->proc_iops = &proc_file_inode_operations; | 361 | dp->proc_iops = &proc_file_inode_operations; |
305 | } else { | 362 | } else { |
306 | WARN_ON(1); | 363 | WARN_ON(1); |
364 | proc_free_inum(dp->low_ino); | ||
307 | return -EINVAL; | 365 | return -EINVAL; |
308 | } | 366 | } |
309 | 367 | ||
310 | spin_lock(&proc_subdir_lock); | 368 | spin_lock(&proc_subdir_lock); |
311 | |||
312 | for (tmp = dir->subdir; tmp; tmp = tmp->next) | ||
313 | if (strcmp(tmp->name, dp->name) == 0) { | ||
314 | WARN(1, "proc_dir_entry '%s/%s' already registered\n", | ||
315 | dir->name, dp->name); | ||
316 | break; | ||
317 | } | ||
318 | |||
319 | dp->next = dir->subdir; | ||
320 | dp->parent = dir; | 369 | dp->parent = dir; |
321 | dir->subdir = dp; | 370 | if (pde_subdir_insert(dir, dp) == false) { |
371 | WARN(1, "proc_dir_entry '%s/%s' already registered\n", | ||
372 | dir->name, dp->name); | ||
373 | spin_unlock(&proc_subdir_lock); | ||
374 | if (S_ISDIR(dp->mode)) | ||
375 | dir->nlink--; | ||
376 | proc_free_inum(dp->low_ino); | ||
377 | return -EEXIST; | ||
378 | } | ||
322 | spin_unlock(&proc_subdir_lock); | 379 | spin_unlock(&proc_subdir_lock); |
323 | 380 | ||
324 | return 0; | 381 | return 0; |
@@ -354,6 +411,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, | |||
354 | ent->namelen = qstr.len; | 411 | ent->namelen = qstr.len; |
355 | ent->mode = mode; | 412 | ent->mode = mode; |
356 | ent->nlink = nlink; | 413 | ent->nlink = nlink; |
414 | ent->subdir = RB_ROOT; | ||
357 | atomic_set(&ent->count, 1); | 415 | atomic_set(&ent->count, 1); |
358 | spin_lock_init(&ent->pde_unload_lock); | 416 | spin_lock_init(&ent->pde_unload_lock); |
359 | INIT_LIST_HEAD(&ent->pde_openers); | 417 | INIT_LIST_HEAD(&ent->pde_openers); |
@@ -485,7 +543,6 @@ void pde_put(struct proc_dir_entry *pde) | |||
485 | */ | 543 | */ |
486 | void remove_proc_entry(const char *name, struct proc_dir_entry *parent) | 544 | void remove_proc_entry(const char *name, struct proc_dir_entry *parent) |
487 | { | 545 | { |
488 | struct proc_dir_entry **p; | ||
489 | struct proc_dir_entry *de = NULL; | 546 | struct proc_dir_entry *de = NULL; |
490 | const char *fn = name; | 547 | const char *fn = name; |
491 | unsigned int len; | 548 | unsigned int len; |
@@ -497,14 +554,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) | |||
497 | } | 554 | } |
498 | len = strlen(fn); | 555 | len = strlen(fn); |
499 | 556 | ||
500 | for (p = &parent->subdir; *p; p=&(*p)->next ) { | 557 | de = pde_subdir_find(parent, fn, len); |
501 | if (proc_match(len, fn, *p)) { | 558 | if (de) |
502 | de = *p; | 559 | rb_erase(&de->subdir_node, &parent->subdir); |
503 | *p = de->next; | ||
504 | de->next = NULL; | ||
505 | break; | ||
506 | } | ||
507 | } | ||
508 | spin_unlock(&proc_subdir_lock); | 560 | spin_unlock(&proc_subdir_lock); |
509 | if (!de) { | 561 | if (!de) { |
510 | WARN(1, "name '%s'\n", name); | 562 | WARN(1, "name '%s'\n", name); |
@@ -516,16 +568,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) | |||
516 | if (S_ISDIR(de->mode)) | 568 | if (S_ISDIR(de->mode)) |
517 | parent->nlink--; | 569 | parent->nlink--; |
518 | de->nlink = 0; | 570 | de->nlink = 0; |
519 | WARN(de->subdir, "%s: removing non-empty directory " | 571 | WARN(pde_subdir_first(de), |
520 | "'%s/%s', leaking at least '%s'\n", __func__, | 572 | "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", |
521 | de->parent->name, de->name, de->subdir->name); | 573 | __func__, de->parent->name, de->name, pde_subdir_first(de)->name); |
522 | pde_put(de); | 574 | pde_put(de); |
523 | } | 575 | } |
524 | EXPORT_SYMBOL(remove_proc_entry); | 576 | EXPORT_SYMBOL(remove_proc_entry); |
525 | 577 | ||
526 | int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) | 578 | int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) |
527 | { | 579 | { |
528 | struct proc_dir_entry **p; | ||
529 | struct proc_dir_entry *root = NULL, *de, *next; | 580 | struct proc_dir_entry *root = NULL, *de, *next; |
530 | const char *fn = name; | 581 | const char *fn = name; |
531 | unsigned int len; | 582 | unsigned int len; |
@@ -537,24 +588,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) | |||
537 | } | 588 | } |
538 | len = strlen(fn); | 589 | len = strlen(fn); |
539 | 590 | ||
540 | for (p = &parent->subdir; *p; p=&(*p)->next ) { | 591 | root = pde_subdir_find(parent, fn, len); |
541 | if (proc_match(len, fn, *p)) { | ||
542 | root = *p; | ||
543 | *p = root->next; | ||
544 | root->next = NULL; | ||
545 | break; | ||
546 | } | ||
547 | } | ||
548 | if (!root) { | 592 | if (!root) { |
549 | spin_unlock(&proc_subdir_lock); | 593 | spin_unlock(&proc_subdir_lock); |
550 | return -ENOENT; | 594 | return -ENOENT; |
551 | } | 595 | } |
596 | rb_erase(&root->subdir_node, &parent->subdir); | ||
597 | |||
552 | de = root; | 598 | de = root; |
553 | while (1) { | 599 | while (1) { |
554 | next = de->subdir; | 600 | next = pde_subdir_first(de); |
555 | if (next) { | 601 | if (next) { |
556 | de->subdir = next->next; | 602 | rb_erase(&next->subdir_node, &de->subdir); |
557 | next->next = NULL; | ||
558 | de = next; | 603 | de = next; |
559 | continue; | 604 | continue; |
560 | } | 605 | } |
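
The linked-list scan of a directory's children is replaced above by an rbtree keyed by proc_match(): a negative comparison descends left, a positive one descends right, and zero means the name is already present (pde_subdir_insert() then refuses the duplicate). The user-space sketch below models only that comparator contract on a plain, unbalanced binary search tree; it deliberately leaves out the rb_link_node()/rb_insert_color() rebalancing step, and the names are illustrative.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct entry {
            const char *name;
            struct entry *left, *right;     /* stands in for rb_left/rb_right */
    };

    /* Same contract as proc_match(): <0 go left, >0 go right, 0 found. */
    static int match(const char *name, const struct entry *e)
    {
            return strcmp(name, e->name);
    }

    static struct entry *subdir_find(struct entry *root, const char *name)
    {
            while (root) {
                    int result = match(name, root);

                    if (result < 0)
                            root = root->left;
                    else if (result > 0)
                            root = root->right;
                    else
                            return root;
            }
            return NULL;
    }

    /* Returns false on a duplicate name, as pde_subdir_insert() does. */
    static bool subdir_insert(struct entry **link, struct entry *new)
    {
            while (*link) {
                    int result = match(new->name, *link);

                    if (result < 0)
                            link = &(*link)->left;
                    else if (result > 0)
                            link = &(*link)->right;
                    else
                            return false;
            }
            *link = new;
            return true;
    }

    int main(void)
    {
            struct entry *root = NULL;
            struct entry a = { "cpuinfo" }, b = { "meminfo" }, dup = { "cpuinfo" };

            subdir_insert(&root, &a);
            subdir_insert(&root, &b);
            printf("duplicate insert: %d\n", subdir_insert(&root, &dup));  /* 0 */
            printf("find: %s\n", subdir_find(root, "meminfo")->name);
            return 0;
    }
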
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa7a0ee182e1..7fb1a4869fd0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
@@ -24,10 +24,9 @@ struct mempolicy; | |||
24 | * tree) of these proc_dir_entries, so that we can dynamically | 24 | * tree) of these proc_dir_entries, so that we can dynamically |
25 | * add new files to /proc. | 25 | * add new files to /proc. |
26 | * | 26 | * |
27 | * The "next" pointer creates a linked list of one /proc directory, | 27 | * parent/subdir are used for the directory structure (every /proc file has a |
28 | * while parent/subdir create the directory structure (every | 28 | * parent, but "subdir" is empty for all non-directory entries). |
29 | * /proc file has a parent, but "subdir" is NULL for all | 29 | * subdir_node is used to build the rb tree "subdir" of the parent. |
30 | * non-directory entries). | ||
31 | */ | 30 | */ |
32 | struct proc_dir_entry { | 31 | struct proc_dir_entry { |
33 | unsigned int low_ino; | 32 | unsigned int low_ino; |
@@ -38,7 +37,9 @@ struct proc_dir_entry { | |||
38 | loff_t size; | 37 | loff_t size; |
39 | const struct inode_operations *proc_iops; | 38 | const struct inode_operations *proc_iops; |
40 | const struct file_operations *proc_fops; | 39 | const struct file_operations *proc_fops; |
41 | struct proc_dir_entry *next, *parent, *subdir; | 40 | struct proc_dir_entry *parent; |
41 | struct rb_root subdir; | ||
42 | struct rb_node subdir_node; | ||
42 | void *data; | 43 | void *data; |
43 | atomic_t count; /* use count */ | 44 | atomic_t count; /* use count */ |
44 | atomic_t in_use; /* number of callers into module in progress; */ | 45 | atomic_t in_use; /* number of callers into module in progress; */ |
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a63af3e0a612..1bde894bc624 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c | |||
@@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net) | |||
192 | if (!netd) | 192 | if (!netd) |
193 | goto out; | 193 | goto out; |
194 | 194 | ||
195 | netd->subdir = RB_ROOT; | ||
195 | netd->data = net; | 196 | netd->data = net; |
196 | netd->nlink = 2; | 197 | netd->nlink = 2; |
197 | netd->namelen = 3; | 198 | netd->namelen = 3; |
diff --git a/fs/proc/root.c b/fs/proc/root.c index 094e44d4a6be..e74ac9f1a2c0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c | |||
@@ -251,6 +251,7 @@ struct proc_dir_entry proc_root = { | |||
251 | .proc_iops = &proc_root_inode_operations, | 251 | .proc_iops = &proc_root_inode_operations, |
252 | .proc_fops = &proc_root_operations, | 252 | .proc_fops = &proc_root_operations, |
253 | .parent = &proc_root, | 253 | .parent = &proc_root, |
254 | .subdir = RB_ROOT, | ||
254 | .name = "/proc", | 255 | .name = "/proc", |
255 | }; | 256 | }; |
256 | 257 | ||
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f6734c6b66a6..246eae84b13b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -447,58 +447,91 @@ struct mem_size_stats { | |||
447 | u64 pss; | 447 | u64 pss; |
448 | }; | 448 | }; |
449 | 449 | ||
450 | static void smaps_account(struct mem_size_stats *mss, struct page *page, | ||
451 | unsigned long size, bool young, bool dirty) | ||
452 | { | ||
453 | int mapcount; | ||
454 | |||
455 | if (PageAnon(page)) | ||
456 | mss->anonymous += size; | ||
450 | 457 | ||
451 | static void smaps_pte_entry(pte_t ptent, unsigned long addr, | 458 | mss->resident += size; |
452 | unsigned long ptent_size, struct mm_walk *walk) | 459 | /* Accumulate the size in pages that have been accessed. */ |
460 | if (young || PageReferenced(page)) | ||
461 | mss->referenced += size; | ||
462 | mapcount = page_mapcount(page); | ||
463 | if (mapcount >= 2) { | ||
464 | u64 pss_delta; | ||
465 | |||
466 | if (dirty || PageDirty(page)) | ||
467 | mss->shared_dirty += size; | ||
468 | else | ||
469 | mss->shared_clean += size; | ||
470 | pss_delta = (u64)size << PSS_SHIFT; | ||
471 | do_div(pss_delta, mapcount); | ||
472 | mss->pss += pss_delta; | ||
473 | } else { | ||
474 | if (dirty || PageDirty(page)) | ||
475 | mss->private_dirty += size; | ||
476 | else | ||
477 | mss->private_clean += size; | ||
478 | mss->pss += (u64)size << PSS_SHIFT; | ||
479 | } | ||
480 | } | ||
481 | |||
482 | static void smaps_pte_entry(pte_t *pte, unsigned long addr, | ||
483 | struct mm_walk *walk) | ||
453 | { | 484 | { |
454 | struct mem_size_stats *mss = walk->private; | 485 | struct mem_size_stats *mss = walk->private; |
455 | struct vm_area_struct *vma = mss->vma; | 486 | struct vm_area_struct *vma = mss->vma; |
456 | pgoff_t pgoff = linear_page_index(vma, addr); | 487 | pgoff_t pgoff = linear_page_index(vma, addr); |
457 | struct page *page = NULL; | 488 | struct page *page = NULL; |
458 | int mapcount; | ||
459 | 489 | ||
460 | if (pte_present(ptent)) { | 490 | if (pte_present(*pte)) { |
461 | page = vm_normal_page(vma, addr, ptent); | 491 | page = vm_normal_page(vma, addr, *pte); |
462 | } else if (is_swap_pte(ptent)) { | 492 | } else if (is_swap_pte(*pte)) { |
463 | swp_entry_t swpent = pte_to_swp_entry(ptent); | 493 | swp_entry_t swpent = pte_to_swp_entry(*pte); |
464 | 494 | ||
465 | if (!non_swap_entry(swpent)) | 495 | if (!non_swap_entry(swpent)) |
466 | mss->swap += ptent_size; | 496 | mss->swap += PAGE_SIZE; |
467 | else if (is_migration_entry(swpent)) | 497 | else if (is_migration_entry(swpent)) |
468 | page = migration_entry_to_page(swpent); | 498 | page = migration_entry_to_page(swpent); |
469 | } else if (pte_file(ptent)) { | 499 | } else if (pte_file(*pte)) { |
470 | if (pte_to_pgoff(ptent) != pgoff) | 500 | if (pte_to_pgoff(*pte) != pgoff) |
471 | mss->nonlinear += ptent_size; | 501 | mss->nonlinear += PAGE_SIZE; |
472 | } | 502 | } |
473 | 503 | ||
474 | if (!page) | 504 | if (!page) |
475 | return; | 505 | return; |
476 | 506 | ||
477 | if (PageAnon(page)) | ||
478 | mss->anonymous += ptent_size; | ||
479 | |||
480 | if (page->index != pgoff) | 507 | if (page->index != pgoff) |
481 | mss->nonlinear += ptent_size; | 508 | mss->nonlinear += PAGE_SIZE; |
482 | 509 | ||
483 | mss->resident += ptent_size; | 510 | smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); |
484 | /* Accumulate the size in pages that have been accessed. */ | 511 | } |
485 | if (pte_young(ptent) || PageReferenced(page)) | 512 | |
486 | mss->referenced += ptent_size; | 513 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
487 | mapcount = page_mapcount(page); | 514 | static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, |
488 | if (mapcount >= 2) { | 515 | struct mm_walk *walk) |
489 | if (pte_dirty(ptent) || PageDirty(page)) | 516 | { |
490 | mss->shared_dirty += ptent_size; | 517 | struct mem_size_stats *mss = walk->private; |
491 | else | 518 | struct vm_area_struct *vma = mss->vma; |
492 | mss->shared_clean += ptent_size; | 519 | struct page *page; |
493 | mss->pss += (ptent_size << PSS_SHIFT) / mapcount; | 520 | |
494 | } else { | 521 | /* FOLL_DUMP will return -EFAULT on huge zero page */ |
495 | if (pte_dirty(ptent) || PageDirty(page)) | 522 | page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); |
496 | mss->private_dirty += ptent_size; | 523 | if (IS_ERR_OR_NULL(page)) |
497 | else | 524 | return; |
498 | mss->private_clean += ptent_size; | 525 | mss->anonymous_thp += HPAGE_PMD_SIZE; |
499 | mss->pss += (ptent_size << PSS_SHIFT); | 526 | smaps_account(mss, page, HPAGE_PMD_SIZE, |
500 | } | 527 | pmd_young(*pmd), pmd_dirty(*pmd)); |
501 | } | 528 | } |
529 | #else | ||
530 | static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, | ||
531 | struct mm_walk *walk) | ||
532 | { | ||
533 | } | ||
534 | #endif | ||
502 | 535 | ||
503 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 536 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
504 | struct mm_walk *walk) | 537 | struct mm_walk *walk) |
@@ -509,9 +542,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
509 | spinlock_t *ptl; | 542 | spinlock_t *ptl; |
510 | 543 | ||
511 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 544 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
512 | smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); | 545 | smaps_pmd_entry(pmd, addr, walk); |
513 | spin_unlock(ptl); | 546 | spin_unlock(ptl); |
514 | mss->anonymous_thp += HPAGE_PMD_SIZE; | ||
515 | return 0; | 547 | return 0; |
516 | } | 548 | } |
517 | 549 | ||
@@ -524,7 +556,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
524 | */ | 556 | */ |
525 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 557 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
526 | for (; addr != end; pte++, addr += PAGE_SIZE) | 558 | for (; addr != end; pte++, addr += PAGE_SIZE) |
527 | smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); | 559 | smaps_pte_entry(pte, addr, walk); |
528 | pte_unmap_unlock(pte - 1, ptl); | 560 | pte_unmap_unlock(pte - 1, ptl); |
529 | cond_resched(); | 561 | cond_resched(); |
530 | return 0; | 562 | return 0; |
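
smaps_account() above centralizes the accounting previously open-coded in smaps_pte_entry() so that the new smaps_pmd_entry() THP path can share it. The one non-obvious part is PSS: every mapped page contributes size/mapcount, accumulated in fixed point and shifted back down when /proc/PID/smaps is printed. Below is a stand-alone check of that arithmetic, assuming the PSS_SHIFT of 12 used elsewhere in task_mmu.c and 4 KiB pages; the sample mapcounts are made up.

    #include <stdint.h>
    #include <stdio.h>

    #define PSS_SHIFT 12            /* fixed-point fraction bits, as in task_mmu.c */
    #define PAGE_SIZE 4096UL

    int main(void)
    {
            /* three resident pages: private, shared by 2, shared by 4 */
            unsigned int mapcount[] = { 1, 2, 4 };
            uint64_t pss = 0;
            unsigned int i;

            for (i = 0; i < 3; i++)
                    pss += ((uint64_t)PAGE_SIZE << PSS_SHIFT) / mapcount[i];

            /* RSS is 12 KiB, but PSS comes out as 4 + 2 + 1 = 7 KiB */
            printf("pss = %llu KiB\n",
                   (unsigned long long)(pss >> PSS_SHIFT) / 1024);
            return 0;
    }
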
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 27b0c9105da5..641e56494a92 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -113,6 +113,19 @@ static inline void css_get(struct cgroup_subsys_state *css) | |||
113 | } | 113 | } |
114 | 114 | ||
115 | /** | 115 | /** |
116 | * css_get_many - obtain references on the specified css | ||
117 | * @css: target css | ||
118 | * @n: number of references to get | ||
119 | * | ||
120 | * The caller must already have a reference. | ||
121 | */ | ||
122 | static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n) | ||
123 | { | ||
124 | if (!(css->flags & CSS_NO_REF)) | ||
125 | percpu_ref_get_many(&css->refcnt, n); | ||
126 | } | ||
127 | |||
128 | /** | ||
116 | * css_tryget - try to obtain a reference on the specified css | 129 | * css_tryget - try to obtain a reference on the specified css |
117 | * @css: target css | 130 | * @css: target css |
118 | * | 131 | * |
@@ -159,6 +172,19 @@ static inline void css_put(struct cgroup_subsys_state *css) | |||
159 | percpu_ref_put(&css->refcnt); | 172 | percpu_ref_put(&css->refcnt); |
160 | } | 173 | } |
161 | 174 | ||
175 | /** | ||
176 | * css_put_many - put css references | ||
177 | * @css: target css | ||
178 | * @n: number of references to put | ||
179 | * | ||
180 | * Put references obtained via css_get() and css_tryget_online(). | ||
181 | */ | ||
182 | static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) | ||
183 | { | ||
184 | if (!(css->flags & CSS_NO_REF)) | ||
185 | percpu_ref_put_many(&css->refcnt, n); | ||
186 | } | ||
187 | |||
162 | /* bits in struct cgroup flags field */ | 188 | /* bits in struct cgroup flags field */ |
163 | enum { | 189 | enum { |
164 | /* Control Group requires release notifications to userspace */ | 190 | /* Control Group requires release notifications to userspace */ |
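
css_get_many() and css_put_many() exist so that code which charges or uncharges a whole batch of pages can adjust the css reference count once rather than once per page. The fragment below is only an illustrative in-kernel usage sketch; the helper name and the idea of one reference per uncharged page are assumptions, not code from this series.

    #include <linux/cgroup.h>

    /*
     * Hypothetical helper: after uncharging nr_pages pages that each
     * pinned @css with one reference, drop all of those references in
     * a single operation instead of calling css_put() nr_pages times.
     */
    static void example_uncharge_batch(struct cgroup_subsys_state *css,
                                       unsigned int nr_pages)
    {
            if (nr_pages)
                    css_put_many(css, nr_pages);
    }
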
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 60bdf8dc02a3..3238ffa33f68 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -33,10 +33,11 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); | |||
33 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, | 33 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, |
34 | int order, gfp_t gfp_mask, nodemask_t *mask, | 34 | int order, gfp_t gfp_mask, nodemask_t *mask, |
35 | enum migrate_mode mode, int *contended, | 35 | enum migrate_mode mode, int *contended, |
36 | struct zone **candidate_zone); | 36 | int alloc_flags, int classzone_idx); |
37 | extern void compact_pgdat(pg_data_t *pgdat, int order); | 37 | extern void compact_pgdat(pg_data_t *pgdat, int order); |
38 | extern void reset_isolation_suitable(pg_data_t *pgdat); | 38 | extern void reset_isolation_suitable(pg_data_t *pgdat); |
39 | extern unsigned long compaction_suitable(struct zone *zone, int order); | 39 | extern unsigned long compaction_suitable(struct zone *zone, int order, |
40 | int alloc_flags, int classzone_idx); | ||
40 | 41 | ||
41 | /* Do not skip compaction more than 64 times */ | 42 | /* Do not skip compaction more than 64 times */ |
42 | #define COMPACT_MAX_DEFER_SHIFT 6 | 43 | #define COMPACT_MAX_DEFER_SHIFT 6 |
@@ -103,7 +104,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) | |||
103 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, | 104 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, |
104 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 105 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
105 | enum migrate_mode mode, int *contended, | 106 | enum migrate_mode mode, int *contended, |
106 | struct zone **candidate_zone) | 107 | int alloc_flags, int classzone_idx) |
107 | { | 108 | { |
108 | return COMPACT_CONTINUE; | 109 | return COMPACT_CONTINUE; |
109 | } | 110 | } |
@@ -116,7 +117,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) | |||
116 | { | 117 | { |
117 | } | 118 | } |
118 | 119 | ||
119 | static inline unsigned long compaction_suitable(struct zone *zone, int order) | 120 | static inline unsigned long compaction_suitable(struct zone *zone, int order, |
121 | int alloc_flags, int classzone_idx) | ||
120 | { | 122 | { |
121 | return COMPACT_SKIPPED; | 123 | return COMPACT_SKIPPED; |
122 | } | 124 | } |
diff --git a/include/linux/file.h b/include/linux/file.h index 4d69123377a2..f87d30882a24 100644 --- a/include/linux/file.h +++ b/include/linux/file.h | |||
@@ -66,7 +66,6 @@ extern void set_close_on_exec(unsigned int fd, int flag); | |||
66 | extern bool get_close_on_exec(unsigned int fd); | 66 | extern bool get_close_on_exec(unsigned int fd); |
67 | extern void put_filp(struct file *); | 67 | extern void put_filp(struct file *); |
68 | extern int get_unused_fd_flags(unsigned flags); | 68 | extern int get_unused_fd_flags(unsigned flags); |
69 | #define get_unused_fd() get_unused_fd_flags(0) | ||
70 | extern void put_unused_fd(unsigned int fd); | 69 | extern void put_unused_fd(unsigned int fd); |
71 | 70 | ||
72 | extern void fd_install(unsigned int fd, struct file *file); | 71 | extern void fd_install(unsigned int fd, struct file *file); |
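
With the get_unused_fd() macro gone, callers spell out get_unused_fd_flags(0), or pass O_CLOEXEC where close-on-exec is wanted. A minimal sketch of the usual allocate-then-install pattern, assuming an already-opened struct file (the function name is hypothetical):

    #include <linux/file.h>
    #include <linux/fcntl.h>

    /* Illustrative only: hand an already-opened file to userspace as an fd. */
    static int example_install_file(struct file *file)
    {
            int fd = get_unused_fd_flags(O_CLOEXEC);

            if (fd < 0)
                    return fd;              /* no free descriptor available */
            fd_install(fd, file);           /* publish file at fd */
            return fd;
    }
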
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 41b30fd4d041..07d2699cdb51 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -381,8 +381,8 @@ extern void free_kmem_pages(unsigned long addr, unsigned int order); | |||
381 | 381 | ||
382 | void page_alloc_init(void); | 382 | void page_alloc_init(void); |
383 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); | 383 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); |
384 | void drain_all_pages(void); | 384 | void drain_all_pages(struct zone *zone); |
385 | void drain_local_pages(void *dummy); | 385 | void drain_local_pages(struct zone *zone); |
386 | 386 | ||
387 | /* | 387 | /* |
388 | * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what | 388 | * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6e6d338641fe..cdd149ca5cc0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -311,7 +311,8 @@ static inline struct hstate *hstate_sizelog(int page_size_log) | |||
311 | { | 311 | { |
312 | if (!page_size_log) | 312 | if (!page_size_log) |
313 | return &default_hstate; | 313 | return &default_hstate; |
314 | return size_to_hstate(1 << page_size_log); | 314 | |
315 | return size_to_hstate(1UL << page_size_log); | ||
315 | } | 316 | } |
316 | 317 | ||
317 | static inline struct hstate *hstate_vma(struct vm_area_struct *vma) | 318 | static inline struct hstate *hstate_vma(struct vm_area_struct *vma) |
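
The hugetlb.h change above matters once a huge page size needs 32 or more bits: shifting the plain int constant 1 by page_size_log is then undefined and could never represent the size, whereas 1UL is wide enough on a 64-bit kernel. The small stand-alone program below only illustrates the widths involved on an LP64 build; the 16 GiB page size is an example of such a large huge page, not something taken from this patch.

    #include <stdio.h>

    int main(void)
    {
            unsigned int page_size_log = 34;        /* a 16 GiB huge page */

            /*
             * (1 << page_size_log) would shift a 32-bit int past its width,
             * which is undefined behaviour and cannot hold 16 GiB anyway;
             * 1UL << page_size_log is fine on LP64.
             */
            printf("sizeof(int) = %zu, sizeof(long) = %zu\n",
                   sizeof(int), sizeof(long));
            printf("1UL << %u = %lu bytes\n",
                   page_size_log, 1UL << page_size_log);
            return 0;
    }
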
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 0129f89cf98d..bcc853eccc85 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h | |||
@@ -16,7 +16,6 @@ | |||
16 | #define _LINUX_HUGETLB_CGROUP_H | 16 | #define _LINUX_HUGETLB_CGROUP_H |
17 | 17 | ||
18 | #include <linux/mmdebug.h> | 18 | #include <linux/mmdebug.h> |
19 | #include <linux/res_counter.h> | ||
20 | 19 | ||
21 | struct hugetlb_cgroup; | 20 | struct hugetlb_cgroup; |
22 | /* | 21 | /* |
diff --git a/include/linux/kern_levels.h b/include/linux/kern_levels.h index 866caaa9e2bb..c2ce155d83cc 100644 --- a/include/linux/kern_levels.h +++ b/include/linux/kern_levels.h | |||
@@ -22,4 +22,17 @@ | |||
22 | */ | 22 | */ |
23 | #define KERN_CONT "" | 23 | #define KERN_CONT "" |
24 | 24 | ||
25 | /* integer equivalents of KERN_<LEVEL> */ | ||
26 | #define LOGLEVEL_SCHED -2 /* Deferred messages from sched code | ||
27 | * are set to this special level */ | ||
28 | #define LOGLEVEL_DEFAULT -1 /* default (or last) loglevel */ | ||
29 | #define LOGLEVEL_EMERG 0 /* system is unusable */ | ||
30 | #define LOGLEVEL_ALERT 1 /* action must be taken immediately */ | ||
31 | #define LOGLEVEL_CRIT 2 /* critical conditions */ | ||
32 | #define LOGLEVEL_ERR 3 /* error conditions */ | ||
33 | #define LOGLEVEL_WARNING 4 /* warning conditions */ | ||
34 | #define LOGLEVEL_NOTICE 5 /* normal but significant condition */ | ||
35 | #define LOGLEVEL_INFO 6 /* informational */ | ||
36 | #define LOGLEVEL_DEBUG 7 /* debug-level messages */ | ||
37 | |||
25 | #endif | 38 | #endif |
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 446d76a87ba1..233ea8107038 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h | |||
@@ -427,6 +427,7 @@ extern int panic_timeout; | |||
427 | extern int panic_on_oops; | 427 | extern int panic_on_oops; |
428 | extern int panic_on_unrecovered_nmi; | 428 | extern int panic_on_unrecovered_nmi; |
429 | extern int panic_on_io_nmi; | 429 | extern int panic_on_io_nmi; |
430 | extern int panic_on_warn; | ||
430 | extern int sysctl_panic_on_stackoverflow; | 431 | extern int sysctl_panic_on_stackoverflow; |
431 | /* | 432 | /* |
432 | * Only to be used by arch init code. If the user over-wrote the default | 433 | * Only to be used by arch init code. If the user over-wrote the default |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6b75640ef5ab..6ea9f919e888 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -25,7 +25,6 @@ | |||
25 | #include <linux/jump_label.h> | 25 | #include <linux/jump_label.h> |
26 | 26 | ||
27 | struct mem_cgroup; | 27 | struct mem_cgroup; |
28 | struct page_cgroup; | ||
29 | struct page; | 28 | struct page; |
30 | struct mm_struct; | 29 | struct mm_struct; |
31 | struct kmem_cache; | 30 | struct kmem_cache; |
@@ -68,10 +67,9 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | |||
68 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); | 67 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); |
69 | struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); | 68 | struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); |
70 | 69 | ||
71 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | 70 | bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, |
72 | struct mem_cgroup *memcg); | 71 | struct mem_cgroup *root); |
73 | bool task_in_mem_cgroup(struct task_struct *task, | 72 | bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); |
74 | const struct mem_cgroup *memcg); | ||
75 | 73 | ||
76 | extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); | 74 | extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); |
77 | extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); | 75 | extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); |
@@ -79,15 +77,16 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); | |||
79 | extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); | 77 | extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); |
80 | extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); | 78 | extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); |
81 | 79 | ||
82 | static inline | 80 | static inline bool mm_match_cgroup(struct mm_struct *mm, |
83 | bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) | 81 | struct mem_cgroup *memcg) |
84 | { | 82 | { |
85 | struct mem_cgroup *task_memcg; | 83 | struct mem_cgroup *task_memcg; |
86 | bool match; | 84 | bool match = false; |
87 | 85 | ||
88 | rcu_read_lock(); | 86 | rcu_read_lock(); |
89 | task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 87 | task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
90 | match = __mem_cgroup_same_or_subtree(memcg, task_memcg); | 88 | if (task_memcg) |
89 | match = mem_cgroup_is_descendant(task_memcg, memcg); | ||
91 | rcu_read_unlock(); | 90 | rcu_read_unlock(); |
92 | return match; | 91 | return match; |
93 | } | 92 | } |
@@ -141,8 +140,8 @@ static inline bool mem_cgroup_disabled(void) | |||
141 | 140 | ||
142 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, | 141 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, |
143 | unsigned long *flags); | 142 | unsigned long *flags); |
144 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, | 143 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, |
145 | unsigned long flags); | 144 | unsigned long *flags); |
146 | void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, | 145 | void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, |
147 | enum mem_cgroup_stat_index idx, int val); | 146 | enum mem_cgroup_stat_index idx, int val); |
148 | 147 | ||
@@ -174,10 +173,6 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, | |||
174 | void mem_cgroup_split_huge_fixup(struct page *head); | 173 | void mem_cgroup_split_huge_fixup(struct page *head); |
175 | #endif | 174 | #endif |
176 | 175 | ||
177 | #ifdef CONFIG_DEBUG_VM | ||
178 | bool mem_cgroup_bad_page_check(struct page *page); | ||
179 | void mem_cgroup_print_bad_page(struct page *page); | ||
180 | #endif | ||
181 | #else /* CONFIG_MEMCG */ | 176 | #else /* CONFIG_MEMCG */ |
182 | struct mem_cgroup; | 177 | struct mem_cgroup; |
183 | 178 | ||
@@ -297,7 +292,7 @@ static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | |||
297 | } | 292 | } |
298 | 293 | ||
299 | static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, | 294 | static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, |
300 | bool locked, unsigned long flags) | 295 | bool *locked, unsigned long *flags) |
301 | { | 296 | { |
302 | } | 297 | } |
303 | 298 | ||
@@ -347,19 +342,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | |||
347 | } | 342 | } |
348 | #endif /* CONFIG_MEMCG */ | 343 | #endif /* CONFIG_MEMCG */ |
349 | 344 | ||
350 | #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) | ||
351 | static inline bool | ||
352 | mem_cgroup_bad_page_check(struct page *page) | ||
353 | { | ||
354 | return false; | ||
355 | } | ||
356 | |||
357 | static inline void | ||
358 | mem_cgroup_print_bad_page(struct page *page) | ||
359 | { | ||
360 | } | ||
361 | #endif | ||
362 | |||
363 | enum { | 345 | enum { |
364 | UNDER_LIMIT, | 346 | UNDER_LIMIT, |
365 | SOFT_LIMIT, | 347 | SOFT_LIMIT, |
@@ -447,9 +429,8 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) | |||
447 | /* | 429 | /* |
448 | * __GFP_NOFAIL allocations will move on even if charging is not | 430 | * __GFP_NOFAIL allocations will move on even if charging is not |
449 | * possible. Therefore we don't even try, and have this allocation | 431 | * possible. Therefore we don't even try, and have this allocation |
450 | * unaccounted. We could in theory charge it with | 432 | * unaccounted. We could in theory charge it forcibly, but we hope |
451 | * res_counter_charge_nofail, but we hope those allocations are rare, | 433 | * those allocations are rare, and won't be worth the trouble. |
452 | * and won't be worth the trouble. | ||
453 | */ | 434 | */ |
454 | if (gfp & __GFP_NOFAIL) | 435 | if (gfp & __GFP_NOFAIL) |
455 | return true; | 436 | return true; |
@@ -467,8 +448,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) | |||
467 | * memcg_kmem_uncharge_pages: uncharge pages from memcg | 448 | * memcg_kmem_uncharge_pages: uncharge pages from memcg |
468 | * @page: pointer to struct page being freed | 449 | * @page: pointer to struct page being freed |
469 | * @order: allocation order. | 450 | * @order: allocation order. |
470 | * | ||
471 | * there is no need to specify memcg here, since it is embedded in page_cgroup | ||
472 | */ | 451 | */ |
473 | static inline void | 452 | static inline void |
474 | memcg_kmem_uncharge_pages(struct page *page, int order) | 453 | memcg_kmem_uncharge_pages(struct page *page, int order) |
@@ -485,8 +464,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order) | |||
485 | * | 464 | * |
486 | * Needs to be called after memcg_kmem_newpage_charge, regardless of success or | 465 | * Needs to be called after memcg_kmem_newpage_charge, regardless of success or |
487 | * failure of the allocation. if @page is NULL, this function will revert the | 466 | * failure of the allocation. if @page is NULL, this function will revert the |
488 | * charges. Otherwise, it will commit the memcg given by @memcg to the | 467 | * charges. Otherwise, it will commit @page to @memcg. |
489 | * corresponding page_cgroup. | ||
490 | */ | 468 | */ |
491 | static inline void | 469 | static inline void |
492 | memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) | 470 | memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) |
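
mem_cgroup_is_descendant(task_memcg, memcg) replaces the old same-or-subtree helper: mm_match_cgroup() now asks whether the task's memcg sits in the subtree rooted at @memcg, and it copes with a NULL task_memcg by defaulting to no match. The parent-walk below is only a conceptual model of that query (with a node counting as its own descendant, which the match logic needs); the real check goes through the cgroup core rather than chasing parent pointers like this.

    #include <stdbool.h>
    #include <stdio.h>

    struct group {
            const char *name;
            struct group *parent;
    };

    /* Conceptual only: walk up the hierarchy looking for @root. */
    static bool is_descendant(struct group *g, struct group *root)
    {
            for (; g; g = g->parent)
                    if (g == root)
                            return true;
            return false;
    }

    int main(void)
    {
            struct group root    = { "root", NULL };
            struct group a       = { "a", &root };
            struct group a_child = { "a/child", &a };

            printf("%d %d %d\n",
                   is_descendant(&a_child, &a),     /* 1 */
                   is_descendant(&a, &a),           /* 1: self counts */
                   is_descendant(&root, &a));       /* 0 */
            return 0;
    }
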
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 004e9d17b47e..bf9f57529dcf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -22,6 +22,7 @@ | |||
22 | #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) | 22 | #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) |
23 | 23 | ||
24 | struct address_space; | 24 | struct address_space; |
25 | struct mem_cgroup; | ||
25 | 26 | ||
26 | #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) | 27 | #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) |
27 | #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ | 28 | #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ |
@@ -167,6 +168,10 @@ struct page { | |||
167 | struct page *first_page; /* Compound tail pages */ | 168 | struct page *first_page; /* Compound tail pages */ |
168 | }; | 169 | }; |
169 | 170 | ||
171 | #ifdef CONFIG_MEMCG | ||
172 | struct mem_cgroup *mem_cgroup; | ||
173 | #endif | ||
174 | |||
170 | /* | 175 | /* |
171 | * On machines where all RAM is mapped into kernel address space, | 176 | * On machines where all RAM is mapped into kernel address space, |
172 | * we can simply calculate the virtual address. On machines with | 177 | * we can simply calculate the virtual address. On machines with |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ffe66e381c04..3879d7664dfc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -722,9 +722,6 @@ typedef struct pglist_data { | |||
722 | int nr_zones; | 722 | int nr_zones; |
723 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ | 723 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ |
724 | struct page *node_mem_map; | 724 | struct page *node_mem_map; |
725 | #ifdef CONFIG_MEMCG | ||
726 | struct page_cgroup *node_page_cgroup; | ||
727 | #endif | ||
728 | #endif | 725 | #endif |
729 | #ifndef CONFIG_NO_BOOTMEM | 726 | #ifndef CONFIG_NO_BOOTMEM |
730 | struct bootmem_data *bdata; | 727 | struct bootmem_data *bdata; |
@@ -1078,7 +1075,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) | |||
1078 | #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) | 1075 | #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) |
1079 | 1076 | ||
1080 | struct page; | 1077 | struct page; |
1081 | struct page_cgroup; | ||
1082 | struct mem_section { | 1078 | struct mem_section { |
1083 | /* | 1079 | /* |
1084 | * This is, logically, a pointer to an array of struct | 1080 | * This is, logically, a pointer to an array of struct |
@@ -1096,14 +1092,6 @@ struct mem_section { | |||
1096 | 1092 | ||
1097 | /* See declaration of similar field in struct zone */ | 1093 | /* See declaration of similar field in struct zone */ |
1098 | unsigned long *pageblock_flags; | 1094 | unsigned long *pageblock_flags; |
1099 | #ifdef CONFIG_MEMCG | ||
1100 | /* | ||
1101 | * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use | ||
1102 | * section. (see memcontrol.h/page_cgroup.h about this.) | ||
1103 | */ | ||
1104 | struct page_cgroup *page_cgroup; | ||
1105 | unsigned long pad; | ||
1106 | #endif | ||
1107 | /* | 1095 | /* |
1108 | * WARNING: mem_section must be a power-of-2 in size for the | 1096 | * WARNING: mem_section must be a power-of-2 in size for the |
1109 | * calculation and use of SECTION_ROOT_MASK to make sense. | 1097 | * calculation and use of SECTION_ROOT_MASK to make sense. |
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h deleted file mode 100644 index 5c831f1eca79..000000000000 --- a/include/linux/page_cgroup.h +++ /dev/null | |||
@@ -1,105 +0,0 @@ | |||
1 | #ifndef __LINUX_PAGE_CGROUP_H | ||
2 | #define __LINUX_PAGE_CGROUP_H | ||
3 | |||
4 | enum { | ||
5 | /* flags for mem_cgroup */ | ||
6 | PCG_USED = 0x01, /* This page is charged to a memcg */ | ||
7 | PCG_MEM = 0x02, /* This page holds a memory charge */ | ||
8 | PCG_MEMSW = 0x04, /* This page holds a memory+swap charge */ | ||
9 | }; | ||
10 | |||
11 | struct pglist_data; | ||
12 | |||
13 | #ifdef CONFIG_MEMCG | ||
14 | struct mem_cgroup; | ||
15 | |||
16 | /* | ||
17 | * Page Cgroup can be considered as an extended mem_map. | ||
18 | * A page_cgroup page is associated with every page descriptor. The | ||
19 | * page_cgroup helps us identify information about the cgroup | ||
20 | * All page cgroups are allocated at boot or memory hotplug event, | ||
21 | * then the page cgroup for pfn always exists. | ||
22 | */ | ||
23 | struct page_cgroup { | ||
24 | unsigned long flags; | ||
25 | struct mem_cgroup *mem_cgroup; | ||
26 | }; | ||
27 | |||
28 | extern void pgdat_page_cgroup_init(struct pglist_data *pgdat); | ||
29 | |||
30 | #ifdef CONFIG_SPARSEMEM | ||
31 | static inline void page_cgroup_init_flatmem(void) | ||
32 | { | ||
33 | } | ||
34 | extern void page_cgroup_init(void); | ||
35 | #else | ||
36 | extern void page_cgroup_init_flatmem(void); | ||
37 | static inline void page_cgroup_init(void) | ||
38 | { | ||
39 | } | ||
40 | #endif | ||
41 | |||
42 | struct page_cgroup *lookup_page_cgroup(struct page *page); | ||
43 | |||
44 | static inline int PageCgroupUsed(struct page_cgroup *pc) | ||
45 | { | ||
46 | return !!(pc->flags & PCG_USED); | ||
47 | } | ||
48 | #else /* !CONFIG_MEMCG */ | ||
49 | struct page_cgroup; | ||
50 | |||
51 | static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
52 | { | ||
53 | } | ||
54 | |||
55 | static inline struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
56 | { | ||
57 | return NULL; | ||
58 | } | ||
59 | |||
60 | static inline void page_cgroup_init(void) | ||
61 | { | ||
62 | } | ||
63 | |||
64 | static inline void page_cgroup_init_flatmem(void) | ||
65 | { | ||
66 | } | ||
67 | #endif /* CONFIG_MEMCG */ | ||
68 | |||
69 | #include <linux/swap.h> | ||
70 | |||
71 | #ifdef CONFIG_MEMCG_SWAP | ||
72 | extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
73 | unsigned short old, unsigned short new); | ||
74 | extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); | ||
75 | extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); | ||
76 | extern int swap_cgroup_swapon(int type, unsigned long max_pages); | ||
77 | extern void swap_cgroup_swapoff(int type); | ||
78 | #else | ||
79 | |||
80 | static inline | ||
81 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | ||
82 | { | ||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | static inline | ||
87 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) | ||
88 | { | ||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | static inline int | ||
93 | swap_cgroup_swapon(int type, unsigned long max_pages) | ||
94 | { | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | static inline void swap_cgroup_swapoff(int type) | ||
99 | { | ||
100 | return; | ||
101 | } | ||
102 | |||
103 | #endif /* CONFIG_MEMCG_SWAP */ | ||
104 | |||
105 | #endif /* __LINUX_PAGE_CGROUP_H */ | ||
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h new file mode 100644 index 000000000000..955421575d16 --- /dev/null +++ b/include/linux/page_counter.h | |||
@@ -0,0 +1,51 @@ | |||
1 | #ifndef _LINUX_PAGE_COUNTER_H | ||
2 | #define _LINUX_PAGE_COUNTER_H | ||
3 | |||
4 | #include <linux/atomic.h> | ||
5 | #include <linux/kernel.h> | ||
6 | #include <asm/page.h> | ||
7 | |||
8 | struct page_counter { | ||
9 | atomic_long_t count; | ||
10 | unsigned long limit; | ||
11 | struct page_counter *parent; | ||
12 | |||
13 | /* legacy */ | ||
14 | unsigned long watermark; | ||
15 | unsigned long failcnt; | ||
16 | }; | ||
17 | |||
18 | #if BITS_PER_LONG == 32 | ||
19 | #define PAGE_COUNTER_MAX LONG_MAX | ||
20 | #else | ||
21 | #define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) | ||
22 | #endif | ||
23 | |||
24 | static inline void page_counter_init(struct page_counter *counter, | ||
25 | struct page_counter *parent) | ||
26 | { | ||
27 | atomic_long_set(&counter->count, 0); | ||
28 | counter->limit = PAGE_COUNTER_MAX; | ||
29 | counter->parent = parent; | ||
30 | } | ||
31 | |||
32 | static inline unsigned long page_counter_read(struct page_counter *counter) | ||
33 | { | ||
34 | return atomic_long_read(&counter->count); | ||
35 | } | ||
36 | |||
37 | void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); | ||
38 | void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); | ||
39 | int page_counter_try_charge(struct page_counter *counter, | ||
40 | unsigned long nr_pages, | ||
41 | struct page_counter **fail); | ||
42 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); | ||
43 | int page_counter_limit(struct page_counter *counter, unsigned long limit); | ||
44 | int page_counter_memparse(const char *buf, unsigned long *nr_pages); | ||
45 | |||
46 | static inline void page_counter_reset_watermark(struct page_counter *counter) | ||
47 | { | ||
48 | counter->watermark = page_counter_read(counter); | ||
49 | } | ||
50 | |||
51 | #endif /* _LINUX_PAGE_COUNTER_H */ | ||
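
page_counter is the lockless replacement for res_counter that the rest of this series converts memcg, hugetlb and tcp accounting to: usage is counted in pages in an atomic_long_t, and each counter points at its parent so charges and uncharges propagate up the hierarchy. A rough in-kernel usage sketch follows, assuming page_counter_try_charge() returns 0 on success and a negative error while pointing *fail at the counter whose limit was hit; the function names here are hypothetical.

    #include <linux/errno.h>
    #include <linux/page_counter.h>

    static struct page_counter parent_cnt, child_cnt;

    static void example_setup(void)
    {
            page_counter_init(&parent_cnt, NULL);           /* hierarchy root */
            page_counter_init(&child_cnt, &parent_cnt);     /* charges hit both */
            page_counter_limit(&parent_cnt, 1024);          /* 1024 pages = 4 MiB with 4 KiB pages */
    }

    static int example_charge(unsigned long nr_pages)
    {
            struct page_counter *fail;

            if (page_counter_try_charge(&child_cnt, nr_pages, &fail))
                    return -ENOMEM;         /* *fail: the counter that refused */
            return 0;
    }

    static void example_uncharge(unsigned long nr_pages)
    {
            page_counter_uncharge(&child_cnt, nr_pages);    /* parent follows */
    }
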
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 51ce60c35f4c..530b249f7ea4 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h | |||
@@ -147,28 +147,42 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, | |||
147 | } | 147 | } |
148 | 148 | ||
149 | /** | 149 | /** |
150 | * percpu_ref_get - increment a percpu refcount | 150 | * percpu_ref_get_many - increment a percpu refcount |
151 | * @ref: percpu_ref to get | 151 | * @ref: percpu_ref to get |
152 | * @nr: number of references to get | ||
152 | * | 153 | * |
153 | * Analagous to atomic_long_inc(). | 154 | * Analogous to atomic_long_add(). |
154 | * | 155 | * |
155 | * This function is safe to call as long as @ref is between init and exit. | 156 | * This function is safe to call as long as @ref is between init and exit. |
156 | */ | 157 | */ |
157 | static inline void percpu_ref_get(struct percpu_ref *ref) | 158 | static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) |
158 | { | 159 | { |
159 | unsigned long __percpu *percpu_count; | 160 | unsigned long __percpu *percpu_count; |
160 | 161 | ||
161 | rcu_read_lock_sched(); | 162 | rcu_read_lock_sched(); |
162 | 163 | ||
163 | if (__ref_is_percpu(ref, &percpu_count)) | 164 | if (__ref_is_percpu(ref, &percpu_count)) |
164 | this_cpu_inc(*percpu_count); | 165 | this_cpu_add(*percpu_count, nr); |
165 | else | 166 | else |
166 | atomic_long_inc(&ref->count); | 167 | atomic_long_add(nr, &ref->count); |
167 | 168 | ||
168 | rcu_read_unlock_sched(); | 169 | rcu_read_unlock_sched(); |
169 | } | 170 | } |
170 | 171 | ||
171 | /** | 172 | /** |
173 | * percpu_ref_get - increment a percpu refcount | ||
174 | * @ref: percpu_ref to get | ||
175 | * | ||
176 | * Analogous to atomic_long_inc(). | ||
177 | * | ||
178 | * This function is safe to call as long as @ref is between init and exit. | ||
179 | */ | ||
180 | static inline void percpu_ref_get(struct percpu_ref *ref) | ||
181 | { | ||
182 | percpu_ref_get_many(ref, 1); | ||
183 | } | ||
184 | |||
185 | /** | ||
172 | * percpu_ref_tryget - try to increment a percpu refcount | 186 | * percpu_ref_tryget - try to increment a percpu refcount |
173 | * @ref: percpu_ref to try-get | 187 | * @ref: percpu_ref to try-get |
174 | * | 188 | * |
@@ -231,29 +245,44 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) | |||
231 | } | 245 | } |
232 | 246 | ||
233 | /** | 247 | /** |
234 | * percpu_ref_put - decrement a percpu refcount | 248 | * percpu_ref_put_many - decrement a percpu refcount |
235 | * @ref: percpu_ref to put | 249 | * @ref: percpu_ref to put |
250 | * @nr: number of references to put | ||
236 | * | 251 | * |
237 | * Decrement the refcount, and if 0, call the release function (which was passed | 252 | * Decrement the refcount, and if 0, call the release function (which was passed |
238 | * to percpu_ref_init()) | 253 | * to percpu_ref_init()) |
239 | * | 254 | * |
240 | * This function is safe to call as long as @ref is between init and exit. | 255 | * This function is safe to call as long as @ref is between init and exit. |
241 | */ | 256 | */ |
242 | static inline void percpu_ref_put(struct percpu_ref *ref) | 257 | static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) |
243 | { | 258 | { |
244 | unsigned long __percpu *percpu_count; | 259 | unsigned long __percpu *percpu_count; |
245 | 260 | ||
246 | rcu_read_lock_sched(); | 261 | rcu_read_lock_sched(); |
247 | 262 | ||
248 | if (__ref_is_percpu(ref, &percpu_count)) | 263 | if (__ref_is_percpu(ref, &percpu_count)) |
249 | this_cpu_dec(*percpu_count); | 264 | this_cpu_sub(*percpu_count, nr); |
250 | else if (unlikely(atomic_long_dec_and_test(&ref->count))) | 265 | else if (unlikely(atomic_long_sub_and_test(nr, &ref->count))) |
251 | ref->release(ref); | 266 | ref->release(ref); |
252 | 267 | ||
253 | rcu_read_unlock_sched(); | 268 | rcu_read_unlock_sched(); |
254 | } | 269 | } |
255 | 270 | ||
256 | /** | 271 | /** |
272 | * percpu_ref_put - decrement a percpu refcount | ||
273 | * @ref: percpu_ref to put | ||
274 | * | ||
275 | * Decrement the refcount, and if 0, call the release function (which was passed | ||
276 | * to percpu_ref_init()) | ||
277 | * | ||
278 | * This function is safe to call as long as @ref is between init and exit. | ||
279 | */ | ||
280 | static inline void percpu_ref_put(struct percpu_ref *ref) | ||
281 | { | ||
282 | percpu_ref_put_many(ref, 1); | ||
283 | } | ||
284 | |||
285 | /** | ||
257 | * percpu_ref_is_zero - test whether a percpu refcount reached zero | 286 | * percpu_ref_is_zero - test whether a percpu refcount reached zero |
258 | * @ref: percpu_ref to test | 287 | * @ref: percpu_ref to test |
259 | * | 288 | * |
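
percpu_ref_get_many() and percpu_ref_put_many() generalize the existing get/put to n references, so a batch of objects can be accounted with a single per-cpu add (or a single atomic once the ref has switched to atomic mode). A hedged in-kernel sketch of the intended pattern; the function names around the calls are hypothetical.

    #include <linux/percpu-refcount.h>

    /* Illustrative only: one reference per object, taken in one add. */
    static void example_pin_batch(struct percpu_ref *ref, unsigned long nr)
    {
            percpu_ref_get_many(ref, nr);
    }

    /* Drop the whole batch at once; the last put may call ->release(). */
    static void example_unpin_batch(struct percpu_ref *ref, unsigned long nr)
    {
            percpu_ref_put_many(ref, nr);
    }
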
diff --git a/include/linux/printk.h b/include/linux/printk.h index d78125f73ac4..3dd489f2dedc 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h | |||
@@ -118,7 +118,6 @@ int no_printk(const char *fmt, ...) | |||
118 | #ifdef CONFIG_EARLY_PRINTK | 118 | #ifdef CONFIG_EARLY_PRINTK |
119 | extern asmlinkage __printf(1, 2) | 119 | extern asmlinkage __printf(1, 2) |
120 | void early_printk(const char *fmt, ...); | 120 | void early_printk(const char *fmt, ...); |
121 | void early_vprintk(const char *fmt, va_list ap); | ||
122 | #else | 121 | #else |
123 | static inline __printf(1, 2) __cold | 122 | static inline __printf(1, 2) __cold |
124 | void early_printk(const char *s, ...) { } | 123 | void early_printk(const char *s, ...) { } |
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index cc79eff4a1ad..987a73a40ef8 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h | |||
@@ -52,7 +52,7 @@ extern void ptrace_notify(int exit_code); | |||
52 | extern void __ptrace_link(struct task_struct *child, | 52 | extern void __ptrace_link(struct task_struct *child, |
53 | struct task_struct *new_parent); | 53 | struct task_struct *new_parent); |
54 | extern void __ptrace_unlink(struct task_struct *child); | 54 | extern void __ptrace_unlink(struct task_struct *child); |
55 | extern void exit_ptrace(struct task_struct *tracer); | 55 | extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); |
56 | #define PTRACE_MODE_READ 0x01 | 56 | #define PTRACE_MODE_READ 0x01 |
57 | #define PTRACE_MODE_ATTACH 0x02 | 57 | #define PTRACE_MODE_ATTACH 0x02 |
58 | #define PTRACE_MODE_NOAUDIT 0x04 | 58 | #define PTRACE_MODE_NOAUDIT 0x04 |
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h deleted file mode 100644 index 56b7bc32db4f..000000000000 --- a/include/linux/res_counter.h +++ /dev/null | |||
@@ -1,223 +0,0 @@ | |||
1 | #ifndef __RES_COUNTER_H__ | ||
2 | #define __RES_COUNTER_H__ | ||
3 | |||
4 | /* | ||
5 | * Resource Counters | ||
6 | * Contain common data types and routines for resource accounting | ||
7 | * | ||
8 | * Copyright 2007 OpenVZ SWsoft Inc | ||
9 | * | ||
10 | * Author: Pavel Emelianov <xemul@openvz.org> | ||
11 | * | ||
12 | * See Documentation/cgroups/resource_counter.txt for more | ||
13 | * info about what this counter is. | ||
14 | */ | ||
15 | |||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/errno.h> | ||
18 | |||
19 | /* | ||
20 | * The core object. the cgroup that wishes to account for some | ||
21 | * resource may include this counter into its structures and use | ||
22 | * the helpers described beyond | ||
23 | */ | ||
24 | |||
25 | struct res_counter { | ||
26 | /* | ||
27 | * the current resource consumption level | ||
28 | */ | ||
29 | unsigned long long usage; | ||
30 | /* | ||
31 | * the maximal value of the usage from the counter creation | ||
32 | */ | ||
33 | unsigned long long max_usage; | ||
34 | /* | ||
35 | * the limit that usage cannot exceed | ||
36 | */ | ||
37 | unsigned long long limit; | ||
38 | /* | ||
39 | * the limit that usage can be exceed | ||
40 | */ | ||
41 | unsigned long long soft_limit; | ||
42 | /* | ||
43 | * the number of unsuccessful attempts to consume the resource | ||
44 | */ | ||
45 | unsigned long long failcnt; | ||
46 | /* | ||
47 | * the lock to protect all of the above. | ||
48 | * the routines below consider this to be IRQ-safe | ||
49 | */ | ||
50 | spinlock_t lock; | ||
51 | /* | ||
52 | * Parent counter, used for hierarchical resource accounting | ||
53 | */ | ||
54 | struct res_counter *parent; | ||
55 | }; | ||
56 | |||
57 | #define RES_COUNTER_MAX ULLONG_MAX | ||
58 | |||
59 | /** | ||
60 | * Helpers to interact with userspace | ||
61 | * res_counter_read_u64() - returns the value of the specified member. | ||
62 | * res_counter_read/_write - put/get the specified fields from the | ||
63 | * res_counter struct to/from the user | ||
64 | * | ||
65 | * @counter: the counter in question | ||
66 | * @member: the field to work with (see RES_xxx below) | ||
67 | * @buf: the buffer to operate on,... | ||
68 | * @nbytes: its size... | ||
69 | * @pos: and the offset. | ||
70 | */ | ||
71 | |||
72 | u64 res_counter_read_u64(struct res_counter *counter, int member); | ||
73 | |||
74 | ssize_t res_counter_read(struct res_counter *counter, int member, | ||
75 | const char __user *buf, size_t nbytes, loff_t *pos, | ||
76 | int (*read_strategy)(unsigned long long val, char *s)); | ||
77 | |||
78 | int res_counter_memparse_write_strategy(const char *buf, | ||
79 | unsigned long long *res); | ||
80 | |||
81 | /* | ||
82 | * the field descriptors. one for each member of res_counter | ||
83 | */ | ||
84 | |||
85 | enum { | ||
86 | RES_USAGE, | ||
87 | RES_MAX_USAGE, | ||
88 | RES_LIMIT, | ||
89 | RES_FAILCNT, | ||
90 | RES_SOFT_LIMIT, | ||
91 | }; | ||
92 | |||
93 | /* | ||
94 | * helpers for accounting | ||
95 | */ | ||
96 | |||
97 | void res_counter_init(struct res_counter *counter, struct res_counter *parent); | ||
98 | |||
99 | /* | ||
100 | * charge - try to consume more resource. | ||
101 | * | ||
102 | * @counter: the counter | ||
103 | * @val: the amount of the resource. each controller defines its own | ||
104 | * units, e.g. numbers, bytes, Kbytes, etc | ||
105 | * | ||
106 | * returns 0 on success and <0 if the counter->usage will exceed the | ||
107 | * counter->limit | ||
108 | * | ||
109 | * charge_nofail works the same, except that it charges the resource | ||
110 | * counter unconditionally, and returns < 0 if, after the current | ||
111 | * charge, we are over the limit. | ||
112 | */ | ||
113 | |||
114 | int __must_check res_counter_charge(struct res_counter *counter, | ||
115 | unsigned long val, struct res_counter **limit_fail_at); | ||
116 | int res_counter_charge_nofail(struct res_counter *counter, | ||
117 | unsigned long val, struct res_counter **limit_fail_at); | ||
118 | |||
119 | /* | ||
120 | * uncharge - tell that some portion of the resource is released | ||
121 | * | ||
122 | * @counter: the counter | ||
123 | * @val: the amount of the resource | ||
124 | * | ||
125 | * these calls check for usage underflow and show a warning on the console | ||
126 | * | ||
127 | * returns the total charges still present in @counter. | ||
128 | */ | ||
129 | |||
130 | u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); | ||
131 | |||
132 | u64 res_counter_uncharge_until(struct res_counter *counter, | ||
133 | struct res_counter *top, | ||
134 | unsigned long val); | ||
135 | /** | ||
136 | * res_counter_margin - calculate chargeable space of a counter | ||
137 | * @cnt: the counter | ||
138 | * | ||
139 | * Returns the difference between the hard limit and the current usage | ||
140 | * of resource counter @cnt. | ||
141 | */ | ||
142 | static inline unsigned long long res_counter_margin(struct res_counter *cnt) | ||
143 | { | ||
144 | unsigned long long margin; | ||
145 | unsigned long flags; | ||
146 | |||
147 | spin_lock_irqsave(&cnt->lock, flags); | ||
148 | if (cnt->limit > cnt->usage) | ||
149 | margin = cnt->limit - cnt->usage; | ||
150 | else | ||
151 | margin = 0; | ||
152 | spin_unlock_irqrestore(&cnt->lock, flags); | ||
153 | return margin; | ||
154 | } | ||
155 | |||
156 | /** | ||
157 | * Get the difference between the usage and the soft limit | ||
158 | * @cnt: The counter | ||
159 | * | ||
160 | * Returns 0 if usage is less than or equal to soft limit | ||
161 | * The difference between usage and soft limit, otherwise. | ||
162 | */ | ||
163 | static inline unsigned long long | ||
164 | res_counter_soft_limit_excess(struct res_counter *cnt) | ||
165 | { | ||
166 | unsigned long long excess; | ||
167 | unsigned long flags; | ||
168 | |||
169 | spin_lock_irqsave(&cnt->lock, flags); | ||
170 | if (cnt->usage <= cnt->soft_limit) | ||
171 | excess = 0; | ||
172 | else | ||
173 | excess = cnt->usage - cnt->soft_limit; | ||
174 | spin_unlock_irqrestore(&cnt->lock, flags); | ||
175 | return excess; | ||
176 | } | ||
177 | |||
178 | static inline void res_counter_reset_max(struct res_counter *cnt) | ||
179 | { | ||
180 | unsigned long flags; | ||
181 | |||
182 | spin_lock_irqsave(&cnt->lock, flags); | ||
183 | cnt->max_usage = cnt->usage; | ||
184 | spin_unlock_irqrestore(&cnt->lock, flags); | ||
185 | } | ||
186 | |||
187 | static inline void res_counter_reset_failcnt(struct res_counter *cnt) | ||
188 | { | ||
189 | unsigned long flags; | ||
190 | |||
191 | spin_lock_irqsave(&cnt->lock, flags); | ||
192 | cnt->failcnt = 0; | ||
193 | spin_unlock_irqrestore(&cnt->lock, flags); | ||
194 | } | ||
195 | |||
196 | static inline int res_counter_set_limit(struct res_counter *cnt, | ||
197 | unsigned long long limit) | ||
198 | { | ||
199 | unsigned long flags; | ||
200 | int ret = -EBUSY; | ||
201 | |||
202 | spin_lock_irqsave(&cnt->lock, flags); | ||
203 | if (cnt->usage <= limit) { | ||
204 | cnt->limit = limit; | ||
205 | ret = 0; | ||
206 | } | ||
207 | spin_unlock_irqrestore(&cnt->lock, flags); | ||
208 | return ret; | ||
209 | } | ||
210 | |||
211 | static inline int | ||
212 | res_counter_set_soft_limit(struct res_counter *cnt, | ||
213 | unsigned long long soft_limit) | ||
214 | { | ||
215 | unsigned long flags; | ||
216 | |||
217 | spin_lock_irqsave(&cnt->lock, flags); | ||
218 | cnt->soft_limit = soft_limit; | ||
219 | spin_unlock_irqrestore(&cnt->lock, flags); | ||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | #endif | ||
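
Compared with the deleted res_counter above — bytes in an unsigned long long guarded by a spinlock — the new page_counter keeps an atomic count of pages, which is why the 64-bit PAGE_COUNTER_MAX is LONG_MAX / PAGE_SIZE: the page count must remain representable once converted back to bytes for userspace. The stand-alone example below only shows the bytes/pages conversion a limit write now boils down to, assuming 4 KiB pages; it is not the kernel's own parser (page_counter_memparse()).

    #include <limits.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12                   /* 4 KiB pages assumed */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    int main(void)
    {
            unsigned long long limit_bytes = 64ULL << 20;   /* "64M" written to a limit file */
            unsigned long nr_pages = limit_bytes >> PAGE_SHIFT;

            printf("64M limit -> %lu pages\n", nr_pages);   /* 16384 */
            printf("max countable: %lu pages (%llu bytes)\n",
                   LONG_MAX / PAGE_SIZE,
                   (unsigned long long)(LONG_MAX / PAGE_SIZE) * PAGE_SIZE);
            return 0;
    }
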
diff --git a/include/linux/slab.h b/include/linux/slab.h index c265bec6a57d..8a2457d42fc8 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -513,10 +513,6 @@ struct memcg_cache_params { | |||
513 | 513 | ||
514 | int memcg_update_all_caches(int num_memcgs); | 514 | int memcg_update_all_caches(int num_memcgs); |
515 | 515 | ||
516 | struct seq_file; | ||
517 | int cache_show(struct kmem_cache *s, struct seq_file *m); | ||
518 | void print_slabinfo_header(struct seq_file *m); | ||
519 | |||
520 | /** | 516 | /** |
521 | * kmalloc_array - allocate memory for an array. | 517 | * kmalloc_array - allocate memory for an array. |
522 | * @n: number of elements. | 518 | * @n: number of elements. |
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h new file mode 100644 index 000000000000..145306bdc92f --- /dev/null +++ b/include/linux/swap_cgroup.h | |||
@@ -0,0 +1,42 @@ | |||
1 | #ifndef __LINUX_SWAP_CGROUP_H | ||
2 | #define __LINUX_SWAP_CGROUP_H | ||
3 | |||
4 | #include <linux/swap.h> | ||
5 | |||
6 | #ifdef CONFIG_MEMCG_SWAP | ||
7 | |||
8 | extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
9 | unsigned short old, unsigned short new); | ||
10 | extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); | ||
11 | extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); | ||
12 | extern int swap_cgroup_swapon(int type, unsigned long max_pages); | ||
13 | extern void swap_cgroup_swapoff(int type); | ||
14 | |||
15 | #else | ||
16 | |||
17 | static inline | ||
18 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | ||
19 | { | ||
20 | return 0; | ||
21 | } | ||
22 | |||
23 | static inline | ||
24 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) | ||
25 | { | ||
26 | return 0; | ||
27 | } | ||
28 | |||
29 | static inline int | ||
30 | swap_cgroup_swapon(int type, unsigned long max_pages) | ||
31 | { | ||
32 | return 0; | ||
33 | } | ||
34 | |||
35 | static inline void swap_cgroup_swapoff(int type) | ||
36 | { | ||
37 | return; | ||
38 | } | ||
39 | |||
40 | #endif /* CONFIG_MEMCG_SWAP */ | ||
41 | |||
42 | #endif /* __LINUX_SWAP_CGROUP_H */ | ||
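
The new header only gives the swap_cgroup helpers a home of their own (they previously lived in page_cgroup.h). A minimal usage sketch built from the declarations above; the calling functions and the idea of passing a memcg id around are assumptions about how mm/memcontrol.c uses them, not code from this patch:

    #include <linux/swap_cgroup.h>

    /* Sketch: record the owning memcg id when a page goes out to swap,
     * look it up again when the entry is swapped back in or freed. */
    static void example_account_swapout(swp_entry_t entry, unsigned short memcg_id)
    {
            /* returns the id previously stored for this entry (0 if none) */
            unsigned short old = swap_cgroup_record(entry, memcg_id);

            (void)old;
    }

    static unsigned short example_account_swapin(swp_entry_t entry)
    {
            /* 0 means "not charged to any memcg" */
            return lookup_swap_cgroup_id(entry);
    }
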
diff --git a/include/net/sock.h b/include/net/sock.h index e6f235ebf6c9..7ff44e062a38 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -54,8 +54,8 @@ | |||
54 | #include <linux/security.h> | 54 | #include <linux/security.h> |
55 | #include <linux/slab.h> | 55 | #include <linux/slab.h> |
56 | #include <linux/uaccess.h> | 56 | #include <linux/uaccess.h> |
57 | #include <linux/page_counter.h> | ||
57 | #include <linux/memcontrol.h> | 58 | #include <linux/memcontrol.h> |
58 | #include <linux/res_counter.h> | ||
59 | #include <linux/static_key.h> | 59 | #include <linux/static_key.h> |
60 | #include <linux/aio.h> | 60 | #include <linux/aio.h> |
61 | #include <linux/sched.h> | 61 | #include <linux/sched.h> |
@@ -1062,7 +1062,7 @@ enum cg_proto_flags { | |||
1062 | }; | 1062 | }; |
1063 | 1063 | ||
1064 | struct cg_proto { | 1064 | struct cg_proto { |
1065 | struct res_counter memory_allocated; /* Current allocated memory. */ | 1065 | struct page_counter memory_allocated; /* Current allocated memory. */ |
1066 | struct percpu_counter sockets_allocated; /* Current number of sockets. */ | 1066 | struct percpu_counter sockets_allocated; /* Current number of sockets. */ |
1067 | int memory_pressure; | 1067 | int memory_pressure; |
1068 | long sysctl_mem[3]; | 1068 | long sysctl_mem[3]; |
@@ -1214,34 +1214,26 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot, | |||
1214 | unsigned long amt, | 1214 | unsigned long amt, |
1215 | int *parent_status) | 1215 | int *parent_status) |
1216 | { | 1216 | { |
1217 | struct res_counter *fail; | 1217 | page_counter_charge(&prot->memory_allocated, amt); |
1218 | int ret; | ||
1219 | 1218 | ||
1220 | ret = res_counter_charge_nofail(&prot->memory_allocated, | 1219 | if (page_counter_read(&prot->memory_allocated) > |
1221 | amt << PAGE_SHIFT, &fail); | 1220 | prot->memory_allocated.limit) |
1222 | if (ret < 0) | ||
1223 | *parent_status = OVER_LIMIT; | 1221 | *parent_status = OVER_LIMIT; |
1224 | } | 1222 | } |
1225 | 1223 | ||
1226 | static inline void memcg_memory_allocated_sub(struct cg_proto *prot, | 1224 | static inline void memcg_memory_allocated_sub(struct cg_proto *prot, |
1227 | unsigned long amt) | 1225 | unsigned long amt) |
1228 | { | 1226 | { |
1229 | res_counter_uncharge(&prot->memory_allocated, amt << PAGE_SHIFT); | 1227 | page_counter_uncharge(&prot->memory_allocated, amt); |
1230 | } | ||
1231 | |||
1232 | static inline u64 memcg_memory_allocated_read(struct cg_proto *prot) | ||
1233 | { | ||
1234 | u64 ret; | ||
1235 | ret = res_counter_read_u64(&prot->memory_allocated, RES_USAGE); | ||
1236 | return ret >> PAGE_SHIFT; | ||
1237 | } | 1228 | } |
1238 | 1229 | ||
1239 | static inline long | 1230 | static inline long |
1240 | sk_memory_allocated(const struct sock *sk) | 1231 | sk_memory_allocated(const struct sock *sk) |
1241 | { | 1232 | { |
1242 | struct proto *prot = sk->sk_prot; | 1233 | struct proto *prot = sk->sk_prot; |
1234 | |||
1243 | if (mem_cgroup_sockets_enabled && sk->sk_cgrp) | 1235 | if (mem_cgroup_sockets_enabled && sk->sk_cgrp) |
1244 | return memcg_memory_allocated_read(sk->sk_cgrp); | 1236 | return page_counter_read(&sk->sk_cgrp->memory_allocated); |
1245 | 1237 | ||
1246 | return atomic_long_read(prot->memory_allocated); | 1238 | return atomic_long_read(prot->memory_allocated); |
1247 | } | 1239 | } |
@@ -1255,7 +1247,7 @@ sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status) | |||
1255 | memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status); | 1247 | memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status); |
1256 | /* update the root cgroup regardless */ | 1248 | /* update the root cgroup regardless */ |
1257 | atomic_long_add_return(amt, prot->memory_allocated); | 1249 | atomic_long_add_return(amt, prot->memory_allocated); |
1258 | return memcg_memory_allocated_read(sk->sk_cgrp); | 1250 | return page_counter_read(&sk->sk_cgrp->memory_allocated); |
1259 | } | 1251 | } |
1260 | 1252 | ||
1261 | return atomic_long_add_return(amt, prot->memory_allocated); | 1253 | return atomic_long_add_return(amt, prot->memory_allocated); |
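
memcg_memory_allocated_add() keeps the old res_counter_charge_nofail() semantics: the charge always succeeds and the caller only learns afterwards that the group went over its limit. A short sketch contrasting that with a fallible charge; page_counter_try_charge() and its return convention are assumptions about the counter API introduced by this series, only page_counter_charge()/page_counter_read() and ->limit are visible in the hunk above:

    #include <linux/page_counter.h>
    #include <linux/printk.h>

    static void example_charge_modes(struct page_counter *c)
    {
            struct page_counter *fail;

            /* "nofail" style, as in memcg_memory_allocated_add() above:
             * charge first, then compare usage against the limit. */
            page_counter_charge(c, 1);
            if (page_counter_read(c) > c->limit)
                    pr_info("over limit, caller must throttle\n");

            /* fallible style (assumed to return 0 on success): charge only
             * if the whole hierarchy has room, otherwise *fail points at
             * the counter that ran out. */
            if (page_counter_try_charge(c, 1, &fail))
                    pr_info("charge rejected\n");
    }
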
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 43aaba1cc037..0956373b56db 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h | |||
@@ -153,6 +153,7 @@ enum | |||
153 | KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */ | 153 | KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */ |
154 | KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ | 154 | KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ |
155 | KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ | 155 | KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ |
156 | KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */ | ||
156 | }; | 157 | }; |
157 | 158 | ||
158 | 159 | ||
diff --git a/init/Kconfig b/init/Kconfig index 903505e66d1d..9afb971497f4 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -893,14 +893,6 @@ config ARCH_SUPPORTS_INT128 | |||
893 | config ARCH_WANT_NUMA_VARIABLE_LOCALITY | 893 | config ARCH_WANT_NUMA_VARIABLE_LOCALITY |
894 | bool | 894 | bool |
895 | 895 | ||
896 | config NUMA_BALANCING_DEFAULT_ENABLED | ||
897 | bool "Automatically enable NUMA aware memory/task placement" | ||
898 | default y | ||
899 | depends on NUMA_BALANCING | ||
900 | help | ||
901 | If set, automatic NUMA balancing will be enabled if running on a NUMA | ||
902 | machine. | ||
903 | |||
904 | config NUMA_BALANCING | 896 | config NUMA_BALANCING |
905 | bool "Memory placement aware NUMA scheduler" | 897 | bool "Memory placement aware NUMA scheduler" |
906 | depends on ARCH_SUPPORTS_NUMA_BALANCING | 898 | depends on ARCH_SUPPORTS_NUMA_BALANCING |
@@ -913,6 +905,14 @@ config NUMA_BALANCING | |||
913 | 905 | ||
914 | This system will be inactive on UMA systems. | 906 | This system will be inactive on UMA systems. |
915 | 907 | ||
908 | config NUMA_BALANCING_DEFAULT_ENABLED | ||
909 | bool "Automatically enable NUMA aware memory/task placement" | ||
910 | default y | ||
911 | depends on NUMA_BALANCING | ||
912 | help | ||
913 | If set, automatic NUMA balancing will be enabled if running on a NUMA | ||
914 | machine. | ||
915 | |||
916 | menuconfig CGROUPS | 916 | menuconfig CGROUPS |
917 | boolean "Control Group support" | 917 | boolean "Control Group support" |
918 | select KERNFS | 918 | select KERNFS |
@@ -972,32 +972,17 @@ config CGROUP_CPUACCT | |||
972 | Provides a simple Resource Controller for monitoring the | 972 | Provides a simple Resource Controller for monitoring the |
973 | total CPU consumed by the tasks in a cgroup. | 973 | total CPU consumed by the tasks in a cgroup. |
974 | 974 | ||
975 | config RESOURCE_COUNTERS | 975 | config PAGE_COUNTER |
976 | bool "Resource counters" | 976 | bool |
977 | help | ||
978 | This option enables controller independent resource accounting | ||
979 | infrastructure that works with cgroups. | ||
980 | 977 | ||
981 | config MEMCG | 978 | config MEMCG |
982 | bool "Memory Resource Controller for Control Groups" | 979 | bool "Memory Resource Controller for Control Groups" |
983 | depends on RESOURCE_COUNTERS | 980 | select PAGE_COUNTER |
984 | select EVENTFD | 981 | select EVENTFD |
985 | help | 982 | help |
986 | Provides a memory resource controller that manages both anonymous | 983 | Provides a memory resource controller that manages both anonymous |
987 | memory and page cache. (See Documentation/cgroups/memory.txt) | 984 | memory and page cache. (See Documentation/cgroups/memory.txt) |
988 | 985 | ||
989 | Note that setting this option increases fixed memory overhead | ||
990 | associated with each page of memory in the system. By this, | ||
991 | 8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory | ||
992 | usage tracking struct at boot. Total amount of this is printed out | ||
993 | at boot. | ||
994 | |||
995 | Only enable when you're ok with these trade offs and really | ||
996 | sure you need the memory resource controller. Even when you enable | ||
997 | this, you can set "cgroup_disable=memory" at your boot option to | ||
998 | disable memory resource controller and you can avoid overheads. | ||
999 | (and lose benefits of memory resource controller) | ||
1000 | |||
1001 | config MEMCG_SWAP | 986 | config MEMCG_SWAP |
1002 | bool "Memory Resource Controller Swap Extension" | 987 | bool "Memory Resource Controller Swap Extension" |
1003 | depends on MEMCG && SWAP | 988 | depends on MEMCG && SWAP |
@@ -1048,7 +1033,8 @@ config MEMCG_KMEM | |||
1048 | 1033 | ||
1049 | config CGROUP_HUGETLB | 1034 | config CGROUP_HUGETLB |
1050 | bool "HugeTLB Resource Controller for Control Groups" | 1035 | bool "HugeTLB Resource Controller for Control Groups" |
1051 | depends on RESOURCE_COUNTERS && HUGETLB_PAGE | 1036 | depends on HUGETLB_PAGE |
1037 | select PAGE_COUNTER | ||
1052 | default n | 1038 | default n |
1053 | help | 1039 | help |
1054 | Provides a cgroup Resource Controller for HugeTLB pages. | 1040 | Provides a cgroup Resource Controller for HugeTLB pages. |
@@ -1294,6 +1280,22 @@ source "usr/Kconfig" | |||
1294 | 1280 | ||
1295 | endif | 1281 | endif |
1296 | 1282 | ||
1283 | config INIT_FALLBACK | ||
1284 | bool "Fall back to defaults if init= parameter is bad" | ||
1285 | default y | ||
1286 | help | ||
1287 | If enabled, the kernel will try the default init binaries if an | ||
1288 | explicit request from the init= parameter fails. | ||
1289 | |||
1290 | This can have unexpected effects. For example, booting | ||
1291 | with init=/sbin/kiosk_app will run /sbin/init or even /bin/sh | ||
1292 | if /sbin/kiosk_app cannot be executed. | ||
1293 | |||
1294 | The default value of Y is consistent with historical behavior. | ||
1295 | Selecting N is likely to be more appropriate for most uses, | ||
1296 | especially on kiosks and on kernels that are intended to be | ||
1297 | run under the control of a script. | ||
1298 | |||
1297 | config CC_OPTIMIZE_FOR_SIZE | 1299 | config CC_OPTIMIZE_FOR_SIZE |
1298 | bool "Optimize for size" | 1300 | bool "Optimize for size" |
1299 | help | 1301 | help |
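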
diff --git a/init/main.c b/init/main.c index 321d0ceb26d3..ca380ec685de 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -51,7 +51,6 @@ | |||
51 | #include <linux/mempolicy.h> | 51 | #include <linux/mempolicy.h> |
52 | #include <linux/key.h> | 52 | #include <linux/key.h> |
53 | #include <linux/buffer_head.h> | 53 | #include <linux/buffer_head.h> |
54 | #include <linux/page_cgroup.h> | ||
55 | #include <linux/debug_locks.h> | 54 | #include <linux/debug_locks.h> |
56 | #include <linux/debugobjects.h> | 55 | #include <linux/debugobjects.h> |
57 | #include <linux/lockdep.h> | 56 | #include <linux/lockdep.h> |
@@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void) | |||
485 | */ | 484 | */ |
486 | static void __init mm_init(void) | 485 | static void __init mm_init(void) |
487 | { | 486 | { |
488 | /* | ||
489 | * page_cgroup requires contiguous pages, | ||
490 | * bigger than MAX_ORDER unless SPARSEMEM. | ||
491 | */ | ||
492 | page_cgroup_init_flatmem(); | ||
493 | mem_init(); | 487 | mem_init(); |
494 | kmem_cache_init(); | 488 | kmem_cache_init(); |
495 | percpu_init_late(); | 489 | percpu_init_late(); |
@@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void) | |||
627 | initrd_start = 0; | 621 | initrd_start = 0; |
628 | } | 622 | } |
629 | #endif | 623 | #endif |
630 | page_cgroup_init(); | ||
631 | debug_objects_mem_init(); | 624 | debug_objects_mem_init(); |
632 | kmemleak_init(); | 625 | kmemleak_init(); |
633 | setup_per_cpu_pageset(); | 626 | setup_per_cpu_pageset(); |
@@ -959,8 +952,13 @@ static int __ref kernel_init(void *unused) | |||
959 | ret = run_init_process(execute_command); | 952 | ret = run_init_process(execute_command); |
960 | if (!ret) | 953 | if (!ret) |
961 | return 0; | 954 | return 0; |
955 | #ifndef CONFIG_INIT_FALLBACK | ||
956 | panic("Requested init %s failed (error %d).", | ||
957 | execute_command, ret); | ||
958 | #else | ||
962 | pr_err("Failed to execute %s (error %d). Attempting defaults...\n", | 959 | pr_err("Failed to execute %s (error %d). Attempting defaults...\n", |
963 | execute_command, ret); | 960 | execute_command, ret); |
961 | #endif | ||
964 | } | 962 | } |
965 | if (!try_to_run_init_process("/sbin/init") || | 963 | if (!try_to_run_init_process("/sbin/init") || |
966 | !try_to_run_init_process("/etc/init") || | 964 | !try_to_run_init_process("/etc/init") || |
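
Taken together with the INIT_FALLBACK option above, the tail of kernel_init() now behaves as follows; this is only the resulting control flow restated for readability, not new code:

    /* Paraphrase of kernel_init() after this patch: a failing init=
     * either panics (CONFIG_INIT_FALLBACK=n) or falls back to the
     * historical defaults. */
    if (execute_command) {
            ret = run_init_process(execute_command);
            if (!ret)
                    return 0;
    #ifndef CONFIG_INIT_FALLBACK
            panic("Requested init %s failed (error %d).", execute_command, ret);
    #else
            pr_err("Failed to execute %s (error %d). Attempting defaults...\n",
                   execute_command, ret);
    #endif
    }
    if (!try_to_run_init_process("/sbin/init") ||
        !try_to_run_init_process("/etc/init") /* ... remaining defaults elided ... */)
            return 0;
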
diff --git a/kernel/Makefile b/kernel/Makefile index 17ea6d4a9a24..a59481a3fa6c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o | |||
57 | obj-$(CONFIG_USER_NS) += user_namespace.o | 57 | obj-$(CONFIG_USER_NS) += user_namespace.o |
58 | obj-$(CONFIG_PID_NS) += pid_namespace.o | 58 | obj-$(CONFIG_PID_NS) += pid_namespace.o |
59 | obj-$(CONFIG_IKCONFIG) += configs.o | 59 | obj-$(CONFIG_IKCONFIG) += configs.o |
60 | obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o | ||
61 | obj-$(CONFIG_SMP) += stop_machine.o | 60 | obj-$(CONFIG_SMP) += stop_machine.o |
62 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | 61 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o |
63 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 62 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
diff --git a/kernel/exit.c b/kernel/exit.c index 232c4bc8bcc9..8714e5ded8b4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk) | |||
118 | } | 118 | } |
119 | 119 | ||
120 | /* | 120 | /* |
121 | * Accumulate here the counters for all threads but the group leader | 121 | * Accumulate here the counters for all threads as they die. We could |
122 | * as they die, so they can be added into the process-wide totals | 122 | * skip the group leader because it is the last user of signal_struct, |
123 | * when those are taken. The group leader stays around as a zombie as | 123 | * but we want to avoid the race with thread_group_cputime() which can |
124 | * long as there are other threads. When it gets reaped, the exit.c | 124 | * see the empty ->thread_head list. |
125 | * code will add its counts into these totals. We won't ever get here | ||
126 | * for the group leader, since it will have been the last reference on | ||
127 | * the signal_struct. | ||
128 | */ | 125 | */ |
129 | task_cputime(tsk, &utime, &stime); | 126 | task_cputime(tsk, &utime, &stime); |
130 | write_seqlock(&sig->stats_lock); | 127 | write_seqlock(&sig->stats_lock); |
@@ -462,6 +459,44 @@ static void exit_mm(struct task_struct *tsk) | |||
462 | clear_thread_flag(TIF_MEMDIE); | 459 | clear_thread_flag(TIF_MEMDIE); |
463 | } | 460 | } |
464 | 461 | ||
462 | static struct task_struct *find_alive_thread(struct task_struct *p) | ||
463 | { | ||
464 | struct task_struct *t; | ||
465 | |||
466 | for_each_thread(p, t) { | ||
467 | if (!(t->flags & PF_EXITING)) | ||
468 | return t; | ||
469 | } | ||
470 | return NULL; | ||
471 | } | ||
472 | |||
473 | static struct task_struct *find_child_reaper(struct task_struct *father) | ||
474 | __releases(&tasklist_lock) | ||
475 | __acquires(&tasklist_lock) | ||
476 | { | ||
477 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | ||
478 | struct task_struct *reaper = pid_ns->child_reaper; | ||
479 | |||
480 | if (likely(reaper != father)) | ||
481 | return reaper; | ||
482 | |||
483 | reaper = find_alive_thread(father); | ||
484 | if (reaper) { | ||
485 | pid_ns->child_reaper = reaper; | ||
486 | return reaper; | ||
487 | } | ||
488 | |||
489 | write_unlock_irq(&tasklist_lock); | ||
490 | if (unlikely(pid_ns == &init_pid_ns)) { | ||
491 | panic("Attempted to kill init! exitcode=0x%08x\n", | ||
492 | father->signal->group_exit_code ?: father->exit_code); | ||
493 | } | ||
494 | zap_pid_ns_processes(pid_ns); | ||
495 | write_lock_irq(&tasklist_lock); | ||
496 | |||
497 | return father; | ||
498 | } | ||
499 | |||
465 | /* | 500 | /* |
466 | * When we die, we re-parent all our children, and try to: | 501 | * When we die, we re-parent all our children, and try to: |
467 | * 1. give them to another thread in our thread group, if such a member exists | 502 | * 1. give them to another thread in our thread group, if such a member exists |
@@ -469,58 +504,36 @@ static void exit_mm(struct task_struct *tsk) | |||
469 | * child_subreaper for its children (like a service manager) | 504 | * child_subreaper for its children (like a service manager) |
470 | * 3. give it to the init process (PID 1) in our pid namespace | 505 | * 3. give it to the init process (PID 1) in our pid namespace |
471 | */ | 506 | */ |
472 | static struct task_struct *find_new_reaper(struct task_struct *father) | 507 | static struct task_struct *find_new_reaper(struct task_struct *father, |
473 | __releases(&tasklist_lock) | 508 | struct task_struct *child_reaper) |
474 | __acquires(&tasklist_lock) | ||
475 | { | 509 | { |
476 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 510 | struct task_struct *thread, *reaper; |
477 | struct task_struct *thread; | ||
478 | 511 | ||
479 | thread = father; | 512 | thread = find_alive_thread(father); |
480 | while_each_thread(father, thread) { | 513 | if (thread) |
481 | if (thread->flags & PF_EXITING) | ||
482 | continue; | ||
483 | if (unlikely(pid_ns->child_reaper == father)) | ||
484 | pid_ns->child_reaper = thread; | ||
485 | return thread; | 514 | return thread; |
486 | } | ||
487 | |||
488 | if (unlikely(pid_ns->child_reaper == father)) { | ||
489 | write_unlock_irq(&tasklist_lock); | ||
490 | if (unlikely(pid_ns == &init_pid_ns)) { | ||
491 | panic("Attempted to kill init! exitcode=0x%08x\n", | ||
492 | father->signal->group_exit_code ?: | ||
493 | father->exit_code); | ||
494 | } | ||
495 | |||
496 | zap_pid_ns_processes(pid_ns); | ||
497 | write_lock_irq(&tasklist_lock); | ||
498 | } else if (father->signal->has_child_subreaper) { | ||
499 | struct task_struct *reaper; | ||
500 | 515 | ||
516 | if (father->signal->has_child_subreaper) { | ||
501 | /* | 517 | /* |
502 | * Find the first ancestor marked as child_subreaper. | 518 | * Find the first ->is_child_subreaper ancestor in our pid_ns. |
503 | * Note that the code below checks same_thread_group(reaper, | 519 | * We start from father to ensure we can not look into another |
504 | * pid_ns->child_reaper). This is what we need to DTRT in a | 520 | * namespace, this is safe because all its threads are dead. |
505 | * PID namespace. However we still need the check above, see | ||
506 | * http://marc.info/?l=linux-kernel&m=131385460420380 | ||
507 | */ | 521 | */ |
508 | for (reaper = father->real_parent; | 522 | for (reaper = father; |
509 | reaper != &init_task; | 523 | !same_thread_group(reaper, child_reaper); |
510 | reaper = reaper->real_parent) { | 524 | reaper = reaper->real_parent) { |
511 | if (same_thread_group(reaper, pid_ns->child_reaper)) | 525 | /* call_usermodehelper() descendants need this check */ |
526 | if (reaper == &init_task) | ||
512 | break; | 527 | break; |
513 | if (!reaper->signal->is_child_subreaper) | 528 | if (!reaper->signal->is_child_subreaper) |
514 | continue; | 529 | continue; |
515 | thread = reaper; | 530 | thread = find_alive_thread(reaper); |
516 | do { | 531 | if (thread) |
517 | if (!(thread->flags & PF_EXITING)) | 532 | return thread; |
518 | return reaper; | ||
519 | } while_each_thread(reaper, thread); | ||
520 | } | 533 | } |
521 | } | 534 | } |
522 | 535 | ||
523 | return pid_ns->child_reaper; | 536 | return child_reaper; |
524 | } | 537 | } |
525 | 538 | ||
526 | /* | 539 | /* |
@@ -529,15 +542,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
529 | static void reparent_leader(struct task_struct *father, struct task_struct *p, | 542 | static void reparent_leader(struct task_struct *father, struct task_struct *p, |
530 | struct list_head *dead) | 543 | struct list_head *dead) |
531 | { | 544 | { |
532 | list_move_tail(&p->sibling, &p->real_parent->children); | 545 | if (unlikely(p->exit_state == EXIT_DEAD)) |
533 | |||
534 | if (p->exit_state == EXIT_DEAD) | ||
535 | return; | ||
536 | /* | ||
537 | * If this is a threaded reparent there is no need to | ||
538 | * notify anyone anything has happened. | ||
539 | */ | ||
540 | if (same_thread_group(p->real_parent, father)) | ||
541 | return; | 546 | return; |
542 | 547 | ||
543 | /* We don't want people slaying init. */ | 548 | /* We don't want people slaying init. */ |
@@ -548,49 +553,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, | |||
548 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { | 553 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { |
549 | if (do_notify_parent(p, p->exit_signal)) { | 554 | if (do_notify_parent(p, p->exit_signal)) { |
550 | p->exit_state = EXIT_DEAD; | 555 | p->exit_state = EXIT_DEAD; |
551 | list_move_tail(&p->sibling, dead); | 556 | list_add(&p->ptrace_entry, dead); |
552 | } | 557 | } |
553 | } | 558 | } |
554 | 559 | ||
555 | kill_orphaned_pgrp(p, father); | 560 | kill_orphaned_pgrp(p, father); |
556 | } | 561 | } |
557 | 562 | ||
558 | static void forget_original_parent(struct task_struct *father) | 563 | /* |
564 | * This does two things: | ||
565 | * | ||
566 | * A. Make init inherit all the child processes | ||
567 | * B. Check to see if any process groups have become orphaned | ||
568 | * as a result of our exiting, and if they have any stopped | ||
569 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
570 | */ | ||
571 | static void forget_original_parent(struct task_struct *father, | ||
572 | struct list_head *dead) | ||
559 | { | 573 | { |
560 | struct task_struct *p, *n, *reaper; | 574 | struct task_struct *p, *t, *reaper; |
561 | LIST_HEAD(dead_children); | ||
562 | 575 | ||
563 | write_lock_irq(&tasklist_lock); | 576 | if (unlikely(!list_empty(&father->ptraced))) |
564 | /* | 577 | exit_ptrace(father, dead); |
565 | * Note that exit_ptrace() and find_new_reaper() might | ||
566 | * drop tasklist_lock and reacquire it. | ||
567 | */ | ||
568 | exit_ptrace(father); | ||
569 | reaper = find_new_reaper(father); | ||
570 | 578 | ||
571 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 579 | /* Can drop and reacquire tasklist_lock */ |
572 | struct task_struct *t = p; | 580 | reaper = find_child_reaper(father); |
581 | if (list_empty(&father->children)) | ||
582 | return; | ||
573 | 583 | ||
574 | do { | 584 | reaper = find_new_reaper(father, reaper); |
585 | list_for_each_entry(p, &father->children, sibling) { | ||
586 | for_each_thread(p, t) { | ||
575 | t->real_parent = reaper; | 587 | t->real_parent = reaper; |
576 | if (t->parent == father) { | 588 | BUG_ON((!t->ptrace) != (t->parent == father)); |
577 | BUG_ON(t->ptrace); | 589 | if (likely(!t->ptrace)) |
578 | t->parent = t->real_parent; | 590 | t->parent = t->real_parent; |
579 | } | ||
580 | if (t->pdeath_signal) | 591 | if (t->pdeath_signal) |
581 | group_send_sig_info(t->pdeath_signal, | 592 | group_send_sig_info(t->pdeath_signal, |
582 | SEND_SIG_NOINFO, t); | 593 | SEND_SIG_NOINFO, t); |
583 | } while_each_thread(p, t); | 594 | } |
584 | reparent_leader(father, p, &dead_children); | 595 | /* |
585 | } | 596 | * If this is a threaded reparent there is no need to |
586 | write_unlock_irq(&tasklist_lock); | 597 | * notify anyone anything has happened. |
587 | 598 | */ | |
588 | BUG_ON(!list_empty(&father->children)); | 599 | if (!same_thread_group(reaper, father)) |
589 | 600 | reparent_leader(father, p, dead); | |
590 | list_for_each_entry_safe(p, n, &dead_children, sibling) { | ||
591 | list_del_init(&p->sibling); | ||
592 | release_task(p); | ||
593 | } | 601 | } |
602 | list_splice_tail_init(&father->children, &reaper->children); | ||
594 | } | 603 | } |
595 | 604 | ||
596 | /* | 605 | /* |
@@ -600,18 +609,12 @@ static void forget_original_parent(struct task_struct *father) | |||
600 | static void exit_notify(struct task_struct *tsk, int group_dead) | 609 | static void exit_notify(struct task_struct *tsk, int group_dead) |
601 | { | 610 | { |
602 | bool autoreap; | 611 | bool autoreap; |
603 | 612 | struct task_struct *p, *n; | |
604 | /* | 613 | LIST_HEAD(dead); |
605 | * This does two things: | ||
606 | * | ||
607 | * A. Make init inherit all the child processes | ||
608 | * B. Check to see if any process groups have become orphaned | ||
609 | * as a result of our exiting, and if they have any stopped | ||
610 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
611 | */ | ||
612 | forget_original_parent(tsk); | ||
613 | 614 | ||
614 | write_lock_irq(&tasklist_lock); | 615 | write_lock_irq(&tasklist_lock); |
616 | forget_original_parent(tsk, &dead); | ||
617 | |||
615 | if (group_dead) | 618 | if (group_dead) |
616 | kill_orphaned_pgrp(tsk->group_leader, NULL); | 619 | kill_orphaned_pgrp(tsk->group_leader, NULL); |
617 | 620 | ||
@@ -629,15 +632,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
629 | } | 632 | } |
630 | 633 | ||
631 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; | 634 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; |
635 | if (tsk->exit_state == EXIT_DEAD) | ||
636 | list_add(&tsk->ptrace_entry, &dead); | ||
632 | 637 | ||
633 | /* mt-exec, de_thread() is waiting for group leader */ | 638 | /* mt-exec, de_thread() is waiting for group leader */ |
634 | if (unlikely(tsk->signal->notify_count < 0)) | 639 | if (unlikely(tsk->signal->notify_count < 0)) |
635 | wake_up_process(tsk->signal->group_exit_task); | 640 | wake_up_process(tsk->signal->group_exit_task); |
636 | write_unlock_irq(&tasklist_lock); | 641 | write_unlock_irq(&tasklist_lock); |
637 | 642 | ||
638 | /* If the process is dead, release it - nobody will wait for it */ | 643 | list_for_each_entry_safe(p, n, &dead, ptrace_entry) { |
639 | if (autoreap) | 644 | list_del_init(&p->ptrace_entry); |
640 | release_task(tsk); | 645 | release_task(p); |
646 | } | ||
641 | } | 647 | } |
642 | 648 | ||
643 | #ifdef CONFIG_DEBUG_STACK_USAGE | 649 | #ifdef CONFIG_DEBUG_STACK_USAGE |
@@ -982,8 +988,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | |||
982 | */ | 988 | */ |
983 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | 989 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) |
984 | { | 990 | { |
985 | unsigned long state; | 991 | int state, retval, status; |
986 | int retval, status, traced; | ||
987 | pid_t pid = task_pid_vnr(p); | 992 | pid_t pid = task_pid_vnr(p); |
988 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); | 993 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
989 | struct siginfo __user *infop; | 994 | struct siginfo __user *infop; |
@@ -1008,21 +1013,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1008 | } | 1013 | } |
1009 | return wait_noreap_copyout(wo, p, pid, uid, why, status); | 1014 | return wait_noreap_copyout(wo, p, pid, uid, why, status); |
1010 | } | 1015 | } |
1011 | |||
1012 | traced = ptrace_reparented(p); | ||
1013 | /* | 1016 | /* |
1014 | * Move the task's state to DEAD/TRACE, only one thread can do this. | 1017 | * Move the task's state to DEAD/TRACE, only one thread can do this. |
1015 | */ | 1018 | */ |
1016 | state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; | 1019 | state = (ptrace_reparented(p) && thread_group_leader(p)) ? |
1020 | EXIT_TRACE : EXIT_DEAD; | ||
1017 | if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) | 1021 | if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) |
1018 | return 0; | 1022 | return 0; |
1019 | /* | 1023 | /* |
1020 | * It can be ptraced but not reparented, check | 1024 | * We own this thread, nobody else can reap it. |
1021 | * thread_group_leader() to filter out sub-threads. | ||
1022 | */ | 1025 | */ |
1023 | if (likely(!traced) && thread_group_leader(p)) { | 1026 | read_unlock(&tasklist_lock); |
1024 | struct signal_struct *psig; | 1027 | sched_annotate_sleep(); |
1025 | struct signal_struct *sig; | 1028 | |
1029 | /* | ||
1030 | * Check thread_group_leader() to exclude the traced sub-threads. | ||
1031 | */ | ||
1032 | if (state == EXIT_DEAD && thread_group_leader(p)) { | ||
1033 | struct signal_struct *sig = p->signal; | ||
1034 | struct signal_struct *psig = current->signal; | ||
1026 | unsigned long maxrss; | 1035 | unsigned long maxrss; |
1027 | cputime_t tgutime, tgstime; | 1036 | cputime_t tgutime, tgstime; |
1028 | 1037 | ||
@@ -1034,21 +1043,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1034 | * accumulate in the parent's signal_struct c* fields. | 1043 | * accumulate in the parent's signal_struct c* fields. |
1035 | * | 1044 | * |
1036 | * We don't bother to take a lock here to protect these | 1045 | * We don't bother to take a lock here to protect these |
1037 | * p->signal fields, because they are only touched by | 1046 | * p->signal fields because the whole thread group is dead |
1038 | * __exit_signal, which runs with tasklist_lock | 1047 | * and nobody can change them. |
1039 | * write-locked anyway, and so is excluded here. We do | 1048 | * |
1040 | * need to protect the access to parent->signal fields, | 1049 | * psig->stats_lock also protects us from our sub-threads
1041 | * as other threads in the parent group can be right | 1050 | * which can reap other children at the same time. Until |
1042 | * here reaping other children at the same time. | 1051 | * we change k_getrusage()-like users to rely on this lock |
1052 | * we have to take ->siglock as well. | ||
1043 | * | 1053 | * |
1044 | * We use thread_group_cputime_adjusted() to get times for | 1054 | * We use thread_group_cputime_adjusted() to get times for |
1045 | * the thread group, which consolidates times for all threads | 1055 | * the thread group, which consolidates times for all threads |
1046 | * in the group including the group leader. | 1056 | * in the group including the group leader. |
1047 | */ | 1057 | */ |
1048 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); | 1058 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1049 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1059 | spin_lock_irq(¤t->sighand->siglock); |
1050 | psig = p->real_parent->signal; | ||
1051 | sig = p->signal; | ||
1052 | write_seqlock(&psig->stats_lock); | 1060 | write_seqlock(&psig->stats_lock); |
1053 | psig->cutime += tgutime + sig->cutime; | 1061 | psig->cutime += tgutime + sig->cutime; |
1054 | psig->cstime += tgstime + sig->cstime; | 1062 | psig->cstime += tgstime + sig->cstime; |
@@ -1073,16 +1081,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1073 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1081 | task_io_accounting_add(&psig->ioac, &p->ioac); |
1074 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1082 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
1075 | write_sequnlock(&psig->stats_lock); | 1083 | write_sequnlock(&psig->stats_lock); |
1076 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1084 | spin_unlock_irq(¤t->sighand->siglock); |
1077 | } | 1085 | } |
1078 | 1086 | ||
1079 | /* | ||
1080 | * Now we are sure this task is interesting, and no other | ||
1081 | * thread can reap it because we its state == DEAD/TRACE. | ||
1082 | */ | ||
1083 | read_unlock(&tasklist_lock); | ||
1084 | sched_annotate_sleep(); | ||
1085 | |||
1086 | retval = wo->wo_rusage | 1087 | retval = wo->wo_rusage |
1087 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1088 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1088 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1089 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
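
The reworked exit path above consistently uses one pattern: collect everything that will need release_task() on a local list (linked through ->ptrace_entry) while tasklist_lock is held, and reap it only after the lock is dropped, since release_task() takes tasklist_lock itself. A condensed sketch of that pattern, with the names taken from the hunk above:

    LIST_HEAD(dead);
    struct task_struct *p, *n;

    write_lock_irq(&tasklist_lock);
    forget_original_parent(tsk, &dead);     /* may queue children on &dead */
    /* ... notification, tsk->exit_state update ... */
    if (tsk->exit_state == EXIT_DEAD)
            list_add(&tsk->ptrace_entry, &dead);
    write_unlock_irq(&tasklist_lock);

    list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
            list_del_init(&p->ptrace_entry);
            release_task(p);        /* needs tasklist_lock, so run it here */
    }
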
diff --git a/kernel/kmod.c b/kernel/kmod.c index 80f7a6d00519..2777f40a9c7b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -47,13 +47,6 @@ extern int max_threads; | |||
47 | 47 | ||
48 | static struct workqueue_struct *khelper_wq; | 48 | static struct workqueue_struct *khelper_wq; |
49 | 49 | ||
50 | /* | ||
51 | * kmod_thread_locker is used for deadlock avoidance. There is no explicit | ||
52 | * locking to protect this global - it is private to the singleton khelper | ||
53 | * thread and should only ever be modified by that thread. | ||
54 | */ | ||
55 | static const struct task_struct *kmod_thread_locker; | ||
56 | |||
57 | #define CAP_BSET (void *)1 | 50 | #define CAP_BSET (void *)1 |
58 | #define CAP_PI (void *)2 | 51 | #define CAP_PI (void *)2 |
59 | 52 | ||
@@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info) | |||
223 | static int ____call_usermodehelper(void *data) | 216 | static int ____call_usermodehelper(void *data) |
224 | { | 217 | { |
225 | struct subprocess_info *sub_info = data; | 218 | struct subprocess_info *sub_info = data; |
226 | int wait = sub_info->wait & ~UMH_KILLABLE; | ||
227 | struct cred *new; | 219 | struct cred *new; |
228 | int retval; | 220 | int retval; |
229 | 221 | ||
@@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data) | |||
267 | out: | 259 | out: |
268 | sub_info->retval = retval; | 260 | sub_info->retval = retval; |
269 | /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ | 261 | /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ |
270 | if (wait != UMH_WAIT_PROC) | 262 | if (!(sub_info->wait & UMH_WAIT_PROC)) |
271 | umh_complete(sub_info); | 263 | umh_complete(sub_info); |
272 | if (!retval) | 264 | if (!retval) |
273 | return 0; | 265 | return 0; |
274 | do_exit(0); | 266 | do_exit(0); |
275 | } | 267 | } |
276 | 268 | ||
277 | static int call_helper(void *data) | ||
278 | { | ||
279 | /* Worker thread started blocking khelper thread. */ | ||
280 | kmod_thread_locker = current; | ||
281 | return ____call_usermodehelper(data); | ||
282 | } | ||
283 | |||
284 | /* Keventd can't block, but this (a child) can. */ | 269 | /* Keventd can't block, but this (a child) can. */ |
285 | static int wait_for_helper(void *data) | 270 | static int wait_for_helper(void *data) |
286 | { | 271 | { |
@@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work) | |||
323 | { | 308 | { |
324 | struct subprocess_info *sub_info = | 309 | struct subprocess_info *sub_info = |
325 | container_of(work, struct subprocess_info, work); | 310 | container_of(work, struct subprocess_info, work); |
326 | int wait = sub_info->wait & ~UMH_KILLABLE; | ||
327 | pid_t pid; | 311 | pid_t pid; |
328 | 312 | ||
329 | /* CLONE_VFORK: wait until the usermode helper has execve'd | 313 | if (sub_info->wait & UMH_WAIT_PROC) |
330 | * successfully We need the data structures to stay around | ||
331 | * until that is done. */ | ||
332 | if (wait == UMH_WAIT_PROC) | ||
333 | pid = kernel_thread(wait_for_helper, sub_info, | 314 | pid = kernel_thread(wait_for_helper, sub_info, |
334 | CLONE_FS | CLONE_FILES | SIGCHLD); | 315 | CLONE_FS | CLONE_FILES | SIGCHLD); |
335 | else { | 316 | else |
336 | pid = kernel_thread(call_helper, sub_info, | 317 | pid = kernel_thread(____call_usermodehelper, sub_info, |
337 | CLONE_VFORK | SIGCHLD); | 318 | SIGCHLD); |
338 | /* Worker thread stopped blocking khelper thread. */ | ||
339 | kmod_thread_locker = NULL; | ||
340 | } | ||
341 | 319 | ||
342 | if (pid < 0) { | 320 | if (pid < 0) { |
343 | sub_info->retval = pid; | 321 | sub_info->retval = pid; |
@@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
571 | goto out; | 549 | goto out; |
572 | } | 550 | } |
573 | /* | 551 | /* |
574 | * Worker thread must not wait for khelper thread at below | ||
575 | * wait_for_completion() if the thread was created with CLONE_VFORK | ||
576 | * flag, for khelper thread is already waiting for the thread at | ||
577 | * wait_for_completion() in do_fork(). | ||
578 | */ | ||
579 | if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { | ||
580 | retval = -EBUSY; | ||
581 | goto out; | ||
582 | } | ||
583 | |||
584 | /* | ||
585 | * Set the completion pointer only if there is a waiter. | 552 | * Set the completion pointer only if there is a waiter. |
586 | * This makes it possible to use umh_complete to free | 553 | * This makes it possible to use umh_complete to free |
587 | * the data structure in case of UMH_NO_WAIT. | 554 | * the data structure in case of UMH_NO_WAIT. |
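
Dropping the `wait != UMH_WAIT_PROC` comparison in favour of testing the bit works because UMH_WAIT_PROC occupies its own bit and is never combined with UMH_WAIT_EXEC. A tiny userspace sanity check of that equivalence; the constant values are quoted from memory of include/linux/kmod.h and should be treated as an assumption:

    #include <assert.h>

    #define UMH_NO_WAIT   0   /* assumed values, see include/linux/kmod.h */
    #define UMH_WAIT_EXEC 1
    #define UMH_WAIT_PROC 2
    #define UMH_KILLABLE  4

    int main(void)
    {
            int waits[] = { UMH_NO_WAIT, UMH_WAIT_EXEC,
                            UMH_WAIT_EXEC | UMH_KILLABLE,
                            UMH_WAIT_PROC, UMH_WAIT_PROC | UMH_KILLABLE };

            for (unsigned int i = 0; i < sizeof(waits) / sizeof(waits[0]); i++) {
                    int w = waits[i];

                    /* old test == new test for every mode actually used */
                    assert(((w & ~UMH_KILLABLE) == UMH_WAIT_PROC) ==
                           !!(w & UMH_WAIT_PROC));
            }
            return 0;
    }
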
diff --git a/kernel/panic.c b/kernel/panic.c index cf80672b7924..4d8d6f906dec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -33,6 +33,7 @@ static int pause_on_oops; | |||
33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
35 | static bool crash_kexec_post_notifiers; | 35 | static bool crash_kexec_post_notifiers; |
36 | int panic_on_warn __read_mostly; | ||
36 | 37 | ||
37 | int panic_timeout = CONFIG_PANIC_TIMEOUT; | 38 | int panic_timeout = CONFIG_PANIC_TIMEOUT; |
38 | EXPORT_SYMBOL_GPL(panic_timeout); | 39 | EXPORT_SYMBOL_GPL(panic_timeout); |
@@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller, | |||
428 | if (args) | 429 | if (args) |
429 | vprintk(args->fmt, args->args); | 430 | vprintk(args->fmt, args->args); |
430 | 431 | ||
432 | if (panic_on_warn) { | ||
433 | /* | ||
434 | * This thread may hit another WARN() in the panic path. | ||
435 | * Resetting this prevents additional WARN() from panicking the | ||
436 | * system on this thread. Other threads are blocked by the | ||
437 | * panic_mutex in panic(). | ||
438 | */ | ||
439 | panic_on_warn = 0; | ||
440 | panic("panic_on_warn set ...\n"); | ||
441 | } | ||
442 | |||
431 | print_modules(); | 443 | print_modules(); |
432 | dump_stack(); | 444 | dump_stack(); |
433 | print_oops_end_marker(); | 445 | print_oops_end_marker(); |
@@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail); | |||
485 | 497 | ||
486 | core_param(panic, panic_timeout, int, 0644); | 498 | core_param(panic, panic_timeout, int, 0644); |
487 | core_param(pause_on_oops, pause_on_oops, int, 0644); | 499 | core_param(pause_on_oops, pause_on_oops, int, 0644); |
500 | core_param(panic_on_warn, panic_on_warn, int, 0644); | ||
488 | 501 | ||
489 | static int __init setup_crash_kexec_post_notifiers(char *s) | 502 | static int __init setup_crash_kexec_post_notifiers(char *s) |
490 | { | 503 | { |
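
panic_on_warn is cleared before calling panic() so that a second WARN() hit on the panic path cannot recurse; it can be set at boot via the panic_on_warn= parameter registered above (and, judging by the KERN_PANIC_ON_WARN entry added to sysctl.h earlier in this series, presumably via a matching sysctl as well). A small illustration of what the option means for ordinary warning sites; the function below is hypothetical, only WARN_ON_ONCE() itself is real:

    #include <linux/bug.h>
    #include <linux/errno.h>

    static int example_claim_resource(void *res)
    {
            /* with panic_on_warn=1 this warning becomes a panic() */
            if (WARN_ON_ONCE(!res))
                    return -EINVAL;
            return 0;
    }
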
diff --git a/kernel/pid.c b/kernel/pid.c index 9b9a26698144..82430c858d69 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -341,6 +341,8 @@ out: | |||
341 | 341 | ||
342 | out_unlock: | 342 | out_unlock: |
343 | spin_unlock_irq(&pidmap_lock); | 343 | spin_unlock_irq(&pidmap_lock); |
344 | put_pid_ns(ns); | ||
345 | |||
344 | out_free: | 346 | out_free: |
345 | while (++i <= ns->level) | 347 | while (++i <= ns->level) |
346 | free_pidmap(pid->numbers + i); | 348 | free_pidmap(pid->numbers + i); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index db95d8eb761b..bc6d6a89b6e6 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -190,7 +190,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
190 | /* Don't allow any more processes into the pid namespace */ | 190 | /* Don't allow any more processes into the pid namespace */ |
191 | disable_pid_allocation(pid_ns); | 191 | disable_pid_allocation(pid_ns); |
192 | 192 | ||
193 | /* Ignore SIGCHLD causing any terminated children to autoreap */ | 193 | /* |
194 | * Ignore SIGCHLD causing any terminated children to autoreap. | ||
195 | * This speeds up the namespace shutdown, plus see the comment | ||
196 | * below. | ||
197 | */ | ||
194 | spin_lock_irq(&me->sighand->siglock); | 198 | spin_lock_irq(&me->sighand->siglock); |
195 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; | 199 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; |
196 | spin_unlock_irq(&me->sighand->siglock); | 200 | spin_unlock_irq(&me->sighand->siglock); |
@@ -223,15 +227,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
223 | } | 227 | } |
224 | read_unlock(&tasklist_lock); | 228 | read_unlock(&tasklist_lock); |
225 | 229 | ||
226 | /* Firstly reap the EXIT_ZOMBIE children we may have. */ | 230 | /* |
231 | * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. | ||
232 | * sys_wait4() will also block until our children traced from the | ||
233 | * parent namespace are detached and become EXIT_DEAD. | ||
234 | */ | ||
227 | do { | 235 | do { |
228 | clear_thread_flag(TIF_SIGPENDING); | 236 | clear_thread_flag(TIF_SIGPENDING); |
229 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 237 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
230 | } while (rc != -ECHILD); | 238 | } while (rc != -ECHILD); |
231 | 239 | ||
232 | /* | 240 | /* |
233 | * sys_wait4() above can't reap the TASK_DEAD children. | 241 | * sys_wait4() above can't reap the EXIT_DEAD children but we do not |
234 | * Make sure they all go away, see free_pid(). | 242 | * really care, we could reparent them to the global init. We could |
243 | * exit and reap ->child_reaper even if it is not the last thread in | ||
244 | * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), | ||
245 | * pid_ns can not go away until proc_kill_sb() drops the reference. | ||
246 | * | ||
247 | * But this ns can also have other tasks injected by setns()+fork(). | ||
248 | * Again, ignoring the user visible semantics we do not really need | ||
249 | * to wait until they are all reaped, but they can be reparented to | ||
250 | * us and thus we need to ensure that pid->child_reaper stays valid | ||
251 | * until they all go away. See free_pid()->wake_up_process(). | ||
252 | * | ||
253 | * We rely on ignored SIGCHLD, an injected zombie must be autoreaped | ||
254 | * if reparented. | ||
235 | */ | 255 | */ |
236 | for (;;) { | 256 | for (;;) { |
237 | set_current_state(TASK_UNINTERRUPTIBLE); | 257 | set_current_state(TASK_UNINTERRUPTIBLE); |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c8755e7e1dba..ea27c019655a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -62,9 +62,6 @@ int console_printk[4] = { | |||
62 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ | 62 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ |
63 | }; | 63 | }; |
64 | 64 | ||
65 | /* Deferred messages from sched code are marked by this special level */ | ||
66 | #define SCHED_MESSAGE_LOGLEVEL -2 | ||
67 | |||
68 | /* | 65 | /* |
69 | * Low level drivers may need that to know if they can schedule in | 66 | * Low level drivers may need that to know if they can schedule in |
70 | * their unblank() callback or not. So let's export it. | 67 | * their unblank() callback or not. So let's export it. |
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
1259 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 1256 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
1260 | { | 1257 | { |
1261 | bool clear = false; | 1258 | bool clear = false; |
1262 | static int saved_console_loglevel = -1; | 1259 | static int saved_console_loglevel = LOGLEVEL_DEFAULT; |
1263 | int error; | 1260 | int error; |
1264 | 1261 | ||
1265 | error = check_syslog_permissions(type, from_file); | 1262 | error = check_syslog_permissions(type, from_file); |
@@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1316 | break; | 1313 | break; |
1317 | /* Disable logging to console */ | 1314 | /* Disable logging to console */ |
1318 | case SYSLOG_ACTION_CONSOLE_OFF: | 1315 | case SYSLOG_ACTION_CONSOLE_OFF: |
1319 | if (saved_console_loglevel == -1) | 1316 | if (saved_console_loglevel == LOGLEVEL_DEFAULT) |
1320 | saved_console_loglevel = console_loglevel; | 1317 | saved_console_loglevel = console_loglevel; |
1321 | console_loglevel = minimum_console_loglevel; | 1318 | console_loglevel = minimum_console_loglevel; |
1322 | break; | 1319 | break; |
1323 | /* Enable logging to console */ | 1320 | /* Enable logging to console */ |
1324 | case SYSLOG_ACTION_CONSOLE_ON: | 1321 | case SYSLOG_ACTION_CONSOLE_ON: |
1325 | if (saved_console_loglevel != -1) { | 1322 | if (saved_console_loglevel != LOGLEVEL_DEFAULT) { |
1326 | console_loglevel = saved_console_loglevel; | 1323 | console_loglevel = saved_console_loglevel; |
1327 | saved_console_loglevel = -1; | 1324 | saved_console_loglevel = LOGLEVEL_DEFAULT; |
1328 | } | 1325 | } |
1329 | break; | 1326 | break; |
1330 | /* Set level of messages printed to console */ | 1327 | /* Set level of messages printed to console */ |
@@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1336 | len = minimum_console_loglevel; | 1333 | len = minimum_console_loglevel; |
1337 | console_loglevel = len; | 1334 | console_loglevel = len; |
1338 | /* Implicitly re-enable logging to console */ | 1335 | /* Implicitly re-enable logging to console */ |
1339 | saved_console_loglevel = -1; | 1336 | saved_console_loglevel = LOGLEVEL_DEFAULT; |
1340 | error = 0; | 1337 | error = 0; |
1341 | break; | 1338 | break; |
1342 | /* Number of chars in the log buffer */ | 1339 | /* Number of chars in the log buffer */ |
@@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1627 | int printed_len = 0; | 1624 | int printed_len = 0; |
1628 | bool in_sched = false; | 1625 | bool in_sched = false; |
1629 | /* cpu currently holding logbuf_lock in this function */ | 1626 | /* cpu currently holding logbuf_lock in this function */ |
1630 | static volatile unsigned int logbuf_cpu = UINT_MAX; | 1627 | static unsigned int logbuf_cpu = UINT_MAX; |
1631 | 1628 | ||
1632 | if (level == SCHED_MESSAGE_LOGLEVEL) { | 1629 | if (level == LOGLEVEL_SCHED) { |
1633 | level = -1; | 1630 | level = LOGLEVEL_DEFAULT; |
1634 | in_sched = true; | 1631 | in_sched = true; |
1635 | } | 1632 | } |
1636 | 1633 | ||
@@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1695 | const char *end_of_header = printk_skip_level(text); | 1692 | const char *end_of_header = printk_skip_level(text); |
1696 | switch (kern_level) { | 1693 | switch (kern_level) { |
1697 | case '0' ... '7': | 1694 | case '0' ... '7': |
1698 | if (level == -1) | 1695 | if (level == LOGLEVEL_DEFAULT) |
1699 | level = kern_level - '0'; | 1696 | level = kern_level - '0'; |
1697 | /* fallthrough */ | ||
1700 | case 'd': /* KERN_DEFAULT */ | 1698 | case 'd': /* KERN_DEFAULT */ |
1701 | lflags |= LOG_PREFIX; | 1699 | lflags |= LOG_PREFIX; |
1702 | } | 1700 | } |
@@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1710 | } | 1708 | } |
1711 | } | 1709 | } |
1712 | 1710 | ||
1713 | if (level == -1) | 1711 | if (level == LOGLEVEL_DEFAULT) |
1714 | level = default_message_loglevel; | 1712 | level = default_message_loglevel; |
1715 | 1713 | ||
1716 | if (dict) | 1714 | if (dict) |
@@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit); | |||
1788 | 1786 | ||
1789 | asmlinkage int vprintk(const char *fmt, va_list args) | 1787 | asmlinkage int vprintk(const char *fmt, va_list args) |
1790 | { | 1788 | { |
1791 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | 1789 | return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); |
1792 | } | 1790 | } |
1793 | EXPORT_SYMBOL(vprintk); | 1791 | EXPORT_SYMBOL(vprintk); |
1794 | 1792 | ||
@@ -1842,7 +1840,7 @@ asmlinkage __visible int printk(const char *fmt, ...) | |||
1842 | } | 1840 | } |
1843 | #endif | 1841 | #endif |
1844 | va_start(args, fmt); | 1842 | va_start(args, fmt); |
1845 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | 1843 | r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); |
1846 | va_end(args); | 1844 | va_end(args); |
1847 | 1845 | ||
1848 | return r; | 1846 | return r; |
@@ -1881,23 +1879,20 @@ static size_t cont_print_text(char *text, size_t size) { return 0; } | |||
1881 | #ifdef CONFIG_EARLY_PRINTK | 1879 | #ifdef CONFIG_EARLY_PRINTK |
1882 | struct console *early_console; | 1880 | struct console *early_console; |
1883 | 1881 | ||
1884 | void early_vprintk(const char *fmt, va_list ap) | ||
1885 | { | ||
1886 | if (early_console) { | ||
1887 | char buf[512]; | ||
1888 | int n = vscnprintf(buf, sizeof(buf), fmt, ap); | ||
1889 | |||
1890 | early_console->write(early_console, buf, n); | ||
1891 | } | ||
1892 | } | ||
1893 | |||
1894 | asmlinkage __visible void early_printk(const char *fmt, ...) | 1882 | asmlinkage __visible void early_printk(const char *fmt, ...) |
1895 | { | 1883 | { |
1896 | va_list ap; | 1884 | va_list ap; |
1885 | char buf[512]; | ||
1886 | int n; | ||
1887 | |||
1888 | if (!early_console) | ||
1889 | return; | ||
1897 | 1890 | ||
1898 | va_start(ap, fmt); | 1891 | va_start(ap, fmt); |
1899 | early_vprintk(fmt, ap); | 1892 | n = vscnprintf(buf, sizeof(buf), fmt, ap); |
1900 | va_end(ap); | 1893 | va_end(ap); |
1894 | |||
1895 | early_console->write(early_console, buf, n); | ||
1901 | } | 1896 | } |
1902 | #endif | 1897 | #endif |
1903 | 1898 | ||
@@ -2634,7 +2629,7 @@ int printk_deferred(const char *fmt, ...) | |||
2634 | 2629 | ||
2635 | preempt_disable(); | 2630 | preempt_disable(); |
2636 | va_start(args, fmt); | 2631 | va_start(args, fmt); |
2637 | r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); | 2632 | r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); |
2638 | va_end(args); | 2633 | va_end(args); |
2639 | 2634 | ||
2640 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); | 2635 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
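
The bare -1/-2 loglevel magic numbers are replaced by LOGLEVEL_DEFAULT and LOGLEVEL_SCHED; the constants are assumed to be defined elsewhere in this series (include/linux/kern_levels.h), they do not appear in this hunk. printk_deferred() is unchanged from a caller's point of view: the message gets the normal default loglevel, it is merely flushed later instead of pushing the console directly. A minimal, hypothetical caller:

    #include <linux/printk.h>

    /* Sketch: printk_deferred() is for contexts where a direct printk()
     * could deadlock, e.g. while holding scheduler locks that a console
     * driver might take again.  Callers just use the usual KERN_* levels;
     * LOGLEVEL_SCHED is an internal marker consumed by vprintk_emit(). */
    static void example_report_from_scheduler(int cpu)
    {
            printk_deferred(KERN_WARNING "cpu%d: runqueue imbalance detected\n",
                            cpu);
    }
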
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c4..1eb9d90c3af9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
485 | 485 | ||
486 | /* | 486 | /* |
487 | * Detach all tasks we were using ptrace on. Called with tasklist held | 487 | * Detach all tasks we were using ptrace on. Called with tasklist held |
488 | * for writing, and returns with it held too. But note it can release | 488 | * for writing. |
489 | * and reacquire the lock. | ||
490 | */ | 489 | */ |
491 | void exit_ptrace(struct task_struct *tracer) | 490 | void exit_ptrace(struct task_struct *tracer, struct list_head *dead) |
492 | __releases(&tasklist_lock) | ||
493 | __acquires(&tasklist_lock) | ||
494 | { | 491 | { |
495 | struct task_struct *p, *n; | 492 | struct task_struct *p, *n; |
496 | LIST_HEAD(ptrace_dead); | ||
497 | |||
498 | if (likely(list_empty(&tracer->ptraced))) | ||
499 | return; | ||
500 | 493 | ||
501 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { | 494 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { |
502 | if (unlikely(p->ptrace & PT_EXITKILL)) | 495 | if (unlikely(p->ptrace & PT_EXITKILL)) |
503 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); | 496 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); |
504 | 497 | ||
505 | if (__ptrace_detach(tracer, p)) | 498 | if (__ptrace_detach(tracer, p)) |
506 | list_add(&p->ptrace_entry, &ptrace_dead); | 499 | list_add(&p->ptrace_entry, dead); |
507 | } | ||
508 | |||
509 | write_unlock_irq(&tasklist_lock); | ||
510 | BUG_ON(!list_empty(&tracer->ptraced)); | ||
511 | |||
512 | list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { | ||
513 | list_del_init(&p->ptrace_entry); | ||
514 | release_task(p); | ||
515 | } | 500 | } |
516 | |||
517 | write_lock_irq(&tasklist_lock); | ||
518 | } | 501 | } |
519 | 502 | ||
520 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) | 503 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c deleted file mode 100644 index e791130f85a7..000000000000 --- a/kernel/res_counter.c +++ /dev/null | |||
@@ -1,211 +0,0 @@ | |||
1 | /* | ||
2 | * resource cgroups | ||
3 | * | ||
4 | * Copyright 2007 OpenVZ SWsoft Inc | ||
5 | * | ||
6 | * Author: Pavel Emelianov <xemul@openvz.org> | ||
7 | * | ||
8 | */ | ||
9 | |||
10 | #include <linux/types.h> | ||
11 | #include <linux/parser.h> | ||
12 | #include <linux/fs.h> | ||
13 | #include <linux/res_counter.h> | ||
14 | #include <linux/uaccess.h> | ||
15 | #include <linux/mm.h> | ||
16 | |||
17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) | ||
18 | { | ||
19 | spin_lock_init(&counter->lock); | ||
20 | counter->limit = RES_COUNTER_MAX; | ||
21 | counter->soft_limit = RES_COUNTER_MAX; | ||
22 | counter->parent = parent; | ||
23 | } | ||
24 | |||
25 | static u64 res_counter_uncharge_locked(struct res_counter *counter, | ||
26 | unsigned long val) | ||
27 | { | ||
28 | if (WARN_ON(counter->usage < val)) | ||
29 | val = counter->usage; | ||
30 | |||
31 | counter->usage -= val; | ||
32 | return counter->usage; | ||
33 | } | ||
34 | |||
35 | static int res_counter_charge_locked(struct res_counter *counter, | ||
36 | unsigned long val, bool force) | ||
37 | { | ||
38 | int ret = 0; | ||
39 | |||
40 | if (counter->usage + val > counter->limit) { | ||
41 | counter->failcnt++; | ||
42 | ret = -ENOMEM; | ||
43 | if (!force) | ||
44 | return ret; | ||
45 | } | ||
46 | |||
47 | counter->usage += val; | ||
48 | if (counter->usage > counter->max_usage) | ||
49 | counter->max_usage = counter->usage; | ||
50 | return ret; | ||
51 | } | ||
52 | |||
53 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, | ||
54 | struct res_counter **limit_fail_at, bool force) | ||
55 | { | ||
56 | int ret, r; | ||
57 | unsigned long flags; | ||
58 | struct res_counter *c, *u; | ||
59 | |||
60 | r = ret = 0; | ||
61 | *limit_fail_at = NULL; | ||
62 | local_irq_save(flags); | ||
63 | for (c = counter; c != NULL; c = c->parent) { | ||
64 | spin_lock(&c->lock); | ||
65 | r = res_counter_charge_locked(c, val, force); | ||
66 | spin_unlock(&c->lock); | ||
67 | if (r < 0 && !ret) { | ||
68 | ret = r; | ||
69 | *limit_fail_at = c; | ||
70 | if (!force) | ||
71 | break; | ||
72 | } | ||
73 | } | ||
74 | |||
75 | if (ret < 0 && !force) { | ||
76 | for (u = counter; u != c; u = u->parent) { | ||
77 | spin_lock(&u->lock); | ||
78 | res_counter_uncharge_locked(u, val); | ||
79 | spin_unlock(&u->lock); | ||
80 | } | ||
81 | } | ||
82 | local_irq_restore(flags); | ||
83 | |||
84 | return ret; | ||
85 | } | ||
86 | |||
87 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
88 | struct res_counter **limit_fail_at) | ||
89 | { | ||
90 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
91 | } | ||
92 | |||
93 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | ||
94 | struct res_counter **limit_fail_at) | ||
95 | { | ||
96 | return __res_counter_charge(counter, val, limit_fail_at, true); | ||
97 | } | ||
98 | |||
99 | u64 res_counter_uncharge_until(struct res_counter *counter, | ||
100 | struct res_counter *top, | ||
101 | unsigned long val) | ||
102 | { | ||
103 | unsigned long flags; | ||
104 | struct res_counter *c; | ||
105 | u64 ret = 0; | ||
106 | |||
107 | local_irq_save(flags); | ||
108 | for (c = counter; c != top; c = c->parent) { | ||
109 | u64 r; | ||
110 | spin_lock(&c->lock); | ||
111 | r = res_counter_uncharge_locked(c, val); | ||
112 | if (c == counter) | ||
113 | ret = r; | ||
114 | spin_unlock(&c->lock); | ||
115 | } | ||
116 | local_irq_restore(flags); | ||
117 | return ret; | ||
118 | } | ||
119 | |||
120 | u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) | ||
121 | { | ||
122 | return res_counter_uncharge_until(counter, NULL, val); | ||
123 | } | ||
124 | |||
125 | static inline unsigned long long * | ||
126 | res_counter_member(struct res_counter *counter, int member) | ||
127 | { | ||
128 | switch (member) { | ||
129 | case RES_USAGE: | ||
130 | return &counter->usage; | ||
131 | case RES_MAX_USAGE: | ||
132 | return &counter->max_usage; | ||
133 | case RES_LIMIT: | ||
134 | return &counter->limit; | ||
135 | case RES_FAILCNT: | ||
136 | return &counter->failcnt; | ||
137 | case RES_SOFT_LIMIT: | ||
138 | return &counter->soft_limit; | ||
139 | }; | ||
140 | |||
141 | BUG(); | ||
142 | return NULL; | ||
143 | } | ||
144 | |||
145 | ssize_t res_counter_read(struct res_counter *counter, int member, | ||
146 | const char __user *userbuf, size_t nbytes, loff_t *pos, | ||
147 | int (*read_strategy)(unsigned long long val, char *st_buf)) | ||
148 | { | ||
149 | unsigned long long *val; | ||
150 | char buf[64], *s; | ||
151 | |||
152 | s = buf; | ||
153 | val = res_counter_member(counter, member); | ||
154 | if (read_strategy) | ||
155 | s += read_strategy(*val, s); | ||
156 | else | ||
157 | s += sprintf(s, "%llu\n", *val); | ||
158 | return simple_read_from_buffer((void __user *)userbuf, nbytes, | ||
159 | pos, buf, s - buf); | ||
160 | } | ||
161 | |||
162 | #if BITS_PER_LONG == 32 | ||
163 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
164 | { | ||
165 | unsigned long flags; | ||
166 | u64 ret; | ||
167 | |||
168 | spin_lock_irqsave(&counter->lock, flags); | ||
169 | ret = *res_counter_member(counter, member); | ||
170 | spin_unlock_irqrestore(&counter->lock, flags); | ||
171 | |||
172 | return ret; | ||
173 | } | ||
174 | #else | ||
175 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
176 | { | ||
177 | return *res_counter_member(counter, member); | ||
178 | } | ||
179 | #endif | ||
180 | |||
181 | int res_counter_memparse_write_strategy(const char *buf, | ||
182 | unsigned long long *resp) | ||
183 | { | ||
184 | char *end; | ||
185 | unsigned long long res; | ||
186 | |||
187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ | ||
188 | if (*buf == '-') { | ||
189 | int rc = kstrtoull(buf + 1, 10, &res); | ||
190 | |||
191 | if (rc) | ||
192 | return rc; | ||
193 | if (res != 1) | ||
194 | return -EINVAL; | ||
195 | *resp = RES_COUNTER_MAX; | ||
196 | return 0; | ||
197 | } | ||
198 | |||
199 | res = memparse(buf, &end); | ||
200 | if (*end != '\0') | ||
201 | return -EINVAL; | ||
202 | |||
203 | if (PAGE_ALIGN(res) >= res) | ||
204 | res = PAGE_ALIGN(res); | ||
205 | else | ||
206 | res = RES_COUNTER_MAX; | ||
207 | |||
208 | *resp = res; | ||
209 | |||
210 | return 0; | ||
211 | } | ||
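
The res_counter machinery deleted above is superseded by mm/page_counter.c (wired up via the mm/Makefile hunk further down). Below is a minimal userspace sketch of a page_counter-style hierarchical try-charge, reconstructed only from the calls visible in the later hunks (page_counter_init/try_charge/uncharge/read and the limit/watermark/failcnt fields); the real implementation is lockless and built on atomic_long_t, so treat this purely as an assumption-laden illustration.

#include <stdio.h>

struct page_counter {
	unsigned long count;		/* pages currently charged */
	unsigned long limit;		/* hard limit, in pages */
	unsigned long watermark;	/* highest count ever seen */
	unsigned long failcnt;		/* charges rejected by the limit */
	struct page_counter *parent;	/* charges propagate up the hierarchy */
};

/* Returns 0 and charges the whole ancestry on success; on failure nothing
 * stays charged and *fail points at the counter that hit its limit. */
static int page_counter_try_charge_sketch(struct page_counter *counter,
					  unsigned long nr_pages,
					  struct page_counter **fail)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		if (c->count + nr_pages > c->limit) {
			c->failcnt++;
			*fail = c;
			goto undo;
		}
		c->count += nr_pages;
		if (c->count > c->watermark)
			c->watermark = c->count;
	}
	return 0;

undo:
	/* Roll back the descendants that were already charged. */
	for (c = counter; c != *fail; c = c->parent)
		c->count -= nr_pages;
	return -1;			/* the kernel version returns -ENOMEM */
}

int main(void)
{
	struct page_counter parent = { .limit = 512 };
	struct page_counter child = { .limit = 1024, .parent = &parent };
	struct page_counter *fail;
	int ret;

	/* The child would allow 1024 pages, but the parent caps the subtree. */
	ret = page_counter_try_charge_sketch(&child, 512, &fail);
	printf("first 512 pages: %d\n", ret);

	ret = page_counter_try_charge_sketch(&child, 512, &fail);
	printf("next 512 pages: %d (limit hit by %s)\n", ret,
	       fail == &parent ? "the parent" : "the child");
	return 0;
}
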
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bb398c0c5f08..b5797b78add6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -4527,8 +4527,10 @@ void sched_show_task(struct task_struct *p) | |||
4527 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4527 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4528 | free = stack_not_used(p); | 4528 | free = stack_not_used(p); |
4529 | #endif | 4529 | #endif |
4530 | ppid = 0; | ||
4530 | rcu_read_lock(); | 4531 | rcu_read_lock(); |
4531 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | 4532 | if (pid_alive(p)) |
4533 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | ||
4532 | rcu_read_unlock(); | 4534 | rcu_read_unlock(); |
4533 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4535 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
4534 | task_pid_nr(p), ppid, | 4536 | task_pid_nr(p), ppid, |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 15f2511a1b7c..7c54ff79afd7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1104,6 +1104,15 @@ static struct ctl_table kern_table[] = { | |||
1104 | .proc_handler = proc_dointvec, | 1104 | .proc_handler = proc_dointvec, |
1105 | }, | 1105 | }, |
1106 | #endif | 1106 | #endif |
1107 | { | ||
1108 | .procname = "panic_on_warn", | ||
1109 | .data = &panic_on_warn, | ||
1110 | .maxlen = sizeof(int), | ||
1111 | .mode = 0644, | ||
1112 | .proc_handler = proc_dointvec_minmax, | ||
1113 | .extra1 = &zero, | ||
1114 | .extra2 = &one, | ||
1115 | }, | ||
1107 | { } | 1116 | { } |
1108 | }; | 1117 | }; |
1109 | 1118 | ||
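
The new kernel.panic_on_warn sysctl registered above is meant to turn the first WARN() into a panic. The following is a hedged userspace illustration of the expected behaviour; the real consumer sits in the WARN slow path (kernel/panic.c is an assumption, it is not part of this hunk), and printk()/panic() are replaced by stand-ins here.

#include <stdio.h>
#include <stdlib.h>

static int panic_on_warn;	/* toggled via /proc/sys/kernel/panic_on_warn */

/* Hypothetical stand-in for the kernel's WARN slow path. */
static void warn_hit(const char *file, int line)
{
	fprintf(stderr, "WARNING: at %s:%d\n", file, line);

	if (panic_on_warn) {
		/* Cleared first so a warning hit on the panic path itself
		 * cannot recurse into a second panic. */
		panic_on_warn = 0;
		fprintf(stderr, "Kernel panic - panic_on_warn set\n");
		abort();
	}
}

int main(void)
{
	panic_on_warn = 1;		/* echo 1 > /proc/sys/kernel/panic_on_warn */
	warn_hit(__FILE__, __LINE__);	/* now escalates to a "panic" */
	return 0;
}
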
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 9a4f750a2963..7e7746a42a62 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = { | |||
137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, | 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, |
138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, | 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, |
139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, | 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, |
140 | { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, | ||
140 | {} | 141 | {} |
141 | }; | 142 | }; |
142 | 143 | ||
diff --git a/lib/dma-debug.c b/lib/dma-debug.c index add80cc02dbe..9722bd2dbc9b 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c | |||
@@ -102,6 +102,14 @@ static DEFINE_SPINLOCK(free_entries_lock); | |||
102 | /* Global disable flag - will be set in case of an error */ | 102 | /* Global disable flag - will be set in case of an error */ |
103 | static u32 global_disable __read_mostly; | 103 | static u32 global_disable __read_mostly; |
104 | 104 | ||
105 | /* Early initialization disable flag, set at the end of dma_debug_init */ | ||
106 | static bool dma_debug_initialized __read_mostly; | ||
107 | |||
108 | static inline bool dma_debug_disabled(void) | ||
109 | { | ||
110 | return global_disable || !dma_debug_initialized; | ||
111 | } | ||
112 | |||
105 | /* Global error count */ | 113 | /* Global error count */ |
106 | static u32 error_count; | 114 | static u32 error_count; |
107 | 115 | ||
@@ -945,7 +953,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti | |||
945 | struct dma_debug_entry *uninitialized_var(entry); | 953 | struct dma_debug_entry *uninitialized_var(entry); |
946 | int count; | 954 | int count; |
947 | 955 | ||
948 | if (global_disable) | 956 | if (dma_debug_disabled()) |
949 | return 0; | 957 | return 0; |
950 | 958 | ||
951 | switch (action) { | 959 | switch (action) { |
@@ -973,7 +981,7 @@ void dma_debug_add_bus(struct bus_type *bus) | |||
973 | { | 981 | { |
974 | struct notifier_block *nb; | 982 | struct notifier_block *nb; |
975 | 983 | ||
976 | if (global_disable) | 984 | if (dma_debug_disabled()) |
977 | return; | 985 | return; |
978 | 986 | ||
979 | nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); | 987 | nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); |
@@ -994,6 +1002,9 @@ void dma_debug_init(u32 num_entries) | |||
994 | { | 1002 | { |
995 | int i; | 1003 | int i; |
996 | 1004 | ||
1005 | /* Do not use dma_debug_initialized here, since we really want to be | ||
1006 | * called to set dma_debug_initialized | ||
1007 | */ | ||
997 | if (global_disable) | 1008 | if (global_disable) |
998 | return; | 1009 | return; |
999 | 1010 | ||
@@ -1021,6 +1032,8 @@ void dma_debug_init(u32 num_entries) | |||
1021 | 1032 | ||
1022 | nr_total_entries = num_free_entries; | 1033 | nr_total_entries = num_free_entries; |
1023 | 1034 | ||
1035 | dma_debug_initialized = true; | ||
1036 | |||
1024 | pr_info("DMA-API: debugging enabled by kernel config\n"); | 1037 | pr_info("DMA-API: debugging enabled by kernel config\n"); |
1025 | } | 1038 | } |
1026 | 1039 | ||
@@ -1243,7 +1256,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, | |||
1243 | { | 1256 | { |
1244 | struct dma_debug_entry *entry; | 1257 | struct dma_debug_entry *entry; |
1245 | 1258 | ||
1246 | if (unlikely(global_disable)) | 1259 | if (unlikely(dma_debug_disabled())) |
1247 | return; | 1260 | return; |
1248 | 1261 | ||
1249 | if (dma_mapping_error(dev, dma_addr)) | 1262 | if (dma_mapping_error(dev, dma_addr)) |
@@ -1283,7 +1296,7 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) | |||
1283 | struct hash_bucket *bucket; | 1296 | struct hash_bucket *bucket; |
1284 | unsigned long flags; | 1297 | unsigned long flags; |
1285 | 1298 | ||
1286 | if (unlikely(global_disable)) | 1299 | if (unlikely(dma_debug_disabled())) |
1287 | return; | 1300 | return; |
1288 | 1301 | ||
1289 | ref.dev = dev; | 1302 | ref.dev = dev; |
@@ -1325,7 +1338,7 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, | |||
1325 | .direction = direction, | 1338 | .direction = direction, |
1326 | }; | 1339 | }; |
1327 | 1340 | ||
1328 | if (unlikely(global_disable)) | 1341 | if (unlikely(dma_debug_disabled())) |
1329 | return; | 1342 | return; |
1330 | 1343 | ||
1331 | if (map_single) | 1344 | if (map_single) |
@@ -1342,7 +1355,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, | |||
1342 | struct scatterlist *s; | 1355 | struct scatterlist *s; |
1343 | int i; | 1356 | int i; |
1344 | 1357 | ||
1345 | if (unlikely(global_disable)) | 1358 | if (unlikely(dma_debug_disabled())) |
1346 | return; | 1359 | return; |
1347 | 1360 | ||
1348 | for_each_sg(sg, s, mapped_ents, i) { | 1361 | for_each_sg(sg, s, mapped_ents, i) { |
@@ -1395,7 +1408,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
1395 | struct scatterlist *s; | 1408 | struct scatterlist *s; |
1396 | int mapped_ents = 0, i; | 1409 | int mapped_ents = 0, i; |
1397 | 1410 | ||
1398 | if (unlikely(global_disable)) | 1411 | if (unlikely(dma_debug_disabled())) |
1399 | return; | 1412 | return; |
1400 | 1413 | ||
1401 | for_each_sg(sglist, s, nelems, i) { | 1414 | for_each_sg(sglist, s, nelems, i) { |
@@ -1427,7 +1440,7 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, | |||
1427 | { | 1440 | { |
1428 | struct dma_debug_entry *entry; | 1441 | struct dma_debug_entry *entry; |
1429 | 1442 | ||
1430 | if (unlikely(global_disable)) | 1443 | if (unlikely(dma_debug_disabled())) |
1431 | return; | 1444 | return; |
1432 | 1445 | ||
1433 | if (unlikely(virt == NULL)) | 1446 | if (unlikely(virt == NULL)) |
@@ -1462,7 +1475,7 @@ void debug_dma_free_coherent(struct device *dev, size_t size, | |||
1462 | .direction = DMA_BIDIRECTIONAL, | 1475 | .direction = DMA_BIDIRECTIONAL, |
1463 | }; | 1476 | }; |
1464 | 1477 | ||
1465 | if (unlikely(global_disable)) | 1478 | if (unlikely(dma_debug_disabled())) |
1466 | return; | 1479 | return; |
1467 | 1480 | ||
1468 | check_unmap(&ref); | 1481 | check_unmap(&ref); |
@@ -1474,7 +1487,7 @@ void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, | |||
1474 | { | 1487 | { |
1475 | struct dma_debug_entry ref; | 1488 | struct dma_debug_entry ref; |
1476 | 1489 | ||
1477 | if (unlikely(global_disable)) | 1490 | if (unlikely(dma_debug_disabled())) |
1478 | return; | 1491 | return; |
1479 | 1492 | ||
1480 | ref.type = dma_debug_single; | 1493 | ref.type = dma_debug_single; |
@@ -1494,7 +1507,7 @@ void debug_dma_sync_single_for_device(struct device *dev, | |||
1494 | { | 1507 | { |
1495 | struct dma_debug_entry ref; | 1508 | struct dma_debug_entry ref; |
1496 | 1509 | ||
1497 | if (unlikely(global_disable)) | 1510 | if (unlikely(dma_debug_disabled())) |
1498 | return; | 1511 | return; |
1499 | 1512 | ||
1500 | ref.type = dma_debug_single; | 1513 | ref.type = dma_debug_single; |
@@ -1515,7 +1528,7 @@ void debug_dma_sync_single_range_for_cpu(struct device *dev, | |||
1515 | { | 1528 | { |
1516 | struct dma_debug_entry ref; | 1529 | struct dma_debug_entry ref; |
1517 | 1530 | ||
1518 | if (unlikely(global_disable)) | 1531 | if (unlikely(dma_debug_disabled())) |
1519 | return; | 1532 | return; |
1520 | 1533 | ||
1521 | ref.type = dma_debug_single; | 1534 | ref.type = dma_debug_single; |
@@ -1536,7 +1549,7 @@ void debug_dma_sync_single_range_for_device(struct device *dev, | |||
1536 | { | 1549 | { |
1537 | struct dma_debug_entry ref; | 1550 | struct dma_debug_entry ref; |
1538 | 1551 | ||
1539 | if (unlikely(global_disable)) | 1552 | if (unlikely(dma_debug_disabled())) |
1540 | return; | 1553 | return; |
1541 | 1554 | ||
1542 | ref.type = dma_debug_single; | 1555 | ref.type = dma_debug_single; |
@@ -1556,7 +1569,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, | |||
1556 | struct scatterlist *s; | 1569 | struct scatterlist *s; |
1557 | int mapped_ents = 0, i; | 1570 | int mapped_ents = 0, i; |
1558 | 1571 | ||
1559 | if (unlikely(global_disable)) | 1572 | if (unlikely(dma_debug_disabled())) |
1560 | return; | 1573 | return; |
1561 | 1574 | ||
1562 | for_each_sg(sg, s, nelems, i) { | 1575 | for_each_sg(sg, s, nelems, i) { |
@@ -1589,7 +1602,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, | |||
1589 | struct scatterlist *s; | 1602 | struct scatterlist *s; |
1590 | int mapped_ents = 0, i; | 1603 | int mapped_ents = 0, i; |
1591 | 1604 | ||
1592 | if (unlikely(global_disable)) | 1605 | if (unlikely(dma_debug_disabled())) |
1593 | return; | 1606 | return; |
1594 | 1607 | ||
1595 | for_each_sg(sg, s, nelems, i) { | 1608 | for_each_sg(sg, s, nelems, i) { |
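
All of the lib/dma-debug.c entry points above now bail out through dma_debug_disabled(), which also covers the window before dma_debug_init() has finished. A tiny userspace illustration of that gate, using stand-in variables rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

static bool global_disable;		/* set on error or via the command line */
static bool dma_debug_initialized;	/* set at the very end of init */

static inline bool dma_debug_disabled(void)
{
	return global_disable || !dma_debug_initialized;
}

int main(void)
{
	/* Before initialization every debug hook returns immediately... */
	printf("before init: disabled=%d\n", dma_debug_disabled());

	dma_debug_initialized = true;
	/* ...and the checks only become active once init has completed. */
	printf("after init:  disabled=%d\n", dma_debug_disabled());
	return 0;
}
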
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index dfba05521748..527799d44476 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c | |||
@@ -576,7 +576,7 @@ void __dynamic_dev_dbg(struct _ddebug *descriptor, | |||
576 | } else { | 576 | } else { |
577 | char buf[PREFIX_SIZE]; | 577 | char buf[PREFIX_SIZE]; |
578 | 578 | ||
579 | dev_printk_emit(7, dev, "%s%s %s: %pV", | 579 | dev_printk_emit(LOGLEVEL_DEBUG, dev, "%s%s %s: %pV", |
580 | dynamic_emit_prefix(descriptor, buf), | 580 | dynamic_emit_prefix(descriptor, buf), |
581 | dev_driver_string(dev), dev_name(dev), | 581 | dev_driver_string(dev), dev_name(dev), |
582 | &vaf); | 582 | &vaf); |
@@ -605,7 +605,7 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor, | |||
605 | if (dev && dev->dev.parent) { | 605 | if (dev && dev->dev.parent) { |
606 | char buf[PREFIX_SIZE]; | 606 | char buf[PREFIX_SIZE]; |
607 | 607 | ||
608 | dev_printk_emit(7, dev->dev.parent, | 608 | dev_printk_emit(LOGLEVEL_DEBUG, dev->dev.parent, |
609 | "%s%s %s %s%s: %pV", | 609 | "%s%s %s %s%s: %pV", |
610 | dynamic_emit_prefix(descriptor, buf), | 610 | dynamic_emit_prefix(descriptor, buf), |
611 | dev_driver_string(dev->dev.parent), | 611 | dev_driver_string(dev->dev.parent), |
diff --git a/lib/lcm.c b/lib/lcm.c --- a/lib/lcm.c +++ b/lib/lcm.c | |||
@@ -7,10 +7,8 @@ | |||
7 | unsigned long lcm(unsigned long a, unsigned long b) | 7 | unsigned long lcm(unsigned long a, unsigned long b) |
8 | { | 8 | { |
9 | if (a && b) | 9 | if (a && b) |
10 | return (a * b) / gcd(a, b); | 10 | return (a / gcd(a, b)) * b; |
11 | else if (b) | 11 | else |
12 | return b; | 12 | return 0; |
13 | |||
14 | return a; | ||
15 | } | 13 | } |
16 | EXPORT_SYMBOL_GPL(lcm); | 14 | EXPORT_SYMBOL_GPL(lcm); |
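
The lcm() rewrite above divides before multiplying: gcd(a, b) always divides a, so (a / gcd(a, b)) * b is exactly lcm(a, b), while the old a * b could overflow first. A small userspace illustration with values that would wrap a 32-bit unsigned long:

#include <stdio.h>

static unsigned long gcd(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long t = a % b;
		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	unsigned long a = 600000UL, b = 400000UL;	/* gcd is 200000 */

	/*
	 * a * b == 240000000000 wraps a 32-bit unsigned long, whereas
	 * (a / gcd(a, b)) * b == 3 * 400000 == 1200000 is computed exactly.
	 */
	printf("lcm(%lu, %lu) = %lu\n", a, b, (a / gcd(a, b)) * b);
	return 0;
}
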
diff --git a/mm/Makefile b/mm/Makefile index 8405eb0023a9..b3c6ce932c64 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -55,7 +55,9 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
55 | obj-$(CONFIG_MIGRATION) += migrate.o | 55 | obj-$(CONFIG_MIGRATION) += migrate.o |
56 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 56 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
57 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 57 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
58 | obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o | 58 | obj-$(CONFIG_PAGE_COUNTER) += page_counter.o |
59 | obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o | ||
60 | obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o | ||
59 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o | 61 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o |
60 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 62 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
61 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 63 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
diff --git a/mm/cma.c b/mm/cma.c --- a/mm/cma.c +++ b/mm/cma.c | |||
@@ -215,9 +215,21 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
215 | bool fixed, struct cma **res_cma) | 215 | bool fixed, struct cma **res_cma) |
216 | { | 216 | { |
217 | phys_addr_t memblock_end = memblock_end_of_DRAM(); | 217 | phys_addr_t memblock_end = memblock_end_of_DRAM(); |
218 | phys_addr_t highmem_start = __pa(high_memory); | 218 | phys_addr_t highmem_start; |
219 | int ret = 0; | 219 | int ret = 0; |
220 | 220 | ||
221 | #ifdef CONFIG_X86 | ||
222 | /* | ||
223 | * high_memory isn't direct mapped memory so retrieving its physical | ||
224 | * address isn't appropriate. But it would be useful to check the | ||
225 | * physical address of the highmem boundary so it's justifiable to get | ||
226 | * the physical address from it. On x86 there is a validation check for | ||
227 | * this case, so the following workaround is needed to avoid it. | ||
228 | */ | ||
229 | highmem_start = __pa_nodebug(high_memory); | ||
230 | #else | ||
231 | highmem_start = __pa(high_memory); | ||
232 | #endif | ||
221 | pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", | 233 | pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", |
222 | __func__, &size, &base, &limit, &alignment); | 234 | __func__, &size, &base, &limit, &alignment); |
223 | 235 | ||
diff --git a/mm/compaction.c b/mm/compaction.c index f9792ba3537c..546e571e9d60 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -41,15 +41,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) | |||
41 | static unsigned long release_freepages(struct list_head *freelist) | 41 | static unsigned long release_freepages(struct list_head *freelist) |
42 | { | 42 | { |
43 | struct page *page, *next; | 43 | struct page *page, *next; |
44 | unsigned long count = 0; | 44 | unsigned long high_pfn = 0; |
45 | 45 | ||
46 | list_for_each_entry_safe(page, next, freelist, lru) { | 46 | list_for_each_entry_safe(page, next, freelist, lru) { |
47 | unsigned long pfn = page_to_pfn(page); | ||
47 | list_del(&page->lru); | 48 | list_del(&page->lru); |
48 | __free_page(page); | 49 | __free_page(page); |
49 | count++; | 50 | if (pfn > high_pfn) |
51 | high_pfn = pfn; | ||
50 | } | 52 | } |
51 | 53 | ||
52 | return count; | 54 | return high_pfn; |
53 | } | 55 | } |
54 | 56 | ||
55 | static void map_pages(struct list_head *list) | 57 | static void map_pages(struct list_head *list) |
@@ -195,16 +197,12 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
195 | 197 | ||
196 | /* Update where async and sync compaction should restart */ | 198 | /* Update where async and sync compaction should restart */ |
197 | if (migrate_scanner) { | 199 | if (migrate_scanner) { |
198 | if (cc->finished_update_migrate) | ||
199 | return; | ||
200 | if (pfn > zone->compact_cached_migrate_pfn[0]) | 200 | if (pfn > zone->compact_cached_migrate_pfn[0]) |
201 | zone->compact_cached_migrate_pfn[0] = pfn; | 201 | zone->compact_cached_migrate_pfn[0] = pfn; |
202 | if (cc->mode != MIGRATE_ASYNC && | 202 | if (cc->mode != MIGRATE_ASYNC && |
203 | pfn > zone->compact_cached_migrate_pfn[1]) | 203 | pfn > zone->compact_cached_migrate_pfn[1]) |
204 | zone->compact_cached_migrate_pfn[1] = pfn; | 204 | zone->compact_cached_migrate_pfn[1] = pfn; |
205 | } else { | 205 | } else { |
206 | if (cc->finished_update_free) | ||
207 | return; | ||
208 | if (pfn < zone->compact_cached_free_pfn) | 206 | if (pfn < zone->compact_cached_free_pfn) |
209 | zone->compact_cached_free_pfn = pfn; | 207 | zone->compact_cached_free_pfn = pfn; |
210 | } | 208 | } |
@@ -715,7 +713,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
715 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 713 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
716 | 714 | ||
717 | isolate_success: | 715 | isolate_success: |
718 | cc->finished_update_migrate = true; | ||
719 | list_add(&page->lru, migratelist); | 716 | list_add(&page->lru, migratelist); |
720 | cc->nr_migratepages++; | 717 | cc->nr_migratepages++; |
721 | nr_isolated++; | 718 | nr_isolated++; |
@@ -889,15 +886,6 @@ static void isolate_freepages(struct compact_control *cc) | |||
889 | block_start_pfn - pageblock_nr_pages; | 886 | block_start_pfn - pageblock_nr_pages; |
890 | 887 | ||
891 | /* | 888 | /* |
892 | * Set a flag that we successfully isolated in this pageblock. | ||
893 | * In the next loop iteration, zone->compact_cached_free_pfn | ||
894 | * will not be updated and thus it will effectively contain the | ||
895 | * highest pageblock we isolated pages from. | ||
896 | */ | ||
897 | if (isolated) | ||
898 | cc->finished_update_free = true; | ||
899 | |||
900 | /* | ||
901 | * isolate_freepages_block() might have aborted due to async | 889 | * isolate_freepages_block() might have aborted due to async |
902 | * compaction being contended | 890 | * compaction being contended |
903 | */ | 891 | */ |
@@ -1086,9 +1074,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
1086 | 1074 | ||
1087 | /* Compaction run is not finished if the watermark is not met */ | 1075 | /* Compaction run is not finished if the watermark is not met */ |
1088 | watermark = low_wmark_pages(zone); | 1076 | watermark = low_wmark_pages(zone); |
1089 | watermark += (1 << cc->order); | ||
1090 | 1077 | ||
1091 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | 1078 | if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, |
1079 | cc->alloc_flags)) | ||
1092 | return COMPACT_CONTINUE; | 1080 | return COMPACT_CONTINUE; |
1093 | 1081 | ||
1094 | /* Direct compactor: Is a suitable page free? */ | 1082 | /* Direct compactor: Is a suitable page free? */ |
@@ -1114,7 +1102,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
1114 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | 1102 | * COMPACT_PARTIAL - If the allocation would succeed without compaction |
1115 | * COMPACT_CONTINUE - If compaction should run now | 1103 | * COMPACT_CONTINUE - If compaction should run now |
1116 | */ | 1104 | */ |
1117 | unsigned long compaction_suitable(struct zone *zone, int order) | 1105 | unsigned long compaction_suitable(struct zone *zone, int order, |
1106 | int alloc_flags, int classzone_idx) | ||
1118 | { | 1107 | { |
1119 | int fragindex; | 1108 | int fragindex; |
1120 | unsigned long watermark; | 1109 | unsigned long watermark; |
@@ -1126,21 +1115,30 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
1126 | if (order == -1) | 1115 | if (order == -1) |
1127 | return COMPACT_CONTINUE; | 1116 | return COMPACT_CONTINUE; |
1128 | 1117 | ||
1118 | watermark = low_wmark_pages(zone); | ||
1119 | /* | ||
1120 | * If watermarks for high-order allocation are already met, there | ||
1121 | * should be no need for compaction at all. | ||
1122 | */ | ||
1123 | if (zone_watermark_ok(zone, order, watermark, classzone_idx, | ||
1124 | alloc_flags)) | ||
1125 | return COMPACT_PARTIAL; | ||
1126 | |||
1129 | /* | 1127 | /* |
1130 | * Watermarks for order-0 must be met for compaction. Note the 2UL. | 1128 | * Watermarks for order-0 must be met for compaction. Note the 2UL. |
1131 | * This is because during migration, copies of pages need to be | 1129 | * This is because during migration, copies of pages need to be |
1132 | * allocated and for a short time, the footprint is higher | 1130 | * allocated and for a short time, the footprint is higher |
1133 | */ | 1131 | */ |
1134 | watermark = low_wmark_pages(zone) + (2UL << order); | 1132 | watermark += (2UL << order); |
1135 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1133 | if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) |
1136 | return COMPACT_SKIPPED; | 1134 | return COMPACT_SKIPPED; |
1137 | 1135 | ||
1138 | /* | 1136 | /* |
1139 | * fragmentation index determines if allocation failures are due to | 1137 | * fragmentation index determines if allocation failures are due to |
1140 | * low memory or external fragmentation | 1138 | * low memory or external fragmentation |
1141 | * | 1139 | * |
1142 | * index of -1000 implies allocations might succeed depending on | 1140 | * index of -1000 would imply allocations might succeed depending on |
1143 | * watermarks | 1141 | * watermarks, but we already failed the high-order watermark check |
1144 | * index towards 0 implies failure is due to lack of memory | 1142 | * index towards 0 implies failure is due to lack of memory |
1145 | * index towards 1000 implies failure is due to fragmentation | 1143 | * index towards 1000 implies failure is due to fragmentation |
1146 | * | 1144 | * |
@@ -1150,10 +1148,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
1150 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 1148 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
1151 | return COMPACT_SKIPPED; | 1149 | return COMPACT_SKIPPED; |
1152 | 1150 | ||
1153 | if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, | ||
1154 | 0, 0)) | ||
1155 | return COMPACT_PARTIAL; | ||
1156 | |||
1157 | return COMPACT_CONTINUE; | 1151 | return COMPACT_CONTINUE; |
1158 | } | 1152 | } |
1159 | 1153 | ||
@@ -1164,8 +1158,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1164 | unsigned long end_pfn = zone_end_pfn(zone); | 1158 | unsigned long end_pfn = zone_end_pfn(zone); |
1165 | const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); | 1159 | const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); |
1166 | const bool sync = cc->mode != MIGRATE_ASYNC; | 1160 | const bool sync = cc->mode != MIGRATE_ASYNC; |
1161 | unsigned long last_migrated_pfn = 0; | ||
1167 | 1162 | ||
1168 | ret = compaction_suitable(zone, cc->order); | 1163 | ret = compaction_suitable(zone, cc->order, cc->alloc_flags, |
1164 | cc->classzone_idx); | ||
1169 | switch (ret) { | 1165 | switch (ret) { |
1170 | case COMPACT_PARTIAL: | 1166 | case COMPACT_PARTIAL: |
1171 | case COMPACT_SKIPPED: | 1167 | case COMPACT_SKIPPED: |
@@ -1208,6 +1204,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1208 | while ((ret = compact_finished(zone, cc, migratetype)) == | 1204 | while ((ret = compact_finished(zone, cc, migratetype)) == |
1209 | COMPACT_CONTINUE) { | 1205 | COMPACT_CONTINUE) { |
1210 | int err; | 1206 | int err; |
1207 | unsigned long isolate_start_pfn = cc->migrate_pfn; | ||
1211 | 1208 | ||
1212 | switch (isolate_migratepages(zone, cc)) { | 1209 | switch (isolate_migratepages(zone, cc)) { |
1213 | case ISOLATE_ABORT: | 1210 | case ISOLATE_ABORT: |
@@ -1216,7 +1213,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1216 | cc->nr_migratepages = 0; | 1213 | cc->nr_migratepages = 0; |
1217 | goto out; | 1214 | goto out; |
1218 | case ISOLATE_NONE: | 1215 | case ISOLATE_NONE: |
1219 | continue; | 1216 | /* |
1217 | * We haven't isolated and migrated anything, but | ||
1218 | * there might still be unflushed migrations from | ||
1219 | * previous cc->order aligned block. | ||
1220 | */ | ||
1221 | goto check_drain; | ||
1220 | case ISOLATE_SUCCESS: | 1222 | case ISOLATE_SUCCESS: |
1221 | ; | 1223 | ; |
1222 | } | 1224 | } |
@@ -1241,12 +1243,61 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1241 | goto out; | 1243 | goto out; |
1242 | } | 1244 | } |
1243 | } | 1245 | } |
1246 | |||
1247 | /* | ||
1248 | * Record where we could have freed pages by migration and not | ||
1249 | * yet flushed them to buddy allocator. We use the pfn that | ||
1250 | * isolate_migratepages() started from in this loop iteration | ||
1251 | * - this is the lowest page that could have been isolated and | ||
1252 | * then freed by migration. | ||
1253 | */ | ||
1254 | if (!last_migrated_pfn) | ||
1255 | last_migrated_pfn = isolate_start_pfn; | ||
1256 | |||
1257 | check_drain: | ||
1258 | /* | ||
1259 | * Has the migration scanner moved away from the previous | ||
1260 | * cc->order aligned block where we migrated from? If yes, | ||
1261 | * flush the pages that were freed, so that they can merge and | ||
1262 | * compact_finished() can detect immediately if allocation | ||
1263 | * would succeed. | ||
1264 | */ | ||
1265 | if (cc->order > 0 && last_migrated_pfn) { | ||
1266 | int cpu; | ||
1267 | unsigned long current_block_start = | ||
1268 | cc->migrate_pfn & ~((1UL << cc->order) - 1); | ||
1269 | |||
1270 | if (last_migrated_pfn < current_block_start) { | ||
1271 | cpu = get_cpu(); | ||
1272 | lru_add_drain_cpu(cpu); | ||
1273 | drain_local_pages(zone); | ||
1274 | put_cpu(); | ||
1275 | /* No more flushing until we migrate again */ | ||
1276 | last_migrated_pfn = 0; | ||
1277 | } | ||
1278 | } | ||
1279 | |||
1244 | } | 1280 | } |
1245 | 1281 | ||
1246 | out: | 1282 | out: |
1247 | /* Release free pages and check accounting */ | 1283 | /* |
1248 | cc->nr_freepages -= release_freepages(&cc->freepages); | 1284 | * Release free pages and update where the free scanner should restart, |
1249 | VM_BUG_ON(cc->nr_freepages != 0); | 1285 | * so we don't leave any returned pages behind in the next attempt. |
1286 | */ | ||
1287 | if (cc->nr_freepages > 0) { | ||
1288 | unsigned long free_pfn = release_freepages(&cc->freepages); | ||
1289 | |||
1290 | cc->nr_freepages = 0; | ||
1291 | VM_BUG_ON(free_pfn == 0); | ||
1292 | /* The cached pfn is always the first in a pageblock */ | ||
1293 | free_pfn &= ~(pageblock_nr_pages-1); | ||
1294 | /* | ||
1295 | * Only go back, not forward. The cached pfn might have been | ||
1296 | * already reset to zone end in compact_finished() | ||
1297 | */ | ||
1298 | if (free_pfn > zone->compact_cached_free_pfn) | ||
1299 | zone->compact_cached_free_pfn = free_pfn; | ||
1300 | } | ||
1250 | 1301 | ||
1251 | trace_mm_compaction_end(ret); | 1302 | trace_mm_compaction_end(ret); |
1252 | 1303 | ||
@@ -1254,7 +1305,8 @@ out: | |||
1254 | } | 1305 | } |
1255 | 1306 | ||
1256 | static unsigned long compact_zone_order(struct zone *zone, int order, | 1307 | static unsigned long compact_zone_order(struct zone *zone, int order, |
1257 | gfp_t gfp_mask, enum migrate_mode mode, int *contended) | 1308 | gfp_t gfp_mask, enum migrate_mode mode, int *contended, |
1309 | int alloc_flags, int classzone_idx) | ||
1258 | { | 1310 | { |
1259 | unsigned long ret; | 1311 | unsigned long ret; |
1260 | struct compact_control cc = { | 1312 | struct compact_control cc = { |
@@ -1264,6 +1316,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order, | |||
1264 | .gfp_mask = gfp_mask, | 1316 | .gfp_mask = gfp_mask, |
1265 | .zone = zone, | 1317 | .zone = zone, |
1266 | .mode = mode, | 1318 | .mode = mode, |
1319 | .alloc_flags = alloc_flags, | ||
1320 | .classzone_idx = classzone_idx, | ||
1267 | }; | 1321 | }; |
1268 | INIT_LIST_HEAD(&cc.freepages); | 1322 | INIT_LIST_HEAD(&cc.freepages); |
1269 | INIT_LIST_HEAD(&cc.migratepages); | 1323 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -1288,14 +1342,13 @@ int sysctl_extfrag_threshold = 500; | |||
1288 | * @mode: The migration mode for async, sync light, or sync migration | 1342 | * @mode: The migration mode for async, sync light, or sync migration |
1289 | * @contended: Return value that determines if compaction was aborted due to | 1343 | * @contended: Return value that determines if compaction was aborted due to |
1290 | * need_resched() or lock contention | 1344 | * need_resched() or lock contention |
1291 | * @candidate_zone: Return the zone where we think allocation should succeed | ||
1292 | * | 1345 | * |
1293 | * This is the main entry point for direct page compaction. | 1346 | * This is the main entry point for direct page compaction. |
1294 | */ | 1347 | */ |
1295 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1348 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1296 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1349 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1297 | enum migrate_mode mode, int *contended, | 1350 | enum migrate_mode mode, int *contended, |
1298 | struct zone **candidate_zone) | 1351 | int alloc_flags, int classzone_idx) |
1299 | { | 1352 | { |
1300 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1353 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1301 | int may_enter_fs = gfp_mask & __GFP_FS; | 1354 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -1303,7 +1356,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1303 | struct zoneref *z; | 1356 | struct zoneref *z; |
1304 | struct zone *zone; | 1357 | struct zone *zone; |
1305 | int rc = COMPACT_DEFERRED; | 1358 | int rc = COMPACT_DEFERRED; |
1306 | int alloc_flags = 0; | ||
1307 | int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ | 1359 | int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ |
1308 | 1360 | ||
1309 | *contended = COMPACT_CONTENDED_NONE; | 1361 | *contended = COMPACT_CONTENDED_NONE; |
@@ -1312,10 +1364,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1312 | if (!order || !may_enter_fs || !may_perform_io) | 1364 | if (!order || !may_enter_fs || !may_perform_io) |
1313 | return COMPACT_SKIPPED; | 1365 | return COMPACT_SKIPPED; |
1314 | 1366 | ||
1315 | #ifdef CONFIG_CMA | ||
1316 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
1317 | alloc_flags |= ALLOC_CMA; | ||
1318 | #endif | ||
1319 | /* Compact each zone in the list */ | 1367 | /* Compact each zone in the list */ |
1320 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1368 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
1321 | nodemask) { | 1369 | nodemask) { |
@@ -1326,7 +1374,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1326 | continue; | 1374 | continue; |
1327 | 1375 | ||
1328 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1376 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1329 | &zone_contended); | 1377 | &zone_contended, alloc_flags, classzone_idx); |
1330 | rc = max(status, rc); | 1378 | rc = max(status, rc); |
1331 | /* | 1379 | /* |
1332 | * It takes at least one zone that wasn't lock contended | 1380 | * It takes at least one zone that wasn't lock contended |
@@ -1335,9 +1383,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1335 | all_zones_contended &= zone_contended; | 1383 | all_zones_contended &= zone_contended; |
1336 | 1384 | ||
1337 | /* If a normal allocation would succeed, stop compacting */ | 1385 | /* If a normal allocation would succeed, stop compacting */ |
1338 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, | 1386 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), |
1339 | alloc_flags)) { | 1387 | classzone_idx, alloc_flags)) { |
1340 | *candidate_zone = zone; | ||
1341 | /* | 1388 | /* |
1342 | * We think the allocation will succeed in this zone, | 1389 | * We think the allocation will succeed in this zone, |
1343 | * but it is not certain, hence the false. The caller | 1390 | * but it is not certain, hence the false. The caller |
@@ -1359,7 +1406,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1359 | goto break_loop; | 1406 | goto break_loop; |
1360 | } | 1407 | } |
1361 | 1408 | ||
1362 | if (mode != MIGRATE_ASYNC) { | 1409 | if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { |
1363 | /* | 1410 | /* |
1364 | * We think that allocation won't succeed in this zone | 1411 | * We think that allocation won't succeed in this zone |
1365 | * so we defer compaction there. If it ends up | 1412 | * so we defer compaction there. If it ends up |
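
In the check_drain path added above, freed pages are flushed to the buddy allocator once the migration scanner has left the cc->order-aligned block it last migrated from. A hedged arithmetic illustration of that boundary test (the pfn values are made up):

#include <stdio.h>

int main(void)
{
	int order = 9;				/* e.g. a THP-sized request */
	unsigned long migrate_pfn = 0x12345;	/* hypothetical scanner position */
	unsigned long last_migrated_pfn = 0x12000;

	/* Round the scanner position down to the start of its order-9 block. */
	unsigned long current_block_start = migrate_pfn & ~((1UL << order) - 1);

	/*
	 * 0x12345 & ~0x1ff == 0x12200: the scanner has moved past the block at
	 * 0x12000, so the pages freed there can be drained and merged now.
	 */
	if (last_migrated_pfn < current_block_start)
		printf("drain: scanner block 0x%lx, last migration at 0x%lx\n",
		       current_block_start, last_migrated_pfn);
	return 0;
}
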
diff --git a/mm/debug.c b/mm/debug.c index 5ce45c9a29b5..0e58f3211f89 100644 --- a/mm/debug.c +++ b/mm/debug.c | |||
@@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason, | |||
95 | dump_flags(page->flags & badflags, | 95 | dump_flags(page->flags & badflags, |
96 | pageflag_names, ARRAY_SIZE(pageflag_names)); | 96 | pageflag_names, ARRAY_SIZE(pageflag_names)); |
97 | } | 97 | } |
98 | mem_cgroup_print_bad_page(page); | 98 | #ifdef CONFIG_MEMCG |
99 | if (page->mem_cgroup) | ||
100 | pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); | ||
101 | #endif | ||
99 | } | 102 | } |
100 | 103 | ||
101 | void dump_page(struct page *page, const char *reason) | 104 | void dump_page(struct page *page, const char *reason) |
diff --git a/mm/frontswap.c b/mm/frontswap.c index f2a3571c6e22..8d82809eb085 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -182,7 +182,7 @@ void __frontswap_init(unsigned type, unsigned long *map) | |||
182 | if (frontswap_ops) | 182 | if (frontswap_ops) |
183 | frontswap_ops->init(type); | 183 | frontswap_ops->init(type); |
184 | else { | 184 | else { |
185 | BUG_ON(type > MAX_SWAPFILES); | 185 | BUG_ON(type >= MAX_SWAPFILES); |
186 | set_bit(type, need_init); | 186 | set_bit(type, need_init); |
187 | } | 187 | } |
188 | } | 188 | } |
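
The frontswap change is an off-by-one fix: type is used as an index into per-swapfile state sized MAX_SWAPFILES, so MAX_SWAPFILES itself is already out of bounds and the guard has to be >=. A hedged userspace illustration (the constant and array here are stand-ins, not the kernel's bitmap):

#include <assert.h>

#define MAX_SWAPFILES 32			/* illustrative value */

static unsigned char need_init[MAX_SWAPFILES];	/* stand-in for the bitmap */

static void frontswap_init_sketch(unsigned int type)
{
	/* type == MAX_SWAPFILES would index one past the end of the array. */
	assert(type < MAX_SWAPFILES);
	need_init[type] = 1;
}

int main(void)
{
	frontswap_init_sketch(MAX_SWAPFILES - 1);	/* last valid index */
	return 0;
}
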
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de984159cf0b..5b2c6875fc38 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | |||
784 | if (!pmd_none(*pmd)) | 784 | if (!pmd_none(*pmd)) |
785 | return false; | 785 | return false; |
786 | entry = mk_pmd(zero_page, vma->vm_page_prot); | 786 | entry = mk_pmd(zero_page, vma->vm_page_prot); |
787 | entry = pmd_wrprotect(entry); | ||
788 | entry = pmd_mkhuge(entry); | 787 | entry = pmd_mkhuge(entry); |
789 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 788 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
790 | set_pmd_at(mm, haddr, pmd, entry); | 789 | set_pmd_at(mm, haddr, pmd, entry); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9fd722769927..30cd96879152 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -2638,8 +2638,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
2638 | 2638 | ||
2639 | tlb_start_vma(tlb, vma); | 2639 | tlb_start_vma(tlb, vma); |
2640 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2640 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2641 | address = start; | ||
2641 | again: | 2642 | again: |
2642 | for (address = start; address < end; address += sz) { | 2643 | for (; address < end; address += sz) { |
2643 | ptep = huge_pte_offset(mm, address); | 2644 | ptep = huge_pte_offset(mm, address); |
2644 | if (!ptep) | 2645 | if (!ptep) |
2645 | continue; | 2646 | continue; |
@@ -2686,6 +2687,7 @@ again: | |||
2686 | page_remove_rmap(page); | 2687 | page_remove_rmap(page); |
2687 | force_flush = !__tlb_remove_page(tlb, page); | 2688 | force_flush = !__tlb_remove_page(tlb, page); |
2688 | if (force_flush) { | 2689 | if (force_flush) { |
2690 | address += sz; | ||
2689 | spin_unlock(ptl); | 2691 | spin_unlock(ptl); |
2690 | break; | 2692 | break; |
2691 | } | 2693 | } |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a67c26e0f360..037e1c00a5b7 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -14,6 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
17 | #include <linux/page_counter.h> | ||
17 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
18 | #include <linux/hugetlb.h> | 19 | #include <linux/hugetlb.h> |
19 | #include <linux/hugetlb_cgroup.h> | 20 | #include <linux/hugetlb_cgroup.h> |
@@ -23,7 +24,7 @@ struct hugetlb_cgroup { | |||
23 | /* | 24 | /* |
24 | * the counter to account for hugepages from hugetlb. | 25 | * the counter to account for hugepages from hugetlb. |
25 | */ | 26 | */ |
26 | struct res_counter hugepage[HUGE_MAX_HSTATE]; | 27 | struct page_counter hugepage[HUGE_MAX_HSTATE]; |
27 | }; | 28 | }; |
28 | 29 | ||
29 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 30 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
@@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) | |||
60 | int idx; | 61 | int idx; |
61 | 62 | ||
62 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { | 63 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { |
63 | if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) | 64 | if (page_counter_read(&h_cg->hugepage[idx])) |
64 | return true; | 65 | return true; |
65 | } | 66 | } |
66 | return false; | 67 | return false; |
@@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
79 | 80 | ||
80 | if (parent_h_cgroup) { | 81 | if (parent_h_cgroup) { |
81 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | 82 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) |
82 | res_counter_init(&h_cgroup->hugepage[idx], | 83 | page_counter_init(&h_cgroup->hugepage[idx], |
83 | &parent_h_cgroup->hugepage[idx]); | 84 | &parent_h_cgroup->hugepage[idx]); |
84 | } else { | 85 | } else { |
85 | root_h_cgroup = h_cgroup; | 86 | root_h_cgroup = h_cgroup; |
86 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | 87 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) |
87 | res_counter_init(&h_cgroup->hugepage[idx], NULL); | 88 | page_counter_init(&h_cgroup->hugepage[idx], NULL); |
88 | } | 89 | } |
89 | return &h_cgroup->css; | 90 | return &h_cgroup->css; |
90 | } | 91 | } |
@@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) | |||
108 | static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, | 109 | static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, |
109 | struct page *page) | 110 | struct page *page) |
110 | { | 111 | { |
111 | int csize; | 112 | unsigned int nr_pages; |
112 | struct res_counter *counter; | 113 | struct page_counter *counter; |
113 | struct res_counter *fail_res; | ||
114 | struct hugetlb_cgroup *page_hcg; | 114 | struct hugetlb_cgroup *page_hcg; |
115 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); | 115 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); |
116 | 116 | ||
@@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, | |||
123 | if (!page_hcg || page_hcg != h_cg) | 123 | if (!page_hcg || page_hcg != h_cg) |
124 | goto out; | 124 | goto out; |
125 | 125 | ||
126 | csize = PAGE_SIZE << compound_order(page); | 126 | nr_pages = 1 << compound_order(page); |
127 | if (!parent) { | 127 | if (!parent) { |
128 | parent = root_h_cgroup; | 128 | parent = root_h_cgroup; |
129 | /* root has no limit */ | 129 | /* root has no limit */ |
130 | res_counter_charge_nofail(&parent->hugepage[idx], | 130 | page_counter_charge(&parent->hugepage[idx], nr_pages); |
131 | csize, &fail_res); | ||
132 | } | 131 | } |
133 | counter = &h_cg->hugepage[idx]; | 132 | counter = &h_cg->hugepage[idx]; |
134 | res_counter_uncharge_until(counter, counter->parent, csize); | 133 | /* Take the pages off the local counter */ |
134 | page_counter_cancel(counter, nr_pages); | ||
135 | 135 | ||
136 | set_hugetlb_cgroup(page, parent); | 136 | set_hugetlb_cgroup(page, parent); |
137 | out: | 137 | out: |
@@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | |||
166 | struct hugetlb_cgroup **ptr) | 166 | struct hugetlb_cgroup **ptr) |
167 | { | 167 | { |
168 | int ret = 0; | 168 | int ret = 0; |
169 | struct res_counter *fail_res; | 169 | struct page_counter *counter; |
170 | struct hugetlb_cgroup *h_cg = NULL; | 170 | struct hugetlb_cgroup *h_cg = NULL; |
171 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
172 | 171 | ||
173 | if (hugetlb_cgroup_disabled()) | 172 | if (hugetlb_cgroup_disabled()) |
174 | goto done; | 173 | goto done; |
@@ -187,7 +186,7 @@ again: | |||
187 | } | 186 | } |
188 | rcu_read_unlock(); | 187 | rcu_read_unlock(); |
189 | 188 | ||
190 | ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); | 189 | ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); |
191 | css_put(&h_cg->css); | 190 | css_put(&h_cg->css); |
192 | done: | 191 | done: |
193 | *ptr = h_cg; | 192 | *ptr = h_cg; |
@@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | |||
213 | struct page *page) | 212 | struct page *page) |
214 | { | 213 | { |
215 | struct hugetlb_cgroup *h_cg; | 214 | struct hugetlb_cgroup *h_cg; |
216 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
217 | 215 | ||
218 | if (hugetlb_cgroup_disabled()) | 216 | if (hugetlb_cgroup_disabled()) |
219 | return; | 217 | return; |
@@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | |||
222 | if (unlikely(!h_cg)) | 220 | if (unlikely(!h_cg)) |
223 | return; | 221 | return; |
224 | set_hugetlb_cgroup(page, NULL); | 222 | set_hugetlb_cgroup(page, NULL); |
225 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | 223 | page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); |
226 | return; | 224 | return; |
227 | } | 225 | } |
228 | 226 | ||
229 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | 227 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, |
230 | struct hugetlb_cgroup *h_cg) | 228 | struct hugetlb_cgroup *h_cg) |
231 | { | 229 | { |
232 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
233 | |||
234 | if (hugetlb_cgroup_disabled() || !h_cg) | 230 | if (hugetlb_cgroup_disabled() || !h_cg) |
235 | return; | 231 | return; |
236 | 232 | ||
237 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | 233 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) |
238 | return; | 234 | return; |
239 | 235 | ||
240 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | 236 | page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); |
241 | return; | 237 | return; |
242 | } | 238 | } |
243 | 239 | ||
240 | enum { | ||
241 | RES_USAGE, | ||
242 | RES_LIMIT, | ||
243 | RES_MAX_USAGE, | ||
244 | RES_FAILCNT, | ||
245 | }; | ||
246 | |||
244 | static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, | 247 | static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, |
245 | struct cftype *cft) | 248 | struct cftype *cft) |
246 | { | 249 | { |
247 | int idx, name; | 250 | struct page_counter *counter; |
248 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); | 251 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); |
249 | 252 | ||
250 | idx = MEMFILE_IDX(cft->private); | 253 | counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; |
251 | name = MEMFILE_ATTR(cft->private); | ||
252 | 254 | ||
253 | return res_counter_read_u64(&h_cg->hugepage[idx], name); | 255 | switch (MEMFILE_ATTR(cft->private)) { |
256 | case RES_USAGE: | ||
257 | return (u64)page_counter_read(counter) * PAGE_SIZE; | ||
258 | case RES_LIMIT: | ||
259 | return (u64)counter->limit * PAGE_SIZE; | ||
260 | case RES_MAX_USAGE: | ||
261 | return (u64)counter->watermark * PAGE_SIZE; | ||
262 | case RES_FAILCNT: | ||
263 | return counter->failcnt; | ||
264 | default: | ||
265 | BUG(); | ||
266 | } | ||
254 | } | 267 | } |
255 | 268 | ||
269 | static DEFINE_MUTEX(hugetlb_limit_mutex); | ||
270 | |||
256 | static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, | 271 | static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, |
257 | char *buf, size_t nbytes, loff_t off) | 272 | char *buf, size_t nbytes, loff_t off) |
258 | { | 273 | { |
259 | int idx, name, ret; | 274 | int ret, idx; |
260 | unsigned long long val; | 275 | unsigned long nr_pages; |
261 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); | 276 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); |
262 | 277 | ||
278 | if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ | ||
279 | return -EINVAL; | ||
280 | |||
263 | buf = strstrip(buf); | 281 | buf = strstrip(buf); |
282 | ret = page_counter_memparse(buf, &nr_pages); | ||
283 | if (ret) | ||
284 | return ret; | ||
285 | |||
264 | idx = MEMFILE_IDX(of_cft(of)->private); | 286 | idx = MEMFILE_IDX(of_cft(of)->private); |
265 | name = MEMFILE_ATTR(of_cft(of)->private); | ||
266 | 287 | ||
267 | switch (name) { | 288 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
268 | case RES_LIMIT: | 289 | case RES_LIMIT: |
269 | if (hugetlb_cgroup_is_root(h_cg)) { | 290 | mutex_lock(&hugetlb_limit_mutex); |
270 | /* Can't set limit on root */ | 291 | ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); |
271 | ret = -EINVAL; | 292 | mutex_unlock(&hugetlb_limit_mutex); |
272 | break; | ||
273 | } | ||
274 | /* This function does all necessary parse...reuse it */ | ||
275 | ret = res_counter_memparse_write_strategy(buf, &val); | ||
276 | if (ret) | ||
277 | break; | ||
278 | val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx])); | ||
279 | ret = res_counter_set_limit(&h_cg->hugepage[idx], val); | ||
280 | break; | 293 | break; |
281 | default: | 294 | default: |
282 | ret = -EINVAL; | 295 | ret = -EINVAL; |
@@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, | |||
288 | static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, | 301 | static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, |
289 | char *buf, size_t nbytes, loff_t off) | 302 | char *buf, size_t nbytes, loff_t off) |
290 | { | 303 | { |
291 | int idx, name, ret = 0; | 304 | int ret = 0; |
305 | struct page_counter *counter; | ||
292 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); | 306 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); |
293 | 307 | ||
294 | idx = MEMFILE_IDX(of_cft(of)->private); | 308 | counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; |
295 | name = MEMFILE_ATTR(of_cft(of)->private); | ||
296 | 309 | ||
297 | switch (name) { | 310 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
298 | case RES_MAX_USAGE: | 311 | case RES_MAX_USAGE: |
299 | res_counter_reset_max(&h_cg->hugepage[idx]); | 312 | page_counter_reset_watermark(counter); |
300 | break; | 313 | break; |
301 | case RES_FAILCNT: | 314 | case RES_FAILCNT: |
302 | res_counter_reset_failcnt(&h_cg->hugepage[idx]); | 315 | counter->failcnt = 0; |
303 | break; | 316 | break; |
304 | default: | 317 | default: |
305 | ret = -EINVAL; | 318 | ret = -EINVAL; |
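
The hugetlb_cgroup read/write/reset handlers above multiplex several control files per hstate through cft->private using the MEMFILE_PRIVATE() encoding shown in the hunk. A small illustration of that encoding; the decode macros are assumed to be the usual 16-bit split counterparts and are not part of the diff:

#include <stdio.h>

#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))	/* from the hunk above */
#define MEMFILE_IDX(val)	((val) >> 16)		/* assumed decode */
#define MEMFILE_ATTR(val)	((val) & 0xffff)	/* assumed decode */

enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

int main(void)
{
	/* Encode "limit file of hstate 1" into a single int for cft->private. */
	int private = MEMFILE_PRIVATE(1, RES_LIMIT);

	/* Decodes back to hstate index 1 and attribute RES_LIMIT (== 1). */
	printf("idx=%d attr=%d\n", MEMFILE_IDX(private), MEMFILE_ATTR(private));
	return 0;
}
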
diff --git a/mm/internal.h b/mm/internal.h index a4f90ba7068e..efad241f7014 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -161,13 +161,10 @@ struct compact_control { | |||
161 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 161 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
162 | enum migrate_mode mode; /* Async or sync migration mode */ | 162 | enum migrate_mode mode; /* Async or sync migration mode */ |
163 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ | 163 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
164 | bool finished_update_free; /* True when the zone cached pfns are | ||
165 | * no longer being updated | ||
166 | */ | ||
167 | bool finished_update_migrate; | ||
168 | |||
169 | int order; /* order a direct compactor needs */ | 164 | int order; /* order a direct compactor needs */ |
170 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ | 165 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ |
166 | const int alloc_flags; /* alloc flags of a direct compactor */ | ||
167 | const int classzone_idx; /* zone index of a direct compactor */ | ||
171 | struct zone *zone; | 168 | struct zone *zone; |
172 | int contended; /* Signal need_sched() or lock | 169 | int contended; /* Signal need_sched() or lock |
173 | * contention detected during | 170 | * contention detected during |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ee48428cf8e3..85df503ec023 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -25,7 +25,7 @@ | |||
25 | * GNU General Public License for more details. | 25 | * GNU General Public License for more details. |
26 | */ | 26 | */ |
27 | 27 | ||
28 | #include <linux/res_counter.h> | 28 | #include <linux/page_counter.h> |
29 | #include <linux/memcontrol.h> | 29 | #include <linux/memcontrol.h> |
30 | #include <linux/cgroup.h> | 30 | #include <linux/cgroup.h> |
31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
@@ -51,7 +51,7 @@ | |||
51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/vmpressure.h> | 52 | #include <linux/vmpressure.h> |
53 | #include <linux/mm_inline.h> | 53 | #include <linux/mm_inline.h> |
54 | #include <linux/page_cgroup.h> | 54 | #include <linux/swap_cgroup.h> |
55 | #include <linux/cpu.h> | 55 | #include <linux/cpu.h> |
56 | #include <linux/oom.h> | 56 | #include <linux/oom.h> |
57 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu { | |||
143 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 143 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
144 | }; | 144 | }; |
145 | 145 | ||
146 | struct mem_cgroup_reclaim_iter { | 146 | struct reclaim_iter { |
147 | /* | 147 | struct mem_cgroup *position; |
148 | * last scanned hierarchy member. Valid only if last_dead_count | ||
149 | * matches memcg->dead_count of the hierarchy root group. | ||
150 | */ | ||
151 | struct mem_cgroup *last_visited; | ||
152 | int last_dead_count; | ||
153 | |||
154 | /* scan generation, increased every round-trip */ | 148 | /* scan generation, increased every round-trip */ |
155 | unsigned int generation; | 149 | unsigned int generation; |
156 | }; | 150 | }; |
@@ -162,10 +156,10 @@ struct mem_cgroup_per_zone { | |||
162 | struct lruvec lruvec; | 156 | struct lruvec lruvec; |
163 | unsigned long lru_size[NR_LRU_LISTS]; | 157 | unsigned long lru_size[NR_LRU_LISTS]; |
164 | 158 | ||
165 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 159 | struct reclaim_iter iter[DEF_PRIORITY + 1]; |
166 | 160 | ||
167 | struct rb_node tree_node; /* RB tree node */ | 161 | struct rb_node tree_node; /* RB tree node */ |
168 | unsigned long long usage_in_excess;/* Set to the value by which */ | 162 | unsigned long usage_in_excess;/* Set to the value by which */ |
169 | /* the soft limit is exceeded*/ | 163 | /* the soft limit is exceeded*/ |
170 | bool on_tree; | 164 | bool on_tree; |
171 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 165 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
@@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly; | |||
198 | 192 | ||
199 | struct mem_cgroup_threshold { | 193 | struct mem_cgroup_threshold { |
200 | struct eventfd_ctx *eventfd; | 194 | struct eventfd_ctx *eventfd; |
201 | u64 threshold; | 195 | unsigned long threshold; |
202 | }; | 196 | }; |
203 | 197 | ||
204 | /* For threshold */ | 198 | /* For threshold */ |
@@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | |||
284 | */ | 278 | */ |
285 | struct mem_cgroup { | 279 | struct mem_cgroup { |
286 | struct cgroup_subsys_state css; | 280 | struct cgroup_subsys_state css; |
287 | /* | 281 | |
288 | * the counter to account for memory usage | 282 | /* Accounted resources */ |
289 | */ | 283 | struct page_counter memory; |
290 | struct res_counter res; | 284 | struct page_counter memsw; |
285 | struct page_counter kmem; | ||
286 | |||
287 | unsigned long soft_limit; | ||
291 | 288 | ||
292 | /* vmpressure notifications */ | 289 | /* vmpressure notifications */ |
293 | struct vmpressure vmpressure; | 290 | struct vmpressure vmpressure; |
@@ -296,15 +293,6 @@ struct mem_cgroup { | |||
296 | int initialized; | 293 | int initialized; |
297 | 294 | ||
298 | /* | 295 | /* |
299 | * the counter to account for mem+swap usage. | ||
300 | */ | ||
301 | struct res_counter memsw; | ||
302 | |||
303 | /* | ||
304 | * the counter to account for kernel memory usage. | ||
305 | */ | ||
306 | struct res_counter kmem; | ||
307 | /* | ||
308 | * Should the accounting and control be hierarchical, per subtree? | 296 | * Should the accounting and control be hierarchical, per subtree? |
309 | */ | 297 | */ |
310 | bool use_hierarchy; | 298 | bool use_hierarchy; |
@@ -352,7 +340,6 @@ struct mem_cgroup { | |||
352 | struct mem_cgroup_stat_cpu nocpu_base; | 340 | struct mem_cgroup_stat_cpu nocpu_base; |
353 | spinlock_t pcp_counter_lock; | 341 | spinlock_t pcp_counter_lock; |
354 | 342 | ||
355 | atomic_t dead_count; | ||
356 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | 343 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
357 | struct cg_proto tcp_mem; | 344 | struct cg_proto tcp_mem; |
358 | #endif | 345 | #endif |
@@ -382,7 +369,6 @@ struct mem_cgroup { | |||
382 | /* internal only representation about the status of kmem accounting. */ | 369 | /* internal only representation about the status of kmem accounting. */ |
383 | enum { | 370 | enum { |
384 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ | 371 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ |
385 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ | ||
386 | }; | 372 | }; |
387 | 373 | ||
388 | #ifdef CONFIG_MEMCG_KMEM | 374 | #ifdef CONFIG_MEMCG_KMEM |
@@ -396,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | |||
396 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | 382 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); |
397 | } | 383 | } |
398 | 384 | ||
399 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | ||
400 | { | ||
401 | /* | ||
402 | * Our caller must use css_get() first, because memcg_uncharge_kmem() | ||
403 | * will call css_put() if it sees the memcg is dead. | ||
404 | */ | ||
405 | smp_wmb(); | ||
406 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) | ||
407 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); | ||
408 | } | ||
409 | |||
410 | static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | ||
411 | { | ||
412 | return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, | ||
413 | &memcg->kmem_account_flags); | ||
414 | } | ||
415 | #endif | 385 | #endif |
416 | 386 | ||
417 | /* Stuffs for move charges at task migration. */ | 387 | /* Stuffs for move charges at task migration. */ |
@@ -650,7 +620,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg) | |||
650 | * This check can't live in kmem destruction function, | 620 | * This check can't live in kmem destruction function, |
651 | * since the charges will outlive the cgroup | 621 | * since the charges will outlive the cgroup |
652 | */ | 622 | */ |
653 | WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); | 623 | WARN_ON(page_counter_read(&memcg->kmem)); |
654 | } | 624 | } |
655 | #else | 625 | #else |
656 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | 626 | static void disarm_kmem_keys(struct mem_cgroup *memcg) |
@@ -664,8 +634,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg) | |||
664 | disarm_kmem_keys(memcg); | 634 | disarm_kmem_keys(memcg); |
665 | } | 635 | } |
666 | 636 | ||
667 | static void drain_all_stock_async(struct mem_cgroup *memcg); | ||
668 | |||
669 | static struct mem_cgroup_per_zone * | 637 | static struct mem_cgroup_per_zone * |
670 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) | 638 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) |
671 | { | 639 | { |
@@ -706,7 +674,7 @@ soft_limit_tree_from_page(struct page *page) | |||
706 | 674 | ||
707 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, | 675 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, |
708 | struct mem_cgroup_tree_per_zone *mctz, | 676 | struct mem_cgroup_tree_per_zone *mctz, |
709 | unsigned long long new_usage_in_excess) | 677 | unsigned long new_usage_in_excess) |
710 | { | 678 | { |
711 | struct rb_node **p = &mctz->rb_root.rb_node; | 679 | struct rb_node **p = &mctz->rb_root.rb_node; |
712 | struct rb_node *parent = NULL; | 680 | struct rb_node *parent = NULL; |
@@ -755,10 +723,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | |||
755 | spin_unlock_irqrestore(&mctz->lock, flags); | 723 | spin_unlock_irqrestore(&mctz->lock, flags); |
756 | } | 724 | } |
757 | 725 | ||
726 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) | ||
727 | { | ||
728 | unsigned long nr_pages = page_counter_read(&memcg->memory); | ||
729 | unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); | ||
730 | unsigned long excess = 0; | ||
731 | |||
732 | if (nr_pages > soft_limit) | ||
733 | excess = nr_pages - soft_limit; | ||
734 | |||
735 | return excess; | ||
736 | } | ||
758 | 737 | ||
759 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | 738 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) |
760 | { | 739 | { |
761 | unsigned long long excess; | 740 | unsigned long excess; |
762 | struct mem_cgroup_per_zone *mz; | 741 | struct mem_cgroup_per_zone *mz; |
763 | struct mem_cgroup_tree_per_zone *mctz; | 742 | struct mem_cgroup_tree_per_zone *mctz; |
764 | 743 | ||
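
The new soft_limit_excess() helper derives the soft-limit excess directly from page_counter_read() and the cached soft_limit. A tiny standalone sketch of the same calculation, with hypothetical page counts:

#include <stdio.h>

/* Excess is how many pages usage sits above the soft limit, clamped at 0. */
static unsigned long excess(unsigned long usage, unsigned long soft_limit)
{
	return usage > soft_limit ? usage - soft_limit : 0;
}

int main(void)
{
	printf("%lu\n", excess(300, 256));	/* 44 pages over the soft limit */
	printf("%lu\n", excess(100, 256));	/* 0: under the soft limit */
	return 0;
}
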
@@ -769,7 +748,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | |||
769 | */ | 748 | */ |
770 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | 749 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { |
771 | mz = mem_cgroup_page_zoneinfo(memcg, page); | 750 | mz = mem_cgroup_page_zoneinfo(memcg, page); |
772 | excess = res_counter_soft_limit_excess(&memcg->res); | 751 | excess = soft_limit_excess(memcg); |
773 | /* | 752 | /* |
774 | * We have to update the tree if mz is on RB-tree or | 753 | * We have to update the tree if mz is on RB-tree or |
775 | * mem is over its softlimit. | 754 | * mem is over its softlimit. |
@@ -825,7 +804,7 @@ retry: | |||
825 | * position in the tree. | 804 | * position in the tree. |
826 | */ | 805 | */ |
827 | __mem_cgroup_remove_exceeded(mz, mctz); | 806 | __mem_cgroup_remove_exceeded(mz, mctz); |
828 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || | 807 | if (!soft_limit_excess(mz->memcg) || |
829 | !css_tryget_online(&mz->memcg->css)) | 808 | !css_tryget_online(&mz->memcg->css)) |
830 | goto retry; | 809 | goto retry; |
831 | done: | 810 | done: |
@@ -1062,122 +1041,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1062 | return memcg; | 1041 | return memcg; |
1063 | } | 1042 | } |
1064 | 1043 | ||
1065 | /* | ||
1066 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | ||
1067 | * ref. count) or NULL if the whole root's subtree has been visited. | ||
1068 | * | ||
1069 | * helper function to be used by mem_cgroup_iter | ||
1070 | */ | ||
1071 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | ||
1072 | struct mem_cgroup *last_visited) | ||
1073 | { | ||
1074 | struct cgroup_subsys_state *prev_css, *next_css; | ||
1075 | |||
1076 | prev_css = last_visited ? &last_visited->css : NULL; | ||
1077 | skip_node: | ||
1078 | next_css = css_next_descendant_pre(prev_css, &root->css); | ||
1079 | |||
1080 | /* | ||
1081 | * Even if we found a group we have to make sure it is | ||
1082 | * alive. css && !memcg means that the groups should be | ||
1083 | * skipped and we should continue the tree walk. | ||
1084 | * last_visited css is safe to use because it is | ||
1085 | * protected by css_get and the tree walk is rcu safe. | ||
1086 | * | ||
1087 | * We do not take a reference on the root of the tree walk | ||
1088 | * because we might race with the root removal when it would | ||
1089 | * be the only node in the iterated hierarchy and mem_cgroup_iter | ||
1090 | * would end up in an endless loop because it expects that at | ||
1091 | * least one valid node will be returned. Root cannot disappear | ||
1092 | * because caller of the iterator should hold it already so | ||
1093 | * skipping css reference should be safe. | ||
1094 | */ | ||
1095 | if (next_css) { | ||
1096 | struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); | ||
1097 | |||
1098 | if (next_css == &root->css) | ||
1099 | return memcg; | ||
1100 | |||
1101 | if (css_tryget_online(next_css)) { | ||
1102 | /* | ||
1103 | * Make sure the memcg is initialized: | ||
1104 | * mem_cgroup_css_online() orders the the | ||
1105 | * initialization against setting the flag. | ||
1106 | */ | ||
1107 | if (smp_load_acquire(&memcg->initialized)) | ||
1108 | return memcg; | ||
1109 | css_put(next_css); | ||
1110 | } | ||
1111 | |||
1112 | prev_css = next_css; | ||
1113 | goto skip_node; | ||
1114 | } | ||
1115 | |||
1116 | return NULL; | ||
1117 | } | ||
1118 | |||
1119 | static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) | ||
1120 | { | ||
1121 | /* | ||
1122 | * When a group in the hierarchy below root is destroyed, the | ||
1123 | * hierarchy iterator can no longer be trusted since it might | ||
1124 | * have pointed to the destroyed group. Invalidate it. | ||
1125 | */ | ||
1126 | atomic_inc(&root->dead_count); | ||
1127 | } | ||
1128 | |||
1129 | static struct mem_cgroup * | ||
1130 | mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, | ||
1131 | struct mem_cgroup *root, | ||
1132 | int *sequence) | ||
1133 | { | ||
1134 | struct mem_cgroup *position = NULL; | ||
1135 | /* | ||
1136 | * A cgroup destruction happens in two stages: offlining and | ||
1137 | * release. They are separated by a RCU grace period. | ||
1138 | * | ||
1139 | * If the iterator is valid, we may still race with an | ||
1140 | * offlining. The RCU lock ensures the object won't be | ||
1141 | * released, tryget will fail if we lost the race. | ||
1142 | */ | ||
1143 | *sequence = atomic_read(&root->dead_count); | ||
1144 | if (iter->last_dead_count == *sequence) { | ||
1145 | smp_rmb(); | ||
1146 | position = iter->last_visited; | ||
1147 | |||
1148 | /* | ||
1149 | * We cannot take a reference to root because we might race | ||
1150 | * with root removal and returning NULL would end up in | ||
1151 | * an endless loop on the iterator user level when root | ||
1152 | * would be returned all the time. | ||
1153 | */ | ||
1154 | if (position && position != root && | ||
1155 | !css_tryget_online(&position->css)) | ||
1156 | position = NULL; | ||
1157 | } | ||
1158 | return position; | ||
1159 | } | ||
1160 | |||
1161 | static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | ||
1162 | struct mem_cgroup *last_visited, | ||
1163 | struct mem_cgroup *new_position, | ||
1164 | struct mem_cgroup *root, | ||
1165 | int sequence) | ||
1166 | { | ||
1167 | /* root reference counting symmetric to mem_cgroup_iter_load */ | ||
1168 | if (last_visited && last_visited != root) | ||
1169 | css_put(&last_visited->css); | ||
1170 | /* | ||
1171 | * We store the sequence count from the time @last_visited was | ||
1172 | * loaded successfully instead of rereading it here so that we | ||
1173 | * don't lose destruction events in between. We could have | ||
1174 | * raced with the destruction of @new_position after all. | ||
1175 | */ | ||
1176 | iter->last_visited = new_position; | ||
1177 | smp_wmb(); | ||
1178 | iter->last_dead_count = sequence; | ||
1179 | } | ||
1180 | |||
1181 | /** | 1044 | /** |
1182 | * mem_cgroup_iter - iterate over memory cgroup hierarchy | 1045 | * mem_cgroup_iter - iterate over memory cgroup hierarchy |
1183 | * @root: hierarchy root | 1046 | * @root: hierarchy root |
@@ -1199,8 +1062,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1199 | struct mem_cgroup *prev, | 1062 | struct mem_cgroup *prev, |
1200 | struct mem_cgroup_reclaim_cookie *reclaim) | 1063 | struct mem_cgroup_reclaim_cookie *reclaim) |
1201 | { | 1064 | { |
1065 | struct reclaim_iter *uninitialized_var(iter); | ||
1066 | struct cgroup_subsys_state *css = NULL; | ||
1202 | struct mem_cgroup *memcg = NULL; | 1067 | struct mem_cgroup *memcg = NULL; |
1203 | struct mem_cgroup *last_visited = NULL; | 1068 | struct mem_cgroup *pos = NULL; |
1204 | 1069 | ||
1205 | if (mem_cgroup_disabled()) | 1070 | if (mem_cgroup_disabled()) |
1206 | return NULL; | 1071 | return NULL; |
@@ -1209,50 +1074,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1209 | root = root_mem_cgroup; | 1074 | root = root_mem_cgroup; |
1210 | 1075 | ||
1211 | if (prev && !reclaim) | 1076 | if (prev && !reclaim) |
1212 | last_visited = prev; | 1077 | pos = prev; |
1213 | 1078 | ||
1214 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1079 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
1215 | if (prev) | 1080 | if (prev) |
1216 | goto out_css_put; | 1081 | goto out; |
1217 | return root; | 1082 | return root; |
1218 | } | 1083 | } |
1219 | 1084 | ||
1220 | rcu_read_lock(); | 1085 | rcu_read_lock(); |
1221 | while (!memcg) { | ||
1222 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); | ||
1223 | int uninitialized_var(seq); | ||
1224 | |||
1225 | if (reclaim) { | ||
1226 | struct mem_cgroup_per_zone *mz; | ||
1227 | |||
1228 | mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); | ||
1229 | iter = &mz->reclaim_iter[reclaim->priority]; | ||
1230 | if (prev && reclaim->generation != iter->generation) { | ||
1231 | iter->last_visited = NULL; | ||
1232 | goto out_unlock; | ||
1233 | } | ||
1234 | 1086 | ||
1235 | last_visited = mem_cgroup_iter_load(iter, root, &seq); | 1087 | if (reclaim) { |
1088 | struct mem_cgroup_per_zone *mz; | ||
1089 | |||
1090 | mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); | ||
1091 | iter = &mz->iter[reclaim->priority]; | ||
1092 | |||
1093 | if (prev && reclaim->generation != iter->generation) | ||
1094 | goto out_unlock; | ||
1095 | |||
1096 | do { | ||
1097 | pos = ACCESS_ONCE(iter->position); | ||
1098 | /* | ||
1099 | * A racing update may change the position and | ||
1100 | * put the last reference, hence css_tryget(), | ||
1101 | * or retry to see the updated position. | ||
1102 | */ | ||
1103 | } while (pos && !css_tryget(&pos->css)); | ||
1104 | } | ||
1105 | |||
1106 | if (pos) | ||
1107 | css = &pos->css; | ||
1108 | |||
1109 | for (;;) { | ||
1110 | css = css_next_descendant_pre(css, &root->css); | ||
1111 | if (!css) { | ||
1112 | /* | ||
1113 | * Reclaimers share the hierarchy walk, and a | ||
1114 | * new one might jump in right at the end of | ||
1115 | * the hierarchy - make sure they see at least | ||
1116 | * one group and restart from the beginning. | ||
1117 | */ | ||
1118 | if (!prev) | ||
1119 | continue; | ||
1120 | break; | ||
1236 | } | 1121 | } |
1237 | 1122 | ||
1238 | memcg = __mem_cgroup_iter_next(root, last_visited); | 1123 | /* |
1124 | * Verify the css and acquire a reference. The root | ||
1125 | * is provided by the caller, so we know it's alive | ||
1126 | * and kicking, and don't take an extra reference. | ||
1127 | */ | ||
1128 | memcg = mem_cgroup_from_css(css); | ||
1129 | |||
1130 | if (css == &root->css) | ||
1131 | break; | ||
1239 | 1132 | ||
1240 | if (reclaim) { | 1133 | if (css_tryget(css)) { |
1241 | mem_cgroup_iter_update(iter, last_visited, memcg, root, | 1134 | /* |
1242 | seq); | 1135 | * Make sure the memcg is initialized: |
1136 | * mem_cgroup_css_online() orders the the | ||
1137 | * initialization against setting the flag. | ||
1138 | */ | ||
1139 | if (smp_load_acquire(&memcg->initialized)) | ||
1140 | break; | ||
1243 | 1141 | ||
1244 | if (!memcg) | 1142 | css_put(css); |
1245 | iter->generation++; | ||
1246 | else if (!prev && memcg) | ||
1247 | reclaim->generation = iter->generation; | ||
1248 | } | 1143 | } |
1249 | 1144 | ||
1250 | if (prev && !memcg) | 1145 | memcg = NULL; |
1251 | goto out_unlock; | 1146 | } |
1147 | |||
1148 | if (reclaim) { | ||
1149 | if (cmpxchg(&iter->position, pos, memcg) == pos) { | ||
1150 | if (memcg) | ||
1151 | css_get(&memcg->css); | ||
1152 | if (pos) | ||
1153 | css_put(&pos->css); | ||
1154 | } | ||
1155 | |||
1156 | /* | ||
1157 | * pairs with css_tryget when dereferencing iter->position | ||
1158 | * above. | ||
1159 | */ | ||
1160 | if (pos) | ||
1161 | css_put(&pos->css); | ||
1162 | |||
1163 | if (!memcg) | ||
1164 | iter->generation++; | ||
1165 | else if (!prev) | ||
1166 | reclaim->generation = iter->generation; | ||
1252 | } | 1167 | } |
1168 | |||
1253 | out_unlock: | 1169 | out_unlock: |
1254 | rcu_read_unlock(); | 1170 | rcu_read_unlock(); |
1255 | out_css_put: | 1171 | out: |
1256 | if (prev && prev != root) | 1172 | if (prev && prev != root) |
1257 | css_put(&prev->css); | 1173 | css_put(&prev->css); |
1258 | 1174 | ||
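
The rewrite above replaces the last_visited/dead_count scheme with a single cached position pointer: readers pin it with css_tryget() (retrying if the group is already dying) and the winner of a cmpxchg publishes the next position. A simplified single-file C model of that protocol follows; the node type and its refcounting are stand-ins for the css API, not kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* A refcounted node standing in for a css; refs == 0 means it is dead. */
struct node {
	atomic_int refs;
};

static bool node_tryget(struct node *n)
{
	int r = atomic_load(&n->refs);

	while (r > 0)
		if (atomic_compare_exchange_weak(&n->refs, &r, r + 1))
			return true;
	return false;
}

static void node_get(struct node *n) { atomic_fetch_add(&n->refs, 1); }
static void node_put(struct node *n) { atomic_fetch_sub(&n->refs, 1); }

/* Shared iterator cache: one pointer that concurrent walkers update. */
struct iter {
	_Atomic(struct node *) position;
};

/* Load the cached position, retrying until it is pinned or seen as NULL. */
static struct node *iter_load(struct iter *it)
{
	struct node *pos;

	do {
		pos = atomic_load(&it->position);
	} while (pos && !node_tryget(pos));
	return pos;
}

/* Publish the next position; only the compare-and-swap winner moves the
 * cache's reference from the old node to the new one. */
static void iter_update(struct iter *it, struct node *old, struct node *new)
{
	struct node *expected = old;

	if (atomic_compare_exchange_strong(&it->position, &expected, new)) {
		if (new)
			node_get(new);	/* reference now held by the cache */
		if (old)
			node_put(old);	/* cache's reference on the old node */
	}
	if (old)
		node_put(old);		/* pairs with the pin in iter_load() */
}

int main(void)
{
	struct node a = { .refs = 1 }, b = { .refs = 1 };
	struct iter it = { .position = &a };
	struct node *pos = iter_load(&it);	/* pins a */

	iter_update(&it, pos, &b);		/* cache now points at b */
	return 0;
}
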
@@ -1346,15 +1262,18 @@ out: | |||
1346 | } | 1262 | } |
1347 | 1263 | ||
1348 | /** | 1264 | /** |
1349 | * mem_cgroup_page_lruvec - return lruvec for adding an lru page | 1265 | * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page |
1350 | * @page: the page | 1266 | * @page: the page |
1351 | * @zone: zone of the page | 1267 | * @zone: zone of the page |
1268 | * | ||
1269 | * This function is only safe when following the LRU page isolation | ||
1270 | * and putback protocol: the LRU lock must be held, and the page must | ||
1271 | * either be PageLRU() or the caller must have isolated/allocated it. | ||
1352 | */ | 1272 | */ |
1353 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) | 1273 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) |
1354 | { | 1274 | { |
1355 | struct mem_cgroup_per_zone *mz; | 1275 | struct mem_cgroup_per_zone *mz; |
1356 | struct mem_cgroup *memcg; | 1276 | struct mem_cgroup *memcg; |
1357 | struct page_cgroup *pc; | ||
1358 | struct lruvec *lruvec; | 1277 | struct lruvec *lruvec; |
1359 | 1278 | ||
1360 | if (mem_cgroup_disabled()) { | 1279 | if (mem_cgroup_disabled()) { |
@@ -1362,20 +1281,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) | |||
1362 | goto out; | 1281 | goto out; |
1363 | } | 1282 | } |
1364 | 1283 | ||
1365 | pc = lookup_page_cgroup(page); | 1284 | memcg = page->mem_cgroup; |
1366 | memcg = pc->mem_cgroup; | ||
1367 | |||
1368 | /* | 1285 | /* |
1369 | * Surreptitiously switch any uncharged offlist page to root: | 1286 | * Swapcache readahead pages are added to the LRU - and |
1370 | * an uncharged page off lru does nothing to secure | 1287 | * possibly migrated - before they are charged. |
1371 | * its former mem_cgroup from sudden removal. | ||
1372 | * | ||
1373 | * Our caller holds lru_lock, and PageCgroupUsed is updated | ||
1374 | * under page_cgroup lock: between them, they make all uses | ||
1375 | * of pc->mem_cgroup safe. | ||
1376 | */ | 1288 | */ |
1377 | if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) | 1289 | if (!memcg) |
1378 | pc->mem_cgroup = memcg = root_mem_cgroup; | 1290 | memcg = root_mem_cgroup; |
1379 | 1291 | ||
1380 | mz = mem_cgroup_page_zoneinfo(memcg, page); | 1292 | mz = mem_cgroup_page_zoneinfo(memcg, page); |
1381 | lruvec = &mz->lruvec; | 1293 | lruvec = &mz->lruvec; |
@@ -1414,41 +1326,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, | |||
1414 | VM_BUG_ON((long)(*lru_size) < 0); | 1326 | VM_BUG_ON((long)(*lru_size) < 0); |
1415 | } | 1327 | } |
1416 | 1328 | ||
1417 | /* | 1329 | bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) |
1418 | * Checks whether given mem is same or in the root_mem_cgroup's | ||
1419 | * hierarchy subtree | ||
1420 | */ | ||
1421 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | ||
1422 | struct mem_cgroup *memcg) | ||
1423 | { | 1330 | { |
1424 | if (root_memcg == memcg) | 1331 | if (root == memcg) |
1425 | return true; | 1332 | return true; |
1426 | if (!root_memcg->use_hierarchy || !memcg) | 1333 | if (!root->use_hierarchy) |
1427 | return false; | 1334 | return false; |
1428 | return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); | 1335 | return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); |
1429 | } | ||
1430 | |||
1431 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | ||
1432 | struct mem_cgroup *memcg) | ||
1433 | { | ||
1434 | bool ret; | ||
1435 | |||
1436 | rcu_read_lock(); | ||
1437 | ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); | ||
1438 | rcu_read_unlock(); | ||
1439 | return ret; | ||
1440 | } | 1336 | } |
1441 | 1337 | ||
1442 | bool task_in_mem_cgroup(struct task_struct *task, | 1338 | bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) |
1443 | const struct mem_cgroup *memcg) | ||
1444 | { | 1339 | { |
1445 | struct mem_cgroup *curr = NULL; | 1340 | struct mem_cgroup *task_memcg; |
1446 | struct task_struct *p; | 1341 | struct task_struct *p; |
1447 | bool ret; | 1342 | bool ret; |
1448 | 1343 | ||
1449 | p = find_lock_task_mm(task); | 1344 | p = find_lock_task_mm(task); |
1450 | if (p) { | 1345 | if (p) { |
1451 | curr = get_mem_cgroup_from_mm(p->mm); | 1346 | task_memcg = get_mem_cgroup_from_mm(p->mm); |
1452 | task_unlock(p); | 1347 | task_unlock(p); |
1453 | } else { | 1348 | } else { |
1454 | /* | 1349 | /* |
@@ -1457,19 +1352,12 @@ bool task_in_mem_cgroup(struct task_struct *task, | |||
1457 | * killed to prevent needlessly killing additional tasks. | 1352 | * killed to prevent needlessly killing additional tasks. |
1458 | */ | 1353 | */ |
1459 | rcu_read_lock(); | 1354 | rcu_read_lock(); |
1460 | curr = mem_cgroup_from_task(task); | 1355 | task_memcg = mem_cgroup_from_task(task); |
1461 | if (curr) | 1356 | css_get(&task_memcg->css); |
1462 | css_get(&curr->css); | ||
1463 | rcu_read_unlock(); | 1357 | rcu_read_unlock(); |
1464 | } | 1358 | } |
1465 | /* | 1359 | ret = mem_cgroup_is_descendant(task_memcg, memcg); |
1466 | * We should check use_hierarchy of "memcg" not "curr". Because checking | 1360 | css_put(&task_memcg->css); |
1467 | * use_hierarchy of "curr" here make this function true if hierarchy is | ||
1468 | * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* | ||
1469 | * hierarchy(even if use_hierarchy is disabled in "memcg"). | ||
1470 | */ | ||
1471 | ret = mem_cgroup_same_or_subtree(memcg, curr); | ||
1472 | css_put(&curr->css); | ||
1473 | return ret; | 1361 | return ret; |
1474 | } | 1362 | } |
1475 | 1363 | ||
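
mem_cgroup_is_descendant() reduces the old same-or-subtree checks to a plain ancestry test. A standalone sketch of what that test amounts to, using a made-up struct with an explicit parent pointer rather than the cgroup core's cgroup_is_descendant():

#include <stdbool.h>
#include <stddef.h>

struct mc {
	struct mc *parent;
	bool use_hierarchy;
};

static bool is_descendant(struct mc *memcg, struct mc *root)
{
	if (memcg == root)
		return true;
	if (!root->use_hierarchy)
		return false;
	for (struct mc *p = memcg; p; p = p->parent)
		if (p == root)
			return true;
	return false;
}

int main(void)
{
	struct mc root = { .parent = NULL, .use_hierarchy = true };
	struct mc child = { .parent = &root, .use_hierarchy = true };

	return is_descendant(&child, &root) ? 0 : 1;	/* descendant: exit 0 */
}
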
@@ -1492,7 +1380,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
1492 | return inactive * inactive_ratio < active; | 1380 | return inactive * inactive_ratio < active; |
1493 | } | 1381 | } |
1494 | 1382 | ||
1495 | #define mem_cgroup_from_res_counter(counter, member) \ | 1383 | #define mem_cgroup_from_counter(counter, member) \ |
1496 | container_of(counter, struct mem_cgroup, member) | 1384 | container_of(counter, struct mem_cgroup, member) |
1497 | 1385 | ||
1498 | /** | 1386 | /** |
@@ -1504,12 +1392,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
1504 | */ | 1392 | */ |
1505 | static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) | 1393 | static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) |
1506 | { | 1394 | { |
1507 | unsigned long long margin; | 1395 | unsigned long margin = 0; |
1396 | unsigned long count; | ||
1397 | unsigned long limit; | ||
1508 | 1398 | ||
1509 | margin = res_counter_margin(&memcg->res); | 1399 | count = page_counter_read(&memcg->memory); |
1510 | if (do_swap_account) | 1400 | limit = ACCESS_ONCE(memcg->memory.limit); |
1511 | margin = min(margin, res_counter_margin(&memcg->memsw)); | 1401 | if (count < limit) |
1512 | return margin >> PAGE_SHIFT; | 1402 | margin = limit - count; |
1403 | |||
1404 | if (do_swap_account) { | ||
1405 | count = page_counter_read(&memcg->memsw); | ||
1406 | limit = ACCESS_ONCE(memcg->memsw.limit); | ||
1407 | if (count <= limit) | ||
1408 | margin = min(margin, limit - count); | ||
1409 | } | ||
1410 | |||
1411 | return margin; | ||
1513 | } | 1412 | } |
1514 | 1413 | ||
1515 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) | 1414 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) |
@@ -1522,37 +1421,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) | |||
1522 | } | 1421 | } |
1523 | 1422 | ||
1524 | /* | 1423 | /* |
1525 | * memcg->moving_account is used for checking possibility that some thread is | ||
1526 | * calling move_account(). When a thread on CPU-A starts moving pages under | ||
1527 | * a memcg, other threads should check memcg->moving_account under | ||
1528 | * rcu_read_lock(), like this: | ||
1529 | * | ||
1530 | * CPU-A CPU-B | ||
1531 | * rcu_read_lock() | ||
1532 | * memcg->moving_account+1 if (memcg->mocing_account) | ||
1533 | * take heavy locks. | ||
1534 | * synchronize_rcu() update something. | ||
1535 | * rcu_read_unlock() | ||
1536 | * start move here. | ||
1537 | */ | ||
1538 | |||
1539 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | ||
1540 | { | ||
1541 | atomic_inc(&memcg->moving_account); | ||
1542 | synchronize_rcu(); | ||
1543 | } | ||
1544 | |||
1545 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) | ||
1546 | { | ||
1547 | /* | ||
1548 | * Now, mem_cgroup_clear_mc() may call this function with NULL. | ||
1549 | * We check NULL in callee rather than caller. | ||
1550 | */ | ||
1551 | if (memcg) | ||
1552 | atomic_dec(&memcg->moving_account); | ||
1553 | } | ||
1554 | |||
1555 | /* | ||
1556 | * A routine for checking "mem" is under move_account() or not. | 1424 | * A routine for checking "mem" is under move_account() or not. |
1557 | * | 1425 | * |
1558 | * Checking a cgroup is mc.from or mc.to or under hierarchy of | 1426 | * Checking a cgroup is mc.from or mc.to or under hierarchy of |
@@ -1574,8 +1442,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | |||
1574 | if (!from) | 1442 | if (!from) |
1575 | goto unlock; | 1443 | goto unlock; |
1576 | 1444 | ||
1577 | ret = mem_cgroup_same_or_subtree(memcg, from) | 1445 | ret = mem_cgroup_is_descendant(from, memcg) || |
1578 | || mem_cgroup_same_or_subtree(memcg, to); | 1446 | mem_cgroup_is_descendant(to, memcg); |
1579 | unlock: | 1447 | unlock: |
1580 | spin_unlock(&mc.lock); | 1448 | spin_unlock(&mc.lock); |
1581 | return ret; | 1449 | return ret; |
@@ -1597,23 +1465,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1597 | return false; | 1465 | return false; |
1598 | } | 1466 | } |
1599 | 1467 | ||
1600 | /* | ||
1601 | * Take this lock when | ||
1602 | * - a code tries to modify page's memcg while it's USED. | ||
1603 | * - a code tries to modify page state accounting in a memcg. | ||
1604 | */ | ||
1605 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | ||
1606 | unsigned long *flags) | ||
1607 | { | ||
1608 | spin_lock_irqsave(&memcg->move_lock, *flags); | ||
1609 | } | ||
1610 | |||
1611 | static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | ||
1612 | unsigned long *flags) | ||
1613 | { | ||
1614 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | ||
1615 | } | ||
1616 | |||
1617 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 1468 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
1618 | /** | 1469 | /** |
1619 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. | 1470 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. |
@@ -1644,18 +1495,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1644 | 1495 | ||
1645 | rcu_read_unlock(); | 1496 | rcu_read_unlock(); |
1646 | 1497 | ||
1647 | pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", | 1498 | pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", |
1648 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, | 1499 | K((u64)page_counter_read(&memcg->memory)), |
1649 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, | 1500 | K((u64)memcg->memory.limit), memcg->memory.failcnt); |
1650 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); | 1501 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", |
1651 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", | 1502 | K((u64)page_counter_read(&memcg->memsw)), |
1652 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1503 | K((u64)memcg->memsw.limit), memcg->memsw.failcnt); |
1653 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1504 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", |
1654 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1505 | K((u64)page_counter_read(&memcg->kmem)), |
1655 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", | 1506 | K((u64)memcg->kmem.limit), memcg->kmem.failcnt); |
1656 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, | ||
1657 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, | ||
1658 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); | ||
1659 | 1507 | ||
1660 | for_each_mem_cgroup_tree(iter, memcg) { | 1508 | for_each_mem_cgroup_tree(iter, memcg) { |
1661 | pr_info("Memory cgroup stats for "); | 1509 | pr_info("Memory cgroup stats for "); |
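
With the counters kept in pages, the OOM report converts to kilobytes with the existing K() macro instead of shifting byte values down by 10. A small standalone illustration of that conversion, assuming 4K pages and a hypothetical usage value:

#include <stdio.h>

#define PAGE_SHIFT 12				/* 4K pages assumed */
#define K(x) ((x) << (PAGE_SHIFT - 10))		/* page count -> kilobytes */

int main(void)
{
	unsigned long long usage = 2560;	/* e.g. a page_counter_read() result */

	printf("usage %llukB\n", K(usage));	/* 10240 kB, i.e. 10 MB */
	return 0;
}
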
@@ -1695,28 +1543,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) | |||
1695 | /* | 1543 | /* |
1696 | * Return the memory (and swap, if configured) limit for a memcg. | 1544 | * Return the memory (and swap, if configured) limit for a memcg. |
1697 | */ | 1545 | */ |
1698 | static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1546 | static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1699 | { | 1547 | { |
1700 | u64 limit; | 1548 | unsigned long limit; |
1701 | |||
1702 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
1703 | 1549 | ||
1704 | /* | 1550 | limit = memcg->memory.limit; |
1705 | * Do not consider swap space if we cannot swap due to swappiness | ||
1706 | */ | ||
1707 | if (mem_cgroup_swappiness(memcg)) { | 1551 | if (mem_cgroup_swappiness(memcg)) { |
1708 | u64 memsw; | 1552 | unsigned long memsw_limit; |
1709 | 1553 | ||
1710 | limit += total_swap_pages << PAGE_SHIFT; | 1554 | memsw_limit = memcg->memsw.limit; |
1711 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 1555 | limit = min(limit + total_swap_pages, memsw_limit); |
1712 | |||
1713 | /* | ||
1714 | * If memsw is finite and limits the amount of swap space | ||
1715 | * available to this memcg, return that limit. | ||
1716 | */ | ||
1717 | limit = min(limit, memsw); | ||
1718 | } | 1556 | } |
1719 | |||
1720 | return limit; | 1557 | return limit; |
1721 | } | 1558 | } |
1722 | 1559 | ||
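
The swap-aware limit used for OOM scoring becomes the memory limit plus all of swap, capped by the memory+swap limit. A standalone example with hypothetical page counts:

#include <stdio.h>

int main(void)
{
	unsigned long memory_limit = 262144;		/* 1G in 4K pages */
	unsigned long total_swap_pages = 131072;	/* 512M of swap */
	unsigned long memsw_limit = 327680;		/* 1.25G memory+swap limit */
	unsigned long limit = memory_limit + total_swap_pages;

	if (limit > memsw_limit)
		limit = memsw_limit;		/* memsw caps the total */
	printf("%lu pages\n", limit);		/* 327680 */
	return 0;
}
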
@@ -1740,7 +1577,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1740 | } | 1577 | } |
1741 | 1578 | ||
1742 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | 1579 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
1743 | totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | 1580 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; |
1744 | for_each_mem_cgroup_tree(iter, memcg) { | 1581 | for_each_mem_cgroup_tree(iter, memcg) { |
1745 | struct css_task_iter it; | 1582 | struct css_task_iter it; |
1746 | struct task_struct *task; | 1583 | struct task_struct *task; |
@@ -1880,52 +1717,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1880 | memcg->last_scanned_node = node; | 1717 | memcg->last_scanned_node = node; |
1881 | return node; | 1718 | return node; |
1882 | } | 1719 | } |
1883 | |||
1884 | /* | ||
1885 | * Check all nodes whether it contains reclaimable pages or not. | ||
1886 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
1887 | * unused nodes. But scan_nodes is lazily updated and may not cotain | ||
1888 | * enough new information. We need to do double check. | ||
1889 | */ | ||
1890 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
1891 | { | ||
1892 | int nid; | ||
1893 | |||
1894 | /* | ||
1895 | * quick check...making use of scan_node. | ||
1896 | * We can skip unused nodes. | ||
1897 | */ | ||
1898 | if (!nodes_empty(memcg->scan_nodes)) { | ||
1899 | for (nid = first_node(memcg->scan_nodes); | ||
1900 | nid < MAX_NUMNODES; | ||
1901 | nid = next_node(nid, memcg->scan_nodes)) { | ||
1902 | |||
1903 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1904 | return true; | ||
1905 | } | ||
1906 | } | ||
1907 | /* | ||
1908 | * Check rest of nodes. | ||
1909 | */ | ||
1910 | for_each_node_state(nid, N_MEMORY) { | ||
1911 | if (node_isset(nid, memcg->scan_nodes)) | ||
1912 | continue; | ||
1913 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1914 | return true; | ||
1915 | } | ||
1916 | return false; | ||
1917 | } | ||
1918 | |||
1919 | #else | 1720 | #else |
1920 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1721 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1921 | { | 1722 | { |
1922 | return 0; | 1723 | return 0; |
1923 | } | 1724 | } |
1924 | |||
1925 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
1926 | { | ||
1927 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); | ||
1928 | } | ||
1929 | #endif | 1725 | #endif |
1930 | 1726 | ||
1931 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | 1727 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
@@ -1943,7 +1739,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | |||
1943 | .priority = 0, | 1739 | .priority = 0, |
1944 | }; | 1740 | }; |
1945 | 1741 | ||
1946 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | 1742 | excess = soft_limit_excess(root_memcg); |
1947 | 1743 | ||
1948 | while (1) { | 1744 | while (1) { |
1949 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | 1745 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); |
@@ -1969,12 +1765,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | |||
1969 | } | 1765 | } |
1970 | continue; | 1766 | continue; |
1971 | } | 1767 | } |
1972 | if (!mem_cgroup_reclaimable(victim, false)) | ||
1973 | continue; | ||
1974 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | 1768 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, |
1975 | zone, &nr_scanned); | 1769 | zone, &nr_scanned); |
1976 | *total_scanned += nr_scanned; | 1770 | *total_scanned += nr_scanned; |
1977 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | 1771 | if (!soft_limit_excess(root_memcg)) |
1978 | break; | 1772 | break; |
1979 | } | 1773 | } |
1980 | mem_cgroup_iter_break(root_memcg, victim); | 1774 | mem_cgroup_iter_break(root_memcg, victim); |
@@ -2081,12 +1875,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait, | |||
2081 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1875 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
2082 | oom_wait_memcg = oom_wait_info->memcg; | 1876 | oom_wait_memcg = oom_wait_info->memcg; |
2083 | 1877 | ||
2084 | /* | 1878 | if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && |
2085 | * Both of oom_wait_info->memcg and wake_memcg are stable under us. | 1879 | !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) |
2086 | * Then we can use css_is_ancestor without taking care of RCU. | ||
2087 | */ | ||
2088 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) | ||
2089 | && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) | ||
2090 | return 0; | 1880 | return 0; |
2091 | return autoremove_wake_function(wait, mode, sync, arg); | 1881 | return autoremove_wake_function(wait, mode, sync, arg); |
2092 | } | 1882 | } |
@@ -2228,26 +2018,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | |||
2228 | unsigned long *flags) | 2018 | unsigned long *flags) |
2229 | { | 2019 | { |
2230 | struct mem_cgroup *memcg; | 2020 | struct mem_cgroup *memcg; |
2231 | struct page_cgroup *pc; | ||
2232 | 2021 | ||
2233 | rcu_read_lock(); | 2022 | rcu_read_lock(); |
2234 | 2023 | ||
2235 | if (mem_cgroup_disabled()) | 2024 | if (mem_cgroup_disabled()) |
2236 | return NULL; | 2025 | return NULL; |
2237 | |||
2238 | pc = lookup_page_cgroup(page); | ||
2239 | again: | 2026 | again: |
2240 | memcg = pc->mem_cgroup; | 2027 | memcg = page->mem_cgroup; |
2241 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 2028 | if (unlikely(!memcg)) |
2242 | return NULL; | 2029 | return NULL; |
2243 | 2030 | ||
2244 | *locked = false; | 2031 | *locked = false; |
2245 | if (atomic_read(&memcg->moving_account) <= 0) | 2032 | if (atomic_read(&memcg->moving_account) <= 0) |
2246 | return memcg; | 2033 | return memcg; |
2247 | 2034 | ||
2248 | move_lock_mem_cgroup(memcg, flags); | 2035 | spin_lock_irqsave(&memcg->move_lock, *flags); |
2249 | if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { | 2036 | if (memcg != page->mem_cgroup) { |
2250 | move_unlock_mem_cgroup(memcg, flags); | 2037 | spin_unlock_irqrestore(&memcg->move_lock, *flags); |
2251 | goto again; | 2038 | goto again; |
2252 | } | 2039 | } |
2253 | *locked = true; | 2040 | *locked = true; |
@@ -2261,11 +2048,11 @@ again: | |||
2261 | * @locked: value received from mem_cgroup_begin_page_stat() | 2048 | * @locked: value received from mem_cgroup_begin_page_stat() |
2262 | * @flags: value received from mem_cgroup_begin_page_stat() | 2049 | * @flags: value received from mem_cgroup_begin_page_stat() |
2263 | */ | 2050 | */ |
2264 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, | 2051 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, |
2265 | unsigned long flags) | 2052 | unsigned long *flags) |
2266 | { | 2053 | { |
2267 | if (memcg && locked) | 2054 | if (memcg && *locked) |
2268 | move_unlock_mem_cgroup(memcg, &flags); | 2055 | spin_unlock_irqrestore(&memcg->move_lock, *flags); |
2269 | 2056 | ||
2270 | rcu_read_unlock(); | 2057 | rcu_read_unlock(); |
2271 | } | 2058 | } |
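
mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat() now take the move_lock directly and hand the saved flags back through pointers. A simplified standalone model of the protocol they implement follows: proceed locklessly unless a charge move may be in flight, otherwise take the per-group lock and re-check that the page's group did not change. The types, the pthread mutex, and the atomic owner pointer are stand-ins; the kernel version also relies on RCU and disables interrupts.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct group {
	pthread_mutex_t move_lock;
	atomic_int moving_account;	/* > 0 while charges are being moved */
};

struct pg {
	_Atomic(struct group *) owner;	/* stands in for page->mem_cgroup */
};

static struct group *begin_page_stat(struct pg *page, bool *locked)
{
	struct group *g;

again:
	g = atomic_load(&page->owner);
	if (!g)
		return NULL;

	*locked = false;
	if (atomic_load(&g->moving_account) <= 0)
		return g;			/* fast path: no move in flight */

	pthread_mutex_lock(&g->move_lock);
	if (g != atomic_load(&page->owner)) {
		pthread_mutex_unlock(&g->move_lock);
		goto again;			/* page moved under us, retry */
	}
	*locked = true;
	return g;
}

static void end_page_stat(struct group *g, bool *locked)
{
	if (g && *locked)
		pthread_mutex_unlock(&g->move_lock);
}

int main(void)
{
	struct group g = { .move_lock = PTHREAD_MUTEX_INITIALIZER };
	struct pg page = { .owner = &g };
	bool locked;

	struct group *memcg = begin_page_stat(&page, &locked);
	/* ... a statistics update would go here ... */
	end_page_stat(memcg, &locked);
	return 0;
}
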
@@ -2316,33 +2103,32 @@ static DEFINE_MUTEX(percpu_charge_mutex); | |||
2316 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | 2103 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2317 | { | 2104 | { |
2318 | struct memcg_stock_pcp *stock; | 2105 | struct memcg_stock_pcp *stock; |
2319 | bool ret = true; | 2106 | bool ret = false; |
2320 | 2107 | ||
2321 | if (nr_pages > CHARGE_BATCH) | 2108 | if (nr_pages > CHARGE_BATCH) |
2322 | return false; | 2109 | return ret; |
2323 | 2110 | ||
2324 | stock = &get_cpu_var(memcg_stock); | 2111 | stock = &get_cpu_var(memcg_stock); |
2325 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) | 2112 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) { |
2326 | stock->nr_pages -= nr_pages; | 2113 | stock->nr_pages -= nr_pages; |
2327 | else /* need to call res_counter_charge */ | 2114 | ret = true; |
2328 | ret = false; | 2115 | } |
2329 | put_cpu_var(memcg_stock); | 2116 | put_cpu_var(memcg_stock); |
2330 | return ret; | 2117 | return ret; |
2331 | } | 2118 | } |
2332 | 2119 | ||
2333 | /* | 2120 | /* |
2334 | * Returns stocks cached in percpu to res_counter and reset cached information. | 2121 | * Returns stocks cached in percpu and reset cached information. |
2335 | */ | 2122 | */ |
2336 | static void drain_stock(struct memcg_stock_pcp *stock) | 2123 | static void drain_stock(struct memcg_stock_pcp *stock) |
2337 | { | 2124 | { |
2338 | struct mem_cgroup *old = stock->cached; | 2125 | struct mem_cgroup *old = stock->cached; |
2339 | 2126 | ||
2340 | if (stock->nr_pages) { | 2127 | if (stock->nr_pages) { |
2341 | unsigned long bytes = stock->nr_pages * PAGE_SIZE; | 2128 | page_counter_uncharge(&old->memory, stock->nr_pages); |
2342 | |||
2343 | res_counter_uncharge(&old->res, bytes); | ||
2344 | if (do_swap_account) | 2129 | if (do_swap_account) |
2345 | res_counter_uncharge(&old->memsw, bytes); | 2130 | page_counter_uncharge(&old->memsw, stock->nr_pages); |
2131 | css_put_many(&old->css, stock->nr_pages); | ||
2346 | stock->nr_pages = 0; | 2132 | stock->nr_pages = 0; |
2347 | } | 2133 | } |
2348 | stock->cached = NULL; | 2134 | stock->cached = NULL; |
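
consume_stock()/drain_stock()/refill_stock() keep a per-CPU stash of pre-charged pages so small charges avoid touching the shared counters. A standalone single-threaded model of that caching scheme, with made-up numbers:

#include <stdbool.h>
#include <stdio.h>

struct group { unsigned long usage; };	/* pages charged to the group */

struct stock {
	struct group *cached;
	unsigned long nr_pages;		/* pre-charged pages held locally */
};

static bool consume_stock(struct stock *s, struct group *g, unsigned long nr)
{
	if (g == s->cached && s->nr_pages >= nr) {
		s->nr_pages -= nr;
		return true;		/* served locally, shared counter untouched */
	}
	return false;
}

static void drain_stock(struct stock *s)
{
	if (s->nr_pages) {
		s->cached->usage -= s->nr_pages;	/* give pages back */
		s->nr_pages = 0;
	}
	s->cached = NULL;
}

static void refill_stock(struct stock *s, struct group *g, unsigned long nr)
{
	if (s->cached != g) {
		drain_stock(s);
		s->cached = g;
	}
	s->nr_pages += nr;
}

int main(void)
{
	struct group g = { .usage = 64 };	/* 64 pages already charged */
	struct stock s = { 0 };

	refill_stock(&s, &g, 32);		/* keep the overcharge locally */
	printf("%d\n", consume_stock(&s, &g, 8));	/* 1: served from stock */
	drain_stock(&s);			/* returns the remaining 24 pages */
	printf("%lu\n", g.usage);		/* 40 */
	return 0;
}
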
@@ -2371,7 +2157,7 @@ static void __init memcg_stock_init(void) | |||
2371 | } | 2157 | } |
2372 | 2158 | ||
2373 | /* | 2159 | /* |
2374 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 2160 | * Cache charges(val) to local per_cpu area. |
2375 | * This will be consumed by consume_stock() function, later. | 2161 | * This will be consumed by consume_stock() function, later. |
2376 | */ | 2162 | */ |
2377 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | 2163 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
@@ -2388,13 +2174,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
2388 | 2174 | ||
2389 | /* | 2175 | /* |
2390 | * Drains all per-CPU charge caches for given root_memcg resp. subtree | 2176 | * Drains all per-CPU charge caches for given root_memcg resp. subtree |
2391 | * of the hierarchy under it. sync flag says whether we should block | 2177 | * of the hierarchy under it. |
2392 | * until the work is done. | ||
2393 | */ | 2178 | */ |
2394 | static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | 2179 | static void drain_all_stock(struct mem_cgroup *root_memcg) |
2395 | { | 2180 | { |
2396 | int cpu, curcpu; | 2181 | int cpu, curcpu; |
2397 | 2182 | ||
2183 | /* If someone's already draining, avoid running more workers. */ | ||
2184 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2185 | return; | ||
2398 | /* Notify other cpus that system-wide "drain" is running */ | 2186 | /* Notify other cpus that system-wide "drain" is running */ |
2399 | get_online_cpus(); | 2187 | get_online_cpus(); |
2400 | curcpu = get_cpu(); | 2188 | curcpu = get_cpu(); |
@@ -2405,7 +2193,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | |||
2405 | memcg = stock->cached; | 2193 | memcg = stock->cached; |
2406 | if (!memcg || !stock->nr_pages) | 2194 | if (!memcg || !stock->nr_pages) |
2407 | continue; | 2195 | continue; |
2408 | if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) | 2196 | if (!mem_cgroup_is_descendant(memcg, root_memcg)) |
2409 | continue; | 2197 | continue; |
2410 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { | 2198 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { |
2411 | if (cpu == curcpu) | 2199 | if (cpu == curcpu) |
@@ -2415,42 +2203,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | |||
2415 | } | 2203 | } |
2416 | } | 2204 | } |
2417 | put_cpu(); | 2205 | put_cpu(); |
2418 | |||
2419 | if (!sync) | ||
2420 | goto out; | ||
2421 | |||
2422 | for_each_online_cpu(cpu) { | ||
2423 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
2424 | if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2425 | flush_work(&stock->work); | ||
2426 | } | ||
2427 | out: | ||
2428 | put_online_cpus(); | 2206 | put_online_cpus(); |
2429 | } | ||
2430 | |||
2431 | /* | ||
2432 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
2433 | * and just put a work per cpu for draining localy on each cpu. Caller can | ||
2434 | * expects some charges will be back to res_counter later but cannot wait for | ||
2435 | * it. | ||
2436 | */ | ||
2437 | static void drain_all_stock_async(struct mem_cgroup *root_memcg) | ||
2438 | { | ||
2439 | /* | ||
2440 | * If someone calls draining, avoid adding more kworker runs. | ||
2441 | */ | ||
2442 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2443 | return; | ||
2444 | drain_all_stock(root_memcg, false); | ||
2445 | mutex_unlock(&percpu_charge_mutex); | ||
2446 | } | ||
2447 | |||
2448 | /* This is a synchronous drain interface. */ | ||
2449 | static void drain_all_stock_sync(struct mem_cgroup *root_memcg) | ||
2450 | { | ||
2451 | /* called when force_empty is called */ | ||
2452 | mutex_lock(&percpu_charge_mutex); | ||
2453 | drain_all_stock(root_memcg, true); | ||
2454 | mutex_unlock(&percpu_charge_mutex); | 2207 | mutex_unlock(&percpu_charge_mutex); |
2455 | } | 2208 | } |
2456 | 2209 | ||
@@ -2506,9 +2259,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2506 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | 2259 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
2507 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2260 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2508 | struct mem_cgroup *mem_over_limit; | 2261 | struct mem_cgroup *mem_over_limit; |
2509 | struct res_counter *fail_res; | 2262 | struct page_counter *counter; |
2510 | unsigned long nr_reclaimed; | 2263 | unsigned long nr_reclaimed; |
2511 | unsigned long long size; | ||
2512 | bool may_swap = true; | 2264 | bool may_swap = true; |
2513 | bool drained = false; | 2265 | bool drained = false; |
2514 | int ret = 0; | 2266 | int ret = 0; |
@@ -2519,16 +2271,15 @@ retry: | |||
2519 | if (consume_stock(memcg, nr_pages)) | 2271 | if (consume_stock(memcg, nr_pages)) |
2520 | goto done; | 2272 | goto done; |
2521 | 2273 | ||
2522 | size = batch * PAGE_SIZE; | ||
2523 | if (!do_swap_account || | 2274 | if (!do_swap_account || |
2524 | !res_counter_charge(&memcg->memsw, size, &fail_res)) { | 2275 | !page_counter_try_charge(&memcg->memsw, batch, &counter)) { |
2525 | if (!res_counter_charge(&memcg->res, size, &fail_res)) | 2276 | if (!page_counter_try_charge(&memcg->memory, batch, &counter)) |
2526 | goto done_restock; | 2277 | goto done_restock; |
2527 | if (do_swap_account) | 2278 | if (do_swap_account) |
2528 | res_counter_uncharge(&memcg->memsw, size); | 2279 | page_counter_uncharge(&memcg->memsw, batch); |
2529 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 2280 | mem_over_limit = mem_cgroup_from_counter(counter, memory); |
2530 | } else { | 2281 | } else { |
2531 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 2282 | mem_over_limit = mem_cgroup_from_counter(counter, memsw); |
2532 | may_swap = false; | 2283 | may_swap = false; |
2533 | } | 2284 | } |
2534 | 2285 | ||
@@ -2561,7 +2312,7 @@ retry: | |||
2561 | goto retry; | 2312 | goto retry; |
2562 | 2313 | ||
2563 | if (!drained) { | 2314 | if (!drained) { |
2564 | drain_all_stock_async(mem_over_limit); | 2315 | drain_all_stock(mem_over_limit); |
2565 | drained = true; | 2316 | drained = true; |
2566 | goto retry; | 2317 | goto retry; |
2567 | } | 2318 | } |
@@ -2603,6 +2354,7 @@ bypass: | |||
2603 | return -EINTR; | 2354 | return -EINTR; |
2604 | 2355 | ||
2605 | done_restock: | 2356 | done_restock: |
2357 | css_get_many(&memcg->css, batch); | ||
2606 | if (batch > nr_pages) | 2358 | if (batch > nr_pages) |
2607 | refill_stock(memcg, batch - nr_pages); | 2359 | refill_stock(memcg, batch - nr_pages); |
2608 | done: | 2360 | done: |
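
In the rewritten try_charge(), memory+swap is charged first and plain memory second; if only the memsw counter fails, reclaim is told not to swap, since swapping would not reduce memory+swap usage. A standalone sketch of just that ordering (batching, reclaim retries and the OOM path are left out):

#include <stdbool.h>

struct counter { unsigned long count, limit; };

static bool counter_try_charge(struct counter *c, unsigned long nr,
			       struct counter **fail)
{
	if (c->count + nr > c->limit) {
		*fail = c;
		return false;
	}
	c->count += nr;
	return true;
}

static void counter_uncharge(struct counter *c, unsigned long nr)
{
	c->count -= nr;
}

/* Charge ordering as in the rewritten try_charge(): memory+swap first,
 * then memory; if only memsw fails, reclaim must not swap. */
static int toy_try_charge(struct counter *memory, struct counter *memsw,
			  bool do_swap_account, unsigned long nr,
			  bool *may_swap, struct counter **over_limit)
{
	struct counter *fail;

	*may_swap = true;
	if (do_swap_account && !counter_try_charge(memsw, nr, &fail)) {
		*over_limit = fail;
		*may_swap = false;	/* swapping would not reduce memsw */
		return -1;
	}
	if (!counter_try_charge(memory, nr, &fail)) {
		if (do_swap_account)
			counter_uncharge(memsw, nr);
		*over_limit = fail;
		return -1;		/* reclaim, swap still allowed */
	}
	return 0;
}

int main(void)
{
	struct counter memory = { .count = 0, .limit = 100 };
	struct counter memsw = { .count = 0, .limit = 120 };
	struct counter *over;
	bool may_swap;

	return toy_try_charge(&memory, &memsw, true, 32, &may_swap, &over);
}
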
@@ -2611,32 +2363,14 @@ done: | |||
2611 | 2363 | ||
2612 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | 2364 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) |
2613 | { | 2365 | { |
2614 | unsigned long bytes = nr_pages * PAGE_SIZE; | ||
2615 | |||
2616 | if (mem_cgroup_is_root(memcg)) | 2366 | if (mem_cgroup_is_root(memcg)) |
2617 | return; | 2367 | return; |
2618 | 2368 | ||
2619 | res_counter_uncharge(&memcg->res, bytes); | 2369 | page_counter_uncharge(&memcg->memory, nr_pages); |
2620 | if (do_swap_account) | 2370 | if (do_swap_account) |
2621 | res_counter_uncharge(&memcg->memsw, bytes); | 2371 | page_counter_uncharge(&memcg->memsw, nr_pages); |
2622 | } | ||
2623 | |||
2624 | /* | ||
2625 | * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. | ||
2626 | * This is useful when moving usage to parent cgroup. | ||
2627 | */ | ||
2628 | static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | ||
2629 | unsigned int nr_pages) | ||
2630 | { | ||
2631 | unsigned long bytes = nr_pages * PAGE_SIZE; | ||
2632 | |||
2633 | if (mem_cgroup_is_root(memcg)) | ||
2634 | return; | ||
2635 | 2372 | ||
2636 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); | 2373 | css_put_many(&memcg->css, nr_pages); |
2637 | if (do_swap_account) | ||
2638 | res_counter_uncharge_until(&memcg->memsw, | ||
2639 | memcg->memsw.parent, bytes); | ||
2640 | } | 2374 | } |
2641 | 2375 | ||
2642 | /* | 2376 | /* |
@@ -2665,17 +2399,15 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2665 | */ | 2399 | */ |
2666 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2400 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
2667 | { | 2401 | { |
2668 | struct mem_cgroup *memcg = NULL; | 2402 | struct mem_cgroup *memcg; |
2669 | struct page_cgroup *pc; | ||
2670 | unsigned short id; | 2403 | unsigned short id; |
2671 | swp_entry_t ent; | 2404 | swp_entry_t ent; |
2672 | 2405 | ||
2673 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 2406 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
2674 | 2407 | ||
2675 | pc = lookup_page_cgroup(page); | 2408 | memcg = page->mem_cgroup; |
2676 | if (PageCgroupUsed(pc)) { | 2409 | if (memcg) { |
2677 | memcg = pc->mem_cgroup; | 2410 | if (!css_tryget_online(&memcg->css)) |
2678 | if (memcg && !css_tryget_online(&memcg->css)) | ||
2679 | memcg = NULL; | 2411 | memcg = NULL; |
2680 | } else if (PageSwapCache(page)) { | 2412 | } else if (PageSwapCache(page)) { |
2681 | ent.val = page_private(page); | 2413 | ent.val = page_private(page); |
@@ -2723,14 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated) | |||
2723 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, | 2455 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, |
2724 | bool lrucare) | 2456 | bool lrucare) |
2725 | { | 2457 | { |
2726 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
2727 | int isolated; | 2458 | int isolated; |
2728 | 2459 | ||
2729 | VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); | 2460 | VM_BUG_ON_PAGE(page->mem_cgroup, page); |
2730 | /* | ||
2731 | * we don't need page_cgroup_lock about tail pages, becase they are not | ||
2732 | * accessed by any other context at this point. | ||
2733 | */ | ||
2734 | 2461 | ||
2735 | /* | 2462 | /* |
2736 | * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page | 2463 | * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page |
@@ -2741,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
2741 | 2468 | ||
2742 | /* | 2469 | /* |
2743 | * Nobody should be changing or seriously looking at | 2470 | * Nobody should be changing or seriously looking at |
2744 | * pc->mem_cgroup and pc->flags at this point: | 2471 | * page->mem_cgroup at this point: |
2745 | * | 2472 | * |
2746 | * - the page is uncharged | 2473 | * - the page is uncharged |
2747 | * | 2474 | * |
@@ -2753,15 +2480,12 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
2753 | * - a page cache insertion, a swapin fault, or a migration | 2480 | * - a page cache insertion, a swapin fault, or a migration |
2754 | * have the page locked | 2481 | * have the page locked |
2755 | */ | 2482 | */ |
2756 | pc->mem_cgroup = memcg; | 2483 | page->mem_cgroup = memcg; |
2757 | pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); | ||
2758 | 2484 | ||
2759 | if (lrucare) | 2485 | if (lrucare) |
2760 | unlock_page_lru(page, isolated); | 2486 | unlock_page_lru(page, isolated); |
2761 | } | 2487 | } |
2762 | 2488 | ||
2763 | static DEFINE_MUTEX(set_limit_mutex); | ||
2764 | |||
2765 | #ifdef CONFIG_MEMCG_KMEM | 2489 | #ifdef CONFIG_MEMCG_KMEM |
2766 | /* | 2490 | /* |
2767 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | 2491 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or |
@@ -2769,8 +2493,6 @@ static DEFINE_MUTEX(set_limit_mutex); | |||
2769 | */ | 2493 | */ |
2770 | static DEFINE_MUTEX(memcg_slab_mutex); | 2494 | static DEFINE_MUTEX(memcg_slab_mutex); |
2771 | 2495 | ||
2772 | static DEFINE_MUTEX(activate_kmem_mutex); | ||
2773 | |||
2774 | /* | 2496 | /* |
2775 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | 2497 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer |
2776 | * in the memcg_cache_params struct. | 2498 | * in the memcg_cache_params struct. |
@@ -2784,36 +2506,17 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | |||
2784 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); | 2506 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); |
2785 | } | 2507 | } |
2786 | 2508 | ||
2787 | #ifdef CONFIG_SLABINFO | 2509 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, |
2788 | static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | 2510 | unsigned long nr_pages) |
2789 | { | ||
2790 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | ||
2791 | struct memcg_cache_params *params; | ||
2792 | |||
2793 | if (!memcg_kmem_is_active(memcg)) | ||
2794 | return -EIO; | ||
2795 | |||
2796 | print_slabinfo_header(m); | ||
2797 | |||
2798 | mutex_lock(&memcg_slab_mutex); | ||
2799 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | ||
2800 | cache_show(memcg_params_to_cache(params), m); | ||
2801 | mutex_unlock(&memcg_slab_mutex); | ||
2802 | |||
2803 | return 0; | ||
2804 | } | ||
2805 | #endif | ||
2806 | |||
2807 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | ||
2808 | { | 2511 | { |
2809 | struct res_counter *fail_res; | 2512 | struct page_counter *counter; |
2810 | int ret = 0; | 2513 | int ret = 0; |
2811 | 2514 | ||
2812 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | 2515 | ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); |
2813 | if (ret) | 2516 | if (ret < 0) |
2814 | return ret; | 2517 | return ret; |
2815 | 2518 | ||
2816 | ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); | 2519 | ret = try_charge(memcg, gfp, nr_pages); |
2817 | if (ret == -EINTR) { | 2520 | if (ret == -EINTR) { |
2818 | /* | 2521 | /* |
2819 | * try_charge() chose to bypass to root due to OOM kill or | 2522 | * try_charge() chose to bypass to root due to OOM kill or |
@@ -2830,37 +2533,27 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | |||
2830 | * when the allocation triggers should have been already | 2533 | * when the allocation triggers should have been already |
2831 | * directed to the root cgroup in memcontrol.h | 2534 | * directed to the root cgroup in memcontrol.h |
2832 | */ | 2535 | */ |
2833 | res_counter_charge_nofail(&memcg->res, size, &fail_res); | 2536 | page_counter_charge(&memcg->memory, nr_pages); |
2834 | if (do_swap_account) | 2537 | if (do_swap_account) |
2835 | res_counter_charge_nofail(&memcg->memsw, size, | 2538 | page_counter_charge(&memcg->memsw, nr_pages); |
2836 | &fail_res); | 2539 | css_get_many(&memcg->css, nr_pages); |
2837 | ret = 0; | 2540 | ret = 0; |
2838 | } else if (ret) | 2541 | } else if (ret) |
2839 | res_counter_uncharge(&memcg->kmem, size); | 2542 | page_counter_uncharge(&memcg->kmem, nr_pages); |
2840 | 2543 | ||
2841 | return ret; | 2544 | return ret; |
2842 | } | 2545 | } |
2843 | 2546 | ||
2844 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) | 2547 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, |
2548 | unsigned long nr_pages) | ||
2845 | { | 2549 | { |
2846 | res_counter_uncharge(&memcg->res, size); | 2550 | page_counter_uncharge(&memcg->memory, nr_pages); |
2847 | if (do_swap_account) | 2551 | if (do_swap_account) |
2848 | res_counter_uncharge(&memcg->memsw, size); | 2552 | page_counter_uncharge(&memcg->memsw, nr_pages); |
2849 | 2553 | ||
2850 | /* Not down to 0 */ | 2554 | page_counter_uncharge(&memcg->kmem, nr_pages); |
2851 | if (res_counter_uncharge(&memcg->kmem, size)) | ||
2852 | return; | ||
2853 | 2555 | ||
2854 | /* | 2556 | css_put_many(&memcg->css, nr_pages); |
2855 | * Releases a reference taken in kmem_cgroup_css_offline in case | ||
2856 | * this last uncharge is racing with the offlining code or it is | ||
2857 | * outliving the memcg existence. | ||
2858 | * | ||
2859 | * The memory barrier imposed by test&clear is paired with the | ||
2860 | * explicit one in memcg_kmem_mark_dead(). | ||
2861 | */ | ||
2862 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
2863 | css_put(&memcg->css); | ||
2864 | } | 2557 | } |
2865 | 2558 | ||
2866 | /* | 2559 | /* |
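
memcg_charge_kmem() charges the kmem counter first and the regular memory counters second, rolling the kmem charge back if the latter fails. A standalone sketch of that ordering; the OOM-bypass path that force-charges the memory counters on -EINTR is omitted:

#include <stdbool.h>

struct counter { unsigned long count, limit; };

static bool counter_try_charge(struct counter *c, unsigned long nr)
{
	if (c->count + nr > c->limit)
		return false;
	c->count += nr;
	return true;
}

static void counter_uncharge(struct counter *c, unsigned long nr)
{
	c->count -= nr;
}

/* Kernel-memory charge ordering: kmem counter first, then the regular
 * memory counters; roll back kmem if the memory charge fails. */
static int toy_charge_kmem(struct counter *kmem, struct counter *memory,
			   unsigned long nr_pages)
{
	if (!counter_try_charge(kmem, nr_pages))
		return -1;
	if (!counter_try_charge(memory, nr_pages)) {
		counter_uncharge(kmem, nr_pages);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct counter kmem = { .count = 0, .limit = 64 };
	struct counter memory = { .count = 0, .limit = 256 };

	return toy_charge_kmem(&kmem, &memory, 8);	/* both counters +8 */
}
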
@@ -3124,19 +2817,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
3124 | 2817 | ||
3125 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | 2818 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) |
3126 | { | 2819 | { |
2820 | unsigned int nr_pages = 1 << order; | ||
3127 | int res; | 2821 | int res; |
3128 | 2822 | ||
3129 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, | 2823 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); |
3130 | PAGE_SIZE << order); | ||
3131 | if (!res) | 2824 | if (!res) |
3132 | atomic_add(1 << order, &cachep->memcg_params->nr_pages); | 2825 | atomic_add(nr_pages, &cachep->memcg_params->nr_pages); |
3133 | return res; | 2826 | return res; |
3134 | } | 2827 | } |
3135 | 2828 | ||
3136 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | 2829 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) |
3137 | { | 2830 | { |
3138 | memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); | 2831 | unsigned int nr_pages = 1 << order; |
3139 | atomic_sub(1 << order, &cachep->memcg_params->nr_pages); | 2832 | |
2833 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | ||
2834 | atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); | ||
3140 | } | 2835 | } |
3141 | 2836 | ||
3142 | /* | 2837 | /* |
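The slab helpers now compute the page count once from the allocation order (nr_pages = 1 << order) and feed the same value to both the charge and the per-cache page counter. A minimal, hedged illustration of that order-to-pages bookkeeping outside the kernel, with made-up names:

#include <stdatomic.h>
#include <stdio.h>

/* Pages backing a slab of the given order: 2^order. */
static inline unsigned int order_to_pages(int order)
{
        return 1u << order;
}

static atomic_uint cache_nr_pages;     /* analogue of memcg_params->nr_pages */

static void charge_slab(int order)
{
        atomic_fetch_add(&cache_nr_pages, order_to_pages(order));
}

static void uncharge_slab(int order)
{
        atomic_fetch_sub(&cache_nr_pages, order_to_pages(order));
}

int main(void)
{
        charge_slab(3);                  /* an order-3 slab spans 8 pages */
        printf("%u pages accounted\n", atomic_load(&cache_nr_pages));
        uncharge_slab(3);
        return 0;
}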
@@ -3257,7 +2952,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3257 | return true; | 2952 | return true; |
3258 | } | 2953 | } |
3259 | 2954 | ||
3260 | ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); | 2955 | ret = memcg_charge_kmem(memcg, gfp, 1 << order); |
3261 | if (!ret) | 2956 | if (!ret) |
3262 | *_memcg = memcg; | 2957 | *_memcg = memcg; |
3263 | 2958 | ||
@@ -3268,46 +2963,27 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3268 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, | 2963 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, |
3269 | int order) | 2964 | int order) |
3270 | { | 2965 | { |
3271 | struct page_cgroup *pc; | ||
3272 | |||
3273 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | 2966 | VM_BUG_ON(mem_cgroup_is_root(memcg)); |
3274 | 2967 | ||
3275 | /* The page allocation failed. Revert */ | 2968 | /* The page allocation failed. Revert */ |
3276 | if (!page) { | 2969 | if (!page) { |
3277 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 2970 | memcg_uncharge_kmem(memcg, 1 << order); |
3278 | return; | 2971 | return; |
3279 | } | 2972 | } |
3280 | /* | 2973 | page->mem_cgroup = memcg; |
3281 | * The page is freshly allocated and not visible to any | ||
3282 | * outside callers yet. Set up pc non-atomically. | ||
3283 | */ | ||
3284 | pc = lookup_page_cgroup(page); | ||
3285 | pc->mem_cgroup = memcg; | ||
3286 | pc->flags = PCG_USED; | ||
3287 | } | 2974 | } |
3288 | 2975 | ||
3289 | void __memcg_kmem_uncharge_pages(struct page *page, int order) | 2976 | void __memcg_kmem_uncharge_pages(struct page *page, int order) |
3290 | { | 2977 | { |
3291 | struct mem_cgroup *memcg = NULL; | 2978 | struct mem_cgroup *memcg = page->mem_cgroup; |
3292 | struct page_cgroup *pc; | ||
3293 | |||
3294 | 2979 | ||
3295 | pc = lookup_page_cgroup(page); | ||
3296 | if (!PageCgroupUsed(pc)) | ||
3297 | return; | ||
3298 | |||
3299 | memcg = pc->mem_cgroup; | ||
3300 | pc->flags = 0; | ||
3301 | |||
3302 | /* | ||
3303 | * We trust that only if there is a memcg associated with the page, it | ||
3304 | * is a valid allocation | ||
3305 | */ | ||
3306 | if (!memcg) | 2980 | if (!memcg) |
3307 | return; | 2981 | return; |
3308 | 2982 | ||
3309 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); | 2983 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); |
3310 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 2984 | |
2985 | memcg_uncharge_kmem(memcg, 1 << order); | ||
2986 | page->mem_cgroup = NULL; | ||
3311 | } | 2987 | } |
3312 | #else | 2988 | #else |
3313 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) | 2989 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
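With page_cgroup gone, the commit and uncharge paths above simply store and read a mem_cgroup pointer in struct page; a NULL pointer now plays the role of the old !PageCgroupUsed() check. A self-contained sketch of that idea, assuming a toy struct page:

#include <stddef.h>
#include <stdio.h>

struct mem_cgroup { const char *name; };
struct page { struct mem_cgroup *mem_cgroup; };    /* the new field */

/* Commit: a freshly allocated, not yet visible page just takes the pointer. */
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
        page->mem_cgroup = memcg;
}

/* Uncharge: the pointer itself says whether the page was ever accounted. */
static void uncharge_page(struct page *page)
{
        if (!page->mem_cgroup)
                return;                  /* never charged, nothing to undo */
        printf("uncharging from %s\n", page->mem_cgroup->name);
        page->mem_cgroup = NULL;
}

int main(void)
{
        struct mem_cgroup grp = { "demo" };
        struct page pg = { NULL };

        commit_charge(&pg, &grp);
        uncharge_page(&pg);
        return 0;
}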
@@ -3325,21 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) | |||
3325 | */ | 3001 | */ |
3326 | void mem_cgroup_split_huge_fixup(struct page *head) | 3002 | void mem_cgroup_split_huge_fixup(struct page *head) |
3327 | { | 3003 | { |
3328 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | ||
3329 | struct page_cgroup *pc; | ||
3330 | struct mem_cgroup *memcg; | ||
3331 | int i; | 3004 | int i; |
3332 | 3005 | ||
3333 | if (mem_cgroup_disabled()) | 3006 | if (mem_cgroup_disabled()) |
3334 | return; | 3007 | return; |
3335 | 3008 | ||
3336 | memcg = head_pc->mem_cgroup; | 3009 | for (i = 1; i < HPAGE_PMD_NR; i++) |
3337 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 3010 | head[i].mem_cgroup = head->mem_cgroup; |
3338 | pc = head_pc + i; | 3011 | |
3339 | pc->mem_cgroup = memcg; | 3012 | __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], |
3340 | pc->flags = head_pc->flags; | ||
3341 | } | ||
3342 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | ||
3343 | HPAGE_PMD_NR); | 3013 | HPAGE_PMD_NR); |
3344 | } | 3014 | } |
3345 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3015 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
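Splitting a huge page now only has to copy the head page's mem_cgroup pointer into every tail page. A hedged, userspace-sized model of that fixup; HPAGE_NR stands in for HPAGE_PMD_NR and is assumed to be 512, as for 2MB huge pages with a 4K base page:

#include <stdio.h>

#define HPAGE_NR 512

struct mem_cgroup { int id; };
struct page { struct mem_cgroup *mem_cgroup; };

/* After splitting, every tail page inherits the head page's owning group. */
static void split_huge_fixup(struct page head[HPAGE_NR])
{
        for (int i = 1; i < HPAGE_NR; i++)
                head[i].mem_cgroup = head[0].mem_cgroup;
}

int main(void)
{
        static struct page thp[HPAGE_NR];
        struct mem_cgroup grp = { 1 };

        thp[0].mem_cgroup = &grp;
        split_huge_fixup(thp);
        printf("tail 511 owner id: %d\n", thp[511].mem_cgroup->id);
        return 0;
}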
@@ -3348,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3348 | * mem_cgroup_move_account - move account of the page | 3018 | * mem_cgroup_move_account - move account of the page |
3349 | * @page: the page | 3019 | * @page: the page |
3350 | * @nr_pages: number of regular pages (>1 for huge pages) | 3020 | * @nr_pages: number of regular pages (>1 for huge pages) |
3351 | * @pc: page_cgroup of the page. | ||
3352 | * @from: mem_cgroup which the page is moved from. | 3021 | * @from: mem_cgroup which the page is moved from. |
3353 | * @to: mem_cgroup which the page is moved to. @from != @to. | 3022 | * @to: mem_cgroup which the page is moved to. @from != @to. |
3354 | * | 3023 | * |
@@ -3361,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3361 | */ | 3030 | */ |
3362 | static int mem_cgroup_move_account(struct page *page, | 3031 | static int mem_cgroup_move_account(struct page *page, |
3363 | unsigned int nr_pages, | 3032 | unsigned int nr_pages, |
3364 | struct page_cgroup *pc, | ||
3365 | struct mem_cgroup *from, | 3033 | struct mem_cgroup *from, |
3366 | struct mem_cgroup *to) | 3034 | struct mem_cgroup *to) |
3367 | { | 3035 | { |
@@ -3381,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
3381 | goto out; | 3049 | goto out; |
3382 | 3050 | ||
3383 | /* | 3051 | /* |
3384 | * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup | 3052 | * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup |
3385 | * of its source page while we change it: page migration takes | 3053 | * of its source page while we change it: page migration takes |
3386 | * both pages off the LRU, but page cache replacement doesn't. | 3054 | * both pages off the LRU, but page cache replacement doesn't. |
3387 | */ | 3055 | */ |
@@ -3389,10 +3057,10 @@ static int mem_cgroup_move_account(struct page *page, | |||
3389 | goto out; | 3057 | goto out; |
3390 | 3058 | ||
3391 | ret = -EINVAL; | 3059 | ret = -EINVAL; |
3392 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | 3060 | if (page->mem_cgroup != from) |
3393 | goto out_unlock; | 3061 | goto out_unlock; |
3394 | 3062 | ||
3395 | move_lock_mem_cgroup(from, &flags); | 3063 | spin_lock_irqsave(&from->move_lock, flags); |
3396 | 3064 | ||
3397 | if (!PageAnon(page) && page_mapped(page)) { | 3065 | if (!PageAnon(page) && page_mapped(page)) { |
3398 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | 3066 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], |
@@ -3409,14 +3077,15 @@ static int mem_cgroup_move_account(struct page *page, | |||
3409 | } | 3077 | } |
3410 | 3078 | ||
3411 | /* | 3079 | /* |
3412 | * It is safe to change pc->mem_cgroup here because the page | 3080 | * It is safe to change page->mem_cgroup here because the page |
3413 | * is referenced, charged, and isolated - we can't race with | 3081 | * is referenced, charged, and isolated - we can't race with |
3414 | * uncharging, charging, migration, or LRU putback. | 3082 | * uncharging, charging, migration, or LRU putback. |
3415 | */ | 3083 | */ |
3416 | 3084 | ||
3417 | /* caller should have done css_get */ | 3085 | /* caller should have done css_get */ |
3418 | pc->mem_cgroup = to; | 3086 | page->mem_cgroup = to; |
3419 | move_unlock_mem_cgroup(from, &flags); | 3087 | spin_unlock_irqrestore(&from->move_lock, flags); |
3088 | |||
3420 | ret = 0; | 3089 | ret = 0; |
3421 | 3090 | ||
3422 | local_irq_disable(); | 3091 | local_irq_disable(); |
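mem_cgroup_move_account() now rechecks page->mem_cgroup directly and serializes against concurrent stat updaters with the source group's move_lock spinlock instead of the old move_lock_mem_cgroup() helpers. The sketch below mimics that sequence with a pthread mutex; it is a simplified model (no page lock, no per-cpu statistics, no IRQ handling), not the kernel locking scheme itself.

#include <pthread.h>
#include <stdio.h>

struct mem_cgroup {
        pthread_mutex_t move_lock;       /* stand-in for the irq-safe move_lock */
        long file_mapped;
};
struct page { struct mem_cgroup *mem_cgroup; int mapped; };

/* Move one (already isolated) page's accounting from one group to another. */
static int move_account(struct page *page, struct mem_cgroup *from,
                        struct mem_cgroup *to)
{
        if (page->mem_cgroup != from)
                return -1;               /* raced with uncharge or another move */

        pthread_mutex_lock(&from->move_lock);
        if (page->mapped) {              /* per-group statistics follow the page */
                from->file_mapped--;
                to->file_mapped++;
        }
        page->mem_cgroup = to;           /* the actual hand-over */
        pthread_mutex_unlock(&from->move_lock);
        return 0;
}

int main(void)
{
        struct mem_cgroup a = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct mem_cgroup b = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct page pg = { &a, 1 };

        printf("moved: %d\n", move_account(&pg, &a, &b) == 0);
        return 0;
}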
@@ -3431,72 +3100,6 @@ out: | |||
3431 | return ret; | 3100 | return ret; |
3432 | } | 3101 | } |
3433 | 3102 | ||
3434 | /** | ||
3435 | * mem_cgroup_move_parent - moves page to the parent group | ||
3436 | * @page: the page to move | ||
3437 | * @pc: page_cgroup of the page | ||
3438 | * @child: page's cgroup | ||
3439 | * | ||
3440 | * move charges to its parent or the root cgroup if the group has no | ||
3441 | * parent (aka use_hierarchy==0). | ||
3442 | * Although this might fail (get_page_unless_zero, isolate_lru_page or | ||
3443 | * mem_cgroup_move_account fails) the failure is always temporary and | ||
3444 | * it signals a race with a page removal/uncharge or migration. In the | ||
3445 | * first case the page is on the way out and it will vanish from the LRU | ||
3446 | * on the next attempt and the call should be retried later. | ||
3447 | * Isolation from the LRU fails only if page has been isolated from | ||
3448 | * the LRU since we looked at it and that usually means either global | ||
3449 | * reclaim or migration going on. The page will either get back to the | ||
3450 | * LRU or vanish. | ||
3451 | * Finaly mem_cgroup_move_account fails only if the page got uncharged | ||
3452 | * (!PageCgroupUsed) or moved to a different group. The page will | ||
3453 | * disappear in the next attempt. | ||
3454 | */ | ||
3455 | static int mem_cgroup_move_parent(struct page *page, | ||
3456 | struct page_cgroup *pc, | ||
3457 | struct mem_cgroup *child) | ||
3458 | { | ||
3459 | struct mem_cgroup *parent; | ||
3460 | unsigned int nr_pages; | ||
3461 | unsigned long uninitialized_var(flags); | ||
3462 | int ret; | ||
3463 | |||
3464 | VM_BUG_ON(mem_cgroup_is_root(child)); | ||
3465 | |||
3466 | ret = -EBUSY; | ||
3467 | if (!get_page_unless_zero(page)) | ||
3468 | goto out; | ||
3469 | if (isolate_lru_page(page)) | ||
3470 | goto put; | ||
3471 | |||
3472 | nr_pages = hpage_nr_pages(page); | ||
3473 | |||
3474 | parent = parent_mem_cgroup(child); | ||
3475 | /* | ||
3476 | * If no parent, move charges to root cgroup. | ||
3477 | */ | ||
3478 | if (!parent) | ||
3479 | parent = root_mem_cgroup; | ||
3480 | |||
3481 | if (nr_pages > 1) { | ||
3482 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
3483 | flags = compound_lock_irqsave(page); | ||
3484 | } | ||
3485 | |||
3486 | ret = mem_cgroup_move_account(page, nr_pages, | ||
3487 | pc, child, parent); | ||
3488 | if (!ret) | ||
3489 | __mem_cgroup_cancel_local_charge(child, nr_pages); | ||
3490 | |||
3491 | if (nr_pages > 1) | ||
3492 | compound_unlock_irqrestore(page, flags); | ||
3493 | putback_lru_page(page); | ||
3494 | put: | ||
3495 | put_page(page); | ||
3496 | out: | ||
3497 | return ret; | ||
3498 | } | ||
3499 | |||
3500 | #ifdef CONFIG_MEMCG_SWAP | 3103 | #ifdef CONFIG_MEMCG_SWAP |
3501 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | 3104 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, |
3502 | bool charge) | 3105 | bool charge) |
@@ -3516,7 +3119,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
3516 | * | 3119 | * |
3517 | * Returns 0 on success, -EINVAL on failure. | 3120 | * Returns 0 on success, -EINVAL on failure. |
3518 | * | 3121 | * |
3519 | * The caller must have charged to @to, IOW, called res_counter_charge() about | 3122 | * The caller must have charged to @to, IOW, called page_counter_charge() about |
3520 | * both res and memsw, and called css_get(). | 3123 | * both res and memsw, and called css_get(). |
3521 | */ | 3124 | */ |
3522 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | 3125 | static int mem_cgroup_move_swap_account(swp_entry_t entry, |
@@ -3532,7 +3135,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3532 | mem_cgroup_swap_statistics(to, true); | 3135 | mem_cgroup_swap_statistics(to, true); |
3533 | /* | 3136 | /* |
3534 | * This function is only called from task migration context now. | 3137 | * This function is only called from task migration context now. |
3535 | * It postpones res_counter and refcount handling till the end | 3138 | * It postpones page_counter and refcount handling till the end |
3536 | * of task migration(mem_cgroup_clear_mc()) for performance | 3139 | * of task migration(mem_cgroup_clear_mc()) for performance |
3537 | * improvement. But we cannot postpone css_get(to) because if | 3140 | * improvement. But we cannot postpone css_get(to) because if |
3538 | * the process that has been moved to @to does swap-in, the | 3141 | * the process that has been moved to @to does swap-in, the |
@@ -3554,96 +3157,57 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3554 | } | 3157 | } |
3555 | #endif | 3158 | #endif |
3556 | 3159 | ||
3557 | #ifdef CONFIG_DEBUG_VM | 3160 | static DEFINE_MUTEX(memcg_limit_mutex); |
3558 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | ||
3559 | { | ||
3560 | struct page_cgroup *pc; | ||
3561 | |||
3562 | pc = lookup_page_cgroup(page); | ||
3563 | /* | ||
3564 | * Can be NULL while feeding pages into the page allocator for | ||
3565 | * the first time, i.e. during boot or memory hotplug; | ||
3566 | * or when mem_cgroup_disabled(). | ||
3567 | */ | ||
3568 | if (likely(pc) && PageCgroupUsed(pc)) | ||
3569 | return pc; | ||
3570 | return NULL; | ||
3571 | } | ||
3572 | |||
3573 | bool mem_cgroup_bad_page_check(struct page *page) | ||
3574 | { | ||
3575 | if (mem_cgroup_disabled()) | ||
3576 | return false; | ||
3577 | |||
3578 | return lookup_page_cgroup_used(page) != NULL; | ||
3579 | } | ||
3580 | |||
3581 | void mem_cgroup_print_bad_page(struct page *page) | ||
3582 | { | ||
3583 | struct page_cgroup *pc; | ||
3584 | |||
3585 | pc = lookup_page_cgroup_used(page); | ||
3586 | if (pc) { | ||
3587 | pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", | ||
3588 | pc, pc->flags, pc->mem_cgroup); | ||
3589 | } | ||
3590 | } | ||
3591 | #endif | ||
3592 | 3161 | ||
3593 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 3162 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
3594 | unsigned long long val) | 3163 | unsigned long limit) |
3595 | { | 3164 | { |
3165 | unsigned long curusage; | ||
3166 | unsigned long oldusage; | ||
3167 | bool enlarge = false; | ||
3596 | int retry_count; | 3168 | int retry_count; |
3597 | int ret = 0; | 3169 | int ret; |
3598 | int children = mem_cgroup_count_children(memcg); | ||
3599 | u64 curusage, oldusage; | ||
3600 | int enlarge; | ||
3601 | 3170 | ||
3602 | /* | 3171 | /* |
3603 | * For keeping hierarchical_reclaim simple, how long we should retry | 3172 | * For keeping hierarchical_reclaim simple, how long we should retry |
3604 | * is depends on callers. We set our retry-count to be function | 3173 | * is depends on callers. We set our retry-count to be function |
3605 | * of # of children which we should visit in this loop. | 3174 | * of # of children which we should visit in this loop. |
3606 | */ | 3175 | */ |
3607 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; | 3176 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * |
3177 | mem_cgroup_count_children(memcg); | ||
3608 | 3178 | ||
3609 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3179 | oldusage = page_counter_read(&memcg->memory); |
3610 | 3180 | ||
3611 | enlarge = 0; | 3181 | do { |
3612 | while (retry_count) { | ||
3613 | if (signal_pending(current)) { | 3182 | if (signal_pending(current)) { |
3614 | ret = -EINTR; | 3183 | ret = -EINTR; |
3615 | break; | 3184 | break; |
3616 | } | 3185 | } |
3617 | /* | 3186 | |
3618 | * Rather than hide all in some function, I do this in | 3187 | mutex_lock(&memcg_limit_mutex); |
3619 | * open coded manner. You see what this really does. | 3188 | if (limit > memcg->memsw.limit) { |
3620 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. | 3189 | mutex_unlock(&memcg_limit_mutex); |
3621 | */ | ||
3622 | mutex_lock(&set_limit_mutex); | ||
3623 | if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { | ||
3624 | ret = -EINVAL; | 3190 | ret = -EINVAL; |
3625 | mutex_unlock(&set_limit_mutex); | ||
3626 | break; | 3191 | break; |
3627 | } | 3192 | } |
3628 | 3193 | if (limit > memcg->memory.limit) | |
3629 | if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) | 3194 | enlarge = true; |
3630 | enlarge = 1; | 3195 | ret = page_counter_limit(&memcg->memory, limit); |
3631 | 3196 | mutex_unlock(&memcg_limit_mutex); | |
3632 | ret = res_counter_set_limit(&memcg->res, val); | ||
3633 | mutex_unlock(&set_limit_mutex); | ||
3634 | 3197 | ||
3635 | if (!ret) | 3198 | if (!ret) |
3636 | break; | 3199 | break; |
3637 | 3200 | ||
3638 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); | 3201 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); |
3639 | 3202 | ||
3640 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3203 | curusage = page_counter_read(&memcg->memory); |
3641 | /* Usage is reduced ? */ | 3204 | /* Usage is reduced ? */ |
3642 | if (curusage >= oldusage) | 3205 | if (curusage >= oldusage) |
3643 | retry_count--; | 3206 | retry_count--; |
3644 | else | 3207 | else |
3645 | oldusage = curusage; | 3208 | oldusage = curusage; |
3646 | } | 3209 | } while (retry_count); |
3210 | |||
3647 | if (!ret && enlarge) | 3211 | if (!ret && enlarge) |
3648 | memcg_oom_recover(memcg); | 3212 | memcg_oom_recover(memcg); |
3649 | 3213 | ||
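The limit-resize loop above keeps its old shape: try to apply the new limit, and if the counter refuses because current usage is still above it, reclaim and retry a bounded number of times, burning a retry only when no progress was made. A compact model of that loop, with an invented reclaim_some() standing in for try_to_free_mem_cgroup_pages():

#include <stdio.h>

struct counter { long usage, limit; };

/* page_counter_limit() analogue: refuse to shrink below current usage. */
static int set_limit(struct counter *c, long limit)
{
        if (c->usage > limit)
                return -1;
        c->limit = limit;
        return 0;
}

/* Pretend reclaim that frees a few pages per attempt. */
static void reclaim_some(struct counter *c)
{
        if (c->usage > 0)
                c->usage -= 8;
}

static int resize_limit(struct counter *c, long limit, int retries)
{
        long old = c->usage;

        do {
                if (!set_limit(c, limit))
                        return 0;
                reclaim_some(c);
                if (c->usage >= old)     /* no forward progress: burn a retry */
                        retries--;
                else
                        old = c->usage;
        } while (retries);
        return -1;
}

int main(void)
{
        struct counter mem = { 64, 1024 };

        printf("resize to 32: %d (usage now %ld)\n",
               resize_limit(&mem, 32, 5), mem.usage);
        return 0;
}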
@@ -3651,52 +3215,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3651 | } | 3215 | } |
3652 | 3216 | ||
3653 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | 3217 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, |
3654 | unsigned long long val) | 3218 | unsigned long limit) |
3655 | { | 3219 | { |
3220 | unsigned long curusage; | ||
3221 | unsigned long oldusage; | ||
3222 | bool enlarge = false; | ||
3656 | int retry_count; | 3223 | int retry_count; |
3657 | u64 oldusage, curusage; | 3224 | int ret; |
3658 | int children = mem_cgroup_count_children(memcg); | ||
3659 | int ret = -EBUSY; | ||
3660 | int enlarge = 0; | ||
3661 | 3225 | ||
3662 | /* see mem_cgroup_resize_res_limit */ | 3226 | /* see mem_cgroup_resize_res_limit */ |
3663 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 3227 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * |
3664 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3228 | mem_cgroup_count_children(memcg); |
3665 | while (retry_count) { | 3229 | |
3230 | oldusage = page_counter_read(&memcg->memsw); | ||
3231 | |||
3232 | do { | ||
3666 | if (signal_pending(current)) { | 3233 | if (signal_pending(current)) { |
3667 | ret = -EINTR; | 3234 | ret = -EINTR; |
3668 | break; | 3235 | break; |
3669 | } | 3236 | } |
3670 | /* | 3237 | |
3671 | * Rather than hide all in some function, I do this in | 3238 | mutex_lock(&memcg_limit_mutex); |
3672 | * open coded manner. You see what this really does. | 3239 | if (limit < memcg->memory.limit) { |
3673 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. | 3240 | mutex_unlock(&memcg_limit_mutex); |
3674 | */ | ||
3675 | mutex_lock(&set_limit_mutex); | ||
3676 | if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { | ||
3677 | ret = -EINVAL; | 3241 | ret = -EINVAL; |
3678 | mutex_unlock(&set_limit_mutex); | ||
3679 | break; | 3242 | break; |
3680 | } | 3243 | } |
3681 | if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) | 3244 | if (limit > memcg->memsw.limit) |
3682 | enlarge = 1; | 3245 | enlarge = true; |
3683 | ret = res_counter_set_limit(&memcg->memsw, val); | 3246 | ret = page_counter_limit(&memcg->memsw, limit); |
3684 | mutex_unlock(&set_limit_mutex); | 3247 | mutex_unlock(&memcg_limit_mutex); |
3685 | 3248 | ||
3686 | if (!ret) | 3249 | if (!ret) |
3687 | break; | 3250 | break; |
3688 | 3251 | ||
3689 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); | 3252 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); |
3690 | 3253 | ||
3691 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3254 | curusage = page_counter_read(&memcg->memsw); |
3692 | /* Usage is reduced ? */ | 3255 | /* Usage is reduced ? */ |
3693 | if (curusage >= oldusage) | 3256 | if (curusage >= oldusage) |
3694 | retry_count--; | 3257 | retry_count--; |
3695 | else | 3258 | else |
3696 | oldusage = curusage; | 3259 | oldusage = curusage; |
3697 | } | 3260 | } while (retry_count); |
3261 | |||
3698 | if (!ret && enlarge) | 3262 | if (!ret && enlarge) |
3699 | memcg_oom_recover(memcg); | 3263 | memcg_oom_recover(memcg); |
3264 | |||
3700 | return ret; | 3265 | return ret; |
3701 | } | 3266 | } |
3702 | 3267 | ||
@@ -3709,7 +3274,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3709 | unsigned long reclaimed; | 3274 | unsigned long reclaimed; |
3710 | int loop = 0; | 3275 | int loop = 0; |
3711 | struct mem_cgroup_tree_per_zone *mctz; | 3276 | struct mem_cgroup_tree_per_zone *mctz; |
3712 | unsigned long long excess; | 3277 | unsigned long excess; |
3713 | unsigned long nr_scanned; | 3278 | unsigned long nr_scanned; |
3714 | 3279 | ||
3715 | if (order > 0) | 3280 | if (order > 0) |
@@ -3735,35 +3300,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3735 | nr_reclaimed += reclaimed; | 3300 | nr_reclaimed += reclaimed; |
3736 | *total_scanned += nr_scanned; | 3301 | *total_scanned += nr_scanned; |
3737 | spin_lock_irq(&mctz->lock); | 3302 | spin_lock_irq(&mctz->lock); |
3303 | __mem_cgroup_remove_exceeded(mz, mctz); | ||
3738 | 3304 | ||
3739 | /* | 3305 | /* |
3740 | * If we failed to reclaim anything from this memory cgroup | 3306 | * If we failed to reclaim anything from this memory cgroup |
3741 | * it is time to move on to the next cgroup | 3307 | * it is time to move on to the next cgroup |
3742 | */ | 3308 | */ |
3743 | next_mz = NULL; | 3309 | next_mz = NULL; |
3744 | if (!reclaimed) { | 3310 | if (!reclaimed) |
3745 | do { | 3311 | next_mz = __mem_cgroup_largest_soft_limit_node(mctz); |
3746 | /* | 3312 | |
3747 | * Loop until we find yet another one. | 3313 | excess = soft_limit_excess(mz->memcg); |
3748 | * | ||
3749 | * By the time we get the soft_limit lock | ||
3750 | * again, someone might have aded the | ||
3751 | * group back on the RB tree. Iterate to | ||
3752 | * make sure we get a different mem. | ||
3753 | * mem_cgroup_largest_soft_limit_node returns | ||
3754 | * NULL if no other cgroup is present on | ||
3755 | * the tree | ||
3756 | */ | ||
3757 | next_mz = | ||
3758 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
3759 | if (next_mz == mz) | ||
3760 | css_put(&next_mz->memcg->css); | ||
3761 | else /* next_mz == NULL or other memcg */ | ||
3762 | break; | ||
3763 | } while (1); | ||
3764 | } | ||
3765 | __mem_cgroup_remove_exceeded(mz, mctz); | ||
3766 | excess = res_counter_soft_limit_excess(&mz->memcg->res); | ||
3767 | /* | 3314 | /* |
3768 | * One school of thought says that we should not add | 3315 | * One school of thought says that we should not add |
3769 | * back the node to the tree if reclaim returns 0. | 3316 | * back the node to the tree if reclaim returns 0. |
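After a reclaim pass the node is taken off the soft-limit tree up front and its excess is recomputed with soft_limit_excess(), i.e. the number of pages by which usage exceeds the soft limit, before deciding whether to requeue it. A minimal sketch of that excess computation only; the RB-tree handling and the requeue policy are deliberately left out.

#include <stdio.h>

struct memcg { unsigned long usage, soft_limit; };   /* both in pages */

/* Pages by which the group currently exceeds its soft limit. */
static unsigned long soft_limit_excess(const struct memcg *m)
{
        return m->usage > m->soft_limit ? m->usage - m->soft_limit : 0;
}

int main(void)
{
        struct memcg m = { 300, 256 };

        printf("excess: %lu pages\n", soft_limit_excess(&m));
        return 0;
}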
@@ -3792,107 +3339,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3792 | return nr_reclaimed; | 3339 | return nr_reclaimed; |
3793 | } | 3340 | } |
3794 | 3341 | ||
3795 | /** | ||
3796 | * mem_cgroup_force_empty_list - clears LRU of a group | ||
3797 | * @memcg: group to clear | ||
3798 | * @node: NUMA node | ||
3799 | * @zid: zone id | ||
3800 | * @lru: lru to to clear | ||
3801 | * | ||
3802 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't | ||
3803 | * reclaim the pages page themselves - pages are moved to the parent (or root) | ||
3804 | * group. | ||
3805 | */ | ||
3806 | static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | ||
3807 | int node, int zid, enum lru_list lru) | ||
3808 | { | ||
3809 | struct lruvec *lruvec; | ||
3810 | unsigned long flags; | ||
3811 | struct list_head *list; | ||
3812 | struct page *busy; | ||
3813 | struct zone *zone; | ||
3814 | |||
3815 | zone = &NODE_DATA(node)->node_zones[zid]; | ||
3816 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | ||
3817 | list = &lruvec->lists[lru]; | ||
3818 | |||
3819 | busy = NULL; | ||
3820 | do { | ||
3821 | struct page_cgroup *pc; | ||
3822 | struct page *page; | ||
3823 | |||
3824 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
3825 | if (list_empty(list)) { | ||
3826 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
3827 | break; | ||
3828 | } | ||
3829 | page = list_entry(list->prev, struct page, lru); | ||
3830 | if (busy == page) { | ||
3831 | list_move(&page->lru, list); | ||
3832 | busy = NULL; | ||
3833 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
3834 | continue; | ||
3835 | } | ||
3836 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
3837 | |||
3838 | pc = lookup_page_cgroup(page); | ||
3839 | |||
3840 | if (mem_cgroup_move_parent(page, pc, memcg)) { | ||
3841 | /* found lock contention or "pc" is obsolete. */ | ||
3842 | busy = page; | ||
3843 | } else | ||
3844 | busy = NULL; | ||
3845 | cond_resched(); | ||
3846 | } while (!list_empty(list)); | ||
3847 | } | ||
3848 | |||
3849 | /* | ||
3850 | * make mem_cgroup's charge to be 0 if there is no task by moving | ||
3851 | * all the charges and pages to the parent. | ||
3852 | * This enables deleting this mem_cgroup. | ||
3853 | * | ||
3854 | * Caller is responsible for holding css reference on the memcg. | ||
3855 | */ | ||
3856 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | ||
3857 | { | ||
3858 | int node, zid; | ||
3859 | u64 usage; | ||
3860 | |||
3861 | do { | ||
3862 | /* This is for making all *used* pages to be on LRU. */ | ||
3863 | lru_add_drain_all(); | ||
3864 | drain_all_stock_sync(memcg); | ||
3865 | mem_cgroup_start_move(memcg); | ||
3866 | for_each_node_state(node, N_MEMORY) { | ||
3867 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
3868 | enum lru_list lru; | ||
3869 | for_each_lru(lru) { | ||
3870 | mem_cgroup_force_empty_list(memcg, | ||
3871 | node, zid, lru); | ||
3872 | } | ||
3873 | } | ||
3874 | } | ||
3875 | mem_cgroup_end_move(memcg); | ||
3876 | memcg_oom_recover(memcg); | ||
3877 | cond_resched(); | ||
3878 | |||
3879 | /* | ||
3880 | * Kernel memory may not necessarily be trackable to a specific | ||
3881 | * process. So they are not migrated, and therefore we can't | ||
3882 | * expect their value to drop to 0 here. | ||
3883 | * Having res filled up with kmem only is enough. | ||
3884 | * | ||
3885 | * This is a safety check because mem_cgroup_force_empty_list | ||
3886 | * could have raced with mem_cgroup_replace_page_cache callers | ||
3887 | * so the lru seemed empty but the page could have been added | ||
3888 | * right after the check. RES_USAGE should be safe as we always | ||
3889 | * charge before adding to the LRU. | ||
3890 | */ | ||
3891 | usage = res_counter_read_u64(&memcg->res, RES_USAGE) - | ||
3892 | res_counter_read_u64(&memcg->kmem, RES_USAGE); | ||
3893 | } while (usage > 0); | ||
3894 | } | ||
3895 | |||
3896 | /* | 3342 | /* |
3897 | * Test whether @memcg has children, dead or alive. Note that this | 3343 | * Test whether @memcg has children, dead or alive. Note that this |
3898 | * function doesn't care whether @memcg has use_hierarchy enabled and | 3344 | * function doesn't care whether @memcg has use_hierarchy enabled and |
@@ -3930,7 +3376,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | |||
3930 | /* we call try-to-free pages for make this cgroup empty */ | 3376 | /* we call try-to-free pages for make this cgroup empty */ |
3931 | lru_add_drain_all(); | 3377 | lru_add_drain_all(); |
3932 | /* try to free all pages in this cgroup */ | 3378 | /* try to free all pages in this cgroup */ |
3933 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { | 3379 | while (nr_retries && page_counter_read(&memcg->memory)) { |
3934 | int progress; | 3380 | int progress; |
3935 | 3381 | ||
3936 | if (signal_pending(current)) | 3382 | if (signal_pending(current)) |
@@ -4001,8 +3447,8 @@ out: | |||
4001 | return retval; | 3447 | return retval; |
4002 | } | 3448 | } |
4003 | 3449 | ||
4004 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, | 3450 | static unsigned long tree_stat(struct mem_cgroup *memcg, |
4005 | enum mem_cgroup_stat_index idx) | 3451 | enum mem_cgroup_stat_index idx) |
4006 | { | 3452 | { |
4007 | struct mem_cgroup *iter; | 3453 | struct mem_cgroup *iter; |
4008 | long val = 0; | 3454 | long val = 0; |
@@ -4020,55 +3466,71 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
4020 | { | 3466 | { |
4021 | u64 val; | 3467 | u64 val; |
4022 | 3468 | ||
4023 | if (!mem_cgroup_is_root(memcg)) { | 3469 | if (mem_cgroup_is_root(memcg)) { |
3470 | val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
3471 | val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
3472 | if (swap) | ||
3473 | val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); | ||
3474 | } else { | ||
4024 | if (!swap) | 3475 | if (!swap) |
4025 | return res_counter_read_u64(&memcg->res, RES_USAGE); | 3476 | val = page_counter_read(&memcg->memory); |
4026 | else | 3477 | else |
4027 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3478 | val = page_counter_read(&memcg->memsw); |
4028 | } | 3479 | } |
4029 | |||
4030 | /* | ||
4031 | * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS | ||
4032 | * as well as in MEM_CGROUP_STAT_RSS_HUGE. | ||
4033 | */ | ||
4034 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
4035 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
4036 | |||
4037 | if (swap) | ||
4038 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); | ||
4039 | |||
4040 | return val << PAGE_SHIFT; | 3480 | return val << PAGE_SHIFT; |
4041 | } | 3481 | } |
4042 | 3482 | ||
3483 | enum { | ||
3484 | RES_USAGE, | ||
3485 | RES_LIMIT, | ||
3486 | RES_MAX_USAGE, | ||
3487 | RES_FAILCNT, | ||
3488 | RES_SOFT_LIMIT, | ||
3489 | }; | ||
4043 | 3490 | ||
4044 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | 3491 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, |
4045 | struct cftype *cft) | 3492 | struct cftype *cft) |
4046 | { | 3493 | { |
4047 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3494 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4048 | enum res_type type = MEMFILE_TYPE(cft->private); | 3495 | struct page_counter *counter; |
4049 | int name = MEMFILE_ATTR(cft->private); | ||
4050 | 3496 | ||
4051 | switch (type) { | 3497 | switch (MEMFILE_TYPE(cft->private)) { |
4052 | case _MEM: | 3498 | case _MEM: |
4053 | if (name == RES_USAGE) | 3499 | counter = &memcg->memory; |
4054 | return mem_cgroup_usage(memcg, false); | 3500 | break; |
4055 | return res_counter_read_u64(&memcg->res, name); | ||
4056 | case _MEMSWAP: | 3501 | case _MEMSWAP: |
4057 | if (name == RES_USAGE) | 3502 | counter = &memcg->memsw; |
4058 | return mem_cgroup_usage(memcg, true); | 3503 | break; |
4059 | return res_counter_read_u64(&memcg->memsw, name); | ||
4060 | case _KMEM: | 3504 | case _KMEM: |
4061 | return res_counter_read_u64(&memcg->kmem, name); | 3505 | counter = &memcg->kmem; |
4062 | break; | 3506 | break; |
4063 | default: | 3507 | default: |
4064 | BUG(); | 3508 | BUG(); |
4065 | } | 3509 | } |
3510 | |||
3511 | switch (MEMFILE_ATTR(cft->private)) { | ||
3512 | case RES_USAGE: | ||
3513 | if (counter == &memcg->memory) | ||
3514 | return mem_cgroup_usage(memcg, false); | ||
3515 | if (counter == &memcg->memsw) | ||
3516 | return mem_cgroup_usage(memcg, true); | ||
3517 | return (u64)page_counter_read(counter) * PAGE_SIZE; | ||
3518 | case RES_LIMIT: | ||
3519 | return (u64)counter->limit * PAGE_SIZE; | ||
3520 | case RES_MAX_USAGE: | ||
3521 | return (u64)counter->watermark * PAGE_SIZE; | ||
3522 | case RES_FAILCNT: | ||
3523 | return counter->failcnt; | ||
3524 | case RES_SOFT_LIMIT: | ||
3525 | return (u64)memcg->soft_limit * PAGE_SIZE; | ||
3526 | default: | ||
3527 | BUG(); | ||
3528 | } | ||
4066 | } | 3529 | } |
4067 | 3530 | ||
4068 | #ifdef CONFIG_MEMCG_KMEM | 3531 | #ifdef CONFIG_MEMCG_KMEM |
4069 | /* should be called with activate_kmem_mutex held */ | 3532 | static int memcg_activate_kmem(struct mem_cgroup *memcg, |
4070 | static int __memcg_activate_kmem(struct mem_cgroup *memcg, | 3533 | unsigned long nr_pages) |
4071 | unsigned long long limit) | ||
4072 | { | 3534 | { |
4073 | int err = 0; | 3535 | int err = 0; |
4074 | int memcg_id; | 3536 | int memcg_id; |
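mem_cgroup_read_u64() now resolves a control file in two steps: MEMFILE_TYPE picks one of the three page_counters, MEMFILE_ATTR picks the field, and page counts are converted to bytes on the way out. A self-contained sketch of that dispatch with invented enum and field names:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

struct counter { unsigned long usage, limit, watermark, failcnt; };
struct memcg { struct counter memory, memsw, kmem; };

enum file_type { MEM, MEMSW, KMEM };
enum file_attr { USAGE, LIMIT, MAX_USAGE, FAILCNT };

/* Two-step dispatch: the file type picks a counter, the attribute a field. */
static uint64_t read_u64(struct memcg *m, enum file_type t, enum file_attr a)
{
        struct counter *c = &m->memory;

        switch (t) {
        case MEM:   c = &m->memory; break;
        case MEMSW: c = &m->memsw;  break;
        case KMEM:  c = &m->kmem;   break;
        }

        switch (a) {
        case USAGE:     return (uint64_t)c->usage * PAGE_SIZE;
        case LIMIT:     return (uint64_t)c->limit * PAGE_SIZE;
        case MAX_USAGE: return (uint64_t)c->watermark * PAGE_SIZE;
        case FAILCNT:   return c->failcnt;   /* a plain count, not bytes */
        }
        return 0;
}

int main(void)
{
        struct memcg m = { .memory = { 10, 100, 12, 3 } };

        printf("usage_in_bytes = %llu\n",
               (unsigned long long)read_u64(&m, MEM, USAGE));
        return 0;
}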
@@ -4115,7 +3577,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
4115 | * We couldn't have accounted to this cgroup, because it hasn't got the | 3577 | * We couldn't have accounted to this cgroup, because it hasn't got the |
4116 | * active bit set yet, so this should succeed. | 3578 | * active bit set yet, so this should succeed. |
4117 | */ | 3579 | */ |
4118 | err = res_counter_set_limit(&memcg->kmem, limit); | 3580 | err = page_counter_limit(&memcg->kmem, nr_pages); |
4119 | VM_BUG_ON(err); | 3581 | VM_BUG_ON(err); |
4120 | 3582 | ||
4121 | static_key_slow_inc(&memcg_kmem_enabled_key); | 3583 | static_key_slow_inc(&memcg_kmem_enabled_key); |
@@ -4130,26 +3592,17 @@ out: | |||
4130 | return err; | 3592 | return err; |
4131 | } | 3593 | } |
4132 | 3594 | ||
4133 | static int memcg_activate_kmem(struct mem_cgroup *memcg, | ||
4134 | unsigned long long limit) | ||
4135 | { | ||
4136 | int ret; | ||
4137 | |||
4138 | mutex_lock(&activate_kmem_mutex); | ||
4139 | ret = __memcg_activate_kmem(memcg, limit); | ||
4140 | mutex_unlock(&activate_kmem_mutex); | ||
4141 | return ret; | ||
4142 | } | ||
4143 | |||
4144 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | 3595 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, |
4145 | unsigned long long val) | 3596 | unsigned long limit) |
4146 | { | 3597 | { |
4147 | int ret; | 3598 | int ret; |
4148 | 3599 | ||
3600 | mutex_lock(&memcg_limit_mutex); | ||
4149 | if (!memcg_kmem_is_active(memcg)) | 3601 | if (!memcg_kmem_is_active(memcg)) |
4150 | ret = memcg_activate_kmem(memcg, val); | 3602 | ret = memcg_activate_kmem(memcg, limit); |
4151 | else | 3603 | else |
4152 | ret = res_counter_set_limit(&memcg->kmem, val); | 3604 | ret = page_counter_limit(&memcg->kmem, limit); |
3605 | mutex_unlock(&memcg_limit_mutex); | ||
4153 | return ret; | 3606 | return ret; |
4154 | } | 3607 | } |
4155 | 3608 | ||
@@ -4161,19 +3614,19 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
4161 | if (!parent) | 3614 | if (!parent) |
4162 | return 0; | 3615 | return 0; |
4163 | 3616 | ||
4164 | mutex_lock(&activate_kmem_mutex); | 3617 | mutex_lock(&memcg_limit_mutex); |
4165 | /* | 3618 | /* |
4166 | * If the parent cgroup is not kmem-active now, it cannot be activated | 3619 | * If the parent cgroup is not kmem-active now, it cannot be activated |
4167 | * after this point, because it has at least one child already. | 3620 | * after this point, because it has at least one child already. |
4168 | */ | 3621 | */ |
4169 | if (memcg_kmem_is_active(parent)) | 3622 | if (memcg_kmem_is_active(parent)) |
4170 | ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); | 3623 | ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); |
4171 | mutex_unlock(&activate_kmem_mutex); | 3624 | mutex_unlock(&memcg_limit_mutex); |
4172 | return ret; | 3625 | return ret; |
4173 | } | 3626 | } |
4174 | #else | 3627 | #else |
4175 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | 3628 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, |
4176 | unsigned long long val) | 3629 | unsigned long limit) |
4177 | { | 3630 | { |
4178 | return -EINVAL; | 3631 | return -EINVAL; |
4179 | } | 3632 | } |
@@ -4187,110 +3640,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | |||
4187 | char *buf, size_t nbytes, loff_t off) | 3640 | char *buf, size_t nbytes, loff_t off) |
4188 | { | 3641 | { |
4189 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 3642 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
4190 | enum res_type type; | 3643 | unsigned long nr_pages; |
4191 | int name; | ||
4192 | unsigned long long val; | ||
4193 | int ret; | 3644 | int ret; |
4194 | 3645 | ||
4195 | buf = strstrip(buf); | 3646 | buf = strstrip(buf); |
4196 | type = MEMFILE_TYPE(of_cft(of)->private); | 3647 | ret = page_counter_memparse(buf, &nr_pages); |
4197 | name = MEMFILE_ATTR(of_cft(of)->private); | 3648 | if (ret) |
3649 | return ret; | ||
4198 | 3650 | ||
4199 | switch (name) { | 3651 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
4200 | case RES_LIMIT: | 3652 | case RES_LIMIT: |
4201 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | 3653 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ |
4202 | ret = -EINVAL; | 3654 | ret = -EINVAL; |
4203 | break; | 3655 | break; |
4204 | } | 3656 | } |
4205 | /* This function does all necessary parse...reuse it */ | 3657 | switch (MEMFILE_TYPE(of_cft(of)->private)) { |
4206 | ret = res_counter_memparse_write_strategy(buf, &val); | 3658 | case _MEM: |
4207 | if (ret) | 3659 | ret = mem_cgroup_resize_limit(memcg, nr_pages); |
4208 | break; | 3660 | break; |
4209 | if (type == _MEM) | 3661 | case _MEMSWAP: |
4210 | ret = mem_cgroup_resize_limit(memcg, val); | 3662 | ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); |
4211 | else if (type == _MEMSWAP) | ||
4212 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | ||
4213 | else if (type == _KMEM) | ||
4214 | ret = memcg_update_kmem_limit(memcg, val); | ||
4215 | else | ||
4216 | return -EINVAL; | ||
4217 | break; | ||
4218 | case RES_SOFT_LIMIT: | ||
4219 | ret = res_counter_memparse_write_strategy(buf, &val); | ||
4220 | if (ret) | ||
4221 | break; | 3663 | break; |
4222 | /* | 3664 | case _KMEM: |
4223 | * For memsw, soft limits are hard to implement in terms | 3665 | ret = memcg_update_kmem_limit(memcg, nr_pages); |
4224 | * of semantics, for now, we support soft limits for | 3666 | break; |
4225 | * control without swap | 3667 | } |
4226 | */ | ||
4227 | if (type == _MEM) | ||
4228 | ret = res_counter_set_soft_limit(&memcg->res, val); | ||
4229 | else | ||
4230 | ret = -EINVAL; | ||
4231 | break; | 3668 | break; |
4232 | default: | 3669 | case RES_SOFT_LIMIT: |
4233 | ret = -EINVAL; /* should be BUG() ? */ | 3670 | memcg->soft_limit = nr_pages; |
3671 | ret = 0; | ||
4234 | break; | 3672 | break; |
4235 | } | 3673 | } |
4236 | return ret ?: nbytes; | 3674 | return ret ?: nbytes; |
4237 | } | 3675 | } |
4238 | 3676 | ||
4239 | static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, | ||
4240 | unsigned long long *mem_limit, unsigned long long *memsw_limit) | ||
4241 | { | ||
4242 | unsigned long long min_limit, min_memsw_limit, tmp; | ||
4243 | |||
4244 | min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
4245 | min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
4246 | if (!memcg->use_hierarchy) | ||
4247 | goto out; | ||
4248 | |||
4249 | while (memcg->css.parent) { | ||
4250 | memcg = mem_cgroup_from_css(memcg->css.parent); | ||
4251 | if (!memcg->use_hierarchy) | ||
4252 | break; | ||
4253 | tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
4254 | min_limit = min(min_limit, tmp); | ||
4255 | tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
4256 | min_memsw_limit = min(min_memsw_limit, tmp); | ||
4257 | } | ||
4258 | out: | ||
4259 | *mem_limit = min_limit; | ||
4260 | *memsw_limit = min_memsw_limit; | ||
4261 | } | ||
4262 | |||
4263 | static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, | 3677 | static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, |
4264 | size_t nbytes, loff_t off) | 3678 | size_t nbytes, loff_t off) |
4265 | { | 3679 | { |
4266 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 3680 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
4267 | int name; | 3681 | struct page_counter *counter; |
4268 | enum res_type type; | ||
4269 | 3682 | ||
4270 | type = MEMFILE_TYPE(of_cft(of)->private); | 3683 | switch (MEMFILE_TYPE(of_cft(of)->private)) { |
4271 | name = MEMFILE_ATTR(of_cft(of)->private); | 3684 | case _MEM: |
3685 | counter = &memcg->memory; | ||
3686 | break; | ||
3687 | case _MEMSWAP: | ||
3688 | counter = &memcg->memsw; | ||
3689 | break; | ||
3690 | case _KMEM: | ||
3691 | counter = &memcg->kmem; | ||
3692 | break; | ||
3693 | default: | ||
3694 | BUG(); | ||
3695 | } | ||
4272 | 3696 | ||
4273 | switch (name) { | 3697 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
4274 | case RES_MAX_USAGE: | 3698 | case RES_MAX_USAGE: |
4275 | if (type == _MEM) | 3699 | page_counter_reset_watermark(counter); |
4276 | res_counter_reset_max(&memcg->res); | ||
4277 | else if (type == _MEMSWAP) | ||
4278 | res_counter_reset_max(&memcg->memsw); | ||
4279 | else if (type == _KMEM) | ||
4280 | res_counter_reset_max(&memcg->kmem); | ||
4281 | else | ||
4282 | return -EINVAL; | ||
4283 | break; | 3700 | break; |
4284 | case RES_FAILCNT: | 3701 | case RES_FAILCNT: |
4285 | if (type == _MEM) | 3702 | counter->failcnt = 0; |
4286 | res_counter_reset_failcnt(&memcg->res); | ||
4287 | else if (type == _MEMSWAP) | ||
4288 | res_counter_reset_failcnt(&memcg->memsw); | ||
4289 | else if (type == _KMEM) | ||
4290 | res_counter_reset_failcnt(&memcg->kmem); | ||
4291 | else | ||
4292 | return -EINVAL; | ||
4293 | break; | 3703 | break; |
3704 | default: | ||
3705 | BUG(); | ||
4294 | } | 3706 | } |
4295 | 3707 | ||
4296 | return nbytes; | 3708 | return nbytes; |
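mem_cgroup_write() parses the user string once with page_counter_memparse(), which yields a page count, and only then dispatches on the file type. The toy parser below is a stand-in for that helper (it only handles K/M/G suffixes and "-1" for "no limit") and is meant to show the bytes-to-pages conversion, not the kernel's actual parsing rules.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SHIFT 12
#define COUNTER_MAX (~0UL >> PAGE_SHIFT)

/* Toy page_counter_memparse(): K/M/G suffixes, "-1" means no limit. */
static int memparse_pages(const char *buf, unsigned long *nr_pages)
{
        char *end;
        unsigned long long bytes;

        if (!strcmp(buf, "-1")) {
                *nr_pages = COUNTER_MAX;
                return 0;
        }
        bytes = strtoull(buf, &end, 0);
        switch (*end) {
        case 'G': bytes <<= 10;  /* fall through */
        case 'M': bytes <<= 10;  /* fall through */
        case 'K': bytes <<= 10;  break;
        case '\0': break;
        default: return -1;
        }
        *nr_pages = bytes >> PAGE_SHIFT;
        return 0;
}

int main(void)
{
        unsigned long pages;

        if (!memparse_pages("512M", &pages))
                printf("512M -> %lu pages\n", pages);
        return 0;
}

With 4K pages, a write of 512M to memory.limit_in_bytes therefore ends up as a limit of 131072 pages on the counter.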
@@ -4387,6 +3799,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) | |||
4387 | static int memcg_stat_show(struct seq_file *m, void *v) | 3799 | static int memcg_stat_show(struct seq_file *m, void *v) |
4388 | { | 3800 | { |
4389 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 3801 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
3802 | unsigned long memory, memsw; | ||
4390 | struct mem_cgroup *mi; | 3803 | struct mem_cgroup *mi; |
4391 | unsigned int i; | 3804 | unsigned int i; |
4392 | 3805 | ||
@@ -4406,14 +3819,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
4406 | mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); | 3819 | mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); |
4407 | 3820 | ||
4408 | /* Hierarchical information */ | 3821 | /* Hierarchical information */ |
4409 | { | 3822 | memory = memsw = PAGE_COUNTER_MAX; |
4410 | unsigned long long limit, memsw_limit; | 3823 | for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { |
4411 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); | 3824 | memory = min(memory, mi->memory.limit); |
4412 | seq_printf(m, "hierarchical_memory_limit %llu\n", limit); | 3825 | memsw = min(memsw, mi->memsw.limit); |
4413 | if (do_swap_account) | ||
4414 | seq_printf(m, "hierarchical_memsw_limit %llu\n", | ||
4415 | memsw_limit); | ||
4416 | } | 3826 | } |
3827 | seq_printf(m, "hierarchical_memory_limit %llu\n", | ||
3828 | (u64)memory * PAGE_SIZE); | ||
3829 | if (do_swap_account) | ||
3830 | seq_printf(m, "hierarchical_memsw_limit %llu\n", | ||
3831 | (u64)memsw * PAGE_SIZE); | ||
4417 | 3832 | ||
4418 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3833 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4419 | long long val = 0; | 3834 | long long val = 0; |
@@ -4497,7 +3912,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, | |||
4497 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | 3912 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) |
4498 | { | 3913 | { |
4499 | struct mem_cgroup_threshold_ary *t; | 3914 | struct mem_cgroup_threshold_ary *t; |
4500 | u64 usage; | 3915 | unsigned long usage; |
4501 | int i; | 3916 | int i; |
4502 | 3917 | ||
4503 | rcu_read_lock(); | 3918 | rcu_read_lock(); |
@@ -4596,10 +4011,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
4596 | { | 4011 | { |
4597 | struct mem_cgroup_thresholds *thresholds; | 4012 | struct mem_cgroup_thresholds *thresholds; |
4598 | struct mem_cgroup_threshold_ary *new; | 4013 | struct mem_cgroup_threshold_ary *new; |
4599 | u64 threshold, usage; | 4014 | unsigned long threshold; |
4015 | unsigned long usage; | ||
4600 | int i, size, ret; | 4016 | int i, size, ret; |
4601 | 4017 | ||
4602 | ret = res_counter_memparse_write_strategy(args, &threshold); | 4018 | ret = page_counter_memparse(args, &threshold); |
4603 | if (ret) | 4019 | if (ret) |
4604 | return ret; | 4020 | return ret; |
4605 | 4021 | ||
@@ -4689,7 +4105,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |||
4689 | { | 4105 | { |
4690 | struct mem_cgroup_thresholds *thresholds; | 4106 | struct mem_cgroup_thresholds *thresholds; |
4691 | struct mem_cgroup_threshold_ary *new; | 4107 | struct mem_cgroup_threshold_ary *new; |
4692 | u64 usage; | 4108 | unsigned long usage; |
4693 | int i, j, size; | 4109 | int i, j, size; |
4694 | 4110 | ||
4695 | mutex_lock(&memcg->thresholds_lock); | 4111 | mutex_lock(&memcg->thresholds_lock); |
@@ -4855,40 +4271,6 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) | |||
4855 | { | 4271 | { |
4856 | mem_cgroup_sockets_destroy(memcg); | 4272 | mem_cgroup_sockets_destroy(memcg); |
4857 | } | 4273 | } |
4858 | |||
4859 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | ||
4860 | { | ||
4861 | if (!memcg_kmem_is_active(memcg)) | ||
4862 | return; | ||
4863 | |||
4864 | /* | ||
4865 | * kmem charges can outlive the cgroup. In the case of slab | ||
4866 | * pages, for instance, a page contain objects from various | ||
4867 | * processes. As we prevent from taking a reference for every | ||
4868 | * such allocation we have to be careful when doing uncharge | ||
4869 | * (see memcg_uncharge_kmem) and here during offlining. | ||
4870 | * | ||
4871 | * The idea is that that only the _last_ uncharge which sees | ||
4872 | * the dead memcg will drop the last reference. An additional | ||
4873 | * reference is taken here before the group is marked dead | ||
4874 | * which is then paired with css_put during uncharge resp. here. | ||
4875 | * | ||
4876 | * Although this might sound strange as this path is called from | ||
4877 | * css_offline() when the referencemight have dropped down to 0 and | ||
4878 | * shouldn't be incremented anymore (css_tryget_online() would | ||
4879 | * fail) we do not have other options because of the kmem | ||
4880 | * allocations lifetime. | ||
4881 | */ | ||
4882 | css_get(&memcg->css); | ||
4883 | |||
4884 | memcg_kmem_mark_dead(memcg); | ||
4885 | |||
4886 | if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) | ||
4887 | return; | ||
4888 | |||
4889 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
4890 | css_put(&memcg->css); | ||
4891 | } | ||
4892 | #else | 4274 | #else |
4893 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 4275 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4894 | { | 4276 | { |
@@ -4898,10 +4280,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
4898 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4280 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
4899 | { | 4281 | { |
4900 | } | 4282 | } |
4901 | |||
4902 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | ||
4903 | { | ||
4904 | } | ||
4905 | #endif | 4283 | #endif |
4906 | 4284 | ||
4907 | /* | 4285 | /* |
@@ -5228,7 +4606,10 @@ static struct cftype mem_cgroup_files[] = { | |||
5228 | #ifdef CONFIG_SLABINFO | 4606 | #ifdef CONFIG_SLABINFO |
5229 | { | 4607 | { |
5230 | .name = "kmem.slabinfo", | 4608 | .name = "kmem.slabinfo", |
5231 | .seq_show = mem_cgroup_slabinfo_read, | 4609 | .seq_start = slab_start, |
4610 | .seq_next = slab_next, | ||
4611 | .seq_stop = slab_stop, | ||
4612 | .seq_show = memcg_slab_show, | ||
5232 | }, | 4613 | }, |
5233 | #endif | 4614 | #endif |
5234 | #endif | 4615 | #endif |
@@ -5363,9 +4744,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
5363 | */ | 4744 | */ |
5364 | struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | 4745 | struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) |
5365 | { | 4746 | { |
5366 | if (!memcg->res.parent) | 4747 | if (!memcg->memory.parent) |
5367 | return NULL; | 4748 | return NULL; |
5368 | return mem_cgroup_from_res_counter(memcg->res.parent, res); | 4749 | return mem_cgroup_from_counter(memcg->memory.parent, memory); |
5369 | } | 4750 | } |
5370 | EXPORT_SYMBOL(parent_mem_cgroup); | 4751 | EXPORT_SYMBOL(parent_mem_cgroup); |
5371 | 4752 | ||
@@ -5410,9 +4791,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
5410 | /* root ? */ | 4791 | /* root ? */ |
5411 | if (parent_css == NULL) { | 4792 | if (parent_css == NULL) { |
5412 | root_mem_cgroup = memcg; | 4793 | root_mem_cgroup = memcg; |
5413 | res_counter_init(&memcg->res, NULL); | 4794 | page_counter_init(&memcg->memory, NULL); |
5414 | res_counter_init(&memcg->memsw, NULL); | 4795 | page_counter_init(&memcg->memsw, NULL); |
5415 | res_counter_init(&memcg->kmem, NULL); | 4796 | page_counter_init(&memcg->kmem, NULL); |
5416 | } | 4797 | } |
5417 | 4798 | ||
5418 | memcg->last_scanned_node = MAX_NUMNODES; | 4799 | memcg->last_scanned_node = MAX_NUMNODES; |
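page_counter_init() takes the parent counter (or NULL), and it is that parent pointer which later makes charges propagate up the hierarchy when use_hierarchy is set. The sketch below models the idea with a toy counter; the unwinding of partial charges on failure is deliberately omitted, so it is not the real try-charge semantics.

#include <stdbool.h>
#include <stdio.h>

struct counter {
        unsigned long usage, limit;
        struct counter *parent;          /* NULL for the root or use_hierarchy=0 */
};

static void counter_init(struct counter *c, struct counter *parent)
{
        c->usage = 0;
        c->limit = ~0UL;
        c->parent = parent;
}

/* A hierarchical charge touches every ancestor up to the root. */
static bool counter_try_charge(struct counter *c, unsigned long nr_pages)
{
        struct counter *i;

        for (i = c; i; i = i->parent) {
                if (i->usage + nr_pages > i->limit)
                        return false;    /* simplified: no unwinding on failure */
                i->usage += nr_pages;
        }
        return true;
}

int main(void)
{
        struct counter root, child;

        counter_init(&root, NULL);
        counter_init(&child, &root);     /* as done when use_hierarchy is set */
        counter_try_charge(&child, 4);
        printf("root usage: %lu\n", root.usage);
        return 0;
}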
@@ -5451,18 +4832,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
5451 | memcg->swappiness = mem_cgroup_swappiness(parent); | 4832 | memcg->swappiness = mem_cgroup_swappiness(parent); |
5452 | 4833 | ||
5453 | if (parent->use_hierarchy) { | 4834 | if (parent->use_hierarchy) { |
5454 | res_counter_init(&memcg->res, &parent->res); | 4835 | page_counter_init(&memcg->memory, &parent->memory); |
5455 | res_counter_init(&memcg->memsw, &parent->memsw); | 4836 | page_counter_init(&memcg->memsw, &parent->memsw); |
5456 | res_counter_init(&memcg->kmem, &parent->kmem); | 4837 | page_counter_init(&memcg->kmem, &parent->kmem); |
5457 | 4838 | ||
5458 | /* | 4839 | /* |
5459 | * No need to take a reference to the parent because cgroup | 4840 | * No need to take a reference to the parent because cgroup |
5460 | * core guarantees its existence. | 4841 | * core guarantees its existence. |
5461 | */ | 4842 | */ |
5462 | } else { | 4843 | } else { |
5463 | res_counter_init(&memcg->res, NULL); | 4844 | page_counter_init(&memcg->memory, NULL); |
5464 | res_counter_init(&memcg->memsw, NULL); | 4845 | page_counter_init(&memcg->memsw, NULL); |
5465 | res_counter_init(&memcg->kmem, NULL); | 4846 | page_counter_init(&memcg->kmem, NULL); |
5466 | /* | 4847 | /* |
5467 | * Deeper hierachy with use_hierarchy == false doesn't make | 4848 | * Deeper hierachy with use_hierarchy == false doesn't make |
5468 | * much sense so let cgroup subsystem know about this | 4849 | * much sense so let cgroup subsystem know about this |
@@ -5487,29 +4868,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
5487 | return 0; | 4868 | return 0; |
5488 | } | 4869 | } |
5489 | 4870 | ||
5490 | /* | ||
5491 | * Announce all parents that a group from their hierarchy is gone. | ||
5492 | */ | ||
5493 | static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | ||
5494 | { | ||
5495 | struct mem_cgroup *parent = memcg; | ||
5496 | |||
5497 | while ((parent = parent_mem_cgroup(parent))) | ||
5498 | mem_cgroup_iter_invalidate(parent); | ||
5499 | |||
5500 | /* | ||
5501 | * if the root memcg is not hierarchical we have to check it | ||
5502 | * explicitely. | ||
5503 | */ | ||
5504 | if (!root_mem_cgroup->use_hierarchy) | ||
5505 | mem_cgroup_iter_invalidate(root_mem_cgroup); | ||
5506 | } | ||
5507 | |||
5508 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | 4871 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) |
5509 | { | 4872 | { |
5510 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4873 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5511 | struct mem_cgroup_event *event, *tmp; | 4874 | struct mem_cgroup_event *event, *tmp; |
5512 | struct cgroup_subsys_state *iter; | ||
5513 | 4875 | ||
5514 | /* | 4876 | /* |
5515 | * Unregister events and notify userspace. | 4877 | * Unregister events and notify userspace. |
@@ -5523,17 +4885,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
5523 | } | 4885 | } |
5524 | spin_unlock(&memcg->event_list_lock); | 4886 | spin_unlock(&memcg->event_list_lock); |
5525 | 4887 | ||
5526 | kmem_cgroup_css_offline(memcg); | ||
5527 | |||
5528 | mem_cgroup_invalidate_reclaim_iterators(memcg); | ||
5529 | |||
5530 | /* | ||
5531 | * This requires that offlining is serialized. Right now that is | ||
5532 | * guaranteed because css_killed_work_fn() holds the cgroup_mutex. | ||
5533 | */ | ||
5534 | css_for_each_descendant_post(iter, css) | ||
5535 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); | ||
5536 | |||
5537 | memcg_unregister_all_caches(memcg); | 4888 | memcg_unregister_all_caches(memcg); |
5538 | vmpressure_cleanup(&memcg->vmpressure); | 4889 | vmpressure_cleanup(&memcg->vmpressure); |
5539 | } | 4890 | } |
@@ -5541,42 +4892,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
5541 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) | 4892 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) |
5542 | { | 4893 | { |
5543 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4894 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5544 | /* | ||
5545 | * XXX: css_offline() would be where we should reparent all | ||
5546 | * memory to prepare the cgroup for destruction. However, | ||
5547 | * memcg does not do css_tryget_online() and res_counter charging | ||
5548 | * under the same RCU lock region, which means that charging | ||
5549 | * could race with offlining. Offlining only happens to | ||
5550 | * cgroups with no tasks in them but charges can show up | ||
5551 | * without any tasks from the swapin path when the target | ||
5552 | * memcg is looked up from the swapout record and not from the | ||
5553 | * current task as it usually is. A race like this can leak | ||
5554 | * charges and put pages with stale cgroup pointers into | ||
5555 | * circulation: | ||
5556 | * | ||
5557 | * #0 #1 | ||
5558 | * lookup_swap_cgroup_id() | ||
5559 | * rcu_read_lock() | ||
5560 | * mem_cgroup_lookup() | ||
5561 | * css_tryget_online() | ||
5562 | * rcu_read_unlock() | ||
5563 | * disable css_tryget_online() | ||
5564 | * call_rcu() | ||
5565 | * offline_css() | ||
5566 | * reparent_charges() | ||
5567 | * res_counter_charge() | ||
5568 | * css_put() | ||
5569 | * css_free() | ||
5570 | * pc->mem_cgroup = dead memcg | ||
5571 | * add page to lru | ||
5572 | * | ||
5573 | * The bulk of the charges are still moved in offline_css() to | ||
5574 | * avoid pinning a lot of pages in case a long-term reference | ||
5575 | * like a swapout record is deferring the css_free() to long | ||
5576 | * after offlining. But this makes sure we catch any charges | ||
5577 | * made after offlining: | ||
5578 | */ | ||
5579 | mem_cgroup_reparent_charges(memcg); | ||
5580 | 4895 | ||
5581 | memcg_destroy_kmem(memcg); | 4896 | memcg_destroy_kmem(memcg); |
5582 | __mem_cgroup_free(memcg); | 4897 | __mem_cgroup_free(memcg); |
@@ -5599,10 +4914,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | |||
5599 | { | 4914 | { |
5600 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4915 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5601 | 4916 | ||
5602 | mem_cgroup_resize_limit(memcg, ULLONG_MAX); | 4917 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); |
5603 | mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); | 4918 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); |
5604 | memcg_update_kmem_limit(memcg, ULLONG_MAX); | 4919 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); |
5605 | res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); | 4920 | memcg->soft_limit = 0; |
5606 | } | 4921 | } |
5607 | 4922 | ||
5608 | #ifdef CONFIG_MMU | 4923 | #ifdef CONFIG_MMU |
@@ -5758,7 +5073,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
5758 | unsigned long addr, pte_t ptent, union mc_target *target) | 5073 | unsigned long addr, pte_t ptent, union mc_target *target) |
5759 | { | 5074 | { |
5760 | struct page *page = NULL; | 5075 | struct page *page = NULL; |
5761 | struct page_cgroup *pc; | ||
5762 | enum mc_target_type ret = MC_TARGET_NONE; | 5076 | enum mc_target_type ret = MC_TARGET_NONE; |
5763 | swp_entry_t ent = { .val = 0 }; | 5077 | swp_entry_t ent = { .val = 0 }; |
5764 | 5078 | ||
@@ -5772,13 +5086,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
5772 | if (!page && !ent.val) | 5086 | if (!page && !ent.val) |
5773 | return ret; | 5087 | return ret; |
5774 | if (page) { | 5088 | if (page) { |
5775 | pc = lookup_page_cgroup(page); | ||
5776 | /* | 5089 | /* |
5777 | * Do only loose check w/o serialization. | 5090 | * Do only loose check w/o serialization. |
5778 | * mem_cgroup_move_account() checks the pc is valid or | 5091 | * mem_cgroup_move_account() checks the page is valid or |
5779 | * not under LRU exclusion. | 5092 | * not under LRU exclusion. |
5780 | */ | 5093 | */ |
5781 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | 5094 | if (page->mem_cgroup == mc.from) { |
5782 | ret = MC_TARGET_PAGE; | 5095 | ret = MC_TARGET_PAGE; |
5783 | if (target) | 5096 | if (target) |
5784 | target->page = page; | 5097 | target->page = page; |
@@ -5806,15 +5119,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |||
5806 | unsigned long addr, pmd_t pmd, union mc_target *target) | 5119 | unsigned long addr, pmd_t pmd, union mc_target *target) |
5807 | { | 5120 | { |
5808 | struct page *page = NULL; | 5121 | struct page *page = NULL; |
5809 | struct page_cgroup *pc; | ||
5810 | enum mc_target_type ret = MC_TARGET_NONE; | 5122 | enum mc_target_type ret = MC_TARGET_NONE; |
5811 | 5123 | ||
5812 | page = pmd_page(pmd); | 5124 | page = pmd_page(pmd); |
5813 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); | 5125 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
5814 | if (!move_anon()) | 5126 | if (!move_anon()) |
5815 | return ret; | 5127 | return ret; |
5816 | pc = lookup_page_cgroup(page); | 5128 | if (page->mem_cgroup == mc.from) { |
5817 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
5818 | ret = MC_TARGET_PAGE; | 5129 | ret = MC_TARGET_PAGE; |
5819 | if (target) { | 5130 | if (target) { |
5820 | get_page(page); | 5131 | get_page(page); |
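Both get_mctgt_type() hunks drop the lookup_page_cgroup()/PageCgroupUsed() pair in favour of reading page->mem_cgroup directly: NULL now means the page was never charged, anything else is the owning group. A minimal sketch of that ownership test, with toy struct page and struct mem_cgroup types standing in for the kernel ones (only the mem_cgroup pointer is modelled):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct mem_cgroup { const char *name; };

/* Toy struct page: the owner pointer lives in the page itself,
 * so there is no separate page_cgroup array to look up. */
struct page { struct mem_cgroup *mem_cgroup; };

/* Loose, unserialized check as in get_mctgt_type(): is this page
 * currently charged to the cgroup we are moving charges from? */
static bool page_owned_by(struct page *page, struct mem_cgroup *from)
{
	return page->mem_cgroup == from;	/* NULL means "never charged" */
}

int main(void)
{
	struct mem_cgroup from = { "from" }, other = { "other" };
	struct page a = { &from }, b = { &other }, readahead = { NULL };

	printf("%d %d %d\n", page_owned_by(&a, &from),
	       page_owned_by(&b, &from), page_owned_by(&readahead, &from));
	return 0;
}

The check stays deliberately loose, as the updated comment says; mem_cgroup_move_account() revalidates under LRU exclusion before anything is actually moved.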
@@ -5897,7 +5208,6 @@ static void __mem_cgroup_clear_mc(void) | |||
5897 | { | 5208 | { |
5898 | struct mem_cgroup *from = mc.from; | 5209 | struct mem_cgroup *from = mc.from; |
5899 | struct mem_cgroup *to = mc.to; | 5210 | struct mem_cgroup *to = mc.to; |
5900 | int i; | ||
5901 | 5211 | ||
5902 | /* we must uncharge all the leftover precharges from mc.to */ | 5212 | /* we must uncharge all the leftover precharges from mc.to */ |
5903 | if (mc.precharge) { | 5213 | if (mc.precharge) { |
@@ -5916,19 +5226,17 @@ static void __mem_cgroup_clear_mc(void) | |||
5916 | if (mc.moved_swap) { | 5226 | if (mc.moved_swap) { |
5917 | /* uncharge swap account from the old cgroup */ | 5227 | /* uncharge swap account from the old cgroup */ |
5918 | if (!mem_cgroup_is_root(mc.from)) | 5228 | if (!mem_cgroup_is_root(mc.from)) |
5919 | res_counter_uncharge(&mc.from->memsw, | 5229 | page_counter_uncharge(&mc.from->memsw, mc.moved_swap); |
5920 | PAGE_SIZE * mc.moved_swap); | ||
5921 | |||
5922 | for (i = 0; i < mc.moved_swap; i++) | ||
5923 | css_put(&mc.from->css); | ||
5924 | 5230 | ||
5925 | /* | 5231 | /* |
5926 | * we charged both to->res and to->memsw, so we should | 5232 | * we charged both to->memory and to->memsw, so we |
5927 | * uncharge to->res. | 5233 | * should uncharge to->memory. |
5928 | */ | 5234 | */ |
5929 | if (!mem_cgroup_is_root(mc.to)) | 5235 | if (!mem_cgroup_is_root(mc.to)) |
5930 | res_counter_uncharge(&mc.to->res, | 5236 | page_counter_uncharge(&mc.to->memory, mc.moved_swap); |
5931 | PAGE_SIZE * mc.moved_swap); | 5237 | |
5238 | css_put_many(&mc.from->css, mc.moved_swap); | ||
5239 | |||
5932 | /* we've already done css_get(mc.to) */ | 5240 | /* we've already done css_get(mc.to) */ |
5933 | mc.moved_swap = 0; | 5241 | mc.moved_swap = 0; |
5934 | } | 5242 | } |
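The swap-move cleanup above makes two changes at once: the memsw uncharge is counted in pages instead of PAGE_SIZE * mc.moved_swap bytes, and the per-entry css_put() loop collapses into a single css_put_many(). A small model of the batched reference drop, with the css refcount reduced to a plain counter (toy code, not the kernel refcounting):

#include <assert.h>
#include <stdio.h>

struct css { long refcnt; };	/* toy stand-in for a css reference count */

/* Batched put, as css_put_many() does in one call. */
static void css_put_many(struct css *css, long n)
{
	css->refcnt -= n;
}

int main(void)
{
	struct css from = { .refcnt = 1 };
	long moved_swap = 3;

	from.refcnt += moved_swap;	/* precharge took one ref per moved entry */

	/* old code: for (i = 0; i < moved_swap; i++) css_put(&from); */
	css_put_many(&from, moved_swap);

	assert(from.refcnt == 1);
	printf("refcnt back to %ld\n", from.refcnt);
	return 0;
}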
@@ -5939,8 +5247,6 @@ static void __mem_cgroup_clear_mc(void) | |||
5939 | 5247 | ||
5940 | static void mem_cgroup_clear_mc(void) | 5248 | static void mem_cgroup_clear_mc(void) |
5941 | { | 5249 | { |
5942 | struct mem_cgroup *from = mc.from; | ||
5943 | |||
5944 | /* | 5250 | /* |
5945 | * we must clear moving_task before waking up waiters at the end of | 5251 | * we must clear moving_task before waking up waiters at the end of |
5946 | * task migration. | 5252 | * task migration. |
@@ -5951,7 +5257,6 @@ static void mem_cgroup_clear_mc(void) | |||
5951 | mc.from = NULL; | 5257 | mc.from = NULL; |
5952 | mc.to = NULL; | 5258 | mc.to = NULL; |
5953 | spin_unlock(&mc.lock); | 5259 | spin_unlock(&mc.lock); |
5954 | mem_cgroup_end_move(from); | ||
5955 | } | 5260 | } |
5956 | 5261 | ||
5957 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | 5262 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, |
@@ -5984,7 +5289,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
5984 | VM_BUG_ON(mc.precharge); | 5289 | VM_BUG_ON(mc.precharge); |
5985 | VM_BUG_ON(mc.moved_charge); | 5290 | VM_BUG_ON(mc.moved_charge); |
5986 | VM_BUG_ON(mc.moved_swap); | 5291 | VM_BUG_ON(mc.moved_swap); |
5987 | mem_cgroup_start_move(from); | 5292 | |
5988 | spin_lock(&mc.lock); | 5293 | spin_lock(&mc.lock); |
5989 | mc.from = from; | 5294 | mc.from = from; |
5990 | mc.to = memcg; | 5295 | mc.to = memcg; |
@@ -6004,7 +5309,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
6004 | static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, | 5309 | static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, |
6005 | struct cgroup_taskset *tset) | 5310 | struct cgroup_taskset *tset) |
6006 | { | 5311 | { |
6007 | mem_cgroup_clear_mc(); | 5312 | if (mc.to) |
5313 | mem_cgroup_clear_mc(); | ||
6008 | } | 5314 | } |
6009 | 5315 | ||
6010 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | 5316 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, |
@@ -6018,7 +5324,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
6018 | enum mc_target_type target_type; | 5324 | enum mc_target_type target_type; |
6019 | union mc_target target; | 5325 | union mc_target target; |
6020 | struct page *page; | 5326 | struct page *page; |
6021 | struct page_cgroup *pc; | ||
6022 | 5327 | ||
6023 | /* | 5328 | /* |
6024 | * We don't take compound_lock() here but no race with splitting thp | 5329 | * We don't take compound_lock() here but no race with splitting thp |
@@ -6039,9 +5344,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
6039 | if (target_type == MC_TARGET_PAGE) { | 5344 | if (target_type == MC_TARGET_PAGE) { |
6040 | page = target.page; | 5345 | page = target.page; |
6041 | if (!isolate_lru_page(page)) { | 5346 | if (!isolate_lru_page(page)) { |
6042 | pc = lookup_page_cgroup(page); | ||
6043 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | 5347 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, |
6044 | pc, mc.from, mc.to)) { | 5348 | mc.from, mc.to)) { |
6045 | mc.precharge -= HPAGE_PMD_NR; | 5349 | mc.precharge -= HPAGE_PMD_NR; |
6046 | mc.moved_charge += HPAGE_PMD_NR; | 5350 | mc.moved_charge += HPAGE_PMD_NR; |
6047 | } | 5351 | } |
@@ -6069,9 +5373,7 @@ retry: | |||
6069 | page = target.page; | 5373 | page = target.page; |
6070 | if (isolate_lru_page(page)) | 5374 | if (isolate_lru_page(page)) |
6071 | goto put; | 5375 | goto put; |
6072 | pc = lookup_page_cgroup(page); | 5376 | if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { |
6073 | if (!mem_cgroup_move_account(page, 1, pc, | ||
6074 | mc.from, mc.to)) { | ||
6075 | mc.precharge--; | 5377 | mc.precharge--; |
6076 | /* we uncharge from mc.from later. */ | 5378 | /* we uncharge from mc.from later. */ |
6077 | mc.moved_charge++; | 5379 | mc.moved_charge++; |
@@ -6115,6 +5417,13 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
6115 | struct vm_area_struct *vma; | 5417 | struct vm_area_struct *vma; |
6116 | 5418 | ||
6117 | lru_add_drain_all(); | 5419 | lru_add_drain_all(); |
5420 | /* | ||
5421 | * Signal mem_cgroup_begin_page_stat() to take the memcg's | ||
5422 | * move_lock while we're moving its pages to another memcg. | ||
5423 | * Then wait for already started RCU-only updates to finish. | ||
5424 | */ | ||
5425 | atomic_inc(&mc.from->moving_account); | ||
5426 | synchronize_rcu(); | ||
6118 | retry: | 5427 | retry: |
6119 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | 5428 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
6120 | /* | 5429 | /* |
@@ -6147,6 +5456,7 @@ retry: | |||
6147 | break; | 5456 | break; |
6148 | } | 5457 | } |
6149 | up_read(&mm->mmap_sem); | 5458 | up_read(&mm->mmap_sem); |
5459 | atomic_dec(&mc.from->moving_account); | ||
6150 | } | 5460 | } |
6151 | 5461 | ||
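mem_cgroup_move_charge() now flags mc.from->moving_account before walking the mm and calls synchronize_rcu() so that statistics updaters already inside their RCU-only fast path drain out; afterwards mem_cgroup_begin_page_stat() is expected to see the flag and take the memcg's move_lock instead. A compilable userspace sketch of that handshake, with an explicit reader counter standing in for the RCU grace period and a pthread mutex standing in for move_lock (all names here are illustrative, not the kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int moving_account;	/* like memcg->moving_account */
static atomic_int fast_readers;		/* stand-in for RCU read-side sections */
static pthread_mutex_t move_lock = PTHREAD_MUTEX_INITIALIZER;

/* Statistics updater: lockless while no move is in flight,
 * otherwise serialized against the mover by move_lock. */
static void update_page_stat(void)
{
	if (!atomic_load(&moving_account)) {
		atomic_fetch_add(&fast_readers, 1);
		/* ... lockless statistics update ... */
		atomic_fetch_sub(&fast_readers, 1);
		return;
	}
	pthread_mutex_lock(&move_lock);
	/* ... statistics update under move_lock ... */
	pthread_mutex_unlock(&move_lock);
}

/* Charge mover: flag the memcg, wait out already-started fast-path
 * readers (the synchronize_rcu() analogue), then move pages. */
static void move_charge(void)
{
	atomic_fetch_add(&moving_account, 1);
	while (atomic_load(&fast_readers))
		;	/* crude grace period: wait for fast-path readers */
	pthread_mutex_lock(&move_lock);
	/* ... reassign page->mem_cgroup for each page in the mm ... */
	pthread_mutex_unlock(&move_lock);
	atomic_fetch_sub(&moving_account, 1);
}

int main(void)
{
	update_page_stat();
	move_charge();
	update_page_stat();
	puts("done");
	return 0;
}

The check-then-enter window in update_page_stat() is exactly why the kernel needs a real grace period here; the busy-wait above only approximates it.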
6152 | static void mem_cgroup_move_task(struct cgroup_subsys_state *css, | 5462 | static void mem_cgroup_move_task(struct cgroup_subsys_state *css, |
@@ -6250,7 +5560,7 @@ static void __init enable_swap_cgroup(void) | |||
6250 | */ | 5560 | */ |
6251 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | 5561 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) |
6252 | { | 5562 | { |
6253 | struct page_cgroup *pc; | 5563 | struct mem_cgroup *memcg; |
6254 | unsigned short oldid; | 5564 | unsigned short oldid; |
6255 | 5565 | ||
6256 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5566 | VM_BUG_ON_PAGE(PageLRU(page), page); |
@@ -6259,20 +5569,26 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | |||
6259 | if (!do_swap_account) | 5569 | if (!do_swap_account) |
6260 | return; | 5570 | return; |
6261 | 5571 | ||
6262 | pc = lookup_page_cgroup(page); | 5572 | memcg = page->mem_cgroup; |
6263 | 5573 | ||
6264 | /* Readahead page, never charged */ | 5574 | /* Readahead page, never charged */ |
6265 | if (!PageCgroupUsed(pc)) | 5575 | if (!memcg) |
6266 | return; | 5576 | return; |
6267 | 5577 | ||
6268 | VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); | 5578 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); |
6269 | |||
6270 | oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); | ||
6271 | VM_BUG_ON_PAGE(oldid, page); | 5579 | VM_BUG_ON_PAGE(oldid, page); |
5580 | mem_cgroup_swap_statistics(memcg, true); | ||
5581 | |||
5582 | page->mem_cgroup = NULL; | ||
6272 | 5583 | ||
6273 | pc->flags &= ~PCG_MEMSW; | 5584 | if (!mem_cgroup_is_root(memcg)) |
6274 | css_get(&pc->mem_cgroup->css); | 5585 | page_counter_uncharge(&memcg->memory, 1); |
6275 | mem_cgroup_swap_statistics(pc->mem_cgroup, true); | 5586 | |
5587 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | ||
5588 | VM_BUG_ON(!irqs_disabled()); | ||
5589 | |||
5590 | mem_cgroup_charge_statistics(memcg, page, -1); | ||
5591 | memcg_check_events(memcg, page); | ||
6276 | } | 5592 | } |
6277 | 5593 | ||
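The rewritten mem_cgroup_swapout() hands the charge from the page to the swap slot: the memcg id is recorded against the entry, page->mem_cgroup is cleared, and one page is returned to the memory counter, while the memsw charge stays put until the entry is freed and mem_cgroup_uncharge_swap() below runs. A condensed sketch of that handoff with toy types and page-unit counters (not the kernel structures):

#include <stdio.h>

struct mem_cgroup { int id; long memory; long memsw; };	/* counters in pages */
struct page { struct mem_cgroup *mem_cgroup; };

static unsigned short swap_record[8];	/* swap slot -> memcg id, 0 = none */

static void swapout(struct page *page, int slot)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (!memcg)			/* readahead page, never charged */
		return;
	swap_record[slot] = memcg->id;	/* like swap_cgroup_record() */
	page->mem_cgroup = NULL;
	memcg->memory--;		/* page leaves the memory counter ... */
	/* ... but memcg->memsw stays charged until the slot is freed */
}

static void uncharge_swap(struct mem_cgroup *memcg, int slot)
{
	if (swap_record[slot] == memcg->id) {
		swap_record[slot] = 0;
		memcg->memsw--;
	}
}

int main(void)
{
	struct mem_cgroup m = { .id = 1, .memory = 1, .memsw = 1 };
	struct page p = { &m };

	swapout(&p, 3);
	uncharge_swap(&m, 3);
	printf("memory=%ld memsw=%ld\n", m.memory, m.memsw);
	return 0;
}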
6278 | /** | 5594 | /** |
@@ -6294,7 +5610,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) | |||
6294 | memcg = mem_cgroup_lookup(id); | 5610 | memcg = mem_cgroup_lookup(id); |
6295 | if (memcg) { | 5611 | if (memcg) { |
6296 | if (!mem_cgroup_is_root(memcg)) | 5612 | if (!mem_cgroup_is_root(memcg)) |
6297 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 5613 | page_counter_uncharge(&memcg->memsw, 1); |
6298 | mem_cgroup_swap_statistics(memcg, false); | 5614 | mem_cgroup_swap_statistics(memcg, false); |
6299 | css_put(&memcg->css); | 5615 | css_put(&memcg->css); |
6300 | } | 5616 | } |
@@ -6330,7 +5646,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
6330 | goto out; | 5646 | goto out; |
6331 | 5647 | ||
6332 | if (PageSwapCache(page)) { | 5648 | if (PageSwapCache(page)) { |
6333 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
6334 | /* | 5649 | /* |
6335 | * Every swap fault against a single page tries to charge the | 5650 | * Every swap fault against a single page tries to charge the |
6336 | * page, bail as early as possible. shmem_unuse() encounters | 5651 | * page, bail as early as possible. shmem_unuse() encounters |
@@ -6338,7 +5653,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
6338 | * the page lock, which serializes swap cache removal, which | 5653 | * the page lock, which serializes swap cache removal, which |
6339 | * in turn serializes uncharging. | 5654 | * in turn serializes uncharging. |
6340 | */ | 5655 | */ |
6341 | if (PageCgroupUsed(pc)) | 5656 | if (page->mem_cgroup) |
6342 | goto out; | 5657 | goto out; |
6343 | } | 5658 | } |
6344 | 5659 | ||
@@ -6452,19 +5767,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) | |||
6452 | } | 5767 | } |
6453 | 5768 | ||
6454 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | 5769 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, |
6455 | unsigned long nr_mem, unsigned long nr_memsw, | ||
6456 | unsigned long nr_anon, unsigned long nr_file, | 5770 | unsigned long nr_anon, unsigned long nr_file, |
6457 | unsigned long nr_huge, struct page *dummy_page) | 5771 | unsigned long nr_huge, struct page *dummy_page) |
6458 | { | 5772 | { |
5773 | unsigned long nr_pages = nr_anon + nr_file; | ||
6459 | unsigned long flags; | 5774 | unsigned long flags; |
6460 | 5775 | ||
6461 | if (!mem_cgroup_is_root(memcg)) { | 5776 | if (!mem_cgroup_is_root(memcg)) { |
6462 | if (nr_mem) | 5777 | page_counter_uncharge(&memcg->memory, nr_pages); |
6463 | res_counter_uncharge(&memcg->res, | 5778 | if (do_swap_account) |
6464 | nr_mem * PAGE_SIZE); | 5779 | page_counter_uncharge(&memcg->memsw, nr_pages); |
6465 | if (nr_memsw) | ||
6466 | res_counter_uncharge(&memcg->memsw, | ||
6467 | nr_memsw * PAGE_SIZE); | ||
6468 | memcg_oom_recover(memcg); | 5780 | memcg_oom_recover(memcg); |
6469 | } | 5781 | } |
6470 | 5782 | ||
@@ -6473,27 +5785,27 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | |||
6473 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); | 5785 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); |
6474 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); | 5786 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); |
6475 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); | 5787 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); |
6476 | __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); | 5788 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
6477 | memcg_check_events(memcg, dummy_page); | 5789 | memcg_check_events(memcg, dummy_page); |
6478 | local_irq_restore(flags); | 5790 | local_irq_restore(flags); |
5791 | |||
5792 | if (!mem_cgroup_is_root(memcg)) | ||
5793 | css_put_many(&memcg->css, nr_pages); | ||
6479 | } | 5794 | } |
6480 | 5795 | ||
6481 | static void uncharge_list(struct list_head *page_list) | 5796 | static void uncharge_list(struct list_head *page_list) |
6482 | { | 5797 | { |
6483 | struct mem_cgroup *memcg = NULL; | 5798 | struct mem_cgroup *memcg = NULL; |
6484 | unsigned long nr_memsw = 0; | ||
6485 | unsigned long nr_anon = 0; | 5799 | unsigned long nr_anon = 0; |
6486 | unsigned long nr_file = 0; | 5800 | unsigned long nr_file = 0; |
6487 | unsigned long nr_huge = 0; | 5801 | unsigned long nr_huge = 0; |
6488 | unsigned long pgpgout = 0; | 5802 | unsigned long pgpgout = 0; |
6489 | unsigned long nr_mem = 0; | ||
6490 | struct list_head *next; | 5803 | struct list_head *next; |
6491 | struct page *page; | 5804 | struct page *page; |
6492 | 5805 | ||
6493 | next = page_list->next; | 5806 | next = page_list->next; |
6494 | do { | 5807 | do { |
6495 | unsigned int nr_pages = 1; | 5808 | unsigned int nr_pages = 1; |
6496 | struct page_cgroup *pc; | ||
6497 | 5809 | ||
6498 | page = list_entry(next, struct page, lru); | 5810 | page = list_entry(next, struct page, lru); |
6499 | next = page->lru.next; | 5811 | next = page->lru.next; |
@@ -6501,24 +5813,22 @@ static void uncharge_list(struct list_head *page_list) | |||
6501 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5813 | VM_BUG_ON_PAGE(PageLRU(page), page); |
6502 | VM_BUG_ON_PAGE(page_count(page), page); | 5814 | VM_BUG_ON_PAGE(page_count(page), page); |
6503 | 5815 | ||
6504 | pc = lookup_page_cgroup(page); | 5816 | if (!page->mem_cgroup) |
6505 | if (!PageCgroupUsed(pc)) | ||
6506 | continue; | 5817 | continue; |
6507 | 5818 | ||
6508 | /* | 5819 | /* |
6509 | * Nobody should be changing or seriously looking at | 5820 | * Nobody should be changing or seriously looking at |
6510 | * pc->mem_cgroup and pc->flags at this point, we have | 5821 | * page->mem_cgroup at this point, we have fully |
6511 | * fully exclusive access to the page. | 5822 | * exclusive access to the page. |
6512 | */ | 5823 | */ |
6513 | 5824 | ||
6514 | if (memcg != pc->mem_cgroup) { | 5825 | if (memcg != page->mem_cgroup) { |
6515 | if (memcg) { | 5826 | if (memcg) { |
6516 | uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, | 5827 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, |
6517 | nr_anon, nr_file, nr_huge, page); | 5828 | nr_huge, page); |
6518 | pgpgout = nr_mem = nr_memsw = 0; | 5829 | pgpgout = nr_anon = nr_file = nr_huge = 0; |
6519 | nr_anon = nr_file = nr_huge = 0; | ||
6520 | } | 5830 | } |
6521 | memcg = pc->mem_cgroup; | 5831 | memcg = page->mem_cgroup; |
6522 | } | 5832 | } |
6523 | 5833 | ||
6524 | if (PageTransHuge(page)) { | 5834 | if (PageTransHuge(page)) { |
@@ -6532,18 +5842,14 @@ static void uncharge_list(struct list_head *page_list) | |||
6532 | else | 5842 | else |
6533 | nr_file += nr_pages; | 5843 | nr_file += nr_pages; |
6534 | 5844 | ||
6535 | if (pc->flags & PCG_MEM) | 5845 | page->mem_cgroup = NULL; |
6536 | nr_mem += nr_pages; | ||
6537 | if (pc->flags & PCG_MEMSW) | ||
6538 | nr_memsw += nr_pages; | ||
6539 | pc->flags = 0; | ||
6540 | 5846 | ||
6541 | pgpgout++; | 5847 | pgpgout++; |
6542 | } while (next != page_list); | 5848 | } while (next != page_list); |
6543 | 5849 | ||
6544 | if (memcg) | 5850 | if (memcg) |
6545 | uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, | 5851 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, |
6546 | nr_anon, nr_file, nr_huge, page); | 5852 | nr_huge, page); |
6547 | } | 5853 | } |
6548 | 5854 | ||
6549 | /** | 5855 | /** |
@@ -6555,14 +5861,11 @@ static void uncharge_list(struct list_head *page_list) | |||
6555 | */ | 5861 | */ |
6556 | void mem_cgroup_uncharge(struct page *page) | 5862 | void mem_cgroup_uncharge(struct page *page) |
6557 | { | 5863 | { |
6558 | struct page_cgroup *pc; | ||
6559 | |||
6560 | if (mem_cgroup_disabled()) | 5864 | if (mem_cgroup_disabled()) |
6561 | return; | 5865 | return; |
6562 | 5866 | ||
6563 | /* Don't touch page->lru of any random page, pre-check: */ | 5867 | /* Don't touch page->lru of any random page, pre-check: */ |
6564 | pc = lookup_page_cgroup(page); | 5868 | if (!page->mem_cgroup) |
6565 | if (!PageCgroupUsed(pc)) | ||
6566 | return; | 5869 | return; |
6567 | 5870 | ||
6568 | INIT_LIST_HEAD(&page->lru); | 5871 | INIT_LIST_HEAD(&page->lru); |
@@ -6598,7 +5901,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) | |||
6598 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | 5901 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, |
6599 | bool lrucare) | 5902 | bool lrucare) |
6600 | { | 5903 | { |
6601 | struct page_cgroup *pc; | 5904 | struct mem_cgroup *memcg; |
6602 | int isolated; | 5905 | int isolated; |
6603 | 5906 | ||
6604 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); | 5907 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); |
@@ -6613,27 +5916,28 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | |||
6613 | return; | 5916 | return; |
6614 | 5917 | ||
6615 | /* Page cache replacement: new page already charged? */ | 5918 | /* Page cache replacement: new page already charged? */ |
6616 | pc = lookup_page_cgroup(newpage); | 5919 | if (newpage->mem_cgroup) |
6617 | if (PageCgroupUsed(pc)) | ||
6618 | return; | 5920 | return; |
6619 | 5921 | ||
6620 | /* Re-entrant migration: old page already uncharged? */ | 5922 | /* |
6621 | pc = lookup_page_cgroup(oldpage); | 5923 | * Swapcache readahead pages can get migrated before being |
6622 | if (!PageCgroupUsed(pc)) | 5924 | * charged, and migration from compaction can happen to an |
5925 | * uncharged page when the PFN walker finds a page that | ||
5926 | * reclaim just put back on the LRU but has not released yet. | ||
5927 | */ | ||
5928 | memcg = oldpage->mem_cgroup; | ||
5929 | if (!memcg) | ||
6623 | return; | 5930 | return; |
6624 | 5931 | ||
6625 | VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); | ||
6626 | VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); | ||
6627 | |||
6628 | if (lrucare) | 5932 | if (lrucare) |
6629 | lock_page_lru(oldpage, &isolated); | 5933 | lock_page_lru(oldpage, &isolated); |
6630 | 5934 | ||
6631 | pc->flags = 0; | 5935 | oldpage->mem_cgroup = NULL; |
6632 | 5936 | ||
6633 | if (lrucare) | 5937 | if (lrucare) |
6634 | unlock_page_lru(oldpage, isolated); | 5938 | unlock_page_lru(oldpage, isolated); |
6635 | 5939 | ||
6636 | commit_charge(newpage, pc->mem_cgroup, lrucare); | 5940 | commit_charge(newpage, memcg, lrucare); |
6637 | } | 5941 | } |
6638 | 5942 | ||
6639 | /* | 5943 | /* |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b852b10ec76d..e5ee0ca7ae85 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -233,7 +233,7 @@ void shake_page(struct page *p, int access) | |||
233 | lru_add_drain_all(); | 233 | lru_add_drain_all(); |
234 | if (PageLRU(p)) | 234 | if (PageLRU(p)) |
235 | return; | 235 | return; |
236 | drain_all_pages(); | 236 | drain_all_pages(page_zone(p)); |
237 | if (PageLRU(p) || is_free_buddy_page(p)) | 237 | if (PageLRU(p) || is_free_buddy_page(p)) |
238 | return; | 238 | return; |
239 | } | 239 | } |
@@ -1661,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1661 | if (!is_free_buddy_page(page)) | 1661 | if (!is_free_buddy_page(page)) |
1662 | lru_add_drain_all(); | 1662 | lru_add_drain_all(); |
1663 | if (!is_free_buddy_page(page)) | 1663 | if (!is_free_buddy_page(page)) |
1664 | drain_all_pages(); | 1664 | drain_all_pages(page_zone(page)); |
1665 | SetPageHWPoison(page); | 1665 | SetPageHWPoison(page); |
1666 | if (!is_free_buddy_page(page)) | 1666 | if (!is_free_buddy_page(page)) |
1667 | pr_info("soft offline: %#lx: page leaked\n", | 1667 | pr_info("soft offline: %#lx: page leaked\n", |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1bf4807cb21e..9fab10795bea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1725,7 +1725,7 @@ repeat: | |||
1725 | if (drain) { | 1725 | if (drain) { |
1726 | lru_add_drain_all(); | 1726 | lru_add_drain_all(); |
1727 | cond_resched(); | 1727 | cond_resched(); |
1728 | drain_all_pages(); | 1728 | drain_all_pages(zone); |
1729 | } | 1729 | } |
1730 | 1730 | ||
1731 | pfn = scan_movable_pages(start_pfn, end_pfn); | 1731 | pfn = scan_movable_pages(start_pfn, end_pfn); |
@@ -1747,7 +1747,7 @@ repeat: | |||
1747 | lru_add_drain_all(); | 1747 | lru_add_drain_all(); |
1748 | yield(); | 1748 | yield(); |
1749 | /* drain pcp pages, this is synchronous. */ | 1749 | /* drain pcp pages, this is synchronous. */ |
1750 | drain_all_pages(); | 1750 | drain_all_pages(zone); |
1751 | /* | 1751 | /* |
1752 | * dissolve free hugepages in the memory block before doing offlining | 1752 | * dissolve free hugepages in the memory block before doing offlining |
1753 | * actually in order to make hugetlbfs's object counting consistent. | 1753 | * actually in order to make hugetlbfs's object counting consistent. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5340f6b91312..3b014d326151 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -119,7 +119,7 @@ found: | |||
119 | 119 | ||
120 | /* return true if the task is not adequate as candidate victim task. */ | 120 | /* return true if the task is not adequate as candidate victim task. */ |
121 | static bool oom_unkillable_task(struct task_struct *p, | 121 | static bool oom_unkillable_task(struct task_struct *p, |
122 | const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 122 | struct mem_cgroup *memcg, const nodemask_t *nodemask) |
123 | { | 123 | { |
124 | if (is_global_init(p)) | 124 | if (is_global_init(p)) |
125 | return true; | 125 | return true; |
@@ -353,7 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
353 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, | 353 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, |
354 | * swapents, oom_score_adj value, and name. | 354 | * swapents, oom_score_adj value, and name. |
355 | */ | 355 | */ |
356 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 356 | static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) |
357 | { | 357 | { |
358 | struct task_struct *p; | 358 | struct task_struct *p; |
359 | struct task_struct *task; | 359 | struct task_struct *task; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 19ceae87522d..d5d81f5384d1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page) | |||
2357 | dec_zone_page_state(page, NR_WRITEBACK); | 2357 | dec_zone_page_state(page, NR_WRITEBACK); |
2358 | inc_zone_page_state(page, NR_WRITTEN); | 2358 | inc_zone_page_state(page, NR_WRITTEN); |
2359 | } | 2359 | } |
2360 | mem_cgroup_end_page_stat(memcg, locked, memcg_flags); | 2360 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); |
2361 | return ret; | 2361 | return ret; |
2362 | } | 2362 | } |
2363 | 2363 | ||
@@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2399 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 2399 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); |
2400 | inc_zone_page_state(page, NR_WRITEBACK); | 2400 | inc_zone_page_state(page, NR_WRITEBACK); |
2401 | } | 2401 | } |
2402 | mem_cgroup_end_page_stat(memcg, locked, memcg_flags); | 2402 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); |
2403 | return ret; | 2403 | return ret; |
2404 | 2404 | ||
2405 | } | 2405 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 616a2c956b4b..a7198c065999 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -48,7 +48,6 @@ | |||
48 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
49 | #include <linux/fault-inject.h> | 49 | #include <linux/fault-inject.h> |
50 | #include <linux/page-isolation.h> | 50 | #include <linux/page-isolation.h> |
51 | #include <linux/page_cgroup.h> | ||
52 | #include <linux/debugobjects.h> | 51 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 52 | #include <linux/kmemleak.h> |
54 | #include <linux/compaction.h> | 53 | #include <linux/compaction.h> |
@@ -641,8 +640,10 @@ static inline int free_pages_check(struct page *page) | |||
641 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; | 640 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
642 | bad_flags = PAGE_FLAGS_CHECK_AT_FREE; | 641 | bad_flags = PAGE_FLAGS_CHECK_AT_FREE; |
643 | } | 642 | } |
644 | if (unlikely(mem_cgroup_bad_page_check(page))) | 643 | #ifdef CONFIG_MEMCG |
645 | bad_reason = "cgroup check failed"; | 644 | if (unlikely(page->mem_cgroup)) |
645 | bad_reason = "page still charged to cgroup"; | ||
646 | #endif | ||
646 | if (unlikely(bad_reason)) { | 647 | if (unlikely(bad_reason)) { |
647 | bad_page(page, bad_reason, bad_flags); | 648 | bad_page(page, bad_reason, bad_flags); |
648 | return 1; | 649 | return 1; |
@@ -741,6 +742,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
741 | int i; | 742 | int i; |
742 | int bad = 0; | 743 | int bad = 0; |
743 | 744 | ||
745 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
746 | VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); | ||
747 | |||
744 | trace_mm_page_free(page, order); | 748 | trace_mm_page_free(page, order); |
745 | kmemcheck_free_shadow(page, order); | 749 | kmemcheck_free_shadow(page, order); |
746 | 750 | ||
@@ -898,8 +902,10 @@ static inline int check_new_page(struct page *page) | |||
898 | bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; | 902 | bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; |
899 | bad_flags = PAGE_FLAGS_CHECK_AT_PREP; | 903 | bad_flags = PAGE_FLAGS_CHECK_AT_PREP; |
900 | } | 904 | } |
901 | if (unlikely(mem_cgroup_bad_page_check(page))) | 905 | #ifdef CONFIG_MEMCG |
902 | bad_reason = "cgroup check failed"; | 906 | if (unlikely(page->mem_cgroup)) |
907 | bad_reason = "page still charged to cgroup"; | ||
908 | #endif | ||
903 | if (unlikely(bad_reason)) { | 909 | if (unlikely(bad_reason)) { |
904 | bad_page(page, bad_reason, bad_flags); | 910 | bad_page(page, bad_reason, bad_flags); |
905 | return 1; | 911 | return 1; |
@@ -1267,55 +1273,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1267 | #endif | 1273 | #endif |
1268 | 1274 | ||
1269 | /* | 1275 | /* |
1270 | * Drain pages of the indicated processor. | 1276 | * Drain pcplists of the indicated processor and zone. |
1271 | * | 1277 | * |
1272 | * The processor must either be the current processor and the | 1278 | * The processor must either be the current processor and the |
1273 | * thread pinned to the current processor or a processor that | 1279 | * thread pinned to the current processor or a processor that |
1274 | * is not online. | 1280 | * is not online. |
1275 | */ | 1281 | */ |
1276 | static void drain_pages(unsigned int cpu) | 1282 | static void drain_pages_zone(unsigned int cpu, struct zone *zone) |
1277 | { | 1283 | { |
1278 | unsigned long flags; | 1284 | unsigned long flags; |
1279 | struct zone *zone; | 1285 | struct per_cpu_pageset *pset; |
1286 | struct per_cpu_pages *pcp; | ||
1280 | 1287 | ||
1281 | for_each_populated_zone(zone) { | 1288 | local_irq_save(flags); |
1282 | struct per_cpu_pageset *pset; | 1289 | pset = per_cpu_ptr(zone->pageset, cpu); |
1283 | struct per_cpu_pages *pcp; | ||
1284 | 1290 | ||
1285 | local_irq_save(flags); | 1291 | pcp = &pset->pcp; |
1286 | pset = per_cpu_ptr(zone->pageset, cpu); | 1292 | if (pcp->count) { |
1293 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
1294 | pcp->count = 0; | ||
1295 | } | ||
1296 | local_irq_restore(flags); | ||
1297 | } | ||
1287 | 1298 | ||
1288 | pcp = &pset->pcp; | 1299 | /* |
1289 | if (pcp->count) { | 1300 | * Drain pcplists of all zones on the indicated processor. |
1290 | free_pcppages_bulk(zone, pcp->count, pcp); | 1301 | * |
1291 | pcp->count = 0; | 1302 | * The processor must either be the current processor and the |
1292 | } | 1303 | * thread pinned to the current processor or a processor that |
1293 | local_irq_restore(flags); | 1304 | * is not online. |
1305 | */ | ||
1306 | static void drain_pages(unsigned int cpu) | ||
1307 | { | ||
1308 | struct zone *zone; | ||
1309 | |||
1310 | for_each_populated_zone(zone) { | ||
1311 | drain_pages_zone(cpu, zone); | ||
1294 | } | 1312 | } |
1295 | } | 1313 | } |
1296 | 1314 | ||
1297 | /* | 1315 | /* |
1298 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 1316 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
1317 | * | ||
1318 | * The CPU has to be pinned. When zone parameter is non-NULL, spill just | ||
1319 | * the single zone's pages. | ||
1299 | */ | 1320 | */ |
1300 | void drain_local_pages(void *arg) | 1321 | void drain_local_pages(struct zone *zone) |
1301 | { | 1322 | { |
1302 | drain_pages(smp_processor_id()); | 1323 | int cpu = smp_processor_id(); |
1324 | |||
1325 | if (zone) | ||
1326 | drain_pages_zone(cpu, zone); | ||
1327 | else | ||
1328 | drain_pages(cpu); | ||
1303 | } | 1329 | } |
1304 | 1330 | ||
1305 | /* | 1331 | /* |
1306 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. | 1332 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. |
1307 | * | 1333 | * |
1334 | * When zone parameter is non-NULL, spill just the single zone's pages. | ||
1335 | * | ||
1308 | * Note that this code is protected against sending an IPI to an offline | 1336 | * Note that this code is protected against sending an IPI to an offline |
1309 | * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: | 1337 | * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: |
1310 | * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but | 1338 | * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but |
1311 | * nothing keeps CPUs from showing up after we populated the cpumask and | 1339 | * nothing keeps CPUs from showing up after we populated the cpumask and |
1312 | * before the call to on_each_cpu_mask(). | 1340 | * before the call to on_each_cpu_mask(). |
1313 | */ | 1341 | */ |
1314 | void drain_all_pages(void) | 1342 | void drain_all_pages(struct zone *zone) |
1315 | { | 1343 | { |
1316 | int cpu; | 1344 | int cpu; |
1317 | struct per_cpu_pageset *pcp; | ||
1318 | struct zone *zone; | ||
1319 | 1345 | ||
1320 | /* | 1346 | /* |
1321 | * Allocate in the BSS so we wont require allocation in | 1347 | * Allocate in the BSS so we wont require allocation in |
@@ -1330,20 +1356,31 @@ void drain_all_pages(void) | |||
1330 | * disables preemption as part of its processing | 1356 | * disables preemption as part of its processing |
1331 | */ | 1357 | */ |
1332 | for_each_online_cpu(cpu) { | 1358 | for_each_online_cpu(cpu) { |
1359 | struct per_cpu_pageset *pcp; | ||
1360 | struct zone *z; | ||
1333 | bool has_pcps = false; | 1361 | bool has_pcps = false; |
1334 | for_each_populated_zone(zone) { | 1362 | |
1363 | if (zone) { | ||
1335 | pcp = per_cpu_ptr(zone->pageset, cpu); | 1364 | pcp = per_cpu_ptr(zone->pageset, cpu); |
1336 | if (pcp->pcp.count) { | 1365 | if (pcp->pcp.count) |
1337 | has_pcps = true; | 1366 | has_pcps = true; |
1338 | break; | 1367 | } else { |
1368 | for_each_populated_zone(z) { | ||
1369 | pcp = per_cpu_ptr(z->pageset, cpu); | ||
1370 | if (pcp->pcp.count) { | ||
1371 | has_pcps = true; | ||
1372 | break; | ||
1373 | } | ||
1339 | } | 1374 | } |
1340 | } | 1375 | } |
1376 | |||
1341 | if (has_pcps) | 1377 | if (has_pcps) |
1342 | cpumask_set_cpu(cpu, &cpus_with_pcps); | 1378 | cpumask_set_cpu(cpu, &cpus_with_pcps); |
1343 | else | 1379 | else |
1344 | cpumask_clear_cpu(cpu, &cpus_with_pcps); | 1380 | cpumask_clear_cpu(cpu, &cpus_with_pcps); |
1345 | } | 1381 | } |
1346 | on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); | 1382 | on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, |
1383 | zone, 1); | ||
1347 | } | 1384 | } |
1348 | 1385 | ||
1349 | #ifdef CONFIG_HIBERNATION | 1386 | #ifdef CONFIG_HIBERNATION |
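drain_all_pages() now takes an optional zone: for each online CPU it inspects only that zone's pcplist count (or every populated zone when zone is NULL) and sends the drain IPI only to CPUs that actually hold pages, via the reworked drain_local_pages(). A small model of the mask-building step with toy per-CPU counts (array sizes and names are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS   4
#define NR_ZONES  3

/* pcp_count[cpu][zone]: pages on that CPU's pcplist for the given zone */
static int pcp_count[NR_CPUS][NR_ZONES] = {
	{ 0, 5, 0 },
	{ 0, 0, 0 },
	{ 2, 0, 0 },
	{ 0, 0, 7 },
};

/* Build the cpumask of CPUs worth draining: one zone, or all when zone < 0. */
static void build_drain_mask(int zone, bool mask[NR_CPUS])
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		bool has_pcps = false;

		if (zone >= 0) {
			has_pcps = pcp_count[cpu][zone] != 0;
		} else {
			for (int z = 0; z < NR_ZONES; z++)
				if (pcp_count[cpu][z]) {
					has_pcps = true;
					break;
				}
		}
		mask[cpu] = has_pcps;	/* only these CPUs get the drain IPI */
	}
}

int main(void)
{
	bool mask[NR_CPUS];

	build_drain_mask(1, mask);	/* drain only zone 1 */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d:%d ", cpu, mask[cpu]);
	printf("\n");
	return 0;
}

Restricting the walk to a single zone is what lets callers such as memory offlining, memory-failure handling and alloc_contig_range() in this diff pass the zone they actually care about.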
@@ -1705,7 +1742,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, | |||
1705 | unsigned long mark, int classzone_idx, int alloc_flags, | 1742 | unsigned long mark, int classzone_idx, int alloc_flags, |
1706 | long free_pages) | 1743 | long free_pages) |
1707 | { | 1744 | { |
1708 | /* free_pages my go negative - that's OK */ | 1745 | /* free_pages may go negative - that's OK */ |
1709 | long min = mark; | 1746 | long min = mark; |
1710 | int o; | 1747 | int o; |
1711 | long free_cma = 0; | 1748 | long free_cma = 0; |
@@ -2296,7 +2333,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2296 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2333 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2297 | int *contended_compaction, bool *deferred_compaction) | 2334 | int *contended_compaction, bool *deferred_compaction) |
2298 | { | 2335 | { |
2299 | struct zone *last_compact_zone = NULL; | ||
2300 | unsigned long compact_result; | 2336 | unsigned long compact_result; |
2301 | struct page *page; | 2337 | struct page *page; |
2302 | 2338 | ||
@@ -2307,7 +2343,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2307 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, | 2343 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, |
2308 | nodemask, mode, | 2344 | nodemask, mode, |
2309 | contended_compaction, | 2345 | contended_compaction, |
2310 | &last_compact_zone); | 2346 | alloc_flags, classzone_idx); |
2311 | current->flags &= ~PF_MEMALLOC; | 2347 | current->flags &= ~PF_MEMALLOC; |
2312 | 2348 | ||
2313 | switch (compact_result) { | 2349 | switch (compact_result) { |
@@ -2326,10 +2362,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2326 | */ | 2362 | */ |
2327 | count_vm_event(COMPACTSTALL); | 2363 | count_vm_event(COMPACTSTALL); |
2328 | 2364 | ||
2329 | /* Page migration frees to the PCP lists but we want merging */ | ||
2330 | drain_pages(get_cpu()); | ||
2331 | put_cpu(); | ||
2332 | |||
2333 | page = get_page_from_freelist(gfp_mask, nodemask, | 2365 | page = get_page_from_freelist(gfp_mask, nodemask, |
2334 | order, zonelist, high_zoneidx, | 2366 | order, zonelist, high_zoneidx, |
2335 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2367 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
@@ -2345,14 +2377,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2345 | } | 2377 | } |
2346 | 2378 | ||
2347 | /* | 2379 | /* |
2348 | * last_compact_zone is where try_to_compact_pages thought allocation | ||
2349 | * should succeed, so it did not defer compaction. But here we know | ||
2350 | * that it didn't succeed, so we do the defer. | ||
2351 | */ | ||
2352 | if (last_compact_zone && mode != MIGRATE_ASYNC) | ||
2353 | defer_compaction(last_compact_zone, order); | ||
2354 | |||
2355 | /* | ||
2356 | * It's bad if compaction run occurs and fails. The most likely reason | 2380 | * It's bad if compaction run occurs and fails. The most likely reason |
2357 | * is that pages exist, but not enough to satisfy watermarks. | 2381 | * is that pages exist, but not enough to satisfy watermarks. |
2358 | */ | 2382 | */ |
@@ -2433,7 +2457,7 @@ retry: | |||
2433 | * pages are pinned on the per-cpu lists. Drain them and try again | 2457 | * pages are pinned on the per-cpu lists. Drain them and try again |
2434 | */ | 2458 | */ |
2435 | if (!page && !drained) { | 2459 | if (!page && !drained) { |
2436 | drain_all_pages(); | 2460 | drain_all_pages(NULL); |
2437 | drained = true; | 2461 | drained = true; |
2438 | goto retry; | 2462 | goto retry; |
2439 | } | 2463 | } |
@@ -3893,14 +3917,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | |||
3893 | else | 3917 | else |
3894 | page_group_by_mobility_disabled = 0; | 3918 | page_group_by_mobility_disabled = 0; |
3895 | 3919 | ||
3896 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 3920 | pr_info("Built %i zonelists in %s order, mobility grouping %s. " |
3897 | "Total pages: %ld\n", | 3921 | "Total pages: %ld\n", |
3898 | nr_online_nodes, | 3922 | nr_online_nodes, |
3899 | zonelist_order_name[current_zonelist_order], | 3923 | zonelist_order_name[current_zonelist_order], |
3900 | page_group_by_mobility_disabled ? "off" : "on", | 3924 | page_group_by_mobility_disabled ? "off" : "on", |
3901 | vm_total_pages); | 3925 | vm_total_pages); |
3902 | #ifdef CONFIG_NUMA | 3926 | #ifdef CONFIG_NUMA |
3903 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 3927 | pr_info("Policy zone: %s\n", zone_names[policy_zone]); |
3904 | #endif | 3928 | #endif |
3905 | } | 3929 | } |
3906 | 3930 | ||
@@ -4832,7 +4856,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4832 | #endif | 4856 | #endif |
4833 | init_waitqueue_head(&pgdat->kswapd_wait); | 4857 | init_waitqueue_head(&pgdat->kswapd_wait); |
4834 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4858 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4835 | pgdat_page_cgroup_init(pgdat); | ||
4836 | 4859 | ||
4837 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4860 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4838 | struct zone *zone = pgdat->node_zones + j; | 4861 | struct zone *zone = pgdat->node_zones + j; |
@@ -5334,33 +5357,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5334 | find_zone_movable_pfns_for_nodes(); | 5357 | find_zone_movable_pfns_for_nodes(); |
5335 | 5358 | ||
5336 | /* Print out the zone ranges */ | 5359 | /* Print out the zone ranges */ |
5337 | printk("Zone ranges:\n"); | 5360 | pr_info("Zone ranges:\n"); |
5338 | for (i = 0; i < MAX_NR_ZONES; i++) { | 5361 | for (i = 0; i < MAX_NR_ZONES; i++) { |
5339 | if (i == ZONE_MOVABLE) | 5362 | if (i == ZONE_MOVABLE) |
5340 | continue; | 5363 | continue; |
5341 | printk(KERN_CONT " %-8s ", zone_names[i]); | 5364 | pr_info(" %-8s ", zone_names[i]); |
5342 | if (arch_zone_lowest_possible_pfn[i] == | 5365 | if (arch_zone_lowest_possible_pfn[i] == |
5343 | arch_zone_highest_possible_pfn[i]) | 5366 | arch_zone_highest_possible_pfn[i]) |
5344 | printk(KERN_CONT "empty\n"); | 5367 | pr_cont("empty\n"); |
5345 | else | 5368 | else |
5346 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", | 5369 | pr_cont("[mem %0#10lx-%0#10lx]\n", |
5347 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 5370 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, |
5348 | (arch_zone_highest_possible_pfn[i] | 5371 | (arch_zone_highest_possible_pfn[i] |
5349 | << PAGE_SHIFT) - 1); | 5372 | << PAGE_SHIFT) - 1); |
5350 | } | 5373 | } |
5351 | 5374 | ||
5352 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 5375 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
5353 | printk("Movable zone start for each node\n"); | 5376 | pr_info("Movable zone start for each node\n"); |
5354 | for (i = 0; i < MAX_NUMNODES; i++) { | 5377 | for (i = 0; i < MAX_NUMNODES; i++) { |
5355 | if (zone_movable_pfn[i]) | 5378 | if (zone_movable_pfn[i]) |
5356 | printk(" Node %d: %#010lx\n", i, | 5379 | pr_info(" Node %d: %#010lx\n", i, |
5357 | zone_movable_pfn[i] << PAGE_SHIFT); | 5380 | zone_movable_pfn[i] << PAGE_SHIFT); |
5358 | } | 5381 | } |
5359 | 5382 | ||
5360 | /* Print out the early node map */ | 5383 | /* Print out the early node map */ |
5361 | printk("Early memory node ranges\n"); | 5384 | pr_info("Early memory node ranges\n"); |
5362 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 5385 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
5363 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 5386 | pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
5364 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 5387 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); |
5365 | 5388 | ||
5366 | /* Initialise every node */ | 5389 | /* Initialise every node */ |
@@ -5496,7 +5519,7 @@ void __init mem_init_print_info(const char *str) | |||
5496 | 5519 | ||
5497 | #undef adj_init_size | 5520 | #undef adj_init_size |
5498 | 5521 | ||
5499 | printk("Memory: %luK/%luK available " | 5522 | pr_info("Memory: %luK/%luK available " |
5500 | "(%luK kernel code, %luK rwdata, %luK rodata, " | 5523 | "(%luK kernel code, %luK rwdata, %luK rodata, " |
5501 | "%luK init, %luK bss, %luK reserved" | 5524 | "%luK init, %luK bss, %luK reserved" |
5502 | #ifdef CONFIG_HIGHMEM | 5525 | #ifdef CONFIG_HIGHMEM |
@@ -6385,7 +6408,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
6385 | */ | 6408 | */ |
6386 | 6409 | ||
6387 | lru_add_drain_all(); | 6410 | lru_add_drain_all(); |
6388 | drain_all_pages(); | 6411 | drain_all_pages(cc.zone); |
6389 | 6412 | ||
6390 | order = 0; | 6413 | order = 0; |
6391 | outer_start = start; | 6414 | outer_start = start; |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c deleted file mode 100644 index 5331c2bd85a2..000000000000 --- a/mm/page_cgroup.c +++ /dev/null | |||
@@ -1,530 +0,0 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/mmzone.h> | ||
3 | #include <linux/bootmem.h> | ||
4 | #include <linux/bit_spinlock.h> | ||
5 | #include <linux/page_cgroup.h> | ||
6 | #include <linux/hash.h> | ||
7 | #include <linux/slab.h> | ||
8 | #include <linux/memory.h> | ||
9 | #include <linux/vmalloc.h> | ||
10 | #include <linux/cgroup.h> | ||
11 | #include <linux/swapops.h> | ||
12 | #include <linux/kmemleak.h> | ||
13 | |||
14 | static unsigned long total_usage; | ||
15 | |||
16 | #if !defined(CONFIG_SPARSEMEM) | ||
17 | |||
18 | |||
19 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
20 | { | ||
21 | pgdat->node_page_cgroup = NULL; | ||
22 | } | ||
23 | |||
24 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
25 | { | ||
26 | unsigned long pfn = page_to_pfn(page); | ||
27 | unsigned long offset; | ||
28 | struct page_cgroup *base; | ||
29 | |||
30 | base = NODE_DATA(page_to_nid(page))->node_page_cgroup; | ||
31 | #ifdef CONFIG_DEBUG_VM | ||
32 | /* | ||
33 | * The sanity checks the page allocator does upon freeing a | ||
34 | * page can reach here before the page_cgroup arrays are | ||
35 | * allocated when feeding a range of pages to the allocator | ||
36 | * for the first time during bootup or memory hotplug. | ||
37 | */ | ||
38 | if (unlikely(!base)) | ||
39 | return NULL; | ||
40 | #endif | ||
41 | offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; | ||
42 | return base + offset; | ||
43 | } | ||
44 | |||
45 | static int __init alloc_node_page_cgroup(int nid) | ||
46 | { | ||
47 | struct page_cgroup *base; | ||
48 | unsigned long table_size; | ||
49 | unsigned long nr_pages; | ||
50 | |||
51 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | ||
52 | if (!nr_pages) | ||
53 | return 0; | ||
54 | |||
55 | table_size = sizeof(struct page_cgroup) * nr_pages; | ||
56 | |||
57 | base = memblock_virt_alloc_try_nid_nopanic( | ||
58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), | ||
59 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
60 | if (!base) | ||
61 | return -ENOMEM; | ||
62 | NODE_DATA(nid)->node_page_cgroup = base; | ||
63 | total_usage += table_size; | ||
64 | return 0; | ||
65 | } | ||
66 | |||
67 | void __init page_cgroup_init_flatmem(void) | ||
68 | { | ||
69 | |||
70 | int nid, fail; | ||
71 | |||
72 | if (mem_cgroup_disabled()) | ||
73 | return; | ||
74 | |||
75 | for_each_online_node(nid) { | ||
76 | fail = alloc_node_page_cgroup(nid); | ||
77 | if (fail) | ||
78 | goto fail; | ||
79 | } | ||
80 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
81 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" | ||
82 | " don't want memory cgroups\n"); | ||
83 | return; | ||
84 | fail: | ||
85 | printk(KERN_CRIT "allocation of page_cgroup failed.\n"); | ||
86 | printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); | ||
87 | panic("Out of memory"); | ||
88 | } | ||
89 | |||
90 | #else /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
91 | |||
92 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
93 | { | ||
94 | unsigned long pfn = page_to_pfn(page); | ||
95 | struct mem_section *section = __pfn_to_section(pfn); | ||
96 | #ifdef CONFIG_DEBUG_VM | ||
97 | /* | ||
98 | * The sanity checks the page allocator does upon freeing a | ||
99 | * page can reach here before the page_cgroup arrays are | ||
100 | * allocated when feeding a range of pages to the allocator | ||
101 | * for the first time during bootup or memory hotplug. | ||
102 | */ | ||
103 | if (!section->page_cgroup) | ||
104 | return NULL; | ||
105 | #endif | ||
106 | return section->page_cgroup + pfn; | ||
107 | } | ||
108 | |||
109 | static void *__meminit alloc_page_cgroup(size_t size, int nid) | ||
110 | { | ||
111 | gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; | ||
112 | void *addr = NULL; | ||
113 | |||
114 | addr = alloc_pages_exact_nid(nid, size, flags); | ||
115 | if (addr) { | ||
116 | kmemleak_alloc(addr, size, 1, flags); | ||
117 | return addr; | ||
118 | } | ||
119 | |||
120 | if (node_state(nid, N_HIGH_MEMORY)) | ||
121 | addr = vzalloc_node(size, nid); | ||
122 | else | ||
123 | addr = vzalloc(size); | ||
124 | |||
125 | return addr; | ||
126 | } | ||
127 | |||
128 | static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) | ||
129 | { | ||
130 | struct mem_section *section; | ||
131 | struct page_cgroup *base; | ||
132 | unsigned long table_size; | ||
133 | |||
134 | section = __pfn_to_section(pfn); | ||
135 | |||
136 | if (section->page_cgroup) | ||
137 | return 0; | ||
138 | |||
139 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
140 | base = alloc_page_cgroup(table_size, nid); | ||
141 | |||
142 | /* | ||
143 | * The value stored in section->page_cgroup is (base - pfn) | ||
144 | * and it does not point to the memory block allocated above, | ||
145 | * causing kmemleak false positives. | ||
146 | */ | ||
147 | kmemleak_not_leak(base); | ||
148 | |||
149 | if (!base) { | ||
150 | printk(KERN_ERR "page cgroup allocation failure\n"); | ||
151 | return -ENOMEM; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * The passed "pfn" may not be aligned to SECTION. For the calculation | ||
156 | * we need to apply a mask. | ||
157 | */ | ||
158 | pfn &= PAGE_SECTION_MASK; | ||
159 | section->page_cgroup = base - pfn; | ||
160 | total_usage += table_size; | ||
161 | return 0; | ||
162 | } | ||
163 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
164 | static void free_page_cgroup(void *addr) | ||
165 | { | ||
166 | if (is_vmalloc_addr(addr)) { | ||
167 | vfree(addr); | ||
168 | } else { | ||
169 | struct page *page = virt_to_page(addr); | ||
170 | size_t table_size = | ||
171 | sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
172 | |||
173 | BUG_ON(PageReserved(page)); | ||
174 | kmemleak_free(addr); | ||
175 | free_pages_exact(addr, table_size); | ||
176 | } | ||
177 | } | ||
178 | |||
179 | static void __free_page_cgroup(unsigned long pfn) | ||
180 | { | ||
181 | struct mem_section *ms; | ||
182 | struct page_cgroup *base; | ||
183 | |||
184 | ms = __pfn_to_section(pfn); | ||
185 | if (!ms || !ms->page_cgroup) | ||
186 | return; | ||
187 | base = ms->page_cgroup + pfn; | ||
188 | free_page_cgroup(base); | ||
189 | ms->page_cgroup = NULL; | ||
190 | } | ||
191 | |||
192 | static int __meminit online_page_cgroup(unsigned long start_pfn, | ||
193 | unsigned long nr_pages, | ||
194 | int nid) | ||
195 | { | ||
196 | unsigned long start, end, pfn; | ||
197 | int fail = 0; | ||
198 | |||
199 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
200 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
201 | |||
202 | if (nid == -1) { | ||
203 | /* | ||
204 | * In this case, "nid" already exists and contains valid memory. | ||
205 | * "start_pfn" passed to us is a pfn which is an arg for | ||
206 | * online__pages(), and start_pfn should exist. | ||
207 | */ | ||
208 | nid = pfn_to_nid(start_pfn); | ||
209 | VM_BUG_ON(!node_state(nid, N_ONLINE)); | ||
210 | } | ||
211 | |||
212 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | ||
213 | if (!pfn_present(pfn)) | ||
214 | continue; | ||
215 | fail = init_section_page_cgroup(pfn, nid); | ||
216 | } | ||
217 | if (!fail) | ||
218 | return 0; | ||
219 | |||
220 | /* rollback */ | ||
221 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
222 | __free_page_cgroup(pfn); | ||
223 | |||
224 | return -ENOMEM; | ||
225 | } | ||
226 | |||
227 | static int __meminit offline_page_cgroup(unsigned long start_pfn, | ||
228 | unsigned long nr_pages, int nid) | ||
229 | { | ||
230 | unsigned long start, end, pfn; | ||
231 | |||
232 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
233 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
234 | |||
235 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
236 | __free_page_cgroup(pfn); | ||
237 | return 0; | ||
238 | |||
239 | } | ||
240 | |||
241 | static int __meminit page_cgroup_callback(struct notifier_block *self, | ||
242 | unsigned long action, void *arg) | ||
243 | { | ||
244 | struct memory_notify *mn = arg; | ||
245 | int ret = 0; | ||
246 | switch (action) { | ||
247 | case MEM_GOING_ONLINE: | ||
248 | ret = online_page_cgroup(mn->start_pfn, | ||
249 | mn->nr_pages, mn->status_change_nid); | ||
250 | break; | ||
251 | case MEM_OFFLINE: | ||
252 | offline_page_cgroup(mn->start_pfn, | ||
253 | mn->nr_pages, mn->status_change_nid); | ||
254 | break; | ||
255 | case MEM_CANCEL_ONLINE: | ||
256 | offline_page_cgroup(mn->start_pfn, | ||
257 | mn->nr_pages, mn->status_change_nid); | ||
258 | break; | ||
259 | case MEM_GOING_OFFLINE: | ||
260 | break; | ||
261 | case MEM_ONLINE: | ||
262 | case MEM_CANCEL_OFFLINE: | ||
263 | break; | ||
264 | } | ||
265 | |||
266 | return notifier_from_errno(ret); | ||
267 | } | ||
268 | |||
269 | #endif | ||
270 | |||
271 | void __init page_cgroup_init(void) | ||
272 | { | ||
273 | unsigned long pfn; | ||
274 | int nid; | ||
275 | |||
276 | if (mem_cgroup_disabled()) | ||
277 | return; | ||
278 | |||
279 | for_each_node_state(nid, N_MEMORY) { | ||
280 | unsigned long start_pfn, end_pfn; | ||
281 | |||
282 | start_pfn = node_start_pfn(nid); | ||
283 | end_pfn = node_end_pfn(nid); | ||
284 | /* | ||
285 | * start_pfn and end_pfn may not be aligned to SECTION and the | ||
286 | * page->flags of out of node pages are not initialized. So we | ||
287 | * scan [start_pfn, the biggest section's pfn < end_pfn) here. | ||
288 | */ | ||
289 | for (pfn = start_pfn; | ||
290 | pfn < end_pfn; | ||
291 | pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { | ||
292 | |||
293 | if (!pfn_valid(pfn)) | ||
294 | continue; | ||
295 | /* | ||
296 | * Nodes's pfns can be overlapping. | ||
297 | * We know some arch can have a nodes layout such as | ||
298 | * -------------pfn--------------> | ||
299 | * N0 | N1 | N2 | N0 | N1 | N2|.... | ||
300 | */ | ||
301 | if (pfn_to_nid(pfn) != nid) | ||
302 | continue; | ||
303 | if (init_section_page_cgroup(pfn, nid)) | ||
304 | goto oom; | ||
305 | } | ||
306 | } | ||
307 | hotplug_memory_notifier(page_cgroup_callback, 0); | ||
308 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
309 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " | ||
310 | "don't want memory cgroups\n"); | ||
311 | return; | ||
312 | oom: | ||
313 | printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); | ||
314 | panic("Out of memory"); | ||
315 | } | ||
316 | |||
317 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
318 | { | ||
319 | return; | ||
320 | } | ||
321 | |||
322 | #endif | ||
323 | |||
324 | |||
325 | #ifdef CONFIG_MEMCG_SWAP | ||
326 | |||
327 | static DEFINE_MUTEX(swap_cgroup_mutex); | ||
328 | struct swap_cgroup_ctrl { | ||
329 | struct page **map; | ||
330 | unsigned long length; | ||
331 | spinlock_t lock; | ||
332 | }; | ||
333 | |||
334 | static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | ||
335 | |||
336 | struct swap_cgroup { | ||
337 | unsigned short id; | ||
338 | }; | ||
339 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) | ||
340 | |||
341 | /* | ||
342 | * SwapCgroup implements "lookup" and "exchange" operations. | ||
343 | * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge | ||
344 | * against SwapCache. At swap_free(), this is accessed directly from swap. | ||
345 | * | ||
346 | * This means, | ||
347 | * - we have no race in "exchange" when we're accessed via SwapCache because | ||
348 | * SwapCache(and its swp_entry) is under lock. | ||
349 | * - When called via swap_free(), there is no user of this entry and no race. | ||
350 | * Then, we don't need lock around "exchange". | ||
351 | * | ||
352 | * TODO: we can push these buffers out to HIGHMEM. | ||
353 | */ | ||
354 | |||
355 | /* | ||
356 | * allocate buffer for swap_cgroup. | ||
357 | */ | ||
358 | static int swap_cgroup_prepare(int type) | ||
359 | { | ||
360 | struct page *page; | ||
361 | struct swap_cgroup_ctrl *ctrl; | ||
362 | unsigned long idx, max; | ||
363 | |||
364 | ctrl = &swap_cgroup_ctrl[type]; | ||
365 | |||
366 | for (idx = 0; idx < ctrl->length; idx++) { | ||
367 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
368 | if (!page) | ||
369 | goto not_enough_page; | ||
370 | ctrl->map[idx] = page; | ||
371 | } | ||
372 | return 0; | ||
373 | not_enough_page: | ||
374 | max = idx; | ||
375 | for (idx = 0; idx < max; idx++) | ||
376 | __free_page(ctrl->map[idx]); | ||
377 | |||
378 | return -ENOMEM; | ||
379 | } | ||
380 | |||
381 | static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | ||
382 | struct swap_cgroup_ctrl **ctrlp) | ||
383 | { | ||
384 | pgoff_t offset = swp_offset(ent); | ||
385 | struct swap_cgroup_ctrl *ctrl; | ||
386 | struct page *mappage; | ||
387 | struct swap_cgroup *sc; | ||
388 | |||
389 | ctrl = &swap_cgroup_ctrl[swp_type(ent)]; | ||
390 | if (ctrlp) | ||
391 | *ctrlp = ctrl; | ||
392 | |||
393 | mappage = ctrl->map[offset / SC_PER_PAGE]; | ||
394 | sc = page_address(mappage); | ||
395 | return sc + offset % SC_PER_PAGE; | ||
396 | } | ||
397 | |||
398 | /** | ||
399 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
400 | * @ent: swap entry to be cmpxchged | ||
401 | * @old: old id | ||
402 | * @new: new id | ||
403 | * | ||
404 | * Returns old id at success, 0 at failure. | ||
405 | * (There is no mem_cgroup using 0 as its id) | ||
406 | */ | ||
407 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
408 | unsigned short old, unsigned short new) | ||
409 | { | ||
410 | struct swap_cgroup_ctrl *ctrl; | ||
411 | struct swap_cgroup *sc; | ||
412 | unsigned long flags; | ||
413 | unsigned short retval; | ||
414 | |||
415 | sc = lookup_swap_cgroup(ent, &ctrl); | ||
416 | |||
417 | spin_lock_irqsave(&ctrl->lock, flags); | ||
418 | retval = sc->id; | ||
419 | if (retval == old) | ||
420 | sc->id = new; | ||
421 | else | ||
422 | retval = 0; | ||
423 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
424 | return retval; | ||
425 | } | ||
426 | |||
427 | /** | ||
428 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | ||
429 | * @ent: swap entry to be recorded into | ||
430 | * @id: mem_cgroup to be recorded | ||
431 | * | ||
432 | * Returns old value at success, 0 at failure. | ||
433 | * (Of course, old value can be 0.) | ||
434 | */ | ||
435 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | ||
436 | { | ||
437 | struct swap_cgroup_ctrl *ctrl; | ||
438 | struct swap_cgroup *sc; | ||
439 | unsigned short old; | ||
440 | unsigned long flags; | ||
441 | |||
442 | sc = lookup_swap_cgroup(ent, &ctrl); | ||
443 | |||
444 | spin_lock_irqsave(&ctrl->lock, flags); | ||
445 | old = sc->id; | ||
446 | sc->id = id; | ||
447 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
448 | |||
449 | return old; | ||
450 | } | ||
451 | |||
452 | /** | ||
453 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry | ||
454 | * @ent: swap entry to be looked up. | ||
455 | * | ||
456 | * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) | ||
457 | */ | ||
458 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) | ||
459 | { | ||
460 | return lookup_swap_cgroup(ent, NULL)->id; | ||
461 | } | ||
462 | |||
463 | int swap_cgroup_swapon(int type, unsigned long max_pages) | ||
464 | { | ||
465 | void *array; | ||
466 | unsigned long array_size; | ||
467 | unsigned long length; | ||
468 | struct swap_cgroup_ctrl *ctrl; | ||
469 | |||
470 | if (!do_swap_account) | ||
471 | return 0; | ||
472 | |||
473 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); | ||
474 | array_size = length * sizeof(void *); | ||
475 | |||
476 | array = vzalloc(array_size); | ||
477 | if (!array) | ||
478 | goto nomem; | ||
479 | |||
480 | ctrl = &swap_cgroup_ctrl[type]; | ||
481 | mutex_lock(&swap_cgroup_mutex); | ||
482 | ctrl->length = length; | ||
483 | ctrl->map = array; | ||
484 | spin_lock_init(&ctrl->lock); | ||
485 | if (swap_cgroup_prepare(type)) { | ||
486 | /* memory shortage */ | ||
487 | ctrl->map = NULL; | ||
488 | ctrl->length = 0; | ||
489 | mutex_unlock(&swap_cgroup_mutex); | ||
490 | vfree(array); | ||
491 | goto nomem; | ||
492 | } | ||
493 | mutex_unlock(&swap_cgroup_mutex); | ||
494 | |||
495 | return 0; | ||
496 | nomem: | ||
497 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | ||
498 | printk(KERN_INFO | ||
499 | "swap_cgroup can be disabled by swapaccount=0 boot option\n"); | ||
500 | return -ENOMEM; | ||
501 | } | ||
502 | |||
503 | void swap_cgroup_swapoff(int type) | ||
504 | { | ||
505 | struct page **map; | ||
506 | unsigned long i, length; | ||
507 | struct swap_cgroup_ctrl *ctrl; | ||
508 | |||
509 | if (!do_swap_account) | ||
510 | return; | ||
511 | |||
512 | mutex_lock(&swap_cgroup_mutex); | ||
513 | ctrl = &swap_cgroup_ctrl[type]; | ||
514 | map = ctrl->map; | ||
515 | length = ctrl->length; | ||
516 | ctrl->map = NULL; | ||
517 | ctrl->length = 0; | ||
518 | mutex_unlock(&swap_cgroup_mutex); | ||
519 | |||
520 | if (map) { | ||
521 | for (i = 0; i < length; i++) { | ||
522 | struct page *page = map[i]; | ||
523 | if (page) | ||
524 | __free_page(page); | ||
525 | } | ||
526 | vfree(map); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | #endif | ||
diff --git a/mm/page_counter.c b/mm/page_counter.c new file mode 100644 index 000000000000..a009574fbba9 --- /dev/null +++ b/mm/page_counter.c | |||
@@ -0,0 +1,192 @@ | |||
1 | /* | ||
2 | * Lockless hierarchical page accounting & limiting | ||
3 | * | ||
4 | * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner | ||
5 | */ | ||
6 | |||
7 | #include <linux/page_counter.h> | ||
8 | #include <linux/atomic.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/bug.h> | ||
13 | #include <asm/page.h> | ||
14 | |||
15 | /** | ||
16 | * page_counter_cancel - take pages out of the local counter | ||
17 | * @counter: counter | ||
18 | * @nr_pages: number of pages to cancel | ||
19 | */ | ||
20 | void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) | ||
21 | { | ||
22 | long new; | ||
23 | |||
24 | new = atomic_long_sub_return(nr_pages, &counter->count); | ||
25 | /* More uncharges than charges? */ | ||
26 | WARN_ON_ONCE(new < 0); | ||
27 | } | ||
28 | |||
29 | /** | ||
30 | * page_counter_charge - hierarchically charge pages | ||
31 | * @counter: counter | ||
32 | * @nr_pages: number of pages to charge | ||
33 | * | ||
34 | * NOTE: This does not consider any configured counter limits. | ||
35 | */ | ||
36 | void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) | ||
37 | { | ||
38 | struct page_counter *c; | ||
39 | |||
40 | for (c = counter; c; c = c->parent) { | ||
41 | long new; | ||
42 | |||
43 | new = atomic_long_add_return(nr_pages, &c->count); | ||
44 | /* | ||
45 | * This is indeed racy, but we can live with some | ||
46 | * inaccuracy in the watermark. | ||
47 | */ | ||
48 | if (new > c->watermark) | ||
49 | c->watermark = new; | ||
50 | } | ||
51 | } | ||
52 | |||
53 | /** | ||
54 | * page_counter_try_charge - try to hierarchically charge pages | ||
55 | * @counter: counter | ||
56 | * @nr_pages: number of pages to charge | ||
57 | * @fail: points first counter to hit its limit, if any | ||
58 | * | ||
59 | * Returns 0 on success, or -ENOMEM and @fail if the counter or one of | ||
60 | * its ancestors has hit its configured limit. | ||
61 | */ | ||
62 | int page_counter_try_charge(struct page_counter *counter, | ||
63 | unsigned long nr_pages, | ||
64 | struct page_counter **fail) | ||
65 | { | ||
66 | struct page_counter *c; | ||
67 | |||
68 | for (c = counter; c; c = c->parent) { | ||
69 | long new; | ||
70 | /* | ||
71 | * Charge speculatively to avoid an expensive CAS. If | ||
72 | * a bigger charge fails, it might falsely lock out a | ||
73 | * racing smaller charge and send it into reclaim | ||
74 | * early, but the error is limited to the difference | ||
75 | * between the two sizes, which is less than 2M/4M in | ||
76 | * case of a THP locking out a regular page charge. | ||
77 | * | ||
78 | * The atomic_long_add_return() implies a full memory | ||
79 | * barrier between incrementing the count and reading | ||
80 | * the limit. When racing with page_counter_limit(), | ||
81 | * we either see the new limit or the setter sees the | ||
82 | * counter has changed and retries. | ||
83 | */ | ||
84 | new = atomic_long_add_return(nr_pages, &c->count); | ||
85 | if (new > c->limit) { | ||
86 | atomic_long_sub(nr_pages, &c->count); | ||
87 | /* | ||
88 | * This is racy, but we can live with some | ||
89 | * inaccuracy in the failcnt. | ||
90 | */ | ||
91 | c->failcnt++; | ||
92 | *fail = c; | ||
93 | goto failed; | ||
94 | } | ||
95 | /* | ||
96 | * Just like with failcnt, we can live with some | ||
97 | * inaccuracy in the watermark. | ||
98 | */ | ||
99 | if (new > c->watermark) | ||
100 | c->watermark = new; | ||
101 | } | ||
102 | return 0; | ||
103 | |||
104 | failed: | ||
105 | for (c = counter; c != *fail; c = c->parent) | ||
106 | page_counter_cancel(c, nr_pages); | ||
107 | |||
108 | return -ENOMEM; | ||
109 | } | ||
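
The comment in page_counter_try_charge() is the heart of the new design: add the charge first, compare against the limit afterwards, and unwind the levels already charged if an ancestor is over its limit. As a rough illustration, the same scheme can be sketched in user space with C11 atomics; the structure and helper below are made-up stand-ins, not the kernel API, and the watermark/failcnt bookkeeping is omitted.

        /* Minimal user-space sketch of the speculative hierarchical charge used by
         * page_counter_try_charge(): add first, check the limit, roll back on failure.
         * Names and types are illustrative only. */
        #include <stdatomic.h>
        #include <stdio.h>

        struct counter {
                atomic_long count;
                long limit;
                struct counter *parent;
        };

        static int try_charge(struct counter *c, long nr, struct counter **fail)
        {
                struct counter *p;

                for (p = c; p; p = p->parent) {
                        long new = atomic_fetch_add(&p->count, nr) + nr;

                        if (new > p->limit) {
                                atomic_fetch_sub(&p->count, nr);   /* undo this level */
                                *fail = p;
                                /* undo the levels already charged below the failing one */
                                for (struct counter *q = c; q != p; q = q->parent)
                                        atomic_fetch_sub(&q->count, nr);
                                return -1;
                        }
                }
                return 0;
        }

        int main(void)
        {
                struct counter root  = { .limit = 8, .parent = NULL };
                struct counter child = { .limit = 4, .parent = &root };
                struct counter *fail;

                printf("%d\n", try_charge(&child, 3, &fail)); /* 0: fits in child and root */
                printf("%d\n", try_charge(&child, 3, &fail)); /* -1: would exceed child's limit of 4 */
                return 0;
        }
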
110 | |||
111 | /** | ||
112 | * page_counter_uncharge - hierarchically uncharge pages | ||
113 | * @counter: counter | ||
114 | * @nr_pages: number of pages to uncharge | ||
115 | */ | ||
116 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) | ||
117 | { | ||
118 | struct page_counter *c; | ||
119 | |||
120 | for (c = counter; c; c = c->parent) | ||
121 | page_counter_cancel(c, nr_pages); | ||
122 | } | ||
123 | |||
124 | /** | ||
125 | * page_counter_limit - limit the number of pages allowed | ||
126 | * @counter: counter | ||
127 | * @limit: limit to set | ||
128 | * | ||
129 | * Returns 0 on success, -EBUSY if the current number of pages on the | ||
130 | * counter already exceeds the specified limit. | ||
131 | * | ||
132 | * The caller must serialize invocations on the same counter. | ||
133 | */ | ||
134 | int page_counter_limit(struct page_counter *counter, unsigned long limit) | ||
135 | { | ||
136 | for (;;) { | ||
137 | unsigned long old; | ||
138 | long count; | ||
139 | |||
140 | /* | ||
141 | * Update the limit while making sure that it's not | ||
142 | * below the concurrently-changing counter value. | ||
143 | * | ||
144 | * The xchg implies two full memory barriers before | ||
145 | * and after, so the read-swap-read is ordered and | ||
146 | * ensures coherency with page_counter_try_charge(): | ||
147 | * that function modifies the count before checking | ||
148 | * the limit, so if it sees the old limit, we see the | ||
149 | * modified counter and retry. | ||
150 | */ | ||
151 | count = atomic_long_read(&counter->count); | ||
152 | |||
153 | if (count > limit) | ||
154 | return -EBUSY; | ||
155 | |||
156 | old = xchg(&counter->limit, limit); | ||
157 | |||
158 | if (atomic_long_read(&counter->count) <= count) | ||
159 | return 0; | ||
160 | |||
161 | counter->limit = old; | ||
162 | cond_resched(); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | /** | ||
167 | * page_counter_memparse - memparse() for page counter limits | ||
168 | * @buf: string to parse | ||
169 | * @nr_pages: returns the result in number of pages | ||
170 | * | ||
171 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be | ||
172 | * limited to %PAGE_COUNTER_MAX. | ||
173 | */ | ||
174 | int page_counter_memparse(const char *buf, unsigned long *nr_pages) | ||
175 | { | ||
176 | char unlimited[] = "-1"; | ||
177 | char *end; | ||
178 | u64 bytes; | ||
179 | |||
180 | if (!strncmp(buf, unlimited, sizeof(unlimited))) { | ||
181 | *nr_pages = PAGE_COUNTER_MAX; | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | bytes = memparse(buf, &end); | ||
186 | if (*end != '\0') | ||
187 | return -EINVAL; | ||
188 | |||
189 | *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); | ||
190 | |||
191 | return 0; | ||
192 | } | ||
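
page_counter_memparse() keeps the old res_counter convention that the literal string "-1" means "no limit"; anything else is treated as a byte count (memparse() also understands K/M/G suffixes) and converted to pages, capped at PAGE_COUNTER_MAX. A stand-alone sketch of that conversion, using illustrative constants rather than the kernel's definitions:

        /* Stand-alone sketch of the "-1 means unlimited, else bytes -> pages, capped"
         * conversion done by page_counter_memparse().  PAGE_SZ and COUNTER_MAX are
         * illustrative values, not the kernel's definitions. */
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
        #include <limits.h>

        #define PAGE_SZ     4096UL
        #define COUNTER_MAX (LONG_MAX / PAGE_SZ)

        static int parse_limit(const char *buf, unsigned long *nr_pages)
        {
                char *end;
                unsigned long long bytes;

                if (!strcmp(buf, "-1")) {               /* "unlimited" */
                        *nr_pages = COUNTER_MAX;
                        return 0;
                }

                bytes = strtoull(buf, &end, 0);
                if (*end != '\0')                       /* trailing junk: reject */
                        return -1;

                *nr_pages = bytes / PAGE_SZ;
                if (*nr_pages > COUNTER_MAX)
                        *nr_pages = COUNTER_MAX;
                return 0;
        }

        int main(void)
        {
                unsigned long pages;

                if (!parse_limit("8388608", &pages))    /* 8 MiB -> 2048 pages */
                        printf("%lu pages\n", pages);
                if (!parse_limit("-1", &pages))
                        printf("unlimited -> %lu pages\n", pages);
                return 0;
        }
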
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c8778f7e208e..72f5ac381ab3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -68,7 +68,7 @@ out: | |||
68 | 68 | ||
69 | spin_unlock_irqrestore(&zone->lock, flags); | 69 | spin_unlock_irqrestore(&zone->lock, flags); |
70 | if (!ret) | 70 | if (!ret) |
71 | drain_all_pages(); | 71 | drain_all_pages(zone); |
72 | return ret; | 72 | return ret; |
73 | } | 73 | } |
74 | 74 | ||
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -1053,7 +1053,7 @@ void page_add_file_rmap(struct page *page) | |||
1053 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1053 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1054 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | 1054 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); |
1055 | } | 1055 | } |
1056 | mem_cgroup_end_page_stat(memcg, locked, flags); | 1056 | mem_cgroup_end_page_stat(memcg, &locked, &flags); |
1057 | } | 1057 | } |
1058 | 1058 | ||
1059 | static void page_remove_file_rmap(struct page *page) | 1059 | static void page_remove_file_rmap(struct page *page) |
@@ -1083,7 +1083,7 @@ static void page_remove_file_rmap(struct page *page) | |||
1083 | if (unlikely(PageMlocked(page))) | 1083 | if (unlikely(PageMlocked(page))) |
1084 | clear_page_mlock(page); | 1084 | clear_page_mlock(page); |
1085 | out: | 1085 | out: |
1086 | mem_cgroup_end_page_stat(memcg, locked, flags); | 1086 | mem_cgroup_end_page_stat(memcg, &locked, &flags); |
1087 | } | 1087 | } |
1088 | 1088 | ||
1089 | /** | 1089 | /** |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -2590,7 +2590,10 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2590 | * Be lazy and only check for valid flags here, keeping it out of the | 2590 | * Be lazy and only check for valid flags here, keeping it out of the |
2591 | * critical path in kmem_cache_alloc(). | 2591 | * critical path in kmem_cache_alloc(). |
2592 | */ | 2592 | */ |
2593 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 2593 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { |
2594 | pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); | ||
2595 | BUG(); | ||
2596 | } | ||
2594 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 2597 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
2595 | 2598 | ||
2596 | /* Take the node list lock to change the colour_next on this node */ | 2599 | /* Take the node list lock to change the colour_next on this node */ |
@@ -3580,11 +3583,11 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) | |||
3580 | 3583 | ||
3581 | for_each_online_node(node) { | 3584 | for_each_online_node(node) { |
3582 | 3585 | ||
3583 | if (use_alien_caches) { | 3586 | if (use_alien_caches) { |
3584 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); | 3587 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); |
3585 | if (!new_alien) | 3588 | if (!new_alien) |
3586 | goto fail; | 3589 | goto fail; |
3587 | } | 3590 | } |
3588 | 3591 | ||
3589 | new_shared = NULL; | 3592 | new_shared = NULL; |
3590 | if (cachep->shared) { | 3593 | if (cachep->shared) { |
@@ -4043,12 +4046,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4043 | 4046 | ||
4044 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4047 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4045 | 4048 | ||
4046 | static void *leaks_start(struct seq_file *m, loff_t *pos) | ||
4047 | { | ||
4048 | mutex_lock(&slab_mutex); | ||
4049 | return seq_list_start(&slab_caches, *pos); | ||
4050 | } | ||
4051 | |||
4052 | static inline int add_caller(unsigned long *n, unsigned long v) | 4049 | static inline int add_caller(unsigned long *n, unsigned long v) |
4053 | { | 4050 | { |
4054 | unsigned long *p; | 4051 | unsigned long *p; |
@@ -4170,7 +4167,7 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4170 | } | 4167 | } |
4171 | 4168 | ||
4172 | static const struct seq_operations slabstats_op = { | 4169 | static const struct seq_operations slabstats_op = { |
4173 | .start = leaks_start, | 4170 | .start = slab_start, |
4174 | .next = slab_next, | 4171 | .next = slab_next, |
4175 | .stop = slab_stop, | 4172 | .stop = slab_stop, |
4176 | .show = leaks_show, | 4173 | .show = leaks_show, |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -209,15 +209,15 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) | |||
209 | 209 | ||
210 | rcu_read_lock(); | 210 | rcu_read_lock(); |
211 | params = rcu_dereference(s->memcg_params); | 211 | params = rcu_dereference(s->memcg_params); |
212 | cachep = params->memcg_caches[idx]; | ||
213 | rcu_read_unlock(); | ||
214 | 212 | ||
215 | /* | 213 | /* |
216 | * Make sure we will access the up-to-date value. The code updating | 214 | * Make sure we will access the up-to-date value. The code updating |
217 | * memcg_caches issues a write barrier to match this (see | 215 | * memcg_caches issues a write barrier to match this (see |
218 | * memcg_register_cache()). | 216 | * memcg_register_cache()). |
219 | */ | 217 | */ |
220 | smp_read_barrier_depends(); | 218 | cachep = lockless_dereference(params->memcg_caches[idx]); |
219 | rcu_read_unlock(); | ||
220 | |||
221 | return cachep; | 221 | return cachep; |
222 | } | 222 | } |
223 | 223 | ||
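
The barrier comment above describes the usual publish/consume pairing: memcg_register_cache() initializes the cache and issues a write barrier before making the pointer visible, and the reader's dependent load, now expressed with lockless_dereference(), must not be hoisted above the pointer read. A loose user-space analogue with C11 atomics (release on the publishing side, acquire on the reading side); the types and names below are invented for illustration.

        /* Publish/consume sketch: the writer fills in the object and then releases
         * the pointer; a reader that acquires the pointer is guaranteed to see the
         * initialized fields.  Illustrative only, not the kernel primitives. */
        #include <stdatomic.h>
        #include <stddef.h>

        struct cache {
                int object_size;
        };

        static struct cache slot;
        static _Atomic(struct cache *) published = NULL;

        static void publisher(void)
        {
                slot.object_size = 128;                         /* initialize first */
                atomic_store_explicit(&published, &slot,
                                      memory_order_release);    /* then publish     */
        }

        static struct cache *reader(void)
        {
                /* memory_order_consume maps to acquire on common implementations;
                 * either ordering suffices for this dependent read. */
                return atomic_load_explicit(&published, memory_order_acquire);
        }

        int main(void)
        {
                /* Single-threaded demo; the ordering matters when publisher and
                 * reader run on different CPUs. */
                publisher();
                struct cache *c = reader();
                return c ? (c->object_size != 128) : 1;         /* 0 on success */
        }
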
@@ -357,7 +357,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
357 | 357 | ||
358 | #endif | 358 | #endif |
359 | 359 | ||
360 | void *slab_start(struct seq_file *m, loff_t *pos); | ||
360 | void *slab_next(struct seq_file *m, void *p, loff_t *pos); | 361 | void *slab_next(struct seq_file *m, void *p, loff_t *pos); |
361 | void slab_stop(struct seq_file *m, void *p); | 362 | void slab_stop(struct seq_file *m, void *p); |
363 | int memcg_slab_show(struct seq_file *m, void *p); | ||
362 | 364 | ||
363 | #endif /* MM_SLAB_H */ | 365 | #endif /* MM_SLAB_H */ |
diff --git a/mm/slab_common.c b/mm/slab_common.c index dcdab81bd240..e03dd6f2a272 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -240,7 +240,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, | |||
240 | size = ALIGN(size, align); | 240 | size = ALIGN(size, align); |
241 | flags = kmem_cache_flags(size, flags, name, NULL); | 241 | flags = kmem_cache_flags(size, flags, name, NULL); |
242 | 242 | ||
243 | list_for_each_entry(s, &slab_caches, list) { | 243 | list_for_each_entry_reverse(s, &slab_caches, list) { |
244 | if (slab_unmergeable(s)) | 244 | if (slab_unmergeable(s)) |
245 | continue; | 245 | continue; |
246 | 246 | ||
@@ -811,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); | |||
811 | #define SLABINFO_RIGHTS S_IRUSR | 811 | #define SLABINFO_RIGHTS S_IRUSR |
812 | #endif | 812 | #endif |
813 | 813 | ||
814 | void print_slabinfo_header(struct seq_file *m) | 814 | static void print_slabinfo_header(struct seq_file *m) |
815 | { | 815 | { |
816 | /* | 816 | /* |
817 | * Output format version, so at least we can change it | 817 | * Output format version, so at least we can change it |
@@ -834,14 +834,9 @@ void print_slabinfo_header(struct seq_file *m) | |||
834 | seq_putc(m, '\n'); | 834 | seq_putc(m, '\n'); |
835 | } | 835 | } |
836 | 836 | ||
837 | static void *s_start(struct seq_file *m, loff_t *pos) | 837 | void *slab_start(struct seq_file *m, loff_t *pos) |
838 | { | 838 | { |
839 | loff_t n = *pos; | ||
840 | |||
841 | mutex_lock(&slab_mutex); | 839 | mutex_lock(&slab_mutex); |
842 | if (!n) | ||
843 | print_slabinfo_header(m); | ||
844 | |||
845 | return seq_list_start(&slab_caches, *pos); | 840 | return seq_list_start(&slab_caches, *pos); |
846 | } | 841 | } |
847 | 842 | ||
@@ -881,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | |||
881 | } | 876 | } |
882 | } | 877 | } |
883 | 878 | ||
884 | int cache_show(struct kmem_cache *s, struct seq_file *m) | 879 | static void cache_show(struct kmem_cache *s, struct seq_file *m) |
885 | { | 880 | { |
886 | struct slabinfo sinfo; | 881 | struct slabinfo sinfo; |
887 | 882 | ||
@@ -900,17 +895,32 @@ int cache_show(struct kmem_cache *s, struct seq_file *m) | |||
900 | sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); | 895 | sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); |
901 | slabinfo_show_stats(m, s); | 896 | slabinfo_show_stats(m, s); |
902 | seq_putc(m, '\n'); | 897 | seq_putc(m, '\n'); |
898 | } | ||
899 | |||
900 | static int slab_show(struct seq_file *m, void *p) | ||
901 | { | ||
902 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); | ||
903 | |||
904 | if (p == slab_caches.next) | ||
905 | print_slabinfo_header(m); | ||
906 | if (is_root_cache(s)) | ||
907 | cache_show(s, m); | ||
903 | return 0; | 908 | return 0; |
904 | } | 909 | } |
905 | 910 | ||
906 | static int s_show(struct seq_file *m, void *p) | 911 | #ifdef CONFIG_MEMCG_KMEM |
912 | int memcg_slab_show(struct seq_file *m, void *p) | ||
907 | { | 913 | { |
908 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); | 914 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); |
915 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | ||
909 | 916 | ||
910 | if (!is_root_cache(s)) | 917 | if (p == slab_caches.next) |
911 | return 0; | 918 | print_slabinfo_header(m); |
912 | return cache_show(s, m); | 919 | if (!is_root_cache(s) && s->memcg_params->memcg == memcg) |
920 | cache_show(s, m); | ||
921 | return 0; | ||
913 | } | 922 | } |
923 | #endif | ||
914 | 924 | ||
915 | /* | 925 | /* |
916 | * slabinfo_op - iterator that generates /proc/slabinfo | 926 | * slabinfo_op - iterator that generates /proc/slabinfo |
@@ -926,10 +936,10 @@ static int s_show(struct seq_file *m, void *p) | |||
926 | * + further values on SMP and with statistics enabled | 936 | * + further values on SMP and with statistics enabled |
927 | */ | 937 | */ |
928 | static const struct seq_operations slabinfo_op = { | 938 | static const struct seq_operations slabinfo_op = { |
929 | .start = s_start, | 939 | .start = slab_start, |
930 | .next = slab_next, | 940 | .next = slab_next, |
931 | .stop = slab_stop, | 941 | .stop = slab_stop, |
932 | .show = s_show, | 942 | .show = slab_show, |
933 | }; | 943 | }; |
934 | 944 | ||
935 | static int slabinfo_open(struct inode *inode, struct file *file) | 945 | static int slabinfo_open(struct inode *inode, struct file *file) |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -849,12 +849,12 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
849 | maxobj = order_objects(compound_order(page), s->size, s->reserved); | 849 | maxobj = order_objects(compound_order(page), s->size, s->reserved); |
850 | if (page->objects > maxobj) { | 850 | if (page->objects > maxobj) { |
851 | slab_err(s, page, "objects %u > max %u", | 851 | slab_err(s, page, "objects %u > max %u", |
852 | s->name, page->objects, maxobj); | 852 | page->objects, maxobj); |
853 | return 0; | 853 | return 0; |
854 | } | 854 | } |
855 | if (page->inuse > page->objects) { | 855 | if (page->inuse > page->objects) { |
856 | slab_err(s, page, "inuse %u > max %u", | 856 | slab_err(s, page, "inuse %u > max %u", |
857 | s->name, page->inuse, page->objects); | 857 | page->inuse, page->objects); |
858 | return 0; | 858 | return 0; |
859 | } | 859 | } |
860 | /* Slab_pad_check fixes things up after itself */ | 860 | /* Slab_pad_check fixes things up after itself */ |
@@ -871,7 +871,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | |||
871 | int nr = 0; | 871 | int nr = 0; |
872 | void *fp; | 872 | void *fp; |
873 | void *object = NULL; | 873 | void *object = NULL; |
874 | unsigned long max_objects; | 874 | int max_objects; |
875 | 875 | ||
876 | fp = page->freelist; | 876 | fp = page->freelist; |
877 | while (fp && nr <= page->objects) { | 877 | while (fp && nr <= page->objects) { |
@@ -1377,7 +1377,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1377 | int order; | 1377 | int order; |
1378 | int idx; | 1378 | int idx; |
1379 | 1379 | ||
1380 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 1380 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { |
1381 | pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); | ||
1382 | BUG(); | ||
1383 | } | ||
1381 | 1384 | ||
1382 | page = allocate_slab(s, | 1385 | page = allocate_slab(s, |
1383 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | 1386 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); |
@@ -2554,7 +2557,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2554 | 2557 | ||
2555 | } else { /* Needs to be taken off a list */ | 2558 | } else { /* Needs to be taken off a list */ |
2556 | 2559 | ||
2557 | n = get_node(s, page_to_nid(page)); | 2560 | n = get_node(s, page_to_nid(page)); |
2558 | /* | 2561 | /* |
2559 | * Speculatively acquire the list_lock. | 2562 | * Speculatively acquire the list_lock. |
2560 | * If the cmpxchg does not succeed then we may | 2563 | * If the cmpxchg does not succeed then we may |
@@ -2587,10 +2590,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2587 | * The list lock was not taken therefore no list | 2590 | * The list lock was not taken therefore no list |
2588 | * activity can be necessary. | 2591 | * activity can be necessary. |
2589 | */ | 2592 | */ |
2590 | if (was_frozen) | 2593 | if (was_frozen) |
2591 | stat(s, FREE_FROZEN); | 2594 | stat(s, FREE_FROZEN); |
2592 | return; | 2595 | return; |
2593 | } | 2596 | } |
2594 | 2597 | ||
2595 | if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) | 2598 | if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) |
2596 | goto slab_empty; | 2599 | goto slab_empty; |
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c new file mode 100644 index 000000000000..b5f7f24b8dd1 --- /dev/null +++ b/mm/swap_cgroup.c | |||
@@ -0,0 +1,208 @@ | |||
1 | #include <linux/swap_cgroup.h> | ||
2 | #include <linux/vmalloc.h> | ||
3 | #include <linux/mm.h> | ||
4 | |||
5 | #include <linux/swapops.h> /* depends on mm.h include */ | ||
6 | |||
7 | static DEFINE_MUTEX(swap_cgroup_mutex); | ||
8 | struct swap_cgroup_ctrl { | ||
9 | struct page **map; | ||
10 | unsigned long length; | ||
11 | spinlock_t lock; | ||
12 | }; | ||
13 | |||
14 | static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | ||
15 | |||
16 | struct swap_cgroup { | ||
17 | unsigned short id; | ||
18 | }; | ||
19 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) | ||
20 | |||
21 | /* | ||
22 | * SwapCgroup implements "lookup" and "exchange" operations. | ||
23 | * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge | ||
24 | * against SwapCache. At swap_free(), this is accessed directly from swap. | ||
25 | * | ||
26 | * This means, | ||
27 | * - we have no race in "exchange" when we're accessed via SwapCache because | ||
28 | * SwapCache(and its swp_entry) is under lock. | ||
29 | * - When called via swap_free(), there is no user of this entry and no race. | ||
30 | * Then, we don't need lock around "exchange". | ||
31 | * | ||
32 | * TODO: we can push these buffers out to HIGHMEM. | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | * allocate buffer for swap_cgroup. | ||
37 | */ | ||
38 | static int swap_cgroup_prepare(int type) | ||
39 | { | ||
40 | struct page *page; | ||
41 | struct swap_cgroup_ctrl *ctrl; | ||
42 | unsigned long idx, max; | ||
43 | |||
44 | ctrl = &swap_cgroup_ctrl[type]; | ||
45 | |||
46 | for (idx = 0; idx < ctrl->length; idx++) { | ||
47 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
48 | if (!page) | ||
49 | goto not_enough_page; | ||
50 | ctrl->map[idx] = page; | ||
51 | } | ||
52 | return 0; | ||
53 | not_enough_page: | ||
54 | max = idx; | ||
55 | for (idx = 0; idx < max; idx++) | ||
56 | __free_page(ctrl->map[idx]); | ||
57 | |||
58 | return -ENOMEM; | ||
59 | } | ||
60 | |||
61 | static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | ||
62 | struct swap_cgroup_ctrl **ctrlp) | ||
63 | { | ||
64 | pgoff_t offset = swp_offset(ent); | ||
65 | struct swap_cgroup_ctrl *ctrl; | ||
66 | struct page *mappage; | ||
67 | struct swap_cgroup *sc; | ||
68 | |||
69 | ctrl = &swap_cgroup_ctrl[swp_type(ent)]; | ||
70 | if (ctrlp) | ||
71 | *ctrlp = ctrl; | ||
72 | |||
73 | mappage = ctrl->map[offset / SC_PER_PAGE]; | ||
74 | sc = page_address(mappage); | ||
75 | return sc + offset % SC_PER_PAGE; | ||
76 | } | ||
77 | |||
78 | /** | ||
79 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
80 | * @ent: swap entry to be cmpxchged | ||
81 | * @old: old id | ||
82 | * @new: new id | ||
83 | * | ||
84 | * Returns old id at success, 0 at failure. | ||
85 | * (There is no mem_cgroup using 0 as its id) | ||
86 | */ | ||
87 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
88 | unsigned short old, unsigned short new) | ||
89 | { | ||
90 | struct swap_cgroup_ctrl *ctrl; | ||
91 | struct swap_cgroup *sc; | ||
92 | unsigned long flags; | ||
93 | unsigned short retval; | ||
94 | |||
95 | sc = lookup_swap_cgroup(ent, &ctrl); | ||
96 | |||
97 | spin_lock_irqsave(&ctrl->lock, flags); | ||
98 | retval = sc->id; | ||
99 | if (retval == old) | ||
100 | sc->id = new; | ||
101 | else | ||
102 | retval = 0; | ||
103 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
104 | return retval; | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | ||
109 | * @ent: swap entry to be recorded into | ||
110 | * @id: mem_cgroup to be recorded | ||
111 | * | ||
112 | * Returns old value at success, 0 at failure. | ||
113 | * (Of course, old value can be 0.) | ||
114 | */ | ||
115 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | ||
116 | { | ||
117 | struct swap_cgroup_ctrl *ctrl; | ||
118 | struct swap_cgroup *sc; | ||
119 | unsigned short old; | ||
120 | unsigned long flags; | ||
121 | |||
122 | sc = lookup_swap_cgroup(ent, &ctrl); | ||
123 | |||
124 | spin_lock_irqsave(&ctrl->lock, flags); | ||
125 | old = sc->id; | ||
126 | sc->id = id; | ||
127 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
128 | |||
129 | return old; | ||
130 | } | ||
131 | |||
132 | /** | ||
133 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry | ||
134 | * @ent: swap entry to be looked up. | ||
135 | * | ||
136 | * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) | ||
137 | */ | ||
138 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) | ||
139 | { | ||
140 | return lookup_swap_cgroup(ent, NULL)->id; | ||
141 | } | ||
142 | |||
143 | int swap_cgroup_swapon(int type, unsigned long max_pages) | ||
144 | { | ||
145 | void *array; | ||
146 | unsigned long array_size; | ||
147 | unsigned long length; | ||
148 | struct swap_cgroup_ctrl *ctrl; | ||
149 | |||
150 | if (!do_swap_account) | ||
151 | return 0; | ||
152 | |||
153 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); | ||
154 | array_size = length * sizeof(void *); | ||
155 | |||
156 | array = vzalloc(array_size); | ||
157 | if (!array) | ||
158 | goto nomem; | ||
159 | |||
160 | ctrl = &swap_cgroup_ctrl[type]; | ||
161 | mutex_lock(&swap_cgroup_mutex); | ||
162 | ctrl->length = length; | ||
163 | ctrl->map = array; | ||
164 | spin_lock_init(&ctrl->lock); | ||
165 | if (swap_cgroup_prepare(type)) { | ||
166 | /* memory shortage */ | ||
167 | ctrl->map = NULL; | ||
168 | ctrl->length = 0; | ||
169 | mutex_unlock(&swap_cgroup_mutex); | ||
170 | vfree(array); | ||
171 | goto nomem; | ||
172 | } | ||
173 | mutex_unlock(&swap_cgroup_mutex); | ||
174 | |||
175 | return 0; | ||
176 | nomem: | ||
177 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | ||
178 | printk(KERN_INFO | ||
179 | "swap_cgroup can be disabled by swapaccount=0 boot option\n"); | ||
180 | return -ENOMEM; | ||
181 | } | ||
182 | |||
183 | void swap_cgroup_swapoff(int type) | ||
184 | { | ||
185 | struct page **map; | ||
186 | unsigned long i, length; | ||
187 | struct swap_cgroup_ctrl *ctrl; | ||
188 | |||
189 | if (!do_swap_account) | ||
190 | return; | ||
191 | |||
192 | mutex_lock(&swap_cgroup_mutex); | ||
193 | ctrl = &swap_cgroup_ctrl[type]; | ||
194 | map = ctrl->map; | ||
195 | length = ctrl->length; | ||
196 | ctrl->map = NULL; | ||
197 | ctrl->length = 0; | ||
198 | mutex_unlock(&swap_cgroup_mutex); | ||
199 | |||
200 | if (map) { | ||
201 | for (i = 0; i < length; i++) { | ||
202 | struct page *page = map[i]; | ||
203 | if (page) | ||
204 | __free_page(page); | ||
205 | } | ||
206 | vfree(map); | ||
207 | } | ||
208 | } | ||
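
Some rough numbers for the structures above, assuming 4 KiB pages: struct swap_cgroup is a 2-byte id, so SC_PER_PAGE is 2048 and swap_cgroup_swapon() needs about two bytes of id storage per swap slot plus one map pointer per 2048 slots. A small sketch of that sizing, where the device size is just an example:

        /* Sizing sketch for swap_cgroup_swapon(), assuming 4 KiB pages and a 2-byte
         * struct swap_cgroup.  The swap device size is an illustrative value. */
        #include <stdio.h>

        #define PAGE_SZ      4096UL
        #define SC_PER_PAGE  (PAGE_SZ / 2)              /* 2048 ids per backing page */

        int main(void)
        {
                unsigned long max_pages  = 262144;      /* e.g. a 1 GiB swap device */
                unsigned long length     = (max_pages + SC_PER_PAGE - 1) / SC_PER_PAGE;
                unsigned long array_size = length * sizeof(void *);

                printf("map pages: %lu (%lu KiB of ids), pointer array: %lu bytes\n",
                       length, length * PAGE_SZ / 1024, array_size);
                /* -> 128 backing pages (512 KiB) and a 1 KiB pointer array */
                return 0;
        }
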
diff --git a/mm/swap_state.c b/mm/swap_state.c index 154444918685..9711342987a0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
19 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
20 | #include <linux/page_cgroup.h> | ||
21 | 20 | ||
22 | #include <asm/pgtable.h> | 21 | #include <asm/pgtable.h> |
23 | 22 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 8798b2e0ac59..63f55ccb9b26 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -38,7 +38,7 @@ | |||
38 | #include <asm/pgtable.h> | 38 | #include <asm/pgtable.h> |
39 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
40 | #include <linux/swapops.h> | 40 | #include <linux/swapops.h> |
41 | #include <linux/page_cgroup.h> | 41 | #include <linux/swap_cgroup.h> |
42 | 42 | ||
43 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | 43 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, |
44 | unsigned char); | 44 | unsigned char); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 90520af7f186..8a18196fcdff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -463,8 +463,7 @@ overflow: | |||
463 | goto retry; | 463 | goto retry; |
464 | } | 464 | } |
465 | if (printk_ratelimit()) | 465 | if (printk_ratelimit()) |
466 | printk(KERN_WARNING | 466 | pr_warn("vmap allocation for size %lu failed: " |
467 | "vmap allocation for size %lu failed: " | ||
468 | "use vmalloc=<size> to increase size.\n", size); | 467 | "use vmalloc=<size> to increase size.\n", size); |
469 | kfree(va); | 468 | kfree(va); |
470 | return ERR_PTR(-EBUSY); | 469 | return ERR_PTR(-EBUSY); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index dcb47074ae03..4636d9e822c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -260,8 +260,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
260 | do_div(delta, lru_pages + 1); | 260 | do_div(delta, lru_pages + 1); |
261 | total_scan += delta; | 261 | total_scan += delta; |
262 | if (total_scan < 0) { | 262 | if (total_scan < 0) { |
263 | printk(KERN_ERR | 263 | pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", |
264 | "shrink_slab: %pF negative objects to delete nr=%ld\n", | ||
265 | shrinker->scan_objects, total_scan); | 264 | shrinker->scan_objects, total_scan); |
266 | total_scan = freeable; | 265 | total_scan = freeable; |
267 | } | 266 | } |
@@ -875,7 +874,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
875 | * end of the LRU a second time. | 874 | * end of the LRU a second time. |
876 | */ | 875 | */ |
877 | mapping = page_mapping(page); | 876 | mapping = page_mapping(page); |
878 | if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || | 877 | if (((dirty || writeback) && mapping && |
878 | bdi_write_congested(mapping->backing_dev_info)) || | ||
879 | (writeback && PageReclaim(page))) | 879 | (writeback && PageReclaim(page))) |
880 | nr_congested++; | 880 | nr_congested++; |
881 | 881 | ||
@@ -2249,7 +2249,7 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2249 | return true; | 2249 | return true; |
2250 | 2250 | ||
2251 | /* If compaction would go ahead or the allocation would succeed, stop */ | 2251 | /* If compaction would go ahead or the allocation would succeed, stop */ |
2252 | switch (compaction_suitable(zone, sc->order)) { | 2252 | switch (compaction_suitable(zone, sc->order, 0, 0)) { |
2253 | case COMPACT_PARTIAL: | 2253 | case COMPACT_PARTIAL: |
2254 | case COMPACT_CONTINUE: | 2254 | case COMPACT_CONTINUE: |
2255 | return false; | 2255 | return false; |
@@ -2346,7 +2346,7 @@ static inline bool compaction_ready(struct zone *zone, int order) | |||
2346 | * If compaction is not ready to start and allocation is not likely | 2346 | * If compaction is not ready to start and allocation is not likely |
2347 | * to succeed without it, then keep reclaiming. | 2347 | * to succeed without it, then keep reclaiming. |
2348 | */ | 2348 | */ |
2349 | if (compaction_suitable(zone, order) == COMPACT_SKIPPED) | 2349 | if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) |
2350 | return false; | 2350 | return false; |
2351 | 2351 | ||
2352 | return watermark_ok; | 2352 | return watermark_ok; |
@@ -2824,8 +2824,8 @@ static bool zone_balanced(struct zone *zone, int order, | |||
2824 | balance_gap, classzone_idx, 0)) | 2824 | balance_gap, classzone_idx, 0)) |
2825 | return false; | 2825 | return false; |
2826 | 2826 | ||
2827 | if (IS_ENABLED(CONFIG_COMPACTION) && order && | 2827 | if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, |
2828 | compaction_suitable(zone, order) == COMPACT_SKIPPED) | 2828 | order, 0, classzone_idx) == COMPACT_SKIPPED) |
2829 | return false; | 2829 | return false; |
2830 | 2830 | ||
2831 | return true; | 2831 | return true; |
@@ -2952,8 +2952,8 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2952 | * from memory. Do not reclaim more than needed for compaction. | 2952 | * from memory. Do not reclaim more than needed for compaction. |
2953 | */ | 2953 | */ |
2954 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && | 2954 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && |
2955 | compaction_suitable(zone, sc->order) != | 2955 | compaction_suitable(zone, sc->order, 0, classzone_idx) |
2956 | COMPACT_SKIPPED) | 2956 | != COMPACT_SKIPPED) |
2957 | testorder = 0; | 2957 | testorder = 0; |
2958 | 2958 | ||
2959 | /* | 2959 | /* |
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 1d191357bf88..272327134a1b 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c | |||
@@ -9,13 +9,13 @@ | |||
9 | int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 9 | int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
10 | { | 10 | { |
11 | /* | 11 | /* |
12 | * The root cgroup does not use res_counters, but rather, | 12 | * The root cgroup does not use page_counters, but rather, |
13 | * rely on the data already collected by the network | 13 | * rely on the data already collected by the network |
14 | * subsystem | 14 | * subsystem |
15 | */ | 15 | */ |
16 | struct res_counter *res_parent = NULL; | ||
17 | struct cg_proto *cg_proto, *parent_cg; | ||
18 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | 16 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
17 | struct page_counter *counter_parent = NULL; | ||
18 | struct cg_proto *cg_proto, *parent_cg; | ||
19 | 19 | ||
20 | cg_proto = tcp_prot.proto_cgroup(memcg); | 20 | cg_proto = tcp_prot.proto_cgroup(memcg); |
21 | if (!cg_proto) | 21 | if (!cg_proto) |
@@ -29,9 +29,9 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
29 | 29 | ||
30 | parent_cg = tcp_prot.proto_cgroup(parent); | 30 | parent_cg = tcp_prot.proto_cgroup(parent); |
31 | if (parent_cg) | 31 | if (parent_cg) |
32 | res_parent = &parent_cg->memory_allocated; | 32 | counter_parent = &parent_cg->memory_allocated; |
33 | 33 | ||
34 | res_counter_init(&cg_proto->memory_allocated, res_parent); | 34 | page_counter_init(&cg_proto->memory_allocated, counter_parent); |
35 | percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); | 35 | percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); |
36 | 36 | ||
37 | return 0; | 37 | return 0; |
@@ -50,7 +50,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) | |||
50 | } | 50 | } |
51 | EXPORT_SYMBOL(tcp_destroy_cgroup); | 51 | EXPORT_SYMBOL(tcp_destroy_cgroup); |
52 | 52 | ||
53 | static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | 53 | static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) |
54 | { | 54 | { |
55 | struct cg_proto *cg_proto; | 55 | struct cg_proto *cg_proto; |
56 | int i; | 56 | int i; |
@@ -60,20 +60,17 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |||
60 | if (!cg_proto) | 60 | if (!cg_proto) |
61 | return -EINVAL; | 61 | return -EINVAL; |
62 | 62 | ||
63 | if (val > RES_COUNTER_MAX) | 63 | ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); |
64 | val = RES_COUNTER_MAX; | ||
65 | |||
66 | ret = res_counter_set_limit(&cg_proto->memory_allocated, val); | ||
67 | if (ret) | 64 | if (ret) |
68 | return ret; | 65 | return ret; |
69 | 66 | ||
70 | for (i = 0; i < 3; i++) | 67 | for (i = 0; i < 3; i++) |
71 | cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, | 68 | cg_proto->sysctl_mem[i] = min_t(long, nr_pages, |
72 | sysctl_tcp_mem[i]); | 69 | sysctl_tcp_mem[i]); |
73 | 70 | ||
74 | if (val == RES_COUNTER_MAX) | 71 | if (nr_pages == PAGE_COUNTER_MAX) |
75 | clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); | 72 | clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); |
76 | else if (val != RES_COUNTER_MAX) { | 73 | else { |
77 | /* | 74 | /* |
78 | * The active bit needs to be written after the static_key | 75 | * The active bit needs to be written after the static_key |
79 | * update. This is what guarantees that the socket activation | 76 | * update. This is what guarantees that the socket activation |
@@ -102,11 +99,20 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | |||
102 | return 0; | 99 | return 0; |
103 | } | 100 | } |
104 | 101 | ||
102 | enum { | ||
103 | RES_USAGE, | ||
104 | RES_LIMIT, | ||
105 | RES_MAX_USAGE, | ||
106 | RES_FAILCNT, | ||
107 | }; | ||
108 | |||
109 | static DEFINE_MUTEX(tcp_limit_mutex); | ||
110 | |||
105 | static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, | 111 | static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, |
106 | char *buf, size_t nbytes, loff_t off) | 112 | char *buf, size_t nbytes, loff_t off) |
107 | { | 113 | { |
108 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 114 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
109 | unsigned long long val; | 115 | unsigned long nr_pages; |
110 | int ret = 0; | 116 | int ret = 0; |
111 | 117 | ||
112 | buf = strstrip(buf); | 118 | buf = strstrip(buf); |
@@ -114,10 +120,12 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, | |||
114 | switch (of_cft(of)->private) { | 120 | switch (of_cft(of)->private) { |
115 | case RES_LIMIT: | 121 | case RES_LIMIT: |
116 | /* see memcontrol.c */ | 122 | /* see memcontrol.c */ |
117 | ret = res_counter_memparse_write_strategy(buf, &val); | 123 | ret = page_counter_memparse(buf, &nr_pages); |
118 | if (ret) | 124 | if (ret) |
119 | break; | 125 | break; |
120 | ret = tcp_update_limit(memcg, val); | 126 | mutex_lock(&tcp_limit_mutex); |
127 | ret = tcp_update_limit(memcg, nr_pages); | ||
128 | mutex_unlock(&tcp_limit_mutex); | ||
121 | break; | 129 | break; |
122 | default: | 130 | default: |
123 | ret = -EINVAL; | 131 | ret = -EINVAL; |
@@ -126,43 +134,36 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, | |||
126 | return ret ?: nbytes; | 134 | return ret ?: nbytes; |
127 | } | 135 | } |
128 | 136 | ||
129 | static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) | ||
130 | { | ||
131 | struct cg_proto *cg_proto; | ||
132 | |||
133 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
134 | if (!cg_proto) | ||
135 | return default_val; | ||
136 | |||
137 | return res_counter_read_u64(&cg_proto->memory_allocated, type); | ||
138 | } | ||
139 | |||
140 | static u64 tcp_read_usage(struct mem_cgroup *memcg) | ||
141 | { | ||
142 | struct cg_proto *cg_proto; | ||
143 | |||
144 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
145 | if (!cg_proto) | ||
146 | return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; | ||
147 | |||
148 | return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE); | ||
149 | } | ||
150 | |||
151 | static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) | 137 | static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) |
152 | { | 138 | { |
153 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 139 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
140 | struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); | ||
154 | u64 val; | 141 | u64 val; |
155 | 142 | ||
156 | switch (cft->private) { | 143 | switch (cft->private) { |
157 | case RES_LIMIT: | 144 | case RES_LIMIT: |
158 | val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); | 145 | if (!cg_proto) |
146 | return PAGE_COUNTER_MAX; | ||
147 | val = cg_proto->memory_allocated.limit; | ||
148 | val *= PAGE_SIZE; | ||
159 | break; | 149 | break; |
160 | case RES_USAGE: | 150 | case RES_USAGE: |
161 | val = tcp_read_usage(memcg); | 151 | if (!cg_proto) |
152 | val = atomic_long_read(&tcp_memory_allocated); | ||
153 | else | ||
154 | val = page_counter_read(&cg_proto->memory_allocated); | ||
155 | val *= PAGE_SIZE; | ||
162 | break; | 156 | break; |
163 | case RES_FAILCNT: | 157 | case RES_FAILCNT: |
158 | if (!cg_proto) | ||
159 | return 0; | ||
160 | val = cg_proto->memory_allocated.failcnt; | ||
161 | break; | ||
164 | case RES_MAX_USAGE: | 162 | case RES_MAX_USAGE: |
165 | val = tcp_read_stat(memcg, cft->private, 0); | 163 | if (!cg_proto) |
164 | return 0; | ||
165 | val = cg_proto->memory_allocated.watermark; | ||
166 | val *= PAGE_SIZE; | ||
166 | break; | 167 | break; |
167 | default: | 168 | default: |
168 | BUG(); | 169 | BUG(); |
@@ -183,10 +184,10 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, | |||
183 | 184 | ||
184 | switch (of_cft(of)->private) { | 185 | switch (of_cft(of)->private) { |
185 | case RES_MAX_USAGE: | 186 | case RES_MAX_USAGE: |
186 | res_counter_reset_max(&cg_proto->memory_allocated); | 187 | page_counter_reset_watermark(&cg_proto->memory_allocated); |
187 | break; | 188 | break; |
188 | case RES_FAILCNT: | 189 | case RES_FAILCNT: |
189 | res_counter_reset_failcnt(&cg_proto->memory_allocated); | 190 | cg_proto->memory_allocated.failcnt = 0; |
190 | break; | 191 | break; |
191 | } | 192 | } |
192 | 193 | ||
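
A note on units in the converted read path: page_counter keeps limit, usage and watermark in pages, so tcp_cgroup_read() scales them by PAGE_SIZE before reporting, while failcnt stays a bare event count. A minimal sketch of that conversion, with a stand-in structure rather than the kernel's cg_proto:

        /* Sketch of the unit handling in the new tcp_cgroup_read(): limit, usage and
         * watermark are stored in pages and reported in bytes, failcnt is unscaled.
         * The structure and values below are illustrative only. */
        #include <stdio.h>

        #define PAGE_SZ 4096ULL

        struct counter_snapshot {
                unsigned long count;            /* pages currently charged */
                unsigned long limit;            /* page limit              */
                unsigned long watermark;        /* highest count seen      */
                unsigned long failcnt;          /* failed charge attempts  */
        };

        int main(void)
        {
                struct counter_snapshot snap = { 300, 1024, 612, 7 };

                printf("usage_in_bytes:     %llu\n", snap.count * PAGE_SZ);
                printf("limit_in_bytes:     %llu\n", snap.limit * PAGE_SZ);
                printf("max_usage_in_bytes: %llu\n", snap.watermark * PAGE_SZ);
                printf("failcnt:            %lu\n",  snap.failcnt);
                return 0;
        }
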
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 374abf443636..f0bb6d60c07b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl | |||
@@ -7,10 +7,11 @@ | |||
7 | 7 | ||
8 | use strict; | 8 | use strict; |
9 | use POSIX; | 9 | use POSIX; |
10 | use File::Basename; | ||
11 | use Cwd 'abs_path'; | ||
10 | 12 | ||
11 | my $P = $0; | 13 | my $P = $0; |
12 | $P =~ s@(.*)/@@g; | 14 | my $D = dirname(abs_path($P)); |
13 | my $D = $1; | ||
14 | 15 | ||
15 | my $V = '0.32'; | 16 | my $V = '0.32'; |
16 | 17 | ||
@@ -438,26 +439,29 @@ our $allowed_asm_includes = qr{(?x: | |||
438 | 439 | ||
439 | # Load common spelling mistakes and build regular expression list. | 440 | # Load common spelling mistakes and build regular expression list. |
440 | my $misspellings; | 441 | my $misspellings; |
441 | my @spelling_list; | ||
442 | my %spelling_fix; | 442 | my %spelling_fix; |
443 | open(my $spelling, '<', $spelling_file) | ||
444 | or die "$P: Can't open $spelling_file for reading: $!\n"; | ||
445 | while (<$spelling>) { | ||
446 | my $line = $_; | ||
447 | 443 | ||
448 | $line =~ s/\s*\n?$//g; | 444 | if (open(my $spelling, '<', $spelling_file)) { |
449 | $line =~ s/^\s*//g; | 445 | my @spelling_list; |
446 | while (<$spelling>) { | ||
447 | my $line = $_; | ||
450 | 448 | ||
451 | next if ($line =~ m/^\s*#/); | 449 | $line =~ s/\s*\n?$//g; |
452 | next if ($line =~ m/^\s*$/); | 450 | $line =~ s/^\s*//g; |
453 | 451 | ||
454 | my ($suspect, $fix) = split(/\|\|/, $line); | 452 | next if ($line =~ m/^\s*#/); |
453 | next if ($line =~ m/^\s*$/); | ||
455 | 454 | ||
456 | push(@spelling_list, $suspect); | 455 | my ($suspect, $fix) = split(/\|\|/, $line); |
457 | $spelling_fix{$suspect} = $fix; | 456 | |
457 | push(@spelling_list, $suspect); | ||
458 | $spelling_fix{$suspect} = $fix; | ||
459 | } | ||
460 | close($spelling); | ||
461 | $misspellings = join("|", @spelling_list); | ||
462 | } else { | ||
463 | warn "No typos will be found - file '$spelling_file': $!\n"; | ||
458 | } | 464 | } |
459 | close($spelling); | ||
460 | $misspellings = join("|", @spelling_list); | ||
461 | 465 | ||
462 | sub build_types { | 466 | sub build_types { |
463 | my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; | 467 | my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; |
@@ -942,7 +946,7 @@ sub sanitise_line { | |||
942 | sub get_quoted_string { | 946 | sub get_quoted_string { |
943 | my ($line, $rawline) = @_; | 947 | my ($line, $rawline) = @_; |
944 | 948 | ||
945 | return "" if ($line !~ m/(\"[X]+\")/g); | 949 | return "" if ($line !~ m/(\"[X\t]+\")/g); |
946 | return substr($rawline, $-[0], $+[0] - $-[0]); | 950 | return substr($rawline, $-[0], $+[0] - $-[0]); |
947 | } | 951 | } |
948 | 952 | ||
@@ -1843,6 +1847,7 @@ sub process { | |||
1843 | my $non_utf8_charset = 0; | 1847 | my $non_utf8_charset = 0; |
1844 | 1848 | ||
1845 | my $last_blank_line = 0; | 1849 | my $last_blank_line = 0; |
1850 | my $last_coalesced_string_linenr = -1; | ||
1846 | 1851 | ||
1847 | our @report = (); | 1852 | our @report = (); |
1848 | our $cnt_lines = 0; | 1853 | our $cnt_lines = 0; |
@@ -2078,6 +2083,12 @@ sub process { | |||
2078 | $in_commit_log = 0; | 2083 | $in_commit_log = 0; |
2079 | } | 2084 | } |
2080 | 2085 | ||
2086 | # Check if MAINTAINERS is being updated. If so, there's probably no need to | ||
2087 | # emit the "does MAINTAINERS need updating?" message on file add/move/delete | ||
2088 | if ($line =~ /^\s*MAINTAINERS\s*\|/) { | ||
2089 | $reported_maintainer_file = 1; | ||
2090 | } | ||
2091 | |||
2081 | # Check signature styles | 2092 | # Check signature styles |
2082 | if (!$in_header_lines && | 2093 | if (!$in_header_lines && |
2083 | $line =~ /^(\s*)([a-z0-9_-]+by:|$signature_tags)(\s*)(.*)/i) { | 2094 | $line =~ /^(\s*)([a-z0-9_-]+by:|$signature_tags)(\s*)(.*)/i) { |
@@ -2246,7 +2257,7 @@ sub process { | |||
2246 | } | 2257 | } |
2247 | 2258 | ||
2248 | # Check for various typo / spelling mistakes | 2259 | # Check for various typo / spelling mistakes |
2249 | if ($in_commit_log || $line =~ /^\+/) { | 2260 | if (defined($misspellings) && ($in_commit_log || $line =~ /^\+/)) { |
2250 | while ($rawline =~ /(?:^|[^a-z@])($misspellings)(?:$|[^a-z@])/gi) { | 2261 | while ($rawline =~ /(?:^|[^a-z@])($misspellings)(?:$|[^a-z@])/gi) { |
2251 | my $typo = $1; | 2262 | my $typo = $1; |
2252 | my $typo_fix = $spelling_fix{lc($typo)}; | 2263 | my $typo_fix = $spelling_fix{lc($typo)}; |
@@ -2403,33 +2414,6 @@ sub process { | |||
2403 | "line over $max_line_length characters\n" . $herecurr); | 2414 | "line over $max_line_length characters\n" . $herecurr); |
2404 | } | 2415 | } |
2405 | 2416 | ||
2406 | # Check for user-visible strings broken across lines, which breaks the ability | ||
2407 | # to grep for the string. Make exceptions when the previous string ends in a | ||
2408 | # newline (multiple lines in one string constant) or '\t', '\r', ';', or '{' | ||
2409 | # (common in inline assembly) or is a octal \123 or hexadecimal \xaf value | ||
2410 | if ($line =~ /^\+\s*"/ && | ||
2411 | $prevline =~ /"\s*$/ && | ||
2412 | $prevrawline !~ /(?:\\(?:[ntr]|[0-7]{1,3}|x[0-9a-fA-F]{1,2})|;\s*|\{\s*)"\s*$/) { | ||
2413 | WARN("SPLIT_STRING", | ||
2414 | "quoted string split across lines\n" . $hereprev); | ||
2415 | } | ||
2416 | |||
2417 | # check for missing a space in a string concatination | ||
2418 | if ($prevrawline =~ /[^\\]\w"$/ && $rawline =~ /^\+[\t ]+"\w/) { | ||
2419 | WARN('MISSING_SPACE', | ||
2420 | "break quoted strings at a space character\n" . $hereprev); | ||
2421 | } | ||
2422 | |||
2423 | # check for spaces before a quoted newline | ||
2424 | if ($rawline =~ /^.*\".*\s\\n/) { | ||
2425 | if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", | ||
2426 | "unnecessary whitespace before a quoted newline\n" . $herecurr) && | ||
2427 | $fix) { | ||
2428 | $fixed[$fixlinenr] =~ s/^(\+.*\".*)\s+\\n/$1\\n/; | ||
2429 | } | ||
2430 | |||
2431 | } | ||
2432 | |||
2433 | # check for adding lines without a newline. | 2417 | # check for adding lines without a newline. |
2434 | if ($line =~ /^\+/ && defined $lines[$linenr] && $lines[$linenr] =~ /^\\ No newline at end of file/) { | 2418 | if ($line =~ /^\+/ && defined $lines[$linenr] && $lines[$linenr] =~ /^\\ No newline at end of file/) { |
2435 | WARN("MISSING_EOF_NEWLINE", | 2419 | WARN("MISSING_EOF_NEWLINE", |
@@ -2515,7 +2499,8 @@ sub process { | |||
2515 | } | 2499 | } |
2516 | } | 2500 | } |
2517 | 2501 | ||
2518 | if ($line =~ /^\+.*\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic|{)/) { | 2502 | if ($line =~ /^\+.*(\w+\s*)?\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic|[,;\({\[\<\>])/ && |
2503 | (!defined($1) || $1 !~ /sizeof\s*/)) { | ||
2519 | if (CHK("SPACING", | 2504 | if (CHK("SPACING", |
2520 | "No space is necessary after a cast\n" . $herecurr) && | 2505 | "No space is necessary after a cast\n" . $herecurr) && |
2521 | $fix) { | 2506 | $fix) { |
@@ -3563,14 +3548,33 @@ sub process { | |||
3563 | } | 3548 | } |
3564 | } | 3549 | } |
3565 | 3550 | ||
3566 | # , must have a space on the right. | 3551 | # , must not have a space before and must have a space on the right. |
3567 | } elsif ($op eq ',') { | 3552 | } elsif ($op eq ',') { |
3553 | my $rtrim_before = 0; | ||
3554 | my $space_after = 0; | ||
3555 | if ($ctx =~ /Wx./) { | ||
3556 | if (ERROR("SPACING", | ||
3557 | "space prohibited before that '$op' $at\n" . $hereptr)) { | ||
3558 | $line_fixed = 1; | ||
3559 | $rtrim_before = 1; | ||
3560 | } | ||
3561 | } | ||
3568 | if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { | 3562 | if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { |
3569 | if (ERROR("SPACING", | 3563 | if (ERROR("SPACING", |
3570 | "space required after that '$op' $at\n" . $hereptr)) { | 3564 | "space required after that '$op' $at\n" . $hereptr)) { |
3571 | $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; | ||
3572 | $line_fixed = 1; | 3565 | $line_fixed = 1; |
3573 | $last_after = $n; | 3566 | $last_after = $n; |
3567 | $space_after = 1; | ||
3568 | } | ||
3569 | } | ||
3570 | if ($rtrim_before || $space_after) { | ||
3571 | if ($rtrim_before) { | ||
3572 | $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); | ||
3573 | } else { | ||
3574 | $good = $fix_elements[$n] . trim($fix_elements[$n + 1]); | ||
3575 | } | ||
3576 | if ($space_after) { | ||
3577 | $good .= " "; | ||
3574 | } | 3578 | } |
3575 | } | 3579 | } |
3576 | 3580 | ||
@@ -3814,9 +3818,27 @@ sub process { | |||
3814 | # ie: &(foo->bar) should be &foo->bar and *(foo->bar) should be *foo->bar | 3818 | # ie: &(foo->bar) should be &foo->bar and *(foo->bar) should be *foo->bar |
3815 | 3819 | ||
3816 | while ($line =~ /(?:[^&]&\s*|\*)\(\s*($Ident\s*(?:$Member\s*)+)\s*\)/g) { | 3820 | while ($line =~ /(?:[^&]&\s*|\*)\(\s*($Ident\s*(?:$Member\s*)+)\s*\)/g) { |
3817 | CHK("UNNECESSARY_PARENTHESES", | 3821 | my $var = $1; |
3818 | "Unnecessary parentheses around $1\n" . $herecurr); | 3822 | if (CHK("UNNECESSARY_PARENTHESES", |
3819 | } | 3823 | "Unnecessary parentheses around $var\n" . $herecurr) && |
3824 | $fix) { | ||
3825 | $fixed[$fixlinenr] =~ s/\(\s*\Q$var\E\s*\)/$var/; | ||
3826 | } | ||
3827 | } | ||
3828 | |||
3829 | # check for unnecessary parentheses around function pointer uses | ||
3830 | # ie: (foo->bar)(); should be foo->bar(); | ||
3831 | # but not "if (foo->bar) (" to avoid some false positives | ||
3832 | if ($line =~ /(\bif\s*|)(\(\s*$Ident\s*(?:$Member\s*)+\))[ \t]*\(/ && $1 !~ /^if/) { | ||
3833 | my $var = $2; | ||
3834 | if (CHK("UNNECESSARY_PARENTHESES", | ||
3835 | "Unnecessary parentheses around function pointer $var\n" . $herecurr) && | ||
3836 | $fix) { | ||
3837 | my $var2 = deparenthesize($var); | ||
3838 | $var2 =~ s/\s//g; | ||
3839 | $fixed[$fixlinenr] =~ s/\Q$var\E/$var2/; | ||
3840 | } | ||
3841 | } | ||
3820 | 3842 | ||
3821 | #goto labels aren't indented, allow a single space however | 3843 | #goto labels aren't indented, allow a single space however |
3822 | if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and | 3844 | if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and |
@@ -4056,7 +4078,9 @@ sub process { | |||
4056 | #Ignore Page<foo> variants | 4078 | #Ignore Page<foo> variants |
4057 | $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && | 4079 | $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && |
4058 | #Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show) | 4080 | #Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show) |
4059 | $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/) { | 4081 | $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/ && |
4082 | #Ignore some three character SI units explicitly, like MiB and KHz | ||
4083 | $var !~ /^(?:[a-z_]*?)_?(?:[KMGT]iB|[KMGT]?Hz)(?:_[a-z_]+)?$/) { | ||
4060 | while ($var =~ m{($Ident)}g) { | 4084 | while ($var =~ m{($Ident)}g) { |
4061 | my $word = $1; | 4085 | my $word = $1; |
4062 | next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/); | 4086 | next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/); |
@@ -4408,12 +4432,85 @@ sub process { | |||
4408 | "Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr); | 4432 | "Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr); |
4409 | } | 4433 | } |
4410 | 4434 | ||
4435 | # Check for user-visible strings broken across lines, which breaks the ability | ||
4436 | # to grep for the string. Make exceptions when the previous string ends in a | ||
4437 | # newline (multiple lines in one string constant) or '\t', '\r', ';', or '{' | ||
4438 | # (common in inline assembly) or is a octal \123 or hexadecimal \xaf value | ||
4439 | if ($line =~ /^\+\s*"[X\t]*"/ && | ||
4440 | $prevline =~ /"\s*$/ && | ||
4441 | $prevrawline !~ /(?:\\(?:[ntr]|[0-7]{1,3}|x[0-9a-fA-F]{1,2})|;\s*|\{\s*)"\s*$/) { | ||
4442 | if (WARN("SPLIT_STRING", | ||
4443 | "quoted string split across lines\n" . $hereprev) && | ||
4444 | $fix && | ||
4445 | $prevrawline =~ /^\+.*"\s*$/ && | ||
4446 | $last_coalesced_string_linenr != $linenr - 1) { | ||
4447 | my $extracted_string = get_quoted_string($line, $rawline); | ||
4448 | my $comma_close = ""; | ||
4449 | if ($rawline =~ /\Q$extracted_string\E(\s*\)\s*;\s*$|\s*,\s*)/) { | ||
4450 | $comma_close = $1; | ||
4451 | } | ||
4452 | |||
4453 | fix_delete_line($fixlinenr - 1, $prevrawline); | ||
4454 | fix_delete_line($fixlinenr, $rawline); | ||
4455 | my $fixedline = $prevrawline; | ||
4456 | $fixedline =~ s/"\s*$//; | ||
4457 | $fixedline .= substr($extracted_string, 1) . trim($comma_close); | ||
4458 | fix_insert_line($fixlinenr - 1, $fixedline); | ||
4459 | $fixedline = $rawline; | ||
4460 | $fixedline =~ s/\Q$extracted_string\E\Q$comma_close\E//; | ||
4461 | if ($fixedline !~ /\+\s*$/) { | ||
4462 | fix_insert_line($fixlinenr, $fixedline); | ||
4463 | } | ||
4464 | $last_coalesced_string_linenr = $linenr; | ||
4465 | } | ||
4466 | } | ||
4467 | |||
4468 | # check for missing a space in a string concatenation | ||
4469 | if ($prevrawline =~ /[^\\]\w"$/ && $rawline =~ /^\+[\t ]+"\w/) { | ||
4470 | WARN('MISSING_SPACE', | ||
4471 | "break quoted strings at a space character\n" . $hereprev); | ||
4472 | } | ||
4473 | |||
4474 | # check for spaces before a quoted newline | ||
4475 | if ($rawline =~ /^.*\".*\s\\n/) { | ||
4476 | if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", | ||
4477 | "unnecessary whitespace before a quoted newline\n" . $herecurr) && | ||
4478 | $fix) { | ||
4479 | $fixed[$fixlinenr] =~ s/^(\+.*\".*)\s+\\n/$1\\n/; | ||
4480 | } | ||
4481 | |||
4482 | } | ||
4483 | |||
4411 | # concatenated string without spaces between elements | 4484 | # concatenated string without spaces between elements |
4412 | if ($line =~ /"X+"[A-Z_]+/ || $line =~ /[A-Z_]+"X+"/) { | 4485 | if ($line =~ /"X+"[A-Z_]+/ || $line =~ /[A-Z_]+"X+"/) { |
4413 | CHK("CONCATENATED_STRING", | 4486 | CHK("CONCATENATED_STRING", |
4414 | "Concatenated strings should use spaces between elements\n" . $herecurr); | 4487 | "Concatenated strings should use spaces between elements\n" . $herecurr); |
4415 | } | 4488 | } |
4416 | 4489 | ||
4490 | # uncoalesced string fragments | ||
4491 | if ($line =~ /"X*"\s*"/) { | ||
4492 | WARN("STRING_FRAGMENTS", | ||
4493 | "Consecutive strings are generally better as a single string\n" . $herecurr); | ||
4494 | } | ||
4495 | |||
4496 | # check for %L{u,d,i} in strings | ||
4497 | my $string; | ||
4498 | while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) { | ||
4499 | $string = substr($rawline, $-[1], $+[1] - $-[1]); | ||
4500 | $string =~ s/%%/__/g; | ||
4501 | if ($string =~ /(?<!%)%L[udi]/) { | ||
4502 | WARN("PRINTF_L", | ||
4503 | "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr); | ||
4504 | last; | ||
4505 | } | ||
4506 | } | ||
4507 | |||
4508 | # check for line continuations in quoted strings with odd counts of " | ||
4509 | if ($rawline =~ /\\$/ && $rawline =~ tr/"/"/ % 2) { | ||
4510 | WARN("LINE_CONTINUATIONS", | ||
4511 | "Avoid line continuations in quoted strings\n" . $herecurr); | ||
4512 | } | ||
4513 | |||
4417 | # warn about #if 0 | 4514 | # warn about #if 0 |
4418 | if ($line =~ /^.\s*\#\s*if\s+0\b/) { | 4515 | if ($line =~ /^.\s*\#\s*if\s+0\b/) { |
4419 | CHK("REDUNDANT_CODE", | 4516 | CHK("REDUNDANT_CODE", |
@@ -4426,7 +4523,7 @@ sub process { | |||
4426 | my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;'; | 4523 | my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;'; |
4427 | if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) { | 4524 | if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) { |
4428 | WARN('NEEDLESS_IF', | 4525 | WARN('NEEDLESS_IF', |
4429 | "$1(NULL) is safe this check is probably not required\n" . $hereprev); | 4526 | "$1(NULL) is safe and this check is probably not required\n" . $hereprev); |
4430 | } | 4527 | } |
4431 | } | 4528 | } |
4432 | 4529 | ||
@@ -4458,6 +4555,28 @@ sub process { | |||
4458 | } | 4555 | } |
4459 | } | 4556 | } |
4460 | 4557 | ||
4558 | # check for mask then right shift without parentheses | ||
4559 | if ($^V && $^V ge 5.10.0 && | ||
4560 | $line =~ /$LvalOrFunc\s*\&\s*($LvalOrFunc)\s*>>/ && | ||
4561 | $4 !~ /^\&/) { # $LvalOrFunc may be &foo, ignore if so | ||
4562 | WARN("MASK_THEN_SHIFT", | ||
4563 | "Possible precedence defect with mask then right shift - may need parentheses\n" . $herecurr); | ||
4564 | } | ||
4565 | |||
4566 | # check for pointer comparisons to NULL | ||
4567 | if ($^V && $^V ge 5.10.0) { | ||
4568 | while ($line =~ /\b$LvalOrFunc\s*(==|\!=)\s*NULL\b/g) { | ||
4569 | my $val = $1; | ||
4570 | my $equal = "!"; | ||
4571 | $equal = "" if ($4 eq "!="); | ||
4572 | if (CHK("COMPARISON_TO_NULL", | ||
4573 | "Comparison to NULL could be written \"${equal}${val}\"\n" . $herecurr) && | ||
4574 | $fix) { | ||
4575 | $fixed[$fixlinenr] =~ s/\b\Q$val\E\s*(?:==|\!=)\s*NULL\b/$equal$val/; | ||
4576 | } | ||
4577 | } | ||
4578 | } | ||
4579 | |||
4461 | # check for bad placement of section $InitAttribute (e.g.: __initdata) | 4580 | # check for bad placement of section $InitAttribute (e.g.: __initdata) |
4462 | if ($line =~ /(\b$InitAttribute\b)/) { | 4581 | if ($line =~ /(\b$InitAttribute\b)/) { |
4463 | my $attr = $1; | 4582 | my $attr = $1; |
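
Editor's note: two new checks appear above: a precedence warning for expressions like x & y >> z (which parses as x & (y >> z), rarely what was meant), and a style nit that x == NULL / x != NULL can be written as !x / x. The NULL-comparison fix is again a plain substitution; the sketch below uses a bare identifier where the real check matches the heavier $LvalOrFunc pattern.

    #!/usr/bin/perl
    use strict;
    use warnings;

    my $ident = qr/[A-Za-z_]\w*/;   # simplified stand-in for $LvalOrFunc

    for my $line ("if (ptr == NULL)", "if (ptr != NULL)") {
        my $fixed = $line;
        if ($fixed =~ /\b($ident)\s*(==|\!=)\s*NULL\b/) {
            my ($val, $op) = ($1, $2);
            my $equal = $op eq "!=" ? "" : "!";
            $fixed =~ s/\b\Q$val\E\s*(?:==|\!=)\s*NULL\b/$equal$val/;
        }
        print "$line  ->  $fixed\n";   # -> if (!ptr) / if (ptr)
    }
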
@@ -4652,6 +4771,15 @@ sub process { | |||
4652 | } | 4771 | } |
4653 | } | 4772 | } |
4654 | 4773 | ||
4774 | # Check for __attribute__ weak, or __weak declarations (may have link issues) | ||
4775 | if ($^V && $^V ge 5.10.0 && | ||
4776 | $line =~ /(?:$Declare|$DeclareMisordered)\s*$Ident\s*$balanced_parens\s*(?:$Attribute)?\s*;/ && | ||
4777 | ($line =~ /\b__attribute__\s*\(\s*\(.*\bweak\b/ || | ||
4778 | $line =~ /\b__weak\b/)) { | ||
4779 | ERROR("WEAK_DECLARATION", | ||
4780 | "Using weak declarations can have unintended link defects\n" . $herecurr); | ||
4781 | } | ||
4782 | |||
4655 | # check for sizeof(&) | 4783 | # check for sizeof(&) |
4656 | if ($line =~ /\bsizeof\s*\(\s*\&/) { | 4784 | if ($line =~ /\bsizeof\s*\(\s*\&/) { |
4657 | WARN("SIZEOF_ADDRESS", | 4785 | WARN("SIZEOF_ADDRESS", |
@@ -4667,12 +4795,6 @@ sub process { | |||
4667 | } | 4795 | } |
4668 | } | 4796 | } |
4669 | 4797 | ||
4670 | # check for line continuations in quoted strings with odd counts of " | ||
4671 | if ($rawline =~ /\\$/ && $rawline =~ tr/"/"/ % 2) { | ||
4672 | WARN("LINE_CONTINUATIONS", | ||
4673 | "Avoid line continuations in quoted strings\n" . $herecurr); | ||
4674 | } | ||
4675 | |||
4676 | # check for struct spinlock declarations | 4798 | # check for struct spinlock declarations |
4677 | if ($line =~ /^.\s*\bstruct\s+spinlock\s+\w+\s*;/) { | 4799 | if ($line =~ /^.\s*\bstruct\s+spinlock\s+\w+\s*;/) { |
4678 | WARN("USE_SPINLOCK_T", | 4800 | WARN("USE_SPINLOCK_T", |
@@ -4908,6 +5030,17 @@ sub process { | |||
4908 | } | 5030 | } |
4909 | } | 5031 | } |
4910 | 5032 | ||
5033 | # check for #defines like: 1 << <digit> that could be BIT(digit) | ||
5034 | if ($line =~ /#\s*define\s+\w+\s+\(?\s*1\s*([ulUL]*)\s*\<\<\s*(?:\d+|$Ident)\s*\)?/) { | ||
5035 | my $ull = ""; | ||
5036 | $ull = "_ULL" if (defined($1) && $1 =~ /ll/i); | ||
5037 | if (CHK("BIT_MACRO", | ||
5038 | "Prefer using the BIT$ull macro\n" . $herecurr) && | ||
5039 | $fix) { | ||
5040 | $fixed[$fixlinenr] =~ s/\(?\s*1\s*[ulUL]*\s*<<\s*(\d+|$Ident)\s*\)?/BIT${ull}($1)/; | ||
5041 | } | ||
5042 | } | ||
5043 | |||
4911 | # check for case / default statements not preceded by break/fallthrough/switch | 5044 | # check for case / default statements not preceded by break/fallthrough/switch |
4912 | if ($line =~ /^.\s*(?:case\s+(?:$Ident|$Constant)\s*|default):/) { | 5045 | if ($line =~ /^.\s*(?:case\s+(?:$Ident|$Constant)\s*|default):/) { |
4913 | my $has_break = 0; | 5046 | my $has_break = 0; |
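
Editor's note: the BIT_MACRO check above rewrites #define constants of the form (1 << n) into BIT(n), or BIT_ULL(n) when the literal carries an ll/ULL suffix. A standalone sketch of the same substitution on made-up defines:

    #!/usr/bin/perl
    use strict;
    use warnings;

    my $ident = qr/[A-Za-z_]\w*/;   # simplified stand-in for $Ident

    for my $line ("#define DEV_FLAG_BUSY (1 << 3)",
                  "#define DEV_DMA_MASK  (1ULL << 40)") {
        my $fixed = $line;
        if ($fixed =~ /#\s*define\s+\w+\s+\(?\s*1\s*([ulUL]*)\s*\<\<\s*(?:\d+|$ident)\s*\)?/) {
            my $ull = (defined($1) && $1 =~ /ll/i) ? "_ULL" : "";
            $fixed =~ s/\(?\s*1\s*[ulUL]*\s*<<\s*(\d+|$ident)\s*\)?/BIT${ull}($1)/;
        }
        print "$fixed\n";   # BIT(3) and BIT_ULL(40)
    }
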
@@ -5071,18 +5204,6 @@ sub process { | |||
5071 | "#define of '$1' is wrong - use Kconfig variables or standard guards instead\n" . $herecurr); | 5204 | "#define of '$1' is wrong - use Kconfig variables or standard guards instead\n" . $herecurr); |
5072 | } | 5205 | } |
5073 | 5206 | ||
5074 | # check for %L{u,d,i} in strings | ||
5075 | my $string; | ||
5076 | while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) { | ||
5077 | $string = substr($rawline, $-[1], $+[1] - $-[1]); | ||
5078 | $string =~ s/%%/__/g; | ||
5079 | if ($string =~ /(?<!%)%L[udi]/) { | ||
5080 | WARN("PRINTF_L", | ||
5081 | "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr); | ||
5082 | last; | ||
5083 | } | ||
5084 | } | ||
5085 | |||
5086 | # whine mightly about in_atomic | 5207 | # whine mightly about in_atomic |
5087 | if ($line =~ /\bin_atomic\s*\(/) { | 5208 | if ($line =~ /\bin_atomic\s*\(/) { |
5088 | if ($realfile =~ m@^drivers/@) { | 5209 | if ($realfile =~ m@^drivers/@) { |
diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 70bea942b413..9922e66883a5 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc | |||
@@ -1753,7 +1753,7 @@ sub dump_struct($$) { | |||
1753 | # strip kmemcheck_bitfield_{begin,end}.*; | 1753 | # strip kmemcheck_bitfield_{begin,end}.*; |
1754 | $members =~ s/kmemcheck_bitfield_.*?;//gos; | 1754 | $members =~ s/kmemcheck_bitfield_.*?;//gos; |
1755 | # strip attributes | 1755 | # strip attributes |
1756 | $members =~ s/__aligned\s*\(.+\)//gos; | 1756 | $members =~ s/__aligned\s*\([^;]*\)//gos; |
1757 | 1757 | ||
1758 | create_parameterlist($members, ';', $file); | 1758 | create_parameterlist($members, ';', $file); |
1759 | check_sections($file, $declaration_name, "struct", $sectcheck, $struct_actual, $nested); | 1759 | check_sections($file, $declaration_name, "struct", $sectcheck, $struct_actual, $nested); |
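
Editor's note: the kernel-doc change narrows the attribute-stripping regex. With a greedy .+ (and the /s modifier) the match could run past the end of one __aligned(...) and swallow everything up to a later closing parenthesis, mangling the member list; [^;]* keeps the match inside a single declaration. A small demonstration on an illustrative member string:

    #!/usr/bin/perl
    use strict;
    use warnings;

    my $members = "u8 buf[8] __aligned(8); void (*handler)(int err);";

    (my $old = $members) =~ s/__aligned\s*\(.+\)//gos;     # greedy: also eats the next member
    (my $new = $members) =~ s/__aligned\s*\([^;]*\)//gos;  # bounded: only the attribute goes

    print "old: $old\n";   # u8 buf[8] ;
    print "new: $new\n";   # u8 buf[8] ; void (*handler)(int err);
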