78 files changed, 17513 insertions, 3017 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index f56c7e172cee..5519d257b556 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -2246,6 +2246,14 @@ L: linux-mtd@lists.infradead.org | |||
2246 | T: git git://git.infradead.org/mtd-2.6.git | 2246 | T: git git://git.infradead.org/mtd-2.6.git |
2247 | S: Maintained | 2247 | S: Maintained |
2248 | 2248 | ||
2249 | UNSORTED BLOCK IMAGES (UBI) | ||
2250 | P: Artem Bityutskiy | ||
2251 | M: dedekind@infradead.org | ||
2252 | W: http://www.linux-mtd.infradead.org/ | ||
2253 | L: linux-mtd@lists.infradead.org | ||
2254 | T: git git://git.infradead.org/ubi-2.6.git | ||
2255 | S: Maintained | ||
2256 | |||
2249 | MICROTEK X6 SCANNER | 2257 | MICROTEK X6 SCANNER |
2250 | P: Oliver Neukum | 2258 | P: Oliver Neukum |
2251 | M: oliver@neukum.name | 2259 | M: oliver@neukum.name |
@@ -2972,8 +2980,10 @@ P: Stephen Smalley | |||
2972 | M: sds@tycho.nsa.gov | 2980 | M: sds@tycho.nsa.gov |
2973 | P: James Morris | 2981 | P: James Morris |
2974 | M: jmorris@namei.org | 2982 | M: jmorris@namei.org |
2983 | P: Eric Paris | ||
2984 | M: eparis@parisplace.org | ||
2975 | L: linux-kernel@vger.kernel.org (kernel issues) | 2985 | L: linux-kernel@vger.kernel.org (kernel issues) |
2976 | L: selinux@tycho.nsa.gov (general discussion) | 2986 | L: selinux@tycho.nsa.gov (subscribers-only, general discussion) |
2977 | W: http://www.nsa.gov/selinux | 2987 | W: http://www.nsa.gov/selinux |
2978 | S: Supported | 2988 | S: Supported |
2979 | 2989 | ||
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 26f75c299440..6d1b91bf7ad5 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig | |||
@@ -292,5 +292,7 @@ source "drivers/mtd/nand/Kconfig" | |||
292 | 292 | ||
293 | source "drivers/mtd/onenand/Kconfig" | 293 | source "drivers/mtd/onenand/Kconfig" |
294 | 294 | ||
295 | source "drivers/mtd/ubi/Kconfig" | ||
296 | |||
295 | endmenu | 297 | endmenu |
296 | 298 | ||
diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index c130e6261adf..92055405cb30 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile | |||
@@ -28,3 +28,5 @@ nftl-objs := nftlcore.o nftlmount.o | |||
28 | inftl-objs := inftlcore.o inftlmount.o | 28 | inftl-objs := inftlcore.o inftlmount.o |
29 | 29 | ||
30 | obj-y += chips/ maps/ devices/ nand/ onenand/ | 30 | obj-y += chips/ maps/ devices/ nand/ onenand/ |
31 | |||
32 | obj-$(CONFIG_MTD_UBI) += ubi/ | ||
diff --git a/drivers/mtd/ubi/Kconfig b/drivers/mtd/ubi/Kconfig new file mode 100644 index 000000000000..b9daf159a4a7 --- /dev/null +++ b/drivers/mtd/ubi/Kconfig | |||
@@ -0,0 +1,58 @@ | |||
1 | # drivers/mtd/ubi/Kconfig | ||
2 | |||
3 | menu "UBI - Unsorted block images" | ||
4 | depends on MTD | ||
5 | |||
6 | config MTD_UBI | ||
7 | tristate "Enable UBI" | ||
8 | depends on MTD | ||
9 | select CRC32 | ||
10 | help | ||
11 | UBI is a software layer above the MTD layer which provides LVM-like | ||
12 | logical volumes on top of MTD devices, hides some complexities of | ||
13 | flash chips such as wear and bad blocks, and offers some other useful | ||
14 | capabilities. Please consult the MTD web site for more details | ||
15 | (www.linux-mtd.infradead.org). | ||
16 | |||
17 | config MTD_UBI_WL_THRESHOLD | ||
18 | int "UBI wear-leveling threshold" | ||
19 | default 4096 | ||
20 | range 2 65536 | ||
21 | depends on MTD_UBI | ||
22 | help | ||
23 | This parameter defines the maximum difference between the highest | ||
24 | erase counter value and the lowest erase counter value of eraseblocks | ||
25 | of UBI devices. When this threshold is exceeded, UBI starts performing | ||
26 | wear leveling by moving data from eraseblocks with low erase | ||
27 | counters to eraseblocks with high erase counters. Leave the default | ||
28 | value if unsure. | ||
29 | |||
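As a rough illustration of the condition this option controls (plain C with made-up names, not the actual UBI wear-leveling code in wl.c):

    /* Wear leveling is triggered once the spread between the most-worn
     * and least-worn eraseblocks exceeds the configured threshold. */
    static int need_wear_leveling(int max_ec, int min_ec, int threshold)
    {
            return max_ec - min_ec > threshold;
    }

With the default threshold of 4096, data is moved out of a rarely-erased eraseblock and into a heavily-erased one only once their erase counters differ by more than 4096.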
30 | config MTD_UBI_BEB_RESERVE | ||
31 | int "Percentage of reserved eraseblocks for bad eraseblocks handling" | ||
32 | default 1 | ||
33 | range 0 25 | ||
34 | depends on MTD_UBI | ||
35 | help | ||
36 | If the MTD device can have bad eraseblocks (e.g. NAND flash), UBI | ||
37 | reserves a number of physical eraseblocks for handling new bad | ||
38 | eraseblocks. For example, if a flash physical eraseblock becomes bad, | ||
39 | UBI uses one of the reserved physical eraseblocks to replace it. | ||
40 | This option specifies how many physical eraseblocks will be reserved | ||
41 | for bad eraseblock handling (as a percentage of the total number of | ||
42 | good flash eraseblocks). If the underlying flash cannot have bad | ||
43 | eraseblocks (e.g. NOR flash), this value is ignored and nothing is | ||
44 | reserved. Leave the default value if unsure. | ||
45 | |||
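A small worked example of the reservation arithmetic (a sketch, assuming UBI simply rounds the percentage down; names are illustrative):

    /* With the default 1% reserve, a device with 4096 good physical
     * eraseblocks sets aside 40 of them for later bad-block handling. */
    static int beb_reserved_pebs(int good_peb_count, int reserve_pct)
    {
            return good_peb_count * reserve_pct / 100;
    }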
46 | config MTD_UBI_GLUEBI | ||
47 | bool "Emulate MTD devices" | ||
48 | default n | ||
49 | depends on MTD_UBI | ||
50 | help | ||
51 | This option enables MTD device emulation on top of UBI volumes: for | ||
52 | each UBI volume an MTD device is created, and all I/O to this MTD | ||
53 | device is redirected to the UBI volume. This is handy to make | ||
54 | MTD-oriented software (like JFFS2) work on top of UBI. Do not enable | ||
55 | this if no legacy software will be used. | ||
56 | |||
57 | source "drivers/mtd/ubi/Kconfig.debug" | ||
58 | endmenu | ||
diff --git a/drivers/mtd/ubi/Kconfig.debug b/drivers/mtd/ubi/Kconfig.debug new file mode 100644 index 000000000000..1e2ee22edeff --- /dev/null +++ b/drivers/mtd/ubi/Kconfig.debug | |||
@@ -0,0 +1,104 @@ | |||
1 | comment "UBI debugging options" | ||
2 | depends on MTD_UBI | ||
3 | |||
4 | config MTD_UBI_DEBUG | ||
5 | bool "UBI debugging" | ||
6 | depends on SYSFS | ||
7 | depends on MTD_UBI | ||
8 | select DEBUG_FS | ||
9 | select KALLSYMS_ALL | ||
10 | help | ||
11 | This option enables UBI debugging. | ||
12 | |||
13 | config MTD_UBI_DEBUG_MSG | ||
14 | bool "UBI debugging messages" | ||
15 | depends on MTD_UBI_DEBUG | ||
16 | default n | ||
17 | help | ||
18 | This option enables UBI debugging messages. | ||
19 | |||
20 | config MTD_UBI_DEBUG_PARANOID | ||
21 | bool "Extra self-checks" | ||
22 | default n | ||
23 | depends on MTD_UBI_DEBUG | ||
24 | help | ||
25 | This option enables extra checks in UBI code. Note this slows UBI down | ||
26 | significantly. | ||
27 | |||
28 | config MTD_UBI_DEBUG_DISABLE_BGT | ||
29 | bool "Do not enable the UBI background thread" | ||
30 | depends on MTD_UBI_DEBUG | ||
31 | default n | ||
32 | help | ||
33 | This option switches the background thread off by default. The thread | ||
34 | may also be enabled/disabled via UBI sysfs. | ||
35 | |||
36 | config MTD_UBI_DEBUG_USERSPACE_IO | ||
37 | bool "Direct user-space write/erase support" | ||
38 | default n | ||
39 | depends on MTD_UBI_DEBUG | ||
40 | help | ||
41 | By default, users cannot directly write and erase individual | ||
42 | eraseblocks of dynamic volumes, and have to use the update operation | ||
43 | instead. This option enables this capability - it is very useful for | ||
44 | debugging and testing. | ||
45 | |||
46 | config MTD_UBI_DEBUG_EMULATE_BITFLIPS | ||
47 | bool "Emulate flash bit-flips" | ||
48 | depends on MTD_UBI_DEBUG | ||
49 | default n | ||
50 | help | ||
51 | This option emulates bit-flips with probability 1/50, which in turn | ||
52 | causes scrubbing. Useful for debugging and stressing UBI. | ||
53 | |||
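A user-space flavoured sketch of the idea (the helper name is made up; UBI's real emulation hook lives in its debugging code):

    #include <stdlib.h>

    /* Pretend a read suffered a correctable bit-flip with probability
     * 1/50, so that the caller schedules scrubbing of the eraseblock. */
    static int emulate_bitflip(void)
    {
            return (rand() % 50) == 0;
    }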
54 | config MTD_UBI_DEBUG_EMULATE_WRITE_FAILURES | ||
55 | bool "Emulate flash write failures" | ||
56 | depends on MTD_UBI_DEBUG | ||
57 | default n | ||
58 | help | ||
59 | This option emulates write failures with probability 1/100. Useful for | ||
60 | debugging and testing how UBI handles errors. | ||
61 | |||
62 | config MTD_UBI_DEBUG_EMULATE_ERASE_FAILURES | ||
63 | bool "Emulate flash erase failures" | ||
64 | depends on MTD_UBI_DEBUG | ||
65 | default n | ||
66 | help | ||
67 | This option emulates erase failures with probability 1/100. Useful for | ||
68 | debugging and testing how UBI handles errors. | ||
69 | |||
70 | menu "Additional UBI debugging messages" | ||
71 | depends on MTD_UBI_DEBUG | ||
72 | |||
73 | config MTD_UBI_DEBUG_MSG_BLD | ||
74 | bool "Additional UBI initialization and build messages" | ||
75 | default n | ||
76 | depends on MTD_UBI_DEBUG | ||
77 | help | ||
78 | This option enables detailed UBI initialization and device build | ||
79 | debugging messages. | ||
80 | |||
81 | config MTD_UBI_DEBUG_MSG_EBA | ||
82 | bool "Eraseblock association unit messages" | ||
83 | default n | ||
84 | depends on MTD_UBI_DEBUG | ||
85 | help | ||
86 | This option enables debugging messages from the UBI eraseblock | ||
87 | association unit. | ||
88 | |||
89 | config MTD_UBI_DEBUG_MSG_WL | ||
90 | bool "Wear-leveling unit messages" | ||
91 | default n | ||
92 | depends on MTD_UBI_DEBUG | ||
93 | help | ||
94 | This option enables debugging messages from the UBI wear-leveling | ||
95 | unit. | ||
96 | |||
97 | config MTD_UBI_DEBUG_MSG_IO | ||
98 | bool "Input/output unit messages" | ||
99 | default n | ||
100 | depends on MTD_UBI_DEBUG | ||
101 | help | ||
102 | This option enables debugging messages from the UBI input/output unit. | ||
103 | |||
104 | endmenu # UBI debugging messages | ||
diff --git a/drivers/mtd/ubi/Makefile b/drivers/mtd/ubi/Makefile new file mode 100644 index 000000000000..dd834e04151b --- /dev/null +++ b/drivers/mtd/ubi/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | obj-$(CONFIG_MTD_UBI) += ubi.o | ||
2 | |||
3 | ubi-y += vtbl.o vmt.o upd.o build.o cdev.o kapi.o eba.o io.o wl.o scan.o | ||
4 | ubi-y += misc.o | ||
5 | |||
6 | ubi-$(CONFIG_MTD_UBI_DEBUG) += debug.o | ||
7 | ubi-$(CONFIG_MTD_UBI_GLUEBI) += gluebi.o | ||
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c new file mode 100644 index 000000000000..555d594d1811 --- /dev/null +++ b/drivers/mtd/ubi/build.c | |||
@@ -0,0 +1,848 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * Copyright (c) Nokia Corporation, 2007 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | * | ||
19 | * Author: Artem Bityutskiy (Битюцкий Артём), | ||
20 | * Frank Haverkamp | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * This file includes UBI initialization and building of UBI devices. At the | ||
25 | * moment UBI devices may only be added while UBI is initialized, but dynamic | ||
26 | * device add/remove functionality is planned. Also, at the moment we only | ||
27 | * attach UBI devices by scanning, which will become a bottleneck when flashes | ||
28 | reach a certain large size. Then one may improve UBI and add other methods. | ||
29 | */ | ||
30 | |||
31 | #include <linux/err.h> | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/moduleparam.h> | ||
34 | #include <linux/stringify.h> | ||
35 | #include <linux/stat.h> | ||
36 | #include "ubi.h" | ||
37 | |||
38 | /* Maximum length of the 'mtd=' parameter */ | ||
39 | #define MTD_PARAM_LEN_MAX 64 | ||
40 | |||
41 | /** | ||
42 | * struct mtd_dev_param - MTD device parameter description data structure. | ||
43 | * @name: MTD device name or number string | ||
44 | * @vid_hdr_offs: VID header offset | ||
45 | * @data_offs: data offset | ||
46 | */ | ||
47 | struct mtd_dev_param | ||
48 | { | ||
49 | char name[MTD_PARAM_LEN_MAX]; | ||
50 | int vid_hdr_offs; | ||
51 | int data_offs; | ||
52 | }; | ||
53 | |||
54 | /* Number of elements set in the @mtd_dev_param array */ | ||
55 | static int mtd_devs = 0; | ||
56 | |||
57 | /* MTD devices specification parameters */ | ||
58 | static struct mtd_dev_param mtd_dev_param[UBI_MAX_DEVICES]; | ||
59 | |||
60 | /* Number of UBI devices in system */ | ||
61 | int ubi_devices_cnt; | ||
62 | |||
63 | /* All UBI devices in system */ | ||
64 | struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; | ||
65 | |||
66 | /* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */ | ||
67 | struct class *ubi_class; | ||
68 | |||
69 | /* "Show" method for files in '/<sysfs>/class/ubi/' */ | ||
70 | static ssize_t ubi_version_show(struct class *class, char *buf) | ||
71 | { | ||
72 | return sprintf(buf, "%d\n", UBI_VERSION); | ||
73 | } | ||
74 | |||
75 | /* UBI version attribute ('/<sysfs>/class/ubi/version') */ | ||
76 | static struct class_attribute ubi_version = | ||
77 | __ATTR(version, S_IRUGO, ubi_version_show, NULL); | ||
78 | |||
79 | static ssize_t dev_attribute_show(struct device *dev, | ||
80 | struct device_attribute *attr, char *buf); | ||
81 | |||
82 | /* UBI device attributes (correspond to files in '/<sysfs>/class/ubi/ubiX') */ | ||
83 | static struct device_attribute dev_eraseblock_size = | ||
84 | __ATTR(eraseblock_size, S_IRUGO, dev_attribute_show, NULL); | ||
85 | static struct device_attribute dev_avail_eraseblocks = | ||
86 | __ATTR(avail_eraseblocks, S_IRUGO, dev_attribute_show, NULL); | ||
87 | static struct device_attribute dev_total_eraseblocks = | ||
88 | __ATTR(total_eraseblocks, S_IRUGO, dev_attribute_show, NULL); | ||
89 | static struct device_attribute dev_volumes_count = | ||
90 | __ATTR(volumes_count, S_IRUGO, dev_attribute_show, NULL); | ||
91 | static struct device_attribute dev_max_ec = | ||
92 | __ATTR(max_ec, S_IRUGO, dev_attribute_show, NULL); | ||
93 | static struct device_attribute dev_reserved_for_bad = | ||
94 | __ATTR(reserved_for_bad, S_IRUGO, dev_attribute_show, NULL); | ||
95 | static struct device_attribute dev_bad_peb_count = | ||
96 | __ATTR(bad_peb_count, S_IRUGO, dev_attribute_show, NULL); | ||
97 | static struct device_attribute dev_max_vol_count = | ||
98 | __ATTR(max_vol_count, S_IRUGO, dev_attribute_show, NULL); | ||
99 | static struct device_attribute dev_min_io_size = | ||
100 | __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL); | ||
101 | static struct device_attribute dev_bgt_enabled = | ||
102 | __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL); | ||
103 | |||
104 | /* "Show" method for files in '/<sysfs>/class/ubi/ubiX/' */ | ||
105 | static ssize_t dev_attribute_show(struct device *dev, | ||
106 | struct device_attribute *attr, char *buf) | ||
107 | { | ||
108 | const struct ubi_device *ubi; | ||
109 | |||
110 | ubi = container_of(dev, struct ubi_device, dev); | ||
111 | if (attr == &dev_eraseblock_size) | ||
112 | return sprintf(buf, "%d\n", ubi->leb_size); | ||
113 | else if (attr == &dev_avail_eraseblocks) | ||
114 | return sprintf(buf, "%d\n", ubi->avail_pebs); | ||
115 | else if (attr == &dev_total_eraseblocks) | ||
116 | return sprintf(buf, "%d\n", ubi->good_peb_count); | ||
117 | else if (attr == &dev_volumes_count) | ||
118 | return sprintf(buf, "%d\n", ubi->vol_count); | ||
119 | else if (attr == &dev_max_ec) | ||
120 | return sprintf(buf, "%d\n", ubi->max_ec); | ||
121 | else if (attr == &dev_reserved_for_bad) | ||
122 | return sprintf(buf, "%d\n", ubi->beb_rsvd_pebs); | ||
123 | else if (attr == &dev_bad_peb_count) | ||
124 | return sprintf(buf, "%d\n", ubi->bad_peb_count); | ||
125 | else if (attr == &dev_max_vol_count) | ||
126 | return sprintf(buf, "%d\n", ubi->vtbl_slots); | ||
127 | else if (attr == &dev_min_io_size) | ||
128 | return sprintf(buf, "%d\n", ubi->min_io_size); | ||
129 | else if (attr == &dev_bgt_enabled) | ||
130 | return sprintf(buf, "%d\n", ubi->thread_enabled); | ||
131 | else | ||
132 | BUG(); | ||
133 | |||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | /* Fake "release" method for UBI devices */ | ||
138 | static void dev_release(struct device *dev) { } | ||
139 | |||
140 | /** | ||
141 | * ubi_sysfs_init - initialize sysfs for an UBI device. | ||
142 | * @ubi: UBI device description object | ||
143 | * | ||
144 | * This function returns zero in case of success and a negative error code in | ||
145 | * case of failure. | ||
146 | */ | ||
147 | static int ubi_sysfs_init(struct ubi_device *ubi) | ||
148 | { | ||
149 | int err; | ||
150 | |||
151 | ubi->dev.release = dev_release; | ||
152 | ubi->dev.devt = MKDEV(ubi->major, 0); | ||
153 | ubi->dev.class = ubi_class; | ||
154 | sprintf(&ubi->dev.bus_id[0], UBI_NAME_STR"%d", ubi->ubi_num); | ||
155 | err = device_register(&ubi->dev); | ||
156 | if (err) | ||
157 | goto out; | ||
158 | |||
159 | err = device_create_file(&ubi->dev, &dev_eraseblock_size); | ||
160 | if (err) | ||
161 | goto out_unregister; | ||
162 | err = device_create_file(&ubi->dev, &dev_avail_eraseblocks); | ||
163 | if (err) | ||
164 | goto out_eraseblock_size; | ||
165 | err = device_create_file(&ubi->dev, &dev_total_eraseblocks); | ||
166 | if (err) | ||
167 | goto out_avail_eraseblocks; | ||
168 | err = device_create_file(&ubi->dev, &dev_volumes_count); | ||
169 | if (err) | ||
170 | goto out_total_eraseblocks; | ||
171 | err = device_create_file(&ubi->dev, &dev_max_ec); | ||
172 | if (err) | ||
173 | goto out_volumes_count; | ||
174 | err = device_create_file(&ubi->dev, &dev_reserved_for_bad); | ||
175 | if (err) | ||
176 | goto out_volumes_max_ec; | ||
177 | err = device_create_file(&ubi->dev, &dev_bad_peb_count); | ||
178 | if (err) | ||
179 | goto out_reserved_for_bad; | ||
180 | err = device_create_file(&ubi->dev, &dev_max_vol_count); | ||
181 | if (err) | ||
182 | goto out_bad_peb_count; | ||
183 | err = device_create_file(&ubi->dev, &dev_min_io_size); | ||
184 | if (err) | ||
185 | goto out_max_vol_count; | ||
186 | err = device_create_file(&ubi->dev, &dev_bgt_enabled); | ||
187 | if (err) | ||
188 | goto out_min_io_size; | ||
189 | |||
190 | return 0; | ||
191 | |||
192 | out_min_io_size: | ||
193 | device_remove_file(&ubi->dev, &dev_min_io_size); | ||
194 | out_max_vol_count: | ||
195 | device_remove_file(&ubi->dev, &dev_max_vol_count); | ||
196 | out_bad_peb_count: | ||
197 | device_remove_file(&ubi->dev, &dev_bad_peb_count); | ||
198 | out_reserved_for_bad: | ||
199 | device_remove_file(&ubi->dev, &dev_reserved_for_bad); | ||
200 | out_volumes_max_ec: | ||
201 | device_remove_file(&ubi->dev, &dev_max_ec); | ||
202 | out_volumes_count: | ||
203 | device_remove_file(&ubi->dev, &dev_volumes_count); | ||
204 | out_total_eraseblocks: | ||
205 | device_remove_file(&ubi->dev, &dev_total_eraseblocks); | ||
206 | out_avail_eraseblocks: | ||
207 | device_remove_file(&ubi->dev, &dev_avail_eraseblocks); | ||
208 | out_eraseblock_size: | ||
209 | device_remove_file(&ubi->dev, &dev_eraseblock_size); | ||
210 | out_unregister: | ||
211 | device_unregister(&ubi->dev); | ||
212 | out: | ||
213 | ubi_err("failed to initialize sysfs for %s", ubi->ubi_name); | ||
214 | return err; | ||
215 | } | ||
216 | |||
217 | /** | ||
218 | * ubi_sysfs_close - close sysfs for an UBI device. | ||
219 | * @ubi: UBI device description object | ||
220 | */ | ||
221 | static void ubi_sysfs_close(struct ubi_device *ubi) | ||
222 | { | ||
223 | device_remove_file(&ubi->dev, &dev_bgt_enabled); | ||
224 | device_remove_file(&ubi->dev, &dev_min_io_size); | ||
225 | device_remove_file(&ubi->dev, &dev_max_vol_count); | ||
226 | device_remove_file(&ubi->dev, &dev_bad_peb_count); | ||
227 | device_remove_file(&ubi->dev, &dev_reserved_for_bad); | ||
228 | device_remove_file(&ubi->dev, &dev_max_ec); | ||
229 | device_remove_file(&ubi->dev, &dev_volumes_count); | ||
230 | device_remove_file(&ubi->dev, &dev_total_eraseblocks); | ||
231 | device_remove_file(&ubi->dev, &dev_avail_eraseblocks); | ||
232 | device_remove_file(&ubi->dev, &dev_eraseblock_size); | ||
233 | device_unregister(&ubi->dev); | ||
234 | } | ||
235 | |||
236 | /** | ||
237 | * kill_volumes - destroy all volumes. | ||
238 | * @ubi: UBI device description object | ||
239 | */ | ||
240 | static void kill_volumes(struct ubi_device *ubi) | ||
241 | { | ||
242 | int i; | ||
243 | |||
244 | for (i = 0; i < ubi->vtbl_slots; i++) | ||
245 | if (ubi->volumes[i]) | ||
246 | ubi_free_volume(ubi, i); | ||
247 | } | ||
248 | |||
249 | /** | ||
250 | * uif_init - initialize user interfaces for an UBI device. | ||
251 | * @ubi: UBI device description object | ||
252 | * | ||
253 | * This function returns zero in case of success and a negative error code in | ||
254 | * case of failure. | ||
255 | */ | ||
256 | static int uif_init(struct ubi_device *ubi) | ||
257 | { | ||
258 | int i, err; | ||
259 | dev_t dev; | ||
260 | |||
261 | mutex_init(&ubi->vtbl_mutex); | ||
262 | spin_lock_init(&ubi->volumes_lock); | ||
263 | |||
264 | sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num); | ||
265 | |||
266 | /* | ||
267 | * Major numbers for the UBI character devices are allocated | ||
268 | * dynamically. Major numbers of volume character devices are | ||
269 | * equivalent to ones of the corresponding UBI character device. Minor | ||
270 | * numbers of UBI character devices are 0, while minor numbers of | ||
271 | * volume character devices start from 1. Thus, we allocate one major | ||
272 | * number and ubi->vtbl_slots + 1 minor numbers. | ||
273 | */ | ||
274 | err = alloc_chrdev_region(&dev, 0, ubi->vtbl_slots + 1, ubi->ubi_name); | ||
275 | if (err) { | ||
276 | ubi_err("cannot register UBI character devices"); | ||
277 | return err; | ||
278 | } | ||
279 | |||
280 | cdev_init(&ubi->cdev, &ubi_cdev_operations); | ||
281 | ubi->major = MAJOR(dev); | ||
282 | dbg_msg("%s major is %u", ubi->ubi_name, ubi->major); | ||
283 | ubi->cdev.owner = THIS_MODULE; | ||
284 | |||
285 | dev = MKDEV(ubi->major, 0); | ||
286 | err = cdev_add(&ubi->cdev, dev, 1); | ||
287 | if (err) { | ||
288 | ubi_err("cannot add character device %s", ubi->ubi_name); | ||
289 | goto out_unreg; | ||
290 | } | ||
291 | |||
292 | err = ubi_sysfs_init(ubi); | ||
293 | if (err) | ||
294 | goto out_cdev; | ||
295 | |||
296 | for (i = 0; i < ubi->vtbl_slots; i++) | ||
297 | if (ubi->volumes[i]) { | ||
298 | err = ubi_add_volume(ubi, i); | ||
299 | if (err) | ||
300 | goto out_volumes; | ||
301 | } | ||
302 | |||
303 | return 0; | ||
304 | |||
305 | out_volumes: | ||
306 | kill_volumes(ubi); | ||
307 | ubi_sysfs_close(ubi); | ||
308 | out_cdev: | ||
309 | cdev_del(&ubi->cdev); | ||
310 | out_unreg: | ||
311 | unregister_chrdev_region(MKDEV(ubi->major, 0), | ||
312 | ubi->vtbl_slots + 1); | ||
313 | return err; | ||
314 | } | ||
315 | |||
316 | /** | ||
317 | * uif_close - close user interfaces for an UBI device. | ||
318 | * @ubi: UBI device description object | ||
319 | */ | ||
320 | static void uif_close(struct ubi_device *ubi) | ||
321 | { | ||
322 | kill_volumes(ubi); | ||
323 | ubi_sysfs_close(ubi); | ||
324 | cdev_del(&ubi->cdev); | ||
325 | unregister_chrdev_region(MKDEV(ubi->major, 0), ubi->vtbl_slots + 1); | ||
326 | } | ||
327 | |||
328 | /** | ||
329 | * attach_by_scanning - attach an MTD device using scanning method. | ||
330 | * @ubi: UBI device descriptor | ||
331 | * | ||
332 | * This function returns zero in case of success and a negative error code in | ||
333 | * case of failure. | ||
334 | * | ||
335 | * Note, currently this is the only method to attach UBI devices. Hopefully in | ||
336 | * the future we'll have more scalable attaching methods and avoid full media | ||
337 | * scanning. But even in this case scanning will be needed as a fall-back | ||
338 | * attaching method if there are some on-flash table corruptions. | ||
339 | */ | ||
340 | static int attach_by_scanning(struct ubi_device *ubi) | ||
341 | { | ||
342 | int err; | ||
343 | struct ubi_scan_info *si; | ||
344 | |||
345 | si = ubi_scan(ubi); | ||
346 | if (IS_ERR(si)) | ||
347 | return PTR_ERR(si); | ||
348 | |||
349 | ubi->bad_peb_count = si->bad_peb_count; | ||
350 | ubi->good_peb_count = ubi->peb_count - ubi->bad_peb_count; | ||
351 | ubi->max_ec = si->max_ec; | ||
352 | ubi->mean_ec = si->mean_ec; | ||
353 | |||
354 | err = ubi_read_volume_table(ubi, si); | ||
355 | if (err) | ||
356 | goto out_si; | ||
357 | |||
358 | err = ubi_wl_init_scan(ubi, si); | ||
359 | if (err) | ||
360 | goto out_vtbl; | ||
361 | |||
362 | err = ubi_eba_init_scan(ubi, si); | ||
363 | if (err) | ||
364 | goto out_wl; | ||
365 | |||
366 | ubi_scan_destroy_si(si); | ||
367 | return 0; | ||
368 | |||
369 | out_wl: | ||
370 | ubi_wl_close(ubi); | ||
371 | out_vtbl: | ||
372 | kfree(ubi->vtbl); | ||
373 | out_si: | ||
374 | ubi_scan_destroy_si(si); | ||
375 | return err; | ||
376 | } | ||
377 | |||
378 | /** | ||
379 | * io_init - initialize I/O unit for a given UBI device. | ||
380 | * @ubi: UBI device description object | ||
381 | * | ||
382 | * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are | ||
383 | * assumed: | ||
384 | * o EC header is always at offset zero - this cannot be changed; | ||
385 | * o VID header starts just after the EC header at the closest address | ||
386 | * aligned to @ubi->hdrs_min_io_size; | ||
387 | * o data starts just after the VID header at the closest address aligned to | ||
388 | * @ubi->min_io_size. | ||
389 | * | ||
390 | * This function returns zero in case of success and a negative error code in | ||
391 | * case of failure. | ||
392 | */ | ||
393 | static int io_init(struct ubi_device *ubi) | ||
394 | { | ||
395 | if (ubi->mtd->numeraseregions != 0) { | ||
396 | /* | ||
397 | * Some flashes have several erase regions. Different regions | ||
398 | * may have different eraseblock size and other | ||
399 | * characteristics. It looks like mostly multi-region flashes | ||
400 | * have one "main" region and one or more small regions to | ||
401 | * store boot loader code or boot parameters or whatever. I | ||
402 | * guess we should just pick the largest region. But this is | ||
403 | * not implemented. | ||
404 | */ | ||
405 | ubi_err("multiple regions, not implemented"); | ||
406 | return -EINVAL; | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * Note, in this implementation we support MTD devices with 0x7FFFFFFF | ||
411 | * physical eraseblocks maximum. | ||
412 | */ | ||
413 | |||
414 | ubi->peb_size = ubi->mtd->erasesize; | ||
415 | ubi->peb_count = ubi->mtd->size / ubi->mtd->erasesize; | ||
416 | ubi->flash_size = ubi->mtd->size; | ||
417 | |||
418 | if (ubi->mtd->block_isbad && ubi->mtd->block_markbad) | ||
419 | ubi->bad_allowed = 1; | ||
420 | |||
421 | ubi->min_io_size = ubi->mtd->writesize; | ||
422 | ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft; | ||
423 | |||
424 | /* Make sure minimal I/O unit is power of 2 */ | ||
425 | if (ubi->min_io_size == 0 || | ||
426 | (ubi->min_io_size & (ubi->min_io_size - 1))) { | ||
427 | ubi_err("bad min. I/O unit"); | ||
428 | return -EINVAL; | ||
429 | } | ||
430 | |||
431 | ubi_assert(ubi->hdrs_min_io_size > 0); | ||
432 | ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size); | ||
433 | ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0); | ||
434 | |||
435 | /* Calculate default aligned sizes of EC and VID headers */ | ||
436 | ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); | ||
437 | ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); | ||
438 | |||
439 | dbg_msg("min_io_size %d", ubi->min_io_size); | ||
440 | dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size); | ||
441 | dbg_msg("ec_hdr_alsize %d", ubi->ec_hdr_alsize); | ||
442 | dbg_msg("vid_hdr_alsize %d", ubi->vid_hdr_alsize); | ||
443 | |||
444 | if (ubi->vid_hdr_offset == 0) | ||
445 | /* Default offset */ | ||
446 | ubi->vid_hdr_offset = ubi->vid_hdr_aloffset = | ||
447 | ubi->ec_hdr_alsize; | ||
448 | else { | ||
449 | ubi->vid_hdr_aloffset = ubi->vid_hdr_offset & | ||
450 | ~(ubi->hdrs_min_io_size - 1); | ||
451 | ubi->vid_hdr_shift = ubi->vid_hdr_offset - | ||
452 | ubi->vid_hdr_aloffset; | ||
453 | } | ||
454 | |||
455 | /* Similar for the data offset */ | ||
456 | if (ubi->leb_start == 0) { | ||
457 | ubi->leb_start = ubi->vid_hdr_offset + ubi->vid_hdr_alsize; | ||
458 | ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); | ||
459 | } | ||
460 | |||
461 | dbg_msg("vid_hdr_offset %d", ubi->vid_hdr_offset); | ||
462 | dbg_msg("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset); | ||
463 | dbg_msg("vid_hdr_shift %d", ubi->vid_hdr_shift); | ||
464 | dbg_msg("leb_start %d", ubi->leb_start); | ||
465 | |||
466 | /* The shift must be aligned to 32-bit boundary */ | ||
467 | if (ubi->vid_hdr_shift % 4) { | ||
468 | ubi_err("unaligned VID header shift %d", | ||
469 | ubi->vid_hdr_shift); | ||
470 | return -EINVAL; | ||
471 | } | ||
472 | |||
473 | /* Check sanity */ | ||
474 | if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE || | ||
475 | ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE || | ||
476 | ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE || | ||
477 | ubi->leb_start % ubi->min_io_size) { | ||
478 | ubi_err("bad VID header (%d) or data offsets (%d)", | ||
479 | ubi->vid_hdr_offset, ubi->leb_start); | ||
480 | return -EINVAL; | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * It may happen that EC and VID headers are situated in one minimal | ||
485 | * I/O unit. In this case we can only accept this UBI image in | ||
486 | * read-only mode. | ||
487 | */ | ||
488 | if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) { | ||
489 | ubi_warn("EC and VID headers are in the same minimal I/O unit, " | ||
490 | "switch to read-only mode"); | ||
491 | ubi->ro_mode = 1; | ||
492 | } | ||
493 | |||
494 | ubi->leb_size = ubi->peb_size - ubi->leb_start; | ||
495 | |||
496 | if (!(ubi->mtd->flags & MTD_WRITEABLE)) { | ||
497 | ubi_msg("MTD device %d is write-protected, attach in " | ||
498 | "read-only mode", ubi->mtd->index); | ||
499 | ubi->ro_mode = 1; | ||
500 | } | ||
501 | |||
502 | dbg_msg("leb_size %d", ubi->leb_size); | ||
503 | dbg_msg("ro_mode %d", ubi->ro_mode); | ||
504 | |||
505 | /* | ||
506 | * Note, ideally, we have to initialize ubi->bad_peb_count here. But | ||
507 | * unfortunately, MTD does not provide this information. We would have | ||
508 | * to loop over all physical eraseblocks and invoke mtd->block_is_bad() | ||
509 | * for each of them. So, we leave ubi->bad_peb_count | ||
510 | * uninitialized and initialize it after scanning. | ||
511 | */ | ||
512 | |||
513 | return 0; | ||
514 | } | ||
515 | |||
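To make the default layout described in the io_init() comment concrete, here is a worked example under an assumed geometry: a NAND flash with 128 KiB physical eraseblocks, 2048-byte pages and 512-byte sub-pages, so min_io_size = 2048 and hdrs_min_io_size = 512 (both headers are 64 bytes, as the BUILD_BUG_ON checks in ubi_init() assert):

    ec_hdr_alsize  = ALIGN(64, 512)         = 512    (EC header sits at offset 0)
    vid_hdr_offset = ec_hdr_alsize          = 512    (default VID header offset)
    vid_hdr_alsize = ALIGN(64, 512)         = 512
    leb_start      = ALIGN(512 + 512, 2048) = 2048   (start of user data)
    leb_size       = 131072 - 2048          = 129024 (usable bytes per eraseblock)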
516 | /** | ||
517 | * attach_mtd_dev - attach an MTD device. | ||
518 | * @mtd_dev: MTD device name or number string | ||
519 | * @vid_hdr_offset: VID header offset | ||
520 | * @data_offset: data offset | ||
521 | * | ||
522 | * This function attaches an MTD device to UBI. It first treats @mtd_dev as the | ||
523 | MTD device name, and tries to open it by this name. If that fails, it | ||
524 | tries to convert @mtd_dev to an integer and open the MTD device by its | ||
525 | * number. Returns zero in case of success and a negative error code in case of | ||
526 | * failure. | ||
527 | */ | ||
528 | static int attach_mtd_dev(const char *mtd_dev, int vid_hdr_offset, | ||
529 | int data_offset) | ||
530 | { | ||
531 | struct ubi_device *ubi; | ||
532 | struct mtd_info *mtd; | ||
533 | int i, err; | ||
534 | |||
535 | mtd = get_mtd_device_nm(mtd_dev); | ||
536 | if (IS_ERR(mtd)) { | ||
537 | int mtd_num; | ||
538 | char *endp; | ||
539 | |||
540 | if (PTR_ERR(mtd) != -ENODEV) | ||
541 | return PTR_ERR(mtd); | ||
542 | |||
543 | /* | ||
544 | * Probably this is not an MTD device name but an MTD device number - | ||
545 | * check this out. | ||
546 | */ | ||
547 | mtd_num = simple_strtoul(mtd_dev, &endp, 0); | ||
548 | if (*endp != '\0' || mtd_dev == endp) { | ||
549 | ubi_err("incorrect MTD device: \"%s\"", mtd_dev); | ||
550 | return -ENODEV; | ||
551 | } | ||
552 | |||
553 | mtd = get_mtd_device(NULL, mtd_num); | ||
554 | if (IS_ERR(mtd)) | ||
555 | return PTR_ERR(mtd); | ||
556 | } | ||
557 | |||
558 | /* Check if we already have the same MTD device attached */ | ||
559 | for (i = 0; i < ubi_devices_cnt; i++) | ||
560 | if (ubi_devices[i]->mtd->index == mtd->index) { | ||
561 | ubi_err("mtd%d is already attached to ubi%d", | ||
562 | mtd->index, i); | ||
563 | err = -EINVAL; | ||
564 | goto out_mtd; | ||
565 | } | ||
566 | |||
567 | ubi = ubi_devices[ubi_devices_cnt] = kzalloc(sizeof(struct ubi_device), | ||
568 | GFP_KERNEL); | ||
569 | if (!ubi) { | ||
570 | err = -ENOMEM; | ||
571 | goto out_mtd; | ||
572 | } | ||
573 | |||
574 | ubi->ubi_num = ubi_devices_cnt; | ||
575 | ubi->mtd = mtd; | ||
576 | |||
577 | dbg_msg("attaching mtd%d to ubi%d: VID header offset %d data offset %d", | ||
578 | ubi->mtd->index, ubi_devices_cnt, vid_hdr_offset, data_offset); | ||
579 | |||
580 | ubi->vid_hdr_offset = vid_hdr_offset; | ||
581 | ubi->leb_start = data_offset; | ||
582 | err = io_init(ubi); | ||
583 | if (err) | ||
584 | goto out_free; | ||
585 | |||
586 | err = attach_by_scanning(ubi); | ||
587 | if (err) { | ||
588 | dbg_err("failed to attach by scanning, error %d", err); | ||
589 | goto out_free; | ||
590 | } | ||
591 | |||
592 | err = uif_init(ubi); | ||
593 | if (err) | ||
594 | goto out_detach; | ||
595 | |||
596 | ubi_devices_cnt += 1; | ||
597 | |||
598 | ubi_msg("attached mtd%d to ubi%d", ubi->mtd->index, ubi_devices_cnt); | ||
599 | ubi_msg("MTD device name: \"%s\"", ubi->mtd->name); | ||
600 | ubi_msg("MTD device size: %llu MiB", ubi->flash_size >> 20); | ||
601 | ubi_msg("physical eraseblock size: %d bytes (%d KiB)", | ||
602 | ubi->peb_size, ubi->peb_size >> 10); | ||
603 | ubi_msg("logical eraseblock size: %d bytes", ubi->leb_size); | ||
604 | ubi_msg("number of good PEBs: %d", ubi->good_peb_count); | ||
605 | ubi_msg("number of bad PEBs: %d", ubi->bad_peb_count); | ||
606 | ubi_msg("smallest flash I/O unit: %d", ubi->min_io_size); | ||
607 | ubi_msg("VID header offset: %d (aligned %d)", | ||
608 | ubi->vid_hdr_offset, ubi->vid_hdr_aloffset); | ||
609 | ubi_msg("data offset: %d", ubi->leb_start); | ||
610 | ubi_msg("max. allowed volumes: %d", ubi->vtbl_slots); | ||
611 | ubi_msg("wear-leveling threshold: %d", CONFIG_MTD_UBI_WL_THRESHOLD); | ||
612 | ubi_msg("number of internal volumes: %d", UBI_INT_VOL_COUNT); | ||
613 | ubi_msg("number of user volumes: %d", | ||
614 | ubi->vol_count - UBI_INT_VOL_COUNT); | ||
615 | ubi_msg("available PEBs: %d", ubi->avail_pebs); | ||
616 | ubi_msg("total number of reserved PEBs: %d", ubi->rsvd_pebs); | ||
617 | ubi_msg("number of PEBs reserved for bad PEB handling: %d", | ||
618 | ubi->beb_rsvd_pebs); | ||
619 | ubi_msg("max/mean erase counter: %d/%d", ubi->max_ec, ubi->mean_ec); | ||
620 | |||
621 | /* Enable the background thread */ | ||
622 | if (!DBG_DISABLE_BGT) { | ||
623 | ubi->thread_enabled = 1; | ||
624 | wake_up_process(ubi->bgt_thread); | ||
625 | } | ||
626 | |||
627 | return 0; | ||
628 | |||
629 | out_detach: | ||
630 | ubi_eba_close(ubi); | ||
631 | ubi_wl_close(ubi); | ||
632 | kfree(ubi->vtbl); | ||
633 | out_free: | ||
634 | kfree(ubi); | ||
635 | out_mtd: | ||
636 | put_mtd_device(mtd); | ||
637 | ubi_devices[ubi_devices_cnt] = NULL; | ||
638 | return err; | ||
639 | } | ||
640 | |||
641 | /** | ||
642 | * detach_mtd_dev - detach an MTD device. | ||
643 | * @ubi: UBI device description object | ||
644 | */ | ||
645 | static void detach_mtd_dev(struct ubi_device *ubi) | ||
646 | { | ||
647 | int ubi_num = ubi->ubi_num, mtd_num = ubi->mtd->index; | ||
648 | |||
649 | dbg_msg("detaching mtd%d from ubi%d", ubi->mtd->index, ubi_num); | ||
650 | uif_close(ubi); | ||
651 | ubi_eba_close(ubi); | ||
652 | ubi_wl_close(ubi); | ||
653 | kfree(ubi->vtbl); | ||
654 | put_mtd_device(ubi->mtd); | ||
655 | kfree(ubi_devices[ubi_num]); | ||
656 | ubi_devices[ubi_num] = NULL; | ||
657 | ubi_devices_cnt -= 1; | ||
658 | ubi_assert(ubi_devices_cnt >= 0); | ||
659 | ubi_msg("mtd%d is detached from ubi%d", mtd_num, ubi_num); | ||
660 | } | ||
661 | |||
662 | static int __init ubi_init(void) | ||
663 | { | ||
664 | int err, i, k; | ||
665 | |||
666 | /* Ensure that EC and VID headers have correct size */ | ||
667 | BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); | ||
668 | BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); | ||
669 | |||
670 | if (mtd_devs > UBI_MAX_DEVICES) { | ||
671 | printk("UBI error: too many MTD devices, maximum is %d\n", | ||
672 | UBI_MAX_DEVICES); | ||
673 | return -EINVAL; | ||
674 | } | ||
675 | |||
676 | ubi_class = class_create(THIS_MODULE, UBI_NAME_STR); | ||
677 | if (IS_ERR(ubi_class)) | ||
678 | return PTR_ERR(ubi_class); | ||
679 | |||
680 | err = class_create_file(ubi_class, &ubi_version); | ||
681 | if (err) | ||
682 | goto out_class; | ||
683 | |||
684 | /* Attach MTD devices */ | ||
685 | for (i = 0; i < mtd_devs; i++) { | ||
686 | struct mtd_dev_param *p = &mtd_dev_param[i]; | ||
687 | |||
688 | cond_resched(); | ||
689 | |||
690 | if (!p->name) { | ||
691 | dbg_err("empty name"); | ||
692 | err = -EINVAL; | ||
693 | goto out_detach; | ||
694 | } | ||
695 | |||
696 | err = attach_mtd_dev(p->name, p->vid_hdr_offs, p->data_offs); | ||
697 | if (err) | ||
698 | goto out_detach; | ||
699 | } | ||
700 | |||
701 | return 0; | ||
702 | |||
703 | out_detach: | ||
704 | for (k = 0; k < i; k++) | ||
705 | detach_mtd_dev(ubi_devices[k]); | ||
706 | class_remove_file(ubi_class, &ubi_version); | ||
707 | out_class: | ||
708 | class_destroy(ubi_class); | ||
709 | return err; | ||
710 | } | ||
711 | module_init(ubi_init); | ||
712 | |||
713 | static void __exit ubi_exit(void) | ||
714 | { | ||
715 | int i, n = ubi_devices_cnt; | ||
716 | |||
717 | for (i = 0; i < n; i++) | ||
718 | detach_mtd_dev(ubi_devices[i]); | ||
719 | class_remove_file(ubi_class, &ubi_version); | ||
720 | class_destroy(ubi_class); | ||
721 | } | ||
722 | module_exit(ubi_exit); | ||
723 | |||
724 | /** | ||
725 | * bytes_str_to_int - convert a string representing a number of bytes to an | ||
726 | * integer. | ||
727 | * @str: the string to convert | ||
728 | * | ||
729 | * This function returns the resulting positive integer in case of success and a | ||
730 | * negative error code in case of failure. | ||
731 | */ | ||
732 | static int __init bytes_str_to_int(const char *str) | ||
733 | { | ||
734 | char *endp; | ||
735 | unsigned long result; | ||
736 | |||
737 | result = simple_strtoul(str, &endp, 0); | ||
738 | if (str == endp || result < 0) { | ||
739 | printk("UBI error: incorrect bytes count: \"%s\"\n", str); | ||
740 | return -EINVAL; | ||
741 | } | ||
742 | |||
743 | switch (*endp) { | ||
744 | case 'G': | ||
745 | result *= 1024; | ||
746 | case 'M': | ||
747 | result *= 1024; | ||
748 | case 'K': | ||
749 | case 'k': | ||
750 | result *= 1024; | ||
751 | if (endp[1] == 'i' && (endp[2] == '\0' || | ||
752 | endp[2] == 'B' || endp[2] == 'b')) | ||
753 | endp += 2; | ||
754 | case '\0': | ||
755 | break; | ||
756 | default: | ||
757 | printk("UBI error: incorrect bytes count: \"%s\"\n", str); | ||
758 | return -EINVAL; | ||
759 | } | ||
760 | |||
761 | return result; | ||
762 | } | ||
763 | |||
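The switch above deliberately falls through: a 'G' suffix multiplies by 1024 three times, 'M' twice and 'K'/'k' once, and an 'i' (optionally followed by 'B' or 'b') after the letter is simply skipped. For example, the function would return:

    "2048" -> 2048
    "2KiB" -> 2 * 1024               = 2048
    "4MiB" -> 4 * 1024 * 1024        = 4194304
    "1GiB" -> 1 * 1024 * 1024 * 1024 = 1073741824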
764 | /** | ||
765 | * ubi_mtd_param_parse - parse the 'mtd=' UBI parameter. | ||
766 | * @val: the parameter value to parse | ||
767 | * @kp: not used | ||
768 | * | ||
769 | * This function returns zero in case of success and a negative error code in | ||
770 | * case of error. | ||
771 | */ | ||
772 | static int __init ubi_mtd_param_parse(const char *val, struct kernel_param *kp) | ||
773 | { | ||
774 | int i, len; | ||
775 | struct mtd_dev_param *p; | ||
776 | char buf[MTD_PARAM_LEN_MAX]; | ||
777 | char *pbuf = &buf[0]; | ||
778 | char *tokens[3] = {NULL, NULL, NULL}; | ||
779 | |||
780 | if (mtd_devs == UBI_MAX_DEVICES) { | ||
781 | printk("UBI error: too many parameters, max. is %d\n", | ||
782 | UBI_MAX_DEVICES); | ||
783 | return -EINVAL; | ||
784 | } | ||
785 | |||
786 | len = strnlen(val, MTD_PARAM_LEN_MAX); | ||
787 | if (len == MTD_PARAM_LEN_MAX) { | ||
788 | printk("UBI error: parameter \"%s\" is too long, max. is %d\n", | ||
789 | val, MTD_PARAM_LEN_MAX); | ||
790 | return -EINVAL; | ||
791 | } | ||
792 | |||
793 | if (len == 0) { | ||
794 | printk("UBI warning: empty 'mtd=' parameter - ignored\n"); | ||
795 | return 0; | ||
796 | } | ||
797 | |||
798 | strcpy(buf, val); | ||
799 | |||
800 | /* Get rid of the final newline */ | ||
801 | if (buf[len - 1] == '\n') | ||
802 | buf[len - 1] = 0; | ||
803 | |||
804 | for (i = 0; i < 3; i++) | ||
805 | tokens[i] = strsep(&pbuf, ","); | ||
806 | |||
807 | if (pbuf) { | ||
808 | printk("UBI error: too many arguments at \"%s\"\n", val); | ||
809 | return -EINVAL; | ||
810 | } | ||
811 | |||
812 | if (tokens[0] == '\0') | ||
813 | return -EINVAL; | ||
814 | |||
815 | p = &mtd_dev_param[mtd_devs]; | ||
816 | strcpy(&p->name[0], tokens[0]); | ||
817 | |||
818 | if (tokens[1]) | ||
819 | p->vid_hdr_offs = bytes_str_to_int(tokens[1]); | ||
820 | if (tokens[2]) | ||
821 | p->data_offs = bytes_str_to_int(tokens[2]); | ||
822 | |||
823 | if (p->vid_hdr_offs < 0) | ||
824 | return p->vid_hdr_offs; | ||
825 | if (p->data_offs < 0) | ||
826 | return p->data_offs; | ||
827 | |||
828 | mtd_devs += 1; | ||
829 | return 0; | ||
830 | } | ||
831 | |||
832 | module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 000); | ||
833 | MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: " | ||
834 | "mtd=<name|num>[,<vid_hdr_offs>,<data_offs>]. " | ||
835 | "Multiple \"mtd\" parameters may be specified.\n" | ||
836 | "MTD devices may be specified by their number or name. " | ||
837 | "Optional \"vid_hdr_offs\" and \"data_offs\" parameters " | ||
838 | "specify UBI VID header position and data starting " | ||
839 | "position to be used by UBI.\n" | ||
840 | "Example: mtd=content,1984,2048 mtd=4 - attach MTD device" | ||
841 | "with name content using VID header offset 1984 and data " | ||
842 | "start 2048, and MTD device number 4 using default " | ||
843 | "offsets"); | ||
844 | |||
845 | MODULE_VERSION(__stringify(UBI_VERSION)); | ||
846 | MODULE_DESCRIPTION("UBI - Unsorted Block Images"); | ||
847 | MODULE_AUTHOR("Artem Bityutskiy"); | ||
848 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c new file mode 100644 index 000000000000..6612eb79bf17 --- /dev/null +++ b/drivers/mtd/ubi/cdev.c | |||
@@ -0,0 +1,722 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * This file includes implementation of UBI character device operations. | ||
23 | * | ||
24 | * There are two kinds of character devices in UBI: UBI character devices and | ||
25 | * UBI volume character devices. UBI character devices allow users to | ||
26 | manipulate whole volumes: create, remove, and resize them. Volume character | ||
27 | * devices provide volume I/O capabilities. | ||
28 | * | ||
29 | * Major and minor numbers are assigned dynamically to both UBI and volume | ||
30 | * character devices. | ||
31 | */ | ||
32 | |||
33 | #include <linux/module.h> | ||
34 | #include <linux/stat.h> | ||
35 | #include <linux/ioctl.h> | ||
36 | #include <linux/capability.h> | ||
37 | #include <mtd/ubi-user.h> | ||
38 | #include <asm/uaccess.h> | ||
39 | #include <asm/div64.h> | ||
40 | #include "ubi.h" | ||
41 | |||
42 | /* | ||
43 | * Maximum sequence numbers of UBI and volume character device IOCTLs (direct | ||
44 | * logical eraseblock erase is a debug-only feature). | ||
45 | */ | ||
46 | #define UBI_CDEV_IOC_MAX_SEQ 2 | ||
47 | #ifndef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO | ||
48 | #define VOL_CDEV_IOC_MAX_SEQ 1 | ||
49 | #else | ||
50 | #define VOL_CDEV_IOC_MAX_SEQ 2 | ||
51 | #endif | ||
52 | |||
53 | /** | ||
54 | * major_to_device - get UBI device object by character device major number. | ||
55 | * @major: major number | ||
56 | * | ||
57 | * This function returns a pointer to the UBI device object. | ||
58 | */ | ||
59 | static struct ubi_device *major_to_device(int major) | ||
60 | { | ||
61 | int i; | ||
62 | |||
63 | for (i = 0; i < ubi_devices_cnt; i++) | ||
64 | if (ubi_devices[i] && ubi_devices[i]->major == major) | ||
65 | return ubi_devices[i]; | ||
66 | BUG(); | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * get_exclusive - get exclusive access to an UBI volume. | ||
71 | * @desc: volume descriptor | ||
72 | * | ||
73 | * This function changes UBI volume open mode to "exclusive". Returns previous | ||
74 | * mode value (positive integer) in case of success and a negative error code | ||
75 | * in case of failure. | ||
76 | */ | ||
77 | static int get_exclusive(struct ubi_volume_desc *desc) | ||
78 | { | ||
79 | int users, err; | ||
80 | struct ubi_volume *vol = desc->vol; | ||
81 | |||
82 | spin_lock(&vol->ubi->volumes_lock); | ||
83 | users = vol->readers + vol->writers + vol->exclusive; | ||
84 | ubi_assert(users > 0); | ||
85 | if (users > 1) { | ||
86 | dbg_err("%d users for volume %d", users, vol->vol_id); | ||
87 | err = -EBUSY; | ||
88 | } else { | ||
89 | vol->readers = vol->writers = 0; | ||
90 | vol->exclusive = 1; | ||
91 | err = desc->mode; | ||
92 | desc->mode = UBI_EXCLUSIVE; | ||
93 | } | ||
94 | spin_unlock(&vol->ubi->volumes_lock); | ||
95 | |||
96 | return err; | ||
97 | } | ||
98 | |||
99 | /** | ||
100 | * revoke_exclusive - revoke exclusive mode. | ||
101 | * @desc: volume descriptor | ||
102 | * @mode: new mode to switch to | ||
103 | */ | ||
104 | static void revoke_exclusive(struct ubi_volume_desc *desc, int mode) | ||
105 | { | ||
106 | struct ubi_volume *vol = desc->vol; | ||
107 | |||
108 | spin_lock(&vol->ubi->volumes_lock); | ||
109 | ubi_assert(vol->readers == 0 && vol->writers == 0); | ||
110 | ubi_assert(vol->exclusive == 1 && desc->mode == UBI_EXCLUSIVE); | ||
111 | vol->exclusive = 0; | ||
112 | if (mode == UBI_READONLY) | ||
113 | vol->readers = 1; | ||
114 | else if (mode == UBI_READWRITE) | ||
115 | vol->writers = 1; | ||
116 | else | ||
117 | vol->exclusive = 1; | ||
118 | spin_unlock(&vol->ubi->volumes_lock); | ||
119 | |||
120 | desc->mode = mode; | ||
121 | } | ||
122 | |||
123 | static int vol_cdev_open(struct inode *inode, struct file *file) | ||
124 | { | ||
125 | struct ubi_volume_desc *desc; | ||
126 | const struct ubi_device *ubi = major_to_device(imajor(inode)); | ||
127 | int vol_id = iminor(inode) - 1; | ||
128 | int mode; | ||
129 | |||
130 | if (file->f_mode & FMODE_WRITE) | ||
131 | mode = UBI_READWRITE; | ||
132 | else | ||
133 | mode = UBI_READONLY; | ||
134 | |||
135 | dbg_msg("open volume %d, mode %d", vol_id, mode); | ||
136 | |||
137 | desc = ubi_open_volume(ubi->ubi_num, vol_id, mode); | ||
138 | if (IS_ERR(desc)) | ||
139 | return PTR_ERR(desc); | ||
140 | |||
141 | file->private_data = desc; | ||
142 | return 0; | ||
143 | } | ||
144 | |||
145 | static int vol_cdev_release(struct inode *inode, struct file *file) | ||
146 | { | ||
147 | struct ubi_volume_desc *desc = file->private_data; | ||
148 | struct ubi_volume *vol = desc->vol; | ||
149 | |||
150 | dbg_msg("release volume %d, mode %d", vol->vol_id, desc->mode); | ||
151 | |||
152 | if (vol->updating) { | ||
153 | ubi_warn("update of volume %d not finished, volume is damaged", | ||
154 | vol->vol_id); | ||
155 | vol->updating = 0; | ||
156 | kfree(vol->upd_buf); | ||
157 | } | ||
158 | |||
159 | ubi_close_volume(desc); | ||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | static loff_t vol_cdev_llseek(struct file *file, loff_t offset, int origin) | ||
164 | { | ||
165 | struct ubi_volume_desc *desc = file->private_data; | ||
166 | struct ubi_volume *vol = desc->vol; | ||
167 | loff_t new_offset; | ||
168 | |||
169 | if (vol->updating) { | ||
170 | /* Update is in progress, seeking is prohibited */ | ||
171 | dbg_err("updating"); | ||
172 | return -EBUSY; | ||
173 | } | ||
174 | |||
175 | switch (origin) { | ||
176 | case 0: /* SEEK_SET */ | ||
177 | new_offset = offset; | ||
178 | break; | ||
179 | case 1: /* SEEK_CUR */ | ||
180 | new_offset = file->f_pos + offset; | ||
181 | break; | ||
182 | case 2: /* SEEK_END */ | ||
183 | new_offset = vol->used_bytes + offset; | ||
184 | break; | ||
185 | default: | ||
186 | return -EINVAL; | ||
187 | } | ||
188 | |||
189 | if (new_offset < 0 || new_offset > vol->used_bytes) { | ||
190 | dbg_err("bad seek %lld", new_offset); | ||
191 | return -EINVAL; | ||
192 | } | ||
193 | |||
194 | dbg_msg("seek volume %d, offset %lld, origin %d, new offset %lld", | ||
195 | vol->vol_id, offset, origin, new_offset); | ||
196 | |||
197 | file->f_pos = new_offset; | ||
198 | return new_offset; | ||
199 | } | ||
200 | |||
201 | static ssize_t vol_cdev_read(struct file *file, __user char *buf, size_t count, | ||
202 | loff_t *offp) | ||
203 | { | ||
204 | struct ubi_volume_desc *desc = file->private_data; | ||
205 | struct ubi_volume *vol = desc->vol; | ||
206 | struct ubi_device *ubi = vol->ubi; | ||
207 | int err, lnum, off, len, vol_id = desc->vol->vol_id, tbuf_size; | ||
208 | size_t count_save = count; | ||
209 | void *tbuf; | ||
210 | uint64_t tmp; | ||
211 | |||
212 | dbg_msg("read %zd bytes from offset %lld of volume %d", | ||
213 | count, *offp, vol_id); | ||
214 | |||
215 | if (vol->updating) { | ||
216 | dbg_err("updating"); | ||
217 | return -EBUSY; | ||
218 | } | ||
219 | if (vol->upd_marker) { | ||
220 | dbg_err("damaged volume, update marker is set"); | ||
221 | return -EBADF; | ||
222 | } | ||
223 | if (*offp == vol->used_bytes || count == 0) | ||
224 | return 0; | ||
225 | |||
226 | if (vol->corrupted) | ||
227 | dbg_msg("read from corrupted volume %d", vol_id); | ||
228 | |||
229 | if (*offp + count > vol->used_bytes) | ||
230 | count_save = count = vol->used_bytes - *offp; | ||
231 | |||
232 | tbuf_size = vol->usable_leb_size; | ||
233 | if (count < tbuf_size) | ||
234 | tbuf_size = ALIGN(count, ubi->min_io_size); | ||
235 | tbuf = kmalloc(tbuf_size, GFP_KERNEL); | ||
236 | if (!tbuf) | ||
237 | return -ENOMEM; | ||
238 | |||
239 | len = count > tbuf_size ? tbuf_size : count; | ||
240 | |||
241 | tmp = *offp; | ||
242 | off = do_div(tmp, vol->usable_leb_size); | ||
243 | lnum = tmp; | ||
244 | |||
245 | do { | ||
246 | cond_resched(); | ||
247 | |||
248 | if (off + len >= vol->usable_leb_size) | ||
249 | len = vol->usable_leb_size - off; | ||
250 | |||
251 | err = ubi_eba_read_leb(ubi, vol_id, lnum, tbuf, off, len, 0); | ||
252 | if (err) | ||
253 | break; | ||
254 | |||
255 | off += len; | ||
256 | if (off == vol->usable_leb_size) { | ||
257 | lnum += 1; | ||
258 | off -= vol->usable_leb_size; | ||
259 | } | ||
260 | |||
261 | count -= len; | ||
262 | *offp += len; | ||
263 | |||
264 | err = copy_to_user(buf, tbuf, len); | ||
265 | if (err) { | ||
266 | err = -EFAULT; | ||
267 | break; | ||
268 | } | ||
269 | |||
270 | buf += len; | ||
271 | len = count > tbuf_size ? tbuf_size : count; | ||
272 | } while (count); | ||
273 | |||
274 | kfree(tbuf); | ||
275 | return err ? err : count_save - count; | ||
276 | } | ||
277 | |||
278 | #ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO | ||
279 | |||
280 | /* | ||
281 | * This function allows direct writes to dynamic UBI volumes, without | ||
282 | * issuing the volume update operation. Available only as a debugging feature. | ||
283 | * Very useful for testing UBI. | ||
284 | */ | ||
285 | static ssize_t vol_cdev_direct_write(struct file *file, const char __user *buf, | ||
286 | size_t count, loff_t *offp) | ||
287 | { | ||
288 | struct ubi_volume_desc *desc = file->private_data; | ||
289 | struct ubi_volume *vol = desc->vol; | ||
290 | struct ubi_device *ubi = vol->ubi; | ||
291 | int lnum, off, len, tbuf_size, vol_id = vol->vol_id, err = 0; | ||
292 | size_t count_save = count; | ||
293 | char *tbuf; | ||
294 | uint64_t tmp; | ||
295 | |||
296 | dbg_msg("requested: write %zd bytes to offset %lld of volume %u", | ||
297 | count, *offp, desc->vol->vol_id); | ||
298 | |||
299 | if (vol->vol_type == UBI_STATIC_VOLUME) | ||
300 | return -EROFS; | ||
301 | |||
302 | tmp = *offp; | ||
303 | off = do_div(tmp, vol->usable_leb_size); | ||
304 | lnum = tmp; | ||
305 | |||
306 | if (off % ubi->min_io_size) { | ||
307 | dbg_err("unaligned position"); | ||
308 | return -EINVAL; | ||
309 | } | ||
310 | |||
311 | if (*offp + count > vol->used_bytes) | ||
312 | count_save = count = vol->used_bytes - *offp; | ||
313 | |||
314 | /* We can write only in fractions of the minimum I/O unit */ | ||
315 | if (count % ubi->min_io_size) { | ||
316 | dbg_err("unaligned write length"); | ||
317 | return -EINVAL; | ||
318 | } | ||
319 | |||
320 | tbuf_size = vol->usable_leb_size; | ||
321 | if (count < tbuf_size) | ||
322 | tbuf_size = ALIGN(count, ubi->min_io_size); | ||
323 | tbuf = kmalloc(tbuf_size, GFP_KERNEL); | ||
324 | if (!tbuf) | ||
325 | return -ENOMEM; | ||
326 | |||
327 | len = count > tbuf_size ? tbuf_size : count; | ||
328 | |||
329 | while (count) { | ||
330 | cond_resched(); | ||
331 | |||
332 | if (off + len >= vol->usable_leb_size) | ||
333 | len = vol->usable_leb_size - off; | ||
334 | |||
335 | err = copy_from_user(tbuf, buf, len); | ||
336 | if (err) { | ||
337 | err = -EFAULT; | ||
338 | break; | ||
339 | } | ||
340 | |||
341 | err = ubi_eba_write_leb(ubi, vol_id, lnum, tbuf, off, len, | ||
342 | UBI_UNKNOWN); | ||
343 | if (err) | ||
344 | break; | ||
345 | |||
346 | off += len; | ||
347 | if (off == vol->usable_leb_size) { | ||
348 | lnum += 1; | ||
349 | off -= vol->usable_leb_size; | ||
350 | } | ||
351 | |||
352 | count -= len; | ||
353 | *offp += len; | ||
354 | buf += len; | ||
355 | len = count > tbuf_size ? tbuf_size : count; | ||
356 | } | ||
357 | |||
358 | kfree(tbuf); | ||
359 | return err ? err : count_save - count; | ||
360 | } | ||
361 | |||
362 | #else | ||
363 | #define vol_cdev_direct_write(file, buf, count, offp) -EPERM | ||
364 | #endif /* CONFIG_MTD_UBI_DEBUG_USERSPACE_IO */ | ||
365 | |||
366 | static ssize_t vol_cdev_write(struct file *file, const char __user *buf, | ||
367 | size_t count, loff_t *offp) | ||
368 | { | ||
369 | int err = 0; | ||
370 | struct ubi_volume_desc *desc = file->private_data; | ||
371 | struct ubi_volume *vol = desc->vol; | ||
372 | struct ubi_device *ubi = vol->ubi; | ||
373 | |||
374 | if (!vol->updating) | ||
375 | return vol_cdev_direct_write(file, buf, count, offp); | ||
376 | |||
377 | err = ubi_more_update_data(ubi, vol->vol_id, buf, count); | ||
378 | if (err < 0) { | ||
379 | ubi_err("cannot write %zd bytes of update data", count); | ||
380 | return err; | ||
381 | } | ||
382 | |||
383 | if (err) { | ||
384 | /* | ||
385 | * Update is finished, @err contains number of actually written | ||
386 | * bytes now. | ||
387 | */ | ||
388 | count = err; | ||
389 | |||
390 | err = ubi_check_volume(ubi, vol->vol_id); | ||
391 | if (err < 0) | ||
392 | return err; | ||
393 | |||
394 | if (err) { | ||
395 | ubi_warn("volume %d on UBI device %d is corrupted", | ||
396 | vol->vol_id, ubi->ubi_num); | ||
397 | vol->corrupted = 1; | ||
398 | } | ||
399 | vol->checked = 1; | ||
400 | revoke_exclusive(desc, UBI_READWRITE); | ||
401 | } | ||
402 | |||
403 | *offp += count; | ||
404 | return count; | ||
405 | } | ||
406 | |||
407 | static int vol_cdev_ioctl(struct inode *inode, struct file *file, | ||
408 | unsigned int cmd, unsigned long arg) | ||
409 | { | ||
410 | int err = 0; | ||
411 | struct ubi_volume_desc *desc = file->private_data; | ||
412 | struct ubi_volume *vol = desc->vol; | ||
413 | struct ubi_device *ubi = vol->ubi; | ||
414 | void __user *argp = (void __user *)arg; | ||
415 | |||
416 | if (_IOC_NR(cmd) > VOL_CDEV_IOC_MAX_SEQ || | ||
417 | _IOC_TYPE(cmd) != UBI_VOL_IOC_MAGIC) | ||
418 | return -ENOTTY; | ||
419 | |||
420 | if (_IOC_DIR(cmd) && _IOC_READ) | ||
421 | err = !access_ok(VERIFY_WRITE, argp, _IOC_SIZE(cmd)); | ||
422 | else if (_IOC_DIR(cmd) && _IOC_WRITE) | ||
423 | err = !access_ok(VERIFY_READ, argp, _IOC_SIZE(cmd)); | ||
424 | if (err) | ||
425 | return -EFAULT; | ||
426 | |||
427 | switch (cmd) { | ||
428 | |||
429 | /* Volume update command */ | ||
430 | case UBI_IOCVOLUP: | ||
431 | { | ||
432 | int64_t bytes, rsvd_bytes; | ||
433 | |||
434 | if (!capable(CAP_SYS_RESOURCE)) { | ||
435 | err = -EPERM; | ||
436 | break; | ||
437 | } | ||
438 | |||
439 | err = copy_from_user(&bytes, argp, sizeof(int64_t)); | ||
440 | if (err) { | ||
441 | err = -EFAULT; | ||
442 | break; | ||
443 | } | ||
444 | |||
445 | if (desc->mode == UBI_READONLY) { | ||
446 | err = -EROFS; | ||
447 | break; | ||
448 | } | ||
449 | |||
450 | rsvd_bytes = vol->reserved_pebs * (ubi->leb_size-vol->data_pad); | ||
451 | if (bytes < 0 || bytes > rsvd_bytes) { | ||
452 | err = -EINVAL; | ||
453 | break; | ||
454 | } | ||
455 | |||
456 | err = get_exclusive(desc); | ||
457 | if (err < 0) | ||
458 | break; | ||
459 | |||
460 | err = ubi_start_update(ubi, vol->vol_id, bytes); | ||
461 | if (bytes == 0) | ||
462 | revoke_exclusive(desc, UBI_READWRITE); | ||
463 | |||
464 | file->f_pos = 0; | ||
465 | break; | ||
466 | } | ||
467 | |||
468 | #ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO | ||
469 | /* Logical eraseblock erasure command */ | ||
470 | case UBI_IOCEBER: | ||
471 | { | ||
472 | int32_t lnum; | ||
473 | |||
474 | err = __get_user(lnum, (__user int32_t *)argp); | ||
475 | if (err) { | ||
476 | err = -EFAULT; | ||
477 | break; | ||
478 | } | ||
479 | |||
480 | if (desc->mode == UBI_READONLY) { | ||
481 | err = -EROFS; | ||
482 | break; | ||
483 | } | ||
484 | |||
485 | if (lnum < 0 || lnum >= vol->reserved_pebs) { | ||
486 | err = -EINVAL; | ||
487 | break; | ||
488 | } | ||
489 | |||
490 | if (vol->vol_type != UBI_DYNAMIC_VOLUME) { | ||
491 | err = -EROFS; | ||
492 | break; | ||
493 | } | ||
494 | |||
495 | dbg_msg("erase LEB %d:%d", vol->vol_id, lnum); | ||
496 | err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum); | ||
497 | if (err) | ||
498 | break; | ||
499 | |||
500 | err = ubi_wl_flush(ubi); | ||
501 | break; | ||
502 | } | ||
503 | #endif | ||
504 | |||
505 | default: | ||
506 | err = -ENOTTY; | ||
507 | break; | ||
508 | } | ||
509 | |||
510 | return err; | ||
511 | } | ||
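The update path above is driven entirely from user space: the application announces the image size with UBI_IOCVOLUP and then streams the image through the volume character device, which vol_cdev_write() feeds to ubi_more_update_data() until the announced number of bytes has arrived. The sketch below illustrates that sequence; it is not part of this patch, and the /dev/ubi0_1 node name, the exported mtd/ubi-user.h header path and the helper name are assumptions made only for illustration.

/*
 * Illustrative user-space sketch of a volume update (assumes the caller
 * has CAP_SYS_RESOURCE and that the volume node is /dev/ubi0_1).
 */
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <mtd/ubi-user.h>   /* assumed exported header providing UBI_IOCVOLUP */

static int update_volume(const char *node, const void *image, int64_t bytes)
{
        const char *p = image;
        int fd = open(node, O_RDWR);

        if (fd < 0)
                return -1;

        /* Tell UBI how many bytes of update data will follow */
        if (ioctl(fd, UBI_IOCVOLUP, &bytes)) {
                close(fd);
                return -1;
        }

        /* Stream the new contents; UBI finishes the update once the
         * announced number of bytes has been received */
        while (bytes > 0) {
                ssize_t n = write(fd, p, bytes);

                if (n < 0) {
                        close(fd);
                        return -1;
                }
                p += n;
                bytes -= n;
        }

        return close(fd);
}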
512 | |||
513 | /** | ||
514 | * verify_mkvol_req - verify volume creation request. | ||
515 | * @ubi: UBI device description object | ||
516 | * @req: the request to check | ||
517 | * | ||
518 | * This function returns zero if the request is correct, and %-EINVAL if not. | ||
519 | */ | ||
520 | static int verify_mkvol_req(const struct ubi_device *ubi, | ||
521 | const struct ubi_mkvol_req *req) | ||
522 | { | ||
523 | int n, err = -EINVAL; | ||
524 | |||
525 | if (req->bytes < 0 || req->alignment < 0 || req->vol_type < 0 || | ||
526 | req->name_len < 0) | ||
527 | goto bad; | ||
528 | |||
529 | if ((req->vol_id < 0 || req->vol_id >= ubi->vtbl_slots) && | ||
530 | req->vol_id != UBI_VOL_NUM_AUTO) | ||
531 | goto bad; | ||
532 | |||
533 | if (req->alignment == 0) | ||
534 | goto bad; | ||
535 | |||
536 | if (req->bytes == 0) | ||
537 | goto bad; | ||
538 | |||
539 | if (req->vol_type != UBI_DYNAMIC_VOLUME && | ||
540 | req->vol_type != UBI_STATIC_VOLUME) | ||
541 | goto bad; | ||
542 | |||
543 | if (req->alignment > ubi->leb_size) | ||
544 | goto bad; | ||
545 | |||
546 | n = req->alignment % ubi->min_io_size; | ||
547 | if (req->alignment != 1 && n) | ||
548 | goto bad; | ||
549 | |||
550 | if (req->name_len > UBI_VOL_NAME_MAX) { | ||
551 | err = -ENAMETOOLONG; | ||
552 | goto bad; | ||
553 | } | ||
554 | |||
555 | return 0; | ||
556 | |||
557 | bad: | ||
558 | dbg_err("bad volume creation request"); | ||
559 | ubi_dbg_dump_mkvol_req(req); | ||
560 | return err; | ||
561 | } | ||
562 | |||
563 | /** | ||
564 | * verify_rsvol_req - verify volume re-size request. | ||
565 | * @ubi: UBI device description object | ||
566 | * @req: the request to check | ||
567 | * | ||
568 | * This function returns zero if the request is correct, and %-EINVAL if not. | ||
569 | */ | ||
570 | static int verify_rsvol_req(const struct ubi_device *ubi, | ||
571 | const struct ubi_rsvol_req *req) | ||
572 | { | ||
573 | if (req->bytes <= 0) | ||
574 | return -EINVAL; | ||
575 | |||
576 | if (req->vol_id < 0 || req->vol_id >= ubi->vtbl_slots) | ||
577 | return -EINVAL; | ||
578 | |||
579 | return 0; | ||
580 | } | ||
581 | |||
582 | static int ubi_cdev_ioctl(struct inode *inode, struct file *file, | ||
583 | unsigned int cmd, unsigned long arg) | ||
584 | { | ||
585 | int err = 0; | ||
586 | struct ubi_device *ubi; | ||
587 | struct ubi_volume_desc *desc; | ||
588 | void __user *argp = (void __user *)arg; | ||
589 | |||
590 | if (_IOC_NR(cmd) > UBI_CDEV_IOC_MAX_SEQ || | ||
591 | _IOC_TYPE(cmd) != UBI_IOC_MAGIC) | ||
592 | return -ENOTTY; | ||
593 | |||
594 | if (_IOC_DIR(cmd) & _IOC_READ) | ||
595 | err = !access_ok(VERIFY_WRITE, argp, _IOC_SIZE(cmd)); | ||
596 | else if (_IOC_DIR(cmd) & _IOC_WRITE) | ||
597 | err = !access_ok(VERIFY_READ, argp, _IOC_SIZE(cmd)); | ||
598 | if (err) | ||
599 | return -EFAULT; | ||
600 | |||
601 | if (!capable(CAP_SYS_RESOURCE)) | ||
602 | return -EPERM; | ||
603 | |||
604 | ubi = major_to_device(imajor(inode)); | ||
605 | if (IS_ERR(ubi)) | ||
606 | return PTR_ERR(ubi); | ||
607 | |||
608 | switch (cmd) { | ||
609 | /* Create volume command */ | ||
610 | case UBI_IOCMKVOL: | ||
611 | { | ||
612 | struct ubi_mkvol_req req; | ||
613 | |||
614 | dbg_msg("create volume"); | ||
615 | err = __copy_from_user(&req, argp, | ||
616 | sizeof(struct ubi_mkvol_req)); | ||
617 | if (err) { | ||
618 | err = -EFAULT; | ||
619 | break; | ||
620 | } | ||
621 | |||
622 | err = verify_mkvol_req(ubi, &req); | ||
623 | if (err) | ||
624 | break; | ||
625 | |||
626 | req.name[req.name_len] = '\0'; | ||
627 | |||
628 | err = ubi_create_volume(ubi, &req); | ||
629 | if (err) | ||
630 | break; | ||
631 | |||
632 | err = __put_user(req.vol_id, (__user int32_t *)argp); | ||
633 | if (err) | ||
634 | err = -EFAULT; | ||
635 | |||
636 | break; | ||
637 | } | ||
638 | |||
639 | /* Remove volume command */ | ||
640 | case UBI_IOCRMVOL: | ||
641 | { | ||
642 | int vol_id; | ||
643 | |||
644 | dbg_msg("remove volume"); | ||
645 | err = __get_user(vol_id, (__user int32_t *)argp); | ||
646 | if (err) { | ||
647 | err = -EFAULT; | ||
648 | break; | ||
649 | } | ||
650 | |||
651 | desc = ubi_open_volume(ubi->ubi_num, vol_id, UBI_EXCLUSIVE); | ||
652 | if (IS_ERR(desc)) { | ||
653 | err = PTR_ERR(desc); | ||
654 | break; | ||
655 | } | ||
656 | |||
657 | err = ubi_remove_volume(desc); | ||
658 | if (err) | ||
659 | ubi_close_volume(desc); | ||
660 | |||
661 | break; | ||
662 | } | ||
663 | |||
664 | /* Re-size volume command */ | ||
665 | case UBI_IOCRSVOL: | ||
666 | { | ||
667 | int pebs; | ||
668 | uint64_t tmp; | ||
669 | struct ubi_rsvol_req req; | ||
670 | |||
671 | dbg_msg("re-size volume"); | ||
672 | err = __copy_from_user(&req, argp, | ||
673 | sizeof(struct ubi_rsvol_req)); | ||
674 | if (err) { | ||
675 | err = -EFAULT; | ||
676 | break; | ||
677 | } | ||
678 | |||
679 | err = verify_rsvol_req(ubi, &req); | ||
680 | if (err) | ||
681 | break; | ||
682 | |||
683 | desc = ubi_open_volume(ubi->ubi_num, req.vol_id, UBI_EXCLUSIVE); | ||
684 | if (IS_ERR(desc)) { | ||
685 | err = PTR_ERR(desc); | ||
686 | break; | ||
687 | } | ||
688 | |||
689 | tmp = req.bytes; | ||
690 | pebs = !!do_div(tmp, desc->vol->usable_leb_size); | ||
691 | pebs += tmp; | ||
692 | |||
693 | err = ubi_resize_volume(desc, pebs); | ||
694 | ubi_close_volume(desc); | ||
695 | break; | ||
696 | } | ||
697 | |||
698 | default: | ||
699 | err = -ENOTTY; | ||
700 | break; | ||
701 | } | ||
702 | |||
703 | return err; | ||
704 | } | ||
705 | |||
706 | /* UBI character device operations */ | ||
707 | struct file_operations ubi_cdev_operations = { | ||
708 | .owner = THIS_MODULE, | ||
709 | .ioctl = ubi_cdev_ioctl, | ||
710 | .llseek = no_llseek | ||
711 | }; | ||
712 | |||
713 | /* UBI volume character device operations */ | ||
714 | struct file_operations ubi_vol_cdev_operations = { | ||
715 | .owner = THIS_MODULE, | ||
716 | .open = vol_cdev_open, | ||
717 | .release = vol_cdev_release, | ||
718 | .llseek = vol_cdev_llseek, | ||
719 | .read = vol_cdev_read, | ||
720 | .write = vol_cdev_write, | ||
721 | .ioctl = vol_cdev_ioctl | ||
722 | }; | ||
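For the management ioctls handled by ubi_cdev_ioctl() above, a caller opens the UBI device node and fills a struct ubi_mkvol_req exactly as verify_mkvol_req() checks it; on success the kernel writes the allocated volume ID back into the request. The following user-space sketch is illustrative only: the /dev/ubi0 node name, the mtd/ubi-user.h header path and the helper name are assumptions, not part of this patch.

/*
 * Illustrative sketch: create a dynamic volume of @bytes bytes on the UBI
 * device given by @ubi_node (e.g. /dev/ubi0) and return its volume ID.
 */
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <mtd/ubi-user.h>   /* assumed exported header */

static int make_volume(const char *ubi_node, const char *name, long long bytes)
{
        struct ubi_mkvol_req req;
        int fd, err;

        memset(&req, 0, sizeof(req));
        req.vol_id = UBI_VOL_NUM_AUTO;     /* let UBI pick a free volume ID */
        req.alignment = 1;                 /* no extra LEB alignment */
        req.bytes = bytes;
        req.vol_type = UBI_DYNAMIC_VOLUME;
        req.name_len = strlen(name);
        /* over-long names are rejected by verify_mkvol_req() in the kernel */
        strncpy(req.name, name, sizeof(req.name) - 1);

        fd = open(ubi_node, O_RDWR);
        if (fd < 0)
                return -1;

        /* On success the kernel stores the allocated volume ID in req.vol_id */
        err = ioctl(fd, UBI_IOCMKVOL, &req);
        close(fd);
        return err ? -1 : req.vol_id;
}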
diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c new file mode 100644 index 000000000000..86364221fafe --- /dev/null +++ b/drivers/mtd/ubi/debug.c | |||
@@ -0,0 +1,224 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * Here we keep all the UBI debugging stuff which should normally be disabled | ||
23 | * and compiled-out, but it is extremely helpful when hunting bugs or doing big | ||
24 | * changes. | ||
25 | */ | ||
26 | |||
27 | #ifdef CONFIG_MTD_UBI_DEBUG_MSG | ||
28 | |||
29 | #include "ubi.h" | ||
30 | |||
31 | /** | ||
32 | * ubi_dbg_dump_ec_hdr - dump an erase counter header. | ||
33 | * @ec_hdr: the erase counter header to dump | ||
34 | */ | ||
35 | void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr) | ||
36 | { | ||
37 | dbg_msg("erase counter header dump:"); | ||
38 | dbg_msg("magic %#08x", ubi32_to_cpu(ec_hdr->magic)); | ||
39 | dbg_msg("version %d", (int)ec_hdr->version); | ||
40 | dbg_msg("ec %llu", (long long)ubi64_to_cpu(ec_hdr->ec)); | ||
41 | dbg_msg("vid_hdr_offset %d", ubi32_to_cpu(ec_hdr->vid_hdr_offset)); | ||
42 | dbg_msg("data_offset %d", ubi32_to_cpu(ec_hdr->data_offset)); | ||
43 | dbg_msg("hdr_crc %#08x", ubi32_to_cpu(ec_hdr->hdr_crc)); | ||
44 | dbg_msg("erase counter header hexdump:"); | ||
45 | ubi_dbg_hexdump(ec_hdr, UBI_EC_HDR_SIZE); | ||
46 | } | ||
47 | |||
48 | /** | ||
49 | * ubi_dbg_dump_vid_hdr - dump a volume identifier header. | ||
50 | * @vid_hdr: the volume identifier header to dump | ||
51 | */ | ||
52 | void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr) | ||
53 | { | ||
54 | dbg_msg("volume identifier header dump:"); | ||
55 | dbg_msg("magic %08x", ubi32_to_cpu(vid_hdr->magic)); | ||
56 | dbg_msg("version %d", (int)vid_hdr->version); | ||
57 | dbg_msg("vol_type %d", (int)vid_hdr->vol_type); | ||
58 | dbg_msg("copy_flag %d", (int)vid_hdr->copy_flag); | ||
59 | dbg_msg("compat %d", (int)vid_hdr->compat); | ||
60 | dbg_msg("vol_id %d", ubi32_to_cpu(vid_hdr->vol_id)); | ||
61 | dbg_msg("lnum %d", ubi32_to_cpu(vid_hdr->lnum)); | ||
62 | dbg_msg("leb_ver %u", ubi32_to_cpu(vid_hdr->leb_ver)); | ||
63 | dbg_msg("data_size %d", ubi32_to_cpu(vid_hdr->data_size)); | ||
64 | dbg_msg("used_ebs %d", ubi32_to_cpu(vid_hdr->used_ebs)); | ||
65 | dbg_msg("data_pad %d", ubi32_to_cpu(vid_hdr->data_pad)); | ||
66 | dbg_msg("sqnum %llu", | ||
67 | (unsigned long long)ubi64_to_cpu(vid_hdr->sqnum)); | ||
68 | dbg_msg("hdr_crc %08x", ubi32_to_cpu(vid_hdr->hdr_crc)); | ||
69 | dbg_msg("volume identifier header hexdump:"); | ||
70 | } | ||
71 | |||
72 | /** | ||
73 | * ubi_dbg_dump_vol_info - dump volume information. | ||
74 | * @vol: UBI volume description object | ||
75 | */ | ||
76 | void ubi_dbg_dump_vol_info(const struct ubi_volume *vol) | ||
77 | { | ||
78 | dbg_msg("volume information dump:"); | ||
79 | dbg_msg("vol_id %d", vol->vol_id); | ||
80 | dbg_msg("reserved_pebs %d", vol->reserved_pebs); | ||
81 | dbg_msg("alignment %d", vol->alignment); | ||
82 | dbg_msg("data_pad %d", vol->data_pad); | ||
83 | dbg_msg("vol_type %d", vol->vol_type); | ||
84 | dbg_msg("name_len %d", vol->name_len); | ||
85 | dbg_msg("usable_leb_size %d", vol->usable_leb_size); | ||
86 | dbg_msg("used_ebs %d", vol->used_ebs); | ||
87 | dbg_msg("used_bytes %lld", vol->used_bytes); | ||
88 | dbg_msg("last_eb_bytes %d", vol->last_eb_bytes); | ||
89 | dbg_msg("corrupted %d", vol->corrupted); | ||
90 | dbg_msg("upd_marker %d", vol->upd_marker); | ||
91 | |||
92 | if (vol->name_len <= UBI_VOL_NAME_MAX && | ||
93 | strnlen(vol->name, vol->name_len + 1) == vol->name_len) { | ||
94 | dbg_msg("name %s", vol->name); | ||
95 | } else { | ||
96 | dbg_msg("the 1st 5 characters of the name: %c%c%c%c%c", | ||
97 | vol->name[0], vol->name[1], vol->name[2], | ||
98 | vol->name[3], vol->name[4]); | ||
99 | } | ||
100 | } | ||
101 | |||
102 | /** | ||
103 | * ubi_dbg_dump_vtbl_record - dump a &struct ubi_vtbl_record object. | ||
104 | * @r: the object to dump | ||
105 | * @idx: volume table index | ||
106 | */ | ||
107 | void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx) | ||
108 | { | ||
109 | int name_len = ubi16_to_cpu(r->name_len); | ||
110 | |||
111 | dbg_msg("volume table record %d dump:", idx); | ||
112 | dbg_msg("reserved_pebs %d", ubi32_to_cpu(r->reserved_pebs)); | ||
113 | dbg_msg("alignment %d", ubi32_to_cpu(r->alignment)); | ||
114 | dbg_msg("data_pad %d", ubi32_to_cpu(r->data_pad)); | ||
115 | dbg_msg("vol_type %d", (int)r->vol_type); | ||
116 | dbg_msg("upd_marker %d", (int)r->upd_marker); | ||
117 | dbg_msg("name_len %d", name_len); | ||
118 | |||
119 | if (r->name[0] == '\0') { | ||
120 | dbg_msg("name NULL"); | ||
121 | return; | ||
122 | } | ||
123 | |||
124 | if (name_len <= UBI_VOL_NAME_MAX && | ||
125 | strnlen(&r->name[0], name_len + 1) == name_len) { | ||
126 | dbg_msg("name %s", &r->name[0]); | ||
127 | } else { | ||
128 | dbg_msg("1st 5 characters of the name: %c%c%c%c%c", | ||
129 | r->name[0], r->name[1], r->name[2], r->name[3], | ||
130 | r->name[4]); | ||
131 | } | ||
132 | dbg_msg("crc %#08x", ubi32_to_cpu(r->crc)); | ||
133 | } | ||
134 | |||
135 | /** | ||
136 | * ubi_dbg_dump_sv - dump a &struct ubi_scan_volume object. | ||
137 | * @sv: the object to dump | ||
138 | */ | ||
139 | void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv) | ||
140 | { | ||
141 | dbg_msg("volume scanning information dump:"); | ||
142 | dbg_msg("vol_id %d", sv->vol_id); | ||
143 | dbg_msg("highest_lnum %d", sv->highest_lnum); | ||
144 | dbg_msg("leb_count %d", sv->leb_count); | ||
145 | dbg_msg("compat %d", sv->compat); | ||
146 | dbg_msg("vol_type %d", sv->vol_type); | ||
147 | dbg_msg("used_ebs %d", sv->used_ebs); | ||
148 | dbg_msg("last_data_size %d", sv->last_data_size); | ||
149 | dbg_msg("data_pad %d", sv->data_pad); | ||
150 | } | ||
151 | |||
152 | /** | ||
153 | * ubi_dbg_dump_seb - dump a &struct ubi_scan_leb object. | ||
154 | * @seb: the object to dump | ||
155 | * @type: object type: 0 - not corrupted, 1 - corrupted | ||
156 | */ | ||
157 | void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type) | ||
158 | { | ||
159 | dbg_msg("eraseblock scanning information dump:"); | ||
160 | dbg_msg("ec %d", seb->ec); | ||
161 | dbg_msg("pnum %d", seb->pnum); | ||
162 | if (type == 0) { | ||
163 | dbg_msg("lnum %d", seb->lnum); | ||
164 | dbg_msg("scrub %d", seb->scrub); | ||
165 | dbg_msg("sqnum %llu", seb->sqnum); | ||
166 | dbg_msg("leb_ver %u", seb->leb_ver); | ||
167 | } | ||
168 | } | ||
169 | |||
170 | /** | ||
171 | * ubi_dbg_dump_mkvol_req - dump a &struct ubi_mkvol_req object. | ||
172 | * @req: the object to dump | ||
173 | */ | ||
174 | void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req) | ||
175 | { | ||
176 | char nm[17]; | ||
177 | |||
178 | dbg_msg("volume creation request dump:"); | ||
179 | dbg_msg("vol_id %d", req->vol_id); | ||
180 | dbg_msg("alignment %d", req->alignment); | ||
181 | dbg_msg("bytes %lld", (long long)req->bytes); | ||
182 | dbg_msg("vol_type %d", req->vol_type); | ||
183 | dbg_msg("name_len %d", req->name_len); | ||
184 | |||
185 | memcpy(nm, req->name, 16); | ||
186 | nm[16] = 0; | ||
187 | dbg_msg("the 1st 16 characters of the name: %s", nm); | ||
188 | } | ||
189 | |||
190 | #define BYTES_PER_LINE 32 | ||
191 | |||
192 | /** | ||
193 | * ubi_dbg_hexdump - dump a buffer. | ||
194 | * @ptr: the buffer to dump | ||
195 | * @size: buffer size which must be multiple of 4 bytes | ||
196 | */ | ||
197 | void ubi_dbg_hexdump(const void *ptr, int size) | ||
198 | { | ||
199 | int i, k = 0, rows, columns; | ||
200 | const uint8_t *p = ptr; | ||
201 | |||
202 | size = ALIGN(size, 4); | ||
203 | rows = size/BYTES_PER_LINE + !!(size % BYTES_PER_LINE); | ||
204 | for (i = 0; i < rows; i++) { | ||
205 | int j; | ||
206 | |||
207 | cond_resched(); | ||
208 | columns = min(size - k, BYTES_PER_LINE) / 4; | ||
209 | if (columns == 0) | ||
210 | break; | ||
211 | printk(KERN_DEBUG "%5d: ", i * BYTES_PER_LINE); | ||
212 | for (j = 0; j < columns; j++) { | ||
213 | int n, N; | ||
214 | |||
215 | N = size - k > 4 ? 4 : size - k; | ||
216 | for (n = 0; n < N; n++) | ||
217 | printk("%02x", p[k++]); | ||
218 | printk(" "); | ||
219 | } | ||
220 | printk("\n"); | ||
221 | } | ||
222 | } | ||
223 | |||
224 | #endif /* CONFIG_MTD_UBI_DEBUG_MSG */ | ||
diff --git a/drivers/mtd/ubi/debug.h b/drivers/mtd/ubi/debug.h new file mode 100644 index 000000000000..f816ad9a36c0 --- /dev/null +++ b/drivers/mtd/ubi/debug.h | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | #ifndef __UBI_DEBUG_H__ | ||
22 | #define __UBI_DEBUG_H__ | ||
23 | |||
24 | #ifdef CONFIG_MTD_UBI_DEBUG | ||
25 | #include <linux/random.h> | ||
26 | |||
27 | #define ubi_assert(expr) BUG_ON(!(expr)) | ||
28 | #define dbg_err(fmt, ...) ubi_err(fmt, ##__VA_ARGS__) | ||
29 | #else | ||
30 | #define ubi_assert(expr) ({}) | ||
31 | #define dbg_err(fmt, ...) ({}) | ||
32 | #endif | ||
33 | |||
34 | #ifdef CONFIG_MTD_UBI_DEBUG_DISABLE_BGT | ||
35 | #define DBG_DISABLE_BGT 1 | ||
36 | #else | ||
37 | #define DBG_DISABLE_BGT 0 | ||
38 | #endif | ||
39 | |||
40 | #ifdef CONFIG_MTD_UBI_DEBUG_MSG | ||
41 | /* Generic debugging message */ | ||
42 | #define dbg_msg(fmt, ...) \ | ||
43 | printk(KERN_DEBUG "UBI DBG: %s: " fmt "\n", __FUNCTION__, ##__VA_ARGS__) | ||
44 | |||
45 | #define ubi_dbg_dump_stack() dump_stack() | ||
46 | |||
47 | struct ubi_ec_hdr; | ||
48 | struct ubi_vid_hdr; | ||
49 | struct ubi_volume; | ||
50 | struct ubi_vtbl_record; | ||
51 | struct ubi_scan_volume; | ||
52 | struct ubi_scan_leb; | ||
53 | struct ubi_mkvol_req; | ||
54 | |||
55 | void ubi_dbg_print(int type, const char *func, const char *fmt, ...); | ||
56 | void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr); | ||
57 | void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr); | ||
58 | void ubi_dbg_dump_vol_info(const struct ubi_volume *vol); | ||
59 | void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx); | ||
60 | void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv); | ||
61 | void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type); | ||
62 | void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req); | ||
63 | void ubi_dbg_hexdump(const void *buf, int size); | ||
64 | |||
65 | #else | ||
66 | |||
67 | #define dbg_msg(fmt, ...) ({}) | ||
68 | #define ubi_dbg_dump_stack() ({}) | ||
69 | #define ubi_dbg_print(func, fmt, ...) ({}) | ||
70 | #define ubi_dbg_dump_ec_hdr(ec_hdr) ({}) | ||
71 | #define ubi_dbg_dump_vid_hdr(vid_hdr) ({}) | ||
72 | #define ubi_dbg_dump_vol_info(vol) ({}) | ||
73 | #define ubi_dbg_dump_vtbl_record(r, idx) ({}) | ||
74 | #define ubi_dbg_dump_sv(sv) ({}) | ||
75 | #define ubi_dbg_dump_seb(seb, type) ({}) | ||
76 | #define ubi_dbg_dump_mkvol_req(req) ({}) | ||
77 | #define ubi_dbg_hexdump(buf, size) ({}) | ||
78 | |||
79 | #endif /* CONFIG_MTD_UBI_DEBUG_MSG */ | ||
80 | |||
81 | #ifdef CONFIG_MTD_UBI_DEBUG_MSG_EBA | ||
82 | /* Messages from the eraseblock association unit */ | ||
83 | #define dbg_eba(fmt, ...) \ | ||
84 | printk(KERN_DEBUG "UBI DBG eba: %s: " fmt "\n", __FUNCTION__, \ | ||
85 | ##__VA_ARGS__) | ||
86 | #else | ||
87 | #define dbg_eba(fmt, ...) ({}) | ||
88 | #endif | ||
89 | |||
90 | #ifdef CONFIG_MTD_UBI_DEBUG_MSG_WL | ||
91 | /* Messages from the wear-leveling unit */ | ||
92 | #define dbg_wl(fmt, ...) \ | ||
93 | printk(KERN_DEBUG "UBI DBG wl: %s: " fmt "\n", __FUNCTION__, \ | ||
94 | ##__VA_ARGS__) | ||
95 | #else | ||
96 | #define dbg_wl(fmt, ...) ({}) | ||
97 | #endif | ||
98 | |||
99 | #ifdef CONFIG_MTD_UBI_DEBUG_MSG_IO | ||
100 | /* Messages from the input/output unit */ | ||
101 | #define dbg_io(fmt, ...) \ | ||
102 | printk(KERN_DEBUG "UBI DBG io: %s: " fmt "\n", __FUNCTION__, \ | ||
103 | ##__VA_ARGS__) | ||
104 | #else | ||
105 | #define dbg_io(fmt, ...) ({}) | ||
106 | #endif | ||
107 | |||
108 | #ifdef CONFIG_MTD_UBI_DEBUG_MSG_BLD | ||
109 | /* Initialization and build messages */ | ||
110 | #define dbg_bld(fmt, ...) \ | ||
111 | printk(KERN_DEBUG "UBI DBG bld: %s: " fmt "\n", __FUNCTION__, \ | ||
112 | ##__VA_ARGS__) | ||
113 | #else | ||
114 | #define dbg_bld(fmt, ...) ({}) | ||
115 | #endif | ||
116 | |||
117 | #ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_BITFLIPS | ||
118 | /** | ||
119 | * ubi_dbg_is_bitflip - if it is time to emulate a bit-flip. | ||
120 | * | ||
121 | * Returns non-zero if a bit-flip should be emulated, otherwise returns zero. | ||
122 | */ | ||
123 | static inline int ubi_dbg_is_bitflip(void) | ||
124 | { | ||
125 | return !(random32() % 200); | ||
126 | } | ||
127 | #else | ||
128 | #define ubi_dbg_is_bitflip() 0 | ||
129 | #endif | ||
130 | |||
131 | #ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_WRITE_FAILURES | ||
132 | /** | ||
133 | * ubi_dbg_is_write_failure - if it is time to emulate a write failure. | ||
134 | * | ||
135 | * Returns non-zero if a write failure should be emulated, otherwise returns | ||
136 | * zero. | ||
137 | */ | ||
138 | static inline int ubi_dbg_is_write_failure(void) | ||
139 | { | ||
140 | return !(random32() % 500); | ||
141 | } | ||
142 | #else | ||
143 | #define ubi_dbg_is_write_failure() 0 | ||
144 | #endif | ||
145 | |||
146 | #ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_ERASE_FAILURES | ||
147 | /** | ||
148 | * ubi_dbg_is_erase_failure - if it is time to emulate an erase failure. | ||
149 | * | ||
150 | * Returns non-zero if an erase failure should be emulated, otherwise returns | ||
151 | * zero. | ||
152 | */ | ||
153 | static inline int ubi_dbg_is_erase_failure(void) | ||
154 | { | ||
155 | return !(random32() % 400); | ||
156 | } | ||
157 | #else | ||
158 | #define ubi_dbg_is_erase_failure() 0 | ||
159 | #endif | ||
160 | |||
161 | #endif /* !__UBI_DEBUG_H__ */ | ||
diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c new file mode 100644 index 000000000000..d847ee1da3d9 --- /dev/null +++ b/drivers/mtd/ubi/eba.c | |||
@@ -0,0 +1,1241 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * The UBI Eraseblock Association (EBA) unit. | ||
23 | * | ||
24 | * This unit is responsible for I/O to/from logical eraseblocks. | ||
25 | * | ||
26 | * Although in this implementation the EBA table is fully kept and managed in | ||
27 | * RAM, which implies poor scalability, it might be (partially) maintained on | ||
28 | * flash in future implementations. | ||
29 | * | ||
30 | * The EBA unit implements per-logical eraseblock locking. Before accessing a | ||
31 | * logical eraseblock it is locked for reading or writing. The per-logical | ||
32 | * eraseblock locking is implemented by means of the lock tree. The lock tree | ||
33 | * is an RB-tree which contains all the currently locked logical eraseblocks. The | ||
34 | * lock tree elements are &struct ltree_entry objects. They are indexed by | ||
35 | * (@vol_id, @lnum) pairs. | ||
36 | * | ||
37 | * EBA also maintains the global sequence counter which is incremented each | ||
38 | * time a logical eraseblock is mapped to a physical eraseblock and it is | ||
39 | * stored in the volume identifier header. This means that each VID header has | ||
40 | * a unique sequence number. The sequence number is only increased and we assume | ||
41 | * 64 bits is enough for it to never overflow. | ||
42 | */ | ||
43 | |||
44 | #include <linux/slab.h> | ||
45 | #include <linux/crc32.h> | ||
46 | #include <linux/err.h> | ||
47 | #include "ubi.h" | ||
48 | |||
49 | /** | ||
50 | * struct ltree_entry - an entry in the lock tree. | ||
51 | * @rb: links RB-tree nodes | ||
52 | * @vol_id: volume ID of the locked logical eraseblock | ||
53 | * @lnum: locked logical eraseblock number | ||
54 | * @users: how many tasks are using this logical eraseblock or wait for it | ||
55 | * @mutex: read/write mutex to implement read/write access serialization to | ||
56 | * the (@vol_id, @lnum) logical eraseblock | ||
57 | * | ||
58 | * When a logical eraseblock is locked, the corresponding &struct ltree_entry | ||
59 | * object is inserted into the lock tree (@ubi->ltree). | ||
60 | */ | ||
61 | struct ltree_entry { | ||
62 | struct rb_node rb; | ||
63 | int vol_id; | ||
64 | int lnum; | ||
65 | int users; | ||
66 | struct rw_semaphore mutex; | ||
67 | }; | ||
68 | |||
69 | /* Slab cache for lock-tree entries */ | ||
70 | static struct kmem_cache *ltree_slab; | ||
71 | |||
72 | /** | ||
73 | * next_sqnum - get next sequence number. | ||
74 | * @ubi: UBI device description object | ||
75 | * | ||
76 | * This function returns next sequence number to use, which is just the current | ||
77 | * global sequence counter value. It also increases the global sequence | ||
78 | * counter. | ||
79 | */ | ||
80 | static unsigned long long next_sqnum(struct ubi_device *ubi) | ||
81 | { | ||
82 | unsigned long long sqnum; | ||
83 | |||
84 | spin_lock(&ubi->ltree_lock); | ||
85 | sqnum = ubi->global_sqnum++; | ||
86 | spin_unlock(&ubi->ltree_lock); | ||
87 | |||
88 | return sqnum; | ||
89 | } | ||
90 | |||
91 | /** | ||
92 | * ubi_get_compat - get compatibility flags of a volume. | ||
93 | * @ubi: UBI device description object | ||
94 | * @vol_id: volume ID | ||
95 | * | ||
96 | * This function returns compatibility flags for an internal volume. User | ||
97 | * volumes have no compatibility flags, so %0 is returned. | ||
98 | */ | ||
99 | static int ubi_get_compat(const struct ubi_device *ubi, int vol_id) | ||
100 | { | ||
101 | if (vol_id == UBI_LAYOUT_VOL_ID) | ||
102 | return UBI_LAYOUT_VOLUME_COMPAT; | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | /** | ||
107 | * ltree_lookup - look up the lock tree. | ||
108 | * @ubi: UBI device description object | ||
109 | * @vol_id: volume ID | ||
110 | * @lnum: logical eraseblock number | ||
111 | * | ||
112 | * This function returns a pointer to the corresponding &struct ltree_entry | ||
113 | * object if the logical eraseblock is locked and %NULL if it is not. | ||
114 | * @ubi->ltree_lock has to be locked. | ||
115 | */ | ||
116 | static struct ltree_entry *ltree_lookup(struct ubi_device *ubi, int vol_id, | ||
117 | int lnum) | ||
118 | { | ||
119 | struct rb_node *p; | ||
120 | |||
121 | p = ubi->ltree.rb_node; | ||
122 | while (p) { | ||
123 | struct ltree_entry *le; | ||
124 | |||
125 | le = rb_entry(p, struct ltree_entry, rb); | ||
126 | |||
127 | if (vol_id < le->vol_id) | ||
128 | p = p->rb_left; | ||
129 | else if (vol_id > le->vol_id) | ||
130 | p = p->rb_right; | ||
131 | else { | ||
132 | if (lnum < le->lnum) | ||
133 | p = p->rb_left; | ||
134 | else if (lnum > le->lnum) | ||
135 | p = p->rb_right; | ||
136 | else | ||
137 | return le; | ||
138 | } | ||
139 | } | ||
140 | |||
141 | return NULL; | ||
142 | } | ||
143 | |||
144 | /** | ||
145 | * ltree_add_entry - add new entry to the lock tree. | ||
146 | * @ubi: UBI device description object | ||
147 | * @vol_id: volume ID | ||
148 | * @lnum: logical eraseblock number | ||
149 | * | ||
150 | * This function adds a new entry for logical eraseblock (@vol_id, @lnum) to the | ||
151 | * lock tree. If such an entry is already there, its usage counter is increased. | ||
152 | * Returns pointer to the lock tree entry or %-ENOMEM if memory allocation | ||
153 | * failed. | ||
154 | */ | ||
155 | static struct ltree_entry *ltree_add_entry(struct ubi_device *ubi, int vol_id, | ||
156 | int lnum) | ||
157 | { | ||
158 | struct ltree_entry *le, *le1, *le_free; | ||
159 | |||
160 | le = kmem_cache_alloc(ltree_slab, GFP_KERNEL); | ||
161 | if (!le) | ||
162 | return ERR_PTR(-ENOMEM); | ||
163 | |||
164 | le->vol_id = vol_id; | ||
165 | le->lnum = lnum; | ||
166 | |||
167 | spin_lock(&ubi->ltree_lock); | ||
168 | le1 = ltree_lookup(ubi, vol_id, lnum); | ||
169 | |||
170 | if (le1) { | ||
171 | /* | ||
172 | * This logical eraseblock is already locked. The newly | ||
173 | * allocated lock entry is not needed. | ||
174 | */ | ||
175 | le_free = le; | ||
176 | le = le1; | ||
177 | } else { | ||
178 | struct rb_node **p, *parent = NULL; | ||
179 | |||
180 | /* | ||
181 | * No lock entry, add the newly allocated one to the | ||
182 | * @ubi->ltree RB-tree. | ||
183 | */ | ||
184 | le_free = NULL; | ||
185 | |||
186 | p = &ubi->ltree.rb_node; | ||
187 | while (*p) { | ||
188 | parent = *p; | ||
189 | le1 = rb_entry(parent, struct ltree_entry, rb); | ||
190 | |||
191 | if (vol_id < le1->vol_id) | ||
192 | p = &(*p)->rb_left; | ||
193 | else if (vol_id > le1->vol_id) | ||
194 | p = &(*p)->rb_right; | ||
195 | else { | ||
196 | ubi_assert(lnum != le1->lnum); | ||
197 | if (lnum < le1->lnum) | ||
198 | p = &(*p)->rb_left; | ||
199 | else | ||
200 | p = &(*p)->rb_right; | ||
201 | } | ||
202 | } | ||
203 | |||
204 | rb_link_node(&le->rb, parent, p); | ||
205 | rb_insert_color(&le->rb, &ubi->ltree); | ||
206 | } | ||
207 | le->users += 1; | ||
208 | spin_unlock(&ubi->ltree_lock); | ||
209 | |||
210 | if (le_free) | ||
211 | kmem_cache_free(ltree_slab, le_free); | ||
212 | |||
213 | return le; | ||
214 | } | ||
215 | |||
216 | /** | ||
217 | * leb_read_lock - lock logical eraseblock for reading. | ||
218 | * @ubi: UBI device description object | ||
219 | * @vol_id: volume ID | ||
220 | * @lnum: logical eraseblock number | ||
221 | * | ||
222 | * This function locks a logical eraseblock for reading. Returns zero in case | ||
223 | * of success and a negative error code in case of failure. | ||
224 | */ | ||
225 | static int leb_read_lock(struct ubi_device *ubi, int vol_id, int lnum) | ||
226 | { | ||
227 | struct ltree_entry *le; | ||
228 | |||
229 | le = ltree_add_entry(ubi, vol_id, lnum); | ||
230 | if (IS_ERR(le)) | ||
231 | return PTR_ERR(le); | ||
232 | down_read(&le->mutex); | ||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | /** | ||
237 | * leb_read_unlock - unlock logical eraseblock. | ||
238 | * @ubi: UBI device description object | ||
239 | * @vol_id: volume ID | ||
240 | * @lnum: logical eraseblock number | ||
241 | */ | ||
242 | static void leb_read_unlock(struct ubi_device *ubi, int vol_id, int lnum) | ||
243 | { | ||
244 | int free = 0; | ||
245 | struct ltree_entry *le; | ||
246 | |||
247 | spin_lock(&ubi->ltree_lock); | ||
248 | le = ltree_lookup(ubi, vol_id, lnum); | ||
249 | le->users -= 1; | ||
250 | ubi_assert(le->users >= 0); | ||
251 | if (le->users == 0) { | ||
252 | rb_erase(&le->rb, &ubi->ltree); | ||
253 | free = 1; | ||
254 | } | ||
255 | spin_unlock(&ubi->ltree_lock); | ||
256 | |||
257 | up_read(&le->mutex); | ||
258 | if (free) | ||
259 | kmem_cache_free(ltree_slab, le); | ||
260 | } | ||
261 | |||
262 | /** | ||
263 | * leb_write_lock - lock logical eraseblock for writing. | ||
264 | * @ubi: UBI device description object | ||
265 | * @vol_id: volume ID | ||
266 | * @lnum: logical eraseblock number | ||
267 | * | ||
268 | * This function locks a logical eraseblock for writing. Returns zero in case | ||
269 | * of success and a negative error code in case of failure. | ||
270 | */ | ||
271 | static int leb_write_lock(struct ubi_device *ubi, int vol_id, int lnum) | ||
272 | { | ||
273 | struct ltree_entry *le; | ||
274 | |||
275 | le = ltree_add_entry(ubi, vol_id, lnum); | ||
276 | if (IS_ERR(le)) | ||
277 | return PTR_ERR(le); | ||
278 | down_write(&le->mutex); | ||
279 | return 0; | ||
280 | } | ||
281 | |||
282 | /** | ||
283 | * leb_write_unlock - unlock logical eraseblock. | ||
284 | * @ubi: UBI device description object | ||
285 | * @vol_id: volume ID | ||
286 | * @lnum: logical eraseblock number | ||
287 | */ | ||
288 | static void leb_write_unlock(struct ubi_device *ubi, int vol_id, int lnum) | ||
289 | { | ||
290 | int free; | ||
291 | struct ltree_entry *le; | ||
292 | |||
293 | spin_lock(&ubi->ltree_lock); | ||
294 | le = ltree_lookup(ubi, vol_id, lnum); | ||
295 | le->users -= 1; | ||
296 | ubi_assert(le->users >= 0); | ||
297 | if (le->users == 0) { | ||
298 | rb_erase(&le->rb, &ubi->ltree); | ||
299 | free = 1; | ||
300 | } else | ||
301 | free = 0; | ||
302 | spin_unlock(&ubi->ltree_lock); | ||
303 | |||
304 | up_write(&le->mutex); | ||
305 | if (free) | ||
306 | kmem_cache_free(ltree_slab, le); | ||
307 | } | ||
308 | |||
309 | /** | ||
310 | * ubi_eba_unmap_leb - un-map logical eraseblock. | ||
311 | * @ubi: UBI device description object | ||
312 | * @vol_id: volume ID | ||
313 | * @lnum: logical eraseblock number | ||
314 | * | ||
315 | * This function un-maps logical eraseblock @lnum and schedules corresponding | ||
316 | * physical eraseblock for erasure. Returns zero in case of success and a | ||
317 | * negative error code in case of failure. | ||
318 | */ | ||
319 | int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum) | ||
320 | { | ||
321 | int idx = vol_id2idx(ubi, vol_id), err, pnum; | ||
322 | struct ubi_volume *vol = ubi->volumes[idx]; | ||
323 | |||
324 | if (ubi->ro_mode) | ||
325 | return -EROFS; | ||
326 | |||
327 | err = leb_write_lock(ubi, vol_id, lnum); | ||
328 | if (err) | ||
329 | return err; | ||
330 | |||
331 | pnum = vol->eba_tbl[lnum]; | ||
332 | if (pnum < 0) | ||
333 | /* This logical eraseblock is already unmapped */ | ||
334 | goto out_unlock; | ||
335 | |||
336 | dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum); | ||
337 | |||
338 | vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED; | ||
339 | err = ubi_wl_put_peb(ubi, pnum, 0); | ||
340 | |||
341 | out_unlock: | ||
342 | leb_write_unlock(ubi, vol_id, lnum); | ||
343 | return err; | ||
344 | } | ||
345 | |||
346 | /** | ||
347 | * ubi_eba_read_leb - read data. | ||
348 | * @ubi: UBI device description object | ||
349 | * @vol_id: volume ID | ||
350 | * @lnum: logical eraseblock number | ||
351 | * @buf: buffer to store the read data | ||
352 | * @offset: offset from where to read | ||
353 | * @len: how many bytes to read | ||
354 | * @check: data CRC check flag | ||
355 | * | ||
356 | * If the logical eraseblock @lnum is unmapped, @buf is filled with 0xFF | ||
357 | * bytes. The @check flag only makes sense for static volumes and forces | ||
358 | * eraseblock data CRC checking. | ||
359 | * | ||
360 | * In case of success this function returns zero. In case of a static volume, | ||
361 | * if data CRC mismatches - %-EBADMSG is returned. %-EBADMSG may also be | ||
362 | * returned for any volume type if an ECC error was detected by the MTD device | ||
363 | * driver. Other negative error codes may be returned in case of other errors. | ||
364 | */ | ||
365 | int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, | ||
366 | int offset, int len, int check) | ||
367 | { | ||
368 | int err, pnum, scrub = 0, idx = vol_id2idx(ubi, vol_id); | ||
369 | struct ubi_vid_hdr *vid_hdr; | ||
370 | struct ubi_volume *vol = ubi->volumes[idx]; | ||
371 | uint32_t crc, crc1; | ||
372 | |||
373 | err = leb_read_lock(ubi, vol_id, lnum); | ||
374 | if (err) | ||
375 | return err; | ||
376 | |||
377 | pnum = vol->eba_tbl[lnum]; | ||
378 | if (pnum < 0) { | ||
379 | /* | ||
380 | * The logical eraseblock is not mapped, fill the whole buffer | ||
381 | * with 0xFF bytes. The exception is static volumes for which | ||
382 | * it is an error to read unmapped logical eraseblocks. | ||
383 | */ | ||
384 | dbg_eba("read %d bytes from offset %d of LEB %d:%d (unmapped)", | ||
385 | len, offset, vol_id, lnum); | ||
386 | leb_read_unlock(ubi, vol_id, lnum); | ||
387 | ubi_assert(vol->vol_type != UBI_STATIC_VOLUME); | ||
388 | memset(buf, 0xFF, len); | ||
389 | return 0; | ||
390 | } | ||
391 | |||
392 | dbg_eba("read %d bytes from offset %d of LEB %d:%d, PEB %d", | ||
393 | len, offset, vol_id, lnum, pnum); | ||
394 | |||
395 | if (vol->vol_type == UBI_DYNAMIC_VOLUME) | ||
396 | check = 0; | ||
397 | |||
398 | retry: | ||
399 | if (check) { | ||
400 | vid_hdr = ubi_zalloc_vid_hdr(ubi); | ||
401 | if (!vid_hdr) { | ||
402 | err = -ENOMEM; | ||
403 | goto out_unlock; | ||
404 | } | ||
405 | |||
406 | err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 1); | ||
407 | if (err && err != UBI_IO_BITFLIPS) { | ||
408 | if (err > 0) { | ||
409 | /* | ||
410 | * The header is either absent or corrupted. | ||
411 | * The former case means there is a bug - | ||
412 | * switch to read-only mode just in case. | ||
413 | * The latter case means a real corruption - we | ||
414 | * may try to recover data. FIXME: but this is | ||
415 | * not implemented. | ||
416 | */ | ||
417 | if (err == UBI_IO_BAD_VID_HDR) { | ||
418 | ubi_warn("bad VID header at PEB %d, LEB" | ||
419 | "%d:%d", pnum, vol_id, lnum); | ||
420 | err = -EBADMSG; | ||
421 | } else | ||
422 | ubi_ro_mode(ubi); | ||
423 | } | ||
424 | goto out_free; | ||
425 | } else if (err == UBI_IO_BITFLIPS) | ||
426 | scrub = 1; | ||
427 | |||
428 | ubi_assert(lnum < ubi32_to_cpu(vid_hdr->used_ebs)); | ||
429 | ubi_assert(len == ubi32_to_cpu(vid_hdr->data_size)); | ||
430 | |||
431 | crc = ubi32_to_cpu(vid_hdr->data_crc); | ||
432 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
433 | } | ||
434 | |||
435 | err = ubi_io_read_data(ubi, buf, pnum, offset, len); | ||
436 | if (err) { | ||
437 | if (err == UBI_IO_BITFLIPS) { | ||
438 | scrub = 1; | ||
439 | err = 0; | ||
440 | } else if (err == -EBADMSG) { | ||
441 | if (vol->vol_type == UBI_DYNAMIC_VOLUME) | ||
442 | goto out_unlock; | ||
443 | scrub = 1; | ||
444 | if (!check) { | ||
445 | ubi_msg("force data checking"); | ||
446 | check = 1; | ||
447 | goto retry; | ||
448 | } | ||
449 | } else | ||
450 | goto out_unlock; | ||
451 | } | ||
452 | |||
453 | if (check) { | ||
454 | crc1 = crc32(UBI_CRC32_INIT, buf, len); | ||
455 | if (crc1 != crc) { | ||
456 | ubi_warn("CRC error: calculated %#08x, must be %#08x", | ||
457 | crc1, crc); | ||
458 | err = -EBADMSG; | ||
459 | goto out_unlock; | ||
460 | } | ||
461 | } | ||
462 | |||
463 | if (scrub) | ||
464 | err = ubi_wl_scrub_peb(ubi, pnum); | ||
465 | |||
466 | leb_read_unlock(ubi, vol_id, lnum); | ||
467 | return err; | ||
468 | |||
469 | out_free: | ||
470 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
471 | out_unlock: | ||
472 | leb_read_unlock(ubi, vol_id, lnum); | ||
473 | return err; | ||
474 | } | ||
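As a usage illustration of the read path just shown, the sketch below reads one whole logical eraseblock of a dynamic volume. It is not part of this patch: the helper name is hypothetical, and it only relies on ubi_eba_read_leb() and the fields already used above (an unmapped LEB simply reads back as 0xFF bytes).

/*
 * Illustrative in-kernel sketch: read one whole LEB of a dynamic volume.
 * The helper name and GFP flags are assumptions for clarity.
 */
static int example_read_whole_leb(struct ubi_device *ubi, int vol_id, int lnum)
{
        struct ubi_volume *vol = ubi->volumes[vol_id2idx(ubi, vol_id)];
        int len = vol->usable_leb_size;
        void *buf;
        int err;

        buf = kmalloc(len, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /* @check is forced to 0 for dynamic volumes; unmapped LEBs are 0xFF */
        err = ubi_eba_read_leb(ubi, vol_id, lnum, buf, 0, len, 0);

        kfree(buf);
        return err;
}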
475 | |||
476 | /** | ||
477 | * recover_peb - recover from write failure. | ||
478 | * @ubi: UBI device description object | ||
479 | * @pnum: the physical eraseblock to recover | ||
480 | * @vol_id: volume ID | ||
481 | * @lnum: logical eraseblock number | ||
482 | * @buf: data which was not written because of the write failure | ||
483 | * @offset: offset of the failed write | ||
484 | * @len: how many bytes should have been written | ||
485 | * | ||
486 | * This function is called in case of a write failure and moves all good data | ||
487 | * from the potentially bad physical eraseblock to a good physical eraseblock. | ||
488 | * This function also writes the data which was not written due to the failure. | ||
489 | * Returns new physical eraseblock number in case of success, and a negative | ||
490 | * error code in case of failure. | ||
491 | */ | ||
492 | static int recover_peb(struct ubi_device *ubi, int pnum, int vol_id, int lnum, | ||
493 | const void *buf, int offset, int len) | ||
494 | { | ||
495 | int err, idx = vol_id2idx(ubi, vol_id), new_pnum, data_size, tries = 0; | ||
496 | struct ubi_volume *vol = ubi->volumes[idx]; | ||
497 | struct ubi_vid_hdr *vid_hdr; | ||
498 | unsigned char *new_buf; | ||
499 | |||
500 | vid_hdr = ubi_zalloc_vid_hdr(ubi); | ||
501 | if (!vid_hdr) { | ||
502 | return -ENOMEM; | ||
503 | } | ||
504 | |||
505 | retry: | ||
506 | new_pnum = ubi_wl_get_peb(ubi, UBI_UNKNOWN); | ||
507 | if (new_pnum < 0) { | ||
508 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
509 | return new_pnum; | ||
510 | } | ||
511 | |||
512 | ubi_msg("recover PEB %d, move data to PEB %d", pnum, new_pnum); | ||
513 | |||
514 | err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 1); | ||
515 | if (err && err != UBI_IO_BITFLIPS) { | ||
516 | if (err > 0) | ||
517 | err = -EIO; | ||
518 | goto out_put; | ||
519 | } | ||
520 | |||
521 | vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); | ||
522 | err = ubi_io_write_vid_hdr(ubi, new_pnum, vid_hdr); | ||
523 | if (err) | ||
524 | goto write_error; | ||
525 | |||
526 | data_size = offset + len; | ||
527 | new_buf = kmalloc(data_size, GFP_KERNEL); | ||
528 | if (!new_buf) { | ||
529 | err = -ENOMEM; | ||
530 | goto out_put; | ||
531 | } | ||
532 | memset(new_buf + offset, 0xFF, len); | ||
533 | |||
534 | /* Read everything before the area where the write failure happened */ | ||
535 | if (offset > 0) { | ||
536 | err = ubi_io_read_data(ubi, new_buf, pnum, 0, offset); | ||
537 | if (err && err != UBI_IO_BITFLIPS) { | ||
538 | kfree(new_buf); | ||
539 | goto out_put; | ||
540 | } | ||
541 | } | ||
542 | |||
543 | memcpy(new_buf + offset, buf, len); | ||
544 | |||
545 | err = ubi_io_write_data(ubi, new_buf, new_pnum, 0, data_size); | ||
546 | if (err) { | ||
547 | kfree(new_buf); | ||
548 | goto write_error; | ||
549 | } | ||
550 | |||
551 | kfree(new_buf); | ||
552 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
553 | |||
554 | vol->eba_tbl[lnum] = new_pnum; | ||
555 | ubi_wl_put_peb(ubi, pnum, 1); | ||
556 | |||
557 | ubi_msg("data was successfully recovered"); | ||
558 | return 0; | ||
559 | |||
560 | out_put: | ||
561 | ubi_wl_put_peb(ubi, new_pnum, 1); | ||
562 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
563 | return err; | ||
564 | |||
565 | write_error: | ||
566 | /* | ||
567 | * Bad luck? This physical eraseblock is bad too? Crud. Let's try to | ||
568 | * get another one. | ||
569 | */ | ||
570 | ubi_warn("failed to write to PEB %d", new_pnum); | ||
571 | ubi_wl_put_peb(ubi, new_pnum, 1); | ||
572 | if (++tries > UBI_IO_RETRIES) { | ||
573 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
574 | return err; | ||
575 | } | ||
576 | ubi_msg("try again"); | ||
577 | goto retry; | ||
578 | } | ||
579 | |||
580 | /** | ||
581 | * ubi_eba_write_leb - write data to dynamic volume. | ||
582 | * @ubi: UBI device description object | ||
583 | * @vol_id: volume ID | ||
584 | * @lnum: logical eraseblock number | ||
585 | * @buf: the data to write | ||
586 | * @offset: offset within the logical eraseblock where to write | ||
587 | * @len: how many bytes to write | ||
588 | * @dtype: data type | ||
589 | * | ||
590 | * This function writes data to logical eraseblock @lnum of a dynamic volume | ||
591 | * @vol_id. Returns zero in case of success and a negative error code in case | ||
592 | * of failure. In case of error, it is possible that something was still | ||
593 | * written to the flash media, but it may be garbage. | ||
594 | */ | ||
595 | int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum, | ||
596 | const void *buf, int offset, int len, int dtype) | ||
597 | { | ||
598 | int idx = vol_id2idx(ubi, vol_id), err, pnum, tries = 0; | ||
599 | struct ubi_volume *vol = ubi->volumes[idx]; | ||
600 | struct ubi_vid_hdr *vid_hdr; | ||
601 | |||
602 | if (ubi->ro_mode) | ||
603 | return -EROFS; | ||
604 | |||
605 | err = leb_write_lock(ubi, vol_id, lnum); | ||
606 | if (err) | ||
607 | return err; | ||
608 | |||
609 | pnum = vol->eba_tbl[lnum]; | ||
610 | if (pnum >= 0) { | ||
611 | dbg_eba("write %d bytes at offset %d of LEB %d:%d, PEB %d", | ||
612 | len, offset, vol_id, lnum, pnum); | ||
613 | |||
614 | err = ubi_io_write_data(ubi, buf, pnum, offset, len); | ||
615 | if (err) { | ||
616 | ubi_warn("failed to write data to PEB %d", pnum); | ||
617 | if (err == -EIO && ubi->bad_allowed) | ||
618 | err = recover_peb(ubi, pnum, vol_id, lnum, buf, offset, len); | ||
619 | if (err) | ||
620 | ubi_ro_mode(ubi); | ||
621 | } | ||
622 | leb_write_unlock(ubi, vol_id, lnum); | ||
623 | return err; | ||
624 | } | ||
625 | |||
626 | /* | ||
627 | * The logical eraseblock is not mapped. We have to get a free physical | ||
628 | * eraseblock and write the volume identifier header there first. | ||
629 | */ | ||
630 | vid_hdr = ubi_zalloc_vid_hdr(ubi); | ||
631 | if (!vid_hdr) { | ||
632 | leb_write_unlock(ubi, vol_id, lnum); | ||
633 | return -ENOMEM; | ||
634 | } | ||
635 | |||
636 | vid_hdr->vol_type = UBI_VID_DYNAMIC; | ||
637 | vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); | ||
638 | vid_hdr->vol_id = cpu_to_ubi32(vol_id); | ||
639 | vid_hdr->lnum = cpu_to_ubi32(lnum); | ||
640 | vid_hdr->compat = ubi_get_compat(ubi, vol_id); | ||
641 | vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad); | ||
642 | |||
643 | retry: | ||
644 | pnum = ubi_wl_get_peb(ubi, dtype); | ||
645 | if (pnum < 0) { | ||
646 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
647 | leb_write_unlock(ubi, vol_id, lnum); | ||
648 | return pnum; | ||
649 | } | ||
650 | |||
651 | dbg_eba("write VID hdr and %d bytes at offset %d of LEB %d:%d, PEB %d", | ||
652 | len, offset, vol_id, lnum, pnum); | ||
653 | |||
654 | err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr); | ||
655 | if (err) { | ||
656 | ubi_warn("failed to write VID header to LEB %d:%d, PEB %d", | ||
657 | vol_id, lnum, pnum); | ||
658 | goto write_error; | ||
659 | } | ||
660 | |||
661 | err = ubi_io_write_data(ubi, buf, pnum, offset, len); | ||
662 | if (err) { | ||
663 | ubi_warn("failed to write %d bytes at offset %d of LEB %d:%d, " | ||
664 | "PEB %d", len, offset, vol_id, lnum, pnum); | ||
665 | goto write_error; | ||
666 | } | ||
667 | |||
668 | vol->eba_tbl[lnum] = pnum; | ||
669 | |||
670 | leb_write_unlock(ubi, vol_id, lnum); | ||
671 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
672 | return 0; | ||
673 | |||
674 | write_error: | ||
675 | if (err != -EIO || !ubi->bad_allowed) { | ||
676 | ubi_ro_mode(ubi); | ||
677 | leb_write_unlock(ubi, vol_id, lnum); | ||
678 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
679 | return err; | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * Fortunately, this is the first write operation to this physical | ||
684 | * eraseblock, so just put it and request a new one. We assume that if | ||
685 | * this physical eraseblock went bad, the erase code will handle that. | ||
686 | */ | ||
687 | err = ubi_wl_put_peb(ubi, pnum, 1); | ||
688 | if (err || ++tries > UBI_IO_RETRIES) { | ||
689 | ubi_ro_mode(ubi); | ||
690 | leb_write_unlock(ubi, vol_id, lnum); | ||
691 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
692 | return err; | ||
693 | } | ||
694 | |||
695 | vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); | ||
696 | ubi_msg("try another PEB"); | ||
697 | goto retry; | ||
698 | } | ||
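A matching write sketch, again purely illustrative and not part of this patch: it writes one aligned chunk to a dynamic-volume LEB with the UBI_UNKNOWN data type, the same way the debugging direct-write path in cdev.c does above. The helper name and the alignment pre-check are assumptions added for clarity.

/*
 * Illustrative in-kernel sketch: write one chunk to a dynamic-volume LEB.
 * Both @offset and @len must be multiples of the minimal I/O unit.
 */
static int example_write_chunk(struct ubi_device *ubi, int vol_id, int lnum,
                               const void *buf, int offset, int len)
{
        if (offset % ubi->min_io_size || len % ubi->min_io_size)
                return -EINVAL;

        return ubi_eba_write_leb(ubi, vol_id, lnum, buf, offset, len,
                                 UBI_UNKNOWN);
}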
699 | |||
700 | /** | ||
701 | * ubi_eba_write_leb_st - write data to static volume. | ||
702 | * @ubi: UBI device description object | ||
703 | * @vol_id: volume ID | ||
704 | * @lnum: logical eraseblock number | ||
705 | * @buf: data to write | ||
706 | * @len: how many bytes to write | ||
707 | * @dtype: data type | ||
708 | * @used_ebs: how many logical eraseblocks will this volume contain | ||
709 | * | ||
710 | * This function writes data to logical eraseblock @lnum of static volume | ||
711 | * @vol_id. The @used_ebs argument should contain the total number of logical | ||
712 | * eraseblocks in this static volume. | ||
713 | * | ||
714 | * When writing to the last logical eraseblock, the @len argument doesn't have | ||
715 | * to be aligned to the minimal I/O unit size. Instead, it has to be equal to | ||
716 | * the real data size, although the @buf buffer has to be padded up to the | ||
717 | * minimal I/O unit size. In all other cases, @len has to be aligned. | ||
718 | * | ||
719 | * It is prohibited to write more than once to logical eraseblocks of static | ||
720 | * volumes. This function returns zero in case of success and a negative error | ||
721 | * code in case of failure. | ||
722 | */ | ||
723 | int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum, | ||
724 | const void *buf, int len, int dtype, int used_ebs) | ||
725 | { | ||
726 | int err, pnum, tries = 0, data_size = len; | ||
727 | int idx = vol_id2idx(ubi, vol_id); | ||
728 | struct ubi_volume *vol = ubi->volumes[idx]; | ||
729 | struct ubi_vid_hdr *vid_hdr; | ||
730 | uint32_t crc; | ||
731 | |||
732 | if (ubi->ro_mode) | ||
733 | return -EROFS; | ||
734 | |||
735 | if (lnum == used_ebs - 1) | ||
736 | /* If this is the last LEB @len may be unaligned */ | ||
737 | len = ALIGN(data_size, ubi->min_io_size); | ||
738 | else | ||
739 | ubi_assert(len % ubi->min_io_size == 0); | ||
740 | |||
741 | vid_hdr = ubi_zalloc_vid_hdr(ubi); | ||
742 | if (!vid_hdr) | ||
743 | return -ENOMEM; | ||
744 | |||
745 | err = leb_write_lock(ubi, vol_id, lnum); | ||
746 | if (err) { | ||
747 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
748 | return err; | ||
749 | } | ||
750 | |||
751 | vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); | ||
752 | vid_hdr->vol_id = cpu_to_ubi32(vol_id); | ||
753 | vid_hdr->lnum = cpu_to_ubi32(lnum); | ||
754 | vid_hdr->compat = ubi_get_compat(ubi, vol_id); | ||
755 | vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad); | ||
756 | |||
757 | crc = crc32(UBI_CRC32_INIT, buf, data_size); | ||
758 | vid_hdr->vol_type = UBI_VID_STATIC; | ||
759 | vid_hdr->data_size = cpu_to_ubi32(data_size); | ||
760 | vid_hdr->used_ebs = cpu_to_ubi32(used_ebs); | ||
761 | vid_hdr->data_crc = cpu_to_ubi32(crc); | ||
762 | |||
763 | retry: | ||
764 | pnum = ubi_wl_get_peb(ubi, dtype); | ||
765 | if (pnum < 0) { | ||
766 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
767 | leb_write_unlock(ubi, vol_id, lnum); | ||
768 | return pnum; | ||
769 | } | ||
770 | |||
771 | dbg_eba("write VID hdr and %d bytes at LEB %d:%d, PEB %d, used_ebs %d", | ||
772 | len, vol_id, lnum, pnum, used_ebs); | ||
773 | |||
774 | err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr); | ||
775 | if (err) { | ||
776 | ubi_warn("failed to write VID header to LEB %d:%d, PEB %d", | ||
777 | vol_id, lnum, pnum); | ||
778 | goto write_error; | ||
779 | } | ||
780 | |||
781 | err = ubi_io_write_data(ubi, buf, pnum, 0, len); | ||
782 | if (err) { | ||
783 | ubi_warn("failed to write %d bytes of data to PEB %d", | ||
784 | len, pnum); | ||
785 | goto write_error; | ||
786 | } | ||
787 | |||
788 | ubi_assert(vol->eba_tbl[lnum] < 0); | ||
789 | vol->eba_tbl[lnum] = pnum; | ||
790 | |||
791 | leb_write_unlock(ubi, vol_id, lnum); | ||
792 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
793 | return 0; | ||
794 | |||
795 | write_error: | ||
796 | if (err != -EIO || !ubi->bad_allowed) { | ||
797 | /* | ||
798 | * This flash device does not admit of bad eraseblocks or | ||
799 | * something nasty and unexpected happened. Switch to read-only | ||
800 | * mode just in case. | ||
801 | */ | ||
802 | ubi_ro_mode(ubi); | ||
803 | leb_write_unlock(ubi, vol_id, lnum); | ||
804 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
805 | return err; | ||
806 | } | ||
807 | |||
808 | err = ubi_wl_put_peb(ubi, pnum, 1); | ||
809 | if (err || ++tries > UBI_IO_RETRIES) { | ||
810 | ubi_ro_mode(ubi); | ||
811 | leb_write_unlock(ubi, vol_id, lnum); | ||
812 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
813 | return err; | ||
814 | } | ||
815 | |||
816 | vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); | ||
817 | ubi_msg("try another PEB"); | ||
818 | goto retry; | ||
819 | } | ||
820 | |||
821 | /** | ||
822 | * ubi_eba_atomic_leb_change - change logical eraseblock atomically. | ||
823 | * @ubi: UBI device description object | ||
824 | * @vol_id: volume ID | ||
825 | * @lnum: logical eraseblock number | ||
826 | * @buf: data to write | ||
827 | * @len: how many bytes to write | ||
828 | * @dtype: data type | ||
829 | * | ||
830 | * This function changes the contents of a logical eraseblock atomically. @buf | ||
831 | * has to contain new logical eraseblock data, and @len - the length of the | ||
832 | * data, which has to be aligned. This function guarantees that in case of an | ||
833 | * unclean reboot the old contents is preserved. Returns zero in case of | ||
834 | * success and a negative error code in case of failure. | ||
835 | */ | ||
836 | int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum, | ||
837 | const void *buf, int len, int dtype) | ||
838 | { | ||
839 | int err, pnum, tries = 0, idx = vol_id2idx(ubi, vol_id); | ||
840 | struct ubi_volume *vol = ubi->volumes[idx]; | ||
841 | struct ubi_vid_hdr *vid_hdr; | ||
842 | uint32_t crc; | ||
843 | |||
844 | if (ubi->ro_mode) | ||
845 | return -EROFS; | ||
846 | |||
847 | vid_hdr = ubi_zalloc_vid_hdr(ubi); | ||
848 | if (!vid_hdr) | ||
849 | return -ENOMEM; | ||
850 | |||
851 | err = leb_write_lock(ubi, vol_id, lnum); | ||
852 | if (err) { | ||
853 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
854 | return err; | ||
855 | } | ||
856 | |||
857 | vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); | ||
858 | vid_hdr->vol_id = cpu_to_ubi32(vol_id); | ||
859 | vid_hdr->lnum = cpu_to_ubi32(lnum); | ||
860 | vid_hdr->compat = ubi_get_compat(ubi, vol_id); | ||
861 | vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad); | ||
862 | |||
863 | crc = crc32(UBI_CRC32_INIT, buf, len); | ||
864 | vid_hdr->vol_type = UBI_VID_STATIC; | ||
865 | vid_hdr->data_size = cpu_to_ubi32(len); | ||
866 | vid_hdr->copy_flag = 1; | ||
867 | vid_hdr->data_crc = cpu_to_ubi32(crc); | ||
868 | |||
869 | retry: | ||
870 | pnum = ubi_wl_get_peb(ubi, dtype); | ||
871 | if (pnum < 0) { | ||
872 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
873 | leb_write_unlock(ubi, vol_id, lnum); | ||
874 | return pnum; | ||
875 | } | ||
876 | |||
877 | dbg_eba("change LEB %d:%d, PEB %d, write VID hdr to PEB %d", | ||
878 | vol_id, lnum, vol->eba_tbl[lnum], pnum); | ||
879 | |||
880 | err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr); | ||
881 | if (err) { | ||
882 | ubi_warn("failed to write VID header to LEB %d:%d, PEB %d", | ||
883 | vol_id, lnum, pnum); | ||
884 | goto write_error; | ||
885 | } | ||
886 | |||
887 | err = ubi_io_write_data(ubi, buf, pnum, 0, len); | ||
888 | if (err) { | ||
889 | ubi_warn("failed to write %d bytes of data to PEB %d", | ||
890 | len, pnum); | ||
891 | goto write_error; | ||
892 | } | ||
893 | |||
894 | err = ubi_wl_put_peb(ubi, vol->eba_tbl[lnum], 1); | ||
895 | if (err) { | ||
896 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
897 | leb_write_unlock(ubi, vol_id, lnum); | ||
898 | return err; | ||
899 | } | ||
900 | |||
901 | vol->eba_tbl[lnum] = pnum; | ||
902 | leb_write_unlock(ubi, vol_id, lnum); | ||
903 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
904 | return 0; | ||
905 | |||
906 | write_error: | ||
907 | if (err != -EIO || !ubi->bad_allowed) { | ||
908 | /* | ||
909 | * This flash device does not admit of bad eraseblocks or | ||
910 | * something nasty and unexpected happened. Switch to read-only | ||
911 | * mode just in case. | ||
912 | */ | ||
913 | ubi_ro_mode(ubi); | ||
914 | leb_write_unlock(ubi, vol_id, lnum); | ||
915 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
916 | return err; | ||
917 | } | ||
918 | |||
919 | err = ubi_wl_put_peb(ubi, pnum, 1); | ||
920 | if (err || ++tries > UBI_IO_RETRIES) { | ||
921 | ubi_ro_mode(ubi); | ||
922 | leb_write_unlock(ubi, vol_id, lnum); | ||
923 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
924 | return err; | ||
925 | } | ||
926 | |||
927 | vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); | ||
928 | ubi_msg("try another PEB"); | ||
929 | goto retry; | ||
930 | } | ||
931 | |||
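As an illustration of the interface above (a minimal sketch, not part of this patch; the helper name and the min_io_size alignment check are assumptions), a caller that atomically replaces the contents of LEB 0 of a volume could look like this:

	/* Sketch only: atomically replace LEB 0 of @vol_id with @len bytes of @data. */
	static int example_replace_leb0(struct ubi_device *ubi, int vol_id,
					const void *data, int len)
	{
		/* the data length has to be aligned (assumed: to the minimal I/O unit) */
		if (len % ubi->min_io_size)
			return -EINVAL;

		/* UBI_UNKNOWN is the data type used elsewhere in this patch */
		return ubi_eba_atomic_leb_change(ubi, vol_id, 0, data, len,
						 UBI_UNKNOWN);
	}
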
932 | /** | ||
933 | * ltree_entry_ctor - lock tree entries slab cache constructor. | ||
934 | * @obj: the lock-tree entry to construct | ||
935 | * @cache: the lock tree entry slab cache | ||
936 | * @flags: constructor flags | ||
937 | */ | ||
938 | static void ltree_entry_ctor(void *obj, struct kmem_cache *cache, | ||
939 | unsigned long flags) | ||
940 | { | ||
941 | struct ltree_entry *le = obj; | ||
942 | |||
943 | if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) != | ||
944 | SLAB_CTOR_CONSTRUCTOR) | ||
945 | return; | ||
946 | |||
947 | le->users = 0; | ||
948 | init_rwsem(&le->mutex); | ||
949 | } | ||
950 | |||
951 | /** | ||
952 | * ubi_eba_copy_leb - copy logical eraseblock. | ||
953 | * @ubi: UBI device description object | ||
954 | * @from: physical eraseblock number from where to copy | ||
955 | * @to: physical eraseblock number where to copy | ||
956 | * @vid_hdr: VID header of the @from physical eraseblock | ||
957 | * | ||
958 | * This function copies logical eraseblock from physical eraseblock @from to | ||
959 | * physical eraseblock @to. The @vid_hdr buffer may be changed by this | ||
960 | * function. Returns zero in case of success, %UBI_IO_BITFLIPS if the operation | ||
961 | * was canceled because bit-flips were detected at the target PEB, and a | ||
962 | * negative error code in case of failure. | ||
963 | */ | ||
964 | int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, | ||
965 | struct ubi_vid_hdr *vid_hdr) | ||
966 | { | ||
967 | int err, vol_id, lnum, data_size, aldata_size, pnum, idx; | ||
968 | struct ubi_volume *vol; | ||
969 | uint32_t crc; | ||
970 | void *buf, *buf1 = NULL; | ||
971 | |||
972 | vol_id = ubi32_to_cpu(vid_hdr->vol_id); | ||
973 | lnum = ubi32_to_cpu(vid_hdr->lnum); | ||
974 | |||
975 | dbg_eba("copy LEB %d:%d, PEB %d to PEB %d", vol_id, lnum, from, to); | ||
976 | |||
977 | if (vid_hdr->vol_type == UBI_VID_STATIC) { | ||
978 | data_size = ubi32_to_cpu(vid_hdr->data_size); | ||
979 | aldata_size = ALIGN(data_size, ubi->min_io_size); | ||
980 | } else | ||
981 | data_size = aldata_size = | ||
982 | ubi->leb_size - ubi32_to_cpu(vid_hdr->data_pad); | ||
983 | |||
984 | buf = kmalloc(aldata_size, GFP_KERNEL); | ||
985 | if (!buf) | ||
986 | return -ENOMEM; | ||
987 | |||
988 | /* | ||
989 | * We do not want anybody to write to this logical eraseblock while we | ||
990 | * are moving it, so we lock it. | ||
991 | */ | ||
992 | err = leb_write_lock(ubi, vol_id, lnum); | ||
993 | if (err) { | ||
994 | kfree(buf); | ||
995 | return err; | ||
996 | } | ||
997 | |||
998 | /* | ||
999 | * But the logical eraseblock might have been put by this time. | ||
1000 | * If so, cancel the operation. | ||
1001 | */ | ||
1002 | idx = vol_id2idx(ubi, vol_id); | ||
1003 | |||
1004 | /* | ||
1005 | * We may race with volume deletion/re-size, so we have to hold | ||
1006 | * @ubi->volumes_lock. | ||
1007 | */ | ||
1008 | spin_lock(&ubi->volumes_lock); | ||
1009 | vol = ubi->volumes[idx]; | ||
1010 | if (!vol) { | ||
1011 | dbg_eba("volume %d was removed meanwhile", vol_id); | ||
1012 | spin_unlock(&ubi->volumes_lock); | ||
1013 | goto out_unlock; | ||
1014 | } | ||
1015 | |||
1016 | pnum = vol->eba_tbl[lnum]; | ||
1017 | if (pnum != from) { | ||
1018 | dbg_eba("LEB %d:%d is no longer mapped to PEB %d, mapped to " | ||
1019 | "PEB %d, cancel", vol_id, lnum, from, pnum); | ||
1020 | spin_unlock(&ubi->volumes_lock); | ||
1021 | goto out_unlock; | ||
1022 | } | ||
1023 | spin_unlock(&ubi->volumes_lock); | ||
1024 | |||
1025 | /* OK, now the LEB is locked and we can safely start moving it */ | ||
1026 | |||
1027 | dbg_eba("read %d bytes of data", aldata_size); | ||
1028 | err = ubi_io_read_data(ubi, buf, from, 0, aldata_size); | ||
1029 | if (err && err != UBI_IO_BITFLIPS) { | ||
1030 | ubi_warn("error %d while reading data from PEB %d", | ||
1031 | err, from); | ||
1032 | goto out_unlock; | ||
1033 | } | ||
1034 | |||
1035 | /* | ||
1036 | * Now we have got to calculate how much data we have to copy. In | ||
1037 | * case of a static volume it is fairly easy - the VID header contains | ||
1038 | * the data size. In case of a dynamic volume it is more difficult - we | ||
1039 | * have to read the contents, cut 0xFF bytes from the end and copy only | ||
1040 | * the first part. We must do this to avoid writing 0xFF bytes as it | ||
1041 | * may have some side-effects. And not only this. It is important not | ||
1042 | * to include those 0xFFs in the CRC because later they may be filled | ||
1043 | * by data. | ||
1044 | */ | ||
1045 | if (vid_hdr->vol_type == UBI_VID_DYNAMIC) | ||
1046 | aldata_size = data_size = | ||
1047 | ubi_calc_data_len(ubi, buf, data_size); | ||
1048 | |||
1049 | cond_resched(); | ||
1050 | crc = crc32(UBI_CRC32_INIT, buf, data_size); | ||
1051 | cond_resched(); | ||
1052 | |||
1053 | /* | ||
1054 | * It may turn out that the whole @from physical eraseblock | ||
1055 | * contains only 0xFF bytes. Then we only have to write the VID header | ||
1056 | * and need not write any data. This also means we should not set | ||
1057 | * @vid_hdr->copy_flag, @vid_hdr->data_size, and @vid_hdr->data_crc. | ||
1058 | */ | ||
1059 | if (data_size > 0) { | ||
1060 | vid_hdr->copy_flag = 1; | ||
1061 | vid_hdr->data_size = cpu_to_ubi32(data_size); | ||
1062 | vid_hdr->data_crc = cpu_to_ubi32(crc); | ||
1063 | } | ||
1064 | vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); | ||
1065 | |||
1066 | err = ubi_io_write_vid_hdr(ubi, to, vid_hdr); | ||
1067 | if (err) | ||
1068 | goto out_unlock; | ||
1069 | |||
1070 | cond_resched(); | ||
1071 | |||
1072 | /* Read the VID header back and check if it was written correctly */ | ||
1073 | err = ubi_io_read_vid_hdr(ubi, to, vid_hdr, 1); | ||
1074 | if (err) { | ||
1075 | if (err != UBI_IO_BITFLIPS) | ||
1076 | ubi_warn("cannot read VID header back from PEB %d", to); | ||
1077 | goto out_unlock; | ||
1078 | } | ||
1079 | |||
1080 | if (data_size > 0) { | ||
1081 | err = ubi_io_write_data(ubi, buf, to, 0, aldata_size); | ||
1082 | if (err) | ||
1083 | goto out_unlock; | ||
1084 | |||
1085 | /* | ||
1086 | * We've written the data and are going to read it back to make | ||
1087 | * sure it was written correctly. | ||
1088 | */ | ||
1089 | buf1 = kmalloc(aldata_size, GFP_KERNEL); | ||
1090 | if (!buf1) { | ||
1091 | err = -ENOMEM; | ||
1092 | goto out_unlock; | ||
1093 | } | ||
1094 | |||
1095 | cond_resched(); | ||
1096 | |||
1097 | err = ubi_io_read_data(ubi, buf1, to, 0, aldata_size); | ||
1098 | if (err) { | ||
1099 | if (err != UBI_IO_BITFLIPS) | ||
1100 | ubi_warn("cannot read data back from PEB %d", | ||
1101 | to); | ||
1102 | goto out_unlock; | ||
1103 | } | ||
1104 | |||
1105 | cond_resched(); | ||
1106 | |||
1107 | if (memcmp(buf, buf1, aldata_size)) { | ||
1108 | ubi_warn("read data back from PEB %d - it is different", | ||
1109 | to); | ||
1110 | goto out_unlock; | ||
1111 | } | ||
1112 | } | ||
1113 | |||
1114 | ubi_assert(vol->eba_tbl[lnum] == from); | ||
1115 | vol->eba_tbl[lnum] = to; | ||
1116 | |||
1117 | leb_write_unlock(ubi, vol_id, lnum); | ||
1118 | kfree(buf); | ||
1119 | kfree(buf1); | ||
1120 | |||
1121 | return 0; | ||
1122 | |||
1123 | out_unlock: | ||
1124 | leb_write_unlock(ubi, vol_id, lnum); | ||
1125 | kfree(buf); | ||
1126 | kfree(buf1); | ||
1127 | return err; | ||
1128 | } | ||
1129 | |||
1130 | /** | ||
1131 | * ubi_eba_init_scan - initialize the EBA unit using scanning information. | ||
1132 | * @ubi: UBI device description object | ||
1133 | * @si: scanning information | ||
1134 | * | ||
1135 | * This function returns zero in case of success and a negative error code in | ||
1136 | * case of failure. | ||
1137 | */ | ||
1138 | int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) | ||
1139 | { | ||
1140 | int i, j, err, num_volumes; | ||
1141 | struct ubi_scan_volume *sv; | ||
1142 | struct ubi_volume *vol; | ||
1143 | struct ubi_scan_leb *seb; | ||
1144 | struct rb_node *rb; | ||
1145 | |||
1146 | dbg_eba("initialize EBA unit"); | ||
1147 | |||
1148 | spin_lock_init(&ubi->ltree_lock); | ||
1149 | ubi->ltree = RB_ROOT; | ||
1150 | |||
1151 | if (ubi_devices_cnt == 0) { | ||
1152 | ltree_slab = kmem_cache_create("ubi_ltree_slab", | ||
1153 | sizeof(struct ltree_entry), 0, | ||
1154 | 0, &ltree_entry_ctor, NULL); | ||
1155 | if (!ltree_slab) | ||
1156 | return -ENOMEM; | ||
1157 | } | ||
1158 | |||
1159 | ubi->global_sqnum = si->max_sqnum + 1; | ||
1160 | num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT; | ||
1161 | |||
1162 | for (i = 0; i < num_volumes; i++) { | ||
1163 | vol = ubi->volumes[i]; | ||
1164 | if (!vol) | ||
1165 | continue; | ||
1166 | |||
1167 | cond_resched(); | ||
1168 | |||
1169 | vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int), | ||
1170 | GFP_KERNEL); | ||
1171 | if (!vol->eba_tbl) { | ||
1172 | err = -ENOMEM; | ||
1173 | goto out_free; | ||
1174 | } | ||
1175 | |||
1176 | for (j = 0; j < vol->reserved_pebs; j++) | ||
1177 | vol->eba_tbl[j] = UBI_LEB_UNMAPPED; | ||
1178 | |||
1179 | sv = ubi_scan_find_sv(si, idx2vol_id(ubi, i)); | ||
1180 | if (!sv) | ||
1181 | continue; | ||
1182 | |||
1183 | ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) { | ||
1184 | if (seb->lnum >= vol->reserved_pebs) | ||
1185 | /* | ||
1186 | * This may happen in case of an unclean reboot | ||
1187 | * during re-size. | ||
1188 | */ | ||
1189 | ubi_scan_move_to_list(sv, seb, &si->erase); | ||
1190 | vol->eba_tbl[seb->lnum] = seb->pnum; | ||
1191 | } | ||
1192 | } | ||
1193 | |||
1194 | if (ubi->bad_allowed) { | ||
1195 | ubi_calculate_reserved(ubi); | ||
1196 | |||
1197 | if (ubi->avail_pebs < ubi->beb_rsvd_level) { | ||
1198 | /* Not enough free physical eraseblocks */ | ||
1199 | ubi->beb_rsvd_pebs = ubi->avail_pebs; | ||
1200 | ubi_warn("cannot reserve enough PEBs for bad PEB " | ||
1201 | "handling, reserved %d, need %d", | ||
1202 | ubi->beb_rsvd_pebs, ubi->beb_rsvd_level); | ||
1203 | } else | ||
1204 | ubi->beb_rsvd_pebs = ubi->beb_rsvd_level; | ||
1205 | |||
1206 | ubi->avail_pebs -= ubi->beb_rsvd_pebs; | ||
1207 | ubi->rsvd_pebs += ubi->beb_rsvd_pebs; | ||
1208 | } | ||
1209 | |||
1210 | dbg_eba("EBA unit is initialized"); | ||
1211 | return 0; | ||
1212 | |||
1213 | out_free: | ||
1214 | for (i = 0; i < num_volumes; i++) { | ||
1215 | if (!ubi->volumes[i]) | ||
1216 | continue; | ||
1217 | kfree(ubi->volumes[i]->eba_tbl); | ||
1218 | } | ||
1219 | if (ubi_devices_cnt == 0) | ||
1220 | kmem_cache_destroy(ltree_slab); | ||
1221 | return err; | ||
1222 | } | ||
1223 | |||
1224 | /** | ||
1225 | * ubi_eba_close - close EBA unit. | ||
1226 | * @ubi: UBI device description object | ||
1227 | */ | ||
1228 | void ubi_eba_close(const struct ubi_device *ubi) | ||
1229 | { | ||
1230 | int i, num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT; | ||
1231 | |||
1232 | dbg_eba("close EBA unit"); | ||
1233 | |||
1234 | for (i = 0; i < num_volumes; i++) { | ||
1235 | if (!ubi->volumes[i]) | ||
1236 | continue; | ||
1237 | kfree(ubi->volumes[i]->eba_tbl); | ||
1238 | } | ||
1239 | if (ubi_devices_cnt == 1) | ||
1240 | kmem_cache_destroy(ltree_slab); | ||
1241 | } | ||
diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c new file mode 100644 index 000000000000..fc9478d605ff --- /dev/null +++ b/drivers/mtd/ubi/gluebi.c | |||
@@ -0,0 +1,323 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём), Joern Engel | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * This file includes implementation of fake MTD devices for each UBI volume. | ||
23 | * This sounds strange, but it is in fact quite useful to make MTD-oriented | ||
24 | * software (including all the legacy software) work on top of UBI. | ||
25 | * | ||
26 | * Gluebi emulates MTD devices of "MTD_UBIVOLUME" type. Their minimal I/O unit | ||
27 | * size (mtd->writesize) is equivalent to the UBI minimal I/O unit. The | ||
28 | * eraseblock size is equivalent to the logical eraseblock size of the volume. | ||
29 | */ | ||
30 | |||
31 | #include <asm/div64.h> | ||
32 | #include "ubi.h" | ||
33 | |||
34 | /** | ||
35 | * gluebi_get_device - get MTD device reference. | ||
36 | * @mtd: the MTD device description object | ||
37 | * | ||
38 | * This function is called every time the MTD device is being opened and | ||
39 | * implements the MTD get_device() operation. Returns zero in case of success | ||
40 | * and a negative error code in case of failure. | ||
41 | */ | ||
42 | static int gluebi_get_device(struct mtd_info *mtd) | ||
43 | { | ||
44 | struct ubi_volume *vol; | ||
45 | |||
46 | vol = container_of(mtd, struct ubi_volume, gluebi_mtd); | ||
47 | |||
48 | /* | ||
49 | * We do not introduce locks for gluebi reference count because the | ||
50 | * get_device()/put_device() calls are already serialized at MTD. | ||
51 | */ | ||
52 | if (vol->gluebi_refcount > 0) { | ||
53 | /* | ||
54 | * The MTD device is already referenced and this is just one | ||
55 | * more reference. MTD allows many users to open the same | ||
56 | * volume simultaneously and does not distinguish between | ||
57 | * readers/writers/exclusive openers as UBI does. So we do not | ||
58 | * open the UBI volume again - just increase the reference | ||
59 | * counter and return. | ||
60 | */ | ||
61 | vol->gluebi_refcount += 1; | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * This is the first reference to this UBI volume via the MTD device | ||
67 | * interface. Open the corresponding volume in read-write mode. | ||
68 | */ | ||
69 | vol->gluebi_desc = ubi_open_volume(vol->ubi->ubi_num, vol->vol_id, | ||
70 | UBI_READWRITE); | ||
71 | if (IS_ERR(vol->gluebi_desc)) | ||
72 | return PTR_ERR(vol->gluebi_desc); | ||
73 | vol->gluebi_refcount += 1; | ||
74 | return 0; | ||
75 | } | ||
76 | |||
77 | /** | ||
78 | * gluebi_put_device - put MTD device reference. | ||
79 | * @mtd: the MTD device description object | ||
80 | * | ||
81 | * This function is called every time the MTD device is being put and | ||
82 | * implements the MTD put_device() operation. | ||
83 | */ | ||
84 | static void gluebi_put_device(struct mtd_info *mtd) | ||
85 | { | ||
86 | struct ubi_volume *vol; | ||
87 | |||
88 | vol = container_of(mtd, struct ubi_volume, gluebi_mtd); | ||
89 | vol->gluebi_refcount -= 1; | ||
90 | ubi_assert(vol->gluebi_refcount >= 0); | ||
91 | if (vol->gluebi_refcount == 0) | ||
92 | ubi_close_volume(vol->gluebi_desc); | ||
93 | } | ||
94 | |||
95 | /** | ||
96 | * gluebi_read - read operation of emulated MTD devices. | ||
97 | * @mtd: MTD device description object | ||
98 | * @from: absolute offset from where to read | ||
99 | * @len: how many bytes to read | ||
100 | * @retlen: count of read bytes is returned here | ||
101 | * @buf: buffer to store the read data | ||
102 | * | ||
103 | * This function returns zero in case of success and a negative error code in | ||
104 | * case of failure. | ||
105 | */ | ||
106 | static int gluebi_read(struct mtd_info *mtd, loff_t from, size_t len, | ||
107 | size_t *retlen, unsigned char *buf) | ||
108 | { | ||
109 | int err = 0, lnum, offs, total_read; | ||
110 | struct ubi_volume *vol; | ||
111 | struct ubi_device *ubi; | ||
112 | uint64_t tmp = from; | ||
113 | |||
114 | dbg_msg("read %zd bytes from offset %lld", len, from); | ||
115 | |||
116 | if (len < 0 || from < 0 || from + len > mtd->size) | ||
117 | return -EINVAL; | ||
118 | |||
119 | vol = container_of(mtd, struct ubi_volume, gluebi_mtd); | ||
120 | ubi = vol->ubi; | ||
121 | |||
122 | offs = do_div(tmp, mtd->erasesize); | ||
123 | lnum = tmp; | ||
124 | |||
125 | total_read = len; | ||
126 | while (total_read) { | ||
127 | size_t to_read = mtd->erasesize - offs; | ||
128 | |||
129 | if (to_read > total_read) | ||
130 | to_read = total_read; | ||
131 | |||
132 | err = ubi_eba_read_leb(ubi, vol->vol_id, lnum, buf, offs, | ||
133 | to_read, 0); | ||
134 | if (err) | ||
135 | break; | ||
136 | |||
137 | lnum += 1; | ||
138 | offs = 0; | ||
139 | total_read -= to_read; | ||
140 | buf += to_read; | ||
141 | } | ||
142 | |||
143 | *retlen = len - total_read; | ||
144 | return err; | ||
145 | } | ||
146 | |||
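The loop above walks the flat address space of the emulated MTD device in logical-eraseblock-sized chunks. A minimal sketch of the same offset split (hypothetical helper, not part of the patch) using do_div() exactly as gluebi_read() and gluebi_write() do:

	/* Sketch: split a flat gluebi offset into a LEB number and an in-LEB offset. */
	static void example_split_offset(loff_t from, uint32_t erasesize,
					 int *lnum, int *offs)
	{
		uint64_t tmp = from;

		*offs = do_div(tmp, erasesize);	/* remainder: offset inside the LEB */
		*lnum = tmp;			/* quotient: logical eraseblock number */
	}
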
147 | /** | ||
148 | * gluebi_write - write operation of emulated MTD devices. | ||
149 | * @mtd: MTD device description object | ||
150 | * @to: absolute offset where to write | ||
151 | * @len: how many bytes to write | ||
152 | * @retlen: count of written bytes is returned here | ||
153 | * @buf: buffer with data to write | ||
154 | * | ||
155 | * This function returns zero in case of success and a negative error code in | ||
156 | * case of failure. | ||
157 | */ | ||
158 | static int gluebi_write(struct mtd_info *mtd, loff_t to, size_t len, | ||
159 | size_t *retlen, const u_char *buf) | ||
160 | { | ||
161 | int err = 0, lnum, offs, total_written; | ||
162 | struct ubi_volume *vol; | ||
163 | struct ubi_device *ubi; | ||
164 | uint64_t tmp = to; | ||
165 | |||
166 | dbg_msg("write %zd bytes to offset %lld", len, to); | ||
167 | |||
168 | if (len < 0 || to < 0 || len + to > mtd->size) | ||
169 | return -EINVAL; | ||
170 | |||
171 | vol = container_of(mtd, struct ubi_volume, gluebi_mtd); | ||
172 | ubi = vol->ubi; | ||
173 | |||
174 | if (ubi->ro_mode) | ||
175 | return -EROFS; | ||
176 | |||
177 | offs = do_div(tmp, mtd->erasesize); | ||
178 | lnum = tmp; | ||
179 | |||
180 | if (len % mtd->writesize || offs % mtd->writesize) | ||
181 | return -EINVAL; | ||
182 | |||
183 | total_written = len; | ||
184 | while (total_written) { | ||
185 | size_t to_write = mtd->erasesize - offs; | ||
186 | |||
187 | if (to_write > total_written) | ||
188 | to_write = total_written; | ||
189 | |||
190 | err = ubi_eba_write_leb(ubi, vol->vol_id, lnum, buf, offs, | ||
191 | to_write, UBI_UNKNOWN); | ||
192 | if (err) | ||
193 | break; | ||
194 | |||
195 | lnum += 1; | ||
196 | offs = 0; | ||
197 | total_written -= to_write; | ||
198 | buf += to_write; | ||
199 | } | ||
200 | |||
201 | *retlen = len - total_written; | ||
202 | return err; | ||
203 | } | ||
204 | |||
205 | /** | ||
206 | * gluebi_erase - erase operation of emulated MTD devices. | ||
207 | * @mtd: the MTD device description object | ||
208 | * @instr: the erase operation description | ||
209 | * | ||
210 | * This function calls the erase callback when it finishes. Returns zero in case | ||
211 | * of success and a negative error code in case of failure. | ||
212 | */ | ||
213 | static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) | ||
214 | { | ||
215 | int err, i, lnum, count; | ||
216 | struct ubi_volume *vol; | ||
217 | struct ubi_device *ubi; | ||
218 | |||
219 | dbg_msg("erase %u bytes at offset %u", instr->len, instr->addr); | ||
220 | |||
221 | if (instr->addr < 0 || instr->addr > mtd->size - mtd->erasesize) | ||
222 | return -EINVAL; | ||
223 | |||
224 | if (instr->len < 0 || instr->addr + instr->len > mtd->size) | ||
225 | return -EINVAL; | ||
226 | |||
227 | if (instr->addr % mtd->writesize || instr->len % mtd->writesize) | ||
228 | return -EINVAL; | ||
229 | |||
230 | lnum = instr->addr / mtd->erasesize; | ||
231 | count = instr->len / mtd->erasesize; | ||
232 | |||
233 | vol = container_of(mtd, struct ubi_volume, gluebi_mtd); | ||
234 | ubi = vol->ubi; | ||
235 | |||
236 | if (ubi->ro_mode) | ||
237 | return -EROFS; | ||
238 | |||
239 | for (i = 0; i < count; i++) { | ||
240 | err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum + i); | ||
241 | if (err) | ||
242 | goto out_err; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * MTD erase operations are synchronous, so we have to make sure the | ||
247 | * physical eraseblock is wiped out. | ||
248 | */ | ||
249 | err = ubi_wl_flush(ubi); | ||
250 | if (err) | ||
251 | goto out_err; | ||
252 | |||
253 | instr->state = MTD_ERASE_DONE; | ||
254 | mtd_erase_callback(instr); | ||
255 | return 0; | ||
256 | |||
257 | out_err: | ||
258 | instr->state = MTD_ERASE_FAILED; | ||
259 | instr->fail_addr = lnum * mtd->erasesize; | ||
260 | return err; | ||
261 | } | ||
262 | |||
263 | /** | ||
264 | * ubi_create_gluebi - initialize gluebi for an UBI volume. | ||
265 | * @ubi: UBI device description object | ||
266 | * @vol: volume description object | ||
267 | * | ||
268 | * This function is called when an UBI volume is created in order to create | ||
269 | * the corresponding fake MTD device. Returns zero in case of success and a | ||
270 | * negative error code in case of failure. | ||
271 | */ | ||
272 | int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol) | ||
273 | { | ||
274 | struct mtd_info *mtd = &vol->gluebi_mtd; | ||
275 | |||
276 | mtd->name = kmemdup(vol->name, vol->name_len + 1, GFP_KERNEL); | ||
277 | if (!mtd->name) | ||
278 | return -ENOMEM; | ||
279 | |||
280 | mtd->type = MTD_UBIVOLUME; | ||
281 | if (!ubi->ro_mode) | ||
282 | mtd->flags = MTD_WRITEABLE; | ||
283 | mtd->writesize = ubi->min_io_size; | ||
284 | mtd->owner = THIS_MODULE; | ||
285 | mtd->size = vol->usable_leb_size * vol->reserved_pebs; | ||
286 | mtd->erasesize = vol->usable_leb_size; | ||
287 | mtd->read = gluebi_read; | ||
288 | mtd->write = gluebi_write; | ||
289 | mtd->erase = gluebi_erase; | ||
290 | mtd->get_device = gluebi_get_device; | ||
291 | mtd->put_device = gluebi_put_device; | ||
292 | |||
293 | if (add_mtd_device(mtd)) { | ||
294 | ubi_err("cannot add MTD device"); | ||
295 | kfree(mtd->name); | ||
296 | return -ENFILE; | ||
297 | } | ||
298 | |||
299 | dbg_msg("added mtd%d (\"%s\"), size %u, EB size %u", | ||
300 | mtd->index, mtd->name, mtd->size, mtd->erasesize); | ||
301 | return 0; | ||
302 | } | ||
303 | |||
304 | /** | ||
305 | * ubi_destroy_gluebi - close gluebi for an UBI volume. | ||
306 | * @vol: volume description object | ||
307 | * | ||
308 | * This function is called when an UBI volume is removed in order to remove | ||
309 | * the corresponding fake MTD device. Returns zero in case of success and a | ||
310 | * negative error code in case of failure. | ||
311 | */ | ||
312 | int ubi_destroy_gluebi(struct ubi_volume *vol) | ||
313 | { | ||
314 | int err; | ||
315 | struct mtd_info *mtd = &vol->gluebi_mtd; | ||
316 | |||
317 | dbg_msg("remove mtd%d", mtd->index); | ||
318 | err = del_mtd_device(mtd); | ||
319 | if (err) | ||
320 | return err; | ||
321 | kfree(mtd->name); | ||
322 | return 0; | ||
323 | } | ||
diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c new file mode 100644 index 000000000000..438914d05151 --- /dev/null +++ b/drivers/mtd/ubi/io.c | |||
@@ -0,0 +1,1259 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * Copyright (c) Nokia Corporation, 2006, 2007 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | * | ||
19 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
20 | */ | ||
21 | |||
22 | /* | ||
23 | * UBI input/output unit. | ||
24 | * | ||
25 | * This unit provides a uniform way to work with all kinds of underlying | ||
26 | * MTD devices. It also implements handy functions for reading and writing UBI | ||
27 | * headers. | ||
28 | * | ||
29 | * We are trying to have a paranoid mindset and not to trust what we read | ||
30 | * from the flash media in order to be more secure and robust. So this unit | ||
31 | * validates every single header it reads from the flash media. | ||
32 | * | ||
33 | * Some words about how the eraseblock headers are stored. | ||
34 | * | ||
35 | * The erase counter header is always stored at offset zero. By default, the | ||
36 | * VID header is stored after the EC header at the closest aligned offset | ||
37 | * (i.e. aligned to the minimum I/O unit size). Data starts next to the VID | ||
38 | * header at the closest aligned offset. But this default layout may be | ||
39 | * changed. For example, for different reasons (e.g., optimization) UBI may be | ||
40 | * asked to put the VID header at further offset, and even at an unaligned | ||
41 | * offset. Of course, if the offset of the VID header is unaligned, UBI adds | ||
42 | * proper padding in front of it. Data offset may also be changed but it has to | ||
43 | * be aligned. | ||
44 | * | ||
45 | * About minimal I/O units. In general, UBI assumes flash device model where | ||
46 | * there is only one minimal I/O unit size. E.g., in case of NOR flash it is 1, | ||
47 | * in case of NAND flash it is a NAND page, etc. This is reported by MTD in the | ||
48 | * @ubi->mtd->writesize field. But as an exception, UBI admits of using another | ||
49 | * (smaller) minimal I/O unit size for EC and VID headers to make it possible | ||
50 | * to do different optimizations. | ||
51 | * | ||
52 | * This is extremely useful in case of NAND flashes which admit of several | ||
53 | * write operations to one NAND page. In this case UBI can fit EC and VID | ||
54 | * headers at one NAND page. Thus, UBI may use "sub-page" size as the minimal | ||
55 | * I/O unit for the headers (the @ubi->hdrs_min_io_size field). But it still | ||
56 | * reports NAND page size (@ubi->min_io_size) as a minimal I/O unit for the UBI | ||
57 | * users. | ||
58 | * | ||
59 | * Example: some Samsung NANDs with 2KiB pages allow 4x 512-byte writes, so | ||
60 | * although the minimal I/O unit is 2K, UBI uses 512 bytes for EC and VID | ||
61 | * headers. | ||
62 | * | ||
63 | * Q: why not just treat the sub-page as the minimal I/O unit of this flash | ||
64 | * device, e.g., make @ubi->min_io_size = 512 in the example above? | ||
65 | * | ||
66 | * A: because when writing a sub-page, MTD still writes a full 2K page but the | ||
67 | * bytes which are not relevant to the sub-page are 0xFF. So, basically, writing | ||
68 | * 4x512 sub-pages is 4 times slower than writing one 2KiB NAND page. Thus, we | ||
69 | * prefer to use sub-pages only for EC and VID headers. | ||
70 | * | ||
71 | * As it was noted above, the VID header may start at a non-aligned offset. | ||
72 | * For example, in case of a 2KiB page NAND flash with a 512 bytes sub-page, | ||
73 | * the VID header may reside at offset 1984 which is the last 64 bytes of the | ||
74 | * last sub-page (EC header is always at offset zero). This causes some | ||
75 | * difficulties when reading and writing VID headers. | ||
76 | * | ||
77 | * Suppose we have a 64-byte buffer and we read a VID header into it. We change | ||
78 | * the data and want to write this VID header out. As we can only write in | ||
79 | * 512-byte chunks, we have to allocate one more buffer and copy our VID header | ||
80 | * to offset 448 of this buffer. | ||
81 | * | ||
82 | * The I/O unit does the following trick in order to avoid this extra copy. | ||
83 | * It always allocates a @ubi->vid_hdr_alsize bytes buffer for the VID header | ||
84 | * and returns a pointer to offset @ubi->vid_hdr_shift of this buffer. When the | ||
85 | * VID header is being written out, it shifts the VID header pointer back and | ||
86 | * writes the whole sub-page. | ||
87 | */ | ||
88 | |||
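A minimal sketch of the placement arithmetic described above (hypothetical helper, not part of the patch; it assumes the sub-page size is a power of two and that ALIGN() and UBI_VID_HDR_SIZE carry their usual kernel/UBI meanings; the real fields are set up when UBI attaches an MTD device, outside this file):

	/* Sketch: derive VID header placement and the start of the data area. */
	static int example_layout(int vid_hdr_offset, int min_io_size,
				  int hdrs_min_io_size, int *vid_hdr_shift)
	{
		/* sub-page-aligned offset at which the VID header buffer is read/written */
		int vid_hdr_aloffset = vid_hdr_offset & ~(hdrs_min_io_size - 1);

		/* position of the VID header inside that sub-page */
		*vid_hdr_shift = vid_hdr_offset - vid_hdr_aloffset;

		/* the data area starts at the next minimal I/O unit after the VID header */
		return ALIGN(vid_hdr_offset + UBI_VID_HDR_SIZE, min_io_size);
	}
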
89 | #include <linux/crc32.h> | ||
90 | #include <linux/err.h> | ||
91 | #include "ubi.h" | ||
92 | |||
93 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
94 | static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum); | ||
95 | static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum); | ||
96 | static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum, | ||
97 | const struct ubi_ec_hdr *ec_hdr); | ||
98 | static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum); | ||
99 | static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum, | ||
100 | const struct ubi_vid_hdr *vid_hdr); | ||
101 | static int paranoid_check_all_ff(const struct ubi_device *ubi, int pnum, | ||
102 | int offset, int len); | ||
103 | #else | ||
104 | #define paranoid_check_not_bad(ubi, pnum) 0 | ||
105 | #define paranoid_check_peb_ec_hdr(ubi, pnum) 0 | ||
106 | #define paranoid_check_ec_hdr(ubi, pnum, ec_hdr) 0 | ||
107 | #define paranoid_check_peb_vid_hdr(ubi, pnum) 0 | ||
108 | #define paranoid_check_vid_hdr(ubi, pnum, vid_hdr) 0 | ||
109 | #define paranoid_check_all_ff(ubi, pnum, offset, len) 0 | ||
110 | #endif | ||
111 | |||
112 | /** | ||
113 | * ubi_io_read - read data from a physical eraseblock. | ||
114 | * @ubi: UBI device description object | ||
115 | * @buf: buffer where to store the read data | ||
116 | * @pnum: physical eraseblock number to read from | ||
117 | * @offset: offset within the physical eraseblock from where to read | ||
118 | * @len: how many bytes to read | ||
119 | * | ||
120 | * This function reads data from offset @offset of physical eraseblock @pnum | ||
121 | * and stores the read data in the @buf buffer. The following return codes are | ||
122 | * possible: | ||
123 | * | ||
124 | * o %0 if all the requested data were successfully read; | ||
125 | * o %UBI_IO_BITFLIPS if all the requested data were successfully read, but | ||
126 | * correctable bit-flips were detected; this is harmless but may indicate | ||
127 | * that this eraseblock may become bad soon (but do not have to); | ||
128 | * o %-EBADMSG if the MTD subsystem reported about data data integrity | ||
129 | * problems, for example it can me an ECC error in case of NAND; this most | ||
130 | * probably means that the data is corrupted; | ||
131 | * o %-EIO if some I/O error occurred; | ||
132 | * o other negative error codes in case of other errors. | ||
133 | */ | ||
134 | int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset, | ||
135 | int len) | ||
136 | { | ||
137 | int err, retries = 0; | ||
138 | size_t read; | ||
139 | loff_t addr; | ||
140 | |||
141 | dbg_io("read %d bytes from PEB %d:%d", len, pnum, offset); | ||
142 | |||
143 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
144 | ubi_assert(offset >= 0 && offset + len <= ubi->peb_size); | ||
145 | ubi_assert(len > 0); | ||
146 | |||
147 | err = paranoid_check_not_bad(ubi, pnum); | ||
148 | if (err) | ||
149 | return err > 0 ? -EINVAL : err; | ||
150 | |||
151 | addr = (loff_t)pnum * ubi->peb_size + offset; | ||
152 | retry: | ||
153 | err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf); | ||
154 | if (err) { | ||
155 | if (err == -EUCLEAN) { | ||
156 | /* | ||
157 | * -EUCLEAN is reported if there was a bit-flip which | ||
158 | * was corrected, so this is harmless. | ||
159 | */ | ||
160 | ubi_msg("fixable bit-flip detected at PEB %d", pnum); | ||
161 | ubi_assert(len == read); | ||
162 | return UBI_IO_BITFLIPS; | ||
163 | } | ||
164 | |||
165 | if (read != len && retries++ < UBI_IO_RETRIES) { | ||
166 | dbg_io("error %d while reading %d bytes from PEB %d:%d, " | ||
167 | "read only %zd bytes, retry", | ||
168 | err, len, pnum, offset, read); | ||
169 | yield(); | ||
170 | goto retry; | ||
171 | } | ||
172 | |||
173 | ubi_err("error %d while reading %d bytes from PEB %d:%d, " | ||
174 | "read %zd bytes", err, len, pnum, offset, read); | ||
175 | ubi_dbg_dump_stack(); | ||
176 | } else { | ||
177 | ubi_assert(len == read); | ||
178 | |||
179 | if (ubi_dbg_is_bitflip()) { | ||
180 | dbg_msg("bit-flip (emulated)"); | ||
181 | err = UBI_IO_BITFLIPS; | ||
182 | } | ||
183 | } | ||
184 | |||
185 | return err; | ||
186 | } | ||
187 | |||
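A hedged usage sketch for the return codes documented above (the wrapper is hypothetical, not part of the patch): a typical caller treats %UBI_IO_BITFLIPS as "data is good, but the PEB should be scrubbed eventually" and %-EBADMSG as "data is probably corrupted":

	/* Sketch: read PEB data and map ubi_io_read() return codes for a caller. */
	static int example_read(const struct ubi_device *ubi, void *buf,
				int pnum, int offset, int len, int *scrub)
	{
		int err = ubi_io_read(ubi, buf, pnum, offset, len);

		if (err == UBI_IO_BITFLIPS) {
			*scrub = 1;	/* data is valid, but the PEB may become bad soon */
			return 0;
		}
		return err;		/* 0, -EBADMSG, -EIO or another error code */
	}
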
188 | /** | ||
189 | * ubi_io_write - write data to a physical eraseblock. | ||
190 | * @ubi: UBI device description object | ||
191 | * @buf: buffer with the data to write | ||
192 | * @pnum: physical eraseblock number to write to | ||
193 | * @offset: offset within the physical eraseblock where to write | ||
194 | * @len: how many bytes to write | ||
195 | * | ||
196 | * This function writes @len bytes of data from buffer @buf to offset @offset | ||
197 | * of physical eraseblock @pnum. If all the data were successfully written, | ||
198 | * zero is returned. If an error occurred, this function returns a negative | ||
199 | * error code. If %-EIO is returned, the physical eraseblock most probably went | ||
200 | * bad. | ||
201 | * | ||
202 | * Note, in case of an error, it is possible that something was still written | ||
203 | * to the flash media, but it may be garbage. | ||
204 | */ | ||
205 | int ubi_io_write(const struct ubi_device *ubi, const void *buf, int pnum, | ||
206 | int offset, int len) | ||
207 | { | ||
208 | int err; | ||
209 | size_t written; | ||
210 | loff_t addr; | ||
211 | |||
212 | dbg_io("write %d bytes to PEB %d:%d", len, pnum, offset); | ||
213 | |||
214 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
215 | ubi_assert(offset >= 0 && offset + len <= ubi->peb_size); | ||
216 | ubi_assert(offset % ubi->hdrs_min_io_size == 0); | ||
217 | ubi_assert(len > 0 && len % ubi->hdrs_min_io_size == 0); | ||
218 | |||
219 | if (ubi->ro_mode) { | ||
220 | ubi_err("read-only mode"); | ||
221 | return -EROFS; | ||
222 | } | ||
223 | |||
224 | /* The below has to be compiled out if paranoid checks are disabled */ | ||
225 | |||
226 | err = paranoid_check_not_bad(ubi, pnum); | ||
227 | if (err) | ||
228 | return err > 0 ? -EINVAL : err; | ||
229 | |||
230 | /* The area we are writing to has to contain all 0xFF bytes */ | ||
231 | err = paranoid_check_all_ff(ubi, pnum, offset, len); | ||
232 | if (err) | ||
233 | return err > 0 ? -EINVAL : err; | ||
234 | |||
235 | if (offset >= ubi->leb_start) { | ||
236 | /* | ||
237 | * We write to the data area of the physical eraseblock. Make | ||
238 | * sure it has valid EC and VID headers. | ||
239 | */ | ||
240 | err = paranoid_check_peb_ec_hdr(ubi, pnum); | ||
241 | if (err) | ||
242 | return err > 0 ? -EINVAL : err; | ||
243 | err = paranoid_check_peb_vid_hdr(ubi, pnum); | ||
244 | if (err) | ||
245 | return err > 0 ? -EINVAL : err; | ||
246 | } | ||
247 | |||
248 | if (ubi_dbg_is_write_failure()) { | ||
249 | dbg_err("cannot write %d bytes to PEB %d:%d " | ||
250 | "(emulated)", len, pnum, offset); | ||
251 | ubi_dbg_dump_stack(); | ||
252 | return -EIO; | ||
253 | } | ||
254 | |||
255 | addr = (loff_t)pnum * ubi->peb_size + offset; | ||
256 | err = ubi->mtd->write(ubi->mtd, addr, len, &written, buf); | ||
257 | if (err) { | ||
258 | ubi_err("error %d while writing %d bytes to PEB %d:%d, written" | ||
259 | " %zd bytes", err, len, pnum, offset, written); | ||
260 | ubi_dbg_dump_stack(); | ||
261 | } else | ||
262 | ubi_assert(written == len); | ||
263 | |||
264 | return err; | ||
265 | } | ||
266 | |||
267 | /** | ||
268 | * erase_callback - MTD erasure call-back. | ||
269 | * @ei: MTD erase information object. | ||
270 | * | ||
271 | * Note, even though MTD erase interface is asynchronous, all the current | ||
272 | * implementations are synchronous anyway. | ||
273 | */ | ||
274 | static void erase_callback(struct erase_info *ei) | ||
275 | { | ||
276 | wake_up_interruptible((wait_queue_head_t *)ei->priv); | ||
277 | } | ||
278 | |||
279 | /** | ||
280 | * do_sync_erase - synchronously erase a physical eraseblock. | ||
281 | * @ubi: UBI device description object | ||
282 | * @pnum: the physical eraseblock number to erase | ||
283 | * | ||
284 | * This function synchronously erases physical eraseblock @pnum and returns | ||
285 | * zero in case of success and a negative error code in case of failure. If | ||
286 | * %-EIO is returned, the physical eraseblock most probably went bad. | ||
287 | */ | ||
288 | static int do_sync_erase(const struct ubi_device *ubi, int pnum) | ||
289 | { | ||
290 | int err, retries = 0; | ||
291 | struct erase_info ei; | ||
292 | wait_queue_head_t wq; | ||
293 | |||
294 | dbg_io("erase PEB %d", pnum); | ||
295 | |||
296 | retry: | ||
297 | init_waitqueue_head(&wq); | ||
298 | memset(&ei, 0, sizeof(struct erase_info)); | ||
299 | |||
300 | ei.mtd = ubi->mtd; | ||
301 | ei.addr = pnum * ubi->peb_size; | ||
302 | ei.len = ubi->peb_size; | ||
303 | ei.callback = erase_callback; | ||
304 | ei.priv = (unsigned long)&wq; | ||
305 | |||
306 | err = ubi->mtd->erase(ubi->mtd, &ei); | ||
307 | if (err) { | ||
308 | if (retries++ < UBI_IO_RETRIES) { | ||
309 | dbg_io("error %d while erasing PEB %d, retry", | ||
310 | err, pnum); | ||
311 | yield(); | ||
312 | goto retry; | ||
313 | } | ||
314 | ubi_err("cannot erase PEB %d, error %d", pnum, err); | ||
315 | ubi_dbg_dump_stack(); | ||
316 | return err; | ||
317 | } | ||
318 | |||
319 | err = wait_event_interruptible(wq, ei.state == MTD_ERASE_DONE || | ||
320 | ei.state == MTD_ERASE_FAILED); | ||
321 | if (err) { | ||
322 | ubi_err("interrupted PEB %d erasure", pnum); | ||
323 | return -EINTR; | ||
324 | } | ||
325 | |||
326 | if (ei.state == MTD_ERASE_FAILED) { | ||
327 | if (retries++ < UBI_IO_RETRIES) { | ||
328 | dbg_io("error while erasing PEB %d, retry", pnum); | ||
329 | yield(); | ||
330 | goto retry; | ||
331 | } | ||
332 | ubi_err("cannot erase PEB %d", pnum); | ||
333 | ubi_dbg_dump_stack(); | ||
334 | return -EIO; | ||
335 | } | ||
336 | |||
337 | err = paranoid_check_all_ff(ubi, pnum, 0, ubi->peb_size); | ||
338 | if (err) | ||
339 | return err > 0 ? -EINVAL : err; | ||
340 | |||
341 | if (ubi_dbg_is_erase_failure() && !err) { | ||
342 | dbg_err("cannot erase PEB %d (emulated)", pnum); | ||
343 | return -EIO; | ||
344 | } | ||
345 | |||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | /** | ||
350 | * check_pattern - check if buffer contains only a certain byte pattern. | ||
351 | * @buf: buffer to check | ||
352 | * @patt: the pattern to check | ||
353 | * @size: buffer size in bytes | ||
354 | * | ||
355 | * This function returns %1 if there are only @patt bytes in @buf, and %0 if | ||
356 | * something else was also found. | ||
357 | */ | ||
358 | static int check_pattern(const void *buf, uint8_t patt, int size) | ||
359 | { | ||
360 | int i; | ||
361 | |||
362 | for (i = 0; i < size; i++) | ||
363 | if (((const uint8_t *)buf)[i] != patt) | ||
364 | return 0; | ||
365 | return 1; | ||
366 | } | ||
367 | |||
368 | /* Patterns to write to a physical eraseblock when torturing it */ | ||
369 | static uint8_t patterns[] = {0xa5, 0x5a, 0x0}; | ||
370 | |||
371 | /** | ||
372 | * torture_peb - test a supposedly bad physical eraseblock. | ||
373 | * @ubi: UBI device description object | ||
374 | * @pnum: the physical eraseblock number to test | ||
375 | * | ||
376 | * This function returns %-EIO if the physical eraseblock did not pass the | ||
377 | * test, a positive number of erase operations done if the test was | ||
378 | * successfully passed, and other negative error codes in case of other errors. | ||
379 | */ | ||
380 | static int torture_peb(const struct ubi_device *ubi, int pnum) | ||
381 | { | ||
382 | void *buf; | ||
383 | int err, i, patt_count; | ||
384 | |||
385 | buf = kmalloc(ubi->peb_size, GFP_KERNEL); | ||
386 | if (!buf) | ||
387 | return -ENOMEM; | ||
388 | |||
389 | patt_count = ARRAY_SIZE(patterns); | ||
390 | ubi_assert(patt_count > 0); | ||
391 | |||
392 | for (i = 0; i < patt_count; i++) { | ||
393 | err = do_sync_erase(ubi, pnum); | ||
394 | if (err) | ||
395 | goto out; | ||
396 | |||
397 | /* Make sure the PEB contains only 0xFF bytes */ | ||
398 | err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size); | ||
399 | if (err) | ||
400 | goto out; | ||
401 | |||
402 | err = check_pattern(buf, 0xFF, ubi->peb_size); | ||
403 | if (err == 0) { | ||
404 | ubi_err("erased PEB %d, but a non-0xFF byte found", | ||
405 | pnum); | ||
406 | err = -EIO; | ||
407 | goto out; | ||
408 | } | ||
409 | |||
410 | /* Write a pattern and check it */ | ||
411 | memset(buf, patterns[i], ubi->peb_size); | ||
412 | err = ubi_io_write(ubi, buf, pnum, 0, ubi->peb_size); | ||
413 | if (err) | ||
414 | goto out; | ||
415 | |||
416 | memset(buf, ~patterns[i], ubi->peb_size); | ||
417 | err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size); | ||
418 | if (err) | ||
419 | goto out; | ||
420 | |||
421 | err = check_pattern(buf, patterns[i], ubi->peb_size); | ||
422 | if (err == 0) { | ||
423 | ubi_err("pattern %x checking failed for PEB %d", | ||
424 | patterns[i], pnum); | ||
425 | err = -EIO; | ||
426 | goto out; | ||
427 | } | ||
428 | } | ||
429 | |||
430 | err = patt_count; | ||
431 | |||
432 | out: | ||
433 | if (err == UBI_IO_BITFLIPS || err == -EBADMSG) | ||
434 | /* | ||
435 | * If a bit-flip or data integrity error was detected, the test | ||
436 | * has not passed because it happened on a freshly erased | ||
437 | * physical eraseblock which means something is wrong with it. | ||
438 | */ | ||
439 | err = -EIO; | ||
440 | kfree(buf); | ||
441 | return err; | ||
442 | } | ||
443 | |||
444 | /** | ||
445 | * ubi_io_sync_erase - synchronously erase a physical eraseblock. | ||
446 | * @ubi: UBI device description object | ||
447 | * @pnum: physical eraseblock number to erase | ||
448 | * @torture: if this physical eraseblock has to be tortured | ||
449 | * | ||
450 | * This function synchronously erases physical eraseblock @pnum. If @torture | ||
451 | * flag is not zero, the physical eraseblock is checked by means of writing | ||
452 | * different patterns to it and reading them back. If the torturing is enabled, | ||
453 | * the physical eraseblock is erased more then once. | ||
454 | * | ||
455 | * This function returns the number of erasures made in case of success, %-EIO | ||
456 | * if the erasure failed or the torturing test failed, and other negative error | ||
457 | * codes in case of other errors. Note, %-EIO means that the physical | ||
458 | * eraseblock is bad. | ||
459 | */ | ||
460 | int ubi_io_sync_erase(const struct ubi_device *ubi, int pnum, int torture) | ||
461 | { | ||
462 | int err, ret = 0; | ||
463 | |||
464 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
465 | |||
466 | err = paranoid_check_not_bad(ubi, pnum); | ||
467 | if (err != 0) | ||
468 | return err > 0 ? -EINVAL : err; | ||
469 | |||
470 | if (ubi->ro_mode) { | ||
471 | ubi_err("read-only mode"); | ||
472 | return -EROFS; | ||
473 | } | ||
474 | |||
475 | if (torture) { | ||
476 | ret = torture_peb(ubi, pnum); | ||
477 | if (ret < 0) | ||
478 | return ret; | ||
479 | } | ||
480 | |||
481 | err = do_sync_erase(ubi, pnum); | ||
482 | if (err) | ||
483 | return err; | ||
484 | |||
485 | return ret + 1; | ||
486 | } | ||
487 | |||
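Because a successful call returns the number of erasures performed, callers can use the value for erase-counter accounting. A minimal sketch (hypothetical helper, not part of the patch):

	/* Sketch: erase a PEB and account the erase counter increase. */
	static int example_erase(const struct ubi_device *ubi, int pnum,
				 int torture, long long *ec)
	{
		int n = ubi_io_sync_erase(ubi, pnum, torture);

		if (n < 0)
			return n;	/* -EIO usually means the PEB went bad */

		*ec += n;		/* every erasure increments the erase counter */
		return 0;
	}
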
488 | /** | ||
489 | * ubi_io_is_bad - check if a physical eraseblock is bad. | ||
490 | * @ubi: UBI device description object | ||
491 | * @pnum: the physical eraseblock number to check | ||
492 | * | ||
493 | * This function returns a positive number if the physical eraseblock is bad, | ||
494 | * zero if not, and a negative error code if an error occurred. | ||
495 | */ | ||
496 | int ubi_io_is_bad(const struct ubi_device *ubi, int pnum) | ||
497 | { | ||
498 | struct mtd_info *mtd = ubi->mtd; | ||
499 | |||
500 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
501 | |||
502 | if (ubi->bad_allowed) { | ||
503 | int ret; | ||
504 | |||
505 | ret = mtd->block_isbad(mtd, (loff_t)pnum * ubi->peb_size); | ||
506 | if (ret < 0) | ||
507 | ubi_err("error %d while checking if PEB %d is bad", | ||
508 | ret, pnum); | ||
509 | else if (ret) | ||
510 | dbg_io("PEB %d is bad", pnum); | ||
511 | return ret; | ||
512 | } | ||
513 | |||
514 | return 0; | ||
515 | } | ||
516 | |||
517 | /** | ||
518 | * ubi_io_mark_bad - mark a physical eraseblock as bad. | ||
519 | * @ubi: UBI device description object | ||
520 | * @pnum: the physical eraseblock number to mark | ||
521 | * | ||
522 | * This function returns zero in case of success and a negative error code in | ||
523 | * case of failure. | ||
524 | */ | ||
525 | int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum) | ||
526 | { | ||
527 | int err; | ||
528 | struct mtd_info *mtd = ubi->mtd; | ||
529 | |||
530 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
531 | |||
532 | if (ubi->ro_mode) { | ||
533 | ubi_err("read-only mode"); | ||
534 | return -EROFS; | ||
535 | } | ||
536 | |||
537 | if (!ubi->bad_allowed) | ||
538 | return 0; | ||
539 | |||
540 | err = mtd->block_markbad(mtd, (loff_t)pnum * ubi->peb_size); | ||
541 | if (err) | ||
542 | ubi_err("cannot mark PEB %d bad, error %d", pnum, err); | ||
543 | return err; | ||
544 | } | ||
545 | |||
546 | /** | ||
547 | * validate_ec_hdr - validate an erase counter header. | ||
548 | * @ubi: UBI device description object | ||
549 | * @ec_hdr: the erase counter header to check | ||
550 | * | ||
551 | * This function returns zero if the erase counter header is OK, and %1 if | ||
552 | * not. | ||
553 | */ | ||
554 | static int validate_ec_hdr(const struct ubi_device *ubi, | ||
555 | const struct ubi_ec_hdr *ec_hdr) | ||
556 | { | ||
557 | long long ec; | ||
558 | int vid_hdr_offset, leb_start; | ||
559 | |||
560 | ec = ubi64_to_cpu(ec_hdr->ec); | ||
561 | vid_hdr_offset = ubi32_to_cpu(ec_hdr->vid_hdr_offset); | ||
562 | leb_start = ubi32_to_cpu(ec_hdr->data_offset); | ||
563 | |||
564 | if (ec_hdr->version != UBI_VERSION) { | ||
565 | ubi_err("node with incompatible UBI version found: " | ||
566 | "this UBI version is %d, image version is %d", | ||
567 | UBI_VERSION, (int)ec_hdr->version); | ||
568 | goto bad; | ||
569 | } | ||
570 | |||
571 | if (vid_hdr_offset != ubi->vid_hdr_offset) { | ||
572 | ubi_err("bad VID header offset %d, expected %d", | ||
573 | vid_hdr_offset, ubi->vid_hdr_offset); | ||
574 | goto bad; | ||
575 | } | ||
576 | |||
577 | if (leb_start != ubi->leb_start) { | ||
578 | ubi_err("bad data offset %d, expected %d", | ||
579 | leb_start, ubi->leb_start); | ||
580 | goto bad; | ||
581 | } | ||
582 | |||
583 | if (ec < 0 || ec > UBI_MAX_ERASECOUNTER) { | ||
584 | ubi_err("bad erase counter %lld", ec); | ||
585 | goto bad; | ||
586 | } | ||
587 | |||
588 | return 0; | ||
589 | |||
590 | bad: | ||
591 | ubi_err("bad EC header"); | ||
592 | ubi_dbg_dump_ec_hdr(ec_hdr); | ||
593 | ubi_dbg_dump_stack(); | ||
594 | return 1; | ||
595 | } | ||
596 | |||
597 | /** | ||
598 | * ubi_io_read_ec_hdr - read and check an erase counter header. | ||
599 | * @ubi: UBI device description object | ||
600 | * @pnum: physical eraseblock to read from | ||
601 | * @ec_hdr: a &struct ubi_ec_hdr object where to store the read erase counter | ||
602 | * header | ||
603 | * @verbose: be verbose if the header is corrupted or was not found | ||
604 | * | ||
605 | * This function reads erase counter header from physical eraseblock @pnum and | ||
606 | * stores it in @ec_hdr. This function also checks CRC checksum of the read | ||
607 | * erase counter header. The following codes may be returned: | ||
608 | * | ||
609 | * o %0 if the CRC checksum is correct and the header was successfully read; | ||
610 | * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected | ||
611 | * and corrected by the flash driver; this is harmless but may indicate that | ||
612 | * this eraseblock may become bad soon (but it may not); | ||
613 | * o %UBI_IO_BAD_EC_HDR if the erase counter header is corrupted (a CRC error); | ||
614 | * o %UBI_IO_PEB_EMPTY if the physical eraseblock is empty; | ||
615 | * o a negative error code in case of failure. | ||
616 | */ | ||
617 | int ubi_io_read_ec_hdr(const struct ubi_device *ubi, int pnum, | ||
618 | struct ubi_ec_hdr *ec_hdr, int verbose) | ||
619 | { | ||
620 | int err, read_err = 0; | ||
621 | uint32_t crc, magic, hdr_crc; | ||
622 | |||
623 | dbg_io("read EC header from PEB %d", pnum); | ||
624 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
625 | |||
626 | err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE); | ||
627 | if (err) { | ||
628 | if (err != UBI_IO_BITFLIPS && err != -EBADMSG) | ||
629 | return err; | ||
630 | |||
631 | /* | ||
632 | * We read all the data, but either a correctable bit-flip | ||
633 | * occurred, or MTD reported some data integrity error, | ||
634 | * like an ECC error in case of NAND. The former is harmless, | ||
635 | * the latter may mean that the read data is corrupted. But we | ||
636 | * have a CRC check-sum and we will detect this. If the EC | ||
637 | * header is still OK, we just report it as if there was a | ||
638 | * bit-flip. | ||
639 | */ | ||
640 | read_err = err; | ||
641 | } | ||
642 | |||
643 | magic = ubi32_to_cpu(ec_hdr->magic); | ||
644 | if (magic != UBI_EC_HDR_MAGIC) { | ||
645 | /* | ||
646 | * The magic field is wrong. Let's check if we have read all | ||
647 | * 0xFF. If yes, this physical eraseblock is assumed to be | ||
648 | * empty. | ||
649 | * | ||
650 | * But if there was a read error, we do not test it for all | ||
651 | * 0xFFs. Even if it does contain all 0xFFs, this error | ||
652 | * indicates that something is still wrong with this physical | ||
653 | * eraseblock and we anyway cannot treat it as empty. | ||
654 | */ | ||
655 | if (read_err != -EBADMSG && | ||
656 | check_pattern(ec_hdr, 0xFF, UBI_EC_HDR_SIZE)) { | ||
657 | /* The physical eraseblock is supposedly empty */ | ||
658 | |||
659 | /* | ||
660 | * The below is just a paranoid check, it has to be | ||
661 | * compiled out if paranoid checks are disabled. | ||
662 | */ | ||
663 | err = paranoid_check_all_ff(ubi, pnum, 0, | ||
664 | ubi->peb_size); | ||
665 | if (err) | ||
666 | return err > 0 ? UBI_IO_BAD_EC_HDR : err; | ||
667 | |||
668 | if (verbose) | ||
669 | ubi_warn("no EC header found at PEB %d, " | ||
670 | "only 0xFF bytes", pnum); | ||
671 | return UBI_IO_PEB_EMPTY; | ||
672 | } | ||
673 | |||
674 | /* | ||
675 | * This is not a valid erase counter header, and these are not | ||
676 | * 0xFF bytes. Report that the header is corrupted. | ||
677 | */ | ||
678 | if (verbose) { | ||
679 | ubi_warn("bad magic number at PEB %d: %08x instead of " | ||
680 | "%08x", pnum, magic, UBI_EC_HDR_MAGIC); | ||
681 | ubi_dbg_dump_ec_hdr(ec_hdr); | ||
682 | } | ||
683 | return UBI_IO_BAD_EC_HDR; | ||
684 | } | ||
685 | |||
686 | crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); | ||
687 | hdr_crc = ubi32_to_cpu(ec_hdr->hdr_crc); | ||
688 | |||
689 | if (hdr_crc != crc) { | ||
690 | if (verbose) { | ||
691 | ubi_warn("bad EC header CRC at PEB %d, calculated %#08x," | ||
692 | " read %#08x", pnum, crc, hdr_crc); | ||
693 | ubi_dbg_dump_ec_hdr(ec_hdr); | ||
694 | } | ||
695 | return UBI_IO_BAD_EC_HDR; | ||
696 | } | ||
697 | |||
698 | /* And of course validate what has just been read from the media */ | ||
699 | err = validate_ec_hdr(ubi, ec_hdr); | ||
700 | if (err) { | ||
701 | ubi_err("validation failed for PEB %d", pnum); | ||
702 | return -EINVAL; | ||
703 | } | ||
704 | |||
705 | return read_err ? UBI_IO_BITFLIPS : 0; | ||
706 | } | ||
707 | |||
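A hedged sketch of how a scanner might act on the return codes documented above (hypothetical helper, not part of the patch):

	/* Sketch: classify a PEB by its EC header; 1 means it should be erased. */
	static int example_classify_peb(const struct ubi_device *ubi, int pnum,
					struct ubi_ec_hdr *ec_hdr)
	{
		int err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0);

		if (err == 0 || err == UBI_IO_BITFLIPS)
			return 0;	/* usable header (bit-flips suggest scrubbing) */

		if (err == UBI_IO_PEB_EMPTY || err == UBI_IO_BAD_EC_HDR)
			return 1;	/* empty or corrupted: schedule for erasure */

		return err;		/* real I/O failure */
	}
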
708 | /** | ||
709 | * ubi_io_write_ec_hdr - write an erase counter header. | ||
710 | * @ubi: UBI device description object | ||
711 | * @pnum: physical eraseblock to write to | ||
712 | * @ec_hdr: the erase counter header to write | ||
713 | * | ||
714 | * This function writes erase counter header described by @ec_hdr to physical | ||
715 | * eraseblock @pnum. It also fills most fields of @ec_hdr before writing, so | ||
716 | * the caller does not have to fill them. Callers must only fill the @ec_hdr->ec | ||
717 | * field. | ||
718 | * | ||
719 | * This function returns zero in case of success and a negative error code in | ||
720 | * case of failure. If %-EIO is returned, the physical eraseblock most probably | ||
721 | * went bad. | ||
722 | */ | ||
723 | int ubi_io_write_ec_hdr(const struct ubi_device *ubi, int pnum, | ||
724 | struct ubi_ec_hdr *ec_hdr) | ||
725 | { | ||
726 | int err; | ||
727 | uint32_t crc; | ||
728 | |||
729 | dbg_io("write EC header to PEB %d", pnum); | ||
730 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
731 | |||
732 | ec_hdr->magic = cpu_to_ubi32(UBI_EC_HDR_MAGIC); | ||
733 | ec_hdr->version = UBI_VERSION; | ||
734 | ec_hdr->vid_hdr_offset = cpu_to_ubi32(ubi->vid_hdr_offset); | ||
735 | ec_hdr->data_offset = cpu_to_ubi32(ubi->leb_start); | ||
736 | crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); | ||
737 | ec_hdr->hdr_crc = cpu_to_ubi32(crc); | ||
738 | |||
739 | err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr); | ||
740 | if (err) | ||
741 | return -EINVAL; | ||
742 | |||
743 | err = ubi_io_write(ubi, ec_hdr, pnum, 0, ubi->ec_hdr_alsize); | ||
744 | return err; | ||
745 | } | ||
746 | |||
747 | /** | ||
748 | * validate_vid_hdr - validate a volume identifier header. | ||
749 | * @ubi: UBI device description object | ||
750 | * @vid_hdr: the volume identifier header to check | ||
751 | * | ||
752 | * This function checks the data stored in the volume identifier header | ||
753 | * @vid_hdr. Returns zero if the VID header is OK and %1 if not. | ||
754 | */ | ||
755 | static int validate_vid_hdr(const struct ubi_device *ubi, | ||
756 | const struct ubi_vid_hdr *vid_hdr) | ||
757 | { | ||
758 | int vol_type = vid_hdr->vol_type; | ||
759 | int copy_flag = vid_hdr->copy_flag; | ||
760 | int vol_id = ubi32_to_cpu(vid_hdr->vol_id); | ||
761 | int lnum = ubi32_to_cpu(vid_hdr->lnum); | ||
762 | int compat = vid_hdr->compat; | ||
763 | int data_size = ubi32_to_cpu(vid_hdr->data_size); | ||
764 | int used_ebs = ubi32_to_cpu(vid_hdr->used_ebs); | ||
765 | int data_pad = ubi32_to_cpu(vid_hdr->data_pad); | ||
766 | int data_crc = ubi32_to_cpu(vid_hdr->data_crc); | ||
767 | int usable_leb_size = ubi->leb_size - data_pad; | ||
768 | |||
769 | if (copy_flag != 0 && copy_flag != 1) { | ||
770 | dbg_err("bad copy_flag"); | ||
771 | goto bad; | ||
772 | } | ||
773 | |||
774 | if (vol_id < 0 || lnum < 0 || data_size < 0 || used_ebs < 0 || | ||
775 | data_pad < 0) { | ||
776 | dbg_err("negative values"); | ||
777 | goto bad; | ||
778 | } | ||
779 | |||
780 | if (vol_id >= UBI_MAX_VOLUMES && vol_id < UBI_INTERNAL_VOL_START) { | ||
781 | dbg_err("bad vol_id"); | ||
782 | goto bad; | ||
783 | } | ||
784 | |||
785 | if (vol_id < UBI_INTERNAL_VOL_START && compat != 0) { | ||
786 | dbg_err("bad compat"); | ||
787 | goto bad; | ||
788 | } | ||
789 | |||
790 | if (vol_id >= UBI_INTERNAL_VOL_START && compat != UBI_COMPAT_DELETE && | ||
791 | compat != UBI_COMPAT_RO && compat != UBI_COMPAT_PRESERVE && | ||
792 | compat != UBI_COMPAT_REJECT) { | ||
793 | dbg_err("bad compat"); | ||
794 | goto bad; | ||
795 | } | ||
796 | |||
797 | if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) { | ||
798 | dbg_err("bad vol_type"); | ||
799 | goto bad; | ||
800 | } | ||
801 | |||
802 | if (data_pad >= ubi->leb_size / 2) { | ||
803 | dbg_err("bad data_pad"); | ||
804 | goto bad; | ||
805 | } | ||
806 | |||
807 | if (vol_type == UBI_VID_STATIC) { | ||
808 | /* | ||
809 | * Although from a high-level point of view static volumes may | ||
810 | * contain zero bytes of data, no VID header can contain | ||
811 | * zeros in these fields, because empty volumes do not have | ||
812 | * mapped logical eraseblocks. | ||
813 | */ | ||
814 | if (used_ebs == 0) { | ||
815 | dbg_err("zero used_ebs"); | ||
816 | goto bad; | ||
817 | } | ||
818 | if (data_size == 0) { | ||
819 | dbg_err("zero data_size"); | ||
820 | goto bad; | ||
821 | } | ||
822 | if (lnum < used_ebs - 1) { | ||
823 | if (data_size != usable_leb_size) { | ||
824 | dbg_err("bad data_size"); | ||
825 | goto bad; | ||
826 | } | ||
827 | } else if (lnum == used_ebs - 1) { | ||
828 | if (data_size == 0) { | ||
829 | dbg_err("bad data_size at last LEB"); | ||
830 | goto bad; | ||
831 | } | ||
832 | } else { | ||
833 | dbg_err("too high lnum"); | ||
834 | goto bad; | ||
835 | } | ||
836 | } else { | ||
837 | if (copy_flag == 0) { | ||
838 | if (data_crc != 0) { | ||
839 | dbg_err("non-zero data CRC"); | ||
840 | goto bad; | ||
841 | } | ||
842 | if (data_size != 0) { | ||
843 | dbg_err("non-zero data_size"); | ||
844 | goto bad; | ||
845 | } | ||
846 | } else { | ||
847 | if (data_size == 0) { | ||
848 | dbg_err("zero data_size of copy"); | ||
849 | goto bad; | ||
850 | } | ||
851 | } | ||
852 | if (used_ebs != 0) { | ||
853 | dbg_err("bad used_ebs"); | ||
854 | goto bad; | ||
855 | } | ||
856 | } | ||
857 | |||
858 | return 0; | ||
859 | |||
860 | bad: | ||
861 | ubi_err("bad VID header"); | ||
862 | ubi_dbg_dump_vid_hdr(vid_hdr); | ||
863 | ubi_dbg_dump_stack(); | ||
864 | return 1; | ||
865 | } | ||
866 | |||
867 | /** | ||
868 | * ubi_io_read_vid_hdr - read and check a volume identifier header. | ||
869 | * @ubi: UBI device description object | ||
870 | * @pnum: physical eraseblock number to read from | ||
871 | * @vid_hdr: &struct ubi_vid_hdr object where to store the read volume | ||
872 | * identifier header | ||
873 | * @verbose: be verbose if the header is corrupted or wasn't found | ||
874 | * | ||
875 | * This function reads the volume identifier header from physical eraseblock | ||
876 | * @pnum and stores it in @vid_hdr. It also checks CRC checksum of the read | ||
877 | * volume identifier header. The following codes may be returned: | ||
878 | * | ||
879 | * o %0 if the CRC checksum is correct and the header was successfully read; | ||
880 | * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected | ||
881 | * and corrected by the flash driver; this is harmless but may indicate that | ||
882 | * this eraseblock may become bad soon; | ||
883 | * o %UBI_IO_BAD_VID_HDR if the volume identifier header is corrupted (a CRC | ||
884 | * error detected); | ||
885 | * o %UBI_IO_PEB_FREE if the physical eraseblock is free (i.e., there is no VID | ||
886 | * header there); | ||
887 | * o a negative error code in case of failure. | ||
888 | */ | ||
889 | int ubi_io_read_vid_hdr(const struct ubi_device *ubi, int pnum, | ||
890 | struct ubi_vid_hdr *vid_hdr, int verbose) | ||
891 | { | ||
892 | int err, read_err = 0; | ||
893 | uint32_t crc, magic, hdr_crc; | ||
894 | void *p; | ||
895 | |||
896 | dbg_io("read VID header from PEB %d", pnum); | ||
897 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
898 | |||
899 | p = (char *)vid_hdr - ubi->vid_hdr_shift; | ||
900 | err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset, | ||
901 | ubi->vid_hdr_alsize); | ||
902 | if (err) { | ||
903 | if (err != UBI_IO_BITFLIPS && err != -EBADMSG) | ||
904 | return err; | ||
905 | |||
906 | /* | ||
907 | * We read all the data, but either a correctable bit-flip | ||
908 | * occurred, or MTD reported a data integrity error, | ||
909 | * like an ECC error in case of NAND. The former is harmless, | ||
910 | * the latter may mean the read data is corrupted. But we have a | ||
911 | * CRC checksum and we will identify this. If the VID header is | ||
912 | * still OK, we just report it as if a bit-flip occurred. | ||
913 | */ | ||
914 | read_err = err; | ||
915 | } | ||
916 | |||
917 | magic = ubi32_to_cpu(vid_hdr->magic); | ||
918 | if (magic != UBI_VID_HDR_MAGIC) { | ||
919 | /* | ||
920 | * If we have read all 0xFF bytes, the VID header probably does | ||
921 | * not exist and the physical eraseblock is assumed to be free. | ||
922 | * | ||
923 | * But if there was a read error, we do not test the data for | ||
924 | * 0xFFs. Even if it does contain all 0xFFs, this error | ||
925 | * indicates that something is still wrong with this physical | ||
926 | * eraseblock and it cannot be regarded as free. | ||
927 | */ | ||
928 | if (read_err != -EBADMSG && | ||
929 | check_pattern(vid_hdr, 0xFF, UBI_VID_HDR_SIZE)) { | ||
930 | /* The physical eraseblock is supposedly free */ | ||
931 | |||
932 | /* | ||
933 | * The below is just a paranoid check, it has to be | ||
934 | * compiled out if paranoid checks are disabled. | ||
935 | */ | ||
936 | err = paranoid_check_all_ff(ubi, pnum, ubi->leb_start, | ||
937 | ubi->leb_size); | ||
938 | if (err) | ||
939 | return err > 0 ? UBI_IO_BAD_VID_HDR : err; | ||
940 | |||
941 | if (verbose) | ||
942 | ubi_warn("no VID header found at PEB %d, " | ||
943 | "only 0xFF bytes", pnum); | ||
944 | return UBI_IO_PEB_FREE; | ||
945 | } | ||
946 | |||
947 | /* | ||
948 | * This is not a valid VID header, and these are not 0xFF | ||
949 | * bytes. Report that the header is corrupted. | ||
950 | */ | ||
951 | if (verbose) { | ||
952 | ubi_warn("bad magic number at PEB %d: %08x instead of " | ||
953 | "%08x", pnum, magic, UBI_VID_HDR_MAGIC); | ||
954 | ubi_dbg_dump_vid_hdr(vid_hdr); | ||
955 | } | ||
956 | return UBI_IO_BAD_VID_HDR; | ||
957 | } | ||
958 | |||
959 | crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); | ||
960 | hdr_crc = ubi32_to_cpu(vid_hdr->hdr_crc); | ||
961 | |||
962 | if (hdr_crc != crc) { | ||
963 | if (verbose) { | ||
964 | ubi_warn("bad CRC at PEB %d, calculated %#08x, " | ||
965 | "read %#08x", pnum, crc, hdr_crc); | ||
966 | ubi_dbg_dump_vid_hdr(vid_hdr); | ||
967 | } | ||
968 | return UBI_IO_BAD_VID_HDR; | ||
969 | } | ||
970 | |||
971 | /* Validate the VID header that we have just read */ | ||
972 | err = validate_vid_hdr(ubi, vid_hdr); | ||
973 | if (err) { | ||
974 | ubi_err("validation failed for PEB %d", pnum); | ||
975 | return -EINVAL; | ||
976 | } | ||
977 | |||
978 | return read_err ? UBI_IO_BITFLIPS : 0; | ||
979 | } | ||
980 | |||
981 | /** | ||
982 | * ubi_io_write_vid_hdr - write a volume identifier header. | ||
983 | * @ubi: UBI device description object | ||
984 | * @pnum: the physical eraseblock number to write to | ||
985 | * @vid_hdr: the volume identifier header to write | ||
986 | * | ||
987 | * This function writes the volume identifier header described by @vid_hdr to | ||
988 | * physical eraseblock @pnum. This function automatically fills the | ||
989 | * @vid_hdr->magic and the @vid_hdr->version fields, calculates the | ||
990 | * header CRC checksum and stores it at @vid_hdr->hdr_crc. | ||
991 | * | ||
992 | * This function returns zero in case of success and a negative error code in | ||
993 | * case of failure. If %-EIO is returned, the physical eraseblock probably went | ||
994 | * bad. | ||
995 | */ | ||
996 | int ubi_io_write_vid_hdr(const struct ubi_device *ubi, int pnum, | ||
997 | struct ubi_vid_hdr *vid_hdr) | ||
998 | { | ||
999 | int err; | ||
1000 | uint32_t crc; | ||
1001 | void *p; | ||
1002 | |||
1003 | dbg_io("write VID header to PEB %d", pnum); | ||
1004 | ubi_assert(pnum >= 0 && pnum < ubi->peb_count); | ||
1005 | |||
1006 | err = paranoid_check_peb_ec_hdr(ubi, pnum); | ||
1007 | if (err) | ||
1008 | return err > 0 ? -EINVAL: err; | ||
1009 | |||
1010 | vid_hdr->magic = cpu_to_ubi32(UBI_VID_HDR_MAGIC); | ||
1011 | vid_hdr->version = UBI_VERSION; | ||
1012 | crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); | ||
1013 | vid_hdr->hdr_crc = cpu_to_ubi32(crc); | ||
1014 | |||
1015 | err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr); | ||
1016 | if (err) | ||
1017 | return -EINVAL; | ||
1018 | |||
1019 | p = (char *)vid_hdr - ubi->vid_hdr_shift; | ||
1020 | err = ubi_io_write(ubi, p, pnum, ubi->vid_hdr_aloffset, | ||
1021 | ubi->vid_hdr_alsize); | ||
1022 | return err; | ||
1023 | } | ||
1024 | |||
1025 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
1026 | |||
1027 | /** | ||
1028 | * paranoid_check_not_bad - ensure that a physical eraseblock is not bad. | ||
1029 | * @ubi: UBI device description object | ||
1030 | * @pnum: physical eraseblock number to check | ||
1031 | * | ||
1032 | * This function returns zero if the physical eraseblock is good, a positive | ||
1033 | * number if it is bad and a negative error code if an error occurred. | ||
1034 | */ | ||
1035 | static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum) | ||
1036 | { | ||
1037 | int err; | ||
1038 | |||
1039 | err = ubi_io_is_bad(ubi, pnum); | ||
1040 | if (!err) | ||
1041 | return err; | ||
1042 | |||
1043 | ubi_err("paranoid check failed for PEB %d", pnum); | ||
1044 | ubi_dbg_dump_stack(); | ||
1045 | return err; | ||
1046 | } | ||
1047 | |||
1048 | /** | ||
1049 | * paranoid_check_ec_hdr - check if an erase counter header is all right. | ||
1050 | * @ubi: UBI device description object | ||
1051 | * @pnum: physical eraseblock number the erase counter header belongs to | ||
1052 | * @ec_hdr: the erase counter header to check | ||
1053 | * | ||
1054 | * This function returns zero if the erase counter header contains valid | ||
1055 | * values, and %1 if not. | ||
1056 | */ | ||
1057 | static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum, | ||
1058 | const struct ubi_ec_hdr *ec_hdr) | ||
1059 | { | ||
1060 | int err; | ||
1061 | uint32_t magic; | ||
1062 | |||
1063 | magic = ubi32_to_cpu(ec_hdr->magic); | ||
1064 | if (magic != UBI_EC_HDR_MAGIC) { | ||
1065 | ubi_err("bad magic %#08x, must be %#08x", | ||
1066 | magic, UBI_EC_HDR_MAGIC); | ||
1067 | goto fail; | ||
1068 | } | ||
1069 | |||
1070 | err = validate_ec_hdr(ubi, ec_hdr); | ||
1071 | if (err) { | ||
1072 | ubi_err("paranoid check failed for PEB %d", pnum); | ||
1073 | goto fail; | ||
1074 | } | ||
1075 | |||
1076 | return 0; | ||
1077 | |||
1078 | fail: | ||
1079 | ubi_dbg_dump_ec_hdr(ec_hdr); | ||
1080 | ubi_dbg_dump_stack(); | ||
1081 | return 1; | ||
1082 | } | ||
1083 | |||
1084 | /** | ||
1085 | * paranoid_check_peb_ec_hdr - check that the erase counter header of a | ||
1086 | * physical eraseblock is in-place and is all right. | ||
1087 | * @ubi: UBI device description object | ||
1088 | * @pnum: the physical eraseblock number to check | ||
1089 | * | ||
1090 | * This function returns zero if the erase counter header is all right, %1 if | ||
1091 | * not, and a negative error code if an error occurred. | ||
1092 | */ | ||
1093 | static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum) | ||
1094 | { | ||
1095 | int err; | ||
1096 | uint32_t crc, hdr_crc; | ||
1097 | struct ubi_ec_hdr *ec_hdr; | ||
1098 | |||
1099 | ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); | ||
1100 | if (!ec_hdr) | ||
1101 | return -ENOMEM; | ||
1102 | |||
1103 | err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE); | ||
1104 | if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG) | ||
1105 | goto exit; | ||
1106 | |||
1107 | crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); | ||
1108 | hdr_crc = ubi32_to_cpu(ec_hdr->hdr_crc); | ||
1109 | if (hdr_crc != crc) { | ||
1110 | ubi_err("bad CRC, calculated %#08x, read %#08x", crc, hdr_crc); | ||
1111 | ubi_err("paranoid check failed for PEB %d", pnum); | ||
1112 | ubi_dbg_dump_ec_hdr(ec_hdr); | ||
1113 | ubi_dbg_dump_stack(); | ||
1114 | err = 1; | ||
1115 | goto exit; | ||
1116 | } | ||
1117 | |||
1118 | err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr); | ||
1119 | |||
1120 | exit: | ||
1121 | kfree(ec_hdr); | ||
1122 | return err; | ||
1123 | } | ||
1124 | |||
1125 | /** | ||
1126 | * paranoid_check_vid_hdr - check that a volume identifier header is all right. | ||
1127 | * @ubi: UBI device description object | ||
1128 | * @pnum: physical eraseblock number the volume identifier header belongs to | ||
1129 | * @vid_hdr: the volume identifier header to check | ||
1130 | * | ||
1131 | * This function returns zero if the volume identifier header is all right, and | ||
1132 | * %1 if not. | ||
1133 | */ | ||
1134 | static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum, | ||
1135 | const struct ubi_vid_hdr *vid_hdr) | ||
1136 | { | ||
1137 | int err; | ||
1138 | uint32_t magic; | ||
1139 | |||
1140 | magic = ubi32_to_cpu(vid_hdr->magic); | ||
1141 | if (magic != UBI_VID_HDR_MAGIC) { | ||
1142 | ubi_err("bad VID header magic %#08x at PEB %d, must be %#08x", | ||
1143 | magic, pnum, UBI_VID_HDR_MAGIC); | ||
1144 | goto fail; | ||
1145 | } | ||
1146 | |||
1147 | err = validate_vid_hdr(ubi, vid_hdr); | ||
1148 | if (err) { | ||
1149 | ubi_err("paranoid check failed for PEB %d", pnum); | ||
1150 | goto fail; | ||
1151 | } | ||
1152 | |||
1153 | return err; | ||
1154 | |||
1155 | fail: | ||
1156 | ubi_err("paranoid check failed for PEB %d", pnum); | ||
1157 | ubi_dbg_dump_vid_hdr(vid_hdr); | ||
1158 | ubi_dbg_dump_stack(); | ||
1159 | return 1; | ||
1160 | |||
1161 | } | ||
1162 | |||
1163 | /** | ||
1164 | * paranoid_check_peb_vid_hdr - check that the volume identifier header of a | ||
1165 | * physical eraseblock is in-place and is all right. | ||
1166 | * @ubi: UBI device description object | ||
1167 | * @pnum: the physical eraseblock number to check | ||
1168 | * | ||
1169 | * This function returns zero if the volume identifier header is all right, | ||
1170 | * %1 if not, and a negative error code if an error occurred. | ||
1171 | */ | ||
1172 | static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum) | ||
1173 | { | ||
1174 | int err; | ||
1175 | uint32_t crc, hdr_crc; | ||
1176 | struct ubi_vid_hdr *vid_hdr; | ||
1177 | void *p; | ||
1178 | |||
1179 | vid_hdr = ubi_zalloc_vid_hdr(ubi); | ||
1180 | if (!vid_hdr) | ||
1181 | return -ENOMEM; | ||
1182 | |||
1183 | p = (char *)vid_hdr - ubi->vid_hdr_shift; | ||
1184 | err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset, | ||
1185 | ubi->vid_hdr_alsize); | ||
1186 | if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG) | ||
1187 | goto exit; | ||
1188 | |||
1189 | crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); | ||
1190 | hdr_crc = ubi32_to_cpu(vid_hdr->hdr_crc); | ||
1191 | if (hdr_crc != crc) { | ||
1192 | ubi_err("bad VID header CRC at PEB %d, calculated %#08x, " | ||
1193 | "read %#08x", pnum, crc, hdr_crc); | ||
1194 | ubi_err("paranoid check failed for PEB %d", pnum); | ||
1195 | ubi_dbg_dump_vid_hdr(vid_hdr); | ||
1196 | ubi_dbg_dump_stack(); | ||
1197 | err = 1; | ||
1198 | goto exit; | ||
1199 | } | ||
1200 | |||
1201 | err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr); | ||
1202 | |||
1203 | exit: | ||
1204 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
1205 | return err; | ||
1206 | } | ||
1207 | |||
1208 | /** | ||
1209 | * paranoid_check_all_ff - check that a region of flash is empty. | ||
1210 | * @ubi: UBI device description object | ||
1211 | * @pnum: the physical eraseblock number to check | ||
1212 | * @offset: the starting offset within the physical eraseblock to check | ||
1213 | * @len: the length of the region to check | ||
1214 | * | ||
1215 | * This function returns zero if only 0xFF bytes are present at offset | ||
1216 | * @offset of the physical eraseblock @pnum, %1 if not, and a negative error | ||
1217 | * code if an error occurred. | ||
1218 | */ | ||
1219 | static int paranoid_check_all_ff(const struct ubi_device *ubi, int pnum, | ||
1220 | int offset, int len) | ||
1221 | { | ||
1222 | size_t read; | ||
1223 | int err; | ||
1224 | void *buf; | ||
1225 | loff_t addr = (loff_t)pnum * ubi->peb_size + offset; | ||
1226 | |||
1227 | buf = kzalloc(len, GFP_KERNEL); | ||
1228 | if (!buf) | ||
1229 | return -ENOMEM; | ||
1230 | |||
1231 | err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf); | ||
1232 | if (err && err != -EUCLEAN) { | ||
1233 | ubi_err("error %d while reading %d bytes from PEB %d:%d, " | ||
1234 | "read %zd bytes", err, len, pnum, offset, read); | ||
1235 | goto error; | ||
1236 | } | ||
1237 | |||
1238 | err = check_pattern(buf, 0xFF, len); | ||
1239 | if (err == 0) { | ||
1240 | ubi_err("flash region at PEB %d:%d, length %d does not " | ||
1241 | "contain all 0xFF bytes", pnum, offset, len); | ||
1242 | goto fail; | ||
1243 | } | ||
1244 | |||
1245 | kfree(buf); | ||
1246 | return 0; | ||
1247 | |||
1248 | fail: | ||
1249 | ubi_err("paranoid check failed for PEB %d", pnum); | ||
1250 | dbg_msg("hex dump of the %d-%d region", offset, offset + len); | ||
1251 | ubi_dbg_hexdump(buf, len); | ||
1252 | err = 1; | ||
1253 | error: | ||
1254 | ubi_dbg_dump_stack(); | ||
1255 | kfree(buf); | ||
1256 | return err; | ||
1257 | } | ||
1258 | |||
1259 | #endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ | ||
diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c new file mode 100644 index 000000000000..d352c4575c3d --- /dev/null +++ b/drivers/mtd/ubi/kapi.c | |||
@@ -0,0 +1,575 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | /* This file mostly implements UBI kernel API functions */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/err.h> | ||
25 | #include <asm/div64.h> | ||
26 | #include "ubi.h" | ||
27 | |||
28 | /** | ||
29 | * ubi_get_device_info - get information about UBI device. | ||
30 | * @ubi_num: UBI device number | ||
31 | * @di: the information is stored here | ||
32 | * | ||
33 | * This function returns %0 in case of success and %-ENODEV if there is no | ||
34 | * such UBI device. | ||
35 | */ | ||
36 | int ubi_get_device_info(int ubi_num, struct ubi_device_info *di) | ||
37 | { | ||
38 | const struct ubi_device *ubi; | ||
39 | |||
40 | if (!try_module_get(THIS_MODULE)) | ||
41 | return -ENODEV; | ||
42 | |||
43 | if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || | ||
44 | !ubi_devices[ubi_num]) { | ||
45 | module_put(THIS_MODULE); | ||
46 | return -ENODEV; | ||
47 | } | ||
48 | |||
49 | ubi = ubi_devices[ubi_num]; | ||
50 | di->ubi_num = ubi->ubi_num; | ||
51 | di->leb_size = ubi->leb_size; | ||
52 | di->min_io_size = ubi->min_io_size; | ||
53 | di->ro_mode = ubi->ro_mode; | ||
54 | di->cdev = MKDEV(ubi->major, 0); | ||
55 | module_put(THIS_MODULE); | ||
56 | return 0; | ||
57 | } | ||
58 | EXPORT_SYMBOL_GPL(ubi_get_device_info); | ||
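A minimal usage sketch of this call (a hypothetical module fragment; UBI device number 0 is only an assumption, and the prototype comes from the in-kernel UBI API header added by this patch):

	static int query_ubi0(void)
	{
		struct ubi_device_info di;
		int err;

		err = ubi_get_device_info(0, &di);      /* UBI device 0 assumed */
		if (err)
			return err;                     /* -ENODEV if it is not attached */

		printk(KERN_INFO "UBI%d: LEB size %d, min. I/O unit %d, %s\n",
		       di.ubi_num, di.leb_size, di.min_io_size,
		       di.ro_mode ? "read-only" : "read-write");
		return 0;
	}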
59 | |||
60 | /** | ||
61 | * ubi_get_volume_info - get information about UBI volume. | ||
62 | * @desc: volume descriptor | ||
63 | * @vi: the information is stored here | ||
64 | */ | ||
65 | void ubi_get_volume_info(struct ubi_volume_desc *desc, | ||
66 | struct ubi_volume_info *vi) | ||
67 | { | ||
68 | const struct ubi_volume *vol = desc->vol; | ||
69 | const struct ubi_device *ubi = vol->ubi; | ||
70 | |||
71 | vi->vol_id = vol->vol_id; | ||
72 | vi->ubi_num = ubi->ubi_num; | ||
73 | vi->size = vol->reserved_pebs; | ||
74 | vi->used_bytes = vol->used_bytes; | ||
75 | vi->vol_type = vol->vol_type; | ||
76 | vi->corrupted = vol->corrupted; | ||
77 | vi->upd_marker = vol->upd_marker; | ||
78 | vi->alignment = vol->alignment; | ||
79 | vi->usable_leb_size = vol->usable_leb_size; | ||
80 | vi->name_len = vol->name_len; | ||
81 | vi->name = vol->name; | ||
82 | vi->cdev = MKDEV(ubi->major, vi->vol_id + 1); | ||
83 | } | ||
84 | EXPORT_SYMBOL_GPL(ubi_get_volume_info); | ||
85 | |||
86 | /** | ||
87 | * ubi_open_volume - open UBI volume. | ||
88 | * @ubi_num: UBI device number | ||
89 | * @vol_id: volume ID | ||
90 | * @mode: open mode | ||
91 | * | ||
92 | * The @mode parameter specifies if the volume should be opened in read-only | ||
93 | * mode, read-write mode, or exclusive mode. The exclusive mode guarantees that | ||
94 | * nobody else will be able to open this volume. UBI allows a volume to have | ||
95 | * many readers and one writer at a time. | ||
96 | * | ||
97 | * If a static volume is being opened for the first time since boot, it will be | ||
98 | * checked by this function, which means it will be fully read and the CRC | ||
99 | * checksum of each logical eraseblock will be checked. | ||
100 | * | ||
101 | * This function returns a volume descriptor in case of success and a negative | ||
102 | * error code in case of failure. | ||
103 | */ | ||
104 | struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode) | ||
105 | { | ||
106 | int err; | ||
107 | struct ubi_volume_desc *desc; | ||
108 | struct ubi_device *ubi = ubi_devices[ubi_num]; | ||
109 | struct ubi_volume *vol; | ||
110 | |||
111 | dbg_msg("open device %d volume %d, mode %d", ubi_num, vol_id, mode); | ||
112 | |||
113 | err = -ENODEV; | ||
114 | if (!try_module_get(THIS_MODULE)) | ||
115 | return ERR_PTR(err); | ||
116 | |||
117 | if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi) | ||
118 | goto out_put; | ||
119 | |||
120 | err = -EINVAL; | ||
121 | if (vol_id < 0 || vol_id >= ubi->vtbl_slots) | ||
122 | goto out_put; | ||
123 | if (mode != UBI_READONLY && mode != UBI_READWRITE && | ||
124 | mode != UBI_EXCLUSIVE) | ||
125 | goto out_put; | ||
126 | |||
127 | desc = kmalloc(sizeof(struct ubi_volume_desc), GFP_KERNEL); | ||
128 | if (!desc) { | ||
129 | err = -ENOMEM; | ||
130 | goto out_put; | ||
131 | } | ||
132 | |||
133 | spin_lock(&ubi->volumes_lock); | ||
134 | vol = ubi->volumes[vol_id]; | ||
135 | if (!vol) { | ||
136 | err = -ENODEV; | ||
137 | goto out_unlock; | ||
138 | } | ||
139 | |||
140 | err = -EBUSY; | ||
141 | switch (mode) { | ||
142 | case UBI_READONLY: | ||
143 | if (vol->exclusive) | ||
144 | goto out_unlock; | ||
145 | vol->readers += 1; | ||
146 | break; | ||
147 | |||
148 | case UBI_READWRITE: | ||
149 | if (vol->exclusive || vol->writers > 0) | ||
150 | goto out_unlock; | ||
151 | vol->writers += 1; | ||
152 | break; | ||
153 | |||
154 | case UBI_EXCLUSIVE: | ||
155 | if (vol->exclusive || vol->writers || vol->readers) | ||
156 | goto out_unlock; | ||
157 | vol->exclusive = 1; | ||
158 | break; | ||
159 | } | ||
160 | spin_unlock(&ubi->volumes_lock); | ||
161 | |||
162 | desc->vol = vol; | ||
163 | desc->mode = mode; | ||
164 | |||
165 | /* | ||
166 | * To prevent simultaneous checks of the same volume we use @vtbl_mutex, | ||
167 | * although it is not the purpose it was introduced for. | ||
168 | */ | ||
169 | mutex_lock(&ubi->vtbl_mutex); | ||
170 | if (!vol->checked) { | ||
171 | /* This is the first open - check the volume */ | ||
172 | err = ubi_check_volume(ubi, vol_id); | ||
173 | if (err < 0) { | ||
174 | mutex_unlock(&ubi->vtbl_mutex); | ||
175 | ubi_close_volume(desc); | ||
176 | return ERR_PTR(err); | ||
177 | } | ||
178 | if (err == 1) { | ||
179 | ubi_warn("volume %d on UBI device %d is corrupted", | ||
180 | vol_id, ubi->ubi_num); | ||
181 | vol->corrupted = 1; | ||
182 | } | ||
183 | vol->checked = 1; | ||
184 | } | ||
185 | mutex_unlock(&ubi->vtbl_mutex); | ||
186 | return desc; | ||
187 | |||
188 | out_unlock: | ||
189 | spin_unlock(&ubi->volumes_lock); | ||
190 | kfree(desc); | ||
191 | out_put: | ||
192 | module_put(THIS_MODULE); | ||
193 | return ERR_PTR(err); | ||
194 | } | ||
195 | EXPORT_SYMBOL_GPL(ubi_open_volume); | ||
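A minimal open/close sketch (device and volume numbers are assumptions; errors come back as ERR_PTR values, so they are unwrapped with PTR_ERR()):

	struct ubi_volume_desc *desc;

	desc = ubi_open_volume(0, 0, UBI_READONLY);     /* UBI device 0, volume 0 assumed */
	if (IS_ERR(desc))
		return PTR_ERR(desc);                   /* e.g. -ENODEV, -EINVAL or -EBUSY */

	/* ... read from the volume ... */

	ubi_close_volume(desc);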
196 | |||
197 | /** | ||
198 | * ubi_open_volume_nm - open UBI volume by name. | ||
199 | * @ubi_num: UBI device number | ||
200 | * @name: volume name | ||
201 | * @mode: open mode | ||
202 | * | ||
203 | * This function is similar to 'ubi_open_volume()', but opens a volume by name. | ||
204 | */ | ||
205 | struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name, | ||
206 | int mode) | ||
207 | { | ||
208 | int i, vol_id = -1, len; | ||
209 | struct ubi_volume_desc *ret; | ||
210 | struct ubi_device *ubi; | ||
211 | |||
212 | dbg_msg("open volume %s, mode %d", name, mode); | ||
213 | |||
214 | if (!name) | ||
215 | return ERR_PTR(-EINVAL); | ||
216 | |||
217 | len = strnlen(name, UBI_VOL_NAME_MAX + 1); | ||
218 | if (len > UBI_VOL_NAME_MAX) | ||
219 | return ERR_PTR(-EINVAL); | ||
220 | |||
221 | ret = ERR_PTR(-ENODEV); | ||
222 | if (!try_module_get(THIS_MODULE)) | ||
223 | return ret; | ||
224 | |||
225 | if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi_devices[ubi_num]) | ||
226 | goto out_put; | ||
227 | |||
228 | ubi = ubi_devices[ubi_num]; | ||
229 | |||
230 | spin_lock(&ubi->volumes_lock); | ||
231 | /* Walk all volumes of this UBI device */ | ||
232 | for (i = 0; i < ubi->vtbl_slots; i++) { | ||
233 | struct ubi_volume *vol = ubi->volumes[i]; | ||
234 | |||
235 | if (vol && len == vol->name_len && !strcmp(name, vol->name)) { | ||
236 | vol_id = i; | ||
237 | break; | ||
238 | } | ||
239 | } | ||
240 | spin_unlock(&ubi->volumes_lock); | ||
241 | |||
242 | if (vol_id < 0) | ||
243 | goto out_put; | ||
244 | |||
245 | ret = ubi_open_volume(ubi_num, vol_id, mode); | ||
246 | |||
247 | out_put: | ||
248 | module_put(THIS_MODULE); | ||
249 | return ret; | ||
250 | } | ||
251 | EXPORT_SYMBOL_GPL(ubi_open_volume_nm); | ||
252 | |||
253 | /** | ||
254 | * ubi_close_volume - close UBI volume. | ||
255 | * @desc: volume descriptor | ||
256 | */ | ||
257 | void ubi_close_volume(struct ubi_volume_desc *desc) | ||
258 | { | ||
259 | struct ubi_volume *vol = desc->vol; | ||
260 | |||
261 | dbg_msg("close volume %d, mode %d", vol->vol_id, desc->mode); | ||
262 | |||
263 | spin_lock(&vol->ubi->volumes_lock); | ||
264 | switch (desc->mode) { | ||
265 | case UBI_READONLY: | ||
266 | vol->readers -= 1; | ||
267 | break; | ||
268 | case UBI_READWRITE: | ||
269 | vol->writers -= 1; | ||
270 | break; | ||
271 | case UBI_EXCLUSIVE: | ||
272 | vol->exclusive = 0; | ||
273 | } | ||
274 | spin_unlock(&vol->ubi->volumes_lock); | ||
275 | |||
276 | kfree(desc); | ||
277 | module_put(THIS_MODULE); | ||
278 | } | ||
279 | EXPORT_SYMBOL_GPL(ubi_close_volume); | ||
280 | |||
281 | /** | ||
282 | * ubi_leb_read - read data. | ||
283 | * @desc: volume descriptor | ||
284 | * @lnum: logical eraseblock number to read from | ||
285 | * @buf: buffer where to store the read data | ||
286 | * @offset: offset within the logical eraseblock to read from | ||
287 | * @len: how many bytes to read | ||
288 | * @check: whether UBI has to check the read data's CRC or not. | ||
289 | * | ||
290 | * This function reads data from offset @offset of logical eraseblock @lnum and | ||
291 | * stores the data at @buf. When reading from static volumes, @check specifies | ||
292 | * whether the data has to be checked or not. If yes, the whole logical | ||
293 | * eraseblock will be read and its CRC checksum will be checked (i.e., the CRC | ||
294 | * checksum is per-eraseblock). So checking may substantially slow down the | ||
295 | * read speed. The @check argument is ignored for dynamic volumes. | ||
296 | * | ||
297 | * In case of success, this function returns zero. In case of failure, this | ||
298 | * function returns a negative error code. | ||
299 | * | ||
300 | * %-EBADMSG error code is returned: | ||
301 | * o for both static and dynamic volumes if MTD driver has detected a data | ||
302 | * integrity problem (unrecoverable ECC checksum mismatch in case of NAND); | ||
303 | * o for static volumes in case of data CRC mismatch. | ||
304 | * | ||
305 | * If the volume is damaged because of an interrupted update this function just | ||
306 | * returns immediately with %-EBADF error code. | ||
307 | */ | ||
308 | int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, | ||
309 | int len, int check) | ||
310 | { | ||
311 | struct ubi_volume *vol = desc->vol; | ||
312 | struct ubi_device *ubi = vol->ubi; | ||
313 | int err, vol_id = vol->vol_id; | ||
314 | |||
315 | dbg_msg("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset); | ||
316 | |||
317 | if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 || | ||
318 | lnum >= vol->used_ebs || offset < 0 || len < 0 || | ||
319 | offset + len > vol->usable_leb_size) | ||
320 | return -EINVAL; | ||
321 | |||
322 | if (vol->vol_type == UBI_STATIC_VOLUME && lnum == vol->used_ebs - 1 && | ||
323 | offset + len > vol->last_eb_bytes) | ||
324 | return -EINVAL; | ||
325 | |||
326 | if (vol->upd_marker) | ||
327 | return -EBADF; | ||
328 | if (len == 0) | ||
329 | return 0; | ||
330 | |||
331 | err = ubi_eba_read_leb(ubi, vol_id, lnum, buf, offset, len, check); | ||
332 | if (err && err == -EBADMSG && vol->vol_type == UBI_STATIC_VOLUME) { | ||
333 | ubi_warn("mark volume %d as corrupted", vol_id); | ||
334 | vol->corrupted = 1; | ||
335 | } | ||
336 | |||
337 | return err; | ||
338 | } | ||
339 | EXPORT_SYMBOL_GPL(ubi_leb_read); | ||
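A read sketch under the constraints above: the whole of LEB 0 of an already-open volume is read with CRC checking enabled; @desc is assumed to come from ubi_open_volume(), and vmalloc()/vfree() (<linux/vmalloc.h>) are used because the usable LEB size may be large:

	struct ubi_volume_info vi;
	char *buf;
	int err;

	ubi_get_volume_info(desc, &vi);
	buf = vmalloc(vi.usable_leb_size);
	if (!buf)
		return -ENOMEM;

	/* for the last LEB of a static volume the length must not exceed
	 * the amount of valid data stored there */
	err = ubi_leb_read(desc, 0, buf, 0, vi.usable_leb_size, 1);
	if (err == -EBADMSG)
		printk(KERN_WARNING "LEB 0 is corrupted\n");

	vfree(buf);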
340 | |||
341 | /** | ||
342 | * ubi_leb_write - write data. | ||
343 | * @desc: volume descriptor | ||
344 | * @lnum: logical eraseblock number to write to | ||
345 | * @buf: data to write | ||
346 | * @offset: offset within the logical eraseblock where to write | ||
347 | * @len: how many bytes to write | ||
348 | * @dtype: expected data type | ||
349 | * | ||
350 | * This function writes @len bytes of data from @buf to offset @offset of | ||
351 | * logical eraseblock @lnum. The @dtype argument describes expected lifetime of | ||
352 | * the data. | ||
353 | * | ||
354 | * This function takes care of physical eraseblock write failures. If the write | ||
355 | * operation to the physical eraseblock fails, the logical eraseblock is | ||
356 | * re-mapped to another physical eraseblock, the data is recovered, and the | ||
357 | * write finishes. UBI has a pool of reserved physical eraseblocks for this. | ||
358 | * | ||
359 | * If all the data were successfully written, zero is returned. If an error | ||
360 | * occurred and UBI has not been able to recover from it, this function returns | ||
361 | * a negative error code. Note, in case of an error, it is possible that | ||
362 | * something was still written to the flash media, but that may be some | ||
363 | * garbage. | ||
364 | * | ||
365 | * If the volume is damaged because of an interrupted update this function just | ||
366 | * returns immediately with %-EBADF code. | ||
367 | */ | ||
368 | int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, | ||
369 | int offset, int len, int dtype) | ||
370 | { | ||
371 | struct ubi_volume *vol = desc->vol; | ||
372 | struct ubi_device *ubi = vol->ubi; | ||
373 | int vol_id = vol->vol_id; | ||
374 | |||
375 | dbg_msg("write %d bytes to LEB %d:%d:%d", len, vol_id, lnum, offset); | ||
376 | |||
377 | if (vol_id < 0 || vol_id >= ubi->vtbl_slots) | ||
378 | return -EINVAL; | ||
379 | |||
380 | if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) | ||
381 | return -EROFS; | ||
382 | |||
383 | if (lnum < 0 || lnum >= vol->reserved_pebs || offset < 0 || len < 0 || | ||
384 | offset + len > vol->usable_leb_size || offset % ubi->min_io_size || | ||
385 | len % ubi->min_io_size) | ||
386 | return -EINVAL; | ||
387 | |||
388 | if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM && | ||
389 | dtype != UBI_UNKNOWN) | ||
390 | return -EINVAL; | ||
391 | |||
392 | if (vol->upd_marker) | ||
393 | return -EBADF; | ||
394 | |||
395 | if (len == 0) | ||
396 | return 0; | ||
397 | |||
398 | return ubi_eba_write_leb(ubi, vol_id, lnum, buf, offset, len, dtype); | ||
399 | } | ||
400 | EXPORT_SYMBOL_GPL(ubi_leb_write); | ||
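A write sketch for the constraints above: the volume must be dynamic and opened read-write, and both @offset and @len must be multiples of the minimum I/O unit of the underlying flash; @desc, @buf and @len are assumptions:

	int err;

	/* len is assumed to be a multiple of the min_io_size reported by
	 * ubi_get_device_info(), and no larger than the usable LEB size */
	err = ubi_leb_write(desc, 5, buf, 0, len, UBI_UNKNOWN);
	if (err)
		printk(KERN_ERR "write to LEB 5 failed: %d\n", err);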
401 | |||
402 | /** | ||
403 | * ubi_leb_change - change logical eraseblock atomically. | ||
404 | * @desc: volume descriptor | ||
405 | * @lnum: logical eraseblock number to change | ||
406 | * @buf: data to write | ||
407 | * @len: how many bytes to write | ||
408 | * @dtype: expected data type | ||
409 | * | ||
410 | * This function changes the contents of a logical eraseblock atomically. @buf | ||
411 | * has to contain new logical eraseblock data, and @len - the length of the | ||
412 | * data, which has to be aligned. The length may be shorter than the logical | ||
413 | * eraseblock size, and the logical eraseblock may be appended to more times | ||
414 | * later on. This function guarantees that in case of an unclean reboot the old | ||
415 | * contents is preserved. Returns zero in case of success and a negative error | ||
416 | * code in case of failure. | ||
417 | */ | ||
418 | int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, | ||
419 | int len, int dtype) | ||
420 | { | ||
421 | struct ubi_volume *vol = desc->vol; | ||
422 | struct ubi_device *ubi = vol->ubi; | ||
423 | int vol_id = vol->vol_id; | ||
424 | |||
425 | dbg_msg("atomically write %d bytes to LEB %d:%d", len, vol_id, lnum); | ||
426 | |||
427 | if (vol_id < 0 || vol_id >= ubi->vtbl_slots) | ||
428 | return -EINVAL; | ||
429 | |||
430 | if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) | ||
431 | return -EROFS; | ||
432 | |||
433 | if (lnum < 0 || lnum >= vol->reserved_pebs || len < 0 || | ||
434 | len > vol->usable_leb_size || len % ubi->min_io_size) | ||
435 | return -EINVAL; | ||
436 | |||
437 | if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM && | ||
438 | dtype != UBI_UNKNOWN) | ||
439 | return -EINVAL; | ||
440 | |||
441 | if (vol->upd_marker) | ||
442 | return -EBADF; | ||
443 | |||
444 | if (len == 0) | ||
445 | return 0; | ||
446 | |||
447 | return ubi_eba_atomic_leb_change(ubi, vol_id, lnum, buf, len, dtype); | ||
448 | } | ||
449 | EXPORT_SYMBOL_GPL(ubi_leb_change); | ||
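An atomic-change sketch: the whole contents of LEB 5 is replaced so that after an unclean reboot either the complete old data or the complete new data is seen, never a mixture (@desc, @newbuf and the aligned @new_len are assumptions):

	err = ubi_leb_change(desc, 5, newbuf, new_len, UBI_UNKNOWN);
	if (err)
		printk(KERN_ERR "atomic change of LEB 5 failed: %d\n", err);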
450 | |||
451 | /** | ||
452 | * ubi_leb_erase - erase logical eraseblock. | ||
453 | * @desc: volume descriptor | ||
454 | * @lnum: logical eraseblock number | ||
455 | * | ||
456 | * This function un-maps logical eraseblock @lnum and synchronously erases the | ||
457 | * corresponding physical eraseblock. Returns zero in case of success and a | ||
458 | * negative error code in case of failure. | ||
459 | * | ||
460 | * If the volume is damaged because of an interrupted update this function just | ||
461 | * returns immediately with %-EBADF code. | ||
462 | */ | ||
463 | int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum) | ||
464 | { | ||
465 | struct ubi_volume *vol = desc->vol; | ||
466 | struct ubi_device *ubi = vol->ubi; | ||
467 | int err, vol_id = vol->vol_id; | ||
468 | |||
469 | dbg_msg("erase LEB %d:%d", vol_id, lnum); | ||
470 | |||
471 | if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) | ||
472 | return -EROFS; | ||
473 | |||
474 | if (lnum < 0 || lnum >= vol->reserved_pebs) | ||
475 | return -EINVAL; | ||
476 | |||
477 | if (vol->upd_marker) | ||
478 | return -EBADF; | ||
479 | |||
480 | err = ubi_eba_unmap_leb(ubi, vol_id, lnum); | ||
481 | if (err) | ||
482 | return err; | ||
483 | |||
484 | return ubi_wl_flush(ubi); | ||
485 | } | ||
486 | EXPORT_SYMBOL_GPL(ubi_leb_erase); | ||
487 | |||
488 | /** | ||
489 | * ubi_leb_unmap - un-map logical eraseblock. | ||
490 | * @desc: volume descriptor | ||
491 | * @lnum: logical eraseblock number | ||
492 | * | ||
493 | * This function un-maps logical eraseblock @lnum and schedules the | ||
494 | * corresponding physical eraseblock for erasure, so that it will eventually be | ||
495 | * physically erased in background. This operation is much faster than the | ||
496 | * erase operation. | ||
497 | * | ||
498 | * Unlike erase, the un-map operation does not guarantee that the logical | ||
499 | * eraseblock will contain all 0xFF bytes when UBI is initialized again. For | ||
500 | * example, if several logical eraseblocks are un-mapped, and an unclean reboot | ||
501 | * happens after this, the logical eraseblocks will not necessarily be | ||
502 | * un-mapped again when this MTD device is attached. They may actually be | ||
503 | * mapped to the same physical eraseblocks again. So, this function has to be | ||
504 | * used with care. | ||
505 | * | ||
506 | * In other words, when un-mapping a logical eraseblock, UBI does not store | ||
507 | * any information about this on the flash media, it just marks the logical | ||
508 | * eraseblock as "un-mapped" in RAM. If UBI is detached before the physical | ||
509 | * eraseblock is physically erased, it will be mapped again to the same logical | ||
510 | * eraseblock when the MTD device is attached again. | ||
511 | * | ||
512 | * The main and obvious use-case of this function is when the contents of a | ||
513 | * logical eraseblock has to be re-written. Then it is much more efficient to | ||
514 | * first un-map it, then write new data, rather than first erase it, then write | ||
515 | * new data. Note, once new data has been written to the logical eraseblock, | ||
516 | * UBI guarantees that the old contents has gone forever. In other words, if an | ||
517 | * unclean reboot happens after the logical eraseblock has been un-mapped and | ||
518 | * then written to, it will contain the last written data. | ||
519 | * | ||
520 | * This function returns zero in case of success and a negative error code in | ||
521 | * case of failure. If the volume is damaged because of an interrupted update | ||
522 | * this function just returns immediately with %-EBADF code. | ||
523 | */ | ||
524 | int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum) | ||
525 | { | ||
526 | struct ubi_volume *vol = desc->vol; | ||
527 | struct ubi_device *ubi = vol->ubi; | ||
528 | int vol_id = vol->vol_id; | ||
529 | |||
530 | dbg_msg("unmap LEB %d:%d", vol_id, lnum); | ||
531 | |||
532 | if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) | ||
533 | return -EROFS; | ||
534 | |||
535 | if (lnum < 0 || lnum >= vol->reserved_pebs) | ||
536 | return -EINVAL; | ||
537 | |||
538 | if (vol->upd_marker) | ||
539 | return -EBADF; | ||
540 | |||
541 | return ubi_eba_unmap_leb(ubi, vol_id, lnum); | ||
542 | } | ||
543 | EXPORT_SYMBOL_GPL(ubi_leb_unmap); | ||
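A sketch of the un-map-then-write pattern described above, which is cheaper than erase-then-write because the physical erasure happens in background (@desc, @lnum, @buf and @len are assumptions):

	err = ubi_leb_unmap(desc, lnum);
	if (!err)
		err = ubi_leb_write(desc, lnum, buf, 0, len, UBI_UNKNOWN);
	/* once the write has succeeded, the old contents of LEB lnum is
	 * guaranteed to be gone, even across an unclean reboot */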
544 | |||
545 | /** | ||
546 | * ubi_is_mapped - check if logical eraseblock is mapped. | ||
547 | * @desc: volume descriptor | ||
548 | * @lnum: logical eraseblock number | ||
549 | * | ||
550 | * This function checks if logical eraseblock @lnum is mapped to a physical | ||
551 | * eraseblock. If a logical eraseblock is un-mapped, this does not necessarily | ||
552 | * mean it will still be un-mapped after the UBI device is re-attached. The | ||
553 | * logical eraseblock may become mapped to the physical eraseblock it was last | ||
554 | * mapped to. | ||
555 | * | ||
556 | * This function returns %1 if the LEB is mapped, %0 if not, and a negative | ||
557 | * error code in case of failure. If the volume is damaged because of an | ||
558 | * interrupted update this function just returns immediately with %-EBADF error | ||
559 | * code. | ||
560 | */ | ||
561 | int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum) | ||
562 | { | ||
563 | struct ubi_volume *vol = desc->vol; | ||
564 | |||
565 | dbg_msg("test LEB %d:%d", vol->vol_id, lnum); | ||
566 | |||
567 | if (lnum < 0 || lnum >= vol->reserved_pebs) | ||
568 | return -EINVAL; | ||
569 | |||
570 | if (vol->upd_marker) | ||
571 | return -EBADF; | ||
572 | |||
573 | return vol->eba_tbl[lnum] >= 0; | ||
574 | } | ||
575 | EXPORT_SYMBOL_GPL(ubi_is_mapped); | ||
diff --git a/drivers/mtd/ubi/misc.c b/drivers/mtd/ubi/misc.c new file mode 100644 index 000000000000..38d4e6757dc7 --- /dev/null +++ b/drivers/mtd/ubi/misc.c | |||
@@ -0,0 +1,105 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | /* Here we keep miscellaneous functions which are used all over the UBI code */ | ||
22 | |||
23 | #include "ubi.h" | ||
24 | |||
25 | /** | ||
26 | * calc_data_len - calculate how much real data is stored in a buffer. | ||
27 | * @ubi: UBI device description object | ||
28 | * @buf: a buffer with the contents of the physical eraseblock | ||
29 | * @length: the buffer length | ||
30 | * | ||
31 | * This function calculates how much "real data" is stored in @buf and returns | ||
32 | * the length. Continuous 0xFF bytes at the end of the buffer are not | ||
33 | * considered as "real data". | ||
34 | */ | ||
35 | int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, | ||
36 | int length) | ||
37 | { | ||
38 | int i; | ||
39 | |||
40 | ubi_assert(length % ubi->min_io_size == 0); | ||
41 | |||
42 | for (i = length - 1; i >= 0; i--) | ||
43 | if (((const uint8_t *)buf)[i] != 0xFF) | ||
44 | break; | ||
45 | |||
46 | /* The resulting length must be aligned to the minimum flash I/O size */ | ||
47 | length = ALIGN(i + 1, ubi->min_io_size); | ||
48 | return length; | ||
49 | } | ||
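For example (illustrative numbers only): with a minimum I/O unit of 512 bytes and the last non-0xFF byte at index 700, the loop stops at i = 700 and ALIGN(701, 512) = 1024, so the first 1024 bytes are reported as real data; a buffer consisting entirely of 0xFF bytes yields ALIGN(0, 512) = 0.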
50 | |||
51 | /** | ||
52 | * ubi_check_volume - check the contents of a static volume. | ||
53 | * @ubi: UBI device description object | ||
54 | * @vol_id: ID of the volume to check | ||
55 | * | ||
56 | * This function checks if static volume @vol_id is corrupted by fully reading | ||
57 | * it and checking data CRC. This function returns %0 if the volume is not | ||
58 | * corrupted, %1 if it is corrupted and a negative error code in case of | ||
59 | * failure. Dynamic volumes are not checked and zero is returned immediately. | ||
60 | */ | ||
61 | int ubi_check_volume(struct ubi_device *ubi, int vol_id) | ||
62 | { | ||
63 | void *buf; | ||
64 | int err = 0, i; | ||
65 | struct ubi_volume *vol = ubi->volumes[vol_id]; | ||
66 | |||
67 | if (vol->vol_type != UBI_STATIC_VOLUME) | ||
68 | return 0; | ||
69 | |||
70 | buf = kmalloc(vol->usable_leb_size, GFP_KERNEL); | ||
71 | if (!buf) | ||
72 | return -ENOMEM; | ||
73 | |||
74 | for (i = 0; i < vol->used_ebs; i++) { | ||
75 | int size; | ||
76 | |||
77 | if (i == vol->used_ebs - 1) | ||
78 | size = vol->last_eb_bytes; | ||
79 | else | ||
80 | size = vol->usable_leb_size; | ||
81 | |||
82 | err = ubi_eba_read_leb(ubi, vol_id, i, buf, 0, size, 1); | ||
83 | if (err) { | ||
84 | if (err == -EBADMSG) | ||
85 | err = 1; | ||
86 | break; | ||
87 | } | ||
88 | } | ||
89 | |||
90 | kfree(buf); | ||
91 | return err; | ||
92 | } | ||
93 | |||
94 | /** | ||
95 | * ubi_calculate_reserved - calculate how many PEBs must be reserved for bad | ||
96 | * eraseblock handling. | ||
97 | * @ubi: UBI device description object | ||
98 | */ | ||
99 | void ubi_calculate_reserved(struct ubi_device *ubi) | ||
100 | { | ||
101 | ubi->beb_rsvd_level = ubi->good_peb_count/100; | ||
102 | ubi->beb_rsvd_level *= CONFIG_MTD_UBI_BEB_RESERVE; | ||
103 | if (ubi->beb_rsvd_level < MIN_RESEVED_PEBS) | ||
104 | ubi->beb_rsvd_level = MIN_RESEVED_PEBS; | ||
105 | } | ||
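As a worked example (illustrative numbers only): with 4096 good physical eraseblocks and CONFIG_MTD_UBI_BEB_RESERVE set to 1 (percent), beb_rsvd_level = 4096/100 * 1 = 40, so 40 PEBs are reserved for bad eraseblock handling; the MIN_RESEVED_PEBS floor only takes effect when the percentage-based value would be smaller.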
diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c new file mode 100644 index 000000000000..473f3200b868 --- /dev/null +++ b/drivers/mtd/ubi/scan.c | |||
@@ -0,0 +1,1368 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * UBI scanning unit. | ||
23 | * | ||
24 | * This unit is responsible for scanning the flash media, checking UBI | ||
25 | * headers and providing complete information about the UBI flash image. | ||
26 | * | ||
27 | * The scanning information is represented by a &struct ubi_scan_info object. | ||
28 | * Information about found volumes is represented by &struct ubi_scan_volume | ||
29 | * objects which are kept in volume RB-tree with root at the @volumes field. | ||
30 | * The RB-tree is indexed by the volume ID. | ||
31 | * | ||
32 | * Found logical eraseblocks are represented by &struct ubi_scan_leb objects. | ||
33 | * These objects are kept in per-volume RB-trees with the root at the | ||
34 | * corresponding &struct ubi_scan_volume object. To put it differently, we keep | ||
35 | * an RB-tree of per-volume objects and each of these objects is the root of | ||
36 | * RB-tree of per-eraseblock objects. | ||
37 | * | ||
38 | * Corrupted physical eraseblocks are put to the @corr list, free physical | ||
39 | * eraseblocks are put to the @free list, and the physical eraseblocks to be | ||
40 | * erased are put to the @erase list. | ||
41 | */ | ||
42 | |||
43 | #include <linux/err.h> | ||
44 | #include <linux/crc32.h> | ||
45 | #include "ubi.h" | ||
46 | |||
47 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
48 | static int paranoid_check_si(const struct ubi_device *ubi, | ||
49 | struct ubi_scan_info *si); | ||
50 | #else | ||
51 | #define paranoid_check_si(ubi, si) 0 | ||
52 | #endif | ||
53 | |||
54 | /* Temporary variables used during scanning */ | ||
55 | static struct ubi_ec_hdr *ech; | ||
56 | static struct ubi_vid_hdr *vidh; | ||
57 | |||
58 | int ubi_scan_add_to_list(struct ubi_scan_info *si, int pnum, int ec, | ||
59 | struct list_head *list) | ||
60 | { | ||
61 | struct ubi_scan_leb *seb; | ||
62 | |||
63 | if (list == &si->free) | ||
64 | dbg_bld("add to free: PEB %d, EC %d", pnum, ec); | ||
65 | else if (list == &si->erase) | ||
66 | dbg_bld("add to erase: PEB %d, EC %d", pnum, ec); | ||
67 | else if (list == &si->corr) | ||
68 | dbg_bld("add to corrupted: PEB %d, EC %d", pnum, ec); | ||
69 | else if (list == &si->alien) | ||
70 | dbg_bld("add to alien: PEB %d, EC %d", pnum, ec); | ||
71 | else | ||
72 | BUG(); | ||
73 | |||
74 | seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL); | ||
75 | if (!seb) | ||
76 | return -ENOMEM; | ||
77 | |||
78 | seb->pnum = pnum; | ||
79 | seb->ec = ec; | ||
80 | list_add_tail(&seb->u.list, list); | ||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | /** | ||
85 | * commit_to_mean_value - commit intermediate results to the final mean erase | ||
86 | * counter value. | ||
87 | * @si: scanning information | ||
88 | * | ||
89 | * This is a helper function which calculates a partial mean erase counter | ||
90 | * value and adds it to the resulting mean value. As we can work only in | ||
91 | * integer arithmetic and we want to calculate the mean erase counter value | ||
92 | * accurately, we first sum erase counter values in @si->ec_sum variable and | ||
93 | * count these components in @si->ec_count. If this temporary @si->ec_sum is | ||
94 | * going to overflow, we calculate the partial mean value | ||
95 | * (@si->ec_sum/@si->ec_count) and add it to @si->mean_ec. | ||
96 | */ | ||
97 | static void commit_to_mean_value(struct ubi_scan_info *si) | ||
98 | { | ||
99 | si->ec_sum /= si->ec_count; | ||
100 | if (si->ec_sum % si->ec_count >= si->ec_count / 2) | ||
101 | si->mean_ec += 1; | ||
102 | si->mean_ec += si->ec_sum; | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * validate_vid_hdr - check that volume identifier header is correct and | ||
107 | * consistent. | ||
108 | * @vid_hdr: the volume identifier header to check | ||
109 | * @sv: information about the volume this logical eraseblock belongs to | ||
110 | * @pnum: physical eraseblock number the VID header came from | ||
111 | * | ||
112 | * This function checks that data stored in @vid_hdr is consistent. Returns | ||
113 | * non-zero if an inconsistency was found and zero if not. | ||
114 | * | ||
115 | * Note, UBI sanity-checks everything it reads from the flash media. | ||
116 | * Most of the checks are done in the I/O unit. Here we check that the | ||
117 | * information in the VID header is consistent with the information in other VID | ||
118 | * headers of the same volume. | ||
119 | */ | ||
120 | static int validate_vid_hdr(const struct ubi_vid_hdr *vid_hdr, | ||
121 | const struct ubi_scan_volume *sv, int pnum) | ||
122 | { | ||
123 | int vol_type = vid_hdr->vol_type; | ||
124 | int vol_id = ubi32_to_cpu(vid_hdr->vol_id); | ||
125 | int used_ebs = ubi32_to_cpu(vid_hdr->used_ebs); | ||
126 | int data_pad = ubi32_to_cpu(vid_hdr->data_pad); | ||
127 | |||
128 | if (sv->leb_count != 0) { | ||
129 | int sv_vol_type; | ||
130 | |||
131 | /* | ||
132 | * This is not the first logical eraseblock belonging to this | ||
133 | * volume. Ensure that the data in its VID header is consistent | ||
134 | * with the data in previous logical eraseblock headers. | ||
135 | */ | ||
136 | |||
137 | if (vol_id != sv->vol_id) { | ||
138 | dbg_err("inconsistent vol_id"); | ||
139 | goto bad; | ||
140 | } | ||
141 | |||
142 | if (sv->vol_type == UBI_STATIC_VOLUME) | ||
143 | sv_vol_type = UBI_VID_STATIC; | ||
144 | else | ||
145 | sv_vol_type = UBI_VID_DYNAMIC; | ||
146 | |||
147 | if (vol_type != sv_vol_type) { | ||
148 | dbg_err("inconsistent vol_type"); | ||
149 | goto bad; | ||
150 | } | ||
151 | |||
152 | if (used_ebs != sv->used_ebs) { | ||
153 | dbg_err("inconsistent used_ebs"); | ||
154 | goto bad; | ||
155 | } | ||
156 | |||
157 | if (data_pad != sv->data_pad) { | ||
158 | dbg_err("inconsistent data_pad"); | ||
159 | goto bad; | ||
160 | } | ||
161 | } | ||
162 | |||
163 | return 0; | ||
164 | |||
165 | bad: | ||
166 | ubi_err("inconsistent VID header at PEB %d", pnum); | ||
167 | ubi_dbg_dump_vid_hdr(vid_hdr); | ||
168 | ubi_dbg_dump_sv(sv); | ||
169 | return -EINVAL; | ||
170 | } | ||
171 | |||
172 | /** | ||
173 | * add_volume - add volume to the scanning information. | ||
174 | * @si: scanning information | ||
175 | * @vol_id: ID of the volume to add | ||
176 | * @pnum: physical eraseblock number | ||
177 | * @vid_hdr: volume identifier header | ||
178 | * | ||
179 | * If the volume corresponding to the @vid_hdr logical eraseblock is already | ||
180 | * present in the scanning information, this function does nothing. Otherwise | ||
181 | * it adds the corresponding volume to the scanning information. Returns a | ||
182 | * pointer to the scanning volume object in case of success and an error | ||
183 | * pointer in case of failure. | ||
184 | */ | ||
185 | static struct ubi_scan_volume *add_volume(struct ubi_scan_info *si, int vol_id, | ||
186 | int pnum, | ||
187 | const struct ubi_vid_hdr *vid_hdr) | ||
188 | { | ||
189 | struct ubi_scan_volume *sv; | ||
190 | struct rb_node **p = &si->volumes.rb_node, *parent = NULL; | ||
191 | |||
192 | ubi_assert(vol_id == ubi32_to_cpu(vid_hdr->vol_id)); | ||
193 | |||
194 | /* Walk the volume RB-tree to look if this volume is already present */ | ||
195 | while (*p) { | ||
196 | parent = *p; | ||
197 | sv = rb_entry(parent, struct ubi_scan_volume, rb); | ||
198 | |||
199 | if (vol_id == sv->vol_id) | ||
200 | return sv; | ||
201 | |||
202 | if (vol_id > sv->vol_id) | ||
203 | p = &(*p)->rb_left; | ||
204 | else | ||
205 | p = &(*p)->rb_right; | ||
206 | } | ||
207 | |||
208 | /* The volume is absent - add it */ | ||
209 | sv = kmalloc(sizeof(struct ubi_scan_volume), GFP_KERNEL); | ||
210 | if (!sv) | ||
211 | return ERR_PTR(-ENOMEM); | ||
212 | |||
213 | sv->highest_lnum = sv->leb_count = 0; | ||
214 | si->max_sqnum = 0; | ||
215 | sv->vol_id = vol_id; | ||
216 | sv->root = RB_ROOT; | ||
217 | sv->used_ebs = ubi32_to_cpu(vid_hdr->used_ebs); | ||
218 | sv->data_pad = ubi32_to_cpu(vid_hdr->data_pad); | ||
219 | sv->compat = vid_hdr->compat; | ||
220 | sv->vol_type = vid_hdr->vol_type == UBI_VID_DYNAMIC ? UBI_DYNAMIC_VOLUME | ||
221 | : UBI_STATIC_VOLUME; | ||
222 | if (vol_id > si->highest_vol_id) | ||
223 | si->highest_vol_id = vol_id; | ||
224 | |||
225 | rb_link_node(&sv->rb, parent, p); | ||
226 | rb_insert_color(&sv->rb, &si->volumes); | ||
227 | si->vols_found += 1; | ||
228 | dbg_bld("added volume %d", vol_id); | ||
229 | return sv; | ||
230 | } | ||
231 | |||
232 | /** | ||
233 | * compare_lebs - find out which logical eraseblock is newer. | ||
234 | * @ubi: UBI device description object | ||
235 | * @seb: first logical eraseblock to compare | ||
236 | * @pnum: physical eraseblock number of the second logical eraseblock to | ||
237 | * compare | ||
238 | * @vid_hdr: volume identifier header of the second logical eraseblock | ||
239 | * | ||
240 | * This function compares two copies of a LEB and reports which one is newer. | ||
241 | * In case of success a non-negative value is returned, in case of failure a | ||
242 | * negative error code is returned. The success return codes use the following | ||
243 | * bits: | ||
244 | * o bit 0 is cleared: the first PEB (described by @seb) is newer than the | ||
245 | * second PEB (described by @pnum and @vid_hdr); | ||
246 | * o bit 0 is set: the second PEB is newer; | ||
247 | * o bit 1 is cleared: no bit-flips were detected in the newer LEB; | ||
248 | * o bit 1 is set: bit-flips were detected in the newer LEB; | ||
249 | * o bit 2 is cleared: the older LEB is not corrupted; | ||
250 | * o bit 2 is set: the older LEB is corrupted. | ||
251 | */ | ||
252 | static int compare_lebs(const struct ubi_device *ubi, | ||
253 | const struct ubi_scan_leb *seb, int pnum, | ||
254 | const struct ubi_vid_hdr *vid_hdr) | ||
255 | { | ||
256 | void *buf; | ||
257 | int len, err, second_is_newer, bitflips = 0, corrupted = 0; | ||
258 | uint32_t data_crc, crc; | ||
259 | struct ubi_vid_hdr *vidh = NULL; | ||
260 | unsigned long long sqnum2 = ubi64_to_cpu(vid_hdr->sqnum); | ||
261 | |||
262 | if (seb->sqnum == 0 && sqnum2 == 0) { | ||
263 | long long abs, v1 = seb->leb_ver, v2 = ubi32_to_cpu(vid_hdr->leb_ver); | ||
264 | |||
265 | /* | ||
266 | * UBI constantly increases the logical eraseblock version | ||
267 | * number and it can overflow. Thus, we have to bear in mind | ||
268 | * that versions that are close to %0xFFFFFFFF are less than | ||
269 | * versions that are close to %0. | ||
270 | * | ||
271 | * The UBI WL unit guarantees that the number of pending tasks | ||
272 | * is not greater than %0x7FFFFFFF. So, if the difference | ||
273 | * between any two versions is greater than or equal to | ||
274 | * %0x7FFFFFFF, there was an overflow and the logical | ||
275 | * eraseblock with the lower version is actually newer than | ||
276 | * the one with the higher version. | ||
277 | * | ||
278 | * FIXME: but this is anyway obsolete and will be removed at | ||
279 | * some point. | ||
280 | */ | ||
281 | |||
282 | dbg_bld("using old crappy leb_ver stuff"); | ||
283 | |||
284 | abs = v1 - v2; | ||
285 | if (abs < 0) | ||
286 | abs = -abs; | ||
287 | |||
288 | if (abs < 0x7FFFFFFF) | ||
289 | /* Non-overflow situation */ | ||
290 | second_is_newer = (v2 > v1); | ||
291 | else | ||
292 | second_is_newer = (v2 < v1); | ||
293 | } else | ||
294 | /* Obviously the LEB with lower sequence counter is older */ | ||
295 | second_is_newer = sqnum2 > seb->sqnum; | ||
296 | |||
297 | /* | ||
298 | * Now we know which copy is newer. If the copy flag of the PEB with | ||
299 | * newer version is not set, then we just return, otherwise we have to | ||
300 | * check data CRC. For the second PEB we already have the VID header, | ||
301 | * for the first one - we'll need to re-read it from flash. | ||
302 | * | ||
303 | * FIXME: this may be optimized so that we wouldn't read twice. | ||
304 | */ | ||
305 | |||
306 | if (second_is_newer) { | ||
307 | if (!vid_hdr->copy_flag) { | ||
308 | /* It is not a copy, so it is newer */ | ||
309 | dbg_bld("second PEB %d is newer, copy_flag is unset", | ||
310 | pnum); | ||
311 | return 1; | ||
312 | } | ||
313 | } else { | ||
314 | pnum = seb->pnum; | ||
315 | |||
316 | vidh = ubi_zalloc_vid_hdr(ubi); | ||
317 | if (!vidh) | ||
318 | return -ENOMEM; | ||
319 | |||
320 | err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0); | ||
321 | if (err) { | ||
322 | if (err == UBI_IO_BITFLIPS) | ||
323 | bitflips = 1; | ||
324 | else { | ||
325 | dbg_err("VID of PEB %d header is bad, but it " | ||
326 | "was OK earlier", pnum); | ||
327 | if (err > 0) | ||
328 | err = -EIO; | ||
329 | |||
330 | goto out_free_vidh; | ||
331 | } | ||
332 | } | ||
333 | |||
334 | if (!vidh->copy_flag) { | ||
335 | /* It is not a copy, so it is newer; this is a success path */ | ||
336 | dbg_bld("first PEB %d is newer, copy_flag is unset", | ||
337 | pnum); | ||
338 | ubi_free_vid_hdr(ubi, vidh); | ||
339 | return bitflips << 1; | ||
340 | } | ||
341 | |||
342 | vid_hdr = vidh; | ||
343 | } | ||
344 | |||
345 | /* Read the data of the copy and check the CRC */ | ||
346 | |||
347 | len = ubi32_to_cpu(vid_hdr->data_size); | ||
348 | buf = kmalloc(len, GFP_KERNEL); | ||
349 | if (!buf) { | ||
350 | err = -ENOMEM; | ||
351 | goto out_free_vidh; | ||
352 | } | ||
353 | |||
354 | err = ubi_io_read_data(ubi, buf, pnum, 0, len); | ||
355 | if (err && err != UBI_IO_BITFLIPS) | ||
356 | goto out_free_buf; | ||
357 | |||
358 | data_crc = ubi32_to_cpu(vid_hdr->data_crc); | ||
359 | crc = crc32(UBI_CRC32_INIT, buf, len); | ||
360 | if (crc != data_crc) { | ||
361 | dbg_bld("PEB %d CRC error: calculated %#08x, must be %#08x", | ||
362 | pnum, crc, data_crc); | ||
363 | corrupted = 1; | ||
364 | bitflips = 0; | ||
365 | second_is_newer = !second_is_newer; | ||
366 | } else { | ||
367 | dbg_bld("PEB %d CRC is OK", pnum); | ||
368 | bitflips = !!err; | ||
369 | } | ||
370 | |||
371 | kfree(buf); | ||
372 | ubi_free_vid_hdr(ubi, vidh); | ||
373 | |||
374 | if (second_is_newer) | ||
375 | dbg_bld("second PEB %d is newer, copy_flag is set", pnum); | ||
376 | else | ||
377 | dbg_bld("first PEB %d is newer, copy_flag is set", pnum); | ||
378 | |||
379 | return second_is_newer | (bitflips << 1) | (corrupted << 2); | ||
380 | |||
381 | out_free_buf: | ||
382 | kfree(buf); | ||
383 | out_free_vidh: | ||
384 | ubi_free_vid_hdr(ubi, vidh); | ||
385 | ubi_assert(err < 0); | ||
386 | return err; | ||
387 | } | ||
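The bit-encoded result above is meant to be consumed with simple mask tests; ubi_scan_add_used() below does exactly that with its cmp_res variable. A minimal decoding sketch (the helper name and output flags are invented for illustration and are not part of the patch):

	/* Sketch only: decoding a non-negative compare_lebs() result */
	static void decode_cmp_res(int cmp_res, int *second_is_newer,
				   int *newer_has_bitflips, int *older_is_corrupted)
	{
		/* a negative cmp_res would be an error code, not a bit mask */
		*second_is_newer = !!(cmp_res & 1);	/* bit 0 */
		*newer_has_bitflips = !!(cmp_res & 2);	/* bit 1 */
		*older_is_corrupted = !!(cmp_res & 4);	/* bit 2 */
	}

For the leb_ver fallback, the 0x7FFFFFFF window means that, for example, a version of 0xFFFFFFF0 is treated as older than a version of 5: the difference exceeds the window, so a wrap-around is assumed.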
388 | |||
389 | /** | ||
390 | * ubi_scan_add_used - add information about a physical eraseblock to the | ||
391 | * scanning information. | ||
392 | * @ubi: UBI device description object | ||
393 | * @si: scanning information | ||
394 | * @pnum: the physical eraseblock number | ||
395 | * @ec: erase counter | ||
396 | * @vid_hdr: the volume identifier header | ||
397 | * @bitflips: if bit-flips were detected when this physical eraseblock was read | ||
398 | * | ||
399 | * This function returns zero in case of success and a negative error code in | ||
400 | * case of failure. | ||
401 | */ | ||
402 | int ubi_scan_add_used(const struct ubi_device *ubi, struct ubi_scan_info *si, | ||
403 | int pnum, int ec, const struct ubi_vid_hdr *vid_hdr, | ||
404 | int bitflips) | ||
405 | { | ||
406 | int err, vol_id, lnum; | ||
407 | uint32_t leb_ver; | ||
408 | unsigned long long sqnum; | ||
409 | struct ubi_scan_volume *sv; | ||
410 | struct ubi_scan_leb *seb; | ||
411 | struct rb_node **p, *parent = NULL; | ||
412 | |||
413 | vol_id = ubi32_to_cpu(vid_hdr->vol_id); | ||
414 | lnum = ubi32_to_cpu(vid_hdr->lnum); | ||
415 | sqnum = ubi64_to_cpu(vid_hdr->sqnum); | ||
416 | leb_ver = ubi32_to_cpu(vid_hdr->leb_ver); | ||
417 | |||
418 | dbg_bld("PEB %d, LEB %d:%d, EC %d, sqnum %llu, ver %u, bitflips %d", | ||
419 | pnum, vol_id, lnum, ec, sqnum, leb_ver, bitflips); | ||
420 | |||
421 | sv = add_volume(si, vol_id, pnum, vid_hdr); | ||
422 | if (IS_ERR(sv)) | ||
423 | return PTR_ERR(sv); | ||
424 | |||
425 | /* | ||
426 | * Walk the RB-tree of logical eraseblocks of volume @vol_id to see | ||
427 | * whether this is the first instance of this logical eraseblock or not. | ||
428 | */ | ||
429 | p = &sv->root.rb_node; | ||
430 | while (*p) { | ||
431 | int cmp_res; | ||
432 | |||
433 | parent = *p; | ||
434 | seb = rb_entry(parent, struct ubi_scan_leb, u.rb); | ||
435 | if (lnum != seb->lnum) { | ||
436 | if (lnum < seb->lnum) | ||
437 | p = &(*p)->rb_left; | ||
438 | else | ||
439 | p = &(*p)->rb_right; | ||
440 | continue; | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * There is already a physical eraseblock describing the same | ||
445 | * logical eraseblock present. | ||
446 | */ | ||
447 | |||
448 | dbg_bld("this LEB already exists: PEB %d, sqnum %llu, " | ||
449 | "LEB ver %u, EC %d", seb->pnum, seb->sqnum, | ||
450 | seb->leb_ver, seb->ec); | ||
451 | |||
452 | /* | ||
453 | * Make sure that the logical eraseblocks have different | ||
454 | * versions. Otherwise the image is bad. | ||
455 | */ | ||
456 | if (seb->leb_ver == leb_ver && leb_ver != 0) { | ||
457 | ubi_err("two LEBs with same version %u", leb_ver); | ||
458 | ubi_dbg_dump_seb(seb, 0); | ||
459 | ubi_dbg_dump_vid_hdr(vid_hdr); | ||
460 | return -EINVAL; | ||
461 | } | ||
462 | |||
463 | /* | ||
464 | * Make sure that the logical eraseblocks have different | ||
465 | * sequence numbers. Otherwise the image is bad. | ||
466 | * | ||
467 | * FIXME: remove 'sqnum != 0' check when leb_ver is removed. | ||
468 | */ | ||
469 | if (seb->sqnum == sqnum && sqnum != 0) { | ||
470 | ubi_err("two LEBs with same sequence number %llu", | ||
471 | sqnum); | ||
472 | ubi_dbg_dump_seb(seb, 0); | ||
473 | ubi_dbg_dump_vid_hdr(vid_hdr); | ||
474 | return -EINVAL; | ||
475 | } | ||
476 | |||
477 | /* | ||
478 | * Now we have to drop the older one and preserve the newer | ||
479 | * one. | ||
480 | */ | ||
481 | cmp_res = compare_lebs(ubi, seb, pnum, vid_hdr); | ||
482 | if (cmp_res < 0) | ||
483 | return cmp_res; | ||
484 | |||
485 | if (cmp_res & 1) { | ||
486 | /* | ||
487 | * This logical eraseblock is newer than the one | ||
488 | * found earlier. | ||
489 | */ | ||
490 | err = validate_vid_hdr(vid_hdr, sv, pnum); | ||
491 | if (err) | ||
492 | return err; | ||
493 | |||
494 | if (cmp_res & 4) | ||
495 | err = ubi_scan_add_to_list(si, seb->pnum, | ||
496 | seb->ec, &si->corr); | ||
497 | else | ||
498 | err = ubi_scan_add_to_list(si, seb->pnum, | ||
499 | seb->ec, &si->erase); | ||
500 | if (err) | ||
501 | return err; | ||
502 | |||
503 | seb->ec = ec; | ||
504 | seb->pnum = pnum; | ||
505 | seb->scrub = ((cmp_res & 2) || bitflips); | ||
506 | seb->sqnum = sqnum; | ||
507 | seb->leb_ver = leb_ver; | ||
508 | |||
509 | if (sv->highest_lnum == lnum) | ||
510 | sv->last_data_size = | ||
511 | ubi32_to_cpu(vid_hdr->data_size); | ||
512 | |||
513 | return 0; | ||
514 | } else { | ||
515 | /* | ||
516 | * This logical eraseblock is older than the one found | ||
517 | * previously. | ||
518 | */ | ||
519 | if (cmp_res & 4) | ||
520 | return ubi_scan_add_to_list(si, pnum, ec, | ||
521 | &si->corr); | ||
522 | else | ||
523 | return ubi_scan_add_to_list(si, pnum, ec, | ||
524 | &si->erase); | ||
525 | } | ||
526 | } | ||
527 | |||
528 | /* | ||
529 | * We've met this logical eraseblock for the first time, add it to the | ||
530 | * scanning information. | ||
531 | */ | ||
532 | |||
533 | err = validate_vid_hdr(vid_hdr, sv, pnum); | ||
534 | if (err) | ||
535 | return err; | ||
536 | |||
537 | seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL); | ||
538 | if (!seb) | ||
539 | return -ENOMEM; | ||
540 | |||
541 | seb->ec = ec; | ||
542 | seb->pnum = pnum; | ||
543 | seb->lnum = lnum; | ||
544 | seb->sqnum = sqnum; | ||
545 | seb->scrub = bitflips; | ||
546 | seb->leb_ver = leb_ver; | ||
547 | |||
548 | if (sv->highest_lnum <= lnum) { | ||
549 | sv->highest_lnum = lnum; | ||
550 | sv->last_data_size = ubi32_to_cpu(vid_hdr->data_size); | ||
551 | } | ||
552 | |||
553 | if (si->max_sqnum < sqnum) | ||
554 | si->max_sqnum = sqnum; | ||
555 | |||
556 | sv->leb_count += 1; | ||
557 | rb_link_node(&seb->u.rb, parent, p); | ||
558 | rb_insert_color(&seb->u.rb, &sv->root); | ||
559 | return 0; | ||
560 | } | ||
561 | |||
562 | /** | ||
563 | * ubi_scan_find_sv - find information about a particular volume in the | ||
564 | * scanning information. | ||
565 | * @si: scanning information | ||
566 | * @vol_id: the requested volume ID | ||
567 | * | ||
568 | * This function returns a pointer to the volume description or %NULL if there | ||
569 | * are no data about this volume in the scanning information. | ||
570 | */ | ||
571 | struct ubi_scan_volume *ubi_scan_find_sv(const struct ubi_scan_info *si, | ||
572 | int vol_id) | ||
573 | { | ||
574 | struct ubi_scan_volume *sv; | ||
575 | struct rb_node *p = si->volumes.rb_node; | ||
576 | |||
577 | while (p) { | ||
578 | sv = rb_entry(p, struct ubi_scan_volume, rb); | ||
579 | |||
580 | if (vol_id == sv->vol_id) | ||
581 | return sv; | ||
582 | |||
583 | if (vol_id > sv->vol_id) | ||
584 | p = p->rb_left; | ||
585 | else | ||
586 | p = p->rb_right; | ||
587 | } | ||
588 | |||
589 | return NULL; | ||
590 | } | ||
591 | |||
592 | /** | ||
593 | * ubi_scan_find_seb - find information about a particular logical | ||
594 | * eraseblock in the volume scanning information. | ||
595 | * @sv: a pointer to the volume scanning information | ||
596 | * @lnum: the requested logical eraseblock | ||
597 | * | ||
598 | * This function returns a pointer to the scanning logical eraseblock or %NULL | ||
599 | * if there are no data about it in the scanning volume information. | ||
600 | */ | ||
601 | struct ubi_scan_leb *ubi_scan_find_seb(const struct ubi_scan_volume *sv, | ||
602 | int lnum) | ||
603 | { | ||
604 | struct ubi_scan_leb *seb; | ||
605 | struct rb_node *p = sv->root.rb_node; | ||
606 | |||
607 | while (p) { | ||
608 | seb = rb_entry(p, struct ubi_scan_leb, u.rb); | ||
609 | |||
610 | if (lnum == seb->lnum) | ||
611 | return seb; | ||
612 | |||
613 | if (lnum < seb->lnum) | ||
614 | p = p->rb_left; | ||
615 | else | ||
616 | p = p->rb_right; | ||
617 | } | ||
618 | |||
619 | return NULL; | ||
620 | } | ||
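Together with ubi_scan_find_sv(), this lets other UBI units resolve a volume/LEB pair from the scanning information. A usage sketch, assuming an already-built @si; the volume and LEB numbers (7 and 0) are arbitrary examples:

	/* Sketch only: look up which PEB holds LEB 0 of volume 7 */
	static int find_leb_example(struct ubi_scan_info *si)
	{
		struct ubi_scan_volume *sv;
		struct ubi_scan_leb *seb;

		sv = ubi_scan_find_sv(si, 7);
		if (!sv)
			return -ENODEV;	/* no such volume was found on flash */

		seb = ubi_scan_find_seb(sv, 0);
		if (!seb)
			return -ENODEV;	/* this LEB was never written */

		ubi_msg("LEB 7:0 is in PEB %d, erase counter %d",
			seb->pnum, seb->ec);
		return 0;
	}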
621 | |||
622 | /** | ||
623 | * ubi_scan_rm_volume - delete scanning information about a volume. | ||
624 | * @si: scanning information | ||
625 | * @sv: the volume scanning information to delete | ||
626 | */ | ||
627 | void ubi_scan_rm_volume(struct ubi_scan_info *si, struct ubi_scan_volume *sv) | ||
628 | { | ||
629 | struct rb_node *rb; | ||
630 | struct ubi_scan_leb *seb; | ||
631 | |||
632 | dbg_bld("remove scanning information about volume %d", sv->vol_id); | ||
633 | |||
634 | while ((rb = rb_first(&sv->root))) { | ||
635 | seb = rb_entry(rb, struct ubi_scan_leb, u.rb); | ||
636 | rb_erase(&seb->u.rb, &sv->root); | ||
637 | list_add_tail(&seb->u.list, &si->erase); | ||
638 | } | ||
639 | |||
640 | rb_erase(&sv->rb, &si->volumes); | ||
641 | kfree(sv); | ||
642 | si->vols_found -= 1; | ||
643 | } | ||
644 | |||
645 | /** | ||
646 | * ubi_scan_erase_peb - erase a physical eraseblock. | ||
647 | * @ubi: UBI device description object | ||
648 | * @si: scanning information | ||
649 | * @pnum: physical eraseblock number to erase; | ||
650 | * @ec: erase counter value to write (%UBI_SCAN_UNKNOWN_EC if it is unknown) | ||
651 | * | ||
652 | * This function erases physical eraseblock 'pnum', and writes the erase | ||
653 | * counter header to it. This function should only be used during the UBI device | ||
654 | * initialization stages, when the EBA unit has not yet been initialized. This | ||
655 | * function returns zero in case of success and a negative error code in case | ||
656 | * of failure. | ||
657 | */ | ||
658 | int ubi_scan_erase_peb(const struct ubi_device *ubi, | ||
659 | const struct ubi_scan_info *si, int pnum, int ec) | ||
660 | { | ||
661 | int err; | ||
662 | struct ubi_ec_hdr *ec_hdr; | ||
663 | |||
664 | ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); | ||
665 | if (!ec_hdr) | ||
666 | return -ENOMEM; | ||
667 | |||
668 | if ((long long)ec >= UBI_MAX_ERASECOUNTER) { | ||
669 | /* | ||
670 | * Erase counter overflow. Upgrade UBI and use 64-bit | ||
671 | * erase counters internally. | ||
672 | */ | ||
673 | ubi_err("erase counter overflow at PEB %d, EC %d", pnum, ec); | ||
674 | err = -EINVAL; | ||
675 | goto out_free; | ||
676 | } | ||
677 | ec_hdr->ec = cpu_to_ubi64(ec); | ||
678 | |||
679 | err = ubi_io_sync_erase(ubi, pnum, 0); | ||
680 | if (err < 0) | ||
681 | goto out_free; | ||
682 | |||
683 | err = ubi_io_write_ec_hdr(ubi, pnum, ec_hdr); | ||
684 | |||
685 | out_free: | ||
686 | kfree(ec_hdr); | ||
687 | return err; | ||
688 | } | ||
689 | |||
690 | /** | ||
691 | * ubi_scan_get_free_peb - get a free physical eraseblock. | ||
692 | * @ubi: UBI device description object | ||
693 | * @si: scanning information | ||
694 | * | ||
695 | * This function returns a free physical eraseblock. It is supposed to be | ||
696 | * called during the UBI initialization stages, when the wear-leveling unit is not | ||
697 | * initialized yet. This function picks a physical eraseblock from one of the | ||
698 | * lists, writes the EC header if it is needed, and removes it from the list. | ||
699 | * | ||
700 | * This function returns scanning physical eraseblock information in case of | ||
701 | * success and an error code in case of failure. | ||
702 | */ | ||
703 | struct ubi_scan_leb *ubi_scan_get_free_peb(const struct ubi_device *ubi, | ||
704 | struct ubi_scan_info *si) | ||
705 | { | ||
706 | int err = 0, i; | ||
707 | struct ubi_scan_leb *seb; | ||
708 | |||
709 | if (!list_empty(&si->free)) { | ||
710 | seb = list_entry(si->free.next, struct ubi_scan_leb, u.list); | ||
711 | list_del(&seb->u.list); | ||
712 | dbg_bld("return free PEB %d, EC %d", seb->pnum, seb->ec); | ||
713 | return seb; | ||
714 | } | ||
715 | |||
716 | for (i = 0; i < 2; i++) { | ||
717 | struct list_head *head; | ||
718 | struct ubi_scan_leb *tmp_seb; | ||
719 | |||
720 | if (i == 0) | ||
721 | head = &si->erase; | ||
722 | else | ||
723 | head = &si->corr; | ||
724 | |||
725 | /* | ||
726 | * We try to erase the first physical eraseblock from the @head | ||
727 | * list and pick it if we succeed, or try to erase the | ||
728 | * next one if not, and so forth. We do not bother with bad | ||
729 | * eraseblocks here - they'll be handled later. | ||
730 | */ | ||
731 | list_for_each_entry_safe(seb, tmp_seb, head, u.list) { | ||
732 | if (seb->ec == UBI_SCAN_UNKNOWN_EC) | ||
733 | seb->ec = si->mean_ec; | ||
734 | |||
735 | err = ubi_scan_erase_peb(ubi, si, seb->pnum, seb->ec+1); | ||
736 | if (err) | ||
737 | continue; | ||
738 | |||
739 | seb->ec += 1; | ||
740 | list_del(&seb->u.list); | ||
741 | dbg_bld("return PEB %d, EC %d", seb->pnum, seb->ec); | ||
742 | return seb; | ||
743 | } | ||
744 | } | ||
745 | |||
746 | ubi_err("no eraseblocks found"); | ||
747 | return ERR_PTR(-ENOSPC); | ||
748 | } | ||
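A sketch of how an initialization-time caller (the volume table code, for instance) might use this interface; the wrapper below is illustrative only and is not how the actual volume table code is written:

	/* Sketch only: obtain an erased PEB with a valid EC header at attach time */
	static int get_blank_peb_example(struct ubi_device *ubi,
					 struct ubi_scan_info *si)
	{
		struct ubi_scan_leb *seb;

		seb = ubi_scan_get_free_peb(ubi, si);
		if (IS_ERR(seb))
			return PTR_ERR(seb);	/* typically -ENOSPC */

		/*
		 * @seb has been removed from its list: the caller now owns it
		 * and is responsible for eventually freeing it.
		 */
		return seb->pnum;
	}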
749 | |||
750 | /** | ||
751 | * process_eb - read UBI headers, check them and add corresponding data | ||
752 | * to the scanning information. | ||
753 | * @ubi: UBI device description object | ||
754 | * @si: scanning information | ||
755 | * @pnum: the physical eraseblock number | ||
756 | * | ||
757 | * This function returns zero if the physical eraseblock was successfully | ||
758 | * handled and a negative error code in case of failure. | ||
759 | */ | ||
760 | static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum) | ||
761 | { | ||
762 | long long ec; | ||
763 | int err, bitflips = 0, vol_id, ec_corr = 0; | ||
764 | |||
765 | dbg_bld("scan PEB %d", pnum); | ||
766 | |||
767 | /* Skip bad physical eraseblocks */ | ||
768 | err = ubi_io_is_bad(ubi, pnum); | ||
769 | if (err < 0) | ||
770 | return err; | ||
771 | else if (err) { | ||
772 | /* | ||
773 | * FIXME: it is actually the duty of the I/O unit to initialize | ||
774 | * this, but MTD does not provide enough information. | ||
775 | */ | ||
776 | si->bad_peb_count += 1; | ||
777 | return 0; | ||
778 | } | ||
779 | |||
780 | err = ubi_io_read_ec_hdr(ubi, pnum, ech, 0); | ||
781 | if (err < 0) | ||
782 | return err; | ||
783 | else if (err == UBI_IO_BITFLIPS) | ||
784 | bitflips = 1; | ||
785 | else if (err == UBI_IO_PEB_EMPTY) | ||
786 | return ubi_scan_add_to_list(si, pnum, UBI_SCAN_UNKNOWN_EC, | ||
787 | &si->erase); | ||
788 | else if (err == UBI_IO_BAD_EC_HDR) { | ||
789 | /* | ||
790 | * We also have to look at the VID header - it may still be | ||
791 | * intact. Set the %bitflips flag so that this PEB will be | ||
792 | * moved and its EC header re-created. | ||
793 | */ | ||
794 | ec_corr = 1; | ||
795 | ec = UBI_SCAN_UNKNOWN_EC; | ||
796 | bitflips = 1; | ||
797 | } | ||
798 | |||
799 | si->is_empty = 0; | ||
800 | |||
801 | if (!ec_corr) { | ||
802 | /* Make sure UBI version is OK */ | ||
803 | if (ech->version != UBI_VERSION) { | ||
804 | ubi_err("this UBI version is %d, image version is %d", | ||
805 | UBI_VERSION, (int)ech->version); | ||
806 | return -EINVAL; | ||
807 | } | ||
808 | |||
809 | ec = ubi64_to_cpu(ech->ec); | ||
810 | if (ec > UBI_MAX_ERASECOUNTER) { | ||
811 | /* | ||
812 | * Erase counter overflow. The EC headers have 64 bits | ||
813 | * reserved, but we use only 31-bit values anyway, as | ||
814 | * this seems to be enough for any existing | ||
815 | * flash. Upgrade UBI and use 64-bit erase counters | ||
816 | * internally. | ||
817 | */ | ||
818 | ubi_err("erase counter overflow, max is %d", | ||
819 | UBI_MAX_ERASECOUNTER); | ||
820 | ubi_dbg_dump_ec_hdr(ech); | ||
821 | return -EINVAL; | ||
822 | } | ||
823 | } | ||
824 | |||
825 | /* OK, we are done with the EC header, let's look at the VID header */ | ||
826 | |||
827 | err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0); | ||
828 | if (err < 0) | ||
829 | return err; | ||
830 | else if (err == UBI_IO_BITFLIPS) | ||
831 | bitflips = 1; | ||
832 | else if (err == UBI_IO_BAD_VID_HDR || | ||
833 | (err == UBI_IO_PEB_FREE && ec_corr)) { | ||
834 | /* VID header is corrupted */ | ||
835 | err = ubi_scan_add_to_list(si, pnum, ec, &si->corr); | ||
836 | if (err) | ||
837 | return err; | ||
838 | goto adjust_mean_ec; | ||
839 | } else if (err == UBI_IO_PEB_FREE) { | ||
840 | /* No VID header - the physical eraseblock is free */ | ||
841 | err = ubi_scan_add_to_list(si, pnum, ec, &si->free); | ||
842 | if (err) | ||
843 | return err; | ||
844 | goto adjust_mean_ec; | ||
845 | } | ||
846 | |||
847 | vol_id = ubi32_to_cpu(vidh->vol_id); | ||
848 | if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOL_ID) { | ||
849 | int lnum = ubi32_to_cpu(vidh->lnum); | ||
850 | |||
851 | /* Unsupported internal volume */ | ||
852 | switch (vidh->compat) { | ||
853 | case UBI_COMPAT_DELETE: | ||
854 | ubi_msg("\"delete\" compatible internal volume %d:%d" | ||
855 | " found, remove it", vol_id, lnum); | ||
856 | err = ubi_scan_add_to_list(si, pnum, ec, &si->corr); | ||
857 | if (err) | ||
858 | return err; | ||
859 | break; | ||
860 | |||
861 | case UBI_COMPAT_RO: | ||
862 | ubi_msg("read-only compatible internal volume %d:%d" | ||
863 | " found, switch to read-only mode", | ||
864 | vol_id, lnum); | ||
865 | ubi->ro_mode = 1; | ||
866 | break; | ||
867 | |||
868 | case UBI_COMPAT_PRESERVE: | ||
869 | ubi_msg("\"preserve\" compatible internal volume %d:%d" | ||
870 | " found", vol_id, lnum); | ||
871 | err = ubi_scan_add_to_list(si, pnum, ec, &si->alien); | ||
872 | if (err) | ||
873 | return err; | ||
874 | si->alien_peb_count += 1; | ||
875 | return 0; | ||
876 | |||
877 | case UBI_COMPAT_REJECT: | ||
878 | ubi_err("incompatible internal volume %d:%d found", | ||
879 | vol_id, lnum); | ||
880 | return -EINVAL; | ||
881 | } | ||
882 | } | ||
883 | |||
884 | /* Both UBI headers seem to be fine */ | ||
885 | err = ubi_scan_add_used(ubi, si, pnum, ec, vidh, bitflips); | ||
886 | if (err) | ||
887 | return err; | ||
888 | |||
889 | adjust_mean_ec: | ||
890 | if (!ec_corr) { | ||
891 | if (si->ec_sum + ec < ec) { | ||
892 | commit_to_mean_value(si); | ||
893 | si->ec_sum = 0; | ||
894 | si->ec_count = 0; | ||
895 | } else { | ||
896 | si->ec_sum += ec; | ||
897 | si->ec_count += 1; | ||
898 | } | ||
899 | |||
900 | if (ec > si->max_ec) | ||
901 | si->max_ec = ec; | ||
902 | if (ec < si->min_ec) | ||
903 | si->min_ec = ec; | ||
904 | } | ||
905 | |||
906 | return 0; | ||
907 | } | ||
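The adjust_mean_ec path accumulates erase counters in @ec_sum and @ec_count and folds them into the mean via commit_to_mean_value(), which is defined earlier in this file and not visible in this hunk. A hedged sketch of what such a helper presumably does (the exact weighting of previously committed batches is not shown here):

	/* Sketch only: fold the accumulated erase counters into the mean */
	static void commit_to_mean_value(struct ubi_scan_info *si)
	{
		if (si->ec_count)
			si->mean_ec = si->ec_sum / si->ec_count;
	}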
908 | |||
909 | /** | ||
910 | * ubi_scan - scan an MTD device. | ||
911 | * @ubi: UBI device description object | ||
912 | * | ||
913 | * This function does full scanning of an MTD device and returns complete | ||
914 | * information about it. In case of failure, an error code is returned. | ||
915 | */ | ||
916 | struct ubi_scan_info *ubi_scan(struct ubi_device *ubi) | ||
917 | { | ||
918 | int err, pnum; | ||
919 | struct rb_node *rb1, *rb2; | ||
920 | struct ubi_scan_volume *sv; | ||
921 | struct ubi_scan_leb *seb; | ||
922 | struct ubi_scan_info *si; | ||
923 | |||
924 | si = kzalloc(sizeof(struct ubi_scan_info), GFP_KERNEL); | ||
925 | if (!si) | ||
926 | return ERR_PTR(-ENOMEM); | ||
927 | |||
928 | INIT_LIST_HEAD(&si->corr); | ||
929 | INIT_LIST_HEAD(&si->free); | ||
930 | INIT_LIST_HEAD(&si->erase); | ||
931 | INIT_LIST_HEAD(&si->alien); | ||
932 | si->volumes = RB_ROOT; | ||
933 | si->is_empty = 1; | ||
934 | |||
935 | err = -ENOMEM; | ||
936 | ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); | ||
937 | if (!ech) | ||
938 | goto out_si; | ||
939 | |||
940 | vidh = ubi_zalloc_vid_hdr(ubi); | ||
941 | if (!vidh) | ||
942 | goto out_ech; | ||
943 | |||
944 | for (pnum = 0; pnum < ubi->peb_count; pnum++) { | ||
945 | cond_resched(); | ||
946 | |||
947 | dbg_msg("process PEB %d", pnum); | ||
948 | err = process_eb(ubi, si, pnum); | ||
949 | if (err < 0) | ||
950 | goto out_vidh; | ||
951 | } | ||
952 | |||
953 | dbg_msg("scanning is finished"); | ||
954 | |||
955 | /* Finish mean erase counter calculations */ | ||
956 | if (si->ec_count) | ||
957 | commit_to_mean_value(si); | ||
958 | |||
959 | if (si->is_empty) | ||
960 | ubi_msg("empty MTD device detected"); | ||
961 | |||
962 | /* | ||
963 | * In case of unknown erase counter we use the mean erase counter | ||
964 | * value. | ||
965 | */ | ||
966 | ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) { | ||
967 | ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) | ||
968 | if (seb->ec == UBI_SCAN_UNKNOWN_EC) | ||
969 | seb->ec = si->mean_ec; | ||
970 | } | ||
971 | |||
972 | list_for_each_entry(seb, &si->free, u.list) { | ||
973 | if (seb->ec == UBI_SCAN_UNKNOWN_EC) | ||
974 | seb->ec = si->mean_ec; | ||
975 | } | ||
976 | |||
977 | list_for_each_entry(seb, &si->corr, u.list) | ||
978 | if (seb->ec == UBI_SCAN_UNKNOWN_EC) | ||
979 | seb->ec = si->mean_ec; | ||
980 | |||
981 | list_for_each_entry(seb, &si->erase, u.list) | ||
982 | if (seb->ec == UBI_SCAN_UNKNOWN_EC) | ||
983 | seb->ec = si->mean_ec; | ||
984 | |||
985 | err = paranoid_check_si(ubi, si); | ||
986 | if (err) { | ||
987 | if (err > 0) | ||
988 | err = -EINVAL; | ||
989 | goto out_vidh; | ||
990 | } | ||
991 | |||
992 | ubi_free_vid_hdr(ubi, vidh); | ||
993 | kfree(ech); | ||
994 | |||
995 | return si; | ||
996 | |||
997 | out_vidh: | ||
998 | ubi_free_vid_hdr(ubi, vidh); | ||
999 | out_ech: | ||
1000 | kfree(ech); | ||
1001 | out_si: | ||
1002 | ubi_scan_destroy_si(si); | ||
1003 | return ERR_PTR(err); | ||
1004 | } | ||
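A sketch of the overall calling pattern, roughly what the attach code is expected to do with the result; the function name, iteration and messages here are illustrative, not the actual attach code:

	/* Sketch only: scan an MTD device, report the result, release it */
	static int attach_scan_example(struct ubi_device *ubi)
	{
		struct ubi_scan_info *si;
		struct ubi_scan_volume *sv;
		struct rb_node *rb1;

		si = ubi_scan(ubi);
		if (IS_ERR(si))
			return PTR_ERR(si);

		ubi_msg("%d volumes found, mean EC %d, %d bad PEBs",
			si->vols_found, si->mean_ec, si->bad_peb_count);

		ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb)
			ubi_msg("volume %d: %d LEBs", sv->vol_id, sv->leb_count);

		/* the EBA and WL units would consume @si here (ubi_eba_init_scan() etc.) */

		ubi_scan_destroy_si(si);
		return 0;
	}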
1005 | |||
1006 | /** | ||
1007 | * destroy_sv - free the scanning volume information | ||
1008 | * @sv: scanning volume information | ||
1009 | * | ||
1010 | * This function destroys the volume RB-tree (@sv->root) and the scanning | ||
1011 | * volume information. | ||
1012 | */ | ||
1013 | static void destroy_sv(struct ubi_scan_volume *sv) | ||
1014 | { | ||
1015 | struct ubi_scan_leb *seb; | ||
1016 | struct rb_node *this = sv->root.rb_node; | ||
1017 | |||
1018 | while (this) { | ||
1019 | if (this->rb_left) | ||
1020 | this = this->rb_left; | ||
1021 | else if (this->rb_right) | ||
1022 | this = this->rb_right; | ||
1023 | else { | ||
1024 | seb = rb_entry(this, struct ubi_scan_leb, u.rb); | ||
1025 | this = rb_parent(this); | ||
1026 | if (this) { | ||
1027 | if (this->rb_left == &seb->u.rb) | ||
1028 | this->rb_left = NULL; | ||
1029 | else | ||
1030 | this->rb_right = NULL; | ||
1031 | } | ||
1032 | |||
1033 | kfree(seb); | ||
1034 | } | ||
1035 | } | ||
1036 | kfree(sv); | ||
1037 | } | ||
1038 | |||
1039 | /** | ||
1040 | * ubi_scan_destroy_si - destroy scanning information. | ||
1041 | * @si: scanning information | ||
1042 | */ | ||
1043 | void ubi_scan_destroy_si(struct ubi_scan_info *si) | ||
1044 | { | ||
1045 | struct ubi_scan_leb *seb, *seb_tmp; | ||
1046 | struct ubi_scan_volume *sv; | ||
1047 | struct rb_node *rb; | ||
1048 | |||
1049 | list_for_each_entry_safe(seb, seb_tmp, &si->alien, u.list) { | ||
1050 | list_del(&seb->u.list); | ||
1051 | kfree(seb); | ||
1052 | } | ||
1053 | list_for_each_entry_safe(seb, seb_tmp, &si->erase, u.list) { | ||
1054 | list_del(&seb->u.list); | ||
1055 | kfree(seb); | ||
1056 | } | ||
1057 | list_for_each_entry_safe(seb, seb_tmp, &si->corr, u.list) { | ||
1058 | list_del(&seb->u.list); | ||
1059 | kfree(seb); | ||
1060 | } | ||
1061 | list_for_each_entry_safe(seb, seb_tmp, &si->free, u.list) { | ||
1062 | list_del(&seb->u.list); | ||
1063 | kfree(seb); | ||
1064 | } | ||
1065 | |||
1066 | /* Destroy the volume RB-tree */ | ||
1067 | rb = si->volumes.rb_node; | ||
1068 | while (rb) { | ||
1069 | if (rb->rb_left) | ||
1070 | rb = rb->rb_left; | ||
1071 | else if (rb->rb_right) | ||
1072 | rb = rb->rb_right; | ||
1073 | else { | ||
1074 | sv = rb_entry(rb, struct ubi_scan_volume, rb); | ||
1075 | |||
1076 | rb = rb_parent(rb); | ||
1077 | if (rb) { | ||
1078 | if (rb->rb_left == &sv->rb) | ||
1079 | rb->rb_left = NULL; | ||
1080 | else | ||
1081 | rb->rb_right = NULL; | ||
1082 | } | ||
1083 | |||
1084 | destroy_sv(sv); | ||
1085 | } | ||
1086 | } | ||
1087 | |||
1088 | kfree(si); | ||
1089 | } | ||
1090 | |||
1091 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
1092 | |||
1093 | /** | ||
1094 | * paranoid_check_si - check if the scanning information is correct and | ||
1095 | * consistent. | ||
1096 | * @ubi: UBI device description object | ||
1097 | * @si: scanning information | ||
1098 | * | ||
1099 | * This function returns zero if the scanning information is all right, %1 if | ||
1100 | * not, and a negative error code if an error occurred. | ||
1101 | */ | ||
1102 | static int paranoid_check_si(const struct ubi_device *ubi, | ||
1103 | struct ubi_scan_info *si) | ||
1104 | { | ||
1105 | int pnum, err, vols_found = 0; | ||
1106 | struct rb_node *rb1, *rb2; | ||
1107 | struct ubi_scan_volume *sv; | ||
1108 | struct ubi_scan_leb *seb, *last_seb; | ||
1109 | uint8_t *buf; | ||
1110 | |||
1111 | /* | ||
1112 | * First, check that the scanning information is OK. | ||
1113 | */ | ||
1114 | ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) { | ||
1115 | int leb_count = 0; | ||
1116 | |||
1117 | cond_resched(); | ||
1118 | |||
1119 | vols_found += 1; | ||
1120 | |||
1121 | if (si->is_empty) { | ||
1122 | ubi_err("bad is_empty flag"); | ||
1123 | goto bad_sv; | ||
1124 | } | ||
1125 | |||
1126 | if (sv->vol_id < 0 || sv->highest_lnum < 0 || | ||
1127 | sv->leb_count < 0 || sv->vol_type < 0 || sv->used_ebs < 0 || | ||
1128 | sv->data_pad < 0 || sv->last_data_size < 0) { | ||
1129 | ubi_err("negative values"); | ||
1130 | goto bad_sv; | ||
1131 | } | ||
1132 | |||
1133 | if (sv->vol_id >= UBI_MAX_VOLUMES && | ||
1134 | sv->vol_id < UBI_INTERNAL_VOL_START) { | ||
1135 | ubi_err("bad vol_id"); | ||
1136 | goto bad_sv; | ||
1137 | } | ||
1138 | |||
1139 | if (sv->vol_id > si->highest_vol_id) { | ||
1140 | ubi_err("highest_vol_id is %d, but vol_id %d is there", | ||
1141 | si->highest_vol_id, sv->vol_id); | ||
1142 | goto out; | ||
1143 | } | ||
1144 | |||
1145 | if (sv->vol_type != UBI_DYNAMIC_VOLUME && | ||
1146 | sv->vol_type != UBI_STATIC_VOLUME) { | ||
1147 | ubi_err("bad vol_type"); | ||
1148 | goto bad_sv; | ||
1149 | } | ||
1150 | |||
1151 | if (sv->data_pad > ubi->leb_size / 2) { | ||
1152 | ubi_err("bad data_pad"); | ||
1153 | goto bad_sv; | ||
1154 | } | ||
1155 | |||
1156 | last_seb = NULL; | ||
1157 | ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) { | ||
1158 | cond_resched(); | ||
1159 | |||
1160 | last_seb = seb; | ||
1161 | leb_count += 1; | ||
1162 | |||
1163 | if (seb->pnum < 0 || seb->ec < 0) { | ||
1164 | ubi_err("negative values"); | ||
1165 | goto bad_seb; | ||
1166 | } | ||
1167 | |||
1168 | if (seb->ec < si->min_ec) { | ||
1169 | ubi_err("bad si->min_ec (%d), %d found", | ||
1170 | si->min_ec, seb->ec); | ||
1171 | goto bad_seb; | ||
1172 | } | ||
1173 | |||
1174 | if (seb->ec > si->max_ec) { | ||
1175 | ubi_err("bad si->max_ec (%d), %d found", | ||
1176 | si->max_ec, seb->ec); | ||
1177 | goto bad_seb; | ||
1178 | } | ||
1179 | |||
1180 | if (seb->pnum >= ubi->peb_count) { | ||
1181 | ubi_err("too high PEB number %d, total PEBs %d", | ||
1182 | seb->pnum, ubi->peb_count); | ||
1183 | goto bad_seb; | ||
1184 | } | ||
1185 | |||
1186 | if (sv->vol_type == UBI_STATIC_VOLUME) { | ||
1187 | if (seb->lnum >= sv->used_ebs) { | ||
1188 | ubi_err("bad lnum or used_ebs"); | ||
1189 | goto bad_seb; | ||
1190 | } | ||
1191 | } else { | ||
1192 | if (sv->used_ebs != 0) { | ||
1193 | ubi_err("non-zero used_ebs"); | ||
1194 | goto bad_seb; | ||
1195 | } | ||
1196 | } | ||
1197 | |||
1198 | if (seb->lnum > sv->highest_lnum) { | ||
1199 | ubi_err("incorrect highest_lnum or lnum"); | ||
1200 | goto bad_seb; | ||
1201 | } | ||
1202 | } | ||
1203 | |||
1204 | if (sv->leb_count != leb_count) { | ||
1205 | ubi_err("bad leb_count, %d objects in the tree", | ||
1206 | leb_count); | ||
1207 | goto bad_sv; | ||
1208 | } | ||
1209 | |||
1210 | if (!last_seb) | ||
1211 | continue; | ||
1212 | |||
1213 | seb = last_seb; | ||
1214 | |||
1215 | if (seb->lnum != sv->highest_lnum) { | ||
1216 | ubi_err("bad highest_lnum"); | ||
1217 | goto bad_seb; | ||
1218 | } | ||
1219 | } | ||
1220 | |||
1221 | if (vols_found != si->vols_found) { | ||
1222 | ubi_err("bad si->vols_found %d, should be %d", | ||
1223 | si->vols_found, vols_found); | ||
1224 | goto out; | ||
1225 | } | ||
1226 | |||
1227 | /* Check that scanning information is correct */ | ||
1228 | ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) { | ||
1229 | last_seb = NULL; | ||
1230 | ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) { | ||
1231 | int vol_type; | ||
1232 | |||
1233 | cond_resched(); | ||
1234 | |||
1235 | last_seb = seb; | ||
1236 | |||
1237 | err = ubi_io_read_vid_hdr(ubi, seb->pnum, vidh, 1); | ||
1238 | if (err && err != UBI_IO_BITFLIPS) { | ||
1239 | ubi_err("VID header is not OK (%d)", err); | ||
1240 | if (err > 0) | ||
1241 | err = -EIO; | ||
1242 | return err; | ||
1243 | } | ||
1244 | |||
1245 | vol_type = vidh->vol_type == UBI_VID_DYNAMIC ? | ||
1246 | UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME; | ||
1247 | if (sv->vol_type != vol_type) { | ||
1248 | ubi_err("bad vol_type"); | ||
1249 | goto bad_vid_hdr; | ||
1250 | } | ||
1251 | |||
1252 | if (seb->sqnum != ubi64_to_cpu(vidh->sqnum)) { | ||
1253 | ubi_err("bad sqnum %llu", seb->sqnum); | ||
1254 | goto bad_vid_hdr; | ||
1255 | } | ||
1256 | |||
1257 | if (sv->vol_id != ubi32_to_cpu(vidh->vol_id)) { | ||
1258 | ubi_err("bad vol_id %d", sv->vol_id); | ||
1259 | goto bad_vid_hdr; | ||
1260 | } | ||
1261 | |||
1262 | if (sv->compat != vidh->compat) { | ||
1263 | ubi_err("bad compat %d", vidh->compat); | ||
1264 | goto bad_vid_hdr; | ||
1265 | } | ||
1266 | |||
1267 | if (seb->lnum != ubi32_to_cpu(vidh->lnum)) { | ||
1268 | ubi_err("bad lnum %d", seb->lnum); | ||
1269 | goto bad_vid_hdr; | ||
1270 | } | ||
1271 | |||
1272 | if (sv->used_ebs != ubi32_to_cpu(vidh->used_ebs)) { | ||
1273 | ubi_err("bad used_ebs %d", sv->used_ebs); | ||
1274 | goto bad_vid_hdr; | ||
1275 | } | ||
1276 | |||
1277 | if (sv->data_pad != ubi32_to_cpu(vidh->data_pad)) { | ||
1278 | ubi_err("bad data_pad %d", sv->data_pad); | ||
1279 | goto bad_vid_hdr; | ||
1280 | } | ||
1281 | |||
1282 | if (seb->leb_ver != ubi32_to_cpu(vidh->leb_ver)) { | ||
1283 | ubi_err("bad leb_ver %u", seb->leb_ver); | ||
1284 | goto bad_vid_hdr; | ||
1285 | } | ||
1286 | } | ||
1287 | |||
1288 | if (!last_seb) | ||
1289 | continue; | ||
1290 | |||
1291 | if (sv->highest_lnum != ubi32_to_cpu(vidh->lnum)) { | ||
1292 | ubi_err("bad highest_lnum %d", sv->highest_lnum); | ||
1293 | goto bad_vid_hdr; | ||
1294 | } | ||
1295 | |||
1296 | if (sv->last_data_size != ubi32_to_cpu(vidh->data_size)) { | ||
1297 | ubi_err("bad last_data_size %d", sv->last_data_size); | ||
1298 | goto bad_vid_hdr; | ||
1299 | } | ||
1300 | } | ||
1301 | |||
1302 | /* | ||
1303 | * Make sure that all the physical eraseblocks are in one of the lists | ||
1304 | * or trees. | ||
1305 | */ | ||
1306 | buf = kmalloc(ubi->peb_count, GFP_KERNEL); | ||
1307 | if (!buf) | ||
1308 | return -ENOMEM; | ||
1309 | |||
1310 | memset(buf, 1, ubi->peb_count); | ||
1311 | for (pnum = 0; pnum < ubi->peb_count; pnum++) { | ||
1312 | err = ubi_io_is_bad(ubi, pnum); | ||
1313 | if (err < 0) { | ||
1314 | kfree(buf); | ||
1315 | return err; | ||
1316 | } else if (err) | ||
1317 | buf[pnum] = 0; | ||
1318 | } | ||
1319 | ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) | ||
1320 | ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) | ||
1321 | buf[seb->pnum] = 0; | ||
1322 | |||
1323 | list_for_each_entry(seb, &si->free, u.list) | ||
1324 | buf[seb->pnum] = 0; | ||
1325 | |||
1326 | list_for_each_entry(seb, &si->corr, u.list) | ||
1327 | buf[seb->pnum] = 0; | ||
1328 | |||
1329 | list_for_each_entry(seb, &si->erase, u.list) | ||
1330 | buf[seb->pnum] = 0; | ||
1331 | |||
1332 | list_for_each_entry(seb, &si->alien, u.list) | ||
1333 | buf[seb->pnum] = 0; | ||
1334 | |||
1335 | err = 0; | ||
1336 | for (pnum = 0; pnum < ubi->peb_count; pnum++) | ||
1337 | if (buf[pnum]) { | ||
1338 | ubi_err("PEB %d is not referred", pnum); | ||
1339 | err = 1; | ||
1340 | } | ||
1341 | |||
1342 | kfree(buf); | ||
1343 | if (err) | ||
1344 | goto out; | ||
1345 | return 0; | ||
1346 | |||
1347 | bad_seb: | ||
1348 | ubi_err("bad scanning information about LEB %d", seb->lnum); | ||
1349 | ubi_dbg_dump_seb(seb, 0); | ||
1350 | ubi_dbg_dump_sv(sv); | ||
1351 | goto out; | ||
1352 | |||
1353 | bad_sv: | ||
1354 | ubi_err("bad scanning information about volume %d", sv->vol_id); | ||
1355 | ubi_dbg_dump_sv(sv); | ||
1356 | goto out; | ||
1357 | |||
1358 | bad_vid_hdr: | ||
1359 | ubi_err("bad scanning information about volume %d", sv->vol_id); | ||
1360 | ubi_dbg_dump_sv(sv); | ||
1361 | ubi_dbg_dump_vid_hdr(vidh); | ||
1362 | |||
1363 | out: | ||
1364 | ubi_dbg_dump_stack(); | ||
1365 | return 1; | ||
1366 | } | ||
1367 | |||
1368 | #endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ | ||
diff --git a/drivers/mtd/ubi/scan.h b/drivers/mtd/ubi/scan.h new file mode 100644 index 000000000000..3949f6192c76 --- /dev/null +++ b/drivers/mtd/ubi/scan.h | |||
@@ -0,0 +1,167 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | #ifndef __UBI_SCAN_H__ | ||
22 | #define __UBI_SCAN_H__ | ||
23 | |||
24 | /* The erase counter value for this physical eraseblock is unknown */ | ||
25 | #define UBI_SCAN_UNKNOWN_EC (-1) | ||
26 | |||
27 | /** | ||
28 | * struct ubi_scan_leb - scanning information about a physical eraseblock. | ||
29 | * @ec: erase counter (%UBI_SCAN_UNKNOWN_EC if it is unknown) | ||
30 | * @pnum: physical eraseblock number | ||
31 | * @lnum: logical eraseblock number | ||
32 | * @scrub: if this physical eraseblock needs scrubbing | ||
33 | * @sqnum: sequence number | ||
34 | * @u: union of RB-tree and list links | ||
35 | * @u.rb: link in the per-volume RB-tree of &struct ubi_scan_leb objects | ||
36 | * @u.list: link in one of the eraseblock lists | ||
37 | * @leb_ver: logical eraseblock version (obsolete) | ||
38 | * | ||
39 | * One object of this type is allocated for each physical eraseblock during | ||
40 | * scanning. | ||
41 | */ | ||
42 | struct ubi_scan_leb { | ||
43 | int ec; | ||
44 | int pnum; | ||
45 | int lnum; | ||
46 | int scrub; | ||
47 | unsigned long long sqnum; | ||
48 | union { | ||
49 | struct rb_node rb; | ||
50 | struct list_head list; | ||
51 | } u; | ||
52 | uint32_t leb_ver; | ||
53 | }; | ||
54 | |||
55 | /** | ||
56 | * struct ubi_scan_volume - scanning information about a volume. | ||
57 | * @vol_id: volume ID | ||
58 | * @highest_lnum: highest logical eraseblock number in this volume | ||
59 | * @leb_count: number of logical eraseblocks in this volume | ||
60 | * @vol_type: volume type | ||
61 | * @used_ebs: number of used logical eraseblocks in this volume (only for | ||
62 | * static volumes) | ||
63 | * @last_data_size: amount of data in the last logical eraseblock of this | ||
64 | * volume (always equivalent to the usable logical eraseblock size in case of | ||
65 | * dynamic volumes) | ||
66 | * @data_pad: how many bytes at the end of logical eraseblocks of this volume | ||
67 | * are not used (due to volume alignment) | ||
68 | * @compat: compatibility flags of this volume | ||
69 | * @rb: link in the volume RB-tree | ||
70 | * @root: root of the RB-tree containing all the eraseblocks belonging to this | ||
71 | * volume (&struct ubi_scan_leb objects) | ||
72 | * | ||
73 | * One object of this type is allocated for each volume during scanning. | ||
74 | */ | ||
75 | struct ubi_scan_volume { | ||
76 | int vol_id; | ||
77 | int highest_lnum; | ||
78 | int leb_count; | ||
79 | int vol_type; | ||
80 | int used_ebs; | ||
81 | int last_data_size; | ||
82 | int data_pad; | ||
83 | int compat; | ||
84 | struct rb_node rb; | ||
85 | struct rb_root root; | ||
86 | }; | ||
87 | |||
88 | /** | ||
89 | * struct ubi_scan_info - UBI scanning information. | ||
90 | * @volumes: root of the volume RB-tree | ||
91 | * @corr: list of corrupted physical eraseblocks | ||
92 | * @free: list of free physical eraseblocks | ||
93 | * @erase: list of physical eraseblocks which have to be erased | ||
94 | * @alien: list of physical eraseblocks which should not be used by UBI (e.g., | ||
95 | * those belonging to "preserve"-compatible internal volumes) | ||
96 | * @bad_peb_count: count of bad physical eraseblocks | ||
97 | * @vols_found: number of volumes found during scanning | ||
98 | * @highest_vol_id: highest volume ID | ||
99 | * @alien_peb_count: count of physical eraseblocks in the @alien list | ||
100 | * @is_empty: flag indicating whether the MTD device is empty or not | ||
101 | * @min_ec: lowest erase counter value | ||
102 | * @max_ec: highest erase counter value | ||
103 | * @max_sqnum: highest sequence number value | ||
104 | * @mean_ec: mean erase counter value | ||
105 | * @ec_sum: a temporary variable used when calculating @mean_ec | ||
106 | * @ec_count: a temporary variable used when calculating @mean_ec | ||
107 | * | ||
108 | * This data structure contains the result of scanning and may be used by other | ||
109 | * UBI units to build the final UBI data structures, for further error recovery, | ||
110 | * and so on. | ||
111 | */ | ||
112 | struct ubi_scan_info { | ||
113 | struct rb_root volumes; | ||
114 | struct list_head corr; | ||
115 | struct list_head free; | ||
116 | struct list_head erase; | ||
117 | struct list_head alien; | ||
118 | int bad_peb_count; | ||
119 | int vols_found; | ||
120 | int highest_vol_id; | ||
121 | int alien_peb_count; | ||
122 | int is_empty; | ||
123 | int min_ec; | ||
124 | int max_ec; | ||
125 | unsigned long long max_sqnum; | ||
126 | int mean_ec; | ||
127 | int ec_sum; | ||
128 | int ec_count; | ||
129 | }; | ||
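A physical eraseblock found during scanning ends up in exactly one place: a per-volume RB-tree (linked through u.rb of &struct ubi_scan_leb) or one of the four lists (linked through u.list); the paranoid checker in scan.c verifies precisely this. A small sketch of walking the whole structure, assuming an already-built @si (the tallying function itself is invented for the example):

	/* Sketch only: count how many PEBs the scan placed in each category */
	static void count_scan_pebs_example(struct ubi_scan_info *si)
	{
		struct ubi_scan_volume *sv;
		struct ubi_scan_leb *seb;
		struct rb_node *rb1, *rb2;
		int used = 0, free = 0, erase = 0, corr = 0;

		ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
			ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)
				used += 1;
		}

		list_for_each_entry(seb, &si->free, u.list)
			free += 1;
		list_for_each_entry(seb, &si->erase, u.list)
			erase += 1;
		list_for_each_entry(seb, &si->corr, u.list)
			corr += 1;

		ubi_msg("%d used, %d free, %d to erase, %d corrupted PEBs",
			used, free, erase, corr);
	}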
130 | |||
131 | struct ubi_device; | ||
132 | struct ubi_vid_hdr; | ||
133 | |||
134 | /* | ||
135 | * ubi_scan_move_to_list - move a physical eraseblock from the volume tree to a | ||
136 | * list. | ||
137 | * | ||
138 | * @sv: volume scanning information | ||
139 | * @seb: scanning eraseblock information | ||
140 | * @list: the list to move to | ||
141 | */ | ||
142 | static inline void ubi_scan_move_to_list(struct ubi_scan_volume *sv, | ||
143 | struct ubi_scan_leb *seb, | ||
144 | struct list_head *list) | ||
145 | { | ||
146 | rb_erase(&seb->u.rb, &sv->root); | ||
147 | list_add_tail(&seb->u.list, list); | ||
148 | } | ||
149 | |||
150 | int ubi_scan_add_to_list(struct ubi_scan_info *si, int pnum, int ec, | ||
151 | struct list_head *list); | ||
152 | int ubi_scan_add_used(const struct ubi_device *ubi, struct ubi_scan_info *si, | ||
153 | int pnum, int ec, const struct ubi_vid_hdr *vid_hdr, | ||
154 | int bitflips); | ||
155 | struct ubi_scan_volume *ubi_scan_find_sv(const struct ubi_scan_info *si, | ||
156 | int vol_id); | ||
157 | struct ubi_scan_leb *ubi_scan_find_seb(const struct ubi_scan_volume *sv, | ||
158 | int lnum); | ||
159 | void ubi_scan_rm_volume(struct ubi_scan_info *si, struct ubi_scan_volume *sv); | ||
160 | struct ubi_scan_leb *ubi_scan_get_free_peb(const struct ubi_device *ubi, | ||
161 | struct ubi_scan_info *si); | ||
162 | int ubi_scan_erase_peb(const struct ubi_device *ubi, | ||
163 | const struct ubi_scan_info *si, int pnum, int ec); | ||
164 | struct ubi_scan_info *ubi_scan(struct ubi_device *ubi); | ||
165 | void ubi_scan_destroy_si(struct ubi_scan_info *si); | ||
166 | |||
167 | #endif /* !__UBI_SCAN_H__ */ | ||
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h new file mode 100644 index 000000000000..feb647f108f0 --- /dev/null +++ b/drivers/mtd/ubi/ubi.h | |||
@@ -0,0 +1,535 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * Copyright (c) Nokia Corporation, 2006, 2007 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | * | ||
19 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
20 | */ | ||
21 | |||
22 | #ifndef __UBI_UBI_H__ | ||
23 | #define __UBI_UBI_H__ | ||
24 | |||
25 | #include <linux/init.h> | ||
26 | #include <linux/types.h> | ||
27 | #include <linux/list.h> | ||
28 | #include <linux/rbtree.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/wait.h> | ||
31 | #include <linux/mutex.h> | ||
32 | #include <linux/rwsem.h> | ||
33 | #include <linux/spinlock.h> | ||
34 | #include <linux/fs.h> | ||
35 | #include <linux/cdev.h> | ||
36 | #include <linux/device.h> | ||
37 | #include <linux/string.h> | ||
38 | #include <linux/mtd/mtd.h> | ||
39 | |||
40 | #include <mtd/ubi-header.h> | ||
41 | #include <linux/mtd/ubi.h> | ||
42 | |||
43 | #include "scan.h" | ||
44 | #include "debug.h" | ||
45 | |||
46 | /* Maximum number of supported UBI devices */ | ||
47 | #define UBI_MAX_DEVICES 32 | ||
48 | |||
49 | /* UBI name used for character devices, sysfs, etc */ | ||
50 | #define UBI_NAME_STR "ubi" | ||
51 | |||
52 | /* Normal UBI messages */ | ||
53 | #define ubi_msg(fmt, ...) printk(KERN_NOTICE "UBI: " fmt "\n", ##__VA_ARGS__) | ||
54 | /* UBI warning messages */ | ||
55 | #define ubi_warn(fmt, ...) printk(KERN_WARNING "UBI warning: %s: " fmt "\n", \ | ||
56 | __FUNCTION__, ##__VA_ARGS__) | ||
57 | /* UBI error messages */ | ||
58 | #define ubi_err(fmt, ...) printk(KERN_ERR "UBI error: %s: " fmt "\n", \ | ||
59 | __FUNCTION__, ##__VA_ARGS__) | ||
60 | |||
61 | /* Minimum number of PEBs reserved for bad PEB handling */ | ||
62 | #define MIN_RESEVED_PEBS 2 | ||
63 | |||
64 | /* Background thread name pattern */ | ||
65 | #define UBI_BGT_NAME_PATTERN "ubi_bgt%dd" | ||
66 | |||
67 | /* This marker in the EBA table means that the LEB is un-mapped */ | ||
68 | #define UBI_LEB_UNMAPPED -1 | ||
69 | |||
70 | /* | ||
71 | * In case of errors, UBI tries to repeat the operation several times before | ||
72 | * returning an error. The constant below defines how many times UBI retries. | ||
73 | */ | ||
74 | #define UBI_IO_RETRIES 3 | ||
75 | |||
76 | /* | ||
77 | * Error codes returned by the I/O unit. | ||
78 | * | ||
79 | * UBI_IO_PEB_EMPTY: the physical eraseblock is empty, i.e. it contains only | ||
80 | * 0xFF bytes | ||
81 | * UBI_IO_PEB_FREE: the physical eraseblock is free, i.e. it contains only a | ||
82 | * valid erase counter header, and the rest are %0xFF bytes | ||
83 | * UBI_IO_BAD_EC_HDR: the erase counter header is corrupted (bad magic or CRC) | ||
84 | * UBI_IO_BAD_VID_HDR: the volume identifier header is corrupted (bad magic or | ||
85 | * CRC) | ||
86 | * UBI_IO_BITFLIPS: bit-flips were detected and corrected | ||
87 | */ | ||
88 | enum { | ||
89 | UBI_IO_PEB_EMPTY = 1, | ||
90 | UBI_IO_PEB_FREE, | ||
91 | UBI_IO_BAD_EC_HDR, | ||
92 | UBI_IO_BAD_VID_HDR, | ||
93 | UBI_IO_BITFLIPS | ||
94 | }; | ||
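These soft codes are positive, so callers can tell them apart from the negative errno values returned on hard failures. A sketch of the usual dispatch pattern, mirroring what process_eb() in scan.c does; the wrapper name is invented and the recovery policy shown is only an example:

	/* Sketch only: typical handling of the I/O unit's soft return codes */
	static int read_ec_hdr_example(struct ubi_device *ubi, int pnum,
				       struct ubi_ec_hdr *ec_hdr)
	{
		int err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0);

		if (err < 0)
			return err;		/* hard I/O error */

		switch (err) {
		case 0:				/* header read and verified */
		case UBI_IO_BITFLIPS:		/* valid, but schedule scrubbing */
			return 0;
		case UBI_IO_PEB_EMPTY:		/* only 0xFF bytes, nothing here yet */
		case UBI_IO_BAD_EC_HDR:		/* bad magic or CRC */
		default:
			return -EINVAL;		/* let the caller decide how to recover */
		}
	}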
95 | |||
96 | extern int ubi_devices_cnt; | ||
97 | extern struct ubi_device *ubi_devices[]; | ||
98 | |||
99 | struct ubi_volume_desc; | ||
100 | |||
101 | /** | ||
102 | * struct ubi_volume - UBI volume description data structure. | ||
103 | * @dev: device object to make use of the Linux device model | ||
104 | * @cdev: character device object to create character device | ||
105 | * @ubi: reference to the UBI device description object | ||
106 | * @vol_id: volume ID | ||
107 | * @readers: number of users holding this volume in read-only mode | ||
108 | * @writers: number of users holding this volume in read-write mode | ||
109 | * @exclusive: whether somebody holds this volume in exclusive mode | ||
110 | * @removed: if the volume was removed | ||
111 | * @checked: if this static volume was checked | ||
112 | * | ||
113 | * @reserved_pebs: how many physical eraseblocks are reserved for this volume | ||
114 | * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) | ||
115 | * @usable_leb_size: logical eraseblock size without padding | ||
116 | * @used_ebs: how many logical eraseblocks in this volume contain data | ||
117 | * @last_eb_bytes: how many bytes are stored in the last logical eraseblock | ||
118 | * @used_bytes: how many bytes of data this volume contains | ||
119 | * @upd_marker: non-zero if the update marker is set for this volume | ||
120 | * @corrupted: non-zero if the volume is corrupted (static volumes only) | ||
121 | * @alignment: volume alignment | ||
122 | * @data_pad: how many bytes are not used at the end of physical eraseblocks to | ||
123 | * satisfy the requested alignment | ||
124 | * @name_len: volume name length | ||
125 | * @name: volume name | ||
126 | * | ||
127 | * @updating: whether the volume is being updated | ||
128 | * @upd_ebs: how many eraseblocks are expected to be updated | ||
129 | * @upd_bytes: how many bytes are expected to be received | ||
130 | * @upd_received: how many update bytes were already received | ||
131 | * @upd_buf: update buffer which is used to collect update data | ||
132 | * | ||
133 | * @eba_tbl: EBA table of this volume (LEB->PEB mapping) | ||
134 | * | ||
135 | * @gluebi_desc: gluebi UBI volume descriptor | ||
136 | * @gluebi_refcount: reference count of the gluebi MTD device | ||
137 | * @gluebi_mtd: MTD device description object of the gluebi MTD device | ||
138 | * | ||
139 | * The @corrupted field indicates that the volume's contents are corrupted. | ||
140 | * Since UBI protects only static volumes, this field is not relevant to | ||
141 | * dynamic volumes - it is the user's responsibility to ensure their data | ||
142 | * integrity. | ||
143 | * | ||
144 | * The @upd_marker flag indicates that this volume is either being updated at | ||
145 | * the moment or is damaged because of an unclean reboot. | ||
146 | */ | ||
147 | struct ubi_volume { | ||
148 | struct device dev; | ||
149 | struct cdev cdev; | ||
150 | struct ubi_device *ubi; | ||
151 | int vol_id; | ||
152 | int readers; | ||
153 | int writers; | ||
154 | int exclusive; | ||
155 | int removed; | ||
156 | int checked; | ||
157 | |||
158 | int reserved_pebs; | ||
159 | int vol_type; | ||
160 | int usable_leb_size; | ||
161 | int used_ebs; | ||
162 | int last_eb_bytes; | ||
163 | long long used_bytes; | ||
164 | int upd_marker; | ||
165 | int corrupted; | ||
166 | int alignment; | ||
167 | int data_pad; | ||
168 | int name_len; | ||
169 | char name[UBI_VOL_NAME_MAX+1]; | ||
170 | |||
171 | int updating; | ||
172 | int upd_ebs; | ||
173 | long long upd_bytes; | ||
174 | long long upd_received; | ||
175 | void *upd_buf; | ||
176 | |||
177 | int *eba_tbl; | ||
178 | |||
179 | #ifdef CONFIG_MTD_UBI_GLUEBI | ||
180 | /* Gluebi-related stuff may be compiled out */ | ||
181 | struct ubi_volume_desc *gluebi_desc; | ||
182 | int gluebi_refcount; | ||
183 | struct mtd_info gluebi_mtd; | ||
184 | #endif | ||
185 | }; | ||
186 | |||
187 | /** | ||
188 | * struct ubi_volume_desc - descriptor of the UBI volume returned when it is | ||
189 | * opened. | ||
190 | * @vol: reference to the corresponding volume description object | ||
191 | * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, or %UBI_EXCLUSIVE) | ||
192 | */ | ||
193 | struct ubi_volume_desc { | ||
194 | struct ubi_volume *vol; | ||
195 | int mode; | ||
196 | }; | ||
197 | |||
198 | struct ubi_wl_entry; | ||
199 | |||
200 | /** | ||
201 | * struct ubi_device - UBI device description structure | ||
202 | * @dev: class device object to use the Linux device model | ||
203 | * @cdev: character device object to create character device | ||
204 | * @ubi_num: UBI device number | ||
205 | * @ubi_name: UBI device name | ||
206 | * @major: character device major number | ||
207 | * @vol_count: number of volumes in this UBI device | ||
208 | * @volumes: volumes of this UBI device | ||
209 | * @volumes_lock: protects @volumes, @rsvd_pebs, @avail_pebs, @beb_rsvd_pebs, | ||
210 | * @beb_rsvd_level, @bad_peb_count, @good_peb_count, @vol_count, @vol->readers, | ||
211 | * @vol->writers, @vol->exclusive, @vol->removed, @vol->mapping and | ||
212 | * @vol->eba_tbl. | ||
213 | * | ||
214 | * @rsvd_pebs: count of reserved physical eraseblocks | ||
215 | * @avail_pebs: count of available physical eraseblocks | ||
216 | * @beb_rsvd_pebs: how many physical eraseblocks are reserved for bad PEB | ||
217 | * handling | ||
218 | * @beb_rsvd_level: normal level of PEBs reserved for bad PEB handling | ||
219 | * | ||
220 | * @vtbl_slots: how many slots are available in the volume table | ||
221 | * @vtbl_size: size of the volume table in bytes | ||
222 | * @vtbl: in-RAM volume table copy | ||
223 | * | ||
224 | * @max_ec: current highest erase counter value | ||
225 | * @mean_ec: current mean erase counter value | ||
226 | * | ||
227 | * @global_sqnum: global sequence number | ||
228 | * @ltree_lock: protects the lock tree and @global_sqnum | ||
229 | * @ltree: the lock tree | ||
230 | * @vtbl_mutex: protects on-flash volume table | ||
231 | * | ||
232 | * @used: RB-tree of used physical eraseblocks | ||
233 | * @free: RB-tree of free physical eraseblocks | ||
234 | * @scrub: RB-tree of physical eraseblocks which need scrubbing | ||
235 | * @prot: protection trees | ||
236 | * @prot.pnum: protection tree indexed by physical eraseblock numbers | ||
237 | * @prot.aec: protection tree indexed by absolute erase counter value | ||
238 | * @wl_lock: protects the @used, @free, @prot, @lookuptbl, @abs_ec, @move_from, | ||
239 | * @move_to, @move_to_put, @erase_pending, @wl_scheduled, and @works | ||
240 | * fields | ||
241 | * @wl_scheduled: non-zero if the wear-leveling was scheduled | ||
242 | * @lookuptbl: a table to quickly find a &struct ubi_wl_entry object for any | ||
243 | * physical eraseblock | ||
244 | * @abs_ec: absolute erase counter | ||
245 | * @move_from: physical eraseblock from where the data is being moved | ||
246 | * @move_to: physical eraseblock where the data is being moved to | ||
247 | * @move_from_put: if the "from" PEB was put | ||
248 | * @move_to_put: if the "to" PEB was put | ||
249 | * @works: list of pending works | ||
250 | * @works_count: count of pending works | ||
251 | * @bgt_thread: background thread description object | ||
252 | * @thread_enabled: if the background thread is enabled | ||
253 | * @bgt_name: background thread name | ||
254 | * | ||
255 | * @flash_size: underlying MTD device size (in bytes) | ||
256 | * @peb_count: count of physical eraseblocks on the MTD device | ||
257 | * @peb_size: physical eraseblock size | ||
258 | * @bad_peb_count: count of bad physical eraseblocks | ||
259 | * @good_peb_count: count of good physical eraseblocks | ||
260 | * @min_io_size: minimal input/output unit size of the underlying MTD device | ||
261 | * @hdrs_min_io_size: minimal I/O unit size used for VID and EC headers | ||
262 | * @ro_mode: if the UBI device is in read-only mode | ||
263 | * @leb_size: logical eraseblock size | ||
264 | * @leb_start: starting offset of logical eraseblocks within physical | ||
265 | * eraseblocks | ||
266 | * @ec_hdr_alsize: size of the EC header aligned to @hdrs_min_io_size | ||
267 | * @vid_hdr_alsize: size of the VID header aligned to @hdrs_min_io_size | ||
268 | * @vid_hdr_offset: starting offset of the volume identifier header (might be | ||
269 | * unaligned) | ||
270 | * @vid_hdr_aloffset: starting offset of the VID header aligned to | ||
271 | * @hdrs_min_io_size | ||
272 | * @vid_hdr_shift: contains @vid_hdr_offset - @vid_hdr_aloffset | ||
273 | * @bad_allowed: whether the MTD device may have bad physical eraseblocks or | ||
274 | * not | ||
275 | * @mtd: MTD device descriptor | ||
276 | */ | ||
277 | struct ubi_device { | ||
278 | struct cdev cdev; | ||
279 | struct device dev; | ||
280 | int ubi_num; | ||
281 | char ubi_name[sizeof(UBI_NAME_STR)+5]; | ||
282 | int major; | ||
283 | int vol_count; | ||
284 | struct ubi_volume *volumes[UBI_MAX_VOLUMES+UBI_INT_VOL_COUNT]; | ||
285 | spinlock_t volumes_lock; | ||
286 | |||
287 | int rsvd_pebs; | ||
288 | int avail_pebs; | ||
289 | int beb_rsvd_pebs; | ||
290 | int beb_rsvd_level; | ||
291 | |||
292 | int vtbl_slots; | ||
293 | int vtbl_size; | ||
294 | struct ubi_vtbl_record *vtbl; | ||
295 | struct mutex vtbl_mutex; | ||
296 | |||
297 | int max_ec; | ||
298 | int mean_ec; | ||
299 | |||
300 | /* EBA unit's stuff */ | ||
301 | unsigned long long global_sqnum; | ||
302 | spinlock_t ltree_lock; | ||
303 | struct rb_root ltree; | ||
304 | |||
305 | /* Wear-leveling unit's stuff */ | ||
306 | struct rb_root used; | ||
307 | struct rb_root free; | ||
308 | struct rb_root scrub; | ||
309 | struct { | ||
310 | struct rb_root pnum; | ||
311 | struct rb_root aec; | ||
312 | } prot; | ||
313 | spinlock_t wl_lock; | ||
314 | int wl_scheduled; | ||
315 | struct ubi_wl_entry **lookuptbl; | ||
316 | unsigned long long abs_ec; | ||
317 | struct ubi_wl_entry *move_from; | ||
318 | struct ubi_wl_entry *move_to; | ||
319 | int move_from_put; | ||
320 | int move_to_put; | ||
321 | struct list_head works; | ||
322 | int works_count; | ||
323 | struct task_struct *bgt_thread; | ||
324 | int thread_enabled; | ||
325 | char bgt_name[sizeof(UBI_BGT_NAME_PATTERN)+2]; | ||
326 | |||
327 | /* I/O unit's stuff */ | ||
328 | long long flash_size; | ||
329 | int peb_count; | ||
330 | int peb_size; | ||
331 | int bad_peb_count; | ||
332 | int good_peb_count; | ||
333 | int min_io_size; | ||
334 | int hdrs_min_io_size; | ||
335 | int ro_mode; | ||
336 | int leb_size; | ||
337 | int leb_start; | ||
338 | int ec_hdr_alsize; | ||
339 | int vid_hdr_alsize; | ||
340 | int vid_hdr_offset; | ||
341 | int vid_hdr_aloffset; | ||
342 | int vid_hdr_shift; | ||
343 | int bad_allowed; | ||
344 | struct mtd_info *mtd; | ||
345 | }; | ||
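/*
 * Editorial note (not part of the original patch): the I/O-related fields
 * above are tied together by a simple layout. As a hedged example for a
 * typical NAND flash with 128 KiB physical eraseblocks and 2048-byte pages
 * (so min_io_size = hdrs_min_io_size = 2048), the EC header occupies page 0,
 * the VID header starts at vid_hdr_offset = 2048 (page 1), and user data
 * starts at leb_start = 4096, which gives leb_size = peb_size - leb_start =
 * 131072 - 4096 = 126976 bytes per logical eraseblock. The exact defaults
 * are computed at attach time and may differ for other flash types.
 */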
346 | |||
347 | extern struct file_operations ubi_cdev_operations; | ||
348 | extern struct file_operations ubi_vol_cdev_operations; | ||
349 | extern struct class *ubi_class; | ||
350 | |||
351 | /* vtbl.c */ | ||
352 | int ubi_change_vtbl_record(struct ubi_device *ubi, int idx, | ||
353 | struct ubi_vtbl_record *vtbl_rec); | ||
354 | int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si); | ||
355 | |||
356 | /* vmt.c */ | ||
357 | int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req); | ||
358 | int ubi_remove_volume(struct ubi_volume_desc *desc); | ||
359 | int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs); | ||
360 | int ubi_add_volume(struct ubi_device *ubi, int vol_id); | ||
361 | void ubi_free_volume(struct ubi_device *ubi, int vol_id); | ||
362 | |||
363 | /* upd.c */ | ||
364 | int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes); | ||
365 | int ubi_more_update_data(struct ubi_device *ubi, int vol_id, | ||
366 | const void __user *buf, int count); | ||
367 | |||
368 | /* misc.c */ | ||
369 | int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, int length); | ||
370 | int ubi_check_volume(struct ubi_device *ubi, int vol_id); | ||
371 | void ubi_calculate_reserved(struct ubi_device *ubi); | ||
372 | |||
373 | /* gluebi.c */ | ||
374 | #ifdef CONFIG_MTD_UBI_GLUEBI | ||
375 | int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol); | ||
376 | int ubi_destroy_gluebi(struct ubi_volume *vol); | ||
377 | #else | ||
378 | #define ubi_create_gluebi(ubi, vol) 0 | ||
379 | #define ubi_destroy_gluebi(vol) 0 | ||
380 | #endif | ||
381 | |||
382 | /* eba.c */ | ||
383 | int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum); | ||
384 | int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, | ||
385 | int offset, int len, int check); | ||
386 | int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum, | ||
387 | const void *buf, int offset, int len, int dtype); | ||
388 | int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum, | ||
389 | const void *buf, int len, int dtype, | ||
390 | int used_ebs); | ||
391 | int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum, | ||
392 | const void *buf, int len, int dtype); | ||
393 | int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, | ||
394 | struct ubi_vid_hdr *vid_hdr); | ||
395 | int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si); | ||
396 | void ubi_eba_close(const struct ubi_device *ubi); | ||
397 | |||
398 | /* wl.c */ | ||
399 | int ubi_wl_get_peb(struct ubi_device *ubi, int dtype); | ||
400 | int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture); | ||
401 | int ubi_wl_flush(struct ubi_device *ubi); | ||
402 | int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum); | ||
403 | int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si); | ||
404 | void ubi_wl_close(struct ubi_device *ubi); | ||
405 | |||
406 | /* io.c */ | ||
407 | int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset, | ||
408 | int len); | ||
409 | int ubi_io_write(const struct ubi_device *ubi, const void *buf, int pnum, | ||
410 | int offset, int len); | ||
411 | int ubi_io_sync_erase(const struct ubi_device *ubi, int pnum, int torture); | ||
412 | int ubi_io_is_bad(const struct ubi_device *ubi, int pnum); | ||
413 | int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum); | ||
414 | int ubi_io_read_ec_hdr(const struct ubi_device *ubi, int pnum, | ||
415 | struct ubi_ec_hdr *ec_hdr, int verbose); | ||
416 | int ubi_io_write_ec_hdr(const struct ubi_device *ubi, int pnum, | ||
417 | struct ubi_ec_hdr *ec_hdr); | ||
418 | int ubi_io_read_vid_hdr(const struct ubi_device *ubi, int pnum, | ||
419 | struct ubi_vid_hdr *vid_hdr, int verbose); | ||
420 | int ubi_io_write_vid_hdr(const struct ubi_device *ubi, int pnum, | ||
421 | struct ubi_vid_hdr *vid_hdr); | ||
422 | |||
423 | /* | ||
424 | * ubi_rb_for_each_entry - walk an RB-tree. | ||
425 | * @rb: a pointer to type 'struct rb_node' to use as a loop counter | ||
426 | * @pos: a pointer to RB-tree entry type to use as a loop counter | ||
427 | * @root: RB-tree's root | ||
428 | * @member: the name of the 'struct rb_node' within the RB-tree entry | ||
429 | */ | ||
430 | #define ubi_rb_for_each_entry(rb, pos, root, member) \ | ||
431 | for (rb = rb_first(root), \ | ||
432 | pos = (rb ? container_of(rb, typeof(*pos), member) : NULL); \ | ||
433 | rb; \ | ||
434 | rb = rb_next(rb), pos = container_of(rb, typeof(*pos), member)) | ||
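/*
 * Editorial sketch (not part of the patch): typical use of the macro above,
 * here summing the erase counters of the "used" wear-leveling tree (locking
 * omitted). The 'struct ubi_wl_entry' layout (an 'rb' node and an 'ec'
 * field) is assumed from the wear-leveling unit; the helper name is
 * illustrative only.
 */
static long long ubi_sum_used_ec(struct ubi_device *ubi)
{
	struct rb_node *rb;
	struct ubi_wl_entry *e;
	long long sum = 0;

	ubi_rb_for_each_entry(rb, e, &ubi->used, rb)
		sum += e->ec;
	return sum;
}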
435 | |||
436 | /** | ||
437 | * ubi_zalloc_vid_hdr - allocate a volume identifier header object. | ||
438 | * @ubi: UBI device description object | ||
439 | * | ||
440 | * This function returns a pointer to the newly allocated and zero-filled | ||
441 | * volume identifier header object in case of success and %NULL in case of | ||
442 | * failure. | ||
443 | */ | ||
444 | static inline struct ubi_vid_hdr *ubi_zalloc_vid_hdr(const struct ubi_device *ubi) | ||
445 | { | ||
446 | void *vid_hdr; | ||
447 | |||
448 | vid_hdr = kzalloc(ubi->vid_hdr_alsize, GFP_KERNEL); | ||
449 | if (!vid_hdr) | ||
450 | return NULL; | ||
451 | |||
452 | /* | ||
453 | * VID headers may be stored at un-aligned flash offsets, so we shift | ||
454 | * the pointer. | ||
455 | */ | ||
456 | return vid_hdr + ubi->vid_hdr_shift; | ||
457 | } | ||
458 | |||
459 | /** | ||
460 | * ubi_free_vid_hdr - free a volume identifier header object. | ||
461 | * @ubi: UBI device description object | ||
462 | * @vid_hdr: the object to free | ||
463 | */ | ||
464 | static inline void ubi_free_vid_hdr(const struct ubi_device *ubi, | ||
465 | struct ubi_vid_hdr *vid_hdr) | ||
466 | { | ||
467 | void *p = vid_hdr; | ||
468 | |||
469 | if (!p) | ||
470 | return; | ||
471 | |||
472 | kfree(p - ubi->vid_hdr_shift); | ||
473 | } | ||
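/*
 * Editorial sketch (not part of the patch): the two helpers above are meant
 * to be used as a pair around I/O on VID headers. The helper name and 'pnum'
 * handling are illustrative only.
 */
static int ubi_peek_vid_hdr(const struct ubi_device *ubi, int pnum)
{
	struct ubi_vid_hdr *vid_hdr;
	int err;

	vid_hdr = ubi_zalloc_vid_hdr(ubi);
	if (!vid_hdr)
		return -ENOMEM;

	/* Read the VID header of physical eraseblock @pnum, quietly */
	err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 0);

	ubi_free_vid_hdr(ubi, vid_hdr);
	return err;
}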
474 | |||
475 | /* | ||
476 | * This function is equivalent to 'ubi_io_read()', but @offset is relative to | ||
477 | * the beginning of the logical eraseblock, not to the beginning of the | ||
478 | * physical eraseblock. | ||
479 | */ | ||
480 | static inline int ubi_io_read_data(const struct ubi_device *ubi, void *buf, | ||
481 | int pnum, int offset, int len) | ||
482 | { | ||
483 | ubi_assert(offset >= 0); | ||
484 | return ubi_io_read(ubi, buf, pnum, offset + ubi->leb_start, len); | ||
485 | } | ||
486 | |||
487 | /* | ||
488 | * This function is equivalent to 'ubi_io_write()', but @offset is relative to | ||
489 | * the beginning of the logical eraseblock, not to the beginning of the | ||
490 | * physical eraseblock. | ||
491 | */ | ||
492 | static inline int ubi_io_write_data(const struct ubi_device *ubi, const void *buf, | ||
493 | int pnum, int offset, int len) | ||
494 | { | ||
495 | ubi_assert(offset >= 0); | ||
496 | return ubi_io_write(ubi, buf, pnum, offset + ubi->leb_start, len); | ||
497 | } | ||
498 | |||
499 | /** | ||
500 | * ubi_ro_mode - switch to read-only mode. | ||
501 | * @ubi: UBI device description object | ||
502 | */ | ||
503 | static inline void ubi_ro_mode(struct ubi_device *ubi) | ||
504 | { | ||
505 | ubi->ro_mode = 1; | ||
506 | ubi_warn("switch to read-only mode"); | ||
507 | } | ||
508 | |||
509 | /** | ||
510 | * vol_id2idx - get table index by volume ID. | ||
511 | * @ubi: UBI device description object | ||
512 | * @vol_id: volume ID | ||
513 | */ | ||
514 | static inline int vol_id2idx(const struct ubi_device *ubi, int vol_id) | ||
515 | { | ||
516 | if (vol_id >= UBI_INTERNAL_VOL_START) | ||
517 | return vol_id - UBI_INTERNAL_VOL_START + ubi->vtbl_slots; | ||
518 | else | ||
519 | return vol_id; | ||
520 | } | ||
521 | |||
522 | /** | ||
523 | * idx2vol_id - get volume ID by table index. | ||
524 | * @ubi: UBI device description object | ||
525 | * @idx: table index | ||
526 | */ | ||
527 | static inline int idx2vol_id(const struct ubi_device *ubi, int idx) | ||
528 | { | ||
529 | if (idx >= ubi->vtbl_slots) | ||
530 | return idx - ubi->vtbl_slots + UBI_INTERNAL_VOL_START; | ||
531 | else | ||
532 | return idx; | ||
533 | } | ||
534 | |||
535 | #endif /* !__UBI_UBI_H__ */ | ||
diff --git a/drivers/mtd/ubi/upd.c b/drivers/mtd/ubi/upd.c new file mode 100644 index 000000000000..8925b977e3dc --- /dev/null +++ b/drivers/mtd/ubi/upd.c | |||
@@ -0,0 +1,348 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * Copyright (c) Nokia Corporation, 2006 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | * | ||
19 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
20 | * | ||
21 | * Jan 2007: Alexander Schmidt, hacked per-volume update. | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file contains implementation of the volume update functionality. | ||
26 | * | ||
27 | * The update operation is based on the per-volume update marker which is | ||
28 | * stored in the volume table. The update marker is set before the update | ||
29 | * starts, and removed after the update has been finished. So if the update was | ||
30 | * interrupted by an unclean re-boot or due to some other reasons, the update | ||
31 | * marker stays on the flash media and UBI finds it when it attaches the MTD | ||
32 | * device next time. If the update marker is set for a volume, the volume is | ||
33 | * treated as damaged and most I/O operations are prohibited. Only a new update | ||
34 | * operation is allowed. | ||
35 | * | ||
36 | * Note, in general it is possible to implement the update operation as a | ||
37 | * transaction with a roll-back capability. | ||
38 | */ | ||
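/*
 * Editorial sketch (not part of the patch): from user space an update is
 * normally driven through the volume character device, roughly the way the
 * ubiupdatevol tool does it. The device node name and error handling are
 * illustrative; UBI_IOCVOLUP is assumed to be the "start update" ioctl from
 * the UBI user-space header, taking the update size as an int64_t.
 *
 *	int64_t bytes = image_size;
 *	int fd = open("/dev/ubi0_0", O_RDWR);
 *
 *	if (fd < 0 || ioctl(fd, UBI_IOCVOLUP, &bytes) != 0)
 *		handle_error();
 *	write(fd, image, image_size);	ends up in ubi_more_update_data()
 *	close(fd);			the update marker is cleared when all
 *					@bytes of data have arrived
 */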
39 | |||
40 | #include <linux/err.h> | ||
41 | #include <asm/uaccess.h> | ||
42 | #include <asm/div64.h> | ||
43 | #include "ubi.h" | ||
44 | |||
45 | /** | ||
46 | * set_update_marker - set update marker. | ||
47 | * @ubi: UBI device description object | ||
48 | * @vol_id: volume ID | ||
49 | * | ||
50 | * This function sets the update marker flag for volume @vol_id. Returns zero | ||
51 | * in case of success and a negative error code in case of failure. | ||
52 | */ | ||
53 | static int set_update_marker(struct ubi_device *ubi, int vol_id) | ||
54 | { | ||
55 | int err; | ||
56 | struct ubi_vtbl_record vtbl_rec; | ||
57 | struct ubi_volume *vol = ubi->volumes[vol_id]; | ||
58 | |||
59 | dbg_msg("set update marker for volume %d", vol_id); | ||
60 | |||
61 | if (vol->upd_marker) { | ||
62 | ubi_assert(ubi->vtbl[vol_id].upd_marker); | ||
63 | dbg_msg("already set"); | ||
64 | return 0; | ||
65 | } | ||
66 | |||
67 | memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record)); | ||
68 | vtbl_rec.upd_marker = 1; | ||
69 | |||
70 | err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); | ||
71 | vol->upd_marker = 1; | ||
72 | return err; | ||
73 | } | ||
74 | |||
75 | /** | ||
76 | * clear_update_marker - clear update marker. | ||
77 | * @ubi: UBI device description object | ||
78 | * @vol_id: volume ID | ||
79 | * @bytes: new data size in bytes | ||
80 | * | ||
81 | * This function clears the update marker for volume @vol_id, sets new volume | ||
82 | * data size and clears the "corrupted" flag (static volumes only). Returns | ||
83 | * zero in case of success and a negative error code in case of failure. | ||
84 | */ | ||
85 | static int clear_update_marker(struct ubi_device *ubi, int vol_id, long long bytes) | ||
86 | { | ||
87 | int err; | ||
88 | uint64_t tmp; | ||
89 | struct ubi_vtbl_record vtbl_rec; | ||
90 | struct ubi_volume *vol = ubi->volumes[vol_id]; | ||
91 | |||
92 | dbg_msg("clear update marker for volume %d", vol_id); | ||
93 | |||
94 | memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record)); | ||
95 | ubi_assert(vol->upd_marker && vtbl_rec.upd_marker); | ||
96 | vtbl_rec.upd_marker = 0; | ||
97 | |||
98 | if (vol->vol_type == UBI_STATIC_VOLUME) { | ||
99 | vol->corrupted = 0; | ||
100 | vol->used_bytes = tmp = bytes; | ||
101 | vol->last_eb_bytes = do_div(tmp, vol->usable_leb_size); | ||
102 | vol->used_ebs = tmp; | ||
103 | if (vol->last_eb_bytes) | ||
104 | vol->used_ebs += 1; | ||
105 | else | ||
106 | vol->last_eb_bytes = vol->usable_leb_size; | ||
107 | } | ||
108 | |||
109 | err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); | ||
110 | vol->upd_marker = 0; | ||
111 | return err; | ||
112 | } | ||
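/*
 * Editorial note (not part of the patch): a worked example of the size
 * arithmetic above - with usable_leb_size = 129024 and bytes = 300000,
 * do_div() leaves tmp = 2 and returns the remainder 41952, so the volume
 * ends up with used_ebs = 3 and last_eb_bytes = 41952. The numbers are
 * illustrative only.
 */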
113 | |||
114 | /** | ||
115 | * ubi_start_update - start volume update. | ||
116 | * @ubi: UBI device description object | ||
117 | * @vol_id: volume ID | ||
118 | * @bytes: update bytes | ||
119 | * | ||
120 | * This function starts volume update operation. If @bytes is zero, the volume | ||
121 | * is just wiped out. Returns zero in case of success and a negative error code | ||
122 | * in case of failure. | ||
123 | */ | ||
124 | int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes) | ||
125 | { | ||
126 | int i, err; | ||
127 | uint64_t tmp; | ||
128 | struct ubi_volume *vol = ubi->volumes[vol_id]; | ||
129 | |||
130 | dbg_msg("start update of volume %d, %llu bytes", vol_id, bytes); | ||
131 | vol->updating = 1; | ||
132 | |||
133 | err = set_update_marker(ubi, vol_id); | ||
134 | if (err) | ||
135 | return err; | ||
136 | |||
137 | /* Before updating - wipe out the volume */ | ||
138 | for (i = 0; i < vol->reserved_pebs; i++) { | ||
139 | err = ubi_eba_unmap_leb(ubi, vol_id, i); | ||
140 | if (err) | ||
141 | return err; | ||
142 | } | ||
143 | |||
144 | if (bytes == 0) { | ||
145 | err = clear_update_marker(ubi, vol_id, 0); | ||
146 | if (err) | ||
147 | return err; | ||
148 | err = ubi_wl_flush(ubi); | ||
149 | if (!err) | ||
150 | vol->updating = 0; | ||
151 | } | ||
152 | |||
153 | vol->upd_buf = kmalloc(ubi->leb_size, GFP_KERNEL); | ||
154 | if (!vol->upd_buf) | ||
155 | return -ENOMEM; | ||
156 | |||
157 | tmp = bytes; | ||
158 | vol->upd_ebs = !!do_div(tmp, vol->usable_leb_size); | ||
159 | vol->upd_ebs += tmp; | ||
160 | vol->upd_bytes = bytes; | ||
161 | vol->upd_received = 0; | ||
162 | return 0; | ||
163 | } | ||
164 | |||
165 | /** | ||
166 | * write_leb - write update data. | ||
167 | * @ubi: UBI device description object | ||
168 | * @vol_id: volume ID | ||
169 | * @lnum: logical eraseblock number | ||
170 | * @buf: data to write | ||
171 | * @len: data size | ||
172 | * @used_ebs: how many logical eraseblocks will this volume contain (static | ||
173 | * volumes only) | ||
174 | * | ||
175 | * This function writes update data to the corresponding logical eraseblock. In | ||
176 | * case of a dynamic volume, this function checks if the data contains 0xFF bytes | ||
177 | * at the end. If yes, the 0xFF bytes are cut and not written. So if the whole | ||
178 | * buffer contains only 0xFF bytes, the LEB is left unmapped. | ||
179 | * | ||
180 | * The reason why we skip the trailing 0xFF bytes in case of a dynamic volume is | ||
181 | * that we want to make sure that more data may be appended to the logical | ||
182 | * eraseblock in the future. Indeed, writing 0xFF bytes may have side effects and | ||
183 | * this PEB won't be writable anymore. So if one writes a file-system image | ||
184 | * to the UBI volume where 0xFFs mean free space - UBI makes sure this free | ||
185 | * space is writable after the update. | ||
186 | * | ||
187 | * We do not do this for static volumes because they are read-only anyway. And it | ||
188 | * also cannot be done because we have to store the per-LEB CRC and the correct | ||
189 | * data length. | ||
190 | * | ||
191 | * This function returns zero in case of success and a negative error code in | ||
192 | * case of failure. | ||
193 | */ | ||
194 | static int write_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, | ||
195 | int len, int used_ebs) | ||
196 | { | ||
197 | int err, l; | ||
198 | struct ubi_volume *vol = ubi->volumes[vol_id]; | ||
199 | |||
200 | if (vol->vol_type == UBI_DYNAMIC_VOLUME) { | ||
201 | l = ALIGN(len, ubi->min_io_size); | ||
202 | memset(buf + len, 0xFF, l - len); | ||
203 | |||
204 | l = ubi_calc_data_len(ubi, buf, l); | ||
205 | if (l == 0) { | ||
206 | dbg_msg("all %d bytes contain 0xFF - skip", len); | ||
207 | return 0; | ||
208 | } | ||
209 | if (len != l) | ||
210 | dbg_msg("skip last %d bytes (0xFF)", len - l); | ||
211 | |||
212 | err = ubi_eba_write_leb(ubi, vol_id, lnum, buf, 0, l, | ||
213 | UBI_UNKNOWN); | ||
214 | } else { | ||
215 | /* | ||
216 | * When writing static volume, and this is the last logical | ||
217 | * eraseblock, the length (@len) does not have to be aligned to | ||
218 | * the minimal flash I/O unit. The 'ubi_eba_write_leb_st()' | ||
219 | * function accepts exact (unaligned) length and stores it in | ||
220 | * the VID header. And it takes care of proper alignment by | ||
221 | * padding the buffer. Here we just make sure the padding will | ||
222 | * contain zeros, not random trash. | ||
223 | */ | ||
224 | memset(buf + len, 0, vol->usable_leb_size - len); | ||
225 | err = ubi_eba_write_leb_st(ubi, vol_id, lnum, buf, len, | ||
226 | UBI_UNKNOWN, used_ebs); | ||
227 | } | ||
228 | |||
229 | return err; | ||
230 | } | ||
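/*
 * Editorial note (not part of the patch): a hedged example of the trimming
 * described above - if @len is 2048, the last 1024 bytes of @buf are all
 * 0xFF and min_io_size is 512, then ubi_calc_data_len() is expected to
 * return 1024, so only the first two min. I/O units are written and the
 * tail of the LEB stays writable.
 */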
231 | |||
232 | /** | ||
233 | * ubi_more_update_data - write more update data. | ||
234 | * @ubi: UBI device description object | ||
    | * @vol_id: volume ID | ||
235 | * @buf: write data (user-space memory buffer) | ||
236 | * @count: how many bytes to write | ||
237 | * | ||
238 | * This function writes more data to the volume which is being updated. It may | ||
239 | * be called an arbitrary number of times until all of the update data arrives. | ||
240 | * This function returns %0 in case of success, the number of bytes written | ||
241 | * during the last call if the whole volume update was successfully finished, | ||
242 | * and a negative error code in case of failure. | ||
243 | */ | ||
244 | int ubi_more_update_data(struct ubi_device *ubi, int vol_id, | ||
245 | const void __user *buf, int count) | ||
246 | { | ||
247 | uint64_t tmp; | ||
248 | struct ubi_volume *vol = ubi->volumes[vol_id]; | ||
249 | int lnum, offs, err = 0, len, to_write = count; | ||
250 | |||
251 | dbg_msg("write %d of %lld bytes, %lld already passed", | ||
252 | count, vol->upd_bytes, vol->upd_received); | ||
253 | |||
254 | if (ubi->ro_mode) | ||
255 | return -EROFS; | ||
256 | |||
257 | tmp = vol->upd_received; | ||
258 | offs = do_div(tmp, vol->usable_leb_size); | ||
259 | lnum = tmp; | ||
260 | |||
261 | if (vol->upd_received + count > vol->upd_bytes) | ||
262 | to_write = count = vol->upd_bytes - vol->upd_received; | ||
263 | |||
264 | /* | ||
265 | * When updating volumes, we accumulate whole logical eraseblock of | ||
266 | * data and write it at once. | ||
267 | */ | ||
268 | if (offs != 0) { | ||
269 | /* | ||
270 | * This is a write to the middle of the logical eraseblock. We | ||
271 | * copy the data to our update buffer and wait for more data or | ||
272 | * flush it if the whole eraseblock is written or the update | ||
273 | * is finished. | ||
274 | */ | ||
275 | |||
276 | len = vol->usable_leb_size - offs; | ||
277 | if (len > count) | ||
278 | len = count; | ||
279 | |||
280 | err = copy_from_user(vol->upd_buf + offs, buf, len); | ||
281 | if (err) | ||
282 | return -EFAULT; | ||
283 | |||
284 | if (offs + len == vol->usable_leb_size || | ||
285 | vol->upd_received + len == vol->upd_bytes) { | ||
286 | int flush_len = offs + len; | ||
287 | |||
288 | /* | ||
289 | * OK, we gathered either the whole eraseblock or this | ||
290 | * is the last chunk, it's time to flush the buffer. | ||
291 | */ | ||
292 | ubi_assert(flush_len <= vol->usable_leb_size); | ||
293 | err = write_leb(ubi, vol_id, lnum, vol->upd_buf, | ||
294 | flush_len, vol->upd_ebs); | ||
295 | if (err) | ||
296 | return err; | ||
297 | } | ||
298 | |||
299 | vol->upd_received += len; | ||
300 | count -= len; | ||
301 | buf += len; | ||
302 | lnum += 1; | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * If we've got more to write, let's continue. At this point we know we | ||
307 | * are starting from the beginning of an eraseblock. | ||
308 | */ | ||
309 | while (count) { | ||
310 | if (count > vol->usable_leb_size) | ||
311 | len = vol->usable_leb_size; | ||
312 | else | ||
313 | len = count; | ||
314 | |||
315 | err = copy_from_user(vol->upd_buf, buf, len); | ||
316 | if (err) | ||
317 | return -EFAULT; | ||
318 | |||
319 | if (len == vol->usable_leb_size || | ||
320 | vol->upd_received + len == vol->upd_bytes) { | ||
321 | err = write_leb(ubi, vol_id, lnum, vol->upd_buf, len, | ||
322 | vol->upd_ebs); | ||
323 | if (err) | ||
324 | break; | ||
325 | } | ||
326 | |||
327 | vol->upd_received += len; | ||
328 | count -= len; | ||
329 | lnum += 1; | ||
330 | buf += len; | ||
331 | } | ||
332 | |||
333 | ubi_assert(vol->upd_received <= vol->upd_bytes); | ||
334 | if (vol->upd_received == vol->upd_bytes) { | ||
335 | /* The update is finished, clear the update marker */ | ||
336 | err = clear_update_marker(ubi, vol_id, vol->upd_bytes); | ||
337 | if (err) | ||
338 | return err; | ||
339 | err = ubi_wl_flush(ubi); | ||
340 | if (err == 0) { | ||
341 | err = to_write; | ||
342 | kfree(vol->upd_buf); | ||
343 | vol->updating = 0; | ||
344 | } | ||
345 | } | ||
346 | |||
347 | return err; | ||
348 | } | ||
diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c new file mode 100644 index 000000000000..622d0d18952c --- /dev/null +++ b/drivers/mtd/ubi/vmt.c | |||
@@ -0,0 +1,809 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * This file contains the implementation of volume creation, deletion, updating | ||
23 | * and resizing. | ||
24 | */ | ||
25 | |||
26 | #include <linux/err.h> | ||
27 | #include <asm/div64.h> | ||
28 | #include "ubi.h" | ||
29 | |||
30 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
31 | static void paranoid_check_volumes(struct ubi_device *ubi); | ||
32 | #else | ||
33 | #define paranoid_check_volumes(ubi) | ||
34 | #endif | ||
35 | |||
36 | static ssize_t vol_attribute_show(struct device *dev, | ||
37 | struct device_attribute *attr, char *buf); | ||
38 | |||
39 | /* Device attributes corresponding to files in '/<sysfs>/class/ubi/ubiX_Y' */ | ||
40 | static struct device_attribute vol_reserved_ebs = | ||
41 | __ATTR(reserved_ebs, S_IRUGO, vol_attribute_show, NULL); | ||
42 | static struct device_attribute vol_type = | ||
43 | __ATTR(type, S_IRUGO, vol_attribute_show, NULL); | ||
44 | static struct device_attribute vol_name = | ||
45 | __ATTR(name, S_IRUGO, vol_attribute_show, NULL); | ||
46 | static struct device_attribute vol_corrupted = | ||
47 | __ATTR(corrupted, S_IRUGO, vol_attribute_show, NULL); | ||
48 | static struct device_attribute vol_alignment = | ||
49 | __ATTR(alignment, S_IRUGO, vol_attribute_show, NULL); | ||
50 | static struct device_attribute vol_usable_eb_size = | ||
51 | __ATTR(usable_eb_size, S_IRUGO, vol_attribute_show, NULL); | ||
52 | static struct device_attribute vol_data_bytes = | ||
53 | __ATTR(data_bytes, S_IRUGO, vol_attribute_show, NULL); | ||
54 | static struct device_attribute vol_upd_marker = | ||
55 | __ATTR(upd_marker, S_IRUGO, vol_attribute_show, NULL); | ||
56 | |||
57 | /* | ||
58 | * "Show" method for files in '/<sysfs>/class/ubi/ubiX_Y/'. | ||
59 | * | ||
60 | * Consider a situation: | ||
61 | * A. process 1 opens a sysfs file related to volume Y, say | ||
62 | * /<sysfs>/class/ubi/ubiX_Y/reserved_ebs; | ||
63 | * B. process 2 removes volume Y; | ||
64 | * C. process 1 starts reading the /<sysfs>/class/ubi/ubiX_Y/reserved_ebs file; | ||
65 | * | ||
66 | * What we want to do in a situation like that is to return an error when the | ||
67 | * is read. This is done by means of the 'removed' flag and the 'vol_lock' of | ||
68 | * the UBI volume description object. | ||
69 | */ | ||
70 | static ssize_t vol_attribute_show(struct device *dev, | ||
71 | struct device_attribute *attr, char *buf) | ||
72 | { | ||
73 | int ret; | ||
74 | struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); | ||
75 | |||
76 | spin_lock(&vol->ubi->volumes_lock); | ||
77 | if (vol->removed) { | ||
78 | spin_unlock(&vol->ubi->volumes_lock); | ||
79 | return -ENODEV; | ||
80 | } | ||
81 | if (attr == &vol_reserved_ebs) | ||
82 | ret = sprintf(buf, "%d\n", vol->reserved_pebs); | ||
83 | else if (attr == &vol_type) { | ||
84 | const char *tp; | ||
85 | tp = vol->vol_type == UBI_DYNAMIC_VOLUME ? "dynamic" : "static"; | ||
86 | ret = sprintf(buf, "%s\n", tp); | ||
87 | } else if (attr == &vol_name) | ||
88 | ret = sprintf(buf, "%s\n", vol->name); | ||
89 | else if (attr == &vol_corrupted) | ||
90 | ret = sprintf(buf, "%d\n", vol->corrupted); | ||
91 | else if (attr == &vol_alignment) | ||
92 | ret = sprintf(buf, "%d\n", vol->alignment); | ||
93 | else if (attr == &vol_usable_eb_size) { | ||
94 | ret = sprintf(buf, "%d\n", vol->usable_leb_size); | ||
95 | } else if (attr == &vol_data_bytes) | ||
96 | ret = sprintf(buf, "%lld\n", vol->used_bytes); | ||
97 | else if (attr == &vol_upd_marker) | ||
98 | ret = sprintf(buf, "%d\n", vol->upd_marker); | ||
99 | else | ||
100 | BUG(); | ||
101 | spin_unlock(&vol->ubi->volumes_lock); | ||
102 | return ret; | ||
103 | } | ||
104 | |||
105 | /* Release method for volume devices */ | ||
106 | static void vol_release(struct device *dev) | ||
107 | { | ||
108 | struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); | ||
109 | ubi_assert(vol->removed); | ||
110 | kfree(vol); | ||
111 | } | ||
112 | |||
113 | /** | ||
114 | * volume_sysfs_init - initialize sysfs for new volume. | ||
115 | * @ubi: UBI device description object | ||
116 | * @vol: volume description object | ||
117 | * | ||
118 | * This function returns zero in case of success and a negative error code in | ||
119 | * case of failure. | ||
120 | * | ||
121 | * Note, this function does not free allocated resources in case of failure - | ||
122 | * the caller does it. This is because freeing them here would trigger the | ||
123 | * release() method and then the caller would oops. | ||
124 | */ | ||
125 | static int volume_sysfs_init(struct ubi_device *ubi, struct ubi_volume *vol) | ||
126 | { | ||
127 | int err; | ||
128 | |||
129 | err = device_create_file(&vol->dev, &vol_reserved_ebs); | ||
130 | if (err) | ||
131 | return err; | ||
132 | err = device_create_file(&vol->dev, &vol_type); | ||
133 | if (err) | ||
134 | return err; | ||
135 | err = device_create_file(&vol->dev, &vol_name); | ||
136 | if (err) | ||
137 | return err; | ||
138 | err = device_create_file(&vol->dev, &vol_corrupted); | ||
139 | if (err) | ||
140 | return err; | ||
141 | err = device_create_file(&vol->dev, &vol_alignment); | ||
142 | if (err) | ||
143 | return err; | ||
144 | err = device_create_file(&vol->dev, &vol_usable_eb_size); | ||
145 | if (err) | ||
146 | return err; | ||
147 | err = device_create_file(&vol->dev, &vol_data_bytes); | ||
148 | if (err) | ||
149 | return err; | ||
150 | err = device_create_file(&vol->dev, &vol_upd_marker); | ||
151 | if (err) | ||
152 | return err; | ||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | /** | ||
157 | * volume_sysfs_close - close sysfs for a volume. | ||
158 | * @vol: volume description object | ||
159 | */ | ||
160 | static void volume_sysfs_close(struct ubi_volume *vol) | ||
161 | { | ||
162 | device_remove_file(&vol->dev, &vol_upd_marker); | ||
163 | device_remove_file(&vol->dev, &vol_data_bytes); | ||
164 | device_remove_file(&vol->dev, &vol_usable_eb_size); | ||
165 | device_remove_file(&vol->dev, &vol_alignment); | ||
166 | device_remove_file(&vol->dev, &vol_corrupted); | ||
167 | device_remove_file(&vol->dev, &vol_name); | ||
168 | device_remove_file(&vol->dev, &vol_type); | ||
169 | device_remove_file(&vol->dev, &vol_reserved_ebs); | ||
170 | device_unregister(&vol->dev); | ||
171 | } | ||
172 | |||
173 | /** | ||
174 | * ubi_create_volume - create volume. | ||
175 | * @ubi: UBI device description object | ||
176 | * @req: volume creation request | ||
177 | * | ||
178 | * This function creates a volume described by @req. If @req->vol_id is | ||
179 | * %UBI_VOL_NUM_AUTO, this function automatically assigns an ID to the new | ||
180 | * volume and saves it in @req->vol_id. Returns zero in case of success and a | ||
181 | * negative error code in case of failure. | ||
182 | */ | ||
183 | int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) | ||
184 | { | ||
185 | int i, err, vol_id = req->vol_id; | ||
186 | struct ubi_volume *vol; | ||
187 | struct ubi_vtbl_record vtbl_rec; | ||
188 | uint64_t bytes; | ||
189 | |||
190 | if (ubi->ro_mode) | ||
191 | return -EROFS; | ||
192 | |||
193 | vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL); | ||
194 | if (!vol) | ||
195 | return -ENOMEM; | ||
196 | |||
197 | spin_lock(&ubi->volumes_lock); | ||
198 | |||
199 | if (vol_id == UBI_VOL_NUM_AUTO) { | ||
200 | /* Find unused volume ID */ | ||
201 | dbg_msg("search for vacant volume ID"); | ||
202 | for (i = 0; i < ubi->vtbl_slots; i++) | ||
203 | if (!ubi->volumes[i]) { | ||
204 | vol_id = i; | ||
205 | break; | ||
206 | } | ||
207 | |||
208 | if (vol_id == UBI_VOL_NUM_AUTO) { | ||
209 | dbg_err("out of volume IDs"); | ||
210 | err = -ENFILE; | ||
211 | goto out_unlock; | ||
212 | } | ||
213 | req->vol_id = vol_id; | ||
214 | } | ||
215 | |||
216 | dbg_msg("volume ID %d, %llu bytes, type %d, name %s", | ||
217 | vol_id, (unsigned long long)req->bytes, | ||
218 | (int)req->vol_type, req->name); | ||
219 | |||
220 | /* Ensure that this volume does not exist */ | ||
221 | err = -EEXIST; | ||
222 | if (ubi->volumes[vol_id]) { | ||
223 | dbg_err("volume %d already exists", vol_id); | ||
224 | goto out_unlock; | ||
225 | } | ||
226 | |||
227 | /* Ensure that the name is unique */ | ||
228 | for (i = 0; i < ubi->vtbl_slots; i++) | ||
229 | if (ubi->volumes[i] && | ||
230 | ubi->volumes[i]->name_len == req->name_len && | ||
231 | strcmp(ubi->volumes[i]->name, req->name) == 0) { | ||
232 | dbg_err("volume \"%s\" exists (ID %d)", req->name, i); | ||
233 | goto out_unlock; | ||
234 | } | ||
235 | |||
236 | /* Calculate how many eraseblocks are requested */ | ||
237 | vol->usable_leb_size = ubi->leb_size - ubi->leb_size % req->alignment; | ||
238 | bytes = req->bytes; | ||
239 | if (do_div(bytes, vol->usable_leb_size)) | ||
240 | vol->reserved_pebs = 1; | ||
241 | vol->reserved_pebs += bytes; | ||
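/*
 * Editorial note (not part of the patch): e.g. with usable_leb_size = 126976
 * and req->bytes = 500000, do_div() leaves bytes = 3 with a non-zero
 * remainder, so reserved_pebs becomes 1 + 3 = 4. The numbers are
 * illustrative only.
 */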
242 | |||
243 | /* Reserve physical eraseblocks */ | ||
244 | if (vol->reserved_pebs > ubi->avail_pebs) { | ||
245 | dbg_err("not enough PEBs, only %d available", ubi->avail_pebs); | ||
246 | spin_unlock(&ubi->volumes_lock); | ||
247 | err = -ENOSPC; | ||
248 | goto out_unlock; | ||
249 | } | ||
250 | ubi->avail_pebs -= vol->reserved_pebs; | ||
251 | ubi->rsvd_pebs += vol->reserved_pebs; | ||
252 | |||
253 | vol->vol_id = vol_id; | ||
254 | vol->alignment = req->alignment; | ||
255 | vol->data_pad = ubi->leb_size % vol->alignment; | ||
256 | vol->vol_type = req->vol_type; | ||
257 | vol->name_len = req->name_len; | ||
258 | memcpy(vol->name, req->name, vol->name_len + 1); | ||
259 | vol->exclusive = 1; | ||
260 | vol->ubi = ubi; | ||
261 | ubi->volumes[vol_id] = vol; | ||
262 | spin_unlock(&ubi->volumes_lock); | ||
263 | |||
264 | /* | ||
265 | * Finish all pending erases because there may be some LEBs belonging | ||
266 | * to the same volume ID. | ||
267 | */ | ||
268 | err = ubi_wl_flush(ubi); | ||
269 | if (err) | ||
270 | goto out_acc; | ||
271 | |||
272 | vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int), GFP_KERNEL); | ||
273 | if (!vol->eba_tbl) { | ||
274 | err = -ENOMEM; | ||
275 | goto out_acc; | ||
276 | } | ||
277 | |||
278 | for (i = 0; i < vol->reserved_pebs; i++) | ||
279 | vol->eba_tbl[i] = UBI_LEB_UNMAPPED; | ||
280 | |||
281 | if (vol->vol_type == UBI_DYNAMIC_VOLUME) { | ||
282 | vol->used_ebs = vol->reserved_pebs; | ||
283 | vol->last_eb_bytes = vol->usable_leb_size; | ||
284 | vol->used_bytes = vol->used_ebs * vol->usable_leb_size; | ||
285 | } else { | ||
286 | bytes = vol->used_bytes; | ||
287 | vol->last_eb_bytes = do_div(bytes, vol->usable_leb_size); | ||
288 | vol->used_ebs = bytes; | ||
289 | if (vol->last_eb_bytes) | ||
290 | vol->used_ebs += 1; | ||
291 | else | ||
292 | vol->last_eb_bytes = vol->usable_leb_size; | ||
293 | } | ||
294 | |||
295 | /* Register character device for the volume */ | ||
296 | cdev_init(&vol->cdev, &ubi_vol_cdev_operations); | ||
297 | vol->cdev.owner = THIS_MODULE; | ||
298 | err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol_id + 1), 1); | ||
299 | if (err) { | ||
300 | ubi_err("cannot add character device for volume %d", vol_id); | ||
301 | goto out_mapping; | ||
302 | } | ||
303 | |||
304 | err = ubi_create_gluebi(ubi, vol); | ||
305 | if (err) | ||
306 | goto out_cdev; | ||
307 | |||
308 | vol->dev.release = vol_release; | ||
309 | vol->dev.parent = &ubi->dev; | ||
310 | vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1); | ||
311 | vol->dev.class = ubi_class; | ||
312 | sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id); | ||
313 | err = device_register(&vol->dev); | ||
314 | if (err) | ||
315 | goto out_gluebi; | ||
316 | |||
317 | err = volume_sysfs_init(ubi, vol); | ||
318 | if (err) | ||
319 | goto out_sysfs; | ||
320 | |||
321 | /* Fill volume table record */ | ||
322 | memset(&vtbl_rec, 0, sizeof(struct ubi_vtbl_record)); | ||
323 | vtbl_rec.reserved_pebs = cpu_to_ubi32(vol->reserved_pebs); | ||
324 | vtbl_rec.alignment = cpu_to_ubi32(vol->alignment); | ||
325 | vtbl_rec.data_pad = cpu_to_ubi32(vol->data_pad); | ||
326 | vtbl_rec.name_len = cpu_to_ubi16(vol->name_len); | ||
327 | if (vol->vol_type == UBI_DYNAMIC_VOLUME) | ||
328 | vtbl_rec.vol_type = UBI_VID_DYNAMIC; | ||
329 | else | ||
330 | vtbl_rec.vol_type = UBI_VID_STATIC; | ||
331 | memcpy(vtbl_rec.name, vol->name, vol->name_len + 1); | ||
332 | |||
333 | err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); | ||
334 | if (err) | ||
335 | goto out_sysfs; | ||
336 | |||
337 | spin_lock(&ubi->volumes_lock); | ||
338 | ubi->vol_count += 1; | ||
339 | vol->exclusive = 0; | ||
340 | spin_unlock(&ubi->volumes_lock); | ||
341 | |||
342 | paranoid_check_volumes(ubi); | ||
343 | return 0; | ||
344 | |||
345 | out_gluebi: | ||
346 | err = ubi_destroy_gluebi(vol); | ||
347 | out_cdev: | ||
348 | cdev_del(&vol->cdev); | ||
349 | out_mapping: | ||
350 | kfree(vol->eba_tbl); | ||
351 | out_acc: | ||
352 | spin_lock(&ubi->volumes_lock); | ||
353 | ubi->rsvd_pebs -= vol->reserved_pebs; | ||
354 | ubi->avail_pebs += vol->reserved_pebs; | ||
355 | out_unlock: | ||
356 | spin_unlock(&ubi->volumes_lock); | ||
357 | kfree(vol); | ||
358 | return err; | ||
359 | |||
360 | /* | ||
361 | * We are registered, so @vol is destroyed in the release function and | ||
362 | * we have to de-initialize differently. | ||
363 | */ | ||
364 | out_sysfs: | ||
365 | err = ubi_destroy_gluebi(vol); | ||
366 | cdev_del(&vol->cdev); | ||
367 | kfree(vol->eba_tbl); | ||
368 | spin_lock(&ubi->volumes_lock); | ||
369 | ubi->rsvd_pebs -= vol->reserved_pebs; | ||
370 | ubi->avail_pebs += vol->reserved_pebs; | ||
371 | spin_unlock(&ubi->volumes_lock); | ||
372 | volume_sysfs_close(vol); | ||
373 | return err; | ||
374 | } | ||
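/*
 * Editorial sketch (not part of the patch): a minimal caller of
 * ubi_create_volume(), creating a 1 MiB dynamic volume with an automatically
 * assigned ID. The field names follow struct ubi_mkvol_req as referenced by
 * the function above; the helper name and volume name are illustrative only.
 */
static int ubi_create_example_volume(struct ubi_device *ubi)
{
	struct ubi_mkvol_req req;

	memset(&req, 0, sizeof(struct ubi_mkvol_req));
	req.vol_id = UBI_VOL_NUM_AUTO;
	req.alignment = 1;
	req.bytes = 1024 * 1024;
	req.vol_type = UBI_DYNAMIC_VOLUME;
	req.name_len = strlen("example");
	memcpy(req.name, "example", req.name_len + 1);

	return ubi_create_volume(ubi, &req);	/* req.vol_id holds the new ID */
}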
375 | |||
376 | /** | ||
377 | * ubi_remove_volume - remove volume. | ||
378 | * @desc: volume descriptor | ||
379 | * | ||
380 | * This function removes the volume described by @desc. The volume has to be opened | ||
381 | * in "exclusive" mode. Returns zero in case of success and a negative error | ||
382 | * code in case of failure. | ||
383 | */ | ||
384 | int ubi_remove_volume(struct ubi_volume_desc *desc) | ||
385 | { | ||
386 | struct ubi_volume *vol = desc->vol; | ||
387 | struct ubi_device *ubi = vol->ubi; | ||
388 | int i, err, vol_id = vol->vol_id, reserved_pebs = vol->reserved_pebs; | ||
389 | |||
390 | dbg_msg("remove UBI volume %d", vol_id); | ||
391 | ubi_assert(desc->mode == UBI_EXCLUSIVE); | ||
392 | ubi_assert(vol == ubi->volumes[vol_id]); | ||
393 | |||
394 | if (ubi->ro_mode) | ||
395 | return -EROFS; | ||
396 | |||
397 | err = ubi_destroy_gluebi(vol); | ||
398 | if (err) | ||
399 | return err; | ||
400 | |||
401 | err = ubi_change_vtbl_record(ubi, vol_id, NULL); | ||
402 | if (err) | ||
403 | return err; | ||
404 | |||
405 | for (i = 0; i < vol->reserved_pebs; i++) { | ||
406 | err = ubi_eba_unmap_leb(ubi, vol_id, i); | ||
407 | if (err) | ||
408 | return err; | ||
409 | } | ||
410 | |||
411 | spin_lock(&ubi->volumes_lock); | ||
412 | vol->removed = 1; | ||
413 | ubi->volumes[vol_id] = NULL; | ||
414 | spin_unlock(&ubi->volumes_lock); | ||
415 | |||
416 | kfree(vol->eba_tbl); | ||
417 | vol->eba_tbl = NULL; | ||
418 | cdev_del(&vol->cdev); | ||
419 | volume_sysfs_close(vol); | ||
420 | kfree(desc); | ||
421 | |||
422 | spin_lock(&ubi->volumes_lock); | ||
423 | ubi->rsvd_pebs -= reserved_pebs; | ||
424 | ubi->avail_pebs += reserved_pebs; | ||
425 | i = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs; | ||
426 | if (i > 0) { | ||
427 | i = ubi->avail_pebs >= i ? i : ubi->avail_pebs; | ||
428 | ubi->avail_pebs -= i; | ||
429 | ubi->rsvd_pebs += i; | ||
430 | ubi->beb_rsvd_pebs += i; | ||
431 | if (i > 0) | ||
432 | ubi_msg("reserve more %d PEBs", i); | ||
433 | } | ||
434 | ubi->vol_count -= 1; | ||
435 | spin_unlock(&ubi->volumes_lock); | ||
436 | |||
437 | paranoid_check_volumes(ubi); | ||
438 | module_put(THIS_MODULE); | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | /** | ||
443 | * ubi_resize_volume - re-size volume. | ||
444 | * @desc: volume descriptor | ||
445 | * @reserved_pebs: new size in physical eraseblocks | ||
446 | * | ||
447 | * This function returns zero in case of success, and a negative error code in | ||
448 | * case of failure. | ||
449 | */ | ||
450 | int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) | ||
451 | { | ||
452 | int i, err, pebs, *new_mapping; | ||
453 | struct ubi_volume *vol = desc->vol; | ||
454 | struct ubi_device *ubi = vol->ubi; | ||
455 | struct ubi_vtbl_record vtbl_rec; | ||
456 | int vol_id = vol->vol_id; | ||
457 | |||
458 | if (ubi->ro_mode) | ||
459 | return -EROFS; | ||
460 | |||
461 | dbg_msg("re-size volume %d from %d to %d PEBs", | ||
462 | vol_id, vol->reserved_pebs, reserved_pebs); | ||
463 | ubi_assert(desc->mode == UBI_EXCLUSIVE); | ||
464 | ubi_assert(vol == ubi->volumes[vol_id]); | ||
465 | |||
466 | if (vol->vol_type == UBI_STATIC_VOLUME && | ||
467 | reserved_pebs < vol->used_ebs) { | ||
468 | dbg_err("too small size %d, %d LEBs contain data", | ||
469 | reserved_pebs, vol->used_ebs); | ||
470 | return -EINVAL; | ||
471 | } | ||
472 | |||
473 | /* If the size is the same, we have nothing to do */ | ||
474 | if (reserved_pebs == vol->reserved_pebs) | ||
475 | return 0; | ||
476 | |||
477 | new_mapping = kmalloc(reserved_pebs * sizeof(int), GFP_KERNEL); | ||
478 | if (!new_mapping) | ||
479 | return -ENOMEM; | ||
480 | |||
481 | for (i = 0; i < reserved_pebs; i++) | ||
482 | new_mapping[i] = UBI_LEB_UNMAPPED; | ||
483 | |||
484 | /* Reserve physical eraseblocks */ | ||
485 | pebs = reserved_pebs - vol->reserved_pebs; | ||
486 | if (pebs > 0) { | ||
487 | spin_lock(&ubi->volumes_lock); | ||
488 | if (pebs > ubi->avail_pebs) { | ||
489 | dbg_err("not enough PEBs: requested %d, available %d", | ||
490 | pebs, ubi->avail_pebs); | ||
491 | spin_unlock(&ubi->volumes_lock); | ||
492 | err = -ENOSPC; | ||
493 | goto out_free; | ||
494 | } | ||
495 | ubi->avail_pebs -= pebs; | ||
496 | ubi->rsvd_pebs += pebs; | ||
497 | for (i = 0; i < vol->reserved_pebs; i++) | ||
498 | new_mapping[i] = vol->eba_tbl[i]; | ||
499 | kfree(vol->eba_tbl); | ||
500 | vol->eba_tbl = new_mapping; | ||
501 | spin_unlock(&ubi->volumes_lock); | ||
502 | } | ||
503 | |||
504 | /* Change volume table record */ | ||
505 | memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record)); | ||
506 | vtbl_rec.reserved_pebs = cpu_to_ubi32(reserved_pebs); | ||
507 | err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); | ||
508 | if (err) | ||
509 | goto out_acc; | ||
510 | |||
511 | if (pebs < 0) { | ||
512 | for (i = 0; i < -pebs; i++) { | ||
513 | err = ubi_eba_unmap_leb(ubi, vol_id, reserved_pebs + i); | ||
514 | if (err) | ||
515 | goto out_acc; | ||
516 | } | ||
517 | spin_lock(&ubi->volumes_lock); | ||
518 | ubi->rsvd_pebs += pebs; | ||
519 | ubi->avail_pebs -= pebs; | ||
520 | pebs = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs; | ||
521 | if (pebs > 0) { | ||
522 | pebs = ubi->avail_pebs >= pebs ? pebs : ubi->avail_pebs; | ||
523 | ubi->avail_pebs -= pebs; | ||
524 | ubi->rsvd_pebs += pebs; | ||
525 | ubi->beb_rsvd_pebs += pebs; | ||
526 | if (pebs > 0) | ||
527 | ubi_msg("reserve more %d PEBs", pebs); | ||
528 | } | ||
529 | for (i = 0; i < reserved_pebs; i++) | ||
530 | new_mapping[i] = vol->eba_tbl[i]; | ||
531 | kfree(vol->eba_tbl); | ||
532 | vol->eba_tbl = new_mapping; | ||
533 | spin_unlock(&ubi->volumes_lock); | ||
534 | } | ||
535 | |||
536 | vol->reserved_pebs = reserved_pebs; | ||
537 | if (vol->vol_type == UBI_DYNAMIC_VOLUME) { | ||
538 | vol->used_ebs = reserved_pebs; | ||
539 | vol->last_eb_bytes = vol->usable_leb_size; | ||
540 | vol->used_bytes = vol->used_ebs * vol->usable_leb_size; | ||
541 | } | ||
542 | |||
543 | paranoid_check_volumes(ubi); | ||
544 | return 0; | ||
545 | |||
546 | out_acc: | ||
547 | if (pebs > 0) { | ||
548 | spin_lock(&ubi->volumes_lock); | ||
549 | ubi->rsvd_pebs -= pebs; | ||
550 | ubi->avail_pebs += pebs; | ||
551 | spin_unlock(&ubi->volumes_lock); | ||
552 | } | ||
553 | out_free: | ||
554 | kfree(new_mapping); | ||
555 | return err; | ||
556 | } | ||
557 | |||
558 | /** | ||
559 | * ubi_add_volume - add volume. | ||
560 | * @ubi: UBI device description object | ||
561 | * @vol_id: volume ID | ||
562 | * | ||
563 | * This function adds an existing volume and initializes all its data | ||
564 | * structures. Returns zero in case of success and a negative error code in | ||
565 | * case of failure. | ||
566 | */ | ||
567 | int ubi_add_volume(struct ubi_device *ubi, int vol_id) | ||
568 | { | ||
569 | int err; | ||
570 | struct ubi_volume *vol = ubi->volumes[vol_id]; | ||
571 | |||
572 | dbg_msg("add volume %d", vol_id); | ||
573 | ubi_assert(vol); | ||
574 | ubi_dbg_dump_vol_info(vol); | ||
575 | |||
576 | /* Register character device for the volume */ | ||
577 | cdev_init(&vol->cdev, &ubi_vol_cdev_operations); | ||
578 | vol->cdev.owner = THIS_MODULE; | ||
579 | err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol->vol_id + 1), 1); | ||
580 | if (err) { | ||
581 | ubi_err("cannot add character device for volume %d", vol_id); | ||
582 | return err; | ||
583 | } | ||
584 | |||
585 | err = ubi_create_gluebi(ubi, vol); | ||
586 | if (err) | ||
587 | goto out_cdev; | ||
588 | |||
589 | vol->dev.release = vol_release; | ||
590 | vol->dev.parent = &ubi->dev; | ||
591 | vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1); | ||
592 | vol->dev.class = ubi_class; | ||
593 | sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id); | ||
594 | err = device_register(&vol->dev); | ||
595 | if (err) | ||
596 | goto out_gluebi; | ||
597 | |||
598 | err = volume_sysfs_init(ubi, vol); | ||
599 | if (err) { | ||
600 | cdev_del(&vol->cdev); | ||
601 | err = ubi_destroy_gluebi(vol); | ||
602 | volume_sysfs_close(vol); | ||
603 | return err; | ||
604 | } | ||
605 | |||
606 | paranoid_check_volumes(ubi); | ||
607 | return 0; | ||
608 | |||
609 | out_gluebi: | ||
610 | err = ubi_destroy_gluebi(vol); | ||
611 | out_cdev: | ||
612 | cdev_del(&vol->cdev); | ||
613 | return err; | ||
614 | } | ||
615 | |||
616 | /** | ||
617 | * ubi_free_volume - free volume. | ||
618 | * @ubi: UBI device description object | ||
619 | * @vol_id: volume ID | ||
620 | * | ||
621 | * This function frees all resources for volume @vol_id but does not remove it. | ||
622 | * Used only when the UBI device is detached. | ||
623 | */ | ||
624 | void ubi_free_volume(struct ubi_device *ubi, int vol_id) | ||
625 | { | ||
626 | int err; | ||
627 | struct ubi_volume *vol = ubi->volumes[vol_id]; | ||
628 | |||
629 | dbg_msg("free volume %d", vol_id); | ||
630 | ubi_assert(vol); | ||
631 | |||
632 | vol->removed = 1; | ||
633 | err = ubi_destroy_gluebi(vol); | ||
634 | ubi->volumes[vol_id] = NULL; | ||
635 | cdev_del(&vol->cdev); | ||
636 | volume_sysfs_close(vol); | ||
637 | } | ||
638 | |||
639 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
640 | |||
641 | /** | ||
642 | * paranoid_check_volume - check volume information. | ||
643 | * @ubi: UBI device description object | ||
644 | * @vol_id: volume ID | ||
645 | */ | ||
646 | static void paranoid_check_volume(const struct ubi_device *ubi, int vol_id) | ||
647 | { | ||
648 | int idx = vol_id2idx(ubi, vol_id); | ||
649 | int reserved_pebs, alignment, data_pad, vol_type, name_len, upd_marker; | ||
650 | const struct ubi_volume *vol = ubi->volumes[idx]; | ||
651 | long long n; | ||
652 | const char *name; | ||
653 | |||
654 | reserved_pebs = ubi32_to_cpu(ubi->vtbl[vol_id].reserved_pebs); | ||
655 | |||
656 | if (!vol) { | ||
657 | if (reserved_pebs) { | ||
658 | ubi_err("no volume info, but volume exists"); | ||
659 | goto fail; | ||
660 | } | ||
661 | return; | ||
662 | } | ||
663 | |||
664 | if (vol->reserved_pebs < 0 || vol->alignment < 0 || vol->data_pad < 0 || | ||
665 | vol->name_len < 0) { | ||
666 | ubi_err("negative values"); | ||
667 | goto fail; | ||
668 | } | ||
669 | if (vol->alignment > ubi->leb_size || vol->alignment == 0) { | ||
670 | ubi_err("bad alignment"); | ||
671 | goto fail; | ||
672 | } | ||
673 | |||
674 | n = vol->alignment % ubi->min_io_size; | ||
675 | if (vol->alignment != 1 && n) { | ||
676 | ubi_err("alignment is not multiple of min I/O unit"); | ||
677 | goto fail; | ||
678 | } | ||
679 | |||
680 | n = ubi->leb_size % vol->alignment; | ||
681 | if (vol->data_pad != n) { | ||
682 | ubi_err("bad data_pad, has to be %lld", n); | ||
683 | goto fail; | ||
684 | } | ||
685 | |||
686 | if (vol->vol_type != UBI_DYNAMIC_VOLUME && | ||
687 | vol->vol_type != UBI_STATIC_VOLUME) { | ||
688 | ubi_err("bad vol_type"); | ||
689 | goto fail; | ||
690 | } | ||
691 | |||
692 | if (vol->upd_marker != 0 && vol->upd_marker != 1) { | ||
693 | ubi_err("bad upd_marker"); | ||
694 | goto fail; | ||
695 | } | ||
696 | |||
697 | if (vol->upd_marker && vol->corrupted) { | ||
698 | dbg_err("update marker and corrupted simultaneously"); | ||
699 | goto fail; | ||
700 | } | ||
701 | |||
702 | if (vol->reserved_pebs > ubi->good_peb_count) { | ||
703 | ubi_err("too large reserved_pebs"); | ||
704 | goto fail; | ||
705 | } | ||
706 | |||
707 | n = ubi->leb_size - vol->data_pad; | ||
708 | if (vol->usable_leb_size != ubi->leb_size - vol->data_pad) { | ||
709 | ubi_err("bad usable_leb_size, has to be %lld", n); | ||
710 | goto fail; | ||
711 | } | ||
712 | |||
713 | if (vol->name_len > UBI_VOL_NAME_MAX) { | ||
714 | ubi_err("too long volume name, max is %d", UBI_VOL_NAME_MAX); | ||
715 | goto fail; | ||
716 | } | ||
717 | |||
718 | if (!vol->name) { | ||
719 | ubi_err("NULL volume name"); | ||
720 | goto fail; | ||
721 | } | ||
722 | |||
723 | n = strnlen(vol->name, vol->name_len + 1); | ||
724 | if (n != vol->name_len) { | ||
725 | ubi_err("bad name_len %lld", n); | ||
726 | goto fail; | ||
727 | } | ||
728 | |||
729 | n = vol->used_ebs * vol->usable_leb_size; | ||
730 | if (vol->vol_type == UBI_DYNAMIC_VOLUME) { | ||
731 | if (vol->corrupted != 0) { | ||
732 | ubi_err("corrupted dynamic volume"); | ||
733 | goto fail; | ||
734 | } | ||
735 | if (vol->used_ebs != vol->reserved_pebs) { | ||
736 | ubi_err("bad used_ebs"); | ||
737 | goto fail; | ||
738 | } | ||
739 | if (vol->last_eb_bytes != vol->usable_leb_size) { | ||
740 | ubi_err("bad last_eb_bytes"); | ||
741 | goto fail; | ||
742 | } | ||
743 | if (vol->used_bytes != n) { | ||
744 | ubi_err("bad used_bytes"); | ||
745 | goto fail; | ||
746 | } | ||
747 | } else { | ||
748 | if (vol->corrupted != 0 && vol->corrupted != 1) { | ||
749 | ubi_err("bad corrupted"); | ||
750 | goto fail; | ||
751 | } | ||
752 | if (vol->used_ebs < 0 || vol->used_ebs > vol->reserved_pebs) { | ||
753 | ubi_err("bad used_ebs"); | ||
754 | goto fail; | ||
755 | } | ||
756 | if (vol->last_eb_bytes < 0 || | ||
757 | vol->last_eb_bytes > vol->usable_leb_size) { | ||
758 | ubi_err("bad last_eb_bytes"); | ||
759 | goto fail; | ||
760 | } | ||
761 | if (vol->used_bytes < 0 || vol->used_bytes > n || | ||
762 | vol->used_bytes < n - vol->usable_leb_size) { | ||
763 | ubi_err("bad used_bytes"); | ||
764 | goto fail; | ||
765 | } | ||
766 | } | ||
767 | |||
768 | alignment = ubi32_to_cpu(ubi->vtbl[vol_id].alignment); | ||
769 | data_pad = ubi32_to_cpu(ubi->vtbl[vol_id].data_pad); | ||
770 | name_len = ubi16_to_cpu(ubi->vtbl[vol_id].name_len); | ||
771 | upd_marker = ubi->vtbl[vol_id].upd_marker; | ||
772 | name = &ubi->vtbl[vol_id].name[0]; | ||
773 | if (ubi->vtbl[vol_id].vol_type == UBI_VID_DYNAMIC) | ||
774 | vol_type = UBI_DYNAMIC_VOLUME; | ||
775 | else | ||
776 | vol_type = UBI_STATIC_VOLUME; | ||
777 | |||
778 | if (alignment != vol->alignment || data_pad != vol->data_pad || | ||
779 | upd_marker != vol->upd_marker || vol_type != vol->vol_type || | ||
780 | name_len != vol->name_len || strncmp(name, vol->name, name_len)) { | ||
781 | ubi_err("volume info is different"); | ||
782 | goto fail; | ||
783 | } | ||
784 | |||
785 | return; | ||
786 | |||
787 | fail: | ||
788 | ubi_err("paranoid check failed"); | ||
789 | ubi_dbg_dump_vol_info(vol); | ||
790 | ubi_dbg_dump_vtbl_record(&ubi->vtbl[vol_id], vol_id); | ||
791 | BUG(); | ||
792 | } | ||
793 | |||
794 | /** | ||
795 | * paranoid_check_volumes - check information about all volumes. | ||
796 | * @ubi: UBI device description object | ||
797 | */ | ||
798 | static void paranoid_check_volumes(struct ubi_device *ubi) | ||
799 | { | ||
800 | int i; | ||
801 | |||
802 | mutex_lock(&ubi->vtbl_mutex); | ||
803 | spin_lock(&ubi->volumes_lock); | ||
804 | for (i = 0; i < ubi->vtbl_slots; i++) | ||
805 | paranoid_check_volume(ubi, i); | ||
806 | spin_unlock(&ubi->volumes_lock); | ||
807 | mutex_unlock(&ubi->vtbl_mutex); | ||
808 | } | ||
809 | #endif | ||
diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c new file mode 100644 index 000000000000..b6fd6bbd941e --- /dev/null +++ b/drivers/mtd/ubi/vtbl.c | |||
@@ -0,0 +1,809 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * Copyright (c) Nokia Corporation, 2006, 2007 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | * | ||
19 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
20 | */ | ||
21 | |||
22 | /* | ||
23 | * This file includes volume table manipulation code. The volume table is an | ||
24 | * on-flash table containing volume meta-data like name, number of reserved | ||
25 | * physical eraseblocks, type, etc. The volume table is stored in the so-called | ||
26 | * "layout volume". | ||
27 | * | ||
28 | * The layout volume is an internal volume which is organized as follows. It | ||
29 | * consists of two logical eraseblocks - LEB 0 and LEB 1. Each logical | ||
30 | * eraseblock stores one volume table copy, i.e. LEB 0 and LEB 1 duplicate each | ||
31 | * other. This redundancy guarantees robustness to unclean reboots. The volume | ||
32 | * table is basically an array of volume table records. Each record contains | ||
33 | * full information about the volume and is protected by a CRC checksum. | ||
34 | * | ||
35 | * When the volume table is changed, it is first changed in RAM. Then LEB 0 is | ||
36 | * erased, and the updated volume table is written back to LEB 0. Then the same | ||
37 | * is done for LEB 1. This scheme guarantees recoverability from unclean reboots. | ||
38 | * | ||
39 | * In this UBI implementation the on-flash volume table does not contain any | ||
40 | * information about how much data static volumes contain. This information may | ||
41 | * be found from the scanning data. | ||
42 | * | ||
43 | * But it would still be beneficial to store this information in the volume | ||
44 | * table. For example, suppose we have a static volume X, and all its physical | ||
45 | * eraseblocks became bad for some reason. Suppose we are attaching the | ||
46 | * corresponding MTD device and the scanning has found no logical eraseblocks | ||
47 | * corresponding to volume X. According to the volume table, volume X does | ||
48 | * exist, so we don't know whether it is just empty or all its physical | ||
49 | * eraseblocks went bad. Hence, we cannot warn the user about this corruption. | ||
50 | * | ||
51 | * The volume table also stores the so-called "update marker", which is used for | ||
52 | * volume updates. Before updating the volume, the update marker is set, and | ||
53 | * after the update operation is finished, the update marker is cleared. So if | ||
54 | * the update operation was interrupted (e.g. by an unclean reboot) - the | ||
55 | * update marker is still there and we know that the volume's contents are | ||
56 | * damaged. | ||
57 | */ | ||
58 | |||
59 | #include <linux/crc32.h> | ||
60 | #include <linux/err.h> | ||
61 | #include <asm/div64.h> | ||
62 | #include "ubi.h" | ||
63 | |||
64 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
65 | static void paranoid_vtbl_check(const struct ubi_device *ubi); | ||
66 | #else | ||
67 | #define paranoid_vtbl_check(ubi) | ||
68 | #endif | ||
69 | |||
70 | /* Empty volume table record */ | ||
71 | static struct ubi_vtbl_record empty_vtbl_record; | ||
72 | |||
73 | /** | ||
74 | * ubi_change_vtbl_record - change volume table record. | ||
75 | * @ubi: UBI device description object | ||
76 | * @idx: table index to change | ||
77 | * @vtbl_rec: new volume table record | ||
78 | * | ||
79 | * This function changes volume table record @idx. If @vtbl_rec is %NULL, an | ||
80 | * empty volume table record is written. The caller does not have to calculate | ||
81 | * the CRC of the record as this is done by the function. Returns zero in case | ||
82 | * of success and a negative error code in case of failure. | ||
83 | */ | ||
84 | int ubi_change_vtbl_record(struct ubi_device *ubi, int idx, | ||
85 | struct ubi_vtbl_record *vtbl_rec) | ||
86 | { | ||
87 | int i, err; | ||
88 | uint32_t crc; | ||
89 | |||
90 | ubi_assert(idx >= 0 && idx < ubi->vtbl_slots); | ||
91 | |||
92 | if (!vtbl_rec) | ||
93 | vtbl_rec = &empty_vtbl_record; | ||
94 | else { | ||
95 | crc = crc32(UBI_CRC32_INIT, vtbl_rec, UBI_VTBL_RECORD_SIZE_CRC); | ||
96 | vtbl_rec->crc = cpu_to_ubi32(crc); | ||
97 | } | ||
98 | |||
99 | dbg_msg("change record %d", idx); | ||
100 | ubi_dbg_dump_vtbl_record(vtbl_rec, idx); | ||
101 | |||
102 | mutex_lock(&ubi->vtbl_mutex); | ||
103 | memcpy(&ubi->vtbl[idx], vtbl_rec, sizeof(struct ubi_vtbl_record)); | ||
104 | for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) { | ||
105 | err = ubi_eba_unmap_leb(ubi, UBI_LAYOUT_VOL_ID, i); | ||
106 | if (err) { | ||
107 | mutex_unlock(&ubi->vtbl_mutex); | ||
108 | return err; | ||
109 | } | ||
110 | err = ubi_eba_write_leb(ubi, UBI_LAYOUT_VOL_ID, i, ubi->vtbl, 0, | ||
111 | ubi->vtbl_size, UBI_LONGTERM); | ||
112 | if (err) { | ||
113 | mutex_unlock(&ubi->vtbl_mutex); | ||
114 | return err; | ||
115 | } | ||
116 | } | ||
117 | |||
118 | paranoid_vtbl_check(ubi); | ||
119 | mutex_unlock(&ubi->vtbl_mutex); | ||
120 | return ubi_wl_flush(ubi); | ||
121 | } | ||
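/*
 * For illustration, a hypothetical caller could wipe record @vol_id by passing
 * %NULL, or install a freshly filled-in record (the CRC is added by this
 * function; name, name_len and the other fields are omitted in this sketch):
 *
 *	err = ubi_change_vtbl_record(ubi, vol_id, NULL);
 *
 *	struct ubi_vtbl_record rec = { 0 };
 *	rec.reserved_pebs = cpu_to_ubi32(reserved_pebs);
 *	rec.alignment = cpu_to_ubi32(1);
 *	rec.vol_type = UBI_VID_DYNAMIC;
 *	err = ubi_change_vtbl_record(ubi, vol_id, &rec);
 */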
122 | |||
123 | /** | ||
124 | * vtbl_check - check if volume table is not corrupted and contains sensible | ||
125 | * data. | ||
126 | * | ||
127 | * @ubi: UBI device description object | ||
128 | * @vtbl: volume table | ||
129 | * | ||
130 | * This function returns zero if @vtbl is all right, %1 if CRC is incorrect, | ||
131 | * and %-EINVAL if it contains inconsistent data. | ||
132 | */ | ||
133 | static int vtbl_check(const struct ubi_device *ubi, | ||
134 | const struct ubi_vtbl_record *vtbl) | ||
135 | { | ||
136 | int i, n, reserved_pebs, alignment, data_pad, vol_type, name_len; | ||
137 | int upd_marker; | ||
138 | uint32_t crc; | ||
139 | const char *name; | ||
140 | |||
141 | for (i = 0; i < ubi->vtbl_slots; i++) { | ||
142 | cond_resched(); | ||
143 | |||
144 | reserved_pebs = ubi32_to_cpu(vtbl[i].reserved_pebs); | ||
145 | alignment = ubi32_to_cpu(vtbl[i].alignment); | ||
146 | data_pad = ubi32_to_cpu(vtbl[i].data_pad); | ||
147 | upd_marker = vtbl[i].upd_marker; | ||
148 | vol_type = vtbl[i].vol_type; | ||
149 | name_len = ubi16_to_cpu(vtbl[i].name_len); | ||
150 | name = &vtbl[i].name[0]; | ||
151 | |||
152 | crc = crc32(UBI_CRC32_INIT, &vtbl[i], UBI_VTBL_RECORD_SIZE_CRC); | ||
153 | if (ubi32_to_cpu(vtbl[i].crc) != crc) { | ||
154 | ubi_err("bad CRC at record %u: %#08x, not %#08x", | ||
155 | i, crc, ubi32_to_cpu(vtbl[i].crc)); | ||
156 | ubi_dbg_dump_vtbl_record(&vtbl[i], i); | ||
157 | return 1; | ||
158 | } | ||
159 | |||
160 | if (reserved_pebs == 0) { | ||
161 | if (memcmp(&vtbl[i], &empty_vtbl_record, | ||
162 | UBI_VTBL_RECORD_SIZE)) { | ||
163 | dbg_err("bad empty record"); | ||
164 | goto bad; | ||
165 | } | ||
166 | continue; | ||
167 | } | ||
168 | |||
169 | if (reserved_pebs < 0 || alignment < 0 || data_pad < 0 || | ||
170 | name_len < 0) { | ||
171 | dbg_err("negative values"); | ||
172 | goto bad; | ||
173 | } | ||
174 | |||
175 | if (alignment > ubi->leb_size || alignment == 0) { | ||
176 | dbg_err("bad alignment"); | ||
177 | goto bad; | ||
178 | } | ||
179 | |||
180 | n = alignment % ubi->min_io_size; | ||
181 | if (alignment != 1 && n) { | ||
182 | dbg_err("alignment is not multiple of min I/O unit"); | ||
183 | goto bad; | ||
184 | } | ||
185 | |||
186 | n = ubi->leb_size % alignment; | ||
187 | if (data_pad != n) { | ||
188 | dbg_err("bad data_pad, has to be %d", n); | ||
189 | goto bad; | ||
190 | } | ||
191 | |||
192 | if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) { | ||
193 | dbg_err("bad vol_type"); | ||
194 | goto bad; | ||
195 | } | ||
196 | |||
197 | if (upd_marker != 0 && upd_marker != 1) { | ||
198 | dbg_err("bad upd_marker"); | ||
199 | goto bad; | ||
200 | } | ||
201 | |||
202 | if (reserved_pebs > ubi->good_peb_count) { | ||
203 | dbg_err("too large reserved_pebs, good PEBs %d", | ||
204 | ubi->good_peb_count); | ||
205 | goto bad; | ||
206 | } | ||
207 | |||
208 | if (name_len > UBI_VOL_NAME_MAX) { | ||
209 | dbg_err("too long volume name, max %d", | ||
210 | UBI_VOL_NAME_MAX); | ||
211 | goto bad; | ||
212 | } | ||
213 | |||
214 | if (name[0] == '\0') { | ||
215 | dbg_err("NULL volume name"); | ||
216 | goto bad; | ||
217 | } | ||
218 | |||
219 | if (name_len != strnlen(name, name_len + 1)) { | ||
220 | dbg_err("bad name_len"); | ||
221 | goto bad; | ||
222 | } | ||
223 | } | ||
224 | |||
225 | /* Checks that all names are unique */ | ||
226 | for (i = 0; i < ubi->vtbl_slots - 1; i++) { | ||
227 | for (n = i + 1; n < ubi->vtbl_slots; n++) { | ||
228 | int len1 = ubi16_to_cpu(vtbl[i].name_len); | ||
229 | int len2 = ubi16_to_cpu(vtbl[n].name_len); | ||
230 | |||
231 | if (len1 > 0 && len1 == len2 && | ||
232 | !strncmp(vtbl[i].name, vtbl[n].name, len1)) { | ||
233 | ubi_err("volumes %d and %d have the same name" | ||
234 | " \"%s\"", i, n, vtbl[i].name); | ||
235 | ubi_dbg_dump_vtbl_record(&vtbl[i], i); | ||
236 | ubi_dbg_dump_vtbl_record(&vtbl[n], n); | ||
237 | return -EINVAL; | ||
238 | } | ||
239 | } | ||
240 | } | ||
241 | |||
242 | return 0; | ||
243 | |||
244 | bad: | ||
245 | ubi_err("volume table check failed, record %d", i); | ||
246 | ubi_dbg_dump_vtbl_record(&vtbl[i], i); | ||
247 | return -EINVAL; | ||
248 | } | ||
249 | |||
250 | /** | ||
251 | * create_vtbl - create a copy of volume table. | ||
252 | * @ubi: UBI device description object | ||
253 | * @si: scanning information | ||
254 | * @copy: number of the volume table copy | ||
255 | * @vtbl: contents of the volume table | ||
256 | * | ||
257 | * This function returns zero in case of success and a negative error code in | ||
258 | * case of failure. | ||
259 | */ | ||
260 | static int create_vtbl(const struct ubi_device *ubi, struct ubi_scan_info *si, | ||
261 | int copy, void *vtbl) | ||
262 | { | ||
263 | int err, tries = 0; | ||
264 | struct ubi_vid_hdr *vid_hdr; | ||
265 | struct ubi_scan_volume *sv; | ||
266 | struct ubi_scan_leb *new_seb, *old_seb = NULL; | ||
267 | |||
268 | ubi_msg("create volume table (copy #%d)", copy + 1); | ||
269 | |||
270 | vid_hdr = ubi_zalloc_vid_hdr(ubi); | ||
271 | if (!vid_hdr) | ||
272 | return -ENOMEM; | ||
273 | |||
274 | /* | ||
275 | * Check if a logical eraseblock which would have to contain this | ||
276 | * volume table copy was found during scanning. If so, it has to be | ||
277 | * wiped out. | ||
278 | */ | ||
279 | sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID); | ||
280 | if (sv) | ||
281 | old_seb = ubi_scan_find_seb(sv, copy); | ||
282 | |||
283 | retry: | ||
284 | new_seb = ubi_scan_get_free_peb(ubi, si); | ||
285 | if (IS_ERR(new_seb)) { | ||
286 | err = PTR_ERR(new_seb); | ||
287 | goto out_free; | ||
288 | } | ||
289 | |||
290 | vid_hdr->vol_type = UBI_VID_DYNAMIC; | ||
291 | vid_hdr->vol_id = cpu_to_ubi32(UBI_LAYOUT_VOL_ID); | ||
292 | vid_hdr->compat = UBI_LAYOUT_VOLUME_COMPAT; | ||
293 | vid_hdr->data_size = vid_hdr->used_ebs = | ||
294 | vid_hdr->data_pad = cpu_to_ubi32(0); | ||
295 | vid_hdr->lnum = cpu_to_ubi32(copy); | ||
296 | vid_hdr->sqnum = cpu_to_ubi64(++si->max_sqnum); | ||
297 | vid_hdr->leb_ver = cpu_to_ubi32(old_seb ? old_seb->leb_ver + 1 : 0); | ||
298 | |||
299 | /* The EC header is already there, write the VID header */ | ||
300 | err = ubi_io_write_vid_hdr(ubi, new_seb->pnum, vid_hdr); | ||
301 | if (err) | ||
302 | goto write_error; | ||
303 | |||
304 | /* Write the layout volume contents */ | ||
305 | err = ubi_io_write_data(ubi, vtbl, new_seb->pnum, 0, ubi->vtbl_size); | ||
306 | if (err) | ||
307 | goto write_error; | ||
308 | |||
309 | /* | ||
310 | * And add it to the scanning information. Don't delete the old | ||
311 | * @old_seb as it will be deleted and freed in 'ubi_scan_add_used()'. | ||
312 | */ | ||
313 | err = ubi_scan_add_used(ubi, si, new_seb->pnum, new_seb->ec, | ||
314 | vid_hdr, 0); | ||
315 | kfree(new_seb); | ||
316 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
317 | return err; | ||
318 | |||
319 | write_error: | ||
320 | /* Maybe this physical eraseblock went bad, try to pick another one */ | ||
321 | if (++tries <= 5) | ||
322 | err = ubi_scan_add_to_list(si, new_seb->pnum, new_seb->ec, | ||
323 | &si->corr); | ||
324 | kfree(new_seb); | ||
325 | if (tries <= 5 && !err) | ||
326 | goto retry; | ||
327 | |||
328 | out_free: | ||
329 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
330 | return err; | ||
331 | |||
332 | } | ||
333 | |||
334 | /** | ||
335 | * process_lvol - process the layout volume. | ||
336 | * @ubi: UBI device description object | ||
337 | * @si: scanning information | ||
338 | * @sv: layout volume scanning information | ||
339 | * | ||
340 | * This function is responsible for reading the layout volume, ensuring it is | ||
341 | * not corrupted, and recovering from corruptions if needed. Returns the volume | ||
342 | * table in case of success and a negative error code in case of failure. | ||
343 | */ | ||
344 | static struct ubi_vtbl_record *process_lvol(const struct ubi_device *ubi, | ||
345 | struct ubi_scan_info *si, | ||
346 | struct ubi_scan_volume *sv) | ||
347 | { | ||
348 | int err; | ||
349 | struct rb_node *rb; | ||
350 | struct ubi_scan_leb *seb; | ||
351 | struct ubi_vtbl_record *leb[UBI_LAYOUT_VOLUME_EBS] = { NULL, NULL }; | ||
352 | int leb_corrupted[UBI_LAYOUT_VOLUME_EBS] = {1, 1}; | ||
353 | |||
354 | /* | ||
355 | * UBI goes through the following steps when it changes the layout | ||
356 | * volume: | ||
357 | * a. erase LEB 0; | ||
358 | * b. write new data to LEB 0; | ||
359 | * c. erase LEB 1; | ||
360 | * d. write new data to LEB 1. | ||
361 | * | ||
362 | * Before the change, both LEBs contain the same data. | ||
363 | * | ||
364 | * Due to unclean reboots, the contents of LEB 0 may be lost, but there | ||
365 | * should still be LEB 1. So it is OK if LEB 0 is corrupted while LEB 1 is not. | ||
366 | * Similarly, LEB 1 may be lost, but there should be LEB 0. And | ||
367 | * finally, unclean reboots may result in a situation when neither LEB | ||
368 | * 0 nor LEB 1 are corrupted, but they are different. In this case, LEB | ||
369 | * 0 contains more recent information. | ||
370 | * | ||
371 | * So the plan is to first check LEB 0. Then | ||
372 | * a. if LEB 0 is OK, it must contain the most recent data; then | ||
373 | * we compare it with LEB 1, and if they are different, we copy LEB | ||
374 | * 0 to LEB 1; | ||
375 | * b. if LEB 0 is corrupted, LEB 1 has to be OK; in this case we copy LEB 1 | ||
376 | * to LEB 0. | ||
377 | */ | ||
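/*
 * A compact summary of the cases handled below:
 *
 *	LEB 0   LEB 1     action
 *	-----   -------   ---------------------------------------------
 *	OK      same      nothing to do, use LEB 0
 *	OK      differs   re-create copy #2 from LEB 0 (create_vtbl)
 *	bad     OK        re-create copy #1 from LEB 1 (create_vtbl)
 *	bad     bad       fail: both volume table copies are corrupted
 */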
378 | |||
379 | dbg_msg("check layout volume"); | ||
380 | |||
381 | /* Read both LEB 0 and LEB 1 into memory */ | ||
382 | ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) { | ||
383 | leb[seb->lnum] = kzalloc(ubi->vtbl_size, GFP_KERNEL); | ||
384 | if (!leb[seb->lnum]) { | ||
385 | err = -ENOMEM; | ||
386 | goto out_free; | ||
387 | } | ||
388 | |||
389 | err = ubi_io_read_data(ubi, leb[seb->lnum], seb->pnum, 0, | ||
390 | ubi->vtbl_size); | ||
391 | if (err == UBI_IO_BITFLIPS || err == -EBADMSG) | ||
392 | /* Scrub the PEB later */ | ||
393 | seb->scrub = 1; | ||
394 | else if (err) | ||
395 | goto out_free; | ||
396 | } | ||
397 | |||
398 | err = -EINVAL; | ||
399 | if (leb[0]) { | ||
400 | leb_corrupted[0] = vtbl_check(ubi, leb[0]); | ||
401 | if (leb_corrupted[0] < 0) | ||
402 | goto out_free; | ||
403 | } | ||
404 | |||
405 | if (!leb_corrupted[0]) { | ||
406 | /* LEB 0 is OK */ | ||
407 | if (leb[1]) | ||
408 | leb_corrupted[1] = memcmp(leb[0], leb[1], ubi->vtbl_size); | ||
409 | if (leb_corrupted[1]) { | ||
410 | ubi_warn("volume table copy #2 is corrupted"); | ||
411 | err = create_vtbl(ubi, si, 1, leb[0]); | ||
412 | if (err) | ||
413 | goto out_free; | ||
414 | ubi_msg("volume table was restored"); | ||
415 | } | ||
416 | |||
417 | /* Both LEB 0 and LEB 1 are OK and consistent */ | ||
418 | kfree(leb[1]); | ||
419 | return leb[0]; | ||
420 | } else { | ||
421 | /* LEB 0 is corrupted or does not exist */ | ||
422 | if (leb[1]) { | ||
423 | leb_corrupted[1] = vtbl_check(ubi, leb[1]); | ||
424 | if (leb_corrupted[1] < 0) | ||
425 | goto out_free; | ||
426 | } | ||
427 | if (leb_corrupted[1]) { | ||
428 | /* Both LEB 0 and LEB 1 are corrupted */ | ||
429 | ubi_err("both volume tables are corrupted"); | ||
430 | goto out_free; | ||
431 | } | ||
432 | |||
433 | ubi_warn("volume table copy #1 is corrupted"); | ||
434 | err = create_vtbl(ubi, si, 0, leb[1]); | ||
435 | if (err) | ||
436 | goto out_free; | ||
437 | ubi_msg("volume table was restored"); | ||
438 | |||
439 | kfree(leb[0]); | ||
440 | return leb[1]; | ||
441 | } | ||
442 | |||
443 | out_free: | ||
444 | kfree(leb[0]); | ||
445 | kfree(leb[1]); | ||
446 | return ERR_PTR(err); | ||
447 | } | ||
448 | |||
449 | /** | ||
450 | * create_empty_lvol - create empty layout volume. | ||
451 | * @ubi: UBI device description object | ||
452 | * @si: scanning information | ||
453 | * | ||
454 | * This function returns volume table contents in case of success and a | ||
455 | * negative error code in case of failure. | ||
456 | */ | ||
457 | static struct ubi_vtbl_record *create_empty_lvol(const struct ubi_device *ubi, | ||
458 | struct ubi_scan_info *si) | ||
459 | { | ||
460 | int i; | ||
461 | struct ubi_vtbl_record *vtbl; | ||
462 | |||
463 | vtbl = kzalloc(ubi->vtbl_size, GFP_KERNEL); | ||
464 | if (!vtbl) | ||
465 | return ERR_PTR(-ENOMEM); | ||
466 | |||
467 | for (i = 0; i < ubi->vtbl_slots; i++) | ||
468 | memcpy(&vtbl[i], &empty_vtbl_record, UBI_VTBL_RECORD_SIZE); | ||
469 | |||
470 | for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) { | ||
471 | int err; | ||
472 | |||
473 | err = create_vtbl(ubi, si, i, vtbl); | ||
474 | if (err) { | ||
475 | kfree(vtbl); | ||
476 | return ERR_PTR(err); | ||
477 | } | ||
478 | } | ||
479 | |||
480 | return vtbl; | ||
481 | } | ||
482 | |||
483 | /** | ||
484 | * init_volumes - initialize volume information for existing volumes. | ||
485 | * @ubi: UBI device description object | ||
486 | * @si: scanning information | ||
487 | * @vtbl: volume table | ||
488 | * | ||
489 | * This function allocates volume description objects for existing volumes. | ||
490 | * Returns zero in case of success and a negative error code in case of | ||
491 | * failure. | ||
492 | */ | ||
493 | static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si, | ||
494 | const struct ubi_vtbl_record *vtbl) | ||
495 | { | ||
496 | int i, reserved_pebs = 0; | ||
497 | struct ubi_scan_volume *sv; | ||
498 | struct ubi_volume *vol; | ||
499 | |||
500 | for (i = 0; i < ubi->vtbl_slots; i++) { | ||
501 | cond_resched(); | ||
502 | |||
503 | if (ubi32_to_cpu(vtbl[i].reserved_pebs) == 0) | ||
504 | continue; /* Empty record */ | ||
505 | |||
506 | vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL); | ||
507 | if (!vol) | ||
508 | return -ENOMEM; | ||
509 | |||
510 | vol->reserved_pebs = ubi32_to_cpu(vtbl[i].reserved_pebs); | ||
511 | vol->alignment = ubi32_to_cpu(vtbl[i].alignment); | ||
512 | vol->data_pad = ubi32_to_cpu(vtbl[i].data_pad); | ||
513 | vol->vol_type = vtbl[i].vol_type == UBI_VID_DYNAMIC ? | ||
514 | UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME; | ||
515 | vol->name_len = ubi16_to_cpu(vtbl[i].name_len); | ||
516 | vol->usable_leb_size = ubi->leb_size - vol->data_pad; | ||
517 | memcpy(vol->name, vtbl[i].name, vol->name_len); | ||
518 | vol->name[vol->name_len] = '\0'; | ||
519 | vol->vol_id = i; | ||
520 | |||
521 | ubi_assert(!ubi->volumes[i]); | ||
522 | ubi->volumes[i] = vol; | ||
523 | ubi->vol_count += 1; | ||
524 | vol->ubi = ubi; | ||
525 | reserved_pebs += vol->reserved_pebs; | ||
526 | |||
527 | /* | ||
528 | * In case of dynamic volumes UBI knows nothing about how much | ||
529 | * data is stored there. So assume the whole volume is used. | ||
530 | */ | ||
531 | if (vol->vol_type == UBI_DYNAMIC_VOLUME) { | ||
532 | vol->used_ebs = vol->reserved_pebs; | ||
533 | vol->last_eb_bytes = vol->usable_leb_size; | ||
534 | vol->used_bytes = vol->used_ebs * vol->usable_leb_size; | ||
535 | continue; | ||
536 | } | ||
537 | |||
538 | /* Static volumes only */ | ||
539 | sv = ubi_scan_find_sv(si, i); | ||
540 | if (!sv) { | ||
541 | /* | ||
542 | * No eraseblocks belonging to this volume found. We | ||
543 | * don't actually know whether this static volume is | ||
544 | * completely corrupted or just contains no data. And | ||
545 | * we cannot know this as long as data size is not | ||
546 | * stored on flash. So we just assume the volume is | ||
547 | * empty. FIXME: this should be handled. | ||
548 | */ | ||
549 | continue; | ||
550 | } | ||
551 | |||
552 | if (sv->leb_count != sv->used_ebs) { | ||
553 | /* | ||
554 | * We found a static volume which is missing several | ||
555 | * eraseblocks. Treat it as corrupted. | ||
556 | */ | ||
557 | ubi_warn("static volume %d misses %d LEBs - corrupted", | ||
558 | sv->vol_id, sv->used_ebs - sv->leb_count); | ||
559 | vol->corrupted = 1; | ||
560 | continue; | ||
561 | } | ||
562 | |||
563 | vol->used_ebs = sv->used_ebs; | ||
564 | vol->used_bytes = (vol->used_ebs - 1) * vol->usable_leb_size; | ||
565 | vol->used_bytes += sv->last_data_size; | ||
566 | vol->last_eb_bytes = sv->last_data_size; | ||
567 | } | ||
568 | |||
569 | vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL); | ||
570 | if (!vol) | ||
571 | return -ENOMEM; | ||
572 | |||
573 | vol->reserved_pebs = UBI_LAYOUT_VOLUME_EBS; | ||
574 | vol->alignment = 1; | ||
575 | vol->vol_type = UBI_DYNAMIC_VOLUME; | ||
576 | vol->name_len = sizeof(UBI_LAYOUT_VOLUME_NAME) - 1; | ||
577 | memcpy(vol->name, UBI_LAYOUT_VOLUME_NAME, vol->name_len + 1); | ||
578 | vol->usable_leb_size = ubi->leb_size; | ||
579 | vol->used_ebs = vol->reserved_pebs; | ||
580 | vol->last_eb_bytes = vol->reserved_pebs; | ||
581 | vol->used_bytes = vol->used_ebs * (ubi->leb_size - vol->data_pad); | ||
582 | vol->vol_id = UBI_LAYOUT_VOL_ID; | ||
583 | |||
584 | ubi_assert(!ubi->volumes[i]); | ||
585 | ubi->volumes[vol_id2idx(ubi, vol->vol_id)] = vol; | ||
586 | reserved_pebs += vol->reserved_pebs; | ||
587 | ubi->vol_count += 1; | ||
588 | vol->ubi = ubi; | ||
589 | |||
590 | if (reserved_pebs > ubi->avail_pebs) | ||
591 | ubi_err("not enough PEBs, required %d, available %d", | ||
592 | reserved_pebs, ubi->avail_pebs); | ||
593 | ubi->rsvd_pebs += reserved_pebs; | ||
594 | ubi->avail_pebs -= reserved_pebs; | ||
595 | |||
596 | return 0; | ||
597 | } | ||
598 | |||
599 | /** | ||
600 | * check_sv - check volume scanning information. | ||
601 | * @vol: UBI volume description object | ||
602 | * @sv: volume scanning information | ||
603 | * | ||
604 | * This function returns zero if the volume scanning information is consistent | ||
605 | * with the data read from the volume table, and %-EINVAL if not. | ||
606 | */ | ||
607 | static int check_sv(const struct ubi_volume *vol, | ||
608 | const struct ubi_scan_volume *sv) | ||
609 | { | ||
610 | if (sv->highest_lnum >= vol->reserved_pebs) { | ||
611 | dbg_err("bad highest_lnum"); | ||
612 | goto bad; | ||
613 | } | ||
614 | if (sv->leb_count > vol->reserved_pebs) { | ||
615 | dbg_err("bad leb_count"); | ||
616 | goto bad; | ||
617 | } | ||
618 | if (sv->vol_type != vol->vol_type) { | ||
619 | dbg_err("bad vol_type"); | ||
620 | goto bad; | ||
621 | } | ||
622 | if (sv->used_ebs > vol->reserved_pebs) { | ||
623 | dbg_err("bad used_ebs"); | ||
624 | goto bad; | ||
625 | } | ||
626 | if (sv->data_pad != vol->data_pad) { | ||
627 | dbg_err("bad data_pad"); | ||
628 | goto bad; | ||
629 | } | ||
630 | return 0; | ||
631 | |||
632 | bad: | ||
633 | ubi_err("bad scanning information"); | ||
634 | ubi_dbg_dump_sv(sv); | ||
635 | ubi_dbg_dump_vol_info(vol); | ||
636 | return -EINVAL; | ||
637 | } | ||
638 | |||
639 | /** | ||
640 | * check_scanning_info - check the scanning information. | ||
641 | * @ubi: UBI device description object | ||
642 | * @si: scanning information | ||
643 | * | ||
644 | * Even though we protect on-flash data by CRC checksums, we still don't trust | ||
645 | * the media. This function ensures that scanning information is consistent with | ||
646 | * the information read from the volume table. Returns zero if the scanning | ||
647 | * information is OK and %-EINVAL if it is not. | ||
648 | */ | ||
649 | static int check_scanning_info(const struct ubi_device *ubi, | ||
650 | struct ubi_scan_info *si) | ||
651 | { | ||
652 | int err, i; | ||
653 | struct ubi_scan_volume *sv; | ||
654 | struct ubi_volume *vol; | ||
655 | |||
656 | if (si->vols_found > UBI_INT_VOL_COUNT + ubi->vtbl_slots) { | ||
657 | ubi_err("scanning found %d volumes, maximum is %d + %d", | ||
658 | si->vols_found, UBI_INT_VOL_COUNT, ubi->vtbl_slots); | ||
659 | return -EINVAL; | ||
660 | } | ||
661 | |||
662 | if (si->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT && | ||
663 | si->highest_vol_id < UBI_INTERNAL_VOL_START) { | ||
664 | ubi_err("too large volume ID %d found by scanning", | ||
665 | si->highest_vol_id); | ||
666 | return -EINVAL; | ||
667 | } | ||
668 | |||
669 | |||
670 | for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { | ||
671 | cond_resched(); | ||
672 | |||
673 | sv = ubi_scan_find_sv(si, i); | ||
674 | vol = ubi->volumes[i]; | ||
675 | if (!vol) { | ||
676 | if (sv) | ||
677 | ubi_scan_rm_volume(si, sv); | ||
678 | continue; | ||
679 | } | ||
680 | |||
681 | if (vol->reserved_pebs == 0) { | ||
682 | ubi_assert(i < ubi->vtbl_slots); | ||
683 | |||
684 | if (!sv) | ||
685 | continue; | ||
686 | |||
687 | /* | ||
688 | * During scanning we found a volume which does not | ||
689 | * exist according to the information in the volume | ||
690 | * table. This must have happened due to an unclean | ||
691 | * reboot while the volume was being removed. Discard | ||
692 | * these eraseblocks. | ||
693 | */ | ||
694 | ubi_msg("finish volume %d removal", sv->vol_id); | ||
695 | ubi_scan_rm_volume(si, sv); | ||
696 | } else if (sv) { | ||
697 | err = check_sv(vol, sv); | ||
698 | if (err) | ||
699 | return err; | ||
700 | } | ||
701 | } | ||
702 | |||
703 | return 0; | ||
704 | } | ||
705 | |||
706 | /** | ||
707 | * ubi_read_volume_table - read the volume table. | ||
708 | * | ||
709 | * @ubi: UBI device description object | ||
710 | * @si: scanning information | ||
711 | * | ||
712 | * This function reads the volume table, checks it, recovers from errors if needed, | ||
713 | * or creates it if needed. Returns zero in case of success and a negative | ||
714 | * error code in case of failure. | ||
715 | */ | ||
716 | int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si) | ||
717 | { | ||
718 | int i, err; | ||
719 | struct ubi_scan_volume *sv; | ||
720 | |||
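	/*
	 * This constant is presumably the CRC32 (with the %UBI_CRC32_INIT seed)
	 * of an all-zeroes volume table record, so that empty records written
	 * by 'ubi_change_vtbl_record()' pass the CRC check in 'vtbl_check()'.
	 */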
721 | empty_vtbl_record.crc = cpu_to_ubi32(0xf116c36b); | ||
722 | |||
723 | /* | ||
724 | * The number of supported volumes is limited by the eraseblock size | ||
725 | * and by the UBI_MAX_VOLUMES constant. | ||
726 | */ | ||
727 | ubi->vtbl_slots = ubi->leb_size / UBI_VTBL_RECORD_SIZE; | ||
728 | if (ubi->vtbl_slots > UBI_MAX_VOLUMES) | ||
729 | ubi->vtbl_slots = UBI_MAX_VOLUMES; | ||
730 | |||
731 | ubi->vtbl_size = ubi->vtbl_slots * UBI_VTBL_RECORD_SIZE; | ||
732 | ubi->vtbl_size = ALIGN(ubi->vtbl_size, ubi->min_io_size); | ||
733 | |||
734 | sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID); | ||
735 | if (!sv) { | ||
736 | /* | ||
737 | * No logical eraseblocks belonging to the layout volume were | ||
738 | * found. This could mean that the flash is just empty. In | ||
739 | * this case we create an empty layout volume. | ||
740 | * | ||
741 | * But if the flash is not empty this must be a corruption or the | ||
742 | * MTD device just contains garbage. | ||
743 | */ | ||
744 | if (si->is_empty) { | ||
745 | ubi->vtbl = create_empty_lvol(ubi, si); | ||
746 | if (IS_ERR(ubi->vtbl)) | ||
747 | return PTR_ERR(ubi->vtbl); | ||
748 | } else { | ||
749 | ubi_err("the layout volume was not found"); | ||
750 | return -EINVAL; | ||
751 | } | ||
752 | } else { | ||
753 | if (sv->leb_count > UBI_LAYOUT_VOLUME_EBS) { | ||
754 | /* This must not happen with proper UBI images */ | ||
755 | dbg_err("too many LEBs (%d) in layout volume", | ||
756 | sv->leb_count); | ||
757 | return -EINVAL; | ||
758 | } | ||
759 | |||
760 | ubi->vtbl = process_lvol(ubi, si, sv); | ||
761 | if (IS_ERR(ubi->vtbl)) | ||
762 | return PTR_ERR(ubi->vtbl); | ||
763 | } | ||
764 | |||
765 | ubi->avail_pebs = ubi->good_peb_count; | ||
766 | |||
767 | /* | ||
768 | * The layout volume is OK, initialize the corresponding in-RAM data | ||
769 | * structures. | ||
770 | */ | ||
771 | err = init_volumes(ubi, si, ubi->vtbl); | ||
772 | if (err) | ||
773 | goto out_free; | ||
774 | |||
775 | /* | ||
776 | * Make sure that the scanning information is consistent with the | ||
777 | * information stored in the volume table. | ||
778 | */ | ||
779 | err = check_scanning_info(ubi, si); | ||
780 | if (err) | ||
781 | goto out_free; | ||
782 | |||
783 | return 0; | ||
784 | |||
785 | out_free: | ||
786 | kfree(ubi->vtbl); | ||
787 | for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) | ||
788 | if (ubi->volumes[i]) { | ||
789 | kfree(ubi->volumes[i]); | ||
790 | ubi->volumes[i] = NULL; | ||
791 | } | ||
792 | return err; | ||
793 | } | ||
794 | |||
795 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
796 | |||
797 | /** | ||
798 | * paranoid_vtbl_check - check volume table. | ||
799 | * @ubi: UBI device description object | ||
800 | */ | ||
801 | static void paranoid_vtbl_check(const struct ubi_device *ubi) | ||
802 | { | ||
803 | if (vtbl_check(ubi, ubi->vtbl)) { | ||
804 | ubi_err("paranoid check failed"); | ||
805 | BUG(); | ||
806 | } | ||
807 | } | ||
808 | |||
809 | #endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ | ||
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c new file mode 100644 index 000000000000..9ecaf77eca9e --- /dev/null +++ b/drivers/mtd/ubi/wl.c | |||
@@ -0,0 +1,1671 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Authors: Artem Bityutskiy (Битюцкий Артём), Thomas Gleixner | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * UBI wear-leveling unit. | ||
23 | * | ||
24 | * This unit is responsible for wear-leveling. It works in terms of physical | ||
25 | * eraseblocks and erase counters and knows nothing about logical eraseblocks, | ||
26 | * volumes, etc. From this unit's perspective all physical eraseblocks are of | ||
27 | * two types - used and free. Used physical eraseblocks are those that were | ||
28 | * "get" by the 'ubi_wl_get_peb()' function, and free physical eraseblocks are | ||
29 | * those that were put by the 'ubi_wl_put_peb()' function. | ||
30 | * | ||
31 | * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only the erase | ||
32 | * counter header. The rest of the physical eraseblock contains only 0xFF bytes. | ||
33 | * | ||
34 | * When physical eraseblocks are returned to the WL unit by means of the | ||
35 | * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is | ||
36 | * done asynchronously in context of the per-UBI device background thread, | ||
37 | * which is also managed by the WL unit. | ||
38 | * | ||
39 | * The wear-leveling is ensured by means of moving the contents of used | ||
40 | * physical eraseblocks with low erase counter to free physical eraseblocks | ||
41 | * with high erase counter. | ||
42 | * | ||
43 | * The 'ubi_wl_get_peb()' function accepts data type hints which help to pick | ||
44 | * an "optimal" physical eraseblock. For example, when it is known that the | ||
45 | * physical eraseblock will be "put" soon because it contains short-term data, | ||
46 | * the WL unit may pick a free physical eraseblock with low erase counter, and | ||
47 | * so forth. | ||
48 | * | ||
49 | * If the WL unit fails to erase a physical eraseblock, it marks it as bad. | ||
50 | * | ||
51 | * This unit is also responsible for scrubbing. If a bit-flip is detected in a | ||
52 | * physical eraseblock, it has to be moved. Technically this is the same as | ||
53 | * moving it for wear-leveling reasons. | ||
54 | * | ||
55 | * As it was said, for the WL unit all physical eraseblocks are either "free" | ||
56 | * or "used". Free eraseblocks are kept in the @wl->free RB-tree, while used | ||
57 | * eraseblocks are kept in a set of different RB-trees: @wl->used, | ||
58 | * @wl->prot.pnum, @wl->prot.aec, and @wl->scrub. | ||
59 | * | ||
60 | * Note, in this implementation, we keep a small in-RAM object for each physical | ||
61 | * eraseblock. This is surely not a scalable solution. But it appears to be good | ||
62 | * enough for moderately large flashes and it is simple. In the future, one may | ||
63 | * re-work this unit and make it more scalable. | ||
64 | * | ||
65 | * At the moment this unit does not utilize the sequence number, which was | ||
66 | * introduced relatively recently. But it would be wise to do this because the | ||
67 | * sequence number of a logical eraseblock characterizes how old it is. For | ||
68 | * example, when we move a PEB with low erase counter, and we need to pick the | ||
69 | * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we | ||
70 | * pick a target PEB with an average EC if our PEB is not very "old". This | ||
71 | * leaves room for future re-work of the WL unit. | ||
72 | * | ||
73 | * FIXME: looks too complex, should be simplified (later). | ||
74 | */ | ||
75 | |||
76 | #include <linux/slab.h> | ||
77 | #include <linux/crc32.h> | ||
78 | #include <linux/freezer.h> | ||
79 | #include <linux/kthread.h> | ||
80 | #include "ubi.h" | ||
81 | |||
82 | /* Number of physical eraseblocks reserved for wear-leveling purposes */ | ||
83 | #define WL_RESERVED_PEBS 1 | ||
84 | |||
85 | /* | ||
86 | * For how many erase cycles short term, unknown, and long term physical | ||
87 | * eraseblocks are protected. | ||
88 | */ | ||
89 | #define ST_PROTECTION 16 | ||
90 | #define U_PROTECTION 10 | ||
91 | #define LT_PROTECTION 4 | ||
92 | |||
93 | /* | ||
94 | * Maximum difference between two erase counters. If this threshold is | ||
95 | * exceeded, the WL unit starts moving data from used physical eraseblocks with | ||
96 | * low erase counter to free physical eraseblocks with high erase counter. | ||
97 | */ | ||
98 | #define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD | ||
99 | |||
100 | /* | ||
101 | * When a physical eraseblock is moved, the WL unit has to pick the target | ||
102 | * physical eraseblock to move to. The simplest way would be just to pick the | ||
103 | * one with the highest erase counter. But in certain workloads this could lead | ||
104 | * to unlimited wear of one or a few physical eraseblocks. Indeed, imagine a | ||
105 | * situation when the picked physical eraseblock is constantly erased after the | ||
106 | * data is written to it. So, we have a constant which limits the highest erase | ||
107 | * counter of the free physical eraseblock to pick. Namely, the WL unit does | ||
108 | * not pick eraseblocks with erase counter greater than the lowest erase | ||
109 | * counter plus %WL_FREE_MAX_DIFF. | ||
110 | */ | ||
111 | #define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD) | ||
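/*
 * For example, if %UBI_WL_THRESHOLD is configured to 4096, %WL_FREE_MAX_DIFF
 * evaluates to 8192: free physical eraseblocks whose erase counter exceeds the
 * lowest free erase counter by more than 8192 are never picked as targets.
 */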
112 | |||
113 | /* | ||
114 | * Maximum number of consecutive background thread failures which is enough to | ||
115 | * switch to read-only mode. | ||
116 | */ | ||
117 | #define WL_MAX_FAILURES 32 | ||
118 | |||
119 | /** | ||
120 | * struct ubi_wl_entry - wear-leveling entry. | ||
121 | * @rb: link in the corresponding RB-tree | ||
122 | * @ec: erase counter | ||
123 | * @pnum: physical eraseblock number | ||
124 | * | ||
125 | * Each physical eraseblock has a corresponding &struct ubi_wl_entry object which | ||
126 | * may be kept in different RB-trees. | ||
127 | */ | ||
128 | struct ubi_wl_entry { | ||
129 | struct rb_node rb; | ||
130 | int ec; | ||
131 | int pnum; | ||
132 | }; | ||
133 | |||
134 | /** | ||
135 | * struct ubi_wl_prot_entry - PEB protection entry. | ||
136 | * @rb_pnum: link in the @wl->prot.pnum RB-tree | ||
137 | * @rb_aec: link in the @wl->prot.aec RB-tree | ||
138 | * @abs_ec: the absolute erase counter value when the protection ends | ||
139 | * @e: the wear-leveling entry of the physical eraseblock under protection | ||
140 | * | ||
141 | * When the WL unit returns a physical eraseblock, the physical eraseblock is | ||
142 | * protected from being moved for some "time". For this reason, the physical | ||
143 | * eraseblock is not directly moved from the @wl->free tree to the @wl->used | ||
144 | * tree. There is one more tree in between where this physical eraseblock is | ||
145 | * temporarily stored (@wl->prot). | ||
146 | * | ||
147 | * All this protection stuff is needed because: | ||
148 | * o we don't want to move physical eraseblocks just after we have given them | ||
149 | * to the user; instead, we first want to let users fill them up with data; | ||
150 | * | ||
151 | * o there is a chance that the user will put the physical eraseblock very | ||
152 | * soon, so it makes sense not to move it for some time, but wait; this is | ||
153 | * especially important in case of "short term" physical eraseblocks. | ||
154 | * | ||
155 | * Physical eraseblocks stay protected only for a limited time. But the "time" is | ||
156 | * measured in erase cycles in this case. This is implemented with the help of the | ||
157 | * absolute erase counter (@wl->abs_ec). When it reaches a certain value, the | ||
158 | * physical eraseblocks are moved from the protection trees (@wl->prot.*) to | ||
159 | * the @wl->used tree. | ||
160 | * | ||
161 | * Protected physical eraseblocks are searched by physical eraseblock number | ||
162 | * (when they are put) and by the absolute erase counter (to check if it is | ||
163 | * time to move them to the @wl->used tree). So there are actually 2 RB-trees | ||
164 | * storing the protected physical eraseblocks: @wl->prot.pnum and | ||
165 | * @wl->prot.aec. They are referred to as the "protection" trees. The | ||
166 | * first one is indexed by the physical eraseblock number. The second one is | ||
167 | * indexed by the absolute erase counter. Both trees store | ||
168 | * &struct ubi_wl_prot_entry objects. | ||
169 | * | ||
170 | * Each physical eraseblock has 2 main states: free and used. The former state | ||
171 | * corresponds to the @wl->free tree. The latter state is split up into several | ||
172 | * sub-states: | ||
173 | * o the WL movement is allowed (@wl->used tree); | ||
174 | * o the WL movement is temporarily prohibited (@wl->prot.pnum and | ||
175 | * @wl->prot.aec trees); | ||
176 | * o scrubbing is needed (@wl->scrub tree). | ||
177 | * | ||
178 | * Depending on the sub-state, wear-leveling entries of the used physical | ||
179 | * eraseblocks may be kept in one of those trees. | ||
180 | */ | ||
181 | struct ubi_wl_prot_entry { | ||
182 | struct rb_node rb_pnum; | ||
183 | struct rb_node rb_aec; | ||
184 | unsigned long long abs_ec; | ||
185 | struct ubi_wl_entry *e; | ||
186 | }; | ||
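/*
 * Roughly, the resulting life cycle of a physical eraseblock in terms of the
 * trees described above (scrubbing and error paths simplified):
 *
 *	free --(ubi_wl_get_peb)--> prot.pnum/prot.aec --(protection over)--> used
 *	used or scrub --(ubi_wl_put_peb, schedule_erase)--> erased --> free again
 */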
187 | |||
188 | /** | ||
189 | * struct ubi_work - UBI work description data structure. | ||
190 | * @list: a link in the list of pending works | ||
191 | * @func: worker function | ||
192 | * @priv: private data of the worker function | ||
193 | * | ||
194 | * @e: physical eraseblock to erase | ||
195 | * @torture: if the physical eraseblock has to be tortured | ||
196 | * | ||
197 | * The @func pointer points to the worker function. If the @cancel argument is | ||
198 | * not zero, the worker has to free the resources and exit immediately. The | ||
199 | * worker has to return zero in case of success and a negative error code in | ||
200 | * case of failure. | ||
201 | */ | ||
202 | struct ubi_work { | ||
203 | struct list_head list; | ||
204 | int (*func)(struct ubi_device *ubi, struct ubi_work *wrk, int cancel); | ||
205 | /* The below fields are only relevant to erasure works */ | ||
206 | struct ubi_wl_entry *e; | ||
207 | int torture; | ||
208 | }; | ||
209 | |||
210 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
211 | static int paranoid_check_ec(const struct ubi_device *ubi, int pnum, int ec); | ||
212 | static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, | ||
213 | struct rb_root *root); | ||
214 | #else | ||
215 | #define paranoid_check_ec(ubi, pnum, ec) 0 | ||
216 | #define paranoid_check_in_wl_tree(e, root) | ||
217 | #endif | ||
218 | |||
219 | /* Slab cache for wear-leveling entries */ | ||
220 | static struct kmem_cache *wl_entries_slab; | ||
221 | |||
222 | /** | ||
223 | * tree_empty - a helper function to check if an RB-tree is empty. | ||
224 | * @root: the root of the tree | ||
225 | * | ||
226 | * This function returns non-zero if the RB-tree is empty and zero if not. | ||
227 | */ | ||
228 | static inline int tree_empty(struct rb_root *root) | ||
229 | { | ||
230 | return root->rb_node == NULL; | ||
231 | } | ||
232 | |||
233 | /** | ||
234 | * wl_tree_add - add a wear-leveling entry to a WL RB-tree. | ||
235 | * @e: the wear-leveling entry to add | ||
236 | * @root: the root of the tree | ||
237 | * | ||
238 | * Note, we use (erase counter, physical eraseblock number) pairs as keys in | ||
239 | * the @ubi->used and @ubi->free RB-trees. | ||
240 | */ | ||
241 | static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root) | ||
242 | { | ||
243 | struct rb_node **p, *parent = NULL; | ||
244 | |||
245 | p = &root->rb_node; | ||
246 | while (*p) { | ||
247 | struct ubi_wl_entry *e1; | ||
248 | |||
249 | parent = *p; | ||
250 | e1 = rb_entry(parent, struct ubi_wl_entry, rb); | ||
251 | |||
252 | if (e->ec < e1->ec) | ||
253 | p = &(*p)->rb_left; | ||
254 | else if (e->ec > e1->ec) | ||
255 | p = &(*p)->rb_right; | ||
256 | else { | ||
257 | ubi_assert(e->pnum != e1->pnum); | ||
258 | if (e->pnum < e1->pnum) | ||
259 | p = &(*p)->rb_left; | ||
260 | else | ||
261 | p = &(*p)->rb_right; | ||
262 | } | ||
263 | } | ||
264 | |||
265 | rb_link_node(&e->rb, parent, p); | ||
266 | rb_insert_color(&e->rb, root); | ||
267 | } | ||
268 | |||
269 | |||
270 | /* | ||
271 | * Helper functions to add and delete wear-leveling entries from different | ||
272 | * trees. | ||
273 | */ | ||
274 | |||
275 | static void free_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e) | ||
276 | { | ||
277 | wl_tree_add(e, &ubi->free); | ||
278 | } | ||
279 | static inline void used_tree_add(struct ubi_device *ubi, | ||
280 | struct ubi_wl_entry *e) | ||
281 | { | ||
282 | wl_tree_add(e, &ubi->used); | ||
283 | } | ||
284 | static inline void scrub_tree_add(struct ubi_device *ubi, | ||
285 | struct ubi_wl_entry *e) | ||
286 | { | ||
287 | wl_tree_add(e, &ubi->scrub); | ||
288 | } | ||
289 | static inline void free_tree_del(struct ubi_device *ubi, | ||
290 | struct ubi_wl_entry *e) | ||
291 | { | ||
292 | paranoid_check_in_wl_tree(e, &ubi->free); | ||
293 | rb_erase(&e->rb, &ubi->free); | ||
294 | } | ||
295 | static inline void used_tree_del(struct ubi_device *ubi, | ||
296 | struct ubi_wl_entry *e) | ||
297 | { | ||
298 | paranoid_check_in_wl_tree(e, &ubi->used); | ||
299 | rb_erase(&e->rb, &ubi->used); | ||
300 | } | ||
301 | static inline void scrub_tree_del(struct ubi_device *ubi, | ||
302 | struct ubi_wl_entry *e) | ||
303 | { | ||
304 | paranoid_check_in_wl_tree(e, &ubi->scrub); | ||
305 | rb_erase(&e->rb, &ubi->scrub); | ||
306 | } | ||
307 | |||
308 | /** | ||
309 | * do_work - do one pending work. | ||
310 | * @ubi: UBI device description object | ||
311 | * | ||
312 | * This function returns zero in case of success and a negative error code in | ||
313 | * case of failure. | ||
314 | */ | ||
315 | static int do_work(struct ubi_device *ubi) | ||
316 | { | ||
317 | int err; | ||
318 | struct ubi_work *wrk; | ||
319 | |||
320 | spin_lock(&ubi->wl_lock); | ||
321 | |||
322 | if (list_empty(&ubi->works)) { | ||
323 | spin_unlock(&ubi->wl_lock); | ||
324 | return 0; | ||
325 | } | ||
326 | |||
327 | wrk = list_entry(ubi->works.next, struct ubi_work, list); | ||
328 | list_del(&wrk->list); | ||
329 | spin_unlock(&ubi->wl_lock); | ||
330 | |||
331 | /* | ||
332 | * Call the worker function. Do not touch the work structure | ||
333 | * after this call as it will have been freed or reused by that | ||
334 | * time by the worker function. | ||
335 | */ | ||
336 | err = wrk->func(ubi, wrk, 0); | ||
337 | if (err) | ||
338 | ubi_err("work failed with error code %d", err); | ||
339 | |||
340 | spin_lock(&ubi->wl_lock); | ||
341 | ubi->works_count -= 1; | ||
342 | ubi_assert(ubi->works_count >= 0); | ||
343 | spin_unlock(&ubi->wl_lock); | ||
344 | return err; | ||
345 | } | ||
346 | |||
347 | /** | ||
348 | * produce_free_peb - produce a free physical eraseblock. | ||
349 | * @ubi: UBI device description object | ||
350 | * | ||
351 | * This function tries to make a free PEB by means of synchronous execution of | ||
352 | * pending works. This may be needed if, for example, the background thread is | ||
353 | * disabled. Returns zero in case of success and a negative error code in case | ||
354 | * of failure. | ||
355 | */ | ||
356 | static int produce_free_peb(struct ubi_device *ubi) | ||
357 | { | ||
358 | int err; | ||
359 | |||
360 | spin_lock(&ubi->wl_lock); | ||
361 | while (tree_empty(&ubi->free)) { | ||
362 | spin_unlock(&ubi->wl_lock); | ||
363 | |||
364 | dbg_wl("do one work synchronously"); | ||
365 | err = do_work(ubi); | ||
366 | if (err) | ||
367 | return err; | ||
368 | |||
369 | spin_lock(&ubi->wl_lock); | ||
370 | } | ||
371 | spin_unlock(&ubi->wl_lock); | ||
372 | |||
373 | return 0; | ||
374 | } | ||
375 | |||
376 | /** | ||
377 | * in_wl_tree - check if wear-leveling entry is present in a WL RB-tree. | ||
378 | * @e: the wear-leveling entry to check | ||
379 | * @root: the root of the tree | ||
380 | * | ||
381 | * This function returns non-zero if @e is in the @root RB-tree and zero if it | ||
382 | * is not. | ||
383 | */ | ||
384 | static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root) | ||
385 | { | ||
386 | struct rb_node *p; | ||
387 | |||
388 | p = root->rb_node; | ||
389 | while (p) { | ||
390 | struct ubi_wl_entry *e1; | ||
391 | |||
392 | e1 = rb_entry(p, struct ubi_wl_entry, rb); | ||
393 | |||
394 | if (e->pnum == e1->pnum) { | ||
395 | ubi_assert(e == e1); | ||
396 | return 1; | ||
397 | } | ||
398 | |||
399 | if (e->ec < e1->ec) | ||
400 | p = p->rb_left; | ||
401 | else if (e->ec > e1->ec) | ||
402 | p = p->rb_right; | ||
403 | else { | ||
404 | ubi_assert(e->pnum != e1->pnum); | ||
405 | if (e->pnum < e1->pnum) | ||
406 | p = p->rb_left; | ||
407 | else | ||
408 | p = p->rb_right; | ||
409 | } | ||
410 | } | ||
411 | |||
412 | return 0; | ||
413 | } | ||
414 | |||
415 | /** | ||
416 | * prot_tree_add - add physical eraseblock to protection trees. | ||
417 | * @ubi: UBI device description object | ||
418 | * @e: the physical eraseblock to add | ||
419 | * @pe: protection entry object to use | ||
420 | * @abs_ec: for how many erase operations this physical eraseblock has to stay | ||
421 | * in the protection trees (added to the current absolute erase counter) | ||
422 | * | ||
423 | * @ubi->wl_lock has to be held. | ||
424 | */ | ||
425 | static void prot_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e, | ||
426 | struct ubi_wl_prot_entry *pe, int abs_ec) | ||
427 | { | ||
428 | struct rb_node **p, *parent = NULL; | ||
429 | struct ubi_wl_prot_entry *pe1; | ||
430 | |||
431 | pe->e = e; | ||
432 | pe->abs_ec = ubi->abs_ec + abs_ec; | ||
433 | |||
434 | p = &ubi->prot.pnum.rb_node; | ||
435 | while (*p) { | ||
436 | parent = *p; | ||
437 | pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_pnum); | ||
438 | |||
439 | if (e->pnum < pe1->e->pnum) | ||
440 | p = &(*p)->rb_left; | ||
441 | else | ||
442 | p = &(*p)->rb_right; | ||
443 | } | ||
444 | rb_link_node(&pe->rb_pnum, parent, p); | ||
445 | rb_insert_color(&pe->rb_pnum, &ubi->prot.pnum); | ||
446 | |||
447 | p = &ubi->prot.aec.rb_node; | ||
448 | parent = NULL; | ||
449 | while (*p) { | ||
450 | parent = *p; | ||
451 | pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_aec); | ||
452 | |||
453 | if (pe->abs_ec < pe1->abs_ec) | ||
454 | p = &(*p)->rb_left; | ||
455 | else | ||
456 | p = &(*p)->rb_right; | ||
457 | } | ||
458 | rb_link_node(&pe->rb_aec, parent, p); | ||
459 | rb_insert_color(&pe->rb_aec, &ubi->prot.aec); | ||
460 | } | ||
461 | |||
462 | /** | ||
463 | * find_wl_entry - find wear-leveling entry closest to certain erase counter. | ||
464 | * @root: the RB-tree where to look for | ||
465 | * @max: highest possible erase counter | ||
466 | * | ||
467 | * This function looks for a wear leveling entry with erase counter closest to | ||
468 | * @max and less then @max. | ||
469 | */ | ||
470 | static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max) | ||
471 | { | ||
472 | struct rb_node *p; | ||
473 | struct ubi_wl_entry *e; | ||
474 | |||
475 | e = rb_entry(rb_first(root), struct ubi_wl_entry, rb); | ||
476 | max += e->ec; | ||
477 | |||
478 | p = root->rb_node; | ||
479 | while (p) { | ||
480 | struct ubi_wl_entry *e1; | ||
481 | |||
482 | e1 = rb_entry(p, struct ubi_wl_entry, rb); | ||
483 | if (e1->ec >= max) | ||
484 | p = p->rb_left; | ||
485 | else { | ||
486 | p = p->rb_right; | ||
487 | e = e1; | ||
488 | } | ||
489 | } | ||
490 | |||
491 | return e; | ||
492 | } | ||
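/*
 * A short worked example: if @root holds entries with erase counters
 * {100, 140, 190} and @max is 50, the limit becomes 100 + 50 = 150, and the
 * entry with EC 140 is returned, i.e. the highest EC still below the limit.
 */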
493 | |||
494 | /** | ||
495 | * ubi_wl_get_peb - get a physical eraseblock. | ||
496 | * @ubi: UBI device description object | ||
497 | * @dtype: type of data which will be stored in this physical eraseblock | ||
498 | * | ||
499 | * This function returns a physical eraseblock in case of success and a | ||
500 | * negative error code in case of failure. Might sleep. | ||
501 | */ | ||
502 | int ubi_wl_get_peb(struct ubi_device *ubi, int dtype) | ||
503 | { | ||
504 | int err, protect, medium_ec; | ||
505 | struct ubi_wl_entry *e, *first, *last; | ||
506 | struct ubi_wl_prot_entry *pe; | ||
507 | |||
508 | ubi_assert(dtype == UBI_LONGTERM || dtype == UBI_SHORTTERM || | ||
509 | dtype == UBI_UNKNOWN); | ||
510 | |||
511 | pe = kmalloc(sizeof(struct ubi_wl_prot_entry), GFP_KERNEL); | ||
512 | if (!pe) | ||
513 | return -ENOMEM; | ||
514 | |||
515 | retry: | ||
516 | spin_lock(&ubi->wl_lock); | ||
517 | if (tree_empty(&ubi->free)) { | ||
518 | if (ubi->works_count == 0) { | ||
519 | ubi_assert(list_empty(&ubi->works)); | ||
520 | ubi_err("no free eraseblocks"); | ||
521 | spin_unlock(&ubi->wl_lock); | ||
522 | kfree(pe); | ||
523 | return -ENOSPC; | ||
524 | } | ||
525 | spin_unlock(&ubi->wl_lock); | ||
526 | |||
527 | err = produce_free_peb(ubi); | ||
528 | if (err < 0) { | ||
529 | kfree(pe); | ||
530 | return err; | ||
531 | } | ||
532 | goto retry; | ||
533 | } | ||
534 | |||
535 | switch (dtype) { | ||
536 | case UBI_LONGTERM: | ||
537 | /* | ||
538 | * For long term data we pick a physical eraseblock | ||
539 | * with high erase counter. But the highest erase | ||
540 | * counter we can pick is bounded by the lowest | ||
541 | * erase counter plus %WL_FREE_MAX_DIFF. | ||
542 | */ | ||
543 | e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); | ||
544 | protect = LT_PROTECTION; | ||
545 | break; | ||
546 | case UBI_UNKNOWN: | ||
547 | /* | ||
548 | * For unknown data we pick a physical eraseblock with | ||
549 | * a medium erase counter. But by no means may we pick | ||
550 | * a physical eraseblock with an erase counter greater | ||
551 | * than or equal to the lowest erase counter plus | ||
552 | * %WL_FREE_MAX_DIFF. | ||
553 | */ | ||
554 | first = rb_entry(rb_first(&ubi->free), | ||
555 | struct ubi_wl_entry, rb); | ||
556 | last = rb_entry(rb_last(&ubi->free), | ||
557 | struct ubi_wl_entry, rb); | ||
558 | |||
559 | if (last->ec - first->ec < WL_FREE_MAX_DIFF) | ||
560 | e = rb_entry(ubi->free.rb_node, | ||
561 | struct ubi_wl_entry, rb); | ||
562 | else { | ||
563 | medium_ec = (first->ec + WL_FREE_MAX_DIFF)/2; | ||
564 | e = find_wl_entry(&ubi->free, medium_ec); | ||
565 | } | ||
566 | protect = U_PROTECTION; | ||
567 | break; | ||
568 | case UBI_SHORTTERM: | ||
569 | /* | ||
570 | * For short term data we pick a physical eraseblock | ||
571 | * with the lowest erase counter as we expect it will | ||
572 | * be erased soon. | ||
573 | */ | ||
574 | e = rb_entry(rb_first(&ubi->free), | ||
575 | struct ubi_wl_entry, rb); | ||
576 | protect = ST_PROTECTION; | ||
577 | break; | ||
578 | default: | ||
579 | protect = 0; | ||
580 | e = NULL; | ||
581 | BUG(); | ||
582 | } | ||
583 | |||
584 | /* | ||
585 | * Move the physical eraseblock to the protection trees where it will | ||
586 | * be protected from being moved for some time. | ||
587 | */ | ||
588 | free_tree_del(ubi, e); | ||
589 | prot_tree_add(ubi, e, pe, protect); | ||
590 | |||
591 | dbg_wl("PEB %d EC %d, protection %d", e->pnum, e->ec, protect); | ||
592 | spin_unlock(&ubi->wl_lock); | ||
593 | |||
594 | return e->pnum; | ||
595 | } | ||
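/*
 * A hypothetical caller (sketch only): higher-level UBI code asks for a PEB
 * with a data type hint, writes to it, and later returns it to the WL unit:
 *
 *	pnum = ubi_wl_get_peb(ubi, UBI_LONGTERM);
 *	if (pnum < 0)
 *		return pnum;
 *	... write a VID header and data to PEB pnum ...
 *	... when the PEB is no longer needed, return it via 'ubi_wl_put_peb()' ...
 */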
596 | |||
597 | /** | ||
598 | * prot_tree_del - remove a physical eraseblock from the protection trees | ||
599 | * @ubi: UBI device description object | ||
600 | * @pnum: the physical eraseblock to remove | ||
601 | */ | ||
602 | static void prot_tree_del(struct ubi_device *ubi, int pnum) | ||
603 | { | ||
604 | struct rb_node *p; | ||
605 | struct ubi_wl_prot_entry *pe = NULL; | ||
606 | |||
607 | p = ubi->prot.pnum.rb_node; | ||
608 | while (p) { | ||
609 | |||
610 | pe = rb_entry(p, struct ubi_wl_prot_entry, rb_pnum); | ||
611 | |||
612 | if (pnum == pe->e->pnum) | ||
613 | break; | ||
614 | |||
615 | if (pnum < pe->e->pnum) | ||
616 | p = p->rb_left; | ||
617 | else | ||
618 | p = p->rb_right; | ||
619 | } | ||
620 | |||
621 | ubi_assert(pe->e->pnum == pnum); | ||
622 | rb_erase(&pe->rb_aec, &ubi->prot.aec); | ||
623 | rb_erase(&pe->rb_pnum, &ubi->prot.pnum); | ||
624 | kfree(pe); | ||
625 | } | ||
626 | |||
627 | /** | ||
628 | * sync_erase - synchronously erase a physical eraseblock. | ||
629 | * @ubi: UBI device description object | ||
630 | * @e: the physical eraseblock to erase | ||
631 | * @torture: if the physical eraseblock has to be tortured | ||
632 | * | ||
633 | * This function returns zero in case of success and a negative error code in | ||
634 | * case of failure. | ||
635 | */ | ||
636 | static int sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int torture) | ||
637 | { | ||
638 | int err; | ||
639 | struct ubi_ec_hdr *ec_hdr; | ||
640 | unsigned long long ec = e->ec; | ||
641 | |||
642 | dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec); | ||
643 | |||
644 | err = paranoid_check_ec(ubi, e->pnum, e->ec); | ||
645 | if (err > 0) | ||
646 | return -EINVAL; | ||
647 | |||
648 | ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); | ||
649 | if (!ec_hdr) | ||
650 | return -ENOMEM; | ||
651 | |||
652 | err = ubi_io_sync_erase(ubi, e->pnum, torture); | ||
653 | if (err < 0) | ||
654 | goto out_free; | ||
655 | |||
656 | ec += err; | ||
657 | if (ec > UBI_MAX_ERASECOUNTER) { | ||
658 | /* | ||
659 | * Erase counter overflow. Upgrade UBI and use 64-bit | ||
660 | * erase counters internally. | ||
661 | */ | ||
662 | ubi_err("erase counter overflow at PEB %d, EC %llu", | ||
663 | e->pnum, ec); | ||
664 | err = -EINVAL; | ||
665 | goto out_free; | ||
666 | } | ||
667 | |||
668 | dbg_wl("erased PEB %d, new EC %llu", e->pnum, ec); | ||
669 | |||
670 | ec_hdr->ec = cpu_to_ubi64(ec); | ||
671 | |||
672 | err = ubi_io_write_ec_hdr(ubi, e->pnum, ec_hdr); | ||
673 | if (err) | ||
674 | goto out_free; | ||
675 | |||
676 | e->ec = ec; | ||
677 | spin_lock(&ubi->wl_lock); | ||
678 | if (e->ec > ubi->max_ec) | ||
679 | ubi->max_ec = e->ec; | ||
680 | spin_unlock(&ubi->wl_lock); | ||
681 | |||
682 | out_free: | ||
683 | kfree(ec_hdr); | ||
684 | return err; | ||
685 | } | ||
686 | |||
687 | /** | ||
688 | * check_protection_over - check if it is time to stop protecting some | ||
689 | * physical eraseblocks. | ||
690 | * @ubi: UBI device description object | ||
691 | * | ||
692 | * This function is called after each erase operation, when the absolute erase | ||
693 | * counter is incremented, to check if some physical eraseblocks no longer have | ||
694 | * to be protected. These physical eraseblocks are moved from the | ||
695 | * protection trees to the used tree. | ||
696 | */ | ||
697 | static void check_protection_over(struct ubi_device *ubi) | ||
698 | { | ||
699 | struct ubi_wl_prot_entry *pe; | ||
700 | |||
701 | /* | ||
702 | * There may be several protected physical eraseblocks to remove; | ||
703 | * process them all. | ||
704 | */ | ||
705 | while (1) { | ||
706 | spin_lock(&ubi->wl_lock); | ||
707 | if (tree_empty(&ubi->prot.aec)) { | ||
708 | spin_unlock(&ubi->wl_lock); | ||
709 | break; | ||
710 | } | ||
711 | |||
712 | pe = rb_entry(rb_first(&ubi->prot.aec), | ||
713 | struct ubi_wl_prot_entry, rb_aec); | ||
714 | |||
715 | if (pe->abs_ec > ubi->abs_ec) { | ||
716 | spin_unlock(&ubi->wl_lock); | ||
717 | break; | ||
718 | } | ||
719 | |||
720 | dbg_wl("PEB %d protection over, abs_ec %llu, PEB abs_ec %llu", | ||
721 | pe->e->pnum, ubi->abs_ec, pe->abs_ec); | ||
722 | rb_erase(&pe->rb_aec, &ubi->prot.aec); | ||
723 | rb_erase(&pe->rb_pnum, &ubi->prot.pnum); | ||
724 | used_tree_add(ubi, pe->e); | ||
725 | spin_unlock(&ubi->wl_lock); | ||
726 | |||
727 | kfree(pe); | ||
728 | cond_resched(); | ||
729 | } | ||
730 | } | ||
731 | |||
732 | /** | ||
733 | * schedule_ubi_work - schedule a work. | ||
734 | * @ubi: UBI device description object | ||
735 | * @wrk: the work to schedule | ||
736 | * | ||
737 | * This function enqueues a work defined by @wrk to the tail of the pending | ||
738 | * works list. | ||
739 | */ | ||
740 | static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) | ||
741 | { | ||
742 | spin_lock(&ubi->wl_lock); | ||
743 | list_add_tail(&wrk->list, &ubi->works); | ||
744 | ubi_assert(ubi->works_count >= 0); | ||
745 | ubi->works_count += 1; | ||
746 | if (ubi->thread_enabled) | ||
747 | wake_up_process(ubi->bgt_thread); | ||
748 | spin_unlock(&ubi->wl_lock); | ||
749 | } | ||
750 | |||
751 | static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, | ||
752 | int cancel); | ||
753 | |||
754 | /** | ||
755 | * schedule_erase - schedule an erase work. | ||
756 | * @ubi: UBI device description object | ||
757 | * @e: the WL entry of the physical eraseblock to erase | ||
758 | * @torture: if the physical eraseblock has to be tortured | ||
759 | * | ||
760 | * This function returns zero in case of success and %-ENOMEM in case of | ||
761 | * failure. | ||
762 | */ | ||
763 | static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, | ||
764 | int torture) | ||
765 | { | ||
766 | struct ubi_work *wl_wrk; | ||
767 | |||
768 | dbg_wl("schedule erasure of PEB %d, EC %d, torture %d", | ||
769 | e->pnum, e->ec, torture); | ||
770 | |||
771 | wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_KERNEL); | ||
772 | if (!wl_wrk) | ||
773 | return -ENOMEM; | ||
774 | |||
775 | wl_wrk->func = &erase_worker; | ||
776 | wl_wrk->e = e; | ||
777 | wl_wrk->torture = torture; | ||
778 | |||
779 | schedule_ubi_work(ubi, wl_wrk); | ||
780 | return 0; | ||
781 | } | ||
782 | |||
783 | /** | ||
784 | * wear_leveling_worker - wear-leveling worker function. | ||
785 | * @ubi: UBI device description object | ||
786 | * @wrk: the work object | ||
787 | * @cancel: non-zero if the worker has to free memory and exit | ||
788 | * | ||
789 | * This function copies a more worn out physical eraseblock to a less worn out | ||
790 | * one. Returns zero in case of success and a negative error code in case of | ||
791 | * failure. | ||
792 | */ | ||
793 | static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, | ||
794 | int cancel) | ||
795 | { | ||
796 | int err, put = 0; | ||
797 | struct ubi_wl_entry *e1, *e2; | ||
798 | struct ubi_vid_hdr *vid_hdr; | ||
799 | |||
800 | kfree(wrk); | ||
801 | |||
802 | if (cancel) | ||
803 | return 0; | ||
804 | |||
805 | vid_hdr = ubi_zalloc_vid_hdr(ubi); | ||
806 | if (!vid_hdr) | ||
807 | return -ENOMEM; | ||
808 | |||
809 | spin_lock(&ubi->wl_lock); | ||
810 | |||
811 | /* | ||
812 | * Only one WL worker at a time is supported in this implementation, so | ||
813 | * make sure a PEB is not being moved already. | ||
814 | */ | ||
815 | if (ubi->move_to || tree_empty(&ubi->free) || | ||
816 | (tree_empty(&ubi->used) && tree_empty(&ubi->scrub))) { | ||
817 | /* | ||
818 | * Only one WL worker at a time is supported in this | ||
819 | * implementation, so if a LEB is already being moved, cancel. | ||
820 | * | ||
821 | * No free physical eraseblocks? Well, we cancel wear-leveling | ||
822 | * then. It will be triggered again when a free physical | ||
823 | * eraseblock appears. | ||
824 | * | ||
825 | * No used physical eraseblocks? They must be temporarily | ||
826 | * protected from being moved. They will be moved to the | ||
827 | * @ubi->used tree later and the wear-leveling will be | ||
828 | * triggered again. | ||
829 | */ | ||
830 | dbg_wl("cancel WL, a list is empty: free %d, used %d", | ||
831 | tree_empty(&ubi->free), tree_empty(&ubi->used)); | ||
832 | ubi->wl_scheduled = 0; | ||
833 | spin_unlock(&ubi->wl_lock); | ||
834 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
835 | return 0; | ||
836 | } | ||
837 | |||
838 | if (tree_empty(&ubi->scrub)) { | ||
839 | /* | ||
840 | * Now pick the least worn-out used physical eraseblock and a | ||
841 | * highly worn-out free physical eraseblock. If the erase | ||
842 | * counters differ enough, start wear-leveling. | ||
843 | */ | ||
844 | e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb); | ||
845 | e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); | ||
846 | |||
847 | if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) { | ||
848 | dbg_wl("no WL needed: min used EC %d, max free EC %d", | ||
849 | e1->ec, e2->ec); | ||
850 | ubi->wl_scheduled = 0; | ||
851 | spin_unlock(&ubi->wl_lock); | ||
852 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
853 | return 0; | ||
854 | } | ||
855 | used_tree_del(ubi, e1); | ||
856 | dbg_wl("move PEB %d EC %d to PEB %d EC %d", | ||
857 | e1->pnum, e1->ec, e2->pnum, e2->ec); | ||
858 | } else { | ||
859 | e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, rb); | ||
860 | e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); | ||
861 | scrub_tree_del(ubi, e1); | ||
862 | dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum); | ||
863 | } | ||
864 | |||
865 | free_tree_del(ubi, e2); | ||
866 | ubi_assert(!ubi->move_from && !ubi->move_to); | ||
867 | ubi_assert(!ubi->move_to_put && !ubi->move_from_put); | ||
868 | ubi->move_from = e1; | ||
869 | ubi->move_to = e2; | ||
870 | spin_unlock(&ubi->wl_lock); | ||
871 | |||
872 | /* | ||
873 | * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum. | ||
874 | * We so far do not know which logical eraseblock our physical | ||
875 | * eraseblock (@e1) belongs to. We have to read the volume identifier | ||
876 | * header first. | ||
877 | */ | ||
878 | |||
879 | err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0); | ||
880 | if (err && err != UBI_IO_BITFLIPS) { | ||
881 | if (err == UBI_IO_PEB_FREE) { | ||
882 | /* | ||
883 | * We are trying to move a PEB without a VID header. UBI | ||
884 | * always writes VID headers shortly after the PEB was | ||
885 | * given, so we have a situation when it did not have a | ||
886 | * chance to write it because it was preempted. | ||
887 | * Just re-schedule the work, so that next time it will | ||
888 | * likely have the VID header in place. | ||
889 | */ | ||
890 | dbg_wl("PEB %d has no VID header", e1->pnum); | ||
891 | err = 0; | ||
892 | } else { | ||
893 | ubi_err("error %d while reading VID header from PEB %d", | ||
894 | err, e1->pnum); | ||
895 | if (err > 0) | ||
896 | err = -EIO; | ||
897 | } | ||
898 | goto error; | ||
899 | } | ||
900 | |||
901 | err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr); | ||
902 | if (err) { | ||
903 | if (err == UBI_IO_BITFLIPS) | ||
904 | err = 0; | ||
905 | goto error; | ||
906 | } | ||
907 | |||
908 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
909 | spin_lock(&ubi->wl_lock); | ||
910 | if (!ubi->move_to_put) | ||
911 | used_tree_add(ubi, e2); | ||
912 | else | ||
913 | put = 1; | ||
914 | ubi->move_from = ubi->move_to = NULL; | ||
915 | ubi->move_from_put = ubi->move_to_put = 0; | ||
916 | ubi->wl_scheduled = 0; | ||
917 | spin_unlock(&ubi->wl_lock); | ||
918 | |||
919 | if (put) { | ||
920 | /* | ||
921 | * Well, the target PEB was put meanwhile, schedule it for | ||
922 | * erasure. | ||
923 | */ | ||
924 | dbg_wl("PEB %d was put meanwhile, erase", e2->pnum); | ||
925 | err = schedule_erase(ubi, e2, 0); | ||
926 | if (err) { | ||
927 | kmem_cache_free(wl_entries_slab, e2); | ||
928 | ubi_ro_mode(ubi); | ||
929 | } | ||
930 | } | ||
931 | |||
932 | err = schedule_erase(ubi, e1, 0); | ||
933 | if (err) { | ||
934 | kmem_cache_free(wl_entries_slab, e1); | ||
935 | ubi_ro_mode(ubi); | ||
936 | } | ||
937 | |||
938 | dbg_wl("done"); | ||
939 | return err; | ||
940 | |||
941 | /* | ||
942 | * Some error occurred. @e1 was not changed, so put it back. @e2 | ||
943 | * might be changed, schedule it for erasure. | ||
944 | */ | ||
945 | error: | ||
946 | if (err) | ||
947 | dbg_wl("error %d occurred, cancel operation", err); | ||
948 | ubi_assert(err <= 0); | ||
949 | |||
950 | ubi_free_vid_hdr(ubi, vid_hdr); | ||
951 | spin_lock(&ubi->wl_lock); | ||
952 | ubi->wl_scheduled = 0; | ||
953 | if (ubi->move_from_put) | ||
954 | put = 1; | ||
955 | else | ||
956 | used_tree_add(ubi, e1); | ||
957 | ubi->move_from = ubi->move_to = NULL; | ||
958 | ubi->move_from_put = ubi->move_to_put = 0; | ||
959 | spin_unlock(&ubi->wl_lock); | ||
960 | |||
961 | if (put) { | ||
962 | /* | ||
963 | * Well, the target PEB was put meanwhile, schedule it for | ||
964 | * erasure. | ||
965 | */ | ||
966 | dbg_wl("PEB %d was put meanwhile, erase", e1->pnum); | ||
967 | err = schedule_erase(ubi, e1, 0); | ||
968 | if (err) { | ||
969 | kmem_cache_free(wl_entries_slab, e1); | ||
970 | ubi_ro_mode(ubi); | ||
971 | } | ||
972 | } | ||
973 | |||
974 | err = schedule_erase(ubi, e2, 0); | ||
975 | if (err) { | ||
976 | kmem_cache_free(wl_entries_slab, e2); | ||
977 | ubi_ro_mode(ubi); | ||
978 | } | ||
979 | |||
980 | yield(); | ||
981 | return err; | ||
982 | } | ||
983 | |||
984 | /** | ||
985 | * ensure_wear_leveling - schedule wear-leveling if it is needed. | ||
986 | * @ubi: UBI device description object | ||
987 | * | ||
988 | * This function checks if it is time to start wear-leveling and schedules it | ||
989 | * if yes. This function returns zero in case of success and a negative error | ||
990 | * code in case of failure. | ||
991 | */ | ||
992 | static int ensure_wear_leveling(struct ubi_device *ubi) | ||
993 | { | ||
994 | int err = 0; | ||
995 | struct ubi_wl_entry *e1; | ||
996 | struct ubi_wl_entry *e2; | ||
997 | struct ubi_work *wrk; | ||
998 | |||
999 | spin_lock(&ubi->wl_lock); | ||
1000 | if (ubi->wl_scheduled) | ||
1001 | /* Wear-leveling is already in the work queue */ | ||
1002 | goto out_unlock; | ||
1003 | |||
1004 | /* | ||
1005 | * If the ubi->scrub tree is not empty, scrubbing is needed, and the | ||
1006 | * WL worker has to be scheduled anyway. | ||
1007 | */ | ||
1008 | if (tree_empty(&ubi->scrub)) { | ||
1009 | if (tree_empty(&ubi->used) || tree_empty(&ubi->free)) | ||
1010 | /* No physical eraseblocks - no deal */ | ||
1011 | goto out_unlock; | ||
1012 | |||
1013 | /* | ||
1014 | * We schedule wear-leveling only if the difference between the | ||
1015 | * lowest erase counter of used physical eraseblocks and a high | ||
1016 | * erase counter of free physical eraseblocks is greater than | ||
1017 | * %UBI_WL_THRESHOLD. | ||
1018 | */ | ||
1019 | e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb); | ||
1020 | e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); | ||
1021 | |||
1022 | if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) | ||
1023 | goto out_unlock; | ||
1024 | dbg_wl("schedule wear-leveling"); | ||
1025 | } else | ||
1026 | dbg_wl("schedule scrubbing"); | ||
1027 | |||
1028 | ubi->wl_scheduled = 1; | ||
1029 | spin_unlock(&ubi->wl_lock); | ||
1030 | |||
1031 | wrk = kmalloc(sizeof(struct ubi_work), GFP_KERNEL); | ||
1032 | if (!wrk) { | ||
1033 | err = -ENOMEM; | ||
1034 | goto out_cancel; | ||
1035 | } | ||
1036 | |||
1037 | wrk->func = &wear_leveling_worker; | ||
1038 | schedule_ubi_work(ubi, wrk); | ||
1039 | return err; | ||
1040 | |||
1041 | out_cancel: | ||
1042 | spin_lock(&ubi->wl_lock); | ||
1043 | ubi->wl_scheduled = 0; | ||
1044 | out_unlock: | ||
1045 | spin_unlock(&ubi->wl_lock); | ||
1046 | return err; | ||
1047 | } | ||
1048 | |||
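The decision ensure_wear_leveling() makes boils down to one comparison: schedule data movement only when the lowest erase counter among used eraseblocks lags the chosen free eraseblock's counter by at least the wear-leveling threshold. A tiny standalone illustration of that check follows; the threshold value is made up for the example and is not the driver's configured default.

#include <stdio.h>

#define WL_THRESHOLD 4096	/* illustrative value only */

/* Non-zero when the erase counters differ enough to justify moving data. */
static int need_wear_leveling(int min_used_ec, int max_free_ec)
{
	return max_free_ec - min_used_ec >= WL_THRESHOLD;
}

int main(void)
{
	printf("%d\n", need_wear_leveling(100, 5000));	/* 1: gap of 4900 */
	printf("%d\n", need_wear_leveling(100, 2000));	/* 0: gap of 1900 */
	return 0;
}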
1049 | /** | ||
1050 | * erase_worker - physical eraseblock erase worker function. | ||
1051 | * @ubi: UBI device description object | ||
1052 | * @wl_wrk: the work object | ||
1053 | * @cancel: non-zero if the worker has to free memory and exit | ||
1054 | * | ||
1055 | * This function erases a physical eraseblock and performs torture testing if | ||
1056 | * needed. It also takes care of marking the physical eraseblock bad if | ||
1057 | * needed. Returns zero in case of success and a negative error code in case of | ||
1058 | * failure. | ||
1059 | */ | ||
1060 | static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, | ||
1061 | int cancel) | ||
1062 | { | ||
1063 | int err; | ||
1064 | struct ubi_wl_entry *e = wl_wrk->e; | ||
1065 | int pnum = e->pnum; | ||
1066 | |||
1067 | if (cancel) { | ||
1068 | dbg_wl("cancel erasure of PEB %d EC %d", pnum, e->ec); | ||
1069 | kfree(wl_wrk); | ||
1070 | kmem_cache_free(wl_entries_slab, e); | ||
1071 | return 0; | ||
1072 | } | ||
1073 | |||
1074 | dbg_wl("erase PEB %d EC %d", pnum, e->ec); | ||
1075 | |||
1076 | err = sync_erase(ubi, e, wl_wrk->torture); | ||
1077 | if (!err) { | ||
1078 | /* Fine, we've erased it successfully */ | ||
1079 | kfree(wl_wrk); | ||
1080 | |||
1081 | spin_lock(&ubi->wl_lock); | ||
1082 | ubi->abs_ec += 1; | ||
1083 | free_tree_add(ubi, e); | ||
1084 | spin_unlock(&ubi->wl_lock); | ||
1085 | |||
1086 | /* | ||
1087 | * One more erase operation has happened, take care of protected | ||
1088 | * physical eraseblocks. | ||
1089 | */ | ||
1090 | check_protection_over(ubi); | ||
1091 | |||
1092 | /* And take care of wear-leveling */ | ||
1093 | err = ensure_wear_leveling(ubi); | ||
1094 | return err; | ||
1095 | } | ||
1096 | |||
1097 | kfree(wl_wrk); | ||
1098 | kmem_cache_free(wl_entries_slab, e); | ||
1099 | |||
1100 | if (err != -EIO) { | ||
1101 | /* | ||
1102 | * If this is not %-EIO, we have no idea what to do. Scheduling | ||
1103 | * this physical eraseblock for erasure again would cause | ||
1104 | * errors again and again. Well, let's switch to RO mode. | ||
1105 | */ | ||
1106 | ubi_ro_mode(ubi); | ||
1107 | return err; | ||
1108 | } | ||
1109 | |||
1110 | /* It is %-EIO, the PEB went bad */ | ||
1111 | |||
1112 | if (!ubi->bad_allowed) { | ||
1113 | ubi_err("bad physical eraseblock %d detected", pnum); | ||
1114 | ubi_ro_mode(ubi); | ||
1115 | err = -EIO; | ||
1116 | } else { | ||
1117 | int need; | ||
1118 | |||
1119 | spin_lock(&ubi->volumes_lock); | ||
1120 | need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs + 1; | ||
1121 | if (need > 0) { | ||
1122 | need = ubi->avail_pebs >= need ? need : ubi->avail_pebs; | ||
1123 | ubi->avail_pebs -= need; | ||
1124 | ubi->rsvd_pebs += need; | ||
1125 | ubi->beb_rsvd_pebs += need; | ||
1126 | if (need > 0) | ||
1127 | ubi_msg("reserve more %d PEBs", need); | ||
1128 | } | ||
1129 | |||
1130 | if (ubi->beb_rsvd_pebs == 0) { | ||
1131 | spin_unlock(&ubi->volumes_lock); | ||
1132 | ubi_err("no reserved physical eraseblocks"); | ||
1133 | ubi_ro_mode(ubi); | ||
1134 | return -EIO; | ||
1135 | } | ||
1136 | |||
1137 | spin_unlock(&ubi->volumes_lock); | ||
1138 | ubi_msg("mark PEB %d as bad", pnum); | ||
1139 | |||
1140 | err = ubi_io_mark_bad(ubi, pnum); | ||
1141 | if (err) { | ||
1142 | ubi_ro_mode(ubi); | ||
1143 | return err; | ||
1144 | } | ||
1145 | |||
1146 | spin_lock(&ubi->volumes_lock); | ||
1147 | ubi->beb_rsvd_pebs -= 1; | ||
1148 | ubi->bad_peb_count += 1; | ||
1149 | ubi->good_peb_count -= 1; | ||
1150 | ubi_calculate_reserved(ubi); | ||
1151 | if (ubi->beb_rsvd_pebs == 0) | ||
1152 | ubi_warn("last PEB from the reserved pool was used"); | ||
1153 | spin_unlock(&ubi->volumes_lock); | ||
1154 | } | ||
1155 | |||
1156 | return err; | ||
1157 | } | ||
1158 | |||
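When erase_worker() hits a genuinely bad eraseblock, it first tops the bad-block reserve back up from the pool of available PEBs and only then consumes one reserve slot for the newly bad block. The standalone sketch below isolates just that accounting; the field names are shortened, the starting values are invented, and the read-only fallback taken when the reserve is exhausted is omitted.

#include <stdio.h>

struct counters {
	int avail;	/* spare PEBs not reserved for anything */
	int rsvd;	/* total reserved PEBs */
	int beb_rsvd;	/* PEBs currently reserved for bad-block handling */
	int beb_level;	/* how many bad-block reserves we aim to keep */
};

/*
 * Replenish the bad-block reserve from the available pool, then consume
 * one reserved PEB for the block that just went bad.
 */
static void account_bad_peb(struct counters *c)
{
	int need = c->beb_level - c->beb_rsvd + 1;

	if (need > 0) {
		if (need > c->avail)
			need = c->avail;
		c->avail -= need;
		c->rsvd += need;
		c->beb_rsvd += need;
	}
	c->beb_rsvd -= 1;
}

int main(void)
{
	struct counters c = { .avail = 10, .rsvd = 4, .beb_rsvd = 4, .beb_level = 4 };

	account_bad_peb(&c);
	printf("avail %d rsvd %d beb_rsvd %d\n", c.avail, c.rsvd, c.beb_rsvd);
	return 0;
}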
1159 | /** | ||
1160 | * ubi_wl_put_peb - return a physical eraseblock to the wear-leveling | ||
1161 | * unit. | ||
1162 | * @ubi: UBI device description object | ||
1163 | * @pnum: physical eraseblock to return | ||
1164 | * @torture: if this physical eraseblock has to be tortured | ||
1165 | * | ||
1166 | * This function is called to return physical eraseblock @pnum to the pool of | ||
1167 | * free physical eraseblocks. The @torture flag has to be set if an I/O error | ||
1168 | * occurred to this @pnum and it has to be tested. This function returns zero | ||
1169 | * in case of success and a negative error code in case of failure. | ||
1170 | */ | ||
1171 | int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture) | ||
1172 | { | ||
1173 | int err; | ||
1174 | struct ubi_wl_entry *e; | ||
1175 | |||
1176 | dbg_wl("PEB %d", pnum); | ||
1177 | ubi_assert(pnum >= 0); | ||
1178 | ubi_assert(pnum < ubi->peb_count); | ||
1179 | |||
1180 | spin_lock(&ubi->wl_lock); | ||
1181 | |||
1182 | e = ubi->lookuptbl[pnum]; | ||
1183 | if (e == ubi->move_from) { | ||
1184 | /* | ||
1185 | * User is putting the physical eraseblock which was selected to | ||
1186 | * be moved. It will be scheduled for erasure in the | ||
1187 | * wear-leveling worker. | ||
1188 | */ | ||
1189 | dbg_wl("PEB %d is being moved", pnum); | ||
1190 | ubi_assert(!ubi->move_from_put); | ||
1191 | ubi->move_from_put = 1; | ||
1192 | spin_unlock(&ubi->wl_lock); | ||
1193 | return 0; | ||
1194 | } else if (e == ubi->move_to) { | ||
1195 | /* | ||
1196 | * User is putting the physical eraseblock which was selected | ||
1197 | * as the target the data is moved to. It may happen if the EBA | ||
1198 | * unit already re-mapped the LEB but the WL unit has not yet | ||
1199 | * put the PEB to the "used" tree. | ||
1200 | */ | ||
1201 | dbg_wl("PEB %d is the target of data moving", pnum); | ||
1202 | ubi_assert(!ubi->move_to_put); | ||
1203 | ubi->move_to_put = 1; | ||
1204 | spin_unlock(&ubi->wl_lock); | ||
1205 | return 0; | ||
1206 | } else { | ||
1207 | if (in_wl_tree(e, &ubi->used)) | ||
1208 | used_tree_del(ubi, e); | ||
1209 | else if (in_wl_tree(e, &ubi->scrub)) | ||
1210 | scrub_tree_del(ubi, e); | ||
1211 | else | ||
1212 | prot_tree_del(ubi, e->pnum); | ||
1213 | } | ||
1214 | spin_unlock(&ubi->wl_lock); | ||
1215 | |||
1216 | err = schedule_erase(ubi, e, torture); | ||
1217 | if (err) { | ||
1218 | spin_lock(&ubi->wl_lock); | ||
1219 | used_tree_add(ubi, e); | ||
1220 | spin_unlock(&ubi->wl_lock); | ||
1221 | } | ||
1222 | |||
1223 | return err; | ||
1224 | } | ||
1225 | |||
1226 | /** | ||
1227 | * ubi_wl_scrub_peb - schedule a physical eraseblock for scrubbing. | ||
1228 | * @ubi: UBI device description object | ||
1229 | * @pnum: the physical eraseblock to schedule | ||
1230 | * | ||
1231 | * If a bit-flip in a physical eraseblock is detected, this physical eraseblock | ||
1232 | * needs scrubbing. This function schedules a physical eraseblock for | ||
1233 | * scrubbing which is done in the background. This function returns zero in case of | ||
1234 | * success and a negative error code in case of failure. | ||
1235 | */ | ||
1236 | int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum) | ||
1237 | { | ||
1238 | struct ubi_wl_entry *e; | ||
1239 | |||
1240 | ubi_msg("schedule PEB %d for scrubbing", pnum); | ||
1241 | |||
1242 | retry: | ||
1243 | spin_lock(&ubi->wl_lock); | ||
1244 | e = ubi->lookuptbl[pnum]; | ||
1245 | if (e == ubi->move_from || in_wl_tree(e, &ubi->scrub)) { | ||
1246 | spin_unlock(&ubi->wl_lock); | ||
1247 | return 0; | ||
1248 | } | ||
1249 | |||
1250 | if (e == ubi->move_to) { | ||
1251 | /* | ||
1252 | * This physical eraseblock was used to move data to. The data | ||
1253 | * was moved but the PEB was not yet inserted to the proper | ||
1254 | * tree. We should just wait a little and let the WL worker | ||
1255 | * proceed. | ||
1256 | */ | ||
1257 | spin_unlock(&ubi->wl_lock); | ||
1258 | dbg_wl("the PEB %d is not in proper tree, retry", pnum); | ||
1259 | yield(); | ||
1260 | goto retry; | ||
1261 | } | ||
1262 | |||
1263 | if (in_wl_tree(e, &ubi->used)) | ||
1264 | used_tree_del(ubi, e); | ||
1265 | else | ||
1266 | prot_tree_del(ubi, pnum); | ||
1267 | |||
1268 | scrub_tree_add(ubi, e); | ||
1269 | spin_unlock(&ubi->wl_lock); | ||
1270 | |||
1271 | /* | ||
1272 | * Technically scrubbing is the same as wear-leveling, so it is done | ||
1273 | * by the WL worker. | ||
1274 | */ | ||
1275 | return ensure_wear_leveling(ubi); | ||
1276 | } | ||
1277 | |||
1278 | /** | ||
1279 | * ubi_wl_flush - flush all pending works. | ||
1280 | * @ubi: UBI device description object | ||
1281 | * | ||
1282 | * This function returns zero in case of success and a negative error code in | ||
1283 | * case of failure. | ||
1284 | */ | ||
1285 | int ubi_wl_flush(struct ubi_device *ubi) | ||
1286 | { | ||
1287 | int err, pending_count; | ||
1288 | |||
1289 | pending_count = ubi->works_count; | ||
1290 | |||
1291 | dbg_wl("flush (%d pending works)", pending_count); | ||
1292 | |||
1293 | /* | ||
1294 | * Erase while the pending works queue is not empty, but not more than | ||
1295 | * the number of currently pending works. | ||
1296 | */ | ||
1297 | while (pending_count-- > 0) { | ||
1298 | err = do_work(ubi); | ||
1299 | if (err) | ||
1300 | return err; | ||
1301 | } | ||
1302 | |||
1303 | return 0; | ||
1304 | } | ||
1305 | |||
1306 | /** | ||
1307 | * tree_destroy - destroy an RB-tree. | ||
1308 | * @root: the root of the tree to destroy | ||
1309 | */ | ||
1310 | static void tree_destroy(struct rb_root *root) | ||
1311 | { | ||
1312 | struct rb_node *rb; | ||
1313 | struct ubi_wl_entry *e; | ||
1314 | |||
1315 | rb = root->rb_node; | ||
1316 | while (rb) { | ||
1317 | if (rb->rb_left) | ||
1318 | rb = rb->rb_left; | ||
1319 | else if (rb->rb_right) | ||
1320 | rb = rb->rb_right; | ||
1321 | else { | ||
1322 | e = rb_entry(rb, struct ubi_wl_entry, rb); | ||
1323 | |||
1324 | rb = rb_parent(rb); | ||
1325 | if (rb) { | ||
1326 | if (rb->rb_left == &e->rb) | ||
1327 | rb->rb_left = NULL; | ||
1328 | else | ||
1329 | rb->rb_right = NULL; | ||
1330 | } | ||
1331 | |||
1332 | kmem_cache_free(wl_entries_slab, e); | ||
1333 | } | ||
1334 | } | ||
1335 | } | ||
1336 | |||
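tree_destroy() frees an entire tree without recursion or an auxiliary stack: it walks down to a leaf, frees it, and clears the parent's pointer to it, so every node eventually becomes a leaf itself. The sketch below applies the same idiom to an ordinary parent-linked binary tree in userspace; the node layout is illustrative and is not the kernel's rb_node.

#include <stdlib.h>
#include <stdio.h>

struct node {
	struct node *left, *right, *parent;
	int key;
};

/* Free a whole tree bottom-up without recursion or an auxiliary stack. */
static void destroy(struct node *root)
{
	struct node *n = root;

	while (n) {
		if (n->left)
			n = n->left;
		else if (n->right)
			n = n->right;
		else {
			struct node *parent = n->parent;

			if (parent) {
				if (parent->left == n)
					parent->left = NULL;
				else
					parent->right = NULL;
			}
			printf("free %d\n", n->key);
			free(n);
			n = parent;
		}
	}
}

static struct node *mknode(int key, struct node *parent)
{
	struct node *n = calloc(1, sizeof(*n));

	n->key = key;
	n->parent = parent;
	return n;
}

int main(void)
{
	struct node *root = mknode(2, NULL);

	root->left = mknode(1, root);
	root->right = mknode(3, root);
	destroy(root);
	return 0;
}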
1337 | /** | ||
1338 | * ubi_thread - UBI background thread. | ||
1339 | * @u: the UBI device description object pointer | ||
1340 | */ | ||
1341 | static int ubi_thread(void *u) | ||
1342 | { | ||
1343 | int failures = 0; | ||
1344 | struct ubi_device *ubi = u; | ||
1345 | |||
1346 | ubi_msg("background thread \"%s\" started, PID %d", | ||
1347 | ubi->bgt_name, current->pid); | ||
1348 | |||
1349 | for (;;) { | ||
1350 | int err; | ||
1351 | |||
1352 | if (kthread_should_stop()) | ||
1353 | goto out; | ||
1354 | |||
1355 | if (try_to_freeze()) | ||
1356 | continue; | ||
1357 | |||
1358 | spin_lock(&ubi->wl_lock); | ||
1359 | if (list_empty(&ubi->works) || ubi->ro_mode || | ||
1360 | !ubi->thread_enabled) { | ||
1361 | set_current_state(TASK_INTERRUPTIBLE); | ||
1362 | spin_unlock(&ubi->wl_lock); | ||
1363 | schedule(); | ||
1364 | continue; | ||
1365 | } | ||
1366 | spin_unlock(&ubi->wl_lock); | ||
1367 | |||
1368 | err = do_work(ubi); | ||
1369 | if (err) { | ||
1370 | ubi_err("%s: work failed with error code %d", | ||
1371 | ubi->bgt_name, err); | ||
1372 | if (failures++ > WL_MAX_FAILURES) { | ||
1373 | /* | ||
1374 | * Too many failures, disable the thread and | ||
1375 | * switch to read-only mode. | ||
1376 | */ | ||
1377 | ubi_msg("%s: %d consecutive failures", | ||
1378 | ubi->bgt_name, WL_MAX_FAILURES); | ||
1379 | ubi_ro_mode(ubi); | ||
1380 | break; | ||
1381 | } | ||
1382 | } else | ||
1383 | failures = 0; | ||
1384 | |||
1385 | cond_resched(); | ||
1386 | } | ||
1387 | |||
1388 | out: | ||
1389 | dbg_wl("background thread \"%s\" is killed", ubi->bgt_name); | ||
1390 | return 0; | ||
1391 | } | ||
1392 | |||
1393 | /** | ||
1394 | * cancel_pending - cancel all pending works. | ||
1395 | * @ubi: UBI device description object | ||
1396 | */ | ||
1397 | static void cancel_pending(struct ubi_device *ubi) | ||
1398 | { | ||
1399 | while (!list_empty(&ubi->works)) { | ||
1400 | struct ubi_work *wrk; | ||
1401 | |||
1402 | wrk = list_entry(ubi->works.next, struct ubi_work, list); | ||
1403 | list_del(&wrk->list); | ||
1404 | wrk->func(ubi, wrk, 1); | ||
1405 | ubi->works_count -= 1; | ||
1406 | ubi_assert(ubi->works_count >= 0); | ||
1407 | } | ||
1408 | } | ||
1409 | |||
1410 | /** | ||
1411 | * ubi_wl_init_scan - initialize the wear-leveling unit using scanning | ||
1412 | * information. | ||
1413 | * @ubi: UBI device description object | ||
1414 | * @si: scanning information | ||
1415 | * | ||
1416 | * This function returns zero in case of success, and a negative error code in | ||
1417 | * case of failure. | ||
1418 | */ | ||
1419 | int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) | ||
1420 | { | ||
1421 | int err; | ||
1422 | struct rb_node *rb1, *rb2; | ||
1423 | struct ubi_scan_volume *sv; | ||
1424 | struct ubi_scan_leb *seb, *tmp; | ||
1425 | struct ubi_wl_entry *e; | ||
1426 | |||
1427 | |||
1428 | ubi->used = ubi->free = ubi->scrub = RB_ROOT; | ||
1429 | ubi->prot.pnum = ubi->prot.aec = RB_ROOT; | ||
1430 | spin_lock_init(&ubi->wl_lock); | ||
1431 | ubi->max_ec = si->max_ec; | ||
1432 | INIT_LIST_HEAD(&ubi->works); | ||
1433 | |||
1434 | sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num); | ||
1435 | |||
1436 | ubi->bgt_thread = kthread_create(ubi_thread, ubi, ubi->bgt_name); | ||
1437 | if (IS_ERR(ubi->bgt_thread)) { | ||
1438 | err = PTR_ERR(ubi->bgt_thread); | ||
1439 | ubi_err("cannot spawn \"%s\", error %d", ubi->bgt_name, | ||
1440 | err); | ||
1441 | return err; | ||
1442 | } | ||
1443 | |||
1444 | if (ubi_devices_cnt == 0) { | ||
1445 | wl_entries_slab = kmem_cache_create("ubi_wl_entry_slab", | ||
1446 | sizeof(struct ubi_wl_entry), | ||
1447 | 0, 0, NULL, NULL); | ||
1448 | if (!wl_entries_slab) | ||
1449 | return -ENOMEM; | ||
1450 | } | ||
1451 | |||
1452 | err = -ENOMEM; | ||
1453 | ubi->lookuptbl = kzalloc(ubi->peb_count * sizeof(void *), GFP_KERNEL); | ||
1454 | if (!ubi->lookuptbl) | ||
1455 | goto out_free; | ||
1456 | |||
1457 | list_for_each_entry_safe(seb, tmp, &si->erase, u.list) { | ||
1458 | cond_resched(); | ||
1459 | |||
1460 | e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); | ||
1461 | if (!e) | ||
1462 | goto out_free; | ||
1463 | |||
1464 | e->pnum = seb->pnum; | ||
1465 | e->ec = seb->ec; | ||
1466 | ubi->lookuptbl[e->pnum] = e; | ||
1467 | if (schedule_erase(ubi, e, 0)) { | ||
1468 | kmem_cache_free(wl_entries_slab, e); | ||
1469 | goto out_free; | ||
1470 | } | ||
1471 | } | ||
1472 | |||
1473 | list_for_each_entry(seb, &si->free, u.list) { | ||
1474 | cond_resched(); | ||
1475 | |||
1476 | e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); | ||
1477 | if (!e) | ||
1478 | goto out_free; | ||
1479 | |||
1480 | e->pnum = seb->pnum; | ||
1481 | e->ec = seb->ec; | ||
1482 | ubi_assert(e->ec >= 0); | ||
1483 | free_tree_add(ubi, e); | ||
1484 | ubi->lookuptbl[e->pnum] = e; | ||
1485 | } | ||
1486 | |||
1487 | list_for_each_entry(seb, &si->corr, u.list) { | ||
1488 | cond_resched(); | ||
1489 | |||
1490 | e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); | ||
1491 | if (!e) | ||
1492 | goto out_free; | ||
1493 | |||
1494 | e->pnum = seb->pnum; | ||
1495 | e->ec = seb->ec; | ||
1496 | ubi->lookuptbl[e->pnum] = e; | ||
1497 | if (schedule_erase(ubi, e, 0)) { | ||
1498 | kmem_cache_free(wl_entries_slab, e); | ||
1499 | goto out_free; | ||
1500 | } | ||
1501 | } | ||
1502 | |||
1503 | ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) { | ||
1504 | ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) { | ||
1505 | cond_resched(); | ||
1506 | |||
1507 | e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); | ||
1508 | if (!e) | ||
1509 | goto out_free; | ||
1510 | |||
1511 | e->pnum = seb->pnum; | ||
1512 | e->ec = seb->ec; | ||
1513 | ubi->lookuptbl[e->pnum] = e; | ||
1514 | if (!seb->scrub) { | ||
1515 | dbg_wl("add PEB %d EC %d to the used tree", | ||
1516 | e->pnum, e->ec); | ||
1517 | used_tree_add(ubi, e); | ||
1518 | } else { | ||
1519 | dbg_wl("add PEB %d EC %d to the scrub tree", | ||
1520 | e->pnum, e->ec); | ||
1521 | scrub_tree_add(ubi, e); | ||
1522 | } | ||
1523 | } | ||
1524 | } | ||
1525 | |||
1526 | if (WL_RESERVED_PEBS > ubi->avail_pebs) { | ||
1527 | ubi_err("not enough physical eraseblocks (%d, need %d)", | ||
1528 | ubi->avail_pebs, WL_RESERVED_PEBS); | ||
1529 | goto out_free; | ||
1530 | } | ||
1531 | ubi->avail_pebs -= WL_RESERVED_PEBS; | ||
1532 | ubi->rsvd_pebs += WL_RESERVED_PEBS; | ||
1533 | |||
1534 | /* Schedule wear-leveling if needed */ | ||
1535 | err = ensure_wear_leveling(ubi); | ||
1536 | if (err) | ||
1537 | goto out_free; | ||
1538 | |||
1539 | return 0; | ||
1540 | |||
1541 | out_free: | ||
1542 | cancel_pending(ubi); | ||
1543 | tree_destroy(&ubi->used); | ||
1544 | tree_destroy(&ubi->free); | ||
1545 | tree_destroy(&ubi->scrub); | ||
1546 | kfree(ubi->lookuptbl); | ||
1547 | if (ubi_devices_cnt == 0) | ||
1548 | kmem_cache_destroy(wl_entries_slab); | ||
1549 | return err; | ||
1550 | } | ||
1551 | |||
1552 | /** | ||
1553 | * protection_trees_destroy - destroy the protection RB-trees. | ||
1554 | * @ubi: UBI device description object | ||
1555 | */ | ||
1556 | static void protection_trees_destroy(struct ubi_device *ubi) | ||
1557 | { | ||
1558 | struct rb_node *rb; | ||
1559 | struct ubi_wl_prot_entry *pe; | ||
1560 | |||
1561 | rb = ubi->prot.aec.rb_node; | ||
1562 | while (rb) { | ||
1563 | if (rb->rb_left) | ||
1564 | rb = rb->rb_left; | ||
1565 | else if (rb->rb_right) | ||
1566 | rb = rb->rb_right; | ||
1567 | else { | ||
1568 | pe = rb_entry(rb, struct ubi_wl_prot_entry, rb_aec); | ||
1569 | |||
1570 | rb = rb_parent(rb); | ||
1571 | if (rb) { | ||
1572 | if (rb->rb_left == &pe->rb_aec) | ||
1573 | rb->rb_left = NULL; | ||
1574 | else | ||
1575 | rb->rb_right = NULL; | ||
1576 | } | ||
1577 | |||
1578 | kmem_cache_free(wl_entries_slab, pe->e); | ||
1579 | kfree(pe); | ||
1580 | } | ||
1581 | } | ||
1582 | } | ||
1583 | |||
1584 | /** | ||
1585 | * ubi_wl_close - close the wear-leveling unit. | ||
1586 | * @ubi: UBI device description object | ||
1587 | */ | ||
1588 | void ubi_wl_close(struct ubi_device *ubi) | ||
1589 | { | ||
1590 | dbg_wl("disable \"%s\"", ubi->bgt_name); | ||
1591 | if (ubi->bgt_thread) | ||
1592 | kthread_stop(ubi->bgt_thread); | ||
1593 | |||
1594 | dbg_wl("close the UBI wear-leveling unit"); | ||
1595 | |||
1596 | cancel_pending(ubi); | ||
1597 | protection_trees_destroy(ubi); | ||
1598 | tree_destroy(&ubi->used); | ||
1599 | tree_destroy(&ubi->free); | ||
1600 | tree_destroy(&ubi->scrub); | ||
1601 | kfree(ubi->lookuptbl); | ||
1602 | if (ubi_devices_cnt == 1) | ||
1603 | kmem_cache_destroy(wl_entries_slab); | ||
1604 | } | ||
1605 | |||
1606 | #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID | ||
1607 | |||
1608 | /** | ||
1609 | * paranoid_check_ec - make sure that the erase counter of a physical eraseblock | ||
1610 | * is correct. | ||
1611 | * @ubi: UBI device description object | ||
1612 | * @pnum: the physical eraseblock number to check | ||
1613 | * @ec: the erase counter to check | ||
1614 | * | ||
1615 | * This function returns zero if the erase counter of physical eraseblock @pnum | ||
1616 | * is equivalent to @ec, %1 if not, and a negative error code if an error | ||
1617 | * occurred. | ||
1618 | */ | ||
1619 | static int paranoid_check_ec(const struct ubi_device *ubi, int pnum, int ec) | ||
1620 | { | ||
1621 | int err; | ||
1622 | long long read_ec; | ||
1623 | struct ubi_ec_hdr *ec_hdr; | ||
1624 | |||
1625 | ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); | ||
1626 | if (!ec_hdr) | ||
1627 | return -ENOMEM; | ||
1628 | |||
1629 | err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0); | ||
1630 | if (err && err != UBI_IO_BITFLIPS) { | ||
1631 | /* The header does not have to exist */ | ||
1632 | err = 0; | ||
1633 | goto out_free; | ||
1634 | } | ||
1635 | |||
1636 | read_ec = ubi64_to_cpu(ec_hdr->ec); | ||
1637 | if (ec != read_ec) { | ||
1638 | ubi_err("paranoid check failed for PEB %d", pnum); | ||
1639 | ubi_err("read EC is %lld, should be %d", read_ec, ec); | ||
1640 | ubi_dbg_dump_stack(); | ||
1641 | err = 1; | ||
1642 | } else | ||
1643 | err = 0; | ||
1644 | |||
1645 | out_free: | ||
1646 | kfree(ec_hdr); | ||
1647 | return err; | ||
1648 | } | ||
1649 | |||
1650 | /** | ||
1651 | * paranoid_check_in_wl_tree - make sure that a wear-leveling entry is present | ||
1652 | * in a WL RB-tree. | ||
1653 | * @e: the wear-leveling entry to check | ||
1654 | * @root: the root of the tree | ||
1655 | * | ||
1656 | * This function returns zero if @e is in the @root RB-tree and %1 if it | ||
1657 | * is not. | ||
1658 | */ | ||
1659 | static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, | ||
1660 | struct rb_root *root) | ||
1661 | { | ||
1662 | if (in_wl_tree(e, root)) | ||
1663 | return 0; | ||
1664 | |||
1665 | ubi_err("paranoid check failed for PEB %d, EC %d, RB-tree %p ", | ||
1666 | e->pnum, e->ec, root); | ||
1667 | ubi_dbg_dump_stack(); | ||
1668 | return 1; | ||
1669 | } | ||
1670 | |||
1671 | #endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ | ||
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index abb90c0c09cc..8a649f602767 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c | |||
@@ -672,6 +672,13 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) { | |||
672 | return ret; | 672 | return ret; |
673 | } | 673 | } |
674 | 674 | ||
675 | /* and an UBI volume */ | ||
676 | if (jffs2_ubivol(c)) { | ||
677 | ret = jffs2_ubivol_setup(c); | ||
678 | if (ret) | ||
679 | return ret; | ||
680 | } | ||
681 | |||
675 | return ret; | 682 | return ret; |
676 | } | 683 | } |
677 | 684 | ||
@@ -690,4 +697,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) { | |||
690 | if (jffs2_nor_wbuf_flash(c)) { | 697 | if (jffs2_nor_wbuf_flash(c)) { |
691 | jffs2_nor_wbuf_flash_cleanup(c); | 698 | jffs2_nor_wbuf_flash_cleanup(c); |
692 | } | 699 | } |
700 | |||
701 | /* and an UBI volume */ | ||
702 | if (jffs2_ubivol(c)) { | ||
703 | jffs2_ubivol_cleanup(c); | ||
704 | } | ||
693 | } | 705 | } |
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index e07a0edcdb4f..8d92e45168ca 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h | |||
@@ -98,6 +98,9 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) | |||
98 | #define jffs2_nor_wbuf_flash(c) (0) | 98 | #define jffs2_nor_wbuf_flash(c) (0) |
99 | #define jffs2_nor_wbuf_flash_setup(c) (0) | 99 | #define jffs2_nor_wbuf_flash_setup(c) (0) |
100 | #define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) | 100 | #define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) |
101 | #define jffs2_ubivol(c) (0) | ||
102 | #define jffs2_ubivol_setup(c) (0) | ||
103 | #define jffs2_ubivol_cleanup(c) do {} while (0) | ||
101 | 104 | ||
102 | #else /* NAND and/or ECC'd NOR support present */ | 105 | #else /* NAND and/or ECC'd NOR support present */ |
103 | 106 | ||
@@ -133,6 +136,9 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c); | |||
133 | #define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) | 136 | #define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) |
134 | int jffs2_dataflash_setup(struct jffs2_sb_info *c); | 137 | int jffs2_dataflash_setup(struct jffs2_sb_info *c); |
135 | void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); | 138 | void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); |
139 | #define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME) | ||
140 | int jffs2_ubivol_setup(struct jffs2_sb_info *c); | ||
141 | void jffs2_ubivol_cleanup(struct jffs2_sb_info *c); | ||
136 | 142 | ||
137 | #define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) | 143 | #define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) |
138 | int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); | 144 | int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); |
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4fac6dd53954..ab86031b3c07 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c | |||
@@ -1208,3 +1208,27 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { | |||
1208 | void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { | 1208 | void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { |
1209 | kfree(c->wbuf); | 1209 | kfree(c->wbuf); |
1210 | } | 1210 | } |
1211 | |||
1212 | int jffs2_ubivol_setup(struct jffs2_sb_info *c) { | ||
1213 | c->cleanmarker_size = 0; | ||
1214 | |||
1215 | if (c->mtd->writesize == 1) | ||
1216 | /* We do not need write-buffer */ | ||
1217 | return 0; | ||
1218 | |||
1219 | init_rwsem(&c->wbuf_sem); | ||
1220 | |||
1221 | c->wbuf_pagesize = c->mtd->writesize; | ||
1222 | c->wbuf_ofs = 0xFFFFFFFF; | ||
1223 | c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); | ||
1224 | if (!c->wbuf) | ||
1225 | return -ENOMEM; | ||
1226 | |||
1227 | printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); | ||
1228 | |||
1229 | return 0; | ||
1230 | } | ||
1231 | |||
1232 | void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) { | ||
1233 | kfree(c->wbuf); | ||
1234 | } | ||
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f27e5378caf2..a0c8667caa72 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/types.h> | 27 | #include <linux/types.h> |
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/highmem.h> | 29 | #include <linux/highmem.h> |
30 | #include <linux/swap.h> | ||
30 | 31 | ||
31 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC | 32 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC |
32 | #include <cluster/masklog.h> | 33 | #include <cluster/masklog.h> |
@@ -34,6 +35,7 @@ | |||
34 | #include "ocfs2.h" | 35 | #include "ocfs2.h" |
35 | 36 | ||
36 | #include "alloc.h" | 37 | #include "alloc.h" |
38 | #include "aops.h" | ||
37 | #include "dlmglue.h" | 39 | #include "dlmglue.h" |
38 | #include "extent_map.h" | 40 | #include "extent_map.h" |
39 | #include "inode.h" | 41 | #include "inode.h" |
@@ -47,63 +49,243 @@ | |||
47 | 49 | ||
48 | #include "buffer_head_io.h" | 50 | #include "buffer_head_io.h" |
49 | 51 | ||
50 | static int ocfs2_extent_contig(struct inode *inode, | 52 | static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); |
51 | struct ocfs2_extent_rec *ext, | ||
52 | u64 blkno); | ||
53 | 53 | ||
54 | static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, | 54 | /* |
55 | handle_t *handle, | 55 | * Structures which describe a path through a btree, and functions to |
56 | struct inode *inode, | 56 | * manipulate them. |
57 | int wanted, | 57 | * |
58 | struct ocfs2_alloc_context *meta_ac, | 58 | * The idea here is to be as generic as possible with the tree |
59 | struct buffer_head *bhs[]); | 59 | * manipulation code. |
60 | */ | ||
61 | struct ocfs2_path_item { | ||
62 | struct buffer_head *bh; | ||
63 | struct ocfs2_extent_list *el; | ||
64 | }; | ||
60 | 65 | ||
61 | static int ocfs2_add_branch(struct ocfs2_super *osb, | 66 | #define OCFS2_MAX_PATH_DEPTH 5 |
62 | handle_t *handle, | ||
63 | struct inode *inode, | ||
64 | struct buffer_head *fe_bh, | ||
65 | struct buffer_head *eb_bh, | ||
66 | struct buffer_head *last_eb_bh, | ||
67 | struct ocfs2_alloc_context *meta_ac); | ||
68 | 67 | ||
69 | static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | 68 | struct ocfs2_path { |
70 | handle_t *handle, | 69 | int p_tree_depth; |
71 | struct inode *inode, | 70 | struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; |
72 | struct buffer_head *fe_bh, | 71 | }; |
73 | struct ocfs2_alloc_context *meta_ac, | ||
74 | struct buffer_head **ret_new_eb_bh); | ||
75 | 72 | ||
76 | static int ocfs2_do_insert_extent(struct ocfs2_super *osb, | 73 | #define path_root_bh(_path) ((_path)->p_node[0].bh) |
77 | handle_t *handle, | 74 | #define path_root_el(_path) ((_path)->p_node[0].el) |
78 | struct inode *inode, | 75 | #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) |
79 | struct buffer_head *fe_bh, | 76 | #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) |
80 | u64 blkno, | 77 | #define path_num_items(_path) ((_path)->p_tree_depth + 1) |
81 | u32 new_clusters); | ||
82 | 78 | ||
83 | static int ocfs2_find_branch_target(struct ocfs2_super *osb, | 79 | /* |
84 | struct inode *inode, | 80 | * Reset the actual path elements so that we can re-use the structure |
85 | struct buffer_head *fe_bh, | 81 | * to build another path. Generally, this involves freeing the buffer |
86 | struct buffer_head **target_bh); | 82 | * heads. |
83 | */ | ||
84 | static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) | ||
85 | { | ||
86 | int i, start = 0, depth = 0; | ||
87 | struct ocfs2_path_item *node; | ||
87 | 88 | ||
88 | static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, | 89 | if (keep_root) |
89 | struct inode *inode, | 90 | start = 1; |
90 | struct ocfs2_dinode *fe, | 91 | |
91 | unsigned int new_i_clusters, | 92 | for(i = start; i < path_num_items(path); i++) { |
92 | struct buffer_head *old_last_eb, | 93 | node = &path->p_node[i]; |
93 | struct buffer_head **new_last_eb); | 94 | |
95 | brelse(node->bh); | ||
96 | node->bh = NULL; | ||
97 | node->el = NULL; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Tree depth may change during truncate, or insert. If we're | ||
102 | * keeping the root extent list, then make sure that our path | ||
103 | * structure reflects the proper depth. | ||
104 | */ | ||
105 | if (keep_root) | ||
106 | depth = le16_to_cpu(path_root_el(path)->l_tree_depth); | ||
107 | |||
108 | path->p_tree_depth = depth; | ||
109 | } | ||
110 | |||
111 | static void ocfs2_free_path(struct ocfs2_path *path) | ||
112 | { | ||
113 | if (path) { | ||
114 | ocfs2_reinit_path(path, 0); | ||
115 | kfree(path); | ||
116 | } | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Make the *dest path the same as src and re-initialize src path to | ||
121 | * have a root only. | ||
122 | */ | ||
123 | static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) | ||
124 | { | ||
125 | int i; | ||
126 | |||
127 | BUG_ON(path_root_bh(dest) != path_root_bh(src)); | ||
128 | |||
129 | for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { | ||
130 | brelse(dest->p_node[i].bh); | ||
131 | |||
132 | dest->p_node[i].bh = src->p_node[i].bh; | ||
133 | dest->p_node[i].el = src->p_node[i].el; | ||
134 | |||
135 | src->p_node[i].bh = NULL; | ||
136 | src->p_node[i].el = NULL; | ||
137 | } | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Insert an extent block at given index. | ||
142 | * | ||
143 | * This will not take an additional reference on eb_bh. | ||
144 | */ | ||
145 | static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, | ||
146 | struct buffer_head *eb_bh) | ||
147 | { | ||
148 | struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
149 | |||
150 | /* | ||
151 | * Right now, no root bh is an extent block, so this helps | ||
152 | * catch code errors with dinode trees. The assertion can be | ||
153 | * safely removed if we ever need to insert extent block | ||
154 | * structures at the root. | ||
155 | */ | ||
156 | BUG_ON(index == 0); | ||
157 | |||
158 | path->p_node[index].bh = eb_bh; | ||
159 | path->p_node[index].el = &eb->h_list; | ||
160 | } | ||
161 | |||
162 | static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, | ||
163 | struct ocfs2_extent_list *root_el) | ||
164 | { | ||
165 | struct ocfs2_path *path; | ||
166 | |||
167 | BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); | ||
168 | |||
169 | path = kzalloc(sizeof(*path), GFP_NOFS); | ||
170 | if (path) { | ||
171 | path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); | ||
172 | get_bh(root_bh); | ||
173 | path_root_bh(path) = root_bh; | ||
174 | path_root_el(path) = root_el; | ||
175 | } | ||
176 | |||
177 | return path; | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * Allocate and initialize a new path based on a disk inode tree. | ||
182 | */ | ||
183 | static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) | ||
184 | { | ||
185 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
186 | struct ocfs2_extent_list *el = &di->id2.i_list; | ||
187 | |||
188 | return ocfs2_new_path(di_bh, el); | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * Convenience function to journal all components in a path. | ||
193 | */ | ||
194 | static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, | ||
195 | struct ocfs2_path *path) | ||
196 | { | ||
197 | int i, ret = 0; | ||
198 | |||
199 | if (!path) | ||
200 | goto out; | ||
201 | |||
202 | for(i = 0; i < path_num_items(path); i++) { | ||
203 | ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, | ||
204 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
205 | if (ret < 0) { | ||
206 | mlog_errno(ret); | ||
207 | goto out; | ||
208 | } | ||
209 | } | ||
210 | |||
211 | out: | ||
212 | return ret; | ||
213 | } | ||
214 | |||
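The ocfs2_path helpers introduced above model one root-to-leaf descent of the btree as a fixed-size array indexed by depth, with the root at slot 0 and the leaf at slot p_tree_depth, which is what the path_root_*/path_leaf_* macros encode. Below is a toy standalone version of the same layout, with plain integers standing in for buffer heads and extent lists; all names are illustrative.

#include <stdio.h>

#define MAX_DEPTH 5

struct path_item { int block; };	/* stand-in for a buffer_head/extent_list pair */

struct path {
	int depth;
	struct path_item node[MAX_DEPTH];
};

#define path_root(p)  (&(p)->node[0])
#define path_leaf(p)  (&(p)->node[(p)->depth])

int main(void)
{
	struct path p = { .depth = 2 };

	p.node[0].block = 100;	/* root (the dinode) */
	p.node[1].block = 250;	/* interior extent block */
	p.node[2].block = 400;	/* leaf extent block */

	printf("root %d leaf %d\n", path_root(&p)->block, path_leaf(&p)->block);
	return 0;
}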
215 | enum ocfs2_contig_type { | ||
216 | CONTIG_NONE = 0, | ||
217 | CONTIG_LEFT, | ||
218 | CONTIG_RIGHT | ||
219 | }; | ||
94 | 220 | ||
95 | static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); | ||
96 | 221 | ||
97 | static int ocfs2_extent_contig(struct inode *inode, | 222 | /* |
98 | struct ocfs2_extent_rec *ext, | 223 | * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and |
99 | u64 blkno) | 224 | * ocfs2_extent_contig only work properly against leaf nodes! |
225 | */ | ||
226 | static int ocfs2_block_extent_contig(struct super_block *sb, | ||
227 | struct ocfs2_extent_rec *ext, | ||
228 | u64 blkno) | ||
229 | { | ||
230 | u64 blk_end = le64_to_cpu(ext->e_blkno); | ||
231 | |||
232 | blk_end += ocfs2_clusters_to_blocks(sb, | ||
233 | le16_to_cpu(ext->e_leaf_clusters)); | ||
234 | |||
235 | return blkno == blk_end; | ||
236 | } | ||
237 | |||
238 | static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, | ||
239 | struct ocfs2_extent_rec *right) | ||
240 | { | ||
241 | u32 left_range; | ||
242 | |||
243 | left_range = le32_to_cpu(left->e_cpos) + | ||
244 | le16_to_cpu(left->e_leaf_clusters); | ||
245 | |||
246 | return (left_range == le32_to_cpu(right->e_cpos)); | ||
247 | } | ||
248 | |||
249 | static enum ocfs2_contig_type | ||
250 | ocfs2_extent_contig(struct inode *inode, | ||
251 | struct ocfs2_extent_rec *ext, | ||
252 | struct ocfs2_extent_rec *insert_rec) | ||
100 | { | 253 | { |
101 | return blkno == (le64_to_cpu(ext->e_blkno) + | 254 | u64 blkno = le64_to_cpu(insert_rec->e_blkno); |
102 | ocfs2_clusters_to_blocks(inode->i_sb, | 255 | |
103 | le32_to_cpu(ext->e_clusters))); | 256 | if (ocfs2_extents_adjacent(ext, insert_rec) && |
257 | ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) | ||
258 | return CONTIG_RIGHT; | ||
259 | |||
260 | blkno = le64_to_cpu(ext->e_blkno); | ||
261 | if (ocfs2_extents_adjacent(insert_rec, ext) && | ||
262 | ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) | ||
263 | return CONTIG_LEFT; | ||
264 | |||
265 | return CONTIG_NONE; | ||
104 | } | 266 | } |
105 | 267 | ||
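The contiguity helpers above declare an insert candidate right-contiguous with an existing record when it starts exactly where that record ends, both in logical clusters and in physical blocks (and symmetrically for left-contiguity). The standalone numbers below walk through that check once; the field names and the clusters-to-blocks ratio are invented for the example.

#include <stdio.h>
#include <stdint.h>

#define BLOCKS_PER_CLUSTER 8	/* made-up ratio for the example */

struct rec {
	uint32_t cpos;		/* logical start, in clusters */
	uint16_t clusters;	/* length, in clusters */
	uint64_t blkno;		/* physical start, in blocks */
};

/* Does "right" begin exactly where "left" ends, logically and physically? */
static int contig_right(const struct rec *left, const struct rec *right)
{
	return left->cpos + left->clusters == right->cpos &&
	       left->blkno + (uint64_t)left->clusters * BLOCKS_PER_CLUSTER ==
	       right->blkno;
}

int main(void)
{
	struct rec ext = { .cpos = 0, .clusters = 4, .blkno = 1000 };
	struct rec ins = { .cpos = 4, .clusters = 2, .blkno = 1032 };

	printf("right-contiguous: %d\n", contig_right(&ext, &ins));	/* 1 */
	printf("left-contiguous:  %d\n", contig_right(&ins, &ext));	/* 0 */
	return 0;
}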
106 | /* | 268 | /* |
269 | * NOTE: We can have pretty much any combination of contiguousness and | ||
270 | * appending. | ||
271 | * | ||
272 | * The usefulness of APPEND_TAIL is more in that it lets us know that | ||
273 | * we'll have to update the path to that leaf. | ||
274 | */ | ||
275 | enum ocfs2_append_type { | ||
276 | APPEND_NONE = 0, | ||
277 | APPEND_TAIL, | ||
278 | }; | ||
279 | |||
280 | struct ocfs2_insert_type { | ||
281 | enum ocfs2_append_type ins_appending; | ||
282 | enum ocfs2_contig_type ins_contig; | ||
283 | int ins_contig_index; | ||
284 | int ins_free_records; | ||
285 | int ins_tree_depth; | ||
286 | }; | ||
287 | |||
288 | /* | ||
107 | * How many free extents have we got before we need more meta data? | 289 | * How many free extents have we got before we need more meta data? |
108 | */ | 290 | */ |
109 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | 291 | int ocfs2_num_free_extents(struct ocfs2_super *osb, |
@@ -242,6 +424,28 @@ bail: | |||
242 | } | 424 | } |
243 | 425 | ||
244 | /* | 426 | /* |
427 | * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). | ||
428 | * | ||
429 | * Returns the sum of the rightmost extent rec logical offset and | ||
430 | * cluster count. | ||
431 | * | ||
432 | * ocfs2_add_branch() uses this to determine what logical cluster | ||
433 | * value should be populated into the leftmost new branch records. | ||
434 | * | ||
435 | * ocfs2_shift_tree_depth() uses this to determine the # clusters | ||
436 | * value for the new topmost tree record. | ||
437 | */ | ||
438 | static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) | ||
439 | { | ||
440 | int i; | ||
441 | |||
442 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
443 | |||
444 | return le32_to_cpu(el->l_recs[i].e_cpos) + | ||
445 | ocfs2_rec_clusters(el, &el->l_recs[i]); | ||
446 | } | ||
447 | |||
448 | /* | ||
245 | * Add an entire tree branch to our inode. eb_bh is the extent block | 449 | * Add an entire tree branch to our inode. eb_bh is the extent block |
246 | * to start at, if we don't want to start the branch at the dinode | 450 | * to start at, if we don't want to start the branch at the dinode |
247 | * structure. | 451 | * structure. |
@@ -250,7 +454,7 @@ bail: | |||
250 | * for the new last extent block. | 454 | * for the new last extent block. |
251 | * | 455 | * |
252 | * the new branch will be 'empty' in the sense that every block will | 456 | * the new branch will be 'empty' in the sense that every block will |
253 | * contain a single record with e_clusters == 0. | 457 | * contain a single record with cluster count == 0. |
254 | */ | 458 | */ |
255 | static int ocfs2_add_branch(struct ocfs2_super *osb, | 459 | static int ocfs2_add_branch(struct ocfs2_super *osb, |
256 | handle_t *handle, | 460 | handle_t *handle, |
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, | |||
268 | struct ocfs2_extent_block *eb; | 472 | struct ocfs2_extent_block *eb; |
269 | struct ocfs2_extent_list *eb_el; | 473 | struct ocfs2_extent_list *eb_el; |
270 | struct ocfs2_extent_list *el; | 474 | struct ocfs2_extent_list *el; |
475 | u32 new_cpos; | ||
271 | 476 | ||
272 | mlog_entry_void(); | 477 | mlog_entry_void(); |
273 | 478 | ||
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, | |||
302 | goto bail; | 507 | goto bail; |
303 | } | 508 | } |
304 | 509 | ||
510 | eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; | ||
511 | new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); | ||
512 | |||
305 | /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be | 513 | /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be |
306 | * linked with the rest of the tree. | 514 | * linked with the rest of the tree. |
307 | * conversely, new_eb_bhs[0] is the new bottommost leaf. | 515 | * conversely, new_eb_bhs[0] is the new bottommost leaf. |
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, | |||
330 | eb->h_next_leaf_blk = 0; | 538 | eb->h_next_leaf_blk = 0; |
331 | eb_el->l_tree_depth = cpu_to_le16(i); | 539 | eb_el->l_tree_depth = cpu_to_le16(i); |
332 | eb_el->l_next_free_rec = cpu_to_le16(1); | 540 | eb_el->l_next_free_rec = cpu_to_le16(1); |
333 | eb_el->l_recs[0].e_cpos = fe->i_clusters; | 541 | /* |
542 | * This actually counts as an empty extent as | ||
543 | * c_clusters == 0 | ||
544 | */ | ||
545 | eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); | ||
334 | eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); | 546 | eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); |
335 | eb_el->l_recs[0].e_clusters = cpu_to_le32(0); | 547 | /* |
548 | * eb_el isn't always an interior node, but even leaf | ||
549 | * nodes want a zero'd flags and reserved field so | ||
550 | * this gets the whole 32 bits regardless of use. | ||
551 | */ | ||
552 | eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); | ||
336 | if (!eb_el->l_tree_depth) | 553 | if (!eb_el->l_tree_depth) |
337 | new_last_eb_blk = le64_to_cpu(eb->h_blkno); | 554 | new_last_eb_blk = le64_to_cpu(eb->h_blkno); |
338 | 555 | ||
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, | |||
376 | * either be on the fe, or the extent block passed in. */ | 593 | * either be on the fe, or the extent block passed in. */ |
377 | i = le16_to_cpu(el->l_next_free_rec); | 594 | i = le16_to_cpu(el->l_next_free_rec); |
378 | el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); | 595 | el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); |
379 | el->l_recs[i].e_cpos = fe->i_clusters; | 596 | el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); |
380 | el->l_recs[i].e_clusters = 0; | 597 | el->l_recs[i].e_int_clusters = 0; |
381 | le16_add_cpu(&el->l_next_free_rec, 1); | 598 | le16_add_cpu(&el->l_next_free_rec, 1); |
382 | 599 | ||
383 | /* fe needs a new last extent block pointer, as does the | 600 | /* fe needs a new last extent block pointer, as does the |
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | |||
425 | struct buffer_head **ret_new_eb_bh) | 642 | struct buffer_head **ret_new_eb_bh) |
426 | { | 643 | { |
427 | int status, i; | 644 | int status, i; |
645 | u32 new_clusters; | ||
428 | struct buffer_head *new_eb_bh = NULL; | 646 | struct buffer_head *new_eb_bh = NULL; |
429 | struct ocfs2_dinode *fe; | 647 | struct ocfs2_dinode *fe; |
430 | struct ocfs2_extent_block *eb; | 648 | struct ocfs2_extent_block *eb; |
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | |||
461 | /* copy the fe data into the new extent block */ | 679 | /* copy the fe data into the new extent block */ |
462 | eb_el->l_tree_depth = fe_el->l_tree_depth; | 680 | eb_el->l_tree_depth = fe_el->l_tree_depth; |
463 | eb_el->l_next_free_rec = fe_el->l_next_free_rec; | 681 | eb_el->l_next_free_rec = fe_el->l_next_free_rec; |
464 | for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { | 682 | for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) |
465 | eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; | 683 | eb_el->l_recs[i] = fe_el->l_recs[i]; |
466 | eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; | ||
467 | eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; | ||
468 | } | ||
469 | 684 | ||
470 | status = ocfs2_journal_dirty(handle, new_eb_bh); | 685 | status = ocfs2_journal_dirty(handle, new_eb_bh); |
471 | if (status < 0) { | 686 | if (status < 0) { |
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | |||
480 | goto bail; | 695 | goto bail; |
481 | } | 696 | } |
482 | 697 | ||
698 | new_clusters = ocfs2_sum_rightmost_rec(eb_el); | ||
699 | |||
483 | /* update fe now */ | 700 | /* update fe now */ |
484 | le16_add_cpu(&fe_el->l_tree_depth, 1); | 701 | le16_add_cpu(&fe_el->l_tree_depth, 1); |
485 | fe_el->l_recs[0].e_cpos = 0; | 702 | fe_el->l_recs[0].e_cpos = 0; |
486 | fe_el->l_recs[0].e_blkno = eb->h_blkno; | 703 | fe_el->l_recs[0].e_blkno = eb->h_blkno; |
487 | fe_el->l_recs[0].e_clusters = fe->i_clusters; | 704 | fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); |
488 | for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { | 705 | for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) |
489 | fe_el->l_recs[i].e_cpos = 0; | 706 | memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); |
490 | fe_el->l_recs[i].e_clusters = 0; | ||
491 | fe_el->l_recs[i].e_blkno = 0; | ||
492 | } | ||
493 | fe_el->l_next_free_rec = cpu_to_le16(1); | 707 | fe_el->l_next_free_rec = cpu_to_le16(1); |
494 | 708 | ||
495 | /* If this is our 1st tree depth shift, then last_eb_blk | 709 | /* If this is our 1st tree depth shift, then last_eb_blk |
@@ -515,199 +729,6 @@ bail: | |||
515 | } | 729 | } |
516 | 730 | ||
517 | /* | 731 | /* |
518 | * Expects the tree to already have room in the rightmost leaf for the | ||
519 | * extent. Updates all the extent blocks (and the dinode) on the way | ||
520 | * down. | ||
521 | */ | ||
522 | static int ocfs2_do_insert_extent(struct ocfs2_super *osb, | ||
523 | handle_t *handle, | ||
524 | struct inode *inode, | ||
525 | struct buffer_head *fe_bh, | ||
526 | u64 start_blk, | ||
527 | u32 new_clusters) | ||
528 | { | ||
529 | int status, i, num_bhs = 0; | ||
530 | u64 next_blkno; | ||
531 | u16 next_free; | ||
532 | struct buffer_head **eb_bhs = NULL; | ||
533 | struct ocfs2_dinode *fe; | ||
534 | struct ocfs2_extent_block *eb; | ||
535 | struct ocfs2_extent_list *el; | ||
536 | |||
537 | mlog_entry_void(); | ||
538 | |||
539 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
540 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
541 | if (status < 0) { | ||
542 | mlog_errno(status); | ||
543 | goto bail; | ||
544 | } | ||
545 | |||
546 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
547 | el = &fe->id2.i_list; | ||
548 | if (el->l_tree_depth) { | ||
549 | /* This is another operation where we want to be | ||
550 | * careful about our tree updates. An error here means | ||
551 | * none of the previous changes we made should roll | ||
552 | * forward. As a result, we have to record the buffers | ||
553 | * for this part of the tree in an array and reserve a | ||
554 | * journal write to them before making any changes. */ | ||
555 | num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); | ||
556 | eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), | ||
557 | GFP_KERNEL); | ||
558 | if (!eb_bhs) { | ||
559 | status = -ENOMEM; | ||
560 | mlog_errno(status); | ||
561 | goto bail; | ||
562 | } | ||
563 | |||
564 | i = 0; | ||
565 | while(el->l_tree_depth) { | ||
566 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
567 | if (next_free == 0) { | ||
568 | ocfs2_error(inode->i_sb, | ||
569 | "Dinode %llu has a bad extent list", | ||
570 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
571 | status = -EIO; | ||
572 | goto bail; | ||
573 | } | ||
574 | next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); | ||
575 | |||
576 | BUG_ON(i >= num_bhs); | ||
577 | status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], | ||
578 | OCFS2_BH_CACHED, inode); | ||
579 | if (status < 0) { | ||
580 | mlog_errno(status); | ||
581 | goto bail; | ||
582 | } | ||
583 | eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; | ||
584 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
585 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | ||
586 | eb); | ||
587 | status = -EIO; | ||
588 | goto bail; | ||
589 | } | ||
590 | |||
591 | status = ocfs2_journal_access(handle, inode, eb_bhs[i], | ||
592 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
593 | if (status < 0) { | ||
594 | mlog_errno(status); | ||
595 | goto bail; | ||
596 | } | ||
597 | |||
598 | el = &eb->h_list; | ||
599 | i++; | ||
600 | /* When we leave this loop, eb_bhs[num_bhs - 1] will | ||
601 | * hold the bottom-most leaf extent block. */ | ||
602 | } | ||
603 | BUG_ON(el->l_tree_depth); | ||
604 | |||
605 | el = &fe->id2.i_list; | ||
606 | /* If we have tree depth, then the fe update is | ||
607 | * trivial, and we want to switch el out for the | ||
608 | * bottom-most leaf in order to update it with the | ||
609 | * actual extent data below. */ | ||
610 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
611 | if (next_free == 0) { | ||
612 | ocfs2_error(inode->i_sb, | ||
613 | "Dinode %llu has a bad extent list", | ||
614 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
615 | status = -EIO; | ||
616 | goto bail; | ||
617 | } | ||
618 | le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, | ||
619 | new_clusters); | ||
620 | /* (num_bhs - 1) to avoid the leaf */ | ||
621 | for(i = 0; i < (num_bhs - 1); i++) { | ||
622 | eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; | ||
623 | el = &eb->h_list; | ||
624 | |||
625 | /* finally, make our actual change to the | ||
626 | * intermediate extent blocks. */ | ||
627 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
628 | le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, | ||
629 | new_clusters); | ||
630 | |||
631 | status = ocfs2_journal_dirty(handle, eb_bhs[i]); | ||
632 | if (status < 0) | ||
633 | mlog_errno(status); | ||
634 | } | ||
635 | BUG_ON(i != (num_bhs - 1)); | ||
636 | /* note that the leaf block wasn't touched in | ||
637 | * the loop above */ | ||
638 | eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; | ||
639 | el = &eb->h_list; | ||
640 | BUG_ON(el->l_tree_depth); | ||
641 | } | ||
642 | |||
643 | /* yay, we can finally add the actual extent now! */ | ||
644 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
645 | if (le16_to_cpu(el->l_next_free_rec) && | ||
646 | ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { | ||
647 | le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); | ||
648 | } else if (le16_to_cpu(el->l_next_free_rec) && | ||
649 | (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { | ||
650 | /* having an empty extent at eof is legal. */ | ||
651 | if (el->l_recs[i].e_cpos != fe->i_clusters) { | ||
652 | ocfs2_error(inode->i_sb, | ||
653 | "Dinode %llu trailing extent is bad: " | ||
654 | "cpos (%u) != number of clusters (%u)", | ||
655 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
656 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
657 | le32_to_cpu(fe->i_clusters)); | ||
658 | status = -EIO; | ||
659 | goto bail; | ||
660 | } | ||
661 | el->l_recs[i].e_blkno = cpu_to_le64(start_blk); | ||
662 | el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); | ||
663 | } else { | ||
664 | /* No contiguous record, or no empty record at eof, so | ||
665 | * we add a new one. */ | ||
666 | |||
667 | BUG_ON(le16_to_cpu(el->l_next_free_rec) >= | ||
668 | le16_to_cpu(el->l_count)); | ||
669 | i = le16_to_cpu(el->l_next_free_rec); | ||
670 | |||
671 | el->l_recs[i].e_blkno = cpu_to_le64(start_blk); | ||
672 | el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); | ||
673 | el->l_recs[i].e_cpos = fe->i_clusters; | ||
674 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
675 | } | ||
676 | |||
677 | /* | ||
678 | * extent_map errors are not fatal, so they are ignored outside | ||
679 | * of flushing the thing. | ||
680 | */ | ||
681 | status = ocfs2_extent_map_append(inode, &el->l_recs[i], | ||
682 | new_clusters); | ||
683 | if (status) { | ||
684 | mlog_errno(status); | ||
685 | ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); | ||
686 | } | ||
687 | |||
688 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
689 | if (status < 0) | ||
690 | mlog_errno(status); | ||
691 | if (fe->id2.i_list.l_tree_depth) { | ||
692 | status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); | ||
693 | if (status < 0) | ||
694 | mlog_errno(status); | ||
695 | } | ||
696 | |||
697 | status = 0; | ||
698 | bail: | ||
699 | if (eb_bhs) { | ||
700 | for (i = 0; i < num_bhs; i++) | ||
701 | if (eb_bhs[i]) | ||
702 | brelse(eb_bhs[i]); | ||
703 | kfree(eb_bhs); | ||
704 | } | ||
705 | |||
706 | mlog_exit(status); | ||
707 | return status; | ||
708 | } | ||
709 | |||
710 | /* | ||
711 | * Should only be called when there is no space left in any of the | 732 | * Should only be called when there is no space left in any of the |
712 | * leaf nodes. What we want to do is find the lowest tree depth | 733 | * leaf nodes. What we want to do is find the lowest tree depth |
713 | * non-leaf extent block with room for new records. There are three | 734 | * non-leaf extent block with room for new records. There are three |
@@ -807,53 +828,1548 @@ bail: | |||
807 | return status; | 828 | return status; |
808 | } | 829 | } |
809 | 830 | ||
810 | /* the caller needs to update fe->i_clusters */ | 831 | /* |
811 | int ocfs2_insert_extent(struct ocfs2_super *osb, | 832 | * This is only valid for leaf nodes, which are the only ones that can |
812 | handle_t *handle, | 833 | * have empty extents anyway. |
813 | struct inode *inode, | 834 | */ |
814 | struct buffer_head *fe_bh, | 835 | static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) |
815 | u64 start_blk, | ||
816 | u32 new_clusters, | ||
817 | struct ocfs2_alloc_context *meta_ac) | ||
818 | { | 836 | { |
819 | int status, i, shift; | 837 | return !rec->e_leaf_clusters; |
820 | struct buffer_head *last_eb_bh = NULL; | 838 | } |
839 | |||
840 | /* | ||
841 | * This function will discard the rightmost extent record. | ||
842 | */ | ||
843 | static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) | ||
844 | { | ||
845 | int next_free = le16_to_cpu(el->l_next_free_rec); | ||
846 | int count = le16_to_cpu(el->l_count); | ||
847 | unsigned int num_bytes; | ||
848 | |||
849 | BUG_ON(!next_free); | ||
850 | /* This will cause us to go off the end of our extent list. */ | ||
851 | BUG_ON(next_free >= count); | ||
852 | |||
853 | num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; | ||
854 | |||
855 | memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); | ||
856 | } | ||
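
A minimal userspace sketch of the shift above, using an invented toy_rec type rather than struct ocfs2_extent_rec (field names and values are illustrative only): the live records move up one slot, and slot 0 keeps a stale copy that the caller is expected to overwrite, e.g. with the empty record that ocfs2_create_empty_extent() writes.

#include <stdio.h>
#include <string.h>

struct toy_rec { unsigned int cpos, clusters; };

int main(void)
{
	struct toy_rec recs[4] = { {0, 4}, {4, 8}, {12, 2}, {0, 0} };
	int next_free = 3;	/* three live records, one free slot */

	/* Same pattern as ocfs2_shift_records_right(): recs[0..2] move
	 * to recs[1..3]; the free slot on the right absorbs the shift. */
	memmove(&recs[1], &recs[0], sizeof(struct toy_rec) * next_free);

	/* Caller's job: reuse slot 0, here cleared to an "empty" record. */
	memset(&recs[0], 0, sizeof(struct toy_rec));

	for (int i = 0; i < 4; i++)
		printf("rec[%d]: cpos=%u clusters=%u\n",
		       i, recs[i].cpos, recs[i].clusters);
	return 0;
}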
857 | |||
858 | static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, | ||
859 | struct ocfs2_extent_rec *insert_rec) | ||
860 | { | ||
861 | int i, insert_index, next_free, has_empty, num_bytes; | ||
862 | u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); | ||
863 | struct ocfs2_extent_rec *rec; | ||
864 | |||
865 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
866 | has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); | ||
867 | |||
868 | BUG_ON(!next_free); | ||
869 | |||
870 | /* The tree code before us didn't allow enough room in the leaf. */ | ||
871 | if (el->l_next_free_rec == el->l_count && !has_empty) | ||
872 | BUG(); | ||
873 | |||
874 | /* | ||
875 | * The easiest way to approach this is to just remove the | ||
876 | * empty extent and temporarily decrement next_free. | ||
877 | */ | ||
878 | if (has_empty) { | ||
879 | /* | ||
880 | * If next_free was 1 (only an empty extent), this | ||
881 | * loop won't execute, which is fine. We still want | ||
882 | * the decrement above to happen. | ||
883 | */ | ||
884 | for(i = 0; i < (next_free - 1); i++) | ||
885 | el->l_recs[i] = el->l_recs[i+1]; | ||
886 | |||
887 | next_free--; | ||
888 | } | ||
889 | |||
890 | /* | ||
891 | * Figure out what the new record index should be. | ||
892 | */ | ||
893 | for(i = 0; i < next_free; i++) { | ||
894 | rec = &el->l_recs[i]; | ||
895 | |||
896 | if (insert_cpos < le32_to_cpu(rec->e_cpos)) | ||
897 | break; | ||
898 | } | ||
899 | insert_index = i; | ||
900 | |||
901 | mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", | ||
902 | insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); | ||
903 | |||
904 | BUG_ON(insert_index < 0); | ||
905 | BUG_ON(insert_index >= le16_to_cpu(el->l_count)); | ||
906 | BUG_ON(insert_index > next_free); | ||
907 | |||
908 | /* | ||
909 | * No need to memmove if we're just adding to the tail. | ||
910 | */ | ||
911 | if (insert_index != next_free) { | ||
912 | BUG_ON(next_free >= le16_to_cpu(el->l_count)); | ||
913 | |||
914 | num_bytes = next_free - insert_index; | ||
915 | num_bytes *= sizeof(struct ocfs2_extent_rec); | ||
916 | memmove(&el->l_recs[insert_index + 1], | ||
917 | &el->l_recs[insert_index], | ||
918 | num_bytes); | ||
919 | } | ||
920 | |||
921 | /* | ||
922 | * Either we had an empty extent, and need to re-increment or | ||
923 | * there was no empty extent on a non full rightmost leaf node, | ||
924 | * in which case we still need to increment. | ||
925 | */ | ||
926 | next_free++; | ||
927 | el->l_next_free_rec = cpu_to_le16(next_free); | ||
928 | /* | ||
929 | * Make sure none of the math above just messed up our tree. | ||
930 | */ | ||
931 | BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); | ||
932 | |||
933 | el->l_recs[insert_index] = *insert_rec; | ||
934 | |||
935 | } | ||
936 | |||
937 | /* | ||
938 | * Create an empty extent record. | ||
939 | * | ||
940 | * l_next_free_rec may be updated. | ||
941 | * | ||
942 | * If an empty extent already exists do nothing. | ||
943 | */ | ||
944 | static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) | ||
945 | { | ||
946 | int next_free = le16_to_cpu(el->l_next_free_rec); | ||
947 | |||
948 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | ||
949 | |||
950 | if (next_free == 0) | ||
951 | goto set_and_inc; | ||
952 | |||
953 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | ||
954 | return; | ||
955 | |||
956 | mlog_bug_on_msg(el->l_count == el->l_next_free_rec, | ||
957 | "Asked to create an empty extent in a full list:\n" | ||
958 | "count = %u, tree depth = %u", | ||
959 | le16_to_cpu(el->l_count), | ||
960 | le16_to_cpu(el->l_tree_depth)); | ||
961 | |||
962 | ocfs2_shift_records_right(el); | ||
963 | |||
964 | set_and_inc: | ||
965 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
966 | memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); | ||
967 | } | ||
968 | |||
969 | /* | ||
970 | * For a rotation which involves two leaf nodes, the "root node" is | ||
971 | * the lowest level tree node which contains a path to both leaves. The | ||
972 | * resulting set of information can be used to form a complete "subtree" | ||
973 | * | ||
974 | * This function is passed two full paths from the dinode down to a | ||
975 | * pair of adjacent leaves. Its task is to figure out which path | ||
976 | * index contains the subtree root - this can be the root index itself | ||
977 | * in a worst-case rotation. | ||
978 | * | ||
979 | * The array index of the subtree root is passed back. | ||
980 | */ | ||
981 | static int ocfs2_find_subtree_root(struct inode *inode, | ||
982 | struct ocfs2_path *left, | ||
983 | struct ocfs2_path *right) | ||
984 | { | ||
985 | int i = 0; | ||
986 | |||
987 | /* | ||
988 | * Check that the caller passed in two paths from the same tree. | ||
989 | */ | ||
990 | BUG_ON(path_root_bh(left) != path_root_bh(right)); | ||
991 | |||
992 | do { | ||
993 | i++; | ||
994 | |||
995 | /* | ||
996 | * The caller didn't pass two adjacent paths. | ||
997 | */ | ||
998 | mlog_bug_on_msg(i > left->p_tree_depth, | ||
999 | "Inode %lu, left depth %u, right depth %u\n" | ||
1000 | "left leaf blk %llu, right leaf blk %llu\n", | ||
1001 | inode->i_ino, left->p_tree_depth, | ||
1002 | right->p_tree_depth, | ||
1003 | (unsigned long long)path_leaf_bh(left)->b_blocknr, | ||
1004 | (unsigned long long)path_leaf_bh(right)->b_blocknr); | ||
1005 | } while (left->p_node[i].bh->b_blocknr == | ||
1006 | right->p_node[i].bh->b_blocknr); | ||
1007 | |||
1008 | return i - 1; | ||
1009 | } | ||
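
The walk above can be pictured with just the block numbers along each path. A small standalone sketch (find_subtree_root and the block numbers below are made up for illustration, not kernel code): index 0 is the shared root, and the subtree root is the deepest index at which the two adjacent paths still point at the same block.

#include <stdio.h>

static int find_subtree_root(const unsigned long long *left,
			     const unsigned long long *right, int depth)
{
	int i = 0;

	/* Mirrors the loop above: step down until the paths diverge,
	 * then report the last index where they still matched. */
	do {
		i++;
	} while (i <= depth && left[i] == right[i]);

	return i - 1;
}

int main(void)
{
	/* Depth-3 example: root and first interior block are shared,
	 * the paths split one level below that. */
	unsigned long long left[]  = { 100, 200, 310, 400 };
	unsigned long long right[] = { 100, 200, 320, 500 };

	printf("subtree root at path index %d\n",
	       find_subtree_root(left, right, 3));
	return 0;
}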
1010 | |||
1011 | typedef void (path_insert_t)(void *, struct buffer_head *); | ||
1012 | |||
1013 | /* | ||
1014 | * Traverse a btree path in search of cpos, starting at root_el. | ||
1015 | * | ||
1016 | * This code can be called with a cpos larger than the tree, in which | ||
1017 | * case it will return the rightmost path. | ||
1018 | */ | ||
1019 | static int __ocfs2_find_path(struct inode *inode, | ||
1020 | struct ocfs2_extent_list *root_el, u32 cpos, | ||
1021 | path_insert_t *func, void *data) | ||
1022 | { | ||
1023 | int i, ret = 0; | ||
1024 | u32 range; | ||
1025 | u64 blkno; | ||
821 | struct buffer_head *bh = NULL; | 1026 | struct buffer_head *bh = NULL; |
822 | struct ocfs2_dinode *fe; | ||
823 | struct ocfs2_extent_block *eb; | 1027 | struct ocfs2_extent_block *eb; |
824 | struct ocfs2_extent_list *el; | 1028 | struct ocfs2_extent_list *el; |
1029 | struct ocfs2_extent_rec *rec; | ||
1030 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
825 | 1031 | ||
826 | mlog_entry_void(); | 1032 | el = root_el; |
1033 | while (el->l_tree_depth) { | ||
1034 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | ||
1035 | ocfs2_error(inode->i_sb, | ||
1036 | "Inode %llu has empty extent list at " | ||
1037 | "depth %u\n", | ||
1038 | (unsigned long long)oi->ip_blkno, | ||
1039 | le16_to_cpu(el->l_tree_depth)); | ||
1040 | ret = -EROFS; | ||
1041 | goto out; | ||
827 | 1042 | ||
828 | mlog(0, "add %u clusters starting at block %llu to inode %llu\n", | 1043 | } |
829 | new_clusters, (unsigned long long)start_blk, | ||
830 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
831 | 1044 | ||
832 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | 1045 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { |
833 | el = &fe->id2.i_list; | 1046 | rec = &el->l_recs[i]; |
1047 | |||
1048 | /* | ||
1049 | * In the case that cpos is off the allocation | ||
1050 | * tree, this should just wind up returning the | ||
1051 | * rightmost record. | ||
1052 | */ | ||
1053 | range = le32_to_cpu(rec->e_cpos) + | ||
1054 | ocfs2_rec_clusters(el, rec); | ||
1055 | if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) | ||
1056 | break; | ||
1057 | } | ||
834 | 1058 | ||
835 | if (el->l_tree_depth) { | 1059 | blkno = le64_to_cpu(el->l_recs[i].e_blkno); |
836 | /* jump to end of tree */ | 1060 | if (blkno == 0) { |
837 | status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | 1061 | ocfs2_error(inode->i_sb, |
838 | &last_eb_bh, OCFS2_BH_CACHED, inode); | 1062 | "Inode %llu has bad blkno in extent list " |
839 | if (status < 0) { | 1063 | "at depth %u (index %d)\n", |
840 | mlog_exit(status); | 1064 | (unsigned long long)oi->ip_blkno, |
841 | goto bail; | 1065 | le16_to_cpu(el->l_tree_depth), i); |
1066 | ret = -EROFS; | ||
1067 | goto out; | ||
842 | } | 1068 | } |
843 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | 1069 | |
1070 | brelse(bh); | ||
1071 | bh = NULL; | ||
1072 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, | ||
1073 | &bh, OCFS2_BH_CACHED, inode); | ||
1074 | if (ret) { | ||
1075 | mlog_errno(ret); | ||
1076 | goto out; | ||
1077 | } | ||
1078 | |||
1079 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
844 | el = &eb->h_list; | 1080 | el = &eb->h_list; |
1081 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1082 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1083 | ret = -EIO; | ||
1084 | goto out; | ||
1085 | } | ||
1086 | |||
1087 | if (le16_to_cpu(el->l_next_free_rec) > | ||
1088 | le16_to_cpu(el->l_count)) { | ||
1089 | ocfs2_error(inode->i_sb, | ||
1090 | "Inode %llu has bad count in extent list " | ||
1091 | "at block %llu (next free=%u, count=%u)\n", | ||
1092 | (unsigned long long)oi->ip_blkno, | ||
1093 | (unsigned long long)bh->b_blocknr, | ||
1094 | le16_to_cpu(el->l_next_free_rec), | ||
1095 | le16_to_cpu(el->l_count)); | ||
1096 | ret = -EROFS; | ||
1097 | goto out; | ||
1098 | } | ||
1099 | |||
1100 | if (func) | ||
1101 | func(data, bh); | ||
1102 | } | ||
1103 | |||
1104 | out: | ||
1105 | /* | ||
1106 | * Catch any trailing bh that the loop didn't handle. | ||
1107 | */ | ||
1108 | brelse(bh); | ||
1109 | |||
1110 | return ret; | ||
1111 | } | ||
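
The record chosen at each level follows a simple rule, restated below on a toy extent list (toy_rec and pick_record are invented names, values illustrative): take the first live record whose [e_cpos, e_cpos + clusters) range covers the target cpos, and fall back to the rightmost record when cpos lies past the end of the tree.

#include <stdio.h>

struct toy_rec { unsigned int cpos, clusters; };

static int pick_record(const struct toy_rec *recs, int next_free,
		       unsigned int cpos)
{
	int i;

	for (i = 0; i < next_free - 1; i++) {
		unsigned int range = recs[i].cpos + recs[i].clusters;

		if (cpos >= recs[i].cpos && cpos < range)
			break;
	}
	/* Falls through to the rightmost record if nothing matched. */
	return i;
}

int main(void)
{
	struct toy_rec recs[] = { {0, 16}, {16, 16}, {32, 16} };

	printf("cpos 20  -> record %d\n", pick_record(recs, 3, 20));
	printf("cpos 999 -> record %d\n", pick_record(recs, 3, 999));
	return 0;
}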
1112 | |||
1113 | /* | ||
1114 | * Given an initialized path (that is, it has a valid root extent | ||
1115 | * list), this function will traverse the btree in search of the path | ||
1116 | * which would contain cpos. | ||
1117 | * | ||
1118 | * The path traveled is recorded in the path structure. | ||
1119 | * | ||
1120 | * Note that this will not do any comparisons on leaf node extent | ||
1121 | * records, so it will work fine in the case that we just added a tree | ||
1122 | * branch. | ||
1123 | */ | ||
1124 | struct find_path_data { | ||
1125 | int index; | ||
1126 | struct ocfs2_path *path; | ||
1127 | }; | ||
1128 | static void find_path_ins(void *data, struct buffer_head *bh) | ||
1129 | { | ||
1130 | struct find_path_data *fp = data; | ||
1131 | |||
1132 | get_bh(bh); | ||
1133 | ocfs2_path_insert_eb(fp->path, fp->index, bh); | ||
1134 | fp->index++; | ||
1135 | } | ||
1136 | static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, | ||
1137 | u32 cpos) | ||
1138 | { | ||
1139 | struct find_path_data data; | ||
1140 | |||
1141 | data.index = 1; | ||
1142 | data.path = path; | ||
1143 | return __ocfs2_find_path(inode, path_root_el(path), cpos, | ||
1144 | find_path_ins, &data); | ||
1145 | } | ||
1146 | |||
1147 | static void find_leaf_ins(void *data, struct buffer_head *bh) | ||
1148 | { | ||
1149 | struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; | ||
1150 | struct ocfs2_extent_list *el = &eb->h_list; | ||
1151 | struct buffer_head **ret = data; | ||
1152 | |||
1153 | /* We want to retain only the leaf block. */ | ||
1154 | if (le16_to_cpu(el->l_tree_depth) == 0) { | ||
1155 | get_bh(bh); | ||
1156 | *ret = bh; | ||
1157 | } | ||
1158 | } | ||
1159 | /* | ||
1160 | * Find the leaf block in the tree which would contain cpos. No | ||
1161 | * checking of the actual leaf is done. | ||
1162 | * | ||
1163 | * Some paths want to call this instead of allocating a path structure | ||
1164 | * and calling ocfs2_find_path(). | ||
1165 | * | ||
1166 | * This function doesn't handle non btree extent lists. | ||
1167 | */ | ||
1168 | int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, | ||
1169 | u32 cpos, struct buffer_head **leaf_bh) | ||
1170 | { | ||
1171 | int ret; | ||
1172 | struct buffer_head *bh = NULL; | ||
1173 | |||
1174 | ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); | ||
1175 | if (ret) { | ||
1176 | mlog_errno(ret); | ||
1177 | goto out; | ||
1178 | } | ||
1179 | |||
1180 | *leaf_bh = bh; | ||
1181 | out: | ||
1182 | return ret; | ||
1183 | } | ||
1184 | |||
1185 | /* | ||
1186 | * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. | ||
1187 | * | ||
1188 | * Basically, we've moved stuff around at the bottom of the tree and | ||
1189 | * we need to fix up the extent records above the changes to reflect | ||
1190 | * the new changes. | ||
1191 | * | ||
1192 | * left_rec: the record on the left. | ||
1193 | * left_child_el: the child list pointed to by left_rec | ||
1194 | * right_rec: the record to the right of left_rec | ||
1195 | * right_child_el: the child list pointed to by right_rec | ||
1196 | * | ||
1197 | * By definition, this only works on interior nodes. | ||
1198 | */ | ||
1199 | static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, | ||
1200 | struct ocfs2_extent_list *left_child_el, | ||
1201 | struct ocfs2_extent_rec *right_rec, | ||
1202 | struct ocfs2_extent_list *right_child_el) | ||
1203 | { | ||
1204 | u32 left_clusters, right_end; | ||
1205 | |||
1206 | /* | ||
1207 | * Interior nodes never have holes. Their cpos is the cpos of | ||
1208 | * the leftmost record in their child list. Their cluster | ||
1209 | * count covers the full theoretical range of their child list | ||
1210 | * - the range between their cpos and the cpos of the record | ||
1211 | * immediately to their right. | ||
1212 | */ | ||
1213 | left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); | ||
1214 | left_clusters -= le32_to_cpu(left_rec->e_cpos); | ||
1215 | left_rec->e_int_clusters = cpu_to_le32(left_clusters); | ||
1216 | |||
1217 | /* | ||
1218 | * Calculate the rightmost cluster count boundary before | ||
1219 | * moving cpos - we will need to adjust clusters after | ||
1220 | * updating e_cpos to keep the same highest cluster count. | ||
1221 | */ | ||
1222 | right_end = le32_to_cpu(right_rec->e_cpos); | ||
1223 | right_end += le32_to_cpu(right_rec->e_int_clusters); | ||
1224 | |||
1225 | right_rec->e_cpos = left_rec->e_cpos; | ||
1226 | le32_add_cpu(&right_rec->e_cpos, left_clusters); | ||
1227 | |||
1228 | right_end -= le32_to_cpu(right_rec->e_cpos); | ||
1229 | right_rec->e_int_clusters = cpu_to_le32(right_end); | ||
1230 | } | ||
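
A numeric illustration of that adjustment (all values made up): if the right child's first live record now begins at cpos 24, a left interior record anchored at cpos 0 shrinks to 24 clusters, and a right record that used to be { cpos 32, 32 clusters } is re-anchored at cpos 24 with 40 clusters, so the pair still covers exactly [0, 64).

#include <stdio.h>

int main(void)
{
	unsigned int left_cpos = 0;
	unsigned int right_child_first_cpos = 24;	/* after the leaf move */
	unsigned int right_cpos = 32, right_int_clusters = 32;

	/* Same arithmetic as ocfs2_adjust_adjacent_records(). */
	unsigned int left_clusters = right_child_first_cpos - left_cpos;
	unsigned int right_end = right_cpos + right_int_clusters;

	right_cpos = left_cpos + left_clusters;
	right_int_clusters = right_end - right_cpos;

	printf("left:  cpos=%u clusters=%u\n", left_cpos, left_clusters);
	printf("right: cpos=%u clusters=%u (ends at %u as before)\n",
	       right_cpos, right_int_clusters,
	       right_cpos + right_int_clusters);
	return 0;
}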
1231 | |||
1232 | /* | ||
1233 | * Adjust the adjacent root node records involved in a | ||
1234 | * rotation. left_el_blkno is passed in as a key so that we can easily | ||
1235 | * find its index in the root list. | ||
1236 | */ | ||
1237 | static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, | ||
1238 | struct ocfs2_extent_list *left_el, | ||
1239 | struct ocfs2_extent_list *right_el, | ||
1240 | u64 left_el_blkno) | ||
1241 | { | ||
1242 | int i; | ||
1243 | |||
1244 | BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= | ||
1245 | le16_to_cpu(left_el->l_tree_depth)); | ||
1246 | |||
1247 | for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { | ||
1248 | if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) | ||
1249 | break; | ||
1250 | } | ||
1251 | |||
1252 | /* | ||
1253 | * The path walking code should have never returned a root and | ||
1254 | * two paths which are not adjacent. | ||
1255 | */ | ||
1256 | BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); | ||
1257 | |||
1258 | ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, | ||
1259 | &root_el->l_recs[i + 1], right_el); | ||
1260 | } | ||
1261 | |||
1262 | /* | ||
1263 | * We've changed a leaf block (in right_path) and need to reflect that | ||
1264 | * change back up the subtree. | ||
1265 | * | ||
1266 | * This happens in multiple places: | ||
1267 | * - When we've moved an extent record from the left path leaf to the right | ||
1268 | * path leaf to make room for an empty extent in the left path leaf. | ||
1269 | * - When our insert into the right path leaf is at the leftmost edge | ||
1270 | * and requires an update of the path immediately to its left. This | ||
1271 | * can occur at the end of some types of rotation and appending inserts. | ||
1272 | */ | ||
1273 | static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, | ||
1274 | struct ocfs2_path *left_path, | ||
1275 | struct ocfs2_path *right_path, | ||
1276 | int subtree_index) | ||
1277 | { | ||
1278 | int ret, i, idx; | ||
1279 | struct ocfs2_extent_list *el, *left_el, *right_el; | ||
1280 | struct ocfs2_extent_rec *left_rec, *right_rec; | ||
1281 | struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; | ||
1282 | |||
1283 | /* | ||
1284 | * Update the counts and position values within all the | ||
1285 | * interior nodes to reflect the leaf rotation we just did. | ||
1286 | * | ||
1287 | * The root node is handled below the loop. | ||
1288 | * | ||
1289 | * We begin the loop with right_el and left_el pointing to the | ||
1290 | * leaf lists and work our way up. | ||
1291 | * | ||
1292 | * NOTE: within this loop, left_el and right_el always refer | ||
1293 | * to the *child* lists. | ||
1294 | */ | ||
1295 | left_el = path_leaf_el(left_path); | ||
1296 | right_el = path_leaf_el(right_path); | ||
1297 | for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { | ||
1298 | mlog(0, "Adjust records at index %u\n", i); | ||
1299 | |||
1300 | /* | ||
1301 | * One nice property of knowing that all of these | ||
1302 | * nodes are below the root is that we only deal with | ||
1303 | * the leftmost right node record and the rightmost | ||
1304 | * left node record. | ||
1305 | */ | ||
1306 | el = left_path->p_node[i].el; | ||
1307 | idx = le16_to_cpu(left_el->l_next_free_rec) - 1; | ||
1308 | left_rec = &el->l_recs[idx]; | ||
1309 | |||
1310 | el = right_path->p_node[i].el; | ||
1311 | right_rec = &el->l_recs[0]; | ||
1312 | |||
1313 | ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, | ||
1314 | right_el); | ||
1315 | |||
1316 | ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); | ||
1317 | if (ret) | ||
1318 | mlog_errno(ret); | ||
1319 | |||
1320 | ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); | ||
1321 | if (ret) | ||
1322 | mlog_errno(ret); | ||
1323 | |||
1324 | /* | ||
1325 | * Setup our list pointers now so that the current | ||
1326 | * parents become children in the next iteration. | ||
1327 | */ | ||
1328 | left_el = left_path->p_node[i].el; | ||
1329 | right_el = right_path->p_node[i].el; | ||
1330 | } | ||
1331 | |||
1332 | /* | ||
1333 | * At the root node, adjust the two adjacent records which | ||
1334 | * begin our path to the leaves. | ||
1335 | */ | ||
1336 | |||
1337 | el = left_path->p_node[subtree_index].el; | ||
1338 | left_el = left_path->p_node[subtree_index + 1].el; | ||
1339 | right_el = right_path->p_node[subtree_index + 1].el; | ||
1340 | |||
1341 | ocfs2_adjust_root_records(el, left_el, right_el, | ||
1342 | left_path->p_node[subtree_index + 1].bh->b_blocknr); | ||
1343 | |||
1344 | root_bh = left_path->p_node[subtree_index].bh; | ||
1345 | |||
1346 | ret = ocfs2_journal_dirty(handle, root_bh); | ||
1347 | if (ret) | ||
1348 | mlog_errno(ret); | ||
1349 | } | ||
1350 | |||
1351 | static int ocfs2_rotate_subtree_right(struct inode *inode, | ||
1352 | handle_t *handle, | ||
1353 | struct ocfs2_path *left_path, | ||
1354 | struct ocfs2_path *right_path, | ||
1355 | int subtree_index) | ||
1356 | { | ||
1357 | int ret, i; | ||
1358 | struct buffer_head *right_leaf_bh; | ||
1359 | struct buffer_head *left_leaf_bh = NULL; | ||
1360 | struct buffer_head *root_bh; | ||
1361 | struct ocfs2_extent_list *right_el, *left_el; | ||
1362 | struct ocfs2_extent_rec move_rec; | ||
1363 | |||
1364 | left_leaf_bh = path_leaf_bh(left_path); | ||
1365 | left_el = path_leaf_el(left_path); | ||
1366 | |||
1367 | if (left_el->l_next_free_rec != left_el->l_count) { | ||
1368 | ocfs2_error(inode->i_sb, | ||
1369 | "Inode %llu has non-full interior leaf node %llu" | ||
1370 | "(next free = %u)", | ||
1371 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
1372 | (unsigned long long)left_leaf_bh->b_blocknr, | ||
1373 | le16_to_cpu(left_el->l_next_free_rec)); | ||
1374 | return -EROFS; | ||
1375 | } | ||
1376 | |||
1377 | /* | ||
1378 | * This extent block may already have an empty record, so we | ||
1379 | * return early if so. | ||
1380 | */ | ||
1381 | if (ocfs2_is_empty_extent(&left_el->l_recs[0])) | ||
1382 | return 0; | ||
1383 | |||
1384 | root_bh = left_path->p_node[subtree_index].bh; | ||
1385 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | ||
1386 | |||
1387 | ret = ocfs2_journal_access(handle, inode, root_bh, | ||
1388 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1389 | if (ret) { | ||
1390 | mlog_errno(ret); | ||
1391 | goto out; | ||
1392 | } | ||
1393 | |||
1394 | for(i = subtree_index + 1; i < path_num_items(right_path); i++) { | ||
1395 | ret = ocfs2_journal_access(handle, inode, | ||
1396 | right_path->p_node[i].bh, | ||
1397 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1398 | if (ret) { | ||
1399 | mlog_errno(ret); | ||
1400 | goto out; | ||
1401 | } | ||
1402 | |||
1403 | ret = ocfs2_journal_access(handle, inode, | ||
1404 | left_path->p_node[i].bh, | ||
1405 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1406 | if (ret) { | ||
1407 | mlog_errno(ret); | ||
1408 | goto out; | ||
1409 | } | ||
1410 | } | ||
1411 | |||
1412 | right_leaf_bh = path_leaf_bh(right_path); | ||
1413 | right_el = path_leaf_el(right_path); | ||
1414 | |||
1415 | /* This is a code error, not a disk corruption. */ | ||
1416 | mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " | ||
1417 | "because rightmost leaf block %llu is empty\n", | ||
1418 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
1419 | (unsigned long long)right_leaf_bh->b_blocknr); | ||
1420 | |||
1421 | ocfs2_create_empty_extent(right_el); | ||
1422 | |||
1423 | ret = ocfs2_journal_dirty(handle, right_leaf_bh); | ||
1424 | if (ret) { | ||
1425 | mlog_errno(ret); | ||
1426 | goto out; | ||
1427 | } | ||
1428 | |||
1429 | /* Do the copy now. */ | ||
1430 | i = le16_to_cpu(left_el->l_next_free_rec) - 1; | ||
1431 | move_rec = left_el->l_recs[i]; | ||
1432 | right_el->l_recs[0] = move_rec; | ||
1433 | |||
1434 | /* | ||
1435 | * Clear out the record we just copied and shift everything | ||
1436 | * over, leaving an empty extent in the left leaf. | ||
1437 | * | ||
1438 | * We temporarily subtract from next_free_rec so that the | ||
1439 | * shift will lose the tail record (which is now defunct). | ||
1440 | */ | ||
1441 | le16_add_cpu(&left_el->l_next_free_rec, -1); | ||
1442 | ocfs2_shift_records_right(left_el); | ||
1443 | memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); | ||
1444 | le16_add_cpu(&left_el->l_next_free_rec, 1); | ||
1445 | |||
1446 | ret = ocfs2_journal_dirty(handle, left_leaf_bh); | ||
1447 | if (ret) { | ||
1448 | mlog_errno(ret); | ||
1449 | goto out; | ||
1450 | } | ||
1451 | |||
1452 | ocfs2_complete_edge_insert(inode, handle, left_path, right_path, | ||
1453 | subtree_index); | ||
1454 | |||
1455 | out: | ||
1456 | return ret; | ||
1457 | } | ||
1458 | |||
1459 | /* | ||
1460 | * Given a full path, determine what cpos value would return us a path | ||
1461 | * containing the leaf immediately to the left of the current one. | ||
1462 | * | ||
1463 | * Will return zero if the path passed in is already the leftmost path. | ||
1464 | */ | ||
1465 | static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, | ||
1466 | struct ocfs2_path *path, u32 *cpos) | ||
1467 | { | ||
1468 | int i, j, ret = 0; | ||
1469 | u64 blkno; | ||
1470 | struct ocfs2_extent_list *el; | ||
1471 | |||
1472 | BUG_ON(path->p_tree_depth == 0); | ||
1473 | |||
1474 | *cpos = 0; | ||
1475 | |||
1476 | blkno = path_leaf_bh(path)->b_blocknr; | ||
1477 | |||
1478 | /* Start at the tree node just above the leaf and work our way up. */ | ||
1479 | i = path->p_tree_depth - 1; | ||
1480 | while (i >= 0) { | ||
1481 | el = path->p_node[i].el; | ||
1482 | |||
1483 | /* | ||
1484 | * Find the extent record just before the one in our | ||
1485 | * path. | ||
1486 | */ | ||
1487 | for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { | ||
1488 | if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { | ||
1489 | if (j == 0) { | ||
1490 | if (i == 0) { | ||
1491 | /* | ||
1492 | * We've determined that the | ||
1493 | * path specified is already | ||
1494 | * the leftmost one - return a | ||
1495 | * cpos of zero. | ||
1496 | */ | ||
1497 | goto out; | ||
1498 | } | ||
1499 | /* | ||
1500 | * The leftmost record points to our | ||
1501 | * leaf - we need to travel up the | ||
1502 | * tree one level. | ||
1503 | */ | ||
1504 | goto next_node; | ||
1505 | } | ||
1506 | |||
1507 | *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); | ||
1508 | *cpos = *cpos + ocfs2_rec_clusters(el, | ||
1509 | &el->l_recs[j - 1]); | ||
1510 | *cpos = *cpos - 1; | ||
1511 | goto out; | ||
1512 | } | ||
1513 | } | ||
1514 | |||
1515 | /* | ||
1516 | * If we got here, we never found a valid node where | ||
1517 | * the tree indicated one should be. | ||
1518 | */ | ||
1519 | ocfs2_error(sb, | ||
1520 | "Invalid extent tree at extent block %llu\n", | ||
1521 | (unsigned long long)blkno); | ||
1522 | ret = -EROFS; | ||
1523 | goto out; | ||
1524 | |||
1525 | next_node: | ||
1526 | blkno = path->p_node[i].bh->b_blocknr; | ||
1527 | i--; | ||
1528 | } | ||
1529 | |||
1530 | out: | ||
1531 | return ret; | ||
1532 | } | ||
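
Worked example (made-up record): if the record just before ours at some level is { e_cpos = 16, 8 clusters }, it covers cpos 16 through 23, and the function hands back 16 + 8 - 1 = 23 - any cpos in that range resolves to the neighbouring leaf, so the last covered offset is a convenient lookup key.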
1533 | |||
1534 | static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, | ||
1535 | struct ocfs2_path *path) | ||
1536 | { | ||
1537 | int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; | ||
1538 | |||
1539 | if (handle->h_buffer_credits < credits) | ||
1540 | return ocfs2_extend_trans(handle, credits); | ||
1541 | |||
1542 | return 0; | ||
1543 | } | ||
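
As a worked example of the credit count: with a tree of depth 3 and a subtree root at index 1, the rotation touches (3 - 1) * 2 + 1 = 5 buffers - one node per level on each of the two paths below the subtree root, plus the shared subtree root itself.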
1544 | |||
1545 | /* | ||
1546 | * Trap the case where we're inserting into the theoretical range past | ||
1547 | * the _actual_ left leaf range. Otherwise, we'll rotate a record | ||
1548 | * whose cpos is less than ours into the right leaf. | ||
1549 | * | ||
1550 | * It's only necessary to look at the rightmost record of the left | ||
1551 | * leaf because the logic that calls us should ensure that the | ||
1552 | * theoretical ranges in the path components above the leaves are | ||
1553 | * correct. | ||
1554 | */ | ||
1555 | static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, | ||
1556 | u32 insert_cpos) | ||
1557 | { | ||
1558 | struct ocfs2_extent_list *left_el; | ||
1559 | struct ocfs2_extent_rec *rec; | ||
1560 | int next_free; | ||
1561 | |||
1562 | left_el = path_leaf_el(left_path); | ||
1563 | next_free = le16_to_cpu(left_el->l_next_free_rec); | ||
1564 | rec = &left_el->l_recs[next_free - 1]; | ||
1565 | |||
1566 | if (insert_cpos > le32_to_cpu(rec->e_cpos)) | ||
1567 | return 1; | ||
1568 | return 0; | ||
1569 | } | ||
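
Concretely (hypothetical numbers): if the rightmost record of the left leaf starts at cpos 48 and insert_cpos is 56, the insert belongs in the theoretical range past that record, and rotating that record into the right leaf would put an extent with a smaller cpos to the right of the new one - so the caller stops rotating here and defers the fix-up to ocfs2_insert_path().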
1570 | |||
1571 | /* | ||
1572 | * Rotate all the records in a btree right one record, starting at insert_cpos. | ||
1573 | * | ||
1574 | * The path to the rightmost leaf should be passed in. | ||
1575 | * | ||
1576 | * The array is assumed to be large enough to hold an entire path (tree depth). | ||
1577 | * | ||
1578 | * Upon successful return from this function: | ||
1579 | * | ||
1580 | * - The 'right_path' array will contain a path to the leaf block | ||
1581 | * whose range contains e_cpos. | ||
1582 | * - That leaf block will have a single empty extent in list index 0. | ||
1583 | * - In the case that the rotation requires a post-insert update, | ||
1584 | * *ret_left_path will contain a valid path which can be passed to | ||
1585 | * ocfs2_insert_path(). | ||
1586 | */ | ||
1587 | static int ocfs2_rotate_tree_right(struct inode *inode, | ||
1588 | handle_t *handle, | ||
1589 | u32 insert_cpos, | ||
1590 | struct ocfs2_path *right_path, | ||
1591 | struct ocfs2_path **ret_left_path) | ||
1592 | { | ||
1593 | int ret, start; | ||
1594 | u32 cpos; | ||
1595 | struct ocfs2_path *left_path = NULL; | ||
1596 | |||
1597 | *ret_left_path = NULL; | ||
1598 | |||
1599 | left_path = ocfs2_new_path(path_root_bh(right_path), | ||
1600 | path_root_el(right_path)); | ||
1601 | if (!left_path) { | ||
1602 | ret = -ENOMEM; | ||
1603 | mlog_errno(ret); | ||
1604 | goto out; | ||
1605 | } | ||
1606 | |||
1607 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); | ||
1608 | if (ret) { | ||
1609 | mlog_errno(ret); | ||
1610 | goto out; | ||
1611 | } | ||
1612 | |||
1613 | mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); | ||
1614 | |||
1615 | /* | ||
1616 | * What we want to do here is: | ||
1617 | * | ||
1618 | * 1) Start with the rightmost path. | ||
1619 | * | ||
1620 | * 2) Determine a path to the leaf block directly to the left | ||
1621 | * of that leaf. | ||
1622 | * | ||
1623 | * 3) Determine the 'subtree root' - the lowest level tree node | ||
1624 | * which contains a path to both leaves. | ||
1625 | * | ||
1626 | * 4) Rotate the subtree. | ||
1627 | * | ||
1628 | * 5) Find the next subtree by considering the left path to be | ||
1629 | * the new right path. | ||
1630 | * | ||
1631 | * The check at the top of this while loop also accepts | ||
1632 | * insert_cpos == cpos because cpos is only a _theoretical_ | ||
1633 | * value to get us the left path - insert_cpos might very well | ||
1634 | * be filling that hole. | ||
1635 | * | ||
1636 | * Stop at a cpos of '0' because we either started at the | ||
1637 | * leftmost branch (i.e., a tree with one branch and a | ||
1638 | * rotation inside of it), or we've gone as far as we can in | ||
1639 | * rotating subtrees. | ||
1640 | */ | ||
1641 | while (cpos && insert_cpos <= cpos) { | ||
1642 | mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", | ||
1643 | insert_cpos, cpos); | ||
1644 | |||
1645 | ret = ocfs2_find_path(inode, left_path, cpos); | ||
1646 | if (ret) { | ||
1647 | mlog_errno(ret); | ||
1648 | goto out; | ||
1649 | } | ||
1650 | |||
1651 | mlog_bug_on_msg(path_leaf_bh(left_path) == | ||
1652 | path_leaf_bh(right_path), | ||
1653 | "Inode %lu: error during insert of %u " | ||
1654 | "(left path cpos %u) results in two identical " | ||
1655 | "paths ending at %llu\n", | ||
1656 | inode->i_ino, insert_cpos, cpos, | ||
1657 | (unsigned long long) | ||
1658 | path_leaf_bh(left_path)->b_blocknr); | ||
1659 | |||
1660 | if (ocfs2_rotate_requires_path_adjustment(left_path, | ||
1661 | insert_cpos)) { | ||
1662 | mlog(0, "Path adjustment required\n"); | ||
1663 | |||
1664 | /* | ||
1665 | * We've rotated the tree as much as we | ||
1666 | * should. The rest is up to | ||
1667 | * ocfs2_insert_path() to complete, after the | ||
1668 | * record insertion. We indicate this | ||
1669 | * situation by returning the left path. | ||
1670 | * | ||
1671 | * The reason we don't adjust the records here | ||
1672 | * before the record insert is that an error | ||
1673 | * later might break the rule where a parent | ||
1674 | * record e_cpos will reflect the actual | ||
1675 | * e_cpos of the 1st nonempty record of the | ||
1676 | * child list. | ||
1677 | */ | ||
1678 | *ret_left_path = left_path; | ||
1679 | goto out_ret_path; | ||
1680 | } | ||
1681 | |||
1682 | start = ocfs2_find_subtree_root(inode, left_path, right_path); | ||
1683 | |||
1684 | mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", | ||
1685 | start, | ||
1686 | (unsigned long long) right_path->p_node[start].bh->b_blocknr, | ||
1687 | right_path->p_tree_depth); | ||
1688 | |||
1689 | ret = ocfs2_extend_rotate_transaction(handle, start, | ||
1690 | right_path); | ||
1691 | if (ret) { | ||
1692 | mlog_errno(ret); | ||
1693 | goto out; | ||
1694 | } | ||
1695 | |||
1696 | ret = ocfs2_rotate_subtree_right(inode, handle, left_path, | ||
1697 | right_path, start); | ||
1698 | if (ret) { | ||
1699 | mlog_errno(ret); | ||
1700 | goto out; | ||
1701 | } | ||
1702 | |||
1703 | /* | ||
1704 | * There is no need to re-read the next right path | ||
1705 | * as we know that it'll be our current left | ||
1706 | * path. Optimize by copying values instead. | ||
1707 | */ | ||
1708 | ocfs2_mv_path(right_path, left_path); | ||
1709 | |||
1710 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, | ||
1711 | &cpos); | ||
1712 | if (ret) { | ||
1713 | mlog_errno(ret); | ||
1714 | goto out; | ||
1715 | } | ||
1716 | } | ||
1717 | |||
1718 | out: | ||
1719 | ocfs2_free_path(left_path); | ||
1720 | |||
1721 | out_ret_path: | ||
1722 | return ret; | ||
1723 | } | ||
1724 | |||
1725 | /* | ||
1726 | * Do the final bits of extent record insertion at the target leaf | ||
1727 | * list. If this leaf is part of an allocation tree, it is assumed | ||
1728 | * that the tree above has been prepared. | ||
1729 | */ | ||
1730 | static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, | ||
1731 | struct ocfs2_extent_list *el, | ||
1732 | struct ocfs2_insert_type *insert, | ||
1733 | struct inode *inode) | ||
1734 | { | ||
1735 | int i = insert->ins_contig_index; | ||
1736 | unsigned int range; | ||
1737 | struct ocfs2_extent_rec *rec; | ||
1738 | |||
1739 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | ||
1740 | |||
1741 | /* | ||
1742 | * Contiguous insert - either left or right. | ||
1743 | */ | ||
1744 | if (insert->ins_contig != CONTIG_NONE) { | ||
1745 | rec = &el->l_recs[i]; | ||
1746 | if (insert->ins_contig == CONTIG_LEFT) { | ||
1747 | rec->e_blkno = insert_rec->e_blkno; | ||
1748 | rec->e_cpos = insert_rec->e_cpos; | ||
1749 | } | ||
1750 | le16_add_cpu(&rec->e_leaf_clusters, | ||
1751 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
1752 | return; | ||
1753 | } | ||
1754 | |||
1755 | /* | ||
1756 | * Handle insert into an empty leaf. | ||
1757 | */ | ||
1758 | if (le16_to_cpu(el->l_next_free_rec) == 0 || | ||
1759 | ((le16_to_cpu(el->l_next_free_rec) == 1) && | ||
1760 | ocfs2_is_empty_extent(&el->l_recs[0]))) { | ||
1761 | el->l_recs[0] = *insert_rec; | ||
1762 | el->l_next_free_rec = cpu_to_le16(1); | ||
1763 | return; | ||
1764 | } | ||
1765 | |||
1766 | /* | ||
1767 | * Appending insert. | ||
1768 | */ | ||
1769 | if (insert->ins_appending == APPEND_TAIL) { | ||
1770 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1771 | rec = &el->l_recs[i]; | ||
1772 | range = le32_to_cpu(rec->e_cpos) | ||
1773 | + le16_to_cpu(rec->e_leaf_clusters); | ||
1774 | BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); | ||
1775 | |||
1776 | mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= | ||
1777 | le16_to_cpu(el->l_count), | ||
1778 | "inode %lu, depth %u, count %u, next free %u, " | ||
1779 | "rec.cpos %u, rec.clusters %u, " | ||
1780 | "insert.cpos %u, insert.clusters %u\n", | ||
1781 | inode->i_ino, | ||
1782 | le16_to_cpu(el->l_tree_depth), | ||
1783 | le16_to_cpu(el->l_count), | ||
1784 | le16_to_cpu(el->l_next_free_rec), | ||
1785 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
1786 | le16_to_cpu(el->l_recs[i].e_leaf_clusters), | ||
1787 | le32_to_cpu(insert_rec->e_cpos), | ||
1788 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
1789 | i++; | ||
1790 | el->l_recs[i] = *insert_rec; | ||
1791 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
1792 | return; | ||
1793 | } | ||
1794 | |||
1795 | /* | ||
1796 | * Ok, we have to rotate. | ||
1797 | * | ||
1798 | * At this point, it is safe to assume that inserting into an | ||
1799 | * empty leaf and appending to a leaf have both been handled | ||
1800 | * above. | ||
1801 | * | ||
1802 | * This leaf needs to have space, either by the empty 1st | ||
1803 | * extent record, or by virtue of l_next_free_rec < l_count. | ||
1804 | */ | ||
1805 | ocfs2_rotate_leaf(el, insert_rec); | ||
1806 | } | ||
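
As a worked example (hypothetical leaf): if the last record covers clusters [32, 40) and the insert was classified CONTIG_RIGHT against it, that record simply grows to [32, 48); the same 8-cluster insert at cpos 64 with no contiguity takes the APPEND_TAIL branch into the next free slot; and an insert at, say, cpos 20 into a leaf whose only free room is the empty record at index 0 falls through to ocfs2_rotate_leaf().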
1807 | |||
1808 | static inline void ocfs2_update_dinode_clusters(struct inode *inode, | ||
1809 | struct ocfs2_dinode *di, | ||
1810 | u32 clusters) | ||
1811 | { | ||
1812 | le32_add_cpu(&di->i_clusters, clusters); | ||
1813 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1814 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); | ||
1815 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1816 | } | ||
1817 | |||
1818 | static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, | ||
1819 | struct ocfs2_extent_rec *insert_rec, | ||
1820 | struct ocfs2_path *right_path, | ||
1821 | struct ocfs2_path **ret_left_path) | ||
1822 | { | ||
1823 | int ret, i, next_free; | ||
1824 | struct buffer_head *bh; | ||
1825 | struct ocfs2_extent_list *el; | ||
1826 | struct ocfs2_path *left_path = NULL; | ||
1827 | |||
1828 | *ret_left_path = NULL; | ||
1829 | |||
1830 | /* | ||
1831 | * This shouldn't happen for non-trees. The extent rec cluster | ||
1832 | * count manipulation below only works for interior nodes. | ||
1833 | */ | ||
1834 | BUG_ON(right_path->p_tree_depth == 0); | ||
1835 | |||
1836 | /* | ||
1837 | * If our appending insert is at the leftmost edge of a leaf, | ||
1838 | * then we might need to update the rightmost records of the | ||
1839 | * neighboring path. | ||
1840 | */ | ||
1841 | el = path_leaf_el(right_path); | ||
1842 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
1843 | if (next_free == 0 || | ||
1844 | (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { | ||
1845 | u32 left_cpos; | ||
1846 | |||
1847 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, | ||
1848 | &left_cpos); | ||
1849 | if (ret) { | ||
1850 | mlog_errno(ret); | ||
1851 | goto out; | ||
1852 | } | ||
1853 | |||
1854 | mlog(0, "Append may need a left path update. cpos: %u, " | ||
1855 | "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), | ||
1856 | left_cpos); | ||
1857 | |||
1858 | /* | ||
1859 | * No need to worry if the append is already in the | ||
1860 | * leftmost leaf. | ||
1861 | */ | ||
1862 | if (left_cpos) { | ||
1863 | left_path = ocfs2_new_path(path_root_bh(right_path), | ||
1864 | path_root_el(right_path)); | ||
1865 | if (!left_path) { | ||
1866 | ret = -ENOMEM; | ||
1867 | mlog_errno(ret); | ||
1868 | goto out; | ||
1869 | } | ||
1870 | |||
1871 | ret = ocfs2_find_path(inode, left_path, left_cpos); | ||
1872 | if (ret) { | ||
1873 | mlog_errno(ret); | ||
1874 | goto out; | ||
1875 | } | ||
1876 | |||
1877 | /* | ||
1878 | * ocfs2_insert_path() will pass the left_path to the | ||
1879 | * journal for us. | ||
1880 | */ | ||
1881 | } | ||
1882 | } | ||
1883 | |||
1884 | ret = ocfs2_journal_access_path(inode, handle, right_path); | ||
1885 | if (ret) { | ||
1886 | mlog_errno(ret); | ||
1887 | goto out; | ||
1888 | } | ||
1889 | |||
1890 | el = path_root_el(right_path); | ||
1891 | bh = path_root_bh(right_path); | ||
1892 | i = 0; | ||
1893 | while (1) { | ||
1894 | struct ocfs2_extent_rec *rec; | ||
1895 | |||
1896 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
1897 | if (next_free == 0) { | ||
1898 | ocfs2_error(inode->i_sb, | ||
1899 | "Dinode %llu has a bad extent list", | ||
1900 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
1901 | ret = -EIO; | ||
1902 | goto out; | ||
1903 | } | ||
1904 | |||
1905 | rec = &el->l_recs[next_free - 1]; | ||
1906 | |||
1907 | rec->e_int_clusters = insert_rec->e_cpos; | ||
1908 | le32_add_cpu(&rec->e_int_clusters, | ||
1909 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
1910 | le32_add_cpu(&rec->e_int_clusters, | ||
1911 | -le32_to_cpu(rec->e_cpos)); | ||
1912 | |||
1913 | ret = ocfs2_journal_dirty(handle, bh); | ||
1914 | if (ret) | ||
1915 | mlog_errno(ret); | ||
1916 | |||
1917 | /* Don't touch the leaf node */ | ||
1918 | if (++i >= right_path->p_tree_depth) | ||
1919 | break; | ||
1920 | |||
1921 | bh = right_path->p_node[i].bh; | ||
1922 | el = right_path->p_node[i].el; | ||
1923 | } | ||
1924 | |||
1925 | *ret_left_path = left_path; | ||
1926 | ret = 0; | ||
1927 | out: | ||
1928 | if (ret != 0) | ||
1929 | ocfs2_free_path(left_path); | ||
1930 | |||
1931 | return ret; | ||
1932 | } | ||
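
The interior update in the loop above amounts to extending each record on the right path so that it ends where the new extent ends. With made-up numbers: an interior record { e_cpos = 0, e_int_clusters = 64 } and an 8-cluster append at cpos 64 yields e_int_clusters = 64 + 8 - 0 = 72, i.e. insert cpos plus insert clusters minus the record's own cpos - exactly the three le32_add_cpu() steps performed per level.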
1933 | |||
1934 | /* | ||
1935 | * This function only does inserts on an allocation b-tree. For dinode | ||
1936 | * lists, ocfs2_insert_at_leaf() is called directly. | ||
1937 | * | ||
1938 | * right_path is the path we want to do the actual insert | ||
1939 | * in. left_path should only be passed in if we need to update that | ||
1940 | * portion of the tree after an edge insert. | ||
1941 | */ | ||
1942 | static int ocfs2_insert_path(struct inode *inode, | ||
1943 | handle_t *handle, | ||
1944 | struct ocfs2_path *left_path, | ||
1945 | struct ocfs2_path *right_path, | ||
1946 | struct ocfs2_extent_rec *insert_rec, | ||
1947 | struct ocfs2_insert_type *insert) | ||
1948 | { | ||
1949 | int ret, subtree_index; | ||
1950 | struct buffer_head *leaf_bh = path_leaf_bh(right_path); | ||
1951 | struct ocfs2_extent_list *el; | ||
1952 | |||
1953 | /* | ||
1954 | * Pass both paths to the journal. The majority of inserts | ||
1955 | * will be touching all components anyway. | ||
1956 | */ | ||
1957 | ret = ocfs2_journal_access_path(inode, handle, right_path); | ||
1958 | if (ret < 0) { | ||
1959 | mlog_errno(ret); | ||
1960 | goto out; | ||
1961 | } | ||
1962 | |||
1963 | if (left_path) { | ||
1964 | int credits = handle->h_buffer_credits; | ||
1965 | |||
1966 | /* | ||
1967 | * There's a chance that left_path got passed back to | ||
1968 | * us without being accounted for in the | ||
1969 | * journal. Extend our transaction here to be sure we | ||
1970 | * can change those blocks. | ||
1971 | */ | ||
1972 | credits += left_path->p_tree_depth; | ||
1973 | |||
1974 | ret = ocfs2_extend_trans(handle, credits); | ||
1975 | if (ret < 0) { | ||
1976 | mlog_errno(ret); | ||
1977 | goto out; | ||
1978 | } | ||
1979 | |||
1980 | ret = ocfs2_journal_access_path(inode, handle, left_path); | ||
1981 | if (ret < 0) { | ||
1982 | mlog_errno(ret); | ||
1983 | goto out; | ||
1984 | } | ||
1985 | } | ||
1986 | |||
1987 | el = path_leaf_el(right_path); | ||
1988 | |||
1989 | ocfs2_insert_at_leaf(insert_rec, el, insert, inode); | ||
1990 | ret = ocfs2_journal_dirty(handle, leaf_bh); | ||
1991 | if (ret) | ||
1992 | mlog_errno(ret); | ||
1993 | |||
1994 | if (left_path) { | ||
1995 | /* | ||
1996 | * The rotate code has indicated that we need to fix | ||
1997 | * up portions of the tree after the insert. | ||
1998 | * | ||
1999 | * XXX: Should we extend the transaction here? | ||
2000 | */ | ||
2001 | subtree_index = ocfs2_find_subtree_root(inode, left_path, | ||
2002 | right_path); | ||
2003 | ocfs2_complete_edge_insert(inode, handle, left_path, | ||
2004 | right_path, subtree_index); | ||
2005 | } | ||
2006 | |||
2007 | ret = 0; | ||
2008 | out: | ||
2009 | return ret; | ||
2010 | } | ||
2011 | |||
2012 | static int ocfs2_do_insert_extent(struct inode *inode, | ||
2013 | handle_t *handle, | ||
2014 | struct buffer_head *di_bh, | ||
2015 | struct ocfs2_extent_rec *insert_rec, | ||
2016 | struct ocfs2_insert_type *type) | ||
2017 | { | ||
2018 | int ret, rotate = 0; | ||
2019 | u32 cpos; | ||
2020 | struct ocfs2_path *right_path = NULL; | ||
2021 | struct ocfs2_path *left_path = NULL; | ||
2022 | struct ocfs2_dinode *di; | ||
2023 | struct ocfs2_extent_list *el; | ||
2024 | |||
2025 | di = (struct ocfs2_dinode *) di_bh->b_data; | ||
2026 | el = &di->id2.i_list; | ||
2027 | |||
2028 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
2029 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2030 | if (ret) { | ||
2031 | mlog_errno(ret); | ||
2032 | goto out; | ||
2033 | } | ||
2034 | |||
2035 | if (le16_to_cpu(el->l_tree_depth) == 0) { | ||
2036 | ocfs2_insert_at_leaf(insert_rec, el, type, inode); | ||
2037 | goto out_update_clusters; | ||
2038 | } | ||
2039 | |||
2040 | right_path = ocfs2_new_inode_path(di_bh); | ||
2041 | if (!right_path) { | ||
2042 | ret = -ENOMEM; | ||
2043 | mlog_errno(ret); | ||
2044 | goto out; | ||
2045 | } | ||
2046 | |||
2047 | /* | ||
2048 | * Determine the path to start with. Rotations need the | ||
2049 | * rightmost path, everything else can go directly to the | ||
2050 | * target leaf. | ||
2051 | */ | ||
2052 | cpos = le32_to_cpu(insert_rec->e_cpos); | ||
2053 | if (type->ins_appending == APPEND_NONE && | ||
2054 | type->ins_contig == CONTIG_NONE) { | ||
2055 | rotate = 1; | ||
2056 | cpos = UINT_MAX; | ||
2057 | } | ||
2058 | |||
2059 | ret = ocfs2_find_path(inode, right_path, cpos); | ||
2060 | if (ret) { | ||
2061 | mlog_errno(ret); | ||
2062 | goto out; | ||
2063 | } | ||
2064 | |||
2065 | /* | ||
2066 | * Rotations and appends need special treatment - they modify | ||
2067 | * parts of the tree above them. | ||
2068 | * | ||
2069 | * Both might pass back a path immediately to the left of the | ||
2070 | * one being inserted to. This will cause | ||
2071 | * ocfs2_insert_path() to modify the rightmost records of | ||
2072 | * left_path to account for an edge insert. | ||
2073 | * | ||
2074 | * XXX: When modifying this code, keep in mind that an insert | ||
2075 | * can wind up skipping both of these two special cases... | ||
2076 | */ | ||
2077 | if (rotate) { | ||
2078 | ret = ocfs2_rotate_tree_right(inode, handle, | ||
2079 | le32_to_cpu(insert_rec->e_cpos), | ||
2080 | right_path, &left_path); | ||
2081 | if (ret) { | ||
2082 | mlog_errno(ret); | ||
2083 | goto out; | ||
2084 | } | ||
2085 | } else if (type->ins_appending == APPEND_TAIL | ||
2086 | && type->ins_contig != CONTIG_LEFT) { | ||
2087 | ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, | ||
2088 | right_path, &left_path); | ||
2089 | if (ret) { | ||
2090 | mlog_errno(ret); | ||
2091 | goto out; | ||
2092 | } | ||
2093 | } | ||
2094 | |||
2095 | ret = ocfs2_insert_path(inode, handle, left_path, right_path, | ||
2096 | insert_rec, type); | ||
2097 | if (ret) { | ||
2098 | mlog_errno(ret); | ||
2099 | goto out; | ||
2100 | } | ||
2101 | |||
2102 | out_update_clusters: | ||
2103 | ocfs2_update_dinode_clusters(inode, di, | ||
2104 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
2105 | |||
2106 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
2107 | if (ret) | ||
2108 | mlog_errno(ret); | ||
2109 | |||
2110 | out: | ||
2111 | ocfs2_free_path(left_path); | ||
2112 | ocfs2_free_path(right_path); | ||
2113 | |||
2114 | return ret; | ||
2115 | } | ||
2116 | |||
2117 | static void ocfs2_figure_contig_type(struct inode *inode, | ||
2118 | struct ocfs2_insert_type *insert, | ||
2119 | struct ocfs2_extent_list *el, | ||
2120 | struct ocfs2_extent_rec *insert_rec) | ||
2121 | { | ||
2122 | int i; | ||
2123 | enum ocfs2_contig_type contig_type = CONTIG_NONE; | ||
2124 | |||
2125 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | ||
2126 | |||
2127 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
2128 | contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], | ||
2129 | insert_rec); | ||
2130 | if (contig_type != CONTIG_NONE) { | ||
2131 | insert->ins_contig_index = i; | ||
2132 | break; | ||
2133 | } | ||
2134 | } | ||
2135 | insert->ins_contig = contig_type; | ||
2136 | } | ||
2137 | |||
2138 | /* | ||
2139 | * This should only be called against the rightmost leaf extent list. | ||
2140 | * | ||
2141 | * ocfs2_figure_appending_type() will figure out whether we'll have to | ||
2142 | * insert at the tail of the rightmost leaf. | ||
2143 | * | ||
2144 | * This should also work against the dinode list for trees with 0 | ||
2145 | * depth. If we consider the dinode list to be the rightmost leaf node | ||
2146 | * then the logic here makes sense. | ||
2147 | */ | ||
2148 | static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, | ||
2149 | struct ocfs2_extent_list *el, | ||
2150 | struct ocfs2_extent_rec *insert_rec) | ||
2151 | { | ||
2152 | int i; | ||
2153 | u32 cpos = le32_to_cpu(insert_rec->e_cpos); | ||
2154 | struct ocfs2_extent_rec *rec; | ||
2155 | |||
2156 | insert->ins_appending = APPEND_NONE; | ||
2157 | |||
2158 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | ||
2159 | |||
2160 | if (!el->l_next_free_rec) | ||
2161 | goto set_tail_append; | ||
2162 | |||
2163 | if (ocfs2_is_empty_extent(&el->l_recs[0])) { | ||
2164 | /* Were all records empty? */ | ||
2165 | if (le16_to_cpu(el->l_next_free_rec) == 1) | ||
2166 | goto set_tail_append; | ||
845 | } | 2167 | } |
846 | 2168 | ||
847 | /* Can we allocate without adding/shifting tree bits? */ | ||
848 | i = le16_to_cpu(el->l_next_free_rec) - 1; | 2169 | i = le16_to_cpu(el->l_next_free_rec) - 1; |
849 | if (le16_to_cpu(el->l_next_free_rec) == 0 | 2170 | rec = &el->l_recs[i]; |
850 | || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) | 2171 | |
851 | || le32_to_cpu(el->l_recs[i].e_clusters) == 0 | 2172 | if (cpos >= |
852 | || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) | 2173 | (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) |
853 | goto out_add; | 2174 | goto set_tail_append; |
2175 | |||
2176 | return; | ||
2177 | |||
2178 | set_tail_append: | ||
2179 | insert->ins_appending = APPEND_TAIL; | ||
2180 | } | ||
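
The decisive test reduces to one comparison, shown here on toy values (plain integers, not the kernel structures): the insert is a tail append only when its cpos sits at or beyond the end of the current rightmost record.

#include <stdio.h>

int main(void)
{
	unsigned int last_cpos = 32, last_clusters = 8;	/* covers [32, 40) */
	unsigned int insert_cpos = 40;

	if (insert_cpos >= last_cpos + last_clusters)
		printf("APPEND_TAIL\n");
	else
		printf("APPEND_NONE (insert lands inside the existing range)\n");
	return 0;
}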
2181 | |||
2182 | /* | ||
2183 | * Helper function called at the beginning of an insert. | ||
2184 | * | ||
2185 | * This computes a few things that are commonly used in the process of | ||
2186 | * inserting into the btree: | ||
2187 | * - Whether the new extent is contiguous with an existing one. | ||
2188 | * - The current tree depth. | ||
2189 | * - Whether the insert is an appending one. | ||
2190 | * - The total # of free records in the tree. | ||
2191 | * | ||
2192 | * All of the information is stored on the ocfs2_insert_type | ||
2193 | * structure. | ||
2194 | */ | ||
2195 | static int ocfs2_figure_insert_type(struct inode *inode, | ||
2196 | struct buffer_head *di_bh, | ||
2197 | struct buffer_head **last_eb_bh, | ||
2198 | struct ocfs2_extent_rec *insert_rec, | ||
2199 | struct ocfs2_insert_type *insert) | ||
2200 | { | ||
2201 | int ret; | ||
2202 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
2203 | struct ocfs2_extent_block *eb; | ||
2204 | struct ocfs2_extent_list *el; | ||
2205 | struct ocfs2_path *path = NULL; | ||
2206 | struct buffer_head *bh = NULL; | ||
2207 | |||
2208 | el = &di->id2.i_list; | ||
2209 | insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); | ||
2210 | |||
2211 | if (el->l_tree_depth) { | ||
2212 | /* | ||
2213 | * If we have tree depth, we read in the | ||
2214 | * rightmost extent block ahead of time as | ||
2215 | * ocfs2_figure_insert_type() and ocfs2_add_branch() | ||
2216 | * may want it later. | ||
2217 | */ | ||
2218 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
2219 | le64_to_cpu(di->i_last_eb_blk), &bh, | ||
2220 | OCFS2_BH_CACHED, inode); | ||
2221 | if (ret) { | ||
2222 | mlog_exit(ret); | ||
2223 | goto out; | ||
2224 | } | ||
2225 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
2226 | el = &eb->h_list; | ||
2227 | } | ||
2228 | |||
2229 | /* | ||
2230 | * Unless we have a contiguous insert, we'll need to know if | ||
2231 | * there is room left in our allocation tree for another | ||
2232 | * extent record. | ||
2233 | * | ||
2234 | * XXX: This test is simplistic, we can search for empty | ||
2235 | * extent records too. | ||
2236 | */ | ||
2237 | insert->ins_free_records = le16_to_cpu(el->l_count) - | ||
2238 | le16_to_cpu(el->l_next_free_rec); | ||
2239 | |||
2240 | if (!insert->ins_tree_depth) { | ||
2241 | ocfs2_figure_contig_type(inode, insert, el, insert_rec); | ||
2242 | ocfs2_figure_appending_type(insert, el, insert_rec); | ||
2243 | return 0; | ||
2244 | } | ||
2245 | |||
2246 | path = ocfs2_new_inode_path(di_bh); | ||
2247 | if (!path) { | ||
2248 | ret = -ENOMEM; | ||
2249 | mlog_errno(ret); | ||
2250 | goto out; | ||
2251 | } | ||
2252 | |||
2253 | /* | ||
2254 | * In the case that we're inserting past what the tree | ||
2255 | * currently accounts for, ocfs2_find_path() will return for | ||
2256 | * us the rightmost tree path. This is accounted for below in | ||
2257 | * the appending code. | ||
2258 | */ | ||
2259 | ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); | ||
2260 | if (ret) { | ||
2261 | mlog_errno(ret); | ||
2262 | goto out; | ||
2263 | } | ||
2264 | |||
2265 | el = path_leaf_el(path); | ||
2266 | |||
2267 | /* | ||
2268 | * Now that we have the path, there's two things we want to determine: | ||
2269 | * 1) Contiguousness (also set contig_index if this is so) | ||
2270 | * | ||
2271 | * 2) Are we doing an append? We can trivially break this up | ||
2272 | * into two types of appends: simple record append, or a | ||
2273 | * rotate inside the tail leaf. | ||
2274 | */ | ||
2275 | ocfs2_figure_contig_type(inode, insert, el, insert_rec); | ||
2276 | |||
2277 | /* | ||
2278 | * The insert code isn't quite ready to deal with all cases of | ||
2279 | * left contiguousness. Specifically, if it's an insert into | ||
2280 | * the 1st record in a leaf, it will require the adjustment of | ||
2281 | * cluster count on the last record of the path directly to its | ||
2282 | * left. For now, just catch that case and fool the layers | ||
2283 | * above us. This works just fine for tree_depth == 0, which | ||
2284 | * is why we allow that above. | ||
2285 | */ | ||
2286 | if (insert->ins_contig == CONTIG_LEFT && | ||
2287 | insert->ins_contig_index == 0) | ||
2288 | insert->ins_contig = CONTIG_NONE; | ||
2289 | |||
2290 | /* | ||
2291 | * Ok, so we can simply compare against last_eb to figure out | ||
2292 | * whether the path doesn't exist. This will only happen in | ||
2293 | * the case that we're doing a tail append, so maybe we can | ||
2294 | * take advantage of that information somehow. | ||
2295 | */ | ||
2296 | if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { | ||
2297 | /* | ||
2298 | * Ok, ocfs2_find_path() returned us the rightmost | ||
2299 | * tree path. This might be an appending insert. There are | ||
2300 | * two cases: | ||
2301 | * 1) We're doing a true append at the tail: | ||
2302 | * -This might even be off the end of the leaf | ||
2303 | * 2) We're "appending" by rotating in the tail | ||
2304 | */ | ||
2305 | ocfs2_figure_appending_type(insert, el, insert_rec); | ||
2306 | } | ||
2307 | |||
2308 | out: | ||
2309 | ocfs2_free_path(path); | ||
2310 | |||
2311 | if (ret == 0) | ||
2312 | *last_eb_bh = bh; | ||
2313 | else | ||
2314 | brelse(bh); | ||
2315 | return ret; | ||
2316 | } | ||
2317 | |||
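
The free-record accounting that drives this insert path is simple arithmetic on the extent list header: a list has room while l_next_free_rec is below l_count, and the tree is only grown for a non-contiguous insert that finds no room. The following is a minimal userspace sketch of that decision; the structure and names are simplified stand-ins, not the on-disk ocfs2_extent_list.

/*
 * Sketch of the "grow the tree or just insert?" decision made in
 * ocfs2_insert_extent().  Simplified, host-endian fields only.
 */
#include <stdint.h>
#include <stdio.h>

enum contig_type { CONTIG_NONE, CONTIG_LEFT, CONTIG_RIGHT };

struct simple_extent_list {
    uint16_t l_count;         /* total record slots in the list */
    uint16_t l_next_free_rec; /* index of the next unused slot  */
};

/* Mirrors insert->ins_free_records. */
static unsigned int free_records(const struct simple_extent_list *el)
{
    return el->l_count - el->l_next_free_rec;
}

/*
 * Another branch (or tree level) is only needed when the new record
 * cannot be merged with an existing one and every slot is taken.
 */
static int must_grow_tree(const struct simple_extent_list *el,
                          enum contig_type contig)
{
    return contig == CONTIG_NONE && free_records(el) == 0;
}

int main(void)
{
    struct simple_extent_list el = { .l_count = 16, .l_next_free_rec = 16 };

    printf("%d\n", must_grow_tree(&el, CONTIG_NONE)); /* 1: full, no merge */
    printf("%d\n", must_grow_tree(&el, CONTIG_LEFT)); /* 0: record merges  */
    return 0;
}
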
2318 | /* | ||
2319 | * Insert an extent into an inode btree. | ||
2320 | * | ||
2321 | * The caller needs to update fe->i_clusters | ||
2322 | */ | ||
2323 | int ocfs2_insert_extent(struct ocfs2_super *osb, | ||
2324 | handle_t *handle, | ||
2325 | struct inode *inode, | ||
2326 | struct buffer_head *fe_bh, | ||
2327 | u32 cpos, | ||
2328 | u64 start_blk, | ||
2329 | u32 new_clusters, | ||
2330 | struct ocfs2_alloc_context *meta_ac) | ||
2331 | { | ||
2332 | int status, shift; | ||
2333 | struct buffer_head *last_eb_bh = NULL; | ||
2334 | struct buffer_head *bh = NULL; | ||
2335 | struct ocfs2_insert_type insert = {0, }; | ||
2336 | struct ocfs2_extent_rec rec; | ||
2337 | |||
2338 | mlog(0, "add %u clusters at position %u to inode %llu\n", | ||
2339 | new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
2340 | |||
2341 | mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && | ||
2342 | (OCFS2_I(inode)->ip_clusters != cpos), | ||
2343 | "Device %s, asking for sparse allocation: inode %llu, " | ||
2344 | "cpos %u, clusters %u\n", | ||
2345 | osb->dev_str, | ||
2346 | (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, | ||
2347 | OCFS2_I(inode)->ip_clusters); | ||
2348 | |||
2349 | memset(&rec, 0, sizeof(rec)); | ||
2350 | rec.e_cpos = cpu_to_le32(cpos); | ||
2351 | rec.e_blkno = cpu_to_le64(start_blk); | ||
2352 | rec.e_leaf_clusters = cpu_to_le16(new_clusters); | ||
2353 | |||
2354 | status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, | ||
2355 | &insert); | ||
2356 | if (status < 0) { | ||
2357 | mlog_errno(status); | ||
2358 | goto bail; | ||
2359 | } | ||
854 | 2360 | ||
855 | mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " | 2361 | mlog(0, "Insert.appending: %u, Insert.Contig: %u, " |
856 | "tree now.\n"); | 2362 | "Insert.contig_index: %d, Insert.free_records: %d, " |
2363 | "Insert.tree_depth: %d\n", | ||
2364 | insert.ins_appending, insert.ins_contig, insert.ins_contig_index, | ||
2365 | insert.ins_free_records, insert.ins_tree_depth); | ||
2366 | |||
2367 | /* | ||
2368 | * Avoid growing the tree unless we're out of records and the | ||
2369 | * insert type requires one. | ||
2370 | */ | ||
2371 | if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) | ||
2372 | goto out_add; | ||
857 | 2373 | ||
858 | shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); | 2374 | shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); |
859 | if (shift < 0) { | 2375 | if (shift < 0) { |
@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
866 | * and didn't find room for any more extents - we need to add | 2382 | * and didn't find room for any more extents - we need to add |
867 | * another tree level */ | 2383 | * another tree level */ |
868 | if (shift) { | 2384 | if (shift) { |
869 | /* if we hit a leaf, we'd better be empty :) */ | ||
870 | BUG_ON(le16_to_cpu(el->l_next_free_rec) != | ||
871 | le16_to_cpu(el->l_count)); | ||
872 | BUG_ON(bh); | 2385 | BUG_ON(bh); |
873 | mlog(0, "ocfs2_allocate_extent: need to shift tree depth " | 2386 | mlog(0, "need to shift tree depth " |
874 | "(current = %u)\n", | 2387 | "(current = %d)\n", insert.ins_tree_depth); |
875 | le16_to_cpu(fe->id2.i_list.l_tree_depth)); | ||
876 | 2388 | ||
877 | /* ocfs2_shift_tree_depth will return us a buffer with | 2389 | /* ocfs2_shift_tree_depth will return us a buffer with |
878 | * the new extent block (so we can pass that to | 2390 | * the new extent block (so we can pass that to |
@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
883 | mlog_errno(status); | 2395 | mlog_errno(status); |
884 | goto bail; | 2396 | goto bail; |
885 | } | 2397 | } |
2398 | insert.ins_tree_depth++; | ||
886 | /* Special case: we have room now if we shifted from | 2399 | /* Special case: we have room now if we shifted from |
887 | * tree_depth 0 */ | 2400 | * tree_depth 0 */ |
888 | if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) | 2401 | if (insert.ins_tree_depth == 1) |
889 | goto out_add; | 2402 | goto out_add; |
890 | } | 2403 | } |
891 | 2404 | ||
892 | /* call ocfs2_add_branch to add the final part of the tree with | 2405 | /* call ocfs2_add_branch to add the final part of the tree with |
893 | * the new data. */ | 2406 | * the new data. */ |
894 | mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); | 2407 | mlog(0, "add branch. bh = %p\n", bh); |
895 | status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, | 2408 | status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, |
896 | meta_ac); | 2409 | meta_ac); |
897 | if (status < 0) { | 2410 | if (status < 0) { |
@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
900 | } | 2413 | } |
901 | 2414 | ||
902 | out_add: | 2415 | out_add: |
903 | /* Finally, we can add clusters. */ | 2416 | /* Finally, we can add clusters. This might rotate the tree for us. */ |
904 | status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, | 2417 | status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); |
905 | start_blk, new_clusters); | ||
906 | if (status < 0) | 2418 | if (status < 0) |
907 | mlog_errno(status); | 2419 | mlog_errno(status); |
2420 | else | ||
2421 | ocfs2_extent_map_insert_rec(inode, &rec); | ||
908 | 2422 | ||
909 | bail: | 2423 | bail: |
910 | if (bh) | 2424 | if (bh) |
@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) | |||
1447 | * block will be deleted, and if it will, what the new last extent | 2961 | * block will be deleted, and if it will, what the new last extent |
1448 | * block will be so we can update his h_next_leaf_blk field, as well | 2962 | * block will be so we can update his h_next_leaf_blk field, as well |
1449 | * as the dinodes i_last_eb_blk */ | 2963 | * as the dinodes i_last_eb_blk */ |
1450 | static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, | 2964 | static int ocfs2_find_new_last_ext_blk(struct inode *inode, |
1451 | struct inode *inode, | 2965 | unsigned int clusters_to_del, |
1452 | struct ocfs2_dinode *fe, | 2966 | struct ocfs2_path *path, |
1453 | u32 new_i_clusters, | ||
1454 | struct buffer_head *old_last_eb, | ||
1455 | struct buffer_head **new_last_eb) | 2967 | struct buffer_head **new_last_eb) |
1456 | { | 2968 | { |
1457 | int i, status = 0; | 2969 | int next_free, ret = 0; |
1458 | u64 block = 0; | 2970 | u32 cpos; |
2971 | struct ocfs2_extent_rec *rec; | ||
1459 | struct ocfs2_extent_block *eb; | 2972 | struct ocfs2_extent_block *eb; |
1460 | struct ocfs2_extent_list *el; | 2973 | struct ocfs2_extent_list *el; |
1461 | struct buffer_head *bh = NULL; | 2974 | struct buffer_head *bh = NULL; |
1462 | 2975 | ||
1463 | *new_last_eb = NULL; | 2976 | *new_last_eb = NULL; |
1464 | 2977 | ||
1465 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1466 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
1467 | status = -EIO; | ||
1468 | goto bail; | ||
1469 | } | ||
1470 | |||
1471 | /* we have no tree, so of course, no last_eb. */ | 2978 | /* we have no tree, so of course, no last_eb. */ |
1472 | if (!fe->id2.i_list.l_tree_depth) | 2979 | if (!path->p_tree_depth) |
1473 | goto bail; | 2980 | goto out; |
1474 | 2981 | ||
1475 | /* trunc to zero special case - this makes tree_depth = 0 | 2982 | /* trunc to zero special case - this makes tree_depth = 0 |
1476 | * regardless of what it is. */ | 2983 | * regardless of what it is. */ |
1477 | if (!new_i_clusters) | 2984 | if (OCFS2_I(inode)->ip_clusters == clusters_to_del) |
1478 | goto bail; | 2985 | goto out; |
1479 | 2986 | ||
1480 | eb = (struct ocfs2_extent_block *) old_last_eb->b_data; | 2987 | el = path_leaf_el(path); |
1481 | el = &(eb->h_list); | ||
1482 | BUG_ON(!el->l_next_free_rec); | 2988 | BUG_ON(!el->l_next_free_rec); |
1483 | 2989 | ||
1484 | /* Make sure that this guy will actually be empty after we | 2990 | /* |
1485 | * clear away the data. */ | 2991 | * Make sure that this extent list will actually be empty |
1486 | if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) | 2992 | * after we clear away the data. We can shortcut out if |
1487 | goto bail; | 2993 | * there's more than one non-empty extent in the |
2994 | * list. Otherwise, a check of the remaining extent is | ||
2995 | * necessary. | ||
2996 | */ | ||
2997 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
2998 | rec = NULL; | ||
2999 | if (ocfs2_is_empty_extent(&el->l_recs[0])) { | ||
3000 | if (next_free > 2) | ||
3001 | goto out; | ||
1488 | 3002 | ||
1489 | /* Ok, at this point, we know that last_eb will definitely | 3003 | /* We may have a valid extent in index 1, check it. */ |
1490 | * change, so lets traverse the tree and find the second to | 3004 | if (next_free == 2) |
1491 | * last extent block. */ | 3005 | rec = &el->l_recs[1]; |
1492 | el = &(fe->id2.i_list); | 3006 | |
1493 | /* go down the tree, */ | 3007 | /* |
1494 | do { | 3008 | * Fall through - no more nonempty extents, so we want |
1495 | for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { | 3009 | * to delete this leaf. |
1496 | if (le32_to_cpu(el->l_recs[i].e_cpos) < | 3010 | */ |
1497 | new_i_clusters) { | 3011 | } else { |
1498 | block = le64_to_cpu(el->l_recs[i].e_blkno); | 3012 | if (next_free > 1) |
1499 | break; | 3013 | goto out; |
1500 | } | 3014 | |
3015 | rec = &el->l_recs[0]; | ||
3016 | } | ||
3017 | |||
3018 | if (rec) { | ||
3019 | /* | ||
3020 | * Check if we'll only be trimming off the end of this | ||
3021 | * cluster. | ||
3022 | */ | ||
3023 | if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) | ||
3024 | goto out; | ||
3025 | } | ||
3026 | |||
3027 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); | ||
3028 | if (ret) { | ||
3029 | mlog_errno(ret); | ||
3030 | goto out; | ||
3031 | } | ||
3032 | |||
3033 | ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); | ||
3034 | if (ret) { | ||
3035 | mlog_errno(ret); | ||
3036 | goto out; | ||
3037 | } | ||
3038 | |||
3039 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
3040 | el = &eb->h_list; | ||
3041 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
3042 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
3043 | ret = -EROFS; | ||
3044 | goto out; | ||
3045 | } | ||
3046 | |||
3047 | *new_last_eb = bh; | ||
3048 | get_bh(*new_last_eb); | ||
3049 | mlog(0, "returning block %llu, (cpos: %u)\n", | ||
3050 | (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); | ||
3051 | out: | ||
3052 | brelse(bh); | ||
3053 | |||
3054 | return ret; | ||
3055 | } | ||
3056 | |||
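
The shortcut logic above only has to decide whether the rightmost leaf survives the delete: it bails out early when more than one real record remains, and otherwise checks whether the single surviving record is merely being trimmed. A standalone sketch of that decision follows; the record and leaf structures are simplified illustrations, not the real on-disk types.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified leaf record and list - illustrative only. */
struct simple_rec {
    uint32_t e_cpos;
    uint16_t e_leaf_clusters;
};

struct simple_leaf {
    uint16_t next_free;        /* number of used records */
    struct simple_rec recs[4];
};

static bool rec_is_empty(const struct simple_rec *r)
{
    return r->e_leaf_clusters == 0;
}

/*
 * Decide whether removing clusters_to_del from the tail of this leaf
 * leaves it empty - the same shortcut used when picking a new last
 * extent block during truncate.
 */
static bool leaf_will_empty(const struct simple_leaf *el,
                            uint32_t clusters_to_del)
{
    const struct simple_rec *rec = NULL;

    if (rec_is_empty(&el->recs[0])) {
        if (el->next_free > 2)
            return false;        /* other records survive */
        if (el->next_free == 2)
            rec = &el->recs[1];  /* single real record    */
    } else {
        if (el->next_free > 1)
            return false;
        rec = &el->recs[0];
    }

    /* Only trimming the end of the remaining record? */
    if (rec && rec->e_leaf_clusters > clusters_to_del)
        return false;

    return true;
}

int main(void)
{
    struct simple_leaf leaf = {
        .next_free = 2,
        .recs = { { 0, 0 }, { 100, 8 } }, /* empty rec + one real rec */
    };

    printf("%d\n", leaf_will_empty(&leaf, 4)); /* 0: partial trim      */
    printf("%d\n", leaf_will_empty(&leaf, 8)); /* 1: whole record goes */
    return 0;
}
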
3057 | /* | ||
3058 | * Trim some clusters off the rightmost edge of a tree. Only called | ||
3059 | * during truncate. | ||
3060 | * | ||
3061 | * The caller needs to: | ||
3062 | * - start journaling of each path component. | ||
3063 | * - compute and fully set up any new last ext block | ||
3064 | */ | ||
3065 | static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, | ||
3066 | handle_t *handle, struct ocfs2_truncate_context *tc, | ||
3067 | u32 clusters_to_del, u64 *delete_start) | ||
3068 | { | ||
3069 | int ret, i, index = path->p_tree_depth; | ||
3070 | u32 new_edge = 0; | ||
3071 | u64 deleted_eb = 0; | ||
3072 | struct buffer_head *bh; | ||
3073 | struct ocfs2_extent_list *el; | ||
3074 | struct ocfs2_extent_rec *rec; | ||
3075 | |||
3076 | *delete_start = 0; | ||
3077 | |||
3078 | while (index >= 0) { | ||
3079 | bh = path->p_node[index].bh; | ||
3080 | el = path->p_node[index].el; | ||
3081 | |||
3082 | mlog(0, "traveling tree (index = %d, block = %llu)\n", | ||
3083 | index, (unsigned long long)bh->b_blocknr); | ||
3084 | |||
3085 | BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); | ||
3086 | |||
3087 | if (index != | ||
3088 | (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { | ||
3089 | ocfs2_error(inode->i_sb, | ||
3090 | "Inode %lu has invalid ext. block %llu", | ||
3091 | inode->i_ino, | ||
3092 | (unsigned long long)bh->b_blocknr); | ||
3093 | ret = -EROFS; | ||
3094 | goto out; | ||
1501 | } | 3095 | } |
1502 | BUG_ON(i < 0); | ||
1503 | 3096 | ||
1504 | if (bh) { | 3097 | find_tail_record: |
1505 | brelse(bh); | 3098 | i = le16_to_cpu(el->l_next_free_rec) - 1; |
1506 | bh = NULL; | 3099 | rec = &el->l_recs[i]; |
3100 | |||
3101 | mlog(0, "Extent list before: record %d: (%u, %u, %llu), " | ||
3102 | "next = %u\n", i, le32_to_cpu(rec->e_cpos), | ||
3103 | ocfs2_rec_clusters(el, rec), | ||
3104 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
3105 | le16_to_cpu(el->l_next_free_rec)); | ||
3106 | |||
3107 | BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); | ||
3108 | |||
3109 | if (le16_to_cpu(el->l_tree_depth) == 0) { | ||
3110 | /* | ||
3111 | * If the leaf block contains a single empty | ||
3112 | * extent and no records, we can just remove | ||
3113 | * the block. | ||
3114 | */ | ||
3115 | if (i == 0 && ocfs2_is_empty_extent(rec)) { | ||
3116 | memset(rec, 0, | ||
3117 | sizeof(struct ocfs2_extent_rec)); | ||
3118 | el->l_next_free_rec = cpu_to_le16(0); | ||
3119 | |||
3120 | goto delete; | ||
3121 | } | ||
3122 | |||
3123 | /* | ||
3124 | * Remove any empty extents by shifting things | ||
3125 | * left. That should make life much easier on | ||
3126 | * the code below. This condition is rare | ||
3127 | * enough that we shouldn't see a performance | ||
3128 | * hit. | ||
3129 | */ | ||
3130 | if (ocfs2_is_empty_extent(&el->l_recs[0])) { | ||
3131 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
3132 | |||
3133 | for(i = 0; | ||
3134 | i < le16_to_cpu(el->l_next_free_rec); i++) | ||
3135 | el->l_recs[i] = el->l_recs[i + 1]; | ||
3136 | |||
3137 | memset(&el->l_recs[i], 0, | ||
3138 | sizeof(struct ocfs2_extent_rec)); | ||
3139 | |||
3140 | /* | ||
3141 | * We've modified our extent list. The | ||
3142 | * simplest way to handle this change | ||
3143 | * is to begin the search from the | ||
3144 | * start again. | ||
3145 | */ | ||
3146 | goto find_tail_record; | ||
3147 | } | ||
3148 | |||
3149 | le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); | ||
3150 | |||
3151 | /* | ||
3152 | * We'll use "new_edge" on our way back up the | ||
3153 | * tree to know what our rightmost cpos is. | ||
3154 | */ | ||
3155 | new_edge = le16_to_cpu(rec->e_leaf_clusters); | ||
3156 | new_edge += le32_to_cpu(rec->e_cpos); | ||
3157 | |||
3158 | /* | ||
3159 | * The caller will use this to delete data blocks. | ||
3160 | */ | ||
3161 | *delete_start = le64_to_cpu(rec->e_blkno) | ||
3162 | + ocfs2_clusters_to_blocks(inode->i_sb, | ||
3163 | le16_to_cpu(rec->e_leaf_clusters)); | ||
3164 | |||
3165 | /* | ||
3166 | * If it's now empty, remove this record. | ||
3167 | */ | ||
3168 | if (le16_to_cpu(rec->e_leaf_clusters) == 0) { | ||
3169 | memset(rec, 0, | ||
3170 | sizeof(struct ocfs2_extent_rec)); | ||
3171 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
3172 | } | ||
3173 | } else { | ||
3174 | if (le64_to_cpu(rec->e_blkno) == deleted_eb) { | ||
3175 | memset(rec, 0, | ||
3176 | sizeof(struct ocfs2_extent_rec)); | ||
3177 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
3178 | |||
3179 | goto delete; | ||
3180 | } | ||
3181 | |||
3182 | /* Can this actually happen? */ | ||
3183 | if (le16_to_cpu(el->l_next_free_rec) == 0) | ||
3184 | goto delete; | ||
3185 | |||
3186 | /* | ||
3187 | * We never actually deleted any clusters | ||
3188 | * because our leaf was empty. There's no | ||
3189 | * reason to adjust the rightmost edge then. | ||
3190 | */ | ||
3191 | if (new_edge == 0) | ||
3192 | goto delete; | ||
3193 | |||
3194 | rec->e_int_clusters = cpu_to_le32(new_edge); | ||
3195 | le32_add_cpu(&rec->e_int_clusters, | ||
3196 | -le32_to_cpu(rec->e_cpos)); | ||
3197 | |||
3198 | /* | ||
3199 | * A deleted child record should have been | ||
3200 | * caught above. | ||
3201 | */ | ||
3202 | BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); | ||
1507 | } | 3203 | } |
1508 | 3204 | ||
1509 | status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, | 3205 | delete: |
1510 | inode); | 3206 | ret = ocfs2_journal_dirty(handle, bh); |
1511 | if (status < 0) { | 3207 | if (ret) { |
1512 | mlog_errno(status); | 3208 | mlog_errno(ret); |
1513 | goto bail; | 3209 | goto out; |
1514 | } | 3210 | } |
1515 | eb = (struct ocfs2_extent_block *) bh->b_data; | 3211 | |
1516 | el = &eb->h_list; | 3212 | mlog(0, "extent list container %llu, after: record %d: " |
1517 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 3213 | "(%u, %u, %llu), next = %u.\n", |
1518 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 3214 | (unsigned long long)bh->b_blocknr, i, |
1519 | status = -EIO; | 3215 | le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), |
1520 | goto bail; | 3216 | (unsigned long long)le64_to_cpu(rec->e_blkno), |
3217 | le16_to_cpu(el->l_next_free_rec)); | ||
3218 | |||
3219 | /* | ||
3220 | * We must be careful to only attempt delete of an | ||
3221 | * extent block (and not the root inode block). | ||
3222 | */ | ||
3223 | if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { | ||
3224 | struct ocfs2_extent_block *eb = | ||
3225 | (struct ocfs2_extent_block *)bh->b_data; | ||
3226 | |||
3227 | /* | ||
3228 | * Save this for use when processing the | ||
3229 | * parent block. | ||
3230 | */ | ||
3231 | deleted_eb = le64_to_cpu(eb->h_blkno); | ||
3232 | |||
3233 | mlog(0, "deleting this extent block.\n"); | ||
3234 | |||
3235 | ocfs2_remove_from_cache(inode, bh); | ||
3236 | |||
3237 | BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); | ||
3238 | BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); | ||
3239 | BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); | ||
3240 | |||
3241 | if (le16_to_cpu(eb->h_suballoc_slot) == 0) { | ||
3242 | /* | ||
3243 | * This code only understands how to | ||
3244 | * lock the suballocator in slot 0, | ||
3245 | * which is fine because allocation is | ||
3246 | * only ever done out of that | ||
3247 | * suballocator too. A future version | ||
3248 | * might change that however, so avoid | ||
3249 | * a free if we don't know how to | ||
3250 | * handle it. This way an fs incompat | ||
3251 | * bit will not be necessary. | ||
3252 | */ | ||
3253 | ret = ocfs2_free_extent_block(handle, | ||
3254 | tc->tc_ext_alloc_inode, | ||
3255 | tc->tc_ext_alloc_bh, | ||
3256 | eb); | ||
3257 | |||
3258 | /* An error here is not fatal. */ | ||
3259 | if (ret < 0) | ||
3260 | mlog_errno(ret); | ||
3261 | } | ||
3262 | } else { | ||
3263 | deleted_eb = 0; | ||
1521 | } | 3264 | } |
1522 | } while (el->l_tree_depth); | ||
1523 | 3265 | ||
1524 | *new_last_eb = bh; | 3266 | index--; |
1525 | get_bh(*new_last_eb); | 3267 | } |
1526 | mlog(0, "returning block %llu\n", | ||
1527 | (unsigned long long)le64_to_cpu(eb->h_blkno)); | ||
1528 | bail: | ||
1529 | if (bh) | ||
1530 | brelse(bh); | ||
1531 | 3268 | ||
1532 | return status; | 3269 | ret = 0; |
3270 | out: | ||
3271 | return ret; | ||
1533 | } | 3272 | } |
1534 | 3273 | ||
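
Before operating on the tail record, ocfs2_trim_tree() shifts the leaf's records left whenever slot 0 holds an empty extent, so the record it trims is always the last used slot. A compact sketch of that shift on a plain array standing in for l_recs:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct simple_rec {
    uint32_t e_cpos;
    uint16_t e_leaf_clusters;
};

/*
 * Remove an empty record at index 0 by shifting the remaining records
 * left.  'next_free' is the count of used records and drops by one;
 * the vacated tail slot is zeroed.
 */
static void shift_out_empty_head(struct simple_rec *recs, uint16_t *next_free)
{
    uint16_t i;

    (*next_free)--;
    for (i = 0; i < *next_free; i++)
        recs[i] = recs[i + 1];
    memset(&recs[i], 0, sizeof(recs[i]));
}

int main(void)
{
    struct simple_rec recs[3] = { { 0, 0 }, { 10, 4 }, { 14, 6 } };
    uint16_t next_free = 3;

    shift_out_empty_head(recs, &next_free);

    printf("next_free=%u first=(%u,%u)\n",
           next_free, recs[0].e_cpos, recs[0].e_leaf_clusters);
    /* prints: next_free=2 first=(10,4) */
    return 0;
}
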
1535 | static int ocfs2_do_truncate(struct ocfs2_super *osb, | 3274 | static int ocfs2_do_truncate(struct ocfs2_super *osb, |
1536 | unsigned int clusters_to_del, | 3275 | unsigned int clusters_to_del, |
1537 | struct inode *inode, | 3276 | struct inode *inode, |
1538 | struct buffer_head *fe_bh, | 3277 | struct buffer_head *fe_bh, |
1539 | struct buffer_head *old_last_eb_bh, | ||
1540 | handle_t *handle, | 3278 | handle_t *handle, |
1541 | struct ocfs2_truncate_context *tc) | 3279 | struct ocfs2_truncate_context *tc, |
3280 | struct ocfs2_path *path) | ||
1542 | { | 3281 | { |
1543 | int status, i, depth; | 3282 | int status; |
1544 | struct ocfs2_dinode *fe; | 3283 | struct ocfs2_dinode *fe; |
1545 | struct ocfs2_extent_block *eb; | ||
1546 | struct ocfs2_extent_block *last_eb = NULL; | 3284 | struct ocfs2_extent_block *last_eb = NULL; |
1547 | struct ocfs2_extent_list *el; | 3285 | struct ocfs2_extent_list *el; |
1548 | struct buffer_head *eb_bh = NULL; | ||
1549 | struct buffer_head *last_eb_bh = NULL; | 3286 | struct buffer_head *last_eb_bh = NULL; |
1550 | u64 next_eb = 0; | ||
1551 | u64 delete_blk = 0; | 3287 | u64 delete_blk = 0; |
1552 | 3288 | ||
1553 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | 3289 | fe = (struct ocfs2_dinode *) fe_bh->b_data; |
1554 | 3290 | ||
1555 | status = ocfs2_find_new_last_ext_blk(osb, | 3291 | status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, |
1556 | inode, | 3292 | path, &last_eb_bh); |
1557 | fe, | ||
1558 | le32_to_cpu(fe->i_clusters) - | ||
1559 | clusters_to_del, | ||
1560 | old_last_eb_bh, | ||
1561 | &last_eb_bh); | ||
1562 | if (status < 0) { | 3293 | if (status < 0) { |
1563 | mlog_errno(status); | 3294 | mlog_errno(status); |
1564 | goto bail; | 3295 | goto bail; |
1565 | } | 3296 | } |
1566 | if (last_eb_bh) | ||
1567 | last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1568 | 3297 | ||
1569 | status = ocfs2_journal_access(handle, inode, fe_bh, | 3298 | /* |
1570 | OCFS2_JOURNAL_ACCESS_WRITE); | 3299 | * Each component will be touched, so we might as well journal |
3300 | * here to avoid having to handle errors later. | ||
3301 | */ | ||
3302 | status = ocfs2_journal_access_path(inode, handle, path); | ||
1571 | if (status < 0) { | 3303 | if (status < 0) { |
1572 | mlog_errno(status); | 3304 | mlog_errno(status); |
1573 | goto bail; | 3305 | goto bail; |
1574 | } | 3306 | } |
3307 | |||
3308 | if (last_eb_bh) { | ||
3309 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | ||
3310 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3311 | if (status < 0) { | ||
3312 | mlog_errno(status); | ||
3313 | goto bail; | ||
3314 | } | ||
3315 | |||
3316 | last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
3317 | } | ||
3318 | |||
1575 | el = &(fe->id2.i_list); | 3319 | el = &(fe->id2.i_list); |
1576 | 3320 | ||
3321 | /* | ||
3322 | * Lower levels depend on this never happening, but it's best | ||
3323 | * to check it up here before changing the tree. | ||
3324 | */ | ||
3325 | if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { | ||
3326 | ocfs2_error(inode->i_sb, | ||
3327 | "Inode %lu has an empty extent record, depth %u\n", | ||
3328 | inode->i_ino, le16_to_cpu(el->l_tree_depth)); | ||
3329 | status = -EROFS; | ||
3330 | goto bail; | ||
3331 | } | ||
3332 | |||
1577 | spin_lock(&OCFS2_I(inode)->ip_lock); | 3333 | spin_lock(&OCFS2_I(inode)->ip_lock); |
1578 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - | 3334 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - |
1579 | clusters_to_del; | 3335 | clusters_to_del; |
1580 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 3336 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
1581 | le32_add_cpu(&fe->i_clusters, -clusters_to_del); | 3337 | le32_add_cpu(&fe->i_clusters, -clusters_to_del); |
1582 | fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); | ||
1583 | fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); | ||
1584 | |||
1585 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1586 | |||
1587 | BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); | ||
1588 | le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); | ||
1589 | /* tree depth zero, we can just delete the clusters, otherwise | ||
1590 | * we need to record the offset of the next level extent block | ||
1591 | * as we may overwrite it. */ | ||
1592 | if (!el->l_tree_depth) | ||
1593 | delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) | ||
1594 | + ocfs2_clusters_to_blocks(osb->sb, | ||
1595 | le32_to_cpu(el->l_recs[i].e_clusters)); | ||
1596 | else | ||
1597 | next_eb = le64_to_cpu(el->l_recs[i].e_blkno); | ||
1598 | 3338 | ||
1599 | if (!el->l_recs[i].e_clusters) { | 3339 | status = ocfs2_trim_tree(inode, path, handle, tc, |
1600 | /* if we deleted the whole extent record, then clear | 3340 | clusters_to_del, &delete_blk); |
1601 | * out the other fields and update the extent | 3341 | if (status) { |
1602 | * list. For depth > 0 trees, we've already recorded | 3342 | mlog_errno(status); |
1603 | * the extent block in 'next_eb' */ | 3343 | goto bail; |
1604 | el->l_recs[i].e_cpos = 0; | ||
1605 | el->l_recs[i].e_blkno = 0; | ||
1606 | BUG_ON(!el->l_next_free_rec); | ||
1607 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
1608 | } | 3344 | } |
1609 | 3345 | ||
1610 | depth = le16_to_cpu(el->l_tree_depth); | 3346 | if (le32_to_cpu(fe->i_clusters) == 0) { |
1611 | if (!fe->i_clusters) { | ||
1612 | /* trunc to zero is a special case. */ | 3347 | /* trunc to zero is a special case. */ |
1613 | el->l_tree_depth = 0; | 3348 | el->l_tree_depth = 0; |
1614 | fe->i_last_eb_blk = 0; | 3349 | fe->i_last_eb_blk = 0; |
@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, | |||
1625 | /* If there will be a new last extent block, then by | 3360 | /* If there will be a new last extent block, then by |
1626 | * definition, there cannot be any leaves to the right of | 3361 | * definition, there cannot be any leaves to the right of |
1627 | * him. */ | 3362 | * him. */ |
1628 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | ||
1629 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1630 | if (status < 0) { | ||
1631 | mlog_errno(status); | ||
1632 | goto bail; | ||
1633 | } | ||
1634 | last_eb->h_next_leaf_blk = 0; | 3363 | last_eb->h_next_leaf_blk = 0; |
1635 | status = ocfs2_journal_dirty(handle, last_eb_bh); | 3364 | status = ocfs2_journal_dirty(handle, last_eb_bh); |
1636 | if (status < 0) { | 3365 | if (status < 0) { |
@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, | |||
1639 | } | 3368 | } |
1640 | } | 3369 | } |
1641 | 3370 | ||
1642 | /* if our tree depth > 0, update all the tree blocks below us. */ | 3371 | if (delete_blk) { |
1643 | while (depth) { | 3372 | status = ocfs2_truncate_log_append(osb, handle, delete_blk, |
1644 | mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", | 3373 | clusters_to_del); |
1645 | depth, (unsigned long long)next_eb); | ||
1646 | status = ocfs2_read_block(osb, next_eb, &eb_bh, | ||
1647 | OCFS2_BH_CACHED, inode); | ||
1648 | if (status < 0) { | 3374 | if (status < 0) { |
1649 | mlog_errno(status); | 3375 | mlog_errno(status); |
1650 | goto bail; | 3376 | goto bail; |
1651 | } | 3377 | } |
1652 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | 3378 | } |
1653 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 3379 | status = 0; |
1654 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 3380 | bail: |
1655 | status = -EIO; | 3381 | |
1656 | goto bail; | 3382 | mlog_exit(status); |
3383 | return status; | ||
3384 | } | ||
3385 | |||
3386 | static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) | ||
3387 | { | ||
3388 | set_buffer_uptodate(bh); | ||
3389 | mark_buffer_dirty(bh); | ||
3390 | return 0; | ||
3391 | } | ||
3392 | |||
3393 | static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) | ||
3394 | { | ||
3395 | set_buffer_uptodate(bh); | ||
3396 | mark_buffer_dirty(bh); | ||
3397 | return ocfs2_journal_dirty_data(handle, bh); | ||
3398 | } | ||
3399 | |||
3400 | static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, | ||
3401 | struct page **pages, int numpages, | ||
3402 | u64 phys, handle_t *handle) | ||
3403 | { | ||
3404 | int i, ret, partial = 0; | ||
3405 | void *kaddr; | ||
3406 | struct page *page; | ||
3407 | unsigned int from, to = PAGE_CACHE_SIZE; | ||
3408 | struct super_block *sb = inode->i_sb; | ||
3409 | |||
3410 | BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); | ||
3411 | |||
3412 | if (numpages == 0) | ||
3413 | goto out; | ||
3414 | |||
3415 | from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ | ||
3416 | if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { | ||
3417 | /* | ||
3418 | * Since 'from' has been capped to a value below page | ||
3419 | * size, this calculation won't be able to overflow | ||
3420 | * 'to' | ||
3421 | */ | ||
3422 | to = ocfs2_align_bytes_to_clusters(sb, from); | ||
3423 | |||
3424 | /* | ||
3425 | * The truncate tail in this case should never contain | ||
3426 | * more than one page. The loop below also | ||
3427 | * assumes this. | ||
3428 | */ | ||
3429 | BUG_ON(numpages != 1); | ||
3430 | } | ||
3431 | |||
3432 | for(i = 0; i < numpages; i++) { | ||
3433 | page = pages[i]; | ||
3434 | |||
3435 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
3436 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
3437 | |||
3438 | ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); | ||
3439 | if (ret) | ||
3440 | mlog_errno(ret); | ||
3441 | |||
3442 | kaddr = kmap_atomic(page, KM_USER0); | ||
3443 | memset(kaddr + from, 0, to - from); | ||
3444 | kunmap_atomic(kaddr, KM_USER0); | ||
3445 | |||
3446 | /* | ||
3447 | * Need to set the buffers we zero'd into uptodate | ||
3448 | * here if they aren't - ocfs2_map_page_blocks() | ||
3449 | * might've skipped some | ||
3450 | */ | ||
3451 | if (ocfs2_should_order_data(inode)) { | ||
3452 | ret = walk_page_buffers(handle, | ||
3453 | page_buffers(page), | ||
3454 | from, to, &partial, | ||
3455 | ocfs2_ordered_zero_func); | ||
3456 | if (ret < 0) | ||
3457 | mlog_errno(ret); | ||
3458 | } else { | ||
3459 | ret = walk_page_buffers(handle, page_buffers(page), | ||
3460 | from, to, &partial, | ||
3461 | ocfs2_writeback_zero_func); | ||
3462 | if (ret < 0) | ||
3463 | mlog_errno(ret); | ||
1657 | } | 3464 | } |
1658 | el = &(eb->h_list); | ||
1659 | 3465 | ||
1660 | status = ocfs2_journal_access(handle, inode, eb_bh, | 3466 | if (!partial) |
1661 | OCFS2_JOURNAL_ACCESS_WRITE); | 3467 | SetPageUptodate(page); |
1662 | if (status < 0) { | 3468 | |
1663 | mlog_errno(status); | 3469 | flush_dcache_page(page); |
1664 | goto bail; | 3470 | |
3471 | /* | ||
3472 | * Every page after the 1st one should be completely zero'd. | ||
3473 | */ | ||
3474 | from = 0; | ||
3475 | } | ||
3476 | out: | ||
3477 | if (pages) { | ||
3478 | for (i = 0; i < numpages; i++) { | ||
3479 | page = pages[i]; | ||
3480 | unlock_page(page); | ||
3481 | mark_page_accessed(page); | ||
3482 | page_cache_release(page); | ||
1665 | } | 3483 | } |
3484 | } | ||
3485 | } | ||
1666 | 3486 | ||
1667 | BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); | 3487 | static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, |
1668 | BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); | 3488 | int *num, u64 *phys) |
3489 | { | ||
3490 | int i, numpages = 0, ret = 0; | ||
3491 | unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; | ||
3492 | unsigned int ext_flags; | ||
3493 | struct super_block *sb = inode->i_sb; | ||
3494 | struct address_space *mapping = inode->i_mapping; | ||
3495 | unsigned long index; | ||
3496 | u64 next_cluster_bytes; | ||
3497 | |||
3498 | BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); | ||
3499 | |||
3500 | /* Cluster boundary, so we don't need to grab any pages. */ | ||
3501 | if ((isize & (csize - 1)) == 0) | ||
3502 | goto out; | ||
1669 | 3503 | ||
1670 | i = le16_to_cpu(el->l_next_free_rec) - 1; | 3504 | ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, |
3505 | phys, NULL, &ext_flags); | ||
3506 | if (ret) { | ||
3507 | mlog_errno(ret); | ||
3508 | goto out; | ||
3509 | } | ||
1671 | 3510 | ||
1672 | mlog(0, "extent block %llu, before: record %d: " | 3511 | /* Tail is a hole. */ |
1673 | "(%u, %u, %llu), next = %u\n", | 3512 | if (*phys == 0) |
1674 | (unsigned long long)le64_to_cpu(eb->h_blkno), i, | 3513 | goto out; |
1675 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
1676 | le32_to_cpu(el->l_recs[i].e_clusters), | ||
1677 | (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), | ||
1678 | le16_to_cpu(el->l_next_free_rec)); | ||
1679 | 3514 | ||
1680 | BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); | 3515 | /* Tail is marked as unwritten, we can count on write to zero |
1681 | le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); | 3516 | * in that case. */ |
1682 | 3517 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | |
1683 | next_eb = le64_to_cpu(el->l_recs[i].e_blkno); | 3518 | goto out; |
1684 | /* bottom-most block requires us to delete data.*/ | ||
1685 | if (!el->l_tree_depth) | ||
1686 | delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) | ||
1687 | + ocfs2_clusters_to_blocks(osb->sb, | ||
1688 | le32_to_cpu(el->l_recs[i].e_clusters)); | ||
1689 | if (!el->l_recs[i].e_clusters) { | ||
1690 | el->l_recs[i].e_cpos = 0; | ||
1691 | el->l_recs[i].e_blkno = 0; | ||
1692 | BUG_ON(!el->l_next_free_rec); | ||
1693 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
1694 | } | ||
1695 | mlog(0, "extent block %llu, after: record %d: " | ||
1696 | "(%u, %u, %llu), next = %u\n", | ||
1697 | (unsigned long long)le64_to_cpu(eb->h_blkno), i, | ||
1698 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
1699 | le32_to_cpu(el->l_recs[i].e_clusters), | ||
1700 | (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), | ||
1701 | le16_to_cpu(el->l_next_free_rec)); | ||
1702 | 3519 | ||
1703 | status = ocfs2_journal_dirty(handle, eb_bh); | 3520 | next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); |
1704 | if (status < 0) { | 3521 | index = isize >> PAGE_CACHE_SHIFT; |
1705 | mlog_errno(status); | 3522 | do { |
1706 | goto bail; | 3523 | pages[numpages] = grab_cache_page(mapping, index); |
3524 | if (!pages[numpages]) { | ||
3525 | ret = -ENOMEM; | ||
3526 | mlog_errno(ret); | ||
3527 | goto out; | ||
1707 | } | 3528 | } |
1708 | 3529 | ||
1709 | if (!el->l_next_free_rec) { | 3530 | numpages++; |
1710 | mlog(0, "deleting this extent block.\n"); | 3531 | index++; |
1711 | 3532 | } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); | |
1712 | ocfs2_remove_from_cache(inode, eb_bh); | ||
1713 | 3533 | ||
1714 | BUG_ON(el->l_recs[0].e_clusters); | 3534 | out: |
1715 | BUG_ON(el->l_recs[0].e_cpos); | 3535 | if (ret != 0) { |
1716 | BUG_ON(el->l_recs[0].e_blkno); | 3536 | if (pages) { |
1717 | if (eb->h_suballoc_slot == 0) { | 3537 | for (i = 0; i < numpages; i++) { |
1718 | /* | 3538 | if (pages[i]) { |
1719 | * This code only understands how to | 3539 | unlock_page(pages[i]); |
1720 | * lock the suballocator in slot 0, | 3540 | page_cache_release(pages[i]); |
1721 | * which is fine because allocation is | ||
1722 | * only ever done out of that | ||
1723 | * suballocator too. A future version | ||
1724 | * might change that however, so avoid | ||
1725 | * a free if we don't know how to | ||
1726 | * handle it. This way an fs incompat | ||
1727 | * bit will not be necessary. | ||
1728 | */ | ||
1729 | status = ocfs2_free_extent_block(handle, | ||
1730 | tc->tc_ext_alloc_inode, | ||
1731 | tc->tc_ext_alloc_bh, | ||
1732 | eb); | ||
1733 | if (status < 0) { | ||
1734 | mlog_errno(status); | ||
1735 | goto bail; | ||
1736 | } | 3541 | } |
1737 | } | 3542 | } |
1738 | } | 3543 | } |
1739 | brelse(eb_bh); | 3544 | numpages = 0; |
1740 | eb_bh = NULL; | ||
1741 | depth--; | ||
1742 | } | 3545 | } |
1743 | 3546 | ||
1744 | BUG_ON(!delete_blk); | 3547 | *num = numpages; |
1745 | status = ocfs2_truncate_log_append(osb, handle, delete_blk, | 3548 | |
1746 | clusters_to_del); | 3549 | return ret; |
1747 | if (status < 0) { | 3550 | } |
1748 | mlog_errno(status); | 3551 | |
1749 | goto bail; | 3552 | /* |
3553 | * Zero the area past i_size but still within an allocated | ||
3554 | * cluster. This avoids exposing nonzero data on subsequent file | ||
3555 | * extends. | ||
3556 | * | ||
3557 | * We need to call this before i_size is updated on the inode because | ||
3558 | * otherwise block_write_full_page() will skip writeout of pages past | ||
3559 | * i_size. The new_i_size parameter is passed for this reason. | ||
3560 | */ | ||
3561 | int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, | ||
3562 | u64 new_i_size) | ||
3563 | { | ||
3564 | int ret, numpages; | ||
3565 | loff_t endbyte; | ||
3566 | struct page **pages = NULL; | ||
3567 | u64 phys; | ||
3568 | |||
3569 | /* | ||
3570 | * File systems which don't support sparse files zero on every | ||
3571 | * extend. | ||
3572 | */ | ||
3573 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | ||
3574 | return 0; | ||
3575 | |||
3576 | pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), | ||
3577 | sizeof(struct page *), GFP_NOFS); | ||
3578 | if (pages == NULL) { | ||
3579 | ret = -ENOMEM; | ||
3580 | mlog_errno(ret); | ||
3581 | goto out; | ||
1750 | } | 3582 | } |
1751 | status = 0; | 3583 | |
1752 | bail: | 3584 | ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); |
1753 | if (!status) | 3585 | if (ret) { |
1754 | ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); | 3586 | mlog_errno(ret); |
1755 | else | 3587 | goto out; |
1756 | ocfs2_extent_map_drop(inode, 0); | 3588 | } |
1757 | mlog_exit(status); | 3589 | |
1758 | return status; | 3590 | if (numpages == 0) |
3591 | goto out; | ||
3592 | |||
3593 | ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, | ||
3594 | handle); | ||
3595 | |||
3596 | /* | ||
3597 | * Initiate writeout of the pages we zero'd here. We don't | ||
3598 | * wait on them - the truncate_inode_pages() call later will | ||
3599 | * do that for us. | ||
3600 | */ | ||
3601 | endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); | ||
3602 | ret = do_sync_mapping_range(inode->i_mapping, new_i_size, | ||
3603 | endbyte - 1, SYNC_FILE_RANGE_WRITE); | ||
3604 | if (ret) | ||
3605 | mlog_errno(ret); | ||
3606 | |||
3607 | out: | ||
3608 | if (pages) | ||
3609 | kfree(pages); | ||
3610 | |||
3611 | return ret; | ||
1759 | } | 3612 | } |
1760 | 3613 | ||
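
The helpers above zero the slice between the new i_size and the end of its still-allocated cluster, and grab the pages that cover that slice. A userspace sketch of the range arithmetic, assuming a 4KB page size and an example 64KB cluster size (both are illustrative values, not fixed ocfs2 constants):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round 'bytes' up to the next cluster boundary; cluster size is a
 * power of two, as in ocfs2_align_bytes_to_clusters(). */
static uint64_t align_to_cluster(uint64_t bytes, unsigned int csize_bits)
{
    uint64_t csize = 1ULL << csize_bits;

    return (bytes + csize - 1) & ~(csize - 1);
}

int main(void)
{
    unsigned int csize_bits = 16;           /* 64KB clusters (example) */
    uint64_t new_i_size = 100 * 1024 + 200; /* not cluster aligned     */

    uint64_t zero_end = align_to_cluster(new_i_size, csize_bits);
    uint64_t from = new_i_size & (PAGE_SIZE - 1); /* offset in 1st page */
    unsigned long first = new_i_size >> PAGE_SHIFT;
    unsigned long last = zero_end >> PAGE_SHIFT;  /* exclusive          */

    printf("zero bytes [%llu, %llu)\n",
           (unsigned long long)new_i_size, (unsigned long long)zero_end);
    printf("pages [%lu, %lu), first-page offset %llu\n",
           first, last, (unsigned long long)from);
    /* A cluster-aligned new_i_size gives zero_end == new_i_size and
     * an empty page range, matching the early return in the kernel. */
    return 0;
}
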
1761 | /* | 3614 | /* |
@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, | |||
1770 | struct ocfs2_truncate_context *tc) | 3623 | struct ocfs2_truncate_context *tc) |
1771 | { | 3624 | { |
1772 | int status, i, credits, tl_sem = 0; | 3625 | int status, i, credits, tl_sem = 0; |
1773 | u32 clusters_to_del, target_i_clusters; | 3626 | u32 clusters_to_del, new_highest_cpos, range; |
1774 | u64 last_eb = 0; | ||
1775 | struct ocfs2_dinode *fe; | ||
1776 | struct ocfs2_extent_block *eb; | ||
1777 | struct ocfs2_extent_list *el; | 3627 | struct ocfs2_extent_list *el; |
1778 | struct buffer_head *last_eb_bh; | ||
1779 | handle_t *handle = NULL; | 3628 | handle_t *handle = NULL; |
1780 | struct inode *tl_inode = osb->osb_tl_inode; | 3629 | struct inode *tl_inode = osb->osb_tl_inode; |
3630 | struct ocfs2_path *path = NULL; | ||
1781 | 3631 | ||
1782 | mlog_entry_void(); | 3632 | mlog_entry_void(); |
1783 | 3633 | ||
1784 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 3634 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
1785 | 3635 | ||
1786 | target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, | 3636 | new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, |
1787 | i_size_read(inode)); | 3637 | i_size_read(inode)); |
1788 | 3638 | ||
1789 | last_eb_bh = tc->tc_last_eb_bh; | 3639 | path = ocfs2_new_inode_path(fe_bh); |
1790 | tc->tc_last_eb_bh = NULL; | 3640 | if (!path) { |
3641 | status = -ENOMEM; | ||
3642 | mlog_errno(status); | ||
3643 | goto bail; | ||
3644 | } | ||
1791 | 3645 | ||
1792 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | 3646 | ocfs2_extent_map_trunc(inode, new_highest_cpos); |
1793 | 3647 | ||
1794 | if (fe->id2.i_list.l_tree_depth) { | ||
1795 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1796 | el = &eb->h_list; | ||
1797 | } else | ||
1798 | el = &fe->id2.i_list; | ||
1799 | last_eb = le64_to_cpu(fe->i_last_eb_blk); | ||
1800 | start: | 3648 | start: |
1801 | mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " | 3649 | /* |
1802 | "last_eb = %llu, fe->i_last_eb_blk = %llu, " | 3650 | * Check that we still have allocation to delete. |
1803 | "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", | 3651 | */ |
1804 | le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, | 3652 | if (OCFS2_I(inode)->ip_clusters == 0) { |
1805 | (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), | 3653 | status = 0; |
1806 | le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); | 3654 | goto bail; |
1807 | 3655 | } | |
1808 | if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { | ||
1809 | mlog(0, "last_eb changed!\n"); | ||
1810 | BUG_ON(!fe->id2.i_list.l_tree_depth); | ||
1811 | last_eb = le64_to_cpu(fe->i_last_eb_blk); | ||
1812 | /* i_last_eb_blk may have changed, read it if | ||
1813 | * necessary. We don't have to worry about the | ||
1814 | * truncate to zero case here (where there becomes no | ||
1815 | * last_eb) because we never loop back after our work | ||
1816 | * is done. */ | ||
1817 | if (last_eb_bh) { | ||
1818 | brelse(last_eb_bh); | ||
1819 | last_eb_bh = NULL; | ||
1820 | } | ||
1821 | 3656 | ||
1822 | status = ocfs2_read_block(osb, last_eb, | 3657 | /* |
1823 | &last_eb_bh, OCFS2_BH_CACHED, | 3658 | * Truncate always works against the rightmost tree branch. |
1824 | inode); | 3659 | */ |
1825 | if (status < 0) { | 3660 | status = ocfs2_find_path(inode, path, UINT_MAX); |
1826 | mlog_errno(status); | 3661 | if (status) { |
1827 | goto bail; | 3662 | mlog_errno(status); |
1828 | } | 3663 | goto bail; |
1829 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | 3664 | } |
1830 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 3665 | |
1831 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 3666 | mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", |
1832 | status = -EIO; | 3667 | OCFS2_I(inode)->ip_clusters, path->p_tree_depth); |
1833 | goto bail; | 3668 | |
1834 | } | 3669 | /* |
1835 | el = &(eb->h_list); | 3670 | * By now, el will point to the extent list on the bottom most |
3671 | * portion of this tree. Only the tail record is considered in | ||
3672 | * each pass. | ||
3673 | * | ||
3674 | * We handle the following cases, in order: | ||
3675 | * - empty extent: delete the remaining branch | ||
3676 | * - remove the entire record | ||
3677 | * - remove a partial record | ||
3678 | * - no record needs to be removed (truncate has completed) | ||
3679 | */ | ||
3680 | el = path_leaf_el(path); | ||
3681 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | ||
3682 | ocfs2_error(inode->i_sb, | ||
3683 | "Inode %llu has empty extent block at %llu\n", | ||
3684 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
3685 | (unsigned long long)path_leaf_bh(path)->b_blocknr); | ||
3686 | status = -EROFS; | ||
3687 | goto bail; | ||
1836 | } | 3688 | } |
1837 | 3689 | ||
1838 | /* by now, el will point to the extent list on the bottom most | ||
1839 | * portion of this tree. */ | ||
1840 | i = le16_to_cpu(el->l_next_free_rec) - 1; | 3690 | i = le16_to_cpu(el->l_next_free_rec) - 1; |
1841 | if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) | 3691 | range = le32_to_cpu(el->l_recs[i].e_cpos) + |
1842 | clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); | 3692 | ocfs2_rec_clusters(el, &el->l_recs[i]); |
1843 | else | 3693 | if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { |
1844 | clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + | 3694 | clusters_to_del = 0; |
3695 | } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { | ||
3696 | clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); | ||
3697 | } else if (range > new_highest_cpos) { | ||
3698 | clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + | ||
1845 | le32_to_cpu(el->l_recs[i].e_cpos)) - | 3699 | le32_to_cpu(el->l_recs[i].e_cpos)) - |
1846 | target_i_clusters; | 3700 | new_highest_cpos; |
3701 | } else { | ||
3702 | status = 0; | ||
3703 | goto bail; | ||
3704 | } | ||
1847 | 3705 | ||
1848 | mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); | 3706 | mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", |
3707 | clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); | ||
3708 | |||
3709 | BUG_ON(clusters_to_del == 0); | ||
1849 | 3710 | ||
1850 | mutex_lock(&tl_inode->i_mutex); | 3711 | mutex_lock(&tl_inode->i_mutex); |
1851 | tl_sem = 1; | 3712 | tl_sem = 1; |
@@ -1861,7 +3722,8 @@ start: | |||
1861 | } | 3722 | } |
1862 | 3723 | ||
1863 | credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, | 3724 | credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, |
1864 | fe, el); | 3725 | (struct ocfs2_dinode *)fe_bh->b_data, |
3726 | el); | ||
1865 | handle = ocfs2_start_trans(osb, credits); | 3727 | handle = ocfs2_start_trans(osb, credits); |
1866 | if (IS_ERR(handle)) { | 3728 | if (IS_ERR(handle)) { |
1867 | status = PTR_ERR(handle); | 3729 | status = PTR_ERR(handle); |
@@ -1870,13 +3732,8 @@ start: | |||
1870 | goto bail; | 3732 | goto bail; |
1871 | } | 3733 | } |
1872 | 3734 | ||
1873 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 3735 | status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, |
1874 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 3736 | tc, path); |
1875 | if (status < 0) | ||
1876 | mlog_errno(status); | ||
1877 | |||
1878 | status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, | ||
1879 | last_eb_bh, handle, tc); | ||
1880 | if (status < 0) { | 3737 | if (status < 0) { |
1881 | mlog_errno(status); | 3738 | mlog_errno(status); |
1882 | goto bail; | 3739 | goto bail; |
@@ -1888,9 +3745,14 @@ start: | |||
1888 | ocfs2_commit_trans(osb, handle); | 3745 | ocfs2_commit_trans(osb, handle); |
1889 | handle = NULL; | 3746 | handle = NULL; |
1890 | 3747 | ||
1891 | BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); | 3748 | ocfs2_reinit_path(path, 1); |
1892 | if (le32_to_cpu(fe->i_clusters) > target_i_clusters) | 3749 | |
1893 | goto start; | 3750 | /* |
3751 | * The check above will catch the case where we've truncated | ||
3752 | * away all allocation. | ||
3753 | */ | ||
3754 | goto start; | ||
3755 | |||
1894 | bail: | 3756 | bail: |
1895 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 3757 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
1896 | 3758 | ||
@@ -1902,8 +3764,7 @@ bail: | |||
1902 | if (handle) | 3764 | if (handle) |
1903 | ocfs2_commit_trans(osb, handle); | 3765 | ocfs2_commit_trans(osb, handle); |
1904 | 3766 | ||
1905 | if (last_eb_bh) | 3767 | ocfs2_free_path(path); |
1906 | brelse(last_eb_bh); | ||
1907 | 3768 | ||
1908 | /* This will drop the ext_alloc cluster lock for us */ | 3769 | /* This will drop the ext_alloc cluster lock for us */ |
1909 | ocfs2_free_truncate_context(tc); | 3770 | ocfs2_free_truncate_context(tc); |
@@ -1912,7 +3773,6 @@ bail: | |||
1912 | return status; | 3773 | return status; |
1913 | } | 3774 | } |
1914 | 3775 | ||
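
Each pass of the loop above removes at most one tail record's worth of allocation: nothing but the branch (empty leaf), the whole record, a partial trim, or no work at all once the record ends at or below the new size. A standalone sketch of that per-pass computation, with plain host-endian fields in place of the on-disk little-endian ones:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct simple_rec {
    uint32_t cpos;     /* logical start, in clusters */
    uint32_t clusters; /* length, in clusters        */
};

/*
 * How many clusters the tail record loses in one truncate pass.
 * Returns true if there is work to do, false once the tail record
 * ends at or below the new highest cpos.
 */
static bool tail_clusters_to_del(const struct simple_rec *tail, bool tail_empty,
                                 uint32_t new_highest_cpos, uint32_t *to_del)
{
    uint32_t range = tail->cpos + tail->clusters;

    if (tail_empty) {
        *to_del = 0;                    /* empty leaf: only the branch goes */
        return true;
    }
    if (tail->cpos >= new_highest_cpos) {
        *to_del = tail->clusters;       /* whole record goes */
        return true;
    }
    if (range > new_highest_cpos) {
        *to_del = range - new_highest_cpos; /* partial trim */
        return true;
    }
    return false;                       /* truncate already complete */
}

int main(void)
{
    struct simple_rec tail = { .cpos = 40, .clusters = 24 };
    uint32_t del;

    if (tail_clusters_to_del(&tail, false, 48, &del))
        printf("delete %u clusters this pass\n", del); /* 16 */
    return 0;
}
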
1915 | |||
1916 | /* | 3776 | /* |
1917 | * Expects the inode to already be locked. This will figure out which | 3777 | * Expects the inode to already be locked. This will figure out which |
1918 | * inodes need to be locked and will put them on the returned truncate | 3778 | * inodes need to be locked and will put them on the returned truncate |
@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
1923 | struct buffer_head *fe_bh, | 3783 | struct buffer_head *fe_bh, |
1924 | struct ocfs2_truncate_context **tc) | 3784 | struct ocfs2_truncate_context **tc) |
1925 | { | 3785 | { |
1926 | int status, metadata_delete; | 3786 | int status, metadata_delete, i; |
1927 | unsigned int new_i_clusters; | 3787 | unsigned int new_i_clusters; |
1928 | struct ocfs2_dinode *fe; | 3788 | struct ocfs2_dinode *fe; |
1929 | struct ocfs2_extent_block *eb; | 3789 | struct ocfs2_extent_block *eb; |
@@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
1944 | "%llu\n", fe->i_clusters, new_i_clusters, | 3804 | "%llu\n", fe->i_clusters, new_i_clusters, |
1945 | (unsigned long long)fe->i_size); | 3805 | (unsigned long long)fe->i_size); |
1946 | 3806 | ||
1947 | if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { | ||
1948 | ocfs2_error(inode->i_sb, "Dinode %llu has cluster count " | ||
1949 | "%u and size %llu whereas struct inode has " | ||
1950 | "cluster count %u and size %llu which caused an " | ||
1951 | "invalid truncate to %u clusters.", | ||
1952 | (unsigned long long)le64_to_cpu(fe->i_blkno), | ||
1953 | le32_to_cpu(fe->i_clusters), | ||
1954 | (unsigned long long)le64_to_cpu(fe->i_size), | ||
1955 | OCFS2_I(inode)->ip_clusters, i_size_read(inode), | ||
1956 | new_i_clusters); | ||
1957 | mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); | ||
1958 | status = -EIO; | ||
1959 | goto bail; | ||
1960 | } | ||
1961 | |||
1962 | *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); | 3807 | *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); |
1963 | if (!(*tc)) { | 3808 | if (!(*tc)) { |
1964 | status = -ENOMEM; | 3809 | status = -ENOMEM; |
@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
1986 | goto bail; | 3831 | goto bail; |
1987 | } | 3832 | } |
1988 | el = &(eb->h_list); | 3833 | el = &(eb->h_list); |
1989 | if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) | 3834 | |
3835 | i = 0; | ||
3836 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | ||
3837 | i = 1; | ||
3838 | /* | ||
3839 | * XXX: Should we check that next_free_rec contains | ||
3840 | * the extent? | ||
3841 | */ | ||
3842 | if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters) | ||
1990 | metadata_delete = 1; | 3843 | metadata_delete = 1; |
1991 | } | 3844 | } |
1992 | 3845 | ||
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 0b82e8044325..fbcb5934a081 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h | |||
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
31 | handle_t *handle, | 31 | handle_t *handle, |
32 | struct inode *inode, | 32 | struct inode *inode, |
33 | struct buffer_head *fe_bh, | 33 | struct buffer_head *fe_bh, |
34 | u64 blkno, | 34 | u32 cpos, |
35 | u64 start_blk, | ||
35 | u32 new_clusters, | 36 | u32 new_clusters, |
36 | struct ocfs2_alloc_context *meta_ac); | 37 | struct ocfs2_alloc_context *meta_ac); |
37 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | 38 | int ocfs2_num_free_extents(struct ocfs2_super *osb, |
@@ -70,6 +71,8 @@ struct ocfs2_truncate_context { | |||
70 | struct buffer_head *tc_last_eb_bh; | 71 | struct buffer_head *tc_last_eb_bh; |
71 | }; | 72 | }; |
72 | 73 | ||
74 | int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, | ||
75 | u64 new_i_size); | ||
73 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, | 76 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, |
74 | struct inode *inode, | 77 | struct inode *inode, |
75 | struct buffer_head *fe_bh, | 78 | struct buffer_head *fe_bh, |
@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, | |||
79 | struct buffer_head *fe_bh, | 82 | struct buffer_head *fe_bh, |
80 | struct ocfs2_truncate_context *tc); | 83 | struct ocfs2_truncate_context *tc); |
81 | 84 | ||
85 | int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, | ||
86 | u32 cpos, struct buffer_head **leaf_bh); | ||
87 | |||
88 | /* | ||
89 | * Helper function to look at the # of clusters in an extent record. | ||
90 | */ | ||
91 | static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, | ||
92 | struct ocfs2_extent_rec *rec) | ||
93 | { | ||
94 | /* | ||
95 | * Cluster count in extent records is slightly different | ||
96 | * between interior nodes and leaf nodes. This is to support | ||
97 | * unwritten extents which need a flags field in leaf node | ||
98 | * records, thus shrinking the available space for a clusters | ||
99 | * field. | ||
100 | */ | ||
101 | if (el->l_tree_depth) | ||
102 | return le32_to_cpu(rec->e_int_clusters); | ||
103 | else | ||
104 | return le16_to_cpu(rec->e_leaf_clusters); | ||
105 | } | ||
106 | |||
82 | #endif /* OCFS2_ALLOC_H */ | 107 | #endif /* OCFS2_ALLOC_H */ |
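
The ocfs2_rec_clusters() helper added here exists because leaf records give up part of the 32-bit cluster count to carry a flags byte (for unwritten extents), while interior records keep the wide field. The following is a schematic userspace illustration of that layout and the depth-based accessor; the field names mirror the kernel's, but the structure is only an approximation of the real on-disk record.

#include <stdint.h>
#include <stdio.h>

/* Schematic of the record layout - not the on-disk, little-endian
 * ocfs2_extent_rec (which also carries e_blkno). */
struct simple_rec {
    uint32_t e_cpos;
    union {
        uint32_t e_int_clusters;          /* interior nodes           */
        struct {
            uint16_t e_leaf_clusters;     /* leaf nodes               */
            uint8_t  e_reserved;
            uint8_t  e_flags;             /* e.g. the unwritten bit   */
        };
    };
};

/* Pick the right field based on the list's tree depth. */
static unsigned int rec_clusters(uint16_t tree_depth,
                                 const struct simple_rec *rec)
{
    if (tree_depth)
        return rec->e_int_clusters;
    return rec->e_leaf_clusters;
}

int main(void)
{
    struct simple_rec leaf = { .e_cpos = 0, .e_leaf_clusters = 12 };
    struct simple_rec node = { .e_cpos = 0, .e_int_clusters = 70000 };

    printf("leaf: %u, interior: %u\n",
           rec_clusters(0, &leaf), rec_clusters(1, &node));
    return 0;
}

Only leaf records ever need the flags byte, which is why interior records can keep the full 32-bit count.
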
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 875c11443817..56963e6c46c0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -24,6 +24,8 @@ | |||
24 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <asm/byteorder.h> | 26 | #include <asm/byteorder.h> |
27 | #include <linux/swap.h> | ||
28 | #include <linux/pipe_fs_i.h> | ||
27 | 29 | ||
28 | #define MLOG_MASK_PREFIX ML_FILE_IO | 30 | #define MLOG_MASK_PREFIX ML_FILE_IO |
29 | #include <cluster/masklog.h> | 31 | #include <cluster/masklog.h> |
@@ -37,6 +39,7 @@ | |||
37 | #include "file.h" | 39 | #include "file.h" |
38 | #include "inode.h" | 40 | #include "inode.h" |
39 | #include "journal.h" | 41 | #include "journal.h" |
42 | #include "suballoc.h" | ||
40 | #include "super.h" | 43 | #include "super.h" |
41 | #include "symlink.h" | 44 | #include "symlink.h" |
42 | 45 | ||
@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
134 | struct buffer_head *bh_result, int create) | 137 | struct buffer_head *bh_result, int create) |
135 | { | 138 | { |
136 | int err = 0; | 139 | int err = 0; |
140 | unsigned int ext_flags; | ||
137 | u64 p_blkno, past_eof; | 141 | u64 p_blkno, past_eof; |
142 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
138 | 143 | ||
139 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, | 144 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, |
140 | (unsigned long long)iblock, bh_result, create); | 145 | (unsigned long long)iblock, bh_result, create); |
@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
149 | goto bail; | 154 | goto bail; |
150 | } | 155 | } |
151 | 156 | ||
152 | /* this can happen if another node truncs after our extend! */ | 157 | err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, |
153 | spin_lock(&OCFS2_I(inode)->ip_lock); | 158 | &ext_flags); |
154 | if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | ||
155 | OCFS2_I(inode)->ip_clusters)) | ||
156 | err = -EIO; | ||
157 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
158 | if (err) | ||
159 | goto bail; | ||
160 | |||
161 | err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | ||
162 | NULL); | ||
163 | if (err) { | 159 | if (err) { |
164 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " | 160 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " |
165 | "%llu, NULL)\n", err, inode, (unsigned long long)iblock, | 161 | "%llu, NULL)\n", err, inode, (unsigned long long)iblock, |
@@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
167 | goto bail; | 163 | goto bail; |
168 | } | 164 | } |
169 | 165 | ||
170 | map_bh(bh_result, inode->i_sb, p_blkno); | 166 | /* |
171 | 167 | * ocfs2 never allocates in this function - the only time we | |
172 | if (bh_result->b_blocknr == 0) { | 168 | * need to use BH_New is when we're extending i_size on a file |
173 | err = -EIO; | 169 | * system which doesn't support holes, in which case BH_New |
174 | mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", | 170 | * allows block_prepare_write() to zero. |
175 | (unsigned long long)iblock, | 171 | */ |
176 | (unsigned long long)p_blkno, | 172 | mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), |
177 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 173 | "ino %lu, iblock %llu\n", inode->i_ino, |
178 | } | 174 | (unsigned long long)iblock); |
175 | |||
176 | /* Treat the unwritten extent as a hole for zeroing purposes. */ | ||
177 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
178 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
179 | |||
180 | if (!ocfs2_sparse_alloc(osb)) { | ||
181 | if (p_blkno == 0) { | ||
182 | err = -EIO; | ||
183 | mlog(ML_ERROR, | ||
184 | "iblock = %llu p_blkno = %llu blkno=(%llu)\n", | ||
185 | (unsigned long long)iblock, | ||
186 | (unsigned long long)p_blkno, | ||
187 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
188 | mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); | ||
189 | dump_stack(); | ||
190 | } | ||
179 | 191 | ||
180 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 192 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
181 | mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, | 193 | mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, |
182 | (unsigned long long)past_eof); | 194 | (unsigned long long)past_eof); |
183 | 195 | ||
184 | if (create && (iblock >= past_eof)) | 196 | if (create && (iblock >= past_eof)) |
185 | set_buffer_new(bh_result); | 197 | set_buffer_new(bh_result); |
198 | } | ||
186 | 199 | ||
187 | bail: | 200 | bail: |
188 | if (err < 0) | 201 | if (err < 0) |
@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) | |||
276 | return ret; | 289 | return ret; |
277 | } | 290 | } |
278 | 291 | ||
279 | /* This can also be called from ocfs2_write_zero_page() which has done | 292 | /* |
280 | * it's own cluster locking. */ | 293 | * This is called from ocfs2_write_zero_page() which has handled its |
294 | * own cluster locking and has ensured allocation exists for those | ||
295 | * blocks to be written. | ||
296 | */ | ||
281 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, | 297 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, |
282 | unsigned from, unsigned to) | 298 | unsigned from, unsigned to) |
283 | { | 299 | { |
@@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, | |||
292 | return ret; | 308 | return ret; |
293 | } | 309 | } |
294 | 310 | ||
295 | /* | ||
296 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called | ||
297 | * from loopback. It must be able to perform its own locking around | ||
298 | * ocfs2_get_block(). | ||
299 | */ | ||
300 | static int ocfs2_prepare_write(struct file *file, struct page *page, | ||
301 | unsigned from, unsigned to) | ||
302 | { | ||
303 | struct inode *inode = page->mapping->host; | ||
304 | int ret; | ||
305 | |||
306 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
307 | |||
308 | ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); | ||
309 | if (ret != 0) { | ||
310 | mlog_errno(ret); | ||
311 | goto out; | ||
312 | } | ||
313 | |||
314 | ret = ocfs2_prepare_write_nolock(inode, page, from, to); | ||
315 | |||
316 | ocfs2_meta_unlock(inode, 0); | ||
317 | out: | ||
318 | mlog_exit(ret); | ||
319 | return ret; | ||
320 | } | ||
321 | |||
322 | /* Taken from ext3. We don't necessarily need the full blown | 311 | /* Taken from ext3. We don't necessarily need the full blown |
323 | * functionality yet, but IMHO it's better to cut and paste the whole | 312 | * functionality yet, but IMHO it's better to cut and paste the whole |
324 | * thing so we can avoid introducing our own bugs (and easily pick up | 313 | * thing so we can avoid introducing our own bugs (and easily pick up |
325 | * their fixes when they happen) --Mark */ | 314 | * their fixes when they happen) --Mark */ |
326 | static int walk_page_buffers( handle_t *handle, | 315 | int walk_page_buffers( handle_t *handle, |
327 | struct buffer_head *head, | 316 | struct buffer_head *head, |
328 | unsigned from, | 317 | unsigned from, |
329 | unsigned to, | 318 | unsigned to, |
330 | int *partial, | 319 | int *partial, |
331 | int (*fn)( handle_t *handle, | 320 | int (*fn)( handle_t *handle, |
332 | struct buffer_head *bh)) | 321 | struct buffer_head *bh)) |
333 | { | 322 | { |
334 | struct buffer_head *bh; | 323 | struct buffer_head *bh; |
335 | unsigned block_start, block_end; | 324 | unsigned block_start, block_end; |
@@ -388,95 +377,6 @@ out: | |||
388 | return handle; | 377 | return handle; |
389 | } | 378 | } |
390 | 379 | ||
391 | static int ocfs2_commit_write(struct file *file, struct page *page, | ||
392 | unsigned from, unsigned to) | ||
393 | { | ||
394 | int ret; | ||
395 | struct buffer_head *di_bh = NULL; | ||
396 | struct inode *inode = page->mapping->host; | ||
397 | handle_t *handle = NULL; | ||
398 | struct ocfs2_dinode *di; | ||
399 | |||
400 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
401 | |||
402 | /* NOTE: ocfs2_file_aio_write has ensured that it's safe for | ||
403 | * us to continue here without rechecking the I/O against | ||
404 | * changed inode values. | ||
405 | * | ||
406 | * 1) We're currently holding the inode alloc lock, so no | ||
407 | * nodes can change it underneath us. | ||
408 | * | ||
409 | * 2) We've had to take the metadata lock at least once | ||
410 | * already to check for extending writes, suid removal, etc. | ||
411 | * The meta data update code then ensures that we don't get a | ||
412 | * stale inode allocation image (i_size, i_clusters, etc). | ||
413 | */ | ||
414 | |||
415 | ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); | ||
416 | if (ret != 0) { | ||
417 | mlog_errno(ret); | ||
418 | goto out; | ||
419 | } | ||
420 | |||
421 | ret = ocfs2_data_lock_with_page(inode, 1, page); | ||
422 | if (ret != 0) { | ||
423 | mlog_errno(ret); | ||
424 | goto out_unlock_meta; | ||
425 | } | ||
426 | |||
427 | handle = ocfs2_start_walk_page_trans(inode, page, from, to); | ||
428 | if (IS_ERR(handle)) { | ||
429 | ret = PTR_ERR(handle); | ||
430 | goto out_unlock_data; | ||
431 | } | ||
432 | |||
433 | /* Mark our buffer early. We'd rather catch this error up here | ||
434 | * as opposed to after a successful commit_write which would | ||
435 | * require us to set back inode->i_size. */ | ||
436 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
437 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
438 | if (ret < 0) { | ||
439 | mlog_errno(ret); | ||
440 | goto out_commit; | ||
441 | } | ||
442 | |||
443 | /* might update i_size */ | ||
444 | ret = generic_commit_write(file, page, from, to); | ||
445 | if (ret < 0) { | ||
446 | mlog_errno(ret); | ||
447 | goto out_commit; | ||
448 | } | ||
449 | |||
450 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
451 | |||
452 | /* ocfs2_mark_inode_dirty() is too heavy to use here. */ | ||
453 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
454 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
455 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
456 | |||
457 | inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); | ||
458 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
459 | |||
460 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
461 | if (ret < 0) { | ||
462 | mlog_errno(ret); | ||
463 | goto out_commit; | ||
464 | } | ||
465 | |||
466 | out_commit: | ||
467 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
468 | out_unlock_data: | ||
469 | ocfs2_data_unlock(inode, 1); | ||
470 | out_unlock_meta: | ||
471 | ocfs2_meta_unlock(inode, 1); | ||
472 | out: | ||
473 | if (di_bh) | ||
474 | brelse(di_bh); | ||
475 | |||
476 | mlog_exit(ret); | ||
477 | return ret; | ||
478 | } | ||
479 | |||
480 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | 380 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) |
481 | { | 381 | { |
482 | sector_t status; | 382 | sector_t status; |
@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | |||
499 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | 399 | down_read(&OCFS2_I(inode)->ip_alloc_sem); |
500 | } | 400 | } |
501 | 401 | ||
502 | err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, | 402 | err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); |
503 | NULL); | ||
504 | 403 | ||
505 | if (!INODE_JOURNAL(inode)) { | 404 | if (!INODE_JOURNAL(inode)) { |
506 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | 405 | up_read(&OCFS2_I(inode)->ip_alloc_sem); |
@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
540 | struct buffer_head *bh_result, int create) | 439 | struct buffer_head *bh_result, int create) |
541 | { | 440 | { |
542 | int ret; | 441 | int ret; |
543 | u64 p_blkno, inode_blocks; | 442 | u64 p_blkno, inode_blocks, contig_blocks; |
544 | int contig_blocks; | 443 | unsigned int ext_flags; |
545 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; | 444 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; |
546 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; | 445 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; |
547 | 446 | ||
@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
549 | * nicely aligned and of the right size, so there's no need | 448 | * nicely aligned and of the right size, so there's no need |
550 | * for us to check any of that. */ | 449 | * for us to check any of that. */ |
551 | 450 | ||
552 | spin_lock(&OCFS2_I(inode)->ip_lock); | 451 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
553 | inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb, | ||
554 | OCFS2_I(inode)->ip_clusters); | ||
555 | |||
556 | /* | ||
557 | * For a read which begins past the end of file, we return a hole. | ||
558 | */ | ||
559 | if (!create && (iblock >= inode_blocks)) { | ||
560 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
561 | ret = 0; | ||
562 | goto bail; | ||
563 | } | ||
564 | 452 | ||
565 | /* | 453 | /* |
566 | * Any write past EOF is not allowed because we'd be extending. | 454 | * Any write past EOF is not allowed because we'd be extending. |
567 | */ | 455 | */ |
568 | if (create && (iblock + max_blocks) > inode_blocks) { | 456 | if (create && (iblock + max_blocks) > inode_blocks) { |
569 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
570 | ret = -EIO; | 457 | ret = -EIO; |
571 | goto bail; | 458 | goto bail; |
572 | } | 459 | } |
573 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
574 | 460 | ||
575 | /* This figures out the size of the next contiguous block, and | 461 | /* This figures out the size of the next contiguous block, and |
576 | * our logical offset */ | 462 | * our logical offset */ |
577 | ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | 463 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
578 | &contig_blocks); | 464 | &contig_blocks, &ext_flags); |
579 | if (ret) { | 465 | if (ret) { |
580 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | 466 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", |
581 | (unsigned long long)iblock); | 467 | (unsigned long long)iblock); |
@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
583 | goto bail; | 469 | goto bail; |
584 | } | 470 | } |
585 | 471 | ||
586 | map_bh(bh_result, inode->i_sb, p_blkno); | 472 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { |
473 | ocfs2_error(inode->i_sb, | ||
474 | "Inode %llu has a hole at block %llu\n", | ||
475 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
476 | (unsigned long long)iblock); | ||
477 | ret = -EROFS; | ||
478 | goto bail; | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * get_more_blocks() expects us to describe a hole by clearing | ||
483 | * the mapped bit on bh_result(). | ||
484 | * | ||
485 | * Consider an unwritten extent as a hole. | ||
486 | */ | ||
487 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
488 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
489 | else { | ||
490 | /* | ||
491 | * ocfs2_prepare_inode_for_write() should have caught | ||
492 | * the case where we'd be filling a hole and triggered | ||
493 | * a buffered write instead. | ||
494 | */ | ||
495 | if (create) { | ||
496 | ret = -EIO; | ||
497 | mlog_errno(ret); | ||
498 | goto bail; | ||
499 | } | ||
500 | |||
501 | clear_buffer_mapped(bh_result); | ||
502 | } | ||
587 | 503 | ||
588 | /* make sure we don't map more than max_blocks blocks here as | 504 | /* make sure we don't map more than max_blocks blocks here as |
589 | that's all the kernel will handle at this point. */ | 505 | that's all the kernel will handle at this point. */ |
@@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
606 | void *private) | 522 | void *private) |
607 | { | 523 | { |
608 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 524 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
525 | int level; | ||
609 | 526 | ||
610 | /* this io's submitter should not have unlocked this before we could */ | 527 | /* this io's submitter should not have unlocked this before we could */ |
611 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 528 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
529 | |||
612 | ocfs2_iocb_clear_rw_locked(iocb); | 530 | ocfs2_iocb_clear_rw_locked(iocb); |
613 | up_read(&inode->i_alloc_sem); | 531 | |
614 | ocfs2_rw_unlock(inode, 0); | 532 | level = ocfs2_iocb_rw_locked_level(iocb); |
533 | if (!level) | ||
534 | up_read(&inode->i_alloc_sem); | ||
535 | ocfs2_rw_unlock(inode, level); | ||
615 | } | 536 | } |
616 | 537 | ||
617 | /* | 538 | /* |
@@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw, | |||
647 | 568 | ||
648 | mlog_entry_void(); | 569 | mlog_entry_void(); |
649 | 570 | ||
650 | /* | 571 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
651 | * We get PR data locks even for O_DIRECT. This allows | 572 | /* |
652 | * concurrent O_DIRECT I/O but doesn't let O_DIRECT with | 573 | * We get PR data locks even for O_DIRECT. This |
653 | * extending and buffered zeroing writes race. If they did | 574 | * allows concurrent O_DIRECT I/O but doesn't let |
654 | * race then the buffered zeroing could be written back after | 575 | * O_DIRECT with extending and buffered zeroing writes |
655 | * the O_DIRECT I/O. It's one thing to tell people not to mix | 576 | * race. If they did race then the buffered zeroing |
656 | * buffered and O_DIRECT writes, but expecting them to | 577 | * could be written back after the O_DIRECT I/O. It's |
657 | * understand that file extension is also an implicit buffered | 578 | * one thing to tell people not to mix buffered and |
658 | * write is too much. By getting the PR we force writeback of | 579 | * O_DIRECT writes, but expecting them to understand |
659 | * the buffered zeroing before proceeding. | 580 | * that file extension is also an implicit buffered |
660 | */ | 581 | * write is too much. By getting the PR we force |
661 | ret = ocfs2_data_lock(inode, 0); | 582 | * writeback of the buffered zeroing before |
662 | if (ret < 0) { | 583 | * proceeding. |
663 | mlog_errno(ret); | 584 | */ |
664 | goto out; | 585 | ret = ocfs2_data_lock(inode, 0); |
586 | if (ret < 0) { | ||
587 | mlog_errno(ret); | ||
588 | goto out; | ||
589 | } | ||
590 | ocfs2_data_unlock(inode, 0); | ||
665 | } | 591 | } |
666 | ocfs2_data_unlock(inode, 0); | ||
667 | 592 | ||
668 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | 593 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, |
669 | inode->i_sb->s_bdev, iov, offset, | 594 | inode->i_sb->s_bdev, iov, offset, |
@@ -675,11 +600,715 @@ out: | |||
675 | return ret; | 600 | return ret; |
676 | } | 601 | } |
677 | 602 | ||
603 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, | ||
604 | u32 cpos, | ||
605 | unsigned int *start, | ||
606 | unsigned int *end) | ||
607 | { | ||
608 | unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; | ||
609 | |||
610 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { | ||
611 | unsigned int cpp; | ||
612 | |||
613 | cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); | ||
614 | |||
615 | cluster_start = cpos % cpp; | ||
616 | cluster_start = cluster_start << osb->s_clustersize_bits; | ||
617 | |||
618 | cluster_end = cluster_start + osb->s_clustersize; | ||
619 | } | ||
620 | |||
621 | BUG_ON(cluster_start > PAGE_SIZE); | ||
622 | BUG_ON(cluster_end > PAGE_SIZE); | ||
623 | |||
624 | if (start) | ||
625 | *start = cluster_start; | ||
626 | if (end) | ||
627 | *end = cluster_end; | ||
628 | } | ||
629 | |||
630 | /* | ||
631 | * 'from' and 'to' are the region in the page to avoid zeroing. | ||
632 | * | ||
633 | * If pagesize > clustersize, this function will avoid zeroing outside | ||
634 | * of the cluster boundary. | ||
635 | * | ||
636 | * from == to == 0 is code for "zero the entire cluster region" | ||
637 | */ | ||
638 | static void ocfs2_clear_page_regions(struct page *page, | ||
639 | struct ocfs2_super *osb, u32 cpos, | ||
640 | unsigned from, unsigned to) | ||
641 | { | ||
642 | void *kaddr; | ||
643 | unsigned int cluster_start, cluster_end; | ||
644 | |||
645 | ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); | ||
646 | |||
647 | kaddr = kmap_atomic(page, KM_USER0); | ||
648 | |||
649 | if (from || to) { | ||
650 | if (from > cluster_start) | ||
651 | memset(kaddr + cluster_start, 0, from - cluster_start); | ||
652 | if (to < cluster_end) | ||
653 | memset(kaddr + to, 0, cluster_end - to); | ||
654 | } else { | ||
655 | memset(kaddr + cluster_start, 0, cluster_end - cluster_start); | ||
656 | } | ||
657 | |||
658 | kunmap_atomic(kaddr, KM_USER0); | ||
659 | } | ||
660 | |||
661 | /* | ||
662 | * Some of this taken from block_prepare_write(). We already have our | ||
663 | * mapping by now though, and the entire write will be allocating or | ||
664 | * it won't, so not much need to use BH_New. | ||
665 | * | ||
666 | * This will also skip zeroing, which is handled externally. | ||
667 | */ | ||
668 | int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | ||
669 | struct inode *inode, unsigned int from, | ||
670 | unsigned int to, int new) | ||
671 | { | ||
672 | int ret = 0; | ||
673 | struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; | ||
674 | unsigned int block_end, block_start; | ||
675 | unsigned int bsize = 1 << inode->i_blkbits; | ||
676 | |||
677 | if (!page_has_buffers(page)) | ||
678 | create_empty_buffers(page, bsize, 0); | ||
679 | |||
680 | head = page_buffers(page); | ||
681 | for (bh = head, block_start = 0; bh != head || !block_start; | ||
682 | bh = bh->b_this_page, block_start += bsize) { | ||
683 | block_end = block_start + bsize; | ||
684 | |||
685 | /* | ||
686 | * Ignore blocks outside of our i/o range - | ||
687 | * they may belong to unallocated clusters. | ||
688 | */ | ||
689 | if (block_start >= to || block_end <= from) { | ||
690 | if (PageUptodate(page)) | ||
691 | set_buffer_uptodate(bh); | ||
692 | continue; | ||
693 | } | ||
694 | |||
695 | /* | ||
696 | * For an allocating write with cluster size >= page | ||
697 | * size, we always write the entire page. | ||
698 | */ | ||
699 | |||
700 | if (buffer_new(bh)) | ||
701 | clear_buffer_new(bh); | ||
702 | |||
703 | if (!buffer_mapped(bh)) { | ||
704 | map_bh(bh, inode->i_sb, *p_blkno); | ||
705 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
706 | } | ||
707 | |||
708 | if (PageUptodate(page)) { | ||
709 | if (!buffer_uptodate(bh)) | ||
710 | set_buffer_uptodate(bh); | ||
711 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && | ||
712 | (block_start < from || block_end > to)) { | ||
713 | ll_rw_block(READ, 1, &bh); | ||
714 | *wait_bh++=bh; | ||
715 | } | ||
716 | |||
717 | *p_blkno = *p_blkno + 1; | ||
718 | } | ||
719 | |||
720 | /* | ||
721 | * If we issued read requests - let them complete. | ||
722 | */ | ||
723 | while(wait_bh > wait) { | ||
724 | wait_on_buffer(*--wait_bh); | ||
725 | if (!buffer_uptodate(*wait_bh)) | ||
726 | ret = -EIO; | ||
727 | } | ||
728 | |||
729 | if (ret == 0 || !new) | ||
730 | return ret; | ||
731 | |||
732 | /* | ||
733 | * If we get -EIO above, zero out any newly allocated blocks | ||
734 | * to avoid exposing stale data. | ||
735 | */ | ||
736 | bh = head; | ||
737 | block_start = 0; | ||
738 | do { | ||
739 | void *kaddr; | ||
740 | |||
741 | block_end = block_start + bsize; | ||
742 | if (block_end <= from) | ||
743 | goto next_bh; | ||
744 | if (block_start >= to) | ||
745 | break; | ||
746 | |||
747 | kaddr = kmap_atomic(page, KM_USER0); | ||
748 | memset(kaddr+block_start, 0, bh->b_size); | ||
749 | flush_dcache_page(page); | ||
750 | kunmap_atomic(kaddr, KM_USER0); | ||
751 | set_buffer_uptodate(bh); | ||
752 | mark_buffer_dirty(bh); | ||
753 | |||
754 | next_bh: | ||
755 | block_start = block_end; | ||
756 | bh = bh->b_this_page; | ||
757 | } while (bh != head); | ||
758 | |||
759 | return ret; | ||
760 | } | ||
761 | |||
762 | /* | ||
763 | * This will copy user data from the buffer page in the splice | ||
764 | * context. | ||
765 | * | ||
766 | * For now, we ignore SPLICE_F_MOVE as that would require some extra | ||
767 | * communication out all the way to ocfs2_write(). | ||
768 | */ | ||
769 | int ocfs2_map_and_write_splice_data(struct inode *inode, | ||
770 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
771 | unsigned int *ret_from, unsigned int *ret_to) | ||
772 | { | ||
773 | int ret; | ||
774 | unsigned int to, from, cluster_start, cluster_end; | ||
775 | char *src, *dst; | ||
776 | struct ocfs2_splice_write_priv *sp = wc->w_private; | ||
777 | struct pipe_buffer *buf = sp->s_buf; | ||
778 | unsigned long bytes, src_from; | ||
779 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
780 | |||
781 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | ||
782 | &cluster_end); | ||
783 | |||
784 | from = sp->s_offset; | ||
785 | src_from = sp->s_buf_offset; | ||
786 | bytes = wc->w_count; | ||
787 | |||
788 | if (wc->w_large_pages) { | ||
789 | /* | ||
790 | * For cluster size < page size, we have to | ||
791 | * calculate pos within the cluster and obey | ||
792 | * the rightmost boundary. | ||
793 | */ | ||
794 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
795 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
796 | } | ||
797 | to = from + bytes; | ||
798 | |||
799 | if (wc->w_this_page_new) | ||
800 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
801 | cluster_start, cluster_end, 1); | ||
802 | else | ||
803 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
804 | from, to, 0); | ||
805 | if (ret) { | ||
806 | mlog_errno(ret); | ||
807 | goto out; | ||
808 | } | ||
809 | |||
810 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
811 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
812 | BUG_ON(from > osb->s_clustersize); | ||
813 | BUG_ON(to > osb->s_clustersize); | ||
814 | |||
815 | src = buf->ops->map(sp->s_pipe, buf, 1); | ||
816 | dst = kmap_atomic(wc->w_this_page, KM_USER1); | ||
817 | memcpy(dst + from, src + src_from, bytes); | ||
818 | kunmap_atomic(wc->w_this_page, KM_USER1); | ||
819 | buf->ops->unmap(sp->s_pipe, buf, src); | ||
820 | |||
821 | wc->w_finished_copy = 1; | ||
822 | |||
823 | *ret_from = from; | ||
824 | *ret_to = to; | ||
825 | out: | ||
826 | |||
827 | return bytes ? (unsigned int)bytes : ret; | ||
828 | } | ||
829 | |||
830 | /* | ||
831 | * This will copy user data from the iovec in the buffered write | ||
832 | * context. | ||
833 | */ | ||
834 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
835 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
836 | unsigned int *ret_from, unsigned int *ret_to) | ||
837 | { | ||
838 | int ret; | ||
839 | unsigned int to, from, cluster_start, cluster_end; | ||
840 | unsigned long bytes, src_from; | ||
841 | char *dst; | ||
842 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
843 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
844 | char __user *buf; | ||
845 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
846 | |||
847 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | ||
848 | &cluster_end); | ||
849 | |||
850 | buf = cur_iov->iov_base + bp->b_cur_off; | ||
851 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | ||
852 | |||
853 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | ||
854 | |||
855 | /* | ||
856 | * This is a lot of comparisons, but it reads quite | ||
857 | * easily, which is important here. | ||
858 | */ | ||
859 | /* Stay within the src page */ | ||
860 | bytes = PAGE_SIZE - src_from; | ||
861 | /* Stay within the vector */ | ||
862 | bytes = min(bytes, | ||
863 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
864 | /* Stay within count */ | ||
865 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
866 | /* | ||
867 | * For clustersize > page size, just stay within | ||
868 | * target page, otherwise we have to calculate pos | ||
869 | * within the cluster and obey the rightmost | ||
870 | * boundary. | ||
871 | */ | ||
872 | if (wc->w_large_pages) { | ||
873 | /* | ||
874 | * For cluster size < page size, we have to | ||
875 | * calculate pos within the cluster and obey | ||
876 | * the rightmost boundary. | ||
877 | */ | ||
878 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
879 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
880 | } else { | ||
881 | /* | ||
882 | * cluster size > page size is the most common | ||
883 | * case - we just stay within the target page | ||
884 | * boundary. | ||
885 | */ | ||
886 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
887 | } | ||
888 | |||
889 | to = from + bytes; | ||
890 | |||
891 | if (wc->w_this_page_new) | ||
892 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
893 | cluster_start, cluster_end, 1); | ||
894 | else | ||
895 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
896 | from, to, 0); | ||
897 | if (ret) { | ||
898 | mlog_errno(ret); | ||
899 | goto out; | ||
900 | } | ||
901 | |||
902 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
903 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
904 | BUG_ON(from > osb->s_clustersize); | ||
905 | BUG_ON(to > osb->s_clustersize); | ||
906 | |||
907 | dst = kmap(wc->w_this_page); | ||
908 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | ||
909 | kunmap(wc->w_this_page); | ||
910 | |||
911 | /* | ||
912 | * XXX: This is slow, but simple. The caller of | ||
913 | * ocfs2_buffered_write_cluster() is responsible for | ||
914 | * passing through the iovecs, so it's difficult to | ||
915 | * predict what our next step is in here after our | ||
916 | * initial write. A future version should be pushing | ||
917 | * that iovec manipulation further down. | ||
918 | * | ||
919 | * By setting this, we indicate that a copy from user | ||
920 | * data was done, and subsequent calls for this | ||
921 | * cluster will skip copying more data. | ||
922 | */ | ||
923 | wc->w_finished_copy = 1; | ||
924 | |||
925 | *ret_from = from; | ||
926 | *ret_to = to; | ||
927 | out: | ||
928 | |||
929 | return bytes ? (unsigned int)bytes : ret; | ||
930 | } | ||
931 | |||
932 | /* | ||
933 | * Map, fill and write a page to disk. | ||
934 | * | ||
935 | * The work of copying data is done via callback. Newly allocated | ||
936 | * pages which don't take user data will be zero'd (set 'new' to | ||
937 | * indicate an allocating write) | ||
938 | * | ||
939 | * Returns a negative error code or the number of bytes copied into | ||
940 | * the page. | ||
941 | */ | ||
942 | int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
943 | u64 *p_blkno, struct page *page, | ||
944 | struct ocfs2_write_ctxt *wc, int new) | ||
945 | { | ||
946 | int ret, copied = 0; | ||
947 | unsigned int from = 0, to = 0; | ||
948 | unsigned int cluster_start, cluster_end; | ||
949 | unsigned int zero_from = 0, zero_to = 0; | ||
950 | |||
951 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | ||
952 | &cluster_start, &cluster_end); | ||
953 | |||
954 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | ||
955 | && !wc->w_finished_copy) { | ||
956 | |||
957 | wc->w_this_page = page; | ||
958 | wc->w_this_page_new = new; | ||
959 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | ||
960 | if (ret < 0) { | ||
961 | mlog_errno(ret); | ||
962 | goto out; | ||
963 | } | ||
964 | |||
965 | copied = ret; | ||
966 | |||
967 | zero_from = from; | ||
968 | zero_to = to; | ||
969 | if (new) { | ||
970 | from = cluster_start; | ||
971 | to = cluster_end; | ||
972 | } | ||
973 | } else { | ||
974 | /* | ||
975 | * If we haven't allocated the new page yet, we | ||
976 | * shouldn't be writing it out without copying user | ||
977 | * data. This is likely a math error from the caller. | ||
978 | */ | ||
979 | BUG_ON(!new); | ||
980 | |||
981 | from = cluster_start; | ||
982 | to = cluster_end; | ||
983 | |||
984 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
985 | cluster_start, cluster_end, 1); | ||
986 | if (ret) { | ||
987 | mlog_errno(ret); | ||
988 | goto out; | ||
989 | } | ||
990 | } | ||
991 | |||
992 | /* | ||
993 | * Parts of newly allocated pages need to be zero'd. | ||
994 | * | ||
995 | * Above, we have also rewritten 'to' and 'from' - as far as | ||
996 | * the rest of the function is concerned, the entire cluster | ||
997 | * range inside of a page needs to be written. | ||
998 | * | ||
999 | * We can skip this if the page is up to date - it's already | ||
1000 | * been zero'd from being read in as a hole. | ||
1001 | */ | ||
1002 | if (new && !PageUptodate(page)) | ||
1003 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | ||
1004 | wc->w_cpos, zero_from, zero_to); | ||
1005 | |||
1006 | flush_dcache_page(page); | ||
1007 | |||
1008 | if (ocfs2_should_order_data(inode)) { | ||
1009 | ret = walk_page_buffers(handle, | ||
1010 | page_buffers(page), | ||
1011 | from, to, NULL, | ||
1012 | ocfs2_journal_dirty_data); | ||
1013 | if (ret < 0) | ||
1014 | mlog_errno(ret); | ||
1015 | } | ||
1016 | |||
1017 | /* | ||
1018 | * We don't use generic_commit_write() because we need to | ||
1019 | * handle our own i_size update. | ||
1020 | */ | ||
1021 | ret = block_commit_write(page, from, to); | ||
1022 | if (ret) | ||
1023 | mlog_errno(ret); | ||
1024 | out: | ||
1025 | |||
1026 | return copied ? copied : ret; | ||
1027 | } | ||
1028 | |||
1029 | /* | ||
1030 | * Do the actual write of some data into an inode. Optionally allocate | ||
1031 | * in order to fulfill the write. | ||
1032 | * | ||
1033 | * cpos is the logical cluster offset within the file to write at | ||
1034 | * | ||
1035 | * 'phys' is the physical mapping of that offset. a 'phys' value of | ||
1036 | * zero indicates that allocation is required. In this case, data_ac | ||
1037 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
1038 | * allocation isn't required). | ||
1039 | */ | ||
1040 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | ||
1041 | struct buffer_head *di_bh, | ||
1042 | struct ocfs2_alloc_context *data_ac, | ||
1043 | struct ocfs2_alloc_context *meta_ac, | ||
1044 | struct ocfs2_write_ctxt *wc) | ||
1045 | { | ||
1046 | int ret, i, numpages = 1, new; | ||
1047 | unsigned int copied = 0; | ||
1048 | u32 tmp_pos; | ||
1049 | u64 v_blkno, p_blkno; | ||
1050 | struct address_space *mapping = file->f_mapping; | ||
1051 | struct inode *inode = mapping->host; | ||
1052 | unsigned long index, start; | ||
1053 | struct page **cpages; | ||
1054 | |||
1055 | new = phys == 0 ? 1 : 0; | ||
1056 | |||
1057 | /* | ||
1058 | * Figure out how many pages we'll be manipulating here. For | ||
1059 | * a non-allocating write, we just change the one | ||
1060 | * page. Otherwise, we'll need a whole cluster's worth. | ||
1061 | */ | ||
1062 | if (new) | ||
1063 | numpages = ocfs2_pages_per_cluster(inode->i_sb); | ||
1064 | |||
1065 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
1066 | if (!cpages) { | ||
1067 | ret = -ENOMEM; | ||
1068 | mlog_errno(ret); | ||
1069 | return ret; | ||
1070 | } | ||
1071 | |||
1072 | /* | ||
1073 | * Fill our page array first. That way we've grabbed enough so | ||
1074 | * that we can zero and flush if we error after adding the | ||
1075 | * extent. | ||
1076 | */ | ||
1077 | if (new) { | ||
1078 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | ||
1079 | wc->w_cpos); | ||
1080 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
1081 | } else { | ||
1082 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | ||
1083 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | ||
1084 | } | ||
1085 | |||
1086 | for(i = 0; i < numpages; i++) { | ||
1087 | index = start + i; | ||
1088 | |||
1089 | cpages[i] = grab_cache_page(mapping, index); | ||
1090 | if (!cpages[i]) { | ||
1091 | ret = -ENOMEM; | ||
1092 | mlog_errno(ret); | ||
1093 | goto out; | ||
1094 | } | ||
1095 | } | ||
1096 | |||
1097 | if (new) { | ||
1098 | /* | ||
1099 | * This is safe to call with the page locks - it won't take | ||
1100 | * any additional semaphores or cluster locks. | ||
1101 | */ | ||
1102 | tmp_pos = wc->w_cpos; | ||
1103 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | ||
1104 | &tmp_pos, 1, di_bh, handle, | ||
1105 | data_ac, meta_ac, NULL); | ||
1106 | /* | ||
1107 | * This shouldn't happen because we must have already | ||
1108 | * calculated the correct meta data allocation required. The | ||
1109 | * internal tree allocation code should know how to increase | ||
1110 | * transaction credits itself. | ||
1111 | * | ||
1112 | * If need be, we could handle -EAGAIN for a | ||
1113 | * RESTART_TRANS here. | ||
1114 | */ | ||
1115 | mlog_bug_on_msg(ret == -EAGAIN, | ||
1116 | "Inode %llu: EAGAIN return during allocation.\n", | ||
1117 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
1118 | if (ret < 0) { | ||
1119 | mlog_errno(ret); | ||
1120 | goto out; | ||
1121 | } | ||
1122 | } | ||
1123 | |||
1124 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | ||
1125 | NULL); | ||
1126 | if (ret < 0) { | ||
1127 | |||
1128 | /* | ||
1129 | * XXX: Should we go readonly here? | ||
1130 | */ | ||
1131 | |||
1132 | mlog_errno(ret); | ||
1133 | goto out; | ||
1134 | } | ||
1135 | |||
1136 | BUG_ON(p_blkno == 0); | ||
1137 | |||
1138 | for(i = 0; i < numpages; i++) { | ||
1139 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | ||
1140 | wc, new); | ||
1141 | if (ret < 0) { | ||
1142 | mlog_errno(ret); | ||
1143 | goto out; | ||
1144 | } | ||
1145 | |||
1146 | copied += ret; | ||
1147 | } | ||
1148 | |||
1149 | out: | ||
1150 | for(i = 0; i < numpages; i++) { | ||
1151 | unlock_page(cpages[i]); | ||
1152 | mark_page_accessed(cpages[i]); | ||
1153 | page_cache_release(cpages[i]); | ||
1154 | } | ||
1155 | kfree(cpages); | ||
1156 | |||
1157 | return copied ? copied : ret; | ||
1158 | } | ||
1159 | |||
1160 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | ||
1161 | struct ocfs2_super *osb, loff_t pos, | ||
1162 | size_t count, ocfs2_page_writer *cb, | ||
1163 | void *cb_priv) | ||
1164 | { | ||
1165 | wc->w_count = count; | ||
1166 | wc->w_pos = pos; | ||
1167 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
1168 | wc->w_finished_copy = 0; | ||
1169 | |||
1170 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | ||
1171 | wc->w_large_pages = 1; | ||
1172 | else | ||
1173 | wc->w_large_pages = 0; | ||
1174 | |||
1175 | wc->w_write_data_page = cb; | ||
1176 | wc->w_private = cb_priv; | ||
1177 | } | ||
1178 | |||
1179 | /* | ||
1180 | * Write a cluster to an inode. The cluster may not be allocated yet, | ||
1181 | * in which case it will be. This only exists for buffered writes - | ||
1182 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
1183 | * | ||
1184 | * The caller is responsible for incrementing pos, written counts, etc | ||
1185 | * | ||
1186 | * For file systems that don't support sparse files, pre-allocation | ||
1187 | * and page zeroing up until cpos should be done prior to this | ||
1188 | * function call. | ||
1189 | * | ||
1190 | * Callers should be holding i_sem, and the rw cluster lock. | ||
1191 | * | ||
1192 | * Returns the number of user bytes written, or less than zero for | ||
1193 | * error. | ||
1194 | */ | ||
1195 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
1196 | size_t count, ocfs2_page_writer *actor, | ||
1197 | void *priv) | ||
1198 | { | ||
1199 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | ||
1200 | ssize_t written = 0; | ||
1201 | u32 phys; | ||
1202 | struct inode *inode = file->f_mapping->host; | ||
1203 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1204 | struct buffer_head *di_bh = NULL; | ||
1205 | struct ocfs2_dinode *di; | ||
1206 | struct ocfs2_alloc_context *data_ac = NULL; | ||
1207 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
1208 | handle_t *handle; | ||
1209 | struct ocfs2_write_ctxt wc; | ||
1210 | |||
1211 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | ||
1212 | |||
1213 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
1214 | if (ret) { | ||
1215 | mlog_errno(ret); | ||
1216 | goto out; | ||
1217 | } | ||
1218 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1219 | |||
1220 | /* | ||
1221 | * Take alloc sem here to prevent concurrent lookups. That way | ||
1222 | * the mapping, zeroing and tree manipulation within | ||
1223 | * ocfs2_write() will be safe against ->readpage(). This | ||
1224 | * should also serve to lock out allocation from a shared | ||
1225 | * writeable region. | ||
1226 | */ | ||
1227 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1228 | |||
1229 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); | ||
1230 | if (ret) { | ||
1231 | mlog_errno(ret); | ||
1232 | goto out_meta; | ||
1233 | } | ||
1234 | |||
1235 | /* phys == 0 means that allocation is required. */ | ||
1236 | if (phys == 0) { | ||
1237 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | ||
1238 | if (ret) { | ||
1239 | mlog_errno(ret); | ||
1240 | goto out_meta; | ||
1241 | } | ||
1242 | |||
1243 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | ||
1244 | } | ||
1245 | |||
1246 | ret = ocfs2_data_lock(inode, 1); | ||
1247 | if (ret) { | ||
1248 | mlog_errno(ret); | ||
1249 | goto out_meta; | ||
1250 | } | ||
1251 | |||
1252 | handle = ocfs2_start_trans(osb, credits); | ||
1253 | if (IS_ERR(handle)) { | ||
1254 | ret = PTR_ERR(handle); | ||
1255 | mlog_errno(ret); | ||
1256 | goto out_data; | ||
1257 | } | ||
1258 | |||
1259 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | ||
1260 | meta_ac, &wc); | ||
1261 | if (written < 0) { | ||
1262 | ret = written; | ||
1263 | mlog_errno(ret); | ||
1264 | goto out_commit; | ||
1265 | } | ||
1266 | |||
1267 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
1268 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1269 | if (ret) { | ||
1270 | mlog_errno(ret); | ||
1271 | goto out_commit; | ||
1272 | } | ||
1273 | |||
1274 | pos += written; | ||
1275 | if (pos > inode->i_size) { | ||
1276 | i_size_write(inode, pos); | ||
1277 | mark_inode_dirty(inode); | ||
1278 | } | ||
1279 | inode->i_blocks = ocfs2_inode_sector_count(inode); | ||
1280 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
1281 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
1282 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
1283 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
1284 | |||
1285 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
1286 | if (ret) | ||
1287 | mlog_errno(ret); | ||
1288 | |||
1289 | out_commit: | ||
1290 | ocfs2_commit_trans(osb, handle); | ||
1291 | |||
1292 | out_data: | ||
1293 | ocfs2_data_unlock(inode, 1); | ||
1294 | |||
1295 | out_meta: | ||
1296 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1297 | ocfs2_meta_unlock(inode, 1); | ||
1298 | |||
1299 | out: | ||
1300 | brelse(di_bh); | ||
1301 | if (data_ac) | ||
1302 | ocfs2_free_alloc_context(data_ac); | ||
1303 | if (meta_ac) | ||
1304 | ocfs2_free_alloc_context(meta_ac); | ||
1305 | |||
1306 | return written ? written : ret; | ||
1307 | } | ||
1308 | |||
678 | const struct address_space_operations ocfs2_aops = { | 1309 | const struct address_space_operations ocfs2_aops = { |
679 | .readpage = ocfs2_readpage, | 1310 | .readpage = ocfs2_readpage, |
680 | .writepage = ocfs2_writepage, | 1311 | .writepage = ocfs2_writepage, |
681 | .prepare_write = ocfs2_prepare_write, | ||
682 | .commit_write = ocfs2_commit_write, | ||
683 | .bmap = ocfs2_bmap, | 1312 | .bmap = ocfs2_bmap, |
684 | .sync_page = block_sync_page, | 1313 | .sync_page = block_sync_page, |
685 | .direct_IO = ocfs2_direct_IO, | 1314 | .direct_IO = ocfs2_direct_IO, |
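The ocfs2_figure_cluster_boundaries() helper added in the aops.c hunk above only does real work when a page can span several clusters, i.e. when PAGE_CACHE_SHIFT > osb->s_clustersize_bits; otherwise it reports the whole page (0 .. PAGE_CACHE_SIZE). A minimal user-space sketch of that arithmetic follows; the 4 KiB page and 1 KiB cluster sizes are illustrative assumptions, not values taken from the patch.

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;        /* assumed: 4 KiB pages */
	unsigned int clustersize_bits = 10;  /* assumed: 1 KiB clusters */
	unsigned int clustersize = 1u << clustersize_bits;
	unsigned int cpp = 1u << (page_shift - clustersize_bits); /* clusters per page */
	unsigned int cpos;

	for (cpos = 0; cpos < 2 * cpp; cpos++) {
		/* Same math as ocfs2_figure_cluster_boundaries(): the byte
		 * range that logical cluster 'cpos' occupies within the
		 * page that holds it. */
		unsigned int cluster_start = (cpos % cpp) << clustersize_bits;
		unsigned int cluster_end = cluster_start + clustersize;

		printf("cluster %u -> bytes [%u, %u) of its page\n",
		       cpos, cluster_start, cluster_end);
	}
	return 0;
}

ocfs2_clear_page_regions() and the write paths above use these boundaries so that zeroing never strays outside the cluster currently being written.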
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f446a15eab88..45821d479b5a 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | |||
30 | unsigned from, | 30 | unsigned from, |
31 | unsigned to); | 31 | unsigned to); |
32 | 32 | ||
33 | int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | ||
34 | struct inode *inode, unsigned int from, | ||
35 | unsigned int to, int new); | ||
36 | |||
37 | int walk_page_buffers( handle_t *handle, | ||
38 | struct buffer_head *head, | ||
39 | unsigned from, | ||
40 | unsigned to, | ||
41 | int *partial, | ||
42 | int (*fn)( handle_t *handle, | ||
43 | struct buffer_head *bh)); | ||
44 | |||
45 | struct ocfs2_write_ctxt; | ||
46 | typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, | ||
47 | u64 *, unsigned int *, unsigned int *); | ||
48 | |||
49 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
50 | size_t count, ocfs2_page_writer *actor, | ||
51 | void *priv); | ||
52 | |||
53 | struct ocfs2_write_ctxt { | ||
54 | size_t w_count; | ||
55 | loff_t w_pos; | ||
56 | u32 w_cpos; | ||
57 | unsigned int w_finished_copy; | ||
58 | |||
59 | /* This is true if page_size > cluster_size */ | ||
60 | unsigned int w_large_pages; | ||
61 | |||
62 | /* Filler callback and private data */ | ||
63 | ocfs2_page_writer *w_write_data_page; | ||
64 | void *w_private; | ||
65 | |||
66 | /* Only valid for the filler callback */ | ||
67 | struct page *w_this_page; | ||
68 | unsigned int w_this_page_new; | ||
69 | }; | ||
70 | |||
71 | struct ocfs2_buffered_write_priv { | ||
72 | char *b_src_buf; | ||
73 | const struct iovec *b_cur_iov; /* Current iovec */ | ||
74 | size_t b_cur_off; /* Offset in the | ||
75 | * current iovec */ | ||
76 | }; | ||
77 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
78 | struct ocfs2_write_ctxt *wc, | ||
79 | u64 *p_blkno, | ||
80 | unsigned int *ret_from, | ||
81 | unsigned int *ret_to); | ||
82 | |||
83 | struct ocfs2_splice_write_priv { | ||
84 | struct splice_desc *s_sd; | ||
85 | struct pipe_buffer *s_buf; | ||
86 | struct pipe_inode_info *s_pipe; | ||
87 | /* Neither offset value is ever larger than one page */ | ||
88 | unsigned int s_offset; | ||
89 | unsigned int s_buf_offset; | ||
90 | }; | ||
91 | int ocfs2_map_and_write_splice_data(struct inode *inode, | ||
92 | struct ocfs2_write_ctxt *wc, | ||
93 | u64 *p_blkno, | ||
94 | unsigned int *ret_from, | ||
95 | unsigned int *ret_to); | ||
96 | |||
33 | /* all ocfs2_dio_end_io()'s fault */ | 97 | /* all ocfs2_dio_end_io()'s fault */ |
34 | #define ocfs2_iocb_is_rw_locked(iocb) \ | 98 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
35 | test_bit(0, (unsigned long *)&iocb->private) | 99 | test_bit(0, (unsigned long *)&iocb->private) |
36 | #define ocfs2_iocb_set_rw_locked(iocb) \ | 100 | static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) |
37 | set_bit(0, (unsigned long *)&iocb->private) | 101 | { |
102 | set_bit(0, (unsigned long *)&iocb->private); | ||
103 | if (level) | ||
104 | set_bit(1, (unsigned long *)&iocb->private); | ||
105 | else | ||
106 | clear_bit(1, (unsigned long *)&iocb->private); | ||
107 | } | ||
38 | #define ocfs2_iocb_clear_rw_locked(iocb) \ | 108 | #define ocfs2_iocb_clear_rw_locked(iocb) \ |
39 | clear_bit(0, (unsigned long *)&iocb->private) | 109 | clear_bit(0, (unsigned long *)&iocb->private) |
40 | 110 | #define ocfs2_iocb_rw_locked_level(iocb) \ | |
111 | test_bit(1, (unsigned long *)&iocb->private) | ||
41 | #endif /* OCFS2_FILE_H */ | 112 | #endif /* OCFS2_FILE_H */ |
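The aops.h hunk above packs two flags into iocb->private: bit 0 records that the rw cluster lock is held, and bit 1 records the level it was taken at, which ocfs2_dio_end_io() reads back to decide how to unlock and whether i_alloc_sem still needs an up_read(). A stand-alone sketch of that two-bit encoding, using a plain unsigned long and invented names rather than a struct kiocb:

#include <stdio.h>

#define RW_LOCKED_BIT 0UL   /* mirrors bit 0 in iocb->private */
#define RW_LEVEL_BIT  1UL   /* mirrors bit 1 in iocb->private */

static void set_rw_locked(unsigned long *priv, int level)
{
	*priv |= 1UL << RW_LOCKED_BIT;
	if (level)
		*priv |= 1UL << RW_LEVEL_BIT;
	else
		*priv &= ~(1UL << RW_LEVEL_BIT);
}

static int is_rw_locked(unsigned long priv)
{
	return (priv >> RW_LOCKED_BIT) & 1UL;
}

static int rw_locked_level(unsigned long priv)
{
	return (priv >> RW_LEVEL_BIT) & 1UL;
}

int main(void)
{
	unsigned long priv = 0;

	set_rw_locked(&priv, 1);   /* rw lock taken at the higher (write) level */
	printf("locked=%d level=%d\n", is_rw_locked(priv), rw_locked_level(priv));

	set_rw_locked(&priv, 0);   /* rw lock taken at the read level */
	printf("locked=%d level=%d\n", is_rw_locked(priv), rw_locked_level(priv));
	return 0;
}

In the patch itself the kernel uses set_bit()/clear_bit()/test_bit() on iocb->private; the sketch only illustrates the bit layout that ocfs2_iocb_rw_locked_level() reads back.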
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 4705d659fe57..bbacf7da48a4 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/kernel.h> | 46 | #include <linux/kernel.h> |
47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
48 | #include <linux/workqueue.h> | 48 | #include <linux/workqueue.h> |
49 | #include <linux/reboot.h> | ||
49 | 50 | ||
50 | #include "heartbeat.h" | 51 | #include "heartbeat.h" |
51 | #include "nodemanager.h" | 52 | #include "nodemanager.h" |
@@ -72,7 +73,9 @@ static void o2quo_fence_self(void) | |||
72 | /* panic spins with interrupts enabled. with preempt | 73 | /* panic spins with interrupts enabled. with preempt |
73 | * threads can still schedule, etc, etc */ | 74 | * threads can still schedule, etc, etc */ |
74 | o2hb_stop_all_regions(); | 75 | o2hb_stop_all_regions(); |
75 | panic("ocfs2 is very sorry to be fencing this system by panicing\n"); | 76 | |
77 | printk("ocfs2 is very sorry to be fencing this system by restarting\n"); | ||
78 | emergency_restart(); | ||
76 | } | 79 | } |
77 | 80 | ||
78 | /* Indicate that a timeout occured on a hearbeat region write. The | 81 | /* Indicate that a timeout occured on a hearbeat region write. The |
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4dae5df5e467..9606111fe89d 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h | |||
@@ -38,6 +38,9 @@ | |||
38 | * locking semantics of the file system using the protocol. It should | 38 | * locking semantics of the file system using the protocol. It should |
39 | * be somewhere else, I'm sure, but right now it isn't. | 39 | * be somewhere else, I'm sure, but right now it isn't. |
40 | * | 40 | * |
41 | * New in version 8: | ||
42 | * - Replace delete inode votes with a cluster lock | ||
43 | * | ||
41 | * New in version 7: | 44 | * New in version 7: |
42 | * - DLM join domain includes the live nodemap | 45 | * - DLM join domain includes the live nodemap |
43 | * | 46 | * |
@@ -57,7 +60,7 @@ | |||
57 | * - full 64 bit i_size in the metadata lock lvbs | 60 | * - full 64 bit i_size in the metadata lock lvbs |
58 | * - introduction of "rw" lock and pushing meta/data locking down | 61 | * - introduction of "rw" lock and pushing meta/data locking down |
59 | */ | 62 | */ |
60 | #define O2NET_PROTOCOL_VERSION 7ULL | 63 | #define O2NET_PROTOCOL_VERSION 8ULL |
61 | struct o2net_handshake { | 64 | struct o2net_handshake { |
62 | __be64 protocol_version; | 65 | __be64 protocol_version; |
63 | __be64 connector_id; | 66 | __be64 connector_id; |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 66821e178167..67e6866a2a4f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb, | |||
358 | { | 358 | { |
359 | int status; | 359 | int status; |
360 | int extend; | 360 | int extend; |
361 | u64 p_blkno; | 361 | u64 p_blkno, v_blkno; |
362 | 362 | ||
363 | spin_lock(&OCFS2_I(dir)->ip_lock); | 363 | spin_lock(&OCFS2_I(dir)->ip_lock); |
364 | extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); | 364 | extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); |
365 | spin_unlock(&OCFS2_I(dir)->ip_lock); | 365 | spin_unlock(&OCFS2_I(dir)->ip_lock); |
366 | 366 | ||
367 | if (extend) { | 367 | if (extend) { |
368 | status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, | 368 | u32 offset = OCFS2_I(dir)->ip_clusters; |
369 | parent_fe_bh, handle, | 369 | |
370 | status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, | ||
371 | 1, parent_fe_bh, handle, | ||
370 | data_ac, meta_ac, NULL); | 372 | data_ac, meta_ac, NULL); |
371 | BUG_ON(status == -EAGAIN); | 373 | BUG_ON(status == -EAGAIN); |
372 | if (status < 0) { | 374 | if (status < 0) { |
@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb, | |||
375 | } | 377 | } |
376 | } | 378 | } |
377 | 379 | ||
378 | status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> | 380 | v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); |
379 | (sb->s_blocksize_bits - 9)), | 381 | status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); |
380 | 1, &p_blkno, NULL); | ||
381 | if (status < 0) { | 382 | if (status < 0) { |
382 | mlog_errno(status); | 383 | mlog_errno(status); |
383 | goto bail; | 384 | goto bail; |
@@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, | |||
486 | 487 | ||
487 | dir_i_size += dir->i_sb->s_blocksize; | 488 | dir_i_size += dir->i_sb->s_blocksize; |
488 | i_size_write(dir, dir_i_size); | 489 | i_size_write(dir, dir_i_size); |
489 | dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); | 490 | dir->i_blocks = ocfs2_inode_sector_count(dir); |
490 | status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); | 491 | status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); |
491 | if (status < 0) { | 492 | if (status < 0) { |
492 | mlog_errno(status); | 493 | mlog_errno(status); |
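In the dir.c hunk above, the logical block handed to ocfs2_extent_map_get_blocks() is now derived from i_size via ocfs2_blocks_for_bytes() rather than from i_blocks (a 512-byte sector count). The helper's body is not shown in this patch; assuming the usual round-up-to-blocksize conversion, its behaviour looks like this:

#include <stdio.h>

/* Assumed semantics of ocfs2_blocks_for_bytes(): round a byte count
 * up to whole filesystem blocks. Not taken verbatim from the patch. */
static unsigned long long blocks_for_bytes(unsigned long long bytes,
					   unsigned int blocksize_bits)
{
	unsigned long long blocksize = 1ULL << blocksize_bits;

	return (bytes + blocksize - 1) >> blocksize_bits;
}

int main(void)
{
	unsigned int bits = 12;  /* assumed 4 KiB directory blocks */

	printf("%llu\n", blocks_for_bytes(0, bits));      /* 0 */
	printf("%llu\n", blocks_for_bytes(1, bits));      /* 1 */
	printf("%llu\n", blocks_for_bytes(4096, bits));   /* 1 */
	printf("%llu\n", blocks_for_bytes(4097, bits));   /* 2 */
	return 0;
}

Since the directory's i_size grows in whole blocksize steps (see the i_size_write() in the second hunk), the resulting v_blkno is the index of the first block past the current data, i.e. the block the directory is being extended into.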
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c558442a0b44..d836b98dd99a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -430,11 +430,10 @@ redo_bucket: | |||
430 | 430 | ||
431 | dlm_lockres_put(res); | 431 | dlm_lockres_put(res); |
432 | 432 | ||
433 | cond_resched_lock(&dlm->spinlock); | ||
434 | |||
435 | if (dropped) | 433 | if (dropped) |
436 | goto redo_bucket; | 434 | goto redo_bucket; |
437 | } | 435 | } |
436 | cond_resched_lock(&dlm->spinlock); | ||
438 | num += n; | 437 | num += n; |
439 | mlog(0, "%s: touched %d lockreses in bucket %d " | 438 | mlog(0, "%s: touched %d lockreses in bucket %d " |
440 | "(tot=%d)\n", dlm->name, n, i, num); | 439 | "(tot=%d)\n", dlm->name, n, i, num); |
@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) | |||
1035 | { | 1034 | { |
1036 | int status = 0, tmpstat, node; | 1035 | int status = 0, tmpstat, node; |
1037 | struct domain_join_ctxt *ctxt; | 1036 | struct domain_join_ctxt *ctxt; |
1038 | enum dlm_query_join_response response; | 1037 | enum dlm_query_join_response response = JOIN_DISALLOW; |
1039 | 1038 | ||
1040 | mlog_entry("%p", dlm); | 1039 | mlog_entry("%p", dlm); |
1041 | 1040 | ||
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 6d4a83d50152..c1807a42c49f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
611 | } | 611 | } |
612 | } while (status != 0); | 612 | } while (status != 0); |
613 | 613 | ||
614 | spin_lock(&dlm_reco_state_lock); | ||
614 | switch (ndata->state) { | 615 | switch (ndata->state) { |
615 | case DLM_RECO_NODE_DATA_INIT: | 616 | case DLM_RECO_NODE_DATA_INIT: |
616 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | 617 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: |
@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
641 | ndata->node_num, dead_node); | 642 | ndata->node_num, dead_node); |
642 | break; | 643 | break; |
643 | } | 644 | } |
645 | spin_unlock(&dlm_reco_state_lock); | ||
644 | } | 646 | } |
645 | 647 | ||
646 | mlog(0, "done requesting all lock info\n"); | 648 | mlog(0, "done requesting all lock info\n"); |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e335541727f9..27e43b0c0eae 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { | |||
225 | .flags = 0, | 225 | .flags = 0, |
226 | }; | 226 | }; |
227 | 227 | ||
228 | static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { | ||
229 | .get_osb = ocfs2_get_inode_osb, | ||
230 | .flags = 0, | ||
231 | }; | ||
232 | |||
228 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) | 233 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) |
229 | { | 234 | { |
230 | return lockres->l_type == OCFS2_LOCK_TYPE_META || | 235 | return lockres->l_type == OCFS2_LOCK_TYPE_META || |
231 | lockres->l_type == OCFS2_LOCK_TYPE_DATA || | 236 | lockres->l_type == OCFS2_LOCK_TYPE_DATA || |
232 | lockres->l_type == OCFS2_LOCK_TYPE_RW; | 237 | lockres->l_type == OCFS2_LOCK_TYPE_RW || |
238 | lockres->l_type == OCFS2_LOCK_TYPE_OPEN; | ||
233 | } | 239 | } |
234 | 240 | ||
235 | static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) | 241 | static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) |
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | |||
373 | case OCFS2_LOCK_TYPE_DATA: | 379 | case OCFS2_LOCK_TYPE_DATA: |
374 | ops = &ocfs2_inode_data_lops; | 380 | ops = &ocfs2_inode_data_lops; |
375 | break; | 381 | break; |
382 | case OCFS2_LOCK_TYPE_OPEN: | ||
383 | ops = &ocfs2_inode_open_lops; | ||
384 | break; | ||
376 | default: | 385 | default: |
377 | mlog_bug_on_msg(1, "type: %d\n", type); | 386 | mlog_bug_on_msg(1, "type: %d\n", type); |
378 | ops = NULL; /* thanks, gcc */ | 387 | ops = NULL; /* thanks, gcc */ |
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode) | |||
1129 | goto bail; | 1138 | goto bail; |
1130 | } | 1139 | } |
1131 | 1140 | ||
1141 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); | ||
1142 | if (ret) { | ||
1143 | mlog_errno(ret); | ||
1144 | goto bail; | ||
1145 | } | ||
1146 | |||
1132 | bail: | 1147 | bail: |
1133 | mlog_exit(ret); | 1148 | mlog_exit(ret); |
1134 | return ret; | 1149 | return ret; |
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write) | |||
1182 | mlog_exit_void(); | 1197 | mlog_exit_void(); |
1183 | } | 1198 | } |
1184 | 1199 | ||
1200 | /* | ||
1201 | * ocfs2_open_lock always gets a PR mode lock. | ||
1202 | */ | ||
1203 | int ocfs2_open_lock(struct inode *inode) | ||
1204 | { | ||
1205 | int status = 0; | ||
1206 | struct ocfs2_lock_res *lockres; | ||
1207 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1208 | |||
1209 | BUG_ON(!inode); | ||
1210 | |||
1211 | mlog_entry_void(); | ||
1212 | |||
1213 | mlog(0, "inode %llu take PRMODE open lock\n", | ||
1214 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
1215 | |||
1216 | if (ocfs2_mount_local(osb)) | ||
1217 | goto out; | ||
1218 | |||
1219 | lockres = &OCFS2_I(inode)->ip_open_lockres; | ||
1220 | |||
1221 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, | ||
1222 | LKM_PRMODE, 0, 0); | ||
1223 | if (status < 0) | ||
1224 | mlog_errno(status); | ||
1225 | |||
1226 | out: | ||
1227 | mlog_exit(status); | ||
1228 | return status; | ||
1229 | } | ||
1230 | |||
1231 | int ocfs2_try_open_lock(struct inode *inode, int write) | ||
1232 | { | ||
1233 | int status = 0, level; | ||
1234 | struct ocfs2_lock_res *lockres; | ||
1235 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1236 | |||
1237 | BUG_ON(!inode); | ||
1238 | |||
1239 | mlog_entry_void(); | ||
1240 | |||
1241 | mlog(0, "inode %llu try to take %s open lock\n", | ||
1242 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
1243 | write ? "EXMODE" : "PRMODE"); | ||
1244 | |||
1245 | if (ocfs2_mount_local(osb)) | ||
1246 | goto out; | ||
1247 | |||
1248 | lockres = &OCFS2_I(inode)->ip_open_lockres; | ||
1249 | |||
1250 | level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1251 | |||
1252 | /* | ||
1253 | * The file system may already be holding a PRMODE/EXMODE open lock. | ||
1254 | * Since we pass LKM_NOQUEUE, the request won't block waiting on | ||
1255 | * other nodes and the -EAGAIN will indicate to the caller that | ||
1256 | * this inode is still in use. | ||
1257 | */ | ||
1258 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, | ||
1259 | level, LKM_NOQUEUE, 0); | ||
1260 | |||
1261 | out: | ||
1262 | mlog_exit(status); | ||
1263 | return status; | ||
1264 | } | ||
1265 | |||
1266 | /* | ||
1267 | * ocfs2_open_unlock unlocks PR and EX mode open locks. | ||
1268 | */ | ||
1269 | void ocfs2_open_unlock(struct inode *inode) | ||
1270 | { | ||
1271 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; | ||
1272 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1273 | |||
1274 | mlog_entry_void(); | ||
1275 | |||
1276 | mlog(0, "inode %llu drop open lock\n", | ||
1277 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
1278 | |||
1279 | if (ocfs2_mount_local(osb)) | ||
1280 | goto out; | ||
1281 | |||
1282 | if(lockres->l_ro_holders) | ||
1283 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, | ||
1284 | LKM_PRMODE); | ||
1285 | if(lockres->l_ex_holders) | ||
1286 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, | ||
1287 | LKM_EXMODE); | ||
1288 | |||
1289 | out: | ||
1290 | mlog_exit_void(); | ||
1291 | } | ||
1292 | |||
1185 | int ocfs2_data_lock_full(struct inode *inode, | 1293 | int ocfs2_data_lock_full(struct inode *inode, |
1186 | int write, | 1294 | int write, |
1187 | int arg_flags) | 1295 | int arg_flags) |
@@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | |||
1387 | if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) | 1495 | if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) |
1388 | inode->i_blocks = 0; | 1496 | inode->i_blocks = 0; |
1389 | else | 1497 | else |
1390 | inode->i_blocks = | 1498 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
1391 | ocfs2_align_bytes_to_sectors(i_size_read(inode)); | ||
1392 | 1499 | ||
1393 | inode->i_uid = be32_to_cpu(lvb->lvb_iuid); | 1500 | inode->i_uid = be32_to_cpu(lvb->lvb_iuid); |
1394 | inode->i_gid = be32_to_cpu(lvb->lvb_igid); | 1501 | inode->i_gid = be32_to_cpu(lvb->lvb_igid); |
@@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode, | |||
1479 | { | 1586 | { |
1480 | int status = 0; | 1587 | int status = 0; |
1481 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 1588 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
1482 | struct ocfs2_lock_res *lockres = NULL; | 1589 | struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; |
1483 | struct ocfs2_dinode *fe; | 1590 | struct ocfs2_dinode *fe; |
1484 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1591 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1485 | 1592 | ||
1486 | mlog_entry_void(); | 1593 | mlog_entry_void(); |
1487 | 1594 | ||
1595 | if (ocfs2_mount_local(osb)) | ||
1596 | goto bail; | ||
1597 | |||
1488 | spin_lock(&oi->ip_lock); | 1598 | spin_lock(&oi->ip_lock); |
1489 | if (oi->ip_flags & OCFS2_INODE_DELETED) { | 1599 | if (oi->ip_flags & OCFS2_INODE_DELETED) { |
1490 | mlog(0, "Orphaned inode %llu was deleted while we " | 1600 | mlog(0, "Orphaned inode %llu was deleted while we " |
@@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode, | |||
1496 | } | 1606 | } |
1497 | spin_unlock(&oi->ip_lock); | 1607 | spin_unlock(&oi->ip_lock); |
1498 | 1608 | ||
1499 | if (!ocfs2_mount_local(osb)) { | 1609 | if (!ocfs2_should_refresh_lock_res(lockres)) |
1500 | lockres = &oi->ip_meta_lockres; | 1610 | goto bail; |
1501 | |||
1502 | if (!ocfs2_should_refresh_lock_res(lockres)) | ||
1503 | goto bail; | ||
1504 | } | ||
1505 | 1611 | ||
1506 | /* This will discard any caching information we might have had | 1612 | /* This will discard any caching information we might have had |
1507 | * for the inode metadata. */ | 1613 | * for the inode metadata. */ |
1508 | ocfs2_metadata_cache_purge(inode); | 1614 | ocfs2_metadata_cache_purge(inode); |
1509 | 1615 | ||
1510 | /* will do nothing for inode types that don't use the extent | ||
1511 | * map (directories, bitmap files, etc) */ | ||
1512 | ocfs2_extent_map_trunc(inode, 0); | 1616 | ocfs2_extent_map_trunc(inode, 0); |
1513 | 1617 | ||
1514 | if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { | 1618 | if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { |
1515 | mlog(0, "Trusting LVB on inode %llu\n", | 1619 | mlog(0, "Trusting LVB on inode %llu\n", |
1516 | (unsigned long long)oi->ip_blkno); | 1620 | (unsigned long long)oi->ip_blkno); |
1517 | ocfs2_refresh_inode_from_lvb(inode); | 1621 | ocfs2_refresh_inode_from_lvb(inode); |
@@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode, | |||
1558 | 1662 | ||
1559 | status = 0; | 1663 | status = 0; |
1560 | bail_refresh: | 1664 | bail_refresh: |
1561 | if (lockres) | 1665 | ocfs2_complete_lock_res_refresh(lockres, status); |
1562 | ocfs2_complete_lock_res_refresh(lockres, status); | ||
1563 | bail: | 1666 | bail: |
1564 | mlog_exit(status); | 1667 | mlog_exit(status); |
1565 | return status; | 1668 | return status; |
@@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode, | |||
1630 | wait_event(osb->recovery_event, | 1733 | wait_event(osb->recovery_event, |
1631 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | 1734 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); |
1632 | 1735 | ||
1633 | acquired = 0; | ||
1634 | lockres = &OCFS2_I(inode)->ip_meta_lockres; | 1736 | lockres = &OCFS2_I(inode)->ip_meta_lockres; |
1635 | level = ex ? LKM_EXMODE : LKM_PRMODE; | 1737 | level = ex ? LKM_EXMODE : LKM_PRMODE; |
1636 | dlm_flags = 0; | 1738 | dlm_flags = 0; |
@@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode) | |||
2458 | * ocfs2_clear_inode has done it for us. */ | 2560 | * ocfs2_clear_inode has done it for us. */ |
2459 | 2561 | ||
2460 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | 2562 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), |
2461 | &OCFS2_I(inode)->ip_data_lockres); | 2563 | &OCFS2_I(inode)->ip_open_lockres); |
2462 | if (err < 0) | 2564 | if (err < 0) |
2463 | mlog_errno(err); | 2565 | mlog_errno(err); |
2464 | 2566 | ||
2465 | status = err; | 2567 | status = err; |
2466 | 2568 | ||
2467 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | 2569 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), |
2570 | &OCFS2_I(inode)->ip_data_lockres); | ||
2571 | if (err < 0) | ||
2572 | mlog_errno(err); | ||
2573 | if (err < 0 && !status) | ||
2574 | status = err; | ||
2575 | |||
2576 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | ||
2468 | &OCFS2_I(inode)->ip_meta_lockres); | 2577 | &OCFS2_I(inode)->ip_meta_lockres); |
2469 | if (err < 0) | 2578 | if (err < 0) |
2470 | mlog_errno(err); | 2579 | mlog_errno(err); |
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index c343fca68cf1..59cb566e7983 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h | |||
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode, | |||
80 | int write); | 80 | int write); |
81 | int ocfs2_rw_lock(struct inode *inode, int write); | 81 | int ocfs2_rw_lock(struct inode *inode, int write); |
82 | void ocfs2_rw_unlock(struct inode *inode, int write); | 82 | void ocfs2_rw_unlock(struct inode *inode, int write); |
83 | int ocfs2_open_lock(struct inode *inode); | ||
84 | int ocfs2_try_open_lock(struct inode *inode, int write); | ||
85 | void ocfs2_open_unlock(struct inode *inode); | ||
83 | int ocfs2_meta_lock_atime(struct inode *inode, | 86 | int ocfs2_meta_lock_atime(struct inode *inode, |
84 | struct vfsmount *vfsmnt, | 87 | struct vfsmount *vfsmnt, |
85 | int *level); | 88 | int *level); |
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 80ac69f11d9f..ba2b2ab1c6e4 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -3,8 +3,7 @@ | |||
3 | * | 3 | * |
4 | * extent_map.c | 4 | * extent_map.c |
5 | * | 5 | * |
6 | * In-memory extent map for OCFS2. Man, this code was prettier in | 6 | * Block/Cluster mapping functions |
7 | * the library. | ||
8 | * | 7 | * |
9 | * Copyright (C) 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2004 Oracle. All rights reserved. |
10 | * | 9 | * |
@@ -26,1016 +25,528 @@ | |||
26 | #include <linux/fs.h> | 25 | #include <linux/fs.h> |
27 | #include <linux/init.h> | 26 | #include <linux/init.h> |
28 | #include <linux/types.h> | 27 | #include <linux/types.h> |
29 | #include <linux/slab.h> | ||
30 | #include <linux/rbtree.h> | ||
31 | 28 | ||
32 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP | 29 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP |
33 | #include <cluster/masklog.h> | 30 | #include <cluster/masklog.h> |
34 | 31 | ||
35 | #include "ocfs2.h" | 32 | #include "ocfs2.h" |
36 | 33 | ||
34 | #include "alloc.h" | ||
37 | #include "extent_map.h" | 35 | #include "extent_map.h" |
38 | #include "inode.h" | 36 | #include "inode.h" |
39 | #include "super.h" | 37 | #include "super.h" |
40 | 38 | ||
41 | #include "buffer_head_io.h" | 39 | #include "buffer_head_io.h" |
42 | 40 | ||
43 | |||
44 | /* | 41 | /* |
45 | * SUCK SUCK SUCK | 42 | * The extent caching implementation is intentionally trivial. |
46 | * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h | ||
47 | */ | ||
48 | |||
49 | struct ocfs2_extent_map_entry { | ||
50 | struct rb_node e_node; | ||
51 | int e_tree_depth; | ||
52 | struct ocfs2_extent_rec e_rec; | ||
53 | }; | ||
54 | |||
55 | struct ocfs2_em_insert_context { | ||
56 | int need_left; | ||
57 | int need_right; | ||
58 | struct ocfs2_extent_map_entry *new_ent; | ||
59 | struct ocfs2_extent_map_entry *old_ent; | ||
60 | struct ocfs2_extent_map_entry *left_ent; | ||
61 | struct ocfs2_extent_map_entry *right_ent; | ||
62 | }; | ||
63 | |||
64 | static struct kmem_cache *ocfs2_em_ent_cachep = NULL; | ||
65 | |||
66 | |||
67 | static struct ocfs2_extent_map_entry * | ||
68 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
69 | u32 cpos, u32 clusters, | ||
70 | struct rb_node ***ret_p, | ||
71 | struct rb_node **ret_parent); | ||
72 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
73 | struct ocfs2_extent_rec *rec, | ||
74 | int tree_depth); | ||
75 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
76 | struct ocfs2_extent_map_entry *ent); | ||
77 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
78 | u32 cpos, u32 clusters, | ||
79 | struct ocfs2_extent_list *el); | ||
80 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
81 | u32 cpos, u32 clusters, | ||
82 | struct ocfs2_extent_map_entry **ret_ent); | ||
83 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
84 | struct ocfs2_extent_rec *rec, | ||
85 | int tree_depth, | ||
86 | struct ocfs2_em_insert_context *ctxt); | ||
87 | |||
88 | /* returns 1 only if the rec contains all the given clusters -- that is that | ||
89 | * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + | ||
90 | * clusters) is >= the argument's endpoint */ | ||
91 | static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, | ||
92 | u32 cpos, u32 clusters) | ||
93 | { | ||
94 | if (le32_to_cpu(rec->e_cpos) > cpos) | ||
95 | return 0; | ||
96 | if (cpos + clusters > le32_to_cpu(rec->e_cpos) + | ||
97 | le32_to_cpu(rec->e_clusters)) | ||
98 | return 0; | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | * Find an entry in the tree that intersects the region passed in. | ||
105 | * Note that this will find straddled intervals, it is up to the | ||
106 | * callers to enforce any boundary conditions. | ||
107 | * | ||
108 | * Callers must hold ip_lock. This lookup is not guaranteed to return | ||
109 | * a tree_depth 0 match, and as such can race inserts if the lock | ||
110 | * were not held. | ||
111 | * | 43 | * |
112 | * The rb_node garbage lets insertion share the search. Trivial | 44 | * We only cache a small number of extents stored directly on the |
113 | * callers pass NULL. | 45 | * inode, so linear order operations are acceptable. If we ever want |
46 | * to increase the size of the extent map, then these algorithms must | ||
47 | * get smarter. | ||
114 | */ | 48 | */ |
115 | static struct ocfs2_extent_map_entry * | 49 | |
116 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | 50 | void ocfs2_extent_map_init(struct inode *inode) |
117 | u32 cpos, u32 clusters, | ||
118 | struct rb_node ***ret_p, | ||
119 | struct rb_node **ret_parent) | ||
120 | { | 51 | { |
121 | struct rb_node **p = &em->em_extents.rb_node; | 52 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
122 | struct rb_node *parent = NULL; | ||
123 | struct ocfs2_extent_map_entry *ent = NULL; | ||
124 | |||
125 | while (*p) | ||
126 | { | ||
127 | parent = *p; | ||
128 | ent = rb_entry(parent, struct ocfs2_extent_map_entry, | ||
129 | e_node); | ||
130 | if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { | ||
131 | p = &(*p)->rb_left; | ||
132 | ent = NULL; | ||
133 | } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + | ||
134 | le32_to_cpu(ent->e_rec.e_clusters))) { | ||
135 | p = &(*p)->rb_right; | ||
136 | ent = NULL; | ||
137 | } else | ||
138 | break; | ||
139 | } | ||
140 | 53 | ||
141 | if (ret_p != NULL) | 54 | oi->ip_extent_map.em_num_items = 0; |
142 | *ret_p = p; | 55 | INIT_LIST_HEAD(&oi->ip_extent_map.em_list); |
143 | if (ret_parent != NULL) | ||
144 | *ret_parent = parent; | ||
145 | return ent; | ||
146 | } | 56 | } |
147 | 57 | ||
148 | /* | 58 | static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, |
149 | * Find the leaf containing the interval we want. While we're on our | 59 | unsigned int cpos, |
150 | * way down the tree, fill in every record we see at any depth, because | 60 | struct ocfs2_extent_map_item **ret_emi) |
151 | * we might want it later. | ||
152 | * | ||
153 | * Note that this code is run without ip_lock. That's because it | ||
154 | * sleeps while reading. If someone is also filling the extent list at | ||
155 | * the same time we are, we might have to restart. | ||
156 | */ | ||
157 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
158 | u32 cpos, u32 clusters, | ||
159 | struct ocfs2_extent_list *el) | ||
160 | { | 61 | { |
161 | int i, ret; | 62 | unsigned int range; |
162 | struct buffer_head *eb_bh = NULL; | 63 | struct ocfs2_extent_map_item *emi; |
163 | u64 blkno; | ||
164 | u32 rec_end; | ||
165 | struct ocfs2_extent_block *eb; | ||
166 | struct ocfs2_extent_rec *rec; | ||
167 | |||
168 | /* | ||
169 | * The bh data containing the el cannot change here, because | ||
170 | * we hold alloc_sem. So we can do this without other | ||
171 | * locks. | ||
172 | */ | ||
173 | while (el->l_tree_depth) | ||
174 | { | ||
175 | blkno = 0; | ||
176 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
177 | rec = &el->l_recs[i]; | ||
178 | rec_end = (le32_to_cpu(rec->e_cpos) + | ||
179 | le32_to_cpu(rec->e_clusters)); | ||
180 | |||
181 | ret = -EBADR; | ||
182 | if (rec_end > OCFS2_I(inode)->ip_clusters) { | ||
183 | mlog_errno(ret); | ||
184 | ocfs2_error(inode->i_sb, | ||
185 | "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", | ||
186 | i, | ||
187 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
188 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
189 | OCFS2_I(inode)->ip_clusters); | ||
190 | goto out_free; | ||
191 | } | ||
192 | |||
193 | if (rec_end <= cpos) { | ||
194 | ret = ocfs2_extent_map_insert(inode, rec, | ||
195 | le16_to_cpu(el->l_tree_depth)); | ||
196 | if (ret && (ret != -EEXIST)) { | ||
197 | mlog_errno(ret); | ||
198 | goto out_free; | ||
199 | } | ||
200 | continue; | ||
201 | } | ||
202 | if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { | ||
203 | ret = ocfs2_extent_map_insert(inode, rec, | ||
204 | le16_to_cpu(el->l_tree_depth)); | ||
205 | if (ret && (ret != -EEXIST)) { | ||
206 | mlog_errno(ret); | ||
207 | goto out_free; | ||
208 | } | ||
209 | continue; | ||
210 | } | ||
211 | 64 | ||
212 | /* | 65 | *ret_emi = NULL; |
213 | * We've found a record that matches our | ||
214 | * interval. We don't insert it because we're | ||
215 | * about to traverse it. | ||
216 | */ | ||
217 | |||
218 | /* Check to see if we're stradling */ | ||
219 | ret = -ESRCH; | ||
220 | if (!ocfs2_extent_rec_contains_clusters(rec, | ||
221 | cpos, | ||
222 | clusters)) { | ||
223 | mlog_errno(ret); | ||
224 | goto out_free; | ||
225 | } | ||
226 | 66 | ||
227 | /* | 67 | list_for_each_entry(emi, &em->em_list, ei_list) { |
228 | * If we've already found a record, the el has | 68 | range = emi->ei_cpos + emi->ei_clusters; |
229 | * two records covering the same interval. | ||
230 | * EEEK! | ||
231 | */ | ||
232 | ret = -EBADR; | ||
233 | if (blkno) { | ||
234 | mlog_errno(ret); | ||
235 | ocfs2_error(inode->i_sb, | ||
236 | "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n", | ||
237 | cpos, clusters, | ||
238 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
239 | (unsigned long long)blkno, i, | ||
240 | (unsigned long long)le64_to_cpu(rec->e_blkno)); | ||
241 | goto out_free; | ||
242 | } | ||
243 | 69 | ||
244 | blkno = le64_to_cpu(rec->e_blkno); | 70 | if (cpos >= emi->ei_cpos && cpos < range) { |
245 | } | 71 | list_move(&emi->ei_list, &em->em_list); |
246 | 72 | ||
247 | /* | 73 | *ret_emi = emi; |
248 | * We don't support holes, and we're still up | 74 | break; |
249 | * in the branches, so we'd better have found someone | ||
250 | */ | ||
251 | ret = -EBADR; | ||
252 | if (!blkno) { | ||
253 | ocfs2_error(inode->i_sb, | ||
254 | "No record found for (cpos = %u, clusters = %u) on inode %llu\n", | ||
255 | cpos, clusters, | ||
256 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
257 | mlog_errno(ret); | ||
258 | goto out_free; | ||
259 | } | ||
260 | |||
261 | if (eb_bh) { | ||
262 | brelse(eb_bh); | ||
263 | eb_bh = NULL; | ||
264 | } | ||
265 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
266 | blkno, &eb_bh, OCFS2_BH_CACHED, | ||
267 | inode); | ||
268 | if (ret) { | ||
269 | mlog_errno(ret); | ||
270 | goto out_free; | ||
271 | } | ||
272 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
273 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
274 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
275 | ret = -EIO; | ||
276 | goto out_free; | ||
277 | } | 75 | } |
278 | el = &eb->h_list; | ||
279 | } | 76 | } |
77 | } | ||
280 | 78 | ||
281 | BUG_ON(el->l_tree_depth); | 79 | static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, |
282 | 80 | unsigned int *phys, unsigned int *len, | |
283 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | 81 | unsigned int *flags) |
284 | rec = &el->l_recs[i]; | 82 | { |
285 | 83 | unsigned int coff; | |
286 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | 84 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
287 | OCFS2_I(inode)->ip_clusters) { | 85 | struct ocfs2_extent_map_item *emi; |
288 | ret = -EBADR; | 86 | |
289 | mlog_errno(ret); | 87 | spin_lock(&oi->ip_lock); |
290 | ocfs2_error(inode->i_sb, | 88 | |
291 | "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", | 89 | __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); |
292 | i, | 90 | if (emi) { |
293 | (unsigned long long)le64_to_cpu(rec->e_blkno), | 91 | coff = cpos - emi->ei_cpos; |
294 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 92 | *phys = emi->ei_phys + coff; |
295 | OCFS2_I(inode)->ip_clusters); | 93 | if (len) |
296 | return ret; | 94 | *len = emi->ei_clusters - coff; |
297 | } | 95 | if (flags) |
298 | 96 | *flags = emi->ei_flags; | |
299 | ret = ocfs2_extent_map_insert(inode, rec, | ||
300 | le16_to_cpu(el->l_tree_depth)); | ||
301 | if (ret && (ret != -EEXIST)) { | ||
302 | mlog_errno(ret); | ||
303 | goto out_free; | ||
304 | } | ||
305 | } | 97 | } |
306 | 98 | ||
307 | ret = 0; | 99 | spin_unlock(&oi->ip_lock); |
308 | 100 | ||
309 | out_free: | 101 | if (emi == NULL) |
310 | if (eb_bh) | 102 | return -ENOENT; |
311 | brelse(eb_bh); | ||
312 | 103 | ||
313 | return ret; | 104 | return 0; |
314 | } | 105 | } |
315 | 106 | ||
316 | /* | 107 | /* |
317 | * This lookup actually will read from disk. It has one invariant: | 108 | * Forget about all clusters equal to or greater than cpos. |
318 | * It will never re-traverse blocks. This means that all inserts should | ||
319 | * be new regions or more granular regions (both allowed by insert). | ||
320 | */ | 109 | */ |
321 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | 110 | void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) |
322 | u32 cpos, | ||
323 | u32 clusters, | ||
324 | struct ocfs2_extent_map_entry **ret_ent) | ||
325 | { | 111 | { |
326 | int ret; | 112 | struct list_head *p, *n; |
327 | u64 blkno; | 113 | struct ocfs2_extent_map_item *emi; |
328 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 114 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
329 | struct ocfs2_extent_map_entry *ent; | 115 | struct ocfs2_extent_map *em = &oi->ip_extent_map; |
330 | struct buffer_head *bh = NULL; | 116 | LIST_HEAD(tmp_list); |
331 | struct ocfs2_extent_block *eb; | 117 | unsigned int range; |
332 | struct ocfs2_dinode *di; | 118 | |
333 | struct ocfs2_extent_list *el; | 119 | spin_lock(&oi->ip_lock); |
334 | 120 | list_for_each_safe(p, n, &em->em_list) { | |
335 | spin_lock(&OCFS2_I(inode)->ip_lock); | 121 | emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); |
336 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | 122 | |
337 | if (ent) { | 123 | if (emi->ei_cpos >= cpos) { |
338 | if (!ent->e_tree_depth) { | 124 | /* Full truncate of this record. */ |
339 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 125 | list_move(&emi->ei_list, &tmp_list); |
340 | *ret_ent = ent; | 126 | BUG_ON(em->em_num_items == 0); |
341 | return 0; | 127 | em->em_num_items--; |
342 | } | 128 | continue; |
343 | blkno = le64_to_cpu(ent->e_rec.e_blkno); | ||
344 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
345 | |||
346 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, | ||
347 | OCFS2_BH_CACHED, inode); | ||
348 | if (ret) { | ||
349 | mlog_errno(ret); | ||
350 | if (bh) | ||
351 | brelse(bh); | ||
352 | return ret; | ||
353 | } | 129 | } |
354 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
355 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
356 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
357 | brelse(bh); | ||
358 | return -EIO; | ||
359 | } | ||
360 | el = &eb->h_list; | ||
361 | } else { | ||
362 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
363 | 130 | ||
364 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | 131 | range = emi->ei_cpos + emi->ei_clusters; |
365 | OCFS2_I(inode)->ip_blkno, &bh, | 132 | if (range > cpos) { |
366 | OCFS2_BH_CACHED, inode); | 133 | /* Partial truncate */ |
367 | if (ret) { | 134 | emi->ei_clusters = cpos - emi->ei_cpos; |
368 | mlog_errno(ret); | ||
369 | if (bh) | ||
370 | brelse(bh); | ||
371 | return ret; | ||
372 | } | 135 | } |
373 | di = (struct ocfs2_dinode *)bh->b_data; | ||
374 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
375 | brelse(bh); | ||
376 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); | ||
377 | return -EIO; | ||
378 | } | ||
379 | el = &di->id2.i_list; | ||
380 | } | ||
381 | |||
382 | ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); | ||
383 | brelse(bh); | ||
384 | if (ret) { | ||
385 | mlog_errno(ret); | ||
386 | return ret; | ||
387 | } | 136 | } |
137 | spin_unlock(&oi->ip_lock); | ||
388 | 138 | ||
389 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | 139 | list_for_each_safe(p, n, &tmp_list) { |
390 | if (!ent) { | 140 | emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); |
391 | ret = -ESRCH; | 141 | list_del(&emi->ei_list); |
392 | mlog_errno(ret); | 142 | kfree(emi); |
393 | return ret; | ||
394 | } | 143 | } |
395 | |||
396 | /* FIXME: Make sure this isn't a corruption */ | ||
397 | BUG_ON(ent->e_tree_depth); | ||
398 | |||
399 | *ret_ent = ent; | ||
400 | |||
401 | return 0; | ||
402 | } | 144 | } |
403 | 145 | ||
404 | /* | 146 | /* |
405 | * Callers must hold ip_lock. This can insert pieces of the tree, | 147 | * Is any part of emi2 contained within emi1 |
406 | * thus racing lookup if the lock weren't held. | ||
407 | */ | 148 | */ |
408 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | 149 | static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, |
409 | struct ocfs2_extent_map_entry *ent) | 150 | struct ocfs2_extent_map_item *emi2) |
410 | { | 151 | { |
411 | struct rb_node **p, *parent; | 152 | unsigned int range1, range2; |
412 | struct ocfs2_extent_map_entry *old_ent; | ||
413 | 153 | ||
414 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), | 154 | /* |
415 | le32_to_cpu(ent->e_rec.e_clusters), | 155 | * Check if logical start of emi2 is inside emi1 |
416 | &p, &parent); | 156 | */ |
417 | if (old_ent) | 157 | range1 = emi1->ei_cpos + emi1->ei_clusters; |
418 | return -EEXIST; | 158 | if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) |
159 | return 1; | ||
419 | 160 | ||
420 | rb_link_node(&ent->e_node, parent, p); | 161 | /* |
421 | rb_insert_color(&ent->e_node, &em->em_extents); | 162 | * Check if logical end of emi2 is inside emi1 |
163 | */ | ||
164 | range2 = emi2->ei_cpos + emi2->ei_clusters; | ||
165 | if (range2 > emi1->ei_cpos && range2 <= range1) | ||
166 | return 1; | ||
422 | 167 | ||
423 | return 0; | 168 | return 0; |
424 | } | 169 | } |
425 | 170 | ||
171 | static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, | ||
172 | struct ocfs2_extent_map_item *src) | ||
173 | { | ||
174 | dest->ei_cpos = src->ei_cpos; | ||
175 | dest->ei_phys = src->ei_phys; | ||
176 | dest->ei_clusters = src->ei_clusters; | ||
177 | dest->ei_flags = src->ei_flags; | ||
178 | } | ||
426 | 179 | ||
427 | /* | 180 | /* |
428 | * Simple rule: on any return code other than -EAGAIN, anything left | 181 | * Try to merge emi with ins. Returns 1 if merge succeeds, zero |
429 | * in the insert_context will be freed. | 182 | * otherwise. |
430 | * | ||
431 | * Simple rule #2: A return code of -EEXIST from this function or | ||
432 | * its calls to ocfs2_extent_map_insert_entry() signifies that another | ||
433 | * thread beat us to the insert. It is not an actual error, but it | ||
434 | * tells the caller we have no more work to do. | ||
435 | */ | 183 | */ |
436 | static int ocfs2_extent_map_try_insert(struct inode *inode, | 184 | static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, |
437 | struct ocfs2_extent_rec *rec, | 185 | struct ocfs2_extent_map_item *ins) |
438 | int tree_depth, | ||
439 | struct ocfs2_em_insert_context *ctxt) | ||
440 | { | 186 | { |
441 | int ret; | ||
442 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
443 | struct ocfs2_extent_map_entry *old_ent; | ||
444 | |||
445 | ctxt->need_left = 0; | ||
446 | ctxt->need_right = 0; | ||
447 | ctxt->old_ent = NULL; | ||
448 | |||
449 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
450 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
451 | if (!ret) { | ||
452 | ctxt->new_ent = NULL; | ||
453 | goto out_unlock; | ||
454 | } | ||
455 | |||
456 | /* Since insert_entry failed, the map MUST have old_ent */ | ||
457 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), | ||
458 | le32_to_cpu(rec->e_clusters), | ||
459 | NULL, NULL); | ||
460 | |||
461 | BUG_ON(!old_ent); | ||
462 | |||
463 | if (old_ent->e_tree_depth < tree_depth) { | ||
464 | /* Another thread beat us to the lower tree_depth */ | ||
465 | ret = -EEXIST; | ||
466 | goto out_unlock; | ||
467 | } | ||
468 | |||
469 | if (old_ent->e_tree_depth == tree_depth) { | ||
470 | /* | ||
471 | * Another thread beat us to this tree_depth. | ||
472 | * Let's make sure we agree with that thread (the | ||
473 | * extent_rec should be identical). | ||
474 | */ | ||
475 | if (!memcmp(rec, &old_ent->e_rec, | ||
476 | sizeof(struct ocfs2_extent_rec))) | ||
477 | ret = 0; | ||
478 | else | ||
479 | /* FIXME: Should this be ESRCH/EBADR??? */ | ||
480 | ret = -EEXIST; | ||
481 | |||
482 | goto out_unlock; | ||
483 | } | ||
484 | |||
485 | /* | 187 | /* |
486 | * We do it in this order specifically so that no actual tree | 188 | * Handle contiguousness |
487 | * changes occur until we have all the pieces we need. We | ||
488 | * don't want malloc failures to leave an inconsistent tree. | ||
489 | * Whenever we drop the lock, another process could be | ||
490 | * inserting. Also note that, if another process just beat us | ||
491 | * to an insert, we might not need the same pieces we needed | ||
492 | * the first go round. In the end, the pieces we need will | ||
493 | * be used, and the pieces we don't will be freed. | ||
494 | */ | 189 | */ |
495 | ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > | 190 | if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && |
496 | le32_to_cpu(old_ent->e_rec.e_cpos)); | 191 | ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && |
497 | ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + | 192 | ins->ei_flags == emi->ei_flags) { |
498 | le32_to_cpu(old_ent->e_rec.e_clusters)) > | 193 | emi->ei_clusters += ins->ei_clusters; |
499 | (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); | 194 | return 1; |
500 | ret = -EAGAIN; | 195 | } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && |
501 | if (ctxt->need_left) { | 196 | (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos && |
502 | if (!ctxt->left_ent) | 197 | ins->ei_flags == emi->ei_flags) { |
503 | goto out_unlock; | 198 | emi->ei_phys = ins->ei_phys; |
504 | *(ctxt->left_ent) = *old_ent; | 199 | emi->ei_cpos = ins->ei_cpos; |
505 | ctxt->left_ent->e_rec.e_clusters = | 200 | emi->ei_clusters += ins->ei_clusters; |
506 | cpu_to_le32(le32_to_cpu(rec->e_cpos) - | 201 | return 1; |
507 | le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); | ||
508 | } | ||
509 | if (ctxt->need_right) { | ||
510 | if (!ctxt->right_ent) | ||
511 | goto out_unlock; | ||
512 | *(ctxt->right_ent) = *old_ent; | ||
513 | ctxt->right_ent->e_rec.e_cpos = | ||
514 | cpu_to_le32(le32_to_cpu(rec->e_cpos) + | ||
515 | le32_to_cpu(rec->e_clusters)); | ||
516 | ctxt->right_ent->e_rec.e_clusters = | ||
517 | cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
518 | le32_to_cpu(old_ent->e_rec.e_clusters)) - | ||
519 | le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); | ||
520 | } | ||
521 | |||
522 | rb_erase(&old_ent->e_node, &em->em_extents); | ||
523 | /* Now that he's erased, set him up for deletion */ | ||
524 | ctxt->old_ent = old_ent; | ||
525 | |||
526 | if (ctxt->need_left) { | ||
527 | ret = ocfs2_extent_map_insert_entry(em, | ||
528 | ctxt->left_ent); | ||
529 | if (ret) | ||
530 | goto out_unlock; | ||
531 | ctxt->left_ent = NULL; | ||
532 | } | 202 | } |
533 | 203 | ||
534 | if (ctxt->need_right) { | 204 | /* |
535 | ret = ocfs2_extent_map_insert_entry(em, | 205 | * Overlapping extents - this shouldn't happen unless we've |
536 | ctxt->right_ent); | 206 | * split an extent to change it's flags. That is exceedingly |
537 | if (ret) | 207 | * rare, so there's no sense in trying to optimize it yet. |
538 | goto out_unlock; | 208 | */ |
539 | ctxt->right_ent = NULL; | 209 | if (ocfs2_ei_is_contained(emi, ins) || |
210 | ocfs2_ei_is_contained(ins, emi)) { | ||
211 | ocfs2_copy_emi_fields(emi, ins); | ||
212 | return 1; | ||
540 | } | 213 | } |
541 | 214 | ||
542 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | 215 | /* No merge was possible. */ |
543 | 216 | return 0; | |
544 | if (!ret) | ||
545 | ctxt->new_ent = NULL; | ||
546 | |||
547 | out_unlock: | ||
548 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
549 | |||
550 | return ret; | ||
551 | } | 217 | } |
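The two contiguity tests in ocfs2_try_to_merge_extent_map() above are easier to follow with concrete numbers. Below is a stand-alone user-space model (simplified struct and field names, not ocfs2 code) showing a forward merge: a cached extent (cpos 10, phys 100, 5 clusters) absorbs an incoming extent (cpos 15, phys 105, 3 clusters) and becomes (cpos 10, phys 100, 8 clusters):

	#include <stdio.h>

	struct ext { unsigned int cpos, phys, clusters, flags; };

	/* Mirrors the contiguity checks: grows 'emi' and returns 1 on success. */
	static int try_merge(struct ext *emi, const struct ext *ins)
	{
		if (ins->phys == emi->phys + emi->clusters &&
		    ins->cpos == emi->cpos + emi->clusters &&
		    ins->flags == emi->flags) {		/* ins directly follows emi */
			emi->clusters += ins->clusters;
			return 1;
		}
		if (ins->phys + ins->clusters == emi->phys &&
		    ins->cpos + ins->clusters == emi->cpos &&
		    ins->flags == emi->flags) {		/* ins directly precedes emi */
			emi->phys = ins->phys;
			emi->cpos = ins->cpos;
			emi->clusters += ins->clusters;
			return 1;
		}
		return 0;				/* not contiguous, no merge */
	}

	int main(void)
	{
		struct ext cached = { 10, 100, 5, 0 };
		struct ext incoming = { 15, 105, 3, 0 };

		if (try_merge(&cached, &incoming))
			printf("merged: cpos %u phys %u clusters %u\n",
			       cached.cpos, cached.phys, cached.clusters);
		return 0;
	}

The overlapping case in the kernel function simply copies the incoming fields wholesale via ocfs2_copy_emi_fields(), as its comment notes, since that situation is rare.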
552 | 218 | ||
553 | 219 | /* | |
554 | static int ocfs2_extent_map_insert(struct inode *inode, | 220 | * In order to reduce complexity on the caller, this insert function |
555 | struct ocfs2_extent_rec *rec, | 221 | * is intentionally liberal in what it will accept. |
556 | int tree_depth) | 222 | * |
223 | * The only rule is that the truncate call *must* be used whenever | ||
224 | * records have been deleted. This avoids inserting overlapping | ||
225 | * records with different physical mappings. | ||
226 | */ | ||
227 | void ocfs2_extent_map_insert_rec(struct inode *inode, | ||
228 | struct ocfs2_extent_rec *rec) | ||
557 | { | 229 | { |
558 | int ret; | 230 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
559 | struct ocfs2_em_insert_context ctxt = {0, }; | 231 | struct ocfs2_extent_map *em = &oi->ip_extent_map; |
560 | 232 | struct ocfs2_extent_map_item *emi, *new_emi = NULL; | |
561 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | 233 | struct ocfs2_extent_map_item ins; |
562 | OCFS2_I(inode)->ip_map.em_clusters) { | 234 | |
563 | ret = -EBADR; | 235 | ins.ei_cpos = le32_to_cpu(rec->e_cpos); |
564 | mlog_errno(ret); | 236 | ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, |
565 | return ret; | 237 | le64_to_cpu(rec->e_blkno)); |
238 | ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); | ||
239 | ins.ei_flags = rec->e_flags; | ||
240 | |||
241 | search: | ||
242 | spin_lock(&oi->ip_lock); | ||
243 | |||
244 | list_for_each_entry(emi, &em->em_list, ei_list) { | ||
245 | if (ocfs2_try_to_merge_extent_map(emi, &ins)) { | ||
246 | list_move(&emi->ei_list, &em->em_list); | ||
247 | spin_unlock(&oi->ip_lock); | ||
248 | goto out; | ||
249 | } | ||
566 | } | 250 | } |
567 | 251 | ||
568 | /* Zero e_clusters means a truncated tail record. It better be EOF */ | 252 | /* |
569 | if (!rec->e_clusters) { | 253 | * No item could be merged. |
570 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != | 254 | * |
571 | OCFS2_I(inode)->ip_map.em_clusters) { | 255 | * Either allocate and add a new item, or overwrite the last recently |
572 | ret = -EBADR; | 256 | * inserted. |
573 | mlog_errno(ret); | 257 | */ |
574 | ocfs2_error(inode->i_sb, | ||
575 | "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n", | ||
576 | (unsigned long long)le64_to_cpu(rec->e_blkno), | ||
577 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
578 | return ret; | ||
579 | } | ||
580 | 258 | ||
581 | /* Ignore the truncated tail */ | 259 | if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { |
582 | return 0; | 260 | if (new_emi == NULL) { |
583 | } | 261 | spin_unlock(&oi->ip_lock); |
584 | 262 | ||
585 | ret = -ENOMEM; | 263 | new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); |
586 | ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, | 264 | if (new_emi == NULL) |
587 | GFP_NOFS); | 265 | goto out; |
588 | if (!ctxt.new_ent) { | ||
589 | mlog_errno(ret); | ||
590 | return ret; | ||
591 | } | ||
592 | 266 | ||
593 | ctxt.new_ent->e_rec = *rec; | 267 | goto search; |
594 | ctxt.new_ent->e_tree_depth = tree_depth; | ||
595 | |||
596 | do { | ||
597 | ret = -ENOMEM; | ||
598 | if (ctxt.need_left && !ctxt.left_ent) { | ||
599 | ctxt.left_ent = | ||
600 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
601 | GFP_NOFS); | ||
602 | if (!ctxt.left_ent) | ||
603 | break; | ||
604 | } | ||
605 | if (ctxt.need_right && !ctxt.right_ent) { | ||
606 | ctxt.right_ent = | ||
607 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
608 | GFP_NOFS); | ||
609 | if (!ctxt.right_ent) | ||
610 | break; | ||
611 | } | 268 | } |
612 | 269 | ||
613 | ret = ocfs2_extent_map_try_insert(inode, rec, | 270 | ocfs2_copy_emi_fields(new_emi, &ins); |
614 | tree_depth, &ctxt); | 271 | list_add(&new_emi->ei_list, &em->em_list); |
615 | } while (ret == -EAGAIN); | 272 | em->em_num_items++; |
616 | 273 | new_emi = NULL; | |
617 | if ((ret < 0) && (ret != -EEXIST)) | 274 | } else { |
618 | mlog_errno(ret); | 275 | BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); |
276 | emi = list_entry(em->em_list.prev, | ||
277 | struct ocfs2_extent_map_item, ei_list); | ||
278 | list_move(&emi->ei_list, &em->em_list); | ||
279 | ocfs2_copy_emi_fields(emi, &ins); | ||
280 | } | ||
619 | 281 | ||
620 | if (ctxt.left_ent) | 282 | spin_unlock(&oi->ip_lock); |
621 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); | ||
622 | if (ctxt.right_ent) | ||
623 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); | ||
624 | if (ctxt.old_ent) | ||
625 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); | ||
626 | if (ctxt.new_ent) | ||
627 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); | ||
628 | 283 | ||
629 | return ret; | 284 | out: |
285 | if (new_emi) | ||
286 | kfree(new_emi); | ||
630 | } | 287 | } |
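ocfs2_extent_map_insert_rec() above caps the per-inode cache at OCFS2_MAX_EXTENT_MAP_ITEMS (3) entries: new items go to the head of em_list, and once the list is full the tail entry is reused. A compact user-space model of that bounded, most-recent-first behaviour (an array stands in for the kernel's list_head; illustrative only, not ocfs2 code):

	#include <stdio.h>

	#define MAX_ITEMS 3

	struct item { unsigned int cpos, phys, clusters; };

	static struct item cache[MAX_ITEMS];	/* index 0 == list head (newest) */
	static int num_items;

	static void model_insert(struct item ins)
	{
		int i;

		/* Shift existing entries toward the tail; a full cache simply
		 * drops its oldest entry, mirroring the list_entry(em_list.prev)
		 * reuse in the kernel code. */
		for (i = (num_items < MAX_ITEMS ? num_items : MAX_ITEMS - 1); i > 0; i--)
			cache[i] = cache[i - 1];
		if (num_items < MAX_ITEMS)
			num_items++;
		cache[0] = ins;
	}

	int main(void)
	{
		struct item a = {  0, 100, 4 }, b = { 10, 200, 2 };
		struct item c = { 20, 300, 8 }, d = { 40, 400, 1 };

		model_insert(a); model_insert(b); model_insert(c);
		model_insert(d);	/* cache full: the oldest entry (a) is dropped */

		for (int i = 0; i < num_items; i++)
			printf("cpos %u -> phys %u (%u clusters)\n",
			       cache[i].cpos, cache[i].phys, cache[i].clusters);
		return 0;
	}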
631 | 288 | ||
632 | /* | 289 | /* |
633 | * Append this record to the tail of the extent map. It must be | 290 | * Return the 1st index within el which contains an extent start |
634 | * tree_depth 0. The record might be an extension of an existing | 291 | * larger than v_cluster. |
635 | * record, and as such that needs to be handled. eg: | ||
636 | * | ||
637 | * Existing record in the extent map: | ||
638 | * | ||
639 | * cpos = 10, len = 10 | ||
640 | * |---------| | ||
641 | * | ||
642 | * New Record: | ||
643 | * | ||
644 | * cpos = 10, len = 20 | ||
645 | * |------------------| | ||
646 | * | ||
647 | * The passed record is the new on-disk record. The new_clusters value | ||
648 | * is how many clusters were added to the file. If the append is a | ||
649 | * contiguous append, the new_clusters has been added to | ||
650 | * rec->e_clusters. If the append is an entirely new extent, then | ||
651 | * rec->e_clusters is == new_clusters. | ||
652 | */ | 292 | */ |
653 | int ocfs2_extent_map_append(struct inode *inode, | 293 | static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, |
654 | struct ocfs2_extent_rec *rec, | 294 | u32 v_cluster) |
655 | u32 new_clusters) | ||
656 | { | 295 | { |
657 | int ret; | 296 | int i; |
658 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 297 | struct ocfs2_extent_rec *rec; |
659 | struct ocfs2_extent_map_entry *ent; | ||
660 | struct ocfs2_extent_rec *old; | ||
661 | |||
662 | BUG_ON(!new_clusters); | ||
663 | BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); | ||
664 | 298 | ||
665 | if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { | 299 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { |
666 | /* | 300 | rec = &el->l_recs[i]; |
667 | * Size changed underneath us on disk. Drop any | ||
668 | * straddling records and update our idea of | ||
669 | * i_clusters | ||
670 | */ | ||
671 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
672 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
673 | } | ||
674 | 301 | ||
675 | mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + | 302 | if (v_cluster < le32_to_cpu(rec->e_cpos)) |
676 | le32_to_cpu(rec->e_clusters)) != | 303 | break; |
677 | (em->em_clusters + new_clusters), | ||
678 | "Inode %llu:\n" | ||
679 | "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" | ||
680 | "em->em_clusters = %u + new_clusters = %u = %u\n", | ||
681 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
682 | le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), | ||
683 | le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), | ||
684 | em->em_clusters, new_clusters, | ||
685 | em->em_clusters + new_clusters); | ||
686 | |||
687 | em->em_clusters += new_clusters; | ||
688 | |||
689 | ret = -ENOENT; | ||
690 | if (le32_to_cpu(rec->e_clusters) > new_clusters) { | ||
691 | /* This is a contiguous append */ | ||
692 | ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, | ||
693 | NULL, NULL); | ||
694 | if (ent) { | ||
695 | old = &ent->e_rec; | ||
696 | BUG_ON((le32_to_cpu(rec->e_cpos) + | ||
697 | le32_to_cpu(rec->e_clusters)) != | ||
698 | (le32_to_cpu(old->e_cpos) + | ||
699 | le32_to_cpu(old->e_clusters) + | ||
700 | new_clusters)); | ||
701 | if (ent->e_tree_depth == 0) { | ||
702 | BUG_ON(le32_to_cpu(old->e_cpos) != | ||
703 | le32_to_cpu(rec->e_cpos)); | ||
704 | BUG_ON(le64_to_cpu(old->e_blkno) != | ||
705 | le64_to_cpu(rec->e_blkno)); | ||
706 | ret = 0; | ||
707 | } | ||
708 | /* | ||
709 | * Let non-leafs fall through as -ENOENT to | ||
710 | * force insertion of the new leaf. | ||
711 | */ | ||
712 | le32_add_cpu(&old->e_clusters, new_clusters); | ||
713 | } | ||
714 | } | 304 | } |
715 | 305 | ||
716 | if (ret == -ENOENT) | 306 | return i; |
717 | ret = ocfs2_extent_map_insert(inode, rec, 0); | ||
718 | if (ret < 0) | ||
719 | mlog_errno(ret); | ||
720 | return ret; | ||
721 | } | 307 | } |
722 | 308 | ||
723 | #if 0 | ||
724 | /* Code here is included but defined out as it completes the extent | ||
725 | * map api and may be used in the future. */ | ||
726 | |||
727 | /* | 309 | /* |
728 | * Look up the record containing this cluster offset. This record is | 310 | * Figure out the size of a hole which starts at v_cluster within the given |
729 | * part of the extent map. Do not free it. Any changes you make to | 311 | * extent list. |
730 | * it will reflect in the extent map. So, if your last extent | ||
731 | * is (cpos = 10, clusters = 10) and you truncate the file by 5 | ||
732 | * clusters, you can do: | ||
733 | * | 312 | * |
734 | * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); | 313 | * If there is no more allocation past v_cluster, we return the maximum |
735 | * rec->e_clusters -= 5; | 314 | * cluster size minus v_cluster. |
736 | * | 315 | * |
737 | * The lookup does not read from disk. If the map isn't filled in for | 316 | * If we have in-inode extents, then el points to the dinode list and |
738 | * an entry, you won't find it. | 317 | * eb_bh is NULL. Otherwise, eb_bh should point to the extent block |
739 | * | 318 | * containing el. |
740 | * Also note that the returned record is valid until alloc_sem is | ||
741 | * dropped. After that, truncate and extend can happen. Caveat Emptor. | ||
742 | */ | 319 | */ |
743 | int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, | 320 | static int ocfs2_figure_hole_clusters(struct inode *inode, |
744 | struct ocfs2_extent_rec **rec, | 321 | struct ocfs2_extent_list *el, |
745 | int *tree_depth) | 322 | struct buffer_head *eb_bh, |
323 | u32 v_cluster, | ||
324 | u32 *num_clusters) | ||
746 | { | 325 | { |
747 | int ret = -ENOENT; | 326 | int ret, i; |
748 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 327 | struct buffer_head *next_eb_bh = NULL; |
749 | struct ocfs2_extent_map_entry *ent; | 328 | struct ocfs2_extent_block *eb, *next_eb; |
750 | 329 | ||
751 | *rec = NULL; | 330 | i = ocfs2_search_for_hole_index(el, v_cluster); |
752 | 331 | ||
753 | if (cpos >= OCFS2_I(inode)->ip_clusters) | 332 | if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { |
754 | return -EINVAL; | 333 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; |
755 | 334 | ||
756 | if (cpos >= em->em_clusters) { | ||
757 | /* | 335 | /* |
758 | * Size changed underneath us on disk. Drop any | 336 | * Check the next leaf for any extents. |
759 | * straddling records and update our idea of | ||
760 | * i_clusters | ||
761 | */ | 337 | */ |
762 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
763 | em->em_clusters = OCFS2_I(inode)->ip_clusters ; | ||
764 | } | ||
765 | |||
766 | ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, | ||
767 | NULL, NULL); | ||
768 | 338 | ||
769 | if (ent) { | 339 | if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) |
770 | *rec = &ent->e_rec; | 340 | goto no_more_extents; |
771 | if (tree_depth) | ||
772 | *tree_depth = ent->e_tree_depth; | ||
773 | ret = 0; | ||
774 | } | ||
775 | 341 | ||
776 | return ret; | 342 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), |
777 | } | 343 | le64_to_cpu(eb->h_next_leaf_blk), |
344 | &next_eb_bh, OCFS2_BH_CACHED, inode); | ||
345 | if (ret) { | ||
346 | mlog_errno(ret); | ||
347 | goto out; | ||
348 | } | ||
349 | next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; | ||
778 | 350 | ||
779 | int ocfs2_extent_map_get_clusters(struct inode *inode, | 351 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { |
780 | u32 v_cpos, int count, | 352 | ret = -EROFS; |
781 | u32 *p_cpos, int *ret_count) | 353 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); |
782 | { | 354 | goto out; |
783 | int ret; | 355 | } |
784 | u32 coff, ccount; | ||
785 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
786 | struct ocfs2_extent_map_entry *ent = NULL; | ||
787 | 356 | ||
788 | *p_cpos = ccount = 0; | 357 | el = &next_eb->h_list; |
789 | 358 | ||
790 | if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) | 359 | i = ocfs2_search_for_hole_index(el, v_cluster); |
791 | return -EINVAL; | 360 | } |
792 | 361 | ||
793 | if ((v_cpos + count) > em->em_clusters) { | 362 | no_more_extents: |
363 | if (i == le16_to_cpu(el->l_next_free_rec)) { | ||
794 | /* | 364 | /* |
795 | * Size changed underneath us on disk. Drop any | 365 | * We're at the end of our existing allocation. Just |
796 | * straddling records and update our idea of | 366 | * return the maximum number of clusters we could |
797 | * i_clusters | 367 | * possibly allocate. |
798 | */ | 368 | */ |
799 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | 369 | *num_clusters = UINT_MAX - v_cluster; |
800 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | 370 | } else { |
371 | *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; | ||
801 | } | 372 | } |
802 | 373 | ||
374 | ret = 0; | ||
375 | out: | ||
376 | brelse(next_eb_bh); | ||
377 | return ret; | ||
378 | } | ||
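ocfs2_figure_hole_clusters() above sizes a hole by finding the first extent that starts past v_cluster. As a concrete stand-alone model (a sorted array stands in for the on-disk extent list; illustrative only): with extents covering clusters [0, 10) and [20, 30), a lookup at virtual cluster 12 reports a hole of 8 clusters, and past the last extent the hole runs to UINT_MAX:

	#include <stdio.h>
	#include <limits.h>

	struct rec { unsigned int cpos, clusters; };

	/* Simplified hole sizing: 'recs' is sorted by cpos, as an extent list is. */
	static unsigned int hole_clusters(const struct rec *recs, int nr,
					  unsigned int v_cluster)
	{
		int i;

		/* The first record starting beyond v_cluster bounds the hole. */
		for (i = 0; i < nr; i++)
			if (v_cluster < recs[i].cpos)
				return recs[i].cpos - v_cluster;

		/* No allocation past v_cluster: the hole runs to the maximum. */
		return UINT_MAX - v_cluster;
	}

	int main(void)
	{
		struct rec recs[] = { { 0, 10 }, { 20, 10 } };

		printf("%u\n", hole_clusters(recs, 2, 12));	/* prints 8 */
		printf("%u\n", hole_clusters(recs, 2, 35));	/* prints UINT_MAX - 35 */
		return 0;
	}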
803 | 379 | ||
804 | ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); | 380 | /* |
805 | if (ret) | 381 | * Return the index of the extent record which contains cluster #v_cluster. |
806 | return ret; | 382 | * -1 is returned if it was not found. |
383 | * | ||
384 | * Should work fine on interior and exterior nodes. | ||
385 | */ | ||
386 | static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, | ||
387 | u32 v_cluster) | ||
388 | { | ||
389 | int ret = -1; | ||
390 | int i; | ||
391 | struct ocfs2_extent_rec *rec; | ||
392 | u32 rec_end, rec_start, clusters; | ||
807 | 393 | ||
808 | if (ent) { | 394 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { |
809 | /* We should never find ourselves straddling an interval */ | 395 | rec = &el->l_recs[i]; |
810 | if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, | ||
811 | v_cpos, | ||
812 | count)) | ||
813 | return -ESRCH; | ||
814 | 396 | ||
815 | coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); | 397 | rec_start = le32_to_cpu(rec->e_cpos); |
816 | *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | 398 | clusters = ocfs2_rec_clusters(el, rec); |
817 | le64_to_cpu(ent->e_rec.e_blkno)) + | ||
818 | coff; | ||
819 | 399 | ||
820 | if (ret_count) | 400 | rec_end = rec_start + clusters; |
821 | *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; | ||
822 | 401 | ||
823 | return 0; | 402 | if (v_cluster >= rec_start && v_cluster < rec_end) { |
403 | ret = i; | ||
404 | break; | ||
405 | } | ||
824 | } | 406 | } |
825 | 407 | ||
826 | 408 | return ret; | |
827 | return -ENOENT; | ||
828 | } | 409 | } |
829 | 410 | ||
830 | #endif /* 0 */ | 411 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, |
831 | 412 | u32 *p_cluster, u32 *num_clusters, | |
832 | int ocfs2_extent_map_get_blocks(struct inode *inode, | 413 | unsigned int *extent_flags) |
833 | u64 v_blkno, int count, | ||
834 | u64 *p_blkno, int *ret_count) | ||
835 | { | 414 | { |
836 | int ret; | 415 | int ret, i; |
837 | u64 boff; | 416 | unsigned int flags = 0; |
838 | u32 cpos, clusters; | 417 | struct buffer_head *di_bh = NULL; |
839 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | 418 | struct buffer_head *eb_bh = NULL; |
840 | struct ocfs2_extent_map_entry *ent = NULL; | 419 | struct ocfs2_dinode *di; |
841 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 420 | struct ocfs2_extent_block *eb; |
421 | struct ocfs2_extent_list *el; | ||
842 | struct ocfs2_extent_rec *rec; | 422 | struct ocfs2_extent_rec *rec; |
423 | u32 coff; | ||
843 | 424 | ||
844 | *p_blkno = 0; | 425 | ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, |
845 | 426 | num_clusters, extent_flags); | |
846 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); | 427 | if (ret == 0) |
847 | clusters = ocfs2_blocks_to_clusters(inode->i_sb, | 428 | goto out; |
848 | (u64)count + bpc - 1); | ||
849 | if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { | ||
850 | ret = -EINVAL; | ||
851 | mlog_errno(ret); | ||
852 | return ret; | ||
853 | } | ||
854 | |||
855 | if ((cpos + clusters) > em->em_clusters) { | ||
856 | /* | ||
857 | * Size changed underneath us on disk. Drop any | ||
858 | * straddling records and update our idea of | ||
859 | * i_clusters | ||
860 | */ | ||
861 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
862 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
863 | } | ||
864 | 429 | ||
865 | ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); | 430 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, |
431 | &di_bh, OCFS2_BH_CACHED, inode); | ||
866 | if (ret) { | 432 | if (ret) { |
867 | mlog_errno(ret); | 433 | mlog_errno(ret); |
868 | return ret; | 434 | goto out; |
869 | } | 435 | } |
870 | 436 | ||
871 | if (ent) | 437 | di = (struct ocfs2_dinode *) di_bh->b_data; |
872 | { | 438 | el = &di->id2.i_list; |
873 | rec = &ent->e_rec; | ||
874 | 439 | ||
875 | /* We should never find ourselves straddling an interval */ | 440 | if (el->l_tree_depth) { |
876 | if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { | 441 | ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); |
877 | ret = -ESRCH; | 442 | if (ret) { |
878 | mlog_errno(ret); | 443 | mlog_errno(ret); |
879 | return ret; | 444 | goto out; |
880 | } | 445 | } |
881 | 446 | ||
882 | boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - | 447 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; |
883 | le32_to_cpu(rec->e_cpos)); | 448 | el = &eb->h_list; |
884 | boff += (v_blkno & (u64)(bpc - 1)); | ||
885 | *p_blkno = le64_to_cpu(rec->e_blkno) + boff; | ||
886 | 449 | ||
887 | if (ret_count) { | 450 | if (el->l_tree_depth) { |
888 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, | 451 | ocfs2_error(inode->i_sb, |
889 | le32_to_cpu(rec->e_clusters)) - boff; | 452 | "Inode %lu has non zero tree depth in " |
453 | "leaf block %llu\n", inode->i_ino, | ||
454 | (unsigned long long)eb_bh->b_blocknr); | ||
455 | ret = -EROFS; | ||
456 | goto out; | ||
890 | } | 457 | } |
891 | |||
892 | return 0; | ||
893 | } | 458 | } |
894 | 459 | ||
895 | return -ENOENT; | 460 | i = ocfs2_search_extent_list(el, v_cluster); |
896 | } | 461 | if (i == -1) { |
897 | 462 | /* | |
898 | int ocfs2_extent_map_init(struct inode *inode) | 463 | * A hole was found. Return some canned values that |
899 | { | 464 | * callers can key on. If asked for, num_clusters will |
900 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 465 | * be populated with the size of the hole. |
901 | 466 | */ | |
902 | em->em_extents = RB_ROOT; | 467 | *p_cluster = 0; |
903 | em->em_clusters = 0; | 468 | if (num_clusters) { |
904 | 469 | ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, | |
905 | return 0; | 470 | v_cluster, |
906 | } | 471 | num_clusters); |
907 | 472 | if (ret) { | |
908 | /* Needs the lock */ | 473 | mlog_errno(ret); |
909 | static void __ocfs2_extent_map_drop(struct inode *inode, | 474 | goto out; |
910 | u32 new_clusters, | 475 | } |
911 | struct rb_node **free_head, | 476 | } |
912 | struct ocfs2_extent_map_entry **tail_ent) | 477 | } else { |
913 | { | 478 | rec = &el->l_recs[i]; |
914 | struct rb_node *node, *next; | ||
915 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
916 | struct ocfs2_extent_map_entry *ent; | ||
917 | 479 | ||
918 | *free_head = NULL; | 480 | BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); |
919 | 481 | ||
920 | ent = NULL; | 482 | if (!rec->e_blkno) { |
921 | node = rb_last(&em->em_extents); | 483 | ocfs2_error(inode->i_sb, "Inode %lu has bad extent " |
922 | while (node) | 484 | "record (%u, %u, 0)", inode->i_ino, |
923 | { | 485 | le32_to_cpu(rec->e_cpos), |
924 | next = rb_prev(node); | 486 | ocfs2_rec_clusters(el, rec)); |
487 | ret = -EROFS; | ||
488 | goto out; | ||
489 | } | ||
925 | 490 | ||
926 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | 491 | coff = v_cluster - le32_to_cpu(rec->e_cpos); |
927 | e_node); | ||
928 | if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) | ||
929 | break; | ||
930 | 492 | ||
931 | rb_erase(&ent->e_node, &em->em_extents); | 493 | *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, |
494 | le64_to_cpu(rec->e_blkno)); | ||
495 | *p_cluster = *p_cluster + coff; | ||
932 | 496 | ||
933 | node->rb_right = *free_head; | 497 | if (num_clusters) |
934 | *free_head = node; | 498 | *num_clusters = ocfs2_rec_clusters(el, rec) - coff; |
935 | 499 | ||
936 | ent = NULL; | 500 | flags = rec->e_flags; |
937 | node = next; | ||
938 | } | ||
939 | 501 | ||
940 | /* Do we have an entry straddling new_clusters? */ | 502 | ocfs2_extent_map_insert_rec(inode, rec); |
941 | if (tail_ent) { | ||
942 | if (ent && | ||
943 | ((le32_to_cpu(ent->e_rec.e_cpos) + | ||
944 | le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) | ||
945 | *tail_ent = ent; | ||
946 | else | ||
947 | *tail_ent = NULL; | ||
948 | } | 503 | } |
949 | } | ||
950 | |||
951 | static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) | ||
952 | { | ||
953 | struct rb_node *node; | ||
954 | struct ocfs2_extent_map_entry *ent; | ||
955 | 504 | ||
956 | while (free_head) { | 505 | if (extent_flags) |
957 | node = free_head; | 506 | *extent_flags = flags; |
958 | free_head = node->rb_right; | ||
959 | 507 | ||
960 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | 508 | out: |
961 | e_node); | 509 | brelse(di_bh); |
962 | kmem_cache_free(ocfs2_em_ent_cachep, ent); | 510 | brelse(eb_bh); |
963 | } | 511 | return ret; |
964 | } | 512 | } |
965 | 513 | ||
966 | /* | 514 | /* |
967 | * Remove all entries past new_clusters, inclusive of an entry that | 515 | * This expects alloc_sem to be held. The allocation cannot change at |
968 | * contains new_clusters. This is effectively a cache forget. | 516 | * all while the map is in the process of being updated. |
969 | * | ||
970 | * If you want to also clip the last extent by some number of clusters, | ||
971 | * you need to call ocfs2_extent_map_trunc(). | ||
972 | * This code does not check or modify ip_clusters. | ||
973 | */ | 517 | */ |
974 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) | 518 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, |
519 | u64 *ret_count, unsigned int *extent_flags) | ||
975 | { | 520 | { |
976 | struct rb_node *free_head = NULL; | 521 | int ret; |
977 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | 522 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); |
978 | struct ocfs2_extent_map_entry *ent; | 523 | u32 cpos, num_clusters, p_cluster; |
979 | 524 | u64 boff = 0; | |
980 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
981 | 525 | ||
982 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | 526 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); |
983 | 527 | ||
984 | if (ent) { | 528 | ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, |
985 | rb_erase(&ent->e_node, &em->em_extents); | 529 | extent_flags); |
986 | ent->e_node.rb_right = free_head; | 530 | if (ret) { |
987 | free_head = &ent->e_node; | 531 | mlog_errno(ret); |
532 | goto out; | ||
988 | } | 533 | } |
989 | 534 | ||
990 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 535 | /* |
991 | 536 | * p_cluster == 0 indicates a hole. | |
992 | if (free_head) | 537 | */ |
993 | __ocfs2_extent_map_drop_cleanup(free_head); | 538 | if (p_cluster) { |
994 | 539 | boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); | |
995 | return 0; | 540 | boff += (v_blkno & (u64)(bpc - 1)); |
996 | } | 541 | } |
997 | |||
998 | /* | ||
999 | * Remove all entries past new_clusters and also clip any extent | ||
1000 | * straddling new_clusters, if there is one. This does not check | ||
1001 | * or modify ip_clusters | ||
1002 | */ | ||
1003 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) | ||
1004 | { | ||
1005 | struct rb_node *free_head = NULL; | ||
1006 | struct ocfs2_extent_map_entry *ent = NULL; | ||
1007 | |||
1008 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1009 | |||
1010 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
1011 | |||
1012 | if (ent) | ||
1013 | ent->e_rec.e_clusters = cpu_to_le32(new_clusters - | ||
1014 | le32_to_cpu(ent->e_rec.e_cpos)); | ||
1015 | |||
1016 | OCFS2_I(inode)->ip_map.em_clusters = new_clusters; | ||
1017 | |||
1018 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1019 | |||
1020 | if (free_head) | ||
1021 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
1022 | |||
1023 | return 0; | ||
1024 | } | ||
1025 | 542 | ||
1026 | int __init init_ocfs2_extent_maps(void) | 543 | *p_blkno = boff; |
1027 | { | ||
1028 | ocfs2_em_ent_cachep = | ||
1029 | kmem_cache_create("ocfs2_em_ent", | ||
1030 | sizeof(struct ocfs2_extent_map_entry), | ||
1031 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
1032 | if (!ocfs2_em_ent_cachep) | ||
1033 | return -ENOMEM; | ||
1034 | 544 | ||
1035 | return 0; | 545 | if (ret_count) { |
1036 | } | 546 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); |
547 | *ret_count -= v_blkno & (u64)(bpc - 1); | ||
548 | } | ||
1037 | 549 | ||
1038 | void exit_ocfs2_extent_maps(void) | 550 | out: |
1039 | { | 551 | return ret; |
1040 | kmem_cache_destroy(ocfs2_em_ent_cachep); | ||
1041 | } | 552 | } |
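The block arithmetic at the end of ocfs2_extent_map_get_blocks() above is a unit conversion plus an intra-cluster offset: convert the physical cluster to blocks, then add back the block offset inside the cluster using the bpc - 1 mask. A stand-alone example with an assumed geometry of 4 KB clusters and 512-byte blocks (so bpc = 8): if virtual block 19 lies in virtual cluster 2 and that cluster maps to physical cluster 7, the physical block is 7 * 8 + (19 & 7) = 59:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const unsigned int bpc = 8;	/* blocks per cluster: 4096 / 512 */
		uint64_t v_blkno = 19;		/* virtual block being mapped */
		unsigned int p_cluster = 7;	/* assume cluster 19/8 == 2 maps here */

		uint64_t p_blkno = (uint64_t)p_cluster * bpc;	/* clusters_to_blocks() */
		p_blkno += v_blkno & (uint64_t)(bpc - 1);	/* offset within the cluster */

		printf("virtual block %llu -> physical block %llu\n",
		       (unsigned long long)v_blkno, (unsigned long long)p_blkno);
		return 0;
	}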
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index fa3745efa886..de91e3e41a22 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h | |||
@@ -25,22 +25,29 @@ | |||
25 | #ifndef _EXTENT_MAP_H | 25 | #ifndef _EXTENT_MAP_H |
26 | #define _EXTENT_MAP_H | 26 | #define _EXTENT_MAP_H |
27 | 27 | ||
28 | int init_ocfs2_extent_maps(void); | 28 | struct ocfs2_extent_map_item { |
29 | void exit_ocfs2_extent_maps(void); | 29 | unsigned int ei_cpos; |
30 | unsigned int ei_phys; | ||
31 | unsigned int ei_clusters; | ||
32 | unsigned int ei_flags; | ||
30 | 33 | ||
31 | /* | 34 | struct list_head ei_list; |
32 | * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem | 35 | }; |
33 | * to be held. The allocation cannot change at all while the map is | 36 | |
34 | * in the process of being updated. | 37 | #define OCFS2_MAX_EXTENT_MAP_ITEMS 3 |
35 | */ | 38 | struct ocfs2_extent_map { |
36 | int ocfs2_extent_map_init(struct inode *inode); | 39 | unsigned int em_num_items; |
37 | int ocfs2_extent_map_append(struct inode *inode, | 40 | struct list_head em_list; |
38 | struct ocfs2_extent_rec *rec, | 41 | }; |
39 | u32 new_clusters); | 42 | |
40 | int ocfs2_extent_map_get_blocks(struct inode *inode, | 43 | void ocfs2_extent_map_init(struct inode *inode); |
41 | u64 v_blkno, int count, | 44 | void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster); |
42 | u64 *p_blkno, int *ret_count); | 45 | void ocfs2_extent_map_insert_rec(struct inode *inode, |
43 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); | 46 | struct ocfs2_extent_rec *rec); |
44 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); | 47 | |
48 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, | ||
49 | u32 *num_clusters, unsigned int *extent_flags); | ||
50 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, | ||
51 | u64 *ret_count, unsigned int *extent_flags); | ||
45 | 52 | ||
46 | #endif /* _EXTENT_MAP_H */ | 53 | #endif /* _EXTENT_MAP_H */ |
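The new header drops the rbtree-based extent map in favour of a small per-inode cache of up to OCFS2_MAX_EXTENT_MAP_ITEMS entries kept on a list. Below is a rough, purely illustrative userspace model of such a tiny most-recently-used cache; the kernel version uses a list_head and handles insertion, overlap and truncation that this sketch ignores:

#include <stdio.h>
#include <string.h>

/* Toy model of a 3-entry extent cache: the newest entry sits at index 0
 * and the oldest entry is dropped on insert. */
#define MAX_ITEMS 3

struct ext_item { unsigned cpos, phys, clusters; };

struct ext_cache {
	unsigned nr;
	struct ext_item items[MAX_ITEMS];
};

static void cache_insert(struct ext_cache *c, struct ext_item it)
{
	unsigned n = c->nr < MAX_ITEMS ? c->nr : MAX_ITEMS - 1;

	memmove(&c->items[1], &c->items[0], n * sizeof(it));
	c->items[0] = it;
	if (c->nr < MAX_ITEMS)
		c->nr++;
}

static struct ext_item *cache_lookup(struct ext_cache *c, unsigned cpos)
{
	for (unsigned i = 0; i < c->nr; i++) {
		struct ext_item *it = &c->items[i];

		if (cpos >= it->cpos && cpos < it->cpos + it->clusters)
			return it;
	}
	return NULL;
}

int main(void)
{
	struct ext_cache c = { 0 };

	cache_insert(&c, (struct ext_item){ .cpos = 0, .phys = 100, .clusters = 8 });
	cache_insert(&c, (struct ext_item){ .cpos = 8, .phys = 200, .clusters = 4 });

	printf("cpos 10 -> %s\n", cache_lookup(&c, 10) ? "hit" : "miss");
	return 0;
}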
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f2cd3bf9efb2..520a2a6d7670 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/pipe_fs_i.h> | 34 | #include <linux/pipe_fs_i.h> |
35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
36 | #include <linux/writeback.h> | ||
36 | 37 | ||
37 | #define MLOG_MASK_PREFIX ML_INODE | 38 | #define MLOG_MASK_PREFIX ML_INODE |
38 | #include <cluster/masklog.h> | 39 | #include <cluster/masklog.h> |
@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle, | |||
215 | 216 | ||
216 | mlog_entry_void(); | 217 | mlog_entry_void(); |
217 | i_size_write(inode, new_i_size); | 218 | i_size_write(inode, new_i_size); |
218 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | 219 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
219 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 220 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
220 | 221 | ||
221 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 222 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
261 | { | 262 | { |
262 | int status; | 263 | int status; |
263 | handle_t *handle; | 264 | handle_t *handle; |
265 | struct ocfs2_dinode *di; | ||
264 | 266 | ||
265 | mlog_entry_void(); | 267 | mlog_entry_void(); |
266 | 268 | ||
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
274 | goto out; | 276 | goto out; |
275 | } | 277 | } |
276 | 278 | ||
277 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); | 279 | status = ocfs2_journal_access(handle, inode, fe_bh, |
280 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
281 | if (status < 0) { | ||
282 | mlog_errno(status); | ||
283 | goto out_commit; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Do this before setting i_size. | ||
288 | */ | ||
289 | status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); | ||
290 | if (status) { | ||
291 | mlog_errno(status); | ||
292 | goto out_commit; | ||
293 | } | ||
294 | |||
295 | i_size_write(inode, new_i_size); | ||
296 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | ||
297 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
298 | |||
299 | di = (struct ocfs2_dinode *) fe_bh->b_data; | ||
300 | di->i_size = cpu_to_le64(new_i_size); | ||
301 | di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
302 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
303 | |||
304 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
278 | if (status < 0) | 305 | if (status < 0) |
279 | mlog_errno(status); | 306 | mlog_errno(status); |
280 | 307 | ||
308 | out_commit: | ||
281 | ocfs2_commit_trans(osb, handle); | 309 | ocfs2_commit_trans(osb, handle); |
282 | out: | 310 | out: |
311 | |||
283 | mlog_exit(status); | 312 | mlog_exit(status); |
284 | return status; | 313 | return status; |
285 | } | 314 | } |
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
342 | mlog_errno(status); | 371 | mlog_errno(status); |
343 | goto bail; | 372 | goto bail; |
344 | } | 373 | } |
345 | ocfs2_data_unlock(inode, 1); | ||
346 | |||
347 | if (le32_to_cpu(fe->i_clusters) == | ||
348 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { | ||
349 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", | ||
350 | fe->i_clusters); | ||
351 | /* No allocation change is required, so lets fast path | ||
352 | * this truncate. */ | ||
353 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); | ||
354 | if (status < 0) | ||
355 | mlog_errno(status); | ||
356 | goto bail; | ||
357 | } | ||
358 | 374 | ||
359 | /* alright, we're going to need to do a full blown alloc size | 375 | /* alright, we're going to need to do a full blown alloc size |
360 | * change. Orphan the inode so that recovery can complete the | 376 | * change. Orphan the inode so that recovery can complete the |
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
363 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | 379 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); |
364 | if (status < 0) { | 380 | if (status < 0) { |
365 | mlog_errno(status); | 381 | mlog_errno(status); |
366 | goto bail; | 382 | goto bail_unlock_data; |
367 | } | 383 | } |
368 | 384 | ||
369 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | 385 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); |
370 | if (status < 0) { | 386 | if (status < 0) { |
371 | mlog_errno(status); | 387 | mlog_errno(status); |
372 | goto bail; | 388 | goto bail_unlock_data; |
373 | } | 389 | } |
374 | 390 | ||
375 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | 391 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); |
376 | if (status < 0) { | 392 | if (status < 0) { |
377 | mlog_errno(status); | 393 | mlog_errno(status); |
378 | goto bail; | 394 | goto bail_unlock_data; |
379 | } | 395 | } |
380 | 396 | ||
381 | /* TODO: orphan dir cleanup here. */ | 397 | /* TODO: orphan dir cleanup here. */ |
398 | bail_unlock_data: | ||
399 | ocfs2_data_unlock(inode, 1); | ||
400 | |||
382 | bail: | 401 | bail: |
383 | 402 | ||
384 | mlog_exit(status); | 403 | mlog_exit(status); |
@@ -397,6 +416,7 @@ bail: | |||
397 | */ | 416 | */ |
398 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 417 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
399 | struct inode *inode, | 418 | struct inode *inode, |
419 | u32 *logical_offset, | ||
400 | u32 clusters_to_add, | 420 | u32 clusters_to_add, |
401 | struct buffer_head *fe_bh, | 421 | struct buffer_head *fe_bh, |
402 | handle_t *handle, | 422 | handle_t *handle, |
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
460 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); | 480 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); |
461 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", | 481 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", |
462 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); | 482 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); |
463 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, | 483 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, |
464 | num_bits, meta_ac); | 484 | *logical_offset, block, num_bits, |
485 | meta_ac); | ||
465 | if (status < 0) { | 486 | if (status < 0) { |
466 | mlog_errno(status); | 487 | mlog_errno(status); |
467 | goto leave; | 488 | goto leave; |
468 | } | 489 | } |
469 | 490 | ||
470 | le32_add_cpu(&fe->i_clusters, num_bits); | ||
471 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
472 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
473 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
474 | |||
475 | status = ocfs2_journal_dirty(handle, fe_bh); | 491 | status = ocfs2_journal_dirty(handle, fe_bh); |
476 | if (status < 0) { | 492 | if (status < 0) { |
477 | mlog_errno(status); | 493 | mlog_errno(status); |
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
479 | } | 495 | } |
480 | 496 | ||
481 | clusters_to_add -= num_bits; | 497 | clusters_to_add -= num_bits; |
498 | *logical_offset += num_bits; | ||
482 | 499 | ||
483 | if (clusters_to_add) { | 500 | if (clusters_to_add) { |
484 | mlog(0, "need to alloc once more, clusters = %u, wanted = " | 501 | mlog(0, "need to alloc once more, clusters = %u, wanted = " |
@@ -494,14 +511,87 @@ leave: | |||
494 | return status; | 511 | return status; |
495 | } | 512 | } |
496 | 513 | ||
514 | /* | ||
515 | * For a given allocation, determine which allocators will need to be | ||
516 | * accessed, and lock them, reserving the appropriate number of bits. | ||
517 | * | ||
518 | * Called from ocfs2_extend_allocation() for file systems which don't | ||
519 | * support holes, and from ocfs2_write() for file systems which | ||
520 | * understand sparse inodes. | ||
521 | */ | ||
522 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | ||
523 | u32 clusters_to_add, | ||
524 | struct ocfs2_alloc_context **data_ac, | ||
525 | struct ocfs2_alloc_context **meta_ac) | ||
526 | { | ||
527 | int ret, num_free_extents; | ||
528 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
529 | |||
530 | *meta_ac = NULL; | ||
531 | *data_ac = NULL; | ||
532 | |||
533 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " | ||
534 | "clusters_to_add = %u\n", | ||
535 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | ||
536 | le32_to_cpu(di->i_clusters), clusters_to_add); | ||
537 | |||
538 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); | ||
539 | if (num_free_extents < 0) { | ||
540 | ret = num_free_extents; | ||
541 | mlog_errno(ret); | ||
542 | goto out; | ||
543 | } | ||
544 | |||
545 | /* | ||
546 | * Sparse allocation file systems need to be more conservative | ||
547 | * with reserving room for expansion - the actual allocation | ||
548 | * happens while we've got a journal handle open so re-taking | ||
549 | * a cluster lock (because we ran out of room for another | ||
550 | * extent) will violate ordering rules. | ||
551 | * | ||
552 | * Most of the time we'll only be seeing this 1 cluster at a time | ||
553 | * anyway. | ||
554 | */ | ||
555 | if (!num_free_extents || | ||
556 | (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { | ||
557 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); | ||
558 | if (ret < 0) { | ||
559 | if (ret != -ENOSPC) | ||
560 | mlog_errno(ret); | ||
561 | goto out; | ||
562 | } | ||
563 | } | ||
564 | |||
565 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); | ||
566 | if (ret < 0) { | ||
567 | if (ret != -ENOSPC) | ||
568 | mlog_errno(ret); | ||
569 | goto out; | ||
570 | } | ||
571 | |||
572 | out: | ||
573 | if (ret) { | ||
574 | if (*meta_ac) { | ||
575 | ocfs2_free_alloc_context(*meta_ac); | ||
576 | *meta_ac = NULL; | ||
577 | } | ||
578 | |||
579 | /* | ||
580 | * We cannot have an error and a non null *data_ac. | ||
581 | */ | ||
582 | } | ||
583 | |||
584 | return ret; | ||
585 | } | ||
586 | |||
497 | static int ocfs2_extend_allocation(struct inode *inode, | 587 | static int ocfs2_extend_allocation(struct inode *inode, |
498 | u32 clusters_to_add) | 588 | u32 clusters_to_add) |
499 | { | 589 | { |
500 | int status = 0; | 590 | int status = 0; |
501 | int restart_func = 0; | 591 | int restart_func = 0; |
502 | int drop_alloc_sem = 0; | 592 | int drop_alloc_sem = 0; |
503 | int credits, num_free_extents; | 593 | int credits; |
504 | u32 prev_clusters; | 594 | u32 prev_clusters, logical_start; |
505 | struct buffer_head *bh = NULL; | 595 | struct buffer_head *bh = NULL; |
506 | struct ocfs2_dinode *fe = NULL; | 596 | struct ocfs2_dinode *fe = NULL; |
507 | handle_t *handle = NULL; | 597 | handle_t *handle = NULL; |
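ocfs2_lock_allocators() in the hunk above reserves a metadata allocator when the inode has no free extent records left, and, on sparse-capable file systems, whenever the free record count is smaller than the number of clusters being added, because re-taking cluster locks inside an open transaction would violate lock ordering. A compact sketch of just that decision, treating the inputs as plain integers:

#include <stdbool.h>
#include <stdio.h>

/* Decide whether a metadata allocator must be reserved up front. */
static bool need_meta_reservation(int num_free_extents,
				  unsigned clusters_to_add,
				  bool sparse_alloc)
{
	if (num_free_extents == 0)
		return true;    /* extent list is full, the tree may need to grow */
	if (sparse_alloc && (unsigned)num_free_extents < clusters_to_add)
		return true;    /* be conservative: every cluster may need its own record */
	return false;
}

int main(void)
{
	printf("%d\n", need_meta_reservation(2, 5, true));   /* 1: sparse, few free records */
	printf("%d\n", need_meta_reservation(2, 5, false));  /* 0: legacy, records remain */
	return 0;
}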
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
512 | 602 | ||
513 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | 603 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); |
514 | 604 | ||
605 | /* | ||
606 | * This function only exists for file systems which don't | ||
607 | * support holes. | ||
608 | */ | ||
609 | BUG_ON(ocfs2_sparse_alloc(osb)); | ||
610 | |||
515 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | 611 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, |
516 | OCFS2_BH_CACHED, inode); | 612 | OCFS2_BH_CACHED, inode); |
517 | if (status < 0) { | 613 | if (status < 0) { |
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
526 | goto leave; | 622 | goto leave; |
527 | } | 623 | } |
528 | 624 | ||
625 | logical_start = OCFS2_I(inode)->ip_clusters; | ||
626 | |||
529 | restart_all: | 627 | restart_all: |
530 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 628 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
531 | 629 | ||
532 | mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " | ||
533 | "clusters_to_add = %u\n", | ||
534 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | ||
535 | fe->i_clusters, clusters_to_add); | ||
536 | |||
537 | num_free_extents = ocfs2_num_free_extents(osb, | ||
538 | inode, | ||
539 | fe); | ||
540 | if (num_free_extents < 0) { | ||
541 | status = num_free_extents; | ||
542 | mlog_errno(status); | ||
543 | goto leave; | ||
544 | } | ||
545 | |||
546 | if (!num_free_extents) { | ||
547 | status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); | ||
548 | if (status < 0) { | ||
549 | if (status != -ENOSPC) | ||
550 | mlog_errno(status); | ||
551 | goto leave; | ||
552 | } | ||
553 | } | ||
554 | |||
555 | status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); | ||
556 | if (status < 0) { | ||
557 | if (status != -ENOSPC) | ||
558 | mlog_errno(status); | ||
559 | goto leave; | ||
560 | } | ||
561 | |||
562 | /* blocks people in read/write from reading our allocation | 630 |
563 | * until we're done changing it. We depend on i_mutex to block | 631 | * until we're done changing it. We depend on i_mutex to block |
564 | * other extend/truncate calls while we're here. Ordering wrt | 632 | * other extend/truncate calls while we're here. Ordering wrt |
@@ -566,6 +634,13 @@ restart_all: | |||
566 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 634 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
567 | drop_alloc_sem = 1; | 635 | drop_alloc_sem = 1; |
568 | 636 | ||
637 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, | ||
638 | &meta_ac); | ||
639 | if (status) { | ||
640 | mlog_errno(status); | ||
641 | goto leave; | ||
642 | } | ||
643 | |||
569 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | 644 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); |
570 | handle = ocfs2_start_trans(osb, credits); | 645 | handle = ocfs2_start_trans(osb, credits); |
571 | if (IS_ERR(handle)) { | 646 | if (IS_ERR(handle)) { |
@@ -590,6 +665,7 @@ restarted_transaction: | |||
590 | 665 | ||
591 | status = ocfs2_do_extend_allocation(osb, | 666 | status = ocfs2_do_extend_allocation(osb, |
592 | inode, | 667 | inode, |
668 | &logical_start, | ||
593 | clusters_to_add, | 669 | clusters_to_add, |
594 | bh, | 670 | bh, |
595 | handle, | 671 | handle, |
@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode, | |||
778 | size_t tail_to_skip) | 854 | size_t tail_to_skip) |
779 | { | 855 | { |
780 | int ret = 0; | 856 | int ret = 0; |
781 | u32 clusters_to_add; | 857 | u32 clusters_to_add = 0; |
782 | 858 | ||
783 | BUG_ON(!tail_to_skip && !di_bh); | 859 | BUG_ON(!tail_to_skip && !di_bh); |
784 | 860 | ||
@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode, | |||
790 | goto out; | 866 | goto out; |
791 | BUG_ON(new_i_size < i_size_read(inode)); | 867 | BUG_ON(new_i_size < i_size_read(inode)); |
792 | 868 | ||
869 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | ||
870 | BUG_ON(tail_to_skip != 0); | ||
871 | goto out_update_size; | ||
872 | } | ||
873 | |||
793 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - | 874 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - |
794 | OCFS2_I(inode)->ip_clusters; | 875 | OCFS2_I(inode)->ip_clusters; |
795 | 876 | ||
@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode, | |||
825 | goto out_unlock; | 906 | goto out_unlock; |
826 | } | 907 | } |
827 | 908 | ||
909 | out_update_size: | ||
828 | if (!tail_to_skip) { | 910 | if (!tail_to_skip) { |
829 | /* We're being called from ocfs2_setattr() which wants | 911 | /* We're being called from ocfs2_setattr() which wants |
830 | * us to update i_size */ | 912 | * us to update i_size */ |
@@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode, | |||
834 | } | 916 | } |
835 | 917 | ||
836 | out_unlock: | 918 | out_unlock: |
837 | ocfs2_data_unlock(inode, 1); | 919 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) |
920 | ocfs2_data_unlock(inode, 1); | ||
838 | 921 | ||
839 | out: | 922 | out: |
840 | return ret; | 923 | return ret; |
@@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
972 | 1055 | ||
973 | ret = ocfs2_meta_lock(inode, NULL, 0); | 1056 | ret = ocfs2_meta_lock(inode, NULL, 0); |
974 | if (ret) { | 1057 | if (ret) { |
975 | mlog_errno(ret); | 1058 | if (ret != -ENOENT) |
1059 | mlog_errno(ret); | ||
976 | goto out; | 1060 | goto out; |
977 | } | 1061 | } |
978 | 1062 | ||
@@ -1035,10 +1119,49 @@ out: | |||
1035 | return ret; | 1119 | return ret; |
1036 | } | 1120 | } |
1037 | 1121 | ||
1122 | /* | ||
1123 | * Will look for holes and unwritten extents in the range starting at | ||
1124 | * pos for count bytes (inclusive). | ||
1125 | */ | ||
1126 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | ||
1127 | size_t count) | ||
1128 | { | ||
1129 | int ret = 0; | ||
1130 | unsigned int extent_flags; | ||
1131 | u32 cpos, clusters, extent_len, phys_cpos; | ||
1132 | struct super_block *sb = inode->i_sb; | ||
1133 | |||
1134 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | ||
1135 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | ||
1136 | |||
1137 | while (clusters) { | ||
1138 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | ||
1139 | &extent_flags); | ||
1140 | if (ret < 0) { | ||
1141 | mlog_errno(ret); | ||
1142 | goto out; | ||
1143 | } | ||
1144 | |||
1145 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | ||
1146 | ret = 1; | ||
1147 | break; | ||
1148 | } | ||
1149 | |||
1150 | if (extent_len > clusters) | ||
1151 | extent_len = clusters; | ||
1152 | |||
1153 | clusters -= extent_len; | ||
1154 | cpos += extent_len; | ||
1155 | } | ||
1156 | out: | ||
1157 | return ret; | ||
1158 | } | ||
1159 | |||
1038 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 1160 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, |
1039 | loff_t *ppos, | 1161 | loff_t *ppos, |
1040 | size_t count, | 1162 | size_t count, |
1041 | int appending) | 1163 | int appending, |
1164 | int *direct_io) | ||
1042 | { | 1165 | { |
1043 | int ret = 0, meta_level = appending; | 1166 | int ret = 0, meta_level = appending; |
1044 | struct inode *inode = dentry->d_inode; | 1167 | struct inode *inode = dentry->d_inode; |
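ocfs2_check_range_for_holes(), added in the hunk above, walks the byte range cluster by cluster and returns 1 as soon as it finds a hole (physical cluster 0) or an unwritten extent. A simplified walk over a hypothetical extent table shows the same loop shape; the real code queries ocfs2_get_clusters() under cluster locks:

#include <stdio.h>

struct extent { unsigned cpos, phys, len, unwritten; };

/* Hypothetical mapping: clusters [0,4) mapped, [4,6) a hole, [6,10) unwritten. */
static const struct extent map[] = {
	{ 0, 500, 4, 0 }, { 4, 0, 2, 0 }, { 6, 800, 4, 1 },
};

static int range_has_holes(unsigned cpos, unsigned clusters)
{
	while (clusters) {
		const struct extent *e = NULL;

		for (unsigned i = 0; i < sizeof(map) / sizeof(map[0]); i++)
			if (cpos >= map[i].cpos && cpos < map[i].cpos + map[i].len)
				e = &map[i];
		if (!e)
			return -1;                        /* outside the mapped range */
		if (e->phys == 0 || e->unwritten)
			return 1;                         /* hole or unwritten extent */

		unsigned left = e->cpos + e->len - cpos;  /* clusters left in this extent */
		if (left > clusters)
			left = clusters;
		clusters -= left;
		cpos += left;
	}
	return 0;
}

int main(void)
{
	printf("[0,3): %d\n", range_has_holes(0, 3));   /* 0: fully mapped */
	printf("[2,6): %d\n", range_has_holes(2, 4));   /* 1: crosses the hole */
	return 0;
}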
@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | |||
1089 | } else { | 1212 | } else { |
1090 | saved_pos = *ppos; | 1213 | saved_pos = *ppos; |
1091 | } | 1214 | } |
1215 | |||
1216 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | ||
1217 | loff_t end = saved_pos + count; | ||
1218 | |||
1219 | /* | ||
1220 | * Skip the O_DIRECT checks if we don't need | ||
1221 | * them. | ||
1222 | */ | ||
1223 | if (!direct_io || !(*direct_io)) | ||
1224 | break; | ||
1225 | |||
1226 | /* | ||
1227 | * Allowing concurrent direct writes means | ||
1228 | * i_size changes wouldn't be synchronized, so | ||
1229 | * one node could wind up truncating another | ||
1230 | * nodes writes. | ||
1231 | */ | ||
1232 | if (end > i_size_read(inode)) { | ||
1233 | *direct_io = 0; | ||
1234 | break; | ||
1235 | } | ||
1236 | |||
1237 | /* | ||
1238 | * We don't fill holes during direct io, so | ||
1239 | * check for them here. If any are found, the | ||
1240 | * caller will have to retake some cluster | ||
1241 | * locks and initiate the io as buffered. | ||
1242 | */ | ||
1243 | ret = ocfs2_check_range_for_holes(inode, saved_pos, | ||
1244 | count); | ||
1245 | if (ret == 1) { | ||
1246 | *direct_io = 0; | ||
1247 | ret = 0; | ||
1248 | } else if (ret < 0) | ||
1249 | mlog_errno(ret); | ||
1250 | break; | ||
1251 | } | ||
1252 | |||
1253 | /* | ||
1254 | * The rest of this loop is concerned with legacy file | ||
1255 | * systems which don't support sparse files. | ||
1256 | */ | ||
1257 | |||
1092 | newsize = count + saved_pos; | 1258 | newsize = count + saved_pos; |
1093 | 1259 | ||
1094 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", | 1260 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", |
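The sparse-file branch of ocfs2_prepare_inode_for_write() turns off O_DIRECT when the write would extend i_size (concurrent direct writers could otherwise race on size updates between nodes) or when the range contains holes or unwritten extents, since direct I/O does not fill holes. A self-contained sketch of that decision; the hole scan is stubbed out here:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* Stand-in for the hole/unwritten-extent scan (see the earlier sketch);
 * stubbed so this example compiles on its own. */
static int range_has_holes(uint64_t pos, uint64_t count)
{
	(void)pos; (void)count;
	return 0;
}

static bool can_do_direct_io(uint64_t pos, uint64_t count, uint64_t i_size)
{
	if (pos + count > i_size)
		return false;   /* size-extending writes fall back to buffered I/O */
	if (range_has_holes(pos, count) != 0)
		return false;   /* holes or unwritten extents: buffered I/O fills them */
	return true;
}

int main(void)
{
	printf("%d\n", can_do_direct_io(0, 4096, 1 << 20));        /* 1 */
	printf("%d\n", can_do_direct_io(1 << 20, 4096, 1 << 20));  /* 0: extends i_size */
	return 0;
}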
@@ -1141,55 +1307,264 @@ out: | |||
1141 | return ret; | 1307 | return ret; |
1142 | } | 1308 | } |
1143 | 1309 | ||
1310 | static inline void | ||
1311 | ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | ||
1312 | { | ||
1313 | const struct iovec *iov = *iovp; | ||
1314 | size_t base = *basep; | ||
1315 | |||
1316 | do { | ||
1317 | int copy = min(bytes, iov->iov_len - base); | ||
1318 | |||
1319 | bytes -= copy; | ||
1320 | base += copy; | ||
1321 | if (iov->iov_len == base) { | ||
1322 | iov++; | ||
1323 | base = 0; | ||
1324 | } | ||
1325 | } while (bytes); | ||
1326 | *iovp = iov; | ||
1327 | *basep = base; | ||
1328 | } | ||
1329 | |||
1330 | static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, | ||
1331 | const struct iovec *cur_iov, | ||
1332 | size_t iov_offset) | ||
1333 | { | ||
1334 | int ret; | ||
1335 | char *buf; | ||
1336 | struct page *src_page = NULL; | ||
1337 | |||
1338 | buf = cur_iov->iov_base + iov_offset; | ||
1339 | |||
1340 | if (!segment_eq(get_fs(), KERNEL_DS)) { | ||
1341 | /* | ||
1342 | * Pull in the user page. We want to do this outside | ||
1343 | * of the meta data locks in order to preserve locking | ||
1344 | * order in case of page fault. | ||
1345 | */ | ||
1346 | ret = get_user_pages(current, current->mm, | ||
1347 | (unsigned long)buf & PAGE_CACHE_MASK, 1, | ||
1348 | 0, 0, &src_page, NULL); | ||
1349 | if (ret == 1) | ||
1350 | bp->b_src_buf = kmap(src_page); | ||
1351 | else | ||
1352 | src_page = ERR_PTR(-EFAULT); | ||
1353 | } else { | ||
1354 | bp->b_src_buf = buf; | ||
1355 | } | ||
1356 | |||
1357 | return src_page; | ||
1358 | } | ||
1359 | |||
1360 | static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, | ||
1361 | struct page *page) | ||
1362 | { | ||
1363 | if (page) { | ||
1364 | kunmap(page); | ||
1365 | page_cache_release(page); | ||
1366 | } | ||
1367 | } | ||
1368 | |||
1369 | static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | ||
1370 | const struct iovec *iov, | ||
1371 | unsigned long nr_segs, | ||
1372 | size_t count, | ||
1373 | ssize_t o_direct_written) | ||
1374 | { | ||
1375 | int ret = 0; | ||
1376 | ssize_t copied, total = 0; | ||
1377 | size_t iov_offset = 0; | ||
1378 | const struct iovec *cur_iov = iov; | ||
1379 | struct ocfs2_buffered_write_priv bp; | ||
1380 | struct page *page; | ||
1381 | |||
1382 | /* | ||
1383 | * handle partial DIO write. Adjust cur_iov if needed. | ||
1384 | */ | ||
1385 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); | ||
1386 | |||
1387 | do { | ||
1388 | bp.b_cur_off = iov_offset; | ||
1389 | bp.b_cur_iov = cur_iov; | ||
1390 | |||
1391 | page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); | ||
1392 | if (IS_ERR(page)) { | ||
1393 | ret = PTR_ERR(page); | ||
1394 | goto out; | ||
1395 | } | ||
1396 | |||
1397 | copied = ocfs2_buffered_write_cluster(file, *ppos, count, | ||
1398 | ocfs2_map_and_write_user_data, | ||
1399 | &bp); | ||
1400 | |||
1401 | ocfs2_put_write_source(&bp, page); | ||
1402 | |||
1403 | if (copied < 0) { | ||
1404 | mlog_errno(copied); | ||
1405 | ret = copied; | ||
1406 | goto out; | ||
1407 | } | ||
1408 | |||
1409 | total += copied; | ||
1410 | *ppos = *ppos + copied; | ||
1411 | count -= copied; | ||
1412 | |||
1413 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); | ||
1414 | } while(count); | ||
1415 | |||
1416 | out: | ||
1417 | return total ? total : ret; | ||
1418 | } | ||
1419 | |||
1420 | static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted, | ||
1421 | unsigned long *nr_segs) | ||
1422 | { | ||
1423 | size_t ocount; /* original count */ | ||
1424 | unsigned long seg; | ||
1425 | |||
1426 | ocount = 0; | ||
1427 | for (seg = 0; seg < *nr_segs; seg++) { | ||
1428 | const struct iovec *iv = &iov[seg]; | ||
1429 | |||
1430 | /* | ||
1431 | * If any segment has a negative length, or the cumulative | ||
1432 | * length ever wraps negative then return -EINVAL. | ||
1433 | */ | ||
1434 | ocount += iv->iov_len; | ||
1435 | if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) | ||
1436 | return -EINVAL; | ||
1437 | if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) | ||
1438 | continue; | ||
1439 | if (seg == 0) | ||
1440 | return -EFAULT; | ||
1441 | *nr_segs = seg; | ||
1442 | ocount -= iv->iov_len; /* This segment is no good */ | ||
1443 | break; | ||
1444 | } | ||
1445 | |||
1446 | *counted = ocount; | ||
1447 | return 0; | ||
1448 | } | ||
1449 | |||
1144 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 1450 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
1145 | const struct iovec *iov, | 1451 | const struct iovec *iov, |
1146 | unsigned long nr_segs, | 1452 | unsigned long nr_segs, |
1147 | loff_t pos) | 1453 | loff_t pos) |
1148 | { | 1454 | { |
1149 | int ret, rw_level, have_alloc_sem = 0; | 1455 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; |
1150 | struct file *filp = iocb->ki_filp; | 1456 | int can_do_direct, sync = 0; |
1151 | struct inode *inode = filp->f_path.dentry->d_inode; | 1457 | ssize_t written = 0; |
1152 | int appending = filp->f_flags & O_APPEND ? 1 : 0; | 1458 | size_t ocount; /* original count */ |
1153 | 1459 | size_t count; /* after file limit checks */ | |
1154 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, | 1460 | loff_t *ppos = &iocb->ki_pos; |
1461 | struct file *file = iocb->ki_filp; | ||
1462 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1463 | |||
1464 | mlog_entry("(0x%p, %u, '%.*s')\n", file, | ||
1155 | (unsigned int)nr_segs, | 1465 | (unsigned int)nr_segs, |
1156 | filp->f_path.dentry->d_name.len, | 1466 | file->f_path.dentry->d_name.len, |
1157 | filp->f_path.dentry->d_name.name); | 1467 | file->f_path.dentry->d_name.name); |
1158 | 1468 | ||
1159 | /* happy write of zero bytes */ | ||
1160 | if (iocb->ki_left == 0) | 1469 | if (iocb->ki_left == 0) |
1161 | return 0; | 1470 | return 0; |
1162 | 1471 | ||
1472 | ret = ocfs2_check_iovec(iov, &ocount, &nr_segs); | ||
1473 | if (ret) | ||
1474 | return ret; | ||
1475 | |||
1476 | count = ocount; | ||
1477 | |||
1478 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
1479 | |||
1480 | appending = file->f_flags & O_APPEND ? 1 : 0; | ||
1481 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; | ||
1482 | |||
1163 | mutex_lock(&inode->i_mutex); | 1483 | mutex_lock(&inode->i_mutex); |
1484 | |||
1485 | relock: | ||
1164 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 1486 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
1165 | if (filp->f_flags & O_DIRECT) { | 1487 | if (direct_io) { |
1166 | have_alloc_sem = 1; | ||
1167 | down_read(&inode->i_alloc_sem); | 1488 | down_read(&inode->i_alloc_sem); |
1489 | have_alloc_sem = 1; | ||
1168 | } | 1490 | } |
1169 | 1491 | ||
1170 | /* concurrent O_DIRECT writes are allowed */ | 1492 | /* concurrent O_DIRECT writes are allowed */ |
1171 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; | 1493 | rw_level = !direct_io; |
1172 | ret = ocfs2_rw_lock(inode, rw_level); | 1494 | ret = ocfs2_rw_lock(inode, rw_level); |
1173 | if (ret < 0) { | 1495 | if (ret < 0) { |
1174 | rw_level = -1; | ||
1175 | mlog_errno(ret); | 1496 | mlog_errno(ret); |
1176 | goto out; | 1497 | goto out_sems; |
1177 | } | 1498 | } |
1178 | 1499 | ||
1179 | ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, | 1500 | can_do_direct = direct_io; |
1180 | iocb->ki_left, appending); | 1501 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, |
1502 | iocb->ki_left, appending, | ||
1503 | &can_do_direct); | ||
1181 | if (ret < 0) { | 1504 | if (ret < 0) { |
1182 | mlog_errno(ret); | 1505 | mlog_errno(ret); |
1183 | goto out; | 1506 | goto out; |
1184 | } | 1507 | } |
1185 | 1508 | ||
1186 | /* communicate with ocfs2_dio_end_io */ | 1509 | /* |
1187 | ocfs2_iocb_set_rw_locked(iocb); | 1510 | * We can't complete the direct I/O as requested, fall back to |
1511 | * buffered I/O. | ||
1512 | */ | ||
1513 | if (direct_io && !can_do_direct) { | ||
1514 | ocfs2_rw_unlock(inode, rw_level); | ||
1515 | up_read(&inode->i_alloc_sem); | ||
1516 | |||
1517 | have_alloc_sem = 0; | ||
1518 | rw_level = -1; | ||
1188 | 1519 | ||
1189 | ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); | 1520 | direct_io = 0; |
1521 | sync = 1; | ||
1522 | goto relock; | ||
1523 | } | ||
1524 | |||
1525 | if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) | ||
1526 | sync = 1; | ||
1527 | |||
1528 | /* | ||
1529 | * XXX: Is it ok to execute these checks a second time? | ||
1530 | */ | ||
1531 | ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); | ||
1532 | if (ret) | ||
1533 | goto out; | ||
1534 | |||
1535 | /* | ||
1536 | * Set pos so that sync_page_range_nolock() below understands | ||
1537 | * where to start from. We might've moved it around via the | ||
1538 | * calls above. The range we want to actually sync starts from | ||
1539 | * *ppos here. | ||
1540 | * | ||
1541 | */ | ||
1542 | pos = *ppos; | ||
1543 | |||
1544 | /* communicate with ocfs2_dio_end_io */ | ||
1545 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | ||
1546 | |||
1547 | if (direct_io) { | ||
1548 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | ||
1549 | ppos, count, ocount); | ||
1550 | if (written < 0) { | ||
1551 | ret = written; | ||
1552 | goto out_dio; | ||
1553 | } | ||
1554 | } else { | ||
1555 | written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, | ||
1556 | count, written); | ||
1557 | if (written < 0) { | ||
1558 | ret = written; | ||
1559 | if (ret != -EFAULT && ret != -ENOSPC) | ||
1560 | mlog_errno(ret); | ||
1561 | goto out; | ||
1562 | } | ||
1563 | } | ||
1190 | 1564 | ||
1565 | out_dio: | ||
1191 | /* buffered aio wouldn't have proper lock coverage today */ | 1566 | /* buffered aio wouldn't have proper lock coverage today */ |
1192 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1567 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); |
1193 | 1568 | ||
1194 | /* | 1569 | /* |
1195 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 1570 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
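ocfs2_check_iovec() in the hunk above mirrors the generic VFS segment checks: iov_len values are summed, a negative length or signed wrap yields -EINVAL, and nr_segs is clipped at the first unreadable segment (-EFAULT only if it is the first one). A userspace sketch of the same accounting, with access_ok() replaced by an assumed always-true predicate:

#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Stand-in for access_ok(); assume every segment is readable here. */
static int readable(const void *base, size_t len) { (void)base; (void)len; return 1; }

static int check_iovec(const struct iovec *iov, size_t *counted,
		       unsigned long *nr_segs)
{
	size_t ocount = 0;

	for (unsigned long seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		ocount += iv->iov_len;
		if ((ssize_t)(ocount | iv->iov_len) < 0)
			return -EINVAL;          /* negative length or wrap */
		if (readable(iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;          /* first segment already unusable */
		*nr_segs = seg;                  /* clip at the bad segment */
		ocount -= iv->iov_len;
		break;
	}

	*counted = ocount;
	return 0;
}

int main(void)
{
	char a[8], b[16];
	struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	unsigned long nr = 2;
	size_t total = 0;

	printf("ret=%d total=%zu nr_segs=%lu\n",
	       check_iovec(iov, &total, &nr), total, nr);
	return 0;
}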
@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
1207 | } | 1582 | } |
1208 | 1583 | ||
1209 | out: | 1584 | out: |
1585 | if (rw_level != -1) | ||
1586 | ocfs2_rw_unlock(inode, rw_level); | ||
1587 | |||
1588 | out_sems: | ||
1210 | if (have_alloc_sem) | 1589 | if (have_alloc_sem) |
1211 | up_read(&inode->i_alloc_sem); | 1590 | up_read(&inode->i_alloc_sem); |
1212 | if (rw_level != -1) | 1591 | |
1213 | ocfs2_rw_unlock(inode, rw_level); | 1592 | if (written > 0 && sync) { |
1593 | ssize_t err; | ||
1594 | |||
1595 | err = sync_page_range_nolock(inode, file->f_mapping, pos, count); | ||
1596 | if (err < 0) | ||
1597 | written = err; | ||
1598 | } | ||
1599 | |||
1214 | mutex_unlock(&inode->i_mutex); | 1600 | mutex_unlock(&inode->i_mutex); |
1215 | 1601 | ||
1216 | mlog_exit(ret); | 1602 | mlog_exit(ret); |
1603 | return written ? written : ret; | ||
1604 | } | ||
1605 | |||
1606 | static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, | ||
1607 | struct pipe_buffer *buf, | ||
1608 | struct splice_desc *sd) | ||
1609 | { | ||
1610 | int ret, count, total = 0; | ||
1611 | ssize_t copied = 0; | ||
1612 | struct ocfs2_splice_write_priv sp; | ||
1613 | |||
1614 | ret = buf->ops->pin(pipe, buf); | ||
1615 | if (ret) | ||
1616 | goto out; | ||
1617 | |||
1618 | sp.s_sd = sd; | ||
1619 | sp.s_buf = buf; | ||
1620 | sp.s_pipe = pipe; | ||
1621 | sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; | ||
1622 | sp.s_buf_offset = buf->offset; | ||
1623 | |||
1624 | count = sd->len; | ||
1625 | if (count + sp.s_offset > PAGE_CACHE_SIZE) | ||
1626 | count = PAGE_CACHE_SIZE - sp.s_offset; | ||
1627 | |||
1628 | do { | ||
1629 | /* | ||
1630 | * splice wants us to copy up to one page at a | ||
1631 | * time. For pagesize > cluster size, this means we | ||
1632 | * might enter ocfs2_buffered_write_cluster() more | ||
1633 | * than once, so keep track of our progress here. | ||
1634 | */ | ||
1635 | copied = ocfs2_buffered_write_cluster(sd->file, | ||
1636 | (loff_t)sd->pos + total, | ||
1637 | count, | ||
1638 | ocfs2_map_and_write_splice_data, | ||
1639 | &sp); | ||
1640 | if (copied < 0) { | ||
1641 | mlog_errno(copied); | ||
1642 | ret = copied; | ||
1643 | goto out; | ||
1644 | } | ||
1645 | |||
1646 | count -= copied; | ||
1647 | sp.s_offset += copied; | ||
1648 | sp.s_buf_offset += copied; | ||
1649 | total += copied; | ||
1650 | } while (count); | ||
1651 | |||
1652 | ret = 0; | ||
1653 | out: | ||
1654 | |||
1655 | return total ? total : ret; | ||
1656 | } | ||
1657 | |||
1658 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, | ||
1659 | struct file *out, | ||
1660 | loff_t *ppos, | ||
1661 | size_t len, | ||
1662 | unsigned int flags) | ||
1663 | { | ||
1664 | int ret, err; | ||
1665 | struct address_space *mapping = out->f_mapping; | ||
1666 | struct inode *inode = mapping->host; | ||
1667 | |||
1668 | ret = __splice_from_pipe(pipe, out, ppos, len, flags, | ||
1669 | ocfs2_splice_write_actor); | ||
1670 | if (ret > 0) { | ||
1671 | *ppos += ret; | ||
1672 | |||
1673 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
1674 | err = generic_osync_inode(inode, mapping, | ||
1675 | OSYNC_METADATA|OSYNC_DATA); | ||
1676 | if (err) | ||
1677 | ret = err; | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1217 | return ret; | 1681 | return ret; |
1218 | } | 1682 | } |
1219 | 1683 | ||
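The splice write actor above juggles two granularities: splice hands it at most one page, while ocfs2_buffered_write_cluster() may take less than that per call, so it loops and tracks its own progress. A toy version of that chunking, assuming a cluster size smaller than the page size:

#include <stdio.h>

#define PAGE_SIZE    4096u
#define CLUSTER_SIZE 1024u   /* assumed smaller-than-page cluster, for illustration */

/* Pretend writer: accepts at most the remainder of the current cluster. */
static unsigned write_cluster(unsigned long long pos, unsigned count)
{
	unsigned room = CLUSTER_SIZE - (unsigned)(pos % CLUSTER_SIZE);

	return count < room ? count : room;
}

int main(void)
{
	unsigned long long pos = 3000;   /* unaligned start inside a page */
	unsigned offset = (unsigned)(pos % PAGE_SIZE);
	unsigned count = PAGE_SIZE;      /* splice hands us at most one page */
	unsigned total = 0;

	if (count + offset > PAGE_SIZE)
		count = PAGE_SIZE - offset;

	while (count) {
		unsigned copied = write_cluster(pos + total, count);

		count -= copied;
		total += copied;
	}
	printf("wrote %u bytes starting at %llu\n", total, pos);
	return 0;
}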
@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | |||
1239 | goto out; | 1703 | goto out; |
1240 | } | 1704 | } |
1241 | 1705 | ||
1242 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); | 1706 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, |
1707 | NULL); | ||
1243 | if (ret < 0) { | 1708 | if (ret < 0) { |
1244 | mlog_errno(ret); | 1709 | mlog_errno(ret); |
1245 | goto out_unlock; | 1710 | goto out_unlock; |
1246 | } | 1711 | } |
1247 | 1712 | ||
1248 | /* ok, we're done with i_size and alloc work */ | 1713 | /* ok, we're done with i_size and alloc work */ |
1249 | ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); | 1714 | ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); |
1250 | 1715 | ||
1251 | out_unlock: | 1716 | out_unlock: |
1252 | ocfs2_rw_unlock(inode, 1); | 1717 | ocfs2_rw_unlock(inode, 1); |
@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | |||
1323 | } | 1788 | } |
1324 | rw_level = 0; | 1789 | rw_level = 0; |
1325 | /* communicate with ocfs2_dio_end_io */ | 1790 | /* communicate with ocfs2_dio_end_io */ |
1326 | ocfs2_iocb_set_rw_locked(iocb); | 1791 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
1327 | } | 1792 | } |
1328 | 1793 | ||
1329 | /* | 1794 | /* |
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index cc973f01f6ce..2c4460fced52 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h | |||
@@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted { | |||
39 | }; | 39 | }; |
40 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 40 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
41 | struct inode *inode, | 41 | struct inode *inode, |
42 | u32 *cluster_start, | ||
42 | u32 clusters_to_add, | 43 | u32 clusters_to_add, |
43 | struct buffer_head *fe_bh, | 44 | struct buffer_head *fe_bh, |
44 | handle_t *handle, | 45 | handle_t *handle, |
45 | struct ocfs2_alloc_context *data_ac, | 46 | struct ocfs2_alloc_context *data_ac, |
46 | struct ocfs2_alloc_context *meta_ac, | 47 | struct ocfs2_alloc_context *meta_ac, |
47 | enum ocfs2_alloc_restarted *reason); | 48 | enum ocfs2_alloc_restarted *reason); |
49 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | ||
50 | u32 clusters_to_add, | ||
51 | struct ocfs2_alloc_context **data_ac, | ||
52 | struct ocfs2_alloc_context **meta_ac); | ||
48 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); | 53 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); |
49 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, | 54 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, |
50 | struct kstat *stat); | 55 | struct kstat *stat); |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 28ab56f2b98c..21a605079c62 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode) | |||
89 | inode->i_flags |= S_DIRSYNC; | 89 | inode->i_flags |= S_DIRSYNC; |
90 | } | 90 | } |
91 | 91 | ||
92 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | ||
93 | u64 blkno, | ||
94 | int delete_vote) | ||
95 | { | ||
96 | struct ocfs2_find_inode_args args; | ||
97 | |||
98 | /* ocfs2_ilookup_for_vote should *only* be called from the | ||
99 | * vote thread */ | ||
100 | BUG_ON(current != osb->vote_task); | ||
101 | |||
102 | args.fi_blkno = blkno; | ||
103 | args.fi_flags = OCFS2_FI_FLAG_NOWAIT; | ||
104 | if (delete_vote) | ||
105 | args.fi_flags |= OCFS2_FI_FLAG_DELETE; | ||
106 | args.fi_ino = ino_from_blkno(osb->sb, blkno); | ||
107 | return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); | ||
108 | } | ||
109 | |||
110 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) | 92 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) |
111 | { | 93 | { |
112 | struct inode *inode = NULL; | 94 | struct inode *inode = NULL; |
@@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque) | |||
182 | if (oi->ip_blkno != args->fi_blkno) | 164 | if (oi->ip_blkno != args->fi_blkno) |
183 | goto bail; | 165 | goto bail; |
184 | 166 | ||
185 | /* OCFS2_FI_FLAG_NOWAIT is *only* set from | ||
186 | * ocfs2_ilookup_for_vote which won't create an inode for one | ||
187 | * that isn't found. The vote thread which doesn't want to get | ||
188 | * an inode which is in the process of going away - otherwise | ||
189 | * the call to __wait_on_freeing_inode in find_inode_fast will | ||
190 | * cause it to deadlock on an inode which may be waiting on a | ||
191 | * vote (or lock release) in delete_inode */ | ||
192 | if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && | ||
193 | (inode->i_state & (I_FREEING|I_CLEAR))) { | ||
194 | /* As stated above, we're not going to return an | ||
195 | * inode. In the case of a delete vote, the voting | ||
196 | * code is going to signal the other node to go | ||
197 | * ahead. Mark that state here, so this freeing inode | ||
198 | * has the state when it gets to delete_inode. */ | ||
199 | if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { | ||
200 | spin_lock(&oi->ip_lock); | ||
201 | ocfs2_mark_inode_remotely_deleted(inode); | ||
202 | spin_unlock(&oi->ip_lock); | ||
203 | } | ||
204 | goto bail; | ||
205 | } | ||
206 | |||
207 | ret = 1; | 167 | ret = 1; |
208 | bail: | 168 | bail: |
209 | mlog_exit(ret); | 169 | mlog_exit(ret); |
@@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
261 | goto bail; | 221 | goto bail; |
262 | } | 222 | } |
263 | 223 | ||
224 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
225 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); | ||
226 | |||
264 | inode->i_version = 1; | 227 | inode->i_version = 1; |
265 | inode->i_generation = le32_to_cpu(fe->i_generation); | 228 | inode->i_generation = le32_to_cpu(fe->i_generation); |
266 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | 229 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); |
@@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
272 | if (S_ISLNK(inode->i_mode) && !fe->i_clusters) | 235 | if (S_ISLNK(inode->i_mode) && !fe->i_clusters) |
273 | inode->i_blocks = 0; | 236 | inode->i_blocks = 0; |
274 | else | 237 | else |
275 | inode->i_blocks = | 238 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
276 | ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); | ||
277 | inode->i_mapping->a_ops = &ocfs2_aops; | 239 | inode->i_mapping->a_ops = &ocfs2_aops; |
278 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); | 240 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); |
279 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); | 241 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); |
@@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
288 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 250 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
289 | (unsigned long long)fe->i_blkno); | 251 | (unsigned long long)fe->i_blkno); |
290 | 252 | ||
291 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
292 | OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; | ||
293 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); | ||
294 | |||
295 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | 253 | inode->i_nlink = le16_to_cpu(fe->i_links_count); |
296 | 254 | ||
297 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | 255 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) |
@@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
347 | 305 | ||
348 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | 306 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, |
349 | OCFS2_LOCK_TYPE_META, 0, inode); | 307 | OCFS2_LOCK_TYPE_META, 0, inode); |
308 | |||
309 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, | ||
310 | OCFS2_LOCK_TYPE_OPEN, 0, inode); | ||
350 | } | 311 | } |
351 | 312 | ||
352 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, | 313 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, |
@@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
421 | * cluster lock before trusting anything anyway. | 382 | * cluster lock before trusting anything anyway. |
422 | */ | 383 | */ |
423 | can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) | 384 | can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) |
424 | && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) | 385 | && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) |
425 | && !ocfs2_mount_local(osb); | 386 | && !ocfs2_mount_local(osb); |
426 | 387 | ||
427 | /* | 388 | /* |
@@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
438 | OCFS2_LOCK_TYPE_META, | 399 | OCFS2_LOCK_TYPE_META, |
439 | generation, inode); | 400 | generation, inode); |
440 | 401 | ||
402 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, | ||
403 | OCFS2_LOCK_TYPE_OPEN, | ||
404 | 0, inode); | ||
405 | |||
441 | if (can_lock) { | 406 | if (can_lock) { |
407 | status = ocfs2_open_lock(inode); | ||
408 | if (status) { | ||
409 | make_bad_inode(inode); | ||
410 | mlog_errno(status); | ||
411 | return status; | ||
412 | } | ||
442 | status = ocfs2_meta_lock(inode, NULL, 0); | 413 | status = ocfs2_meta_lock(inode, NULL, 0); |
443 | if (status) { | 414 | if (status) { |
444 | make_bad_inode(inode); | 415 | make_bad_inode(inode); |
@@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
447 | } | 418 | } |
448 | } | 419 | } |
449 | 420 | ||
421 | if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { | ||
422 | status = ocfs2_try_open_lock(inode, 0); | ||
423 | if (status) { | ||
424 | make_bad_inode(inode); | ||
425 | return status; | ||
426 | } | ||
427 | } | ||
428 | |||
450 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, | 429 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, |
451 | can_lock ? inode : NULL); | 430 | can_lock ? inode : NULL); |
452 | if (status < 0) { | 431 | if (status < 0) { |
@@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, | |||
507 | struct buffer_head *fe_bh) | 486 | struct buffer_head *fe_bh) |
508 | { | 487 | { |
509 | int status = 0; | 488 | int status = 0; |
510 | handle_t *handle = NULL; | ||
511 | struct ocfs2_truncate_context *tc = NULL; | 489 | struct ocfs2_truncate_context *tc = NULL; |
512 | struct ocfs2_dinode *fe; | 490 | struct ocfs2_dinode *fe; |
491 | handle_t *handle = NULL; | ||
513 | 492 | ||
514 | mlog_entry_void(); | 493 | mlog_entry_void(); |
515 | 494 | ||
516 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | 495 | fe = (struct ocfs2_dinode *) fe_bh->b_data; |
517 | 496 | ||
518 | /* zero allocation, zero truncate :) */ | 497 | if (fe->i_clusters) { |
519 | if (!fe->i_clusters) | 498 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
520 | goto bail; | 499 | if (IS_ERR(handle)) { |
500 | status = PTR_ERR(handle); | ||
501 | mlog_errno(status); | ||
502 | goto out; | ||
503 | } | ||
521 | 504 | ||
522 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 505 | status = ocfs2_journal_access(handle, inode, fe_bh, |
523 | if (IS_ERR(handle)) { | 506 | OCFS2_JOURNAL_ACCESS_WRITE); |
524 | status = PTR_ERR(handle); | 507 | if (status < 0) { |
525 | handle = NULL; | 508 | mlog_errno(status); |
526 | mlog_errno(status); | 509 | goto out; |
527 | goto bail; | 510 | } |
528 | } | ||
529 | 511 | ||
530 | status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); | 512 | i_size_write(inode, 0); |
531 | if (status < 0) { | ||
532 | mlog_errno(status); | ||
533 | goto bail; | ||
534 | } | ||
535 | 513 | ||
536 | ocfs2_commit_trans(osb, handle); | 514 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
537 | handle = NULL; | 515 | if (status < 0) { |
516 | mlog_errno(status); | ||
517 | goto out; | ||
518 | } | ||
538 | 519 | ||
539 | status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); | 520 | ocfs2_commit_trans(osb, handle); |
540 | if (status < 0) { | 521 | handle = NULL; |
541 | mlog_errno(status); | ||
542 | goto bail; | ||
543 | } | ||
544 | 522 | ||
545 | status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); | 523 | status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); |
546 | if (status < 0) { | 524 | if (status < 0) { |
547 | mlog_errno(status); | 525 | mlog_errno(status); |
548 | goto bail; | 526 | goto out; |
527 | } | ||
528 | |||
529 | status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); | ||
530 | if (status < 0) { | ||
531 | mlog_errno(status); | ||
532 | goto out; | ||
533 | } | ||
549 | } | 534 | } |
550 | bail: | 535 | |
536 | out: | ||
551 | if (handle) | 537 | if (handle) |
552 | ocfs2_commit_trans(osb, handle); | 538 | ocfs2_commit_trans(osb, handle); |
553 | |||
554 | mlog_exit(status); | 539 | mlog_exit(status); |
555 | return status; | 540 | return status; |
556 | } | 541 | } |
@@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode, | |||
678 | struct inode *orphan_dir_inode = NULL; | 663 | struct inode *orphan_dir_inode = NULL; |
679 | struct buffer_head *orphan_dir_bh = NULL; | 664 | struct buffer_head *orphan_dir_bh = NULL; |
680 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 665 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
666 | struct ocfs2_dinode *di; | ||
681 | 667 | ||
682 | /* We've already voted on this so it should be readonly - no | 668 | di = (struct ocfs2_dinode *) di_bh->b_data; |
683 | * spinlock needed. */ | 669 | orphaned_slot = le16_to_cpu(di->i_orphaned_slot); |
684 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
685 | 670 | ||
686 | status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); | 671 | status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); |
687 | if (status) | 672 | if (status) |
@@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode, | |||
839 | goto bail; | 824 | goto bail; |
840 | } | 825 | } |
841 | 826 | ||
842 | status = ocfs2_request_delete_vote(inode); | 827 | /* |
843 | /* -EBUSY means that other nodes are still using the | 828 | * This is how ocfs2 determines whether an inode is still live |
844 | * inode. We're done here though, so avoid doing anything on | 829 | * within the cluster. Every node takes a shared read lock on |
845 | * disk and let them worry about deleting it. */ | 830 | * the inode open lock in ocfs2_read_locked_inode(). When we |
846 | if (status == -EBUSY) { | 831 | * get to ->delete_inode(), each node tries to convert it's |
832 | * lock to an exclusive. Trylocks are serialized by the inode | ||
833 | * meta data lock. If the upconvert suceeds, we know the inode | ||
834 | * is no longer live and can be deleted. | ||
835 | * | ||
836 | * Though we call this with the meta data lock held, the | ||
837 | * trylock keeps us from ABBA deadlock. | ||
838 | */ | ||
839 | status = ocfs2_try_open_lock(inode, 1); | ||
840 | if (status == -EAGAIN) { | ||
847 | status = 0; | 841 | status = 0; |
848 | mlog(0, "Skipping delete of %llu because it is in use on" | 842 | mlog(0, "Skipping delete of %llu because it is in use on" |
849 | "other nodes\n", (unsigned long long)oi->ip_blkno); | 843 | "other nodes\n", (unsigned long long)oi->ip_blkno); |
@@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode, | |||
854 | goto bail; | 848 | goto bail; |
855 | } | 849 | } |
856 | 850 | ||
857 | spin_lock(&oi->ip_lock); | 851 | *wipe = 1; |
858 | if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { | 852 | mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n", |
859 | /* Nobody knew which slot this inode was orphaned | 853 | (unsigned long long)oi->ip_blkno, |
860 | * into. This may happen during node death and | 854 | le16_to_cpu(di->i_orphaned_slot)); |
861 | * recovery knows how to clean it up so we can safely | ||
862 | * ignore this inode for now on. */ | ||
863 | mlog(0, "Nobody knew where inode %llu was orphaned!\n", | ||
864 | (unsigned long long)oi->ip_blkno); | ||
865 | } else { | ||
866 | *wipe = 1; | ||
867 | |||
868 | mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n", | ||
869 | (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot); | ||
870 | } | ||
871 | spin_unlock(&oi->ip_lock); | ||
872 | 855 | ||
873 | bail: | 856 | bail: |
874 | return status; | 857 | return status; |
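The comment block in the hunk above describes the open-lock protocol that replaces the delete_inode vote: every node holds a shared open lock on a live inode, and delete_inode attempts a non-blocking upconvert to exclusive; -EAGAIN means another node still has the inode open. A toy single-process model of that rule (the real lock is a DLM resource, and the trylocks are serialized by the metadata lock):

#include <stdio.h>
#include <errno.h>

/* Toy open lock: one shared holder per node that has the inode in memory. */
struct open_lock { int shared_holders; };

static void open_lock_shared(struct open_lock *l)   { l->shared_holders++; }
static void open_unlock_shared(struct open_lock *l) { l->shared_holders--; }

/* Trylock to exclusive: succeeds only if we are the last holder. */
static int try_open_lock_exclusive(struct open_lock *l)
{
	return l->shared_holders == 1 ? 0 : -EAGAIN;
}

int main(void)
{
	struct open_lock l = { 0 };

	open_lock_shared(&l);                 /* this node's iget */
	open_lock_shared(&l);                 /* another node's iget */

	printf("%d\n", try_open_lock_exclusive(&l));  /* -EAGAIN: still in use */

	open_unlock_shared(&l);               /* the other node drops the inode */
	printf("%d\n", try_open_lock_exclusive(&l));  /* 0: safe to wipe */
	return 0;
}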
@@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode) | |||
1001 | mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, | 984 | mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, |
1002 | "Inode=%lu\n", inode->i_ino); | 985 | "Inode=%lu\n", inode->i_ino); |
1003 | 986 | ||
987 | /* The open lock replaces the delete_inode vote; we took it earlier, | ||
988 | * so now it is time to drop the PR and EX open locks. */ | ||
989 | ocfs2_open_unlock(inode); | ||
990 | |||
1004 | /* Do these before all the other work so that we don't bounce | 991 | /* Do these before all the other work so that we don't bounce |
1005 | * the vote thread while waiting to destroy the locks. */ | 992 | * the vote thread while waiting to destroy the locks. */ |
1006 | ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); | 993 | ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); |
1007 | ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); | 994 | ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); |
1008 | ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); | 995 | ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); |
996 | ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); | ||
1009 | 997 | ||
1010 | /* We very well may get a clear_inode before all of an inode's | 998 |
1011 | * metadata has hit disk. Of course, we can't drop any cluster | 999 | * metadata has hit disk. Of course, we can't drop any cluster |
@@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode) | |||
1020 | "Clear inode of %llu, inode has io markers\n", | 1008 | "Clear inode of %llu, inode has io markers\n", |
1021 | (unsigned long long)oi->ip_blkno); | 1009 | (unsigned long long)oi->ip_blkno); |
1022 | 1010 | ||
1023 | ocfs2_extent_map_drop(inode, 0); | 1011 | ocfs2_extent_map_trunc(inode, 0); |
1024 | ocfs2_extent_map_init(inode); | ||
1025 | 1012 | ||
1026 | status = ocfs2_drop_inode_locks(inode); | 1013 | status = ocfs2_drop_inode_locks(inode); |
1027 | if (status < 0) | 1014 | if (status < 0) |
@@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode) | |||
1030 | ocfs2_lock_res_free(&oi->ip_rw_lockres); | 1017 | ocfs2_lock_res_free(&oi->ip_rw_lockres); |
1031 | ocfs2_lock_res_free(&oi->ip_meta_lockres); | 1018 | ocfs2_lock_res_free(&oi->ip_meta_lockres); |
1032 | ocfs2_lock_res_free(&oi->ip_data_lockres); | 1019 | ocfs2_lock_res_free(&oi->ip_data_lockres); |
1020 | ocfs2_lock_res_free(&oi->ip_open_lockres); | ||
1033 | 1021 | ||
1034 | ocfs2_metadata_cache_purge(inode); | 1022 | ocfs2_metadata_cache_purge(inode); |
1035 | 1023 | ||
@@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode) | |||
1086 | mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", | 1074 | mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", |
1087 | (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); | 1075 | (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); |
1088 | 1076 | ||
1089 | /* Testing ip_orphaned_slot here wouldn't work because we may | ||
1090 | * not have gotten a delete_inode vote from any other nodes | ||
1091 | * yet. */ | ||
1092 | if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) | 1077 | if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) |
1093 | generic_delete_inode(inode); | 1078 | generic_delete_inode(inode); |
1094 | else | 1079 | else |
@@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode, | |||
1121 | return NULL; | 1106 | return NULL; |
1122 | } | 1107 | } |
1123 | 1108 | ||
1124 | tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, | 1109 | tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, |
1125 | &p_blkno, NULL); | 1110 | NULL); |
1126 | if (tmperr < 0) { | 1111 | if (tmperr < 0) { |
1127 | mlog_errno(tmperr); | 1112 | mlog_errno(tmperr); |
1128 | goto fail; | 1113 | goto fail; |
@@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode, | |||
1259 | if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) | 1244 | if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) |
1260 | inode->i_blocks = 0; | 1245 | inode->i_blocks = 0; |
1261 | else | 1246 | else |
1262 | inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); | 1247 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
1263 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); | 1248 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); |
1264 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); | 1249 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); |
1265 | inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); | 1250 | inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1a7dd2945b34..03ae075869ee 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -26,6 +26,8 @@ | |||
26 | #ifndef OCFS2_INODE_H | 26 | #ifndef OCFS2_INODE_H |
27 | #define OCFS2_INODE_H | 27 | #define OCFS2_INODE_H |
28 | 28 | ||
29 | #include "extent_map.h" | ||
30 | |||
29 | /* OCFS2 Inode Private Data */ | 31 | /* OCFS2 Inode Private Data */ |
30 | struct ocfs2_inode_info | 32 | struct ocfs2_inode_info |
31 | { | 33 | { |
@@ -34,6 +36,7 @@ struct ocfs2_inode_info | |||
34 | struct ocfs2_lock_res ip_rw_lockres; | 36 | struct ocfs2_lock_res ip_rw_lockres; |
35 | struct ocfs2_lock_res ip_meta_lockres; | 37 | struct ocfs2_lock_res ip_meta_lockres; |
36 | struct ocfs2_lock_res ip_data_lockres; | 38 | struct ocfs2_lock_res ip_data_lockres; |
39 | struct ocfs2_lock_res ip_open_lockres; | ||
37 | 40 | ||
38 | /* protects allocation changes on this inode. */ | 41 | /* protects allocation changes on this inode. */ |
39 | struct rw_semaphore ip_alloc_sem; | 42 | struct rw_semaphore ip_alloc_sem; |
@@ -42,9 +45,7 @@ struct ocfs2_inode_info | |||
42 | spinlock_t ip_lock; | 45 | spinlock_t ip_lock; |
43 | u32 ip_open_count; | 46 | u32 ip_open_count; |
44 | u32 ip_clusters; | 47 | u32 ip_clusters; |
45 | struct ocfs2_extent_map ip_map; | ||
46 | struct list_head ip_io_markers; | 48 | struct list_head ip_io_markers; |
47 | int ip_orphaned_slot; | ||
48 | 49 | ||
49 | struct mutex ip_io_mutex; | 50 | struct mutex ip_io_mutex; |
50 | 51 | ||
@@ -64,6 +65,8 @@ struct ocfs2_inode_info | |||
64 | 65 | ||
65 | struct ocfs2_caching_info ip_metadata_cache; | 66 | struct ocfs2_caching_info ip_metadata_cache; |
66 | 67 | ||
68 | struct ocfs2_extent_map ip_extent_map; | ||
69 | |||
67 | struct inode vfs_inode; | 70 | struct inode vfs_inode; |
68 | }; | 71 | }; |
69 | 72 | ||
@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode); | |||
117 | void ocfs2_drop_inode(struct inode *inode); | 120 | void ocfs2_drop_inode(struct inode *inode); |
118 | 121 | ||
119 | /* Flags for ocfs2_iget() */ | 122 | /* Flags for ocfs2_iget() */ |
120 | #define OCFS2_FI_FLAG_NOWAIT 0x1 | 123 | #define OCFS2_FI_FLAG_SYSFILE 0x4 |
121 | #define OCFS2_FI_FLAG_DELETE 0x2 | 124 | #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 |
122 | #define OCFS2_FI_FLAG_SYSFILE 0x4 | ||
123 | #define OCFS2_FI_FLAG_NOLOCK 0x8 | ||
124 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); | 125 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); |
125 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | ||
126 | u64 blkno, | ||
127 | int delete_vote); | ||
128 | int ocfs2_inode_init_private(struct inode *inode); | 126 | int ocfs2_inode_init_private(struct inode *inode); |
129 | int ocfs2_inode_revalidate(struct dentry *dentry); | 127 | int ocfs2_inode_revalidate(struct dentry *dentry); |
130 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | 128 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, |
@@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); | |||
144 | 142 | ||
145 | void ocfs2_set_inode_flags(struct inode *inode); | 143 | void ocfs2_set_inode_flags(struct inode *inode); |
146 | 144 | ||
145 | static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) | ||
146 | { | ||
147 | int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; | ||
148 | |||
149 | return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); | ||
150 | } | ||
151 | |||
147 | #endif /* OCFS2_INODE_H */ | 152 | #endif /* OCFS2_INODE_H */ |
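The new ocfs2_inode_sector_count() helper derives i_blocks from the allocated cluster count instead of from i_size, which matters once sparse allocation lets file size and allocation diverge. A standalone sketch of the arithmetic, with an assumed 4 KiB cluster size (the real value comes from the superblock's s_clustersize_bits):

/* Illustration only (userspace, not kernel code): the cluster-to-sector
 * shift performed by ocfs2_inode_sector_count(), assuming 4 KiB clusters
 * (s_clustersize_bits = 12) and 512-byte sectors. */
#include <stdio.h>

int main(void)
{
	unsigned int clustersize_bits = 12;		/* assumed: 4 KiB clusters */
	unsigned int c_to_s_bits = clustersize_bits - 9;	/* 512-byte sectors */
	unsigned int ip_clusters = 3;			/* assumed: 3 clusters allocated */

	/* 3 clusters * (4096 / 512) sectors per cluster = 24 sectors */
	printf("i_blocks = %u\n", ip_clusters << c_to_s_bits);
	return 0;
}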
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 825cb0ae1b4c..5a8a90d1c787 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -649,29 +649,20 @@ bail: | |||
649 | static int ocfs2_force_read_journal(struct inode *inode) | 649 | static int ocfs2_force_read_journal(struct inode *inode) |
650 | { | 650 | { |
651 | int status = 0; | 651 | int status = 0; |
652 | int i, p_blocks; | 652 | int i; |
653 | u64 v_blkno, p_blkno; | 653 | u64 v_blkno, p_blkno, p_blocks, num_blocks; |
654 | #define CONCURRENT_JOURNAL_FILL 32 | 654 | #define CONCURRENT_JOURNAL_FILL 32ULL |
655 | struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; | 655 | struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; |
656 | 656 | ||
657 | mlog_entry_void(); | 657 | mlog_entry_void(); |
658 | 658 | ||
659 | BUG_ON(inode->i_blocks != | ||
660 | ocfs2_align_bytes_to_sectors(i_size_read(inode))); | ||
661 | |||
662 | memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); | 659 | memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); |
663 | 660 | ||
664 | mlog(0, "Force reading %llu blocks\n", | 661 | num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); |
665 | (unsigned long long)(inode->i_blocks >> | ||
666 | (inode->i_sb->s_blocksize_bits - 9))); | ||
667 | |||
668 | v_blkno = 0; | 662 | v_blkno = 0; |
669 | while (v_blkno < | 663 | while (v_blkno < num_blocks) { |
670 | (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { | ||
671 | |||
672 | status = ocfs2_extent_map_get_blocks(inode, v_blkno, | 664 | status = ocfs2_extent_map_get_blocks(inode, v_blkno, |
673 | 1, &p_blkno, | 665 | &p_blkno, &p_blocks, NULL); |
674 | &p_blocks); | ||
675 | if (status < 0) { | 666 | if (status < 0) { |
676 | mlog_errno(status); | 667 | mlog_errno(status); |
677 | goto bail; | 668 | goto bail; |
@@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, | |||
1306 | continue; | 1297 | continue; |
1307 | 1298 | ||
1308 | iter = ocfs2_iget(osb, le64_to_cpu(de->inode), | 1299 | iter = ocfs2_iget(osb, le64_to_cpu(de->inode), |
1309 | OCFS2_FI_FLAG_NOLOCK); | 1300 | OCFS2_FI_FLAG_ORPHAN_RECOVERY); |
1310 | if (IS_ERR(iter)) | 1301 | if (IS_ERR(iter)) |
1311 | continue; | 1302 | continue; |
1312 | 1303 | ||
@@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
1418 | /* Set the proper information to get us going into | 1409 | /* Set the proper information to get us going into |
1419 | * ocfs2_delete_inode. */ | 1410 | * ocfs2_delete_inode. */ |
1420 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; | 1411 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; |
1421 | oi->ip_orphaned_slot = slot; | ||
1422 | spin_unlock(&oi->ip_lock); | 1412 | spin_unlock(&oi->ip_lock); |
1423 | 1413 | ||
1424 | iput(inode); | 1414 | iput(inode); |
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d026b4f27757..3db5de4506da 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, | |||
390 | /* We may be deleting metadata blocks, so metadata alloc dinode + | 390 | /* We may be deleting metadata blocks, so metadata alloc dinode + |
391 | one desc. block for each possible delete. */ | 391 | one desc. block for each possible delete. */ |
392 | if (tree_depth && next_free == 1 && | 392 | if (tree_depth && next_free == 1 && |
393 | le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) | 393 | ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) |
394 | credits += 1 + tree_depth; | 394 | credits += 1 + tree_depth; |
395 | 395 | ||
396 | /* update to the truncate log. */ | 396 | /* update to the truncate log. */ |
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 51b020447683..af01158b39f5 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | |||
85 | int ret = 0, lock_level = 0; | 85 | int ret = 0, lock_level = 0; |
86 | struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); | 86 | struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); |
87 | 87 | ||
88 | /* We don't want to support shared writable mappings yet. */ | 88 | /* |
89 | if (!ocfs2_mount_local(osb) && | 89 | * Only support shared writeable mmap for local mounts which |
90 | * don't know about holes. | ||
91 | */ | ||
92 | if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && | ||
90 | ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && | 93 | ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && |
91 | ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { | 94 | ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { |
92 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); | 95 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 28dd757ff67d..2bcf353fd7c5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, | |||
175 | 175 | ||
176 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); | 176 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); |
177 | if (IS_ERR(inode)) { | 177 | if (IS_ERR(inode)) { |
178 | mlog(ML_ERROR, "Unable to create inode %llu\n", | ||
179 | (unsigned long long)blkno); | ||
180 | ret = ERR_PTR(-EACCES); | 178 | ret = ERR_PTR(-EACCES); |
181 | goto bail_unlock; | 179 | goto bail_unlock; |
182 | } | 180 | } |
@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, | |||
189 | * unlink. */ | 187 | * unlink. */ |
190 | spin_lock(&oi->ip_lock); | 188 | spin_lock(&oi->ip_lock); |
191 | oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; | 189 | oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; |
192 | oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; | ||
193 | spin_unlock(&oi->ip_lock); | 190 | spin_unlock(&oi->ip_lock); |
194 | 191 | ||
195 | bail_add: | 192 | bail_add: |
@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb, | |||
288 | 285 | ||
289 | i_size_write(inode, inode->i_sb->s_blocksize); | 286 | i_size_write(inode, inode->i_sb->s_blocksize); |
290 | inode->i_nlink = 2; | 287 | inode->i_nlink = 2; |
291 | inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); | 288 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
292 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 289 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
293 | if (status < 0) { | 290 | if (status < 0) { |
294 | mlog_errno(status); | 291 | mlog_errno(status); |
@@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | |||
1486 | struct buffer_head **bhs = NULL; | 1483 | struct buffer_head **bhs = NULL; |
1487 | const char *c; | 1484 | const char *c; |
1488 | struct super_block *sb = osb->sb; | 1485 | struct super_block *sb = osb->sb; |
1489 | u64 p_blkno; | 1486 | u64 p_blkno, p_blocks; |
1490 | int p_blocks; | ||
1491 | int virtual, blocks, status, i, bytes_left; | 1487 | int virtual, blocks, status, i, bytes_left; |
1492 | 1488 | ||
1493 | bytes_left = i_size_read(inode) + 1; | 1489 | bytes_left = i_size_read(inode) + 1; |
@@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | |||
1514 | goto bail; | 1510 | goto bail; |
1515 | } | 1511 | } |
1516 | 1512 | ||
1517 | status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, | 1513 | status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks, |
1518 | &p_blocks); | 1514 | NULL); |
1519 | if (status < 0) { | 1515 | if (status < 0) { |
1520 | mlog_errno(status); | 1516 | mlog_errno(status); |
1521 | goto bail; | 1517 | goto bail; |
@@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir, | |||
1674 | inode->i_rdev = 0; | 1670 | inode->i_rdev = 0; |
1675 | newsize = l - 1; | 1671 | newsize = l - 1; |
1676 | if (l > ocfs2_fast_symlink_chars(sb)) { | 1672 | if (l > ocfs2_fast_symlink_chars(sb)) { |
1673 | u32 offset = 0; | ||
1674 | |||
1677 | inode->i_op = &ocfs2_symlink_inode_operations; | 1675 | inode->i_op = &ocfs2_symlink_inode_operations; |
1678 | status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, | 1676 | status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, |
1677 | new_fe_bh, | ||
1679 | handle, data_ac, NULL, | 1678 | handle, data_ac, NULL, |
1680 | NULL); | 1679 | NULL); |
1681 | if (status < 0) { | 1680 | if (status < 0) { |
@@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir, | |||
1689 | goto bail; | 1688 | goto bail; |
1690 | } | 1689 | } |
1691 | i_size_write(inode, newsize); | 1690 | i_size_write(inode, newsize); |
1692 | inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); | 1691 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
1693 | } else { | 1692 | } else { |
1694 | inode->i_op = &ocfs2_fast_symlink_inode_operations; | 1693 | inode->i_op = &ocfs2_fast_symlink_inode_operations; |
1695 | memcpy((char *) fe->id2.i_symlink, symname, l); | 1694 | memcpy((char *) fe->id2.i_symlink, symname, l); |
@@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, | |||
2222 | /* Record which orphan dir our inode now resides | 2221 | /* Record which orphan dir our inode now resides |
2223 | * in. delete_inode will use this to determine which orphan | 2222 | * in. delete_inode will use this to determine which orphan |
2224 | * dir to lock. */ | 2223 | * dir to lock. */ |
2225 | spin_lock(&OCFS2_I(inode)->ip_lock); | 2224 | fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); |
2226 | OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; | ||
2227 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
2228 | 2225 | ||
2229 | mlog(0, "Inode %llu orphaned in slot %d\n", | 2226 | mlog(0, "Inode %llu orphaned in slot %d\n", |
2230 | (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); | 2227 | (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index db8e77cd35d3..82cc92dcf8a6 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -46,11 +46,6 @@ | |||
46 | #include "endian.h" | 46 | #include "endian.h" |
47 | #include "ocfs2_lockid.h" | 47 | #include "ocfs2_lockid.h" |
48 | 48 | ||
49 | struct ocfs2_extent_map { | ||
50 | u32 em_clusters; | ||
51 | struct rb_root em_extents; | ||
52 | }; | ||
53 | |||
54 | /* Most user visible OCFS2 inodes will have very few pieces of | 49 | /* Most user visible OCFS2 inodes will have very few pieces of |
55 | * metadata, but larger files (including bitmaps, etc) must be taken | 50 | * metadata, but larger files (including bitmaps, etc) must be taken |
56 | * into account when designing an access scheme. We allow a small | 51 | * into account when designing an access scheme. We allow a small |
@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode) | |||
303 | return 1; | 298 | return 1; |
304 | } | 299 | } |
305 | 300 | ||
301 | static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) | ||
302 | { | ||
303 | if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) | ||
304 | return 1; | ||
305 | return 0; | ||
306 | } | ||
307 | |||
306 | /* set / clear functions because cluster events can make these happen | 308 | /* set / clear functions because cluster events can make these happen |
307 | * in parallel so we want the transitions to be atomic. this also | 309 | * in parallel so we want the transitions to be atomic. this also |
308 | * means that any future flags osb_flags must be protected by spinlock | 310 | * means that any future flags osb_flags must be protected by spinlock |
@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) | |||
461 | return (unsigned long)((bytes + 511) >> 9); | 463 | return (unsigned long)((bytes + 511) >> 9); |
462 | } | 464 | } |
463 | 465 | ||
466 | static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, | ||
467 | unsigned long pg_index) | ||
468 | { | ||
469 | u32 clusters = pg_index; | ||
470 | unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; | ||
471 | |||
472 | if (unlikely(PAGE_CACHE_SHIFT > cbits)) | ||
473 | clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); | ||
474 | else if (PAGE_CACHE_SHIFT < cbits) | ||
475 | clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); | ||
476 | |||
477 | return clusters; | ||
478 | } | ||
479 | |||
480 | /* | ||
481 | * Find the 1st page index which covers the given clusters. | ||
482 | */ | ||
483 | static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb, | ||
484 | u32 clusters) | ||
485 | { | ||
486 | unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; | ||
487 | unsigned long index = clusters; | ||
488 | |||
489 | if (PAGE_CACHE_SHIFT > cbits) { | ||
490 | index = clusters >> (PAGE_CACHE_SHIFT - cbits); | ||
491 | } else if (PAGE_CACHE_SHIFT < cbits) { | ||
492 | index = clusters << (cbits - PAGE_CACHE_SHIFT); | ||
493 | } | ||
494 | |||
495 | return index; | ||
496 | } | ||
497 | |||
498 | static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) | ||
499 | { | ||
500 | unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; | ||
501 | unsigned int pages_per_cluster = 1; | ||
502 | |||
503 | if (PAGE_CACHE_SHIFT < cbits) | ||
504 | pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); | ||
505 | |||
506 | return pages_per_cluster; | ||
507 | } | ||
508 | |||
464 | #define ocfs2_set_bit ext2_set_bit | 509 | #define ocfs2_set_bit ext2_set_bit |
465 | #define ocfs2_clear_bit ext2_clear_bit | 510 | #define ocfs2_clear_bit ext2_clear_bit |
466 | #define ocfs2_test_bit ext2_test_bit | 511 | #define ocfs2_test_bit ext2_test_bit |
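The three helpers added to ocfs2.h convert between page-cache indices and cluster numbers; the branches cover both filesystems whose clusters are smaller than a page and those whose clusters span several pages. A standalone sketch of the same shift arithmetic with assumed sizes (4 KiB pages, 8 KiB clusters, i.e. two pages per cluster):

/* Illustration only (userspace, not kernel code) of the math in
 * ocfs2_page_index_to_clusters() and ocfs2_align_clusters_to_page_index(),
 * assuming PAGE_CACHE_SHIFT = 12 and s_clustersize_bits = 13. */
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12, cbits = 13;	/* assumed sizes */
	unsigned long pg_index = 5;

	/* page index -> first cluster that the page touches */
	unsigned int cluster = (page_shift > cbits)
		? (unsigned int)(pg_index << (page_shift - cbits))
		: (unsigned int)(pg_index >> (cbits - page_shift));

	/* cluster -> first page index covering that cluster */
	unsigned long first_page = (page_shift > cbits)
		? (unsigned long)cluster >> (page_shift - cbits)
		: (unsigned long)cluster << (cbits - page_shift);

	/* Prints: page 5 maps to cluster 2, which starts at page 4 */
	printf("page %lu maps to cluster %u, which starts at page %lu\n",
	       pg_index, cluster, first_page);
	return 0;
}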
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e61e218f5e0b..71306479c68f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h | |||
@@ -86,7 +86,8 @@ | |||
86 | OCFS2_SB(sb)->s_feature_incompat &= ~(mask) | 86 | OCFS2_SB(sb)->s_feature_incompat &= ~(mask) |
87 | 87 | ||
88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB | 88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB |
89 | #define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT | 89 | #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ |
90 | | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) | ||
90 | #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 | 91 | #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 |
91 | 92 | ||
92 | /* | 93 | /* |
@@ -155,6 +156,12 @@ | |||
155 | #define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ | 156 | #define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ |
156 | 157 | ||
157 | /* | 158 | /* |
159 | * Extent record flags (e_node.leaf.flags) | ||
160 | */ | ||
161 | #define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but | ||
162 | * unwritten */ | ||
163 | |||
164 | /* | ||
158 | * ioctl commands | 165 | * ioctl commands |
159 | */ | 166 | */ |
160 | #define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) | 167 | #define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) |
@@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { | |||
282 | /* | 289 | /* |
283 | * On disk extent record for OCFS2 | 290 | * On disk extent record for OCFS2 |
284 | * It describes a range of clusters on disk. | 291 | * It describes a range of clusters on disk. |
292 | * | ||
293 | * Length fields are divided into interior and leaf node versions. | ||
294 | * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. | ||
285 | */ | 295 | */ |
286 | struct ocfs2_extent_rec { | 296 | struct ocfs2_extent_rec { |
287 | /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ | 297 | /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ |
288 | __le32 e_clusters; /* Clusters covered by this extent */ | 298 | union { |
299 | __le32 e_int_clusters; /* Clusters covered by all children */ | ||
300 | struct { | ||
301 | __le16 e_leaf_clusters; /* Clusters covered by this | ||
302 | extent */ | ||
303 | __u8 e_reserved1; | ||
304 | __u8 e_flags; /* Extent flags */ | ||
305 | }; | ||
306 | }; | ||
289 | __le64 e_blkno; /* Physical disk offset, in blocks */ | 307 | __le64 e_blkno; /* Physical disk offset, in blocks */ |
290 | /*10*/ | 308 | /*10*/ |
291 | }; | 309 | }; |
@@ -311,7 +329,10 @@ struct ocfs2_extent_list { | |||
311 | /*00*/ __le16 l_tree_depth; /* Extent tree depth from this | 329 | /*00*/ __le16 l_tree_depth; /* Extent tree depth from this |
312 | point. 0 means data extents | 330 | point. 0 means data extents |
313 | hang directly off this | 331 | hang directly off this |
314 | header (a leaf) */ | 332 | header (a leaf) |
333 | NOTE: The high 8 bits cannot be | ||
334 | used - tree_depth is never that big. | ||
335 | */ | ||
315 | __le16 l_count; /* Number of extent records */ | 336 | __le16 l_count; /* Number of extent records */ |
316 | __le16 l_next_free_rec; /* Next unused extent slot */ | 337 | __le16 l_next_free_rec; /* Next unused extent slot */ |
317 | __le16 l_reserved1; | 338 | __le16 l_reserved1; |
@@ -446,7 +467,9 @@ struct ocfs2_dinode { | |||
446 | __le32 i_ctime_nsec; | 467 | __le32 i_ctime_nsec; |
447 | __le32 i_mtime_nsec; | 468 | __le32 i_mtime_nsec; |
448 | __le32 i_attr; | 469 | __le32 i_attr; |
449 | __le32 i_reserved1; | 470 | __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL |
471 | was set in i_flags */ | ||
472 | __le16 i_reserved1; | ||
450 | /*70*/ __le64 i_reserved2[8]; | 473 | /*70*/ __le64 i_reserved2[8]; |
451 | /*B8*/ union { | 474 | /*B8*/ union { |
452 | __le64 i_pad1; /* Generic way to refer to this | 475 | __le64 i_pad1; /* Generic way to refer to this |
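The extent record change above is the on-disk core of the sparse-file work: interior tree nodes keep a 32-bit e_int_clusters count, while leaf records shrink the count to 16 bits to make room for per-extent flags such as OCFS2_EXT_UNWRITTEN. The ocfs2_rec_clusters() helper used in the journal.h hunk presumably dispatches on the tree depth; its body is not shown here, so the following sketch is an assumption about what such a reader looks like:

/* Sketch only: reading a record's cluster count depending on whether the
 * containing extent list is an interior node or a leaf. */
static inline u32 example_rec_clusters(struct ocfs2_extent_list *el,
					struct ocfs2_extent_rec *rec)
{
	if (el->l_tree_depth)				/* interior node */
		return le32_to_cpu(rec->e_int_clusters);
	return le16_to_cpu(rec->e_leaf_clusters);	/* leaf record */
}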
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 4d5d5655c185..4ca02b1c38ac 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h | |||
@@ -44,6 +44,7 @@ enum ocfs2_lock_type { | |||
44 | OCFS2_LOCK_TYPE_RENAME, | 44 | OCFS2_LOCK_TYPE_RENAME, |
45 | OCFS2_LOCK_TYPE_RW, | 45 | OCFS2_LOCK_TYPE_RW, |
46 | OCFS2_LOCK_TYPE_DENTRY, | 46 | OCFS2_LOCK_TYPE_DENTRY, |
47 | OCFS2_LOCK_TYPE_OPEN, | ||
47 | OCFS2_NUM_LOCK_TYPES | 48 | OCFS2_NUM_LOCK_TYPES |
48 | }; | 49 | }; |
49 | 50 | ||
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) | |||
69 | case OCFS2_LOCK_TYPE_DENTRY: | 70 | case OCFS2_LOCK_TYPE_DENTRY: |
70 | c = 'N'; | 71 | c = 'N'; |
71 | break; | 72 | break; |
73 | case OCFS2_LOCK_TYPE_OPEN: | ||
74 | c = 'O'; | ||
75 | break; | ||
72 | default: | 76 | default: |
73 | c = '\0'; | 77 | c = '\0'; |
74 | } | 78 | } |
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = { | |||
85 | * important job it does, anyway. */ | 89 | * important job it does, anyway. */ |
86 | [OCFS2_LOCK_TYPE_RW] = "Write/Read", | 90 | [OCFS2_LOCK_TYPE_RW] = "Write/Read", |
87 | [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", | 91 | [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", |
92 | [OCFS2_LOCK_TYPE_OPEN] = "Open", | ||
88 | }; | 93 | }; |
89 | 94 | ||
90 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) | 95 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 2d3ac32cb74e..d921a28329dc 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
197 | goto bail; | 197 | goto bail; |
198 | } | 198 | } |
199 | 199 | ||
200 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); | 200 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); |
201 | if (status < 0) { | 201 | if (status < 0) { |
202 | mlog_errno(status); | 202 | mlog_errno(status); |
203 | goto bail; | 203 | goto bail; |
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6dbb11762759..0da655ae5d6f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, | |||
381 | le32_to_cpu(fe->i_clusters))); | 381 | le32_to_cpu(fe->i_clusters))); |
382 | spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); | 382 | spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); |
383 | i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); | 383 | i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); |
384 | alloc_inode->i_blocks = | 384 | alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); |
385 | ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); | ||
386 | 385 | ||
387 | status = 0; | 386 | status = 0; |
388 | bail: | 387 | bail: |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6534f92424dd..5c9e8243691f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void) | |||
806 | 806 | ||
807 | ocfs2_print_version(); | 807 | ocfs2_print_version(); |
808 | 808 | ||
809 | if (init_ocfs2_extent_maps()) | ||
810 | return -ENOMEM; | ||
811 | |||
812 | status = init_ocfs2_uptodate_cache(); | 809 | status = init_ocfs2_uptodate_cache(); |
813 | if (status < 0) { | 810 | if (status < 0) { |
814 | mlog_errno(status); | 811 | mlog_errno(status); |
@@ -837,7 +834,6 @@ leave: | |||
837 | if (status < 0) { | 834 | if (status < 0) { |
838 | ocfs2_free_mem_caches(); | 835 | ocfs2_free_mem_caches(); |
839 | exit_ocfs2_uptodate_cache(); | 836 | exit_ocfs2_uptodate_cache(); |
840 | exit_ocfs2_extent_maps(); | ||
841 | } | 837 | } |
842 | 838 | ||
843 | mlog_exit(status); | 839 | mlog_exit(status); |
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void) | |||
863 | 859 | ||
864 | unregister_filesystem(&ocfs2_fs_type); | 860 | unregister_filesystem(&ocfs2_fs_type); |
865 | 861 | ||
866 | exit_ocfs2_extent_maps(); | ||
867 | |||
868 | exit_ocfs2_uptodate_cache(); | 862 | exit_ocfs2_uptodate_cache(); |
869 | 863 | ||
870 | mlog_exit_void(); | 864 | mlog_exit_void(); |
@@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data, | |||
963 | ocfs2_lock_res_init_once(&oi->ip_rw_lockres); | 957 | ocfs2_lock_res_init_once(&oi->ip_rw_lockres); |
964 | ocfs2_lock_res_init_once(&oi->ip_meta_lockres); | 958 | ocfs2_lock_res_init_once(&oi->ip_meta_lockres); |
965 | ocfs2_lock_res_init_once(&oi->ip_data_lockres); | 959 | ocfs2_lock_res_init_once(&oi->ip_data_lockres); |
960 | ocfs2_lock_res_init_once(&oi->ip_open_lockres); | ||
966 | 961 | ||
967 | ocfs2_metadata_cache_init(&oi->vfs_inode); | 962 | ocfs2_metadata_cache_init(&oi->vfs_inode); |
968 | 963 | ||
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index f30e63b9910c..4f82a2f0efef 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c | |||
@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr | |||
63 | __be32 h_node_num; /* node sending this particular message. */ | 63 | __be32 h_node_num; /* node sending this particular message. */ |
64 | }; | 64 | }; |
65 | 65 | ||
66 | /* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this | ||
67 | * for the network. */ | ||
68 | #define OCFS2_VOTE_FILENAME_LEN 256 | ||
69 | struct ocfs2_vote_msg | 66 | struct ocfs2_vote_msg |
70 | { | 67 | { |
71 | struct ocfs2_msg_hdr v_hdr; | 68 | struct ocfs2_msg_hdr v_hdr; |
72 | union { | 69 | __be32 v_reserved1; |
73 | __be32 v_generic1; | ||
74 | __be32 v_orphaned_slot; /* Used during delete votes */ | ||
75 | __be32 v_nlink; /* Used during unlink votes */ | ||
76 | } md1; /* Message type dependant 1 */ | ||
77 | }; | 70 | }; |
78 | 71 | ||
79 | /* Responses are given these values to maintain backwards | 72 | /* Responses are given these values to maintain backwards |
@@ -86,7 +79,6 @@ struct ocfs2_response_msg | |||
86 | { | 79 | { |
87 | struct ocfs2_msg_hdr r_hdr; | 80 | struct ocfs2_msg_hdr r_hdr; |
88 | __be32 r_response; | 81 | __be32 r_response; |
89 | __be32 r_orphaned_slot; | ||
90 | }; | 82 | }; |
91 | 83 | ||
92 | struct ocfs2_vote_work { | 84 | struct ocfs2_vote_work { |
@@ -96,7 +88,6 @@ struct ocfs2_vote_work { | |||
96 | 88 | ||
97 | enum ocfs2_vote_request { | 89 | enum ocfs2_vote_request { |
98 | OCFS2_VOTE_REQ_INVALID = 0, | 90 | OCFS2_VOTE_REQ_INVALID = 0, |
99 | OCFS2_VOTE_REQ_DELETE, | ||
100 | OCFS2_VOTE_REQ_MOUNT, | 91 | OCFS2_VOTE_REQ_MOUNT, |
101 | OCFS2_VOTE_REQ_UMOUNT, | 92 | OCFS2_VOTE_REQ_UMOUNT, |
102 | OCFS2_VOTE_REQ_LAST | 93 | OCFS2_VOTE_REQ_LAST |
@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb, | |||
151 | ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); | 142 | ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); |
152 | } | 143 | } |
153 | 144 | ||
154 | void ocfs2_mark_inode_remotely_deleted(struct inode *inode) | ||
155 | { | ||
156 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
157 | |||
158 | assert_spin_locked(&oi->ip_lock); | ||
159 | /* We set the SKIP_DELETE flag on the inode so we don't try to | ||
160 | * delete it in delete_inode ourselves, thus avoiding | ||
161 | * unecessary lock pinging. If the other node failed to wipe | ||
162 | * the inode as a result of a crash, then recovery will pick | ||
163 | * up the slack. */ | ||
164 | oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; | ||
165 | } | ||
166 | |||
167 | static int ocfs2_process_delete_request(struct inode *inode, | ||
168 | int *orphaned_slot) | ||
169 | { | ||
170 | int response = OCFS2_RESPONSE_BUSY; | ||
171 | |||
172 | mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", | ||
173 | inode->i_ino, inode->i_nlink, *orphaned_slot); | ||
174 | |||
175 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
176 | |||
177 | /* Whatever our vote response is, we want to make sure that | ||
178 | * the orphaned slot is recorded properly on this node *and* | ||
179 | * on the requesting node. Technically, if the requesting node | ||
180 | * did not know which slot the inode is orphaned in but we | ||
181 | * respond with BUSY he doesn't actually need the orphaned | ||
182 | * slot, but it doesn't hurt to do it here anyway. */ | ||
183 | if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { | ||
184 | mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != | ||
185 | OCFS2_INVALID_SLOT && | ||
186 | OCFS2_I(inode)->ip_orphaned_slot != | ||
187 | (*orphaned_slot), | ||
188 | "Inode %llu: This node thinks it's " | ||
189 | "orphaned in slot %d, messaged it's in %d\n", | ||
190 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
191 | OCFS2_I(inode)->ip_orphaned_slot, | ||
192 | *orphaned_slot); | ||
193 | |||
194 | mlog(0, "Setting orphaned slot for inode %llu to %d\n", | ||
195 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
196 | *orphaned_slot); | ||
197 | |||
198 | OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; | ||
199 | } else { | ||
200 | mlog(0, "Sending back orphaned slot %d for inode %llu\n", | ||
201 | OCFS2_I(inode)->ip_orphaned_slot, | ||
202 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
203 | |||
204 | *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
205 | } | ||
206 | |||
207 | /* vote no if the file is still open. */ | ||
208 | if (OCFS2_I(inode)->ip_open_count) { | ||
209 | mlog(0, "open count = %u\n", | ||
210 | OCFS2_I(inode)->ip_open_count); | ||
211 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
212 | goto done; | ||
213 | } | ||
214 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
215 | |||
216 | /* directories are a bit ugly... What if someone is sitting in | ||
217 | * it? We want to make sure the inode is removed completely as | ||
218 | * a result of the iput in process_vote. */ | ||
219 | if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { | ||
220 | mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); | ||
221 | goto done; | ||
222 | } | ||
223 | |||
224 | if (filemap_fdatawrite(inode->i_mapping)) { | ||
225 | mlog(ML_ERROR, "Could not sync inode %llu for delete!\n", | ||
226 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
227 | goto done; | ||
228 | } | ||
229 | sync_mapping_buffers(inode->i_mapping); | ||
230 | truncate_inode_pages(inode->i_mapping, 0); | ||
231 | ocfs2_extent_map_trunc(inode, 0); | ||
232 | |||
233 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
234 | /* double check open count - someone might have raced this | ||
235 | * thread into ocfs2_file_open while we were writing out | ||
236 | * data. If we're to allow a wipe of this inode now, we *must* | ||
237 | * hold the spinlock until we've marked it. */ | ||
238 | if (OCFS2_I(inode)->ip_open_count) { | ||
239 | mlog(0, "Raced to wipe! open count = %u\n", | ||
240 | OCFS2_I(inode)->ip_open_count); | ||
241 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
242 | goto done; | ||
243 | } | ||
244 | |||
245 | /* Mark the inode as being wiped from disk. */ | ||
246 | ocfs2_mark_inode_remotely_deleted(inode); | ||
247 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
248 | |||
249 | /* Not sure this is necessary anymore. */ | ||
250 | d_prune_aliases(inode); | ||
251 | |||
252 | /* If we get here, then we're voting 'yes', so commit the | ||
253 | * delete on our side. */ | ||
254 | response = OCFS2_RESPONSE_OK; | ||
255 | done: | ||
256 | return response; | ||
257 | } | ||
258 | |||
259 | static void ocfs2_process_vote(struct ocfs2_super *osb, | 145 | static void ocfs2_process_vote(struct ocfs2_super *osb, |
260 | struct ocfs2_vote_msg *msg) | 146 | struct ocfs2_vote_msg *msg) |
261 | { | 147 | { |
262 | int net_status, vote_response; | 148 | int net_status, vote_response; |
263 | int orphaned_slot = 0; | 149 | unsigned int node_num; |
264 | unsigned int node_num, generation; | ||
265 | u64 blkno; | 150 | u64 blkno; |
266 | enum ocfs2_vote_request request; | 151 | enum ocfs2_vote_request request; |
267 | struct inode *inode = NULL; | ||
268 | struct ocfs2_msg_hdr *hdr = &msg->v_hdr; | 152 | struct ocfs2_msg_hdr *hdr = &msg->v_hdr; |
269 | struct ocfs2_response_msg response; | 153 | struct ocfs2_response_msg response; |
270 | 154 | ||
271 | /* decode the network mumbo jumbo into local variables. */ | 155 | /* decode the network mumbo jumbo into local variables. */ |
272 | request = be32_to_cpu(hdr->h_request); | 156 | request = be32_to_cpu(hdr->h_request); |
273 | blkno = be64_to_cpu(hdr->h_blkno); | 157 | blkno = be64_to_cpu(hdr->h_blkno); |
274 | generation = be32_to_cpu(hdr->h_generation); | ||
275 | node_num = be32_to_cpu(hdr->h_node_num); | 158 | node_num = be32_to_cpu(hdr->h_node_num); |
276 | if (request == OCFS2_VOTE_REQ_DELETE) | ||
277 | orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); | ||
278 | 159 | ||
279 | mlog(0, "processing vote: request = %u, blkno = %llu, " | 160 | mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n", |
280 | "generation = %u, node_num = %u, priv1 = %u\n", request, | 161 | request, (unsigned long long)blkno, node_num); |
281 | (unsigned long long)blkno, generation, node_num, | ||
282 | be32_to_cpu(msg->md1.v_generic1)); | ||
283 | 162 | ||
284 | if (!ocfs2_is_valid_vote_request(request)) { | 163 | if (!ocfs2_is_valid_vote_request(request)) { |
285 | mlog(ML_ERROR, "Invalid vote request %d from node %u\n", | 164 | mlog(ML_ERROR, "Invalid vote request %d from node %u\n", |
@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb, | |||
302 | break; | 181 | break; |
303 | } | 182 | } |
304 | 183 | ||
305 | /* We cannot process the remaining message types before we're | ||
306 | * fully mounted. It's perfectly safe however to send a 'yes' | ||
307 | * response as we can't possibly have any of the state they're | ||
308 | * asking us to modify yet. */ | ||
309 | if (atomic_read(&osb->vol_state) == VOLUME_INIT) | ||
310 | goto respond; | ||
311 | |||
312 | /* If we get here, then the request is against an inode. */ | ||
313 | inode = ocfs2_ilookup_for_vote(osb, blkno, | ||
314 | request == OCFS2_VOTE_REQ_DELETE); | ||
315 | |||
316 | /* Not finding the inode is perfectly valid - it means we're | ||
317 | * not interested in what the other node is about to do to it | ||
318 | * so in those cases we automatically respond with an | ||
319 | * affirmative. Cluster locking ensures that we won't race | ||
320 | * interest in the inode with this vote request. */ | ||
321 | if (!inode) | ||
322 | goto respond; | ||
323 | |||
324 | /* Check generation values. It's possible for us to get a | ||
325 | * request against a stale inode. If so then we proceed as if | ||
326 | * we had not found an inode in the first place. */ | ||
327 | if (inode->i_generation != generation) { | ||
328 | mlog(0, "generation passed %u != inode generation = %u, " | ||
329 | "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, " | ||
330 | "message type = %u\n", generation, inode->i_generation, | ||
331 | OCFS2_I(inode)->ip_flags, | ||
332 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
333 | (unsigned long long)blkno, atomic_read(&inode->i_count), | ||
334 | request); | ||
335 | iput(inode); | ||
336 | inode = NULL; | ||
337 | goto respond; | ||
338 | } | ||
339 | |||
340 | switch (request) { | ||
341 | case OCFS2_VOTE_REQ_DELETE: | ||
342 | vote_response = ocfs2_process_delete_request(inode, | ||
343 | &orphaned_slot); | ||
344 | break; | ||
345 | default: | ||
346 | mlog(ML_ERROR, "node %u, invalid request: %u\n", | ||
347 | node_num, request); | ||
348 | vote_response = OCFS2_RESPONSE_BAD_MSG; | ||
349 | } | ||
350 | |||
351 | respond: | 184 | respond: |
352 | /* Response structure is small so we just put it on the stack | 185 |
353 | * and stuff it inline. */ | 186 | * and stuff it inline. */ |
@@ -357,7 +190,6 @@ respond: | |||
357 | response.r_hdr.h_generation = hdr->h_generation; | 190 | response.r_hdr.h_generation = hdr->h_generation; |
358 | response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); | 191 | response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); |
359 | response.r_response = cpu_to_be32(vote_response); | 192 | response.r_response = cpu_to_be32(vote_response); |
360 | response.r_orphaned_slot = cpu_to_be32(orphaned_slot); | ||
361 | 193 | ||
362 | net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, | 194 | net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, |
363 | osb->net_key, | 195 | osb->net_key, |
@@ -373,9 +205,6 @@ respond: | |||
373 | && net_status != -ENOTCONN) | 205 | && net_status != -ENOTCONN) |
374 | mlog(ML_ERROR, "message to node %u fails with error %d!\n", | 206 | mlog(ML_ERROR, "message to node %u fails with error %d!\n", |
375 | node_num, net_status); | 207 | node_num, net_status); |
376 | |||
377 | if (inode) | ||
378 | iput(inode); | ||
379 | } | 208 | } |
380 | 209 | ||
381 | static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) | 210 | static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) |
@@ -634,8 +463,7 @@ bail: | |||
634 | static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, | 463 | static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, |
635 | u64 blkno, | 464 | u64 blkno, |
636 | unsigned int generation, | 465 | unsigned int generation, |
637 | enum ocfs2_vote_request type, | 466 | enum ocfs2_vote_request type) |
638 | u32 priv) | ||
639 | { | 467 | { |
640 | struct ocfs2_vote_msg *request; | 468 | struct ocfs2_vote_msg *request; |
641 | struct ocfs2_msg_hdr *hdr; | 469 | struct ocfs2_msg_hdr *hdr; |
@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, | |||
651 | hdr->h_request = cpu_to_be32(type); | 479 | hdr->h_request = cpu_to_be32(type); |
652 | hdr->h_blkno = cpu_to_be64(blkno); | 480 | hdr->h_blkno = cpu_to_be64(blkno); |
653 | hdr->h_generation = cpu_to_be32(generation); | 481 | hdr->h_generation = cpu_to_be32(generation); |
654 | |||
655 | request->md1.v_generic1 = cpu_to_be32(priv); | ||
656 | } | 482 | } |
657 | 483 | ||
658 | return request; | 484 | return request; |
@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb, | |||
664 | struct ocfs2_vote_msg *request, | 490 | struct ocfs2_vote_msg *request, |
665 | struct ocfs2_net_response_cb *callback) | 491 | struct ocfs2_net_response_cb *callback) |
666 | { | 492 | { |
667 | int status, response; | 493 | int status, response = -EBUSY; |
668 | unsigned int response_id; | 494 | unsigned int response_id; |
669 | struct ocfs2_msg_hdr *hdr; | 495 | struct ocfs2_msg_hdr *hdr; |
670 | 496 | ||
@@ -686,109 +512,12 @@ bail: | |||
686 | return status; | 512 | return status; |
687 | } | 513 | } |
688 | 514 | ||
689 | static int ocfs2_request_vote(struct inode *inode, | ||
690 | struct ocfs2_vote_msg *request, | ||
691 | struct ocfs2_net_response_cb *callback) | ||
692 | { | ||
693 | int status; | ||
694 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
695 | |||
696 | if (ocfs2_inode_is_new(inode)) | ||
697 | return 0; | ||
698 | |||
699 | status = -EAGAIN; | ||
700 | while (status == -EAGAIN) { | ||
701 | if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && | ||
702 | signal_pending(current)) | ||
703 | return -ERESTARTSYS; | ||
704 | |||
705 | status = ocfs2_super_lock(osb, 0); | ||
706 | if (status < 0) { | ||
707 | mlog_errno(status); | ||
708 | break; | ||
709 | } | ||
710 | |||
711 | status = 0; | ||
712 | if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, | ||
713 | osb->node_num)) | ||
714 | status = ocfs2_do_request_vote(osb, request, callback); | ||
715 | |||
716 | ocfs2_super_unlock(osb, 0); | ||
717 | } | ||
718 | return status; | ||
719 | } | ||
720 | |||
721 | static void ocfs2_delete_response_cb(void *priv, | ||
722 | struct ocfs2_response_msg *resp) | ||
723 | { | ||
724 | int orphaned_slot, node; | ||
725 | struct inode *inode = priv; | ||
726 | |||
727 | orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); | ||
728 | node = be32_to_cpu(resp->r_hdr.h_node_num); | ||
729 | mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n", | ||
730 | node, (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
731 | orphaned_slot); | ||
732 | |||
733 | /* The other node may not actually know which slot the inode | ||
734 | * is orphaned in. */ | ||
735 | if (orphaned_slot == OCFS2_INVALID_SLOT) | ||
736 | return; | ||
737 | |||
738 | /* Ok, the responding node knows which slot this inode is | ||
739 | * orphaned in. We verify that the information is correct and | ||
740 | * then record this in the inode. ocfs2_delete_inode will use | ||
741 | * this information to determine which lock to take. */ | ||
742 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
743 | mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && | ||
744 | OCFS2_I(inode)->ip_orphaned_slot | ||
745 | != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's " | ||
746 | "orphaned in slot %d, we think it's in %d\n", | ||
747 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
748 | be32_to_cpu(resp->r_hdr.h_node_num), | ||
749 | orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); | ||
750 | |||
751 | OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; | ||
752 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
753 | } | ||
754 | |||
755 | int ocfs2_request_delete_vote(struct inode *inode) | ||
756 | { | ||
757 | int orphaned_slot, status; | ||
758 | struct ocfs2_net_response_cb delete_cb; | ||
759 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
760 | struct ocfs2_vote_msg *request; | ||
761 | |||
762 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
763 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
764 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
765 | |||
766 | delete_cb.rc_cb = ocfs2_delete_response_cb; | ||
767 | delete_cb.rc_priv = inode; | ||
768 | |||
769 | mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n", | ||
770 | (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot); | ||
771 | |||
772 | status = -ENOMEM; | ||
773 | request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, | ||
774 | inode->i_generation, | ||
775 | OCFS2_VOTE_REQ_DELETE, orphaned_slot); | ||
776 | if (request) { | ||
777 | status = ocfs2_request_vote(inode, request, &delete_cb); | ||
778 | |||
779 | kfree(request); | ||
780 | } | ||
781 | |||
782 | return status; | ||
783 | } | ||
784 | |||
785 | int ocfs2_request_mount_vote(struct ocfs2_super *osb) | 515 | int ocfs2_request_mount_vote(struct ocfs2_super *osb) |
786 | { | 516 | { |
787 | int status; | 517 | int status; |
788 | struct ocfs2_vote_msg *request = NULL; | 518 | struct ocfs2_vote_msg *request = NULL; |
789 | 519 | ||
790 | request = ocfs2_new_vote_request(osb, 0ULL, 0, | 520 | request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT); |
791 | OCFS2_VOTE_REQ_MOUNT, 0); | ||
792 | if (!request) { | 521 | if (!request) { |
793 | status = -ENOMEM; | 522 | status = -ENOMEM; |
794 | goto bail; | 523 | goto bail; |
@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb) | |||
821 | int status; | 550 | int status; |
822 | struct ocfs2_vote_msg *request = NULL; | 551 | struct ocfs2_vote_msg *request = NULL; |
823 | 552 | ||
824 | request = ocfs2_new_vote_request(osb, 0ULL, 0, | 553 | request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT); |
825 | OCFS2_VOTE_REQ_UMOUNT, 0); | ||
826 | if (!request) { | 554 | if (!request) { |
827 | status = -ENOMEM; | 555 | status = -ENOMEM; |
828 | goto bail; | 556 | goto bail; |
@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg, | |||
969 | be32_to_cpu(work->w_msg.v_hdr.h_generation)); | 697 | be32_to_cpu(work->w_msg.v_hdr.h_generation)); |
970 | mlog(0, "h_node_num = %u\n", | 698 | mlog(0, "h_node_num = %u\n", |
971 | be32_to_cpu(work->w_msg.v_hdr.h_node_num)); | 699 | be32_to_cpu(work->w_msg.v_hdr.h_node_num)); |
972 | mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); | ||
973 | 700 | ||
974 | spin_lock(&osb->vote_task_lock); | 701 | spin_lock(&osb->vote_task_lock); |
975 | list_add_tail(&work->w_list, &osb->vote_list); | 702 | list_add_tail(&work->w_list, &osb->vote_list); |
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h index 53ebc1c69e56..9ea46f62de31 100644 --- a/fs/ocfs2/vote.h +++ b/fs/ocfs2/vote.h | |||
@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) | |||
38 | wake_up(&osb->vote_event); | 38 | wake_up(&osb->vote_event); |
39 | } | 39 | } |
40 | 40 | ||
41 | int ocfs2_request_delete_vote(struct inode *inode); | ||
42 | int ocfs2_request_mount_vote(struct ocfs2_super *osb); | 41 | int ocfs2_request_mount_vote(struct ocfs2_super *osb); |
43 | int ocfs2_request_umount_vote(struct ocfs2_super *osb); | 42 | int ocfs2_request_umount_vote(struct ocfs2_super *osb); |
44 | int ocfs2_register_net_handlers(struct ocfs2_super *osb); | 43 | int ocfs2_register_net_handlers(struct ocfs2_super *osb); |
45 | void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); | 44 | void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); |
46 | 45 | ||
47 | void ocfs2_mark_inode_remotely_deleted(struct inode *inode); | ||
48 | |||
49 | void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, | 46 | void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, |
50 | int node_num); | 47 | int node_num); |
51 | #endif | 48 | #endif |
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
@@ -239,13 +239,11 @@ out: | |||
239 | /* | 239 | /* |
240 | * `endbyte' is inclusive | 240 | * `endbyte' is inclusive |
241 | */ | 241 | */ |
242 | int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | 242 | int do_sync_mapping_range(struct address_space *mapping, loff_t offset, |
243 | unsigned int flags) | 243 | loff_t endbyte, unsigned int flags) |
244 | { | 244 | { |
245 | int ret; | 245 | int ret; |
246 | struct address_space *mapping; | ||
247 | 246 | ||
248 | mapping = file->f_mapping; | ||
249 | if (!mapping) { | 247 | if (!mapping) { |
250 | ret = -EINVAL; | 248 | ret = -EINVAL; |
251 | goto out; | 249 | goto out; |
@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | |||
275 | out: | 273 | out: |
276 | return ret; | 274 | return ret; |
277 | } | 275 | } |
278 | EXPORT_SYMBOL_GPL(do_sync_file_range); | 276 | EXPORT_SYMBOL_GPL(do_sync_mapping_range); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 86ec3f4a7da6..095a9c9a64fb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); | |||
843 | extern int fcntl_getlease(struct file *filp); | 843 | extern int fcntl_getlease(struct file *filp); |
844 | 844 | ||
845 | /* fs/sync.c */ | 845 | /* fs/sync.c */ |
846 | extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | 846 | extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, |
847 | unsigned int flags); | 847 | loff_t endbyte, unsigned int flags); |
848 | static inline int do_sync_file_range(struct file *file, loff_t offset, | ||
849 | loff_t endbyte, unsigned int flags) | ||
850 | { | ||
851 | return do_sync_mapping_range(file->f_mapping, offset, endbyte, flags); | ||
852 | } | ||
848 | 853 | ||
849 | /* fs/locks.c */ | 854 | /* fs/locks.c */ |
850 | extern void locks_init_lock(struct file_lock *); | 855 | extern void locks_init_lock(struct file_lock *); |
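The sync_file_range() refactor moves the real work down to an address_space so that callers holding only a mapping (presumably for OCFS2's benefit, given the rest of this series) can request the same range writeback, while do_sync_file_range() survives as an inline wrapper. A hedged sketch of a direct caller flushing and waiting on a byte range of an inode:

/* Sketch: write out and wait on [start, end] (end inclusive) through the
 * new mapping-level entry point; the inode and range are placeholders. */
static int example_flush_range(struct inode *inode, loff_t start, loff_t end)
{
	return do_sync_mapping_range(inode->i_mapping, start, end,
				     SYNC_FILE_RANGE_WAIT_BEFORE |
				     SYNC_FILE_RANGE_WRITE |
				     SYNC_FILE_RANGE_WAIT_AFTER);
}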
diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h new file mode 100644 index 000000000000..3d967b6b120a --- /dev/null +++ b/include/linux/mtd/ubi.h | |||
@@ -0,0 +1,202 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | #ifndef __LINUX_UBI_H__ | ||
22 | #define __LINUX_UBI_H__ | ||
23 | |||
24 | #include <asm/ioctl.h> | ||
25 | #include <linux/types.h> | ||
26 | #include <mtd/ubi-user.h> | ||
27 | |||
28 | /* | ||
29 | * UBI data type hint constants. | ||
30 | * | ||
31 | * UBI_LONGTERM: long-term data | ||
32 | * UBI_SHORTTERM: short-term data | ||
33 | * UBI_UNKNOWN: data persistence is unknown | ||
34 | * | ||
35 | * These constants are used when data is written to UBI volumes in order to | ||
36 | * help the UBI wear-leveling unit to find more appropriate physical | ||
37 | * eraseblocks. | ||
38 | */ | ||
39 | enum { | ||
40 | UBI_LONGTERM = 1, | ||
41 | UBI_SHORTTERM, | ||
42 | UBI_UNKNOWN | ||
43 | }; | ||
44 | |||
45 | /* | ||
46 | * enum ubi_open_mode - UBI volume open mode constants. | ||
47 | * | ||
48 | * UBI_READONLY: read-only mode | ||
49 | * UBI_READWRITE: read-write mode | ||
50 | * UBI_EXCLUSIVE: exclusive mode | ||
51 | */ | ||
52 | enum { | ||
53 | UBI_READONLY = 1, | ||
54 | UBI_READWRITE, | ||
55 | UBI_EXCLUSIVE | ||
56 | }; | ||
57 | |||
58 | /** | ||
59 | * struct ubi_volume_info - UBI volume description data structure. | ||
60 | * @vol_id: volume ID | ||
61 | * @ubi_num: UBI device number this volume belongs to | ||
62 | * @size: how many physical eraseblocks are reserved for this volume | ||
63 | * @used_bytes: how many bytes of data this volume contains | ||
64 | * @used_ebs: how many physical eraseblocks of this volume actually contain any | ||
65 | * data | ||
66 | * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) | ||
67 | * @corrupted: non-zero if the volume is corrupted (static volumes only) | ||
68 | * @upd_marker: non-zero if the volume has update marker set | ||
69 | * @alignment: volume alignment | ||
70 | * @usable_leb_size: how many bytes are available in logical eraseblocks of | ||
71 | * this volume | ||
72 | * @name_len: volume name length | ||
73 | * @name: volume name | ||
74 | * @cdev: UBI volume character device major and minor numbers | ||
75 | * | ||
76 | * The @corrupted flag is only relevant to static volumes and is always zero | ||
77 | * for dynamic ones. This is because UBI does not care about dynamic volume | ||
78 | * data protection and only cares about protecting static volume data. | ||
79 | * | ||
80 | * The @upd_marker flag is set if the volume update operation was interrupted. | ||
81 | * Before touching the volume data during the update operation, UBI first sets | ||
82 | * the update marker flag for this volume. If the volume update operation was | ||
83 | * further interrupted, the update marker indicates this. If the update marker | ||
84 | * is set, the contents of the volume are certainly damaged and a new volume | ||
85 | * update operation has to be started. | ||
86 | * | ||
87 | * To put it differently, @corrupted and @upd_marker fields have different | ||
88 | * semantics: | ||
89 | * o the @corrupted flag means that this static volume is corrupted for some | ||
90 | * reason, but not because of an interrupted volume update | ||
91 | * o the @upd_marker field means that the volume is damaged because of an | ||
92 | * interrupted update operation. | ||
93 | * | ||
94 | * I.e., the @corrupted flag is never set if the @upd_marker flag is set. | ||
95 | * | ||
96 | * The @used_bytes and @used_ebs fields are only really needed for static | ||
97 | * volumes and contain the number of bytes stored in this static volume and how | ||
98 | * many eraseblocks this data occupies. In the case of dynamic volumes, the | ||
99 | * @used_bytes field is equivalent to @size*@usable_leb_size, and the @used_ebs | ||
100 | * field is equivalent to @size. | ||
101 | * | ||
102 | * In general, logical eraseblock size is a property of the UBI device, not | ||
103 | * of the UBI volume. Indeed, the logical eraseblock size depends on the | ||
104 | * physical eraseblock size and on how many bytes UBI headers consume. But | ||
105 | * because of the volume alignment (@alignment), the usable size of logical | ||
106 | * eraseblocks of a volume may be less. The following equation is true: | ||
107 | * @usable_leb_size = LEB size - (LEB size mod @alignment), | ||
108 | * where LEB size is the logical eraseblock size defined by the UBI device. | ||
109 | * | ||
110 | * The alignment is a multiple of the minimal flash input/output unit size, or %1 | ||
111 | * if all the available space is used. | ||
112 | * | ||
113 | * To put this differently, alignment may be considered a way to change | ||
114 | * volume logical eraseblock sizes. | ||
115 | */ | ||
116 | struct ubi_volume_info { | ||
117 | int ubi_num; | ||
118 | int vol_id; | ||
119 | int size; | ||
120 | long long used_bytes; | ||
121 | int used_ebs; | ||
122 | int vol_type; | ||
123 | int corrupted; | ||
124 | int upd_marker; | ||
125 | int alignment; | ||
126 | int usable_leb_size; | ||
127 | int name_len; | ||
128 | const char *name; | ||
129 | dev_t cdev; | ||
130 | }; | ||
131 | |||
132 | /** | ||
133 | * struct ubi_device_info - UBI device description data structure. | ||
134 | * @ubi_num: ubi device number | ||
135 | * @leb_size: logical eraseblock size on this UBI device | ||
136 | * @min_io_size: minimal I/O unit size | ||
137 | * @ro_mode: if this device is in read-only mode | ||
138 | * @cdev: UBI character device major and minor numbers | ||
139 | * | ||
140 | * Note, @leb_size is the logical eraseblock size offered by the UBI device. | ||
141 | * Volumes of this UBI device may have smaller logical eraseblock size if their | ||
142 | * alignment is not equivalent to %1. | ||
143 | */ | ||
144 | struct ubi_device_info { | ||
145 | int ubi_num; | ||
146 | int leb_size; | ||
147 | int min_io_size; | ||
148 | int ro_mode; | ||
149 | dev_t cdev; | ||
150 | }; | ||
151 | |||
152 | /* UBI descriptor given to users when they open UBI volumes */ | ||
153 | struct ubi_volume_desc; | ||
154 | |||
155 | int ubi_get_device_info(int ubi_num, struct ubi_device_info *di); | ||
156 | void ubi_get_volume_info(struct ubi_volume_desc *desc, | ||
157 | struct ubi_volume_info *vi); | ||
158 | struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode); | ||
159 | struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name, | ||
160 | int mode); | ||
161 | void ubi_close_volume(struct ubi_volume_desc *desc); | ||
162 | int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, | ||
163 | int len, int check); | ||
164 | int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, | ||
165 | int offset, int len, int dtype); | ||
166 | int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, | ||
167 | int len, int dtype); | ||
168 | int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum); | ||
169 | int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum); | ||
170 | int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum); | ||
171 | |||
172 | /* | ||
173 | * This function is the same as the 'ubi_leb_read()' function, but it does not | ||
174 | * provide the checking capability. | ||
175 | */ | ||
176 | static inline int ubi_read(struct ubi_volume_desc *desc, int lnum, char *buf, | ||
177 | int offset, int len) | ||
178 | { | ||
179 | return ubi_leb_read(desc, lnum, buf, offset, len, 0); | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * This function is the same as the 'ubi_leb_write()' function, but it does | ||
184 | * not have the data type argument. | ||
185 | */ | ||
186 | static inline int ubi_write(struct ubi_volume_desc *desc, int lnum, | ||
187 | const void *buf, int offset, int len) | ||
188 | { | ||
189 | return ubi_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); | ||
190 | } | ||
191 | |||
192 | /* | ||
193 | * This function is the same as the 'ubi_leb_change()' function, but it does | ||
194 | * not have the data type argument. | ||
195 | */ | ||
196 | static inline int ubi_change(struct ubi_volume_desc *desc, int lnum, | ||
197 | const void *buf, int len) | ||
198 | { | ||
199 | return ubi_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); | ||
200 | } | ||
201 | |||
202 | #endif /* !__LINUX_UBI_H__ */ | ||
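A minimal sketch of how an in-kernel client might drive the kAPI declared above. It is illustrative only and not part of the patch: the UBI device number, volume ID and buffer are hypothetical, UBI_READWRITE is assumed to be the read/write open mode defined earlier in this header, the include path is assumed, and the error handling assumes ubi_open_volume() returns an ERR_PTR() value on failure.

/* Illustrative sketch, not part of the patch. */
#include <linux/err.h>
#include <linux/mtd/ubi.h>	/* path assumed; wherever this header lands */

static int ubi_kapi_example(void)
{
	struct ubi_volume_desc *desc;
	struct ubi_volume_info vi;
	char buf[64] = "hello";
	int err;

	/* Hypothetical: UBI device 0, volume 0, opened read/write. */
	desc = ubi_open_volume(0, 0, UBI_READWRITE);
	if (IS_ERR(desc))
		return PTR_ERR(desc);

	ubi_get_volume_info(desc, &vi);
	/* vi.usable_leb_size = LEB size - (LEB size mod vi.alignment) */

	/* Offsets and lengths must respect the flash minimal I/O unit;
	 * this assumes it is small enough for a 64-byte write. */
	err = ubi_write(desc, 0, buf, 0, sizeof(buf));
	if (!err)
		/* Read it back; ubi_read() passes check = 0 to ubi_leb_read() */
		err = ubi_read(desc, 0, buf, 0, sizeof(buf));

	ubi_close_volume(desc);
	return err;
}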
diff --git a/include/linux/sched.h b/include/linux/sched.h index 49fe2997a016..a1707583de49 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -196,13 +196,13 @@ extern void init_idle(struct task_struct *idle, int cpu); | |||
196 | extern cpumask_t nohz_cpu_mask; | 196 | extern cpumask_t nohz_cpu_mask; |
197 | 197 | ||
198 | /* | 198 | /* |
199 | * Only dump TASK_* tasks. (-1 for all tasks) | 199 | * Only dump TASK_* tasks. (0 for all tasks) |
200 | */ | 200 | */ |
201 | extern void show_state_filter(unsigned long state_filter); | 201 | extern void show_state_filter(unsigned long state_filter); |
202 | 202 | ||
203 | static inline void show_state(void) | 203 | static inline void show_state(void) |
204 | { | 204 | { |
205 | show_state_filter(-1); | 205 | show_state_filter(0); |
206 | } | 206 | } |
207 | 207 | ||
208 | extern void show_regs(struct pt_regs *); | 208 | extern void show_regs(struct pt_regs *); |
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 52c9eb9b6df2..26e4925bc35b 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h | |||
@@ -61,10 +61,10 @@ static inline void write_seqlock(seqlock_t *sl) | |||
61 | { | 61 | { |
62 | spin_lock(&sl->lock); | 62 | spin_lock(&sl->lock); |
63 | ++sl->sequence; | 63 | ++sl->sequence; |
64 | smp_wmb(); | 64 | smp_wmb(); |
65 | } | 65 | } |
66 | 66 | ||
67 | static inline void write_sequnlock(seqlock_t *sl) | 67 | static inline void write_sequnlock(seqlock_t *sl) |
68 | { | 68 | { |
69 | smp_wmb(); | 69 | smp_wmb(); |
70 | sl->sequence++; | 70 | sl->sequence++; |
@@ -77,7 +77,7 @@ static inline int write_tryseqlock(seqlock_t *sl) | |||
77 | 77 | ||
78 | if (ret) { | 78 | if (ret) { |
79 | ++sl->sequence; | 79 | ++sl->sequence; |
80 | smp_wmb(); | 80 | smp_wmb(); |
81 | } | 81 | } |
82 | return ret; | 82 | return ret; |
83 | } | 83 | } |
diff --git a/include/mtd/Kbuild b/include/mtd/Kbuild index e0fe92b03a4e..4d46b3bdebd8 100644 --- a/include/mtd/Kbuild +++ b/include/mtd/Kbuild | |||
@@ -3,3 +3,5 @@ header-y += jffs2-user.h | |||
3 | header-y += mtd-abi.h | 3 | header-y += mtd-abi.h |
4 | header-y += mtd-user.h | 4 | header-y += mtd-user.h |
5 | header-y += nftl-user.h | 5 | header-y += nftl-user.h |
6 | header-y += ubi-header.h | ||
7 | header-y += ubi-user.h | ||
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h index 8e501a75a764..f71dac420394 100644 --- a/include/mtd/mtd-abi.h +++ b/include/mtd/mtd-abi.h | |||
@@ -24,6 +24,7 @@ struct mtd_oob_buf { | |||
24 | #define MTD_NORFLASH 3 | 24 | #define MTD_NORFLASH 3 |
25 | #define MTD_NANDFLASH 4 | 25 | #define MTD_NANDFLASH 4 |
26 | #define MTD_DATAFLASH 6 | 26 | #define MTD_DATAFLASH 6 |
27 | #define MTD_UBIVOLUME 7 | ||
27 | 28 | ||
28 | #define MTD_WRITEABLE 0x400 /* Device is writeable */ | 29 | #define MTD_WRITEABLE 0x400 /* Device is writeable */ |
29 | #define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */ | 30 | #define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */ |
diff --git a/include/mtd/ubi-header.h b/include/mtd/ubi-header.h new file mode 100644 index 000000000000..fa479c71aa34 --- /dev/null +++ b/include/mtd/ubi-header.h | |||
@@ -0,0 +1,360 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
19 | * Thomas Gleixner | ||
20 | * Frank Haverkamp | ||
21 | * Oliver Lohmann | ||
22 | * Andreas Arnez | ||
23 | */ | ||
24 | |||
25 | /* | ||
26 | * This file defines the layout of UBI headers and all the other UBI on-flash | ||
27 | * data structures. May be included by user-space. | ||
28 | */ | ||
29 | |||
30 | #ifndef __UBI_HEADER_H__ | ||
31 | #define __UBI_HEADER_H__ | ||
32 | |||
33 | #include <asm/byteorder.h> | ||
34 | |||
35 | /* The version of UBI images supported by this implementation */ | ||
36 | #define UBI_VERSION 1 | ||
37 | |||
38 | /* The highest erase counter value supported by this implementation */ | ||
39 | #define UBI_MAX_ERASECOUNTER 0x7FFFFFFF | ||
40 | |||
41 | /* The initial CRC32 value used when calculating CRC checksums */ | ||
42 | #define UBI_CRC32_INIT 0xFFFFFFFFU | ||
43 | |||
44 | /* Erase counter header magic number (ASCII "UBI#") */ | ||
45 | #define UBI_EC_HDR_MAGIC 0x55424923 | ||
46 | /* Volume identifier header magic number (ASCII "UBI!") */ | ||
47 | #define UBI_VID_HDR_MAGIC 0x55424921 | ||
48 | |||
49 | /* | ||
50 | * Volume type constants used in the volume identifier header. | ||
51 | * | ||
52 | * @UBI_VID_DYNAMIC: dynamic volume | ||
53 | * @UBI_VID_STATIC: static volume | ||
54 | */ | ||
55 | enum { | ||
56 | UBI_VID_DYNAMIC = 1, | ||
57 | UBI_VID_STATIC = 2 | ||
58 | }; | ||
59 | |||
60 | /* | ||
61 | * Compatibility constants used by internal volumes. | ||
62 | * | ||
63 | * @UBI_COMPAT_DELETE: delete this internal volume before anything is written | ||
64 | * to the flash | ||
65 | * @UBI_COMPAT_RO: attach this device in read-only mode | ||
66 | * @UBI_COMPAT_PRESERVE: preserve this internal volume - do not touch its | ||
67 | * physical eraseblocks, don't allow the wear-leveling unit to move them | ||
68 | * @UBI_COMPAT_REJECT: reject this UBI image | ||
69 | */ | ||
70 | enum { | ||
71 | UBI_COMPAT_DELETE = 1, | ||
72 | UBI_COMPAT_RO = 2, | ||
73 | UBI_COMPAT_PRESERVE = 4, | ||
74 | UBI_COMPAT_REJECT = 5 | ||
75 | }; | ||
76 | |||
77 | /* | ||
78 | * ubi16_t/ubi32_t/ubi64_t - 16, 32, and 64-bit integers used in UBI on-flash | ||
79 | * data structures. | ||
80 | */ | ||
81 | typedef struct { | ||
82 | uint16_t int16; | ||
83 | } __attribute__ ((packed)) ubi16_t; | ||
84 | |||
85 | typedef struct { | ||
86 | uint32_t int32; | ||
87 | } __attribute__ ((packed)) ubi32_t; | ||
88 | |||
89 | typedef struct { | ||
90 | uint64_t int64; | ||
91 | } __attribute__ ((packed)) ubi64_t; | ||
92 | |||
93 | /* | ||
94 | * This implementation of UBI uses the big-endian format for on-flash | ||
95 | * integers. Below are the corresponding conversion macros. | ||
96 | */ | ||
97 | #define cpu_to_ubi16(x) ((ubi16_t){__cpu_to_be16(x)}) | ||
98 | #define ubi16_to_cpu(x) ((uint16_t)__be16_to_cpu((x).int16)) | ||
99 | |||
100 | #define cpu_to_ubi32(x) ((ubi32_t){__cpu_to_be32(x)}) | ||
101 | #define ubi32_to_cpu(x) ((uint32_t)__be32_to_cpu((x).int32)) | ||
102 | |||
103 | #define cpu_to_ubi64(x) ((ubi64_t){__cpu_to_be64(x)}) | ||
104 | #define ubi64_to_cpu(x) ((uint64_t)__be64_to_cpu((x).int64)) | ||
105 | |||
106 | /* Sizes of UBI headers */ | ||
107 | #define UBI_EC_HDR_SIZE sizeof(struct ubi_ec_hdr) | ||
108 | #define UBI_VID_HDR_SIZE sizeof(struct ubi_vid_hdr) | ||
109 | |||
110 | /* Sizes of UBI headers without the ending CRC */ | ||
111 | #define UBI_EC_HDR_SIZE_CRC (UBI_EC_HDR_SIZE - sizeof(ubi32_t)) | ||
112 | #define UBI_VID_HDR_SIZE_CRC (UBI_VID_HDR_SIZE - sizeof(ubi32_t)) | ||
113 | |||
114 | /** | ||
115 | * struct ubi_ec_hdr - UBI erase counter header. | ||
116 | * @magic: erase counter header magic number (%UBI_EC_HDR_MAGIC) | ||
117 | * @version: version of UBI implementation which is supposed to accept this | ||
118 | * UBI image | ||
119 | * @padding1: reserved for future, zeroes | ||
120 | * @ec: the erase counter | ||
121 | * @vid_hdr_offset: where the VID header starts | ||
122 | * @data_offset: where the user data start | ||
123 | * @padding2: reserved for future, zeroes | ||
124 | * @hdr_crc: erase counter header CRC checksum | ||
125 | * | ||
126 | * The erase counter header takes 64 bytes and has plenty of unused space for | ||
127 | * future usage. The unused fields are zeroed. The @version field is used to | ||
128 | * indicate the version of UBI implementation which is supposed to be able to | ||
129 | * work with this UBI image. If @version is greater than the current UBI | ||
130 | * version, the image is rejected. This may be useful in future if something | ||
131 | * is changed radically. This field is duplicated in the volume identifier | ||
132 | * header. | ||
133 | * | ||
134 | * The @vid_hdr_offset and @data_offset fields contain the offset of the | ||
135 | * volume identifier header and user data, relative to the beginning of the | ||
136 | * physical eraseblock. These values have to be the same for all physical | ||
137 | * eraseblocks. | ||
138 | */ | ||
139 | struct ubi_ec_hdr { | ||
140 | ubi32_t magic; | ||
141 | uint8_t version; | ||
142 | uint8_t padding1[3]; | ||
143 | ubi64_t ec; /* Warning: the current limit is 31-bit anyway! */ | ||
144 | ubi32_t vid_hdr_offset; | ||
145 | ubi32_t data_offset; | ||
146 | uint8_t padding2[36]; | ||
147 | ubi32_t hdr_crc; | ||
148 | } __attribute__ ((packed)); | ||
149 | |||
150 | /** | ||
151 | * struct ubi_vid_hdr - on-flash UBI volume identifier header. | ||
152 | * @magic: volume identifier header magic number (%UBI_VID_HDR_MAGIC) | ||
153 | * @version: UBI implementation version which is supposed to accept this UBI | ||
154 | * image (%UBI_VERSION) | ||
155 | * @vol_type: volume type (%UBI_VID_DYNAMIC or %UBI_VID_STATIC) | ||
156 | * @copy_flag: if this logical eraseblock was copied from another physical | ||
157 | * eraseblock (for wear-leveling reasons) | ||
158 | * @compat: compatibility of this volume (%0, %UBI_COMPAT_DELETE, | ||
159 | * %UBI_COMPAT_RO, %UBI_COMPAT_PRESERVE, or %UBI_COMPAT_REJECT) | ||
160 | * @vol_id: ID of this volume | ||
161 | * @lnum: logical eraseblock number | ||
162 | * @leb_ver: version of this logical eraseblock (IMPORTANT: obsolete, to be | ||
163 | * removed, kept only for not breaking older UBI users) | ||
164 | * @data_size: how many bytes of data this logical eraseblock contains | ||
165 | * @used_ebs: total number of used logical eraseblocks in this volume | ||
166 | * @data_pad: how many bytes at the end of this physical eraseblock are not | ||
167 | * used | ||
168 | * @data_crc: CRC checksum of the data stored in this logical eraseblock | ||
169 | * @padding1: reserved for future, zeroes | ||
170 | * @sqnum: sequence number | ||
171 | * @padding2: reserved for future, zeroes | ||
172 | * @hdr_crc: volume identifier header CRC checksum | ||
173 | * | ||
174 | * The @sqnum is the value of the global sequence counter at the time when this | ||
175 | * VID header was created. The global sequence counter is incremented each time | ||
176 | * UBI writes a new VID header to the flash, i.e. when it maps a logical | ||
177 | * eraseblock to a new physical eraseblock. The global sequence counter is an | ||
178 | * unsigned 64-bit integer and we assume it never overflows. The @sqnum | ||
179 | * (sequence number) is used to distinguish between older and newer versions of | ||
180 | * logical eraseblocks. | ||
181 | * | ||
182 | * There are 2 situations when there may be more than one physical eraseblock | ||
183 | * corresponding to the same logical eraseblock, i.e., having the same @vol_id | ||
184 | * and @lnum values in the volume identifier header. Suppose we have a logical | ||
185 | * eraseblock L and it is mapped to the physical eraseblock P. | ||
186 | * | ||
187 | * 1. Because UBI may erase physical eraseblocks asynchronously, the following | ||
188 | * situation is possible: L is asynchronously erased, so P is scheduled for | ||
189 | * erasure, then L is written to, i.e., mapped to another physical eraseblock P1, | ||
190 | * so P1 is written to, then an unclean reboot happens. Result - there are 2 | ||
191 | * physical eraseblocks P and P1 corresponding to the same logical eraseblock | ||
192 | * L. But P1 has greater sequence number, so UBI picks P1 when it attaches the | ||
193 | * flash. | ||
194 | * | ||
195 | * 2. From time to time UBI moves logical eraseblocks to other physical | ||
196 | * eraseblocks for wear-leveling reasons. If, for example, UBI moves L from P | ||
197 | * to P1, and an unclean reboot happens before P is physically erased, there | ||
198 | * are two physical eraseblocks P and P1 corresponding to L and UBI has to | ||
199 | * select one of them when the flash is attached. The @sqnum field says which | ||
200 | * PEB is the original (obviously P will have the lower @sqnum) and which is | ||
201 | * the copy. But it is not enough to select the physical eraseblock with the | ||
202 | * higher sequence number, because the unclean reboot could have happened in | ||
203 | * the middle of the copying process, so the data in P1 may be corrupted. It is | ||
204 | * also not enough to just select the physical eraseblock with the lower | ||
205 | * sequence number, because the data there may be old (consider a case where | ||
206 | * more data was added to P1 after the copying). Moreover, the unclean reboot | ||
207 | * may happen when the erasure of P has just started, so it may result in an | ||
208 | * unstable P, which is "mostly" OK, but still has unstable bits. | ||
209 | * | ||
210 | * UBI uses the @copy_flag field to indicate that this logical eraseblock is a | ||
211 | * copy. UBI also calculates data CRC when the data is moved and stores it at | ||
212 | * the @data_crc field of the copy (P1). So when UBI needs to pick one physical | ||
213 | * eraseblock of two (P or P1), the @copy_flag of the newer one (P1) is | ||
214 | * examined. If it is cleared, the situation is simple and the newer one is | ||
215 | * picked. If it is set, the data CRC of the copy (P1) is examined. If the CRC | ||
216 | * checksum is correct, this physical eraseblock is selected (P1). Otherwise | ||
217 | * the older one (P) is selected. | ||
218 | * | ||
219 | * Note, there is an obsolete @leb_ver field which was used instead of @sqnum | ||
220 | * in the past. But it is not used anymore and we keep it in order to be able | ||
221 | * to deal with old UBI images. It will be removed at some point. | ||
222 | * | ||
223 | * There are 2 sorts of volumes in UBI: user volumes and internal volumes. | ||
224 | * Internal volumes are not seen from outside and are used for various internal | ||
225 | * UBI purposes. In this implementation there is only one internal volume - the | ||
226 | * layout volume. Internal volumes are the main mechanism of UBI extensions. | ||
227 | * For example, in future one may introduce a journal internal volume. Internal | ||
228 | * volumes have their own reserved range of IDs. | ||
229 | * | ||
230 | * The @compat field is only used for internal volumes and contains the "degree | ||
231 | * of their compatibility". It is always zero for user volumes. This field | ||
232 | * provides a mechanism to introduce UBI extensions and to be still compatible | ||
233 | * with older UBI binaries. For example, if someone introduced a journal in | ||
234 | * future, he would probably use %UBI_COMPAT_DELETE compatibility for the | ||
235 | * journal volume. And in this case, older UBI binaries, which know nothing | ||
236 | * about the journal volume, would just delete this volume and work perfectly | ||
237 | * fine. This is similar to what Ext2fs does when it is fed by an Ext3fs image | ||
238 | * - it just ignores the Ext3fs journal. | ||
239 | * | ||
240 | * The @data_crc field contains the CRC checksum of the contents of the logical | ||
241 | * eraseblock if this is a static volume. In case of dynamic volumes, it does | ||
242 | * not contain the CRC checksum as a rule. The only exception is when the | ||
243 | * data of the physical eraseblock was moved by the wear-leveling unit, then | ||
244 | * the wear-leveling unit calculates the data CRC and stores it in the | ||
245 | * @data_crc field. And of course, the @copy_flag is %1 in this case. | ||
246 | * | ||
247 | * The @data_size field is used only for static volumes because UBI has to know | ||
248 | * how many bytes of data are stored in this eraseblock. For dynamic volumes, | ||
249 | * this field usually contains zero. The only exception is when the data of the | ||
250 | * physical eraseblock was moved to another physical eraseblock for | ||
251 | * wear-leveling reasons. In this case, UBI calculates the CRC checksum of the | ||
252 | * contents and uses both the @data_crc and @data_size fields, with @data_size | ||
253 | * containing the size of the moved data. | ||
254 | * | ||
255 | * The @used_ebs field is used only for static volumes and indicates how many | ||
256 | * eraseblocks the data of the volume takes. For dynamic volumes this field is | ||
257 | * not used and always contains zero. | ||
258 | * | ||
259 | * The @data_pad is calculated when volumes are created using the alignment | ||
260 | * parameter. So, effectively, the @data_pad field reduces the size of logical | ||
261 | * eraseblocks of this volume. This is very handy when one uses block-oriented | ||
262 | * software (say, cramfs) on top of the UBI volume. | ||
263 | */ | ||
264 | struct ubi_vid_hdr { | ||
265 | ubi32_t magic; | ||
266 | uint8_t version; | ||
267 | uint8_t vol_type; | ||
268 | uint8_t copy_flag; | ||
269 | uint8_t compat; | ||
270 | ubi32_t vol_id; | ||
271 | ubi32_t lnum; | ||
272 | ubi32_t leb_ver; /* obsolete, to be removed, don't use */ | ||
273 | ubi32_t data_size; | ||
274 | ubi32_t used_ebs; | ||
275 | ubi32_t data_pad; | ||
276 | ubi32_t data_crc; | ||
277 | uint8_t padding1[4]; | ||
278 | ubi64_t sqnum; | ||
279 | uint8_t padding2[12]; | ||
280 | ubi32_t hdr_crc; | ||
281 | } __attribute__ ((packed)); | ||
282 | |||
283 | /* Internal UBI volumes count */ | ||
284 | #define UBI_INT_VOL_COUNT 1 | ||
285 | |||
286 | /* | ||
287 | * Starting ID of internal volumes. There is reserved room for 4096 internal | ||
288 | * volumes. | ||
289 | */ | ||
290 | #define UBI_INTERNAL_VOL_START (0x7FFFFFFF - 4096) | ||
291 | |||
292 | /* The layout volume contains the volume table */ | ||
293 | |||
294 | #define UBI_LAYOUT_VOL_ID UBI_INTERNAL_VOL_START | ||
295 | #define UBI_LAYOUT_VOLUME_EBS 2 | ||
296 | #define UBI_LAYOUT_VOLUME_NAME "layout volume" | ||
297 | #define UBI_LAYOUT_VOLUME_COMPAT UBI_COMPAT_REJECT | ||
298 | |||
299 | /* The maximum number of volumes per one UBI device */ | ||
300 | #define UBI_MAX_VOLUMES 128 | ||
301 | |||
302 | /* The maximum volume name length */ | ||
303 | #define UBI_VOL_NAME_MAX 127 | ||
304 | |||
305 | /* Size of the volume table record */ | ||
306 | #define UBI_VTBL_RECORD_SIZE sizeof(struct ubi_vtbl_record) | ||
307 | |||
308 | /* Size of the volume table record without the ending CRC */ | ||
309 | #define UBI_VTBL_RECORD_SIZE_CRC (UBI_VTBL_RECORD_SIZE - sizeof(ubi32_t)) | ||
310 | |||
311 | /** | ||
312 | * struct ubi_vtbl_record - a record in the volume table. | ||
313 | * @reserved_pebs: how many physical eraseblocks are reserved for this volume | ||
314 | * @alignment: volume alignment | ||
315 | * @data_pad: how many bytes are unused at the end of each physical | ||
316 | * eraseblock to satisfy the requested alignment | ||
317 | * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) | ||
318 | * @upd_marker: if volume update was started but not finished | ||
319 | * @name_len: volume name length | ||
320 | * @name: the volume name | ||
321 | * @padding2: reserved, zeroes | ||
322 | * @crc: a CRC32 checksum of the record | ||
323 | * | ||
324 | * The volume table records are stored in the volume table, which is stored in | ||
325 | * the layout volume. The layout volume consists of 2 logical eraseblocks, each | ||
326 | * of which contains a copy of the volume table (i.e., the volume table is | ||
327 | * duplicated). The volume table is an array of &struct ubi_vtbl_record | ||
328 | * objects indexed by the volume ID. | ||
329 | * | ||
330 | * If the size of the logical eraseblock is large enough to fit | ||
331 | * %UBI_MAX_VOLUMES records, the volume table contains %UBI_MAX_VOLUMES | ||
332 | * records. Otherwise, it contains as many records as it can fit (i.e., size of | ||
333 | * logical eraseblock divided by sizeof(struct ubi_vtbl_record)). | ||
334 | * | ||
335 | * The @upd_marker flag is used to implement volume update. It is set to %1 | ||
336 | * before update and set to %0 after the update. So if the update operation was | ||
337 | * interrupted, UBI knows that the volume is corrupted. | ||
338 | * | ||
339 | * The @alignment field is specified when the volume is created and cannot be | ||
340 | * later changed. It may be useful, for example, when a block-oriented file | ||
341 | * system works on top of UBI. The @data_pad field is calculated using the | ||
342 | * logical eraseblock size and @alignment. The alignment must be a multiple of | ||
343 | * the minimal flash I/O unit. If @alignment is 1, all the available space of | ||
344 | * the physical eraseblocks is used. | ||
345 | * | ||
346 | * Empty records contain all zeroes and the CRC checksum of those zeroes. | ||
347 | */ | ||
348 | struct ubi_vtbl_record { | ||
349 | ubi32_t reserved_pebs; | ||
350 | ubi32_t alignment; | ||
351 | ubi32_t data_pad; | ||
352 | uint8_t vol_type; | ||
353 | uint8_t upd_marker; | ||
354 | ubi16_t name_len; | ||
355 | uint8_t name[UBI_VOL_NAME_MAX+1]; | ||
356 | uint8_t padding2[24]; | ||
357 | ubi32_t crc; | ||
358 | } __attribute__ ((packed)); | ||
359 | |||
360 | #endif /* !__UBI_HEADER_H__ */ | ||
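A hedged userspace sketch of how the endianness helpers and header layout above fit together: it fills a VID header and sanity-checks an EC header. The crc32() prototype is a placeholder for whatever CRC-32 routine the reader links in (UBI itself seeds the kernel's crc32() with UBI_CRC32_INIT and covers the header minus its trailing CRC field), so treat that detail as an assumption; the function names are made up for illustration.

/* Illustrative sketch, not part of the patch. */
#include <stdint.h>
#include <string.h>
#include <mtd/ubi-header.h>

/* Placeholder prototype; any standard CRC-32 implementation will do. */
uint32_t crc32(uint32_t seed, const void *buf, size_t len);

static void fill_vid_hdr(struct ubi_vid_hdr *vid, uint32_t vol_id, uint32_t lnum)
{
	memset(vid, 0, UBI_VID_HDR_SIZE);
	vid->magic = cpu_to_ubi32(UBI_VID_HDR_MAGIC);
	vid->version = UBI_VERSION;
	vid->vol_type = UBI_VID_DYNAMIC;
	vid->vol_id = cpu_to_ubi32(vol_id);
	vid->lnum = cpu_to_ubi32(lnum);
	/* The header CRC covers everything up to, but not including, hdr_crc. */
	vid->hdr_crc = cpu_to_ubi32(crc32(UBI_CRC32_INIT, vid,
					  UBI_VID_HDR_SIZE_CRC));
}

static int ec_hdr_looks_sane(const struct ubi_ec_hdr *ec)
{
	return ubi32_to_cpu(ec->magic) == UBI_EC_HDR_MAGIC &&
	       ec->version <= UBI_VERSION &&
	       ubi64_to_cpu(ec->ec) <= UBI_MAX_ERASECOUNTER;
}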
diff --git a/include/mtd/ubi-user.h b/include/mtd/ubi-user.h new file mode 100644 index 000000000000..fe06ded0e6b8 --- /dev/null +++ b/include/mtd/ubi-user.h | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2006 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Author: Artem Bityutskiy (Битюцкий Артём) | ||
19 | */ | ||
20 | |||
21 | #ifndef __UBI_USER_H__ | ||
22 | #define __UBI_USER_H__ | ||
23 | |||
24 | /* | ||
25 | * UBI volume creation | ||
26 | * ~~~~~~~~~~~~~~~~~~~ | ||
27 | * | ||
28 | * UBI volumes are created via the %UBI_IOCMKVOL IOCTL command of the UBI | ||
29 | * character device. A &struct ubi_mkvol_req object has to be properly filled | ||
30 | * and a pointer to it has to be passed to the IOCTL. | ||
31 | * | ||
32 | * UBI volume deletion | ||
33 | * ~~~~~~~~~~~~~~~~~~~ | ||
34 | * | ||
35 | * To delete a volume, the %UBI_IOCRMVOL IOCTL command of the UBI character | ||
36 | * device should be used. A pointer to the 32-bit volume ID has to be passed | ||
37 | * to the IOCTL. | ||
38 | * | ||
39 | * UBI volume re-size | ||
40 | * ~~~~~~~~~~~~~~~~~~ | ||
41 | * | ||
42 | * To re-size a volume, the %UBI_IOCRSVOL IOCTL command of the UBI character | ||
43 | * device should be used. A &struct ubi_rsvol_req object has to be properly | ||
44 | * filled and a pointer to it has to be passed to the IOCTL. | ||
45 | * | ||
46 | * UBI volume update | ||
47 | * ~~~~~~~~~~~~~~~~~ | ||
48 | * | ||
49 | * Volume update should be done via the %UBI_IOCVOLUP IOCTL command of the | ||
50 | * corresponding UBI volume character device. A pointer to a 64-bit update | ||
51 | * size should be passed to the IOCTL. After that, UBI expects the user to | ||
52 | * write this number of bytes to the volume character device. The update is | ||
53 | * finished when the claimed number of bytes has been written. So, the volume | ||
54 | * update sequence is something like: | ||
55 | * | ||
56 | * fd = open("/dev/my_volume"); | ||
57 | * ioctl(fd, UBI_IOCVOLUP, &image_size); | ||
58 | * write(fd, buf, image_size); | ||
59 | * close(fd); | ||
60 | */ | ||
61 | |||
62 | /* | ||
63 | * When a new volume is created, users may either specify the volume number | ||
64 | * they want to create or let UBI automatically assign a volume number using this | ||
65 | * constant. | ||
66 | */ | ||
67 | #define UBI_VOL_NUM_AUTO (-1) | ||
68 | |||
69 | /* Maximum volume name length */ | ||
70 | #define UBI_MAX_VOLUME_NAME 127 | ||
71 | |||
72 | /* IOCTL commands of UBI character devices */ | ||
73 | |||
74 | #define UBI_IOC_MAGIC 'o' | ||
75 | |||
76 | /* Create an UBI volume */ | ||
77 | #define UBI_IOCMKVOL _IOW(UBI_IOC_MAGIC, 0, struct ubi_mkvol_req) | ||
78 | /* Remove an UBI volume */ | ||
79 | #define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, int32_t) | ||
80 | /* Re-size an UBI volume */ | ||
81 | #define UBI_IOCRSVOL _IOW(UBI_IOC_MAGIC, 2, struct ubi_rsvol_req) | ||
82 | |||
83 | /* IOCTL commands of UBI volume character devices */ | ||
84 | |||
85 | #define UBI_VOL_IOC_MAGIC 'O' | ||
86 | |||
87 | /* Start UBI volume update */ | ||
88 | #define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, int64_t) | ||
89 | /* An eraseblock erasure command, used for debugging, disabled by default */ | ||
90 | #define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, int32_t) | ||
91 | |||
92 | /* | ||
93 | * UBI volume type constants. | ||
94 | * | ||
95 | * @UBI_DYNAMIC_VOLUME: dynamic volume | ||
96 | * @UBI_STATIC_VOLUME: static volume | ||
97 | */ | ||
98 | enum { | ||
99 | UBI_DYNAMIC_VOLUME = 3, | ||
100 | UBI_STATIC_VOLUME = 4 | ||
101 | }; | ||
102 | |||
103 | /** | ||
104 | * struct ubi_mkvol_req - volume description data structure used in | ||
105 | * volume creation requests. | ||
106 | * @vol_id: volume number | ||
107 | * @alignment: volume alignment | ||
108 | * @bytes: volume size in bytes | ||
109 | * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) | ||
110 | * @padding1: reserved for future, not used | ||
111 | * @name_len: volume name length | ||
112 | * @padding2: reserved for future, not used | ||
113 | * @name: volume name | ||
114 | * | ||
115 | * This structure is used by userspace programs when creating new volumes. The | ||
116 | * @bytes field is only necessary when creating static volumes. | ||
117 | * | ||
118 | * The @alignment field specifies the required alignment of the volume logical | ||
119 | * eraseblock. This means, that the size of logical eraseblocks will be aligned | ||
120 | * to this number, i.e., | ||
121 | * (UBI device logical eraseblock size) mod (@alignment) = 0. | ||
122 | * | ||
123 | * To put it differently, the logical eraseblock of this volume may be slightly | ||
124 | * shortened in order to make it properly aligned. The alignment has to be a | ||
125 | * multiple of the flash minimal input/output unit, or %1 to utilize the entire | ||
126 | * available space of logical eraseblocks. | ||
127 | * | ||
128 | * The @alignment field may be useful, for example, when one wants to maintain | ||
129 | * a block device on top of an UBI volume. In this case, it is desirable to fit | ||
130 | * an integer number of blocks in logical eraseblocks of this UBI volume. With | ||
131 | * alignment it is possible to update this volume using plain UBI volume image | ||
132 | * BLOBs, without caring about how to properly align them. | ||
133 | */ | ||
134 | struct ubi_mkvol_req { | ||
135 | int32_t vol_id; | ||
136 | int32_t alignment; | ||
137 | int64_t bytes; | ||
138 | int8_t vol_type; | ||
139 | int8_t padding1; | ||
140 | int16_t name_len; | ||
141 | int8_t padding2[4]; | ||
142 | char name[UBI_MAX_VOLUME_NAME+1]; | ||
143 | } __attribute__ ((packed)); | ||
144 | |||
145 | /** | ||
146 | * struct ubi_rsvol_req - a data structure used in volume re-size requests. | ||
147 | * @vol_id: ID of the volume to re-size | ||
148 | * @bytes: new size of the volume in bytes | ||
149 | * | ||
150 | * Re-sizing is possible for both dynamic and static volumes. But while dynamic | ||
151 | * volumes may be re-sized arbitrarily, static volumes cannot be made to be | ||
152 | * smaller than the number of bytes they bear. To arbitrarily shrink a static | ||
153 | * volume, it must be wiped out first (by means of a volume update operation with | ||
154 | * zero number of bytes). | ||
155 | */ | ||
156 | struct ubi_rsvol_req { | ||
157 | int64_t bytes; | ||
158 | int32_t vol_id; | ||
159 | } __attribute__ ((packed)); | ||
160 | |||
161 | #endif /* __UBI_USER_H__ */ | ||
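To make the ioctl flow described in the header comment concrete, here is a hedged userspace sketch that creates a dynamic volume and then performs the update sequence. The device node names are assumptions (they depend on how the UBI character devices are named on the target system), as is the assumption that the new volume shows up as volume 0; only the ioctls, constants and struct fields come from the header above.

/* Illustrative sketch, not part of the patch. */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <mtd/ubi-user.h>

/* Assumed device node names; actual names depend on the target system. */
#define UBI_DEV_NODE "/dev/ubi0"	/* UBI device character device */
#define UBI_VOL_NODE "/dev/ubi0_0"	/* volume character device     */

static int make_and_update_volume(const void *image, int64_t image_size)
{
	struct ubi_mkvol_req req;
	int fd, err;

	memset(&req, 0, sizeof(req));
	req.vol_id = UBI_VOL_NUM_AUTO;		/* let UBI pick the volume ID  */
	req.alignment = 1;			/* use all available LEB space */
	req.bytes = image_size;
	req.vol_type = UBI_DYNAMIC_VOLUME;
	strcpy(req.name, "example");
	req.name_len = (int16_t)strlen(req.name);

	fd = open(UBI_DEV_NODE, O_RDWR);
	if (fd < 0)
		return -1;
	err = ioctl(fd, UBI_IOCMKVOL, &req);
	close(fd);
	if (err)
		return -1;

	/* Volume update: declare the size, then write exactly that many bytes. */
	fd = open(UBI_VOL_NODE, O_RDWR);
	if (fd < 0)
		return -1;
	err = ioctl(fd, UBI_IOCVOLUP, &image_size);
	if (!err)
		err = (write(fd, image, image_size) == image_size) ? 0 : -1;
	close(fd);
	return err;
}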
diff --git a/kernel/sched.c b/kernel/sched.c index b9a683730148..960d7c5fca39 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -4746,7 +4746,7 @@ void show_state_filter(unsigned long state_filter) | |||
4746 | * console might take alot of time: | 4746 | * console might take alot of time: |
4747 | */ | 4747 | */ |
4748 | touch_nmi_watchdog(); | 4748 | touch_nmi_watchdog(); |
4749 | if (p->state & state_filter) | 4749 | if (!state_filter || (p->state & state_filter)) |
4750 | show_task(p); | 4750 | show_task(p); |
4751 | } while_each_thread(g, p); | 4751 | } while_each_thread(g, p); |
4752 | 4752 | ||
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 11a3404d65af..e1f18489db1d 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c | |||
@@ -92,6 +92,33 @@ int cipso_v4_rbm_optfmt = 0; | |||
92 | int cipso_v4_rbm_strictvalid = 1; | 92 | int cipso_v4_rbm_strictvalid = 1; |
93 | 93 | ||
94 | /* | 94 | /* |
95 | * Protocol Constants | ||
96 | */ | ||
97 | |||
98 | /* Maximum size of the CIPSO IP option, derived from the fact that the maximum | ||
99 | * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ | ||
100 | #define CIPSO_V4_OPT_LEN_MAX 40 | ||
101 | |||
102 | /* Length of the base CIPSO option, this includes the option type (1 byte), the | ||
103 | * option length (1 byte), and the DOI (4 bytes). */ | ||
104 | #define CIPSO_V4_HDR_LEN 6 | ||
105 | |||
106 | /* Base length of the restrictive category bitmap tag (tag #1). */ | ||
107 | #define CIPSO_V4_TAG_RBM_BLEN 4 | ||
108 | |||
109 | /* Base length of the enumerated category tag (tag #2). */ | ||
110 | #define CIPSO_V4_TAG_ENUM_BLEN 4 | ||
111 | |||
112 | /* Base length of the ranged categories bitmap tag (tag #5). */ | ||
113 | #define CIPSO_V4_TAG_RNG_BLEN 4 | ||
114 | /* The maximum number of category ranges permitted in the ranged category tag | ||
115 | * (tag #5). You may note that the IETF draft states that the maximum number | ||
116 | * of category ranges is 7, but if the low end of the last category range is | ||
117 | * zero then it is possible to fit 8 category ranges because the zero should | ||
118 | * be omitted. */ | ||
119 | #define CIPSO_V4_TAG_RNG_CAT_MAX 8 | ||
120 | |||
121 | /* | ||
95 | * Helper Functions | 122 | * Helper Functions |
96 | */ | 123 | */ |
97 | 124 | ||
@@ -1109,16 +1136,15 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, | |||
1109 | unsigned char *net_cat, | 1136 | unsigned char *net_cat, |
1110 | u32 net_cat_len) | 1137 | u32 net_cat_len) |
1111 | { | 1138 | { |
1112 | /* The constant '16' is not random, it is the maximum number of | ||
1113 | * high/low category range pairs as permitted by the CIPSO draft based | ||
1114 | * on a maximum IPv4 header length of 60 bytes - the BUG_ON() assertion | ||
1115 | * does a sanity check to make sure we don't overflow the array. */ | ||
1116 | int iter = -1; | 1139 | int iter = -1; |
1117 | u16 array[16]; | 1140 | u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; |
1118 | u32 array_cnt = 0; | 1141 | u32 array_cnt = 0; |
1119 | u32 cat_size = 0; | 1142 | u32 cat_size = 0; |
1120 | 1143 | ||
1121 | BUG_ON(net_cat_len > 30); | 1144 | /* make sure we don't overflow the 'array[]' variable */ |
1145 | if (net_cat_len > | ||
1146 | (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) | ||
1147 | return -ENOSPC; | ||
1122 | 1148 | ||
1123 | for (;;) { | 1149 | for (;;) { |
1124 | iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); | 1150 | iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); |
@@ -1196,9 +1222,6 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, | |||
1196 | * Protocol Handling Functions | 1222 | * Protocol Handling Functions |
1197 | */ | 1223 | */ |
1198 | 1224 | ||
1199 | #define CIPSO_V4_OPT_LEN_MAX 40 | ||
1200 | #define CIPSO_V4_HDR_LEN 6 | ||
1201 | |||
1202 | /** | 1225 | /** |
1203 | * cipso_v4_gentag_hdr - Generate a CIPSO option header | 1226 | * cipso_v4_gentag_hdr - Generate a CIPSO option header |
1204 | * @doi_def: the DOI definition | 1227 | * @doi_def: the DOI definition |
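The sizing behind CIPSO_V4_TAG_RNG_CAT_MAX in the hunk above reduces to a little arithmetic; the sketch below merely restates it and is not code from the patch (the CIPSO_V4_RNG_CAT_BYTES name is made up for illustration).

/* Illustrative arithmetic only, mirroring the constants added above. */
#define CIPSO_V4_OPT_LEN_MAX	40	/* 60-byte max IPv4 header - 20-byte base header */
#define CIPSO_V4_HDR_LEN	 6	/* option type + option length + DOI */
#define CIPSO_V4_TAG_RNG_BLEN	 4	/* base length of the tag #5 header */

/* Bytes left in the option for tag #5 category data: 40 - 6 - 4 = 30. */
#define CIPSO_V4_RNG_CAT_BYTES \
	(CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)

/*
 * Seven full high/low ranges take 7 * 4 = 28 bytes; an eighth range fits
 * only because its zero low end is omitted (2 more bytes, 30 in total).
 * Hence the u16 scratch array holds CIPSO_V4_TAG_RNG_CAT_MAX * 2 = 16
 * entries, and the net_cat_len check rejects anything larger than the
 * 30 usable bytes instead of relying on a BUG_ON().
 */
#define CIPSO_V4_TAG_RNG_CAT_MAX 8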
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index e03a3282c551..f2535e7f2869 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c | |||
@@ -263,9 +263,6 @@ int netlbl_socket_setattr(const struct socket *sock, | |||
263 | int ret_val = -ENOENT; | 263 | int ret_val = -ENOENT; |
264 | struct netlbl_dom_map *dom_entry; | 264 | struct netlbl_dom_map *dom_entry; |
265 | 265 | ||
266 | if ((secattr->flags & NETLBL_SECATTR_DOMAIN) == 0) | ||
267 | return -ENOENT; | ||
268 | |||
269 | rcu_read_lock(); | 266 | rcu_read_lock(); |
270 | dom_entry = netlbl_domhsh_getentry(secattr->domain); | 267 | dom_entry = netlbl_domhsh_getentry(secattr->domain); |
271 | if (dom_entry == NULL) | 268 | if (dom_entry == NULL) |
diff --git a/security/selinux/Makefile b/security/selinux/Makefile index faf2e02e4410..dc3502e30b19 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile | |||
@@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o | |||
8 | 8 | ||
9 | selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o | 9 | selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o |
10 | 10 | ||
11 | selinux-$(CONFIG_NETLABEL) += netlabel.o | ||
12 | |||
11 | EXTRA_CFLAGS += -Isecurity/selinux/include | 13 | EXTRA_CFLAGS += -Isecurity/selinux/include |
12 | 14 | ||
diff --git a/security/selinux/avc.c b/security/selinux/avc.c index da8caf10ef97..e4396a89edc6 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c | |||
@@ -217,6 +217,8 @@ static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tcla | |||
217 | audit_log_format(ab, " tcontext=%s", scontext); | 217 | audit_log_format(ab, " tcontext=%s", scontext); |
218 | kfree(scontext); | 218 | kfree(scontext); |
219 | } | 219 | } |
220 | |||
221 | BUG_ON(tclass >= ARRAY_SIZE(class_to_string) || !class_to_string[tclass]); | ||
220 | audit_log_format(ab, " tclass=%s", class_to_string[tclass]); | 222 | audit_log_format(ab, " tclass=%s", class_to_string[tclass]); |
221 | } | 223 | } |
222 | 224 | ||
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 5f02b4be1917..885a9a958b8d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c | |||
@@ -77,7 +77,7 @@ | |||
77 | #include "objsec.h" | 77 | #include "objsec.h" |
78 | #include "netif.h" | 78 | #include "netif.h" |
79 | #include "xfrm.h" | 79 | #include "xfrm.h" |
80 | #include "selinux_netlabel.h" | 80 | #include "netlabel.h" |
81 | 81 | ||
82 | #define XATTR_SELINUX_SUFFIX "selinux" | 82 | #define XATTR_SELINUX_SUFFIX "selinux" |
83 | #define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX | 83 | #define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX |
@@ -3123,6 +3123,34 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, | |||
3123 | return ret; | 3123 | return ret; |
3124 | } | 3124 | } |
3125 | 3125 | ||
3126 | /** | ||
3127 | * selinux_skb_extlbl_sid - Determine the external label of a packet | ||
3128 | * @skb: the packet | ||
3129 | * @base_sid: the SELinux SID to use as a context for MLS only external labels | ||
3130 | * @sid: the packet's SID | ||
3131 | * | ||
3132 | * Description: | ||
3133 | * Check the various different forms of external packet labeling and determine | ||
3134 | * the external SID for the packet. | ||
3135 | * | ||
3136 | */ | ||
3137 | static void selinux_skb_extlbl_sid(struct sk_buff *skb, | ||
3138 | u32 base_sid, | ||
3139 | u32 *sid) | ||
3140 | { | ||
3141 | u32 xfrm_sid; | ||
3142 | u32 nlbl_sid; | ||
3143 | |||
3144 | selinux_skb_xfrm_sid(skb, &xfrm_sid); | ||
3145 | if (selinux_netlbl_skbuff_getsid(skb, | ||
3146 | (xfrm_sid == SECSID_NULL ? | ||
3147 | base_sid : xfrm_sid), | ||
3148 | &nlbl_sid) != 0) | ||
3149 | nlbl_sid = SECSID_NULL; | ||
3150 | |||
3151 | *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); | ||
3152 | } | ||
3153 | |||
3126 | /* socket security operations */ | 3154 | /* socket security operations */ |
3127 | static int socket_has_perm(struct task_struct *task, struct socket *sock, | 3155 | static int socket_has_perm(struct task_struct *task, struct socket *sock, |
3128 | u32 perms) | 3156 | u32 perms) |
@@ -3664,9 +3692,7 @@ static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff * | |||
3664 | if (sock && sock->sk->sk_family == PF_UNIX) | 3692 | if (sock && sock->sk->sk_family == PF_UNIX) |
3665 | selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); | 3693 | selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); |
3666 | else if (skb) | 3694 | else if (skb) |
3667 | security_skb_extlbl_sid(skb, | 3695 | selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peer_secid); |
3668 | SECINITSID_UNLABELED, | ||
3669 | &peer_secid); | ||
3670 | 3696 | ||
3671 | if (peer_secid == SECSID_NULL) | 3697 | if (peer_secid == SECSID_NULL) |
3672 | err = -EINVAL; | 3698 | err = -EINVAL; |
@@ -3727,7 +3753,7 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, | |||
3727 | u32 newsid; | 3753 | u32 newsid; |
3728 | u32 peersid; | 3754 | u32 peersid; |
3729 | 3755 | ||
3730 | security_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peersid); | 3756 | selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peersid); |
3731 | if (peersid == SECSID_NULL) { | 3757 | if (peersid == SECSID_NULL) { |
3732 | req->secid = sksec->sid; | 3758 | req->secid = sksec->sid; |
3733 | req->peer_secid = SECSID_NULL; | 3759 | req->peer_secid = SECSID_NULL; |
@@ -3765,7 +3791,7 @@ static void selinux_inet_conn_established(struct sock *sk, | |||
3765 | { | 3791 | { |
3766 | struct sk_security_struct *sksec = sk->sk_security; | 3792 | struct sk_security_struct *sksec = sk->sk_security; |
3767 | 3793 | ||
3768 | security_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &sksec->peer_sid); | 3794 | selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &sksec->peer_sid); |
3769 | } | 3795 | } |
3770 | 3796 | ||
3771 | static void selinux_req_classify_flow(const struct request_sock *req, | 3797 | static void selinux_req_classify_flow(const struct request_sock *req, |
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index ad9fb2d69b50..b83e74012a97 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h | |||
@@ -128,96 +128,6 @@ | |||
128 | S_(SECCLASS_CAPABILITY, CAPABILITY__LEASE, "lease") | 128 | S_(SECCLASS_CAPABILITY, CAPABILITY__LEASE, "lease") |
129 | S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_WRITE, "audit_write") | 129 | S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_WRITE, "audit_write") |
130 | S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_CONTROL, "audit_control") | 130 | S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_CONTROL, "audit_control") |
131 | S_(SECCLASS_PASSWD, PASSWD__PASSWD, "passwd") | ||
132 | S_(SECCLASS_PASSWD, PASSWD__CHFN, "chfn") | ||
133 | S_(SECCLASS_PASSWD, PASSWD__CHSH, "chsh") | ||
134 | S_(SECCLASS_PASSWD, PASSWD__ROOTOK, "rootok") | ||
135 | S_(SECCLASS_PASSWD, PASSWD__CRONTAB, "crontab") | ||
136 | S_(SECCLASS_DRAWABLE, DRAWABLE__CREATE, "create") | ||
137 | S_(SECCLASS_DRAWABLE, DRAWABLE__DESTROY, "destroy") | ||
138 | S_(SECCLASS_DRAWABLE, DRAWABLE__DRAW, "draw") | ||
139 | S_(SECCLASS_DRAWABLE, DRAWABLE__COPY, "copy") | ||
140 | S_(SECCLASS_DRAWABLE, DRAWABLE__GETATTR, "getattr") | ||
141 | S_(SECCLASS_GC, GC__CREATE, "create") | ||
142 | S_(SECCLASS_GC, GC__FREE, "free") | ||
143 | S_(SECCLASS_GC, GC__GETATTR, "getattr") | ||
144 | S_(SECCLASS_GC, GC__SETATTR, "setattr") | ||
145 | S_(SECCLASS_WINDOW, WINDOW__ADDCHILD, "addchild") | ||
146 | S_(SECCLASS_WINDOW, WINDOW__CREATE, "create") | ||
147 | S_(SECCLASS_WINDOW, WINDOW__DESTROY, "destroy") | ||
148 | S_(SECCLASS_WINDOW, WINDOW__MAP, "map") | ||
149 | S_(SECCLASS_WINDOW, WINDOW__UNMAP, "unmap") | ||
150 | S_(SECCLASS_WINDOW, WINDOW__CHSTACK, "chstack") | ||
151 | S_(SECCLASS_WINDOW, WINDOW__CHPROPLIST, "chproplist") | ||
152 | S_(SECCLASS_WINDOW, WINDOW__CHPROP, "chprop") | ||
153 | S_(SECCLASS_WINDOW, WINDOW__LISTPROP, "listprop") | ||
154 | S_(SECCLASS_WINDOW, WINDOW__GETATTR, "getattr") | ||
155 | S_(SECCLASS_WINDOW, WINDOW__SETATTR, "setattr") | ||
156 | S_(SECCLASS_WINDOW, WINDOW__SETFOCUS, "setfocus") | ||
157 | S_(SECCLASS_WINDOW, WINDOW__MOVE, "move") | ||
158 | S_(SECCLASS_WINDOW, WINDOW__CHSELECTION, "chselection") | ||
159 | S_(SECCLASS_WINDOW, WINDOW__CHPARENT, "chparent") | ||
160 | S_(SECCLASS_WINDOW, WINDOW__CTRLLIFE, "ctrllife") | ||
161 | S_(SECCLASS_WINDOW, WINDOW__ENUMERATE, "enumerate") | ||
162 | S_(SECCLASS_WINDOW, WINDOW__TRANSPARENT, "transparent") | ||
163 | S_(SECCLASS_WINDOW, WINDOW__MOUSEMOTION, "mousemotion") | ||
164 | S_(SECCLASS_WINDOW, WINDOW__CLIENTCOMEVENT, "clientcomevent") | ||
165 | S_(SECCLASS_WINDOW, WINDOW__INPUTEVENT, "inputevent") | ||
166 | S_(SECCLASS_WINDOW, WINDOW__DRAWEVENT, "drawevent") | ||
167 | S_(SECCLASS_WINDOW, WINDOW__WINDOWCHANGEEVENT, "windowchangeevent") | ||
168 | S_(SECCLASS_WINDOW, WINDOW__WINDOWCHANGEREQUEST, "windowchangerequest") | ||
169 | S_(SECCLASS_WINDOW, WINDOW__SERVERCHANGEEVENT, "serverchangeevent") | ||
170 | S_(SECCLASS_WINDOW, WINDOW__EXTENSIONEVENT, "extensionevent") | ||
171 | S_(SECCLASS_FONT, FONT__LOAD, "load") | ||
172 | S_(SECCLASS_FONT, FONT__FREE, "free") | ||
173 | S_(SECCLASS_FONT, FONT__GETATTR, "getattr") | ||
174 | S_(SECCLASS_FONT, FONT__USE, "use") | ||
175 | S_(SECCLASS_COLORMAP, COLORMAP__CREATE, "create") | ||
176 | S_(SECCLASS_COLORMAP, COLORMAP__FREE, "free") | ||
177 | S_(SECCLASS_COLORMAP, COLORMAP__INSTALL, "install") | ||
178 | S_(SECCLASS_COLORMAP, COLORMAP__UNINSTALL, "uninstall") | ||
179 | S_(SECCLASS_COLORMAP, COLORMAP__LIST, "list") | ||
180 | S_(SECCLASS_COLORMAP, COLORMAP__READ, "read") | ||
181 | S_(SECCLASS_COLORMAP, COLORMAP__STORE, "store") | ||
182 | S_(SECCLASS_COLORMAP, COLORMAP__GETATTR, "getattr") | ||
183 | S_(SECCLASS_COLORMAP, COLORMAP__SETATTR, "setattr") | ||
184 | S_(SECCLASS_PROPERTY, PROPERTY__CREATE, "create") | ||
185 | S_(SECCLASS_PROPERTY, PROPERTY__FREE, "free") | ||
186 | S_(SECCLASS_PROPERTY, PROPERTY__READ, "read") | ||
187 | S_(SECCLASS_PROPERTY, PROPERTY__WRITE, "write") | ||
188 | S_(SECCLASS_CURSOR, CURSOR__CREATE, "create") | ||
189 | S_(SECCLASS_CURSOR, CURSOR__CREATEGLYPH, "createglyph") | ||
190 | S_(SECCLASS_CURSOR, CURSOR__FREE, "free") | ||
191 | S_(SECCLASS_CURSOR, CURSOR__ASSIGN, "assign") | ||
192 | S_(SECCLASS_CURSOR, CURSOR__SETATTR, "setattr") | ||
193 | S_(SECCLASS_XCLIENT, XCLIENT__KILL, "kill") | ||
194 | S_(SECCLASS_XINPUT, XINPUT__LOOKUP, "lookup") | ||
195 | S_(SECCLASS_XINPUT, XINPUT__GETATTR, "getattr") | ||
196 | S_(SECCLASS_XINPUT, XINPUT__SETATTR, "setattr") | ||
197 | S_(SECCLASS_XINPUT, XINPUT__SETFOCUS, "setfocus") | ||
198 | S_(SECCLASS_XINPUT, XINPUT__WARPPOINTER, "warppointer") | ||
199 | S_(SECCLASS_XINPUT, XINPUT__ACTIVEGRAB, "activegrab") | ||
200 | S_(SECCLASS_XINPUT, XINPUT__PASSIVEGRAB, "passivegrab") | ||
201 | S_(SECCLASS_XINPUT, XINPUT__UNGRAB, "ungrab") | ||
202 | S_(SECCLASS_XINPUT, XINPUT__BELL, "bell") | ||
203 | S_(SECCLASS_XINPUT, XINPUT__MOUSEMOTION, "mousemotion") | ||
204 | S_(SECCLASS_XINPUT, XINPUT__RELABELINPUT, "relabelinput") | ||
205 | S_(SECCLASS_XSERVER, XSERVER__SCREENSAVER, "screensaver") | ||
206 | S_(SECCLASS_XSERVER, XSERVER__GETHOSTLIST, "gethostlist") | ||
207 | S_(SECCLASS_XSERVER, XSERVER__SETHOSTLIST, "sethostlist") | ||
208 | S_(SECCLASS_XSERVER, XSERVER__GETFONTPATH, "getfontpath") | ||
209 | S_(SECCLASS_XSERVER, XSERVER__SETFONTPATH, "setfontpath") | ||
210 | S_(SECCLASS_XSERVER, XSERVER__GETATTR, "getattr") | ||
211 | S_(SECCLASS_XSERVER, XSERVER__GRAB, "grab") | ||
212 | S_(SECCLASS_XSERVER, XSERVER__UNGRAB, "ungrab") | ||
213 | S_(SECCLASS_XEXTENSION, XEXTENSION__QUERY, "query") | ||
214 | S_(SECCLASS_XEXTENSION, XEXTENSION__USE, "use") | ||
215 | S_(SECCLASS_PAX, PAX__PAGEEXEC, "pageexec") | ||
216 | S_(SECCLASS_PAX, PAX__EMUTRAMP, "emutramp") | ||
217 | S_(SECCLASS_PAX, PAX__MPROTECT, "mprotect") | ||
218 | S_(SECCLASS_PAX, PAX__RANDMMAP, "randmmap") | ||
219 | S_(SECCLASS_PAX, PAX__RANDEXEC, "randexec") | ||
220 | S_(SECCLASS_PAX, PAX__SEGMEXEC, "segmexec") | ||
221 | S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ, "nlmsg_read") | 131 | S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ, "nlmsg_read") |
222 | S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE, "nlmsg_write") | 132 | S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE, "nlmsg_write") |
223 | S_(SECCLASS_NETLINK_FIREWALL_SOCKET, NETLINK_FIREWALL_SOCKET__NLMSG_READ, "nlmsg_read") | 133 | S_(SECCLASS_NETLINK_FIREWALL_SOCKET, NETLINK_FIREWALL_SOCKET__NLMSG_READ, "nlmsg_read") |
@@ -232,16 +142,6 @@ | |||
232 | S_(SECCLASS_NETLINK_AUDIT_SOCKET, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV, "nlmsg_readpriv") | 142 | S_(SECCLASS_NETLINK_AUDIT_SOCKET, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV, "nlmsg_readpriv") |
233 | S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_READ, "nlmsg_read") | 143 | S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_READ, "nlmsg_read") |
234 | S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_WRITE, "nlmsg_write") | 144 | S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_WRITE, "nlmsg_write") |
235 | S_(SECCLASS_DBUS, DBUS__ACQUIRE_SVC, "acquire_svc") | ||
236 | S_(SECCLASS_DBUS, DBUS__SEND_MSG, "send_msg") | ||
237 | S_(SECCLASS_NSCD, NSCD__GETPWD, "getpwd") | ||
238 | S_(SECCLASS_NSCD, NSCD__GETGRP, "getgrp") | ||
239 | S_(SECCLASS_NSCD, NSCD__GETHOST, "gethost") | ||
240 | S_(SECCLASS_NSCD, NSCD__GETSTAT, "getstat") | ||
241 | S_(SECCLASS_NSCD, NSCD__ADMIN, "admin") | ||
242 | S_(SECCLASS_NSCD, NSCD__SHMEMPWD, "shmempwd") | ||
243 | S_(SECCLASS_NSCD, NSCD__SHMEMGRP, "shmemgrp") | ||
244 | S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost") | ||
245 | S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto") | 145 | S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto") |
246 | S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom") | 146 | S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom") |
247 | S_(SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, "setcontext") | 147 | S_(SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, "setcontext") |
@@ -256,7 +156,5 @@ | |||
256 | S_(SECCLASS_KEY, KEY__LINK, "link") | 156 | S_(SECCLASS_KEY, KEY__LINK, "link") |
257 | S_(SECCLASS_KEY, KEY__SETATTR, "setattr") | 157 | S_(SECCLASS_KEY, KEY__SETATTR, "setattr") |
258 | S_(SECCLASS_KEY, KEY__CREATE, "create") | 158 | S_(SECCLASS_KEY, KEY__CREATE, "create") |
259 | S_(SECCLASS_CONTEXT, CONTEXT__TRANSLATE, "translate") | ||
260 | S_(SECCLASS_CONTEXT, CONTEXT__CONTAINS, "contains") | ||
261 | S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") | 159 | S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") |
262 | S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") | 160 | S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") |
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index 2de4b5fe3aa1..5fee1735bffe 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h | |||
@@ -16,7 +16,6 @@ | |||
16 | #define COMMON_FILE__SWAPON 0x00004000UL | 16 | #define COMMON_FILE__SWAPON 0x00004000UL |
17 | #define COMMON_FILE__QUOTAON 0x00008000UL | 17 | #define COMMON_FILE__QUOTAON 0x00008000UL |
18 | #define COMMON_FILE__MOUNTON 0x00010000UL | 18 | #define COMMON_FILE__MOUNTON 0x00010000UL |
19 | |||
20 | #define COMMON_SOCKET__IOCTL 0x00000001UL | 19 | #define COMMON_SOCKET__IOCTL 0x00000001UL |
21 | #define COMMON_SOCKET__READ 0x00000002UL | 20 | #define COMMON_SOCKET__READ 0x00000002UL |
22 | #define COMMON_SOCKET__WRITE 0x00000004UL | 21 | #define COMMON_SOCKET__WRITE 0x00000004UL |
@@ -39,7 +38,6 @@ | |||
39 | #define COMMON_SOCKET__RECV_MSG 0x00080000UL | 38 | #define COMMON_SOCKET__RECV_MSG 0x00080000UL |
40 | #define COMMON_SOCKET__SEND_MSG 0x00100000UL | 39 | #define COMMON_SOCKET__SEND_MSG 0x00100000UL |
41 | #define COMMON_SOCKET__NAME_BIND 0x00200000UL | 40 | #define COMMON_SOCKET__NAME_BIND 0x00200000UL |
42 | |||
43 | #define COMMON_IPC__CREATE 0x00000001UL | 41 | #define COMMON_IPC__CREATE 0x00000001UL |
44 | #define COMMON_IPC__DESTROY 0x00000002UL | 42 | #define COMMON_IPC__DESTROY 0x00000002UL |
45 | #define COMMON_IPC__GETATTR 0x00000004UL | 43 | #define COMMON_IPC__GETATTR 0x00000004UL |
@@ -49,7 +47,6 @@ | |||
49 | #define COMMON_IPC__ASSOCIATE 0x00000040UL | 47 | #define COMMON_IPC__ASSOCIATE 0x00000040UL |
50 | #define COMMON_IPC__UNIX_READ 0x00000080UL | 48 | #define COMMON_IPC__UNIX_READ 0x00000080UL |
51 | #define COMMON_IPC__UNIX_WRITE 0x00000100UL | 49 | #define COMMON_IPC__UNIX_WRITE 0x00000100UL |
52 | |||
53 | #define FILESYSTEM__MOUNT 0x00000001UL | 50 | #define FILESYSTEM__MOUNT 0x00000001UL |
54 | #define FILESYSTEM__REMOUNT 0x00000002UL | 51 | #define FILESYSTEM__REMOUNT 0x00000002UL |
55 | #define FILESYSTEM__UNMOUNT 0x00000004UL | 52 | #define FILESYSTEM__UNMOUNT 0x00000004UL |
@@ -60,7 +57,6 @@ | |||
60 | #define FILESYSTEM__ASSOCIATE 0x00000080UL | 57 | #define FILESYSTEM__ASSOCIATE 0x00000080UL |
61 | #define FILESYSTEM__QUOTAMOD 0x00000100UL | 58 | #define FILESYSTEM__QUOTAMOD 0x00000100UL |
62 | #define FILESYSTEM__QUOTAGET 0x00000200UL | 59 | #define FILESYSTEM__QUOTAGET 0x00000200UL |
63 | |||
64 | #define DIR__IOCTL 0x00000001UL | 60 | #define DIR__IOCTL 0x00000001UL |
65 | #define DIR__READ 0x00000002UL | 61 | #define DIR__READ 0x00000002UL |
66 | #define DIR__WRITE 0x00000004UL | 62 | #define DIR__WRITE 0x00000004UL |
@@ -78,13 +74,11 @@ | |||
78 | #define DIR__SWAPON 0x00004000UL | 74 | #define DIR__SWAPON 0x00004000UL |
79 | #define DIR__QUOTAON 0x00008000UL | 75 | #define DIR__QUOTAON 0x00008000UL |
80 | #define DIR__MOUNTON 0x00010000UL | 76 | #define DIR__MOUNTON 0x00010000UL |
81 | |||
82 | #define DIR__ADD_NAME 0x00020000UL | 77 | #define DIR__ADD_NAME 0x00020000UL |
83 | #define DIR__REMOVE_NAME 0x00040000UL | 78 | #define DIR__REMOVE_NAME 0x00040000UL |
84 | #define DIR__REPARENT 0x00080000UL | 79 | #define DIR__REPARENT 0x00080000UL |
85 | #define DIR__SEARCH 0x00100000UL | 80 | #define DIR__SEARCH 0x00100000UL |
86 | #define DIR__RMDIR 0x00200000UL | 81 | #define DIR__RMDIR 0x00200000UL |
87 | |||
88 | #define FILE__IOCTL 0x00000001UL | 82 | #define FILE__IOCTL 0x00000001UL |
89 | #define FILE__READ 0x00000002UL | 83 | #define FILE__READ 0x00000002UL |
90 | #define FILE__WRITE 0x00000004UL | 84 | #define FILE__WRITE 0x00000004UL |
@@ -102,11 +96,9 @@ | |||
102 | #define FILE__SWAPON 0x00004000UL | 96 | #define FILE__SWAPON 0x00004000UL |
103 | #define FILE__QUOTAON 0x00008000UL | 97 | #define FILE__QUOTAON 0x00008000UL |
104 | #define FILE__MOUNTON 0x00010000UL | 98 | #define FILE__MOUNTON 0x00010000UL |
105 | |||
106 | #define FILE__EXECUTE_NO_TRANS 0x00020000UL | 99 | #define FILE__EXECUTE_NO_TRANS 0x00020000UL |
107 | #define FILE__ENTRYPOINT 0x00040000UL | 100 | #define FILE__ENTRYPOINT 0x00040000UL |
108 | #define FILE__EXECMOD 0x00080000UL | 101 | #define FILE__EXECMOD 0x00080000UL |
109 | |||
110 | #define LNK_FILE__IOCTL 0x00000001UL | 102 | #define LNK_FILE__IOCTL 0x00000001UL |
111 | #define LNK_FILE__READ 0x00000002UL | 103 | #define LNK_FILE__READ 0x00000002UL |
112 | #define LNK_FILE__WRITE 0x00000004UL | 104 | #define LNK_FILE__WRITE 0x00000004UL |
@@ -124,7 +116,6 @@ | |||
124 | #define LNK_FILE__SWAPON 0x00004000UL | 116 | #define LNK_FILE__SWAPON 0x00004000UL |
125 | #define LNK_FILE__QUOTAON 0x00008000UL | 117 | #define LNK_FILE__QUOTAON 0x00008000UL |
126 | #define LNK_FILE__MOUNTON 0x00010000UL | 118 | #define LNK_FILE__MOUNTON 0x00010000UL |
127 | |||
128 | #define CHR_FILE__IOCTL 0x00000001UL | 119 | #define CHR_FILE__IOCTL 0x00000001UL |
129 | #define CHR_FILE__READ 0x00000002UL | 120 | #define CHR_FILE__READ 0x00000002UL |
130 | #define CHR_FILE__WRITE 0x00000004UL | 121 | #define CHR_FILE__WRITE 0x00000004UL |
@@ -142,11 +133,9 @@ | |||
142 | #define CHR_FILE__SWAPON 0x00004000UL | 133 | #define CHR_FILE__SWAPON 0x00004000UL |
143 | #define CHR_FILE__QUOTAON 0x00008000UL | 134 | #define CHR_FILE__QUOTAON 0x00008000UL |
144 | #define CHR_FILE__MOUNTON 0x00010000UL | 135 | #define CHR_FILE__MOUNTON 0x00010000UL |
145 | |||
146 | #define CHR_FILE__EXECUTE_NO_TRANS 0x00020000UL | 136 | #define CHR_FILE__EXECUTE_NO_TRANS 0x00020000UL |
147 | #define CHR_FILE__ENTRYPOINT 0x00040000UL | 137 | #define CHR_FILE__ENTRYPOINT 0x00040000UL |
148 | #define CHR_FILE__EXECMOD 0x00080000UL | 138 | #define CHR_FILE__EXECMOD 0x00080000UL |
149 | |||
150 | #define BLK_FILE__IOCTL 0x00000001UL | 139 | #define BLK_FILE__IOCTL 0x00000001UL |
151 | #define BLK_FILE__READ 0x00000002UL | 140 | #define BLK_FILE__READ 0x00000002UL |
152 | #define BLK_FILE__WRITE 0x00000004UL | 141 | #define BLK_FILE__WRITE 0x00000004UL |
@@ -164,7 +153,6 @@ | |||
164 | #define BLK_FILE__SWAPON 0x00004000UL | 153 | #define BLK_FILE__SWAPON 0x00004000UL |
165 | #define BLK_FILE__QUOTAON 0x00008000UL | 154 | #define BLK_FILE__QUOTAON 0x00008000UL |
166 | #define BLK_FILE__MOUNTON 0x00010000UL | 155 | #define BLK_FILE__MOUNTON 0x00010000UL |
167 | |||
168 | #define SOCK_FILE__IOCTL 0x00000001UL | 156 | #define SOCK_FILE__IOCTL 0x00000001UL |
169 | #define SOCK_FILE__READ 0x00000002UL | 157 | #define SOCK_FILE__READ 0x00000002UL |
170 | #define SOCK_FILE__WRITE 0x00000004UL | 158 | #define SOCK_FILE__WRITE 0x00000004UL |
@@ -182,7 +170,6 @@ | |||
182 | #define SOCK_FILE__SWAPON 0x00004000UL | 170 | #define SOCK_FILE__SWAPON 0x00004000UL |
183 | #define SOCK_FILE__QUOTAON 0x00008000UL | 171 | #define SOCK_FILE__QUOTAON 0x00008000UL |
184 | #define SOCK_FILE__MOUNTON 0x00010000UL | 172 | #define SOCK_FILE__MOUNTON 0x00010000UL |
185 | |||
186 | #define FIFO_FILE__IOCTL 0x00000001UL | 173 | #define FIFO_FILE__IOCTL 0x00000001UL |
187 | #define FIFO_FILE__READ 0x00000002UL | 174 | #define FIFO_FILE__READ 0x00000002UL |
188 | #define FIFO_FILE__WRITE 0x00000004UL | 175 | #define FIFO_FILE__WRITE 0x00000004UL |
@@ -200,9 +187,7 @@ | |||
200 | #define FIFO_FILE__SWAPON 0x00004000UL | 187 | #define FIFO_FILE__SWAPON 0x00004000UL |
201 | #define FIFO_FILE__QUOTAON 0x00008000UL | 188 | #define FIFO_FILE__QUOTAON 0x00008000UL |
202 | #define FIFO_FILE__MOUNTON 0x00010000UL | 189 | #define FIFO_FILE__MOUNTON 0x00010000UL |
203 | |||
204 | #define FD__USE 0x00000001UL | 190 | #define FD__USE 0x00000001UL |
205 | |||
206 | #define SOCKET__IOCTL 0x00000001UL | 191 | #define SOCKET__IOCTL 0x00000001UL |
207 | #define SOCKET__READ 0x00000002UL | 192 | #define SOCKET__READ 0x00000002UL |
208 | #define SOCKET__WRITE 0x00000004UL | 193 | #define SOCKET__WRITE 0x00000004UL |
@@ -225,7 +210,6 @@ | |||
225 | #define SOCKET__RECV_MSG 0x00080000UL | 210 | #define SOCKET__RECV_MSG 0x00080000UL |
226 | #define SOCKET__SEND_MSG 0x00100000UL | 211 | #define SOCKET__SEND_MSG 0x00100000UL |
227 | #define SOCKET__NAME_BIND 0x00200000UL | 212 | #define SOCKET__NAME_BIND 0x00200000UL |
228 | |||
229 | #define TCP_SOCKET__IOCTL 0x00000001UL | 213 | #define TCP_SOCKET__IOCTL 0x00000001UL |
230 | #define TCP_SOCKET__READ 0x00000002UL | 214 | #define TCP_SOCKET__READ 0x00000002UL |
231 | #define TCP_SOCKET__WRITE 0x00000004UL | 215 | #define TCP_SOCKET__WRITE 0x00000004UL |
@@ -248,13 +232,11 @@ | |||
248 | #define TCP_SOCKET__RECV_MSG 0x00080000UL | 232 | #define TCP_SOCKET__RECV_MSG 0x00080000UL |
249 | #define TCP_SOCKET__SEND_MSG 0x00100000UL | 233 | #define TCP_SOCKET__SEND_MSG 0x00100000UL |
250 | #define TCP_SOCKET__NAME_BIND 0x00200000UL | 234 | #define TCP_SOCKET__NAME_BIND 0x00200000UL |
251 | |||
252 | #define TCP_SOCKET__CONNECTTO 0x00400000UL | 235 | #define TCP_SOCKET__CONNECTTO 0x00400000UL |
253 | #define TCP_SOCKET__NEWCONN 0x00800000UL | 236 | #define TCP_SOCKET__NEWCONN 0x00800000UL |
254 | #define TCP_SOCKET__ACCEPTFROM 0x01000000UL | 237 | #define TCP_SOCKET__ACCEPTFROM 0x01000000UL |
255 | #define TCP_SOCKET__NODE_BIND 0x02000000UL | 238 | #define TCP_SOCKET__NODE_BIND 0x02000000UL |
256 | #define TCP_SOCKET__NAME_CONNECT 0x04000000UL | 239 | #define TCP_SOCKET__NAME_CONNECT 0x04000000UL |
257 | |||
258 | #define UDP_SOCKET__IOCTL 0x00000001UL | 240 | #define UDP_SOCKET__IOCTL 0x00000001UL |
259 | #define UDP_SOCKET__READ 0x00000002UL | 241 | #define UDP_SOCKET__READ 0x00000002UL |
260 | #define UDP_SOCKET__WRITE 0x00000004UL | 242 | #define UDP_SOCKET__WRITE 0x00000004UL |
@@ -277,9 +259,7 @@ | |||
277 | #define UDP_SOCKET__RECV_MSG 0x00080000UL | 259 | #define UDP_SOCKET__RECV_MSG 0x00080000UL |
278 | #define UDP_SOCKET__SEND_MSG 0x00100000UL | 260 | #define UDP_SOCKET__SEND_MSG 0x00100000UL |
279 | #define UDP_SOCKET__NAME_BIND 0x00200000UL | 261 | #define UDP_SOCKET__NAME_BIND 0x00200000UL |
280 | |||
281 | #define UDP_SOCKET__NODE_BIND 0x00400000UL | 262 | #define UDP_SOCKET__NODE_BIND 0x00400000UL |
282 | |||
283 | #define RAWIP_SOCKET__IOCTL 0x00000001UL | 263 | #define RAWIP_SOCKET__IOCTL 0x00000001UL |
284 | #define RAWIP_SOCKET__READ 0x00000002UL | 264 | #define RAWIP_SOCKET__READ 0x00000002UL |
285 | #define RAWIP_SOCKET__WRITE 0x00000004UL | 265 | #define RAWIP_SOCKET__WRITE 0x00000004UL |
@@ -302,9 +282,7 @@ | |||
302 | #define RAWIP_SOCKET__RECV_MSG 0x00080000UL | 282 | #define RAWIP_SOCKET__RECV_MSG 0x00080000UL |
303 | #define RAWIP_SOCKET__SEND_MSG 0x00100000UL | 283 | #define RAWIP_SOCKET__SEND_MSG 0x00100000UL |
304 | #define RAWIP_SOCKET__NAME_BIND 0x00200000UL | 284 | #define RAWIP_SOCKET__NAME_BIND 0x00200000UL |
305 | |||
306 | #define RAWIP_SOCKET__NODE_BIND 0x00400000UL | 285 | #define RAWIP_SOCKET__NODE_BIND 0x00400000UL |
307 | |||
308 | #define NODE__TCP_RECV 0x00000001UL | 286 | #define NODE__TCP_RECV 0x00000001UL |
309 | #define NODE__TCP_SEND 0x00000002UL | 287 | #define NODE__TCP_SEND 0x00000002UL |
310 | #define NODE__UDP_RECV 0x00000004UL | 288 | #define NODE__UDP_RECV 0x00000004UL |
@@ -314,7 +292,6 @@ | |||
314 | #define NODE__ENFORCE_DEST 0x00000040UL | 292 | #define NODE__ENFORCE_DEST 0x00000040UL |
315 | #define NODE__DCCP_RECV 0x00000080UL | 293 | #define NODE__DCCP_RECV 0x00000080UL |
316 | #define NODE__DCCP_SEND 0x00000100UL | 294 | #define NODE__DCCP_SEND 0x00000100UL |
317 | |||
318 | #define NETIF__TCP_RECV 0x00000001UL | 295 | #define NETIF__TCP_RECV 0x00000001UL |
319 | #define NETIF__TCP_SEND 0x00000002UL | 296 | #define NETIF__TCP_SEND 0x00000002UL |
320 | #define NETIF__UDP_RECV 0x00000004UL | 297 | #define NETIF__UDP_RECV 0x00000004UL |
@@ -323,7 +300,6 @@ | |||
323 | #define NETIF__RAWIP_SEND 0x00000020UL | 300 | #define NETIF__RAWIP_SEND 0x00000020UL |
324 | #define NETIF__DCCP_RECV 0x00000040UL | 301 | #define NETIF__DCCP_RECV 0x00000040UL |
325 | #define NETIF__DCCP_SEND 0x00000080UL | 302 | #define NETIF__DCCP_SEND 0x00000080UL |
326 | |||
327 | #define NETLINK_SOCKET__IOCTL 0x00000001UL | 303 | #define NETLINK_SOCKET__IOCTL 0x00000001UL |
328 | #define NETLINK_SOCKET__READ 0x00000002UL | 304 | #define NETLINK_SOCKET__READ 0x00000002UL |
329 | #define NETLINK_SOCKET__WRITE 0x00000004UL | 305 | #define NETLINK_SOCKET__WRITE 0x00000004UL |
@@ -346,7 +322,6 @@ | |||
346 | #define NETLINK_SOCKET__RECV_MSG 0x00080000UL | 322 | #define NETLINK_SOCKET__RECV_MSG 0x00080000UL |
347 | #define NETLINK_SOCKET__SEND_MSG 0x00100000UL | 323 | #define NETLINK_SOCKET__SEND_MSG 0x00100000UL |
348 | #define NETLINK_SOCKET__NAME_BIND 0x00200000UL | 324 | #define NETLINK_SOCKET__NAME_BIND 0x00200000UL |
349 | |||
350 | #define PACKET_SOCKET__IOCTL 0x00000001UL | 325 | #define PACKET_SOCKET__IOCTL 0x00000001UL |
351 | #define PACKET_SOCKET__READ 0x00000002UL | 326 | #define PACKET_SOCKET__READ 0x00000002UL |
352 | #define PACKET_SOCKET__WRITE 0x00000004UL | 327 | #define PACKET_SOCKET__WRITE 0x00000004UL |
@@ -369,7 +344,6 @@ | |||
369 | #define PACKET_SOCKET__RECV_MSG 0x00080000UL | 344 | #define PACKET_SOCKET__RECV_MSG 0x00080000UL |
370 | #define PACKET_SOCKET__SEND_MSG 0x00100000UL | 345 | #define PACKET_SOCKET__SEND_MSG 0x00100000UL |
371 | #define PACKET_SOCKET__NAME_BIND 0x00200000UL | 346 | #define PACKET_SOCKET__NAME_BIND 0x00200000UL |
372 | |||
373 | #define KEY_SOCKET__IOCTL 0x00000001UL | 347 | #define KEY_SOCKET__IOCTL 0x00000001UL |
374 | #define KEY_SOCKET__READ 0x00000002UL | 348 | #define KEY_SOCKET__READ 0x00000002UL |
375 | #define KEY_SOCKET__WRITE 0x00000004UL | 349 | #define KEY_SOCKET__WRITE 0x00000004UL |
@@ -392,7 +366,6 @@ | |||
392 | #define KEY_SOCKET__RECV_MSG 0x00080000UL | 366 | #define KEY_SOCKET__RECV_MSG 0x00080000UL |
393 | #define KEY_SOCKET__SEND_MSG 0x00100000UL | 367 | #define KEY_SOCKET__SEND_MSG 0x00100000UL |
394 | #define KEY_SOCKET__NAME_BIND 0x00200000UL | 368 | #define KEY_SOCKET__NAME_BIND 0x00200000UL |
395 | |||
396 | #define UNIX_STREAM_SOCKET__IOCTL 0x00000001UL | 369 | #define UNIX_STREAM_SOCKET__IOCTL 0x00000001UL |
397 | #define UNIX_STREAM_SOCKET__READ 0x00000002UL | 370 | #define UNIX_STREAM_SOCKET__READ 0x00000002UL |
398 | #define UNIX_STREAM_SOCKET__WRITE 0x00000004UL | 371 | #define UNIX_STREAM_SOCKET__WRITE 0x00000004UL |
@@ -415,11 +388,9 @@ | |||
415 | #define UNIX_STREAM_SOCKET__RECV_MSG 0x00080000UL | 388 | #define UNIX_STREAM_SOCKET__RECV_MSG 0x00080000UL |
416 | #define UNIX_STREAM_SOCKET__SEND_MSG 0x00100000UL | 389 | #define UNIX_STREAM_SOCKET__SEND_MSG 0x00100000UL |
417 | #define UNIX_STREAM_SOCKET__NAME_BIND 0x00200000UL | 390 | #define UNIX_STREAM_SOCKET__NAME_BIND 0x00200000UL |
418 | |||
419 | #define UNIX_STREAM_SOCKET__CONNECTTO 0x00400000UL | 391 | #define UNIX_STREAM_SOCKET__CONNECTTO 0x00400000UL |
420 | #define UNIX_STREAM_SOCKET__NEWCONN 0x00800000UL | 392 | #define UNIX_STREAM_SOCKET__NEWCONN 0x00800000UL |
421 | #define UNIX_STREAM_SOCKET__ACCEPTFROM 0x01000000UL | 393 | #define UNIX_STREAM_SOCKET__ACCEPTFROM 0x01000000UL |
422 | |||
423 | #define UNIX_DGRAM_SOCKET__IOCTL 0x00000001UL | 394 | #define UNIX_DGRAM_SOCKET__IOCTL 0x00000001UL |
424 | #define UNIX_DGRAM_SOCKET__READ 0x00000002UL | 395 | #define UNIX_DGRAM_SOCKET__READ 0x00000002UL |
425 | #define UNIX_DGRAM_SOCKET__WRITE 0x00000004UL | 396 | #define UNIX_DGRAM_SOCKET__WRITE 0x00000004UL |
@@ -442,7 +413,6 @@ | |||
442 | #define UNIX_DGRAM_SOCKET__RECV_MSG 0x00080000UL | 413 | #define UNIX_DGRAM_SOCKET__RECV_MSG 0x00080000UL |
443 | #define UNIX_DGRAM_SOCKET__SEND_MSG 0x00100000UL | 414 | #define UNIX_DGRAM_SOCKET__SEND_MSG 0x00100000UL |
444 | #define UNIX_DGRAM_SOCKET__NAME_BIND 0x00200000UL | 415 | #define UNIX_DGRAM_SOCKET__NAME_BIND 0x00200000UL |
445 | |||
446 | #define PROCESS__FORK 0x00000001UL | 416 | #define PROCESS__FORK 0x00000001UL |
447 | #define PROCESS__TRANSITION 0x00000002UL | 417 | #define PROCESS__TRANSITION 0x00000002UL |
448 | #define PROCESS__SIGCHLD 0x00000004UL | 418 | #define PROCESS__SIGCHLD 0x00000004UL |
@@ -473,7 +443,6 @@ | |||
473 | #define PROCESS__EXECHEAP 0x08000000UL | 443 | #define PROCESS__EXECHEAP 0x08000000UL |
474 | #define PROCESS__SETKEYCREATE 0x10000000UL | 444 | #define PROCESS__SETKEYCREATE 0x10000000UL |
475 | #define PROCESS__SETSOCKCREATE 0x20000000UL | 445 | #define PROCESS__SETSOCKCREATE 0x20000000UL |
476 | |||
477 | #define IPC__CREATE 0x00000001UL | 446 | #define IPC__CREATE 0x00000001UL |
478 | #define IPC__DESTROY 0x00000002UL | 447 | #define IPC__DESTROY 0x00000002UL |
479 | #define IPC__GETATTR 0x00000004UL | 448 | #define IPC__GETATTR 0x00000004UL |
@@ -483,7 +452,6 @@ | |||
483 | #define IPC__ASSOCIATE 0x00000040UL | 452 | #define IPC__ASSOCIATE 0x00000040UL |
484 | #define IPC__UNIX_READ 0x00000080UL | 453 | #define IPC__UNIX_READ 0x00000080UL |
485 | #define IPC__UNIX_WRITE 0x00000100UL | 454 | #define IPC__UNIX_WRITE 0x00000100UL |
486 | |||
487 | #define SEM__CREATE 0x00000001UL | 455 | #define SEM__CREATE 0x00000001UL |
488 | #define SEM__DESTROY 0x00000002UL | 456 | #define SEM__DESTROY 0x00000002UL |
489 | #define SEM__GETATTR 0x00000004UL | 457 | #define SEM__GETATTR 0x00000004UL |
@@ -493,7 +461,6 @@ | |||
493 | #define SEM__ASSOCIATE 0x00000040UL | 461 | #define SEM__ASSOCIATE 0x00000040UL |
494 | #define SEM__UNIX_READ 0x00000080UL | 462 | #define SEM__UNIX_READ 0x00000080UL |
495 | #define SEM__UNIX_WRITE 0x00000100UL | 463 | #define SEM__UNIX_WRITE 0x00000100UL |
496 | |||
497 | #define MSGQ__CREATE 0x00000001UL | 464 | #define MSGQ__CREATE 0x00000001UL |
498 | #define MSGQ__DESTROY 0x00000002UL | 465 | #define MSGQ__DESTROY 0x00000002UL |
499 | #define MSGQ__GETATTR 0x00000004UL | 466 | #define MSGQ__GETATTR 0x00000004UL |
@@ -503,12 +470,9 @@ | |||
503 | #define MSGQ__ASSOCIATE 0x00000040UL | 470 | #define MSGQ__ASSOCIATE 0x00000040UL |
504 | #define MSGQ__UNIX_READ 0x00000080UL | 471 | #define MSGQ__UNIX_READ 0x00000080UL |
505 | #define MSGQ__UNIX_WRITE 0x00000100UL | 472 | #define MSGQ__UNIX_WRITE 0x00000100UL |
506 | |||
507 | #define MSGQ__ENQUEUE 0x00000200UL | 473 | #define MSGQ__ENQUEUE 0x00000200UL |
508 | |||
509 | #define MSG__SEND 0x00000001UL | 474 | #define MSG__SEND 0x00000001UL |
510 | #define MSG__RECEIVE 0x00000002UL | 475 | #define MSG__RECEIVE 0x00000002UL |
511 | |||
512 | #define SHM__CREATE 0x00000001UL | 476 | #define SHM__CREATE 0x00000001UL |
513 | #define SHM__DESTROY 0x00000002UL | 477 | #define SHM__DESTROY 0x00000002UL |
514 | #define SHM__GETATTR 0x00000004UL | 478 | #define SHM__GETATTR 0x00000004UL |
@@ -518,9 +482,7 @@ | |||
518 | #define SHM__ASSOCIATE 0x00000040UL | 482 | #define SHM__ASSOCIATE 0x00000040UL |
519 | #define SHM__UNIX_READ 0x00000080UL | 483 | #define SHM__UNIX_READ 0x00000080UL |
520 | #define SHM__UNIX_WRITE 0x00000100UL | 484 | #define SHM__UNIX_WRITE 0x00000100UL |
521 | |||
522 | #define SHM__LOCK 0x00000200UL | 485 | #define SHM__LOCK 0x00000200UL |
523 | |||
524 | #define SECURITY__COMPUTE_AV 0x00000001UL | 486 | #define SECURITY__COMPUTE_AV 0x00000001UL |
525 | #define SECURITY__COMPUTE_CREATE 0x00000002UL | 487 | #define SECURITY__COMPUTE_CREATE 0x00000002UL |
526 | #define SECURITY__COMPUTE_MEMBER 0x00000004UL | 488 | #define SECURITY__COMPUTE_MEMBER 0x00000004UL |
@@ -532,12 +494,10 @@ | |||
532 | #define SECURITY__SETBOOL 0x00000100UL | 494 | #define SECURITY__SETBOOL 0x00000100UL |
533 | #define SECURITY__SETSECPARAM 0x00000200UL | 495 | #define SECURITY__SETSECPARAM 0x00000200UL |
534 | #define SECURITY__SETCHECKREQPROT 0x00000400UL | 496 | #define SECURITY__SETCHECKREQPROT 0x00000400UL |
535 | |||
536 | #define SYSTEM__IPC_INFO 0x00000001UL | 497 | #define SYSTEM__IPC_INFO 0x00000001UL |
537 | #define SYSTEM__SYSLOG_READ 0x00000002UL | 498 | #define SYSTEM__SYSLOG_READ 0x00000002UL |
538 | #define SYSTEM__SYSLOG_MOD 0x00000004UL | 499 | #define SYSTEM__SYSLOG_MOD 0x00000004UL |
539 | #define SYSTEM__SYSLOG_CONSOLE 0x00000008UL | 500 | #define SYSTEM__SYSLOG_CONSOLE 0x00000008UL |
540 | |||
541 | #define CAPABILITY__CHOWN 0x00000001UL | 501 | #define CAPABILITY__CHOWN 0x00000001UL |
542 | #define CAPABILITY__DAC_OVERRIDE 0x00000002UL | 502 | #define CAPABILITY__DAC_OVERRIDE 0x00000002UL |
543 | #define CAPABILITY__DAC_READ_SEARCH 0x00000004UL | 503 | #define CAPABILITY__DAC_READ_SEARCH 0x00000004UL |
@@ -569,110 +529,6 @@ | |||
569 | #define CAPABILITY__LEASE 0x10000000UL | 529 | #define CAPABILITY__LEASE 0x10000000UL |
570 | #define CAPABILITY__AUDIT_WRITE 0x20000000UL | 530 | #define CAPABILITY__AUDIT_WRITE 0x20000000UL |
571 | #define CAPABILITY__AUDIT_CONTROL 0x40000000UL | 531 | #define CAPABILITY__AUDIT_CONTROL 0x40000000UL |
572 | |||
573 | #define PASSWD__PASSWD 0x00000001UL | ||
574 | #define PASSWD__CHFN 0x00000002UL | ||
575 | #define PASSWD__CHSH 0x00000004UL | ||
576 | #define PASSWD__ROOTOK 0x00000008UL | ||
577 | #define PASSWD__CRONTAB 0x00000010UL | ||
578 | |||
579 | #define DRAWABLE__CREATE 0x00000001UL | ||
580 | #define DRAWABLE__DESTROY 0x00000002UL | ||
581 | #define DRAWABLE__DRAW 0x00000004UL | ||
582 | #define DRAWABLE__COPY 0x00000008UL | ||
583 | #define DRAWABLE__GETATTR 0x00000010UL | ||
584 | |||
585 | #define GC__CREATE 0x00000001UL | ||
586 | #define GC__FREE 0x00000002UL | ||
587 | #define GC__GETATTR 0x00000004UL | ||
588 | #define GC__SETATTR 0x00000008UL | ||
589 | |||
590 | #define WINDOW__ADDCHILD 0x00000001UL | ||
591 | #define WINDOW__CREATE 0x00000002UL | ||
592 | #define WINDOW__DESTROY 0x00000004UL | ||
593 | #define WINDOW__MAP 0x00000008UL | ||
594 | #define WINDOW__UNMAP 0x00000010UL | ||
595 | #define WINDOW__CHSTACK 0x00000020UL | ||
596 | #define WINDOW__CHPROPLIST 0x00000040UL | ||
597 | #define WINDOW__CHPROP 0x00000080UL | ||
598 | #define WINDOW__LISTPROP 0x00000100UL | ||
599 | #define WINDOW__GETATTR 0x00000200UL | ||
600 | #define WINDOW__SETATTR 0x00000400UL | ||
601 | #define WINDOW__SETFOCUS 0x00000800UL | ||
602 | #define WINDOW__MOVE 0x00001000UL | ||
603 | #define WINDOW__CHSELECTION 0x00002000UL | ||
604 | #define WINDOW__CHPARENT 0x00004000UL | ||
605 | #define WINDOW__CTRLLIFE 0x00008000UL | ||
606 | #define WINDOW__ENUMERATE 0x00010000UL | ||
607 | #define WINDOW__TRANSPARENT 0x00020000UL | ||
608 | #define WINDOW__MOUSEMOTION 0x00040000UL | ||
609 | #define WINDOW__CLIENTCOMEVENT 0x00080000UL | ||
610 | #define WINDOW__INPUTEVENT 0x00100000UL | ||
611 | #define WINDOW__DRAWEVENT 0x00200000UL | ||
612 | #define WINDOW__WINDOWCHANGEEVENT 0x00400000UL | ||
613 | #define WINDOW__WINDOWCHANGEREQUEST 0x00800000UL | ||
614 | #define WINDOW__SERVERCHANGEEVENT 0x01000000UL | ||
615 | #define WINDOW__EXTENSIONEVENT 0x02000000UL | ||
616 | |||
617 | #define FONT__LOAD 0x00000001UL | ||
618 | #define FONT__FREE 0x00000002UL | ||
619 | #define FONT__GETATTR 0x00000004UL | ||
620 | #define FONT__USE 0x00000008UL | ||
621 | |||
622 | #define COLORMAP__CREATE 0x00000001UL | ||
623 | #define COLORMAP__FREE 0x00000002UL | ||
624 | #define COLORMAP__INSTALL 0x00000004UL | ||
625 | #define COLORMAP__UNINSTALL 0x00000008UL | ||
626 | #define COLORMAP__LIST 0x00000010UL | ||
627 | #define COLORMAP__READ 0x00000020UL | ||
628 | #define COLORMAP__STORE 0x00000040UL | ||
629 | #define COLORMAP__GETATTR 0x00000080UL | ||
630 | #define COLORMAP__SETATTR 0x00000100UL | ||
631 | |||
632 | #define PROPERTY__CREATE 0x00000001UL | ||
633 | #define PROPERTY__FREE 0x00000002UL | ||
634 | #define PROPERTY__READ 0x00000004UL | ||
635 | #define PROPERTY__WRITE 0x00000008UL | ||
636 | |||
637 | #define CURSOR__CREATE 0x00000001UL | ||
638 | #define CURSOR__CREATEGLYPH 0x00000002UL | ||
639 | #define CURSOR__FREE 0x00000004UL | ||
640 | #define CURSOR__ASSIGN 0x00000008UL | ||
641 | #define CURSOR__SETATTR 0x00000010UL | ||
642 | |||
643 | #define XCLIENT__KILL 0x00000001UL | ||
644 | |||
645 | #define XINPUT__LOOKUP 0x00000001UL | ||
646 | #define XINPUT__GETATTR 0x00000002UL | ||
647 | #define XINPUT__SETATTR 0x00000004UL | ||
648 | #define XINPUT__SETFOCUS 0x00000008UL | ||
649 | #define XINPUT__WARPPOINTER 0x00000010UL | ||
650 | #define XINPUT__ACTIVEGRAB 0x00000020UL | ||
651 | #define XINPUT__PASSIVEGRAB 0x00000040UL | ||
652 | #define XINPUT__UNGRAB 0x00000080UL | ||
653 | #define XINPUT__BELL 0x00000100UL | ||
654 | #define XINPUT__MOUSEMOTION 0x00000200UL | ||
655 | #define XINPUT__RELABELINPUT 0x00000400UL | ||
656 | |||
657 | #define XSERVER__SCREENSAVER 0x00000001UL | ||
658 | #define XSERVER__GETHOSTLIST 0x00000002UL | ||
659 | #define XSERVER__SETHOSTLIST 0x00000004UL | ||
660 | #define XSERVER__GETFONTPATH 0x00000008UL | ||
661 | #define XSERVER__SETFONTPATH 0x00000010UL | ||
662 | #define XSERVER__GETATTR 0x00000020UL | ||
663 | #define XSERVER__GRAB 0x00000040UL | ||
664 | #define XSERVER__UNGRAB 0x00000080UL | ||
665 | |||
666 | #define XEXTENSION__QUERY 0x00000001UL | ||
667 | #define XEXTENSION__USE 0x00000002UL | ||
668 | |||
669 | #define PAX__PAGEEXEC 0x00000001UL | ||
670 | #define PAX__EMUTRAMP 0x00000002UL | ||
671 | #define PAX__MPROTECT 0x00000004UL | ||
672 | #define PAX__RANDMMAP 0x00000008UL | ||
673 | #define PAX__RANDEXEC 0x00000010UL | ||
674 | #define PAX__SEGMEXEC 0x00000020UL | ||
675 | |||
676 | #define NETLINK_ROUTE_SOCKET__IOCTL 0x00000001UL | 532 | #define NETLINK_ROUTE_SOCKET__IOCTL 0x00000001UL |
677 | #define NETLINK_ROUTE_SOCKET__READ 0x00000002UL | 533 | #define NETLINK_ROUTE_SOCKET__READ 0x00000002UL |
678 | #define NETLINK_ROUTE_SOCKET__WRITE 0x00000004UL | 534 | #define NETLINK_ROUTE_SOCKET__WRITE 0x00000004UL |
@@ -695,10 +551,8 @@ | |||
695 | #define NETLINK_ROUTE_SOCKET__RECV_MSG 0x00080000UL | 551 | #define NETLINK_ROUTE_SOCKET__RECV_MSG 0x00080000UL |
696 | #define NETLINK_ROUTE_SOCKET__SEND_MSG 0x00100000UL | 552 | #define NETLINK_ROUTE_SOCKET__SEND_MSG 0x00100000UL |
697 | #define NETLINK_ROUTE_SOCKET__NAME_BIND 0x00200000UL | 553 | #define NETLINK_ROUTE_SOCKET__NAME_BIND 0x00200000UL |
698 | |||
699 | #define NETLINK_ROUTE_SOCKET__NLMSG_READ 0x00400000UL | 554 | #define NETLINK_ROUTE_SOCKET__NLMSG_READ 0x00400000UL |
700 | #define NETLINK_ROUTE_SOCKET__NLMSG_WRITE 0x00800000UL | 555 | #define NETLINK_ROUTE_SOCKET__NLMSG_WRITE 0x00800000UL |
701 | |||
702 | #define NETLINK_FIREWALL_SOCKET__IOCTL 0x00000001UL | 556 | #define NETLINK_FIREWALL_SOCKET__IOCTL 0x00000001UL |
703 | #define NETLINK_FIREWALL_SOCKET__READ 0x00000002UL | 557 | #define NETLINK_FIREWALL_SOCKET__READ 0x00000002UL |
704 | #define NETLINK_FIREWALL_SOCKET__WRITE 0x00000004UL | 558 | #define NETLINK_FIREWALL_SOCKET__WRITE 0x00000004UL |
@@ -721,10 +575,8 @@ | |||
721 | #define NETLINK_FIREWALL_SOCKET__RECV_MSG 0x00080000UL | 575 | #define NETLINK_FIREWALL_SOCKET__RECV_MSG 0x00080000UL |
722 | #define NETLINK_FIREWALL_SOCKET__SEND_MSG 0x00100000UL | 576 | #define NETLINK_FIREWALL_SOCKET__SEND_MSG 0x00100000UL |
723 | #define NETLINK_FIREWALL_SOCKET__NAME_BIND 0x00200000UL | 577 | #define NETLINK_FIREWALL_SOCKET__NAME_BIND 0x00200000UL |
724 | |||
725 | #define NETLINK_FIREWALL_SOCKET__NLMSG_READ 0x00400000UL | 578 | #define NETLINK_FIREWALL_SOCKET__NLMSG_READ 0x00400000UL |
726 | #define NETLINK_FIREWALL_SOCKET__NLMSG_WRITE 0x00800000UL | 579 | #define NETLINK_FIREWALL_SOCKET__NLMSG_WRITE 0x00800000UL |
727 | |||
728 | #define NETLINK_TCPDIAG_SOCKET__IOCTL 0x00000001UL | 580 | #define NETLINK_TCPDIAG_SOCKET__IOCTL 0x00000001UL |
729 | #define NETLINK_TCPDIAG_SOCKET__READ 0x00000002UL | 581 | #define NETLINK_TCPDIAG_SOCKET__READ 0x00000002UL |
730 | #define NETLINK_TCPDIAG_SOCKET__WRITE 0x00000004UL | 582 | #define NETLINK_TCPDIAG_SOCKET__WRITE 0x00000004UL |
@@ -747,10 +599,8 @@ | |||
747 | #define NETLINK_TCPDIAG_SOCKET__RECV_MSG 0x00080000UL | 599 | #define NETLINK_TCPDIAG_SOCKET__RECV_MSG 0x00080000UL |
748 | #define NETLINK_TCPDIAG_SOCKET__SEND_MSG 0x00100000UL | 600 | #define NETLINK_TCPDIAG_SOCKET__SEND_MSG 0x00100000UL |
749 | #define NETLINK_TCPDIAG_SOCKET__NAME_BIND 0x00200000UL | 601 | #define NETLINK_TCPDIAG_SOCKET__NAME_BIND 0x00200000UL |
750 | |||
751 | #define NETLINK_TCPDIAG_SOCKET__NLMSG_READ 0x00400000UL | 602 | #define NETLINK_TCPDIAG_SOCKET__NLMSG_READ 0x00400000UL |
752 | #define NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE 0x00800000UL | 603 | #define NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE 0x00800000UL |
753 | |||
754 | #define NETLINK_NFLOG_SOCKET__IOCTL 0x00000001UL | 604 | #define NETLINK_NFLOG_SOCKET__IOCTL 0x00000001UL |
755 | #define NETLINK_NFLOG_SOCKET__READ 0x00000002UL | 605 | #define NETLINK_NFLOG_SOCKET__READ 0x00000002UL |
756 | #define NETLINK_NFLOG_SOCKET__WRITE 0x00000004UL | 606 | #define NETLINK_NFLOG_SOCKET__WRITE 0x00000004UL |
@@ -773,7 +623,6 @@ | |||
773 | #define NETLINK_NFLOG_SOCKET__RECV_MSG 0x00080000UL | 623 | #define NETLINK_NFLOG_SOCKET__RECV_MSG 0x00080000UL |
774 | #define NETLINK_NFLOG_SOCKET__SEND_MSG 0x00100000UL | 624 | #define NETLINK_NFLOG_SOCKET__SEND_MSG 0x00100000UL |
775 | #define NETLINK_NFLOG_SOCKET__NAME_BIND 0x00200000UL | 625 | #define NETLINK_NFLOG_SOCKET__NAME_BIND 0x00200000UL |
776 | |||
777 | #define NETLINK_XFRM_SOCKET__IOCTL 0x00000001UL | 626 | #define NETLINK_XFRM_SOCKET__IOCTL 0x00000001UL |
778 | #define NETLINK_XFRM_SOCKET__READ 0x00000002UL | 627 | #define NETLINK_XFRM_SOCKET__READ 0x00000002UL |
779 | #define NETLINK_XFRM_SOCKET__WRITE 0x00000004UL | 628 | #define NETLINK_XFRM_SOCKET__WRITE 0x00000004UL |
@@ -796,10 +645,8 @@ | |||
796 | #define NETLINK_XFRM_SOCKET__RECV_MSG 0x00080000UL | 645 | #define NETLINK_XFRM_SOCKET__RECV_MSG 0x00080000UL |
797 | #define NETLINK_XFRM_SOCKET__SEND_MSG 0x00100000UL | 646 | #define NETLINK_XFRM_SOCKET__SEND_MSG 0x00100000UL |
798 | #define NETLINK_XFRM_SOCKET__NAME_BIND 0x00200000UL | 647 | #define NETLINK_XFRM_SOCKET__NAME_BIND 0x00200000UL |
799 | |||
800 | #define NETLINK_XFRM_SOCKET__NLMSG_READ 0x00400000UL | 648 | #define NETLINK_XFRM_SOCKET__NLMSG_READ 0x00400000UL |
801 | #define NETLINK_XFRM_SOCKET__NLMSG_WRITE 0x00800000UL | 649 | #define NETLINK_XFRM_SOCKET__NLMSG_WRITE 0x00800000UL |
802 | |||
803 | #define NETLINK_SELINUX_SOCKET__IOCTL 0x00000001UL | 650 | #define NETLINK_SELINUX_SOCKET__IOCTL 0x00000001UL |
804 | #define NETLINK_SELINUX_SOCKET__READ 0x00000002UL | 651 | #define NETLINK_SELINUX_SOCKET__READ 0x00000002UL |
805 | #define NETLINK_SELINUX_SOCKET__WRITE 0x00000004UL | 652 | #define NETLINK_SELINUX_SOCKET__WRITE 0x00000004UL |
@@ -822,7 +669,6 @@ | |||
822 | #define NETLINK_SELINUX_SOCKET__RECV_MSG 0x00080000UL | 669 | #define NETLINK_SELINUX_SOCKET__RECV_MSG 0x00080000UL |
823 | #define NETLINK_SELINUX_SOCKET__SEND_MSG 0x00100000UL | 670 | #define NETLINK_SELINUX_SOCKET__SEND_MSG 0x00100000UL |
824 | #define NETLINK_SELINUX_SOCKET__NAME_BIND 0x00200000UL | 671 | #define NETLINK_SELINUX_SOCKET__NAME_BIND 0x00200000UL |
825 | |||
826 | #define NETLINK_AUDIT_SOCKET__IOCTL 0x00000001UL | 672 | #define NETLINK_AUDIT_SOCKET__IOCTL 0x00000001UL |
827 | #define NETLINK_AUDIT_SOCKET__READ 0x00000002UL | 673 | #define NETLINK_AUDIT_SOCKET__READ 0x00000002UL |
828 | #define NETLINK_AUDIT_SOCKET__WRITE 0x00000004UL | 674 | #define NETLINK_AUDIT_SOCKET__WRITE 0x00000004UL |
@@ -845,12 +691,10 @@ | |||
845 | #define NETLINK_AUDIT_SOCKET__RECV_MSG 0x00080000UL | 691 | #define NETLINK_AUDIT_SOCKET__RECV_MSG 0x00080000UL |
846 | #define NETLINK_AUDIT_SOCKET__SEND_MSG 0x00100000UL | 692 | #define NETLINK_AUDIT_SOCKET__SEND_MSG 0x00100000UL |
847 | #define NETLINK_AUDIT_SOCKET__NAME_BIND 0x00200000UL | 693 | #define NETLINK_AUDIT_SOCKET__NAME_BIND 0x00200000UL |
848 | |||
849 | #define NETLINK_AUDIT_SOCKET__NLMSG_READ 0x00400000UL | 694 | #define NETLINK_AUDIT_SOCKET__NLMSG_READ 0x00400000UL |
850 | #define NETLINK_AUDIT_SOCKET__NLMSG_WRITE 0x00800000UL | 695 | #define NETLINK_AUDIT_SOCKET__NLMSG_WRITE 0x00800000UL |
851 | #define NETLINK_AUDIT_SOCKET__NLMSG_RELAY 0x01000000UL | 696 | #define NETLINK_AUDIT_SOCKET__NLMSG_RELAY 0x01000000UL |
852 | #define NETLINK_AUDIT_SOCKET__NLMSG_READPRIV 0x02000000UL | 697 | #define NETLINK_AUDIT_SOCKET__NLMSG_READPRIV 0x02000000UL |
853 | |||
854 | #define NETLINK_IP6FW_SOCKET__IOCTL 0x00000001UL | 698 | #define NETLINK_IP6FW_SOCKET__IOCTL 0x00000001UL |
855 | #define NETLINK_IP6FW_SOCKET__READ 0x00000002UL | 699 | #define NETLINK_IP6FW_SOCKET__READ 0x00000002UL |
856 | #define NETLINK_IP6FW_SOCKET__WRITE 0x00000004UL | 700 | #define NETLINK_IP6FW_SOCKET__WRITE 0x00000004UL |
@@ -873,10 +717,8 @@ | |||
873 | #define NETLINK_IP6FW_SOCKET__RECV_MSG 0x00080000UL | 717 | #define NETLINK_IP6FW_SOCKET__RECV_MSG 0x00080000UL |
874 | #define NETLINK_IP6FW_SOCKET__SEND_MSG 0x00100000UL | 718 | #define NETLINK_IP6FW_SOCKET__SEND_MSG 0x00100000UL |
875 | #define NETLINK_IP6FW_SOCKET__NAME_BIND 0x00200000UL | 719 | #define NETLINK_IP6FW_SOCKET__NAME_BIND 0x00200000UL |
876 | |||
877 | #define NETLINK_IP6FW_SOCKET__NLMSG_READ 0x00400000UL | 720 | #define NETLINK_IP6FW_SOCKET__NLMSG_READ 0x00400000UL |
878 | #define NETLINK_IP6FW_SOCKET__NLMSG_WRITE 0x00800000UL | 721 | #define NETLINK_IP6FW_SOCKET__NLMSG_WRITE 0x00800000UL |
879 | |||
880 | #define NETLINK_DNRT_SOCKET__IOCTL 0x00000001UL | 722 | #define NETLINK_DNRT_SOCKET__IOCTL 0x00000001UL |
881 | #define NETLINK_DNRT_SOCKET__READ 0x00000002UL | 723 | #define NETLINK_DNRT_SOCKET__READ 0x00000002UL |
882 | #define NETLINK_DNRT_SOCKET__WRITE 0x00000004UL | 724 | #define NETLINK_DNRT_SOCKET__WRITE 0x00000004UL |
@@ -899,24 +741,10 @@ | |||
899 | #define NETLINK_DNRT_SOCKET__RECV_MSG 0x00080000UL | 741 | #define NETLINK_DNRT_SOCKET__RECV_MSG 0x00080000UL |
900 | #define NETLINK_DNRT_SOCKET__SEND_MSG 0x00100000UL | 742 | #define NETLINK_DNRT_SOCKET__SEND_MSG 0x00100000UL |
901 | #define NETLINK_DNRT_SOCKET__NAME_BIND 0x00200000UL | 743 | #define NETLINK_DNRT_SOCKET__NAME_BIND 0x00200000UL |
902 | |||
903 | #define DBUS__ACQUIRE_SVC 0x00000001UL | ||
904 | #define DBUS__SEND_MSG 0x00000002UL | ||
905 | |||
906 | #define NSCD__GETPWD 0x00000001UL | ||
907 | #define NSCD__GETGRP 0x00000002UL | ||
908 | #define NSCD__GETHOST 0x00000004UL | ||
909 | #define NSCD__GETSTAT 0x00000008UL | ||
910 | #define NSCD__ADMIN 0x00000010UL | ||
911 | #define NSCD__SHMEMPWD 0x00000020UL | ||
912 | #define NSCD__SHMEMGRP 0x00000040UL | ||
913 | #define NSCD__SHMEMHOST 0x00000080UL | ||
914 | |||
915 | #define ASSOCIATION__SENDTO 0x00000001UL | 744 | #define ASSOCIATION__SENDTO 0x00000001UL |
916 | #define ASSOCIATION__RECVFROM 0x00000002UL | 745 | #define ASSOCIATION__RECVFROM 0x00000002UL |
917 | #define ASSOCIATION__SETCONTEXT 0x00000004UL | 746 | #define ASSOCIATION__SETCONTEXT 0x00000004UL |
918 | #define ASSOCIATION__POLMATCH 0x00000008UL | 747 | #define ASSOCIATION__POLMATCH 0x00000008UL |
919 | |||
920 | #define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL | 748 | #define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL |
921 | #define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL | 749 | #define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL |
922 | #define NETLINK_KOBJECT_UEVENT_SOCKET__WRITE 0x00000004UL | 750 | #define NETLINK_KOBJECT_UEVENT_SOCKET__WRITE 0x00000004UL |
@@ -939,7 +767,6 @@ | |||
939 | #define NETLINK_KOBJECT_UEVENT_SOCKET__RECV_MSG 0x00080000UL | 767 | #define NETLINK_KOBJECT_UEVENT_SOCKET__RECV_MSG 0x00080000UL |
940 | #define NETLINK_KOBJECT_UEVENT_SOCKET__SEND_MSG 0x00100000UL | 768 | #define NETLINK_KOBJECT_UEVENT_SOCKET__SEND_MSG 0x00100000UL |
941 | #define NETLINK_KOBJECT_UEVENT_SOCKET__NAME_BIND 0x00200000UL | 769 | #define NETLINK_KOBJECT_UEVENT_SOCKET__NAME_BIND 0x00200000UL |
942 | |||
943 | #define APPLETALK_SOCKET__IOCTL 0x00000001UL | 770 | #define APPLETALK_SOCKET__IOCTL 0x00000001UL |
944 | #define APPLETALK_SOCKET__READ 0x00000002UL | 771 | #define APPLETALK_SOCKET__READ 0x00000002UL |
945 | #define APPLETALK_SOCKET__WRITE 0x00000004UL | 772 | #define APPLETALK_SOCKET__WRITE 0x00000004UL |
@@ -962,11 +789,9 @@ | |||
962 | #define APPLETALK_SOCKET__RECV_MSG 0x00080000UL | 789 | #define APPLETALK_SOCKET__RECV_MSG 0x00080000UL |
963 | #define APPLETALK_SOCKET__SEND_MSG 0x00100000UL | 790 | #define APPLETALK_SOCKET__SEND_MSG 0x00100000UL |
964 | #define APPLETALK_SOCKET__NAME_BIND 0x00200000UL | 791 | #define APPLETALK_SOCKET__NAME_BIND 0x00200000UL |
965 | |||
966 | #define PACKET__SEND 0x00000001UL | 792 | #define PACKET__SEND 0x00000001UL |
967 | #define PACKET__RECV 0x00000002UL | 793 | #define PACKET__RECV 0x00000002UL |
968 | #define PACKET__RELABELTO 0x00000004UL | 794 | #define PACKET__RELABELTO 0x00000004UL |
969 | |||
970 | #define KEY__VIEW 0x00000001UL | 795 | #define KEY__VIEW 0x00000001UL |
971 | #define KEY__READ 0x00000002UL | 796 | #define KEY__READ 0x00000002UL |
972 | #define KEY__WRITE 0x00000004UL | 797 | #define KEY__WRITE 0x00000004UL |
@@ -974,10 +799,6 @@ | |||
974 | #define KEY__LINK 0x00000010UL | 799 | #define KEY__LINK 0x00000010UL |
975 | #define KEY__SETATTR 0x00000020UL | 800 | #define KEY__SETATTR 0x00000020UL |
976 | #define KEY__CREATE 0x00000040UL | 801 | #define KEY__CREATE 0x00000040UL |
977 | |||
978 | #define CONTEXT__TRANSLATE 0x00000001UL | ||
979 | #define CONTEXT__CONTAINS 0x00000002UL | ||
980 | |||
981 | #define DCCP_SOCKET__IOCTL 0x00000001UL | 802 | #define DCCP_SOCKET__IOCTL 0x00000001UL |
982 | #define DCCP_SOCKET__READ 0x00000002UL | 803 | #define DCCP_SOCKET__READ 0x00000002UL |
983 | #define DCCP_SOCKET__WRITE 0x00000004UL | 804 | #define DCCP_SOCKET__WRITE 0x00000004UL |
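
The permission macros above are single-bit masks within a 32-bit access vector, which is why the userspace-only classes can be dropped without renumbering the remaining kernel permissions. A minimal sketch of how such masks combine into one AVC check follows; example_check(), ssid, tsid and ad are hypothetical, while avc_has_perm() is the real checker used later in this patch:

    /* Sketch: request read and write on a TCP socket in a single AVC check.
     * ssid/tsid are placeholder source/target SIDs for illustration. */
    static int example_check(u32 ssid, u32 tsid, struct avc_audit_data *ad)
    {
            u32 requested = TCP_SOCKET__READ | TCP_SOCKET__WRITE;

            return avc_has_perm(ssid, tsid, SECCLASS_TCP_SOCKET, requested, ad);
    }
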
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h index 9f3ebb1bfae6..378799068441 100644 --- a/security/selinux/include/class_to_string.h +++ b/security/selinux/include/class_to_string.h | |||
@@ -2,7 +2,7 @@ | |||
2 | /* | 2 | /* |
3 | * Security object class definitions | 3 | * Security object class definitions |
4 | */ | 4 | */ |
5 | S_("null") | 5 | S_(NULL) |
6 | S_("security") | 6 | S_("security") |
7 | S_("process") | 7 | S_("process") |
8 | S_("system") | 8 | S_("system") |
@@ -32,19 +32,19 @@ | |||
32 | S_("msgq") | 32 | S_("msgq") |
33 | S_("shm") | 33 | S_("shm") |
34 | S_("ipc") | 34 | S_("ipc") |
35 | S_("passwd") | 35 | S_(NULL) |
36 | S_("drawable") | 36 | S_(NULL) |
37 | S_("window") | 37 | S_(NULL) |
38 | S_("gc") | 38 | S_(NULL) |
39 | S_("font") | 39 | S_(NULL) |
40 | S_("colormap") | 40 | S_(NULL) |
41 | S_("property") | 41 | S_(NULL) |
42 | S_("cursor") | 42 | S_(NULL) |
43 | S_("xclient") | 43 | S_(NULL) |
44 | S_("xinput") | 44 | S_(NULL) |
45 | S_("xserver") | 45 | S_(NULL) |
46 | S_("xextension") | 46 | S_(NULL) |
47 | S_("pax") | 47 | S_(NULL) |
48 | S_("netlink_route_socket") | 48 | S_("netlink_route_socket") |
49 | S_("netlink_firewall_socket") | 49 | S_("netlink_firewall_socket") |
50 | S_("netlink_tcpdiag_socket") | 50 | S_("netlink_tcpdiag_socket") |
@@ -54,12 +54,12 @@ | |||
54 | S_("netlink_audit_socket") | 54 | S_("netlink_audit_socket") |
55 | S_("netlink_ip6fw_socket") | 55 | S_("netlink_ip6fw_socket") |
56 | S_("netlink_dnrt_socket") | 56 | S_("netlink_dnrt_socket") |
57 | S_("dbus") | 57 | S_(NULL) |
58 | S_("nscd") | 58 | S_(NULL) |
59 | S_("association") | 59 | S_("association") |
60 | S_("netlink_kobject_uevent_socket") | 60 | S_("netlink_kobject_uevent_socket") |
61 | S_("appletalk_socket") | 61 | S_("appletalk_socket") |
62 | S_("packet") | 62 | S_("packet") |
63 | S_("key") | 63 | S_("key") |
64 | S_("context") | 64 | S_(NULL) |
65 | S_("dccp_socket") | 65 | S_("dccp_socket") |
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h index 67cef371ee00..35f309f47873 100644 --- a/security/selinux/include/flask.h +++ b/security/selinux/include/flask.h | |||
@@ -34,19 +34,6 @@ | |||
34 | #define SECCLASS_MSGQ 27 | 34 | #define SECCLASS_MSGQ 27 |
35 | #define SECCLASS_SHM 28 | 35 | #define SECCLASS_SHM 28 |
36 | #define SECCLASS_IPC 29 | 36 | #define SECCLASS_IPC 29 |
37 | #define SECCLASS_PASSWD 30 | ||
38 | #define SECCLASS_DRAWABLE 31 | ||
39 | #define SECCLASS_WINDOW 32 | ||
40 | #define SECCLASS_GC 33 | ||
41 | #define SECCLASS_FONT 34 | ||
42 | #define SECCLASS_COLORMAP 35 | ||
43 | #define SECCLASS_PROPERTY 36 | ||
44 | #define SECCLASS_CURSOR 37 | ||
45 | #define SECCLASS_XCLIENT 38 | ||
46 | #define SECCLASS_XINPUT 39 | ||
47 | #define SECCLASS_XSERVER 40 | ||
48 | #define SECCLASS_XEXTENSION 41 | ||
49 | #define SECCLASS_PAX 42 | ||
50 | #define SECCLASS_NETLINK_ROUTE_SOCKET 43 | 37 | #define SECCLASS_NETLINK_ROUTE_SOCKET 43 |
51 | #define SECCLASS_NETLINK_FIREWALL_SOCKET 44 | 38 | #define SECCLASS_NETLINK_FIREWALL_SOCKET 44 |
52 | #define SECCLASS_NETLINK_TCPDIAG_SOCKET 45 | 39 | #define SECCLASS_NETLINK_TCPDIAG_SOCKET 45 |
@@ -56,14 +43,11 @@ | |||
56 | #define SECCLASS_NETLINK_AUDIT_SOCKET 49 | 43 | #define SECCLASS_NETLINK_AUDIT_SOCKET 49 |
57 | #define SECCLASS_NETLINK_IP6FW_SOCKET 50 | 44 | #define SECCLASS_NETLINK_IP6FW_SOCKET 50 |
58 | #define SECCLASS_NETLINK_DNRT_SOCKET 51 | 45 | #define SECCLASS_NETLINK_DNRT_SOCKET 51 |
59 | #define SECCLASS_DBUS 52 | ||
60 | #define SECCLASS_NSCD 53 | ||
61 | #define SECCLASS_ASSOCIATION 54 | 46 | #define SECCLASS_ASSOCIATION 54 |
62 | #define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55 | 47 | #define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55 |
63 | #define SECCLASS_APPLETALK_SOCKET 56 | 48 | #define SECCLASS_APPLETALK_SOCKET 56 |
64 | #define SECCLASS_PACKET 57 | 49 | #define SECCLASS_PACKET 57 |
65 | #define SECCLASS_KEY 58 | 50 | #define SECCLASS_KEY 58 |
66 | #define SECCLASS_CONTEXT 59 | ||
67 | #define SECCLASS_DCCP_SOCKET 60 | 51 | #define SECCLASS_DCCP_SOCKET 60 |
68 | 52 | ||
69 | /* | 53 | /* |
diff --git a/security/selinux/include/selinux_netlabel.h b/security/selinux/include/netlabel.h index 2a732c9033e3..218e3f77c350 100644 --- a/security/selinux/include/selinux_netlabel.h +++ b/security/selinux/include/netlabel.h | |||
@@ -38,19 +38,22 @@ | |||
38 | 38 | ||
39 | #ifdef CONFIG_NETLABEL | 39 | #ifdef CONFIG_NETLABEL |
40 | void selinux_netlbl_cache_invalidate(void); | 40 | void selinux_netlbl_cache_invalidate(void); |
41 | int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); | 41 | |
42 | int selinux_netlbl_socket_post_create(struct socket *sock); | ||
43 | void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); | ||
44 | int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | ||
45 | struct sk_buff *skb, | ||
46 | struct avc_audit_data *ad); | ||
47 | void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, | 42 | void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, |
48 | int family); | 43 | int family); |
49 | void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, | 44 | void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, |
50 | int family); | 45 | int family); |
51 | void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, | 46 | void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, |
52 | struct sk_security_struct *newssec); | 47 | struct sk_security_struct *newssec); |
48 | |||
49 | int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); | ||
50 | |||
51 | void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); | ||
52 | int selinux_netlbl_socket_post_create(struct socket *sock); | ||
53 | int selinux_netlbl_inode_permission(struct inode *inode, int mask); | 53 | int selinux_netlbl_inode_permission(struct inode *inode, int mask); |
54 | int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | ||
55 | struct sk_buff *skb, | ||
56 | struct avc_audit_data *ad); | ||
54 | int selinux_netlbl_socket_setsockopt(struct socket *sock, | 57 | int selinux_netlbl_socket_setsockopt(struct socket *sock, |
55 | int level, | 58 | int level, |
56 | int optname); | 59 | int optname); |
@@ -60,59 +63,53 @@ static inline void selinux_netlbl_cache_invalidate(void) | |||
60 | return; | 63 | return; |
61 | } | 64 | } |
62 | 65 | ||
63 | static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, | 66 | static inline void selinux_netlbl_sk_security_reset( |
64 | u32 base_sid, | 67 | struct sk_security_struct *ssec, |
65 | u32 *sid) | 68 | int family) |
66 | { | 69 | { |
67 | *sid = SECSID_NULL; | 70 | return; |
68 | return 0; | ||
69 | } | 71 | } |
70 | 72 | static inline void selinux_netlbl_sk_security_init( | |
71 | static inline int selinux_netlbl_socket_post_create(struct socket *sock) | 73 | struct sk_security_struct *ssec, |
74 | int family) | ||
72 | { | 75 | { |
73 | return 0; | 76 | return; |
74 | } | 77 | } |
75 | 78 | static inline void selinux_netlbl_sk_security_clone( | |
76 | static inline void selinux_netlbl_sock_graft(struct sock *sk, | 79 | struct sk_security_struct *ssec, |
77 | struct socket *sock) | 80 | struct sk_security_struct *newssec) |
78 | { | 81 | { |
79 | return; | 82 | return; |
80 | } | 83 | } |
81 | 84 | ||
82 | static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | 85 | static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, |
83 | struct sk_buff *skb, | 86 | u32 base_sid, |
84 | struct avc_audit_data *ad) | 87 | u32 *sid) |
85 | { | 88 | { |
89 | *sid = SECSID_NULL; | ||
86 | return 0; | 90 | return 0; |
87 | } | 91 | } |
88 | 92 | ||
89 | static inline void selinux_netlbl_sk_security_reset( | 93 | static inline void selinux_netlbl_sock_graft(struct sock *sk, |
90 | struct sk_security_struct *ssec, | 94 | struct socket *sock) |
91 | int family) | ||
92 | { | ||
93 | return; | ||
94 | } | ||
95 | |||
96 | static inline void selinux_netlbl_sk_security_init( | ||
97 | struct sk_security_struct *ssec, | ||
98 | int family) | ||
99 | { | 95 | { |
100 | return; | 96 | return; |
101 | } | 97 | } |
102 | 98 | static inline int selinux_netlbl_socket_post_create(struct socket *sock) | |
103 | static inline void selinux_netlbl_sk_security_clone( | ||
104 | struct sk_security_struct *ssec, | ||
105 | struct sk_security_struct *newssec) | ||
106 | { | 99 | { |
107 | return; | 100 | return 0; |
108 | } | 101 | } |
109 | |||
110 | static inline int selinux_netlbl_inode_permission(struct inode *inode, | 102 | static inline int selinux_netlbl_inode_permission(struct inode *inode, |
111 | int mask) | 103 | int mask) |
112 | { | 104 | { |
113 | return 0; | 105 | return 0; |
114 | } | 106 | } |
115 | 107 | static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | |
108 | struct sk_buff *skb, | ||
109 | struct avc_audit_data *ad) | ||
110 | { | ||
111 | return 0; | ||
112 | } | ||
116 | static inline int selinux_netlbl_socket_setsockopt(struct socket *sock, | 113 | static inline int selinux_netlbl_socket_setsockopt(struct socket *sock, |
117 | int level, | 114 | int level, |
118 | int optname) | 115 | int optname) |
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 210eec77e7ff..b94378afea25 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h | |||
@@ -34,7 +34,7 @@ | |||
34 | #define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS | 34 | #define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS |
35 | #endif | 35 | #endif |
36 | 36 | ||
37 | struct sk_buff; | 37 | struct netlbl_lsm_secattr; |
38 | 38 | ||
39 | extern int selinux_enabled; | 39 | extern int selinux_enabled; |
40 | extern int selinux_mls_enabled; | 40 | extern int selinux_mls_enabled; |
@@ -82,8 +82,6 @@ int security_netif_sid(char *name, u32 *if_sid, | |||
82 | int security_node_sid(u16 domain, void *addr, u32 addrlen, | 82 | int security_node_sid(u16 domain, void *addr, u32 addrlen, |
83 | u32 *out_sid); | 83 | u32 *out_sid); |
84 | 84 | ||
85 | void security_skb_extlbl_sid(struct sk_buff *skb, u32 base_sid, u32 *sid); | ||
86 | |||
87 | int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, | 85 | int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, |
88 | u16 tclass); | 86 | u16 tclass); |
89 | 87 | ||
@@ -102,5 +100,30 @@ int security_fs_use(const char *fstype, unsigned int *behavior, | |||
102 | int security_genfs_sid(const char *fstype, char *name, u16 sclass, | 100 | int security_genfs_sid(const char *fstype, char *name, u16 sclass, |
103 | u32 *sid); | 101 | u32 *sid); |
104 | 102 | ||
103 | #ifdef CONFIG_NETLABEL | ||
104 | int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, | ||
105 | u32 base_sid, | ||
106 | u32 *sid); | ||
107 | |||
108 | int security_netlbl_sid_to_secattr(u32 sid, | ||
109 | struct netlbl_lsm_secattr *secattr); | ||
110 | #else | ||
111 | static inline int security_netlbl_secattr_to_sid( | ||
112 | struct netlbl_lsm_secattr *secattr, | ||
113 | u32 base_sid, | ||
114 | u32 *sid) | ||
115 | { | ||
116 | return -EIDRM; | ||
117 | } | ||
118 | |||
119 | static inline int security_netlbl_sid_to_secattr(u32 sid, | ||
120 | struct netlbl_lsm_secattr *secattr) | ||
121 | { | ||
122 | return -ENOENT; | ||
123 | } | ||
124 | #endif /* CONFIG_NETLABEL */ | ||
125 | |||
126 | const char *security_get_initial_sid_context(u32 sid); | ||
127 | |||
105 | #endif /* _SELINUX_SECURITY_H_ */ | 128 | #endif /* _SELINUX_SECURITY_H_ */ |
106 | 129 | ||
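
With CONFIG_NETLABEL disabled, the inline stubs above return a negative errno (-EIDRM / -ENOENT), so callers can degrade gracefully without extra #ifdefs. A hypothetical caller fragment (secattr and sid are assumed locals, initialized elsewhere):

    /* Sketch: fall back to an unlabeled SID when the conversion is
     * unavailable or fails. */
    if (security_netlbl_secattr_to_sid(&secattr, SECINITSID_UNLABELED, &sid) != 0)
            sid = SECSID_NULL;
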
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c new file mode 100644 index 000000000000..bf8750791dd1 --- /dev/null +++ b/security/selinux/netlabel.c | |||
@@ -0,0 +1,363 @@ | |||
1 | /* | ||
2 | * SELinux NetLabel Support | ||
3 | * | ||
4 | * This file provides the necessary glue to tie NetLabel into the SELinux | ||
5 | * subsystem. | ||
6 | * | ||
7 | * Author: Paul Moore <paul.moore@hp.com> | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify | ||
15 | * it under the terms of the GNU General Public License as published by | ||
16 | * the Free Software Foundation; either version 2 of the License, or | ||
17 | * (at your option) any later version. | ||
18 | * | ||
19 | * This program is distributed in the hope that it will be useful, | ||
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
22 | * the GNU General Public License for more details. | ||
23 | * | ||
24 | * You should have received a copy of the GNU General Public License | ||
25 | * along with this program; if not, write to the Free Software | ||
26 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
27 | * | ||
28 | */ | ||
29 | |||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/rcupdate.h> | ||
32 | #include <net/sock.h> | ||
33 | #include <net/netlabel.h> | ||
34 | |||
35 | #include "objsec.h" | ||
36 | #include "security.h" | ||
37 | |||
38 | /** | ||
39 | * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism | ||
40 | * @sock: the socket to label | ||
41 | * @sid: the SID to use | ||
42 | * | ||
43 | * Description: | ||
44 | * Attempt to label a socket via the NetLabel mechanism with the given | ||
45 | * SID. Returns zero on success, negative values on failure. The | ||
46 | * caller is responsible for calling rcu_read_lock() before calling | ||
47 | * this function and rcu_read_unlock() after this function returns. | ||
48 | * | ||
49 | */ | ||
50 | static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid) | ||
51 | { | ||
52 | int rc; | ||
53 | struct sk_security_struct *sksec = sock->sk->sk_security; | ||
54 | struct netlbl_lsm_secattr secattr; | ||
55 | |||
56 | rc = security_netlbl_sid_to_secattr(sid, &secattr); | ||
57 | if (rc != 0) | ||
58 | return rc; | ||
59 | |||
60 | rc = netlbl_socket_setattr(sock, &secattr); | ||
61 | if (rc == 0) { | ||
62 | spin_lock_bh(&sksec->nlbl_lock); | ||
63 | sksec->nlbl_state = NLBL_LABELED; | ||
64 | spin_unlock_bh(&sksec->nlbl_lock); | ||
65 | } | ||
66 | |||
67 | return rc; | ||
68 | } | ||
69 | |||
70 | /** | ||
71 | * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache | ||
72 | * | ||
73 | * Description: | ||
74 | * Invalidate the NetLabel security attribute mapping cache. | ||
75 | * | ||
76 | */ | ||
77 | void selinux_netlbl_cache_invalidate(void) | ||
78 | { | ||
79 | netlbl_cache_invalidate(); | ||
80 | } | ||
81 | |||
82 | /** | ||
83 | * selinux_netlbl_sk_security_reset - Reset the NetLabel fields | ||
84 | * @ssec: the sk_security_struct | ||
85 | * @family: the socket family | ||
86 | * | ||
87 | * Description: | ||
88 | * Called when the NetLabel state of a sk_security_struct needs to be reset. | ||
89 | * The caller is responsible for all the NetLabel sk_security_struct locking. | ||
90 | * | ||
91 | */ | ||
92 | void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, | ||
93 | int family) | ||
94 | { | ||
95 | if (family == PF_INET) | ||
96 | ssec->nlbl_state = NLBL_REQUIRE; | ||
97 | else | ||
98 | ssec->nlbl_state = NLBL_UNSET; | ||
99 | } | ||
100 | |||
101 | /** | ||
102 | * selinux_netlbl_sk_security_init - Setup the NetLabel fields | ||
103 | * @ssec: the sk_security_struct | ||
104 | * @family: the socket family | ||
105 | * | ||
106 | * Description: | ||
107 | * Called when a new sk_security_struct is allocated to initialize the NetLabel | ||
108 | * fields. | ||
109 | * | ||
110 | */ | ||
111 | void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, | ||
112 | int family) | ||
113 | { | ||
114 | /* No locking needed, we are the only one who has access to ssec */ | ||
115 | selinux_netlbl_sk_security_reset(ssec, family); | ||
116 | spin_lock_init(&ssec->nlbl_lock); | ||
117 | } | ||
118 | |||
119 | /** | ||
120 | * selinux_netlbl_sk_security_clone - Copy the NetLabel fields | ||
121 | * @ssec: the original sk_security_struct | ||
122 | * @newssec: the cloned sk_security_struct | ||
123 | * | ||
124 | * Description: | ||
125 | * Clone the NetLabel specific sk_security_struct fields from @ssec to | ||
126 | * @newssec. | ||
127 | * | ||
128 | */ | ||
129 | void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, | ||
130 | struct sk_security_struct *newssec) | ||
131 | { | ||
132 | /* We don't need to take newssec->nlbl_lock because we are the only | ||
133 | * thread with access to newssec, but we do need to take the RCU read | ||
134 | * lock as other threads could have access to ssec */ | ||
135 | rcu_read_lock(); | ||
136 | selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); | ||
137 | newssec->sclass = ssec->sclass; | ||
138 | rcu_read_unlock(); | ||
139 | } | ||
140 | |||
141 | /** | ||
142 | * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel | ||
143 | * @skb: the packet | ||
144 | * @base_sid: the SELinux SID to use as a context for MLS only attributes | ||
145 | * @sid: the SID | ||
146 | * | ||
147 | * Description: | ||
148 | * Call the NetLabel mechanism to get the security attributes of the given | ||
149 | * packet and use those attributes to determine the correct context/SID to | ||
150 | * assign to the packet. Returns zero on success, negative values on failure. | ||
151 | * | ||
152 | */ | ||
153 | int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) | ||
154 | { | ||
155 | int rc; | ||
156 | struct netlbl_lsm_secattr secattr; | ||
157 | |||
158 | netlbl_secattr_init(&secattr); | ||
159 | rc = netlbl_skbuff_getattr(skb, &secattr); | ||
160 | if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) | ||
161 | rc = security_netlbl_secattr_to_sid(&secattr, | ||
162 | base_sid, | ||
163 | sid); | ||
164 | else | ||
165 | *sid = SECSID_NULL; | ||
166 | netlbl_secattr_destroy(&secattr); | ||
167 | |||
168 | return rc; | ||
169 | } | ||
170 | |||
171 | /** | ||
172 | * selinux_netlbl_sock_graft - Netlabel the new socket | ||
173 | * @sk: the new connection | ||
174 | * @sock: the new socket | ||
175 | * | ||
176 | * Description: | ||
177 | * The connection represented by @sk is being grafted onto @sock so set the | ||
178 | * socket's NetLabel to match the SID of @sk. | ||
179 | * | ||
180 | */ | ||
181 | void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) | ||
182 | { | ||
183 | struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; | ||
184 | struct sk_security_struct *sksec = sk->sk_security; | ||
185 | struct netlbl_lsm_secattr secattr; | ||
186 | u32 nlbl_peer_sid; | ||
187 | |||
188 | sksec->sclass = isec->sclass; | ||
189 | |||
190 | rcu_read_lock(); | ||
191 | |||
192 | if (sksec->nlbl_state != NLBL_REQUIRE) { | ||
193 | rcu_read_unlock(); | ||
194 | return; | ||
195 | } | ||
196 | |||
197 | netlbl_secattr_init(&secattr); | ||
198 | if (netlbl_sock_getattr(sk, &secattr) == 0 && | ||
199 | secattr.flags != NETLBL_SECATTR_NONE && | ||
200 | security_netlbl_secattr_to_sid(&secattr, | ||
201 | SECINITSID_UNLABELED, | ||
202 | &nlbl_peer_sid) == 0) | ||
203 | sksec->peer_sid = nlbl_peer_sid; | ||
204 | netlbl_secattr_destroy(&secattr); | ||
205 | |||
206 | /* Try to set the NetLabel on the socket to save time later; if we fail | ||
207 | * here we will pick up the pieces in later calls to | ||
208 | * selinux_netlbl_inode_permission(). */ | ||
209 | selinux_netlbl_socket_setsid(sock, sksec->sid); | ||
210 | |||
211 | rcu_read_unlock(); | ||
212 | } | ||
213 | |||
214 | /** | ||
215 | * selinux_netlbl_socket_post_create - Label a socket using NetLabel | ||
216 | * @sock: the socket to label | ||
217 | * | ||
218 | * Description: | ||
219 | * Attempt to label a socket via the NetLabel mechanism using the | ||
220 | * socket's SID. Returns zero on success, negative values on failure. | ||
221 | * | ||
222 | */ | ||
223 | int selinux_netlbl_socket_post_create(struct socket *sock) | ||
224 | { | ||
225 | int rc = 0; | ||
226 | struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; | ||
227 | struct sk_security_struct *sksec = sock->sk->sk_security; | ||
228 | |||
229 | sksec->sclass = isec->sclass; | ||
230 | |||
231 | rcu_read_lock(); | ||
232 | if (sksec->nlbl_state == NLBL_REQUIRE) | ||
233 | rc = selinux_netlbl_socket_setsid(sock, sksec->sid); | ||
234 | rcu_read_unlock(); | ||
235 | |||
236 | return rc; | ||
237 | } | ||
238 | |||
239 | /** | ||
240 | * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled | ||
241 | * @inode: the file descriptor's inode | ||
242 | * @mask: the permission mask | ||
243 | * | ||
244 | * Description: | ||
245 | * Looks at a file's inode and, if it is marked as a socket protected by | ||
246 | * NetLabel, verifies that the socket has been labeled; if not, tries to label | ||
247 | * the socket now with the inode's SID. Returns zero on success, negative | ||
248 | * values on failure. | ||
249 | * | ||
250 | */ | ||
251 | int selinux_netlbl_inode_permission(struct inode *inode, int mask) | ||
252 | { | ||
253 | int rc; | ||
254 | struct sk_security_struct *sksec; | ||
255 | struct socket *sock; | ||
256 | |||
257 | if (!S_ISSOCK(inode->i_mode) || | ||
258 | ((mask & (MAY_WRITE | MAY_APPEND)) == 0)) | ||
259 | return 0; | ||
260 | sock = SOCKET_I(inode); | ||
261 | sksec = sock->sk->sk_security; | ||
262 | |||
263 | rcu_read_lock(); | ||
264 | if (sksec->nlbl_state != NLBL_REQUIRE) { | ||
265 | rcu_read_unlock(); | ||
266 | return 0; | ||
267 | } | ||
268 | local_bh_disable(); | ||
269 | bh_lock_sock_nested(sock->sk); | ||
270 | rc = selinux_netlbl_socket_setsid(sock, sksec->sid); | ||
271 | bh_unlock_sock(sock->sk); | ||
272 | local_bh_enable(); | ||
273 | rcu_read_unlock(); | ||
274 | |||
275 | return rc; | ||
276 | } | ||
277 | |||
278 | /** | ||
279 | * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel | ||
280 | * @sksec: the sock's sk_security_struct | ||
281 | * @skb: the packet | ||
282 | * @ad: the audit data | ||
283 | * | ||
284 | * Description: | ||
285 | * Fetch the NetLabel security attributes from @skb and perform an access check | ||
286 | * against the receiving socket. Returns zero on success, negative values on | ||
287 | * error. | ||
288 | * | ||
289 | */ | ||
290 | int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | ||
291 | struct sk_buff *skb, | ||
292 | struct avc_audit_data *ad) | ||
293 | { | ||
294 | int rc; | ||
295 | u32 netlbl_sid; | ||
296 | u32 recv_perm; | ||
297 | |||
298 | rc = selinux_netlbl_skbuff_getsid(skb, | ||
299 | SECINITSID_UNLABELED, | ||
300 | &netlbl_sid); | ||
301 | if (rc != 0) | ||
302 | return rc; | ||
303 | |||
304 | if (netlbl_sid == SECSID_NULL) | ||
305 | return 0; | ||
306 | |||
307 | switch (sksec->sclass) { | ||
308 | case SECCLASS_UDP_SOCKET: | ||
309 | recv_perm = UDP_SOCKET__RECVFROM; | ||
310 | break; | ||
311 | case SECCLASS_TCP_SOCKET: | ||
312 | recv_perm = TCP_SOCKET__RECVFROM; | ||
313 | break; | ||
314 | default: | ||
315 | recv_perm = RAWIP_SOCKET__RECVFROM; | ||
316 | } | ||
317 | |||
318 | rc = avc_has_perm(sksec->sid, | ||
319 | netlbl_sid, | ||
320 | sksec->sclass, | ||
321 | recv_perm, | ||
322 | ad); | ||
323 | if (rc == 0) | ||
324 | return 0; | ||
325 | |||
326 | netlbl_skbuff_err(skb, rc); | ||
327 | return rc; | ||
328 | } | ||
329 | |||
330 | /** | ||
331 | * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel | ||
332 | * @sock: the socket | ||
333 | * @level: the socket level or protocol | ||
334 | * @optname: the socket option name | ||
335 | * | ||
336 | * Description: | ||
337 | * Check the setsockopt() call; if the user is trying to replace the IP | ||
338 | * options on a socket that has a NetLabel in place, deny the access, | ||
339 | * otherwise allow it. Returns zero when the access is | ||
340 | * allowed, -EACCES when denied, and other negative values on error. | ||
341 | * | ||
342 | */ | ||
343 | int selinux_netlbl_socket_setsockopt(struct socket *sock, | ||
344 | int level, | ||
345 | int optname) | ||
346 | { | ||
347 | int rc = 0; | ||
348 | struct sk_security_struct *sksec = sock->sk->sk_security; | ||
349 | struct netlbl_lsm_secattr secattr; | ||
350 | |||
351 | rcu_read_lock(); | ||
352 | if (level == IPPROTO_IP && optname == IP_OPTIONS && | ||
353 | sksec->nlbl_state == NLBL_LABELED) { | ||
354 | netlbl_secattr_init(&secattr); | ||
355 | rc = netlbl_socket_getattr(sock, &secattr); | ||
356 | if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) | ||
357 | rc = -EACCES; | ||
358 | netlbl_secattr_destroy(&secattr); | ||
359 | } | ||
360 | rcu_read_unlock(); | ||
361 | |||
362 | return rc; | ||
363 | } | ||
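
A sketch of the caller side for the helpers defined above; example_sock_rcv_skb() is hypothetical and simplified, standing in for the real LSM hook in hooks.c, which runs its normal socket checks first and then defers the NetLabel portion of the inbound decision to selinux_netlbl_sock_rcv_skb():

    /* Sketch: hand the per-packet NetLabel access check off to the helper. */
    static int example_sock_rcv_skb(struct sock *sk, struct sk_buff *skb,
                                    struct avc_audit_data *ad)
    {
            struct sk_security_struct *sksec = sk->sk_security;

            return selinux_netlbl_sock_rcv_skb(sksec, skb, ad);
    }
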
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 93b3177c7585..aca099aa2ed3 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c | |||
@@ -96,12 +96,18 @@ enum sel_inos { | |||
96 | SEL_COMMIT_BOOLS, /* commit new boolean values */ | 96 | SEL_COMMIT_BOOLS, /* commit new boolean values */ |
97 | SEL_MLS, /* return if MLS policy is enabled */ | 97 | SEL_MLS, /* return if MLS policy is enabled */ |
98 | SEL_DISABLE, /* disable SELinux until next reboot */ | 98 | SEL_DISABLE, /* disable SELinux until next reboot */ |
99 | SEL_AVC, /* AVC management directory */ | ||
100 | SEL_MEMBER, /* compute polyinstantiation membership decision */ | 99 | SEL_MEMBER, /* compute polyinstantiation membership decision */ |
101 | SEL_CHECKREQPROT, /* check requested protection, not kernel-applied one */ | 100 | SEL_CHECKREQPROT, /* check requested protection, not kernel-applied one */ |
102 | SEL_COMPAT_NET, /* whether to use old compat network packet controls */ | 101 | SEL_COMPAT_NET, /* whether to use old compat network packet controls */ |
102 | SEL_INO_NEXT, /* The next inode number to use */ | ||
103 | }; | 103 | }; |
104 | 104 | ||
105 | static unsigned long sel_last_ino = SEL_INO_NEXT - 1; | ||
106 | |||
107 | #define SEL_INITCON_INO_OFFSET 0x01000000 | ||
108 | #define SEL_BOOL_INO_OFFSET 0x02000000 | ||
109 | #define SEL_INO_MASK 0x00ffffff | ||
110 | |||
105 | #define TMPBUFLEN 12 | 111 | #define TMPBUFLEN 12 |
106 | static ssize_t sel_read_enforce(struct file *filp, char __user *buf, | 112 | static ssize_t sel_read_enforce(struct file *filp, char __user *buf, |
107 | size_t count, loff_t *ppos) | 113 | size_t count, loff_t *ppos) |
@@ -777,8 +783,6 @@ static struct inode *sel_make_inode(struct super_block *sb, int mode) | |||
777 | return ret; | 783 | return ret; |
778 | } | 784 | } |
779 | 785 | ||
780 | #define BOOL_INO_OFFSET 30 | ||
781 | |||
782 | static ssize_t sel_read_bool(struct file *filep, char __user *buf, | 786 | static ssize_t sel_read_bool(struct file *filep, char __user *buf, |
783 | size_t count, loff_t *ppos) | 787 | size_t count, loff_t *ppos) |
784 | { | 788 | { |
@@ -806,14 +810,14 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, | |||
806 | } | 810 | } |
807 | 811 | ||
808 | inode = filep->f_path.dentry->d_inode; | 812 | inode = filep->f_path.dentry->d_inode; |
809 | cur_enforcing = security_get_bool_value(inode->i_ino - BOOL_INO_OFFSET); | 813 | cur_enforcing = security_get_bool_value(inode->i_ino&SEL_INO_MASK); |
810 | if (cur_enforcing < 0) { | 814 | if (cur_enforcing < 0) { |
811 | ret = cur_enforcing; | 815 | ret = cur_enforcing; |
812 | goto out; | 816 | goto out; |
813 | } | 817 | } |
814 | 818 | ||
815 | length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, | 819 | length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, |
816 | bool_pending_values[inode->i_ino - BOOL_INO_OFFSET]); | 820 | bool_pending_values[inode->i_ino&SEL_INO_MASK]); |
817 | ret = simple_read_from_buffer(buf, count, ppos, page, length); | 821 | ret = simple_read_from_buffer(buf, count, ppos, page, length); |
818 | out: | 822 | out: |
819 | mutex_unlock(&sel_mutex); | 823 | mutex_unlock(&sel_mutex); |
@@ -865,7 +869,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, | |||
865 | new_value = 1; | 869 | new_value = 1; |
866 | 870 | ||
867 | inode = filep->f_path.dentry->d_inode; | 871 | inode = filep->f_path.dentry->d_inode; |
868 | bool_pending_values[inode->i_ino - BOOL_INO_OFFSET] = new_value; | 872 | bool_pending_values[inode->i_ino&SEL_INO_MASK] = new_value; |
869 | length = count; | 873 | length = count; |
870 | 874 | ||
871 | out: | 875 | out: |
@@ -1029,7 +1033,7 @@ static int sel_make_bools(void) | |||
1029 | isec->sid = sid; | 1033 | isec->sid = sid; |
1030 | isec->initialized = 1; | 1034 | isec->initialized = 1; |
1031 | inode->i_fop = &sel_bool_ops; | 1035 | inode->i_fop = &sel_bool_ops; |
1032 | inode->i_ino = i + BOOL_INO_OFFSET; | 1036 | inode->i_ino = i|SEL_BOOL_INO_OFFSET; |
1033 | d_add(dentry, inode); | 1037 | d_add(dentry, inode); |
1034 | } | 1038 | } |
1035 | bool_num = num; | 1039 | bool_num = num; |
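The selinuxfs hunks above replace the old flat BOOL_INO_OFFSET scheme with namespaced inode numbers: the top byte of i_ino selects the kind of entry (0x01 for initial contexts, 0x02 for booleans), the low 24 bits carry the index, and SEL_INO_MASK recovers that index again on the read/write paths. A standalone illustration of the arithmetic follows (userspace C, with example values chosen only for demonstration).

#include <stdio.h>

#define SEL_INITCON_INO_OFFSET	0x01000000
#define SEL_BOOL_INO_OFFSET	0x02000000
#define SEL_INO_MASK		0x00ffffff

int main(void)
{
	unsigned long i = 5;				/* sixth boolean */
	unsigned long ino = i | SEL_BOOL_INO_OFFSET;	/* 0x02000005 */

	printf("inode %#lx -> boolean index %lu\n", ino, ino & SEL_INO_MASK);

	ino = 3 | SEL_INITCON_INO_OFFSET;		/* fourth initial SID */
	printf("inode %#lx -> initial sid %lu\n", ino, ino & SEL_INO_MASK);
	return 0;
}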
@@ -1234,6 +1238,56 @@ static int sel_make_avc_files(struct dentry *dir) | |||
1234 | goto out; | 1238 | goto out; |
1235 | } | 1239 | } |
1236 | inode->i_fop = files[i].ops; | 1240 | inode->i_fop = files[i].ops; |
1241 | inode->i_ino = ++sel_last_ino; | ||
1242 | d_add(dentry, inode); | ||
1243 | } | ||
1244 | out: | ||
1245 | return ret; | ||
1246 | } | ||
1247 | |||
1248 | static ssize_t sel_read_initcon(struct file * file, char __user *buf, | ||
1249 | size_t count, loff_t *ppos) | ||
1250 | { | ||
1251 | struct inode *inode; | ||
1252 | char *con; | ||
1253 | u32 sid, len; | ||
1254 | ssize_t ret; | ||
1255 | |||
1256 | inode = file->f_path.dentry->d_inode; | ||
1257 | sid = inode->i_ino&SEL_INO_MASK; | ||
1258 | ret = security_sid_to_context(sid, &con, &len); | ||
1259 | if (ret < 0) | ||
1260 | return ret; | ||
1261 | |||
1262 | ret = simple_read_from_buffer(buf, count, ppos, con, len); | ||
1263 | kfree(con); | ||
1264 | return ret; | ||
1265 | } | ||
1266 | |||
1267 | static const struct file_operations sel_initcon_ops = { | ||
1268 | .read = sel_read_initcon, | ||
1269 | }; | ||
1270 | |||
1271 | static int sel_make_initcon_files(struct dentry *dir) | ||
1272 | { | ||
1273 | int i, ret = 0; | ||
1274 | |||
1275 | for (i = 1; i <= SECINITSID_NUM; i++) { | ||
1276 | struct inode *inode; | ||
1277 | struct dentry *dentry; | ||
1278 | dentry = d_alloc_name(dir, security_get_initial_sid_context(i)); | ||
1279 | if (!dentry) { | ||
1280 | ret = -ENOMEM; | ||
1281 | goto out; | ||
1282 | } | ||
1283 | |||
1284 | inode = sel_make_inode(dir->d_sb, S_IFREG|S_IRUGO); | ||
1285 | if (!inode) { | ||
1286 | ret = -ENOMEM; | ||
1287 | goto out; | ||
1288 | } | ||
1289 | inode->i_fop = &sel_initcon_ops; | ||
1290 | inode->i_ino = i|SEL_INITCON_INO_OFFSET; | ||
1237 | d_add(dentry, inode); | 1291 | d_add(dentry, inode); |
1238 | } | 1292 | } |
1239 | out: | 1293 | out: |
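sel_make_initcon_files() creates one read-only file per initial SID, named after security_get_initial_sid_context() and reporting that SID's current context via sel_read_initcon(). A rough userspace sketch of consuming one of these entries is shown below; the /selinux mount point and the "kernel" entry name are assumptions based on the usual selinuxfs layout, not something this patch guarantees.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd = open("/selinux/initial_contexts/kernel", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("read");
		close(fd);
		return 1;
	}
	buf[n] = '\0';
	printf("kernel initial context: %s\n", buf);
	close(fd);
	return 0;
}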
@@ -1252,6 +1306,7 @@ static int sel_make_dir(struct inode *dir, struct dentry *dentry) | |||
1252 | } | 1306 | } |
1253 | inode->i_op = &simple_dir_inode_operations; | 1307 | inode->i_op = &simple_dir_inode_operations; |
1254 | inode->i_fop = &simple_dir_operations; | 1308 | inode->i_fop = &simple_dir_operations; |
1309 | inode->i_ino = ++sel_last_ino; | ||
1255 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ | 1310 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ |
1256 | inc_nlink(inode); | 1311 | inc_nlink(inode); |
1257 | d_add(dentry, inode); | 1312 | d_add(dentry, inode); |
@@ -1314,6 +1369,7 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) | |||
1314 | ret = -ENOMEM; | 1369 | ret = -ENOMEM; |
1315 | goto err; | 1370 | goto err; |
1316 | } | 1371 | } |
1372 | inode->i_ino = ++sel_last_ino; | ||
1317 | isec = (struct inode_security_struct*)inode->i_security; | 1373 | isec = (struct inode_security_struct*)inode->i_security; |
1318 | isec->sid = SECINITSID_DEVNULL; | 1374 | isec->sid = SECINITSID_DEVNULL; |
1319 | isec->sclass = SECCLASS_CHR_FILE; | 1375 | isec->sclass = SECCLASS_CHR_FILE; |
@@ -1336,6 +1392,21 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) | |||
1336 | ret = sel_make_avc_files(dentry); | 1392 | ret = sel_make_avc_files(dentry); |
1337 | if (ret) | 1393 | if (ret) |
1338 | goto err; | 1394 | goto err; |
1395 | |||
1396 | dentry = d_alloc_name(sb->s_root, "initial_contexts"); | ||
1397 | if (!dentry) { | ||
1398 | ret = -ENOMEM; | ||
1399 | goto err; | ||
1400 | } | ||
1401 | |||
1402 | ret = sel_make_dir(root_inode, dentry); | ||
1403 | if (ret) | ||
1404 | goto err; | ||
1405 | |||
1406 | ret = sel_make_initcon_files(dentry); | ||
1407 | if (ret) | ||
1408 | goto err; | ||
1409 | |||
1339 | out: | 1410 | out: |
1340 | return ret; | 1411 | return ret; |
1341 | err: | 1412 | err: |
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 1e52356664d6..40660ffd49b6 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c | |||
@@ -39,7 +39,6 @@ | |||
39 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
40 | #include <linux/audit.h> | 40 | #include <linux/audit.h> |
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <net/sock.h> | ||
43 | #include <net/netlabel.h> | 42 | #include <net/netlabel.h> |
44 | 43 | ||
45 | #include "flask.h" | 44 | #include "flask.h" |
@@ -53,7 +52,7 @@ | |||
53 | #include "conditional.h" | 52 | #include "conditional.h" |
54 | #include "mls.h" | 53 | #include "mls.h" |
55 | #include "objsec.h" | 54 | #include "objsec.h" |
56 | #include "selinux_netlabel.h" | 55 | #include "netlabel.h" |
57 | #include "xfrm.h" | 56 | #include "xfrm.h" |
58 | #include "ebitmap.h" | 57 | #include "ebitmap.h" |
59 | 58 | ||
@@ -594,6 +593,13 @@ static int context_struct_to_string(struct context *context, char **scontext, u3 | |||
594 | 593 | ||
595 | #include "initial_sid_to_string.h" | 594 | #include "initial_sid_to_string.h" |
596 | 595 | ||
596 | const char *security_get_initial_sid_context(u32 sid) | ||
597 | { | ||
598 | if (unlikely(sid > SECINITSID_NUM)) | ||
599 | return NULL; | ||
600 | return initial_sid_to_string[sid]; | ||
601 | } | ||
602 | |||
597 | /** | 603 | /** |
598 | * security_sid_to_context - Obtain a context for a given SID. | 604 | * security_sid_to_context - Obtain a context for a given SID. |
599 | * @sid: security identifier, SID | 605 | * @sid: security identifier, SID |
@@ -1050,6 +1056,8 @@ static int validate_classes(struct policydb *p) | |||
1050 | 1056 | ||
1051 | for (i = 1; i < kdefs->cts_len; i++) { | 1057 | for (i = 1; i < kdefs->cts_len; i++) { |
1052 | def_class = kdefs->class_to_string[i]; | 1058 | def_class = kdefs->class_to_string[i]; |
1059 | if (!def_class) | ||
1060 | continue; | ||
1053 | if (i > p->p_classes.nprim) { | 1061 | if (i > p->p_classes.nprim) { |
1054 | printk(KERN_INFO | 1062 | printk(KERN_INFO |
1055 | "security: class %s not defined in policy\n", | 1063 | "security: class %s not defined in policy\n", |
@@ -1249,6 +1257,7 @@ bad: | |||
1249 | } | 1257 | } |
1250 | 1258 | ||
1251 | extern void selinux_complete_init(void); | 1259 | extern void selinux_complete_init(void); |
1260 | static int security_preserve_bools(struct policydb *p); | ||
1252 | 1261 | ||
1253 | /** | 1262 | /** |
1254 | * security_load_policy - Load a security policy configuration. | 1263 | * security_load_policy - Load a security policy configuration. |
@@ -1325,6 +1334,12 @@ int security_load_policy(void *data, size_t len) | |||
1325 | goto err; | 1334 | goto err; |
1326 | } | 1335 | } |
1327 | 1336 | ||
1337 | rc = security_preserve_bools(&newpolicydb); | ||
1338 | if (rc) { | ||
1339 | printk(KERN_ERR "security: unable to preserve booleans\n"); | ||
1340 | goto err; | ||
1341 | } | ||
1342 | |||
1328 | /* Clone the SID table. */ | 1343 | /* Clone the SID table. */ |
1329 | sidtab_shutdown(&sidtab); | 1344 | sidtab_shutdown(&sidtab); |
1330 | if (sidtab_map(&sidtab, clone_sid, &newsidtab)) { | 1345 | if (sidtab_map(&sidtab, clone_sid, &newsidtab)) { |
@@ -1882,6 +1897,37 @@ out: | |||
1882 | return rc; | 1897 | return rc; |
1883 | } | 1898 | } |
1884 | 1899 | ||
1900 | static int security_preserve_bools(struct policydb *p) | ||
1901 | { | ||
1902 | int rc, nbools = 0, *bvalues = NULL, i; | ||
1903 | char **bnames = NULL; | ||
1904 | struct cond_bool_datum *booldatum; | ||
1905 | struct cond_node *cur; | ||
1906 | |||
1907 | rc = security_get_bools(&nbools, &bnames, &bvalues); | ||
1908 | if (rc) | ||
1909 | goto out; | ||
1910 | for (i = 0; i < nbools; i++) { | ||
1911 | booldatum = hashtab_search(p->p_bools.table, bnames[i]); | ||
1912 | if (booldatum) | ||
1913 | booldatum->state = bvalues[i]; | ||
1914 | } | ||
1915 | for (cur = p->cond_list; cur != NULL; cur = cur->next) { | ||
1916 | rc = evaluate_cond_node(p, cur); | ||
1917 | if (rc) | ||
1918 | goto out; | ||
1919 | } | ||
1920 | |||
1921 | out: | ||
1922 | if (bnames) { | ||
1923 | for (i = 0; i < nbools; i++) | ||
1924 | kfree(bnames[i]); | ||
1925 | } | ||
1926 | kfree(bnames); | ||
1927 | kfree(bvalues); | ||
1928 | return rc; | ||
1929 | } | ||
1930 | |||
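security_preserve_bools() is called from security_load_policy() (see the hunk above) so that boolean values toggled at runtime carry over into the newly loaded policy instead of reverting to the policy-file defaults. A hypothetical userspace sequence that relies on this behaviour follows; the selinuxfs paths and the example boolean name are assumptions, not taken from this patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* toggle a boolean and commit the pending value */
	if (write_str("/selinux/booleans/allow_execmem", "1") ||
	    write_str("/selinux/commit_pending_bools", "1")) {
		perror("selinuxfs");
		return 1;
	}
	/* After a later policy reload, reading the boolean back should still
	 * report the committed value thanks to security_preserve_bools(). */
	return 0;
}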
1885 | /* | 1931 | /* |
1886 | * security_sid_mls_copy() - computes a new sid based on the given | 1932 | * security_sid_mls_copy() - computes a new sid based on the given |
1887 | * sid and the mls portion of mls_sid. | 1933 | * sid and the mls portion of mls_sid. |
@@ -2198,41 +2244,15 @@ void selinux_audit_set_callback(int (*callback)(void)) | |||
2198 | aurule_callback = callback; | 2244 | aurule_callback = callback; |
2199 | } | 2245 | } |
2200 | 2246 | ||
2201 | /** | ||
2202 | * security_skb_extlbl_sid - Determine the external label of a packet | ||
2203 | * @skb: the packet | ||
2204 | * @base_sid: the SELinux SID to use as a context for MLS only external labels | ||
2205 | * @sid: the packet's SID | ||
2206 | * | ||
2207 | * Description: | ||
2208 | * Check the various different forms of external packet labeling and determine | ||
2209 | * the external SID for the packet. | ||
2210 | * | ||
2211 | */ | ||
2212 | void security_skb_extlbl_sid(struct sk_buff *skb, u32 base_sid, u32 *sid) | ||
2213 | { | ||
2214 | u32 xfrm_sid; | ||
2215 | u32 nlbl_sid; | ||
2216 | |||
2217 | selinux_skb_xfrm_sid(skb, &xfrm_sid); | ||
2218 | if (selinux_netlbl_skbuff_getsid(skb, | ||
2219 | (xfrm_sid == SECSID_NULL ? | ||
2220 | base_sid : xfrm_sid), | ||
2221 | &nlbl_sid) != 0) | ||
2222 | nlbl_sid = SECSID_NULL; | ||
2223 | |||
2224 | *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); | ||
2225 | } | ||
2226 | |||
2227 | #ifdef CONFIG_NETLABEL | 2247 | #ifdef CONFIG_NETLABEL |
2228 | /* | 2248 | /* |
2229 | * This is the structure we store inside the NetLabel cache block. | 2249 | * NetLabel cache structure |
2230 | */ | 2250 | */ |
2231 | #define NETLBL_CACHE(x) ((struct netlbl_cache *)(x)) | 2251 | #define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x)) |
2232 | #define NETLBL_CACHE_T_NONE 0 | 2252 | #define NETLBL_CACHE_T_NONE 0 |
2233 | #define NETLBL_CACHE_T_SID 1 | 2253 | #define NETLBL_CACHE_T_SID 1 |
2234 | #define NETLBL_CACHE_T_MLS 2 | 2254 | #define NETLBL_CACHE_T_MLS 2 |
2235 | struct netlbl_cache { | 2255 | struct selinux_netlbl_cache { |
2236 | u32 type; | 2256 | u32 type; |
2237 | union { | 2257 | union { |
2238 | u32 sid; | 2258 | u32 sid; |
@@ -2241,7 +2261,7 @@ struct netlbl_cache { | |||
2241 | }; | 2261 | }; |
2242 | 2262 | ||
2243 | /** | 2263 | /** |
2244 | * selinux_netlbl_cache_free - Free the NetLabel cached data | 2264 | * security_netlbl_cache_free - Free the NetLabel cached data |
2245 | * @data: the data to free | 2265 | * @data: the data to free |
2246 | * | 2266 | * |
2247 | * Description: | 2267 | * Description: |
@@ -2249,9 +2269,9 @@ struct netlbl_cache { | |||
2249 | * netlbl_lsm_cache structure. | 2269 | * netlbl_lsm_cache structure. |
2250 | * | 2270 | * |
2251 | */ | 2271 | */ |
2252 | static void selinux_netlbl_cache_free(const void *data) | 2272 | static void security_netlbl_cache_free(const void *data) |
2253 | { | 2273 | { |
2254 | struct netlbl_cache *cache; | 2274 | struct selinux_netlbl_cache *cache; |
2255 | 2275 | ||
2256 | if (data == NULL) | 2276 | if (data == NULL) |
2257 | return; | 2277 | return; |
@@ -2266,33 +2286,33 @@ static void selinux_netlbl_cache_free(const void *data) | |||
2266 | } | 2286 | } |
2267 | 2287 | ||
2268 | /** | 2288 | /** |
2269 | * selinux_netlbl_cache_add - Add an entry to the NetLabel cache | 2289 | * security_netlbl_cache_add - Add an entry to the NetLabel cache |
2270 | * @skb: the packet | 2290 | * @secattr: the NetLabel packet security attributes |
2271 | * @ctx: the SELinux context | 2291 | * @ctx: the SELinux context |
2272 | * | 2292 | * |
2273 | * Description: | 2293 | * Description: |
2274 | * Attempt to cache the context in @ctx, which was derived from the packet in | 2294 | * Attempt to cache the context in @ctx, which was derived from the packet in |
2275 | * @skb, in the NetLabel subsystem cache. | 2295 | * @skb, in the NetLabel subsystem cache. This function assumes @secattr has |
2296 | * already been initialized. | ||
2276 | * | 2297 | * |
2277 | */ | 2298 | */ |
2278 | static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx) | 2299 | static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, |
2300 | struct context *ctx) | ||
2279 | { | 2301 | { |
2280 | struct netlbl_cache *cache = NULL; | 2302 | struct selinux_netlbl_cache *cache = NULL; |
2281 | struct netlbl_lsm_secattr secattr; | ||
2282 | 2303 | ||
2283 | netlbl_secattr_init(&secattr); | 2304 | secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); |
2284 | secattr.cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); | 2305 | if (secattr->cache == NULL) |
2285 | if (secattr.cache == NULL) | 2306 | return; |
2286 | goto netlbl_cache_add_return; | ||
2287 | 2307 | ||
2288 | cache = kzalloc(sizeof(*cache), GFP_ATOMIC); | 2308 | cache = kzalloc(sizeof(*cache), GFP_ATOMIC); |
2289 | if (cache == NULL) | 2309 | if (cache == NULL) |
2290 | goto netlbl_cache_add_return; | 2310 | return; |
2291 | 2311 | ||
2292 | cache->type = NETLBL_CACHE_T_MLS; | 2312 | cache->type = NETLBL_CACHE_T_MLS; |
2293 | if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, | 2313 | if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, |
2294 | &ctx->range.level[0].cat) != 0) | 2314 | &ctx->range.level[0].cat) != 0) |
2295 | goto netlbl_cache_add_return; | 2315 | return; |
2296 | cache->data.mls_label.level[1].cat.highbit = | 2316 | cache->data.mls_label.level[1].cat.highbit = |
2297 | cache->data.mls_label.level[0].cat.highbit; | 2317 | cache->data.mls_label.level[0].cat.highbit; |
2298 | cache->data.mls_label.level[1].cat.node = | 2318 | cache->data.mls_label.level[1].cat.node = |
@@ -2300,52 +2320,40 @@ static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx) | |||
2300 | cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; | 2320 | cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; |
2301 | cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; | 2321 | cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; |
2302 | 2322 | ||
2303 | secattr.cache->free = selinux_netlbl_cache_free; | 2323 | secattr->cache->free = security_netlbl_cache_free; |
2304 | secattr.cache->data = (void *)cache; | 2324 | secattr->cache->data = (void *)cache; |
2305 | secattr.flags = NETLBL_SECATTR_CACHE; | 2325 | secattr->flags |= NETLBL_SECATTR_CACHE; |
2306 | |||
2307 | netlbl_cache_add(skb, &secattr); | ||
2308 | |||
2309 | netlbl_cache_add_return: | ||
2310 | netlbl_secattr_destroy(&secattr); | ||
2311 | } | 2326 | } |
2312 | 2327 | ||
2313 | /** | 2328 | /** |
2314 | * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache | 2329 | * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID |
2315 | * | ||
2316 | * Description: | ||
2317 | * Invalidate the NetLabel security attribute mapping cache. | ||
2318 | * | ||
2319 | */ | ||
2320 | void selinux_netlbl_cache_invalidate(void) | ||
2321 | { | ||
2322 | netlbl_cache_invalidate(); | ||
2323 | } | ||
2324 | |||
2325 | /** | ||
2326 | * selinux_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID | ||
2327 | * @skb: the network packet | ||
2328 | * @secattr: the NetLabel packet security attributes | 2330 | * @secattr: the NetLabel packet security attributes |
2329 | * @base_sid: the SELinux SID to use as a context for MLS only attributes | 2331 | * @base_sid: the SELinux SID to use as a context for MLS only attributes |
2330 | * @sid: the SELinux SID | 2332 | * @sid: the SELinux SID |
2331 | * | 2333 | * |
2332 | * Description: | 2334 | * Description: |
2333 | * Convert the given NetLabel packet security attributes in @secattr into a | 2335 | * Convert the given NetLabel security attributes in @secattr into a |
2334 | * SELinux SID. If the @secattr field does not contain a full SELinux | 2336 | * SELinux SID. If the @secattr field does not contain a full SELinux |
2335 | * SID/context then use the context in @base_sid as the foundation. If @skb | 2337 | * SID/context then use the context in @base_sid as the foundation. If |
2336 | * is not NULL attempt to cache as much data as possibile. Returns zero on | 2338 | * possibile the 'cache' field of @secattr is set and the CACHE flag is set; |
2337 | * success, negative values on failure. | 2339 | * this is to allow the @secattr to be used by NetLabel to cache the secattr to |
2340 | * SID conversion for future lookups. Returns zero on success, negative | ||
2341 | * values on failure. | ||
2338 | * | 2342 | * |
2339 | */ | 2343 | */ |
2340 | static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb, | 2344 | int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, |
2341 | struct netlbl_lsm_secattr *secattr, | 2345 | u32 base_sid, |
2342 | u32 base_sid, | 2346 | u32 *sid) |
2343 | u32 *sid) | ||
2344 | { | 2347 | { |
2345 | int rc = -EIDRM; | 2348 | int rc = -EIDRM; |
2346 | struct context *ctx; | 2349 | struct context *ctx; |
2347 | struct context ctx_new; | 2350 | struct context ctx_new; |
2348 | struct netlbl_cache *cache; | 2351 | struct selinux_netlbl_cache *cache; |
2352 | |||
2353 | if (!ss_initialized) { | ||
2354 | *sid = SECSID_NULL; | ||
2355 | return 0; | ||
2356 | } | ||
2349 | 2357 | ||
2350 | POLICY_RDLOCK; | 2358 | POLICY_RDLOCK; |
2351 | 2359 | ||
@@ -2410,8 +2418,8 @@ static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb, | |||
2410 | if (rc != 0) | 2418 | if (rc != 0) |
2411 | goto netlbl_secattr_to_sid_return_cleanup; | 2419 | goto netlbl_secattr_to_sid_return_cleanup; |
2412 | 2420 | ||
2413 | if (skb != NULL) | 2421 | security_netlbl_cache_add(secattr, &ctx_new); |
2414 | selinux_netlbl_cache_add(skb, &ctx_new); | 2422 | |
2415 | ebitmap_destroy(&ctx_new.range.level[0].cat); | 2423 | ebitmap_destroy(&ctx_new.range.level[0].cat); |
2416 | } else { | 2424 | } else { |
2417 | *sid = SECSID_NULL; | 2425 | *sid = SECSID_NULL; |
@@ -2427,338 +2435,43 @@ netlbl_secattr_to_sid_return_cleanup: | |||
2427 | } | 2435 | } |
2428 | 2436 | ||
2429 | /** | 2437 | /** |
2430 | * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel | 2438 | * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr |
2431 | * @skb: the packet | 2439 | * @sid: the SELinux SID |
2432 | * @base_sid: the SELinux SID to use as a context for MLS only attributes | 2440 | * @secattr: the NetLabel packet security attributes |
2433 | * @sid: the SID | ||
2434 | * | ||
2435 | * Description: | ||
2436 | * Call the NetLabel mechanism to get the security attributes of the given | ||
2437 | * packet and use those attributes to determine the correct context/SID to | ||
2438 | * assign to the packet. Returns zero on success, negative values on failure. | ||
2439 | * | ||
2440 | */ | ||
2441 | int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) | ||
2442 | { | ||
2443 | int rc; | ||
2444 | struct netlbl_lsm_secattr secattr; | ||
2445 | |||
2446 | netlbl_secattr_init(&secattr); | ||
2447 | rc = netlbl_skbuff_getattr(skb, &secattr); | ||
2448 | if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) | ||
2449 | rc = selinux_netlbl_secattr_to_sid(skb, | ||
2450 | &secattr, | ||
2451 | base_sid, | ||
2452 | sid); | ||
2453 | else | ||
2454 | *sid = SECSID_NULL; | ||
2455 | netlbl_secattr_destroy(&secattr); | ||
2456 | |||
2457 | return rc; | ||
2458 | } | ||
2459 | |||
2460 | /** | ||
2461 | * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism | ||
2462 | * @sock: the socket to label | ||
2463 | * @sid: the SID to use | ||
2464 | * | 2441 | * |
2465 | * Description: | 2442 | * Description: |
2466 | * Attempt to label a socket using the NetLabel mechanism using the given | 2443 | * Convert the given SELinux SID in @sid into a NetLabel security attribute. |
2467 | * SID. Returns zero values on success, negative values on failure. The | 2444 | * Returns zero on success, negative values on failure. |
2468 | * caller is responsibile for calling rcu_read_lock() before calling this | ||
2469 | * this function and rcu_read_unlock() after this function returns. | ||
2470 | * | 2445 | * |
2471 | */ | 2446 | */ |
2472 | static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid) | 2447 | int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr) |
2473 | { | 2448 | { |
2474 | int rc = -ENOENT; | 2449 | int rc = -ENOENT; |
2475 | struct sk_security_struct *sksec = sock->sk->sk_security; | ||
2476 | struct netlbl_lsm_secattr secattr; | ||
2477 | struct context *ctx; | 2450 | struct context *ctx; |
2478 | 2451 | ||
2452 | netlbl_secattr_init(secattr); | ||
2453 | |||
2479 | if (!ss_initialized) | 2454 | if (!ss_initialized) |
2480 | return 0; | 2455 | return 0; |
2481 | 2456 | ||
2482 | netlbl_secattr_init(&secattr); | ||
2483 | |||
2484 | POLICY_RDLOCK; | 2457 | POLICY_RDLOCK; |
2485 | |||
2486 | ctx = sidtab_search(&sidtab, sid); | 2458 | ctx = sidtab_search(&sidtab, sid); |
2487 | if (ctx == NULL) | 2459 | if (ctx == NULL) |
2488 | goto netlbl_socket_setsid_return; | 2460 | goto netlbl_sid_to_secattr_failure; |
2489 | 2461 | secattr->domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1], | |
2490 | secattr.domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1], | 2462 | GFP_ATOMIC); |
2491 | GFP_ATOMIC); | 2463 | secattr->flags |= NETLBL_SECATTR_DOMAIN; |
2492 | secattr.flags |= NETLBL_SECATTR_DOMAIN; | 2464 | mls_export_netlbl_lvl(ctx, secattr); |
2493 | mls_export_netlbl_lvl(ctx, &secattr); | 2465 | rc = mls_export_netlbl_cat(ctx, secattr); |
2494 | rc = mls_export_netlbl_cat(ctx, &secattr); | ||
2495 | if (rc != 0) | 2466 | if (rc != 0) |
2496 | goto netlbl_socket_setsid_return; | 2467 | goto netlbl_sid_to_secattr_failure; |
2497 | |||
2498 | rc = netlbl_socket_setattr(sock, &secattr); | ||
2499 | if (rc == 0) { | ||
2500 | spin_lock_bh(&sksec->nlbl_lock); | ||
2501 | sksec->nlbl_state = NLBL_LABELED; | ||
2502 | spin_unlock_bh(&sksec->nlbl_lock); | ||
2503 | } | ||
2504 | |||
2505 | netlbl_socket_setsid_return: | ||
2506 | POLICY_RDUNLOCK; | 2468 | POLICY_RDUNLOCK; |
2507 | netlbl_secattr_destroy(&secattr); | ||
2508 | return rc; | ||
2509 | } | ||
2510 | |||
2511 | /** | ||
2512 | * selinux_netlbl_sk_security_reset - Reset the NetLabel fields | ||
2513 | * @ssec: the sk_security_struct | ||
2514 | * @family: the socket family | ||
2515 | * | ||
2516 | * Description: | ||
2517 | * Called when the NetLabel state of a sk_security_struct needs to be reset. | ||
2518 | * The caller is responsibile for all the NetLabel sk_security_struct locking. | ||
2519 | * | ||
2520 | */ | ||
2521 | void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, | ||
2522 | int family) | ||
2523 | { | ||
2524 | if (family == PF_INET) | ||
2525 | ssec->nlbl_state = NLBL_REQUIRE; | ||
2526 | else | ||
2527 | ssec->nlbl_state = NLBL_UNSET; | ||
2528 | } | ||
2529 | 2469 | ||
2530 | /** | 2470 | return 0; |
2531 | * selinux_netlbl_sk_security_init - Setup the NetLabel fields | ||
2532 | * @ssec: the sk_security_struct | ||
2533 | * @family: the socket family | ||
2534 | * | ||
2535 | * Description: | ||
2536 | * Called when a new sk_security_struct is allocated to initialize the NetLabel | ||
2537 | * fields. | ||
2538 | * | ||
2539 | */ | ||
2540 | void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, | ||
2541 | int family) | ||
2542 | { | ||
2543 | /* No locking needed, we are the only one who has access to ssec */ | ||
2544 | selinux_netlbl_sk_security_reset(ssec, family); | ||
2545 | spin_lock_init(&ssec->nlbl_lock); | ||
2546 | } | ||
2547 | |||
2548 | /** | ||
2549 | * selinux_netlbl_sk_security_clone - Copy the NetLabel fields | ||
2550 | * @ssec: the original sk_security_struct | ||
2551 | * @newssec: the cloned sk_security_struct | ||
2552 | * | ||
2553 | * Description: | ||
2554 | * Clone the NetLabel specific sk_security_struct fields from @ssec to | ||
2555 | * @newssec. | ||
2556 | * | ||
2557 | */ | ||
2558 | void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, | ||
2559 | struct sk_security_struct *newssec) | ||
2560 | { | ||
2561 | /* We don't need to take newssec->nlbl_lock because we are the only | ||
2562 | * thread with access to newssec, but we do need to take the RCU read | ||
2563 | * lock as other threads could have access to ssec */ | ||
2564 | rcu_read_lock(); | ||
2565 | selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); | ||
2566 | newssec->sclass = ssec->sclass; | ||
2567 | rcu_read_unlock(); | ||
2568 | } | ||
2569 | |||
2570 | /** | ||
2571 | * selinux_netlbl_socket_post_create - Label a socket using NetLabel | ||
2572 | * @sock: the socket to label | ||
2573 | * | ||
2574 | * Description: | ||
2575 | * Attempt to label a socket using the NetLabel mechanism using the given | ||
2576 | * SID. Returns zero values on success, negative values on failure. | ||
2577 | * | ||
2578 | */ | ||
2579 | int selinux_netlbl_socket_post_create(struct socket *sock) | ||
2580 | { | ||
2581 | int rc = 0; | ||
2582 | struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; | ||
2583 | struct sk_security_struct *sksec = sock->sk->sk_security; | ||
2584 | |||
2585 | sksec->sclass = isec->sclass; | ||
2586 | |||
2587 | rcu_read_lock(); | ||
2588 | if (sksec->nlbl_state == NLBL_REQUIRE) | ||
2589 | rc = selinux_netlbl_socket_setsid(sock, sksec->sid); | ||
2590 | rcu_read_unlock(); | ||
2591 | |||
2592 | return rc; | ||
2593 | } | ||
2594 | |||
2595 | /** | ||
2596 | * selinux_netlbl_sock_graft - Netlabel the new socket | ||
2597 | * @sk: the new connection | ||
2598 | * @sock: the new socket | ||
2599 | * | ||
2600 | * Description: | ||
2601 | * The connection represented by @sk is being grafted onto @sock so set the | ||
2602 | * socket's NetLabel to match the SID of @sk. | ||
2603 | * | ||
2604 | */ | ||
2605 | void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) | ||
2606 | { | ||
2607 | struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; | ||
2608 | struct sk_security_struct *sksec = sk->sk_security; | ||
2609 | struct netlbl_lsm_secattr secattr; | ||
2610 | u32 nlbl_peer_sid; | ||
2611 | |||
2612 | sksec->sclass = isec->sclass; | ||
2613 | |||
2614 | rcu_read_lock(); | ||
2615 | |||
2616 | if (sksec->nlbl_state != NLBL_REQUIRE) { | ||
2617 | rcu_read_unlock(); | ||
2618 | return; | ||
2619 | } | ||
2620 | |||
2621 | netlbl_secattr_init(&secattr); | ||
2622 | if (netlbl_sock_getattr(sk, &secattr) == 0 && | ||
2623 | secattr.flags != NETLBL_SECATTR_NONE && | ||
2624 | selinux_netlbl_secattr_to_sid(NULL, | ||
2625 | &secattr, | ||
2626 | SECINITSID_UNLABELED, | ||
2627 | &nlbl_peer_sid) == 0) | ||
2628 | sksec->peer_sid = nlbl_peer_sid; | ||
2629 | netlbl_secattr_destroy(&secattr); | ||
2630 | |||
2631 | /* Try to set the NetLabel on the socket to save time later, if we fail | ||
2632 | * here we will pick up the pieces in later calls to | ||
2633 | * selinux_netlbl_inode_permission(). */ | ||
2634 | selinux_netlbl_socket_setsid(sock, sksec->sid); | ||
2635 | |||
2636 | rcu_read_unlock(); | ||
2637 | } | ||
2638 | |||
2639 | /** | ||
2640 | * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled | ||
2641 | * @inode: the file descriptor's inode | ||
2642 | * @mask: the permission mask | ||
2643 | * | ||
2644 | * Description: | ||
2645 | * Looks at a file's inode and if it is marked as a socket protected by | ||
2646 | * NetLabel then verify that the socket has been labeled, if not try to label | ||
2647 | * the socket now with the inode's SID. Returns zero on success, negative | ||
2648 | * values on failure. | ||
2649 | * | ||
2650 | */ | ||
2651 | int selinux_netlbl_inode_permission(struct inode *inode, int mask) | ||
2652 | { | ||
2653 | int rc; | ||
2654 | struct sk_security_struct *sksec; | ||
2655 | struct socket *sock; | ||
2656 | |||
2657 | if (!S_ISSOCK(inode->i_mode) || | ||
2658 | ((mask & (MAY_WRITE | MAY_APPEND)) == 0)) | ||
2659 | return 0; | ||
2660 | sock = SOCKET_I(inode); | ||
2661 | sksec = sock->sk->sk_security; | ||
2662 | |||
2663 | rcu_read_lock(); | ||
2664 | if (sksec->nlbl_state != NLBL_REQUIRE) { | ||
2665 | rcu_read_unlock(); | ||
2666 | return 0; | ||
2667 | } | ||
2668 | local_bh_disable(); | ||
2669 | bh_lock_sock_nested(sock->sk); | ||
2670 | rc = selinux_netlbl_socket_setsid(sock, sksec->sid); | ||
2671 | bh_unlock_sock(sock->sk); | ||
2672 | local_bh_enable(); | ||
2673 | rcu_read_unlock(); | ||
2674 | |||
2675 | return rc; | ||
2676 | } | ||
2677 | |||
2678 | /** | ||
2679 | * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel | ||
2680 | * @sksec: the sock's sk_security_struct | ||
2681 | * @skb: the packet | ||
2682 | * @ad: the audit data | ||
2683 | * | ||
2684 | * Description: | ||
2685 | * Fetch the NetLabel security attributes from @skb and perform an access check | ||
2686 | * against the receiving socket. Returns zero on success, negative values on | ||
2687 | * error. | ||
2688 | * | ||
2689 | */ | ||
2690 | int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, | ||
2691 | struct sk_buff *skb, | ||
2692 | struct avc_audit_data *ad) | ||
2693 | { | ||
2694 | int rc; | ||
2695 | u32 netlbl_sid; | ||
2696 | u32 recv_perm; | ||
2697 | |||
2698 | rc = selinux_netlbl_skbuff_getsid(skb, | ||
2699 | SECINITSID_UNLABELED, | ||
2700 | &netlbl_sid); | ||
2701 | if (rc != 0) | ||
2702 | return rc; | ||
2703 | |||
2704 | if (netlbl_sid == SECSID_NULL) | ||
2705 | return 0; | ||
2706 | |||
2707 | switch (sksec->sclass) { | ||
2708 | case SECCLASS_UDP_SOCKET: | ||
2709 | recv_perm = UDP_SOCKET__RECVFROM; | ||
2710 | break; | ||
2711 | case SECCLASS_TCP_SOCKET: | ||
2712 | recv_perm = TCP_SOCKET__RECVFROM; | ||
2713 | break; | ||
2714 | default: | ||
2715 | recv_perm = RAWIP_SOCKET__RECVFROM; | ||
2716 | } | ||
2717 | |||
2718 | rc = avc_has_perm(sksec->sid, | ||
2719 | netlbl_sid, | ||
2720 | sksec->sclass, | ||
2721 | recv_perm, | ||
2722 | ad); | ||
2723 | if (rc == 0) | ||
2724 | return 0; | ||
2725 | |||
2726 | netlbl_skbuff_err(skb, rc); | ||
2727 | return rc; | ||
2728 | } | ||
2729 | |||
2730 | /** | ||
2731 | * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel | ||
2732 | * @sock: the socket | ||
2733 | * @level: the socket level or protocol | ||
2734 | * @optname: the socket option name | ||
2735 | * | ||
2736 | * Description: | ||
2737 | * Check the setsockopt() call and if the user is trying to replace the IP | ||
2738 | * options on a socket and a NetLabel is in place for the socket deny the | ||
2739 | * access; otherwise allow the access. Returns zero when the access is | ||
2740 | * allowed, -EACCES when denied, and other negative values on error. | ||
2741 | * | ||
2742 | */ | ||
2743 | int selinux_netlbl_socket_setsockopt(struct socket *sock, | ||
2744 | int level, | ||
2745 | int optname) | ||
2746 | { | ||
2747 | int rc = 0; | ||
2748 | struct sk_security_struct *sksec = sock->sk->sk_security; | ||
2749 | struct netlbl_lsm_secattr secattr; | ||
2750 | |||
2751 | rcu_read_lock(); | ||
2752 | if (level == IPPROTO_IP && optname == IP_OPTIONS && | ||
2753 | sksec->nlbl_state == NLBL_LABELED) { | ||
2754 | netlbl_secattr_init(&secattr); | ||
2755 | rc = netlbl_socket_getattr(sock, &secattr); | ||
2756 | if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) | ||
2757 | rc = -EACCES; | ||
2758 | netlbl_secattr_destroy(&secattr); | ||
2759 | } | ||
2760 | rcu_read_unlock(); | ||
2761 | 2471 | ||
2472 | netlbl_sid_to_secattr_failure: | ||
2473 | POLICY_RDUNLOCK; | ||
2474 | netlbl_secattr_destroy(secattr); | ||
2762 | return rc; | 2475 | return rc; |
2763 | } | 2476 | } |
2764 | #endif /* CONFIG_NETLABEL */ | 2477 | #endif /* CONFIG_NETLABEL */ |
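With the sk_buff handling stripped out, security_netlbl_secattr_to_sid() and security_netlbl_sid_to_secattr() become pure secattr-to-SID converters inside the security server, and the packet/socket glue moves out to the SELinux NetLabel code. The sketch below mirrors the removed selinux_netlbl_skbuff_getsid() logic against the new interface to show the expected calling convention; it is kernel-context pseudocode, and everything other than the helpers visible in this patch (the wrapper name in particular) is an assumption.

/* kernel-context sketch, not a literal copy of the new selinux/netlabel.c */
static int example_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
{
	int rc;
	struct netlbl_lsm_secattr secattr;

	netlbl_secattr_init(&secattr);
	rc = netlbl_skbuff_getattr(skb, &secattr);
	if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
		/* may also populate secattr.cache so NetLabel can cache the mapping */
		rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid);
	else
		*sid = SECSID_NULL;
	netlbl_secattr_destroy(&secattr);

	return rc;
}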