-rw-r--r--  MAINTAINERS                                     12
-rw-r--r--  drivers/mtd/Kconfig                              2
-rw-r--r--  drivers/mtd/Makefile                             2
-rw-r--r--  drivers/mtd/ubi/Kconfig                         58
-rw-r--r--  drivers/mtd/ubi/Kconfig.debug                  104
-rw-r--r--  drivers/mtd/ubi/Makefile                         7
-rw-r--r--  drivers/mtd/ubi/build.c                        848
-rw-r--r--  drivers/mtd/ubi/cdev.c                         722
-rw-r--r--  drivers/mtd/ubi/debug.c                        224
-rw-r--r--  drivers/mtd/ubi/debug.h                        161
-rw-r--r--  drivers/mtd/ubi/eba.c                         1241
-rw-r--r--  drivers/mtd/ubi/gluebi.c                       323
-rw-r--r--  drivers/mtd/ubi/io.c                          1259
-rw-r--r--  drivers/mtd/ubi/kapi.c                         575
-rw-r--r--  drivers/mtd/ubi/misc.c                         105
-rw-r--r--  drivers/mtd/ubi/scan.c                        1368
-rw-r--r--  drivers/mtd/ubi/scan.h                         167
-rw-r--r--  drivers/mtd/ubi/ubi.h                          535
-rw-r--r--  drivers/mtd/ubi/upd.c                          348
-rw-r--r--  drivers/mtd/ubi/vmt.c                          809
-rw-r--r--  drivers/mtd/ubi/vtbl.c                         809
-rw-r--r--  drivers/mtd/ubi/wl.c                          1671
-rw-r--r--  fs/jffs2/fs.c                                   12
-rw-r--r--  fs/jffs2/os-linux.h                              6
-rw-r--r--  fs/jffs2/wbuf.c                                 24
-rw-r--r--  fs/ocfs2/alloc.c                              3037
-rw-r--r--  fs/ocfs2/alloc.h                                27
-rw-r--r--  fs/ocfs2/aops.c                               1011
-rw-r--r--  fs/ocfs2/aops.h                                 77
-rw-r--r--  fs/ocfs2/cluster/quorum.c                        5
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h                  5
-rw-r--r--  fs/ocfs2/dir.c                                  15
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c                         5
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c                       2
-rw-r--r--  fs/ocfs2/dlmglue.c                             143
-rw-r--r--  fs/ocfs2/dlmglue.h                               3
-rw-r--r--  fs/ocfs2/extent_map.c                         1233
-rw-r--r--  fs/ocfs2/extent_map.h                           39
-rw-r--r--  fs/ocfs2/file.c                                637
-rw-r--r--  fs/ocfs2/file.h                                  5
-rw-r--r--  fs/ocfs2/inode.c                               199
-rw-r--r--  fs/ocfs2/inode.h                                23
-rw-r--r--  fs/ocfs2/journal.c                              24
-rw-r--r--  fs/ocfs2/journal.h                               2
-rw-r--r--  fs/ocfs2/mmap.c                                  7
-rw-r--r--  fs/ocfs2/namei.c                                23
-rw-r--r--  fs/ocfs2/ocfs2.h                                55
-rw-r--r--  fs/ocfs2/ocfs2_fs.h                             31
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h                          5
-rw-r--r--  fs/ocfs2/slot_map.c                              2
-rw-r--r--  fs/ocfs2/suballoc.c                              3
-rw-r--r--  fs/ocfs2/super.c                                 7
-rw-r--r--  fs/ocfs2/vote.c                                289
-rw-r--r--  fs/ocfs2/vote.h                                  3
-rw-r--r--  fs/sync.c                                        8
-rw-r--r--  include/linux/fs.h                               9
-rw-r--r--  include/linux/mtd/ubi.h                        202
-rw-r--r--  include/linux/sched.h                            4
-rw-r--r--  include/linux/seqlock.h                          8
-rw-r--r--  include/mtd/Kbuild                               2
-rw-r--r--  include/mtd/mtd-abi.h                            1
-rw-r--r--  include/mtd/ubi-header.h                       360
-rw-r--r--  include/mtd/ubi-user.h                         161
-rw-r--r--  kernel/sched.c                                   2
-rw-r--r--  net/ipv4/cipso_ipv4.c                           41
-rw-r--r--  net/netlabel/netlabel_kapi.c                     3
-rw-r--r--  security/selinux/Makefile                        2
-rw-r--r--  security/selinux/avc.c                           2
-rw-r--r--  security/selinux/hooks.c                        38
-rw-r--r--  security/selinux/include/av_perm_to_string.h   102
-rw-r--r--  security/selinux/include/av_permissions.h      179
-rw-r--r--  security/selinux/include/class_to_string.h      34
-rw-r--r--  security/selinux/include/flask.h                16
-rw-r--r--  security/selinux/include/netlabel.h (renamed from security/selinux/include/selinux_netlabel.h)  71
-rw-r--r--  security/selinux/include/security.h             29
-rw-r--r--  security/selinux/netlabel.c                    363
-rw-r--r--  security/selinux/selinuxfs.c                    85
-rw-r--r--  security/selinux/ss/services.c                 499
78 files changed, 17513 insertions(+), 3017 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index f56c7e172cee..5519d257b556 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2246,6 +2246,14 @@ L: linux-mtd@lists.infradead.org
 T:	git git://git.infradead.org/mtd-2.6.git
 S:	Maintained
 
+UNSORTED BLOCK IMAGES (UBI)
+P:	Artem Bityutskiy
+M:	dedekind@infradead.org
+W:	http://www.linux-mtd.infradead.org/
+L:	linux-mtd@lists.infradead.org
+T:	git git://git.infradead.org/ubi-2.6.git
+S:	Maintained
+
 MICROTEK X6 SCANNER
 P:	Oliver Neukum
 M:	oliver@neukum.name
@@ -2972,8 +2980,10 @@ P: Stephen Smalley
 M:	sds@tycho.nsa.gov
 P:	James Morris
 M:	jmorris@namei.org
+P:	Eric Paris
+M:	eparis@parisplace.org
 L:	linux-kernel@vger.kernel.org (kernel issues)
-L:	selinux@tycho.nsa.gov (general discussion)
+L:	selinux@tycho.nsa.gov (subscribers-only, general discussion)
 W:	http://www.nsa.gov/selinux
 S:	Supported
 
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index 26f75c299440..6d1b91bf7ad5 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -292,5 +292,7 @@ source "drivers/mtd/nand/Kconfig"
 
 source "drivers/mtd/onenand/Kconfig"
 
+source "drivers/mtd/ubi/Kconfig"
+
 endmenu
 
diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile
index c130e6261adf..92055405cb30 100644
--- a/drivers/mtd/Makefile
+++ b/drivers/mtd/Makefile
@@ -28,3 +28,5 @@ nftl-objs := nftlcore.o nftlmount.o
 inftl-objs	:= inftlcore.o inftlmount.o
 
 obj-y		+= chips/ maps/ devices/ nand/ onenand/
+
+obj-$(CONFIG_MTD_UBI)	+= ubi/
diff --git a/drivers/mtd/ubi/Kconfig b/drivers/mtd/ubi/Kconfig
new file mode 100644
index 000000000000..b9daf159a4a7
--- /dev/null
+++ b/drivers/mtd/ubi/Kconfig
@@ -0,0 +1,58 @@
1# drivers/mtd/ubi/Kconfig
2
3menu "UBI - Unsorted block images"
4 depends on MTD
5
6config MTD_UBI
7 tristate "Enable UBI"
8 depends on MTD
9 select CRC32
10 help
11 UBI is a software layer above the MTD layer which provides LVM-like
12 logical volumes on top of MTD devices, hides some complexities of
13 flash chips like wear and bad blocks, and provides some other useful
14 capabilities. Please consult the MTD web site for more details
15 (www.linux-mtd.infradead.org).
16
17config MTD_UBI_WL_THRESHOLD
18 int "UBI wear-leveling threshold"
19 default 4096
20 range 2 65536
21 depends on MTD_UBI
22 help
23 This parameter defines the maximum difference between the highest
24 and the lowest erase counter values of the eraseblocks of a UBI
25 device. When this threshold is exceeded, UBI starts performing wear
26 leveling by moving data from eraseblocks with low erase counters to
27 eraseblocks with high erase counters. Leave the default value if
28 unsure.
29
30config MTD_UBI_BEB_RESERVE
31 int "Percentage of reserved eraseblocks for bad eraseblocks handling"
32 default 1
33 range 0 25
34 depends on MTD_UBI
35 help
36 If the MTD device may have bad eraseblocks (e.g. NAND flash), UBI
37 reserves some physical eraseblocks to handle new bad eraseblocks.
38 For example, if a flash physical eraseblock becomes bad, UBI uses one
39 of these reserved physical eraseblocks to relocate its contents. This
40 option specifies how many physical eraseblocks will be reserved for
41 bad eraseblock handling (as a percentage of the total number of good
42 flash eraseblocks). If the underlying flash cannot have bad
43 eraseblocks (e.g. NOR flash), this value is ignored and nothing is
44 reserved. Leave the default value if unsure.
45
46config MTD_UBI_GLUEBI
47 bool "Emulate MTD devices"
48 default n
49 depends on MTD_UBI
50 help
51 This option enables MTD device emulation on top of UBI volumes: for
52 each UBI volume an MTD device is created, and all I/O to this MTD
53 device is redirected to the UBI volume. This is handy for making
54 MTD-oriented software (like JFFS2) work on top of UBI. Do not enable
55 this if no legacy software will be used.
56
57source "drivers/mtd/ubi/Kconfig.debug"
58endmenu
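
Note: to make the MTD_UBI_BEB_RESERVE semantics above concrete, the option is
a percentage of the good eraseblocks, not an absolute count. A minimal
user-space sketch of the calculation (the function name and the round-up
rounding are illustrative assumptions, not code from this patch):

	#include <stdio.h>

	/* Illustrative: PEBs reserved for bad-block handling, taken as a
	 * percentage of the good PEBs and rounded up so that a non-zero
	 * percentage always reserves at least one PEB. */
	static int beb_reserved_pebs(int good_peb_count, int reserve_pct)
	{
		return (good_peb_count * reserve_pct + 99) / 100;
	}

	int main(void)
	{
		/* a 2048-block NAND with the default 1% reserve -> 21 PEBs */
		printf("%d\n", beb_reserved_pebs(2048, 1));
		return 0;
	}
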
diff --git a/drivers/mtd/ubi/Kconfig.debug b/drivers/mtd/ubi/Kconfig.debug
new file mode 100644
index 000000000000..1e2ee22edeff
--- /dev/null
+++ b/drivers/mtd/ubi/Kconfig.debug
@@ -0,0 +1,104 @@
1comment "UBI debugging options"
2 depends on MTD_UBI
3
4config MTD_UBI_DEBUG
5 bool "UBI debugging"
6 depends on SYSFS
7 depends on MTD_UBI
8 select DEBUG_FS
9 select KALLSYMS_ALL
10 help
11 This option enables UBI debugging.
12
13config MTD_UBI_DEBUG_MSG
14 bool "UBI debugging messages"
15 depends on MTD_UBI_DEBUG
16 default n
17 help
18 This option enables UBI debugging messages.
19
20config MTD_UBI_DEBUG_PARANOID
21 bool "Extra self-checks"
22 default n
23 depends on MTD_UBI_DEBUG
24 help
25 This option enables extra checks in the UBI code. Note that this
26 slows UBI down significantly.
27
28config MTD_UBI_DEBUG_DISABLE_BGT
29 bool "Do not enable the UBI background thread"
30 depends on MTD_UBI_DEBUG
31 default n
32 help
33 This option switches the background thread off by default. The thread
34 may also be enabled/disabled via UBI sysfs.
35
36config MTD_UBI_DEBUG_USERSPACE_IO
37 bool "Direct user-space write/erase support"
38 default n
39 depends on MTD_UBI_DEBUG
40 help
41 By default, users cannot directly write and erase individual
42 eraseblocks of dynamic volumes, and have to use the volume update
43 operation instead. This option enables this capability - it is very
44 useful for debugging and testing.
45
46config MTD_UBI_DEBUG_EMULATE_BITFLIPS
47 bool "Emulate flash bit-flips"
48 depends on MTD_UBI_DEBUG
49 default n
50 help
51 This option emulates bit-flips with probability 1/50, which in turn
52 causes scrubbing. Useful for debugging and stressing UBI.
53
54config MTD_UBI_DEBUG_EMULATE_WRITE_FAILURES
55 bool "Emulate flash write failures"
56 depends on MTD_UBI_DEBUG
57 default n
58 help
59 This option emulates write failures with probability 1/100. Useful for
60 debugging and testing how UBI handles errors.
61
62config MTD_UBI_DEBUG_EMULATE_ERASE_FAILURES
63 bool "Emulate flash erase failures"
64 depends on MTD_UBI_DEBUG
65 default n
66 help
67 This option emulates erase failures with probability 1/100. Useful for
68 debugging and testing how UBI handles errors.
69
70menu "Additional UBI debugging messages"
71 depends on MTD_UBI_DEBUG
72
73config MTD_UBI_DEBUG_MSG_BLD
74 bool "Additional UBI initialization and build messages"
75 default n
76 depends on MTD_UBI_DEBUG
77 help
78 This option enables detailed UBI initialization and device build
79 debugging messages.
80
81config MTD_UBI_DEBUG_MSG_EBA
82 bool "Eraseblock association unit messages"
83 default n
84 depends on MTD_UBI_DEBUG
85 help
86 This option enables debugging messages from the UBI eraseblock
87 association unit.
88
89config MTD_UBI_DEBUG_MSG_WL
90 bool "Wear-leveling unit messages"
91 default n
92 depends on MTD_UBI_DEBUG
93 help
94 This option enables debugging messages from the UBI wear-leveling
95 unit.
96
97config MTD_UBI_DEBUG_MSG_IO
98 bool "Input/output unit messages"
99 default n
100 depends on MTD_UBI_DEBUG
101 help
102 This option enables debugging messages from the UBI input/output unit.
103
104endmenu # UBI debugging messages
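
Note: the 1/50 and 1/100 odds quoted above are typically implemented as a
cheap PRNG check on every I/O operation. A user-space sketch of the idea (the
helper name is hypothetical, not one of this patch's functions):

	#include <stdlib.h>

	/* Return non-zero roughly once per 'one_in' calls - the shape of
	 * the fault injection behind the emulation options above. */
	static int maybe_inject_fault(int one_in)
	{
		return (rand() % one_in) == 0;
	}

	/* E.g. a read path could do: if (maybe_inject_fault(50)) flip one
	 * bit in the buffer and report a correctable error, so that the
	 * wear-leveling unit schedules scrubbing of that eraseblock. */
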
diff --git a/drivers/mtd/ubi/Makefile b/drivers/mtd/ubi/Makefile
new file mode 100644
index 000000000000..dd834e04151b
--- /dev/null
+++ b/drivers/mtd/ubi/Makefile
@@ -0,0 +1,7 @@
1obj-$(CONFIG_MTD_UBI) += ubi.o
2
3ubi-y += vtbl.o vmt.o upd.o build.o cdev.o kapi.o eba.o io.o wl.o scan.o
4ubi-y += misc.o
5
6ubi-$(CONFIG_MTD_UBI_DEBUG) += debug.o
7ubi-$(CONFIG_MTD_UBI_GLUEBI) += gluebi.o
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
new file mode 100644
index 000000000000..555d594d1811
--- /dev/null
+++ b/drivers/mtd/ubi/build.c
@@ -0,0 +1,848 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2007
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём),
20 * Frank Haverkamp
21 */
22
23/*
24 * This file includes UBI initialization and building of UBI devices. At the
25 * moment UBI devices may only be added while UBI is being initialized, but
26 * dynamic device add/remove functionality is planned. Also, at the moment we
27 * only attach UBI devices by scanning, which will become a bottleneck when
28 * flashes reach a certain large size. Then one may improve UBI and add other methods.
29 */
30
31#include <linux/err.h>
32#include <linux/module.h>
33#include <linux/moduleparam.h>
34#include <linux/stringify.h>
35#include <linux/stat.h>
36#include "ubi.h"
37
38/* Maximum length of the 'mtd=' parameter */
39#define MTD_PARAM_LEN_MAX 64
40
41/**
42 * struct mtd_dev_param - MTD device parameter description data structure.
43 * @name: MTD device name or number string
44 * @vid_hdr_offs: VID header offset
45 * @data_offs: data offset
46 */
47struct mtd_dev_param
48{
49 char name[MTD_PARAM_LEN_MAX];
50 int vid_hdr_offs;
51 int data_offs;
52};
53
54/* Numbers of elements set in the @mtd_dev_param array */
55static int mtd_devs = 0;
56
57/* MTD devices specification parameters */
58static struct mtd_dev_param mtd_dev_param[UBI_MAX_DEVICES];
59
60/* Number of UBI devices in system */
61int ubi_devices_cnt;
62
63/* All UBI devices in system */
64struct ubi_device *ubi_devices[UBI_MAX_DEVICES];
65
66/* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */
67struct class *ubi_class;
68
69/* "Show" method for files in '/<sysfs>/class/ubi/' */
70static ssize_t ubi_version_show(struct class *class, char *buf)
71{
72 return sprintf(buf, "%d\n", UBI_VERSION);
73}
74
75/* UBI version attribute ('/<sysfs>/class/ubi/version') */
76static struct class_attribute ubi_version =
77 __ATTR(version, S_IRUGO, ubi_version_show, NULL);
78
79static ssize_t dev_attribute_show(struct device *dev,
80 struct device_attribute *attr, char *buf);
81
82/* UBI device attributes (correspond to files in '/<sysfs>/class/ubi/ubiX') */
83static struct device_attribute dev_eraseblock_size =
84 __ATTR(eraseblock_size, S_IRUGO, dev_attribute_show, NULL);
85static struct device_attribute dev_avail_eraseblocks =
86 __ATTR(avail_eraseblocks, S_IRUGO, dev_attribute_show, NULL);
87static struct device_attribute dev_total_eraseblocks =
88 __ATTR(total_eraseblocks, S_IRUGO, dev_attribute_show, NULL);
89static struct device_attribute dev_volumes_count =
90 __ATTR(volumes_count, S_IRUGO, dev_attribute_show, NULL);
91static struct device_attribute dev_max_ec =
92 __ATTR(max_ec, S_IRUGO, dev_attribute_show, NULL);
93static struct device_attribute dev_reserved_for_bad =
94 __ATTR(reserved_for_bad, S_IRUGO, dev_attribute_show, NULL);
95static struct device_attribute dev_bad_peb_count =
96 __ATTR(bad_peb_count, S_IRUGO, dev_attribute_show, NULL);
97static struct device_attribute dev_max_vol_count =
98 __ATTR(max_vol_count, S_IRUGO, dev_attribute_show, NULL);
99static struct device_attribute dev_min_io_size =
100 __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL);
101static struct device_attribute dev_bgt_enabled =
102 __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL);
103
104/* "Show" method for files in '/<sysfs>/class/ubi/ubiX/' */
105static ssize_t dev_attribute_show(struct device *dev,
106 struct device_attribute *attr, char *buf)
107{
108 const struct ubi_device *ubi;
109
110 ubi = container_of(dev, struct ubi_device, dev);
111 if (attr == &dev_eraseblock_size)
112 return sprintf(buf, "%d\n", ubi->leb_size);
113 else if (attr == &dev_avail_eraseblocks)
114 return sprintf(buf, "%d\n", ubi->avail_pebs);
115 else if (attr == &dev_total_eraseblocks)
116 return sprintf(buf, "%d\n", ubi->good_peb_count);
117 else if (attr == &dev_volumes_count)
118 return sprintf(buf, "%d\n", ubi->vol_count);
119 else if (attr == &dev_max_ec)
120 return sprintf(buf, "%d\n", ubi->max_ec);
121 else if (attr == &dev_reserved_for_bad)
122 return sprintf(buf, "%d\n", ubi->beb_rsvd_pebs);
123 else if (attr == &dev_bad_peb_count)
124 return sprintf(buf, "%d\n", ubi->bad_peb_count);
125 else if (attr == &dev_max_vol_count)
126 return sprintf(buf, "%d\n", ubi->vtbl_slots);
127 else if (attr == &dev_min_io_size)
128 return sprintf(buf, "%d\n", ubi->min_io_size);
129 else if (attr == &dev_bgt_enabled)
130 return sprintf(buf, "%d\n", ubi->thread_enabled);
131 else
132 BUG();
133
134 return 0;
135}
136
137/* Fake "release" method for UBI devices */
138static void dev_release(struct device *dev) { }
139
140/**
141 * ubi_sysfs_init - initialize sysfs for an UBI device.
142 * @ubi: UBI device description object
143 *
144 * This function returns zero in case of success and a negative error code in
145 * case of failure.
146 */
147static int ubi_sysfs_init(struct ubi_device *ubi)
148{
149 int err;
150
151 ubi->dev.release = dev_release;
152 ubi->dev.devt = MKDEV(ubi->major, 0);
153 ubi->dev.class = ubi_class;
154 sprintf(&ubi->dev.bus_id[0], UBI_NAME_STR"%d", ubi->ubi_num);
155 err = device_register(&ubi->dev);
156 if (err)
157 goto out;
158
159 err = device_create_file(&ubi->dev, &dev_eraseblock_size);
160 if (err)
161 goto out_unregister;
162 err = device_create_file(&ubi->dev, &dev_avail_eraseblocks);
163 if (err)
164 goto out_eraseblock_size;
165 err = device_create_file(&ubi->dev, &dev_total_eraseblocks);
166 if (err)
167 goto out_avail_eraseblocks;
168 err = device_create_file(&ubi->dev, &dev_volumes_count);
169 if (err)
170 goto out_total_eraseblocks;
171 err = device_create_file(&ubi->dev, &dev_max_ec);
172 if (err)
173 goto out_volumes_count;
174 err = device_create_file(&ubi->dev, &dev_reserved_for_bad);
175 if (err)
176 goto out_volumes_max_ec;
177 err = device_create_file(&ubi->dev, &dev_bad_peb_count);
178 if (err)
179 goto out_reserved_for_bad;
180 err = device_create_file(&ubi->dev, &dev_max_vol_count);
181 if (err)
182 goto out_bad_peb_count;
183 err = device_create_file(&ubi->dev, &dev_min_io_size);
184 if (err)
185 goto out_max_vol_count;
186 err = device_create_file(&ubi->dev, &dev_bgt_enabled);
187 if (err)
188 goto out_min_io_size;
189
190 return 0;
191
192out_min_io_size:
193 device_remove_file(&ubi->dev, &dev_min_io_size);
194out_max_vol_count:
195 device_remove_file(&ubi->dev, &dev_max_vol_count);
196out_bad_peb_count:
197 device_remove_file(&ubi->dev, &dev_bad_peb_count);
198out_reserved_for_bad:
199 device_remove_file(&ubi->dev, &dev_reserved_for_bad);
200out_volumes_max_ec:
201 device_remove_file(&ubi->dev, &dev_max_ec);
202out_volumes_count:
203 device_remove_file(&ubi->dev, &dev_volumes_count);
204out_total_eraseblocks:
205 device_remove_file(&ubi->dev, &dev_total_eraseblocks);
206out_avail_eraseblocks:
207 device_remove_file(&ubi->dev, &dev_avail_eraseblocks);
208out_eraseblock_size:
209 device_remove_file(&ubi->dev, &dev_eraseblock_size);
210out_unregister:
211 device_unregister(&ubi->dev);
212out:
213 ubi_err("failed to initialize sysfs for %s", ubi->ubi_name);
214 return err;
215}
216
217/**
218 * ubi_sysfs_close - close sysfs for an UBI device.
219 * @ubi: UBI device description object
220 */
221static void ubi_sysfs_close(struct ubi_device *ubi)
222{
223 device_remove_file(&ubi->dev, &dev_bgt_enabled);
224 device_remove_file(&ubi->dev, &dev_min_io_size);
225 device_remove_file(&ubi->dev, &dev_max_vol_count);
226 device_remove_file(&ubi->dev, &dev_bad_peb_count);
227 device_remove_file(&ubi->dev, &dev_reserved_for_bad);
228 device_remove_file(&ubi->dev, &dev_max_ec);
229 device_remove_file(&ubi->dev, &dev_volumes_count);
230 device_remove_file(&ubi->dev, &dev_total_eraseblocks);
231 device_remove_file(&ubi->dev, &dev_avail_eraseblocks);
232 device_remove_file(&ubi->dev, &dev_eraseblock_size);
233 device_unregister(&ubi->dev);
234}
235
236/**
237 * kill_volumes - destroy all volumes.
238 * @ubi: UBI device description object
239 */
240static void kill_volumes(struct ubi_device *ubi)
241{
242 int i;
243
244 for (i = 0; i < ubi->vtbl_slots; i++)
245 if (ubi->volumes[i])
246 ubi_free_volume(ubi, i);
247}
248
249/**
250 * uif_init - initialize user interfaces for an UBI device.
251 * @ubi: UBI device description object
252 *
253 * This function returns zero in case of success and a negative error code in
254 * case of failure.
255 */
256static int uif_init(struct ubi_device *ubi)
257{
258 int i, err;
259 dev_t dev;
260
261 mutex_init(&ubi->vtbl_mutex);
262 spin_lock_init(&ubi->volumes_lock);
263
264 sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num);
265
266 /*
267 * Major numbers for the UBI character devices are allocated
268 * dynamically. Major numbers of volume character devices are
269 * equal to those of the corresponding UBI character device. Minor
270 * numbers of UBI character devices are 0, while minor numbers of
271 * volume character devices start from 1. Thus, we allocate one major
272 * number and ubi->vtbl_slots + 1 minor numbers.
273 */
274 err = alloc_chrdev_region(&dev, 0, ubi->vtbl_slots + 1, ubi->ubi_name);
275 if (err) {
276 ubi_err("cannot register UBI character devices");
277 return err;
278 }
279
280 cdev_init(&ubi->cdev, &ubi_cdev_operations);
281 ubi->major = MAJOR(dev);
282 dbg_msg("%s major is %u", ubi->ubi_name, ubi->major);
283 ubi->cdev.owner = THIS_MODULE;
284
285 dev = MKDEV(ubi->major, 0);
286 err = cdev_add(&ubi->cdev, dev, 1);
287 if (err) {
288 ubi_err("cannot add character device %s", ubi->ubi_name);
289 goto out_unreg;
290 }
291
292 err = ubi_sysfs_init(ubi);
293 if (err)
294 goto out_cdev;
295
296 for (i = 0; i < ubi->vtbl_slots; i++)
297 if (ubi->volumes[i]) {
298 err = ubi_add_volume(ubi, i);
299 if (err)
300 goto out_volumes;
301 }
302
303 return 0;
304
305out_volumes:
306 kill_volumes(ubi);
307 ubi_sysfs_close(ubi);
308out_cdev:
309 cdev_del(&ubi->cdev);
310out_unreg:
311 unregister_chrdev_region(MKDEV(ubi->major, 0),
312 ubi->vtbl_slots + 1);
313 return err;
314}
315
316/**
317 * uif_close - close user interfaces for an UBI device.
318 * @ubi: UBI device description object
319 */
320static void uif_close(struct ubi_device *ubi)
321{
322 kill_volumes(ubi);
323 ubi_sysfs_close(ubi);
324 cdev_del(&ubi->cdev);
325 unregister_chrdev_region(MKDEV(ubi->major, 0), ubi->vtbl_slots + 1);
326}
327
328/**
329 * attach_by_scanning - attach an MTD device using scanning method.
330 * @ubi: UBI device descriptor
331 *
332 * This function returns zero in case of success and a negative error code in
333 * case of failure.
334 *
335 * Note, currently this is the only method to attach UBI devices. Hopefully in
336 * the future we'll have more scalable attaching methods and avoid full media
337 * scanning. But even in this case scanning will be needed as a fall-back
338 * attaching method if there are some on-flash table corruptions.
339 */
340static int attach_by_scanning(struct ubi_device *ubi)
341{
342 int err;
343 struct ubi_scan_info *si;
344
345 si = ubi_scan(ubi);
346 if (IS_ERR(si))
347 return PTR_ERR(si);
348
349 ubi->bad_peb_count = si->bad_peb_count;
350 ubi->good_peb_count = ubi->peb_count - ubi->bad_peb_count;
351 ubi->max_ec = si->max_ec;
352 ubi->mean_ec = si->mean_ec;
353
354 err = ubi_read_volume_table(ubi, si);
355 if (err)
356 goto out_si;
357
358 err = ubi_wl_init_scan(ubi, si);
359 if (err)
360 goto out_vtbl;
361
362 err = ubi_eba_init_scan(ubi, si);
363 if (err)
364 goto out_wl;
365
366 ubi_scan_destroy_si(si);
367 return 0;
368
369out_wl:
370 ubi_wl_close(ubi);
371out_vtbl:
372 kfree(ubi->vtbl);
373out_si:
374 ubi_scan_destroy_si(si);
375 return err;
376}
377
378/**
379 * io_init - initialize I/O unit for a given UBI device.
380 * @ubi: UBI device description object
381 *
382 * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are
383 * assumed:
384 * o EC header is always at offset zero - this cannot be changed;
385 * o VID header starts just after the EC header at the closest address
386 * aligned to @ubi->hdrs_min_io_size;
387 * o data starts just after the VID header at the closest address aligned to
388 * @ubi->min_io_size.
389 *
390 * This function returns zero in case of success and a negative error code in
391 * case of failure.
392 */
393static int io_init(struct ubi_device *ubi)
394{
395 if (ubi->mtd->numeraseregions != 0) {
396 /*
397 * Some flashes have several erase regions. Different regions
398 * may have different eraseblock size and other
399 * characteristics. It looks like mostly multi-region flashes
400 * have one "main" region and one or more small regions to
401 * store boot loader code or boot parameters or whatever. I
402 * guess we should just pick the largest region. But this is
403 * not implemented.
404 */
405 ubi_err("multiple regions, not implemented");
406 return -EINVAL;
407 }
408
409 /*
410 * Note, in this implementation we support MTD devices with 0x7FFFFFFF
411 * physical eraseblocks maximum.
412 */
413
414 ubi->peb_size = ubi->mtd->erasesize;
415 ubi->peb_count = ubi->mtd->size / ubi->mtd->erasesize;
416 ubi->flash_size = ubi->mtd->size;
417
418 if (ubi->mtd->block_isbad && ubi->mtd->block_markbad)
419 ubi->bad_allowed = 1;
420
421 ubi->min_io_size = ubi->mtd->writesize;
422 ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft;
423
424 /* Make sure minimal I/O unit is power of 2 */
425 if (ubi->min_io_size == 0 ||
426 (ubi->min_io_size & (ubi->min_io_size - 1))) {
427 ubi_err("bad min. I/O unit");
428 return -EINVAL;
429 }
430
431 ubi_assert(ubi->hdrs_min_io_size > 0);
432 ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size);
433 ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0);
434
435 /* Calculate default aligned sizes of EC and VID headers */
436 ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size);
437 ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size);
438
439 dbg_msg("min_io_size %d", ubi->min_io_size);
440 dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size);
441 dbg_msg("ec_hdr_alsize %d", ubi->ec_hdr_alsize);
442 dbg_msg("vid_hdr_alsize %d", ubi->vid_hdr_alsize);
443
444 if (ubi->vid_hdr_offset == 0)
445 /* Default offset */
446 ubi->vid_hdr_offset = ubi->vid_hdr_aloffset =
447 ubi->ec_hdr_alsize;
448 else {
449 ubi->vid_hdr_aloffset = ubi->vid_hdr_offset &
450 ~(ubi->hdrs_min_io_size - 1);
451 ubi->vid_hdr_shift = ubi->vid_hdr_offset -
452 ubi->vid_hdr_aloffset;
453 }
454
455 /* Similar for the data offset */
456 if (ubi->leb_start == 0) {
457 ubi->leb_start = ubi->vid_hdr_offset + ubi->vid_hdr_alsize;
458 ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size);
459 }
460
461 dbg_msg("vid_hdr_offset %d", ubi->vid_hdr_offset);
462 dbg_msg("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset);
463 dbg_msg("vid_hdr_shift %d", ubi->vid_hdr_shift);
464 dbg_msg("leb_start %d", ubi->leb_start);
465
466 /* The shift must be aligned to 32-bit boundary */
467 if (ubi->vid_hdr_shift % 4) {
468 ubi_err("unaligned VID header shift %d",
469 ubi->vid_hdr_shift);
470 return -EINVAL;
471 }
472
473 /* Check sanity */
474 if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE ||
475 ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE ||
476 ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE ||
477 ubi->leb_start % ubi->min_io_size) {
478 ubi_err("bad VID header (%d) or data offsets (%d)",
479 ubi->vid_hdr_offset, ubi->leb_start);
480 return -EINVAL;
481 }
482
483 /*
484 * It may happen that EC and VID headers are situated in one minimal
485 * I/O unit. In this case we can only accept this UBI image in
486 * read-only mode.
487 */
488 if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) {
489 ubi_warn("EC and VID headers are in the same minimal I/O unit, "
490 "switch to read-only mode");
491 ubi->ro_mode = 1;
492 }
493
494 ubi->leb_size = ubi->peb_size - ubi->leb_start;
495
496 if (!(ubi->mtd->flags & MTD_WRITEABLE)) {
497 ubi_msg("MTD device %d is write-protected, attach in "
498 "read-only mode", ubi->mtd->index);
499 ubi->ro_mode = 1;
500 }
501
502 dbg_msg("leb_size %d", ubi->leb_size);
503 dbg_msg("ro_mode %d", ubi->ro_mode);
504
505 /*
506 * Note, ideally, we have to initialize ubi->bad_peb_count here. But
507 * unfortunately, MTD does not provide this information. We should loop
508 * over all physical eraseblocks and invoke mtd->block_is_bad() for
509 * each physical eraseblock. So, we skip ubi->bad_peb_count
510 * uninitialized and initialize it after scanning.
511 */
512
513 return 0;
514}
515
516/**
517 * attach_mtd_dev - attach an MTD device.
518 * @mtd_dev: MTD device name or number string
519 * @vid_hdr_offset: VID header offset
520 * @data_offset: data offset
521 *
522 * This function attaches an MTD device to UBI. It first treats @mtd_dev as the
523 * MTD device name, and tries to open it by this name. If it is unable to open,
524 * it tries to convert @mtd_dev to an integer and open the MTD device by its
525 * number. Returns zero in case of success and a negative error code in case of
526 * failure.
527 */
528static int attach_mtd_dev(const char *mtd_dev, int vid_hdr_offset,
529 int data_offset)
530{
531 struct ubi_device *ubi;
532 struct mtd_info *mtd;
533 int i, err;
534
535 mtd = get_mtd_device_nm(mtd_dev);
536 if (IS_ERR(mtd)) {
537 int mtd_num;
538 char *endp;
539
540 if (PTR_ERR(mtd) != -ENODEV)
541 return PTR_ERR(mtd);
542
543 /*
544 * Probably this is not MTD device name but MTD device number -
545 * check this out.
546 */
547 mtd_num = simple_strtoul(mtd_dev, &endp, 0);
548 if (*endp != '\0' || mtd_dev == endp) {
549 ubi_err("incorrect MTD device: \"%s\"", mtd_dev);
550 return -ENODEV;
551 }
552
553 mtd = get_mtd_device(NULL, mtd_num);
554 if (IS_ERR(mtd))
555 return PTR_ERR(mtd);
556 }
557
558 /* Check if we already have the same MTD device attached */
559 for (i = 0; i < ubi_devices_cnt; i++)
560 if (ubi_devices[i]->mtd->index == mtd->index) {
561 ubi_err("mtd%d is already attached to ubi%d",
562 mtd->index, i);
563 err = -EINVAL;
564 goto out_mtd;
565 }
566
567 ubi = ubi_devices[ubi_devices_cnt] = kzalloc(sizeof(struct ubi_device),
568 GFP_KERNEL);
569 if (!ubi) {
570 err = -ENOMEM;
571 goto out_mtd;
572 }
573
574 ubi->ubi_num = ubi_devices_cnt;
575 ubi->mtd = mtd;
576
577 dbg_msg("attaching mtd%d to ubi%d: VID header offset %d data offset %d",
578 ubi->mtd->index, ubi_devices_cnt, vid_hdr_offset, data_offset);
579
580 ubi->vid_hdr_offset = vid_hdr_offset;
581 ubi->leb_start = data_offset;
582 err = io_init(ubi);
583 if (err)
584 goto out_free;
585
586 err = attach_by_scanning(ubi);
587 if (err) {
588 dbg_err("failed to attach by scanning, error %d", err);
589 goto out_free;
590 }
591
592 err = uif_init(ubi);
593 if (err)
594 goto out_detach;
595
596 ubi_devices_cnt += 1;
597
598 ubi_msg("attached mtd%d to ubi%d", ubi->mtd->index, ubi->ubi_num);
599 ubi_msg("MTD device name: \"%s\"", ubi->mtd->name);
600 ubi_msg("MTD device size: %llu MiB", ubi->flash_size >> 20);
601 ubi_msg("physical eraseblock size: %d bytes (%d KiB)",
602 ubi->peb_size, ubi->peb_size >> 10);
603 ubi_msg("logical eraseblock size: %d bytes", ubi->leb_size);
604 ubi_msg("number of good PEBs: %d", ubi->good_peb_count);
605 ubi_msg("number of bad PEBs: %d", ubi->bad_peb_count);
606 ubi_msg("smallest flash I/O unit: %d", ubi->min_io_size);
607 ubi_msg("VID header offset: %d (aligned %d)",
608 ubi->vid_hdr_offset, ubi->vid_hdr_aloffset);
609 ubi_msg("data offset: %d", ubi->leb_start);
610 ubi_msg("max. allowed volumes: %d", ubi->vtbl_slots);
611 ubi_msg("wear-leveling threshold: %d", CONFIG_MTD_UBI_WL_THRESHOLD);
612 ubi_msg("number of internal volumes: %d", UBI_INT_VOL_COUNT);
613 ubi_msg("number of user volumes: %d",
614 ubi->vol_count - UBI_INT_VOL_COUNT);
615 ubi_msg("available PEBs: %d", ubi->avail_pebs);
616 ubi_msg("total number of reserved PEBs: %d", ubi->rsvd_pebs);
617 ubi_msg("number of PEBs reserved for bad PEB handling: %d",
618 ubi->beb_rsvd_pebs);
619 ubi_msg("max/mean erase counter: %d/%d", ubi->max_ec, ubi->mean_ec);
620
621 /* Enable the background thread */
622 if (!DBG_DISABLE_BGT) {
623 ubi->thread_enabled = 1;
624 wake_up_process(ubi->bgt_thread);
625 }
626
627 return 0;
628
629out_detach:
630 ubi_eba_close(ubi);
631 ubi_wl_close(ubi);
632 kfree(ubi->vtbl);
633out_free:
634 kfree(ubi);
635out_mtd:
636 put_mtd_device(mtd);
637 ubi_devices[ubi_devices_cnt] = NULL;
638 return err;
639}
640
641/**
642 * detach_mtd_dev - detach an MTD device.
643 * @ubi: UBI device description object
644 */
645static void detach_mtd_dev(struct ubi_device *ubi)
646{
647 int ubi_num = ubi->ubi_num, mtd_num = ubi->mtd->index;
648
649 dbg_msg("detaching mtd%d from ubi%d", ubi->mtd->index, ubi_num);
650 uif_close(ubi);
651 ubi_eba_close(ubi);
652 ubi_wl_close(ubi);
653 kfree(ubi->vtbl);
654 put_mtd_device(ubi->mtd);
655 kfree(ubi_devices[ubi_num]);
656 ubi_devices[ubi_num] = NULL;
657 ubi_devices_cnt -= 1;
658 ubi_assert(ubi_devices_cnt >= 0);
659 ubi_msg("mtd%d is detached from ubi%d", mtd_num, ubi_num);
660}
661
662static int __init ubi_init(void)
663{
664 int err, i, k;
665
666 /* Ensure that EC and VID headers have correct size */
667 BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64);
668 BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64);
669
670 if (mtd_devs > UBI_MAX_DEVICES) {
671 printk(KERN_ERR "UBI error: too many MTD devices, maximum is %d\n",
672 UBI_MAX_DEVICES);
673 return -EINVAL;
674 }
675
676 ubi_class = class_create(THIS_MODULE, UBI_NAME_STR);
677 if (IS_ERR(ubi_class))
678 return PTR_ERR(ubi_class);
679
680 err = class_create_file(ubi_class, &ubi_version);
681 if (err)
682 goto out_class;
683
684 /* Attach MTD devices */
685 for (i = 0; i < mtd_devs; i++) {
686 struct mtd_dev_param *p = &mtd_dev_param[i];
687
688 cond_resched();
689
690 if (!p->name) {
691 dbg_err("empty name");
692 err = -EINVAL;
693 goto out_detach;
694 }
695
696 err = attach_mtd_dev(p->name, p->vid_hdr_offs, p->data_offs);
697 if (err)
698 goto out_detach;
699 }
700
701 return 0;
702
703out_detach:
704 for (k = 0; k < i; k++)
705 detach_mtd_dev(ubi_devices[k]);
706 class_remove_file(ubi_class, &ubi_version);
707out_class:
708 class_destroy(ubi_class);
709 return err;
710}
711module_init(ubi_init);
712
713static void __exit ubi_exit(void)
714{
715 int i, n = ubi_devices_cnt;
716
717 for (i = 0; i < n; i++)
718 detach_mtd_dev(ubi_devices[i]);
719 class_remove_file(ubi_class, &ubi_version);
720 class_destroy(ubi_class);
721}
722module_exit(ubi_exit);
723
724/**
725 * bytes_str_to_int - convert a string representing number of bytes to an
726 * integer.
727 * @str: the string to convert
728 *
729 * This function returns positive resulting integer in case of success and a
730 * negative error code in case of failure.
731 */
732static int __init bytes_str_to_int(const char *str)
733{
734 char *endp;
735 unsigned long result;
736
737 result = simple_strtoul(str, &endp, 0);
738 if (str == endp || result < 0) {
739 printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n", str);
740 return -EINVAL;
741 }
742
743 switch (*endp) {
744 case 'G':
745 result *= 1024;
746 case 'M':
747 result *= 1024;
748 case 'K':
749 case 'k':
750 result *= 1024;
751 if (endp[1] == 'i' && (endp[2] == '\0' ||
752 endp[2] == 'B' || endp[2] == 'b'))
753 endp += 2;
754 case '\0':
755 break;
756 default:
757 printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n", str);
758 return -EINVAL;
759 }
760
761 return result;
762}
763
764/**
765 * ubi_mtd_param_parse - parse the 'mtd=' UBI parameter.
766 * @val: the parameter value to parse
767 * @kp: not used
768 *
769 * This function returns zero in case of success and a negative error code in
770 * case of error.
771 */
772static int __init ubi_mtd_param_parse(const char *val, struct kernel_param *kp)
773{
774 int i, len;
775 struct mtd_dev_param *p;
776 char buf[MTD_PARAM_LEN_MAX];
777 char *pbuf = &buf[0];
778 char *tokens[3] = {NULL, NULL, NULL};
779
780 if (mtd_devs == UBI_MAX_DEVICES) {
781 printk(KERN_ERR "UBI error: too many parameters, max. is %d\n",
782 UBI_MAX_DEVICES);
783 return -EINVAL;
784 }
785
786 len = strnlen(val, MTD_PARAM_LEN_MAX);
787 if (len == MTD_PARAM_LEN_MAX) {
788 printk(KERN_ERR "UBI error: parameter \"%s\" is too long, max. is %d\n",
789 val, MTD_PARAM_LEN_MAX);
790 return -EINVAL;
791 }
792
793 if (len == 0) {
794 printk(KERN_WARNING "UBI warning: empty 'mtd=' parameter - ignored\n");
795 return 0;
796 }
797
798 strcpy(buf, val);
799
800 /* Get rid of the final newline */
801 if (buf[len - 1] == '\n')
802 buf[len - 1] = 0;
803
804 for (i = 0; i < 3; i++)
805 tokens[i] = strsep(&pbuf, ",");
806
807 if (pbuf) {
808 printk(KERN_ERR "UBI error: too many arguments at \"%s\"\n", val);
809 return -EINVAL;
810 }
811
812 if (!tokens[0] || *tokens[0] == '\0')
813 return -EINVAL;
814
815 p = &mtd_dev_param[mtd_devs];
816 strcpy(&p->name[0], tokens[0]);
817
818 if (tokens[1])
819 p->vid_hdr_offs = bytes_str_to_int(tokens[1]);
820 if (tokens[2])
821 p->data_offs = bytes_str_to_int(tokens[2]);
822
823 if (p->vid_hdr_offs < 0)
824 return p->vid_hdr_offs;
825 if (p->data_offs < 0)
826 return p->data_offs;
827
828 mtd_devs += 1;
829 return 0;
830}
831
832module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 000);
833MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: "
834 "mtd=<name|num>[,<vid_hdr_offs>,<data_offs>]. "
835 "Multiple \"mtd\" parameters may be specified.\n"
836 "MTD devices may be specified by their number or name. "
837 "Optional \"vid_hdr_offs\" and \"data_offs\" parameters "
838 "specify UBI VID header position and data starting "
839 "position to be used by UBI.\n"
840 "Example: mtd=content,1984,2048 mtd=4 - attach MTD device"
841 "with name content using VID header offset 1984 and data "
842 "start 2048, and MTD device number 4 using default "
843 "offsets");
844
845MODULE_VERSION(__stringify(UBI_VERSION));
846MODULE_DESCRIPTION("UBI - Unsorted Block Images");
847MODULE_AUTHOR("Artem Bityutskiy");
848MODULE_LICENSE("GPL");
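
Note: bytes_str_to_int() above depends on deliberate switch fall-through, so
'G' accumulates three factors of 1024, 'M' two, and 'K'/'k' one. A
self-contained user-space rendering of the same idea (validation of the
suffix tail is elided here, unlike in the kernel function):

	#include <stdio.h>
	#include <stdlib.h>

	/* Each case multiplies by 1024 and falls into the next one,
	 * so G = 1024 * 1024 * 1024. */
	static long parse_bytes(const char *str)
	{
		char *endp;
		long result = strtol(str, &endp, 0);

		switch (*endp) {
		case 'G': result *= 1024; /* fall through */
		case 'M': result *= 1024; /* fall through */
		case 'K':
		case 'k': result *= 1024;
		}
		return result;
	}

	int main(void)
	{
		/* prints: 2048 2048 1048576 */
		printf("%ld %ld %ld\n", parse_bytes("2048"),
		       parse_bytes("2KiB"), parse_bytes("1M"));
		return 0;
	}
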
diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
new file mode 100644
index 000000000000..6612eb79bf17
--- /dev/null
+++ b/drivers/mtd/ubi/cdev.c
@@ -0,0 +1,722 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * This file includes implementation of UBI character device operations.
23 *
24 * There are two kinds of character devices in UBI: UBI character devices and
25 * UBI volume character devices. UBI character devices allow users to
26 * manipulate whole volumes: create, remove, and re-size them. Volume character
27 * devices provide volume I/O capabilities.
28 *
29 * Major and minor numbers are assigned dynamically to both UBI and volume
30 * character devices.
31 */
32
33#include <linux/module.h>
34#include <linux/stat.h>
35#include <linux/ioctl.h>
36#include <linux/capability.h>
37#include <mtd/ubi-user.h>
38#include <asm/uaccess.h>
39#include <asm/div64.h>
40#include "ubi.h"
41
42/*
43 * Maximum sequence numbers of UBI and volume character device IOCTLs (direct
44 * logical eraseblock erase is a debug-only feature).
45 */
46#define UBI_CDEV_IOC_MAX_SEQ 2
47#ifndef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO
48#define VOL_CDEV_IOC_MAX_SEQ 1
49#else
50#define VOL_CDEV_IOC_MAX_SEQ 2
51#endif
52
53/**
54 * major_to_device - get UBI device object by character device major number.
55 * @major: major number
56 *
57 * This function returns a pointer to the UBI device object.
58 */
59static struct ubi_device *major_to_device(int major)
60{
61 int i;
62
63 for (i = 0; i < ubi_devices_cnt; i++)
64 if (ubi_devices[i] && ubi_devices[i]->major == major)
65 return ubi_devices[i];
66 BUG();
67}
68
69/**
70 * get_exclusive - get exclusive access to an UBI volume.
71 * @desc: volume descriptor
72 *
73 * This function changes UBI volume open mode to "exclusive". Returns previous
74 * mode value (positive integer) in case of success and a negative error code
75 * in case of failure.
76 */
77static int get_exclusive(struct ubi_volume_desc *desc)
78{
79 int users, err;
80 struct ubi_volume *vol = desc->vol;
81
82 spin_lock(&vol->ubi->volumes_lock);
83 users = vol->readers + vol->writers + vol->exclusive;
84 ubi_assert(users > 0);
85 if (users > 1) {
86 dbg_err("%d users for volume %d", users, vol->vol_id);
87 err = -EBUSY;
88 } else {
89 vol->readers = vol->writers = 0;
90 vol->exclusive = 1;
91 err = desc->mode;
92 desc->mode = UBI_EXCLUSIVE;
93 }
94 spin_unlock(&vol->ubi->volumes_lock);
95
96 return err;
97}
98
99/**
100 * revoke_exclusive - revoke exclusive mode.
101 * @desc: volume descriptor
102 * @mode: new mode to switch to
103 */
104static void revoke_exclusive(struct ubi_volume_desc *desc, int mode)
105{
106 struct ubi_volume *vol = desc->vol;
107
108 spin_lock(&vol->ubi->volumes_lock);
109 ubi_assert(vol->readers == 0 && vol->writers == 0);
110 ubi_assert(vol->exclusive == 1 && desc->mode == UBI_EXCLUSIVE);
111 vol->exclusive = 0;
112 if (mode == UBI_READONLY)
113 vol->readers = 1;
114 else if (mode == UBI_READWRITE)
115 vol->writers = 1;
116 else
117 vol->exclusive = 1;
118 spin_unlock(&vol->ubi->volumes_lock);
119
120 desc->mode = mode;
121}
122
123static int vol_cdev_open(struct inode *inode, struct file *file)
124{
125 struct ubi_volume_desc *desc;
126 const struct ubi_device *ubi = major_to_device(imajor(inode));
127 int vol_id = iminor(inode) - 1;
128 int mode;
129
130 if (file->f_mode & FMODE_WRITE)
131 mode = UBI_READWRITE;
132 else
133 mode = UBI_READONLY;
134
135 dbg_msg("open volume %d, mode %d", vol_id, mode);
136
137 desc = ubi_open_volume(ubi->ubi_num, vol_id, mode);
138 if (IS_ERR(desc))
139 return PTR_ERR(desc);
140
141 file->private_data = desc;
142 return 0;
143}
144
145static int vol_cdev_release(struct inode *inode, struct file *file)
146{
147 struct ubi_volume_desc *desc = file->private_data;
148 struct ubi_volume *vol = desc->vol;
149
150 dbg_msg("release volume %d, mode %d", vol->vol_id, desc->mode);
151
152 if (vol->updating) {
153 ubi_warn("update of volume %d not finished, volume is damaged",
154 vol->vol_id);
155 vol->updating = 0;
156 kfree(vol->upd_buf);
157 }
158
159 ubi_close_volume(desc);
160 return 0;
161}
162
163static loff_t vol_cdev_llseek(struct file *file, loff_t offset, int origin)
164{
165 struct ubi_volume_desc *desc = file->private_data;
166 struct ubi_volume *vol = desc->vol;
167 loff_t new_offset;
168
169 if (vol->updating) {
170 /* Update is in progress, seeking is prohibited */
171 dbg_err("updating");
172 return -EBUSY;
173 }
174
175 switch (origin) {
176 case 0: /* SEEK_SET */
177 new_offset = offset;
178 break;
179 case 1: /* SEEK_CUR */
180 new_offset = file->f_pos + offset;
181 break;
182 case 2: /* SEEK_END */
183 new_offset = vol->used_bytes + offset;
184 break;
185 default:
186 return -EINVAL;
187 }
188
189 if (new_offset < 0 || new_offset > vol->used_bytes) {
190 dbg_err("bad seek %lld", new_offset);
191 return -EINVAL;
192 }
193
194 dbg_msg("seek volume %d, offset %lld, origin %d, new offset %lld",
195 vol->vol_id, offset, origin, new_offset);
196
197 file->f_pos = new_offset;
198 return new_offset;
199}
200
201static ssize_t vol_cdev_read(struct file *file, __user char *buf, size_t count,
202 loff_t *offp)
203{
204 struct ubi_volume_desc *desc = file->private_data;
205 struct ubi_volume *vol = desc->vol;
206 struct ubi_device *ubi = vol->ubi;
207 int err, lnum, off, len, vol_id = desc->vol->vol_id, tbuf_size;
208 size_t count_save = count;
209 void *tbuf;
210 uint64_t tmp;
211
212 dbg_msg("read %zd bytes from offset %lld of volume %d",
213 count, *offp, vol_id);
214
215 if (vol->updating) {
216 dbg_err("updating");
217 return -EBUSY;
218 }
219 if (vol->upd_marker) {
220 dbg_err("damaged volume, update marker is set");
221 return -EBADF;
222 }
223 if (*offp == vol->used_bytes || count == 0)
224 return 0;
225
226 if (vol->corrupted)
227 dbg_msg("read from corrupted volume %d", vol_id);
228
229 if (*offp + count > vol->used_bytes)
230 count_save = count = vol->used_bytes - *offp;
231
232 tbuf_size = vol->usable_leb_size;
233 if (count < tbuf_size)
234 tbuf_size = ALIGN(count, ubi->min_io_size);
235 tbuf = kmalloc(tbuf_size, GFP_KERNEL);
236 if (!tbuf)
237 return -ENOMEM;
238
239 len = count > tbuf_size ? tbuf_size : count;
240
241 tmp = *offp;
242 off = do_div(tmp, vol->usable_leb_size);
243 lnum = tmp;
244
245 do {
246 cond_resched();
247
248 if (off + len >= vol->usable_leb_size)
249 len = vol->usable_leb_size - off;
250
251 err = ubi_eba_read_leb(ubi, vol_id, lnum, tbuf, off, len, 0);
252 if (err)
253 break;
254
255 off += len;
256 if (off == vol->usable_leb_size) {
257 lnum += 1;
258 off -= vol->usable_leb_size;
259 }
260
261 count -= len;
262 *offp += len;
263
264 err = copy_to_user(buf, tbuf, len);
265 if (err) {
266 err = -EFAULT;
267 break;
268 }
269
270 buf += len;
271 len = count > tbuf_size ? tbuf_size : count;
272 } while (count);
273
274 kfree(tbuf);
275 return err ? err : count_save - count;
276}
277
278#ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO
279
280/*
281 * This function allows direct writes to dynamic UBI volumes, without
282 * issuing the volume update operation. Available only as a debugging feature.
283 * Very useful for testing UBI.
284 */
285static ssize_t vol_cdev_direct_write(struct file *file, const char __user *buf,
286 size_t count, loff_t *offp)
287{
288 struct ubi_volume_desc *desc = file->private_data;
289 struct ubi_volume *vol = desc->vol;
290 struct ubi_device *ubi = vol->ubi;
291 int lnum, off, len, tbuf_size, vol_id = vol->vol_id, err = 0;
292 size_t count_save = count;
293 char *tbuf;
294 uint64_t tmp;
295
296 dbg_msg("requested: write %zd bytes to offset %lld of volume %u",
297 count, *offp, desc->vol->vol_id);
298
299 if (vol->vol_type == UBI_STATIC_VOLUME)
300 return -EROFS;
301
302 tmp = *offp;
303 off = do_div(tmp, vol->usable_leb_size);
304 lnum = tmp;
305
306 if (off % ubi->min_io_size) {
307 dbg_err("unaligned position");
308 return -EINVAL;
309 }
310
311 if (*offp + count > vol->used_bytes)
312 count_save = count = vol->used_bytes - *offp;
313
314 /* We can write only in fractions of the minimum I/O unit */
315 if (count % ubi->min_io_size) {
316 dbg_err("unaligned write length");
317 return -EINVAL;
318 }
319
320 tbuf_size = vol->usable_leb_size;
321 if (count < tbuf_size)
322 tbuf_size = ALIGN(count, ubi->min_io_size);
323 tbuf = kmalloc(tbuf_size, GFP_KERNEL);
324 if (!tbuf)
325 return -ENOMEM;
326
327 len = count > tbuf_size ? tbuf_size : count;
328
329 while (count) {
330 cond_resched();
331
332 if (off + len >= vol->usable_leb_size)
333 len = vol->usable_leb_size - off;
334
335 err = copy_from_user(tbuf, buf, len);
336 if (err) {
337 err = -EFAULT;
338 break;
339 }
340
341 err = ubi_eba_write_leb(ubi, vol_id, lnum, tbuf, off, len,
342 UBI_UNKNOWN);
343 if (err)
344 break;
345
346 off += len;
347 if (off == vol->usable_leb_size) {
348 lnum += 1;
349 off -= vol->usable_leb_size;
350 }
351
352 count -= len;
353 *offp += len;
354 buf += len;
355 len = count > tbuf_size ? tbuf_size : count;
356 }
357
358 kfree(tbuf);
359 return err ? err : count_save - count;
360}
361
362#else
363#define vol_cdev_direct_write(file, buf, count, offp) -EPERM
364#endif /* CONFIG_MTD_UBI_DEBUG_USERSPACE_IO */
365
366static ssize_t vol_cdev_write(struct file *file, const char __user *buf,
367 size_t count, loff_t *offp)
368{
369 int err = 0;
370 struct ubi_volume_desc *desc = file->private_data;
371 struct ubi_volume *vol = desc->vol;
372 struct ubi_device *ubi = vol->ubi;
373
374 if (!vol->updating)
375 return vol_cdev_direct_write(file, buf, count, offp);
376
377 err = ubi_more_update_data(ubi, vol->vol_id, buf, count);
378 if (err < 0) {
379 ubi_err("cannot write %zd bytes of update data", count);
380 return err;
381 }
382
383 if (err) {
384 /*
385 * Update is finished, @err contains number of actually written
386 * bytes now.
387 */
388 count = err;
389
390 err = ubi_check_volume(ubi, vol->vol_id);
391 if (err < 0)
392 return err;
393
394 if (err) {
395 ubi_warn("volume %d on UBI device %d is corrupted",
396 vol->vol_id, ubi->ubi_num);
397 vol->corrupted = 1;
398 }
399 vol->checked = 1;
400 revoke_exclusive(desc, UBI_READWRITE);
401 }
402
403 *offp += count;
404 return count;
405}
406
407static int vol_cdev_ioctl(struct inode *inode, struct file *file,
408 unsigned int cmd, unsigned long arg)
409{
410 int err = 0;
411 struct ubi_volume_desc *desc = file->private_data;
412 struct ubi_volume *vol = desc->vol;
413 struct ubi_device *ubi = vol->ubi;
414 void __user *argp = (void __user *)arg;
415
416 if (_IOC_NR(cmd) > VOL_CDEV_IOC_MAX_SEQ ||
417 _IOC_TYPE(cmd) != UBI_VOL_IOC_MAGIC)
418 return -ENOTTY;
419
420 if (_IOC_DIR(cmd) & _IOC_READ)
421 err = !access_ok(VERIFY_WRITE, argp, _IOC_SIZE(cmd));
422 else if (_IOC_DIR(cmd) & _IOC_WRITE)
423 err = !access_ok(VERIFY_READ, argp, _IOC_SIZE(cmd));
424 if (err)
425 return -EFAULT;
426
427 switch (cmd) {
428
429 /* Volume update command */
430 case UBI_IOCVOLUP:
431 {
432 int64_t bytes, rsvd_bytes;
433
434 if (!capable(CAP_SYS_RESOURCE)) {
435 err = -EPERM;
436 break;
437 }
438
439 err = copy_from_user(&bytes, argp, sizeof(int64_t));
440 if (err) {
441 err = -EFAULT;
442 break;
443 }
444
445 if (desc->mode == UBI_READONLY) {
446 err = -EROFS;
447 break;
448 }
449
450 rsvd_bytes = vol->reserved_pebs * (ubi->leb_size-vol->data_pad);
451 if (bytes < 0 || bytes > rsvd_bytes) {
452 err = -EINVAL;
453 break;
454 }
455
456 err = get_exclusive(desc);
457 if (err < 0)
458 break;
459
460 err = ubi_start_update(ubi, vol->vol_id, bytes);
461 if (bytes == 0)
462 revoke_exclusive(desc, UBI_READWRITE);
463
464 file->f_pos = 0;
465 break;
466 }
467
468#ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO
469 /* Logical eraseblock erasure command */
470 case UBI_IOCEBER:
471 {
472 int32_t lnum;
473
474 err = __get_user(lnum, (__user int32_t *)argp);
475 if (err) {
476 err = -EFAULT;
477 break;
478 }
479
480 if (desc->mode == UBI_READONLY) {
481 err = -EROFS;
482 break;
483 }
484
485 if (lnum < 0 || lnum >= vol->reserved_pebs) {
486 err = -EINVAL;
487 break;
488 }
489
490 if (vol->vol_type != UBI_DYNAMIC_VOLUME) {
491 err = -EROFS;
492 break;
493 }
494
495 dbg_msg("erase LEB %d:%d", vol->vol_id, lnum);
496 err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum);
497 if (err)
498 break;
499
500 err = ubi_wl_flush(ubi);
501 break;
502 }
503#endif
504
505 default:
506 err = -ENOTTY;
507 break;
508 }
509
510 return err;
511}
512
513/**
514 * verify_mkvol_req - verify volume creation request.
515 * @ubi: UBI device description object
516 * @req: the request to check
517 *
518 * This function returns zero if the request is correct, and %-EINVAL if not.
519 */
520static int verify_mkvol_req(const struct ubi_device *ubi,
521 const struct ubi_mkvol_req *req)
522{
523 int n, err = -EINVAL;
524
525 if (req->bytes < 0 || req->alignment < 0 || req->vol_type < 0 ||
526 req->name_len < 0)
527 goto bad;
528
529 if ((req->vol_id < 0 || req->vol_id >= ubi->vtbl_slots) &&
530 req->vol_id != UBI_VOL_NUM_AUTO)
531 goto bad;
532
533 if (req->alignment == 0)
534 goto bad;
535
536 if (req->bytes == 0)
537 goto bad;
538
539 if (req->vol_type != UBI_DYNAMIC_VOLUME &&
540 req->vol_type != UBI_STATIC_VOLUME)
541 goto bad;
542
543 if (req->alignment > ubi->leb_size)
544 goto bad;
545
546 n = req->alignment % ubi->min_io_size;
547 if (req->alignment != 1 && n)
548 goto bad;
549
550 if (req->name_len > UBI_VOL_NAME_MAX) {
551 err = -ENAMETOOLONG;
552 goto bad;
553 }
554
555 return 0;
556
557bad:
558 dbg_err("bad volume creation request");
559 ubi_dbg_dump_mkvol_req(req);
560 return err;
561}
562
563/**
564 * verify_rsvol_req - verify volume re-size request.
565 * @ubi: UBI device description object
566 * @req: the request to check
567 *
568 * This function returns zero if the request is correct, and %-EINVAL if not.
569 */
570static int verify_rsvol_req(const struct ubi_device *ubi,
571 const struct ubi_rsvol_req *req)
572{
573 if (req->bytes <= 0)
574 return -EINVAL;
575
576 if (req->vol_id < 0 || req->vol_id >= ubi->vtbl_slots)
577 return -EINVAL;
578
579 return 0;
580}
581
582static int ubi_cdev_ioctl(struct inode *inode, struct file *file,
583 unsigned int cmd, unsigned long arg)
584{
585 int err = 0;
586 struct ubi_device *ubi;
587 struct ubi_volume_desc *desc;
588 void __user *argp = (void __user *)arg;
589
590 if (_IOC_NR(cmd) > UBI_CDEV_IOC_MAX_SEQ ||
591 _IOC_TYPE(cmd) != UBI_IOC_MAGIC)
592 return -ENOTTY;
593
594 if (_IOC_DIR(cmd) & _IOC_READ)
595 err = !access_ok(VERIFY_WRITE, argp, _IOC_SIZE(cmd));
596 else if (_IOC_DIR(cmd) & _IOC_WRITE)
597 err = !access_ok(VERIFY_READ, argp, _IOC_SIZE(cmd));
598 if (err)
599 return -EFAULT;
600
601 if (!capable(CAP_SYS_RESOURCE))
602 return -EPERM;
603
604 ubi = major_to_device(imajor(inode));
605 if (IS_ERR(ubi))
606 return PTR_ERR(ubi);
607
608 switch (cmd) {
609 /* Create volume command */
610 case UBI_IOCMKVOL:
611 {
612 struct ubi_mkvol_req req;
613
614 dbg_msg("create volume");
615 err = __copy_from_user(&req, argp,
616 sizeof(struct ubi_mkvol_req));
617 if (err) {
618 err = -EFAULT;
619 break;
620 }
621
622 err = verify_mkvol_req(ubi, &req);
623 if (err)
624 break;
625
626 req.name[req.name_len] = '\0';
627
628 err = ubi_create_volume(ubi, &req);
629 if (err)
630 break;
631
632 err = __put_user(req.vol_id, (__user int32_t *)argp);
633 if (err)
634 err = -EFAULT;
635
636 break;
637 }
638
639 /* Remove volume command */
640 case UBI_IOCRMVOL:
641 {
642 int vol_id;
643
644 dbg_msg("remove volume");
645 err = __get_user(vol_id, (__user int32_t *)argp);
646 if (err) {
647 err = -EFAULT;
648 break;
649 }
650
651 desc = ubi_open_volume(ubi->ubi_num, vol_id, UBI_EXCLUSIVE);
652 if (IS_ERR(desc)) {
653 err = PTR_ERR(desc);
654 break;
655 }
656
657 err = ubi_remove_volume(desc);
658 if (err)
659 ubi_close_volume(desc);
660
661 break;
662 }
663
664 /* Re-size volume command */
665 case UBI_IOCRSVOL:
666 {
667 int pebs;
668 uint64_t tmp;
669 struct ubi_rsvol_req req;
670
671 dbg_msg("re-size volume");
672 err = __copy_from_user(&req, argp,
673 sizeof(struct ubi_rsvol_req));
674 if (err) {
675 err = -EFAULT;
676 break;
677 }
678
679 err = verify_rsvol_req(ubi, &req);
680 if (err)
681 break;
682
683 desc = ubi_open_volume(ubi->ubi_num, req.vol_id, UBI_EXCLUSIVE);
684 if (IS_ERR(desc)) {
685 err = PTR_ERR(desc);
686 break;
687 }
688
689 tmp = req.bytes;
690 pebs = !!do_div(tmp, desc->vol->usable_leb_size);
691 pebs += tmp;
692
693 err = ubi_resize_volume(desc, pebs);
694 ubi_close_volume(desc);
695 break;
696 }
697
698 default:
699 err = -ENOTTY;
700 break;
701 }
702
703 return err;
704}
705
706/* UBI character device operations */
707struct file_operations ubi_cdev_operations = {
708 .owner = THIS_MODULE,
709 .ioctl = ubi_cdev_ioctl,
710 .llseek = no_llseek
711};
712
713/* UBI volume character device operations */
714struct file_operations ubi_vol_cdev_operations = {
715 .owner = THIS_MODULE,
716 .open = vol_cdev_open,
717 .release = vol_cdev_release,
718 .llseek = vol_cdev_llseek,
719 .read = vol_cdev_read,
720 .write = vol_cdev_write,
721 .ioctl = vol_cdev_ioctl
722};
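
Note: the read, write, seek and resize paths above all reduce a flat byte
position inside a volume to a (logical eraseblock, offset) pair via do_div().
A user-space sketch of that arithmetic (the function name is made up for
illustration; do_div() itself is a kernel macro that stores the quotient in
place and returns the remainder):

	#include <stdint.h>
	#include <stdio.h>

	/* Split a byte position into the LEB number that contains it
	 * (quotient) and the offset inside that LEB (remainder). */
	static void pos_to_leb(uint64_t pos, uint32_t usable_leb_size,
			       uint32_t *lnum, uint32_t *off)
	{
		*lnum = pos / usable_leb_size;
		*off  = pos % usable_leb_size;
	}

	int main(void)
	{
		uint32_t lnum, off;

		/* e.g. 128 KiB PEBs minus 2 KiB of headers -> 129024-byte LEBs */
		pos_to_leb(300000, 129024, &lnum, &off);
		printf("lnum=%u off=%u\n", lnum, off); /* lnum=2 off=41952 */
		return 0;
	}
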
diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c
new file mode 100644
index 000000000000..86364221fafe
--- /dev/null
+++ b/drivers/mtd/ubi/debug.c
@@ -0,0 +1,224 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * Here we keep all the UBI debugging stuff which should normally be disabled
23 * and compiled out, but which is extremely helpful when hunting bugs or
24 * making big changes.
25 */
26
27#ifdef CONFIG_MTD_UBI_DEBUG_MSG
28
29#include "ubi.h"
30
31/**
32 * ubi_dbg_dump_ec_hdr - dump an erase counter header.
33 * @ec_hdr: the erase counter header to dump
34 */
35void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr)
36{
37 dbg_msg("erase counter header dump:");
38 dbg_msg("magic %#08x", ubi32_to_cpu(ec_hdr->magic));
39 dbg_msg("version %d", (int)ec_hdr->version);
40 dbg_msg("ec %llu", (long long)ubi64_to_cpu(ec_hdr->ec));
41 dbg_msg("vid_hdr_offset %d", ubi32_to_cpu(ec_hdr->vid_hdr_offset));
42 dbg_msg("data_offset %d", ubi32_to_cpu(ec_hdr->data_offset));
43 dbg_msg("hdr_crc %#08x", ubi32_to_cpu(ec_hdr->hdr_crc));
44 dbg_msg("erase counter header hexdump:");
45 ubi_dbg_hexdump(ec_hdr, UBI_EC_HDR_SIZE);
46}
47
48/**
49 * ubi_dbg_dump_vid_hdr - dump a volume identifier header.
50 * @vid_hdr: the volume identifier header to dump
51 */
52void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr)
53{
54 dbg_msg("volume identifier header dump:");
55 dbg_msg("magic %08x", ubi32_to_cpu(vid_hdr->magic));
56 dbg_msg("version %d", (int)vid_hdr->version);
57 dbg_msg("vol_type %d", (int)vid_hdr->vol_type);
58 dbg_msg("copy_flag %d", (int)vid_hdr->copy_flag);
59 dbg_msg("compat %d", (int)vid_hdr->compat);
60 dbg_msg("vol_id %d", ubi32_to_cpu(vid_hdr->vol_id));
61 dbg_msg("lnum %d", ubi32_to_cpu(vid_hdr->lnum));
62 dbg_msg("leb_ver %u", ubi32_to_cpu(vid_hdr->leb_ver));
63 dbg_msg("data_size %d", ubi32_to_cpu(vid_hdr->data_size));
64 dbg_msg("used_ebs %d", ubi32_to_cpu(vid_hdr->used_ebs));
65 dbg_msg("data_pad %d", ubi32_to_cpu(vid_hdr->data_pad));
66 dbg_msg("sqnum %llu",
67 (unsigned long long)ubi64_to_cpu(vid_hdr->sqnum));
68 dbg_msg("hdr_crc %08x", ubi32_to_cpu(vid_hdr->hdr_crc));
69 dbg_msg("volume identifier header hexdump:");
70}
71
72/**
73 * ubi_dbg_dump_vol_info- dump volume information.
74 * @vol: UBI volume description object
75 */
76void ubi_dbg_dump_vol_info(const struct ubi_volume *vol)
77{
78 dbg_msg("volume information dump:");
79 dbg_msg("vol_id %d", vol->vol_id);
80 dbg_msg("reserved_pebs %d", vol->reserved_pebs);
81 dbg_msg("alignment %d", vol->alignment);
82 dbg_msg("data_pad %d", vol->data_pad);
83 dbg_msg("vol_type %d", vol->vol_type);
84 dbg_msg("name_len %d", vol->name_len);
85 dbg_msg("usable_leb_size %d", vol->usable_leb_size);
86 dbg_msg("used_ebs %d", vol->used_ebs);
87 dbg_msg("used_bytes %lld", vol->used_bytes);
88 dbg_msg("last_eb_bytes %d", vol->last_eb_bytes);
89 dbg_msg("corrupted %d", vol->corrupted);
90 dbg_msg("upd_marker %d", vol->upd_marker);
91
92 if (vol->name_len <= UBI_VOL_NAME_MAX &&
93 strnlen(vol->name, vol->name_len + 1) == vol->name_len) {
94 dbg_msg("name %s", vol->name);
95 } else {
96 dbg_msg("the 1st 5 characters of the name: %c%c%c%c%c",
97 vol->name[0], vol->name[1], vol->name[2],
98 vol->name[3], vol->name[4]);
99 }
100}
101
102/**
103 * ubi_dbg_dump_vtbl_record - dump a &struct ubi_vtbl_record object.
104 * @r: the object to dump
105 * @idx: volume table index
106 */
107void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx)
108{
109 int name_len = ubi16_to_cpu(r->name_len);
110
111 dbg_msg("volume table record %d dump:", idx);
112 dbg_msg("reserved_pebs %d", ubi32_to_cpu(r->reserved_pebs));
113 dbg_msg("alignment %d", ubi32_to_cpu(r->alignment));
114 dbg_msg("data_pad %d", ubi32_to_cpu(r->data_pad));
115 dbg_msg("vol_type %d", (int)r->vol_type);
116 dbg_msg("upd_marker %d", (int)r->upd_marker);
117 dbg_msg("name_len %d", name_len);
118
119 if (r->name[0] == '\0') {
120 dbg_msg("name NULL");
121 return;
122 }
123
124 if (name_len <= UBI_VOL_NAME_MAX &&
125 strnlen(&r->name[0], name_len + 1) == name_len) {
126 dbg_msg("name %s", &r->name[0]);
127 } else {
128 dbg_msg("1st 5 characters of the name: %c%c%c%c%c",
129 r->name[0], r->name[1], r->name[2], r->name[3],
130 r->name[4]);
131 }
132 dbg_msg("crc %#08x", ubi32_to_cpu(r->crc));
133}
134
135/**
136 * ubi_dbg_dump_sv - dump a &struct ubi_scan_volume object.
137 * @sv: the object to dump
138 */
139void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv)
140{
141 dbg_msg("volume scanning information dump:");
142 dbg_msg("vol_id %d", sv->vol_id);
143 dbg_msg("highest_lnum %d", sv->highest_lnum);
144 dbg_msg("leb_count %d", sv->leb_count);
145 dbg_msg("compat %d", sv->compat);
146 dbg_msg("vol_type %d", sv->vol_type);
147 dbg_msg("used_ebs %d", sv->used_ebs);
148 dbg_msg("last_data_size %d", sv->last_data_size);
149 dbg_msg("data_pad %d", sv->data_pad);
150}
151
152/**
153 * ubi_dbg_dump_seb - dump a &struct ubi_scan_leb object.
154 * @seb: the object to dump
155 * @type: object type: 0 - not corrupted, 1 - corrupted
156 */
157void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type)
158{
159 dbg_msg("eraseblock scanning information dump:");
160 dbg_msg("ec %d", seb->ec);
161 dbg_msg("pnum %d", seb->pnum);
162 if (type == 0) {
163 dbg_msg("lnum %d", seb->lnum);
164 dbg_msg("scrub %d", seb->scrub);
165 dbg_msg("sqnum %llu", seb->sqnum);
166 dbg_msg("leb_ver %u", seb->leb_ver);
167 }
168}
169
170/**
171 * ubi_dbg_dump_mkvol_req - dump a &struct ubi_mkvol_req object.
172 * @req: the object to dump
173 */
174void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req)
175{
176 char nm[17];
177
178 dbg_msg("volume creation request dump:");
179 dbg_msg("vol_id %d", req->vol_id);
180 dbg_msg("alignment %d", req->alignment);
181 dbg_msg("bytes %lld", (long long)req->bytes);
182 dbg_msg("vol_type %d", req->vol_type);
183 dbg_msg("name_len %d", req->name_len);
184
185 memcpy(nm, req->name, 16);
186 nm[16] = 0;
187 dbg_msg("the 1st 16 characters of the name: %s", nm);
188}
189
190#define BYTES_PER_LINE 32
191
192/**
193 * ubi_dbg_hexdump - dump a buffer.
194 * @ptr: the buffer to dump
195 * @size: buffer size which must be multiple of 4 bytes
196 */
197void ubi_dbg_hexdump(const void *ptr, int size)
198{
199 int i, k = 0, rows, columns;
200 const uint8_t *p = ptr;
201
202 size = ALIGN(size, 4);
203	rows = size/BYTES_PER_LINE + !!(size % BYTES_PER_LINE);
204 for (i = 0; i < rows; i++) {
205 int j;
206
207 cond_resched();
208 columns = min(size - k, BYTES_PER_LINE) / 4;
209 if (columns == 0)
210 break;
211 printk(KERN_DEBUG "%5d: ", i * BYTES_PER_LINE);
212 for (j = 0; j < columns; j++) {
213 int n, N;
214
215 N = size - k > 4 ? 4 : size - k;
216 for (n = 0; n < N; n++)
217 printk("%02x", p[k++]);
218 printk(" ");
219 }
220 printk("\n");
221 }
222}
223
224#endif /* CONFIG_MTD_UBI_DEBUG_MSG */
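
The ubi_dbg_hexdump() function above emits BYTES_PER_LINE (32) bytes per row, grouped into 4-byte columns, with each row prefixed by its starting byte offset. The following standalone userspace approximation of that layout may help when reading its output; it is a sketch only and omits the kernel's cond_resched() call and the 4-byte size alignment:

    #include <stdint.h>
    #include <stdio.h>

    /* Approximation of the ubi_dbg_hexdump() row/column layout. */
    static void hexdump(const void *ptr, int size)
    {
            const uint8_t *p = ptr;
            int k = 0;

            while (k < size) {
                    printf("%5d: ", k);     /* row prefix: starting offset */
                    for (int j = 0; j < 32 && k < size; j++) {
                            printf("%02x", p[k++]);
                            if (j % 4 == 3)
                                    printf(" ");    /* gap after each 4-byte column */
                    }
                    printf("\n");
            }
    }

    int main(void)
    {
            uint8_t buf[40];

            for (int i = 0; i < 40; i++)
                    buf[i] = (uint8_t)i;
            hexdump(buf, sizeof(buf));
            return 0;
    }
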
diff --git a/drivers/mtd/ubi/debug.h b/drivers/mtd/ubi/debug.h
new file mode 100644
index 000000000000..f816ad9a36c0
--- /dev/null
+++ b/drivers/mtd/ubi/debug.h
@@ -0,0 +1,161 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21#ifndef __UBI_DEBUG_H__
22#define __UBI_DEBUG_H__
23
24#ifdef CONFIG_MTD_UBI_DEBUG
25#include <linux/random.h>
26
27#define ubi_assert(expr) BUG_ON(!(expr))
28#define dbg_err(fmt, ...) ubi_err(fmt, ##__VA_ARGS__)
29#else
30#define ubi_assert(expr) ({})
31#define dbg_err(fmt, ...) ({})
32#endif
33
34#ifdef CONFIG_MTD_UBI_DEBUG_DISABLE_BGT
35#define DBG_DISABLE_BGT 1
36#else
37#define DBG_DISABLE_BGT 0
38#endif
39
40#ifdef CONFIG_MTD_UBI_DEBUG_MSG
41/* Generic debugging message */
42#define dbg_msg(fmt, ...) \
43 printk(KERN_DEBUG "UBI DBG: %s: " fmt "\n", __FUNCTION__, ##__VA_ARGS__)
44
45#define ubi_dbg_dump_stack() dump_stack()
46
47struct ubi_ec_hdr;
48struct ubi_vid_hdr;
49struct ubi_volume;
50struct ubi_vtbl_record;
51struct ubi_scan_volume;
52struct ubi_scan_leb;
53struct ubi_mkvol_req;
54
55void ubi_dbg_print(int type, const char *func, const char *fmt, ...);
56void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr);
57void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr);
58void ubi_dbg_dump_vol_info(const struct ubi_volume *vol);
59void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx);
60void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv);
61void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type);
62void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req);
63void ubi_dbg_hexdump(const void *buf, int size);
64
65#else
66
67#define dbg_msg(fmt, ...) ({})
68#define ubi_dbg_dump_stack() ({})
69#define ubi_dbg_print(func, fmt, ...) ({})
70#define ubi_dbg_dump_ec_hdr(ec_hdr) ({})
71#define ubi_dbg_dump_vid_hdr(vid_hdr) ({})
72#define ubi_dbg_dump_vol_info(vol) ({})
73#define ubi_dbg_dump_vtbl_record(r, idx) ({})
74#define ubi_dbg_dump_sv(sv) ({})
75#define ubi_dbg_dump_seb(seb, type) ({})
76#define ubi_dbg_dump_mkvol_req(req) ({})
77#define ubi_dbg_hexdump(buf, size) ({})
78
79#endif /* CONFIG_MTD_UBI_DEBUG_MSG */
80
81#ifdef CONFIG_MTD_UBI_DEBUG_MSG_EBA
82/* Messages from the eraseblock association unit */
83#define dbg_eba(fmt, ...) \
84 printk(KERN_DEBUG "UBI DBG eba: %s: " fmt "\n", __FUNCTION__, \
85 ##__VA_ARGS__)
86#else
87#define dbg_eba(fmt, ...) ({})
88#endif
89
90#ifdef CONFIG_MTD_UBI_DEBUG_MSG_WL
91/* Messages from the wear-leveling unit */
92#define dbg_wl(fmt, ...) \
93 printk(KERN_DEBUG "UBI DBG wl: %s: " fmt "\n", __FUNCTION__, \
94 ##__VA_ARGS__)
95#else
96#define dbg_wl(fmt, ...) ({})
97#endif
98
99#ifdef CONFIG_MTD_UBI_DEBUG_MSG_IO
100/* Messages from the input/output unit */
101#define dbg_io(fmt, ...) \
102 printk(KERN_DEBUG "UBI DBG io: %s: " fmt "\n", __FUNCTION__, \
103 ##__VA_ARGS__)
104#else
105#define dbg_io(fmt, ...) ({})
106#endif
107
108#ifdef CONFIG_MTD_UBI_DEBUG_MSG_BLD
109/* Initialization and build messages */
110#define dbg_bld(fmt, ...) \
111 printk(KERN_DEBUG "UBI DBG bld: %s: " fmt "\n", __FUNCTION__, \
112 ##__VA_ARGS__)
113#else
114#define dbg_bld(fmt, ...) ({})
115#endif
116
117#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_BITFLIPS
118/**
119 * ubi_dbg_is_bitflip - if it is time to emulate a bit-flip.
120 *
121 * Returns non-zero if a bit-flip should be emulated, otherwise returns zero.
122 */
123static inline int ubi_dbg_is_bitflip(void)
124{
125 return !(random32() % 200);
126}
127#else
128#define ubi_dbg_is_bitflip() 0
129#endif
130
131#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_WRITE_FAILURES
132/**
133 * ubi_dbg_is_write_failure - if it is time to emulate a write failure.
134 *
135 * Returns non-zero if a write failure should be emulated, otherwise returns
136 * zero.
137 */
138static inline int ubi_dbg_is_write_failure(void)
139{
140 return !(random32() % 500);
141}
142#else
143#define ubi_dbg_is_write_failure() 0
144#endif
145
146#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_ERASE_FAILURES
147/**
148 * ubi_dbg_is_erase_failure - if it is time to emulate an erase failure.
149 *
150 * Returns non-zero if an erase failure should be emulated, otherwise returns
151 * zero.
152 */
153static inline int ubi_dbg_is_erase_failure(void)
154{
155 return !(random32() % 400);
156}
157#else
158#define ubi_dbg_is_erase_failure() 0
159#endif
160
161#endif /* !__UBI_DEBUG_H__ */
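
The CONFIG_MTD_UBI_DEBUG_EMULATE_* predicates above inject faults with fixed probabilities: roughly one read in 200 reports an emulated bit-flip, while one write in 500 and one erase in 400 report emulated failures. The userspace sketch below demonstrates the expected rate, with rand() standing in for the kernel's random32(); it illustrates the idiom and is not kernel code:

    #include <stdio.h>
    #include <stdlib.h>

    /* Same idiom as ubi_dbg_is_bitflip(): true once per ~200 calls. */
    static int dbg_is_bitflip(void)
    {
            return !(rand() % 200);
    }

    int main(void)
    {
            int hits = 0, trials = 1000000;

            for (int i = 0; i < trials; i++)
                    hits += dbg_is_bitflip();
            printf("emulated bit-flip rate: %f (expect ~0.005)\n",
                   (double)hits / trials);
            return 0;
    }
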
diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
new file mode 100644
index 000000000000..d847ee1da3d9
--- /dev/null
+++ b/drivers/mtd/ubi/eba.c
@@ -0,0 +1,1241 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * The UBI Eraseblock Association (EBA) unit.
23 *
24 * This unit is responsible for I/O to/from logical eraseblocks.
25 *
26 * Although in this implementation the EBA table is fully kept and managed in
27 * RAM, which implies poor scalability, it might be (partially) maintained on
28 * flash in future implementations.
29 *
30 * The EBA unit implements per-logical eraseblock locking. Before accessing a
31 * logical eraseblock it is locked for reading or writing. The per-logical
32 * eraseblock locking is implemented by means of the lock tree. The lock tree
33 * is an RB-tree which refers to all currently locked logical eraseblocks. The
34 * lock tree elements are &struct ltree_entry objects. They are indexed by
35 * (@vol_id, @lnum) pairs.
36 *
37 * EBA also maintains the global sequence counter which is incremented each
38 * time a logical eraseblock is mapped to a physical eraseblock and it is
39 * stored in the volume identifier header. This means that each VID header has
40 * a unique sequence number. The sequence number is only increased, and we
41 * assume 64 bits is enough for it never to overflow.
42 */
43
44#include <linux/slab.h>
45#include <linux/crc32.h>
46#include <linux/err.h>
47#include "ubi.h"
48
49/**
50 * struct ltree_entry - an entry in the lock tree.
51 * @rb: links RB-tree nodes
52 * @vol_id: volume ID of the locked logical eraseblock
53 * @lnum: locked logical eraseblock number
54 * @users: how many tasks are using this logical eraseblock or wait for it
55 * @mutex: read/write mutex to implement read/write access serialization to
56 * the (@vol_id, @lnum) logical eraseblock
57 *
58 * When a logical eraseblock is being locked, the corresponding &struct ltree_entry
59 * object is inserted into the lock tree (@ubi->ltree).
60 */
61struct ltree_entry {
62 struct rb_node rb;
63 int vol_id;
64 int lnum;
65 int users;
66 struct rw_semaphore mutex;
67};
68
69/* Slab cache for lock-tree entries */
70static struct kmem_cache *ltree_slab;
71
72/**
73 * next_sqnum - get next sequence number.
74 * @ubi: UBI device description object
75 *
76 * This function returns the next sequence number to use, which is just the
77 * current global sequence counter value. It also increments the global sequence
78 * counter.
79 */
80static unsigned long long next_sqnum(struct ubi_device *ubi)
81{
82 unsigned long long sqnum;
83
84 spin_lock(&ubi->ltree_lock);
85 sqnum = ubi->global_sqnum++;
86 spin_unlock(&ubi->ltree_lock);
87
88 return sqnum;
89}
90
91/**
92 * ubi_get_compat - get compatibility flags of a volume.
93 * @ubi: UBI device description object
94 * @vol_id: volume ID
95 *
96 * This function returns compatibility flags for an internal volume. User
97 * volumes have no compatibility flags, so %0 is returned.
98 */
99static int ubi_get_compat(const struct ubi_device *ubi, int vol_id)
100{
101 if (vol_id == UBI_LAYOUT_VOL_ID)
102 return UBI_LAYOUT_VOLUME_COMPAT;
103 return 0;
104}
105
106/**
107 * ltree_lookup - look up the lock tree.
108 * @ubi: UBI device description object
109 * @vol_id: volume ID
110 * @lnum: logical eraseblock number
111 *
112 * This function returns a pointer to the corresponding &struct ltree_entry
113 * object if the logical eraseblock is locked and %NULL if it is not.
114 * @ubi->ltree_lock has to be locked.
115 */
116static struct ltree_entry *ltree_lookup(struct ubi_device *ubi, int vol_id,
117 int lnum)
118{
119 struct rb_node *p;
120
121 p = ubi->ltree.rb_node;
122 while (p) {
123 struct ltree_entry *le;
124
125 le = rb_entry(p, struct ltree_entry, rb);
126
127 if (vol_id < le->vol_id)
128 p = p->rb_left;
129 else if (vol_id > le->vol_id)
130 p = p->rb_right;
131 else {
132 if (lnum < le->lnum)
133 p = p->rb_left;
134 else if (lnum > le->lnum)
135 p = p->rb_right;
136 else
137 return le;
138 }
139 }
140
141 return NULL;
142}
143
144/**
145 * ltree_add_entry - add new entry to the lock tree.
146 * @ubi: UBI device description object
147 * @vol_id: volume ID
148 * @lnum: logical eraseblock number
149 *
150 * This function adds new entry for logical eraseblock (@vol_id, @lnum) to the
151 * lock tree. If such an entry is already there, its usage counter is increased.
152 * Returns pointer to the lock tree entry or %-ENOMEM if memory allocation
153 * failed.
154 */
155static struct ltree_entry *ltree_add_entry(struct ubi_device *ubi, int vol_id,
156 int lnum)
157{
158 struct ltree_entry *le, *le1, *le_free;
159
160 le = kmem_cache_alloc(ltree_slab, GFP_KERNEL);
161 if (!le)
162 return ERR_PTR(-ENOMEM);
163
164 le->vol_id = vol_id;
165 le->lnum = lnum;
166
167 spin_lock(&ubi->ltree_lock);
168 le1 = ltree_lookup(ubi, vol_id, lnum);
169
170 if (le1) {
171 /*
172 * This logical eraseblock is already locked. The newly
173 * allocated lock entry is not needed.
174 */
175 le_free = le;
176 le = le1;
177 } else {
178 struct rb_node **p, *parent = NULL;
179
180 /*
181 * No lock entry, add the newly allocated one to the
182 * @ubi->ltree RB-tree.
183 */
184 le_free = NULL;
185
186 p = &ubi->ltree.rb_node;
187 while (*p) {
188 parent = *p;
189 le1 = rb_entry(parent, struct ltree_entry, rb);
190
191 if (vol_id < le1->vol_id)
192 p = &(*p)->rb_left;
193 else if (vol_id > le1->vol_id)
194 p = &(*p)->rb_right;
195 else {
196 ubi_assert(lnum != le1->lnum);
197 if (lnum < le1->lnum)
198 p = &(*p)->rb_left;
199 else
200 p = &(*p)->rb_right;
201 }
202 }
203
204 rb_link_node(&le->rb, parent, p);
205 rb_insert_color(&le->rb, &ubi->ltree);
206 }
207 le->users += 1;
208 spin_unlock(&ubi->ltree_lock);
209
210 if (le_free)
211 kmem_cache_free(ltree_slab, le_free);
212
213 return le;
214}
215
216/**
217 * leb_read_lock - lock logical eraseblock for reading.
218 * @ubi: UBI device description object
219 * @vol_id: volume ID
220 * @lnum: logical eraseblock number
221 *
222 * This function locks a logical eraseblock for reading. Returns zero in case
223 * of success and a negative error code in case of failure.
224 */
225static int leb_read_lock(struct ubi_device *ubi, int vol_id, int lnum)
226{
227 struct ltree_entry *le;
228
229 le = ltree_add_entry(ubi, vol_id, lnum);
230 if (IS_ERR(le))
231 return PTR_ERR(le);
232 down_read(&le->mutex);
233 return 0;
234}
235
236/**
237 * leb_read_unlock - unlock logical eraseblock.
238 * @ubi: UBI device description object
239 * @vol_id: volume ID
240 * @lnum: logical eraseblock number
241 */
242static void leb_read_unlock(struct ubi_device *ubi, int vol_id, int lnum)
243{
244 int free = 0;
245 struct ltree_entry *le;
246
247 spin_lock(&ubi->ltree_lock);
248 le = ltree_lookup(ubi, vol_id, lnum);
249 le->users -= 1;
250 ubi_assert(le->users >= 0);
251 if (le->users == 0) {
252 rb_erase(&le->rb, &ubi->ltree);
253 free = 1;
254 }
255 spin_unlock(&ubi->ltree_lock);
256
257 up_read(&le->mutex);
258 if (free)
259 kmem_cache_free(ltree_slab, le);
260}
261
262/**
263 * leb_write_lock - lock logical eraseblock for writing.
264 * @ubi: UBI device description object
265 * @vol_id: volume ID
266 * @lnum: logical eraseblock number
267 *
268 * This function locks a logical eraseblock for writing. Returns zero in case
269 * of success and a negative error code in case of failure.
270 */
271static int leb_write_lock(struct ubi_device *ubi, int vol_id, int lnum)
272{
273 struct ltree_entry *le;
274
275 le = ltree_add_entry(ubi, vol_id, lnum);
276 if (IS_ERR(le))
277 return PTR_ERR(le);
278 down_write(&le->mutex);
279 return 0;
280}
281
282/**
283 * leb_write_unlock - unlock logical eraseblock.
284 * @ubi: UBI device description object
285 * @vol_id: volume ID
286 * @lnum: logical eraseblock number
287 */
288static void leb_write_unlock(struct ubi_device *ubi, int vol_id, int lnum)
289{
290 int free;
291 struct ltree_entry *le;
292
293 spin_lock(&ubi->ltree_lock);
294 le = ltree_lookup(ubi, vol_id, lnum);
295 le->users -= 1;
296 ubi_assert(le->users >= 0);
297 if (le->users == 0) {
298 rb_erase(&le->rb, &ubi->ltree);
299 free = 1;
300 } else
301 free = 0;
302 spin_unlock(&ubi->ltree_lock);
303
304 up_write(&le->mutex);
305 if (free)
306 kmem_cache_free(ltree_slab, le);
307}
308
309/**
310 * ubi_eba_unmap_leb - un-map logical eraseblock.
311 * @ubi: UBI device description object
312 * @vol_id: volume ID
313 * @lnum: logical eraseblock number
314 *
315 * This function un-maps logical eraseblock @lnum and schedules the corresponding
316 * physical eraseblock for erasure. Returns zero in case of success and a
317 * negative error code in case of failure.
318 */
319int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum)
320{
321 int idx = vol_id2idx(ubi, vol_id), err, pnum;
322 struct ubi_volume *vol = ubi->volumes[idx];
323
324 if (ubi->ro_mode)
325 return -EROFS;
326
327 err = leb_write_lock(ubi, vol_id, lnum);
328 if (err)
329 return err;
330
331 pnum = vol->eba_tbl[lnum];
332 if (pnum < 0)
333 /* This logical eraseblock is already unmapped */
334 goto out_unlock;
335
336 dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum);
337
338 vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED;
339 err = ubi_wl_put_peb(ubi, pnum, 0);
340
341out_unlock:
342 leb_write_unlock(ubi, vol_id, lnum);
343 return err;
344}
345
346/**
347 * ubi_eba_read_leb - read data.
348 * @ubi: UBI device description object
349 * @vol_id: volume ID
350 * @lnum: logical eraseblock number
351 * @buf: buffer to store the read data
352 * @offset: offset from where to read
353 * @len: how many bytes to read
354 * @check: data CRC check flag
355 *
356 * If the logical eraseblock @lnum is unmapped, @buf is filled with 0xFF
357 * bytes. The @check flag only makes sense for static volumes and forces
358 * eraseblock data CRC checking.
359 *
360 * In case of success this function returns zero. In case of a static volume,
361 * if the data CRC mismatches, %-EBADMSG is returned. %-EBADMSG may also be
362 * returned for any volume type if an ECC error was detected by the MTD device
363 * driver. Other negative error codes may be returned in case of other errors.
364 */
365int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf,
366 int offset, int len, int check)
367{
368 int err, pnum, scrub = 0, idx = vol_id2idx(ubi, vol_id);
369 struct ubi_vid_hdr *vid_hdr;
370 struct ubi_volume *vol = ubi->volumes[idx];
371 uint32_t crc, crc1;
372
373 err = leb_read_lock(ubi, vol_id, lnum);
374 if (err)
375 return err;
376
377 pnum = vol->eba_tbl[lnum];
378 if (pnum < 0) {
379 /*
380 * The logical eraseblock is not mapped, fill the whole buffer
381 * with 0xFF bytes. The exception is static volumes for which
382 * it is an error to read unmapped logical eraseblocks.
383 */
384 dbg_eba("read %d bytes from offset %d of LEB %d:%d (unmapped)",
385 len, offset, vol_id, lnum);
386 leb_read_unlock(ubi, vol_id, lnum);
387 ubi_assert(vol->vol_type != UBI_STATIC_VOLUME);
388 memset(buf, 0xFF, len);
389 return 0;
390 }
391
392 dbg_eba("read %d bytes from offset %d of LEB %d:%d, PEB %d",
393 len, offset, vol_id, lnum, pnum);
394
395 if (vol->vol_type == UBI_DYNAMIC_VOLUME)
396 check = 0;
397
398retry:
399 if (check) {
400 vid_hdr = ubi_zalloc_vid_hdr(ubi);
401 if (!vid_hdr) {
402 err = -ENOMEM;
403 goto out_unlock;
404 }
405
406 err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 1);
407 if (err && err != UBI_IO_BITFLIPS) {
408 if (err > 0) {
409 /*
410 * The header is either absent or corrupted.
411 * The former case means there is a bug -
412 * switch to read-only mode just in case.
413 * The latter case means a real corruption - we
414 * may try to recover data. FIXME: but this is
415 * not implemented.
416 */
417 if (err == UBI_IO_BAD_VID_HDR) {
418 ubi_warn("bad VID header at PEB %d, LEB"
419 "%d:%d", pnum, vol_id, lnum);
420 err = -EBADMSG;
421 } else
422 ubi_ro_mode(ubi);
423 }
424 goto out_free;
425 } else if (err == UBI_IO_BITFLIPS)
426 scrub = 1;
427
428 ubi_assert(lnum < ubi32_to_cpu(vid_hdr->used_ebs));
429 ubi_assert(len == ubi32_to_cpu(vid_hdr->data_size));
430
431 crc = ubi32_to_cpu(vid_hdr->data_crc);
432 ubi_free_vid_hdr(ubi, vid_hdr);
433 }
434
435 err = ubi_io_read_data(ubi, buf, pnum, offset, len);
436 if (err) {
437 if (err == UBI_IO_BITFLIPS) {
438 scrub = 1;
439 err = 0;
440 } else if (err == -EBADMSG) {
441 if (vol->vol_type == UBI_DYNAMIC_VOLUME)
442 goto out_unlock;
443 scrub = 1;
444 if (!check) {
445 ubi_msg("force data checking");
446 check = 1;
447 goto retry;
448 }
449 } else
450 goto out_unlock;
451 }
452
453 if (check) {
454 crc1 = crc32(UBI_CRC32_INIT, buf, len);
455 if (crc1 != crc) {
456 ubi_warn("CRC error: calculated %#08x, must be %#08x",
457 crc1, crc);
458 err = -EBADMSG;
459 goto out_unlock;
460 }
461 }
462
463 if (scrub)
464 err = ubi_wl_scrub_peb(ubi, pnum);
465
466 leb_read_unlock(ubi, vol_id, lnum);
467 return err;
468
469out_free:
470 ubi_free_vid_hdr(ubi, vid_hdr);
471out_unlock:
472 leb_read_unlock(ubi, vol_id, lnum);
473 return err;
474}
475
476/**
477 * recover_peb - recover from write failure.
478 * @ubi: UBI device description object
479 * @pnum: the physical eraseblock to recover
480 * @vol_id: volume ID
481 * @lnum: logical eraseblock number
482 * @buf: data which was not written because of the write failure
483 * @offset: offset of the failed write
484 * @len: how many bytes should have been written
485 *
486 * This function is called in case of a write failure and moves all good data
487 * from the potentially bad physical eraseblock to a good physical eraseblock.
488 * This function also writes the data which was not written due to the failure.
489 * Returns new physical eraseblock number in case of success, and a negative
490 * error code in case of failure.
491 */
492static int recover_peb(struct ubi_device *ubi, int pnum, int vol_id, int lnum,
493 const void *buf, int offset, int len)
494{
495 int err, idx = vol_id2idx(ubi, vol_id), new_pnum, data_size, tries = 0;
496 struct ubi_volume *vol = ubi->volumes[idx];
497 struct ubi_vid_hdr *vid_hdr;
498 unsigned char *new_buf;
499
500 vid_hdr = ubi_zalloc_vid_hdr(ubi);
501 if (!vid_hdr) {
502 return -ENOMEM;
503 }
504
505retry:
506 new_pnum = ubi_wl_get_peb(ubi, UBI_UNKNOWN);
507 if (new_pnum < 0) {
508 ubi_free_vid_hdr(ubi, vid_hdr);
509 return new_pnum;
510 }
511
512 ubi_msg("recover PEB %d, move data to PEB %d", pnum, new_pnum);
513
514 err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 1);
515 if (err && err != UBI_IO_BITFLIPS) {
516 if (err > 0)
517 err = -EIO;
518 goto out_put;
519 }
520
521 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
522 err = ubi_io_write_vid_hdr(ubi, new_pnum, vid_hdr);
523 if (err)
524 goto write_error;
525
526 data_size = offset + len;
527 new_buf = kmalloc(data_size, GFP_KERNEL);
528 if (!new_buf) {
529 err = -ENOMEM;
530 goto out_put;
531 }
532 memset(new_buf + offset, 0xFF, len);
533
534 /* Read everything before the area where the write failure happened */
535 if (offset > 0) {
536 err = ubi_io_read_data(ubi, new_buf, pnum, 0, offset);
537 if (err && err != UBI_IO_BITFLIPS) {
538 kfree(new_buf);
539 goto out_put;
540 }
541 }
542
543 memcpy(new_buf + offset, buf, len);
544
545 err = ubi_io_write_data(ubi, new_buf, new_pnum, 0, data_size);
546 if (err) {
547 kfree(new_buf);
548 goto write_error;
549 }
550
551 kfree(new_buf);
552 ubi_free_vid_hdr(ubi, vid_hdr);
553
554 vol->eba_tbl[lnum] = new_pnum;
555 ubi_wl_put_peb(ubi, pnum, 1);
556
557 ubi_msg("data was successfully recovered");
558 return 0;
559
560out_put:
561 ubi_wl_put_peb(ubi, new_pnum, 1);
562 ubi_free_vid_hdr(ubi, vid_hdr);
563 return err;
564
565write_error:
566 /*
567 * Bad luck? This physical eraseblock is bad too? Crud. Let's try to
568 * get another one.
569 */
570 ubi_warn("failed to write to PEB %d", new_pnum);
571 ubi_wl_put_peb(ubi, new_pnum, 1);
572 if (++tries > UBI_IO_RETRIES) {
573 ubi_free_vid_hdr(ubi, vid_hdr);
574 return err;
575 }
576 ubi_msg("try again");
577 goto retry;
578}
579
580/**
581 * ubi_eba_write_leb - write data to dynamic volume.
582 * @ubi: UBI device description object
583 * @vol_id: volume ID
584 * @lnum: logical eraseblock number
585 * @buf: the data to write
586 * @offset: offset within the logical eraseblock where to write
587 * @len: how many bytes to write
588 * @dtype: data type
589 *
590 * This function writes data to logical eraseblock @lnum of a dynamic volume
591 * @vol_id. Returns zero in case of success and a negative error code in case
592 * of failure. In case of error, it is possible that something was still
593 * written to the flash media, but it may be garbage.
594 */
595int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum,
596 const void *buf, int offset, int len, int dtype)
597{
598 int idx = vol_id2idx(ubi, vol_id), err, pnum, tries = 0;
599 struct ubi_volume *vol = ubi->volumes[idx];
600 struct ubi_vid_hdr *vid_hdr;
601
602 if (ubi->ro_mode)
603 return -EROFS;
604
605 err = leb_write_lock(ubi, vol_id, lnum);
606 if (err)
607 return err;
608
609 pnum = vol->eba_tbl[lnum];
610 if (pnum >= 0) {
611 dbg_eba("write %d bytes at offset %d of LEB %d:%d, PEB %d",
612 len, offset, vol_id, lnum, pnum);
613
614 err = ubi_io_write_data(ubi, buf, pnum, offset, len);
615 if (err) {
616 ubi_warn("failed to write data to PEB %d", pnum);
617 if (err == -EIO && ubi->bad_allowed)
618 err = recover_peb(ubi, pnum, vol_id, lnum, buf, offset, len);
619 if (err)
620 ubi_ro_mode(ubi);
621 }
622 leb_write_unlock(ubi, vol_id, lnum);
623 return err;
624 }
625
626 /*
627 * The logical eraseblock is not mapped. We have to get a free physical
628 * eraseblock and write the volume identifier header there first.
629 */
630 vid_hdr = ubi_zalloc_vid_hdr(ubi);
631 if (!vid_hdr) {
632 leb_write_unlock(ubi, vol_id, lnum);
633 return -ENOMEM;
634 }
635
636 vid_hdr->vol_type = UBI_VID_DYNAMIC;
637 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
638 vid_hdr->vol_id = cpu_to_ubi32(vol_id);
639 vid_hdr->lnum = cpu_to_ubi32(lnum);
640 vid_hdr->compat = ubi_get_compat(ubi, vol_id);
641 vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad);
642
643retry:
644 pnum = ubi_wl_get_peb(ubi, dtype);
645 if (pnum < 0) {
646 ubi_free_vid_hdr(ubi, vid_hdr);
647 leb_write_unlock(ubi, vol_id, lnum);
648 return pnum;
649 }
650
651 dbg_eba("write VID hdr and %d bytes at offset %d of LEB %d:%d, PEB %d",
652 len, offset, vol_id, lnum, pnum);
653
654 err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);
655 if (err) {
656 ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",
657 vol_id, lnum, pnum);
658 goto write_error;
659 }
660
661 err = ubi_io_write_data(ubi, buf, pnum, offset, len);
662 if (err) {
663 ubi_warn("failed to write %d bytes at offset %d of LEB %d:%d, "
664 "PEB %d", len, offset, vol_id, lnum, pnum);
665 goto write_error;
666 }
667
668 vol->eba_tbl[lnum] = pnum;
669
670 leb_write_unlock(ubi, vol_id, lnum);
671 ubi_free_vid_hdr(ubi, vid_hdr);
672 return 0;
673
674write_error:
675 if (err != -EIO || !ubi->bad_allowed) {
676 ubi_ro_mode(ubi);
677 leb_write_unlock(ubi, vol_id, lnum);
678 ubi_free_vid_hdr(ubi, vid_hdr);
679 return err;
680 }
681
682 /*
683 * Fortunately, this is the first write operation to this physical
684 * eraseblock, so just put it and request a new one. We assume that if
685 * this physical eraseblock went bad, the erase code will handle that.
686 */
687 err = ubi_wl_put_peb(ubi, pnum, 1);
688 if (err || ++tries > UBI_IO_RETRIES) {
689 ubi_ro_mode(ubi);
690 leb_write_unlock(ubi, vol_id, lnum);
691 ubi_free_vid_hdr(ubi, vid_hdr);
692 return err;
693 }
694
695 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
696 ubi_msg("try another PEB");
697 goto retry;
698}
699
700/**
701 * ubi_eba_write_leb_st - write data to static volume.
702 * @ubi: UBI device description object
703 * @vol_id: volume ID
704 * @lnum: logical eraseblock number
705 * @buf: data to write
706 * @len: how many bytes to write
707 * @dtype: data type
708 * @used_ebs: how many logical eraseblocks will this volume contain
709 *
710 * This function writes data to logical eraseblock @lnum of static volume
711 * @vol_id. The @used_ebs argument should contain the total number of logical
712 * eraseblocks in this static volume.
713 *
714 * When writing to the last logical eraseblock, the @len argument doesn't have
715 * to be aligned to the minimal I/O unit size. Instead, it has to be equal to
716 * the real data size, although the @buf buffer still has to be padded out to
717 * the aligned length. In all other cases, @len has to be aligned.
718 *
719 * It is prohibited to write more than once to logical eraseblocks of static
720 * volumes. This function returns zero in case of success and a negative error
721 * code in case of failure.
722 */
723int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum,
724 const void *buf, int len, int dtype, int used_ebs)
725{
726 int err, pnum, tries = 0, data_size = len;
727 int idx = vol_id2idx(ubi, vol_id);
728 struct ubi_volume *vol = ubi->volumes[idx];
729 struct ubi_vid_hdr *vid_hdr;
730 uint32_t crc;
731
732 if (ubi->ro_mode)
733 return -EROFS;
734
735 if (lnum == used_ebs - 1)
736 /* If this is the last LEB @len may be unaligned */
737 len = ALIGN(data_size, ubi->min_io_size);
738 else
739 ubi_assert(len % ubi->min_io_size == 0);
740
741 vid_hdr = ubi_zalloc_vid_hdr(ubi);
742 if (!vid_hdr)
743 return -ENOMEM;
744
745 err = leb_write_lock(ubi, vol_id, lnum);
746 if (err) {
747 ubi_free_vid_hdr(ubi, vid_hdr);
748 return err;
749 }
750
751 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
752 vid_hdr->vol_id = cpu_to_ubi32(vol_id);
753 vid_hdr->lnum = cpu_to_ubi32(lnum);
754 vid_hdr->compat = ubi_get_compat(ubi, vol_id);
755 vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad);
756
757 crc = crc32(UBI_CRC32_INIT, buf, data_size);
758 vid_hdr->vol_type = UBI_VID_STATIC;
759 vid_hdr->data_size = cpu_to_ubi32(data_size);
760 vid_hdr->used_ebs = cpu_to_ubi32(used_ebs);
761 vid_hdr->data_crc = cpu_to_ubi32(crc);
762
763retry:
764 pnum = ubi_wl_get_peb(ubi, dtype);
765 if (pnum < 0) {
766 ubi_free_vid_hdr(ubi, vid_hdr);
767 leb_write_unlock(ubi, vol_id, lnum);
768 return pnum;
769 }
770
771 dbg_eba("write VID hdr and %d bytes at LEB %d:%d, PEB %d, used_ebs %d",
772 len, vol_id, lnum, pnum, used_ebs);
773
774 err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);
775 if (err) {
776 ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",
777 vol_id, lnum, pnum);
778 goto write_error;
779 }
780
781 err = ubi_io_write_data(ubi, buf, pnum, 0, len);
782 if (err) {
783 ubi_warn("failed to write %d bytes of data to PEB %d",
784 len, pnum);
785 goto write_error;
786 }
787
788 ubi_assert(vol->eba_tbl[lnum] < 0);
789 vol->eba_tbl[lnum] = pnum;
790
791 leb_write_unlock(ubi, vol_id, lnum);
792 ubi_free_vid_hdr(ubi, vid_hdr);
793 return 0;
794
795write_error:
796 if (err != -EIO || !ubi->bad_allowed) {
797 /*
798 * This flash device does not admit of bad eraseblocks, or
799 * something nasty and unexpected happened. Switch to read-only
800 * mode just in case.
801 */
802 ubi_ro_mode(ubi);
803 leb_write_unlock(ubi, vol_id, lnum);
804 ubi_free_vid_hdr(ubi, vid_hdr);
805 return err;
806 }
807
808 err = ubi_wl_put_peb(ubi, pnum, 1);
809 if (err || ++tries > UBI_IO_RETRIES) {
810 ubi_ro_mode(ubi);
811 leb_write_unlock(ubi, vol_id, lnum);
812 ubi_free_vid_hdr(ubi, vid_hdr);
813 return err;
814 }
815
816 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
817 ubi_msg("try another PEB");
818 goto retry;
819}
820
821/**
822 * ubi_eba_atomic_leb_change - change logical eraseblock atomically.
823 * @ubi: UBI device description object
824 * @vol_id: volume ID
825 * @lnum: logical eraseblock number
826 * @buf: data to write
827 * @len: how many bytes to write
828 * @dtype: data type
829 *
830 * This function changes the contents of a logical eraseblock atomically. @buf
831 * has to contain new logical eraseblock data, and @len - the length of the
832 * data, which has to be aligned. This function guarantees that in case of an
833 * unclean reboot the old contents are preserved. Returns zero in case of
834 * success and a negative error code in case of failure.
835 */
836int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum,
837 const void *buf, int len, int dtype)
838{
839 int err, pnum, tries = 0, idx = vol_id2idx(ubi, vol_id);
840 struct ubi_volume *vol = ubi->volumes[idx];
841 struct ubi_vid_hdr *vid_hdr;
842 uint32_t crc;
843
844 if (ubi->ro_mode)
845 return -EROFS;
846
847 vid_hdr = ubi_zalloc_vid_hdr(ubi);
848 if (!vid_hdr)
849 return -ENOMEM;
850
851 err = leb_write_lock(ubi, vol_id, lnum);
852 if (err) {
853 ubi_free_vid_hdr(ubi, vid_hdr);
854 return err;
855 }
856
857 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
858 vid_hdr->vol_id = cpu_to_ubi32(vol_id);
859 vid_hdr->lnum = cpu_to_ubi32(lnum);
860 vid_hdr->compat = ubi_get_compat(ubi, vol_id);
861 vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad);
862
863 crc = crc32(UBI_CRC32_INIT, buf, len);
864 vid_hdr->vol_type = UBI_VID_STATIC;
865 vid_hdr->data_size = cpu_to_ubi32(len);
866 vid_hdr->copy_flag = 1;
867 vid_hdr->data_crc = cpu_to_ubi32(crc);
868
869retry:
870 pnum = ubi_wl_get_peb(ubi, dtype);
871 if (pnum < 0) {
872 ubi_free_vid_hdr(ubi, vid_hdr);
873 leb_write_unlock(ubi, vol_id, lnum);
874 return pnum;
875 }
876
877 dbg_eba("change LEB %d:%d, PEB %d, write VID hdr to PEB %d",
878 vol_id, lnum, vol->eba_tbl[lnum], pnum);
879
880 err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);
881 if (err) {
882 ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",
883 vol_id, lnum, pnum);
884 goto write_error;
885 }
886
887 err = ubi_io_write_data(ubi, buf, pnum, 0, len);
888 if (err) {
889 ubi_warn("failed to write %d bytes of data to PEB %d",
890 len, pnum);
891 goto write_error;
892 }
893
894 err = ubi_wl_put_peb(ubi, vol->eba_tbl[lnum], 1);
895 if (err) {
896 ubi_free_vid_hdr(ubi, vid_hdr);
897 leb_write_unlock(ubi, vol_id, lnum);
898 return err;
899 }
900
901 vol->eba_tbl[lnum] = pnum;
902 leb_write_unlock(ubi, vol_id, lnum);
903 ubi_free_vid_hdr(ubi, vid_hdr);
904 return 0;
905
906write_error:
907 if (err != -EIO || !ubi->bad_allowed) {
908 /*
909 * This flash device does not admit of bad eraseblocks, or
910 * something nasty and unexpected happened. Switch to read-only
911 * mode just in case.
912 */
913 ubi_ro_mode(ubi);
914 leb_write_unlock(ubi, vol_id, lnum);
915 ubi_free_vid_hdr(ubi, vid_hdr);
916 return err;
917 }
918
919 err = ubi_wl_put_peb(ubi, pnum, 1);
920 if (err || ++tries > UBI_IO_RETRIES) {
921 ubi_ro_mode(ubi);
922 leb_write_unlock(ubi, vol_id, lnum);
923 ubi_free_vid_hdr(ubi, vid_hdr);
924 return err;
925 }
926
927 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
928 ubi_msg("try another PEB");
929 goto retry;
930}
931
932/**
933 * ltree_entry_ctor - lock tree entries slab cache constructor.
934 * @obj: the lock-tree entry to construct
935 * @cache: the lock tree entry slab cache
936 * @flags: constructor flags
937 */
938static void ltree_entry_ctor(void *obj, struct kmem_cache *cache,
939 unsigned long flags)
940{
941 struct ltree_entry *le = obj;
942
943 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) !=
944 SLAB_CTOR_CONSTRUCTOR)
945 return;
946
947 le->users = 0;
948 init_rwsem(&le->mutex);
949}
950
951/**
952 * ubi_eba_copy_leb - copy logical eraseblock.
953 * @ubi: UBI device description object
954 * @from: physical eraseblock number from where to copy
955 * @to: physical eraseblock number where to copy
956 * @vid_hdr: VID header of the @from physical eraseblock
957 *
958 * This function copies logical eraseblock from physical eraseblock @from to
959 * physical eraseblock @to. The @vid_hdr buffer may be changed by this
960 * function. Returns zero in case of success, %UBI_IO_BITFLIPS if the operation
961 * was canceled because bit-flips were detected at the target PEB, and a
962 * negative error code in case of failure.
963 */
964int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
965 struct ubi_vid_hdr *vid_hdr)
966{
967 int err, vol_id, lnum, data_size, aldata_size, pnum, idx;
968 struct ubi_volume *vol;
969 uint32_t crc;
970 void *buf, *buf1 = NULL;
971
972 vol_id = ubi32_to_cpu(vid_hdr->vol_id);
973 lnum = ubi32_to_cpu(vid_hdr->lnum);
974
975 dbg_eba("copy LEB %d:%d, PEB %d to PEB %d", vol_id, lnum, from, to);
976
977 if (vid_hdr->vol_type == UBI_VID_STATIC) {
978 data_size = ubi32_to_cpu(vid_hdr->data_size);
979 aldata_size = ALIGN(data_size, ubi->min_io_size);
980 } else
981 data_size = aldata_size =
982 ubi->leb_size - ubi32_to_cpu(vid_hdr->data_pad);
983
984 buf = kmalloc(aldata_size, GFP_KERNEL);
985 if (!buf)
986 return -ENOMEM;
987
988 /*
989 * We do not want anybody to write to this logical eraseblock while we
990 * are moving it, so we lock it.
991 */
992 err = leb_write_lock(ubi, vol_id, lnum);
993 if (err) {
994 kfree(buf);
995 return err;
996 }
997
998 /*
999 * But the logical eraseblock might have been put by this time.
1000 * If so, cancel the operation.
1001 */
1002 idx = vol_id2idx(ubi, vol_id);
1003
1004 /*
1005 * We may race with volume deletion/re-size, so we have to hold
1006 * @ubi->volumes_lock.
1007 */
1008 spin_lock(&ubi->volumes_lock);
1009 vol = ubi->volumes[idx];
1010 if (!vol) {
1011 dbg_eba("volume %d was removed meanwhile", vol_id);
1012 spin_unlock(&ubi->volumes_lock);
1013 goto out_unlock;
1014 }
1015
1016 pnum = vol->eba_tbl[lnum];
1017 if (pnum != from) {
1018 dbg_eba("LEB %d:%d is no longer mapped to PEB %d, mapped to "
1019 "PEB %d, cancel", vol_id, lnum, from, pnum);
1020 spin_unlock(&ubi->volumes_lock);
1021 goto out_unlock;
1022 }
1023 spin_unlock(&ubi->volumes_lock);
1024
1025 /* OK, now the LEB is locked and we can safely start moving it */
1026
1027 dbg_eba("read %d bytes of data", aldata_size);
1028 err = ubi_io_read_data(ubi, buf, from, 0, aldata_size);
1029 if (err && err != UBI_IO_BITFLIPS) {
1030 ubi_warn("error %d while reading data from PEB %d",
1031 err, from);
1032 goto out_unlock;
1033 }
1034
1035 /*
1036 * Now we have to calculate how much data we have to copy. In case of
1037 * a static volume it is fairly easy: the VID header contains the data
1038 * size. In case of a dynamic volume it is more difficult: we have to
1039 * read the contents, cut the trailing 0xFF bytes, and copy only the
1040 * first part. We must do this to avoid writing 0xFF bytes, as that
1041 * may have side-effects. Moreover, it is important not to include
1042 * those 0xFFs in the CRC because they may later be overwritten
1043 * by data.
1044 */
1045 if (vid_hdr->vol_type == UBI_VID_DYNAMIC)
1046 aldata_size = data_size =
1047 ubi_calc_data_len(ubi, buf, data_size);
1048
1049 cond_resched();
1050 crc = crc32(UBI_CRC32_INIT, buf, data_size);
1051 cond_resched();
1052
1053 /*
1054 * It may turn out to be that the whole @from physical eraseblock
1055 * contains only 0xFF bytes. Then we only have to write the VID header
1056 * and no data at all. This also means we should not set
1057 * @vid_hdr->copy_flag, @vid_hdr->data_size, and @vid_hdr->data_crc.
1058 */
1059 if (data_size > 0) {
1060 vid_hdr->copy_flag = 1;
1061 vid_hdr->data_size = cpu_to_ubi32(data_size);
1062 vid_hdr->data_crc = cpu_to_ubi32(crc);
1063 }
1064 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
1065
1066 err = ubi_io_write_vid_hdr(ubi, to, vid_hdr);
1067 if (err)
1068 goto out_unlock;
1069
1070 cond_resched();
1071
1072 /* Read the VID header back and check if it was written correctly */
1073 err = ubi_io_read_vid_hdr(ubi, to, vid_hdr, 1);
1074 if (err) {
1075 if (err != UBI_IO_BITFLIPS)
1076 ubi_warn("cannot read VID header back from PEB %d", to);
1077 goto out_unlock;
1078 }
1079
1080 if (data_size > 0) {
1081 err = ubi_io_write_data(ubi, buf, to, 0, aldata_size);
1082 if (err)
1083 goto out_unlock;
1084
1085 /*
1086 * We've written the data and are going to read it back to make
1087 * sure it was written correctly.
1088 */
1089 buf1 = kmalloc(aldata_size, GFP_KERNEL);
1090 if (!buf1) {
1091 err = -ENOMEM;
1092 goto out_unlock;
1093 }
1094
1095 cond_resched();
1096
1097 err = ubi_io_read_data(ubi, buf1, to, 0, aldata_size);
1098 if (err) {
1099 if (err != UBI_IO_BITFLIPS)
1100 ubi_warn("cannot read data back from PEB %d",
1101 to);
1102 goto out_unlock;
1103 }
1104
1105 cond_resched();
1106
1107 if (memcmp(buf, buf1, aldata_size)) {
1108 ubi_warn("read data back from PEB %d - it is different", to);
1109 err = -EINVAL;
1110 goto out_unlock;
1111 }
1112 }
1113
1114 ubi_assert(vol->eba_tbl[lnum] == from);
1115 vol->eba_tbl[lnum] = to;
1116
1117 leb_write_unlock(ubi, vol_id, lnum);
1118 kfree(buf);
1119 kfree(buf1);
1120
1121 return 0;
1122
1123out_unlock:
1124 leb_write_unlock(ubi, vol_id, lnum);
1125 kfree(buf);
1126 kfree(buf1);
1127 return err;
1128}
1129
1130/**
1131 * ubi_eba_init_scan - initialize the EBA unit using scanning information.
1132 * @ubi: UBI device description object
1133 * @si: scanning information
1134 *
1135 * This function returns zero in case of success and a negative error code in
1136 * case of failure.
1137 */
1138int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
1139{
1140 int i, j, err, num_volumes;
1141 struct ubi_scan_volume *sv;
1142 struct ubi_volume *vol;
1143 struct ubi_scan_leb *seb;
1144 struct rb_node *rb;
1145
1146 dbg_eba("initialize EBA unit");
1147
1148 spin_lock_init(&ubi->ltree_lock);
1149 ubi->ltree = RB_ROOT;
1150
1151 if (ubi_devices_cnt == 0) {
1152 ltree_slab = kmem_cache_create("ubi_ltree_slab",
1153 sizeof(struct ltree_entry), 0,
1154 0, &ltree_entry_ctor, NULL);
1155 if (!ltree_slab)
1156 return -ENOMEM;
1157 }
1158
1159 ubi->global_sqnum = si->max_sqnum + 1;
1160 num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT;
1161
1162 for (i = 0; i < num_volumes; i++) {
1163 vol = ubi->volumes[i];
1164 if (!vol)
1165 continue;
1166
1167 cond_resched();
1168
1169 vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int),
1170 GFP_KERNEL);
1171 if (!vol->eba_tbl) {
1172 err = -ENOMEM;
1173 goto out_free;
1174 }
1175
1176 for (j = 0; j < vol->reserved_pebs; j++)
1177 vol->eba_tbl[j] = UBI_LEB_UNMAPPED;
1178
1179 sv = ubi_scan_find_sv(si, idx2vol_id(ubi, i));
1180 if (!sv)
1181 continue;
1182
1183 ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) {
1184 if (seb->lnum >= vol->reserved_pebs)
1185 /*
1186 * This may happen in case of an unclean reboot
1187 * during re-size.
1188 */
1189 ubi_scan_move_to_list(sv, seb, &si->erase);
1190 vol->eba_tbl[seb->lnum] = seb->pnum;
1191 }
1192 }
1193
1194 if (ubi->bad_allowed) {
1195 ubi_calculate_reserved(ubi);
1196
1197 if (ubi->avail_pebs < ubi->beb_rsvd_level) {
1198 /* Not enough free physical eraseblocks */
1199 ubi->beb_rsvd_pebs = ubi->avail_pebs;
1200 ubi_warn("cannot reserve enough PEBs for bad PEB "
1201 "handling, reserved %d, need %d",
1202 ubi->beb_rsvd_pebs, ubi->beb_rsvd_level);
1203 } else
1204 ubi->beb_rsvd_pebs = ubi->beb_rsvd_level;
1205
1206 ubi->avail_pebs -= ubi->beb_rsvd_pebs;
1207 ubi->rsvd_pebs += ubi->beb_rsvd_pebs;
1208 }
1209
1210 dbg_eba("EBA unit is initialized");
1211 return 0;
1212
1213out_free:
1214 for (i = 0; i < num_volumes; i++) {
1215 if (!ubi->volumes[i])
1216 continue;
1217 kfree(ubi->volumes[i]->eba_tbl);
1218 }
1219 if (ubi_devices_cnt == 0)
1220 kmem_cache_destroy(ltree_slab);
1221 return err;
1222}
1223
1224/**
1225 * ubi_eba_close - close EBA unit.
1226 * @ubi: UBI device description object
1227 */
1228void ubi_eba_close(const struct ubi_device *ubi)
1229{
1230 int i, num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT;
1231
1232 dbg_eba("close EBA unit");
1233
1234 for (i = 0; i < num_volumes; i++) {
1235 if (!ubi->volumes[i])
1236 continue;
1237 kfree(ubi->volumes[i]->eba_tbl);
1238 }
1239 if (ubi_devices_cnt == 1)
1240 kmem_cache_destroy(ltree_slab);
1241}
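
The lock tree above keys its entries by (vol_id, lnum) pairs: both ltree_lookup() and ltree_add_entry() walk the RB-tree comparing volume IDs first and falling back to LEB numbers on a tie. The standalone sketch below spells out that two-level ordering; ltree_key_cmp() is a name invented for the illustration, as the kernel code open-codes the comparison inside the tree walks:

    #include <stdio.h>

    /* Two-level ordering over (vol_id, lnum), as used by the lock tree. */
    static int ltree_key_cmp(int vol_id_a, int lnum_a, int vol_id_b, int lnum_b)
    {
            if (vol_id_a != vol_id_b)
                    return vol_id_a < vol_id_b ? -1 : 1;
            if (lnum_a != lnum_b)
                    return lnum_a < lnum_b ? -1 : 1;
            return 0;       /* the same logical eraseblock */
    }

    int main(void)
    {
            printf("%d\n", ltree_key_cmp(1, 5, 1, 7));  /* -1: left subtree */
            printf("%d\n", ltree_key_cmp(2, 0, 1, 9));  /*  1: right subtree */
            printf("%d\n", ltree_key_cmp(3, 3, 3, 3));  /*  0: entry found */
            return 0;
    }
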
diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c
new file mode 100644
index 000000000000..fc9478d605ff
--- /dev/null
+++ b/drivers/mtd/ubi/gluebi.c
@@ -0,0 +1,323 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём), Joern Engel
19 */
20
21/*
22 * This file includes implementation of fake MTD devices for each UBI volume.
23 * This sounds strange, but it is in fact quite useful to make MTD-oriented
24 * software (including all the legacy software) work on top of UBI.
25 *
26 * Gluebi emulates MTD devices of "MTD_UBIVOLUME" type. Their minimal I/O unit
27 * size (mtd->writesize) is equivalent to the UBI minimal I/O unit. The
28 * eraseblock size is equivalent to the logical eraseblock size of the volume.
29 */
30
31#include <asm/div64.h>
32#include "ubi.h"
33
34/**
35 * gluebi_get_device - get MTD device reference.
36 * @mtd: the MTD device description object
37 *
38 * This function is called every time the MTD device is being opened and
39 * implements the MTD get_device() operation. Returns zero in case of success
40 * and a negative error code in case of failure.
41 */
42static int gluebi_get_device(struct mtd_info *mtd)
43{
44 struct ubi_volume *vol;
45
46 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
47
48 /*
49 * We do not introduce locks for gluebi reference count because the
50 * get_device()/put_device() calls are already serialized at MTD.
51 */
52 if (vol->gluebi_refcount > 0) {
53 /*
54 * The MTD device is already referenced and this is just one
55 * more reference. MTD allows many users to open the same
56 * volume simultaneously and does not distinguish between
57 * readers/writers/exclusive openers as UBI does. So we do not
58 * open the UBI volume again - just increase the reference
59 * counter and return.
60 */
61 vol->gluebi_refcount += 1;
62 return 0;
63 }
64
65 /*
66 * This is the first reference to this UBI volume via the MTD device
67 * interface. Open the corresponding volume in read-write mode.
68 */
69 vol->gluebi_desc = ubi_open_volume(vol->ubi->ubi_num, vol->vol_id,
70 UBI_READWRITE);
71 if (IS_ERR(vol->gluebi_desc))
72 return PTR_ERR(vol->gluebi_desc);
73 vol->gluebi_refcount += 1;
74 return 0;
75}
76
77/**
78 * gluebi_put_device - put MTD device reference.
79 * @mtd: the MTD device description object
80 *
81 * This function is called every time the MTD device is being put. It
82 * decreases the reference count and closes the volume on the last put.
83 */
84static void gluebi_put_device(struct mtd_info *mtd)
85{
86 struct ubi_volume *vol;
87
88 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
89 vol->gluebi_refcount -= 1;
90 ubi_assert(vol->gluebi_refcount >= 0);
91 if (vol->gluebi_refcount == 0)
92 ubi_close_volume(vol->gluebi_desc);
93}
94
95/**
96 * gluebi_read - read operation of emulated MTD devices.
97 * @mtd: MTD device description object
98 * @from: absolute offset from where to read
99 * @len: how many bytes to read
100 * @retlen: count of read bytes is returned here
101 * @buf: buffer to store the read data
102 *
103 * This function returns zero in case of success and a negative error code in
104 * case of failure.
105 */
106static int gluebi_read(struct mtd_info *mtd, loff_t from, size_t len,
107 size_t *retlen, unsigned char *buf)
108{
109 int err = 0, lnum, offs, total_read;
110 struct ubi_volume *vol;
111 struct ubi_device *ubi;
112 uint64_t tmp = from;
113
114 dbg_msg("read %zd bytes from offset %lld", len, from);
115
116 if (len < 0 || from < 0 || from + len > mtd->size)
117 return -EINVAL;
118
119 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
120 ubi = vol->ubi;
121
122 offs = do_div(tmp, mtd->erasesize);
123 lnum = tmp;
124
125 total_read = len;
126 while (total_read) {
127 size_t to_read = mtd->erasesize - offs;
128
129 if (to_read > total_read)
130 to_read = total_read;
131
132 err = ubi_eba_read_leb(ubi, vol->vol_id, lnum, buf, offs,
133 to_read, 0);
134 if (err)
135 break;
136
137 lnum += 1;
138 offs = 0;
139 total_read -= to_read;
140 buf += to_read;
141 }
142
143 *retlen = len - total_read;
144 return err;
145}
146
147/**
148 * gluebi_write - write operation of emulated MTD devices.
149 * @mtd: MTD device description object
150 * @to: absolute offset where to write
151 * @len: how many bytes to write
152 * @retlen: count of written bytes is returned here
153 * @buf: buffer with data to write
154 *
155 * This function returns zero in case of success and a negative error code in
156 * case of failure.
157 */
158static int gluebi_write(struct mtd_info *mtd, loff_t to, size_t len,
159 size_t *retlen, const u_char *buf)
160{
161 int err = 0, lnum, offs, total_written;
162 struct ubi_volume *vol;
163 struct ubi_device *ubi;
164 uint64_t tmp = to;
165
166 dbg_msg("write %zd bytes to offset %lld", len, to);
167
168 if (len < 0 || to < 0 || len + to > mtd->size)
169 return -EINVAL;
170
171 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
172 ubi = vol->ubi;
173
174 if (ubi->ro_mode)
175 return -EROFS;
176
177 offs = do_div(tmp, mtd->erasesize);
178 lnum = tmp;
179
180 if (len % mtd->writesize || offs % mtd->writesize)
181 return -EINVAL;
182
183 total_written = len;
184 while (total_written) {
185 size_t to_write = mtd->erasesize - offs;
186
187 if (to_write > total_written)
188 to_write = total_written;
189
190 err = ubi_eba_write_leb(ubi, vol->vol_id, lnum, buf, offs,
191 to_write, UBI_UNKNOWN);
192 if (err)
193 break;
194
195 lnum += 1;
196 offs = 0;
197 total_written -= to_write;
198 buf += to_write;
199 }
200
201 *retlen = len - total_written;
202 return err;
203}
204
205/**
206 * gluebi_erase - erase operation of emulated MTD devices.
207 * @mtd: the MTD device description object
208 * @instr: the erase operation description
209 *
210 * This function calls the erase callback when it finishes. Returns zero in case
211 * of success and a negative error code in case of failure.
212 */
213static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr)
214{
215 int err, i, lnum, count;
216 struct ubi_volume *vol;
217 struct ubi_device *ubi;
218
219 dbg_msg("erase %u bytes at offset %u", instr->len, instr->addr);
220
221 if (instr->addr < 0 || instr->addr > mtd->size - mtd->erasesize)
222 return -EINVAL;
223
224 if (instr->len < 0 || instr->addr + instr->len > mtd->size)
225 return -EINVAL;
226
227 if (instr->addr % mtd->writesize || instr->len % mtd->writesize)
228 return -EINVAL;
229
230 lnum = instr->addr / mtd->erasesize;
231 count = instr->len / mtd->erasesize;
232
233 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
234 ubi = vol->ubi;
235
236 if (ubi->ro_mode)
237 return -EROFS;
238
239 for (i = 0; i < count; i++) {
240 err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum + i);
241 if (err)
242 goto out_err;
243 }
244
245 /*
246 * MTD erase operations are synchronous, so we have to make sure the
247 * physical eraseblock is wiped out.
248 */
249 err = ubi_wl_flush(ubi);
250 if (err)
251 goto out_err;
252
253 instr->state = MTD_ERASE_DONE;
254 mtd_erase_callback(instr);
255 return 0;
256
257out_err:
258 instr->state = MTD_ERASE_FAILED;
259 instr->fail_addr = lnum * mtd->erasesize;
260 return err;
261}
262
263/**
264 * ubi_create_gluebi - initialize gluebi for an UBI volume.
265 * @ubi: UBI device description object
266 * @vol: volume description object
267 *
268 * This function is called when an UBI volume is created in order to create
269 * the corresponding fake MTD device. Returns zero in case of success and a
270 * negative error code in case of failure.
271 */
272int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol)
273{
274 struct mtd_info *mtd = &vol->gluebi_mtd;
275
276 mtd->name = kmemdup(vol->name, vol->name_len + 1, GFP_KERNEL);
277 if (!mtd->name)
278 return -ENOMEM;
279
280 mtd->type = MTD_UBIVOLUME;
281 if (!ubi->ro_mode)
282 mtd->flags = MTD_WRITEABLE;
283 mtd->writesize = ubi->min_io_size;
284 mtd->owner = THIS_MODULE;
285 mtd->size = vol->usable_leb_size * vol->reserved_pebs;
286 mtd->erasesize = vol->usable_leb_size;
287 mtd->read = gluebi_read;
288 mtd->write = gluebi_write;
289 mtd->erase = gluebi_erase;
290 mtd->get_device = gluebi_get_device;
291 mtd->put_device = gluebi_put_device;
292
293 if (add_mtd_device(mtd)) {
294 ubi_err("cannot not add MTD device\n");
295 kfree(mtd->name);
296 return -ENFILE;
297 }
298
299 dbg_msg("added mtd%d (\"%s\"), size %u, EB size %u",
300 mtd->index, mtd->name, mtd->size, mtd->erasesize);
301 return 0;
302}
303
304/**
305 * ubi_destroy_gluebi - close gluebi for an UBI volume.
306 * @vol: volume description object
307 *
308 * This function is called when an UBI volume is removed in order to remove
309 * corresponding fake MTD device. Returns zero in case of success and a
310 * negative error code in case of failure.
311 */
312int ubi_destroy_gluebi(struct ubi_volume *vol)
313{
314 int err;
315 struct mtd_info *mtd = &vol->gluebi_mtd;
316
317 dbg_msg("remove mtd%d", mtd->index);
318 err = del_mtd_device(mtd);
319 if (err)
320 return err;
321 kfree(mtd->name);
322 return 0;
323}
diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c
new file mode 100644
index 000000000000..438914d05151
--- /dev/null
+++ b/drivers/mtd/ubi/io.c
@@ -0,0 +1,1259 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2006, 2007
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём)
20 */
21
22/*
23 * UBI input/output unit.
24 *
25 * This unit provides a uniform way to work with all kinds of the underlying
26 * MTD devices. It also implements handy functions for reading and writing UBI
27 * headers.
28 *
29 * We are trying to have a paranoid mindset and not to trust what we read
30 * from the flash media in order to be more secure and robust. So this unit
31 * validates every single header it reads from the flash media.
32 *
33 * Some words about how the eraseblock headers are stored.
34 *
35 * The erase counter header is always stored at offset zero. By default, the
36 * VID header is stored after the EC header at the closest aligned offset
37 * (i.e. aligned to the minimum I/O unit size). Data starts next to the VID
38 * header at the closest aligned offset. But this default layout may be
39 * changed. For example, for different reasons (e.g., optimization) UBI may be
40 * asked to put the VID header at a further offset, and even at an unaligned
41 * offset. Of course, if the offset of the VID header is unaligned, UBI adds
42 * proper padding in front of it. Data offset may also be changed but it has to
43 * be aligned.
44 *
45 * About minimal I/O units. In general, UBI assumes a flash device model where
46 * there is only one minimal I/O unit size. E.g., in case of NOR flash it is 1,
47 * in case of NAND flash it is a NAND page, etc. This is reported by MTD in the
48 * @ubi->mtd->writesize field. But as an exception, UBI allows using another
49 * (smaller) minimal I/O unit size for EC and VID headers to make it possible
50 * to do different optimizations.
51 *
52 * This is extremely useful in case of NAND flashes which allow several
53 * write operations to one NAND page. In this case UBI can fit EC and VID
54 * headers in one NAND page. Thus, UBI may use "sub-page" size as the minimal
55 * I/O unit for the headers (the @ubi->hdrs_min_io_size field). But it still
56 * reports NAND page size (@ubi->min_io_size) as a minimal I/O unit for the UBI
57 * users.
58 *
59 * Example: some Samsung NANDs with 2KiB pages allow 4x 512-byte writes, so
60 * although the minimal I/O unit is 2K, UBI uses 512 bytes for EC and VID
61 * headers.
62 *
63 * Q: why not just treat the sub-page as the minimal I/O unit of this flash
64 * device, e.g., make @ubi->min_io_size = 512 in the example above?
65 *
66 * A: because when writing a sub-page, MTD still writes a full 2K page, but the
67 * bytes which are not relevant to the sub-page are 0xFF. So, basically, writing
68 * 4x512 sub-pages is 4 times slower than writing one 2KiB NAND page. Thus, we
69 * prefer to use sub-pages only for the EC and VID headers.
70 *
71 * As it was noted above, the VID header may start at a non-aligned offset.
72 * For example, in case of a 2KiB page NAND flash with a 512 bytes sub-page,
73 * the VID header may reside at offset 1984 which is the last 64 bytes of the
74 * last sub-page (EC header is always at offset zero). This causes some
75 * difficulties when reading and writing VID headers.
76 *
77 * Suppose we have a 64-byte buffer and we read a VID header into it. We change
78 * the data and want to write this VID header out. As we can only write in
79 * 512-byte chunks, we have to allocate one more buffer and copy our VID header
80 * to offset 448 of this buffer.
81 *
82 * The I/O unit does the following trick in order to avoid this extra copy.
83 * It always allocates a @ubi->vid_hdr_alsize bytes buffer for the VID header
84 * and returns a pointer to offset @ubi->vid_hdr_shift of this buffer. When the
85 * VID header is being written out, it shifts the VID header pointer back and
86 * writes the whole sub-page.
87 */
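
/*
 * Editorial sketch (not part of the driver): how the pointer-shifting trick
 * described above might look from a caller's point of view. The allocation
 * below mirrors what ubi_zalloc_vid_hdr() is assumed to do elsewhere in UBI;
 * only the @ubi fields mentioned above are relied upon:
 *
 *	void *p = kzalloc(ubi->vid_hdr_alsize, GFP_KERNEL);
 *	struct ubi_vid_hdr *vid_hdr;
 *
 *	if (!p)
 *		return -ENOMEM;
 *	vid_hdr = p + ubi->vid_hdr_shift;
 *	// ... fill the VID header fields ...
 *	// Shift back and write the whole sub-page - no extra copy needed
 *	err = ubi_io_write(ubi, p, pnum, ubi->vid_hdr_aloffset,
 *			   ubi->vid_hdr_alsize);
 */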
88
89#include <linux/crc32.h>
90#include <linux/err.h>
91#include "ubi.h"
92
93#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
94static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum);
95static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum);
96static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum,
97 const struct ubi_ec_hdr *ec_hdr);
98static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum);
99static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum,
100 const struct ubi_vid_hdr *vid_hdr);
101static int paranoid_check_all_ff(const struct ubi_device *ubi, int pnum,
102 int offset, int len);
103#else
104#define paranoid_check_not_bad(ubi, pnum) 0
105#define paranoid_check_peb_ec_hdr(ubi, pnum) 0
106#define paranoid_check_ec_hdr(ubi, pnum, ec_hdr) 0
107#define paranoid_check_peb_vid_hdr(ubi, pnum) 0
108#define paranoid_check_vid_hdr(ubi, pnum, vid_hdr) 0
109#define paranoid_check_all_ff(ubi, pnum, offset, len) 0
110#endif
111
112/**
113 * ubi_io_read - read data from a physical eraseblock.
114 * @ubi: UBI device description object
115 * @buf: buffer where to store the read data
116 * @pnum: physical eraseblock number to read from
117 * @offset: offset within the physical eraseblock from where to read
118 * @len: how many bytes to read
119 *
120 * This function reads data from offset @offset of physical eraseblock @pnum
121 * and stores the read data in the @buf buffer. The following return codes are
122 * possible:
123 *
124 * o %0 if all the requested data were successfully read;
125 * o %UBI_IO_BITFLIPS if all the requested data were successfully read, but
126 * correctable bit-flips were detected; this is harmless but may indicate
127 *   that this eraseblock may become bad soon (but it does not have to);
128 * o %-EBADMSG if the MTD subsystem reported a data integrity problem, for
129 *   example an ECC error in case of NAND; this most
130 * probably means that the data is corrupted;
131 * o %-EIO if some I/O error occurred;
132 * o other negative error codes in case of other errors.
133 */
134int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
135 int len)
136{
137 int err, retries = 0;
138 size_t read;
139 loff_t addr;
140
141 dbg_io("read %d bytes from PEB %d:%d", len, pnum, offset);
142
143 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
144 ubi_assert(offset >= 0 && offset + len <= ubi->peb_size);
145 ubi_assert(len > 0);
146
147 err = paranoid_check_not_bad(ubi, pnum);
148 if (err)
149 return err > 0 ? -EINVAL : err;
150
151 addr = (loff_t)pnum * ubi->peb_size + offset;
152retry:
153 err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf);
154 if (err) {
155 if (err == -EUCLEAN) {
156 /*
157 * -EUCLEAN is reported if there was a bit-flip which
158 * was corrected, so this is harmless.
159 */
160 ubi_msg("fixable bit-flip detected at PEB %d", pnum);
161 ubi_assert(len == read);
162 return UBI_IO_BITFLIPS;
163 }
164
165 if (read != len && retries++ < UBI_IO_RETRIES) {
166 dbg_io("error %d while reading %d bytes from PEB %d:%d, "
167 "read only %zd bytes, retry",
168 err, len, pnum, offset, read);
169 yield();
170 goto retry;
171 }
172
173 ubi_err("error %d while reading %d bytes from PEB %d:%d, "
174 "read %zd bytes", err, len, pnum, offset, read);
175 ubi_dbg_dump_stack();
176 } else {
177 ubi_assert(len == read);
178
179 if (ubi_dbg_is_bitflip()) {
180 dbg_msg("bit-flip (emulated)");
181 err = UBI_IO_BITFLIPS;
182 }
183 }
184
185 return err;
186}
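
/*
 * Editorial sketch of the intended caller pattern for the return codes
 * above: %UBI_IO_BITFLIPS is a success code, so a caller would typically
 * use the data but remember that the PEB should be scrubbed (the scrubbing
 * itself lives in the wear-levelling unit and is only assumed here):
 *
 *	int scrub = 0;
 *
 *	err = ubi_io_read(ubi, buf, pnum, offset, len);
 *	if (err == UBI_IO_BITFLIPS) {
 *		scrub = 1;
 *		err = 0;
 *	}
 *	if (err)
 *		return err;
 */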
187
188/**
189 * ubi_io_write - write data to a physical eraseblock.
190 * @ubi: UBI device description object
191 * @buf: buffer with the data to write
192 * @pnum: physical eraseblock number to write to
193 * @offset: offset within the physical eraseblock where to write
194 * @len: how many bytes to write
195 *
196 * This function writes @len bytes of data from buffer @buf to offset @offset
197 * of physical eraseblock @pnum. If all the data were successfully written,
198 * zero is returned. If an error occurred, this function returns a negative
199 * error code. If %-EIO is returned, the physical eraseblock most probably went
200 * bad.
201 *
202 * Note, in case of an error, it is possible that something was still written
203 * to the flash media, but it may be some garbage.
204 */
205int ubi_io_write(const struct ubi_device *ubi, const void *buf, int pnum,
206 int offset, int len)
207{
208 int err;
209 size_t written;
210 loff_t addr;
211
212 dbg_io("write %d bytes to PEB %d:%d", len, pnum, offset);
213
214 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
215 ubi_assert(offset >= 0 && offset + len <= ubi->peb_size);
216 ubi_assert(offset % ubi->hdrs_min_io_size == 0);
217 ubi_assert(len > 0 && len % ubi->hdrs_min_io_size == 0);
218
219 if (ubi->ro_mode) {
220 ubi_err("read-only mode");
221 return -EROFS;
222 }
223
224 /* The below has to be compiled out if paranoid checks are disabled */
225
226 err = paranoid_check_not_bad(ubi, pnum);
227 if (err)
228 return err > 0 ? -EINVAL : err;
229
230 /* The area we are writing to has to contain all 0xFF bytes */
231 err = paranoid_check_all_ff(ubi, pnum, offset, len);
232 if (err)
233 return err > 0 ? -EINVAL : err;
234
235 if (offset >= ubi->leb_start) {
236 /*
237 * We write to the data area of the physical eraseblock. Make
238 * sure it has valid EC and VID headers.
239 */
240 err = paranoid_check_peb_ec_hdr(ubi, pnum);
241 if (err)
242 return err > 0 ? -EINVAL : err;
243 err = paranoid_check_peb_vid_hdr(ubi, pnum);
244 if (err)
245 return err > 0 ? -EINVAL : err;
246 }
247
248 if (ubi_dbg_is_write_failure()) {
249 dbg_err("cannot write %d bytes to PEB %d:%d "
250 "(emulated)", len, pnum, offset);
251 ubi_dbg_dump_stack();
252 return -EIO;
253 }
254
255 addr = (loff_t)pnum * ubi->peb_size + offset;
256 err = ubi->mtd->write(ubi->mtd, addr, len, &written, buf);
257 if (err) {
258 ubi_err("error %d while writing %d bytes to PEB %d:%d, written"
259 " %zd bytes", err, len, pnum, offset, written);
260 ubi_dbg_dump_stack();
261 } else
262 ubi_assert(written == len);
263
264 return err;
265}
266
267/**
268 * erase_callback - MTD erasure call-back.
269 * @ei: MTD erase information object.
270 *
271 * Note, even though the MTD erase interface is asynchronous, all the current
272 * implementations are synchronous anyway.
273 */
274static void erase_callback(struct erase_info *ei)
275{
276 wake_up_interruptible((wait_queue_head_t *)ei->priv);
277}
278
279/**
280 * do_sync_erase - synchronously erase a physical eraseblock.
281 * @ubi: UBI device description object
282 * @pnum: the physical eraseblock number to erase
283 *
284 * This function synchronously erases physical eraseblock @pnum and returns
285 * zero in case of success and a negative error code in case of failure. If
286 * %-EIO is returned, the physical eraseblock most probably went bad.
287 */
288static int do_sync_erase(const struct ubi_device *ubi, int pnum)
289{
290 int err, retries = 0;
291 struct erase_info ei;
292 wait_queue_head_t wq;
293
294 dbg_io("erase PEB %d", pnum);
295
296retry:
297 init_waitqueue_head(&wq);
298 memset(&ei, 0, sizeof(struct erase_info));
299
300 ei.mtd = ubi->mtd;
301	ei.addr = (loff_t)pnum * ubi->peb_size;
302 ei.len = ubi->peb_size;
303 ei.callback = erase_callback;
304 ei.priv = (unsigned long)&wq;
305
306 err = ubi->mtd->erase(ubi->mtd, &ei);
307 if (err) {
308 if (retries++ < UBI_IO_RETRIES) {
309 dbg_io("error %d while erasing PEB %d, retry",
310 err, pnum);
311 yield();
312 goto retry;
313 }
314 ubi_err("cannot erase PEB %d, error %d", pnum, err);
315 ubi_dbg_dump_stack();
316 return err;
317 }
318
319 err = wait_event_interruptible(wq, ei.state == MTD_ERASE_DONE ||
320 ei.state == MTD_ERASE_FAILED);
321 if (err) {
322 ubi_err("interrupted PEB %d erasure", pnum);
323 return -EINTR;
324 }
325
326 if (ei.state == MTD_ERASE_FAILED) {
327 if (retries++ < UBI_IO_RETRIES) {
328 dbg_io("error while erasing PEB %d, retry", pnum);
329 yield();
330 goto retry;
331 }
332 ubi_err("cannot erase PEB %d", pnum);
333 ubi_dbg_dump_stack();
334 return -EIO;
335 }
336
337 err = paranoid_check_all_ff(ubi, pnum, 0, ubi->peb_size);
338 if (err)
339 return err > 0 ? -EINVAL : err;
340
341 if (ubi_dbg_is_erase_failure() && !err) {
342 dbg_err("cannot erase PEB %d (emulated)", pnum);
343 return -EIO;
344 }
345
346 return 0;
347}
348
349/**
350 * check_pattern - check if buffer contains only a certain byte pattern.
351 * @buf: buffer to check
352 * @patt: the pattern to check
353 * @size: buffer size in bytes
354 *
355 * This function returns %1 if there are only @patt bytes in @buf, and %0 if
356 * something else was also found.
357 */
358static int check_pattern(const void *buf, uint8_t patt, int size)
359{
360 int i;
361
362 for (i = 0; i < size; i++)
363 if (((const uint8_t *)buf)[i] != patt)
364 return 0;
365 return 1;
366}
367
368/* Patterns to write to a physical eraseblock when torturing it */
369static uint8_t patterns[] = {0xa5, 0x5a, 0x0};
370
371/**
372 * torture_peb - test a supposedly bad physical eraseblock.
373 * @ubi: UBI device description object
374 * @pnum: the physical eraseblock number to test
375 *
376 * This function returns %-EIO if the physical eraseblock did not pass the
377 * test, a positive number of erase operations done if the test was
378 * successfully passed, and other negative error codes in case of other errors.
379 */
380static int torture_peb(const struct ubi_device *ubi, int pnum)
381{
382 void *buf;
383 int err, i, patt_count;
384
385 buf = kmalloc(ubi->peb_size, GFP_KERNEL);
386 if (!buf)
387 return -ENOMEM;
388
389 patt_count = ARRAY_SIZE(patterns);
390 ubi_assert(patt_count > 0);
391
392 for (i = 0; i < patt_count; i++) {
393 err = do_sync_erase(ubi, pnum);
394 if (err)
395 goto out;
396
397 /* Make sure the PEB contains only 0xFF bytes */
398 err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size);
399 if (err)
400 goto out;
401
402 err = check_pattern(buf, 0xFF, ubi->peb_size);
403 if (err == 0) {
404 ubi_err("erased PEB %d, but a non-0xFF byte found",
405 pnum);
406 err = -EIO;
407 goto out;
408 }
409
410 /* Write a pattern and check it */
411 memset(buf, patterns[i], ubi->peb_size);
412 err = ubi_io_write(ubi, buf, pnum, 0, ubi->peb_size);
413 if (err)
414 goto out;
415
416 memset(buf, ~patterns[i], ubi->peb_size);
417 err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size);
418 if (err)
419 goto out;
420
421 err = check_pattern(buf, patterns[i], ubi->peb_size);
422 if (err == 0) {
423 ubi_err("pattern %x checking failed for PEB %d",
424 patterns[i], pnum);
425 err = -EIO;
426 goto out;
427 }
428 }
429
430 err = patt_count;
431
432out:
433 if (err == UBI_IO_BITFLIPS || err == -EBADMSG)
434 /*
435 * If a bit-flip or data integrity error was detected, the test
436 * has not passed because it happened on a freshly erased
437 * physical eraseblock which means something is wrong with it.
438 */
439 err = -EIO;
440 kfree(buf);
441 return err;
442}
443
444/**
445 * ubi_io_sync_erase - synchronously erase a physical eraseblock.
446 * @ubi: UBI device description object
447 * @pnum: physical eraseblock number to erase
448 * @torture: if this physical eraseblock has to be tortured
449 *
450 * This function synchronously erases physical eraseblock @pnum. If @torture
451 * flag is not zero, the physical eraseblock is checked by means of writing
452 * different patterns to it and reading them back. If the torturing is enabled,
453 * the physical eraseblock is erased more than once.
454 *
455 * This function returns the number of erasures made in case of success, %-EIO
456 * if the erasure failed or the torturing test failed, and other negative error
457 * codes in case of other errors. Note, %-EIO means that the physical
458 * eraseblock is bad.
459 */
460int ubi_io_sync_erase(const struct ubi_device *ubi, int pnum, int torture)
461{
462 int err, ret = 0;
463
464 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
465
466 err = paranoid_check_not_bad(ubi, pnum);
467 if (err != 0)
468 return err > 0 ? -EINVAL : err;
469
470 if (ubi->ro_mode) {
471 ubi_err("read-only mode");
472 return -EROFS;
473 }
474
475 if (torture) {
476 ret = torture_peb(ubi, pnum);
477 if (ret < 0)
478 return ret;
479 }
480
481 err = do_sync_erase(ubi, pnum);
482 if (err)
483 return err;
484
485 return ret + 1;
486}
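
/*
 * Editorial sketch: because the return value of ubi_io_sync_erase() is the
 * number of erasures rather than plain zero, a caller which maintains erase
 * counters would add it to the current counter instead of incrementing by
 * one (@ec is hypothetical here):
 *
 *	err = ubi_io_sync_erase(ubi, pnum, torture);
 *	if (err < 0)
 *		return err;
 *	ec += err;	// torturing may have erased the PEB several times
 */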
487
488/**
489 * ubi_io_is_bad - check if a physical eraseblock is bad.
490 * @ubi: UBI device description object
491 * @pnum: the physical eraseblock number to check
492 *
493 * This function returns a positive number if the physical eraseblock is bad,
494 * zero if not, and a negative error code if an error occurred.
495 */
496int ubi_io_is_bad(const struct ubi_device *ubi, int pnum)
497{
498 struct mtd_info *mtd = ubi->mtd;
499
500 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
501
502 if (ubi->bad_allowed) {
503 int ret;
504
505 ret = mtd->block_isbad(mtd, (loff_t)pnum * ubi->peb_size);
506 if (ret < 0)
507 ubi_err("error %d while checking if PEB %d is bad",
508 ret, pnum);
509 else if (ret)
510 dbg_io("PEB %d is bad", pnum);
511 return ret;
512 }
513
514 return 0;
515}
516
517/**
518 * ubi_io_mark_bad - mark a physical eraseblock as bad.
519 * @ubi: UBI device description object
520 * @pnum: the physical eraseblock number to mark
521 *
522 * This function returns zero in case of success and a negative error code in
523 * case of failure.
524 */
525int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum)
526{
527 int err;
528 struct mtd_info *mtd = ubi->mtd;
529
530 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
531
532 if (ubi->ro_mode) {
533 ubi_err("read-only mode");
534 return -EROFS;
535 }
536
537 if (!ubi->bad_allowed)
538 return 0;
539
540 err = mtd->block_markbad(mtd, (loff_t)pnum * ubi->peb_size);
541 if (err)
542 ubi_err("cannot mark PEB %d bad, error %d", pnum, err);
543 return err;
544}
545
546/**
547 * validate_ec_hdr - validate an erase counter header.
548 * @ubi: UBI device description object
549 * @ec_hdr: the erase counter header to check
550 *
551 * This function returns zero if the erase counter header is OK, and %1 if
552 * not.
553 */
554static int validate_ec_hdr(const struct ubi_device *ubi,
555 const struct ubi_ec_hdr *ec_hdr)
556{
557 long long ec;
558 int vid_hdr_offset, leb_start;
559
560 ec = ubi64_to_cpu(ec_hdr->ec);
561 vid_hdr_offset = ubi32_to_cpu(ec_hdr->vid_hdr_offset);
562 leb_start = ubi32_to_cpu(ec_hdr->data_offset);
563
564 if (ec_hdr->version != UBI_VERSION) {
565 ubi_err("node with incompatible UBI version found: "
566 "this UBI version is %d, image version is %d",
567 UBI_VERSION, (int)ec_hdr->version);
568 goto bad;
569 }
570
571 if (vid_hdr_offset != ubi->vid_hdr_offset) {
572 ubi_err("bad VID header offset %d, expected %d",
573 vid_hdr_offset, ubi->vid_hdr_offset);
574 goto bad;
575 }
576
577 if (leb_start != ubi->leb_start) {
578 ubi_err("bad data offset %d, expected %d",
579 leb_start, ubi->leb_start);
580 goto bad;
581 }
582
583 if (ec < 0 || ec > UBI_MAX_ERASECOUNTER) {
584 ubi_err("bad erase counter %lld", ec);
585 goto bad;
586 }
587
588 return 0;
589
590bad:
591 ubi_err("bad EC header");
592 ubi_dbg_dump_ec_hdr(ec_hdr);
593 ubi_dbg_dump_stack();
594 return 1;
595}
596
597/**
598 * ubi_io_read_ec_hdr - read and check an erase counter header.
599 * @ubi: UBI device description object
600 * @pnum: physical eraseblock to read from
601 * @ec_hdr: a &struct ubi_ec_hdr object where to store the read erase counter
602 * header
603 * @verbose: be verbose if the header is corrupted or was not found
604 *
605 * This function reads erase counter header from physical eraseblock @pnum and
606 * stores it in @ec_hdr. This function also checks CRC checksum of the read
607 * erase counter header. The following codes may be returned:
608 *
609 * o %0 if the CRC checksum is correct and the header was successfully read;
610 * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected
611 * and corrected by the flash driver; this is harmless but may indicate that
612 * this eraseblock may become bad soon (but it does not have to);
613 * o %UBI_IO_BAD_EC_HDR if the erase counter header is corrupted (a CRC error);
614 * o %UBI_IO_PEB_EMPTY if the physical eraseblock is empty;
615 * o a negative error code in case of failure.
616 */
617int ubi_io_read_ec_hdr(const struct ubi_device *ubi, int pnum,
618 struct ubi_ec_hdr *ec_hdr, int verbose)
619{
620 int err, read_err = 0;
621 uint32_t crc, magic, hdr_crc;
622
623 dbg_io("read EC header from PEB %d", pnum);
624 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
625
626 err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE);
627 if (err) {
628 if (err != UBI_IO_BITFLIPS && err != -EBADMSG)
629 return err;
630
631 /*
632 * We read all the data, but either a correctable bit-flip
633		 * occurred, or MTD reported some data integrity error,
634		 * like an ECC error in case of NAND. The former is harmless,
635		 * the latter may mean that the read data is corrupted. But we
636		 * have a CRC check-sum and we will detect this. If the EC
637		 * header is still OK, we just report this as if there was a
638 * bit-flip.
639 */
640 read_err = err;
641 }
642
643 magic = ubi32_to_cpu(ec_hdr->magic);
644 if (magic != UBI_EC_HDR_MAGIC) {
645 /*
646 * The magic field is wrong. Let's check if we have read all
647 * 0xFF. If yes, this physical eraseblock is assumed to be
648 * empty.
649 *
650 * But if there was a read error, we do not test it for all
651 * 0xFFs. Even if it does contain all 0xFFs, this error
652 * indicates that something is still wrong with this physical
653 * eraseblock and we anyway cannot treat it as empty.
654 */
655 if (read_err != -EBADMSG &&
656 check_pattern(ec_hdr, 0xFF, UBI_EC_HDR_SIZE)) {
657 /* The physical eraseblock is supposedly empty */
658
659 /*
660 * The below is just a paranoid check, it has to be
661 * compiled out if paranoid checks are disabled.
662 */
663 err = paranoid_check_all_ff(ubi, pnum, 0,
664 ubi->peb_size);
665 if (err)
666 return err > 0 ? UBI_IO_BAD_EC_HDR : err;
667
668 if (verbose)
669 ubi_warn("no EC header found at PEB %d, "
670 "only 0xFF bytes", pnum);
671 return UBI_IO_PEB_EMPTY;
672 }
673
674 /*
675 * This is not a valid erase counter header, and these are not
676 * 0xFF bytes. Report that the header is corrupted.
677 */
678 if (verbose) {
679 ubi_warn("bad magic number at PEB %d: %08x instead of "
680 "%08x", pnum, magic, UBI_EC_HDR_MAGIC);
681 ubi_dbg_dump_ec_hdr(ec_hdr);
682 }
683 return UBI_IO_BAD_EC_HDR;
684 }
685
686 crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC);
687 hdr_crc = ubi32_to_cpu(ec_hdr->hdr_crc);
688
689 if (hdr_crc != crc) {
690 if (verbose) {
691 ubi_warn("bad EC header CRC at PEB %d, calculated %#08x,"
692 " read %#08x", pnum, crc, hdr_crc);
693 ubi_dbg_dump_ec_hdr(ec_hdr);
694 }
695 return UBI_IO_BAD_EC_HDR;
696 }
697
698 /* And of course validate what has just been read from the media */
699 err = validate_ec_hdr(ubi, ec_hdr);
700 if (err) {
701 ubi_err("validation failed for PEB %d", pnum);
702 return -EINVAL;
703 }
704
705 return read_err ? UBI_IO_BITFLIPS : 0;
706}
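
/*
 * Editorial sketch of how the return codes above are meant to be consumed,
 * e.g. by a scanning loop (shortened; the branch bodies are placeholders):
 *
 *	err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0);
 *	if (err < 0)
 *		return err;
 *	else if (err == UBI_IO_PEB_EMPTY)
 *		;	// never used - treat the PEB as free
 *	else if (err == UBI_IO_BAD_EC_HDR)
 *		;	// corrupted header - schedule erasure (or torture)
 *	else
 *		ec = ubi64_to_cpu(ec_hdr->ec);	// %0 or %UBI_IO_BITFLIPS
 */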
707
708/**
709 * ubi_io_write_ec_hdr - write an erase counter header.
710 * @ubi: UBI device description object
711 * @pnum: physical eraseblock to write to
712 * @ec_hdr: the erase counter header to write
713 *
714 * This function writes erase counter header described by @ec_hdr to physical
715 * eraseblock @pnum. It also fills most fields of @ec_hdr before writing, so
716 * the caller does not have to fill them. Callers must only fill the @ec_hdr->ec
717 * field.
718 *
719 * This function returns zero in case of success and a negative error code in
720 * case of failure. If %-EIO is returned, the physical eraseblock most probably
721 * went bad.
722 */
723int ubi_io_write_ec_hdr(const struct ubi_device *ubi, int pnum,
724 struct ubi_ec_hdr *ec_hdr)
725{
726 int err;
727 uint32_t crc;
728
729 dbg_io("write EC header to PEB %d", pnum);
730 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
731
732 ec_hdr->magic = cpu_to_ubi32(UBI_EC_HDR_MAGIC);
733 ec_hdr->version = UBI_VERSION;
734 ec_hdr->vid_hdr_offset = cpu_to_ubi32(ubi->vid_hdr_offset);
735 ec_hdr->data_offset = cpu_to_ubi32(ubi->leb_start);
736 crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC);
737 ec_hdr->hdr_crc = cpu_to_ubi32(crc);
738
739 err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr);
740 if (err)
741 return -EINVAL;
742
743 err = ubi_io_write(ubi, ec_hdr, pnum, 0, ubi->ec_hdr_alsize);
744 return err;
745}
746
747/**
748 * validate_vid_hdr - validate a volume identifier header.
749 * @ubi: UBI device description object
750 * @vid_hdr: the volume identifier header to check
751 *
752 * This function checks the data stored in the volume identifier header
753 * @vid_hdr. Returns zero if the VID header is OK and %1 if not.
754 */
755static int validate_vid_hdr(const struct ubi_device *ubi,
756 const struct ubi_vid_hdr *vid_hdr)
757{
758 int vol_type = vid_hdr->vol_type;
759 int copy_flag = vid_hdr->copy_flag;
760 int vol_id = ubi32_to_cpu(vid_hdr->vol_id);
761 int lnum = ubi32_to_cpu(vid_hdr->lnum);
762 int compat = vid_hdr->compat;
763 int data_size = ubi32_to_cpu(vid_hdr->data_size);
764 int used_ebs = ubi32_to_cpu(vid_hdr->used_ebs);
765 int data_pad = ubi32_to_cpu(vid_hdr->data_pad);
766 int data_crc = ubi32_to_cpu(vid_hdr->data_crc);
767 int usable_leb_size = ubi->leb_size - data_pad;
768
769 if (copy_flag != 0 && copy_flag != 1) {
770 dbg_err("bad copy_flag");
771 goto bad;
772 }
773
774 if (vol_id < 0 || lnum < 0 || data_size < 0 || used_ebs < 0 ||
775 data_pad < 0) {
776 dbg_err("negative values");
777 goto bad;
778 }
779
780 if (vol_id >= UBI_MAX_VOLUMES && vol_id < UBI_INTERNAL_VOL_START) {
781 dbg_err("bad vol_id");
782 goto bad;
783 }
784
785 if (vol_id < UBI_INTERNAL_VOL_START && compat != 0) {
786 dbg_err("bad compat");
787 goto bad;
788 }
789
790 if (vol_id >= UBI_INTERNAL_VOL_START && compat != UBI_COMPAT_DELETE &&
791 compat != UBI_COMPAT_RO && compat != UBI_COMPAT_PRESERVE &&
792 compat != UBI_COMPAT_REJECT) {
793 dbg_err("bad compat");
794 goto bad;
795 }
796
797 if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) {
798 dbg_err("bad vol_type");
799 goto bad;
800 }
801
802 if (data_pad >= ubi->leb_size / 2) {
803 dbg_err("bad data_pad");
804 goto bad;
805 }
806
807 if (vol_type == UBI_VID_STATIC) {
808 /*
809		 * Although from a high-level point of view static volumes may
810		 * contain zero bytes of data, no VID header can contain
811		 * zero in these fields, because empty volumes do not have
812 * mapped logical eraseblocks.
813 */
814 if (used_ebs == 0) {
815 dbg_err("zero used_ebs");
816 goto bad;
817 }
818 if (data_size == 0) {
819 dbg_err("zero data_size");
820 goto bad;
821 }
822 if (lnum < used_ebs - 1) {
823 if (data_size != usable_leb_size) {
824 dbg_err("bad data_size");
825 goto bad;
826 }
827 } else if (lnum == used_ebs - 1) {
828 if (data_size == 0) {
829 dbg_err("bad data_size at last LEB");
830 goto bad;
831 }
832 } else {
833 dbg_err("too high lnum");
834 goto bad;
835 }
836 } else {
837 if (copy_flag == 0) {
838 if (data_crc != 0) {
839 dbg_err("non-zero data CRC");
840 goto bad;
841 }
842 if (data_size != 0) {
843 dbg_err("non-zero data_size");
844 goto bad;
845 }
846 } else {
847 if (data_size == 0) {
848 dbg_err("zero data_size of copy");
849 goto bad;
850 }
851 }
852 if (used_ebs != 0) {
853 dbg_err("bad used_ebs");
854 goto bad;
855 }
856 }
857
858 return 0;
859
860bad:
861 ubi_err("bad VID header");
862 ubi_dbg_dump_vid_hdr(vid_hdr);
863 ubi_dbg_dump_stack();
864 return 1;
865}
866
867/**
868 * ubi_io_read_vid_hdr - read and check a volume identifier header.
869 * @ubi: UBI device description object
870 * @pnum: physical eraseblock number to read from
871 * @vid_hdr: &struct ubi_vid_hdr object where to store the read volume
872 * identifier header
873 * @verbose: be verbose if the header is corrupted or wasn't found
874 *
875 * This function reads the volume identifier header from physical eraseblock
876 * @pnum and stores it in @vid_hdr. It also checks CRC checksum of the read
877 * volume identifier header. The following codes may be returned:
878 *
879 * o %0 if the CRC checksum is correct and the header was successfully read;
880 * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected
881 * and corrected by the flash driver; this is harmless but may indicate that
882 * this eraseblock may become bad soon;
883 * o %UBI_IO_BAD_VID_HDR if the volume identifier header is corrupted (a CRC
884 * error detected);
885 * o %UBI_IO_PEB_FREE if the physical eraseblock is free (i.e., there is no VID
886 * header there);
887 * o a negative error code in case of failure.
888 */
889int ubi_io_read_vid_hdr(const struct ubi_device *ubi, int pnum,
890 struct ubi_vid_hdr *vid_hdr, int verbose)
891{
892 int err, read_err = 0;
893 uint32_t crc, magic, hdr_crc;
894 void *p;
895
896 dbg_io("read VID header from PEB %d", pnum);
897 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
898
899 p = (char *)vid_hdr - ubi->vid_hdr_shift;
900 err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset,
901 ubi->vid_hdr_alsize);
902 if (err) {
903 if (err != UBI_IO_BITFLIPS && err != -EBADMSG)
904 return err;
905
906 /*
907 * We read all the data, but either a correctable bit-flip
908		 * occurred, or MTD reported some data integrity error,
909		 * like an ECC error in case of NAND. The former is harmless,
910		 * the latter may mean the read data is corrupted. But we have a
911		 * CRC check-sum and we will identify this. If the VID header is
912		 * still OK, we just report this as if there was a bit-flip.
913 */
914 read_err = err;
915 }
916
917 magic = ubi32_to_cpu(vid_hdr->magic);
918 if (magic != UBI_VID_HDR_MAGIC) {
919 /*
920 * If we have read all 0xFF bytes, the VID header probably does
921 * not exist and the physical eraseblock is assumed to be free.
922 *
923 * But if there was a read error, we do not test the data for
924 * 0xFFs. Even if it does contain all 0xFFs, this error
925 * indicates that something is still wrong with this physical
926 * eraseblock and it cannot be regarded as free.
927 */
928 if (read_err != -EBADMSG &&
929 check_pattern(vid_hdr, 0xFF, UBI_VID_HDR_SIZE)) {
930 /* The physical eraseblock is supposedly free */
931
932 /*
933 * The below is just a paranoid check, it has to be
934 * compiled out if paranoid checks are disabled.
935 */
936 err = paranoid_check_all_ff(ubi, pnum, ubi->leb_start,
937 ubi->leb_size);
938 if (err)
939 return err > 0 ? UBI_IO_BAD_VID_HDR : err;
940
941 if (verbose)
942 ubi_warn("no VID header found at PEB %d, "
943 "only 0xFF bytes", pnum);
944 return UBI_IO_PEB_FREE;
945 }
946
947 /*
948 * This is not a valid VID header, and these are not 0xFF
949 * bytes. Report that the header is corrupted.
950 */
951 if (verbose) {
952 ubi_warn("bad magic number at PEB %d: %08x instead of "
953 "%08x", pnum, magic, UBI_VID_HDR_MAGIC);
954 ubi_dbg_dump_vid_hdr(vid_hdr);
955 }
956 return UBI_IO_BAD_VID_HDR;
957 }
958
959 crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC);
960 hdr_crc = ubi32_to_cpu(vid_hdr->hdr_crc);
961
962 if (hdr_crc != crc) {
963 if (verbose) {
964 ubi_warn("bad CRC at PEB %d, calculated %#08x, "
965 "read %#08x", pnum, crc, hdr_crc);
966 ubi_dbg_dump_vid_hdr(vid_hdr);
967 }
968 return UBI_IO_BAD_VID_HDR;
969 }
970
971 /* Validate the VID header that we have just read */
972 err = validate_vid_hdr(ubi, vid_hdr);
973 if (err) {
974 ubi_err("validation failed for PEB %d", pnum);
975 return -EINVAL;
976 }
977
978 return read_err ? UBI_IO_BITFLIPS : 0;
979}
980
981/**
982 * ubi_io_write_vid_hdr - write a volume identifier header.
983 * @ubi: UBI device description object
984 * @pnum: the physical eraseblock number to write to
985 * @vid_hdr: the volume identifier header to write
986 *
987 * This function writes the volume identifier header described by @vid_hdr to
988 * physical eraseblock @pnum. This function automatically fills the
989 * @vid_hdr->magic and the @vid_hdr->version fields, as well as calculates
990 * header CRC checksum and stores it at vid_hdr->hdr_crc.
991 *
992 * This function returns zero in case of success and a negative error code in
993 * case of failure. If %-EIO is returned, the physical eraseblock probably went
994 * bad.
995 */
996int ubi_io_write_vid_hdr(const struct ubi_device *ubi, int pnum,
997 struct ubi_vid_hdr *vid_hdr)
998{
999 int err;
1000 uint32_t crc;
1001 void *p;
1002
1003 dbg_io("write VID header to PEB %d", pnum);
1004 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
1005
1006 err = paranoid_check_peb_ec_hdr(ubi, pnum);
1007 if (err)
1008		return err > 0 ? -EINVAL : err;
1009
1010 vid_hdr->magic = cpu_to_ubi32(UBI_VID_HDR_MAGIC);
1011 vid_hdr->version = UBI_VERSION;
1012 crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC);
1013 vid_hdr->hdr_crc = cpu_to_ubi32(crc);
1014
1015 err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr);
1016 if (err)
1017 return -EINVAL;
1018
1019 p = (char *)vid_hdr - ubi->vid_hdr_shift;
1020 err = ubi_io_write(ubi, p, pnum, ubi->vid_hdr_aloffset,
1021 ubi->vid_hdr_alsize);
1022 return err;
1023}
1024
1025#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
1026
1027/**
1028 * paranoid_check_not_bad - ensure that a physical eraseblock is not bad.
1029 * @ubi: UBI device description object
1030 * @pnum: physical eraseblock number to check
1031 *
1032 * This function returns zero if the physical eraseblock is good, a positive
1033 * number if it is bad and a negative error code if an error occurred.
1034 */
1035static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum)
1036{
1037 int err;
1038
1039 err = ubi_io_is_bad(ubi, pnum);
1040 if (!err)
1041 return err;
1042
1043 ubi_err("paranoid check failed for PEB %d", pnum);
1044 ubi_dbg_dump_stack();
1045 return err;
1046}
1047
1048/**
1049 * paranoid_check_ec_hdr - check if an erase counter header is all right.
1050 * @ubi: UBI device description object
1051 * @pnum: physical eraseblock number the erase counter header belongs to
1052 * @ec_hdr: the erase counter header to check
1053 *
1054 * This function returns zero if the erase counter header contains valid
1055 * values, and %1 if not.
1056 */
1057static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum,
1058 const struct ubi_ec_hdr *ec_hdr)
1059{
1060 int err;
1061 uint32_t magic;
1062
1063 magic = ubi32_to_cpu(ec_hdr->magic);
1064 if (magic != UBI_EC_HDR_MAGIC) {
1065 ubi_err("bad magic %#08x, must be %#08x",
1066 magic, UBI_EC_HDR_MAGIC);
1067 goto fail;
1068 }
1069
1070 err = validate_ec_hdr(ubi, ec_hdr);
1071 if (err) {
1072 ubi_err("paranoid check failed for PEB %d", pnum);
1073 goto fail;
1074 }
1075
1076 return 0;
1077
1078fail:
1079 ubi_dbg_dump_ec_hdr(ec_hdr);
1080 ubi_dbg_dump_stack();
1081 return 1;
1082}
1083
1084/**
1085 * paranoid_check_peb_ec_hdr - check that the erase counter header of a
1086 * physical eraseblock is in-place and is all right.
1087 * @ubi: UBI device description object
1088 * @pnum: the physical eraseblock number to check
1089 *
1090 * This function returns zero if the erase counter header is all right, %1 if
1091 * not, and a negative error code if an error occurred.
1092 */
1093static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum)
1094{
1095 int err;
1096 uint32_t crc, hdr_crc;
1097 struct ubi_ec_hdr *ec_hdr;
1098
1099 ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
1100 if (!ec_hdr)
1101 return -ENOMEM;
1102
1103 err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE);
1104 if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG)
1105 goto exit;
1106
1107 crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC);
1108 hdr_crc = ubi32_to_cpu(ec_hdr->hdr_crc);
1109 if (hdr_crc != crc) {
1110 ubi_err("bad CRC, calculated %#08x, read %#08x", crc, hdr_crc);
1111 ubi_err("paranoid check failed for PEB %d", pnum);
1112 ubi_dbg_dump_ec_hdr(ec_hdr);
1113 ubi_dbg_dump_stack();
1114 err = 1;
1115 goto exit;
1116 }
1117
1118 err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr);
1119
1120exit:
1121 kfree(ec_hdr);
1122 return err;
1123}
1124
1125/**
1126 * paranoid_check_vid_hdr - check that a volume identifier header is all right.
1127 * @ubi: UBI device description object
1128 * @pnum: physical eraseblock number the volume identifier header belongs to
1129 * @vid_hdr: the volume identifier header to check
1130 *
1131 * This function returns zero if the volume identifier header is all right, and
1132 * %1 if not.
1133 */
1134static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum,
1135 const struct ubi_vid_hdr *vid_hdr)
1136{
1137 int err;
1138 uint32_t magic;
1139
1140 magic = ubi32_to_cpu(vid_hdr->magic);
1141 if (magic != UBI_VID_HDR_MAGIC) {
1142 ubi_err("bad VID header magic %#08x at PEB %d, must be %#08x",
1143 magic, pnum, UBI_VID_HDR_MAGIC);
1144 goto fail;
1145 }
1146
1147 err = validate_vid_hdr(ubi, vid_hdr);
1148 if (err) {
1149 ubi_err("paranoid check failed for PEB %d", pnum);
1150 goto fail;
1151 }
1152
1153 return err;
1154
1155fail:
1156 ubi_err("paranoid check failed for PEB %d", pnum);
1157 ubi_dbg_dump_vid_hdr(vid_hdr);
1158 ubi_dbg_dump_stack();
1159 return 1;
1160
1161}
1162
1163/**
1164 * paranoid_check_peb_vid_hdr - check that the volume identifier header of a
1165 * physical eraseblock is in-place and is all right.
1166 * @ubi: UBI device description object
1167 * @pnum: the physical eraseblock number to check
1168 *
1169 * This function returns zero if the volume identifier header is all right,
1170 * %1 if not, and a negative error code if an error occurred.
1171 */
1172static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum)
1173{
1174 int err;
1175 uint32_t crc, hdr_crc;
1176 struct ubi_vid_hdr *vid_hdr;
1177 void *p;
1178
1179 vid_hdr = ubi_zalloc_vid_hdr(ubi);
1180 if (!vid_hdr)
1181 return -ENOMEM;
1182
1183 p = (char *)vid_hdr - ubi->vid_hdr_shift;
1184 err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset,
1185 ubi->vid_hdr_alsize);
1186 if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG)
1187 goto exit;
1188
1189	crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC);
1190 hdr_crc = ubi32_to_cpu(vid_hdr->hdr_crc);
1191 if (hdr_crc != crc) {
1192 ubi_err("bad VID header CRC at PEB %d, calculated %#08x, "
1193 "read %#08x", pnum, crc, hdr_crc);
1194 ubi_err("paranoid check failed for PEB %d", pnum);
1195 ubi_dbg_dump_vid_hdr(vid_hdr);
1196 ubi_dbg_dump_stack();
1197 err = 1;
1198 goto exit;
1199 }
1200
1201 err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr);
1202
1203exit:
1204 ubi_free_vid_hdr(ubi, vid_hdr);
1205 return err;
1206}
1207
1208/**
1209 * paranoid_check_all_ff - check that a region of flash is empty.
1210 * @ubi: UBI device description object
1211 * @pnum: the physical eraseblock number to check
1212 * @offset: the starting offset within the physical eraseblock to check
1213 * @len: the length of the region to check
1214 *
1215 * This function returns zero if only 0xFF bytes are present at offset
1216 * @offset of the physical eraseblock @pnum, %1 if not, and a negative error
1217 * code if an error occurred.
1218 */
1219static int paranoid_check_all_ff(const struct ubi_device *ubi, int pnum,
1220 int offset, int len)
1221{
1222 size_t read;
1223 int err;
1224 void *buf;
1225 loff_t addr = (loff_t)pnum * ubi->peb_size + offset;
1226
1227 buf = kzalloc(len, GFP_KERNEL);
1228 if (!buf)
1229 return -ENOMEM;
1230
1231 err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf);
1232 if (err && err != -EUCLEAN) {
1233 ubi_err("error %d while reading %d bytes from PEB %d:%d, "
1234 "read %zd bytes", err, len, pnum, offset, read);
1235 goto error;
1236 }
1237
1238 err = check_pattern(buf, 0xFF, len);
1239 if (err == 0) {
1240 ubi_err("flash region at PEB %d:%d, length %d does not "
1241 "contain all 0xFF bytes", pnum, offset, len);
1242 goto fail;
1243 }
1244
1245 kfree(buf);
1246 return 0;
1247
1248fail:
1249 ubi_err("paranoid check failed for PEB %d", pnum);
1250 dbg_msg("hex dump of the %d-%d region", offset, offset + len);
1251 ubi_dbg_hexdump(buf, len);
1252 err = 1;
1253error:
1254 ubi_dbg_dump_stack();
1255 kfree(buf);
1256 return err;
1257}
1258
1259#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */
diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c
new file mode 100644
index 000000000000..d352c4575c3d
--- /dev/null
+++ b/drivers/mtd/ubi/kapi.c
@@ -0,0 +1,575 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/* This file mostly implements UBI kernel API functions */
22
23#include <linux/module.h>
24#include <linux/err.h>
25#include <asm/div64.h>
26#include "ubi.h"
27
28/**
29 * ubi_get_device_info - get information about UBI device.
30 * @ubi_num: UBI device number
31 * @di: the information is stored here
32 *
33 * This function returns %0 in case of success and %-ENODEV if there is no
34 * such UBI device.
35 */
36int ubi_get_device_info(int ubi_num, struct ubi_device_info *di)
37{
38 const struct ubi_device *ubi;
39
40 if (!try_module_get(THIS_MODULE))
41 return -ENODEV;
42
43 if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES ||
44 !ubi_devices[ubi_num]) {
45 module_put(THIS_MODULE);
46 return -ENODEV;
47 }
48
49 ubi = ubi_devices[ubi_num];
50 di->ubi_num = ubi->ubi_num;
51 di->leb_size = ubi->leb_size;
52 di->min_io_size = ubi->min_io_size;
53 di->ro_mode = ubi->ro_mode;
54 di->cdev = MKDEV(ubi->major, 0);
55 module_put(THIS_MODULE);
56 return 0;
57}
58EXPORT_SYMBOL_GPL(ubi_get_device_info);
59
60/**
61 * ubi_get_volume_info - get information about UBI volume.
62 * @desc: volume descriptor
63 * @vi: the information is stored here
64 */
65void ubi_get_volume_info(struct ubi_volume_desc *desc,
66 struct ubi_volume_info *vi)
67{
68 const struct ubi_volume *vol = desc->vol;
69 const struct ubi_device *ubi = vol->ubi;
70
71 vi->vol_id = vol->vol_id;
72 vi->ubi_num = ubi->ubi_num;
73 vi->size = vol->reserved_pebs;
74 vi->used_bytes = vol->used_bytes;
75 vi->vol_type = vol->vol_type;
76 vi->corrupted = vol->corrupted;
77 vi->upd_marker = vol->upd_marker;
78 vi->alignment = vol->alignment;
79 vi->usable_leb_size = vol->usable_leb_size;
80 vi->name_len = vol->name_len;
81 vi->name = vol->name;
82 vi->cdev = MKDEV(ubi->major, vi->vol_id + 1);
83}
84EXPORT_SYMBOL_GPL(ubi_get_volume_info);
85
86/**
87 * ubi_open_volume - open UBI volume.
88 * @ubi_num: UBI device number
89 * @vol_id: volume ID
90 * @mode: open mode
91 *
92 * The @mode parameter specifies if the volume should be opened in read-only
93 * mode, read-write mode, or exclusive mode. The exclusive mode guarantees that
94 * nobody else will be able to open this volume. UBI allows many volume
95 * readers and one writer at a time.
96 *
97 * If a static volume is being opened for the first time since boot, it will be
98 * checked by this function, which means it will be fully read and the CRC
99 * checksum of each logical eraseblock will be checked.
100 *
101 * This function returns a volume descriptor in case of success and a negative
102 * error code in case of failure.
103 */
104struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode)
105{
106 int err;
107 struct ubi_volume_desc *desc;
108 struct ubi_device *ubi = ubi_devices[ubi_num];
109 struct ubi_volume *vol;
110
111 dbg_msg("open device %d volume %d, mode %d", ubi_num, vol_id, mode);
112
113 err = -ENODEV;
114 if (!try_module_get(THIS_MODULE))
115 return ERR_PTR(err);
116
117 if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi)
118 goto out_put;
119
120 err = -EINVAL;
121 if (vol_id < 0 || vol_id >= ubi->vtbl_slots)
122 goto out_put;
123 if (mode != UBI_READONLY && mode != UBI_READWRITE &&
124 mode != UBI_EXCLUSIVE)
125 goto out_put;
126
127 desc = kmalloc(sizeof(struct ubi_volume_desc), GFP_KERNEL);
128 if (!desc) {
129 err = -ENOMEM;
130 goto out_put;
131 }
132
133 spin_lock(&ubi->volumes_lock);
134 vol = ubi->volumes[vol_id];
135 if (!vol) {
136 err = -ENODEV;
137 goto out_unlock;
138 }
139
140 err = -EBUSY;
141 switch (mode) {
142 case UBI_READONLY:
143 if (vol->exclusive)
144 goto out_unlock;
145 vol->readers += 1;
146 break;
147
148 case UBI_READWRITE:
149 if (vol->exclusive || vol->writers > 0)
150 goto out_unlock;
151 vol->writers += 1;
152 break;
153
154 case UBI_EXCLUSIVE:
155 if (vol->exclusive || vol->writers || vol->readers)
156 goto out_unlock;
157 vol->exclusive = 1;
158 break;
159 }
160 spin_unlock(&ubi->volumes_lock);
161
162 desc->vol = vol;
163 desc->mode = mode;
164
165 /*
166 * To prevent simultaneous checks of the same volume we use @vtbl_mutex,
167 * although it is not the purpose it was introduced for.
168 */
169 mutex_lock(&ubi->vtbl_mutex);
170 if (!vol->checked) {
171 /* This is the first open - check the volume */
172 err = ubi_check_volume(ubi, vol_id);
173 if (err < 0) {
174 mutex_unlock(&ubi->vtbl_mutex);
175 ubi_close_volume(desc);
176 return ERR_PTR(err);
177 }
178 if (err == 1) {
179 ubi_warn("volume %d on UBI device %d is corrupted",
180 vol_id, ubi->ubi_num);
181 vol->corrupted = 1;
182 }
183 vol->checked = 1;
184 }
185 mutex_unlock(&ubi->vtbl_mutex);
186 return desc;
187
188out_unlock:
189 spin_unlock(&ubi->volumes_lock);
190 kfree(desc);
191out_put:
192 module_put(THIS_MODULE);
193 return ERR_PTR(err);
194}
195EXPORT_SYMBOL_GPL(ubi_open_volume);
196
197/**
198 * ubi_open_volume_nm - open UBI volume by name.
199 * @ubi_num: UBI device number
200 * @name: volume name
201 * @mode: open mode
202 *
203 * This function is similar to 'ubi_open_volume()', but opens a volume by name.
204 */
205struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name,
206 int mode)
207{
208 int i, vol_id = -1, len;
209 struct ubi_volume_desc *ret;
210 struct ubi_device *ubi;
211
212 dbg_msg("open volume %s, mode %d", name, mode);
213
214 if (!name)
215 return ERR_PTR(-EINVAL);
216
217 len = strnlen(name, UBI_VOL_NAME_MAX + 1);
218 if (len > UBI_VOL_NAME_MAX)
219 return ERR_PTR(-EINVAL);
220
221 ret = ERR_PTR(-ENODEV);
222 if (!try_module_get(THIS_MODULE))
223 return ret;
224
225 if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi_devices[ubi_num])
226 goto out_put;
227
228 ubi = ubi_devices[ubi_num];
229
230 spin_lock(&ubi->volumes_lock);
231 /* Walk all volumes of this UBI device */
232 for (i = 0; i < ubi->vtbl_slots; i++) {
233 struct ubi_volume *vol = ubi->volumes[i];
234
235 if (vol && len == vol->name_len && !strcmp(name, vol->name)) {
236 vol_id = i;
237 break;
238 }
239 }
240 spin_unlock(&ubi->volumes_lock);
241
242 if (vol_id < 0)
243 goto out_put;
244
245 ret = ubi_open_volume(ubi_num, vol_id, mode);
246
247out_put:
248 module_put(THIS_MODULE);
249 return ret;
250}
251EXPORT_SYMBOL_GPL(ubi_open_volume_nm);
252
253/**
254 * ubi_close_volume - close UBI volume.
255 * @desc: volume descriptor
256 */
257void ubi_close_volume(struct ubi_volume_desc *desc)
258{
259 struct ubi_volume *vol = desc->vol;
260
261 dbg_msg("close volume %d, mode %d", vol->vol_id, desc->mode);
262
263 spin_lock(&vol->ubi->volumes_lock);
264 switch (desc->mode) {
265 case UBI_READONLY:
266 vol->readers -= 1;
267 break;
268 case UBI_READWRITE:
269 vol->writers -= 1;
270 break;
271 case UBI_EXCLUSIVE:
272 vol->exclusive = 0;
273 }
274 spin_unlock(&vol->ubi->volumes_lock);
275
276 kfree(desc);
277 module_put(THIS_MODULE);
278}
279EXPORT_SYMBOL_GPL(ubi_close_volume);
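
/*
 * Editorial sketch of typical open/close usage by a kernel client of this
 * API (the device/volume numbers, @buf and @count are made up for the
 * example; @check = 1 asks for the static-volume CRC check described above):
 *
 *	struct ubi_volume_desc *desc;
 *	int err;
 *
 *	desc = ubi_open_volume(0, 3, UBI_READONLY);
 *	if (IS_ERR(desc))
 *		return PTR_ERR(desc);
 *	err = ubi_leb_read(desc, 0, buf, 0, count, 1);
 *	ubi_close_volume(desc);
 *	return err;
 */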
280
281/**
282 * ubi_leb_read - read data.
283 * @desc: volume descriptor
284 * @lnum: logical eraseblock number to read from
285 * @buf: buffer where to store the read data
286 * @offset: offset within the logical eraseblock to read from
287 * @len: how many bytes to read
288 * @check: whether UBI has to check the read data's CRC or not.
289 *
290 * This function reads data from offset @offset of logical eraseblock @lnum and
291 * stores the data at @buf. When reading from static volumes, @check specifies
292 * whether the data has to be checked or not. If yes, the whole logical
293 * eraseblock will be read and its CRC checksum will be checked (i.e., the CRC
294 * checksum is per-eraseblock). So checking may substantially slow down the
295 * read speed. The @check argument is ignored for dynamic volumes.
296 *
297 * In case of success, this function returns zero. In case of failure, this
298 * function returns a negative error code.
299 *
300 * %-EBADMSG error code is returned:
301 * o for both static and dynamic volumes if MTD driver has detected a data
302 * integrity problem (unrecoverable ECC checksum mismatch in case of NAND);
303 * o for static volumes in case of data CRC mismatch.
304 *
305 * If the volume is damaged because of an interrupted update this function just
306 * returns immediately with %-EBADF error code.
307 */
308int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
309 int len, int check)
310{
311 struct ubi_volume *vol = desc->vol;
312 struct ubi_device *ubi = vol->ubi;
313 int err, vol_id = vol->vol_id;
314
315 dbg_msg("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset);
316
317 if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 ||
318 lnum >= vol->used_ebs || offset < 0 || len < 0 ||
319 offset + len > vol->usable_leb_size)
320 return -EINVAL;
321
322 if (vol->vol_type == UBI_STATIC_VOLUME && lnum == vol->used_ebs - 1 &&
323 offset + len > vol->last_eb_bytes)
324 return -EINVAL;
325
326 if (vol->upd_marker)
327 return -EBADF;
328 if (len == 0)
329 return 0;
330
331 err = ubi_eba_read_leb(ubi, vol_id, lnum, buf, offset, len, check);
332	if (err == -EBADMSG && vol->vol_type == UBI_STATIC_VOLUME) {
333 ubi_warn("mark volume %d as corrupted", vol_id);
334 vol->corrupted = 1;
335 }
336
337 return err;
338}
339EXPORT_SYMBOL_GPL(ubi_leb_read);
340
341/**
342 * ubi_leb_write - write data.
343 * @desc: volume descriptor
344 * @lnum: logical eraseblock number to write to
345 * @buf: data to write
346 * @offset: offset within the logical eraseblock where to write
347 * @len: how many bytes to write
348 * @dtype: expected data type
349 *
350 * This function writes @len bytes of data from @buf to offset @offset of
351 * logical eraseblock @lnum. The @dtype argument describes the expected
352 * lifetime of the data.
353 *
354 * This function takes care of physical eraseblock write failures. If the
355 * write operation to the physical eraseblock fails, the logical eraseblock is
356 * re-mapped to another physical eraseblock, the data is recovered, and the
357 * write finishes. UBI has a pool of reserved physical eraseblocks for this.
358 *
359 * If all the data were successfully written, zero is returned. If an error
360 * occurred and UBI has not been able to recover from it, this function returns
361 * a negative error code. Note, in case of an error, it is possible that
362 * something was still written to the flash media, but that may be some
363 * garbage.
364 *
365 * If the volume is damaged because of an interrupted update this function just
366 * returns immediately with %-EBADF code.
367 */
368int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
369 int offset, int len, int dtype)
370{
371 struct ubi_volume *vol = desc->vol;
372 struct ubi_device *ubi = vol->ubi;
373 int vol_id = vol->vol_id;
374
375 dbg_msg("write %d bytes to LEB %d:%d:%d", len, vol_id, lnum, offset);
376
377 if (vol_id < 0 || vol_id >= ubi->vtbl_slots)
378 return -EINVAL;
379
380 if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
381 return -EROFS;
382
383 if (lnum < 0 || lnum >= vol->reserved_pebs || offset < 0 || len < 0 ||
384 offset + len > vol->usable_leb_size || offset % ubi->min_io_size ||
385 len % ubi->min_io_size)
386 return -EINVAL;
387
388 if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM &&
389 dtype != UBI_UNKNOWN)
390 return -EINVAL;
391
392 if (vol->upd_marker)
393 return -EBADF;
394
395 if (len == 0)
396 return 0;
397
398 return ubi_eba_write_leb(ubi, vol_id, lnum, buf, offset, len, dtype);
399}
400EXPORT_SYMBOL_GPL(ubi_leb_write);
401
402/**
403 * ubi_leb_change - change logical eraseblock atomically.
404 * @desc: volume descriptor
405 * @lnum: logical eraseblock number to change
406 * @buf: data to write
407 * @len: how many bytes to write
408 * @dtype: expected data type
409 *
410 * This function changes the contents of a logical eraseblock atomically. @buf
411 * has to contain new logical eraseblock data, and @len - the length of the
412 * data, which has to be aligned. The length may be shorter than the logical
413 * eraseblock size, and the logical eraseblock may be appended to more times
414 * later on. This function guarantees that in case of an unclean reboot the old
415 * contents are preserved. Returns zero in case of success and a negative error
416 * code in case of failure.
417 */
418int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
419 int len, int dtype)
420{
421 struct ubi_volume *vol = desc->vol;
422 struct ubi_device *ubi = vol->ubi;
423 int vol_id = vol->vol_id;
424
425 dbg_msg("atomically write %d bytes to LEB %d:%d", len, vol_id, lnum);
426
427 if (vol_id < 0 || vol_id >= ubi->vtbl_slots)
428 return -EINVAL;
429
430 if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
431 return -EROFS;
432
433 if (lnum < 0 || lnum >= vol->reserved_pebs || len < 0 ||
434 len > vol->usable_leb_size || len % ubi->min_io_size)
435 return -EINVAL;
436
437 if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM &&
438 dtype != UBI_UNKNOWN)
439 return -EINVAL;
440
441 if (vol->upd_marker)
442 return -EBADF;
443
444 if (len == 0)
445 return 0;
446
447 return ubi_eba_atomic_leb_change(ubi, vol_id, lnum, buf, len, dtype);
448}
449EXPORT_SYMBOL_GPL(ubi_leb_change);
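
/*
 * Editorial sketch: the atomic change operation suits on-flash data which
 * must never be observed half-written, e.g. a table kept in LEB 0 (@tbl and
 * @tbl_size are hypothetical; per the checks above, @tbl_size has to be
 * aligned to the minimal I/O unit size):
 *
 *	err = ubi_leb_change(desc, 0, tbl, tbl_size, UBI_LONGTERM);
 *	if (err)
 *		return err;
 */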
450
451/**
452 * ubi_leb_erase - erase logical eraseblock.
453 * @desc: volume descriptor
454 * @lnum: logical eraseblock number
455 *
456 * This function un-maps logical eraseblock @lnum and synchronously erases the
457 * correspondent physical eraseblock. Returns zero in case of success and a
458 * negative error code in case of failure.
459 *
460 * If the volume is damaged because of an interrupted update this function just
461 * returns immediately with %-EBADF code.
462 */
463int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum)
464{
465 struct ubi_volume *vol = desc->vol;
466 struct ubi_device *ubi = vol->ubi;
467 int err, vol_id = vol->vol_id;
468
469 dbg_msg("erase LEB %d:%d", vol_id, lnum);
470
471 if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
472 return -EROFS;
473
474 if (lnum < 0 || lnum >= vol->reserved_pebs)
475 return -EINVAL;
476
477 if (vol->upd_marker)
478 return -EBADF;
479
480 err = ubi_eba_unmap_leb(ubi, vol_id, lnum);
481 if (err)
482 return err;
483
484 return ubi_wl_flush(ubi);
485}
486EXPORT_SYMBOL_GPL(ubi_leb_erase);
487
488/**
489 * ubi_leb_unmap - un-map logical eraseblock.
490 * @desc: volume descriptor
491 * @lnum: logical eraseblock number
492 *
493 * This function un-maps logical eraseblock @lnum and schedules the
494 * corresponding physical eraseblock for erasure, so that it will eventually be
495 * physically erased in the background. This operation is much faster than the
496 * erase operation.
497 *
498 * Unlike erase, the un-map operation does not guarantee that the logical
499 * eraseblock will contain all 0xFF bytes when UBI is initialized again. For
500 * example, if several logical eraseblocks are un-mapped, and an unclean reboot
501 * happens after this, the logical eraseblocks will not necessarily be
502 * un-mapped again when this MTD device is attached. They may actually be
503 * mapped to the same physical eraseblocks again. So, this function has to be
504 * used with care.
505 *
506 * In other words, when un-mapping a logical eraseblock, UBI does not store
507 * any information about this on the flash media, it just marks the logical
508 * eraseblock as "un-mapped" in RAM. If UBI is detached before the physical
509 * eraseblock is physically erased, it will be mapped again to the same logical
510 * eraseblock when the MTD device is attached again.
511 *
512 * The main and obvious use-case of this function is when the contents of a
513 * logical eraseblock has to be re-written. Then it is much more efficient to
514 * first un-map it, then write new data, rather than first erase it, then write
515 * new data. Note, once new data has been written to the logical eraseblock,
516 * UBI guarantees that the old contents are gone forever. In other words, if an
517 * unclean reboot happens after the logical eraseblock has been un-mapped and
518 * then written to, it will contain the last written data.
519 *
520 * This function returns zero in case of success and a negative error code in
521 * case of failure. If the volume is damaged because of an interrupted update
522 * this function just returns immediately with %-EBADF code.
523 */
524int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum)
525{
526 struct ubi_volume *vol = desc->vol;
527 struct ubi_device *ubi = vol->ubi;
528 int vol_id = vol->vol_id;
529
530 dbg_msg("unmap LEB %d:%d", vol_id, lnum);
531
532 if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
533 return -EROFS;
534
535 if (lnum < 0 || lnum >= vol->reserved_pebs)
536 return -EINVAL;
537
538 if (vol->upd_marker)
539 return -EBADF;
540
541 return ubi_eba_unmap_leb(ubi, vol_id, lnum);
542}
543EXPORT_SYMBOL_GPL(ubi_leb_unmap);
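The efficient re-write pattern described in the comment above might look like this sketch (the helper name is hypothetical; @len must be aligned to the minimal I/O unit):

	/* Sketch: replace a mapped LEB's contents without a synchronous erase */
	static int example_rewrite_leb(struct ubi_volume_desc *desc, int lnum,
				       const void *data, int len)
	{
		int err;

		err = ubi_leb_unmap(desc, lnum);	/* cheap, in-RAM operation */
		if (err)
			return err;

		/* Once this write succeeds, the old contents cannot reappear */
		return ubi_leb_write(desc, lnum, data, 0, len, UBI_UNKNOWN);
	}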
544
545/**
546 * ubi_is_mapped - check if logical eraseblock is mapped.
547 * @desc: volume descriptor
548 * @lnum: logical eraseblock number
549 *
550 * This function checks if logical eraseblock @lnum is mapped to a physical
551 * eraseblock. If a logical eraseblock is un-mapped, this does not necessarily
552 * mean it will still be un-mapped after the UBI device is re-attached. The
553 * logical eraseblock may become mapped to the physical eraseblock it was last
554 * mapped to.
555 *
556 * This function returns %1 if the LEB is mapped, %0 if not, and a negative
557 * error code in case of failure. If the volume is damaged because of an
558 * interrupted update this function just returns immediately with %-EBADF error
559 * code.
560 */
561int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum)
562{
563 struct ubi_volume *vol = desc->vol;
564
565 dbg_msg("test LEB %d:%d", vol->vol_id, lnum);
566
567 if (lnum < 0 || lnum >= vol->reserved_pebs)
568 return -EINVAL;
569
570 if (vol->upd_marker)
571 return -EBADF;
572
573 return vol->eba_tbl[lnum] >= 0;
574}
575EXPORT_SYMBOL_GPL(ubi_is_mapped);
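A typical caller combines the mapping check with a write, as in this sketch; note the three-way result (negative, %0, %1). Here @desc, @lnum, @buf and @len are assumed to come from the surrounding context:

	/* Sketch: initialize a LEB only if nothing was written to it yet */
	err = ubi_is_mapped(desc, lnum);
	if (err < 0)
		return err;		/* bad lnum or damaged volume */
	if (err == 0)
		err = ubi_leb_write(desc, lnum, buf, 0, len, UBI_UNKNOWN);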
diff --git a/drivers/mtd/ubi/misc.c b/drivers/mtd/ubi/misc.c
new file mode 100644
index 000000000000..38d4e6757dc7
--- /dev/null
+++ b/drivers/mtd/ubi/misc.c
@@ -0,0 +1,105 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/* Here we keep miscellaneous functions which are used all over the UBI code */
22
23#include "ubi.h"
24
25/**
26 * calc_data_len - calculate how much real data is stored in a buffer.
27 * @ubi: UBI device description object
28 * @buf: a buffer with the contents of the physical eraseblock
29 * @length: the buffer length
30 *
 31 * This function calculates how much "real data" is stored in @buf and returns
 32 * the length. Trailing 0xFF bytes at the end of the buffer are not considered
 33 * "real data".
34 */
35int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf,
36 int length)
37{
38 int i;
39
40 ubi_assert(length % ubi->min_io_size == 0);
41
42 for (i = length - 1; i >= 0; i--)
43 if (((const uint8_t *)buf)[i] != 0xFF)
44 break;
45
46 /* The resulting length must be aligned to the minimum flash I/O size */
47 length = ALIGN(i + 1, ubi->min_io_size);
48 return length;
49}
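As a worked example (values assumed): with min_io_size = 512 and @length = 2048, if the last non-0xFF byte sits at offset 700, the loop stops at i = 700 and the function returns ALIGN(700 + 1, 512) = 1024, i.e. two minimal I/O units of real data; an all-0xFF buffer yields 0.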
50
51/**
52 * ubi_check_volume - check the contents of a static volume.
53 * @ubi: UBI device description object
54 * @vol_id: ID of the volume to check
55 *
56 * This function checks if static volume @vol_id is corrupted by fully reading
57 * it and checking data CRC. This function returns %0 if the volume is not
58 * corrupted, %1 if it is corrupted and a negative error code in case of
59 * failure. Dynamic volumes are not checked and zero is returned immediately.
60 */
61int ubi_check_volume(struct ubi_device *ubi, int vol_id)
62{
63 void *buf;
64 int err = 0, i;
65 struct ubi_volume *vol = ubi->volumes[vol_id];
66
67 if (vol->vol_type != UBI_STATIC_VOLUME)
68 return 0;
69
70 buf = kmalloc(vol->usable_leb_size, GFP_KERNEL);
71 if (!buf)
72 return -ENOMEM;
73
74 for (i = 0; i < vol->used_ebs; i++) {
75 int size;
76
77 if (i == vol->used_ebs - 1)
78 size = vol->last_eb_bytes;
79 else
80 size = vol->usable_leb_size;
81
82 err = ubi_eba_read_leb(ubi, vol_id, i, buf, 0, size, 1);
83 if (err) {
84 if (err == -EBADMSG)
85 err = 1;
86 break;
87 }
88 }
89
90 kfree(buf);
91 return err;
92}
93
94/**
 95 * ubi_calculate_reserved - calculate how many PEBs must be reserved for bad
96 * eraseblock handling.
97 * @ubi: UBI device description object
98 */
99void ubi_calculate_reserved(struct ubi_device *ubi)
100{
101 ubi->beb_rsvd_level = ubi->good_peb_count/100;
102 ubi->beb_rsvd_level *= CONFIG_MTD_UBI_BEB_RESERVE;
103	if (ubi->beb_rsvd_level < MIN_RESERVED_PEBS)
104		ubi->beb_rsvd_level = MIN_RESERVED_PEBS;
105}
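For example, with 4096 good PEBs and CONFIG_MTD_UBI_BEB_RESERVE set to 1 (percent), beb_rsvd_level becomes 4096/100 * 1 = 40 PEBs; on a small device where the computed value falls below the minimum, it is clamped to MIN_RESERVED_PEBS (2). The config value of 1 is only an assumed example here.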
diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c
new file mode 100644
index 000000000000..473f3200b868
--- /dev/null
+++ b/drivers/mtd/ubi/scan.c
@@ -0,0 +1,1368 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * UBI scanning unit.
23 *
24 * This unit is responsible for scanning the flash media, checking UBI
25 * headers and providing complete information about the UBI flash image.
26 *
 27 * The scanning information is represented by a &struct ubi_scan_info object.
28 * Information about found volumes is represented by &struct ubi_scan_volume
29 * objects which are kept in volume RB-tree with root at the @volumes field.
30 * The RB-tree is indexed by the volume ID.
31 *
32 * Found logical eraseblocks are represented by &struct ubi_scan_leb objects.
33 * These objects are kept in per-volume RB-trees with the root at the
34 * corresponding &struct ubi_scan_volume object. To put it differently, we keep
35 * an RB-tree of per-volume objects and each of these objects is the root of
36 * RB-tree of per-eraseblock objects.
37 *
38 * Corrupted physical eraseblocks are put to the @corr list, free physical
 39 * eraseblocks are put to the @free list, and the physical eraseblocks to be
40 * erased are put to the @erase list.
41 */
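Given this layout, a consumer of the scanning results walks the nested trees. A minimal sketch, using the same ubi_rb_for_each_entry() helper this file itself uses further below (@si is assumed to be a struct ubi_scan_info pointer):

	/* Sketch: visit every LEB found by scanning */
	struct rb_node *rb1, *rb2;
	struct ubi_scan_volume *sv;
	struct ubi_scan_leb *seb;

	ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb)		/* each volume */
		ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)	/* each LEB */
			dbg_bld("LEB %d:%d -> PEB %d, EC %d",
				sv->vol_id, seb->lnum, seb->pnum, seb->ec);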
42
43#include <linux/err.h>
44#include <linux/crc32.h>
45#include "ubi.h"
46
47#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
48static int paranoid_check_si(const struct ubi_device *ubi,
49 struct ubi_scan_info *si);
50#else
51#define paranoid_check_si(ubi, si) 0
52#endif
53
54/* Temporary variables used during scanning */
55static struct ubi_ec_hdr *ech;
56static struct ubi_vid_hdr *vidh;
57
58int ubi_scan_add_to_list(struct ubi_scan_info *si, int pnum, int ec,
59 struct list_head *list)
60{
61 struct ubi_scan_leb *seb;
62
63 if (list == &si->free)
64 dbg_bld("add to free: PEB %d, EC %d", pnum, ec);
65 else if (list == &si->erase)
66 dbg_bld("add to erase: PEB %d, EC %d", pnum, ec);
67 else if (list == &si->corr)
68 dbg_bld("add to corrupted: PEB %d, EC %d", pnum, ec);
69 else if (list == &si->alien)
70 dbg_bld("add to alien: PEB %d, EC %d", pnum, ec);
71 else
72 BUG();
73
74 seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL);
75 if (!seb)
76 return -ENOMEM;
77
78 seb->pnum = pnum;
79 seb->ec = ec;
80 list_add_tail(&seb->u.list, list);
81 return 0;
82}
83
84/**
85 * commit_to_mean_value - commit intermediate results to the final mean erase
86 * counter value.
87 * @si: scanning information
88 *
89 * This is a helper function which calculates partial mean erase counter mean
90 * value and adds it to the resulting mean value. As we can work only in
91 * integer arithmetic and we want to calculate the mean value of erase counter
92 * accurately, we first sum erase counter values in @si->ec_sum variable and
93 * count these components in @si->ec_count. If this temporary @si->ec_sum is
94 * going to overflow, we calculate the partial mean value
95 * (@si->ec_sum/@si->ec_count) and add it to @si->mean_ec.
96 */
97static void commit_to_mean_value(struct ubi_scan_info *si)
98{
 99	if (si->ec_sum % si->ec_count >= si->ec_count / 2)
100		si->mean_ec += 1;
101	si->ec_sum /= si->ec_count;
102	si->mean_ec += si->ec_sum;
103}
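The same overflow-guarded running-mean technique in isolation, as a hypothetical stand-alone sketch (the type and function names are invented for illustration):

	struct mean_state {
		unsigned long long sum;	/* samples accumulated so far */
		int count;		/* number of samples in @sum */
		int mean;		/* mean value committed so far */
	};

	static void add_sample(struct mean_state *s, unsigned int ec)
	{
		if (s->sum + ec < s->sum) {
			/* @sum is about to overflow - fold it into @mean */
			if (s->sum % s->count >= s->count / 2)
				s->mean += 1;	/* round to nearest */
			s->mean += s->sum / s->count;
			s->sum = 0;
			s->count = 0;
		}
		s->sum += ec;
		s->count += 1;
	}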
104
105/**
106 * validate_vid_hdr - check that volume identifier header is correct and
107 * consistent.
108 * @vid_hdr: the volume identifier header to check
109 * @sv: information about the volume this logical eraseblock belongs to
110 * @pnum: physical eraseblock number the VID header came from
111 *
112 * This function checks that data stored in @vid_hdr is consistent. Returns
113 * non-zero if an inconsistency was found and zero if not.
114 *
115 * Note, UBI sanity-checks everything it reads from the flash media. Most of
116 * the checks are done in the I/O unit. Here we check that the information in
117 * the VID header is consistent with the information in other VID headers of
118 * the same volume.
119 */
120static int validate_vid_hdr(const struct ubi_vid_hdr *vid_hdr,
121 const struct ubi_scan_volume *sv, int pnum)
122{
123 int vol_type = vid_hdr->vol_type;
124 int vol_id = ubi32_to_cpu(vid_hdr->vol_id);
125 int used_ebs = ubi32_to_cpu(vid_hdr->used_ebs);
126 int data_pad = ubi32_to_cpu(vid_hdr->data_pad);
127
128 if (sv->leb_count != 0) {
129 int sv_vol_type;
130
131 /*
132 * This is not the first logical eraseblock belonging to this
133 * volume. Ensure that the data in its VID header is consistent
134		 * with the data in previous logical eraseblock headers.
135 */
136
137 if (vol_id != sv->vol_id) {
138 dbg_err("inconsistent vol_id");
139 goto bad;
140 }
141
142 if (sv->vol_type == UBI_STATIC_VOLUME)
143 sv_vol_type = UBI_VID_STATIC;
144 else
145 sv_vol_type = UBI_VID_DYNAMIC;
146
147 if (vol_type != sv_vol_type) {
148 dbg_err("inconsistent vol_type");
149 goto bad;
150 }
151
152 if (used_ebs != sv->used_ebs) {
153 dbg_err("inconsistent used_ebs");
154 goto bad;
155 }
156
157 if (data_pad != sv->data_pad) {
158 dbg_err("inconsistent data_pad");
159 goto bad;
160 }
161 }
162
163 return 0;
164
165bad:
166 ubi_err("inconsistent VID header at PEB %d", pnum);
167 ubi_dbg_dump_vid_hdr(vid_hdr);
168 ubi_dbg_dump_sv(sv);
169 return -EINVAL;
170}
171
172/**
173 * add_volume - add volume to the scanning information.
174 * @si: scanning information
175 * @vol_id: ID of the volume to add
176 * @pnum: physical eraseblock number
177 * @vid_hdr: volume identifier header
178 *
179 * If the volume corresponding to the @vid_hdr logical eraseblock is already
180 * present in the scanning information, this function does nothing. Otherwise
181 * it adds corresponding volume to the scanning information. Returns a pointer
182 * to the scanning volume object in case of success and an error pointer in
183 * case of failure.
184 */
185static struct ubi_scan_volume *add_volume(struct ubi_scan_info *si, int vol_id,
186 int pnum,
187 const struct ubi_vid_hdr *vid_hdr)
188{
189 struct ubi_scan_volume *sv;
190 struct rb_node **p = &si->volumes.rb_node, *parent = NULL;
191
192 ubi_assert(vol_id == ubi32_to_cpu(vid_hdr->vol_id));
193
194	/* Walk the volume RB-tree to see if this volume is already present */
195 while (*p) {
196 parent = *p;
197 sv = rb_entry(parent, struct ubi_scan_volume, rb);
198
199 if (vol_id == sv->vol_id)
200 return sv;
201
202 if (vol_id > sv->vol_id)
203 p = &(*p)->rb_left;
204 else
205 p = &(*p)->rb_right;
206 }
207
208 /* The volume is absent - add it */
209 sv = kmalloc(sizeof(struct ubi_scan_volume), GFP_KERNEL);
210 if (!sv)
211 return ERR_PTR(-ENOMEM);
212
213 sv->highest_lnum = sv->leb_count = 0;
214	sv->last_data_size = 0;
215 sv->vol_id = vol_id;
216 sv->root = RB_ROOT;
217 sv->used_ebs = ubi32_to_cpu(vid_hdr->used_ebs);
218 sv->data_pad = ubi32_to_cpu(vid_hdr->data_pad);
219 sv->compat = vid_hdr->compat;
220 sv->vol_type = vid_hdr->vol_type == UBI_VID_DYNAMIC ? UBI_DYNAMIC_VOLUME
221 : UBI_STATIC_VOLUME;
222 if (vol_id > si->highest_vol_id)
223 si->highest_vol_id = vol_id;
224
225 rb_link_node(&sv->rb, parent, p);
226 rb_insert_color(&sv->rb, &si->volumes);
227 si->vols_found += 1;
228 dbg_bld("added volume %d", vol_id);
229 return sv;
230}
231
232/**
233 * compare_lebs - find out which logical eraseblock is newer.
234 * @ubi: UBI device description object
235 * @seb: first logical eraseblock to compare
236 * @pnum: physical eraseblock number of the second logical eraseblock to
237 * compare
238 * @vid_hdr: volume identifier header of the second logical eraseblock
239 *
240 * This function compares two copies of a LEB and reports which one is newer.
241 * In case of success this function returns a non-negative value; in case of
242 * failure, a negative error code is returned. The success return codes use the
243 * following bits:
244 * o bit 0 is cleared: the first PEB (described by @seb) is newer than the
245 *   second PEB (described by @pnum and @vid_hdr);
246 * o bit 0 is set: the second PEB is newer;
247 * o bit 1 is cleared: no bit-flips were detected in the newer LEB;
248 * o bit 1 is set: bit-flips were detected in the newer LEB;
249 * o bit 2 is cleared: the older LEB is not corrupted;
250 * o bit 2 is set: the older LEB is corrupted.
251 * o bit 2 is set: the older LEB is corrupted. */
252static int compare_lebs(const struct ubi_device *ubi,
253 const struct ubi_scan_leb *seb, int pnum,
254 const struct ubi_vid_hdr *vid_hdr)
255{
256 void *buf;
257 int len, err, second_is_newer, bitflips = 0, corrupted = 0;
258 uint32_t data_crc, crc;
259 struct ubi_vid_hdr *vidh = NULL;
260 unsigned long long sqnum2 = ubi64_to_cpu(vid_hdr->sqnum);
261
262 if (seb->sqnum == 0 && sqnum2 == 0) {
263 long long abs, v1 = seb->leb_ver, v2 = ubi32_to_cpu(vid_hdr->leb_ver);
264
265 /*
266 * UBI constantly increases the logical eraseblock version
267 * number and it can overflow. Thus, we have to bear in mind
268		 * that versions that are close to %0xFFFFFFFF are less than
269		 * versions that are close to %0.
270		 *
271		 * The UBI WL unit guarantees that the number of pending tasks
272		 * is not greater than %0x7FFFFFFF. So, if the difference
273		 * between any two versions is greater than or equal to
274		 * %0x7FFFFFFF, there was an overflow and the logical
275		 * eraseblock with the lower version is actually newer than
276		 * the one with the higher version.
277 *
278 * FIXME: but this is anyway obsolete and will be removed at
279 * some point.
280 */
281
282 dbg_bld("using old crappy leb_ver stuff");
283
284 abs = v1 - v2;
285 if (abs < 0)
286 abs = -abs;
287
288 if (abs < 0x7FFFFFFF)
289 /* Non-overflow situation */
290 second_is_newer = (v2 > v1);
291 else
292 second_is_newer = (v2 < v1);
293 } else
294		/* Obviously the LEB with the lower sequence number is older */
295 second_is_newer = sqnum2 > seb->sqnum;
296
297 /*
298 * Now we know which copy is newer. If the copy flag of the PEB with
299 * newer version is not set, then we just return, otherwise we have to
300 * check data CRC. For the second PEB we already have the VID header,
301 * for the first one - we'll need to re-read it from flash.
302 *
303 * FIXME: this may be optimized so that we wouldn't read twice.
304 */
305
306 if (second_is_newer) {
307 if (!vid_hdr->copy_flag) {
308 /* It is not a copy, so it is newer */
309 dbg_bld("second PEB %d is newer, copy_flag is unset",
310 pnum);
311 return 1;
312 }
313 } else {
314 pnum = seb->pnum;
315
316 vidh = ubi_zalloc_vid_hdr(ubi);
317 if (!vidh)
318 return -ENOMEM;
319
320 err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0);
321 if (err) {
322 if (err == UBI_IO_BITFLIPS)
323 bitflips = 1;
324 else {
325 dbg_err("VID of PEB %d header is bad, but it "
326 "was OK earlier", pnum);
327 if (err > 0)
328 err = -EIO;
329
330 goto out_free_vidh;
331 }
332 }
333
334 if (!vidh->copy_flag) {
335 /* It is not a copy, so it is newer */
336 dbg_bld("first PEB %d is newer, copy_flag is unset",
337 pnum);
338 err = bitflips << 1;
339 goto out_free_vidh;
340 }
341
342 vid_hdr = vidh;
343 }
344
345 /* Read the data of the copy and check the CRC */
346
347 len = ubi32_to_cpu(vid_hdr->data_size);
348 buf = kmalloc(len, GFP_KERNEL);
349 if (!buf) {
350 err = -ENOMEM;
351 goto out_free_vidh;
352 }
353
354 err = ubi_io_read_data(ubi, buf, pnum, 0, len);
355 if (err && err != UBI_IO_BITFLIPS)
356 goto out_free_buf;
357
358 data_crc = ubi32_to_cpu(vid_hdr->data_crc);
359 crc = crc32(UBI_CRC32_INIT, buf, len);
360 if (crc != data_crc) {
361 dbg_bld("PEB %d CRC error: calculated %#08x, must be %#08x",
362 pnum, crc, data_crc);
363 corrupted = 1;
364 bitflips = 0;
365 second_is_newer = !second_is_newer;
366 } else {
367 dbg_bld("PEB %d CRC is OK", pnum);
368 bitflips = !!err;
369 }
370
371 kfree(buf);
372 ubi_free_vid_hdr(ubi, vidh);
373
374 if (second_is_newer)
375 dbg_bld("second PEB %d is newer, copy_flag is set", pnum);
376 else
377 dbg_bld("first PEB %d is newer, copy_flag is set", pnum);
378
379 return second_is_newer | (bitflips << 1) | (corrupted << 2);
380
381out_free_buf:
382 kfree(buf);
383out_free_vidh:
384 ubi_free_vid_hdr(ubi, vidh);
385 ubi_assert(err < 0);
386 return err;
387}
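A caller decodes the bit-encoded result roughly as follows; the local variable names in this sketch are invented, and ubi_scan_add_used() below does this for real:

	/* Sketch: acting on the compare_lebs() result bits */
	cmp_res = compare_lebs(ubi, seb, pnum, vid_hdr);
	if (cmp_res < 0)
		return cmp_res;			/* hard failure */
	if (cmp_res & 1)
		newer_pnum = pnum;		/* bit 0: the second copy won */
	else
		newer_pnum = seb->pnum;
	scrub = !!(cmp_res & 2);		/* bit 1: newer copy had bit-flips */
	older_is_corrupted = !!(cmp_res & 4);	/* bit 2: older copy failed CRC */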
388
389/**
390 * ubi_scan_add_used - add information about a physical eraseblock to the
391 * scanning information.
392 * @ubi: UBI device description object
393 * @si: scanning information
394 * @pnum: the physical eraseblock number
395 * @ec: erase counter
396 * @vid_hdr: the volume identifier header
397 * @bitflips: if bit-flips were detected when this physical eraseblock was read
398 *
399 * This function returns zero in case of success and a negative error code in
400 * case of failure.
401 */
402int ubi_scan_add_used(const struct ubi_device *ubi, struct ubi_scan_info *si,
403 int pnum, int ec, const struct ubi_vid_hdr *vid_hdr,
404 int bitflips)
405{
406 int err, vol_id, lnum;
407 uint32_t leb_ver;
408 unsigned long long sqnum;
409 struct ubi_scan_volume *sv;
410 struct ubi_scan_leb *seb;
411 struct rb_node **p, *parent = NULL;
412
413 vol_id = ubi32_to_cpu(vid_hdr->vol_id);
414 lnum = ubi32_to_cpu(vid_hdr->lnum);
415 sqnum = ubi64_to_cpu(vid_hdr->sqnum);
416 leb_ver = ubi32_to_cpu(vid_hdr->leb_ver);
417
418 dbg_bld("PEB %d, LEB %d:%d, EC %d, sqnum %llu, ver %u, bitflips %d",
419 pnum, vol_id, lnum, ec, sqnum, leb_ver, bitflips);
420
421 sv = add_volume(si, vol_id, pnum, vid_hdr);
422	if (IS_ERR(sv))
423 return PTR_ERR(sv);
424
425 /*
426	 * Walk the RB-tree of logical eraseblocks of volume @vol_id to see
427	 * whether this is the first instance of this logical eraseblock.
428 */
429 p = &sv->root.rb_node;
430 while (*p) {
431 int cmp_res;
432
433 parent = *p;
434 seb = rb_entry(parent, struct ubi_scan_leb, u.rb);
435 if (lnum != seb->lnum) {
436 if (lnum < seb->lnum)
437 p = &(*p)->rb_left;
438 else
439 p = &(*p)->rb_right;
440 continue;
441 }
442
443 /*
444 * There is already a physical eraseblock describing the same
445 * logical eraseblock present.
446 */
447
448 dbg_bld("this LEB already exists: PEB %d, sqnum %llu, "
449 "LEB ver %u, EC %d", seb->pnum, seb->sqnum,
450 seb->leb_ver, seb->ec);
451
452 /*
453 * Make sure that the logical eraseblocks have different
454 * versions. Otherwise the image is bad.
455 */
456 if (seb->leb_ver == leb_ver && leb_ver != 0) {
457 ubi_err("two LEBs with same version %u", leb_ver);
458 ubi_dbg_dump_seb(seb, 0);
459 ubi_dbg_dump_vid_hdr(vid_hdr);
460 return -EINVAL;
461 }
462
463 /*
464 * Make sure that the logical eraseblocks have different
465 * sequence numbers. Otherwise the image is bad.
466 *
467 * FIXME: remove 'sqnum != 0' check when leb_ver is removed.
468 */
469 if (seb->sqnum == sqnum && sqnum != 0) {
470 ubi_err("two LEBs with same sequence number %llu",
471 sqnum);
472 ubi_dbg_dump_seb(seb, 0);
473 ubi_dbg_dump_vid_hdr(vid_hdr);
474 return -EINVAL;
475 }
476
477 /*
478 * Now we have to drop the older one and preserve the newer
479 * one.
480 */
481 cmp_res = compare_lebs(ubi, seb, pnum, vid_hdr);
482 if (cmp_res < 0)
483 return cmp_res;
484
485 if (cmp_res & 1) {
486 /*
487			 * This logical eraseblock is newer than the one
488 * found earlier.
489 */
490 err = validate_vid_hdr(vid_hdr, sv, pnum);
491 if (err)
492 return err;
493
494 if (cmp_res & 4)
495 err = ubi_scan_add_to_list(si, seb->pnum,
496 seb->ec, &si->corr);
497 else
498 err = ubi_scan_add_to_list(si, seb->pnum,
499 seb->ec, &si->erase);
500 if (err)
501 return err;
502
503 seb->ec = ec;
504 seb->pnum = pnum;
505 seb->scrub = ((cmp_res & 2) || bitflips);
506 seb->sqnum = sqnum;
507 seb->leb_ver = leb_ver;
508
509 if (sv->highest_lnum == lnum)
510 sv->last_data_size =
511 ubi32_to_cpu(vid_hdr->data_size);
512
513 return 0;
514 } else {
515 /*
516			 * This logical eraseblock is older than the one found
517 * previously.
518 */
519 if (cmp_res & 4)
520 return ubi_scan_add_to_list(si, pnum, ec,
521 &si->corr);
522 else
523 return ubi_scan_add_to_list(si, pnum, ec,
524 &si->erase);
525 }
526 }
527
528 /*
529 * We've met this logical eraseblock for the first time, add it to the
530 * scanning information.
531 */
532
533 err = validate_vid_hdr(vid_hdr, sv, pnum);
534 if (err)
535 return err;
536
537 seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL);
538 if (!seb)
539 return -ENOMEM;
540
541 seb->ec = ec;
542 seb->pnum = pnum;
543 seb->lnum = lnum;
544 seb->sqnum = sqnum;
545 seb->scrub = bitflips;
546 seb->leb_ver = leb_ver;
547
548 if (sv->highest_lnum <= lnum) {
549 sv->highest_lnum = lnum;
550 sv->last_data_size = ubi32_to_cpu(vid_hdr->data_size);
551 }
552
553 if (si->max_sqnum < sqnum)
554 si->max_sqnum = sqnum;
555
556 sv->leb_count += 1;
557 rb_link_node(&seb->u.rb, parent, p);
558 rb_insert_color(&seb->u.rb, &sv->root);
559 return 0;
560}
561
562/**
563 * ubi_scan_find_sv - find information about a particular volume in the
564 * scanning information.
565 * @si: scanning information
566 * @vol_id: the requested volume ID
567 *
568 * This function returns a pointer to the volume description or %NULL if there
569 * are no data about this volume in the scanning information.
570 */
571struct ubi_scan_volume *ubi_scan_find_sv(const struct ubi_scan_info *si,
572 int vol_id)
573{
574 struct ubi_scan_volume *sv;
575 struct rb_node *p = si->volumes.rb_node;
576
577 while (p) {
578 sv = rb_entry(p, struct ubi_scan_volume, rb);
579
580 if (vol_id == sv->vol_id)
581 return sv;
582
583 if (vol_id > sv->vol_id)
584 p = p->rb_left;
585 else
586 p = p->rb_right;
587 }
588
589 return NULL;
590}
591
592/**
593 * ubi_scan_find_seb - find information about a particular logical
594 * eraseblock in the volume scanning information.
595 * @sv: a pointer to the volume scanning information
596 * @lnum: the requested logical eraseblock
597 *
598 * This function returns a pointer to the scanning logical eraseblock or %NULL
599 * if there are no data about it in the scanning volume information.
600 */
601struct ubi_scan_leb *ubi_scan_find_seb(const struct ubi_scan_volume *sv,
602 int lnum)
603{
604 struct ubi_scan_leb *seb;
605 struct rb_node *p = sv->root.rb_node;
606
607 while (p) {
608 seb = rb_entry(p, struct ubi_scan_leb, u.rb);
609
610 if (lnum == seb->lnum)
611 return seb;
612
613		if (lnum < seb->lnum)
614 p = p->rb_left;
615 else
616 p = p->rb_right;
617 }
618
619 return NULL;
620}
621
622/**
623 * ubi_scan_rm_volume - delete scanning information about a volume.
624 * @si: scanning information
625 * @sv: the volume scanning information to delete
626 */
627void ubi_scan_rm_volume(struct ubi_scan_info *si, struct ubi_scan_volume *sv)
628{
629 struct rb_node *rb;
630 struct ubi_scan_leb *seb;
631
632 dbg_bld("remove scanning information about volume %d", sv->vol_id);
633
634 while ((rb = rb_first(&sv->root))) {
635 seb = rb_entry(rb, struct ubi_scan_leb, u.rb);
636 rb_erase(&seb->u.rb, &sv->root);
637 list_add_tail(&seb->u.list, &si->erase);
638 }
639
640 rb_erase(&sv->rb, &si->volumes);
641 kfree(sv);
642 si->vols_found -= 1;
643}
644
645/**
646 * ubi_scan_erase_peb - erase a physical eraseblock.
647 * @ubi: UBI device description object
648 * @si: scanning information
649 * @pnum: physical eraseblock number to erase;
650 * @ec: erase counter value to write (%UBI_SCAN_UNKNOWN_EC if it is unknown)
651 *
652 * This function erases physical eraseblock 'pnum', and writes the erase
653 * counter header to it. This function should only be used on UBI device
654 * initialization stages, when the EBA unit has not been initialized yet. This
655 * function returns zero in case of success and a negative error code in case
656 * of failure.
657 */
658int ubi_scan_erase_peb(const struct ubi_device *ubi,
659 const struct ubi_scan_info *si, int pnum, int ec)
660{
661 int err;
662 struct ubi_ec_hdr *ec_hdr;
663
664	if ((long long)ec >= UBI_MAX_ERASECOUNTER) {
665		/*
666		 * Erase counter overflow. Upgrade UBI and use 64-bit
667		 * erase counters internally.
668		 */
669		ubi_err("erase counter overflow at PEB %d, EC %d", pnum, ec);
670		return -EINVAL;
671	}
672
673	ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
674	if (!ec_hdr)
675		return -ENOMEM;
676
677	ec_hdr->ec = cpu_to_ubi64(ec);
678
679 err = ubi_io_sync_erase(ubi, pnum, 0);
680 if (err < 0)
681 goto out_free;
682
683 err = ubi_io_write_ec_hdr(ubi, pnum, ec_hdr);
684
685out_free:
686 kfree(ec_hdr);
687 return err;
688}
689
690/**
691 * ubi_scan_get_free_peb - get a free physical eraseblock.
692 * @ubi: UBI device description object
693 * @si: scanning information
694 *
695 * This function returns a free physical eraseblock. It is supposed to be
696 * called on the UBI initialization stages when the wear-leveling unit is not
697 * initialized yet. This function picks a physical eraseblock from one of the
698 * lists, writes the EC header if it is needed, and removes it from the list.
699 *
700 * This function returns scanning physical eraseblock information in case of
701 * success and an error code in case of failure.
702 */
703struct ubi_scan_leb *ubi_scan_get_free_peb(const struct ubi_device *ubi,
704 struct ubi_scan_info *si)
705{
706 int err = 0, i;
707 struct ubi_scan_leb *seb;
708
709 if (!list_empty(&si->free)) {
710 seb = list_entry(si->free.next, struct ubi_scan_leb, u.list);
711 list_del(&seb->u.list);
712 dbg_bld("return free PEB %d, EC %d", seb->pnum, seb->ec);
713 return seb;
714 }
715
716 for (i = 0; i < 2; i++) {
717 struct list_head *head;
718 struct ubi_scan_leb *tmp_seb;
719
720 if (i == 0)
721 head = &si->erase;
722 else
723 head = &si->corr;
724
725 /*
726 * We try to erase the first physical eraseblock from the @head
727 * list and pick it if we succeed, or try to erase the
728		 * next one if not. And so forth. We do not worry about bad
729		 * eraseblocks here - they will be handled later.
730 */
731 list_for_each_entry_safe(seb, tmp_seb, head, u.list) {
732 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
733 seb->ec = si->mean_ec;
734
735 err = ubi_scan_erase_peb(ubi, si, seb->pnum, seb->ec+1);
736 if (err)
737 continue;
738
739 seb->ec += 1;
740 list_del(&seb->u.list);
741 dbg_bld("return PEB %d, EC %d", seb->pnum, seb->ec);
742 return seb;
743 }
744 }
745
746 ubi_err("no eraseblocks found");
747 return ERR_PTR(-ENOSPC);
748}
749
750/**
751 * process_eb - read UBI headers, check them and add corresponding data
752 * to the scanning information.
753 * @ubi: UBI device description object
754 * @si: scanning information
755 * @pnum: the physical eraseblock number
756 *
757 * This function returns zero if the physical eraseblock was successfully
758 * handled and a negative error code in case of failure.
759 */
760static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum)
761{
762 long long ec;
763 int err, bitflips = 0, vol_id, ec_corr = 0;
764
765 dbg_bld("scan PEB %d", pnum);
766
767 /* Skip bad physical eraseblocks */
768 err = ubi_io_is_bad(ubi, pnum);
769 if (err < 0)
770 return err;
771 else if (err) {
772 /*
773		 * FIXME: it is actually the duty of the I/O unit to initialize
774 * this, but MTD does not provide enough information.
775 */
776 si->bad_peb_count += 1;
777 return 0;
778 }
779
780 err = ubi_io_read_ec_hdr(ubi, pnum, ech, 0);
781 if (err < 0)
782 return err;
783 else if (err == UBI_IO_BITFLIPS)
784 bitflips = 1;
785 else if (err == UBI_IO_PEB_EMPTY)
786 return ubi_scan_add_to_list(si, pnum, UBI_SCAN_UNKNOWN_EC,
787 &si->erase);
788 else if (err == UBI_IO_BAD_EC_HDR) {
789 /*
790 * We have to also look at the VID header, possibly it is not
791 * corrupted. Set %bitflips flag in order to make this PEB be
792 * moved and EC be re-created.
793 */
794 ec_corr = 1;
795 ec = UBI_SCAN_UNKNOWN_EC;
796 bitflips = 1;
797 }
798
799 si->is_empty = 0;
800
801 if (!ec_corr) {
802 /* Make sure UBI version is OK */
803 if (ech->version != UBI_VERSION) {
804 ubi_err("this UBI version is %d, image version is %d",
805 UBI_VERSION, (int)ech->version);
806 return -EINVAL;
807 }
808
809 ec = ubi64_to_cpu(ech->ec);
810 if (ec > UBI_MAX_ERASECOUNTER) {
811 /*
812 * Erase counter overflow. The EC headers have 64 bits
813 * reserved, but we anyway make use of only 31 bit
814 * values, as this seems to be enough for any existing
815 * flash. Upgrade UBI and use 64-bit erase counters
816 * internally.
817 */
818 ubi_err("erase counter overflow, max is %d",
819 UBI_MAX_ERASECOUNTER);
820 ubi_dbg_dump_ec_hdr(ech);
821 return -EINVAL;
822 }
823 }
824
825	/* OK, we are done with the EC header, let's look at the VID header */
826
827 err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0);
828 if (err < 0)
829 return err;
830 else if (err == UBI_IO_BITFLIPS)
831 bitflips = 1;
832 else if (err == UBI_IO_BAD_VID_HDR ||
833 (err == UBI_IO_PEB_FREE && ec_corr)) {
834 /* VID header is corrupted */
835 err = ubi_scan_add_to_list(si, pnum, ec, &si->corr);
836 if (err)
837 return err;
838 goto adjust_mean_ec;
839 } else if (err == UBI_IO_PEB_FREE) {
840 /* No VID header - the physical eraseblock is free */
841 err = ubi_scan_add_to_list(si, pnum, ec, &si->free);
842 if (err)
843 return err;
844 goto adjust_mean_ec;
845 }
846
847 vol_id = ubi32_to_cpu(vidh->vol_id);
848 if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOL_ID) {
849 int lnum = ubi32_to_cpu(vidh->lnum);
850
851 /* Unsupported internal volume */
852 switch (vidh->compat) {
853 case UBI_COMPAT_DELETE:
854 ubi_msg("\"delete\" compatible internal volume %d:%d"
855 " found, remove it", vol_id, lnum);
856 err = ubi_scan_add_to_list(si, pnum, ec, &si->corr);
857 if (err)
858 return err;
859 break;
860
861 case UBI_COMPAT_RO:
862 ubi_msg("read-only compatible internal volume %d:%d"
863 " found, switch to read-only mode",
864 vol_id, lnum);
865 ubi->ro_mode = 1;
866 break;
867
868 case UBI_COMPAT_PRESERVE:
869 ubi_msg("\"preserve\" compatible internal volume %d:%d"
870 " found", vol_id, lnum);
871 err = ubi_scan_add_to_list(si, pnum, ec, &si->alien);
872 if (err)
873 return err;
874 si->alien_peb_count += 1;
875 return 0;
876
877 case UBI_COMPAT_REJECT:
878 ubi_err("incompatible internal volume %d:%d found",
879 vol_id, lnum);
880 return -EINVAL;
881 }
882 }
883
884 /* Both UBI headers seem to be fine */
885 err = ubi_scan_add_used(ubi, si, pnum, ec, vidh, bitflips);
886 if (err)
887 return err;
888
889adjust_mean_ec:
890 if (!ec_corr) {
891 if (si->ec_sum + ec < ec) {
892 commit_to_mean_value(si);
893 si->ec_sum = 0;
894 si->ec_count = 0;
895 } else {
896 si->ec_sum += ec;
897 si->ec_count += 1;
898 }
899
900 if (ec > si->max_ec)
901 si->max_ec = ec;
902 if (ec < si->min_ec)
903 si->min_ec = ec;
904 }
905
906 return 0;
907}
908
909/**
910 * ubi_scan - scan an MTD device.
911 * @ubi: UBI device description object
912 *
913 * This function does full scanning of an MTD device and returns complete
914 * information about it. In case of failure, an error pointer is returned.
915 */
916struct ubi_scan_info *ubi_scan(struct ubi_device *ubi)
917{
918 int err, pnum;
919 struct rb_node *rb1, *rb2;
920 struct ubi_scan_volume *sv;
921 struct ubi_scan_leb *seb;
922 struct ubi_scan_info *si;
923
924 si = kzalloc(sizeof(struct ubi_scan_info), GFP_KERNEL);
925 if (!si)
926 return ERR_PTR(-ENOMEM);
927
928 INIT_LIST_HEAD(&si->corr);
929 INIT_LIST_HEAD(&si->free);
930 INIT_LIST_HEAD(&si->erase);
931 INIT_LIST_HEAD(&si->alien);
932 si->volumes = RB_ROOT;
933 si->is_empty = 1;
934
935 err = -ENOMEM;
936 ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
937 if (!ech)
938 goto out_si;
939
940 vidh = ubi_zalloc_vid_hdr(ubi);
941 if (!vidh)
942 goto out_ech;
943
944 for (pnum = 0; pnum < ubi->peb_count; pnum++) {
945 cond_resched();
946
947 dbg_msg("process PEB %d", pnum);
948 err = process_eb(ubi, si, pnum);
949 if (err < 0)
950 goto out_vidh;
951 }
952
953 dbg_msg("scanning is finished");
954
955 /* Finish mean erase counter calculations */
956 if (si->ec_count)
957 commit_to_mean_value(si);
958
959 if (si->is_empty)
960 ubi_msg("empty MTD device detected");
961
962 /*
963 * In case of unknown erase counter we use the mean erase counter
964 * value.
965 */
966 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
967 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)
968 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
969 seb->ec = si->mean_ec;
970 }
971
972 list_for_each_entry(seb, &si->free, u.list) {
973 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
974 seb->ec = si->mean_ec;
975 }
976
977 list_for_each_entry(seb, &si->corr, u.list)
978 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
979 seb->ec = si->mean_ec;
980
981 list_for_each_entry(seb, &si->erase, u.list)
982 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
983 seb->ec = si->mean_ec;
984
985 err = paranoid_check_si(ubi, si);
986 if (err) {
987 if (err > 0)
988 err = -EINVAL;
989 goto out_vidh;
990 }
991
992 ubi_free_vid_hdr(ubi, vidh);
993 kfree(ech);
994
995 return si;
996
997out_vidh:
998 ubi_free_vid_hdr(ubi, vidh);
999out_ech:
1000 kfree(ech);
1001out_si:
1002 ubi_scan_destroy_si(si);
1003 return ERR_PTR(err);
1004}
1005
1006/**
1007 * destroy_sv - free the scanning volume information
1008 * @sv: scanning volume information
1009 *
1010 * This function destroys the volume RB-tree (@sv->root) and the scanning
1011 * volume information.
1012 */
1013static void destroy_sv(struct ubi_scan_volume *sv)
1014{
1015 struct ubi_scan_leb *seb;
1016 struct rb_node *this = sv->root.rb_node;
1017
1018 while (this) {
1019 if (this->rb_left)
1020 this = this->rb_left;
1021 else if (this->rb_right)
1022 this = this->rb_right;
1023 else {
1024 seb = rb_entry(this, struct ubi_scan_leb, u.rb);
1025 this = rb_parent(this);
1026 if (this) {
1027 if (this->rb_left == &seb->u.rb)
1028 this->rb_left = NULL;
1029 else
1030 this->rb_right = NULL;
1031 }
1032
1033 kfree(seb);
1034 }
1035 }
1036 kfree(sv);
1037}
1038
1039/**
1040 * ubi_scan_destroy_si - destroy scanning information.
1041 * @si: scanning information
1042 */
1043void ubi_scan_destroy_si(struct ubi_scan_info *si)
1044{
1045 struct ubi_scan_leb *seb, *seb_tmp;
1046 struct ubi_scan_volume *sv;
1047 struct rb_node *rb;
1048
1049 list_for_each_entry_safe(seb, seb_tmp, &si->alien, u.list) {
1050 list_del(&seb->u.list);
1051 kfree(seb);
1052 }
1053 list_for_each_entry_safe(seb, seb_tmp, &si->erase, u.list) {
1054 list_del(&seb->u.list);
1055 kfree(seb);
1056 }
1057 list_for_each_entry_safe(seb, seb_tmp, &si->corr, u.list) {
1058 list_del(&seb->u.list);
1059 kfree(seb);
1060 }
1061 list_for_each_entry_safe(seb, seb_tmp, &si->free, u.list) {
1062 list_del(&seb->u.list);
1063 kfree(seb);
1064 }
1065
1066 /* Destroy the volume RB-tree */
1067 rb = si->volumes.rb_node;
1068 while (rb) {
1069 if (rb->rb_left)
1070 rb = rb->rb_left;
1071 else if (rb->rb_right)
1072 rb = rb->rb_right;
1073 else {
1074 sv = rb_entry(rb, struct ubi_scan_volume, rb);
1075
1076 rb = rb_parent(rb);
1077 if (rb) {
1078 if (rb->rb_left == &sv->rb)
1079 rb->rb_left = NULL;
1080 else
1081 rb->rb_right = NULL;
1082 }
1083
1084 destroy_sv(sv);
1085 }
1086 }
1087
1088 kfree(si);
1089}
1090
1091#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
1092
1093/**
1094 * paranoid_check_si - check if the scanning information is correct and
1095 * consistent.
1096 * @ubi: UBI device description object
1097 * @si: scanning information
1098 *
1099 * This function returns zero if the scanning information is all right, %1 if
1100 * not and a negative error code if an error occurred.
1101 */
1102static int paranoid_check_si(const struct ubi_device *ubi,
1103 struct ubi_scan_info *si)
1104{
1105 int pnum, err, vols_found = 0;
1106 struct rb_node *rb1, *rb2;
1107 struct ubi_scan_volume *sv;
1108 struct ubi_scan_leb *seb, *last_seb;
1109 uint8_t *buf;
1110
1111 /*
1112 * At first, check that scanning information is ok.
1113 */
1114 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
1115 int leb_count = 0;
1116
1117 cond_resched();
1118
1119 vols_found += 1;
1120
1121 if (si->is_empty) {
1122 ubi_err("bad is_empty flag");
1123 goto bad_sv;
1124 }
1125
1126 if (sv->vol_id < 0 || sv->highest_lnum < 0 ||
1127 sv->leb_count < 0 || sv->vol_type < 0 || sv->used_ebs < 0 ||
1128 sv->data_pad < 0 || sv->last_data_size < 0) {
1129 ubi_err("negative values");
1130 goto bad_sv;
1131 }
1132
1133 if (sv->vol_id >= UBI_MAX_VOLUMES &&
1134 sv->vol_id < UBI_INTERNAL_VOL_START) {
1135 ubi_err("bad vol_id");
1136 goto bad_sv;
1137 }
1138
1139 if (sv->vol_id > si->highest_vol_id) {
1140 ubi_err("highest_vol_id is %d, but vol_id %d is there",
1141 si->highest_vol_id, sv->vol_id);
1142 goto out;
1143 }
1144
1145 if (sv->vol_type != UBI_DYNAMIC_VOLUME &&
1146 sv->vol_type != UBI_STATIC_VOLUME) {
1147 ubi_err("bad vol_type");
1148 goto bad_sv;
1149 }
1150
1151 if (sv->data_pad > ubi->leb_size / 2) {
1152 ubi_err("bad data_pad");
1153 goto bad_sv;
1154 }
1155
1156 last_seb = NULL;
1157 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) {
1158 cond_resched();
1159
1160 last_seb = seb;
1161 leb_count += 1;
1162
1163 if (seb->pnum < 0 || seb->ec < 0) {
1164 ubi_err("negative values");
1165 goto bad_seb;
1166 }
1167
1168 if (seb->ec < si->min_ec) {
1169 ubi_err("bad si->min_ec (%d), %d found",
1170 si->min_ec, seb->ec);
1171 goto bad_seb;
1172 }
1173
1174 if (seb->ec > si->max_ec) {
1175 ubi_err("bad si->max_ec (%d), %d found",
1176 si->max_ec, seb->ec);
1177 goto bad_seb;
1178 }
1179
1180 if (seb->pnum >= ubi->peb_count) {
1181 ubi_err("too high PEB number %d, total PEBs %d",
1182 seb->pnum, ubi->peb_count);
1183 goto bad_seb;
1184 }
1185
1186 if (sv->vol_type == UBI_STATIC_VOLUME) {
1187 if (seb->lnum >= sv->used_ebs) {
1188 ubi_err("bad lnum or used_ebs");
1189 goto bad_seb;
1190 }
1191 } else {
1192 if (sv->used_ebs != 0) {
1193 ubi_err("non-zero used_ebs");
1194 goto bad_seb;
1195 }
1196 }
1197
1198 if (seb->lnum > sv->highest_lnum) {
1199 ubi_err("incorrect highest_lnum or lnum");
1200 goto bad_seb;
1201 }
1202 }
1203
1204 if (sv->leb_count != leb_count) {
1205 ubi_err("bad leb_count, %d objects in the tree",
1206 leb_count);
1207 goto bad_sv;
1208 }
1209
1210 if (!last_seb)
1211 continue;
1212
1213 seb = last_seb;
1214
1215 if (seb->lnum != sv->highest_lnum) {
1216 ubi_err("bad highest_lnum");
1217 goto bad_seb;
1218 }
1219 }
1220
1221 if (vols_found != si->vols_found) {
1222 ubi_err("bad si->vols_found %d, should be %d",
1223 si->vols_found, vols_found);
1224 goto out;
1225 }
1226
1227 /* Check that scanning information is correct */
1228 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
1229 last_seb = NULL;
1230 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) {
1231 int vol_type;
1232
1233 cond_resched();
1234
1235 last_seb = seb;
1236
1237 err = ubi_io_read_vid_hdr(ubi, seb->pnum, vidh, 1);
1238 if (err && err != UBI_IO_BITFLIPS) {
1239 ubi_err("VID header is not OK (%d)", err);
1240 if (err > 0)
1241 err = -EIO;
1242 return err;
1243 }
1244
1245 vol_type = vidh->vol_type == UBI_VID_DYNAMIC ?
1246 UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME;
1247 if (sv->vol_type != vol_type) {
1248 ubi_err("bad vol_type");
1249 goto bad_vid_hdr;
1250 }
1251
1252 if (seb->sqnum != ubi64_to_cpu(vidh->sqnum)) {
1253 ubi_err("bad sqnum %llu", seb->sqnum);
1254 goto bad_vid_hdr;
1255 }
1256
1257 if (sv->vol_id != ubi32_to_cpu(vidh->vol_id)) {
1258 ubi_err("bad vol_id %d", sv->vol_id);
1259 goto bad_vid_hdr;
1260 }
1261
1262 if (sv->compat != vidh->compat) {
1263 ubi_err("bad compat %d", vidh->compat);
1264 goto bad_vid_hdr;
1265 }
1266
1267 if (seb->lnum != ubi32_to_cpu(vidh->lnum)) {
1268 ubi_err("bad lnum %d", seb->lnum);
1269 goto bad_vid_hdr;
1270 }
1271
1272 if (sv->used_ebs != ubi32_to_cpu(vidh->used_ebs)) {
1273 ubi_err("bad used_ebs %d", sv->used_ebs);
1274 goto bad_vid_hdr;
1275 }
1276
1277 if (sv->data_pad != ubi32_to_cpu(vidh->data_pad)) {
1278 ubi_err("bad data_pad %d", sv->data_pad);
1279 goto bad_vid_hdr;
1280 }
1281
1282 if (seb->leb_ver != ubi32_to_cpu(vidh->leb_ver)) {
1283 ubi_err("bad leb_ver %u", seb->leb_ver);
1284 goto bad_vid_hdr;
1285 }
1286 }
1287
1288 if (!last_seb)
1289 continue;
1290
1291 if (sv->highest_lnum != ubi32_to_cpu(vidh->lnum)) {
1292 ubi_err("bad highest_lnum %d", sv->highest_lnum);
1293 goto bad_vid_hdr;
1294 }
1295
1296 if (sv->last_data_size != ubi32_to_cpu(vidh->data_size)) {
1297 ubi_err("bad last_data_size %d", sv->last_data_size);
1298 goto bad_vid_hdr;
1299 }
1300 }
1301
1302 /*
1303 * Make sure that all the physical eraseblocks are in one of the lists
1304 * or trees.
1305 */
1306 buf = kmalloc(ubi->peb_count, GFP_KERNEL);
1307 if (!buf)
1308 return -ENOMEM;
1309
1310 memset(buf, 1, ubi->peb_count);
1311 for (pnum = 0; pnum < ubi->peb_count; pnum++) {
1312 err = ubi_io_is_bad(ubi, pnum);
1313		if (err < 0) {
1314			kfree(buf); return err;
1315		} else if (err)
1316			buf[pnum] = 0;
1317 }
1318
1319 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb)
1320 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)
1321 buf[seb->pnum] = 0;
1322
1323 list_for_each_entry(seb, &si->free, u.list)
1324 buf[seb->pnum] = 0;
1325
1326 list_for_each_entry(seb, &si->corr, u.list)
1327 buf[seb->pnum] = 0;
1328
1329 list_for_each_entry(seb, &si->erase, u.list)
1330 buf[seb->pnum] = 0;
1331
1332 list_for_each_entry(seb, &si->alien, u.list)
1333 buf[seb->pnum] = 0;
1334
1335 err = 0;
1336 for (pnum = 0; pnum < ubi->peb_count; pnum++)
1337 if (buf[pnum]) {
1338			ubi_err("PEB %d is not referenced", pnum);
1339 err = 1;
1340 }
1341
1342 kfree(buf);
1343 if (err)
1344 goto out;
1345 return 0;
1346
1347bad_seb:
1348 ubi_err("bad scanning information about LEB %d", seb->lnum);
1349 ubi_dbg_dump_seb(seb, 0);
1350 ubi_dbg_dump_sv(sv);
1351 goto out;
1352
1353bad_sv:
1354 ubi_err("bad scanning information about volume %d", sv->vol_id);
1355 ubi_dbg_dump_sv(sv);
1356 goto out;
1357
1358bad_vid_hdr:
1359 ubi_err("bad scanning information about volume %d", sv->vol_id);
1360 ubi_dbg_dump_sv(sv);
1361 ubi_dbg_dump_vid_hdr(vidh);
1362
1363out:
1364 ubi_dbg_dump_stack();
1365 return 1;
1366}
1367
1368#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */
diff --git a/drivers/mtd/ubi/scan.h b/drivers/mtd/ubi/scan.h
new file mode 100644
index 000000000000..3949f6192c76
--- /dev/null
+++ b/drivers/mtd/ubi/scan.h
@@ -0,0 +1,167 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21#ifndef __UBI_SCAN_H__
22#define __UBI_SCAN_H__
23
24/* The erase counter value for this physical eraseblock is unknown */
25#define UBI_SCAN_UNKNOWN_EC (-1)
26
27/**
28 * struct ubi_scan_leb - scanning information about a physical eraseblock.
29 * @ec: erase counter (%UBI_SCAN_UNKNOWN_EC if it is unknown)
30 * @pnum: physical eraseblock number
31 * @lnum: logical eraseblock number
32 * @scrub: if this physical eraseblock needs scrubbing
33 * @sqnum: sequence number
 34 * @u: union of RB-tree and list links
35 * @u.rb: link in the per-volume RB-tree of &struct ubi_scan_leb objects
36 * @u.list: link in one of the eraseblock lists
37 * @leb_ver: logical eraseblock version (obsolete)
38 *
39 * One object of this type is allocated for each physical eraseblock during
40 * scanning.
41 */
42struct ubi_scan_leb {
43 int ec;
44 int pnum;
45 int lnum;
46 int scrub;
47 unsigned long long sqnum;
48 union {
49 struct rb_node rb;
50 struct list_head list;
51 } u;
52 uint32_t leb_ver;
53};
54
55/**
56 * struct ubi_scan_volume - scanning information about a volume.
57 * @vol_id: volume ID
58 * @highest_lnum: highest logical eraseblock number in this volume
59 * @leb_count: number of logical eraseblocks in this volume
60 * @vol_type: volume type
61 * @used_ebs: number of used logical eraseblocks in this volume (only for
62 * static volumes)
63 * @last_data_size: amount of data in the last logical eraseblock of this
 64 * volume (always equal to the usable logical eraseblock size in case of
65 * dynamic volumes)
66 * @data_pad: how many bytes at the end of logical eraseblocks of this volume
67 * are not used (due to volume alignment)
68 * @compat: compatibility flags of this volume
69 * @rb: link in the volume RB-tree
 70 * @root: root of the RB-tree containing all the eraseblocks belonging to this
71 * volume (&struct ubi_scan_leb objects)
72 *
73 * One object of this type is allocated for each volume during scanning.
74 */
75struct ubi_scan_volume {
76 int vol_id;
77 int highest_lnum;
78 int leb_count;
79 int vol_type;
80 int used_ebs;
81 int last_data_size;
82 int data_pad;
83 int compat;
84 struct rb_node rb;
85 struct rb_root root;
86};
87
88/**
89 * struct ubi_scan_info - UBI scanning information.
90 * @volumes: root of the volume RB-tree
91 * @corr: list of corrupted physical eraseblocks
92 * @free: list of free physical eraseblocks
93 * @erase: list of physical eraseblocks which have to be erased
94 * @alien: list of physical eraseblocks which should not be used by UBI (e.g.,
 95 * those belonging to "preserve"-compatible internal volumes)
 96 * @bad_peb_count: count of bad physical eraseblocks
97 * @vols_found: number of volumes found during scanning
98 * @highest_vol_id: highest volume ID
99 * @alien_peb_count: count of physical eraseblocks in the @alien list
100 * @is_empty: flag indicating whether the MTD device is empty or not
101 * @min_ec: lowest erase counter value
102 * @max_ec: highest erase counter value
103 * @max_sqnum: highest sequence number value
104 * @mean_ec: mean erase counter value
105 * @ec_sum: a temporary variable used when calculating @mean_ec
106 * @ec_count: a temporary variable used when calculating @mean_ec
107 *
108 * This data structure contains the result of scanning and may be used by
109 * other UBI units to build the final UBI data structures, perform further
110 * error recovery, and so on.
111 */
112struct ubi_scan_info {
113 struct rb_root volumes;
114 struct list_head corr;
115 struct list_head free;
116 struct list_head erase;
117 struct list_head alien;
118 int bad_peb_count;
119 int vols_found;
120 int highest_vol_id;
121 int alien_peb_count;
122 int is_empty;
123 int min_ec;
124 int max_ec;
125 unsigned long long max_sqnum;
126 int mean_ec;
127 int ec_sum;
128 int ec_count;
129};
130
131struct ubi_device;
132struct ubi_vid_hdr;
133
134/**
135 * ubi_scan_move_to_list - move a physical eraseblock from the volume tree to a
136 * list.
137 *
138 * @sv: volume scanning information
139 * @seb: scanning eraseblock information
140 * @list: the list to move to
141 */
142static inline void ubi_scan_move_to_list(struct ubi_scan_volume *sv,
143 struct ubi_scan_leb *seb,
144 struct list_head *list)
145{
146 rb_erase(&seb->u.rb, &sv->root);
147 list_add_tail(&seb->u.list, list);
148}
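For illustration, the lookup helpers declared below can be combined like this sketch (the volume and LEB numbers are made-up example values, and @si is assumed to be a struct ubi_scan_info pointer):

	/* Sketch: find where LEB 5 of volume 3 lives according to scanning */
	struct ubi_scan_volume *sv = ubi_scan_find_sv(si, 3);

	if (sv) {
		struct ubi_scan_leb *seb = ubi_scan_find_seb(sv, 5);

		if (seb)
			dbg_bld("LEB 3:5 is on PEB %d, EC %d", seb->pnum, seb->ec);
	}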
149
150int ubi_scan_add_to_list(struct ubi_scan_info *si, int pnum, int ec,
151 struct list_head *list);
152int ubi_scan_add_used(const struct ubi_device *ubi, struct ubi_scan_info *si,
153 int pnum, int ec, const struct ubi_vid_hdr *vid_hdr,
154 int bitflips);
155struct ubi_scan_volume *ubi_scan_find_sv(const struct ubi_scan_info *si,
156 int vol_id);
157struct ubi_scan_leb *ubi_scan_find_seb(const struct ubi_scan_volume *sv,
158 int lnum);
159void ubi_scan_rm_volume(struct ubi_scan_info *si, struct ubi_scan_volume *sv);
160struct ubi_scan_leb *ubi_scan_get_free_peb(const struct ubi_device *ubi,
161 struct ubi_scan_info *si);
162int ubi_scan_erase_peb(const struct ubi_device *ubi,
163 const struct ubi_scan_info *si, int pnum, int ec);
164struct ubi_scan_info *ubi_scan(struct ubi_device *ubi);
165void ubi_scan_destroy_si(struct ubi_scan_info *si);
166
167#endif /* !__UBI_SCAN_H__ */
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
new file mode 100644
index 000000000000..feb647f108f0
--- /dev/null
+++ b/drivers/mtd/ubi/ubi.h
@@ -0,0 +1,535 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2006, 2007
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём)
20 */
21
22#ifndef __UBI_UBI_H__
23#define __UBI_UBI_H__
24
25#include <linux/init.h>
26#include <linux/types.h>
27#include <linux/list.h>
28#include <linux/rbtree.h>
29#include <linux/sched.h>
30#include <linux/wait.h>
31#include <linux/mutex.h>
32#include <linux/rwsem.h>
33#include <linux/spinlock.h>
34#include <linux/fs.h>
35#include <linux/cdev.h>
36#include <linux/device.h>
37#include <linux/string.h>
38#include <linux/mtd/mtd.h>
39
40#include <mtd/ubi-header.h>
41#include <linux/mtd/ubi.h>
42
43#include "scan.h"
44#include "debug.h"
45
46/* Maximum number of supported UBI devices */
47#define UBI_MAX_DEVICES 32
48
49/* UBI name used for character devices, sysfs, etc */
50#define UBI_NAME_STR "ubi"
51
52/* Normal UBI messages */
53#define ubi_msg(fmt, ...) printk(KERN_NOTICE "UBI: " fmt "\n", ##__VA_ARGS__)
54/* UBI warning messages */
55#define ubi_warn(fmt, ...) printk(KERN_WARNING "UBI warning: %s: " fmt "\n", \
56 __FUNCTION__, ##__VA_ARGS__)
57/* UBI error messages */
58#define ubi_err(fmt, ...) printk(KERN_ERR "UBI error: %s: " fmt "\n", \
59 __FUNCTION__, ##__VA_ARGS__)
60
 61/* Lowest number of PEBs reserved for bad PEB handling */
 62#define MIN_RESERVED_PEBS 2
63
64/* Background thread name pattern */
65#define UBI_BGT_NAME_PATTERN "ubi_bgt%dd"
66
 67/* This marker in the EBA table means that the LEB is un-mapped */
68#define UBI_LEB_UNMAPPED -1
69
70/*
71 * In case of errors, UBI tries to repeat the operation several times before
 72 * returning an error. The constant below defines how many times UBI retries.
73 */
74#define UBI_IO_RETRIES 3
75
76/*
77 * Error codes returned by the I/O unit.
78 *
79 * UBI_IO_PEB_EMPTY: the physical eraseblock is empty, i.e. it contains only
80 * 0xFF bytes
81 * UBI_IO_PEB_FREE: the physical eraseblock is free, i.e. it contains only a
82 * valid erase counter header, and the rest are %0xFF bytes
83 * UBI_IO_BAD_EC_HDR: the erase counter header is corrupted (bad magic or CRC)
84 * UBI_IO_BAD_VID_HDR: the volume identifier header is corrupted (bad magic or
85 * CRC)
86 * UBI_IO_BITFLIPS: bit-flips were detected and corrected
87 */
88enum {
89 UBI_IO_PEB_EMPTY = 1,
90 UBI_IO_PEB_FREE,
91 UBI_IO_BAD_EC_HDR,
92 UBI_IO_BAD_VID_HDR,
93 UBI_IO_BITFLIPS
94};
95
96extern int ubi_devices_cnt;
97extern struct ubi_device *ubi_devices[];
98
99struct ubi_volume_desc;
100
101/**
102 * struct ubi_volume - UBI volume description data structure.
103 * @dev: device object to make use of the Linux device model
104 * @cdev: character device object to create character device
105 * @ubi: reference to the UBI device description object
106 * @vol_id: volume ID
107 * @readers: number of users holding this volume in read-only mode
108 * @writers: number of users holding this volume in read-write mode
109 * @exclusive: whether somebody holds this volume in exclusive mode
110 * @removed: if the volume was removed
111 * @checked: if this static volume was checked
112 *
113 * @reserved_pebs: how many physical eraseblocks are reserved for this volume
114 * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
115 * @usable_leb_size: logical eraseblock size without padding
116 * @used_ebs: how many logical eraseblocks in this volume contain data
117 * @last_eb_bytes: how many bytes are stored in the last logical eraseblock
118 * @used_bytes: how many bytes of data this volume contains
119 * @upd_marker: non-zero if the update marker is set for this volume
120 * @corrupted: non-zero if the volume is corrupted (static volumes only)
121 * @alignment: volume alignment
122 * @data_pad: how many bytes are not used at the end of physical eraseblocks to
123 * satisfy the requested alignment
124 * @name_len: volume name length
125 * @name: volume name
126 *
127 * @updating: whether the volume is being updated
128 * @upd_ebs: how many eraseblocks are expected to be updated
129 * @upd_bytes: how many bytes are expected to be received
130 * @upd_received: how many update bytes were already received
131 * @upd_buf: update buffer which is used to collect update data
132 *
133 * @eba_tbl: EBA table of this volume (LEB->PEB mapping)
134 *
135 * @gluebi_desc: gluebi UBI volume descriptor
136 * @gluebi_refcount: reference count of the gluebi MTD device
137 * @gluebi_mtd: MTD device description object of the gluebi MTD device
138 *
139 * The @corrupted field indicates that the volume's contents are corrupted.
140 * Since UBI protects only static volumes, this field is not relevant to
141 * dynamic volumes - it is the user's responsibility to ensure their data
142 * integrity.
143 *
144 * The @upd_marker flag indicates that this volume is either being updated at
145 * the moment or is damaged because of an unclean reboot.
146 */
147struct ubi_volume {
148 struct device dev;
149 struct cdev cdev;
150 struct ubi_device *ubi;
151 int vol_id;
152 int readers;
153 int writers;
154 int exclusive;
155 int removed;
156 int checked;
157
158 int reserved_pebs;
159 int vol_type;
160 int usable_leb_size;
161 int used_ebs;
162 int last_eb_bytes;
163 long long used_bytes;
164 int upd_marker;
165 int corrupted;
166 int alignment;
167 int data_pad;
168 int name_len;
169 char name[UBI_VOL_NAME_MAX+1];
170
171 int updating;
172 int upd_ebs;
173 long long upd_bytes;
174 long long upd_received;
175 void *upd_buf;
176
177 int *eba_tbl;
178
179#ifdef CONFIG_MTD_UBI_GLUEBI
180 /* Gluebi-related stuff may be compiled out */
181 struct ubi_volume_desc *gluebi_desc;
182 int gluebi_refcount;
183 struct mtd_info gluebi_mtd;
184#endif
185};
186
187/**
188 * struct ubi_volume_desc - descriptor of the UBI volume returned when it is
189 * opened.
190 * @vol: reference to the corresponding volume description object
191 * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, or %UBI_EXCLUSIVE)
192 */
193struct ubi_volume_desc {
194 struct ubi_volume *vol;
195 int mode;
196};
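197
198/*
199 * Typical kernel-side usage (a sketch based on the kapi.c interface; error
200 * handling abbreviated):
201 *
202 *	struct ubi_volume_desc *desc;
203 *
204 *	desc = ubi_open_volume(ubi_num, vol_id, UBI_READWRITE);
205 *	if (IS_ERR(desc))
206 *		return PTR_ERR(desc);
207 *	err = ubi_leb_read(desc, lnum, buf, 0, len, 0);
208 *	ubi_close_volume(desc);
209 */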
197
198struct ubi_wl_entry;
199
200/**
201 * struct ubi_device - UBI device description structure
202 * @dev: class device object to make use of the Linux device model
203 * @cdev: character device object to create character device
204 * @ubi_num: UBI device number
205 * @ubi_name: UBI device name
206 * @major: character device major number
207 * @vol_count: number of volumes in this UBI device
208 * @volumes: volumes of this UBI device
209 * @volumes_lock: protects @volumes, @rsvd_pebs, @avail_pebs, @beb_rsvd_pebs,
210 * @beb_rsvd_level, @bad_peb_count, @good_peb_count, @vol_count, @vol->readers,
211 * @vol->writers, @vol->exclusive, @vol->removed, and
212 * @vol->eba_tbl.
213 *
214 * @rsvd_pebs: count of reserved physical eraseblocks
215 * @avail_pebs: count of available physical eraseblocks
216 * @beb_rsvd_pebs: how many physical eraseblocks are reserved for bad PEB
217 * handling
218 * @beb_rsvd_level: normal level of PEBs reserved for bad PEB handling
219 *
220 * @vtbl_slots: how many slots are available in the volume table
221 * @vtbl_size: size of the volume table in bytes
222 * @vtbl: in-RAM volume table copy
223 *
224 * @max_ec: current highest erase counter value
225 * @mean_ec: current mean erase counter value
226 *
227 * @global_sqnum: global sequence number
228 * @ltree_lock: protects the lock tree and @global_sqnum
229 * @ltree: the lock tree
230 * @vtbl_mutex: protects on-flash volume table
231 *
232 * @used: RB-tree of used physical eraseblocks
233 * @free: RB-tree of free physical eraseblocks
234 * @scrub: RB-tree of physical eraseblocks which need scrubbing
235 * @prot: protection trees
236 * @prot.pnum: protection tree indexed by physical eraseblock numbers
237 * @prot.aec: protection tree indexed by absolute erase counter value
238 * @wl_lock: protects the @used, @free, @prot, @lookuptbl, @abs_ec, @move_from,
239 * @move_to, @move_from_put, @move_to_put, @wl_scheduled, and @works
240 * fields
241 * @wl_scheduled: non-zero if the wear-leveling was scheduled
242 * @lookuptbl: a table to quickly find a &struct ubi_wl_entry object for any
243 * physical eraseblock
244 * @abs_ec: absolute erase counter
245 * @move_from: physical eraseblock from where the data is being moved
246 * @move_to: physical eraseblock where the data is being moved to
247 * @move_from_put: if the "from" PEB was put
248 * @move_to_put: if the "to" PEB was put
249 * @works: list of pending works
250 * @works_count: count of pending works
251 * @bgt_thread: background thread description object
252 * @thread_enabled: if the background thread is enabled
253 * @bgt_name: background thread name
254 *
255 * @flash_size: underlying MTD device size (in bytes)
256 * @peb_count: count of physical eraseblocks on the MTD device
257 * @peb_size: physical eraseblock size
258 * @bad_peb_count: count of bad physical eraseblocks
259 * @good_peb_count: count of good physical eraseblocks
260 * @min_io_size: minimal input/output unit size of the underlying MTD device
261 * @hdrs_min_io_size: minimal I/O unit size used for VID and EC headers
262 * @ro_mode: if the UBI device is in read-only mode
263 * @leb_size: logical eraseblock size
264 * @leb_start: starting offset of logical eraseblocks within physical
265 * eraseblocks
266 * @ec_hdr_alsize: size of the EC header aligned to @hdrs_min_io_size
267 * @vid_hdr_alsize: size of the VID header aligned to @hdrs_min_io_size
268 * @vid_hdr_offset: starting offset of the volume identifier header (might be
269 * unaligned)
270 * @vid_hdr_aloffset: starting offset of the VID header aligned to
271 * @hdrs_min_io_size
272 * @vid_hdr_shift: contains @vid_hdr_offset - @vid_hdr_aloffset
273 * @bad_allowed: whether the MTD device may have bad physical eraseblocks
275 * @mtd: MTD device descriptor
276 */
277struct ubi_device {
278 struct cdev cdev;
279 struct device dev;
280 int ubi_num;
281 char ubi_name[sizeof(UBI_NAME_STR)+5];
282 int major;
283 int vol_count;
284 struct ubi_volume *volumes[UBI_MAX_VOLUMES+UBI_INT_VOL_COUNT];
285 spinlock_t volumes_lock;
286
287 int rsvd_pebs;
288 int avail_pebs;
289 int beb_rsvd_pebs;
290 int beb_rsvd_level;
291
292 int vtbl_slots;
293 int vtbl_size;
294 struct ubi_vtbl_record *vtbl;
295 struct mutex vtbl_mutex;
296
297 int max_ec;
298 int mean_ec;
299
300 /* EBA unit's stuff */
301 unsigned long long global_sqnum;
302 spinlock_t ltree_lock;
303 struct rb_root ltree;
304
305 /* Wear-leveling unit's stuff */
306 struct rb_root used;
307 struct rb_root free;
308 struct rb_root scrub;
309 struct {
310 struct rb_root pnum;
311 struct rb_root aec;
312 } prot;
313 spinlock_t wl_lock;
314 int wl_scheduled;
315 struct ubi_wl_entry **lookuptbl;
316 unsigned long long abs_ec;
317 struct ubi_wl_entry *move_from;
318 struct ubi_wl_entry *move_to;
319 int move_from_put;
320 int move_to_put;
321 struct list_head works;
322 int works_count;
323 struct task_struct *bgt_thread;
324 int thread_enabled;
325 char bgt_name[sizeof(UBI_BGT_NAME_PATTERN)+2];
326
327 /* I/O unit's stuff */
328 long long flash_size;
329 int peb_count;
330 int peb_size;
331 int bad_peb_count;
332 int good_peb_count;
333 int min_io_size;
334 int hdrs_min_io_size;
335 int ro_mode;
336 int leb_size;
337 int leb_start;
338 int ec_hdr_alsize;
339 int vid_hdr_alsize;
340 int vid_hdr_offset;
341 int vid_hdr_aloffset;
342 int vid_hdr_shift;
343 int bad_allowed;
344 struct mtd_info *mtd;
345};
346
347extern struct file_operations ubi_cdev_operations;
348extern struct file_operations ubi_vol_cdev_operations;
349extern struct class *ubi_class;
350
351/* vtbl.c */
352int ubi_change_vtbl_record(struct ubi_device *ubi, int idx,
353 struct ubi_vtbl_record *vtbl_rec);
354int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si);
355
356/* vmt.c */
357int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req);
358int ubi_remove_volume(struct ubi_volume_desc *desc);
359int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs);
360int ubi_add_volume(struct ubi_device *ubi, int vol_id);
361void ubi_free_volume(struct ubi_device *ubi, int vol_id);
362
363/* upd.c */
364int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes);
365int ubi_more_update_data(struct ubi_device *ubi, int vol_id,
366 const void __user *buf, int count);
367
368/* misc.c */
369int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, int length);
370int ubi_check_volume(struct ubi_device *ubi, int vol_id);
371void ubi_calculate_reserved(struct ubi_device *ubi);
372
373/* gluebi.c */
374#ifdef CONFIG_MTD_UBI_GLUEBI
375int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol);
376int ubi_destroy_gluebi(struct ubi_volume *vol);
377#else
378#define ubi_create_gluebi(ubi, vol) 0
379#define ubi_destroy_gluebi(vol) 0
380#endif
381
382/* eba.c */
383int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum);
384int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf,
385 int offset, int len, int check);
386int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum,
387 const void *buf, int offset, int len, int dtype);
388int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum,
389 const void *buf, int len, int dtype,
390 int used_ebs);
391int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum,
392 const void *buf, int len, int dtype);
393int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
394 struct ubi_vid_hdr *vid_hdr);
395int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si);
396void ubi_eba_close(const struct ubi_device *ubi);
397
398/* wl.c */
399int ubi_wl_get_peb(struct ubi_device *ubi, int dtype);
400int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture);
401int ubi_wl_flush(struct ubi_device *ubi);
402int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum);
403int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si);
404void ubi_wl_close(struct ubi_device *ubi);
405
406/* io.c */
407int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
408 int len);
409int ubi_io_write(const struct ubi_device *ubi, const void *buf, int pnum,
410 int offset, int len);
411int ubi_io_sync_erase(const struct ubi_device *ubi, int pnum, int torture);
412int ubi_io_is_bad(const struct ubi_device *ubi, int pnum);
413int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum);
414int ubi_io_read_ec_hdr(const struct ubi_device *ubi, int pnum,
415 struct ubi_ec_hdr *ec_hdr, int verbose);
416int ubi_io_write_ec_hdr(const struct ubi_device *ubi, int pnum,
417 struct ubi_ec_hdr *ec_hdr);
418int ubi_io_read_vid_hdr(const struct ubi_device *ubi, int pnum,
419 struct ubi_vid_hdr *vid_hdr, int verbose);
420int ubi_io_write_vid_hdr(const struct ubi_device *ubi, int pnum,
421 struct ubi_vid_hdr *vid_hdr);
422
423/*
424 * ubi_rb_for_each_entry - walk an RB-tree.
425 * @rb: a pointer to type 'struct rb_node' to use as a loop counter
426 * @pos: a pointer to RB-tree entry type to use as a loop counter
427 * @root: RB-tree's root
428 * @member: the name of the 'struct rb_node' within the RB-tree entry
429 */
430#define ubi_rb_for_each_entry(rb, pos, root, member) \
431 for (rb = rb_first(root), \
432 pos = (rb ? container_of(rb, typeof(*pos), member) : NULL); \
433 rb; \
434 rb = rb_next(rb), pos = container_of(rb, typeof(*pos), member))
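435
436/*
437 * Example (a sketch; assumes &struct ubi_wl_entry from wl.c, which embeds
438 * its 'struct rb_node' as the 'rb' member):
439 *
440 *	struct rb_node *rb;
441 *	struct ubi_wl_entry *e;
442 *
443 *	ubi_rb_for_each_entry(rb, e, &ubi->used, rb)
444 *		dbg_msg("PEB %d, erase counter %d", e->pnum, e->ec);
445 */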
435
436/**
437 * ubi_zalloc_vid_hdr - allocate a volume identifier header object.
438 * @ubi: UBI device description object
439 *
440 * This function returns a pointer to the newly allocated and zero-filled
441 * volume identifier header object in case of success and %NULL in case of
442 * failure.
443 */
444static inline struct ubi_vid_hdr *ubi_zalloc_vid_hdr(const struct ubi_device *ubi)
445{
446 void *vid_hdr;
447
448 vid_hdr = kzalloc(ubi->vid_hdr_alsize, GFP_KERNEL);
449 if (!vid_hdr)
450 return NULL;
451
452 /*
453 * VID headers may be stored at un-aligned flash offsets, so we shift
454 * the pointer.
455 */
456 return vid_hdr + ubi->vid_hdr_shift;
457}
458
459/**
460 * ubi_free_vid_hdr - free a volume identifier header object.
461 * @ubi: UBI device description object
462 * @vid_hdr: the object to free
463 */
464static inline void ubi_free_vid_hdr(const struct ubi_device *ubi,
465 struct ubi_vid_hdr *vid_hdr)
466{
467 void *p = vid_hdr;
468
469 if (!p)
470 return;
471
472 kfree(p - ubi->vid_hdr_shift);
473}
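474
475/*
476 * Typical usage of the two helpers above (sketch; error handling trimmed):
477 *
478 *	struct ubi_vid_hdr *vid_hdr;
479 *
480 *	vid_hdr = ubi_zalloc_vid_hdr(ubi);
481 *	if (!vid_hdr)
482 *		return -ENOMEM;
483 *	err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 0);
484 *	...
485 *	ubi_free_vid_hdr(ubi, vid_hdr);
486 */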
474
475/*
476 * This function is equivalent to 'ubi_io_read()', but @offset is relative to
477 * the beginning of the logical eraseblock, not to the beginning of the
478 * physical eraseblock.
479 */
480static inline int ubi_io_read_data(const struct ubi_device *ubi, void *buf,
481 int pnum, int offset, int len)
482{
483 ubi_assert(offset >= 0);
484 return ubi_io_read(ubi, buf, pnum, offset + ubi->leb_start, len);
485}
486
487/*
488 * This function is equivalent to 'ubi_io_write()', but @offset is relative to
489 * the beginning of the logical eraseblock, not to the beginning of the
490 * physical eraseblock.
491 */
492static inline int ubi_io_write_data(const struct ubi_device *ubi, const void *buf,
493 int pnum, int offset, int len)
494{
495 ubi_assert(offset >= 0);
496 return ubi_io_write(ubi, buf, pnum, offset + ubi->leb_start, len);
497}
498
499/**
500 * ubi_ro_mode - switch to read-only mode.
501 * @ubi: UBI device description object
502 */
503static inline void ubi_ro_mode(struct ubi_device *ubi)
504{
505 ubi->ro_mode = 1;
506 ubi_warn("switch to read-only mode");
507}
508
509/**
510 * vol_id2idx - get table index by volume ID.
511 * @ubi: UBI device description object
512 * @vol_id: volume ID
513 */
514static inline int vol_id2idx(const struct ubi_device *ubi, int vol_id)
515{
516 if (vol_id >= UBI_INTERNAL_VOL_START)
517 return vol_id - UBI_INTERNAL_VOL_START + ubi->vtbl_slots;
518 else
519 return vol_id;
520}
521
522/**
523 * idx2vol_id - get volume ID by table index.
524 * @ubi: UBI device description object
525 * @idx: table index
526 */
527static inline int idx2vol_id(const struct ubi_device *ubi, int idx)
528{
529 if (idx >= ubi->vtbl_slots)
530 return idx - ubi->vtbl_slots + UBI_INTERNAL_VOL_START;
531 else
532 return idx;
533}
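534
535/*
536 * The two helpers above are inverses of each other: for example, the first
537 * internal volume (vol_id %UBI_INTERNAL_VOL_START) maps to index
538 * @vtbl_slots, and idx2vol_id(ubi, vol_id2idx(ubi, vol_id)) == vol_id for
539 * any valid volume ID.
540 */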
534
535#endif /* !__UBI_UBI_H__ */
diff --git a/drivers/mtd/ubi/upd.c b/drivers/mtd/ubi/upd.c
new file mode 100644
index 000000000000..8925b977e3dc
--- /dev/null
+++ b/drivers/mtd/ubi/upd.c
@@ -0,0 +1,348 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2006
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём)
20 *
21 * Jan 2007: Alexander Schmidt, hacked per-volume update.
22 */
23
24/*
25 * This file contains the implementation of the volume update functionality.
26 *
27 * The update operation is based on the per-volume update marker which is
28 * stored in the volume table. The update marker is set before the update
29 * starts, and removed after the update has been finished. So if the update was
30 * interrupted by an unclean reboot or for some other reason, the update
31 * marker stays on the flash media and UBI finds it when it attaches the MTD
32 * device next time. If the update marker is set for a volume, the volume is
33 * treated as damaged and most I/O operations are prohibited. Only a new update
34 * operation is allowed.
35 *
36 * Note, in general it is possible to implement the update operation as a
37 * transaction with a roll-back capability.
38 */
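39
40/*
41 * From user space the update is driven through the volume character device,
42 * e.g. (sketch, without error handling):
43 *
44 *	long long bytes = image_size;
45 *
46 *	ioctl(fd, UBI_IOCVOLUP, &bytes);
47 *	write(fd, image, image_size);
48 *
49 * The ioctl ends up in 'ubi_start_update()' and each write() in
50 * 'ubi_more_update_data()' below.
51 */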
39
40#include <linux/err.h>
41#include <asm/uaccess.h>
42#include <asm/div64.h>
43#include "ubi.h"
44
45/**
46 * set_update_marker - set update marker.
47 * @ubi: UBI device description object
48 * @vol_id: volume ID
49 *
50 * This function sets the update marker flag for volume @vol_id. Returns zero
51 * in case of success and a negative error code in case of failure.
52 */
53static int set_update_marker(struct ubi_device *ubi, int vol_id)
54{
55 int err;
56 struct ubi_vtbl_record vtbl_rec;
57 struct ubi_volume *vol = ubi->volumes[vol_id];
58
59 dbg_msg("set update marker for volume %d", vol_id);
60
61 if (vol->upd_marker) {
62 ubi_assert(ubi->vtbl[vol_id].upd_marker);
63 dbg_msg("already set");
64 return 0;
65 }
66
67 memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record));
68 vtbl_rec.upd_marker = 1;
69
70 err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
71 vol->upd_marker = 1;
72 return err;
73}
74
75/**
76 * clear_update_marker - clear update marker.
77 * @ubi: UBI device description object
78 * @vol_id: volume ID
79 * @bytes: new data size in bytes
80 *
81 * This function clears the update marker for volume @vol_id, sets new volume
82 * data size and clears the "corrupted" flag (static volumes only). Returns
83 * zero in case of success and a negative error code in case of failure.
84 */
85static int clear_update_marker(struct ubi_device *ubi, int vol_id, long long bytes)
86{
87 int err;
88 uint64_t tmp;
89 struct ubi_vtbl_record vtbl_rec;
90 struct ubi_volume *vol = ubi->volumes[vol_id];
91
92 dbg_msg("clear update marker for volume %d", vol_id);
93
94 memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record));
95 ubi_assert(vol->upd_marker && vtbl_rec.upd_marker);
96 vtbl_rec.upd_marker = 0;
97
98 if (vol->vol_type == UBI_STATIC_VOLUME) {
99 vol->corrupted = 0;
100 vol->used_bytes = tmp = bytes;
101 vol->last_eb_bytes = do_div(tmp, vol->usable_leb_size);
102 vol->used_ebs = tmp;
103 if (vol->last_eb_bytes)
104 vol->used_ebs += 1;
105 else
106 vol->last_eb_bytes = vol->usable_leb_size;
107 }
108
109 err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
110 vol->upd_marker = 0;
111 return err;
112}
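113
114/*
115 * A worked example of the arithmetic above (illustrative numbers): with a
116 * 129024-byte usable LEB size, @bytes == 200000 yields used_ebs == 2 and
117 * last_eb_bytes == 70976, while an exact multiple of the LEB size leaves
118 * last_eb_bytes equal to the full usable LEB size.
119 */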
113
114/**
115 * ubi_start_update - start volume update.
116 * @ubi: UBI device description object
117 * @vol_id: volume ID
118 * @bytes: update bytes
119 *
120 * This function starts volume update operation. If @bytes is zero, the volume
121 * is just wiped out. Returns zero in case of success and a negative error code
122 * in case of failure.
123 */
124int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes)
125{
126 int i, err;
127 uint64_t tmp;
128 struct ubi_volume *vol = ubi->volumes[vol_id];
129
130 dbg_msg("start update of volume %d, %llu bytes", vol_id, bytes);
131 vol->updating = 1;
132
133 err = set_update_marker(ubi, vol_id);
134 if (err)
135 return err;
136
137 /* Before updating - wipe out the volume */
138 for (i = 0; i < vol->reserved_pebs; i++) {
139 err = ubi_eba_unmap_leb(ubi, vol_id, i);
140 if (err)
141 return err;
142 }
143
144 if (bytes == 0) {
145 err = clear_update_marker(ubi, vol_id, 0);
146 if (err)
147 return err;
148 err = ubi_wl_flush(ubi);
149 if (!err)
150			vol->updating = 0;
151		return err;
152	}
153 vol->upd_buf = kmalloc(ubi->leb_size, GFP_KERNEL);
154 if (!vol->upd_buf)
155 return -ENOMEM;
156
157 tmp = bytes;
158 vol->upd_ebs = !!do_div(tmp, vol->usable_leb_size);
159 vol->upd_ebs += tmp;
160 vol->upd_bytes = bytes;
161 vol->upd_received = 0;
162 return 0;
163}
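164
165/*
166 * Note the ceiling division above: '!!do_div(tmp, ...) + tmp' rounds up, so
167 * e.g. a 200000-byte update with 129024-byte usable LEBs gives upd_ebs == 2.
168 */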
164
165/**
166 * write_leb - write update data.
167 * @ubi: UBI device description object
168 * @vol_id: volume ID
169 * @lnum: logical eraseblock number
170 * @buf: data to write
171 * @len: data size
172 * @used_ebs: how many logical eraseblocks will this volume contain (static
173 * volumes only)
174 *
175 * This function writes update data to the corresponding logical eraseblock.
176 * In case of a dynamic volume, this function checks whether the data ends
177 * with 0xFF bytes. If it does, the trailing 0xFF bytes are cut and not
178 * written. So if the whole buffer contains only 0xFF bytes, the LEB is left
179 * unmapped.
180 *
181 * The reason why we skip trailing 0xFF bytes in case of a dynamic volume is
182 * that we want to make sure more data may be appended to the logical
183 * eraseblock in the future. Indeed, writing 0xFF bytes may have side effects
184 * and the PEB might not be writable anymore. So if one writes a file-system
185 * image to a UBI volume where 0xFFs mean free space, UBI makes sure this
186 * free space stays writable after the update.
187 *
188 * We do not do this for static volumes because they are read-only, and
189 * because we have to store the per-LEB CRC and the exact data length anyway.
190 *
191 * This function returns zero in case of success and a negative error code in
192 * case of failure.
193 */
194static int write_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf,
195 int len, int used_ebs)
196{
197 int err, l;
198 struct ubi_volume *vol = ubi->volumes[vol_id];
199
200 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
201 l = ALIGN(len, ubi->min_io_size);
202 memset(buf + len, 0xFF, l - len);
203
204 l = ubi_calc_data_len(ubi, buf, l);
205 if (l == 0) {
206 dbg_msg("all %d bytes contain 0xFF - skip", len);
207 return 0;
208 }
209 if (len != l)
210 dbg_msg("skip last %d bytes (0xFF)", len - l);
211
212 err = ubi_eba_write_leb(ubi, vol_id, lnum, buf, 0, l,
213 UBI_UNKNOWN);
214 } else {
215 /*
216 * When writing static volume, and this is the last logical
217 * eraseblock, the length (@len) does not have to be aligned to
218 * the minimal flash I/O unit. The 'ubi_eba_write_leb_st()'
219 * function accepts exact (unaligned) length and stores it in
220 * the VID header. And it takes care of proper alignment by
221 * padding the buffer. Here we just make sure the padding will
222 * contain zeros, not random trash.
223 */
224 memset(buf + len, 0, vol->usable_leb_size - len);
225 err = ubi_eba_write_leb_st(ubi, vol_id, lnum, buf, len,
226 UBI_UNKNOWN, used_ebs);
227 }
228
229 return err;
230}
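231
232/*
233 * For example (illustrative numbers): if a 4096-byte chunk of a dynamic
234 * volume ends with 1000 bytes of 0xFF and @min_io_size is 512, then
235 * 'ubi_calc_data_len()' trims the write down to 3584 bytes - the smallest
236 * @min_io_size-aligned length still covering the non-0xFF data - while an
237 * all-0xFF chunk is skipped entirely, leaving the LEB unmapped.
238 */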
231
232/**
233 * ubi_more_update_data - write more update data.
234 * @ubi: UBI device description object
235 * @vol_id: volume ID
236 * @buf: write data (user-space memory buffer)
237 * @count: how many bytes to write
238 *
239 * This function writes more data to the volume being updated; it may be
240 * called any number of times until all of the update data arrives. Returns
241 * %0 in case of success, the number of bytes written during the last call if
242 * the whole update finished successfully, and a negative error code otherwise.
243 */
244int ubi_more_update_data(struct ubi_device *ubi, int vol_id,
245 const void __user *buf, int count)
246{
247 uint64_t tmp;
248 struct ubi_volume *vol = ubi->volumes[vol_id];
249 int lnum, offs, err = 0, len, to_write = count;
250
251 dbg_msg("write %d of %lld bytes, %lld already passed",
252 count, vol->upd_bytes, vol->upd_received);
253
254 if (ubi->ro_mode)
255 return -EROFS;
256
257 tmp = vol->upd_received;
258 offs = do_div(tmp, vol->usable_leb_size);
259 lnum = tmp;
260
261 if (vol->upd_received + count > vol->upd_bytes)
262 to_write = count = vol->upd_bytes - vol->upd_received;
263
264 /*
265 * When updating volumes, we accumulate whole logical eraseblock of
266 * data and write it at once.
267 */
268 if (offs != 0) {
269 /*
270 * This is a write to the middle of the logical eraseblock. We
271 * copy the data to our update buffer and wait for more data or
272 * flush it if the whole eraseblock is written or the update
273 * is finished.
274 */
275
276 len = vol->usable_leb_size - offs;
277 if (len > count)
278 len = count;
279
280 err = copy_from_user(vol->upd_buf + offs, buf, len);
281 if (err)
282 return -EFAULT;
283
284 if (offs + len == vol->usable_leb_size ||
285 vol->upd_received + len == vol->upd_bytes) {
286 int flush_len = offs + len;
287
288 /*
289 * OK, we gathered either the whole eraseblock or this
290 * is the last chunk, it's time to flush the buffer.
291 */
292 ubi_assert(flush_len <= vol->usable_leb_size);
293 err = write_leb(ubi, vol_id, lnum, vol->upd_buf,
294 flush_len, vol->upd_ebs);
295 if (err)
296 return err;
297 }
298
299 vol->upd_received += len;
300 count -= len;
301 buf += len;
302 lnum += 1;
303 }
304
305 /*
306 * If we've got more to write, let's continue. At this point we know we
307 * are starting from the beginning of an eraseblock.
308 */
309 while (count) {
310 if (count > vol->usable_leb_size)
311 len = vol->usable_leb_size;
312 else
313 len = count;
314
315 err = copy_from_user(vol->upd_buf, buf, len);
316 if (err)
317 return -EFAULT;
318
319 if (len == vol->usable_leb_size ||
320 vol->upd_received + len == vol->upd_bytes) {
321 err = write_leb(ubi, vol_id, lnum, vol->upd_buf, len,
322 vol->upd_ebs);
323 if (err)
324 break;
325 }
326
327 vol->upd_received += len;
328 count -= len;
329 lnum += 1;
330 buf += len;
331 }
332
333 ubi_assert(vol->upd_received <= vol->upd_bytes);
334 if (vol->upd_received == vol->upd_bytes) {
335 /* The update is finished, clear the update marker */
336 err = clear_update_marker(ubi, vol_id, vol->upd_bytes);
337 if (err)
338 return err;
339 err = ubi_wl_flush(ubi);
340 if (err == 0) {
341 err = to_write;
342 kfree(vol->upd_buf);
343 vol->updating = 0;
344 }
345 }
346
347 return err;
348}
diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c
new file mode 100644
index 000000000000..622d0d18952c
--- /dev/null
+++ b/drivers/mtd/ubi/vmt.c
@@ -0,0 +1,809 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * This file contains the implementation of volume creation, deletion,
23 * updating and resizing.
24 */
25
26#include <linux/err.h>
27#include <asm/div64.h>
28#include "ubi.h"
29
30#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
31static void paranoid_check_volumes(struct ubi_device *ubi);
32#else
33#define paranoid_check_volumes(ubi)
34#endif
35
36static ssize_t vol_attribute_show(struct device *dev,
37 struct device_attribute *attr, char *buf);
38
39/* Device attributes corresponding to files in '/<sysfs>/class/ubi/ubiX_Y' */
40static struct device_attribute vol_reserved_ebs =
41 __ATTR(reserved_ebs, S_IRUGO, vol_attribute_show, NULL);
42static struct device_attribute vol_type =
43 __ATTR(type, S_IRUGO, vol_attribute_show, NULL);
44static struct device_attribute vol_name =
45 __ATTR(name, S_IRUGO, vol_attribute_show, NULL);
46static struct device_attribute vol_corrupted =
47 __ATTR(corrupted, S_IRUGO, vol_attribute_show, NULL);
48static struct device_attribute vol_alignment =
49 __ATTR(alignment, S_IRUGO, vol_attribute_show, NULL);
50static struct device_attribute vol_usable_eb_size =
51 __ATTR(usable_eb_size, S_IRUGO, vol_attribute_show, NULL);
52static struct device_attribute vol_data_bytes =
53 __ATTR(data_bytes, S_IRUGO, vol_attribute_show, NULL);
54static struct device_attribute vol_upd_marker =
55 __ATTR(upd_marker, S_IRUGO, vol_attribute_show, NULL);
56
57/*
58 * "Show" method for files in '/<sysfs>/class/ubi/ubiX_Y/'.
59 *
60 * Consider a situation:
61 * A. process 1 opens a sysfs file related to volume Y, say
62 * /<sysfs>/class/ubi/ubiX_Y/reserved_ebs;
63 * B. process 2 removes volume Y;
64 * C. process 1 starts reading the /<sysfs>/class/ubi/ubiX_Y/reserved_ebs file;
65 *
66 * What we want to do in a situation like that is to return an error when the
67 * file is read. This is done by means of the per-volume 'removed' flag and
68 * the 'volumes_lock' spinlock of the UBI device description object.
69 */
70static ssize_t vol_attribute_show(struct device *dev,
71 struct device_attribute *attr, char *buf)
72{
73 int ret;
74 struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev);
75
76 spin_lock(&vol->ubi->volumes_lock);
77 if (vol->removed) {
78 spin_unlock(&vol->ubi->volumes_lock);
79 return -ENODEV;
80 }
81 if (attr == &vol_reserved_ebs)
82 ret = sprintf(buf, "%d\n", vol->reserved_pebs);
83 else if (attr == &vol_type) {
84 const char *tp;
85 tp = vol->vol_type == UBI_DYNAMIC_VOLUME ? "dynamic" : "static";
86 ret = sprintf(buf, "%s\n", tp);
87 } else if (attr == &vol_name)
88 ret = sprintf(buf, "%s\n", vol->name);
89 else if (attr == &vol_corrupted)
90 ret = sprintf(buf, "%d\n", vol->corrupted);
91 else if (attr == &vol_alignment)
92 ret = sprintf(buf, "%d\n", vol->alignment);
93 else if (attr == &vol_usable_eb_size) {
94 ret = sprintf(buf, "%d\n", vol->usable_leb_size);
95 } else if (attr == &vol_data_bytes)
96 ret = sprintf(buf, "%lld\n", vol->used_bytes);
97 else if (attr == &vol_upd_marker)
98 ret = sprintf(buf, "%d\n", vol->upd_marker);
99 else
100 BUG();
101 spin_unlock(&vol->ubi->volumes_lock);
102 return ret;
103}
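104
105/*
106 * For example, reading one of these files from user space (output values
107 * are illustrative):
108 *
109 *	$ cat /sys/class/ubi/ubi0_1/type
110 *	dynamic
111 */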
104
105/* Release method for volume devices */
106static void vol_release(struct device *dev)
107{
108 struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev);
109 ubi_assert(vol->removed);
110 kfree(vol);
111}
112
113/**
114 * volume_sysfs_init - initialize sysfs for new volume.
115 * @ubi: UBI device description object
116 * @vol: volume description object
117 *
118 * This function returns zero in case of success and a negative error code in
119 * case of failure.
120 *
121 * Note, this function does not free allocated resources in case of failure -
122 * the caller does it. This is because freeing the resources here would
123 * trigger release(), and then the caller would oops.
124 */
125static int volume_sysfs_init(struct ubi_device *ubi, struct ubi_volume *vol)
126{
127 int err;
128
129 err = device_create_file(&vol->dev, &vol_reserved_ebs);
130 if (err)
131 return err;
132 err = device_create_file(&vol->dev, &vol_type);
133 if (err)
134 return err;
135 err = device_create_file(&vol->dev, &vol_name);
136 if (err)
137 return err;
138 err = device_create_file(&vol->dev, &vol_corrupted);
139 if (err)
140 return err;
141 err = device_create_file(&vol->dev, &vol_alignment);
142 if (err)
143 return err;
144 err = device_create_file(&vol->dev, &vol_usable_eb_size);
145 if (err)
146 return err;
147 err = device_create_file(&vol->dev, &vol_data_bytes);
148 if (err)
149 return err;
150 err = device_create_file(&vol->dev, &vol_upd_marker);
151 if (err)
152 return err;
153 return 0;
154}
155
156/**
157 * volume_sysfs_close - close sysfs for a volume.
158 * @vol: volume description object
159 */
160static void volume_sysfs_close(struct ubi_volume *vol)
161{
162 device_remove_file(&vol->dev, &vol_upd_marker);
163 device_remove_file(&vol->dev, &vol_data_bytes);
164 device_remove_file(&vol->dev, &vol_usable_eb_size);
165 device_remove_file(&vol->dev, &vol_alignment);
166 device_remove_file(&vol->dev, &vol_corrupted);
167 device_remove_file(&vol->dev, &vol_name);
168 device_remove_file(&vol->dev, &vol_type);
169 device_remove_file(&vol->dev, &vol_reserved_ebs);
170 device_unregister(&vol->dev);
171}
172
173/**
174 * ubi_create_volume - create volume.
175 * @ubi: UBI device description object
176 * @req: volume creation request
177 *
178 * This function creates the volume described by @req. If @req->vol_id is
179 * %UBI_VOL_NUM_AUTO, this function automatically assigns an ID to the new
180 * volume and saves it in @req->vol_id. Returns zero in case of success and a
181 * negative error code in case of failure.
182 */
183int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req)
184{
185 int i, err, vol_id = req->vol_id;
186 struct ubi_volume *vol;
187 struct ubi_vtbl_record vtbl_rec;
188 uint64_t bytes;
189
190 if (ubi->ro_mode)
191 return -EROFS;
192
193 vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);
194 if (!vol)
195 return -ENOMEM;
196
197 spin_lock(&ubi->volumes_lock);
198
199 if (vol_id == UBI_VOL_NUM_AUTO) {
200 /* Find unused volume ID */
201 dbg_msg("search for vacant volume ID");
202 for (i = 0; i < ubi->vtbl_slots; i++)
203 if (!ubi->volumes[i]) {
204 vol_id = i;
205 break;
206 }
207
208 if (vol_id == UBI_VOL_NUM_AUTO) {
209 dbg_err("out of volume IDs");
210 err = -ENFILE;
211 goto out_unlock;
212 }
213 req->vol_id = vol_id;
214 }
215
216 dbg_msg("volume ID %d, %llu bytes, type %d, name %s",
217 vol_id, (unsigned long long)req->bytes,
218 (int)req->vol_type, req->name);
219
220 /* Ensure that this volume does not exist */
221 err = -EEXIST;
222 if (ubi->volumes[vol_id]) {
223 dbg_err("volume %d already exists", vol_id);
224 goto out_unlock;
225 }
226
227 /* Ensure that the name is unique */
228 for (i = 0; i < ubi->vtbl_slots; i++)
229 if (ubi->volumes[i] &&
230 ubi->volumes[i]->name_len == req->name_len &&
231 strcmp(ubi->volumes[i]->name, req->name) == 0) {
232 dbg_err("volume \"%s\" exists (ID %d)", req->name, i);
233 goto out_unlock;
234 }
235
236 /* Calculate how many eraseblocks are requested */
237 vol->usable_leb_size = ubi->leb_size - ubi->leb_size % req->alignment;
238 bytes = req->bytes;
239 if (do_div(bytes, vol->usable_leb_size))
240 vol->reserved_pebs = 1;
241 vol->reserved_pebs += bytes;
242
243 /* Reserve physical eraseblocks */
244 if (vol->reserved_pebs > ubi->avail_pebs) {
245 dbg_err("not enough PEBs, only %d available", ubi->avail_pebs);
246 spin_unlock(&ubi->volumes_lock);
247 err = -ENOSPC;
248 goto out_unlock;
249 }
250 ubi->avail_pebs -= vol->reserved_pebs;
251 ubi->rsvd_pebs += vol->reserved_pebs;
252
253 vol->vol_id = vol_id;
254 vol->alignment = req->alignment;
255 vol->data_pad = ubi->leb_size % vol->alignment;
256 vol->vol_type = req->vol_type;
257 vol->name_len = req->name_len;
258 memcpy(vol->name, req->name, vol->name_len + 1);
259 vol->exclusive = 1;
260 vol->ubi = ubi;
261 ubi->volumes[vol_id] = vol;
262 spin_unlock(&ubi->volumes_lock);
263
264 /*
265 * Finish all pending erases because there may be some LEBs belonging
266 * to the same volume ID.
267 */
268 err = ubi_wl_flush(ubi);
269 if (err)
270 goto out_acc;
271
272 vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int), GFP_KERNEL);
273 if (!vol->eba_tbl) {
274 err = -ENOMEM;
275 goto out_acc;
276 }
277
278 for (i = 0; i < vol->reserved_pebs; i++)
279 vol->eba_tbl[i] = UBI_LEB_UNMAPPED;
280
281 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
282 vol->used_ebs = vol->reserved_pebs;
283 vol->last_eb_bytes = vol->usable_leb_size;
284 vol->used_bytes = vol->used_ebs * vol->usable_leb_size;
285 } else {
286 bytes = vol->used_bytes;
287 vol->last_eb_bytes = do_div(bytes, vol->usable_leb_size);
288 vol->used_ebs = bytes;
289 if (vol->last_eb_bytes)
290 vol->used_ebs += 1;
291 else
292 vol->last_eb_bytes = vol->usable_leb_size;
293 }
294
295 /* Register character device for the volume */
296 cdev_init(&vol->cdev, &ubi_vol_cdev_operations);
297 vol->cdev.owner = THIS_MODULE;
298 err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol_id + 1), 1);
299 if (err) {
300 ubi_err("cannot add character device for volume %d", vol_id);
301 goto out_mapping;
302 }
303
304 err = ubi_create_gluebi(ubi, vol);
305 if (err)
306 goto out_cdev;
307
308 vol->dev.release = vol_release;
309 vol->dev.parent = &ubi->dev;
310 vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1);
311 vol->dev.class = ubi_class;
312 sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id);
313 err = device_register(&vol->dev);
314 if (err)
315 goto out_gluebi;
316
317 err = volume_sysfs_init(ubi, vol);
318 if (err)
319 goto out_sysfs;
320
321 /* Fill volume table record */
322 memset(&vtbl_rec, 0, sizeof(struct ubi_vtbl_record));
323 vtbl_rec.reserved_pebs = cpu_to_ubi32(vol->reserved_pebs);
324 vtbl_rec.alignment = cpu_to_ubi32(vol->alignment);
325 vtbl_rec.data_pad = cpu_to_ubi32(vol->data_pad);
326 vtbl_rec.name_len = cpu_to_ubi16(vol->name_len);
327 if (vol->vol_type == UBI_DYNAMIC_VOLUME)
328 vtbl_rec.vol_type = UBI_VID_DYNAMIC;
329 else
330 vtbl_rec.vol_type = UBI_VID_STATIC;
331 memcpy(vtbl_rec.name, vol->name, vol->name_len + 1);
332
333 err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
334 if (err)
335 goto out_sysfs;
336
337 spin_lock(&ubi->volumes_lock);
338 ubi->vol_count += 1;
339 vol->exclusive = 0;
340 spin_unlock(&ubi->volumes_lock);
341
342 paranoid_check_volumes(ubi);
343 return 0;
344
345out_gluebi:
346 err = ubi_destroy_gluebi(vol);
347out_cdev:
348 cdev_del(&vol->cdev);
349out_mapping:
350 kfree(vol->eba_tbl);
351out_acc:
352 spin_lock(&ubi->volumes_lock);
353 ubi->rsvd_pebs -= vol->reserved_pebs;
354 ubi->avail_pebs += vol->reserved_pebs;
355out_unlock:
356 spin_unlock(&ubi->volumes_lock);
357 kfree(vol);
358 return err;
359
360 /*
361 * We are registered, so @vol is destroyed in the release function and
362 * we have to de-initialize differently.
363 */
364out_sysfs:
365 err = ubi_destroy_gluebi(vol);
366 cdev_del(&vol->cdev);
367 kfree(vol->eba_tbl);
368 spin_lock(&ubi->volumes_lock);
369 ubi->rsvd_pebs -= vol->reserved_pebs;
370 ubi->avail_pebs += vol->reserved_pebs;
371 spin_unlock(&ubi->volumes_lock);
372 volume_sysfs_close(vol);
373 return err;
374}
375
376/**
377 * ubi_remove_volume - remove volume.
378 * @desc: volume descriptor
379 *
380 * This function removes volume described by @desc. The volume has to be opened
381 * in "exclusive" mode. Returns zero in case of success and a negative error
382 * code in case of failure.
383 */
384int ubi_remove_volume(struct ubi_volume_desc *desc)
385{
386 struct ubi_volume *vol = desc->vol;
387 struct ubi_device *ubi = vol->ubi;
388 int i, err, vol_id = vol->vol_id, reserved_pebs = vol->reserved_pebs;
389
390 dbg_msg("remove UBI volume %d", vol_id);
391 ubi_assert(desc->mode == UBI_EXCLUSIVE);
392 ubi_assert(vol == ubi->volumes[vol_id]);
393
394 if (ubi->ro_mode)
395 return -EROFS;
396
397 err = ubi_destroy_gluebi(vol);
398 if (err)
399 return err;
400
401 err = ubi_change_vtbl_record(ubi, vol_id, NULL);
402 if (err)
403 return err;
404
405 for (i = 0; i < vol->reserved_pebs; i++) {
406 err = ubi_eba_unmap_leb(ubi, vol_id, i);
407 if (err)
408 return err;
409 }
410
411 spin_lock(&ubi->volumes_lock);
412 vol->removed = 1;
413 ubi->volumes[vol_id] = NULL;
414 spin_unlock(&ubi->volumes_lock);
415
416 kfree(vol->eba_tbl);
417 vol->eba_tbl = NULL;
418 cdev_del(&vol->cdev);
419 volume_sysfs_close(vol);
420 kfree(desc);
421
422 spin_lock(&ubi->volumes_lock);
423 ubi->rsvd_pebs -= reserved_pebs;
424 ubi->avail_pebs += reserved_pebs;
425 i = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs;
426 if (i > 0) {
427 i = ubi->avail_pebs >= i ? i : ubi->avail_pebs;
428 ubi->avail_pebs -= i;
429 ubi->rsvd_pebs += i;
430 ubi->beb_rsvd_pebs += i;
431 if (i > 0)
432			ubi_msg("reserve %d more PEBs", i);
433 }
434 ubi->vol_count -= 1;
435 spin_unlock(&ubi->volumes_lock);
436
437 paranoid_check_volumes(ubi);
438 module_put(THIS_MODULE);
439 return 0;
440}
441
442/**
443 * ubi_resize_volume - re-size volume.
444 * @desc: volume descriptor
445 * @reserved_pebs: new size in physical eraseblocks
446 *
447 * This function returns zero in case of success, and a negative error code in
448 * case of failure.
449 */
450int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs)
451{
452 int i, err, pebs, *new_mapping;
453 struct ubi_volume *vol = desc->vol;
454 struct ubi_device *ubi = vol->ubi;
455 struct ubi_vtbl_record vtbl_rec;
456 int vol_id = vol->vol_id;
457
458 if (ubi->ro_mode)
459 return -EROFS;
460
461	dbg_msg("re-size volume %d from %d to %d PEBs",
462 vol_id, vol->reserved_pebs, reserved_pebs);
463 ubi_assert(desc->mode == UBI_EXCLUSIVE);
464 ubi_assert(vol == ubi->volumes[vol_id]);
465
466 if (vol->vol_type == UBI_STATIC_VOLUME &&
467 reserved_pebs < vol->used_ebs) {
468 dbg_err("too small size %d, %d LEBs contain data",
469 reserved_pebs, vol->used_ebs);
470 return -EINVAL;
471 }
472
473 /* If the size is the same, we have nothing to do */
474 if (reserved_pebs == vol->reserved_pebs)
475 return 0;
476
477 new_mapping = kmalloc(reserved_pebs * sizeof(int), GFP_KERNEL);
478 if (!new_mapping)
479 return -ENOMEM;
480
481 for (i = 0; i < reserved_pebs; i++)
482 new_mapping[i] = UBI_LEB_UNMAPPED;
483
484 /* Reserve physical eraseblocks */
485 pebs = reserved_pebs - vol->reserved_pebs;
486 if (pebs > 0) {
487 spin_lock(&ubi->volumes_lock);
488 if (pebs > ubi->avail_pebs) {
489 dbg_err("not enough PEBs: requested %d, available %d",
490 pebs, ubi->avail_pebs);
491 spin_unlock(&ubi->volumes_lock);
492 err = -ENOSPC;
493 goto out_free;
494 }
495 ubi->avail_pebs -= pebs;
496 ubi->rsvd_pebs += pebs;
497 for (i = 0; i < vol->reserved_pebs; i++)
498 new_mapping[i] = vol->eba_tbl[i];
499 kfree(vol->eba_tbl);
500 vol->eba_tbl = new_mapping;
501 spin_unlock(&ubi->volumes_lock);
502 }
503
504 /* Change volume table record */
505 memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record));
506 vtbl_rec.reserved_pebs = cpu_to_ubi32(reserved_pebs);
507 err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
508 if (err)
509 goto out_acc;
510
511 if (pebs < 0) {
512 for (i = 0; i < -pebs; i++) {
513 err = ubi_eba_unmap_leb(ubi, vol_id, reserved_pebs + i);
514 if (err)
515 goto out_acc;
516 }
517 spin_lock(&ubi->volumes_lock);
518 ubi->rsvd_pebs += pebs;
519 ubi->avail_pebs -= pebs;
520 pebs = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs;
521 if (pebs > 0) {
522 pebs = ubi->avail_pebs >= pebs ? pebs : ubi->avail_pebs;
523 ubi->avail_pebs -= pebs;
524 ubi->rsvd_pebs += pebs;
525 ubi->beb_rsvd_pebs += pebs;
526 if (pebs > 0)
527				ubi_msg("reserve %d more PEBs", pebs);
528 }
529 for (i = 0; i < reserved_pebs; i++)
530 new_mapping[i] = vol->eba_tbl[i];
531 kfree(vol->eba_tbl);
532 vol->eba_tbl = new_mapping;
533 spin_unlock(&ubi->volumes_lock);
534 }
535
536 vol->reserved_pebs = reserved_pebs;
537 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
538 vol->used_ebs = reserved_pebs;
539 vol->last_eb_bytes = vol->usable_leb_size;
540 vol->used_bytes = vol->used_ebs * vol->usable_leb_size;
541 }
542
543 paranoid_check_volumes(ubi);
544 return 0;
545
546out_acc:
547 if (pebs > 0) {
548 spin_lock(&ubi->volumes_lock);
549 ubi->rsvd_pebs -= pebs;
550 ubi->avail_pebs += pebs;
551 spin_unlock(&ubi->volumes_lock);
552 }
553out_free:
554 kfree(new_mapping);
555 return err;
556}
557
558/**
559 * ubi_add_volume - add volume.
560 * @ubi: UBI device description object
561 * @vol_id: volume ID
562 *
563 * This function adds an existing volume and initializes all its data
564 * structures. Returns zero in case of success and a negative error code in
565 * case of failure.
566 */
567int ubi_add_volume(struct ubi_device *ubi, int vol_id)
568{
569 int err;
570 struct ubi_volume *vol = ubi->volumes[vol_id];
571
572 dbg_msg("add volume %d", vol_id);
573	ubi_assert(vol);
574	ubi_dbg_dump_vol_info(vol);
575
576 /* Register character device for the volume */
577 cdev_init(&vol->cdev, &ubi_vol_cdev_operations);
578 vol->cdev.owner = THIS_MODULE;
579 err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol->vol_id + 1), 1);
580 if (err) {
581 ubi_err("cannot add character device for volume %d", vol_id);
582 return err;
583 }
584
585 err = ubi_create_gluebi(ubi, vol);
586 if (err)
587 goto out_cdev;
588
589 vol->dev.release = vol_release;
590 vol->dev.parent = &ubi->dev;
591 vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1);
592 vol->dev.class = ubi_class;
593 sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id);
594 err = device_register(&vol->dev);
595 if (err)
596 goto out_gluebi;
597
598 err = volume_sysfs_init(ubi, vol);
599 if (err) {
600 cdev_del(&vol->cdev);
601 err = ubi_destroy_gluebi(vol);
602 volume_sysfs_close(vol);
603 return err;
604 }
605
606 paranoid_check_volumes(ubi);
607 return 0;
608
609out_gluebi:
610 err = ubi_destroy_gluebi(vol);
611out_cdev:
612 cdev_del(&vol->cdev);
613 return err;
614}
615
616/**
617 * ubi_free_volume - free volume.
618 * @ubi: UBI device description object
619 * @vol_id: volume ID
620 *
621 * This function frees all resources for volume @vol_id but does not remove it.
622 * Used only when the UBI device is detached.
623 */
624void ubi_free_volume(struct ubi_device *ubi, int vol_id)
625{
626 int err;
627 struct ubi_volume *vol = ubi->volumes[vol_id];
628
629 dbg_msg("free volume %d", vol_id);
630 ubi_assert(vol);
631
632 vol->removed = 1;
633 err = ubi_destroy_gluebi(vol);
634 ubi->volumes[vol_id] = NULL;
635 cdev_del(&vol->cdev);
636 volume_sysfs_close(vol);
637}
638
639#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
640
641/**
642 * paranoid_check_volume - check volume information.
643 * @ubi: UBI device description object
644 * @vol_id: volume ID
645 */
646static void paranoid_check_volume(const struct ubi_device *ubi, int vol_id)
647{
648 int idx = vol_id2idx(ubi, vol_id);
649 int reserved_pebs, alignment, data_pad, vol_type, name_len, upd_marker;
650 const struct ubi_volume *vol = ubi->volumes[idx];
651 long long n;
652 const char *name;
653
654 reserved_pebs = ubi32_to_cpu(ubi->vtbl[vol_id].reserved_pebs);
655
656 if (!vol) {
657 if (reserved_pebs) {
658 ubi_err("no volume info, but volume exists");
659 goto fail;
660 }
661 return;
662 }
663
664 if (vol->reserved_pebs < 0 || vol->alignment < 0 || vol->data_pad < 0 ||
665 vol->name_len < 0) {
666 ubi_err("negative values");
667 goto fail;
668 }
669 if (vol->alignment > ubi->leb_size || vol->alignment == 0) {
670 ubi_err("bad alignment");
671 goto fail;
672 }
673
674 n = vol->alignment % ubi->min_io_size;
675 if (vol->alignment != 1 && n) {
676 ubi_err("alignment is not multiple of min I/O unit");
677 goto fail;
678 }
679
680 n = ubi->leb_size % vol->alignment;
681 if (vol->data_pad != n) {
682 ubi_err("bad data_pad, has to be %lld", n);
683 goto fail;
684 }
685
686 if (vol->vol_type != UBI_DYNAMIC_VOLUME &&
687 vol->vol_type != UBI_STATIC_VOLUME) {
688 ubi_err("bad vol_type");
689 goto fail;
690 }
691
692 if (vol->upd_marker != 0 && vol->upd_marker != 1) {
693 ubi_err("bad upd_marker");
694 goto fail;
695 }
696
697 if (vol->upd_marker && vol->corrupted) {
698 dbg_err("update marker and corrupted simultaneously");
699 goto fail;
700 }
701
702 if (vol->reserved_pebs > ubi->good_peb_count) {
703 ubi_err("too large reserved_pebs");
704 goto fail;
705 }
706
707 n = ubi->leb_size - vol->data_pad;
708 if (vol->usable_leb_size != ubi->leb_size - vol->data_pad) {
709 ubi_err("bad usable_leb_size, has to be %lld", n);
710 goto fail;
711 }
712
713 if (vol->name_len > UBI_VOL_NAME_MAX) {
714 ubi_err("too long volume name, max is %d", UBI_VOL_NAME_MAX);
715 goto fail;
716 }
717
718 if (!vol->name) {
719 ubi_err("NULL volume name");
720 goto fail;
721 }
722
723 n = strnlen(vol->name, vol->name_len + 1);
724 if (n != vol->name_len) {
725 ubi_err("bad name_len %lld", n);
726 goto fail;
727 }
728
729 n = vol->used_ebs * vol->usable_leb_size;
730 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
731 if (vol->corrupted != 0) {
732 ubi_err("corrupted dynamic volume");
733 goto fail;
734 }
735 if (vol->used_ebs != vol->reserved_pebs) {
736 ubi_err("bad used_ebs");
737 goto fail;
738 }
739 if (vol->last_eb_bytes != vol->usable_leb_size) {
740 ubi_err("bad last_eb_bytes");
741 goto fail;
742 }
743 if (vol->used_bytes != n) {
744 ubi_err("bad used_bytes");
745 goto fail;
746 }
747 } else {
748 if (vol->corrupted != 0 && vol->corrupted != 1) {
749 ubi_err("bad corrupted");
750 goto fail;
751 }
752 if (vol->used_ebs < 0 || vol->used_ebs > vol->reserved_pebs) {
753 ubi_err("bad used_ebs");
754 goto fail;
755 }
756 if (vol->last_eb_bytes < 0 ||
757 vol->last_eb_bytes > vol->usable_leb_size) {
758 ubi_err("bad last_eb_bytes");
759 goto fail;
760 }
761 if (vol->used_bytes < 0 || vol->used_bytes > n ||
762 vol->used_bytes < n - vol->usable_leb_size) {
763 ubi_err("bad used_bytes");
764 goto fail;
765 }
766 }
767
768 alignment = ubi32_to_cpu(ubi->vtbl[vol_id].alignment);
769 data_pad = ubi32_to_cpu(ubi->vtbl[vol_id].data_pad);
770 name_len = ubi16_to_cpu(ubi->vtbl[vol_id].name_len);
771 upd_marker = ubi->vtbl[vol_id].upd_marker;
772 name = &ubi->vtbl[vol_id].name[0];
773 if (ubi->vtbl[vol_id].vol_type == UBI_VID_DYNAMIC)
774 vol_type = UBI_DYNAMIC_VOLUME;
775 else
776 vol_type = UBI_STATIC_VOLUME;
777
778 if (alignment != vol->alignment || data_pad != vol->data_pad ||
779 upd_marker != vol->upd_marker || vol_type != vol->vol_type ||
780	    name_len != vol->name_len || strncmp(name, vol->name, name_len)) {
781 ubi_err("volume info is different");
782 goto fail;
783 }
784
785 return;
786
787fail:
788 ubi_err("paranoid check failed");
789 ubi_dbg_dump_vol_info(vol);
790 ubi_dbg_dump_vtbl_record(&ubi->vtbl[vol_id], vol_id);
791 BUG();
792}
793
794/**
795 * paranoid_check_volumes - check information about all volumes.
796 * @ubi: UBI device description object
797 */
798static void paranoid_check_volumes(struct ubi_device *ubi)
799{
800 int i;
801
802 mutex_lock(&ubi->vtbl_mutex);
803 spin_lock(&ubi->volumes_lock);
804 for (i = 0; i < ubi->vtbl_slots; i++)
805 paranoid_check_volume(ubi, i);
806 spin_unlock(&ubi->volumes_lock);
807 mutex_unlock(&ubi->vtbl_mutex);
808}
809#endif
diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c
new file mode 100644
index 000000000000..b6fd6bbd941e
--- /dev/null
+++ b/drivers/mtd/ubi/vtbl.c
@@ -0,0 +1,809 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2006, 2007
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём)
20 */
21
22/*
23 * This file includes volume table manipulation code. The volume table is an
24 * on-flash table containing volume meta-data like name, number of reserved
25 * physical eraseblocks, type, etc. The volume table is stored in the so-called
26 * "layout volume".
27 *
28 * The layout volume is an internal volume which is organized as follows. It
29 * consists of two logical eraseblocks - LEB 0 and LEB 1. Each logical
30 * eraseblock stores one volume table copy, i.e. LEB 0 and LEB 1 duplicate each
31 * other. This redundancy guarantees robustness to unclean reboots. The volume
32 * table is basically an array of volume table records. Each record contains
33 * full information about the volume and is protected by a CRC checksum.
34 *
35 * When the volume table is changed, it is first changed in RAM. Then LEB 0 is
36 * erased and the updated table is written back to it, and then the same is
37 * done for LEB 1. This scheme guarantees recoverability from unclean reboots.
38 *
39 * In this UBI implementation the on-flash volume table does not contain any
40 * information about how much data static volumes contain. This information may
41 * be found from the scanning data.
42 *
43 * But it would still be beneficial to store this information in the volume
44 * table. For example, suppose we have a static volume X, and all its physical
45 * eraseblocks went bad for some reason. Suppose we are attaching the
46 * corresponding MTD device and the scanning has found no logical eraseblocks
47 * corresponding to volume X. According to the volume table, volume X does
48 * exist, so we do not know whether it is just empty or all its physical
49 * eraseblocks went bad. Hence, we cannot warn the user about this corruption.
50 *
51 * The volume table also stores so-called "update marker", which is used for
52 * volume updates. Before updating the volume, the update marker is set, and
53 * after the update operation is finished, the update marker is cleared. So if
54 * the update operation was interrupted (e.g. by an unclean reboot) - the
55 * update marker is still there and we know that the volume's contents are
56 * damaged.
57 */
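58
59/*
60 * In code terms, the per-record CRC mentioned above is verified like this
61 * (a sketch of what 'vtbl_check()' below does for every record):
62 *
63 *	crc = crc32(UBI_CRC32_INIT, &vtbl[i], UBI_VTBL_RECORD_SIZE_CRC);
64 *	if (ubi32_to_cpu(vtbl[i].crc) != crc)
65 *		... the record is corrupted, recover from the other LEB copy ...
66 */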
58
59#include <linux/crc32.h>
60#include <linux/err.h>
61#include <asm/div64.h>
62#include "ubi.h"
63
64#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
65static void paranoid_vtbl_check(const struct ubi_device *ubi);
66#else
67#define paranoid_vtbl_check(ubi)
68#endif
69
70/* Empty volume table record */
71static struct ubi_vtbl_record empty_vtbl_record;
72
73/**
74 * ubi_change_vtbl_record - change volume table record.
75 * @ubi: UBI device description object
76 * @idx: table index to change
77 * @vtbl_rec: new volume table record
78 *
79 * This function changes volume table record @idx. If @vtbl_rec is %NULL, an
80 * empty volume table record is written. The caller does not have to calculate
81 * the CRC of the record as it is done by this function. Returns zero in case
82 * of success and a negative error code in case of failure.
83 */
84int ubi_change_vtbl_record(struct ubi_device *ubi, int idx,
85 struct ubi_vtbl_record *vtbl_rec)
86{
87 int i, err;
88 uint32_t crc;
89
90 ubi_assert(idx >= 0 && idx < ubi->vtbl_slots);
91
92 if (!vtbl_rec)
93 vtbl_rec = &empty_vtbl_record;
94 else {
95 crc = crc32(UBI_CRC32_INIT, vtbl_rec, UBI_VTBL_RECORD_SIZE_CRC);
96 vtbl_rec->crc = cpu_to_ubi32(crc);
97 }
98
99 dbg_msg("change record %d", idx);
100 ubi_dbg_dump_vtbl_record(vtbl_rec, idx);
101
102 mutex_lock(&ubi->vtbl_mutex);
103 memcpy(&ubi->vtbl[idx], vtbl_rec, sizeof(struct ubi_vtbl_record));
104 for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) {
105 err = ubi_eba_unmap_leb(ubi, UBI_LAYOUT_VOL_ID, i);
106 if (err) {
107 mutex_unlock(&ubi->vtbl_mutex);
108 return err;
109 }
110 err = ubi_eba_write_leb(ubi, UBI_LAYOUT_VOL_ID, i, ubi->vtbl, 0,
111 ubi->vtbl_size, UBI_LONGTERM);
112 if (err) {
113 mutex_unlock(&ubi->vtbl_mutex);
114 return err;
115 }
116 }
117
118 paranoid_vtbl_check(ubi);
119 mutex_unlock(&ubi->vtbl_mutex);
120 return ubi_wl_flush(ubi);
121}
122
123/**
124 * vtbl_check - check if the volume table is not corrupted and contains
125 * sensible data.
126 *
127 * @ubi: UBI device description object
128 * @vtbl: volume table
129 *
130 * This function returns zero if @vtbl is all right, %1 if CRC is incorrect,
131 * and %-EINVAL if it contains inconsistent data.
132 */
133static int vtbl_check(const struct ubi_device *ubi,
134 const struct ubi_vtbl_record *vtbl)
135{
136 int i, n, reserved_pebs, alignment, data_pad, vol_type, name_len;
137 int upd_marker;
138 uint32_t crc;
139 const char *name;
140
141 for (i = 0; i < ubi->vtbl_slots; i++) {
142 cond_resched();
143
144 reserved_pebs = ubi32_to_cpu(vtbl[i].reserved_pebs);
145 alignment = ubi32_to_cpu(vtbl[i].alignment);
146 data_pad = ubi32_to_cpu(vtbl[i].data_pad);
147 upd_marker = vtbl[i].upd_marker;
148 vol_type = vtbl[i].vol_type;
149 name_len = ubi16_to_cpu(vtbl[i].name_len);
150 name = &vtbl[i].name[0];
151
152 crc = crc32(UBI_CRC32_INIT, &vtbl[i], UBI_VTBL_RECORD_SIZE_CRC);
153 if (ubi32_to_cpu(vtbl[i].crc) != crc) {
154 ubi_err("bad CRC at record %u: %#08x, not %#08x",
155 i, crc, ubi32_to_cpu(vtbl[i].crc));
156 ubi_dbg_dump_vtbl_record(&vtbl[i], i);
157 return 1;
158 }
159
160 if (reserved_pebs == 0) {
161 if (memcmp(&vtbl[i], &empty_vtbl_record,
162 UBI_VTBL_RECORD_SIZE)) {
163 dbg_err("bad empty record");
164 goto bad;
165 }
166 continue;
167 }
168
169 if (reserved_pebs < 0 || alignment < 0 || data_pad < 0 ||
170 name_len < 0) {
171 dbg_err("negative values");
172 goto bad;
173 }
174
175 if (alignment > ubi->leb_size || alignment == 0) {
176 dbg_err("bad alignment");
177 goto bad;
178 }
179
180 n = alignment % ubi->min_io_size;
181 if (alignment != 1 && n) {
182 dbg_err("alignment is not multiple of min I/O unit");
183 goto bad;
184 }
185
186 n = ubi->leb_size % alignment;
187 if (data_pad != n) {
188 dbg_err("bad data_pad, has to be %d", n);
189 goto bad;
190 }
191
192 if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) {
193 dbg_err("bad vol_type");
194 goto bad;
195 }
196
197 if (upd_marker != 0 && upd_marker != 1) {
198 dbg_err("bad upd_marker");
199 goto bad;
200 }
201
202 if (reserved_pebs > ubi->good_peb_count) {
203 dbg_err("too large reserved_pebs, good PEBs %d",
204 ubi->good_peb_count);
205 goto bad;
206 }
207
208 if (name_len > UBI_VOL_NAME_MAX) {
209 dbg_err("too long volume name, max %d",
210 UBI_VOL_NAME_MAX);
211 goto bad;
212 }
213
214 if (name[0] == '\0') {
215 dbg_err("NULL volume name");
216 goto bad;
217 }
218
219 if (name_len != strnlen(name, name_len + 1)) {
220 dbg_err("bad name_len");
221 goto bad;
222 }
223 }
224
225 /* Checks that all names are unique */
226 for (i = 0; i < ubi->vtbl_slots - 1; i++) {
227 for (n = i + 1; n < ubi->vtbl_slots; n++) {
228 int len1 = ubi16_to_cpu(vtbl[i].name_len);
229 int len2 = ubi16_to_cpu(vtbl[n].name_len);
230
231 if (len1 > 0 && len1 == len2 &&
232 !strncmp(vtbl[i].name, vtbl[n].name, len1)) {
233 ubi_err("volumes %d and %d have the same name"
234 " \"%s\"", i, n, vtbl[i].name);
235 ubi_dbg_dump_vtbl_record(&vtbl[i], i);
236 ubi_dbg_dump_vtbl_record(&vtbl[n], n);
237 return -EINVAL;
238 }
239 }
240 }
241
242 return 0;
243
244bad:
245 ubi_err("volume table check failed, record %d", i);
246 ubi_dbg_dump_vtbl_record(&vtbl[i], i);
247 return -EINVAL;
248}
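
The per-record CRC check performed in the loop above is worth seeing in
isolation. A minimal sketch of the same check, reusing only the helpers this
file already depends on (crc32(), ubi32_to_cpu(), UBI_CRC32_INIT,
UBI_VTBL_RECORD_SIZE_CRC); the function name record_crc_ok is hypothetical:

	static int record_crc_ok(const struct ubi_vtbl_record *r)
	{
		/* The CRC covers the whole record except the trailing CRC field */
		uint32_t crc = crc32(UBI_CRC32_INIT, r, UBI_VTBL_RECORD_SIZE_CRC);

		return ubi32_to_cpu(r->crc) == crc;
	}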
249
250/**
251 * create_vtbl - create a copy of the volume table.
252 * @ubi: UBI device description object
253 * @si: scanning information
254 * @copy: number of the volume table copy
255 * @vtbl: contents of the volume table
256 *
257 * This function returns zero in case of success and a negative error code in
258 * case of failure.
259 */
260static int create_vtbl(const struct ubi_device *ubi, struct ubi_scan_info *si,
261 int copy, void *vtbl)
262{
263 int err, tries = 0;
264	struct ubi_vid_hdr *vid_hdr;
265 struct ubi_scan_volume *sv;
266 struct ubi_scan_leb *new_seb, *old_seb = NULL;
267
268 ubi_msg("create volume table (copy #%d)", copy + 1);
269
270 vid_hdr = ubi_zalloc_vid_hdr(ubi);
271 if (!vid_hdr)
272 return -ENOMEM;
273
274 /*
275	 * Check if a logical eraseblock which should contain this volume
276	 * table copy was found during scanning. If so, it has to be wiped
277	 * out.
278 */
279 sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID);
280 if (sv)
281 old_seb = ubi_scan_find_seb(sv, copy);
282
283retry:
284 new_seb = ubi_scan_get_free_peb(ubi, si);
285 if (IS_ERR(new_seb)) {
286 err = PTR_ERR(new_seb);
287 goto out_free;
288 }
289
290 vid_hdr->vol_type = UBI_VID_DYNAMIC;
291 vid_hdr->vol_id = cpu_to_ubi32(UBI_LAYOUT_VOL_ID);
292 vid_hdr->compat = UBI_LAYOUT_VOLUME_COMPAT;
293 vid_hdr->data_size = vid_hdr->used_ebs =
294 vid_hdr->data_pad = cpu_to_ubi32(0);
295 vid_hdr->lnum = cpu_to_ubi32(copy);
296 vid_hdr->sqnum = cpu_to_ubi64(++si->max_sqnum);
297 vid_hdr->leb_ver = cpu_to_ubi32(old_seb ? old_seb->leb_ver + 1: 0);
298
299 /* The EC header is already there, write the VID header */
300 err = ubi_io_write_vid_hdr(ubi, new_seb->pnum, vid_hdr);
301 if (err)
302 goto write_error;
303
304 /* Write the layout volume contents */
305 err = ubi_io_write_data(ubi, vtbl, new_seb->pnum, 0, ubi->vtbl_size);
306 if (err)
307 goto write_error;
308
309 /*
310	 * And add it to the scanning information. Don't free the old
311	 * @old_seb here; it will be deleted and freed in 'ubi_scan_add_used()'.
312 */
313 err = ubi_scan_add_used(ubi, si, new_seb->pnum, new_seb->ec,
314 vid_hdr, 0);
315 kfree(new_seb);
316 ubi_free_vid_hdr(ubi, vid_hdr);
317 return err;
318
319write_error:
320	/* Maybe this physical eraseblock went bad, try to pick another one */
321	if (++tries <= 5)
322		err = ubi_scan_add_to_list(si, new_seb->pnum, new_seb->ec,
323					   &si->corr);
324	kfree(new_seb);
325	if (tries <= 5 && !err)
326		goto retry;
327
328out_free:
329 ubi_free_vid_hdr(ubi, vid_hdr);
330 return err;
331
332}
333
334/**
335 * process_lvol - process the layout volume.
336 * @ubi: UBI device description object
337 * @si: scanning information
338 * @sv: layout volume scanning information
339 *
340 * This function is responsible for reading the layout volume, ensuring it is
341 * not corrupted, and recovering from corruptions if needed. Returns volume
342 * table in case of success and a negative error code in case of failure.
343 */
344static struct ubi_vtbl_record *process_lvol(const struct ubi_device *ubi,
345 struct ubi_scan_info *si,
346 struct ubi_scan_volume *sv)
347{
348 int err;
349 struct rb_node *rb;
350 struct ubi_scan_leb *seb;
351 struct ubi_vtbl_record *leb[UBI_LAYOUT_VOLUME_EBS] = { NULL, NULL };
352 int leb_corrupted[UBI_LAYOUT_VOLUME_EBS] = {1, 1};
353
354 /*
355 * UBI goes through the following steps when it changes the layout
356 * volume:
357 * a. erase LEB 0;
358 * b. write new data to LEB 0;
359 * c. erase LEB 1;
360 * d. write new data to LEB 1.
361 *
362 * Before the change, both LEBs contain the same data.
363 *
364	 * Due to unclean reboots, the contents of LEB 0 may be lost, but there
365	 * should still be LEB 1. So it is OK if LEB 0 is corrupted while LEB 1
366	 * is not. Similarly, LEB 1 may be lost, but there should be LEB 0. And
367 * finally, unclean reboots may result in a situation when neither LEB
368 * 0 nor LEB 1 are corrupted, but they are different. In this case, LEB
369 * 0 contains more recent information.
370 *
371 * So the plan is to first check LEB 0. Then
372	 * a. if LEB 0 is OK, it must contain the most recent data; then
373	 *    we compare it with LEB 1, and if they are different, we copy LEB
374	 *    0 to LEB 1;
375	 * b. if LEB 0 is corrupted, LEB 1 has to be OK; in that case we copy
376	 *    LEB 1 to LEB 0.
377 */
378
379 dbg_msg("check layout volume");
380
381 /* Read both LEB 0 and LEB 1 into memory */
382 ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) {
383 leb[seb->lnum] = kzalloc(ubi->vtbl_size, GFP_KERNEL);
384 if (!leb[seb->lnum]) {
385 err = -ENOMEM;
386 goto out_free;
387 }
388
389 err = ubi_io_read_data(ubi, leb[seb->lnum], seb->pnum, 0,
390 ubi->vtbl_size);
391 if (err == UBI_IO_BITFLIPS || err == -EBADMSG)
392 /* Scrub the PEB later */
393 seb->scrub = 1;
394 else if (err)
395 goto out_free;
396 }
397
398 err = -EINVAL;
399 if (leb[0]) {
400 leb_corrupted[0] = vtbl_check(ubi, leb[0]);
401 if (leb_corrupted[0] < 0)
402 goto out_free;
403 }
404
405 if (!leb_corrupted[0]) {
406 /* LEB 0 is OK */
407 if (leb[1])
408 leb_corrupted[1] = memcmp(leb[0], leb[1], ubi->vtbl_size);
409 if (leb_corrupted[1]) {
410 ubi_warn("volume table copy #2 is corrupted");
411 err = create_vtbl(ubi, si, 1, leb[0]);
412 if (err)
413 goto out_free;
414 ubi_msg("volume table was restored");
415 }
416
417	/* Both LEB 0 and LEB 1 are OK and consistent */
418 kfree(leb[1]);
419 return leb[0];
420 } else {
421 /* LEB 0 is corrupted or does not exist */
422 if (leb[1]) {
423 leb_corrupted[1] = vtbl_check(ubi, leb[1]);
424 if (leb_corrupted[1] < 0)
425 goto out_free;
426 }
427 if (leb_corrupted[1]) {
428 /* Both LEB 0 and LEB 1 are corrupted */
429 ubi_err("both volume tables are corrupted");
430 goto out_free;
431 }
432
433 ubi_warn("volume table copy #1 is corrupted");
434 err = create_vtbl(ubi, si, 0, leb[1]);
435 if (err)
436 goto out_free;
437 ubi_msg("volume table was restored");
438
439 kfree(leb[0]);
440 return leb[1];
441 }
442
443out_free:
444 kfree(leb[0]);
445 kfree(leb[1]);
446 return ERR_PTR(err);
447}
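
To summarize, the recovery logic above implements the following decision
table (a sketch, using the vtbl_check() convention: 0 means OK, 1 means bad
CRC, negative means inconsistent data):

	LEB 0 OK,  LEB 1 identical  -> use LEB 0
	LEB 0 OK,  LEB 1 differs    -> rewrite copy #2 from LEB 0, use LEB 0
	LEB 0 bad, LEB 1 OK         -> rewrite copy #1 from LEB 1, use LEB 1
	LEB 0 bad, LEB 1 bad        -> fail, the volume table is lost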
448
449/**
450 * create_empty_lvol - create an empty layout volume.
451 * @ubi: UBI device description object
452 * @si: scanning information
453 *
454 * This function returns volume table contents in case of success and a
455 * negative error code in case of failure.
456 */
457static struct ubi_vtbl_record *create_empty_lvol(const struct ubi_device *ubi,
458 struct ubi_scan_info *si)
459{
460 int i;
461 struct ubi_vtbl_record *vtbl;
462
463 vtbl = kzalloc(ubi->vtbl_size, GFP_KERNEL);
464 if (!vtbl)
465 return ERR_PTR(-ENOMEM);
466
467 for (i = 0; i < ubi->vtbl_slots; i++)
468 memcpy(&vtbl[i], &empty_vtbl_record, UBI_VTBL_RECORD_SIZE);
469
470 for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) {
471 int err;
472
473 err = create_vtbl(ubi, si, i, vtbl);
474 if (err) {
475 kfree(vtbl);
476 return ERR_PTR(err);
477 }
478 }
479
480 return vtbl;
481}
482
483/**
484 * init_volumes - initialize volume information for existing volumes.
485 * @ubi: UBI device description object
486 * @si: scanning information
487 * @vtbl: volume table
488 *
489 * This function allocates volume description objects for existing volumes.
490 * Returns zero in case of success and a negative error code in case of
491 * failure.
492 */
493static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si,
494 const struct ubi_vtbl_record *vtbl)
495{
496 int i, reserved_pebs = 0;
497 struct ubi_scan_volume *sv;
498 struct ubi_volume *vol;
499
500 for (i = 0; i < ubi->vtbl_slots; i++) {
501 cond_resched();
502
503 if (ubi32_to_cpu(vtbl[i].reserved_pebs) == 0)
504 continue; /* Empty record */
505
506 vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);
507 if (!vol)
508 return -ENOMEM;
509
510 vol->reserved_pebs = ubi32_to_cpu(vtbl[i].reserved_pebs);
511 vol->alignment = ubi32_to_cpu(vtbl[i].alignment);
512 vol->data_pad = ubi32_to_cpu(vtbl[i].data_pad);
513 vol->vol_type = vtbl[i].vol_type == UBI_VID_DYNAMIC ?
514 UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME;
515 vol->name_len = ubi16_to_cpu(vtbl[i].name_len);
516 vol->usable_leb_size = ubi->leb_size - vol->data_pad;
517 memcpy(vol->name, vtbl[i].name, vol->name_len);
518 vol->name[vol->name_len] = '\0';
519 vol->vol_id = i;
520
521 ubi_assert(!ubi->volumes[i]);
522 ubi->volumes[i] = vol;
523 ubi->vol_count += 1;
524 vol->ubi = ubi;
525 reserved_pebs += vol->reserved_pebs;
526
527 /*
528		 * In case of a dynamic volume UBI knows nothing about how much
529		 * data is stored there. So assume the whole volume is used.
530 */
531 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
532 vol->used_ebs = vol->reserved_pebs;
533 vol->last_eb_bytes = vol->usable_leb_size;
534 vol->used_bytes = vol->used_ebs * vol->usable_leb_size;
535 continue;
536 }
537
538 /* Static volumes only */
539 sv = ubi_scan_find_sv(si, i);
540 if (!sv) {
541 /*
542 * No eraseblocks belonging to this volume found. We
543 * don't actually know whether this static volume is
544 * completely corrupted or just contains no data. And
545 * we cannot know this as long as data size is not
546 * stored on flash. So we just assume the volume is
547 * empty. FIXME: this should be handled.
548 */
549 continue;
550 }
551
552 if (sv->leb_count != sv->used_ebs) {
553 /*
554 * We found a static volume which misses several
555 * eraseblocks. Treat it as corrupted.
556 */
557 ubi_warn("static volume %d misses %d LEBs - corrupted",
558 sv->vol_id, sv->used_ebs - sv->leb_count);
559 vol->corrupted = 1;
560 continue;
561 }
562
563 vol->used_ebs = sv->used_ebs;
564 vol->used_bytes = (vol->used_ebs - 1) * vol->usable_leb_size;
565 vol->used_bytes += sv->last_data_size;
566 vol->last_eb_bytes = sv->last_data_size;
567 }
568
569 vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);
570 if (!vol)
571 return -ENOMEM;
572
573 vol->reserved_pebs = UBI_LAYOUT_VOLUME_EBS;
574 vol->alignment = 1;
575 vol->vol_type = UBI_DYNAMIC_VOLUME;
576 vol->name_len = sizeof(UBI_LAYOUT_VOLUME_NAME) - 1;
577 memcpy(vol->name, UBI_LAYOUT_VOLUME_NAME, vol->name_len + 1);
578 vol->usable_leb_size = ubi->leb_size;
579 vol->used_ebs = vol->reserved_pebs;
580 vol->last_eb_bytes = vol->reserved_pebs;
581 vol->used_bytes = vol->used_ebs * (ubi->leb_size - vol->data_pad);
582 vol->vol_id = UBI_LAYOUT_VOL_ID;
583
584 ubi_assert(!ubi->volumes[i]);
585 ubi->volumes[vol_id2idx(ubi, vol->vol_id)] = vol;
586 reserved_pebs += vol->reserved_pebs;
587 ubi->vol_count += 1;
588 vol->ubi = ubi;
589
590 if (reserved_pebs > ubi->avail_pebs)
591 ubi_err("not enough PEBs, required %d, available %d",
592 reserved_pebs, ubi->avail_pebs);
593 ubi->rsvd_pebs += reserved_pebs;
594 ubi->avail_pebs -= reserved_pebs;
595
596 return 0;
597}
598
599/**
600 * check_sv - check volume scanning information.
601 * @vol: UBI volume description object
602 * @sv: volume scanning information
603 *
604 * This function returns zero if the volume scanning information is consistent
605 * with the data read from the volume table, and %-EINVAL if not.
606 */
607static int check_sv(const struct ubi_volume *vol,
608 const struct ubi_scan_volume *sv)
609{
610 if (sv->highest_lnum >= vol->reserved_pebs) {
611 dbg_err("bad highest_lnum");
612 goto bad;
613 }
614 if (sv->leb_count > vol->reserved_pebs) {
615 dbg_err("bad leb_count");
616 goto bad;
617 }
618 if (sv->vol_type != vol->vol_type) {
619 dbg_err("bad vol_type");
620 goto bad;
621 }
622 if (sv->used_ebs > vol->reserved_pebs) {
623 dbg_err("bad used_ebs");
624 goto bad;
625 }
626 if (sv->data_pad != vol->data_pad) {
627 dbg_err("bad data_pad");
628 goto bad;
629 }
630 return 0;
631
632bad:
633 ubi_err("bad scanning information");
634 ubi_dbg_dump_sv(sv);
635 ubi_dbg_dump_vol_info(vol);
636 return -EINVAL;
637}
638
639/**
640 * check_scanning_info - check the scanning information.
641 * @ubi: UBI device description object
642 * @si: scanning information
643 *
644 * Even though we protect on-flash data by CRC checksums, we still don't trust
645 * the media. This function ensures that scanning information is consistent
646 * with the information read from the volume table. Returns zero if the scanning
647 * information is OK and %-EINVAL if it is not.
648 */
649static int check_scanning_info(const struct ubi_device *ubi,
650 struct ubi_scan_info *si)
651{
652 int err, i;
653 struct ubi_scan_volume *sv;
654 struct ubi_volume *vol;
655
656 if (si->vols_found > UBI_INT_VOL_COUNT + ubi->vtbl_slots) {
657 ubi_err("scanning found %d volumes, maximum is %d + %d",
658 si->vols_found, UBI_INT_VOL_COUNT, ubi->vtbl_slots);
659 return -EINVAL;
660 }
661
662	if (si->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT &&
663 si->highest_vol_id < UBI_INTERNAL_VOL_START) {
664 ubi_err("too large volume ID %d found by scanning",
665 si->highest_vol_id);
666 return -EINVAL;
667 }
668
669
670 for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) {
671 cond_resched();
672
673 sv = ubi_scan_find_sv(si, i);
674 vol = ubi->volumes[i];
675 if (!vol) {
676 if (sv)
677 ubi_scan_rm_volume(si, sv);
678 continue;
679 }
680
681 if (vol->reserved_pebs == 0) {
682 ubi_assert(i < ubi->vtbl_slots);
683
684 if (!sv)
685 continue;
686
687 /*
688 * During scanning we found a volume which does not
689 * exist according to the information in the volume
690 * table. This must have happened due to an unclean
691 * reboot while the volume was being removed. Discard
692 * these eraseblocks.
693 */
694 ubi_msg("finish volume %d removal", sv->vol_id);
695 ubi_scan_rm_volume(si, sv);
696 } else if (sv) {
697 err = check_sv(vol, sv);
698 if (err)
699 return err;
700 }
701 }
702
703 return 0;
704}
705
706/**
707 * ubi_read_volume_table - read the volume table.
708 *
709 * @ubi: UBI device description object
710 * @si: scanning information
711 *
712 * This function reads the volume table, checks it, recovers from errors if
713 * needed, or creates it if needed. Returns zero in case of success and a
714 * negative error code in case of failure.
715 */
716int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si)
717{
718 int i, err;
719 struct ubi_scan_volume *sv;
720
721 empty_vtbl_record.crc = cpu_to_ubi32(0xf116c36b);
722
723 /*
724 * The number of supported volumes is limited by the eraseblock size
725 * and by the UBI_MAX_VOLUMES constant.
726 */
727 ubi->vtbl_slots = ubi->leb_size / UBI_VTBL_RECORD_SIZE;
728 if (ubi->vtbl_slots > UBI_MAX_VOLUMES)
729 ubi->vtbl_slots = UBI_MAX_VOLUMES;
730
731 ubi->vtbl_size = ubi->vtbl_slots * UBI_VTBL_RECORD_SIZE;
732 ubi->vtbl_size = ALIGN(ubi->vtbl_size, ubi->min_io_size);
733
734 sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID);
735 if (!sv) {
736 /*
737 * No logical eraseblocks belonging to the layout volume were
738		 * found. This could mean that the flash is just empty. In
739		 * this case we create an empty layout volume.
740		 *
741		 * But if the flash is not empty, this must be a corruption or the
742 * MTD device just contains garbage.
743 */
744 if (si->is_empty) {
745 ubi->vtbl = create_empty_lvol(ubi, si);
746 if (IS_ERR(ubi->vtbl))
747 return PTR_ERR(ubi->vtbl);
748 } else {
749 ubi_err("the layout volume was not found");
750 return -EINVAL;
751 }
752 } else {
753 if (sv->leb_count > UBI_LAYOUT_VOLUME_EBS) {
754 /* This must not happen with proper UBI images */
755 dbg_err("too many LEBs (%d) in layout volume",
756 sv->leb_count);
757 return -EINVAL;
758 }
759
760 ubi->vtbl = process_lvol(ubi, si, sv);
761 if (IS_ERR(ubi->vtbl))
762 return PTR_ERR(ubi->vtbl);
763 }
764
765 ubi->avail_pebs = ubi->good_peb_count;
766
767 /*
768 * The layout volume is OK, initialize the corresponding in-RAM data
769 * structures.
770 */
771 err = init_volumes(ubi, si, ubi->vtbl);
772 if (err)
773 goto out_free;
774
775 /*
776	 * Make sure that the scanning information is consistent with the
777 * information stored in the volume table.
778 */
779 err = check_scanning_info(ubi, si);
780 if (err)
781 goto out_free;
782
783 return 0;
784
785out_free:
786 kfree(ubi->vtbl);
787 for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++)
788 if (ubi->volumes[i]) {
789 kfree(ubi->volumes[i]);
790 ubi->volumes[i] = NULL;
791 }
792 return err;
793}
794
795#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
796
797/**
798 * paranoid_vtbl_check - check volume table.
799 * @ubi: UBI device description object
800 */
801static void paranoid_vtbl_check(const struct ubi_device *ubi)
802{
803 if (vtbl_check(ubi, ubi->vtbl)) {
804 ubi_err("paranoid check failed");
805 BUG();
806 }
807}
808
809#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
new file mode 100644
index 000000000000..9ecaf77eca9e
--- /dev/null
+++ b/drivers/mtd/ubi/wl.c
@@ -0,0 +1,1671 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Authors: Artem Bityutskiy (Битюцкий Артём), Thomas Gleixner
19 */
20
21/*
22 * UBI wear-leveling unit.
23 *
24 * This unit is responsible for wear-leveling. It works in terms of physical
25 * eraseblocks and erase counters and knows nothing about logical eraseblocks,
26 * volumes, etc. From this unit's perspective all physical eraseblocks are of
27 * two types - used and free. Used physical eraseblocks are those that were
28 * "get" by the 'ubi_wl_get_peb()' function, and free physical eraseblocks are
29 * those that were put by the 'ubi_wl_put_peb()' function.
30 *
31 * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only erase counter
32 * header. The rest of the physical eraseblock contains only 0xFF bytes.
33 *
34 * When physical eraseblocks are returned to the WL unit by means of the
35 * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is
36 * done asynchronously in context of the per-UBI device background thread,
37 * which is also managed by the WL unit.
38 *
39 * The wear-leveling is ensured by means of moving the contents of used
40 * physical eraseblocks with low erase counter to free physical eraseblocks
41 * with high erase counter.
42 *
43 * The 'ubi_wl_get_peb()' function accepts data type hints which help to pick
44 * an "optimal" physical eraseblock. For example, when it is known that the
45 * physical eraseblock will be "put" soon because it contains short-term data,
46 * the WL unit may pick a free physical eraseblock with low erase counter, and
47 * so forth.
48 *
49 * If the WL unit fails to erase a physical eraseblock, it marks it as bad.
50 *
51 * This unit is also responsible for scrubbing. If a bit-flip is detected in a
52 * physical eraseblock, it has to be moved. Technically this is the same as
53 * moving it for wear-leveling reasons.
54 *
55 * As was said, for the WL unit all physical eraseblocks are either "free"
56 * or "used". Free eraseblocks are kept in the @wl->free RB-tree, while used
57 * eraseblocks are kept in a set of different RB-trees: @wl->used,
58 * @wl->prot.pnum, @wl->prot.aec, and @wl->scrub.
59 *
60 * Note, in this implementation, we keep a small in-RAM object for each physical
61 * eraseblock. This is surely not a scalable solution. But it appears to be good
62 * enough for moderately large flashes and it is simple. In future, one may
63 * re-work this unit and make it more scalable.
64 *
65 * At the moment this unit does not utilize the sequence number, which was
66 * introduced relatively recently. But it would be wise to do this because the
67 * sequence number of a logical eraseblock characterizes how old it is. For
68 * example, when we move a PEB with low erase counter, and we need to pick the
69 * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we
70 * pick a target PEB with an average EC if our PEB is not very "old". There is
71 * room for future re-work of the WL unit here.
72 *
73 * FIXME: looks too complex, should be simplified (later).
74 */
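
As a reading aid, the lifetime of a physical eraseblock through the trees
described above can be sketched as follows (the function names are the ones
defined later in this file):

	free --ubi_wl_get_peb()--> prot.pnum/prot.aec
	     --check_protection_over()--> used (or scrub, after bit-flips)
	     --ubi_wl_put_peb()--> erase queue --erase_worker()--> free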
75
76#include <linux/slab.h>
77#include <linux/crc32.h>
78#include <linux/freezer.h>
79#include <linux/kthread.h>
80#include "ubi.h"
81
82/* Number of physical eraseblocks reserved for wear-leveling purposes */
83#define WL_RESERVED_PEBS 1
84
85/*
86 * For how many erase cycles short term, unknown, and long term physical
87 * eraseblocks are protected.
88 */
89#define ST_PROTECTION 16
90#define U_PROTECTION 10
91#define LT_PROTECTION 4
92
93/*
94 * Maximum difference between two erase counters. If this threshold is
95 * exceeded, the WL unit starts moving data from used physical eraseblocks with
96 * low erase counter to free physical eraseblocks with high erase counter.
97 */
98#define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD
99
100/*
101 * When a physical eraseblock is moved, the WL unit has to pick the target
102 * physical eraseblock to move to. The simplest way would be just to pick the
103 * one with the highest erase counter. But in certain workloads this could lead
104 * to unlimited wear of one or a few physical eraseblocks. Indeed, imagine a
105 * situation when the picked physical eraseblock is constantly erased after the
106 * data is written to it. So, we have a constant which limits the highest erase
107 * counter of the free physical eraseblock to pick. Namely, the WL unit does
108 * not pick eraseblocks with an erase counter greater than the lowest erase
109 * counter plus %WL_FREE_MAX_DIFF.
110 */
111#define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD)
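
A quick worked example, assuming the Kconfig default of
CONFIG_MTD_UBI_WL_THRESHOLD = 4096: if the least worn free physical
eraseblock has erase counter 1000, the WL unit will not hand out a free
physical eraseblock with an erase counter of 1000 + 2*4096 = 9192 or more,
no matter how worn the rest of the free pool is.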
112
113/*
114 * Maximum number of consecutive background thread failures which is enough to
115 * switch to read-only mode.
116 */
117#define WL_MAX_FAILURES 32
118
119/**
120 * struct ubi_wl_entry - wear-leveling entry.
121 * @rb: link in the corresponding RB-tree
122 * @ec: erase counter
123 * @pnum: physical eraseblock number
124 *
125 * Each physical eraseblock has a corresponding &struct wl_entry object which
126 * may be kept in different RB-trees.
127 */
128struct ubi_wl_entry {
129 struct rb_node rb;
130 int ec;
131 int pnum;
132};
133
134/**
135 * struct ubi_wl_prot_entry - PEB protection entry.
136 * @rb_pnum: link in the @wl->prot.pnum RB-tree
137 * @rb_aec: link in the @wl->prot.aec RB-tree
138 * @abs_ec: the absolute erase counter value when the protection ends
139 * @e: the wear-leveling entry of the physical eraseblock under protection
140 *
141 * When the WL unit returns a physical eraseblock, the physical eraseblock is
142 * protected from being moved for some "time". For this reason, the physical
143 * eraseblock is not directly moved from the @wl->free tree to the @wl->used
144 * tree. There is one more tree in between where this physical eraseblock is
145 * temporarily stored (@wl->prot).
146 *
147 * All this protection stuff is needed because:
148 * o we don't want to move physical eraseblocks just after we have given them
149 * to the user; instead, we first want to let users fill them up with data;
150 *
151 * o there is a chance that the user will put the physical eraseblock very
152 * soon, so it makes sense not to move it for some time, but wait; this is
153 * especially important in case of "short term" physical eraseblocks.
154 *
155 * Physical eraseblocks stay protected only for a limited time. But the
156 * "time" is measured in erase cycles in this case. This is implemented with
157 * the help of the absolute erase counter (@wl->abs_ec). When it reaches a
158 * certain value, the physical eraseblocks are moved from the protection
159 * trees (@wl->prot.*) to the @wl->used tree.
160 *
161 * Protected physical eraseblocks are searched by physical eraseblock number
162 * (when they are put) and by the absolute erase counter (to check if it is
163 * time to move them to the @wl->used tree). So there are actually 2 RB-trees
164 * storing the protected physical eraseblocks: @wl->prot.pnum and
165 * @wl->prot.aec. They are referred to as the "protection" trees. The
166 * first one is indexed by the physical eraseblock number. The second one is
167 * indexed by the absolute erase counter. Both trees store
168 * &struct ubi_wl_prot_entry objects.
169 *
170 * Each physical eraseblock has 2 main states: free and used. The former state
171 * corresponds to the @wl->free tree. The latter state is split up into several
172 * sub-states:
173 * o the WL movement is allowed (@wl->used tree);
174 * o the WL movement is temporarily prohibited (@wl->prot.pnum and
175 * @wl->prot.aec trees);
176 * o scrubbing is needed (@wl->scrub tree).
177 *
178 * Depending on the sub-state, wear-leveling entries of the used physical
179 * eraseblocks may be kept in one of those trees.
180 */
181struct ubi_wl_prot_entry {
182 struct rb_node rb_pnum;
183 struct rb_node rb_aec;
184 unsigned long long abs_ec;
185 struct ubi_wl_entry *e;
186};
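
A small numeric walk-through of the protection scheme (the numbers are
hypothetical): if a short-term physical eraseblock is handed out while
@ubi->abs_ec is 100, prot_tree_add() records pe->abs_ec = 100 +
ST_PROTECTION = 116 in the entry; once 16 more erase operations have
completed on the device, check_protection_over() sees pe->abs_ec <=
@ubi->abs_ec and moves the entry to the @wl->used tree.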
187
188/**
189 * struct ubi_work - UBI work description data structure.
190 * @list: a link in the list of pending works
191 * @func: worker function
192 * @priv: private data of the worker function
193 *
194 * @e: physical eraseblock to erase
195 * @torture: if the physical eraseblock has to be tortured
196 *
197 * The @func pointer points to the worker function. If the @cancel argument is
198 * not zero, the worker has to free the resources and exit immediately. The
199 * worker has to return zero in case of success and a negative error code in
200 * case of failure.
201 */
202struct ubi_work {
203 struct list_head list;
204 int (*func)(struct ubi_device *ubi, struct ubi_work *wrk, int cancel);
205 /* The below fields are only relevant to erasure works */
206 struct ubi_wl_entry *e;
207 int torture;
208};
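
The worker contract described above is easiest to see as a stub. A minimal
hedged sketch (example_worker is hypothetical; the real workers are
erase_worker() and wear_leveling_worker() below):

	static int example_worker(struct ubi_device *ubi, struct ubi_work *wrk,
				  int cancel)
	{
		/* The worker owns @wrk and must free it on every path */
		kfree(wrk);
		if (cancel)
			return 0;	/* free resources and exit immediately */

		/* ... do the actual work here ... */
		return 0;		/* zero or a negative error code */
	}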
209
210#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
211static int paranoid_check_ec(const struct ubi_device *ubi, int pnum, int ec);
212static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e,
213 struct rb_root *root);
214#else
215#define paranoid_check_ec(ubi, pnum, ec) 0
216#define paranoid_check_in_wl_tree(e, root)
217#endif
218
219/* Slab cache for wear-leveling entries */
220static struct kmem_cache *wl_entries_slab;
221
222/**
223 * tree_empty - a helper function to check if an RB-tree is empty.
224 * @root: the root of the tree
225 *
226 * This function returns non-zero if the RB-tree is empty and zero if not.
227 */
228static inline int tree_empty(struct rb_root *root)
229{
230 return root->rb_node == NULL;
231}
232
233/**
234 * wl_tree_add - add a wear-leveling entry to a WL RB-tree.
235 * @e: the wear-leveling entry to add
236 * @root: the root of the tree
237 *
238 * Note, we use (erase counter, physical eraseblock number) pairs as keys in
239 * the @ubi->used and @ubi->free RB-trees.
240 */
241static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root)
242{
243 struct rb_node **p, *parent = NULL;
244
245 p = &root->rb_node;
246 while (*p) {
247 struct ubi_wl_entry *e1;
248
249 parent = *p;
250 e1 = rb_entry(parent, struct ubi_wl_entry, rb);
251
252 if (e->ec < e1->ec)
253 p = &(*p)->rb_left;
254 else if (e->ec > e1->ec)
255 p = &(*p)->rb_right;
256 else {
257 ubi_assert(e->pnum != e1->pnum);
258 if (e->pnum < e1->pnum)
259 p = &(*p)->rb_left;
260 else
261 p = &(*p)->rb_right;
262 }
263 }
264
265 rb_link_node(&e->rb, parent, p);
266 rb_insert_color(&e->rb, root);
267}
268
269
270/*
271 * Helper functions to add and delete wear-leveling entries from different
272 * trees.
273 */
274
275static void free_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e)
276{
277 wl_tree_add(e, &ubi->free);
278}
279static inline void used_tree_add(struct ubi_device *ubi,
280 struct ubi_wl_entry *e)
281{
282 wl_tree_add(e, &ubi->used);
283}
284static inline void scrub_tree_add(struct ubi_device *ubi,
285 struct ubi_wl_entry *e)
286{
287 wl_tree_add(e, &ubi->scrub);
288}
289static inline void free_tree_del(struct ubi_device *ubi,
290 struct ubi_wl_entry *e)
291{
292 paranoid_check_in_wl_tree(e, &ubi->free);
293 rb_erase(&e->rb, &ubi->free);
294}
295static inline void used_tree_del(struct ubi_device *ubi,
296 struct ubi_wl_entry *e)
297{
298 paranoid_check_in_wl_tree(e, &ubi->used);
299 rb_erase(&e->rb, &ubi->used);
300}
301static inline void scrub_tree_del(struct ubi_device *ubi,
302 struct ubi_wl_entry *e)
303{
304 paranoid_check_in_wl_tree(e, &ubi->scrub);
305 rb_erase(&e->rb, &ubi->scrub);
306}
307
308/**
309 * do_work - do one pending work.
310 * @ubi: UBI device description object
311 *
312 * This function returns zero in case of success and a negative error code in
313 * case of failure.
314 */
315static int do_work(struct ubi_device *ubi)
316{
317 int err;
318 struct ubi_work *wrk;
319
320 spin_lock(&ubi->wl_lock);
321
322 if (list_empty(&ubi->works)) {
323 spin_unlock(&ubi->wl_lock);
324 return 0;
325 }
326
327 wrk = list_entry(ubi->works.next, struct ubi_work, list);
328 list_del(&wrk->list);
329 spin_unlock(&ubi->wl_lock);
330
331 /*
332 * Call the worker function. Do not touch the work structure
333 * after this call as it will have been freed or reused by that
334 * time by the worker function.
335 */
336 err = wrk->func(ubi, wrk, 0);
337 if (err)
338 ubi_err("work failed with error code %d", err);
339
340 spin_lock(&ubi->wl_lock);
341 ubi->works_count -= 1;
342 ubi_assert(ubi->works_count >= 0);
343 spin_unlock(&ubi->wl_lock);
344 return err;
345}
346
347/**
348 * produce_free_peb - produce a free physical eraseblock.
349 * @ubi: UBI device description object
350 *
351 * This function tries to make a free PEB by means of synchronous execution of
352 * pending works. This may be needed if, for example, the background thread is
353 * disabled. Returns zero in case of success and a negative error code in case
354 * of failure.
355 */
356static int produce_free_peb(struct ubi_device *ubi)
357{
358 int err;
359
360 spin_lock(&ubi->wl_lock);
361 while (tree_empty(&ubi->free)) {
362 spin_unlock(&ubi->wl_lock);
363
364 dbg_wl("do one work synchronously");
365 err = do_work(ubi);
366 if (err)
367 return err;
368
369 spin_lock(&ubi->wl_lock);
370 }
371 spin_unlock(&ubi->wl_lock);
372
373 return 0;
374}
375
376/**
377 * in_wl_tree - check if wear-leveling entry is present in a WL RB-tree.
378 * @e: the wear-leveling entry to check
379 * @root: the root of the tree
380 *
381 * This function returns non-zero if @e is in the @root RB-tree and zero if it
382 * is not.
383 */
384static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root)
385{
386 struct rb_node *p;
387
388 p = root->rb_node;
389 while (p) {
390 struct ubi_wl_entry *e1;
391
392 e1 = rb_entry(p, struct ubi_wl_entry, rb);
393
394 if (e->pnum == e1->pnum) {
395 ubi_assert(e == e1);
396 return 1;
397 }
398
399 if (e->ec < e1->ec)
400 p = p->rb_left;
401 else if (e->ec > e1->ec)
402 p = p->rb_right;
403 else {
404 ubi_assert(e->pnum != e1->pnum);
405 if (e->pnum < e1->pnum)
406 p = p->rb_left;
407 else
408 p = p->rb_right;
409 }
410 }
411
412 return 0;
413}
414
415/**
416 * prot_tree_add - add physical eraseblock to protection trees.
417 * @ubi: UBI device description object
418 * @e: the physical eraseblock to add
419 * @pe: protection entry object to use
420 * @abs_ec: absolute erase counter value when this physical eraseblock has
421 * to be removed from the protection trees.
422 *
423 * @wl->lock has to be locked.
424 */
425static void prot_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e,
426 struct ubi_wl_prot_entry *pe, int abs_ec)
427{
428 struct rb_node **p, *parent = NULL;
429 struct ubi_wl_prot_entry *pe1;
430
431 pe->e = e;
432 pe->abs_ec = ubi->abs_ec + abs_ec;
433
434 p = &ubi->prot.pnum.rb_node;
435 while (*p) {
436 parent = *p;
437 pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_pnum);
438
439 if (e->pnum < pe1->e->pnum)
440 p = &(*p)->rb_left;
441 else
442 p = &(*p)->rb_right;
443 }
444 rb_link_node(&pe->rb_pnum, parent, p);
445 rb_insert_color(&pe->rb_pnum, &ubi->prot.pnum);
446
447 p = &ubi->prot.aec.rb_node;
448 parent = NULL;
449 while (*p) {
450 parent = *p;
451 pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_aec);
452
453 if (pe->abs_ec < pe1->abs_ec)
454 p = &(*p)->rb_left;
455 else
456 p = &(*p)->rb_right;
457 }
458 rb_link_node(&pe->rb_aec, parent, p);
459 rb_insert_color(&pe->rb_aec, &ubi->prot.aec);
460}
461
462/**
463 * find_wl_entry - find wear-leveling entry closest to certain erase counter.
464 * @root: the RB-tree where to look for
465 * @max: highest possible erase counter
466 *
467 * This function looks for a wear leveling entry with erase counter closest to
468 * @max and less than @max.
469 */
470static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)
471{
472 struct rb_node *p;
473 struct ubi_wl_entry *e;
474
475 e = rb_entry(rb_first(root), struct ubi_wl_entry, rb);
476 max += e->ec;
477
478 p = root->rb_node;
479 while (p) {
480 struct ubi_wl_entry *e1;
481
482 e1 = rb_entry(p, struct ubi_wl_entry, rb);
483 if (e1->ec >= max)
484 p = p->rb_left;
485 else {
486 p = p->rb_right;
487 e = e1;
488 }
489 }
490
491 return e;
492}
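
A worked example of the lookup above: if the free tree holds entries with
erase counters {1000, 1005, 9500} and @max is WL_FREE_MAX_DIFF (8192 with
the assumed default threshold of 4096), the bound becomes 1000 + 8192 =
9192, so the function returns the entry with erase counter 1005, the
largest one below the bound.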
493
494/**
495 * ubi_wl_get_peb - get a physical eraseblock.
496 * @ubi: UBI device description object
497 * @dtype: type of data which will be stored in this physical eraseblock
498 *
499 * This function returns a physical eraseblock in case of success and a
500 * negative error code in case of failure. Might sleep.
501 */
502int ubi_wl_get_peb(struct ubi_device *ubi, int dtype)
503{
504 int err, protect, medium_ec;
505 struct ubi_wl_entry *e, *first, *last;
506 struct ubi_wl_prot_entry *pe;
507
508 ubi_assert(dtype == UBI_LONGTERM || dtype == UBI_SHORTTERM ||
509 dtype == UBI_UNKNOWN);
510
511 pe = kmalloc(sizeof(struct ubi_wl_prot_entry), GFP_KERNEL);
512 if (!pe)
513 return -ENOMEM;
514
515retry:
516 spin_lock(&ubi->wl_lock);
517 if (tree_empty(&ubi->free)) {
518 if (ubi->works_count == 0) {
519 ubi_assert(list_empty(&ubi->works));
520 ubi_err("no free eraseblocks");
521 spin_unlock(&ubi->wl_lock);
522 kfree(pe);
523 return -ENOSPC;
524 }
525 spin_unlock(&ubi->wl_lock);
526
527 err = produce_free_peb(ubi);
528 if (err < 0) {
529 kfree(pe);
530 return err;
531 }
532 goto retry;
533 }
534
535 switch (dtype) {
536 case UBI_LONGTERM:
537 /*
538 * For long term data we pick a physical eraseblock
539 * with high erase counter. But the highest erase
540		 * counter we can pick is bounded by the lowest
541 * erase counter plus %WL_FREE_MAX_DIFF.
542 */
543 e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
544 protect = LT_PROTECTION;
545 break;
546 case UBI_UNKNOWN:
547 /*
548 * For unknown data we pick a physical eraseblock with
549		 * medium erase counter. But we by no means can pick a
550		 * physical eraseblock with an erase counter greater
551		 * than or equal to the lowest erase counter plus
552 * %WL_FREE_MAX_DIFF.
553 */
554 first = rb_entry(rb_first(&ubi->free),
555 struct ubi_wl_entry, rb);
556 last = rb_entry(rb_last(&ubi->free),
557 struct ubi_wl_entry, rb);
558
559 if (last->ec - first->ec < WL_FREE_MAX_DIFF)
560 e = rb_entry(ubi->free.rb_node,
561 struct ubi_wl_entry, rb);
562 else {
563 medium_ec = (first->ec + WL_FREE_MAX_DIFF)/2;
564 e = find_wl_entry(&ubi->free, medium_ec);
565 }
566 protect = U_PROTECTION;
567 break;
568 case UBI_SHORTTERM:
569 /*
570 * For short term data we pick a physical eraseblock
571 * with the lowest erase counter as we expect it will
572 * be erased soon.
573 */
574 e = rb_entry(rb_first(&ubi->free),
575 struct ubi_wl_entry, rb);
576 protect = ST_PROTECTION;
577 break;
578 default:
579 protect = 0;
580 e = NULL;
581 BUG();
582 }
583
584 /*
585 * Move the physical eraseblock to the protection trees where it will
586 * be protected from being moved for some time.
587 */
588 free_tree_del(ubi, e);
589 prot_tree_add(ubi, e, pe, protect);
590
591 dbg_wl("PEB %d EC %d, protection %d", e->pnum, e->ec, protect);
592 spin_unlock(&ubi->wl_lock);
593
594 return e->pnum;
595}
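
For context, a hedged sketch of how a caller such as the EBA unit would use
this function together with ubi_wl_put_peb(); error handling is reduced to
the minimum:

	int pnum, err;

	pnum = ubi_wl_get_peb(ubi, UBI_SHORTTERM);
	if (pnum < 0)
		return pnum;	/* e.g. %-ENOSPC or %-ENOMEM */

	/* ... write a VID header and data to PEB pnum via the I/O unit ... */

	err = ubi_wl_put_peb(ubi, pnum, 0);	/* schedule it for erasure */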
596
597/**
598 * prot_tree_del - remove a physical eraseblock from the protection trees
599 * @ubi: UBI device description object
600 * @pnum: the physical eraseblock to remove
601 */
602static void prot_tree_del(struct ubi_device *ubi, int pnum)
603{
604 struct rb_node *p;
605 struct ubi_wl_prot_entry *pe = NULL;
606
607 p = ubi->prot.pnum.rb_node;
608 while (p) {
609
610 pe = rb_entry(p, struct ubi_wl_prot_entry, rb_pnum);
611
612 if (pnum == pe->e->pnum)
613 break;
614
615 if (pnum < pe->e->pnum)
616 p = p->rb_left;
617 else
618 p = p->rb_right;
619 }
620
621 ubi_assert(pe->e->pnum == pnum);
622 rb_erase(&pe->rb_aec, &ubi->prot.aec);
623 rb_erase(&pe->rb_pnum, &ubi->prot.pnum);
624 kfree(pe);
625}
626
627/**
628 * sync_erase - synchronously erase a physical eraseblock.
629 * @ubi: UBI device description object
630 * @e: the physical eraseblock to erase
631 * @torture: if the physical eraseblock has to be tortured
632 *
633 * This function returns zero in case of success and a negative error code in
634 * case of failure.
635 */
636static int sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int torture)
637{
638 int err;
639 struct ubi_ec_hdr *ec_hdr;
640 unsigned long long ec = e->ec;
641
642 dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec);
643
644 err = paranoid_check_ec(ubi, e->pnum, e->ec);
645 if (err > 0)
646 return -EINVAL;
647
648 ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
649 if (!ec_hdr)
650 return -ENOMEM;
651
652 err = ubi_io_sync_erase(ubi, e->pnum, torture);
653 if (err < 0)
654 goto out_free;
655
656 ec += err;
657 if (ec > UBI_MAX_ERASECOUNTER) {
658 /*
659 * Erase counter overflow. Upgrade UBI and use 64-bit
660 * erase counters internally.
661 */
662 ubi_err("erase counter overflow at PEB %d, EC %llu",
663 e->pnum, ec);
664 err = -EINVAL;
665 goto out_free;
666 }
667
668 dbg_wl("erased PEB %d, new EC %llu", e->pnum, ec);
669
670 ec_hdr->ec = cpu_to_ubi64(ec);
671
672 err = ubi_io_write_ec_hdr(ubi, e->pnum, ec_hdr);
673 if (err)
674 goto out_free;
675
676 e->ec = ec;
677 spin_lock(&ubi->wl_lock);
678 if (e->ec > ubi->max_ec)
679 ubi->max_ec = e->ec;
680 spin_unlock(&ubi->wl_lock);
681
682out_free:
683 kfree(ec_hdr);
684 return err;
685}
686
687/**
688 * check_protection_over - check if it is time to stop protecting some
689 * physical eraseblocks.
690 * @ubi: UBI device description object
691 *
692 * This function is called after each erase operation, when the absolute erase
693 * counter is incremented, to check if some physical eraseblocks do not have
694 * to be protected any longer. These physical eraseblocks are moved from the
695 * protection trees to the used tree.
696 */
697static void check_protection_over(struct ubi_device *ubi)
698{
699 struct ubi_wl_prot_entry *pe;
700
701 /*
702	 * There may be several protected physical eraseblocks to remove,
703 * process them all.
704 */
705 while (1) {
706 spin_lock(&ubi->wl_lock);
707 if (tree_empty(&ubi->prot.aec)) {
708 spin_unlock(&ubi->wl_lock);
709 break;
710 }
711
712 pe = rb_entry(rb_first(&ubi->prot.aec),
713 struct ubi_wl_prot_entry, rb_aec);
714
715 if (pe->abs_ec > ubi->abs_ec) {
716 spin_unlock(&ubi->wl_lock);
717 break;
718 }
719
720 dbg_wl("PEB %d protection over, abs_ec %llu, PEB abs_ec %llu",
721 pe->e->pnum, ubi->abs_ec, pe->abs_ec);
722 rb_erase(&pe->rb_aec, &ubi->prot.aec);
723 rb_erase(&pe->rb_pnum, &ubi->prot.pnum);
724 used_tree_add(ubi, pe->e);
725 spin_unlock(&ubi->wl_lock);
726
727 kfree(pe);
728 cond_resched();
729 }
730}
731
732/**
733 * schedule_ubi_work - schedule a work.
734 * @ubi: UBI device description object
735 * @wrk: the work to schedule
736 *
737 * This function enqueues a work defined by @wrk to the tail of the pending
738 * works list.
739 */
740static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
741{
742 spin_lock(&ubi->wl_lock);
743 list_add_tail(&wrk->list, &ubi->works);
744 ubi_assert(ubi->works_count >= 0);
745 ubi->works_count += 1;
746 if (ubi->thread_enabled)
747 wake_up_process(ubi->bgt_thread);
748 spin_unlock(&ubi->wl_lock);
749}
750
751static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
752 int cancel);
753
754/**
755 * schedule_erase - schedule an erase work.
756 * @ubi: UBI device description object
757 * @e: the WL entry of the physical eraseblock to erase
758 * @torture: if the physical eraseblock has to be tortured
759 *
760 * This function returns zero in case of success and a %-ENOMEM in case of
761 * failure.
762 */
763static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
764 int torture)
765{
766 struct ubi_work *wl_wrk;
767
768 dbg_wl("schedule erasure of PEB %d, EC %d, torture %d",
769 e->pnum, e->ec, torture);
770
771 wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_KERNEL);
772 if (!wl_wrk)
773 return -ENOMEM;
774
775 wl_wrk->func = &erase_worker;
776 wl_wrk->e = e;
777 wl_wrk->torture = torture;
778
779 schedule_ubi_work(ubi, wl_wrk);
780 return 0;
781}
782
783/**
784 * wear_leveling_worker - wear-leveling worker function.
785 * @ubi: UBI device description object
786 * @wrk: the work object
787 * @cancel: non-zero if the worker has to free memory and exit
788 *
789 * This function copies a more worn out physical eraseblock to a less worn out
790 * one. Returns zero in case of success and a negative error code in case of
791 * failure.
792 */
793static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
794 int cancel)
795{
796 int err, put = 0;
797 struct ubi_wl_entry *e1, *e2;
798 struct ubi_vid_hdr *vid_hdr;
799
800 kfree(wrk);
801
802 if (cancel)
803 return 0;
804
805 vid_hdr = ubi_zalloc_vid_hdr(ubi);
806 if (!vid_hdr)
807 return -ENOMEM;
808
809 spin_lock(&ubi->wl_lock);
810
811 /*
812	 * Only one WL worker at a time is supported in this implementation, so
813 * make sure a PEB is not being moved already.
814 */
815 if (ubi->move_to || tree_empty(&ubi->free) ||
816 (tree_empty(&ubi->used) && tree_empty(&ubi->scrub))) {
817 /*
818		 * Only one WL worker at a time is supported in this
819		 * implementation, so if a LEB is already being moved, cancel.
820 *
821 * No free physical eraseblocks? Well, we cancel wear-leveling
822 * then. It will be triggered again when a free physical
823 * eraseblock appears.
824 *
825 * No used physical eraseblocks? They must be temporarily
826 * protected from being moved. They will be moved to the
827 * @ubi->used tree later and the wear-leveling will be
828 * triggered again.
829 */
830 dbg_wl("cancel WL, a list is empty: free %d, used %d",
831 tree_empty(&ubi->free), tree_empty(&ubi->used));
832 ubi->wl_scheduled = 0;
833 spin_unlock(&ubi->wl_lock);
834 ubi_free_vid_hdr(ubi, vid_hdr);
835 return 0;
836 }
837
838 if (tree_empty(&ubi->scrub)) {
839 /*
840 * Now pick the least worn-out used physical eraseblock and a
841 * highly worn-out free physical eraseblock. If the erase
842		 * counters differ enough, start wear-leveling.
843 */
844 e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb);
845 e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
846
847 if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) {
848 dbg_wl("no WL needed: min used EC %d, max free EC %d",
849 e1->ec, e2->ec);
850 ubi->wl_scheduled = 0;
851 spin_unlock(&ubi->wl_lock);
852 ubi_free_vid_hdr(ubi, vid_hdr);
853 return 0;
854 }
855 used_tree_del(ubi, e1);
856 dbg_wl("move PEB %d EC %d to PEB %d EC %d",
857 e1->pnum, e1->ec, e2->pnum, e2->ec);
858 } else {
859 e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, rb);
860 e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
861 scrub_tree_del(ubi, e1);
862 dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum);
863 }
864
865 free_tree_del(ubi, e2);
866 ubi_assert(!ubi->move_from && !ubi->move_to);
867 ubi_assert(!ubi->move_to_put && !ubi->move_from_put);
868 ubi->move_from = e1;
869 ubi->move_to = e2;
870 spin_unlock(&ubi->wl_lock);
871
872 /*
873 * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum.
874 * We so far do not know which logical eraseblock our physical
875 * eraseblock (@e1) belongs to. We have to read the volume identifier
876 * header first.
877 */
878
879 err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);
880 if (err && err != UBI_IO_BITFLIPS) {
881 if (err == UBI_IO_PEB_FREE) {
882 /*
883			 * We are trying to move a PEB without a VID header. UBI
884			 * always writes VID headers shortly after the PEB was
885			 * given out, so the holder apparently had no chance to
886			 * write it yet because it was preempted.
887 * Just re-schedule the work, so that next time it will
888 * likely have the VID header in place.
889 */
890 dbg_wl("PEB %d has no VID header", e1->pnum);
891 err = 0;
892 } else {
893 ubi_err("error %d while reading VID header from PEB %d",
894 err, e1->pnum);
895 if (err > 0)
896 err = -EIO;
897 }
898 goto error;
899 }
900
901 err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);
902 if (err) {
903 if (err == UBI_IO_BITFLIPS)
904 err = 0;
905 goto error;
906 }
907
908 ubi_free_vid_hdr(ubi, vid_hdr);
909 spin_lock(&ubi->wl_lock);
910 if (!ubi->move_to_put)
911 used_tree_add(ubi, e2);
912 else
913 put = 1;
914 ubi->move_from = ubi->move_to = NULL;
915 ubi->move_from_put = ubi->move_to_put = 0;
916 ubi->wl_scheduled = 0;
917 spin_unlock(&ubi->wl_lock);
918
919 if (put) {
920 /*
921 * Well, the target PEB was put meanwhile, schedule it for
922 * erasure.
923 */
924 dbg_wl("PEB %d was put meanwhile, erase", e2->pnum);
925 err = schedule_erase(ubi, e2, 0);
926 if (err) {
927 kmem_cache_free(wl_entries_slab, e2);
928 ubi_ro_mode(ubi);
929 }
930 }
931
932 err = schedule_erase(ubi, e1, 0);
933 if (err) {
934 kmem_cache_free(wl_entries_slab, e1);
935 ubi_ro_mode(ubi);
936 }
937
938 dbg_wl("done");
939 return err;
940
941 /*
942	 * Some error occurred. @e1 was not changed, so put it back. @e2
943	 * might have been changed, so schedule it for erasure.
944 */
945error:
946 if (err)
947 dbg_wl("error %d occurred, cancel operation", err);
948 ubi_assert(err <= 0);
949
950 ubi_free_vid_hdr(ubi, vid_hdr);
951 spin_lock(&ubi->wl_lock);
952 ubi->wl_scheduled = 0;
953 if (ubi->move_from_put)
954 put = 1;
955 else
956 used_tree_add(ubi, e1);
957 ubi->move_from = ubi->move_to = NULL;
958 ubi->move_from_put = ubi->move_to_put = 0;
959 spin_unlock(&ubi->wl_lock);
960
961 if (put) {
962 /*
963 * Well, the target PEB was put meanwhile, schedule it for
964 * erasure.
965 */
966 dbg_wl("PEB %d was put meanwhile, erase", e1->pnum);
967 err = schedule_erase(ubi, e1, 0);
968 if (err) {
969 kmem_cache_free(wl_entries_slab, e1);
970 ubi_ro_mode(ubi);
971 }
972 }
973
974 err = schedule_erase(ubi, e2, 0);
975 if (err) {
976 kmem_cache_free(wl_entries_slab, e2);
977 ubi_ro_mode(ubi);
978 }
979
980 yield();
981 return err;
982}
983
984/**
985 * ensure_wear_leveling - schedule wear-leveling if it is needed.
986 * @ubi: UBI device description object
987 *
988 * This function checks if it is time to start wear-leveling and schedules it
989 * if yes. This function returns zero in case of success and a negative error
990 * code in case of failure.
991 */
992static int ensure_wear_leveling(struct ubi_device *ubi)
993{
994 int err = 0;
995 struct ubi_wl_entry *e1;
996 struct ubi_wl_entry *e2;
997 struct ubi_work *wrk;
998
999 spin_lock(&ubi->wl_lock);
1000 if (ubi->wl_scheduled)
1001 /* Wear-leveling is already in the work queue */
1002 goto out_unlock;
1003
1004 /*
1005 * If the ubi->scrub tree is not empty, scrubbing is needed, and the
1006	 * WL worker has to be scheduled anyway.
1007 */
1008 if (tree_empty(&ubi->scrub)) {
1009 if (tree_empty(&ubi->used) || tree_empty(&ubi->free))
1010 /* No physical eraseblocks - no deal */
1011 goto out_unlock;
1012
1013 /*
1014 * We schedule wear-leveling only if the difference between the
1015 * lowest erase counter of used physical eraseblocks and a high
1016		 * erase counter of free physical eraseblocks is greater than
1017 * %UBI_WL_THRESHOLD.
1018 */
1019 e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb);
1020 e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
1021
1022 if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))
1023 goto out_unlock;
1024 dbg_wl("schedule wear-leveling");
1025 } else
1026 dbg_wl("schedule scrubbing");
1027
1028 ubi->wl_scheduled = 1;
1029 spin_unlock(&ubi->wl_lock);
1030
1031 wrk = kmalloc(sizeof(struct ubi_work), GFP_KERNEL);
1032 if (!wrk) {
1033 err = -ENOMEM;
1034 goto out_cancel;
1035 }
1036
1037 wrk->func = &wear_leveling_worker;
1038 schedule_ubi_work(ubi, wrk);
1039 return err;
1040
1041out_cancel:
1042 spin_lock(&ubi->wl_lock);
1043 ubi->wl_scheduled = 0;
1044out_unlock:
1045 spin_unlock(&ubi->wl_lock);
1046 return err;
1047}
1048
1049/**
1050 * erase_worker - physical eraseblock erase worker function.
1051 * @ubi: UBI device description object
1052 * @wl_wrk: the work object
1053 * @cancel: non-zero if the worker has to free memory and exit
1054 *
1055 * This function erases a physical eraseblock and performs torture testing if
1056 * needed. It also takes care of marking the physical eraseblock bad if
1057 * needed. Returns zero in case of success and a negative error code in case of
1058 * failure.
1059 */
1060static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
1061 int cancel)
1062{
1063 int err;
1064 struct ubi_wl_entry *e = wl_wrk->e;
1065 int pnum = e->pnum;
1066
1067 if (cancel) {
1068 dbg_wl("cancel erasure of PEB %d EC %d", pnum, e->ec);
1069 kfree(wl_wrk);
1070 kmem_cache_free(wl_entries_slab, e);
1071 return 0;
1072 }
1073
1074 dbg_wl("erase PEB %d EC %d", pnum, e->ec);
1075
1076 err = sync_erase(ubi, e, wl_wrk->torture);
1077 if (!err) {
1078 /* Fine, we've erased it successfully */
1079 kfree(wl_wrk);
1080
1081 spin_lock(&ubi->wl_lock);
1082 ubi->abs_ec += 1;
1083 free_tree_add(ubi, e);
1084 spin_unlock(&ubi->wl_lock);
1085
1086 /*
1087		 * One more erase operation has happened, take care of protected
1088 * physical eraseblocks.
1089 */
1090 check_protection_over(ubi);
1091
1092 /* And take care about wear-leveling */
1093 err = ensure_wear_leveling(ubi);
1094 return err;
1095 }
1096
1097 kfree(wl_wrk);
1098 kmem_cache_free(wl_entries_slab, e);
1099
1100 if (err != -EIO) {
1101 /*
1102 * If this is not %-EIO, we have no idea what to do. Scheduling
1103 * this physical eraseblock for erasure again would cause
1104		 * errors again and again. Well, let's switch to RO mode.
1105 */
1106 ubi_ro_mode(ubi);
1107 return err;
1108 }
1109
1110 /* It is %-EIO, the PEB went bad */
1111
1112 if (!ubi->bad_allowed) {
1113 ubi_err("bad physical eraseblock %d detected", pnum);
1114 ubi_ro_mode(ubi);
1115 err = -EIO;
1116 } else {
1117 int need;
1118
1119 spin_lock(&ubi->volumes_lock);
1120 need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs + 1;
1121 if (need > 0) {
1122 need = ubi->avail_pebs >= need ? need : ubi->avail_pebs;
1123 ubi->avail_pebs -= need;
1124 ubi->rsvd_pebs += need;
1125 ubi->beb_rsvd_pebs += need;
1126 if (need > 0)
1127				ubi_msg("reserve %d more PEBs", need);
1128 }
1129
1130 if (ubi->beb_rsvd_pebs == 0) {
1131 spin_unlock(&ubi->volumes_lock);
1132 ubi_err("no reserved physical eraseblocks");
1133 ubi_ro_mode(ubi);
1134 return -EIO;
1135 }
1136
1137 spin_unlock(&ubi->volumes_lock);
1138 ubi_msg("mark PEB %d as bad", pnum);
1139
1140 err = ubi_io_mark_bad(ubi, pnum);
1141 if (err) {
1142 ubi_ro_mode(ubi);
1143 return err;
1144 }
1145
1146 spin_lock(&ubi->volumes_lock);
1147 ubi->beb_rsvd_pebs -= 1;
1148 ubi->bad_peb_count += 1;
1149 ubi->good_peb_count -= 1;
1150 ubi_calculate_reserved(ubi);
1151 if (ubi->beb_rsvd_pebs == 0)
1152 ubi_warn("last PEB from the reserved pool was used");
1153 spin_unlock(&ubi->volumes_lock);
1154 }
1155
1156 return err;
1157}
1158
1159/**
1160 * ubi_wl_put_peb - return a physical eraseblock to the wear-leveling
1161 * unit.
1162 * @ubi: UBI device description object
1163 * @pnum: physical eraseblock to return
1164 * @torture: if this physical eraseblock has to be tortured
1165 *
1166 * This function is called to return physical eraseblock @pnum to the pool of
1167 * free physical eraseblocks. The @torture flag has to be set if an I/O error
1168 * occurred to this @pnum and it has to be tested. This function returns zero
1169 * in case of success and a negative error code in case of failure.
1170 */
1171int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture)
1172{
1173 int err;
1174 struct ubi_wl_entry *e;
1175
1176 dbg_wl("PEB %d", pnum);
1177 ubi_assert(pnum >= 0);
1178 ubi_assert(pnum < ubi->peb_count);
1179
1180 spin_lock(&ubi->wl_lock);
1181
1182 e = ubi->lookuptbl[pnum];
1183 if (e == ubi->move_from) {
1184 /*
1185 * User is putting the physical eraseblock which was selected to
1186 * be moved. It will be scheduled for erasure in the
1187 * wear-leveling worker.
1188 */
1189 dbg_wl("PEB %d is being moved", pnum);
1190 ubi_assert(!ubi->move_from_put);
1191 ubi->move_from_put = 1;
1192 spin_unlock(&ubi->wl_lock);
1193 return 0;
1194 } else if (e == ubi->move_to) {
1195 /*
1196 * User is putting the physical eraseblock which was selected
1197		 * as the target the data is moved to. It may happen if the EBA
1198		 * unit has already re-mapped the LEB but the WL unit has not
1199		 * yet put the PEB to the "used" tree.
1200 */
1201 dbg_wl("PEB %d is the target of data moving", pnum);
1202 ubi_assert(!ubi->move_to_put);
1203 ubi->move_to_put = 1;
1204 spin_unlock(&ubi->wl_lock);
1205 return 0;
1206 } else {
1207 if (in_wl_tree(e, &ubi->used))
1208 used_tree_del(ubi, e);
1209 else if (in_wl_tree(e, &ubi->scrub))
1210 scrub_tree_del(ubi, e);
1211 else
1212 prot_tree_del(ubi, e->pnum);
1213 }
1214 spin_unlock(&ubi->wl_lock);
1215
1216 err = schedule_erase(ubi, e, torture);
1217 if (err) {
1218 spin_lock(&ubi->wl_lock);
1219 used_tree_add(ubi, e);
1220 spin_unlock(&ubi->wl_lock);
1221 }
1222
1223 return err;
1224}
1225
1226/**
1227 * ubi_wl_scrub_peb - schedule a physical eraseblock for scrubbing.
1228 * @ubi: UBI device description object
1229 * @pnum: the physical eraseblock to schedule
1230 *
1231 * If a bit-flip in a physical eraseblock is detected, this physical eraseblock
1232 * needs scrubbing. This function schedules a physical eraseblock for
1233 * scrubbing, which is done in the background. It returns zero in case of
1234 * success and a negative error code in case of failure.
1235 */
1236int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum)
1237{
1238 struct ubi_wl_entry *e;
1239
1240 ubi_msg("schedule PEB %d for scrubbing", pnum);
1241
1242retry:
1243 spin_lock(&ubi->wl_lock);
1244 e = ubi->lookuptbl[pnum];
1245 if (e == ubi->move_from || in_wl_tree(e, &ubi->scrub)) {
1246 spin_unlock(&ubi->wl_lock);
1247 return 0;
1248 }
1249
1250 if (e == ubi->move_to) {
1251 /*
1252 * This physical eraseblock was used to move data to. The data
1253 * was moved but the PEB was not yet inserted to the proper
1254 * tree. We should just wait a little and let the WL worker
1255 * proceed.
1256 */
1257 spin_unlock(&ubi->wl_lock);
1258 dbg_wl("the PEB %d is not in proper tree, retry", pnum);
1259 yield();
1260 goto retry;
1261 }
1262
1263 if (in_wl_tree(e, &ubi->used))
1264 used_tree_del(ubi, e);
1265 else
1266 prot_tree_del(ubi, pnum);
1267
1268 scrub_tree_add(ubi, e);
1269 spin_unlock(&ubi->wl_lock);
1270
1271 /*
1272 * Technically scrubbing is the same as wear-leveling, so it is done
1273 * by the WL worker.
1274 */
1275 return ensure_wear_leveling(ubi);
1276}
1277
1278/**
1279 * ubi_wl_flush - flush all pending works.
1280 * @ubi: UBI device description object
1281 *
1282 * This function returns zero in case of success and a negative error code in
1283 * case of failure.
1284 */
1285int ubi_wl_flush(struct ubi_device *ubi)
1286{
1287 int err, pending_count;
1288
1289 pending_count = ubi->works_count;
1290
1291 dbg_wl("flush (%d pending works)", pending_count);
1292
1293 /*
1294	 * Erase while the pending works queue is not empty, but not more than
1295 * the number of currently pending works.
1296 */
1297 while (pending_count-- > 0) {
1298 err = do_work(ubi);
1299 if (err)
1300 return err;
1301 }
1302
1303 return 0;
1304}
1305
1306/**
1307 * tree_destroy - destroy an RB-tree.
1308 * @root: the root of the tree to destroy
1309 */
1310static void tree_destroy(struct rb_root *root)
1311{
1312 struct rb_node *rb;
1313 struct ubi_wl_entry *e;
1314
1315 rb = root->rb_node;
1316 while (rb) {
1317 if (rb->rb_left)
1318 rb = rb->rb_left;
1319 else if (rb->rb_right)
1320 rb = rb->rb_right;
1321 else {
1322 e = rb_entry(rb, struct ubi_wl_entry, rb);
1323
1324 rb = rb_parent(rb);
1325 if (rb) {
1326 if (rb->rb_left == &e->rb)
1327 rb->rb_left = NULL;
1328 else
1329 rb->rb_right = NULL;
1330 }
1331
1332 kmem_cache_free(wl_entries_slab, e);
1333 }
1334 }
1335}
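
tree_destroy() avoids recursion (kernel stacks are small) by descending to a leaf, freeing it, and clearing the parent's pointer to it so the walk never revisits a freed node. The same idea on a plain binary tree, as a self-contained sketch; the node layout and function names are hypothetical, not the kernel's rb-tree API.

#include <stdlib.h>

struct node {
	struct node *left, *right, *parent;
};

static void destroy(struct node *n)
{
	while (n) {
		if (n->left)
			n = n->left;
		else if (n->right)
			n = n->right;
		else {
			struct node *p = n->parent;

			if (p) {	/* unlink so we never come back here */
				if (p->left == n)
					p->left = NULL;
				else
					p->right = NULL;
			}
			free(n);
			n = p;		/* continue from the parent */
		}
	}
}

int main(void)
{
	struct node *r = calloc(1, sizeof(*r));
	struct node *l = calloc(1, sizeof(*l));

	r->left = l;
	l->parent = r;
	destroy(r);
	return 0;
}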
1336
1337/**
1338 * ubi_thread - UBI background thread.
1339 * @u: the UBI device description object pointer
1340 */
1341static int ubi_thread(void *u)
1342{
1343 int failures = 0;
1344 struct ubi_device *ubi = u;
1345
1346 ubi_msg("background thread \"%s\" started, PID %d",
1347 ubi->bgt_name, current->pid);
1348
1349 for (;;) {
1350 int err;
1351
1352 if (kthread_should_stop())
1353 goto out;
1354
1355 if (try_to_freeze())
1356 continue;
1357
1358 spin_lock(&ubi->wl_lock);
1359 if (list_empty(&ubi->works) || ubi->ro_mode ||
1360 !ubi->thread_enabled) {
1361 set_current_state(TASK_INTERRUPTIBLE);
1362 spin_unlock(&ubi->wl_lock);
1363 schedule();
1364 continue;
1365 }
1366 spin_unlock(&ubi->wl_lock);
1367
1368 err = do_work(ubi);
1369 if (err) {
1370 ubi_err("%s: work failed with error code %d",
1371 ubi->bgt_name, err);
1372 if (failures++ > WL_MAX_FAILURES) {
1373 /*
1374 * Too many failures, disable the thread and
1375 * switch to read-only mode.
1376 */
1377 ubi_msg("%s: %d consecutive failures",
1378 ubi->bgt_name, WL_MAX_FAILURES);
1379 ubi_ro_mode(ubi);
1380 break;
1381 }
1382 } else
1383 failures = 0;
1384
1385 cond_resched();
1386 }
1387
1388out:
1389 dbg_wl("background thread \"%s\" is killed", ubi->bgt_name);
1390 return 0;
1391}
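
The thread only gives up after WL_MAX_FAILURES consecutive errors, and any success resets the counter. The sketch below isolates that failure-counting policy; MAX_FAILURES and do_one() are illustrative, and the threshold here is deliberately tiny.

#include <stdio.h>

#define MAX_FAILURES 4

static int do_one(int i)		/* succeeds at first, then always fails */
{
	return i < 3 ? 0 : -1;
}

int main(void)
{
	int i, failures = 0;

	for (i = 0; i < 100; i++) {
		if (do_one(i)) {
			if (failures++ > MAX_FAILURES) {
				puts("too many consecutive failures, giving up");
				return 1;
			}
		} else
			failures = 0;	/* a success resets the streak */
	}
	return 0;
}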
1392
1393/**
1394 * cancel_pending - cancel all pending works.
1395 * @ubi: UBI device description object
1396 */
1397static void cancel_pending(struct ubi_device *ubi)
1398{
1399 while (!list_empty(&ubi->works)) {
1400 struct ubi_work *wrk;
1401
1402 wrk = list_entry(ubi->works.next, struct ubi_work, list);
1403 list_del(&wrk->list);
1404 wrk->func(ubi, wrk, 1);
1405 ubi->works_count -= 1;
1406 ubi_assert(ubi->works_count >= 0);
1407 }
1408}
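
Each queued work is invoked one final time with the cancel flag set, so the callback can release its resources instead of performing the work. A stripped-down sketch of that callback contract; the work structure and list handling here are simplified stand-ins.

#include <stdio.h>
#include <stdlib.h>

struct work {
	struct work *next;
	void (*func)(struct work *w, int cancel);
};

static void erase_work(struct work *w, int cancel)
{
	if (cancel)
		puts("cancelled: freeing resources only");
	else
		puts("running the erase");
	free(w);
}

static void cancel_all(struct work **head)
{
	while (*head) {
		struct work *w = *head;

		*head = w->next;
		w->func(w, 1);		/* 1 == cancel, do not do the work */
	}
}

int main(void)
{
	struct work *w = malloc(sizeof(*w));

	w->next = NULL;
	w->func = erase_work;
	cancel_all(&w);
	return 0;
}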
1409
1410/**
1411 * ubi_wl_init_scan - initialize the wear-leveling unit using scanning
1412 * information.
1413 * @ubi: UBI device description object
1414 * @si: scanning information
1415 *
1416 * This function returns zero in case of success, and a negative error code in
1417 * case of failure.
1418 */
1419int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
1420{
1421 int err;
1422 struct rb_node *rb1, *rb2;
1423 struct ubi_scan_volume *sv;
1424 struct ubi_scan_leb *seb, *tmp;
1425 struct ubi_wl_entry *e;
1426
1427
1428 ubi->used = ubi->free = ubi->scrub = RB_ROOT;
1429 ubi->prot.pnum = ubi->prot.aec = RB_ROOT;
1430 spin_lock_init(&ubi->wl_lock);
1431 ubi->max_ec = si->max_ec;
1432 INIT_LIST_HEAD(&ubi->works);
1433
1434 sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num);
1435
1436 ubi->bgt_thread = kthread_create(ubi_thread, ubi, ubi->bgt_name);
1437 if (IS_ERR(ubi->bgt_thread)) {
1438 err = PTR_ERR(ubi->bgt_thread);
1439 ubi_err("cannot spawn \"%s\", error %d", ubi->bgt_name,
1440 err);
1441 return err;
1442 }
1443
1444 if (ubi_devices_cnt == 0) {
1445 wl_entries_slab = kmem_cache_create("ubi_wl_entry_slab",
1446 sizeof(struct ubi_wl_entry),
1447 0, 0, NULL, NULL);
1448 if (!wl_entries_slab)
1449 return -ENOMEM;
1450 }
1451
1452 err = -ENOMEM;
1453 ubi->lookuptbl = kzalloc(ubi->peb_count * sizeof(void *), GFP_KERNEL);
1454 if (!ubi->lookuptbl)
1455 goto out_free;
1456
1457 list_for_each_entry_safe(seb, tmp, &si->erase, u.list) {
1458 cond_resched();
1459
1460 e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL);
1461 if (!e)
1462 goto out_free;
1463
1464 e->pnum = seb->pnum;
1465 e->ec = seb->ec;
1466 ubi->lookuptbl[e->pnum] = e;
1467 if (schedule_erase(ubi, e, 0)) {
1468 kmem_cache_free(wl_entries_slab, e);
1469 goto out_free;
1470 }
1471 }
1472
1473 list_for_each_entry(seb, &si->free, u.list) {
1474 cond_resched();
1475
1476 e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL);
1477 if (!e)
1478 goto out_free;
1479
1480 e->pnum = seb->pnum;
1481 e->ec = seb->ec;
1482 ubi_assert(e->ec >= 0);
1483 free_tree_add(ubi, e);
1484 ubi->lookuptbl[e->pnum] = e;
1485 }
1486
1487 list_for_each_entry(seb, &si->corr, u.list) {
1488 cond_resched();
1489
1490 e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL);
1491 if (!e)
1492 goto out_free;
1493
1494 e->pnum = seb->pnum;
1495 e->ec = seb->ec;
1496 ubi->lookuptbl[e->pnum] = e;
1497 if (schedule_erase(ubi, e, 0)) {
1498 kmem_cache_free(wl_entries_slab, e);
1499 goto out_free;
1500 }
1501 }
1502
1503 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
1504 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) {
1505 cond_resched();
1506
1507 e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL);
1508 if (!e)
1509 goto out_free;
1510
1511 e->pnum = seb->pnum;
1512 e->ec = seb->ec;
1513 ubi->lookuptbl[e->pnum] = e;
1514 if (!seb->scrub) {
1515 dbg_wl("add PEB %d EC %d to the used tree",
1516 e->pnum, e->ec);
1517 used_tree_add(ubi, e);
1518 } else {
1519 dbg_wl("add PEB %d EC %d to the scrub tree",
1520 e->pnum, e->ec);
1521 scrub_tree_add(ubi, e);
1522 }
1523 }
1524 }
1525
1526	if (WL_RESERVED_PEBS > ubi->avail_pebs) {
1527		ubi_err("not enough physical eraseblocks (%d, need %d)",
1528			ubi->avail_pebs, WL_RESERVED_PEBS);
1529		err = -ENOSPC;
1530		goto out_free;
1531	}
1531 ubi->avail_pebs -= WL_RESERVED_PEBS;
1532 ubi->rsvd_pebs += WL_RESERVED_PEBS;
1533
1534 /* Schedule wear-leveling if needed */
1535 err = ensure_wear_leveling(ubi);
1536 if (err)
1537 goto out_free;
1538
1539 return 0;
1540
1541out_free:
1542 cancel_pending(ubi);
1543 tree_destroy(&ubi->used);
1544 tree_destroy(&ubi->free);
1545 tree_destroy(&ubi->scrub);
1546 kfree(ubi->lookuptbl);
1547 if (ubi_devices_cnt == 0)
1548 kmem_cache_destroy(wl_entries_slab);
1549 return err;
1550}
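
Since PEB numbers are small, dense integers, lookuptbl is simply an array of entry pointers indexed by pnum, giving O(1) lookup alongside the O(log n) RB-trees. A minimal sketch of that direct-index bookkeeping; the types here are illustrative.

#include <stdio.h>
#include <stdlib.h>

struct entry {
	int pnum;
	int ec;				/* erase counter */
};

int main(void)
{
	const int peb_count = 8;
	struct entry **lookup = calloc(peb_count, sizeof(*lookup));
	struct entry *e = malloc(sizeof(*e));

	e->pnum = 5;
	e->ec = 42;
	lookup[e->pnum] = e;		/* O(1) access by PEB number */

	printf("PEB 5 has EC %d\n", lookup[5]->ec);
	free(e);
	free(lookup);
	return 0;
}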
1551
1552/**
1553 * protection_trees_destroy - destroy the protection RB-trees.
1554 * @ubi: UBI device description object
1555 */
1556static void protection_trees_destroy(struct ubi_device *ubi)
1557{
1558 struct rb_node *rb;
1559 struct ubi_wl_prot_entry *pe;
1560
1561 rb = ubi->prot.aec.rb_node;
1562 while (rb) {
1563 if (rb->rb_left)
1564 rb = rb->rb_left;
1565 else if (rb->rb_right)
1566 rb = rb->rb_right;
1567 else {
1568 pe = rb_entry(rb, struct ubi_wl_prot_entry, rb_aec);
1569
1570 rb = rb_parent(rb);
1571 if (rb) {
1572 if (rb->rb_left == &pe->rb_aec)
1573 rb->rb_left = NULL;
1574 else
1575 rb->rb_right = NULL;
1576 }
1577
1578 kmem_cache_free(wl_entries_slab, pe->e);
1579 kfree(pe);
1580 }
1581 }
1582}
1583
1584/**
1585 * ubi_wl_close - close the wear-leveling unit.
1586 * @ubi: UBI device description object
1587 */
1588void ubi_wl_close(struct ubi_device *ubi)
1589{
1590 dbg_wl("disable \"%s\"", ubi->bgt_name);
1591 if (ubi->bgt_thread)
1592 kthread_stop(ubi->bgt_thread);
1593
1594 dbg_wl("close the UBI wear-leveling unit");
1595
1596 cancel_pending(ubi);
1597 protection_trees_destroy(ubi);
1598 tree_destroy(&ubi->used);
1599 tree_destroy(&ubi->free);
1600 tree_destroy(&ubi->scrub);
1601 kfree(ubi->lookuptbl);
1602 if (ubi_devices_cnt == 1)
1603 kmem_cache_destroy(wl_entries_slab);
1604}
1605
1606#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
1607
1608/**
1609 * paranoid_check_ec - make sure that the erase counter of a physical eraseblock
1610 * is correct.
1611 * @ubi: UBI device description object
1612 * @pnum: the physical eraseblock number to check
1613 * @ec: the erase counter to check
1614 *
1615 * This function returns zero if the erase counter of physical eraseblock @pnum
1616 * is equal to @ec, %1 if not, and a negative error code if an error
1617 * occurred.
1618 */
1619static int paranoid_check_ec(const struct ubi_device *ubi, int pnum, int ec)
1620{
1621 int err;
1622 long long read_ec;
1623 struct ubi_ec_hdr *ec_hdr;
1624
1625 ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
1626 if (!ec_hdr)
1627 return -ENOMEM;
1628
1629 err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0);
1630 if (err && err != UBI_IO_BITFLIPS) {
1631 /* The header does not have to exist */
1632 err = 0;
1633 goto out_free;
1634 }
1635
1636 read_ec = ubi64_to_cpu(ec_hdr->ec);
1637 if (ec != read_ec) {
1638 ubi_err("paranoid check failed for PEB %d", pnum);
1639 ubi_err("read EC is %lld, should be %d", read_ec, ec);
1640 ubi_dbg_dump_stack();
1641 err = 1;
1642 } else
1643 err = 0;
1644
1645out_free:
1646 kfree(ec_hdr);
1647 return err;
1648}
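
Paranoid checks follow a three-way return convention: zero when the on-flash value matches, %1 when it does not, and a negative error code when the check itself could not be performed. A sketch of that convention in plain C; read_stored_ec() is a made-up stand-in for reading the EC header.

#include <stdio.h>

static int read_stored_ec(int pnum, long long *ec)	/* hypothetical */
{
	(void)pnum;
	*ec = 42;
	return 0;
}

static int check_ec(int pnum, long long expected)
{
	long long read_ec;
	int err = read_stored_ec(pnum, &read_ec);

	if (err)
		return err;		/* negative: could not check */
	if (read_ec != expected) {
		fprintf(stderr, "EC mismatch on PEB %d: %lld != %lld\n",
			pnum, read_ec, expected);
		return 1;		/* positive: the check failed */
	}
	return 0;			/* zero: all good */
}

int main(void)
{
	return check_ec(3, 42);
}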
1649
1650/**
1651 * paranoid_check_in_wl_tree - make sure that a wear-leveling entry is present
1652 * in a WL RB-tree.
1653 * @e: the wear-leveling entry to check
1654 * @root: the root of the tree
1655 *
1656 * This function returns zero if @e is in the @root RB-tree and %1 if it
1657 * is not.
1658 */
1659static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e,
1660 struct rb_root *root)
1661{
1662 if (in_wl_tree(e, root))
1663 return 0;
1664
1665	ubi_err("paranoid check failed for PEB %d, EC %d, RB-tree %p",
1666 e->pnum, e->ec, root);
1667 ubi_dbg_dump_stack();
1668 return 1;
1669}
1670
1671#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index abb90c0c09cc..8a649f602767 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -672,6 +672,13 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
672 return ret; 672 return ret;
673 } 673 }
674 674
675 /* and an UBI volume */
676 if (jffs2_ubivol(c)) {
677 ret = jffs2_ubivol_setup(c);
678 if (ret)
679 return ret;
680 }
681
675 return ret; 682 return ret;
676} 683}
677 684
@@ -690,4 +697,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
690 if (jffs2_nor_wbuf_flash(c)) { 697 if (jffs2_nor_wbuf_flash(c)) {
691 jffs2_nor_wbuf_flash_cleanup(c); 698 jffs2_nor_wbuf_flash_cleanup(c);
692 } 699 }
700
701 /* and an UBI volume */
702 if (jffs2_ubivol(c)) {
703 jffs2_ubivol_cleanup(c);
704 }
693} 705}
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index e07a0edcdb4f..8d92e45168ca 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -98,6 +98,9 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
98#define jffs2_nor_wbuf_flash(c) (0) 98#define jffs2_nor_wbuf_flash(c) (0)
99#define jffs2_nor_wbuf_flash_setup(c) (0) 99#define jffs2_nor_wbuf_flash_setup(c) (0)
100#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) 100#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
101#define jffs2_ubivol(c) (0)
102#define jffs2_ubivol_setup(c) (0)
103#define jffs2_ubivol_cleanup(c) do {} while (0)
101 104
102#else /* NAND and/or ECC'd NOR support present */ 105#else /* NAND and/or ECC'd NOR support present */
103 106
@@ -133,6 +136,9 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
133#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) 136#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
134int jffs2_dataflash_setup(struct jffs2_sb_info *c); 137int jffs2_dataflash_setup(struct jffs2_sb_info *c);
135void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); 138void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
139#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME)
140int jffs2_ubivol_setup(struct jffs2_sb_info *c);
141void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
136 142
137#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) 143#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
138int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); 144int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4fac6dd53954..ab86031b3c07 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1208,3 +1208,27 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1208void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { 1208void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
1209 kfree(c->wbuf); 1209 kfree(c->wbuf);
1210} 1210}
1211
1212int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
1213 c->cleanmarker_size = 0;
1214
1215 if (c->mtd->writesize == 1)
1216 /* We do not need a write-buffer */
1217 return 0;
1218
1219 init_rwsem(&c->wbuf_sem);
1220
1221 c->wbuf_pagesize = c->mtd->writesize;
1222 c->wbuf_ofs = 0xFFFFFFFF;
1223 c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
1224 if (!c->wbuf)
1225 return -ENOMEM;
1226
1227 printk(KERN_INFO "JFFS2 write-buffering enabled: buffer %d, erasesize %d\n", c->wbuf_pagesize, c->sector_size);
1228
1229 return 0;
1230}
1231
1232void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) {
1233 kfree(c->wbuf);
1234}
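
The setup hook above keys everything off mtd->writesize: a UBI volume with a one-byte write unit behaves like NOR flash and needs no write-buffer, otherwise JFFS2 allocates a buffer of exactly one write unit. The decision in isolation, as a small self-contained sketch; the flash structure and alloc_wbuf() are illustrative.

#include <stdio.h>
#include <stdlib.h>

struct flash {
	int writesize;			/* smallest programmable unit, bytes */
};

static char *alloc_wbuf(const struct flash *f)
{
	if (f->writesize == 1)
		return NULL;		/* byte-writable: no buffering needed */
	return malloc(f->writesize);	/* buffer exactly one write unit */
}

int main(void)
{
	struct flash nor = { 1 }, nand = { 2048 };
	char *b = alloc_wbuf(&nand);

	printf("nor needs wbuf: %s\n", alloc_wbuf(&nor) ? "yes" : "no");
	printf("nand needs wbuf: %s\n", b ? "yes" : "no");
	free(b);
	return 0;
}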
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..a0c8667caa72 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h>
30 31
31#define MLOG_MASK_PREFIX ML_DISK_ALLOC 32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
32#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -34,6 +35,7 @@
34#include "ocfs2.h" 35#include "ocfs2.h"
35 36
36#include "alloc.h" 37#include "alloc.h"
38#include "aops.h"
37#include "dlmglue.h" 39#include "dlmglue.h"
38#include "extent_map.h" 40#include "extent_map.h"
39#include "inode.h" 41#include "inode.h"
@@ -47,63 +49,243 @@
47 49
48#include "buffer_head_io.h" 50#include "buffer_head_io.h"
49 51
50static int ocfs2_extent_contig(struct inode *inode, 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53 53
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 54/*
55 handle_t *handle, 55 * Structures which describe a path through a btree, and functions to
56 struct inode *inode, 56 * manipulate them.
57 int wanted, 57 *
58 struct ocfs2_alloc_context *meta_ac, 58 * The idea here is to be as generic as possible with the tree
59 struct buffer_head *bhs[]); 59 * manipulation code.
60 */
61struct ocfs2_path_item {
62 struct buffer_head *bh;
63 struct ocfs2_extent_list *el;
64};
60 65
61static int ocfs2_add_branch(struct ocfs2_super *osb, 66#define OCFS2_MAX_PATH_DEPTH 5
62 handle_t *handle,
63 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68 67
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 68struct ocfs2_path {
70 handle_t *handle, 69 int p_tree_depth;
71 struct inode *inode, 70 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
72 struct buffer_head *fe_bh, 71};
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75 72
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb, 73#define path_root_bh(_path) ((_path)->p_node[0].bh)
77 handle_t *handle, 74#define path_root_el(_path) ((_path)->p_node[0].el)
78 struct inode *inode, 75#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
79 struct buffer_head *fe_bh, 76#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
80 u64 blkno, 77#define path_num_items(_path) ((_path)->p_tree_depth + 1)
81 u32 new_clusters);
82 78
83static int ocfs2_find_branch_target(struct ocfs2_super *osb, 79/*
84 struct inode *inode, 80 * Reset the actual path elements so that we can re-use the structure
85 struct buffer_head *fe_bh, 81 * to build another path. Generally, this involves freeing the buffer
86 struct buffer_head **target_bh); 82 * heads.
83 */
84static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
85{
86 int i, start = 0, depth = 0;
87 struct ocfs2_path_item *node;
87 88
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, 89 if (keep_root)
89 struct inode *inode, 90 start = 1;
90 struct ocfs2_dinode *fe, 91
91 unsigned int new_i_clusters, 92 for(i = start; i < path_num_items(path); i++) {
92 struct buffer_head *old_last_eb, 93 node = &path->p_node[i];
93 struct buffer_head **new_last_eb); 94
95 brelse(node->bh);
96 node->bh = NULL;
97 node->el = NULL;
98 }
99
100 /*
101 * Tree depth may change during truncate, or insert. If we're
102 * keeping the root extent list, then make sure that our path
103 * structure reflects the proper depth.
104 */
105 if (keep_root)
106 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
107
108 path->p_tree_depth = depth;
109}
110
111static void ocfs2_free_path(struct ocfs2_path *path)
112{
113 if (path) {
114 ocfs2_reinit_path(path, 0);
115 kfree(path);
116 }
117}
118
119/*
120 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only.
122 */
123static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
124{
125 int i;
126
127 BUG_ON(path_root_bh(dest) != path_root_bh(src));
128
129 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
130 brelse(dest->p_node[i].bh);
131
132 dest->p_node[i].bh = src->p_node[i].bh;
133 dest->p_node[i].el = src->p_node[i].el;
134
135 src->p_node[i].bh = NULL;
136 src->p_node[i].el = NULL;
137 }
138}
139
140/*
141 * Insert an extent block at given index.
142 *
143 * This will not take an additional reference on eb_bh.
144 */
145static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
146 struct buffer_head *eb_bh)
147{
148 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
149
150 /*
151 * Right now, no root bh is an extent block, so this helps
152 * catch code errors with dinode trees. The assertion can be
153 * safely removed if we ever need to insert extent block
154 * structures at the root.
155 */
156 BUG_ON(index == 0);
157
158 path->p_node[index].bh = eb_bh;
159 path->p_node[index].el = &eb->h_list;
160}
161
162static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
163 struct ocfs2_extent_list *root_el)
164{
165 struct ocfs2_path *path;
166
167 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
168
169 path = kzalloc(sizeof(*path), GFP_NOFS);
170 if (path) {
171 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
172 get_bh(root_bh);
173 path_root_bh(path) = root_bh;
174 path_root_el(path) = root_el;
175 }
176
177 return path;
178}
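
An ocfs2_path is just a fixed array of (buffer, extent list) pairs from the root down to a leaf, so the leaf is always p_node[p_tree_depth]. A compact sketch of the same shape with the fields simplified; the names here are illustrative, not the ocfs2 types.

#include <stdio.h>

#define MAX_DEPTH 5

struct path_item {
	int blkno;			/* stands in for the buffer_head */
};

struct path {
	int depth;			/* 0 means the root is also the leaf */
	struct path_item node[MAX_DEPTH];
};

#define path_root(p) (&(p)->node[0])
#define path_leaf(p) (&(p)->node[(p)->depth])

int main(void)
{
	struct path p = { .depth = 2,
			  .node = { { 100 }, { 200 }, { 300 } } };

	printf("root blk %d, leaf blk %d\n",
	       path_root(&p)->blkno, path_leaf(&p)->blkno);
	return 0;
}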
179
180/*
181 * Allocate and initialize a new path based on a disk inode tree.
182 */
183static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
184{
185 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
186 struct ocfs2_extent_list *el = &di->id2.i_list;
187
188 return ocfs2_new_path(di_bh, el);
189}
190
191/*
192 * Convenience function to journal all components in a path.
193 */
194static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
195 struct ocfs2_path *path)
196{
197 int i, ret = 0;
198
199 if (!path)
200 goto out;
201
202 for(i = 0; i < path_num_items(path); i++) {
203 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
204 OCFS2_JOURNAL_ACCESS_WRITE);
205 if (ret < 0) {
206 mlog_errno(ret);
207 goto out;
208 }
209 }
210
211out:
212 return ret;
213}
214
215enum ocfs2_contig_type {
216 CONTIG_NONE = 0,
217 CONTIG_LEFT,
218 CONTIG_RIGHT
219};
94 220
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96 221
97static int ocfs2_extent_contig(struct inode *inode, 222/*
98 struct ocfs2_extent_rec *ext, 223 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
99 u64 blkno) 224 * ocfs2_extent_contig only work properly against leaf nodes!
225 */
226static int ocfs2_block_extent_contig(struct super_block *sb,
227 struct ocfs2_extent_rec *ext,
228 u64 blkno)
229{
230 u64 blk_end = le64_to_cpu(ext->e_blkno);
231
232 blk_end += ocfs2_clusters_to_blocks(sb,
233 le16_to_cpu(ext->e_leaf_clusters));
234
235 return blkno == blk_end;
236}
237
238static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
239 struct ocfs2_extent_rec *right)
240{
241 u32 left_range;
242
243 left_range = le32_to_cpu(left->e_cpos) +
244 le16_to_cpu(left->e_leaf_clusters);
245
246 return (left_range == le32_to_cpu(right->e_cpos));
247}
248
249static enum ocfs2_contig_type
250 ocfs2_extent_contig(struct inode *inode,
251 struct ocfs2_extent_rec *ext,
252 struct ocfs2_extent_rec *insert_rec)
100{ 253{
101 return blkno == (le64_to_cpu(ext->e_blkno) + 254 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
102 ocfs2_clusters_to_blocks(inode->i_sb, 255
103 le32_to_cpu(ext->e_clusters))); 256 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT;
259
260 blkno = le64_to_cpu(ext->e_blkno);
261 if (ocfs2_extents_adjacent(insert_rec, ext) &&
262 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
263 return CONTIG_LEFT;
264
265 return CONTIG_NONE;
104} 266}
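
Two extents are only mergeable when their logical ranges are adjacent and their physical blocks line up, and testing the insert record on each side of an existing record yields CONTIG_RIGHT, CONTIG_LEFT or CONTIG_NONE. A host-side sketch of that classification, simplified to one block per cluster; all names are illustrative.

#include <stdio.h>

struct rec {
	unsigned cpos;			/* logical start, in clusters */
	unsigned clusters;
	unsigned long long blkno;	/* physical start */
};

enum contig { NONE, LEFT, RIGHT };

/* Does `right` start exactly where `left` ends, logically and physically? */
static int follows(const struct rec *left, const struct rec *right)
{
	return left->cpos + left->clusters == right->cpos &&
	       left->blkno + left->clusters == right->blkno;
}

static enum contig classify(const struct rec *ext, const struct rec *ins)
{
	if (follows(ext, ins))
		return RIGHT;		/* insert glues onto ext's right end */
	if (follows(ins, ext))
		return LEFT;		/* insert glues onto ext's left end */
	return NONE;
}

int main(void)
{
	struct rec ext = { 0, 4, 1000 }, ins = { 4, 2, 1004 };

	printf("contig: %d\n", classify(&ext, &ins));	/* prints 2: RIGHT */
	return 0;
}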
105 267
106/* 268/*
269 * NOTE: We can have pretty much any combination of contiguousness and
270 * appending.
271 *
272 * The usefulness of APPEND_TAIL is more in that it lets us know that
273 * we'll have to update the path to that leaf.
274 */
275enum ocfs2_append_type {
276 APPEND_NONE = 0,
277 APPEND_TAIL,
278};
279
280struct ocfs2_insert_type {
281 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index;
284 int ins_free_records;
285 int ins_tree_depth;
286};
287
288/*
107 * How many free extents have we got before we need more meta data? 289 * How many free extents have we got before we need more meta data?
108 */ 290 */
109int ocfs2_num_free_extents(struct ocfs2_super *osb, 291int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -242,6 +424,28 @@ bail:
242} 424}
243 425
244/* 426/*
427 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
428 *
429 * Returns the sum of the rightmost extent rec logical offset and
430 * cluster count.
431 *
432 * ocfs2_add_branch() uses this to determine what logical cluster
433 * value should be populated into the leftmost new branch records.
434 *
435 * ocfs2_shift_tree_depth() uses this to determine the # clusters
436 * value for the new topmost tree record.
437 */
438static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
439{
440 int i;
441
442 i = le16_to_cpu(el->l_next_free_rec) - 1;
443
444 return le32_to_cpu(el->l_recs[i].e_cpos) +
445 ocfs2_rec_clusters(el, &el->l_recs[i]);
446}
447
448/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block 449 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode 450 * to start at, if we don't want to start the branch at the dinode
247 * structure. 451 * structure.
@@ -250,7 +454,7 @@ bail:
250 * for the new last extent block. 454 * for the new last extent block.
251 * 455 *
252 * the new branch will be 'empty' in the sense that every block will 456 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0. 457 * contain a single record with cluster count == 0.
254 */ 458 */
255static int ocfs2_add_branch(struct ocfs2_super *osb, 459static int ocfs2_add_branch(struct ocfs2_super *osb,
256 handle_t *handle, 460 handle_t *handle,
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
268 struct ocfs2_extent_block *eb; 472 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el; 473 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el; 474 struct ocfs2_extent_list *el;
475 u32 new_cpos;
271 476
272 mlog_entry_void(); 477 mlog_entry_void();
273 478
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
302 goto bail; 507 goto bail;
303 } 508 }
304 509
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree. 514 * linked with the rest of the tree.
307 * conversely, new_eb_bhs[0] is the new bottommost leaf. 515 * conversely, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
330 eb->h_next_leaf_blk = 0; 538 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i); 539 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1); 540 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters; 541 /*
542 * This actually counts as an empty extent as
543 * c_clusters == 0
544 */
545 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 546 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0); 547 /*
548 * eb_el isn't always an interior node, but even leaf
549 * nodes want a zero'd flags and reserved field so
550 * this gets the whole 32 bits regardless of use.
551 */
552 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth) 553 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 554 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338 555
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
376 * either be on the fe, or the extent block passed in. */ 593 * either be on the fe, or the extent block passed in. */
377 i = le16_to_cpu(el->l_next_free_rec); 594 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 595 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters; 596 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
380 el->l_recs[i].e_clusters = 0; 597 el->l_recs[i].e_int_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1); 598 le16_add_cpu(&el->l_next_free_rec, 1);
382 599
383 /* fe needs a new last extent block pointer, as does the 600 /* fe needs a new last extent block pointer, as does the
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
425 struct buffer_head **ret_new_eb_bh) 642 struct buffer_head **ret_new_eb_bh)
426{ 643{
427 int status, i; 644 int status, i;
645 u32 new_clusters;
428 struct buffer_head *new_eb_bh = NULL; 646 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe; 647 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb; 648 struct ocfs2_extent_block *eb;
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
461 /* copy the fe data into the new extent block */ 679 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth; 680 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec; 681 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { 682 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; 683 eb_el->l_recs[i] = fe_el->l_recs[i];
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 }
469 684
470 status = ocfs2_journal_dirty(handle, new_eb_bh); 685 status = ocfs2_journal_dirty(handle, new_eb_bh);
471 if (status < 0) { 686 if (status < 0) {
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
480 goto bail; 695 goto bail;
481 } 696 }
482 697
698 new_clusters = ocfs2_sum_rightmost_rec(eb_el);
699
483 /* update fe now */ 700 /* update fe now */
484 le16_add_cpu(&fe_el->l_tree_depth, 1); 701 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0; 702 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno; 703 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters; 704 fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { 705 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
489 fe_el->l_recs[i].e_cpos = 0; 706 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
492 }
493 fe_el->l_next_free_rec = cpu_to_le16(1); 707 fe_el->l_next_free_rec = cpu_to_le16(1);
494 708
495 /* If this is our 1st tree depth shift, then last_eb_blk 709 /* If this is our 1st tree depth shift, then last_eb_blk
@@ -515,199 +729,6 @@ bail:
515} 729}
516 730
517/* 731/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 handle_t *handle,
524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %llu has a bad extent list",
570 (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 status = -EIO;
572 goto bail;
573 }
574 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
575
576 BUG_ON(i >= num_bhs);
577 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578 OCFS2_BH_CACHED, inode);
579 if (status < 0) {
580 mlog_errno(status);
581 goto bail;
582 }
583 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
586 eb);
587 status = -EIO;
588 goto bail;
589 }
590
591 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) {
594 mlog_errno(status);
595 goto bail;
596 }
597
598 el = &eb->h_list;
599 i++;
600 /* When we leave this loop, eb_bhs[num_bhs - 1] will
601 * hold the bottom-most leaf extent block. */
602 }
603 BUG_ON(el->l_tree_depth);
604
605 el = &fe->id2.i_list;
606 /* If we have tree depth, then the fe update is
607 * trivial, and we want to switch el out for the
608 * bottom-most leaf in order to update it with the
609 * actual extent data below. */
610 next_free = le16_to_cpu(el->l_next_free_rec);
611 if (next_free == 0) {
612 ocfs2_error(inode->i_sb,
613 "Dinode %llu has a bad extent list",
614 (unsigned long long)OCFS2_I(inode)->ip_blkno);
615 status = -EIO;
616 goto bail;
617 }
618 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
619 new_clusters);
620 /* (num_bhs - 1) to avoid the leaf */
621 for(i = 0; i < (num_bhs - 1); i++) {
622 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
623 el = &eb->h_list;
624
625 /* finally, make our actual change to the
626 * intermediate extent blocks. */
627 next_free = le16_to_cpu(el->l_next_free_rec);
628 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
629 new_clusters);
630
631 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
632 if (status < 0)
633 mlog_errno(status);
634 }
635 BUG_ON(i != (num_bhs - 1));
636 /* note that the leaf block wasn't touched in
637 * the loop above */
638 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
639 el = &eb->h_list;
640 BUG_ON(el->l_tree_depth);
641 }
642
643 /* yay, we can finally add the actual extent now! */
644 i = le16_to_cpu(el->l_next_free_rec) - 1;
645 if (le16_to_cpu(el->l_next_free_rec) &&
646 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648 } else if (le16_to_cpu(el->l_next_free_rec) &&
649 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650 /* having an empty extent at eof is legal. */
651 if (el->l_recs[i].e_cpos != fe->i_clusters) {
652 ocfs2_error(inode->i_sb,
653 "Dinode %llu trailing extent is bad: "
654 "cpos (%u) != number of clusters (%u)",
655 (unsigned long long)OCFS2_I(inode)->ip_blkno,
656 le32_to_cpu(el->l_recs[i].e_cpos),
657 le32_to_cpu(fe->i_clusters));
658 status = -EIO;
659 goto bail;
660 }
661 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
663 } else {
664 /* No contiguous record, or no empty record at eof, so
665 * we add a new one. */
666
667 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668 le16_to_cpu(el->l_count));
669 i = le16_to_cpu(el->l_next_free_rec);
670
671 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673 el->l_recs[i].e_cpos = fe->i_clusters;
674 le16_add_cpu(&el->l_next_free_rec, 1);
675 }
676
677 /*
678 * extent_map errors are not fatal, so they are ignored outside
679 * of flushing the thing.
680 */
681 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
682 new_clusters);
683 if (status) {
684 mlog_errno(status);
685 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
686 }
687
688 status = ocfs2_journal_dirty(handle, fe_bh);
689 if (status < 0)
690 mlog_errno(status);
691 if (fe->id2.i_list.l_tree_depth) {
692 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
693 if (status < 0)
694 mlog_errno(status);
695 }
696
697 status = 0;
698bail:
699 if (eb_bhs) {
700 for (i = 0; i < num_bhs; i++)
701 if (eb_bhs[i])
702 brelse(eb_bhs[i]);
703 kfree(eb_bhs);
704 }
705
706 mlog_exit(status);
707 return status;
708}
709
710/*
711 * Should only be called when there is no space left in any of the 732 * Should only be called when there is no space left in any of the
712 * leaf nodes. What we want to do is find the lowest tree depth 733 * leaf nodes. What we want to do is find the lowest tree depth
713 * non-leaf extent block with room for new records. There are three 734 * non-leaf extent block with room for new records. There are three
@@ -807,53 +828,1548 @@ bail:
807 return status; 828 return status;
808} 829}
809 830
810/* the caller needs to update fe->i_clusters */ 831/*
811int ocfs2_insert_extent(struct ocfs2_super *osb, 832 * This is only valid for leaf nodes, which are the only ones that can
812 handle_t *handle, 833 * have empty extents anyway.
813 struct inode *inode, 834 */
814 struct buffer_head *fe_bh, 835static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
815 u64 start_blk,
816 u32 new_clusters,
817 struct ocfs2_alloc_context *meta_ac)
818{ 836{
819 int status, i, shift; 837 return !rec->e_leaf_clusters;
820 struct buffer_head *last_eb_bh = NULL; 838}
839
840/*
841 * This function will discard the rightmost extent record.
842 */
843static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
844{
845 int next_free = le16_to_cpu(el->l_next_free_rec);
846 int count = le16_to_cpu(el->l_count);
847 unsigned int num_bytes;
848
849 BUG_ON(!next_free);
850 /* This will cause us to go off the end of our extent list. */
851 BUG_ON(next_free >= count);
852
853 num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
854
855 memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
856}
857
858static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
859 struct ocfs2_extent_rec *insert_rec)
860{
861 int i, insert_index, next_free, has_empty, num_bytes;
862 u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
863 struct ocfs2_extent_rec *rec;
864
865 next_free = le16_to_cpu(el->l_next_free_rec);
866 has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
867
868 BUG_ON(!next_free);
869
870 /* The tree code before us didn't allow enough room in the leaf. */
871 if (el->l_next_free_rec == el->l_count && !has_empty)
872 BUG();
873
874 /*
875 * The easiest way to approach this is to just remove the
876 * empty extent and temporarily decrement next_free.
877 */
878 if (has_empty) {
879 /*
880 * If next_free was 1 (only an empty extent), this
881 * loop won't execute, which is fine. We still want
882 * the decrement above to happen.
883 */
884 for(i = 0; i < (next_free - 1); i++)
885 el->l_recs[i] = el->l_recs[i+1];
886
887 next_free--;
888 }
889
890 /*
891 * Figure out what the new record index should be.
892 */
893 for(i = 0; i < next_free; i++) {
894 rec = &el->l_recs[i];
895
896 if (insert_cpos < le32_to_cpu(rec->e_cpos))
897 break;
898 }
899 insert_index = i;
900
901 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
902 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
903
904 BUG_ON(insert_index < 0);
905 BUG_ON(insert_index >= le16_to_cpu(el->l_count));
906 BUG_ON(insert_index > next_free);
907
908 /*
909 * No need to memmove if we're just adding to the tail.
910 */
911 if (insert_index != next_free) {
912 BUG_ON(next_free >= le16_to_cpu(el->l_count));
913
914 num_bytes = next_free - insert_index;
915 num_bytes *= sizeof(struct ocfs2_extent_rec);
916 memmove(&el->l_recs[insert_index + 1],
917 &el->l_recs[insert_index],
918 num_bytes);
919 }
920
921 /*
 922 * Either we had an empty extent and need to re-increment, or
 923 * there was no empty extent on a non-full rightmost leaf node,
 924 * in which case we still need to increment.
925 */
926 next_free++;
927 el->l_next_free_rec = cpu_to_le16(next_free);
928 /*
929 * Make sure none of the math above just messed up our tree.
930 */
931 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
932
933 el->l_recs[insert_index] = *insert_rec;
934
935}
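
Stripped of the btree bookkeeping, ocfs2_rotate_leaf() is a sorted-array insert: consume the empty slot at index 0 if present, find the insertion index by cpos, memmove the tail right, and store the record. The core of that operation as a standalone sketch over plain integers.

#include <stdio.h>
#include <string.h>

/* Insert v into the sorted array a[0..n-1]; returns the new length. */
static int sorted_insert(int *a, int n, int cap, int v)
{
	int i;

	if (n >= cap)
		return n;		/* the caller must have made room */

	for (i = 0; i < n; i++)
		if (v < a[i])
			break;

	memmove(&a[i + 1], &a[i], (n - i) * sizeof(*a));
	a[i] = v;
	return n + 1;
}

int main(void)
{
	int a[8] = { 1, 3, 7 }, n = 3, i;

	n = sorted_insert(a, n, 8, 5);
	for (i = 0; i < n; i++)
		printf("%d ", a[i]);	/* 1 3 5 7 */
	putchar('\n');
	return 0;
}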
936
937/*
 938 * Create an empty extent record.
 939 *
 940 * l_next_free_rec may be updated.
 941 *
 942 * If an empty extent already exists, do nothing.
943 */
944static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
945{
946 int next_free = le16_to_cpu(el->l_next_free_rec);
947
948 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
949
950 if (next_free == 0)
951 goto set_and_inc;
952
953 if (ocfs2_is_empty_extent(&el->l_recs[0]))
954 return;
955
956 mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
957 "Asked to create an empty extent in a full list:\n"
958 "count = %u, tree depth = %u",
959 le16_to_cpu(el->l_count),
960 le16_to_cpu(el->l_tree_depth));
961
962 ocfs2_shift_records_right(el);
963
964set_and_inc:
965 le16_add_cpu(&el->l_next_free_rec, 1);
966 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
967}
968
969/*
970 * For a rotation which involves two leaf nodes, the "root node" is
971 * the lowest level tree node which contains a path to both leafs. This
972 * resulting set of information can be used to form a complete "subtree"
973 *
974 * This function is passed two full paths from the dinode down to a
 975 * pair of adjacent leaves. Its task is to figure out which path
976 * index contains the subtree root - this can be the root index itself
977 * in a worst-case rotation.
978 *
979 * The array index of the subtree root is passed back.
980 */
981static int ocfs2_find_subtree_root(struct inode *inode,
982 struct ocfs2_path *left,
983 struct ocfs2_path *right)
984{
985 int i = 0;
986
987 /*
988 * Check that the caller passed in two paths from the same tree.
989 */
990 BUG_ON(path_root_bh(left) != path_root_bh(right));
991
992 do {
993 i++;
994
995 /*
996 * The caller didn't pass two adjacent paths.
997 */
998 mlog_bug_on_msg(i > left->p_tree_depth,
999 "Inode %lu, left depth %u, right depth %u\n"
1000 "left leaf blk %llu, right leaf blk %llu\n",
1001 inode->i_ino, left->p_tree_depth,
1002 right->p_tree_depth,
1003 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1004 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1005 } while (left->p_node[i].bh->b_blocknr ==
1006 right->p_node[i].bh->b_blocknr);
1007
1008 return i - 1;
1009}
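
Because both paths begin at the same root, the subtree root is simply the last index at which the two paths still reference the same block. The sketch below shows that walk in isolation, with arrays of block numbers standing in for the path structures.

#include <stdio.h>

/* Paths share index 0 (the root); depth is the index of the leaf. */
static int subtree_root(const int *left, const int *right, int depth)
{
	int i = 0;

	do {
		i++;
	} while (i <= depth && left[i] == right[i]);

	return i - 1;			/* last index where they agreed */
}

int main(void)
{
	int left[]  = { 1, 2, 4 };	/* root=1, diverges at depth 2 */
	int right[] = { 1, 2, 5 };

	printf("subtree root index: %d\n", subtree_root(left, right, 2));
	return 0;
}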
1010
1011typedef void (path_insert_t)(void *, struct buffer_head *);
1012
1013/*
1014 * Traverse a btree path in search of cpos, starting at root_el.
1015 *
1016 * This code can be called with a cpos larger than the tree, in which
1017 * case it will return the rightmost path.
1018 */
1019static int __ocfs2_find_path(struct inode *inode,
1020 struct ocfs2_extent_list *root_el, u32 cpos,
1021 path_insert_t *func, void *data)
1022{
1023 int i, ret = 0;
1024 u32 range;
1025 u64 blkno;
821 struct buffer_head *bh = NULL; 1026 struct buffer_head *bh = NULL;
822 struct ocfs2_dinode *fe;
823 struct ocfs2_extent_block *eb; 1027 struct ocfs2_extent_block *eb;
824 struct ocfs2_extent_list *el; 1028 struct ocfs2_extent_list *el;
1029 struct ocfs2_extent_rec *rec;
1030 struct ocfs2_inode_info *oi = OCFS2_I(inode);
825 1031
826 mlog_entry_void(); 1032 el = root_el;
1033 while (el->l_tree_depth) {
1034 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1035 ocfs2_error(inode->i_sb,
1036 "Inode %llu has empty extent list at "
1037 "depth %u\n",
1038 (unsigned long long)oi->ip_blkno,
1039 le16_to_cpu(el->l_tree_depth));
1040 ret = -EROFS;
1041 goto out;
827 1042
828 mlog(0, "add %u clusters starting at block %llu to inode %llu\n", 1043 }
829 new_clusters, (unsigned long long)start_blk,
830 (unsigned long long)OCFS2_I(inode)->ip_blkno);
831 1044
832 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1045 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
833 el = &fe->id2.i_list; 1046 rec = &el->l_recs[i];
1047
1048 /*
1049 * In the case that cpos is off the allocation
1050 * tree, this should just wind up returning the
1051 * rightmost record.
1052 */
1053 range = le32_to_cpu(rec->e_cpos) +
1054 ocfs2_rec_clusters(el, rec);
1055 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1056 break;
1057 }
834 1058
835 if (el->l_tree_depth) { 1059 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
836 /* jump to end of tree */ 1060 if (blkno == 0) {
837 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 1061 ocfs2_error(inode->i_sb,
838 &last_eb_bh, OCFS2_BH_CACHED, inode); 1062 "Inode %llu has bad blkno in extent list "
839 if (status < 0) { 1063 "at depth %u (index %d)\n",
840 mlog_exit(status); 1064 (unsigned long long)oi->ip_blkno,
841 goto bail; 1065 le16_to_cpu(el->l_tree_depth), i);
1066 ret = -EROFS;
1067 goto out;
842 } 1068 }
843 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 1069
1070 brelse(bh);
1071 bh = NULL;
1072 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1073 &bh, OCFS2_BH_CACHED, inode);
1074 if (ret) {
1075 mlog_errno(ret);
1076 goto out;
1077 }
1078
1079 eb = (struct ocfs2_extent_block *) bh->b_data;
844 el = &eb->h_list; 1080 el = &eb->h_list;
1081 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1082 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1083 ret = -EIO;
1084 goto out;
1085 }
1086
1087 if (le16_to_cpu(el->l_next_free_rec) >
1088 le16_to_cpu(el->l_count)) {
1089 ocfs2_error(inode->i_sb,
1090 "Inode %llu has bad count in extent list "
1091 "at block %llu (next free=%u, count=%u)\n",
1092 (unsigned long long)oi->ip_blkno,
1093 (unsigned long long)bh->b_blocknr,
1094 le16_to_cpu(el->l_next_free_rec),
1095 le16_to_cpu(el->l_count));
1096 ret = -EROFS;
1097 goto out;
1098 }
1099
1100 if (func)
1101 func(data, bh);
1102 }
1103
1104out:
1105 /*
1106 * Catch any trailing bh that the loop didn't handle.
1107 */
1108 brelse(bh);
1109
1110 return ret;
1111}
1112
1113/*
1114 * Given an initialized path (that is, it has a valid root extent
1115 * list), this function will traverse the btree in search of the path
1116 * which would contain cpos.
1117 *
1118 * The path traveled is recorded in the path structure.
1119 *
1120 * Note that this will not do any comparisons on leaf node extent
1121 * records, so it will work fine in the case that we just added a tree
1122 * branch.
1123 */
1124struct find_path_data {
1125 int index;
1126 struct ocfs2_path *path;
1127};
1128static void find_path_ins(void *data, struct buffer_head *bh)
1129{
1130 struct find_path_data *fp = data;
1131
1132 get_bh(bh);
1133 ocfs2_path_insert_eb(fp->path, fp->index, bh);
1134 fp->index++;
1135}
1136static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1137 u32 cpos)
1138{
1139 struct find_path_data data;
1140
1141 data.index = 1;
1142 data.path = path;
1143 return __ocfs2_find_path(inode, path_root_el(path), cpos,
1144 find_path_ins, &data);
1145}
1146
1147static void find_leaf_ins(void *data, struct buffer_head *bh)
1148{
1149 struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1150 struct ocfs2_extent_list *el = &eb->h_list;
1151 struct buffer_head **ret = data;
1152
1153 /* We want to retain only the leaf block. */
1154 if (le16_to_cpu(el->l_tree_depth) == 0) {
1155 get_bh(bh);
1156 *ret = bh;
1157 }
1158}
1159/*
1160 * Find the leaf block in the tree which would contain cpos. No
1161 * checking of the actual leaf is done.
1162 *
1163 * Some paths want to call this instead of allocating a path structure
1164 * and calling ocfs2_find_path().
1165 *
1166 * This function doesn't handle non btree extent lists.
1167 */
1168int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1169 u32 cpos, struct buffer_head **leaf_bh)
1170{
1171 int ret;
1172 struct buffer_head *bh = NULL;
1173
1174 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1175 if (ret) {
1176 mlog_errno(ret);
1177 goto out;
1178 }
1179
1180 *leaf_bh = bh;
1181out:
1182 return ret;
1183}
1184
1185/*
1186 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1187 *
1188 * Basically, we've moved stuff around at the bottom of the tree and
1189 * we need to fix up the extent records above them to reflect
1190 * the new layout.
1191 *
1192 * left_rec: the record on the left.
1193 * left_child_el: is the child list pointed to by left_rec
1194 * right_rec: the record to the right of left_rec
1195 * right_child_el: is the child list pointed to by right_rec
1196 *
1197 * By definition, this only works on interior nodes.
1198 */
1199static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1200 struct ocfs2_extent_list *left_child_el,
1201 struct ocfs2_extent_rec *right_rec,
1202 struct ocfs2_extent_list *right_child_el)
1203{
1204 u32 left_clusters, right_end;
1205
1206 /*
1207 * Interior nodes never have holes. Their cpos is the cpos of
1208 * the leftmost record in their child list. Their cluster
1209 * count covers the full theoretical range of their child list
1210 * - the range between their cpos and the cpos of the record
1211 * immediately to their right.
1212 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216
1217 /*
1218 * Calculate the rightmost cluster count boundary before
1219 * moving cpos - we will need to adjust clusters after
1220 * updating e_cpos to keep the same highest cluster count.
1221 */
1222 right_end = le32_to_cpu(right_rec->e_cpos);
1223 right_end += le32_to_cpu(right_rec->e_int_clusters);
1224
1225 right_rec->e_cpos = left_rec->e_cpos;
1226 le32_add_cpu(&right_rec->e_cpos, left_clusters);
1227
1228 right_end -= le32_to_cpu(right_rec->e_cpos);
1229 right_rec->e_int_clusters = cpu_to_le32(right_end);
1230}
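
A worked example may help here: interior nodes have no holes, so the left record's cluster count becomes the distance from its cpos to the right child's new first cpos, and the right record's cpos moves up by the same amount while its end stays fixed. The arithmetic with invented numbers:

#include <stdio.h>

int main(void)
{
	/* Before: left covers [10, ?), right covers [30, 50). After the
	 * leaf rotation, the right child's first record starts at 25. */
	unsigned left_cpos = 10, right_cpos = 30, right_clusters = 20;
	unsigned right_child_first_cpos = 25;

	unsigned left_clusters = right_child_first_cpos - left_cpos;	/* 15 */
	unsigned right_end = right_cpos + right_clusters;		/* 50 */

	right_cpos = left_cpos + left_clusters;				/* 25 */
	right_clusters = right_end - right_cpos;			/* 25 */

	printf("left: [%u, %u)  right: [%u, %u)\n",
	       left_cpos, left_cpos + left_clusters,
	       right_cpos, right_cpos + right_clusters);
	return 0;
}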
1231
1232/*
1233 * Adjust the adjacent root node records involved in a
1234 * rotation. left_el_blkno is passed in as a key so that we can easily
1235 * find its index in the root list.
1236 */
1237static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1238 struct ocfs2_extent_list *left_el,
1239 struct ocfs2_extent_list *right_el,
1240 u64 left_el_blkno)
1241{
1242 int i;
1243
1244 BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1245 le16_to_cpu(left_el->l_tree_depth));
1246
1247 for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1248 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1249 break;
1250 }
1251
1252 /*
1253 * The path walking code should have never returned a root and
1254 * two paths which are not adjacent.
1255 */
1256 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1257
1258 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1259 &root_el->l_recs[i + 1], right_el);
1260}
1261
1262/*
1263 * We've changed a leaf block (in right_path) and need to reflect that
1264 * change back up the subtree.
1265 *
1266 * This happens in multiple places:
1267 * - When we've moved an extent record from the left path leaf to the right
1268 * path leaf to make room for an empty extent in the left path leaf.
1269 * - When our insert into the right path leaf is at the leftmost edge
1270 * and requires an update of the path immediately to its left. This
1271 * can occur at the end of some types of rotation and appending inserts.
1272 */
1273static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1274 struct ocfs2_path *left_path,
1275 struct ocfs2_path *right_path,
1276 int subtree_index)
1277{
1278 int ret, i, idx;
1279 struct ocfs2_extent_list *el, *left_el, *right_el;
1280 struct ocfs2_extent_rec *left_rec, *right_rec;
1281 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1282
1283 /*
1284 * Update the counts and position values within all the
1285 * interior nodes to reflect the leaf rotation we just did.
1286 *
1287 * The root node is handled below the loop.
1288 *
1289 * We begin the loop with right_el and left_el pointing to the
1290 * leaf lists and work our way up.
1291 *
1292 * NOTE: within this loop, left_el and right_el always refer
1293 * to the *child* lists.
1294 */
1295 left_el = path_leaf_el(left_path);
1296 right_el = path_leaf_el(right_path);
1297 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1298 mlog(0, "Adjust records at index %u\n", i);
1299
1300 /*
1301 * One nice property of knowing that all of these
1302 * nodes are below the root is that we only deal with
1303 * the leftmost right node record and the rightmost
1304 * left node record.
1305 */
1306 el = left_path->p_node[i].el;
1307 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1308 left_rec = &el->l_recs[idx];
1309
1310 el = right_path->p_node[i].el;
1311 right_rec = &el->l_recs[0];
1312
1313 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1314 right_el);
1315
1316 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1317 if (ret)
1318 mlog_errno(ret);
1319
1320 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1321 if (ret)
1322 mlog_errno(ret);
1323
1324 /*
1325 * Setup our list pointers now so that the current
1326 * parents become children in the next iteration.
1327 */
1328 left_el = left_path->p_node[i].el;
1329 right_el = right_path->p_node[i].el;
1330 }
1331
1332 /*
1333 * At the root node, adjust the two adjacent records which
1334 * begin our path to the leaves.
1335 */
1336
1337 el = left_path->p_node[subtree_index].el;
1338 left_el = left_path->p_node[subtree_index + 1].el;
1339 right_el = right_path->p_node[subtree_index + 1].el;
1340
1341 ocfs2_adjust_root_records(el, left_el, right_el,
1342 left_path->p_node[subtree_index + 1].bh->b_blocknr);
1343
1344 root_bh = left_path->p_node[subtree_index].bh;
1345
1346 ret = ocfs2_journal_dirty(handle, root_bh);
1347 if (ret)
1348 mlog_errno(ret);
1349}
1350
1351static int ocfs2_rotate_subtree_right(struct inode *inode,
1352 handle_t *handle,
1353 struct ocfs2_path *left_path,
1354 struct ocfs2_path *right_path,
1355 int subtree_index)
1356{
1357 int ret, i;
1358 struct buffer_head *right_leaf_bh;
1359 struct buffer_head *left_leaf_bh = NULL;
1360 struct buffer_head *root_bh;
1361 struct ocfs2_extent_list *right_el, *left_el;
1362 struct ocfs2_extent_rec move_rec;
1363
1364 left_leaf_bh = path_leaf_bh(left_path);
1365 left_el = path_leaf_el(left_path);
1366
1367 if (left_el->l_next_free_rec != left_el->l_count) {
1368 ocfs2_error(inode->i_sb,
1369 "Inode %llu has non-full interior leaf node %llu"
1370 "(next free = %u)",
1371 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1372 (unsigned long long)left_leaf_bh->b_blocknr,
1373 le16_to_cpu(left_el->l_next_free_rec));
1374 return -EROFS;
1375 }
1376
1377 /*
1378 * This extent block may already have an empty record, so we
1379 * return early if so.
1380 */
1381 if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1382 return 0;
1383
1384 root_bh = left_path->p_node[subtree_index].bh;
1385 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1386
1387 ret = ocfs2_journal_access(handle, inode, root_bh,
1388 OCFS2_JOURNAL_ACCESS_WRITE);
1389 if (ret) {
1390 mlog_errno(ret);
1391 goto out;
1392 }
1393
1394 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1395 ret = ocfs2_journal_access(handle, inode,
1396 right_path->p_node[i].bh,
1397 OCFS2_JOURNAL_ACCESS_WRITE);
1398 if (ret) {
1399 mlog_errno(ret);
1400 goto out;
1401 }
1402
1403 ret = ocfs2_journal_access(handle, inode,
1404 left_path->p_node[i].bh,
1405 OCFS2_JOURNAL_ACCESS_WRITE);
1406 if (ret) {
1407 mlog_errno(ret);
1408 goto out;
1409 }
1410 }
1411
1412 right_leaf_bh = path_leaf_bh(right_path);
1413 right_el = path_leaf_el(right_path);
1414
1415 /* This is a code error, not a disk corruption. */
1416 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1417 "because rightmost leaf block %llu is empty\n",
1418 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1419 (unsigned long long)right_leaf_bh->b_blocknr);
1420
1421 ocfs2_create_empty_extent(right_el);
1422
1423 ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1424 if (ret) {
1425 mlog_errno(ret);
1426 goto out;
1427 }
1428
1429 /* Do the copy now. */
1430 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1431 move_rec = left_el->l_recs[i];
1432 right_el->l_recs[0] = move_rec;
1433
1434 /*
1435 * Clear out the record we just copied and shift everything
1436 * over, leaving an empty extent in the left leaf.
1437 *
1438 * We temporarily subtract from next_free_rec so that the
1439 * shift will lose the tail record (which is now defunct).
1440 */
1441 le16_add_cpu(&left_el->l_next_free_rec, -1);
1442 ocfs2_shift_records_right(left_el);
1443 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1444 le16_add_cpu(&left_el->l_next_free_rec, 1);
1445
1446 ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1447 if (ret) {
1448 mlog_errno(ret);
1449 goto out;
1450 }
1451
1452 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1453 subtree_index);
1454
1455out:
1456 return ret;
1457}
1458
1459/*
1460 * Given a full path, determine what cpos value would return us a path
1461 * containing the leaf immediately to the left of the current one.
1462 *
1463 * Will return zero if the path passed in is already the leftmost path.
1464 */
1465static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1466 struct ocfs2_path *path, u32 *cpos)
1467{
1468 int i, j, ret = 0;
1469 u64 blkno;
1470 struct ocfs2_extent_list *el;
1471
1472 BUG_ON(path->p_tree_depth == 0);
1473
1474 *cpos = 0;
1475
1476 blkno = path_leaf_bh(path)->b_blocknr;
1477
1478 /* Start at the tree node just above the leaf and work our way up. */
1479 i = path->p_tree_depth - 1;
1480 while (i >= 0) {
1481 el = path->p_node[i].el;
1482
1483 /*
1484 * Find the extent record just before the one in our
1485 * path.
1486 */
1487 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1488 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1489 if (j == 0) {
1490 if (i == 0) {
1491 /*
1492 * We've determined that the
1493 * path specified is already
1494 * the leftmost one - return a
1495 * cpos of zero.
1496 */
1497 goto out;
1498 }
1499 /*
1500 * The leftmost record points to our
1501 * leaf - we need to travel up the
1502 * tree one level.
1503 */
1504 goto next_node;
1505 }
1506
1507 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1508 *cpos = *cpos + ocfs2_rec_clusters(el,
1509 &el->l_recs[j - 1]);
1510 *cpos = *cpos - 1;
1511 goto out;
1512 }
1513 }
1514
1515 /*
1516 * If we got here, we never found a valid node where
1517 * the tree indicated one should be.
1518 */
1519 ocfs2_error(sb,
1520 "Invalid extent tree at extent block %llu\n",
1521 (unsigned long long)blkno);
1522 ret = -EROFS;
1523 goto out;
1524
1525next_node:
1526 blkno = path->p_node[i].bh->b_blocknr;
1527 i--;
1528 }
1529
1530out:
1531 return ret;
1532}
1533
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1535 struct ocfs2_path *path)
1536{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1538
1539 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits);
1541
1542 return 0;
1543}
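
The credit estimate above is worth unpacking: a rotation dirties every block strictly below the subtree root on both the left and the right path, hence the factor of two, plus one credit for the subtree root itself. A minimal sketch of just that arithmetic (the ocfs2 handle type is left out; the values are made up):

#include <assert.h>

/*
 * Illustrative only: mirrors the credit estimate in
 * ocfs2_extend_rotate_transaction(). Two paths are dirtied below the
 * subtree root, plus one credit for the subtree root block itself.
 */
static int rotate_credits(int tree_depth, int subtree_depth)
{
	return (tree_depth - subtree_depth) * 2 + 1;
}

int main(void)
{
	/* depth-3 tree, subtree root at depth 1: 2*2 + 1 = 5 blocks */
	assert(rotate_credits(3, 1) == 5);
	/* rotation rooted at the tree root of a depth-2 tree */
	assert(rotate_credits(2, 0) == 5);
	return 0;
}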
1544
1545/*
1546 * Trap the case where we're inserting into the theoretical range past
1547 * the _actual_ left leaf range. Otherwise, we'll rotate a record
1548 * whose cpos is less than ours into the right leaf.
1549 *
1550 * It's only necessary to look at the rightmost record of the left
1551 * leaf because the logic that calls us should ensure that the
1552 * theoretical ranges in the path components above the leaves are
1553 * correct.
1554 */
1555static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1556 u32 insert_cpos)
1557{
1558 struct ocfs2_extent_list *left_el;
1559 struct ocfs2_extent_rec *rec;
1560 int next_free;
1561
1562 left_el = path_leaf_el(left_path);
1563 next_free = le16_to_cpu(left_el->l_next_free_rec);
1564 rec = &left_el->l_recs[next_free - 1];
1565
1566 if (insert_cpos > le32_to_cpu(rec->e_cpos))
1567 return 1;
1568 return 0;
1569}
1570
1571/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 *
1574 * The path to the rightmost leaf should be passed in.
1575 *
1576 * The array is assumed to be large enough to hold an entire path (tree depth).
1577 *
1578 * Upon successful return from this function:
1579 *
1580 * - The 'right_path' array will contain a path to the leaf block
1581 * whose range contains e_cpos.
1582 * - That leaf block will have a single empty extent in list index 0.
1583 * - In the case that the rotation requires a post-insert update,
1584 * *ret_left_path will contain a valid path which can be passed to
1585 * ocfs2_insert_path().
1586 */
1587static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle,
1589 u32 insert_cpos,
1590 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path)
1592{
1593 int ret, start;
1594 u32 cpos;
1595 struct ocfs2_path *left_path = NULL;
1596
1597 *ret_left_path = NULL;
1598
1599 left_path = ocfs2_new_path(path_root_bh(right_path),
1600 path_root_el(right_path));
1601 if (!left_path) {
1602 ret = -ENOMEM;
1603 mlog_errno(ret);
1604 goto out;
1605 }
1606
1607 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1608 if (ret) {
1609 mlog_errno(ret);
1610 goto out;
1611 }
1612
1613 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1614
1615 /*
1616 * What we want to do here is:
1617 *
1618 * 1) Start with the rightmost path.
1619 *
1620 * 2) Determine a path to the leaf block directly to the left
1621 * of that leaf.
1622 *
1623 * 3) Determine the 'subtree root' - the lowest level tree node
1624 * which contains a path to both leaves.
1625 *
1626 * 4) Rotate the subtree.
1627 *
1628 * 5) Find the next subtree by considering the left path to be
1629 * the new right path.
1630 *
1631 * The check at the top of this while loop also accepts
1632 * insert_cpos == cpos because cpos is only a _theoretical_
1633 * value to get us the left path - insert_cpos might very well
1634 * be filling that hole.
1635 *
1636 * Stop at a cpos of '0' because we either started at the
1637 * leftmost branch (i.e., a tree with one branch and a
1638 * rotation inside of it), or we've gone as far as we can in
1639 * rotating subtrees.
1640 */
1641 while (cpos && insert_cpos <= cpos) {
1642 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1643 insert_cpos, cpos);
1644
1645 ret = ocfs2_find_path(inode, left_path, cpos);
1646 if (ret) {
1647 mlog_errno(ret);
1648 goto out;
1649 }
1650
1651 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1652 path_leaf_bh(right_path),
1653 "Inode %lu: error during insert of %u "
1654 "(left path cpos %u) results in two identical "
1655 "paths ending at %llu\n",
1656 inode->i_ino, insert_cpos, cpos,
1657 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr);
1659
1660 if (ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663
1664 /*
1665 * We've rotated the tree as much as we
1666 * should. The rest is up to
1667 * ocfs2_insert_path() to complete, after the
1668 * record insertion. We indicate this
1669 * situation by returning the left path.
1670 *
1671 * The reason we don't adjust the records here
1672 * before the record insert is that an error
1673 * later might break the rule where a parent
1674 * record e_cpos will reflect the actual
1675 * e_cpos of the 1st nonempty record of the
1676 * child list.
1677 */
1678 *ret_left_path = left_path;
1679 goto out_ret_path;
1680 }
1681
1682 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1683
1684 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1685 start,
1686 (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1687 right_path->p_tree_depth);
1688
1689 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path);
1691 if (ret) {
1692 mlog_errno(ret);
1693 goto out;
1694 }
1695
1696 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1697 right_path, start);
1698 if (ret) {
1699 mlog_errno(ret);
1700 goto out;
1701 }
1702
1703 /*
1704 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left
1706 * path. Optimize by copying values instead.
1707 */
1708 ocfs2_mv_path(right_path, left_path);
1709
1710 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1711 &cpos);
1712 if (ret) {
1713 mlog_errno(ret);
1714 goto out;
1715 }
1716 }
1717
1718out:
1719 ocfs2_free_path(left_path);
1720
1721out_ret_path:
1722 return ret;
1723}
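
For readers who want the loop shape in isolation, here is a toy userspace walk of the same right-to-left traversal. The leaf_start array and the "rotation" are stand-ins, not on-disk state; only the termination logic - keep going while the left neighbor's theoretical range still covers insert_cpos - mirrors the code above:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical first-cpos of each leaf, left to right */
	uint32_t leaf_start[] = { 0, 120, 300, 450 };
	int right = 3;
	uint32_t insert_cpos = 150;

	while (right > 0) {
		/* last cluster the left neighbor could theoretically hold */
		uint32_t cpos = leaf_start[right] - 1;

		if (!(cpos && insert_cpos <= cpos))
			break;
		printf("rotate subtree between leaves %d and %d\n",
		       right - 1, right);
		right--;	/* left path becomes the new right path */
	}
	printf("record lands in leaf %d\n", right);
	return 0;
}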
1724
1725/*
1726 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed
1728 * that the tree above has been prepared.
1729 */
1730static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1731 struct ocfs2_extent_list *el,
1732 struct ocfs2_insert_type *insert,
1733 struct inode *inode)
1734{
1735 int i = insert->ins_contig_index;
1736 unsigned int range;
1737 struct ocfs2_extent_rec *rec;
1738
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740
1741 /*
1742 * Contiguous insert - either left or right.
1743 */
1744 if (insert->ins_contig != CONTIG_NONE) {
1745 rec = &el->l_recs[i];
1746 if (insert->ins_contig == CONTIG_LEFT) {
1747 rec->e_blkno = insert_rec->e_blkno;
1748 rec->e_cpos = insert_rec->e_cpos;
1749 }
1750 le16_add_cpu(&rec->e_leaf_clusters,
1751 le16_to_cpu(insert_rec->e_leaf_clusters));
1752 return;
1753 }
1754
1755 /*
1756 * Handle insert into an empty leaf.
1757 */
1758 if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1759 ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1760 ocfs2_is_empty_extent(&el->l_recs[0]))) {
1761 el->l_recs[0] = *insert_rec;
1762 el->l_next_free_rec = cpu_to_le16(1);
1763 return;
1764 }
1765
1766 /*
1767 * Appending insert.
1768 */
1769 if (insert->ins_appending == APPEND_TAIL) {
1770 i = le16_to_cpu(el->l_next_free_rec) - 1;
1771 rec = &el->l_recs[i];
1772 range = le32_to_cpu(rec->e_cpos)
1773 + le16_to_cpu(rec->e_leaf_clusters);
1774 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1775
1776 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1777 le16_to_cpu(el->l_count),
1778 "inode %lu, depth %u, count %u, next free %u, "
1779 "rec.cpos %u, rec.clusters %u, "
1780 "insert.cpos %u, insert.clusters %u\n",
1781 inode->i_ino,
1782 le16_to_cpu(el->l_tree_depth),
1783 le16_to_cpu(el->l_count),
1784 le16_to_cpu(el->l_next_free_rec),
1785 le32_to_cpu(el->l_recs[i].e_cpos),
1786 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
1787 le32_to_cpu(insert_rec->e_cpos),
1788 le16_to_cpu(insert_rec->e_leaf_clusters));
1789 i++;
1790 el->l_recs[i] = *insert_rec;
1791 le16_add_cpu(&el->l_next_free_rec, 1);
1792 return;
1793 }
1794
1795 /*
1796 * Ok, we have to rotate.
1797 *
1798 * At this point, it is safe to assume that inserting into an
1799 * empty leaf and appending to a leaf have both been handled
1800 * above.
1801 *
1802 * This leaf needs to have space, either by the empty 1st
1803 * extent record, or by virtue of l_next_free_rec < l_count.
1804 */
1805 ocfs2_rotate_leaf(el, insert_rec);
1806}
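
A compact userspace model of the easy cases handled above may also help. The in-leaf rotation case is deliberately omitted, and the types are simplified CPU-endian stand-ins rather than the on-disk structures:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define TOY_COUNT 4

/* Hypothetical flat model of a leaf extent list. */
struct toy_rec { uint32_t cpos, clusters; };
struct toy_leaf { uint16_t next_free; struct toy_rec recs[TOY_COUNT]; };

/*
 * Mirrors three cases of ocfs2_insert_at_leaf(): fill an empty list,
 * merge into a contiguous tail, or append a new record at the tail.
 */
static void toy_insert(struct toy_leaf *el, struct toy_rec *ins)
{
	struct toy_rec *tail;

	if (el->next_free == 0) {		/* empty leaf */
		el->recs[0] = *ins;
		el->next_free = 1;
		return;
	}

	tail = &el->recs[el->next_free - 1];
	if (tail->cpos + tail->clusters == ins->cpos) {	/* right-contig */
		tail->clusters += ins->clusters;
		return;
	}

	el->recs[el->next_free++] = *ins;	/* plain tail append */
}

int main(void)
{
	struct toy_leaf el;
	struct toy_rec a = { 0, 10 }, b = { 10, 5 }, c = { 20, 2 };

	memset(&el, 0, sizeof(el));
	toy_insert(&el, &a);	/* empty-leaf case */
	toy_insert(&el, &b);	/* merges with a: (0, 15) */
	toy_insert(&el, &c);	/* appended as a new record */
	printf("%u records, first covers %u clusters\n",
	       el.next_free, el.recs[0].clusters);
	return 0;
}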
1807
1808static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1809 struct ocfs2_dinode *di,
1810 u32 clusters)
1811{
1812 le32_add_cpu(&di->i_clusters, clusters);
1813 spin_lock(&OCFS2_I(inode)->ip_lock);
1814 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1815 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816}
1817
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path)
1822{
1823 int ret, i, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL;
1827
1828 *ret_left_path = NULL;
1829
1830 /*
1831 * This shouldn't happen for non-trees. The extent rec cluster
1832 * count manipulation below only works for interior nodes.
1833 */
1834 BUG_ON(right_path->p_tree_depth == 0);
1835
1836 /*
1837 * If our appending insert is at the leftmost edge of a leaf,
1838 * then we might need to update the rightmost records of the
1839 * neighboring path.
1840 */
1841 el = path_leaf_el(right_path);
1842 next_free = le16_to_cpu(el->l_next_free_rec);
1843 if (next_free == 0 ||
1844 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1845 u32 left_cpos;
1846
1847 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1848 &left_cpos);
1849 if (ret) {
1850 mlog_errno(ret);
1851 goto out;
1852 }
1853
1854 mlog(0, "Append may need a left path update. cpos: %u, "
1855 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1856 left_cpos);
1857
1858 /*
1859 * No need to worry if the append is already in the
1860 * leftmost leaf.
1861 */
1862 if (left_cpos) {
1863 left_path = ocfs2_new_path(path_root_bh(right_path),
1864 path_root_el(right_path));
1865 if (!left_path) {
1866 ret = -ENOMEM;
1867 mlog_errno(ret);
1868 goto out;
1869 }
1870
1871 ret = ocfs2_find_path(inode, left_path, left_cpos);
1872 if (ret) {
1873 mlog_errno(ret);
1874 goto out;
1875 }
1876
1877 /*
1878 * ocfs2_insert_path() will pass the left_path to the
1879 * journal for us.
1880 */
1881 }
1882 }
1883
1884 ret = ocfs2_journal_access_path(inode, handle, right_path);
1885 if (ret) {
1886 mlog_errno(ret);
1887 goto out;
1888 }
1889
1890 el = path_root_el(right_path);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924
1925 *ret_left_path = left_path;
1926 ret = 0;
1927out:
1928 if (ret != 0)
1929 ocfs2_free_path(left_path);
1930
1931 return ret;
1932}
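
The interior update in the loop above reduces to one line of arithmetic: after a tail append, the rightmost record of every node on the path must cover up to the end of the new extent. A sketch with hypothetical values:

#include <stdint.h>
#include <assert.h>

/*
 * Illustrative restatement of the e_int_clusters update: the new
 * interior span runs from the record's own cpos to the end of the
 * appended extent.
 */
static uint32_t new_int_clusters(uint32_t rec_cpos, uint32_t insert_cpos,
				 uint16_t insert_clusters)
{
	return insert_cpos + insert_clusters - rec_cpos;
}

int main(void)
{
	/* interior record starts at cpos 100; we append (250, 8) below it */
	assert(new_int_clusters(100, 250, 8) == 158);
	return 0;
}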
1933
1934/*
1935 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly.
1937 *
1938 * right_path is the path we want to do the actual insert
1939 * in. left_path should only be passed in if we need to update that
1940 * portion of the tree after an edge insert.
1941 */
1942static int ocfs2_insert_path(struct inode *inode,
1943 handle_t *handle,
1944 struct ocfs2_path *left_path,
1945 struct ocfs2_path *right_path,
1946 struct ocfs2_extent_rec *insert_rec,
1947 struct ocfs2_insert_type *insert)
1948{
1949 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952
1953 /*
1954 * Pass both paths to the journal. The majority of inserts
1955 * will be touching all components anyway.
1956 */
1957 ret = ocfs2_journal_access_path(inode, handle, right_path);
1958 if (ret < 0) {
1959 mlog_errno(ret);
1960 goto out;
1961 }
1962
1963 if (left_path) {
1964 int credits = handle->h_buffer_credits;
1965
1966 /*
1967 * There's a chance that left_path got passed back to
1968 * us without being accounted for in the
1969 * journal. Extend our transaction here to be sure we
1970 * can change those blocks.
1971 */
1972 credits += left_path->p_tree_depth;
1973
1974 ret = ocfs2_extend_trans(handle, credits);
1975 if (ret < 0) {
1976 mlog_errno(ret);
1977 goto out;
1978 }
1979
1980 ret = ocfs2_journal_access_path(inode, handle, left_path);
1981 if (ret < 0) {
1982 mlog_errno(ret);
1983 goto out;
1984 }
1985 }
1986
1987 el = path_leaf_el(right_path);
1988
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret)
1992 mlog_errno(ret);
1993
1994 if (left_path) {
1995 /*
1996 * The rotate code has indicated that we need to fix
1997 * up portions of the tree after the insert.
1998 *
1999 * XXX: Should we extend the transaction here?
2000 */
2001 subtree_index = ocfs2_find_subtree_root(inode, left_path,
2002 right_path);
2003 ocfs2_complete_edge_insert(inode, handle, left_path,
2004 right_path, subtree_index);
2005 }
2006
2007 ret = 0;
2008out:
2009 return ret;
2010}
2011
2012static int ocfs2_do_insert_extent(struct inode *inode,
2013 handle_t *handle,
2014 struct buffer_head *di_bh,
2015 struct ocfs2_extent_rec *insert_rec,
2016 struct ocfs2_insert_type *type)
2017{
2018 int ret, rotate = 0;
2019 u32 cpos;
2020 struct ocfs2_path *right_path = NULL;
2021 struct ocfs2_path *left_path = NULL;
2022 struct ocfs2_dinode *di;
2023 struct ocfs2_extent_list *el;
2024
2025 di = (struct ocfs2_dinode *) di_bh->b_data;
2026 el = &di->id2.i_list;
2027
2028 ret = ocfs2_journal_access(handle, inode, di_bh,
2029 OCFS2_JOURNAL_ACCESS_WRITE);
2030 if (ret) {
2031 mlog_errno(ret);
2032 goto out;
2033 }
2034
2035 if (le16_to_cpu(el->l_tree_depth) == 0) {
2036 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2037 goto out_update_clusters;
2038 }
2039
2040 right_path = ocfs2_new_inode_path(di_bh);
2041 if (!right_path) {
2042 ret = -ENOMEM;
2043 mlog_errno(ret);
2044 goto out;
2045 }
2046
2047 /*
2048 * Determine the path to start with. Rotations need the
2049 * rightmost path, everything else can go directly to the
2050 * target leaf.
2051 */
2052 cpos = le32_to_cpu(insert_rec->e_cpos);
2053 if (type->ins_appending == APPEND_NONE &&
2054 type->ins_contig == CONTIG_NONE) {
2055 rotate = 1;
2056 cpos = UINT_MAX;
2057 }
2058
2059 ret = ocfs2_find_path(inode, right_path, cpos);
2060 if (ret) {
2061 mlog_errno(ret);
2062 goto out;
2063 }
2064
2065 /*
2066 * Rotations and appends need special treatment - they modify
2067 * parts of the tree above them.
2068 *
2069 * Both might pass back a path immediately to the left of the
2070 * one being inserted to. This will cause
2071 * ocfs2_insert_path() to modify the rightmost records of
2072 * left_path to account for an edge insert.
2073 *
2074 * XXX: When modifying this code, keep in mind that an insert
2075 * can wind up skipping both of these two special cases...
2076 */
2077 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle,
2079 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path);
2081 if (ret) {
2082 mlog_errno(ret);
2083 goto out;
2084 }
2085 } else if (type->ins_appending == APPEND_TAIL
2086 && type->ins_contig != CONTIG_LEFT) {
2087 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2088 right_path, &left_path);
2089 if (ret) {
2090 mlog_errno(ret);
2091 goto out;
2092 }
2093 }
2094
2095 ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2096 insert_rec, type);
2097 if (ret) {
2098 mlog_errno(ret);
2099 goto out;
2100 }
2101
2102out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di,
2104 le16_to_cpu(insert_rec->e_leaf_clusters));
2105
2106 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret)
2108 mlog_errno(ret);
2109
2110out:
2111 ocfs2_free_path(left_path);
2112 ocfs2_free_path(right_path);
2113
2114 return ret;
2115}
2116
2117static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el,
2120 struct ocfs2_extent_rec *insert_rec)
2121{
2122 int i;
2123 enum ocfs2_contig_type contig_type = CONTIG_NONE;
2124
2125 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2126
2127 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2128 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2129 insert_rec);
2130 if (contig_type != CONTIG_NONE) {
2131 insert->ins_contig_index = i;
2132 break;
2133 }
2134 }
2135 insert->ins_contig = contig_type;
2136}
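
ocfs2_extent_contig() itself is not part of this hunk; the sketch below captures the test it is understood to perform - logical and physical adjacency must both hold - using simplified types and an assumed blocks-per-cluster factor:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical CPU-endian extent: logical start, physical start, length. */
struct toy_rec { uint32_t cpos; uint64_t blkno; uint16_t clusters; };

enum contig { CONTIG_NONE, CONTIG_LEFT, CONTIG_RIGHT };

/*
 * Two extents can merge only if they are adjacent both logically
 * (cpos) and physically (blkno), assuming bpc disk blocks per cluster.
 */
static enum contig extent_contig(struct toy_rec *ext, struct toy_rec *ins,
				 unsigned int bpc)
{
	if (ext->cpos + ext->clusters == ins->cpos &&
	    ext->blkno + (uint64_t)ext->clusters * bpc == ins->blkno)
		return CONTIG_RIGHT;	/* ins glues onto ext's right edge */
	if (ins->cpos + ins->clusters == ext->cpos &&
	    ins->blkno + (uint64_t)ins->clusters * bpc == ext->blkno)
		return CONTIG_LEFT;	/* ins glues onto ext's left edge */
	return CONTIG_NONE;
}

int main(void)
{
	struct toy_rec ext = { 0, 1000, 4 }, ins = { 4, 1032, 2 };

	/* 8 blocks per cluster: 1000 + 4*8 == 1032, so right-contiguous */
	printf("contig = %d\n", extent_contig(&ext, &ins, 8));
	return 0;
}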
2137
2138/*
2139 * This should only be called against the rightmost leaf extent list.
2140 *
2141 * ocfs2_figure_appending_type() will figure out whether we'll have to
2142 * insert at the tail of the rightmost leaf.
2143 *
2144 * This should also work against the dinode list for trees with 0
2145 * depth. If we consider the dinode list to be the rightmost leaf node,
2146 * then the logic here makes sense.
2147 */
2148static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2149 struct ocfs2_extent_list *el,
2150 struct ocfs2_extent_rec *insert_rec)
2151{
2152 int i;
2153 u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2154 struct ocfs2_extent_rec *rec;
2155
2156 insert->ins_appending = APPEND_NONE;
2157
2158 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2159
2160 if (!el->l_next_free_rec)
2161 goto set_tail_append;
2162
2163 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2164 /* Were all records empty? */
2165 if (le16_to_cpu(el->l_next_free_rec) == 1)
2166 goto set_tail_append;
  }

- /* Can we allocate without adding/shifting tree bits? */
  i = le16_to_cpu(el->l_next_free_rec) - 1;
- if (le16_to_cpu(el->l_next_free_rec) == 0
- || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
- || le32_to_cpu(el->l_recs[i].e_clusters) == 0
- || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
- goto out_add;
+ rec = &el->l_recs[i];
+
+ if (cpos >=
+ (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
+ goto set_tail_append;
+
+ return;
+
+ set_tail_append:
+ insert->ins_appending = APPEND_TAIL;
+ }
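
Restated in isolation, the append decision is a single comparison against the end of the last record. A toy version (CPU-endian, hypothetical values):

#include <stdint.h>
#include <stdio.h>

struct toy_rec { uint32_t cpos; uint16_t clusters; };

/*
 * An insert is a tail append when its cpos lies at or past the end of
 * the current last record, or when the list holds nothing at all.
 */
static int is_tail_append(struct toy_rec *recs, int next_free,
			  uint32_t insert_cpos)
{
	struct toy_rec *last;

	if (next_free == 0)
		return 1;
	last = &recs[next_free - 1];
	return insert_cpos >= last->cpos + last->clusters;
}

int main(void)
{
	struct toy_rec recs[] = { { 0, 10 }, { 10, 20 } };

	printf("%d\n", is_tail_append(recs, 2, 30)); /* 1: at the edge */
	printf("%d\n", is_tail_append(recs, 2, 15)); /* 0: lands inside */
	return 0;
}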
2181
2182/*
2183 * Helper function called at the beginning of an insert.
2184 *
2185 * This computes a few things that are commonly used in the process of
2186 * inserting into the btree:
2187 * - Whether the new extent is contiguous with an existing one.
2188 * - The current tree depth.
2189 * - Whether the insert is an appending one.
2190 * - The total # of free records in the tree.
2191 *
2192 * All of the information is stored on the ocfs2_insert_type
2193 * structure.
2194 */
2195static int ocfs2_figure_insert_type(struct inode *inode,
2196 struct buffer_head *di_bh,
2197 struct buffer_head **last_eb_bh,
2198 struct ocfs2_extent_rec *insert_rec,
2199 struct ocfs2_insert_type *insert)
2200{
2201 int ret;
2202 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2203 struct ocfs2_extent_block *eb;
2204 struct ocfs2_extent_list *el;
2205 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL;
2207
2208 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210
2211 if (el->l_tree_depth) {
2212 /*
2213 * If we have tree depth, we read in the
2214 * rightmost extent block ahead of time as
2215 * ocfs2_figure_insert_type() and ocfs2_add_branch()
2216 * may want it later.
2217 */
2218 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2219 le64_to_cpu(di->i_last_eb_blk), &bh,
2220 OCFS2_BH_CACHED, inode);
2221 if (ret) {
2222 mlog_exit(ret);
2223 goto out;
2224 }
2225 eb = (struct ocfs2_extent_block *) bh->b_data;
2226 el = &eb->h_list;
2227 }
2228
2229 /*
2230 * Unless we have a contiguous insert, we'll need to know if
2231 * there is room left in our allocation tree for another
2232 * extent record.
2233 *
2234 * XXX: This test is simplistic, we can search for empty
2235 * extent records too.
2236 */
2237 insert->ins_free_records = le16_to_cpu(el->l_count) -
2238 le16_to_cpu(el->l_next_free_rec);
2239
2240 if (!insert->ins_tree_depth) {
2241 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2242 ocfs2_figure_appending_type(insert, el, insert_rec);
2243 return 0;
2244 }
2245
2246 path = ocfs2_new_inode_path(di_bh);
2247 if (!path) {
2248 ret = -ENOMEM;
2249 mlog_errno(ret);
2250 goto out;
2251 }
2252
2253 /*
2254 * In the case that we're inserting past what the tree
2255 * currently accounts for, ocfs2_find_path() will return for
2256 * us the rightmost tree path. This is accounted for below in
2257 * the appending code.
2258 */
2259 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2260 if (ret) {
2261 mlog_errno(ret);
2262 goto out;
2263 }
2264
2265 el = path_leaf_el(path);
2266
2267 /*
2268 * Now that we have the path, there are two things we want to determine:
2269 * 1) Contiguousness (also set contig_index if this is so)
2270 *
2271 * 2) Are we doing an append? We can trivially break this up
2272 * into two types of appends: simple record append, or a
2273 * rotate inside the tail leaf.
2274 */
2275 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2276
2277 /*
2278 * The insert code isn't quite ready to deal with all cases of
2279 * left contiguousness. Specifically, if it's an insert into
2280 * the 1st record in a leaf, it will require the adjustment of
2281 * cluster count on the last record of the path directly to its
2282 * left. For now, just catch that case and fool the layers
2283 * above us. This works just fine for tree_depth == 0, which
2284 * is why we allow that above.
2285 */
2286 if (insert->ins_contig == CONTIG_LEFT &&
2287 insert->ins_contig_index == 0)
2288 insert->ins_contig = CONTIG_NONE;
2289
2290 /*
2291 * Ok, so we can simply compare against last_eb to figure out
2292 * whether the path doesn't exist. This will only happen in
2293 * the case that we're doing a tail append, so maybe we can
2294 * take advantage of that information somehow.
2295 */
2296 if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2297 /*
2298 * Ok, ocfs2_find_path() returned us the rightmost
2299 * tree path. This might be an appending insert. There are
2300 * two cases:
2301 * 1) We're doing a true append at the tail:
2302 * -This might even be off the end of the leaf
2303 * 2) We're "appending" by rotating in the tail
2304 */
2305 ocfs2_figure_appending_type(insert, el, insert_rec);
2306 }
2307
2308out:
2309 ocfs2_free_path(path);
2310
2311 if (ret == 0)
2312 *last_eb_bh = bh;
2313 else
2314 brelse(bh);
2315 return ret;
2316}
2317
2318/*
2319 * Insert an extent into an inode btree.
2320 *
2321 * The caller needs to update fe->i_clusters
2322 */
2323int ocfs2_insert_extent(struct ocfs2_super *osb,
2324 handle_t *handle,
2325 struct inode *inode,
2326 struct buffer_head *fe_bh,
2327 u32 cpos,
2328 u64 start_blk,
2329 u32 new_clusters,
2330 struct ocfs2_alloc_context *meta_ac)
2331{
2332 int status, shift;
2333 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, };
2336 struct ocfs2_extent_rec rec;
2337
2338 mlog(0, "add %u clusters at position %u to inode %llu\n",
2339 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2340
2341 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2342 (OCFS2_I(inode)->ip_clusters != cpos),
2343 "Device %s, asking for sparse allocation: inode %llu, "
2344 "cpos %u, clusters %u\n",
2345 osb->dev_str,
2346 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2347 OCFS2_I(inode)->ip_clusters);
2348
2349 memset(&rec, 0, sizeof(rec));
2350 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
2353
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert);
2356 if (status < 0) {
2357 mlog_errno(status);
2358 goto bail;
2359 }

- mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
- "tree now.\n");
+ mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
+ "Insert.contig_index: %d, Insert.free_records: %d, "
+ "Insert.tree_depth: %d\n",
+ insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
+ insert.ins_free_records, insert.ins_tree_depth);
+
+ /*
+ * Avoid growing the tree unless we're out of records and the
+ * insert type requires one.
+ */
+ if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
+ goto out_add;

  shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
  if (shift < 0) {
@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
  * and didn't find room for any more extents - we need to add
  * another tree level */
  if (shift) {
- /* if we hit a leaf, we'd better be empty :) */
- BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
- le16_to_cpu(el->l_count));
  BUG_ON(bh);
- mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
- "(current = %u)\n",
- le16_to_cpu(fe->id2.i_list.l_tree_depth));
+ mlog(0, "need to shift tree depth "
+ "(current = %d)\n", insert.ins_tree_depth);

  /* ocfs2_shift_tree_depth will return us a buffer with
  * the new extent block (so we can pass that to
@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
  mlog_errno(status);
  goto bail;
  }
+ insert.ins_tree_depth++;
  /* Special case: we have room now if we shifted from
  * tree_depth 0 */
- if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+ if (insert.ins_tree_depth == 1)
  goto out_add;
  }

  /* call ocfs2_add_branch to add the final part of the tree with
  * the new data. */
- mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+ mlog(0, "add branch. bh = %p\n", bh);
  status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
  meta_ac);
  if (status < 0) {
@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
  }

  out_add:
- /* Finally, we can add clusters. */
- status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
- start_blk, new_clusters);
+ /* Finally, we can add clusters. This might rotate the tree for us. */
+ status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
  if (status < 0)
  mlog_errno(status);
+ else
+ ocfs2_extent_map_insert_rec(inode, &rec);

  bail:
  if (bh)
@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  * block will be deleted, and if it will, what the new last extent
  * block will be so we can update his h_next_leaf_blk field, as well
  * as the dinodes i_last_eb_blk */
- static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_dinode *fe,
- u32 new_i_clusters,
- struct buffer_head *old_last_eb,
+ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
+ unsigned int clusters_to_del,
+ struct ocfs2_path *path,
  struct buffer_head **new_last_eb)
  {
- int i, status = 0;
- u64 block = 0;
+ int next_free, ret = 0;
+ u32 cpos;
+ struct ocfs2_extent_rec *rec;
  struct ocfs2_extent_block *eb;
  struct ocfs2_extent_list *el;
  struct buffer_head *bh = NULL;

  *new_last_eb = NULL;

- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
-
  /* we have no tree, so of course, no last_eb. */
- if (!fe->id2.i_list.l_tree_depth)
- goto bail;
+ if (!path->p_tree_depth)
+ goto out;

  /* trunc to zero special case - this makes tree_depth = 0
  * regardless of what it is. */
- if (!new_i_clusters)
- goto bail;
+ if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
+ goto out;

- eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
- el = &(eb->h_list);
+ el = path_leaf_el(path);
  BUG_ON(!el->l_next_free_rec);

- /* Make sure that this guy will actually be empty after we
- * clear away the data. */
- if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
- goto bail;
+ /*
+ * Make sure that this extent list will actually be empty
+ * after we clear away the data. We can shortcut out if
+ * there's more than one non-empty extent in the
+ * list. Otherwise, a check of the remaining extent is
+ * necessary.
+ */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ rec = NULL;
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ if (next_free > 2)
+ goto out;

- /* Ok, at this point, we know that last_eb will definitely
- * change, so lets traverse the tree and find the second to
- * last extent block. */
- el = &(fe->id2.i_list);
- /* go down the tree, */
- do {
- for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
- if (le32_to_cpu(el->l_recs[i].e_cpos) <
- new_i_clusters) {
- block = le64_to_cpu(el->l_recs[i].e_blkno);
- break;
- }
+ /* We may have a valid extent in index 1, check it. */
+ if (next_free == 2)
+ rec = &el->l_recs[1];
+
+ /*
+ * Fall through - no more nonempty extents, so we want
+ * to delete this leaf.
+ */
+ } else {
+ if (next_free > 1)
+ goto out;
+
+ rec = &el->l_recs[0];
+ }
+
+ if (rec) {
+ /*
+ * Check if we'll only be trimming off the end of this
+ * cluster.
+ */
+ if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ el = &eb->h_list;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ ret = -EROFS;
+ goto out;
+ }
+
+ *new_last_eb = bh;
+ get_bh(*new_last_eb);
+ mlog(0, "returning block %llu, (cpos: %u)\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
+ out:
+ brelse(bh);
+
+ return ret;
+ }
+
+ /*
+ * Trim some clusters off the rightmost edge of a tree. Only called
+ * during truncate.
+ *
+ * The caller needs to:
+ * - start journaling of each path component.
+ * - compute and fully set up any new last ext block
+ */
+ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
+ handle_t *handle, struct ocfs2_truncate_context *tc,
+ u32 clusters_to_del, u64 *delete_start)
+ {
+ int ret, i, index = path->p_tree_depth;
+ u32 new_edge = 0;
+ u64 deleted_eb = 0;
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ *delete_start = 0;
+
+ while (index >= 0) {
+ bh = path->p_node[index].bh;
+ el = path->p_node[index].el;
+
+ mlog(0, "traveling tree (index = %d, block = %llu)\n",
+ index, (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+
+ if (index !=
+ (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has invalid ext. block %llu",
+ inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
  }
- BUG_ON(i < 0);

- if (bh) {
- brelse(bh);
- bh = NULL;
+ find_tail_record:
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[i];
+
+ mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
+ "next = %u\n", i, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec),
+ (unsigned long long)le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
+
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ /*
+ * If the leaf block contains a single empty
+ * extent and no records, we can just remove
+ * the block.
+ */
+ if (i == 0 && ocfs2_is_empty_extent(rec)) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ el->l_next_free_rec = cpu_to_le16(0);
+
+ goto delete;
+ }
+
+ /*
+ * Remove any empty extents by shifting things
+ * left. That should make life much easier on
+ * the code below. This condition is rare
+ * enough that we shouldn't see a performance
+ * hit.
+ */
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ le16_add_cpu(&el->l_next_free_rec, -1);
+
+ for(i = 0;
+ i < le16_to_cpu(el->l_next_free_rec); i++)
+ el->l_recs[i] = el->l_recs[i + 1];
+
+ memset(&el->l_recs[i], 0,
+ sizeof(struct ocfs2_extent_rec));
+
+ /*
+ * We've modified our extent list. The
+ * simplest way to handle this change
+ * is to begin the search from the
+ * start again.
+ */
+ goto find_tail_record;
+ }
+
+ le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
+
+ /*
+ * We'll use "new_edge" on our way back up the
+ * tree to know what our rightmost cpos is.
+ */
+ new_edge = le16_to_cpu(rec->e_leaf_clusters);
+ new_edge += le32_to_cpu(rec->e_cpos);
+
+ /*
+ * The caller will use this to delete data blocks.
+ */
+ *delete_start = le64_to_cpu(rec->e_blkno)
+ + ocfs2_clusters_to_blocks(inode->i_sb,
+ le16_to_cpu(rec->e_leaf_clusters));
+
+ /*
+ * If it's now empty, remove this record.
+ */
+ if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&el->l_next_free_rec, -1);
+ }
+ } else {
+ if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&el->l_next_free_rec, -1);
+
+ goto delete;
+ }
+
+ /* Can this actually happen? */
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ goto delete;
+
+ /*
+ * We never actually deleted any clusters
+ * because our leaf was empty. There's no
+ * reason to adjust the rightmost edge then.
+ */
+ if (new_edge == 0)
+ goto delete;
+
+ rec->e_int_clusters = cpu_to_le32(new_edge);
+ le32_add_cpu(&rec->e_int_clusters,
+ -le32_to_cpu(rec->e_cpos));
+
+ /*
+ * A deleted child record should have been
+ * caught above.
+ */
+ BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
  }

- status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
- inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ delete:
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
  }
- eb = (struct ocfs2_extent_block *) bh->b_data;
- el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
+
+ mlog(0, "extent list container %llu, after: record %d: "
+ "(%u, %u, %llu), next = %u.\n",
+ (unsigned long long)bh->b_blocknr, i,
+ le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
+ (unsigned long long)le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ /*
+ * We must be careful to only attempt delete of an
+ * extent block (and not the root inode block).
+ */
+ if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
+ struct ocfs2_extent_block *eb =
+ (struct ocfs2_extent_block *)bh->b_data;
+
+ /*
+ * Save this for use when processing the
+ * parent block.
+ */
+ deleted_eb = le64_to_cpu(eb->h_blkno);
+
+ mlog(0, "deleting this extent block.\n");
+
+ ocfs2_remove_from_cache(inode, bh);
+
+ BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
+ BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
+ BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
+
+ if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
+ /*
+ * This code only understands how to
+ * lock the suballocator in slot 0,
+ * which is fine because allocation is
+ * only ever done out of that
+ * suballocator too. A future version
+ * might change that however, so avoid
+ * a free if we don't know how to
+ * handle it. This way an fs incompat
+ * bit will not be necessary.
+ */
+ ret = ocfs2_free_extent_block(handle,
+ tc->tc_ext_alloc_inode,
+ tc->tc_ext_alloc_bh,
+ eb);
+
+ /* An error here is not fatal. */
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+ } else {
+ deleted_eb = 0;
  }
- } while (el->l_tree_depth);

- *new_last_eb = bh;
- get_bh(*new_last_eb);
- mlog(0, "returning block %llu\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno));
- bail:
- if (bh)
- brelse(bh);
+ index--;
+ }

- return status;
+ ret = 0;
+ out:
+ return ret;
  }
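
The leaf-level bookkeeping in ocfs2_trim_tree() boils down to two results: the new rightmost cpos ("new_edge") and the first disk block to hand to the truncate log. A standalone sketch of that arithmetic, with made-up numbers and a simplified record type:

#include <stdint.h>
#include <stdio.h>

struct toy_rec { uint32_t cpos; uint64_t blkno; uint16_t clusters; };

/*
 * Trimming del clusters shrinks the tail record; the data to free
 * starts right after the clusters we keep (bpc = blocks per cluster).
 */
static uint64_t trim_tail(struct toy_rec *rec, uint16_t del,
			  unsigned int bpc, uint32_t *new_edge)
{
	rec->clusters -= del;
	*new_edge = rec->cpos + rec->clusters;
	return rec->blkno + (uint64_t)rec->clusters * bpc;
}

int main(void)
{
	struct toy_rec rec = { 100, 5000, 16 };
	uint32_t edge;
	uint64_t start = trim_tail(&rec, 6, 8, &edge);

	/* keeps 10 clusters: edge = 110, delete_start = 5000 + 80 */
	printf("new edge %u, delete from block %llu\n",
	       edge, (unsigned long long)start);
	return 0;
}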

  static int ocfs2_do_truncate(struct ocfs2_super *osb,
  unsigned int clusters_to_del,
  struct inode *inode,
  struct buffer_head *fe_bh,
- struct buffer_head *old_last_eb_bh,
  handle_t *handle,
- struct ocfs2_truncate_context *tc)
+ struct ocfs2_truncate_context *tc,
+ struct ocfs2_path *path)
  {
- int status, i, depth;
+ int status;
  struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
  struct ocfs2_extent_block *last_eb = NULL;
  struct ocfs2_extent_list *el;
- struct buffer_head *eb_bh = NULL;
  struct buffer_head *last_eb_bh = NULL;
- u64 next_eb = 0;
  u64 delete_blk = 0;

  fe = (struct ocfs2_dinode *) fe_bh->b_data;

- status = ocfs2_find_new_last_ext_blk(osb,
- inode,
- fe,
- le32_to_cpu(fe->i_clusters) -
- clusters_to_del,
- old_last_eb_bh,
- &last_eb_bh);
+ status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
+ path, &last_eb_bh);
  if (status < 0) {
  mlog_errno(status);
  goto bail;
  }
- if (last_eb_bh)
- last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;

- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ /*
+ * Each component will be touched, so we might as well journal
+ * here to avoid having to handle errors later.
+ */
+ status = ocfs2_journal_access_path(inode, handle, path);
  if (status < 0) {
  mlog_errno(status);
  goto bail;
  }
+
+ if (last_eb_bh) {
+ status = ocfs2_journal_access(handle, inode, last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ }
+
  el = &(fe->id2.i_list);

+ /*
+ * Lower levels depend on this never happening, but it's best
+ * to check it up here before changing the tree.
+ */
+ if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has an empty extent record, depth %u\n",
+ inode->i_ino, le16_to_cpu(el->l_tree_depth));
+ status = -EROFS;
+ goto bail;
+ }
+
  spin_lock(&OCFS2_I(inode)->ip_lock);
  OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
  clusters_to_del;
  spin_unlock(&OCFS2_I(inode)->ip_lock);
  le32_add_cpu(&fe->i_clusters, -clusters_to_del);
- fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
- i = le16_to_cpu(el->l_next_free_rec) - 1;
-
- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
- /* tree depth zero, we can just delete the clusters, otherwise
- * we need to record the offset of the next level extent block
- * as we may overwrite it. */
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- else
- next_eb = le64_to_cpu(el->l_recs[i].e_blkno);

- if (!el->l_recs[i].e_clusters) {
- /* if we deleted the whole extent record, then clear
- * out the other fields and update the extent
- * list. For depth > 0 trees, we've already recorded
- * the extent block in 'next_eb' */
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
+ status = ocfs2_trim_tree(inode, path, handle, tc,
+ clusters_to_del, &delete_blk);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
  }

- depth = le16_to_cpu(el->l_tree_depth);
- if (!fe->i_clusters) {
+ if (le32_to_cpu(fe->i_clusters) == 0) {
  /* trunc to zero is a special case. */
  el->l_tree_depth = 0;
  fe->i_last_eb_blk = 0;
@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
  /* If there will be a new last extent block, then by
  * definition, there cannot be any leaves to the right of
  * him. */
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
  last_eb->h_next_leaf_blk = 0;
  status = ocfs2_journal_dirty(handle, last_eb_bh);
  if (status < 0) {
@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
  }
  }

- /* if our tree depth > 0, update all the tree blocks below us. */
- while (depth) {
- mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
- depth, (unsigned long long)next_eb);
- status = ocfs2_read_block(osb, next_eb, &eb_bh,
- OCFS2_BH_CACHED, inode);
+ if (delete_blk) {
+ status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+ clusters_to_del);
  if (status < 0) {
  mlog_errno(status);
  goto bail;
  }
- eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
+ }
+ status = 0;
+ bail:
+
+ mlog_exit(status);
+ return status;
+ }
+
+ static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+ {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return 0;
+ }
+
+ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+ {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return ocfs2_journal_dirty_data(handle, bh);
+ }
+
+ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+ struct page **pages, int numpages,
+ u64 phys, handle_t *handle)
+ {
+ int i, ret, partial = 0;
+ void *kaddr;
+ struct page *page;
+ unsigned int from, to = PAGE_CACHE_SIZE;
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ if (numpages == 0)
+ goto out;
+
+ from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+ if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+ /*
+ * Since 'from' has been capped to a value below page
+ * size, this calculation won't be able to overflow
+ * 'to'
+ */
+ to = ocfs2_align_bytes_to_clusters(sb, from);
+
+ /*
+ * The truncate tail in this case should never contain
+ * more than one page at maximum. The loop below also
+ * assumes this.
+ */
+ BUG_ON(numpages != 1);
+ }
+
+ for(i = 0; i < numpages; i++) {
+ page = pages[i];
+
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+
+ ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+ if (ret)
+ mlog_errno(ret);
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + from, 0, to - from);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ /*
+ * Need to set the buffers we zero'd into uptodate
+ * here if they aren't - ocfs2_map_page_blocks()
+ * might've skipped some
+ */
+ if (ocfs2_should_order_data(inode)) {
+ ret = walk_page_buffers(handle,
+ page_buffers(page),
+ from, to, &partial,
+ ocfs2_ordered_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
+ } else {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_writeback_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
  }
- el = &(eb->h_list);

- status = ocfs2_journal_access(handle, inode, eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ if (!partial)
+ SetPageUptodate(page);
+
+ flush_dcache_page(page);
+
+ /*
+ * Every page after the 1st one should be completely zero'd.
+ */
+ from = 0;
+ }
+ out:
+ if (pages) {
+ for (i = 0; i < numpages; i++) {
+ page = pages[i];
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
  }
+ }
+ }
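
The from/to bounds used by the zeroing loop are easy to check in userspace. The helper below is an assumed analogue of the kernel's cluster-alignment macro, not the real ocfs2_align_bytes_to_clusters(); page and cluster sizes are hypothetical powers of two:

#include <stdint.h>
#include <stdio.h>

/*
 * 'from' is i_size's offset within its page; 'to' is the end of the
 * page, or - when clusters are smaller than pages - the end of the
 * cluster, rounded up from 'from'.
 */
static void zero_bounds(uint64_t isize, uint32_t page_size,
			uint32_t cluster_size, uint32_t *from, uint32_t *to)
{
	*from = isize & (page_size - 1);
	if (page_size > cluster_size)
		*to = (*from + cluster_size - 1) & ~(cluster_size - 1);
	else
		*to = page_size;
}

int main(void)
{
	uint32_t from, to;

	/* 4K pages, 4K clusters: zero from offset 1000 to the page end */
	zero_bounds(1000, 4096, 4096, &from, &to);
	printf("from=%u to=%u\n", from, to);

	/* 4K pages, 1K clusters: zero only to the end of the cluster */
	zero_bounds(5000, 4096, 1024, &from, &to);
	printf("from=%u to=%u\n", from, to);
	return 0;
}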

- BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
- BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+ int *num, u64 *phys)
+ {
+ int i, numpages = 0, ret = 0;
+ unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+ unsigned int ext_flags;
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long index;
+ u64 next_cluster_bytes;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ /* Cluster boundary, so we don't need to grab any pages. */
+ if ((isize & (csize - 1)) == 0)
+ goto out;

- i = le16_to_cpu(el->l_next_free_rec) - 1;
+ ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+ phys, NULL, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }

- mlog(0, "extent block %llu, before: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
+ /* Tail is a hole. */
+ if (*phys == 0)
+ goto out;

- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-
- next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
- /* bottom-most block requires us to delete data.*/
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- if (!el->l_recs[i].e_clusters) {
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
- mlog(0, "extent block %llu, after: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
+ /* Tail is marked as unwritten, we can count on write to zero
+ * in that case. */
+ if (ext_flags & OCFS2_EXT_UNWRITTEN)
+ goto out;

- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
+ index = isize >> PAGE_CACHE_SHIFT;
+ do {
+ pages[numpages] = grab_cache_page(mapping, index);
+ if (!pages[numpages]) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
  }

- if (!el->l_next_free_rec) {
- mlog(0, "deleting this extent block.\n");
-
+ numpages++;
+ index++;
+ } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
- ocfs2_remove_from_cache(inode, eb_bh);

- BUG_ON(el->l_recs[0].e_clusters);
- BUG_ON(el->l_recs[0].e_cpos);
- BUG_ON(el->l_recs[0].e_blkno);
- if (eb->h_suballoc_slot == 0) {
- /*
- * This code only understands how to
- * lock the suballocator in slot 0,
- * which is fine because allocation is
- * only ever done out of that
- * suballocator too. A future version
- * might change that however, so avoid
- * a free if we don't know how to
- * handle it. This way an fs incompat
- * bit will not be necessary.
- */
- status = ocfs2_free_extent_block(handle,
- tc->tc_ext_alloc_inode,
- tc->tc_ext_alloc_bh,
- eb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ out:
+ if (ret != 0) {
+ if (pages) {
+ for (i = 0; i < numpages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
  }
  }
  }
- brelse(eb_bh);
- eb_bh = NULL;
- depth--;
+ numpages = 0;
  }

- BUG_ON(!delete_blk);
- status = ocfs2_truncate_log_append(osb, handle, delete_blk,
- clusters_to_del);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ *num = numpages;
+
+ return ret;
+ }
+
+ /*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+ u64 new_i_size)
+ {
+ int ret, numpages;
+ loff_t endbyte;
+ struct page **pages = NULL;
+ u64 phys;
+
+ /*
+ * File systems which don't support sparse files zero on every
+ * extend.
+ */
+ if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ return 0;
+
+ pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+ sizeof(struct page *), GFP_NOFS);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
  }
- status = 0;
- bail:
- if (!status)
- ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
- else
- ocfs2_extent_map_drop(inode, 0);
- mlog_exit(status);
- return status;
+
+ ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (numpages == 0)
+ goto out;
+
+ ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+ handle);
+
+ /*
+ * Initiate writeout of the pages we zero'd here. We don't
+ * wait on them - the truncate_inode_pages() call later will
+ * do that for us.
+ */
+ endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+ ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
+ endbyte - 1, SYNC_FILE_RANGE_WRITE);
+ if (ret)
+ mlog_errno(ret);
+
+ out:
+ if (pages)
+ kfree(pages);
+
+ return ret;
  }

  /*
@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
1770 struct ocfs2_truncate_context *tc) 3623 struct ocfs2_truncate_context *tc)
1771{ 3624{
1772 int status, i, credits, tl_sem = 0; 3625 int status, i, credits, tl_sem = 0;
1773 u32 clusters_to_del, target_i_clusters; 3626 u32 clusters_to_del, new_highest_cpos, range;
1774 u64 last_eb = 0;
1775 struct ocfs2_dinode *fe;
1776 struct ocfs2_extent_block *eb;
1777 struct ocfs2_extent_list *el; 3627 struct ocfs2_extent_list *el;
1778 struct buffer_head *last_eb_bh;
1779 handle_t *handle = NULL; 3628 handle_t *handle = NULL;
1780 struct inode *tl_inode = osb->osb_tl_inode; 3629 struct inode *tl_inode = osb->osb_tl_inode;
3630 struct ocfs2_path *path = NULL;
1781 3631
1782 mlog_entry_void(); 3632 mlog_entry_void();
1783 3633
1784 down_write(&OCFS2_I(inode)->ip_alloc_sem); 3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1785 3635
1786 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
1787 i_size_read(inode)); 3637 i_size_read(inode));
1788 3638
1789 last_eb_bh = tc->tc_last_eb_bh; 3639 path = ocfs2_new_inode_path(fe_bh);
1790 tc->tc_last_eb_bh = NULL; 3640 if (!path) {
3641 status = -ENOMEM;
3642 mlog_errno(status);
3643 goto bail;
3644 }
1791 3645
1792 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3646 ocfs2_extent_map_trunc(inode, new_highest_cpos);
1793 3647
1794 if (fe->id2.i_list.l_tree_depth) {
1795 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1796 el = &eb->h_list;
1797 } else
1798 el = &fe->id2.i_list;
1799 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1800start: 3648start:
1801 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " 3649 /*
1802 "last_eb = %llu, fe->i_last_eb_blk = %llu, " 3650 * Check that we still have allocation to delete.
1803 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", 3651 */
1804 le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, 3652 if (OCFS2_I(inode)->ip_clusters == 0) {
1805 (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), 3653 status = 0;
1806 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); 3654 goto bail;
1807 3655 }
1808 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1809 mlog(0, "last_eb changed!\n");
1810 BUG_ON(!fe->id2.i_list.l_tree_depth);
1811 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1812 /* i_last_eb_blk may have changed, read it if
1813 * necessary. We don't have to worry about the
1814 * truncate to zero case here (where there becomes no
1815 * last_eb) because we never loop back after our work
1816 * is done. */
1817 if (last_eb_bh) {
1818 brelse(last_eb_bh);
1819 last_eb_bh = NULL;
1820 }
1821 3656
1822 status = ocfs2_read_block(osb, last_eb, 3657 /*
1823 &last_eb_bh, OCFS2_BH_CACHED, 3658 * Truncate always works against the rightmost tree branch.
1824 inode); 3659 */
1825 if (status < 0) { 3660 status = ocfs2_find_path(inode, path, UINT_MAX);
1826 mlog_errno(status); 3661 if (status) {
1827 goto bail; 3662 mlog_errno(status);
1828 } 3663 goto bail;
1829 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3664 }
1830 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3665
1831 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3666 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
1832 status = -EIO; 3667 OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
1833 goto bail; 3668
1834 } 3669 /*
1835 el = &(eb->h_list); 3670 * By now, el will point to the extent list on the bottommost
3671 * portion of this tree. Only the tail record is considered in
3672 * each pass.
3673 *
3674 * We handle the following cases, in order:
3675 * - empty extent: delete the remaining branch
3676 * - remove the entire record
3677 * - remove a partial record
3678 * - no record needs to be removed (truncate has completed)
3679 */
3680 el = path_leaf_el(path);
3681 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3682 ocfs2_error(inode->i_sb,
3683 "Inode %llu has empty extent block at %llu\n",
3684 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3685 (unsigned long long)path_leaf_bh(path)->b_blocknr);
3686 status = -EROFS;
3687 goto bail;
1836 } 3688 }
1837 3689
1838 /* by now, el will point to the extent list on the bottom most
1839 * portion of this tree. */
1840 i = le16_to_cpu(el->l_next_free_rec) - 1; 3690 i = le16_to_cpu(el->l_next_free_rec) - 1;
1841 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) 3691 range = le32_to_cpu(el->l_recs[i].e_cpos) +
1842 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); 3692 ocfs2_rec_clusters(el, &el->l_recs[i]);
1843 else 3693 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
1844 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + 3694 clusters_to_del = 0;
3695 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3696 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
3697 } else if (range > new_highest_cpos) {
3698 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
1845 le32_to_cpu(el->l_recs[i].e_cpos)) - 3699 le32_to_cpu(el->l_recs[i].e_cpos)) -
1846 target_i_clusters; 3700 new_highest_cpos;
3701 } else {
3702 status = 0;
3703 goto bail;
3704 }
1847 3705
1848 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); 3706 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3707 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
3708
3709 BUG_ON(clusters_to_del == 0);
1849 3710
1850 mutex_lock(&tl_inode->i_mutex); 3711 mutex_lock(&tl_inode->i_mutex);
1851 tl_sem = 1; 3712 tl_sem = 1;
@@ -1861,7 +3722,8 @@ start:
1861 } 3722 }
1862 3723
1863 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 3724 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1864 fe, el); 3725 (struct ocfs2_dinode *)fe_bh->b_data,
3726 el);
1865 handle = ocfs2_start_trans(osb, credits); 3727 handle = ocfs2_start_trans(osb, credits);
1866 if (IS_ERR(handle)) { 3728 if (IS_ERR(handle)) {
1867 status = PTR_ERR(handle); 3729 status = PTR_ERR(handle);
@@ -1870,13 +3732,8 @@ start:
1870 goto bail; 3732 goto bail;
1871 } 3733 }
1872 3734
1873 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 3735 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
1874 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 3736 tc, path);
1875 if (status < 0)
1876 mlog_errno(status);
1877
1878 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1879 last_eb_bh, handle, tc);
1880 if (status < 0) { 3737 if (status < 0) {
1881 mlog_errno(status); 3738 mlog_errno(status);
1882 goto bail; 3739 goto bail;
@@ -1888,9 +3745,14 @@ start:
1888 ocfs2_commit_trans(osb, handle); 3745 ocfs2_commit_trans(osb, handle);
1889 handle = NULL; 3746 handle = NULL;
1890 3747
1891 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); 3748 ocfs2_reinit_path(path, 1);
1892 if (le32_to_cpu(fe->i_clusters) > target_i_clusters) 3749
1893 goto start; 3750 /*
3751 * The check above will catch the case where we've truncated
3752 * away all allocation.
3753 */
3754 goto start;
3755
1894bail: 3756bail:
1895 up_write(&OCFS2_I(inode)->ip_alloc_sem); 3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1896 3758
@@ -1902,8 +3764,7 @@ bail:
1902 if (handle) 3764 if (handle)
1903 ocfs2_commit_trans(osb, handle); 3765 ocfs2_commit_trans(osb, handle);
1904 3766
1905 if (last_eb_bh) 3767 ocfs2_free_path(path);
1906 brelse(last_eb_bh);
1907 3768
1908 /* This will drop the ext_alloc cluster lock for us */ 3769 /* This will drop the ext_alloc cluster lock for us */
1909 ocfs2_free_truncate_context(tc); 3770 ocfs2_free_truncate_context(tc);
@@ -1912,7 +3773,6 @@ bail:
1912 return status; 3773 return status;
1913} 3774}
1914 3775
1915
1916/* 3776/*
1917 * Expects the inode to already be locked. This will figure out which 3777 * Expects the inode to already be locked. This will figure out which
1918 * inodes need to be locked and will put them on the returned truncate 3778 * inodes need to be locked and will put them on the returned truncate
@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1923 struct buffer_head *fe_bh, 3783 struct buffer_head *fe_bh,
1924 struct ocfs2_truncate_context **tc) 3784 struct ocfs2_truncate_context **tc)
1925{ 3785{
1926 int status, metadata_delete; 3786 int status, metadata_delete, i;
1927 unsigned int new_i_clusters; 3787 unsigned int new_i_clusters;
1928 struct ocfs2_dinode *fe; 3788 struct ocfs2_dinode *fe;
1929 struct ocfs2_extent_block *eb; 3789 struct ocfs2_extent_block *eb;
@@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1944 "%llu\n", fe->i_clusters, new_i_clusters, 3804 "%llu\n", fe->i_clusters, new_i_clusters,
1945 (unsigned long long)fe->i_size); 3805 (unsigned long long)fe->i_size);
1946 3806
1947 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1948 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1949 "%u and size %llu whereas struct inode has "
1950 "cluster count %u and size %llu which caused an "
1951 "invalid truncate to %u clusters.",
1952 (unsigned long long)le64_to_cpu(fe->i_blkno),
1953 le32_to_cpu(fe->i_clusters),
1954 (unsigned long long)le64_to_cpu(fe->i_size),
1955 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1956 new_i_clusters);
1957 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1958 status = -EIO;
1959 goto bail;
1960 }
1961
1962 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 3807 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1963 if (!(*tc)) { 3808 if (!(*tc)) {
1964 status = -ENOMEM; 3809 status = -ENOMEM;
@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1986 goto bail; 3831 goto bail;
1987 } 3832 }
1988 el = &(eb->h_list); 3833 el = &(eb->h_list);
1989 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) 3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
1990 metadata_delete = 1; 3843 metadata_delete = 1;
1991 } 3844 }
1992 3845
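
[Editor's note: a sketch, not part of the patch. The per-pass decision in
ocfs2_commit_truncate() above is easiest to see in isolation; the helper
below restates it with the same branch structure. The helper name and the
*done out-parameter are hypothetical, while the record fields and the
ocfs2_rec_clusters()/ocfs2_is_empty_extent() accessors are the ones this
patch introduces.]

/*
 * Hypothetical helper, for illustration only: given the tail record of
 * the rightmost leaf and the new highest cluster offset, decide how
 * many clusters this truncate pass should remove.
 */
static u32 truncate_clusters_this_pass(struct ocfs2_extent_list *el, int i,
				       u32 new_highest_cpos, int *done)
{
	u32 cpos = le32_to_cpu(el->l_recs[i].e_cpos);
	u32 len = ocfs2_rec_clusters(el, &el->l_recs[i]);

	*done = 0;
	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i]))
		return 0;	/* empty extent: strip the remaining branch */
	if (cpos >= new_highest_cpos)
		return len;	/* record lies entirely past the new EOF */
	if (cpos + len > new_highest_cpos)
		return cpos + len - new_highest_cpos;	/* partial record */
	*done = 1;		/* truncate has completed */
	return 0;
}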
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0b82e8044325..fbcb5934a081 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
31 handle_t *handle, 31 handle_t *handle,
32 struct inode *inode, 32 struct inode *inode,
33 struct buffer_head *fe_bh, 33 struct buffer_head *fe_bh,
34 u64 blkno, 34 u32 cpos,
35 u64 start_blk,
35 u32 new_clusters, 36 u32 new_clusters,
36 struct ocfs2_alloc_context *meta_ac); 37 struct ocfs2_alloc_context *meta_ac);
37int ocfs2_num_free_extents(struct ocfs2_super *osb, 38int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
70 struct buffer_head *tc_last_eb_bh; 71 struct buffer_head *tc_last_eb_bh;
71}; 72};
72 73
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size);
73int ocfs2_prepare_truncate(struct ocfs2_super *osb, 76int ocfs2_prepare_truncate(struct ocfs2_super *osb,
74 struct inode *inode, 77 struct inode *inode,
75 struct buffer_head *fe_bh, 78 struct buffer_head *fe_bh,
@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
79 struct buffer_head *fe_bh, 82 struct buffer_head *fe_bh,
80 struct ocfs2_truncate_context *tc); 83 struct ocfs2_truncate_context *tc);
81 84
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh);
87
88/*
89 * Helper function to look at the # of clusters in an extent record.
90 */
91static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
92 struct ocfs2_extent_rec *rec)
93{
94 /*
95 * Cluster count in extent records is slightly different
96 * between interior nodes and leaf nodes. This is to support
97 * unwritten extents which need a flags field in leaf node
98 * records, thus shrinking the available space for a clusters
99 * field.
100 */
101 if (el->l_tree_depth)
102 return le32_to_cpu(rec->e_int_clusters);
103 else
104 return le16_to_cpu(rec->e_leaf_clusters);
105}
106
82#endif /* OCFS2_ALLOC_H */ 107#endif /* OCFS2_ALLOC_H */
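
[Editor's note: a quick worked use of the accessor above, illustrative
only. Because leaf and interior records keep their cluster counts in
different fields, range computations should go through
ocfs2_rec_clusters(); this is exactly the 'range' value computed by the
truncate loop in alloc.c.]

	/* The half-open cluster range [cpos, range) covered by record i. */
	u32 cpos  = le32_to_cpu(el->l_recs[i].e_cpos);
	u32 len   = ocfs2_rec_clusters(el, &el->l_recs[i]);
	u32 range = cpos + len;		/* first cluster past the record */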
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c11443817..56963e6c46c0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h>
27 29
28#define MLOG_MASK_PREFIX ML_FILE_IO 30#define MLOG_MASK_PREFIX ML_FILE_IO
29#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -37,6 +39,7 @@
37#include "file.h" 39#include "file.h"
38#include "inode.h" 40#include "inode.h"
39#include "journal.h" 41#include "journal.h"
42#include "suballoc.h"
40#include "super.h" 43#include "super.h"
41#include "symlink.h" 44#include "symlink.h"
42 45
@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh_result, int create) 137 struct buffer_head *bh_result, int create)
135{ 138{
136 int err = 0; 139 int err = 0;
140 unsigned int ext_flags;
137 u64 p_blkno, past_eof; 141 u64 p_blkno, past_eof;
142 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
138 143
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 144 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
140 (unsigned long long)iblock, bh_result, create); 145 (unsigned long long)iblock, bh_result, create);
@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
149 goto bail; 154 goto bail;
150 } 155 }
151 156
152 /* this can happen if another node truncs after our extend! */ 157 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
153 spin_lock(&OCFS2_I(inode)->ip_lock); 158 &ext_flags);
154 if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
155 OCFS2_I(inode)->ip_clusters))
156 err = -EIO;
157 spin_unlock(&OCFS2_I(inode)->ip_lock);
158 if (err)
159 goto bail;
160
161 err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
162 NULL);
163 if (err) { 159 if (err) {
164 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 160 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
165 "%llu, NULL)\n", err, inode, (unsigned long long)iblock, 161 "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
167 goto bail; 163 goto bail;
168 } 164 }
169 165
170 map_bh(bh_result, inode->i_sb, p_blkno); 166 /*
171 167 * ocfs2 never allocates in this function - the only time we
172 if (bh_result->b_blocknr == 0) { 168 * need to use BH_New is when we're extending i_size on a file
173 err = -EIO; 169 * system which doesn't support holes, in which case BH_New
174 mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", 170 * allows block_prepare_write() to zero.
175 (unsigned long long)iblock, 171 */
176 (unsigned long long)p_blkno, 172 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
177 (unsigned long long)OCFS2_I(inode)->ip_blkno); 173 "ino %lu, iblock %llu\n", inode->i_ino,
178 } 174 (unsigned long long)iblock);
175
176 /* Treat the unwritten extent as a hole for zeroing purposes. */
177 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
178 map_bh(bh_result, inode->i_sb, p_blkno);
179
180 if (!ocfs2_sparse_alloc(osb)) {
181 if (p_blkno == 0) {
182 err = -EIO;
183 mlog(ML_ERROR,
184 "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
185 (unsigned long long)iblock,
186 (unsigned long long)p_blkno,
187 (unsigned long long)OCFS2_I(inode)->ip_blkno);
188 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
189 dump_stack();
190 }
179 191
180 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 192 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
181 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 193 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
182 (unsigned long long)past_eof); 194 (unsigned long long)past_eof);
183 195
184 if (create && (iblock >= past_eof)) 196 if (create && (iblock >= past_eof))
185 set_buffer_new(bh_result); 197 set_buffer_new(bh_result);
198 }
186 199
187bail: 200bail:
188 if (err < 0) 201 if (err < 0)
@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
276 return ret; 289 return ret;
277} 290}
278 291
279/* This can also be called from ocfs2_write_zero_page() which has done 292/*
280 * it's own cluster locking. */ 293 * This is called from ocfs2_write_zero_page() which has handled its
294 * own cluster locking and has ensured allocation exists for those
295 * blocks to be written.
296 */
281int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 297int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
282 unsigned from, unsigned to) 298 unsigned from, unsigned to)
283{ 299{
@@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
292 return ret; 308 return ret;
293} 309}
294 310
295/*
296 * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
297 * from loopback. It must be able to perform its own locking around
298 * ocfs2_get_block().
299 */
300static int ocfs2_prepare_write(struct file *file, struct page *page,
301 unsigned from, unsigned to)
302{
303 struct inode *inode = page->mapping->host;
304 int ret;
305
306 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
307
308 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
309 if (ret != 0) {
310 mlog_errno(ret);
311 goto out;
312 }
313
314 ret = ocfs2_prepare_write_nolock(inode, page, from, to);
315
316 ocfs2_meta_unlock(inode, 0);
317out:
318 mlog_exit(ret);
319 return ret;
320}
321
322/* Taken from ext3. We don't necessarily need the full blown 311/* Taken from ext3. We don't necessarily need the full blown
323 * functionality yet, but IMHO it's better to cut and paste the whole 312 * functionality yet, but IMHO it's better to cut and paste the whole
324 * thing so we can avoid introducing our own bugs (and easily pick up 313 * thing so we can avoid introducing our own bugs (and easily pick up
325 * their fixes when they happen) --Mark */ 314 * their fixes when they happen) --Mark */
326static int walk_page_buffers( handle_t *handle, 315int walk_page_buffers( handle_t *handle,
327 struct buffer_head *head, 316 struct buffer_head *head,
328 unsigned from, 317 unsigned from,
329 unsigned to, 318 unsigned to,
330 int *partial, 319 int *partial,
331 int (*fn)( handle_t *handle, 320 int (*fn)( handle_t *handle,
332 struct buffer_head *bh)) 321 struct buffer_head *bh))
333{ 322{
334 struct buffer_head *bh; 323 struct buffer_head *bh;
335 unsigned block_start, block_end; 324 unsigned block_start, block_end;
@@ -388,95 +377,6 @@ out:
388 return handle; 377 return handle;
389} 378}
390 379
391static int ocfs2_commit_write(struct file *file, struct page *page,
392 unsigned from, unsigned to)
393{
394 int ret;
395 struct buffer_head *di_bh = NULL;
396 struct inode *inode = page->mapping->host;
397 handle_t *handle = NULL;
398 struct ocfs2_dinode *di;
399
400 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
401
402 /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
403 * us to continue here without rechecking the I/O against
404 * changed inode values.
405 *
406 * 1) We're currently holding the inode alloc lock, so no
407 * nodes can change it underneath us.
408 *
409 * 2) We've had to take the metadata lock at least once
410 * already to check for extending writes, suid removal, etc.
411 * The meta data update code then ensures that we don't get a
412 * stale inode allocation image (i_size, i_clusters, etc).
413 */
414
415 ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
416 if (ret != 0) {
417 mlog_errno(ret);
418 goto out;
419 }
420
421 ret = ocfs2_data_lock_with_page(inode, 1, page);
422 if (ret != 0) {
423 mlog_errno(ret);
424 goto out_unlock_meta;
425 }
426
427 handle = ocfs2_start_walk_page_trans(inode, page, from, to);
428 if (IS_ERR(handle)) {
429 ret = PTR_ERR(handle);
430 goto out_unlock_data;
431 }
432
433 /* Mark our buffer early. We'd rather catch this error up here
434 * as opposed to after a successful commit_write which would
435 * require us to set back inode->i_size. */
436 ret = ocfs2_journal_access(handle, inode, di_bh,
437 OCFS2_JOURNAL_ACCESS_WRITE);
438 if (ret < 0) {
439 mlog_errno(ret);
440 goto out_commit;
441 }
442
443 /* might update i_size */
444 ret = generic_commit_write(file, page, from, to);
445 if (ret < 0) {
446 mlog_errno(ret);
447 goto out_commit;
448 }
449
450 di = (struct ocfs2_dinode *)di_bh->b_data;
451
452 /* ocfs2_mark_inode_dirty() is too heavy to use here. */
453 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
454 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
455 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
456
457 inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
458 di->i_size = cpu_to_le64((u64)i_size_read(inode));
459
460 ret = ocfs2_journal_dirty(handle, di_bh);
461 if (ret < 0) {
462 mlog_errno(ret);
463 goto out_commit;
464 }
465
466out_commit:
467 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
468out_unlock_data:
469 ocfs2_data_unlock(inode, 1);
470out_unlock_meta:
471 ocfs2_meta_unlock(inode, 1);
472out:
473 if (di_bh)
474 brelse(di_bh);
475
476 mlog_exit(ret);
477 return ret;
478}
479
480static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 380static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
481{ 381{
482 sector_t status; 382 sector_t status;
@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
499 down_read(&OCFS2_I(inode)->ip_alloc_sem); 399 down_read(&OCFS2_I(inode)->ip_alloc_sem);
500 } 400 }
501 401
502 err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, 402 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
503 NULL);
504 403
505 if (!INODE_JOURNAL(inode)) { 404 if (!INODE_JOURNAL(inode)) {
506 up_read(&OCFS2_I(inode)->ip_alloc_sem); 405 up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
540 struct buffer_head *bh_result, int create) 439 struct buffer_head *bh_result, int create)
541{ 440{
542 int ret; 441 int ret;
543 u64 p_blkno, inode_blocks; 442 u64 p_blkno, inode_blocks, contig_blocks;
544 int contig_blocks; 443 unsigned int ext_flags;
545 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 444 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
546 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 445 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
547 446
@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
549 * nicely aligned and of the right size, so there's no need 448 * nicely aligned and of the right size, so there's no need
550 * for us to check any of that. */ 449 * for us to check any of that. */
551 450
552 spin_lock(&OCFS2_I(inode)->ip_lock); 451 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
553 inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
554 OCFS2_I(inode)->ip_clusters);
555
556 /*
557 * For a read which begins past the end of file, we return a hole.
558 */
559 if (!create && (iblock >= inode_blocks)) {
560 spin_unlock(&OCFS2_I(inode)->ip_lock);
561 ret = 0;
562 goto bail;
563 }
564 452
565 /* 453 /*
566 * Any write past EOF is not allowed because we'd be extending. 454 * Any write past EOF is not allowed because we'd be extending.
567 */ 455 */
568 if (create && (iblock + max_blocks) > inode_blocks) { 456 if (create && (iblock + max_blocks) > inode_blocks) {
569 spin_unlock(&OCFS2_I(inode)->ip_lock);
570 ret = -EIO; 457 ret = -EIO;
571 goto bail; 458 goto bail;
572 } 459 }
573 spin_unlock(&OCFS2_I(inode)->ip_lock);
574 460
575 /* This figures out the size of the next contiguous block, and 461 /* This figures out the size of the next contiguous block, and
576 * our logical offset */ 462 * our logical offset */
577 ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, 463 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
578 &contig_blocks); 464 &contig_blocks, &ext_flags);
579 if (ret) { 465 if (ret) {
580 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 466 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
581 (unsigned long long)iblock); 467 (unsigned long long)iblock);
@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
583 goto bail; 469 goto bail;
584 } 470 }
585 471
586 map_bh(bh_result, inode->i_sb, p_blkno); 472 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
473 ocfs2_error(inode->i_sb,
474 "Inode %llu has a hole at block %llu\n",
475 (unsigned long long)OCFS2_I(inode)->ip_blkno,
476 (unsigned long long)iblock);
477 ret = -EROFS;
478 goto bail;
479 }
480
481 /*
482 * get_more_blocks() expects us to describe a hole by clearing
483 * the mapped bit on bh_result().
484 *
485 * Consider an unwritten extent as a hole.
486 */
487 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
488 map_bh(bh_result, inode->i_sb, p_blkno);
489 else {
490 /*
491 * ocfs2_prepare_inode_for_write() should have caught
492 * the case where we'd be filling a hole and triggered
493 * a buffered write instead.
494 */
495 if (create) {
496 ret = -EIO;
497 mlog_errno(ret);
498 goto bail;
499 }
500
501 clear_buffer_mapped(bh_result);
502 }
587 503
588 /* make sure we don't map more than max_blocks blocks here as 504 /* make sure we don't map more than max_blocks blocks here as
589 that's all the kernel will handle at this point. */ 505 that's all the kernel will handle at this point. */
@@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
606 void *private) 522 void *private)
607{ 523{
608 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 524 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
525 int level;
609 526
610 /* this io's submitter should not have unlocked this before we could */ 527 /* this io's submitter should not have unlocked this before we could */
611 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 528 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
529
612 ocfs2_iocb_clear_rw_locked(iocb); 530 ocfs2_iocb_clear_rw_locked(iocb);
613 up_read(&inode->i_alloc_sem); 531
614 ocfs2_rw_unlock(inode, 0); 532 level = ocfs2_iocb_rw_locked_level(iocb);
533 if (!level)
534 up_read(&inode->i_alloc_sem);
535 ocfs2_rw_unlock(inode, level);
615} 536}
616 537
617/* 538/*
@@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw,
647 568
648 mlog_entry_void(); 569 mlog_entry_void();
649 570
650 /* 571 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
651 * We get PR data locks even for O_DIRECT. This allows 572 /*
652 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with 573 * We get PR data locks even for O_DIRECT. This
653 * extending and buffered zeroing writes race. If they did 574 * allows concurrent O_DIRECT I/O but doesn't let
654 * race then the buffered zeroing could be written back after 575 * O_DIRECT with extending and buffered zeroing writes
655 * the O_DIRECT I/O. It's one thing to tell people not to mix 576 * race. If they did race then the buffered zeroing
656 * buffered and O_DIRECT writes, but expecting them to 577 * could be written back after the O_DIRECT I/O. It's
657 * understand that file extension is also an implicit buffered 578 * one thing to tell people not to mix buffered and
658 * write is too much. By getting the PR we force writeback of 579 * O_DIRECT writes, but expecting them to understand
659 * the buffered zeroing before proceeding. 580 * that file extension is also an implicit buffered
660 */ 581 * write is too much. By getting the PR we force
661 ret = ocfs2_data_lock(inode, 0); 582 * writeback of the buffered zeroing before
662 if (ret < 0) { 583 * proceeding.
663 mlog_errno(ret); 584 */
664 goto out; 585 ret = ocfs2_data_lock(inode, 0);
586 if (ret < 0) {
587 mlog_errno(ret);
588 goto out;
589 }
590 ocfs2_data_unlock(inode, 0);
665 } 591 }
666 ocfs2_data_unlock(inode, 0);
667 592
668 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 593 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
669 inode->i_sb->s_bdev, iov, offset, 594 inode->i_sb->s_bdev, iov, offset,
@@ -675,11 +600,715 @@ out:
675 return ret; 600 return ret;
676} 601}
677 602
603static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
604 u32 cpos,
605 unsigned int *start,
606 unsigned int *end)
607{
608 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
609
610 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
611 unsigned int cpp;
612
613 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
614
615 cluster_start = cpos % cpp;
616 cluster_start = cluster_start << osb->s_clustersize_bits;
617
618 cluster_end = cluster_start + osb->s_clustersize;
619 }
620
621 BUG_ON(cluster_start > PAGE_SIZE);
622 BUG_ON(cluster_end > PAGE_SIZE);
623
624 if (start)
625 *start = cluster_start;
626 if (end)
627 *end = cluster_end;
628}
629
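/*
 * [Editor's worked example, not part of the patch.] The boundary math
 * above only matters when a page holds several clusters. Assuming
 * PAGE_CACHE_SHIFT = 12 (4K pages) and s_clustersize_bits = 10 (1K
 * clusters): cpp = 1 << (12 - 10) = 4 clusters per page. For cpos = 6,
 * 6 % 4 = 2, so cluster_start = 2 << 10 = 2048 and cluster_end =
 * 2048 + 1024 = 3072, i.e. cluster 6 occupies bytes [2048, 3072) of
 * its page. When clusters are at least page sized, the defaults of 0
 * and PAGE_CACHE_SIZE are kept and the whole page belongs to one
 * cluster.
 */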
630/*
631 * 'from' and 'to' are the region in the page to avoid zeroing.
632 *
633 * If pagesize > clustersize, this function will avoid zeroing outside
634 * of the cluster boundary.
635 *
636 * from == to == 0 is code for "zero the entire cluster region"
637 */
638static void ocfs2_clear_page_regions(struct page *page,
639 struct ocfs2_super *osb, u32 cpos,
640 unsigned from, unsigned to)
641{
642 void *kaddr;
643 unsigned int cluster_start, cluster_end;
644
645 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
646
647 kaddr = kmap_atomic(page, KM_USER0);
648
649 if (from || to) {
650 if (from > cluster_start)
651 memset(kaddr + cluster_start, 0, from - cluster_start);
652 if (to < cluster_end)
653 memset(kaddr + to, 0, cluster_end - to);
654 } else {
655 memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
656 }
657
658 kunmap_atomic(kaddr, KM_USER0);
659}
660
661/*
662 * Some of this is taken from block_prepare_write(). We already have our
663 * mapping by now though, and the entire write will be allocating or
664 * it won't, so not much need to use BH_New.
665 *
666 * This will also skip zeroing, which is handled externally.
667 */
668int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
669 struct inode *inode, unsigned int from,
670 unsigned int to, int new)
671{
672 int ret = 0;
673 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
674 unsigned int block_end, block_start;
675 unsigned int bsize = 1 << inode->i_blkbits;
676
677 if (!page_has_buffers(page))
678 create_empty_buffers(page, bsize, 0);
679
680 head = page_buffers(page);
681 for (bh = head, block_start = 0; bh != head || !block_start;
682 bh = bh->b_this_page, block_start += bsize) {
683 block_end = block_start + bsize;
684
685 /*
686 * Ignore blocks outside of our i/o range -
687 * they may belong to unallocated clusters.
688 */
689 if (block_start >= to || block_end <= from) {
690 if (PageUptodate(page))
691 set_buffer_uptodate(bh);
692 continue;
693 }
694
695 /*
696 * For an allocating write with cluster size >= page
697 * size, we always write the entire page.
698 */
699
700 if (buffer_new(bh))
701 clear_buffer_new(bh);
702
703 if (!buffer_mapped(bh)) {
704 map_bh(bh, inode->i_sb, *p_blkno);
705 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
706 }
707
708 if (PageUptodate(page)) {
709 if (!buffer_uptodate(bh))
710 set_buffer_uptodate(bh);
711 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
712 (block_start < from || block_end > to)) {
713 ll_rw_block(READ, 1, &bh);
714 *wait_bh++=bh;
715 }
716
717 *p_blkno = *p_blkno + 1;
718 }
719
720 /*
721 * If we issued read requests - let them complete.
722 */
723 while(wait_bh > wait) {
724 wait_on_buffer(*--wait_bh);
725 if (!buffer_uptodate(*wait_bh))
726 ret = -EIO;
727 }
728
729 if (ret == 0 || !new)
730 return ret;
731
732 /*
733 * If we get -EIO above, zero out any newly allocated blocks
734 * to avoid exposing stale data.
735 */
736 bh = head;
737 block_start = 0;
738 do {
739 void *kaddr;
740
741 block_end = block_start + bsize;
742 if (block_end <= from)
743 goto next_bh;
744 if (block_start >= to)
745 break;
746
747 kaddr = kmap_atomic(page, KM_USER0);
748 memset(kaddr+block_start, 0, bh->b_size);
749 flush_dcache_page(page);
750 kunmap_atomic(kaddr, KM_USER0);
751 set_buffer_uptodate(bh);
752 mark_buffer_dirty(bh);
753
754next_bh:
755 block_start = block_end;
756 bh = bh->b_this_page;
757 } while (bh != head);
758
759 return ret;
760}
761
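/*
 * [Editor's note, not part of the patch.] One physical block cursor is
 * threaded through consecutive calls: *p_blkno advances past each block
 * the call maps, which works because the blocks of a cluster are
 * physically contiguous on disk. Usage as in ocfs2_write() further
 * below (names as in this patch):
 *
 *	for (i = 0; i < numpages; i++)
 *		ret = ocfs2_write_data_page(inode, handle, &p_blkno,
 *					    cpages[i], wc, new);
 */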
762/*
763 * This will copy user data from the buffer page in the splice
764 * context.
765 *
766 * For now, we ignore SPLICE_F_MOVE as that would require some extra
767 * communication out all the way to ocfs2_write().
768 */
769int ocfs2_map_and_write_splice_data(struct inode *inode,
770 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
771 unsigned int *ret_from, unsigned int *ret_to)
772{
773 int ret;
774 unsigned int to, from, cluster_start, cluster_end;
775 char *src, *dst;
776 struct ocfs2_splice_write_priv *sp = wc->w_private;
777 struct pipe_buffer *buf = sp->s_buf;
778 unsigned long bytes, src_from;
779 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
780
781 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
782 &cluster_end);
783
784 from = sp->s_offset;
785 src_from = sp->s_buf_offset;
786 bytes = wc->w_count;
787
788 if (wc->w_large_pages) {
789 /*
790 * For cluster size < page size, we have to
791 * calculate pos within the cluster and obey
792 * the rightmost boundary.
793 */
794 bytes = min(bytes, (unsigned long)(osb->s_clustersize
795 - (wc->w_pos & (osb->s_clustersize - 1))));
796 }
797 to = from + bytes;
798
799 if (wc->w_this_page_new)
800 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
801 cluster_start, cluster_end, 1);
802 else
803 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
804 from, to, 0);
805 if (ret) {
806 mlog_errno(ret);
807 goto out;
808 }
809
810 BUG_ON(from > PAGE_CACHE_SIZE);
811 BUG_ON(to > PAGE_CACHE_SIZE);
812 BUG_ON(from > osb->s_clustersize);
813 BUG_ON(to > osb->s_clustersize);
814
815 src = buf->ops->map(sp->s_pipe, buf, 1);
816 dst = kmap_atomic(wc->w_this_page, KM_USER1);
817 memcpy(dst + from, src + src_from, bytes);
818 kunmap_atomic(wc->w_this_page, KM_USER1);
819 buf->ops->unmap(sp->s_pipe, buf, src);
820
821 wc->w_finished_copy = 1;
822
823 *ret_from = from;
824 *ret_to = to;
825out:
826
827 return bytes ? (unsigned int)bytes : ret;
828}
829
830/*
831 * This will copy user data from the iovec in the buffered write
832 * context.
833 */
834int ocfs2_map_and_write_user_data(struct inode *inode,
835 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
836 unsigned int *ret_from, unsigned int *ret_to)
837{
838 int ret;
839 unsigned int to, from, cluster_start, cluster_end;
840 unsigned long bytes, src_from;
841 char *dst;
842 struct ocfs2_buffered_write_priv *bp = wc->w_private;
843 const struct iovec *cur_iov = bp->b_cur_iov;
844 char __user *buf;
845 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
846
847 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
848 &cluster_end);
849
850 buf = cur_iov->iov_base + bp->b_cur_off;
851 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
852
853 from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
854
855 /*
856 * This is a lot of comparisons, but it reads quite
857 * easily, which is important here.
858 */
859 /* Stay within the src page */
860 bytes = PAGE_SIZE - src_from;
861 /* Stay within the vector */
862 bytes = min(bytes,
863 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
864 /* Stay within count */
865 bytes = min(bytes, (unsigned long)wc->w_count);
866 /*
867 * For clustersize > page size, just stay within
868 * target page, otherwise we have to calculate pos
869 * within the cluster and obey the rightmost
870 * boundary.
871 */
872 if (wc->w_large_pages) {
873 /*
874 * For cluster size < page size, we have to
875 * calculate pos within the cluster and obey
876 * the rightmost boundary.
877 */
878 bytes = min(bytes, (unsigned long)(osb->s_clustersize
879 - (wc->w_pos & (osb->s_clustersize - 1))));
880 } else {
881 /*
882 * cluster size > page size is the most common
883 * case - we just stay within the target page
884 * boundary.
885 */
886 bytes = min(bytes, PAGE_CACHE_SIZE - from);
887 }
888
889 to = from + bytes;
890
891 if (wc->w_this_page_new)
892 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
893 cluster_start, cluster_end, 1);
894 else
895 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
896 from, to, 0);
897 if (ret) {
898 mlog_errno(ret);
899 goto out;
900 }
901
902 BUG_ON(from > PAGE_CACHE_SIZE);
903 BUG_ON(to > PAGE_CACHE_SIZE);
904 BUG_ON(from > osb->s_clustersize);
905 BUG_ON(to > osb->s_clustersize);
906
907 dst = kmap(wc->w_this_page);
908 memcpy(dst + from, bp->b_src_buf + src_from, bytes);
909 kunmap(wc->w_this_page);
910
911 /*
912 * XXX: This is slow, but simple. The caller of
913 * ocfs2_buffered_write_cluster() is responsible for
914 * passing through the iovecs, so it's difficult to
915 * predict what our next step is in here after our
916 * initial write. A future version should be pushing
917 * that iovec manipulation further down.
918 *
919 * By setting this, we indicate that a copy from user
920 * data was done, and subsequent calls for this
921 * cluster will skip copying more data.
922 */
923 wc->w_finished_copy = 1;
924
925 *ret_from = from;
926 *ret_to = to;
927out:
928
929 return bytes ? (unsigned int)bytes : ret;
930}
931
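/*
 * [Editor's worked example, not part of the patch; all values
 * illustrative.] With PAGE_SIZE = 4096, src_from = 3000,
 * iov_len - b_cur_off = 5000, w_count = 10000, and (for a
 * w_large_pages file system) s_clustersize = 1024, w_pos = 512:
 *
 *	bytes = 4096 - 3000	 = 1096	(stay within the src page)
 *	bytes = min(1096, 5000)	 = 1096	(stay within the vector)
 *	bytes = min(1096, 10000) = 1096	(stay within count)
 *	bytes = min(1096, 512)	 =  512	(stay within the cluster:
 *					 1024 - (512 & 1023) = 512)
 *
 * so this pass copies 512 bytes; the caller advances and repeats.
 */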
932/*
933 * Map, fill and write a page to disk.
934 *
935 * The work of copying data is done via callback. Newly allocated
936 * pages which don't take user data will be zero'd (set 'new' to
937 * indicate an allocating write)
938 *
939 * Returns a negative error code or the number of bytes copied into
940 * the page.
941 */
942int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
943 u64 *p_blkno, struct page *page,
944 struct ocfs2_write_ctxt *wc, int new)
945{
946 int ret, copied = 0;
947 unsigned int from = 0, to = 0;
948 unsigned int cluster_start, cluster_end;
949 unsigned int zero_from = 0, zero_to = 0;
950
951 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
952 &cluster_start, &cluster_end);
953
954 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
955 && !wc->w_finished_copy) {
956
957 wc->w_this_page = page;
958 wc->w_this_page_new = new;
959 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
960 if (ret < 0) {
961 mlog_errno(ret);
962 goto out;
963 }
964
965 copied = ret;
966
967 zero_from = from;
968 zero_to = to;
969 if (new) {
970 from = cluster_start;
971 to = cluster_end;
972 }
973 } else {
974 /*
975 * If we haven't allocated the new page yet, we
976 * shouldn't be writing it out without copying user
977 * data. This is likely a math error from the caller.
978 */
979 BUG_ON(!new);
980
981 from = cluster_start;
982 to = cluster_end;
983
984 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
985 cluster_start, cluster_end, 1);
986 if (ret) {
987 mlog_errno(ret);
988 goto out;
989 }
990 }
991
992 /*
993 * Parts of newly allocated pages need to be zero'd.
994 *
995 * Above, we have also rewritten 'to' and 'from' - as far as
996 * the rest of the function is concerned, the entire cluster
997 * range inside of a page needs to be written.
998 *
999 * We can skip this if the page is up to date - it's already
1000 * been zero'd from being read in as a hole.
1001 */
1002 if (new && !PageUptodate(page))
1003 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1004 wc->w_cpos, zero_from, zero_to);
1005
1006 flush_dcache_page(page);
1007
1008 if (ocfs2_should_order_data(inode)) {
1009 ret = walk_page_buffers(handle,
1010 page_buffers(page),
1011 from, to, NULL,
1012 ocfs2_journal_dirty_data);
1013 if (ret < 0)
1014 mlog_errno(ret);
1015 }
1016
1017 /*
1018 * We don't use generic_commit_write() because we need to
1019 * handle our own i_size update.
1020 */
1021 ret = block_commit_write(page, from, to);
1022 if (ret)
1023 mlog_errno(ret);
1024out:
1025
1026 return copied ? copied : ret;
1027}
1028
1029/*
1030 * Do the actual write of some data into an inode. Optionally allocate
1031 * in order to fulfill the write.
1032 *
1033 * cpos is the logical cluster offset within the file to write at
1034 *
1035 * 'phys' is the physical mapping of that offset. A 'phys' value of
1036 * zero indicates that allocation is required. In this case, data_ac
1037 * and meta_ac should be valid (meta_ac can be null if metadata
1038 * allocation isn't required).
1039 */
1040static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1041 struct buffer_head *di_bh,
1042 struct ocfs2_alloc_context *data_ac,
1043 struct ocfs2_alloc_context *meta_ac,
1044 struct ocfs2_write_ctxt *wc)
1045{
1046 int ret, i, numpages = 1, new;
1047 unsigned int copied = 0;
1048 u32 tmp_pos;
1049 u64 v_blkno, p_blkno;
1050 struct address_space *mapping = file->f_mapping;
1051 struct inode *inode = mapping->host;
1052 unsigned long index, start;
1053 struct page **cpages;
1054
1055 new = phys == 0 ? 1 : 0;
1056
1057 /*
1058 * Figure out how many pages we'll be manipulating here. For
1059 * a non-allocating write, we just change the one
1060 * page. Otherwise, we'll need a whole cluster's worth.
1061 */
1062 if (new)
1063 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1064
1065 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1066 if (!cpages) {
1067 ret = -ENOMEM;
1068 mlog_errno(ret);
1069 return ret;
1070 }
1071
1072 /*
1073 * Fill our page array first. That way we've grabbed enough so
1074 * that we can zero and flush if we error after adding the
1075 * extent.
1076 */
1077 if (new) {
1078 start = ocfs2_align_clusters_to_page_index(inode->i_sb,
1079 wc->w_cpos);
1080 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1081 } else {
1082 start = wc->w_pos >> PAGE_CACHE_SHIFT;
1083 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
1084 }
1085
1086 for(i = 0; i < numpages; i++) {
1087 index = start + i;
1088
1089 cpages[i] = grab_cache_page(mapping, index);
1090 if (!cpages[i]) {
1091 ret = -ENOMEM;
1092 mlog_errno(ret);
1093 goto out;
1094 }
1095 }
1096
1097 if (new) {
1098 /*
1099 * This is safe to call with the page locks - it won't take
1100 * any additional semaphores or cluster locks.
1101 */
1102 tmp_pos = wc->w_cpos;
1103 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1104 &tmp_pos, 1, di_bh, handle,
1105 data_ac, meta_ac, NULL);
1106 /*
1107 * This shouldn't happen because we must have already
1108 * calculated the correct meta data allocation required. The
1109 * internal tree allocation code should know how to increase
1110 * transaction credits itself.
1111 *
1112 * If need be, we could handle -EAGAIN for a
1113 * RESTART_TRANS here.
1114 */
1115 mlog_bug_on_msg(ret == -EAGAIN,
1116 "Inode %llu: EAGAIN return during allocation.\n",
1117 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1118 if (ret < 0) {
1119 mlog_errno(ret);
1120 goto out;
1121 }
1122 }
1123
1124 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1125 NULL);
1126 if (ret < 0) {
1127
1128 /*
1129 * XXX: Should we go readonly here?
1130 */
1131
1132 mlog_errno(ret);
1133 goto out;
1134 }
1135
1136 BUG_ON(p_blkno == 0);
1137
1138 for(i = 0; i < numpages; i++) {
1139 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
1140 wc, new);
1141 if (ret < 0) {
1142 mlog_errno(ret);
1143 goto out;
1144 }
1145
1146 copied += ret;
1147 }
1148
1149out:
1150 for(i = 0; i < numpages; i++) {
1151 unlock_page(cpages[i]);
1152 mark_page_accessed(cpages[i]);
1153 page_cache_release(cpages[i]);
1154 }
1155 kfree(cpages);
1156
1157 return copied ? copied : ret;
1158}
1159
1160static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
1161 struct ocfs2_super *osb, loff_t pos,
1162 size_t count, ocfs2_page_writer *cb,
1163 void *cb_priv)
1164{
1165 wc->w_count = count;
1166 wc->w_pos = pos;
1167 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1168 wc->w_finished_copy = 0;
1169
1170 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1171 wc->w_large_pages = 1;
1172 else
1173 wc->w_large_pages = 0;
1174
1175 wc->w_write_data_page = cb;
1176 wc->w_private = cb_priv;
1177}
1178
1179/*
1180 * Write a cluster to an inode. The cluster may not be allocated yet,
1181 * in which case it will be. This only exists for buffered writes -
1182 * O_DIRECT takes a more "traditional" path through the kernel.
1183 *
1184 * The caller is responsible for incrementing pos, written counts, etc
1185 *
1186 * For file systems that don't support sparse files, pre-allocation
1187 * and page zeroing up until cpos should be done prior to this
1188 * function call.
1189 *
1190 * Callers should be holding i_sem, and the rw cluster lock.
1191 *
1192 * Returns the number of user bytes written, or less than zero for
1193 * error.
1194 */
1195ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1196 size_t count, ocfs2_page_writer *actor,
1197 void *priv)
1198{
1199 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1200 ssize_t written = 0;
1201 u32 phys;
1202 struct inode *inode = file->f_mapping->host;
1203 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1204 struct buffer_head *di_bh = NULL;
1205 struct ocfs2_dinode *di;
1206 struct ocfs2_alloc_context *data_ac = NULL;
1207 struct ocfs2_alloc_context *meta_ac = NULL;
1208 handle_t *handle;
1209 struct ocfs2_write_ctxt wc;
1210
1211 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1212
1213 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1214 if (ret) {
1215 mlog_errno(ret);
1216 goto out;
1217 }
1218 di = (struct ocfs2_dinode *)di_bh->b_data;
1219
1220 /*
1221 * Take alloc sem here to prevent concurrent lookups. That way
1222 * the mapping, zeroing and tree manipulation within
1223 * ocfs2_write() will be safe against ->readpage(). This
1224 * should also serve to lock out allocation from a shared
1225 * writeable region.
1226 */
1227 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1228
1229 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
1230 if (ret) {
1231 mlog_errno(ret);
1232 goto out_meta;
1233 }
1234
1235 /* phys == 0 means that allocation is required. */
1236 if (phys == 0) {
1237 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
1238 if (ret) {
1239 mlog_errno(ret);
1240 goto out_meta;
1241 }
1242
1243 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
1244 }
1245
1246 ret = ocfs2_data_lock(inode, 1);
1247 if (ret) {
1248 mlog_errno(ret);
1249 goto out_meta;
1250 }
1251
1252 handle = ocfs2_start_trans(osb, credits);
1253 if (IS_ERR(handle)) {
1254 ret = PTR_ERR(handle);
1255 mlog_errno(ret);
1256 goto out_data;
1257 }
1258
1259 written = ocfs2_write(file, phys, handle, di_bh, data_ac,
1260 meta_ac, &wc);
1261 if (written < 0) {
1262 ret = written;
1263 mlog_errno(ret);
1264 goto out_commit;
1265 }
1266
1267 ret = ocfs2_journal_access(handle, inode, di_bh,
1268 OCFS2_JOURNAL_ACCESS_WRITE);
1269 if (ret) {
1270 mlog_errno(ret);
1271 goto out_commit;
1272 }
1273
1274 pos += written;
1275 if (pos > inode->i_size) {
1276 i_size_write(inode, pos);
1277 mark_inode_dirty(inode);
1278 }
1279 inode->i_blocks = ocfs2_inode_sector_count(inode);
1280 di->i_size = cpu_to_le64((u64)i_size_read(inode));
1281 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1282 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1283 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1284
1285 ret = ocfs2_journal_dirty(handle, di_bh);
1286 if (ret)
1287 mlog_errno(ret);
1288
1289out_commit:
1290 ocfs2_commit_trans(osb, handle);
1291
1292out_data:
1293 ocfs2_data_unlock(inode, 1);
1294
1295out_meta:
1296 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1297 ocfs2_meta_unlock(inode, 1);
1298
1299out:
1300 brelse(di_bh);
1301 if (data_ac)
1302 ocfs2_free_alloc_context(data_ac);
1303 if (meta_ac)
1304 ocfs2_free_alloc_context(meta_ac);
1305
1306 return written ? written : ret;
1307}
1308
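/*
 * [Editor's summary of the locking established above, not part of the
 * patch. The caller already holds i_mutex and the rw cluster lock.]
 *
 *	ocfs2_meta_lock(inode, &di_bh, 1)	    EX meta lock
 *	  down_write(&OCFS2_I(inode)->ip_alloc_sem) fence ->readpage()
 *	    ocfs2_data_lock(inode, 1)		    EX data lock
 *	      handle = ocfs2_start_trans(osb, credits)
 *	      ... ocfs2_write(), i_size/dinode update ...
 *	      ocfs2_commit_trans(osb, handle)
 *	    ocfs2_data_unlock(inode, 1)
 *	  up_write(&OCFS2_I(inode)->ip_alloc_sem)
 *	ocfs2_meta_unlock(inode, 1)
 */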
678const struct address_space_operations ocfs2_aops = { 1309const struct address_space_operations ocfs2_aops = {
679 .readpage = ocfs2_readpage, 1310 .readpage = ocfs2_readpage,
680 .writepage = ocfs2_writepage, 1311 .writepage = ocfs2_writepage,
681 .prepare_write = ocfs2_prepare_write,
682 .commit_write = ocfs2_commit_write,
683 .bmap = ocfs2_bmap, 1312 .bmap = ocfs2_bmap,
684 .sync_page = block_sync_page, 1313 .sync_page = block_sync_page,
685 .direct_IO = ocfs2_direct_IO, 1314 .direct_IO = ocfs2_direct_IO,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f446a15eab88..45821d479b5a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
30 unsigned from, 30 unsigned from,
31 unsigned to); 31 unsigned to);
32 32
33int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
34 struct inode *inode, unsigned int from,
35 unsigned int to, int new);
36
37int walk_page_buffers( handle_t *handle,
38 struct buffer_head *head,
39 unsigned from,
40 unsigned to,
41 int *partial,
42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh));
44
45struct ocfs2_write_ctxt;
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
47 u64 *, unsigned int *, unsigned int *);
48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
50 size_t count, ocfs2_page_writer *actor,
51 void *priv);
52
53struct ocfs2_write_ctxt {
54 size_t w_count;
55 loff_t w_pos;
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58
59 /* This is true if page_size > cluster_size */
60 unsigned int w_large_pages;
61
62 /* Filler callback and private data */
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96
33/* all ocfs2_dio_end_io()'s fault */ 97/* all ocfs2_dio_end_io()'s fault */
34#define ocfs2_iocb_is_rw_locked(iocb) \ 98#define ocfs2_iocb_is_rw_locked(iocb) \
35 test_bit(0, (unsigned long *)&iocb->private) 99 test_bit(0, (unsigned long *)&iocb->private)
36#define ocfs2_iocb_set_rw_locked(iocb) \ 100static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
37 set_bit(0, (unsigned long *)&iocb->private) 101{
102 set_bit(0, (unsigned long *)&iocb->private);
103 if (level)
104 set_bit(1, (unsigned long *)&iocb->private);
105 else
106 clear_bit(1, (unsigned long *)&iocb->private);
107}
38#define ocfs2_iocb_clear_rw_locked(iocb) \ 108#define ocfs2_iocb_clear_rw_locked(iocb) \
39 clear_bit(0, (unsigned long *)&iocb->private) 109 clear_bit(0, (unsigned long *)&iocb->private)
40 110#define ocfs2_iocb_rw_locked_level(iocb) \
111 test_bit(1, (unsigned long *)&iocb->private)
41#endif /* OCFS2_FILE_H */ 112#endif /* OCFS2_FILE_H */
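
[Editor's note: a sketch of the bit encoding above, not part of the
patch. iocb->private now carries two bits: bit 0 records that the rw
cluster lock is held, bit 1 records the level it was taken at, so that
ocfs2_dio_end_io() can tell whether i_alloc_sem must be released.]

	/* Submission side: take the rw lock, remember the level. */
	ocfs2_iocb_set_rw_locked(iocb, level);	/* bit 0 set, level in bit 1 */

	/* Completion side, as in ocfs2_dio_end_io(): */
	ocfs2_iocb_clear_rw_locked(iocb);
	level = ocfs2_iocb_rw_locked_level(iocb);
	if (!level)				/* read-level: drop i_alloc_sem */
		up_read(&inode->i_alloc_sem);
	ocfs2_rw_unlock(inode, level);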
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 4705d659fe57..bbacf7da48a4 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -46,6 +46,7 @@
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/reboot.h>
49 50
50#include "heartbeat.h" 51#include "heartbeat.h"
51#include "nodemanager.h" 52#include "nodemanager.h"
@@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
72 /* panic spins with interrupts enabled. with preempt 73 /* panic spins with interrupts enabled. with preempt
73 * threads can still schedule, etc, etc */ 74 * threads can still schedule, etc, etc */
74 o2hb_stop_all_regions(); 75 o2hb_stop_all_regions();
75 panic("ocfs2 is very sorry to be fencing this system by panicing\n"); 76
77 printk("ocfs2 is very sorry to be fencing this system by restarting\n");
78 emergency_restart();
76} 79}
77 80
78/* Indicate that a timeout occurred on a heartbeat region write. The 81
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df5e467..9606111fe89d 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 8:
42 * - Replace delete inode votes with a cluster lock
43 *
41 * New in version 7: 44 * New in version 7:
42 * - DLM join domain includes the live nodemap 45 * - DLM join domain includes the live nodemap
43 * 46 *
@@ -57,7 +60,7 @@
57 * - full 64 bit i_size in the metadata lock lvbs 60 * - full 64 bit i_size in the metadata lock lvbs
58 * - introduction of "rw" lock and pushing meta/data locking down 61 * - introduction of "rw" lock and pushing meta/data locking down
59 */ 62 */
60#define O2NET_PROTOCOL_VERSION 7ULL 63#define O2NET_PROTOCOL_VERSION 8ULL
61struct o2net_handshake { 64struct o2net_handshake {
62 __be64 protocol_version; 65 __be64 protocol_version;
63 __be64 connector_id; 66 __be64 connector_id;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 66821e178167..67e6866a2a4f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
358{ 358{
359 int status; 359 int status;
360 int extend; 360 int extend;
361 u64 p_blkno; 361 u64 p_blkno, v_blkno;
362 362
363 spin_lock(&OCFS2_I(dir)->ip_lock); 363 spin_lock(&OCFS2_I(dir)->ip_lock);
364 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); 364 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
365 spin_unlock(&OCFS2_I(dir)->ip_lock); 365 spin_unlock(&OCFS2_I(dir)->ip_lock);
366 366
367 if (extend) { 367 if (extend) {
368 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, 368 u32 offset = OCFS2_I(dir)->ip_clusters;
369 parent_fe_bh, handle, 369
370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
371 1, parent_fe_bh, handle,
370 data_ac, meta_ac, NULL); 372 data_ac, meta_ac, NULL);
371 BUG_ON(status == -EAGAIN); 373 BUG_ON(status == -EAGAIN);
372 if (status < 0) { 374 if (status < 0) {
@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
375 } 377 }
376 } 378 }
377 379
378 status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> 380 v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
379 (sb->s_blocksize_bits - 9)), 381 status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
380 1, &p_blkno, NULL);
381 if (status < 0) { 382 if (status < 0) {
382 mlog_errno(status); 383 mlog_errno(status);
383 goto bail; 384 goto bail;
@@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
486 487
487 dir_i_size += dir->i_sb->s_blocksize; 488 dir_i_size += dir->i_sb->s_blocksize;
488 i_size_write(dir, dir_i_size); 489 i_size_write(dir, dir_i_size);
489 dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); 490 dir->i_blocks = ocfs2_inode_sector_count(dir);
490 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 491 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
491 if (status < 0) { 492 if (status < 0) {
492 mlog_errno(status); 493 mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c558442a0b44..d836b98dd99a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -430,11 +430,10 @@ redo_bucket:
430 430
431 dlm_lockres_put(res); 431 dlm_lockres_put(res);
432 432
433 cond_resched_lock(&dlm->spinlock);
434
435 if (dropped) 433 if (dropped)
436 goto redo_bucket; 434 goto redo_bucket;
437 } 435 }
436 cond_resched_lock(&dlm->spinlock);
438 num += n; 437 num += n;
439 mlog(0, "%s: touched %d lockreses in bucket %d " 438 mlog(0, "%s: touched %d lockreses in bucket %d "
440 "(tot=%d)\n", dlm->name, n, i, num); 439 "(tot=%d)\n", dlm->name, n, i, num);
@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1035{ 1034{
1036 int status = 0, tmpstat, node; 1035 int status = 0, tmpstat, node;
1037 struct domain_join_ctxt *ctxt; 1036 struct domain_join_ctxt *ctxt;
1038 enum dlm_query_join_response response; 1037 enum dlm_query_join_response response = JOIN_DISALLOW;
1039 1038
1040 mlog_entry("%p", dlm); 1039 mlog_entry("%p", dlm);
1041 1040
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6d4a83d50152..c1807a42c49f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
611 } 611 }
612 } while (status != 0); 612 } while (status != 0);
613 613
614 spin_lock(&dlm_reco_state_lock);
614 switch (ndata->state) { 615 switch (ndata->state) {
615 case DLM_RECO_NODE_DATA_INIT: 616 case DLM_RECO_NODE_DATA_INIT:
616 case DLM_RECO_NODE_DATA_FINALIZE_SENT: 617 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
641 ndata->node_num, dead_node); 642 ndata->node_num, dead_node);
642 break; 643 break;
643 } 644 }
645 spin_unlock(&dlm_reco_state_lock);
644 } 646 }
645 647
646 mlog(0, "done requesting all lock info\n"); 648 mlog(0, "done requesting all lock info\n");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e335541727f9..27e43b0c0eae 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
225 .flags = 0, 225 .flags = 0,
226}; 226};
227 227
228static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
229 .get_osb = ocfs2_get_inode_osb,
230 .flags = 0,
231};
232
228static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 233static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
229{ 234{
230 return lockres->l_type == OCFS2_LOCK_TYPE_META || 235 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
231 lockres->l_type == OCFS2_LOCK_TYPE_DATA || 236 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
232 lockres->l_type == OCFS2_LOCK_TYPE_RW; 237 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
238 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
233} 239}
234 240
235static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 241static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
373 case OCFS2_LOCK_TYPE_DATA: 379 case OCFS2_LOCK_TYPE_DATA:
374 ops = &ocfs2_inode_data_lops; 380 ops = &ocfs2_inode_data_lops;
375 break; 381 break;
382 case OCFS2_LOCK_TYPE_OPEN:
383 ops = &ocfs2_inode_open_lops;
384 break;
376 default: 385 default:
377 mlog_bug_on_msg(1, "type: %d\n", type); 386 mlog_bug_on_msg(1, "type: %d\n", type);
378 ops = NULL; /* thanks, gcc */ 387 ops = NULL; /* thanks, gcc */
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1129 goto bail; 1138 goto bail;
1130 } 1139 }
1131 1140
1141 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
1142 if (ret) {
1143 mlog_errno(ret);
1144 goto bail;
1145 }
1146
1132bail: 1147bail:
1133 mlog_exit(ret); 1148 mlog_exit(ret);
1134 return ret; 1149 return ret;
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
1182 mlog_exit_void(); 1197 mlog_exit_void();
1183} 1198}
1184 1199
1200/*
1201 * ocfs2_open_lock always gets a PR mode lock.
1202 */
1203int ocfs2_open_lock(struct inode *inode)
1204{
1205 int status = 0;
1206 struct ocfs2_lock_res *lockres;
1207 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1208
1209 BUG_ON(!inode);
1210
1211 mlog_entry_void();
1212
1213 mlog(0, "inode %llu take PRMODE open lock\n",
1214 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1215
1216 if (ocfs2_mount_local(osb))
1217 goto out;
1218
1219 lockres = &OCFS2_I(inode)->ip_open_lockres;
1220
1221 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1222 LKM_PRMODE, 0, 0);
1223 if (status < 0)
1224 mlog_errno(status);
1225
1226out:
1227 mlog_exit(status);
1228 return status;
1229}
1230
1231int ocfs2_try_open_lock(struct inode *inode, int write)
1232{
1233 int status = 0, level;
1234 struct ocfs2_lock_res *lockres;
1235 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1236
1237 BUG_ON(!inode);
1238
1239 mlog_entry_void();
1240
1241 mlog(0, "inode %llu try to take %s open lock\n",
1242 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1243 write ? "EXMODE" : "PRMODE");
1244
1245 if (ocfs2_mount_local(osb))
1246 goto out;
1247
1248 lockres = &OCFS2_I(inode)->ip_open_lockres;
1249
1250 level = write ? LKM_EXMODE : LKM_PRMODE;
1251
1252 /*
1253 * The file system may already be holding a PRMODE/EXMODE open lock.
1254 * Since we pass LKM_NOQUEUE, the request won't block waiting on
1255 * other nodes and the -EAGAIN will indicate to the caller that
1256 * this inode is still in use.
1257 */
1258 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1259 level, LKM_NOQUEUE, 0);
1260
1261out:
1262 mlog_exit(status);
1263 return status;
1264}
1265
1266/*
1267 * ocfs2_open_unlock unlocks PR and EX mode open locks.
1268 */
1269void ocfs2_open_unlock(struct inode *inode)
1270{
1271 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
1272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1273
1274 mlog_entry_void();
1275
1276 mlog(0, "inode %llu drop open lock\n",
1277 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1278
1279 if (ocfs2_mount_local(osb))
1280 goto out;
1281
1282 if (lockres->l_ro_holders)
1283 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1284 LKM_PRMODE);
1285 if (lockres->l_ex_holders)
1286 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1287 LKM_EXMODE);
1288
1289out:
1290 mlog_exit_void();
1291}
1292
1185int ocfs2_data_lock_full(struct inode *inode, 1293int ocfs2_data_lock_full(struct inode *inode,
1186 int write, 1294 int write,
1187 int arg_flags) 1295 int arg_flags)
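The LKM_NOQUEUE comment above is the core of the new open-lock protocol: a node that wants to know whether an inode is still open anywhere in the cluster simply trylocks the open lock and treats -EAGAIN as "in use". A hypothetical caller sketch (inode_still_in_use() is illustrative, not an ocfs2 function; error handling trimmed):

    /*
     * Returns 1 if some node still holds the inode open, 0 if the
     * EX trylock succeeded and the inode can be wiped safely.
     */
    static int inode_still_in_use(struct inode *inode)
    {
            int status = ocfs2_try_open_lock(inode, 1);

            if (status == -EAGAIN)
                    return 1;       /* open lock held elsewhere */
            if (status < 0)
                    mlog_errno(status);
            return 0;
    }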
@@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1387 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) 1495 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1388 inode->i_blocks = 0; 1496 inode->i_blocks = 0;
1389 else 1497 else
1390 inode->i_blocks = 1498 inode->i_blocks = ocfs2_inode_sector_count(inode);
1391 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1392 1499
1393 inode->i_uid = be32_to_cpu(lvb->lvb_iuid); 1500 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1394 inode->i_gid = be32_to_cpu(lvb->lvb_igid); 1501 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
@@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1479{ 1586{
1480 int status = 0; 1587 int status = 0;
1481 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1588 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1482 struct ocfs2_lock_res *lockres = NULL; 1589 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1483 struct ocfs2_dinode *fe; 1590 struct ocfs2_dinode *fe;
1484 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1591 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1485 1592
1486 mlog_entry_void(); 1593 mlog_entry_void();
1487 1594
1595 if (ocfs2_mount_local(osb))
1596 goto bail;
1597
1488 spin_lock(&oi->ip_lock); 1598 spin_lock(&oi->ip_lock);
1489 if (oi->ip_flags & OCFS2_INODE_DELETED) { 1599 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1490 mlog(0, "Orphaned inode %llu was deleted while we " 1600 mlog(0, "Orphaned inode %llu was deleted while we "
@@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1496 } 1606 }
1497 spin_unlock(&oi->ip_lock); 1607 spin_unlock(&oi->ip_lock);
1498 1608
1499 if (!ocfs2_mount_local(osb)) { 1609 if (!ocfs2_should_refresh_lock_res(lockres))
1500 lockres = &oi->ip_meta_lockres; 1610 goto bail;
1501
1502 if (!ocfs2_should_refresh_lock_res(lockres))
1503 goto bail;
1504 }
1505 1611
1506 /* This will discard any caching information we might have had 1612 /* This will discard any caching information we might have had
1507 * for the inode metadata. */ 1613 * for the inode metadata. */
1508 ocfs2_metadata_cache_purge(inode); 1614 ocfs2_metadata_cache_purge(inode);
1509 1615
1510 /* will do nothing for inode types that don't use the extent
1511 * map (directories, bitmap files, etc) */
1512 ocfs2_extent_map_trunc(inode, 0); 1616 ocfs2_extent_map_trunc(inode, 0);
1513 1617
1514 if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { 1618 if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
1515 mlog(0, "Trusting LVB on inode %llu\n", 1619 mlog(0, "Trusting LVB on inode %llu\n",
1516 (unsigned long long)oi->ip_blkno); 1620 (unsigned long long)oi->ip_blkno);
1517 ocfs2_refresh_inode_from_lvb(inode); 1621 ocfs2_refresh_inode_from_lvb(inode);
@@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1558 1662
1559 status = 0; 1663 status = 0;
1560bail_refresh: 1664bail_refresh:
1561 if (lockres) 1665 ocfs2_complete_lock_res_refresh(lockres, status);
1562 ocfs2_complete_lock_res_refresh(lockres, status);
1563bail: 1666bail:
1564 mlog_exit(status); 1667 mlog_exit(status);
1565 return status; 1668 return status;
@@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
1630 wait_event(osb->recovery_event, 1733 wait_event(osb->recovery_event,
1631 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1734 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1632 1735
1633 acquired = 0;
1634 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1736 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1635 level = ex ? LKM_EXMODE : LKM_PRMODE; 1737 level = ex ? LKM_EXMODE : LKM_PRMODE;
1636 dlm_flags = 0; 1738 dlm_flags = 0;
@@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2458 * ocfs2_clear_inode has done it for us. */ 2560 * ocfs2_clear_inode has done it for us. */
2459 2561
2460 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2562 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2461 &OCFS2_I(inode)->ip_data_lockres); 2563 &OCFS2_I(inode)->ip_open_lockres);
2462 if (err < 0) 2564 if (err < 0)
2463 mlog_errno(err); 2565 mlog_errno(err);
2464 2566
2465 status = err; 2567 status = err;
2466 2568
2467 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2569 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2570 &OCFS2_I(inode)->ip_data_lockres);
2571 if (err < 0)
2572 mlog_errno(err);
2573 if (err < 0 && !status)
2574 status = err;
2575
2576 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2468 &OCFS2_I(inode)->ip_meta_lockres); 2577 &OCFS2_I(inode)->ip_meta_lockres);
2469 if (err < 0) 2578 if (err < 0)
2470 mlog_errno(err); 2579 mlog_errno(err);
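ocfs2_drop_inode_locks() now tears down three lock resources (open, data, meta) and must report the first failure while still attempting every remaining drop. A self-contained model of that aggregation pattern (plain C, slightly simplified from the kernel code):

    #include <stdio.h>

    /* Record the first failure but keep going: later errors are
     * reported yet must not overwrite the status returned to the
     * caller. */
    static int drop_all(const int errs[], int n)
    {
            int i, err, status = 0;

            for (i = 0; i < n; i++) {
                    err = errs[i];
                    if (err < 0)
                            fprintf(stderr, "drop %d failed: %d\n", i, err);
                    if (err < 0 && !status)
                            status = err;
            }
            return status;
    }

    int main(void)
    {
            int errs[] = { 0, -5, -22 };

            printf("status = %d\n", drop_all(errs, 3)); /* -5, the first error */
            return 0;
    }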
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca68cf1..59cb566e7983 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
80 int write); 80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 81int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 82void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode);
83int ocfs2_meta_lock_atime(struct inode *inode, 86int ocfs2_meta_lock_atime(struct inode *inode,
84 struct vfsmount *vfsmnt, 87 struct vfsmount *vfsmnt,
85 int *level); 88 int *level);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9f..ba2b2ab1c6e4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
3 * 3 *
4 * extent_map.c 4 * extent_map.c
5 * 5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in 6 * Block/Cluster mapping functions
7 * the library.
8 * 7 *
9 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
10 * 9 *
@@ -26,1016 +25,528 @@
26#include <linux/fs.h> 25#include <linux/fs.h>
27#include <linux/init.h> 26#include <linux/init.h>
28#include <linux/types.h> 27#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31 28
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP 29#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h> 30#include <cluster/masklog.h>
34 31
35#include "ocfs2.h" 32#include "ocfs2.h"
36 33
34#include "alloc.h"
37#include "extent_map.h" 35#include "extent_map.h"
38#include "inode.h" 36#include "inode.h"
39#include "super.h" 37#include "super.h"
40 38
41#include "buffer_head_io.h" 39#include "buffer_head_io.h"
42 40
43
44/* 41/*
45 * SUCK SUCK SUCK 42 * The extent caching implementation is intentionally trivial.
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
47 */
48
49struct ocfs2_extent_map_entry {
50 struct rb_node e_node;
51 int e_tree_depth;
52 struct ocfs2_extent_rec e_rec;
53};
54
55struct ocfs2_em_insert_context {
56 int need_left;
57 int need_right;
58 struct ocfs2_extent_map_entry *new_ent;
59 struct ocfs2_extent_map_entry *old_ent;
60 struct ocfs2_extent_map_entry *left_ent;
61 struct ocfs2_extent_map_entry *right_ent;
62};
63
64static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
65
66
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is that
89 * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
90 * clusters) is >= the argument's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 * 43 *
112 * The rb_node garbage lets insertion share the search. Trivial 44 * We only cache a small number of extents stored directly on the
113 * callers pass NULL. 45 * inode, so linear order operations are acceptable. If we ever want
46 * to increase the size of the extent map, then these algorithms must
47 * get smarter.
114 */ 48 */
115static struct ocfs2_extent_map_entry * 49
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, 50void ocfs2_extent_map_init(struct inode *inode)
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{ 51{
121 struct rb_node **p = &em->em_extents.rb_node; 52 struct ocfs2_inode_info *oi = OCFS2_I(inode);
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140 53
141 if (ret_p != NULL) 54 oi->ip_extent_map.em_num_items = 0;
142 *ret_p = p; 55 INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146} 56}
147 57
148/* 58static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
149 * Find the leaf containing the interval we want. While we're on our 59 unsigned int cpos,
150 * way down the tree, fill in every record we see at any depth, because 60 struct ocfs2_extent_map_item **ret_emi)
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{ 61{
161 int i, ret; 62 unsigned int range;
162 struct buffer_head *eb_bh = NULL; 63 struct ocfs2_extent_map_item *emi;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
173 while (el->l_tree_depth)
174 {
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 ocfs2_error(inode->i_sb,
185 "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
186 i,
187 (unsigned long long)le64_to_cpu(rec->e_blkno),
188 (unsigned long long)OCFS2_I(inode)->ip_blkno,
189 OCFS2_I(inode)->ip_clusters);
190 goto out_free;
191 }
192
193 if (rec_end <= cpos) {
194 ret = ocfs2_extent_map_insert(inode, rec,
195 le16_to_cpu(el->l_tree_depth));
196 if (ret && (ret != -EEXIST)) {
197 mlog_errno(ret);
198 goto out_free;
199 }
200 continue;
201 }
202 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
203 ret = ocfs2_extent_map_insert(inode, rec,
204 le16_to_cpu(el->l_tree_depth));
205 if (ret && (ret != -EEXIST)) {
206 mlog_errno(ret);
207 goto out_free;
208 }
209 continue;
210 }
211 64
212 /* 65 *ret_emi = NULL;
213 * We've found a record that matches our
214 * interval. We don't insert it because we're
215 * about to traverse it.
216 */
217
218 /* Check to see if we're straddling */
219 ret = -ESRCH;
220 if (!ocfs2_extent_rec_contains_clusters(rec,
221 cpos,
222 clusters)) {
223 mlog_errno(ret);
224 goto out_free;
225 }
226 66
227 /* 67 list_for_each_entry(emi, &em->em_list, ei_list) {
228 * If we've already found a record, the el has 68 range = emi->ei_cpos + emi->ei_clusters;
229 * two records covering the same interval.
230 * EEEK!
231 */
232 ret = -EBADR;
233 if (blkno) {
234 mlog_errno(ret);
235 ocfs2_error(inode->i_sb,
236 "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
237 cpos, clusters,
238 (unsigned long long)OCFS2_I(inode)->ip_blkno,
239 (unsigned long long)blkno, i,
240 (unsigned long long)le64_to_cpu(rec->e_blkno));
241 goto out_free;
242 }
243 69
244 blkno = le64_to_cpu(rec->e_blkno); 70 if (cpos >= emi->ei_cpos && cpos < range) {
245 } 71 list_move(&emi->ei_list, &em->em_list);
246 72
247 /* 73 *ret_emi = emi;
248 * We don't support holes, and we're still up 74 break;
249 * in the branches, so we'd better have found someone
250 */
251 ret = -EBADR;
252 if (!blkno) {
253 ocfs2_error(inode->i_sb,
254 "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
255 cpos, clusters,
256 (unsigned long long)OCFS2_I(inode)->ip_blkno);
257 mlog_errno(ret);
258 goto out_free;
259 }
260
261 if (eb_bh) {
262 brelse(eb_bh);
263 eb_bh = NULL;
264 }
265 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
266 blkno, &eb_bh, OCFS2_BH_CACHED,
267 inode);
268 if (ret) {
269 mlog_errno(ret);
270 goto out_free;
271 }
272 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
273 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
274 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
275 ret = -EIO;
276 goto out_free;
277 } 75 }
278 el = &eb->h_list;
279 } 76 }
77}
280 78
281 BUG_ON(el->l_tree_depth); 79static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
282 80 unsigned int *phys, unsigned int *len,
283 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 81 unsigned int *flags)
284 rec = &el->l_recs[i]; 82{
285 83 unsigned int coff;
286 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > 84 struct ocfs2_inode_info *oi = OCFS2_I(inode);
287 OCFS2_I(inode)->ip_clusters) { 85 struct ocfs2_extent_map_item *emi;
288 ret = -EBADR; 86
289 mlog_errno(ret); 87 spin_lock(&oi->ip_lock);
290 ocfs2_error(inode->i_sb, 88
291 "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", 89 __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
292 i, 90 if (emi) {
293 (unsigned long long)le64_to_cpu(rec->e_blkno), 91 coff = cpos - emi->ei_cpos;
294 (unsigned long long)OCFS2_I(inode)->ip_blkno, 92 *phys = emi->ei_phys + coff;
295 OCFS2_I(inode)->ip_clusters); 93 if (len)
296 return ret; 94 *len = emi->ei_clusters - coff;
297 } 95 if (flags)
298 96 *flags = emi->ei_flags;
299 ret = ocfs2_extent_map_insert(inode, rec,
300 le16_to_cpu(el->l_tree_depth));
301 if (ret && (ret != -EEXIST)) {
302 mlog_errno(ret);
303 goto out_free;
304 }
305 } 97 }
306 98
307 ret = 0; 99 spin_unlock(&oi->ip_lock);
308 100
309out_free: 101 if (emi == NULL)
310 if (eb_bh) 102 return -ENOENT;
311 brelse(eb_bh);
312 103
313 return ret; 104 return 0;
314} 105}
315 106
316/* 107/*
317 * This lookup actually will read from disk. It has one invariant: 108 * Forget about all clusters equal to or greater than cpos.
318 * It will never re-traverse blocks. This means that all inserts should
319 * be new regions or more granular regions (both allowed by insert).
320 */ 109 */
321static int ocfs2_extent_map_lookup_read(struct inode *inode, 110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
322 u32 cpos,
323 u32 clusters,
324 struct ocfs2_extent_map_entry **ret_ent)
325{ 111{
326 int ret; 112 struct list_head *p, *n;
327 u64 blkno; 113 struct ocfs2_extent_map_item *emi;
328 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 114 struct ocfs2_inode_info *oi = OCFS2_I(inode);
329 struct ocfs2_extent_map_entry *ent; 115 struct ocfs2_extent_map *em = &oi->ip_extent_map;
330 struct buffer_head *bh = NULL; 116 LIST_HEAD(tmp_list);
331 struct ocfs2_extent_block *eb; 117 unsigned int range;
332 struct ocfs2_dinode *di; 118
333 struct ocfs2_extent_list *el; 119 spin_lock(&oi->ip_lock);
334 120 list_for_each_safe(p, n, &em->em_list) {
335 spin_lock(&OCFS2_I(inode)->ip_lock); 121 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
336 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); 122
337 if (ent) { 123 if (emi->ei_cpos >= cpos) {
338 if (!ent->e_tree_depth) { 124 /* Full truncate of this record. */
339 spin_unlock(&OCFS2_I(inode)->ip_lock); 125 list_move(&emi->ei_list, &tmp_list);
340 *ret_ent = ent; 126 BUG_ON(em->em_num_items == 0);
341 return 0; 127 em->em_num_items--;
342 } 128 continue;
343 blkno = le64_to_cpu(ent->e_rec.e_blkno);
344 spin_unlock(&OCFS2_I(inode)->ip_lock);
345
346 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
347 OCFS2_BH_CACHED, inode);
348 if (ret) {
349 mlog_errno(ret);
350 if (bh)
351 brelse(bh);
352 return ret;
353 } 129 }
354 eb = (struct ocfs2_extent_block *)bh->b_data;
355 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
356 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
357 brelse(bh);
358 return -EIO;
359 }
360 el = &eb->h_list;
361 } else {
362 spin_unlock(&OCFS2_I(inode)->ip_lock);
363 130
364 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 131 range = emi->ei_cpos + emi->ei_clusters;
365 OCFS2_I(inode)->ip_blkno, &bh, 132 if (range > cpos) {
366 OCFS2_BH_CACHED, inode); 133 /* Partial truncate */
367 if (ret) { 134 emi->ei_clusters = cpos - emi->ei_cpos;
368 mlog_errno(ret);
369 if (bh)
370 brelse(bh);
371 return ret;
372 } 135 }
373 di = (struct ocfs2_dinode *)bh->b_data;
374 if (!OCFS2_IS_VALID_DINODE(di)) {
375 brelse(bh);
376 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
377 return -EIO;
378 }
379 el = &di->id2.i_list;
380 }
381
382 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
383 brelse(bh);
384 if (ret) {
385 mlog_errno(ret);
386 return ret;
387 } 136 }
137 spin_unlock(&oi->ip_lock);
388 138
389 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); 139 list_for_each_safe(p, n, &tmp_list) {
390 if (!ent) { 140 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
391 ret = -ESRCH; 141 list_del(&emi->ei_list);
392 mlog_errno(ret); 142 kfree(emi);
393 return ret;
394 } 143 }
395
396 /* FIXME: Make sure this isn't a corruption */
397 BUG_ON(ent->e_tree_depth);
398
399 *ret_ent = ent;
400
401 return 0;
402} 144}
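Truncation of the cache has exactly two cases, as the code above shows: an item whose start is at or past cpos is unlinked and freed, and an item straddling cpos is clipped so that it ends there. A standalone model with hypothetical values (items are clipped to zero rather than freed, to keep the sketch short):

    #include <stdio.h>

    struct emi { unsigned int cpos, clusters; };

    /* Nothing at or beyond "cpos" may survive. */
    static void trunc(struct emi *items, int n, unsigned int cpos)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (items[i].cpos >= cpos)
                            items[i].clusters = 0;  /* full truncate */
                    else if (items[i].cpos + items[i].clusters > cpos)
                            items[i].clusters = cpos - items[i].cpos; /* partial */
            }
    }

    int main(void)
    {
            struct emi items[] = { { 0, 10 }, { 10, 10 }, { 30, 5 } };
            int i;

            trunc(items, 3, 15);
            for (i = 0; i < 3; i++)
                    printf("[%u, +%u]\n", items[i].cpos, items[i].clusters);
            /* prints [0, +10], [10, +5], [30, +0] */
            return 0;
    }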
403 145
404/* 146/*
405 * Callers must hold ip_lock. This can insert pieces of the tree, 147 * Is any part of emi2 contained within emi1?
406 * thus racing lookup if the lock weren't held.
407 */ 148 */
408static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, 149static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
409 struct ocfs2_extent_map_entry *ent) 150 struct ocfs2_extent_map_item *emi2)
410{ 151{
411 struct rb_node **p, *parent; 152 unsigned int range1, range2;
412 struct ocfs2_extent_map_entry *old_ent;
413 153
414 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), 154 /*
415 le32_to_cpu(ent->e_rec.e_clusters), 155 * Check if logical start of emi2 is inside emi1
416 &p, &parent); 156 */
417 if (old_ent) 157 range1 = emi1->ei_cpos + emi1->ei_clusters;
418 return -EEXIST; 158 if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
159 return 1;
419 160
420 rb_link_node(&ent->e_node, parent, p); 161 /*
421 rb_insert_color(&ent->e_node, &em->em_extents); 162 * Check if logical end of emi2 is inside emi1
163 */
164 range2 = emi2->ei_cpos + emi2->ei_clusters;
165 if (range2 > emi1->ei_cpos && range2 <= range1)
166 return 1;
422 167
423 return 0; 168 return 0;
424} 169}
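ocfs2_ei_is_contained() reports overlap when either endpoint of emi2 falls inside emi1's half-open range. A standalone model (hypothetical values):

    #include <stdio.h>

    struct emi { unsigned int cpos, clusters; };

    /* Does any part of "b" fall within "a"?  Ranges are half-open. */
    static int overlaps(const struct emi *a, const struct emi *b)
    {
            unsigned int a_end = a->cpos + a->clusters;
            unsigned int b_end = b->cpos + b->clusters;

            if (b->cpos >= a->cpos && b->cpos < a_end)
                    return 1;               /* b starts inside a */
            if (b_end > a->cpos && b_end <= a_end)
                    return 1;               /* b ends inside a */
            return 0;
    }

    int main(void)
    {
            struct emi a = { 10, 10 };      /* clusters [10, 20) */
            struct emi b = { 18, 5 };       /* clusters [18, 23) */

            printf("%d\n", overlaps(&a, &b));       /* 1: b starts inside a */
            return 0;
    }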
425 170
171static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
172 struct ocfs2_extent_map_item *src)
173{
174 dest->ei_cpos = src->ei_cpos;
175 dest->ei_phys = src->ei_phys;
176 dest->ei_clusters = src->ei_clusters;
177 dest->ei_flags = src->ei_flags;
178}
426 179
427/* 180/*
428 * Simple rule: on any return code other than -EAGAIN, anything left 181 * Try to merge emi with ins. Returns 1 if merge succeeds, zero
429 * in the insert_context will be freed. 182 * otherwise.
430 *
431 * Simple rule #2: A return code of -EEXIST from this function or
432 * its calls to ocfs2_extent_map_insert_entry() signifies that another
433 * thread beat us to the insert. It is not an actual error, but it
434 * tells the caller we have no more work to do.
435 */ 183 */
436static int ocfs2_extent_map_try_insert(struct inode *inode, 184static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
437 struct ocfs2_extent_rec *rec, 185 struct ocfs2_extent_map_item *ins)
438 int tree_depth,
439 struct ocfs2_em_insert_context *ctxt)
440{ 186{
441 int ret;
442 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
443 struct ocfs2_extent_map_entry *old_ent;
444
445 ctxt->need_left = 0;
446 ctxt->need_right = 0;
447 ctxt->old_ent = NULL;
448
449 spin_lock(&OCFS2_I(inode)->ip_lock);
450 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
451 if (!ret) {
452 ctxt->new_ent = NULL;
453 goto out_unlock;
454 }
455
456 /* Since insert_entry failed, the map MUST have old_ent */
457 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
458 le32_to_cpu(rec->e_clusters),
459 NULL, NULL);
460
461 BUG_ON(!old_ent);
462
463 if (old_ent->e_tree_depth < tree_depth) {
464 /* Another thread beat us to the lower tree_depth */
465 ret = -EEXIST;
466 goto out_unlock;
467 }
468
469 if (old_ent->e_tree_depth == tree_depth) {
470 /*
471 * Another thread beat us to this tree_depth.
472 * Let's make sure we agree with that thread (the
473 * extent_rec should be identical).
474 */
475 if (!memcmp(rec, &old_ent->e_rec,
476 sizeof(struct ocfs2_extent_rec)))
477 ret = 0;
478 else
479 /* FIXME: Should this be ESRCH/EBADR??? */
480 ret = -EEXIST;
481
482 goto out_unlock;
483 }
484
485 /* 187 /*
486 * We do it in this order specifically so that no actual tree 188 * Handle contiguousness
487 * changes occur until we have all the pieces we need. We
488 * don't want malloc failures to leave an inconsistent tree.
489 * Whenever we drop the lock, another process could be
490 * inserting. Also note that, if another process just beat us
491 * to an insert, we might not need the same pieces we needed
492 * the first go round. In the end, the pieces we need will
493 * be used, and the pieces we don't will be freed.
494 */ 189 */
495 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > 190 if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
496 le32_to_cpu(old_ent->e_rec.e_cpos)); 191 ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
497 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + 192 ins->ei_flags == emi->ei_flags) {
498 le32_to_cpu(old_ent->e_rec.e_clusters)) > 193 emi->ei_clusters += ins->ei_clusters;
499 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); 194 return 1;
500 ret = -EAGAIN; 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
501 if (ctxt->need_left) { 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
502 if (!ctxt->left_ent) 197 ins->ei_flags == emi->ei_flags) {
503 goto out_unlock; 198 emi->ei_phys = ins->ei_phys;
504 *(ctxt->left_ent) = *old_ent; 199 emi->ei_cpos = ins->ei_cpos;
505 ctxt->left_ent->e_rec.e_clusters = 200 emi->ei_clusters += ins->ei_clusters;
506 cpu_to_le32(le32_to_cpu(rec->e_cpos) - 201 return 1;
507 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
508 }
509 if (ctxt->need_right) {
510 if (!ctxt->right_ent)
511 goto out_unlock;
512 *(ctxt->right_ent) = *old_ent;
513 ctxt->right_ent->e_rec.e_cpos =
514 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
515 le32_to_cpu(rec->e_clusters));
516 ctxt->right_ent->e_rec.e_clusters =
517 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
518 le32_to_cpu(old_ent->e_rec.e_clusters)) -
519 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
520 }
521
522 rb_erase(&old_ent->e_node, &em->em_extents);
523 /* Now that he's erased, set him up for deletion */
524 ctxt->old_ent = old_ent;
525
526 if (ctxt->need_left) {
527 ret = ocfs2_extent_map_insert_entry(em,
528 ctxt->left_ent);
529 if (ret)
530 goto out_unlock;
531 ctxt->left_ent = NULL;
532 } 202 }
533 203
534 if (ctxt->need_right) { 204 /*
535 ret = ocfs2_extent_map_insert_entry(em, 205 * Overlapping extents - this shouldn't happen unless we've
536 ctxt->right_ent); 206 * split an extent to change it's flags. That is exceedingly
537 if (ret) 207 * rare, so there's no sense in trying to optimize it yet.
538 goto out_unlock; 208 */
539 ctxt->right_ent = NULL; 209 if (ocfs2_ei_is_contained(emi, ins) ||
210 ocfs2_ei_is_contained(ins, emi)) {
211 ocfs2_copy_emi_fields(emi, ins);
212 return 1;
540 } 213 }
541 214
542 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); 215 /* No merge was possible. */
543 216 return 0;
544 if (!ret)
545 ctxt->new_ent = NULL;
546
547out_unlock:
548 spin_unlock(&OCFS2_I(inode)->ip_lock);
549
550 return ret;
551} 217}
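The contiguity tests above are pure arithmetic: an appended extent must continue both the logical and the physical run and carry the same flags; the prepend case is the mirror image. A standalone worked example of the append case (hypothetical values):

    #include <stdio.h>

    struct emi { unsigned int cpos, phys, clusters, flags; };

    /* Append-merge "ins" into "e" when it continues both runs. */
    static int try_append(struct emi *e, const struct emi *ins)
    {
            if (ins->phys == e->phys + e->clusters &&
                ins->cpos == e->cpos + e->clusters &&
                ins->flags == e->flags) {
                    e->clusters += ins->clusters;
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            struct emi e   = { 100, 500, 8, 0 };    /* [100, +8) -> 500 */
            struct emi ins = { 108, 508, 4, 0 };    /* [108, +4) -> 508 */

            if (try_append(&e, &ins))
                    printf("merged: [%u, +%u] -> phys %u\n",
                           e.cpos, e.clusters, e.phys);  /* [100, +12] -> 500 */
            return 0;
    }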
552 218
553 219/*
554static int ocfs2_extent_map_insert(struct inode *inode, 220 * In order to reduce complexity on the caller, this insert function
555 struct ocfs2_extent_rec *rec, 221 * is intentionally liberal in what it will accept.
556 int tree_depth) 222 *
223 * The only rule is that the truncate call *must* be used whenever
224 * records have been deleted. This avoids inserting overlapping
225 * records with different physical mappings.
226 */
227void ocfs2_extent_map_insert_rec(struct inode *inode,
228 struct ocfs2_extent_rec *rec)
557{ 229{
558 int ret; 230 struct ocfs2_inode_info *oi = OCFS2_I(inode);
559 struct ocfs2_em_insert_context ctxt = {0, }; 231 struct ocfs2_extent_map *em = &oi->ip_extent_map;
560 232 struct ocfs2_extent_map_item *emi, *new_emi = NULL;
561 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > 233 struct ocfs2_extent_map_item ins;
562 OCFS2_I(inode)->ip_map.em_clusters) { 234
563 ret = -EBADR; 235 ins.ei_cpos = le32_to_cpu(rec->e_cpos);
564 mlog_errno(ret); 236 ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
565 return ret; 237 le64_to_cpu(rec->e_blkno));
238 ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
239 ins.ei_flags = rec->e_flags;
240
241search:
242 spin_lock(&oi->ip_lock);
243
244 list_for_each_entry(emi, &em->em_list, ei_list) {
245 if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
246 list_move(&emi->ei_list, &em->em_list);
247 spin_unlock(&oi->ip_lock);
248 goto out;
249 }
566 } 250 }
567 251
568 /* Zero e_clusters means a truncated tail record. It better be EOF */ 252 /*
569 if (!rec->e_clusters) { 253 * No item could be merged.
570 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != 254 *
571 OCFS2_I(inode)->ip_map.em_clusters) { 255 * Either allocate and add a new item, or overwrite the least recently
572 ret = -EBADR; 256 * inserted.
573 mlog_errno(ret); 257 */
574 ocfs2_error(inode->i_sb,
575 "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
576 (unsigned long long)le64_to_cpu(rec->e_blkno),
577 (unsigned long long)OCFS2_I(inode)->ip_blkno);
578 return ret;
579 }
580 258
581 /* Ignore the truncated tail */ 259 if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
582 return 0; 260 if (new_emi == NULL) {
583 } 261 spin_unlock(&oi->ip_lock);
584 262
585 ret = -ENOMEM; 263 new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
586 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, 264 if (new_emi == NULL)
587 GFP_NOFS); 265 goto out;
588 if (!ctxt.new_ent) {
589 mlog_errno(ret);
590 return ret;
591 }
592 266
593 ctxt.new_ent->e_rec = *rec; 267 goto search;
594 ctxt.new_ent->e_tree_depth = tree_depth;
595
596 do {
597 ret = -ENOMEM;
598 if (ctxt.need_left && !ctxt.left_ent) {
599 ctxt.left_ent =
600 kmem_cache_alloc(ocfs2_em_ent_cachep,
601 GFP_NOFS);
602 if (!ctxt.left_ent)
603 break;
604 }
605 if (ctxt.need_right && !ctxt.right_ent) {
606 ctxt.right_ent =
607 kmem_cache_alloc(ocfs2_em_ent_cachep,
608 GFP_NOFS);
609 if (!ctxt.right_ent)
610 break;
611 } 268 }
612 269
613 ret = ocfs2_extent_map_try_insert(inode, rec, 270 ocfs2_copy_emi_fields(new_emi, &ins);
614 tree_depth, &ctxt); 271 list_add(&new_emi->ei_list, &em->em_list);
615 } while (ret == -EAGAIN); 272 em->em_num_items++;
616 273 new_emi = NULL;
617 if ((ret < 0) && (ret != -EEXIST)) 274 } else {
618 mlog_errno(ret); 275 BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
276 emi = list_entry(em->em_list.prev,
277 struct ocfs2_extent_map_item, ei_list);
278 list_move(&emi->ei_list, &em->em_list);
279 ocfs2_copy_emi_fields(emi, &ins);
280 }
619 281
620 if (ctxt.left_ent) 282 spin_unlock(&oi->ip_lock);
621 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
622 if (ctxt.right_ent)
623 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
624 if (ctxt.old_ent)
625 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
626 if (ctxt.new_ent)
627 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
628 283
629 return ret; 284out:
285 if (new_emi)
286 kfree(new_emi);
630} 287}
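With OCFS2_MAX_EXTENT_MAP_ITEMS fixed at three, the insert path above degenerates into a tiny LRU: lookup hits and fresh inserts move to the front of the list, and once the cache is full the tail, the least recently used slot, is overwritten in place. A standalone model of that replacement policy (an array stands in for the kernel list):

    #include <stdio.h>
    #include <string.h>

    #define MAX_ITEMS 3

    /* Front of the array is most recently used; a full cache
     * overwrites the tail slot and moves it to the front. */
    static void insert(int cache[], int *n, int val)
    {
            if (*n < MAX_ITEMS)
                    (*n)++;
            memmove(&cache[1], &cache[0], (*n - 1) * sizeof(int));
            cache[0] = val;
    }

    int main(void)
    {
            int cache[MAX_ITEMS], n = 0, i;

            insert(cache, &n, 1);
            insert(cache, &n, 2);
            insert(cache, &n, 3);
            insert(cache, &n, 4);   /* evicts 1, the LRU entry */
            for (i = 0; i < n; i++)
                    printf("%d ", cache[i]);        /* 4 3 2 */
            printf("\n");
            return 0;
    }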
631 288
632/* 289/*
633 * Append this record to the tail of the extent map. It must be 290 * Return the 1st index within el which contains an extent start
634 * tree_depth 0. The record might be an extension of an existing 291 * larger than v_cluster.
635 * record, and as such that needs to be handled. eg:
636 *
637 * Existing record in the extent map:
638 *
639 * cpos = 10, len = 10
640 * |---------|
641 *
642 * New Record:
643 *
644 * cpos = 10, len = 20
645 * |------------------|
646 *
647 * The passed record is the new on-disk record. The new_clusters value
648 * is how many clusters were added to the file. If the append is a
649 * contiguous append, the new_clusters has been added to
650 * rec->e_clusters. If the append is an entirely new extent, then
651 * rec->e_clusters is == new_clusters.
652 */ 292 */
653int ocfs2_extent_map_append(struct inode *inode, 293static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
654 struct ocfs2_extent_rec *rec, 294 u32 v_cluster)
655 u32 new_clusters)
656{ 295{
657 int ret; 296 int i;
658 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 297 struct ocfs2_extent_rec *rec;
659 struct ocfs2_extent_map_entry *ent;
660 struct ocfs2_extent_rec *old;
661
662 BUG_ON(!new_clusters);
663 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
664 298
665 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { 299 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
666 /* 300 rec = &el->l_recs[i];
667 * Size changed underneath us on disk. Drop any
668 * straddling records and update our idea of
669 * i_clusters
670 */
671 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
672 em->em_clusters = OCFS2_I(inode)->ip_clusters;
673 }
674 301
675 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + 302 if (v_cluster < le32_to_cpu(rec->e_cpos))
676 le32_to_cpu(rec->e_clusters)) != 303 break;
677 (em->em_clusters + new_clusters),
678 "Inode %llu:\n"
679 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
680 "em->em_clusters = %u + new_clusters = %u = %u\n",
681 (unsigned long long)OCFS2_I(inode)->ip_blkno,
682 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
683 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
684 em->em_clusters, new_clusters,
685 em->em_clusters + new_clusters);
686
687 em->em_clusters += new_clusters;
688
689 ret = -ENOENT;
690 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
691 /* This is a contiguous append */
692 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
693 NULL, NULL);
694 if (ent) {
695 old = &ent->e_rec;
696 BUG_ON((le32_to_cpu(rec->e_cpos) +
697 le32_to_cpu(rec->e_clusters)) !=
698 (le32_to_cpu(old->e_cpos) +
699 le32_to_cpu(old->e_clusters) +
700 new_clusters));
701 if (ent->e_tree_depth == 0) {
702 BUG_ON(le32_to_cpu(old->e_cpos) !=
703 le32_to_cpu(rec->e_cpos));
704 BUG_ON(le64_to_cpu(old->e_blkno) !=
705 le64_to_cpu(rec->e_blkno));
706 ret = 0;
707 }
708 /*
709 * Let non-leafs fall through as -ENOENT to
710 * force insertion of the new leaf.
711 */
712 le32_add_cpu(&old->e_clusters, new_clusters);
713 }
714 } 304 }
715 305
716 if (ret == -ENOENT) 306 return i;
717 ret = ocfs2_extent_map_insert(inode, rec, 0);
718 if (ret < 0)
719 mlog_errno(ret);
720 return ret;
721} 307}
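ocfs2_search_for_hole_index() leans on l_recs being sorted by e_cpos: the first record starting beyond v_cluster bounds the hole, and if no such record exists the hole runs to the maximum cluster count. A standalone model of the simple case (hypothetical record list; the next-leaf traversal done by ocfs2_figure_hole_clusters() is omitted):

    #include <stdio.h>
    #include <limits.h>

    struct rec { unsigned int cpos, clusters; };

    /* Size of the hole at "v", given records sorted by start cluster. */
    static unsigned int hole_clusters(const struct rec *recs, int n,
                                      unsigned int v)
    {
            int i;

            for (i = 0; i < n; i++)
                    if (v < recs[i].cpos)
                            return recs[i].cpos - v;
            return UINT_MAX - v;    /* no allocation past v */
    }

    int main(void)
    {
            struct rec recs[] = { { 0, 4 }, { 10, 6 } };

            printf("%u\n", hole_clusters(recs, 2, 5));      /* 5: hole is [5, 10) */
            return 0;
    }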
722 308
723#if 0
724/* Code here is included but defined out as it completes the extent
725 * map api and may be used in the future. */
726
727/* 309/*
728 * Look up the record containing this cluster offset. This record is 310 * Figure out the size of a hole which starts at v_cluster within the given
729 * part of the extent map. Do not free it. Any changes you make to 311 * extent list.
730 * it will reflect in the extent map. So, if your last extent
731 * is (cpos = 10, clusters = 10) and you truncate the file by 5
732 * clusters, you can do:
733 * 312 *
734 * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); 313 * If there is no more allocation past v_cluster, we return the maximum
735 * rec->e_clusters -= 5; 314 * cluster size minus v_cluster.
736 * 315 *
737 * The lookup does not read from disk. If the map isn't filled in for 316 * If we have in-inode extents, then el points to the dinode list and
738 * an entry, you won't find it. 317 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
739 * 318 * containing el.
740 * Also note that the returned record is valid until alloc_sem is
741 * dropped. After that, truncate and extend can happen. Caveat Emptor.
742 */ 319 */
743int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, 320static int ocfs2_figure_hole_clusters(struct inode *inode,
744 struct ocfs2_extent_rec **rec, 321 struct ocfs2_extent_list *el,
745 int *tree_depth) 322 struct buffer_head *eb_bh,
323 u32 v_cluster,
324 u32 *num_clusters)
746{ 325{
747 int ret = -ENOENT; 326 int ret, i;
748 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 327 struct buffer_head *next_eb_bh = NULL;
749 struct ocfs2_extent_map_entry *ent; 328 struct ocfs2_extent_block *eb, *next_eb;
750 329
751 *rec = NULL; 330 i = ocfs2_search_for_hole_index(el, v_cluster);
752 331
753 if (cpos >= OCFS2_I(inode)->ip_clusters) 332 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
754 return -EINVAL; 333 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
755 334
756 if (cpos >= em->em_clusters) {
757 /* 335 /*
758 * Size changed underneath us on disk. Drop any 336 * Check the next leaf for any extents.
759 * straddling records and update our idea of
760 * i_clusters
761 */ 337 */
762 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
763 em->em_clusters = OCFS2_I(inode)->ip_clusters ;
764 }
765
766 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
767 NULL, NULL);
768 338
769 if (ent) { 339 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
770 *rec = &ent->e_rec; 340 goto no_more_extents;
771 if (tree_depth)
772 *tree_depth = ent->e_tree_depth;
773 ret = 0;
774 }
775 341
776 return ret; 342 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
777} 343 le64_to_cpu(eb->h_next_leaf_blk),
344 &next_eb_bh, OCFS2_BH_CACHED, inode);
345 if (ret) {
346 mlog_errno(ret);
347 goto out;
348 }
349 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
778 350
779int ocfs2_extent_map_get_clusters(struct inode *inode, 351 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
780 u32 v_cpos, int count, 352 ret = -EROFS;
781 u32 *p_cpos, int *ret_count) 353 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
782{ 354 goto out;
783 int ret; 355 }
784 u32 coff, ccount;
785 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
786 struct ocfs2_extent_map_entry *ent = NULL;
787 356
788 *p_cpos = ccount = 0; 357 el = &next_eb->h_list;
789 358
790 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) 359 i = ocfs2_search_for_hole_index(el, v_cluster);
791 return -EINVAL; 360 }
792 361
793 if ((v_cpos + count) > em->em_clusters) { 362no_more_extents:
363 if (i == le16_to_cpu(el->l_next_free_rec)) {
794 /* 364 /*
795 * Size changed underneath us on disk. Drop any 365 * We're at the end of our existing allocation. Just
796 * straddling records and update our idea of 366 * return the maximum number of clusters we could
797 * i_clusters 367 * possibly allocate.
798 */ 368 */
799 ocfs2_extent_map_drop(inode, em->em_clusters - 1); 369 *num_clusters = UINT_MAX - v_cluster;
800 em->em_clusters = OCFS2_I(inode)->ip_clusters; 370 } else {
371 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
801 } 372 }
802 373
374 ret = 0;
375out:
376 brelse(next_eb_bh);
377 return ret;
378}
803 379
804 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); 380/*
805 if (ret) 381 * Return the index of the extent record which contains cluster #v_cluster.
806 return ret; 382 * -1 is returned if it was not found.
383 *
384 * Should work fine on interior and exterior nodes.
385 */
386static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
387 u32 v_cluster)
388{
389 int ret = -1;
390 int i;
391 struct ocfs2_extent_rec *rec;
392 u32 rec_end, rec_start, clusters;
807 393
808 if (ent) { 394 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
809 /* We should never find ourselves straddling an interval */ 395 rec = &el->l_recs[i];
810 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
811 v_cpos,
812 count))
813 return -ESRCH;
814 396
815 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); 397 rec_start = le32_to_cpu(rec->e_cpos);
816 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 398 clusters = ocfs2_rec_clusters(el, rec);
817 le64_to_cpu(ent->e_rec.e_blkno)) +
818 coff;
819 399
820 if (ret_count) 400 rec_end = rec_start + clusters;
821 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
822 401
823 return 0; 402 if (v_cluster >= rec_start && v_cluster < rec_end) {
403 ret = i;
404 break;
405 }
824 } 406 }
825 407
826 408 return ret;
827 return -ENOENT;
828} 409}
829 410
830#endif /* 0 */ 411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
831 412 u32 *p_cluster, u32 *num_clusters,
832int ocfs2_extent_map_get_blocks(struct inode *inode, 413 unsigned int *extent_flags)
833 u64 v_blkno, int count,
834 u64 *p_blkno, int *ret_count)
835{ 414{
836 int ret; 415 int ret, i;
837 u64 boff; 416 unsigned int flags = 0;
838 u32 cpos, clusters; 417 struct buffer_head *di_bh = NULL;
839 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 418 struct buffer_head *eb_bh = NULL;
840 struct ocfs2_extent_map_entry *ent = NULL; 419 struct ocfs2_dinode *di;
841 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 420 struct ocfs2_extent_block *eb;
421 struct ocfs2_extent_list *el;
842 struct ocfs2_extent_rec *rec; 422 struct ocfs2_extent_rec *rec;
423 u32 coff;
843 424
844 *p_blkno = 0; 425 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
845 426 num_clusters, extent_flags);
846 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); 427 if (ret == 0)
847 clusters = ocfs2_blocks_to_clusters(inode->i_sb, 428 goto out;
848 (u64)count + bpc - 1);
849 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
850 ret = -EINVAL;
851 mlog_errno(ret);
852 return ret;
853 }
854
855 if ((cpos + clusters) > em->em_clusters) {
856 /*
857 * Size changed underneath us on disk. Drop any
858 * straddling records and update our idea of
859 * i_clusters
860 */
861 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
862 em->em_clusters = OCFS2_I(inode)->ip_clusters;
863 }
864 429
865 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); 430 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
431 &di_bh, OCFS2_BH_CACHED, inode);
866 if (ret) { 432 if (ret) {
867 mlog_errno(ret); 433 mlog_errno(ret);
868 return ret; 434 goto out;
869 } 435 }
870 436
871 if (ent) 437 di = (struct ocfs2_dinode *) di_bh->b_data;
872 { 438 el = &di->id2.i_list;
873 rec = &ent->e_rec;
874 439
875 /* We should never find ourselves straddling an interval */ 440 if (el->l_tree_depth) {
876 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { 441 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
877 ret = -ESRCH; 442 if (ret) {
878 mlog_errno(ret); 443 mlog_errno(ret);
879 return ret; 444 goto out;
880 } 445 }
881 446
882 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - 447 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
883 le32_to_cpu(rec->e_cpos)); 448 el = &eb->h_list;
884 boff += (v_blkno & (u64)(bpc - 1));
885 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
886 449
887 if (ret_count) { 450 if (el->l_tree_depth) {
888 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, 451 ocfs2_error(inode->i_sb,
889 le32_to_cpu(rec->e_clusters)) - boff; 452 "Inode %lu has non zero tree depth in "
453 "leaf block %llu\n", inode->i_ino,
454 (unsigned long long)eb_bh->b_blocknr);
455 ret = -EROFS;
456 goto out;
890 } 457 }
891
892 return 0;
893 } 458 }
894 459
895 return -ENOENT; 460 i = ocfs2_search_extent_list(el, v_cluster);
896} 461 if (i == -1) {
897 462 /*
898int ocfs2_extent_map_init(struct inode *inode) 463 * A hole was found. Return some canned values that
899{ 464 * callers can key on. If asked for, num_clusters will
900 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 465 * be populated with the size of the hole.
901 466 */
902 em->em_extents = RB_ROOT; 467 *p_cluster = 0;
903 em->em_clusters = 0; 468 if (num_clusters) {
904 469 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
905 return 0; 470 v_cluster,
906} 471 num_clusters);
907 472 if (ret) {
908/* Needs the lock */ 473 mlog_errno(ret);
909static void __ocfs2_extent_map_drop(struct inode *inode, 474 goto out;
910 u32 new_clusters, 475 }
911 struct rb_node **free_head, 476 }
912 struct ocfs2_extent_map_entry **tail_ent) 477 } else {
913{ 478 rec = &el->l_recs[i];
914 struct rb_node *node, *next;
915 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
916 struct ocfs2_extent_map_entry *ent;
917 479
918 *free_head = NULL; 480 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
919 481
920 ent = NULL; 482 if (!rec->e_blkno) {
921 node = rb_last(&em->em_extents); 483 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
922 while (node) 484 "record (%u, %u, 0)", inode->i_ino,
923 { 485 le32_to_cpu(rec->e_cpos),
924 next = rb_prev(node); 486 ocfs2_rec_clusters(el, rec));
487 ret = -EROFS;
488 goto out;
489 }
925 490
926 ent = rb_entry(node, struct ocfs2_extent_map_entry, 491 coff = v_cluster - le32_to_cpu(rec->e_cpos);
927 e_node);
928 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
929 break;
930 492
931 rb_erase(&ent->e_node, &em->em_extents); 493 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
494 le64_to_cpu(rec->e_blkno));
495 *p_cluster = *p_cluster + coff;
932 496
933 node->rb_right = *free_head; 497 if (num_clusters)
934 *free_head = node; 498 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
935 499
936 ent = NULL; 500 flags = rec->e_flags;
937 node = next;
938 }
939 501
940 /* Do we have an entry straddling new_clusters? */ 502 ocfs2_extent_map_insert_rec(inode, rec);
941 if (tail_ent) {
942 if (ent &&
943 ((le32_to_cpu(ent->e_rec.e_cpos) +
944 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
945 *tail_ent = ent;
946 else
947 *tail_ent = NULL;
948 } 503 }
949}
950
951static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
952{
953 struct rb_node *node;
954 struct ocfs2_extent_map_entry *ent;
955 504
956 while (free_head) { 505 if (extent_flags)
957 node = free_head; 506 *extent_flags = flags;
958 free_head = node->rb_right;
959 507
960 ent = rb_entry(node, struct ocfs2_extent_map_entry, 508out:
961 e_node); 509 brelse(di_bh);
962 kmem_cache_free(ocfs2_em_ent_cachep, ent); 510 brelse(eb_bh);
963 } 511 return ret;
964} 512}
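As the hole comment above says, ocfs2_get_clusters() hands back canned values for unallocated space: *p_cluster == 0, with num_clusters optionally set to the hole size. A hypothetical caller sketch (handle_hole() and handle_mapped() are illustrative placeholders, not ocfs2 functions):

    u32 p_cluster, num_clusters;
    int ret;

    ret = ocfs2_get_clusters(inode, v_cluster, &p_cluster,
                             &num_clusters, NULL);
    if (ret)
            mlog_errno(ret);
    else if (p_cluster == 0)
            handle_hole(v_cluster, num_clusters);   /* no allocation here */
    else
            handle_mapped(p_cluster, num_clusters); /* [p_cluster, +num_clusters) */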
965 513
966/* 514/*
967 * Remove all entries past new_clusters, inclusive of an entry that 515 * This expects alloc_sem to be held. The allocation cannot change at
968 * contains new_clusters. This is effectively a cache forget. 516 * all while the map is in the process of being updated.
969 *
970 * If you want to also clip the last extent by some number of clusters,
971 * you need to call ocfs2_extent_map_trunc().
972 * This code does not check or modify ip_clusters.
973 */ 517 */
974int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) 518int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
519 u64 *ret_count, unsigned int *extent_flags)
975{ 520{
976 struct rb_node *free_head = NULL; 521 int ret;
977 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 522 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
978 struct ocfs2_extent_map_entry *ent; 523 u32 cpos, num_clusters, p_cluster;
979 524 u64 boff = 0;
980 spin_lock(&OCFS2_I(inode)->ip_lock);
981 525
982 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); 526 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
983 527
984 if (ent) { 528 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
985 rb_erase(&ent->e_node, &em->em_extents); 529 extent_flags);
986 ent->e_node.rb_right = free_head; 530 if (ret) {
987 free_head = &ent->e_node; 531 mlog_errno(ret);
532 goto out;
988 } 533 }
989 534
990 spin_unlock(&OCFS2_I(inode)->ip_lock); 535 /*
991 536 * p_cluster == 0 indicates a hole.
992 if (free_head) 537 */
993 __ocfs2_extent_map_drop_cleanup(free_head); 538 if (p_cluster) {
994 539 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
995 return 0; 540 boff += (v_blkno & (u64)(bpc - 1));
996} 541 }
997
998/*
999 * Remove all entries past new_clusters and also clip any extent
1000 * straddling new_clusters, if there is one. This does not check
1001 * or modify ip_clusters
1002 */
1003int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
1004{
1005 struct rb_node *free_head = NULL;
1006 struct ocfs2_extent_map_entry *ent = NULL;
1007
1008 spin_lock(&OCFS2_I(inode)->ip_lock);
1009
1010 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
1011
1012 if (ent)
1013 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
1014 le32_to_cpu(ent->e_rec.e_cpos));
1015
1016 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
1017
1018 spin_unlock(&OCFS2_I(inode)->ip_lock);
1019
1020 if (free_head)
1021 __ocfs2_extent_map_drop_cleanup(free_head);
1022
1023 return 0;
1024}
1025 542
1026int __init init_ocfs2_extent_maps(void) 543 *p_blkno = boff;
1027{
1028 ocfs2_em_ent_cachep =
1029 kmem_cache_create("ocfs2_em_ent",
1030 sizeof(struct ocfs2_extent_map_entry),
1031 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1032 if (!ocfs2_em_ent_cachep)
1033 return -ENOMEM;
1034 544
1035 return 0; 545 if (ret_count) {
1036} 546 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
547 *ret_count -= v_blkno & (u64)(bpc - 1);
548 }
1037 549
1038void exit_ocfs2_extent_maps(void) 550out:
1039{ 551 return ret;
1040 kmem_cache_destroy(ocfs2_em_ent_cachep);
1041} 552}
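ocfs2_extent_map_get_blocks() is now a thin wrapper that turns a block question into a cluster question and back. With bpc blocks per cluster (always a power of two, which is why the mask works), the interesting step is re-applying the sub-cluster offset v_blkno & (bpc - 1) after the cluster lookup. A standalone model with hypothetical geometry:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long bpc = 8, v_blkno = 27;   /* 8 blocks/cluster */
            unsigned int p_cluster = 500;   /* pretend lookup: cluster 3 -> 500 */
            unsigned long long p_blkno;

            /* cpos = v_blkno / bpc = 3, which the lookup mapped to 500 */
            p_blkno = (unsigned long long)p_cluster * bpc;  /* first block */
            p_blkno += v_blkno & (bpc - 1);                 /* 27 % 8 = 3  */
            printf("p_blkno = %llu\n", p_blkno);            /* 4003 */
            return 0;
    }

The ret_count computation is the complement: the run's size in blocks minus that same sub-cluster offset.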
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa886..de91e3e41a22 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,29 @@
25#ifndef _EXTENT_MAP_H 25#ifndef _EXTENT_MAP_H
26#define _EXTENT_MAP_H 26#define _EXTENT_MAP_H
27 27
28int init_ocfs2_extent_maps(void); 28struct ocfs2_extent_map_item {
29void exit_ocfs2_extent_maps(void); 29 unsigned int ei_cpos;
30 unsigned int ei_phys;
31 unsigned int ei_clusters;
32 unsigned int ei_flags;
30 33
31/* 34 struct list_head ei_list;
32 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem 35};
33 * to be held. The allocation cannot change at all while the map is 36
34 * in the process of being updated. 37#define OCFS2_MAX_EXTENT_MAP_ITEMS 3
35 */ 38struct ocfs2_extent_map {
36int ocfs2_extent_map_init(struct inode *inode); 39 unsigned int em_num_items;
37int ocfs2_extent_map_append(struct inode *inode, 40 struct list_head em_list;
38 struct ocfs2_extent_rec *rec, 41};
39 u32 new_clusters); 42
40int ocfs2_extent_map_get_blocks(struct inode *inode, 43void ocfs2_extent_map_init(struct inode *inode);
41 u64 v_blkno, int count, 44void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
42 u64 *p_blkno, int *ret_count); 45void ocfs2_extent_map_insert_rec(struct inode *inode,
43int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); 46 struct ocfs2_extent_rec *rec);
44int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); 47
48int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
49 u32 *num_clusters, unsigned int *extent_flags);
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags);
45 52
46#endif /* _EXTENT_MAP_H */ 53#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f2cd3bf9efb2..520a2a6d7670 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/pipe_fs_i.h> 34#include <linux/pipe_fs_i.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h>
36 37
37#define MLOG_MASK_PREFIX ML_INODE 38#define MLOG_MASK_PREFIX ML_INODE
38#include <cluster/masklog.h> 39#include <cluster/masklog.h>
@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,
215 216
216 mlog_entry_void(); 217 mlog_entry_void();
217 i_size_write(inode, new_i_size); 218 i_size_write(inode, new_i_size);
218 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 219 inode->i_blocks = ocfs2_inode_sector_count(inode);
219 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
220 221
221 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
261{ 262{
262 int status; 263 int status;
263 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di;
264 266
265 mlog_entry_void(); 267 mlog_entry_void();
266 268
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
274 goto out; 276 goto out;
275 } 277 }
276 278
277 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 279 status = ocfs2_journal_access(handle, inode, fe_bh,
280 OCFS2_JOURNAL_ACCESS_WRITE);
281 if (status < 0) {
282 mlog_errno(status);
283 goto out_commit;
284 }
285
286 /*
287 * Do this before setting i_size.
288 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
290 if (status) {
291 mlog_errno(status);
292 goto out_commit;
293 }
294
295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298
299 di = (struct ocfs2_dinode *) fe_bh->b_data;
300 di->i_size = cpu_to_le64(new_i_size);
301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
303
304 status = ocfs2_journal_dirty(handle, fe_bh);
278 if (status < 0) 305 if (status < 0)
279 mlog_errno(status); 306 mlog_errno(status);
280 307
308out_commit:
281 ocfs2_commit_trans(osb, handle); 309 ocfs2_commit_trans(osb, handle);
282out: 310out:
311
283 mlog_exit(status); 312 mlog_exit(status);
284 return status; 313 return status;
285} 314}
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
342 mlog_errno(status); 371 mlog_errno(status);
343 goto bail; 372 goto bail;
344 } 373 }
345 ocfs2_data_unlock(inode, 1);
346
347 if (le32_to_cpu(fe->i_clusters) ==
348 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
349 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
350 fe->i_clusters);
351 /* No allocation change is required, so lets fast path
352 * this truncate. */
353 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
354 if (status < 0)
355 mlog_errno(status);
356 goto bail;
357 }
358 374
359 /* alright, we're going to need to do a full blown alloc size 375 /* alright, we're going to need to do a full blown alloc size
360 * change. Orphan the inode so that recovery can complete the 376 * change. Orphan the inode so that recovery can complete the
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
363 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 379 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
364 if (status < 0) { 380 if (status < 0) {
365 mlog_errno(status); 381 mlog_errno(status);
366 goto bail; 382 goto bail_unlock_data;
367 } 383 }
368 384
369 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 385 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
370 if (status < 0) { 386 if (status < 0) {
371 mlog_errno(status); 387 mlog_errno(status);
372 goto bail; 388 goto bail_unlock_data;
373 } 389 }
374 390
375 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 391 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
376 if (status < 0) { 392 if (status < 0) {
377 mlog_errno(status); 393 mlog_errno(status);
378 goto bail; 394 goto bail_unlock_data;
379 } 395 }
380 396
381 /* TODO: orphan dir cleanup here. */ 397 /* TODO: orphan dir cleanup here. */
398bail_unlock_data:
399 ocfs2_data_unlock(inode, 1);
400
382bail: 401bail:
383 402
384 mlog_exit(status); 403 mlog_exit(status);
@@ -397,6 +416,7 @@ bail:
397 */ 416 */
398int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 417int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
399 struct inode *inode, 418 struct inode *inode,
419 u32 *logical_offset,
400 u32 clusters_to_add, 420 u32 clusters_to_add,
401 struct buffer_head *fe_bh, 421 struct buffer_head *fe_bh,
402 handle_t *handle, 422 handle_t *handle,
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
460 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 480 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
461 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 481 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
462 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 482 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
463 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 483 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
464 num_bits, meta_ac); 484 *logical_offset, block, num_bits,
485 meta_ac);
465 if (status < 0) { 486 if (status < 0) {
466 mlog_errno(status); 487 mlog_errno(status);
467 goto leave; 488 goto leave;
468 } 489 }
469 490
470 le32_add_cpu(&fe->i_clusters, num_bits);
471 spin_lock(&OCFS2_I(inode)->ip_lock);
472 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
473 spin_unlock(&OCFS2_I(inode)->ip_lock);
474
475 status = ocfs2_journal_dirty(handle, fe_bh); 491 status = ocfs2_journal_dirty(handle, fe_bh);
476 if (status < 0) { 492 if (status < 0) {
477 mlog_errno(status); 493 mlog_errno(status);
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
479 } 495 }
480 496
481 clusters_to_add -= num_bits; 497 clusters_to_add -= num_bits;
498 *logical_offset += num_bits;
482 499
483 if (clusters_to_add) { 500 if (clusters_to_add) {
484 mlog(0, "need to alloc once more, clusters = %u, wanted = " 501 mlog(0, "need to alloc once more, clusters = %u, wanted = "
@@ -494,14 +511,87 @@ leave:
494 return status; 511 return status;
495} 512}
496 513
514/*
515 * For a given allocation, determine which allocators will need to be
516 * accessed, and lock them, reserving the appropriate number of bits.
517 *
518 * Called from ocfs2_extend_allocation() for file systems which don't
519 * support holes, and from ocfs2_write() for file systems which
520 * understand sparse inodes.
521 */
522int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
523 u32 clusters_to_add,
524 struct ocfs2_alloc_context **data_ac,
525 struct ocfs2_alloc_context **meta_ac)
526{
527 int ret, num_free_extents;
528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
529
530 *meta_ac = NULL;
531 *data_ac = NULL;
532
533 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
534 "clusters_to_add = %u\n",
535 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
536 le32_to_cpu(di->i_clusters), clusters_to_add);
537
538 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
539 if (num_free_extents < 0) {
540 ret = num_free_extents;
541 mlog_errno(ret);
542 goto out;
543 }
544
545 /*
546 * Sparse allocation file systems need to be more conservative
547 * with reserving room for expansion - the actual allocation
548 * happens while we've got a journal handle open so re-taking
549 * a cluster lock (because we ran out of room for another
550 * extent) will violate ordering rules.
551 *
552 * Most of the time we'll only be seeing this 1 cluster at a time
553 * anyway.
554 */
555 if (!num_free_extents ||
556 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
557 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
558 if (ret < 0) {
559 if (ret != -ENOSPC)
560 mlog_errno(ret);
561 goto out;
562 }
563 }
564
565 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
566 if (ret < 0) {
567 if (ret != -ENOSPC)
568 mlog_errno(ret);
569 goto out;
570 }
571
572out:
573 if (ret) {
574 if (*meta_ac) {
575 ocfs2_free_alloc_context(*meta_ac);
576 *meta_ac = NULL;
577 }
578
579 /*
580	 * We cannot have an error and a non-NULL *data_ac.
581 */
582 }
583
584 return ret;
585}
586
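
A note on the calling convention established by ocfs2_lock_allocators() above: both allocators are reserved before any journal handle is opened, so a transaction never has to re-take a cluster lock mid-handle. A minimal sketch of the expected call sequence, using only names that appear in this patch (illustrative, not a hunk; error paths elided):

	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
	handle_t *handle;
	int status;

	/* Reserve data and (possibly) metadata bits up front... */
	status = ocfs2_lock_allocators(inode, fe, clusters_to_add,
				       &data_ac, &meta_ac);
	if (status)
		goto out;

	/* ...so the handle below never re-takes a cluster lock mid-way. */
	handle = ocfs2_start_trans(osb,
			ocfs2_calc_extend_credits(osb->sb, fe,
						  clusters_to_add));
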
497static int ocfs2_extend_allocation(struct inode *inode, 587static int ocfs2_extend_allocation(struct inode *inode,
498 u32 clusters_to_add) 588 u32 clusters_to_add)
499{ 589{
500 int status = 0; 590 int status = 0;
501 int restart_func = 0; 591 int restart_func = 0;
502 int drop_alloc_sem = 0; 592 int drop_alloc_sem = 0;
503 int credits, num_free_extents; 593 int credits;
504 u32 prev_clusters; 594 u32 prev_clusters, logical_start;
505 struct buffer_head *bh = NULL; 595 struct buffer_head *bh = NULL;
506 struct ocfs2_dinode *fe = NULL; 596 struct ocfs2_dinode *fe = NULL;
507 handle_t *handle = NULL; 597 handle_t *handle = NULL;
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
512 602
513 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 603 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
514 604
605 /*
606 * This function only exists for file systems which don't
607 * support holes.
608 */
609 BUG_ON(ocfs2_sparse_alloc(osb));
610
515 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 611 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
516 OCFS2_BH_CACHED, inode); 612 OCFS2_BH_CACHED, inode);
517 if (status < 0) { 613 if (status < 0) {
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
526 goto leave; 622 goto leave;
527 } 623 }
528 624
625 logical_start = OCFS2_I(inode)->ip_clusters;
626
529restart_all: 627restart_all:
530 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 628 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
531 629
532 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
533 "clusters_to_add = %u\n",
534 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
535 fe->i_clusters, clusters_to_add);
536
537 num_free_extents = ocfs2_num_free_extents(osb,
538 inode,
539 fe);
540 if (num_free_extents < 0) {
541 status = num_free_extents;
542 mlog_errno(status);
543 goto leave;
544 }
545
546 if (!num_free_extents) {
547 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
548 if (status < 0) {
549 if (status != -ENOSPC)
550 mlog_errno(status);
551 goto leave;
552 }
553 }
554
555 status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
556 if (status < 0) {
557 if (status != -ENOSPC)
558 mlog_errno(status);
559 goto leave;
560 }
561
562 /* blocks people in read/write from reading our allocation 630
563 * until we're done changing it. We depend on i_mutex to block 631 * until we're done changing it. We depend on i_mutex to block
564 * other extend/truncate calls while we're here. Ordering wrt 632 * other extend/truncate calls while we're here. Ordering wrt
@@ -566,6 +634,13 @@ restart_all:
566 down_write(&OCFS2_I(inode)->ip_alloc_sem); 634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
567 drop_alloc_sem = 1; 635 drop_alloc_sem = 1;
568 636
637 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
638 &meta_ac);
639 if (status) {
640 mlog_errno(status);
641 goto leave;
642 }
643
569 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 644 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
570 handle = ocfs2_start_trans(osb, credits); 645 handle = ocfs2_start_trans(osb, credits);
571 if (IS_ERR(handle)) { 646 if (IS_ERR(handle)) {
@@ -590,6 +665,7 @@ restarted_transaction:
590 665
591 status = ocfs2_do_extend_allocation(osb, 666 status = ocfs2_do_extend_allocation(osb,
592 inode, 667 inode,
668 &logical_start,
593 clusters_to_add, 669 clusters_to_add,
594 bh, 670 bh,
595 handle, 671 handle,
@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
778 size_t tail_to_skip) 854 size_t tail_to_skip)
779{ 855{
780 int ret = 0; 856 int ret = 0;
781 u32 clusters_to_add; 857 u32 clusters_to_add = 0;
782 858
783 BUG_ON(!tail_to_skip && !di_bh); 859 BUG_ON(!tail_to_skip && !di_bh);
784 860
@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
790 goto out; 866 goto out;
791 BUG_ON(new_i_size < i_size_read(inode)); 867 BUG_ON(new_i_size < i_size_read(inode));
792 868
869 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
870 BUG_ON(tail_to_skip != 0);
871 goto out_update_size;
872 }
873
793 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 874 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
794 OCFS2_I(inode)->ip_clusters; 875 OCFS2_I(inode)->ip_clusters;
795 876
@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
825 goto out_unlock; 906 goto out_unlock;
826 } 907 }
827 908
909out_update_size:
828 if (!tail_to_skip) { 910 if (!tail_to_skip) {
829 /* We're being called from ocfs2_setattr() which wants 911 /* We're being called from ocfs2_setattr() which wants
830 * us to update i_size */ 912 * us to update i_size */
@@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode,
834 } 916 }
835 917
836out_unlock: 918out_unlock:
837 ocfs2_data_unlock(inode, 1); 919 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
920 ocfs2_data_unlock(inode, 1);
838 921
839out: 922out:
840 return ret; 923 return ret;
@@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
972 1055
973 ret = ocfs2_meta_lock(inode, NULL, 0); 1056 ret = ocfs2_meta_lock(inode, NULL, 0);
974 if (ret) { 1057 if (ret) {
975 mlog_errno(ret); 1058 if (ret != -ENOENT)
1059 mlog_errno(ret);
976 goto out; 1060 goto out;
977 } 1061 }
978 1062
@@ -1035,10 +1119,49 @@ out:
1035 return ret; 1119 return ret;
1036} 1120}
1037 1121
1122/*
1123 * Will look for holes and unwritten extents in the range starting at
1124 * pos for count bytes (inclusive).
1125 */
1126static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1127 size_t count)
1128{
1129 int ret = 0;
1130 unsigned int extent_flags;
1131 u32 cpos, clusters, extent_len, phys_cpos;
1132 struct super_block *sb = inode->i_sb;
1133
1134 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1135 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1136
1137 while (clusters) {
1138 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1139 &extent_flags);
1140 if (ret < 0) {
1141 mlog_errno(ret);
1142 goto out;
1143 }
1144
1145 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1146 ret = 1;
1147 break;
1148 }
1149
1150 if (extent_len > clusters)
1151 extent_len = clusters;
1152
1153 clusters -= extent_len;
1154 cpos += extent_len;
1155 }
1156out:
1157 return ret;
1158}
1159
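
The cpos/clusters arithmetic in ocfs2_check_range_for_holes() converts the byte range into a cluster range to scan; ocfs2_clusters_for_bytes() rounds up, so the end cluster is exclusive. A standalone worked example (illustrative only, assuming 4KB clusters, i.e. s_clustersize_bits = 12):

	#include <stdio.h>

	int main(void)
	{
		unsigned int cbits = 12;              /* 4KB clusters, assumed */
		long long pos = 10000, count = 5000;  /* hypothetical write */
		unsigned int cpos = (unsigned int)(pos >> cbits);
		/* round-up division, as ocfs2_clusters_for_bytes() does */
		unsigned int clusters = (unsigned int)
			((pos + count + (1LL << cbits) - 1) >> cbits) - cpos;

		printf("scan clusters [%u, %u)\n", cpos, cpos + clusters);
		/* prints: scan clusters [2, 4) */
		return 0;
	}
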
1038static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1160static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1039 loff_t *ppos, 1161 loff_t *ppos,
1040 size_t count, 1162 size_t count,
1041 int appending) 1163 int appending,
1164 int *direct_io)
1042{ 1165{
1043 int ret = 0, meta_level = appending; 1166 int ret = 0, meta_level = appending;
1044 struct inode *inode = dentry->d_inode; 1167 struct inode *inode = dentry->d_inode;
@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1089 } else { 1212 } else {
1090 saved_pos = *ppos; 1213 saved_pos = *ppos;
1091 } 1214 }
1215
1216 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1217 loff_t end = saved_pos + count;
1218
1219 /*
1220 * Skip the O_DIRECT checks if we don't need
1221 * them.
1222 */
1223 if (!direct_io || !(*direct_io))
1224 break;
1225
1226 /*
1227 * Allowing concurrent direct writes means
1228 * i_size changes wouldn't be synchronized, so
1229 * one node could wind up truncating another
1230 * nodes writes.
1231 */
1232 if (end > i_size_read(inode)) {
1233 *direct_io = 0;
1234 break;
1235 }
1236
1237 /*
1238 * We don't fill holes during direct io, so
1239 * check for them here. If any are found, the
1240 * caller will have to retake some cluster
1241 * locks and initiate the io as buffered.
1242 */
1243 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1244 count);
1245 if (ret == 1) {
1246 *direct_io = 0;
1247 ret = 0;
1248 } else if (ret < 0)
1249 mlog_errno(ret);
1250 break;
1251 }
1252
1253 /*
1254 * The rest of this loop is concerned with legacy file
1255 * systems which don't support sparse files.
1256 */
1257
1092 newsize = count + saved_pos; 1258 newsize = count + saved_pos;
1093 1259
1094 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1260 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@@ -1141,55 +1307,264 @@ out:
1141 return ret; 1307 return ret;
1142} 1308}
1143 1309
1310static inline void
1311ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1312{
1313 const struct iovec *iov = *iovp;
1314 size_t base = *basep;
1315
1316 do {
1317 int copy = min(bytes, iov->iov_len - base);
1318
1319 bytes -= copy;
1320 base += copy;
1321 if (iov->iov_len == base) {
1322 iov++;
1323 base = 0;
1324 }
1325 } while (bytes);
1326 *iovp = iov;
1327 *basep = base;
1328}
1329
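
ocfs2_set_next_iovec() above advances an (iovec, offset) cursor by a byte count, stepping to the next segment exactly when the current one is exhausted. A standalone userspace demonstration of the same logic (mock, not part of the patch):

	#include <stdio.h>
	#include <sys/uio.h>

	static void set_next_iovec(const struct iovec **iovp, size_t *basep,
				   size_t bytes)
	{
		const struct iovec *iov = *iovp;
		size_t base = *basep;

		do {
			size_t len = iov->iov_len - base;
			size_t copy = bytes < len ? bytes : len;

			bytes -= copy;
			base += copy;
			if (iov->iov_len == base) {	/* segment consumed */
				iov++;
				base = 0;
			}
		} while (bytes);
		*iovp = iov;
		*basep = base;
	}

	int main(void)
	{
		char a[4], b[8];
		struct iovec v[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
		const struct iovec *cur = v;
		size_t off = 0;

		set_next_iovec(&cur, &off, 6);	/* eat all of a, 2 bytes of b */
		printf("segment %ld, offset %zu\n", (long)(cur - v), off);
		/* prints: segment 1, offset 2 */
		return 0;
	}
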
1330static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
1331 const struct iovec *cur_iov,
1332 size_t iov_offset)
1333{
1334 int ret;
1335 char *buf;
1336 struct page *src_page = NULL;
1337
1338 buf = cur_iov->iov_base + iov_offset;
1339
1340 if (!segment_eq(get_fs(), KERNEL_DS)) {
1341 /*
1342 * Pull in the user page. We want to do this outside
1343 * of the meta data locks in order to preserve locking
1344 * order in case of page fault.
1345 */
1346 ret = get_user_pages(current, current->mm,
1347 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1348 0, 0, &src_page, NULL);
1349 if (ret == 1)
1350 bp->b_src_buf = kmap(src_page);
1351 else
1352 src_page = ERR_PTR(-EFAULT);
1353 } else {
1354 bp->b_src_buf = buf;
1355 }
1356
1357 return src_page;
1358}
1359
1360static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1361 struct page *page)
1362{
1363 if (page) {
1364 kunmap(page);
1365 page_cache_release(page);
1366 }
1367}
1368
1369static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1370 const struct iovec *iov,
1371 unsigned long nr_segs,
1372 size_t count,
1373 ssize_t o_direct_written)
1374{
1375 int ret = 0;
1376 ssize_t copied, total = 0;
1377 size_t iov_offset = 0;
1378 const struct iovec *cur_iov = iov;
1379 struct ocfs2_buffered_write_priv bp;
1380 struct page *page;
1381
1382 /*
1383 * handle partial DIO write. Adjust cur_iov if needed.
1384 */
1385 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1386
1387 do {
1388 bp.b_cur_off = iov_offset;
1389 bp.b_cur_iov = cur_iov;
1390
1391 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1392 if (IS_ERR(page)) {
1393 ret = PTR_ERR(page);
1394 goto out;
1395 }
1396
1397 copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1398 ocfs2_map_and_write_user_data,
1399 &bp);
1400
1401 ocfs2_put_write_source(&bp, page);
1402
1403 if (copied < 0) {
1404 mlog_errno(copied);
1405 ret = copied;
1406 goto out;
1407 }
1408
1409 total += copied;
1410 *ppos = *ppos + copied;
1411 count -= copied;
1412
1413 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1414 } while(count);
1415
1416out:
1417 return total ? total : ret;
1418}
1419
1420static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
1421 unsigned long *nr_segs)
1422{
1423 size_t ocount; /* original count */
1424 unsigned long seg;
1425
1426 ocount = 0;
1427 for (seg = 0; seg < *nr_segs; seg++) {
1428 const struct iovec *iv = &iov[seg];
1429
1430 /*
1431 * If any segment has a negative length, or the cumulative
1432 * length ever wraps negative then return -EINVAL.
1433 */
1434 ocount += iv->iov_len;
1435 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
1436 return -EINVAL;
1437 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1438 continue;
1439 if (seg == 0)
1440 return -EFAULT;
1441 *nr_segs = seg;
1442 ocount -= iv->iov_len; /* This segment is no good */
1443 break;
1444 }
1445
1446 *counted = ocount;
1447 return 0;
1448}
1449
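
The (ssize_t)(ocount | iv->iov_len) < 0 test in ocfs2_check_iovec() rejects, with one comparison, both a single segment whose length has the sign bit set and a running total that has wrapped into the sign bit. A standalone illustration (assumes size_t and ssize_t are the same width, as on Linux):

	#include <stdio.h>
	#include <sys/types.h>

	int main(void)
	{
		size_t iov_len = 100;             /* individually fine */
		size_t ocount = (size_t)-1 - 10;  /* running total wrapped */

		/* OR is negative iff either operand has the sign bit set */
		printf("%d\n", (ssize_t)(ocount | iov_len) < 0);  /* 1 */

		ocount = 100;                     /* sane running total */
		printf("%d\n", (ssize_t)(ocount | iov_len) < 0);  /* 0 */
		return 0;
	}
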
1144static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1450static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1145 const struct iovec *iov, 1451 const struct iovec *iov,
1146 unsigned long nr_segs, 1452 unsigned long nr_segs,
1147 loff_t pos) 1453 loff_t pos)
1148{ 1454{
1149 int ret, rw_level, have_alloc_sem = 0; 1455 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1150 struct file *filp = iocb->ki_filp; 1456 int can_do_direct, sync = 0;
1151 struct inode *inode = filp->f_path.dentry->d_inode; 1457 ssize_t written = 0;
1152 int appending = filp->f_flags & O_APPEND ? 1 : 0; 1458 size_t ocount; /* original count */
1153 1459 size_t count; /* after file limit checks */
1154 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1460 loff_t *ppos = &iocb->ki_pos;
1461 struct file *file = iocb->ki_filp;
1462 struct inode *inode = file->f_path.dentry->d_inode;
1463
1464 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1155 (unsigned int)nr_segs, 1465 (unsigned int)nr_segs,
1156 filp->f_path.dentry->d_name.len, 1466 file->f_path.dentry->d_name.len,
1157 filp->f_path.dentry->d_name.name); 1467 file->f_path.dentry->d_name.name);
1158 1468
1159 /* happy write of zero bytes */
1160 if (iocb->ki_left == 0) 1469 if (iocb->ki_left == 0)
1161 return 0; 1470 return 0;
1162 1471
1472 ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
1473 if (ret)
1474 return ret;
1475
1476 count = ocount;
1477
1478 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1479
1480 appending = file->f_flags & O_APPEND ? 1 : 0;
1481 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1482
1163 mutex_lock(&inode->i_mutex); 1483 mutex_lock(&inode->i_mutex);
1484
1485relock:
1164 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1486 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1165 if (filp->f_flags & O_DIRECT) { 1487 if (direct_io) {
1166 have_alloc_sem = 1;
1167 down_read(&inode->i_alloc_sem); 1488 down_read(&inode->i_alloc_sem);
1489 have_alloc_sem = 1;
1168 } 1490 }
1169 1491
1170 /* concurrent O_DIRECT writes are allowed */ 1492 /* concurrent O_DIRECT writes are allowed */
1171 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 1493 rw_level = !direct_io;
1172 ret = ocfs2_rw_lock(inode, rw_level); 1494 ret = ocfs2_rw_lock(inode, rw_level);
1173 if (ret < 0) { 1495 if (ret < 0) {
1174 rw_level = -1;
1175 mlog_errno(ret); 1496 mlog_errno(ret);
1176 goto out; 1497 goto out_sems;
1177 } 1498 }
1178 1499
1179 ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, 1500 can_do_direct = direct_io;
1180 iocb->ki_left, appending); 1501 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1502 iocb->ki_left, appending,
1503 &can_do_direct);
1181 if (ret < 0) { 1504 if (ret < 0) {
1182 mlog_errno(ret); 1505 mlog_errno(ret);
1183 goto out; 1506 goto out;
1184 } 1507 }
1185 1508
1186 /* communicate with ocfs2_dio_end_io */ 1509 /*
1187 ocfs2_iocb_set_rw_locked(iocb); 1510 * We can't complete the direct I/O as requested, fall back to
1511 * buffered I/O.
1512 */
1513 if (direct_io && !can_do_direct) {
1514 ocfs2_rw_unlock(inode, rw_level);
1515 up_read(&inode->i_alloc_sem);
1516
1517 have_alloc_sem = 0;
1518 rw_level = -1;
1188 1519
1189 ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); 1520 direct_io = 0;
1521 sync = 1;
1522 goto relock;
1523 }
1524
1525 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1526 sync = 1;
1527
1528 /*
1529 * XXX: Is it ok to execute these checks a second time?
1530 */
1531 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1532 if (ret)
1533 goto out;
1534
1535 /*
1536 * Set pos so that sync_page_range_nolock() below understands
1537 * where to start from. We might've moved it around via the
1538 * calls above. The range we want to actually sync starts from
1539 * *ppos here.
1540 *
1541 */
1542 pos = *ppos;
1543
1544 /* communicate with ocfs2_dio_end_io */
1545 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1546
1547 if (direct_io) {
1548 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1549 ppos, count, ocount);
1550 if (written < 0) {
1551 ret = written;
1552 goto out_dio;
1553 }
1554 } else {
1555 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1556 count, written);
1557 if (written < 0) {
1558 ret = written;
1559 if (ret != -EFAULT && ret != -ENOSPC)
1560 mlog_errno(ret);
1561 goto out;
1562 }
1563 }
1190 1564
1565out_dio:
1191 /* buffered aio wouldn't have proper lock coverage today */ 1566 /* buffered aio wouldn't have proper lock coverage today */
1192 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1567 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1193 1568
1194 /* 1569 /*
1195 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1570 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1207 } 1582 }
1208 1583
1209out: 1584out:
1585 if (rw_level != -1)
1586 ocfs2_rw_unlock(inode, rw_level);
1587
1588out_sems:
1210 if (have_alloc_sem) 1589 if (have_alloc_sem)
1211 up_read(&inode->i_alloc_sem); 1590 up_read(&inode->i_alloc_sem);
1212 if (rw_level != -1) 1591
1213 ocfs2_rw_unlock(inode, rw_level); 1592 if (written > 0 && sync) {
1593 ssize_t err;
1594
1595 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1596 if (err < 0)
1597 written = err;
1598 }
1599
1214 mutex_unlock(&inode->i_mutex); 1600 mutex_unlock(&inode->i_mutex);
1215 1601
1216 mlog_exit(ret); 1602 mlog_exit(ret);
1603 return written ? written : ret;
1604}
1605
1606static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1607 struct pipe_buffer *buf,
1608 struct splice_desc *sd)
1609{
1610 int ret, count, total = 0;
1611 ssize_t copied = 0;
1612 struct ocfs2_splice_write_priv sp;
1613
1614 ret = buf->ops->pin(pipe, buf);
1615 if (ret)
1616 goto out;
1617
1618 sp.s_sd = sd;
1619 sp.s_buf = buf;
1620 sp.s_pipe = pipe;
1621 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1622 sp.s_buf_offset = buf->offset;
1623
1624 count = sd->len;
1625 if (count + sp.s_offset > PAGE_CACHE_SIZE)
1626 count = PAGE_CACHE_SIZE - sp.s_offset;
1627
1628 do {
1629 /*
1630 * splice wants us to copy up to one page at a
1631 * time. For pagesize > cluster size, this means we
1632 * might enter ocfs2_buffered_write_cluster() more
1633 * than once, so keep track of our progress here.
1634 */
1635 copied = ocfs2_buffered_write_cluster(sd->file,
1636 (loff_t)sd->pos + total,
1637 count,
1638 ocfs2_map_and_write_splice_data,
1639 &sp);
1640 if (copied < 0) {
1641 mlog_errno(copied);
1642 ret = copied;
1643 goto out;
1644 }
1645
1646 count -= copied;
1647 sp.s_offset += copied;
1648 sp.s_buf_offset += copied;
1649 total += copied;
1650 } while (count);
1651
1652 ret = 0;
1653out:
1654
1655 return total ? total : ret;
1656}
1657
1658static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1659 struct file *out,
1660 loff_t *ppos,
1661 size_t len,
1662 unsigned int flags)
1663{
1664 int ret, err;
1665 struct address_space *mapping = out->f_mapping;
1666 struct inode *inode = mapping->host;
1667
1668 ret = __splice_from_pipe(pipe, out, ppos, len, flags,
1669 ocfs2_splice_write_actor);
1670 if (ret > 0) {
1671 *ppos += ret;
1672
1673 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1674 err = generic_osync_inode(inode, mapping,
1675 OSYNC_METADATA|OSYNC_DATA);
1676 if (err)
1677 ret = err;
1678 }
1679 }
1680
1217 return ret; 1681 return ret;
1218} 1682}
1219 1683
@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1239 goto out; 1703 goto out;
1240 } 1704 }
1241 1705
1242 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); 1706 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1707 NULL);
1243 if (ret < 0) { 1708 if (ret < 0) {
1244 mlog_errno(ret); 1709 mlog_errno(ret);
1245 goto out_unlock; 1710 goto out_unlock;
1246 } 1711 }
1247 1712
1248 /* ok, we're done with i_size and alloc work */ 1713 /* ok, we're done with i_size and alloc work */
1249 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1714 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
1250 1715
1251out_unlock: 1716out_unlock:
1252 ocfs2_rw_unlock(inode, 1); 1717 ocfs2_rw_unlock(inode, 1);
@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1323 } 1788 }
1324 rw_level = 0; 1789 rw_level = 0;
1325 /* communicate with ocfs2_dio_end_io */ 1790 /* communicate with ocfs2_dio_end_io */
1326 ocfs2_iocb_set_rw_locked(iocb); 1791 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1327 } 1792 }
1328 1793
1329 /* 1794 /*
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index cc973f01f6ce..2c4460fced52 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start,
42 u32 clusters_to_add, 43 u32 clusters_to_add,
43 struct buffer_head *fe_bh, 44 struct buffer_head *fe_bh,
44 handle_t *handle, 45 handle_t *handle,
45 struct ocfs2_alloc_context *data_ac, 46 struct ocfs2_alloc_context *data_ac,
46 struct ocfs2_alloc_context *meta_ac, 47 struct ocfs2_alloc_context *meta_ac,
47 enum ocfs2_alloc_restarted *reason); 48 enum ocfs2_alloc_restarted *reason);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add,
51 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac);
48int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
49int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 54int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
50 struct kstat *stat); 55 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f2b98c..21a605079c62 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode)
89 inode->i_flags |= S_DIRSYNC; 89 inode->i_flags |= S_DIRSYNC;
90} 90}
91 91
92struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
93 u64 blkno,
94 int delete_vote)
95{
96 struct ocfs2_find_inode_args args;
97
98 /* ocfs2_ilookup_for_vote should *only* be called from the
99 * vote thread */
100 BUG_ON(current != osb->vote_task);
101
102 args.fi_blkno = blkno;
103 args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
104 if (delete_vote)
105 args.fi_flags |= OCFS2_FI_FLAG_DELETE;
106 args.fi_ino = ino_from_blkno(osb->sb, blkno);
107 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
108}
109
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 92struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
111{ 93{
112 struct inode *inode = NULL; 94 struct inode *inode = NULL;
@@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
182 if (oi->ip_blkno != args->fi_blkno) 164 if (oi->ip_blkno != args->fi_blkno)
183 goto bail; 165 goto bail;
184 166
185 /* OCFS2_FI_FLAG_NOWAIT is *only* set from
186 * ocfs2_ilookup_for_vote which won't create an inode for one
187 * that isn't found. The vote thread which doesn't want to get
188 * an inode which is in the process of going away - otherwise
189 * the call to __wait_on_freeing_inode in find_inode_fast will
190 * cause it to deadlock on an inode which may be waiting on a
191 * vote (or lock release) in delete_inode */
192 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
193 (inode->i_state & (I_FREEING|I_CLEAR))) {
194 /* As stated above, we're not going to return an
195 * inode. In the case of a delete vote, the voting
196 * code is going to signal the other node to go
197 * ahead. Mark that state here, so this freeing inode
198 * has the state when it gets to delete_inode. */
199 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
200 spin_lock(&oi->ip_lock);
201 ocfs2_mark_inode_remotely_deleted(inode);
202 spin_unlock(&oi->ip_lock);
203 }
204 goto bail;
205 }
206
207 ret = 1; 167 ret = 1;
208bail: 168bail:
209 mlog_exit(ret); 169 mlog_exit(ret);
@@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
261 goto bail; 221 goto bail;
262 } 222 }
263 223
224 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
225 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
226
264 inode->i_version = 1; 227 inode->i_version = 1;
265 inode->i_generation = le32_to_cpu(fe->i_generation); 228 inode->i_generation = le32_to_cpu(fe->i_generation);
266 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 229 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
272 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) 235 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
273 inode->i_blocks = 0; 236 inode->i_blocks = 0;
274 else 237 else
275 inode->i_blocks = 238 inode->i_blocks = ocfs2_inode_sector_count(inode);
276 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
277 inode->i_mapping->a_ops = &ocfs2_aops; 239 inode->i_mapping->a_ops = &ocfs2_aops;
278 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 240 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
279 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 241 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
288 (unsigned long long)OCFS2_I(inode)->ip_blkno, 250 (unsigned long long)OCFS2_I(inode)->ip_blkno,
289 (unsigned long long)fe->i_blkno); 251 (unsigned long long)fe->i_blkno);
290 252
291 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
292 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
293 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
294
295 inode->i_nlink = le16_to_cpu(fe->i_links_count); 253 inode->i_nlink = le16_to_cpu(fe->i_links_count);
296 254
297 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 255 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
@@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
347 305
348 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 306 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
349 OCFS2_LOCK_TYPE_META, 0, inode); 307 OCFS2_LOCK_TYPE_META, 0, inode);
308
309 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
310 OCFS2_LOCK_TYPE_OPEN, 0, inode);
350 } 311 }
351 312
352 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 313 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
421 * cluster lock before trusting anything anyway. 382 * cluster lock before trusting anything anyway.
422 */ 383 */
423 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 384 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
424 && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) 385 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
425 && !ocfs2_mount_local(osb); 386 && !ocfs2_mount_local(osb);
426 387
427 /* 388 /*
@@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
438 OCFS2_LOCK_TYPE_META, 399 OCFS2_LOCK_TYPE_META,
439 generation, inode); 400 generation, inode);
440 401
402 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
403 OCFS2_LOCK_TYPE_OPEN,
404 0, inode);
405
441 if (can_lock) { 406 if (can_lock) {
407 status = ocfs2_open_lock(inode);
408 if (status) {
409 make_bad_inode(inode);
410 mlog_errno(status);
411 return status;
412 }
442 status = ocfs2_meta_lock(inode, NULL, 0); 413 status = ocfs2_meta_lock(inode, NULL, 0);
443 if (status) { 414 if (status) {
444 make_bad_inode(inode); 415 make_bad_inode(inode);
@@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
447 } 418 }
448 } 419 }
449 420
421 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
422 status = ocfs2_try_open_lock(inode, 0);
423 if (status) {
424 make_bad_inode(inode);
425 return status;
426 }
427 }
428
450 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, 429 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
451 can_lock ? inode : NULL); 430 can_lock ? inode : NULL);
452 if (status < 0) { 431 if (status < 0) {
@@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
507 struct buffer_head *fe_bh) 486 struct buffer_head *fe_bh)
508{ 487{
509 int status = 0; 488 int status = 0;
510 handle_t *handle = NULL;
511 struct ocfs2_truncate_context *tc = NULL; 489 struct ocfs2_truncate_context *tc = NULL;
512 struct ocfs2_dinode *fe; 490 struct ocfs2_dinode *fe;
491 handle_t *handle = NULL;
513 492
514 mlog_entry_void(); 493 mlog_entry_void();
515 494
516 fe = (struct ocfs2_dinode *) fe_bh->b_data; 495 fe = (struct ocfs2_dinode *) fe_bh->b_data;
517 496
518 /* zero allocation, zero truncate :) */ 497 if (fe->i_clusters) {
519 if (!fe->i_clusters) 498 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
520 goto bail; 499 if (IS_ERR(handle)) {
500 status = PTR_ERR(handle);
501 mlog_errno(status);
502 goto out;
503 }
521 504
522 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 505 status = ocfs2_journal_access(handle, inode, fe_bh,
523 if (IS_ERR(handle)) { 506 OCFS2_JOURNAL_ACCESS_WRITE);
524 status = PTR_ERR(handle); 507 if (status < 0) {
525 handle = NULL; 508 mlog_errno(status);
526 mlog_errno(status); 509 goto out;
527 goto bail; 510 }
528 }
529 511
530 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); 512 i_size_write(inode, 0);
531 if (status < 0) {
532 mlog_errno(status);
533 goto bail;
534 }
535 513
536 ocfs2_commit_trans(osb, handle); 514 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
537 handle = NULL; 515 if (status < 0) {
516 mlog_errno(status);
517 goto out;
518 }
538 519
539 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 520 ocfs2_commit_trans(osb, handle);
540 if (status < 0) { 521 handle = NULL;
541 mlog_errno(status);
542 goto bail;
543 }
544 522
545 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); 523 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
546 if (status < 0) { 524 if (status < 0) {
547 mlog_errno(status); 525 mlog_errno(status);
548 goto bail; 526 goto out;
527 }
528
529 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
530 if (status < 0) {
531 mlog_errno(status);
532 goto out;
533 }
549 } 534 }
550bail: 535
536out:
551 if (handle) 537 if (handle)
552 ocfs2_commit_trans(osb, handle); 538 ocfs2_commit_trans(osb, handle);
553
554 mlog_exit(status); 539 mlog_exit(status);
555 return status; 540 return status;
556} 541}
@@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
678 struct inode *orphan_dir_inode = NULL; 663 struct inode *orphan_dir_inode = NULL;
679 struct buffer_head *orphan_dir_bh = NULL; 664 struct buffer_head *orphan_dir_bh = NULL;
680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 665 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
666 struct ocfs2_dinode *di;
681 667
682 /* We've already voted on this so it should be readonly - no 668 di = (struct ocfs2_dinode *) di_bh->b_data;
683 * spinlock needed. */ 669 orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
684 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
685 670
686 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 671 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
687 if (status) 672 if (status)
@@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
839 goto bail; 824 goto bail;
840 } 825 }
841 826
842 status = ocfs2_request_delete_vote(inode); 827 /*
843 /* -EBUSY means that other nodes are still using the 828 * This is how ocfs2 determines whether an inode is still live
844 * inode. We're done here though, so avoid doing anything on 829 * within the cluster. Every node takes a shared read lock on
845 * disk and let them worry about deleting it. */ 830 * the inode open lock in ocfs2_read_locked_inode(). When we
846 if (status == -EBUSY) { 831 * get to ->delete_inode(), each node tries to convert its
832 * lock to an exclusive. Trylocks are serialized by the inode
833 * meta data lock. If the upconvert succeeds, we know the inode
834 * is no longer live and can be deleted.
835 *
836 * Though we call this with the meta data lock held, the
837 * trylock keeps us from ABBA deadlock.
838 */
839 status = ocfs2_try_open_lock(inode, 1);
840 if (status == -EAGAIN) {
847 status = 0; 841 status = 0;
848 mlog(0, "Skipping delete of %llu because it is in use on" 842 mlog(0, "Skipping delete of %llu because it is in use on"
849 "other nodes\n", (unsigned long long)oi->ip_blkno); 843 "other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
854 goto bail; 848 goto bail;
855 } 849 }
856 850
857 spin_lock(&oi->ip_lock); 851 *wipe = 1;
858 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { 852 mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
859 /* Nobody knew which slot this inode was orphaned 853 (unsigned long long)oi->ip_blkno,
860 * into. This may happen during node death and 854 le16_to_cpu(di->i_orphaned_slot));
861 * recovery knows how to clean it up so we can safely
862 * ignore this inode for now on. */
863 mlog(0, "Nobody knew where inode %llu was orphaned!\n",
864 (unsigned long long)oi->ip_blkno);
865 } else {
866 *wipe = 1;
867
868 mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
869 (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
870 }
871 spin_unlock(&oi->ip_lock);
872 855
873bail: 856bail:
874 return status; 857 return status;
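
The comment in this hunk compresses the new liveness protocol into prose; the lock life cycle it describes, using only functions added or referenced by this patch, looks roughly like this (editorial sketch, not a hunk):

	/* ocfs2_read_locked_inode(): every node holds a shared open lock */
	status = ocfs2_open_lock(inode);	/* PR, held until clear_inode */

	/* ocfs2_query_inode_wipe(): nodes race in ->delete_inode() */
	status = ocfs2_try_open_lock(inode, 1);	/* try upconvert to EX */
	if (status == -EAGAIN) {
		/* some node still holds PR: inode is live, skip the wipe */
	} else if (status == 0) {
		/* EX granted: nobody else has it open, safe to wipe */
	}

	/* ocfs2_clear_inode(): drop whichever open lock we ended up with */
	ocfs2_open_unlock(inode);
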
@@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode)
1001 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 984 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1002 "Inode=%lu\n", inode->i_ino); 985 "Inode=%lu\n", inode->i_ino);
1003 986
987 /* The open lock replaces the old delete_inode vote: it was taken
988 * at iget time, so drop the PR and EX open locks here. */
989 ocfs2_open_unlock(inode);
990
1004 /* Do these before all the other work so that we don't bounce 991 /* Do these before all the other work so that we don't bounce
1005 * the vote thread while waiting to destroy the locks. */ 992 * the vote thread while waiting to destroy the locks. */
1006 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 993 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1007 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 994 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
1008 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); 995 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
996 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1009 997
1010 /* We very well may get a clear_inode before all of an inode's 998
1011 * metadata has hit disk. Of course, we can't drop any cluster 999 * metadata has hit disk. Of course, we can't drop any cluster
@@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode)
1020 "Clear inode of %llu, inode has io markers\n", 1008 "Clear inode of %llu, inode has io markers\n",
1021 (unsigned long long)oi->ip_blkno); 1009 (unsigned long long)oi->ip_blkno);
1022 1010
1023 ocfs2_extent_map_drop(inode, 0); 1011 ocfs2_extent_map_trunc(inode, 0);
1024 ocfs2_extent_map_init(inode);
1025 1012
1026 status = ocfs2_drop_inode_locks(inode); 1013 status = ocfs2_drop_inode_locks(inode);
1027 if (status < 0) 1014 if (status < 0)
@@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode)
1030 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1017 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1031 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1018 ocfs2_lock_res_free(&oi->ip_meta_lockres);
1032 ocfs2_lock_res_free(&oi->ip_data_lockres); 1019 ocfs2_lock_res_free(&oi->ip_data_lockres);
1020 ocfs2_lock_res_free(&oi->ip_open_lockres);
1033 1021
1034 ocfs2_metadata_cache_purge(inode); 1022 ocfs2_metadata_cache_purge(inode);
1035 1023
@@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode)
1086 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", 1074 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
1087 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); 1075 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1088 1076
1089 /* Testing ip_orphaned_slot here wouldn't work because we may
1090 * not have gotten a delete_inode vote from any other nodes
1091 * yet. */
1092 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1077 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1093 generic_delete_inode(inode); 1078 generic_delete_inode(inode);
1094 else 1079 else
@@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
1121 return NULL; 1106 return NULL;
1122 } 1107 }
1123 1108
1124 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, 1109 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
1125 &p_blkno, NULL); 1110 NULL);
1126 if (tmperr < 0) { 1111 if (tmperr < 0) {
1127 mlog_errno(tmperr); 1112 mlog_errno(tmperr);
1128 goto fail; 1113 goto fail;
@@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1259 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) 1244 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1260 inode->i_blocks = 0; 1245 inode->i_blocks = 0;
1261 else 1246 else
1262 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); 1247 inode->i_blocks = ocfs2_inode_sector_count(inode);
1263 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 1248 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
1264 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 1249 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
1265 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 1250 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd2945b34..03ae075869ee 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,6 +26,8 @@
26#ifndef OCFS2_INODE_H 26#ifndef OCFS2_INODE_H
27#define OCFS2_INODE_H 27#define OCFS2_INODE_H
28 28
29#include "extent_map.h"
30
29/* OCFS2 Inode Private Data */ 31/* OCFS2 Inode Private Data */
30struct ocfs2_inode_info 32struct ocfs2_inode_info
31{ 33{
@@ -34,6 +36,7 @@ struct ocfs2_inode_info
34 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
35 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_meta_lockres;
36 struct ocfs2_lock_res ip_data_lockres; 38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres;
37 40
38 /* protects allocation changes on this inode. */ 41 /* protects allocation changes on this inode. */
39 struct rw_semaphore ip_alloc_sem; 42 struct rw_semaphore ip_alloc_sem;
@@ -42,9 +45,7 @@ struct ocfs2_inode_info
42 spinlock_t ip_lock; 45 spinlock_t ip_lock;
43 u32 ip_open_count; 46 u32 ip_open_count;
44 u32 ip_clusters; 47 u32 ip_clusters;
45 struct ocfs2_extent_map ip_map;
46 struct list_head ip_io_markers; 48 struct list_head ip_io_markers;
47 int ip_orphaned_slot;
48 49
49 struct mutex ip_io_mutex; 50 struct mutex ip_io_mutex;
50 51
@@ -64,6 +65,8 @@ struct ocfs2_inode_info
64 65
65 struct ocfs2_caching_info ip_metadata_cache; 66 struct ocfs2_caching_info ip_metadata_cache;
66 67
68 struct ocfs2_extent_map ip_extent_map;
69
67 struct inode vfs_inode; 70 struct inode vfs_inode;
68}; 71};
69 72
@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
117void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
118 121
119/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
120#define OCFS2_FI_FLAG_NOWAIT 0x1 123#define OCFS2_FI_FLAG_SYSFILE 0x4
121#define OCFS2_FI_FLAG_DELETE 0x2 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
122#define OCFS2_FI_FLAG_SYSFILE 0x4
123#define OCFS2_FI_FLAG_NOLOCK 0x8
124struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
125struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
126 u64 blkno,
127 int delete_vote);
128int ocfs2_inode_init_private(struct inode *inode); 126int ocfs2_inode_init_private(struct inode *inode);
129int ocfs2_inode_revalidate(struct dentry *dentry); 127int ocfs2_inode_revalidate(struct dentry *dentry);
130int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 128int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144 142
145void ocfs2_set_inode_flags(struct inode *inode); 143void ocfs2_set_inode_flags(struct inode *inode);
146 144
145static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
146{
147 int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
148
149 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
150}
151
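
ocfs2_inode_sector_count() shifts the cluster count straight into 512-byte sectors. A worked example (illustrative, assuming 4KB clusters):

	#include <stdio.h>

	int main(void)
	{
		unsigned int clustersize_bits = 12; /* 4KB clusters, assumed */
		unsigned int ip_clusters = 5;       /* hypothetical inode */
		int c_to_s_bits = (int)clustersize_bits - 9;

		/* 5 clusters * 8 sectors per cluster = 40 sectors */
		printf("%u sectors\n", ip_clusters << c_to_s_bits);
		return 0;
	}
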
147#endif /* OCFS2_INODE_H */ 152#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0ae1b4c..5a8a90d1c787 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -649,29 +649,20 @@ bail:
649static int ocfs2_force_read_journal(struct inode *inode) 649static int ocfs2_force_read_journal(struct inode *inode)
650{ 650{
651 int status = 0; 651 int status = 0;
652 int i, p_blocks; 652 int i;
653 u64 v_blkno, p_blkno; 653 u64 v_blkno, p_blkno, p_blocks, num_blocks;
654#define CONCURRENT_JOURNAL_FILL 32 654#define CONCURRENT_JOURNAL_FILL 32ULL
655 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 655 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
656 656
657 mlog_entry_void(); 657 mlog_entry_void();
658 658
659 BUG_ON(inode->i_blocks !=
660 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
661
662 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 659 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
663 660
664 mlog(0, "Force reading %llu blocks\n", 661 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
665 (unsigned long long)(inode->i_blocks >>
666 (inode->i_sb->s_blocksize_bits - 9)));
667
668 v_blkno = 0; 662 v_blkno = 0;
669 while (v_blkno < 663 while (v_blkno < num_blocks) {
670 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
671
672 status = ocfs2_extent_map_get_blocks(inode, v_blkno, 664 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
673 1, &p_blkno, 665 &p_blkno, &p_blocks, NULL);
674 &p_blocks);
675 if (status < 0) { 666 if (status < 0) {
676 mlog_errno(status); 667 mlog_errno(status);
677 goto bail; 668 goto bail;
@@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1306 continue; 1297 continue;
1307 1298
1308 iter = ocfs2_iget(osb, le64_to_cpu(de->inode), 1299 iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1309 OCFS2_FI_FLAG_NOLOCK); 1300 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1310 if (IS_ERR(iter)) 1301 if (IS_ERR(iter))
1311 continue; 1302 continue;
1312 1303
@@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1418 /* Set the proper information to get us going into 1409 /* Set the proper information to get us going into
1419 * ocfs2_delete_inode. */ 1410 * ocfs2_delete_inode. */
1420 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 1411 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1421 oi->ip_orphaned_slot = slot;
1422 spin_unlock(&oi->ip_lock); 1412 spin_unlock(&oi->ip_lock);
1423 1413
1424 iput(inode); 1414 iput(inode);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f27757..3db5de4506da 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
390 /* We may be deleting metadata blocks, so metadata alloc dinode + 390 /* We may be deleting metadata blocks, so metadata alloc dinode +
391 one desc. block for each possible delete. */ 391 one desc. block for each possible delete. */
392 if (tree_depth && next_free == 1 && 392 if (tree_depth && next_free == 1 &&
393 le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) 393 ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
394 credits += 1 + tree_depth; 394 credits += 1 + tree_depth;
395 395
396 /* update to the truncate log. */ 396 /* update to the truncate log. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 51b020447683..af01158b39f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
85 int ret = 0, lock_level = 0; 85 int ret = 0, lock_level = 0;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
87 87
88 /* We don't want to support shared writable mappings yet. */ 88 /*
89 if (!ocfs2_mount_local(osb) && 89 * Only support shared writeable mmap for local mounts which
90 * don't know about holes.
91 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
90 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
91 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
92 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 28dd757ff67d..2bcf353fd7c5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
175 175
176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
177 if (IS_ERR(inode)) { 177 if (IS_ERR(inode)) {
178 mlog(ML_ERROR, "Unable to create inode %llu\n",
179 (unsigned long long)blkno);
180 ret = ERR_PTR(-EACCES); 178 ret = ERR_PTR(-EACCES);
181 goto bail_unlock; 179 goto bail_unlock;
182 } 180 }
@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
189 * unlink. */ 187 * unlink. */
190 spin_lock(&oi->ip_lock); 188 spin_lock(&oi->ip_lock);
191 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; 189 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
192 oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
193 spin_unlock(&oi->ip_lock); 190 spin_unlock(&oi->ip_lock);
194 191
195bail_add: 192bail_add:
@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
288 285
289 i_size_write(inode, inode->i_sb->s_blocksize); 286 i_size_write(inode, inode->i_sb->s_blocksize);
290 inode->i_nlink = 2; 287 inode->i_nlink = 2;
291 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); 288 inode->i_blocks = ocfs2_inode_sector_count(inode);
292 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 289 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
293 if (status < 0) { 290 if (status < 0) {
294 mlog_errno(status); 291 mlog_errno(status);
@@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1486 struct buffer_head **bhs = NULL; 1483 struct buffer_head **bhs = NULL;
1487 const char *c; 1484 const char *c;
1488 struct super_block *sb = osb->sb; 1485 struct super_block *sb = osb->sb;
1489 u64 p_blkno; 1486 u64 p_blkno, p_blocks;
1490 int p_blocks;
1491 int virtual, blocks, status, i, bytes_left; 1487 int virtual, blocks, status, i, bytes_left;
1492 1488
1493 bytes_left = i_size_read(inode) + 1; 1489 bytes_left = i_size_read(inode) + 1;
@@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1514 goto bail; 1510 goto bail;
1515 } 1511 }
1516 1512
1517 status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, 1513 status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
1518 &p_blocks); 1514 NULL);
1519 if (status < 0) { 1515 if (status < 0) {
1520 mlog_errno(status); 1516 mlog_errno(status);
1521 goto bail; 1517 goto bail;
@@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir,
1674 inode->i_rdev = 0; 1670 inode->i_rdev = 0;
1675 newsize = l - 1; 1671 newsize = l - 1;
1676 if (l > ocfs2_fast_symlink_chars(sb)) { 1672 if (l > ocfs2_fast_symlink_chars(sb)) {
1673 u32 offset = 0;
1674
1677 inode->i_op = &ocfs2_symlink_inode_operations; 1675 inode->i_op = &ocfs2_symlink_inode_operations;
1678 status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, 1676 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
1677 new_fe_bh,
1679 handle, data_ac, NULL, 1678 handle, data_ac, NULL,
1680 NULL); 1679 NULL);
1681 if (status < 0) { 1680 if (status < 0) {
@@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir,
1689 goto bail; 1688 goto bail;
1690 } 1689 }
1691 i_size_write(inode, newsize); 1690 i_size_write(inode, newsize);
1692 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); 1691 inode->i_blocks = ocfs2_inode_sector_count(inode);
1693 } else { 1692 } else {
1694 inode->i_op = &ocfs2_fast_symlink_inode_operations; 1693 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1695 memcpy((char *) fe->id2.i_symlink, symname, l); 1694 memcpy((char *) fe->id2.i_symlink, symname, l);
@@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2222 /* Record which orphan dir our inode now resides 2221 /* Record which orphan dir our inode now resides
2223 * in. delete_inode will use this to determine which orphan 2222 * in. delete_inode will use this to determine which orphan
2224 * dir to lock. */ 2223 * dir to lock. */
2225 spin_lock(&OCFS2_I(inode)->ip_lock); 2224 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2226 OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
2227 spin_unlock(&OCFS2_I(inode)->ip_lock);
2228 2225
2229 mlog(0, "Inode %llu orphaned in slot %d\n", 2226 mlog(0, "Inode %llu orphaned in slot %d\n",
2230 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 2227 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index db8e77cd35d3..82cc92dcf8a6 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -46,11 +46,6 @@
46#include "endian.h" 46#include "endian.h"
47#include "ocfs2_lockid.h" 47#include "ocfs2_lockid.h"
48 48
49struct ocfs2_extent_map {
50 u32 em_clusters;
51 struct rb_root em_extents;
52};
53
54/* Most user visible OCFS2 inodes will have very few pieces of 49/* Most user visible OCFS2 inodes will have very few pieces of
55 * metadata, but larger files (including bitmaps, etc) must be taken 50 * metadata, but larger files (including bitmaps, etc) must be taken
56 * into account when designing an access scheme. We allow a small 51 * into account when designing an access scheme. We allow a small
@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
303 return 1; 298 return 1;
304} 299}
305 300
301static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
302{
303 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
304 return 1;
305 return 0;
306}
307
306/* set / clear functions because cluster events can make these happen 308/* set / clear functions because cluster events can make these happen
307 * in parallel so we want the transitions to be atomic. this also 309 * in parallel so we want the transitions to be atomic. this also
308 * means that any future flags osb_flags must be protected by spinlock 310 * means that any future flags osb_flags must be protected by spinlock
@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
461 return (unsigned long)((bytes + 511) >> 9); 463 return (unsigned long)((bytes + 511) >> 9);
462} 464}
463 465
466static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
467 unsigned long pg_index)
468{
469 u32 clusters = pg_index;
470 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
471
472 if (unlikely(PAGE_CACHE_SHIFT > cbits))
473 clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
474 else if (PAGE_CACHE_SHIFT < cbits)
475 clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
476
477 return clusters;
478}
479
480/*
481 * Find the 1st page index which covers the given clusters.
482 */
483static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
484 u32 clusters)
485{
486 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
487 unsigned long index = clusters;
488
489 if (PAGE_CACHE_SHIFT > cbits) {
490 index = clusters >> (PAGE_CACHE_SHIFT - cbits);
491 } else if (PAGE_CACHE_SHIFT < cbits) {
492 index = clusters << (cbits - PAGE_CACHE_SHIFT);
493 }
494
495 return index;
496}
497
498static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
499{
500 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
501 unsigned int pages_per_cluster = 1;
502
503 if (PAGE_CACHE_SHIFT < cbits)
504 pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
505
506 return pages_per_cluster;
507}
508
464#define ocfs2_set_bit ext2_set_bit 509#define ocfs2_set_bit ext2_set_bit
465#define ocfs2_clear_bit ext2_clear_bit 510#define ocfs2_clear_bit ext2_clear_bit
466#define ocfs2_test_bit ext2_test_bit 511#define ocfs2_test_bit ext2_test_bit
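
The three helpers added to ocfs2.h above convert between page cache indexes and cluster offsets by comparing PAGE_CACHE_SHIFT with the per-volume cluster size in bits. A stand-alone sketch of the same shift arithmetic, assuming 4 KB pages (shift 12) and 64 KB clusters (16 bits) purely for illustration:

#include <stdio.h>

int main(void)
{
	/* Assumed example geometry: 4 KB pages, 64 KB clusters. */
	unsigned int page_shift = 12;	/* PAGE_CACHE_SHIFT on most arches */
	unsigned int cbits = 16;	/* s_clustersize_bits */
	unsigned long pg_index = 32;
	unsigned int clusters = pg_index;

	/* Mirrors the logic of ocfs2_page_index_to_clusters() above. */
	if (page_shift > cbits)
		clusters = pg_index << (page_shift - cbits);
	else if (page_shift < cbits)
		clusters = pg_index >> (cbits - page_shift);

	/* 16 pages per 64 KB cluster, so page 32 starts cluster 2. */
	printf("page %lu -> cluster %u (%u pages per cluster)\n",
	       pg_index, clusters, 1u << (cbits - page_shift));
	return 0;
}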
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218f5e0b..71306479c68f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask) 86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
87 87
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
90#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
91 92
92/* 93/*
@@ -155,6 +156,12 @@
155#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 156#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
156 157
157/* 158/*
159 * Extent record flags (e_node.leaf.flags)
160 */
161#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
162 * unwritten */
163
164/*
158 * ioctl commands 165 * ioctl commands
159 */ 166 */
160#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 167#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
@@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
282/* 289/*
283 * On disk extent record for OCFS2 290 * On disk extent record for OCFS2
284 * It describes a range of clusters on disk. 291 * It describes a range of clusters on disk.
292 *
293 * Length fields are divided into interior and leaf node versions.
294 * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
285 */ 295 */
286struct ocfs2_extent_rec { 296struct ocfs2_extent_rec {
287/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ 297/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
288 __le32 e_clusters; /* Clusters covered by this extent */ 298 union {
299 __le32 e_int_clusters; /* Clusters covered by all children */
300 struct {
301 __le16 e_leaf_clusters; /* Clusters covered by this
302 extent */
303 __u8 e_reserved1;
304 __u8 e_flags; /* Extent flags */
305 };
306 };
289 __le64 e_blkno; /* Physical disk offset, in blocks */ 307 __le64 e_blkno; /* Physical disk offset, in blocks */
290/*10*/ 308/*10*/
291}; 309};
@@ -311,7 +329,10 @@ struct ocfs2_extent_list {
311/*00*/ __le16 l_tree_depth; /* Extent tree depth from this 329/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
312 point. 0 means data extents 330 point. 0 means data extents
313 hang directly off this 331 hang directly off this
314 header (a leaf) */ 332 header (a leaf)
333 NOTE: The high 8 bits cannot be
334 used - tree_depth is never that big.
335 */
315 __le16 l_count; /* Number of extent records */ 336 __le16 l_count; /* Number of extent records */
316 __le16 l_next_free_rec; /* Next unused extent slot */ 337 __le16 l_next_free_rec; /* Next unused extent slot */
317 __le16 l_reserved1; 338 __le16 l_reserved1;
@@ -446,7 +467,9 @@ struct ocfs2_dinode {
446 __le32 i_ctime_nsec; 467 __le32 i_ctime_nsec;
447 __le32 i_mtime_nsec; 468 __le32 i_mtime_nsec;
448 __le32 i_attr; 469 __le32 i_attr;
449 __le32 i_reserved1; 470 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
471 was set in i_flags */
472 __le16 i_reserved1;
450/*70*/ __le64 i_reserved2[8]; 473/*70*/ __le64 i_reserved2[8];
451/*B8*/ union { 474/*B8*/ union {
452 __le64 i_pad1; /* Generic way to refer to this 475 __le64 i_pad1; /* Generic way to refer to this
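
With the ocfs2_extent_rec union above, the meaning of the length bytes now depends on the tree depth of the list holding the record: interior nodes use e_int_clusters, leaves use e_leaf_clusters plus e_flags. A hedged user-space mirror of the layout with a hypothetical rec_clusters() helper (endianness conversion omitted; real code must use le32_to_cpu()/le16_to_cpu()):

#include <stdint.h>
#include <stdio.h>

/* User-space mirror of the new on-disk record, for illustration only. */
struct extent_rec {
	uint32_t e_cpos;			/* file offset, in clusters */
	union {
		uint32_t e_int_clusters;	/* interior nodes */
		struct {
			uint16_t e_leaf_clusters;	/* leaves */
			uint8_t  e_reserved1;
			uint8_t  e_flags;	/* OCFS2_EXT_* flags */
		};
	};
	uint64_t e_blkno;			/* physical offset, in blocks */
} __attribute__((packed));

/* Hypothetical helper: pick the right length field by tree depth. */
static uint32_t rec_clusters(uint16_t tree_depth, const struct extent_rec *rec)
{
	return tree_depth ? rec->e_int_clusters : rec->e_leaf_clusters;
}

int main(void)
{
	struct extent_rec rec = { .e_cpos = 0, .e_blkno = 100 };

	rec.e_leaf_clusters = 8;
	/* The union keeps the record at 16 bytes, matching the old layout. */
	printf("record size %zu, leaf clusters %u\n",
	       sizeof(rec), rec_clusters(0, &rec));
	return 0;
}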
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c185..4ca02b1c38ac 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
44 OCFS2_LOCK_TYPE_RENAME, 44 OCFS2_LOCK_TYPE_RENAME,
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN,
47 OCFS2_NUM_LOCK_TYPES 48 OCFS2_NUM_LOCK_TYPES
48}; 49};
49 50
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
69 case OCFS2_LOCK_TYPE_DENTRY: 70 case OCFS2_LOCK_TYPE_DENTRY:
70 c = 'N'; 71 c = 'N';
71 break; 72 break;
73 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O';
75 break;
72 default: 76 default:
73 c = '\0'; 77 c = '\0';
74 } 78 }
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
85 * important job it does, anyway. */ 89 * important job it does, anyway. */
86 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 90 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
87 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open",
88}; 93};
89 94
90static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 2d3ac32cb74e..d921a28329dc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
197 goto bail; 197 goto bail;
198 } 198 }
199 199
200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); 200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
201 if (status < 0) { 201 if (status < 0) {
202 mlog_errno(status); 202 mlog_errno(status);
203 goto bail; 203 goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6dbb11762759..0da655ae5d6f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
381 le32_to_cpu(fe->i_clusters))); 381 le32_to_cpu(fe->i_clusters)));
382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
384 alloc_inode->i_blocks = 384 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
385 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
386 385
387 status = 0; 386 status = 0;
388bail: 387bail:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92424dd..5c9e8243691f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
806 806
807 ocfs2_print_version(); 807 ocfs2_print_version();
808 808
809 if (init_ocfs2_extent_maps())
810 return -ENOMEM;
811
812 status = init_ocfs2_uptodate_cache(); 809 status = init_ocfs2_uptodate_cache();
813 if (status < 0) { 810 if (status < 0) {
814 mlog_errno(status); 811 mlog_errno(status);
@@ -837,7 +834,6 @@ leave:
837 if (status < 0) { 834 if (status < 0) {
838 ocfs2_free_mem_caches(); 835 ocfs2_free_mem_caches();
839 exit_ocfs2_uptodate_cache(); 836 exit_ocfs2_uptodate_cache();
840 exit_ocfs2_extent_maps();
841 } 837 }
842 838
843 mlog_exit(status); 839 mlog_exit(status);
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
863 859
864 unregister_filesystem(&ocfs2_fs_type); 860 unregister_filesystem(&ocfs2_fs_type);
865 861
866 exit_ocfs2_extent_maps();
867
868 exit_ocfs2_uptodate_cache(); 862 exit_ocfs2_uptodate_cache();
869 863
870 mlog_exit_void(); 864 mlog_exit_void();
@@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data,
963 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 957 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
964 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 958 ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
965 ocfs2_lock_res_init_once(&oi->ip_data_lockres); 959 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
960 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
966 961
967 ocfs2_metadata_cache_init(&oi->vfs_inode); 962 ocfs2_metadata_cache_init(&oi->vfs_inode);
968 963
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index f30e63b9910c..4f82a2f0efef 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
63 __be32 h_node_num; /* node sending this particular message. */ 63 __be32 h_node_num; /* node sending this particular message. */
64}; 64};
65 65
66/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
67 * for the network. */
68#define OCFS2_VOTE_FILENAME_LEN 256
69struct ocfs2_vote_msg 66struct ocfs2_vote_msg
70{ 67{
71 struct ocfs2_msg_hdr v_hdr; 68 struct ocfs2_msg_hdr v_hdr;
72 union { 69 __be32 v_reserved1;
73 __be32 v_generic1;
74 __be32 v_orphaned_slot; /* Used during delete votes */
75 __be32 v_nlink; /* Used during unlink votes */
76 } md1; /* Message type dependant 1 */
77}; 70};
78 71
79/* Responses are given these values to maintain backwards 72/* Responses are given these values to maintain backwards
@@ -86,7 +79,6 @@ struct ocfs2_response_msg
86{ 79{
87 struct ocfs2_msg_hdr r_hdr; 80 struct ocfs2_msg_hdr r_hdr;
88 __be32 r_response; 81 __be32 r_response;
89 __be32 r_orphaned_slot;
90}; 82};
91 83
92struct ocfs2_vote_work { 84struct ocfs2_vote_work {
@@ -96,7 +88,6 @@ struct ocfs2_vote_work {
96 88
97enum ocfs2_vote_request { 89enum ocfs2_vote_request {
98 OCFS2_VOTE_REQ_INVALID = 0, 90 OCFS2_VOTE_REQ_INVALID = 0,
99 OCFS2_VOTE_REQ_DELETE,
100 OCFS2_VOTE_REQ_MOUNT, 91 OCFS2_VOTE_REQ_MOUNT,
101 OCFS2_VOTE_REQ_UMOUNT, 92 OCFS2_VOTE_REQ_UMOUNT,
102 OCFS2_VOTE_REQ_LAST 93 OCFS2_VOTE_REQ_LAST
@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
151 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); 142 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
152} 143}
153 144
154void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
155{
156 struct ocfs2_inode_info *oi = OCFS2_I(inode);
157
158 assert_spin_locked(&oi->ip_lock);
159 /* We set the SKIP_DELETE flag on the inode so we don't try to
160 * delete it in delete_inode ourselves, thus avoiding
161 * unnecessary lock pinging. If the other node failed to wipe
162 * the inode as a result of a crash, then recovery will pick
163 * up the slack. */
164 oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
165}
166
167static int ocfs2_process_delete_request(struct inode *inode,
168 int *orphaned_slot)
169{
170 int response = OCFS2_RESPONSE_BUSY;
171
172 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
173 inode->i_ino, inode->i_nlink, *orphaned_slot);
174
175 spin_lock(&OCFS2_I(inode)->ip_lock);
176
177 /* Whatever our vote response is, we want to make sure that
178 * the orphaned slot is recorded properly on this node *and*
179 * on the requesting node. Technically, if the requesting node
180 * did not know which slot the inode is orphaned in but we
181 * respond with BUSY he doesn't actually need the orphaned
182 * slot, but it doesn't hurt to do it here anyway. */
183 if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
184 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
185 OCFS2_INVALID_SLOT &&
186 OCFS2_I(inode)->ip_orphaned_slot !=
187 (*orphaned_slot),
188 "Inode %llu: This node thinks it's "
189 "orphaned in slot %d, messaged it's in %d\n",
190 (unsigned long long)OCFS2_I(inode)->ip_blkno,
191 OCFS2_I(inode)->ip_orphaned_slot,
192 *orphaned_slot);
193
194 mlog(0, "Setting orphaned slot for inode %llu to %d\n",
195 (unsigned long long)OCFS2_I(inode)->ip_blkno,
196 *orphaned_slot);
197
198 OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
199 } else {
200 mlog(0, "Sending back orphaned slot %d for inode %llu\n",
201 OCFS2_I(inode)->ip_orphaned_slot,
202 (unsigned long long)OCFS2_I(inode)->ip_blkno);
203
204 *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
205 }
206
207 /* vote no if the file is still open. */
208 if (OCFS2_I(inode)->ip_open_count) {
209 mlog(0, "open count = %u\n",
210 OCFS2_I(inode)->ip_open_count);
211 spin_unlock(&OCFS2_I(inode)->ip_lock);
212 goto done;
213 }
214 spin_unlock(&OCFS2_I(inode)->ip_lock);
215
216 /* directories are a bit ugly... What if someone is sitting in
217 * it? We want to make sure the inode is removed completely as
218 * a result of the iput in process_vote. */
219 if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
220 mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
221 goto done;
222 }
223
224 if (filemap_fdatawrite(inode->i_mapping)) {
225 mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
226 (unsigned long long)OCFS2_I(inode)->ip_blkno);
227 goto done;
228 }
229 sync_mapping_buffers(inode->i_mapping);
230 truncate_inode_pages(inode->i_mapping, 0);
231 ocfs2_extent_map_trunc(inode, 0);
232
233 spin_lock(&OCFS2_I(inode)->ip_lock);
234 /* double check open count - someone might have raced this
235 * thread into ocfs2_file_open while we were writing out
236 * data. If we're to allow a wipe of this inode now, we *must*
237 * hold the spinlock until we've marked it. */
238 if (OCFS2_I(inode)->ip_open_count) {
239 mlog(0, "Raced to wipe! open count = %u\n",
240 OCFS2_I(inode)->ip_open_count);
241 spin_unlock(&OCFS2_I(inode)->ip_lock);
242 goto done;
243 }
244
245 /* Mark the inode as being wiped from disk. */
246 ocfs2_mark_inode_remotely_deleted(inode);
247 spin_unlock(&OCFS2_I(inode)->ip_lock);
248
249 /* Not sure this is necessary anymore. */
250 d_prune_aliases(inode);
251
252 /* If we get here, then we're voting 'yes', so commit the
253 * delete on our side. */
254 response = OCFS2_RESPONSE_OK;
255done:
256 return response;
257}
258
259static void ocfs2_process_vote(struct ocfs2_super *osb, 145static void ocfs2_process_vote(struct ocfs2_super *osb,
260 struct ocfs2_vote_msg *msg) 146 struct ocfs2_vote_msg *msg)
261{ 147{
262 int net_status, vote_response; 148 int net_status, vote_response;
263 int orphaned_slot = 0; 149 unsigned int node_num;
264 unsigned int node_num, generation;
265 u64 blkno; 150 u64 blkno;
266 enum ocfs2_vote_request request; 151 enum ocfs2_vote_request request;
267 struct inode *inode = NULL;
268 struct ocfs2_msg_hdr *hdr = &msg->v_hdr; 152 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
269 struct ocfs2_response_msg response; 153 struct ocfs2_response_msg response;
270 154
271 /* decode the network mumbo jumbo into local variables. */ 155 /* decode the network mumbo jumbo into local variables. */
272 request = be32_to_cpu(hdr->h_request); 156 request = be32_to_cpu(hdr->h_request);
273 blkno = be64_to_cpu(hdr->h_blkno); 157 blkno = be64_to_cpu(hdr->h_blkno);
274 generation = be32_to_cpu(hdr->h_generation);
275 node_num = be32_to_cpu(hdr->h_node_num); 158 node_num = be32_to_cpu(hdr->h_node_num);
276 if (request == OCFS2_VOTE_REQ_DELETE)
277 orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
278 159
279 mlog(0, "processing vote: request = %u, blkno = %llu, " 160 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
280 "generation = %u, node_num = %u, priv1 = %u\n", request, 161 request, (unsigned long long)blkno, node_num);
281 (unsigned long long)blkno, generation, node_num,
282 be32_to_cpu(msg->md1.v_generic1));
283 162
284 if (!ocfs2_is_valid_vote_request(request)) { 163 if (!ocfs2_is_valid_vote_request(request)) {
285 mlog(ML_ERROR, "Invalid vote request %d from node %u\n", 164 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
302 break; 181 break;
303 } 182 }
304 183
305 /* We cannot process the remaining message types before we're
306 * fully mounted. It's perfectly safe however to send a 'yes'
307 * response as we can't possibly have any of the state they're
308 * asking us to modify yet. */
309 if (atomic_read(&osb->vol_state) == VOLUME_INIT)
310 goto respond;
311
312 /* If we get here, then the request is against an inode. */
313 inode = ocfs2_ilookup_for_vote(osb, blkno,
314 request == OCFS2_VOTE_REQ_DELETE);
315
316 /* Not finding the inode is perfectly valid - it means we're
317 * not interested in what the other node is about to do to it
318 * so in those cases we automatically respond with an
319 * affirmative. Cluster locking ensures that we won't race
320 * interest in the inode with this vote request. */
321 if (!inode)
322 goto respond;
323
324 /* Check generation values. It's possible for us to get a
325 * request against a stale inode. If so then we proceed as if
326 * we had not found an inode in the first place. */
327 if (inode->i_generation != generation) {
328 mlog(0, "generation passed %u != inode generation = %u, "
329 "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
330 "message type = %u\n", generation, inode->i_generation,
331 OCFS2_I(inode)->ip_flags,
332 (unsigned long long)OCFS2_I(inode)->ip_blkno,
333 (unsigned long long)blkno, atomic_read(&inode->i_count),
334 request);
335 iput(inode);
336 inode = NULL;
337 goto respond;
338 }
339
340 switch (request) {
341 case OCFS2_VOTE_REQ_DELETE:
342 vote_response = ocfs2_process_delete_request(inode,
343 &orphaned_slot);
344 break;
345 default:
346 mlog(ML_ERROR, "node %u, invalid request: %u\n",
347 node_num, request);
348 vote_response = OCFS2_RESPONSE_BAD_MSG;
349 }
350
351respond: 184respond:
352 /* Response structure is small so we just put it on the stack 185
353 * and stuff it inline. */ 186 * and stuff it inline. */
@@ -357,7 +190,6 @@ respond:
357 response.r_hdr.h_generation = hdr->h_generation; 190 response.r_hdr.h_generation = hdr->h_generation;
358 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); 191 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
359 response.r_response = cpu_to_be32(vote_response); 192 response.r_response = cpu_to_be32(vote_response);
360 response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
361 193
362 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, 194 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
363 osb->net_key, 195 osb->net_key,
@@ -373,9 +205,6 @@ respond:
373 && net_status != -ENOTCONN) 205 && net_status != -ENOTCONN)
374 mlog(ML_ERROR, "message to node %u fails with error %d!\n", 206 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
375 node_num, net_status); 207 node_num, net_status);
376
377 if (inode)
378 iput(inode);
379} 208}
380 209
381static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) 210static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@@ -634,8 +463,7 @@ bail:
634static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, 463static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
635 u64 blkno, 464 u64 blkno,
636 unsigned int generation, 465 unsigned int generation,
637 enum ocfs2_vote_request type, 466 enum ocfs2_vote_request type)
638 u32 priv)
639{ 467{
640 struct ocfs2_vote_msg *request; 468 struct ocfs2_vote_msg *request;
641 struct ocfs2_msg_hdr *hdr; 469 struct ocfs2_msg_hdr *hdr;
@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
651 hdr->h_request = cpu_to_be32(type); 479 hdr->h_request = cpu_to_be32(type);
652 hdr->h_blkno = cpu_to_be64(blkno); 480 hdr->h_blkno = cpu_to_be64(blkno);
653 hdr->h_generation = cpu_to_be32(generation); 481 hdr->h_generation = cpu_to_be32(generation);
654
655 request->md1.v_generic1 = cpu_to_be32(priv);
656 } 482 }
657 483
658 return request; 484 return request;
@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
664 struct ocfs2_vote_msg *request, 490 struct ocfs2_vote_msg *request,
665 struct ocfs2_net_response_cb *callback) 491 struct ocfs2_net_response_cb *callback)
666{ 492{
667 int status, response; 493 int status, response = -EBUSY;
668 unsigned int response_id; 494 unsigned int response_id;
669 struct ocfs2_msg_hdr *hdr; 495 struct ocfs2_msg_hdr *hdr;
670 496
@@ -686,109 +512,12 @@ bail:
686 return status; 512 return status;
687} 513}
688 514
689static int ocfs2_request_vote(struct inode *inode,
690 struct ocfs2_vote_msg *request,
691 struct ocfs2_net_response_cb *callback)
692{
693 int status;
694 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
695
696 if (ocfs2_inode_is_new(inode))
697 return 0;
698
699 status = -EAGAIN;
700 while (status == -EAGAIN) {
701 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
702 signal_pending(current))
703 return -ERESTARTSYS;
704
705 status = ocfs2_super_lock(osb, 0);
706 if (status < 0) {
707 mlog_errno(status);
708 break;
709 }
710
711 status = 0;
712 if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
713 osb->node_num))
714 status = ocfs2_do_request_vote(osb, request, callback);
715
716 ocfs2_super_unlock(osb, 0);
717 }
718 return status;
719}
720
721static void ocfs2_delete_response_cb(void *priv,
722 struct ocfs2_response_msg *resp)
723{
724 int orphaned_slot, node;
725 struct inode *inode = priv;
726
727 orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
728 node = be32_to_cpu(resp->r_hdr.h_node_num);
729 mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
730 node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
731 orphaned_slot);
732
733 /* The other node may not actually know which slot the inode
734 * is orphaned in. */
735 if (orphaned_slot == OCFS2_INVALID_SLOT)
736 return;
737
738 /* Ok, the responding node knows which slot this inode is
739 * orphaned in. We verify that the information is correct and
740 * then record this in the inode. ocfs2_delete_inode will use
741 * this information to determine which lock to take. */
742 spin_lock(&OCFS2_I(inode)->ip_lock);
743 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
744 OCFS2_I(inode)->ip_orphaned_slot
745 != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
746 "orphaned in slot %d, we think it's in %d\n",
747 (unsigned long long)OCFS2_I(inode)->ip_blkno,
748 be32_to_cpu(resp->r_hdr.h_node_num),
749 orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
750
751 OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
752 spin_unlock(&OCFS2_I(inode)->ip_lock);
753}
754
755int ocfs2_request_delete_vote(struct inode *inode)
756{
757 int orphaned_slot, status;
758 struct ocfs2_net_response_cb delete_cb;
759 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
760 struct ocfs2_vote_msg *request;
761
762 spin_lock(&OCFS2_I(inode)->ip_lock);
763 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
764 spin_unlock(&OCFS2_I(inode)->ip_lock);
765
766 delete_cb.rc_cb = ocfs2_delete_response_cb;
767 delete_cb.rc_priv = inode;
768
769 mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
770 (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
771
772 status = -ENOMEM;
773 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
774 inode->i_generation,
775 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
776 if (request) {
777 status = ocfs2_request_vote(inode, request, &delete_cb);
778
779 kfree(request);
780 }
781
782 return status;
783}
784
785int ocfs2_request_mount_vote(struct ocfs2_super *osb) 515int ocfs2_request_mount_vote(struct ocfs2_super *osb)
786{ 516{
787 int status; 517 int status;
788 struct ocfs2_vote_msg *request = NULL; 518 struct ocfs2_vote_msg *request = NULL;
789 519
790 request = ocfs2_new_vote_request(osb, 0ULL, 0, 520 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
791 OCFS2_VOTE_REQ_MOUNT, 0);
792 if (!request) { 521 if (!request) {
793 status = -ENOMEM; 522 status = -ENOMEM;
794 goto bail; 523 goto bail;
@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
821 int status; 550 int status;
822 struct ocfs2_vote_msg *request = NULL; 551 struct ocfs2_vote_msg *request = NULL;
823 552
824 request = ocfs2_new_vote_request(osb, 0ULL, 0, 553 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
825 OCFS2_VOTE_REQ_UMOUNT, 0);
826 if (!request) { 554 if (!request) {
827 status = -ENOMEM; 555 status = -ENOMEM;
828 goto bail; 556 goto bail;
@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
969 be32_to_cpu(work->w_msg.v_hdr.h_generation)); 697 be32_to_cpu(work->w_msg.v_hdr.h_generation));
970 mlog(0, "h_node_num = %u\n", 698 mlog(0, "h_node_num = %u\n",
971 be32_to_cpu(work->w_msg.v_hdr.h_node_num)); 699 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
972 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
973 700
974 spin_lock(&osb->vote_task_lock); 701 spin_lock(&osb->vote_task_lock);
975 list_add_tail(&work->w_list, &osb->vote_list); 702 list_add_tail(&work->w_list, &osb->vote_list);
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
index 53ebc1c69e56..9ea46f62de31 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
38 wake_up(&osb->vote_event); 38 wake_up(&osb->vote_event);
39} 39}
40 40
41int ocfs2_request_delete_vote(struct inode *inode);
42int ocfs2_request_mount_vote(struct ocfs2_super *osb); 41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
43int ocfs2_request_umount_vote(struct ocfs2_super *osb); 42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
44int ocfs2_register_net_handlers(struct ocfs2_super *osb); 43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
45void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); 44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
46 45
47void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
48
49void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, 46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
50 int node_num); 47 int node_num);
51#endif 48#endif
diff --git a/fs/sync.c b/fs/sync.c
index d0feff61e6aa..5cb9e7e43383 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -239,13 +239,11 @@ out:
239/* 239/*
240 * `endbyte' is inclusive 240 * `endbyte' is inclusive
241 */ 241 */
242int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, 242int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
243 unsigned int flags) 243 loff_t endbyte, unsigned int flags)
244{ 244{
245 int ret; 245 int ret;
246 struct address_space *mapping;
247 246
248 mapping = file->f_mapping;
249 if (!mapping) { 247 if (!mapping) {
250 ret = -EINVAL; 248 ret = -EINVAL;
251 goto out; 249 goto out;
@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
275out: 273out:
276 return ret; 274 return ret;
277} 275}
278EXPORT_SYMBOL_GPL(do_sync_file_range); 276EXPORT_SYMBOL_GPL(do_sync_mapping_range);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 86ec3f4a7da6..095a9c9a64fb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
843extern int fcntl_getlease(struct file *filp); 843extern int fcntl_getlease(struct file *filp);
844 844
845/* fs/sync.c */ 845/* fs/sync.c */
846extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, 846extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
847 unsigned int flags); 847 loff_t endbyte, unsigned int flags);
848static inline int do_sync_file_range(struct file *file, loff_t offset,
849 loff_t endbyte, unsigned int flags)
850{
851 return do_sync_mapping_range(file->f_mapping, offset, endbyte, flags);
852}
848 853
849/* fs/locks.c */ 854/* fs/locks.c */
850extern void locks_init_lock(struct file_lock *); 855extern void locks_init_lock(struct file_lock *);
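
do_sync_mapping_range() lets code that holds only an address_space (as ocfs2 now does) start and wait on writeback without a struct file; the old do_sync_file_range() becomes the inline wrapper above. A minimal kernel-side sketch, assuming the existing SYNC_FILE_RANGE_* flags; flush_first_page() is an invented name:

#include <linux/fs.h>

/* Write out and wait on the first 4 KB of an inode's pages ('endbyte'
 * is inclusive, hence 4095). A sketch, not part of the patch itself. */
static int flush_first_page(struct inode *inode)
{
	return do_sync_mapping_range(inode->i_mapping, 0, 4095,
				     SYNC_FILE_RANGE_WAIT_BEFORE |
				     SYNC_FILE_RANGE_WRITE |
				     SYNC_FILE_RANGE_WAIT_AFTER);
}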
diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h
new file mode 100644
index 000000000000..3d967b6b120a
--- /dev/null
+++ b/include/linux/mtd/ubi.h
@@ -0,0 +1,202 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21#ifndef __LINUX_UBI_H__
22#define __LINUX_UBI_H__
23
24#include <asm/ioctl.h>
25#include <linux/types.h>
26#include <mtd/ubi-user.h>
27
28/*
29 * UBI data type hint constants.
30 *
31 * UBI_LONGTERM: long-term data
32 * UBI_SHORTTERM: short-term data
33 * UBI_UNKNOWN: data persistence is unknown
34 *
35 * These constants are used when data is written to UBI volumes in order to
36 * help the UBI wear-leveling unit to find more appropriate physical
37 * eraseblocks.
38 */
39enum {
40 UBI_LONGTERM = 1,
41 UBI_SHORTTERM,
42 UBI_UNKNOWN
43};
44
45/*
46 * enum ubi_open_mode - UBI volume open mode constants.
47 *
48 * UBI_READONLY: read-only mode
49 * UBI_READWRITE: read-write mode
50 * UBI_EXCLUSIVE: exclusive mode
51 */
52enum {
53 UBI_READONLY = 1,
54 UBI_READWRITE,
55 UBI_EXCLUSIVE
56};
57
58/**
59 * struct ubi_volume_info - UBI volume description data structure.
60 * @vol_id: volume ID
61 * @ubi_num: UBI device number this volume belongs to
62 * @size: how many physical eraseblocks are reserved for this volume
63 * @used_bytes: how many bytes of data this volume contains
64 * @used_ebs: how many physical eraseblocks of this volume actually contain any
65 * data
66 * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
67 * @corrupted: non-zero if the volume is corrupted (static volumes only)
68 * @upd_marker: non-zero if the volume has update marker set
69 * @alignment: volume alignment
70 * @usable_leb_size: how many bytes are available in logical eraseblocks of
71 * this volume
72 * @name_len: volume name length
73 * @name: volume name
74 * @cdev: UBI volume character device major and minor numbers
75 *
76 * The @corrupted flag is only relevant to static volumes and is always zero
77 * for dynamic ones. This is because UBI does not care about dynamic volume
78 * data protection and only cares about protecting static volume data.
79 *
80 * The @upd_marker flag is set if the volume update operation was interrupted.
81 * Before touching the volume data during the update operation, UBI first sets
82 * the update marker flag for this volume. If the update operation is then
83 * interrupted, the update marker indicates this. If the update marker
84 * is set, the contents of the volume are certainly damaged and a new volume
85 * update operation has to be started.
86 *
87 * To put it differently, @corrupted and @upd_marker fields have different
88 * semantics:
89 * o the @corrupted flag means that this static volume is corrupted for some
90 * reason, but not because of an interrupted volume update
91 * o the @upd_marker field means that the volume is damaged because of an
92 * interrupted update operation.
93 *
94 * I.e., the @corrupted flag is never set if the @upd_marker flag is set.
95 *
96 * The @used_bytes and @used_ebs fields are only really needed for static
97 * volumes and contain the number of bytes stored in this static volume and how
98 * many eraseblocks this data occupies. In case of dynamic volumes, the
99 * @used_bytes field is equivalent to @size*@usable_leb_size, and the @used_ebs
100 * field is equivalent to @size.
101 *
102 * In general, logical eraseblock size is a property of the UBI device, not
103 * of the UBI volume. Indeed, the logical eraseblock size depends on the
104 * physical eraseblock size and on how many bytes UBI headers consume. But
105 * because of the volume alignment (@alignment), the usable size of logical
106 * eraseblocks of a volume may be less. The following equation is true:
107 * @usable_leb_size = LEB size - (LEB size mod @alignment),
108 * where LEB size is the logical eraseblock size defined by the UBI device.
109 *
110 * The alignment is a multiple of the minimal flash input/output unit size,
111 * or %1 if all the available space is used.
112 *
113 * To put this differently, alignment may be considered a way to change
114 * volume logical eraseblock sizes.
115 */
116struct ubi_volume_info {
117 int ubi_num;
118 int vol_id;
119 int size;
120 long long used_bytes;
121 int used_ebs;
122 int vol_type;
123 int corrupted;
124 int upd_marker;
125 int alignment;
126 int usable_leb_size;
127 int name_len;
128 const char *name;
129 dev_t cdev;
130};
131
132/**
133 * struct ubi_device_info - UBI device description data structure.
134 * @ubi_num: ubi device number
135 * @leb_size: logical eraseblock size on this UBI device
136 * @min_io_size: minimal I/O unit size
137 * @ro_mode: if this device is in read-only mode
138 * @cdev: UBI character device major and minor numbers
139 *
140 * Note, @leb_size is the logical eraseblock size offered by the UBI device.
141 * Volumes of this UBI device may have smaller logical eraseblock size if their
142 * alignment is not equivalent to %1.
143 */
144struct ubi_device_info {
145 int ubi_num;
146 int leb_size;
147 int min_io_size;
148 int ro_mode;
149 dev_t cdev;
150};
151
152/* UBI descriptor given to users when they open UBI volumes */
153struct ubi_volume_desc;
154
155int ubi_get_device_info(int ubi_num, struct ubi_device_info *di);
156void ubi_get_volume_info(struct ubi_volume_desc *desc,
157 struct ubi_volume_info *vi);
158struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode);
159struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name,
160 int mode);
161void ubi_close_volume(struct ubi_volume_desc *desc);
162int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
163 int len, int check);
164int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
165 int offset, int len, int dtype);
166int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
167 int len, int dtype);
168int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum);
169int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum);
170int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum);
171
172/*
173 * This function is the same as the 'ubi_leb_read()' function, but it does not
174 * provide the checking capability.
175 */
176static inline int ubi_read(struct ubi_volume_desc *desc, int lnum, char *buf,
177 int offset, int len)
178{
179 return ubi_leb_read(desc, lnum, buf, offset, len, 0);
180}
181
182/*
183 * This function is the same as the 'ubi_leb_write()' function, but it does
184 * not have the data type argument.
185 */
186static inline int ubi_write(struct ubi_volume_desc *desc, int lnum,
187 const void *buf, int offset, int len)
188{
189 return ubi_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN);
190}
191
192/*
193 * This function is the same as the 'ubi_leb_change()' function, but it does
194 * not have the data type argument.
195 */
196static inline int ubi_change(struct ubi_volume_desc *desc, int lnum,
197 const void *buf, int len)
198{
199 return ubi_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
200}
201
202#endif /* !__LINUX_UBI_H__ */
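
Taken together, the declarations above form the whole in-kernel UBI client API: open a volume by number or name, operate on logical eraseblocks, close it. A hedged sketch of a client module writing one LEB, assuming UBI device 0 and volume 0 already exist (everything except the ubi_* calls is illustrative):

#include <linux/err.h>
#include <linux/mtd/ubi.h>

static int ubi_hello_write(void)
{
	static const char buf[] = "hello";
	struct ubi_volume_desc *desc;
	int err;

	/* ubi_open_volume() is assumed to return ERR_PTR() on failure. */
	desc = ubi_open_volume(0, 0, UBI_READWRITE);
	if (IS_ERR(desc))
		return PTR_ERR(desc);

	/* Replace the contents of LEB 0; the change is intended to be
	 * atomic with respect to unclean reboots. */
	err = ubi_leb_change(desc, 0, buf, sizeof(buf), UBI_UNKNOWN);

	ubi_close_volume(desc);
	return err;
}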
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 49fe2997a016..a1707583de49 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -196,13 +196,13 @@ extern void init_idle(struct task_struct *idle, int cpu);
196extern cpumask_t nohz_cpu_mask; 196extern cpumask_t nohz_cpu_mask;
197 197
198/* 198/*
199 * Only dump TASK_* tasks. (-1 for all tasks) 199 * Only dump TASK_* tasks. (0 for all tasks)
200 */ 200 */
201extern void show_state_filter(unsigned long state_filter); 201extern void show_state_filter(unsigned long state_filter);
202 202
203static inline void show_state(void) 203static inline void show_state(void)
204{ 204{
205 show_state_filter(-1); 205 show_state_filter(0);
206} 206}
207 207
208extern void show_regs(struct pt_regs *); 208extern void show_regs(struct pt_regs *);
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 52c9eb9b6df2..26e4925bc35b 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -61,10 +61,10 @@ static inline void write_seqlock(seqlock_t *sl)
61{ 61{
62 spin_lock(&sl->lock); 62 spin_lock(&sl->lock);
63 ++sl->sequence; 63 ++sl->sequence;
64 smp_wmb(); 64 smp_wmb();
65} 65}
66 66
67static inline void write_sequnlock(seqlock_t *sl) 67static inline void write_sequnlock(seqlock_t *sl)
68{ 68{
69 smp_wmb(); 69 smp_wmb();
70 sl->sequence++; 70 sl->sequence++;
@@ -77,7 +77,7 @@ static inline int write_tryseqlock(seqlock_t *sl)
77 77
78 if (ret) { 78 if (ret) {
79 ++sl->sequence; 79 ++sl->sequence;
80 smp_wmb(); 80 smp_wmb();
81 } 81 }
82 return ret; 82 return ret;
83} 83}
diff --git a/include/mtd/Kbuild b/include/mtd/Kbuild
index e0fe92b03a4e..4d46b3bdebd8 100644
--- a/include/mtd/Kbuild
+++ b/include/mtd/Kbuild
@@ -3,3 +3,5 @@ header-y += jffs2-user.h
3header-y += mtd-abi.h 3header-y += mtd-abi.h
4header-y += mtd-user.h 4header-y += mtd-user.h
5header-y += nftl-user.h 5header-y += nftl-user.h
6header-y += ubi-header.h
7header-y += ubi-user.h
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index 8e501a75a764..f71dac420394 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -24,6 +24,7 @@ struct mtd_oob_buf {
24#define MTD_NORFLASH 3 24#define MTD_NORFLASH 3
25#define MTD_NANDFLASH 4 25#define MTD_NANDFLASH 4
26#define MTD_DATAFLASH 6 26#define MTD_DATAFLASH 6
27#define MTD_UBIVOLUME 7
27 28
28#define MTD_WRITEABLE 0x400 /* Device is writeable */ 29#define MTD_WRITEABLE 0x400 /* Device is writeable */
29#define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */ 30#define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */
diff --git a/include/mtd/ubi-header.h b/include/mtd/ubi-header.h
new file mode 100644
index 000000000000..fa479c71aa34
--- /dev/null
+++ b/include/mtd/ubi-header.h
@@ -0,0 +1,360 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Authors: Artem Bityutskiy (Битюцкий Артём)
19 * Thomas Gleixner
20 * Frank Haverkamp
21 * Oliver Lohmann
22 * Andreas Arnez
23 */
24
25/*
26 * This file defines the layout of UBI headers and all the other UBI on-flash
27 * data structures. May be included by user-space.
28 */
29
30#ifndef __UBI_HEADER_H__
31#define __UBI_HEADER_H__
32
33#include <asm/byteorder.h>
34
35/* The version of UBI images supported by this implementation */
36#define UBI_VERSION 1
37
38/* The highest erase counter value supported by this implementation */
39#define UBI_MAX_ERASECOUNTER 0x7FFFFFFF
40
41/* The initial CRC32 value used when calculating CRC checksums */
42#define UBI_CRC32_INIT 0xFFFFFFFFU
43
44/* Erase counter header magic number (ASCII "UBI#") */
45#define UBI_EC_HDR_MAGIC 0x55424923
46/* Volume identifier header magic number (ASCII "UBI!") */
47#define UBI_VID_HDR_MAGIC 0x55424921
48
49/*
50 * Volume type constants used in the volume identifier header.
51 *
52 * @UBI_VID_DYNAMIC: dynamic volume
53 * @UBI_VID_STATIC: static volume
54 */
55enum {
56 UBI_VID_DYNAMIC = 1,
57 UBI_VID_STATIC = 2
58};
59
60/*
61 * Compatibility constants used by internal volumes.
62 *
63 * @UBI_COMPAT_DELETE: delete this internal volume before anything is written
64 * to the flash
65 * @UBI_COMPAT_RO: attach this device in read-only mode
66 * @UBI_COMPAT_PRESERVE: preserve this internal volume - do not touch its
67 * physical eraseblocks, don't allow the wear-leveling unit to move them
68 * @UBI_COMPAT_REJECT: reject this UBI image
69 */
70enum {
71 UBI_COMPAT_DELETE = 1,
72 UBI_COMPAT_RO = 2,
73 UBI_COMPAT_PRESERVE = 4,
74 UBI_COMPAT_REJECT = 5
75};
76
77/*
78 * ubi16_t/ubi32_t/ubi64_t - 16, 32, and 64-bit integers used in UBI on-flash
79 * data structures.
80 */
81typedef struct {
82 uint16_t int16;
83} __attribute__ ((packed)) ubi16_t;
84
85typedef struct {
86 uint32_t int32;
87} __attribute__ ((packed)) ubi32_t;
88
89typedef struct {
90 uint64_t int64;
91} __attribute__ ((packed)) ubi64_t;
92
93/*
94 * This implementation of UBI uses the big-endian format for on-flash
95 * integers. Below are the corresponding conversion macros.
96 */
97#define cpu_to_ubi16(x) ((ubi16_t){__cpu_to_be16(x)})
98#define ubi16_to_cpu(x) ((uint16_t)__be16_to_cpu((x).int16))
99
100#define cpu_to_ubi32(x) ((ubi32_t){__cpu_to_be32(x)})
101#define ubi32_to_cpu(x) ((uint32_t)__be32_to_cpu((x).int32))
102
103#define cpu_to_ubi64(x) ((ubi64_t){__cpu_to_be64(x)})
104#define ubi64_to_cpu(x) ((uint64_t)__be64_to_cpu((x).int64))
105
106/* Sizes of UBI headers */
107#define UBI_EC_HDR_SIZE sizeof(struct ubi_ec_hdr)
108#define UBI_VID_HDR_SIZE sizeof(struct ubi_vid_hdr)
109
110/* Sizes of UBI headers without the ending CRC */
111#define UBI_EC_HDR_SIZE_CRC (UBI_EC_HDR_SIZE - sizeof(ubi32_t))
112#define UBI_VID_HDR_SIZE_CRC (UBI_VID_HDR_SIZE - sizeof(ubi32_t))
113
114/**
115 * struct ubi_ec_hdr - UBI erase counter header.
116 * @magic: erase counter header magic number (%UBI_EC_HDR_MAGIC)
117 * @version: version of UBI implementation which is supposed to accept this
118 * UBI image
119 * @padding1: reserved for future, zeroes
120 * @ec: the erase counter
121 * @vid_hdr_offset: where the VID header starts
122 * @data_offset: where the user data start
123 * @padding2: reserved for future, zeroes
124 * @hdr_crc: erase counter header CRC checksum
125 *
126 * The erase counter header takes 64 bytes and has plenty of unused space for
127 * future usage. The unused fields are zeroed. The @version field is used to
128 * indicate the version of the UBI implementation which is supposed to be able
129 * to work with this UBI image. If @version is greater than the current UBI
130 * version, the image is rejected. This may be useful in future if something
131 * is changed radically. This field is duplicated in the volume identifier
132 * header.
133 *
134 * The @vid_hdr_offset and @data_offset fields contain the offset of the
135 * volume identifier header and user data, relative to the beginning of the
136 * physical eraseblock. These values have to be the same for all physical
137 * eraseblocks.
138 */
139struct ubi_ec_hdr {
140 ubi32_t magic;
141 uint8_t version;
142 uint8_t padding1[3];
143 ubi64_t ec; /* Warning: the current limit is 31-bit anyway! */
144 ubi32_t vid_hdr_offset;
145 ubi32_t data_offset;
146 uint8_t padding2[36];
147 ubi32_t hdr_crc;
148} __attribute__ ((packed));
149
150/**
151 * struct ubi_vid_hdr - on-flash UBI volume identifier header.
152 * @magic: volume identifier header magic number (%UBI_VID_HDR_MAGIC)
153 * @version: UBI implementation version which is supposed to accept this UBI
154 * image (%UBI_VERSION)
155 * @vol_type: volume type (%UBI_VID_DYNAMIC or %UBI_VID_STATIC)
156 * @copy_flag: if this logical eraseblock was copied from another physical
157 * eraseblock (for wear-leveling reasons)
158 * @compat: compatibility of this volume (%0, %UBI_COMPAT_DELETE,
159 * %UBI_COMPAT_RO, %UBI_COMPAT_PRESERVE, or %UBI_COMPAT_REJECT)
160 * @vol_id: ID of this volume
161 * @lnum: logical eraseblock number
162 * @leb_ver: version of this logical eraseblock (IMPORTANT: obsolete, to be
163 * removed, kept only to avoid breaking older UBI users)
164 * @data_size: how many bytes of data this logical eraseblock contains
165 * @used_ebs: total number of used logical eraseblocks in this volume
166 * @data_pad: how many bytes at the end of this physical eraseblock are not
167 * used
168 * @data_crc: CRC checksum of the data stored in this logical eraseblock
169 * @padding1: reserved for future, zeroes
170 * @sqnum: sequence number
171 * @padding2: reserved for future, zeroes
172 * @hdr_crc: volume identifier header CRC checksum
173 *
174 * The @sqnum is the value of the global sequence counter at the time when this
175 * VID header was created. The global sequence counter is incremented each time
176 * UBI writes a new VID header to the flash, i.e. when it maps a logical
177 * eraseblock to a new physical eraseblock. The global sequence counter is an
178 * unsigned 64-bit integer and we assume it never overflows. The @sqnum
179 * (sequence number) is used to distinguish between older and newer versions of
180 * logical eraseblocks.
181 *
182 * There are 2 situations when there may be more than one physical eraseblock
183 * corresponding to the same logical eraseblock, i.e., having the same @vol_id
184 * and @lnum values in the volume identifier header. Suppose we have a logical
185 * eraseblock L and it is mapped to the physical eraseblock P.
186 *
187 * 1. Because UBI may erase physical eraseblocks asynchronously, the following
188 * situation is possible: L is asynchronously erased, so P is scheduled for
189 * erasure, then L is written to, i.e., mapped to another physical eraseblock P1,
190 * so P1 is written to, then an unclean reboot happens. Result - there are 2
191 * physical eraseblocks P and P1 corresponding to the same logical eraseblock
192 * L. But P1 has a greater sequence number, so UBI picks P1 when it attaches the
193 * flash.
194 *
195 * 2. From time to time UBI moves logical eraseblocks to other physical
196 * eraseblocks for wear-leveling reasons. If, for example, UBI moves L from P
197 * to P1, and an unclean reboot happens before P is physically erased, there
198 * are two physical eraseblocks P and P1 corresponding to L and UBI has to
199 * select one of them when the flash is attached. The @sqnum field says which
200 * PEB is the original (obviously P will have lower @sqnum) and the copy. But
201 * it is not enough to select the physical eraseblock with the higher sequence
202 * number, because the unclean reboot could have happen in the middle of the
203 * copying process, so the data in P is corrupted. It is also not enough to
204 * just select the physical eraseblock with lower sequence number, because the
205 * data there may be old (consider a case if more data was added to P1 after
206 * the copying). Moreover, the unclean reboot may happen when the erasure of P
207 * was just started, so it result in unstable P, which is "mostly" OK, but
208 * still has unstable bits.
209 *
210 * UBI uses the @copy_flag field to indicate that this logical eraseblock is a
211 * copy. UBI also calculates data CRC when the data is moved and stores it at
212 * the @data_crc field of the copy (P1). So when UBI needs to pick one physical
213 * eraseblock of two (P or P1), the @copy_flag of the newer one (P1) is
214 * examined. If it is cleared, the situation is simple and the newer one is
215 * picked. If it is set, the data CRC of the copy (P1) is examined. If the CRC
216 * checksum is correct, this physical eraseblock is selected (P1). Otherwise
217 * the older one (P) is selected.
218 *
219 * Note, there is an obsolete @leb_ver field which was used instead of @sqnum
220 * in the past. But it is not used anymore and we keep it in order to be able
221 * to deal with old UBI images. It will be removed at some point.
222 *
223 * There are 2 sorts of volumes in UBI: user volumes and internal volumes.
224 * Internal volumes are not seen from outside and are used for various internal
225 * UBI purposes. In this implementation there is only one internal volume - the
226 * layout volume. Internal volumes are the main mechanism of UBI extensions.
227 * For example, in future one may introduce a journal internal volume. Internal
228 * volumes have their own reserved range of IDs.
229 *
230 * The @compat field is only used for internal volumes and contains the "degree
231 * of their compatibility". It is always zero for user volumes. This field
232 * provides a mechanism to introduce UBI extensions and to be still compatible
233 * with older UBI binaries. For example, if someone introduced a journal in
234 * future, he would probably use %UBI_COMPAT_DELETE compatibility for the
235 * journal volume. And in this case, older UBI binaries, which know nothing
236 * about the journal volume, would just delete this volume and work perfectly
237 * fine. This is similar to what Ext2fs does when it is fed by an Ext3fs image
238 * - it just ignores the Ext3fs journal.
239 *
240 * The @data_crc field contains the CRC checksum of the contents of the logical
241 * eraseblock if this is a static volume. In case of dynamic volumes, it does
242 * not contain the CRC checksum as a rule. The only exception is when the
243 * data of the physical eraseblock was moved by the wear-leveling unit, then
244 * the wear-leveling unit calculates the data CRC and stores it in the
245 * @data_crc field. And of course, the @copy_flag is %1 in this case.
246 *
247 * The @data_size field is used only for static volumes because UBI has to know
248 * how many bytes of data are stored in this eraseblock. For dynamic volumes,
249 * this field usually contains zero. The only exception is when the data of the
250 * physical eraseblock was moved to another physical eraseblock for
251 * wear-leveling reasons. In this case, UBI calculates CRC checksum of the
252 * contents and uses both @data_crc and @data_size fields. In this case, the
253 * @data_size field contains data size.
254 *
255 * The @used_ebs field is used only for static volumes and indicates how many
256 * eraseblocks the data of the volume takes. For dynamic volumes this field is
257 * not used and always contains zero.
258 *
259 * The @data_pad is calculated when volumes are created using the alignment
260 * parameter. So, effectively, the @data_pad field reduces the size of logical
261 * eraseblocks of this volume. This is very handy when one uses block-oriented
262 * software (say, cramfs) on top of the UBI volume.
263 */
264struct ubi_vid_hdr {
265 ubi32_t magic;
266 uint8_t version;
267 uint8_t vol_type;
268 uint8_t copy_flag;
269 uint8_t compat;
270 ubi32_t vol_id;
271 ubi32_t lnum;
272 ubi32_t leb_ver; /* obsolete, to be removed, don't use */
273 ubi32_t data_size;
274 ubi32_t used_ebs;
275 ubi32_t data_pad;
276 ubi32_t data_crc;
277 uint8_t padding1[4];
278 ubi64_t sqnum;
279 uint8_t padding2[12];
280 ubi32_t hdr_crc;
281} __attribute__ ((packed));
282
283/* Internal UBI volumes count */
284#define UBI_INT_VOL_COUNT 1
285
286/*
287 * Starting ID of internal volumes. There is reserved room for 4096 internal
288 * volumes.
289 */
290#define UBI_INTERNAL_VOL_START (0x7FFFFFFF - 4096)
291
292/* The layout volume contains the volume table */
293
294#define UBI_LAYOUT_VOL_ID UBI_INTERNAL_VOL_START
295#define UBI_LAYOUT_VOLUME_EBS 2
296#define UBI_LAYOUT_VOLUME_NAME "layout volume"
297#define UBI_LAYOUT_VOLUME_COMPAT UBI_COMPAT_REJECT
298
299/* The maximum number of volumes per one UBI device */
300#define UBI_MAX_VOLUMES 128
301
302/* The maximum volume name length */
303#define UBI_VOL_NAME_MAX 127
304
305/* Size of the volume table record */
306#define UBI_VTBL_RECORD_SIZE sizeof(struct ubi_vtbl_record)
307
308/* Size of the volume table record without the ending CRC */
309#define UBI_VTBL_RECORD_SIZE_CRC (UBI_VTBL_RECORD_SIZE - sizeof(ubi32_t))
310
311/**
312 * struct ubi_vtbl_record - a record in the volume table.
313 * @reserved_pebs: how many physical eraseblocks are reserved for this volume
314 * @alignment: volume alignment
315 * @data_pad: how many bytes are unused at the end of each physical
316 * eraseblock to satisfy the requested alignment
317 * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
318 * @upd_marker: if volume update was started but not finished
319 * @name_len: volume name length
320 * @name: the volume name
321 * @padding2: reserved, zeroes
322 * @crc: a CRC32 checksum of the record
323 *
324 * The volume table records are stored in the volume table, which is stored in
325 * the layout volume. The layout volume consists of 2 logical eraseblocks, each
326 * of which contains a copy of the volume table (i.e., the volume table is
327 * duplicated). The volume table is an array of &struct ubi_vtbl_record
328 * objects indexed by the volume ID.
329 *
330 * If the size of the logical eraseblock is large enough to fit
331 * %UBI_MAX_VOLUMES records, the volume table contains %UBI_MAX_VOLUMES
332 * records. Otherwise, it contains as many records as it can fit (i.e., size of
333 * logical eraseblock divided by sizeof(struct ubi_vtbl_record)).
334 *
335 * The @upd_marker flag is used to implement volume update. It is set to %1
336 * before update and set to %0 after the update. So if the update operation was
337 * interrupted, UBI knows that the volume is corrupted.
338 *
339 * The @alignment field is specified when the volume is created and cannot be
340 * later changed. It may be useful, for example, when a block-oriented file
341 * system works on top of UBI. The @data_pad field is calculated using the
342 * logical eraseblock size and @alignment. The alignment must be a multiple of the
343 * minimal flash I/O unit. If @alignment is 1, all the available space of
344 * the physical eraseblocks is used.
345 *
346 * Empty records contain all zeroes and the CRC checksum of those zeroes.
347 */
348struct ubi_vtbl_record {
349 ubi32_t reserved_pebs;
350 ubi32_t alignment;
351 ubi32_t data_pad;
352 uint8_t vol_type;
353 uint8_t upd_marker;
354 ubi16_t name_len;
355 uint8_t name[UBI_VOL_NAME_MAX+1];
356 uint8_t padding2[24];
357 ubi32_t crc;
358} __attribute__ ((packed));
359
360#endif /* !__UBI_HEADER_H__ */
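
Because ubi-header.h is exported to user space (see the Kbuild change above), the on-flash structures can be parsed straight out of a flash dump with the ubiNN_to_cpu() helpers. A small sketch that checks an erase counter header's magic; the dump file name is made up and CRC verification is omitted:

#include <stdint.h>
#include <stdio.h>
#include <mtd/ubi-header.h>

int main(void)
{
	struct ubi_ec_hdr ec;
	FILE *f = fopen("flash.img", "rb");	/* hypothetical dump */

	if (!f || fread(&ec, UBI_EC_HDR_SIZE, 1, f) != 1) {
		perror("flash.img");
		return 1;
	}
	if (ubi32_to_cpu(ec.magic) != UBI_EC_HDR_MAGIC) {
		fprintf(stderr, "no UBI erase counter header found\n");
		return 1;
	}
	/* Stored as 64 bits, but capped at UBI_MAX_ERASECOUNTER. */
	printf("erase counter: %llu\n",
	       (unsigned long long)ubi64_to_cpu(ec.ec));
	return 0;
}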
diff --git a/include/mtd/ubi-user.h b/include/mtd/ubi-user.h
new file mode 100644
index 000000000000..fe06ded0e6b8
--- /dev/null
+++ b/include/mtd/ubi-user.h
@@ -0,0 +1,161 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21#ifndef __UBI_USER_H__
22#define __UBI_USER_H__
23
24/*
25 * UBI volume creation
26 * ~~~~~~~~~~~~~~~~~~~
27 *
28 * UBI volumes are created via the %UBI_IOCMKVOL IOCTL command of UBI character
29 * device. A &struct ubi_mkvol_req object has to be properly filled and a
30 * pointer to it has to be passed to the IOCTL.
31 *
32 * UBI volume deletion
33 * ~~~~~~~~~~~~~~~~~~~
34 *
35 * To delete a volume, the %UBI_IOCRMVOL IOCTL command of the UBI character
36 * device should be used. A pointer to the 32-bit volume ID has to be passed
37 * to the IOCTL.
38 *
39 * UBI volume re-size
40 * ~~~~~~~~~~~~~~~~~~
41 *
42 * To re-size a volume, the %UBI_IOCRSVOL IOCTL command of the UBI character
43 * device should be used. A &struct ubi_rsvol_req object has to be properly
44 * filled and a pointer to it has to be passed to the IOCTL.
45 *
46 * UBI volume update
47 * ~~~~~~~~~~~~~~~~~
48 *
49 * Volume update should be done via the %UBI_IOCVOLUP IOCTL command of the
50 * corresponding UBI volume character device. A pointer to a 64-bit update
51 * size should be passed to the IOCTL. After that, UBI expects the user to write
52 * this number of bytes to the volume character device. The update is finished
53 * when the claimed number of bytes has been written. So, the volume update sequence
54 * is something like:
55 *
56 * fd = open("/dev/my_volume");
57 * ioctl(fd, UBI_IOCVOLUP, &image_size);
58 * write(fd, buf, image_size);
59 * close(fd);
60 */
61
62/*
63 * When a new volume is created, users may either specify the volume number they
64 * want to create or let UBI automatically assign a volume number using this
65 * constant.
66 */
67#define UBI_VOL_NUM_AUTO (-1)
68
69/* Maximum volume name length */
70#define UBI_MAX_VOLUME_NAME 127
71
72/* IOCTL commands of UBI character devices */
73
74#define UBI_IOC_MAGIC 'o'
75
76/* Create an UBI volume */
77#define UBI_IOCMKVOL _IOW(UBI_IOC_MAGIC, 0, struct ubi_mkvol_req)
78/* Remove an UBI volume */
79#define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, int32_t)
80/* Re-size an UBI volume */
81#define UBI_IOCRSVOL _IOW(UBI_IOC_MAGIC, 2, struct ubi_rsvol_req)
82
83/* IOCTL commands of UBI volume character devices */
84
85#define UBI_VOL_IOC_MAGIC 'O'
86
87/* Start UBI volume update */
88#define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, int64_t)
89/* An eraseblock erasure command, used for debugging, disabled by default */
90#define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, int32_t)
91
92/*
93 * UBI volume type constants.
94 *
95 * @UBI_DYNAMIC_VOLUME: dynamic volume
96 * @UBI_STATIC_VOLUME: static volume
97 */
98enum {
99 UBI_DYNAMIC_VOLUME = 3,
100 UBI_STATIC_VOLUME = 4
101};
102
103/**
104 * struct ubi_mkvol_req - volume description data structure used in
105 * volume creation requests.
106 * @vol_id: volume number
107 * @alignment: volume alignment
108 * @bytes: volume size in bytes
109 * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
110 * @padding1: reserved for future use, not used
111 * @name_len: volume name length
112 * @padding2: reserved for future use, not used
113 * @name: volume name
114 *
115 * This structure is used by userspace programs when creating new volumes. The
116 * @bytes field is only necessary when creating static volumes.
117 *
118 * The @alignment field specifies the required alignment of the volume logical
119 * eraseblock. This means, that the size of logical eraseblocks will be aligned
120 * to this number, i.e.,
121 * (UBI device logical eraseblock size) mod (@alignment) = 0.
122 *
123 * To put it differently, the logical eraseblock of this volume may be slightly
124 * shortened in order to make it properly aligned. The alignment has to be a
125 * multiple of the flash minimal input/output unit, or %1 to utilize the entire
126 * available space of logical eraseblocks.
127 *
128 * The @alignment field may be useful, for example, when one wants to maintain
129 * a block device on top of an UBI volume. In this case, it is desirable to fit
130 * an integer number of blocks in logical eraseblocks of this UBI volume. With
131 * alignment it is possible to update this volume using plain UBI volume image
132 * BLOBs, without caring about how to properly align them.
133 */
134struct ubi_mkvol_req {
135 int32_t vol_id;
136 int32_t alignment;
137 int64_t bytes;
138 int8_t vol_type;
139 int8_t padding1;
140 int16_t name_len;
141 int8_t padding2[4];
142 char name[UBI_MAX_VOLUME_NAME+1];
143} __attribute__ ((packed));
144
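
Putting the constants and the structure together, a hedged sketch of a
%UBI_IOCMKVOL call follows; the "/dev/ubi0" control-node path is an assumption
of this example, not something the header defines:

	#include <fcntl.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <mtd/ubi-user.h>

	static int make_volume(const char *ctrl, const char *name,
			       int64_t bytes)
	{
		struct ubi_mkvol_req req;
		int fd = open(ctrl, O_RDWR);	/* e.g. "/dev/ubi0" */

		if (fd < 0)
			return -1;
		memset(&req, 0, sizeof(req));
		req.vol_id = UBI_VOL_NUM_AUTO;	/* let UBI pick the number */
		req.alignment = 1;		/* use the whole LEB */
		req.bytes = bytes;
		req.vol_type = UBI_DYNAMIC_VOLUME;
		req.name_len = strlen(name);
		strncpy(req.name, name, UBI_MAX_VOLUME_NAME);
		if (ioctl(fd, UBI_IOCMKVOL, &req) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}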
145/**
146 * struct ubi_rsvol_req - a data structure used in volume re-size requests.
147 * @vol_id: ID of the volume to re-size
148 * @bytes: new size of the volume in bytes
149 *
150 * Re-sizing is possible for both dynamic and static volumes. But while dynamic
151 * volumes may be re-sized arbitrarily, static volumes cannot be made smaller
152 * than the number of bytes they bear. To arbitrarily shrink a static volume,
153 * it must be wiped out first (by means of a volume update operation with zero
154 * bytes).
155 */
156struct ubi_rsvol_req {
157 int64_t bytes;
158 int32_t vol_id;
159} __attribute__ ((packed));
160
161#endif /* __UBI_USER_H__ */
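
And a matching sketch for %UBI_IOCRSVOL. Per the comment above, shrinking a
static volume below the bytes it bears requires wiping it first with a
zero-length update; the control-node path is again an assumption:

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <mtd/ubi-user.h>

	static int resize_volume(const char *ctrl, int32_t vol_id,
				 int64_t bytes)
	{
		struct ubi_rsvol_req req = { .bytes = bytes, .vol_id = vol_id };
		int fd = open(ctrl, O_RDWR);	/* e.g. "/dev/ubi0" */

		if (fd < 0)
			return -1;
		if (ioctl(fd, UBI_IOCRSVOL, &req) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}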
diff --git a/kernel/sched.c b/kernel/sched.c
index b9a683730148..960d7c5fca39 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4746,7 +4746,7 @@ void show_state_filter(unsigned long state_filter)
4746 * console might take alot of time: 4746 * console might take alot of time:
4747 */ 4747 */
4748 touch_nmi_watchdog(); 4748 touch_nmi_watchdog();
4749 if (p->state & state_filter) 4749 if (!state_filter || (p->state & state_filter))
4750 show_task(p); 4750 show_task(p);
4751 } while_each_thread(g, p); 4751 } while_each_thread(g, p);
4752 4752
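
The small change above flips the meaning of a zero filter in
show_state_filter(): previously no task could match an empty mask, now an
empty mask acts as a wildcard. The predicate, lifted into a standalone helper
for illustration (task_matches is a hypothetical name):

	#include <stdbool.h>

	/* a zero state_filter now matches every task; a non-zero one still
	 * requires an intersection with the task's state bits */
	static bool task_matches(unsigned long state, unsigned long state_filter)
	{
		return !state_filter || (state & state_filter);
	}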
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 11a3404d65af..e1f18489db1d 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -92,6 +92,33 @@ int cipso_v4_rbm_optfmt = 0;
92int cipso_v4_rbm_strictvalid = 1; 92int cipso_v4_rbm_strictvalid = 1;
93 93
94/* 94/*
95 * Protocol Constants
96 */
97
98/* Maximum size of the CIPSO IP option, derived from the fact that the maximum
99 * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */
100#define CIPSO_V4_OPT_LEN_MAX 40
101
102/* Length of the base CIPSO option, this includes the option type (1 byte), the
103 * option length (1 byte), and the DOI (4 bytes). */
104#define CIPSO_V4_HDR_LEN 6
105
106/* Base length of the restrictive category bitmap tag (tag #1). */
107#define CIPSO_V4_TAG_RBM_BLEN 4
108
109/* Base length of the enumerated category tag (tag #2). */
110#define CIPSO_V4_TAG_ENUM_BLEN 4
111
112/* Base length of the ranged categories bitmap tag (tag #5). */
113#define CIPSO_V4_TAG_RNG_BLEN 4
114/* The maximum number of category ranges permitted in the ranged category tag
115 * (tag #5). You may note that the IETF draft states that the maximum number
116 * of category ranges is 7, but if the low end of the last category range is
117 * zero then it is possible to fit 8 category ranges because the zero should
118 * be omitted. */
119#define CIPSO_V4_TAG_RNG_CAT_MAX 8
120
121/*
95 * Helper Functions 122 * Helper Functions
96 */ 123 */
97 124
@@ -1109,16 +1136,15 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
1109 unsigned char *net_cat, 1136 unsigned char *net_cat,
1110 u32 net_cat_len) 1137 u32 net_cat_len)
1111{ 1138{
1112 /* The constant '16' is not random, it is the maximum number of
1113 * high/low category range pairs as permitted by the CIPSO draft based
1114 * on a maximum IPv4 header length of 60 bytes - the BUG_ON() assertion
1115 * does a sanity check to make sure we don't overflow the array. */
1116 int iter = -1; 1139 int iter = -1;
1117 u16 array[16]; 1140 u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2];
1118 u32 array_cnt = 0; 1141 u32 array_cnt = 0;
1119 u32 cat_size = 0; 1142 u32 cat_size = 0;
1120 1143
1121 BUG_ON(net_cat_len > 30); 1144 /* make sure we don't overflow the 'array[]' variable */
1145 if (net_cat_len >
1146 (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN))
1147 return -ENOSPC;
1122 1148
1123 for (;;) { 1149 for (;;) {
1124 iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); 1150 iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1);
@@ -1196,9 +1222,6 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
1196 * Protocol Handling Functions 1222 * Protocol Handling Functions
1197 */ 1223 */
1198 1224
1199#define CIPSO_V4_OPT_LEN_MAX 40
1200#define CIPSO_V4_HDR_LEN 6
1201
1202/** 1225/**
1203 * cipso_v4_gentag_hdr - Generate a CIPSO option header 1226 * cipso_v4_gentag_hdr - Generate a CIPSO option header
1204 * @doi_def: the DOI definition 1227 * @doi_def: the DOI definition
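
The replaced magic numbers follow from simple arithmetic: a 60-byte maximum
IPv4 header minus the 20-byte base header leaves 40 bytes for options; the
6-byte CIPSO header and the 4-byte tag #5 base leave 30 bytes of category
data. Seven full high/low pairs consume 28 of those bytes, and when the last
range's low end is an omitted zero, the remaining 2 bytes hold an eighth high
value, hence CIPSO_V4_TAG_RNG_CAT_MAX of 8 and the 16-entry u16 array. A
compile-time restatement of that sizing, with the constants redefined locally
so the sketch builds outside the kernel:

	#include <stddef.h>
	#include <stdint.h>

	#define CIPSO_V4_OPT_LEN_MAX	 40 /* 60-byte max IPv4 hdr - 20 base */
	#define CIPSO_V4_HDR_LEN	  6 /* type + length + DOI */
	#define CIPSO_V4_TAG_RNG_BLEN	  4 /* tag #5 base length */
	#define CIPSO_V4_TAG_RNG_CAT_MAX  8 /* 7 full + 1 implicit-zero range */

	/* 40 - 6 - 4 = 30 category bytes; the 32-byte array always has room */
	typedef char rng_array_fits[
		(CIPSO_V4_TAG_RNG_CAT_MAX * 2 * sizeof(uint16_t) >=
		 (size_t)(CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN -
			  CIPSO_V4_TAG_RNG_BLEN)) ? 1 : -1];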
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index e03a3282c551..f2535e7f2869 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -263,9 +263,6 @@ int netlbl_socket_setattr(const struct socket *sock,
263 int ret_val = -ENOENT; 263 int ret_val = -ENOENT;
264 struct netlbl_dom_map *dom_entry; 264 struct netlbl_dom_map *dom_entry;
265 265
266 if ((secattr->flags & NETLBL_SECATTR_DOMAIN) == 0)
267 return -ENOENT;
268
269 rcu_read_lock(); 266 rcu_read_lock();
270 dom_entry = netlbl_domhsh_getentry(secattr->domain); 267 dom_entry = netlbl_domhsh_getentry(secattr->domain);
271 if (dom_entry == NULL) 268 if (dom_entry == NULL)
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index faf2e02e4410..dc3502e30b19 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o
8 8
9selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o 9selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
10 10
11selinux-$(CONFIG_NETLABEL) += netlabel.o
12
11EXTRA_CFLAGS += -Isecurity/selinux/include 13EXTRA_CFLAGS += -Isecurity/selinux/include
12 14
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index da8caf10ef97..e4396a89edc6 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -217,6 +217,8 @@ static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tcla
217 audit_log_format(ab, " tcontext=%s", scontext); 217 audit_log_format(ab, " tcontext=%s", scontext);
218 kfree(scontext); 218 kfree(scontext);
219 } 219 }
220
221 BUG_ON(tclass >= ARRAY_SIZE(class_to_string) || !class_to_string[tclass]);
220 audit_log_format(ab, " tclass=%s", class_to_string[tclass]); 222 audit_log_format(ab, " tclass=%s", class_to_string[tclass]);
221} 223}
222 224
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5f02b4be1917..885a9a958b8d 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -77,7 +77,7 @@
77#include "objsec.h" 77#include "objsec.h"
78#include "netif.h" 78#include "netif.h"
79#include "xfrm.h" 79#include "xfrm.h"
80#include "selinux_netlabel.h" 80#include "netlabel.h"
81 81
82#define XATTR_SELINUX_SUFFIX "selinux" 82#define XATTR_SELINUX_SUFFIX "selinux"
83#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX 83#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
@@ -3123,6 +3123,34 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3123 return ret; 3123 return ret;
3124} 3124}
3125 3125
3126/**
3127 * selinux_skb_extlbl_sid - Determine the external label of a packet
3128 * @skb: the packet
3129 * @base_sid: the SELinux SID to use as a context for MLS only external labels
3130 * @sid: the packet's SID
3131 *
3132 * Description:
 3133 * Check the various forms of external packet labeling and determine
3134 * the external SID for the packet.
3135 *
3136 */
3137static void selinux_skb_extlbl_sid(struct sk_buff *skb,
3138 u32 base_sid,
3139 u32 *sid)
3140{
3141 u32 xfrm_sid;
3142 u32 nlbl_sid;
3143
3144 selinux_skb_xfrm_sid(skb, &xfrm_sid);
3145 if (selinux_netlbl_skbuff_getsid(skb,
3146 (xfrm_sid == SECSID_NULL ?
3147 base_sid : xfrm_sid),
3148 &nlbl_sid) != 0)
3149 nlbl_sid = SECSID_NULL;
3150
3151 *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid);
3152}
3153
3126/* socket security operations */ 3154/* socket security operations */
3127static int socket_has_perm(struct task_struct *task, struct socket *sock, 3155static int socket_has_perm(struct task_struct *task, struct socket *sock,
3128 u32 perms) 3156 u32 perms)
@@ -3664,9 +3692,7 @@ static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *
3664 if (sock && sock->sk->sk_family == PF_UNIX) 3692 if (sock && sock->sk->sk_family == PF_UNIX)
3665 selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); 3693 selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid);
3666 else if (skb) 3694 else if (skb)
3667 security_skb_extlbl_sid(skb, 3695 selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peer_secid);
3668 SECINITSID_UNLABELED,
3669 &peer_secid);
3670 3696
3671 if (peer_secid == SECSID_NULL) 3697 if (peer_secid == SECSID_NULL)
3672 err = -EINVAL; 3698 err = -EINVAL;
@@ -3727,7 +3753,7 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
3727 u32 newsid; 3753 u32 newsid;
3728 u32 peersid; 3754 u32 peersid;
3729 3755
3730 security_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peersid); 3756 selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peersid);
3731 if (peersid == SECSID_NULL) { 3757 if (peersid == SECSID_NULL) {
3732 req->secid = sksec->sid; 3758 req->secid = sksec->sid;
3733 req->peer_secid = SECSID_NULL; 3759 req->peer_secid = SECSID_NULL;
@@ -3765,7 +3791,7 @@ static void selinux_inet_conn_established(struct sock *sk,
3765{ 3791{
3766 struct sk_security_struct *sksec = sk->sk_security; 3792 struct sk_security_struct *sksec = sk->sk_security;
3767 3793
3768 security_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &sksec->peer_sid); 3794 selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &sksec->peer_sid);
3769} 3795}
3770 3796
3771static void selinux_req_classify_flow(const struct request_sock *req, 3797static void selinux_req_classify_flow(const struct request_sock *req,
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index ad9fb2d69b50..b83e74012a97 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -128,96 +128,6 @@
128 S_(SECCLASS_CAPABILITY, CAPABILITY__LEASE, "lease") 128 S_(SECCLASS_CAPABILITY, CAPABILITY__LEASE, "lease")
129 S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_WRITE, "audit_write") 129 S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_WRITE, "audit_write")
130 S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_CONTROL, "audit_control") 130 S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_CONTROL, "audit_control")
131 S_(SECCLASS_PASSWD, PASSWD__PASSWD, "passwd")
132 S_(SECCLASS_PASSWD, PASSWD__CHFN, "chfn")
133 S_(SECCLASS_PASSWD, PASSWD__CHSH, "chsh")
134 S_(SECCLASS_PASSWD, PASSWD__ROOTOK, "rootok")
135 S_(SECCLASS_PASSWD, PASSWD__CRONTAB, "crontab")
136 S_(SECCLASS_DRAWABLE, DRAWABLE__CREATE, "create")
137 S_(SECCLASS_DRAWABLE, DRAWABLE__DESTROY, "destroy")
138 S_(SECCLASS_DRAWABLE, DRAWABLE__DRAW, "draw")
139 S_(SECCLASS_DRAWABLE, DRAWABLE__COPY, "copy")
140 S_(SECCLASS_DRAWABLE, DRAWABLE__GETATTR, "getattr")
141 S_(SECCLASS_GC, GC__CREATE, "create")
142 S_(SECCLASS_GC, GC__FREE, "free")
143 S_(SECCLASS_GC, GC__GETATTR, "getattr")
144 S_(SECCLASS_GC, GC__SETATTR, "setattr")
145 S_(SECCLASS_WINDOW, WINDOW__ADDCHILD, "addchild")
146 S_(SECCLASS_WINDOW, WINDOW__CREATE, "create")
147 S_(SECCLASS_WINDOW, WINDOW__DESTROY, "destroy")
148 S_(SECCLASS_WINDOW, WINDOW__MAP, "map")
149 S_(SECCLASS_WINDOW, WINDOW__UNMAP, "unmap")
150 S_(SECCLASS_WINDOW, WINDOW__CHSTACK, "chstack")
151 S_(SECCLASS_WINDOW, WINDOW__CHPROPLIST, "chproplist")
152 S_(SECCLASS_WINDOW, WINDOW__CHPROP, "chprop")
153 S_(SECCLASS_WINDOW, WINDOW__LISTPROP, "listprop")
154 S_(SECCLASS_WINDOW, WINDOW__GETATTR, "getattr")
155 S_(SECCLASS_WINDOW, WINDOW__SETATTR, "setattr")
156 S_(SECCLASS_WINDOW, WINDOW__SETFOCUS, "setfocus")
157 S_(SECCLASS_WINDOW, WINDOW__MOVE, "move")
158 S_(SECCLASS_WINDOW, WINDOW__CHSELECTION, "chselection")
159 S_(SECCLASS_WINDOW, WINDOW__CHPARENT, "chparent")
160 S_(SECCLASS_WINDOW, WINDOW__CTRLLIFE, "ctrllife")
161 S_(SECCLASS_WINDOW, WINDOW__ENUMERATE, "enumerate")
162 S_(SECCLASS_WINDOW, WINDOW__TRANSPARENT, "transparent")
163 S_(SECCLASS_WINDOW, WINDOW__MOUSEMOTION, "mousemotion")
164 S_(SECCLASS_WINDOW, WINDOW__CLIENTCOMEVENT, "clientcomevent")
165 S_(SECCLASS_WINDOW, WINDOW__INPUTEVENT, "inputevent")
166 S_(SECCLASS_WINDOW, WINDOW__DRAWEVENT, "drawevent")
167 S_(SECCLASS_WINDOW, WINDOW__WINDOWCHANGEEVENT, "windowchangeevent")
168 S_(SECCLASS_WINDOW, WINDOW__WINDOWCHANGEREQUEST, "windowchangerequest")
169 S_(SECCLASS_WINDOW, WINDOW__SERVERCHANGEEVENT, "serverchangeevent")
170 S_(SECCLASS_WINDOW, WINDOW__EXTENSIONEVENT, "extensionevent")
171 S_(SECCLASS_FONT, FONT__LOAD, "load")
172 S_(SECCLASS_FONT, FONT__FREE, "free")
173 S_(SECCLASS_FONT, FONT__GETATTR, "getattr")
174 S_(SECCLASS_FONT, FONT__USE, "use")
175 S_(SECCLASS_COLORMAP, COLORMAP__CREATE, "create")
176 S_(SECCLASS_COLORMAP, COLORMAP__FREE, "free")
177 S_(SECCLASS_COLORMAP, COLORMAP__INSTALL, "install")
178 S_(SECCLASS_COLORMAP, COLORMAP__UNINSTALL, "uninstall")
179 S_(SECCLASS_COLORMAP, COLORMAP__LIST, "list")
180 S_(SECCLASS_COLORMAP, COLORMAP__READ, "read")
181 S_(SECCLASS_COLORMAP, COLORMAP__STORE, "store")
182 S_(SECCLASS_COLORMAP, COLORMAP__GETATTR, "getattr")
183 S_(SECCLASS_COLORMAP, COLORMAP__SETATTR, "setattr")
184 S_(SECCLASS_PROPERTY, PROPERTY__CREATE, "create")
185 S_(SECCLASS_PROPERTY, PROPERTY__FREE, "free")
186 S_(SECCLASS_PROPERTY, PROPERTY__READ, "read")
187 S_(SECCLASS_PROPERTY, PROPERTY__WRITE, "write")
188 S_(SECCLASS_CURSOR, CURSOR__CREATE, "create")
189 S_(SECCLASS_CURSOR, CURSOR__CREATEGLYPH, "createglyph")
190 S_(SECCLASS_CURSOR, CURSOR__FREE, "free")
191 S_(SECCLASS_CURSOR, CURSOR__ASSIGN, "assign")
192 S_(SECCLASS_CURSOR, CURSOR__SETATTR, "setattr")
193 S_(SECCLASS_XCLIENT, XCLIENT__KILL, "kill")
194 S_(SECCLASS_XINPUT, XINPUT__LOOKUP, "lookup")
195 S_(SECCLASS_XINPUT, XINPUT__GETATTR, "getattr")
196 S_(SECCLASS_XINPUT, XINPUT__SETATTR, "setattr")
197 S_(SECCLASS_XINPUT, XINPUT__SETFOCUS, "setfocus")
198 S_(SECCLASS_XINPUT, XINPUT__WARPPOINTER, "warppointer")
199 S_(SECCLASS_XINPUT, XINPUT__ACTIVEGRAB, "activegrab")
200 S_(SECCLASS_XINPUT, XINPUT__PASSIVEGRAB, "passivegrab")
201 S_(SECCLASS_XINPUT, XINPUT__UNGRAB, "ungrab")
202 S_(SECCLASS_XINPUT, XINPUT__BELL, "bell")
203 S_(SECCLASS_XINPUT, XINPUT__MOUSEMOTION, "mousemotion")
204 S_(SECCLASS_XINPUT, XINPUT__RELABELINPUT, "relabelinput")
205 S_(SECCLASS_XSERVER, XSERVER__SCREENSAVER, "screensaver")
206 S_(SECCLASS_XSERVER, XSERVER__GETHOSTLIST, "gethostlist")
207 S_(SECCLASS_XSERVER, XSERVER__SETHOSTLIST, "sethostlist")
208 S_(SECCLASS_XSERVER, XSERVER__GETFONTPATH, "getfontpath")
209 S_(SECCLASS_XSERVER, XSERVER__SETFONTPATH, "setfontpath")
210 S_(SECCLASS_XSERVER, XSERVER__GETATTR, "getattr")
211 S_(SECCLASS_XSERVER, XSERVER__GRAB, "grab")
212 S_(SECCLASS_XSERVER, XSERVER__UNGRAB, "ungrab")
213 S_(SECCLASS_XEXTENSION, XEXTENSION__QUERY, "query")
214 S_(SECCLASS_XEXTENSION, XEXTENSION__USE, "use")
215 S_(SECCLASS_PAX, PAX__PAGEEXEC, "pageexec")
216 S_(SECCLASS_PAX, PAX__EMUTRAMP, "emutramp")
217 S_(SECCLASS_PAX, PAX__MPROTECT, "mprotect")
218 S_(SECCLASS_PAX, PAX__RANDMMAP, "randmmap")
219 S_(SECCLASS_PAX, PAX__RANDEXEC, "randexec")
220 S_(SECCLASS_PAX, PAX__SEGMEXEC, "segmexec")
221 S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ, "nlmsg_read") 131 S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ, "nlmsg_read")
222 S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE, "nlmsg_write") 132 S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE, "nlmsg_write")
223 S_(SECCLASS_NETLINK_FIREWALL_SOCKET, NETLINK_FIREWALL_SOCKET__NLMSG_READ, "nlmsg_read") 133 S_(SECCLASS_NETLINK_FIREWALL_SOCKET, NETLINK_FIREWALL_SOCKET__NLMSG_READ, "nlmsg_read")
@@ -232,16 +142,6 @@
232 S_(SECCLASS_NETLINK_AUDIT_SOCKET, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV, "nlmsg_readpriv") 142 S_(SECCLASS_NETLINK_AUDIT_SOCKET, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV, "nlmsg_readpriv")
233 S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_READ, "nlmsg_read") 143 S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_READ, "nlmsg_read")
234 S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_WRITE, "nlmsg_write") 144 S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_WRITE, "nlmsg_write")
235 S_(SECCLASS_DBUS, DBUS__ACQUIRE_SVC, "acquire_svc")
236 S_(SECCLASS_DBUS, DBUS__SEND_MSG, "send_msg")
237 S_(SECCLASS_NSCD, NSCD__GETPWD, "getpwd")
238 S_(SECCLASS_NSCD, NSCD__GETGRP, "getgrp")
239 S_(SECCLASS_NSCD, NSCD__GETHOST, "gethost")
240 S_(SECCLASS_NSCD, NSCD__GETSTAT, "getstat")
241 S_(SECCLASS_NSCD, NSCD__ADMIN, "admin")
242 S_(SECCLASS_NSCD, NSCD__SHMEMPWD, "shmempwd")
243 S_(SECCLASS_NSCD, NSCD__SHMEMGRP, "shmemgrp")
244 S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost")
245 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto") 145 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto")
246 S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom") 146 S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom")
247 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, "setcontext") 147 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, "setcontext")
@@ -256,7 +156,5 @@
256 S_(SECCLASS_KEY, KEY__LINK, "link") 156 S_(SECCLASS_KEY, KEY__LINK, "link")
257 S_(SECCLASS_KEY, KEY__SETATTR, "setattr") 157 S_(SECCLASS_KEY, KEY__SETATTR, "setattr")
258 S_(SECCLASS_KEY, KEY__CREATE, "create") 158 S_(SECCLASS_KEY, KEY__CREATE, "create")
259 S_(SECCLASS_CONTEXT, CONTEXT__TRANSLATE, "translate")
260 S_(SECCLASS_CONTEXT, CONTEXT__CONTAINS, "contains")
261 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") 159 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind")
262 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") 160 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index 2de4b5fe3aa1..5fee1735bffe 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -16,7 +16,6 @@
16#define COMMON_FILE__SWAPON 0x00004000UL 16#define COMMON_FILE__SWAPON 0x00004000UL
17#define COMMON_FILE__QUOTAON 0x00008000UL 17#define COMMON_FILE__QUOTAON 0x00008000UL
18#define COMMON_FILE__MOUNTON 0x00010000UL 18#define COMMON_FILE__MOUNTON 0x00010000UL
19
20#define COMMON_SOCKET__IOCTL 0x00000001UL 19#define COMMON_SOCKET__IOCTL 0x00000001UL
21#define COMMON_SOCKET__READ 0x00000002UL 20#define COMMON_SOCKET__READ 0x00000002UL
22#define COMMON_SOCKET__WRITE 0x00000004UL 21#define COMMON_SOCKET__WRITE 0x00000004UL
@@ -39,7 +38,6 @@
39#define COMMON_SOCKET__RECV_MSG 0x00080000UL 38#define COMMON_SOCKET__RECV_MSG 0x00080000UL
40#define COMMON_SOCKET__SEND_MSG 0x00100000UL 39#define COMMON_SOCKET__SEND_MSG 0x00100000UL
41#define COMMON_SOCKET__NAME_BIND 0x00200000UL 40#define COMMON_SOCKET__NAME_BIND 0x00200000UL
42
43#define COMMON_IPC__CREATE 0x00000001UL 41#define COMMON_IPC__CREATE 0x00000001UL
44#define COMMON_IPC__DESTROY 0x00000002UL 42#define COMMON_IPC__DESTROY 0x00000002UL
45#define COMMON_IPC__GETATTR 0x00000004UL 43#define COMMON_IPC__GETATTR 0x00000004UL
@@ -49,7 +47,6 @@
49#define COMMON_IPC__ASSOCIATE 0x00000040UL 47#define COMMON_IPC__ASSOCIATE 0x00000040UL
50#define COMMON_IPC__UNIX_READ 0x00000080UL 48#define COMMON_IPC__UNIX_READ 0x00000080UL
51#define COMMON_IPC__UNIX_WRITE 0x00000100UL 49#define COMMON_IPC__UNIX_WRITE 0x00000100UL
52
53#define FILESYSTEM__MOUNT 0x00000001UL 50#define FILESYSTEM__MOUNT 0x00000001UL
54#define FILESYSTEM__REMOUNT 0x00000002UL 51#define FILESYSTEM__REMOUNT 0x00000002UL
55#define FILESYSTEM__UNMOUNT 0x00000004UL 52#define FILESYSTEM__UNMOUNT 0x00000004UL
@@ -60,7 +57,6 @@
60#define FILESYSTEM__ASSOCIATE 0x00000080UL 57#define FILESYSTEM__ASSOCIATE 0x00000080UL
61#define FILESYSTEM__QUOTAMOD 0x00000100UL 58#define FILESYSTEM__QUOTAMOD 0x00000100UL
62#define FILESYSTEM__QUOTAGET 0x00000200UL 59#define FILESYSTEM__QUOTAGET 0x00000200UL
63
64#define DIR__IOCTL 0x00000001UL 60#define DIR__IOCTL 0x00000001UL
65#define DIR__READ 0x00000002UL 61#define DIR__READ 0x00000002UL
66#define DIR__WRITE 0x00000004UL 62#define DIR__WRITE 0x00000004UL
@@ -78,13 +74,11 @@
78#define DIR__SWAPON 0x00004000UL 74#define DIR__SWAPON 0x00004000UL
79#define DIR__QUOTAON 0x00008000UL 75#define DIR__QUOTAON 0x00008000UL
80#define DIR__MOUNTON 0x00010000UL 76#define DIR__MOUNTON 0x00010000UL
81
82#define DIR__ADD_NAME 0x00020000UL 77#define DIR__ADD_NAME 0x00020000UL
83#define DIR__REMOVE_NAME 0x00040000UL 78#define DIR__REMOVE_NAME 0x00040000UL
84#define DIR__REPARENT 0x00080000UL 79#define DIR__REPARENT 0x00080000UL
85#define DIR__SEARCH 0x00100000UL 80#define DIR__SEARCH 0x00100000UL
86#define DIR__RMDIR 0x00200000UL 81#define DIR__RMDIR 0x00200000UL
87
88#define FILE__IOCTL 0x00000001UL 82#define FILE__IOCTL 0x00000001UL
89#define FILE__READ 0x00000002UL 83#define FILE__READ 0x00000002UL
90#define FILE__WRITE 0x00000004UL 84#define FILE__WRITE 0x00000004UL
@@ -102,11 +96,9 @@
102#define FILE__SWAPON 0x00004000UL 96#define FILE__SWAPON 0x00004000UL
103#define FILE__QUOTAON 0x00008000UL 97#define FILE__QUOTAON 0x00008000UL
104#define FILE__MOUNTON 0x00010000UL 98#define FILE__MOUNTON 0x00010000UL
105
106#define FILE__EXECUTE_NO_TRANS 0x00020000UL 99#define FILE__EXECUTE_NO_TRANS 0x00020000UL
107#define FILE__ENTRYPOINT 0x00040000UL 100#define FILE__ENTRYPOINT 0x00040000UL
108#define FILE__EXECMOD 0x00080000UL 101#define FILE__EXECMOD 0x00080000UL
109
110#define LNK_FILE__IOCTL 0x00000001UL 102#define LNK_FILE__IOCTL 0x00000001UL
111#define LNK_FILE__READ 0x00000002UL 103#define LNK_FILE__READ 0x00000002UL
112#define LNK_FILE__WRITE 0x00000004UL 104#define LNK_FILE__WRITE 0x00000004UL
@@ -124,7 +116,6 @@
124#define LNK_FILE__SWAPON 0x00004000UL 116#define LNK_FILE__SWAPON 0x00004000UL
125#define LNK_FILE__QUOTAON 0x00008000UL 117#define LNK_FILE__QUOTAON 0x00008000UL
126#define LNK_FILE__MOUNTON 0x00010000UL 118#define LNK_FILE__MOUNTON 0x00010000UL
127
128#define CHR_FILE__IOCTL 0x00000001UL 119#define CHR_FILE__IOCTL 0x00000001UL
129#define CHR_FILE__READ 0x00000002UL 120#define CHR_FILE__READ 0x00000002UL
130#define CHR_FILE__WRITE 0x00000004UL 121#define CHR_FILE__WRITE 0x00000004UL
@@ -142,11 +133,9 @@
142#define CHR_FILE__SWAPON 0x00004000UL 133#define CHR_FILE__SWAPON 0x00004000UL
143#define CHR_FILE__QUOTAON 0x00008000UL 134#define CHR_FILE__QUOTAON 0x00008000UL
144#define CHR_FILE__MOUNTON 0x00010000UL 135#define CHR_FILE__MOUNTON 0x00010000UL
145
146#define CHR_FILE__EXECUTE_NO_TRANS 0x00020000UL 136#define CHR_FILE__EXECUTE_NO_TRANS 0x00020000UL
147#define CHR_FILE__ENTRYPOINT 0x00040000UL 137#define CHR_FILE__ENTRYPOINT 0x00040000UL
148#define CHR_FILE__EXECMOD 0x00080000UL 138#define CHR_FILE__EXECMOD 0x00080000UL
149
150#define BLK_FILE__IOCTL 0x00000001UL 139#define BLK_FILE__IOCTL 0x00000001UL
151#define BLK_FILE__READ 0x00000002UL 140#define BLK_FILE__READ 0x00000002UL
152#define BLK_FILE__WRITE 0x00000004UL 141#define BLK_FILE__WRITE 0x00000004UL
@@ -164,7 +153,6 @@
164#define BLK_FILE__SWAPON 0x00004000UL 153#define BLK_FILE__SWAPON 0x00004000UL
165#define BLK_FILE__QUOTAON 0x00008000UL 154#define BLK_FILE__QUOTAON 0x00008000UL
166#define BLK_FILE__MOUNTON 0x00010000UL 155#define BLK_FILE__MOUNTON 0x00010000UL
167
168#define SOCK_FILE__IOCTL 0x00000001UL 156#define SOCK_FILE__IOCTL 0x00000001UL
169#define SOCK_FILE__READ 0x00000002UL 157#define SOCK_FILE__READ 0x00000002UL
170#define SOCK_FILE__WRITE 0x00000004UL 158#define SOCK_FILE__WRITE 0x00000004UL
@@ -182,7 +170,6 @@
182#define SOCK_FILE__SWAPON 0x00004000UL 170#define SOCK_FILE__SWAPON 0x00004000UL
183#define SOCK_FILE__QUOTAON 0x00008000UL 171#define SOCK_FILE__QUOTAON 0x00008000UL
184#define SOCK_FILE__MOUNTON 0x00010000UL 172#define SOCK_FILE__MOUNTON 0x00010000UL
185
186#define FIFO_FILE__IOCTL 0x00000001UL 173#define FIFO_FILE__IOCTL 0x00000001UL
187#define FIFO_FILE__READ 0x00000002UL 174#define FIFO_FILE__READ 0x00000002UL
188#define FIFO_FILE__WRITE 0x00000004UL 175#define FIFO_FILE__WRITE 0x00000004UL
@@ -200,9 +187,7 @@
200#define FIFO_FILE__SWAPON 0x00004000UL 187#define FIFO_FILE__SWAPON 0x00004000UL
201#define FIFO_FILE__QUOTAON 0x00008000UL 188#define FIFO_FILE__QUOTAON 0x00008000UL
202#define FIFO_FILE__MOUNTON 0x00010000UL 189#define FIFO_FILE__MOUNTON 0x00010000UL
203
204#define FD__USE 0x00000001UL 190#define FD__USE 0x00000001UL
205
206#define SOCKET__IOCTL 0x00000001UL 191#define SOCKET__IOCTL 0x00000001UL
207#define SOCKET__READ 0x00000002UL 192#define SOCKET__READ 0x00000002UL
208#define SOCKET__WRITE 0x00000004UL 193#define SOCKET__WRITE 0x00000004UL
@@ -225,7 +210,6 @@
225#define SOCKET__RECV_MSG 0x00080000UL 210#define SOCKET__RECV_MSG 0x00080000UL
226#define SOCKET__SEND_MSG 0x00100000UL 211#define SOCKET__SEND_MSG 0x00100000UL
227#define SOCKET__NAME_BIND 0x00200000UL 212#define SOCKET__NAME_BIND 0x00200000UL
228
229#define TCP_SOCKET__IOCTL 0x00000001UL 213#define TCP_SOCKET__IOCTL 0x00000001UL
230#define TCP_SOCKET__READ 0x00000002UL 214#define TCP_SOCKET__READ 0x00000002UL
231#define TCP_SOCKET__WRITE 0x00000004UL 215#define TCP_SOCKET__WRITE 0x00000004UL
@@ -248,13 +232,11 @@
248#define TCP_SOCKET__RECV_MSG 0x00080000UL 232#define TCP_SOCKET__RECV_MSG 0x00080000UL
249#define TCP_SOCKET__SEND_MSG 0x00100000UL 233#define TCP_SOCKET__SEND_MSG 0x00100000UL
250#define TCP_SOCKET__NAME_BIND 0x00200000UL 234#define TCP_SOCKET__NAME_BIND 0x00200000UL
251
252#define TCP_SOCKET__CONNECTTO 0x00400000UL 235#define TCP_SOCKET__CONNECTTO 0x00400000UL
253#define TCP_SOCKET__NEWCONN 0x00800000UL 236#define TCP_SOCKET__NEWCONN 0x00800000UL
254#define TCP_SOCKET__ACCEPTFROM 0x01000000UL 237#define TCP_SOCKET__ACCEPTFROM 0x01000000UL
255#define TCP_SOCKET__NODE_BIND 0x02000000UL 238#define TCP_SOCKET__NODE_BIND 0x02000000UL
256#define TCP_SOCKET__NAME_CONNECT 0x04000000UL 239#define TCP_SOCKET__NAME_CONNECT 0x04000000UL
257
258#define UDP_SOCKET__IOCTL 0x00000001UL 240#define UDP_SOCKET__IOCTL 0x00000001UL
259#define UDP_SOCKET__READ 0x00000002UL 241#define UDP_SOCKET__READ 0x00000002UL
260#define UDP_SOCKET__WRITE 0x00000004UL 242#define UDP_SOCKET__WRITE 0x00000004UL
@@ -277,9 +259,7 @@
277#define UDP_SOCKET__RECV_MSG 0x00080000UL 259#define UDP_SOCKET__RECV_MSG 0x00080000UL
278#define UDP_SOCKET__SEND_MSG 0x00100000UL 260#define UDP_SOCKET__SEND_MSG 0x00100000UL
279#define UDP_SOCKET__NAME_BIND 0x00200000UL 261#define UDP_SOCKET__NAME_BIND 0x00200000UL
280
281#define UDP_SOCKET__NODE_BIND 0x00400000UL 262#define UDP_SOCKET__NODE_BIND 0x00400000UL
282
283#define RAWIP_SOCKET__IOCTL 0x00000001UL 263#define RAWIP_SOCKET__IOCTL 0x00000001UL
284#define RAWIP_SOCKET__READ 0x00000002UL 264#define RAWIP_SOCKET__READ 0x00000002UL
285#define RAWIP_SOCKET__WRITE 0x00000004UL 265#define RAWIP_SOCKET__WRITE 0x00000004UL
@@ -302,9 +282,7 @@
302#define RAWIP_SOCKET__RECV_MSG 0x00080000UL 282#define RAWIP_SOCKET__RECV_MSG 0x00080000UL
303#define RAWIP_SOCKET__SEND_MSG 0x00100000UL 283#define RAWIP_SOCKET__SEND_MSG 0x00100000UL
304#define RAWIP_SOCKET__NAME_BIND 0x00200000UL 284#define RAWIP_SOCKET__NAME_BIND 0x00200000UL
305
306#define RAWIP_SOCKET__NODE_BIND 0x00400000UL 285#define RAWIP_SOCKET__NODE_BIND 0x00400000UL
307
308#define NODE__TCP_RECV 0x00000001UL 286#define NODE__TCP_RECV 0x00000001UL
309#define NODE__TCP_SEND 0x00000002UL 287#define NODE__TCP_SEND 0x00000002UL
310#define NODE__UDP_RECV 0x00000004UL 288#define NODE__UDP_RECV 0x00000004UL
@@ -314,7 +292,6 @@
314#define NODE__ENFORCE_DEST 0x00000040UL 292#define NODE__ENFORCE_DEST 0x00000040UL
315#define NODE__DCCP_RECV 0x00000080UL 293#define NODE__DCCP_RECV 0x00000080UL
316#define NODE__DCCP_SEND 0x00000100UL 294#define NODE__DCCP_SEND 0x00000100UL
317
318#define NETIF__TCP_RECV 0x00000001UL 295#define NETIF__TCP_RECV 0x00000001UL
319#define NETIF__TCP_SEND 0x00000002UL 296#define NETIF__TCP_SEND 0x00000002UL
320#define NETIF__UDP_RECV 0x00000004UL 297#define NETIF__UDP_RECV 0x00000004UL
@@ -323,7 +300,6 @@
323#define NETIF__RAWIP_SEND 0x00000020UL 300#define NETIF__RAWIP_SEND 0x00000020UL
324#define NETIF__DCCP_RECV 0x00000040UL 301#define NETIF__DCCP_RECV 0x00000040UL
325#define NETIF__DCCP_SEND 0x00000080UL 302#define NETIF__DCCP_SEND 0x00000080UL
326
327#define NETLINK_SOCKET__IOCTL 0x00000001UL 303#define NETLINK_SOCKET__IOCTL 0x00000001UL
328#define NETLINK_SOCKET__READ 0x00000002UL 304#define NETLINK_SOCKET__READ 0x00000002UL
329#define NETLINK_SOCKET__WRITE 0x00000004UL 305#define NETLINK_SOCKET__WRITE 0x00000004UL
@@ -346,7 +322,6 @@
346#define NETLINK_SOCKET__RECV_MSG 0x00080000UL 322#define NETLINK_SOCKET__RECV_MSG 0x00080000UL
347#define NETLINK_SOCKET__SEND_MSG 0x00100000UL 323#define NETLINK_SOCKET__SEND_MSG 0x00100000UL
348#define NETLINK_SOCKET__NAME_BIND 0x00200000UL 324#define NETLINK_SOCKET__NAME_BIND 0x00200000UL
349
350#define PACKET_SOCKET__IOCTL 0x00000001UL 325#define PACKET_SOCKET__IOCTL 0x00000001UL
351#define PACKET_SOCKET__READ 0x00000002UL 326#define PACKET_SOCKET__READ 0x00000002UL
352#define PACKET_SOCKET__WRITE 0x00000004UL 327#define PACKET_SOCKET__WRITE 0x00000004UL
@@ -369,7 +344,6 @@
369#define PACKET_SOCKET__RECV_MSG 0x00080000UL 344#define PACKET_SOCKET__RECV_MSG 0x00080000UL
370#define PACKET_SOCKET__SEND_MSG 0x00100000UL 345#define PACKET_SOCKET__SEND_MSG 0x00100000UL
371#define PACKET_SOCKET__NAME_BIND 0x00200000UL 346#define PACKET_SOCKET__NAME_BIND 0x00200000UL
372
373#define KEY_SOCKET__IOCTL 0x00000001UL 347#define KEY_SOCKET__IOCTL 0x00000001UL
374#define KEY_SOCKET__READ 0x00000002UL 348#define KEY_SOCKET__READ 0x00000002UL
375#define KEY_SOCKET__WRITE 0x00000004UL 349#define KEY_SOCKET__WRITE 0x00000004UL
@@ -392,7 +366,6 @@
392#define KEY_SOCKET__RECV_MSG 0x00080000UL 366#define KEY_SOCKET__RECV_MSG 0x00080000UL
393#define KEY_SOCKET__SEND_MSG 0x00100000UL 367#define KEY_SOCKET__SEND_MSG 0x00100000UL
394#define KEY_SOCKET__NAME_BIND 0x00200000UL 368#define KEY_SOCKET__NAME_BIND 0x00200000UL
395
396#define UNIX_STREAM_SOCKET__IOCTL 0x00000001UL 369#define UNIX_STREAM_SOCKET__IOCTL 0x00000001UL
397#define UNIX_STREAM_SOCKET__READ 0x00000002UL 370#define UNIX_STREAM_SOCKET__READ 0x00000002UL
398#define UNIX_STREAM_SOCKET__WRITE 0x00000004UL 371#define UNIX_STREAM_SOCKET__WRITE 0x00000004UL
@@ -415,11 +388,9 @@
415#define UNIX_STREAM_SOCKET__RECV_MSG 0x00080000UL 388#define UNIX_STREAM_SOCKET__RECV_MSG 0x00080000UL
416#define UNIX_STREAM_SOCKET__SEND_MSG 0x00100000UL 389#define UNIX_STREAM_SOCKET__SEND_MSG 0x00100000UL
417#define UNIX_STREAM_SOCKET__NAME_BIND 0x00200000UL 390#define UNIX_STREAM_SOCKET__NAME_BIND 0x00200000UL
418
419#define UNIX_STREAM_SOCKET__CONNECTTO 0x00400000UL 391#define UNIX_STREAM_SOCKET__CONNECTTO 0x00400000UL
420#define UNIX_STREAM_SOCKET__NEWCONN 0x00800000UL 392#define UNIX_STREAM_SOCKET__NEWCONN 0x00800000UL
421#define UNIX_STREAM_SOCKET__ACCEPTFROM 0x01000000UL 393#define UNIX_STREAM_SOCKET__ACCEPTFROM 0x01000000UL
422
423#define UNIX_DGRAM_SOCKET__IOCTL 0x00000001UL 394#define UNIX_DGRAM_SOCKET__IOCTL 0x00000001UL
424#define UNIX_DGRAM_SOCKET__READ 0x00000002UL 395#define UNIX_DGRAM_SOCKET__READ 0x00000002UL
425#define UNIX_DGRAM_SOCKET__WRITE 0x00000004UL 396#define UNIX_DGRAM_SOCKET__WRITE 0x00000004UL
@@ -442,7 +413,6 @@
442#define UNIX_DGRAM_SOCKET__RECV_MSG 0x00080000UL 413#define UNIX_DGRAM_SOCKET__RECV_MSG 0x00080000UL
443#define UNIX_DGRAM_SOCKET__SEND_MSG 0x00100000UL 414#define UNIX_DGRAM_SOCKET__SEND_MSG 0x00100000UL
444#define UNIX_DGRAM_SOCKET__NAME_BIND 0x00200000UL 415#define UNIX_DGRAM_SOCKET__NAME_BIND 0x00200000UL
445
446#define PROCESS__FORK 0x00000001UL 416#define PROCESS__FORK 0x00000001UL
447#define PROCESS__TRANSITION 0x00000002UL 417#define PROCESS__TRANSITION 0x00000002UL
448#define PROCESS__SIGCHLD 0x00000004UL 418#define PROCESS__SIGCHLD 0x00000004UL
@@ -473,7 +443,6 @@
473#define PROCESS__EXECHEAP 0x08000000UL 443#define PROCESS__EXECHEAP 0x08000000UL
474#define PROCESS__SETKEYCREATE 0x10000000UL 444#define PROCESS__SETKEYCREATE 0x10000000UL
475#define PROCESS__SETSOCKCREATE 0x20000000UL 445#define PROCESS__SETSOCKCREATE 0x20000000UL
476
477#define IPC__CREATE 0x00000001UL 446#define IPC__CREATE 0x00000001UL
478#define IPC__DESTROY 0x00000002UL 447#define IPC__DESTROY 0x00000002UL
479#define IPC__GETATTR 0x00000004UL 448#define IPC__GETATTR 0x00000004UL
@@ -483,7 +452,6 @@
483#define IPC__ASSOCIATE 0x00000040UL 452#define IPC__ASSOCIATE 0x00000040UL
484#define IPC__UNIX_READ 0x00000080UL 453#define IPC__UNIX_READ 0x00000080UL
485#define IPC__UNIX_WRITE 0x00000100UL 454#define IPC__UNIX_WRITE 0x00000100UL
486
487#define SEM__CREATE 0x00000001UL 455#define SEM__CREATE 0x00000001UL
488#define SEM__DESTROY 0x00000002UL 456#define SEM__DESTROY 0x00000002UL
489#define SEM__GETATTR 0x00000004UL 457#define SEM__GETATTR 0x00000004UL
@@ -493,7 +461,6 @@
493#define SEM__ASSOCIATE 0x00000040UL 461#define SEM__ASSOCIATE 0x00000040UL
494#define SEM__UNIX_READ 0x00000080UL 462#define SEM__UNIX_READ 0x00000080UL
495#define SEM__UNIX_WRITE 0x00000100UL 463#define SEM__UNIX_WRITE 0x00000100UL
496
497#define MSGQ__CREATE 0x00000001UL 464#define MSGQ__CREATE 0x00000001UL
498#define MSGQ__DESTROY 0x00000002UL 465#define MSGQ__DESTROY 0x00000002UL
499#define MSGQ__GETATTR 0x00000004UL 466#define MSGQ__GETATTR 0x00000004UL
@@ -503,12 +470,9 @@
503#define MSGQ__ASSOCIATE 0x00000040UL 470#define MSGQ__ASSOCIATE 0x00000040UL
504#define MSGQ__UNIX_READ 0x00000080UL 471#define MSGQ__UNIX_READ 0x00000080UL
505#define MSGQ__UNIX_WRITE 0x00000100UL 472#define MSGQ__UNIX_WRITE 0x00000100UL
506
507#define MSGQ__ENQUEUE 0x00000200UL 473#define MSGQ__ENQUEUE 0x00000200UL
508
509#define MSG__SEND 0x00000001UL 474#define MSG__SEND 0x00000001UL
510#define MSG__RECEIVE 0x00000002UL 475#define MSG__RECEIVE 0x00000002UL
511
512#define SHM__CREATE 0x00000001UL 476#define SHM__CREATE 0x00000001UL
513#define SHM__DESTROY 0x00000002UL 477#define SHM__DESTROY 0x00000002UL
514#define SHM__GETATTR 0x00000004UL 478#define SHM__GETATTR 0x00000004UL
@@ -518,9 +482,7 @@
518#define SHM__ASSOCIATE 0x00000040UL 482#define SHM__ASSOCIATE 0x00000040UL
519#define SHM__UNIX_READ 0x00000080UL 483#define SHM__UNIX_READ 0x00000080UL
520#define SHM__UNIX_WRITE 0x00000100UL 484#define SHM__UNIX_WRITE 0x00000100UL
521
522#define SHM__LOCK 0x00000200UL 485#define SHM__LOCK 0x00000200UL
523
524#define SECURITY__COMPUTE_AV 0x00000001UL 486#define SECURITY__COMPUTE_AV 0x00000001UL
525#define SECURITY__COMPUTE_CREATE 0x00000002UL 487#define SECURITY__COMPUTE_CREATE 0x00000002UL
526#define SECURITY__COMPUTE_MEMBER 0x00000004UL 488#define SECURITY__COMPUTE_MEMBER 0x00000004UL
@@ -532,12 +494,10 @@
532#define SECURITY__SETBOOL 0x00000100UL 494#define SECURITY__SETBOOL 0x00000100UL
533#define SECURITY__SETSECPARAM 0x00000200UL 495#define SECURITY__SETSECPARAM 0x00000200UL
534#define SECURITY__SETCHECKREQPROT 0x00000400UL 496#define SECURITY__SETCHECKREQPROT 0x00000400UL
535
536#define SYSTEM__IPC_INFO 0x00000001UL 497#define SYSTEM__IPC_INFO 0x00000001UL
537#define SYSTEM__SYSLOG_READ 0x00000002UL 498#define SYSTEM__SYSLOG_READ 0x00000002UL
538#define SYSTEM__SYSLOG_MOD 0x00000004UL 499#define SYSTEM__SYSLOG_MOD 0x00000004UL
539#define SYSTEM__SYSLOG_CONSOLE 0x00000008UL 500#define SYSTEM__SYSLOG_CONSOLE 0x00000008UL
540
541#define CAPABILITY__CHOWN 0x00000001UL 501#define CAPABILITY__CHOWN 0x00000001UL
542#define CAPABILITY__DAC_OVERRIDE 0x00000002UL 502#define CAPABILITY__DAC_OVERRIDE 0x00000002UL
543#define CAPABILITY__DAC_READ_SEARCH 0x00000004UL 503#define CAPABILITY__DAC_READ_SEARCH 0x00000004UL
@@ -569,110 +529,6 @@
569#define CAPABILITY__LEASE 0x10000000UL 529#define CAPABILITY__LEASE 0x10000000UL
570#define CAPABILITY__AUDIT_WRITE 0x20000000UL 530#define CAPABILITY__AUDIT_WRITE 0x20000000UL
571#define CAPABILITY__AUDIT_CONTROL 0x40000000UL 531#define CAPABILITY__AUDIT_CONTROL 0x40000000UL
572
573#define PASSWD__PASSWD 0x00000001UL
574#define PASSWD__CHFN 0x00000002UL
575#define PASSWD__CHSH 0x00000004UL
576#define PASSWD__ROOTOK 0x00000008UL
577#define PASSWD__CRONTAB 0x00000010UL
578
579#define DRAWABLE__CREATE 0x00000001UL
580#define DRAWABLE__DESTROY 0x00000002UL
581#define DRAWABLE__DRAW 0x00000004UL
582#define DRAWABLE__COPY 0x00000008UL
583#define DRAWABLE__GETATTR 0x00000010UL
584
585#define GC__CREATE 0x00000001UL
586#define GC__FREE 0x00000002UL
587#define GC__GETATTR 0x00000004UL
588#define GC__SETATTR 0x00000008UL
589
590#define WINDOW__ADDCHILD 0x00000001UL
591#define WINDOW__CREATE 0x00000002UL
592#define WINDOW__DESTROY 0x00000004UL
593#define WINDOW__MAP 0x00000008UL
594#define WINDOW__UNMAP 0x00000010UL
595#define WINDOW__CHSTACK 0x00000020UL
596#define WINDOW__CHPROPLIST 0x00000040UL
597#define WINDOW__CHPROP 0x00000080UL
598#define WINDOW__LISTPROP 0x00000100UL
599#define WINDOW__GETATTR 0x00000200UL
600#define WINDOW__SETATTR 0x00000400UL
601#define WINDOW__SETFOCUS 0x00000800UL
602#define WINDOW__MOVE 0x00001000UL
603#define WINDOW__CHSELECTION 0x00002000UL
604#define WINDOW__CHPARENT 0x00004000UL
605#define WINDOW__CTRLLIFE 0x00008000UL
606#define WINDOW__ENUMERATE 0x00010000UL
607#define WINDOW__TRANSPARENT 0x00020000UL
608#define WINDOW__MOUSEMOTION 0x00040000UL
609#define WINDOW__CLIENTCOMEVENT 0x00080000UL
610#define WINDOW__INPUTEVENT 0x00100000UL
611#define WINDOW__DRAWEVENT 0x00200000UL
612#define WINDOW__WINDOWCHANGEEVENT 0x00400000UL
613#define WINDOW__WINDOWCHANGEREQUEST 0x00800000UL
614#define WINDOW__SERVERCHANGEEVENT 0x01000000UL
615#define WINDOW__EXTENSIONEVENT 0x02000000UL
616
617#define FONT__LOAD 0x00000001UL
618#define FONT__FREE 0x00000002UL
619#define FONT__GETATTR 0x00000004UL
620#define FONT__USE 0x00000008UL
621
622#define COLORMAP__CREATE 0x00000001UL
623#define COLORMAP__FREE 0x00000002UL
624#define COLORMAP__INSTALL 0x00000004UL
625#define COLORMAP__UNINSTALL 0x00000008UL
626#define COLORMAP__LIST 0x00000010UL
627#define COLORMAP__READ 0x00000020UL
628#define COLORMAP__STORE 0x00000040UL
629#define COLORMAP__GETATTR 0x00000080UL
630#define COLORMAP__SETATTR 0x00000100UL
631
632#define PROPERTY__CREATE 0x00000001UL
633#define PROPERTY__FREE 0x00000002UL
634#define PROPERTY__READ 0x00000004UL
635#define PROPERTY__WRITE 0x00000008UL
636
637#define CURSOR__CREATE 0x00000001UL
638#define CURSOR__CREATEGLYPH 0x00000002UL
639#define CURSOR__FREE 0x00000004UL
640#define CURSOR__ASSIGN 0x00000008UL
641#define CURSOR__SETATTR 0x00000010UL
642
643#define XCLIENT__KILL 0x00000001UL
644
645#define XINPUT__LOOKUP 0x00000001UL
646#define XINPUT__GETATTR 0x00000002UL
647#define XINPUT__SETATTR 0x00000004UL
648#define XINPUT__SETFOCUS 0x00000008UL
649#define XINPUT__WARPPOINTER 0x00000010UL
650#define XINPUT__ACTIVEGRAB 0x00000020UL
651#define XINPUT__PASSIVEGRAB 0x00000040UL
652#define XINPUT__UNGRAB 0x00000080UL
653#define XINPUT__BELL 0x00000100UL
654#define XINPUT__MOUSEMOTION 0x00000200UL
655#define XINPUT__RELABELINPUT 0x00000400UL
656
657#define XSERVER__SCREENSAVER 0x00000001UL
658#define XSERVER__GETHOSTLIST 0x00000002UL
659#define XSERVER__SETHOSTLIST 0x00000004UL
660#define XSERVER__GETFONTPATH 0x00000008UL
661#define XSERVER__SETFONTPATH 0x00000010UL
662#define XSERVER__GETATTR 0x00000020UL
663#define XSERVER__GRAB 0x00000040UL
664#define XSERVER__UNGRAB 0x00000080UL
665
666#define XEXTENSION__QUERY 0x00000001UL
667#define XEXTENSION__USE 0x00000002UL
668
669#define PAX__PAGEEXEC 0x00000001UL
670#define PAX__EMUTRAMP 0x00000002UL
671#define PAX__MPROTECT 0x00000004UL
672#define PAX__RANDMMAP 0x00000008UL
673#define PAX__RANDEXEC 0x00000010UL
674#define PAX__SEGMEXEC 0x00000020UL
675
676#define NETLINK_ROUTE_SOCKET__IOCTL 0x00000001UL 532#define NETLINK_ROUTE_SOCKET__IOCTL 0x00000001UL
677#define NETLINK_ROUTE_SOCKET__READ 0x00000002UL 533#define NETLINK_ROUTE_SOCKET__READ 0x00000002UL
678#define NETLINK_ROUTE_SOCKET__WRITE 0x00000004UL 534#define NETLINK_ROUTE_SOCKET__WRITE 0x00000004UL
@@ -695,10 +551,8 @@
695#define NETLINK_ROUTE_SOCKET__RECV_MSG 0x00080000UL 551#define NETLINK_ROUTE_SOCKET__RECV_MSG 0x00080000UL
696#define NETLINK_ROUTE_SOCKET__SEND_MSG 0x00100000UL 552#define NETLINK_ROUTE_SOCKET__SEND_MSG 0x00100000UL
697#define NETLINK_ROUTE_SOCKET__NAME_BIND 0x00200000UL 553#define NETLINK_ROUTE_SOCKET__NAME_BIND 0x00200000UL
698
699#define NETLINK_ROUTE_SOCKET__NLMSG_READ 0x00400000UL 554#define NETLINK_ROUTE_SOCKET__NLMSG_READ 0x00400000UL
700#define NETLINK_ROUTE_SOCKET__NLMSG_WRITE 0x00800000UL 555#define NETLINK_ROUTE_SOCKET__NLMSG_WRITE 0x00800000UL
701
702#define NETLINK_FIREWALL_SOCKET__IOCTL 0x00000001UL 556#define NETLINK_FIREWALL_SOCKET__IOCTL 0x00000001UL
703#define NETLINK_FIREWALL_SOCKET__READ 0x00000002UL 557#define NETLINK_FIREWALL_SOCKET__READ 0x00000002UL
704#define NETLINK_FIREWALL_SOCKET__WRITE 0x00000004UL 558#define NETLINK_FIREWALL_SOCKET__WRITE 0x00000004UL
@@ -721,10 +575,8 @@
721#define NETLINK_FIREWALL_SOCKET__RECV_MSG 0x00080000UL 575#define NETLINK_FIREWALL_SOCKET__RECV_MSG 0x00080000UL
722#define NETLINK_FIREWALL_SOCKET__SEND_MSG 0x00100000UL 576#define NETLINK_FIREWALL_SOCKET__SEND_MSG 0x00100000UL
723#define NETLINK_FIREWALL_SOCKET__NAME_BIND 0x00200000UL 577#define NETLINK_FIREWALL_SOCKET__NAME_BIND 0x00200000UL
724
725#define NETLINK_FIREWALL_SOCKET__NLMSG_READ 0x00400000UL 578#define NETLINK_FIREWALL_SOCKET__NLMSG_READ 0x00400000UL
726#define NETLINK_FIREWALL_SOCKET__NLMSG_WRITE 0x00800000UL 579#define NETLINK_FIREWALL_SOCKET__NLMSG_WRITE 0x00800000UL
727
728#define NETLINK_TCPDIAG_SOCKET__IOCTL 0x00000001UL 580#define NETLINK_TCPDIAG_SOCKET__IOCTL 0x00000001UL
729#define NETLINK_TCPDIAG_SOCKET__READ 0x00000002UL 581#define NETLINK_TCPDIAG_SOCKET__READ 0x00000002UL
730#define NETLINK_TCPDIAG_SOCKET__WRITE 0x00000004UL 582#define NETLINK_TCPDIAG_SOCKET__WRITE 0x00000004UL
@@ -747,10 +599,8 @@
747#define NETLINK_TCPDIAG_SOCKET__RECV_MSG 0x00080000UL 599#define NETLINK_TCPDIAG_SOCKET__RECV_MSG 0x00080000UL
748#define NETLINK_TCPDIAG_SOCKET__SEND_MSG 0x00100000UL 600#define NETLINK_TCPDIAG_SOCKET__SEND_MSG 0x00100000UL
749#define NETLINK_TCPDIAG_SOCKET__NAME_BIND 0x00200000UL 601#define NETLINK_TCPDIAG_SOCKET__NAME_BIND 0x00200000UL
750
751#define NETLINK_TCPDIAG_SOCKET__NLMSG_READ 0x00400000UL 602#define NETLINK_TCPDIAG_SOCKET__NLMSG_READ 0x00400000UL
752#define NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE 0x00800000UL 603#define NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE 0x00800000UL
753
754#define NETLINK_NFLOG_SOCKET__IOCTL 0x00000001UL 604#define NETLINK_NFLOG_SOCKET__IOCTL 0x00000001UL
755#define NETLINK_NFLOG_SOCKET__READ 0x00000002UL 605#define NETLINK_NFLOG_SOCKET__READ 0x00000002UL
756#define NETLINK_NFLOG_SOCKET__WRITE 0x00000004UL 606#define NETLINK_NFLOG_SOCKET__WRITE 0x00000004UL
@@ -773,7 +623,6 @@
773#define NETLINK_NFLOG_SOCKET__RECV_MSG 0x00080000UL 623#define NETLINK_NFLOG_SOCKET__RECV_MSG 0x00080000UL
774#define NETLINK_NFLOG_SOCKET__SEND_MSG 0x00100000UL 624#define NETLINK_NFLOG_SOCKET__SEND_MSG 0x00100000UL
775#define NETLINK_NFLOG_SOCKET__NAME_BIND 0x00200000UL 625#define NETLINK_NFLOG_SOCKET__NAME_BIND 0x00200000UL
776
777#define NETLINK_XFRM_SOCKET__IOCTL 0x00000001UL 626#define NETLINK_XFRM_SOCKET__IOCTL 0x00000001UL
778#define NETLINK_XFRM_SOCKET__READ 0x00000002UL 627#define NETLINK_XFRM_SOCKET__READ 0x00000002UL
779#define NETLINK_XFRM_SOCKET__WRITE 0x00000004UL 628#define NETLINK_XFRM_SOCKET__WRITE 0x00000004UL
@@ -796,10 +645,8 @@
796#define NETLINK_XFRM_SOCKET__RECV_MSG 0x00080000UL 645#define NETLINK_XFRM_SOCKET__RECV_MSG 0x00080000UL
797#define NETLINK_XFRM_SOCKET__SEND_MSG 0x00100000UL 646#define NETLINK_XFRM_SOCKET__SEND_MSG 0x00100000UL
798#define NETLINK_XFRM_SOCKET__NAME_BIND 0x00200000UL 647#define NETLINK_XFRM_SOCKET__NAME_BIND 0x00200000UL
799
800#define NETLINK_XFRM_SOCKET__NLMSG_READ 0x00400000UL 648#define NETLINK_XFRM_SOCKET__NLMSG_READ 0x00400000UL
801#define NETLINK_XFRM_SOCKET__NLMSG_WRITE 0x00800000UL 649#define NETLINK_XFRM_SOCKET__NLMSG_WRITE 0x00800000UL
802
803#define NETLINK_SELINUX_SOCKET__IOCTL 0x00000001UL 650#define NETLINK_SELINUX_SOCKET__IOCTL 0x00000001UL
804#define NETLINK_SELINUX_SOCKET__READ 0x00000002UL 651#define NETLINK_SELINUX_SOCKET__READ 0x00000002UL
805#define NETLINK_SELINUX_SOCKET__WRITE 0x00000004UL 652#define NETLINK_SELINUX_SOCKET__WRITE 0x00000004UL
@@ -822,7 +669,6 @@
822#define NETLINK_SELINUX_SOCKET__RECV_MSG 0x00080000UL 669#define NETLINK_SELINUX_SOCKET__RECV_MSG 0x00080000UL
823#define NETLINK_SELINUX_SOCKET__SEND_MSG 0x00100000UL 670#define NETLINK_SELINUX_SOCKET__SEND_MSG 0x00100000UL
824#define NETLINK_SELINUX_SOCKET__NAME_BIND 0x00200000UL 671#define NETLINK_SELINUX_SOCKET__NAME_BIND 0x00200000UL
825
826#define NETLINK_AUDIT_SOCKET__IOCTL 0x00000001UL 672#define NETLINK_AUDIT_SOCKET__IOCTL 0x00000001UL
827#define NETLINK_AUDIT_SOCKET__READ 0x00000002UL 673#define NETLINK_AUDIT_SOCKET__READ 0x00000002UL
828#define NETLINK_AUDIT_SOCKET__WRITE 0x00000004UL 674#define NETLINK_AUDIT_SOCKET__WRITE 0x00000004UL
@@ -845,12 +691,10 @@
845#define NETLINK_AUDIT_SOCKET__RECV_MSG 0x00080000UL 691#define NETLINK_AUDIT_SOCKET__RECV_MSG 0x00080000UL
846#define NETLINK_AUDIT_SOCKET__SEND_MSG 0x00100000UL 692#define NETLINK_AUDIT_SOCKET__SEND_MSG 0x00100000UL
847#define NETLINK_AUDIT_SOCKET__NAME_BIND 0x00200000UL 693#define NETLINK_AUDIT_SOCKET__NAME_BIND 0x00200000UL
848
849#define NETLINK_AUDIT_SOCKET__NLMSG_READ 0x00400000UL 694#define NETLINK_AUDIT_SOCKET__NLMSG_READ 0x00400000UL
850#define NETLINK_AUDIT_SOCKET__NLMSG_WRITE 0x00800000UL 695#define NETLINK_AUDIT_SOCKET__NLMSG_WRITE 0x00800000UL
851#define NETLINK_AUDIT_SOCKET__NLMSG_RELAY 0x01000000UL 696#define NETLINK_AUDIT_SOCKET__NLMSG_RELAY 0x01000000UL
852#define NETLINK_AUDIT_SOCKET__NLMSG_READPRIV 0x02000000UL 697#define NETLINK_AUDIT_SOCKET__NLMSG_READPRIV 0x02000000UL
853
854#define NETLINK_IP6FW_SOCKET__IOCTL 0x00000001UL 698#define NETLINK_IP6FW_SOCKET__IOCTL 0x00000001UL
855#define NETLINK_IP6FW_SOCKET__READ 0x00000002UL 699#define NETLINK_IP6FW_SOCKET__READ 0x00000002UL
856#define NETLINK_IP6FW_SOCKET__WRITE 0x00000004UL 700#define NETLINK_IP6FW_SOCKET__WRITE 0x00000004UL
@@ -873,10 +717,8 @@
873#define NETLINK_IP6FW_SOCKET__RECV_MSG 0x00080000UL 717#define NETLINK_IP6FW_SOCKET__RECV_MSG 0x00080000UL
874#define NETLINK_IP6FW_SOCKET__SEND_MSG 0x00100000UL 718#define NETLINK_IP6FW_SOCKET__SEND_MSG 0x00100000UL
875#define NETLINK_IP6FW_SOCKET__NAME_BIND 0x00200000UL 719#define NETLINK_IP6FW_SOCKET__NAME_BIND 0x00200000UL
876
877#define NETLINK_IP6FW_SOCKET__NLMSG_READ 0x00400000UL 720#define NETLINK_IP6FW_SOCKET__NLMSG_READ 0x00400000UL
878#define NETLINK_IP6FW_SOCKET__NLMSG_WRITE 0x00800000UL 721#define NETLINK_IP6FW_SOCKET__NLMSG_WRITE 0x00800000UL
879
880#define NETLINK_DNRT_SOCKET__IOCTL 0x00000001UL 722#define NETLINK_DNRT_SOCKET__IOCTL 0x00000001UL
881#define NETLINK_DNRT_SOCKET__READ 0x00000002UL 723#define NETLINK_DNRT_SOCKET__READ 0x00000002UL
882#define NETLINK_DNRT_SOCKET__WRITE 0x00000004UL 724#define NETLINK_DNRT_SOCKET__WRITE 0x00000004UL
@@ -899,24 +741,10 @@
899#define NETLINK_DNRT_SOCKET__RECV_MSG 0x00080000UL 741#define NETLINK_DNRT_SOCKET__RECV_MSG 0x00080000UL
900#define NETLINK_DNRT_SOCKET__SEND_MSG 0x00100000UL 742#define NETLINK_DNRT_SOCKET__SEND_MSG 0x00100000UL
901#define NETLINK_DNRT_SOCKET__NAME_BIND 0x00200000UL 743#define NETLINK_DNRT_SOCKET__NAME_BIND 0x00200000UL
902
903#define DBUS__ACQUIRE_SVC 0x00000001UL
904#define DBUS__SEND_MSG 0x00000002UL
905
906#define NSCD__GETPWD 0x00000001UL
907#define NSCD__GETGRP 0x00000002UL
908#define NSCD__GETHOST 0x00000004UL
909#define NSCD__GETSTAT 0x00000008UL
910#define NSCD__ADMIN 0x00000010UL
911#define NSCD__SHMEMPWD 0x00000020UL
912#define NSCD__SHMEMGRP 0x00000040UL
913#define NSCD__SHMEMHOST 0x00000080UL
914
915#define ASSOCIATION__SENDTO 0x00000001UL 744#define ASSOCIATION__SENDTO 0x00000001UL
916#define ASSOCIATION__RECVFROM 0x00000002UL 745#define ASSOCIATION__RECVFROM 0x00000002UL
917#define ASSOCIATION__SETCONTEXT 0x00000004UL 746#define ASSOCIATION__SETCONTEXT 0x00000004UL
918#define ASSOCIATION__POLMATCH 0x00000008UL 747#define ASSOCIATION__POLMATCH 0x00000008UL
919
920#define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL 748#define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL
921#define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL 749#define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL
922#define NETLINK_KOBJECT_UEVENT_SOCKET__WRITE 0x00000004UL 750#define NETLINK_KOBJECT_UEVENT_SOCKET__WRITE 0x00000004UL
@@ -939,7 +767,6 @@
939#define NETLINK_KOBJECT_UEVENT_SOCKET__RECV_MSG 0x00080000UL 767#define NETLINK_KOBJECT_UEVENT_SOCKET__RECV_MSG 0x00080000UL
940#define NETLINK_KOBJECT_UEVENT_SOCKET__SEND_MSG 0x00100000UL 768#define NETLINK_KOBJECT_UEVENT_SOCKET__SEND_MSG 0x00100000UL
941#define NETLINK_KOBJECT_UEVENT_SOCKET__NAME_BIND 0x00200000UL 769#define NETLINK_KOBJECT_UEVENT_SOCKET__NAME_BIND 0x00200000UL
942
943#define APPLETALK_SOCKET__IOCTL 0x00000001UL 770#define APPLETALK_SOCKET__IOCTL 0x00000001UL
944#define APPLETALK_SOCKET__READ 0x00000002UL 771#define APPLETALK_SOCKET__READ 0x00000002UL
945#define APPLETALK_SOCKET__WRITE 0x00000004UL 772#define APPLETALK_SOCKET__WRITE 0x00000004UL
@@ -962,11 +789,9 @@
962#define APPLETALK_SOCKET__RECV_MSG 0x00080000UL 789#define APPLETALK_SOCKET__RECV_MSG 0x00080000UL
963#define APPLETALK_SOCKET__SEND_MSG 0x00100000UL 790#define APPLETALK_SOCKET__SEND_MSG 0x00100000UL
964#define APPLETALK_SOCKET__NAME_BIND 0x00200000UL 791#define APPLETALK_SOCKET__NAME_BIND 0x00200000UL
965
966#define PACKET__SEND 0x00000001UL 792#define PACKET__SEND 0x00000001UL
967#define PACKET__RECV 0x00000002UL 793#define PACKET__RECV 0x00000002UL
968#define PACKET__RELABELTO 0x00000004UL 794#define PACKET__RELABELTO 0x00000004UL
969
970#define KEY__VIEW 0x00000001UL 795#define KEY__VIEW 0x00000001UL
971#define KEY__READ 0x00000002UL 796#define KEY__READ 0x00000002UL
972#define KEY__WRITE 0x00000004UL 797#define KEY__WRITE 0x00000004UL
@@ -974,10 +799,6 @@
974#define KEY__LINK 0x00000010UL 799#define KEY__LINK 0x00000010UL
975#define KEY__SETATTR 0x00000020UL 800#define KEY__SETATTR 0x00000020UL
976#define KEY__CREATE 0x00000040UL 801#define KEY__CREATE 0x00000040UL
977
978#define CONTEXT__TRANSLATE 0x00000001UL
979#define CONTEXT__CONTAINS 0x00000002UL
980
981#define DCCP_SOCKET__IOCTL 0x00000001UL 802#define DCCP_SOCKET__IOCTL 0x00000001UL
982#define DCCP_SOCKET__READ 0x00000002UL 803#define DCCP_SOCKET__READ 0x00000002UL
983#define DCCP_SOCKET__WRITE 0x00000004UL 804#define DCCP_SOCKET__WRITE 0x00000004UL
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h
index 9f3ebb1bfae6..378799068441 100644
--- a/security/selinux/include/class_to_string.h
+++ b/security/selinux/include/class_to_string.h
@@ -2,7 +2,7 @@
2/* 2/*
3 * Security object class definitions 3 * Security object class definitions
4 */ 4 */
5 S_("null") 5 S_(NULL)
6 S_("security") 6 S_("security")
7 S_("process") 7 S_("process")
8 S_("system") 8 S_("system")
@@ -32,19 +32,19 @@
32 S_("msgq") 32 S_("msgq")
33 S_("shm") 33 S_("shm")
34 S_("ipc") 34 S_("ipc")
35 S_("passwd") 35 S_(NULL)
36 S_("drawable") 36 S_(NULL)
37 S_("window") 37 S_(NULL)
38 S_("gc") 38 S_(NULL)
39 S_("font") 39 S_(NULL)
40 S_("colormap") 40 S_(NULL)
41 S_("property") 41 S_(NULL)
42 S_("cursor") 42 S_(NULL)
43 S_("xclient") 43 S_(NULL)
44 S_("xinput") 44 S_(NULL)
45 S_("xserver") 45 S_(NULL)
46 S_("xextension") 46 S_(NULL)
47 S_("pax") 47 S_(NULL)
48 S_("netlink_route_socket") 48 S_("netlink_route_socket")
49 S_("netlink_firewall_socket") 49 S_("netlink_firewall_socket")
50 S_("netlink_tcpdiag_socket") 50 S_("netlink_tcpdiag_socket")
@@ -54,12 +54,12 @@
54 S_("netlink_audit_socket") 54 S_("netlink_audit_socket")
55 S_("netlink_ip6fw_socket") 55 S_("netlink_ip6fw_socket")
56 S_("netlink_dnrt_socket") 56 S_("netlink_dnrt_socket")
57 S_("dbus") 57 S_(NULL)
58 S_("nscd") 58 S_(NULL)
59 S_("association") 59 S_("association")
60 S_("netlink_kobject_uevent_socket") 60 S_("netlink_kobject_uevent_socket")
61 S_("appletalk_socket") 61 S_("appletalk_socket")
62 S_("packet") 62 S_("packet")
63 S_("key") 63 S_("key")
64 S_("context") 64 S_(NULL)
65 S_("dccp_socket") 65 S_("dccp_socket")
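
The S_(NULL) substitutions above are not deletions in disguise: this header is
pulled into an array initializer through an S_() macro, so every line must
stay in place to keep array indices aligned with the SECCLASS_* values in
flask.h (note how the numbering there keeps its gaps, e.g. SECCLASS_IPC 29
followed by SECCLASS_NETLINK_ROUTE_SOCKET 43), and the new BUG_ON() in
avc_dump_query() then traps any lookup that lands on a hole. A sketch of the
include-as-data pattern; the exact macro plumbing in avc.c is an assumption:

	#include <stddef.h>

	#define S_(s) s,
	static const char *class_to_string[] = {
		S_(NULL)		/* index 0: no class */
		S_("security")		/* index 1: SECCLASS_SECURITY */
		/* ... in the kernel, the entries come from
		 * #include "class_to_string.h" ... */
	};
	#undef S_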
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h
index 67cef371ee00..35f309f47873 100644
--- a/security/selinux/include/flask.h
+++ b/security/selinux/include/flask.h
@@ -34,19 +34,6 @@
34#define SECCLASS_MSGQ 27 34#define SECCLASS_MSGQ 27
35#define SECCLASS_SHM 28 35#define SECCLASS_SHM 28
36#define SECCLASS_IPC 29 36#define SECCLASS_IPC 29
37#define SECCLASS_PASSWD 30
38#define SECCLASS_DRAWABLE 31
39#define SECCLASS_WINDOW 32
40#define SECCLASS_GC 33
41#define SECCLASS_FONT 34
42#define SECCLASS_COLORMAP 35
43#define SECCLASS_PROPERTY 36
44#define SECCLASS_CURSOR 37
45#define SECCLASS_XCLIENT 38
46#define SECCLASS_XINPUT 39
47#define SECCLASS_XSERVER 40
48#define SECCLASS_XEXTENSION 41
49#define SECCLASS_PAX 42
50#define SECCLASS_NETLINK_ROUTE_SOCKET 43 37#define SECCLASS_NETLINK_ROUTE_SOCKET 43
51#define SECCLASS_NETLINK_FIREWALL_SOCKET 44 38#define SECCLASS_NETLINK_FIREWALL_SOCKET 44
52#define SECCLASS_NETLINK_TCPDIAG_SOCKET 45 39#define SECCLASS_NETLINK_TCPDIAG_SOCKET 45
@@ -56,14 +43,11 @@
56#define SECCLASS_NETLINK_AUDIT_SOCKET 49 43#define SECCLASS_NETLINK_AUDIT_SOCKET 49
57#define SECCLASS_NETLINK_IP6FW_SOCKET 50 44#define SECCLASS_NETLINK_IP6FW_SOCKET 50
58#define SECCLASS_NETLINK_DNRT_SOCKET 51 45#define SECCLASS_NETLINK_DNRT_SOCKET 51
59#define SECCLASS_DBUS 52
60#define SECCLASS_NSCD 53
61#define SECCLASS_ASSOCIATION 54 46#define SECCLASS_ASSOCIATION 54
62#define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55 47#define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55
63#define SECCLASS_APPLETALK_SOCKET 56 48#define SECCLASS_APPLETALK_SOCKET 56
64#define SECCLASS_PACKET 57 49#define SECCLASS_PACKET 57
65#define SECCLASS_KEY 58 50#define SECCLASS_KEY 58
66#define SECCLASS_CONTEXT 59
67#define SECCLASS_DCCP_SOCKET 60 51#define SECCLASS_DCCP_SOCKET 60
68 52
69/* 53/*
diff --git a/security/selinux/include/selinux_netlabel.h b/security/selinux/include/netlabel.h
index 2a732c9033e3..218e3f77c350 100644
--- a/security/selinux/include/selinux_netlabel.h
+++ b/security/selinux/include/netlabel.h
@@ -38,19 +38,22 @@
38 38
39#ifdef CONFIG_NETLABEL 39#ifdef CONFIG_NETLABEL
40void selinux_netlbl_cache_invalidate(void); 40void selinux_netlbl_cache_invalidate(void);
41int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); 41
42int selinux_netlbl_socket_post_create(struct socket *sock);
43void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
44int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
45 struct sk_buff *skb,
46 struct avc_audit_data *ad);
47void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, 42void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec,
48 int family); 43 int family);
49void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, 44void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
50 int family); 45 int family);
51void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, 46void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
52 struct sk_security_struct *newssec); 47 struct sk_security_struct *newssec);
48
49int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid);
50
51void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
52int selinux_netlbl_socket_post_create(struct socket *sock);
53int selinux_netlbl_inode_permission(struct inode *inode, int mask); 53int selinux_netlbl_inode_permission(struct inode *inode, int mask);
54int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
55 struct sk_buff *skb,
56 struct avc_audit_data *ad);
54int selinux_netlbl_socket_setsockopt(struct socket *sock, 57int selinux_netlbl_socket_setsockopt(struct socket *sock,
55 int level, 58 int level,
56 int optname); 59 int optname);
@@ -60,59 +63,53 @@ static inline void selinux_netlbl_cache_invalidate(void)
60 return; 63 return;
61} 64}
62 65
63static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, 66static inline void selinux_netlbl_sk_security_reset(
64 u32 base_sid, 67 struct sk_security_struct *ssec,
65 u32 *sid) 68 int family)
66{ 69{
67 *sid = SECSID_NULL; 70 return;
68 return 0;
69} 71}
70 72static inline void selinux_netlbl_sk_security_init(
71static inline int selinux_netlbl_socket_post_create(struct socket *sock) 73 struct sk_security_struct *ssec,
74 int family)
72{ 75{
73 return 0; 76 return;
74} 77}
75 78static inline void selinux_netlbl_sk_security_clone(
76static inline void selinux_netlbl_sock_graft(struct sock *sk, 79 struct sk_security_struct *ssec,
77 struct socket *sock) 80 struct sk_security_struct *newssec)
78{ 81{
79 return; 82 return;
80} 83}
81 84
82static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, 85static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
83 struct sk_buff *skb, 86 u32 base_sid,
84 struct avc_audit_data *ad) 87 u32 *sid)
85{ 88{
89 *sid = SECSID_NULL;
86 return 0; 90 return 0;
87} 91}
88 92
89static inline void selinux_netlbl_sk_security_reset( 93static inline void selinux_netlbl_sock_graft(struct sock *sk,
90 struct sk_security_struct *ssec, 94 struct socket *sock)
91 int family)
92{
93 return;
94}
95
96static inline void selinux_netlbl_sk_security_init(
97 struct sk_security_struct *ssec,
98 int family)
99{ 95{
100 return; 96 return;
101} 97}
102 98static inline int selinux_netlbl_socket_post_create(struct socket *sock)
103static inline void selinux_netlbl_sk_security_clone(
104 struct sk_security_struct *ssec,
105 struct sk_security_struct *newssec)
106{ 99{
107 return; 100 return 0;
108} 101}
109
110static inline int selinux_netlbl_inode_permission(struct inode *inode, 102static inline int selinux_netlbl_inode_permission(struct inode *inode,
111 int mask) 103 int mask)
112{ 104{
113 return 0; 105 return 0;
114} 106}
115 107static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
108 struct sk_buff *skb,
109 struct avc_audit_data *ad)
110{
111 return 0;
112}
116static inline int selinux_netlbl_socket_setsockopt(struct socket *sock, 113static inline int selinux_netlbl_socket_setsockopt(struct socket *sock,
117 int level, 114 int level,
118 int optname) 115 int optname)
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 210eec77e7ff..b94378afea25 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -34,7 +34,7 @@
34#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS 34#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS
35#endif 35#endif
36 36
37struct sk_buff; 37struct netlbl_lsm_secattr;
38 38
39extern int selinux_enabled; 39extern int selinux_enabled;
40extern int selinux_mls_enabled; 40extern int selinux_mls_enabled;
@@ -82,8 +82,6 @@ int security_netif_sid(char *name, u32 *if_sid,
82int security_node_sid(u16 domain, void *addr, u32 addrlen, 82int security_node_sid(u16 domain, void *addr, u32 addrlen,
83 u32 *out_sid); 83 u32 *out_sid);
84 84
85void security_skb_extlbl_sid(struct sk_buff *skb, u32 base_sid, u32 *sid);
86
87int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, 85int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
88 u16 tclass); 86 u16 tclass);
89 87
@@ -102,5 +100,30 @@ int security_fs_use(const char *fstype, unsigned int *behavior,
102int security_genfs_sid(const char *fstype, char *name, u16 sclass, 100int security_genfs_sid(const char *fstype, char *name, u16 sclass,
103 u32 *sid); 101 u32 *sid);
104 102
103#ifdef CONFIG_NETLABEL
104int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
105 u32 base_sid,
106 u32 *sid);
107
108int security_netlbl_sid_to_secattr(u32 sid,
109 struct netlbl_lsm_secattr *secattr);
110#else
111static inline int security_netlbl_secattr_to_sid(
112 struct netlbl_lsm_secattr *secattr,
113 u32 base_sid,
114 u32 *sid)
115{
116 return -EIDRM;
117}
118
119static inline int security_netlbl_sid_to_secattr(u32 sid,
120 struct netlbl_lsm_secattr *secattr)
121{
122 return -ENOENT;
123}
124#endif /* CONFIG_NETLABEL */
125
126const char *security_get_initial_sid_context(u32 sid);
127
105#endif /* _SELINUX_SECURITY_H_ */ 128#endif /* _SELINUX_SECURITY_H_ */
106 129
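With the CONFIG_NETLABEL=n stubs above returning -EIDRM and -ENOENT (the
same codes the real conversion routines below use on their failure paths),
callers can stay config-agnostic. A minimal caller sketch, assuming a
secattr obtained elsewhere:

	u32 sid;
	int rc;

	rc = security_netlbl_secattr_to_sid(&secattr, SECINITSID_UNLABELED, &sid);
	if (rc != 0)
		sid = SECSID_NULL;	/* -EIDRM when NetLabel is compiled out */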
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
new file mode 100644
index 000000000000..bf8750791dd1
--- /dev/null
+++ b/security/selinux/netlabel.c
@@ -0,0 +1,363 @@
1/*
2 * SELinux NetLabel Support
3 *
4 * This file provides the necessary glue to tie NetLabel into the SELinux
5 * subsystem.
6 *
7 * Author: Paul Moore <paul.moore@hp.com>
8 *
9 */
10
11/*
12 * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
22 * the GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 *
28 */
29
30#include <linux/spinlock.h>
31#include <linux/rcupdate.h>
32#include <net/sock.h>
33#include <net/netlabel.h>
34
35#include "objsec.h"
36#include "security.h"
37
38/**
39 * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism
40 * @sock: the socket to label
41 * @sid: the SID to use
42 *
43 * Description:
44 * Attempt to label a socket using the NetLabel mechanism using the given
 45 * SID. Returns zero on success, negative values on failure. The caller
 46 * is responsible for calling rcu_read_lock() before calling this function
 47 * and rcu_read_unlock() after this function returns.
48 *
49 */
50static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid)
51{
52 int rc;
53 struct sk_security_struct *sksec = sock->sk->sk_security;
54 struct netlbl_lsm_secattr secattr;
55
56 rc = security_netlbl_sid_to_secattr(sid, &secattr);
57 if (rc != 0)
58 return rc;
59
60 rc = netlbl_socket_setattr(sock, &secattr);
61 if (rc == 0) {
62 spin_lock_bh(&sksec->nlbl_lock);
63 sksec->nlbl_state = NLBL_LABELED;
64 spin_unlock_bh(&sksec->nlbl_lock);
65 }
66
67 return rc;
68}
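A minimal caller sketch for the RCU contract described above; the concrete
callers later in this file, selinux_netlbl_socket_post_create() and
selinux_netlbl_inode_permission(), follow this same pattern:

	rcu_read_lock();
	if (sksec->nlbl_state == NLBL_REQUIRE)
		rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
	rcu_read_unlock();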
69
70/**
71 * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache
72 *
73 * Description:
74 * Invalidate the NetLabel security attribute mapping cache.
75 *
76 */
77void selinux_netlbl_cache_invalidate(void)
78{
79 netlbl_cache_invalidate();
80}
81
82/**
83 * selinux_netlbl_sk_security_reset - Reset the NetLabel fields
84 * @ssec: the sk_security_struct
85 * @family: the socket family
86 *
87 * Description:
88 * Called when the NetLabel state of a sk_security_struct needs to be reset.
 89 * The caller is responsible for all the NetLabel sk_security_struct locking.
90 *
91 */
92void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec,
93 int family)
94{
95 if (family == PF_INET)
96 ssec->nlbl_state = NLBL_REQUIRE;
97 else
98 ssec->nlbl_state = NLBL_UNSET;
99}
100
101/**
102 * selinux_netlbl_sk_security_init - Setup the NetLabel fields
103 * @ssec: the sk_security_struct
104 * @family: the socket family
105 *
106 * Description:
107 * Called when a new sk_security_struct is allocated to initialize the NetLabel
108 * fields.
109 *
110 */
111void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
112 int family)
113{
114 /* No locking needed, we are the only one who has access to ssec */
115 selinux_netlbl_sk_security_reset(ssec, family);
116 spin_lock_init(&ssec->nlbl_lock);
117}
118
119/**
120 * selinux_netlbl_sk_security_clone - Copy the NetLabel fields
121 * @ssec: the original sk_security_struct
122 * @newssec: the cloned sk_security_struct
123 *
124 * Description:
125 * Clone the NetLabel specific sk_security_struct fields from @ssec to
126 * @newssec.
127 *
128 */
129void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
130 struct sk_security_struct *newssec)
131{
132 /* We don't need to take newssec->nlbl_lock because we are the only
133 * thread with access to newssec, but we do need to take the RCU read
134 * lock as other threads could have access to ssec */
135 rcu_read_lock();
136 selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family);
137 newssec->sclass = ssec->sclass;
138 rcu_read_unlock();
139}
140
141/**
142 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
143 * @skb: the packet
144 * @base_sid: the SELinux SID to use as a context for MLS only attributes
145 * @sid: the SID
146 *
147 * Description:
148 * Call the NetLabel mechanism to get the security attributes of the given
149 * packet and use those attributes to determine the correct context/SID to
150 * assign to the packet. Returns zero on success, negative values on failure.
151 *
152 */
153int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
154{
155 int rc;
156 struct netlbl_lsm_secattr secattr;
157
158 netlbl_secattr_init(&secattr);
159 rc = netlbl_skbuff_getattr(skb, &secattr);
160 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
161 rc = security_netlbl_secattr_to_sid(&secattr,
162 base_sid,
163 sid);
164 else
165 *sid = SECSID_NULL;
166 netlbl_secattr_destroy(&secattr);
167
168 return rc;
169}
170
171/**
172 * selinux_netlbl_sock_graft - Netlabel the new socket
173 * @sk: the new connection
174 * @sock: the new socket
175 *
176 * Description:
177 * The connection represented by @sk is being grafted onto @sock so set the
178 * socket's NetLabel to match the SID of @sk.
179 *
180 */
181void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
182{
183 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
184 struct sk_security_struct *sksec = sk->sk_security;
185 struct netlbl_lsm_secattr secattr;
186 u32 nlbl_peer_sid;
187
188 sksec->sclass = isec->sclass;
189
190 rcu_read_lock();
191
192 if (sksec->nlbl_state != NLBL_REQUIRE) {
193 rcu_read_unlock();
194 return;
195 }
196
197 netlbl_secattr_init(&secattr);
198 if (netlbl_sock_getattr(sk, &secattr) == 0 &&
199 secattr.flags != NETLBL_SECATTR_NONE &&
200 security_netlbl_secattr_to_sid(&secattr,
201 SECINITSID_UNLABELED,
202 &nlbl_peer_sid) == 0)
203 sksec->peer_sid = nlbl_peer_sid;
204 netlbl_secattr_destroy(&secattr);
205
206 /* Try to set the NetLabel on the socket to save time later, if we fail
207 * here we will pick up the pieces in later calls to
208 * selinux_netlbl_inode_permission(). */
209 selinux_netlbl_socket_setsid(sock, sksec->sid);
210
211 rcu_read_unlock();
212}
213
214/**
215 * selinux_netlbl_socket_post_create - Label a socket using NetLabel
216 * @sock: the socket to label
217 *
218 * Description:
 219 * Attempt to label a socket with its SID using the NetLabel mechanism.
 220 * Returns zero on success, negative values on failure.
221 *
222 */
223int selinux_netlbl_socket_post_create(struct socket *sock)
224{
225 int rc = 0;
226 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
227 struct sk_security_struct *sksec = sock->sk->sk_security;
228
229 sksec->sclass = isec->sclass;
230
231 rcu_read_lock();
232 if (sksec->nlbl_state == NLBL_REQUIRE)
233 rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
234 rcu_read_unlock();
235
236 return rc;
237}
238
239/**
240 * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled
241 * @inode: the file descriptor's inode
242 * @mask: the permission mask
243 *
244 * Description:
245 * Looks at a file's inode and if it is marked as a socket protected by
246 * NetLabel then verify that the socket has been labeled, if not try to label
247 * the socket now with the inode's SID. Returns zero on success, negative
248 * values on failure.
249 *
250 */
251int selinux_netlbl_inode_permission(struct inode *inode, int mask)
252{
253 int rc;
254 struct sk_security_struct *sksec;
255 struct socket *sock;
256
257 if (!S_ISSOCK(inode->i_mode) ||
258 ((mask & (MAY_WRITE | MAY_APPEND)) == 0))
259 return 0;
260 sock = SOCKET_I(inode);
261 sksec = sock->sk->sk_security;
262
263 rcu_read_lock();
264 if (sksec->nlbl_state != NLBL_REQUIRE) {
265 rcu_read_unlock();
266 return 0;
267 }
268 local_bh_disable();
269 bh_lock_sock_nested(sock->sk);
270 rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
271 bh_unlock_sock(sock->sk);
272 local_bh_enable();
273 rcu_read_unlock();
274
275 return rc;
276}
277
278/**
279 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
280 * @sksec: the sock's sk_security_struct
281 * @skb: the packet
282 * @ad: the audit data
283 *
284 * Description:
285 * Fetch the NetLabel security attributes from @skb and perform an access check
286 * against the receiving socket. Returns zero on success, negative values on
287 * error.
288 *
289 */
290int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
291 struct sk_buff *skb,
292 struct avc_audit_data *ad)
293{
294 int rc;
295 u32 netlbl_sid;
296 u32 recv_perm;
297
298 rc = selinux_netlbl_skbuff_getsid(skb,
299 SECINITSID_UNLABELED,
300 &netlbl_sid);
301 if (rc != 0)
302 return rc;
303
304 if (netlbl_sid == SECSID_NULL)
305 return 0;
306
307 switch (sksec->sclass) {
308 case SECCLASS_UDP_SOCKET:
309 recv_perm = UDP_SOCKET__RECVFROM;
310 break;
311 case SECCLASS_TCP_SOCKET:
312 recv_perm = TCP_SOCKET__RECVFROM;
313 break;
314 default:
315 recv_perm = RAWIP_SOCKET__RECVFROM;
316 }
317
318 rc = avc_has_perm(sksec->sid,
319 netlbl_sid,
320 sksec->sclass,
321 recv_perm,
322 ad);
323 if (rc == 0)
324 return 0;
325
326 netlbl_skbuff_err(skb, rc);
327 return rc;
328}
329
330/**
331 * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel
332 * @sock: the socket
333 * @level: the socket level or protocol
334 * @optname: the socket option name
335 *
336 * Description:
 337 * Check the setsockopt() call; if the user is trying to replace the IP
 338 * options on a socket while a NetLabel is in place for the socket, deny
 339 * the access, otherwise allow it. Returns zero when the access is
340 * allowed, -EACCES when denied, and other negative values on error.
341 *
342 */
343int selinux_netlbl_socket_setsockopt(struct socket *sock,
344 int level,
345 int optname)
346{
347 int rc = 0;
348 struct sk_security_struct *sksec = sock->sk->sk_security;
349 struct netlbl_lsm_secattr secattr;
350
351 rcu_read_lock();
352 if (level == IPPROTO_IP && optname == IP_OPTIONS &&
353 sksec->nlbl_state == NLBL_LABELED) {
354 netlbl_secattr_init(&secattr);
355 rc = netlbl_socket_getattr(sock, &secattr);
356 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
357 rc = -EACCES;
358 netlbl_secattr_destroy(&secattr);
359 }
360 rcu_read_unlock();
361
362 return rc;
363}
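These helpers are glue for the LSM hooks in hooks.c (also touched by this
patch); a hedged sketch of the expected wiring, with the hook signature
assumed from the 2.6.20-era LSM API rather than taken from this diff:

	static int selinux_socket_post_create(struct socket *sock, int family,
					      int type, int protocol, int kern)
	{
		/* ... regular SELinux inode/sock labeling ... */
		return selinux_netlbl_socket_post_create(sock);
	}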
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 93b3177c7585..aca099aa2ed3 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -96,12 +96,18 @@ enum sel_inos {
96 SEL_COMMIT_BOOLS, /* commit new boolean values */ 96 SEL_COMMIT_BOOLS, /* commit new boolean values */
97 SEL_MLS, /* return if MLS policy is enabled */ 97 SEL_MLS, /* return if MLS policy is enabled */
98 SEL_DISABLE, /* disable SELinux until next reboot */ 98 SEL_DISABLE, /* disable SELinux until next reboot */
99 SEL_AVC, /* AVC management directory */
100 SEL_MEMBER, /* compute polyinstantiation membership decision */ 99 SEL_MEMBER, /* compute polyinstantiation membership decision */
101 SEL_CHECKREQPROT, /* check requested protection, not kernel-applied one */ 100 SEL_CHECKREQPROT, /* check requested protection, not kernel-applied one */
102 SEL_COMPAT_NET, /* whether to use old compat network packet controls */ 101 SEL_COMPAT_NET, /* whether to use old compat network packet controls */
102 SEL_INO_NEXT, /* The next inode number to use */
103}; 103};
104 104
105static unsigned long sel_last_ino = SEL_INO_NEXT - 1;
106
107#define SEL_INITCON_INO_OFFSET 0x01000000
108#define SEL_BOOL_INO_OFFSET 0x02000000
109#define SEL_INO_MASK 0x00ffffff
110
105#define TMPBUFLEN 12 111#define TMPBUFLEN 12
106static ssize_t sel_read_enforce(struct file *filp, char __user *buf, 112static ssize_t sel_read_enforce(struct file *filp, char __user *buf,
107 size_t count, loff_t *ppos) 113 size_t count, loff_t *ppos)
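The new numbering packs a namespace tag into the top byte of the inode
number and an index into the low 24 bits, replacing the old flat
BOOL_INO_OFFSET scheme. A worked example for boolean number 5:

	unsigned long ino = 5 | SEL_BOOL_INO_OFFSET;	/* 0x02000005 */
	unsigned long idx = ino & SEL_INO_MASK;		/* decodes back to 5 */
	/* initial-SID files use SEL_INITCON_INO_OFFSET (0x01000000) the same
	 * way, and dynamically created entries simply take ++sel_last_ino. */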
@@ -777,8 +783,6 @@ static struct inode *sel_make_inode(struct super_block *sb, int mode)
777 return ret; 783 return ret;
778} 784}
779 785
780#define BOOL_INO_OFFSET 30
781
782static ssize_t sel_read_bool(struct file *filep, char __user *buf, 786static ssize_t sel_read_bool(struct file *filep, char __user *buf,
783 size_t count, loff_t *ppos) 787 size_t count, loff_t *ppos)
784{ 788{
@@ -806,14 +810,14 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
806 } 810 }
807 811
808 inode = filep->f_path.dentry->d_inode; 812 inode = filep->f_path.dentry->d_inode;
 809 cur_enforcing = security_get_bool_value(inode->i_ino - BOOL_INO_OFFSET); 813 cur_enforcing = security_get_bool_value(inode->i_ino & SEL_INO_MASK);
810 if (cur_enforcing < 0) { 814 if (cur_enforcing < 0) {
811 ret = cur_enforcing; 815 ret = cur_enforcing;
812 goto out; 816 goto out;
813 } 817 }
814 818
815 length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, 819 length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing,
 816 bool_pending_values[inode->i_ino - BOOL_INO_OFFSET]); 820 bool_pending_values[inode->i_ino & SEL_INO_MASK]);
817 ret = simple_read_from_buffer(buf, count, ppos, page, length); 821 ret = simple_read_from_buffer(buf, count, ppos, page, length);
818out: 822out:
819 mutex_unlock(&sel_mutex); 823 mutex_unlock(&sel_mutex);
@@ -865,7 +869,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf,
865 new_value = 1; 869 new_value = 1;
866 870
867 inode = filep->f_path.dentry->d_inode; 871 inode = filep->f_path.dentry->d_inode;
 868 bool_pending_values[inode->i_ino - BOOL_INO_OFFSET] = new_value; 872 bool_pending_values[inode->i_ino & SEL_INO_MASK] = new_value;
869 length = count; 873 length = count;
870 874
871out: 875out:
@@ -1029,7 +1033,7 @@ static int sel_make_bools(void)
1029 isec->sid = sid; 1033 isec->sid = sid;
1030 isec->initialized = 1; 1034 isec->initialized = 1;
1031 inode->i_fop = &sel_bool_ops; 1035 inode->i_fop = &sel_bool_ops;
1032 inode->i_ino = i + BOOL_INO_OFFSET; 1036 inode->i_ino = i | SEL_BOOL_INO_OFFSET;
1033 d_add(dentry, inode); 1037 d_add(dentry, inode);
1034 } 1038 }
1035 bool_num = num; 1039 bool_num = num;
@@ -1234,6 +1238,56 @@ static int sel_make_avc_files(struct dentry *dir)
1234 goto out; 1238 goto out;
1235 } 1239 }
1236 inode->i_fop = files[i].ops; 1240 inode->i_fop = files[i].ops;
1241 inode->i_ino = ++sel_last_ino;
1242 d_add(dentry, inode);
1243 }
1244out:
1245 return ret;
1246}
1247
1248static ssize_t sel_read_initcon(struct file *file, char __user *buf,
1249 size_t count, loff_t *ppos)
1250{
1251 struct inode *inode;
1252 char *con;
1253 u32 sid, len;
1254 ssize_t ret;
1255
1256 inode = file->f_path.dentry->d_inode;
1257 sid = inode->i_ino & SEL_INO_MASK;
1258 ret = security_sid_to_context(sid, &con, &len);
1259 if (ret < 0)
1260 return ret;
1261
1262 ret = simple_read_from_buffer(buf, count, ppos, con, len);
1263 kfree(con);
1264 return ret;
1265}
1266
1267static const struct file_operations sel_initcon_ops = {
1268 .read = sel_read_initcon,
1269};
1270
1271static int sel_make_initcon_files(struct dentry *dir)
1272{
1273 int i, ret = 0;
1274
1275 for (i = 1; i <= SECINITSID_NUM; i++) {
1276 struct inode *inode;
1277 struct dentry *dentry;
1278 dentry = d_alloc_name(dir, security_get_initial_sid_context(i));
1279 if (!dentry) {
1280 ret = -ENOMEM;
1281 goto out;
1282 }
1283
1284 inode = sel_make_inode(dir->d_sb, S_IFREG|S_IRUGO);
1285 if (!inode) {
1286 ret = -ENOMEM;
1287 goto out;
1288 }
1289 inode->i_fop = &sel_initcon_ops;
1290 inode->i_ino = i | SEL_INITCON_INO_OFFSET;
1237 d_add(dentry, inode); 1291 d_add(dentry, inode);
1238 } 1292 }
1239out: 1293out:
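The files created by sel_make_initcon_files() expose one read-only node per
initial SID under the new "initial_contexts" directory. A minimal userspace
reader, assuming selinuxfs is mounted at /selinux ("kernel" being one of the
initial SID names from initial_sid_to_string):

	#include <stdio.h>

	int main(void)
	{
		char con[256];
		FILE *f = fopen("/selinux/initial_contexts/kernel", "r");

		if (!f)
			return 1;
		if (fgets(con, sizeof(con), f))
			printf("kernel context: %s\n", con);
		fclose(f);
		return 0;
	}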
@@ -1252,6 +1306,7 @@ static int sel_make_dir(struct inode *dir, struct dentry *dentry)
1252 } 1306 }
1253 inode->i_op = &simple_dir_inode_operations; 1307 inode->i_op = &simple_dir_inode_operations;
1254 inode->i_fop = &simple_dir_operations; 1308 inode->i_fop = &simple_dir_operations;
1309 inode->i_ino = ++sel_last_ino;
1255 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 1310 /* directory inodes start off with i_nlink == 2 (for "." entry) */
1256 inc_nlink(inode); 1311 inc_nlink(inode);
1257 d_add(dentry, inode); 1312 d_add(dentry, inode);
@@ -1314,6 +1369,7 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
1314 ret = -ENOMEM; 1369 ret = -ENOMEM;
1315 goto err; 1370 goto err;
1316 } 1371 }
1372 inode->i_ino = ++sel_last_ino;
1317 isec = (struct inode_security_struct*)inode->i_security; 1373 isec = (struct inode_security_struct*)inode->i_security;
1318 isec->sid = SECINITSID_DEVNULL; 1374 isec->sid = SECINITSID_DEVNULL;
1319 isec->sclass = SECCLASS_CHR_FILE; 1375 isec->sclass = SECCLASS_CHR_FILE;
@@ -1336,6 +1392,21 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
1336 ret = sel_make_avc_files(dentry); 1392 ret = sel_make_avc_files(dentry);
1337 if (ret) 1393 if (ret)
1338 goto err; 1394 goto err;
1395
1396 dentry = d_alloc_name(sb->s_root, "initial_contexts");
1397 if (!dentry) {
1398 ret = -ENOMEM;
1399 goto err;
1400 }
1401
1402 ret = sel_make_dir(root_inode, dentry);
1403 if (ret)
1404 goto err;
1405
1406 ret = sel_make_initcon_files(dentry);
1407 if (ret)
1408 goto err;
1409
1339out: 1410out:
1340 return ret; 1411 return ret;
1341err: 1412err:
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 1e52356664d6..40660ffd49b6 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -39,7 +39,6 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/audit.h> 40#include <linux/audit.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <net/sock.h>
43#include <net/netlabel.h> 42#include <net/netlabel.h>
44 43
45#include "flask.h" 44#include "flask.h"
@@ -53,7 +52,7 @@
53#include "conditional.h" 52#include "conditional.h"
54#include "mls.h" 53#include "mls.h"
55#include "objsec.h" 54#include "objsec.h"
56#include "selinux_netlabel.h" 55#include "netlabel.h"
57#include "xfrm.h" 56#include "xfrm.h"
58#include "ebitmap.h" 57#include "ebitmap.h"
59 58
@@ -594,6 +593,13 @@ static int context_struct_to_string(struct context *context, char **scontext, u32 *scontext_len)
594 593
595#include "initial_sid_to_string.h" 594#include "initial_sid_to_string.h"
596 595
596const char *security_get_initial_sid_context(u32 sid)
597{
598 if (unlikely(sid > SECINITSID_NUM))
599 return NULL;
600 return initial_sid_to_string[sid];
601}
602
597/** 603/**
598 * security_sid_to_context - Obtain a context for a given SID. 604 * security_sid_to_context - Obtain a context for a given SID.
599 * @sid: security identifier, SID 605 * @sid: security identifier, SID
@@ -1050,6 +1056,8 @@ static int validate_classes(struct policydb *p)
1050 1056
1051 for (i = 1; i < kdefs->cts_len; i++) { 1057 for (i = 1; i < kdefs->cts_len; i++) {
1052 def_class = kdefs->class_to_string[i]; 1058 def_class = kdefs->class_to_string[i];
1059 if (!def_class)
1060 continue;
1053 if (i > p->p_classes.nprim) { 1061 if (i > p->p_classes.nprim) {
1054 printk(KERN_INFO 1062 printk(KERN_INFO
1055 "security: class %s not defined in policy\n", 1063 "security: class %s not defined in policy\n",
@@ -1249,6 +1257,7 @@ bad:
1249} 1257}
1250 1258
1251extern void selinux_complete_init(void); 1259extern void selinux_complete_init(void);
1260static int security_preserve_bools(struct policydb *p);
1252 1261
1253/** 1262/**
1254 * security_load_policy - Load a security policy configuration. 1263 * security_load_policy - Load a security policy configuration.
@@ -1325,6 +1334,12 @@ int security_load_policy(void *data, size_t len)
1325 goto err; 1334 goto err;
1326 } 1335 }
1327 1336
1337 rc = security_preserve_bools(&newpolicydb);
1338 if (rc) {
1339 printk(KERN_ERR "security: unable to preserve booleans\n");
1340 goto err;
1341 }
1342
1328 /* Clone the SID table. */ 1343 /* Clone the SID table. */
1329 sidtab_shutdown(&sidtab); 1344 sidtab_shutdown(&sidtab);
1330 if (sidtab_map(&sidtab, clone_sid, &newsidtab)) { 1345 if (sidtab_map(&sidtab, clone_sid, &newsidtab)) {
@@ -1882,6 +1897,37 @@ out:
1882 return rc; 1897 return rc;
1883} 1898}
1884 1899
1900static int security_preserve_bools(struct policydb *p)
1901{
1902 int rc, nbools = 0, *bvalues = NULL, i;
1903 char **bnames = NULL;
1904 struct cond_bool_datum *booldatum;
1905 struct cond_node *cur;
1906
1907 rc = security_get_bools(&nbools, &bnames, &bvalues);
1908 if (rc)
1909 goto out;
1910 for (i = 0; i < nbools; i++) {
1911 booldatum = hashtab_search(p->p_bools.table, bnames[i]);
1912 if (booldatum)
1913 booldatum->state = bvalues[i];
1914 }
1915 for (cur = p->cond_list; cur != NULL; cur = cur->next) {
1916 rc = evaluate_cond_node(p, cur);
1917 if (rc)
1918 goto out;
1919 }
1920
1921out:
1922 if (bnames) {
1923 for (i = 0; i < nbools; i++)
1924 kfree(bnames[i]);
1925 }
1926 kfree(bnames);
1927 kfree(bvalues);
1928 return rc;
1929}
1930
1885/* 1931/*
1886 * security_sid_mls_copy() - computes a new sid based on the given 1932 * security_sid_mls_copy() - computes a new sid based on the given
1887 * sid and the mls portion of mls_sid. 1933 * sid and the mls portion of mls_sid.
@@ -2198,41 +2244,15 @@ void selinux_audit_set_callback(int (*callback)(void))
2198 aurule_callback = callback; 2244 aurule_callback = callback;
2199} 2245}
2200 2246
2201/**
2202 * security_skb_extlbl_sid - Determine the external label of a packet
2203 * @skb: the packet
2204 * @base_sid: the SELinux SID to use as a context for MLS only external labels
2205 * @sid: the packet's SID
2206 *
2207 * Description:
2208 * Check the various different forms of external packet labeling and determine
2209 * the external SID for the packet.
2210 *
2211 */
2212void security_skb_extlbl_sid(struct sk_buff *skb, u32 base_sid, u32 *sid)
2213{
2214 u32 xfrm_sid;
2215 u32 nlbl_sid;
2216
2217 selinux_skb_xfrm_sid(skb, &xfrm_sid);
2218 if (selinux_netlbl_skbuff_getsid(skb,
2219 (xfrm_sid == SECSID_NULL ?
2220 base_sid : xfrm_sid),
2221 &nlbl_sid) != 0)
2222 nlbl_sid = SECSID_NULL;
2223
2224 *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid);
2225}
2226
2227#ifdef CONFIG_NETLABEL 2247#ifdef CONFIG_NETLABEL
2228/* 2248/*
2229 * This is the structure we store inside the NetLabel cache block. 2249 * NetLabel cache structure
2230 */ 2250 */
2231#define NETLBL_CACHE(x) ((struct netlbl_cache *)(x)) 2251#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x))
2232#define NETLBL_CACHE_T_NONE 0 2252#define NETLBL_CACHE_T_NONE 0
2233#define NETLBL_CACHE_T_SID 1 2253#define NETLBL_CACHE_T_SID 1
2234#define NETLBL_CACHE_T_MLS 2 2254#define NETLBL_CACHE_T_MLS 2
2235struct netlbl_cache { 2255struct selinux_netlbl_cache {
2236 u32 type; 2256 u32 type;
2237 union { 2257 union {
2238 u32 sid; 2258 u32 sid;
@@ -2241,7 +2261,7 @@ struct netlbl_cache {
2241}; 2261};
2242 2262
2243/** 2263/**
2244 * selinux_netlbl_cache_free - Free the NetLabel cached data 2264 * security_netlbl_cache_free - Free the NetLabel cached data
2245 * @data: the data to free 2265 * @data: the data to free
2246 * 2266 *
2247 * Description: 2267 * Description:
@@ -2249,9 +2269,9 @@ struct netlbl_cache {
2249 * netlbl_lsm_cache structure. 2269 * netlbl_lsm_cache structure.
2250 * 2270 *
2251 */ 2271 */
2252static void selinux_netlbl_cache_free(const void *data) 2272static void security_netlbl_cache_free(const void *data)
2253{ 2273{
2254 struct netlbl_cache *cache; 2274 struct selinux_netlbl_cache *cache;
2255 2275
2256 if (data == NULL) 2276 if (data == NULL)
2257 return; 2277 return;
@@ -2266,33 +2286,33 @@ static void selinux_netlbl_cache_free(const void *data)
2266} 2286}
2267 2287
2268/** 2288/**
2269 * selinux_netlbl_cache_add - Add an entry to the NetLabel cache 2289 * security_netlbl_cache_add - Add an entry to the NetLabel cache
2270 * @skb: the packet 2290 * @secattr: the NetLabel packet security attributes
2271 * @ctx: the SELinux context 2291 * @ctx: the SELinux context
2272 * 2292 *
2273 * Description: 2293 * Description:
2274 * Attempt to cache the context in @ctx, which was derived from the packet in 2294 * Attempt to cache the context in @ctx, which was derived from the packet in
2275 * @skb, in the NetLabel subsystem cache. 2295 * @skb, in the NetLabel subsystem cache. This function assumes @secattr has
2296 * already been initialized.
2276 * 2297 *
2277 */ 2298 */
2278static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx) 2299static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
2300 struct context *ctx)
2279{ 2301{
2280 struct netlbl_cache *cache = NULL; 2302 struct selinux_netlbl_cache *cache = NULL;
2281 struct netlbl_lsm_secattr secattr;
2282 2303
2283 netlbl_secattr_init(&secattr); 2304 secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
2284 secattr.cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); 2305 if (secattr->cache == NULL)
2285 if (secattr.cache == NULL) 2306 return;
2286 goto netlbl_cache_add_return;
2287 2307
2288 cache = kzalloc(sizeof(*cache), GFP_ATOMIC); 2308 cache = kzalloc(sizeof(*cache), GFP_ATOMIC);
2289 if (cache == NULL) 2309 if (cache == NULL)
2290 goto netlbl_cache_add_return; 2310 return;
2291 2311
2292 cache->type = NETLBL_CACHE_T_MLS; 2312 cache->type = NETLBL_CACHE_T_MLS;
2293 if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, 2313 if (ebitmap_cpy(&cache->data.mls_label.level[0].cat,
2294 &ctx->range.level[0].cat) != 0) 2314 &ctx->range.level[0].cat) != 0)
2295 goto netlbl_cache_add_return; 2315 return;
2296 cache->data.mls_label.level[1].cat.highbit = 2316 cache->data.mls_label.level[1].cat.highbit =
2297 cache->data.mls_label.level[0].cat.highbit; 2317 cache->data.mls_label.level[0].cat.highbit;
2298 cache->data.mls_label.level[1].cat.node = 2318 cache->data.mls_label.level[1].cat.node =
@@ -2300,52 +2320,40 @@ static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx)
2300 cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; 2320 cache->data.mls_label.level[0].sens = ctx->range.level[0].sens;
2301 cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; 2321 cache->data.mls_label.level[1].sens = ctx->range.level[0].sens;
2302 2322
2303 secattr.cache->free = selinux_netlbl_cache_free; 2323 secattr->cache->free = security_netlbl_cache_free;
2304 secattr.cache->data = (void *)cache; 2324 secattr->cache->data = (void *)cache;
2305 secattr.flags = NETLBL_SECATTR_CACHE; 2325 secattr->flags |= NETLBL_SECATTR_CACHE;
2306
2307 netlbl_cache_add(skb, &secattr);
2308
2309netlbl_cache_add_return:
2310 netlbl_secattr_destroy(&secattr);
2311} 2326}
2312 2327
2313/** 2328/**
2314 * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache 2329 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
2315 *
2316 * Description:
2317 * Invalidate the NetLabel security attribute mapping cache.
2318 *
2319 */
2320void selinux_netlbl_cache_invalidate(void)
2321{
2322 netlbl_cache_invalidate();
2323}
2324
2325/**
2326 * selinux_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
2327 * @skb: the network packet
2328 * @secattr: the NetLabel packet security attributes 2330 * @secattr: the NetLabel packet security attributes
2329 * @base_sid: the SELinux SID to use as a context for MLS only attributes 2331 * @base_sid: the SELinux SID to use as a context for MLS only attributes
2330 * @sid: the SELinux SID 2332 * @sid: the SELinux SID
2331 * 2333 *
2332 * Description: 2334 * Description:
2333 * Convert the given NetLabel packet security attributes in @secattr into a 2335 * Convert the given NetLabel security attributes in @secattr into a
2334 * SELinux SID. If the @secattr field does not contain a full SELinux 2336 * SELinux SID. If the @secattr field does not contain a full SELinux
2335 * SID/context then use the context in @base_sid as the foundation. If @skb 2337 * SID/context then use the context in @base_sid as the foundation. If
2336 * is not NULL attempt to cache as much data as possibile. Returns zero on 2338 * possible the 'cache' field of @secattr is set and the CACHE flag is set;
2337 * success, negative values on failure. 2339 * this is to allow the @secattr to be used by NetLabel to cache the secattr to
2340 * SID conversion for future lookups. Returns zero on success, negative
2341 * values on failure.
2338 * 2342 *
2339 */ 2343 */
2340static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb, 2344int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2341 struct netlbl_lsm_secattr *secattr, 2345 u32 base_sid,
2342 u32 base_sid, 2346 u32 *sid)
2343 u32 *sid)
2344{ 2347{
2345 int rc = -EIDRM; 2348 int rc = -EIDRM;
2346 struct context *ctx; 2349 struct context *ctx;
2347 struct context ctx_new; 2350 struct context ctx_new;
2348 struct netlbl_cache *cache; 2351 struct selinux_netlbl_cache *cache;
2352
2353 if (!ss_initialized) {
2354 *sid = SECSID_NULL;
2355 return 0;
2356 }
2349 2357
2350 POLICY_RDLOCK; 2358 POLICY_RDLOCK;
2351 2359
@@ -2410,8 +2418,8 @@ static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb,
2410 if (rc != 0) 2418 if (rc != 0)
2411 goto netlbl_secattr_to_sid_return_cleanup; 2419 goto netlbl_secattr_to_sid_return_cleanup;
2412 2420
2413 if (skb != NULL) 2421 security_netlbl_cache_add(secattr, &ctx_new);
2414 selinux_netlbl_cache_add(skb, &ctx_new); 2422
2415 ebitmap_destroy(&ctx_new.range.level[0].cat); 2423 ebitmap_destroy(&ctx_new.range.level[0].cat);
2416 } else { 2424 } else {
2417 *sid = SECSID_NULL; 2425 *sid = SECSID_NULL;
@@ -2427,338 +2435,43 @@ netlbl_secattr_to_sid_return_cleanup:
2427} 2435}
2428 2436
2429/** 2437/**
2430 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel 2438 * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr
2431 * @skb: the packet 2439 * @sid: the SELinux SID
2432 * @base_sid: the SELinux SID to use as a context for MLS only attributes 2440 * @secattr: the NetLabel packet security attributes
2433 * @sid: the SID
2434 *
2435 * Description:
2436 * Call the NetLabel mechanism to get the security attributes of the given
2437 * packet and use those attributes to determine the correct context/SID to
2438 * assign to the packet. Returns zero on success, negative values on failure.
2439 *
2440 */
2441int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
2442{
2443 int rc;
2444 struct netlbl_lsm_secattr secattr;
2445
2446 netlbl_secattr_init(&secattr);
2447 rc = netlbl_skbuff_getattr(skb, &secattr);
2448 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
2449 rc = selinux_netlbl_secattr_to_sid(skb,
2450 &secattr,
2451 base_sid,
2452 sid);
2453 else
2454 *sid = SECSID_NULL;
2455 netlbl_secattr_destroy(&secattr);
2456
2457 return rc;
2458}
2459
2460/**
2461 * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism
2462 * @sock: the socket to label
2463 * @sid: the SID to use
2464 * 2441 *
2465 * Description: 2442 * Description:
2466 * Attempt to label a socket using the NetLabel mechanism using the given 2443 * Convert the given SELinux SID in @sid into a NetLabel security attribute.
2467 * SID. Returns zero values on success, negative values on failure. The 2444 * Returns zero on success, negative values on failure.
2468 * caller is responsibile for calling rcu_read_lock() before calling this
2469 * this function and rcu_read_unlock() after this function returns.
2470 * 2445 *
2471 */ 2446 */
2472static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid) 2447int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr)
2473{ 2448{
2474 int rc = -ENOENT; 2449 int rc = -ENOENT;
2475 struct sk_security_struct *sksec = sock->sk->sk_security;
2476 struct netlbl_lsm_secattr secattr;
2477 struct context *ctx; 2450 struct context *ctx;
2478 2451
2452 netlbl_secattr_init(secattr);
2453
2479 if (!ss_initialized) 2454 if (!ss_initialized)
2480 return 0; 2455 return 0;
2481 2456
2482 netlbl_secattr_init(&secattr);
2483
2484 POLICY_RDLOCK; 2457 POLICY_RDLOCK;
2485
2486 ctx = sidtab_search(&sidtab, sid); 2458 ctx = sidtab_search(&sidtab, sid);
2487 if (ctx == NULL) 2459 if (ctx == NULL)
2488 goto netlbl_socket_setsid_return; 2460 goto netlbl_sid_to_secattr_failure;
2489 2461 secattr->domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1],
2490 secattr.domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1], 2462 GFP_ATOMIC);
2491 GFP_ATOMIC); 2463 secattr->flags |= NETLBL_SECATTR_DOMAIN;
2492 secattr.flags |= NETLBL_SECATTR_DOMAIN; 2464 mls_export_netlbl_lvl(ctx, secattr);
2493 mls_export_netlbl_lvl(ctx, &secattr); 2465 rc = mls_export_netlbl_cat(ctx, secattr);
2494 rc = mls_export_netlbl_cat(ctx, &secattr);
2495 if (rc != 0) 2466 if (rc != 0)
2496 goto netlbl_socket_setsid_return; 2467 goto netlbl_sid_to_secattr_failure;
2497
2498 rc = netlbl_socket_setattr(sock, &secattr);
2499 if (rc == 0) {
2500 spin_lock_bh(&sksec->nlbl_lock);
2501 sksec->nlbl_state = NLBL_LABELED;
2502 spin_unlock_bh(&sksec->nlbl_lock);
2503 }
2504
2505netlbl_socket_setsid_return:
2506 POLICY_RDUNLOCK; 2468 POLICY_RDUNLOCK;
2507 netlbl_secattr_destroy(&secattr);
2508 return rc;
2509}
2510
2511/**
2512 * selinux_netlbl_sk_security_reset - Reset the NetLabel fields
2513 * @ssec: the sk_security_struct
2514 * @family: the socket family
2515 *
2516 * Description:
2517 * Called when the NetLabel state of a sk_security_struct needs to be reset.
2518 * The caller is responsibile for all the NetLabel sk_security_struct locking.
2519 *
2520 */
2521void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec,
2522 int family)
2523{
2524 if (family == PF_INET)
2525 ssec->nlbl_state = NLBL_REQUIRE;
2526 else
2527 ssec->nlbl_state = NLBL_UNSET;
2528}
2529 2469
2530/** 2470 return 0;
2531 * selinux_netlbl_sk_security_init - Setup the NetLabel fields
2532 * @ssec: the sk_security_struct
2533 * @family: the socket family
2534 *
2535 * Description:
2536 * Called when a new sk_security_struct is allocated to initialize the NetLabel
2537 * fields.
2538 *
2539 */
2540void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
2541 int family)
2542{
2543 /* No locking needed, we are the only one who has access to ssec */
2544 selinux_netlbl_sk_security_reset(ssec, family);
2545 spin_lock_init(&ssec->nlbl_lock);
2546}
2547
2548/**
2549 * selinux_netlbl_sk_security_clone - Copy the NetLabel fields
2550 * @ssec: the original sk_security_struct
2551 * @newssec: the cloned sk_security_struct
2552 *
2553 * Description:
2554 * Clone the NetLabel specific sk_security_struct fields from @ssec to
2555 * @newssec.
2556 *
2557 */
2558void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
2559 struct sk_security_struct *newssec)
2560{
2561 /* We don't need to take newssec->nlbl_lock because we are the only
2562 * thread with access to newssec, but we do need to take the RCU read
2563 * lock as other threads could have access to ssec */
2564 rcu_read_lock();
2565 selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family);
2566 newssec->sclass = ssec->sclass;
2567 rcu_read_unlock();
2568}
2569
2570/**
2571 * selinux_netlbl_socket_post_create - Label a socket using NetLabel
2572 * @sock: the socket to label
2573 *
2574 * Description:
2575 * Attempt to label a socket using the NetLabel mechanism using the given
2576 * SID. Returns zero values on success, negative values on failure.
2577 *
2578 */
2579int selinux_netlbl_socket_post_create(struct socket *sock)
2580{
2581 int rc = 0;
2582 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
2583 struct sk_security_struct *sksec = sock->sk->sk_security;
2584
2585 sksec->sclass = isec->sclass;
2586
2587 rcu_read_lock();
2588 if (sksec->nlbl_state == NLBL_REQUIRE)
2589 rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
2590 rcu_read_unlock();
2591
2592 return rc;
2593}
2594
2595/**
2596 * selinux_netlbl_sock_graft - Netlabel the new socket
2597 * @sk: the new connection
2598 * @sock: the new socket
2599 *
2600 * Description:
2601 * The connection represented by @sk is being grafted onto @sock so set the
2602 * socket's NetLabel to match the SID of @sk.
2603 *
2604 */
2605void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
2606{
2607 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
2608 struct sk_security_struct *sksec = sk->sk_security;
2609 struct netlbl_lsm_secattr secattr;
2610 u32 nlbl_peer_sid;
2611
2612 sksec->sclass = isec->sclass;
2613
2614 rcu_read_lock();
2615
2616 if (sksec->nlbl_state != NLBL_REQUIRE) {
2617 rcu_read_unlock();
2618 return;
2619 }
2620
2621 netlbl_secattr_init(&secattr);
2622 if (netlbl_sock_getattr(sk, &secattr) == 0 &&
2623 secattr.flags != NETLBL_SECATTR_NONE &&
2624 selinux_netlbl_secattr_to_sid(NULL,
2625 &secattr,
2626 SECINITSID_UNLABELED,
2627 &nlbl_peer_sid) == 0)
2628 sksec->peer_sid = nlbl_peer_sid;
2629 netlbl_secattr_destroy(&secattr);
2630
2631 /* Try to set the NetLabel on the socket to save time later, if we fail
2632 * here we will pick up the pieces in later calls to
2633 * selinux_netlbl_inode_permission(). */
2634 selinux_netlbl_socket_setsid(sock, sksec->sid);
2635
2636 rcu_read_unlock();
2637}
2638
2639/**
2640 * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled
2641 * @inode: the file descriptor's inode
2642 * @mask: the permission mask
2643 *
2644 * Description:
2645 * Looks at a file's inode and if it is marked as a socket protected by
2646 * NetLabel then verify that the socket has been labeled, if not try to label
2647 * the socket now with the inode's SID. Returns zero on success, negative
2648 * values on failure.
2649 *
2650 */
2651int selinux_netlbl_inode_permission(struct inode *inode, int mask)
2652{
2653 int rc;
2654 struct sk_security_struct *sksec;
2655 struct socket *sock;
2656
2657 if (!S_ISSOCK(inode->i_mode) ||
2658 ((mask & (MAY_WRITE | MAY_APPEND)) == 0))
2659 return 0;
2660 sock = SOCKET_I(inode);
2661 sksec = sock->sk->sk_security;
2662
2663 rcu_read_lock();
2664 if (sksec->nlbl_state != NLBL_REQUIRE) {
2665 rcu_read_unlock();
2666 return 0;
2667 }
2668 local_bh_disable();
2669 bh_lock_sock_nested(sock->sk);
2670 rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
2671 bh_unlock_sock(sock->sk);
2672 local_bh_enable();
2673 rcu_read_unlock();
2674
2675 return rc;
2676}
2677
2678/**
2679 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
2680 * @sksec: the sock's sk_security_struct
2681 * @skb: the packet
2682 * @ad: the audit data
2683 *
2684 * Description:
2685 * Fetch the NetLabel security attributes from @skb and perform an access check
2686 * against the receiving socket. Returns zero on success, negative values on
2687 * error.
2688 *
2689 */
2690int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
2691 struct sk_buff *skb,
2692 struct avc_audit_data *ad)
2693{
2694 int rc;
2695 u32 netlbl_sid;
2696 u32 recv_perm;
2697
2698 rc = selinux_netlbl_skbuff_getsid(skb,
2699 SECINITSID_UNLABELED,
2700 &netlbl_sid);
2701 if (rc != 0)
2702 return rc;
2703
2704 if (netlbl_sid == SECSID_NULL)
2705 return 0;
2706
2707 switch (sksec->sclass) {
2708 case SECCLASS_UDP_SOCKET:
2709 recv_perm = UDP_SOCKET__RECVFROM;
2710 break;
2711 case SECCLASS_TCP_SOCKET:
2712 recv_perm = TCP_SOCKET__RECVFROM;
2713 break;
2714 default:
2715 recv_perm = RAWIP_SOCKET__RECVFROM;
2716 }
2717
2718 rc = avc_has_perm(sksec->sid,
2719 netlbl_sid,
2720 sksec->sclass,
2721 recv_perm,
2722 ad);
2723 if (rc == 0)
2724 return 0;
2725
2726 netlbl_skbuff_err(skb, rc);
2727 return rc;
2728}
2729
2730/**
2731 * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel
2732 * @sock: the socket
2733 * @level: the socket level or protocol
2734 * @optname: the socket option name
2735 *
2736 * Description:
2737 * Check the setsockopt() call and if the user is trying to replace the IP
2738 * options on a socket and a NetLabel is in place for the socket deny the
2739 * access; otherwise allow the access. Returns zero when the access is
2740 * allowed, -EACCES when denied, and other negative values on error.
2741 *
2742 */
2743int selinux_netlbl_socket_setsockopt(struct socket *sock,
2744 int level,
2745 int optname)
2746{
2747 int rc = 0;
2748 struct sk_security_struct *sksec = sock->sk->sk_security;
2749 struct netlbl_lsm_secattr secattr;
2750
2751 rcu_read_lock();
2752 if (level == IPPROTO_IP && optname == IP_OPTIONS &&
2753 sksec->nlbl_state == NLBL_LABELED) {
2754 netlbl_secattr_init(&secattr);
2755 rc = netlbl_socket_getattr(sock, &secattr);
2756 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
2757 rc = -EACCES;
2758 netlbl_secattr_destroy(&secattr);
2759 }
2760 rcu_read_unlock();
2761 2471
2472netlbl_sid_to_secattr_failure:
2473 POLICY_RDUNLOCK;
2474 netlbl_secattr_destroy(secattr);
2762 return rc; 2475 return rc;
2763} 2476}
2764#endif /* CONFIG_NETLABEL */ 2477#endif /* CONFIG_NETLABEL */